def _craw(self, url, param=None, *args):
    """Crawl the Jianshu recommendation feed (JSON) and persist unseen posts.

    :param url: feed endpoint returning a JSON array of article wrappers
    :param param: query-string parameters forwarded to ``requests.get``
    :param args: unused here; kept for the shared ``_craw`` interface
    """
    res = requests.get(url, params=param, headers=headers)
    if res.status_code != 200:
        # Mirror the original behavior: non-200 responses are skipped silently.
        return
    body_json = res.json()
    if not body_json:
        return
    res_list = []
    for arti in body_json:
        # Each feed entry wraps the real article payload.
        arti = arti['object']['data']
        post_id = "jianshu-" + str(arti['id'])
        # Only keep articles not already stored for this source.
        data = third_post_db.find_by_pt_id(post_id, self.third_id)
        if data is None:
            post = ThirdPost(self.third_id, self.third_name, 0)
            post.tags = ''
            # Field order: article id, title, author, content,
            # like count, comment count, redirect url, created time.
            post.post_id = post_id
            post.title = arti['title']
            post.author = arti['user']['nickname']
            post.content = arti['public_abbr']
            post.like_num = arti['likes_count']
            post.comment_num = arti['public_comments_count']
            post.redirect_url = 'https://www.jianshu.com/p/' + arti["slug"]
            post.creatime = arrow.get(
                arti['first_shared_at']).format('YYYY-MM-DD HH:mm:ss')
            res_list.append(post)
    log.info("[%s]爬取-> %s %d条记录", self.third_name, url, len(res_list))
    self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    """Crawl the InfoQ article list (JSON POST API) and persist unseen posts.

    :param url: InfoQ list endpoint accepting a JSON request body
    :param param: request payload, serialized with ``json.dumps``
    :param args: unused here; kept for the shared ``_craw`` interface
    """
    res = requests.post(url, json.dumps(param), headers=headers)
    if res.status_code != 200:
        return
    body_json = res.json()
    res_list = []
    for post in body_json['data']:
        p = ThirdPost(self.third_id, self.third_name, 0)
        p.title = post['article_title']
        p.tags = ",".join(t['name'] for t in post['topic'])
        p.post_id = "infoq-" + post['uuid']
        # Some entries carry no author; fall back to the site name.
        p.author = post['author'][0]['nickname'] if 'author' in post else "InfoQ"
        p.content = post['article_summary']
        p.redirect_url = "https://www.infoq.cn/article/" + post['uuid']
        # ``utime`` is epoch milliseconds.
        p.creatime = arrow.get(post['utime'] / 1000).format('YYYY-MM-DD HH:mm:ss')
        # Only keep articles not already stored for this source.
        if third_post_db.find_by_pt_id(p.post_id, p.third_id) is None:
            res_list.append(p)
    log.info("[%s]爬取-> %s %d条记录", self.third_name, url, len(res_list))
    self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    """Crawl an HTML article-list page and persist unseen posts.

    :param url: page URL to fetch and parse with BeautifulSoup
    :param param: unused for this source
    :param args: ``args[0]`` is the fixed tag string applied to every post
    """
    res = requests.get(url)
    if res.status_code != 200:
        return
    soup = BeautifulSoup(res.text, "html.parser")
    # One div per article on the listing page.
    posts = soup.find_all("div", class_="list_article_item")
    res_list = []
    for post in posts:
        p = ThirdPost(self.third_id, self.third_name, 0)
        # tip spans: [0] author, [2] month-day timestamp.
        tip_spans = post.find("div", class_="tip").find_all("span")
        title_div = post.find("div", class_="title")
        p.post_id = post['data-id']
        p.title = title_div.a.string
        # The page uses relative links; prefix the site host.
        p.redirect_url = host + title_div.a['href']
        # The page omits the year; prepend the current one.
        now_year = datetime.datetime.now().year
        p.creatime = str(now_year) + "-" + tip_spans[2].string.strip()
        p.author = tip_spans[0].string.strip()
        p.tags = args[0]
        # Only keep articles not already stored for this source.
        if third_post_db.find_by_pt_id(p.post_id, p.third_id) is None:
            res_list.append(p)
    log.info("[%s]爬取-> %s %d条记录", self.third_name, url, len(res_list))
    self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    """Crawl the imooc article-list page and persist unseen posts.

    :param url: page URL to fetch and parse with BeautifulSoup
    :param param: query parameters forwarded to ``requests.get``
    :param args: unused here; kept for the shared ``_craw`` interface
    """
    res = requests.get(url, param)
    if res.status_code != 200:
        return
    soup = BeautifulSoup(res.text, 'html.parser')
    res_list = []
    # One div per article on the listing page.
    for post in soup.find_all("div", class_="article-lwrap"):
        p = ThirdPost(self.third_id, self.third_name, 0)
        post_a = post.find("a", "title")
        p.title = post_a.p.string
        # The page uses relative links; prefix the site host.
        p.redirect_url = host + post_a['href']
        p.author = post.find("a", class_='nickName').string.strip()
        # The page shows no timestamp; record the crawl time instead.
        p.creatime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        p.post_id = "/imooc" + post_a['href']
        # Collect the skill labels as a comma-separated tag string.
        skill_span = post.find("span", class_="skill")
        p.tags = ",".join(tag.span.string for tag in skill_span.find_all("a"))
        # Only keep articles not already stored for this source.
        if third_post_db.find_by_pt_id(p.post_id, p.third_id) is None:
            res_list.append(p)
    log.info("[%s]爬取-> %s %d条记录", self.third_name, url, len(res_list))
    self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    """Crawl the Juejin GraphQL feed and persist unseen, popular posts.

    :param url: GraphQL endpoint accepting a JSON request body
    :param param: GraphQL payload, serialized with ``json.dumps``
    :param args: ``args[0]`` is the minimum like count a post must exceed
    """
    res = requests.post(url, json.dumps(param), headers=header)
    if res.status_code != 200:
        return
    like_total = args[0]  # minimum-likes threshold
    body_json = res.json()
    if body_json['data'] is None:
        # Use lazy %s formatting: the GraphQL ``errors`` payload may not be
        # a str, and the previous string concatenation raised TypeError.
        log.error("爬取掘金失败%s", body_json['errors'])
        return
    article_list = body_json['data']['articleFeed']['items']['edges']
    res_list = []
    for edge in article_list:
        arti = edge['node']
        data = third_post_db.find_by_pt_id(arti['id'], self.third_id)
        # Keep only unseen posts whose like count exceeds the threshold.
        if data is None and arti['likeCount'] > like_total:
            post = ThirdPost(self.third_id, self.third_name, 0)
            post.tags = ",".join(t['title'] for t in arti['tags'])
            # Field order: article id, title, author, content,
            # like count, comment count, redirect url, created time.
            post.post_id = arti['id']
            post.title = arti['title']
            post.author = arti['user']['username']
            post.content = arti['content']
            post.like_num = arti['likeCount']
            post.comment_num = arti['commentsCount']
            post.redirect_url = arti['originalUrl']
            post.creatime = arrow.get(
                arti['createdAt']).format('YYYY-MM-DD HH:mm:ss')
            res_list.append(post)
    log.info("[%s]爬取-> %s %d条记录", self.third_name, url, len(res_list))
    self.batch_insert(res_list)