def _craw(self, url, param=None, *args):
    # pagination is currently disabled:
    # while True: ... param["page"] = param["page"] + 1
    res = requests.get(url, params=param, headers=headers)
    if res.status_code == 200:
        # jianshu returns a JSON array of wrapped articles
        body_json = res.json()
        if body_json:
            res_list = []
            for arti in body_json:
                arti = arti['object']['data']
                data = third_post_db.find_by_pt_id(
                    "jianshu-" + str(arti['id']), self.third_id)
                if data is None:
                    # build the post record
                    post = ThirdPost(self.third_id, self.third_name, 0)
                    post.tags = ''
                    # order: article id, title, tags, author, likes, comments, redirect url, created time
                    post.post_id = "jianshu-" + str(arti['id'])
                    post.title = arti['title']
                    post.author = arti['user']['nickname']
                    post.content = arti['public_abbr']
                    post.like_num = arti['likes_count']
                    post.comment_num = arti['public_comments_count']
                    post.redirect_url = 'https://www.jianshu.com/p/' + arti["slug"]
                    post.creatime = arrow.get(
                        arti['first_shared_at']).format('YYYY-MM-DD HH:mm:ss')
                    res_list.append(post)
            log.info("[%s] crawled -> %s, %d records", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
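# Every crawler here builds ThirdPost records and hands the batch to self.batch_insert;
# the model itself is not part of this excerpt. The class below is a minimal,
# hypothetical sketch inferred from the fields assigned above (the real class may differ).
class ThirdPost:
    def __init__(self, third_id, third_name, status=0):
        self.third_id = third_id      # id of the third-party source
        self.third_name = third_name  # display name of the source
        self.status = status          # 0 = freshly crawled
        self.post_id = ''             # source-prefixed article id, e.g. "jianshu-123"
        self.title = ''
        self.tags = ''                # comma-separated tag titles
        self.author = ''
        self.content = ''             # abstract / summary text
        self.like_num = 0
        self.comment_num = 0
        self.redirect_url = ''        # url the aggregated feed links back to
        self.creatime = ''            # publish time, formatted 'YYYY-MM-DD HH:mm:ss'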
def _craw(self, url, param=None, *args):
    res = requests.get(url, params=param, headers=headers)
    if res.status_code == 200:
        # HTML document
        html = res.text
        soup = BeautifulSoup(html, "html.parser")
        # all article nodes on the page
        posts = soup.find_all("div", class_="post")
        res_list = []
        # parse each article
        for post in posts:
            p = ThirdPost(self.third_id, self.third_name, 0)
            post_content = post.find("div", class_="content")
            meta = post_content.find("div", class_="meta")
            if param is None:
                # list page: author info sits inside each post card
                user_info = post.find("div", class_="user-info").find(
                    "div", class_="info")
                # author
                p.author = user_info.h4.text
                # post id
                p.post_id = post_content.p.a["href"]
                # content
                p.content = post_content.p.a.string
            else:
                # user page: author comes from the page header block
                p.author = soup.find("div", class_="m-b").h3.text
                # post id
                p.post_id = post_content["data-url"]
                p.content = ""
            # title
            p.title = post_content.h3.a.string
            # like count
            p.like_num = post.find(
                "a", class_="like-button").find("span").string
            # comment count
            p.comment_num = list(meta.find("span").stripped_strings)[0]
            # redirect url
            p.redirect_url = host + post_content.h3.a['href']
            data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
            if data is None:
                res_list.append(p)
        log.info("[%s] crawled -> %s, %d records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
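# All three crawlers de-duplicate through third_post_db.find_by_pt_id(post_id, third_id),
# which is also outside this excerpt. A minimal sketch of such a lookup, assuming a
# sqlite3 table named third_post with post_id and third_id columns (the real project may
# use a different database and schema):
import sqlite3

class ThirdPostDb:
    def __init__(self, db_path="third_post.db"):
        self.conn = sqlite3.connect(db_path)

    def find_by_pt_id(self, post_id, third_id):
        # return the stored row, or None when the article has not been crawled yet
        cur = self.conn.execute(
            "SELECT post_id FROM third_post WHERE post_id = ? AND third_id = ?",
            (post_id, third_id))
        return cur.fetchone()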
def _craw(self, url, param=None, *args):
    res = requests.post(url, data=json.dumps(param), headers=header)
    if res.status_code == 200:
        like_total = args[0]  # minimum number of likes required to keep an article
        # juejin response
        body_json = res.json()
        print(body_json)
        if body_json['data'] is None:
            log.error("crawling juejin failed: %s", body_json['errors'])
            return
        article_list = body_json['data']['articleFeed']['items']['edges']
        res_list = []
        for artiCol in article_list:
            arti = artiCol['node']
            data = third_post_db.find_by_pt_id(arti['id'], self.third_id)
            # keep only unseen articles above the like threshold
            if data is None and arti['likeCount'] > like_total:
                # build the post record
                post = ThirdPost(self.third_id, self.third_name, 0)
                tags = []
                for t in arti['tags']:
                    tags.append(t['title'])
                post.tags = ",".join(tags)
                # order: article id, title, tags, author, likes, comments, redirect url, created time
                post.post_id = arti['id']
                post.title = arti['title']
                post.author = arti['user']['username']
                post.content = arti['content']
                post.like_num = arti['likeCount']
                post.comment_num = arti['commentsCount']
                post.redirect_url = arti['originalUrl']
                post.creatime = arrow.get(
                    arti['createdAt']).format('YYYY-MM-DD HH:mm:ss')
                res_list.append(post)
        log.info("[%s] crawled -> %s, %d records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
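# Each _craw finishes with self.batch_insert(res_list). A minimal sketch of the shared
# base class these methods might belong to, assuming third_post_db exposes a bulk
# insert_many helper (that name is hypothetical):
class BaseCrawler:
    def __init__(self, third_id, third_name):
        self.third_id = third_id
        self.third_name = third_name

    def batch_insert(self, res_list):
        # persist only when the crawl produced new records
        if res_list:
            third_post_db.insert_many(res_list)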