def parse_other_list_page(self, response):
    """Parse one paginated (JSON) timeline page of another user's posts.

    Expects ``response.meta`` to carry:
      - ``article_count``: dict shared across requests; its ``end_collect``
        flag is flipped downstream to stop pagination.
      - ``UserAccount``: account name used to build the next-page URL.
      - ``current_page``: 1-based page counter.

    Yields the requests produced by ``make_article_request`` for every post
    on this page and, when the payload signals more items, one request for
    the next timeline page.
    """
    article_count = response.meta.get("article_count")
    # Stop paginating once a downstream callback has set the shared flag
    # (guard against a missing dict rather than crashing on None).
    if article_count and article_count.get("end_collect"):
        return
    # Needed below to build the next-page URL.
    user_account = response.meta.get("UserAccount")
    current_page = response.meta.get('current_page')
    # if current_page > 5:  # fixed cap: collect only 5 pages (100 posts)
    #     return
    try:
        res = json.loads(response.text)
    except (ValueError, TypeError):
        # Non-JSON body (error page / empty response): skip this page.
        return
    article_list_response = HtmlResponse(url=response.url,
                                         body=res.get('items_html'),
                                         encoding='utf-8')
    article_list_response.meta2 = response.meta
    article_requests = self.make_article_request(article_list_response,
                                                 current_page=current_page)
    for article_request in article_requests:
        yield article_request
    # Third page and beyond of the post list.
    has_next = res.get('has_more_items') and res.get('min_position')
    if has_next:
        next_page = res.get('min_position')
        next_url = f'https://twitter.com/i/profiles/show/{user_account}/timeline/tweets?include_available_features=1&include_entities=1&max_position={next_page}&reset_error_state=false'
        # NOTE: meta is deliberately NOT deep-copied here — the shared
        # ``article_count`` dict must stay shared so ``end_collect`` can
        # terminate pending pagination requests.
        meta = response.meta
        meta['current_page'] = current_page + 1
        yield scrapy.Request(url=next_url,
                             callback=self.parse_other_list_page,
                             meta=meta)
def parse_show_more_more_replies(self, response):
    """Parse a "show N more replies" block under a second-level comment.

    The response body is JSON whose ``conversation_html`` field holds an
    HTML fragment containing the extra replies.  Yields ``CommentItem`` and
    ``SocialRelationItem`` objects for each reply, and passes through any
    follow-up ``scrapy.Request`` produced by ``parse_secondary_comment``.
    """
    try:
        res = json.loads(response.text)
    except (ValueError, TypeError):
        # Body was not valid JSON — nothing to parse.
        return
    comment_list3_response = HtmlResponse(url=response.url,
                                          body=res.get('conversation_html'),
                                          encoding='utf-8')
    comment_list3_response.meta2 = copy.deepcopy(response.meta)
    for comment in self.parse_secondary_comment(comment_list3_response):
        # Deeper-pagination requests are forwarded unchanged.
        if isinstance(comment, scrapy.Request):
            yield comment
            continue
        # Strip bookkeeping/relation-only fields before building the item;
        # CommentItem(**comment) would reject unexpected keys otherwise.
        comment.pop("is_end_comment")
        data_reply_to_users = comment.pop("data_reply_to_users")  # replied-to users
        new_relations_dct = comment.pop("NEW_RELATIONS")
        yield CommentItem(**comment)
        # Emit the social relation derived from this comment as well.
        social_relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
        yield SocialRelationItem(**social_relation)
def parse_other_comment_page(self, response):
    """Parse a paginated (JSON) page of first-level comments.

    Yields ``CommentItem``/``SocialRelationItem`` objects for each comment
    (or passes through Requests from ``parse_comment``) and, when the
    payload provides a ``min_position`` cursor, one request for the next
    comment page.  Stops early when an already-collected comment is seen
    during incremental crawling.
    """
    # Values needed for building the next-page URL.
    commentator_account = response.meta.get("commentator_account")
    comment_id = response.meta.get("comment_id")
    # Comment-page counter.
    current_comment_page = response.meta.get("current_comment_page")
    is_new = response.meta.get("is_new")
    # if self.increment_crawl and not is_new and current_comment_page >= 3:
    #     return
    try:
        res = json.loads(response.text)
    except (ValueError, TypeError):
        # Non-JSON body: skip this page.
        return
    comment_list_response = HtmlResponse(url=response.url,
                                         body=res.get('items_html'),
                                         encoding='utf-8')
    comment_list_response.meta2 = copy.deepcopy(response.meta)
    for comment in self.parse_comment(comment_list_response):
        if isinstance(comment, scrapy.Request):
            yield comment
            continue
        if comment.pop("is_end_comment"):
            # Incremental crawl reached an already-seen comment: stop.
            return
        # Strip relation-only fields before building the item.
        data_reply_to_users = comment.pop("data_reply_to_users")
        new_relations_dct = comment.pop("NEW_RELATIONS")
        yield CommentItem(**comment)
        social_relation = self.make_social_relation_dict_bad(
            new_relations_dct, comment)
        yield SocialRelationItem(**social_relation)
    # Further pagination of first-level comments.
    # BUG FIX: the original used ``res.get('min_position') or
    # has_more_comment(...)`` as the cursor and concatenated it into the
    # URL, so a missing min_position with more comments flagged produced
    # ``'...' + True`` → TypeError.  Without a cursor the next-page URL
    # cannot be built, so paginate only when min_position is present.
    min_position = res.get('min_position')
    if min_position:
        next_url = 'https://twitter.com/i/' + commentator_account + '/conversation/' + comment_id + '?include_available_features=1&include_entities=1&max_position=' + min_position
        if len(next_url) >= 10000:
            # Cursor grew pathologically long; abandon pagination.
            return
        meta = copy.deepcopy(response.meta)
        meta["current_comment_page"] = current_comment_page + 1
        yield scrapy.Request(url=next_url,
                             callback=self.parse_other_comment_page,
                             meta=meta)