コード例 #1
0
    def parse_other_list_page(self, response):
        """Parse a follow-up (JSON) page of a user's post list and paginate.

        The response body is expected to be a JSON object. Its ``items_html``
        field is wrapped in an ``HtmlResponse`` and handed to
        ``self.make_article_request``; its ``has_more_items`` and
        ``min_position`` fields decide whether another list page is requested.

        Args:
            response: Response of a timeline request. ``response.meta`` is
                read for ``article_count``, ``UserAccount`` and
                ``current_page``.

        Yields:
            Everything produced by ``self.make_article_request`` for this
            page, then one ``scrapy.Request`` for the next list page when the
            payload reports more items.
        """
        # Stop paginating once collection has been flagged as finished.
        # A missing ``article_count`` entry is treated as "not finished"
        # instead of raising AttributeError on None.
        article_count = response.meta.get("article_count") or {}
        if article_count.get("end_collect"):
            return

        # Needed to build the next-page request.
        user_account = response.meta.get("UserAccount")
        current_page = response.meta.get('current_page')
        # NOTE: a fixed cap of 5 pages (100 posts) used to be applied here;
        # it is currently disabled.

        try:
            res = json.loads(response.text)
        except (ValueError, AttributeError):
            # ValueError covers malformed JSON (json.JSONDecodeError);
            # AttributeError covers a response that exposes no text.
            return
        if not isinstance(res, dict):
            # Valid JSON that is not an object carries nothing usable here.
            return

        article_list_response = HtmlResponse(url=response.url, body=res.get('items_html'), encoding='utf-8')
        # The original meta object (not a copy) is attached on purpose.
        # NOTE(review): presumably so ``article_count`` stays shared - confirm.
        article_list_response.meta2 = response.meta
        yield from self.make_article_request(article_list_response, current_page=current_page)

        # Third and later list pages.
        next_page = res.get('min_position')
        if res.get('has_more_items') and next_page:
            next_url = f'https://twitter.com/i/profiles/show/{user_account}/timeline/tweets?include_available_features=1&include_entities=1&max_position={next_page}&reset_error_state=false'
            # Shallow copy: response.meta itself is left untouched while
            # nested objects such as ``article_count`` remain shared.
            meta = dict(response.meta)
            meta['current_page'] = current_page + 1
            yield scrapy.Request(url=next_url, callback=self.parse_other_list_page, meta=meta)
コード例 #2
0
    def parse_show_more_more_replies(self, response):
        """Parse the JSON behind a second-level thread's "N more replies" link.

        The response body is expected to be a JSON object whose
        ``conversation_html`` field is wrapped in an ``HtmlResponse`` and
        handed to ``self.parse_secondary_comment``.

        Args:
            response: Response of the "more replies" request. A deep copy of
                ``response.meta`` is attached to the synthetic HTML response
                as ``meta2``.

        Yields:
            Any ``scrapy.Request`` produced by the comment parser unchanged;
            for every parsed comment dict, a ``CommentItem`` followed by a
            ``SocialRelationItem``.
        """
        try:
            res = json.loads(response.text)
        except (ValueError, AttributeError):
            # ValueError covers malformed JSON (json.JSONDecodeError);
            # AttributeError covers a response that exposes no text.
            return
        if not isinstance(res, dict):
            # Valid JSON that is not an object carries nothing usable here.
            return

        comment_list3_response = HtmlResponse(url=response.url, body=res.get('conversation_html'), encoding='utf-8')
        comment_list3_response.meta2 = copy.deepcopy(response.meta)

        # Build comment items and the matching social relations.
        for comment in self.parse_secondary_comment(comment_list3_response):
            if isinstance(comment, scrapy.Request):
                yield comment
                continue

            # Helper-only keys are removed so they are not passed on to
            # CommentItem; the end-of-collection flag is ignored here.
            comment.pop("is_end_comment")
            comment.pop("data_reply_to_users")
            new_relations_dct = comment.pop("NEW_RELATIONS")
            yield CommentItem(**comment)

            social_relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
            yield SocialRelationItem(**social_relation)
コード例 #3
0
    def parse_other_comment_page(self, response):
        """Parse a follow-up (JSON) page of first-level comments and paginate.

        The response body is expected to be a JSON object whose
        ``items_html`` field is wrapped in an ``HtmlResponse`` and handed to
        ``self.parse_comment``; ``min_position`` (or the value returned by
        ``self.has_more_comment``) drives the request for the next page.

        Args:
            response: Response of a conversation request. ``response.meta``
                is read for ``commentator_account``, ``comment_id`` and
                ``current_comment_page``.

        Yields:
            Any ``scrapy.Request`` produced by the comment parser unchanged;
            for every parsed comment dict, a ``CommentItem`` followed by a
            ``SocialRelationItem``; finally a request for the next comment
            page when one exists. Iteration stops early, without paginating,
            as soon as a comment carries a truthy ``is_end_comment`` flag.
        """
        # Values needed to build the next-page request.
        commentator_account = response.meta.get("commentator_account")
        comment_id = response.meta.get("comment_id")
        # Comment page counter.
        current_comment_page = response.meta.get("current_comment_page")
        # NOTE: an incremental-crawl cap (stop at page 3 for non-new posts)
        # used to be applied here; it is currently disabled.

        try:
            res = json.loads(response.text)
        except (ValueError, AttributeError):
            # ValueError covers malformed JSON (json.JSONDecodeError);
            # AttributeError covers a response that exposes no text.
            return
        if not isinstance(res, dict):
            # Valid JSON that is not an object carries nothing usable here.
            return

        comment_list_response = HtmlResponse(url=response.url,
                                             body=res.get('items_html'),
                                             encoding='utf-8')
        comment_list_response.meta2 = copy.deepcopy(response.meta)

        # Build comment items and the matching social relations.
        for comment in self.parse_comment(comment_list_response):
            if isinstance(comment, scrapy.Request):
                yield comment
                continue

            # Incremental crawl: stop collecting comments altogether.
            if comment.pop("is_end_comment"):
                return
            # Helper-only keys are removed so they are not passed on to
            # CommentItem.
            comment.pop("data_reply_to_users")
            new_relations_dct = comment.pop("NEW_RELATIONS")
            yield CommentItem(**comment)

            social_relation = self.make_social_relation_dict_bad(
                new_relations_dct, comment)
            yield SocialRelationItem(**social_relation)

        # Further first-level comment pages.
        has_more_comment = self.has_more_comment(comment_list_response)
        # NOTE(review): the fallback value is concatenated into the URL, so
        # has_more_comment() presumably returns a position string - confirm.
        next_position = res.get('min_position') or has_more_comment
        if not next_position:
            return

        next_url = 'https://twitter.com/i/' + commentator_account + '/conversation/' + comment_id + '?include_available_features=1&include_entities=1&max_position=' + next_position
        # Over-long URLs are not requested.
        if len(next_url) >= 10000:
            return

        meta = copy.deepcopy(response.meta)
        meta["current_comment_page"] = current_comment_page + 1
        yield scrapy.Request(url=next_url,
                             callback=self.parse_other_comment_page,
                             meta=meta)