Example #1
def is_dup_detail(detail_url, spider_name, channel_id=0):
    """
    检查详细页是否重复
    :param detail_url:
    :param spider_name:
    :param channel_id:
    :return:
    """
    detail_dup_key = 'dup:%s:%s' % (spider_name, channel_id)
    detail_url_finger = get_request_finger(detail_url)
    return redis_client.sismember(detail_dup_key, detail_url_finger)
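
Both helpers depend on a module-level redis_client and a get_request_finger() that are not shown in these examples. Purely as a hedged sketch, assuming the fingerprint is a stable hash of the URL (the project's real helper may normalize the URL or reuse Scrapy's request fingerprinting instead):

import hashlib

def get_request_finger(url):
    """Hypothetical sketch of the fingerprint helper (not from the source).

    Assumes a plain SHA-1 over the UTF-8 encoded URL; the actual project
    implementation may differ.
    """
    return hashlib.sha1(url.encode('utf-8')).hexdigest()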
Example #2
def add_dup_detail(detail_url, spider_name, channel_id=0):
    """
    把当前详细页加入集合
    :param detail_url:
    :param spider_name:
    :param channel_id:
    :return:
    """
    detail_dup_key = 'dup:%s:%s' % (spider_name, channel_id)
    detail_url_finger = get_request_finger(detail_url)
    return redis_client.sadd(detail_dup_key, detail_url_finger)
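
A minimal usage sketch showing how the two helpers combine into a check-then-add pattern inside a Scrapy spider. Everything apart from is_dup_detail and add_dup_detail (spider name, URLs, XPath expressions) is illustrative only, and redis_client is assumed to be an already configured redis.Redis instance:

import scrapy

class ExampleListSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the dedup helpers above.
    name = 'example_list'
    start_urls = ['https://example.com/list']

    def parse(self, response):
        for href in response.xpath('//a[@class="detail"]/@href').getall():
            detail_url = response.urljoin(href)
            # Skip detail pages whose fingerprint is already recorded in Redis.
            if is_dup_detail(detail_url, self.name):
                continue
            add_dup_detail(detail_url, self.name)
            yield scrapy.Request(detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        yield {'url': response.url,
               'title': response.xpath('//title/text()').get()}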
Example #3
    def parse_article_list(self, response):
        """
        文章列表解析
        没有翻页特征 <a class=\"page next S_txt1 S_line1 page_dis\"><span>下一页<\/span>
        解析链接 href=\"\/p\/1005051627825392\/wenzhang?pids=Pl_Core_ArticleList__61&cfs=600&Pl_Core_ArticleList__61_filter=&Pl_Core_ArticleList__61_page=6#Pl_Core_ArticleList__61\"
        """
        print('task_url: %s' % response.url)
        # Page parsing (Weibo embeds the content as JS data, so the raw page cannot be parsed directly)
        article_list_body = response.body_as_unicode()

        article_list_rule = r'<script>FM.view\({"ns":"pl.content.miniTab.index","domid":"Pl_Core_ArticleList__\d+".*?"html":"(.*?)"}\)</script>'
        article_list_re_parse = re.compile(article_list_rule,
                                           re.S).findall(article_list_body)
        if not article_list_re_parse:
            return
        article_list_html = ''.join(article_list_re_parse)

        # Handle escape sequences in the extracted HTML
        article_list_html = article_list_html.replace('\\r', '')
        article_list_html = article_list_html.replace('\\t', '')
        article_list_html = article_list_html.replace('\\n', '')
        article_list_html = article_list_html.replace('\\"', '"')
        article_list_html = article_list_html.replace('\\/', '/')

        article_list_doc = fromstring(article_list_html)
        article_list_doc_parse = article_list_doc.xpath(
            '//div[@class="text_box"]')

        for article_item in article_list_doc_parse:
            article_detail_url = article_item.xpath(
                './div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/@href'
            )
            article_detail_title = article_item.xpath(
                './div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/text()'
            )
            article_detail_abstract = article_item.xpath(
                './div[@class="text"]/a[@class="S_txt1"]/text()')
            if not (article_detail_url and article_detail_title):
                continue
            article_detail_url = article_detail_url[0].strip()
            article_detail_url = response.urljoin(article_detail_url)
            article_detail_title = article_detail_title[0].strip()

            article_detail_abstract = article_detail_abstract[0].strip(
            ) if article_detail_abstract else ''

            meta_article_item = {
                'article_url': article_detail_url,
                'article_title': article_detail_title,
                'article_abstract': article_detail_abstract,
                'article_id': get_request_finger(article_detail_url),
            }

            meta = dict(response.meta, **meta_article_item)

            # Two different types of detail pages
            if '/ttarticle/p/show?id=' in article_detail_url:
                yield scrapy.Request(url=article_detail_url,
                                     callback=self.parse_article_detail_html,
                                     meta=meta)
            else:
                yield scrapy.Request(url=article_detail_url,
                                     callback=self.parse_article_detail_js,
                                     meta=meta)

        # Pagination handling
        next_url_parse = article_list_doc.xpath(
            '//a[@class="page next S_txt1 S_line1"]/@href')
        if not next_url_parse:
            print('Last page of the article list for the current filters: %s' % response.url)
        else:
            next_url = next_url_parse[0]
            next_url = response.urljoin(next_url)
            print(next_url)
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_article_list,
                                 meta=response.meta)
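
The core trick in Example #3 is pulling the page fragments out of the embedded FM.view(...) script blocks and undoing the JSON-style escaping before handing the HTML to lxml. The standalone sketch below replays that step on a made-up response body; the regex and replace chain are taken from the method above, while the sample string and printed fields are illustrative only:

import re
from lxml.html import fromstring

# Made-up response body imitating a Weibo FM.view(...) script block.
sample_body = (
    r'<script>FM.view({"ns":"pl.content.miniTab.index",'
    r'"domid":"Pl_Core_ArticleList__61","html":"'
    r'<div class=\"text_box\">'
    r'<div class=\"title W_autocut\">'
    r'<a class=\"W_autocut S_txt1\" href=\"\/p\/123\/wenzhang\">Sample title<\/a><\/div>'
    r'<div class=\"text\"><a class=\"S_txt1\">Sample abstract<\/a><\/div>'
    r'<\/div>"})</script>'
)

# Same extraction rule as in parse_article_list().
article_list_rule = (r'<script>FM.view\({"ns":"pl.content.miniTab.index",'
                     r'"domid":"Pl_Core_ArticleList__\d+".*?"html":"(.*?)"}\)</script>')
fragments = re.compile(article_list_rule, re.S).findall(sample_body)
article_list_html = ''.join(fragments)

# Undo the JS escaping so lxml sees plain HTML.
for src, dst in (('\\r', ''), ('\\t', ''), ('\\n', ''), ('\\"', '"'), ('\\/', '/')):
    article_list_html = article_list_html.replace(src, dst)

doc = fromstring(article_list_html)
for box in doc.xpath('//div[@class="text_box"]'):
    url = box.xpath('./div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/@href')
    title = box.xpath('./div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/text()')
    print(url[0], title[0])  # expected: /p/123/wenzhang Sample title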