コード例 #1
0
ファイル: hu.py プロジェクト: ck03/huxiu_scrapy
    def parse(self, response):
        """Parse the Huxiu index page.

        On the first page (``self.pg == 1``) articles are scraped directly
        from the HTML; afterwards pagination switches to the AJAX POST
        endpoint handled by ``parse_nextpage``.
        """
        base_url = "https://www.huxiu.com"
        if self.pg == 1:
            # First page is a plain GET; articles live in the info-flow divs.
            div_list = response.xpath(
                "//div[@class='container']//div[@class='mod-info-flow']/div")
            for div in div_list:
                item = HuxiuItem()
                item["title"] = div.xpath(
                    ".//div[contains(@class,'mob-ctt')]/h2/a/text()").extract_first()
                item["article_url"] = base_url + div.xpath("./div/a/@href").extract_first()
                item["img"] = div.xpath("./div/a/img/@src").extract_first()
                item["img2"] = div.xpath("./div[2]/div/div/a/img/@src").extract_first()
                # Author link may be missing; extract_first() yields None then,
                # so the XPath is evaluated once instead of twice as before.
                href2 = div.xpath("./div[2]/div/div/a/@href").extract_first()
                item["href2"] = base_url + href2 if href2 else None
                yield item
            self.pg += 1

        if self.pg > 1:
            # Subsequent pages come from the AJAX article-list endpoint.
            params = {
                "huxiu_hash_code": "b46b6dad804d7d9362a29fe56a8e47a2",
                "page": str(self.pg),
            }
            yield scrapy.FormRequest(
                "https://www.huxiu.com/v2_action/article_list",
                callback=self.parse_nextpage,
                formdata=params,  # FormRequest defaults to POST
            )
コード例 #2
0
    def post_parse(self, response):
        """Parse one page of the AJAX article-list JSON and request the next.

        The endpoint returns JSON with keys ``data`` (an HTML fragment),
        ``total_page`` and ``last_dateline``; the fragment is parsed with
        lxml to pull out titles, links and authors.
        """
        page = response.meta['page'] + 1
        payload = json.loads(response.text)
        # Defaults guard against missing keys: the original left these names
        # unbound (NameError) when the payload was incomplete.
        html_post = payload.get('data', '') if payload else ''
        total_page = payload.get('total_page', 0) if payload else 0
        last_dateline = payload.get('last_dateline', '') if payload else ''

        item = HuxiuItem()
        sel = etree.HTML(html_post)
        item['title'] = sel.xpath('//h2//a/text()')
        item['url'] = [('https://www.huxiu.com' + url)
                       for url in sel.xpath('//h2//a/@href')]
        item['author'] = sel.xpath('//span[@class="author-name"]/text()')
        # BUG FIX: the original condition was inverted -- it logged the error
        # when every field *was* present. Warn only when something is missing.
        if not (item['title'] and item['url'] and item['author']):
            print('获取item内容出错', page - 1)
        item['updata'] = datetime.datetime.now()

        yield item

        if page < int(total_page) + 1:
            data = {
                'huxiu_hash_code': '27ab1e6d0b9252b75cefec3c71dbcfba',
                'page': str(page),
                'last_dateline': str(last_dateline),
            }
            yield scrapy.FormRequest(
                'https://www.huxiu.com/v2_action/article_list',
                formdata=data,
                callback=self.post_parse,
                meta={'page': page})
コード例 #3
0
ファイル: hu.py プロジェクト: ck03/huxiu_scrapy
    def parse_nextpage(self, response):
        """Parse a JSON page from the article-list endpoint and paginate.

        The response body is JSON whose ``data`` field holds an HTML
        fragment; an empty fragment means the last page was reached.  An
        empty ``HuxiuItem`` is yielded as an end-of-crawl marker.
        """
        print("nextpage")
        base_url = "https://www.huxiu.com"
        # response.text supersedes the deprecated body_as_unicode().
        json_dict = json.loads(response.text)
        data_str = json_dict["data"]
        if len(data_str) > 0:
            html_str = etree.HTML(data_str)
            div_list = html_str.xpath("//div[@class='mod-b mod-art']")
            for div in div_list:
                item = HuxiuItem()
                item["title"] = div.xpath(".//div[@class='mob-ctt']/h2/a/text()")[0]
                item["article_url"] = base_url + div.xpath(".//div[@class='mob-ctt']/h2/a/@href")[0]
                # Lazy-loaded thumbnail may be absent; evaluate the XPath once
                # instead of twice as the original did.
                imgs = div.xpath(".//div[contains(@class,'mod-thumb')]//img/@data-original")
                item["img"] = imgs[0] if imgs else None
                item["img2"] = div.xpath(".//div[@class='mob-ctt']/div[@class='mob-author']/div/a/img/@src")[0]
                item["href2"] = base_url + div.xpath(".//div[@class='mob-ctt']/div[@class='mob-author']/a/@href")[0]
                yield item
            self.pg += 1
            print(self.pg)
            # Test-only early stop; disable for a full crawl. ==============
            if self.pg == 3:
                # An empty item signals the pipeline that the crawl is done.
                yield HuxiuItem()
                print("虎嗅爬蟲結束.........")
                return
            # ==============================================================
            params = {
                "huxiu_hash_code": "b46b6dad804d7d9362a29fe56a8e47a2",
                "page": str(self.pg),
            }
            yield scrapy.FormRequest(
                "https://www.huxiu.com/v2_action/article_list",
                callback=self.parse_nextpage,
                formdata=params,  # FormRequest defaults to POST
            )
        else:
            # No more data: emit the end-of-crawl marker item.
            yield HuxiuItem()
            print("虎嗅爬蟲結束.........")
コード例 #4
0
 def parse(self, response):
     """Print title, link and description of every article teaser on the page."""
     teasers = response.xpath(
         '//div[@class="mod-info-flow"]/div/div[@class="mob-ctt"]')
     for teaser in teasers:
         item = HuxiuItem()
         item['title'] = teaser.xpath('h3/a/text()')[0].extract()
         item['link'] = teaser.xpath('h3/a/@href')[0].extract()
         # NOTE(review): url is computed but never used or followed here.
         url = response.urljoin(item['link'])
         item['desc'] = teaser.xpath('div[@class="mob-sub"]/text()')[0].extract()
         print(item['title'], item['link'], item['desc'])
コード例 #5
0
ファイル: huxiu_spider.py プロジェクト: yuuuuuy/huxiu
 def parse_article(self, response):
     """Extract title, url and post time from an article page.

     If the expected nodes are missing (e.g. an anti-crawl placeholder
     page was served), the same URL is re-queued for another attempt.
     """
     try:
         detail = response.xpath('//div[@class="article-wrap"]')
         item = HuxiuItem()
         item['title'] = detail.xpath('h1/text()')[0].extract().strip()
         item['link'] = response.url
         item['post_time'] = detail.xpath(
             "div[@class='article-author']/div[@class='column-link-box']/"
             "span[@class='article-time pull-left']/text()")[0].extract()
         yield item
     except IndexError:
         # BUG FIX: without dont_filter the dupefilter silently drops this
         # re-request of an already-seen URL, so the retry never ran.
         yield scrapy.Request(response.url, callback=self.parse_article,
                              dont_filter=True)
コード例 #6
0
    def parse_article(self, response):
        """Scrape title, post time, author and next-article teaser from a page."""
        item = HuxiuItem()
        # Removed time.sleep(1): blocking the Twisted reactor stalls every
        # concurrent request; throttle via the DOWNLOAD_DELAY setting instead.

        detail = response.xpath('//div[@class="container"]/div[@class="wrap-left pull-left"]/div[@class="article-wrap"]')
        detail_1 = response.xpath('//div[@class="container"]/div[@class="wrap-right pull-right"]')
        item['title'] = detail.xpath('h1/text()')[0].extract()
        item['posttime'] = detail.xpath('div/div/span[@class="article-time pull-left"]/text()')[0].extract()
        item['author'] = detail.xpath('div/span/a/text()')[0].extract()
        item['nexttitle'] = detail_1.xpath('div/div[@class="author-next-article"]/a/text()')[0].extract()
        item['link'] = response.url
        # strftime defaults to localtime(), so the explicit conversion is redundant.
        item['instime'] = time.strftime('%Y-%m-%d %H:%M:%S')

        yield item
コード例 #7
0
 def parse(self, response):
     """Fill a shared item per teaser and follow each detail link."""
     item = HuxiuItem()
     for block in response.css('div.mob-ctt'):
         # Author is hard-coded on this listing page.
         item['author'] = '钱德虎'
         item['date'] = block.css('div.mob-author span.time::text').extract_first()
         item['sub'] = block.css('div.mob-sub::text').extract_first()
         detail_url = response.urljoin(block.css('h3 a::attr(href)').extract_first())
         # deepcopy gives each request its own snapshot of the mutating item.
         yield scrapy.Request(detail_url,
                              callback=self.detail_parse,
                              dont_filter=True,
                              meta={'item': deepcopy(item)})
コード例 #8
0
ファイル: huxiu_spider.py プロジェクト: yuuuuuy/huxiu
 def parse(self, response):
     """Walk the index article list and request each article's detail page."""
     ctt = "div[@class='mob-ctt index-article-list-yh']"
     rows = response.xpath(
         "//div[@class='mod-info-flow']/div[@class='mod-b mod-art clearfix ']"
     )
     for row in rows:
         item = HuxiuItem()
         titles = row.xpath(ctt + "/h2/a/text()")
         item['title'] = titles[0].extract() if titles else ''
         links = row.xpath(ctt + "/h2/a/@href")
         item['link'] = links[0].extract() if links else ''
         url = response.urljoin(item['link'])
         descs = row.xpath(ctt + "/div[@class='mob-sub']/text()")
         item['desc'] = descs[0].extract() if descs else ''
         # NOTE(review): the populated item is never yielded nor attached to
         # the request meta; only the follow-up request is emitted.
         yield scrapy.Request(url, callback=self.parse_article)
コード例 #9
0
    def parse(self, response):
        """Scrape one article page, then follow the first hot-article link.

        Extracts title, time, author, collection/comment counters, full text
        and category, yields the item, and recurses into the next article.
        """
        item = HuxiuItem()
        selector = scrapy.Selector(response)
        title = str(selector.xpath('//h1[@class="t-h1"]/text()').extract()[0]).strip('\n').strip()
        # Renamed from ``time``: that local shadowed the stdlib time module.
        post_time = selector.xpath('//span[@class="article-time pull-left"]/text() | //span[@class="article-time"]/text()').extract()[0]
        author = selector.xpath('//span[@class="author-name"]/a/text()').extract()[0]
        collection_num = selector.xpath('//span[@class="article-share pull-left"]/text() | //span[@class="article-share"]/text()').extract()[0].strip("收藏")
        comment_num = selector.xpath('//span[@class="article-pl pull-left"]/text() | //span[@class="article-pl"]/text()').extract()[0].strip("评论")
        # ''.join replaces the original quadratic += concatenation loops.
        content = ''.join(selector.xpath('//div[@class="article-content-wrap"]/p/text()').extract())
        category = ''.join(selector.xpath('//div[@class="column-link-box"]/a/text()').extract())

        print(title)
        # Fill in the item
        item['title'] = title
        item['time'] = post_time
        item['author'] = author
        item['collection_num'] = collection_num
        item['comment_num'] = comment_num
        item['content'] = content
        # Placeholder kept from the original; the real URL is response.url.
        item['url'] = 'url'
        item['category'] = category

        yield item
        url_next = "https://www.huxiu.com" + selector.xpath('//div[@class="hot-article-img"]/a/@href').extract()[0]
        print("@@@@@@@@@@@@@@@2", url_next)
        yield scrapy.Request(url_next, callback=self.parse)
コード例 #10
0
 def parse(self, response):
     """Load list-page fields via ItemLoader and kick off AJAX pagination."""
     item_loader = loader.ItemLoader(item=HuxiuItem(), response=response)
     item_loader.add_xpath('title', '//div[@class="mod-info-flow"]//h2//a/text()')
     item_loader.add_xpath('url', '//div[@class="mod-info-flow"]//h2//a/@href',
                           MapCompose(lambda href: urljoin('https://www.huxiu.com', href)))
     # Trailing space in "author-name " mirrors the site's actual class value.
     item_loader.add_xpath('author', '//span[@class="author-name "]/text()')
     item_loader.add_value('updata', datetime.datetime.now())
     last_dateline = response.xpath(
         '//div[@class="get-mod-more js-get-mod-more-list transition"]/@data-last_dateline'
     ).extract()[0]
     page = 2
     form = {
         'huxiu_hash_code': '27ab1e6d0b9252b75cefec3c71dbcfba',
         'page': str(page),
         'last_dateline': str(last_dateline),
     }
     yield scrapy.FormRequest(
         'https://www.huxiu.com/v2_action/article_list',
         formdata=form,
         callback=self.post_parse,
         meta={'page': page})
     yield item_loader.load_item()