Esempio n. 1
0
 def parse_content(self, response):
     """Parse a forum thread page into a HuxiuItem.

     Reads the breadcrumb links (category / sub-category / thread title)
     and the quoted post body, strips HTML via filter_tags, and returns a
     populated HuxiuItem, or None when the body is absent or parsing fails.
     """
     try:
         # Breadcrumb anchor texts; indices 2-4 are category, sub-category
         # and thread title on well-formed pages — guard the length instead
         # of letting a short list raise IndexError.
         crumbs = response.xpath(
             '//div[@class="bm cl"]/div[@class="z"]/a[@href]//text()'
         ).extract()
         first = crumbs[2].strip() if len(crumbs) > 2 else ''
         second = crumbs[3].strip() if len(crumbs) > 3 else ''
         title = crumbs[4].strip() if len(crumbs) > 4 else ''

         raw = response.xpath(
             '//div[@class="pl bm"]//table[@class="plhin"]//td[@class="t_f"]//blockquote'
         ).extract_first()
         # extract_first() returns None when the node is missing; checking
         # before filter_tags avoids the AttributeError the old code relied
         # on the broad except to absorb.
         if raw:
             text = filter_tags(raw).strip().replace('\n', '').replace('\xa0', ' ')
             if text:
                 item = HuxiuItem()
                 item['first'] = first
                 item['second'] = second
                 item['title'] = title
                 # Posts separate paragraphs with '\r'.
                 item['content'] = text.strip().split('\r')
                 item['url'] = str(response.request.url).strip()
                 return item
     except Exception:
         # Log the full traceback (instead of print) so one bad page does
         # not kill the crawl but the failure is still visible.
         self.logger.error('parse_content failed for %s:\n%s',
                           response.url, traceback.format_exc())
Esempio n. 2
0
 def parse_article(self, response):
     """Extract title, url and post time from a single article page."""
     info = response.xpath('//div[@class="article__bottom-content__right fl"]')
     item = HuxiuItem()
     item['link'] = response.url
     item['title'] = info.xpath('div/h1/text()')[0].extract()
     item['posttime'] = info.xpath(
         'div/span[@class="article__time"]/text()')[0].extract()
     print(item['title'], item['link'], item['posttime'])
     yield item
Esempio n. 3
0
 def parse(self, response):
     """Walk the index page and schedule one article request per entry."""
     entries = response.xpath(
         '//div[@class="mod-info-flow"]/div/div[@class="mob-ctt"]')
     for entry in entries:
         item = HuxiuItem()
         item['title'] = entry.xpath('h3/a/text()')[0].extract()
         item['link'] = entry.xpath('h3/a/@href')[0].extract()
         article_url = response.urljoin(item['link'])
         item['desc'] = entry.xpath('div[@class="mob-sub"]/text()')[0].extract()
         yield scrapy.Request(article_url, callback=self.parse_article)
Esempio n. 4
0
 def parse_article(self, response):
     """Pull title, url and publication time out of an article page."""
     wrap = response.xpath('//div[@class="article-wrap"]')
     item = HuxiuItem()
     item['link'] = response.url
     item['title'] = wrap.xpath('h1/text()')[0].extract()
     item['posttime'] = wrap.xpath(
         'div[@class="article-author"]/span[@class="article-time"]/text()'
     )[0].extract()
     print(item['title'],item['link'],item['posttime'])
     yield item
Esempio n. 5
0
 def parse_article(self, response):
     """Scrape title, url and post time from an article page."""
     wrapper = response.xpath('//div[@class="article-wrapper"]')
     item = HuxiuItem()
     item['link'] = response.url
     # Positional path into the article header block.
     header = wrapper.xpath('div/section/div/div[1]/div[1]')
     item['title'] = header.xpath('h1/text()')[0].extract()
     item['posttime'] = header.xpath('span/text()')[0].extract()
     print(item['title'],item['link'],item['posttime'])
     yield item
Esempio n. 6
0
 def parse_item(self, response):
     """Parse an article page into a HuxiuItem (title, link, published).

     Logs the scraped fields; yields exactly one item per page.
     """
     self.logger.info('Hi, this is an item page! %s', response.url)
     detail = response.xpath('//div[@class="article-wrap"]')
     item = HuxiuItem()
     item['title'] = detail.xpath('h1/text()')[0].extract()
     item['link'] = response.url
     item['published'] = detail.xpath(
         'div[@class="article-author"]/span[@class="article-time"]/text()')[0].extract()
     # Bug fix: logging.info(title, link, published) treated the title as a
     # %-format string and the other two values as its arguments — it logged
     # garbage (or raised lazily if the title contained '%'). Use explicit
     # lazy %s placeholders via the spider logger, matching the call above.
     self.logger.info('%s %s %s', item['title'], item['link'], item['published'])
     yield item
Esempio n. 7
0
 def parse(self, response):
     """Iterate the listing page and schedule one article request per entry.

     Uses extract_first() throughout (the original mixed it with
     ``[0].extract()``), so one malformed entry no longer aborts the whole
     page with an IndexError — it is skipped instead.
     """
     for sel in response.xpath('//div[@class="mob-ctt index-article-list-yh"]'):
         item = HuxiuItem()
         item['title'] = sel.xpath('h2/a/text()').extract_first()
         item['link'] = sel.xpath('h2/a/@href').extract_first()
         if item['link'] is None:
             # No link means nothing to follow; don't crash the crawl.
             continue
         url = response.urljoin(item['link'])
         item['desc'] = sel.xpath('div[@class="mob-sub"]/text()').extract_first()
         yield scrapy.Request(url, callback=self.parse_article)
Esempio n. 8
0
 def parse_item(self, response):
     """Build a HuxiuItem (link, image, title, post time) from an article page."""
     article = response.xpath('//div[@class="article-wrap"]')
     item = HuxiuItem()
     item['title'] = article.xpath('h1/text()').extract_first()
     item['link'] = response.url
     item['image_url'] = article.xpath(
         'div[@class="article-img-box"]/img/@src').extract_first()
     item['posttime'] = article.xpath(
         'div[@class="article-author"]/div[@class="column-link-box"]/span[@class="article-time pull-left"]/text()'
     ).extract_first()
     yield item
Esempio n. 9
0
    def parse_article(self, response):
        """Scrape title, url, author and publication date from an article page."""
        wrap = response.xpath('//div[@class="article-wrap"]')
        author_box = wrap.xpath('div[@class="article-author"]')

        item = HuxiuItem()
        item['link'] = response.url.strip()
        item['title'] = wrap.xpath('h1/text()')[0].extract().strip()
        item['author'] = author_box.xpath(
            'span[@class="author-name"]/a/text()')[0].extract().strip()
        item['published'] = author_box.xpath(
            'div[@class="column-link-box"]/span/text()')[0].extract().strip()
        yield item
Esempio n. 10
0
    def parse_article(self, response):
        """Parse an article page into a HuxiuItem (title, url, published)."""
        detail = response.xpath('//div[@class="article-wrap"]')

        item = HuxiuItem()
        item['title'] = detail.xpath('h1/text()')[0].extract()
        item['url'] = response.url
        # NOTE(review): 'body' and 'source_site' are both set to the page URL,
        # not to scraped content — looks like placeholder/copy-paste; confirm
        # the intended XPath expressions for these fields.
        item['body'] = response.url
        item['source_site'] = response.url
        item['published'] = detail.xpath(
            'div[@class="article-author"]/span[@class="article-time"]/text()'
        )[0].extract()

        # logging.info(item['title'],item['link'],item['published'])
        yield item
Esempio n. 11
0
    def parse(self, response):
        """Walk the news-flow listing and schedule one request per article."""
        for entry in response.xpath(
                '//div[@class="mod-info-flow"]/div/div[@class="mob-ctt"]'):
            item = HuxiuItem()
            item['title'] = entry.xpath('h2/a/text()')[0].extract()
            item['link'] = entry.xpath('h2/a/@href')[0].extract()

            sub = entry.xpath('div[contains(@class, "mob-sub")]')
            # The description sits in a nested <span> on some entries and
            # directly in the div's text on others.
            has_span = len(sub.xpath('span')) > 0
            desc_nodes = sub.xpath('span/text()') if has_span else sub.xpath('text()')
            item['desc'] = desc_nodes[0].extract()

            yield scrapy.Request(response.urljoin(item['link']),
                                 callback=self.parse_article)
Esempio n. 12
0
 def parse_content(self, response):
     """Parse a forum thread page and yield a HuxiuItem.

     Reads the breadcrumb anchors (category / sub-category / title) and the
     comma-joined post text; yields nothing when the body is empty.
     """
     try:
         # Breadcrumb anchor texts; guard the length so short lists don't
         # raise IndexError (and first/second/title are never unbound).
         crumbs = response.xpath(
             '//div[@class="bm cl"]/div/a[@href]//text()').extract()
         first = crumbs[1].strip() if len(crumbs) > 1 else ''
         second = crumbs[2].strip() if len(crumbs) > 2 else ''
         title = crumbs[3].strip() if len(crumbs) > 3 else ''

         body = ','.join(
             response.xpath('//div[@class="pl bm"]/div')[0].xpath(
                 '//td[@class="t_f"]/text()').extract())
         if body:
             item = HuxiuItem()
             item['first'] = first
             item['second'] = second
             item['title'] = title
             item['content'] = body.strip().replace('\n', '').replace('\r', '')
             item['url'] = str(response.request.url).strip()
             yield item
     except Exception as err:
         # The old silent `pass` hid every parse failure; log it so broken
         # pages are visible while the crawl keeps going.
         self.logger.error('parse_content failed for %s: %s', response.url, err)