Example #1
0
 def parse_item(self, response):
     """Scrape the h1 headline and page URL; emit only pages that have a title."""
     title = response.xpath(u'//h1/text()').extract_first()
     item = NewsScrapItem()
     item[u'news_title'] = title
     item["news_link"] = response.url
     # Pages without an <h1> produce nothing.
     if title:
         yield item
Example #2
0
 def parse(self, response):
     """Return a NewsScrapItem with the page title, URL and article markup."""
     title = response.xpath(
         u'//h1[@class="title"]/text()').extract_first()
     body = response.xpath(
         u'//section[@class="article-detail"]//div[contains(@class, "content-all")]'
     ).extract_first()
     item = NewsScrapItem()
     item[u'news_title'] = title
     item["news_link"] = response.url
     item["news_article"] = body
     return item
Example #3
0
 def parse_data(self, response):
     """Scrape a news detail page into a NewsScrapItem.

     Only URLs containing 'ViewNews.aspx' are treated as article pages;
     anything else returns None (yields nothing), same as before.
     """
     # Guard first: the original built and populated the item on every page
     # and then threw it away for non-matching URLs — skip that work early.
     if 'ViewNews.aspx' not in response.url:
         return None
     item = NewsScrapItem()
     item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
     item["news_link"] = response.url
     item["news_article"] = ''.join(
         response.xpath(
             '//*[@id = "innity-in-post"]//table//td[@valign = "baseline"]/text()'
         ).extract()).strip()
     return item
Example #4
0
 def parse(self, response):
     """Emit an item with the og:title, page URL and joined post paragraphs.

     Pages lacking an og:title meta tag are skipped.
     """
     title = response.xpath(
         u'//head/meta[@property="og:title"]/@content').extract_first()
     item = NewsScrapItem()
     item[u'news_title'] = title
     item["news_link"] = response.url
     paragraphs = response.xpath(u'//div[@class="text_post_block"]//p').extract()
     item["news_article"] = ''.join(paragraphs).strip()
     if title:
         yield item
Example #5
0
    def parse_item(self, response):
        """Scrape a wiki-style page: h1 title, URL, and all body paragraphs.

        Yields nothing when no paragraph text was extracted.
        """
        # (Removed the commented-out template scaffolding that shipped with
        # the generated spider — it was dead code.)
        item = NewsScrapItem()
        item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
        item["news_link"] = response.url
        item["news_article"] = ''.join(
            response.xpath(
                '//div[@class="mw-parser-output"]//p').extract()).strip()
        # Only emit pages that actually contained article text.
        if item["news_article"]:
            return item
Example #6
0
    def parse_item(self, response):
        """Scrape the og:title and URL of a page.

        Article-body extraction is deliberately left out for this site;
        pages without an og:title meta tag are dropped.
        """
        # (Removed the commented-out template scaffolding and the disabled
        # news_article extraction — both were dead code.)
        item = NewsScrapItem()
        item[u'news_title'] = response.xpath(
            u'//head/meta[@property="og:title"]/@content').extract_first()
        item["news_link"] = response.url
        if item['news_title']:
            yield item
Example #7
0
    def parse(self, response):
        """Follow every non-image link on the page, then scrape this page itself."""
        for href in response.xpath('//a/@href').extract():
            # Guard clause: skip direct links to image files, crawl the rest.
            if href.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
                continue
            absolute = urlparse.urljoin(response.url, href)
            yield scrapy.Request(absolute, callback=self.parse)

        item = NewsScrapItem()
        item[u'news_title'] = response.xpath(
            u'//head/meta[@property="og:title"]/@content').extract_first()
        item["news_link"] = response.url
        # Emit only pages that carry an og:title meta tag.
        if item['news_title']:
            yield item
Example #8
0
 def parse(self, response):
     """Scrape article pages (URLs containing "news_<digits>") and keep crawling.

     Matching pages yield a NewsScrapItem; every non-image link on any page
     is followed recursively with this same callback.
     """
     # Fixes: dropped the unused `img` local, and made the pattern a raw
     # string so `\d` is a regex digit class rather than an invalid string
     # escape (a DeprecationWarning in modern Python).
     if re.search(r"news_\d*", response.url):
         item = NewsScrapItem()
         item[u'news_title'] = response.xpath(
             u'//h1/text()').extract_first()
         item["news_link"] = response.url
         item["news_article"] = ''.join(
             response.xpath(
                 u'//div[@class="td-post-content"]').extract()).strip()
         yield item
     for link in response.xpath('//a/@href').extract():
         if not link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
             link = urlparse.urljoin(response.url, link)
             yield scrapy.Request(link, callback=self.parse)
Example #9
0
 def parse(self, response):
     """Return an item holding the h1 title, page URL and first article paragraph."""
     first_paragraph = response.xpath("//article//p").extract_first()
     item = NewsScrapItem()
     item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
     item["news_link"] = response.url
     item["news_article"] = first_paragraph
     return item
Example #10
0
 def parse_data(self, response):
     """Yield an item with the og:title, page URL and joined post paragraphs."""
     item = NewsScrapItem()
     item[u'news_title'] = response.xpath(
         u'//head/meta[@property="og:title"]/@content').extract_first()
     item["news_link"] = response.url
     paragraphs = response.xpath(u'//div[@class="postmain"]//p').extract()
     item["news_article"] = ''.join(paragraphs).strip()
     yield item