Example #1
0
 def parse_item(self, response):
     """Scrape the h1 headline and page URL; emit only pages that have a title."""
     title = response.xpath(u'//h1/text()').extract_first()
     item = NewsScrapItem()
     item[u'news_title'] = title
     item["news_link"] = response.url
     # Pages without an <h1> produce nothing.
     if title:
         yield item
Example #2
0
 def parse(self, response):
     """Return a NewsScrapItem with the page title, URL and article markup."""
     title = response.xpath(
         u'//h1[@class="title"]/text()').extract_first()
     body = response.xpath(
         u'//section[@class="article-detail"]//div[contains(@class, "content-all")]'
     ).extract_first()
     item = NewsScrapItem()
     item[u'news_title'] = title
     item["news_link"] = response.url
     item["news_article"] = body
     return item
Example #3
0
 def parse_data(self, response):
     """Scrape a news detail page into a NewsScrapItem.

     Only URLs containing 'ViewNews.aspx' are treated as article pages;
     anything else returns None (yields nothing), same as before.
     """
     # Guard first: the original built and populated the item on every page
     # and then threw it away for non-matching URLs — skip that work early.
     if 'ViewNews.aspx' not in response.url:
         return None
     item = NewsScrapItem()
     item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
     item["news_link"] = response.url
     item["news_article"] = ''.join(
         response.xpath(
             '//*[@id = "innity-in-post"]//table//td[@valign = "baseline"]/text()'
         ).extract()).strip()
     return item
Example #4
0
 def parse(self, response):
     """Emit an item with the og:title, page URL and joined post paragraphs.

     Pages lacking an og:title meta tag are skipped.
     """
     title = response.xpath(
         u'//head/meta[@property="og:title"]/@content').extract_first()
     item = NewsScrapItem()
     item[u'news_title'] = title
     item["news_link"] = response.url
     paragraphs = response.xpath(u'//div[@class="text_post_block"]//p').extract()
     item["news_article"] = ''.join(paragraphs).strip()
     if title:
         yield item
Example #5
0
    def parse_item(self, response):
        """Scrape a wiki-style page: h1 title, URL, and all body paragraphs.

        Yields nothing when no paragraph text was extracted.
        """
        # (Removed the commented-out template scaffolding that shipped with
        # the generated spider — it was dead code.)
        item = NewsScrapItem()
        item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
        item["news_link"] = response.url
        item["news_article"] = ''.join(
            response.xpath(
                '//div[@class="mw-parser-output"]//p').extract()).strip()
        # Only emit pages that actually contained article text.
        if item["news_article"]:
            return item
Example #6
0
    def parse_item(self, response):
        """Scrape the og:title and URL of a page.

        Article-body extraction is deliberately left out for this site;
        pages without an og:title meta tag are dropped.
        """
        # (Removed the commented-out template scaffolding and the disabled
        # news_article extraction — both were dead code.)
        item = NewsScrapItem()
        item[u'news_title'] = response.xpath(
            u'//head/meta[@property="og:title"]/@content').extract_first()
        item["news_link"] = response.url
        if item['news_title']:
            yield item
Example #7
0
    def parse(self, response):
        """Follow every non-image link on the page, then scrape this page itself."""
        for href in response.xpath('//a/@href').extract():
            # Guard clause: skip direct links to image files, crawl the rest.
            if href.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
                continue
            absolute = urlparse.urljoin(response.url, href)
            yield scrapy.Request(absolute, callback=self.parse)

        item = NewsScrapItem()
        item[u'news_title'] = response.xpath(
            u'//head/meta[@property="og:title"]/@content').extract_first()
        item["news_link"] = response.url
        # Emit only pages that carry an og:title meta tag.
        if item['news_title']:
            yield item
Example #8
0
 def parse(self, response):
     """Scrape article pages (URLs containing "news_<digits>") and keep crawling.

     Matching pages yield a NewsScrapItem; every non-image link on any page
     is followed recursively with this same callback.
     """
     # Fixes: dropped the unused `img` local, and made the pattern a raw
     # string so `\d` is a regex digit class rather than an invalid string
     # escape (a DeprecationWarning in modern Python).
     if re.search(r"news_\d*", response.url):
         item = NewsScrapItem()
         item[u'news_title'] = response.xpath(
             u'//h1/text()').extract_first()
         item["news_link"] = response.url
         item["news_article"] = ''.join(
             response.xpath(
                 u'//div[@class="td-post-content"]').extract()).strip()
         yield item
     for link in response.xpath('//a/@href').extract():
         if not link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
             link = urlparse.urljoin(response.url, link)
             yield scrapy.Request(link, callback=self.parse)
Example #9
0
 def parse(self, response):
     """Return an item holding the h1 title, page URL and first article paragraph."""
     first_paragraph = response.xpath("//article//p").extract_first()
     item = NewsScrapItem()
     item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
     item["news_link"] = response.url
     item["news_article"] = first_paragraph
     return item
Example #10
0
 def parse_data(self, response):
     """Yield an item with the og:title, page URL and joined post paragraphs."""
     item = NewsScrapItem()
     item[u'news_title'] = response.xpath(
         u'//head/meta[@property="og:title"]/@content').extract_first()
     item["news_link"] = response.url
     paragraphs = response.xpath(u'//div[@class="postmain"]//p').extract()
     item["news_article"] = ''.join(paragraphs).strip()
     yield item