Example #1
0
    def parse(self, response):
        """Scrape the first page of news cards and queue the follow-up pages.

        Yields a ``ScrapenewsItem`` for every ``div.news-card`` in
        ``response``, stores the ``min_news_id`` found in the last inline
        script on ``self.news_id``, and then yields ``FormRequest`` objects
        for the "more news" AJAX endpoint: while ``self.pages > 1`` in the
        finite mode (decrementing ``self.pages`` each time), or for as long
        as ``self.infinite`` is truthy.

        A card without an image style or a time attribute gets ``None`` for
        that field instead of aborting the whole callback with the
        ``TypeError`` raised by slicing ``None``.
        """
        for news in response.css('div.news-card'):
            item = ScrapenewsItem()
            style = news.css(
                'div.news-card-image::attr(style)').extract_first()
            # Fixed-width trim of the text wrapped around the image URL
            # (presumably "background-image: url('...')" -- TODO confirm).
            item['image'] = style[23:-3] if style is not None else None
            item['title'] = news.css('a.clickable>span::text').extract_first()
            item['content'] = news.css(
                'div[itemprop*=articleBody]::text').extract_first()
            stamp = news.css('span.time::attr(content)').extract_first()
            # Drops the last five characters of the timestamp attribute.
            item['newsDate'] = stamp[:-5] if stamp is not None else None
            item['link'] = news.css(
                'div.read-more>a::attr(href)').extract_first()
            item['source'] = 105
            yield item

        # news_id extraction. Raw string: "\s" must reach the regex engine
        # untouched instead of being read as an (invalid) string escape.
        pattern = re.compile(r'var min_news_id\s+=\s+"(.*?)"')
        js = response.xpath(
            '//script[@type="text/javascript"]/text()').extract()[-1]
        self.news_id = pattern.search(js).group(1)

        def more_news_request():
            # self.news_id is read at call time, so each request carries
            # whatever offset is current when it is created.
            return FormRequest('https://www.inshorts.com/en/ajax/more_news',
                               formdata={'news-offset': self.news_id},
                               callback=self.parse_more_news,
                               errback=self.errorRequestHandler,
                               dont_filter=True)

        # Finite mode: one extra request per remaining page.
        while self.pages > 1 and not self.infinite:
            yield more_news_request()
            self.pages -= 1

        # Endless mode: keeps producing requests while the flag stays set.
        while self.infinite:
            yield more_news_request()
Example #2
0
 def parse_more_news(self, response):
     """Handle one JSON reply from the "more news" AJAX endpoint.

     Updates ``self.news_id`` from the reply's ``min_news_id`` and yields a
     ``ScrapenewsItem`` for each ``div.news-card`` in the embedded HTML.
     ``self.urls_parsed`` counts every card seen, ``self.urls_scraped``
     every card yielded and ``self.urls_dropped`` every card whose
     extraction raised.  Any other failure is logged with the response URL.
     """
     try:
         payload = json.loads(response.text)
         self.news_id = payload['min_news_id']
         cards = Selector(text=payload['html']).css('div.news-card')
         for card in cards:
             self.urls_parsed += 1
             try:
                 item = ScrapenewsItem()
                 style = card.css(
                     'div.news-card-image::attr(style)').extract_first()
                 # Slicing None raises here; the handler below then counts
                 # the card as dropped.
                 item['image'] = style[23:-3]
                 item['title'] = card.css(
                     'a.clickable>span::text').extract_first()
                 item['content'] = card.css(
                     'div[itemprop*=articleBody]::text').extract_first()
                 stamp = card.css(
                     'span.time::attr(content)').extract_first()
                 item['newsDate'] = stamp[:-5]
                 item['link'] = card.css(
                     'div.read-more>a::attr(href)').extract_first()
                 item['source'] = 105
                 yield item
                 self.urls_scraped += 1
             except Exception as err:
                 message = (__name__ +
                            " [UNHANDLED] Unable to Extract Data : " +
                            str(err))
                 logger.error(message)
                 self.urls_dropped += 1
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] " + str(err) +
                      " for response url " + response.url)
Example #3
0
 def parse_article(self, response):
     """Build a ``ScrapenewsItem`` for an article page; yield it if valid.

     Fields come from the ``getPage*`` helpers plus ``response.url``; the
     source id is fixed at 108.  The item is yielded, and
     ``self.urls_scraped`` incremented, only when title, content and
     newsDate all differ from the string ``'Error'``.

     The check uses ``!=``: an identity test against a string literal
     depends on string interning and raises a ``SyntaxWarning`` on
     Python 3.8+.
     """
     item = ScrapenewsItem()  # Scraper Items
     item['image'] = self.getPageImage(response)
     item['title'] = self.getPageTitle(response)
     item['content'] = self.getPageContent(response)
     item['newsDate'] = self.getPageDate(response)
     item['link'] = response.url
     item['source'] = 108
     required = ('title', 'content', 'newsDate')
     if all(item[field] != 'Error' for field in required):
         self.urls_scraped += 1
         yield item
Example #4
0
 def parse_article(self, response):
     """Assemble a ``ScrapenewsItem`` from ``response`` and yield it.

     Image, title, content and date come from the ``parse_*`` helpers, the
     link is ``response.url`` and the source id is 104.  Nothing is yielded
     when any of image, title, content or newsDate is ``None``.
     """
     article = ScrapenewsItem()  # Scraper Items
     article['image'] = self.parse_image(response)
     article['title'] = self.parse_title(response)
     article['content'] = self.parse_content(response)
     article['link'] = response.url
     article['newsDate'] = self.parse_date(response)
     article['source'] = 104

     mandatory = ('image', 'title', 'content', 'newsDate')
     if any(article[field] is None for field in mandatory):
         return
     yield article
Example #5
0
    def parse_article(self, response):
        """Extract one article from ``response`` and yield it as an item.

        Title, image and date are the first XPath match (or ``None``);
        content is the list of all paragraph text nodes under the
        ``articleBody`` div.  The source id is 117.
        """
        headline = response.xpath("//h1/text()").extract_first()
        picture = response.xpath(
            '//div[contains(@itemprop, "image")]//img/@src').extract_first()
        published = response.xpath(
            '//span[contains(@itemprop, "date")]/text()').extract_first()
        paragraphs = response.xpath(
            '//div[@itemprop ="articleBody"]//p/text()').extract()

        article = ScrapenewsItem()
        # Same assignment order as before so the item's key order is kept.
        article['title'] = headline
        article['link'] = response.url
        article['image'] = picture
        article['newsDate'] = published
        article['content'] = paragraphs
        article['source'] = 117

        yield article
Example #6
0
 def parse_article(self, response):
     """Build a ``ScrapenewsItem`` for an article page; yield it if valid.

     Fields come from the ``getPage*`` helpers plus ``response.url``; the
     source id is fixed at 115.  The item is yielded, and
     ``self.urls_scraped`` incremented, only when title, content and
     newsDate all differ from the string ``'Error'``.  Any exception is
     logged together with the response URL.

     The check uses ``!=``: an identity test against a string literal
     depends on string interning and raises a ``SyntaxWarning`` on
     Python 3.8+.
     """
     try:
         item = ScrapenewsItem()  # Scraper Items
         item['image'] = self.getPageImage(response)
         item['title'] = self.getPageTitle(response)
         item['content'] = self.getPageContent(response)
         item['newsDate'] = self.getPageDate(response)
         item['link'] = response.url
         item['source'] = 115
         required = ('title', 'content', 'newsDate')
         if all(item[field] != 'Error' for field in required):
             self.urls_scraped += 1
             yield item
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] " + str(e) +
                      " for response url " + response.url)
Example #7
0
    def parse_news(self, response):
        """Extract one news article from ``response`` and yield it.

        The headline is the first matching ``h1`` text, the date is a fixed
        slice of the first ``span`` whose class contains "date", and image
        and content come from ``self.getimage`` / ``self.getcontent``.  The
        source id is 106.

        When no date span is found, ``newsDate`` is set to ``None`` instead
        of raising ``TypeError`` from slicing ``None``.
        """
        i = ScrapenewsItem()
        i['title'] = response.xpath(
            '//h1[contains(@class, "article-heading margin")]/text()'
        ).extract_first()  # scrapes headline
        raw_date = response.xpath(
            '//span[contains(@class, "date")]/text()').extract_first()
        # Keeps characters 10 .. -4 of the raw text (scrapes datetime).
        i['newsDate'] = raw_date[10:-4] if raw_date is not None else None
        i['image'] = self.getimage(response)
        i['content'] = self.getcontent(response)
        i['link'] = response.url  # scrapes link; article page
        i['source'] = 106

        yield i
Example #8
0
 def parse_article(self, response):
     """Build a ``ScrapenewsItem`` for a non-photo article page.

     URLs starting with ``http://www.firstpost.com/photos/`` yield nothing.
     Otherwise the fields come from the ``getPage*`` helpers plus
     ``response.url`` (source id 112), and the item is yielded, with
     ``self.urls_scraped`` incremented, only when title, content, link and
     newsDate all differ from the string ``'Error'``.

     The check uses ``!=``: an identity test against a string literal
     depends on string interning and raises a ``SyntaxWarning`` on
     Python 3.8+.
     """
     if str(response.url).startswith("http://www.firstpost.com/photos/"):
         return

     item = ScrapenewsItem()  # Scraper Items
     item['image'] = self.getPageImage(response)
     item['title'] = self.getPageTitle(response)
     item['content'] = self.getPageContent(response)
     item['newsDate'] = self.getPageDate(response)
     item['link'] = response.url
     item['source'] = 112
     required = ('title', 'content', 'link', 'newsDate')
     if all(item[field] != 'Error' for field in required):
         self.urls_scraped += 1
         yield item
Example #9
0
 def parse_more_news(self, response):
     """Handle one JSON reply from the "more news" AJAX endpoint.

     Updates ``self.news_id`` from the reply's ``min_news_id`` and yields a
     ``ScrapenewsItem`` (source id 105) for every ``div.news-card`` in the
     embedded HTML.

     A card without an image style or a time attribute gets ``None`` for
     that field, so one incomplete card no longer stops the remaining
     cards with a ``TypeError`` raised by slicing ``None``.
     """
     ajax_response = json.loads(response.text)
     self.news_id = ajax_response['min_news_id']
     html = Selector(text=ajax_response['html'])
     for news in html.css('div.news-card'):
         item = ScrapenewsItem()
         style = news.css(
             'div.news-card-image::attr(style)').extract_first()
         # Fixed-width trim of the text wrapped around the image URL
         # (presumably "background-image: url('...')" -- TODO confirm).
         item['image'] = style[23:-3] if style is not None else None
         item['title'] = news.css('a.clickable>span::text').extract_first()
         item['content'] = news.css(
             'div[itemprop*=articleBody]::text').extract_first()
         stamp = news.css('span.time::attr(content)').extract_first()
         # Drops the last five characters of the timestamp attribute.
         item['newsDate'] = stamp[:-5] if stamp is not None else None
         item['link'] = news.css(
             'div.read-more>a::attr(href)').extract_first()
         item['source'] = 105
         yield item
Example #10
0
    def parse_news(self, response):
        """Extract one article using the XPath set that matches its URL.

        The parser name is the first key of ``self.xpaths`` that appears as
        a ``/name/`` path segment of the URL, falling back to ``"default"``.
        Title, content, image and date are read with that parser's XPaths;
        tabs, carriage returns, newlines and surrounding whitespace are
        stripped from the content.

        If any field other than ``image`` is ``None``, every missing key is
        logged, ``self.urls_dropped`` is incremented once, ``None`` is
        yielded and the item is NOT emitted.  Previously the incomplete
        item was still yielded and counted as scraped, and the dropped
        counter grew once per missing key.  Otherwise the item is yielded
        and ``self.urls_scraped`` incremented.  Any exception is logged
        with the URL and counted as dropped.
        """
        try:
            item = ScrapenewsItem()

            item['link'] = response.url
            item['source'] = self.custom_settings['site_id']
            news_parser = "default"

            for parser_str in self.xpaths:
                match = r'\/' + re.escape(parser_str) + r'\/'
                if re.search(match, item['link']) is not None:
                    news_parser = parser_str
                    break

            logger.debug(__name__ + " Using " + news_parser +
                         " parser for url " + response.url)

            xpaths = self.xpaths[news_parser]
            item['title'] = response.xpath(xpaths['title']).extract_first()
            # [0] raises IndexError when nothing matches; the handler below
            # then counts the URL as dropped.
            item['content'] = response.xpath(
                xpaths['description']).extract()[0]
            item['image'] = response.xpath(xpaths['image']).extract_first()
            item['newsDate'] = response.xpath(xpaths['date']).extract_first()

            # Remove Escaped Characters
            item['content'] = re.sub(r"[\t\r\n]{0,}", "", item['content'])

            # Remove Unwanted Spaces from start and end
            item['content'] = re.sub(r"^\s{0,}|\s{0,}$", "", item['content'])

            # Every key except the optional image must have a value.
            missing = [key for key in item
                       if item[key] is None and key != 'image']
            if missing:
                for key in missing:
                    logger.error(__name__ +
                                 " A Required Key wasn't extracted: " + key)
                self.urls_dropped += 1
                yield None
                return

            yield item
            self.urls_scraped += 1

        except Exception as e:
            logger.error(__name__ + " [UNHANDLED] : " + str(e) +
                         " for response url " + response.url)
            self.urls_dropped += 1
            self.urls_dropped += 1
Example #11
0
 def parse_article(self, response):
     """Build a ``ScrapenewsItem`` for an article page; yield it if valid.

     Image, title, content and date come from the ``parse_*`` helpers, the
     link is ``response.url`` and the source id is 104.  The item is
     yielded and ``self.urls_scraped`` incremented only when image, title,
     content and newsDate are all present; otherwise
     ``self.urls_dropped`` is incremented.  Any exception is logged with
     the response URL.

     The date is converted with ``str()`` only when ``parse_date`` returned
     a value.  Converting unconditionally turned ``None`` into the string
     ``'None'``, which made the ``newsDate`` check always pass.
     """
     try:
         item = ScrapenewsItem()  # Scraper Items
         item['image'] = self.parse_image(response)
         item['title'] = self.parse_title(response)
         item['content'] = self.parse_content(response)
         item['link'] = response.url
         parsed_date = self.parse_date(response)
         item['newsDate'] = (None if parsed_date is None
                             else str(parsed_date))
         item['source'] = 104
         required = ('image', 'title', 'content', 'newsDate')
         if all(item[field] is not None for field in required):
             self.urls_scraped += 1
             yield item
         else:
             self.urls_dropped += 1
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] " + str(e) +
                      " for response url " + response.url)
Example #12
0
    def fun(self, response):
        """Extract one article from ``response`` and yield it as an item.

        Headline, thumbnail image and body are the first XPath match (or
        ``None``); the body is the serialized first ``<p>`` element under
        the ``articlebody`` div.  The date is the first ``span.text-dt``
        text node.  The source id is 114 and ``self.urls_scraped`` is
        incremented before the item is yielded.
        """
        headline = response.xpath(
            '//h1[@itemprop="headline"]/text()').extract_first()
        images = response.xpath(
            '//div[@class="thumbnail"]/img/@src').extract_first()
        # NOTE(review): itemprop is matched as lowercase "articlebody" --
        # confirm this is the spelling the target pages use.
        body = response.xpath(
            '//div[@itemprop="articlebody"]/p').extract_first()
        date = response.css('span.text-dt::text').extract_first()

        item = ScrapenewsItem({
            'title': headline,
            'link': response.url,
            'newsDate': date,
            'content': body,
            'image': images,
            'source': 114,
        })
        self.urls_scraped += 1
        yield item
Example #13
0
    def parse_article(self, response):
        """Build a ``ScrapenewsItem`` for an article page; yield it if valid.

        Fields come from the ``get*`` helpers; the source id is 117.  When
        title, content, link and newsDate all differ from the string
        ``'Error'`` the item is yielded and ``self.urls_scraped``
        incremented; otherwise ``self.urls_dropped`` is incremented and
        ``None`` is yielded.  Any exception is logged with the response URL
        and ``None`` is yielded.

        The check uses ``!=``: an identity test against a string literal
        depends on string interning and raises a ``SyntaxWarning`` on
        Python 3.8+.
        """
        try:
            i = ScrapenewsItem()
            i['title'] = self.gettitle(response)
            i['link'] = self.getlink(response)  # response.url
            i['image'] = self.getimage(response)
            i['newsDate'] = self.getdatetime(response)
            i['content'] = self.getcontent(response)
            i['source'] = 117

            required = ('title', 'content', 'link', 'newsDate')
            if all(i[field] != 'Error' for field in required):
                self.urls_scraped += 1
                yield i
            else:
                self.urls_dropped += 1
                yield None
        except Exception as e:
            # Log tag spelled "[UNHANDLED]" (was "[UNHNADLED]") so that log
            # searches for the tag find this message.
            logger.error(__name__ + " [UNHANDLED] " + str(e) +
                         " for response url " + response.url)
            yield None
Example #14
0
 def parse_article(self, response):
     """Build a ``ScrapenewsItem`` for an eligible article page.

     A response is skipped (``None`` is yielded) when it is the
     ``https://hindi.firstpost.com/`` home page, contains a
     ``play_home_video`` div or a ``pht-artcl-top`` div, or when
     ``self.postgres.checkUrlExists`` reports the URL.  Otherwise the
     fields come from the ``getPage*`` helpers (source id 111) and the
     item is yielded, with ``self.urls_scraped`` incremented, only when
     title, content, link and newsDate all differ from ``'Error'``.

     The check uses ``!=``: an identity test against a string literal
     depends on string interning and raises a ``SyntaxWarning`` on
     Python 3.8+.
     """
     # Checks run in the original order, so the database lookup only
     # happens when the cheaper tests did not already rule the page out.
     skip = (str(response.url) == "https://hindi.firstpost.com/"
             or response.xpath("//div[@id='play_home_video']")
             or response.xpath('//div[contains(@class,"pht-artcl-top")]')
             or self.postgres.checkUrlExists(response.url))

     # NOTE(review): the counter is decremented on both paths -- presumably
     # it offsets an increment made elsewhere; confirm against the caller.
     self.urls_parsed -= 1
     if skip:
         yield None
         return

     item = ScrapenewsItem()  # Scraper Items
     item['image'] = self.getPageImage(response)
     item['title'] = self.getPageTitle(response)
     item['content'] = self.getPageContent(response)
     item['newsDate'] = self.getPageDate(response)
     item['link'] = response.url
     item['source'] = 111
     required = ('title', 'content', 'link', 'newsDate')
     if all(item[field] != 'Error' for field in required):
         self.urls_scraped += 1
         yield item
Example #15
0
    def parse_news(self,response):
        """Extract one news article and yield it unless a field failed.

        Fields come from the ``get*`` helpers plus ``response.url``; the
        source id is 106.  If any field equals the string ``'Error'``,
        ``self.urls_dropped`` is incremented and ``None`` is yielded;
        otherwise ``self.urls_scraped`` is incremented and the item is
        yielded.  Any exception is logged with the response URL and
        ``None`` is yielded.
        """
        try:
            article = ScrapenewsItem()
            article['title'] = self.gettitle(response)   # headline
            article['newsDate'] = self.getdatetime(response)  # date
            article['image'] = self.getimage(response)    # image link
            article['content'] = self.getcontent(response)    # content
            article['link'] = response.url    # article page link
            article['source'] = 106   # database entry

            # Stops at the first failed field, like the flag loop it replaces.
            if any(article[field] == 'Error' for field in article):
                self.urls_dropped += 1
                yield None
            else:
                self.urls_scraped += 1
                yield article
        except Exception as err:
            logger.error(__name__ + " [UNHANDLED] " + str(err) + " for response url " + response.url)
            yield None
Example #16
0
 def parse_article(self, response):
     """Build a ``ScrapenewsItem`` for a non-photo article page.

     URLs starting with ``http://www.firstpost.com/photos/`` yield nothing.
     Otherwise the fields come from the ``getPage*`` helpers plus
     ``response.url`` (source id 112).  When title, content, link and
     newsDate all differ from the string ``'Error'`` the item is yielded
     and ``self.urls_scraped`` incremented; otherwise
     ``self.urls_dropped`` is incremented and ``None`` is yielded.  Any
     exception is logged with the response URL and counted as dropped.

     The check uses ``!=``: an identity test against a string literal
     depends on string interning and raises a ``SyntaxWarning`` on
     Python 3.8+.
     """
     try:
         if str(response.url).startswith("http://www.firstpost.com/photos/"):
             return

         item = ScrapenewsItem()  # Scraper Items
         item['image'] = self.getPageImage(response)
         item['title'] = self.getPageTitle(response)
         item['content'] = self.getPageContent(response)
         item['newsDate'] = self.getPageDate(response)
         item['link'] = response.url
         item['source'] = 112
         required = ('title', 'content', 'link', 'newsDate')
         if all(item[field] != 'Error' for field in required):
             self.urls_scraped += 1
             yield item
         else:
             self.urls_dropped += 1
             yield None
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] " + str(e) +
                      " for response url " + response.url)
         self.urls_dropped += 1