# Imports these callbacks rely on (the items module path is assumed
# from the item class name):
import json
import logging
import re

from scrapy import FormRequest
from scrapy.selector import Selector

from scrapenews.items import ScrapenewsItem

logger = logging.getLogger(__name__)


def parse(self, response):
    for news in response.css('div.news-card'):
        item = ScrapenewsItem()
        # The style attribute looks like "background-image: url('...')";
        # the slice strips that prefix and the closing quote/parenthesis.
        item['image'] = news.css(
            'div.news-card-image::attr(style)').extract_first()[23:-3]
        item['title'] = news.css('a.clickable>span::text').extract_first()
        item['content'] = news.css(
            'div[itemprop*=articleBody]::text').extract_first()
        # Drop the trailing UTC offset (e.g. "+0530") from the timestamp.
        item['newsDate'] = news.css(
            'span.time::attr(content)').extract_first()[:-5]
        item['link'] = news.css(
            'div.read-more>a::attr(href)').extract_first()
        item['source'] = 105
        yield item

    # news_id extraction: the offset for the AJAX pagination endpoint is
    # embedded in the last inline <script> block.
    pattern = re.compile(r'var min_news_id\s+=\s+"(.*?)"')
    js = response.xpath(
        '//script[@type="text/javascript"]/text()').extract()[-1]
    self.news_id = pattern.search(js).group(1)

    while self.pages > 1 and not self.infinite:
        yield FormRequest('https://www.inshorts.com/en/ajax/more_news',
                          formdata={'news-offset': self.news_id},
                          callback=self.parse_more_news,
                          errback=self.errorRequestHandler,
                          dont_filter=True)
        self.pages -= 1

    # Infinite mode: keep requesting more pages; parse_more_news updates
    # self.news_id as responses arrive, and dont_filter=True stops the
    # dupefilter from pruning the repeated POSTs.
    while self.infinite:
        yield FormRequest('https://www.inshorts.com/en/ajax/more_news',
                          formdata={'news-offset': self.news_id},
                          callback=self.parse_more_news,
                          errback=self.errorRequestHandler,
                          dont_filter=True)

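# errorRequestHandler is referenced as the errback above but not shown in
# this section; a minimal sketch, assuming it only needs to log the failure
# and count the dropped request (the counter name follows the urls_*
# pattern used elsewhere in these spiders):
def errorRequestHandler(self, failure):
    self.urls_dropped += 1
    logger.error(__name__ + " Request failed: " + repr(failure))
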
def parse_more_news(self, response):
    try:
        ajax_response = json.loads(response.text)
        self.news_id = ajax_response['min_news_id']
        # The endpoint returns an HTML fragment inside the JSON payload;
        # re-parse it so the card selectors from parse() can be reused.
        html = Selector(text=ajax_response['html'])
        for news in html.css('div.news-card'):
            self.urls_parsed += 1
            try:
                item = ScrapenewsItem()
                item['image'] = news.css(
                    'div.news-card-image::attr(style)'
                ).extract_first()[23:-3]
                item['title'] = news.css(
                    'a.clickable>span::text').extract_first()
                item['content'] = news.css(
                    'div[itemprop*=articleBody]::text').extract_first()
                item['newsDate'] = news.css(
                    'span.time::attr(content)').extract_first()[:-5]
                item['link'] = news.css(
                    'div.read-more>a::attr(href)').extract_first()
                item['source'] = 105
                yield item
                self.urls_scraped += 1
            except Exception as e:
                logger.error(__name__ +
                             " [UNHANDLED] Unable to Extract Data : " +
                             str(e))
                self.urls_dropped += 1
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) +
                     " for response url " + response.url)

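# A minimal, illustrative sketch of the payload shape parse_more_news
# assumes (the values are made up, not captured from the live endpoint):
sample = {
    "min_news_id": "abc-123",
    "html": '<div class="news-card"><a class="clickable">'
            '<span>Headline</span></a></div>',
}
fragment = Selector(text=sample["html"])
assert fragment.css('a.clickable>span::text').extract_first() == "Headline"
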
def parse_article(self, response):
    item = ScrapenewsItem()  # Scraper Items
    item['image'] = self.getPageImage(response)
    item['title'] = self.getPageTitle(response)
    item['content'] = self.getPageContent(response)
    item['newsDate'] = self.getPageDate(response)
    item['link'] = response.url
    item['source'] = 108
    # The helpers return the sentinel string 'Error' on failure; compare
    # with != rather than the identity check `is not`.
    if (item['title'] != 'Error' and item['content'] != 'Error'
            and item['newsDate'] != 'Error'):
        self.urls_scraped += 1
        yield item

def parse_article(self, response):
    item = ScrapenewsItem()  # Scraper Items
    item['image'] = self.parse_image(response)
    item['title'] = self.parse_title(response)
    item['content'] = self.parse_content(response)
    item['link'] = response.url
    item['newsDate'] = self.parse_date(response)
    item['source'] = 104
    if (item['image'] is not None and item['title'] is not None
            and item['content'] is not None
            and item['newsDate'] is not None):
        yield item

def parse_article(self, response):
    i = ScrapenewsItem()
    i['title'] = response.xpath("//h1/text()").extract_first()
    i['link'] = response.url
    i['image'] = response.xpath(
        '//div[contains(@itemprop, "image")]//img/@src').extract_first()
    i['newsDate'] = response.xpath(
        '//span[contains(@itemprop, "date")]/text()').extract_first()
    # Note: extract() returns a list of paragraph strings here, unlike the
    # single-string extract_first() used by the other spiders.
    i['content'] = response.xpath(
        '//div[@itemprop ="articleBody"]//p/text()').extract()
    i['source'] = 117
    yield i

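# If the item pipeline expects a single content string (as the other
# spiders yield), the paragraph list above can be normalised first; a
# sketch, assuming whitespace-joined paragraphs are acceptable
# (join_paragraphs is a hypothetical helper, not part of the project):
def join_paragraphs(paragraphs):
    return ' '.join(p.strip() for p in paragraphs if p.strip())
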
def parse_article(self, response):
    try:
        item = ScrapenewsItem()  # Scraper Items
        item['image'] = self.getPageImage(response)
        item['title'] = self.getPageTitle(response)
        item['content'] = self.getPageContent(response)
        item['newsDate'] = self.getPageDate(response)
        item['link'] = response.url
        item['source'] = 115
        # Equality, not identity, against the 'Error' sentinel.
        if (item['title'] != 'Error' and item['content'] != 'Error'
                and item['newsDate'] != 'Error'):
            self.urls_scraped += 1
            yield item
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) +
                     " for response url " + response.url)

def parse_news(self, response):
    i = ScrapenewsItem()
    i['title'] = response.xpath(
        '//h1[contains(@class, "article-heading margin")]/text()'
    ).extract_first()  # scrapes headline
    # The slice trims fixed-width label text around the timestamp
    # (site-specific; verify against the live markup).
    i['newsDate'] = response.xpath(
        '//span[contains(@class, "date")]/text()').extract_first()[10:-4]
    i['image'] = self.getimage(response)
    i['content'] = self.getcontent(response)
    i['link'] = response.url  # article page link
    i['source'] = 106
    yield i

def parse_article(self, response):
    # Skip photo-gallery pages, which have no article body.
    if str(response.url)[:32] != "http://www.firstpost.com/photos/":
        item = ScrapenewsItem()  # Scraper Items
        item['image'] = self.getPageImage(response)
        item['title'] = self.getPageTitle(response)
        item['content'] = self.getPageContent(response)
        item['newsDate'] = self.getPageDate(response)
        item['link'] = response.url
        item['source'] = 112
        if (item['title'] != 'Error' and item['content'] != 'Error'
                and item['link'] != 'Error'
                and item['newsDate'] != 'Error'):
            self.urls_scraped += 1
            yield item

def parse_more_news(self, response):
    ajax_response = json.loads(response.text)
    self.news_id = ajax_response['min_news_id']
    html = Selector(text=ajax_response['html'])
    for news in html.css('div.news-card'):
        item = ScrapenewsItem()
        item['image'] = news.css(
            'div.news-card-image::attr(style)').extract_first()[23:-3]
        item['title'] = news.css('a.clickable>span::text').extract_first()
        item['content'] = news.css(
            'div[itemprop*=articleBody]::text').extract_first()
        item['newsDate'] = news.css(
            'span.time::attr(content)').extract_first()[:-5]
        item['link'] = news.css(
            'div.read-more>a::attr(href)').extract_first()
        item['source'] = 105
        yield item

def parse_news(self, response):
    try:
        item = ScrapenewsItem()
        item['link'] = response.url
        item['source'] = self.custom_settings['site_id']

        # Pick a section-specific parser when the URL path contains one of
        # the configured section slugs; otherwise fall back to "default".
        news_parser = "default"
        for parser_str in self.xpaths:
            match = r'\/' + re.escape(parser_str) + r'\/'
            if re.search(match, item['link']) is not None:
                news_parser = parser_str
                break
        logger.debug(__name__ + " Using " + news_parser +
                     " parser for url " + response.url)

        item['title'] = response.xpath(
            self.xpaths[news_parser]['title']).extract_first()
        item['content'] = response.xpath(
            self.xpaths[news_parser]['description']).extract()[0]
        item['image'] = response.xpath(
            self.xpaths[news_parser]['image']).extract_first()
        item['newsDate'] = response.xpath(
            self.xpaths[news_parser]['date']).extract_first()

        # Remove escaped characters and surrounding whitespace.
        item['content'] = re.sub(r"[\t\r\n]+", "", item['content']).strip()

        # Drop the item if any required field (image is optional) is
        # missing; `return` ends the callback here, where the original
        # `yield None` fell through and yielded the item anyway.
        for key in item:
            if item[key] is None and key != 'image':
                logger.error(__name__ +
                             " A Required Key wasn't extracted: " + key)
                self.urls_dropped += 1
                return

        yield item
        self.urls_scraped += 1
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] : " + str(e) +
                     " for response url " + response.url)
        self.urls_dropped += 1

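# parse_news above is driven by a self.xpaths mapping that isn't shown in
# this section; an illustrative sketch of the shape it assumes (the
# "sports" slug and every XPath string here are made up, not taken from
# the project's configuration):
xpaths = {
    "default": {
        "title": '//h1/text()',
        "description": '//div[@itemprop="articleBody"]//text()',
        "image": '//meta[@property="og:image"]/@content',
        "date": '//meta[@property="article:published_time"]/@content',
    },
    "sports": {
        "title": '//h1[@class="story-title"]/text()',
        "description": '//div[@class="story-body"]//text()',
        "image": '//meta[@property="og:image"]/@content',
        "date": '//span[@class="story-date"]/text()',
    },
}
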
def parse_article(self, response):
    try:
        item = ScrapenewsItem()  # Scraper Items
        item['image'] = self.parse_image(response)
        item['title'] = self.parse_title(response)
        item['content'] = self.parse_content(response)
        item['link'] = response.url
        # Don't stringify yet: str(None) would become the string 'None'
        # and slip past the None guard below.
        item['newsDate'] = self.parse_date(response)
        item['source'] = 104
        if (item['image'] is not None and item['title'] is not None
                and item['content'] is not None
                and item['newsDate'] is not None):
            item['newsDate'] = str(item['newsDate'])
            self.urls_scraped += 1
            yield item
        else:
            self.urls_dropped += 1
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) +
                     " for response url " + response.url)

def fun(self, response):
    headline = response.xpath(
        '//h1[@itemprop="headline"]/text()').extract_first()
    images = response.xpath(
        '//div[@class="thumbnail"]/img/@src').extract_first()
    body = response.xpath(
        '//div[@itemprop="articlebody"]/p').extract_first()
    date = response.css('span.text-dt::text').extract_first()
    item = ScrapenewsItem({
        'title': headline,
        'link': response.url,
        'newsDate': date,
        'content': body,
        'image': images,
        'source': 114
    })
    self.urls_scraped += 1
    yield item

def parse_article(self, response):
    try:
        i = ScrapenewsItem()
        i['title'] = self.gettitle(response)
        i['link'] = self.getlink(response)  # response.url
        i['image'] = self.getimage(response)
        i['newsDate'] = self.getdatetime(response)
        i['content'] = self.getcontent(response)
        i['source'] = 117
        if (i['title'] != 'Error' and i['content'] != 'Error'
                and i['link'] != 'Error' and i['newsDate'] != 'Error'):
            self.urls_scraped += 1
            yield i
        else:
            self.urls_dropped += 1
            yield None
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) +
                     " for response url " + response.url)
        yield None

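# The get* helpers above aren't shown in this section; they follow an
# 'Error' sentinel convention that the guard relies on. A minimal sketch
# of one of them, assuming the headline lives in the page's first <h1>
# (the XPath is illustrative, not the project's actual selector):
def gettitle(self, response):
    title = response.xpath('//h1/text()').extract_first()
    return title if title else 'Error'
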
def parse_article(self, response):
    # Skip the home page, video pages, photo galleries, and URLs already
    # stored in the database.
    if (str(response.url) != "https://hindi.firstpost.com/"
            and not response.xpath("//div[@id='play_home_video']")
            and not response.xpath(
                '//div[contains(@class,"pht-artcl-top")]')
            and not self.postgres.checkUrlExists(response.url)):
        self.urls_parsed -= 1
        item = ScrapenewsItem()  # Scraper Items
        item['image'] = self.getPageImage(response)
        item['title'] = self.getPageTitle(response)
        item['content'] = self.getPageContent(response)
        item['newsDate'] = self.getPageDate(response)
        item['link'] = response.url
        item['source'] = 111
        if (item['title'] != 'Error' and item['content'] != 'Error'
                and item['link'] != 'Error'
                and item['newsDate'] != 'Error'):
            self.urls_scraped += 1
            yield item
        else:
            self.urls_parsed -= 1
            yield None

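# postgres.checkUrlExists is the project's duplicate-URL guard and isn't
# shown in this section; a minimal sketch, assuming a psycopg2 cursor and
# a hypothetical news_table with a link column (both names are
# assumptions, not the project's schema):
def checkUrlExists(self, url):
    self.cursor.execute(
        "SELECT 1 FROM news_table WHERE link = %s LIMIT 1", (url,))
    return self.cursor.fetchone() is not None
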
def parse_news(self, response):
    try:
        i = ScrapenewsItem()
        i['title'] = self.gettitle(response)        # scrapes headline
        i['newsDate'] = self.getdatetime(response)  # scrapes date
        i['image'] = self.getimage(response)        # scrapes image link
        i['content'] = self.getcontent(response)    # scrapes content
        i['link'] = response.url                    # scrapes link
        i['source'] = 106                           # database entry
        # Drop the item if any helper returned the 'Error' sentinel;
        # for/else replaces the original flag variable with no change
        # in behaviour.
        for x in i:
            if i[x] == 'Error':
                self.urls_dropped += 1
                yield None
                break
        else:
            self.urls_scraped += 1
            yield i
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) +
                     " for response url " + response.url)
        yield None

def parse_article(self, response):
    try:
        # Skip photo-gallery pages, which have no article body.
        if str(response.url)[:32] != "http://www.firstpost.com/photos/":
            item = ScrapenewsItem()  # Scraper Items
            item['image'] = self.getPageImage(response)
            item['title'] = self.getPageTitle(response)
            item['content'] = self.getPageContent(response)
            item['newsDate'] = self.getPageDate(response)
            item['link'] = response.url
            item['source'] = 112
            if (item['title'] != 'Error' and item['content'] != 'Error'
                    and item['link'] != 'Error'
                    and item['newsDate'] != 'Error'):
                self.urls_scraped += 1
                yield item
            else:
                self.urls_dropped += 1
                yield None
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) +
                     " for response url " + response.url)
        self.urls_dropped += 1