def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = parser.parse(
        get_first(
            response.selector.xpath(
                '//meta[@property="vr:published_time"]/@content'
            ).extract())).isoformat().encode('utf-8')
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:description"]/@content').extract()).strip()
    # Note the trailing space in the class name; it matches the site markup.
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.xpath(
            '//div[@class="main-text "]/p/text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="author"]/@content').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    item['resource'] = self.name
    item['publication_id'] = hashlib.sha1(
        str(item['url']) + str(item['published'])).hexdigest()
    return item
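# All of the parse_page callbacks in this section rely on the same
# module-level context. A minimal sketch of the imports they assume,
# plus a get_first helper whose body is an assumption inferred from its
# call sites (head of the extracted list, or a default when the selector
# matched nothing):
import datetime
import hashlib

from dateutil import parser
from pytz import timezone
from scrapy import Request


def get_first(values, default=None):
    # Selector.extract() returns a list; callers want its first element.
    return values[0] if values else default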
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath('//meta[@name="date"]/@content').extract())
    item['title'] = get_first(
        response.selector.css('.headline').xpath('./text()').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.xpath(
            '//div[@class="article-section clearfix"]/p/text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//p[@class="author"]/a/text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="news_keywords"]/@content').extract()
    ]
    return item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath('//meta[@name="date"]/@content').extract())
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.css(
            '.article__item').css('.paragraph').xpath('.//text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.css('.byline').css(
            'span[itemprop="name"]').xpath('./text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]

    # Handle next pages
    next_page = get_first(
        response.selector.xpath('//link[@rel="next"]/@href').extract())
    if next_page:
        self.logger.debug("Next page found: " + next_page)
        yield Request(next_page, callback=self.parse_page)

    yield item
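# Note on the pagination pattern above: the callback yields one item per
# page plus a follow-up Request for the rel="next" link, so multi-page
# articles produce several partial items. On newer Scrapy versions the
# follow-up could also be written with response.follow, which resolves
# relative hrefs as well, e.g.:
#
#     if next_page:
#         yield response.follow(next_page, callback=self.parse_page)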
def parse_page(self, response):
    """Scrapes information from pages into items"""
    published = parser.parse(
        get_first(
            response.selector.xpath(
                '//meta[@name="date"]/@content').extract()))
    published = published.replace(tzinfo=timezone('UTC'))
    # Disabled EARLIEST_PUBLISHED filter, kept for reference:
    # settings = get_project_settings()
    # earliest = parser.parse(settings.get('EARLIEST_PUBLISHED'))
    # if published < earliest:
    #     raise DropItem(
    #         'Dropping this article published on %s at %s which is before '
    #         'earliest published global setting %s'
    #         % (self.name, published.isoformat(), earliest.isoformat()))

    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = published.isoformat().encode('utf-8')
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    # item['text'] extraction is disabled for this spider:
    # item['text'] = "".join([
    #     s.strip().encode('utf-8') for s in response.selector.css(
    #         '.article__item').css('.paragraph').xpath('.//text()').extract()])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.css('.byline').css(
            'span[itemprop="name"]').xpath('./text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    item['resource'] = self.name
    item['publication_id'] = hashlib.sha1(
        str(item['url']) + str(item['published'])).hexdigest()

    # Handle next pages
    next_page = get_first(
        response.selector.xpath('//link[@rel="next"]/@href').extract())
    if next_page:
        self.logger.debug("Next page found: " + next_page)
        yield Request(next_page, callback=self.parse_page)

    yield item
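# The disabled EARLIEST_PUBLISHED check above could also live in an item
# pipeline, keeping the spider callbacks free of filtering logic. A
# minimal sketch under that assumption; the class name
# EarliestPublishedPipeline is hypothetical:
from dateutil import parser
from pytz import timezone
from scrapy.exceptions import DropItem


class EarliestPublishedPipeline(object):

    def open_spider(self, spider):
        # EARLIEST_PUBLISHED is assumed to hold an ISO-8601 date string.
        earliest = spider.settings.get('EARLIEST_PUBLISHED')
        self.earliest = parser.parse(earliest).replace(
            tzinfo=timezone('UTC'))

    def process_item(self, item, spider):
        # Assumes item['published'] is a timezone-aware ISO-8601 string,
        # as produced by the callback above.
        published = parser.parse(item['published'])
        if published < self.earliest:
            raise DropItem(
                'Dropping article from %s published %s, before earliest '
                'published setting %s' % (spider.name,
                                          published.isoformat(),
                                          self.earliest.isoformat()))
        return item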
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = parser.parse(
        get_first(response.selector.xpath(
            '//time/@datetime').extract())).isoformat().encode('utf-8')
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.css(
            '.article>.body>p').xpath('.//text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.css(
            '.authorContainer').xpath(
                './/span/strong/span/text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="news_keywords"]/@content').extract()
    ]
    item['resource'] = self.name
    item['publication_id'] = hashlib.sha1(
        str(item['url']) + str(item['published'])).hexdigest()
    return item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath('//time/@datetime').extract())
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.css(
            '.article-content>.rtf-content-wrapper>p').xpath(
                './/text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//div[@class="name"]/a[@rel="author"]/text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="news_keywords"]/@content').extract()
    ]
    return item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath(
            '//span[@class="Datum"]/@content').extract())
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:description"]/@content').extract()).strip()
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.xpath(
            '//div[@class="FAZArtikelText"]/div/p/text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//span[@class="Autor"]/span[@class="caps last"]/a/span/text()'
        ).extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    return item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath('//meta[@name="date"]/@content').extract())
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.css(
            '.artContent').xpath('.//text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="author"]/@content').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    return item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    published = parser.parse(
        get_first(response.selector.xpath('//time/@datetime').extract()))
    # Disabled EARLIEST_PUBLISHED filter, kept for reference:
    # settings = get_project_settings()
    # published = published.replace(tzinfo=timezone('UTC'))
    # earliest = parser.parse(settings.get('EARLIEST_PUBLISHED'))
    # if published < earliest:
    #     raise DropItem(
    #         'Dropping this article published on %s at %s which is before '
    #         'earliest published global setting %s'
    #         % (self.name, published.isoformat(), earliest.isoformat()))

    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = published.isoformat().encode('utf-8')
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:description"]/@content').extract()).strip()
    # item['text'] extraction is disabled for this spider:
    # item['text'] = "".join([
    #     s.strip().encode('utf-8') for s in response.selector.xpath(
    #         '//div[@class="atc-Text "]/p[@class="atc-TextParagraph"]'
    #         '/text()').extract()])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//span[@class="Autor"]/span[@class="caps last"]/a/span/text()'
        ).extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    item['resource'] = self.name
    item['publication_id'] = hashlib.sha1(
        str(item['url']) + str(item['published'])).hexdigest()
    return item
def parse_page(self, response):
    """Scrapes the title and all visible text from a generic page"""
    # Collect every non-empty text node on the page.
    text = []
    for s in response.xpath('//text()').extract():
        if s.strip() != "":
            text.append(s.strip())
    if text:
        self.logger.debug("Scraped text from: %s", response.url)
        item = CrawlerItem()
        item['url'] = response.url
        item['title'] = get_first(
            response.xpath('//title//text()').extract())
        item['text'] = text
        return item
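# The item class shared by all callbacks above; a minimal sketch, assuming
# a plain scrapy.Item with one Field per key the spiders assign:
import scrapy


class CrawlerItem(scrapy.Item):
    url = scrapy.Field()
    visited = scrapy.Field()
    published = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    keywords = scrapy.Field()
    resource = scrapy.Field()
    publication_id = scrapy.Field()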