def parse_page(self, response):
    """Build an OpenpoliticsItem for an article page dated 2016.

    The title is read from the ``og:title`` meta tag, the text from the
    paragraphs under ``//div[@class="article__body"]`` and the publication
    date from the ``datePublished`` meta tag.

    Returns the populated item, or None when the page has no body text,
    has no ``datePublished`` value, or that value does not contain '2016'.
    """
    hxs = HtmlXPathSelector(response)
    title = hxs.select(
        '//meta[@property="og:title"]/@content').extract_first()
    body = [
        s.strip() for s in hxs.select(
            '//div[@class="article__body"]//p//text()').extract()
    ]
    published = hxs.select(
        '//meta[@itemprop="datePublished"]/@content').extract_first()

    # extract_first() gives None when the meta tag is absent.  The previous
    # code called .find() on that None and died with AttributeError; a page
    # without a usable date is now simply skipped.
    if not body or not published:
        return None
    if '2016' not in published:
        return None

    # NOTE(review): the earlier version carried an unreachable fallback that
    # stored '//div[@class="akt_bar"]/span/text()' in item['time'] when the
    # meta date was missing.  If undated pages should be kept, reinstate it
    # ahead of the year filter above.
    item = OpenpoliticsItem()
    item['title'] = title
    item['text'] = body
    item['url'] = response.url
    item['date'] = dateutil.parser.parse(published)
    # Fixed marker stored on every item produced by this callback.
    item['i'] = 10
    return item
def parse_page(self, response):
    """Extract a 2016 article whose headline and body carry itemprop markup.

    The headline is the first text node under the ``headline name`` h1
    (stripped), the text is every text node under the ``articleBody`` span
    outside script/style/noscript/h1, and the date comes from the
    ``article:published_time`` meta tag.

    Returns an OpenpoliticsItem, or None when the body or date is missing
    or the date string does not contain '2016'.
    """
    selector = HtmlXPathSelector(response)

    headline = selector.select(
        '//h1[@itemprop="headline name"]//text()[not(ancestor::script|ancestor::style|ancestor::noscript)]'
    ).extract_first()
    if headline:
        headline = headline.strip()

    text_nodes = selector.select(
        '//span[@itemprop="articleBody"]//text()[not(ancestor::script|ancestor::style|ancestor::noscript|ancestor::h1)]'
    ).extract()
    text = [node.strip() for node in text_nodes]

    stamp = selector.select(
        '//meta[@property="article:published_time"]/@content'
    ).extract_first()

    # Guard clauses: bail out on anything that is not a dated 2016 article.
    if not text or not stamp:
        return None
    if '2016' not in stamp:
        return None

    item = OpenpoliticsItem()
    item['title'] = headline
    item['text'] = text
    item['url'] = response.url
    item['date'] = dateutil.parser.parse(stamp)
    item['i'] = 1
    return item
def parse_page(self, response):
    """Build an OpenpoliticsItem for an ``item-article`` page dated 2016.

    The title is read from the ``og:title`` meta tag and the text from the
    paragraphs of ``//article[@id="item-article"]`` (ignoring script, style
    and noscript content).  The date is the ``datetime`` attribute of the
    article header's ``time`` element, falling back to the one inside
    ``//header[@id="page-header"]``.

    Returns the populated item, or None when the page has no body text,
    no date could be found, or the date does not contain '2016' (in which
    case the rejected date is printed).
    """
    hxs = HtmlXPathSelector(response)
    title = hxs.select(
        '//meta[@property="og:title"]/@content').extract_first()
    body = [
        s.strip() for s in hxs.select(
            '//article[@id="item-article"]//p//text()[not('
            'ancestor::script|ancestor::style|ancestor::noscript)]').
        extract()
    ]
    published = hxs.select(
        '//article[@id="item-article"]/header/time/@datetime'
    ).extract_first()
    if not published:
        published = hxs.select(
            '//header[@id="page-header"]/time/@datetime').extract_first()

    # Both date selectors can miss, leaving None.  The previous code then
    # called .find() on it and raised AttributeError; skip the page instead.
    if not body or not published:
        return None

    if '2016' not in published:
        # Debug output of rejected dates, kept from the original.  The
        # parenthesised form prints the same text under Python 2 and is
        # also valid Python 3.
        print(published)
        return None

    item = OpenpoliticsItem()
    item['title'] = title
    item['text'] = body
    item['url'] = response.url
    item['date'] = dateutil.parser.parse(published)
    # Fixed marker stored on every item produced by this callback.
    item['i'] = 7
    return item
def parse_page(self, response):
    """Scrape the title, text and raw timestamp of an article page.

    Text is taken from the paragraphs under
    ``//section[@class="art_content"]``; when that yields nothing, every
    text node under ``//div[@id="artykul"]`` is used instead.  The
    ``datetime`` attribute of ``time.art-datetime`` is stored unparsed in
    ``item['time']`` and may be None.

    Returns an OpenpoliticsItem, or None if neither selector found text.
    """
    selector = HtmlXPathSelector(response)
    heading = selector.select('//h1[@class="art-title"]/text()').extract_first()

    text = []
    for chunk in selector.select(
            '//section[@class="art_content"]//p//text()').extract():
        text.append(chunk.strip())
    if not text:
        # Alternative page layout: take all text of the article container.
        for chunk in selector.select('//div[@id="artykul"]//text()').extract():
            text.append(chunk.strip())

    stamp = selector.select(
        '//time[@class="art-datetime"]/@datetime').extract_first()

    if not text:
        return None

    item = OpenpoliticsItem()
    item['title'] = heading
    item['text'] = text
    item['url'] = response.url
    # Kept as the raw attribute value; no date parsing or year filter here.
    item['time'] = stamp
    item['i'] = 9
    return item
def parse_page(self, response):
    """Return an item holding the page's heading, paragraph text and URL.

    No filtering is applied: an item is returned for every response, even
    when the heading is None or the text list is empty.  Text fragments
    are stored exactly as extracted (not stripped).
    """
    title_xpath = '//span[@class="article-heading__title"]/text()'
    text_xpath = '//section[@class="article-page"]/p//text()'

    page = HtmlXPathSelector(response)
    heading = page.select(title_xpath).extract_first()
    fragments = page.select(text_xpath).extract()

    item = OpenpoliticsItem()
    item['title'] = heading
    # NOTE(review): the text is stored under 'body' here; confirm that
    # OpenpoliticsItem declares this field.
    item['body'] = fragments
    item['url'] = response.url
    return item
def parse_page(self, response):
    """Build an OpenpoliticsItem for an article page dated 2016.

    The title is read from the ``og:title`` meta tag, the text from the
    paragraphs under ``//div[@class="text"]`` and the publication date from
    the ``article:published_time`` meta tag.

    Returns the populated item, or None when the page has no body text,
    has no publication date, or the date does not contain '2016' (in which
    case the rejected date is printed).
    """
    hxs = HtmlXPathSelector(response)
    title = hxs.select('//meta[@property="og:title"]/@content').extract_first()
    body = [s.strip() for s in hxs.select('//div[@class="text"]//p//text()').extract()]
    published = hxs.select('//meta[@property="article:published_time"]/@content').extract_first()

    # extract_first() gives None when the meta tag is absent.  The previous
    # code called .find() on that None and raised AttributeError; a page
    # without a usable date is now skipped.
    if not body or not published:
        return None

    if '2016' not in published:
        # Debug output of rejected dates, kept from the original.  The
        # parenthesised form prints the same text under Python 2 and is
        # also valid Python 3.
        print(published)
        return None

    item = OpenpoliticsItem()
    item['title'] = title
    item['text'] = body
    item['url'] = response.url
    item['date'] = dateutil.parser.parse(published)
    # Fixed marker stored on every item produced by this callback.
    item['i'] = 2
    return item
def parse_page(self, response):
    """Extract a 2016 article, keeping its date as free text.

    The title comes from the ``og:title`` meta tag, the text from every
    text node inside the divs under ``//div[@id="intext_content_tag"]``
    and the date from the first text node of
    ``//div[@class="article_info"]`` (stripped, stored unparsed in
    ``item['time']``).

    Returns an OpenpoliticsItem, or None when the text or date is missing
    or the date text does not contain '2016'.
    """
    selector = HtmlXPathSelector(response)
    og_title = selector.select('//meta[@property="og:title"]/@content').extract_first()

    text_nodes = selector.select('//div[@id="intext_content_tag"]//div//text()').extract()
    text = [node.strip() for node in text_nodes]

    info = selector.select('//div[@class="article_info"]/text()').extract_first()
    stamp = info.strip() if info else info

    # Short-circuits left to right, so the substring test only runs on a
    # non-empty string.
    if not text or not stamp or '2016' not in stamp:
        return None

    item = OpenpoliticsItem()
    item['title'] = og_title
    item['text'] = text
    item['url'] = response.url
    item['time'] = stamp
    item['i'] = 8
    return item
def parse_page(self, response):
    """Extract a 2016 article from a page using ``FAZArtikelText`` markup.

    The title comes from the ``og:title`` meta tag, the text from the
    paragraphs under ``//div[@class="FAZArtikelText"]`` and the date from
    the ``content`` attribute of ``//span[@class="Datum"]`` (stored
    unparsed in ``item['time']``).

    Returns an OpenpoliticsItem, or None when the text or date is missing
    or the date does not contain '2016'.
    """
    doc = HtmlXPathSelector(response)
    og_title = doc.select(
        '//meta[@property="og:title"]/@content').extract_first()
    raw_paragraphs = doc.select(
        '//div[@class="FAZArtikelText"]//p//text()').extract()
    text = [piece.strip() for piece in raw_paragraphs]
    date_text = doc.select('//span[@class="Datum"]/@content').extract_first()

    wanted = bool(text) and bool(date_text) and '2016' in date_text
    if not wanted:
        return None

    item = OpenpoliticsItem()
    item['title'] = og_title
    item['text'] = text
    item['url'] = response.url
    item['time'] = date_text
    item['i'] = 5
    return item
def parse_page(self, response):
    """Extract a 2016 article and parse its publication date.

    The title comes from the ``og:title`` meta tag, the text from the
    paragraphs under ``//section[@class="body"]`` and the date from the
    ``datetime`` attribute of ``//time[@class="timeformat"]``, parsed with
    ``dateutil.parser.parse``.

    Returns an OpenpoliticsItem, or None when the text or date is missing
    or the date does not contain '2016'.
    """
    page = HtmlXPathSelector(response)
    og_title = page.select(
        '//meta[@property="og:title"]/@content').extract_first()

    text = []
    for fragment in page.select(
            '//section[@class="body"]//p//text()').extract():
        text.append(fragment.strip())

    stamp = page.select(
        '//time[@class="timeformat"]/@datetime').extract_first()

    if text and stamp and '2016' in stamp:
        item = OpenpoliticsItem()
        item['title'] = og_title
        item['text'] = text
        item['url'] = response.url
        item['date'] = dateutil.parser.parse(stamp)
        item['i'] = 4
        return item
    return None
def parse_page(self, response):
    """Extract a 2016 article and parse its publication date.

    The title is the first text node under ``//span[@class="headline"]``
    (stripped), the text comes from the paragraphs directly under
    ``//div[@class="txt"]`` and the date from the ``datetime`` attribute
    of the ``time`` element inside ``//div[@class="authors"]``.

    Returns an OpenpoliticsItem, or None when the text or date is missing
    or the date does not contain '2016'.
    """
    selector = HtmlXPathSelector(response)

    headline = selector.select('//span[@class="headline"]//text()').extract_first()
    if headline:
        headline = headline.strip()

    fragments = selector.select('//div[@class="txt"]/p//text()').extract()
    text = [fragment.strip() for fragment in fragments]

    stamp = selector.select(
        '//div[@class="authors"]//time/@datetime').extract_first()

    if not (text and stamp):
        return None
    if '2016' not in stamp:
        return None

    item = OpenpoliticsItem()
    # Populate the fields in a fixed order from (key, value) pairs.
    fields = (
        ('title', headline),
        ('text', text),
        ('url', response.url),
        ('date', dateutil.parser.parse(stamp)),
        ('i', 0),
    )
    for key, value in fields:
        item[key] = value
    return item