def parse_item(self, response):
    """Extract a news article (title, body, link, date) from the page.

    Raises DropItem when the parsed date is implausibly old (< 1900),
    which indicates the on-page date format was not recognized.
    """
    item = NewsArticleItem()
    title = response.xpath('//title/text()').extract_first()
    # Collect body text from the several layouts the site has used.
    # NOTE: the original predicate '//div["font-family: ..."]' tested a
    # bare string literal, which is always true (it matched every <div>);
    # matching on the style attribute is the evident intent.
    article = (response.xpath('//p/text()').extract() +
               response.xpath('//br/text()').extract() +
               response.xpath('//div[@class="article"]/text()').extract() +
               response.xpath(
                   '//div[@style="font-family: arial; font-size: 13px"]'
                   '/text()').extract())
    # Lazy %-formatting also tolerates title being None (extract_first
    # returns None when no <title> is present).
    self.logger.info('Scraping Title: %s', title)
    item['title'] = title
    item['article'] = article
    item['link'] = response.url.replace('http://', '').replace('https://', '')
    # The date lives in different elements depending on page vintage.
    raw_date = (
        response.xpath(
            '//span[@class="news_article_date"]/text()').extract_first() or
        response.xpath('//td[@class="pubName"]/text()').extract_first())
    clean_date = self._parse_wg_date(self._clean_date(raw_date))
    if clean_date.year < 1900:
        raise DropItem('Incorrect Format for Date in %s' % item)
    item['date'] = str(clean_date)
    return item
def parse_item(self, response):
    """Extract a blog article; the date is embedded in the body or title.

    Date fallbacks: a '--'-delimited token in the body ('%d/%m/%y'),
    then the title suffix after "Nogger's Blog: " ('%d-%b-%Y'),
    else an empty string.
    """
    item = NewsArticleItem()
    title = response.xpath('//a/text()')[8].extract()
    article = response.xpath('//div/text()').extract()
    self.logger.info('Scraping Title: ' + title)
    item['title'] = title
    item['article'] = article
    # Normalize scheme and the regional blogspot domain.
    item['link'] = (response.url.replace('http://', '').replace(
        'https://', '').replace("blogspot.co.id", "blogspot.com"))
    try:
        # list(...) so indexing works on Python 3, where filter() is lazy.
        tokens = list(filter(lambda x: '--' in x, article))
        try:
            raw_date = tokens[0].split(' -- ')[0].replace('\n', '')
            date = datetime.strptime(raw_date, '%d/%m/%y')
        except (ValueError, IndexError):
            try:
                raw_date = title.split("Nogger's Blog: ")[1]
                date = datetime.strptime(raw_date, '%d-%b-%Y')
            except (ValueError, IndexError):
                date = ''
        item['date'] = str(date)
        return item
    except Exception as e:
        # Log instead of silently discarding the failure (was: bare pass).
        self.logger.warning('Failed to scrape %s: %s', response.url, e)
def parse_item(self, response):
    """Parse an Agrimoney article page into a NewsArticleItem.

    The publication date is rendered across two <font> nodes; three
    comma-split layouts are attempted in turn before the failure is
    written to the log file.
    """
    item = NewsArticleItem()
    title = response.xpath('//title/text()')[0].extract()
    # First candidate: stitch the date from pieces of font[2] and font[3].
    raw_date = response.xpath('//font/text()')[2].extract().split(',')[1] + \
        response.xpath('//font/text()')[3].extract().split(',')[0]
    article = response.xpath('//body//text()').extract()
    self.logger.info("Scraping Title: " + title)
    # Strip the site-name prefix from the <title> text.
    item['title'] = title.replace('Agrimoney.com | ', '')
    item['article'] = article
    item['link'] = response.url
    try:
        # Layout A: "<weekday>, <d> <Mon> <Y>"; 'Sept' is normalized to
        # 'Sep' so strptime's %b abbreviation accepts it.
        date = datetime.strptime(
            raw_date.split(',')[1].replace('Sept', 'Sep'), ' %d %b %Y')
        item['date'] = str(date)
        return item
    except (ValueError, IndexError):
        # Layout B: rebuild raw_date from both font nodes concatenated
        # whole, then try the third and second comma fields in turn.
        raw_date = response.xpath('//font/text()')[2].extract(
        ) + response.xpath('//font/text()')[3].extract()
        try:
            date = datetime.strptime(raw_date.split(',')[2], ' %d %b %Y')
            item['date'] = str(date)
            return item
        except (ValueError, IndexError):
            try:
                date = datetime.strptime(
                    raw_date.split(',')[1], ' %d %b %Y')
                item['date'] = str(date)
                return item
            except Exception as e:
                # All formats failed: record URL and error for review.
                self.logf.write("Failed to scrape {0}: {1}\n".format(
                    str(response.url), str(e)))
def parse_item(self, response):
    """Build a NewsArticleItem; the date is the second-to-last URL segment.

    Any failure (missing title, malformed URL date, etc.) is recorded in
    the log file rather than raised.
    """
    item = NewsArticleItem()
    title = response.xpath('//title/text()')[0].extract()
    paragraphs = response.xpath('//p/text()').extract()
    self.logger.info("Scraping Title: " + title)
    try:
        item['title'] = title
        item['article'] = paragraphs
        item['link'] = response.url
        # URLs end in .../<YYYY-MM-DD>/<slug>, so [-2] is the date.
        date_segment = response.url.split("/")[-2]
        parsed = datetime.strptime(date_segment, '%Y-%m-%d')
        item['date'] = str(parsed)
        return item
    except Exception as e:
        self.logf.write("Failed to scrape {0}: {1}\n".format(
            str(response.url), str(e)))
def parse_item(self, response):
    """Populate a NewsArticleItem; the date comes from the second
    'ea-dateformat' text node ('%d-%m-%Y').

    Date-extraction failures are written to the log file, not raised.
    """
    item = NewsArticleItem()
    title = response.xpath('//title/text()')[0].extract()
    paragraphs = response.xpath('//p/text()').extract()
    self.logger.info("Scraping Title: " + title)
    item['title'] = title
    item['article'] = paragraphs
    item['link'] = response.url
    try:
        date_nodes = response.xpath(
            '//*[contains(@class,"ea-dateformat")]/text()').extract()
        stamp = date_nodes[1].strip()
        item['date'] = str(datetime.strptime(stamp, '%d-%m-%Y'))
        return item
    except Exception as e:
        self.logf.write("Failed to scrape {0}: {1}\n".format(
            str(response.url), str(e)))
def parse_item(self, response):
    """Parse a Bloomberg article; the date is the second-to-last URL segment.

    Bloomberg sometimes serves a generic <title> of 'Bloomberg'; fall
    back to the og:title meta tag in that case.
    """
    item = NewsArticleItem()
    title = response.xpath('//title/text()').extract_first()
    if title == 'Bloomberg':
        title = response.xpath(
            '//meta[@property="og:title"]/text()').extract_first()
    article = response.xpath('//p/text()').extract()
    # Lazy %-formatting tolerates a missing (None) title, which the
    # original '+'-concatenation would have crashed on.
    self.logger.info('Scraping Title: %s', title)
    try:
        item['title'] = title
        item['article'] = article
        item['link'] = response.url.replace('http://',
                                            '').replace('https://', '')
        raw_date = response.url.split('/')[-2]
        date = datetime.strptime(raw_date, '%Y-%m-%d')
        item['date'] = str(date)
        return item
    except Exception as e:
        # Log instead of silently swallowing (was: bare pass).
        self.logger.warning('Failed to scrape %s: %s', response.url, e)
def parse_item(self, response):
    """Parse an Agrimoney page (UTF-8-hardened variant) into an item.

    Text fields are explicitly encoded to UTF-8 (Python 2-era handling);
    the date is tried in three comma-split layouts taken from two <font>
    nodes, mirroring the sibling Agrimoney spider.
    """
    # NOTE: the original built a `cleaned_response` via response.replace()
    # and never used it — dead work, removed.
    item = NewsArticleItem()
    # Pre-bind so the date parsing below cannot hit a NameError when a
    # UnicodeDecodeError aborts extraction (an empty string simply fails
    # into the fallback layouts via IndexError).
    raw_date = ''
    try:
        title = response.xpath('//title/text()')[0].extract().encode(
            'utf-8', 'ignore')
        raw_date = response.xpath('//font/text()')[2].extract().split(',')[1] + \
            response.xpath('//font/text()')[3].extract().split(',')[0]
        self.logger.info('Scraping Title: ' + title)
        item['title'] = title.replace('Agrimoney.com | ',
                                      '').encode('utf-8', 'ignore')
        item['article'] = [
            art.encode('utf-8')
            for art in response.xpath('//body//text()').extract()
        ]
        item['link'] = response.url.replace('http://', '').replace(
            'https://', '').encode('utf-8', 'ignore')
    except UnicodeDecodeError:
        pass
    try:
        # Layout A: second comma field, 'Sept' normalized so %b matches.
        date = datetime.strptime(
            raw_date.split(',')[1].replace('Sept', 'Sep'), ' %d %b %Y')
        item['date'] = str(date)
        return item
    except (ValueError, IndexError):
        # Layout B: retry on the two font nodes concatenated whole.
        raw_date = response.xpath('//font/text()')[2].extract() + \
            response.xpath('//font/text()')[3].extract()
        try:
            date = datetime.strptime(raw_date.split(',')[2], ' %d %b %Y')
            item['date'] = str(date)
            return item
        except (ValueError, IndexError):
            try:
                date = datetime.strptime(
                    raw_date.split(',')[1], ' %d %b %Y')
                item['date'] = str(date)
                return item
            except Exception as e:
                # Log instead of silently swallowing (was: bare pass).
                self.logger.warning('Failed to scrape %s: %s',
                                    response.url, e)
def parse_item(self, response):
    """Populate a NewsArticleItem; the date comes from the second
    'ea-dateformat' text node ('%d-%m-%Y').
    """
    item = NewsArticleItem()
    # Pre-bind so the log line below cannot raise NameError when a
    # UnicodeDecodeError aborts extraction before `title` is assigned.
    title = ''
    try:
        title = response.xpath('//title/text()')[0].extract()
        article = response.xpath('//p/text()').extract()
        item['title'] = title
        item['article'] = article
        item['link'] = response.url.replace('http://',
                                            '').replace('https://', '')
    except UnicodeDecodeError:
        pass
    self.logger.info('Scraping Title: ' + title)
    try:
        raw_date = response.xpath(
            '//*[contains(@class,"ea-dateformat")]/text()').extract(
            )[1].strip()
        date = datetime.strptime(raw_date, '%d-%m-%Y')
        item['date'] = str(date)
        return item
    except Exception as e:
        # Log instead of silently swallowing (was: bare pass).
        self.logger.warning('Failed to scrape %s: %s', response.url, e)
def parse_item(self, response):
    """Extract a blog article; the date is embedded in the body or title.

    Fixes vs. original:
      * '%d/%M/%y' used %M (minute) where %m (month) was intended — it
        parsed without error but silently produced month=1; the sibling
        spider parsing the same layout uses '%d/%m/%y'.
      * filter() is wrapped in list() so indexing works on Python 3.
    """
    item = NewsArticleItem()
    title = response.xpath('//a/text()')[8].extract()
    article = response.xpath('//div/text()').extract()
    self.logger.info("Scraping Title: " + title)
    item['title'] = title
    item['article'] = article
    item['link'] = response.url
    try:
        tokens = list(filter(lambda x: '--' in x, article))
        try:
            raw_date = tokens[0].split(' -- ')[0].replace('\n', '')
            # %m (month), not %M (minute).
            date = datetime.strptime(raw_date, '%d/%m/%y')
        except (ValueError, IndexError):
            try:
                raw_date = title.split("Nogger's Blog: ")[1]
                date = datetime.strptime(raw_date, '%d-%b-%Y')
            except (ValueError, IndexError):
                date = ""
        item['date'] = str(date)
        return item
    except Exception as e:
        self.logf.write("Failed to scrape {0}: {1}\n".format(
            str(response.url), str(e)))