def article_info(self, url):
    """Extract article data from *url* using newspaper's ``Article``.

    Tries a normal download first (with ``self.HEADERS``' User-Agent); if
    that fails, retries once pretending to be Googlebot, since some sites
    block the default UA.

    Args:
        url: URL of the article to scrape.

    Returns:
        Dict with keys ``title``, ``keywords``, ``summary``, ``full_text``
        and ``meta_descr`` on success (or whatever ``self.yahoo_get_text``
        returns for Yahoo Finance URLs), or ``{'error': 'article skipped'}``
        when the article cannot be scraped.
    """
    article = None  # so the outer handler can log even if Article() itself failed
    try:
        try:
            article = Article(
                url, browser_user_agent=self.HEADERS['User-Agent'])
            # fixes issue with bloomberg: redirect to the Quint mirror
            if 'bloomberg.com' in article.url:
                article.url = article.url.replace(
                    'www.bloomberg.com', 'www.bloombergquint.com')
            article.download()
            article.parse()
        except Exception as e:
            logging.info(e)
            # sometimes we need to use googlebot if an error occurs
            article = Article(
                url,
                browser_user_agent=
                'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
            )
            article.download()
            article.parse()
            logging.info("%s... article scraped with googlebot",
                         article.url[8:40])

        # BUGFIX: this was previously in a `finally:` block, so it also ran
        # when the googlebot retry failed, and article.nlp() on an
        # un-downloaded article raised a second exception masking the real
        # error. It now runs only after a successful download+parse.
        article.nlp()
        # prevents Yahoo finance articles from being scraped incorrectly
        if 'finance.yahoo.com' in article.url:
            return self.yahoo_get_text(article)
        keywords = ", ".join(article.keywords)
        return {
            'title': article.title,
            'keywords': keywords,
            'summary': article.summary,
            'full_text': article.text,
            'meta_descr': article.meta_description,
        }
    except Exception as e:
        # `article` may be None if Article() construction failed both times;
        # fall back to the raw url so logging never raises here.
        failed_url = article.url if article is not None else url
        logging.info("%s... article skipped due to error: %s", failed_url, e)
        return {'error': 'article skipped'}