def parse_article_from_url(self, url, tweet_id):
    """Download and parse the article at ``url`` into a JSON-ready record.

    Any exception raised while constructing, downloading or parsing the
    article is logged on ``self.logger`` and swallowed. In that case the
    returned record has ``is_valid`` set to ``False``, every extracted
    field set to ``''`` and the returned HTML is ``''`` as well (even if
    the download itself had succeeded before the failure).

    :param url: address of the article to fetch; echoed under ``'url'``
    :param tweet_id: id of the tweet associated with the url; echoed
        under ``'id'`` and included in the log message on failure
    :return: tuple ``(result, article_html)`` - ``result`` is the dict of
        extracted fields, ``article_html`` the raw HTML of the article
    """
    # Fallback values, declared once so that the failure record cannot
    # drift out of sync with the success record.
    article_html = ''
    extracted = {
        'text_b64': '',
        'title': '',
        'meta_data': '',
        'is_media_news': '',
        'is_parsed': '',
        'is_downloaded': '',
        'authors': '',
        'canonical_link': '',
    }
    is_valid = False

    try:
        article = Article(url)
        article.download()
        # The HTML is read right after the download, before parse() runs.
        downloaded_html = article.html
        article.parse()
        parsed = {
            # NOTE(review): despite the key name, the text is stored as
            # returned by the parser; no base64 encoding happens here.
            'text_b64': article.text,
            'title': article.title,
            'meta_data': article.meta_data,
            'is_media_news': article.is_media_news(),
            'is_parsed': article.is_parsed,
            'is_downloaded': article.download_state,
            'authors': article.authors,
            'canonical_link': article.canonical_link,
        }
    except Exception as inst:
        # Best-effort by design: a broken article must not abort the
        # caller, so the error is only reported. Arguments are handed to
        # the logger so formatting happens only if the record is emitted.
        self.logger.info(
            "Error %s while loading and parsing article for tweet id %s with url %s",
            inst, tweet_id, url)
    else:
        # Only publish the values once every step above has succeeded.
        article_html = downloaded_html
        extracted = parsed
        is_valid = True

    result = {
        'text_b64': extracted['text_b64'],
        'url': url,
        'id': tweet_id,
        'title': extracted['title'],
        'meta_data': extracted['meta_data'],
        'is_media_news': extracted['is_media_news'],
        'publish_date': '',  # never filled in by this method
        'is_parsed': extracted['is_parsed'],
        'is_downloaded': extracted['is_downloaded'],
        'authors': extracted['authors'],
        'canonical_link': extracted['canonical_link'],
        'is_valid': is_valid,
        'is_reloaded': False,  # always False here
    }
    return result, article_html