def download(self):
    """Download, parse and NLP-process the article for the pocket item's URL.

    On success, a dict holding the article attributes whose names appear in
    ``self.ARTICLE_ATTRIBUTES_TO_KEEP`` is stored on
    ``self._pocket_item.article``.

    Returns:
        ``{}`` if an ``ArticleException`` is raised along the way; otherwise
        nothing is returned explicitly (``None``).
    """
    item = self._pocket_item
    try:
        logger.info('Downloading article for {}'.format(item.url))
        fetched = Article(item.url)
        fetched.download()

        logger.info('Parsing article for {}'.format(item.url))
        fetched.parse()

        logger.info('Performing NLP on article for {}'.format(item.url))
        fetched.nlp()

        # Materialise tags/images as lists (presumably they are sets and the
        # result needs to be serialisable -- TODO confirm against the caller).
        fetched.tags = list(fetched.tags)
        if fetched.publish_date:
            # Replace the date object by its numeric timestamp.
            fetched.publish_date = fetched.publish_date.timestamp()
        # NOTE(review): taken to run regardless of publish_date -- confirm.
        fetched.images = list(fetched.images)

        wanted = self.ARTICLE_ATTRIBUTES_TO_KEEP
        item.article = {
            name: value
            for name, value in vars(fetched).items()
            if name in wanted
        }
    except ArticleException:
        logger.warning('Could not download article for {}'.format(item.url))
        # NOTE(review): the return is taken to belong to this failure branch
        # only -- confirm against the intended behaviour.
        return {}
def test_convert_to_dict_most_fields_works():
    """Check ``extractor.to_dict`` on an Article with most fields populated.

    Eleven attribute names are requested; the result is expected to hold
    exactly seven of them. The four expected to be missing are never assigned
    in this test (presumably empty values are dropped -- confirm in
    ``extractor.to_dict``).
    """
    fake = Faker()

    # Build an article with a handful of fields filled in with fake data.
    article = Article(url=fake.url())
    article.authors = [fake.name(), fake.name()]
    article.top_image = fake.image_url()
    article.article_html = fake.text()
    article.images = [fake.image_url(), fake.image_url()]
    article.meta_data = [fake.city(), fake.state(), fake.country()]

    requested = (
        "article_html",
        "authors",
        "images",
        "keywords",
        "meta_data",
        "source_url",
        "summary",
        "top_image",
        "url",
        "tags",
        "meta_favicon",
    )
    converted = extractor.to_dict(article, *requested)

    assert converted
    assert len(converted) == 7

    # Keys that must survive the conversion.
    for present in (
        "article_html",
        "authors",
        "images",
        "meta_data",
        "source_url",
        "top_image",
        "url",
    ):
        assert present in converted

    # Keys that must be left out of the conversion.
    for absent in ("keywords", "summary", "tags", "meta_favicon"):
        assert absent not in converted