Exemple #1
0
    def download(self):
        """Fetch, parse and analyse the article behind the pocket item's URL.

        On success, the attributes of the processed article whose names
        appear in ``self.ARTICLE_ATTRIBUTES_TO_KEEP`` are stored as a dict
        on ``self._pocket_item.article``.

        Returns:
            An empty dict when an ``ArticleException`` is raised while
            processing the article. On success nothing is returned
            explicitly (i.e. ``None``).
        """
        url = self._pocket_item.url
        try:
            article = Article(url)

            logger.info('Downloading article for {}'.format(url))
            article.download()

            logger.info('Parsing article for {}'.format(url))
            article.parse()

            logger.info('Performing NLP on article for {}'.format(url))
            article.nlp()

            # Materialise collection attributes as plain lists
            # (presumably so the stored dict is easy to serialise -- confirm).
            article.tags = list(article.tags)
            article.images = list(article.images)

            # Replace a truthy publish date with its numeric timestamp.
            if article.publish_date:
                article.publish_date = article.publish_date.timestamp()

            wanted = self.ARTICLE_ATTRIBUTES_TO_KEEP
            self._pocket_item.article = {
                name: value
                for name, value in vars(article).items()
                if name in wanted
            }
        except ArticleException:
            logger.warning('Could not download article for {}'.format(url))
            return {}
Exemple #2
0
def test_convert_to_dict_most_fields_works():
    """Check which requested keys ``extractor.to_dict`` includes in its result.

    An ``Article`` is built from a fake URL and five of its attributes are
    assigned fake values. Eleven keys are then requested; the test expects
    exactly seven of them in the result and the remaining four to be absent.
    """
    fake = Faker()
    article = Article(url=fake.url())

    # Attributes explicitly populated on the article before conversion.
    article.authors = [fake.name(), fake.name()]
    article.top_image = fake.image_url()
    article.article_html = fake.text()
    article.images = [fake.image_url(), fake.image_url()]
    article.meta_data = [fake.city(), fake.state(), fake.country()]

    requested = (
        "article_html", "authors", "images", "keywords", "meta_data",
        "source_url", "summary", "top_image", "url", "tags", "meta_favicon",
    )
    converted = extractor.to_dict(article, *requested)

    assert converted
    assert len(converted) == 7

    expected_present = (
        "article_html", "authors", "images", "meta_data",
        "source_url", "top_image", "url",
    )
    for key in expected_present:
        assert key in converted

    expected_absent = ("keywords", "summary", "tags", "meta_favicon")
    for key in expected_absent:
        assert key not in converted