Example #1
from faker import Faker
from newspaper import Article

# `extractor` is the module under test; its import is project-specific
# and not shown in this snippet.


def test_convert_to_dict_most_fields_works():
    faker = Faker()

    source = Article(url=faker.url())

    source.authors = [faker.name(), faker.name()]
    source.top_image = faker.image_url()
    source.article_html = faker.text()
    source.images = [faker.image_url(), faker.image_url()]
    source.meta_data = [faker.city(), faker.state(), faker.country()]

    result = extractor.to_dict(source, "article_html", "authors", "images",
                               "keywords", "meta_data", "source_url",
                               "summary", "top_image", "url", "tags",
                               "meta_favicon")

    # only the seven fields that were actually set on the article survive
    assert result
    assert len(result) == 7

    assert "article_html" in result
    assert "authors" in result
    assert "images" in result
    assert "keywords" not in result
    assert "meta_data" in result
    assert "source_url" in result
    assert "summary" not in result
    assert "top_image" in result
    assert "url" in result
    assert "tags" not in result
    assert "meta_favicon" not in result
Example #2
    def build_news_article_from_url(source_url, sNLP):
        """Build a NewsArticle object from the source URL.

        Returns None if the build fails.
        """
        try:
            print('starting to scrape URL:', source_url)

            # pre-process the article with newspaper3k, then re-extract
            # the main text and HTML with the boilerpipe library
            article = Article(source_url, keep_article_html=True)
            article.build()
            article.nlp()
            bp_extractor = Extractor(extractor='DefaultExtractor',
                                     html=article.html)
            article.text = bp_extractor.getText()
            article.article_html = bp_extractor.getHTML()

            news_article = NewsArticle(article, sNLP)
            print('successfully scraped URL:', source_url)
            return news_article
        except Exception as e:
            print('failed to scrape URL:', source_url)
            print('reason:', e)
            return None
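A possible call site for this helper; the `urls` list and the `nlp_client` argument (standing in for `sNLP`, which this snippet does not define) are placeholders for illustration:

# hypothetical usage; assumes build_news_article_from_url is reachable
# as a plain function and nlp_client is an initialized NLP wrapper
urls = ['https://example.com/story-1', 'https://example.com/story-2']
news_articles = []
for url in urls:
    news_article = build_news_article_from_url(url, nlp_client)
    if news_article is not None:  # skip URLs that failed to scrape
        news_articles.append(news_article)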
Example #3
            # parse the title and body, wrap the article HTML in
            # presentation markup, and write the result to an HTML file
            try:
                article = Article(url, language='de', keep_article_html=True)
                article.download()
                article.parse()

                # a random suffix keeps filenames unique per feed
                filename = feed[3] + ''.join(
                    random.choices(string.ascii_uppercase + string.digits,
                                   k=8))

                # prepend the base URL meta tag, jQuery, a web font, the
                # top image and the real feed title; append the render
                # stylesheet and script; wrap everything in a feed div
                article.article_html = ("<meta property='baseurl' content='"
                                        + feed[4] + "'>" + article.article_html)
                article.article_html = ("<script src='https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js'></script>"
                                        + article.article_html)
                article.article_html = ("<link href='https://fonts.googleapis.com/css?family=Slabo+27px' rel='stylesheet'>"
                                        + article.article_html)
                article.article_html = ("<img src='" + article.top_image
                                        + "' width='100%' >" + article.article_html)
                article.article_html = ("<h1 class='realTitle'>" + entrie.title
                                        + "</h1>" + article.article_html)
                article.article_html = (article.article_html
                                        + "<link type='text/css' rel='stylesheet' href='../css/renderStyles.css'/>")
                article.article_html = (article.article_html
                                        + "<script src='../js/renderScript.js'></script>")
                article.article_html = ("<div class='pr0news pr0-text text-orange "
                                        + feed[3] + "'>" + article.article_html
                                        + "</div>")

                with open("html/" + filename + ".html", "w") as file:
                    file.write(article.article_html)

                if article.title != entrie.title:
                    print(
                        str(len(article.title)) + " || " +