Exemple #1
0
def test_convert_to_dict_most_fields_works():
    """Request eleven fields from ``extractor.to_dict`` and expect seven back.

    Only ``url`` (via the constructor), ``authors``, ``top_image``,
    ``article_html``, ``images`` and ``meta_data`` are set explicitly on the
    article before conversion.
    """
    fake = Faker()

    article = Article(url=fake.url())

    # Only these attributes are assigned explicitly by the test.
    article.authors = [fake.name(), fake.name()]
    article.top_image = fake.image_url()
    article.article_html = fake.text()
    article.images = [fake.image_url(), fake.image_url()]
    article.meta_data = [fake.city(), fake.state(), fake.country()]

    requested_fields = (
        "article_html", "authors", "images",
        "keywords", "meta_data", "source_url",
        "summary", "top_image", "url", "tags",
        "meta_favicon",
    )
    result = extractor.to_dict(article, *requested_fields)

    # Fields the result must contain / must omit.
    expected_present = (
        "article_html", "authors", "images", "meta_data",
        "source_url", "top_image", "url",
    )
    expected_absent = ("keywords", "summary", "tags", "meta_favicon")

    assert result
    assert len(result) == 7

    for field in expected_present:
        assert field in result
    for field in expected_absent:
        assert field not in result
Exemple #2
0
def getArticleInfo():
    """Collect basic metadata for the news articles of the requested country.

    The request body is decoded as UTF-8 and evaluated as a Python literal;
    its ``"country"`` entry is passed to ``getNewsUrls``. Each returned URL
    is downloaded and parsed with ``Article``. Articles that fail to
    download/parse, and URLs identical to the previously stored one, are
    skipped.

    Returns:
        dict: Maps consecutive integer indices (starting at 0) to dicts with
        the keys ``"authors"``, ``"date"``, ``"url"``, ``"imageURL"`` and
        ``"title"``.
    """
    post_data = literal_eval(request.data.decode('utf8'))
    country = post_data["country"]
    articleInfo = {}

    for url in getNewsUrls(country):
        # Skip a URL that repeats the most recently stored article; checking
        # before the download avoids a pointless network round trip.
        if articleInfo and url == articleInfo[len(articleInfo) - 1]["url"]:
            print("bad article")
            continue

        article = Article(url)
        # Best-effort: any failure while fetching or processing one article
        # is reported and that article is skipped, the rest still run.
        try:
            article.download()
            article.parse()

            published = article.publish_date
            if isinstance(published, datetime):
                date = published.strftime('%m/%d/%Y')
            elif published is None:
                date = "No Date"
            else:
                date = published

            # Keep only author strings made of exactly two space-separated
            # words; fall back to a placeholder when none qualify.
            # (The original did `authors[0] = ...` on an empty list, which
            # raised IndexError and silently dropped the whole article.)
            authors = [
                name for name in article.authors
                if len(name.split(" ")) == 2
            ] or ["No Author"]

            image_url = article.top_image
            if image_url is None:
                image_url = "No imageURL"

            title = article.title
            if title is None:
                title = "No title"
        except Exception as e:
            print(e)
            print("bad article")
            continue

        articleInfo[len(articleInfo)] = {
            "authors": authors,
            "date": date,
            "url": url,
            "imageURL": image_url,
            "title": title
        }
    return articleInfo
Exemple #3
0
# Prerequisites (run these in a shell, not inside Python):
#   pip install nltk
#   pip install newspaper3k

import nltk
from newspaper import Article

url = 'https://www.cnbc.com/2020/03/19/ecb-launches-new-820-billion-coronavirus-package.html'
article = Article(url)

# Fetch the page and extract its content.
article.download()
article.parse()

# Download NLTK's 'punkt' tokenizer data before running nlp().
nltk.download('punkt')
article.nlp()

# get the authors
print(article.authors)

# get the publication date (an attribute, not a method -- do not call it)
print(article.publish_date)

# get the top image (an attribute, not a method -- do not call it)
print(article.top_image)

# get the full article text
print(article.text)

# get summary of the article
print(article.summary)