def test_convert_to_dict_most_fields_works():
    """Check ``extractor.to_dict`` on an article with only some fields populated.

    Eleven field names are requested; the test expects exactly seven of them
    to appear in the resulting dict and the other four to be left out.
    """
    fake = Faker()

    # Populate a subset of the attributes that will be requested below.
    article = Article(url=fake.url())
    article.authors = [fake.name(), fake.name()]
    article.top_image = fake.image_url()
    article.article_html = fake.text()
    article.images = [fake.image_url(), fake.image_url()]
    article.meta_data = [fake.city(), fake.state(), fake.country()]

    requested_fields = (
        "article_html",
        "authors",
        "images",
        "keywords",
        "meta_data",
        "source_url",
        "summary",
        "top_image",
        "url",
        "tags",
        "meta_favicon",
    )
    converted = extractor.to_dict(article, *requested_fields)

    expected_present = (
        "article_html",
        "authors",
        "images",
        "meta_data",
        "source_url",
        "top_image",
        "url",
    )
    expected_absent = ("keywords", "summary", "tags", "meta_favicon")

    assert converted
    assert len(converted) == 7
    for field in expected_present:
        assert field in converted
    for field in expected_absent:
        assert field not in converted
def getArticleInfo():
    """Collect basic metadata for the news articles of the requested country.

    The request body is decoded as UTF-8 and parsed as a Python literal that
    must contain a ``"country"`` key. The URLs returned by
    ``getNewsUrls(country)`` are downloaded and parsed one by one.

    Returns:
        dict: consecutive integer keys starting at 0, each mapping to a dict
        with the keys ``"authors"``, ``"date"``, ``"url"``, ``"imageURL"``
        and ``"title"``. Articles that fail to download or parse, and URLs
        identical to the previously accepted one, are skipped.
    """
    # NOTE(review): the request body is untrusted input. literal_eval only
    # accepts Python literals, but if clients actually send JSON a JSON
    # parser would be the more appropriate choice - confirm with callers.
    post_data = literal_eval(request.data.decode('utf8'))
    country = post_data["country"]

    articleInfo = {}
    goodCount = 0
    for url in getNewsUrls(country):
        # Skip a URL equal to the most recently accepted one before paying
        # for a download; no exception is needed for this control flow.
        if goodCount != 0 and url == articleInfo[goodCount - 1]["url"]:
            print("Inside if statement")
            print("bad article")
            continue

        article = Article(url)
        try:
            article.download()
            article.parse()

            publish_date = article.publish_date
            if isinstance(publish_date, datetime):
                date = publish_date.strftime('%m/%d/%Y')
            elif publish_date is None:
                date = "No Date"
            else:
                date = publish_date

            # Keep only names made of exactly two space-separated parts.
            authors = [
                name for name in article.authors
                if len(name.split(" ")) == 2
            ]
            if not authors:
                # Assigning to authors[0] on an empty list raised IndexError
                # and caused the whole article to be discarded.
                authors = ["No Author"]

            image_url = article.top_image
            if image_url is None:
                image_url = "No imageURL"

            title = article.title
            if title is None:
                title = "No title"

            articleInfo[goodCount] = {
                "authors": authors,
                "date": date,
                "url": url,
                "imageURL": image_url,
                "title": title,
            }
            goodCount += 1
        except Exception as e:
            # Best effort: one failing article must not abort the whole run.
            print(e)
            print("bad article")

    return articleInfo
# Demo: download a news article with newspaper3k and print its metadata,
# full text and generated summary.
#
# Install the dependencies from a shell (these are not Python statements):
#   pip install nltk
#   pip install newspaper3k
import nltk
from newspaper import Article

url = 'https://www.cnbc.com/2020/03/19/ecb-launches-new-820-billion-coronavirus-package.html'
article = Article(url)
article.download()
article.parse()

# Fetch the NLTK 'punkt' tokenizer data before running the NLP step.
nltk.download('punkt')
article.nlp()

# Get the authors.
print(article.authors)
# Get the publication date and the top image. Both are plain attributes,
# not methods, so they must not be called.
print(article.publish_date)
print(article.top_image)
# Get the full article text.
print(article.text)
# Get the summary of the article (filled in by article.nlp()).
print(article.summary)