from newspaper import Article, Config


def summarise_one(url, title=True, keywords=True, summary=False,
                  top_img_src=False):
    '''Get url and return title, keywords, summary, and top image.

    Each boolean flag is replaced by the extracted value when requested.
    '''
    # configuration for Newspaper to minimize processing time
    configure = Config()
    configure.fetch_images = False
    configure.MAX_SUMMARY = 300
    configure.MAX_SUMMARY_SENT = 3
    article = Article(url, config=configure)  # pass the config, or it has no effect
    try:
        article.download()
        article.parse()
    except Exception:
        print(url)
    title = article.title
    if keywords or summary:
        try:
            article.nlp()  # required before reading .keywords / .summary
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except Exception:
            print('Newspaper error with nlp() call')
    if top_img_src:
        top_img_src = article.top_image
    return title, keywords, summary, top_img_src
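# A quick usage sketch for summarise_one (the URL below is a placeholder,
# not from the original source):
t, kw, summ, img = summarise_one('https://example.com/some-article',
                                 summary=True, top_img_src=True)
print(t, kw, summ, img, sep='\n')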
from newspaper import Article, Config


def load_and_parse_full_article_text_and_image(url: str) -> Article:
    config = Config()
    config.MAX_SUMMARY_SENT = 8
    article = Article(url, config=config)
    # load_page_safe is an external helper; feeding its HTML in via
    # set_html() is safer than article.download()
    article.set_html(load_page_safe(url))
    article.parse()
    return article
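# load_page_safe is referenced above but not defined in this snippet.
# A minimal sketch of one plausible implementation, assuming it is a
# requests wrapper that never raises and always returns a string:
import requests


def load_page_safe(url: str) -> str:
    """Fetch a page, returning '' instead of raising, so that
    Article.set_html() above always receives a str."""
    try:
        response = requests.get(
            url,
            timeout=10,
            headers={'User-Agent': 'Mozilla/5.0'},  # some sites block default UAs
        )
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return ''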
import json

from flask import request
from newspaper import Article, Config
from textblob import TextBlob


def analysis():
    articles = json.loads(request.data)
    urls = articles['urls']
    summaries = []
    results = []
    config = Config()
    config.MAX_SUMMARY_SENT = 2
    for pair in urls:
        url = pair['url']
        source = pair['source']
        print(url)
        article = Article(url, config=config)
        article.download()
        article.parse()
        article.nlp()  # populates article.keywords, read below
        summary, summary2 = generateSummary(article)  # external helper
        str_summary = ' '.join(summary)
        blob = TextBlob(str_summary)
        summaries.append(str_summary)
        print(str_summary)
        subjectivity = [s.sentiment.subjectivity for s in blob.sentences]
        polarity = [s.sentiment.polarity for s in blob.sentences]
        # average subjectivity; fall back to the 0.5 midpoint for empty summaries
        subj_sum = 0.5 if not subjectivity else sum(subjectivity) / len(subjectivity)
        # rescale mean polarity from [-1, 1] to [0, 1]
        pol_sum = 0.5 if not polarity else ((sum(polarity) / len(polarity)) + 1) / 2
        results.append({
            "headline": article.title,
            "summary": summary2,
            "_summary": str_summary,
            "image": article.top_image,
            "subjectivity": subj_sum,
            "p_group": pol_sum,
            "url": url,
            "source": source,
            "keywords": article.keywords,
        })
    clusters = clusterSentences(summaries, 3)  # external helper
    for cluster in range(3):
        for article_summary in clusters[cluster]:
            for res in results:
                if res['_summary'] == summaries[article_summary]:
                    print("FOUND")
                    res['group'] = convertNum(cluster)  # external helper
    return json.dumps({"results": results})
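# clusterSentences is not defined in this snippet. A minimal sketch of one
# plausible implementation (TF-IDF + k-means via scikit-learn; the grouping
# method actually used may differ). It returns {cluster_id: [indices]},
# matching how analysis() indexes back into `summaries`:
from collections import defaultdict

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def clusterSentences(texts, k):
    """Group texts into k clusters of text indices."""
    vectors = TfidfVectorizer(stop_words='english').fit_transform(texts)
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(vectors)
    clusters = defaultdict(list)
    for index, label in enumerate(labels):
        clusters[label].append(index)
    return clusters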
import pandas as pd
from tqdm import tqdm

import newspaper
from newspaper import Config

config = Config()
config.memoize_articles = False
config.fetch_images = False
config.verbose = True
config.MAX_SUMMARY_SENT = 10
config.language = "vi"

vnexpress = newspaper.build('https://vnexpress.net', config=config)
zingvn = newspaper.build('https://news.zing.vn', config=config)
kenh14 = newspaper.build('https://kenh14.vn/', config=config)


def extract_data(news, file_name):
    print("crawling: ", file_name)
    data = []
    for article in tqdm(news.articles):
        # skip comment-box anchors, which are not real article pages
        if "#box_comment" in article.url:
            continue
        temp = {"link": article.url}
        data.append(temp)
    # file_name and the pandas import suggest the records are persisted;
    # this write is a minimal completion of the truncated original
    pd.DataFrame(data).to_csv(file_name, index=False)
    return data
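# Example run over the three sources built above (the output file names
# are illustrative, not from the original source):
extract_data(vnexpress, "vnexpress.csv")
extract_data(zingvn, "zingvn.csv")
extract_data(kenh14, "kenh14.csv")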