Example #1
from newspaper import Article, Config


def summarise_one(url, title=True, keywords=True, summary=False,
                  top_img_src=False):
    '''
    Download and parse the article at `url`, returning its title,
    keywords, summary and top image URL according to the flags passed in.
    '''
    # Configuration for Newspaper to minimize processing time
    config = Config()
    config.fetch_images = False
    config.MAX_SUMMARY = 300
    config.MAX_SUMMARY_SENT = 3

    article = Article(url, config=config)

    try:
        article.download()
        article.parse()
    except Exception:
        print(url)

    title = article.title
    if keywords or summary:
        try:
            article.nlp()  # nlp() must run before keywords/summary exist
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except Exception:
            print('Newspaper error with nlp() call')

    if top_img_src:
        top_img_src = article.top_image

    return title, keywords, summary, top_img_src
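
A minimal usage sketch (the URL is a placeholder, not from the original): each boolean flag doubles as a return slot, so a flag passed as True comes back replaced by the extracted value, while a flag left False is returned unchanged.

title, keywords, summary, top_img = summarise_one(
    'https://example.com/some-article',  # placeholder URL
    title=True, keywords=True, summary=True, top_img_src=True)
print(title)
print(keywords[:5])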
Example #2
from newspaper import Article, Config


def load_and_parse_full_article_text_and_image(url: str) -> Article:
    config = Config()
    config.MAX_SUMMARY_SENT = 8

    article = Article(url, config=config)
    # load_page_safe() is a project helper (not shown here) that fetches
    # the page HTML; feeding the HTML in via set_html() is safer than
    # calling article.download() directly
    article.set_html(load_page_safe(url))
    article.parse()

    return article
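
load_page_safe() is not defined in this example; a minimal stand-in using requests, with the timeout and empty-string fallback as assumptions, might look like:

import requests

def load_page_safe(url: str) -> str:
    # Hypothetical helper: fetch the page HTML, returning an empty
    # string on any request failure instead of raising
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return ''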
Example #3
import json

from flask import request  # inferred from the request.data usage
from newspaper import Article, Config
from textblob import TextBlob


def analysis():
    articles = json.loads(request.data)
    urls = articles['urls']
    summaries = []
    results = []

    config = Config()
    config.MAX_SUMMARY_SENT = 2

    for pair in urls:
        url = pair['url']
        source = pair['source']

        article = Article(url, config=config)
        article.download()
        article.parse()

        # generateSummary() is a project helper returning the summary both
        # as a list of sentences and as a display string
        summary, summary2 = generateSummary(article)
        str_summary = ' '.join(summary)
        summaries.append(str_summary)

        # Sentence-level sentiment via TextBlob
        blob = TextBlob(str_summary)
        subjectivity = [s.sentiment.subjectivity for s in blob.sentences]
        polarity = [s.sentiment.polarity for s in blob.sentences]

        # Average the scores, defaulting to neutral (0.5) if there are no
        # sentences; polarity is rescaled from [-1, 1] to [0, 1]
        subj_sum = 0.5 if not subjectivity else sum(subjectivity) / len(subjectivity)
        pol_sum = 0.5 if not polarity else (sum(polarity) / len(polarity) + 1) / 2

        results.append({
            "headline": article.title,
            "summary": summary2,
            "_summary": str_summary,
            "image": article.top_image,
            "subjectivity": subj_sum,
            "p_group": pol_sum,
            "url": url,
            "source": source,
            "keywords": article.keywords,
        })

    # clusterSentences() and convertNum() are project helpers: the former
    # groups the summaries into 3 clusters of summary indices, the latter
    # maps a cluster index to its display label
    clusters = clusterSentences(summaries, 3)
    for cluster in range(3):
        for article_summary in clusters[cluster]:
            for res in results:
                if res['_summary'] == summaries[article_summary]:
                    res['group'] = convertNum(cluster)

    return json.dumps({"results": results})
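
The endpoint expects a JSON body holding a list of url/source pairs. A hedged example of calling it with requests (the route path, port and URLs are assumptions, since the route decorator is not shown):

import requests

payload = {
    "urls": [
        {"url": "https://example.com/story-1", "source": "Example News"},
        {"url": "https://example.com/story-2", "source": "Example News"},
    ]
}
response = requests.post('http://localhost:5000/analysis', json=payload)
print(response.json()["results"][0]["headline"])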
Example #4
import pandas as pd
from tqdm import tqdm

import newspaper
from newspaper import Config

# Build sources for three Vietnamese news sites; caching is disabled so
# every run sees the full article list, and image fetching is skipped to
# speed up crawling
config = Config()
config.memoize_articles = False
config.fetch_images = False
config.verbose = True
config.MAX_SUMMARY_SENT = 10
config.language = "vi"

vnexpress = newspaper.build('https://vnexpress.net', config=config)
zingvn = newspaper.build('https://news.zing.vn', config=config)
kenh14 = newspaper.build('https://kenh14.vn/', config=config)


def extract_data(news, file_name):
    print("crawling: ", file_name)
    data = []
    for article in tqdm(news.articles):
        # Comment-anchor links point at an already-seen story, so skip them
        if "#box_comment" in article.url:
            continue
        temp = {"link": article.url}