def summarise_one(url, title=True, keywords=True, summary=False,
                  top_img_src=False):
    """Download one article and extract the requested fields.

    Each boolean flag selects a field; in the returned tuple the flag is
    replaced by the extracted value when requested (and available), and
    is otherwise passed through unchanged.

    Parameters
    ----------
    url : str
        Address of the article to process.
    title, keywords, summary, top_img_src : bool
        Which fields to extract.

    Returns
    -------
    tuple
        (title, keywords, summary, top_img_src)
    """
    # Configuration for Newspaper to minimize processing time:
    # skip image fetching and bound the NLP summary size.
    configure = Config()
    configure.fetch_images = False
    configure.MAX_SUMMARY = 300
    configure.MAX_SUMMARY_SENT = 3

    # BUG FIX: the original built `configure` but never passed it to
    # Article(), so none of the settings above took effect.
    article = Article(url, config=configure)

    try:
        article.download()
        article.parse()
    except Exception:
        # Best-effort behaviour kept from the original: report the
        # failing URL and continue (unparsed fields stay empty).
        print(url)

    title = article.title
    if keywords or summary:
        try:
            article.nlp()
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except Exception:
            print('Newspaper error with nlp() call')
    if top_img_src:
        top_img_src = article.top_image
    return title, keywords, summary, top_img_src
import newspaper from newspaper import Config, Article from lxml import etree import multiprocessing import logging import traceback config = Config() config.language = 'pt' config.MAX_SUMMARY = 500 #number max of characters config.fetch_images = False #we don't want images # Requires an URL and returns the Article object def get_article(url): article = Article(url=url, config=config) #Create a article try: article.download() article.parse() article.nlp() return article except Exception as e: logging.error(traceback.format_exc()) # Gets the Articles using multiprocessing to raise performance def get_articles(urls): pool = multiprocessing.Pool( multiprocessing.cpu_count() - 1 or 1) # gets maximum number of CPU cores minus 1 articles = [