Example 1
from newspaper import Article, Config

def summarise_one(url, title=True, keywords=True, summary=False,
                  top_img_src=False):
    '''
    Fetch the article at url and return its title, keywords, summary,
    and top image source (each only if requested).
    '''
    # configuration for newspaper to minimise processing time
    configure = Config()
    configure.fetch_images = False
    configure.MAX_SUMMARY = 300       # cap the summary at 300 characters
    configure.MAX_SUMMARY_SENT = 3    # at most 3 sentences in the summary

    article = Article(url, config=configure)
    
    try:
        article.download()
        article.parse()
    except Exception:
        print('Newspaper failed to download/parse: ' + url)

    title = article.title
    if keywords or summary:
        try:
            article.nlp()
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except Exception:
            print('Newspaper error with nlp() call')
        
    if top_img_src:
        top_img_src = article.top_image
   
    return title, keywords, summary, top_img_src
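
A minimal usage sketch for the function above (the URL below is just a placeholder):

title, keywords, summary, top_img = summarise_one(
    'https://example.com/some-article', summary=True, top_img_src=True)
print(title)
print(keywords)
print(summary)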
Example 2
import newspaper
from newspaper import Config, Article
from lxml import etree
import multiprocessing
import logging
import traceback

config = Config()
config.language = 'pt'
config.MAX_SUMMARY = 500  # maximum number of characters in the summary
config.fetch_images = False  # we don't need images


# Takes a URL and returns the parsed Article object (or None on failure)
def get_article(url):
    article = Article(url=url, config=config)  # create an Article using the shared config
    try:
        article.download()
        article.parse()
        article.nlp()
        return article
    except Exception:
        logging.error(traceback.format_exc())
        return None

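# A hypothetical usage sketch of get_article on its own (the URL is a placeholder):
#   article = get_article('https://example.com/noticia')
#   if article is not None:
#       print(article.title, article.summary)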

# Fetches the Articles in parallel with multiprocessing to improve performance
def get_articles(urls):
    pool = multiprocessing.Pool(
        multiprocessing.cpu_count() - 1
        or 1)  # use all CPU cores but one (fall back to 1 on single-core machines)
    articles = [