Exemple #1
0
def get_article(link, news, date):
    """Fetch the article at *link* and return its extracted fields as a dict.

    The link is first processed with a default ``Article`` (download,
    parse, nlp) and labelled ``'ENGLISH'``. If the resulting title or text
    is shorter than 5 characters, the link is processed again with
    ``language='id'`` and labelled ``'INDONESIA'``. On that second pass,
    when the module-level ``xgb_language`` is truthy, its ``predict`` result
    for the text replaces the language label and ``get_malaya_summary``
    supplies the summary and keywords; otherwise ``Article.nlp`` is used.

    Args:
        link: URL handed to ``Article``; also returned under ``'url'``.
        news: Passed through unchanged under the ``'news'`` key.
        date: Passed through unchanged under the ``'date'`` key.

    Returns:
        A dict with the keys ``'title'``, ``'url'``, ``'authors'``,
        ``'top-image'``, ``'text'``, ``'keyword'``, ``'summary'``,
        ``'news'``, ``'date'`` and ``'language'``.
    """
    doc = Article(link)
    doc.download()
    doc.parse()
    doc.nlp()
    language = 'ENGLISH'

    # Title is checked first; the text length is only looked at when the
    # title is long enough.
    has_enough_content = len(doc.title) >= 5 and len(doc.text) >= 5
    if not has_enough_content:
        language = 'INDONESIA'
        print('found BM/ID article')

        # Second attempt, this time asking for the 'id' language.
        doc = Article(link, language='id')
        doc.download()
        doc.parse()

        if not xgb_language:
            doc.nlp()
        else:
            language = xgb_language.predict(doc.text)
            digest = get_malaya_summary(doc.text.split('\n'))
            doc.summary = digest['summary']
            doc.keywords = digest['cluster-top-words']

    record = {
        'title': doc.title,
        'url': link,
        'authors': doc.authors,
        'top-image': doc.top_image,
        'text': doc.text,
        'keyword': doc.keywords,
        'summary': doc.summary,
        'news': news,
        'date': date,
        'language': language,
    }
    return record
def process_html(url, html):
    """Build an ``Article`` from already-fetched HTML and run NLP on it.

    The article's ``authors`` attribute is overwritten with a single
    ``'; '``-joined string and its ``keywords`` attribute with a single
    ``', '``-joined string. Progress is reported through ``log``.

    Args:
        url: URL passed to the ``Article`` constructor and logged.
        html: Markup passed to ``Article.download`` as ``input_html``.

    Returns:
        A ``(article, keywords)`` tuple, where ``keywords`` is a deep copy
        of the keyword collection taken before it was joined into a string.
    """
    log(f'Processing {url}')

    doc = Article(url, KEYWORD_COUNT=25)
    doc.download(input_html=html)
    doc.parse()

    # Flatten the authors into one display string before logging.
    author_line = '; '.join(doc.authors)
    doc.authors = author_line
    text_length = len(doc.text)
    log(f'Parsed {text_length} bytes of natural text')

    doc.nlp()
    # Keep an independent copy; the attribute itself is replaced by a
    # joined string on the next line.
    keyword_list = copy.deepcopy(doc.keywords)
    doc.keywords = ', '.join(keyword_list)
    return doc, keyword_list
Exemple #3
0
def processUrl():
    """Download, parse and keyword-filter the article at the global ``url``.

    The result is stored in the module-level ``article``. After NLP, every
    keyword that contains any entry of the global ``non_keywords`` is
    dropped from ``article.keywords``. Start and end markers are printed.
    """
    global article
    print('start processing')

    article = Article(url)
    article.download()
    article.parse()

    nltk.download('punkt')
    article.nlp()

    # Keep only keywords in which none of the excluded fragments occurs.
    kept = []
    for candidate in article.keywords:
        if not any(banned in candidate for banned in non_keywords):
            kept.append(candidate)
    article.keywords = kept

    print('end processing')