def get_article(link, news, date):
    """Fetch, parse and summarize the article at *link*.

    Tries an English parse first; if the result has an (almost) empty
    title or body, retries assuming a Malay/Indonesian page. On that
    fallback path, when an ``xgb_language`` model is available it is
    used for language detection and Malaya provides the summary and
    keyword clusters; otherwise newspaper's own ``nlp()`` is used.

    Returns a dict of article metadata plus the caller-supplied *news*
    source name and *date*.
    """
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()

    lang = 'ENGLISH'
    # A near-empty title/body suggests the English parser failed,
    # so re-fetch with the Indonesian parser instead.
    if len(article.title) < 5 or len(article.text) < 5:
        lang = 'INDONESIA'
        print('found BM/ID article')
        article = Article(link, language='id')
        article.download()
        article.parse()
        if xgb_language:
            lang = xgb_language.predict(article.text)
            malaya_result = get_malaya_summary(article.text.split('\n'))
            article.summary = malaya_result['summary']
            article.keywords = malaya_result['cluster-top-words']
        else:
            article.nlp()

    return {
        'title': article.title,
        'url': link,
        'authors': article.authors,
        'top-image': article.top_image,
        'text': article.text,
        'keyword': article.keywords,
        'summary': article.summary,
        'news': news,
        'date': date,
        'language': lang,
    }
def process_html(url, html):
    """Parse pre-downloaded *html* for *url* and extract keywords.

    Feeds the raw HTML to newspaper's ``Article`` (skipping the network
    fetch), runs its NLP pass, then flattens ``authors`` into a
    '; '-joined string and ``keywords`` into a ', '-joined string.

    Returns a tuple ``(article, keywords)`` where *keywords* is a copy
    of the original keyword list (taken before the join overwrote the
    attribute).
    """
    log(f'Processing {url}')

    art = Article(url, KEYWORD_COUNT=25)
    art.download(input_html=html)
    art.parse()
    art.authors = '; '.join(art.authors)
    log(f'Parsed {len(art.text)} bytes of natural text')

    art.nlp()
    # Snapshot the list before replacing the attribute with its joined form.
    kw_list = copy.deepcopy(art.keywords)
    art.keywords = ', '.join(kw_list)

    return art, kw_list
def processUrl():
    """Download and NLP-process the module-level *url*.

    Stores the parsed result in the global ``article`` and drops any
    keyword that contains one of the module-level ``non_keywords``
    substrings. Prints start/end markers for progress visibility.
    """
    global article

    print('start processing')
    article = Article(url)
    article.download()
    article.parse()

    # Make sure the sentence tokenizer required by .nlp() is present.
    nltk.download('punkt')
    article.nlp()

    # Keep only keywords that contain none of the banned substrings.
    filtered = [
        kw for kw in article.keywords
        if not any(banned in kw for banned in non_keywords)
    ]
    article.keywords = filtered
    print('end processing')