def worker(news_text):
    """Refine the stored raw text of a single NewsText record.

    Args:
        news_text: object exposing ``.pk`` and ``.text`` — presumably a
            NewsText model instance (TODO confirm against caller).

    Returns:
        ``(news_text, refined_text)`` on success; implicit ``None`` when
        text preparation fails (best-effort contract preserved so callers
        that truth-test the result keep working).
    """
    try:
        print(news_text.pk)  # progress marker for long batch runs
        refined_text = text_prerparer.text_preparer(news_text.text)
        return news_text, refined_text
    except Exception as exc:
        # Was a bare `except:` that printed only the record, hiding the
        # actual error (and swallowing KeyboardInterrupt/SystemExit).
        # Keep the best-effort behavior but surface what went wrong.
        print(news_text, exc)
def worker(news):
    """Download one news page, collect its links, and clean its text.

    Args:
        news: object exposing ``.pk`` and ``.url`` — presumably a News
            model instance (TODO confirm against caller).

    Returns:
        ``(NewsText(news=news, text=cleaned_text), url_list)`` when the
        page could be fetched; implicit ``None`` when ``get_html`` returns
        a falsy value (preserved: callers may truth-test the result).
    """
    html = urlOpen.get_html(news.url)
    # Progress marker; trailing space kept to match the original output.
    print(f"{news.pk} ")
    if html:
        text = textParser.get_text_from_html(html)
        # Collect outbound links before the <a> tags are stripped below.
        url_list = list(aParser.get_a_from_news_text(news_url=news.url, text=text))
        text = aParser.remove_all_tags(text)
        text = text_prerparer.text_preparer(text)
        return NewsText(news=news, text=text), url_list
def worker(news):
    """Fetch a news page, gather its outbound links, and return the
    prepared article text wrapped in a NewsText, plus the link list.

    NOTE(review): this redefines the ``worker`` directly above with an
    identical body; being later in the file, it shadows the earlier one
    at import time — likely an accidental duplicate, confirm intent.

    Returns ``(NewsText, url_list)`` on success, implicit ``None`` when
    the page cannot be fetched.
    """
    page_html = urlOpen.get_html(news.url)
    print(str(news.pk) + " ", end='\n')
    if not page_html:
        return None
    raw_text = textParser.get_text_from_html(page_html)
    links = [u for u in aParser.get_a_from_news_text(news_url=news.url, text=raw_text)]
    stripped = aParser.remove_all_tags(raw_text)
    prepared = text_prerparer.text_preparer(stripped)
    return NewsText(news=news, text=prepared), links
def get_key_word(news, news_title):
    """Pick up to 10 key words for an article, weighting title words 3x.

    Args:
        news: article text, apparently already prepared — it is fed
            straight to ``get_only_noun`` (TODO confirm with caller).
        news_title: raw title; run through ``text_preparer`` here first.

    Returns:
        list[str]: at most 10 of the most frequent nouns, with words
        listed in ``anti_key_words.txt`` filtered out (so fewer than 10
        may be returned — behavior preserved from the original).
    """
    plain_news_title = text_preparer(news_title)
    interesting_words_from_news = get_only_noun(news)
    interesting_words_from_title = get_only_noun(plain_news_title)

    counted_words = Counter(interesting_words_from_news + interesting_words_from_title)
    # Boost title words 3x. Iterate the *set* of title words so a word
    # repeated in the title is tripled exactly once — the old loop ran
    # per occurrence, giving 9x for a word appearing twice, which looks
    # unintended since the Counter already reflects occurrence counts.
    # (The old `if word in counted_words` guard was always true: every
    # title word was included in the Counter above.)
    for word in set(interesting_words_from_title):
        counted_words[word] *= 3

    key_words_count = 10
    all_words = counted_words.most_common(key_words_count)

    # Explicit encoding: the stop-word list may contain non-ASCII words,
    # and the locale-dependent default is not portable.
    with open('anti_key_words.txt', encoding='utf-8') as anti_key_words_file:
        anti_key_words = set(anti_key_words_file.read().splitlines())

    return [word for word, _count in all_words if word not in anti_key_words]