def worker(news_text):
    """Run the text of one ``news_text`` record through the text preparer.

    Prints ``news_text.pk`` as a progress marker, then passes
    ``news_text.text`` to ``text_prerparer.text_preparer``.

    Args:
        news_text: Object exposing ``pk`` and ``text`` attributes.

    Returns:
        A ``(news_text, refined_text)`` tuple on success, or ``None`` when
        processing raised an ordinary error (the offending record is printed
        so the failure shows up in the output).
    """
    try:
        print(news_text.pk)
        refined_text = text_prerparer.text_preparer(news_text.text)
    except Exception:
        # Best-effort processing: one bad record must not abort the whole
        # run. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate so the job can be stopped.
        print(news_text)
        return None
    return news_text, refined_text
Example #2
0
 def worker(news_text):
     """Run the text of one ``news_text`` record through the text preparer.

     Prints ``news_text.pk`` as a progress marker, then passes
     ``news_text.text`` to ``text_prerparer.text_preparer``.

     Args:
         news_text: Object exposing ``pk`` and ``text`` attributes.

     Returns:
         A ``(news_text, refined_text)`` tuple on success, or ``None`` when
         processing raised an ordinary error (the offending record is
         printed so the failure shows up in the output).
     """
     try:
         print(news_text.pk)
         refined_text = text_prerparer.text_preparer(news_text.text)
     except Exception:
         # Best-effort processing: one bad record must not abort the whole
         # run. ``Exception`` (rather than a bare ``except``) lets
         # KeyboardInterrupt / SystemExit propagate so the job can be stopped.
         print(news_text)
         return None
     return news_text, refined_text
 def worker(news):
     """Fetch one news page and build its prepared-text record.

     Downloads ``news.url`` through ``urlOpen.get_html`` and prints
     ``news.pk`` as a progress marker. When no HTML comes back, nothing
     else is done and ``None`` is returned.

     Args:
         news: Object exposing ``url`` and ``pk`` attributes.

     Returns:
         ``(NewsText(news=news, text=...), url_list)`` where ``url_list``
         holds everything yielded by ``aParser.get_a_from_news_text`` for
         the extracted text, or ``None`` if the page could not be fetched.
     """
     page = urlOpen.get_html(news.url)
     progress_line = str(news.pk) + "     "
     print(progress_line, end='\n')
     if not page:
         return None

     raw_text = textParser.get_text_from_html(page)
     # Links are collected before the tags are stripped from the text.
     links = list(aParser.get_a_from_news_text(news_url=news.url, text=raw_text))
     untagged = aParser.remove_all_tags(raw_text)
     prepared = text_prerparer.text_preparer(untagged)
     return NewsText(news=news, text=prepared), links
Example #4
0
 def worker(news):
     """Fetch one news page and build its prepared-text record.

     Downloads ``news.url`` through ``urlOpen.get_html`` and prints
     ``news.pk`` as a progress marker. When no HTML comes back, nothing
     else is done and ``None`` is returned.

     Args:
         news: Object exposing ``url`` and ``pk`` attributes.

     Returns:
         ``(NewsText(news=news, text=...), url_list)`` where ``url_list``
         holds everything yielded by ``aParser.get_a_from_news_text`` for
         the extracted text, or ``None`` if the page could not be fetched.
     """
     fetched_html = urlOpen.get_html(news.url)
     print(str(news.pk) + "     ", end='\n')
     if not fetched_html:
         return None

     extracted = textParser.get_text_from_html(fetched_html)
     # Links are collected before the tags are stripped from the text.
     found_urls = list(
         aParser.get_a_from_news_text(news_url=news.url, text=extracted)
     )
     prepared = text_prerparer.text_preparer(aParser.remove_all_tags(extracted))
     record = NewsText(news=news, text=prepared)
     return record, found_urls
Example #5
0
def get_key_word(news, news_title, key_words_count=10):
    """Pick the most frequent nouns of a news item as its key words.

    Nouns are taken with ``get_only_noun`` from ``news`` and from the title
    (the title is first passed through ``text_preparer``) and counted
    together. The count of every word that occurs in the title is tripled.
    Words listed in ``anti_key_words.txt`` (one per line, read from the
    current working directory) are never returned.

    Args:
        news: News body, handed to ``get_only_noun`` unchanged
            (presumably already prepared text -- confirm with the caller).
        news_title: Raw news title.
        key_words_count: Maximum number of key words to return.

    Returns:
        A list of at most ``key_words_count`` words, highest count first.

    Raises:
        OSError: If ``anti_key_words.txt`` cannot be opened.
    """
    plain_news_title = text_preparer(news_title)
    interesting_words_from_news = get_only_noun(news)
    interesting_words_from_title = get_only_noun(plain_news_title)
    counted_words = Counter(
        interesting_words_from_news + interesting_words_from_title
    )

    # Boost each distinct title word exactly once. Iterating the raw list
    # would multiply by 3 per occurrence (x9 for a word repeated twice).
    # Every title word is already a key of the counter, so no membership
    # check is needed.
    for word in set(interesting_words_from_title):
        counted_words[word] *= 3

    # NOTE(review): the file is read with the platform default encoding;
    # confirm the file's encoding and pass it explicitly if it is fixed.
    with open('anti_key_words.txt') as anti_key_words_file:
        anti_key_words = set(anti_key_words_file.read().splitlines())

    # Filter blocked words *before* truncating, so that they do not use up
    # slots among the top ``key_words_count`` candidates.
    allowed_words = [
        word
        for word, _count in counted_words.most_common()
        if word not in anti_key_words
    ]
    return allowed_words[:max(key_words_count, 0)]