def test_remove_stopwords_en(english_language_model, text, english_stopwords, expectedoutput):
    # arrange - not needed
    # act
    new_text = rm_stopwords(english_language_model, text, english_stopwords)
    # assert
    assert new_text == expectedoutput


def test_remove_stopwords_mixed_no(norwegian_language_model, text, all_stopwords, expectedoutput):
    # arrange - not needed
    # act
    new_text = rm_stopwords(norwegian_language_model, text, all_stopwords)
    # assert
    assert new_text == expectedoutput


def test_remove_stopwords_simple(english_language_model, all_stopwords):
    # arrange
    text = 'to be or not to be'
    model = english_language_model
    stopwords = all_stopwords
    # act
    new_text = rm_stopwords(model, text, stopwords)
    # assert
    assert new_text == ""
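# The tests above rely on pytest fixtures (english_language_model,
# norwegian_language_model, text, english_stopwords, all_stopwords and
# expectedoutput). A minimal conftest.py sketch follows, assuming spaCy's
# en_core_web_md and nb_core_news_sm models and the get_stopwords helper used
# elsewhere in this package (its import path is omitted here); the real
# fixtures, and any parametrisation supplying text and expectedoutput, may differ.

import pytest
import spacy


@pytest.fixture(scope='session')
def english_language_model():
    # Session scope so the model is only loaded once per test run
    return spacy.load('en_core_web_md')


@pytest.fixture(scope='session')
def norwegian_language_model():
    return spacy.load('nb_core_news_sm')


@pytest.fixture(scope='session')
def english_stopwords():
    stopwords, stops_nb, stops_en = get_stopwords()
    return stops_en


@pytest.fixture(scope='session')
def all_stopwords():
    stopwords, stops_nb, stops_en = get_stopwords()
    return stopwords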
###############################################################################
# Preprocess text - the corpus for this example is very small so that the
# documentation can build. We therefore split the single document into
# paragraphs to imitate multiple input documents, and we also remove stopwords
# and punctuation because the text is so small.

# Split text into paragraphs to imitate documents
docs = text.split('\n\n')
# Replace \n with a space
docs = [d.replace('\n', ' ') for d in docs]
# Because the example text is small, remove stopwords and punctuation
en = spacy.load('en_core_web_md')
stopwords, stops_nb, stops_en = stdt.get_stopwords()
docs = [stdt.rm_punctuation(en, stdt.rm_stopwords(en, d, stops_en)) for d in docs]

###############################################################################
# Create the topic model and visualise the keywords per topic
tp_model, dictionary = tp.bow_topic_modelling(docs, no_topics=3)

# Print topics
tp.print_topic_words(tp_model)

###############################################################################
# Determine the topic of a new document
fake_doc = 'This is a sentence about the importance of artificial intelligence.'
doc_topics = tp.determine_topics(fake_doc, tp_model, dictionary)

# Visualise the top topics for the document
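###############################################################################
# A minimal plotting sketch for the comment above, assuming doc_topics is a
# list of (topic id, probability) pairs in the gensim style; adjust the
# unpacking if determine_topics returns a different structure.
import matplotlib.pyplot as plt

topic_ids = [topic_id for topic_id, _ in doc_topics]
probabilities = [prob for _, prob in doc_topics]

plt.bar(topic_ids, probabilities)
plt.xticks(topic_ids)
plt.xlabel('Topic id')
plt.ylabel('Probability')
plt.title('Top topics for the new document')
plt.show()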
###############################################################################
# Preprocess text - the corpus for this example is very small so that the
# documentation can build. We therefore split the single document into
# paragraphs to imitate multiple input documents, and we also remove stopwords
# because the text is too small for the tf-idf computation to handle them.
# For normal use you should NOT remove stopwords before computing tf-idf scores.

# Split text into paragraphs to imitate documents
paragraphs = text.split('\n\n')
# Replace \n with a space
paragraphs = [p.replace('\n', ' ') for p in paragraphs]
# Because the example text is small, remove stopwords as they'll influence the tf-idf scores
en = spacy.load('en_core_web_md')
stopwords, stops_nb, stops_en = stdt.get_stopwords()
paragraphs = [stdt.rm_stopwords(en, p, stops_en) for p in paragraphs]

###############################################################################
# Compute tf-idf scores and determine the most important words across the full corpus
scores = dists.compute_tfidf(paragraphs)
print(dists.important_words_per_corpus(scores))

###############################################################################
# Determine the most important words per document (or, for this example, per paragraph)
iw_p_d = dists.important_words_per_doc(scores)
for i, iw in enumerate(iw_p_d):
    print('Paragraph %i important words:' % int(i + 1))
    print(iw)
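###############################################################################
# For comparison, a minimal sketch of the same idea with scikit-learn's
# TfidfVectorizer; this is an assumption about rough equivalence, as
# dists.compute_tfidf may tokenise or weight terms differently. Each row of
# the matrix holds the tf-idf scores for one paragraph.
from sklearn.feature_extraction.text import TfidfVectorizer

vectoriser = TfidfVectorizer()
tfidf_matrix = vectoriser.fit_transform(paragraphs)

# Highest-scoring term in the first paragraph
first_row = tfidf_matrix[0].toarray().ravel()
terms = vectoriser.get_feature_names_out()
print(terms[first_row.argmax()])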
print(text)

# Load the spaCy language model
langmodel = spacy.load('en_core_web_md')

###############################################################################
# Get stopwords: the get_stopwords function returns all English and Norwegian stopwords
stopwords_func, stopwords_nb_func, stopwords_en_func = get_stopwords()
print(stopwords_en_func[:5])

###############################################################################
# Remove English stopwords
processed_text = rm_stopwords(langmodel, text, stopwords_en_func)
print(processed_text)

###############################################################################
# Now that the stopwords have been removed, the sentences no longer make much sense.
# An alternative to viewing the text output is therefore to look at the frequency
# distribution of the remaining text.
orig_top10 = pd.DataFrame(dists.freq_dist(tokenise(langmodel, text)), columns=['token', 'count'])
pr_top10 = pd.DataFrame(dists.freq_dist(tokenise(langmodel, processed_text)), columns=['token', 'count'])

print("ORIGINAL - top 10 words")
print(orig_top10.head(10))
print(" ")
print("PROCESSED - top 10 words")
print(pr_top10.head(10))
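###############################################################################
# A minimal plotting sketch, assuming matplotlib is available and that
# freq_dist returns tokens sorted by descending count (as the head(10) calls
# above imply); it draws the two top-10 distributions side by side.
import matplotlib.pyplot as plt

fig, (ax_orig, ax_proc) = plt.subplots(1, 2, figsize=(10, 4))
orig_top10.head(10).plot.bar(x='token', y='count', ax=ax_orig, title='Original', legend=False)
pr_top10.head(10).plot.bar(x='token', y='count', ax=ax_proc, title='Processed', legend=False)
plt.tight_layout()
plt.show()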