Example #1
def test_remove_stopwords_en(english_language_model, text, english_stopwords, expectedoutput):

    # arrange - not needed

    # act
    new_text = rm_stopwords(english_language_model, text, english_stopwords)

    # assert
    assert new_text == expectedoutput
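
The text and expectedoutput arguments above come from fixtures or parametrize decorators that were stripped when these examples were extracted. A minimal sketch of how they might be supplied with pytest parametrization follows; the sample sentences and expected outputs are assumptions, not values from the original test suite:

import pytest

@pytest.mark.parametrize('text, expectedoutput', [
    # Hypothetical cases: English stopwords removed from short sentences
    ('this is a sentence', 'sentence'),
    ('the cat sat on the mat', 'cat sat mat'),
])
def test_remove_stopwords_en(english_language_model, text, english_stopwords, expectedoutput):
    # act
    new_text = rm_stopwords(english_language_model, text, english_stopwords)

    # assert
    assert new_text == expectedoutput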
Example #2
def test_remove_stopwords_mixed_no(norwegian_language_model, text, all_stopwords, expectedoutput):

    # arrange - not needed

    # act
    new_text = rm_stopwords(norwegian_language_model, text, all_stopwords)

    # assert
    assert new_text == expectedoutput
Example #3
def test_remove_stopwords_simple(english_language_model, all_stopwords):

    # arrange
    text = 'to be or not to be'
    model = english_language_model
    stopwords = all_stopwords

    # act
    new_text = rm_stopwords(model, text, stopwords)

    # assert
    assert new_text == ""
Example #4

###############################################################################
# Preprocess text. For this example the corpus is very small (to let the
# documentation build), so we split the single document into paragraphs to
# imitate multiple input documents. Because the text is so small, we also
# remove stopwords and punctuation.

# Split text into paragraphs to imitate documents
docs = text.split('\n\n')
# Remove \n and replace with space
docs = [d.replace('\n', ' ') for d in docs]

# Because example text is small, remove stopwords and punctuation
en = spacy.load('en_core_web_md')
stopwords, stops_nb, stops_en = stdt.get_stopwords()
docs = [stdt.rm_punctuation(en, stdt.rm_stopwords(en, d, stops_en)) for d in docs]

###############################################################################
# Create topic model & visualise keywords per topic

tp_model, dictionary = tp.bow_topic_modelling(docs, no_topics=3)
# print topics
tp.print_topic_words(tp_model)
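
# If tp_model is a gensim LdaModel under the hood (an assumption, suggested by
# the model/dictionary pair returned above), the topics can also be inspected
# directly through gensim's own API:
for topic_id, words in tp_model.show_topics(num_topics=3, num_words=5):
    print(topic_id, words)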

###############################################################################
# Determine the topic of a new document

fake_doc = 'This is a sentence about the importance of artificial intelligence.'
doc_topics = tp.determine_topics(fake_doc, tp_model, dictionary)

# Visualise the top topics for the document
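# doc_topics is assumed here to be a list of (topic id, probability) pairs,
# as gensim's get_document_topics returns; sort and print the strongest topics.
for topic_id, prob in sorted(doc_topics, key=lambda t: t[1], reverse=True):
    print('Topic %i: %.3f' % (topic_id, prob))
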
###############################################################################
# Preprocess text. For this example the corpus is very small (to let the
# documentation build), so we split the single document into paragraphs to
# imitate multiple input documents. Because the text is so small, we also
# remove stopwords; with this little text the tf-idf computation cannot
# down-weight them properly. Under normal circumstances you should NOT remove
# stopwords before computing tf-idf scores.

# Split text into paragraphs to imitate documents
paragraphs = text.split('\n\n')
# Remove \n and replace with space
paragraphs = [p.replace('\n', ' ') for p in paragraphs]

# Because example text is small, remove stopwords as they'll influence tf-idf scores
en = spacy.load('en_core_web_md')
stopwords, stops_nb, stops_en = stdt.get_stopwords()
paragraphs = [stdt.rm_stopwords(en, p, stops_en) for p in paragraphs]

###############################################################################
# Compute tf-idf scores and determine most important words across full corpus

scores = dists.compute_tfidf(paragraphs)
print(dists.important_words_per_corpus(scores))
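
# For comparison only: a similar corpus-level ranking sketched with
# scikit-learn's TfidfVectorizer. This is not part of the library above, and
# it assumes important_words_per_corpus ranks words by an aggregate tf-idf
# weight, which is an assumption.
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vec = TfidfVectorizer()
weights = vec.fit_transform(paragraphs)
totals = np.asarray(weights.sum(axis=0)).ravel()
top10 = totals.argsort()[::-1][:10]
print([vec.get_feature_names_out()[i] for i in top10])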

###############################################################################
# Determine most important words per document (or for this example, per paragraph)
iw_p_d = dists.important_words_per_doc(scores)

for i, iw in enumerate(iw_p_d):
    print('Paragraph %i important words:' % (i + 1))
    print(iw)
###############################################################################
# Print the original example text

print(text)

# Load the spaCy language model
langmodel = spacy.load('en_core_web_md')


###############################################################################
# Get stopwords: the get_stopwords function returns the combined stopword list
# together with the Norwegian and English lists separately

stopwords, stopwords_nb, stopwords_en = get_stopwords()
print(stopwords_en[:5])

###############################################################################
# Remove English stopwords

processed_text = rm_stopwords(langmodel, text, stopwords_en)
print(processed_text)

###############################################################################
# Now that the stopwords have been removed, the sentences no longer make much
# sense. An alternative to viewing the raw text output is therefore to look at
# the frequency distribution of the remaining tokens


orig_top10 = pd.DataFrame(dists.freq_dist(tokenise(langmodel, text)), columns=['token', 'count'])
pr_top10 = pd.DataFrame(dists.freq_dist(tokenise(langmodel, processed_text)), columns=['token', 'count'])

print ("ORIGINAL - top 10 words")
print (orig_top10.head(10))
print (" ")
print ("PROCESSED - top 10 words")
print (pr_top10.head(10))
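
###############################################################################
# For intuition: freq_dist is assumed to return (token, count) pairs sorted by
# descending frequency, so a plain-Python equivalent would look like this
# (illustration only, not the library's implementation):

from collections import Counter

def freq_dist_sketch(tokens):
    # Count the tokens and return them from most to least frequent
    return Counter(tokens).most_common()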