Example #1
0
def test_whitespace_removal(whitespace_removal_actual,
                            whitespace_removal_expected):
    """Compare preprocess_text output with the whitespace-removal fixture.

    Lowercasing and the ``eng_lang`` option are switched off, and the regex
    is ``(?!).*`` -- an empty negative lookahead, which can never match --
    presumably so that only whitespace handling influences the result.
    """
    options = {'lowercase': False, 'regex': '(?!).*', 'eng_lang': False}
    result = preprocess_text(whitespace_removal_actual, **options)
    pd.testing.assert_series_equal(whitespace_removal_expected, result)
Example #2
0
def test_regex_default(sents_regex, sents_default_expected):
    """Run preprocess_text without a ``regex`` argument and compare results.

    ``lowercase`` and ``eng_lang`` are both passed as False; the regex is
    left at whatever preprocess_text uses by default.
    """
    actual = preprocess_text(sents_regex, lowercase=False, eng_lang=False)
    pd.testing.assert_series_equal(sents_default_expected, actual)
Example #3
0
def test_lowercase(lowercase_actual, lowercase_expected):
    """Compare preprocess_text output with the lowercase fixture.

    ``lowercase`` is not passed, so its default applies. The regex
    ``(?!).*`` can never match (empty negative lookahead) and ``eng_lang``
    is False.
    """
    never_matching = '(?!).*'
    result = preprocess_text(
        lowercase_actual, regex=never_matching, eng_lang=False
    )
    pd.testing.assert_series_equal(lowercase_expected, result)
Example #4
0
def test_stopword_default(stopword_sents):
    """With default arguments the output must equal the input series."""
    result = preprocess_text(stopword_sents)
    # The input fixture doubles as the expected value.
    pd.testing.assert_series_equal(left=stopword_sents, right=result)
Example #5
0
def test_stopword_removal(stopword_sents, stopword_removal_expected):
    """Run with ``stop_words=True`` and compare with the expected fixture."""
    without_stopwords = preprocess_text(stopword_sents, stop_words=True)
    pd.testing.assert_series_equal(
        stopword_removal_expected,
        without_stopwords,
    )
Example #6
0
def test_stem_and_lemma(stem_actual):
    """preprocess_text must raise when both ``stem`` and ``lemma`` are set."""
    conflicting = {'stem': True, 'lemma': True, 'eng_lang': False}
    # NOTE(review): any Exception subclass satisfies this check; the concrete
    # type raised by preprocess_text is not visible from this file.
    with pytest.raises(Exception):
        preprocess_text(stem_actual, **conflicting)
Example #7
0
def test_language(language_actual, language_expected):
    """Run with ``eng_lang=True`` and compare with the language fixture."""
    result = preprocess_text(language_actual, eng_lang=True)
    pd.testing.assert_series_equal(left=language_expected, right=result)
Example #8
0
def test_token_list2(token_list_actual2, token_list_expected2):
    """Request ``token_list=True`` with lemmatisation explicitly disabled.

    ``eng_lang`` is False as well; the result is compared with the second
    token-list fixture.
    """
    options = {'lemma': False, 'token_list': True, 'eng_lang': False}
    tokens = preprocess_text(token_list_actual2, **options)
    pd.testing.assert_series_equal(token_list_expected2, tokens)
Example #9
0
def test_stem(stem_actual, stem_expected):
    """Run with ``stem=True`` (``eng_lang`` off) and compare the output."""
    stemmed = preprocess_text(
        stem_actual,
        stem=True,
        eng_lang=False,
    )
    pd.testing.assert_series_equal(stem_expected, stemmed)
Example #10
0
def test_token_list(lemma_actual, token_list_expected):
    """Request ``token_list=True`` together with ``lemma=True``.

    Uses the lemma input fixture, keeps ``eng_lang`` off, and compares the
    result with the token-list fixture.
    """
    options = {'lemma': True, 'token_list': True, 'eng_lang': False}
    tokens = preprocess_text(lemma_actual, **options)
    pd.testing.assert_series_equal(token_list_expected, tokens)
Example #11
0
def test_nan_replace(nan_removal_actual, nan_replace_expected):
    """Pass ``nan_handling='bad'`` and compare with the nan-replace fixture.

    NOTE(review): 'bad' is presumably an unrecognised option that falls back
    to replacing NaN values -- confirm against preprocess_text.
    """
    result = preprocess_text(
        nan_removal_actual, nan_handling='bad', eng_lang=False
    )
    pd.testing.assert_series_equal(nan_replace_expected, result)
Example #12
0
def test_dict_replace(dict_replace_actual, dict_replace_expected, sample_dict):
    """Pass ``sample_dict`` as ``replace_dict`` and compare the output.

    ``eng_lang`` is switched off for this check.
    """
    options = {'replace_dict': sample_dict, 'eng_lang': False}
    replaced = preprocess_text(dict_replace_actual, **options)
    pd.testing.assert_series_equal(dict_replace_expected, replaced)
Example #13
0
def test_regex_cases(sents_regex, expected, regex):
    """Run preprocess_text with the supplied ``regex`` and compare results.

    ``lowercase`` and ``eng_lang`` are both off. ``expected`` and ``regex``
    are presumably provided by a parametrize decorator or fixtures that are
    not visible in this file.
    """
    options = {'lowercase': False, 'regex': regex, 'eng_lang': False}
    actual = preprocess_text(sents_regex, **options)
    pd.testing.assert_series_equal(expected, actual)
Example #14
0
"""
Copyright © 2020 Johnson & Johnson
"""

import pandas as pd
from nlprov.preprocessing import preprocess_text
from nlprov.vectorize import vectorize_text, vectorize_new_text
from nlprov.similarity_calc import similarity_calculation

# Sample corpus: stray whitespace, mixed case, punctuation-only strings and
# a few non-English / mixed-language sentences.
# The backslash in the third entry is written as "\\" because "\-" is not a
# valid escape sequence (Python warns about it); the runtime string is the
# same: a single backslash followed by a hyphen.
text = pd.Series(data=[
    "  Combination  of   spaces.    ", "MixEd CASe", ",./;'[]\\-=",
    '<>?:"{}|_+', '!@#$%^&*()`~"', "lemmas needed",
    "ducks and cats and ponies are not similar", "c'est français",
    "das ist deutsch", "this is una mezcla"
])

# Preprocess with the library defaults, then vectorize. vectorize_text
# returns two values: the vectorized corpus and an object that is handed to
# vectorize_new_text below.
preprocessed_text = preprocess_text(text)
vec_text, vec_obj = vectorize_text(preprocessed_text)

# Put a new sentence through the same preprocessing and vectorize it with
# the object obtained from the corpus.
new_text = pd.Series(data=["ducks and cats are not similar"])
new_preprocessed_text = preprocess_text(new_text)
new_vec_text = vectorize_new_text(new_preprocessed_text, vec_obj)

# Similarity of the new sentence's vector(s) against the corpus vectors.
similarity = similarity_calculation(new_vec_text, vec_text)