def test_whitespace_removal(whitespace_removal_actual, whitespace_removal_expected):
    """Compare preprocess_text output on the whitespace fixture with the expected Series.

    Lowercasing and ``eng_lang`` are switched off, and the regex ``(?!).*``
    can never match (its empty negative lookahead always fails).
    NOTE(review): presumably this isolates whitespace handling from the other
    preprocessing steps - confirm against preprocess_text.
    """
    options = {'lowercase': False, 'regex': '(?!).*', 'eng_lang': False}
    result = preprocess_text(whitespace_removal_actual, **options)
    pd.testing.assert_series_equal(whitespace_removal_expected, result)
def test_regex_default(sents_regex, sents_default_expected):
    """Check preprocess_text on the regex fixture when no ``regex`` argument is given.

    ``regex`` is omitted, so whatever default preprocess_text defines is used;
    lowercasing and ``eng_lang`` are switched off.
    """
    options = {'lowercase': False, 'eng_lang': False}
    result = preprocess_text(sents_regex, **options)
    pd.testing.assert_series_equal(sents_default_expected, result)
def test_lowercase(lowercase_actual, lowercase_expected):
    """Compare preprocess_text output on the lowercase fixture with the expected Series.

    ``lowercase`` is left at its default, ``eng_lang`` is off, and the regex
    ``(?!).*`` can never match (its empty negative lookahead always fails).
    """
    options = {'regex': '(?!).*', 'eng_lang': False}
    result = preprocess_text(lowercase_actual, **options)
    pd.testing.assert_series_equal(lowercase_expected, result)
def test_stopword_default(stopword_sents):
    """Assert that preprocess_text with all-default options returns the fixture unchanged."""
    result = preprocess_text(stopword_sents)
    # The input fixture doubles as the expected value.
    pd.testing.assert_series_equal(stopword_sents, result)
def test_stopword_removal(stopword_sents, stopword_removal_expected):
    """Compare preprocess_text output with ``stop_words=True`` against the expected Series."""
    result = preprocess_text(
        stopword_sents,
        stop_words=True,
    )
    pd.testing.assert_series_equal(stopword_removal_expected, result)
def test_stem_and_lemma(stem_actual):
    """Assert that requesting both ``stem`` and ``lemma`` makes preprocess_text raise."""
    options = {'stem': True, 'lemma': True, 'eng_lang': False}
    # Any Exception subclass satisfies this check.
    with pytest.raises(Exception):
        preprocess_text(stem_actual, **options)
def test_language(language_actual, language_expected):
    """Compare preprocess_text output with ``eng_lang=True`` against the expected Series."""
    result = preprocess_text(
        language_actual,
        eng_lang=True,
    )
    pd.testing.assert_series_equal(language_expected, result)
def test_token_list2(token_list_actual2, token_list_expected2):
    """Compare preprocess_text output with ``token_list=True`` and ``lemma=False``.

    ``eng_lang`` is switched off as well.
    """
    options = {'lemma': False, 'token_list': True, 'eng_lang': False}
    result = preprocess_text(token_list_actual2, **options)
    pd.testing.assert_series_equal(token_list_expected2, result)
def test_stem(stem_actual, stem_expected):
    """Compare preprocess_text output with ``stem=True`` (``eng_lang`` off) against the expected Series."""
    options = {'stem': True, 'eng_lang': False}
    result = preprocess_text(stem_actual, **options)
    pd.testing.assert_series_equal(stem_expected, result)
def test_token_list(lemma_actual, token_list_expected):
    """Compare preprocess_text output with ``lemma=True`` and ``token_list=True``.

    Uses the lemma input fixture; ``eng_lang`` is switched off.
    """
    options = {'lemma': True, 'token_list': True, 'eng_lang': False}
    result = preprocess_text(lemma_actual, **options)
    pd.testing.assert_series_equal(token_list_expected, result)
def test_nan_replace(nan_removal_actual, nan_replace_expected):
    """Compare preprocess_text output with ``nan_handling='bad'`` against the expected Series.

    NOTE(review): the meaning of the value 'bad' is defined by preprocess_text
    and is not visible here - confirm against its documentation.
    """
    options = {'nan_handling': 'bad', 'eng_lang': False}
    result = preprocess_text(nan_removal_actual, **options)
    pd.testing.assert_series_equal(nan_replace_expected, result)
def test_dict_replace(dict_replace_actual, dict_replace_expected, sample_dict):
    """Compare preprocess_text output when ``sample_dict`` is passed as ``replace_dict``.

    ``eng_lang`` is switched off.
    """
    result = preprocess_text(
        dict_replace_actual,
        replace_dict=sample_dict,
        eng_lang=False,
    )
    pd.testing.assert_series_equal(dict_replace_expected, result)
def test_regex_cases(sents_regex, expected, regex):
    """Compare preprocess_text output for a caller-supplied ``regex`` against ``expected``.

    Lowercasing and ``eng_lang`` are switched off.
    NOTE(review): ``expected`` and ``regex`` are presumably provided by
    parametrization or fixtures defined outside this function - confirm.
    """
    result = preprocess_text(
        sents_regex,
        lowercase=False,
        regex=regex,
        eng_lang=False,
    )
    pd.testing.assert_series_equal(expected, result)
""" Copyright © 2020 Johnson & Johnson """ import pandas as pd from nlprov.preprocessing import preprocess_text from nlprov.vectorize import vectorize_text, vectorize_new_text from nlprov.similarity_calc import similarity_calculation text = pd.Series(data=[ " Combination of spaces. ", "MixEd CASe", ",./;'[]\-=", '<>?:"{}|_+', '!@#$%^&*()`~"', "lemmas needed", "ducks and cats and ponies are not similar", "c'est français", "das ist deutsch", "this is una mezcla" ]) preprocessed_text = preprocess_text(text) vec_text, vec_obj = vectorize_text(preprocessed_text) new_text = pd.Series(data=["ducks and cats are not similar"]) new_preprocessed_text = preprocess_text(new_text) new_vec_text = vectorize_new_text(new_preprocessed_text, vec_obj) similarity = similarity_calculation(new_vec_text, vec_text)