def test_snowball_stemmer_english(self):
    """Stem an English phrase after language detection.

    The three inflections of "trouble" are all expected to come out as
    the same stem, "troubl"; the remaining words pass through unchanged.
    """
    steps = {
        'language_detect': None,
        'snowball_stemmer': None
    }
    stemmed = Pipeline(steps).consume('how can i trouble troubling troubled')
    self.assertEqual('how can i troubl troubl troubl', stemmed)
def test_snowball_stemmer_german(self):
    """Stem a German phrase after language detection.

    The expected output is lower-cased ("Wie" -> "wie") and the
    "kategorie" variants are reduced to the stems "kategori" / "kategor".
    """
    steps = {
        'language_detect': None,
        'snowball_stemmer': None
    }
    phrase = 'Wie kann ich kategorie kategorien kategorischen kategorisch'
    stemmed = Pipeline(steps).consume(phrase)
    self.assertEqual('wie kann ich kategori kategori kategor kategor',
                     stemmed)
def test_token_replacement(self):
    """Replace whole tokens using the mapping from the CSV fixture.

    With the fixture loaded, "asd" is expected to become "www" and "bla"
    to become "blub", while "test" and "1212" are left untouched.
    """
    # Context manager guarantees the fixture file is closed even if
    # reading it raises.
    with open("./token_replacement_testdata.csv", "r") as handler:
        replacements = handler.read()

    pipeline = Pipeline({"token_replacement": replacements})
    value = pipeline.consume("test asd bla 1212")
    self.assertEqual('test www blub 1212', value)
def test_token_replacement_do_not_crash_for_no_data(self):
    """Token replacement configured with None must leave text unchanged."""
    original_text = "test text"
    processed = Pipeline({"token_replacement": None}).consume(original_text)
    self.assertEqual("test text", processed)
def test_text_only(self):
    """The text_only step keeps only the words of the input.

    Digits and punctuation from the input do not appear in the expected
    output; the words remain separated by single spaces, with a leading
    and a trailing space.
    """
    raw = '123 text - "more" text , and .'
    cleaned = Pipeline({'text_only': None}).consume(raw)
    self.assertEqual(' text more text and ', cleaned)
def test_regex_replacement(self):
    """Check the regex rules from the CSV fixture against sample inputs.

    Covers dates, an IBAN, postcodes, German landline/mobile phone
    numbers and several spellings of "password". Each sample is consumed
    by the same pipeline instance, in the order listed, and compared with
    its expected placeholder text.
    """
    # Context manager guarantees the fixture file is closed even if
    # reading it raises.
    with open("./regex_replacement_testdata.csv", "r") as handler:
        rules = handler.read()

    pipeline = Pipeline({"regex_replacement": rules})

    # (input text, expected output) pairs.
    cases = [
        # date
        ("test 1.1.2019 20.2.2003 1.1.20 01.01.20 1.1.1900 1.1. 01.01. test",
         'test replaceddate replaceddate replaceddate replaceddate '
         'replaceddate replaceddate replaceddate test'),
        # iban (note the trailing space in the expected output)
        ("test DE12500101170648489890", 'test replacediban '),
        # postcode
        ("test 92637 92709 test",
         'test replacedpostcode replacedpostcode test'),
        # german phone
        ("test 0961123456 test", 'test replacedgermanphonenumber test'),
        ("test (0961)123456 test", 'test replacedgermanphonenumber test'),
        ("test +49(0)121-79536-77 test",
         'test replacedgermanphonenumber test'),
        # german mobile ("Handy") number
        ("test 015125391111 test", 'test replacedgermanphonenumber test'),
        # some password variations
        ("test pw test", 'test password test'),
        ("test pwort test", 'test password test'),
        ("test pass word test", 'test password test'),
    ]

    for text, expected in cases:
        value = pipeline.consume(text)
        # The message identifies which sample failed, since all cases
        # share one assertion line.
        self.assertEqual(expected, value, "input: {!r}".format(text))
def test_multiple_steps(self):
    """Combine text_only and to_lower in a single pipeline.

    The digits are expected to be dropped and "CamelCase" lower-cased,
    leaving ' camelcase' (with the leading space).
    """
    steps = {'text_only': None, 'to_lower': None}
    result = Pipeline(steps).consume('123 CamelCase')
    self.assertEqual(' camelcase', result)
def test_text_to_lower(self):
    """The to_lower step must lower-case every character of the input."""
    mixed_case = 'This is a Test text with CamelCase'
    lowered = Pipeline({'to_lower': None}).consume(mixed_case)
    self.assertEqual('this is a test text with camelcase', lowered)
def test_spellcheck_should_not_throw_exception_for_short_values(self):
    """Spellcheck must cope with a one-character token.

    With the dictionary "kopie" / "artikel", the single letter "k" is
    expected to stay as it is while "koipe" is corrected to "kopie".
    """
    dictionary = "kopie\r\nartikel\r\n"
    corrected = Pipeline({"spellcheck": dictionary}).consume("k koipe artikel")
    self.assertEqual('k kopie artikel', corrected)
def test_remove_punctuation(self):
    """The remove_punctuation step drops punctuation but keeps digits.

    "123" and "text123" are expected to survive; the dash, quotes, comma
    and full stop are not present in the expected output.
    """
    raw = '123 text - "more" text123 , and .'
    cleaned = Pipeline({'remove_punctuation': None}).consume(raw)
    self.assertEqual('123 text more text123 and ', cleaned)
def test_spellcheck(self):
    """Spellcheck configured with None must return the text unchanged.

    NOTE(review): this body exercises the "no dictionary data" case,
    while test_spellcheck_do_not_crash_for_no_data supplies a dictionary;
    the two test names look swapped -- confirm and rename if so.
    """
    misspelled = "kopie koipe artikel artikle artilek artleki"
    result = Pipeline({"spellcheck": None}).consume(misspelled)
    self.assertEqual('kopie koipe artikel artikle artilek artleki', result)
def test_spellcheck_do_not_crash_for_no_data(self):
    """Spellcheck with a dictionary corrects near-miss spellings.

    With "kopie" / "artikel" as dictionary, "koipe", "artikle" and
    "artilek" are expected to be corrected, while "artleki" is expected
    to be left as it is.

    NOTE(review): despite its name this test does supply dictionary data;
    the None case is covered by test_spellcheck -- the two names look
    swapped, confirm and rename if so.
    """
    dictionary = "kopie\r\nartikel\r\n"
    misspelled = "kopie koipe artikel artikle artilek artleki"
    corrected = Pipeline({"spellcheck": dictionary}).consume(misspelled)
    self.assertEqual('kopie kopie artikel artikel artikel artleki',
                     corrected)
def test_token_replacement_also_replace_dots_at_end_of_phrase(self):
    """A token ending in a dot is replaced together with its dot.

    With the CSV fixture loaded, "abg." is expected to become
    "abgeschlossen" with no dot left behind.
    """
    # Context manager guarantees the fixture file is closed even if
    # reading it raises.
    with open("./token_replacement_testdata.csv", "r") as handler:
        replacements = handler.read()

    pipeline = Pipeline({"token_replacement": replacements})
    value = pipeline.consume("abg. 1212")
    self.assertEqual('abgeschlossen 1212', value)
def test_remove_numbers(self):
    """The remove_numbers step strips digits and nothing else.

    Both the standalone "123" and the digits inside "text123" are
    expected to vanish; punctuation and spacing stay exactly as given.
    """
    raw = '123 text - "more" text123 , and .'
    without_digits = Pipeline({'remove_numbers': None}).consume(raw)
    self.assertEqual(' text - "more" text , and .', without_digits)
def test_token_replacement_do_not_replace_parts_of_word(self):
    """Replacement applies to whole tokens, not to substrings of a word.

    With the CSV fixture loaded, the standalone "abg." is expected to
    become "abgeschlossen", while "abgabgeschlossen" (which merely
    contains "abg") must be left unchanged.
    """
    # Context manager guarantees the fixture file is closed even if
    # reading it raises.
    with open("./token_replacement_testdata.csv", "r") as handler:
        replacements = handler.read()

    pipeline = Pipeline({"token_replacement": replacements})
    value = pipeline.consume("test abg. abgabgeschlossen 1212")
    self.assertEqual('test abgeschlossen abgabgeschlossen 1212', value)