def test_load_bigram_dictionary_bad_dict(self):
    """Loading bigrams from ``bad_dict.txt`` must still return True."""
    bad_dict_file = os.path.join(self.fortests_path, "bad_dict.txt")
    # Positional arguments: maximum edit distance 2, prefix length 7.
    speller = SymSpell(2, 7)
    loaded = speller.load_bigram_dictionary(bad_dict_file, 0, 2)
    self.assertEqual(True, loaded)
def test_load_dictionary_invalid_path(self):
    """``load_dictionary`` must return False for the bogus path used here."""
    # Positional arguments: maximum edit distance 2, prefix length 7.
    speller = SymSpell(2, 7)
    loaded = speller.load_dictionary("invalid/dictionary/path.txt", 0, 1)
    self.assertEqual(False, loaded)
def test_save_pickle_symspellcpppy(benchmark):
    """Benchmark ``SymSpellCpp.save_pickle`` after loading the dictionary.

    The pickle is written to ``temp_cpppy/temp.bin``; the directory is
    created when missing and is not removed by this test.
    """
    sym_spell = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    sym_spell.load_dictionary(dict_path, term_index=0, count_index=1,
                              separator=" ")
    os.makedirs("temp_cpppy", exist_ok=True)
    # No assertion uses the value returned by the benchmarked call, so it is
    # not bound to a name.
    benchmark(sym_spell.save_pickle, "temp_cpppy/temp.bin")
    assert sym_spell.max_length() == 28
def test_load_dictionary_separator(self):
    """A dictionary loaded with "$" as separator yields a word count of 5."""
    separator_file = os.path.join(self.fortests_path, "separator_dict.txt")
    # Positional arguments: maximum edit distance 2, prefix length 7.
    speller = SymSpell(2, 7)
    loaded = speller.load_dictionary(separator_file, 0, 1, "$")
    self.assertEqual(True, loaded)
    self.assertEqual(5, speller.word_count())
def test_create_dictionary(self):
    """Building a dictionary from ``big_modified.txt`` gives max_length 68."""
    corpus_path = os.path.join(self.fortests_path, "big_modified.txt")
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.create_dictionary(corpus_path)
    self.assertEqual(68, sym_spell.max_length())
def test_lookup_transfer_casing_symspellcpppy(benchmark):
    """Benchmark ``lookup`` with ``transfer_casing=True`` on a single entry.

    The input "StreaM" must come back as "SteaM".
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.create_dictionary_entry("steam", 4)
    suggestions = benchmark(
        speller.lookup, "StreaM", VerbosityCpp.TOP, 2, transfer_casing=True
    )
    best = suggestions[0]
    assert best.term == "SteaM"
def test_word_segmentation_apostrophe(self):
    """Segment "There'resomewords" and check the corrected string."""
    # Positional arguments: maximum edit distance 0, prefix length 7.
    speller = SymSpell(0, 7)
    speller.load_dictionary(self.dictionary_path, 0, 1)

    segmented = speller.word_segmentation("There'resomewords")

    expected = "There' re some words"
    self.assertEqual(expected, segmented.corrected_string)
def test_words_with_shared_prefix_should_retain_counts(self):
    """Entries "pipe" and "pips" keep their own counts in lookup results.

    Each query must return exactly two suggestions, in the listed order,
    with the counts the entries were created with.
    """
    speller = SymSpell(1, 3)
    speller.create_dictionary_entry("pipe", 5)
    speller.create_dictionary_entry("pips", 10)

    pipe = ("pipe", 5)
    pips = ("pips", 10)
    # (query, expected suggestions in result order)
    cases = (
        ("pipe", (pipe, pips)),
        ("pips", (pips, pipe)),
        ("pip", (pips, pipe)),
    )
    for query, expected in cases:
        result = speller.lookup(query, Verbosity.ALL, 1)
        self.assertEqual(2, len(result))
        for position, (term, count) in enumerate(expected):
            self.assertEqual(term, result[position].term)
            self.assertEqual(count, result[position].count)
def test_lookup_compound_term_symspellcpppy(benchmark):
    """Benchmark ``lookup_compound`` on "whereis th elove".

    The first suggestion must be "whereas the love".
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" "
    )
    suggestions = benchmark(
        speller.lookup_compound, "whereis th elove", max_edit_distance=2
    )
    best = suggestions[0]
    assert best.term == "whereas the love"
def test_load_dictionary_encoding(self):
    """Terms from ``non_en_dict.txt`` are found by a non-ASCII lookup.

    Looking up "АБ" must return exactly one suggestion, "АБИ".
    """
    non_en_file = os.path.join(self.fortests_path, "non_en_dict.txt")
    # Positional arguments: maximum edit distance 2, prefix length 7.
    speller = SymSpell(2, 7)
    speller.load_dictionary(non_en_file, 0, 1)

    suggestions = speller.lookup("АБ", Verbosity.TOP, 2)

    self.assertEqual(1, len(suggestions))
    self.assertEqual("АБИ", suggestions[0].term)
def test_lookup_term_symspellcpppy(benchmark):
    """Benchmark ``lookup`` on "mEmEbers"; first suggestion is "members"."""
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" "
    )
    suggestions = benchmark(
        speller.lookup, "mEmEbers", VerbosityCpp.CLOSEST, max_edit_distance=2
    )
    best = suggestions[0]
    assert best.term == "members"
def test_lookup_compund_transfer_casing_symspellcpppy(benchmark):
    """Benchmark ``lookup_compound`` with ``transfer_casing=True``.

    The first suggestion must equal the expected sentence, including the
    upper-case letters taken over from the input.
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(dict_path, 0, 1)

    misspelled = (
        "Whereis th elove hehaD Dated forImuch of thepast who "
        "couqdn'tread in sixthgrade AND ins pired him"
    )
    expected = (
        "Whereas the love heaD Dated for much of the past "
        "who couldn't read in sixth grade AND inspired him"
    )

    suggestions = benchmark(
        speller.lookup_compound, misspelled, 2, transfer_casing=True
    )
    assert suggestions[0].term == expected
def test_word_segmentation_symspellcpppy(benchmark):
    """Benchmark ``word_segmentation`` on a sentence without spaces.

    The segmented string must be the sentence with single spaces restored.
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" "
    )
    segmentation = benchmark(
        speller.word_segmentation,
        "thequickbrownfoxjumpsoverthelazydog",
        max_edit_distance=0,
        max_segmentation_word_length=5,
    )
    expected = "the quick brown fox jumps over the lazy dog"
    assert segmentation.segmented_string == expected
def test_lookup_compound_transfer_casing_no_bigram(self):
    """``lookup_compound`` with ``transfer_casing=True``, no bigrams loaded.

    Only the unigram dictionary is loaded; the first suggestion must equal
    the expected sentence, including the input's upper-case letters.
    """
    max_distance = 2
    speller = SymSpell(max_distance, 7)  # second argument: prefix length
    speller.load_dictionary(self.dictionary_path, 0, 1)

    misspelled = (
        "Whereis th elove hehaD Dated forImuch of thepast who "
        "couqdn'tread in sixthgrade AND ins pired him"
    )
    expected = (
        "Whereas the love heaD Dated for much of the past "
        "who couldn't read in sixth grade AND inspired him"
    )

    suggestions = speller.lookup_compound(
        misspelled, max_distance, transfer_casing=True
    )
    self.assertEqual(expected, suggestions[0].term)
def test_add_additional_counts_should_not_add_word_again(self):
    """Adding the same word twice keeps ``word_count()`` at 1."""
    speller = SymSpell()
    # Same word, two different counts; the word count is checked after each.
    for extra_count in (11, 3):
        speller.create_dictionary_entry("hello", extra_count)
        self.assertEqual(1, speller.word_count())
def setUpClass(cls):
    """Store fixture paths on ``cls`` and load a shared SymSpell instance.

    NOTE(review): no ``@classmethod`` decorator is visible on this
    definition; confirm it is applied in the enclosing test class.
    """
    cls.fortests_path = "tests/fortests"
    cls.dictionary_path = "resources/frequency_dictionary_en_82_765.txt"
    cls.bigram_path = "resources/frequency_bigramdictionary_en_243_342.txt"

    # The shared instance is loaded from the same file as dictionary_path.
    shared = SymSpell()
    shared.load_dictionary(cls.dictionary_path, 0, 1, " ")
    cls.symSpell = shared
def test_deletes(self):
    """Lookup of "stream" returns "steamb" and ``entry_count()`` is truthy."""
    speller = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup("stream", Verbosity.TOP, 2)

    self.assertEqual(1, len(suggestions))
    top = suggestions[0]
    self.assertEqual("steamb", top.term)
    self.assertEqual(6, top.count)
    self.assertTrue(speller.entry_count())
def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
        self):
    """"flam" (count 1) must not be suggested for the query "flam".

    NOTE(review): the third constructor argument (10) is presumably the
    count threshold; confirm against the SymSpell signature.
    """
    speller = SymSpell(2, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup("flam", Verbosity.TOP, 0)

    self.assertEqual(0, len(suggestions))
def test_lookup_include_unknown(self):
    """With the fourth lookup argument True, "flam" itself is returned.

    NOTE(review): the fourth positional argument is presumably
    ``include_unknown``; confirm against the lookup signature.
    """
    speller = SymSpell(2, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup("flam", Verbosity.TOP, 0, True)

    self.assertEqual(1, len(suggestions))
    self.assertEqual("flam", suggestions[0].term)
def test_create_dictionary_entry_negative_count(self):
    """Counts 0 and -1 are rejected; 0 is accepted with count_threshold=0."""
    speller = SymSpell(1, 3)
    for rejected_count in (0, -1):
        added = speller.create_dictionary_entry("pipe", rejected_count)
        self.assertEqual(False, added)

    lenient_speller = SymSpell(1, 3, count_threshold=0)
    added = lenient_speller.create_dictionary_entry("pipe", 0)
    self.assertEqual(True, added)
def test_lookup_should_find_exact_match(self):
    """Lookup of "streama" returns the single suggestion "steama"."""
    speller = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup("streama", Verbosity.TOP, 2)

    self.assertEqual(1, len(suggestions))
    self.assertEqual("steama", suggestions[0].term)
def test_add_additional_counts_should_not_overflow(self):
    """Adding 11 to a count of ``sys.maxsize - 10`` yields ``sys.maxsize``."""
    speller = SymSpell()
    word = "hello"

    def current_count():
        # Count of the only suggestion, or 0 unless there is exactly one.
        matches = speller.lookup(word, Verbosity.ALL)
        if len(matches) == 1:
            return matches[0].count
        return 0

    speller.create_dictionary_entry(word, sys.maxsize - 10)
    self.assertEqual(sys.maxsize - 10, current_count())

    speller.create_dictionary_entry(word, 11)
    self.assertEqual(sys.maxsize, current_count())
def test_add_additional_counts_should_increase_count(self):
    """Re-adding a word sums its counts: 11, then 11 + 3."""
    speller = SymSpell()
    word = "hello"

    def current_count():
        # Count of the only suggestion, or 0 unless there is exactly one.
        matches = speller.lookup(word, Verbosity.ALL)
        if len(matches) == 1:
            return matches[0].count
        return 0

    speller.create_dictionary_entry(word, 11)
    self.assertEqual(11, current_count())

    speller.create_dictionary_entry(word, 3)
    self.assertEqual(11 + 3, current_count())
def test_lookup_should_return_most_frequent(self):
    """Of three candidates for "stream", the one with count 6 is returned."""
    speller = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup("stream", Verbosity.TOP, 2)

    self.assertEqual(1, len(suggestions))
    top = suggestions[0]
    self.assertEqual("steamb", top.term)
    self.assertEqual(6, top.count)
def test_lookup_should_not_return_non_word_delete(self):
    """"paw" and "awn" give no suggestions when only "pawn" is an entry."""
    speller = SymSpell(2, 7, 10)
    speller.create_dictionary_entry("pawn", 10)

    # Both queries are "pawn" with one letter removed.
    for query in ("paw", "awn"):
        suggestions = speller.lookup(query, Verbosity.TOP, 0)
        self.assertEqual(0, len(suggestions))
def test_lookup_compound_no_suggestion(self):
    """``lookup_compound`` returns the input unchanged as its only result."""
    max_distance = 2
    speller = SymSpell(max_distance, 7)  # second argument: prefix length
    for term in ("steam", "machine"):
        speller.create_dictionary_entry(term, 1)

    phrase = "qwer erty ytui a"
    suggestions = speller.lookup_compound(phrase, max_distance)

    self.assertEqual(1, len(suggestions))
    self.assertEqual(phrase, suggestions[0].term)
def test_lookup_compound_only_combi(self):
    """"ste am machie" is corrected to the single result "steam machine"."""
    max_distance = 2
    speller = SymSpell(max_distance, 7)  # second argument: prefix length
    for term in ("steam", "machine"):
        speller.create_dictionary_entry(term, 1)

    suggestions = speller.lookup_compound("ste am machie", max_distance)

    self.assertEqual(1, len(suggestions))
    self.assertEqual("steam machine", suggestions[0].term)
def test_save_load(self):
    """Round-trip the shared SymSpell instance through a pickle file.

    The closest suggestion for "tke" and the value of ``max_length()`` must
    be identical before saving and after loading into a fresh instance. The
    pickle is written to ``temp/temp.bin`` and removed again afterwards.
    """
    pickle_dir = "temp"
    pickle_path = "temp/temp.bin"

    before_save = self.symSpell.lookup("tke", Verbosity.CLOSEST)[0].term
    before_max_length = self.symSpell.max_length()

    os.makedirs(pickle_dir, exist_ok=True)
    try:
        self.symSpell.save_pickle(pickle_path)
        load_sym_spell = SymSpell()
        load_sym_spell.load_pickle(pickle_path)
        after_load = load_sym_spell.lookup("tke", Verbosity.CLOSEST)[0].term
        after_max_length = load_sym_spell.max_length()
    finally:
        # Clean up even when saving, loading or the lookup raises, so a
        # failing run does not leave the temporary file and directory behind.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
        os.rmdir(pickle_dir)

    # unittest assertions are not stripped under ``python -O`` and report
    # both values on failure, unlike bare ``assert`` statements.
    self.assertEqual(before_save, after_load)
    self.assertEqual(before_max_length, after_max_length)
def test_word_segmentation_capitalize(self):
    """Segmentation keeps the capital first letter of the input.

    The first case is checked on ``corrected_string``; the remaining two
    are checked on ``segmented_string``.
    """
    # Positional arguments: maximum edit distance 0, prefix length 7.
    speller = SymSpell(0, 7)
    speller.load_dictionary(self.dictionary_path, 0, 1)

    segmentation = speller.word_segmentation(
        "Thequickbrownfoxjumpsoverthelazydog"
    )
    self.assertEqual(
        "The quick brown fox jumps over the lazy dog",
        segmentation.corrected_string,
    )

    segmented_cases = (
        (
            "Itwasabrightcolddayinaprilandtheclockswerestrikingthirteen",
            "It was a bright cold day in april and the clocks "
            "were striking thirteen",
        ),
        (
            "Itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
            "itwastheageoffoolishness",
            "It was the best of times it was the worst of times "
            "it was the age of wisdom it was the age of foolishness",
        ),
    )
    for joined, expected in segmented_cases:
        segmentation = speller.word_segmentation(joined)
        self.assertEqual(expected, segmentation.segmented_string)
def test_empty_deletes(self):
    """An instance without entries gives no suggestions and entry count 0."""
    suggestions = SymSpell(2).lookup("ab", Verbosity.CLOSEST)
    self.assertEqual(suggestions, [])

    fresh_entry_count = SymSpell().entry_count()
    self.assertEqual(fresh_entry_count, 0)