def test_save_pickle_symspellcpppy(benchmark):
    """Benchmark ``SymSpellCpp.save_pickle`` on a freshly loaded dictionary.

    The dictionary at ``dict_path`` is loaded, ``save_pickle`` is run through
    the ``benchmark`` callable with ``temp_cpppy/temp.bin`` as its target, and
    the instance is then expected to report a maximum length of 28.

    Args:
        benchmark: Callable invoked as ``benchmark(func, *args)``
            (presumably the pytest-benchmark fixture -- confirm against
            the project's test configuration).
    """
    sym_spell = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    sym_spell.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" "
    )
    # Make sure the output directory exists before the pickle is written.
    os.makedirs("temp_cpppy", exist_ok=True)
    # The value returned by the benchmarked call is never inspected, so it is
    # intentionally not bound to a local name.
    benchmark(sym_spell.save_pickle, "temp_cpppy/temp.bin")
    assert sym_spell.max_length() == 28
def test_word_segmentation_apostrophe(self):
    """Segment a run-together phrase that contains an apostrophe.

    Uses a maximum dictionary edit distance of 0 and compares the
    ``corrected_string`` of the segmentation result with the expected text.
    """
    max_distance, prefix_len = 0, 7
    segmenter = SymSpell(max_distance, prefix_len)
    segmenter.load_dictionary(self.dictionary_path, 0, 1)

    joined_text = "There'resomewords"
    expected = "There' re some words"

    segmentation = segmenter.word_segmentation(joined_text)
    self.assertEqual(expected, segmentation.corrected_string)
def test_load_dictionary_encoding(self):
    """Look up a non-ASCII term after loading ``non_en_dict.txt``.

    Expects exactly one suggestion for "АБ" at ``Verbosity.TOP`` with a
    maximum edit distance of 2, and that suggestion's term to be "АБИ".
    """
    non_en_path = os.path.join(self.fortests_path, "non_en_dict.txt")
    max_distance, prefix_len = 2, 7
    speller = SymSpell(max_distance, prefix_len)
    speller.load_dictionary(non_en_path, 0, 1)

    suggestions = speller.lookup("АБ", Verbosity.TOP, 2)

    self.assertEqual(1, len(suggestions))
    top_suggestion = suggestions[0]
    self.assertEqual("АБИ", top_suggestion.term)
def test_lookup_compound_term_symspellcpppy(benchmark):
    """Benchmark ``SymSpellCpp.lookup_compound`` on a short misspelled phrase.

    Args:
        benchmark: Callable invoked as ``benchmark(func, *args, **kwargs)``
            whose return value is used as the lookup result (presumably the
            pytest-benchmark fixture -- confirm).
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" "
    )

    phrase = "whereis th elove"
    suggestions = benchmark(speller.lookup_compound, phrase, max_edit_distance=2)

    best = suggestions[0]
    assert best.term == "whereas the love"
def test_lookup_term_symspellcpppy(benchmark):
    """Benchmark ``SymSpellCpp.lookup`` for a single mixed-case misspelling.

    Args:
        benchmark: Callable invoked as ``benchmark(func, *args, **kwargs)``
            whose return value is used as the lookup result (presumably the
            pytest-benchmark fixture -- confirm).
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" "
    )

    word = "mEmEbers"
    suggestions = benchmark(
        speller.lookup, word, VerbosityCpp.CLOSEST, max_edit_distance=2
    )

    best = suggestions[0]
    assert best.term == "members"
def test_lookup_compund_transfer_casing_symspellcpppy(benchmark):
    """Benchmark ``lookup_compound`` with ``transfer_casing=True``.

    The expected string keeps the upper-case letters of the input
    ("Whereas", "heaD", "Dated", "AND") while the misspellings are corrected.

    Args:
        benchmark: Callable invoked as ``benchmark(func, *args, **kwargs)``
            whose return value is used as the lookup result (presumably the
            pytest-benchmark fixture -- confirm).
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(dict_path, 0, 1)

    noisy_sentence = (
        "Whereis th elove hehaD Dated forImuch of thepast who "
        "couqdn'tread in sixthgrade AND ins pired him"
    )
    expected_sentence = (
        "Whereas the love heaD Dated for much of the past "
        "who couldn't read in sixth grade AND inspired him"
    )

    suggestions = benchmark(
        speller.lookup_compound, noisy_sentence, 2, transfer_casing=True
    )
    assert suggestions[0].term == expected_sentence
def test_word_segmentation_symspellcpppy(benchmark):
    """Benchmark ``SymSpellCpp.word_segmentation`` on a space-less pangram.

    Args:
        benchmark: Callable invoked as ``benchmark(func, *args, **kwargs)``
            whose return value is used as the segmentation result
            (presumably the pytest-benchmark fixture -- confirm).
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" "
    )

    glued_text = "thequickbrownfoxjumpsoverthelazydog"
    segmentation = benchmark(
        speller.word_segmentation,
        glued_text,
        max_edit_distance=0,
        max_segmentation_word_length=5,
    )

    expected = "the quick brown fox jumps over the lazy dog"
    assert segmentation.segmented_string == expected
def test_lookup_compound_transfer_casing_no_bigram(self):
    """Check ``lookup_compound`` with ``transfer_casing=True``.

    Only ``load_dictionary`` is called on the instance before the lookup.
    The expected string keeps the upper-case letters of the input while the
    misspellings are corrected.
    """
    max_distance = 2
    prefix_len = 7
    speller = SymSpell(max_distance, prefix_len)
    speller.load_dictionary(self.dictionary_path, 0, 1)

    noisy_sentence = (
        "Whereis th elove hehaD Dated forImuch of thepast who "
        "couqdn'tread in sixthgrade AND ins pired him"
    )
    expected_sentence = (
        "Whereas the love heaD Dated for much of the past "
        "who couldn't read in sixth grade AND inspired him"
    )

    suggestions = speller.lookup_compound(
        noisy_sentence, max_distance, transfer_casing=True
    )
    self.assertEqual(expected_sentence, suggestions[0].term)
def test_pickle_compressed(self):
    """Round-trip a loaded dictionary through ``save_pickle``/``load_pickle``.

    A first instance loads the dictionary and saves itself to
    ``dictionary.pickle`` under ``self.fortests_path``; a second instance
    loads that file. Both must report the same ``max_length()`` and return
    the same top suggestion term for "flam".

    The pickle file is removed in a ``finally`` clause so that a failing
    assertion (or a failing load) does not leave it behind in the fixtures
    directory.
    """
    pickle_path = os.path.join(self.fortests_path, "dictionary.pickle")
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)
    try:
        sym_spell.save_pickle(pickle_path)

        sym_spell_2 = SymSpell(edit_distance_max, prefix_length)
        sym_spell_2.load_pickle(pickle_path)
        self.assertEqual(sym_spell.max_length(), sym_spell_2.max_length())
        self.assertEqual(
            sym_spell.lookup("flam", Verbosity.TOP, 0, True)[0].term,
            sym_spell_2.lookup("flam", Verbosity.TOP, 0, True)[0].term)
    finally:
        # Guard the removal: if save_pickle itself failed, the file may not
        # exist, and a FileNotFoundError here would mask the real error.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
def test_load_dictionary_invalid_path(self):
    """Expect ``load_dictionary`` to return ``False`` for the path used here."""
    max_distance, prefix_len = 2, 7
    speller = SymSpell(max_distance, prefix_len)

    loaded = speller.load_dictionary("invalid/dictionary/path.txt", 0, 1)

    # assertEqual (not assertFalse) keeps the strict comparison with False.
    self.assertEqual(False, loaded)
def test_load_dictionary_bad_dictionary(self):
    """Load ``bad_dict.txt`` and check the resulting word count.

    ``load_dictionary`` is expected to return ``True`` and the instance to
    report 7 words afterwards.
    """
    bad_dict_path = os.path.join(self.fortests_path, "bad_dict.txt")
    max_distance, prefix_len = 2, 7
    speller = SymSpell(max_distance, prefix_len)

    loaded = speller.load_dictionary(bad_dict_path, 0, 1)

    # assertEqual (not assertTrue) keeps the strict comparison with True.
    self.assertEqual(True, loaded)
    self.assertEqual(7, speller.word_count())
def test_word_segmentation_capitalize(self):
    """Segment run-together sentences that start with a capital letter.

    Three inputs are segmented with a maximum dictionary edit distance of 0.
    The first is checked against ``corrected_string``; the other two are
    checked against ``segmented_string``.
    """
    max_distance, prefix_len = 0, 7
    segmenter = SymSpell(max_distance, prefix_len)
    segmenter.load_dictionary(self.dictionary_path, 0, 1)

    # Case 1: compared against the corrected string.
    glued = "Thequickbrownfoxjumpsoverthelazydog"
    expected = "The quick brown fox jumps over the lazy dog"
    outcome = segmenter.word_segmentation(glued)
    self.assertEqual(expected, outcome.corrected_string)

    # Case 2: compared against the segmented string.
    glued = "Itwasabrightcolddayinaprilandtheclockswerestrikingthirteen"
    expected = (
        "It was a bright cold day in april and the clocks "
        "were striking thirteen"
    )
    outcome = segmenter.word_segmentation(glued)
    self.assertEqual(expected, outcome.segmented_string)

    # Case 3: compared against the segmented string.
    glued = (
        "Itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
        "itwastheageoffoolishness"
    )
    expected = (
        "It was the best of times it was the worst of times "
        "it was the age of wisdom it was the age of foolishness"
    )
    outcome = segmenter.word_segmentation(glued)
    self.assertEqual(expected, outcome.segmented_string)
def test_lookup_compound_no_bigram(self):
    """Check ``lookup_compound`` results when only ``load_dictionary`` is used.

    Each case supplies a misspelled phrase together with the expected
    corrected phrase, edit distance and count. For every case exactly one
    suggestion must be returned and its ``term``, ``distance`` and ``count``
    must match the expected values.
    """
    max_distance = 2
    prefix_len = 7
    speller = SymSpell(max_distance, prefix_len)
    speller.load_dictionary(self.dictionary_path, 0, 1)

    # (input phrase, expected term, expected distance, expected count)
    cases = (
        ("whereis th elove", "whereas the love", 2, 64),
        ("the bigjest playrs", "the biggest players", 2, 34),
        ("can yu readthis", "can you read this", 2, 3),
        (
            "whereis th elove hehad dated forImuch of thepast who "
            "couqdn'tread in sixthgrade and ins pired him",
            "whereas the love head dated for much of the past who "
            "couldn't read in sixth grade and inspired him",
            9,
            0,
        ),
        (
            "in te dhird qarter oflast jear he hadlearned ofca sekretplan",
            "in the third quarter of last year he had learned of "
            "a secret plan",
            9,
            0,
        ),
        (
            "the bigjest playrs in te strogsommer film slatew ith plety "
            "of funn",
            "the biggest players in the strong summer film slate "
            "with plenty of fun",
            9,
            0,
        ),
        (
            "can yu readthis messa ge despite thehorible sppelingmsitakes",
            "can you read this message despite the horrible "
            "spelling mistakes",
            9,
            0,
        ),
    )

    # Cases run in the listed order; the first failing assertion stops the
    # test, exactly as a straight-line sequence of checks would.
    for phrase, expected_term, expected_distance, expected_count in cases:
        suggestions = speller.lookup_compound(phrase, max_distance)
        self.assertEqual(1, len(suggestions))
        best = suggestions[0]
        self.assertEqual(expected_term, best.term)
        self.assertEqual(expected_distance, best.distance)
        self.assertEqual(expected_count, best.count)