def test_01_word_trie(self):
    """Compare word tries built from three short texts with stored fixtures.

    For each text, the trie returned by ``lab.make_word_trie`` is converted
    with ``dictify`` and must equal the object loaded by ``read_expected``.
    """
    # small test
    cases = (
        ('toonces was a cat who could drive a car very fast until he crashed.',
         '6.pickle'),
        ('a man at the market murmered that he had met a mermaid. '
         'mark didnt believe the man had met a mermaid.',
         '7.pickle'),
        ('what happened to the cat who had eaten the ball of yarn? she had mittens!',
         '8.pickle'),
    )
    for text, fixture_name in cases:
        trie = lab.make_word_trie(text)
        expected = read_expected(fixture_name)
        self.assertEqual(expected, dictify(trie))
def test_02_big_autocomplete_1(self):
    """Autocomplete 'ap' repeatedly on a trie padded with many 'aa????' words.

    The corpus holds every six-letter word of the form ``aa`` + four
    lowercase letters, plus 'apple' (x3), 'apricot' (x2), 'application' (x1)
    and one unrelated long word. The same four queries are issued ten times.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    words = [
        "aa" + c1 + c2 + c3 + c4
        for c1 in letters
        for c2 in letters
        for c3 in letters
        for c4 in letters
    ]
    words += ["apple", "application", "apple", "apricot", "apricot", "apple"]
    words.append("bruteforceisbad")
    trie = lab.make_word_trie(' '.join(words))

    for _ in range(10):
        top_one = lab.autocomplete(trie, 'ap', 1)
        top_two = lab.autocomplete(trie, 'ap', 2)
        top_three = lab.autocomplete(trie, 'ap', 3)
        unlimited = lab.autocomplete(trie, 'ap')

        # Result sizes first, then contents.
        self.assertEqual(1, len(top_one))
        self.assertEqual(2, len(top_two))
        self.assertEqual(3, len(top_three))
        self.assertEqual(3, len(unlimited))

        self.assertEqual(["apple"], top_one)
        self.assertEqual({"apple", "apricot"}, set(top_two))
        self.assertEqual({"apple", "apricot", "application"}, set(top_three))
        self.assertEqual(set(unlimited), set(top_three))
def test_autocomplete_big_2():
    """Check autocomplete on the Frankenstein corpus against pickled results.

    Several prefixes are queried with several ``max_count`` values (including
    ``None``); both the number and the set of returned words must match the
    stored expectation. Finally, a tuple prefix must raise ``TypeError``.
    """
    limits_by_prefix = {
        't': [0, 1, 25, None],
        'th': [0, 1, 21, None],
        'the': [0, 5, 21, None],
        'thes': [0, 1, 21, None],
    }
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        text = handle.read()
    trie = lab.make_word_trie(text)

    for prefix in sorted(limits_by_prefix):
        for limit in limits_by_prefix[prefix]:
            got = lab.autocomplete(trie, prefix, limit)
            want = read_expected('frank_autocomplete_%s_%s.pickle' % (prefix, limit))
            # Shared tail of both failure messages.
            where = ' for ' + repr(prefix) + ' with maxcount = ' + str(limit)
            assert len(want) == len(got), (
                ('missing' if len(got) < len(want) else 'too many')
                + ' autocomplete results' + where)
            assert set(want) == set(got), (
                'autocomplete included ' + repr(set(got) - set(want))
                + ' instead of ' + repr(set(want) - set(got)) + where)

    with pytest.raises(TypeError):
        lab.autocomplete(trie, ('tuple', ), None)
def test_02_big_autocomplete(self):
    """Check autocomplete on the Frankenstein corpus against pickled results.

    Each prefix is queried with each listed ``max_count``; the result must
    match the stored expectation in both length and membership. A tuple
    prefix must raise ``TypeError``.
    """
    limits_by_prefix = {
        't': [0, 1, 25, None],
        'th': [0, 1, 21, None],
        'the': [0, 5, 21, None],
        'thes': [0, 1, 21, None],
    }
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        text = handle.read()
    trie = lab.make_word_trie(text)

    for prefix in sorted(limits_by_prefix):
        for limit in limits_by_prefix[prefix]:
            got = lab.autocomplete(trie, prefix, limit)
            want = read_expected('frank_autocomplete_%s_%s.pickle' % (prefix, limit))
            note = 'wrong autocomplete of ' + repr(prefix) + ' with maxcount = ' + str(limit)
            self.assertEqual(len(got), len(want), msg=note)
            self.assertEqual(set(got), set(want), msg=note)

    with self.assertRaises(TypeError):
        lab.autocomplete(trie, ('tuple', ), None)
def test_01_autocomplete(self):
    """Exercise autocomplete on small tries, including ties and a missing prefix.

    Every result must be a list of strings. Where two words tie for the last
    slot, either outcome is accepted. A tuple prefix must raise ``TypeError``.
    """
    failure_note = "incorrect result from autocomplete."

    def check_list_of_str(words):
        # Shape checks shared by every case below.
        self.assertIsInstance(words, list, "result not a list.")
        for word in words:
            self.assertIsInstance(word, str, "expecting list of strings.")

    # Autocomplete on simple trie with less than N valid words
    trie = lab.make_word_trie("cat car carpet")
    found = lab.autocomplete(trie, 'car', 3)
    check_list_of_str(found)
    found.sort()
    self.assertEqual(found, ["car", "carpet"], msg=failure_note)

    trie = lab.make_word_trie("a an ant anteater a an ant a")
    found = lab.autocomplete(trie, 'a', 2)
    check_list_of_str(found)
    found.sort()
    acceptable = [["a", "an"], ["a", "ant"]]
    self.assertIn(found, acceptable, msg=failure_note)

    trie = lab.make_word_trie(
        "man mat mattress map me met a man a a a map man met")
    found = lab.autocomplete(trie, 'm', 3)
    check_list_of_str(found)
    found.sort()
    self.assertEqual(found, ["man", "map", "met"], msg=failure_note)

    trie = lab.make_word_trie("hello hell history")
    found = lab.autocomplete(trie, 'help', 3)
    check_list_of_str(found)
    self.assertEqual(found, [], msg=failure_note)

    with self.assertRaises(TypeError):
        lab.autocomplete(trie, ('tuple', ), None)
def load_corpus_file(path):
    """Read a corpus file, build its tries, and register them by corpus name.

    The corpus name is derived from the file's base name by dropping the last
    dot-separated piece and joining the remaining pieces (so interior dots
    are removed, as before). If that leaves an empty string -- the file has
    no extension, or is a dotfile such as ``.corpus`` -- the full base name
    is used instead so the registry key is never empty.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        The corpus name under which the ``(word_trie, phrase_trie)`` pair was
        stored in ``corpusTries``.
    """
    base_name = os.path.basename(path)
    # Previously a name without any '.' produced '' here, so every such file
    # would have overwritten the same '' entry in corpusTries.
    corpus_name = ''.join(base_name.split('.')[:-1]) or base_name

    with open(path, encoding="utf-8") as f:
        text = f.read()

    word_trie = lab.make_word_trie(text)
    phrase_trie = lab.make_phrase_trie(text)
    # corpusTries is defined outside this block; presumably a module-level dict.
    corpusTries[corpus_name] = (word_trie, phrase_trie)
    return corpus_name
def test_01_autocorrect(self):
    """Autocorrect 'cat' with a limit of 4 on a small corpus.

    The result must be a list of strings which, once sorted, equals
    ``["act", "car", "cats", "cattle"]``.
    """
    # Autocorrect on cat in small corpus
    corpus = "cats cattle hat car act at chat crate act car act"
    trie = lab.make_word_trie(corpus)
    suggestions = lab.autocorrect(trie, 'cat', 4)

    self.assertIsInstance(suggestions, list, "result not a list.")
    for word in suggestions:
        self.assertIsInstance(word, str, "expecting list of strings.")

    suggestions.sort()
    self.assertEqual(["act", "car", "cats", "cattle"], suggestions,
                     msg="incorrect result from autocorrect.")
def test_03_big_filter_2(self):
    """Run several wildcard patterns over the Frankenstein word trie.

    The result for the pattern at index ``k`` is compared, by length and as a
    set, with the fixture ``frank_filter_<k>.pickle``.
    """
    patterns = ('*ing', '*ing?', '****ing', '**ing**', '????', 'mon*',
                '*?*?*?*', '*???')
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        text = handle.read()
    trie = lab.make_word_trie(text)

    for index, pattern in enumerate(patterns):
        got = lab.word_filter(trie, pattern)
        want = read_expected('frank_filter_%s.pickle' % (index, ))
        note = 'incorrect word_filter of ' + repr(pattern)
        self.assertEqual(len(want), len(got), msg=note)
        self.assertEqual(set(want), set(got), msg=note)
def test_autocomplete_small():
    """Exercise autocomplete on small tries, comparing results as sets.

    Covers fewer matches than requested, a tie for the last slot, a prefix
    with no matches (must return ``[]``), and a tuple prefix (``TypeError``).
    """
    # Autocomplete on simple tries with less than N valid words
    small = lab.make_word_trie("cat car carpet")
    assert set(lab.autocomplete(small, 'car', 3)) == {"car", "carpet"}

    tied = lab.make_word_trie("a an ant anteater a an ant a")
    picked = lab.autocomplete(tied, 'a', 2)
    assert set(picked) in [{"a", "an"}, {"a", "ant"}]

    ranked = lab.make_word_trie(
        "man mat mattress map me met a man a a a map man met")
    assert set(lab.autocomplete(ranked, 'm', 3)) == {"man", "map", "met"}

    trie = lab.make_word_trie("hello hell history")
    nothing = lab.autocomplete(trie, 'help', 3)
    assert nothing == []

    with pytest.raises(TypeError):
        lab.autocomplete(trie, ('tuple', ), None)
def test_03_big_autocomplete_2(self):
    """Autocomplete every prefix of 'accompany' on the Frankenstein corpus.

    Prefixes range from the empty string to the full word. Each unlimited
    result is compared, by length and as a set, with its pickled fixture.
    """
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        text = handle.read()
    trie = lab.make_word_trie(text)

    target = 'accompany'
    prefixes = [target[:end] for end in range(len(target) + 1)]
    for prefix in prefixes:
        got = lab.autocomplete(trie, prefix)
        want = read_expected('frank_autocomplete_%s_%s.pickle' % (prefix, None))
        note = 'wrong autocomplete of ' + repr(prefix)
        self.assertEqual(len(got), len(want), msg=note)
        self.assertEqual(set(got), set(want), msg=note)
def test_big_corpora(bigtext):
    """Build word and phrase tries for one corpus and compare with fixtures.

    Args:
        bigtext: Base name of the corpus; ``<bigtext>.txt`` is read from the
            ``testing_data`` directory and compared with
            ``<bigtext>_words.pickle`` and ``<bigtext>_phrases.pickle``.
    """
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', '%s.txt' % bigtext)
    with open(corpus_path, encoding='utf-8') as handle:
        text = handle.read()

    word_trie = lab.make_word_trie(text)
    phrase_trie = lab.make_phrase_trie(text)
    expected_words = read_expected('%s_words.pickle' % bigtext)
    expected_phrases = read_expected('%s_phrases.pickle' % bigtext)

    assert expected_words == dictify(word_trie), (
        'word trie does not match for %s' % bigtext)
    assert expected_phrases == dictify(phrase_trie), (
        'phrase trie does not match for %s' % bigtext)
def test_filter_big_2():
    """Run several wildcard patterns over the Frankenstein word trie.

    The result for the pattern at index ``k`` is compared, by length and as a
    set, with the fixture ``frank_filter_<k>.pickle``.
    """
    patterns = ('*ing', '*ing?', '****ing', '**ing**', '????', 'mon*',
                '*?*?*?*', '*???')
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        text = handle.read()
    trie = lab.make_word_trie(text)

    for index, pattern in enumerate(patterns):
        got = lab.word_filter(trie, pattern)
        want = read_expected('frank_filter_%s.pickle' % (index, ))
        assert len(want) == len(got), 'incorrect word_filter of %r' % pattern
        assert set(want) == set(got), 'incorrect word_filter of %r' % pattern
def test_03_big_corpora(self):
    """Build word and phrase tries for three corpora and compare with fixtures.

    For each of 'holmes', 'earnest' and 'frankenstein', the ``dictify``-ed
    tries must equal the pickled ``<name>_words`` / ``<name>_phrases`` data.
    """
    corpus_names = ('holmes', 'earnest', 'frankenstein')
    for name in corpus_names:
        corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', '%s.txt' % name)
        with open(corpus_path, encoding='utf-8') as handle:
            text = handle.read()

        word_trie = lab.make_word_trie(text)
        phrase_trie = lab.make_phrase_trie(text)
        expected_words = read_expected('%s_words.pickle' % name)
        expected_phrases = read_expected('%s_phrases.pickle' % name)

        self.assertEqual(expected_words, dictify(word_trie),
                         'word trie does not match for ' + name)
        self.assertEqual(expected_phrases, dictify(phrase_trie),
                         'phrase trie does not match for ' + name)
def test_02_big_filter_1(self):
    """Filter 'ap*' repeatedly on a trie padded with many 'aa????' words.

    The corpus holds every ``aa`` + four-lowercase-letter word plus a few
    'ap...' words with known counts. The same query is issued twenty times
    and must return exactly the three ``(word, count)`` pairs each time.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    words = [
        "aa" + c1 + c2 + c3 + c4
        for c1 in letters
        for c2 in letters
        for c3 in letters
        for c4 in letters
    ]
    words += ["apple", "application", "apple", "apricot", "apricot", "apple"]
    words.append("bruteforceisbad")
    trie = lab.make_word_trie(' '.join(words))

    want = [('apple', 3), ('apricot', 2), ('application', 1)]
    note = 'incorrect word_filter of ap*'
    for _ in range(20):
        got = lab.word_filter(trie, "ap*")
        self.assertEqual(len(want), len(got), msg=note)
        self.assertEqual(set(want), set(got), msg=note)
def test_02_big_autocorrect(self):
    """Check autocorrect on the Frankenstein corpus against pickled results.

    Each word is queried with each listed ``max_count`` (including ``None``);
    the result must match the stored expectation in length and membership.
    """
    limits_by_word = {
        'thin': [0, 8, 10, None],
        'tom': [0, 2, 4, None],
        'mon': [0, 2, 15, 17, 20, None],
    }
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        text = handle.read()
    trie = lab.make_word_trie(text)

    for word in sorted(limits_by_word):
        for limit in limits_by_word[word]:
            got = lab.autocorrect(trie, word, limit)
            want = read_expected('frank_autocorrect_%s_%s.pickle' % (word, limit))
            note = 'wrong autocorrect of ' + repr(word) + ' with maxcount = ' + str(limit)
            self.assertEqual(len(got), len(want), msg=note)
            self.assertEqual(set(got), set(want), msg=note)
def test_04_big_autocomplete_3(self):
    """Autocomplete every prefix of 'accompany' on the Frankenstein corpus.

    Each unlimited result is compared with its pickled fixture; failure
    messages say whether results are missing or extra and which words
    differ. A tuple prefix must raise ``TypeError``.
    """
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        text = handle.read()
    trie = lab.make_word_trie(text)

    target = 'accompany'
    for end in range(len(target) + 1):
        prefix = target[:end]
        got = lab.autocomplete(trie, prefix)
        want = read_expected('frank_autocomplete_%s_%s.pickle' % (prefix, None))
        # Shared tail of both failure messages.
        where = ' for ' + repr(prefix) + ' with maxcount = None'

        direction = 'missing' if len(got) < len(want) else 'too many'
        self.assertEqual(len(want), len(got),
                         msg=direction + ' autocomplete results' + where)
        self.assertEqual(set(want), set(got),
                         msg='autocomplete included ' + repr(set(got) - set(want))
                         + ' instead of ' + repr(set(want) - set(got)) + where)

    with self.assertRaises(TypeError):
        lab.autocomplete(trie, ('tuple', ), None)
def test_tiny2(self):
    """Autocomplete on a small corpus with known word counts.

    Counts in the corpus: dead 4; done, drown, car 3; down, doing, cat 2;
    do, at, and 1. Results are compared without regard to order, and a tie
    for the last slot accepts either candidate, as the other small
    autocomplete test in this file does.
    """
    # Adjacent literals instead of a backslash continuation inside the
    # string, which would leak stray characters/indentation into the corpus.
    trie = lab.make_word_trie(
        'do down down drown drown drown doing doing '
        'done done done dead dead dead dead at and cat cat car car car')

    # 'done' (3) is the clear winner; 'down' and 'doing' tie at 2, so either
    # may take the second slot.
    result = sorted(lab.autocomplete(trie, 'do', 2))
    self.assertIn(result, [['done', 'down'], ['doing', 'done']])

    # dead (4), done (3) and drown (3) all outrank the remaining 'd' words,
    # but the relative order of the two 3-count words is not fixed.
    result = sorted(lab.autocomplete(trie, 'd', 3))
    expect = ['dead', 'done', 'drown']
    self.assertEqual(result, expect)

    result = sorted(lab.autocomplete(trie, 'do', None))
    expect = sorted(['done', 'down', 'doing', 'do'])
    self.assertEqual(result, expect)
def test_01_filter(self):
    """Run a series of wildcard patterns against one small trie.

    Each result must be a list and, once sorted, equal the listed
    ``(word, count)`` pairs.
    """
    trie = lab.make_word_trie(
        "man mat mattress map me met a man a a a map man met")
    everything = [("a", 4), ("man", 3), ("map", 2), ("mat", 1),
                  ("mattress", 1), ("me", 1), ("met", 2)]
    cases = (
        # Filter to select all words in trie
        ('*', everything),
        # All three-letter words in trie
        ('???', [("man", 3), ("map", 2), ("mat", 1), ("met", 2)]),
        # Words beginning with 'mat'
        ('mat*', [("mat", 1), ("mattress", 1)]),
        # Words beginning with 'm', third letter is t
        ('m?t*', [("mat", 1), ("mattress", 1), ("met", 2)]),
        # Words with at least 4 letters
        ('*????', [("mattress", 1)]),
        # All words
        ('**', everything),
    )
    for pattern, want in cases:
        got = lab.word_filter(trie, pattern)
        self.assertIsInstance(got, list, "result not a list.")
        got.sort()
        self.assertEqual(got, want, msg="incorrect result from filter.")
def test_filter_big_1():
    """Filter 'ap*' repeatedly on a trie padded with many 'aa????' words.

    The corpus holds every ``aa`` + four-lowercase-letter word plus a few
    'ap...' words with known counts. The same query is issued 1000 times and
    must return exactly the three ``(word, count)`` pairs each time.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    words = [
        "aa" + c1 + c2 + c3 + c4
        for c1 in letters
        for c2 in letters
        for c3 in letters
        for c4 in letters
    ]
    words += ["apple", "application", "apple", "apricot", "apricot", "apple"]
    words.append("bruteforceisbad")
    trie = lab.make_word_trie(' '.join(words))

    want = {('apple', 3), ('apricot', 2), ('application', 1)}
    for _ in range(1000):
        got = lab.word_filter(trie, "ap*")
        assert len(want) == len(got), 'incorrect word_filter of ap*'
        assert set(want) == set(got), 'incorrect word_filter of ap*'
def test_tiny1(self):
    """Autocomplete on the corpus "bat bat bark bar" (bat 2, bark 1, bar 1).

    Results are compared without regard to order, and a tie for the last
    slot accepts either candidate, as the other small autocomplete test in
    this file does.
    """
    trie = lab.make_word_trie("bat bat bark bar")

    # Single most frequent word: unambiguous.
    result = lab.autocomplete(trie, 'ba', 1)
    expect = ['bat']
    self.assertEqual(result, expect)

    # 'bar' and 'bark' both occur once, so either may accompany 'bat'.
    result = sorted(lab.autocomplete(trie, 'ba', 2))
    self.assertIn(result, [['bar', 'bat'], ['bark', 'bat']])

    # No word starts with 'c'.
    result = lab.autocomplete(trie, 'c', 2)
    expect = []
    self.assertEqual(result, expect)

    # Unlimited: every word, in no particular order.
    result = sorted(lab.autocomplete(trie, 'b', None))
    expect = sorted(['bat', 'bar', 'bark'])
    self.assertEqual(result, expect)
def test_02_big_autocorrect(self):
    """Check autocorrect on the Frankenstein corpus against pickled results.

    Each word is queried with each listed ``max_count`` (including ``None``).
    Failure messages say whether results are missing or extra and which
    words differ.
    """
    nums = {
        'thin': [0, 8, 10, None],
        'tom': [0, 2, 4, None],
        'mon': [0, 2, 15, 17, 20, None],
    }
    corpus_path = os.path.join(TEST_DIRECTORY, 'resources', 'testing_data',
                               'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as f:
        text = f.read()
    w = lab.make_word_trie(text)

    for i in sorted(nums):
        for n in nums[i]:
            result = lab.autocorrect(w, i, n)
            expected = read_expected('frank_autocorrect_%s_%s.pickle' % (i, n))
            # Fixed typo: this message used to read "macount".
            self.assertEqual(
                len(result), len(expected),
                msg=('missing' if len(result) < len(expected) else 'too many')
                + ' autocorrect results for ' + repr(i)
                + ' with maxcount = ' + str(n))
            self.assertEqual(
                set(result), set(expected),
                msg='autocorrect included ' + repr(set(result) - set(expected))
                + ' instead of ' + repr(set(expected) - set(result))
                + ' for ' + repr(i) + ' with maxcount = ' + str(n))
def test_filter_small():
    """Run a series of wildcard patterns against one small trie.

    Each result must be a list and, once sorted, equal the listed
    ``(word, count)`` pairs.
    """
    trie = lab.make_word_trie(
        "man mat mattress map me met a man a a a map man met")
    everything = [("a", 4), ("man", 3), ("map", 2), ("mat", 1),
                  ("mattress", 1), ("me", 1), ("met", 2)]
    cases = (
        # Filter to select all words in trie
        ('*', everything),
        # All three-letter words in trie
        ('???', [("man", 3), ("map", 2), ("mat", 1), ("met", 2)]),
        # Words beginning with 'mat'
        ('mat*', [("mat", 1), ("mattress", 1)]),
        # Words beginning with 'm', third letter is t
        ('m?t*', [("mat", 1), ("mattress", 1), ("met", 2)]),
        # Words with at least 4 letters
        ('*????', [("mattress", 1)]),
        # All words
        ('**', everything),
    )
    for pattern, want in cases:
        got = lab.word_filter(trie, pattern)
        assert isinstance(got, list)
        got.sort()
        assert got == want
def test_autocomplete_big_1():
    """Autocomplete 'ap' repeatedly on a trie padded with many 'aa????' words.

    The corpus holds every ``aa`` + four-lowercase-letter word, plus 'apple'
    (x3), 'apricot' (x2), 'application' (x1) and one unrelated long word.
    The same four queries are issued 1000 times and compared as sets.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    words = [
        "aa" + c1 + c2 + c3 + c4
        for c1 in letters
        for c2 in letters
        for c3 in letters
        for c4 in letters
    ]
    words += ["apple", "application", "apple", "apricot", "apricot", "apple"]
    words.append("bruteforceisbad")
    trie = lab.make_word_trie(' '.join(words))

    for _ in range(1000):
        top_one = lab.autocomplete(trie, 'ap', 1)
        top_two = lab.autocomplete(trie, 'ap', 2)
        top_three = lab.autocomplete(trie, 'ap', 3)
        unlimited = lab.autocomplete(trie, 'ap')

        assert set(top_one) == {'apple'}
        assert set(top_two) == {'apple', 'apricot'}
        assert set(unlimited) == set(top_three) == {
            'apple', 'apricot', 'application'
        }
def test_autocorrect_small():
    """Autocorrect 'cat' with a limit of 4 on a small corpus.

    The returned words, taken as a set, must be
    ``{"act", "car", "cats", "cattle"}``.
    """
    # Autocorrect on cat in small corpus
    corpus = "cats cattle hat car act at chat crate act car act"
    trie = lab.make_word_trie(corpus)
    suggestions = lab.autocorrect(trie, 'cat', 4)
    assert set(suggestions) == {"act", "car", "cats", "cattle"}