def test_04_big_phrase_autocomplete(self):
    """Compare phrase autocompletion on the Seuss corpus with stored results.

    A phrase trie is built from ``testing_data/seuss.txt``.  For every
    (prefix, max count) pair in the table below, the output of
    ``lab.autocomplete`` must have the same length and the same set of
    elements as the pickled expectation.  Finally, calling
    ``lab.autocomplete`` with the str ``'string'`` as its second argument
    must raise ``TypeError``.
    """
    # Prefix tuple -> max-count values to try (None is passed through as-is).
    max_counts_by_prefix = {
        ('i', ): [0, 1, 2, 5, 11, None],
        ('i', 'do'): [0, 1, 2, 5, 8, None],
        ('i', 'do', 'not', 'like', 'them'): [0, 1, 2, 4, 100, None],
        ('i', 'do', 'not', 'like', 'them', 'here'): [0, 1, 2, 100, None]
    }

    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'seuss.txt')
    with open(corpus_path, encoding='utf-8') as corpus_file:
        corpus_text = corpus_file.read()
    phrase_trie = lab.make_phrase_trie(corpus_text)

    for prefix in sorted(max_counts_by_prefix):
        for max_count in max_counts_by_prefix[prefix]:
            actual = lab.autocomplete(phrase_trie, prefix, max_count)
            # Expected-result files are keyed by prefix length and max count.
            wanted = read_expected(
                'seuss_autocomplete_%s_%s.pickle' % (len(prefix), max_count))
            failure_msg = ('wrong autocomplete of ' + repr(prefix)
                           + ' with maxcount = ' + str(max_count))
            # Length first (catches duplicates), then order-insensitive content.
            self.assertEqual(len(actual), len(wanted), msg=failure_msg)
            self.assertEqual(set(actual), set(wanted), msg=failure_msg)

    with self.assertRaises(TypeError):
        lab.autocomplete(phrase_trie, 'string', None)
def test_02_phrase_trie(self):
    """Build phrase tries from short texts and compare with stored results.

    Each case pairs an input text with the pickle file holding the expected
    value of ``dictify(lab.make_phrase_trie(text))``.
    """
    # NOTE: the input sentences (including their spelling) are test data and
    # must stay exactly as written.
    cases = (
        (
            'toonces was a cat who could drive a car very fast until he crashed.',
            '9.pickle',
        ),
        (
            'a man at the market murmered that he had met a mermaid. '
            'i dont believe that he had met a mermaid.',
            '10.pickle',
        ),
        (
            'What happened to the cat who ate the ball of yarn? She had mittens! '
            'What happened to the frog who was double parked? He got toad! '
            'What happened yesterday? I dont remember.',
            '11.pickle',
        ),
    )

    for text, pickle_name in cases:
        trie = lab.make_phrase_trie(text)
        wanted = read_expected(pickle_name)
        self.assertEqual(wanted, dictify(trie))
def test_autocomplete_big_phrase():
    """Compare phrase autocompletion on the Seuss corpus with stored results.

    A phrase trie is built from ``testing_data/seuss.txt``.  For every
    (prefix, max count) pair in the table below, the output of
    ``lab.autocomplete`` must match the pickled expectation in length and,
    ignoring order, in content.  Finally, calling ``lab.autocomplete`` with
    the str ``'string'`` as its second argument must raise ``TypeError``.
    """
    # Prefix tuple -> max-count values to try (None is passed through as-is).
    max_counts_by_prefix = {
        ('i', ): [0, 1, 2, 5, 11, None],
        ('i', 'do'): [0, 1, 2, 5, 8, None],
        ('i', 'do', 'not', 'like', 'them'): [0, 1, 2, 4, 100, None],
        ('i', 'do', 'not', 'like', 'them', 'here'): [0, 1, 2, 100, None]
    }

    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data', 'seuss.txt')
    with open(corpus_path, encoding='utf-8') as corpus_file:
        corpus_text = corpus_file.read()
    phrase_trie = lab.make_phrase_trie(corpus_text)

    for prefix in sorted(max_counts_by_prefix):
        for max_count in max_counts_by_prefix[prefix]:
            actual = lab.autocomplete(phrase_trie, prefix, max_count)
            # Expected-result files are keyed by prefix length and max count.
            wanted = read_expected(
                'seuss_autocomplete_%s_%s.pickle' % (len(prefix), max_count))
            # Shared tail of both failure messages.
            case_label = repr(prefix) + ' with maxcount = ' + str(max_count)

            assert len(wanted) == len(actual), (
                ('missing' if len(actual) < len(wanted) else 'too many')
                + ' autocomplete results for ' + case_label
            )

            wanted_set = set(wanted)
            actual_set = set(actual)
            assert wanted_set == actual_set, (
                'autocomplete included ' + repr(actual_set - wanted_set)
                + ' instead of ' + repr(wanted_set - actual_set)
                + ' for ' + case_label
            )

    with pytest.raises(TypeError):
        lab.autocomplete(phrase_trie, 'string', None)
def load_corpus_file(path):
    """Load a text corpus and register its word and phrase tries.

    The file at ``path`` is read as UTF-8, a word trie and a phrase trie are
    built from its contents, and the pair is stored in the module-level
    ``corpusTries`` mapping under the corpus name.

    The corpus name is the file's base name with its final extension removed
    (``'dir/holmes.txt'`` -> ``'holmes'``).  ``os.path.splitext`` is used so
    that a file without an extension keeps its name instead of collapsing to
    an empty string, and interior dots are preserved
    (``'a.b.txt'`` -> ``'a.b'``).

    Parameters:
        path: filesystem path of the corpus text file.

    Returns:
        The corpus name used as the key in ``corpusTries``.
    """
    corpus_name, _extension = os.path.splitext(os.path.basename(path))
    with open(path, encoding="utf-8") as f:
        text = f.read()
    wordTrie = lab.make_word_trie(text)
    sentenceTrie = lab.make_phrase_trie(text)
    corpusTries[corpus_name] = (wordTrie, sentenceTrie)
    return corpus_name
def test_03_big_corpora(self):
    """Build word and phrase tries for each large corpus and check them.

    For 'holmes', 'earnest' and 'frankenstein', the text is read from
    ``testing_data/<name>.txt`` and the ``dictify``-ed tries are compared
    with ``<name>_words.pickle`` and ``<name>_phrases.pickle``.
    """
    corpus_names = ('holmes', 'earnest', 'frankenstein')
    for bigtext in corpus_names:
        corpus_path = os.path.join(
            TEST_DIRECTORY, 'testing_data', '%s.txt' % bigtext)
        with open(corpus_path, encoding='utf-8') as corpus_file:
            corpus_text = corpus_file.read()

        # Build both tries before loading the expected values.
        word_trie = lab.make_word_trie(corpus_text)
        phrase_trie = lab.make_phrase_trie(corpus_text)
        wanted_words = read_expected('%s_words.pickle' % bigtext)
        wanted_phrases = read_expected('%s_phrases.pickle' % bigtext)

        self.assertEqual(
            wanted_words,
            dictify(word_trie),
            'word trie does not match for '+bigtext,
        )
        self.assertEqual(
            wanted_phrases,
            dictify(phrase_trie),
            'phrase trie does not match for '+bigtext,
        )
def test_big_corpora(bigtext):
    """Build word and phrase tries for one large corpus and check them.

    The text is read from ``testing_data/<bigtext>.txt``; the ``dictify``-ed
    tries are compared with ``<bigtext>_words.pickle`` and
    ``<bigtext>_phrases.pickle``.

    Parameters:
        bigtext: base name of the corpus (used to build the file names).
    """
    corpus_path = os.path.join(
        TEST_DIRECTORY, 'testing_data', '%s.txt' % bigtext)
    with open(corpus_path, encoding='utf-8') as corpus_file:
        corpus_text = corpus_file.read()

    # Build both tries before loading the expected values.
    word_trie = lab.make_word_trie(corpus_text)
    phrase_trie = lab.make_phrase_trie(corpus_text)
    wanted_words = read_expected('%s_words.pickle' % bigtext)
    wanted_phrases = read_expected('%s_phrases.pickle' % bigtext)

    assert wanted_words == dictify(word_trie), (
        'word trie does not match for %s' % bigtext
    )
    assert wanted_phrases == dictify(phrase_trie), (
        'phrase trie does not match for %s' % bigtext
    )