def get_defects(usertext_filename, corpuses=None):
    """Return the bigrams of the user text that do not exist in the corpuses.

    Args:
        usertext_filename: Passed to ``generate_words`` to obtain the words
            of the user text; also shown in the progress message.
        corpuses: Passed unchanged to ``Corpus``. Defaults to ``None``.

    Returns:
        A list of the bigrams, in the order ``get_bigrams`` produced them,
        for which ``corpus.bigram_exists`` returned a falsy value.
    """
    usertext_words = generate_words(usertext_filename)

    # Materialize the bigrams exactly once and reuse them for both the count
    # and the lookup below. Calling get_bigrams() a second time just to count
    # would re-read ``usertext_words``; if that is a one-shot iterator
    # (presumably a generator -- confirm against generate_words), the counting
    # pass would drain it and leave no bigrams to check.
    bigrams = list(get_bigrams(usertext_words))
    print('Found {0} bigrams in {1}'.format(len(bigrams), usertext_filename))

    corpus = Corpus(corpuses)
    return [bigram for bigram in bigrams if not corpus.bigram_exists(bigram)]
def test_generate_words_from_one_corpus(self):
    """generate_words on a single corpus yields the expected byte tokens."""
    # The same four-token sentence opens and closes the expected stream.
    i_like_ai = [b'I', b'like', b'AI', b'.']
    middle = [b'It', b"'", b's', b'the', b'ever', b'.']
    expected = i_like_ai + middle + i_like_ai

    # Wrap in list() so the result is compared as a concrete list.
    actual = list(generate_words(self.corpus_one))

    self.assertEqual(expected, actual)
def test_generate_words_from_multiple_corpus(self):
    """generate_words on a list of corpora yields their tokens back to back."""
    # Leading tokens: identical to the expectation of the single-corpus test.
    leading_tokens = [
        b'I', b'like', b'AI', b'.',
        b'It', b"'", b's', b'the', b'ever', b'.',
        b'I', b'like', b'AI', b'.',
    ]
    # Trailing tokens: expected to follow once both corpora are passed in.
    trailing_tokens = [
        b'Duckhunt', b'is', b'the', b'best', b'assignment', b'.',
        b'I', b'really', b'enjoy', b'shooting', b'ducks', b'.',
        b'Especially', b'with', b'hmm', b'.',
    ]
    expected = leading_tokens + trailing_tokens

    # Both corpora are handed over in a single list argument.
    actual = list(generate_words([self.corpus_one, self.corpus_two]))

    self.assertEqual(expected, actual)