    def test_end(self):
        """
        Checks that there is no '<END>' token in the decoded result
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'cat', '<END>', 'his',
                  'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'cat',
                  '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)
        context = (
            storage.get_id('a'),
            storage.get_id('cat'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        to_decode = generator.generate_text(context, 1)
        actual = decode_text(storage, to_decode)
        expected = ('A cat', )
        self.assertEqual(expected, actual)
Code Example #2
    def test_decode_text_ideal_conditions(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        context = (
            storage.get_id('name'),
            storage.get_id('is'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        to_decode = generator.generate_text(context, 2)
        actual = decode_text(storage, to_decode)

        for sentence in actual:
            self.assertTrue('<END>' not in sentence)
            self.assertTrue(sentence[0].isupper())
            self.assertTrue(sentence[-1].isalpha())
    def test_load_model_ideal(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(2, encoded)

        generator = NGramTextGenerator(storage, trie)

        save_model(generator, 'my_awesome_model')
        loaded_model = load_model('my_awesome_model')

        self.assertEqual(generator._n_gram_trie.n_grams,
                         loaded_model._n_gram_trie.n_grams)
        self.assertEqual(len(generator._n_gram_trie.n_gram_frequencies),
                         len(loaded_model._n_gram_trie.n_gram_frequencies))
        for ngram, frequency in generator._n_gram_trie.n_gram_frequencies.items():
            self.assertTrue(
                ngram in loaded_model._n_gram_trie.n_gram_frequencies)
            self.assertEqual(
                frequency, loaded_model._n_gram_trie.n_gram_frequencies[ngram])

        self.assertEqual(len(generator._word_storage.storage),
                         len(loaded_model._word_storage.storage))
        for word, id_num in generator._word_storage.storage.items():
            self.assertTrue(word in loaded_model._word_storage.storage)
            self.assertEqual(id_num, loaded_model._word_storage.storage[word])
Code Example #4
    def test_generate_text_large_context(self):
        """
        should generate simple case with three sentences out of small corpus
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(5, encoded)

        generator = NGramTextGenerator(storage, trie)

        context = (
            storage.get_id('i'),
            storage.get_id('have'),
            storage.get_id('a'),
            storage.get_id('bruno'),
        )
        end = storage.get_id('<END>')
        actual = generator.generate_text(context, 3)
        self.assertEqual(actual.count(end), 3)
Code Example #5
    def test_text_generator_generate_sentence_proper_beginning(self):
        """
        Checks that class creates correct sentence from a context '<END>' without '<END>' in the beginning
        """
        corpus = ('my', 'favourite', 'subject', 'is', 'maths', '<END>', 'his',
                  'favourite', 'thing', 'is', 'music', '<END>', 'i', 'have',
                  'a', 'favourite', 'film', '<END>',
                  'my', 'family', 'likes', 'avatar', '<END>', 'my',
                  'favourite', 'subject', 'is', 'music', '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (storage.get_id('<END>'), )

        first_generated = storage.get_id('my')
        last_generated = storage.get_id('<END>')

        generator = NGramTextGenerator(storage, trie)
        actual = generator._generate_sentence(context)

        self.assertNotEqual(storage.get_id('<END>'), actual[0])

        self.assertEqual(first_generated, actual[0])
        self.assertEqual(last_generated, actual[-1])
Code Example #6
    def test_generate_next_word_context_incorrect(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        four = NGramTrie(4, encoded)
        trie = NGramTrie(3, encoded)
        two = NGramTrie(2, encoded)

        expected_word = storage.get_id('rex')
        context = (storage.get_id('name'),
                   storage.get_id('is'),
                   storage.get_id('cat'))

        generator = BackOffGenerator(storage, four, two, trie)

        actual = generator._generate_next_word(context)
        self.assertEqual(expected_word, actual)
    def test_text_generator_no_context(self):
        """
        checks if the program can generate sentences without given context
        """

        corpus = ('cat', 'has', 'paws', '<END>', 'dogs', 'have', 'noses',
                  '<END>', 'cat', 'has', 'whiskers', '<END>')
        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)
        two = NGramTrie(2, encoded)
        four = NGramTrie(4, encoded)

        context = (
            storage.get_id('cat'),
            storage.get_id('dogs'),
        )

        generator = BackOffGenerator(storage, trie, two, four)

        actual = generator.generate_text(context, 3)
        self.assertTrue(all(actual))
    def test_load_model_takes_less_time(self):
        with open('lab_1/data.txt', 'r', encoding='utf-8') as big_file:
            big_data = big_file.read()
        tokenized_data = tokenize_by_sentence(big_data)
        storage = WordStorage()
        storage.update(tokenized_data)
        context = (
            storage.get_id('despite'),
            storage.get_id('the'),
        )

        start_time_generate = time()
        encoded = encode_text(storage, tokenized_data)
        trie = NGramTrie(3, encoded)
        generator = NGramTextGenerator(storage, trie)
        generated_text = generator.generate_text(context, 3)
        end_time_generate = time() - start_time_generate
        save_model(generator, 'model_training')

        start_time_saved = time()
        loaded_model = load_model('model_training')
        new_result = loaded_model.generate_text(context, 3)
        end_time_saved = time() - start_time_saved

        self.assertGreater(end_time_generate, end_time_saved)
        self.assertEqual(generated_text, new_result)
Code Example #9
    def test_most_freq_word_end(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        five = NGramTrie(5, encoded)
        trie = NGramTrie(3, encoded)
        four = NGramTrie(4, encoded)

        expected_word = storage.get_id('<END>')
        context = (
            storage.get_id('his'),
            storage.get_id('name'),
            storage.get_id('is'),
            storage.get_id('bruno'),
        )

        generator = BackOffGenerator(storage, five, trie, four)

        actual = generator.most_freq_word(context)
        self.assertEqual(expected_word, actual)
Code Example #10
    def test_most_freq_word_incorrect_context(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)
        two = NGramTrie(2, encoded)
        four = NGramTrie(4, encoded)

        bad_inputs = [[], {}, (2000, 1000), None, 9, 9.34, True]

        generator = BackOffGenerator(storage, trie, two, four)

        for bad_context in bad_inputs:
            self.assertRaises(ValueError, generator.most_freq_word,
                              bad_context)
Code Example #11
    def test_decode_text_ideal(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        context = (
            storage.get_id('name'),
            storage.get_id('is'),
        )
        end = storage.get_id('<END>')

        generator = LikelihoodBasedTextGenerator(storage, trie)

        to_decode = generator.generate_text(context, 2)
        self.assertEqual(to_decode[-1], end)

        expected = ('Name is rex', 'Her name is rex')
        actual = decode_text(storage, to_decode)
        self.assertEqual(expected, actual)
Code Example #12
def main():
    text = ('I have a cat. His name is Bruno. '
            'I have a dog too. His name is Rex. '
            'Her name is Rex too.')

    corpus = tokenize_by_sentence(text)

    storage = WordStorage()
    storage.update(corpus)

    encoded = encode_text(storage, corpus)

    two = NGramTrie(2, encoded)
    trie = NGramTrie(3, encoded)

    context = (
        storage.get_id('name'),
        storage.get_id('is'),
    )

    generator = BackOffGenerator(storage, trie, two)

    expected = 'rex'
    actual = storage.get_word(generator._generate_next_word(context))

    print(f'TEXT:\n{text}')
    print(f"\nEXPECTED WORD AFTER 'name is': {expected}")
    print(f"ACTUAL WORD AFTER 'name is': {actual}")

    save_model(generator, 'model.txt')
    load_model('model.txt')

    return actual == expected
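
A minimal sketch of how this entry point is usually invoked when start.py is run directly; the assertion message below is an assumption, not part of the original script:

if __name__ == '__main__':
    # main() returns True only when the generated word matches the expected one
    assert main(), 'Demo has not worked correctly'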
Code Example #13
    def test_decode_text_incorrect_storage(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        context = (
            storage.get_id('name'),
            storage.get_id('is'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        to_decode = generator.generate_text(context, 2)

        bad_inputs = [(), [], 123, None, NGramTrie]

        for bad_storage in bad_inputs:
            self.assertRaises(ValueError, decode_text, bad_storage, to_decode)
Code Example #14
File: start.py  Project: katearb/2020-2-level-labs
def realize_n_gram_text_generator(text):
    n_gram_storage = WordStorage()
    n_gram_storage.update(text)
    n_gram_context = (n_gram_storage.get_id('my'), n_gram_storage.get_id('dear'))
    n_gram_encoded = encode_text(n_gram_storage, text)
    n_gram_trie = NGramTrie(3, n_gram_encoded)
    n_gram_generator = NGramTextGenerator(n_gram_storage, n_gram_trie)
    n_gram_text_generated = n_gram_generator.generate_text(n_gram_context, 3)
    return decode_text(n_gram_storage, n_gram_text_generated)
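
A hedged usage sketch for this helper: the corpus passed in must already be tokenized and must contain the words 'my' and 'dear', which the hard-coded context relies on; the sample text below is hypothetical:

sample_text = 'My dear friend has a cat. My dear friend has a dog.'
# tokenize_by_sentence produces the lowercase tokens with '<END>' markers the helper expects
print(realize_n_gram_text_generator(tokenize_by_sentence(sample_text)))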
Code Example #15
    def test_encode_text_empty_sentence(self):
        """
        Tests that encode_text function
            can handle empty sentence input
        """
        word_storage = WordStorage()
        corpus = ()
        expected = ()
        word_storage.update(corpus)
        actual = encode_text(word_storage, corpus)
        self.assertEqual(expected, actual)
Code Example #16
File: start.py  Project: katearb/2020-2-level-labs
def realize_backoff_generator(text):
    backoff_storage = WordStorage()
    backoff_storage.update(text)
    backoff_encoded = encode_text(backoff_storage, text)
    two = NGramTrie(2, backoff_encoded)
    trie = NGramTrie(3, backoff_encoded)
    backoff_context = (backoff_storage.get_id('if'),
                       backoff_storage.get_id('you'),)
    backoff_generator = BackOffGenerator(backoff_storage, trie, two)
    backoff_text_generated = backoff_generator.generate_text(backoff_context, 3)

    return decode_text(backoff_storage, backoff_text_generated)
Code Example #17
    def test_ngram_text_generator_duplicates_words(self):
        corpus = ('stop', 'it', 'stop', 'it', 'stop', 'it', '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (storage.get_id('stop'), )

        generator = NGramTextGenerator(storage, trie)
        actual = generator._generate_sentence(context)

        self.assertEqual(20 + len(context) + 1, len(actual))
Code Example #18
    def test_save_model_incorrect_path(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(2, encoded)

        generator = NGramTextGenerator(storage, trie)

        self.assertRaises(FileNotFoundError, save_model, generator, r'some_folder/some_file')
    def test_context_end(self):
        """
        checks if <END> is in the context
        """
        context = ('cat', '<END>')
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(3, encoded)
        generator = NGramTextGenerator(word_storage, trie)

        self.assertRaises(ValueError, generator._generate_sentence, context)
Code Example #20
    def test_encode_text_same_words_count(self):
        """
        Tests that encode_text function
            can assign correct id to the same words
        """
        word_storage = WordStorage()

        corpus = ('i', 'have', 'a', 'cat', '<END>', 'i', 'have', 'a', 'cat',
                  '<END>')

        word_storage.update(corpus)

        actual = encode_text(word_storage, corpus)

        self.assertEqual(actual[:5], actual[5:])
Code Example #21
    def test_ngram_text_generator_identical_words(self):
        corpus = ('deadline', 'deadline', 'deadline', 'deadline', 'deadline',
                  '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(3, encoded)
        context = (storage.get_id('deadline'), storage.get_id('deadline'))

        generator = NGramTextGenerator(storage, trie)
        actual = generator._generate_sentence(context)

        self.assertEqual(20 + len(context) + 1,
                         len(actual))  # +1 is for the final '<END>'
Code Example #22
    def test_encode_text_ideal(self):
        """
        Tests that encode_text function
            generates id for each word
        """
        word_storage = WordStorage()

        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')

        word_storage.update(corpus)

        actual = encode_text(word_storage, corpus)

        for token in actual:
            self.assertTrue(isinstance(token, int))
Code Example #23
    def test_ngram_text_generator_bad_num_input(self):  # new test
        """
        throws errors with bad inputs
        """
        bad_inputs = (-5, 0, -2, -1, -8)
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(2, encoded)
        generator = NGramTextGenerator(word_storage, trie)

        for bad_input in bad_inputs:
            self.assertRaises(ValueError, generator._generate_sentence,
                              bad_input)
Code Example #24
    def test_get_most_frequent_gram_bad_inputs(self):
        """
        Checks that the method returns an empty tuple for bad inputs
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)
        expected = ()
        generator = NGramTextGenerator(word_storage, ngram)
        bad_inputs = [[], {}, (), None, 9, 9.34, True]
        for bad_input in bad_inputs:
            self.assertEqual(expected,
                             generator.get_most_frequent_gram(bad_input))
    def test_text_generator_throws_errors(self):
        """
        throws errors with bad inputs
        """
        bad_inputs = [[], {}, None, 9, 9.34, True]
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(2, encoded)
        generator = NGramTextGenerator(word_storage, trie)

        for bad_input in bad_inputs:
            self.assertRaises(ValueError, generator.generate_text, bad_input,
                              10)
    def test_ngram_text_generator_generate_next_word(self):
        """
        Checks that next word generates properly
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'i', 'have', 'a', 'bruno',
                  'cat', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)

        generator = NGramTextGenerator(word_storage, ngram)
        context = (word_storage.get_id('i'), word_storage.get_id('have'))
        expected = word_storage.get_id('a')
        actual = generator._generate_next_word(context)
        self.assertEqual(expected, actual)
    def test_ngram_text_generator_generate_sentence_properly(self):
        """
        generates correct output according to simple case
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (word_storage.get_id('i'), )

        end = word_storage.get_id('<END>')

        generator = NGramTextGenerator(word_storage, trie)
        actual = generator._generate_sentence(context)
        self.assertEqual(actual[-1], end)
    def test_ngram_text_generator_generate_next_word_incorrect_context(self):
        """
        Checks that method throws error
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)

        generator = NGramTextGenerator(word_storage, ngram)
        # (3,) is a context of the wrong size for a trigram trie
        bad_inputs = [[], {}, (3,), None, 9, 9.34, True]
        for bad_input in bad_inputs:
            self.assertRaises(ValueError, generator._generate_next_word,
                              bad_input)
Code Example #29
    def test_get_most_frequent_gram_ideal(self):
        """
        Checks that most frequent ngram gets properly
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'i', 'have', 'a', 'bruno',
                  'cat', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)

        generator = NGramTextGenerator(word_storage, ngram)
        context = (word_storage.get_id('i'), word_storage.get_id('have'))
        expected = (word_storage.get_id('i'), word_storage.get_id('have'),
                    word_storage.get_id('a'))
        actual = generator.get_most_frequent_gram(context)
        self.assertEqual(expected, actual)
    def test_length_of_sentence(self):
        """
        generates sentences with length less than 20
        """
        corpus = ('i', 'have', 'a', 'cat', 'his', 'name', 'is', 'bruno', 'i',
                  'have', 'a', 'dog', 'too', 'his', 'name', 'is', 'rex', 'her',
                  'name', 'is', 'rex', 'too', 'he', 'funny', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (word_storage.get_id('cat'), )

        generator = NGramTextGenerator(word_storage, trie)
        actual = len(generator._generate_sentence(context))
        expected = len(context) + 21  # at most 20 generated words plus '<END>'
        self.assertLessEqual(actual, expected)