Example #1
0
 def test_check_X_negative002(self):
     """ If list of texts is specified as the NumPy array, then it must be a 1-D array. """
     texts = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
     true_err_msg = re.escape('`X` must be a 1-D array!')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError here: a bare `except:` would also swallow
     # KeyboardInterrupt/SystemExit and hide unrelated errors.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         Seq2SeqLSTM.check_X(texts, 'X')
Example #2
0
 def test_check_X_negative001(self):
     """ All texts must be a string and have a `split` method. """
     texts = ['123', 4, '567']
     true_err_msg = re.escape('Sample 1 of `X` is wrong! This sample have not the `split` method.')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Narrow the fallback to AttributeError instead of a bare `except`,
     # which would mask unrelated failures.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         Seq2SeqLSTM.check_X(texts, 'X')
Example #3
0
 def test_creation(self):
     """ Every constructor argument must be stored in the same-named attribute.

     Fix: the original test passed `epsilon=0.2` to the constructor but never
     asserted the `epsilon` attribute; the check is added below.
     """
     seq2seq = Seq2SeqLSTM(batch_size=256,
                           epochs=200,
                           latent_dim=500,
                           validation_split=0.1,
                           grad_clipping=50.0,
                           lr=0.01,
                           rho=0.8,
                           epsilon=0.2,
                           lowercase=False,
                           verbose=True)
     self.assertIsInstance(seq2seq, Seq2SeqLSTM)
     self.assertTrue(hasattr(seq2seq, 'batch_size'))
     self.assertEqual(seq2seq.batch_size, 256)
     self.assertTrue(hasattr(seq2seq, 'epochs'))
     self.assertEqual(seq2seq.epochs, 200)
     self.assertTrue(hasattr(seq2seq, 'latent_dim'))
     self.assertEqual(seq2seq.latent_dim, 500)
     self.assertTrue(hasattr(seq2seq, 'validation_split'))
     self.assertAlmostEqual(seq2seq.validation_split, 0.1)
     self.assertTrue(hasattr(seq2seq, 'grad_clipping'))
     self.assertAlmostEqual(seq2seq.grad_clipping, 50.0)
     self.assertTrue(hasattr(seq2seq, 'lr'))
     self.assertAlmostEqual(seq2seq.lr, 0.01)
     self.assertTrue(hasattr(seq2seq, 'rho'))
     self.assertAlmostEqual(seq2seq.rho, 0.8)
     # Previously missing: `epsilon` was set in the constructor call above
     # but its round-trip into the attribute was never verified.
     self.assertTrue(hasattr(seq2seq, 'epsilon'))
     self.assertAlmostEqual(seq2seq.epsilon, 0.2)
     self.assertTrue(hasattr(seq2seq, 'lowercase'))
     self.assertFalse(seq2seq.lowercase)
     self.assertTrue(hasattr(seq2seq, 'verbose'))
     self.assertTrue(seq2seq.verbose)
Example #4
0
 def test_serialize_untrained(self):
     """ Pickling and unpickling an untrained model must preserve all hyper-parameters. """
     seq2seq = Seq2SeqLSTM(batch_size=256, epochs=200, latent_dim=500, validation_split=0.1,
                           grad_clipping=50.0, lr=0.01, weight_decay=0.0001, lowercase=False, verbose=True,
                           random_state=42)
     with open(self.model_name, 'wb') as fp:
         pickle.dump(seq2seq, fp)
     with open(self.model_name, 'rb') as fp:
         another_seq2seq = pickle.load(fp)
     self.assertIsInstance(another_seq2seq, Seq2SeqLSTM)
     # Integer-valued hyper-parameters: compared exactly.
     for attr_name, expected in (('batch_size', 256), ('epochs', 200), ('latent_dim', 500)):
         self.assertTrue(hasattr(another_seq2seq, attr_name))
         self.assertEqual(getattr(another_seq2seq, attr_name), expected)
     # Float-valued hyper-parameters: compared approximately.
     for attr_name, expected in (('validation_split', 0.1), ('grad_clipping', 50.0),
                                 ('lr', 0.01), ('weight_decay', 0.0001)):
         self.assertTrue(hasattr(another_seq2seq, attr_name))
         self.assertAlmostEqual(getattr(another_seq2seq, attr_name), expected)
     self.assertTrue(hasattr(another_seq2seq, 'lowercase'))
     self.assertFalse(another_seq2seq.lowercase)
     self.assertTrue(hasattr(another_seq2seq, 'verbose'))
     self.assertTrue(another_seq2seq.verbose)
     self.assertTrue(hasattr(another_seq2seq, 'random_state'))
     self.assertEqual(another_seq2seq.random_state, 42)
Example #5
0
 def test_serialize_trained(self):
     """ A trained model must yield identical predictions after a pickle round-trip. """
     input_texts, target_texts = self.load_text_pairs(self.data_set_name)
     indices = list(range(len(input_texts)))
     random.shuffle(indices)
     # Hold out the last 20% of the shuffled indices for testing.
     n = int(round(0.2 * len(indices)))
     training_indices, testing_indices = indices[:-n], indices[-n:]
     input_texts_for_training = [input_texts[ind] for ind in training_indices]
     target_texts_for_training = [target_texts[ind] for ind in training_indices]
     input_texts_for_testing = [input_texts[ind] for ind in testing_indices]
     target_texts_for_testing = [target_texts[ind] for ind in testing_indices]
     seq2seq = Seq2SeqLSTM(validation_split=None, epochs=10, lr=1e-3)
     seq2seq.fit(input_texts_for_training, target_texts_for_training,
                 eval_set=(input_texts_for_testing, target_texts_for_testing))
     predicted_texts_1 = seq2seq.predict(input_texts_for_testing)
     with open(self.model_name, 'wb') as fp:
         pickle.dump(seq2seq, fp)
     # Drop the original object so only the deserialized copy can answer.
     del seq2seq
     with open(self.model_name, 'rb') as fp:
         another_seq2seq = pickle.load(fp)
     predicted_texts_2 = another_seq2seq.predict(input_texts_for_testing)
     self.assertEqual(predicted_texts_1, predicted_texts_2)
Example #6
0
 def test_fit_positive05(self):
     """ Prepared evaluation set is used in the early stopping criterion. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(
         self.data_set_name)
     seq2seq = Seq2SeqLSTM(validation_split=None, lr=1e-2)
     res = seq2seq.fit(input_texts_for_training[:-20],
                       target_texts_for_training[:-20],
                       eval_set=(input_texts_for_training[-20:],
                                 target_texts_for_training[-20:]))
     self.assertIsInstance(res, Seq2SeqLSTM)
     # Vocabulary mappings built during fitting must be dictionaries.
     for attr_name in ('input_token_index_', 'target_token_index_', 'reverse_target_char_index_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), dict)
     # Sequence-length bounds must be positive integers.
     for attr_name in ('max_encoder_seq_length_', 'max_decoder_seq_length_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), int)
         self.assertGreater(getattr(res, attr_name), 0)
     # Both halves of the seq2seq architecture must be Keras models.
     for attr_name in ('encoder_model_', 'decoder_model_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), Model)
Example #7
0
 def test_fit_negative04(self):
     """ Some parameter of the `Seq2SeqLSTM` object is wrong. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM(batch_size=0)
     true_err_msg = re.escape('`batch_size` must be a positive number! 0 is not positive.')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError instead of a bare `except`.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         seq2seq.fit(input_texts_for_training, target_texts_for_training)
Example #8
0
 def test_fit_negative03(self):
     """ Number of input texts does not equal to number of target texts. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM()
     true_err_msg = re.escape(f'`X` does not correspond to `y`! {len(input_texts_for_training)} != {len(target_texts_for_training) - 1}.')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError instead of a bare `except`.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         seq2seq.fit(input_texts_for_training, target_texts_for_training[:-1])
Example #9
0
 def test_fit_negative02(self):
     """ Object with target texts is not one of the basic sequence types. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM()
     true_err_msg = re.escape(f'`{type({1, 2})}` is wrong type for `y`.')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError instead of a bare `except`.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         seq2seq.fit(input_texts_for_training, set(target_texts_for_training))
Example #10
0
 def test_predict_negative002(self):
     """ Input texts for prediction are wrong. """
     input_texts_for_testing, target_texts_for_testing = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM(validation_split=None, epochs=20)
     seq2seq.fit(input_texts_for_testing, target_texts_for_testing)
     true_err_msg = re.escape(f'`{type({1, 2})}` is wrong type for `X`.')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError instead of a bare `except`.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         _ = seq2seq.predict(set(input_texts_for_testing))
Example #11
0
 def test_fit_negative09(self):
     """ Number of input texts does not equal to number of target texts in the special evaluation set. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM()
     true_err_msg = re.escape('`X_eval_set` does not correspond to `y_eval_set`! 20 != 19.')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError instead of a bare `except`.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                     eval_set=(input_texts_for_training[-20:], target_texts_for_training[-19:]))
Example #12
0
 def test_fit_negative07(self):
     """ Object with input texts in the special evaluation set is not one of the basic sequence types. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM()
     true_err_msg = re.escape(f'`{type({1, 2})}` is wrong type for `X_eval_set`.')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError instead of a bare `except`.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                     eval_set=(set(input_texts_for_training[-20:]), target_texts_for_training[-20:]))
Example #13
0
 def test_fit_negative06(self):
     """ Special evaluation set is not a two-element tuple. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM(validation_split=None)
     true_err_msg = re.escape('`eval_set` must be a two-element sequence! 3 != 2')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError instead of a bare `except`.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                     eval_set=(input_texts_for_training[-20:], target_texts_for_training[-20:], [3, 4]))
Example #14
0
 def test_fit_negative05(self):
     """ Special evaluation set is neither list nor tuple. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM(validation_split=None)
     true_err_msg = re.escape(f'`eval_set` must be `{type((1, 2))}` or `{type([1, 2])}`, not `{type({1: "a", 2: "b"})}`!')
     # `assertRaisesRegexp` was renamed to `assertRaisesRegex` in Python 3.2.
     # Catch only AttributeError instead of a bare `except`.
     try:
         checking_method = self.assertRaisesRegex
     except AttributeError:
         checking_method = self.assertRaisesRegexp
     with checking_method(ValueError, true_err_msg):
         seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                     eval_set={'X': input_texts_for_training[-20:], 'y': target_texts_for_training[-20:]})
Example #15
0
 def test_predict_positive001(self):
     """ Quality of the fitted-and-predicted texts must exceed the 0.0001 threshold.

     Fix: the original docstring promised "greater than 0.1" while the
     assertion below checks 0.0001; the docstring now matches the code.
     """
     input_texts, target_texts = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM(validation_split=None, epochs=200, lr=1e-2, verbose=True, lowercase=False)
     predicted_texts = seq2seq.fit_predict(input_texts, target_texts)
     self.assertIsInstance(predicted_texts, list)
     self.assertEqual(len(predicted_texts), len(input_texts))
     indices = list(range(len(predicted_texts)))
     random.shuffle(indices)
     print('')
     print('Some predicted texts:')
     # Show a few random true/predicted pairs for manual inspection.
     for ind in range(min(5, len(predicted_texts))):
         print('    True: ' + self.detokenize_text(target_texts[indices[ind]]) +
               '\t Predicted: ' + self.detokenize_text(predicted_texts[indices[ind]]))
     self.assertGreater(self.estimate(predicted_texts, target_texts), 0.0001)
Example #16
0
 def test_fit_positive02(self):
     """ Input and target texts for training are the 1-D numpy arrays. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM(lr=1e-2)
     res = seq2seq.fit(np.array(input_texts_for_training), np.array(target_texts_for_training))
     self.assertIsInstance(res, Seq2SeqLSTM)
     # Vocabulary mappings built during fitting must be dictionaries.
     for attr_name in ('input_token_index_', 'target_token_index_', 'reverse_target_char_index_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), dict)
     # Sequence-length bounds must be positive integers.
     for attr_name in ('max_encoder_seq_length_', 'max_decoder_seq_length_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), int)
         self.assertGreater(getattr(res, attr_name), 0)
     # Both halves of the seq2seq architecture must be Keras models.
     for attr_name in ('encoder_model_', 'decoder_model_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), Model)
Example #17
0
 def test_fit_positive04(self):
     """ Early stopping is not used in the training process. """
     input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
     seq2seq = Seq2SeqLSTM(validation_split=None, lr=1e-2)
     res = seq2seq.fit(input_texts_for_training, target_texts_for_training)
     self.assertIsInstance(res, Seq2SeqLSTM)
     # Vocabulary mappings built during fitting must be dictionaries.
     for attr_name in ('input_token_index_', 'target_token_index_', 'reverse_target_char_index_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), dict)
     # Sequence-length bounds must be positive integers.
     for attr_name in ('max_encoder_seq_length_', 'max_decoder_seq_length_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), int)
         self.assertGreater(getattr(res, attr_name), 0)
     # Both halves of the seq2seq architecture must be Keras models.
     for attr_name in ('encoder_model_', 'decoder_model_'):
         self.assertTrue(hasattr(res, attr_name))
         self.assertIsInstance(getattr(res, attr_name), Model)
Example #18
0
 def test_creation(self):
     """ Constructor arguments must be stored as same-named attributes; `random_state` defaults to None. """
     seq2seq = Seq2SeqLSTM(batch_size=256, epochs=200, latent_dim=500, validation_split=0.1,
                           grad_clipping=50.0, lr=0.01, weight_decay=0.0001, lowercase=False, verbose=True)
     self.assertIsInstance(seq2seq, Seq2SeqLSTM)
     # Integer-valued hyper-parameters: compared exactly.
     for attr_name, expected in (('batch_size', 256), ('epochs', 200), ('latent_dim', 500)):
         self.assertTrue(hasattr(seq2seq, attr_name))
         self.assertEqual(getattr(seq2seq, attr_name), expected)
     # Float-valued hyper-parameters: compared approximately.
     for attr_name, expected in (('validation_split', 0.1), ('grad_clipping', 50.0),
                                 ('lr', 0.01), ('weight_decay', 0.0001)):
         self.assertTrue(hasattr(seq2seq, attr_name))
         self.assertAlmostEqual(getattr(seq2seq, attr_name), expected)
     self.assertTrue(hasattr(seq2seq, 'lowercase'))
     self.assertFalse(seq2seq.lowercase)
     self.assertTrue(hasattr(seq2seq, 'verbose'))
     self.assertTrue(seq2seq.verbose)
     # `random_state` was not passed, so it must keep its default of None.
     self.assertTrue(hasattr(seq2seq, 'random_state'))
     self.assertIsNone(seq2seq.random_state)
Example #19
0
 def test_tokenize_text_positive02(self):
     """ Tokenization with bringing the resulting tokens to lowercase. """
     source_text = 'a\t B  c Мама мыла \n\r раму 1\n'
     expected_tokens = ['a', 'b', 'c', 'мама', 'мыла', 'раму', '1']
     actual_tokens = Seq2SeqLSTM.tokenize_text(source_text, lowercase=True)
     self.assertEqual(actual_tokens, expected_tokens)
Example #20
0
 def test_tokenize_text_positive01(self):
     """ Tokenization with saving of the characters register. """
     source_text = 'a\t B  c Мама мыла \n\r раму 1\n'
     expected_tokens = ['a', 'B', 'c', 'Мама', 'мыла', 'раму', '1']
     actual_tokens = Seq2SeqLSTM.tokenize_text(source_text, lowercase=False)
     self.assertEqual(actual_tokens, expected_tokens)
def main():
    """ Train (or load) an English-Russian seq2seq model, then evaluate it on the test set.

    Usage: an optional first command-line argument gives the path of a pickle
    file; if the file already exists the model is loaded from it, otherwise a
    new model is trained and saved there.
    """
    # Resolve the optional model-file path from the command line.
    if len(sys.argv) > 1:
        model_name = os.path.normpath(sys.argv[1].strip())
        if len(model_name) == 0:
            model_name = None
        else:
            # The target directory must already exist before we try to save.
            model_dir_name = os.path.dirname(model_name)
            if len(model_dir_name) > 0:
                assert os.path.isdir(
                    model_dir_name), u'Directory "{0}" does not exist!'.format(
                        model_dir_name)
    else:
        model_name = None

    # Load and shuffle the training pairs shipped next to this script.
    input_texts_for_training, target_texts_for_training = shuffle_text_pairs(
        *load_text_pairs(
            os.path.join(os.path.dirname(__file__), 'data',
                         'eng_rus_for_training.txt')))
    print(u'')
    print(u'There are {0} text pairs in the training data.'.format(
        len(input_texts_for_training)))
    print(u'Some samples of these text pairs:')
    # Show the first ten training pairs for a quick sanity check.
    for ind in range(10):
        input_text = input_texts_for_training[ind]
        target_text = target_texts_for_training[ind]
        print(u'    ' + detokenize_text(input_text) + u'\t' +
              detokenize_text(target_text))
    print(u'')

    # Load the held-out test pairs (not shuffled; order is kept for reporting).
    input_texts_for_testing, target_texts_for_testing = load_text_pairs(
        os.path.join(os.path.dirname(__file__), 'data',
                     'eng_rus_for_testing.txt'))
    print(u'There are {0} text pairs in the testing data.'.format(
        len(input_texts_for_testing)))
    print(u'Some samples of these text pairs:')
    # `indices` is reused below so the same random samples are printed twice:
    # once as ground truth and once with predictions.
    indices = list(range(len(input_texts_for_testing)))
    random.shuffle(indices)
    for ind in indices[:10]:
        input_text = input_texts_for_testing[ind]
        target_text = target_texts_for_testing[ind]
        print(u'    ' + detokenize_text(input_text) + u'\t' +
              detokenize_text(target_text))
    print(u'')

    # Either restore a previously trained model or train a fresh one.
    if (model_name is not None) and os.path.isfile(model_name):
        # NOTE(review): unpickling an arbitrary user-supplied file is unsafe
        # for untrusted input; acceptable here only for local experiments.
        with open(model_name, 'rb') as fp:
            seq2seq = pickle.load(fp)
        assert isinstance(seq2seq, Seq2SeqLSTM), \
            u'A sequence-to-sequence neural model cannot be loaded from file "{0}".'.format(model_name)
        print(u'')
        print(u'Model has been successfully loaded from file "{0}".'.format(
            model_name))
    else:
        seq2seq = Seq2SeqLSTM(latent_dim=256,
                              validation_split=0.1,
                              epochs=1,
                              lr=1e-3,
                              verbose=True,
                              lowercase=False,
                              batch_size=64)
        seq2seq.fit(input_texts_for_training, target_texts_for_training)
        print(u'')
        print(u'Training has been successfully finished.')
        if model_name is not None:
            # Protocol 2 keeps the pickle readable by Python 2 as well.
            with open(model_name, 'wb') as fp:
                pickle.dump(seq2seq, fp, protocol=2)
            print(u'Model has been successfully saved into file "{0}".'.format(
                model_name))

    # Time the whole prediction pass to report mean per-sentence latency.
    start_time = time.time()
    predicted_texts = seq2seq.predict(input_texts_for_testing)
    end_time = time.time()
    sentence_correct, word_correct, character_correct = estimate(
        predicted_texts, target_texts_for_testing)
    print(u'')
    print(u'{0} texts have been predicted.'.format(len(predicted_texts)))
    print(u'Some samples of predicted text pairs:')
    # Same random indices as above, so readers can compare against the truth.
    for ind in indices[:10]:
        input_text = input_texts_for_testing[ind]
        target_text = predicted_texts[ind]
        print(u'    ' + detokenize_text(input_text) + u'\t' +
              detokenize_text(target_text))
    print(u'')
    print(u'Total sentence correct is {0:.2%}.'.format(sentence_correct))
    print(u'Total word correct is {0:.2%}.'.format(word_correct))
    print(u'Total character correct is {0:.2%}.'.format(character_correct))
    print(u'')
    print(u'Mean time of sentence prediction is {0:.3} sec.'.format(
        (end_time - start_time) / len(predicted_texts)))
Example #22
0
 def test_predict_negative001(self):
     """ Usage of the seq2seq model for prediction without training. """
     input_texts_for_testing, _ = self.load_text_pairs(self.data_set_name)
     untrained_model = Seq2SeqLSTM(validation_split=None, epochs=20)
     # Predicting before `fit` must raise NotFittedError.
     with self.assertRaises(NotFittedError):
         untrained_model.predict(input_texts_for_testing)