def test_check_X_negative002(self):
    """ If the list of texts is specified as a NumPy array, then it must be a 1-D array. """
    texts = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
    true_err_msg = re.escape('`X` must be a 1-D array!')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        Seq2SeqLSTM.check_X(texts, 'X')

def test_check_X_negative001(self):
    """ Every text must be a string with a `split` method. """
    texts = ['123', 4, '567']
    # NB: the expected text below must match the library's actual message verbatim.
    true_err_msg = re.escape('Sample 1 of `X` is wrong! This sample have not the `split` method.')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        Seq2SeqLSTM.check_X(texts, 'X')

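# The try/except fallback above recurs in every negative test: Python 2's unittest
# spells the method `assertRaisesRegexp`, while Python 3 renamed it to
# `assertRaisesRegex`. A hypothetical helper (not part of the original suite)
# could factor the shim out of each test:
def get_checking_method(test_case):
    """Return the regex-based exception assertion available in this Python version."""
    try:
        return test_case.assertRaisesRegex
    except AttributeError:
        return test_case.assertRaisesRegexp
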
def test_creation(self):
    seq2seq = Seq2SeqLSTM(batch_size=256, epochs=200, latent_dim=500, validation_split=0.1,
                          grad_clipping=50.0, lr=0.01, rho=0.8, epsilon=0.2, lowercase=False,
                          verbose=True)
    self.assertIsInstance(seq2seq, Seq2SeqLSTM)
    self.assertTrue(hasattr(seq2seq, 'batch_size'))
    self.assertEqual(seq2seq.batch_size, 256)
    self.assertTrue(hasattr(seq2seq, 'epochs'))
    self.assertEqual(seq2seq.epochs, 200)
    self.assertTrue(hasattr(seq2seq, 'latent_dim'))
    self.assertEqual(seq2seq.latent_dim, 500)
    self.assertTrue(hasattr(seq2seq, 'validation_split'))
    self.assertAlmostEqual(seq2seq.validation_split, 0.1)
    self.assertTrue(hasattr(seq2seq, 'grad_clipping'))
    self.assertAlmostEqual(seq2seq.grad_clipping, 50.0)
    self.assertTrue(hasattr(seq2seq, 'lr'))
    self.assertAlmostEqual(seq2seq.lr, 0.01)
    self.assertTrue(hasattr(seq2seq, 'rho'))
    self.assertAlmostEqual(seq2seq.rho, 0.8)
    self.assertTrue(hasattr(seq2seq, 'epsilon'))
    self.assertAlmostEqual(seq2seq.epsilon, 0.2)
    self.assertTrue(hasattr(seq2seq, 'lowercase'))
    self.assertFalse(seq2seq.lowercase)
    self.assertTrue(hasattr(seq2seq, 'verbose'))
    self.assertTrue(seq2seq.verbose)

def test_serialize_untrained(self):
    seq2seq = Seq2SeqLSTM(batch_size=256, epochs=200, latent_dim=500, validation_split=0.1,
                          grad_clipping=50.0, lr=0.01, weight_decay=0.0001, lowercase=False,
                          verbose=True, random_state=42)
    with open(self.model_name, 'wb') as fp:
        pickle.dump(seq2seq, fp)
    with open(self.model_name, 'rb') as fp:
        another_seq2seq = pickle.load(fp)
    self.assertIsInstance(another_seq2seq, Seq2SeqLSTM)
    self.assertTrue(hasattr(another_seq2seq, 'batch_size'))
    self.assertEqual(another_seq2seq.batch_size, 256)
    self.assertTrue(hasattr(another_seq2seq, 'epochs'))
    self.assertEqual(another_seq2seq.epochs, 200)
    self.assertTrue(hasattr(another_seq2seq, 'latent_dim'))
    self.assertEqual(another_seq2seq.latent_dim, 500)
    self.assertTrue(hasattr(another_seq2seq, 'validation_split'))
    self.assertAlmostEqual(another_seq2seq.validation_split, 0.1)
    self.assertTrue(hasattr(another_seq2seq, 'grad_clipping'))
    self.assertAlmostEqual(another_seq2seq.grad_clipping, 50.0)
    self.assertTrue(hasattr(another_seq2seq, 'lr'))
    self.assertAlmostEqual(another_seq2seq.lr, 0.01)
    self.assertTrue(hasattr(another_seq2seq, 'weight_decay'))
    self.assertAlmostEqual(another_seq2seq.weight_decay, 0.0001)
    self.assertTrue(hasattr(another_seq2seq, 'lowercase'))
    self.assertFalse(another_seq2seq.lowercase)
    self.assertTrue(hasattr(another_seq2seq, 'verbose'))
    self.assertTrue(another_seq2seq.verbose)
    self.assertTrue(hasattr(another_seq2seq, 'random_state'))
    self.assertEqual(another_seq2seq.random_state, 42)

def test_serialize_trained(self):
    input_texts, target_texts = self.load_text_pairs(self.data_set_name)
    indices = list(range(len(input_texts)))
    random.shuffle(indices)
    n = int(round(0.2 * len(indices)))
    input_texts_for_training = [input_texts[ind] for ind in indices[:-n]]
    target_texts_for_training = [target_texts[ind] for ind in indices[:-n]]
    input_texts_for_testing = [input_texts[ind] for ind in indices[-n:]]
    target_texts_for_testing = [target_texts[ind] for ind in indices[-n:]]
    seq2seq = Seq2SeqLSTM(validation_split=None, epochs=10, lr=1e-3)
    seq2seq.fit(input_texts_for_training, target_texts_for_training,
                eval_set=(input_texts_for_testing, target_texts_for_testing))
    predicted_texts_1 = seq2seq.predict(input_texts_for_testing)
    with open(self.model_name, 'wb') as fp:
        pickle.dump(seq2seq, fp)
    del seq2seq
    with open(self.model_name, 'rb') as fp:
        another_seq2seq = pickle.load(fp)
    predicted_texts_2 = another_seq2seq.predict(input_texts_for_testing)
    self.assertEqual(predicted_texts_1, predicted_texts_2)

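# For the pickling round-trip above to succeed, the fitted Keras models have to be
# made serialisable (Keras `Model` objects were not directly picklable in older
# Keras versions). One common approach, sketched here only as an assumption about
# how `Seq2SeqLSTM` might implement it, packs each model into an
# (architecture JSON, weight arrays) pair inside `__getstate__` and rebuilds it
# inside `__setstate__`:
class PicklableSeq2SeqSketch(object):
    def __getstate__(self):
        state = self.__dict__.copy()
        for name in ('encoder_model_', 'decoder_model_'):
            model = state.pop(name, None)
            if model is not None:
                # to_json() serialises the architecture; get_weights() returns
                # the weight arrays as a list of NumPy arrays.
                state[name] = (model.to_json(), model.get_weights())
        return state

    def __setstate__(self, state):
        from keras.models import model_from_json
        for name in ('encoder_model_', 'decoder_model_'):
            packed = state.get(name)
            if isinstance(packed, tuple):
                model = model_from_json(packed[0])
                model.set_weights(packed[1])
                state[name] = model
        self.__dict__.update(state)
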
def test_fit_positive05(self):
    """ A prepared evaluation set is used for the early stopping criterion. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(validation_split=None, lr=1e-2)
    res = seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                      eval_set=(input_texts_for_training[-20:], target_texts_for_training[-20:]))
    self.assertIsInstance(res, Seq2SeqLSTM)
    self.assertTrue(hasattr(res, 'input_token_index_'))
    self.assertIsInstance(res.input_token_index_, dict)
    self.assertTrue(hasattr(res, 'target_token_index_'))
    self.assertIsInstance(res.target_token_index_, dict)
    self.assertTrue(hasattr(res, 'reverse_target_char_index_'))
    self.assertIsInstance(res.reverse_target_char_index_, dict)
    self.assertTrue(hasattr(res, 'max_encoder_seq_length_'))
    self.assertIsInstance(res.max_encoder_seq_length_, int)
    self.assertGreater(res.max_encoder_seq_length_, 0)
    self.assertTrue(hasattr(res, 'max_decoder_seq_length_'))
    self.assertIsInstance(res.max_decoder_seq_length_, int)
    self.assertGreater(res.max_decoder_seq_length_, 0)
    self.assertTrue(hasattr(res, 'encoder_model_'))
    self.assertIsInstance(res.encoder_model_, Model)
    self.assertTrue(hasattr(res, 'decoder_model_'))
    self.assertIsInstance(res.decoder_model_, Model)

def test_fit_negative04(self):
    """ A parameter of the `Seq2SeqLSTM` object is wrong. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(batch_size=0)
    true_err_msg = re.escape('`batch_size` must be a positive number! 0 is not positive.')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        seq2seq.fit(input_texts_for_training, target_texts_for_training)

def test_fit_negative03(self):
    """ The number of input texts does not equal the number of target texts. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM()
    true_err_msg = re.escape(f'`X` does not correspond to `y`! '
                             f'{len(input_texts_for_training)} != {len(target_texts_for_training) - 1}.')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        seq2seq.fit(input_texts_for_training, target_texts_for_training[:-1])

def test_fit_negative02(self):
    """ The object with target texts is not one of the basic sequence types. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM()
    true_err_msg = re.escape(f'`{type({1, 2})}` is wrong type for `y`.')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        seq2seq.fit(input_texts_for_training, set(target_texts_for_training))

def test_predict_negative002(self):
    """ Input texts for prediction are of a wrong type. """
    input_texts_for_testing, target_texts_for_testing = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(validation_split=None, epochs=20)
    seq2seq.fit(input_texts_for_testing, target_texts_for_testing)
    true_err_msg = re.escape(f'`{type({1, 2})}` is wrong type for `X`.')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        _ = seq2seq.predict(set(input_texts_for_testing))

def test_fit_negative09(self):
    """ The number of input texts does not equal the number of target texts in the special evaluation set. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM()
    true_err_msg = re.escape('`X_eval_set` does not correspond to `y_eval_set`! 20 != 19.')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                    eval_set=(input_texts_for_training[-20:], target_texts_for_training[-19:]))

def test_fit_negative07(self):
    """ The object with input texts in the special evaluation set is not one of the basic sequence types. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM()
    true_err_msg = re.escape(f'`{type({1, 2})}` is wrong type for `X_eval_set`.')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                    eval_set=(set(input_texts_for_training[-20:]), target_texts_for_training[-20:]))

def test_fit_negative06(self):
    """ The special evaluation set is not a two-element sequence. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(validation_split=None)
    true_err_msg = re.escape('`eval_set` must be a two-element sequence! 3 != 2')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                    eval_set=(input_texts_for_training[-20:], target_texts_for_training[-20:], [3, 4]))

def test_fit_negative05(self):
    """ The special evaluation set is neither a list nor a tuple. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(validation_split=None)
    true_err_msg = re.escape(f'`eval_set` must be `{type((1, 2))}` or `{type([1, 2])}`, '
                             f'not `{type({1: "a", 2: "b"})}`!')
    try:
        checking_method = self.assertRaisesRegex
    except AttributeError:
        checking_method = self.assertRaisesRegexp
    with checking_method(ValueError, true_err_msg):
        seq2seq.fit(input_texts_for_training[:-20], target_texts_for_training[:-20],
                    eval_set={'X': input_texts_for_training[-20:], 'y': target_texts_for_training[-20:]})

def test_predict_positive001(self):
    """ The share of correctly predicted texts must be greater than 0.0001. """
    input_texts, target_texts = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(validation_split=None, epochs=200, lr=1e-2, verbose=True, lowercase=False)
    predicted_texts = seq2seq.fit_predict(input_texts, target_texts)
    self.assertIsInstance(predicted_texts, list)
    self.assertEqual(len(predicted_texts), len(input_texts))
    indices = list(range(len(predicted_texts)))
    random.shuffle(indices)
    print('')
    print('Some predicted texts:')
    for ind in range(min(5, len(predicted_texts))):
        print(' True: ' + self.detokenize_text(target_texts[indices[ind]]) +
              '\t Predicted: ' + self.detokenize_text(predicted_texts[indices[ind]]))
    self.assertGreater(self.estimate(predicted_texts, target_texts), 0.0001)

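# The helper `self.estimate` above is assumed to return the share of predictions
# that exactly match the reference texts; a minimal sketch consistent with that
# usage (not necessarily the suite's actual helper):
def estimate_sketch(predicted_texts, true_texts):
    """Share of predicted texts that exactly match the corresponding reference."""
    n_correct = sum(1 for p, t in zip(predicted_texts, true_texts) if p == t)
    return n_correct / float(len(true_texts))
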
def test_fit_positive02(self):
    """ Input and target texts for training are 1-D NumPy arrays. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(lr=1e-2)
    res = seq2seq.fit(np.array(input_texts_for_training), np.array(target_texts_for_training))
    self.assertIsInstance(res, Seq2SeqLSTM)
    self.assertTrue(hasattr(res, 'input_token_index_'))
    self.assertIsInstance(res.input_token_index_, dict)
    self.assertTrue(hasattr(res, 'target_token_index_'))
    self.assertIsInstance(res.target_token_index_, dict)
    self.assertTrue(hasattr(res, 'reverse_target_char_index_'))
    self.assertIsInstance(res.reverse_target_char_index_, dict)
    self.assertTrue(hasattr(res, 'max_encoder_seq_length_'))
    self.assertIsInstance(res.max_encoder_seq_length_, int)
    self.assertGreater(res.max_encoder_seq_length_, 0)
    self.assertTrue(hasattr(res, 'max_decoder_seq_length_'))
    self.assertIsInstance(res.max_decoder_seq_length_, int)
    self.assertGreater(res.max_decoder_seq_length_, 0)
    self.assertTrue(hasattr(res, 'encoder_model_'))
    self.assertIsInstance(res.encoder_model_, Model)
    self.assertTrue(hasattr(res, 'decoder_model_'))
    self.assertIsInstance(res.decoder_model_, Model)

def test_fit_positive04(self):
    """ Early stopping is not used in the training process. """
    input_texts_for_training, target_texts_for_training = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(validation_split=None, lr=1e-2)
    res = seq2seq.fit(input_texts_for_training, target_texts_for_training)
    self.assertIsInstance(res, Seq2SeqLSTM)
    self.assertTrue(hasattr(res, 'input_token_index_'))
    self.assertIsInstance(res.input_token_index_, dict)
    self.assertTrue(hasattr(res, 'target_token_index_'))
    self.assertIsInstance(res.target_token_index_, dict)
    self.assertTrue(hasattr(res, 'reverse_target_char_index_'))
    self.assertIsInstance(res.reverse_target_char_index_, dict)
    self.assertTrue(hasattr(res, 'max_encoder_seq_length_'))
    self.assertIsInstance(res.max_encoder_seq_length_, int)
    self.assertGreater(res.max_encoder_seq_length_, 0)
    self.assertTrue(hasattr(res, 'max_decoder_seq_length_'))
    self.assertIsInstance(res.max_decoder_seq_length_, int)
    self.assertGreater(res.max_decoder_seq_length_, 0)
    self.assertTrue(hasattr(res, 'encoder_model_'))
    self.assertIsInstance(res.encoder_model_, Model)
    self.assertTrue(hasattr(res, 'decoder_model_'))
    self.assertIsInstance(res.decoder_model_, Model)

def test_creation(self):
    seq2seq = Seq2SeqLSTM(batch_size=256, epochs=200, latent_dim=500, validation_split=0.1,
                          grad_clipping=50.0, lr=0.01, weight_decay=0.0001, lowercase=False,
                          verbose=True)
    self.assertIsInstance(seq2seq, Seq2SeqLSTM)
    self.assertTrue(hasattr(seq2seq, 'batch_size'))
    self.assertEqual(seq2seq.batch_size, 256)
    self.assertTrue(hasattr(seq2seq, 'epochs'))
    self.assertEqual(seq2seq.epochs, 200)
    self.assertTrue(hasattr(seq2seq, 'latent_dim'))
    self.assertEqual(seq2seq.latent_dim, 500)
    self.assertTrue(hasattr(seq2seq, 'validation_split'))
    self.assertAlmostEqual(seq2seq.validation_split, 0.1)
    self.assertTrue(hasattr(seq2seq, 'grad_clipping'))
    self.assertAlmostEqual(seq2seq.grad_clipping, 50.0)
    self.assertTrue(hasattr(seq2seq, 'lr'))
    self.assertAlmostEqual(seq2seq.lr, 0.01)
    self.assertTrue(hasattr(seq2seq, 'weight_decay'))
    self.assertAlmostEqual(seq2seq.weight_decay, 0.0001)
    self.assertTrue(hasattr(seq2seq, 'lowercase'))
    self.assertFalse(seq2seq.lowercase)
    self.assertTrue(hasattr(seq2seq, 'verbose'))
    self.assertTrue(seq2seq.verbose)
    self.assertTrue(hasattr(seq2seq, 'random_state'))
    self.assertIsNone(seq2seq.random_state)

def test_tokenize_text_positive02(self):
    """ Tokenization that lowercases the resulting tokens. """
    src = 'a\t B c Мама мыла \n\r раму 1\n'
    dst_true = ['a', 'b', 'c', 'мама', 'мыла', 'раму', '1']
    dst_predicted = Seq2SeqLSTM.tokenize_text(src, lowercase=True)
    self.assertEqual(dst_predicted, dst_true)

def test_tokenize_text_positive01(self):
    """ Tokenization that preserves character case. """
    src = 'a\t B c Мама мыла \n\r раму 1\n'
    dst_true = ['a', 'B', 'c', 'Мама', 'мыла', 'раму', '1']
    dst_predicted = Seq2SeqLSTM.tokenize_text(src, lowercase=False)
    self.assertEqual(dst_predicted, dst_true)

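# The two tokenization tests above pin down the observable behaviour of
# `Seq2SeqLSTM.tokenize_text`: split on any run of whitespace and optionally
# lowercase the tokens. A minimal implementation consistent with them (a sketch,
# not necessarily the library's actual code):
def tokenize_text_sketch(src, lowercase):
    tokens = src.split()  # str.split() collapses spaces, tabs and newlines
    return [token.lower() for token in tokens] if lowercase else tokens
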
def main():
    if len(sys.argv) > 1:
        model_name = os.path.normpath(sys.argv[1].strip())
        if len(model_name) == 0:
            model_name = None
        else:
            model_dir_name = os.path.dirname(model_name)
            if len(model_dir_name) > 0:
                assert os.path.isdir(model_dir_name), \
                    u'Directory "{0}" does not exist!'.format(model_dir_name)
    else:
        model_name = None
    input_texts_for_training, target_texts_for_training = shuffle_text_pairs(
        *load_text_pairs(os.path.join(os.path.dirname(__file__), 'data', 'eng_rus_for_training.txt'))
    )
    print(u'')
    print(u'There are {0} text pairs in the training data.'.format(len(input_texts_for_training)))
    print(u'Some samples of these text pairs:')
    for ind in range(10):
        input_text = input_texts_for_training[ind]
        target_text = target_texts_for_training[ind]
        print(u' ' + detokenize_text(input_text) + u'\t' + detokenize_text(target_text))
    print(u'')
    input_texts_for_testing, target_texts_for_testing = load_text_pairs(
        os.path.join(os.path.dirname(__file__), 'data', 'eng_rus_for_testing.txt')
    )
    print(u'There are {0} text pairs in the testing data.'.format(len(input_texts_for_testing)))
    print(u'Some samples of these text pairs:')
    indices = list(range(len(input_texts_for_testing)))
    random.shuffle(indices)
    for ind in indices[:10]:
        input_text = input_texts_for_testing[ind]
        target_text = target_texts_for_testing[ind]
        print(u' ' + detokenize_text(input_text) + u'\t' + detokenize_text(target_text))
    print(u'')
    if (model_name is not None) and os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            seq2seq = pickle.load(fp)
        assert isinstance(seq2seq, Seq2SeqLSTM), \
            u'A sequence-to-sequence neural model cannot be loaded from file "{0}".'.format(model_name)
        print(u'')
        print(u'Model has been successfully loaded from file "{0}".'.format(model_name))
    else:
        seq2seq = Seq2SeqLSTM(latent_dim=256, validation_split=0.1, epochs=1, lr=1e-3,
                              verbose=True, lowercase=False, batch_size=64)
        seq2seq.fit(input_texts_for_training, target_texts_for_training)
        print(u'')
        print(u'Training has been successfully finished.')
        if model_name is not None:
            with open(model_name, 'wb') as fp:
                pickle.dump(seq2seq, fp, protocol=2)
            print(u'Model has been successfully saved into file "{0}".'.format(model_name))
    start_time = time.time()
    predicted_texts = seq2seq.predict(input_texts_for_testing)
    end_time = time.time()
    sentence_correct, word_correct, character_correct = estimate(predicted_texts, target_texts_for_testing)
    print(u'')
    print(u'{0} texts have been predicted.'.format(len(predicted_texts)))
    print(u'Some samples of predicted text pairs:')
    for ind in indices[:10]:
        input_text = input_texts_for_testing[ind]
        target_text = predicted_texts[ind]
        print(u' ' + detokenize_text(input_text) + u'\t' + detokenize_text(target_text))
    print(u'')
    print(u'Total sentence correct is {0:.2%}.'.format(sentence_correct))
    print(u'Total word correct is {0:.2%}.'.format(word_correct))
    print(u'Total character correct is {0:.2%}.'.format(character_correct))
    print(u'')
    print(u'Mean time of sentence prediction is {0:.3} sec.'.format(
        (end_time - start_time) / len(predicted_texts)))

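# The demo above is presumably run as a script, taking an optional model file name
# as its first command-line argument; the standard entry-point guard would be:
if __name__ == '__main__':
    main()  # e.g. `python demo.py seq2seq_model.pkl` (file name is illustrative)
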
def test_predict_negative001(self):
    """ Usage of the seq2seq model for prediction without training. """
    input_texts_for_testing, _ = self.load_text_pairs(self.data_set_name)
    seq2seq = Seq2SeqLSTM(validation_split=None, epochs=20)
    with self.assertRaises(NotFittedError):
        _ = seq2seq.predict(input_texts_for_testing)