def update_graph(self, word_vocab, char_vocab, max_word_length, epoch):
    """Build this individual's training and validation graphs for one epoch.

    Both graphs are created inside ``self._graph`` on GPU ``self._gpu_id``,
    under a variable scope named after ``epoch`` and ``self._id_number``.
    The validation graph re-enters that scope with ``reuse=True``.

    Args:
        word_vocab: object whose ``size`` attribute is read.
        char_vocab: object whose ``size`` attribute is read.
        max_word_length: forwarded to ``model.individual_graph``.
        epoch: epoch index; only used to name the variable scope.

    Returns:
        Tuple ``(my_model, valid_model, saver)``.
    """
    scope_name = "Epoch_%d_Individual_%d" % (epoch, self._id_number)

    def build_individual():
        # Identical arguments are used for the training and validation graphs.
        # NOTE(review): validation is therefore built with the training
        # dropout value -- confirm this is intended.
        return model.individual_graph(
            char_vocab_size=char_vocab.size,
            word_vocab_size=word_vocab.size,
            char_embed_size=self._knowledge.char_embed_size[-1],
            batch_size=FLAGS.batch_size,
            max_word_length=max_word_length,
            num_unroll_steps=FLAGS.num_unroll_steps,
            num_highway_layers=2,
            cnn_layer=self._cnn_layer,
            rnn_layers=self._rnn_layers,
            dropout=self._knowledge.dropout[-1])

    def attach_loss(graph):
        graph.update(
            model.loss_graph(graph.logits, FLAGS.batch_size,
                             FLAGS.num_unroll_steps))

    with self._graph.as_default():
        with tf.device('/gpu:%d' % self._gpu_id):
            uniform_init = tf.random_uniform_initializer(
                -FLAGS.param_init, FLAGS.param_init)

            # TODO: other way to rebuild graph? Variables are currently never
            # reused across epochs (reuse=None), hence the per-epoch scope.
            with tf.variable_scope(scope_name,
                                   initializer=uniform_init,
                                   reuse=None):
                my_model = build_individual()
                attach_loss(my_model)
                my_model.update(
                    model.training_graph(
                        my_model.loss * FLAGS.num_unroll_steps,
                        FLAGS.learning_rate, FLAGS.max_grad_norm))

            # The saver is created before the validation graph is built.
            saver = tf.train.Saver(max_to_keep=1)

            with tf.variable_scope(scope_name, reuse=True):
                valid_model = build_individual()
                attach_loss(valid_model)

    return my_model, valid_model, saver
def model(self):
    """Return a tiny inference graph with its loss graph merged in.

    The sizes are fixed small constants (batch of 4, 3 unroll steps, one
    RNN layer of size 5, a single width-2 kernel with 2 features).
    """
    graph_kwargs = dict(
        char_vocab_size=51,
        word_vocab_size=10000,
        char_embed_size=3,
        batch_size=4,
        num_highway_layers=0,
        num_rnn_layers=1,
        rnn_size=5,
        max_word_length=11,
        kernels=[2],
        kernel_features=[2],
        num_unroll_steps=3,
        dropout=0.0,
    )
    m = model.inference_graph(**graph_kwargs)
    loss_part = model.loss_graph(m.logits, batch_size=4, num_unroll_steps=3)
    m.update(loss_part)
    return m
def model(self):
    """Return a tiny inference graph with its loss graph merged in.

    The sizes are fixed small constants (batch of 4, 3 unroll steps, one
    RNN layer of size 5, a single width-2 kernel with 2 features).
    """
    graph_kwargs = dict(
        char_vocab_size=51,
        word_vocab_size=10000,
        char_embed_size=3,
        batch_size=4,
        num_highway_layers=0,
        num_rnn_layers=1,
        rnn_size=5,
        max_word_length=11,
        kernels=[2],
        kernel_features=[2],
        num_unroll_steps=3,
        dropout=0.0,
    )
    m = model.inference_graph(**graph_kwargs)
    loss_part = model.loss_graph(m.logits, batch_size=4, num_unroll_steps=3)
    m.update(loss_part)
    return m
def test_loss_avg(self):
    """Check the loss value for a 2x2 batch of sharply peaked logits.

    Every position puts its large logit on class 4. With all targets equal
    to 4 the test expects a loss of 0; with half of the targets set to
    class 0 it expects a loss of 10.
    """
    peaked_row = [-10, -10, -10, -10, 10]
    cases = (
        ([[4, 4], [4, 4]], 0),
        ([[0, 0], [4, 4]], 10),
    )
    with self.test_session() as sess:
        logits = tf.placeholder(tf.float32, [2, 2, 5], name='logits')
        l = model.loss_graph(logits, 2, 2)

        for targets, expected in cases:
            feed = {
                'logits:0': np.array([
                    [peaked_row, peaked_row],
                    [peaked_row, peaked_row],
                ]),
                'Loss/targets:0': np.array(targets),
            }
            loss = sess.run(l.loss, feed)
            print(loss, np.exp(loss))
            self.assertAllClose(loss, expected)
def test_loss(self):
    """Check the loss on a single position for two hand-computed cases.

    A sharply peaked logit vector with the matching target is expected to
    give a loss of 0; uniform logits over 5 classes are expected to give
    log(5).
    """
    cases = (
        ([[[-10, -10, -10, -10, 10]]], [[4]], 0),
        ([[[0, 0, 0, 0, 0]]], [[0]], np.log(5)),
    )
    with self.test_session() as sess:
        logits = tf.placeholder(tf.float32, [1, 1, 5], name='logits')
        l = model.loss_graph(logits, 1, 1)

        for logit_values, targets, expected in cases:
            feed = {
                'logits:0': np.array(logit_values),
                'Loss/targets:0': np.array(targets),
            }
            loss = sess.run(l.loss, feed)
            print(loss, np.exp(loss))
            self.assertAllClose(loss, expected)
def main(file,
         batch_size=20,
         num_unroll_steps=35,
         char_embed_size=15,
         rnn_size=650,
         kernels="[1,2,3,4,5,6,7]",
         kernel_features="[50,100,150,200,200,200,200]",
         max_grad_norm=5.0,
         learning_rate=1.0,
         learning_rate_decay=0.5,
         decay_when=1.0,
         seed=3435,
         param_init=0.05,
         max_epochs=25,
         print_every=5):
    '''Train the model, validating and checkpointing after every epoch.

    Args:
        file: not used by this function; kept for interface compatibility.
        batch_size: batch size for the readers and both graphs.
        num_unroll_steps: unroll length; also used to scale the training loss.
        char_embed_size: forwarded to ``model.inference_graph``.
        rnn_size: forwarded to ``model.inference_graph``.
        kernels: string holding a Python list literal of kernel widths.
        kernel_features: string holding a Python list literal of feature counts.
        max_grad_norm: forwarded to ``model.training_graph``.
        learning_rate: initial learning rate.
        learning_rate_decay: multiplier applied when validation stalls.
        decay_when: minimum perplexity improvement that avoids a decay.
        seed: seed for TensorFlow and NumPy.
        param_init: parameters are initialised uniformly in
            ``[-param_init, param_init]``.
        max_epochs: maximum number of epochs.
        print_every: print progress every this many batches.

    Training stops early once the decayed learning rate drops below 1e-5.
    '''
    if not os.path.exists(TRAINING_DIR):
        os.mkdir(TRAINING_DIR)
        print('Created training directory', TRAINING_DIR)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_dataset()
    print('initialized all dataset readers')

    # NOTE(review): eval() executes arbitrary code. If these strings can come
    # from an untrusted source, switch to ast.literal_eval.
    kernel_widths = eval(kernels)
    kernel_feature_sizes = eval(kernel_features)

    with tf.Graph().as_default(), tf.Session() as session:
        train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                                  batch_size, num_unroll_steps, char_vocab)
        valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                                  batch_size, num_unroll_steps, char_vocab)
        # Constructed as in the original, but not consumed by this function.
        test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                                 batch_size, num_unroll_steps, char_vocab)

        # tensorflow seed must be inside graph
        tf.set_random_seed(seed)
        np.random.seed(seed=seed)

        # ---- build training graph ----
        # Fix: the range used to be (param_init, param_init), i.e. minval ==
        # maxval, which starts every parameter at the same constant.
        initializer = tf.random_uniform_initializer(-param_init, param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                char_vocab_size=char_vocab.size(),
                word_vocab_size=word_vocab.size(),
                char_embed_size=char_embed_size,
                batch_size=batch_size,
                rnn_size=rnn_size,
                max_word_length=max_word_length,
                kernels=kernel_widths,
                kernel_features=kernel_feature_sizes,
                num_unroll_steps=num_unroll_steps)
            train_model.update(
                model.loss_graph(train_model.logits, batch_size,
                                 num_unroll_steps))

            # Scaling the loss by num_unroll_steps scales the gradients by the
            # same factor. This reproduces how the original Torch code
            # optimizes; without it the gradients would be num_unroll_steps
            # times smaller and learning_rate / max_grad_norm would have to be
            # rescaled to compensate.
            train_model.update(
                model.training_graph(train_model.loss * num_unroll_steps,
                                     learning_rate, max_grad_norm))

        # create saver before creating more graph nodes, so that we do not
        # save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        # ---- validation graph (shares parameters with the training graph) ----
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                char_vocab_size=char_vocab.size(),
                word_vocab_size=word_vocab.size(),
                char_embed_size=char_embed_size,
                batch_size=batch_size,
                rnn_size=rnn_size,
                max_word_length=max_word_length,
                kernels=kernel_widths,
                kernel_features=kernel_feature_sizes,
                num_unroll_steps=num_unroll_steps)
            valid_model.update(
                model.loss_graph(valid_model.logits, batch_size,
                                 num_unroll_steps))

        # Restoring from a checkpoint is not supported here; the model always
        # starts fresh.
        tf.global_variables_initializer().run()
        session.run(train_model.clear_char_embedding_padding)
        print('Created and initialized fresh model. Size:', model.model_size())

        summary_writer = tf.summary.FileWriter(TRAINING_DIR,
                                               graph=session.graph)

        # take learning rate from the arguments, not from the graph default
        session.run(tf.assign(train_model.learning_rate, learning_rate))

        # ---- training starts here ----
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(max_epochs):
            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    [
                        train_model.loss,
                        train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm,
                        train_model.global_step,
                        train_model.clear_char_embedding_padding,
                    ],
                    {
                        train_model.input: x,
                        train_model.targets: y,
                        train_model.initial_rnn_state: rnn_state,
                    })

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time
                if count % print_every == 0:
                    print(
                        '%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), time_elapsed, gradient_norm))

            print('Epoch training time:', time.time() - epoch_start_time)

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1
                start_time = time.time()

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state],
                    {
                        valid_model.input: x,
                        valid_model.targets: y,
                        valid_model.initial_rnn_state: rnn_state,
                    })

                if count % print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))

            save_as = '%s/epoch%03d_%.4f.model' % (TRAINING_DIR, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss",
                                 simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss",
                                 simple_value=avg_valid_loss)
            ])
            summary_writer.add_summary(summary, step)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(
                    avg_valid_loss) > np.exp(best_valid_loss) - decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss
def main(print):
    '''Load a trained model, run it on the test split and pickle the results.

    Args:
        print: callable used for all progress messages.

    Returns:
        -1 when no checkpoint is configured, the checkpoint index file is
        missing, or the test reader produced no batches; otherwise None.

    Per-batch labels, logits, file names and sentence indices are collected
    into a DataFrame and written to ``FLAGS.train_dir + '/test_results.pkl'``.
    '''
    if FLAGS.load_model_for_test is None:
        print('Please specify checkpoint file to load model from')
        return -1

    if not os.path.exists(FLAGS.load_model_for_test + ".index"):
        print('Checkpoint file not found', FLAGS.load_model_for_test)
        return -1

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length, words_list, wers, acoustics, files_name, kaldi_sents_index = \
        load_test_data(FLAGS.data_dir, FLAGS.max_word_length,
                       num_unroll_steps=FLAGS.num_unroll_steps,
                       eos=FLAGS.EOS, datas=['test'])

    test_reader = TestDataReader(word_tensors['test'], char_tensors['test'],
                                 FLAGS.batch_size, FLAGS.num_unroll_steps,
                                 wers['test'], files_name['test'],
                                 kaldi_sents_index['test'])

    fasttext_model_path = None
    if FLAGS.fasttext_model_path:
        fasttext_model_path = FLAGS.fasttext_model_path

    # Fix: test_ft_reader used to exist only on the fastText path but was
    # consumed unconditionally, raising NameError for other embeddings.
    use_fasttext = 'fasttext' in FLAGS.embedding
    test_ft_reader = None
    if use_fasttext:
        fasttext_model = FasttextModel(
            fasttext_path=fasttext_model_path).get_fasttext_model()
        test_ft_reader = DataReaderFastText(
            words_list=words_list,
            batch_size=FLAGS.batch_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            model=fasttext_model,
            data='test',
            acoustics=acoustics)

    print('initialized test dataset reader')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # ---- build inference graph ----
        # NOTE(review): eval() on flag strings executes arbitrary code; prefer
        # ast.literal_eval if the flags can come from an untrusted source.
        with tf.variable_scope("Model"):
            m = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=eval(FLAGS.kernels),
                kernel_features=eval(FLAGS.kernel_features),
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0,
                embedding=FLAGS.embedding,
                fasttext_word_dim=300,
                acoustic_features_dim=4)
            m.update(model.loss_graph(m.logits, FLAGS.batch_size))

            global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        saver = tf.train.Saver()
        saver.restore(session, FLAGS.load_model_for_test)
        print('Loaded model from' + str(FLAGS.load_model_for_test) +
              'saved at global step' + str(global_step.eval()))

        # ---- evaluation starts here ----
        # The same initial RNN state is fed to every batch; the final state is
        # not carried over between batches.
        rnn_state = session.run(m.initial_rnn_state)
        count = 0
        avg_loss = 0
        labels = []
        predictions = []
        files_name_list = []
        kaldi_sents_index_list = []
        start_time = time.time()

        if use_fasttext:
            batches = zip(test_reader.iter(), test_ft_reader.iter())
        else:
            batches = ((batch, None) for batch in test_reader.iter())

        for batch_kim, batch_ft in batches:
            count += 1
            x, y, files_name_batch, kaldi_sents_index_batch = batch_kim

            feed = {
                m.input: x,
                m.targets: y,
                m.initial_rnn_state: rnn_state,
            }
            if use_fasttext:
                feed[m.input2] = batch_ft

            loss, logits = session.run([m.loss, m.logits], feed)

            # Fix: the batch loss was never accumulated, so the reported
            # average test loss was always 0.
            avg_loss += loss

            labels.append(y)
            predictions.append(logits)
            files_name_list.append(files_name_batch)
            kaldi_sents_index_list.append(kaldi_sents_index_batch)

        if count == 0:
            # Avoid dividing by zero below when the reader yields nothing.
            print('No test batches were produced')
            return -1

        avg_loss /= count
        time_elapsed = time.time() - start_time

        print("test loss = %6.8f, perplexity = %6.8f" %
              (avg_loss, np.exp(avg_loss)))
        print("test samples:" + str(count * FLAGS.batch_size) +
              "time elapsed:" + str(time_elapsed) + "time per one batch:" +
              str(time_elapsed / count))

        df = pd.DataFrame({
            "labels": labels,
            "predictions": predictions,
            "files_name": files_name_list,
            "kaldi_sents_index": kaldi_sents_index_list
        })
        # Keep only the first element of each batch's logits.
        df['predictions'] = df['predictions'].apply(lambda x: x[0])

        # One row per element of each batch.
        final_df = pd.DataFrame()
        final_df['labels'] = df.explode('labels')['labels']
        final_df['predictions'] = df.explode('predictions')['predictions']
        final_df['files_name'] = df.explode('files_name')['files_name']
        final_df['kaldi_sents_index'] = df.explode(
            'kaldi_sents_index')['kaldi_sents_index']
        final_df.reset_index(drop=True, inplace=True)
        for col in final_df.columns:
            final_df[col] = final_df[col].apply(lambda column: column[0])
        final_df.to_pickle(FLAGS.train_dir + '/test_results.pkl')

        # NOTE(review): this helper is not called in the visible part of the
        # function. It looks like a per-file groupby callback -- confirm.
        def get_wers_results(group):
            file_name = group.name
            # Row with the smallest prediction value within the group.
            our_best_prediction_index = group['predictions'].values.argmin()
            our_wer_label = group.iloc[our_best_prediction_index]['labels']
            kaldis_best_prediction_row = group[group['kaldi_sents_index'] == 1]
            kaldis_wer_label = kaldis_best_prediction_row['labels']
            min_wer = min(our_wer_label, kaldis_wer_label.values)
            return pd.DataFrame({
                'file_name': file_name,
                'our_wer_label': our_wer_label,
                'kaldis_wer_label': kaldis_wer_label,
                'min': min_wer
            })
def main(print):
    '''Train the model, validating, checkpointing and logging every epoch.

    Args:
        print: callable used for all progress messages.

    Side effects:
        * creates ``FLAGS.train_dir`` when it does not exist;
        * writes ``train_parameters.csv`` (flag values) and
          ``train_results.csv`` (per-epoch metrics) into that directory;
        * saves a checkpoint and a TensorBoard summary after every epoch.

    Training stops early once the decayed learning rate drops below 1e-3.
    '''
    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory' + FLAGS.train_dir)

    # CSV initialize
    pd.DataFrame(FLAGS.flag_values_dict(),
                 index=range(1)).to_csv(FLAGS.train_dir +
                                        '/train_parameters.csv')
    epochs_results = initialize_epoch_data_dict()

    fasttext_model_path = None
    if FLAGS.fasttext_model_path:
        fasttext_model_path = FLAGS.fasttext_model_path

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length, words_list = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    fasttext_model = None
    train_ft_reader = None
    valid_ft_reader = None
    if 'fasttext' in FLAGS.embedding:
        fasttext_model = FasttextModel(
            fasttext_path=fasttext_model_path).get_fasttext_model()

        train_ft_reader = DataReaderFastText(
            words_list=words_list,
            batch_size=FLAGS.batch_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            model=fasttext_model,
            data='train')

        valid_ft_reader = DataReaderFastText(
            words_list=words_list,
            batch_size=FLAGS.batch_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            model=fasttext_model,
            data='valid')
    use_fasttext = fasttext_model is not None

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    # Constructed as in the original, but not consumed by this function.
    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized all dataset readers')

    def paired_batches(reader, ft_reader):
        # Yield ((x, y), fasttext_batch); fasttext_batch is None when fastText
        # embeddings are not in use.
        if use_fasttext:
            return zip(reader.iter(), ft_reader.iter())
        return ((batch, None) for batch in reader.iter())

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # NOTE(review): eval() on flag strings executes arbitrary code; prefer
        # ast.literal_eval if the flags can come from an untrusted source.
        kernel_widths = eval(FLAGS.kernels)
        kernel_feature_sizes = eval(FLAGS.kernel_features)

        # ---- build training graph ----
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=kernel_widths,
                kernel_features=kernel_feature_sizes,
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=FLAGS.dropout,
                embedding=FLAGS.embedding,
                fasttext_word_dim=300,
                acoustic_features_dim=4)
            train_model.update(
                model.loss_graph(train_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))
            # The loss is scaled by the unroll length before building the
            # training op, which scales the gradients by the same factor.
            train_model.update(
                model.training_graph(
                    train_model.loss * FLAGS.num_unroll_steps,
                    FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not
        # save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        # ---- validation graph (shares parameters with the training graph) ----
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=kernel_widths,
                kernel_features=kernel_feature_sizes,
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0.0,
                embedding=FLAGS.embedding,
                fasttext_word_dim=300,
                acoustic_features_dim=4)
            valid_model.update(
                model.loss_graph(valid_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

        if FLAGS.load_model_for_training:
            saver.restore(session, FLAGS.load_model_for_training)
            print('Loaded model from' + str(FLAGS.load_model_for_training) +
                  'saved at global step' +
                  str(train_model.global_step.eval()))
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            print('Created and initialized fresh model. Size:' +
                  str(model.model_size()))

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        train_fetches = [
            train_model.loss,
            train_model.train_op,
            train_model.final_rnn_state,
            train_model.global_norm,
            train_model.global_step,
            train_model.clear_char_embedding_padding,
        ]

        # ---- training starts here ----
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for (x, y), batch_ft in paired_batches(train_reader,
                                                   train_ft_reader):
                count += 1
                start_time = time.time()

                feed = {
                    train_model.input: x,
                    train_model.targets: y,
                    train_model.initial_rnn_state: rnn_state,
                }
                if use_fasttext:
                    feed[train_model.input2] = batch_ft

                # Fix: the fastText path used to unpack seven values from
                # these six fetches, which raises ValueError. It also did an
                # unused per-step word-vector lookup that could itself fail.
                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    train_fetches, feed)

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    print(
                        '%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), time_elapsed, gradient_norm))

            epoch_duration = time.time() - epoch_start_time
            print('Epoch training time:' + str(epoch_duration))
            epochs_results['epoch_training_time'].append(str(epoch_duration))

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            # Fix: validation used valid_ft_reader and input2 unconditionally,
            # which raised NameError when fastText embeddings were disabled.
            for (x, y), batch_ft in paired_batches(valid_reader,
                                                   valid_ft_reader):
                count += 1
                start_time = time.time()

                feed = {
                    valid_model.input: x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: rnn_state,
                }
                if use_fasttext:
                    feed[valid_model.input2] = batch_ft

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state], feed)

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:" + str(epoch))
            epochs_results['epoch_number'].append(str(epoch))
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            epochs_results['train_loss'].append(avg_train_loss)
            epochs_results['train_perplexity'].append(np.exp(avg_train_loss))

            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))
            epochs_results['validation_loss'].append(avg_valid_loss)
            epochs_results['valid_perplexity'].append(np.exp(avg_valid_loss))

            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model' + str(save_as))
            epochs_results['model_name'].append(str(save_as))
            epochs_results['learning_rate'].append(
                str(session.run(train_model.learning_rate)))

            # Save model performance data. Written every epoch, before the
            # early-stop check, so the file is complete even when training
            # breaks out of the loop below.
            pd.DataFrame(epochs_results).to_csv(FLAGS.train_dir +
                                                '/train_results.csv')

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss",
                                 simple_value=avg_train_loss),
                tf.Summary.Value(tag="train_perplexity",
                                 simple_value=np.exp(avg_train_loss)),
                tf.Summary.Value(tag="valid_loss",
                                 simple_value=avg_valid_loss),
                tf.Summary.Value(tag="valid_perplexity",
                                 simple_value=np.exp(avg_valid_loss)),
            ])
            summary_writer.add_summary(summary, step)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(
                    best_valid_loss) - FLAGS.decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:' + str(current_learning_rate))
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-3:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:' + str(current_learning_rate))
            else:
                best_valid_loss = avg_valid_loss
def evaluation():
    """Restore a saved model and print its mean test metrics.

    For each of arousal, valence and liking the metric is ``1 - loss``,
    computed per test minibatch and then averaged over all minibatches.

    Raises:
        AssertionError: if ``FLAGS.load_model`` is not set.
    """
    # Identity comparison is the correct test for None.
    assert FLAGS.load_model is not None

    input_tensors, label_tensors, seq_tensors = dl.make_batches()
    test_reader = dl.DataReader(input_tensors['Test'], label_tensors['Test'],
                                seq_tensors['Test'], FLAGS.batch_size,
                                FLAGS.num_unroll_steps)

    labels = tf.placeholder(tf.float32, [None, FLAGS.num_unroll_steps, 3],
                            name='labels')

    # NOTE(review): the test graph is built with FLAGS.dropout rather than
    # 0 -- confirm dropout is meant to stay active at evaluation time.
    # NOTE(review): eval() on flag strings executes arbitrary code; prefer
    # ast.literal_eval if the flags can come from an untrusted source.
    test_model = model.inference_graph(
        word_vocab_size=FLAGS.word_vocab_size,
        kernels=eval(FLAGS.kernels),
        kernel_features=eval(FLAGS.kernel_features),
        rnn_size=FLAGS.rnn_size,
        dropout=FLAGS.dropout,
        num_rnn_layers=FLAGS.rnn_layers,
        num_highway_layers=FLAGS.highway_layers,
        num_unroll_steps=FLAGS.num_unroll_steps,
        max_sent_length=FLAGS.max_sent_length,
        batch_size=FLAGS.batch_size,
        embed_size=FLAGS.word_embed_size)

    predictions = test_model.predictions
    print(predictions)

    losses = model.loss_graph(predictions, labels)
    metric_arousal = 1. - losses.loss_arousal
    metric_valence = 1. - losses.loss_valence
    metric_liking = 1. - losses.loss_liking

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # NOTE(review): the checkpoint comes from SAVE_PATH, not from the
        # FLAGS.load_model value asserted above -- confirm which is intended.
        print('load model %s ...' % SAVE_PATH)
        saver.restore(sess, SAVE_PATH)
        print('done!')

        metric = []
        for x, y in test_reader.iter():
            m_arousal, m_valence, m_liking = sess.run(
                [metric_arousal, metric_valence, metric_liking],
                feed_dict={
                    test_model.input: x,
                    labels: y
                })
            metric.append([m_arousal, m_valence, m_liking])

        # Average each metric over all minibatches.
        metric = np.mean(np.array(metric), axis=0)
        print('Test Reuslt: arousal: %.4f -- valence: %.4f -- liking: %.4f' %
              (metric[0], metric[1], metric[2]))
def main(_):
    '''Train the model, validating and checkpointing after every epoch.

    Args:
        _: unused positional argument.

    Three graphs share the "Model" variables: a training graph, a validation
    graph without dropout, and a single-step, batch-size-1 test graph passed
    to ``run_test2`` before and after training.

    Training stops early once the decayed learning rate drops below 1e-5.
    '''
    print("we in main")
    # Debug echo; guarded so that a short argv does not raise IndexError.
    if len(sys.argv) > 2:
        print(sys.argv[2])
    print(FLAGS)

    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory', FLAGS.train_dir)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    # Constructed as in the original, but not consumed by this function.
    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized all dataset readers')

    def build_inference(batch_size, num_unroll_steps, dropout):
        # All three graphs share every argument except these three.
        # NOTE(review): eval() on flag strings executes arbitrary code; prefer
        # ast.literal_eval if the flags can come from an untrusted source.
        return model.inference_graph(
            char_vocab_size=char_vocab.size,
            word_vocab_size=word_vocab.size,
            char_embed_size=FLAGS.char_embed_size,
            batch_size=batch_size,
            num_highway_layers=FLAGS.highway_layers,
            num_rnn_layers=FLAGS.rnn_layers,
            rnn_size=FLAGS.rnn_size,
            max_word_length=max_word_length,
            kernels=eval(FLAGS.kernels),
            kernel_features=eval(FLAGS.kernel_features),
            num_unroll_steps=num_unroll_steps,
            dropout=dropout)

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # ---- build training graph ----
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = build_inference(FLAGS.batch_size,
                                          FLAGS.num_unroll_steps,
                                          FLAGS.dropout)
            train_model.update(
                model.loss_graph(train_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

            # Scaling the loss by FLAGS.num_unroll_steps scales the gradients
            # by the same factor. This reproduces how the original Torch code
            # optimizes; without it the gradients would be num_unroll_steps
            # times smaller and learning rate / max_grad_norm would have to
            # be rescaled to compensate.
            train_model.update(
                model.training_graph(
                    train_model.loss * FLAGS.num_unroll_steps,
                    FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not
        # save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        # ---- validation / test graphs (share the training parameters) ----
        with tf.variable_scope("Model", reuse=True):
            valid_model = build_inference(FLAGS.batch_size,
                                          FLAGS.num_unroll_steps, 0.0)
            valid_model.update(
                model.loss_graph(valid_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

        with tf.variable_scope("Model", reuse=True):
            test_model = build_inference(1, 1, 0.0)
            test_model.update(model.loss_graph(test_model.logits, 1, 1))

        # Fix: zeroing the padding row used to build a fresh tf.assign op on
        # every call, so the graph grew with each training step. The op is
        # now created once and only re-run.
        clear_padding_op = tf.scatter_update(
            train_model.char_embedding, [0],
            tf.constant(0.0,
                        dtype=train_model.char_embedding.dtype.base_dtype,
                        shape=[1, FLAGS.char_embed_size]))

        def clear_char_embedding_padding():
            # Reset row 0 of the character embedding matrix to zeros.
            session.run(clear_padding_op)

        # NOTE(review): tf.initialize_all_variables and tf.train.SummaryWriter
        # are the pre-1.0 names; left unchanged because the targeted
        # TensorFlow version is not visible here.
        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model,
                  'saved at global step', train_model.global_step.eval())
        else:
            tf.initialize_all_variables().run()
            print('Created and initialized fresh model. Size:',
                  model.model_size())

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        clear_char_embedding_padding()

        run_test2(session, test_model, train_reader)

        # ---- training starts here ----
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                # Fix: a leftover debug `print(x); exit(1)` here ended the
                # process on the first batch, so no training ever ran.
                loss, _, rnn_state, gradient_norm, step = session.run(
                    [
                        train_model.loss,
                        train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm,
                        train_model.global_step,
                    ],
                    {
                        train_model.input: x,
                        train_model.targets: y,
                        train_model.initial_rnn_state: rnn_state,
                    })

                clear_char_embedding_padding()

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    print('%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f' % (
                        step, epoch, count, train_reader.length, loss,
                        np.exp(loss), time_elapsed, gradient_norm))

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1
                start_time = time.time()

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state],
                    {
                        valid_model.input: x,
                        valid_model.targets: y,
                        valid_model.initial_rnn_state: rnn_state,
                    })

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))

            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss",
                                 simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss",
                                 simple_value=avg_valid_loss)
            ])
            summary_writer.add_summary(summary, step)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(best_valid_loss) - FLAGS.decay_when:
                print('** validation perplexity did not improve enough, decay learning rate')
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss

        run_test2(session, test_model, train_reader)
        print("AGAIN")
        run_test2(session, test_model, train_reader)
def test(self):
    """Run the loss graph on hand-fed RNN outputs and print the resulting loss.

    The per-step RNN outputs, the softmax matrix/bias and the targets are all
    injected through the feed dict, so the value printed depends only on the
    constants in this test and on the module-level SOFTMAX_W, SOFTMAX_B and Y.
    """
    with self.test_session() as sess:
        m = self.model()
        loss_nodes = model.loss_graph(m.logits, batch_size=4, num_unroll_steps=3)

        # One 4x5 array per unroll step, paired below with m.rnn_outputs.
        step_outputs = [
            np.array([
                [-0.00840133, 0.00178184, 0.00585286, 0.00937691, 0.00332699],
                [-0.00840504, 0.00177166, 0.00586006, 0.00935978, 0.00331423],
                [-0.00839551, 0.0017945, 0.00585306, 0.00938957, 0.00333546],
                [-0.00839595, 0.00178647, 0.0058777, 0.00935818, 0.00331012],
            ]),
            np.array([
                [-0.0126155, 0.00264827, 0.00886869, 0.01411371, 0.00518486],
                [-0.01261795, 0.00264249, 0.00887132, 0.01410431, 0.00517832],
                [-0.01261209, 0.00266095, 0.00885007, 0.01414492, 0.00520893],
                [-0.01261059, 0.00265393, 0.00888564, 0.01410206, 0.00517435],
            ]),
            np.array([
                [-0.01473925, 0.00305038, 0.01042287, 0.01648509, 0.00619734],
                [-0.01472822, 0.00307533, 0.01042284, 0.01650903, 0.00621392],
                [-0.01473146, 0.00307552, 0.01039554, 0.01653865, 0.00623778],
                [-0.01472719, 0.00307848, 0.01041825, 0.01651621, 0.00621954],
            ]),
        ]

        feed = {
            'LSTM/WordEmbedding/SimpleLinear/Matrix:0': SOFTMAX_W,
            'LSTM/WordEmbedding/SimpleLinear/Bias:0': SOFTMAX_B,
            loss_nodes.targets: Y,
        }
        # Override each RNN output tensor with the matching constant array.
        feed.update(zip(m.rnn_outputs, step_outputs))

        loss_value = sess.run(loss_nodes.loss, feed)
        print(loss_value)

        # Reference dump kept from an earlier run (three 4x20 blocks); it is a
        # bare string and is never compared against anything.
        # NOTE(review): what these values represent is not evident here.
        '''
        [[-0.00115102 -0.01835673 0.01088401 0.00553839 -0.02548739 0.00961501 -0.04911561 0.04094783 0.01729541 0.04113884
           0.0110002 0.03410089 -0.02663253 0.01714642 0.03581101 -0.03634553 -0.01540088 -0.01764538 0.03884879 -0.03207963]
         [-0.00115117 -0.01835723 0.01088434 0.00553844 -0.02548673 0.00961541 -0.04911538 0.04094752 0.01729532 0.04113849
           0.01100097 0.0341017 -0.02663185 0.01714566 0.03581182 -0.03634511 -0.0154006 -0.01764595 0.03884758 -0.03208043]
         [-0.00115108 -0.01835609 0.01088368 0.00553811 -0.0254877 0.0096147 -0.04911536 0.04094845 0.01729582 0.04113897
           0.01099989 0.03410037 -0.02663329 0.01714694 0.03581046 -0.03634582 -0.01540092 -0.01764458 0.03884939 -0.03207891]
         [-0.0011517 -0.01835642 0.01088412 0.00553769 -0.02548616 0.00961538 -0.0491141 0.0409487 0.01729641 0.04113809
           0.01100182 0.03410203 -0.02663257 0.01714548 0.03581202 -0.03634498 -0.01540009 -0.01764486 0.03884656 -0.03208012]]
        [[-0.00137119 -0.01813851 0.01110794 0.00582019 -0.02566941 0.00940851 -0.04911464 0.04097762 0.0171818 0.04152314
           0.01122282 0.0339342 -0.02648103 0.01748628 0.03570804 -0.0365119 -0.01505298 -0.01722943 0.03911369 -0.03211264]
         [-0.00137125 -0.01813885 0.01110811 0.00582023 -0.02566908 0.00940875 -0.04911457 0.04097738 0.0171817 0.04152295
           0.0112232 0.03393462 -0.02648065 0.01748586 0.03570845 -0.03651169 -0.01505288 -0.01722979 0.03911307 -0.03211308]
         [-0.00137074 -0.01813789 0.01110745 0.00582038 -0.02567078 0.0094078 -0.04911549 0.04097777 0.01718157 0.04152391
           0.01122116 0.03393264 -0.02648198 0.01748771 0.03570654 -0.03651269 -0.01505364 -0.01722879 0.03911621 -0.03211133]
         [-0.00137166 -0.01813823 0.01110794 0.00581962 -0.02566858 0.00940876 -0.04911353 0.04097832 0.01718257 0.04152259
           0.01122391 0.03393493 -0.02648121 0.01748567 0.03570866 -0.03651157 -0.01505247 -0.01722896 0.03911219 -0.03211286]]
        [[-0.00148683 -0.01802572 0.01122714 0.00596118 -0.02575907 0.00930692 -0.04911366 0.04099131 0.01712257 0.04171915
           0.01133703 0.033848 -0.02640488 0.01765947 0.03566255 -0.03659378 -0.01487576 -0.0170169 0.03924932 -0.03213345]
         [-0.00148696 -0.01802452 0.01122649 0.00596062 -0.02575966 0.00930636 -0.04911316 0.04099248 0.01712336 0.04171939
           0.01133645 0.033847 -0.02640637 0.01766046 0.03566148 -0.03659437 -0.01487585 -0.01701534 0.03925047 -0.03213206]
         [-0.0014862 -0.01802442 0.01122626 0.00596135 -0.0257613 0.00930568 -0.04911482 0.04099185 0.01712243 0.04172042
           0.0113344 0.03384538 -0.02640666 0.0176619 0.03566003 -0.03659511 -0.01487673 -0.0170155 0.03925342 -0.03213113]
         [-0.00148684 -0.01802438 0.01122635 0.00596064 -0.02575997 0.0093062 -0.04911336 0.04099251 0.01712332 0.04171955
           0.01133605 0.03384664 -0.02640661 0.01766078 0.03566112 -0.03659455 -0.01487602 -0.0170152 0.03925106 -0.03213174]]
        '''

        # Fails unconditionally.
        # NOTE(review): presumably so the runner shows the printed loss - confirm.
        assert False
def train():
    """Train the three-target (arousal / valence / liking) model with periodic evaluation.

    Data comes from ``dl.make_batches`` / ``dl.sequence_init``; the network and
    its per-target losses come from ``model.inference_graph`` and
    ``model.loss_graph``.  The sum of the three losses is minimised with Adam.
    Every third batch the training metrics (``1 - loss`` per target) are printed
    and appended to ``LOGGING_PATH``; every 14th batch the whole evaluation
    reader is scored and the averaged result is printed and appended as well.

    Takes no arguments and returns nothing; all settings are read from ``FLAGS``.
    """
    dataset_tensors, labels_tensors = dl.make_batches()
    input_tensor_tr, label_tensor_tr, seq_tensor_tr = dl.sequence_init(
        dataset_tensors, labels_tensors, FLAGS.num_unroll_steps, 'Train',
        allow_short_seq=False)
    input_tensor_te, label_tensor_te, seq_tensor_te = dl.sequence_init(
        dataset_tensors, labels_tensors, FLAGS.num_unroll_steps, 'Test',
        allow_short_seq=True)

    train_reader = dl.TrainDataReader(input_tensor_tr, label_tensor_tr, seq_tensor_tr,
                                      FLAGS.batch_size, FLAGS.num_unroll_steps, False)
    eval_reader = dl.EvalDataReader(input_tensor_te, label_tensor_te, seq_tensor_te,
                                    FLAGS.batch_size_eval, FLAGS.num_unroll_steps, False)

    # Disabled alternative data pipeline, kept as a bare (no-op) string.
    '''
    input_tensors, label_tensors, seq_tensors = dl.make_batches(60)

    train_reader = dl.DataReader(input_tensors['Train'], label_tensors['Train'],
                                 seq_tensors['Train'], FLAGS.batch_size, FLAGS.num_unroll_steps)
    eval_reader = dl.DataReader(input_tensors['Devel'], label_tensors['Devel'],
                                seq_tensors['Devel'], FLAGS.batch_size, FLAGS.num_unroll_steps)
    '''

    labels = tf.placeholder(tf.float32, [None, FLAGS.num_unroll_steps, 3], name='labels')

    # NOTE(security): eval() executes whatever text the kernels / kernel_features
    # flags contain; only run this with a trusted command line.
    train_model = model.inference_graph(
        word_vocab_size=FLAGS.word_vocab_size,
        kernels=eval(FLAGS.kernels),
        kernel_features=eval(FLAGS.kernel_features),
        rnn_size=FLAGS.rnn_size,
        dropout=FLAGS.dropout,
        num_rnn_layers=FLAGS.rnn_layers,
        num_highway_layers=FLAGS.highway_layers,
        num_unroll_steps=FLAGS.num_unroll_steps,
        max_sent_length=FLAGS.max_sent_length,
        embed_size=FLAGS.word_embed_size)

    predictions = train_model.predictions
    losses = model.loss_graph(predictions, labels)
    eval_model = model.eval_metric_graph()

    loss_arousal = losses.loss_arousal
    loss_valence = losses.loss_valence
    loss_liking = losses.loss_liking

    # Training metrics are reported as (1 - loss) for each target.
    metric_arousal = 1. - loss_arousal
    metric_valence = 1. - loss_valence
    metric_liking = 1. - loss_liking

    eval_arousal = eval_model.eval_metric_arousal
    eval_valence = eval_model.eval_metric_valence
    eval_liking = eval_model.eval_metric_liking

    loss_op = loss_arousal + loss_liking + loss_valence
    train_op = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate).minimize(loss_op)

    # saver, patience and best_metric_* are only referenced by the disabled
    # early-stopping block further down; they are kept so it can be re-enabled.
    saver = tf.train.Saver()
    patience = FLAGS.patience

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        best_metric_arousal = 0.0
        best_metric_valence = 0.0
        best_metric_liking = 0.0
        Done = False
        epoch = 0
        while epoch < FLAGS.max_epochs and not Done:
            batch = 1
            epoch += 1
            for minibatch in train_reader.iter():
                x, y = minibatch
                _, l, m_arousal, m_valence, m_liking = sess.run(
                    [train_op, loss_op, metric_arousal, metric_valence, metric_liking],
                    feed_dict={
                        train_model.input: x,
                        labels: y,
                        # NOTE(review): 120 is hard-coded as every sequence's
                        # length; confirm it matches the training data.
                        train_model.sequence_length: [120] * FLAGS.batch_size,
                        train_model.batch_size: FLAGS.batch_size
                    })
                print('Epoch: %5d/%5d -- batch: %5d -- loss: %.4f'
                      % (epoch, FLAGS.max_epochs, batch, l))

                # NOTE(review): the source had lost its indentation; the log
                # write is assumed to belong to this every-third-batch branch.
                if batch % 3 == 0:
                    print('arousal: %.4f -- valence: %.4f, liking: %.4f'
                          % (m_arousal, m_valence, m_liking))
                    with open(LOGGING_PATH, 'a') as log:
                        log.write('%s, %6d, %.5f, %.5f, %.5f, %.5f, \n'
                                  % ('train', epoch * batch, l, m_arousal, m_valence, m_liking))

                if batch % 14 == 0:
                    print('evaluation process------------------------------------------')
                    eval_metric = []
                    cnt = 0
                    prev = None
                    for mb in eval_reader.iter():
                        eval_x_list, eval_y_list, eval_z_list = mb
                        # Predict chunk by chunk and stack the chunk predictions.
                        for eval_x, eval_z in zip(eval_x_list, eval_z_list):
                            cnt += np.sum(eval_z)
                            eval_tmp_preds = sess.run([predictions], feed_dict={
                                train_model.input: eval_x,
                                train_model.sequence_length: eval_z,
                                train_model.batch_size: FLAGS.batch_size_eval
                            })
                            if prev is None:
                                prev = eval_tmp_preds[0]
                            else:
                                prev = np.vstack((prev, eval_tmp_preds[0]))

                        # Keep only the first `cnt` rows (cnt = sum of the fed
                        # sequence lengths) of predictions and labels.
                        prev = prev[:cnt]
                        eval_y_list = np.array(eval_y_list).reshape([-1, 3])[:cnt]

                        # Fetch order must match the unpacking order; it used to
                        # be [arousal, liking, valence], which swapped the
                        # valence and liking scores.
                        e_arousal, e_valence, e_liking = sess.run(
                            [eval_arousal, eval_valence, eval_liking],
                            feed_dict={
                                eval_model.eval_predictions: prev,
                                eval_model.eval_labels: eval_y_list
                            })
                        eval_metric.append([e_arousal, e_valence, e_liking])
                        prev = None
                        cnt = 0

                    eval_res = np.mean(np.array(eval_metric), axis=0)
                    eval_loss = np.sum(1. - eval_res)
                    print('Epoch: %5d/%5d -- batch: %5d -- loss: %.4f -- arousal: %.4f -- valence: %.4f -- liking: %.4f'
                          % (epoch, FLAGS.max_epochs, batch, eval_loss,
                             eval_res[0], eval_res[1], eval_res[2]))
                    # NOTE(review): evaluation rows carry the same 'train' tag
                    # as training rows; confirm whether that is intended.
                    with open(LOGGING_PATH, 'a') as log:
                        log.write('%s, %6d, %.5f, %.5f, %.5f, %.5f, \n'
                                  % ('train', epoch * batch, eval_loss,
                                     eval_res[0], eval_res[1], eval_res[2]))
                    print('done evaluation------------------------------------------\n')

                # Disabled evaluation / early-stopping variant, kept as a bare
                # (no-op) string.
                '''
                if batch % 10 == 0:
                    print('evaluation process------------------------------------------')
                    metr = []
                    eval_loss = 0.0
                    cnt = 0
                    for mb in eval_reader.iter():
                        eval_x, eval_y = mb
                        cnt += 1
                        l_e, me_arousal, me_valence, me_liking = sess.run(
                            [loss_op, metric_arousal, metric_valence, metric_liking],
                            feed_dict={
                                train_model.input: eval_x,
                                labels: eval_y
                            })
                        eval_loss += l_e
                        metr.append([me_arousal, m_valence, me_liking])
                    mean_metr = np.mean(np.array(metr), axis= 0)
                    eval_loss /= cnt
                    if mean_metr[0] > best_metric_arousal or mean_metr[1] > best_metric_valence \
                            or mean_metr[2] > best_metric_liking:
                        save_path = saver.save(sess, SAVE_PATH)
                        best_metric_arousal, best_metric_valence, best_metric_liking = mean_metr[0], \
                            mean_metr[1], mean_metr[2]
                        patience = FLAGS.patience
                        print('Model saved in file: %s' % save_path)
                    else:
                        patience -= 500
                    patience -= 500
                    if patience <= 0:
                        Done = True
                        break
                    print('Epoch: %5d/%5d -- batch: %5d -- loss: %.4f -- arousal: %.4f -- valence: %.4f -- liking: %.4f' %
                          (epoch, FLAGS.max_epochs, batch, eval_loss, mean_metr[0], mean_metr[1], mean_metr[2]))
                    log = open(LOGGING_PATH, 'a')
                    log.write('%s, %6d, %.5f, %.5f, %.5f, %.5f, \n' %
                              ('train', epoch * batch, eval_loss, mean_metr[0], mean_metr[1], mean_metr[2]))
                    log.close()
                    print('done evaluation------------------------------------------\n')
                '''
                batch += 1
def main(_):
    """Load a trained checkpoint and report loss and perplexity on the test split.

    Args:
        _: unused positional argument.

    Returns:
        -1 when no checkpoint is given, when the checkpoint's ``.index`` file
        does not exist, or when the test reader yields no batch; otherwise None.
    """
    if FLAGS.load_model is None:
        print('Please specify checkpoint file to load model from')
        return -1

    if not os.path.exists(FLAGS.load_model + ".index"):
        print('Checkpoint file not found', FLAGS.load_model)
        return -1

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized test dataset reader')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # Build the inference graph with dropout disabled.
        # NOTE(security): eval() executes whatever text the kernels /
        # kernel_features flags contain; only run with a trusted command line.
        with tf.variable_scope("Model"):
            m = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=eval(FLAGS.kernels),
                kernel_features=eval(FLAGS.kernel_features),
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0)
            m.update(model.loss_graph(m.logits, FLAGS.batch_size, FLAGS.num_unroll_steps))

            global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        saver = tf.train.Saver()
        saver.restore(session, FLAGS.load_model)
        print('Loaded model from', FLAGS.load_model,
              'saved at global step', global_step.eval())

        # Evaluation loop: carry the RNN state from one batch to the next and
        # accumulate the per-batch loss.
        rnn_state = session.run(m.initial_rnn_state)
        count = 0
        avg_loss = 0
        start_time = time.time()
        for x, y in test_reader.iter():
            count += 1
            loss, rnn_state = session.run([m.loss, m.final_rnn_state], {
                m.input: x,
                m.targets: y,
                m.initial_rnn_state: rnn_state
            })
            avg_loss += loss

        if count == 0:
            # Nothing was evaluated; the averages below would divide by zero.
            print('Test reader produced no batches, nothing to evaluate')
            return -1

        avg_loss /= count
        time_elapsed = time.time() - start_time

        print("test loss = %6.8f, perplexity = %6.8f" % (avg_loss, np.exp(avg_loss)))
        print("test samples:", count * FLAGS.batch_size,
              "time elapsed:", time_elapsed,
              "time per one batch:", time_elapsed / count)
def main(print):
    """Train the model, score the validation split after each epoch, and log results to CSV.

    The flag values are written to ``<train_dir>/train_parameters.csv`` and the
    per-epoch results to ``<train_dir>/train_results.csv``.  A checkpoint is
    saved after every epoch.  The learning rate is decayed whenever the
    validation loss fails to improve by ``FLAGS.decay_when``, and training stops
    once the rate drops below 1e-6.

    Args:
        print: callable used for every status message in this function.  The
            parameter name shadows the builtin ``print`` inside the body, so the
            caller decides where messages go.
    """
    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory' + FLAGS.train_dir)

    # CSV initialize
    df_train_params = pd.DataFrame(FLAGS.flag_values_dict(), index=range(1))
    df_train_params['comment'] = ''
    df_train_params.to_csv(FLAGS.train_dir + '/train_parameters.csv')
    epochs_results = initialize_epoch_data_dict()

    fasttext_model_path = FLAGS.fasttext_model_path or None

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length, words_list, wers, acoustics = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length,
                  num_unroll_steps=FLAGS.num_unroll_steps,
                  eos=FLAGS.EOS, batch_size=FLAGS.batch_size)

    word_vocab_valid, char_vocab_valid, word_tensors_valid, char_tensors_valid, max_word_length_valid, \
        words_list_valid, wers_valid, acoustics_valid, files_name_valid, kaldi_sents_index_valid = \
        load_test_data(FLAGS.data_dir, FLAGS.max_word_length,
                       num_unroll_steps=FLAGS.num_unroll_steps,
                       eos=FLAGS.EOS, datas=['valid'])

    # The fastText readers exist only when a fastText embedding is selected;
    # otherwise they stay None and no `input2` feed is built.
    fasttext_model = None
    train_ft_reader = None
    valid_ft_reader = None
    if 'fasttext' in FLAGS.embedding:
        fasttext_model = FasttextModel(
            fasttext_path=fasttext_model_path).get_fasttext_model()

        train_ft_reader = DataReaderFastText(
            words_list=words_list,
            batch_size=FLAGS.batch_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            model=fasttext_model,
            data='train',
            acoustics=acoustics)

        # NOTE(review): this reader uses words_list / acoustics from load_data,
        # while valid_reader below uses the load_test_data tensors; confirm the
        # two stay aligned batch for batch.
        valid_ft_reader = DataReaderFastText(
            words_list=words_list,
            batch_size=FLAGS.batch_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            model=fasttext_model,
            data='valid',
            acoustics=acoustics)

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps,
                              wers['train'])

    valid_reader = TestDataReader(word_tensors_valid['valid'],
                                  char_tensors_valid['valid'],
                                  FLAGS.batch_size, FLAGS.num_unroll_steps,
                                  wers_valid['valid'],
                                  files_name_valid['valid'],
                                  kaldi_sents_index_valid['valid'])

    print('initialized all dataset readers')

    def paired_batches(reader, ft_reader):
        """Yield (batch, fasttext_batch) pairs; fasttext_batch is None without a fastText reader."""
        if ft_reader is None:
            for batch in reader.iter():
                yield batch, None
        else:
            for pair in zip(reader.iter(), ft_reader.iter()):
                yield pair

    results_csv = FLAGS.train_dir + '/train_results.csv'

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # Build the training graph.
        # NOTE(security): eval() executes whatever text the kernels /
        # kernel_features flags contain; only run with a trusted command line.
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=eval(FLAGS.kernels),
                kernel_features=eval(FLAGS.kernel_features),
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=FLAGS.dropout,
                embedding=FLAGS.embedding,
                fasttext_word_dim=300,
                acoustic_features_dim=4)
            train_model.update(
                model.loss_graph(train_model.logits, FLAGS.batch_size))

            # scaling loss by FLAGS.num_unroll_steps effectively scales gradients by the same factor.
            # we need it to reproduce how the original Torch code optimizes. Without this, our gradients will be
            # much smaller (i.e. 35 times smaller) and to get system to learn we'd have to scale learning rate and max_grad_norm appropriately.
            # Thus, scaling gradients so that this trainer is exactly compatible with the original
            train_model.update(
                model.training_graph(train_model.loss * FLAGS.num_unroll_steps,
                                     FLAGS.learning_rate, FLAGS.max_grad_norm))

        # Validation graph: shares parameters with the training graph, no dropout.
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                char_vocab_size=char_vocab_valid.size,
                word_vocab_size=word_vocab_valid.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=eval(FLAGS.kernels),
                kernel_features=eval(FLAGS.kernel_features),
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0.0,
                embedding=FLAGS.embedding,
                fasttext_word_dim=300,
                acoustic_features_dim=4)
            valid_model.update(
                model.loss_graph(valid_model.logits, FLAGS.batch_size))

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        if FLAGS.load_model_for_training:
            # delete last layers (softmax) - SimpleLinear/Matrix + Bias
            # NOTE(review): indices 29 and 30 are hard-coded positions in the
            # global-variables collection; confirm they still match the graph.
            variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            subset_graph_for_loading = variables[:29] + variables[31:]
            loader = tf.train.Saver(max_to_keep=50,
                                    var_list=subset_graph_for_loading)
        saver = tf.train.Saver(max_to_keep=50)

        if FLAGS.load_model_for_training:
            loader.restore(session, FLAGS.load_model_for_training)
            print('Loaded model from' + str(FLAGS.load_model_for_training) +
                  'saved at global step' + str(train_model.global_step.eval()))
            # The two variables excluded from the restore are initialised fresh.
            session.run(tf.variables_initializer(var_list=variables[29:31]))
            print('initialized specific scope for fresh model. Size:' +
                  str(model.model_size()))
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            print('Created and initialized fresh model. Size:' +
                  str(model.model_size()))

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        # ---- training starts here ----
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for (x, y), batch_ft in paired_batches(train_reader, train_ft_reader):
                count += 1
                start_time = time.time()

                feed = {
                    train_model.input: x,
                    train_model.targets: y,
                    train_model.initial_rnn_state: rnn_state
                }
                if batch_ft is not None:
                    feed[train_model.input2] = batch_ft

                loss, _, rnn_state, _gradient_norm, step, _, _logits = session.run(
                    [
                        train_model.loss, train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm, train_model.global_step,
                        train_model.clear_char_embedding_padding,
                        train_model.logits
                    ], feed)

                # Exponential moving average of the training loss.
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    print('%6d: %d [%5d/%5d], train_loss = %6.8f secs/batch = %.4fs'
                          % (step, epoch, count, train_reader.length, loss,
                             time_elapsed))

            print('Epoch training time:' + str(time.time() - epoch_start_time))
            epochs_results['epoch_training_time'].append(
                str(time.time() - epoch_start_time))

            # epoch done: time to evaluate
            avg_valid_loss = 0.
            labels = []
            predictions = []
            files_name_list = []
            kaldi_sents_index_list = []
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for batch_kim, batch_ft in paired_batches(valid_reader, valid_ft_reader):
                x, y, files_name_batch, kaldi_sents_index_batch = batch_kim
                count += 1
                start_time = time.time()

                feed = {
                    valid_model.input: x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: rnn_state,
                }
                if batch_ft is not None:
                    feed[valid_model.input2] = batch_ft

                loss, logits = session.run(
                    [valid_model.loss, valid_model.logits], feed)

                labels.append(y)
                predictions.append(logits)
                files_name_list.append(files_name_batch)
                kaldi_sents_index_list.append(kaldi_sents_index_batch)
                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f" % (loss))

            avg_valid_loss = get_valid_rescore_loss(labels, predictions,
                                                    files_name_list,
                                                    kaldi_sents_index_list)

            print("at the end of epoch:" + str(epoch))
            epochs_results['epoch_number'].append(str(epoch))
            print("train loss = %6.8f" % (avg_train_loss))
            epochs_results['train_loss'].append(avg_train_loss)
            print("validation loss = %6.8f" % (avg_valid_loss))
            epochs_results['validation_loss'].append(avg_valid_loss)

            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model' + str(save_as))
            epochs_results['model_name'].append(str(save_as))

            current_learning_rate = session.run(train_model.learning_rate)
            epochs_results['learning_rate'].append(str(current_learning_rate))

            # Persist the results now, so an interrupted run or the early stop
            # below still leaves this epoch's row on disk.
            pd.DataFrame(epochs_results).to_csv(results_csv)

            # Decide whether the learning rate needs to decay.
            if best_valid_loss is not None and avg_valid_loss > best_valid_loss - FLAGS.decay_when:
                print('validation perplexity did not improve enough, decay learning rate')
                print('learning rate was:' + str(current_learning_rate))
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-6:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:' + str(current_learning_rate))
            else:
                best_valid_loss = avg_valid_loss

            # Write out summary events.
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss", simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss", simple_value=avg_valid_loss),
                tf.Summary.Value(tag="learning_rate",
                                 simple_value=current_learning_rate)
            ])
            summary_writer.add_summary(summary, step)

        # Save model performance data
        pd.DataFrame(epochs_results).to_csv(results_csv)
def main(_):
    """Train the word-level model, validate after every epoch, then score the test split.

    A checkpoint and a summary event are written after every epoch.  The
    learning rate is decayed when validation perplexity fails to improve by
    ``FLAGS.decay_when``, and training stops once the rate drops below 1e-5.
    After training, the test split is evaluated with the final weights and the
    best train/validation figures are printed next to the test result.

    Args:
        _: unused positional argument.
    """
    # Best epoch so far as [train_loss, train_ppl, valid_loss, valid_ppl].
    # (Previously a list named `min`, which shadowed the builtin, seeded with 1000.)
    best = [float('inf')] * 4
    total_time = 0.

    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory', FLAGS.train_dir)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = load_data(
        FLAGS.data_dir, FLAGS.max_word_length, flist=FILE_NAME_LIST, eos=FLAGS.EOS)

    # FILE_NAME_LIST entries 0 / 1 / 2 are used as train / valid / test here.
    train_reader = DataReader(word_tensors[FILE_NAME_LIST[0]],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors[FILE_NAME_LIST[1]],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    test_reader = DataReader(word_tensors[FILE_NAME_LIST[2]],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized all dataset readers')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # Build the training graph.
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                word_vocab_size=word_vocab.size,
                word_embed_size=FLAGS.word_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=FLAGS.dropout)
            train_model.update(
                model.loss_graph(train_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

            # scaling loss by FLAGS.num_unroll_steps effectively scales gradients by the same factor.
            # we need it to reproduce how the original Torch code optimizes. Without this, our gradients will be
            # much smaller (i.e. 35 times smaller) and to get system to learn we'd have to scale learning rate and max_grad_norm appropriately.
            # Thus, scaling gradients so that this trainer is exactly compatible with the original
            train_model.update(
                model.training_graph(train_model.loss * FLAGS.num_unroll_steps,
                                     FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        saver = tf.train.Saver(max_to_keep=5)

        # Validation / test graph: shares parameters with the training graph,
        # dropout disabled.
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                word_vocab_size=word_vocab.size,
                word_embed_size=FLAGS.word_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0.0)
            valid_model.update(
                model.loss_graph(valid_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model,
                  'saved at global step', train_model.global_step.eval())
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            print('Created and initialized fresh model. Size:', model.model_size())

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        # Print one line per trainable variable plus the total parameter count.
        print("=" * 89)
        print("=" * 89)
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0
        for pi, v in enumerate(all_weights.values(), 1):
            v_size = int(np.prod(np.array(v.shape.as_list())))
            print("%02d-Weight   %s\tshape    %s\ttsize    %d"
                  % (pi, v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size))
            total_size += v_size
        # The MiB figure counts 4 bytes per parameter.
        print("Total size %d, %.3fMiB"
              % (total_size, (total_size * 4) / (1024.0 * 1024.0)))
        print("-" * 89)

        # ---- training starts here ----
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(1, FLAGS.max_epochs + 1):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    [
                        train_model.loss, train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm, train_model.global_step,
                        train_model.clear_char_embedding_padding
                    ], {
                        train_model.input: x,
                        train_model.targets: y,
                        train_model.initial_rnn_state: rnn_state
                    })

                # Exponential moving average of the training loss.
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    cur_lr = session.run(train_model.learning_rate)
                    # time_elapsed covers exactly one batch, so the rate is
                    # 1 / time_elapsed.  It used to be print_every /
                    # time_elapsed, overstating throughput by print_every.
                    batches_per_sec = (1.0 / time_elapsed
                                       if time_elapsed > 0 else float('inf'))
                    print(
                        '%6d: -%d- [%5d/%5d], train_loss/ppl = %6.8f/%6.7f batch/secs = %.1fb/s, cur_lr = %2.5f, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), batches_per_sec, cur_lr, gradient_norm))

            print('Epoch training time:', time.time() - epoch_start_time)
            total_time += (time.time() - epoch_start_time)

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1
                start_time = time.time()

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state], {
                        valid_model.input: x,
                        valid_model.targets: y,
                        valid_model.initial_rnn_state: rnn_state,
                    })

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f"
                          % (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f"
                  % (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f"
                  % (avg_valid_loss, np.exp(avg_valid_loss)))

            if best[2] > avg_valid_loss:
                best = [avg_train_loss, np.exp(avg_train_loss),
                        avg_valid_loss, np.exp(avg_valid_loss)]

            # NOTE(review): the source had lost its indentation; a checkpoint is
            # assumed to be written every epoch, not only on improvement.
            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            # Write out summary events.
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss", simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss", simple_value=avg_valid_loss)
            ])
            summary_writer.add_summary(summary, step)

            # Decide whether the learning rate needs to decay.
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(
                    best_valid_loss) - FLAGS.decay_when:
                print('validation perplexity did not improve enough, decay learning rate')
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss

        # Test on the test set with the weights as they are after training
        # (not necessarily those of the best validation epoch).
        ave_test_loss = 0.
        trnn_state = session.run(valid_model.initial_rnn_state)
        for x, y in test_reader.iter():
            loss, trnn_state = session.run(
                [valid_model.loss, valid_model.final_rnn_state], {
                    valid_model.input: x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: trnn_state
                })
            ave_test_loss += loss / test_reader.length

        print("=" * 89)
        print("=" * 89)
        print("Total training time(not included the valid time): %f" % total_time)
        print("The best result:")
        print("train loss = %.3f, ppl = %.4f" % (best[0], best[1]))
        print("valid loss = %.3f, ppl = %.4f" % (best[2], best[3]))
        print("test  loss = %.3f, ppl = %.4f" % (ave_test_loss, np.exp(ave_test_loss)))
        print("=" * 89)
def main(print):
    """Load a trained checkpoint, evaluate it on the test split and record the result.

    The averaged loss, the batch count and the elapsed time are printed and
    passed to ``save_data_to_csv``.

    Args:
        print: callable used for every status message in this function.  The
            parameter name shadows the builtin ``print`` inside the body.

    Returns:
        -1 when no checkpoint is given, when the checkpoint's ``.index`` file
        does not exist, or when the test reader yields no batch; otherwise None.
    """
    if FLAGS.load_model_for_test is None:
        print('Please specify checkpoint file to load model from')
        return -1

    if not os.path.exists(FLAGS.load_model_for_test + ".index"):
        print('Checkpoint file not found', FLAGS.load_model_for_test)
        return -1

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length, words_list = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    fasttext_model_path = FLAGS.fasttext_model_path or None

    # The fastText reader exists only when a fastText embedding is selected;
    # otherwise it stays None and no `input2` feed is built (it used to be
    # referenced unconditionally, which raised NameError in that case).
    test_ft_reader = None
    if 'fasttext' in FLAGS.embedding:
        fasttext_model = FasttextModel(
            fasttext_path=fasttext_model_path).get_fasttext_model()

        test_ft_reader = DataReaderFastText(
            words_list=words_list,
            batch_size=FLAGS.batch_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            model=fasttext_model,
            data='test')

    print('initialized test dataset reader')

    def paired_batches(reader, ft_reader):
        """Yield (batch, fasttext_batch) pairs; fasttext_batch is None without a fastText reader."""
        if ft_reader is None:
            for batch in reader.iter():
                yield batch, None
        else:
            for pair in zip(reader.iter(), ft_reader.iter()):
                yield pair

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # Build the inference graph with dropout disabled.
        # NOTE(security): eval() executes whatever text the kernels /
        # kernel_features flags contain; only run with a trusted command line.
        with tf.variable_scope("Model"):
            m = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=eval(FLAGS.kernels),
                kernel_features=eval(FLAGS.kernel_features),
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0,
                embedding=FLAGS.embedding,
                fasttext_word_dim=300,
                acoustic_features_dim=4)
            m.update(model.loss_graph(m.logits, FLAGS.batch_size,
                                      FLAGS.num_unroll_steps))

            global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        saver = tf.train.Saver()
        saver.restore(session, FLAGS.load_model_for_test)
        print('Loaded model from' + str(FLAGS.load_model_for_test) +
              'saved at global step' + str(global_step.eval()))

        # Evaluation loop: carry the RNN state from one batch to the next and
        # accumulate the per-batch loss.
        rnn_state = session.run(m.initial_rnn_state)
        count = 0
        avg_loss = 0
        start_time = time.time()
        for (x, y), batch_ft in paired_batches(test_reader, test_ft_reader):
            count += 1

            feed = {
                m.input: x,
                m.targets: y,
                m.initial_rnn_state: rnn_state
            }
            if batch_ft is not None:
                feed[m.input2] = batch_ft

            loss, rnn_state, _logits = session.run(
                [m.loss, m.final_rnn_state, m.logits], feed)

            avg_loss += loss

        if count == 0:
            # Nothing was evaluated; the averages below would divide by zero.
            print('Test reader produced no batches, nothing to evaluate')
            return -1

        avg_loss /= count
        time_elapsed = time.time() - start_time

        print("test loss = %6.8f, perplexity = %6.8f" % (avg_loss, np.exp(avg_loss)))
        print("test samples:" + str(count * FLAGS.batch_size) + "time elapsed:" +
              str(time_elapsed) + "time per one batch:" + str(time_elapsed / count))

        save_data_to_csv(avg_loss, count, time_elapsed)
def test(self):
    ''' Feeds fixed RNN outputs and softmax weights, then prints the loss.

    Builds a loss graph on top of self.model(), overrides the RNN output
    tensors and the softmax matrix/bias through the feed dict, prints the
    resulting loss value and then fails unconditionally (assert False).
    '''
    with self.test_session() as sess:
        m = self.model()
        loss = model.loss_graph(m.logits, batch_size=4, num_unroll_steps=3)

        # one array per unrolled step; each row has five values
        step_0 = np.array([
            [-0.00840133, 0.00178184, 0.00585286, 0.00937691, 0.00332699],
            [-0.00840504, 0.00177166, 0.00586006, 0.00935978, 0.00331423],
            [-0.00839551, 0.0017945, 0.00585306, 0.00938957, 0.00333546],
            [-0.00839595, 0.00178647, 0.0058777, 0.00935818, 0.00331012],
        ])
        step_1 = np.array([
            [-0.0126155, 0.00264827, 0.00886869, 0.01411371, 0.00518486],
            [-0.01261795, 0.00264249, 0.00887132, 0.01410431, 0.00517832],
            [-0.01261209, 0.00266095, 0.00885007, 0.01414492, 0.00520893],
            [-0.01261059, 0.00265393, 0.00888564, 0.01410206, 0.00517435],
        ])
        step_2 = np.array([
            [-0.01473925, 0.00305038, 0.01042287, 0.01648509, 0.00619734],
            [-0.01472822, 0.00307533, 0.01042284, 0.01650903, 0.00621392],
            [-0.01473146, 0.00307552, 0.01039554, 0.01653865, 0.00623778],
            [-0.01472719, 0.00307848, 0.01041825, 0.01651621, 0.00621954],
        ])
        rnn_outputs = [step_0, step_1, step_2]

        feed = {
            'LSTM/WordEmbedding/SimpleLinear/Matrix:0': SOFTMAX_W,
            'LSTM/WordEmbedding/SimpleLinear/Bias:0': SOFTMAX_B,
            loss.targets: Y
        }
        # pin every RNN output tensor to its fixed array
        feed.update(zip(m.rnn_outputs, rnn_outputs))

        loss_value = sess.run(loss.loss, feed)
        print(loss_value)

        '''
        [[-0.00115102 -0.01835673 0.01088401 0.00553839 -0.02548739
          0.00961501 -0.04911561 0.04094783 0.01729541 0.04113884
          0.0110002 0.03410089 -0.02663253 0.01714642 0.03581101
          -0.03634553 -0.01540088 -0.01764538 0.03884879 -0.03207963]
         [-0.00115117 -0.01835723 0.01088434 0.00553844 -0.02548673
          0.00961541 -0.04911538 0.04094752 0.01729532 0.04113849
          0.01100097 0.0341017 -0.02663185 0.01714566 0.03581182
          -0.03634511 -0.0154006 -0.01764595 0.03884758 -0.03208043]
         [-0.00115108 -0.01835609 0.01088368 0.00553811 -0.0254877
          0.0096147 -0.04911536 0.04094845 0.01729582 0.04113897
          0.01099989 0.03410037 -0.02663329 0.01714694 0.03581046
          -0.03634582 -0.01540092 -0.01764458 0.03884939 -0.03207891]
         [-0.0011517 -0.01835642 0.01088412 0.00553769 -0.02548616
          0.00961538 -0.0491141 0.0409487 0.01729641 0.04113809
          0.01100182 0.03410203 -0.02663257 0.01714548 0.03581202
          -0.03634498 -0.01540009 -0.01764486 0.03884656 -0.03208012]]
        [[-0.00137119 -0.01813851 0.01110794 0.00582019 -0.02566941
          0.00940851 -0.04911464 0.04097762 0.0171818 0.04152314
          0.01122282 0.0339342 -0.02648103 0.01748628 0.03570804
          -0.0365119 -0.01505298 -0.01722943 0.03911369 -0.03211264]
         [-0.00137125 -0.01813885 0.01110811 0.00582023 -0.02566908
          0.00940875 -0.04911457 0.04097738 0.0171817 0.04152295
          0.0112232 0.03393462 -0.02648065 0.01748586 0.03570845
          -0.03651169 -0.01505288 -0.01722979 0.03911307 -0.03211308]
         [-0.00137074 -0.01813789 0.01110745 0.00582038 -0.02567078
          0.0094078 -0.04911549 0.04097777 0.01718157 0.04152391
          0.01122116 0.03393264 -0.02648198 0.01748771 0.03570654
          -0.03651269 -0.01505364 -0.01722879 0.03911621 -0.03211133]
         [-0.00137166 -0.01813823 0.01110794 0.00581962 -0.02566858
          0.00940876 -0.04911353 0.04097832 0.01718257 0.04152259
          0.01122391 0.03393493 -0.02648121 0.01748567 0.03570866
          -0.03651157 -0.01505247 -0.01722896 0.03911219 -0.03211286]]
        [[-0.00148683 -0.01802572 0.01122714 0.00596118 -0.02575907
          0.00930692 -0.04911366 0.04099131 0.01712257 0.04171915
          0.01133703 0.033848 -0.02640488 0.01765947 0.03566255
          -0.03659378 -0.01487576 -0.0170169 0.03924932 -0.03213345]
         [-0.00148696 -0.01802452 0.01122649 0.00596062 -0.02575966
          0.00930636 -0.04911316 0.04099248 0.01712336 0.04171939
          0.01133645 0.033847 -0.02640637 0.01766046 0.03566148
          -0.03659437 -0.01487585 -0.01701534 0.03925047 -0.03213206]
         [-0.0014862 -0.01802442 0.01122626 0.00596135 -0.0257613
          0.00930568 -0.04911482 0.04099185 0.01712243 0.04172042
          0.0113344 0.03384538 -0.02640666 0.0176619 0.03566003
          -0.03659511 -0.01487673 -0.0170155 0.03925342 -0.03213113]
         [-0.00148684 -0.01802438 0.01122635 0.00596064 -0.02575997
          0.0093062 -0.04911336 0.04099251 0.01712332 0.04171955
          0.01133605 0.03384664 -0.02640661 0.01766078 0.03566112
          -0.03659455 -0.01487602 -0.0170152 0.03925106 -0.03213174]]
        '''

        assert False
def main(_):
    ''' Loads trained model and evaluates it on test split.

    Restores the checkpoint named by FLAGS.load_model, runs it over the test
    split and prints average loss, perplexity and timing.

    Args:
        _: ignored.

    Returns:
        -1 when no usable checkpoint was given or the test split produced no
        batches; None otherwise.
    '''
    if FLAGS.load_model is None:
        print('Please specify checkpoint file to load model from')
        return -1

    # tf.train.Saver's V2 format stores a checkpoint as "<prefix>.index" plus
    # data shards, so the prefix itself need not exist as a file. Accept both.
    checkpoint_found = (os.path.exists(FLAGS.load_model)
                        or os.path.exists(FLAGS.load_model + '.index'))
    if not checkpoint_found:
        print('Checkpoint file not found', FLAGS.load_model)
        return -1

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized test dataset reader')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        ''' build inference graph '''
        with tf.variable_scope("Model"):
            m = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                # NOTE(review): eval() executes whatever text the flag holds;
                # acceptable only while flags come from a trusted operator.
                kernels=eval(FLAGS.kernels),
                kernel_features=eval(FLAGS.kernel_features),
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0)
            m.update(model.loss_graph(m.logits, FLAGS.batch_size,
                                      FLAGS.num_unroll_steps))

            # NOTE(review): kept inside the "Model" scope so the variable name
            # carries the scope prefix - confirm it matches the checkpoint.
            global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        saver = tf.train.Saver()
        saver.restore(session, FLAGS.load_model)
        print('Loaded model from', FLAGS.load_model,
              'saved at global step', global_step.eval())

        ''' evaluation starts here '''
        rnn_state = session.run(m.initial_rnn_state)
        count = 0
        avg_loss = 0
        start_time = time.time()
        for x, y in test_reader.iter():
            count += 1
            # the final state is carried over into the next batch
            loss, rnn_state = session.run([
                m.loss,
                m.final_rnn_state
            ], {
                m.input: x,
                m.targets: y,
                m.initial_rnn_state: rnn_state
            })
            avg_loss += loss

        if count == 0:
            # nothing was evaluated; avoid dividing by zero below
            print('No test batches were produced; nothing to evaluate')
            return -1

        avg_loss /= count
        time_elapsed = time.time() - start_time

        print("test loss = %6.8f, perplexity = %6.8f" %
              (avg_loss, np.exp(avg_loss)))
        print("test samples:", count * FLAGS.batch_size,
              "time elapsed:", time_elapsed,
              "time per one batch:", time_elapsed / count)
def main(_):
    ''' Trains model from data.

    Builds a training graph and a parameter-sharing validation graph, then
    for each epoch: trains over the train split, evaluates on the validation
    split, saves a checkpoint when validation perplexity reaches a new
    minimum (or every 4th epoch), writes summaries and decays the learning
    rate when validation perplexity stops improving by FLAGS.decay_when.
    Progress is printed and mirrored to "train_log.txt".

    Args:
        _: ignored.
    '''
    if not os.path.exists(FLAGS.train_dir):
        # makedirs also creates missing parent directories
        os.makedirs(FLAGS.train_dir)
        print('Created training directory', FLAGS.train_dir)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)

    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)

    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized all dataset readers')

    minimum_valid_ppl = 1000000
    minimum_vl_epoch = 0

    # the log file is a context manager too, so it is closed on every exit
    # path (normal end, early break, exception)
    with open("train_log.txt", "w") as text_file, \
            tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # arguments shared by the training and the validation graph
        graph_args = dict(
            char_vocab_size=char_vocab.size,
            word_vocab_size=word_vocab.size,
            char_embed_size=FLAGS.char_embed_size,
            batch_size=FLAGS.batch_size,
            num_highway_layers=FLAGS.highway_layers,
            num_rnn_layers=FLAGS.rnn_layers,
            rnn_size=FLAGS.rnn_size,
            max_word_length=max_word_length,
            # NOTE(review): eval() executes whatever text the flag holds;
            # acceptable only while flags come from a trusted operator.
            kernels=eval(FLAGS.kernels),
            kernel_features=eval(FLAGS.kernel_features),
            num_unroll_steps=FLAGS.num_unroll_steps)

        ''' build training graph '''
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(dropout=FLAGS.dropout,
                                                **graph_args)
            train_model.update(
                model.loss_graph(train_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

            # scaling loss by FLAGS.num_unroll_steps effectively scales gradients by the same factor.
            # we need it to reproduce how the original Torch code optimizes. Without this, our gradients will be
            # much smaller (i.e. 35 times smaller) and to get system to learn we'd have to scale learning rate and
            # max_grad_norm appropriately. Thus, scaling gradients so that this trainer is exactly compatible
            # with the original
            train_model.update(
                model.training_graph(
                    train_model.loss * FLAGS.num_unroll_steps,
                    FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        saver = tf.train.Saver(max_to_keep=10)

        def save_checkpoint(epoch, valid_loss):
            ''' Saves the session under train_dir, named by epoch and loss. '''
            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                   valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

        ''' build graph for validation and testing (shares parameters with the training graph!) '''
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(dropout=0.0, **graph_args)
            valid_model.update(
                model.loss_graph(valid_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model,
                  'saved at global step', train_model.global_step.eval())
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            print('Created and initialized fresh model. Size:',
                  model.model_size())

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=session.graph)

        ''' take learning rate from CLI, not from saved graph '''
        session.run(
            tf.assign(train_model.learning_rate, FLAGS.learning_rate), )

        ''' training starts here '''
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    [
                        train_model.loss,
                        train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm,
                        train_model.global_step,
                        train_model.clear_char_embedding_padding
                    ], {
                        train_model.input: x,
                        train_model.targets: y,
                        train_model.initial_rnn_state: rnn_state
                    })

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    progress = (
                        '%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), time_elapsed, gradient_norm))
                    print(progress)
                    text_file.write(progress + ' \n')

            print('Epoch training time:', time.time() - epoch_start_time)

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state], {
                        valid_model.input: x,
                        valid_model.targets: y,
                        valid_model.initial_rnn_state: rnn_state,
                    })

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))

            text_file.write("at the end of epoch:" + str(epoch) + '\n')
            text_file.write("train loss = %6.8f, perplexity = %6.8f \n" %
                            (avg_train_loss, np.exp(avg_train_loss)))
            text_file.write("validation loss = %6.8f, perplexity = %6.8f \n" %
                            (avg_valid_loss, np.exp(avg_valid_loss)))

            # checkpoint on a new minimum, otherwise on every 4th epoch
            valid_ppl = np.exp(avg_valid_loss)
            is_new_minimum = valid_ppl < minimum_valid_ppl
            if is_new_minimum:
                minimum_valid_ppl = valid_ppl
                minimum_vl_epoch = epoch
            if is_new_minimum or epoch % 4 == 0:
                save_checkpoint(epoch, avg_valid_loss)

            ''' write out summary events '''
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss",
                                 simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss",
                                 simple_value=avg_valid_loss)
            ])
            summary_writer.add_summary(summary, step)

            ''' decide if need to decay learning rate '''
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(
                    best_valid_loss) - FLAGS.decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss

        # NOTE(review): assumed to run once after the epoch loop (checkpoint
        # of the final state plus a summary) - confirm the intended placement.
        save_checkpoint(epoch, avg_valid_loss)
        print("----------------------------------------------")
        print(
            "Minimum Valid PPL is attained in epoch:%d and Validation PPL is %6.8f"
            % (minimum_vl_epoch, minimum_valid_ppl))
def main():
    ''' Trains the tagging model and tracks the best validation F-score.

    Loads pretrained embeddings and the train / validate / test data sets,
    builds one training graph plus parameter-sharing validation and test
    graphs, then for each epoch: runs a training epoch, scores the validation
    split, and - whenever the validation F-score improves - scores the test
    split and saves a checkpoint to FLAGS.checkpoint_path. The learning rate
    is re-assigned after every epoch from FLAGS.learning_rate and
    FLAGS.decay_rate.
    '''
    pretrain_word2id, pretrain_id2word, pretrain_emb = reader.load_pretrain(
        FLAGS.pretrain_path,
        [FLAGS.train_path, FLAGS.validate_path, FLAGS.test_path])
    vocabs = reader.build_vocab(FLAGS.train_path)

    traindata = reader.DataSet(FLAGS.train_path, FLAGS.max_word_len,
                               pretrain_word2id, pretrain_id2word,
                               pretrain_emb, vocabs)
    traindata.load_data()
    validate = reader.DataSet(FLAGS.validate_path, FLAGS.max_word_len,
                              pretrain_word2id, pretrain_id2word,
                              pretrain_emb, vocabs)
    validate.load_data()
    test = reader.DataSet(FLAGS.test_path, FLAGS.max_word_len,
                          pretrain_word2id, pretrain_id2word,
                          pretrain_emb, vocabs)
    test.load_data()

    # every sequence in a batch is given the full num_steps length
    seq_lens = FLAGS.num_steps * np.ones(FLAGS.batch_size)

    def build_graph(dataset, dropout):
        ''' Builds the inference graph sized from `dataset` and adds its loss. '''
        graph = model.inference_graph(
            char_vocab_size=len(dataset.char2id),
            pretrain_embedding=dataset.pretrain_emb,
            max_word_len=FLAGS.max_word_len,
            ntags=len(dataset.tag2id),
            batch_size=FLAGS.batch_size,
            num_steps=FLAGS.num_steps,
            char_emb_size=FLAGS.char_emb_size,
            lstm_state_size=FLAGS.lstm_state_size,
            num_rnn_layers=FLAGS.num_rnn_layers,
            dropout=dropout,
            filter_sizes=[FLAGS.filter_size],
            nfilters=[FLAGS.nfilter])
        graph.update(model.loss_graph(graph.logits, FLAGS.batch_size,
                                      FLAGS.num_steps, FLAGS.crf, seq_lens))
        return graph

    with tf.Graph().as_default(), tf.Session() as sess:

        def score(dataset, graph):
            ''' Scores `dataset` with the evaluator matching FLAGS.crf. '''
            if FLAGS.crf:
                return crf_eval(sess, dataset, graph, FLAGS.batch_size,
                                FLAGS.num_steps, FLAGS.eval_path,
                                FLAGS.eval_script_path)
            return evaluate(sess, dataset, graph, FLAGS.batch_size,
                            FLAGS.num_steps, FLAGS.eval_path)

        with tf.variable_scope("Model"):
            train_model = build_graph(traindata, FLAGS.dropout)
            train_model.update(
                model.training_graph(train_model.loss * FLAGS.num_steps,
                                     FLAGS.learning_rate,
                                     FLAGS.max_grad_norm))

        saver = tf.train.Saver()

        '''Validate model'''
        with tf.variable_scope("Model", reuse=True):
            # No dropout when testing!
            validate_model = build_graph(validate, 0)
            validate_model.update(model.adict(name="validation"))

        '''Test model'''
        with tf.variable_scope("Model", reuse=True):
            test_model = build_graph(test, 0)
            test_model.update(model.adict(name="test"))

        init_op = tf.global_variables_initializer()
        sess.run(init_op)

        # fetched once; the same initial states are handed to every epoch
        lstm_state_fw = sess.run(train_model.initial_lstm_state_fw)
        lstm_state_bw = sess.run(train_model.initial_lstm_state_bw)

        # Single-argument print(...) with %-formatting emits the same text
        # under the Python 2 print statement and the Python 3 function.
        print("Start Training...")
        current_best_Fscore = 0.0
        for epoch in range(FLAGS.total_epoch):
            print("epoch %s" % (epoch,))
            start_time = time.time()
            run_epoch(sess, traindata, train_model, lstm_state_fw,
                      lstm_state_bw, FLAGS.batch_size, FLAGS.num_steps)

            Fscore = score(validate, validate_model)

            if Fscore > current_best_Fscore:
                current_best_Fscore = Fscore
                print("**Results on test set with current best F: %s" %
                      (current_best_Fscore,))
                # use the same evaluator as for validation: the test graph
                # was built with the same FLAGS.crf setting
                score(test, test_model)
                saver.save(sess, FLAGS.checkpoint_path)
                print("Model saved!")

            # inverse-time decay, applied from the base rate each epoch
            new_learning_rate = FLAGS.learning_rate / (
                1 + FLAGS.decay_rate * (epoch + 1))
            sess.run(train_model.learning_rate.assign(new_learning_rate))
            end_time = time.time()
            print("Epoch training time: %s" % (end_time - start_time,))