def check_training(self, model_type, label_type):
    """Smoke-test training of an attention-based model on synthetic data.

    Builds the graph with explicit placeholders, trains for up to 400 steps
    on a tiny generated batch, and periodically reports loss/LER with a
    sample decode. Stops early once LER has not improved for 5 evaluations.

    Args:
        model_type: name of the attention model variant under test.
        label_type: 'character' for character labels, otherwise phone labels.
    """
    print('----- ' + model_type + ', ' + label_type + ' -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels, inputs_seq_len, labels_seq_len = generate_data(
            label_type=label_type,
            model='attention',
            batch_size=batch_size)

        # Define placeholders
        inputs_pl = tf.placeholder(tf.float32,
                                   shape=[batch_size, None, inputs.shape[-1]],
                                   name='input')
        # `[batch_size, max_time]`
        labels_pl = tf.placeholder(tf.int32, shape=[None, None], name='label')

        # These are prepared for computing LER
        indices_true_pl = tf.placeholder(tf.int64, name='indices')
        values_true_pl = tf.placeholder(tf.int32, name='values')
        shape_true_pl = tf.placeholder(tf.int64, name='shape')
        labels_st_true_pl = tf.SparseTensor(indices_true_pl,
                                            values_true_pl,
                                            shape_true_pl)
        indices_pred_pl = tf.placeholder(tf.int64, name='indices')
        values_pred_pl = tf.placeholder(tf.int32, name='values')
        shape_pred_pl = tf.placeholder(tf.int64, name='shape')
        labels_st_pred_pl = tf.SparseTensor(indices_pred_pl,
                                            values_pred_pl,
                                            shape_pred_pl)
        inputs_seq_len_pl = tf.placeholder(tf.int32, shape=[None],
                                           name='inputs_seq_len')
        labels_seq_len_pl = tf.placeholder(tf.int32, shape=[None],
                                           name='labels_seq_len')
        keep_prob_input_pl = tf.placeholder(tf.float32,
                                            name='keep_prob_input')
        keep_prob_hidden_pl = tf.placeholder(tf.float32,
                                             name='keep_prob_hidden')

        # Define model graph (+2 for <SOS> and <EOS> tokens)
        output_size = 26 + 2 if label_type == 'character' else 61 + 2
        # model = load(model_type=model_type)
        network = BLSTMAttetion(batch_size=batch_size,
                                input_size=inputs[0].shape[1],
                                encoder_num_unit=256,
                                encoder_num_layer=2,
                                attention_dim=128,
                                decoder_num_unit=256,
                                decoder_num_layer=1,
                                embedding_dim=20,
                                output_size=output_size,
                                sos_index=output_size - 2,
                                eos_index=output_size - 1,
                                max_decode_length=50,
                                attention_weights_tempareture=1,
                                logits_tempareture=1,
                                parameter_init=0.1,
                                clip_grad=5.0,
                                clip_activation_encoder=50,
                                clip_activation_decoder=50,
                                dropout_ratio_input=1.0,
                                dropout_ratio_hidden=1.0,
                                weight_decay=0,
                                beam_width=0,
                                time_major=False)

        # Add to the graph each operation
        loss_op, logits, decoder_outputs_train, decoder_outputs_infer = network.compute_loss(
            inputs_pl,
            labels_pl,
            inputs_seq_len_pl,
            labels_seq_len_pl,
            keep_prob_input_pl,
            keep_prob_hidden_pl)
        learning_rate = 1e-3
        train_op = network.train(loss_op,
                                 optimizer='rmsprop',
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op_train, decode_op_infer = network.decoder(
            decoder_outputs_train,
            decoder_outputs_infer,
            decode_type='greedy',
            beam_width=1)
        ler_op = network.compute_ler(labels_st_true_pl, labels_st_pred_pl)
        # NOTE: fetched only to make sure the attention scores are part of
        # the built graph; the tensor itself is not evaluated in this test.
        attention_weights = decoder_outputs_infer.attention_scores

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict (shared across steps; dropout entries are toggled
        # for evaluation and restored afterwards)
        feed_dict = {
            inputs_pl: inputs,
            labels_pl: labels,
            inputs_seq_len_pl: inputs_seq_len,
            labels_seq_len_pl: labels_seq_len,
            keep_prob_input_pl: network.dropout_ratio_input,
            keep_prob_hidden_pl: network.dropout_ratio_hidden,
            network.lr: learning_rate
        }

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_pre = 1
            not_improved_count = 0
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(network.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode (disable dropout)
                    feed_dict[keep_prob_input_pl] = 1.0
                    feed_dict[keep_prob_hidden_pl] = 1.0

                    # Predict class ids
                    predicted_ids_train, predicted_ids_infer = sess.run(
                        [decode_op_train, decode_op_infer],
                        feed_dict=feed_dict)

                    # Compute accuracy
                    feed_dict_ler = {
                        labels_st_true_pl: list2sparsetensor(labels),
                        labels_st_pred_pl: list2sparsetensor(
                            predicted_ids_infer)
                    }
                    ler_train = sess.run(ler_op, feed_dict=feed_dict_ler)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / ler = %.4f (%.3f sec)' %
                          (step + 1, loss_train, ler_train, duration_step))
                    start_time_step = time.time()

                    # Visualize
                    if label_type == 'character':
                        print('True : %s' % num2alpha(labels[0]))
                        print('Pred (Training) : <%s' %
                              num2alpha(predicted_ids_train[0]))
                        print('Pred (Inference): <%s' %
                              num2alpha(predicted_ids_infer[0]))
                    else:
                        print('True : %s' % num2phone(labels[0]))
                        print('Pred (Training) : < %s' %
                              num2phone(predicted_ids_train[0]))
                        print('Pred (Inference): < %s' %
                              num2phone(predicted_ids_infer[0]))

                    if ler_train >= ler_train_pre:
                        not_improved_count += 1
                    else:
                        not_improved_count = 0
                    if not_improved_count >= 5:
                        print('Model is Converged.')
                        break
                    ler_train_pre = ler_train

                    # FIX: restore the configured dropout keep-probabilities
                    # for the following training steps. Previously they
                    # stayed pinned at 1.0 after the first evaluation, so
                    # all later training silently ran without dropout.
                    feed_dict[keep_prob_input_pl] = \
                        network.dropout_ratio_input
                    feed_dict[keep_prob_hidden_pl] = \
                        network.dropout_ratio_hidden

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
def check_training(self):
    """Smoke-test training of the multitask BLSTM-CTC model on synthetic data.

    Trains a character (main) + phone (second) multitask CTC network for up
    to 400 steps on a tiny generated batch, periodically printing loss,
    CER/PER, and a sample decode. Stops early once the character LER has not
    improved for 5 evaluations.
    """
    print('----- multitask -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels_true_char_st, labels_true_phone_st, inputs_seq_len = generate_data(
            label_type='multitask',
            model='ctc',
            batch_size=batch_size)

        # Define placeholders
        inputs_pl = tf.placeholder(tf.float32,
                                   shape=[None, None, inputs.shape[-1]],
                                   name='input')
        indices_pl = tf.placeholder(tf.int64, name='indices')
        values_pl = tf.placeholder(tf.int32, name='values')
        shape_pl = tf.placeholder(tf.int64, name='shape')
        labels_pl = tf.SparseTensor(indices_pl, values_pl, shape_pl)
        indices_second_pl = tf.placeholder(tf.int64, name='indices_second')
        values_second_pl = tf.placeholder(tf.int32, name='values_second')
        shape_second_pl = tf.placeholder(tf.int64, name='shape_second')
        labels_second_pl = tf.SparseTensor(indices_second_pl,
                                           values_second_pl,
                                           shape_second_pl)
        inputs_seq_len_pl = tf.placeholder(tf.int64, shape=[None],
                                           name='inputs_seq_len')
        keep_prob_input_pl = tf.placeholder(tf.float32,
                                            name='keep_prob_input')
        keep_prob_hidden_pl = tf.placeholder(tf.float32,
                                             name='keep_prob_hidden')

        # Define model graph
        output_size_main = 26
        output_size_second = 61
        network = Multitask_BLSTM_CTC(
            batch_size=batch_size,
            input_size=inputs[0].shape[1],
            num_unit=256,
            num_layer_main=2,
            num_layer_second=1,
            output_size_main=output_size_main,
            output_size_second=output_size_second,
            main_task_weight=0.8,
            parameter_init=0.1,
            clip_grad=5.0,
            clip_activation=50,
            dropout_ratio_input=1.0,
            dropout_ratio_hidden=1.0,
            num_proj=None,
            weight_decay=1e-6)

        # Add to the graph each operation
        loss_op, logits_main, logits_second = network.compute_loss(
            inputs_pl,
            labels_pl,
            labels_second_pl,
            inputs_seq_len_pl,
            keep_prob_input_pl,
            keep_prob_hidden_pl)
        learning_rate = 1e-3
        train_op = network.train(loss_op,
                                 optimizer='rmsprop',
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op_main, decode_op_second = network.decoder(
            logits_main,
            logits_second,
            inputs_seq_len_pl,
            decode_type='beam_search',
            beam_width=20)
        ler_op_main, ler_op_second = network.compute_ler(
            decode_op_main, decode_op_second,
            labels_pl, labels_second_pl)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict (shared across steps; dropout entries are toggled
        # for evaluation and restored afterwards)
        feed_dict = {
            inputs_pl: inputs,
            labels_pl: labels_true_char_st,
            labels_second_pl: labels_true_phone_st,
            inputs_seq_len_pl: inputs_seq_len,
            keep_prob_input_pl: network.dropout_ratio_input,
            keep_prob_hidden_pl: network.dropout_ratio_hidden,
            network.lr: learning_rate
        }

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_char_pre = 1
            not_improved_count = 0
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(network.clipped_grads, feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode (disable dropout)
                    feed_dict[keep_prob_input_pl] = 1.0
                    feed_dict[keep_prob_hidden_pl] = 1.0

                    # Compute accuracy
                    ler_train_char, ler_train_phone = sess.run(
                        [ler_op_main, ler_op_second], feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print(
                        'Step %d: loss = %.3f / cer = %.4f / per = %.4f (%.3f sec)\n' %
                        (step + 1, loss_train, ler_train_char,
                         ler_train_phone, duration_step))
                    start_time_step = time.time()

                    # Visualize
                    labels_pred_char_st, labels_pred_phone_st = sess.run(
                        [decode_op_main, decode_op_second],
                        feed_dict=feed_dict)
                    labels_true_char = sparsetensor2list(
                        labels_true_char_st, batch_size=batch_size)
                    labels_true_phone = sparsetensor2list(
                        labels_true_phone_st, batch_size=batch_size)
                    labels_pred_char = sparsetensor2list(
                        labels_pred_char_st, batch_size=batch_size)
                    labels_pred_phone = sparsetensor2list(
                        labels_pred_phone_st, batch_size=batch_size)

                    # character
                    print('Character')
                    print(' True: %s' % num2alpha(labels_true_char[0]))
                    print(' Pred: %s' % num2alpha(labels_pred_char[0]))
                    print('Phone')
                    print(' True: %s' % num2phone(labels_true_phone[0]))
                    print(' Pred: %s' % num2phone(labels_pred_phone[0]))
                    print('----------------------------------------')

                    if ler_train_char >= ler_train_char_pre:
                        not_improved_count += 1
                    else:
                        not_improved_count = 0
                    if not_improved_count >= 5:
                        # FIX: message typo 'Modle' -> 'Model'
                        print('Model is Converged.')
                        break
                    ler_train_char_pre = ler_train_char

                    # Change to training mode
                    network.is_training = True
                    # FIX: also restore the configured dropout
                    # keep-probabilities; previously they stayed pinned at
                    # 1.0 after the first evaluation, so all later training
                    # silently ran without dropout.
                    feed_dict[keep_prob_input_pl] = \
                        network.dropout_ratio_input
                    feed_dict[keep_prob_hidden_pl] = \
                        network.dropout_ratio_hidden

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
def check_training(self, model_type, label_type):
    """Smoke-test training of the seq2seq attention model on synthetic data.

    LER computation is currently disabled (commented out); the loop only
    trains and periodically prints loss plus a sample greedy decode.

    Args:
        model_type: name of the attention model variant under test.
        label_type: 'character' for character labels, otherwise phone labels.
    """
    print('----- ' + model_type + ', ' + label_type + ' -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels, inputs_seq_len, labels_seq_len = generate_data(
            label_type=label_type,
            model='attention',
            batch_size=batch_size)

        # Define model
        if label_type == 'character':
            output_size = 28
        else:
            output_size = 63
        # model = load(model_type=model_type)
        network = blstm_attention_seq2seq.BLSTMAttetion(
            batch_size=batch_size,
            input_size=inputs[0].shape[1],
            encoder_num_units=256,
            encoder_num_layer=2,
            attention_dim=128,
            decoder_num_units=256,
            decoder_num_layer=1,
            output_size=output_size,  # + <SOS> & <EOS>
            sos_index=output_size - 2,
            eos_index=output_size - 1,
            max_decode_length=50,
            parameter_init=0.1,
            clip_grad=5.0,
            clip_activation_encoder=50,
            clip_activation_decoder=50,
            dropout_ratio_input=1.0,
            dropout_ratio_hidden=1.0,
            weight_decay=1e-6,
            beam_width=0)
        network.define()
        # NOTE: define model under tf.Graph()

        # Add to the graph each operation
        loss_op = network.loss
        learning_rate = 1e-3
        train_op = network.train(optimizer='adam',
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        # decode_op = network.greedy_decoder()
        # decode_op = network.beam_search_decoder(beam_width=20)
        # posteriors_op = network.posteriors(decode_op)
        # ler_op = network.ler(decode_op)

        # FIX: the second unpack previously rebound `logits_train`,
        # silently discarding the training logits. Neither logits tensor
        # is used in this test, so discard both explicitly.
        _, predicted_ids_train, _, _, _ = network.decoder_outputs_train
        _, predicted_ids_infer, _, _, _ = network.decoder_outputs_infer

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_pre = 1
            not_improved_count = 0
            for step in range(max_steps):
                # feed_dict is rebuilt every step, so the evaluation-mode
                # overrides below only affect the current iteration
                feed_dict = {
                    network.inputs: inputs,
                    network.labels: labels,
                    network.inputs_seq_len: inputs_seq_len,
                    network.labels_seq_len: labels_seq_len,
                    network.keep_prob_input: network.dropout_ratio_input,
                    network.keep_prob_hidden: network.dropout_ratio_hidden,
                    network.learning_rate: learning_rate
                }

                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(network.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # change feed dict for evaluation
                    feed_dict[network.keep_prob_input] = 1.0
                    feed_dict[network.keep_prob_hidden] = 1.0

                    # compute accuracy
                    # ler_train = sess.run(ler_op, feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    # NOTE: LER is hard-coded to 1 while the LER op is
                    # disabled above
                    print('Step %d: loss = %.3f / ler = %.4f (%.3f sec)' %
                          (step + 1, loss_train, 1, duration_step))
                    start_time_step = time.time()

                    # Visualize (only the training decode is printed;
                    # the inference decode is fetched but unused)
                    ids_train, ids_infer = sess.run(
                        [predicted_ids_train, predicted_ids_infer],
                        feed_dict=feed_dict)
                    if label_type == 'character':
                        print('True: %s' % num2alpha(labels[0]))
                        print('Pred: < %s' % num2alpha(ids_train[0]))
                    else:
                        print('True: %s' % num2phone(labels[0]))
                        print('Pred: < %s' % num2phone(ids_train[0]))

                    # if ler_train >= ler_train_pre:
                    #     not_improved_count += 1
                    # else:
                    #     not_improved_count = 0
                    # if not_improved_count >= 3:
                    #     print('Model is Converged.')
                    #     break
                    # ler_train_pre = ler_train

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
def check_training(self, model_type, label_type):
    """Smoke-test training of a CTC model on synthetic data.

    Trains for up to 400 steps on a tiny generated batch, periodically
    printing loss/LER and a beam-search decode of the first utterance.
    Stops early once LER has not improved for 3 evaluations.

    Args:
        model_type: name of the CTC model to load via `load()`.
        label_type: 'character' for character labels, otherwise phone labels.
    """
    print('----- ' + model_type + ', ' + label_type + ' -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels, seq_len = generate_data(label_type=label_type,
                                                model='ctc',
                                                batch_size=batch_size)
        indices, values, dense_shape = list2sparsetensor(labels)

        # Define model
        output_size = 26 if label_type == 'character' else 61
        model = load(model_type=model_type)
        network = model(batch_size=batch_size,
                        input_size=inputs[0].shape[1],
                        num_cell=256,
                        num_layer=2,
                        bottleneck_dim=128,
                        output_size=output_size,
                        parameter_init=0.1,
                        clip_grad=5.0,
                        clip_activation=50,
                        dropout_ratio_input=1.0,
                        dropout_ratio_hidden=1.0,
                        weight_decay=1e-6)
        network.define()
        # NOTE: define model under tf.Graph()

        # Add to the graph each operation
        loss_op = network.loss()
        learning_rate = 1e-3
        train_op = network.train(optimizer='adam',
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op = network.decoder(decode_type='beam_search',
                                    beam_width=20)
        ler_op = network.ler(decode_op)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_pre = 1
            not_improved_count = 0
            for step in range(max_steps):
                # feed_dict is rebuilt every step, so the evaluation-mode
                # overrides below only affect the current iteration
                feed_dict = {
                    network.inputs: inputs,
                    network.label_indices: indices,
                    network.label_values: values,
                    network.label_shape: dense_shape,
                    network.seq_len: seq_len,
                    network.keep_prob_input: network.dropout_ratio_input,
                    network.keep_prob_hidden: network.dropout_ratio_hidden,
                    network.learning_rate: learning_rate
                }

                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(network.clipped_grads, feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change feed dict for evaluation
                    feed_dict[network.keep_prob_input] = 1.0
                    feed_dict[network.keep_prob_hidden] = 1.0

                    # Compute accuracy
                    ler_train = sess.run(ler_op, feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / ler = %.4f (%.3f sec)' %
                          (step + 1, loss_train, ler_train, duration_step))
                    start_time_step = time.time()

                    # Visualize
                    labels_st = sess.run(decode_op, feed_dict=feed_dict)
                    # FIX: decoding runs on the full batch, so convert all
                    # `batch_size` hypotheses (was hard-coded to 1, which
                    # was inconsistent with the multitask test); only the
                    # first utterance is printed below.
                    labels_pred = sparsetensor2list(
                        labels_st, batch_size=batch_size)
                    if label_type == 'character':
                        print('True: %s' % num2alpha(labels[0]))
                        print('Pred: %s' % num2alpha(labels_pred[0]))
                    else:
                        print('True: %s' % num2phone(labels[0]))
                        print('Pred: %s' % num2phone(labels_pred[0]))

                    if ler_train >= ler_train_pre:
                        not_improved_count += 1
                    else:
                        not_improved_count = 0
                    if not_improved_count >= 3:
                        # FIX: message typo 'Modle' -> 'Model'
                        print('Model is Converged.')
                        break
                    ler_train_pre = ler_train

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
def check_training(self):
    """Smoke-test training of the multitask BLSTM-CTC model (legacy API).

    Trains a character (main) + phone (second) multitask CTC network for up
    to 400 steps on a tiny generated batch, periodically printing loss,
    CER/PER, and a sample decode. Stops early once the character LER has
    not improved for 3 evaluations.
    """
    print('----- multitask -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels_char, labels_phone, seq_len = generate_data(
            label_type='multitask',
            model='ctc',
            batch_size=batch_size)
        indices_char, values_char, dense_shape_char = list2sparsetensor(
            labels_char)
        indices_phone, values_phone, dense_shape_phone = list2sparsetensor(
            labels_phone)

        # Define model
        output_size_main = 26
        output_size_second = 61
        network = Multitask_BLSTM_CTC(
            batch_size=batch_size,
            input_size=inputs[0].shape[1],
            num_cell=256,
            num_layer_main=2,
            num_layer_second=1,
            output_size_main=output_size_main,
            output_size_second=output_size_second,
            main_task_weight=0.8,
            parameter_init=0.1,
            clip_grad=5.0,
            clip_activation=50,
            dropout_ratio_input=1.0,
            dropout_ratio_hidden=1.0,
            weight_decay=1e-6)
        network.define()
        # NOTE: define model under tf.Graph()

        # Add to the graph each operation
        loss_op = network.loss()
        learning_rate = 1e-3
        train_op = network.train(optimizer='rmsprop',
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op_main, decode_op_second = network.decoder(
            decode_type='beam_search',
            beam_width=20)
        ler_op_main, ler_op_second = network.ler(decode_op_main,
                                                 decode_op_second)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_char_pre = 1
            not_improved_count = 0
            for step in range(max_steps):
                # feed_dict is rebuilt every step, so the evaluation-mode
                # overrides below only affect the current iteration
                feed_dict = {
                    network.inputs: inputs,
                    network.label_indices: indices_char,
                    network.label_values: values_char,
                    network.label_shape: dense_shape_char,
                    network.label_indices_second: indices_phone,
                    network.label_values_second: values_phone,
                    network.label_shape_second: dense_shape_phone,
                    network.seq_len: seq_len,
                    network.keep_prob_input: network.dropout_ratio_input,
                    network.keep_prob_hidden: network.dropout_ratio_hidden,
                    network.learning_rate: learning_rate
                }

                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(network.clipped_grads, feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change feed dict for evaluation
                    feed_dict[network.keep_prob_input] = 1.0
                    feed_dict[network.keep_prob_hidden] = 1.0

                    # Compute accuracy
                    ler_train_char, ler_train_phone = sess.run(
                        [ler_op_main, ler_op_second], feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print(
                        'Step %d: loss = %.3f / cer = %.4f / per = %.4f (%.3f sec)\n' %
                        (step + 1, loss_train, ler_train_char,
                         ler_train_phone, duration_step))
                    start_time_step = time.time()

                    # Visualize
                    # FIX: previously fetched [decode_op_main,
                    # decode_op_main], so the "phone" predictions printed
                    # were actually the character predictions.
                    labels_st_char, labels_st_phone = sess.run(
                        [decode_op_main, decode_op_second],
                        feed_dict=feed_dict)
                    # FIX: decoding runs on the full batch, so convert all
                    # `batch_size` hypotheses (was hard-coded to 1); only
                    # the first utterance is printed below.
                    labels_pred_char = sparsetensor2list(
                        labels_st_char, batch_size=batch_size)
                    labels_pred_phone = sparsetensor2list(
                        labels_st_phone, batch_size=batch_size)

                    # character
                    print('Character')
                    print(' True: %s' % num2alpha(labels_char[0]))
                    print(' Pred: %s' % num2alpha(labels_pred_char[0]))
                    print('Phone')
                    print(' True: %s' % num2phone(labels_phone[0]))
                    print(' Pred: %s' % num2phone(labels_pred_phone[0]))
                    print('----------------------------------------')

                    if ler_train_char >= ler_train_char_pre:
                        not_improved_count += 1
                    else:
                        not_improved_count = 0
                    if not_improved_count >= 3:
                        # FIX: message typo 'Modle' -> 'Model'
                        print('Model is Converged.')
                        break
                    ler_train_char_pre = ler_train_char

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))