def decode(session, decode_op, model, dataset, label_type, is_test=True,
           save_path=None):
    """Visualize label outputs of CTC model.

    For every utterance yielded by `dataset`, print the wav name, the
    reference transcript and the decoded hypothesis.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): phone39 or phone48 or phone61 or character or
            character_capital_divide
        is_test (bool, optional): if True, the reference is taken as-is from
            `labels_true[0][i][0]`; otherwise it is mapped from indices
        save_path (string, optional): if not None, the output is written to
            `decode.txt` under `model.model_dir` instead of the console.
            Only compared against None; the value is not used as a path.
    """
    if label_type == 'character':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/character.txt')
    elif label_type == 'character_capital_divide':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/character_capital_divide.txt',
            capital_divide=True)
    else:
        map_fn = Idx2phone(
            map_file_path='../metrics/mapping_files/' + label_type + '.txt')

    # Redirect stdout only for the duration of this call so that the file is
    # closed and later prints of the caller are not swallowed.
    stdout_original = sys.stdout
    out_file = None
    if save_path is not None:
        out_file = open(join(model.model_dir, 'decode.txt'), 'w')
        sys.stdout = out_file

    try:
        for data, is_new_epoch in dataset:
            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, input_names = data
            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            try:
                labels_pred = sparsetensor2list(labels_pred_st,
                                                batch_size=batch_size)
            except IndexError:
                # no output: one empty hypothesis per utterance so that the
                # indexing below works for any batch size
                labels_pred = [''] * batch_size

            for i_batch in range(batch_size):
                print('----- wav: %s -----' % input_names[0][i_batch])
                if is_test:
                    str_true = labels_true[0][i_batch][0]
                else:
                    str_true = map_fn(labels_true[0][i_batch])
                str_pred = map_fn(labels_pred[i_batch])

                print('Ref: %s' % str_true)
                print('Hyp: %s' % str_pred)

            if is_new_epoch:
                break
    finally:
        if out_file is not None:
            sys.stdout = stdout_original
            out_file.close()
def check(self, encoder_type, lstm_impl, time_major=False):
    """Train a small multi-task CTC model on one generated mini-batch.

    Builds the graph, then runs up to 1000 training steps on a single fixed
    feed dict. Every 10 steps the character/phone label error rates are
    computed with dropout disabled and the first utterance is decoded and
    printed. Training stops early once the character error rate drops
    below 0.1.

    Args:
        encoder_type: encoder type passed to `MultitaskCTC`
        lstm_impl: LSTM implementation name passed to `MultitaskCTC`
        time_major (bool, optional): passed to `MultitaskCTC`
    """
    print('==================================================')
    print(' encoder_type: %s' % str(encoder_type))
    print(' lstm_impl: %s' % str(lstm_impl))
    print(' time_major: %s' % str(time_major))
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        inputs, labels_char, labels_phone, inputs_seq_len = generate_data(
            label_type='multitask',
            model='ctc',
            batch_size=batch_size)

        # Define model graph
        num_classes_main = 27
        num_classes_sub = 61
        model = MultitaskCTC(
            encoder_type=encoder_type,
            input_size=inputs[0].shape[1],
            num_units=256,
            num_layers_main=2,
            num_layers_sub=1,
            num_classes_main=num_classes_main,
            num_classes_sub=num_classes_sub,
            main_task_weight=0.8,
            lstm_impl=lstm_impl,
            parameter_init=0.1,
            clip_grad_norm=5.0,
            clip_activation=50,
            num_proj=256,
            weight_decay=1e-8,
            bottleneck_dim=None,
            time_major=time_major)

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        loss_op, logits_main, logits_sub = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.labels_sub_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        train_op = model.train(loss_op,
                               optimizer='adam',
                               learning_rate=learning_rate_pl)
        decode_op_main, decode_op_sub = model.decoder(
            logits_main,
            logits_sub,
            model.inputs_seq_len_pl_list[0],
            beam_width=20)
        ler_op_main, ler_op_sub = model.compute_ler(
            decode_op_main, decode_op_sub,
            model.labels_pl_list[0], model.labels_sub_pl_list[0])

        # Define learning rate controller
        learning_rate = 1e-3
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=20,
                                   decay_rate=0.9,
                                   decay_patient_epoch=5,
                                   lower_better=True)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" %
                  (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict
        keep_prob_train = 0.9
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels_char,
                                                       padded_value=-1),
            model.labels_sub_pl_list[0]: list2sparsetensor(labels_phone,
                                                           padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: keep_prob_train,
            learning_rate_pl: learning_rate
        }

        idx2phone = Idx2phone(map_file_path='./phone61.txt')

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Train model
            max_steps = 1000
            start_time_step = time.time()
            for step in range(max_steps):

                # Compute loss
                _, loss_train = sess.run(
                    [train_op, loss_op], feed_dict=feed_dict)

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode
                    feed_dict[model.keep_prob_pl_list[0]] = 1.0

                    # Compute accuracy
                    ler_train_char, ler_train_phone = sess.run(
                        [ler_op_main, ler_op_sub], feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / cer = %.3f / per = %.3f (%.3f sec) / lr = %.5f' %
                          (step + 1, loss_train, ler_train_char,
                           ler_train_phone, duration_step, learning_rate))
                    start_time_step = time.time()

                    # Visualize
                    labels_pred_char_st, labels_pred_phone_st = sess.run(
                        [decode_op_main, decode_op_sub],
                        feed_dict=feed_dict)

                    # Back to training mode; otherwise every step after the
                    # first evaluation would train without dropout.
                    feed_dict[model.keep_prob_pl_list[0]] = keep_prob_train

                    labels_pred_char = sparsetensor2list(
                        labels_pred_char_st, batch_size=batch_size)
                    labels_pred_phone = sparsetensor2list(
                        labels_pred_phone_st, batch_size=batch_size)

                    # An IndexError here means there is no prediction; show
                    # an empty hypothesis instead of repeating the header
                    # and the reference.
                    try:
                        hyp_char = idx2alpha(labels_pred_char[0])
                    except IndexError:
                        hyp_char = ''
                    print('Character')
                    print(' Ref: %s' % idx2alpha(labels_char[0]))
                    print(' Hyp: %s' % hyp_char)

                    try:
                        hyp_phone = idx2phone(labels_pred_phone[0])
                    except IndexError:
                        hyp_phone = ''
                    print('Phone')
                    print(' Ref: %s' % idx2phone(labels_phone[0]))
                    print(' Hyp: %s' % hyp_phone)
                    print('-' * 30)

                    if ler_train_char < 0.1:
                        print('Modle is Converged.')
                        break

                    # Update learning rate
                    learning_rate = lr_controller.decay_lr(
                        learning_rate=learning_rate,
                        epoch=step,
                        value=ler_train_char)
                    feed_dict[learning_rate_pl] = learning_rate
def do_eval_cer(session, decode_op, model, dataset, label_type,
                eval_batch_size=None, progressbar=False, is_multitask=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): character or character_capital_divide
        eval_batch_size (int, optional): the batch size when evaluating the
            model. NOTE: accepted for interface compatibility but not used
            by this function.
        progressbar (bool, optional): if True, visualize the progressbar
        is_multitask (bool, optional): if True, evaluate the multitask model
            (each mini batch is then unpacked as a 5-tuple)
    Return:
        cer_mean (float): An average of CER
        wer_mean (float): An average of WER
    Raises:
        ValueError: if `label_type` is not one of the supported values
    """
    # Reset data counter
    dataset.reset()

    if label_type == 'character':
        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/ctc/character.txt')
    elif label_type == 'character_capital_divide':
        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/ctc/character_capital_divide.txt',
            capital_divide=True,
            space_mark='_')
    else:
        # Fail early instead of hitting an unbound `idx2char` in the loop
        raise ValueError('Unsupported label_type: %s' % str(label_type))

    # Compile the patterns once instead of once per utterance
    space_pattern = re.compile(r'[_]+')
    garbage_pattern = re.compile(r'[\'\":;!?,.-]+')

    cer_mean, wer_mean = 0, 0
    pbar = tqdm(total=len(dataset)) if progressbar else None
    try:
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            if is_multitask:
                inputs, labels_true, _, inputs_seq_len, _ = data
            else:
                inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_input_pl_list[0]: 1.0,
                model.keep_prob_hidden_pl_list[0]: 1.0,
                model.keep_prob_output_pl_list[0]: 1.0
            }

            batch_size_each = len(inputs)

            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)
            for i_batch in range(batch_size_each):

                # Convert from list of index to string
                str_true = idx2char(labels_true[i_batch])
                str_pred = idx2char(labels_pred[i_batch])

                # Remove consecutive spaces
                str_pred = space_pattern.sub('_', str_pred)

                # Remove garbage labels
                str_true = garbage_pattern.sub("", str_true)
                str_pred = garbage_pattern.sub("", str_pred)

                # Compute WER (words are separated by the space mark '_')
                wer_mean += compute_wer(hyp=str_pred.split('_'),
                                        ref=str_true.split('_'),
                                        normalize=True)

                # Remove spaces
                str_pred = space_pattern.sub("", str_pred)
                str_true = space_pattern.sub("", str_true)

                # Compute CER
                cer_mean += compute_cer(str_pred=str_pred,
                                        str_true=str_true,
                                        normalize=True)

                if pbar is not None:
                    pbar.update(1)

            if is_new_epoch:
                break
    finally:
        if pbar is not None:
            pbar.close()

    cer_mean /= len(dataset)
    wer_mean /= len(dataset)

    return cer_mean, wer_mean
def decode(session, decode_op, model, dataset, label_type, ss_type,
           is_test=False, eval_batch_size=None, save_path=None):
    """Visualize label outputs of CTC model.

    For every utterance yielded by `dataset`, print the wav name, the
    reference transcript and the decoded hypothesis.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): kana
        ss_type (string): remove or insert_left or insert_both or
            insert_right
        is_test (bool, optional): set to True when evaluating by the test
            set; the reference is then taken as-is from the labels
        eval_batch_size (int, optional): the batch size when evaluating the
            model. The original batch size of `dataset` is restored before
            returning, also when an error occurs.
        save_path (string, optional): if not None, the output is written to
            `decode.txt` under `model.model_dir` instead of the console.
            Only compared against None; the value is not used as a path.
    """
    batch_size_original = dataset.batch_size
    stdout_original = sys.stdout
    out_file = None

    try:
        # Reset data counter
        dataset.reset()

        # Set batch size in the evaluation
        if eval_batch_size is not None:
            dataset.batch_size = eval_batch_size

        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/' + label_type + '_' + ss_type + '.txt')

        if save_path is not None:
            out_file = open(join(model.model_dir, 'decode.txt'), 'w')
            sys.stdout = out_file

        for data, is_new_epoch in dataset:
            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, input_names = data
            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            try:
                labels_pred = sparsetensor2list(labels_pred_st,
                                                batch_size=batch_size)
            except IndexError:
                # no output: one empty hypothesis per utterance so that the
                # indexing below works for any batch size
                labels_pred = [''] * batch_size

            for i_batch in range(batch_size):
                print('----- wav: %s -----' % input_names[0][i_batch])
                if is_test:
                    str_true = labels_true[0][i_batch][0]
                else:
                    str_true = idx2char(labels_true[0][i_batch])
                str_pred = idx2char(labels_pred[i_batch])

                print('Ref: %s' % str_true)
                print('Hyp: %s' % str_pred)

            if is_new_epoch:
                break
    finally:
        # Undo the stdout redirection and release the file
        if out_file is not None:
            sys.stdout = stdout_original
            out_file.close()

        # Register original batch size
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original
def do_eval_cer(session, decode_ops, model, dataset, label_type,
                train_data_size, is_test=False, eval_batch_size=None,
                progressbar=False, is_multitask=False, is_main=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_ops (list): operations for decoding, one per device
        model: the model to evaluate
        dataset: An instance of `Dataset` class
        label_type (string): a label type containing 'kanji' or 'kana'
            (e.g. kanji, kana, kanji_divide, kana_divide)
        train_data_size (string): train_subset or train_fullset; only used
            to select the mapping file for kanji label types
        is_test (bool, optional): set to True when evaluating by the test
            set
        eval_batch_size (int, optional): the batch size when evaluating the
            model. The original batch size of `dataset` is restored before
            returning, also when an error occurs.
        progressbar (bool, optional): if True, visualize progressbar
        is_multitask (bool, optional): not used by this function
        is_main (bool, optional): not used by this function
    Return:
        cer_mean (float): An average of CER over the utterances that were
            not skipped
    Raises:
        TypeError: if `label_type` contains neither 'kanji' nor 'kana'
    """
    # NOTE: add multitask version
    assert isinstance(decode_ops, list), "decode_ops must be a list."

    batch_size_original = dataset.batch_size
    pbar = None

    try:
        # Reset data counter
        dataset.reset()

        # Set batch size in the evaluation
        if eval_batch_size is not None:
            dataset.batch_size = eval_batch_size

        if 'kanji' in label_type:
            map_file_path = '../metrics/mapping_files/' + \
                label_type + '_' + train_data_size + '.txt'
        elif 'kana' in label_type:
            map_file_path = '../metrics/mapping_files/' + label_type + '.txt'
        else:
            raise TypeError('Unsupported label_type: %s' % str(label_type))

        idx2char = Idx2char(map_file_path=map_file_path)

        # Compile once instead of twice per utterance
        garbage_pattern = re.compile(r'[_NZー・]+')

        cer_mean = 0
        skip_data_num = 0
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, _ = data
            feed_dict = {}
            for i_device in range(len(decode_ops)):
                feed_dict[model.inputs_pl_list[i_device]] = inputs[i_device]
                feed_dict[model.inputs_seq_len_pl_list[i_device]] = \
                    inputs_seq_len[i_device]
                feed_dict[model.keep_prob_pl_list[i_device]] = 1.0

            labels_pred_st_list = session.run(decode_ops, feed_dict=feed_dict)
            for i_device, labels_pred_st in enumerate(labels_pred_st_list):
                batch_size_device = len(inputs[i_device])
                try:
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size_device)
                    # Accumulate locally so that a batch failing halfway is
                    # not partly counted in the sum while being fully
                    # counted as skipped.
                    cer_batch = 0
                    for i_batch in range(batch_size_device):

                        # Convert from list of index to string
                        if is_test:
                            str_true = labels_true[i_device][i_batch][0]
                            # NOTE: transcript may be separated by space('_')
                        else:
                            str_true = idx2char(
                                labels_true[i_device][i_batch])
                        str_pred = idx2char(labels_pred[i_batch])

                        # Remove garbage labels
                        str_true = garbage_pattern.sub('', str_true)
                        str_pred = garbage_pattern.sub('', str_pred)

                        # Compute CER
                        cer_batch += compute_cer(str_pred=str_pred,
                                                 str_true=str_true,
                                                 normalize=True)
                except Exception:
                    # Best effort: skip the batch, but let KeyboardInterrupt
                    # and SystemExit propagate.
                    print('skipped')
                    skip_data_num += batch_size_device
                    # TODO: Conduct decoding again with batch size 1
                else:
                    cer_mean += cer_batch

                if pbar is not None:
                    pbar.update(batch_size_device)

            if is_new_epoch:
                break

        cer_mean /= (len(dataset) - skip_data_num)
    finally:
        if pbar is not None:
            pbar.close()

        # Register original batch size
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original

    return cer_mean
def do_eval_fmeasure(session, decode_op, model, dataset, label_type, ss_type,
                     is_test=False, eval_batch_size=None, progressbar=False):
    """Evaluate trained model by F-measure.

    Occurrences of the social-signal marks 'L' (laughter), 'F' (filler),
    'B' (backchannel) and 'D' (disfluency) are counted in the reference and
    in the hypothesis of every utterance. Per utterance, the smaller of the
    two counts is taken as true positives and the difference as false
    negatives or false positives.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): kana
        ss_type (string): remove or insert_left or insert_both or
            insert_right
        is_test (bool, optional): set to True when evaluating by the test
            set
        eval_batch_size (int, optional): the batch size when evaluating the
            model. The original batch size of `dataset` is restored before
            returning, also when an error occurs.
        progressbar (bool, optional): if True, visualize the progressbar
    Returns:
        df_acc (pandas.DataFrame): rows Precision / Recall / F-measure,
            columns Laughter / Filler / Backchannel / Disfluency / Mean
    """
    # (column name in the result table, mark counted in the transcripts)
    signals = [('Laughter', 'L'), ('Filler', 'F'),
               ('Backchannel', 'B'), ('Disfluency', 'D')]
    names = [name for name, _ in signals]

    # Per-signal [tp, fp, fn] counters
    totals = dict((name, [0., 0., 0.]) for name in names)

    batch_size_original = dataset.batch_size
    pbar = None
    try:
        # Reset data counter
        dataset.reset()

        # Set batch size in the evaluation
        if eval_batch_size is not None:
            dataset.batch_size = eval_batch_size

        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/' + label_type + '_' + ss_type + '.txt')

        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, _ = data
            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            try:
                labels_pred = sparsetensor2list(labels_pred_st, batch_size)

                # Count into a batch-local table so that a batch that fails
                # halfway contributes nothing.
                batch_counts = dict((name, [0., 0., 0.]) for name in names)
                for i_batch in range(batch_size):

                    # Convert from list of index to string
                    if is_test:
                        str_true = labels_true[0][i_batch][0]
                        # NOTE: transcript is separated by space('_')
                    else:
                        str_true = idx2char(
                            labels_true[0][i_batch],
                            padded_value=dataset.padded_value)
                    str_pred = idx2char(labels_pred[i_batch])

                    for name, mark in signals:
                        _update_signal_counts(batch_counts[name],
                                              str_pred.count(mark),
                                              str_true.count(mark))
            except Exception:
                # Best effort: skip the batch, but let KeyboardInterrupt and
                # SystemExit propagate.
                print('skipped')
            else:
                for name in names:
                    for i in range(3):
                        totals[name][i] += batch_counts[name][i]

            if pbar is not None:
                pbar.update(batch_size)

            if is_new_epoch:
                break
    finally:
        if pbar is not None:
            pbar.close()

        # Register original batch size
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original

    # [precision, recall, f-measure] per signal
    table = dict((name, _precision_recall_f(*totals[name])) for name in names)
    table['Mean'] = [sum([table[name][i] for name in names]) / 4.
                     for i in range(3)]

    df_acc = pd.DataFrame(table,
                          columns=names + ['Mean'],
                          index=['Precision', 'Recall', 'F-measure'])

    return df_acc


def _update_signal_counts(counts, detected_num, true_num):
    """Add one utterance's detected/true mark counts to `counts` ([tp, fp, fn])."""
    matched = min(detected_num, true_num)
    counts[0] += matched
    counts[1] += detected_num - matched
    counts[2] += true_num - matched


def _precision_recall_f(tp, fp, fn):
    """Return [precision, recall, f-measure]; a value is 0 when its denominator is 0."""
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    if (recall + precision) != 0:
        f_measure = 2 * recall * precision / (recall + precision)
    else:
        f_measure = 0
    return [precision, recall, f_measure]
def check_training(self, encoder_type, label_type, lstm_impl='LSTMBlockCell',
                   save_params=False):
    """Train a small CTC model on one generated mini-batch.

    Builds the graph, then runs up to 1000 training steps on a single fixed
    feed dict. Every 10 steps the label error rate is computed and the first
    utterance is decoded and printed. Training stops early once the label
    error rate drops below 0.05.

    Args:
        encoder_type: encoder type passed to `CTC`
        label_type: 'character' selects 26 classes and alphabet decoding;
            any other value selects 61 classes and phone decoding
        lstm_impl (optional): LSTM implementation name passed to `CTC`
        save_params (bool, optional): if True, save a checkpoint to
            './model.ckpt' when the model has converged
    """
    print('==================================================')
    print(' encoder_type: %s' % encoder_type)
    print(' label_type: %s' % label_type)
    print(' lstm_impl: %s' % lstm_impl)
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 1
        splice = 11 if encoder_type in [
            'vgg_blstm', 'vgg_lstm', 'vgg_wang',
            'resnet_wang', 'cnn_zhang'] else 1
        inputs, labels_true_st, inputs_seq_len = generate_data(
            label_type=label_type,
            model='ctc',
            batch_size=batch_size,
            splice=splice)
        # NOTE: input_size must be even number when using CudnnLSTM

        # Define model graph
        num_classes = 26 if label_type == 'character' else 61
        model = CTC(
            encoder_type=encoder_type,
            input_size=inputs[0].shape[-1] // splice,
            splice=splice,
            num_units=256,
            num_layers=2,
            num_classes=num_classes,
            lstm_impl=lstm_impl,
            parameter_init=0.1,
            clip_grad=5.0,
            clip_activation=50,
            num_proj=256,
            bottleneck_dim=None,
            weight_decay=1e-8)

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        loss_op, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0])
        train_op = model.train(loss_op,
                               optimizer='adam',
                               learning_rate=learning_rate_pl)
        # NOTE: Adam does not run on CudnnLSTM
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=20)
        ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])

        # Define learning rate controller
        learning_rate = 1e-3
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=10,
                                   decay_rate=0.98,
                                   decay_patient_epoch=5,
                                   lower_better=True)

        if save_params:
            # Create a saver for writing training checkpoints
            saver = tf.train.Saver(max_to_keep=None)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        if lstm_impl != 'CudnnLSTM':
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()),
                   "{:,}".format(total_parameters / 1000000)))

        # Make feed dict
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: labels_true_st,
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_input_pl_list[0]: 1.0,
            model.keep_prob_hidden_pl_list[0]: 1.0,
            model.keep_prob_output_pl_list[0]: 1.0,
            learning_rate_pl: learning_rate
        }

        idx2phone = Idx2phone(map_file_path='./phone61_ctc.txt')

        # Index-to-string mapping used for both reference and hypothesis
        idx2label = idx2alpha if label_type == 'character' else idx2phone

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Train model
            max_steps = 1000
            start_time_global = time.time()
            start_time_step = time.time()
            for step in range(max_steps):

                # Compute loss
                _, loss_train = sess.run(
                    [train_op, loss_op], feed_dict=feed_dict)

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode
                    feed_dict[model.keep_prob_input_pl_list[0]] = 1.0
                    feed_dict[model.keep_prob_hidden_pl_list[0]] = 1.0
                    feed_dict[model.keep_prob_output_pl_list[0]] = 1.0

                    # Compute accuracy
                    ler_train = sess.run(ler_op, feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / ler = %.3f (%.3f sec) / lr = %.5f' %
                          (step + 1, loss_train, ler_train,
                           duration_step, learning_rate))
                    start_time_step = time.time()

                    # Decode
                    labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                    labels_true = sparsetensor2list(labels_true_st,
                                                    batch_size=batch_size)

                    # Visualize. An IndexError means there is no prediction;
                    # show an empty hypothesis in that case.
                    try:
                        labels_pred = sparsetensor2list(
                            labels_pred_st, batch_size=batch_size)
                        str_pred = idx2label(labels_pred[0])
                    except IndexError:
                        str_pred = ''
                    print('Ref: %s' % idx2label(labels_true[0]))
                    print('Hyp: %s' % str_pred)

                    if ler_train < 0.05:
                        print('Modle is Converged.')
                        if save_params:
                            # Save model (check point)
                            checkpoint_file = './model.ckpt'
                            save_path = saver.save(
                                sess, checkpoint_file, global_step=1)
                            print("Model saved in file: %s" % save_path)
                        break

                    # Update learning rate
                    learning_rate = lr_controller.decay_lr(
                        learning_rate=learning_rate,
                        epoch=step,
                        value=ler_train)
                    feed_dict[learning_rate_pl] = learning_rate

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
def decode_test_multitask(session, decode_op_main, decode_op_sub, model,
                          dataset, train_data_size, label_type_main,
                          label_type_sub, is_test=False, save_path=None):
    """Visualize label outputs of Multi-task CTC model.

    Decodes one utterance at a time and prints the word-level and
    character-level reference and hypothesis.

    Args:
        session: session of training model
        decode_op_main: operation for decoding in the main task
        decode_op_sub: operation for decoding in the sub task
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        train_data_size (string): train_clean100 or train_clean360 or
            train_other500 or train_all; selects the word mapping file
        label_type_main (string): word (not used by this function)
        label_type_sub (string): character or character_capital_divide
        is_test (bool, optional): not used by this function; the word
            reference format is decided by `dataset.is_test`
        save_path (string, optional): if not None, the output is written to
            `decode.txt` under `model.model_dir` instead of the console.
            Only compared against None; the value is not used as a path.
    """
    idx2word = Idx2word(
        map_file_path='../metrics/mapping_files/ctc/word_' + train_data_size + '.txt')
    idx2char = Idx2char(
        map_file_path='../metrics/mapping_files/ctc/' + label_type_sub + '.txt')

    # Redirect stdout only for the duration of this call so that the file is
    # closed and later prints of the caller are not swallowed.
    stdout_original = sys.stdout
    out_file = None
    if save_path is not None:
        out_file = open(join(model.model_dir, 'decode.txt'), 'w')
        sys.stdout = out_file

    try:
        while True:
            # Create feed dictionary for next mini batch
            data, is_new_epoch = dataset.next(batch_size=1)
            inputs, labels_true_word, labels_true_char, inputs_seq_len, \
                input_names = data
            # NOTE: Batch size is expected to be 1

            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_input_pl_list[0]: 1.0,
                model.keep_prob_hidden_pl_list[0]: 1.0,
                model.keep_prob_output_pl_list[0]: 1.0
            }

            # Visualize
            labels_pred_st_word, labels_pred_st_char = session.run(
                [decode_op_main, decode_op_sub], feed_dict=feed_dict)
            try:
                labels_pred_word = sparsetensor2list(labels_pred_st_word,
                                                     batch_size=1)
            except IndexError:
                # no output
                labels_pred_word = ['']
            try:
                labels_pred_char = sparsetensor2list(labels_pred_st_char,
                                                     batch_size=1)
            except IndexError:
                # no output
                labels_pred_char = ['']

            print('----- wav: %s -----' % input_names[0][0])

            if dataset.is_test:
                str_true_word = labels_true_word[0][0][0]
            else:
                str_true_word = ' '.join(idx2word(labels_true_word[0][0]))
            str_pred_word = ' '.join(idx2word(labels_pred_word[0]))
            print('Ref (word): %s' % str_true_word)
            print('Hyp (word): %s' % str_pred_word)

            str_true_char = idx2char(labels_true_char[0][0])
            str_pred_char = idx2char(labels_pred_char[0]).replace('_', ' ')
            print('Ref (char): %s' % str_true_char)
            print('Hyp (char): %s' % str_pred_char)

            if is_new_epoch:
                break
    finally:
        if out_file is not None:
            sys.stdout = stdout_original
            out_file.close()
def check(self, decoder_type):
    """Decode one generated mini-batch with a restored CTC model.

    The model is restored from the checkpoint found in './'. With
    'tf_greedy' / 'tf_beam_search' the TensorFlow decode op is run; for any
    other value the CTC posteriors are fetched and decoded by the decoder
    object built for 'np_greedy' / 'np_beam_search'. The CER and the first
    reference/hypothesis pair are printed.

    Args:
        decoder_type (string): tf_greedy or tf_beam_search or np_greedy or
            np_beam_search
    Raises:
        ValueError: if no checkpoint is found in './'
    """
    separator = '=================================================='
    print(separator)
    print(' decoder_type: %s' % decoder_type)
    print(separator)

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        num_stack = 2
        inputs, labels, inputs_seq_len = generate_data(
            label_type='character',
            model='ctc',
            batch_size=batch_size,
            num_stack=num_stack,
            splice=1)
        max_time = inputs.shape[1]

        # Define model graph
        model = CTC(encoder_type='blstm',
                    input_size=inputs[0].shape[-1],
                    splice=1,
                    num_stack=num_stack,
                    num_units=256,
                    num_layers=2,
                    num_classes=27,
                    lstm_impl='LSTMBlockCell',
                    parameter_init=0.1,
                    clip_grad_norm=5.0,
                    clip_activation=50,
                    num_proj=256,
                    weight_decay=1e-6)

        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation
        inputs_pl = model.inputs_pl_list[0]
        labels_pl = model.labels_pl_list[0]
        seq_len_pl = model.inputs_seq_len_pl_list[0]
        keep_prob_pl = model.keep_prob_pl_list[0]
        _, logits = model.compute_loss(
            inputs_pl, labels_pl, seq_len_pl, keep_prob_pl)

        if 'beam_search' in decoder_type:
            beam_width = 20
        else:
            beam_width = 1
        decode_op = model.decoder(logits, seq_len_pl, beam_width=beam_width)
        ler_op = model.compute_ler(decode_op, labels_pl)
        posteriors_op = model.posteriors(logits, blank_prior=1)

        # Decoder objects are only built for the numpy decoder types
        if decoder_type == 'np_greedy':
            decoder = GreedyDecoder(blank_index=model.num_classes)
        elif decoder_type == 'np_beam_search':
            decoder = BeamSearchDecoder(space_index=26,
                                        blank_index=model.num_classes - 1)

        # Make feed dict
        feed_dict = {
            inputs_pl: inputs,
            labels_pl: list2sparsetensor(labels, padded_value=-1),
            seq_len_pl: inputs_seq_len,
            keep_prob_pl: 1.0
        }

        # Create a saver for restoring the checkpoint
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state('./')

            # A checkpoint is required
            if not ckpt:
                raise ValueError('There are not any checkpoints.')
            model_path = ckpt.model_checkpoint_path
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)

            if decoder_type in ['tf_greedy', 'tf_beam_search']:
                # Decode
                labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                labels_pred = sparsetensor2list(labels_pred_st,
                                                batch_size=batch_size)

                # Compute accuracy
                cer = sess.run(ler_op, feed_dict=feed_dict)
            else:
                # Compute CTC posteriors
                posteriors = sess.run(posteriors_op, feed_dict=feed_dict)
                probs = posteriors.reshape(-1, max_time, model.num_classes)

                # Decode
                if decoder_type == 'np_greedy':
                    labels_pred = decoder(probs=probs,
                                          seq_len=inputs_seq_len)
                elif decoder_type == 'np_beam_search':
                    labels_pred, scores = decoder(probs=probs,
                                                  seq_len=inputs_seq_len,
                                                  beam_width=beam_width)

                # Compute accuracy
                cer = compute_cer(str_pred=idx2alpha(labels_pred[0]),
                                  str_true=idx2alpha(labels[0]),
                                  normalize=True)

            # Visualize
            print('CER: %.3f %%' % (cer * 100))
            print('Ref: %s' % idx2alpha(labels[0]))
            print('Hyp: %s' % idx2alpha(labels_pred[0]))
def decode_test_multitask(session, decode_op_main, decode_op_sub, model,
                          dataset, label_type_main, label_type_sub,
                          save_path=None):
    """Visualize label outputs of Multi-task CTC model.

    Runs two passes over `dataset`, one utterance at a time: first the main
    task (character) is decoded and printed, then the sub task (phone).

    Args:
        session: session of training model
        decode_op_main: operation for decoding in the main task
        decode_op_sub: operation for decoding in the sub task
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type_main (string): character or character_capital_divide
        label_type_sub (string): phone39 or phone48 or phone61
        save_path (string, optional): if not None, the output is written to
            `decode.txt` under `model.model_dir` instead of the console.
            Only compared against None; the value is not used as a path.
    """
    # TODO: fix
    # Redirect stdout only for the duration of this call so that the file is
    # closed and later prints of the caller are not swallowed.
    stdout_original = sys.stdout
    out_file = None
    if save_path is not None:
        out_file = open(join(model.model_dir, 'decode.txt'), 'w')
        sys.stdout = out_file

    try:
        # Decode character
        print('===== ' + label_type_main + ' =====')
        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/ctc/' + label_type_main + '.txt')
        _decode_single_task(session, decode_op_main, model, dataset,
                            0, idx2char)

        # Decode phone
        print('\n===== ' + label_type_sub + ' =====')
        idx2phone = Idx2phone(
            map_file_path='../metrics/mapping_files/ctc/' + label_type_sub + '.txt')
        _decode_single_task(session, decode_op_sub, model, dataset,
                            1, idx2phone)
    finally:
        if out_file is not None:
            sys.stdout = stdout_original
            out_file.close()


def _decode_single_task(session, decode_op, model, dataset, task_index,
                        idx2label):
    """Decode and print utterances until `dataset` reports a new epoch.

    `task_index` selects the reference labels: 0 for the main task, 1 for
    the sub task. `idx2label` maps a list of indices to a string.
    """
    while True:
        # Create feed dictionary for next mini batch
        data, is_new_epoch = dataset.next(batch_size=1)
        inputs, labels_main, labels_sub, inputs_seq_len, input_names = data
        labels_true = (labels_main, labels_sub)[task_index]
        # NOTE: Batch size is expected to be 1

        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_input_pl_list[0]: 1.0,
            model.keep_prob_hidden_pl_list[0]: 1.0,
            model.keep_prob_output_pl_list[0]: 1.0
        }

        # Visualize
        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        try:
            labels_pred = sparsetensor2list(labels_pred_st, batch_size=1)
        except IndexError:
            # no output
            labels_pred = ['']

        print('----- wav: %s -----' % input_names[0])
        print('Ref: %s' % idx2label(labels_true[0]))
        print('Hyp: %s' % idx2label(labels_pred[0]))

        if is_new_epoch:
            break
def decode_test(session, decode_op, model, dataset, label_type,
                train_data_size, save_path=None):
    """Visualize label outputs of CTC model.

    Utterances are decoded one at a time and a reference/hypothesis pair is
    printed for each of them.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): character or character_capital_divide or word
        train_data_size (string): train_clean100 or train_clean360 or
            train_other500 or train_all. Only used to select the word
            mapping file when `label_type` is word.
        save_path (string, optional): if not None, the output is written to
            `decode.txt` under `model.model_dir` instead of the console.
            NOTE: the value itself is not used as the destination path.
    Raises:
        ValueError: if `label_type` is not one of the supported values
    """
    if label_type == 'character':
        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/ctc/character.txt')
    elif label_type == 'character_capital_divide':
        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/ctc/character_capital_divide.txt',
            capital_divide=True)
    elif label_type == 'word':
        idx2word = Idx2word(
            map_file_path='../metrics/mapping_files/ctc/word_' +
            train_data_size + '.txt')
    else:
        # Fail early instead of hitting an undefined mapping function later
        raise ValueError(
            'label_type must be character, character_capital_divide or '
            'word, got %s' % str(label_type))

    original_stdout = sys.stdout
    log_file = None
    if save_path is not None:
        log_file = open(join(model.model_dir, 'decode.txt'), 'w')
        sys.stdout = log_file

    try:
        while True:
            # Create feed dictionary for next mini batch
            data, is_new_epoch = dataset.next(batch_size=1)
            inputs, labels_true, inputs_seq_len, input_names = data
            # NOTE: Batch size is expected to be 1

            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_input_pl_list[0]: 1.0,
                model.keep_prob_hidden_pl_list[0]: 1.0,
                model.keep_prob_output_pl_list[0]: 1.0
            }

            # Visualize
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            try:
                labels_pred = sparsetensor2list(labels_pred_st, batch_size=1)
            except IndexError:
                # no output
                labels_pred = ['']

            print('----- wav: %s -----' % input_names[0][0])
            if label_type == 'character':
                str_true = idx2char(labels_true[0][0]).replace('_', ' ')
                str_pred = idx2char(labels_pred[0]).replace('_', ' ')
            elif label_type == 'character_capital_divide':
                str_true = idx2char(labels_true[0][0])
                str_pred = idx2char(labels_pred[0])
            else:
                if dataset.is_test:
                    # The reference is read as-is (no index mapping)
                    str_true = labels_true[0][0][0]
                else:
                    str_true = ' '.join(idx2word(labels_true[0][0]))
                str_pred = ' '.join(idx2word(labels_pred[0]))

            print('Ref: %s' % str_true)
            print('Hyp: %s' % str_pred)
            # wer_align(ref=str_true.split(), hyp=str_pred.split())

            if is_new_epoch:
                break
    finally:
        # Always give the console back and flush the result file
        if log_file is not None:
            sys.stdout = original_stdout
            log_file.close()
def decode_test(session, decode_op, model, dataset, label_type,
                save_path=None):
    """Visualize label outputs of CTC model.

    Utterances are decoded one at a time and a reference/hypothesis pair is
    printed for each of them.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): phone39 or phone48 or phone61 or character or
            character_capital_divide. Any value other than the two
            character types is treated as a phone label type.
        save_path (string, optional): if not None, the output is written to
            `decode.txt` under `model.model_dir` instead of the console.
            NOTE: the value itself is not used as the destination path.
    """
    if label_type == 'character':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/ctc/character.txt')
    elif label_type == 'character_capital_divide':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/ctc/character_capital_divide.txt',
            capital_divide=True)
    else:
        map_fn = Idx2phone(map_file_path='../metrics/mapping_files/ctc/' +
                           label_type + '.txt')

    original_stdout = sys.stdout
    log_file = None
    if save_path is not None:
        log_file = open(join(model.model_dir, 'decode.txt'), 'w')
        sys.stdout = log_file

    try:
        while True:
            # Create feed dictionary for next mini batch
            data, is_new_epoch = dataset.next(batch_size=1)
            inputs, labels_true, inputs_seq_len, input_names = data
            # NOTE: Batch size is expected to be 1

            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_input_pl_list[0]: 1.0,
                model.keep_prob_hidden_pl_list[0]: 1.0,
                model.keep_prob_output_pl_list[0]: 1.0
            }

            # Visualize
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            try:
                labels_pred = sparsetensor2list(labels_pred_st, batch_size=1)
            except IndexError:
                # no output
                labels_pred = ['']

            print('----- wav: %s -----' % input_names[0])
            true_seq = map_fn(labels_true[0])
            pred_seq = map_fn(labels_pred[0])
            if label_type == 'character':
                # '_' is shown as a space for the plain character labels
                true_seq = true_seq.replace('_', ' ')
                pred_seq = pred_seq.replace('_', ' ')

            print('Ref: %s' % true_seq)
            print('Hyp: %s' % pred_seq)

            if is_new_epoch:
                break
    finally:
        # Always give the console back and flush the result file
        if log_file is not None:
            sys.stdout = original_stdout
            log_file.close()
def check(self, encoder_type, label_type='character',
          lstm_impl=None, time_major=True, save_params=False):
    """Train a small CTC model on one generated batch until it converges.

    The model is trained for at most 1000 steps on a fixed mini batch.
    Every 10 steps the label error rate is computed and one decoded
    example is printed; training stops once the error rate is below 0.1.

    Args:
        encoder_type: type of the encoder passed to `CTC`
        label_type (string, optional): character, otherwise phone labels
            are assumed (61 classes)
        lstm_impl (optional): LSTM implementation passed to `CTC`
        time_major (bool, optional): passed to `CTC`
        save_params (bool, optional): if True, save a checkpoint to
            `./model.ckpt` when the model has converged
    """
    separator = '=================================================='
    print(separator)
    print(' encoder_type: %s' % encoder_type)
    print(' label_type: %s' % label_type)
    print(' lstm_impl: %s' % lstm_impl)
    print(' time_major: %s' % str(time_major))
    print(' save_params: %s' % str(save_params))
    print(separator)

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        cnn_like_encoders = ('vgg_blstm', 'vgg_lstm', 'cnn_zhang',
                             'vgg_wang', 'resnet_wang', 'cldnn_wang')
        if encoder_type in cnn_like_encoders:
            splice = 11
        else:
            splice = 1
        num_stack = 2
        inputs, labels, inputs_seq_len = generate_data(
            label_type=label_type,
            model='ctc',
            batch_size=batch_size,
            num_stack=num_stack,
            splice=splice)
        # NOTE: input_size must be even number when using CudnnLSTM

        # Define model graph
        if label_type == 'character':
            num_classes = 27
        else:
            num_classes = 61
        feature_dim = inputs[0].shape[-1] // splice // num_stack
        model = CTC(encoder_type=encoder_type,
                    input_size=feature_dim,
                    splice=splice,
                    num_stack=num_stack,
                    num_units=256,
                    num_layers=2,
                    num_classes=num_classes,
                    lstm_impl=lstm_impl,
                    parameter_init=0.1,
                    clip_grad_norm=5.0,
                    clip_activation=50,
                    num_proj=256,
                    weight_decay=1e-10,
                    # bottleneck_dim=50,
                    bottleneck_dim=None,
                    time_major=time_major)

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        inputs_pl = model.inputs_pl_list[0]
        labels_pl = model.labels_pl_list[0]
        seq_len_pl = model.inputs_seq_len_pl_list[0]
        keep_prob_pl = model.keep_prob_pl_list[0]
        loss_op, logits = model.compute_loss(
            inputs_pl, labels_pl, seq_len_pl, keep_prob_pl)
        train_op = model.train(loss_op,
                               optimizer='nestrov',
                               learning_rate=learning_rate_pl)
        # NOTE: Adam does not run on CudnnLSTM
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=20)
        ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])

        # Define learning rate controller
        learning_rate = 1e-4
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=50,
                                   decay_rate=0.9,
                                   decay_patient_epoch=10,
                                   lower_better=True)

        if save_params:
            # Create a saver for writing training checkpoints
            saver = tf.train.Saver(max_to_keep=None)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        if lstm_impl != 'CudnnLSTM':
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for name in sorted(parameters_dict):
                print("%s %d" % (name, parameters_dict[name]))
            num_millions = "{:,}".format(total_parameters / 1000000)
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict), num_millions))

        # Make feed dict
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels,
                                                       padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 1.0,
            learning_rate_pl: learning_rate
        }

        idx2phone = Idx2phone(map_file_path='./phone61.txt')

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 1000
            start_time_step = time.time()
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Everything below only runs on every 10th step
                if (step + 1) % 10 != 0:
                    continue

                # Change to evaluation mode
                feed_dict[model.keep_prob_pl_list[0]] = 1.0

                # Compute accuracy
                ler_train = sess.run(ler_op, feed_dict=feed_dict)

                duration_step = time.time() - start_time_step
                print('Step %d: loss = %.3f / ler = %.3f (%.3f sec) / lr = %.5f' %
                      (step + 1, loss_train, ler_train, duration_step,
                       learning_rate))
                start_time_step = time.time()

                # Decode
                labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)

                # Visualize
                idx2label = idx2alpha if label_type == 'character' else idx2phone
                try:
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size=batch_size)
                    print('Ref: %s' % idx2label(labels[0]))
                    print('Hyp: %s' % idx2label(labels_pred[0]))
                except IndexError:
                    # NOTE: This is for no prediction
                    print('Ref: %s' % idx2label(labels[0]))
                    print('Hyp: %s' % '')

                if ler_train < 0.1:
                    print('Modle is Converged.')
                    if save_params:
                        # Save model (check point)
                        checkpoint_file = './model.ckpt'
                        save_path = saver.save(sess, checkpoint_file,
                                               global_step=2)
                        print("Model saved in file: %s" % save_path)
                    break

                # Update learning rate
                learning_rate = lr_controller.decay_lr(
                    learning_rate=learning_rate,
                    epoch=step,
                    value=ler_train)
                feed_dict[learning_rate_pl] = learning_rate
def do_eval_cer(session, decode_ops, model, dataset, label_type,
                train_data_size, is_test=False, eval_batch_size=None,
                progressbar=False, is_multitask=False, is_main=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_ops (list): operations for decoding (one per device)
        model: the model to evaluate
        dataset: An instance of `Dataset` class
        label_type (string): a label type containing `kanji` or `kana`
            (e.g. kanji, kanji_divide, kana_divide)
        train_data_size (string): train_subset or train_fullset. Only used
            to select the mapping file of kanji label types.
        is_test (bool, optional): set to True when evaluating by the test set
        eval_batch_size (int, optional): the batch size when evaluating the
            model
        progressbar (bool, optional): if True, visualize progressbar
        is_multitask (bool, optional): currently unused
        is_main (bool, optional): currently unused
    Return:
        cer_mean (float): An average of CER over the utterances that were
            not skipped
    Raises:
        TypeError: if `label_type` contains neither `kanji` nor `kana`
        ZeroDivisionError: if every utterance was skipped
    """
    # NOTE: add multitask version

    assert isinstance(decode_ops, list), "decode_ops must be a list."

    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    try:
        if 'kanji' in label_type:
            map_file_path = '../metrics/mapping_files/' + \
                label_type + '_' + train_data_size + '.txt'
        elif 'kana' in label_type:
            map_file_path = '../metrics/mapping_files/' + label_type + '.txt'
        else:
            raise TypeError(
                'label_type must contain kanji or kana, got %s' %
                str(label_type))

        idx2char = Idx2char(map_file_path=map_file_path)

        cer_sum = 0
        skip_data_num = 0
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {}
            for i_device in range(len(decode_ops)):
                feed_dict[model.inputs_pl_list[i_device]] = inputs[i_device]
                feed_dict[model.inputs_seq_len_pl_list[i_device]] = \
                    inputs_seq_len[i_device]
                feed_dict[model.keep_prob_pl_list[i_device]] = 1.0

            labels_pred_st_list = session.run(decode_ops, feed_dict=feed_dict)
            for i_device, labels_pred_st in enumerate(labels_pred_st_list):
                batch_size_device = len(inputs[i_device])
                try:
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size_device)
                    # Accumulate locally so that a batch failing half-way
                    # does not leave partial scores in the total while the
                    # whole batch is removed from the denominator.
                    cer_batch = 0
                    for i_batch in range(batch_size_device):

                        # Convert from list of index to string
                        if is_test:
                            str_true = labels_true[i_device][i_batch][0]
                            # NOTE: transcript may be separated by space('_')
                        else:
                            str_true = idx2char(
                                labels_true[i_device][i_batch])
                        str_pred = idx2char(labels_pred[i_batch])

                        # Remove garbage labels
                        str_true = re.sub(r'[_NZー・]+', '', str_true)
                        str_pred = re.sub(r'[_NZー・]+', '', str_pred)

                        # Compute CER
                        cer_batch += compute_cer(str_pred=str_pred,
                                                 str_true=str_true,
                                                 normalize=True)
                except Exception:
                    print('skipped')
                    skip_data_num += batch_size_device
                    # TODO: Conduct decoding again with batch size 1
                else:
                    cer_sum += cer_batch

                # Advance exactly once per utterance, skipped or not
                if progressbar:
                    pbar.update(batch_size_device)

            if is_new_epoch:
                break

        return cer_sum / (len(dataset) - skip_data_num)
    finally:
        # Register original batch size (also on error paths)
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original
def do_eval_wer(session, decode_ops, model, dataset, train_data_size,
                is_test=False, eval_batch_size=None, progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Word Error Rate.

    Args:
        session: session of training model
        decode_ops: list of operations for decoding (one per device)
        model: the model to evaluate
        dataset: An instance of `Dataset` class
        train_data_size (string): train100h or train460h or train960h
        is_test (bool, optional): set to True when evaluating by the test set
        eval_batch_size (int, optional): the batch size when evaluating the
            model
        progressbar (bool, optional): if True, visualize progressbar
        is_multitask (bool, optional): if True, each mini batch is expected
            to carry an additional (ignored) set of sub-task labels
    Return:
        wer_mean (float): An average of WER over the utterances that were
            not skipped
    Raises:
        ZeroDivisionError: if every utterance was skipped
    """
    assert isinstance(decode_ops, list), "decode_ops must be a list."

    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    try:
        idx2word = Idx2word(
            map_file_path='../metrics/mapping_files/word_' +
            train_data_size + '.txt')

        wer_sum = 0
        skip_data_num = 0
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            if is_multitask:
                inputs, labels_true, _, inputs_seq_len, _ = data
            else:
                inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {}
            for i_device in range(len(decode_ops)):
                feed_dict[model.inputs_pl_list[i_device]] = inputs[i_device]
                feed_dict[model.inputs_seq_len_pl_list[i_device]] = \
                    inputs_seq_len[i_device]
                feed_dict[model.keep_prob_pl_list[i_device]] = 1.0

            labels_pred_st_list = session.run(decode_ops, feed_dict=feed_dict)
            for i_device, labels_pred_st in enumerate(labels_pred_st_list):
                batch_size_device = len(inputs[i_device])
                try:
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size_device)
                    # Accumulate locally so that a batch failing half-way
                    # does not leave partial scores in the total while the
                    # whole batch is removed from the denominator.
                    wer_batch = 0
                    for i_batch in range(batch_size_device):

                        if is_test:
                            str_true = labels_true[i_device][i_batch][0]
                            # NOTE: transcript is separated by space('_')
                        else:
                            str_true = '_'.join(
                                idx2word(labels_true[i_device][i_batch]))
                        str_pred = '_'.join(idx2word(labels_pred[i_batch]))

                        # Compute WER
                        wer_batch += compute_wer(ref=str_true.split('_'),
                                                 hyp=str_pred.split('_'),
                                                 normalize=True)
                        # substitute, insert, delete = wer_align(
                        #     ref=str_true.split(' '),
                        #     hyp=str_pred.split(' '))
                        # print('SUB: %d' % substitute)
                        # print('INS: %d' % insert)
                        # print('DEL: %d' % delete)
                except IndexError:
                    print('skipped')
                    skip_data_num += batch_size_device
                    # TODO: Conduct decoding again with batch size 1
                else:
                    wer_sum += wer_batch

                # Advance exactly once per utterance, skipped or not
                if progressbar:
                    pbar.update(batch_size_device)

            if is_new_epoch:
                break

        return wer_sum / (len(dataset) - skip_data_num)
    finally:
        # Register original batch size (also on error paths)
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original
def do_eval_fmeasure(session, decode_op, model, dataset,
                     eval_batch_size=None, progressbar=False):
    """Evaluate trained model by F-measure.

    Label index 1 is counted as laughter and label index 2 as filler. For
    each utterance only the number of occurrences is compared between the
    hypothesis and the reference (positions are not aligned).

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size (int, optional): the batch size when evaluating the
            model
        progressbar (bool, optional): if True, visualize the progressbar
    Return:
        fmean (float): mean of f-measure of laughter and filler
        df_acc (pd.DataFrame): precision, recall and F-measure (rows) of
            Laughter, Filler and their Mean (columns)
    """
    def _tally(counts, detected_num, true_num):
        """Add one utterance to the [tp, fp, fn] counters in place."""
        if detected_num <= true_num:
            counts[0] += detected_num
            counts[2] += true_num - detected_num
        else:
            counts[0] += true_num
            counts[1] += detected_num - true_num

    def _precision_recall_f(counts):
        """Return [precision, recall, F-measure], 0 for empty denominators."""
        tp, fp, fn = counts
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        if (recall + precision) != 0:
            f_measure = 2 * recall * precision / (recall + precision)
        else:
            f_measure = 0
        return [precision, recall, f_measure]

    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # Float counters keep the divisions below true divisions
    counts_l = [0., 0., 0.]
    counts_f = [0., 0., 0.]
    try:
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]

            # Decode
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            labels_pred = sparsetensor2list(labels_pred_st, batch_size)

            for i_batch in range(batch_size):
                pred = np.array(labels_pred[i_batch])
                detected_l_num = np.sum(pred == 1)
                detected_f_num = np.sum(pred == 2)
                true_l_num = np.sum(labels_true[0][i_batch] == 1)
                true_f_num = np.sum(labels_true[0][i_batch] == 2)

                # Laughter
                _tally(counts_l, detected_l_num, true_l_num)

                # Filler
                _tally(counts_f, detected_f_num, true_f_num)

                if progressbar:
                    pbar.update(1)

            if is_new_epoch:
                break
    finally:
        # Register original batch size (also on error paths)
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original

    # Compute F-measure
    acc_l = _precision_recall_f(counts_l)
    acc_f = _precision_recall_f(counts_f)
    mean = [(acc_l[0] + acc_f[0]) / 2.,
            (acc_l[1] + acc_f[1]) / 2.,
            (acc_l[2] + acc_f[2]) / 2.]

    df_acc = pd.DataFrame({
        'Laughter': acc_l,
        'Filler': acc_f,
        'Mean': mean
    },
        columns=['Laughter', 'Filler', 'Mean'],
        index=['Precision', 'Recall', 'F-measure'])
    # print(df_acc)

    return mean[2], df_acc
def do_eval_wer(session, decode_ops, model, dataset, train_data_size,
                is_test=False, eval_batch_size=None, progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Word Error Rate.

    Args:
        session: session of training model
        decode_ops: list of operations for decoding (one per device)
        model: the model to evaluate
        dataset: An instance of `Dataset` class
        train_data_size (string): train100h or train460h or train960h
        is_test (bool, optional): set to True when evaluating by the test set
        eval_batch_size (int, optional): the batch size when evaluating the
            model
        progressbar (bool, optional): if True, visualize progressbar
        is_multitask (bool, optional): if True, each mini batch is expected
            to carry an additional (ignored) set of sub-task labels
    Return:
        wer_mean (float): An average of WER over the utterances that were
            not skipped
    Raises:
        ZeroDivisionError: if every utterance was skipped
    """
    assert isinstance(decode_ops, list), "decode_ops must be a list."

    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    try:
        idx2word = Idx2word(map_file_path='../metrics/mapping_files/word_' +
                            train_data_size + '.txt')

        total_wer = 0
        skip_data_num = 0
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            if is_multitask:
                inputs, labels_true, _, inputs_seq_len, _ = data
            else:
                inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {}
            for i_device in range(len(decode_ops)):
                feed_dict[model.inputs_pl_list[i_device]] = inputs[i_device]
                feed_dict[model.inputs_seq_len_pl_list[i_device]] = \
                    inputs_seq_len[i_device]
                feed_dict[model.keep_prob_pl_list[i_device]] = 1.0

            labels_pred_st_list = session.run(decode_ops, feed_dict=feed_dict)
            for i_device, labels_pred_st in enumerate(labels_pred_st_list):
                batch_size_device = len(inputs[i_device])
                try:
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size_device)
                    # Scores are committed only when the whole batch has
                    # been processed; otherwise a skipped batch would keep
                    # partial scores but lose all of its utterances in the
                    # denominator.
                    batch_wer = 0
                    for i_batch in range(batch_size_device):

                        if is_test:
                            str_true = labels_true[i_device][i_batch][0]
                            # NOTE: transcript is separated by space('_')
                        else:
                            str_true = '_'.join(
                                idx2word(labels_true[i_device][i_batch]))
                        str_pred = '_'.join(idx2word(labels_pred[i_batch]))

                        # Compute WER
                        batch_wer += compute_wer(ref=str_true.split('_'),
                                                 hyp=str_pred.split('_'),
                                                 normalize=True)
                        # substitute, insert, delete = wer_align(
                        #     ref=str_true.split(' '),
                        #     hyp=str_pred.split(' '))
                        # print('SUB: %d' % substitute)
                        # print('INS: %d' % insert)
                        # print('DEL: %d' % delete)
                except IndexError:
                    print('skipped')
                    skip_data_num += batch_size_device
                    # TODO: Conduct decoding again with batch size 1
                else:
                    total_wer += batch_wer

                # Advance exactly once per utterance, skipped or not
                if progressbar:
                    pbar.update(batch_size_device)

            if is_new_epoch:
                break

        return total_wer / (len(dataset) - skip_data_num)
    finally:
        # Register original batch size (also on error paths)
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original
def decode(session, decode_op, model, dataset, label_type, ss_type,
           is_test=False, eval_batch_size=None, save_path=None):
    """Visualize label outputs of CTC model.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): kana
        ss_type (string): remove or insert_left or insert_both or
            insert_right
        is_test (bool, optional): set to True when evaluating by the test
            set. In that case the reference is printed as-is instead of
            being mapped from indices.
        eval_batch_size (int, optional): the batch size when evaluating the
            model
        save_path (string, optional): if not None, the output is written to
            `decode.txt` under `model.model_dir` instead of the console.
            NOTE: the value itself is not used as the destination path.
    """
    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    original_stdout = sys.stdout
    log_file = None
    try:
        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/' + label_type + '_' +
            ss_type + '.txt')

        if save_path is not None:
            log_file = open(join(model.model_dir, 'decode.txt'), 'w')
            sys.stdout = log_file

        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, input_names = data

            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            try:
                labels_pred = sparsetensor2list(labels_pred_st,
                                                batch_size=batch_size)
            except IndexError:
                # no output: one empty hypothesis per utterance so that the
                # loop below can index every element of the batch
                labels_pred = [''] * batch_size

            for i_batch in range(batch_size):
                print('----- wav: %s -----' % input_names[0][i_batch])
                if is_test:
                    str_true = labels_true[0][i_batch][0]
                else:
                    str_true = idx2char(labels_true[0][i_batch])
                str_pred = idx2char(labels_pred[i_batch])

                print('Ref: %s' % str_true)
                print('Hyp: %s' % str_pred)

            if is_new_epoch:
                break
    finally:
        # Always give the console back and flush the result file
        if log_file is not None:
            sys.stdout = original_stdout
            log_file.close()

        # Register original batch size (also on error paths)
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original
def do_eval_cer(session, decode_op, model, dataset, label_type, ss_type,
                is_test=False, eval_batch_size=None, progressbar=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): kana
        ss_type (string): remove or insert_left or insert_both or
            insert_right
        is_test (bool, optional): set to True when evaluating by the test set
        eval_batch_size (int, optional): the batch size when evaluating the
            model
        progressbar (bool, optional): if True, visualize the progressbar
    Return:
        cer_mean (float): An average of CER over the utterances that were
            not skipped
    Raises:
        ZeroDivisionError: if every utterance was skipped
    """
    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    try:
        idx2char = Idx2char(
            map_file_path='../metrics/mapping_files/' + label_type + '_' +
            ss_type + '.txt')

        cer_sum = 0
        skip_data_num = 0
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)

            try:
                labels_pred = sparsetensor2list(labels_pred_st, batch_size)
                # Accumulate locally so that a batch failing half-way does
                # not leave partial scores in the total while the whole
                # batch is removed from the denominator.
                cer_batch = 0
                for i_batch in range(batch_size):

                    # Convert from list of index to string
                    if is_test:
                        str_true = labels_true[0][i_batch][0]
                        # NOTE: transcript is separated by space('_')
                    else:
                        str_true = idx2char(
                            labels_true[0][i_batch],
                            padded_value=dataset.padded_value)
                    str_pred = idx2char(labels_pred[i_batch])

                    # Remove garbage labels
                    str_true = re.sub(r'[_NZLFBDlfbdー]+', '', str_true)
                    str_pred = re.sub(r'[_NZLFBDlfbdー]+', '', str_pred)

                    # Compute CER
                    cer_batch += compute_cer(str_pred=str_pred,
                                             str_true=str_true,
                                             normalize=True)
            except Exception:
                print('skipped')
                skip_data_num += batch_size
                # TODO: Conduct decoding again with batch size 1
            else:
                cer_sum += cer_batch

            # Advance exactly once per utterance, skipped or not
            if progressbar:
                pbar.update(batch_size)

            if is_new_epoch:
                break

        return cer_sum / (len(dataset) - skip_data_num)
    finally:
        # Register original batch size (also on error paths)
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original
def do_eval_fmeasure(session, decode_op, model, dataset, label_type, ss_type,
                     is_test=False, eval_batch_size=None, progressbar=False):
    """Evaluate trained model by F-measure.

    The social signals are identified by the characters `L` (laughter),
    `F` (filler), `B` (backchannel) and `D` (disfluency) in the decoded
    strings. For each utterance only the number of occurrences is compared
    between the hypothesis and the reference (positions are not aligned).

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): kana
        ss_type (string): remove or insert_left or insert_both or
            insert_right
        is_test (bool, optional): set to True when evaluating by the test set
        eval_batch_size (int, optional): the batch size when evaluating the
            model
        progressbar (bool, optional): if True, visualize the progressbar
    Returns:
        df_acc (pd.DataFrame): precision, recall and F-measure (rows) of
            Laughter, Filler, Backchannel, Disfluency and their Mean
            (columns)
    """
    def _precision_recall_f(counts):
        """Return [precision, recall, F-measure], 0 for empty denominators."""
        tp, fp, fn = counts
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        if (recall + precision) != 0:
            f_measure = 2 * recall * precision / (recall + precision)
        else:
            f_measure = 0
        return [precision, recall, f_measure]

    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # [tp, fp, fn] counters per social signal symbol
    symbols = ['L', 'F', 'B', 'D']
    counts = dict((symbol, [0., 0., 0.]) for symbol in symbols)

    try:
        idx2char = Idx2char(map_file_path='../metrics/mapping_files/' +
                            label_type + '_' + ss_type + '.txt')

        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)

            try:
                labels_pred = sparsetensor2list(labels_pred_st, batch_size)
                for i_batch in range(batch_size):

                    # Convert from list of index to string
                    if is_test:
                        str_true = labels_true[0][i_batch][0]
                        # NOTE: transcript is separated by space('_')
                    else:
                        str_true = idx2char(
                            labels_true[0][i_batch],
                            padded_value=dataset.padded_value)
                    str_pred = idx2char(labels_pred[i_batch])

                    for symbol in symbols:
                        detected_num = str_pred.count(symbol)
                        true_num = str_true.count(symbol)
                        tally = counts[symbol]
                        if detected_num <= true_num:
                            tally[0] += detected_num
                            tally[2] += true_num - detected_num
                        else:
                            tally[0] += true_num
                            tally[1] += detected_num - true_num
            except Exception:
                print('skipped')

            # Advance exactly once per utterance, skipped or not
            if progressbar:
                pbar.update(batch_size)

            if is_new_epoch:
                break
    finally:
        # Register original batch size (also on error paths)
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original

    acc_lau = _precision_recall_f(counts['L'])
    acc_fil = _precision_recall_f(counts['F'])
    acc_bac = _precision_recall_f(counts['B'])
    acc_dis = _precision_recall_f(counts['D'])
    mean = [(acc_lau[i] + acc_fil[i] + acc_bac[i] + acc_dis[i]) / 4.
            for i in range(3)]

    df_acc = pd.DataFrame(
        {
            'Laughter': acc_lau,
            'Filler': acc_fil,
            'Backchannel': acc_bac,
            'Disfluency': acc_dis,
            'Mean': mean
        },
        columns=['Laughter', 'Filler', 'Backchannel', 'Disfluency', 'Mean'],
        index=['Precision', 'Recall', 'F-measure'])

    return df_acc
def decode(session, decode_op_main, decode_op_sub, model, dataset,
           label_type_main, label_type_sub, is_test=True, save_path=None):
    """Visualize label outputs of Multi-task CTC model.

    Args:
        session: session of training model
        decode_op_main: operation for decoding in the main task
        decode_op_sub: operation for decoding in the sub task
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type_main (string): character or character_capital_divide
        label_type_sub (string): phone39 or phone48 or phone61
        is_test (bool, optional): if True, the references are printed as-is
            instead of being mapped from indices
        save_path (string, optional): if not None, the output is written to
            `decode.txt` under `model.model_dir` instead of the console.
            NOTE: the value itself is not used as the destination path.
    """
    idx2char = Idx2char(map_file_path='../metrics/mapping_files/' +
                        label_type_main + '.txt')
    idx2phone = Idx2phone(map_file_path='../metrics/mapping_files/' +
                          label_type_sub + '.txt')

    original_stdout = sys.stdout
    log_file = None
    if save_path is not None:
        log_file = open(join(model.model_dir, 'decode.txt'), 'w')
        sys.stdout = log_file

    try:
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            (inputs, labels_true_char, labels_true_phone,
             inputs_seq_len, input_names) = data

            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]
            labels_pred_char_st, labels_pred_phone_st = session.run(
                [decode_op_main, decode_op_sub], feed_dict=feed_dict)

            # On "no output" fall back to one empty hypothesis per utterance
            # so that the loop below can index every element of the batch.
            try:
                labels_pred_char = sparsetensor2list(labels_pred_char_st,
                                                     batch_size=batch_size)
            except Exception:
                # no output
                labels_pred_char = [''] * batch_size
            try:
                # The phone hypotheses come from the sub task output
                labels_pred_phone = sparsetensor2list(labels_pred_phone_st,
                                                      batch_size=batch_size)
            except Exception:
                # no output
                labels_pred_phone = [''] * batch_size

            for i_batch in range(batch_size):
                print('----- wav: %s -----' % input_names[0][i_batch])

                if is_test:
                    str_true_char = labels_true_char[0][i_batch][0].replace(
                        '_', ' ')
                    str_true_phone = labels_true_phone[0][i_batch][0]
                else:
                    str_true_char = idx2char(labels_true_char[0][i_batch])
                    str_true_phone = idx2phone(labels_true_phone[0][i_batch])

                str_pred_char = idx2char(labels_pred_char[i_batch])
                str_pred_phone = idx2phone(labels_pred_phone[i_batch])

                print('Ref (char): %s' % str_true_char)
                print('Hyp (char): %s' % str_pred_char)
                print('Ref (phone): %s' % str_true_phone)
                print('Hyp (phone): %s' % str_pred_phone)

            if is_new_epoch:
                break
    finally:
        # Always give the console back and flush the result file
        if log_file is not None:
            sys.stdout = original_stdout
            log_file.close()
def do_eval_cer(session, decode_op, model, dataset, label_type, ss_type,
                is_test=False, eval_batch_size=None, progressbar=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): kana
        ss_type (string): remove or insert_left or insert_both or
            insert_right
        is_test (bool, optional): set to True when evaluating by the test set
        eval_batch_size (int, optional): the batch size when evaluating the
            model. While evaluating, `dataset.batch_size` is set to this
            value; the original value is restored before returning, also
            when an exception propagates.
        progressbar (bool, optional): if True, visualize the progressbar
    Return:
        cer_mean (float): An average of CER over the utterances that were
            not skipped
    Raises:
        ZeroDivisionError: if every mini-batch was skipped
    """
    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    try:
        idx2char = Idx2char(map_file_path='../metrics/mapping_files/' +
                            label_type + '_' + ss_type + '.txt')

        cer_mean = 0
        skip_data_num = 0
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:
            # Create feed dictionary for next mini batch
            inputs, labels_true, inputs_seq_len, _ = data
            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_pl_list[0]: 1.0
            }

            batch_size = inputs[0].shape[0]
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)

            # Accumulate per batch and commit only on success, so that a
            # batch that fails midway (and is removed from the denominator
            # below) does not leave partial sums in the numerator.
            cer_batch = 0
            try:
                labels_pred = sparsetensor2list(labels_pred_st, batch_size)
                for i_batch in range(batch_size):
                    # Convert from list of index to string
                    if is_test:
                        str_true = labels_true[0][i_batch][0]
                        # NOTE: transcript is separated by space('_')
                    else:
                        str_true = idx2char(
                            labels_true[0][i_batch],
                            padded_value=dataset.padded_value)
                    str_pred = idx2char(labels_pred[i_batch])

                    # Remove garbage labels
                    str_true = re.sub(r'[_NZLFBDlfbdー]+', '', str_true)
                    str_pred = re.sub(r'[_NZLFBDlfbdー]+', '', str_pred)

                    # Compute CER
                    cer_batch += compute_cer(str_pred=str_pred,
                                             str_true=str_true,
                                             normalize=True)
            except IndexError:
                print('skipped')
                skip_data_num += batch_size
                # TODO: Conduct decoding again with batch size 1
            else:
                cer_mean += cer_batch

            # One update per mini-batch keeps the bar total consistent on
            # both the success and the skipped path.
            if progressbar:
                pbar.update(batch_size)

            if is_new_epoch:
                break

        cer_mean /= (len(dataset) - skip_data_num)
    finally:
        # Register original batch size
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original

    return cer_mean
def check(self, encoder_type, lstm_impl, time_major=False):
    """Train a small multi-task CTC model on one generated mini-batch.

    Builds a `MultitaskCTC` graph, trains it for at most 1000 steps on a
    fixed mini-batch and, every 10 steps, prints the loss, both label error
    rates and the first reference/hypothesis pair of each task. Training
    stops early once the main-task error rate drops below 0.1.

    Args:
        encoder_type: encoder type passed to `MultitaskCTC`
        lstm_impl: LSTM implementation passed to `MultitaskCTC`
        time_major (bool, optional): passed to `MultitaskCTC`
    """
    print('==================================================')
    print(' encoder_type: %s' % str(encoder_type))
    print(' lstm_impl: %s' % str(lstm_impl))
    print(' time_major: %s' % str(time_major))
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        inputs, labels_char, labels_phone, inputs_seq_len = generate_data(
            label_type='multitask',
            model='ctc',
            batch_size=batch_size)

        # Define model graph
        num_classes_main = 27
        num_classes_sub = 61
        model = MultitaskCTC(
            encoder_type=encoder_type,
            input_size=inputs[0].shape[1],
            num_units=256,
            num_layers_main=2,
            num_layers_sub=1,
            num_classes_main=num_classes_main,
            num_classes_sub=num_classes_sub,
            main_task_weight=0.8,
            lstm_impl=lstm_impl,
            parameter_init=0.1,
            clip_grad_norm=5.0,
            clip_activation=50,
            num_proj=256,
            weight_decay=1e-8,
            # bottleneck_dim=50,
            bottleneck_dim=None,
            time_major=time_major)

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        loss_op, logits_main, logits_sub = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.labels_sub_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        train_op = model.train(
            loss_op,
            optimizer='adam',
            learning_rate=learning_rate_pl)
        decode_op_main, decode_op_sub = model.decoder(
            logits_main,
            logits_sub,
            model.inputs_seq_len_pl_list[0],
            beam_width=20)
        ler_op_main, ler_op_sub = model.compute_ler(
            decode_op_main, decode_op_sub,
            model.labels_pl_list[0], model.labels_sub_pl_list[0])

        # Define learning rate controller
        learning_rate = 1e-3
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=20,
                                   decay_rate=0.9,
                                   decay_patient_epoch=5,
                                   lower_better=True)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" %
                  (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels_char,
                                                       padded_value=-1),
            model.labels_sub_pl_list[0]: list2sparsetensor(labels_phone,
                                                           padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 0.9,
            learning_rate_pl: learning_rate
        }

        idx2phone = Idx2phone(map_file_path='./phone61.txt')

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 1000
            start_time_step = time.time()
            for step in range(max_steps):

                # Compute loss
                _, loss_train = sess.run(
                    [train_op, loss_op], feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(model.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode
                    # NOTE(review): keep_prob is never set back to 0.9, so
                    # all later training steps also run with keep_prob 1.0
                    # -- confirm this is intended.
                    feed_dict[model.keep_prob_pl_list[0]] = 1.0

                    # Compute accuracy
                    ler_train_char, ler_train_phone = sess.run(
                        [ler_op_main, ler_op_sub], feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / cer = %.3f / per = %.3f (%.3f sec) / lr = %.5f' %
                          (step + 1, loss_train, ler_train_char,
                           ler_train_phone, duration_step, learning_rate))
                    start_time_step = time.time()

                    # Visualize
                    labels_pred_char_st, labels_pred_phone_st = sess.run(
                        [decode_op_main, decode_op_sub],
                        feed_dict=feed_dict)
                    labels_pred_char = sparsetensor2list(
                        labels_pred_char_st, batch_size=batch_size)
                    labels_pred_phone = sparsetensor2list(
                        labels_pred_phone_st, batch_size=batch_size)

                    # The hypothesis is resolved before printing so that
                    # the header and the reference are printed exactly once
                    # even when there is no prediction.
                    print('Character')
                    print(' Ref: %s' % idx2alpha(labels_char[0]))
                    try:
                        hyp_char = idx2alpha(labels_pred_char[0])
                    except IndexError:
                        # NOTE: This is for no prediction
                        hyp_char = ''
                    print(' Hyp: %s' % hyp_char)

                    print('Phone')
                    print(' Ref: %s' % idx2phone(labels_phone[0]))
                    try:
                        hyp_phone = idx2phone(labels_pred_phone[0])
                    except IndexError:
                        # NOTE: This is for no prediction
                        hyp_phone = ''
                    print(' Hyp: %s' % hyp_phone)
                    print('-' * 30)

                    if ler_train_char < 0.1:
                        print('Modle is Converged.')
                        break

                    # Update learning rate
                    learning_rate = lr_controller.decay_lr(
                        learning_rate=learning_rate,
                        epoch=step,
                        value=ler_train_char)
                    feed_dict[learning_rate_pl] = learning_rate
def decode(session, decode_op_main, decode_op_sub, model, dataset,
           train_data_size, label_type_main, label_type_sub,
           is_test=True, save_path=None):
    """Visualize label outputs of Multi-task CTC model.

    Runs both decoders on every mini-batch of `dataset` and prints, for each
    utterance, the word-level and character-level reference and hypothesis.

    Args:
        session: session of training model
        decode_op_main: operation for decoding in the main task
        decode_op_sub: operation for decoding in the sub task
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        train_data_size (string): train100h or train460h or train960h.
            Selects the word mapping file.
        label_type_main (string): word
        label_type_sub (string): character or character_capital_divide
        is_test (bool, optional): set to True when evaluating by the test set
        save_path (string, optional): if not None, the results are written
            to `decode.txt` under `model.model_dir` instead of the console
    """
    idx2word = Idx2word(
        map_file_path='../metrics/mapping_files/word_' + train_data_size + '.txt')
    idx2char = Idx2char(
        map_file_path='../metrics/mapping_files/' + label_type_sub + '.txt')

    if save_path is not None:
        # NOTE: stdout stays redirected to this file after the function
        # returns; callers relying on that side effect keep working.
        sys.stdout = open(join(model.model_dir, 'decode.txt'), 'w')

    for data, is_new_epoch in dataset:
        # Create feed dictionary for next mini batch
        (inputs, labels_true_word, labels_true_char,
         inputs_seq_len, input_names) = data
        feed_dict = {
            model.inputs_pl_list[0]: inputs[0],
            model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
            model.keep_prob_hidden_pl_list[0]: 1.0
        }

        # Decode
        batch_size = inputs[0].shape[0]
        labels_pred_st_word, labels_pred_st_char = session.run(
            [decode_op_main, decode_op_sub], feed_dict=feed_dict)

        # An IndexError from the conversion means the decoder emitted no
        # label at all; an empty hypothesis is printed in that case.
        no_output_word = False
        try:
            labels_pred_word = sparsetensor2list(
                labels_pred_st_word, batch_size=batch_size)
        except IndexError:
            # no output
            no_output_word = True

        no_output_char = False
        try:
            labels_pred_char = sparsetensor2list(
                labels_pred_st_char, batch_size=batch_size)
        except IndexError:
            # no output
            no_output_char = True

        # Visualize
        for i_batch in range(batch_size):
            print('----- wav: %s -----' % input_names[0][i_batch])

            if is_test:
                str_true_word = labels_true_word[0][i_batch][0]
                str_true_char = labels_true_char[0][i_batch][0]
            else:
                str_true_word = '_'.join(
                    idx2word(labels_true_word[0][i_batch]))
                str_true_char = idx2char(labels_true_char[0][i_batch])

            # Each utterance gets its own hypothesis (index i_batch)
            if no_output_word:
                str_pred_word = ''
            else:
                str_pred_word = '_'.join(
                    idx2word(labels_pred_word[i_batch]))

            if no_output_char:
                str_pred_char = ''
            else:
                str_pred_char = idx2char(labels_pred_char[i_batch])

            print('Ref (word): %s' % str_true_word)
            print('Ref (char): %s' % str_true_char)
            print('Hyp (word): %s' % str_pred_word)
            print('Hyp (char): %s' % str_pred_char)

        if is_new_epoch:
            break
def _count_detections(detected_num, true_num):
    """Return (tp, fp, fn) for one event type from its two counts."""
    if detected_num <= true_num:
        return detected_num, 0, true_num - detected_num
    return true_num, detected_num - true_num, 0


def _precision_recall_fmeasure(tp, fp, fn):
    """Return (precision, recall, f-measure); 0 where a ratio is undefined."""
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    fmeasure = 2 * recall * precision / \
        (recall + precision) if (recall + precision) != 0 else 0
    return precision, recall, fmeasure


def do_eval_fmeasure(session, decode_op, model, dataset,
                     eval_batch_size=None, progressbar=False):
    """Evaluate trained model by F-measure.

    Label index 1 is counted as laughter and label index 2 as filler. For
    each utterance the number of predicted events is compared with the
    number of true events: the smaller count is taken as true positives and
    the difference as false positives or false negatives.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size (int, optional): accepted for interface
            compatibility; the mini-batch size is always taken from the
            data yielded by `dataset`
        progressbar (bool, optional): if True, visualize the progressbar
    Return:
        fmean (float): mean of f-measure of laughter and filler
        df_acc (pandas.DataFrame): precision, recall and f-measure (rows)
            for Laughter, Filler and their Mean (columns)
    """
    # Reset data counter
    dataset.reset()

    tp_l, fp_l, fn_l = 0, 0, 0
    tp_f, fp_f, fn_f = 0, 0, 0
    if progressbar:
        pbar = tqdm(total=len(dataset))
    for data, is_new_epoch in dataset:
        # Create feed dictionary for next mini batch
        inputs, labels_true, inputs_seq_len, _ = data
        feed_dict = {
            model.inputs_pl_list[0]: inputs[0],
            model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
            model.keep_prob_pl_list[0]: 1.0
        }

        batch_size = inputs[0].shape[0]

        # Decode
        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size)

        for i_batch in range(batch_size):
            detected_l_num = np.sum(np.array(labels_pred[i_batch]) == 1)
            detected_f_num = np.sum(np.array(labels_pred[i_batch]) == 2)
            true_l_num = np.sum(labels_true[0][i_batch] == 1)
            true_f_num = np.sum(labels_true[0][i_batch] == 2)

            # Laughter
            tp, fp, fn = _count_detections(detected_l_num, true_l_num)
            tp_l += tp
            fp_l += fp
            fn_l += fn

            # Filler
            tp, fp, fn = _count_detections(detected_f_num, true_f_num)
            tp_f += tp
            fp_f += fp
            fn_f += fn

            if progressbar:
                pbar.update(1)

        if is_new_epoch:
            break

    # Compute F-measure
    p_l, r_l, f_l = _precision_recall_fmeasure(tp_l, fp_l, fn_l)
    p_f, r_f, f_f = _precision_recall_fmeasure(tp_f, fp_f, fn_f)

    acc_l = [p_l, r_l, f_l]
    acc_f = [p_f, r_f, f_f]
    mean = [(p_l + p_f) / 2., (r_l + r_f) / 2., (f_l + f_f) / 2.]

    df_acc = pd.DataFrame({'Laughter': acc_l, 'Filler': acc_f, 'Mean': mean},
                          columns=['Laughter', 'Filler', 'Mean'],
                          index=['Precision', 'Recall', 'F-measure'])

    return mean[2], df_acc
def decode(session, decode_op, model, dataset, label_type, train_data_size,
           is_test=True, save_path=None):
    """Print reference and hypothesis of every utterance for a CTC model.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): character or character_capital_divide or word
        train_data_size (string): train100h or train460h or train960h.
            Only used to select the mapping file when label_type is word.
        is_test (bool, optional): set to True when evaluating by the test set
        save_path (string, optional): if not None, the results are written
            to `decode.txt` under `model.model_dir` instead of the console
    """
    # Select the index -> label converter
    if label_type == 'word':
        map_fn = Idx2word(
            map_file_path='../metrics/mapping_files/word_' + train_data_size + '.txt')
    elif label_type == 'character_capital_divide':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/character_capital_divide.txt',
            capital_divide=True)
    elif label_type == 'character':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/character.txt')

    if save_path is not None:
        sys.stdout = open(join(model.model_dir, 'decode.txt'), 'w')

    for batch, is_last_batch in dataset:
        # Feed the first device only, with dropout disabled
        inputs, labels_true, inputs_seq_len, input_names = batch
        feed_dict = {
            model.inputs_pl_list[0]: inputs[0],
            model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
            model.keep_prob_pl_list[0]: 1.0
        }

        num_utt = inputs[0].shape[0]
        decoded_st = session.run(decode_op, feed_dict=feed_dict)

        # An IndexError here means the decoder produced no label at all
        has_output = True
        try:
            labels_pred = sparsetensor2list(decoded_st, batch_size=num_utt)
        except IndexError:
            has_output = False

        for i_utt in range(num_utt):
            print('----- wav: %s -----' % input_names[0][i_utt])

            # Word-level outputs are token lists and are joined with '_'
            is_char_level = 'char' in label_type

            if is_test:
                str_true = labels_true[0][i_utt][0]
            elif is_char_level:
                str_true = map_fn(labels_true[0][i_utt])
            else:
                str_true = '_'.join(map_fn(labels_true[0][i_utt]))

            if not has_output:
                str_pred = ''
            elif is_char_level:
                str_pred = map_fn(labels_pred[i_utt])
            else:
                str_pred = '_'.join(map_fn(labels_pred[i_utt]))

            print('Ref: %s' % str_true)
            print('Hyp: %s' % str_pred)
            # wer_align(ref=str_true.split(), hyp=str_pred.split())

        if is_last_batch:
            break
def check(self, decoder_type):
    """Decode one generated mini-batch with a restored CTC model.

    Builds a `CTC` graph, restores the latest checkpoint found in the
    current directory and decodes a generated mini-batch with the decoder
    selected by `decoder_type`. The CER, the reference and the hypothesis of
    the first utterance are printed.

    Args:
        decoder_type (string): tf_greedy or tf_beam_search (decode with the
            graph operation) or np_greedy or np_beam_search (decode the
            posteriors with `GreedyDecoder` / `BeamSearchDecoder`)
    Raises:
        ValueError: if no checkpoint is found in './'
    """

    print('==================================================')
    print(' decoder_type: %s' % decoder_type)
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        num_stack = 2
        inputs, labels, inputs_seq_len = generate_data(
            label_type='character',
            model='ctc',
            batch_size=batch_size,
            num_stack=num_stack,
            splice=1)
        # Second axis of the generated inputs; used below to reshape the
        # posteriors
        max_time = inputs.shape[1]

        # Define model graph
        model = CTC(encoder_type='blstm',
                    input_size=inputs[0].shape[-1],
                    splice=1,
                    num_stack=num_stack,
                    num_units=256,
                    num_layers=2,
                    num_classes=27,
                    lstm_impl='LSTMBlockCell',
                    parameter_init=0.1,
                    clip_grad_norm=5.0,
                    clip_activation=50,
                    num_proj=256,
                    weight_decay=1e-6)

        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        # Beam width 1 is used for every non beam-search decoder type
        beam_width = 20 if 'beam_search' in decoder_type else 1
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=beam_width)
        ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
        posteriors_op = model.posteriors(logits, blank_prior=1)

        # NOTE(review): the two decoders get different blank indices
        # (num_classes vs. num_classes - 1) -- confirm both are intended.
        if decoder_type == 'np_greedy':
            decoder = GreedyDecoder(blank_index=model.num_classes)
        elif decoder_type == 'np_beam_search':
            decoder = BeamSearchDecoder(space_index=26,
                                        blank_index=model.num_classes - 1)

        # Make feed dict
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels,
                                                       padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 1.0
        }

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state('./')

            # If check point exists
            if ckpt:
                model_path = ckpt.model_checkpoint_path
                saver.restore(sess, model_path)
                print("Model restored: " + model_path)
            else:
                raise ValueError('There are not any checkpoints.')

            if decoder_type in ['tf_greedy', 'tf_beam_search']:
                # Decode
                labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                labels_pred = sparsetensor2list(
                    labels_pred_st, batch_size=batch_size)

                # Compute accuracy
                cer = sess.run(ler_op, feed_dict=feed_dict)
            else:
                # Compute CTC posteriors
                probs = sess.run(posteriors_op, feed_dict=feed_dict)
                probs = probs.reshape(-1, max_time, model.num_classes)

                # NOTE: any decoder_type other than the two below leaves
                # `labels_pred` unassigned, so the CER computation fails.
                if decoder_type == 'np_greedy':
                    # Decode
                    labels_pred = decoder(probs=probs,
                                          seq_len=inputs_seq_len)
                elif decoder_type == 'np_beam_search':
                    # Decode
                    labels_pred, scores = decoder(probs=probs,
                                                  seq_len=inputs_seq_len,
                                                  beam_width=beam_width)

                # Compute accuracy (first utterance only)
                cer = compute_cer(str_pred=idx2alpha(labels_pred[0]),
                                  str_true=idx2alpha(labels[0]),
                                  normalize=True)

            # Visualize
            print('CER: %.3f %%' % (cer * 100))
            print('Ref: %s' % idx2alpha(labels[0]))
            print('Hyp: %s' % idx2alpha(labels_pred[0]))
def decode(session, decode_op, model, dataset, label_type, train_data_size,
           is_test=True, save_path=None):
    """Print reference and hypothesis of every utterance for a CTC model.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): kanji or kana or kanji_divide or kana_divide
        train_data_size (string): train_subset or train_fullset.
            Only used to select the mapping file of kanji label types.
        is_test (bool, optional): set to True when evaluating by the test set
        save_path (string, optional): if not None, the results are written
            to `decode.txt` under `model.model_dir` instead of the console
    Raises:
        TypeError: if label_type contains neither 'kanji' nor 'kana'
    """
    # Kanji vocabularies depend on the training set size, kana ones do not
    if 'kanji' in label_type:
        vocab_name = label_type + '_' + train_data_size
    elif 'kana' in label_type:
        vocab_name = label_type
    else:
        raise TypeError
    idx2char = Idx2char(
        map_file_path='../metrics/mapping_files/' + vocab_name + '.txt')

    if save_path is not None:
        sys.stdout = open(join(model.model_dir, 'decode.txt'), 'w')

    for batch, is_last_batch in dataset:
        # Feed the first device only, with dropout disabled
        inputs, labels_true, inputs_seq_len, input_names = batch
        feed_dict = {
            model.inputs_pl_list[0]: inputs[0],
            model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
            model.keep_prob_pl_list[0]: 1.0
        }

        num_utt = inputs[0].shape[0]
        decoded_st = session.run(decode_op, feed_dict=feed_dict)

        # An IndexError here means the decoder produced no label at all
        decoded_ok = True
        try:
            labels_pred = sparsetensor2list(decoded_st, batch_size=num_utt)
        except IndexError:
            decoded_ok = False

        for i_utt in range(num_utt):
            print('----- wav: %s -----' % input_names[0][i_utt])

            str_true = (labels_true[0][i_utt][0] if is_test
                        else idx2char(labels_true[0][i_utt]))
            str_pred = idx2char(labels_pred[i_utt]) if decoded_ok else ''

            print('Ref: %s' % str_true)
            print('Hyp: %s' % str_pred)

        if is_last_batch:
            break
def do_eval_per(session, decode_op, per_op, model, dataset, label_type,
                eval_batch_size=None, progressbar=False, is_multitask=False):
    """Evaluate trained model by Phone Error Rate.

    Both the hypothesis (decoded with the training label set) and the
    reference (given in the label set of `dataset`) are mapped to 39 phones
    before the error rate is computed.

    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate (not used here;
            the PER is computed with `compute_per`)
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): phone39 or phone48 or phone61
        eval_batch_size (int, optional): the batch size when evaluating the
            model (accepted for interface compatibility; not applied here)
        progressbar (bool, optional): if True, visualize the progressbar
        is_multitask (bool, optional): if True, evaluate the multitask model
    Returns:
        per_mean (float): An average of PER
    """
    # Reset data counter
    dataset.reset()

    train_label_type = label_type
    eval_label_type = dataset.label_type_sub if is_multitask else dataset.label_type

    # phone2idx_39_map_file_path = '../metrics/mapping_files/ctc/phone39.txt'
    idx2phone_train = Idx2phone(
        map_file_path='../metrics/mapping_files/ctc/' + train_label_type + '.txt')
    idx2phone_eval = Idx2phone(
        map_file_path='../metrics/mapping_files/ctc/' + eval_label_type + '.txt')
    map2phone39_train = Map2phone39(
        label_type=train_label_type,
        map_file_path='../metrics/mapping_files/phone2phone.txt')
    map2phone39_eval = Map2phone39(
        label_type=eval_label_type,
        map_file_path='../metrics/mapping_files/phone2phone.txt')

    per_mean = 0
    if progressbar:
        pbar = tqdm(total=len(dataset))
    for data, is_new_epoch in dataset:

        # Create feed dictionary for next mini batch
        if is_multitask:
            inputs, _, labels_true, inputs_seq_len, _ = data
        else:
            inputs, labels_true, inputs_seq_len, _ = data

        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_input_pl_list[0]: 1.0,
            model.keep_prob_hidden_pl_list[0]: 1.0,
            model.keep_prob_output_pl_list[0]: 1.0
        }

        batch_size_each = len(inputs)

        # Evaluate by 39 phones
        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)

        for i_batch in range(batch_size_each):
            ###############
            # Hypothesis
            ###############
            # Convert from index to phone (-> list of phone strings)
            phone_pred_list = idx2phone_train(
                labels_pred[i_batch]).split(' ')

            # Mapping to 39 phones (-> list of phone strings)
            phone_pred_list = map2phone39_train(phone_pred_list)

            ###############
            # Reference
            ###############
            # Convert from index to phone (-> list of phone strings)
            phone_true_list = idx2phone_eval(
                labels_true[i_batch]).split(' ')

            # Mapping to 39 phones (-> list of phone strings)
            phone_true_list = map2phone39_eval(phone_true_list)

            # Compute PER. The ground truth is the reference and the decoder
            # output the hypothesis, as in the `compute_wer` calls elsewhere
            # in this file.
            per_mean += compute_per(ref=phone_true_list,
                                    hyp=phone_pred_list,
                                    normalize=True)

            if progressbar:
                pbar.update(1)

        if is_new_epoch:
            break

    per_mean /= len(dataset)

    return per_mean
def do_eval_cer(session, decode_ops, model, dataset, label_type,
                is_test=False, eval_batch_size=None, progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_ops: list of operations for decoding (one per device)
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): character or character_capital_divide
        is_test (bool, optional): set to True when evaluating by the test set
        eval_batch_size (int, optional): the batch size when evaluating the
            model. While evaluating, `dataset.batch_size` is set to this
            value; the original value is restored before returning, also
            when an exception propagates.
        progressbar (bool, optional): if True, visualize the progressbar
        is_multitask (bool, optional): if True, evaluate the multitask model
    Return:
        cer_mean (float): An average of CER
        wer_mean (float): An average of WER
    Raises:
        TypeError: if label_type is not one of the supported values
        ZeroDivisionError: if every mini-batch was skipped
    """
    assert isinstance(decode_ops, list), "decode_ops must be a list."

    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    try:
        if label_type == 'character':
            idx2char = Idx2char(
                map_file_path='../metrics/mapping_files/character.txt')
        elif label_type == 'character_capital_divide':
            idx2char = Idx2char(
                map_file_path='../metrics/mapping_files/character_capital_divide.txt',
                capital_divide=True,
                space_mark='_')
        else:
            raise TypeError

        cer_mean, wer_mean = 0, 0
        skip_data_num = 0
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            if is_multitask:
                inputs, _, labels_true, inputs_seq_len, _ = data
            else:
                inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {}
            for i_device in range(len(decode_ops)):
                feed_dict[model.inputs_pl_list[i_device]] = inputs[i_device]
                feed_dict[model.inputs_seq_len_pl_list[i_device]] = \
                    inputs_seq_len[i_device]
                feed_dict[model.keep_prob_pl_list[i_device]] = 1.0

            labels_pred_st_list = session.run(decode_ops, feed_dict=feed_dict)
            for i_device, labels_pred_st in enumerate(labels_pred_st_list):
                batch_size_device = len(inputs[i_device])

                # Accumulate per device batch and commit only on success,
                # so that a batch that fails midway (and is removed from
                # the denominator below) leaves no partial sums behind.
                cer_device, wer_device = 0, 0
                try:
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size_device)
                    for i_batch in range(batch_size_device):

                        # Convert from list of index to string
                        if is_test:
                            str_true = labels_true[i_device][i_batch][0]
                            # NOTE: transcript is separated by space('_')
                        else:
                            str_true = idx2char(
                                labels_true[i_device][i_batch],
                                padded_value=dataset.padded_value)
                        str_pred = idx2char(labels_pred[i_batch])

                        # Remove consecutive spaces
                        str_pred = re.sub(r'[_]+', '_', str_pred)

                        # Remove garbage labels
                        str_true = re.sub(r'[\']+', '', str_true)
                        str_pred = re.sub(r'[\']+', '', str_pred)

                        # Compute WER
                        wer_device += compute_wer(ref=str_true.split('_'),
                                                  hyp=str_pred.split('_'),
                                                  normalize=True)
                        # substitute, insert, delete = wer_align(
                        #     ref=str_pred.split('_'),
                        #     hyp=str_true.split('_'))
                        # print('SUB: %d' % substitute)
                        # print('INS: %d' % insert)
                        # print('DEL: %d' % delete)

                        # Remove spaces
                        str_true = re.sub(r'[_]+', '', str_true)
                        str_pred = re.sub(r'[_]+', '', str_pred)

                        # Compute CER
                        cer_device += compute_cer(str_pred=str_pred,
                                                  str_true=str_true,
                                                  normalize=True)
                except IndexError:
                    print('skipped')
                    skip_data_num += batch_size_device
                    # TODO: Conduct decoding again with batch size 1
                else:
                    cer_mean += cer_device
                    wer_mean += wer_device

                # One update per device batch keeps the bar total
                # consistent on both the success and the skipped path.
                if progressbar:
                    pbar.update(batch_size_device)

            if is_new_epoch:
                break

        cer_mean /= (len(dataset) - skip_data_num)
        wer_mean /= (len(dataset) - skip_data_num)
    finally:
        # Register original batch size
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original

    return cer_mean, wer_mean
def decode(session, decode_op, model, dataset, label_type, train_data_size,
           is_test=True, save_path=None):
    """Show the decoding results of a CTC model utterance by utterance.

    Args:
        session: session of training model
        decode_op: operation for decoding
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): kanji or kana or kanji_divide or kana_divide
        train_data_size (string): train_subset or train_fullset.
            Only used to select the mapping file of kanji label types.
        is_test (bool, optional): set to True when evaluating by the test set
        save_path (string, optional): if not None, the results are written
            to `decode.txt` under `model.model_dir` instead of the console
    Raises:
        TypeError: if label_type contains neither 'kanji' nor 'kana'
    """
    # Reject unknown label types first, then resolve the mapping file
    if 'kanji' not in label_type and 'kana' not in label_type:
        raise TypeError
    if 'kanji' in label_type:
        mapping_file = '../metrics/mapping_files/' + \
            label_type + '_' + train_data_size + '.txt'
    else:
        mapping_file = '../metrics/mapping_files/' + label_type + '.txt'
    idx2char = Idx2char(map_file_path=mapping_file)

    if save_path is not None:
        sys.stdout = open(join(model.model_dir, 'decode.txt'), 'w')

    for mini_batch, epoch_finished in dataset:
        inputs, labels_true, inputs_seq_len, input_names = mini_batch

        # Feed the first device only, with dropout disabled
        feed_dict = {}
        feed_dict[model.inputs_pl_list[0]] = inputs[0]
        feed_dict[model.inputs_seq_len_pl_list[0]] = inputs_seq_len[0]
        feed_dict[model.keep_prob_pl_list[0]] = 1.0

        # Decode
        utt_count = inputs[0].shape[0]
        hyp_st = session.run(decode_op, feed_dict=feed_dict)
        try:
            labels_pred = sparsetensor2list(hyp_st, batch_size=utt_count)
        except IndexError:
            # The decoder produced no label at all
            empty_output = True
        else:
            empty_output = False

        # Visualize
        for idx in range(utt_count):
            print('----- wav: %s -----' % input_names[0][idx])

            if not is_test:
                str_true = idx2char(labels_true[0][idx])
            else:
                str_true = labels_true[0][idx][0]

            if empty_output:
                str_pred = ''
            else:
                str_pred = idx2char(labels_pred[idx])

            print('Ref: %s' % str_true)
            print('Hyp: %s' % str_pred)

        if epoch_finished:
            break
def do_eval_cer(session, decode_ops, model, dataset, label_type,
                is_test=False, eval_batch_size=None, progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_ops: list of operations for decoding (one per device)
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): character or character_capital_divide
        is_test (bool, optional): set to True when evaluating by the test set
        eval_batch_size (int, optional): the batch size when evaluating the
            model. While evaluating, `dataset.batch_size` is set to this
            value; the original value is restored before returning, also
            when an exception propagates.
        progressbar (bool, optional): if True, visualize the progressbar
        is_multitask (bool, optional): if True, evaluate the multitask model
    Return:
        cer_mean (float): An average of CER
        wer_mean (float): An average of WER
    Raises:
        TypeError: if label_type is not one of the supported values
        ZeroDivisionError: if every mini-batch was skipped
    """
    assert isinstance(decode_ops, list), "decode_ops must be a list."

    batch_size_original = dataset.batch_size

    # Reset data counter
    dataset.reset()

    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    try:
        if label_type == 'character':
            idx2char = Idx2char(
                map_file_path='../metrics/mapping_files/character.txt')
        elif label_type == 'character_capital_divide':
            idx2char = Idx2char(
                map_file_path='../metrics/mapping_files/character_capital_divide.txt',
                capital_divide=True,
                space_mark='_')
        else:
            raise TypeError

        cer_mean, wer_mean = 0, 0
        skip_data_num = 0
        if progressbar:
            pbar = tqdm(total=len(dataset))
        for data, is_new_epoch in dataset:

            # Create feed dictionary for next mini batch
            if is_multitask:
                inputs, _, labels_true, inputs_seq_len, _ = data
            else:
                inputs, labels_true, inputs_seq_len, _ = data

            feed_dict = {}
            for i_device in range(len(decode_ops)):
                feed_dict[model.inputs_pl_list[i_device]] = inputs[i_device]
                feed_dict[model.inputs_seq_len_pl_list[i_device]] = \
                    inputs_seq_len[i_device]
                feed_dict[model.keep_prob_pl_list[i_device]] = 1.0

            labels_pred_st_list = session.run(decode_ops, feed_dict=feed_dict)
            for i_device, labels_pred_st in enumerate(labels_pred_st_list):
                batch_size_device = len(inputs[i_device])

                # Per-utterance scores are collected first and added to the
                # running totals only if the whole device batch succeeded;
                # a skipped batch is removed from the denominator below and
                # must not contribute partial sums.
                cer_scores, wer_scores = [], []
                try:
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size_device)
                    for i_batch in range(batch_size_device):

                        # Convert from list of index to string
                        if is_test:
                            str_true = labels_true[i_device][i_batch][0]
                            # NOTE: transcript is separated by space('_')
                        else:
                            str_true = idx2char(
                                labels_true[i_device][i_batch],
                                padded_value=dataset.padded_value)
                        str_pred = idx2char(labels_pred[i_batch])

                        # Remove consecutive spaces
                        str_pred = re.sub(r'[_]+', '_', str_pred)

                        # Remove garbage labels
                        str_true = re.sub(r'[\']+', '', str_true)
                        str_pred = re.sub(r'[\']+', '', str_pred)

                        # Compute WER
                        wer_scores.append(
                            compute_wer(ref=str_true.split('_'),
                                        hyp=str_pred.split('_'),
                                        normalize=True))
                        # substitute, insert, delete = wer_align(
                        #     ref=str_pred.split('_'),
                        #     hyp=str_true.split('_'))
                        # print('SUB: %d' % substitute)
                        # print('INS: %d' % insert)
                        # print('DEL: %d' % delete)

                        # Remove spaces
                        str_true = re.sub(r'[_]+', '', str_true)
                        str_pred = re.sub(r'[_]+', '', str_pred)

                        # Compute CER
                        cer_scores.append(
                            compute_cer(str_pred=str_pred,
                                        str_true=str_true,
                                        normalize=True))
                except IndexError:
                    print('skipped')
                    skip_data_num += batch_size_device
                    # TODO: Conduct decoding again with batch size 1
                else:
                    for cer_utt, wer_utt in zip(cer_scores, wer_scores):
                        cer_mean += cer_utt
                        wer_mean += wer_utt

                # One update per device batch keeps the bar total
                # consistent on both the success and the skipped path.
                if progressbar:
                    pbar.update(batch_size_device)

            if is_new_epoch:
                break

        cer_mean /= (len(dataset) - skip_data_num)
        wer_mean /= (len(dataset) - skip_data_num)
    finally:
        # Register original batch size
        if eval_batch_size is not None:
            dataset.batch_size = batch_size_original

    return cer_mean, wer_mean