def main():
    for data_type in ['train', 'dev', 'test']:
        print('=' * 50)
        print(' ' * 20 + data_type)
        print('=' * 50)

        # Convert transcript to index
        print('=> Processing transcripts...')
        trans_dict = read_text(
            text_path=join(args.data_save_path, data_type, 'text'),
            vocab_save_path=mkdir_join(args.data_save_path, 'vocab'),
            data_type=data_type,
            phone_map_file_path=args.phone_map_file_path)

        # Make dataset files (.csv)
        print('=> Saving dataset files...')
        csv_save_path = mkdir_join(
            args.data_save_path, 'dataset', args.tool, data_type)

        df_columns = ['frame_num', 'input_path', 'transcript']
        df_phone61 = pd.DataFrame([], columns=df_columns)
        df_phone48 = pd.DataFrame([], columns=df_columns)
        df_phone39 = pd.DataFrame([], columns=df_columns)

        with open(join(args.data_save_path, 'feature', args.tool, data_type,
                       'frame_num.pickle'), 'rb') as f:
            frame_num_dict = pickle.load(f)

        for utt_idx, trans_list in tqdm(trans_dict.items()):
            feat_utt_save_path = join(
                args.data_save_path, 'feature', args.tool, data_type,
                utt_idx + '.npy')
            frame_num = frame_num_dict[utt_idx]

            if not isfile(feat_utt_save_path):
                raise ValueError('There is no file: %s' % feat_utt_save_path)

            phone61_indices, phone48_indices, phone39_indices = trans_list
            df_phone61 = add_element(
                df_phone61, [frame_num, feat_utt_save_path, phone61_indices])
            df_phone48 = add_element(
                df_phone48, [frame_num, feat_utt_save_path, phone48_indices])
            df_phone39 = add_element(
                df_phone39, [frame_num, feat_utt_save_path, phone39_indices])

        df_phone61.to_csv(join(csv_save_path, 'phone61.csv'))
        df_phone48.to_csv(join(csv_save_path, 'phone48.csv'))
        df_phone39.to_csv(join(csv_save_path, 'phone39.csv'))
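
# `add_element` is used above but not defined in this file. A minimal sketch
# under the assumption that it simply appends one row (frame_num, input_path,
# transcript) to the DataFrame; the real helper may differ:
def add_element(df, elem_list):
    """Append one row to `df`, keeping the order of `df_columns`."""
    row = pd.DataFrame([elem_list], columns=df.columns)
    return pd.concat([df, row], ignore_index=True)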
def plot(model, dataset, eval_batch_size, save_path=None):
    """
    Args:
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size (int): the batch size when evaluating the model
        save_path (string): path to save figures of CTC posteriors
    """
    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # Clean directory
    if isdir(save_path):
        shutil.rmtree(save_path)
    mkdir(save_path)

    idx2word = Idx2word(dataset.vocab_file_path)
    idx2char = Idx2char(
        dataset.vocab_file_path,
        capital_divide=dataset.label_type_sub == 'character_capital_divide')

    for batch, is_new_epoch in dataset:
        # Get CTC probs
        probs = model.posteriors(batch['xs'], batch['x_lens'], temperature=1)
        probs_sub = model.posteriors(batch['xs'], batch['x_lens'],
                                     is_sub_task=True, temperature=1)
        # NOTE: probs: `[B, T, num_classes]`
        # NOTE: probs_sub: `[B, T, num_classes_sub]`

        # Decode
        best_hyps = model.decode(batch['xs'], batch['x_lens'], beam_width=1)
        best_hyps_sub = model.decode(batch['xs'], batch['x_lens'],
                                     beam_width=1, is_sub_task=True)

        # Visualize
        for b in range(len(batch['xs'])):
            # Convert from list of indices to string
            str_hyp = idx2word(best_hyps[b])
            str_hyp_sub = idx2char(best_hyps_sub[b])

            speaker = batch['input_names'][b].split('_')[0]
            plot_hierarchical_ctc_probs(
                probs[b, :batch['x_lens'][b], :],
                probs_sub[b, :batch['x_lens'][b], :],
                frame_num=batch['x_lens'][b],
                num_stack=dataset.num_stack,
                str_hyp=str_hyp,
                str_hyp_sub=str_hyp_sub,
                save_path=mkdir_join(save_path, speaker,
                                     batch['input_names'][b] + '.png'))

        if is_new_epoch:
            break
def plot(model, dataset, beam_width, eval_batch_size=None, save_path=None):
    """Visualize attention weights of the attention-based model.
    Args:
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        beam_width (int): the size of beam
        eval_batch_size (int, optional): the batch size when evaluating the model
        save_path (string, optional): path to save attention weights plotting
    """
    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    if 'char' in dataset.label_type:
        map_fn = Idx2char(
            dataset.vocab_file_path,
            capital_divide=dataset.label_type == 'character_capital_divide',
            return_list=True)
        max_decode_len = MAX_DECODE_LEN_CHAR
    else:
        map_fn = Idx2word(dataset.vocab_file_path, return_list=True)
        max_decode_len = MAX_DECODE_LEN_WORD

    for batch, is_new_epoch in dataset:
        # Decode
        best_hyps, aw, perm_idx = model.attention_weights(
            batch['xs'], batch['x_lens'],
            beam_width=beam_width,
            max_decode_len=max_decode_len)
        ys = batch['ys'][perm_idx]
        y_lens = batch['y_lens'][perm_idx]

        for b in range(len(batch['xs'])):
            ##############################
            # Reference
            ##############################
            if dataset.is_test:
                str_ref = ys[b][0]
                # NOTE: transcript is separated by space ('_')
            else:
                # Convert from list of indices to string
                str_ref = map_fn(ys[b][:y_lens[b]])

            token_list = map_fn(best_hyps[b])

            speaker = '_'.join(batch['input_names'][b].split('_')[:2])
            plot_attention_weights(
                aw[b, :len(token_list), :batch['x_lens'][b]],
                label_list=token_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                str_ref=str_ref,
                save_path=mkdir_join(save_path, speaker,
                                     batch['input_names'][b] + '.png'),
                figsize=(20, 8))

        if is_new_epoch:
            break
def do_plot(model, params, epoch, eval_batch_size):
    """Plot the multi-task CTC posteriors.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    # Load dataset
    test_data = Dataset(
        data_type='test',
        label_type_main=params['label_type_main'],
        label_type_sub=params['label_type_sub'],
        batch_size=eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        progressbar=True)

    # Define placeholders
    model.create_placeholders()

    # Add to the graph each operation (including model definition)
    _, logits_main, logits_sub = model.compute_loss(
        model.inputs_pl_list[0],
        model.labels_pl_list[0],
        model.labels_sub_pl_list[0],
        model.inputs_seq_len_pl_list[0],
        model.keep_prob_pl_list[0])
    posteriors_op_main, posteriors_op_sub = model.posteriors(
        logits_main, logits_sub)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            # Use the last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        plot(session=sess,
             posteriors_op_main=posteriors_op_main,
             posteriors_op_sub=posteriors_op_sub,
             model=model,
             dataset=test_data,
             label_type_main=params['label_type_main'],
             label_type_sub=params['label_type_sub'],
             num_stack=params['num_stack'],
             save_path=mkdir_join(model.save_path, 'ctc_output'),
             show=False)
        # NOTE: the original snippet passed a bare `save_path` here, which is
        # undefined in this scope; `model.save_path` matches the sibling
        # scripts in this repo.
def plot(model, dataset, eval_batch_size=None, save_path=None,
         space_index=None):
    """
    Args:
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size (int, optional): the batch size when evaluating the model
        save_path (string): path to save figures of CTC posteriors
        space_index (int, optional): the index of the space label
    """
    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # Clean directory
    if isdir(save_path):
        shutil.rmtree(save_path)
    mkdir(save_path)

    vocab_file_path = '../metrics/vocab_files/' + \
        dataset.label_type + '_' + dataset.data_size + '.txt'
    if dataset.label_type == 'character':
        map_fn = Idx2char(vocab_file_path)
    elif dataset.label_type == 'character_capital_divide':
        map_fn = Idx2char(vocab_file_path, capital_divide=True)
    else:
        map_fn = Idx2word(vocab_file_path)

    for batch, is_new_epoch in dataset:
        # Get CTC probs
        probs = model.posteriors(batch['xs'], batch['x_lens'], temperature=1)
        # NOTE: probs: `[B, T, num_classes]`

        # Decode
        best_hyps = model.decode(batch['xs'], batch['x_lens'], beam_width=1)
        # NOTE: the original snippet read `best_hyps _ = model.decode(...)`,
        # which is a syntax error; fixed to a plain assignment.

        # Visualize
        for b in range(len(batch['xs'])):
            # Convert from list of indices to string
            str_pred = map_fn(best_hyps[b])

            speaker, book = batch['input_names'][b].split('-')[:2]
            plot_ctc_probs(
                probs[b, :batch['x_lens'][b], :],
                frame_num=batch['x_lens'][b],
                num_stack=dataset.num_stack,
                space_index=space_index,
                str_pred=str_pred,
                save_path=mkdir_join(save_path, speaker, book,
                                     batch['input_names'][b] + '.png'))

        if is_new_epoch:
            break
def plot(model, dataset, eval_batch_size, beam_width, beam_width_sub,
         length_penalty, save_path=None):
    """Visualize attention weights of the attention-based model.
    Args:
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size (int): the batch size when evaluating the model
        beam_width (int): the size of beam in the main task
        beam_width_sub (int): the size of beam in the sub task
        length_penalty (float):
        save_path (string, optional): path to save attention weights plotting
    """
    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    map_fn_main = Idx2word(dataset.vocab_file_path, return_list=True)
    map_fn_sub = Idx2char(dataset.vocab_file_path_sub, return_list=True)

    for batch, is_new_epoch in dataset:
        # Decode
        best_hyps, aw, perm_idx = model.decode(
            batch['xs'], batch['x_lens'],
            beam_width=beam_width,
            max_decode_len=MAX_DECODE_LEN_WORD)
        best_hyps_sub, aw_sub, _ = model.decode(
            batch['xs'], batch['x_lens'],
            beam_width=beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_CHAR,
            task_index=1)

        for b in range(len(batch['xs'])):
            word_list = map_fn_main(best_hyps[b])
            char_list = map_fn_sub(best_hyps_sub[b])

            speaker = batch['input_names'][b].split('_')[0]
            plot_hierarchical_attention_weights(
                aw[b][:len(word_list), :batch['x_lens'][b]],
                aw_sub[b][:len(char_list), :batch['x_lens'][b]],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path, speaker,
                                     batch['input_names'][b] + '.png'),
                figsize=(40, 8))

        if is_new_epoch:
            break
def main():
    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    test_data = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='eval1',
        # data_type='eval2',
        # data_type='eval3',
        data_size=params['data_size'],
        label_type=params['label_type'],
        label_type_sub=params['label_type_sub'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        reverse=False,
        tool=params['tool'])
    params['num_classes'] = test_data.num_classes
    params['num_classes_sub'] = test_data.num_classes_sub

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    a2c_oracle = False

    # Visualize
    plot(model=model,
         dataset=test_data,
         eval_batch_size=args.eval_batch_size,
         beam_width=args.beam_width,
         beam_width_sub=args.beam_width_sub,
         length_penalty=args.length_penalty,
         a2c_oracle=a2c_oracle,
         save_path=mkdir_join(args.model_path, 'att_weights'))
def main():
    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    vocab_file_path = '../metrics/vocab_files/' + \
        params['label_type'] + '_' + params['data_size'] + '.txt'
    test_data = Dataset(
        backend=params['backend'],
        input_channel=params['input_channel'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='test_clean',
        # data_type='test_other',
        data_size=params['data_size'],
        label_type=params['label_type'],
        vocab_file_path=vocab_file_path,
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True,
        reverse=True,
        save_format=params['save_format'])
    params['num_classes'] = test_data.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    # Visualize
    plot_attention(model=model,
                   dataset=test_data,
                   max_decode_len=args.max_decode_len,
                   eval_batch_size=args.eval_batch_size,
                   save_path=mkdir_join(args.model_path, 'att_weights'))
def main():
    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    test_data = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='eval2000_swbd',
        # data_type='eval2000_ch',
        data_size=params['data_size'],
        label_type=params['label_type'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True,
        reverse=True,
        tool=params['tool'])
    params['num_classes'] = test_data.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    # Visualize
    plot(model=model,
         dataset=test_data,
         eval_batch_size=args.eval_batch_size,
         save_path=mkdir_join(args.model_path, 'ctc_probs'))
def main():
    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    vocab_file_path = '../metrics/vocab_files/' + \
        params['label_type'] + '_' + params['data_size'] + '.txt'
    test_data = Dataset(
        backend=params['backend'],
        input_channel=params['input_channel'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='test_clean',
        # data_type='test_other',
        data_size=params['data_size'],
        label_type=params['label_type'],
        vocab_file_path=vocab_file_path,
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True,
        reverse=False,
        save_format=params['save_format'])
    params['num_classes'] = test_data.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    space_index = 27 if params['label_type'] == 'character' else None
    # NOTE: index 0 is reserved for blank in warpctc_pytorch

    # Visualize
    plot(model=model,
         dataset=test_data,
         eval_batch_size=args.eval_batch_size,
         save_path=mkdir_join(args.model_path, 'ctc_probs'),
         space_index=space_index)
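
# Why space_index == 27 for the 'character' label type: an assumption based
# on the NOTE above, not confirmed against the actual vocabulary file. With
# index 0 reserved for the CTC blank in warpctc_pytorch, indices 1-26 would
# cover 'a'-'z' and the space mark would come next at 27. For other label
# types no space mark exists, so no space index is passed to the plotter.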
def main(config_path, model_save_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    params['num_classes'] = 28

    # Model setting
    model = StudentCTC(
        encoder_type=params['encoder_type'],
        input_size=params['input_size'] * params['num_stack'] * params['splice'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_classes=params['num_classes'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('tf_libri_' + model.name + '_' +
                 params['train_data_size'] + '_' + params['label_type'])

    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'student_ctc', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model has been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model has not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params, gpu_indices=gpu_indices)
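
# The "Reset model directory" loop above recurs verbatim in every training
# script in this repo. A minimal sketch of how it could be factored out; the
# helper name `resolve_save_path` is hypothetical, not part of the repo:
def resolve_save_path(base_path):
    """Return the first free variant of `base_path` (base, base_1, base_2, ...).

    A directory counts as taken when it holds a finished run ('complete.txt')
    or a run still in progress ('config.yml').
    """
    candidate, index = base_path, 0
    while (isfile(join(candidate, 'complete.txt'))
           or isfile(join(candidate, 'config.yml'))):
        index += 1
        candidate = base_path + '_' + str(index)
    return mkdir(candidate)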
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for <SOS> and <EOS> classes
    if params['label_type'] == 'phone61':
        params['num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['num_classes'] = 39
    elif params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        params['num_classes'] = 72
    else:
        raise TypeError

    # Model setting
    model = JointCTCAttention(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        lambda_weight=params['lambda_weight'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'])

    # Set process name
    setproctitle('tf_timit_' + model.name + '_' + params['label_type'] +
                 '_' + params['attention_type'])

    model.name += '_en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += 'wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    model.name += '_lambda' + str(params['lambda_weight'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'joint_ctc_attention', params['label_type'],
        model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model has been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model has not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
def plot(model, dataset, beam_width, beam_width_sub, eval_batch_size=None,
         a2c_oracle=False, save_path=None):
    """Visualize attention weights of the attention-based model.
    Args:
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        beam_width (int): the size of beam in the main task
        beam_width_sub (int): the size of beam in the sub task
        eval_batch_size (int, optional): the batch size when evaluating the model
        a2c_oracle (bool, optional): if True, feed the ground-truth character
            sequence to the sub-task decoder (teacher forcing)
        save_path (string, optional): path to save attention weights plotting
    """
    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    idx2word = Idx2word(dataset.vocab_file_path, return_list=True)
    idx2char = Idx2char(dataset.vocab_file_path_sub, return_list=True)
    char2idx = Char2idx(dataset.vocab_file_path_sub)
    # NOTE: char2idx is needed for the a2c_oracle path below; it was not
    # instantiated in the original snippet.

    for batch, is_new_epoch in dataset:
        batch_size = len(batch['xs'])

        if a2c_oracle:
            if dataset.is_test:
                max_label_num = 0
                for b in range(batch_size):
                    if max_label_num < len(list(batch['ys_sub'][b][0])):
                        max_label_num = len(list(batch['ys_sub'][b][0]))

                ys_sub = np.zeros((batch_size, max_label_num), dtype=np.int32)
                ys_sub -= 1  # pad with -1
                y_lens_sub = np.zeros((batch_size,), dtype=np.int32)
                for b in range(batch_size):
                    indices = char2idx(batch['ys_sub'][b][0])
                    ys_sub[b, :len(indices)] = indices
                    y_lens_sub[b] = len(indices)
                    # NOTE: transcript is separated by space ('_')
            else:
                ys_sub = batch['ys_sub']
                y_lens_sub = batch['y_lens_sub']
        else:
            ys_sub = None
            y_lens_sub = None

        best_hyps, best_hyps_sub, aw, aw_sub, aw_dec = model.attention_weights(
            batch['xs'], batch['x_lens'],
            beam_width=beam_width,
            beam_width_sub=beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_WORD,
            max_decode_len_sub=MAX_DECODE_LEN_CHAR,
            teacher_forcing=a2c_oracle,
            ys_sub=ys_sub,
            y_lens_sub=y_lens_sub)

        for b in range(len(batch['xs'])):
            word_list = idx2word(best_hyps[b])
            if 'word' in dataset.label_type_sub:
                char_list = idx2word(best_hyps_sub[b])
            else:
                char_list = idx2char(best_hyps_sub[b])

            # word-to-acoustic & character-to-acoustic attention
            plot_hierarchical_attention_weights(
                aw[b][:len(word_list), :batch['x_lens'][b]],
                aw_sub[b][:len(char_list), :batch['x_lens'][b]],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path,
                                     batch['input_names'][b] + '.png'),
                figsize=(40, 8))

            # word-to-character attention
            plot_word2char_attention_weights(
                aw_dec[b][:len(word_list), :len(char_list)],
                label_list=word_list,
                label_list_sub=char_list,
                save_path=mkdir_join(
                    save_path, batch['input_names'][b] + '_word2char.png'),
                figsize=(40, 8))

            # with open(join(save_path, speaker, batch['input_names'][b] + '.txt'), 'w') as f:
            #     f.write(batch['ys'][b][0])

        if is_new_epoch:
            break
def save(session, posteriors_op, model, dataset, data_type, save_prob=False,
         save_soft_targets=False, num_stack=1, save_path=None):

    # Initialize
    pbar = tqdm(total=len(dataset))
    total_num_frames = 0
    pool_input_frames = None
    pool_prob_frames = None
    num_frames_per_block = 1024 * 100
    frame_counter = 0
    block_counter = 0
    pool_counter = 0
    accumulated_total_num_frames = 0

    ########################################
    # Count total frame number
    ########################################
    # for data, is_new_epoch in dataset:
    #     # Create feed dictionary for next mini batch
    #     inputs, _, inputs_seq_len, input_names = data
    #
    #     batch_size = inputs[0].shape[0]
    #     for i_batch in range(batch_size):
    #         total_num_frames += inputs_seq_len[0][i_batch]
    #
    #     pbar.update(1)
    #
    #     if is_new_epoch:
    #         print(total_num_frames)
    #         break

    ########################################
    # Save probabilities per utterance
    ########################################
    pbar = tqdm(total=len(dataset))
    for data, is_new_epoch in dataset:
        # Create feed dictionary for next mini batch
        inputs, _, inputs_seq_len, input_names = data
        feed_dict = {
            model.inputs_pl_list[0]: inputs[0],
            model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
            model.keep_prob_pl_list[0]: 1.0
        }
        batch_size, max_time = inputs[0].shape[:2]

        probs = session.run(posteriors_op, feed_dict=feed_dict)
        probs = probs.reshape(batch_size, max_time, model.num_classes)

        if pool_input_frames is None:
            # Initialize
            total_num_frames = TOTAL_NUM_FRAMES_DICT[data_type]
            pool_num_frames = total_num_frames // NUM_POOLS + 1
            pool_capacity = pool_num_frames
            pool_input_frames = np.zeros((pool_num_frames, 120 * 2 * 5))
            # NOTE: input_size == 120 * 2 (num_stack == 2), splice == 5
            pool_prob_frames = np.zeros((pool_num_frames, model.num_classes))

        for i_batch in range(batch_size):
            speaker = input_names[0][i_batch].split('-')[0]

            # Mask
            inputs_seq_len_i = inputs_seq_len[0][i_batch]
            inputs_i = inputs[0][i_batch][:inputs_seq_len_i]
            probs_i = probs[i_batch][:inputs_seq_len_i]

            # Save probabilities as npy file per utterance
            if save_prob:
                prob_save_path = mkdir_join(
                    save_path, 'probs_utt', speaker,
                    input_names[0][i_batch] + '.npy')
                np.save(prob_save_path, probs_i)
                # NOTE: `[T, num_classes]`

            if dataset.splice == 1:
                # NOTE: teacher is expected to be BLSTM
                # Splicing
                inputs_i = do_splice(
                    inputs_i.reshape(1, inputs_seq_len_i, -1),
                    splice=5,
                    batch_size=1,
                    num_stack=dataset.num_stack)
                inputs_i = inputs_i.reshape(inputs_seq_len_i, -1)
            else:
                # NOTE: teacher is expected to be VGG (use features as they are)
                pass

            # Register
            if pool_capacity > inputs_seq_len_i:
                pool_input_frames[frame_counter:frame_counter +
                                  inputs_seq_len_i] = inputs_i
                pool_prob_frames[frame_counter:frame_counter +
                                 inputs_seq_len_i] = probs_i
                frame_counter += inputs_seq_len_i
                pool_capacity -= inputs_seq_len_i
            else:
                # Fill up the pool
                pool_input_frames[frame_counter:frame_counter +
                                  pool_capacity] = inputs_i[:pool_capacity]
                pool_prob_frames[frame_counter:frame_counter +
                                 pool_capacity] = probs_i[:pool_capacity]

                ##################################################
                # Shuffle frames, divide into blocks, and save
                ##################################################
                num_blocks = pool_num_frames // num_frames_per_block
                data_indices = list(range(pool_num_frames))
                random.shuffle(data_indices)
                for i_block in range(num_blocks):
                    block_indices = data_indices[:num_frames_per_block]
                    data_indices = data_indices[num_frames_per_block:]

                    # Pick up block
                    block_inputs_frames = pool_input_frames[block_indices]
                    # NOTE: `[1024 * 100, input_size]`
                    block_probs_frames = pool_prob_frames[block_indices]
                    # NOTE: `[1024 * 100, num_classes]`

                    # Save block
                    if save_soft_targets:
                        print(' ==> Saving: block%d' % block_counter)
                        input_save_path = mkdir_join(
                            save_path, 'inputs',
                            'block' + str(block_counter) + '.npy')
                        label_save_path = mkdir_join(
                            save_path, 'labels',
                            'block' + str(block_counter) + '.npy')
                        np.save(input_save_path, block_inputs_frames)
                        np.save(label_save_path, block_probs_frames)
                    block_counter += 1
                    accumulated_total_num_frames += len(block_indices)

                pool_carry_over_num_frames = pool_num_frames - \
                    num_frames_per_block * num_blocks
                utt_carry_over_num_frames = inputs_seq_len_i - pool_capacity
                carry_over_num_frames = pool_carry_over_num_frames + \
                    utt_carry_over_num_frames
                pool_carry_over_input_frames = pool_input_frames[data_indices]
                pool_carry_over_prob_frames = pool_prob_frames[data_indices]

                # Initialize
                if pool_counter != NUM_POOLS - 1:
                    pool_num_frames = total_num_frames // NUM_POOLS + 1 + \
                        carry_over_num_frames
                else:
                    # last pool
                    pool_num_frames = total_num_frames - \
                        accumulated_total_num_frames
                pool_input_frames = np.zeros((pool_num_frames, 120 * 2 * 5))
                # NOTE: input_size == 120 * 2 (num_stack == 2), splice == 5
                pool_prob_frames = np.zeros(
                    (pool_num_frames, model.num_classes))
                frame_counter = 0
                pool_counter += 1

                # Register carry-over frames
                pool_input_frames[:pool_carry_over_num_frames] = \
                    pool_carry_over_input_frames
                pool_prob_frames[:pool_carry_over_num_frames] = \
                    pool_carry_over_prob_frames
                frame_counter += pool_carry_over_num_frames
                pool_input_frames[frame_counter:frame_counter +
                                  utt_carry_over_num_frames] = \
                    inputs_i[-utt_carry_over_num_frames:]
                pool_prob_frames[frame_counter:frame_counter +
                                 utt_carry_over_num_frames] = \
                    probs_i[-utt_carry_over_num_frames:]
                frame_counter += utt_carry_over_num_frames
                pool_capacity = pool_num_frames - carry_over_num_frames
                print('=== next pool ===')

        pbar.update(batch_size)

        if is_new_epoch:
            ##################################################
            # Save the last pool
            ##################################################
            # Pick up block
            block_inputs_frames = pool_input_frames[:frame_counter]
            # NOTE: `[1024 * 100, input_size]`
            block_probs_frames = pool_prob_frames[:frame_counter]
            # NOTE: `[1024 * 100, num_classes]`

            # Save the last block
            if save_soft_targets:
                print(' ==> Saving: block%d' % block_counter)
                np.save(mkdir_join(save_path, 'inputs',
                                   'block' + str(block_counter) + '.npy'),
                        block_inputs_frames)
                np.save(mkdir_join(save_path, 'labels',
                                   'block' + str(block_counter) + '.npy'),
                        block_probs_frames)
            break
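
# A minimal sketch of how the saved blocks might be consumed on the student
# side, assuming each `inputs/blockN.npy` pairs with `labels/blockN.npy` as
# written above; the actual loader lives elsewhere in the repo:
def load_block(save_path, block_idx):
    """Load one shuffled frame block of (inputs, soft targets)."""
    inputs = np.load(join(save_path, 'inputs', 'block%d.npy' % block_idx))
    soft_targets = np.load(join(save_path, 'labels', 'block%d.npy' % block_idx))
    return inputs, soft_targets
    # shapes: `[num_frames, input_size]`, `[num_frames, num_classes]`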
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for <SOS> and <EOS> classes
    if params['label_type'] == 'phone61':
        params['num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['num_classes'] = 39
    elif params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        params['num_classes'] = 72
    else:
        raise TypeError

    # Model setting
    model = AttentionSeq2Seq(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'],
        sigmoid_smoothing=params['sigmoid_smoothing'])

    # Set process name
    setproctitle('tf_timit_' + model.name + '_' + params['label_type'] +
                 '_' + params['attention_type'])

    model.name = 'en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += 'wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    if bool(params['sigmoid_smoothing']):
        model.name += '_smoothing'

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'attention',
                                 params['label_type'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model has been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model has not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
def read_text(text_path, vocab_save_path, data_type, lexicon_path=None):
    """Read transcripts & make vocabulary files.
    Args:
        text_path (string): path to a text file of kaldi
        vocab_save_path (string): path to save vocabulary files
        data_type (string): train or dev or test_eval92
        lexicon_path (string, optional):
    Returns:
        trans_dict (dict): the dictionary of transcripts of each utterance
            key (string) => utterance index
            value (dict) => {"word": word_indices,
                             "char": char_indices,
                             "char_capital": char_capital_indices}
    """
    # Make vocabulary files
    word_vocab_path = mkdir_join(vocab_save_path, 'word.txt')
    char_vocab_path = mkdir_join(vocab_save_path, 'character.txt')
    char_capital_vocab_path = mkdir_join(
        vocab_save_path, 'character_capital_divide.txt')

    trans_dict = {}
    char_set = set([])
    char_capital_set = set([])
    word_set = set([])
    word_dict = {}
    with open(text_path, 'r') as f:
        for line in f:
            line = line.strip()
            utt_idx = line.split(' ')[0]
            trans = ' '.join(line.split(' ')[1:]).lower()

            # text normalization
            trans = trans.replace('<noise>', NOISE)
            trans = trans.replace('.period', 'period')
            trans = trans.replace('\'single-quote', 'single-quote')
            trans = trans.replace('-hyphen', 'hyphen')
            trans = trans.replace('`', '\'')  # 47rc020w
            trans = re.sub(r'[(){}*,?!":;&/~]+', '', trans)
            trans = re.sub(r'<.*>', '', trans)
            trans = re.sub(r'[\s]+', ' ', trans)
            trans = trans.replace(' ', SPACE)

            if len(trans) == 0:
                continue
                # NOTE: skipped utterances such as '~~'
                # 46uc030b
                # 47hc0418

            trans_capital = ''
            for w in trans.split(SPACE):
                # Count word frequency
                if w not in word_dict.keys():
                    word_dict[w] = 1
                else:
                    word_dict[w] += 1

                word_set.add(w)
                char_set |= set(list(w))

                # Capital-divided
                if len(w) == 1:
                    char_capital_set.add(w)
                    trans_capital += w
                else:
                    # Replace the first character with the capital letter
                    w = w[0].upper() + w[1:]

                    # Check double letters
                    for i in range(0, len(w) - 1, 1):
                        if w[i:i + 2] in DOUBLE_LETTERS:
                            char_capital_set.add(w[i:i + 2])
                        else:
                            char_capital_set.add(w[i])
                    trans_capital += w

            trans_dict[utt_idx] = [trans, trans_capital]

    # Save vocabulary files
    if 'train' in data_type:
        # word-level (threshold == 3)
        with codecs.open(word_vocab_path, 'w', 'utf-8') as f:
            word_list = sorted([w for w, freq in list(word_dict.items())
                                if freq >= 3]) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)

        # character-level
        with open(char_vocab_path, 'w') as f:
            char_list = sorted(list(char_set)) + [SPACE]
            for c in char_list:
                f.write('%s\n' % c)

        # character-level (capital-divided)
        with open(char_capital_vocab_path, 'w') as f:
            char_capital_list = sorted(list(char_capital_set))
            for c in char_capital_list:
                f.write('%s\n' % c)

    # Compute OOV rate
    if 'train' not in data_type:
        with codecs.open(mkdir_join(vocab_save_path, 'oov',
                                    data_type + '.txt'), 'w', 'utf-8') as f:
            # word-level (threshold == 3)
            oov_rate = compute_oov_rate(word_dict, word_vocab_path)
            f.write('Word (freq3):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

    # Convert to index
    print('=====> Convert to index...')
    word2idx = Word2idx(word_vocab_path)
    char2idx = Char2idx(char_vocab_path)
    char2idx_capital = Char2idx(char_capital_vocab_path, capital_divide=True)

    for utt_idx, [trans, trans_capital] in tqdm(trans_dict.items()):
        if data_type == 'test_eval92':
            trans_dict[utt_idx] = {"word": trans,
                                   "char": trans,
                                   "char_capital": trans}
            # NOTE: save as it is
        else:
            word_indices = word2idx(trans)
            char_indices = char2idx(trans)
            char_capital_indices = char2idx_capital(trans)

            word_indices = ' '.join(list(map(str, word_indices.tolist())))
            char_indices = ' '.join(list(map(str, char_indices.tolist())))
            char_capital_indices = ' '.join(
                list(map(str, char_capital_indices.tolist())))

            trans_dict[utt_idx] = {"word": word_indices,
                                   "char": char_indices,
                                   "char_capital": char_capital_indices}

    return trans_dict
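
# `compute_oov_rate` is called above but defined elsewhere in the repo. A
# minimal sketch under the assumption that it returns the token-level OOV
# percentage of `word_dict` (word -> frequency) against a vocabulary file:
def compute_oov_rate(word_dict, vocab_file_path):
    """Return the percentage of word tokens not covered by the vocabulary."""
    with codecs.open(vocab_file_path, 'r', 'utf-8') as f:
        vocab = set(line.strip() for line in f)
    num_tokens = sum(word_dict.values())
    num_oov_tokens = sum(freq for w, freq in word_dict.items()
                         if w not in vocab)
    return 100.0 * num_oov_tokens / num_tokens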
def read_text(text_path, vocab_save_path, data_type, kana2phone_path,
              lexicon_path=None):
    """Read transcripts & make vocabulary files.
    Args:
        text_path (string): path to a text file of kaldi
        vocab_save_path (string): path to save vocabulary files
        data_type (string): train or dev or eval1 or eval2 or eval3
        kana2phone_path (string): path to the kana-to-phone mapping file
        lexicon_path (string, optional):
    Returns:
        trans_dict (dict): the dictionary of transcripts of each utterance
            key (string) => utterance index
            value (dict)
                key => label type
                value => indices
    """
    # Make kana set
    kana_set = set([])
    with codecs.open(kana2phone_path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            kana, phone_seq = line.split('+')
            kana_set.add(kana)

    # Make vocabulary files
    word_vocab_path = mkdir_join(vocab_save_path, 'word.txt')
    char_vocab_path = mkdir_join(vocab_save_path, 'character.txt')
    char_wb_vocab_path = mkdir_join(vocab_save_path, 'character_wb.txt')
    char_wb_left_vocab_path = mkdir_join(vocab_save_path,
                                         'character_wb_left.txt')
    char_wb_right_vocab_path = mkdir_join(vocab_save_path,
                                          'character_wb_right.txt')
    char_wb_both_vocab_path = mkdir_join(vocab_save_path,
                                         'character_wb_both.txt')
    char_wb_remove_vocab_path = mkdir_join(vocab_save_path,
                                           'character_wb_remove.txt')
    # phone_vocab_path = mkdir_join(vocab_save_path, 'phone.txt')
    # phone_wb_vocab_path = mkdir_join(vocab_save_path, 'phone_wb.txt')
    pos_vocab_path = mkdir_join(vocab_save_path, 'pos.txt')

    trans_dict = {}
    char_set = set([])
    char_set_remove = set([])
    word_set = set([])
    pos_set = set([])
    word_dict = {}
    with codecs.open(text_path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            utt_idx, trans_w_pos = line.split(' ', 1)
            # NOTE: maxsplit added; the transcript part itself contains spaces
            trans_w_pos = trans_w_pos.replace('<sp>', SHORT_PAUSE)
            trans = SPACE.join(
                [w.split('+')[0] for w in trans_w_pos.split(' ')])
            trans_pos = SPACE.join([
                w.split('+')[1].split('/')[0] if '+' in w else SHORT_PAUSE
                for w in trans_w_pos.split(' ')
            ])
            # NOTE: the word and POS sequences have the same length

            ###################################
            # with filler and disfluency
            ###################################
            trans_left_list, trans_right_list = [], []
            trans_both_list, trans_remove_list = [], []
            for w in trans_w_pos.split(' '):
                if '言いよどみ' in w:  # disfluency
                    w_left = SOD + w.split('+')[0]
                    w_right = w.split('+')[0] + EOD
                    w_both = SOD + w.split('+')[0] + EOD
                elif '感動詞' in w:  # filler (interjection)
                    w_left = SOF + w.split('+')[0]
                    w_right = w.split('+')[0] + EOF
                    w_both = SOF + w.split('+')[0] + EOF
                else:
                    w_left = w.split('+')[0]
                    w_right = w.split('+')[0]
                    w_both = w.split('+')[0]
                if w != SHORT_PAUSE:
                    trans_remove_list.append(w.split('+')[0])
                trans_left_list.append(w_left)
                trans_right_list.append(w_right)
                trans_both_list.append(w_both)

            trans_left = SPACE.join(trans_left_list)
            trans_right = SPACE.join(trans_right_list)
            trans_both = SPACE.join(trans_both_list)
            trans_remove = SPACE.join(trans_remove_list)

            trans_dict[utt_idx] = [
                trans, trans_pos, trans_left, trans_right, trans_both,
                trans_remove
            ]

            for w in trans.split(SPACE):
                # Count word frequency
                if w not in word_dict.keys():
                    word_dict[w] = 1
                else:
                    word_dict[w] += 1
                word_set.add(w)
                char_set |= set(list(w))

            for w in trans_remove.split(SPACE):
                char_set_remove |= set(list(w))

            for pos in trans_pos.split(SPACE):
                pos_set.add(pos)

    # TODO: load lexicon

    # Save vocabulary files
    if data_type == 'train':
        # word-level (threshold == 3)
        with codecs.open(word_vocab_path, 'w', 'utf-8') as f:
            word_list = sorted([w for w, freq in list(word_dict.items())
                                if freq >= 3]) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)

        # character-level (char, char_wb)
        char_list = sorted(list(char_set))
        with codecs.open(char_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [OOV]:
                f.write('%s\n' % c)
        with codecs.open(char_wb_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [SPACE, OOV]:
                f.write('%s\n' % c)

        # character-level (char_wb + left, right, both, remove)
        with codecs.open(char_wb_left_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [SPACE, OOV, SOF, SOD]:
                f.write('%s\n' % c)
        with codecs.open(char_wb_right_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [SPACE, OOV, EOF, EOD]:
                f.write('%s\n' % c)
        with codecs.open(char_wb_both_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [SPACE, OOV, SOF, EOF, SOD, EOD]:
                f.write('%s\n' % c)
        with codecs.open(char_wb_remove_vocab_path, 'w', 'utf-8') as f:
            char_list_remove = sorted(list(char_set_remove))
            for c in char_list_remove + [SPACE, OOV]:
                f.write('%s\n' % c)

        # phone-level (phone, phone_wb)
        # with codecs.open(phone_vocab_path, 'w', 'utf-8') as f, \
        #         codecs.open(phone_wb_vocab_path, 'w', 'utf-8') as f_wb:
        #     phone_list = sorted(list(phone_set))
        #     for phone in phone_list:
        #         f.write('%s\n' % phone)
        #     for phone in phone_list + [SIL]:
        #         f_wb.write('%s\n' % phone)

        # pos-level
        with codecs.open(pos_vocab_path, 'w', 'utf-8') as f:
            pos_list = sorted(list(pos_set))
            for pos in pos_list:
                f.write('%s\n' % pos)

    # Compute OOV rate
    if data_type != 'train':
        with codecs.open(mkdir_join(vocab_save_path, 'oov',
                                    data_type + '.txt'), 'w', 'utf-8') as f:
            # word-level (threshold == 3)
            oov_rate = compute_oov_rate(word_dict, word_vocab_path)
            f.write('Word (freq3):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

    # Convert to index
    print('=====> Convert to index...')
    word2idx = Word2idx(word_vocab_path)
    char2idx = Char2idx(char_vocab_path)
    char2idx_wb = Char2idx(char_wb_vocab_path)
    char2idx_wb_left = Char2idx(char_wb_left_vocab_path)
    char2idx_wb_right = Char2idx(char_wb_right_vocab_path)
    char2idx_wb_both = Char2idx(char_wb_both_vocab_path)
    char2idx_wb_remove = Char2idx(char_wb_remove_vocab_path)
    # phone2idx = Phone2idx(phone_vocab_path)
    # phone2idx_wb = Phone2idx(phone_wb_vocab_path)
    pos2idx = Word2idx(pos_vocab_path)

    for utt_idx, [trans, trans_pos, trans_left, trans_right, trans_both,
                  trans_remove] in tqdm(trans_dict.items()):
        if 'eval' in data_type:
            trans_dict[utt_idx] = {
                "word": trans,
                "char": trans.replace(SPACE, ''),
                "char_wb": trans,
                "char_wb_left": trans,
                "char_wb_right": trans,
                "char_wb_both": trans,
                "char_wb_remove": trans_remove,
                "phone": None,
                # "phone": trans_phone,
                "phone_wb": None,
                # "phone_wb": trans_phone.replace(SIL, '').replace('  ', ' '),
                "pos": trans_pos,
            }
            # NOTE: save as it is
        else:
            word_indices = word2idx(trans)
            char_indices = char2idx(trans.replace(SPACE, ''))
            char_wb_indices = char2idx_wb(trans)
            char_wb_left_indices = char2idx_wb_left(trans_left)
            char_wb_right_indices = char2idx_wb_right(trans_right)
            char_wb_both_indices = char2idx_wb_both(trans_both)
            char_wb_remove_indices = char2idx_wb_remove(trans_remove)
            # phone_indices = phone2idx(
            #     trans_phone.replace(SIL, '').replace('  ', ' '))
            # phone_wb_indices = phone2idx_wb(trans_phone)
            pos_indices = pos2idx(trans_pos)

            word_indices = ' '.join(list(map(str, word_indices.tolist())))
            char_indices = ' '.join(list(map(str, char_indices.tolist())))
            char_wb_indices = ' '.join(
                list(map(str, char_wb_indices.tolist())))
            char_wb_left_indices = ' '.join(
                list(map(str, char_wb_left_indices.tolist())))
            char_wb_right_indices = ' '.join(
                list(map(str, char_wb_right_indices.tolist())))
            char_wb_both_indices = ' '.join(
                list(map(str, char_wb_both_indices.tolist())))
            char_wb_remove_indices = ' '.join(
                list(map(str, char_wb_remove_indices.tolist())))
            # phone_indices = ' '.join(
            #     list(map(str, phone_indices.tolist())))
            # phone_wb_indices = ' '.join(
            #     list(map(str, phone_wb_indices.tolist())))
            pos_indices = ' '.join(list(map(str, pos_indices.tolist())))

            trans_dict[utt_idx] = {
                "word": word_indices,
                "char": char_indices,
                "char_wb": char_wb_indices,
                "char_wb_left": char_wb_left_indices,
                "char_wb_right": char_wb_right_indices,
                "char_wb_both": char_wb_both_indices,
                "char_wb_remove": char_wb_remove_indices,
                # "phone": phone_indices,
                # "phone_wb": phone_wb_indices,
                "pos": pos_indices,
            }

    return trans_dict
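
# The values stored in `trans_dict` above are space-joined index strings
# such as "12 7 305". A one-line sketch of how a dataset loader could turn
# one back into an int list (hypothetical helper, for illustration):
def parse_indices(indices_str):
    """Convert a space-joined index string back into a list of ints."""
    return list(map(int, indices_str.split(' ')))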
def main():
    print('=> Processing input data...')
    for data_type in ['train_' + args.data_size, 'dev', 'eval1', 'eval2',
                      'eval3']:
        print('===> %s' % data_type)
        feature_save_path = mkdir_join(
            args.data_save_path, 'feature', args.tool, args.data_size,
            data_type.split('_')[0])

        utt_indices = []
        with codecs.open(join(args.data_save_path, data_type, 'text'),
                         'r', 'utf-8') as f:
            for line in f:
                line = line.strip()
                utt_indices.append(line.split(' ')[0])

        segment_dict = {}
        utt_num = 0
        with open(join(args.data_save_path, data_type, 'segments'), 'r') as f:
            for line in f:
                line = line.strip()
                utt_idx, speaker, start_time, end_time = line.split(' ')
                if speaker not in segment_dict.keys():
                    segment_dict[speaker] = OrderedDict()

                # Convert seconds to 10ms frames, rounded to the nearest frame
                segment_dict[speaker][utt_idx] = [
                    int(float(start_time) * 100 + 0.5),
                    int(float(end_time) * 100 + 0.5)
                ]
                utt_num += 1
        assert len(utt_indices) == utt_num

        spk2audio = {}
        if args.tool == 'htk':
            with open(join(args.data_save_path, data_type, 'htk.scp'),
                      'r') as f:
                for line in f:
                    htk_path = line.strip()
                    speaker = basename(htk_path).split('.')[0]
                    spk2audio[speaker] = htk_path
        else:
            with open(join(args.data_save_path, data_type, 'wav.scp'),
                      'r') as f:
                for line in f:
                    line = line.strip()
                    speaker = line.split(' ')[0]
                    wav_path = line.split(' ')[2]
                    spk2audio[speaker] = wav_path

        if 'train' in data_type:
            global_mean_male, global_std_male = None, None
            global_mean_female, global_std_female = None, None
        else:
            # Load statistics over the train dataset
            global_mean_male = np.load(
                join(args.data_save_path, 'feature', args.tool,
                     args.data_size, 'train/global_mean_male.npy'))
            global_std_male = np.load(
                join(args.data_save_path, 'feature', args.tool,
                     args.data_size, 'train/global_std_male.npy'))
            global_mean_female = np.load(
                join(args.data_save_path, 'feature', args.tool,
                     args.data_size, 'train/global_mean_female.npy'))
            global_std_female = np.load(
                join(args.data_save_path, 'feature', args.tool,
                     args.data_size, 'train/global_std_female.npy'))

        read_audio(data_type=data_type,
                   spk2audio=spk2audio,
                   segment_dict=segment_dict,
                   tool=args.tool,
                   config=CONFIG,
                   normalize=args.normalize,
                   save_path=feature_save_path,
                   global_mean_male=global_mean_male,
                   global_std_male=global_std_male,
                   global_mean_female=global_mean_female,
                   global_std_female=global_std_female)
def do_plot(model, params, epoch, eval_batch_size):
    """Plot the CTC posteriors.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    # Load dataset
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0],
            softmax_temperature=params['softmax_temperature'])
        posteriors_op = model.posteriors(logits, blank_prior=1)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            # Use the last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        # Visualize
        posterior_test(session=sess,
                       posteriors_op=posteriors_op,
                       model=model,
                       dataset=test_clean_data,
                       label_type=params['label_type'],
                       num_stack=params['num_stack'],
                       # save_path=None)
                       save_path=mkdir_join(model.save_path, 'ctc_output',
                                            'test-clean'))
        posterior_test(session=sess,
                       posteriors_op=posteriors_op,
                       model=model,
                       dataset=test_other_data,
                       label_type=params['label_type'],
                       num_stack=params['num_stack'],
                       # save_path=None)
                       save_path=mkdir_join(model.save_path, 'ctc_output',
                                            'test-other'))
def main(config_path, model_save_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank label
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('tf_csj_' + model.name + '_' +
                 params['train_data_size'] + '_' + params['label_type'])

    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if params['bottleneck_dim'] != 0:
        model.name += '_bottle' + str(params['bottleneck_dim'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model has been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model has not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params, gpu_indices=gpu_indices)
def do_plot(model, params, epoch, eval_batch_size):
    """Decode the Attention outputs.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    map_file_path = '../metrics/mapping_files/' + \
        params['label_type'] + '.txt'

    # Load dataset
    test_data = Dataset(
        data_type='test',
        label_type=params['label_type'],
        batch_size=eval_batch_size,
        map_file_path=map_file_path,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False,
        progressbar=True)

    # Define placeholders
    model.create_placeholders()

    # Add to the graph each operation (including model definition)
    _, _, decoder_outputs_train, decoder_outputs_infer = model.compute_loss(
        model.inputs_pl_list[0],
        model.labels_pl_list[0],
        model.inputs_seq_len_pl_list[0],
        model.labels_seq_len_pl_list[0],
        model.keep_prob_encoder_pl_list[0],
        model.keep_prob_decoder_pl_list[0],
        model.keep_prob_embedding_pl_list[0])
    _, decode_op_infer = model.decode(
        decoder_outputs_train, decoder_outputs_infer)
    attention_weights_op = decoder_outputs_infer.attention_weights

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        # Visualize
        plot(session=sess,
             decode_op=decode_op_infer,
             attention_weights_op=attention_weights_op,
             model=model,
             dataset=test_data,
             label_type=params['label_type'],
             is_test=True,
             save_path=mkdir_join(model.save_path, 'attention_weights'),
             # save_path=None,
             show=False)
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    if params['label_type_main'] == 'character':
        params['num_classes_main'] = 28
    elif params['label_type_main'] == 'character_capital_divide':
        params['num_classes_main'] = 72
    else:
        raise TypeError

    if params['label_type_sub'] == 'phone61':
        params['num_classes_sub'] = 61
    elif params['label_type_sub'] == 'phone48':
        params['num_classes_sub'] = 48
    elif params['label_type_sub'] == 'phone39':
        params['num_classes_sub'] = 39
    else:
        raise TypeError

    # Model setting
    model = MultitaskCTC(
        encoder_type=params['encoder_type'],
        input_size=params['input_size'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_units=params['num_units'],
        num_layers_main=params['num_layers_main'],
        num_layers_sub=params['num_layers_sub'],
        num_classes_main=params['num_classes_main'],
        num_classes_sub=params['num_classes_sub'],
        main_task_weight=params['main_task_weight'],
        lstm_impl=params['lstm_impl'],
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation=params['clip_activation'],
        num_proj=params['num_proj'],
        weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('tf_timit_' + model.name + '_' +
                 params['label_type_main'] + '_' + params['label_type_sub'])

    model.name += '_' + str(params['num_units'])
    model.name += '_main' + str(params['num_layers_main'])
    model.name += '_sub' + str(params['num_layers_sub'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    model.name += '_main' + str(params['main_task_weight'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', 'char_' + params['label_type_sub'],
        model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model has been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model has not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
def read_text(text_path, vocab_save_path, data_type, lexicon_path=None): """Read transcripts & save vocabulary files. Args: text_path (string): path to a text file of kaldi vocab_save_path (string): path to save vocabulary files data_type (string): train or dev or eval2000_swbd or eval2000_ch lexicon_path (string, optional): path to a lexicon file (not used in this function) Returns: trans_dict (dict): the dictionary of transcripts key (string) => utterance index value (dict) => the dictionary of transcripts of each utterance key (string) => label type (word1/word5/word10/word15/word20, char, char_capital, char_left, char_right, char_both, char_remove) value (string) => sequence of token indices joined by space """ # Make vocabulary files word1_vocab_path = mkdir_join(vocab_save_path, 'word1.txt') word5_vocab_path = mkdir_join(vocab_save_path, 'word5.txt') word10_vocab_path = mkdir_join(vocab_save_path, 'word10.txt') word15_vocab_path = mkdir_join(vocab_save_path, 'word15.txt') word20_vocab_path = mkdir_join(vocab_save_path, 'word20.txt') char_vocab_path = mkdir_join(vocab_save_path, 'character.txt') char_capital_vocab_path = mkdir_join(vocab_save_path, 'character_capital_divide.txt') char_left_vocab_path = mkdir_join(vocab_save_path, 'character_left.txt') char_right_vocab_path = mkdir_join(vocab_save_path, 'character_right.txt') char_both_vocab_path = mkdir_join(vocab_save_path, 'character_both.txt') char_remove_vocab_path = mkdir_join(vocab_save_path, 'character_remove.txt') # TODO: up to here trans_dict = {} char_set = set([]) char_capital_set = set([]) word_set = set([]) word_dict = {} with open(text_path, 'r') as f: for line in f: line = line.strip() utt_idx = line.split(' ')[0] trans = ' '.join(line.split(' ')[1:]).lower() if data_type == 'eval2000_swbd' and utt_idx[:2] == 'en': continue if data_type == 'eval2000_ch' and utt_idx[:2] == 'sw': continue # Text normalization trans = trans.replace('[laughter]', LAUGHTER) trans = trans.replace('[noise]', NOISE) trans = trans.replace('[vocalized-noise]', VOCALIZED_NOISE) if 'eval' in data_type: trans = trans.replace('<b_aside>', '') trans = trans.replace('<e_aside>', '') trans = re.sub(r'[()]+', '', trans) # Remove consecutive spaces trans = re.sub(r'[\s]+', ' ', trans) # Remove the first and last spaces if trans[0] == ' ': trans = trans[1:] if trans[-1] == ' ': trans = trans[:-1] ################################### # with filler (感動詞) and disfluency (言いよどみ) tags ################################### trans_left_list, trans_right_list, trans_both_list, trans_remove_list = [], [], [], [] for w in trans.split(' '): if '言いよどみ' in w: w_left = SOD + w.split('+')[0] w_right = w.split('+')[0] + EOD w_both = SOD + w.split('+')[0] + EOD elif '感動詞' in w: w_left = SOF + w.split('+')[0] w_right = w.split('+')[0] + EOF w_both = SOF + w.split('+')[0] + EOF else: w_left = w.split('+')[0] w_right = w.split('+')[0] w_both = w.split('+')[0] if w != SHORT_PAUSE: trans_remove_list.append(w.split('+')[0]) trans_left_list.append(w_left) trans_right_list.append(w_right) trans_both_list.append(w_both) trans_left = SPACE.join(trans_left_list) trans_right = SPACE.join(trans_right_list) trans_both = SPACE.join(trans_both_list) trans_remove = SPACE.join(trans_remove_list) trans = trans.replace(' ', SPACE) trans_capital = '' for word in trans.split(SPACE): # Count word frequency if word not in word_dict.keys(): word_dict[word] = 1 else: word_dict[word] += 1 word_set.add(word) char_set |= set(list(word)) # Capital-divided if len(word) == 1: char_capital_set.add(word) trans_capital += word else: # Replace the first character with its capital
letter word = word[0].upper() + word[1:] # Check double-letters for i in range(len(word) - 1): if word[i:i + 2] in DOUBLE_LETTERS: char_capital_set.add(word[i:i + 2]) else: char_capital_set.add(word[i]) trans_capital += word trans_dict[utt_idx] = [ trans, trans_capital, trans_left, trans_right, trans_both, trans_remove ] # Reserve some indices char_set.discard('L') char_set.discard('A') char_set.discard('N') char_set.discard('Z') char_set.discard('V') # Save vocabulary files if data_type == 'train': # word-level (threshold == 1) with open(word1_vocab_path, 'w') as f: word_list = sorted(list(word_set)) + [OOV] for w in word_list: f.write('%s\n' % w) # NOTE: the OOV index is reserved for unseen words in the dev/eval sets # word-level (threshold == 5) with open(word5_vocab_path, 'w') as f: word_list = sorted( [w for w, freq in list(word_dict.items()) if freq >= 5]) + [OOV] for w in word_list: f.write('%s\n' % w) # word-level (threshold == 10) with open(word10_vocab_path, 'w') as f: word_list = sorted([ w for w, freq in list(word_dict.items()) if freq >= 10 ]) + [OOV] for w in word_list: f.write('%s\n' % w) # word-level (threshold == 15) with open(word15_vocab_path, 'w') as f: word_list = sorted([ w for w, freq in list(word_dict.items()) if freq >= 15 ]) + [OOV] for w in word_list: f.write('%s\n' % w) # word-level (threshold == 20) with open(word20_vocab_path, 'w') as f: word_list = sorted([ w for w, freq in list(word_dict.items()) if freq >= 20 ]) + [OOV] for w in word_list: f.write('%s\n' % w) # character-level with open(char_vocab_path, 'w') as f: char_list = sorted(list(char_set)) + \ [SPACE, LAUGHTER, NOISE, VOCALIZED_NOISE] for c in char_list: f.write('%s\n' % c) # character-level (capital-divided) with open(char_capital_vocab_path, 'w') as f: char_capital_list = sorted(list(char_capital_set)) + \ [LAUGHTER, NOISE, VOCALIZED_NOISE] for c in char_capital_list: f.write('%s\n' % c) # character-level (left) # with open(char_left_vocab_path, 'w') as f: # char_left_list = sorted(list(char_set)) + \ # [SPACE, LAUGHTER, NOISE, VOCALIZED_NOISE] # for c in char_left_list: # f.write('%s\n' % c) # TODO: save the character_left/right/both/remove vocabulary files # Compute OOV rate if data_type != 'train': with open(mkdir_join(vocab_save_path, 'oov', data_type + '.txt'), 'w') as f: # word-level (threshold == 1) oov_rate = compute_oov_rate(word_dict, word1_vocab_path) f.write('Word (freq1):\n') f.write(' OOV rate: %f %%\n' % oov_rate) # word-level (threshold == 5) oov_rate = compute_oov_rate(word_dict, word5_vocab_path) f.write('Word (freq5):\n') f.write(' OOV rate: %f %%\n' % oov_rate) # word-level (threshold == 10) oov_rate = compute_oov_rate(word_dict, word10_vocab_path) f.write('Word (freq10):\n') f.write(' OOV rate: %f %%\n' % oov_rate) # word-level (threshold == 15) oov_rate = compute_oov_rate(word_dict, word15_vocab_path) f.write('Word (freq15):\n') f.write(' OOV rate: %f %%\n' % oov_rate) # word-level (threshold == 20) oov_rate = compute_oov_rate(word_dict, word20_vocab_path) f.write('Word (freq20):\n') f.write(' OOV rate: %f %%\n' % oov_rate) # Convert to index print('=====> Convert to index...') word2idx_freq1 = Word2idx(word1_vocab_path) word2idx_freq5 = Word2idx(word5_vocab_path) word2idx_freq10 = Word2idx(word10_vocab_path) word2idx_freq15 = Word2idx(word15_vocab_path) word2idx_freq20 = Word2idx(word20_vocab_path) char2idx = Char2idx(char_vocab_path) char2idx_capital = Char2idx(char_capital_vocab_path, capital_divide=True) char2idx_left = Char2idx(char_left_vocab_path) char2idx_right = Char2idx(char_right_vocab_path) char2idx_both = Char2idx(char_both_vocab_path)
char2idx_remove = Char2idx(char_remove_vocab_path) for utt_idx, [trans, trans_capital, trans_left, trans_right, trans_both, trans_remove] in tqdm(trans_dict.items()): if 'eval' in data_type: trans_dict[utt_idx] = { "word1": trans, "word5": trans, "word10": trans, "word15": trans, "word20": trans, "char": trans, "char_capital": trans, "char_left": trans, "char_right": trans, "char_both": trans, "char_remove": trans_remove, } # NOTE: save as it is else: word1_indices = word2idx_freq1(trans) word5_indices = word2idx_freq5(trans) word10_indices = word2idx_freq10(trans) word15_indices = word2idx_freq15(trans) word20_indices = word2idx_freq20(trans) char_indices = char2idx(trans) char_capital_indices = char2idx_capital(trans) char_left_indices = char2idx_left(trans_left) char_right_indices = char2idx_right(trans_right) char_both_indices = char2idx_both(trans_both) char_remove_indices = char2idx_remove(trans_remove) word1_indices = ' '.join(list(map(str, word1_indices.tolist()))) word5_indices = ' '.join(list(map(str, word5_indices.tolist()))) word10_indices = ' '.join(list(map(str, word10_indices.tolist()))) word15_indices = ' '.join(list(map(str, word15_indices.tolist()))) word20_indices = ' '.join(list(map(str, word20_indices.tolist()))) char_indices = ' '.join(list(map(str, char_indices.tolist()))) char_capital_indices = ' '.join( list(map(str, char_capital_indices.tolist()))) char_left_indices = ' '.join( list(map(str, char_left_indices.tolist()))) char_right_indices = ' '.join( list(map(str, char_right_indices.tolist()))) char_both_indices = ' '.join( list(map(str, char_both_indices.tolist()))) char_remove_indices = ' '.join( list(map(str, char_remove_indices.tolist()))) trans_dict[utt_idx] = { "word1": word1_indices, "word5": word5_indices, "word10": word10_indices, "word15": word15_indices, "word20": word20_indices, "char": char_indices, "char_capital": char_capital_indices, "char_left": char_left_indices, "char_right": char_right_indices, "char_both": char_both_indices, "char_remove": char_remove_indices, } return trans_dict
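compute_oov_rate is defined elsewhere in the repo. Judging from how it is called above (a word-frequency dict plus a vocabulary file path, with the result logged as a percentage), a minimal sketch might be:

def compute_oov_rate(word_dict, vocab_file_path):
    """Return the token-level OOV rate (%) of word_dict against a vocabulary file.

    word_dict: dict mapping word -> frequency in the evaluated set.
    """
    with open(vocab_file_path, 'r') as f:
        vocab = set(line.strip() for line in f)
    oov_count = sum(freq for w, freq in word_dict.items() if w not in vocab)
    total_count = sum(word_dict.values())
    return oov_count * 100 / total_count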
def main(config_path, model_save_path): # Load a config file (.yml) with open(config_path, "r") as f: config = yaml.load(f) params = config['param'] # Number of classes (excluding the blank class) if params['feature'] == 'fbank': input_size = 123 elif params['feature'] == 'is13': input_size = 141 else: raise TypeError if params['label_type'] in ['original', 'phone3']: params['num_classes'] = 3 elif params['label_type'] == 'phone4': params['num_classes'] = 4 elif params['label_type'] == 'phone43': params['num_classes'] = 43 else: raise TypeError # Model setting model = CTC(encoder_type=params['encoder_type'], input_size=input_size * params['num_stack'], splice=params['splice'], num_units=params['num_units'], num_layers=params['num_layers'], num_classes=params['num_classes'], lstm_impl=params['lstm_impl'], use_peephole=params['use_peephole'], parameter_init=params['weight_init'], clip_grad_norm=params['clip_grad_norm'], clip_activation=params['clip_activation'], num_proj=params['num_proj'], weight_decay=params['weight_decay']) # Set process name setproctitle('tf_svc_' + model.name + '_' + params['label_type']) model.name += '_' + str(params['num_units']) model.name += '_' + str(params['num_layers']) model.name += '_' + params['optimizer'] model.name += '_lr' + str(params['learning_rate']) if params['num_proj'] != 0: model.name += '_proj' + str(params['num_proj']) if params['dropout'] != 0: model.name += '_drop' + str(params['dropout']) if params['num_stack'] != 1: model.name += '_stack' + str(params['num_stack']) if params['weight_decay'] != 0: model.name += '_wd' + str(params['weight_decay']) # Set save path model.save_path = mkdir_join( model_save_path, 'ctc', params['label_type'], model.name) # Reset model directory model_index = 0 new_model_path = model.save_path while True: if isfile(join(new_model_path, 'complete.txt')): # Training of the first model has been finished model_index += 1 new_model_path = model.save_path + '_' + str(model_index) elif isfile(join(new_model_path, 'config.yml')): # Training of the first model has not been finished yet model_index += 1 new_model_path = model.save_path + '_' + str(model_index) else: break model.save_path = mkdir(new_model_path) # Save config file shutil.copyfile(config_path, join(model.save_path, 'config.yml')) sys.stdout = open(join(model.save_path, 'train.log'), 'w') # TODO(hirofumi): change to logger do_train(model=model, params=params)
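Note that the input size passed to the CTC model is input_size * num_stack: with 123-dim fbank features and, say, num_stack=3, each model step consumes a 369-dim stacked frame. A sketch of the stacking/skipping convention assumed by num_stack/num_skip (the repo's actual implementation lives in its input pipeline):

import numpy as np

def stack_frames(x, num_stack, num_skip):
    """Stack num_stack consecutive frames and subsample every num_skip frames.

    x: [T, D] -> [T', D * num_stack]
    """
    frames = [x[t:t + num_stack].reshape(-1)
              for t in range(0, x.shape[0] - num_stack + 1, num_skip)]
    return np.stack(frames)

# e.g. 100 fbank frames of 123 dims -> 33 stacked frames of 369 dims
assert stack_frames(np.zeros((100, 123)), 3, 3).shape == (33, 369)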
def main(): # for data_type in ['train', 'dev', 'eval2000_swbd', 'eval2000_ch']: for data_type in ['eval2000_swbd', 'eval2000_ch']: print('=' * 50) print(' ' * 20 + data_type) print('=' * 50) if 'eval' in data_type: data_type_tmp = 'eval2000' else: data_type_tmp = data_type # Convert transcript to index print('=> Processing transcripts...') trans_dict = read_text(text_path=join(args.data_save_path, data_type_tmp, 'text'), vocab_save_path=mkdir_join( args.data_save_path, 'vocab'), data_type=data_type, lexicon_path=None) # Make dataset file (.csv) print('=> Saving dataset files...') csv_save_path = mkdir_join(args.data_save_path, 'dataset', args.tool, data_type) df_columns = ['frame_num', 'input_path', 'transcript'] df_word1 = pd.DataFrame([], columns=df_columns) df_word5 = pd.DataFrame([], columns=df_columns) df_word10 = pd.DataFrame([], columns=df_columns) df_word15 = pd.DataFrame([], columns=df_columns) df_word20 = pd.DataFrame([], columns=df_columns) df_char = pd.DataFrame([], columns=df_columns) df_char_capital = pd.DataFrame([], columns=df_columns) df_char_left = pd.DataFrame([], columns=df_columns) df_char_right = pd.DataFrame([], columns=df_columns) df_char_both = pd.DataFrame([], columns=df_columns) df_char_remove = pd.DataFrame([], columns=df_columns) with open( join(args.data_save_path, 'feature', args.tool, data_type, 'frame_num.pickle'), 'rb') as f: frame_num_dict = pickle.load(f) utt_count = 0 df_word1_list, df_word5_list, df_word10_list, df_word15_list, df_word20_list = [], [], [], [], [] df_char_list, df_char_capital_list = [], [] df_char_left_list, df_char_right_list, df_char_both_list, df_char_remove_list = [], [], [], [] for utt_idx, trans in tqdm(trans_dict.items()): speaker = '_'.join(utt_idx.split('_')[:2]) feat_utt_save_path = join(args.data_save_path, 'feature', args.tool, data_type, speaker, utt_idx + '.npy') frame_num = frame_num_dict[utt_idx] if not isfile(feat_utt_save_path): raise ValueError('There is no file: %s' % feat_utt_save_path) df_word1 = add_element( df_word1, [frame_num, feat_utt_save_path, trans['word1']]) df_word5 = add_element( df_word5, [frame_num, feat_utt_save_path, trans['word5']]) df_word10 = add_element( df_word10, [frame_num, feat_utt_save_path, trans['word10']]) df_word15 = add_element( df_word15, [frame_num, feat_utt_save_path, trans['word15']]) df_word20 = add_element( df_word20, [frame_num, feat_utt_save_path, trans['word20']]) df_char = add_element( df_char, [frame_num, feat_utt_save_path, trans['char']]) df_char_capital = add_element( df_char_capital, [frame_num, feat_utt_save_path, trans['char_capital']]) df_char_left = add_element( df_char_left, [frame_num, feat_utt_save_path, trans['char_left']]) df_char_right = add_element( df_char_right, [frame_num, feat_utt_save_path, trans['char_right']]) df_char_both = add_element( df_char_both, [frame_num, feat_utt_save_path, trans['char_both']]) df_char_remove = add_element( df_char_remove, [frame_num, feat_utt_save_path, trans['char_remove']]) utt_count += 1 # Reset if utt_count == 10000: df_word1_list.append(df_word1) df_word5_list.append(df_word5) df_word10_list.append(df_word10) df_word15_list.append(df_word15) df_word20_list.append(df_word20) df_char_list.append(df_char) df_char_capital_list.append(df_char_capital) df_char_left_list.append(df_char_left) df_char_right_list.append(df_char_right) df_char_both_list.append(df_char_both) df_char_remove_list.append(df_char_remove) df_word1 = pd.DataFrame([], columns=df_columns) df_word5 = pd.DataFrame([], columns=df_columns) df_word10 = 
pd.DataFrame([], columns=df_columns) df_word15 = pd.DataFrame([], columns=df_columns) df_word20 = pd.DataFrame([], columns=df_columns) df_char = pd.DataFrame([], columns=df_columns) df_char_capital = pd.DataFrame([], columns=df_columns) df_char_left = pd.DataFrame([], columns=df_columns) df_char_right = pd.DataFrame([], columns=df_columns) df_char_both = pd.DataFrame([], columns=df_columns) df_char_remove = pd.DataFrame([], columns=df_columns) utt_count = 0 # Last dataframe df_word1_list.append(df_word1) df_word5_list.append(df_word5) df_word10_list.append(df_word10) df_word15_list.append(df_word15) df_word20_list.append(df_word20) df_char_list.append(df_char) df_char_capital_list.append(df_char_capital) df_char_left_list.append(df_char_left) df_char_right_list.append(df_char_right) df_char_both_list.append(df_char_both) df_char_remove_list.append(df_char_remove) # Concatenate all dataframes df_word1 = df_word1_list[0] df_word5 = df_word5_list[0] df_word10 = df_word10_list[0] df_word15 = df_word15_list[0] df_word20 = df_word20_list[0] df_char = df_char_list[0] df_char_capital = df_char_capital_list[0] df_char_left = df_char_left_list[0] df_char_right = df_char_right_list[0] df_char_both = df_char_both_list[0] df_char_remove = df_char_remove_list[0] for i in df_word1_list[1:]: df_word1 = pd.concat([df_word1, i], axis=0) for i in df_word5_list[1:]: df_word5 = pd.concat([df_word5, i], axis=0) for i in df_word10_list[1:]: df_word10 = pd.concat([df_word10, i], axis=0) for i in df_word15_list[1:]: df_word15 = pd.concat([df_word15, i], axis=0) for i in df_word20_list[1:]: df_word20 = pd.concat([df_word20, i], axis=0) for i in df_char_list[1:]: df_char = pd.concat([df_char, i], axis=0) for i in df_char_capital_list[1:]: df_char_capital = pd.concat([df_char_capital, i], axis=0) for i in df_char_left_list[1:]: df_char_left = pd.concat([df_char_left, i], axis=0) for i in df_char_right_list[1:]: df_char_right = pd.concat([df_char_right, i], axis=0) for i in df_char_both_list[1:]: df_char_both = pd.concat([df_char_both, i], axis=0) for i in df_char_remove_list[1:]: df_char_remove = pd.concat([df_char_remove, i], axis=0) df_word1.to_csv(join(csv_save_path, 'word1.csv'), encoding='utf-8') df_word5.to_csv(join(csv_save_path, 'word5.csv'), encoding='utf-8') df_word10.to_csv(join(csv_save_path, 'word10.csv'), encoding='utf-8') df_word15.to_csv(join(csv_save_path, 'word15.csv'), encoding='utf-8') df_word20.to_csv(join(csv_save_path, 'word20.csv'), encoding='utf-8') df_char.to_csv(join(csv_save_path, 'character.csv'), encoding='utf-8') df_char_capital.to_csv(join(csv_save_path, 'character_capital_divide.csv'), encoding='utf-8') df_char_left.to_csv(join(csv_save_path, 'char_left.csv'), encoding='utf-8') df_char_right.to_csv(join(csv_save_path, 'char_right.csv'), encoding='utf-8') df_char_both.to_csv(join(csv_save_path, 'char_both.csv'), encoding='utf-8') df_char_remove.to_csv(join(csv_save_path, 'char_remove.csv'), encoding='utf-8')
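The 10,000-utterance chunking above avoids the quadratic cost of growing a DataFrame one row at a time, but the pairwise pd.concat loops at the end can themselves be collapsed into a single call per label type. A sketch of that simplification (same behavior assumed for the chunk lists):

import pandas as pd

def concat_chunks(df_list, columns):
    """Concatenate chunked dataframes in one call instead of a pairwise loop."""
    df_list = [df for df in df_list if len(df) > 0]
    if not df_list:
        return pd.DataFrame([], columns=columns)
    return pd.concat(df_list, axis=0)

# e.g. df_word1 = concat_chunks(df_word1_list, df_columns), and likewise
# for each of the other label types.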
def main(config_path, model_save_path): # Load a config file (.yml) with open(config_path, "r") as f: config = yaml.load(f) params = config['param'] params['sos_index'] = 0 params['eos_index'] = 1 if params['label_type'] == 'phone61': params['att_num_classes'] = 63 params['ctc_num_classes'] = 61 elif params['label_type'] == 'phone48': params['att_num_classes'] = 50 params['ctc_num_classes'] = 48 elif params['label_type'] == 'phone39': params['att_num_classes'] = 41 params['ctc_num_classes'] = 39 elif params['label_type'] == 'character': params['att_num_classes'] = 30 params['ctc_num_classes'] = 28 # Model setting # AttentionModel = load(model_type=config['model_name']) model = JointCTCAttention( input_size=params['input_size'], encoder_num_unit=params['encoder_num_unit'], encoder_num_layer=params['encoder_num_layer'], attention_dim=params['attention_dim'], attention_type=params['attention_type'], decoder_num_unit=params['decoder_num_unit'], decoder_num_layer=params['decoder_num_layer'], embedding_dim=params['embedding_dim'], att_num_classes=params['att_num_classes'], ctc_num_classes=params['ctc_num_classes'], att_task_weight=params['att_task_weight'], sos_index=params['sos_index'], eos_index=params['eos_index'], max_decode_length=params['max_decode_length'], # attention_smoothing=params['attention_smoothing'], attention_weights_tempareture=params['attention_weights_tempareture'], logits_tempareture=params['logits_tempareture'], parameter_init=params['weight_init'], clip_grad=params['clip_grad'], clip_activation_encoder=params['clip_activation_encoder'], clip_activation_decoder=params['clip_activation_decoder'], weight_decay=params['weight_decay']) # Set process name setproctitle('timit_' + model.name + '_' + params['label_type']) model.name = params['model'] model.name += '_encoder' + str(params['encoder_num_unit']) model.name += '_' + str(params['encoder_num_layer']) model.name += '_attdim' + str(params['attention_dim']) model.name += '_decoder' + str(params['decoder_num_unit']) model.name += '_' + str(params['decoder_num_layer']) model.name += '_' + params['optimizer'] model.name += '_lr' + str(params['learning_rate']) model.name += '_' + params['attention_type'] # if bool(params['attention_smoothing']): # model.name += '_smoothing' if params['attention_weights_tempareture'] != 1: model.name += '_sharpening' + \ str(params['attention_weights_tempareture']) if params['weight_decay'] != 0: model.name += '_weightdecay' + str(params['weight_decay']) # Set save path model.save_path = mkdir_join(model_save_path, 'attention', params['label_type'], model.name) # Reset model directory model_index = 0 new_model_path = model.save_path while True: if isfile(join(new_model_path, 'complete.txt')): # Training of the first model has been finished model_index += 1 new_model_path = model.save_path + '_' + str(model_index) elif isfile(join(new_model_path, 'config.yml')): # Training of the first model has not been finished yet # tf.gfile.DeleteRecursively(new_model_path) # tf.gfile.MakeDirs(new_model_path) # break model_index += 1 new_model_path = model.save_path + '_' + str(model_index) else: break model.save_path = mkdir(new_model_path) # Save config file shutil.copyfile(config_path, join(model.save_path, 'config.yml')) # sys.stdout = open(join(model.save_path, 'train.log'), 'w') # TODO(hirofumi): change to logger do_train(model=model, params=params)
def main(): for data_type in ['train', 'dev', 'eval1', 'eval2', 'eval3']: # TODO: data_size print('=' * 50) print(' ' * 20 + data_type) print('=' * 50) # Convert transcript to index print('=> Processing transcripts...') trans_dict = read_text( text_path=join(args.data_save_path, data_type, 'text'), vocab_save_path=mkdir_join(args.data_save_path, 'vocab'), data_type=data_type, kana2phone_path='./local/csj_make_trans/kana2phone', lexicon_path=None) # Make dataset file (.csv) print('=> Saving dataset files...') csv_save_path = mkdir_join(args.data_save_path, 'dataset', args.tool, data_type) df_columns = ['frame_num', 'input_path', 'transcript'] df_word = pd.DataFrame([], columns=df_columns) df_char = pd.DataFrame([], columns=df_columns) df_char_wb = pd.DataFrame([], columns=df_columns) df_char_wb_left = pd.DataFrame([], columns=df_columns) df_char_wb_right = pd.DataFrame([], columns=df_columns) df_char_wb_both = pd.DataFrame([], columns=df_columns) df_char_wb_remove = pd.DataFrame([], columns=df_columns) # df_phone = pd.DataFrame([], columns=df_columns) # df_phone_wb = pd.DataFrame([], columns=df_columns) df_pos = pd.DataFrame([], columns=df_columns) with open( join(args.data_save_path, 'feature', args.tool, data_type, 'frame_num.pickle'), 'rb') as f: frame_num_dict = pickle.load(f) utt_count = 0 df_word_list = [] df_char_list, df_char_wb_list = [], [] df_char_wb_left_list, df_char_wb_right_list = [], [] df_char_wb_both_list, df_char_wb_remove_list = [], [] # df_phone_list, df_phone_wb_list = [], [] df_pos_list = [] for utt_idx, trans in tqdm(trans_dict.items()): speaker = utt_idx.split('_')[0] feat_utt_save_path = join(args.data_save_path, 'feature', args.tool, data_type, speaker, utt_idx + '.npy') frame_num = frame_num_dict[utt_idx] if not isfile(feat_utt_save_path): raise ValueError('There is no file: %s' % feat_utt_save_path) df_word = add_element( df_word, [frame_num, feat_utt_save_path, trans['word']]) df_char = add_element( df_char, [frame_num, feat_utt_save_path, trans['char']]) df_char_wb = add_element( df_char_wb, [frame_num, feat_utt_save_path, trans['char_wb']]) df_char_wb_left = add_element( df_char_wb_left, [frame_num, feat_utt_save_path, trans['char_wb_left']]) df_char_wb_right = add_element( df_char_wb_right, [frame_num, feat_utt_save_path, trans['char_wb_right']]) df_char_wb_both = add_element( df_char_wb_both, [frame_num, feat_utt_save_path, trans['char_wb_both']]) df_char_wb_remove = add_element( df_char_wb_remove, [frame_num, feat_utt_save_path, trans['char_wb_remove']]) # df_phone = add_element( # df_phone, [frame_num, feat_utt_save_path, phone_indices]) # df_phone_wb = add_element( # df_phone_wb, [frame_num, feat_utt_save_path, phone_wb_indices]) df_pos = add_element(df_pos, [frame_num, feat_utt_save_path, trans['pos']]) utt_count += 1 # Reset if utt_count == 10000: df_word_list.append(df_word) df_char_list.append(df_char) df_char_wb_list.append(df_char_wb) df_char_wb_left_list.append(df_char_wb_left) df_char_wb_right_list.append(df_char_wb_right) df_char_wb_both_list.append(df_char_wb_both) df_char_wb_remove_list.append(df_char_wb_remove) # df_phone_list.append(df_phone) # df_phone_wb_list.append(df_phone_wb) df_pos_list.append(df_pos) df_word = pd.DataFrame([], columns=df_columns) df_char = pd.DataFrame([], columns=df_columns) df_char_wb = pd.DataFrame([], columns=df_columns) df_char_wb_left = pd.DataFrame([], columns=df_columns) df_char_wb_right = pd.DataFrame([], columns=df_columns) df_char_wb_both = pd.DataFrame([], columns=df_columns) df_char_wb_remove = 
pd.DataFrame([], columns=df_columns) # df_phone = pd.DataFrame([], columns=df_columns) # df_phone_wb = pd.DataFrame([], columns=df_columns) df_pos = pd.DataFrame([], columns=df_columns) utt_count = 0 # Last dataframe df_word_list.append(df_word) df_char_list.append(df_char) df_char_wb_list.append(df_char_wb) df_char_wb_left_list.append(df_char_wb_left) df_char_wb_right_list.append(df_char_wb_right) df_char_wb_both_list.append(df_char_wb_both) df_char_wb_remove_list.append(df_char_wb_remove) # df_phone_list.append(df_phone) # df_phone_wb_list.append(df_phone_wb) df_pos_list.append(df_pos) # Concatenate all dataframes df_word = df_word_list[0] df_char = df_char_list[0] df_char_wb = df_char_wb_list[0] df_char_wb_left = df_char_wb_left_list[0] df_char_wb_right = df_char_wb_right_list[0] df_char_wb_both = df_char_wb_both_list[0] df_char_wb_remove = df_char_wb_remove_list[0] # df_phone = df_phone_list[0] # df_phone_wb = df_phone_wb_list[0] df_pos = df_pos_list[0] for i in df_word_list[1:]: df_word = pd.concat([df_word, i], axis=0) for i in df_char_list[1:]: df_char = pd.concat([df_char, i], axis=0) for i in df_char_wb_list[1:]: df_char_wb = pd.concat([df_char_wb, i], axis=0) for i in df_char_wb_left_list[1:]: df_char_wb_left = pd.concat([df_char_wb_left, i], axis=0) for i in df_char_wb_right_list[1:]: df_char_wb_right = pd.concat([df_char_wb_right, i], axis=0) for i in df_char_wb_both_list[1:]: df_char_wb_both = pd.concat([df_char_wb_both, i], axis=0) for i in df_char_wb_remove_list[1:]: df_char_wb_remove = pd.concat([df_char_wb_remove, i], axis=0) # for i in df_phone_list[1:]: # df_phone = pd.concat([df_phone, i], axis=0) # for i in df_phone_wb_list[1:]: # df_phone_wb = pd.concat([df_phone_wb, i], axis=0) for i in df_pos_list[1:]: df_pos = pd.concat([df_pos, i], axis=0) df_word.to_csv(join(csv_save_path, 'word.csv'), encoding='utf-8') df_char.to_csv(join(csv_save_path, 'character.csv'), encoding='utf-8') df_char_wb.to_csv(join(csv_save_path, 'character_wb.csv'), encoding='utf-8') df_char_wb_left.to_csv(join(csv_save_path, 'character_wb_left.csv'), encoding='utf-8') df_char_wb_right.to_csv(join(csv_save_path, 'character_wb_right.csv'), encoding='utf-8') df_char_wb_both.to_csv(join(csv_save_path, 'character_wb_both.csv'), encoding='utf-8') df_char_wb_remove.to_csv(join(csv_save_path, 'character_wb_remove.csv'), encoding='utf-8') # df_phone.to_csv(join(csv_save_path, 'phone.csv'), encoding='utf-8') # df_phone_wb.to_csv(join(csv_save_path, 'phone_wb.csv'), encoding='utf-8') df_pos.to_csv(join(csv_save_path, 'pos.csv'), encoding='utf-8')
def main(config_path, model_save_path): # Read a config file (.yml) with open(config_path, "r") as f: config = yaml.load(f) params = config['param'] # Number of classes (excluding the blank label) if params['label_type_main'] == 'kanji': params['num_classes_main'] = 3386 elif params['label_type_main'] == 'kana': params['num_classes_main'] = 147 else: raise TypeError if params['label_type_sub'] == 'kana': params['num_classes_sub'] = 147 elif params['label_type_sub'] == 'phone': params['num_classes_sub'] = 38 else: raise TypeError # Model setting model = load(model_type=params['model']) model = model(batch_size=params['batch_size'], input_size=params['input_size'], splice=params['splice'], num_stack=params['num_stack'], num_units=params['num_units'], num_layer_main=params['num_layer_main'], num_layer_sub=params['num_layer_sub'], # bottleneck_dim=params['bottleneck_dim'], num_classes_main=params['num_classes_main'], num_classes_sub=params['num_classes_sub'], main_task_weight=params['main_task_weight'], parameter_init=params['weight_init'], clip_grad_norm=params['clip_grad_norm'], clip_activation=params['clip_activation'], num_proj=params['num_proj'], weight_decay=params['weight_decay']) model.model_name = params['model'] model.model_name += '_' + str(params['num_units']) model.model_name += '_main' + str(params['num_layer_main']) model.model_name += '_sub' + str(params['num_layer_sub']) model.model_name += '_' + params['optimizer'] model.model_name += '_lr' + str(params['learning_rate']) if params['bottleneck_dim'] != 0: model.model_name += '_bottleneck' + str(params['bottleneck_dim']) if params['num_proj'] != 0: model.model_name += '_proj' + str(params['num_proj']) if params['num_stack'] != 1: model.model_name += '_stack' + str(params['num_stack']) if params['weight_decay'] != 0: model.model_name += '_weightdecay' + str(params['weight_decay']) model.model_name += '_taskweight' + str(params['main_task_weight']) if params['train_data_size'] == 'large': model.model_name += '_large' # Set save path model.save_path = mkdir(model_save_path) model.save_path = mkdir_join(model.save_path, 'ctc') model.save_path = mkdir_join( model.save_path, params['label_type_main'] + '_' + params['label_type_sub']) model.save_path = mkdir_join(model.save_path, model.model_name) # Reset model directory if not isfile(join(model.save_path, 'complete.txt')): tf.gfile.DeleteRecursively(model.save_path) tf.gfile.MakeDirs(model.save_path) else: raise ValueError('File exists.') # Set process name setproctitle('csj_multictc_' + params['label_type_main'] + '_' + params['label_type_sub'] + '_' + params['train_data_size']) # Save config file shutil.copyfile(config_path, join(model.save_path, 'config.yml')) sys.stdout = open(join(model.save_path, 'train.log'), 'w') do_train(model=model, params=params)
def main(): for data_size in ['train_si84', 'train_si284']: print('=' * 100) print(' ' * 40 + data_size) print('=' * 100) for data_type in [data_size, 'test_dev93', 'test_eval92']: print('=' * 50) print(' ' * 20 + data_type) print('=' * 50) # Convert transcript to index print('=> Processing transcripts...') trans_dict = read_text( text_path=join(args.data_save_path, data_type, 'text'), vocab_save_path=mkdir_join( args.data_save_path, 'vocab', data_size), data_type=data_type, lexicon_path=None) # Make dataset file (.csv) print('=> Saving dataset files...') csv_save_path = mkdir_join( args.data_save_path, 'dataset', args.tool, data_size, data_type) df_columns = ['frame_num', 'input_path', 'transcript'] df_word = pd.DataFrame([], columns=df_columns) df_char = pd.DataFrame([], columns=df_columns) df_char_capital = pd.DataFrame([], columns=df_columns) with open(join(args.data_save_path, 'feature', args.tool, data_size, data_type, 'frame_num.pickle'), 'rb') as f: frame_num_dict = pickle.load(f) utt_count = 0 df_word_list = [] df_char_list, df_char_capital_list = [], [] for utt_idx, trans in tqdm(trans_dict.items()): speaker = utt_idx[:3] feat_utt_save_path = join( args.data_save_path, 'feature', args.tool, data_size, data_type, speaker, utt_idx + '.npy') frame_num = frame_num_dict[utt_idx] if not isfile(feat_utt_save_path): raise ValueError('There is no file: %s' % feat_utt_save_path) df_word = add_element( df_word, [frame_num, feat_utt_save_path, trans['word']]) df_char = add_element( df_char, [frame_num, feat_utt_save_path, trans['char']]) df_char_capital = add_element( df_char_capital, [frame_num, feat_utt_save_path, trans['char_capital']]) utt_count += 1 # Reset if utt_count == 10000: df_word_list.append(df_word) df_char_list.append(df_char) df_char_capital_list.append(df_char_capital) df_word = pd.DataFrame([], columns=df_columns) df_char = pd.DataFrame([], columns=df_columns) df_char_capital = pd.DataFrame([], columns=df_columns) utt_count = 0 # Last dataframe df_word_list.append(df_word) df_char_list.append(df_char) df_char_capital_list.append(df_char_capital) # Concatenate all dataframes df_word = df_word_list[0] df_char = df_char_list[0] df_char_capital = df_char_capital_list[0] for i in df_word_list[1:]: df_word = pd.concat([df_word, i], axis=0) for i in df_char_list[1:]: df_char = pd.concat([df_char, i], axis=0) for i in df_char_capital_list[1:]: df_char_capital = pd.concat([df_char_capital, i], axis=0) df_word.to_csv( join(csv_save_path, 'word.csv'), encoding='utf-8') df_char.to_csv( join(csv_save_path, 'character.csv'), encoding='utf-8') df_char_capital.to_csv( join(csv_save_path, 'character_capital_divide.csv'), encoding='utf-8')
def main(): print('=> Processing input data...') for data_size in ['train_si84', 'train_si284']: for data_type in [data_size, 'test_dev93', 'test_eval92']: print('===> %s' % data_type) feature_save_path = mkdir_join( args.data_save_path, 'feature', args.tool, data_size, data_type) utt_indices = [] with open(join(args.data_save_path, data_type, 'text'), 'r') as f: for line in f: line = line.strip() utt_idx = line.split(' ')[0] utt_indices.append(utt_idx) audio_paths = [] if args.tool == 'htk': with open(join(args.data_save_path, data_type, 'htk.scp'), 'r') as f: for line in f: htk_path = line.strip() audio_paths.append(htk_path) else: with open(join(args.data_save_path, data_type, 'wav.scp'), 'r') as f: for line in f: line = line.strip() wav_path = line.split(' ')[4] audio_paths.append(wav_path) spk2gender = {} with open(join(args.data_save_path, data_type, 'spk2gender'), 'r') as f: for line in f: line = line.strip() speaker, gender = line.split(' ') spk2gender[speaker] = gender if 'train' in data_type: global_mean_male, global_std_male = None, None global_mean_female, global_std_female = None, None else: # Load statistics over train dataset global_mean_male = np.load( join(args.data_save_path, 'feature', args.tool, data_size, data_size, 'global_mean_male.npy')) global_std_male = np.load( join(args.data_save_path, 'feature', args.tool, data_size, data_size, 'global_std_male.npy')) global_mean_female = np.load( join(args.data_save_path, 'feature', args.tool, data_size, data_size, 'global_mean_female.npy')) global_std_female = np.load( join(args.data_save_path, 'feature', args.tool, data_size, data_size, 'global_std_female.npy')) read_audio(data_type=data_type, audio_paths=audio_paths, spk2gender=spk2gender, tool=args.tool, config=CONFIG, normalize=args.normalize, save_path=feature_save_path, global_mean_male=global_mean_male, global_std_male=global_std_male, global_mean_female=global_mean_female, global_std_female=global_std_female)
def main(): args = parser.parse_args() ################################################## # DATASET ################################################## if args.model_save_path is not None: # Load a config file (.yml) params = load_config(args.config_path) # NOTE: Retrain the saved model from the last checkpoint elif args.saved_model_path is not None: params = load_config(os.path.join(args.saved_model_path, 'config.yml')) else: raise ValueError("Set model_save_path or saved_model_path.") # Load dataset train_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='train', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], max_epoch=params['num_epoch'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=True, sort_stop_epoch=params['sort_stop_epoch'], tool=params['tool'], num_enque=None, dynamic_batching=params['dynamic_batching']) dev_clean_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='dev_clean', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, tool=params['tool']) dev_other_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='dev_other', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, tool=params['tool']) test_clean_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='test_clean', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], tool=params['tool']) test_other_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='test_other', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], tool=params['tool']) params['num_classes'] = train_data.num_classes ################################################## # MODEL ################################################## # Model setting model = load(model_type=params['model_type'], params=params, backend=params['backend']) if args.model_save_path is not None: # Set save path save_path = mkdir_join(args.model_save_path, params['backend'], params['model_type'], params['label_type'], params['data_size'], model.name) model.set_save_path(save_path) # Save config file save_config(config_path=args.config_path, save_path=model.save_path) # Setting for logging logger = set_logger(model.save_path) if os.path.isdir(params['char_init']): # NOTE: Start training from the pre-trained character model
model.load_checkpoint(save_path=params['char_init'], epoch=-1, load_pretrained_model=True) # Count total parameters for name in sorted(list(model.num_params_dict.keys())): num_params = model.num_params_dict[name] logger.info("%s %d" % (name, num_params)) logger.info("Total %.3f M parameters" % (model.total_parameters / 1000000)) # Define optimizer model.set_optimizer(optimizer=params['optimizer'], learning_rate_init=float(params['learning_rate']), weight_decay=float(params['weight_decay']), clip_grad_norm=params['clip_grad_norm'], lr_schedule=False, factor=params['decay_rate'], patience_epoch=params['decay_patient_epoch']) epoch, step = 1, 0 learning_rate = float(params['learning_rate']) metric_dev_best = 1 # NOTE: Retrain the saved model from the last checkpoint elif args.saved_model_path is not None: # Set save path model.save_path = args.saved_model_path # Setting for logging logger = set_logger(model.save_path, restart=True) # Define optimizer model.set_optimizer( optimizer=params['optimizer'], learning_rate_init=float(params['learning_rate']), # on-the-fly weight_decay=float(params['weight_decay']), clip_grad_norm=params['clip_grad_norm'], lr_schedule=False, factor=params['decay_rate'], patience_epoch=params['decay_patient_epoch']) # Restore the last saved model epoch, step, learning_rate, metric_dev_best = model.load_checkpoint( save_path=args.saved_model_path, epoch=-1, restart=True) else: raise ValueError("Set model_save_path or saved_model_path.") train_data.epoch = epoch - 1 # GPU setting model.set_cuda(deterministic=False, benchmark=True) logger.info('PID: %s' % os.getpid()) logger.info('HOSTNAME: %s' % os.uname()[1]) # Set process name setproctitle('libri_' + params['backend'] + '_' + params['model_type'] + '_' + params['label_type'] + '_' + params['data_size']) ################################################## # TRAINING LOOP ################################################## # Define learning rate controller lr_controller = Controller( learning_rate_init=learning_rate, backend=params['backend'], decay_start_epoch=params['decay_start_epoch'], decay_rate=params['decay_rate'], decay_patient_epoch=params['decay_patient_epoch'], lower_better=True) # Setting for tensorboard if params['backend'] == 'pytorch': tf_writer = SummaryWriter(model.save_path) # Train model csv_steps, csv_loss_train, csv_loss_dev = [], [], [] start_time_train = time.time() start_time_epoch = time.time() start_time_step = time.time() not_improved_epoch = 0 best_model = model loss_train_mean = 0.
pbar_epoch = tqdm(total=len(train_data)) while True: # Compute loss in the training set (including parameter update) batch_train, is_new_epoch = train_data.next() model, loss_train_val = train_step(model, batch_train, params['clip_grad_norm'], backend=params['backend']) loss_train_mean += loss_train_val pbar_epoch.update(len(batch_train['xs'])) if (step + 1) % params['print_step'] == 0: # Compute loss in the dev set batch_dev = dev_clean_data.next()[0] loss_dev = model(batch_dev['xs'], batch_dev['ys'], batch_dev['x_lens'], batch_dev['y_lens'], is_eval=True) loss_train_mean /= params['print_step'] csv_steps.append(step) csv_loss_train.append(loss_train_mean) csv_loss_dev.append(loss_dev) # Logging by tensorboard if params['backend'] == 'pytorch': tf_writer.add_scalar('train/loss', loss_train_mean, step + 1) tf_writer.add_scalar('dev/loss', loss_dev, step + 1) for name, param in model.named_parameters(): name = name.replace('.', '/') tf_writer.add_histogram(name, param.data.cpu().numpy(), step + 1) tf_writer.add_histogram(name + '/grad', param.grad.data.cpu().numpy(), step + 1) duration_step = time.time() - start_time_step logger.info( "...Step:%d(epoch:%.3f) loss:%.3f(%.3f)/lr:%.5f/batch:%d/x_lens:%d (%.3f min)" % (step + 1, train_data.epoch_detail, loss_train_mean, loss_dev, learning_rate, train_data.current_batch_size, max(batch_train['x_lens']) * params['num_stack'], duration_step / 60)) start_time_step = time.time() loss_train_mean = 0. step += 1 # Save checkpoint and evaluate model per epoch if is_new_epoch: duration_epoch = time.time() - start_time_epoch logger.info('===== EPOCH:%d (%.3f min) =====' % (epoch, duration_epoch / 60)) # Save figure of loss plot_loss(csv_loss_train, csv_loss_dev, csv_steps, save_path=model.save_path) if epoch < params['eval_start_epoch']: # Save the model model.save_checkpoint(model.save_path, epoch, step, learning_rate, metric_dev_best) else: start_time_eval = time.time() # dev if 'word' in params['label_type']: metric_dev_epoch, _ = do_eval_wer( models=[model], dataset=dev_clean_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_WORD, eval_batch_size=1) logger.info(' WER (dev-clean): %.3f %%' % (metric_dev_epoch * 100)) else: metric_dev_epoch, wer_dev_clean_epoch, _ = do_eval_cer( models=[model], dataset=dev_clean_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_CHAR, eval_batch_size=1) logger.info(' CER / WER (dev-clean): %.3f %% / %.3f %%' % ((metric_dev_epoch * 100), (wer_dev_clean_epoch * 100))) if metric_dev_epoch < metric_dev_best: metric_dev_best = metric_dev_epoch not_improved_epoch = 0 best_model = copy.deepcopy(model) logger.info('||||| Best Score |||||') # Save the model model.save_checkpoint(model.save_path, epoch, step, learning_rate, metric_dev_best) # dev-other & test if 'word' in params['label_type']: metric_dev_other_epoch, _ = do_eval_wer( models=[model], dataset=dev_other_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_WORD, eval_batch_size=1) logger.info(' WER (dev-other): %.3f %%' % (metric_dev_other_epoch * 100)) wer_test_clean, _ = do_eval_wer( models=[model], dataset=test_clean_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_WORD, eval_batch_size=1) logger.info(' WER (test-clean): %.3f %%' % (wer_test_clean * 100)) wer_test_other, _ = do_eval_wer( models=[model], dataset=test_other_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_WORD, eval_batch_size=1) logger.info(' WER (test-other): %.3f %%' % (wer_test_other * 100)) logger.info( ' WER (test-mean): %.3f %%' % ((wer_test_clean + wer_test_other) * 100 / 2)) else:
metric_dev_other_epoch, wer_dev_other_epoch, _ = do_eval_cer( models=[model], dataset=dev_other_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_CHAR, eval_batch_size=1) logger.info( ' CER / WER (dev-other): %.3f %% / %.3f %%' % ((metric_dev_other_epoch * 100), (wer_dev_other_epoch * 100))) cer_test_clean, wer_test_clean, _ = do_eval_cer( models=[model], dataset=test_clean_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_CHAR, eval_batch_size=1) logger.info( ' CER / WER (test-clean): %.3f %% / %.3f %%' % ((cer_test_clean * 100), (wer_test_clean * 100))) cer_test_other, wer_test_other, _ = do_eval_cer( models=[model], dataset=test_other_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_CHAR, eval_batch_size=1) logger.info( ' CER / WER (test-other): %.3f %% / %.3f %%' % ((cer_test_other * 100), (wer_test_other * 100))) logger.info( ' CER / WER (test-mean): %.3f %% / %.3f %%' % (((cer_test_clean + cer_test_other) * 100 / 2), ((wer_test_clean + wer_test_other) * 100 / 2))) else: not_improved_epoch += 1 duration_eval = time.time() - start_time_eval logger.info('Evaluation time: %.3f min' % (duration_eval / 60)) # Early stopping if not_improved_epoch == params['not_improved_patient_epoch']: break # Update learning rate model.optimizer, learning_rate = lr_controller.decay_lr( optimizer=model.optimizer, learning_rate=learning_rate, epoch=epoch, value=metric_dev_epoch) if epoch == params['convert_to_sgd_epoch']: # Convert to fine-tuning stage model.set_optimizer( 'sgd', learning_rate_init=learning_rate, weight_decay=float(params['weight_decay']), clip_grad_norm=params['clip_grad_norm'], lr_schedule=False, factor=params['decay_rate'], patience_epoch=params['decay_patient_epoch']) logger.info('========== Convert to SGD ==========') # Inject Gaussian noise to all parameters if float(params['weight_noise_std']) > 0: model.weight_noise_injection = True pbar_epoch = tqdm(total=len(train_data)) print('========== EPOCH:%d (%.3f min) ==========' % (epoch, duration_epoch / 60)) if epoch == params['num_epoch']: break start_time_step = time.time() start_time_epoch = time.time() epoch += 1 # TODO: evaluate the best model by beam search here duration_train = time.time() - start_time_train logger.info('Total time: %.3f hour' % (duration_train / 3600)) if params['backend'] == 'pytorch': tf_writer.close() # Training was finished correctly with open(os.path.join(model.save_path, 'COMPLETE'), 'w') as f: f.write('')
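train_step is imported from the training utilities. A minimal PyTorch-style sketch of what one update with gradient clipping might look like, based on how it is called above (the batch layout and the model.optimizer attribute are assumptions, not the repo's verified API):

import torch

def train_step(model, batch, clip_grad_norm, backend='pytorch'):
    """One forward/backward pass with gradient clipping; returns (model, loss value)."""
    assert backend == 'pytorch'  # this sketch covers the pytorch path only
    model.optimizer.zero_grad()
    loss = model(batch['xs'], batch['ys'], batch['x_lens'], batch['y_lens'])
    loss.backward()
    # Clip the global gradient norm before the update, as params['clip_grad_norm'] suggests
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
    model.optimizer.step()
    return model, loss.item()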
def do_save(model, params, epoch, eval_batch_size, temperature): """Save the CTC outputs. Args: model: the model to restore params (dict): A dictionary of parameters epoch (int): the epoch to restore eval_batch_size (int): the size of mini-batch in evaluation temperature (int): the temperature applied to the logits before the softmax (T > 1 yields softer posteriors) """ print('=' * 30) print(' frame stack %d' % int(params['num_stack'])) print(' splice %d' % int(params['splice'])) print(' temperature (training): %d' % temperature) print('=' * 30) # Load dataset train_data = Dataset( data_type='train', train_data_size=params['train_data_size'], label_type=params['label_type'], batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size, max_epoch=3, splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, num_gpu=1) dev_clean_data = Dataset( data_type='dev_clean', train_data_size=params['train_data_size'], label_type=params['label_type'], batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size, max_epoch=3, splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, num_gpu=1) dev_other_data = Dataset( data_type='dev_other', train_data_size=params['train_data_size'], label_type=params['label_type'], batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size, max_epoch=3, splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, num_gpu=1) test_clean_data = Dataset( data_type='test_clean', train_data_size=params['train_data_size'], label_type=params['label_type'], batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size, max_epoch=3, splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, num_gpu=1) test_other_data = Dataset( data_type='test_other', train_data_size=params['train_data_size'], label_type=params['label_type'], batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size, max_epoch=3, splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, num_gpu=1) with tf.name_scope('tower_gpu0'): # Define placeholders model.create_placeholders() # Add to the graph each operation (including model definition) _, logits = model.compute_loss( model.inputs_pl_list[0], model.labels_pl_list[0], model.inputs_seq_len_pl_list[0], model.keep_prob_pl_list[0]) logits /= temperature posteriors_op = model.posteriors(logits, blank_prior=1) # Create a saver for writing training checkpoints saver = tf.train.Saver() with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(model.save_path) # If a checkpoint exists if ckpt: model_path = ckpt.model_checkpoint_path if epoch != -1: model_path = model_path.split('/')[:-1] model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch) saver.restore(sess, model_path) print("Model restored: " + model_path) else: raise ValueError('There are no checkpoints.') ######################### # Save soft targets ######################### # train100h # save(session=sess, # posteriors_op=posteriors_op, # model=model, # dataset=train_data, # data_type='train', # num_stack=params['num_stack'], # save_prob=False, # save_soft_targets=True, # save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'train')) # dev # save(session=sess, # posteriors_op=posteriors_op, # model=model, # dataset=dev_clean_data, # data_type='dev_clean', # num_stack=params['num_stack'], # save_prob=False, # save_soft_targets=True, # save_path=mkdir_join(model.save_path, 
'temp' + str(temperature), 'dev_clean')) # save(session=sess, # posteriors_op=posteriors_op, # model=model, # dataset=dev_other_data, # data_type='dev_other', # num_stack=params['num_stack'], # save_prob=False, # save_soft_targets=True, # save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'dev_other')) # test save(session=sess, posteriors_op=posteriors_op, model=model, dataset=test_clean_data, data_type='test_clean', num_stack=params['num_stack'], save_prob=True, save_soft_targets=False, save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'test_clean')) save(session=sess, posteriors_op=posteriors_op, model=model, dataset=test_other_data, data_type='test_other', num_stack=params['num_stack'], save_prob=True, save_soft_targets=False, save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'test_other'))
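Dividing the logits by a temperature T > 1 before the softmax flattens the CTC posteriors, which is the standard way to produce soft targets for knowledge distillation. A small numeric illustration of the effect:

import numpy as np

def softmax_with_temperature(logits, temperature=1.0):
    """Softmax over the last axis after scaling logits by 1/temperature."""
    z = logits / temperature
    z = z - z.max(axis=-1, keepdims=True)  # for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

logits = np.array([2.0, 1.0, 0.1])
print(softmax_with_temperature(logits, 1))  # peaky: ~[0.66, 0.24, 0.10]
print(softmax_with_temperature(logits, 5))  # softer: ~[0.40, 0.33, 0.27]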
def read_audio(data_type, spk2audio, segment_dict, tool, config, normalize, save_path, global_mean_male=None, global_std_male=None, global_mean_female=None, global_std_female=None, dtype=np.float32): """Read HTK or WAV files. Args: data_type (string): the dataset name; statistics are computed only when it contains 'train' spk2audio (dict): key (string) => speaker value (string) => path to the speaker's audio file segment_dict (dict): key (string) => speaker value (dict) => utterance segments of the speaker tool (string): the tool to extract features, htk or librosa or python_speech_features config (dict): a configuration for feature extraction normalize (string): no => normalization will not be conducted global => normalize input features by global mean & stddev over the training set per gender speaker => normalize input features by mean & stddev per speaker utterance => normalize input features by mean & stddev per utterance save_path (string): path to save npy files global_mean_male (np.ndarray, optional): global mean of male over the training set global_std_male (np.ndarray, optional): global standard deviation of male over the training set global_mean_female (np.ndarray, optional): global mean of female over the training set global_std_female (np.ndarray, optional): global standard deviation of female over the training set dtype (optional): the type of data, default is np.float32 """ is_training = 'train' in data_type if not is_training: if global_mean_male is None or global_mean_female is None: raise ValueError('Set mean & stddev computed in the training set.') if normalize not in ['global', 'speaker', 'utterance', 'no']: raise ValueError( 'normalize must be "utterance" or "speaker" or "global" or "no".') if tool not in ['htk', 'python_speech_features', 'librosa']: raise TypeError( 'tool must be "htk" or "python_speech_features" or "librosa".') audio_paths_male, audio_paths_female = [], [] total_frame_num_male, total_frame_num_female = 0, 0 total_frame_num_dict = {} speaker_mean_dict = {} # NOTE: assume that speakers are different between sessions # Loop 1: Computing global mean and statistics if is_training and normalize != 'no': print('=====> Reading audio files...') for i, speaker in enumerate(tqdm(segment_dict.keys())): audio_path = spk2audio[speaker] # Divide each audio file into utterances _, feat_utt_sum, speaker_mean, _, total_frame_num_speaker = segment( audio_path, speaker, segment_dict[speaker], # dict of utterances is_training=True, sil_duration=0, tool=tool, config=config) if i == 0: # Initialize global statistics feat_dim = feat_utt_sum.shape[0] global_mean_male = np.zeros((feat_dim, ), dtype=dtype) global_mean_female = np.zeros((feat_dim, ), dtype=dtype) global_std_male = np.zeros((feat_dim, ), dtype=dtype) global_std_female = np.zeros((feat_dim, ), dtype=dtype) # For computing global mean if speaker[3] == 'M': audio_paths_male.append(audio_path) global_mean_male += feat_utt_sum total_frame_num_male += total_frame_num_speaker elif speaker[3] == 'F': audio_paths_female.append(audio_path) global_mean_female += feat_utt_sum total_frame_num_female += total_frame_num_speaker else: raise ValueError('gender must be M or F.') # For computing speaker mean & stddev if normalize == 'speaker': speaker_mean_dict[speaker] = speaker_mean total_frame_num_dict[speaker] = total_frame_num_speaker # NOTE: speaker mean is already computed print('=====> Computing global mean & stddev...') # Compute global mean per gender global_mean_male /= total_frame_num_male global_mean_female /= total_frame_num_female for speaker in tqdm(segment_dict.keys()): audio_path = spk2audio[speaker] # Divide each audio into utterances 
feat_dict_speaker, _, _, _, _ = segment(audio_path, speaker, segment_dict[speaker], is_training=True, sil_duration=0, tool=tool, config=config) # For computing global stddev if speaker[3] == 'M': for feat_utt in feat_dict_speaker.values(): global_std_male += np.sum(np.abs(feat_utt - global_mean_male)**2, axis=0) elif speaker[3] == 'F': for feat_utt in feat_dict_speaker.values(): global_std_female += np.sum(np.abs(feat_utt - global_mean_female)**2, axis=0) else: raise ValueError('gender must be M or F.') # Compute global stddev per gender global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1)) global_std_female = np.sqrt(global_std_female / (total_frame_num_female - 1)) # Save global mean & stddev per gender np.save(join(save_path, 'global_mean_male.npy'), global_mean_male) np.save(join(save_path, 'global_mean_female.npy'), global_mean_female) np.save(join(save_path, 'global_std_male.npy'), global_std_male) np.save(join(save_path, 'global_std_female.npy'), global_std_female) # Loop 2: Normalization and Saving print('=====> Normalization...') frame_num_dict = {} # sampPeriod, parmKind = None, None for speaker in tqdm(segment_dict.keys()): audio_path = spk2audio[speaker] if normalize == 'speaker' and is_training: speaker_mean = speaker_mean_dict[speaker] else: speaker_mean = None # Divide each audio into utterances feat_dict_speaker, _, speaker_mean, speaker_std, _ = segment( audio_path, speaker, segment_dict[speaker], is_training=is_training, sil_duration=0, tool=tool, config=config, mean=speaker_mean) # to compute speaker stddev # NOTE: feat_dict_speaker has not been normalized yet for utt_idx, feat_utt in feat_dict_speaker.items(): if normalize == 'no': pass elif normalize == 'global' or not is_training: # Normalize by mean & stddev over the training set per gender if speaker[3] == 'M': feat_utt -= global_mean_male feat_utt /= global_std_male elif speaker[3] == 'F': feat_utt -= global_mean_female feat_utt /= global_std_female else: raise ValueError('gender must be M or F.') elif normalize == 'speaker': # Normalize by mean & stddev per speaker feat_utt = (feat_utt - speaker_mean) / speaker_std elif normalize == 'utterance': # Normalize by mean & stddev per utterance utt_mean = np.mean(feat_utt, axis=0, dtype=dtype) utt_std = np.std(feat_utt, axis=0, dtype=dtype) feat_utt = (feat_utt - utt_mean) / utt_std frame_num_dict[utt_idx] = feat_utt.shape[0] # Save input features np.save(mkdir_join(save_path, speaker, utt_idx + '.npy'), feat_utt) # Save the frame number dictionary with open(join(save_path, 'frame_num.pickle'), 'wb') as f: pickle.dump(frame_num_dict, f)
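The two loops above are a standard two-pass estimator: the first pass accumulates per-gender sums to obtain the mean, and the second accumulates squared deviations to obtain a Bessel-corrected standard deviation. The same computation on an in-memory list of feature matrices, as a compact sketch:

import numpy as np

def two_pass_mean_std(feat_list, dtype=np.float32):
    """Global mean/std over a list of [T, D] feature matrices (ddof=1, as above)."""
    total_frames = sum(feat.shape[0] for feat in feat_list)
    mean = np.zeros(feat_list[0].shape[1], dtype=dtype)
    for feat in feat_list:
        mean += feat.sum(axis=0)  # pass 1: accumulate sums
    mean /= total_frames
    var_sum = np.zeros_like(mean)
    for feat in feat_list:
        var_sum += np.sum(np.abs(feat - mean) ** 2, axis=0)  # pass 2: squared deviations
    std = np.sqrt(var_sum / (total_frames - 1))
    return mean, std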
def main():
    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    dataset = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='eval1',
        # data_type='eval2',
        # data_type='eval3',
        data_size=params['data_size'],
        label_type=params['label_type'],
        label_type_sub=params['label_type_sub'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        reverse=False,
        tool=params['tool'])
    params['num_classes'] = dataset.num_classes
    params['num_classes_sub'] = dataset.num_classes_sub

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    save_path = mkdir_join(args.model_path, 'att_weights')

    ######################################################################

    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    for batch, is_new_epoch in dataset:
        # Decode
        best_hyps, aw, perm_idx = model.decode(
            batch['xs'], batch['x_lens'],
            beam_width=args.beam_width,
            max_decode_len=MAX_DECODE_LEN_WORD,
            min_decode_len=MIN_DECODE_LEN_WORD,
            length_penalty=args.length_penalty,
            coverage_penalty=args.coverage_penalty)
        best_hyps_sub, aw_sub, _ = model.decode(
            batch['xs'], batch['x_lens'],
            beam_width=args.beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_CHAR,
            min_decode_len=MIN_DECODE_LEN_CHAR,
            length_penalty=args.length_penalty,
            coverage_penalty=args.coverage_penalty,
            task_index=1)

        for b in range(len(batch['xs'])):
            word_list = dataset.idx2word(best_hyps[b], return_list=True)
            char_list = dataset.idx2char(best_hyps_sub[b], return_list=True)

            speaker = batch['input_names'][b].split('_')[0]

            plot_hierarchical_attention_weights(
                aw[b][:len(word_list), :batch['x_lens'][b]],
                aw_sub[b][:len(char_list), :batch['x_lens'][b]],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path, speaker,
                                     batch['input_names'][b] + '.png'),
                figsize=(40, 8))

        if is_new_epoch:
            break
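The attention matrices sliced above are [output_len, input_len], one row per decoded token. A minimal matplotlib sketch of how such a matrix can be rendered as a heatmap, independent of the `plot_hierarchical_attention_weights` helper; the array and labels here are randomly generated stand-ins:

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical attention matrix: rows = output tokens, columns = input frames.
# Dirichlet rows sum to 1, like attention weights.
aw = np.random.dirichlet(np.ones(50), size=8)   # [8 tokens, 50 frames]
labels = ['tok%d' % i for i in range(8)]        # illustrative token labels

plt.figure(figsize=(10, 4))
plt.imshow(aw, aspect='auto', interpolation='nearest')
plt.yticks(range(len(labels)), labels)
plt.xlabel('Input frames')
plt.ylabel('Output tokens')
plt.colorbar(label='Attention weight')
plt.savefig('att_weights_example.png')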
def main():
    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    dataset = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='test',
        label_type=params['label_type'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True,
        reverse=True,
        tool=params['tool'])
    params['num_classes'] = dataset.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    save_path = mkdir_join(args.model_path, 'ctc_probs')

    ######################################################################

    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    for batch, is_new_epoch in dataset:
        # Get CTC probs
        probs, x_lens, _ = model.posteriors(
            batch['xs'], batch['x_lens'], temperature=1)
        # NOTE: probs: '[B, T, num_classes]'

        # Visualize
        for b in range(len(batch['xs'])):
            plot_ctc_probs(
                probs[b, :x_lens[b], :],
                frame_num=x_lens[b],
                num_stack=dataset.num_stack,
                spectrogram=batch['xs'][b, :, :40],
                save_path=join(save_path, batch['input_names'][b] + '.png'),
                figsize=(14, 7))

        if is_new_epoch:
            break
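CTC posterior plots are usually drawn as one probability curve per class over time, with the blank class dominating between short spikes of non-blank classes. A minimal matplotlib sketch of that kind of plot, using random stand-in posteriors and assuming (as is common, though repository-specific) that blank is the last class index:

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical CTC posteriors for one utterance: [T frames, num_classes]
T, num_classes = 200, 30
probs = np.random.dirichlet(np.ones(num_classes), size=T)

plt.figure(figsize=(14, 7))
for c in range(num_classes - 1):                 # non-blank classes
    plt.plot(probs[:, c], linewidth=0.8)
plt.plot(probs[:, -1], ':', label='blank')       # assumed last index
plt.xlabel('Frames')
plt.ylabel('Posterior probability')
plt.legend()
plt.savefig('ctc_probs_example.png')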
def do_save(model, params, epoch, eval_batch_size):
    """Save the CTC outputs.
    Args:
        model: the model to restore
        params (dict): a dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    # Load dataset
    train_data = Dataset(
        data_type='train',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0],
            softmax_temperature=params['softmax_temperature'])
        posteriors_op = model.posteriors(logits, blank_prior=1)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            # Use the last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        for data, is_new_epoch in train_data:
            # Create feed dictionary for the next mini-batch
            inputs, _, inputs_seq_len, input_names = data
            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_input_pl_list[0]: 1.0,
                model.keep_prob_hidden_pl_list[0]: 1.0,
                model.keep_prob_output_pl_list[0]: 1.0
            }

            batch_size, max_frame_num = inputs[0].shape[:2]
            posteriors = sess.run(posteriors_op, feed_dict=feed_dict)
            posteriors = posteriors.reshape(-1, max_frame_num,
                                            model.num_classes)

            for i_batch in range(batch_size):
                prob = posteriors[i_batch][:int(inputs_seq_len[0][i_batch]), :]

                # Save as a npy file
                np.save(mkdir_join(model.save_path, 'probs',
                                   input_names[0][i_batch]), prob)

            if is_new_epoch:
                break
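do_save feeds softmax_temperature into the graph: temperatures above 1 flatten the posterior distribution (often used to produce soft targets, e.g. for knowledge distillation), while temperatures below 1 sharpen it. A minimal numpy sketch of temperature-scaled softmax, independent of this repository's TensorFlow graph:

import numpy as np

def softmax_with_temperature(logits, temperature=1.0):
    """Temperature-scaled softmax over the last axis.

    temperature > 1 flattens the distribution; temperature < 1 sharpens it.
    """
    z = logits / temperature
    z -= np.max(z, axis=-1, keepdims=True)  # subtract max for numerical stability
    e = np.exp(z)
    return e / np.sum(e, axis=-1, keepdims=True)

# Example: [T, num_classes] logits for one utterance
logits = np.random.randn(5, 10)
probs = softmax_with_temperature(logits, temperature=2.0)
assert np.allclose(probs.sum(axis=-1), 1.0)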
def main(config_path, model_save_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Excluding the <SOS> and <EOS> classes
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError

    # Model setting
    model = AttentionSeq2Seq(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'],
        sigmoid_smoothing=params['sigmoid_smoothing'])

    # Set process name
    setproctitle('tf_csj_' + model.name + '_' +
                 params['train_data_size'] + '_' +
                 params['label_type'] + '_' + params['attention_type'])

    model.name = 'en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    if bool(params['sigmoid_smoothing']):
        model.name += '_smoothing'
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'attention',
        params['label_type'], params['train_data_size'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the previous model has finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the previous model has not finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params, gpu_indices=gpu_indices)
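The "Reset model directory" loop above implements a simple directory-versioning scheme: a candidate directory is considered taken if it holds either a finished run (complete.txt) or an in-progress one (config.yml), and the suffix _1, _2, ... is incremented until a free path is found. The same logic as a standalone helper; the function name is hypothetical, introduced here only for illustration:

from os.path import isfile, join

def next_free_model_dir(base_path):
    """Return base_path, or base_path + '_1', '_2', ..., skipping any
    candidate that already holds a finished run (complete.txt) or an
    in-progress one (config.yml)."""
    index = 0
    candidate = base_path
    while isfile(join(candidate, 'complete.txt')) or \
            isfile(join(candidate, 'config.yml')):
        index += 1
        candidate = base_path + '_' + str(index)
    return candidate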
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Excluding the blank class
    if params['label_type_main'] == 'character':
        params['num_classes_main'] = 28
    elif params['label_type_main'] == 'character_capital_divide':
        params['num_classes_main'] = 72

    if params['label_type_sub'] == 'phone61':
        params['num_classes_sub'] = 61
    elif params['label_type_sub'] == 'phone48':
        params['num_classes_sub'] = 48
    elif params['label_type_sub'] == 'phone39':
        params['num_classes_sub'] = 39

    # Model setting
    model = Multitask_CTC(
        encoder_type=params['encoder_type'],
        input_size=params['input_size'] * params['num_stack'],
        num_units=params['num_units'],
        num_layers_main=params['num_layers_main'],
        num_layers_sub=params['num_layers_sub'],
        num_classes_main=params['num_classes_main'],
        num_classes_sub=params['num_classes_sub'],
        main_task_weight=params['main_task_weight'],
        lstm_impl=params['lstm_impl'],
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad=params['clip_grad'],
        clip_activation=params['clip_activation'],
        num_proj=params['num_proj'],
        weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('timit_' + model.name + '_' + params['label_type_main'] +
                 '_' + params['label_type_sub'])

    model.name += '_' + str(params['num_units'])
    model.name += '_main' + str(params['num_layers_main'])
    model.name += '_sub' + str(params['num_layers_sub'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout_input'] != 1:
        model.name += '_dropi' + str(params['dropout_input'])
    if params['dropout_hidden'] != 1:
        model.name += '_droph' + str(params['dropout_hidden'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    model.name += '_main' + str(params['main_task_weight'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc',
        'char_' + params['label_type_sub'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the previous model has finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the previous model has not finished yet
            # tf.gfile.DeleteRecursively(new_model_path)
            # tf.gfile.MakeDirs(new_model_path)
            # break
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
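Multitask_CTC takes a main_task_weight that balances the character-level (main) and phone-level (sub) CTC objectives. A common convention, shown below as a hedged sketch, is to interpolate the two losses; the exact formula inside Multitask_CTC is repository-specific and may differ:

# Hedged sketch of a common multitask weighting convention; the helper name
# combined_loss is illustrative, not part of this repository.
def combined_loss(loss_main, loss_sub, main_task_weight):
    # Interpolate the main and sub objectives with a single scalar weight
    return main_task_weight * loss_main + (1 - main_task_weight) * loss_sub

print(combined_loss(loss_main=2.4, loss_sub=1.1, main_task_weight=0.8))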
def main(config_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        corpus = config['corpus']
        feature = config['feature']
        param = config['param']

    if corpus['label_type'] == 'phone61':
        output_size = 61
    elif corpus['label_type'] == 'phone48':
        output_size = 48
    elif corpus['label_type'] == 'phone39':
        output_size = 39
    elif corpus['label_type'] == 'character':
        output_size = 30

    # Model setting
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(
        batch_size=param['batch_size'],
        input_size=feature['input_size'] * feature['num_stack'],
        num_unit=param['num_unit'],
        num_layer=param['num_layer'],
        output_size=output_size,
        parameter_init=param['weight_init'],
        clip_grad=param['clip_grad'],
        clip_activation=param['clip_activation'],
        dropout_ratio_input=param['dropout_input'],
        dropout_ratio_hidden=param['dropout_hidden'],
        num_proj=param['num_proj'],
        weight_decay=param['weight_decay'])

    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_unit'])
    network.model_name += '_' + str(param['num_layer'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['num_proj'] != 0:
        network.model_name += '_proj' + str(param['num_proj'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])
    network.model_name += '_' + str(len(gpu_indices)) + 'gpu'

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/timit/ctc/')
    network.model_dir = mkdir_join(network.model_dir, corpus['label_type'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory
    if not isfile(join(network.model_dir, 'complete.txt')):
        tf.gfile.DeleteRecursively(network.model_dir)
        tf.gfile.MakeDirs(network.model_dir)
    else:
        raise ValueError('File exists.')

    # Set process name
    setproctitle('multigpu_ctc_timit_' + corpus['label_type'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    sys.stdout = open(join(network.model_dir, 'train.log'), 'w')
    print(network.model_name)
    do_train(network=network,
             optimizer=param['optimizer'],
             learning_rate=param['learning_rate'],
             batch_size=param['batch_size'],
             epoch_num=param['num_epoch'],
             label_type=corpus['label_type'],
             num_stack=feature['num_stack'],
             num_skip=feature['num_skip'],
             gpu_indices=gpu_indices)

    sys.stdout = sys.__stdout__
def read_audio(data_type, audio_paths, spk2gender, tool, config, normalize,
               save_path, global_mean_male=None, global_std_male=None,
               global_mean_female=None, global_std_female=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        data_type (string): train_si84 or train_si284 or test_dev93 or test_eval92
        audio_paths (list): paths to audio files
        spk2gender (dict): key => speaker, value => gender
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => normalization will not be conducted
            global => normalize input features by global mean & stddev over
                the training set per gender
            speaker => normalize input features by mean & stddev per speaker
            utterance => normalize input features by mean & stddev per utterance
        save_path (string): path to save npy files
        global_mean_male (np.ndarray, optional): global mean of male over the training set
        global_std_male (np.ndarray, optional): global standard deviation of male over the training set
        global_mean_female (np.ndarray, optional): global mean of female over the training set
        global_std_female (np.ndarray, optional): global standard deviation of female over the training set
        dtype (optional): the type of data, default is np.float32
    """
    if 'train' not in data_type:
        if global_mean_male is None or global_mean_female is None:
            raise ValueError('Set mean & stddev computed over the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise TypeError(
            'tool must be "htk" or "python_speech_features" or "librosa".')

    audio_paths_male, audio_paths_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}

    # Loop 1: Compute global mean and statistics
    if 'train' in data_type and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = audio_path.split('/')[-2]
            utt_idx = basename(audio_path).split('.')[0]
            gender = spk2gender[speaker]

            if tool == 'htk':
                feat_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                feat_utt = w2f_psf(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])
            elif tool == 'librosa':
                feat_utt = w2f_librosa(audio_path,
                                       feature_type=config['feature_type'],
                                       feature_dim=config['channels'],
                                       use_energy=config['energy'],
                                       use_delta1=config['delta'],
                                       use_delta2=config['deltadelta'],
                                       window=config['window'],
                                       slide=config['slide'])

            frame_num, feat_dim = feat_utt.shape
            feat_utt_sum = np.sum(feat_utt, axis=0)

            if i == 0:
                # Initialize global statistics
                global_mean_male = np.zeros((feat_dim,), dtype=dtype)
                global_mean_female = np.zeros((feat_dim,), dtype=dtype)
                global_std_male = np.zeros((feat_dim,), dtype=dtype)
                global_std_female = np.zeros((feat_dim,), dtype=dtype)

            # For computing global mean
            if gender == 'm':
                audio_paths_male.append(audio_path)
                global_mean_male += feat_utt_sum
                total_frame_num_male += frame_num
            elif gender == 'f':
                audio_paths_female.append(audio_path)
                global_mean_female += feat_utt_sum
                total_frame_num_female += frame_num
            else:
                raise ValueError('gender must be m or f.')

            # For computing speaker mean & stddev
            if normalize == 'speaker':
                # Initialize speaker statistics
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    speaker_mean_dict[speaker] = np.zeros(
                        (feat_dim,), dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros(
                        (feat_dim,), dtype=dtype)
                total_frame_num_dict[speaker] += frame_num
                speaker_mean_dict[speaker] += feat_utt_sum

        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        # Compute speaker mean
        if normalize == 'speaker':
            for speaker in speaker_mean_dict.keys():
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

        for audio_path in tqdm(audio_paths):
            speaker = audio_path.split('/')[-2]
            utt_idx = basename(audio_path).split('.')[0]
            gender = spk2gender[speaker]

            if tool == 'htk':
                feat_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                feat_utt = w2f_psf(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])
            elif tool == 'librosa':
                feat_utt = w2f_librosa(audio_path,
                                       feature_type=config['feature_type'],
                                       feature_dim=config['channels'],
                                       use_energy=config['energy'],
                                       use_delta1=config['delta'],
                                       use_delta2=config['deltadelta'],
                                       window=config['window'],
                                       slide=config['slide'])

            # For computing global stddev
            if gender == 'm':
                global_std_male += np.sum(
                    np.abs(feat_utt - global_mean_male) ** 2, axis=0)
            elif gender == 'f':
                global_std_female += np.sum(
                    np.abs(feat_utt - global_mean_female) ** 2, axis=0)
            else:
                raise ValueError('gender must be m or f.')

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(feat_utt - speaker_mean_dict[speaker]) ** 2, axis=0)

        # Compute speaker stddev
        if normalize == 'speaker':
            for speaker in speaker_std_dict.keys():
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] / (total_frame_num_dict[speaker] - 1))

        # Compute global stddev per gender
        global_std_male = np.sqrt(
            global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(
            global_std_female / (total_frame_num_female - 1))

        # Save global mean & stddev per gender
        np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
        np.save(join(save_path, 'global_mean_female.npy'), global_mean_female)
        np.save(join(save_path, 'global_std_male.npy'), global_std_male)
        np.save(join(save_path, 'global_std_female.npy'), global_std_female)

    # Loop 2: Normalization and saving
    print('=====> Normalization...')
    frame_num_dict = {}
    # sampPeriod, parmKind = None, None
    for audio_path in tqdm(audio_paths):
        speaker = audio_path.split('/')[-2]
        utt_idx = basename(audio_path).split('.')[0]
        gender = spk2gender[speaker]

        if tool == 'htk':
            feat_utt, sampPeriod, parmKind = read(audio_path)
        elif tool == 'python_speech_features':
            feat_utt = w2f_psf(audio_path,
                               feature_type=config['feature_type'],
                               feature_dim=config['channels'],
                               use_energy=config['energy'],
                               use_delta1=config['delta'],
                               use_delta2=config['deltadelta'],
                               window=config['window'],
                               slide=config['slide'])
        elif tool == 'librosa':
            feat_utt = w2f_librosa(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])

        if normalize == 'no':
            pass
        elif normalize == 'global' or 'train' not in data_type:
            # Normalize by mean & stddev over the training set per gender
            if gender == 'm':
                feat_utt -= global_mean_male
                feat_utt /= global_std_male
            elif gender == 'f':
                feat_utt -= global_mean_female
                feat_utt /= global_std_female
            else:
                raise ValueError('gender must be m or f.')
        elif normalize == 'speaker':
            # Normalize by mean & stddev per speaker
            feat_utt = (feat_utt - speaker_mean_dict[speaker]) / \
                speaker_std_dict[speaker]
        elif normalize == 'utterance':
            # Normalize by mean & stddev per utterance
            utt_mean = np.mean(feat_utt, axis=0, dtype=dtype)
            utt_std = np.std(feat_utt, axis=0, dtype=dtype)
            feat_utt = (feat_utt - utt_mean) / utt_std

        frame_num_dict[utt_idx] = feat_utt.shape[0]

        # Save input features
        np.save(mkdir_join(save_path, speaker, utt_idx + '.npy'), feat_utt)

    # Save the frame number dictionary
    with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
        pickle.dump(frame_num_dict, f)
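All four normalization branches above reduce to the same affine transform, (feat - mean) / std, differing only in where the statistics come from. A minimal numpy sketch consolidating them; the helper name normalize_feat is hypothetical, introduced here only to make the shared structure explicit:

import numpy as np

def normalize_feat(feat_utt, mode, mean=None, std=None, dtype=np.float32):
    """Consolidated view of the normalization branches above.

    mode='global'/'speaker' expects precomputed mean/std arrays;
    mode='utterance' computes them from the utterance itself;
    mode='no' returns the features unchanged.
    """
    if mode == 'no':
        return feat_utt
    if mode == 'utterance':
        mean = np.mean(feat_utt, axis=0, dtype=dtype)
        std = np.std(feat_utt, axis=0, dtype=dtype)
    return (feat_utt - mean) / std

feat = np.random.randn(120, 40).astype(np.float32)  # dummy [T, feat_dim]
out = normalize_feat(feat, mode='utterance')
assert np.allclose(out.mean(axis=0), 0.0, atol=1e-4)  # zero-mean per dimension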