# NOTE(review): this chunk begins mid-statement — `feat_len_kv=list_klen)`
# closes a call whose opening lines are not visible here.
    feat_len_kv=list_klen)
# Number of consecutive source frames grouped into one model input frame.
group = model.in_size // data_iterator_source.get_feat_dim()
# Sort utterance indices by feature length, longest first, so each
# minibucket holds similarly-sized utterances (less padding waste).
sorted_data_idx = np.argsort(
    data_iterator_infer.get_feat_length())[::-1].tolist()
data_rr = iter_minibucket(sorted_data_idx, opts['chunk'],
                          shuffle=False, excludes=[])
list_saved = []
for rr in tqdm(list(data_rr), ascii=True, ncols=50):
    curr_feat_list = data_iterator_infer.get_feat_by_index(rr)
    # Pad & batch variable-length features; no extra silence at either end.
    feat_mat, feat_len = batch_speech(opts['gpu'], curr_feat_list,
                                      feat_sil=feat_sil, group=group,
                                      start_sil=0, end_sil=0)
    # Invert mel-scale features back to linear spectrogram with the model.
    pred_out = invert_mel_to_linear(model, Variable(feat_mat), feat_len,
                                    group=group)
    # convert to cpu & numpy #
    pred_out = pred_out.data.cpu().numpy()
    # Strip batch padding: keep only the valid frames of each utterance.
    pred_out = [pred_out[ii, 0:feat_len[ii]] for ii in range(len(rr))]
    # save results
    for ii in range(len(rr)):
        _info = {}
        key_ii = data_iterator_infer.get_key_by_index(rr[ii])
        path_feat_ii = os.path.join(tmpdir, '{}.npz'.format(key_ii))
        feat_ii = pred_out[ii][0:feat_len[ii]]
        # NOTE(review): chunk ends here; the save of `feat_ii` to
        # `path_feat_ii` presumably follows outside this view.
# One epoch over all three splits; `set_train_mode` is True only for the
# train split (passed further down to the batch function).
for set_name, set_rr, set_train_mode in [('train', train_rr, True),
                                         ('dev', dev_rr, False),
                                         ('test', test_rr, False)]:
    for rr in tqdm_wrapper(set_rr):
        tic = timeit.default_timer()
        # Fetch paired input/output features for this minibatch by key.
        curr_key_list = feat_in_iterator[set_name].get_key_by_index(rr)
        curr_feat_in_list = feat_in_iterator[set_name].get_feat_by_key(
            curr_key_list)
        curr_feat_out_list = feat_out_iterator[
            set_name].get_feat_by_key(curr_key_list)
        # print(1, timeit.default_timer() - tic); tic = timeit.default_timer()
        # Pad & batch both sides; trailing silence padding of opts['pad_sil']
        # frames, none at the start.
        feat_in_mat, feat_in_len = batch_speech(
            opts['gpu'], curr_feat_in_list, feat_sil=feat_in_sil,
            group=group, start_sil=0, end_sil=opts['pad_sil'])
        feat_out_mat, feat_out_len = batch_speech(
            opts['gpu'], curr_feat_out_list, feat_sil=feat_out_sil,
            group=group, start_sil=0, end_sil=opts['pad_sil'])
        # print(2, timeit.default_timer() - tic); tic = timeit.default_timer()
        # NOTE(review): chunk is truncated here — the fn_batch(...) call
        # continues past the end of this view.
        _tmp_loss = fn_batch(feat_in_mat, feat_in_len, feat_out_mat, feat_out_len,
# Decode loop: 'pred' mode runs free decoding (greedy or beam search),
# 'tf' mode runs teacher forcing against the reference text.
for rr in tqdm(list(data_rr), ascii=True, ncols=50):
    curr_key_list = [list_key[rrii] for rrii in rr]
    curr_feat_list = [list_feat[rrii]() for rrii in rr]  # lazy load call
    if args.mode == 'pred':
        pass
    elif args.mode == 'tf':
        # Teacher forcing needs the reference transcription batched as well.
        curr_text_list = text_iterator.get_text_by_key(curr_key_list)
        text_mat, text_len = batch_text(args.gpu, curr_text_list)
    else:
        raise ValueError
    # Features loaded as multiple segments (list/tuple) are stacked into
    # one matrix per utterance.
    curr_feat_list = [
        np.hstack(x) if isinstance(x, (list, tuple)) else x
        for x in curr_feat_list
    ]
    feat_mat, feat_len = batch_speech(args.gpu, curr_feat_list,
                                      feat_sil=None)
    if args.mode == 'pred':
        if args.search == 'greedy':
            curr_best_hypothesis, _, curr_best_att = greedy_search(
                model, feat_mat, feat_len, map_text2idx, args.max_target)
        elif args.search == 'beam':
            assert args.kbeam is not None, "kbeam must be specified"
            curr_best_hypothesis, _, curr_best_att = beam_search(
                model, feat_mat, feat_len, map_text2idx, args.max_target,
                args.kbeam, args.coeff_lp)
        else:
            raise ValueError('search method is not defined')
    elif args.mode == 'tf':
        # NOTE(review): chunk is truncated here — the teacher_forcing(...)
        # call continues past the end of this view.
        _, _, curr_best_att = teacher_forcing(model,
def iter_cycle_asr2tts(set_name, set_train_mode, set_rr):
    """One speech -> text -> speech cycle step (unpaired data).

    Decodes a batch of speech with the ASR model to obtain text
    hypotheses, filters out hypotheses that fail the quality check, then
    runs the TTS batch function to reconstruct the original speech from
    the surviving generated text, accumulating the TTS metrics.

    Args:
        set_name: split name used to index the iterators and metric dicts.
        set_train_mode: passed as ``train_step`` to ``fn_batch_tts`` —
            presumably True means the TTS weights are updated.
        set_rr: list of utterance indices forming this minibatch.

    Returns:
        None. Metrics are accumulated into the enclosing-scope dicts
        (m_tts_loss, m_tts_acc, ...); returns early when no generated
        text passes the quality filter.
    """
    rr = set_rr
    # Sort indices by descending feature length (longest first).
    rr = sort_reverse(rr, feat_len[set_name])
    rr_key = feat_iterator[set_name].get_key_by_index(rr)
    curr_feat_list = feat_iterator[set_name].get_feat_by_key(rr_key)
    # Batch for TTS: grouped frames plus 1 leading and tts_pad_sil
    # trailing silence frames.
    curr_feat_mat, curr_feat_len = batch_speech(
        opts['gpu'], curr_feat_list, feat_sil=feat_sil,
        group=opts['tts_group'], start_sil=1, end_sil=opts['tts_pad_sil'])
    # modified feature for ASR #
    # Drop the added start/end silence and un-group frames so the ASR model
    # sees the raw per-frame sequence of width NDIM_FEAT.
    curr_feat_mat_for_asr = curr_feat_mat[:, 1:-opts[
        'tts_pad_sil']].contiguous().view(len(set_rr), -1, NDIM_FEAT)
    curr_feat_len_for_asr = [len(x) for x in curr_feat_list]
    if opts['asr_gen_search']['type'] == 'greedy':
        curr_pred_text_list, curr_pred_text_len, curr_pred_att_mat = generator_text.greedy_search(
            model_asr,
            curr_feat_mat_for_asr,
            curr_feat_len_for_asr,
            map_text2idx=map_text2idx,
            max_target=opts['asr_gen_cutoff'])
    elif opts['asr_gen_search']['type'] == 'beam':
        # Beam search is decoded in smaller chunks of the batch —
        # presumably to bound memory usage.
        curr_pred_text_list, curr_pred_text_len = [], []
        for ii in range(0, len(rr), opts['asr_gen_search']['chunk']):
            _start_ii = ii
            _end_ii = min(ii + opts['asr_gen_search']['chunk'], len(rr))
            curr_pred_text_list_ii, curr_pred_text_len_ii, _ = generator_text.beam_search(
                model_asr,
                curr_feat_mat_for_asr[_start_ii:_end_ii],
                curr_feat_len_for_asr[_start_ii:_end_ii],
                map_text2idx=map_text2idx,
                max_target=opts['asr_gen_cutoff'],
                kbeam=opts['asr_gen_search']['kbeam'])
            curr_pred_text_list.extend(curr_pred_text_list_ii)
            curr_pred_text_len.extend(curr_pred_text_len_ii)
    if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
        curr_spkvec_list = feat_spkvec_iterator.get_feat_by_key(rr_key)
    # TODO: filter bad text #
    # Quality check is based on hypothesis lengths only (other args None).
    curr_pred_quality = generator_text.eval_gen_text_quality(
        None, curr_pred_text_len, None)
    curr_pred_valid_idx = [
        x for x, y in enumerate(curr_pred_quality) if y == 1
    ]
    m_asr_gen_info['total'] += len(rr)
    m_asr_gen_info['valid'] += len(curr_pred_valid_idx)
    if len(curr_pred_valid_idx) == 0:
        return None
    # Keep only samples whose generated text passed the filter — all
    # parallel structures must be subset identically.
    curr_pred_text_list = batch_select(curr_pred_text_list,
                                       curr_pred_valid_idx)
    if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
        curr_spkvec_list = batch_select(curr_spkvec_list,
                                        curr_pred_valid_idx)
    curr_pred_text_len = batch_select(curr_pred_text_len,
                                      curr_pred_valid_idx)
    curr_feat_mat = batch_select(curr_feat_mat, curr_pred_valid_idx)
    curr_feat_len = batch_select(curr_feat_len, curr_pred_valid_idx)
    # zip & sort dec #
    # Re-sort every parallel structure by the generated text lengths; the
    # sort key itself is sorted last so the others still see the original
    # ordering while being permuted.
    curr_pred_text_list = batch_sorter(curr_pred_text_list,
                                       curr_pred_text_len)
    if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
        curr_spkvec_list = batch_sorter(curr_spkvec_list,
                                        curr_pred_text_len)
    curr_feat_mat = batch_sorter(curr_feat_mat, curr_pred_text_len)
    curr_feat_len = batch_sorter(curr_feat_len, curr_pred_text_len)
    curr_pred_text_len = batch_sorter(
        curr_pred_text_len,
        curr_pred_text_len)  # sort key must be on the last step
    curr_pred_text_mat, curr_pred_text_len = batch_text(
        opts['gpu'],
        curr_pred_text_list,
    )
    if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
        curr_aux_info = {'speaker_vector': curr_spkvec_list}
    else:
        curr_aux_info = None
    _loss, _loss_feat, _loss_bce_fend, _loss_spk_emb, _acc_fend = fn_batch_tts(
        model_tts,
        curr_pred_text_mat,
        curr_pred_text_len,
        curr_feat_mat,
        curr_feat_len,
        aux_info=curr_aux_info,
        train_step=set_train_mode,
        coeff_loss=opts['coeff_unpair'])
    # Undo the unpaired-loss scaling so the logged value is comparable.
    _loss /= opts['coeff_unpair']
    assert_nan(_loss)
    _count = len(curr_pred_text_list)
    m_tts_loss[set_name] += _loss * _count
    m_tts_loss_feat[set_name] += _loss_feat * _count
    m_tts_loss_bce[set_name] += _loss_bce_fend * _count
    m_tts_loss_spk_emb[set_name] += _loss_spk_emb * _count
    m_tts_acc[set_name] += _acc_fend * _count
    m_tts_count[set_name] += _count
    if tf_writer is not None:
        auto_writer_info_tts(set_name, _loss, _loss_feat, _loss_bce_fend,
                             _loss_spk_emb, _acc_fend)
def iter_cycle_tts2asr(set_name, set_train_mode, set_rr):
    """One text -> speech -> text cycle step (unpaired data).

    Synthesizes speech from a batch of text with the TTS model (greedy
    decoding), filters out generations that fail the quality check, then
    runs the ASR batch function on the surviving synthetic speech against
    the original text, accumulating the ASR metrics.

    Args:
        set_name: split name used to index the iterators and metric dicts.
        set_train_mode: passed as ``train_step`` to ``fn_batch_asr`` —
            presumably True means the ASR weights are updated.
        set_rr: list of utterance indices forming this minibatch.

    Returns:
        None. Metrics are accumulated into the enclosing-scope dicts
        (m_asr_loss, m_asr_acc, m_asr_count); returns early when no
        generated speech passes the quality filter.
    """
    rr = set_rr
    # Sort indices by descending text length (longest first).
    rr = sort_reverse(rr, text_len[set_name])
    rr_key = text_iterator[set_name].get_key_by_index(rr)
    curr_text_list = text_iterator[set_name].get_text_by_key(rr_key)
    if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
        # Choose the speaker vector: the utterance's own speaker, or one
        # sampled uniformly from the split's keys.
        if opts['tts_spk_sample'] is None:
            curr_spkvec_list = feat_spkvec_iterator.get_feat_by_key(
                rr_key)
        elif opts['tts_spk_sample'] == 'uniform':
            _sample_rr_key = random.sample(feat_iterator[set_name].key,
                                           k=len(set_rr))
            curr_spkvec_list = feat_spkvec_iterator.get_feat_by_key(
                _sample_rr_key)
        else:
            raise NotImplementedError()
        curr_aux_info = {'speaker_vector': curr_spkvec_list}
    else:
        curr_aux_info = None
    curr_text_mat, curr_text_len = batch_text(opts['gpu'], curr_text_list)
    # Greedy TTS decoding; max decode steps are in grouped-frame units.
    curr_pred_feat_list, curr_pred_feat_len, curr_pred_att_mat = generator_speech.decode_greedy_pred(
        model_tts,
        curr_text_mat,
        curr_text_len,
        group=opts['tts_group'],
        feat_sil=feat_sil,
        aux_info=curr_aux_info,
        max_target=opts['tts_gen_cutoff'] // opts['tts_group'])
    # filter bad speech #
    # NOTE(review): reuses the text quality checker on feature lengths —
    # the check appears to be length-based only.
    curr_pred_quality = generator_text.eval_gen_text_quality(
        None, curr_pred_feat_len, None)
    curr_pred_valid_idx = [
        x for x, y in enumerate(curr_pred_quality) if y == 1
    ]
    m_tts_gen_info['total'] += len(rr)
    m_tts_gen_info['valid'] += len(curr_pred_valid_idx)
    if len(curr_pred_valid_idx) == 0:
        return None
    # Keep only samples whose generated speech passed the filter — all
    # parallel structures must be subset identically.
    curr_pred_feat_list = batch_select(curr_pred_feat_list,
                                       curr_pred_valid_idx)
    curr_pred_feat_len = batch_select(curr_pred_feat_len,
                                      curr_pred_valid_idx)
    curr_text_mat = batch_select(curr_text_mat, curr_pred_valid_idx)
    curr_text_len = batch_select(curr_text_len, curr_pred_valid_idx)
    # zip & sort dec #
    # Re-sort every parallel structure by the generated feature lengths;
    # the sort key itself is sorted last.
    curr_pred_feat_list = batch_sorter(curr_pred_feat_list,
                                       curr_pred_feat_len)
    curr_text_mat = batch_sorter(curr_text_mat, curr_pred_feat_len)
    curr_text_len = batch_sorter(curr_text_len, curr_pred_feat_len)
    curr_pred_feat_len = batch_sorter(
        curr_pred_feat_len,
        curr_pred_feat_len)  # sort key must be on the last step
    curr_pred_feat_mat, curr_pred_feat_len = batch_speech(
        opts['gpu'], curr_pred_feat_list)
    # if sorted(curr_pred_feat_len, reverse=True) != curr_pred_feat_len :
    #     import ipdb; ipdb.set_trace()
    _loss, _acc = fn_batch_asr(model_asr,
                               curr_pred_feat_mat,
                               curr_pred_feat_len,
                               curr_text_mat,
                               curr_text_len,
                               train_step=set_train_mode,
                               coeff_loss=opts['coeff_unpair'])
    # Undo the unpaired-loss scaling so the logged value is comparable.
    _loss /= opts['coeff_unpair']
    assert_nan(_loss)
    _count = len(rr)
    m_asr_loss[set_name] += _loss * _count
    m_asr_acc[set_name] += _acc * _count
    m_asr_count[set_name] += _count
    if tf_writer is not None:
        auto_writer_info_asr(set_name, _loss, _acc)
# Extract a speaker embedding per utterance, batch by batch, collecting
# embeddings, numeric speaker ids, and utterance keys in parallel lists.
all_embed = []
all_speaker_id = []
all_key = []
for rr in tqdm(list(
        iter_minibucket(sorted_feat_idx, args.batchsize, shuffle=False)),
               ncols=60):
    curr_feat_list = feat_iterator.get_feat_by_index(rr)
    curr_key_list = feat_iterator.get_key_by_index(rr)
    # key -> speaker name -> integer speaker id.
    curr_speaker_list = [map_key2spk[x] for x in curr_key_list]
    curr_speaker_list_id = [map_spk2id[x] for x in curr_speaker_list]
    all_speaker_id.extend(curr_speaker_list_id)
    all_key.extend(curr_key_list)
    # Batch with one silence frame at each end, no frame grouping.
    feat_mat, feat_len = batch_speech(args.gpu, curr_feat_list,
                                      feat_sil=feat_sil, group=1,
                                      start_sil=1, end_sil=1)
    res_embed = model(Variable(feat_mat), feat_len)
    res_embed = res_embed.cpu().data.numpy()
    all_embed.append(res_embed)
    pass
all_embed = np.concatenate(all_embed).astype(np.float64)
all_speaker_id = np.array(all_speaker_id, dtype=np.int32)
# save speaker vector
if args.path is None:
    # No output path given — fall back to a fresh temporary directory.
    args.path = tempfile.mkdtemp()
    print('Create temporary dir: {}'.format(args.path), file=sys.stderr)