def decode_greedy_tf_torch(model, text_mat, text_len, feat_mat, feat_len,
                           group, feat_sil, max_target=1000, aux_info=None):
    """
    Decode with teacher forcing, using the ground-truth feature as the decoder input.
    """
    assert isinstance(model, AVAILABLE_MODEL), "model is not supported"
    if not isinstance(text_mat, Variable):
        text_mat = Variable(text_mat)
    batch = text_mat.size(0)
    model.reset()
    model.eval()
    model.encode(text_mat, text_len)

    if aux_info is not None:
        if isinstance(aux_info['speaker_vector'], list):
            aux_info['speaker_vector'] = Variable(
                tensorauto(model, torch.from_numpy(
                    np.stack(aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)

    feats_core = []
    feats_att = []

    # silence frame repeated per group, expanded across the batch #
    feat_sil = np.tile(feat_sil, group).astype('float32')
    feat_sil = tensorauto(
        model,
        torch.from_numpy(feat_sil).unsqueeze(0).expand(batch, feat_sil.shape[0]))
    feat_sil_var = Variable(feat_sil)

    # shift input by one frame for teacher forcing #
    feat_mat_input = feat_mat[:, 0:-1]
    feat_mask = Variable(generate_seq_mask([x - 1 for x in feat_len], model))
    dec_len = feat_mat_input.size(1)
    for ii in range(dec_len):
        curr_feat, curr_decatt_res, curr_bern_end = model.decode(
            feat_mat_input[:, ii], feat_mask[:, ii])
        feats_core.append(curr_feat)
        feats_att.append(curr_decatt_res['att_output']['p_ctx'])

    feats_core = torch.stack(feats_core, dim=1)
    feats_core = feats_core * feat_mask.unsqueeze(-1)
    # ungroup: (batch x dec_len x group*ndim) -> (batch x dec_len*group x ndim) #
    feats_core = feats_core.view(batch, feats_core.shape[1] * group, -1)
    feats_att = torch.stack(feats_att, dim=1)
    return feats_core, feat_len, feats_att
def criterion_diag_att(att_mat, dec_len, enc_len, std_dev=0.2, size_average=True):
    """
    Guided (diagonal) attention penalty: penalize attention mass that falls far
    from the diagonal between the decoder and encoder time axes.
    """
    if opts['loss_diag_att_cfg'] is None:
        return 0
    batch, max_dec_len, max_enc_len = att_mat.size()
    loss = 0
    for bb in range(batch):
        range_dec_len = torch.arange(0, dec_len[bb])
        range_enc_len = torch.arange(0, enc_len[bb])
        # weight matrix: 1 - exp(-((t/T - n/N)^2) / (2 * std_dev^2)) #
        inv_normal_diag_mat = 1.0 - torch.exp(
            -((range_dec_len / dec_len[bb])[:, None]
              - (range_enc_len / enc_len[bb])[None, :]) ** 2
            / (2 * std_dev ** 2))
        inv_normal_diag_mat = Variable(
            tensorauto(model, inv_normal_diag_mat))  # move to the model's device
        loss_bb = (att_mat[bb, 0:dec_len[bb], 0:enc_len[bb]]
                   * inv_normal_diag_mat).sum()
        loss += loss_bb
    if size_average:
        loss /= batch
    return loss * scheduler_decay_diag_att.value
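# Illustration only (not part of the original code): the weight used above is, for
# decoder step t of T and encoder step n of N,
#     w[t, n] = 1 - exp(-((t/T - n/N)^2) / (2 * std_dev^2)),
# so attention probability far from the diagonal is penalized. A minimal NumPy
# sketch of the same weight matrix, assuming the same std_dev default:
def _guided_attention_weight(dec_len, enc_len, std_dev=0.2):
    import numpy as np  # local import so this sketch is self-contained
    t = np.arange(dec_len, dtype='float32') / dec_len   # relative decoder position
    n = np.arange(enc_len, dtype='float32') / enc_len   # relative encoder position
    return 1.0 - np.exp(-((t[:, None] - n[None, :]) ** 2) / (2 * std_dev ** 2))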
def batch_speech(device, feat_list, feat_sil=None, group=None, start_sil=0, end_sil=0):
    if group is not None:
        # group feat per `group` frames into 1 frame #
        feat_list = [group_feat_timestep(feat_ii, group) for feat_ii in feat_list]
        if feat_sil is not None:
            feat_sil = np.tile(feat_sil, group)
    feat_len = [len(x) + start_sil + end_sil for x in feat_list]
    batch = len(feat_list)
    max_feat_len = max(feat_len)
    ndim = feat_list[0].shape[-1]
    # pad every utterance with the silence frame (or zeros) up to max_feat_len #
    feat_mat = np.zeros((batch, max_feat_len, ndim), dtype='float32') + \
        (feat_sil if feat_sil is not None else 0)
    for ii in range(batch):
        feat_mat[ii, start_sil:start_sil + len(feat_list[ii])] = feat_list[ii]
    feat_mat = torch.from_numpy(feat_mat).float()
    feat_mat = tensorauto(device, feat_mat)
    return feat_mat, feat_len
def encode(self, input, src_len=None):
    """
    input   : (batch x max_src_len x in_size)
    src_len : list of source lengths, one per batch element
    """
    batch, max_src_len, in_size = input.size()
    if src_len is None:
        src_len = [max_src_len] * batch
    res = input.view(batch * max_src_len, in_size)

    # feed-forward layers #
    enc_fnn_act = getattr(F, self.enc_fnn_act)
    for ii in range(len(self.enc_fnn)):
        res = F.dropout(enc_fnn_act(self.enc_fnn[ii](res)),
                        self.enc_fnn_do[ii], self.training)
    # res = (batch * max_src_len) x ndim #
    res = res.view(batch, max_src_len, res.size(1)).transpose(1, 2).unsqueeze(3)
    # res = batch x ndim x src_len x 1 #

    # convolutional layers #
    enc_cnn_act = getattr(F, self.enc_cnn_act)
    for ii in range(len(self.enc_cnn)):
        if self.use_pad1[ii]:
            res = F.pad(res, (0, 0, 0, 1))
        res = self.enc_cnn[ii](res)
        res = enc_cnn_act(res)
        src_len = [x // self.enc_cnn_strides[ii] for x in src_len]
    res = res.squeeze(3).transpose(1, 2)  # batch x src_len x ndim #

    # add position embedding #
    _pos_arr = np.arange(0, res.size(1)).astype('float32')  # src_len #
    _pos_arr = np.repeat(_pos_arr[np.newaxis, :], batch, 0)  # batch x src_len #
    _pos_arr /= np.array(src_len)[:, np.newaxis]  # divide for relative position #
    _pos_arr = tensorauto(self, torch.from_numpy(_pos_arr))
    _pos_var = Variable(_pos_arr.view(batch * _pos_arr.size(1), 1))
    # TODO : absolute or relative position #
    res_pos = self.pos_emb(_pos_var)
    res_pos = res_pos.view(batch, _pos_arr.size(1), -1)
    ctx = res + res_pos  # TODO : sum or concat ? #

    # create mask if required #
    if src_len is not None:
        ctx_mask = torchauto(self).FloatTensor(batch, ctx.size(1)).zero_()
        for ii in range(batch):
            ctx_mask[ii, 0:src_len[ii]] = 1.0
        ctx_mask = Variable(ctx_mask)
    else:
        ctx_mask = None
    self.dec.set_ctx(ctx, ctx_mask)
def batch_text(device, text_list, add_bos=True, add_eos=True):
    """
    return text_mat, text_len
    """
    assert all(isinstance(x, list) for x in text_list)
    text_idx_list = text_list
    batch = len(text_list)
    if add_bos:
        text_idx_list = [[constant.BOS] + x for x in text_idx_list]
    if add_eos:
        text_idx_list = [x + [constant.EOS] for x in text_idx_list]
    text_len = [len(x) for x in text_idx_list]
    # pad every sequence with PAD up to the longest length in the batch #
    text_mat = np.full((batch, max(text_len)), constant.PAD, dtype='int64')
    for ii in range(batch):
        text_mat[ii, 0:text_len[ii]] = text_idx_list[ii]
    text_mat = tensorauto(device, torch.from_numpy(text_mat))
    return text_mat, text_len
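# Hypothetical usage sketch (not in the original source): building one padded batch
# with `batch_text` and `batch_speech`. The token ids, feature dimensionality, and
# the CPU device value (-1) are illustrative assumptions; `constant.BOS/EOS/PAD` and
# `tensorauto` come from the project's utility module.
def _example_prepare_batch():
    import numpy as np  # local import so this sketch is self-contained
    text_list = [[12, 5, 9], [7, 3]]  # token ids per utterance (assumed vocabulary)
    feat_list = [np.random.randn(50, 80).astype('float32'),
                 np.random.randn(34, 80).astype('float32')]  # mel frames per utterance
    feat_sil = np.zeros(80, dtype='float32')  # silence frame used as padding value
    text_mat, text_len = batch_text(-1, text_list)  # adds BOS/EOS, pads with PAD
    feat_mat, feat_len = batch_speech(-1, feat_list, feat_sil=feat_sil,
                                      start_sil=1, end_sil=2)
    return text_mat, text_len, feat_mat, feat_len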
def greedy_decoder_torch(model, feat_source_mat, feat_source_len, feat_sil):
    MAX_DURATION = 400  # hardcoded #
    EOS_THRESHOLD = 0.1
    batch = feat_source_mat.size(0)
    model.reset()
    model.eval()
    model.encode(feat_source_mat, feat_source_len)
    # `group` is resolved from the enclosing (module) scope #
    feat_sil = np.tile(feat_sil, group).astype('float32')
    feat_sil = tensorauto(
        model,
        torch.from_numpy(feat_sil).unsqueeze(0).repeat(batch, 1))
    feat_sil_var = Variable(feat_sil)
    prev_feat = feat_sil_var  # 1 dim #
    idx = 0
    feat_pred_len = [-1 for _ in range(batch)]
    feats_pred_core = []
    alignments = []
    while True:
        curr_feat, curr_decatt_res = model.decode(prev_feat)
        feats_pred_core.append(curr_feat)
        alignments.append(curr_decatt_res['att_output']['p_ctx'])
        prev_feat = curr_feat
        # check whether batch item bb is already finished #
        dist_to_sil = (torch.abs(curr_feat - feat_sil_var)).sum(1).data
        print(idx, dist_to_sil.tolist())
        for bb in range(batch):
            if feat_pred_len[bb] == -1 and dist_to_sil[bb] < EOS_THRESHOLD:
                feat_pred_len[bb] = idx
        idx += 1
        if idx > MAX_DURATION or all([x != -1 for x in feat_pred_len]):
            break
    feats_pred_core = torch.stack(feats_pred_core, dim=1)
    alignments = torch.stack(alignments, dim=1)
    return feats_pred_core, alignments, feat_pred_len
def fn_batch(text_mat, text_len, feat_mat, feat_len, aux_info=None, train_step=True):
    text_mat = Variable(text_mat)
    # shift input/output by one frame for teacher forcing #
    feat_mat_input = Variable(feat_mat[:, 0:-1])
    feat_mat_output = Variable(feat_mat[:, 1:])
    feat_mask = Variable(
        generate_seq_mask([x - 1 for x in feat_len], opts['gpu']))
    feat_label_end = Variable(
        1. - generate_seq_mask([x - 1 - opts['pad_sil'] for x in feat_len],
                               opts['gpu'], max_len=feat_mask.size(1)))
    model.reset()
    model.train(train_step)
    model.encode(text_mat, text_len)

    # additional input condition #
    if model.TYPE == TacotronType.MULTI_SPEAKER:
        aux_info['speaker_vector'] = Variable(
            tensorauto(opts['gpu'],
                       torch.from_numpy(
                           np.stack(aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)

    batch, dec_len, _ = feat_mat_input.size()
    list_dec_core = []
    list_dec_core_bernoulli_end = []
    list_dec_att = []
    for ii in range(dec_len):
        _dec_core_ii, _dec_att_ii, _dec_core_bernoulli_end = model.decode(
            feat_mat_input[:, ii],
            feat_mask[:, ii] if opts['mask_dec'] else None)
        list_dec_core.append(_dec_core_ii)
        list_dec_core_bernoulli_end.append(_dec_core_bernoulli_end)
        list_dec_att.append(_dec_att_ii['att_output']['p_ctx'])

    dec_core = torch.stack(list_dec_core, 1)
    dec_core_bernoulli_end = torch.cat(list_dec_core_bernoulli_end, 1)
    dec_att = torch.stack(list_dec_att, dim=1)

    # main : loss on mel spectrogram #
    loss_core = criterion(dec_core, feat_mat_output, feat_mask)
    # optional : aux loss for lower frequency #
    loss_core_freq = 1 * criterion_freq(dec_core, feat_mat_output, feat_mask)
    loss_feat = loss_core + loss_core_freq

    # main : frame ending prediction #
    loss_core_bernoulli_end = F.binary_cross_entropy_with_logits(
        dec_core_bernoulli_end, feat_label_end) * opts['coeff_bern']
    acc_core_bernoulli_end = ((dec_core_bernoulli_end > 0.0)
                              == (feat_label_end > 0.5)).float().mean()

    # optional : aux loss to encourage diagonal attention #
    loss_diag_att = 1 * criterion_diag_att(
        dec_att, dec_len=[x - 1 for x in feat_len], enc_len=text_len)

    # combine all losses #
    loss = loss_feat + loss_core_bernoulli_end + loss_diag_att

    if train_step:
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), opts['grad_clip'])
        opt.step()

    # write report #
    tf_writer.add_scalar('loss', loss.data[0], global_step=tf_writer._n_iter)
    tf_writer.add_scalar('loss_feat', loss_feat.data[0],
                         global_step=tf_writer._n_iter)
    tf_writer.add_scalar('loss_bern_end', loss_core_bernoulli_end.data[0],
                         global_step=tf_writer._n_iter)
    if opts['loss_diag_att_cfg'] is not None:
        tf_writer.add_scalar('loss_diag_att', loss_diag_att.data[0],
                             global_step=tf_writer._n_iter)
    tf_writer._n_iter += 1

    return loss.data.sum(), loss_feat.data.sum(), \
        loss_core_bernoulli_end.data.sum(), acc_core_bernoulli_end.data.sum()
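# `criterion` and `criterion_freq` above are defined elsewhere in the project. For
# reference, a minimal sketch of a masked frame-level regression loss with the same
# call signature, assuming an L1 distance averaged over valid frames (the actual
# loss used by the project may differ):
def _masked_l1_loss(pred, target, mask):
    # pred, target : (batch x dec_len x ndim)
    # mask         : (batch x dec_len), 1.0 for valid frames, 0.0 for padding
    diff = torch.abs(pred - target) * mask.unsqueeze(-1)
    return diff.sum() / mask.sum()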
def decode_greedy_pred_torch(model, text_mat, text_len, group, feat_sil,
                             max_target=1000, aux_info=None):
    """
    Decode free-running, feeding the model's own predicted feature back as input.
    """
    assert isinstance(model, AVAILABLE_MODEL), "model is not supported"
    if not isinstance(text_mat, Variable):
        text_mat = Variable(text_mat)
    batch = text_mat.size(0)
    model.reset()
    model.eval()
    model.encode(text_mat, text_len)

    if aux_info is not None:
        if isinstance(aux_info['speaker_vector'], list):
            aux_info['speaker_vector'] = Variable(
                tensorauto(model, torch.from_numpy(
                    np.stack(aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)

    feats_core = []
    feats_att = []
    feat_sil = np.tile(feat_sil, group).astype('float32')
    feat_sil = tensorauto(
        model,
        torch.from_numpy(feat_sil).unsqueeze(0).expand(batch, feat_sil.shape[0]))
    feat_sil_var = Variable(feat_sil)

    prev_feat = feat_sil_var  # first decoder input is the silence frame #
    idx = 0
    feat_len = [-1 for _ in range(batch)]
    while True:
        curr_feat, curr_decatt_res, curr_bern_end = model.decode(prev_feat)
        feats_core.append(curr_feat)
        feats_att.append(curr_decatt_res['att_output']['p_ctx'])
        idx += 1  # increase index #
        prev_feat = curr_feat
        # check whether batch item bb is already finished #
        curr_bern_end = curr_bern_end[:, 0].data
        dist_to_sil = (torch.abs(curr_feat - feat_sil_var)).sum(1).data
        for bb in range(batch):
            # the frame-end output is a logit (not a sigmoid probability) #
            if feat_len[bb] == -1 and curr_bern_end[bb] > 0.0:
                feat_len[bb] = idx
        if idx >= max_target or all([x != -1 for x in feat_len]):
            # too long, or all samples already stopped #
            break

    feats_core = torch.stack(feats_core, dim=1)
    # TODO : masking #
    # ungroup: (batch x dec_len x group*ndim) -> (batch x dec_len*group x ndim) #
    feats_core = feats_core.view(batch, feats_core.shape[1] * group, -1)
    feat_len = [x * group for x in feat_len]
    feats_att = torch.stack(feats_att, dim=1)
    return feats_core, feat_len, feats_att
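# Hypothetical end-to-end inference sketch (not in the original source): batch the
# text, run the free-running decoder, and trim each output to its predicted length.
# `model`, `feat_sil`, and `group` are assumed to be already loaded/configured, and
# the device value (-1 for CPU) passed to `batch_text` is an illustrative assumption.
def _example_synthesize(model, text_list, feat_sil, group=2):
    text_mat, text_len = batch_text(-1, text_list)
    feats_core, feat_len, feats_att = decode_greedy_pred_torch(
        model, text_mat, text_len, group, feat_sil, max_target=1000)
    # trim the padded predictions to each utterance's predicted length
    return [feats_core[bb, 0:feat_len[bb]] for bb in range(feats_core.size(0))]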
    print('[info] load pretrained model')
    # additional #
    if opts['result'].startswith('+'):
        opts['result'] = os.path.dirname(opts['model_pt']) + opts['result'][1:]
        print('[info] append pretrained folder name to result')
else:
    _model_cfg = opts['model_cfg']
    _model_cfg['enc_in_size'] = NDIM
    _model_cfg['dec_in_size'] = NVOCAB
    _model_cfg['dec_out_size'] = NVOCAB
    model = ModelSerializer.load_config(_model_cfg)

crit_weight = tensorauto(opts['gpu'], torch.ones(NVOCAB))
crit_weight[constant.PAD] = 0
crit_weight = Variable(crit_weight, requires_grad=False)
criterion = ElementwiseCrossEntropy(weight=crit_weight,
                                    label_smoothing=opts['lbl_smooth'])
if opts['gpu'] >= 0:
    model.cuda(opts['gpu'])

# setting optimizer #
opt = getattr(torch.optim, opts['opt'])(model.parameters(), lr=opts['lrate'])
scheduler = ReduceLROnPlateauEv(opt,
                                factor=opts['reducelr']['factor'],
                                patience=opts['reducelr']['patience'],
    excludes=[])

list_saved = []
for rr in tqdm(list(data_rr), ascii=True, ncols=50):
    # optional #
    aux_info = None  # TODO REMOVE THIS
    # case: prediction mode #
    if opts['mode'] == 'pred':
        curr_key_list = text_iterator.get_key_by_index(rr)
        if model.TYPE == TacotronType.MULTI_SPEAKER:
            if opts['spkvec'] is None:
                # use the reference speaker vector of each utterance key #
                _spk_vec = np.stack(
                    feat_spkvec_iterator.get_feat_by_key(
                        curr_key_list)).astype('float32')
                _spk_vec = Variable(
                    tensorauto(opts['gpu'], torch.from_numpy(_spk_vec)))
            elif os.path.exists(opts['spkvec']):
                # load a single speaker vector from file and repeat it over the batch #
                _spk_vec = np.load(
                    opts['spkvec'])['feat'][None, :].astype('float32')
                _spk_vec = np.repeat(_spk_vec, len(rr), axis=0)
                _spk_vec = Variable(
                    tensorauto(opts['gpu'], torch.from_numpy(_spk_vec)))
            else:
                # treat opts['spkvec'] as an utterance key and repeat its vector #
                _spk_vec = feat_spkvec_iterator.get_feat_by_key(opts['spkvec'])
                _spk_vec = _spk_vec[None, :].astype('float32')
                _spk_vec = np.repeat(_spk_vec, len(rr), axis=0)
                _spk_vec = Variable(
                    tensorauto(opts['gpu'], torch.from_numpy(_spk_vec)))
            aux_info = {'speaker_vector': _spk_vec}
    else:
def fn_batch_tts(model, text_mat, text_len, feat_mat, feat_len,
                 aux_info=None, train_step=True, coeff_loss=1):
    # refit data #
    if max(feat_len) != feat_mat.shape[1]:
        feat_mat = feat_mat[:, 0:max(feat_len)]
    if max(text_len) != text_mat.shape[1]:
        text_mat = text_mat[:, 0:max(text_len)]
    batch_size = text_mat.shape[0]
    if not isinstance(text_mat, Variable):
        text_mat = Variable(text_mat)
    if not isinstance(feat_mat, Variable):
        feat_mat = Variable(feat_mat)
    # shift input/output by one frame for teacher forcing #
    feat_mat_input = feat_mat[:, 0:-1]
    feat_mat_output = feat_mat[:, 1:]
    feat_mask = Variable(
        generate_seq_mask([x - 1 for x in feat_len], opts['gpu']))
    feat_label_end = Variable(
        1. - generate_seq_mask([x - 1 - opts['tts_pad_sil'] for x in feat_len],
                               opts['gpu'], max_len=feat_mask.size(1)))
    model.reset()
    model.train(train_step)
    model.encode(text_mat, text_len)

    # additional input condition #
    if model.TYPE == TacotronType.MULTI_SPEAKER:
        aux_info['speaker_vector'] = Variable(
            tensorauto(opts['gpu'],
                       torch.from_numpy(
                           np.stack(aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)

    batch, dec_len, _ = feat_mat_input.size()
    list_dec_core = []
    list_dec_core_bernoulli_end = []
    list_dec_att = []
    for ii in range(dec_len):
        _dec_core_ii, _dec_att_ii, _dec_core_bernoulli_end = model.decode(
            feat_mat_input[:, ii],
            feat_mask[:, ii] if opts['tts_mask_dec'] else None)
        list_dec_core.append(_dec_core_ii)
        list_dec_core_bernoulli_end.append(_dec_core_bernoulli_end)
        list_dec_att.append(_dec_att_ii['att_output']['p_ctx'])

    dec_core = torch.stack(list_dec_core, 1)
    dec_core_bernoulli_end = torch.cat(list_dec_core_bernoulli_end, 1)
    dec_att = torch.stack(list_dec_att, dim=1)

    # main : loss on mel spectrogram #
    loss_core = tts_loss(dec_core, feat_mat_output, feat_mask)
    # optional : aux loss for lower frequency #
    loss_core_freq = 1 * tts_loss_freq(dec_core, feat_mat_output, feat_mask)
    loss_feat = loss_core + loss_core_freq

    # optional : aux loss for speaker embedding reconstruction #
    if model_tts.TYPE == TacotronType.MULTI_SPEAKER:
        loss_spk_emb = tts_loss_spk_emb(
            dec_core.view(batch_size, -1, NDIM_FEAT),
            [x * opts['tts_group'] for x in feat_len],
            aux_info['speaker_vector'])
    else:
        loss_spk_emb = Variable(torchauto(opts['gpu']).FloatTensor([0.0]))

    # main : frame ending prediction #
    loss_core_bernoulli_end = F.binary_cross_entropy_with_logits(
        dec_core_bernoulli_end, feat_label_end) * opts['tts_coeff_bern']
    acc_core_bernoulli_end = ((dec_core_bernoulli_end > 0.0)
                              == (feat_label_end > 0.5)).float().mean()

    # combine all losses #
    loss = loss_feat + loss_core_bernoulli_end + loss_spk_emb
    loss = loss * coeff_loss

    if train_step:
        model.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), opts['tts_grad_clip'])
        tts_opt.step()

    return loss.data.sum(), loss_feat.data.sum(), loss_core_bernoulli_end.data.sum(), \
        loss_spk_emb.data.sum(), acc_core_bernoulli_end.data.sum()