def encode(self, input, src_len=None):
    """Encode a batch of token ids and register the context for attention.

    input : (batch x max_src_len) LongTensor of token ids
    src_len : list of true source lengths; defaults to full length per item

    Side effects: stores self.ctx / self.ctx_mask / self.src_len and hands
    the context (plus mask) to the attention decoder layer.
    """
    batch, max_src_len = input.size()
    src_len = [max_src_len] * batch if src_len is None else src_len

    # token ids -> embeddings : batch x max_src_len x emb_dim #
    hid = self.enc_emb_lyr(input)
    hid = F.dropout(hid, self.enc_emb_do, training=self.training)

    # prenet stack operates on flattened (batch * len) rows #
    hid = hid.view(batch * max_src_len, -1)
    for ii, prenet_lyr in enumerate(self.enc_prenet_lyr):
        hid = prenet_lyr(hid)
        hid = generator_act_fn(self.enc_prenet_fn)(hid)
        hid = F.dropout(hid, p=self.enc_prenet_do[ii], training=self.training)
    hid = hid.view(batch, max_src_len, -1)

    # core encoder consumes the per-item lengths #
    ctx = self.enc_core_lyr(hid, src_len)

    if src_len is not None:
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))
    else:
        ctx_mask = None

    self.ctx = ctx
    self.ctx_mask = ctx_mask
    self.src_len = src_len
    self.dec_att_lyr.set_ctx(ctx, ctx_mask)
def fn_batch_ce(feat_mat, feat_len, speaker_list, train_step=True):
    """One cross-entropy train/eval step for the speaker classifier.

    feat_mat : (batch x dec_len x ndim) feature tensor
    feat_len : list of true frame counts per utterance
    speaker_list : list of speaker keys, mapped to ids via map_spk2id
    train_step : when True, run backward + gradient clip + optimizer step

    Returns (summed loss value, accuracy over the batch).
    """
    feat_mat = Variable(feat_mat)
    # FIX: removed `feat_mask` — it was built (generate_seq_mask + Variable)
    # but never used anywhere in this function.
    speaker_list_id = [map_spk2id[x] for x in speaker_list]
    speaker_list_id = Variable(torchauto(model).LongTensor(speaker_list_id))
    model.reset()
    model.train(train_step)
    batch = feat_mat.size(0)  # FIX: dec_len was unpacked but never used
    pred_emb = model(feat_mat, feat_len)
    pred_softmax = model.forward_softmax(pred_emb)
    loss = criterion_ce(pred_softmax, speaker_list_id) * opts['coeff_ce']
    # accuracy = fraction of argmax predictions matching the speaker ids #
    acc = torch.max(pred_softmax, 1)[1].data.eq(
        speaker_list_id.data).sum() / batch
    if train_step:
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), opts['grad_clip'])
        opt.step()
    return loss.data.sum(), acc
def encode(self, input, src_len=None):
    """Encode a feature sequence through FNN layers + (optionally
    downsampling) RNN layers, then register the context on the decoder.

    input : (batch x max_src_len x in_size)
    src_len : list of true source lengths; defaults to full length per item
    """
    batch, max_src_len, in_size = input.size()
    if src_len is None:
        src_len = [max_src_len] * batch
    # FNN stack works on flattened (batch * len) rows #
    res = input.view(batch * max_src_len, in_size)
    enc_fnn_act = getattr(F, self.enc_fnn_act)
    for ii in range(len(self.enc_fnn)):
        res = F.dropout(
            enc_fnn_act(self.enc_fnn[ii](res)), self.enc_fnn_do[ii],
            self.training)
    res = res.view(batch, max_src_len, -1)
    for ii in range(len(self.enc_rnn)):
        res = pack(res, src_len, batch_first=True)
        res = self.enc_rnn[ii](res)[0]  # get h only #
        res, _ = unpack(res, batch_first=True)
        res = F.dropout(res, self.enc_rnn_do[ii], self.training)
        # FIX: `== True` comparison replaced with plain truthiness;
        # stray `pass` statements removed.
        if self.downsampling[ii]:
            # keep every second frame; halve the per-item lengths #
            res = res[:, 1::2]
            src_len = [x // 2 for x in src_len]
    ctx = res
    # create mask if required #
    if src_len is not None:
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))
    else:
        ctx_mask = None
    self.dec.set_ctx(ctx, ctx_mask)
def forward(self, input, seq_len=None):
    """Apply the conv bank plus a linear prediction layer, masking padded
    frames when sequence lengths are given.

    input : (batch x seq_len x ndim)
    seq_len : optional list of true lengths; when None, no masking is done
    """
    if seq_len is None:
        mask_input = None
    else:
        # batch x seq_len x 1 #
        mask_input = Variable(
            generate_seq_mask(seq_len=seq_len, device=self).unsqueeze(-1))
        # batch x 1 x seq_len, for the channel-first conv layout #
        mask_input_conv = mask_input.transpose(1, 2)
        input = input * mask_input

    out = input.transpose(1, 2)  # batch x ndim x seq_len #
    for ii, conv_lyr in enumerate(self.conv_bank_lyrs):
        out = generator_act_fn(self.conv_fn_act)(conv_lyr(out))
        if self.conv_do[ii] > 0.0:
            out = F.dropout(out, p=self.conv_do[ii], training=self.training)
        # re-mask after every conv so padding never leaks into real frames #
        if mask_input is not None:
            out = out * mask_input_conv

    out = out.transpose(1, 2)  # batch x seq_len x ndim #
    # apply linear layer #
    out = self.lin_pred_lyr(out)
    if mask_input is not None:
        out = out * mask_input
    return out
def decode_greedy_tf_torch(model, text_mat, text_len, feat_mat, feat_len,
                           group, feat_sil, max_target=1000, aux_info=None):
    """Decode with teacher forcing by feeding the ground-truth feature
    frames as decoder input.

    Returns (ungrouped decoded features, feat_len, stacked attention).
    NOTE(review): `max_target` and `feat_sil` are kept for interface
    compatibility but are not used on this teacher-forced path.
    """
    assert isinstance(model, AVAILABLE_MODEL), "model is not supported"
    if not isinstance(text_mat, Variable):
        text_mat = Variable(text_mat)
    batch = text_mat.size(0)
    model.reset()
    model.eval()
    model.encode(text_mat, text_len)
    if aux_info is not None:
        if isinstance(aux_info['speaker_vector'], list):
            aux_info['speaker_vector'] = Variable(
                tensorauto(
                    model,
                    torch.from_numpy(
                        np.stack(
                            aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)
    # FIX: removed dead code — feat_sil was tiled/expanded and wrapped in a
    # Variable (feat_sil_var) but never used afterwards.
    feat_mat_input = feat_mat[:, 0:-1]
    feat_mask = Variable(generate_seq_mask([x - 1 for x in feat_len], model))
    dec_len = feat_mat_input.size(1)
    feats_core = []
    feats_att = []
    for ii in range(dec_len):
        # FIX: index the shifted feat_mat_input (same values as the old
        # feat_mat[:, ii] over this range, but matches the declared intent).
        curr_feat, curr_decatt_res, curr_bern_end = model.decode(
            feat_mat_input[:, ii], feat_mask[:, ii])
        feats_core.append(curr_feat)
        feats_att.append(curr_decatt_res['att_output']['p_ctx'])
    feats_core = torch.stack(feats_core, dim=1)
    feats_core = feats_core * feat_mask.unsqueeze(-1)
    # ungroup frames: (batch, dec_len * group, ndim) #
    feats_core = feats_core.view(batch, feats_core.size(1) * group, -1)
    feats_att = torch.stack(feats_att, dim=1)
    return feats_core, feat_len, feats_att
def fn_batch(feat_in_mat, feat_in_len, feat_out_mat, feat_out_len,
             train_step=True):
    """One train/eval step for the feature-to-feature model.

    feat_in_mat / feat_out_mat : input and target feature tensors
    feat_in_len / feat_out_len : true lengths per batch item
    train_step : when True, run backward + gradient clip + optimizer step

    Returns the summed loss value; also logs the loss to tf_writer.
    """
    feat_in_mat = Variable(feat_in_mat)
    feat_out_mat = Variable(feat_out_mat)
    # FIX: removed `feat_in_mask` — it was built but never used.
    feat_out_mask = Variable(generate_seq_mask(feat_out_len, opts['gpu']))
    model.train(train_step)
    pred_mat_output = model(feat_in_mat, feat_in_len)
    # main : loss mel spectrogram #
    loss_core = criterion(pred_mat_output, feat_out_mat, feat_out_mask)
    # optional : aux loss for lower frequency #
    loss_core_freq = criterion_freq(pred_mat_output, feat_out_mat,
                                    feat_out_mask)
    loss_feat = loss_core + loss_core_freq
    # combine all loss #
    loss = loss_feat
    if train_step:
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), opts['grad_clip'])
        opt.step()
    # write report #
    tf_writer.add_scalar('loss', loss.data[0], global_step=tf_writer._n_iter)
    tf_writer._n_iter += 1
    return loss.data.sum()
def encode(self, input, src_len=None):
    """Encode token ids via embedding -> conv stack -> RNN stack.

    input : (batch x max_src_len) token ids
    src_len : list of true source lengths; defaults to full length per item

    Registers ctx / ctx_mask / src_len on self and hands the context to the
    attention decoder.
    """
    batch, max_src_len = input.size()
    if src_len is None:
        src_len = [max_src_len] * batch
    # batch x max_src_len x 1 validity mask #
    mask_input = Variable(
        generate_seq_mask(src_len, self,
                          max_len=input.shape[1]).unsqueeze(-1))

    hid = self.enc_emb_lyr(input) * mask_input   # batch x src_len x emb_dim #
    # conv stack expects channel-first layout #
    hid = self.enc_conv_lyr(hid.transpose(1, 2))  # batch x filter x src_len #
    hid = hid.transpose(1, 2) * mask_input        # batch x src_len x filter #

    num_rnn = len(self.enc_rnn_lyr)
    for ii in range(num_rnn):
        packed = pack(hid, src_len, batch_first=True)
        hid, _ = unpack(self.enc_rnn_lyr[ii](packed)[0], batch_first=True)
        # no dropout after the final RNN layer #
        if ii != num_rnn - 1:
            hid = F.dropout(hid, p=self.enc_rnn_do[ii],
                            training=self.training)

    # save as context #
    ctx = hid
    ctx_mask = mask_input.squeeze(-1) if src_len is not None else None
    self.ctx = ctx
    self.ctx_mask = ctx_mask
    self.src_len = src_len
    self.dec_att_lyr.set_ctx(ctx, ctx_mask)
def encode(self, input, input_aux, src_len=None):
    """Encode token ids while resolving per-utterance speaker information.

    input : (batch x max_src_len) token ids
    input_aux : dict of auxiliary info; input_aux['spk'] holds the speaker
        embedding IDs
    src_len : list of true source lengths; defaults to full length per item
    """
    batch, max_src_len = input.size()
    # resolve the speaker embedding for this batch up front #
    self.input_spk_emb = self.get_speaker_emb(input_aux['spk'])
    assert self.input_spk_emb.size(0) == batch
    if src_len is None:
        src_len = [max_src_len] * batch

    hid = self.enc_emb_lyr(input)  # batch x max_src_len x emb_dim #
    hid = F.dropout(hid, self.enc_emb_do, self.training)
    hid = hid.view(batch * max_src_len, -1)
    for ii, prenet_lyr in enumerate(self.enc_prenet_lyr):
        hid = F.dropout(
            generator_act_fn(self.enc_prenet_fn)(prenet_lyr(hid)),
            p=self.enc_prenet_do[ii], training=self.training)
    hid = hid.view(batch, max_src_len, -1)
    # NOTE(review): additive speaker conditioning of the encoder states
    # (prenet projection of input_spk_emb) was left disabled upstream.
    ctx = self.enc_core_lyr(hid, src_len)

    if src_len is not None:
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))
    else:
        ctx_mask = None
    self.ctx = ctx
    self.ctx_mask = ctx_mask
    self.src_len = src_len
    self.dec_att_lyr.set_ctx(ctx, ctx_mask)
def encode(self, input, src_len=None):
    """Run the encoder layer and hand its output to the decoder as context.

    input : (batch x max_src_len x in_size)
    src_len : list of true source lengths; defaults to full length per item
    """
    batch, max_src_len, _ = input.size()
    if src_len is None:
        src_len = [max_src_len] * batch
    enc_res = self.enc_lyr(input, src_len)
    ctx = enc_res['enc_output']
    ctx_len = enc_res['enc_len']  # kept for parity; not consumed here
    ctx_mask = None
    if src_len is not None:
        ctx_mask = Variable(
            generate_seq_mask(seq_len=src_len, device=self,
                              max_len=ctx.size(1)))
    self.dec_lyr.set_ctx(ctx, ctx_mask)
def fn_generate_dynamic_supervised(model_gen, feat_source, feat_source_len,
                                   feat_target, feat_target_len,
                                   train_step=True, tfboard_writer=None,
                                   niter=0, opt=None, model_name=''):
    """Teacher-forced supervised generation step for model_gen.

    feat_source / feat_target : source and target feature tensors
    feat_*_len : true lengths per batch item
    train_step : when True, run backward + gradient clip + optimizer step
    opt : optimizer; required when train_step is True

    Logs loss to tfboard_writer when provided. Returns nothing.
    """
    if not isinstance(feat_source, Variable):
        feat_source = Variable(feat_source)
    if not isinstance(feat_target, Variable):
        feat_target = Variable(feat_target)
    # shift target by one frame: decoder input vs. ground-truth output #
    feat_target_input = feat_target[:, 0:-1]
    feat_target_output = feat_target[:, 1:]
    feat_target_input_len = [x - 1 for x in feat_target_len]
    feat_target_mask = Variable(
        generate_seq_mask(seq_len=feat_target_input_len, device=opts['gpu']))
    batch, dec_len, _ = feat_target_input.size()
    model_gen.reset()
    model_gen.train(train_step)
    model_gen.encode(feat_source, feat_source_len)
    list_dec_core = []
    for ii in range(dec_len):
        _dec_core_ii, _ = model_gen.decode(
            feat_target_input[:, ii],
            feat_target_mask[:, ii] if opts['mask_dec'] else None)
        list_dec_core.append(_dec_core_ii)
    dec_core = torch.stack(list_dec_core, 1)
    # calculate loss and update #
    loss_sup = criterion_recon(feat_target_output, dec_core,
                               mask=None)  # TODO : decide use mask or not #
    if train_step:
        opt.zero_grad()
        loss_sup.backward()
        # FIX: clip AFTER backward. The original called clip_grad_norm
        # before backward, so the freshly computed gradients were applied
        # unclipped (clipping acted on stale/zero grads).
        torch.nn.utils.clip_grad_norm(model_gen.parameters(),
                                      opts['grad_clip'])
        opt.step()
    # log #
    if tfboard_writer is not None:
        tfboard_writer.add_scalar('loss/sup {}'.format(model_name),
                                  loss_sup.data.cpu().numpy(), niter)
def forward(self, input, input_len=None):
    """Extract a unit-norm utterance embedding with a conv/resblock stack.

    input : (batch x max_input_len x in_size)
    input_len : optional list of true lengths used to zero padded frames
    Returns : (batch x emb_dim) embedding, L2-normalized per row.
    """
    batch, max_input_len, in_size = input.size()
    # zero out padded frames before convolving #
    if input_len is not None:
        mask_input = Variable(
            generate_seq_mask(input_len, device=self,
                              max_len=max_input_len).unsqueeze(-1))
        input = input * mask_input
    # add a channel axis: batch x 1 x seq_len x n_dim #
    out = input.unsqueeze(1)
    for ii in range(self.num_layers):
        out = self.conv_lyr[ii](out)
        out = generator_act_fn(self.conv_fn_act)(out)
        out = self.resblock_lyr[ii](out)
    # out = [batch, out_channel, seq_len, n_dim]; pool across seq_len #
    if self.pool_fn == 'avg':
        out = F.avg_pool2d(out, kernel_size=[out.size(2), 1], stride=1)
    elif self.pool_fn == 'max':
        out = F.max_pool2d(out, kernel_size=[out.size(2), 1], stride=1)
    else:
        raise ValueError("pool_fn {} is not implemented".format(
            self.pool_fn))
    # pool across the feature dim, then drop both singleton axes #
    out = F.avg_pool2d(out, kernel_size=[1, out.size(-1)], stride=1)
    out = out.squeeze(-1).squeeze(-1)  # [batch, out_channel]
    out = self.lin_emb_lyr(out)
    # project onto the unit hypersphere #
    return out / torch.norm(out, p=2, dim=1, keepdim=True)
def fn_batch(text_mat, text_len, feat_mat, feat_len, aux_info=None,
             train_step=True):
    """One Tacotron train/eval step: teacher-forced decode + multi-part loss.

    text_mat : (batch x max_text_len) token ids
    feat_mat : (batch x max_feat_len x ndim) target features
    text_len / feat_len : true lengths per batch item
    aux_info : extra conditioning (speaker vectors) for multi-speaker models
    train_step : when True, run backward + gradient clip + optimizer step

    Returns (loss, loss_feat, loss_bernoulli_end, acc_bernoulli_end) sums.
    """
    text_mat = Variable(text_mat)
    # teacher forcing: frame t is input, frame t+1 is the target #
    feat_mat_input = Variable(feat_mat[:, 0:-1])
    feat_mat_output = Variable(feat_mat[:, 1:])
    feat_mask = Variable(
        generate_seq_mask([x - 1 for x in feat_len], opts['gpu']))
    # 1 on the trailing silence frames -> "utterance has ended" labels #
    feat_label_end = Variable(
        1. - generate_seq_mask([x - 1 - opts['pad_sil'] for x in feat_len],
                               opts['gpu'], max_len=feat_mask.size(1)))
    model.reset()
    model.train(train_step)
    model.encode(text_mat, text_len)
    # additional input condition
    if model.TYPE == TacotronType.MULTI_SPEAKER:
        aux_info['speaker_vector'] = Variable(
            tensorauto(
                opts['gpu'],
                torch.from_numpy(
                    np.stack(
                        aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)
    batch, dec_len, _ = feat_mat_input.size()
    list_dec_core = []
    list_dec_core_bernoulli_end = []
    list_dec_att = []
    # step the decoder one frame at a time with the ground-truth input #
    for ii in range(dec_len):
        _dec_core_ii, _dec_att_ii, _dec_core_bernoulli_end = model.decode(
            feat_mat_input[:, ii],
            feat_mask[:, ii] if opts['mask_dec'] else None)
        list_dec_core.append(_dec_core_ii)
        list_dec_core_bernoulli_end.append(_dec_core_bernoulli_end)
        list_dec_att.append(_dec_att_ii['att_output']['p_ctx'])
        pass
    dec_core = torch.stack(list_dec_core, 1)
    dec_core_bernoulli_end = torch.cat(list_dec_core_bernoulli_end, 1)
    dec_att = torch.stack(list_dec_att, dim=1)
    # main : loss mel spectrogram #
    loss_core = criterion(dec_core, feat_mat_output, feat_mask)
    # optional : aux loss for lower frequency #
    loss_core_freq = 1 * criterion_freq(dec_core, feat_mat_output, feat_mask)
    loss_feat = loss_core + loss_core_freq
    # main : frame ending prediction #
    loss_core_bernoulli_end = F.binary_cross_entropy_with_logits(
        dec_core_bernoulli_end, feat_label_end) * opts['coeff_bern']
    # logits > 0 <=> predicted "ended"; compare against the binary labels #
    acc_core_bernoulli_end = ((dec_core_bernoulli_end > 0.0) == (
        feat_label_end > 0.5)).float().mean()
    # optional : aux loss for encourage diagonal attention #
    loss_diag_att = 1 * criterion_diag_att(
        dec_att, dec_len=[x - 1 for x in feat_len], enc_len=text_len)
    # combine all loss #
    # NOTE(review): loss_diag_att is always added here even though it is
    # only LOGGED when opts['loss_diag_att_cfg'] is set — confirm intended.
    loss = loss_feat + loss_core_bernoulli_end + loss_diag_att
    if train_step:
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), opts['grad_clip'])
        opt.step()
    # write report #
    tf_writer.add_scalar('loss', loss.data[0],
                         global_step=tf_writer._n_iter)
    tf_writer.add_scalar('loss_feat', loss_feat.data[0],
                         global_step=tf_writer._n_iter)
    tf_writer.add_scalar('loss_bern_end', loss_core_bernoulli_end.data[0],
                         global_step=tf_writer._n_iter)
    if opts['loss_diag_att_cfg'] is not None:
        tf_writer.add_scalar('loss_diag_att', loss_diag_att.data[0],
                             global_step=tf_writer._n_iter)
    tf_writer._n_iter += 1
    return loss.data.sum(), loss_feat.data.sum(
    ), loss_core_bernoulli_end.data.sum(), acc_core_bernoulli_end.data.sum(
    )
def fn_batch_tts(model, text_mat, text_len, feat_mat, feat_len,
                 aux_info=None, train_step=True, coeff_loss=1):
    """One TTS train/eval step with teacher forcing and multi-part loss.

    model : Tacotron-style model (single- or multi-speaker)
    text_mat / feat_mat : token ids and target features
    text_len / feat_len : true lengths per batch item
    aux_info : extra conditioning (speaker vectors) for multi-speaker models
    train_step : when True, run backward + gradient clip + optimizer step
    coeff_loss : scalar multiplier applied to the combined loss

    Returns (loss, loss_feat, loss_bern_end, loss_spk_emb, acc_bern_end) sums.
    """
    # refit data # — trim padding beyond the longest item in this batch
    if max(feat_len) != feat_mat.shape[1]:
        feat_mat = feat_mat[:, 0:max(feat_len)]
    if max(text_len) != text_mat.shape[1]:
        text_mat = text_mat[:, 0:max(text_len)]
    batch_size = text_mat.shape[0]
    if not isinstance(text_mat, Variable):
        text_mat = Variable(text_mat)
    if not isinstance(feat_mat, Variable):
        feat_mat = Variable(feat_mat)
    # teacher forcing: frame t is input, frame t+1 is the target #
    feat_mat_input = feat_mat[:, 0:-1]
    feat_mat_output = feat_mat[:, 1:]
    feat_mask = Variable(
        generate_seq_mask([x - 1 for x in feat_len], opts['gpu']))
    # 1 on the trailing silence frames -> "utterance has ended" labels #
    feat_label_end = Variable(
        1. - generate_seq_mask(
            [x - 1 - opts['tts_pad_sil'] for x in feat_len], opts['gpu'],
            max_len=feat_mask.size(1)))
    model.reset()
    model.train(train_step)
    model.encode(text_mat, text_len)
    # additional input condition
    if model.TYPE == TacotronType.MULTI_SPEAKER:
        aux_info['speaker_vector'] = Variable(
            tensorauto(
                opts['gpu'],
                torch.from_numpy(
                    np.stack(
                        aux_info['speaker_vector']).astype('float32'))))
        model.set_aux_info(aux_info)
    batch, dec_len, _ = feat_mat_input.size()
    list_dec_core = []
    list_dec_core_bernoulli_end = []
    list_dec_att = []
    for ii in range(dec_len):
        _dec_core_ii, _dec_att_ii, _dec_core_bernoulli_end = model.decode(
            feat_mat_input[:, ii],
            feat_mask[:, ii] if opts['tts_mask_dec'] else None)
        list_dec_core.append(_dec_core_ii)
        list_dec_core_bernoulli_end.append(_dec_core_bernoulli_end)
        list_dec_att.append(_dec_att_ii['att_output']['p_ctx'])
    dec_core = torch.stack(list_dec_core, 1)
    dec_core_bernoulli_end = torch.cat(list_dec_core_bernoulli_end, 1)
    # NOTE(review): dec_att is collected but not used in any loss here.
    dec_att = torch.stack(list_dec_att, dim=1)
    # main : loss mel spectrogram #
    loss_core = tts_loss(dec_core, feat_mat_output, feat_mask)
    # optional : aux loss for lower frequency #
    loss_core_freq = 1 * tts_loss_freq(dec_core, feat_mat_output, feat_mask)
    loss_feat = loss_core + loss_core_freq
    # optional : aux loss for speaker embedding reconstruction #
    # FIX: check the `model` parameter's TYPE, not the outer global
    # `model_tts` — the same function already checks model.TYPE above.
    if model.TYPE == TacotronType.MULTI_SPEAKER:
        loss_spk_emb = tts_loss_spk_emb(
            dec_core.view(batch_size, -1, NDIM_FEAT),
            [x * opts['tts_group'] for x in feat_len],
            aux_info['speaker_vector'])
    else:
        loss_spk_emb = Variable(torchauto(opts['gpu']).FloatTensor([0.0]))
    # main : frame ending prediction #
    loss_core_bernoulli_end = F.binary_cross_entropy_with_logits(
        dec_core_bernoulli_end, feat_label_end) * opts['tts_coeff_bern']
    acc_core_bernoulli_end = ((dec_core_bernoulli_end > 0.0) == (
        feat_label_end > 0.5)).float().mean()
    # combine all loss #
    loss = loss_feat + loss_core_bernoulli_end + loss_spk_emb
    loss = loss * coeff_loss
    # FIX: `if train_step == True:` -> plain truthiness
    if train_step:
        # NOTE(review): zeroes grads via the model while stepping tts_opt —
        # equivalent only if tts_opt optimizes exactly model's parameters.
        model.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(),
                                      opts['tts_grad_clip'])
        tts_opt.step()
    return loss.data.sum(), loss_feat.data.sum(), \
        loss_core_bernoulli_end.data.sum(), \
        loss_spk_emb.data.sum(), acc_core_bernoulli_end.data.sum()