def decode(self, y_tm1, mask=None):
    assert y_tm1.dim() == 2, "batchsize x dec_in_size (1 timestep only)"
    res = y_tm1
    for ii in range(len(self.dec_prenet_lyr)):
        res = self.dec_prenet_lyr[ii](res)
        res = generator_act_fn(self.dec_prenet_fn)(res)
        res = F.dropout(res, self.dec_prenet_do[ii], training=self.training)

    ### APPLY ADAPTER PRENET -> RNN ###
    res = self.dec_adapter_lyr.prenet_to_rnn(res)
    res = generator_act_fn(self.dec_prenet_fn)(res)
    res = F.dropout(res, self.dec_prenet_do[-1], training=self.training)
    ###

    # compute decoder rnn #
    res_dec = self.dec_att_lyr(res, mask)
    res = res_dec['dec_output']

    ### APPLY ADAPTER RNN -> FIRST REG ###
    # res = self.dec_adapter_lyr.rnn_to_first_reg(res)
    # res = self.dec_att_lyr.ctx_proj_fn_act(res)
    # res = F.dropout(res, self.dec_att_lyr.do[-1], self.training)
    ###

    res_first = self.dec_first_reg_lyr(res)

    ### APPLY ADAPTER AFTER FIRST_REG ###
    res_first = self.dec_adapter_lyr.first_reg(res_first)
    ###

    # compute sigmoid layer #
    res_bern_end = self.dec_bernoulli_end_lyr(
        torch.cat([res_first, res_dec['dec_output']], 1))
    return res_first, res_dec, res_bern_end
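# --- Hypothetical helper sketch (not from this repo) ---------------------
# generator_act_fn is used throughout but never shown here; a minimal sketch,
# assuming it maps an activation name string (or None) to a callable. The
# getattr lookup is an assumption about how names resolve.
import torch.nn.functional as F

def generator_act_fn(name):
    if name is None or name == 'none':
        return lambda x: x       # identity when no activation is configured
    return getattr(F, name)      # e.g. 'relu' -> F.relu, 'leaky_relu' -> F.leaky_relu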
def decode(self, y_tm1, mask=None):
    """
    Return:
        res_first : core (Mel) prediction
        res_dec : decoder RNN + attention result
        res_bern_end : final frame prediction
    """
    assert y_tm1.dim() == 2, "batchsize x dec_in_size (1 timestep only)"
    assert self.speaker_vector is not None, \
        "set speaker vector with the set_aux_info method"
    assert self.speaker_vector.shape[0] == y_tm1.shape[0] == self.ctx.shape[0], \
        "batch size is different"
    res = y_tm1
    # OPTIONAL #
    res = self.mask_dec_feat(res)
    for ii in range(len(self.dec_prenet_lyr)):
        res = self.dec_prenet_lyr[ii](res)
        if ii == len(self.dec_prenet_lyr) - 1:  # last prenet layer #
            # integrate speaker info #
            res_spk = self.speaker_module_lyr.dec_proj_prenet_lyr(
                self.speaker_vector)
            res_spk = generator_act_fn(self.speaker_integrate_fn)(res_spk)
            res += res_spk
        res = generator_act_fn(self.dec_prenet_fn)(res)
        res = F.dropout(res, self.dec_prenet_do[ii], training=self.training)

    # compute decoder rnn #
    res_dec = self.dec_att_lyr(res, mask)
    res = res_dec['dec_output']

    # generate mel-spec prediction #
    res_first = res
    for ii in range(len(self.dec_core_gen_lyr)):
        res_first = self.dec_core_gen_lyr[ii](res_first)
        if ii == 0:
            # integrate speaker info #
            res_spk = self.speaker_module_lyr.dec_proj_core_gen_lyr(
                self.speaker_vector)
            res_spk = generator_act_fn(self.speaker_integrate_fn)(res_spk)
            res_first = res_first + res_spk
        if ii != len(self.dec_core_gen_lyr) - 1:
            # if not last layer, apply act_fn & dropout #
            res_first = generator_act_fn(self.dec_core_gen_fn)(res_first)
            res_first = F.dropout(res_first, self.dec_core_gen_do[ii],
                                  training=self.training)

    # predict frame ending #
    res_bern_end = self.dec_bernoulli_end_lyr(
        torch.cat([res_first, res_dec['dec_output']],
                  1).detach())  # stop gradient gives better results #
    return res_first, res_dec, res_bern_end
def encode(self, input, src_len=None):
    """
    input : (batch x max_src_len)
    src_len : (batch)
    """
    batch, max_src_len = input.size()
    if src_len is None:
        src_len = [max_src_len] * batch
    res = self.enc_emb_lyr(input)  # batch x max_src_len x emb_dim #
    res = F.dropout(res, self.enc_emb_do, training=self.training)
    res = res.view(batch * max_src_len, -1)
    for ii in range(len(self.enc_prenet_lyr)):
        res = self.enc_prenet_lyr[ii](res)
        res = generator_act_fn(self.enc_prenet_fn)(res)
        res = F.dropout(res, p=self.enc_prenet_do[ii], training=self.training)
    res = res.view(batch, max_src_len, -1)
    res = self.enc_core_lyr(res, src_len)
    ctx = res
    if src_len is not None:
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))
    else:
        ctx_mask = None
    self.ctx = ctx
    self.ctx_mask = ctx_mask
    self.src_len = src_len
    self.dec_att_lyr.set_ctx(ctx, ctx_mask)
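# --- Hypothetical helper sketch (not from this repo) ---------------------
# generate_seq_mask is assumed to build a (batch x max_len) float mask with
# 1.0 on valid timesteps and 0.0 on padding; the module passed as `device`
# is presumably only used to place the mask on the right device (omitted).
import torch

def generate_seq_mask(seq_len, device=None, max_len=None):
    max_len = max_len if max_len is not None else max(seq_len)
    mask = torch.zeros(len(seq_len), max_len)
    for ii, length in enumerate(seq_len):
        mask[ii, :length] = 1.0  # mark valid positions
    return mask

print(generate_seq_mask([3, 1], max_len=4))
# tensor([[1., 1., 1., 0.],
#         [1., 0., 0., 0.]])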
def encode_raw(self, input, src_len=None):
    batch, max_src_len, in_size = input.size()
    if src_len is None:
        src_len = [max_src_len] * batch
    res = input.view(batch * max_src_len, 1, in_size, 1)
    enc_cnn_act = generator_act_fn(self.enc_cnn_act)
    # apply conv #
    for ii in range(len(self.enc_cnn)):
        res = F.dropout(enc_cnn_act(self.enc_cnn[ii](res)),
                        self.enc_cnn_do[ii], training=self.training)
        if self.use_bn:
            res = self.enc_cnn_bn[ii](res)
    # apply NiN #
    for ii in range(len(self.enc_nin)):
        res = enc_cnn_act(self.enc_nin[ii](res))
    final_h, final_w = res.size()[2:]
    res = F.avg_pool2d(res, (final_h, final_w))  # (batch * seq_len) x ch x 1 x 1 #
    res = res.squeeze(-1).squeeze(-1)  # (batch * seq_len) x ch #
    res = res.view(batch, max_src_len, -1)
    return res
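# Small self-contained illustration of the global average-pooling step in
# encode_raw: pooling over the full (h, w) window collapses each channel map
# to a scalar, yielding one fixed-size vector per input frame. Shapes here
# are arbitrary stand-ins.
import torch
import torch.nn.functional as F

res = torch.randn(6, 32, 5, 3)               # (batch * seq_len) x ch x h x w
final_h, final_w = res.size()[2:]
res = F.avg_pool2d(res, (final_h, final_w))  # (batch * seq_len) x ch x 1 x 1
res = res.squeeze(-1).squeeze(-1)            # (batch * seq_len) x ch
print(res.shape)                             # torch.Size([6, 32])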
def forward(self, x, src_len=None):
    """
    x : (batch x seq x ndim)
    src_len : (batch)
    """
    batchsize, seqlen, ndim = x.size()
    if src_len is None:
        src_len = [seqlen] * batchsize

    ### FNN ###
    # convert shape for FNN #
    res = x.contiguous().view(seqlen * batchsize, ndim)
    for ii in range(len(self.fnn_sizes)):
        res = generator_act_fn(self.fnn_act)(self.fnn_lyr[ii](res))
        res = F.dropout(res, self.fnn_do, training=self.training)

    ### RNN ###
    # convert shape for RNN #
    res = res.view(batchsize, seqlen, -1)
    if self.use_pack:
        res = pack(res, src_len, batch_first=True)
    res = self.rnn_lyr(res)[0]
    if self.use_pack:
        res, _ = unpack(res, batch_first=True)

    ### PRE SOFTMAX ###
    batchsize, seqlen_final, ndim_final = res.size()
    res = res.contiguous().view(seqlen_final * batchsize, ndim_final)
    res = self.pre_softmax(res)
    res = res.view(batchsize, seqlen_final, -1)
    res = res.transpose(1, 0)
    return res, Variable(torch.IntTensor(src_len))
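# Self-contained illustration of the pack/unpack pattern above, assuming
# `pack`/`unpack` alias torch.nn.utils.rnn.pack_padded_sequence /
# pad_packed_sequence: packing lets the RNN skip padded timesteps entirely.
import torch
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

rnn = torch.nn.LSTM(input_size=8, hidden_size=16, batch_first=True)
x = torch.randn(3, 10, 8)              # batch x seq x ndim
src_len = [10, 7, 4]                   # true lengths, sorted descending
out, _ = rnn(pack(x, src_len, batch_first=True))
out, out_len = unpack(out, batch_first=True)
print(out.shape, out_len)              # torch.Size([3, 10, 16]) tensor([10, 7, 4])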
def forward(self, input, seq_len=None):
    if seq_len is not None:
        mask_input = Variable(
            generate_seq_mask(seq_len=seq_len,
                              device=self).unsqueeze(-1))  # batch x seq_len x 1 #
        mask_input_conv = mask_input.transpose(1, 2)  # batch x 1 x seq_len #
    else:
        mask_input = None
    if mask_input is not None:
        input = input * mask_input
    res = input
    res = res.transpose(1, 2)
    for ii in range(len(self.conv_bank_lyrs)):
        res = self.conv_bank_lyrs[ii](res)
        res = generator_act_fn(self.conv_fn_act)(res)
        if self.conv_do[ii] > 0.0:
            res = F.dropout(res, p=self.conv_do[ii], training=self.training)
        if mask_input is not None:
            res = res * mask_input_conv
    res = res.transpose(1, 2)  # batch x seq_len x ndim #
    # apply linear layer #
    res = self.lin_pred_lyr(res)
    if mask_input is not None:
        res = res * mask_input
    return res
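# Self-contained illustration of the transpose convention around the conv
# bank: nn.Conv1d expects (batch x channels x seq_len), while the rest of
# this model works in (batch x seq_len x ndim). Sizes are arbitrary.
import torch

conv = torch.nn.Conv1d(in_channels=80, out_channels=128, kernel_size=3, padding=1)
x = torch.randn(2, 100, 80)        # batch x seq_len x ndim
y = conv(x.transpose(1, 2))        # batch x out_ch x seq_len
y = y.transpose(1, 2)              # back to batch x seq_len x out_ch
print(y.shape)                     # torch.Size([2, 100, 128])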
def decode(self, y_tm1, mask=None):
    assert y_tm1.dim() == 2, "batchsize x dec_in_size (1 timestep only)"
    res = y_tm1
    # OPTIONAL #
    res = self.mask_dec_feat(res)
    # apply dec prenet lyr #
    for ii in range(len(self.dec_prenet_lyr)):
        res = self.dec_prenet_lyr[ii](res)
        res = generator_act_fn(self.dec_prenet_fn)(res)
        res = F.dropout(res, p=self.dec_prenet_do[ii], training=self.training)
    # apply dec att lyr #
    res_dec_att = self.dec_att_lyr(res, mask)
    res = res_dec_att['dec_output']
    # apply lin proj lyr #
    for ii in range(len(self.dec_proj_lyr)):
        if ii != len(self.dec_proj_lyr) - 1:
            res = self.dec_proj_lyr[ii](res)
            res = generator_act_fn(self.dec_proj_fn)(res)
            res = F.dropout(res, p=self.dec_proj_do[ii], training=self.training)
        else:
            res = self.dec_proj_lyr[ii](res)
    # predict frame stopping #
    # input = spec + dec_att_output + att_context #
    _bern_end_input = torch.cat([
        res,
        res_dec_att['dec_output'],
        res_dec_att['att_output']['expected_ctx']
    ], dim=1)
    res_bern_end = self.dec_bern_end_lyr(_bern_end_input.detach())  # stop gradient #
    return res, res_dec_att, res_bern_end
def decode(self, y_tm1, mask=None):
    """
    Return:
        res_first : core (Mel) prediction
        res_dec : decoder RNN + attention result
        res_bern_end : final frame prediction
    """
    assert y_tm1.dim() == 2, "batchsize x dec_in_size (1 timestep only)"
    res = y_tm1
    # OPTIONAL #
    res = self.mask_dec_feat(res)
    for ii in range(len(self.dec_prenet_lyr)):
        res = self.dec_prenet_lyr[ii](res)
        res = generator_act_fn(self.dec_prenet_fn)(res)
        res = F.dropout(res, self.dec_prenet_do[ii], training=self.training)

    # compute decoder rnn #
    res_dec = self.dec_att_lyr(res, mask)
    res = res_dec['dec_output']

    # generate mel-spec prediction #
    res_first = res
    for ii in range(len(self.dec_core_gen_lyr)):
        res_first = self.dec_core_gen_lyr[ii](res_first)
        if ii != len(self.dec_core_gen_lyr) - 1:
            # if not last layer, apply act_fn & dropout #
            res_first = generator_act_fn(self.dec_core_gen_fn)(res_first)
            res_first = F.dropout(res_first, self.dec_core_gen_do[ii],
                                  training=self.training)

    res_bern_end = self.dec_bernoulli_end_lyr(
        torch.cat([res_first, res_dec['dec_output']],
                  1).detach())  # stop gradient gives better results #
    return res_first, res_dec, res_bern_end
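# Self-contained illustration of the .detach() stop-gradient trick used for
# the bernoulli end (stop-token) layer above: the end-of-utterance loss
# cannot push gradients back into the features that produced the spectrogram.
import torch

feat = torch.randn(2, 8, requires_grad=True)   # stand-in for decoder features
end_lyr = torch.nn.Linear(8, 1)
loss = end_lyr(feat.detach()).sum()            # detach cuts the graph here
loss.backward()
print(feat.grad)                               # None: no gradient reached feat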
def decode(self, y_tm1, mask=None):
    assert y_tm1.dim() == 2, "batchsize x dec_in_size (1 timestep only)"
    batch = y_tm1.size(0)
    res = y_tm1
    # OPTIONAL #
    res = self.mask_dec_feat(res)
    for ii in range(len(self.dec_prenet_lyr)):
        res = self.dec_prenet_lyr[ii](res)
        if ii == len(self.dec_prenet_lyr) - 1:  # last layer #
            # add speaker info #
            res_spk = self.spk_module_lyr.dec_lin_prenet_lyr(
                self.input_spk_emb)
            res_spk = generator_act_fn(self.speaker_act_fn)(res_spk)
            res += res_spk
        res = generator_act_fn(self.dec_prenet_fn)(res)
        res = F.dropout(res, self.dec_prenet_do[ii], training=self.training)

    # compute decoder rnn #
    res_dec = self.dec_att_lyr(res, mask)
    res = res_dec['dec_output']

    # add speaker info #
    res_spk = self.spk_module_lyr.dec_lin_pre_reg_first_lyr(
        self.input_spk_emb)
    res_spk = generator_act_fn(self.speaker_act_fn)(res_spk)
    res = res + res_spk

    res_first = self.dec_first_reg_lyr(res)
    res_bern_end = self.dec_bernoulli_end_lyr(
        torch.cat([res_first, res_dec['dec_output']], 1))
    return res_first, res_dec, res_bern_end
def encode(self, input, input_aux, src_len=None):
    """
    input : feat matrix
    input_aux : map containing additional info (speaker embedding ID)
    """
    batch, max_src_len = input.size()
    self.input_spk_emb = self.get_speaker_emb(input_aux['spk'])
    assert self.input_spk_emb.size(0) == batch
    if src_len is None:
        src_len = [max_src_len] * batch
    res = self.enc_emb_lyr(input)  # batch x max_src_len x emb_dim #
    res = F.dropout(res, self.enc_emb_do, training=self.training)
    res = res.view(batch * max_src_len, -1)
    for ii in range(len(self.enc_prenet_lyr)):
        res = self.enc_prenet_lyr[ii](res)
        res = generator_act_fn(self.enc_prenet_fn)(res)
        res = F.dropout(res, p=self.enc_prenet_do[ii], training=self.training)
    res = res.view(batch, max_src_len, -1)
    ### SPK ###
    # res_spk = self.spk_enc_lin_prenet_lyr(self.input_spk_emb).unsqueeze(1).expand(
    #     batch, max_src_len, self.spk_emb_lyr.embedding_dim)
    # res_spk = self.spk_act_fn(res_spk)
    # res = res + res_spk
    ###########
    res = self.enc_core_lyr(res, src_len)
    ctx = res
    if src_len is not None:
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))
    else:
        ctx_mask = None
    self.ctx = ctx
    self.ctx_mask = ctx_mask
    self.src_len = src_len
    self.dec_att_lyr.set_ctx(ctx, ctx_mask)
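# Sketch of the speaker-conditioning idea behind the commented-out SPK block
# above: project the speaker embedding once, then broadcast-add it across
# every encoder timestep. Shapes are illustrative only.
import torch

batch, max_src_len, hid = 2, 50, 128
res = torch.randn(batch, max_src_len, hid)   # encoder prenet output
spk = torch.randn(batch, hid)                # projected speaker embedding
res = res + spk.unsqueeze(1)                 # broadcast over the time axis
print(res.shape)                             # torch.Size([2, 50, 128])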
def decode(self, y_tm1, mask=None):
    assert y_tm1.dim() == 2, "batchsize x dec_in_size (1 timestep only)"
    res = y_tm1
    # OPTIONAL #
    res = self.mask_dec_feat(res)
    for ii in range(len(self.dec_prenet_lyr)):
        res = self.dec_prenet_lyr[ii](res)
        res = generator_act_fn(self.dec_prenet_fn)(res)
        res = F.dropout(res, self.dec_prenet_do[ii], training=self.training)
    # compute decoder rnn #
    res_dec = self.dec_att_lyr(res, mask)
    res = res_dec['dec_output']
    res_first = self.dec_first_reg_lyr(res)
    return res_first, res_dec
def forward(self, input, input_len=None):
    batch, max_input_len, in_size = input.size()
    # apply masking #
    if input_len is not None:
        mask_input = Variable(
            generate_seq_mask(input_len, device=self,
                              max_len=max_input_len).unsqueeze(-1))
        input = input * mask_input
    # convert to (batch, channel, seq_len, n_dim) #
    res = input.unsqueeze(1)
    # apply conv #
    for ii in range(self.num_layers):
        res = self.conv_lyr[ii](res)
        res = generator_act_fn(self.conv_fn_act)(res)
        res = self.resblock_lyr[ii](res)
    # res = [batch, out_channel, seq_len, n_dim] #
    # pool across seq_len #
    if self.pool_fn == 'avg':
        res = F.avg_pool2d(res, kernel_size=[res.size(2), 1], stride=1)
    elif self.pool_fn == 'max':
        res = F.max_pool2d(res, kernel_size=[res.size(2), 1], stride=1)
    else:
        raise ValueError("pool_fn {} is not implemented".format(self.pool_fn))
    # res = [batch, out_channel, 1, n_dim] #
    res = F.avg_pool2d(res, kernel_size=[1, res.size(-1)], stride=1)
    # res = [batch, out_channel, 1, 1] #
    res = res.squeeze(-1).squeeze(-1)
    # res = [batch, out_channel] #
    # affine transform #
    res = self.lin_emb_lyr(res)
    # normalize to unit-norm #
    res = res / torch.norm(res, p=2, dim=1, keepdim=True)
    return res
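# Self-contained check of the final unit-norm step above: dividing each row
# by its L2 norm places every speaker embedding on the unit hypersphere, so
# cosine similarity between embeddings reduces to a plain dot product.
import torch

emb = torch.randn(4, 64)
emb = emb / torch.norm(emb, p=2, dim=1, keepdim=True)
print(torch.norm(emb, dim=1))      # all ~1.0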