def __init__(self, hparams):
    super(Tacotron2, self).__init__()
    self.mask_padding = hparams.mask_padding
    self.fp16_run = hparams.fp16_run
    self.use_vae = hparams.use_vae
    self.embedding_variation = hparams.embedding_variation
    self.label_type = hparams.label_type
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.symbols_embedding_dim = hparams.symbols_embedding_dim
    self.speaker_embedding_dim = hparams.speaker_embedding_dim
    self.emotion_embedding_dim = hparams.emotion_embedding_dim
    self.transcript_embedding = nn.Embedding(
        hparams.n_symbols, hparams.symbols_embedding_dim)
    if self.use_vae:
        if self.label_type == 'one-hot':
            self.speaker_embedding = LinearNorm(
                hparams.n_speakers, hparams.speaker_embedding_dim,
                bias=True, w_init_gain='tanh')
            self.emotion_embedding = LinearNorm(
                hparams.n_emotions, hparams.emotion_embedding_dim,
                bias=True, w_init_gain='tanh')
        elif self.label_type == 'id':
            self.speaker_embedding = nn.Embedding(
                hparams.n_speakers, hparams.speaker_embedding_dim)
            self.emotion_embedding = nn.Embedding(
                hparams.n_emotions, hparams.emotion_embedding_dim)
        self.vae_input_type = hparams.vae_input_type
    std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
    val = sqrt(3.0) * std  # uniform bounds for std
    self.transcript_embedding.weight.data.uniform_(-val, val)
    self.encoder = Encoder(hparams)
    self.decoder = Decoder(hparams)
    self.postnet = Postnet(hparams)
    self.vae_gst = VAE_GST(hparams)
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])
    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + hparams.encoder_embedding_dim,
        hparams.attention_rnn_dim)
    self.attention_layer = Attention(
        hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
        hparams.decoder_rnn_dim, 1)
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)
    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')
def __init__(self, config):
    super(Decoder, self).__init__()
    self.n_mel_channels = config["n_mel_channels"]
    self.n_frames_per_step = config["n_frames_per_step"]
    self.encoder_embedding_dim = config["encoder_embedding_dim"]
    self.attention_rnn_dim = config["attention_rnn_dim"]
    self.decoder_rnn_dim = config["decoder_rnn_dim"]
    self.prenet_dim = config["prenet_dim"]
    self.max_decoder_steps = config["max_decoder_steps"]
    self.gate_threshold = config["gate_threshold"]
    self.p_attention_dropout = config["p_attention_dropout"]
    self.p_decoder_dropout = config["p_decoder_dropout"]
    self.prenet = Prenet(
        config["n_mel_channels"] * config["n_frames_per_step"],
        [config["prenet_dim"], config["prenet_dim"]])
    self.attention_rnn = nn.LSTMCell(
        config["prenet_dim"] + config["encoder_embedding_dim"],
        config["attention_rnn_dim"])
    self.attention_layer = Attention(
        config["attention_rnn_dim"], config["encoder_embedding_dim"],
        config["attention_dim"], config["attention_location_n_filters"],
        config["attention_location_kernel_size"])
    self.decoder_rnn = nn.LSTMCell(
        config["attention_rnn_dim"] + config["encoder_embedding_dim"],
        config["decoder_rnn_dim"], 1)
    self.linear_projection = LinearNorm(
        config["decoder_rnn_dim"] + config["encoder_embedding_dim"],
        config["n_mel_channels"] * config["n_frames_per_step"])
    self.gate_layer = LinearNorm(
        config["decoder_rnn_dim"] + config["encoder_embedding_dim"], 1,
        bias=True, w_init_gain='sigmoid')
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size,
             prune_rate=0):
    super(Attention, self).__init__()
    self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                  bias=False, w_init_gain='tanh')
    self.memory_layer = LinearNorm(embedding_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.score_mask_value = -float("inf")
    self.prune_rate = prune_rate
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(Attention, self).__init__()
    self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                  bias=False, w_init_gain='tanh')
    # if hparams.style == 'speaker_encoder':
    #     embedding_dim += 256
    # elif hparams.style == 'style_embedding':
    #     embedding_dim += 128
    # elif hparams.style == 'both':
    #     embedding_dim += 256 + 128
    self.memory_layer = LinearNorm(embedding_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.score_mask_value = -float("inf")
def __init__(self, in_dim, sizes, hparams):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)])
    self.convolutions = nn.Sequential(
        ConvNorm(hparams.prenet_dim, hparams.prenet_dim,
                 kernel_size=hparams.audio_kernel_size,
                 stride=hparams.audio_stride, w_init_gain='relu'),
        nn.BatchNorm1d(hparams.prenet_dim))
def __init__(self, query_dim, keys_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(Attention, self).__init__()
    # Conventional additive attention projects the query and the keys, then
    # scores them with v^T * tanh(W * query + V * keys).
    # query_layer and memory_layer produce W * query and V * keys respectively;
    # w_init_gain='tanh' because both terms are wrapped in tanh(W * query + V * keys).
    self.query_layer = LinearNorm(query_dim, attention_dim,
                                  bias=False, w_init_gain='tanh')
    self.memory_layer = LinearNorm(keys_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
    # On top of the conventional terms, this attention also convolves the
    # previous attention weights (location-sensitive attention).
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.EOS_embedding_layer = nn.Embedding(1, attention_dim)
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.score_mask_value = -float("inf")
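# Hedged sketch, not taken from the source: how the layers defined above are
# typically combined into alignment energies in location-sensitive attention
# (the Tacotron 2 formulation). This fork's actual forward may differ, e.g. in
# how EOS_embedding_layer is appended to the memory. Assumes `torch` is imported.
def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
    # query: attention-RNN state, (B, query_dim)
    # processed_memory: memory_layer(encoder outputs), (B, T_in, attention_dim)
    # attention_weights_cat: previous + cumulative weights, (B, 2, T_in)
    processed_query = self.query_layer(query.unsqueeze(1))           # (B, 1, attention_dim)
    processed_location = self.location_layer(attention_weights_cat)  # (B, T_in, attention_dim)
    energies = self.v(torch.tanh(
        processed_query + processed_location + processed_memory))    # (B, T_in, 1)
    return energies.squeeze(-1)                                      # (B, T_in)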
def __init__(self, in_dim, sizes, p_prenet_dropout, prenet_batchnorm):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)])
    self.p_prenet_dropout = p_prenet_dropout
    self.prenet_batchnorm = prenet_batchnorm
    self.p_prenet_input_dropout = 0
    if self.prenet_batchnorm:
        self.batchnorms = nn.ModuleList(
            [nn.BatchNorm1d(size) for size in sizes])
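# Hedged sketch, an assumption rather than the fork's actual forward: a pass
# consistent with the options stored above — optional input dropout, ReLU +
# dropout per layer, and per-layer BatchNorm when prenet_batchnorm is enabled.
# Assumes `import torch.nn.functional as F` and a 2-D input x of shape (B, in_dim).
def forward(self, x):
    if self.p_prenet_input_dropout:
        x = F.dropout(x, self.p_prenet_input_dropout, self.training)
    for i, linear in enumerate(self.layers):
        # training=True keeps dropout active at inference, as in Tacotron 2
        x = F.dropout(F.relu(linear(x)), p=self.p_prenet_dropout, training=True)
        if self.prenet_batchnorm:
            x = self.batchnorms[i](x)
    return x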
def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
    super(LocationLayer, self).__init__()
    padding = int((attention_kernel_size - 1) / 2)
    self.location_conv = ConvNorm(2, attention_n_filters,
                                  kernel_size=attention_kernel_size,
                                  padding=padding, bias=False,
                                  stride=1, dilation=1)
    self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                     bias=False, w_init_gain='tanh')
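# Hedged sketch (not from the source) of the usual LocationLayer forward:
# the stacked previous/cumulative attention weights (B, 2, T_in) are convolved,
# transposed, and projected to the attention dimension.
def forward(self, attention_weights_cat):
    processed_attention = self.location_conv(attention_weights_cat)  # (B, n_filters, T_in)
    processed_attention = processed_attention.transpose(1, 2)        # (B, T_in, n_filters)
    return self.location_dense(processed_attention)                  # (B, T_in, attention_dim)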
def __init__(self, hparams):
    super(BERT_Tacotron2, self).__init__()
    self.mask_padding = hparams.mask_padding
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.embedding = nn.Embedding(hparams.n_symbols, hparams.symbols_embedding_dim)
    std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
    val = sqrt(3.0) * std  # uniform bounds for std
    self.embedding.weight.data.uniform_(-val, val)
    self.encoder = Encoder(hparams)
    self.linear_converter = LinearNorm(
        hparams.encoder_embedding_dim + hparams.BERT_embedding_dim,
        hparams.encoder_embedding_dim)
    self.decoder = Decoder(hparams)
    self.postnet = Postnet(hparams)
def __init__(self, hparams, supervised=False):
    super(GMVAE_revised, self).__init__()
    self.latent_embedding_dim = hparams.latent_embedding_dim
    self.supervised = supervised

    convolutions = []
    conv_layer_1 = nn.Sequential(
        ConvNorm(hparams.n_mel_channels, hparams.latent_embedding_dim,
                 kernel_size=hparams.latent_kernel_size, stride=1,
                 padding=int((hparams.latent_kernel_size - 1) / 2),
                 dilation=1, w_init_gain='relu'),
        nn.BatchNorm1d(hparams.latent_embedding_dim))
    convolutions.append(conv_layer_1)
    conv_layer_2 = nn.Sequential(
        ConvNorm(hparams.latent_embedding_dim, hparams.latent_embedding_dim,
                 kernel_size=hparams.latent_kernel_size, stride=1,
                 padding=int((hparams.latent_kernel_size - 1) / 2),
                 dilation=1, w_init_gain='relu'),
        nn.BatchNorm1d(hparams.latent_embedding_dim))
    convolutions.append(conv_layer_2)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.latent_embedding_dim,
                        int(hparams.latent_embedding_dim / 2), 1,
                        batch_first=True, bidirectional=True)
    # self.mean_pool = nn.AvgPool1d(hparams.latent_kernel_size, stride=1)
    # self.mean_pool_out_size = hparams.latent_embedding_dim - hparams.latent_kernel_size + 1
    self.linear_projection = LinearNorm(
        hparams.latent_embedding_dim, int(hparams.latent_embedding_dim / 2))
    self.linear_projection_mean = LinearNorm(
        int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)
    self.linear_projection_variance = LinearNorm(
        int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)
    self.fc3 = nn.Linear(hparams.latent_out_dim,
                         int(hparams.latent_embedding_dim / 2))
    self.fc4 = nn.Linear(int(hparams.latent_embedding_dim / 2),
                         hparams.latent_embedding_dim)
def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
    super(LocationLayer, self).__init__()  ## does every class's __init__ need this call?
    padding = int((attention_kernel_size - 1) / 2)
    self.location_conv = ConvNorm(
        2, attention_n_filters,  ## ConvNorm is a class defined in layers.py
        kernel_size=attention_kernel_size, padding=padding,
        bias=False, stride=1, dilation=1)
    self.location_dense = LinearNorm(
        attention_n_filters, attention_dim,  ## LinearNorm is a class defined in layers.py
        bias=False, w_init_gain='tanh')
def __init__(self, num_mixtures, attention_rnn_dim, embedding_dim,
             attention_dim, attention_location_n_filters,
             attention_location_kernel_size):
    super(GMMAttention, self).__init__()
    self.num_mixtures = num_mixtures
    lin = nn.Linear(attention_dim, 3 * num_mixtures, bias=True)
    lin.weight.data.mul_(0.001)
    lin.bias.data.mul_(0.001)
    lin.bias.data.sub_(2.)
    self.F = nn.Sequential(
        LinearNorm(attention_rnn_dim, attention_dim, bias=True, w_init_gain='tanh'),
        nn.Tanh(),
        lin)
        # LinearNorm(attention_dim, 3*num_mixtures, bias=False, w_init_gain='linear'))
    self.score_mask_value = 0  # -float("inf")
    self.register_buffer('pos', torch.arange(
        0, 2000, dtype=torch.float).view(1, -1, 1).data)
def __init__(self, hparams):
    super(GST, self).__init__()
    self.token_embedding_size = hparams.token_embedding_size
    self.token_num = hparams.token_num
    self.torchMoji_linear = hparams.torchMoji_linear

    if hparams.token_activation_func == 'softmax':
        self.activation_fn = 0
    elif hparams.token_activation_func == 'sigmoid':
        self.activation_fn = 1
    elif hparams.token_activation_func == 'tanh':
        self.activation_fn = 2
    elif hparams.token_activation_func == 'absolute':
        self.activation_fn = 3
    else:
        raise ValueError(
            f'token_activation_func of {hparams.token_activation_func} is invalid\n'
            'Please use "softmax", "sigmoid", "tanh" or "absolute"')

    self.token_embedding = nn.Parameter(
        torch.zeros([self.token_num, self.token_embedding_size]))  # (token_num, Embedding)
    init.normal_(self.token_embedding, mean=0., std=0.5)
    # init.orthogonal_(self.token_embedding)
    self.ref_encoder = ReferenceEncoder(hparams, activation_fn=torch.tanh)
    self.att = MultiHeadAttention(hparams)

    # torchMoji
    if self.torchMoji_linear:
        self.map_lin = LinearNorm(hparams.torchMoji_attDim, self.token_num)

    self.p_drop_tokens = hparams.p_drop_tokens
    self.drop_tokens_mode = hparams.drop_tokens_mode
    if self.drop_tokens_mode == 'embedding':
        self.embedding = nn.Embedding(1, self.token_num)
    elif self.drop_tokens_mode == 'speaker_embedding':
        self.speaker_embedding = nn.Embedding(hparams.n_speakers, self.token_num)
def __init__(self, hparams):
    super().__init__()
    self.hidden1 = LinearNorm(64, 256)
    self.output = LinearNorm(256, hparams.speaker_num)
def __init__(self, in_dim, sizes):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList(
        [LinearNorm(in_size, out_size, bias=False)
         for (in_size, out_size) in zip(in_sizes, sizes)])
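# Hedged sketch of the forward pass usually paired with this minimal Prenet
# variant (mirroring the common Tacotron 2 formulation, not confirmed for this
# fork): dropout stays active even at inference, which is what makes the prenet
# act as an information bottleneck. Assumes `import torch.nn.functional as F`.
def forward(self, x):
    for linear in self.layers:
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x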
class GMVAE_revised(nn.Module):
    def __init__(self, hparams, supervised=False):
        super(GMVAE_revised, self).__init__()
        self.latent_embedding_dim = hparams.latent_embedding_dim
        self.supervised = supervised

        convolutions = []
        conv_layer_1 = nn.Sequential(
            ConvNorm(hparams.n_mel_channels, hparams.latent_embedding_dim,
                     kernel_size=hparams.latent_kernel_size, stride=1,
                     padding=int((hparams.latent_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(hparams.latent_embedding_dim))
        convolutions.append(conv_layer_1)
        conv_layer_2 = nn.Sequential(
            ConvNorm(hparams.latent_embedding_dim, hparams.latent_embedding_dim,
                     kernel_size=hparams.latent_kernel_size, stride=1,
                     padding=int((hparams.latent_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(hparams.latent_embedding_dim))
        convolutions.append(conv_layer_2)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hparams.latent_embedding_dim,
                            int(hparams.latent_embedding_dim / 2), 1,
                            batch_first=True, bidirectional=True)
        # self.mean_pool = nn.AvgPool1d(hparams.latent_kernel_size, stride=1)
        # self.mean_pool_out_size = hparams.latent_embedding_dim - hparams.latent_kernel_size + 1
        self.linear_projection = LinearNorm(
            hparams.latent_embedding_dim, int(hparams.latent_embedding_dim / 2))
        self.linear_projection_mean = LinearNorm(
            int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)
        self.linear_projection_variance = LinearNorm(
            int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)
        self.fc3 = nn.Linear(hparams.latent_out_dim,
                             int(hparams.latent_embedding_dim / 2))
        self.fc4 = nn.Linear(int(hparams.latent_embedding_dim / 2),
                             hparams.latent_embedding_dim)

    def parse_batch(self, batch):
        if self.supervised:
            (text_padded, input_lengths, mel_padded, gate_padded, output_lengths,
             mel_padded_512, gate_padded_512, output_lengths_512, labels) = batch
        else:
            (text_padded, input_lengths, mel_padded, gate_padded, output_lengths,
             mel_padded_512, gate_padded_512, output_lengths_512) = batch
        text_padded = to_gpu(text_padded).long()
        input_lengths = to_gpu(input_lengths).long()
        max_len = torch.max(input_lengths.data).item()
        mel_padded = to_gpu(mel_padded).float()
        gate_padded = to_gpu(gate_padded).float()
        output_lengths = to_gpu(output_lengths).long()
        mel_padded_512 = to_gpu(mel_padded_512).float()
        gate_padded_512 = to_gpu(gate_padded_512).float()
        output_lengths_512 = to_gpu(output_lengths_512).long()
        return ((text_padded, input_lengths, mel_padded, max_len,
                 output_lengths, mel_padded),
                (mel_padded, gate_padded))

    def vae_encode(self, inputs, label=None):
        _, _, x, _, _, _ = inputs
        for conv in self.convolutions:
            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
        x = x.transpose(1, 2)
        out, _ = self.lstm(x)
        out = torch.mean(out, dim=1)
        x_after_mean = out
        # out = torch.cat([out, label], 1)
        out = self.linear_projection.forward(out)
        mean = self.linear_projection_mean.forward(out)
        variance = self.linear_projection_variance.forward(out)
        # mean = torch.mean(torch.mean(self.linear_projection_mean.forward(out), dim=1), dim=0)
        # variance = torch.mean(torch.mean(self.linear_projection_variance.forward(out), dim=1), dim=0)
        return mean, variance, x_after_mean

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, label=None):
        # z = torch.cat([z, label], 1)
        h3 = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x, label=None):
        mu, logvar, x_after_mean = self.vae_encode(x, label)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, label), mu, logvar, x_after_mean

    def generate_sample(self, x):
        mu, logvar, _ = self.vae_encode(x)
        return Normal(mu, logvar.exp()).sample(
            (1, x[2].shape[2])).squeeze(dim=0)
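# Hedged usage sketch, not part of the source: with forward() above returning
# (reconstruction, mu, logvar, x_after_mean), a typical VAE objective combines a
# reconstruction term with the closed-form Gaussian KL term
# -0.5 * sum(1 + logvar - mu^2 - exp(logvar)). The weighting and reduction here
# are illustrative assumptions. Assumes torch and torch.nn.functional as F.
def vae_loss(recon, target, mu, logvar):
    recon_loss = F.mse_loss(recon, target)
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + kl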
def __init__(self, hparams):
    super(speaker_classifier, self).__init__()
    self.hidden1 = LinearNorm(hparams.z_speaker_dim, 256)
    self.output = LinearNorm(256, hparams.speaker_num)
def __init__(self, input_dims, hidden_dims):
    super(vectorBased_selfAttention, self).__init__()
    self.layers1 = LinearNorm(input_dims, hidden_dims)
    self.layers2 = LinearNorm(hidden_dims, input_dims)
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.mellotron = hparams.mellotron
    self.disable_f0 = hparams.disable_f0
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_dim = (hparams.encoder_LSTM_dim + hparams.token_embedding_size
                        + hparams.speaker_embedding_dim)
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.prenet_layers = hparams.prenet_layers
    self.prenet_batchnorm = hparams.prenet_batchnorm
    self.p_prenet_dropout = hparams.p_prenet_dropout
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.AttRNN_extra_decoder_input = hparams.AttRNN_extra_decoder_input
    self.AttRNN_hidden_dropout_type = hparams.AttRNN_hidden_dropout_type
    self.p_AttRNN_hidden_dropout = hparams.p_AttRNN_hidden_dropout
    self.p_AttRNN_cell_dropout = hparams.p_AttRNN_cell_dropout
    self.DecRNN_hidden_dropout_type = hparams.DecRNN_hidden_dropout_type
    self.p_DecRNN_hidden_dropout = hparams.p_DecRNN_hidden_dropout
    self.p_DecRNN_cell_dropout = hparams.p_DecRNN_cell_dropout
    self.p_teacher_forcing = hparams.p_teacher_forcing
    self.teacher_force_till = hparams.teacher_force_till
    self.num_att_mixtures = hparams.num_att_mixtures
    self.extra_projection = hparams.extra_projection
    self.normalize_attention_input = hparams.normalize_attention_input
    self.normalize_AttRNN_output = hparams.normalize_AttRNN_output
    self.attention_type = hparams.attention_type
    self.attention_layers = hparams.attention_layers
    self.low_vram_inference = hparams.low_vram_inference
    self.context_frames = hparams.context_frames
    self.hide_startstop_tokens = hparams.hide_startstop_tokens

    attention_rnn_in_dim = hparams.prenet_dim + self.encoder_dim
    if not self.disable_f0:
        self.prenet_f0 = ConvNorm(
            1, hparams.prenet_f0_dim,
            kernel_size=hparams.prenet_f0_kernel_size,
            padding=max(0, int(hparams.prenet_f0_kernel_size / 2)),
            bias=False, stride=1, dilation=1)
        attention_rnn_in_dim += hparams.prenet_f0_dim

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step * self.context_frames,
        [hparams.prenet_dim] * hparams.prenet_layers,
        self.p_prenet_dropout, self.prenet_batchnorm)

    if self.AttRNN_extra_decoder_input:
        attention_rnn_in_dim += hparams.decoder_rnn_dim

    if self.AttRNN_hidden_dropout_type == 'dropout':
        self.attention_rnn = nn.LSTMCell(
            attention_rnn_in_dim,        # input_size
            hparams.attention_rnn_dim)   # hidden_size
    elif self.AttRNN_hidden_dropout_type == 'zoneout':
        self.attention_rnn = LSTMCellWithZoneout(
            attention_rnn_in_dim,        # input_size
            hparams.attention_rnn_dim,   # hidden_size
            zoneout_prob=self.p_DecRNN_hidden_dropout)
        # zoneout is applied inside LSTMCellWithZoneout, so normal dropout is not needed
        self.p_AttRNN_hidden_dropout = 0.0

    if self.attention_type == 0:
        self.attention_layer = Attention(
            hparams.attention_rnn_dim, self.encoder_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)
    elif self.attention_type == 1:
        self.attention_layer = GMMAttention(
            hparams.num_att_mixtures, hparams.attention_layers,
            hparams.attention_rnn_dim, self.encoder_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size, hparams)
    else:
        raise NotImplementedError(
            "attention_type invalid, valid values are... 0 and 1")

    if self.DecRNN_hidden_dropout_type == 'dropout':
        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + self.encoder_dim,  # input_size
            hparams.decoder_rnn_dim, 1)                    # hidden_size, bias
    elif self.DecRNN_hidden_dropout_type == 'zoneout':
        self.decoder_rnn = LSTMCellWithZoneout(
            hparams.attention_rnn_dim + self.encoder_dim,  # input_size
            hparams.decoder_rnn_dim, 1,                    # hidden_size, bias
            zoneout_prob=self.p_DecRNN_hidden_dropout)
        # zoneout is applied inside LSTMCellWithZoneout, so normal dropout is not needed
        self.p_DecRNN_hidden_dropout = 0.0

    if self.extra_projection:
        self.linear_projection_pre = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_dim,
            hparams.decoder_rnn_dim + self.encoder_dim)

    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)

    self.gate_layer = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_dim, 1,
        bias=True, w_init_gain='sigmoid')
def __init__(self):
    super().__init__()
    self.hidden2 = LinearNorm(64, 256)
    self.output = LinearNorm(256, 2)
    self.lambd = 1
def __init__(self, hparams):
    super(augmentation_classifier, self).__init__()
    self.hidden2 = LinearNorm(hparams.z_speaker_dim, 256)
    self.output = LinearNorm(256, 2)
    self.lambd = 1
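# Hedged sketch, an assumption not taken from the source: the `lambd` attribute
# on these adversarial speaker/augmentation classifiers suggests a DANN-style
# gradient-reversal layer in front of them. A minimal, self-contained
# implementation of such a layer would look like this.
import torch

class GradReverse(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Pass the gradient through with its sign flipped and scaled by lambd.
        return grad_output.neg() * ctx.lambd, None

def grad_reverse(x, lambd=1.0):
    return GradReverse.apply(x, lambd)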