def __init__(self, hparams):
    super(EmotionNet, self).__init__()
    self.unknown_id = len(hparams.emotion_classes)
    self.ref_enc = ReferenceEncoder(
        hparams,
        hparams.emotionnet_ref_enc_convs,
        hparams.emotionnet_ref_enc_rnn_dim,
        hparams.emotionnet_ref_enc_use_bias,
        drop_rate=hparams.emotionnet_ref_enc_droprate,
    )

    # speaker embedding dim + reference encoder dim + text RNN dim
    input_dim = hparams.speaker_embedding_dim + hparams.emotionnet_ref_enc_rnn_dim + hparams.emotionnet_RNN_dim
    self.classifier_layer = LinearNorm(input_dim, len(hparams.emotion_classes))

    # the latent layer's input also includes the n_classes-dim classifier output
    input_dim = input_dim + len(hparams.emotion_classes)
    self.classifier_layer_dropout = hparams.emotionnet_classifier_layer_dropout
    self.latent_layer = LinearNorm(input_dim, hparams.emotionnet_latent_dim * 2)
    self.encoder_outputs_dropout = hparams.emotionnet_encoder_outputs_dropout

    self.text_rnn = nn.GRU(hparams.encoder_LSTM_dim,
                           hparams.emotionnet_RNN_dim,
                           batch_first=True)
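# Hedged sketch (not the actual forward method) of how the layers declared in
# EmotionNet.__init__ above might combine, inferred only from their dimensions:
# the classifier output is re-appended to the features before `latent_layer`
# produces a 2 * latent_dim vector (mean / log-variance sized). All argument
# names below are illustrative assumptions.
def _emotionnet_head_sketch(module, speaker_embed, ref_embed, text_feat):
    feats = torch.cat([speaker_embed, ref_embed, text_feat], dim=1)  # (B, input_dim)
    logits = module.classifier_layer(
        nn.functional.dropout(feats, module.classifier_layer_dropout, module.training))
    probs = logits.softmax(dim=1)                                    # (B, n_classes)
    mu, logvar = module.latent_layer(
        torch.cat([feats, probs], dim=1)).chunk(2, dim=1)            # (B, latent_dim) each
    return probs, mu, logvar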
def __init__(self, hparams):
    super(AuxEmotionNet, self).__init__()

    # MLP over the torchMoji attention features; hidden layers use LeakyReLU,
    # the last layer maps back to torchMoji_attDim
    layers = []
    for i, dim in enumerate(hparams.auxemotionnet_layer_dims):
        last_layer = (i + 1 == len(hparams.auxemotionnet_layer_dims))
        in_dim = out_dim = dim
        if i == 0:
            in_dim = hparams.torchMoji_attDim
        if last_layer:
            out_dim = hparams.torchMoji_attDim
        layers.append(LinearNorm(in_dim, out_dim))
        if not last_layer:
            layers.append(nn.LeakyReLU(negative_slope=0.05, inplace=True))
    self.seq_layers = nn.Sequential(*layers)

    self.n_classes = len(hparams.emotion_classes)
    # speaker embedding dim + torchMoji feature dim + text RNN dim
    input_dim = hparams.speaker_embedding_dim + hparams.torchMoji_attDim + hparams.auxemotionnet_RNN_dim
    self.classifier_layer_dropout = hparams.auxemotionnet_classifier_layer_dropout
    # single projection producing 2 * latent_dim values plus n_classes logits
    self.latent_classifier_layer = LinearNorm(
        input_dim, hparams.emotionnet_latent_dim * 2 + self.n_classes)
    self.encoder_outputs_dropout = hparams.auxemotionnet_encoder_outputs_dropout

    self.text_rnn = nn.GRU(hparams.encoder_LSTM_dim,
                           hparams.auxemotionnet_RNN_dim,
                           batch_first=True)
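# Hedged sketch (not the actual forward method): `latent_classifier_layer` above
# outputs 2 * latent_dim + n_classes values, which suggests a joint prediction of
# latent parameters and class logits. The split order and all argument names are
# illustrative assumptions.
def _auxemotionnet_head_sketch(module, speaker_embed, torchmoji_feat, text_feat, latent_dim):
    feats = torch.cat([speaker_embed, module.seq_layers(torchmoji_feat), text_feat], dim=1)
    out = module.latent_classifier_layer(
        nn.functional.dropout(feats, module.classifier_layer_dropout, module.training))
    mu = out[:, :latent_dim]
    logvar = out[:, latent_dim:2 * latent_dim]
    logits = out[:, 2 * latent_dim:]  # (B, n_classes)
    return mu, logvar, logits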
def __init__(self, hp):
    super(MemoryBottleneck, self).__init__()
    self.mem_output_dim = hp.memory_bottleneck_dim
    self.mem_input_dim = (hp.encoder_LSTM_dim + hp.speaker_embedding_dim
                          + len(hp.emotion_classes) + hp.emotionnet_latent_dim + 1)
    self.bottleneck = LinearNorm(self.mem_input_dim,
                                 self.mem_output_dim,
                                 bias=hp.memory_bottleneck_bias,
                                 w_init_gain='tanh')
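# Hedged usage sketch: mem_input_dim above matches a concatenation of per-step
# encoder outputs with broadcast speaker embedding, emotion class vector, emotion
# latent and one extra scalar feature. Tensor names are illustrative assumptions.
def _memory_bottleneck_sketch(module, encoder_outputs, speaker_embed, emotion_vec,
                              emotion_latent, scalar_feat):
    T = encoder_outputs.size(1)  # encoder_outputs: (B, T_enc, encoder_LSTM_dim)
    memory = torch.cat([
        encoder_outputs,
        speaker_embed.unsqueeze(1).expand(-1, T, -1),
        emotion_vec.unsqueeze(1).expand(-1, T, -1),
        emotion_latent.unsqueeze(1).expand(-1, T, -1),
        scalar_feat.view(-1, 1, 1).expand(-1, T, -1),
    ], dim=2)
    return module.bottleneck(memory)  # (B, T_enc, memory_bottleneck_dim)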
def __init__(self, attention_n_filters, attention_kernel_size, attention_dim,
             out_bias=False):
    super(LocationLayer, self).__init__()
    padding = int((attention_kernel_size - 1) / 2)
    self.location_conv = ConvNorm(2, attention_n_filters,
                                  kernel_size=attention_kernel_size,
                                  padding=padding, bias=False,
                                  stride=1, dilation=1)
    self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                     bias=out_bias, w_init_gain='tanh')
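# Minimal usage sketch, assuming the standard Tacotron 2 location-sensitive
# attention layout where the two input channels are the previous and cumulative
# attention weights; the actual forward method is not shown here.
def _location_layer_sketch(layer, attention_weights_cat):
    # attention_weights_cat: (B, 2, T_enc)
    processed = layer.location_conv(attention_weights_cat)  # (B, n_filters, T_enc)
    processed = processed.transpose(1, 2)                   # (B, T_enc, n_filters)
    return layer.location_dense(processed)                  # (B, T_enc, attention_dim)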
def __init__(self, hp):
    super(Encoder, self).__init__()
    self.encoder_speaker_embed_dim = hp.encoder_speaker_embed_dim
    if self.encoder_speaker_embed_dim:
        self.encoder_speaker_embedding = nn.Embedding(
            hp.n_speakers, self.encoder_speaker_embed_dim)

    self.encoder_concat_speaker_embed = hp.encoder_concat_speaker_embed
    self.encoder_conv_hidden_dim = hp.encoder_conv_hidden_dim

    convolutions = []
    for i in range(hp.encoder_n_convolutions):
        if i == 0:  # first conv
            if self.encoder_concat_speaker_embed == 'before_conv':
                input_dim = hp.symbols_embedding_dim + self.encoder_speaker_embed_dim
            elif self.encoder_concat_speaker_embed == 'before_lstm':
                input_dim = hp.symbols_embedding_dim
            else:
                raise NotImplementedError(
                    f'encoder_concat_speaker_embed has invalid value {hp.encoder_concat_speaker_embed}, valid values are "before_conv", "before_lstm".')
        else:
            input_dim = self.encoder_conv_hidden_dim

        if i == hp.encoder_n_convolutions - 1:  # last conv
            if self.encoder_concat_speaker_embed == 'before_conv':
                output_dim = hp.encoder_LSTM_dim
            elif self.encoder_concat_speaker_embed == 'before_lstm':
                output_dim = hp.encoder_LSTM_dim - self.encoder_speaker_embed_dim
        else:
            output_dim = self.encoder_conv_hidden_dim

        conv_layer = nn.Sequential(
            ConvNorm(input_dim, output_dim,
                     kernel_size=hp.encoder_kernel_size, stride=1,
                     padding=int((hp.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(output_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hp.encoder_LSTM_dim, int(hp.encoder_LSTM_dim / 2), 1,
                        batch_first=True, bidirectional=True)
    self.LReLU = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU
    self.sylps_layer = LinearNorm(hp.encoder_LSTM_dim, 1)
def __init__(self, hparams):
    super(SylpsNet, self).__init__()
    layers = []
    for i, dim in enumerate(hparams.sylpsnet_layer_dims):
        last_layer = (i + 1 == len(hparams.sylpsnet_layer_dims))
        in_dim = out_dim = dim
        if i == 0:
            in_dim = 2
        if last_layer:
            out_dim = 1
        layers.append(LinearNorm(in_dim, out_dim))
        if not last_layer:
            layers.append(nn.LeakyReLU(negative_slope=0.05, inplace=True))
    self.seq_layers = nn.Sequential(*layers)
    self.res_weight = nn.Parameter(torch.tensor(0.01))
def __init__(self, hparams):
    super(GST, self).__init__()
    if not hparams.ss_vae_gst:
        mha_outdim = hparams.token_num * (1 + hparams.gst_vae_mode)
    else:
        mha_outdim = len(hparams.vae_classes)

    # VAE / SS-VAE
    self.vae = hparams.gst_vae_mode
    self.ss_vae = hparams.ss_vae_gst
    self.ss_vae_zu_dim = hparams.ss_vae_zu_dim
    if self.ss_vae:
        self.ss_vae_layers = nn.Sequential(
            nn.Linear(mha_outdim, 2 * self.ss_vae_zu_dim),
            nn.Tanh(),
        )

    # Encoder
    self.token_embedding_size = hparams.token_embedding_size
    self.token_num = hparams.token_num

    self.ref_encoder = ReferenceEncoder(hparams, activation_fn=torch.tanh)
    self.att = MultiHeadAttention(hparams, outdim=mha_outdim)
    self.token_embedding = nn.Parameter(
        torch.zeros([
            self.ss_vae_zu_dim if self.ss_vae else self.token_num,
            self.token_embedding_size
        ]))  # (token_num, Embedding)
    init.normal_(self.token_embedding, mean=0., std=0.5)

    # Token activation function
    if hparams.token_activation_func == 'softmax':
        self.activation_fn = 0
    elif hparams.token_activation_func == 'sigmoid':
        self.activation_fn = 1
    elif hparams.token_activation_func == 'tanh':
        self.activation_fn = 2
    elif hparams.token_activation_func == 'linear':
        self.activation_fn = 3
    else:
        raise ValueError(
            f'token_activation_func of {hparams.token_activation_func} is invalid\n'
            'Please use "softmax", "sigmoid", "tanh" or "linear"')

    # tanh on output embed
    self.output_tanh = True

    # torchMoji
    self.torchMoji_linear = hparams.torchMoji_linear
    if self.torchMoji_linear:
        self.map_lin = LinearNorm(
            hparams.torchMoji_attDim,
            self.token_num * (1 + hparams.gst_vae_mode))

    # Drop Tokens
    self.p_drop_tokens = hparams.p_drop_tokens
    self.drop_tokens_mode = hparams.drop_tokens_mode
    if self.drop_tokens_mode == 'embedding':
        self.embedding = nn.Embedding(
            1, self.token_num * (1 + hparams.gst_vae_mode))
    elif self.drop_tokens_mode == 'speaker_embedding':
        self.speaker_embedding = nn.Embedding(
            hparams.n_speakers, self.token_num * (1 + hparams.gst_vae_mode))
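# Hedged sketch of the SS-VAE head declared above: `ss_vae_layers` outputs
# 2 * ss_vae_zu_dim values, the usual shape for a mean / log-variance pair
# followed by a reparameterised sample. Not the actual forward method; names
# are illustrative assumptions.
def _gst_ss_vae_sketch(gst, att_out):
    mu, logvar = gst.ss_vae_layers(att_out).chunk(2, dim=-1)
    zu = mu + torch.randn_like(mu) * (0.5 * logvar).exp()  # reparameterisation trick
    return zu, mu, logvar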
def __init__(self, hparams):
    super(UnTTS, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.melenc_enable = hparams.melenc_enable

    self.bn_pl = MaskedBatchNorm1d(1, momentum=0.10, eval_only_momentum=False, affine=False)
    self.bn_energy = MaskedBatchNorm1d(1, momentum=0.05, eval_only_momentum=False, affine=False)
    self.bn_cenergy = MaskedBatchNorm1d(1, momentum=0.05, eval_only_momentum=False, affine=False)
    self.lbn_duration = LnBatchNorm1d(1, momentum=0.05, eval_only_momentum=False, affine=False,
                                      clamp_min=0.75, clamp_max=60.)

    if (hparams.f0_log_scale if hasattr(hparams, 'f0_log_scale') else False):
        self.bn_f0 = LnBatchNorm1d(1, momentum=0.05, eval_only_momentum=False, affine=False,
                                   clamp_min=0.01, clamp_max=800.)
        self.bn_cf0 = LnBatchNorm1d(1, momentum=0.05, eval_only_momentum=False, affine=False,
                                    clamp_min=0.01, clamp_max=800.)
    else:
        self.bn_f0 = MaskedBatchNorm1d(1, momentum=0.05, eval_only_momentum=False, affine=False)
        self.bn_cf0 = MaskedBatchNorm1d(1, momentum=0.05, eval_only_momentum=False, affine=False)

    self.embedding = nn.Embedding(hparams.n_symbols, hparams.symbols_embedding_dim)
    std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
    val = sqrt(3.0) * std  # uniform bounds for std
    self.embedding.weight.data.uniform_(-val, val)

    self.torchmoji_linear = LinearNorm(hparams.torchMoji_attDim, hparams.torchMoji_crushed_dim)

    enc_global_dim = 2
    self.encoder = Encoder(hparams, enc_global_dim * 2)
    # sylps/pl + torchmoji_dim + encoder_outputs
    cond_input_dim = enc_global_dim + hparams.torchMoji_crushed_dim + hparams.encoder_LSTM_dim

    self.speaker_embedding_dim = hparams.speaker_embedding_dim
    if self.speaker_embedding_dim:
        self.speaker_embedding = nn.Embedding(hparams.n_speakers, self.speaker_embedding_dim)
        cond_input_dim += self.speaker_embedding_dim

    self.cvar_glow = CVarGlow(hparams, cond_input_dim) if hparams.DurGlow_enable else None

    cond_input_dim += 3  # char f0, char energy, char voiced_mask
    self.var_glow = VarGlow(hparams, cond_input_dim)

    melenc_input_dim = None
    self.mel_encoder = MelEncoder(hparams, melenc_input_dim,
                                  hparams.melenc_output_dim) if hparams.melenc_enable else None

    cond_input_dim += 3  # frame f0, frame energy, voiced_mask
    hparams.cond_input_dim = cond_input_dim
    self.decoder = Decoder(hparams)