Example #1
    def __init__(self, hparams):
        super(EmotionNet, self).__init__()
        self.unknown_id = len(hparams.emotion_classes)

        self.ref_enc = ReferenceEncoder(
            hparams,
            hparams.emotionnet_ref_enc_convs,
            hparams.emotionnet_ref_enc_rnn_dim,
            hparams.emotionnet_ref_enc_use_bias,
            drop_rate=hparams.emotionnet_ref_enc_droprate,
        )

        input_dim = hparams.speaker_embedding_dim + hparams.emotionnet_ref_enc_rnn_dim + hparams.emotionnet_RNN_dim
        self.classifier_layer = LinearNorm(input_dim,
                                           len(hparams.emotion_classes))

        input_dim = input_dim + len(hparams.emotion_classes)
        self.classifier_layer_dropout = hparams.emotionnet_classifier_layer_dropout
        self.latent_layer = LinearNorm(input_dim,
                                       hparams.emotionnet_latent_dim * 2)

        self.encoder_outputs_dropout = hparams.emotionnet_encoder_outputs_dropout
        self.text_rnn = nn.GRU(hparams.encoder_LSTM_dim,
                               hparams.emotionnet_RNN_dim,
                               batch_first=True)
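A minimal sketch of the dimension flow above, using nn.Linear as a stand-in for LinearNorm and made-up sizes in place of the hparams (both are assumptions, as is the idea that the class logits are concatenated back onto the features before the latent layer):

import torch
import torch.nn as nn

# Hypothetical sizes standing in for hparams (not from the source).
speaker_embedding_dim = 128
ref_enc_rnn_dim = 64
emotionnet_RNN_dim = 128
n_emotion_classes = 8
emotionnet_latent_dim = 16

# Same dimension bookkeeping as EmotionNet.__init__, with nn.Linear for LinearNorm.
input_dim = speaker_embedding_dim + ref_enc_rnn_dim + emotionnet_RNN_dim
classifier_layer = nn.Linear(input_dim, n_emotion_classes)
latent_layer = nn.Linear(input_dim + n_emotion_classes, emotionnet_latent_dim * 2)

x = torch.randn(4, input_dim)                     # concatenated conditioning features
logits = classifier_layer(x)                      # (4, n_emotion_classes)
z = latent_layer(torch.cat((x, logits), dim=1))   # (4, 2 * latent_dim), e.g. mean and log-variance
print(logits.shape, z.shape)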
Example #2
    def __init__(self, hparams):
        super(AuxEmotionNet, self).__init__()
        layers = []
        for i, dim in enumerate(hparams.auxemotionnet_layer_dims):
            last_layer = (i + 1 == len(hparams.auxemotionnet_layer_dims))
            in_dim = out_dim = dim
            if i == 0:
                in_dim = hparams.torchMoji_attDim
            if last_layer:
                out_dim = hparams.torchMoji_attDim
            layers.append(LinearNorm(in_dim, out_dim))
            if not last_layer:
                layers.append(nn.LeakyReLU(negative_slope=0.05, inplace=True))
        self.seq_layers = nn.Sequential(*layers)

        self.n_classes = len(hparams.emotion_classes)
        input_dim = hparams.speaker_embedding_dim + hparams.torchMoji_attDim + hparams.auxemotionnet_RNN_dim
        self.classifier_layer_dropout = hparams.auxemotionnet_classifier_layer_dropout
        self.latent_classifier_layer = LinearNorm(
            input_dim, hparams.emotionnet_latent_dim * 2 + self.n_classes)

        self.encoder_outputs_dropout = hparams.auxemotionnet_encoder_outputs_dropout
        self.text_rnn = nn.GRU(hparams.encoder_LSTM_dim,
                               hparams.auxemotionnet_RNN_dim,
                               batch_first=True)
Example #3
 def __init__(self, hp):
     super(MemoryBottleneck, self).__init__()
     self.mem_output_dim = hp.memory_bottleneck_dim
     self.mem_input_dim = hp.encoder_LSTM_dim + hp.speaker_embedding_dim + len(
         hp.emotion_classes) + hp.emotionnet_latent_dim + 1
     self.bottleneck = LinearNorm(self.mem_input_dim,
                                  self.mem_output_dim,
                                  bias=hp.memory_bottleneck_bias,
                                  w_init_gain='tanh')
Example #4
 def __init__(self, attention_n_filters, attention_kernel_size,
              attention_dim, out_bias=False):
     super(LocationLayer, self).__init__()
     padding = int((attention_kernel_size - 1) / 2)
     self.location_conv = ConvNorm(2, attention_n_filters,
                                   kernel_size=attention_kernel_size,
                                   padding=padding, bias=False, stride=1,
                                   dilation=1)
     self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                      bias=out_bias, w_init_gain='tanh')
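A usage sketch, assuming ConvNorm and LinearNorm are thin wrappers around nn.Conv1d and nn.Linear, and that the layer processes stacked previous and cumulative attention weights as in the usual Tacotron 2 location-sensitive attention (neither assumption is shown in the snippet):

import torch
import torch.nn as nn

n_filters, kernel_size, attention_dim = 32, 31, 128
padding = (kernel_size - 1) // 2

# Stand-ins for ConvNorm / LinearNorm.
location_conv = nn.Conv1d(2, n_filters, kernel_size=kernel_size,
                          padding=padding, bias=False, stride=1, dilation=1)
location_dense = nn.Linear(n_filters, attention_dim, bias=False)

attention_weights_cat = torch.rand(4, 2, 100)           # (batch, 2, text_len)
processed = location_conv(attention_weights_cat)        # (4, n_filters, 100)
processed = location_dense(processed.transpose(1, 2))   # (4, 100, attention_dim)
print(processed.shape)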
Example #5
    def __init__(self, hp):
        super(Encoder, self).__init__()
        self.encoder_speaker_embed_dim = hp.encoder_speaker_embed_dim
        if self.encoder_speaker_embed_dim:
            self.encoder_speaker_embedding = nn.Embedding(
                hp.n_speakers, self.encoder_speaker_embed_dim)

        self.encoder_concat_speaker_embed = hp.encoder_concat_speaker_embed
        self.encoder_conv_hidden_dim = hp.encoder_conv_hidden_dim

        convolutions = []
        for i in range(hp.encoder_n_convolutions):
            if i == 0:
                if self.encoder_concat_speaker_embed == 'before_conv':
                    input_dim = hp.symbols_embedding_dim + self.encoder_speaker_embed_dim
                elif self.encoder_concat_speaker_embed == 'before_lstm':
                    input_dim = hp.symbols_embedding_dim
                else:
                    raise NotImplementedError(
                        f'encoder_concat_speaker_embed has invalid value {hp.encoder_concat_speaker_embed}; valid values are "before_conv" and "before_lstm".'
                    )
            else:
                input_dim = self.encoder_conv_hidden_dim

            if i == hp.encoder_n_convolutions - 1:  # last conv
                if self.encoder_concat_speaker_embed == 'before_conv':
                    output_dim = hp.encoder_LSTM_dim
                elif self.encoder_concat_speaker_embed == 'before_lstm':
                    output_dim = hp.encoder_LSTM_dim - self.encoder_speaker_embed_dim
            else:
                output_dim = self.encoder_conv_hidden_dim

            conv_layer = nn.Sequential(
                ConvNorm(input_dim,
                         output_dim,
                         kernel_size=hp.encoder_kernel_size,
                         stride=1,
                         padding=int((hp.encoder_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='relu'), nn.BatchNorm1d(output_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hp.encoder_LSTM_dim,
                            int(hp.encoder_LSTM_dim / 2),
                            1,
                            batch_first=True,
                            bidirectional=True)
        self.LReLU = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU

        self.sylps_layer = LinearNorm(hp.encoder_LSTM_dim, 1)
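The conv-stack sizing above depends on where the speaker embedding is concatenated. A small helper that only reproduces that bookkeeping, with hypothetical sizes rather than the real hparams:

def encoder_conv_dims(n_convs, symbols_dim, conv_hidden_dim, lstm_dim,
                      speaker_embed_dim, concat_mode):
    """Return (in_dim, out_dim) per conv, mirroring the loop in Encoder.__init__."""
    dims = []
    for i in range(n_convs):
        if i == 0:
            in_dim = symbols_dim + (speaker_embed_dim if concat_mode == 'before_conv' else 0)
        else:
            in_dim = conv_hidden_dim
        if i == n_convs - 1:  # last conv feeds the BiLSTM
            out_dim = lstm_dim if concat_mode == 'before_conv' else lstm_dim - speaker_embed_dim
        else:
            out_dim = conv_hidden_dim
        dims.append((in_dim, out_dim))
    return dims

# Hypothetical sizes: 3 convs, 512-dim symbols, 512-dim conv hidden, 768-dim LSTM, 256-dim speaker embed.
print(encoder_conv_dims(3, 512, 512, 768, 256, 'before_conv'))
print(encoder_conv_dims(3, 512, 512, 768, 256, 'before_lstm'))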
Example #6
 def __init__(self, hparams):
     super(SylpsNet, self).__init__()
     layers = []
     for i, dim in enumerate(hparams.sylpsnet_layer_dims):
         last_layer = (i + 1 == len(hparams.sylpsnet_layer_dims))
         in_dim = out_dim = dim
         if i == 0:
             in_dim = 2
         if last_layer:
             out_dim = 1
         layers.append(LinearNorm(in_dim, out_dim))
         if not last_layer:
             layers.append(nn.LeakyReLU(negative_slope=0.05, inplace=True))
     self.seq_layers = nn.Sequential(*layers)
     self.res_weight = nn.Parameter(torch.tensor(0.01))
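For a concrete picture of the loop above, a sketch with a made-up sylpsnet_layer_dims and nn.Linear standing in for LinearNorm:

import torch
import torch.nn as nn

sylpsnet_layer_dims = [32, 32]   # hypothetical value, not from the source

layers = []
for i, dim in enumerate(sylpsnet_layer_dims):
    last_layer = (i + 1 == len(sylpsnet_layer_dims))
    in_dim = 2 if i == 0 else dim        # 2 input features
    out_dim = 1 if last_layer else dim   # 1 output value
    layers.append(nn.Linear(in_dim, out_dim))
    if not last_layer:
        layers.append(nn.LeakyReLU(negative_slope=0.05, inplace=True))

seq_layers = nn.Sequential(*layers)          # Linear(2, 32) -> LeakyReLU -> Linear(32, 1)
print(seq_layers(torch.randn(4, 2)).shape)   # torch.Size([4, 1])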
Example #7
    def __init__(self, hparams):
        super(GST, self).__init__()

        if not hparams.ss_vae_gst:
            mha_outdim = hparams.token_num * (1 + hparams.gst_vae_mode)
        else:
            mha_outdim = len(hparams.vae_classes)

        # VAE / SS-VAE
        self.vae = hparams.gst_vae_mode
        self.ss_vae = hparams.ss_vae_gst
        self.ss_vae_zu_dim = hparams.ss_vae_zu_dim
        if self.ss_vae:
            self.ss_vae_layers = nn.Sequential(
                nn.Linear(mha_outdim, 2 * self.ss_vae_zu_dim),
                nn.Tanh(),
            )

        # Encoder
        self.token_embedding_size = hparams.token_embedding_size
        self.token_num = hparams.token_num

        self.ref_encoder = ReferenceEncoder(hparams, activation_fn=torch.tanh)
        self.att = MultiHeadAttention(hparams, outdim=mha_outdim)

        self.token_embedding = nn.Parameter(
            torch.zeros([
                self.ss_vae_zu_dim if self.ss_vae else self.token_num,
                self.token_embedding_size
            ]))  # (token_num, Embedding)
        init.normal_(self.token_embedding, mean=0., std=0.5)

        # Token activation function
        if hparams.token_activation_func == 'softmax': self.activation_fn = 0
        elif hparams.token_activation_func == 'sigmoid': self.activation_fn = 1
        elif hparams.token_activation_func == 'tanh': self.activation_fn = 2
        elif hparams.token_activation_func == 'linear': self.activation_fn = 3
        else:
            raise ValueError(
                f'token_activation_func of {hparams.token_activation_func} is invalid.\nPlease use "softmax", "sigmoid", "tanh" or "linear".'
            )

        # tanh on output embed
        self.output_tanh = True

        # torchMoji
        self.torchMoji_linear = hparams.torchMoji_linear
        if self.torchMoji_linear:
            self.map_lin = LinearNorm(
                hparams.torchMoji_attDim,
                self.token_num * (1 + hparams.gst_vae_mode))

        # Drop Tokens
        self.p_drop_tokens = hparams.p_drop_tokens
        self.drop_tokens_mode = hparams.drop_tokens_mode
        if self.drop_tokens_mode == 'embedding':
            self.embedding = nn.Embedding(
                1, self.token_num * (1 + hparams.gst_vae_mode))
        elif self.drop_tokens_mode == 'speaker_embedding':
            self.speaker_embedding = nn.Embedding(
                hparams.n_speakers,
                self.token_num * (1 + hparams.gst_vae_mode))
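To see how the attention output width and the token table shape change across plain GST, VAE-GST and SS-VAE modes, a short sketch with made-up sizes (reading the doubled width as mean plus log-variance is an assumption):

# Hypothetical sizes, not from the source.
token_num, token_embedding_size = 10, 256
ss_vae_zu_dim, n_vae_classes = 16, 4

for gst_vae_mode, ss_vae_gst in [(0, False), (1, False), (1, True)]:
    if not ss_vae_gst:
        mha_outdim = token_num * (1 + gst_vae_mode)   # doubled in VAE mode
    else:
        mha_outdim = n_vae_classes                    # SS-VAE: attention outputs class scores
    n_tokens = ss_vae_zu_dim if ss_vae_gst else token_num
    print(f'vae={gst_vae_mode} ss_vae={ss_vae_gst}: '
          f'mha_outdim={mha_outdim}, token_embedding shape=({n_tokens}, {token_embedding_size})')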
Example #8
    def __init__(self, hparams):
        super(UnTTS, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.melenc_enable = hparams.melenc_enable

        self.bn_pl = MaskedBatchNorm1d(1,
                                       momentum=0.10,
                                       eval_only_momentum=False,
                                       affine=False)
        self.bn_energy = MaskedBatchNorm1d(1,
                                           momentum=0.05,
                                           eval_only_momentum=False,
                                           affine=False)
        self.bn_cenergy = MaskedBatchNorm1d(1,
                                            momentum=0.05,
                                            eval_only_momentum=False,
                                            affine=False)
        self.lbn_duration = LnBatchNorm1d(1,
                                          momentum=0.05,
                                          eval_only_momentum=False,
                                          affine=False,
                                          clamp_min=0.75,
                                          clamp_max=60.)

        if getattr(hparams, 'f0_log_scale', False):
            self.bn_f0 = LnBatchNorm1d(1,
                                       momentum=0.05,
                                       eval_only_momentum=False,
                                       affine=False,
                                       clamp_min=0.01,
                                       clamp_max=800.)
            self.bn_cf0 = LnBatchNorm1d(1,
                                        momentum=0.05,
                                        eval_only_momentum=False,
                                        affine=False,
                                        clamp_min=0.01,
                                        clamp_max=800.)
        else:
            self.bn_f0 = MaskedBatchNorm1d(1,
                                           momentum=0.05,
                                           eval_only_momentum=False,
                                           affine=False)
            self.bn_cf0 = MaskedBatchNorm1d(1,
                                            momentum=0.05,
                                            eval_only_momentum=False,
                                            affine=False)

        self.embedding = nn.Embedding(hparams.n_symbols,
                                      hparams.symbols_embedding_dim)
        std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
        val = sqrt(3.0) * std  # uniform bounds for std
        self.embedding.weight.data.uniform_(-val, val)

        self.torchmoji_linear = LinearNorm(hparams.torchMoji_attDim,
                                           hparams.torchMoji_crushed_dim)

        enc_global_dim = 2
        self.encoder = Encoder(hparams, enc_global_dim * 2)
        cond_input_dim = enc_global_dim + hparams.torchMoji_crushed_dim + hparams.encoder_LSTM_dim
        #    sylps/pl   +       torchmoji_dim         +     encoder_outputs

        self.speaker_embedding_dim = hparams.speaker_embedding_dim
        if self.speaker_embedding_dim:
            self.speaker_embedding = nn.Embedding(hparams.n_speakers,
                                                  self.speaker_embedding_dim)
            cond_input_dim += self.speaker_embedding_dim

        self.cvar_glow = CVarGlow(
            hparams, cond_input_dim) if hparams.DurGlow_enable else None

        cond_input_dim += 3  # char f0, char energy, char voiced_mask
        self.var_glow = VarGlow(hparams, cond_input_dim)

        melenc_input_dim = None
        self.mel_encoder = MelEncoder(
            hparams, melenc_input_dim,
            hparams.melenc_output_dim) if hparams.melenc_enable else None

        cond_input_dim += 3  # frame f0, frame energy, voiced_mask
        hparams.cond_input_dim = cond_input_dim
        self.decoder = Decoder(hparams)