def __init__(self, hparams):
    super(Postnet, self).__init__()
    self.convolutions = nn.ModuleList()

    # first conv: n_mel_channels -> postnet_embedding_dim
    self.convolutions.append(
        nn.Sequential(
            ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim,
                     kernel_size=hparams.postnet_kernel_size, stride=1,
                     padding=int((hparams.postnet_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='tanh'),
            nn.BatchNorm1d(hparams.postnet_embedding_dim))
    )

    # middle convs: postnet_embedding_dim -> postnet_embedding_dim
    for i in range(1, hparams.postnet_n_convolutions - 1):
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(hparams.postnet_embedding_dim,
                         hparams.postnet_embedding_dim,
                         kernel_size=hparams.postnet_kernel_size, stride=1,
                         padding=int((hparams.postnet_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(hparams.postnet_embedding_dim))
        )

    # last conv: postnet_embedding_dim -> n_mel_channels, linear init gain
    self.convolutions.append(
        nn.Sequential(
            ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels,
                     kernel_size=hparams.postnet_kernel_size, stride=1,
                     padding=int((hparams.postnet_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='linear'),
            nn.BatchNorm1d(hparams.n_mel_channels))
    )
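
# A minimal sketch of the forward pass this layer stack implies, following
# the usual Tacotron2 Postnet convention (tanh on every conv except the
# final linear-gain one, dropout throughout). The 0.5 dropout rate is an
# assumption, not taken from hparams; assumes `import torch` and
# `import torch.nn.functional as F` at module level.
def forward(self, x):  # x: [B, n_mel_channels, T]
    for conv in self.convolutions[:-1]:
        x = F.dropout(torch.tanh(conv(x)), 0.5, self.training)
    x = F.dropout(self.convolutions[-1](x), 0.5, self.training)  # linear output conv
    return x  # residual to be added to the decoder's mel output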
def __init__(self, hparams, input_dim, output_dim):
    super(MelEncoder, self).__init__()
    self.melenc_conv_hidden_dim = hparams.melenc_conv_dim
    self.output_dim = output_dim
    self.drop_chance = hparams.melenc_drop_frame_rate  # chance of dropping input mel frames

    convolutions = []
    for i in range(hparams.melenc_n_layers):
        # first layer takes the raw input_dim, last layer emits output_dim,
        # all other layers use the hidden conv dim
        layer_input_dim = input_dim if i == 0 else self.melenc_conv_hidden_dim
        layer_output_dim = self.output_dim if i == hparams.melenc_n_layers - 1 else self.melenc_conv_hidden_dim
        conv_layer = nn.Sequential(
            ConvNorm(layer_input_dim, layer_output_dim,
                     kernel_size=hparams.melenc_kernel_size,
                     stride=hparams.melenc_stride,
                     padding=int((hparams.melenc_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(layer_output_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.melenc_n_tokens,
                        hparams.melenc_n_tokens // 2, 1,
                        batch_first=True, bidirectional=True)
    self.LReLU = nn.LeakyReLU(negative_slope=0.01)
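
# A sketch of one plausible forward pass for this module: random whole-frame
# dropout at drop_chance, the conv stack with LeakyReLU, then the BiLSTM.
# The exact wiring is an assumption from the layers above; in particular it
# assumes output_dim == hparams.melenc_n_tokens so the conv output matches
# the LSTM's input size, and the 0.1 conv dropout is arbitrary. Assumes
# `import torch` and `import torch.nn.functional as F`.
def forward(self, mel):  # mel: [B, n_mel_channels, T]
    if self.training and self.drop_chance > 0.:
        # zero out whole frames with probability drop_chance
        keep = (torch.rand(mel.shape[0], 1, mel.shape[2], device=mel.device) > self.drop_chance).float()
        mel = mel * keep
    for conv in self.convolutions:
        mel = F.dropout(self.LReLU(conv(mel)), 0.1, self.training)
    out, _ = self.lstm(mel.transpose(1, 2))  # [B, T', output_dim] -> [B, T', melenc_n_tokens]
    return out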
def __init__(self, attention_n_filters, attention_kernel_size,
             attention_dim, out_bias=False):
    super(LocationLayer, self).__init__()
    padding = int((attention_kernel_size - 1) / 2)
    # conv over the 2-channel attention-weights history
    self.location_conv = ConvNorm(2, attention_n_filters,
                                  kernel_size=attention_kernel_size,
                                  padding=padding, bias=False,
                                  stride=1, dilation=1)
    self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                     bias=out_bias, w_init_gain='tanh')
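
# Sketch of the standard location-layer forward (as in Tacotron2), which
# these two layers imply: convolve the stacked current/cumulative attention
# weights, transpose to [B, T, n_filters], and project to attention_dim.
def forward(self, attention_weights_cat):  # [B, 2, T]
    processed = self.location_conv(attention_weights_cat)  # [B, n_filters, T]
    processed = processed.transpose(1, 2)                  # [B, T, n_filters]
    return self.location_dense(processed)                  # [B, T, attention_dim]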
def __init__(self, input_dim, output_dim, n_layers=2, n_blocks=1,
             kernel_size=3,
             act_func=nn.LeakyReLU(negative_slope=0.01, inplace=True),
             hidden_dim=None, dropout=0.2, use_batchnorm=True,
             residual_act_func=False):
    super(Conv1dResBlock, self).__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim = hidden_dim or output_dim  # default hidden width to output_dim
    self.n_blocks = n_blocks
    self.n_layers = n_layers
    self.kernel_size = kernel_size
    self.dropout = dropout
    self.act_func = act_func  # note: the default module is shared across instances (stateless, so harmless)
    self.residual_act_func = residual_act_func

    self.blocks = nn.ModuleList()
    self.start_conv = ConvNorm(input_dim, hidden_dim, 1)  # 1x1 conv into hidden width
    for i in range(self.n_blocks):
        convs = nn.ModuleList()
        for j in range(self.n_layers):
            conv = ConvNorm(hidden_dim, hidden_dim, kernel_size,
                            padding=(kernel_size - 1) // 2)
            if use_batchnorm:
                conv = nn.Sequential(conv, nn.BatchNorm1d(hidden_dim))
            convs.append(conv)
        self.blocks.append(convs)
    self.end_conv = ConvNorm(hidden_dim, output_dim, 1)  # 1x1 conv out
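
# A minimal sketch of the residual forward pass these layers suggest:
# start_conv into hidden_dim, n_blocks of n_layers convs with a skip
# connection around each block, end_conv out. residual_act_func controls
# whether the activation is also applied after the residual add; the
# dropout placement is an assumption. Assumes
# `import torch.nn.functional as F`.
def forward(self, x):  # x: [B, input_dim, T]
    x = self.start_conv(x)
    for convs in self.blocks:
        residual = x
        for conv in convs:
            x = F.dropout(self.act_func(conv(x)), self.dropout, self.training)
        x = x + residual  # skip connection around the block
        if self.residual_act_func:
            x = self.act_func(x)
    return self.end_conv(x)  # [B, output_dim, T]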
def __init__(self, hp):
    super(Encoder, self).__init__()
    self.encoder_speaker_embed_dim = hp.encoder_speaker_embed_dim
    if self.encoder_speaker_embed_dim:
        self.encoder_speaker_embedding = nn.Embedding(
            hp.n_speakers, self.encoder_speaker_embed_dim)

    self.encoder_concat_speaker_embed = hp.encoder_concat_speaker_embed
    self.encoder_conv_hidden_dim = hp.encoder_conv_hidden_dim

    convolutions = []
    for i in range(hp.encoder_n_convolutions):
        if i == 0:  # first conv
            if self.encoder_concat_speaker_embed == 'before_conv':
                input_dim = hp.symbols_embedding_dim + self.encoder_speaker_embed_dim
            elif self.encoder_concat_speaker_embed == 'before_lstm':
                input_dim = hp.symbols_embedding_dim
            else:
                raise NotImplementedError(
                    f'encoder_concat_speaker_embed has invalid value {hp.encoder_concat_speaker_embed}, valid values are "before_conv", "before_lstm".')
        else:
            input_dim = self.encoder_conv_hidden_dim

        if i == hp.encoder_n_convolutions - 1:  # last conv
            if self.encoder_concat_speaker_embed == 'before_conv':
                output_dim = hp.encoder_LSTM_dim
            elif self.encoder_concat_speaker_embed == 'before_lstm':
                output_dim = hp.encoder_LSTM_dim - self.encoder_speaker_embed_dim
        else:
            output_dim = self.encoder_conv_hidden_dim

        conv_layer = nn.Sequential(
            ConvNorm(input_dim, output_dim,
                     kernel_size=hp.encoder_kernel_size, stride=1,
                     padding=int((hp.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(output_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hp.encoder_LSTM_dim, hp.encoder_LSTM_dim // 2,
                        1, batch_first=True, bidirectional=True)
    self.LReLU = nn.LeakyReLU(negative_slope=0.01)
    self.sylps_layer = LinearNorm(hp.encoder_LSTM_dim, 1)  # scalar sylps prediction head
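
# A rough sketch of the forward pass implied above: the speaker embedding is
# concatenated either before the convs or before the LSTM (matching the
# channel arithmetic in __init__), the conv stack runs with LeakyReLU and
# dropout, then the BiLSTM; sylps_layer predicts a scalar from the final
# hidden states. Variable-length packing is omitted and the 0.5 drop rate
# is an assumption; assumes `import torch` and
# `import torch.nn.functional as F`.
def forward(self, text_emb, speaker_ids=None):  # text_emb: [B, symbols_embedding_dim, T]
    if self.encoder_speaker_embed_dim:
        spk = self.encoder_speaker_embedding(speaker_ids)       # [B, spk_dim]
        spk = spk[:, :, None].expand(-1, -1, text_emb.size(2))  # [B, spk_dim, T]
        if self.encoder_concat_speaker_embed == 'before_conv':
            text_emb = torch.cat((text_emb, spk), dim=1)
    for conv in self.convolutions:
        text_emb = F.dropout(self.LReLU(conv(text_emb)), 0.5, self.training)
    if self.encoder_speaker_embed_dim and self.encoder_concat_speaker_embed == 'before_lstm':
        text_emb = torch.cat((text_emb, spk), dim=1)  # channels now sum to encoder_LSTM_dim
    outputs, (h, c) = self.lstm(text_emb.transpose(1, 2))  # outputs: [B, T, encoder_LSTM_dim]
    hidden = torch.cat((h[0], h[1]), dim=1)  # concat fwd/bwd final states
    pred_sylps = self.sylps_layer(hidden)    # [B, 1]
    return outputs, pred_sylps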
def __init__(self, hparams, global_cond_dim):
    super(Encoder, self).__init__()
    self.encoder_speaker_embed_dim = hparams.encoder_speaker_embed_dim
    if self.encoder_speaker_embed_dim:
        self.encoder_speaker_embedding = nn.Embedding(
            hparams.n_speakers, self.encoder_speaker_embed_dim)
        std = sqrt(2.0 / (hparams.n_speakers + self.encoder_speaker_embed_dim))
        val = sqrt(3.0) * std  # uniform bounds that give this std
        self.encoder_speaker_embedding.weight.data.uniform_(-val, val)

    self.encoder_conv_hidden_dim = hparams.encoder_conv_hidden_dim
    output_dim = hparams.symbols_embedding_dim + self.encoder_speaker_embed_dim  # first layer input_dim

    convolutions = []
    for i in range(hparams.encoder_n_convolutions):
        is_last_layer = (i + 1 == hparams.encoder_n_convolutions)
        input_dim = output_dim  # each layer's input is the previous layer's output
        output_dim = hparams.encoder_LSTM_dim if is_last_layer else self.encoder_conv_hidden_dim
        conv_layer = nn.Sequential(
            ConvNorm(input_dim, output_dim,
                     kernel_size=hparams.encoder_kernel_size, stride=1,
                     padding=int((hparams.encoder_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='relu'),
            nn.BatchNorm1d(output_dim))
        convolutions.append(conv_layer)
    self.convolutions = nn.ModuleList(convolutions)

    self.lstm = nn.LSTM(hparams.encoder_LSTM_dim,
                        hparams.encoder_LSTM_dim // 2, 1,
                        batch_first=True, bidirectional=True)
    self.LReLU = nn.LeakyReLU(negative_slope=0.01)
    self.cond_conv = nn.Linear(hparams.encoder_LSTM_dim, global_cond_dim)  # predicts perceived-loudness mu/logvar from the LSTM hidden state
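
# Sketch of how the conditioning head might be used, assuming the same
# conv/LSTM flow as the encoder above; the actual forward is not shown in
# this excerpt. Here the speaker embedding is always concatenated before
# the convs (matching the first-layer input_dim arithmetic), and
# cond_conv maps the concatenated final hidden states to global_cond_dim
# values (e.g. perceived-loudness mu/logvar). Assumes `import torch` and
# `import torch.nn.functional as F`; the 0.5 drop rate is an assumption.
def forward(self, x, speaker_ids):  # x: [B, symbols_embedding_dim, T]
    spk = self.encoder_speaker_embedding(speaker_ids)[:, :, None].expand(-1, -1, x.size(2))
    x = torch.cat((x, spk), dim=1)  # concat speaker embedding before convs
    for conv in self.convolutions:
        x = F.dropout(self.LReLU(conv(x)), 0.5, self.training)
    outputs, (h, _) = self.lstm(x.transpose(1, 2))  # outputs: [B, T, encoder_LSTM_dim]
    global_cond = self.cond_conv(torch.cat((h[0], h[1]), dim=1))  # [B, global_cond_dim]
    return outputs, global_cond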