def __init__(self):
    super(Attention, self).__init__()
    # query: attention RNN state (1024) -> attention space (128)
    self.query_layer = LinearNorm(1024, 128, bias=False, w_init_gain='tanh')
    # memory: encoder outputs (512) -> attention space (128)
    self.memory_layer = LinearNorm(512, 128, bias=False, w_init_gain='tanh')
    self.v = LinearNorm(128, 1, bias=False)
    self.location_layer = LocationLayer()
    self.score_mask_value = -float("inf")
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(Attention, self).__init__()
    self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                  bias=False, w_init_gain='tanh')
    self.memory_layer = LinearNorm(embedding_dim, attention_dim,
                                   bias=False, w_init_gain='tanh')
    self.v = LinearNorm(attention_dim, 1, bias=False)
    self.location_layer = LocationLayer(attention_location_n_filters,
                                        attention_location_kernel_size,
                                        attention_dim)
    self.score_mask_value = -float('inf')
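# Both Attention variants assume a LinearNorm helper that is not shown here.
# A minimal sketch consistent with the Tacotron 2 reference implementation
# (an nn.Linear whose weights get Xavier-uniform init scaled by the named gain):
import torch
from torch import nn

class LinearNorm(nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)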
def __init__(self, in_dim, sizes):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)])
def __init__(self):
    super(Prenet, self).__init__()
    out_sizes = hps.prenet_out_sizes
    in_sizes = [hps.prenet_input_dim] + out_sizes[:-1]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False, w_init_gain='relu')
        for (in_size, out_size) in zip(in_sizes, out_sizes)])
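# Neither Prenet variant includes its forward pass. In the Tacotron 2
# reference the prenet applies ReLU then dropout with training=True, so the
# dropout stays active at inference time as well; a minimal sketch under
# that assumption:
import torch.nn.functional as F

def forward(self, x):
    for linear in self.layers:
        # dropout deliberately kept on at inference, as in the reference
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x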
def __init__(self):
    super(LocationLayer, self).__init__()
    kernel_size = 31
    padding = (kernel_size - 1) // 2  # 'same' padding for the odd kernel
    # 2 input channels: previous and cumulative attention weights
    self.location_conv = ConvNorm(2, 32, kernel_size=kernel_size,
                                  padding=padding, bias=False,
                                  stride=1, dilation=1)
    self.location_dense = LinearNorm(32, 128, bias=False, w_init_gain='tanh')
def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
    super(LocationLayer, self).__init__()
    padding = (attention_kernel_size - 1) // 2
    self.location_conv = ConvNorm(2, attention_n_filters,
                                  kernel_size=attention_kernel_size,
                                  padding=padding, bias=False,
                                  stride=1, dilation=1)
    self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                     bias=False, w_init_gain='tanh')
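# The location layer's forward pass is not shown. A sketch consistent with
# the Tacotron 2 reference: convolve the stacked previous/cumulative
# attention weights, then project them into the attention dimension.
def forward(self, attention_weights_cat):
    # attention_weights_cat: (B, 2, T_in)
    processed = self.location_conv(attention_weights_cat)  # (B, n_filters, T_in)
    processed = processed.transpose(1, 2)                  # (B, T_in, n_filters)
    return self.location_dense(processed)                  # (B, T_in, attention_dim)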
def __init__(self, args):
    super(I2SModel, self).__init__()
    self.args = args
    self.num_mels = hps.num_mels
    self.mask_padding = hps.mask_padding
    self.n_frames_per_step = hps.n_frames_per_step
    # 'BU' (bottom-up) region features carry an extra 1024 dims on top of 2048
    if self.args.img_format == 'BU':
        in_dim = 2048 + 1024
    else:
        in_dim = 2048
    # project the 1607-dim visual-information vector into the 1024-dim space
    self.Linear_vis_info = LinearNorm(1607, 1024, bias=False)
    self.encoder = Image_Encoder(in_dim)
    self.decoder = Decoder()
    self.postnet = Postnet()
    self.image_encoder = LinearNorm(hps.encoder_embedding_dim,
                                    hps.encoder_embedding_dim)
    self.mel_encoder = Mel_encoder(hps.encoder_embedding_dim)
def __init__(self):
    super(Decoder, self).__init__()
    self.n_frames_per_step = hps.n_frames_per_step
    self.n_mel_channels = hps.n_mel_channels
    self.encoder_embedding_dim = hps.encoder_embedding_dim
    self.attention_rnn_dim = hps.attention_rnn_dim
    self.decoder_rnn_dim = hps.decoder_rnn_dim
    self.prenet = Prenet()
    # attention rnn input: prenet (256) + encoder context (512) = 768; output: 1024
    self.attention_rnn = nn.LSTMCell(256 + 512, 1024)
    self.attention_layer = Attention()
    # decoder rnn input: attention hidden (1024) + context (512) = 1536; output: 1024
    self.decoder_rnn = nn.LSTMCell(1024 + 512, 1024, bias=True)
    self.linear_projection = LinearNorm(1024 + 512, 80 * hps.n_frames_per_step)
    self.gate_layer = LinearNorm(1024 + 512, 1, bias=True, w_init_gain='sigmoid')
def __init__(self):
    super(Decoder, self).__init__()
    self.num_mels = hps.num_mels
    self.n_frames_per_step = hps.n_frames_per_step
    self.encoder_embedding_dim = hps.encoder_embedding_dim
    self.attention_rnn_dim = hps.attention_rnn_dim
    self.decoder_rnn_dim = hps.decoder_rnn_dim
    self.prenet_dim = hps.prenet_dim
    self.max_decoder_steps = hps.max_decoder_steps
    self.gate_threshold = hps.gate_threshold
    self.p_attention_dropout = hps.p_attention_dropout
    self.p_decoder_dropout = hps.p_decoder_dropout

    self.prenet = Prenet(hps.num_mels * hps.n_frames_per_step,
                         [hps.prenet_dim, hps.prenet_dim])
    self.attention_rnn = nn.LSTMCell(
        hps.prenet_dim + hps.encoder_embedding_dim, hps.attention_rnn_dim)
    self.attention_layer = Attention(
        hps.attention_rnn_dim, hps.encoder_embedding_dim, hps.attention_dim,
        hps.attention_location_n_filters, hps.attention_location_kernel_size)
    self.decoder_rnn = nn.LSTMCell(
        hps.attention_rnn_dim + hps.encoder_embedding_dim,
        hps.decoder_rnn_dim, bias=True)
    self.linear_projection = LinearNorm(
        hps.decoder_rnn_dim + hps.encoder_embedding_dim,
        hps.num_mels * hps.n_frames_per_step)
    self.gate_layer = LinearNorm(
        hps.decoder_rnn_dim + hps.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')
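# The decoder's state wiring is not shown above. A single decode step,
# sketched after the Tacotron 2 reference decode() -- the state tensor names
# (attention_hidden, attention_context, attention_weights_cum, memory, mask,
# ...) are assumptions, initialized to zeros per batch elsewhere:
def decode(self, decoder_input):
    # decoder_input: prenet output for the previous frame, (B, prenet_dim)
    cell_input = torch.cat((decoder_input, self.attention_context), -1)
    self.attention_hidden, self.attention_cell = self.attention_rnn(
        cell_input, (self.attention_hidden, self.attention_cell))

    # location features: previous and cumulative alignments, (B, 2, T_in)
    attention_weights_cat = torch.cat(
        (self.attention_weights.unsqueeze(1),
         self.attention_weights_cum.unsqueeze(1)), dim=1)
    self.attention_context, self.attention_weights = self.attention_layer(
        self.attention_hidden, self.memory, self.processed_memory,
        attention_weights_cat, self.mask)
    self.attention_weights_cum += self.attention_weights

    decoder_input = torch.cat(
        (self.attention_hidden, self.attention_context), -1)
    self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
        decoder_input, (self.decoder_hidden, self.decoder_cell))

    hidden_and_context = torch.cat(
        (self.decoder_hidden, self.attention_context), dim=1)
    decoder_output = self.linear_projection(hidden_and_context)
    gate_prediction = self.gate_layer(hidden_and_context)
    return decoder_output, gate_prediction, self.attention_weights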
def __init__(self, in_dim):
    super(Image_Encoder, self).__init__()
    # two-layer projection: in_dim -> 2*E -> E, where E = encoder_embedding_dim
    in_sizes = [in_dim, hps.encoder_embedding_dim * 2]
    sizes = [hps.encoder_embedding_dim * 2, hps.encoder_embedding_dim]
    self.layers = nn.ModuleList([
        LinearNorm(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)])
    # bidirectional LSTM keeps the overall output size at E (E/2 per direction)
    self.lstm = nn.LSTM(hps.encoder_embedding_dim,
                        hps.encoder_embedding_dim // 2, 1,
                        batch_first=True, bidirectional=True)
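# A plausible Image_Encoder.forward (an assumption -- only the layers are
# defined above): push region features through the linear stack, then
# contextualize them with the bidirectional LSTM.
import torch.nn.functional as F

def forward(self, x):
    # x: (B, n_regions, in_dim) image region features
    for linear in self.layers:
        x = F.relu(linear(x))
    self.lstm.flatten_parameters()
    outputs, _ = self.lstm(x)  # (B, n_regions, encoder_embedding_dim)
    return outputs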
def __init__(self, active_encoder):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.max_len = hparams.max_len
    # the context size depends on which encoder feeds the decoder
    if active_encoder:
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
    else:
        self.encoder_embedding_dim = hparams.embedding_dim

    self.prenet = Prenet(
        hparams.n_mel_channels * hparams.n_frames_per_step,
        [hparams.prenet_dim, hparams.prenet_dim])
    self.attention_rnn = nn.LSTMCell(
        hparams.prenet_dim + self.encoder_embedding_dim,
        hparams.attention_rnn_dim, bias=True)
    self.attention_layer = Attention(
        hparams.attention_rnn_dim, self.encoder_embedding_dim,
        hparams.attention_dim, hparams.attention_location_n_filters,
        hparams.attention_location_kernel_size)
    self.decoder_rnn = nn.LSTMCell(
        hparams.attention_rnn_dim + self.encoder_embedding_dim,
        hparams.decoder_rnn_dim, bias=True)
    self.linear_projection = LinearNorm(
        hparams.decoder_rnn_dim + self.encoder_embedding_dim,
        hparams.n_mel_channels * hparams.n_frames_per_step)