コード例 #1
0
 def __init__(self):
     """Create the attention sub-layers using fixed, hard-coded sizes.

     Builds the query and memory projections, the scoring layer ``v`` and
     the location layer, then stores negative infinity in
     ``score_mask_value``.
     """
     super(Attention, self).__init__()

     # Size argument shared by both projections and by the scoring layer.
     attention_dim = 128

     self.query_layer = LinearNorm(
         1024, attention_dim, bias=False, w_init_gain='tanh')
     self.memory_layer = LinearNorm(
         512, attention_dim, bias=False, w_init_gain='tanh')
     self.v = LinearNorm(attention_dim, 1, bias=False)
     self.location_layer = LocationLayer()

     self.score_mask_value = -float("inf")
コード例 #2
0
ファイル: model.py プロジェクト: xishaoheng/Tacotron2-PyTorch
	def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
			attention_location_n_filters, attention_location_kernel_size):
		"""Create the attention sub-layers from the given sizes.

		Args:
			attention_rnn_dim: First size argument of the query projection.
			embedding_dim: First size argument of the memory projection.
			attention_dim: Second size argument of both projections, first
				size argument of the scoring layer ``v``, and last argument
				of the location layer.
			attention_location_n_filters: Forwarded to ``LocationLayer``.
			attention_location_kernel_size: Forwarded to ``LocationLayer``.
		"""
		super(Attention, self).__init__()

		# Options shared by the query and memory projections.
		projection_options = dict(bias=False, w_init_gain='tanh')

		self.query_layer = LinearNorm(
			attention_rnn_dim, attention_dim, **projection_options)
		self.memory_layer = LinearNorm(
			embedding_dim, attention_dim, **projection_options)
		self.v = LinearNorm(attention_dim, 1, bias=False)
		self.location_layer = LocationLayer(
			attention_location_n_filters,
			attention_location_kernel_size,
			attention_dim)

		self.score_mask_value = -float('inf')
コード例 #3
0
ファイル: model.py プロジェクト: xzm2004260/Tacotron2-PyTorch
 def __init__(self, in_dim, sizes):
     """Build a stack of bias-free ``LinearNorm`` layers.

     Args:
         in_dim: Input size of the first layer.
         sizes: List with the output size of each layer; every layer after
             the first takes the previous entry as its input size.
     """
     super(Prenet, self).__init__()

     # Pair each layer's input size with its output size: the first layer
     # starts from ``in_dim``, every later one from the previous output.
     layer_dims = zip([in_dim] + sizes[:-1], sizes)
     self.layers = nn.ModuleList(
         [LinearNorm(n_in, n_out, bias=False) for n_in, n_out in layer_dims])
コード例 #4
0
 def __init__(self):
     """Build the ``LinearNorm`` stack described by the ``hps`` settings.

     Output sizes come from ``hps.prenet_out_sizes``; the first layer's
     input size is ``hps.prenet_input_dim`` and each later layer takes the
     previous output size as its input size.
     """
     super(Prenet, self).__init__()

     widths = hps.prenet_out_sizes
     stack = []
     for n_in, n_out in zip([hps.prenet_input_dim] + widths[:-1], widths):
         stack.append(
             LinearNorm(n_in, n_out, bias=False, w_init_gain='relu'))
     self.layers = nn.ModuleList(stack)
コード例 #5
0
 def __init__(self):
     """Create the location convolution and dense layer with fixed sizes."""
     super(LocationLayer, self).__init__()

     kernel_size = 31
     n_filters = 32  # second argument of the conv, first of the dense layer

     # Half of (kernel_size - 1); presumably "same" padding for the odd
     # kernel -- confirm against ConvNorm.
     self.location_conv = ConvNorm(
         2, n_filters,
         kernel_size=kernel_size,
         padding=(kernel_size - 1) // 2,
         bias=False, stride=1, dilation=1)
     self.location_dense = LinearNorm(
         n_filters, 128, bias=False, w_init_gain='tanh')
コード例 #6
0
ファイル: model.py プロジェクト: xishaoheng/Tacotron2-PyTorch
	def __init__(self, attention_n_filters, attention_kernel_size,
			attention_dim):
		"""Create the location convolution and the dense layer after it.

		Args:
			attention_n_filters: Second argument of the convolution and
				first size argument of the dense layer.
			attention_kernel_size: Kernel size of the convolution; the
				padding is derived from it.
			attention_dim: Second size argument of the dense layer.
		"""
		super(LocationLayer, self).__init__()

		# Half of (kernel_size - 1), truncated toward zero.
		half_kernel = int((attention_kernel_size - 1) / 2)
		conv_options = dict(
			kernel_size=attention_kernel_size,
			padding=half_kernel,
			bias=False,
			stride=1,
			dilation=1)

		self.location_conv = ConvNorm(2, attention_n_filters, **conv_options)
		self.location_dense = LinearNorm(
			attention_n_filters, attention_dim,
			bias=False, w_init_gain='tanh')
コード例 #7
0
    def __init__(self, args):
        """Assemble the model's sub-modules.

        Args:
            args: Run configuration; only ``args.img_format`` is read here.
                The value ``'BU'`` selects the larger encoder input size.
        """
        super(I2SModel, self).__init__()
        self.args = args
        self.num_mels = hps.num_mels
        self.mask_padding = hps.mask_padding
        self.n_frames_per_step = hps.n_frames_per_step

        # 'BU' inputs carry 1024 extra values on top of the 2048 base size.
        in_dim = (2048 + 1024) if args.img_format == 'BU' else 2048

        self.Linear_vis_info = LinearNorm(1607, 1024, bias=False)
        self.encoder = Image_Encoder(in_dim)
        self.decoder = Decoder()
        self.postnet = Postnet()

        embed_dim = hps.encoder_embedding_dim
        self.image_encoder = LinearNorm(embed_dim, embed_dim)
        self.mel_encoder = Mel_encoder(embed_dim)
コード例 #8
0
ファイル: decoder.py プロジェクト: shinewide/sba_speech
    def __init__(self):
        """Build the decoder's sub-modules.

        Copies a few hyper-parameters from ``hps`` onto the instance, then
        creates the prenet, attention RNN, attention layer, decoder RNN,
        linear projection and gate layer. The layer sizes used below are
        hard-coded rather than read from ``hps``.
        """
        super(Decoder, self).__init__()

        self.n_frames_per_step = hps.n_frames_per_step
        self.n_mel_channels = hps.n_mel_channels
        self.encoder_embedding_dim = hps.encoder_embedding_dim
        self.attention_rnn_dim = hps.attention_rnn_dim
        self.decoder_rnn_dim = hps.decoder_rnn_dim

        self.prenet = Prenet()

        # attention rnn input : 256 + 512 = 768
        # attention rnn output : 1024
        self.attention_rnn = nn.LSTMCell(256 + 512, 1024)

        self.attention_layer = Attention()

        # decoder rnn input : 1024 + 512 = 1536
        # decoder rnn output : 1024
        # LSTMCell's third parameter is the ``bias`` flag, not a layer
        # count, so it is passed by keyword to avoid misreading.
        self.decoder_rnn = nn.LSTMCell(1024 + 512, 1024, bias=True)

        self.linear_projection = LinearNorm(
            1024 + 512, 80 * hps.n_frames_per_step)

        self.gate_layer = LinearNorm(
            1024 + 512, 1, bias=True, w_init_gain='sigmoid')
コード例 #9
0
ファイル: model.py プロジェクト: xzm2004260/Tacotron2-PyTorch
    def __init__(self):
        """Build the decoder's sub-modules from the ``hps`` settings.

        Copies the decoder-related hyper-parameters onto the instance, then
        creates the prenet, attention RNN, attention layer, decoder RNN,
        linear projection and gate layer, all sized from ``hps``.
        """
        super(Decoder, self).__init__()
        self.num_mels = hps.num_mels
        self.n_frames_per_step = hps.n_frames_per_step
        self.encoder_embedding_dim = hps.encoder_embedding_dim
        self.attention_rnn_dim = hps.attention_rnn_dim
        self.decoder_rnn_dim = hps.decoder_rnn_dim
        self.prenet_dim = hps.prenet_dim
        self.max_decoder_steps = hps.max_decoder_steps
        self.gate_threshold = hps.gate_threshold
        self.p_attention_dropout = hps.p_attention_dropout
        self.p_decoder_dropout = hps.p_decoder_dropout

        self.prenet = Prenet(hps.num_mels * hps.n_frames_per_step,
                             [hps.prenet_dim, hps.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            hps.prenet_dim + hps.encoder_embedding_dim, hps.attention_rnn_dim)

        self.attention_layer = Attention(hps.attention_rnn_dim,
                                         hps.encoder_embedding_dim,
                                         hps.attention_dim,
                                         hps.attention_location_n_filters,
                                         hps.attention_location_kernel_size)

        # LSTMCell's third parameter is the ``bias`` flag, not a layer
        # count, so it is passed by keyword to avoid misreading.
        self.decoder_rnn = nn.LSTMCell(
            hps.attention_rnn_dim + hps.encoder_embedding_dim,
            hps.decoder_rnn_dim,
            bias=True)

        self.linear_projection = LinearNorm(
            hps.decoder_rnn_dim + hps.encoder_embedding_dim,
            hps.num_mels * hps.n_frames_per_step)

        self.gate_layer = LinearNorm(hps.decoder_rnn_dim +
                                     hps.encoder_embedding_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
コード例 #10
0
    def __init__(self, in_dim):
        """Create two bias-free ``LinearNorm`` layers and a bidirectional LSTM.

        Args:
            in_dim: Input size of the first linear layer.
        """
        super(Image_Encoder, self).__init__()

        embed_dim = hps.encoder_embedding_dim
        wide_dim = embed_dim * 2

        # in_dim -> 2 * embed_dim -> embed_dim
        layer_dims = [(in_dim, wide_dim), (wide_dim, embed_dim)]
        self.layers = nn.ModuleList(
            [LinearNorm(n_in, n_out, bias=False) for n_in, n_out in layer_dims])

        # Each direction gets half of embed_dim as its hidden size.
        self.lstm = nn.LSTM(
            embed_dim,
            int(embed_dim / 2),
            num_layers=1,
            batch_first=True,
            bidirectional=True)
コード例 #11
0
ファイル: decoder.py プロジェクト: hagelborn/tacotron3
    def __init__(self, active_encoder):
        """Set up the decoder's hyper-parameter attributes and sub-modules.

        Args:
            active_encoder: When truthy, the encoder width is read from
                ``hparams.encoder_embedding_dim``; otherwise
                ``hparams.embedding_dim`` is used.
        """
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.max_len = hparams.max_len

        self.encoder_embedding_dim = (
            hparams.encoder_embedding_dim if active_encoder
            else hparams.embedding_dim)

        # Short aliases for sizes that several layers below share.
        enc_dim = self.encoder_embedding_dim
        mel_frame_dim = hparams.n_mel_channels * hparams.n_frames_per_step

        self.prenet = Prenet(
            mel_frame_dim, [hparams.prenet_dim, hparams.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + enc_dim,
            hparams.attention_rnn_dim,
            bias=True)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim,
            enc_dim,
            hparams.attention_dim,
            hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + enc_dim,
            hparams.decoder_rnn_dim,
            bias=True)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + enc_dim, mel_frame_dim)