Example #1
    def __init__(self, hyperparams, is_training, inputs, input_lengths):
        # inputs: (batch, max_input_length)
        # input_lengths: (batch)

        # Embeddings
        char_embed_table = tf.get_variable(
            'embedding', [hyperparams.num_symbols, hyperparams.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = modules.prenet(
            char_embedded_inputs,
            is_training,
            layer_sizes=hyperparams.enc_prenet_sizes,
            drop_prob=hyperparams.dropout_prob,
            scope='prenet')

        encoder_outputs = modules.cbhg(prenet_outputs,
                                       input_lengths,
                                       is_training,
                                       hyperparams.enc_bank_size,
                                       hyperparams.enc_bank_channel_size,
                                       hyperparams.enc_maxpool_width,
                                       hyperparams.enc_highway_depth,
                                       hyperparams.enc_rnn_size,
                                       hyperparams.enc_proj_sizes,
                                       hyperparams.enc_proj_width,
                                       scope="encoder_cbhg")

        self.encoder_outputs = encoder_outputs
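
All of the examples on this page assume a Tacotron-style `prenet`: a stack of fully connected layers with ReLU and dropout, where dropout stays active only during training. A minimal TF1-style sketch of that assumed behavior (the layer sizes, names, and signature here are illustrative, not taken from any one repo above):

import tensorflow as tf

def prenet_sketch(inputs, is_training, layer_sizes=(256, 128), drop_prob=0.5,
                  scope='prenet'):
    # Dense -> ReLU -> dropout per layer; dropout only applies in training.
    x = inputs
    with tf.variable_scope(scope):
        for i, size in enumerate(layer_sizes):
            x = tf.layers.dense(x, units=size, activation=tf.nn.relu,
                                name='dense_%d' % i)
            x = tf.layers.dropout(x, rate=drop_prob, training=is_training,
                                  name='dropout_%d' % i)
    return x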
Example #2
def decode2(inputs, is_training=True, scope="decoder2", reuse=None):
    '''
    Args:
      inputs: A 3d tensor with shape of [N, T', C'], where C'=hp.n_mels*hp.r, 
        dtype of float32. Log magnitude spectrogram of sound files.
      is_training: Whether or not the layer is in training mode.  
      scope: Optional scope for `variable_scope`
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
        
    Returns:
      Predicted magnitude spectrogram tensor with shape of [N, T', C''], 
        where C'' = (1+hp.n_fft//2)*hp.r.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Decoder pre-net
        prenet_out = mod.prenet(inputs,
                                is_training=is_training)  # (N, T', E/2)

        # Decoder Post-processing net = CBHG
        ## Conv1D bank
        dec = mod.conv1d_banks(prenet_out,
                               K=hp.decoder_num_banks,
                               is_training=is_training)  # (N, T', E*K/2)

        ## Max pooling
        dec = tf.layers.max_pooling1d(dec, 2, 1,
                                      padding="same")  # (N, T', E*K/2)

        ## Conv1D projections
        dec = mod.conv1d(dec, hp.embed_size, 3, scope="conv1d_1")  # (N, T', E)
        dec = mod.normalize(dec,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=tf.nn.relu,
                            scope="norm1")
        dec = mod.conv1d(dec, hp.embed_size // 2, 3,
                         scope="conv1d_2")  # (N, T', E/2)
        dec = mod.normalize(dec,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=None,
                            scope="norm2")
        dec += prenet_out

        ## Highway Nets
        for i in range(4):
            dec = mod.highwaynet(
                dec,
                num_units=hp.embed_size // 2,
                scope='highwaynet_{}'.format(i))  # (N, T, E/2)

        ## Bidirectional GRU
        dec = mod.gru(dec, hp.embed_size // 2, True)  # (N, T', E)

        # Outputs => (N, T', (1+hp.n_fft//2)*hp.r)
        out_dim = (1 + hp.n_fft // 2) * hp.r
        outputs = tf.layers.dense(dec, out_dim)

    return outputs
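
As a quick sanity check on the output width out_dim above, with typical Tacotron hyperparameter values (n_fft=2048 and r=5 are assumptions for illustration, not necessarily this repo's settings):

n_fft, r = 2048, 5               # assumed values for illustration
out_dim = (1 + n_fft // 2) * r   # 1025 frequency bins per frame, r frames
print(out_dim)                   # 5125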
Example #3
    def fnet(self, mel, is_training=True, reuse=None):
        prenet_out = prenet(mel,
                            num_units=[hp.hidden_units, hp.hidden_units // 2],
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            reuse=reuse)  # (N, T, E/2)
        # CBHG1: mel-scale
        out, _ = cbhg(prenet_out, hp.num_banks, hp.hidden_units // 2,
                        hp.num_highway_blocks, hp.norm_type, is_training,
                        scope="fnet_cbhg1",
                        reuse=reuse)
        
        mid = out

        # CBHG2 consumes the pre-net output again (a parallel branch),
        # not the CBHG1 output.
        out, _ = cbhg(prenet_out, hp.num_banks, hp.hidden_units // 2,
                        hp.num_highway_blocks, hp.norm_type, is_training,
                        scope="fnet_cbhg2",
                        reuse=reuse)

        # Final linear projection
        logits = tf.layers.dense(out, hp.len_chinese_ppgs, trainable=is_training, reuse=reuse)  # (N, T, V)
        ppgs = tf.nn.softmax(logits / hp.t, name='ppgs')  # (N, T, V)
        preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

        # CTC beam-search decoding expects time-major logits; sequence lengths
        # are inferred from frames whose mel sum is non-zero.
        logits_tm = tf.transpose(logits, perm=[1, 0, 2])  # (T, N, V)
        sequence_len = tf.reduce_sum(
            tf.cast(tf.not_equal(tf.reduce_sum(mel, axis=2), 0.), tf.int32),
            axis=1)
        decoded, _ = tf.nn.ctc_beam_search_decoder(logits_tm, sequence_len,
                                                   merge_repeated=False)
        decoded = tf.sparse_to_dense(decoded[0].indices,
                                     decoded[0].dense_shape,
                                     decoded[0].values)

        return mid, logits, ppgs, preds, decoded
Example #4
    def _net1(self):
        with tf.variable_scope('net1'):
            # Load vocabulary
            phn2idx, idx2phn = load_vocab()

            # Pre-net
            prenet_out = prenet(self.x_mfcc,
                                num_units=[
                                    hp.Train1.hidden_units,
                                    hp.Train1.hidden_units // 2
                                ],
                                dropout_rate=hp.Train1.dropout_rate,
                                is_training=self.is_training)  # (N, T, E/2)

            # CBHG
            out = cbhg(prenet_out, hp.Train1.num_banks,
                       hp.Train1.hidden_units // 2,
                       hp.Train1.num_highway_blocks, hp.Train1.norm_type,
                       self.is_training)

            # Final linear projection
            logits = tf.layers.dense(out, len(phn2idx))  # (N, T, V)
            ppgs = tf.nn.softmax(logits / hp.Train1.t)  # (N, T, V)
            preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

        return ppgs, preds, logits
Example #5
    def gnet(self, feature, is_training=True, reuse=None):

        prenet_out = tf.layers.dense(feature, hp.hidden_units, reuse=reuse)

        prenet_out = prenet(prenet_out,
                            num_units=[hp.hidden_units, hp.hidden_units],
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            reuse=reuse)  # (N, T, E/2)
        
        # CBHG1: mel-scale
        pred_mel, _ = cbhg(prenet_out, hp.num_banks, hp.hidden_units,
                        hp.num_highway_blocks, hp.norm_type, is_training,
                        scope="cbhg_gnet_mel",
                        reuse=reuse)

        g_mel = tf.layers.dense(pred_mel, self.x_mel.shape[-1], name='g_mel', reuse=reuse)  # (N, T, n_mel)

        pred_spec = tf.layers.dense(g_mel, hp.hidden_units, reuse=reuse)  # (N, T, E)

        pred_spec, _ = cbhg(pred_spec, hp.num_banks, hp.hidden_units,
                   hp.num_highway_blocks, hp.norm_type, is_training, 
                   scope="cbhg_gnet_spec",
                   reuse=reuse)

        g_spec = tf.layers.dense(pred_spec, self.x_spec.shape[-1], name='g_spec', reuse=reuse)

        return g_spec, g_mel
Example #6
    def network(self, ppgs, is_training):
        # Pre-net
        prenet_out = prenet(ppgs,
                            num_units=[hp.Train2.hidden_units,
                                       hp.Train2.hidden_units // 2],
                            dropout_rate=hp.Train2.dropout_rate,
                            is_training=is_training)  # (N, T, E/2)

        # CBHG1: mel-scale
        pred_mel = cbhg(prenet_out, hp.Train2.num_banks, hp.Train2.hidden_units // 2,
                        hp.Train2.num_highway_blocks, hp.Train2.norm_type, is_training,
                        scope="cbhg_mel")
        pred_mel = tf.layers.dense(
            pred_mel, self.y_mel.shape[-1], name='pred_mel')  # (N, T, n_mels)

        # CBHG2: linear-scale
        pred_spec = tf.layers.dense(
            pred_mel, hp.Train2.hidden_units // 2)  # (N, T, E/2)
        pred_spec = cbhg(pred_spec, hp.Train2.num_banks, hp.Train2.hidden_units // 2,
                         hp.Train2.num_highway_blocks, hp.Train2.norm_type, is_training, scope="cbhg_linear")
        # log magnitude: (N, T, 1+n_fft//2)
        pred_spec = tf.layers.dense(
            pred_spec, self.y_spec.shape[-1], name='pred_spec')

        return pred_spec, pred_mel
Example #7
def test_prenet():
	fc1_hidden_size = 256
	fc2_hidden_size = 128

	# simulate pre-net in decoder
	batch_size = 32
	input_size = 80
	input = Variable(torch.randn(batch_size, 1, input_size))

	prenet = PreNet(input_size, 
					fc1_hidden_size=fc1_hidden_size, 
					fc2_hidden_size=fc2_hidden_size)
	output = prenet(input)

	assert output.size() == (batch_size, 1, fc2_hidden_size)

	# simulate pre-net in encoder
	batch_size = 32
	embedding_size = 256
	time_steps = 17
	input2 = Variable(torch.randn(batch_size, time_steps, embedding_size))

	prenet2 = PreNet(embedding_size,
					 fc1_hidden_size=fc1_hidden_size,
					 fc2_hidden_size=fc2_hidden_size)

	output2 = prenet2(input2)

	assert output2.size() == (batch_size, time_steps, fc2_hidden_size)
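
The test assumes a PreNet module with roughly this interface. The following is a sketch of that assumed contract (two Linear+ReLU+Dropout stages applied frame-wise), not the repo's actual implementation:

import torch
import torch.nn as nn

class PreNetSketch(nn.Module):
    def __init__(self, input_size, fc1_hidden_size=256, fc2_hidden_size=128,
                 dropout=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_size, fc1_hidden_size)
        self.fc2 = nn.Linear(fc1_hidden_size, fc2_hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch, time, input_size) -> (batch, time, fc2_hidden_size)
        x = self.dropout(torch.relu(self.fc1(x)))
        return self.dropout(torch.relu(self.fc2(x)))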
Example #8
 def call(self, inputs, state):
     prenet_out = prenet(inputs,
                         self._is_training,
                         self._layer_sizes,
                         scope='decoder_prenet')
     # self._cell(prenet_out, state) invokes the __call__() method of the
     # wrapped RNNCell, since _cell is an RNNCell instance
     return self._cell(prenet_out, state)
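
For context, the `call` above belongs to a pre-net cell wrapper. A minimal sketch of such a wrapper under TF1's contrib API (the class name and constructor arguments are assumptions inferred from the snippet; `prenet` is assumed to be the module-level function used throughout these examples):

import tensorflow as tf
from tensorflow.contrib.rnn import RNNCell
from modules import prenet  # assumed pre-net function, as in Example #18

class DecoderPrenetWrapperSketch(RNNCell):
    # Runs the decoder pre-net on each input frame before delegating to the
    # wrapped RNN cell; state handling is passed through untouched.
    def __init__(self, cell, is_training, layer_sizes=(256, 128)):
        super(DecoderPrenetWrapperSketch, self).__init__()
        self._cell = cell
        self._is_training = is_training
        self._layer_sizes = layer_sizes

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def call(self, inputs, state):
        prenet_out = prenet(inputs, self._is_training, self._layer_sizes,
                            scope='decoder_prenet')
        return self._cell(prenet_out, state)

    def zero_state(self, batch_size, dtype):
        return self._cell.zero_state(batch_size, dtype)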
Example #9
def decode(
    inputs, memory, is_training = True, scope = 'decoder_layers', reuse = None
):
    with tf.variable_scope(scope, reuse = reuse):
        dec = prenet(inputs, is_training = is_training)
        dec = attention_decoder(dec, memory, embed_size)
        dec += gru(dec, embed_size, False, scope = 'gru1')
        dec += gru(dec, embed_size, False, scope = 'gru2')
        return tf.layers.dense(dec, len(char2idx))
Example #10
    def network(self, ppgs, is_training):
        # Pre-net
        prenet_out = prenet(
            ppgs,
            num_units=[hp.train2.hidden_units, hp.train2.hidden_units // 2],
            dropout_rate=hp.train2.dropout_rate,
            is_training=is_training)  # (N, T, E/2)

        # CBHG1: mel-scale
        # pred_mel = cbhg(prenet_out, hp.train2.num_banks, hp.train2.hidden_units // 2,
        #                 hp.train2.num_highway_blocks, hp.train2.norm_type, is_training,
        #                 scope="cbhg_mel")
        # pred_mel = tf.layers.dense(pred_mel, self.y_mel.shape[-1])  # (N, T, n_mels)
        pred_mel = prenet_out

        # CBHG2: linear-scale
        out = tf.layers.dense(pred_mel,
                              hp.train2.hidden_units // 2)  # (N, T, n_mels)
        out = cbhg(out,
                   hp.train2.num_banks,
                   hp.train2.hidden_units // 2,
                   hp.train2.num_highway_blocks,
                   hp.train2.norm_type,
                   is_training,
                   scope="cbhg_linear")

        _, n_timesteps, n_bins = self.y_spec.get_shape().as_list()
        n_units = n_bins * hp.train2.n_mixtures
        out = tf.layers.dense(out,
                              n_units * 3,
                              bias_initializer=tf.random_uniform_initializer(
                                  minval=-3., maxval=3.))

        mu = tf.nn.sigmoid(out[..., :n_units])
        mu = tf.reshape(
            mu,
            shape=(-1, n_timesteps, n_bins,
                   hp.train2.n_mixtures))  # (N, T, 1+hp.n_fft//2, n_mixtures)

        log_var = tf.maximum(out[..., n_units:2 * n_units], -7.0)
        log_var = tf.reshape(
            log_var,
            shape=(-1, n_timesteps, n_bins,
                   hp.train2.n_mixtures))  # (N, T, 1+hp.n_fft//2, n_mixtures)

        log_pi = tf.reshape(
            out[..., 2 * n_units:3 * n_units],
            shape=(-1, n_timesteps, n_bins,
                   hp.train2.n_mixtures))  # (N, T, 1+hp.n_fft//2, n_mixtures)
        log_pi = normalize(log_pi,
                           type='ins',
                           is_training=get_current_tower_context().is_training,
                           scope='normalize_pi')
        log_pi = tf.nn.log_softmax(log_pi)

        return mu, log_var, log_pi
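
The three heads above parameterize a per-bin mixture of Gaussians. A hedged sketch of the matching negative log-likelihood (the function name and mean reduction are assumptions, not this repo's actual loss code):

import math
import tensorflow as tf

def mog_nll_sketch(y, mu, log_var, log_pi):
    # y: (N, T, n_bins); mu, log_var, log_pi: (N, T, n_bins, n_mixtures).
    y = tf.expand_dims(y, axis=-1)  # broadcast y against the mixture axis
    # Per-component Gaussian log-density.
    log_prob = -0.5 * (math.log(2.0 * math.pi) + log_var
                       + tf.square(y - mu) / tf.exp(log_var))
    # Mixture log-likelihood: logsumexp over components of log_pi + log_prob.
    log_mix = tf.reduce_logsumexp(log_pi + log_prob, axis=-1)
    return -tf.reduce_mean(log_mix)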
Example #11
    def network(self, x_mfcc, is_training):
        # Pre-net
        prenet_out = prenet(x_mfcc,
                            num_units=[hp.train1.hidden_units, hp.train1.hidden_units // 2],
                            dropout_rate=hp.train1.dropout_rate,
                            is_training=is_training)  # (N, T, E/2)

        # CBHG
        out = cbhg(prenet_out, hp.train1.num_banks, hp.train1.hidden_units // 2,
                   hp.train1.num_highway_blocks, hp.train1.norm_type, is_training)

        # Final linear projection
        logits = tf.layers.dense(out, len(phns))  # (N, T, V)
        ppgs = tf.nn.softmax(logits / hp.train1.t, name='ppgs')  # (N, T, V)
        preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

        return ppgs, preds, logits
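
The division of logits by hp.train1.t is a softmax temperature: t > 1 flattens the PPG distribution, t < 1 sharpens it. A quick numeric illustration (the logits below are made up):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])
print(softmax(logits))        # t=1: peaked on the first class, ~[0.66, 0.24, 0.10]
print(softmax(logits / 2.0))  # t=2: noticeably flatter distribution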
Example #12
def encode(inputs, is_training = True, scope = 'encoder', reuse = None):
    with tf.variable_scope(scope, reuse = reuse):
        prenet_out = prenet(inputs, scope = 'prenet', is_training = is_training)
        enc = conv1d_banks(
            prenet_out, K = encoder_num_banks, is_training = is_training
        )
        enc = tf.layers.max_pooling1d(enc, 2, 1, padding = 'same')
        enc = conv1d(enc, embed_size // 2, 3, scope = 'conv1d_1')
        enc = normalize_in(enc, activation_fn = tf.nn.relu)
        enc = conv1d(enc, embed_size // 2, 3, scope = 'conv1d_2')
        enc = normalize_in(enc, activation_fn = tf.nn.relu)
        enc += prenet_out
        for i in range(num_highway_blocks):
            enc = highwaynet(
                enc, units = embed_size // 2, scope = 'highwaynet_%d' % (i)
            )
        memory = gru(enc, embed_size // 2, True)
    return memory
Example #13
    def pre_decoder(self,
                    inputs,
                    memory,
                    is_training=False,
                    scope="pre-decoder",
                    reuse=None):
        """ Pre Decoder
        :param inputs: A 3D Tensor with shape of [N, T_y / r, n_mels(*r)], with dtype of float32.
        :param memory: A 3D Tensor with shape of [N, T_x, E].
        :param is_training: A boolean.
        :param scope: A str, Optional scope for 'variable_scope'.
        :param reuse: A boolean. Whether to reuse the weights of a previous layer
            by the same name.
        :return:
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Decoder PreNet
            prenet_dec = prenet(inputs, is_training=is_training)

            # Decoder Attention
            dec, state = attention_decoder(prenet_dec,
                                           memory,
                                           num_units=self.embed_size)

            alignments = tf.transpose(state.alignment_history.stack(),
                                      [1, 2, 0])

            # Decoder stacked GRU
            dec += biGRU(dec,
                         num_units=self.embed_size,
                         bidirection=False,
                         scope="GRU-1")
            dec += biGRU(dec,
                         num_units=self.embed_size,
                         bidirection=False,
                         scope="GRU-2")

            mel_hats = tf.layers.dense(dec,
                                       units=self.n_mels *
                                       self.reduction_factor,
                                       kernel_initializer=_init,
                                       kernel_regularizer=_reg)
        return mel_hats, alignments
Example #14
    def _net2(self):
        # PPGs from net1
        ppgs, preds_ppg, logits_ppg = self._net1()

        with tf.variable_scope('net2'):
            # Pre-net
            prenet_out = prenet(ppgs,
                                num_units=[
                                    self.hparams.Train2.hidden_units,
                                    self.hparams.Train2.hidden_units // 2
                                ],
                                dropout_rate=self.hparams.Train2.dropout_rate,
                                is_training=self.is_training)  # (N, T, E/2)

            # CBHG1: mel-scale
            pred_mel = cbhg(prenet_out,
                            self.hparams.Train2.num_banks,
                            self.hparams.Train2.hidden_units // 2,
                            self.hparams.Train2.num_highway_blocks,
                            self.hparams.Train2.norm_type,
                            self.is_training,
                            scope="cbhg1")
            pred_mel = tf.layers.dense(
                pred_mel,
                self.y_mel.shape[-1])  # log magnitude: (N, T, n_mels)

            # CBHG2: linear-scale
            pred_spec = tf.layers.dense(pred_mel,
                                        self.hparams.Train2.hidden_units //
                                        2)  # log magnitude: (N, T, n_mels)
            pred_spec = cbhg(pred_spec,
                             self.hparams.Train2.num_banks,
                             self.hparams.Train2.hidden_units // 2,
                             self.hparams.Train2.num_highway_blocks,
                             self.hparams.Train2.norm_type,
                             self.is_training,
                             scope="cbhg2")
            pred_spec = tf.layers.dense(
                pred_spec, self.y_spec.shape[-1]
            )  # log magnitude: (N, T, 1+self.hparams.n_fft//2)

        return ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel
Example #15
    def _net1(self):
        with tf.variable_scope('net1'):
            # Load vocabulary
            phn2idx, idx2phn = load_vocab()

            # Pre-net
            prenet_out = prenet(self.x_mfcc,
                                num_units=[hp.Train1.hidden_units, hp.Train1.hidden_units // 2],
                                dropout_rate=hp.Train1.dropout_rate,
                                is_training=self.is_training)  # (N, T, E/2)

            # CBHG
            out = cbhg(prenet_out, hp.Train1.num_banks, hp.Train1.hidden_units // 2, hp.Train1.num_highway_blocks, hp.Train1.norm_type, self.is_training)

            # Final linear projection
            logits = tf.layers.dense(out, len(phn2idx))  # (N, T, V)
            ppgs = tf.nn.softmax(logits / hp.Train1.t)  # (N, T, V)
            preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

        return ppgs, preds, logits
Example #16
    def _net2(self):
        # PPGs from net1
        ppgs, preds_ppg, logits_ppg = self._net1()

        with tf.variable_scope('net2'):
            # Pre-net
            prenet_out = prenet(ppgs,
                                num_units=[hp.Train2.hidden_units, hp.Train2.hidden_units // 2],
                                dropout_rate=hp.Train2.dropout_rate,
                                is_training=self.is_training)  # (N, T, E/2)

            # CBHG1: mel-scale
            pred_mel = cbhg(prenet_out, hp.Train2.num_banks, hp.Train2.hidden_units // 2, hp.Train2.num_highway_blocks, hp.Train2.norm_type, self.is_training, scope="cbhg1")
            pred_mel = tf.layers.dense(pred_mel, self.y_mel.shape[-1])  # log magnitude: (N, T, n_mels)

            # CBHG2: linear-scale
            pred_spec = tf.layers.dense(pred_mel, hp.Train2.hidden_units // 2)  # log magnitude: (N, T, n_mels)
            pred_spec = cbhg(pred_spec, hp.Train2.num_banks, hp.Train2.hidden_units // 2, hp.Train2.num_highway_blocks, hp.Train2.norm_type, self.is_training, scope="cbhg2")
            pred_spec = tf.layers.dense(pred_spec, self.y_spec.shape[-1])  # log magnitude: (N, T, 1+hp.n_fft//2)

        return ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel
Example #17
def decode1(decoder_inputs,
            memory,
            is_training=True,
            scope="decoder1",
            reuse=None):
    '''
    Args:
      decoder_inputs: A 3d tensor with shape of [N, T', C'], where C'=hp.n_mels*hp.r, 
        dtype of float32. Shifted melspectrogram of sound files. 
      memory: A 3d tensor with shape of [N, T, C], where C=hp.embed_size.
      is_training: Whether or not the layer is in training mode.
      scope: Optional scope for `variable_scope`
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
        
    Returns:
      Predicted melspectrogram tensor with shape of [N, T', C'].
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Decoder pre-net
        dec = mod.prenet(decoder_inputs,
                         is_training=is_training)  # (N, T', E/2)

        # Attention RNN
        dec = mod.attention_decoder(dec, memory,
                                    num_units=hp.embed_size)  # (N, T', E)

        # Decoder RNNs
        dec += mod.gru(dec, hp.embed_size, False,
                       scope="decoder_gru1")  # (N, T', E)
        dec += mod.gru(dec, hp.embed_size, False,
                       scope="decoder_gru2")  # (N, T', E)

        # Outputs => (N, T', hp.n_mels*hp.r)
        out_dim = decoder_inputs.get_shape().as_list()[-1]
        outputs = tf.layers.dense(
            dec, out_dim)  # (N, None, E) output the same shape as input

    return outputs
Example #18
 def call(self, inputs, state):
     from modules import prenet
     prenet_out = prenet(inputs, self._is_training, scope="decoder_prenet")
     return self._cell(prenet_out, state)
Example #19
  def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embeddings
      embedding_table = tf.get_variable(
        'embedding', [len(symbols), 256], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
      embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]

      # Encoder
      prenet_outputs = prenet(embedded_inputs, is_training)                       # [N, T_in, 128]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

      # Attention
      attention_cell = AttentionWrapper(
        DecoderPrenetWrapper(GRUCell(256), is_training),
        BahdanauAttention(256, encoder_outputs),
        alignment_history=True,
        output_attention=False)                                                  # [N, T_in, 256]

      # Concatenate attention context vector and RNN cell output into a 512D vector.
      concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)              # [N, T_in, 512]

      # Decoder (layers specified bottom to top):
      decoder_cell = MultiRNNCell([
          OutputProjectionWrapper(concat_cell, 256),
          ResidualWrapper(GRUCell(256)),
          ResidualWrapper(GRUCell(256))
        ], state_is_tuple=True)                                                  # [N, T_in, 256]

      # Project onto r mel spectrograms (predict r outputs at each RNN step):
      output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
      decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

      if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
      else:
        helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

      (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
        BasicDecoder(output_cell, helper, decoder_init_state),
        maximum_iterations=hp.max_iters)                                        # [N, T_out/r, M*r]

      # Reshape outputs to be one output per entry
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M]

      # Add post-processing CBHG:
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)           # [N, T_out, 256]
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)               # [N, T_out, F]

      # Grab alignments from the final decoder state:
      alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.linear_outputs = linear_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      log('Initialized Tacotron model. Dimensions: ')
      log('  embedding:               %d' % embedded_inputs.shape[-1])
      log('  prenet out:              %d' % prenet_outputs.shape[-1])
      log('  encoder out:             %d' % encoder_outputs.shape[-1])
      log('  attention out:           %d' % attention_cell.output_size)
      log('  concat attn & out:       %d' % concat_cell.output_size)
      log('  decoder cell out:        %d' % decoder_cell.output_size)
      log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
      log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
      log('  postnet out:             %d' % post_outputs.shape[-1])
      log('  linear out:              %d' % linear_outputs.shape[-1])
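
One detail worth spelling out in the example above: with outputs_per_step = r, the decoder predicts r mel frames per step, and the tf.reshape unpacks them back to one frame per row. A small numeric illustration (num_mels=80 and r=5 are assumed values, not necessarily this repo's defaults):

num_mels, r = 80, 5           # assumed values for illustration
decoder_steps = 120           # decoder iterations, i.e. T_out / r
per_step_dim = num_mels * r   # 400-dim projection emitted per decoder step
frames = decoder_steps * r    # 600 mel frames after the reshape
print(per_step_dim, frames)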
Example #20
    def encoder(self,
                inputs,
                use_highway_network=True,
                is_training=True,
                scope="encoder",
                reuse=None):
        """ Encoder
        :param inputs: A 3D Tensor with shape of [N, Seq, E], with dtype of float32.
        :param use_highway_network: A boolean. Whether using highway network or not
        :param is_training: A boolean.
        :param scope: A str, Optional scope for 'variable_scope'.
        :param reuse: A boolean. Whether to reuse the weights of a previous layer
            by the same name.
        :return:
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Encoder PreNet
            prenet_enc = prenet(inputs, is_training=is_training)

            # Encoder Convolutional Block
            enc = conv1d_banks(prenet_enc,
                               n_kernels=self.n_encoder_banks,
                               is_training=is_training)
            enc = tf.layers.max_pooling1d(enc,
                                          pool_size=2,
                                          strides=1,
                                          padding='SAME')

            # Encoder PostNet
            enc = conv1d(enc,
                         n_filters=self.embed_size // 2,
                         kernel=3,
                         scope="conv1d-proj-1")
            enc = batch_norm(enc,
                             is_training=is_training,
                             activation_fn=tf.nn.relu,
                             scope="bn-proj-1")

            enc = conv1d(enc,
                         n_filters=self.embed_size // 2,
                         kernel=3,
                         scope="conv1d-proj-2")
            enc = batch_norm(enc,
                             is_training=is_training,
                             activation_fn=tf.nn.relu,
                             scope="bn-proj-2")

            enc += prenet_enc  # long skip connection (LSC)

            # highway networks
            if use_highway_network:
                for i in range(self.n_highway_blocks):
                    enc = highway_network(enc,
                                          num_units=self.embed_size // 2,
                                          scope="highway_network-%d" % i)

            memory = biGRU(enc,
                           num_units=self.embed_size // 2,
                           bidirection=True)

        return memory
Example #21
def encode(inputs, is_training=True, scope="encoder", reuse=None):
    '''
    Args:
      inputs: The encoder input. With the character-embedding branch commented
        out below, this is a spectrogram tensor that `pre_spectro` maps to
        (N, T, E), dtype of float32. N: batch size, T: sequence length.
      is_training: Whether or not the layer is in training mode.
      scope: Optional scope for `variable_scope`
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 3d tensor of hidden vectors with shape (N, T, E): N sequences, each
        with T steps, each step encoded to an E-dimensional latent
        representation.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Load vocabulary
        #char2idx, idx2char = load_vocab()

        # Character Embedding  N seqs
        #inputs = mod.embed(inputs, len(char2idx), hp.embed_size) # (N, T, E) shape=(32, ?, 256)
        # Encoder pre-net: dense(E)--dropout--dense(E/2)--dropout
        inputs = mod.pre_spectro(inputs, is_training=is_training)  # (N, T, E)
        prenet_out = mod.prenet(inputs, is_training=is_training)  # (N, T, E/2)

        # Encoder CBHG
        ## Conv1D bank
        enc = mod.conv1d_banks(prenet_out,
                               K=hp.encoder_num_banks,
                               is_training=is_training)  # (N, T, K * E / 2)

        ### Max pooling
        enc = tf.layers.max_pooling1d(enc, 2, 1,
                                      padding="same")  # (N, T, K * E / 2)

        ### Conv1D projections
        enc = mod.conv1d(enc, hp.embed_size // 2, 3,
                         scope="conv1d_1")  # (N, T, E/2)
        enc = mod.normalize(enc,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=tf.nn.relu,
                            scope="norm1")
        enc = mod.conv1d(enc, hp.embed_size // 2, 3,
                         scope="conv1d_2")  # (N, T, E/2)
        enc = mod.normalize(enc,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=None,
                            scope="norm2")
        enc += prenet_out  # (N, T, E/2) # residual connections

        ### Highway Nets
        for i in range(hp.num_highwaynet_blocks):
            enc = mod.highwaynet(
                enc,
                num_units=hp.embed_size // 2,
                scope='highwaynet_{}'.format(i))  # (N, T, E/2)

        ### Bidirectional GRU (the two E/2 directions concatenate to E)

        memory = mod.gru(
            enc, hp.embed_size // 2, True
        )  # (N, T, E)  the encoder's latent representation of the input

    return memory