Esempio n. 1
0
    def embedding(self, x, is_training=False):
        """
        Build the embedding network for a batch of frame sequences.

        Pipeline: dense -> conv1d bank -> max pooling -> two conv1d projections
        with a residual connection -> highway layers -> GRU -> output at the
        last time step -> linear projection to ``self.num_classes`` units.

        :param x: shape=(n, t, n_mels)
        :param is_training: passed through to ``conv1d_banks`` and ``normalize``.
        :return: embedding. shape=(n, e), where e is ``self.num_classes``.
        """
        # frame-level embedding
        x = tf.layers.dense(x, units=self.hidden_units, activation=tf.nn.relu)  # (n, t, h)

        out = conv1d_banks(x, K=self.num_banks, num_units=self.hidden_units, norm_type=self.norm_type,
                           is_training=is_training)  # (n, t, k * h)

        # pool size 2 with stride 1 and "same" padding keeps the time length
        out = tf.layers.max_pooling1d(out, 2, 1, padding="same")  # (n, t, k * h)

        out = conv1d(out, self.hidden_units, 3, scope="conv1d_1")  # (n, t, h)
        out = normalize(out, type=self.norm_type, is_training=is_training, activation_fn=tf.nn.relu)
        out = conv1d(out, self.hidden_units, 3, scope="conv1d_2")  # (n, t, h)
        out += x  # (n, t, h) # residual connections

        for i in range(self.num_highway):
            out = highwaynet(out, num_units=self.hidden_units, scope='highwaynet_{}'.format(i))  # (n, t, h)

        # NOTE(review): the third argument is presumably the "bidirectional"
        # flag of the project's gru helper -- confirm against its definition.
        out = gru(out, self.hidden_units, False)  # (n, t, h)

        # Take the output at the last time step. The time axis (axis 1) must be
        # indexed explicitly: `out[..., -1]` would select the last hidden unit
        # of every frame and produce (n, t) instead of (n, h).
        out = out[:, -1, :]  # (n, h)

        # embedding
        out = tf.layers.dense(out, self.num_classes, name='projection')  # (n, e)
        out = tf.identity(out, name="embedding")

        return out
Esempio n. 2
0
def decode2(inputs, is_training=True, scope="decoder2", reuse=None):
    """Build the second decoder stage (pre-net followed by a CBHG stack).

    The graph is: pre-net -> conv1d bank -> max pooling -> two normalized
    conv1d projections with a residual connection back to the pre-net output
    -> four highway layers -> GRU -> dense output layer.

    Args:
      inputs: Tensor fed to `mod.prenet`. According to the original
        documentation this is a float32 tensor of shape [N, T', C'] with
        C' = hp.n_mels * hp.r (not verifiable from this function alone).
      is_training: Forwarded to the pre-net, the conv bank and the
        normalization layers.
      scope: Name passed to `tf.variable_scope`.
      reuse: `reuse` flag passed to `tf.variable_scope`.

    Returns:
      The output of the final dense layer, whose last dimension is
      (1 + hp.n_fft // 2) * hp.r.
    """
    half_embed = hp.embed_size // 2
    # Arguments shared by both normalization layers.
    norm_kwargs = dict(type=hp.norm_type, is_training=is_training)

    with tf.variable_scope(scope, reuse=reuse):
        # Pre-net; its output is also the residual branch added further down.
        residual = mod.prenet(inputs, is_training=is_training)

        # CBHG: convolution bank, then stride-1 max pooling.
        net = mod.conv1d_banks(
            residual, K=hp.decoder_num_banks, is_training=is_training)
        net = tf.layers.max_pooling1d(net, 2, 1, padding="same")

        # CBHG: two conv1d projections, each followed by normalization.
        net = mod.conv1d(net, hp.embed_size, 3, scope="conv1d_1")
        net = mod.normalize(
            net, activation_fn=tf.nn.relu, scope="norm1", **norm_kwargs)
        net = mod.conv1d(net, half_embed, 3, scope="conv1d_2")
        net = mod.normalize(
            net, activation_fn=None, scope="norm2", **norm_kwargs)
        net = net + residual

        # CBHG: highway layers.
        for block in range(4):
            net = mod.highwaynet(
                net,
                num_units=half_embed,
                scope='highwaynet_{}'.format(block))

        # CBHG: recurrent layer (third argument True; the original comment
        # describes it as a bidirectional GRU).
        net = mod.gru(net, half_embed, True)

        # Final linear layer producing (1 + n_fft // 2) * r values per step.
        n_out = (1 + hp.n_fft // 2) * hp.r
        predicted = tf.layers.dense(net, n_out)

    return predicted
Esempio n. 3
0
def encode(inputs, is_training = True, scope = 'encoder', reuse = None):
    """Build the encoder graph: pre-net followed by a CBHG-style stack.

    Args:
      inputs: Tensor passed to `prenet`.
      is_training: Forwarded to `prenet` and `conv1d_banks`.
      scope: Name passed to `tf.variable_scope`.
      reuse: `reuse` flag passed to `tf.variable_scope`.

    Returns:
      The output of the final `gru` call.
    """
    half_width = embed_size // 2

    with tf.variable_scope(scope, reuse = reuse):
        # Pre-net output doubles as the residual branch.
        residual = prenet(inputs, scope = 'prenet', is_training = is_training)

        # Convolution bank followed by stride-1 max pooling.
        hidden = conv1d_banks(
            residual, K = encoder_num_banks, is_training = is_training
        )
        hidden = tf.layers.max_pooling1d(hidden, 2, 1, padding = 'same')

        # Two conv1d projections, each normalized with a ReLU activation.
        hidden = conv1d(hidden, half_width, 3, scope = 'conv1d_1')
        hidden = normalize_in(hidden, activation_fn = tf.nn.relu)
        hidden = conv1d(hidden, half_width, 3, scope = 'conv1d_2')
        hidden = normalize_in(hidden, activation_fn = tf.nn.relu)
        hidden = hidden + residual

        # Highway layers.
        for block in range(num_highway_blocks):
            hidden = highwaynet(
                hidden, units = half_width, scope = 'highwaynet_%d' % (block)
            )

        # Recurrent layer producing the encoder memory.
        encoded = gru(hidden, half_width, True)
    return encoded
Esempio n. 4
0
    def init_inference(self, config, is_training=False):
        """Build the inference graph and store its endpoints on ``self``.

        The network is: input standardization -> 1-wide conv1d -> conv1d bank
        -> max pooling -> two conv1d projections with a residual connection ->
        highway layers -> GRU -> linear/softmax output layer.

        :param config: mapping read for the keys ``num_banks``,
            ``hidden_units``, ``num_highway``, ``norm_type``, ``batch_size``,
            ``num_rnn_layer``, ``input_dim`` and ``alphabet_size``.
        :param is_training: passed through to ``conv1d_banks``, ``normalize``
            and ``gru``.

        Sets ``_input_dim``, ``_output_dim``, ``_inputs``, ``_seq_lens``,
        ``_out_lens``, ``_mean``, ``_std``, ``_initial_state``, ``_rnn_state``,
        ``_logits``, ``_probas`` and ``_init_inference``.
        """
        num_banks = config['num_banks']
        hidden_units = config['hidden_units']
        num_highway = config['num_highway']
        norm_type = config['norm_type']
        batch_size = config['batch_size']
        num_rnn_layer = config['num_rnn_layer']
        self._input_dim = input_dim = config['input_dim']
        self._output_dim = output_dim = config['alphabet_size']

        # Batch-major input features with a variable time dimension.
        self._inputs = tf.placeholder(tf.float32,
                                      [batch_size, None, input_dim])
        self._seq_lens = tf.placeholder(tf.int32, shape=batch_size)
        # No layer changes the time length, so output lengths equal input
        # lengths.
        self._out_lens = self._seq_lens

        # TODO, awni, for now on the client to remember to initialize these.
        self._mean = tf.get_variable("mean", shape=input_dim, trainable=False)
        self._std = tf.get_variable("std", shape=input_dim, trainable=False)

        std_inputs = (self._inputs - self._mean) / self._std

        # Feed the standardized features to the network. Previously the raw
        # placeholder was used here, which left `std_inputs` (and therefore
        # the mean/std variables) without any effect on the model.
        x = conv1d(std_inputs, hidden_units, 1, scope="conv1d")

        out = conv1d_banks(x,
                           K=num_banks,
                           num_units=hidden_units,
                           norm_type=norm_type,
                           is_training=is_training)  # (n, t, k * h)

        out = tf.layers.max_pooling1d(out, 2, 1,
                                      padding="same")  # (n, t, k * h)

        out = conv1d(out, hidden_units, 3, scope="conv1d_1")  # (n, t, h)
        out = normalize(out,
                        type=norm_type,
                        is_training=is_training,
                        activation_fn=tf.nn.relu)
        out = conv1d(out, hidden_units, 3, scope="conv1d_2")  # (n, t, h)

        out += x  # (n, t, h) # residual connections

        for i in range(num_highway):
            out = highwaynet(out,
                             num_units=hidden_units,
                             scope='highwaynet_{}'.format(i))  # (n, t, h)

        rnn_out, state, initial_state = gru(
            out,
            hidden_units,
            False,
            seqlens=self._seq_lens,
            num_layers=num_rnn_layer,
            is_training=is_training)  # (n, t, h)

        self._initial_state = initial_state
        self._rnn_state = state
        # Swap the batch and time axes: (n, t, h) -> (t, n, h).
        rnn_out = tf.transpose(rnn_out, [1, 0, 2])

        # Collapse time and batch dims pre softmax.
        rnn_out = tf.reshape(rnn_out, (-1, hidden_units))
        logits, probas = _add_softmax_linear(
            rnn_out,
            hidden_units,
            output_dim,
            initializer=tf.contrib.layers.xavier_initializer())
        # Reshape to time-major.
        self._logits = tf.reshape(logits, (-1, batch_size, output_dim))
        self._probas = tf.reshape(probas, (-1, batch_size, output_dim))

        self._init_inference = True
Esempio n. 5
0
def encode(inputs, is_training=True, scope="encoder", reuse=None):
    """Build the encoder graph: spectrogram front-end, pre-net and CBHG stack.

    Args:
      inputs: Tensor passed to `mod.pre_spectro`. The earlier documentation
        described a [N, T] int32 tensor of character ids, but the character
        embedding step is no longer applied here -- TODO confirm the
        expected shape and dtype against `mod.pre_spectro`.
      is_training: Forwarded to the front-end, the pre-net, the conv bank
        and the normalization layers.
      scope: Name passed to `tf.variable_scope`.
      reuse: `reuse` flag passed to `tf.variable_scope`.

    Returns:
      The output of the final `mod.gru` call (the encoder memory).
    """
    half_embed = hp.embed_size // 2
    # Arguments shared by both normalization layers.
    norm_kwargs = dict(type=hp.norm_type, is_training=is_training)

    with tf.variable_scope(scope, reuse=reuse):
        # Front-end applied in place of a character embedding, followed by
        # the encoder pre-net whose output is reused as the residual branch.
        spectro = mod.pre_spectro(inputs, is_training=is_training)
        residual = mod.prenet(spectro, is_training=is_training)

        # CBHG: convolution bank, then stride-1 max pooling.
        net = mod.conv1d_banks(
            residual, K=hp.encoder_num_banks, is_training=is_training)
        net = tf.layers.max_pooling1d(net, 2, 1, padding="same")

        # CBHG: two conv1d projections, each followed by normalization.
        net = mod.conv1d(net, half_embed, 3, scope="conv1d_1")
        net = mod.normalize(
            net, activation_fn=tf.nn.relu, scope="norm1", **norm_kwargs)
        net = mod.conv1d(net, half_embed, 3, scope="conv1d_2")
        net = mod.normalize(
            net, activation_fn=None, scope="norm2", **norm_kwargs)
        net = net + residual  # residual connection

        # CBHG: highway layers.
        for block in range(hp.num_highwaynet_blocks):
            net = mod.highwaynet(
                net,
                num_units=half_embed,
                scope='highwaynet_{}'.format(block))

        # CBHG: recurrent layer.
        # NOTE(review): the original comment called this a bidirectional GRU
        # with an (N, T, E) output, yet the third argument is False --
        # confirm which is intended against `mod.gru`.
        encoded = mod.gru(net, half_embed, False)

    return encoded