Example #1
0
    def embedding(self, x, is_training=False):
        """
        Build the embedding graph from frame-level features.

        Layers, in order: dense + ReLU, conv1d banks, max pooling, two conv1d
        projections joined to the dense output by a residual connection,
        `self.num_highway` highway layers, a GRU, selection of the last time
        step, and a dense projection to `self.num_classes` units.

        :param x: shape=(n, t, n_mels)
        :param is_training: forwarded to `conv1d_banks` and `normalize`.
        :return: embedding. shape=(n, e), where e = self.num_classes
        """
        # frame-level embedding
        x = tf.layers.dense(x, units=self.hidden_units, activation=tf.nn.relu)  # (n, t, h)

        out = conv1d_banks(x, K=self.num_banks, num_units=self.hidden_units, norm_type=self.norm_type,
                           is_training=is_training)  # (n, t, k * h)

        out = tf.layers.max_pooling1d(out, 2, 1, padding="same")  # (n, t, k * h)

        out = conv1d(out, self.hidden_units, 3, scope="conv1d_1")  # (n, t, h)
        out = normalize(out, type=self.norm_type, is_training=is_training, activation_fn=tf.nn.relu)
        out = conv1d(out, self.hidden_units, 3, scope="conv1d_2")  # (n, t, h)
        out += x  # (n, t, h) # residual connections

        for i in range(self.num_highway):
            out = highwaynet(out, num_units=self.hidden_units, scope='highwaynet_{}'.format(i))  # (n, t, h)

        out = gru(out, self.hidden_units, False)  # (n, t, h)

        # Take the last output along the time axis. The previous `out[..., -1]`
        # selected the last *feature* of every frame instead, which produced
        # shape (n, t) and made the projection depend on the sequence length.
        out = out[:, -1, :]  # (n, h)

        # embedding
        out = tf.layers.dense(out, self.num_classes, name='projection')  # (n, e)
        out = tf.identity(out, name="embedding")

        return out
Example #2
0
def decode2(inputs, is_training=True, scope="decoder2", reuse=None):
    '''Second decoder stage: a prenet followed by a CBHG-style stack.

    The stack is conv1d banks -> max pooling -> two normalized conv1d
    projections with a residual connection back to the prenet output ->
    four highway layers -> GRU -> dense output layer.

    Args:
      inputs: A 3d tensor with shape of [N, T', C'], where C'=hp.n_mels*hp.r,
        dtype of float32.
      is_training: Whether or not the layer is in training mode.
      scope: Optional scope for `variable_scope`
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      Predicted magnitude spectrogram tensor with shape of [N, T', C''],
        where C'' = (1+hp.n_fft//2)*hp.r.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        half_embed = hp.embed_size // 2
        # Keyword arguments shared by both normalization layers.
        norm_kwargs = dict(type=hp.norm_type, is_training=is_training)

        # Decoder pre-net; its output is also the residual branch below.
        residual = mod.prenet(inputs, is_training=is_training)  # (N, T'', E/2)

        # CBHG: conv1d bank followed by stride-1 max pooling.
        net = mod.conv1d_banks(
            residual, K=hp.decoder_num_banks,
            is_training=is_training)  # (N, T', E*K/2)
        net = tf.layers.max_pooling1d(
            net, pool_size=2, strides=1, padding="same")  # (N, T', E*K/2)

        # Two conv1d projections; only the first is followed by a ReLU.
        net = mod.conv1d(net, hp.embed_size, 3, scope="conv1d_1")  # (N, T', E)
        net = mod.normalize(
            net, activation_fn=tf.nn.relu, scope="norm1", **norm_kwargs)
        net = mod.conv1d(net, half_embed, 3, scope="conv1d_2")  # (N, T', E/2)
        net = mod.normalize(
            net, activation_fn=None, scope="norm2", **norm_kwargs)
        net += residual

        # Highway layers (fixed at four in this decoder).
        for layer_idx in range(4):
            net = mod.highwaynet(
                net,
                num_units=half_embed,
                scope='highwaynet_{}'.format(layer_idx))  # (N, T, E/2)

        # GRU; the trailing True is described as "bidirectional" by the
        # original comments.
        net = mod.gru(net, half_embed, True)  # (N, T', E)

        # Final projection => (N, T', (1+hp.n_fft//2)*hp.r)
        n_outputs = (1 + hp.n_fft // 2) * hp.r
        predicted = tf.layers.dense(net, units=n_outputs)

    return predicted
Example #3
0
def dilated_convolution(input, Local_condition, Global_condition, filter_width,
                        dilation_rate, output_width, index, dilation, reuse):
    '''One gated, conditioned dilated-convolution layer with skip/residual outputs.

    Args:
      input: Layer input. It is sliced on axis 1 and reshaped to
        [hp.batch_size, T_out, hp.Q] for the residual sum, so it is assumed
        to be a rank-3 tensor with hp.Q channels -- TODO confirm with caller.
      Local_condition: Tensor projected by a width-1 conv to hp.Q * 2
        channels and then trimmed at the start of axis 1.
      Global_condition: Tensor projected by a width-1 conv to hp.Q * 2
        channels and added to the other two branches by broadcasting.
      filter_width: Third positional argument of the causal `conv1d`
        (presumably the kernel size).
      dilation_rate: Passed as `rate` to the causal `conv1d`.
      output_width: Number of trailing steps on axis 1 kept for the skip
        branch.
      index: Layer index; used in scope names and in the local-condition trim.
      dilation: Used only to build scope names.
      reuse: Forwarded to every `conv1d` call.

    Returns:
      A tuple `(skip_contribution, residual_output)` where
      `residual_output` is the trimmed input plus the 1x1-transformed
      gated activation.
    '''
    # Dilated causal convolution producing filter and gate halves.
    conv = conv1d(input,
                  hp.Q * 2,
                  filter_width,
                  rate=dilation_rate,
                  padding="causal",
                  scope='conv_{}_{}'.format(index, dilation),
                  reuse=reuse)
    local_cond = conv1d(Local_condition,
                        hp.Q * 2,
                        1,
                        rate=1,
                        padding="SAME",
                        scope='local_cond_{}_{}'.format(index, dilation),
                        reuse=reuse)
    # Drop leading steps so the local condition lines up with `conv`
    # (assumes `conv` is shortened the same way -- TODO confirm).
    local_cond = local_cond[:, hp.size**(index + 1) - 1:, :]
    global_cond = conv1d(Global_condition,
                         hp.Q * 2,
                         1,
                         rate=1,
                         padding="SAME",
                         scope='global_cond_{}_{}'.format(index, dilation),
                         reuse=reuse)
    conv_filter, conv_gate = tf.split(conv, 2, -1)
    local_cond_filter, local_cond_gate = tf.split(local_cond, 2, -1)
    global_cond_filter, global_cond_gate = tf.split(global_cond, 2, -1)
    conv_filter = conv_filter + local_cond_filter + global_cond_filter  #broadcast
    conv_gate = conv_gate + local_cond_gate + global_cond_gate  #broadcast
    # Gated activation unit: the sigmoid gate multiplies the tanh filter
    # element-wise. The previous code added the two terms, which removes the
    # gating. NOTE: checkpoints trained with the additive form will produce
    # different outputs after this change.
    out = tf.tanh(conv_filter) * tf.sigmoid(conv_gate)
    transformed = conv1d(out,
                         filters=hp.Q,
                         padding="SAME",
                         scope='transformed_{}_{}'.format(index, dilation),
                         onebyone=True,
                         reuse=reuse)
    # Skip branch: keep only the last `output_width` steps of the activation.
    skip_cut = out.get_shape().as_list()[1] - output_width
    out_skip = tf.slice(out, [0, skip_cut, 0], [-1, -1, -1], name='out_skip')
    out_skip = tf.reshape(out_skip, [hp.batch_size, output_width, hp.Q])
    skip_contribution = conv1d(out_skip,
                               filters=hp.Q,
                               padding="SAME",
                               scope='skip_contribution_{}_{}'.format(
                                   index, dilation),
                               onebyone=True,
                               reuse=reuse)
    # Residual branch: trim the input to the length of `transformed` so the
    # two can be summed.
    transformed_cut = transformed.get_shape().as_list()[1]
    input_cut = input.get_shape().as_list()[1] - transformed_cut
    input_batch = tf.slice(input, [0, input_cut, 0], [-1, -1, -1])
    input_batch = tf.reshape(input_batch,
                             [hp.batch_size, transformed_cut, hp.Q])

    return skip_contribution, input_batch + transformed
def encode(inputs, is_training=True, scope='encoder', reuse=None):
    """Encode `inputs` with a prenet followed by a CBHG-style stack.

    The stack is conv1d banks -> max pooling -> two conv1d projections (each
    followed by `normalize_in` with ReLU) -> residual connection to the
    prenet output -> `num_highway_blocks` highway layers -> `gru`.

    Args:
      inputs: Tensor handed to `prenet`.
      is_training: Forwarded to `prenet` and `conv1d_banks`.
      scope: Name of the enclosing `variable_scope`.
      reuse: Forwarded to `variable_scope`.

    Returns:
      The output of the final `gru` call.
    """
    with tf.variable_scope(scope, reuse=reuse):
        residual = prenet(inputs, scope='prenet', is_training=is_training)

        net = conv1d_banks(residual, K=encoder_num_banks,
                           is_training=is_training)
        net = tf.layers.max_pooling1d(net, pool_size=2, strides=1,
                                      padding='same')

        # Two projections, each normalized with a ReLU activation.
        net = conv1d(net, embed_size // 2, 3, scope='conv1d_1')
        net = normalize_in(net, activation_fn=tf.nn.relu)
        net = conv1d(net, embed_size // 2, 3, scope='conv1d_2')
        net = normalize_in(net, activation_fn=tf.nn.relu)

        # Residual connection back to the prenet output.
        net += residual

        for block_idx in range(num_highway_blocks):
            net = highwaynet(net, units=embed_size // 2,
                             scope='highwaynet_%d' % (block_idx))

        memory = gru(net, embed_size // 2, True)
    return memory
Example #5
0
def decoder(decoder_inputs, speaker_emb, z_q):
    '''Wavenet decoder: input conv, stacked residual blocks, two-conv postnet.

    Args:
      decoder_inputs: [B, T, 1].
      speaker_emb: [B, len(speaker)]. One-hot. Global condition.
      z_q: [B, T', D]. Local condition.

    Returns:
      Output of the last `conv1d` (hp.Q filters); per the inline comments
      this is [B, T, Q] wave logits.
    '''
    with tf.variable_scope("decoder"):
        # Prenet
        hidden = conv1d(decoder_inputs,
                        hp.num_units,
                        activation_fn=tf.tanh,
                        padding="causal",
                        bn=True,
                        scope='conv_in')  # (B, T, H)

        # Residual blocks: each returns the next hidden state plus a skip
        # term; the skip terms are summed across all blocks and dilations.
        skip_total = 0
        for block_idx in range(hp.num_blocks):
            for rate in hp.dilations:
                block_scope = "res_block_{}_{}".format(block_idx, rate)
                hidden, skip_term = residual_block(hidden,
                                                   size=hp.size,
                                                   rate=rate,
                                                   speaker_emb=speaker_emb,
                                                   z_q=z_q,
                                                   scope=block_scope)
                skip_total = skip_total + skip_term

        # Postnet
        post = tf.nn.relu(skip_total)
        post = conv1d(post,
                      padding="causal",
                      activation_fn=tf.nn.relu,
                      bn=True,
                      scope="one_by_one_1")  # (B, T, H)
        logits = conv1d(post,
                        filters=hp.Q,
                        padding="causal",
                        scope="one_by_one_2")  # (B, T, Q) wave logits.

    return logits
Example #6
0
def encoder(x):
    '''Stack of padded, strided conv1d layers.

    Every layer pads one step on each side of axis 1 and then applies a
    "valid" convolution with batch norm. All layers but the last use ReLU;
    the last has no activation.

    Args:
      x: waveform. [B, T, Q]

    Returns:
      z_e: encoded variable. [B, T', D]
    '''
    with tf.variable_scope("encoder"):
        for layer_idx in range(hp.encoder_layers):
            padded = tf.pad(x, [[0, 0], [1, 1], [0, 0]])

            # The final layer is left linear.
            if layer_idx < hp.encoder_layers - 1:
                activation = tf.nn.relu
            else:
                activation = None

            x = conv1d(padded,
                       filters=hp.D,
                       size=hp.winsize,
                       strides=hp.stride,
                       padding="valid",
                       bn=True,
                       activation_fn=activation,
                       scope="conv1d_{}".format(layer_idx))
    return x
Example #7
0
    def init_inference(self, config, is_training=False):
        """Build the inference graph and store its tensors on `self`.

        Layers, in order: input standardization, width-1 conv1d, conv1d
        banks, max pooling, two conv1d projections with a residual
        connection, highway layers, GRU, and a softmax output layer.

        Args:
          config: Mapping read for the keys 'num_banks', 'hidden_units',
            'num_highway', 'norm_type', 'batch_size', 'num_rnn_layer',
            'input_dim' and 'alphabet_size'.
          is_training: Forwarded to `conv1d_banks`, `normalize` and `gru`.

        Side effects:
          Sets `_input_dim`, `_output_dim`, `_inputs`, `_seq_lens`,
          `_out_lens`, `_mean`, `_std`, `_initial_state`, `_rnn_state`,
          `_logits`, `_probas` and `_init_inference` on `self`.
        """
        num_banks = config['num_banks']
        hidden_units = config['hidden_units']
        num_highway = config['num_highway']
        norm_type = config['norm_type']
        batch_size = config['batch_size']
        num_rnn_layer = config['num_rnn_layer']
        self._input_dim = input_dim = config['input_dim']
        self._output_dim = output_dim = config['alphabet_size']

        self._inputs = tf.placeholder(tf.float32,
                                      [batch_size, None, input_dim])
        self._seq_lens = tf.placeholder(tf.int32, shape=batch_size)
        # No layer below changes the time dimension (stride-1, "same" pooling),
        # so output lengths are taken to equal the input lengths.
        self._out_lens = self._seq_lens

        # TODO, awni, for now on the client to remember to initialize these.
        self._mean = tf.get_variable("mean", shape=input_dim, trainable=False)
        self._std = tf.get_variable("std", shape=input_dim, trainable=False)

        std_inputs = (self._inputs - self._mean) / self._std

        # Feed the standardized features to the network. Previously
        # `std_inputs` was computed and then dropped, so the raw placeholder
        # was used and the mean/std variables had no effect.
        x = conv1d(std_inputs, hidden_units, 1, scope="conv1d")

        out = conv1d_banks(x,
                           K=num_banks,
                           num_units=hidden_units,
                           norm_type=norm_type,
                           is_training=is_training)  # (n, t, k * h)

        out = tf.layers.max_pooling1d(out, 2, 1,
                                      padding="same")  # (n, t, k * h)

        out = conv1d(out, hidden_units, 3, scope="conv1d_1")  # (n, t, h)
        out = normalize(out,
                        type=norm_type,
                        is_training=is_training,
                        activation_fn=tf.nn.relu)
        out = conv1d(out, hidden_units, 3, scope="conv1d_2")  # (n, t, h)

        out += x  # (n, t, h) # residual connections

        for i in range(num_highway):
            out = highwaynet(out,
                             num_units=hidden_units,
                             scope='highwaynet_{}'.format(i))  # (n, t, h)

        rnn_out, state, initial_state = gru(
            out,
            hidden_units,
            False,
            seqlens=self._seq_lens,
            num_layers=num_rnn_layer,
            is_training=is_training)  # (n, t, h)

        self._initial_state = initial_state
        self._rnn_state = state
        # Swap the first two axes: (n, t, h) -> (t, n, h).
        rnn_out = tf.transpose(rnn_out, [1, 0, 2])

        # Collapse time and batch dims pre softmax.
        rnn_out = tf.reshape(rnn_out, (-1, hidden_units))
        logits, probas = _add_softmax_linear(
            rnn_out,
            hidden_units,
            output_dim,
            initializer=tf.contrib.layers.xavier_initializer())
        # Reshape to time-major.
        self._logits = tf.reshape(logits, (-1, batch_size, output_dim))
        self._probas = tf.reshape(probas, (-1, batch_size, output_dim))

        self._init_inference = True
Example #8
0
def decoder(decoder_inputs, speaker_emb, z_q, is_training=True):
    '''
    Wavenet decoder built from a chain of `dilated_convolution` layers.

    Args:
      decoder_inputs: raw wav form [B, T, 1].
      speaker_emb: [B, ivec_size] speaker ivector (originally documented as a
        one-hot [B, len(speaker)]). Global condition.
      z_q: [B, T', D]. Local condition.
      is_training: when exactly `True`, `z_q` is passed through
        `transposed_conv` and variables are created normally; otherwise
        `z_q` is used as is and variables are opened with `tf.AUTO_REUSE`.
    Return:
      output: [B,T-receptive_field+1,Q]
    '''
    with tf.variable_scope("decoder"):
        receptive_field = hp.dilations[-1] * hp.size
        input_steps = decoder_inputs.get_shape().as_list()[1]
        output_width = input_steps - receptive_field + 1

        # Local condition and variable-reuse mode depend on the phase.
        if is_training is True:
            reuse = None
            z_out = transposed_conv(z_q)
        else:
            reuse = tf.AUTO_REUSE
            z_out = z_q

        # Global condition: add a length-1 axis, (B, L) -> (B, 1, L).
        gc = tf.expand_dims(speaker_emb, 1)

        # Each layer yields a skip term and the input for the next layer.
        skip_terms = []
        layer_input = decoder_inputs
        for index, dilation in enumerate(hp.dilations):
            skip_term, layer_input = dilated_convolution(
                layer_input, z_out, gc, hp.size, dilation, output_width,
                index, dilation, reuse)
            skip_terms.append(skip_term)

        # Postnet: ReLU -> 1x1 conv -> ReLU -> 1x1 conv over the summed skips.
        total = sum(skip_terms)
        hidden = conv1d(tf.nn.relu(total),
                        hp.Q,
                        scope='transformed1',
                        onebyone=True,
                        reuse=reuse)
        logits = conv1d(tf.nn.relu(hidden),
                        hp.Q,
                        scope='transformed2',
                        onebyone=True,
                        reuse=reuse)

    return logits
Example #9
0
def encode(inputs, is_training=True, scope="encoder", reuse=None):
    '''Encode `inputs` with `pre_spectro`, a prenet and a CBHG-style stack.

    Args:
      inputs: Tensor handed to `mod.pre_spectro`; per the inline comments
        the result has shape (N, T, E).
      is_training: Whether or not the layer is in training mode.
      scope: Optional scope for `variable_scope`
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      The output of the final `mod.gru` call. The original comments give
      its shape as (N, T, E).
    '''
    with tf.variable_scope(scope, reuse=reuse):
        half_embed = hp.embed_size // 2
        # Keyword arguments shared by both normalization layers.
        norm_kwargs = dict(type=hp.norm_type, is_training=is_training)

        # Spectrogram front-end followed by the encoder pre-net.
        spectro = mod.pre_spectro(inputs, is_training=is_training)  # (N, T, E)
        residual = mod.prenet(spectro, is_training=is_training)  # (N, T, E/2)

        # CBHG: conv1d bank, then stride-1 max pooling.
        net = mod.conv1d_banks(
            residual, K=hp.encoder_num_banks,
            is_training=is_training)  # (N, T, K * E / 2)
        net = tf.layers.max_pooling1d(
            net, pool_size=2, strides=1, padding="same")  # (N, T, K * E / 2)

        # Two conv1d projections; only the first is followed by a ReLU.
        net = mod.conv1d(net, half_embed, 3, scope="conv1d_1")  # (N, T, E/2)
        net = mod.normalize(
            net, activation_fn=tf.nn.relu, scope="norm1", **norm_kwargs)
        net = mod.conv1d(net, half_embed, 3, scope="conv1d_2")  # (N, T, E/2)
        net = mod.normalize(
            net, activation_fn=None, scope="norm2", **norm_kwargs)
        net += residual  # (N, T, E/2) # residual connections

        # Highway layers.
        for block_idx in range(hp.num_highwaynet_blocks):
            net = mod.highwaynet(
                net,
                num_units=half_embed,
                scope='highwaynet_{}'.format(block_idx))  # (N, T, E/2)

        # NOTE(review): the original comment called this a bidirectional GRU
        # producing (N, T, E), but the flag passed is False, unlike decode2,
        # which passes True -- confirm which is intended.
        memory = mod.gru(net, half_embed, False)

    return memory