Beispiel #1
0
 def _clip_quant_scale(x, quant_chann, use_mu_law):
     """Clip `x`, quantize it, and map the quantized values back to a scaled signal.

     Args:
         x: tensor treated as an audio signal in [-1, 1). When `use_mu_law`
            is true it is taken to be mu-law encoded, otherwise the raw signal.
         quant_chann: number of quantization channels.
         use_mu_law: selects which inverse mapping from `utils` is applied.

     Returns:
         The result of `utils.inv_mu_law` (mu-law case) or
         `utils.inv_cast_quantize` (linear case) applied to the quantized input.
     """
     # Clip to the range covered by the data so values never seen in
     # training are removed before quantization.
     upper_bound = 1.0 - 2.0 / quant_chann
     clipped = tf.clip_by_value(x, -1.0, upper_bound)

     # Quantization is the same in both modes; only the inverse mapping differs.
     levels = utils.cast_quantize(clipped, quant_chann)
     if use_mu_law:
         return utils.inv_mu_law(levels)
     return utils.inv_cast_quantize(levels, quant_chann)
Beispiel #2
0
def gauss_sample(gauss_params, quant_chann, use_log_scales=True):
    """Draw one sample from the Gaussian described by `gauss_params` and quantize it.

    Args:
        gauss_params: network output, decoded into mean and standard deviation
            by `mean_std_from_out_params`.
        quant_chann: number of quantization channels.
        use_log_scales: forwarded to `mean_std_from_out_params`.

    Returns:
        The clipped sample after `utils.cast_quantize`.
    """
    loc, scale = mean_std_from_out_params(gauss_params, use_log_scales)
    draw = Normal(loc=loc, scale=scale).sample()
    # Keep the sample inside the range the quantizer is given elsewhere in this file.
    clipped = tf.clip_by_value(draw, -1., 1. - 2. / quant_chann)
    return utils.cast_quantize(clipped, quant_chann)
Beispiel #3
0
    def encode_signal(self, inputs):
        """Turn the raw waveform into network inputs and training targets.

        With `self.use_mu_law` the waveform goes through `utils.mu_law` and is
        rescaled by `quant_chann / 2`; otherwise the waveform is used as-is
        and only quantized (via `utils.cast_quantize`) for the categorical
        targets.

        Args:
            inputs: dict holding the waveform under the key 'wav'.

        Returns:
            A dict with 'wav_scaled' (network input), 'real_targets'
            (real-valued targets, the same tensor as 'wav_scaled') and
            'cate_targets' (int32 quantized values shifted by quant_chann / 2).
        """
        quant_chann = self.quant_chann
        half_range = quant_chann / 2.
        wav = inputs['wav']

        if self.use_mu_law:
            codes = utils.mu_law(wav)
            scaled = tf.cast(codes, tf.float32) / half_range
        else:
            codes = utils.cast_quantize(wav, quant_chann)
            scaled = wav

        # Shift the quantized values by half the channel count to get class ids.
        class_ids = tf.cast(codes, tf.int32) + tf.cast(half_range, tf.int32)

        return {
            'wav_scaled': scaled,
            'real_targets': scaled,
            'cate_targets': class_ids
        }
Beispiel #4
0
def mol_sample(mol_params, quant_chann, use_log_scales=True):
    """Sample from a mixture of logistics and quantize the result.

    Args:
        mol_params: [batch_size, 1, number of mixture * 3], laid out along the
            last axis as (mixture logits, means, scale parameters).
        quant_chann: quantization channels (2 ** 8 or 2 ** 16).
        use_log_scales: whether the scale parameters are in log scale
            (True) or linear scale (False).

    Returns:
        x_quantized: [batch_size, 1],
                     cast to [-quant_chann / 2, quant_chann / 2).
    """
    mix_logits, mix_means, mix_scales = tf.split(
        mol_params, num_or_size_splits=3, axis=2)
    num_mixtures = mol_params.get_shape().as_list()[2] // 3

    # Pick one mixture component per position by adding Gumbel noise to the
    # logits and taking the argmax.
    uniform_mix = tf.random_uniform(
        tf.shape(mix_logits), minval=1e-5, maxval=1. - 1e-5)
    gumbel_logits = mix_logits - tf.log(-tf.log(uniform_mix))
    chosen = tf.one_hot(
        tf.argmax(gumbel_logits, axis=2), depth=num_mixtures, dtype=tf.float32)

    # The one-hot mask keeps only the parameters of the chosen component.
    loc = tf.reduce_sum(mix_means * chosen, axis=2)
    raw_scale = tf.reduce_sum(mix_scales * chosen, axis=2)

    # Both branches bound the scale to [exp(-7), exp(7)].
    if use_log_scales:
        scale = tf.exp(tf.clip_by_value(raw_scale, -7.0, 7.0))
    else:
        scale = tf.clip_by_value(raw_scale, tf.exp(-7.0), tf.exp(7.0))

    # Inverse-CDF sampling of the logistic distribution: loc + scale * logit(u).
    uniform_x = tf.random_uniform(
        tf.shape(loc), minval=1e-5, maxval=1. - 1e-5)
    sample = loc + scale * (tf.log(uniform_x) - tf.log(1. - uniform_x))

    clipped = tf.clip_by_value(sample, -1., 1. - 2. / quant_chann)
    return utils.cast_quantize(clipped, quant_chann)
Beispiel #5
0
    def encode_signal(self, inputs, add_noise=False):
        """Turn the raw waveform into network inputs and training targets.

        With `self.use_mu_law` the waveform goes through `utils.mu_law` and is
        rescaled by `quant_chann / 2`; otherwise the waveform is used as-is
        and only quantized (via `utils.cast_quantize`) for the categorical
        targets.

        Args:
            inputs: dict holding the waveform under the key 'wav'.
            add_noise: if True, add Gaussian noise (mean 0, stddev 0.1) to the
                network input only; the targets stay noise-free. Only used
                when the wavenet is trained as a teacher.

        Returns:
            A dict with 'wav_scaled' (network input, possibly noisy),
            'real_targets' (real-valued targets) and 'cate_targets'
            (int32 quantized values shifted by quant_chann / 2).
        """
        quant_chann = self.quant_chann
        half_range = quant_chann / 2.

        x = inputs['wav']
        if self.use_mu_law:
            x_quantized = utils.mu_law(x)
            x_scaled = tf.cast(x_quantized, tf.float32) / half_range
        else:
            x_quantized = utils.cast_quantize(x, quant_chann)
            x_scaled = x

        # Targets are captured before any noise is added to the input.
        real_targets = x_scaled
        # Shift the quantized values by half the channel count to get class ids.
        cate_targets = tf.cast(x_quantized, tf.int32) + tf.cast(half_range, tf.int32)

        if add_noise:
            # Use the dynamic shape: the static shape from get_shape() may have
            # unknown dimensions (e.g. a variable batch size), which
            # tf.random_normal cannot accept.
            noise = tf.random_normal(
                shape=tf.shape(x_scaled), mean=0.0, stddev=0.1)
            x_scaled = x_scaled + noise

        return {'wav_scaled': x_scaled,
                'real_targets': real_targets,
                'cate_targets': cate_targets}
Beispiel #6
0
def mol_sample_(mol_params, quant_chann, use_log_scales=True):
    """Deterministic counterpart of a mixture sample: mean of the top-logit component.

    No random numbers are drawn. The component with the largest logit is
    selected and its mean is clipped and quantized. The scale parameters and
    `use_log_scales` are not used.

    Args:
        mol_params: tensor whose last axis (axis 2) holds, in order, the
            mixture logits, means and scale parameters in three equal parts.
        quant_chann: number of quantization channels.
        use_log_scales: unused; kept for signature compatibility.

    Returns:
        The clipped mean after `utils.cast_quantize`.
    """
    mix_logits, mix_means, _ = tf.split(
        mol_params, num_or_size_splits=3, axis=2)
    num_mixtures = mol_params.get_shape().as_list()[2] // 3

    best_component = tf.argmax(mix_logits, axis=2)
    chosen = tf.one_hot(best_component, depth=num_mixtures, dtype=tf.float32)

    # The one-hot mask keeps only the mean of the chosen component.
    top_mean = tf.reduce_sum(mix_means * chosen, axis=2)
    clipped = tf.clip_by_value(top_mean, -1., 1. - 2. / quant_chann)
    return utils.cast_quantize(clipped, quant_chann)
Beispiel #7
0
def mog_sample(mog_params, quant_chann, use_log_scales=True):
    """Draw one sample from the distribution built from `mog_params` and quantize it.

    Args:
        mog_params: network output, turned into a distribution object by
            `mog_from_out_params`.
        quant_chann: number of quantization channels.
        use_log_scales: forwarded to `mog_from_out_params`.

    Returns:
        The clipped sample after `utils.cast_quantize`.
    """
    draw = mog_from_out_params(mog_params, use_log_scales).sample()
    # Keep the sample inside the range the quantizer is given elsewhere in this file.
    clipped = tf.clip_by_value(draw, -1., 1. - 2. / quant_chann)
    return utils.cast_quantize(clipped, quant_chann)
Beispiel #8
0
    def feed_forward(self, inputs):
        """Build the mel-conditioned WaveNet graph for this configuration.

        The mel features are encoded by `self.deconv_stack`, the waveform is
        encoded into inputs/targets, and a stack of gated dilated convolutions
        (each conditioned on the mel encoding) produces the output parameters.

        Args:
          inputs: A dict of inputs. Must contain 'wav' (the waveform) and
            'mel' (the mel features used for conditioning).

        Returns:
          A dict with:
            'real_targets': real-valued targets derived from 'wav'.
            'cate_targets': int32 quantized targets shifted by quant_chann / 2.
            'encoding': the mel encoding returned by the deconv stack.
            'out_params': output of the final convolution, with
              `self.out_width` filters.
        """
        num_stages = self.hparams.num_stages
        num_layers = self.hparams.num_layers
        filter_length = self.hparams.filter_length
        width = self.hparams.width
        skip_width = self.hparams.skip_width
        use_mu_law = self.use_mu_law
        quant_chann = self.quant_chann
        out_width = self.out_width

        ###
        # The Transpose Convolution Stack for mel feature.
        ###
        # wavenet inputs <- trans_conv (l2, s2) <- trans_conv (l1, s1) <- mel_ceps
        # win_len: l1 * s2 + (l2 - s2); win_shift: s1 * s2
        # (l1, s1) = (40, 10), (l2, s2) = (80, 20) is a proper configuration.
        # It is almost consistent with the mel analysis frame shift (200) and
        # frame length (800).
        mel = inputs['mel']
        ds_dict = self.deconv_stack({'mel': mel})
        mel_en = ds_dict['encoding']

        ###
        # Encode the source with 8-bit Mu-Law or just use 16-bit signal.
        ###
        x = inputs['wav']
        if use_mu_law:
            # Mu-law path: the network input is the encoded signal rescaled
            # by half the number of quantization channels.
            x_quantized = utils.mu_law(x)
            x_scaled = tf.cast(x_quantized, tf.float32) / (quant_chann / 2.)
            real_targets = x_scaled
            cate_targets = tf.cast(x_quantized, tf.int32) + tf.cast(
                quant_chann / 2., tf.int32)
        else:
            # Linear path: the raw waveform is the network input; quantization
            # is only needed for the categorical targets.
            x_quantized = utils.cast_quantize(x, quant_chann)
            x_scaled = x
            real_targets = x
            cate_targets = tf.cast(x_quantized, tf.int32) + tf.cast(
                quant_chann / 2., tf.int32)
        # Add a trailing channel axis for the 1-D convolutions.
        x_scaled = tf.expand_dims(x_scaled, 2)

        ###
        # The WaveNet Decoder.
        ###
        # NOTE(review): shift_right presumably delays the input by one step so
        # the output at time t does not see sample t -- confirm in `masked`.
        l = masked.shift_right(x_scaled)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=filter_length,
                          name='startconv')

        # Set up skip connections.
        s = masked.conv1d(l,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            # Dilation cycles through 1, 2, ..., 2**(num_stages - 1) and then
            # restarts.
            dilation = 2**(i % num_stages)
            d = masked.conv1d(l,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilated_conv_%d' % (i + 1))
            # Per-layer 1x1 projection of the mel encoding, merged into the
            # dilated conv output by `_condition`.
            c = masked.conv1d(mel_en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='mel_cond_%d' % (i + 1))
            d = _condition(d, c)

            # Gated activation: the first half of the channels goes through a
            # sigmoid gate, the second half through tanh.
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            # Residual update of the main path and accumulation on the skip path.
            l += masked.conv1d(d,
                               num_filters=width,
                               filter_length=1,
                               name='res_%d' % (i + 1))
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))

        # Output head: relu -> 1x1 conv -> mel conditioning -> relu -> 1x1 conv.
        s = tf.nn.relu(s)
        s = masked.conv1d(s,
                          num_filters=skip_width,
                          filter_length=1,
                          name='out1')
        c = masked.conv1d(mel_en,
                          num_filters=skip_width,
                          filter_length=1,
                          name='mel_cond_out1')
        s = _condition(s, c)
        s = tf.nn.relu(s)
        # When using mol loss, the model always predicts log_scale; the
        # initializer keeps the log_scale in a reasonably small range to speed
        # up convergence.
        final_kernel_init = (tf.truncated_normal_initializer(0.0, 0.01)
                             if self.loss_type == 'mol' else
                             tf.uniform_unit_scaling_initializer(1.0))
        out = masked.conv1d(s,
                            num_filters=out_width,
                            filter_length=1,
                            name='out2',
                            kernel_initializer=final_kernel_init)

        return {
            'real_targets': real_targets,
            'cate_targets': cate_targets,
            'encoding': mel_en,
            'out_params': out
        }