Python conv1d Examples

Programming Language: Python

Namespace/Package Name: magenta.models.nsynth.wavenet.masked

Method/Function: conv1d

Examples at hotexamples.com: 10

Python conv1d - 10 examples found. These are the top-rated real-world Python examples of magenta.models.nsynth.wavenet.masked.conv1d, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: nsynth.py Project: sudanenator/time-domain-neural-audio-style-transfer

def compute_wavenet_encoder_features(content, style):
    """Collect encoder activations for content audio and Gram matrices for style.

    A non-causal dilated-convolution encoder is built in a fresh graph pinned
    to the CPU, its weights are restored from './model.ckpt-200000', and both
    inputs are pushed through it.

    Args:
      content: Array whose first two dimensions give (n_frames, n_samples);
        these fix the shape of the input placeholder.
      style: Array fed through the same placeholder, so it has to match the
        shape of `content`.

    Returns:
      A tuple `(content_features, style_features)`. `content_features` is the
      list of per-layer residual outputs for `content`; `style_features` is a
      list holding one Gram matrix per layer computed from `style`.
    """
    # Encoder hyper-parameters.
    ae_hop_length = 512
    ae_bottleneck_width = 16
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    n_frames = content.shape[0]
    n_samples = content.shape[1]
    content_tf = np.ascontiguousarray(content)
    style_tf = np.ascontiguousarray(style)

    graph = tf.Graph()
    layers = []
    style_features = []
    with graph.as_default(), graph.device('/cpu:0'), tf.Session() as sess:
        x = tf.placeholder('float32', [n_frames, n_samples], name="x")

        # Encode the source with 8-bit Mu-Law, divide by 128 and add a
        # trailing channel axis.
        x_quantized = mu_law(x)
        x_scaled = tf.expand_dims(
            tf.cast(x_quantized, tf.float32) / 128.0, 2)

        en = masked.conv1d(x_scaled,
                           causal=False,
                           num_filters=ae_width,
                           filter_length=ae_filter_length,
                           name='ae_startconv')

        # Residual stack; layer numbers in the op names are 1-based.
        for layer_number in range(1, ae_num_layers + 1):
            dilation = 2**((layer_number - 1) % ae_num_stages)
            hidden = tf.nn.relu(en)
            hidden = masked.conv1d(hidden,
                                   causal=False,
                                   num_filters=ae_width,
                                   filter_length=ae_filter_length,
                                   dilation=dilation,
                                   name='ae_dilatedconv_%d' % layer_number)
            hidden = tf.nn.relu(hidden)
            residual = masked.conv1d(hidden,
                                     num_filters=ae_width,
                                     filter_length=1,
                                     name='ae_res_%d' % layer_number)
            en = en + residual
            layers.append(en)

        # The bottleneck and pooling ops are still created (and their
        # variables restored) even though only `layers` is evaluated.
        en = masked.conv1d(en,
                           num_filters=ae_bottleneck_width,
                           filter_length=1,
                           name='ae_bottleneck')
        en = masked.pool1d(en, ae_hop_length, name='ae_pool', mode='avg')

        saver = tf.train.Saver()
        saver.restore(sess, './model.ckpt-200000')

        content_features = sess.run(layers, feed_dict={x: content_tf})
        style_activations = sess.run(layers, feed_dict={x: style_tf})

        # One Gram matrix per layer, normalized by the number of input samples.
        for layer, activation in zip(layers, style_activations):
            n_features = np.prod(layer.shape.as_list()[-1])
            flat = np.reshape(activation, (-1, n_features))
            gram = np.matmul(flat.T, flat) / (n_samples * n_frames)
            style_features.append(gram)
    return content_features, style_features

Example #2

Show file

    def build(self,
              inputs,
              is_training,
              rescale_inputs=True,
              include_decoder=True,
              use_reduce_mean_to_pool=False):
        """Build the encoder (and optionally decoder) graph for this config.

        Args:
          inputs: A dict of inputs. For training, should contain 'wav'.
          is_training: Whether we are training or not. Forwarded unchanged to
            every `masked.conv1d` call in this method.
          rescale_inputs: Whether to convert inputs to mu-law and back to unit
            scaling before passing through the model (loses gradients). When
            False, the raw 'wav' tensor (with a channel axis added) is fed to
            the encoder and decoder instead.
          include_decoder: bool, whether to include the decoder in the build().
            When False, the method returns right after the encoder.
          use_reduce_mean_to_pool: whether to use reduce_mean (instead of pool1d)
            for pooling.
        Returns:
          If `include_decoder` is False, a dict with only the 'encoding'.
          Otherwise a dict of outputs that includes the 'predictions', 'loss',
          the 'encoding', the 'quantized_input', and an 'eval' dict of metrics
          (here just 'nll', which is the same tensor as 'loss').
        """
        # Decoder hyper-parameters.
        num_stages = 10
        num_layers = 30
        filter_length = 3
        width = 512
        skip_width = 256
        # Encoder ("ae_") hyper-parameters.
        ae_num_stages = 10
        ae_num_layers = 30
        ae_filter_length = 3
        ae_width = 128

        # Encode the source with 8-bit Mu-Law.
        x = inputs['wav']
        x_quantized = utils.mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        # Add a trailing channel axis to both the rescaled and the raw signal.
        x_scaled = tf.expand_dims(x_scaled, 2)
        x = tf.expand_dims(x, 2)

        ###
        # The Non-Causal Temporal Encoder.
        ###
        en = masked.conv1d(x_scaled if rescale_inputs else x,
                           causal=False,
                           num_filters=ae_width,
                           filter_length=ae_filter_length,
                           name='ae_startconv',
                           is_training=is_training)

        # Residual stack: the dilation doubles each layer and wraps around
        # every `ae_num_stages` layers.
        for num_layer in range(ae_num_layers):
            dilation = 2**(num_layer % ae_num_stages)
            d = tf.nn.relu(en)
            d = masked.conv1d(d,
                              causal=False,
                              num_filters=ae_width,
                              filter_length=ae_filter_length,
                              dilation=dilation,
                              name='ae_dilatedconv_%d' % (num_layer + 1),
                              is_training=is_training)
            d = tf.nn.relu(d)
            en += masked.conv1d(d,
                                num_filters=ae_width,
                                filter_length=1,
                                name='ae_res_%d' % (num_layer + 1),
                                is_training=is_training)

        # Width-1 convolution down to the bottleneck channel count.
        en = masked.conv1d(en,
                           num_filters=self.ae_bottleneck_width,
                           filter_length=1,
                           name='ae_bottleneck',
                           is_training=is_training)

        if use_reduce_mean_to_pool:
            # Depending on the accelerator used for training, masked.pool1d may
            # lead to out of memory error.
            # reduce_mean is equivalent to masked.pool1d when the stride is the same
            # as the window length (which is the case here).
            # The reshape below uses the static shape, so batch size and depth
            # must be known at graph-construction time.
            batch_size, unused_length, depth = en.shape.as_list()
            en = tf.reshape(en, [batch_size, -1, self.ae_hop_length, depth])
            en = tf.reduce_mean(en, axis=2)
        else:
            en = masked.pool1d(en,
                               self.ae_hop_length,
                               name='ae_pool',
                               mode='avg')
        encoding = en

        # Encoder-only build: skip the decoder and the loss entirely.
        if not include_decoder:
            return {'encoding': encoding}

        ###
        # The WaveNet Decoder.
        ###
        # NOTE(review): shift_right presumably delays the input by one sample
        # so the decoder predicts each sample from earlier ones - confirm in
        # masked.shift_right.
        l = masked.shift_right(x_scaled if rescale_inputs else x)  # noqa
        l = masked.conv1d(  # noqa
            l,
            num_filters=width,
            filter_length=filter_length,
            name='startconv',
            is_training=is_training)

        # Set up skip connections.
        s = masked.conv1d(l,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start',
                          is_training=is_training)

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(l,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1),
                              is_training=is_training)
            # Combine the dilated-conv output with a per-layer projection of
            # the encoding (combination rule lives in self._condition).
            d = self._condition(
                d,
                masked.conv1d(en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1),
                              is_training=is_training))

            # Gated activation: the first half of the channels goes through a
            # sigmoid, the second half through tanh, and the two are multiplied.
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            # Accumulate into the residual path (l) and the skip path (s).
            l += masked.conv1d(  # noqa
                d,
                num_filters=width,
                filter_length=1,
                name='res_%d' % (i + 1),
                is_training=is_training)
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1),
                               is_training=is_training)

        # Output head on the summed skip connections, conditioned once more on
        # the encoding.
        s = tf.nn.relu(s)
        s = masked.conv1d(s,
                          num_filters=skip_width,
                          filter_length=1,
                          name='out1',
                          is_training=is_training)
        s = self._condition(
            s,
            masked.conv1d(en,
                          num_filters=skip_width,
                          filter_length=1,
                          name='cond_map_out1',
                          is_training=is_training))
        s = tf.nn.relu(s)

        ###
        # Compute the logits and get the loss.
        ###
        logits = masked.conv1d(s,
                               num_filters=256,
                               filter_length=1,
                               name='logits',
                               is_training=is_training)
        # Flatten batch and time so each row is one 256-way prediction.
        logits = tf.reshape(logits, [-1, 256])
        probs = tf.nn.softmax(logits, name='softmax')
        # Offset the quantized samples by 128 to use them as class labels.
        # NOTE(review): assumes utils.mu_law yields values in [-128, 127] - confirm.
        x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
                              0,
                              name='loss')

        return {
            'predictions': probs,
            'loss': loss,
            'eval': {
                'nll': loss
            },
            'quantized_input': x_quantized,
            'encoding': encoding,
        }

Example #3

Show file

File: h512_bo16.py Project: Alice-ren/magenta

  def build(self, inputs, is_training):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config
        (it is deleted on entry).

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
      the 'quantized_input', and an 'eval' dict of metrics (here just 'nll',
      which is the same tensor as 'loss').
    """
    del is_training
    # Decoder hyper-parameters.
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    # Encoder ("ae_") hyper-parameters.
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    # Add a trailing channel axis.
    x_scaled = tf.expand_dims(x_scaled, 2)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')

    # Residual stack: the dilation doubles each layer and wraps around every
    # `ae_num_stages` layers.
    for num_layer in range(ae_num_layers):
      dilation = 2**(num_layer % ae_num_stages)
      d = tf.nn.relu(en)
      d = masked.conv1d(
          d,
          causal=False,
          num_filters=ae_width,
          filter_length=ae_filter_length,
          dilation=dilation,
          name='ae_dilatedconv_%d' % (num_layer + 1))
      d = tf.nn.relu(d)
      en += masked.conv1d(
          d,
          num_filters=ae_width,
          filter_length=1,
          name='ae_res_%d' % (num_layer + 1))

    # Width-1 convolution down to the bottleneck channel count, followed by
    # average pooling with window `self.ae_hop_length`.
    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck')
    en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')
    encoding = en

    ###
    # The WaveNet Decoder.
    ###
    # NOTE(review): shift_right presumably delays the input by one sample so
    # the decoder predicts each sample from earlier ones - confirm in
    # masked.shift_right.
    l = masked.shift_right(x_scaled)
    l = masked.conv1d(
        l, num_filters=width, filter_length=filter_length, name='startconv')

    # Set up skip connections.
    s = masked.conv1d(
        l, num_filters=skip_width, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
      dilation = 2**(i % num_stages)
      d = masked.conv1d(
          l,
          num_filters=2 * width,
          filter_length=filter_length,
          dilation=dilation,
          name='dilatedconv_%d' % (i + 1))
      # Combine the dilated-conv output with a per-layer projection of the
      # encoding (combination rule lives in self._condition).
      d = self._condition(d,
                          masked.conv1d(
                              en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1)))

      # Gated activation: the first half of the channels goes through a
      # sigmoid, the second half through tanh, and the two are multiplied.
      assert d.get_shape().as_list()[2] % 2 == 0
      m = d.get_shape().as_list()[2] // 2
      d_sigmoid = tf.sigmoid(d[:, :, :m])
      d_tanh = tf.tanh(d[:, :, m:])
      d = d_sigmoid * d_tanh

      # Accumulate into the residual path (l) and the skip path (s).
      l += masked.conv1d(
          d, num_filters=width, filter_length=1, name='res_%d' % (i + 1))
      s += masked.conv1d(
          d, num_filters=skip_width, filter_length=1, name='skip_%d' % (i + 1))

    # Output head on the summed skip connections, conditioned once more on the
    # encoding.
    s = tf.nn.relu(s)
    s = masked.conv1d(s, num_filters=skip_width, filter_length=1, name='out1')
    s = self._condition(s,
                        masked.conv1d(
                            en,
                            num_filters=skip_width,
                            filter_length=1,
                            name='cond_map_out1'))
    s = tf.nn.relu(s)

    ###
    # Compute the logits and get the loss.
    ###
    logits = masked.conv1d(s, num_filters=256, filter_length=1, name='logits')
    # Flatten batch and time so each row is one 256-way prediction.
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')
    # Offset the quantized samples by 128 to use them as class labels.
    # NOTE(review): assumes utils.mu_law yields values in [-128, 127] - confirm.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
        0,
        name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }

Example #4

Show file

    def build(self, inputs, is_training):
        """Build the graph for this configuration.

        When `self.encoding` is truthy the encoder is built from the input
        audio; otherwise the encoding is a float32 placeholder named
        'ae_pool' with fixed shape [1, 125, 16] that the caller must feed.

        Parameters
        ----------
        inputs
            A dict of inputs. For training, should contain 'wav'.
        is_training
            Whether we are training or not. Not used in this config
            (it is deleted on entry).

        Returns
        -------
        A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
        the 'quantized_input', and an 'eval' dict of metrics (here just 'nll',
        which is the same tensor as 'loss').
        """
        del is_training
        # Decoder hyper-parameters.
        num_stages = 10
        num_layers = 30
        filter_length = 3
        width = 512
        skip_width = 256
        # Encoder ("ae_") hyper-parameters.
        ae_num_stages = 10
        ae_num_layers = 30
        ae_filter_length = 3
        ae_width = 128

        # Encode the source with 8-bit Mu-Law.
        x = inputs['wav']
        x_quantized = utils.mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        # Add a trailing channel axis.
        x_scaled = tf.expand_dims(x_scaled, 2)

        if self.encoding:
            ###
            # The Non-Causal Temporal Encoder.
            ###
            en = masked.conv1d(
                x_scaled,
                causal=False,
                num_filters=ae_width,
                filter_length=ae_filter_length,
                name='ae_startconv')

            # Residual stack: the dilation doubles each layer and wraps
            # around every `ae_num_stages` layers.
            for num_layer in range(ae_num_layers):
                dilation = 2**(num_layer % ae_num_stages)
                d = tf.nn.relu(en)
                d = masked.conv1d(
                    d,
                    causal=False,
                    num_filters=ae_width,
                    filter_length=ae_filter_length,
                    dilation=dilation,
                    name='ae_dilatedconv_%d' % (num_layer + 1))
                d = tf.nn.relu(d)
                en += masked.conv1d(
                    d,
                    num_filters=ae_width,
                    filter_length=1,
                    name='ae_res_%d' % (num_layer + 1))

            # Width-1 convolution down to the bottleneck channel count,
            # followed by average pooling with window `self.ae_hop_length`.
            en = masked.conv1d(
                en,
                num_filters=self.ae_bottleneck_width,
                filter_length=1,
                name='ae_bottleneck')
            en = masked.pool1d(
                en, self.ae_hop_length, name='ae_pool', mode='avg')
            encoding = en
        else:
            # No encoder: the encoding is supplied externally through a
            # placeholder that reuses the 'ae_pool' name.
            encoding = en = tf.placeholder(
                name='ae_pool', shape=[1, 125, 16], dtype=tf.float32)

        ###
        # The WaveNet Decoder.
        ###
        # NOTE(review): shift_right presumably delays the input by one sample
        # so the decoder predicts each sample from earlier ones - confirm in
        # masked.shift_right.
        l = masked.shift_right(x_scaled)
        l = masked.conv1d(
            l, num_filters=width, filter_length=filter_length, name='startconv')

        # Set up skip connections.
        s = masked.conv1d(
            l, num_filters=skip_width, filter_length=1, name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(
                l,
                num_filters=2 * width,
                filter_length=filter_length,
                dilation=dilation,
                name='dilatedconv_%d' % (i + 1))
            # Combine the dilated-conv output with a per-layer projection of
            # the encoding (combination rule lives in self._condition).
            d = self._condition(d,
                                masked.conv1d(
                                    en,
                                    num_filters=2 * width,
                                    filter_length=1,
                                    name='cond_map_%d' % (i + 1)))

            # Gated activation: the first half of the channels goes through a
            # sigmoid, the second half through tanh, and the two are multiplied.
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            # Accumulate into the residual path (l) and the skip path (s).
            l += masked.conv1d(
                d, num_filters=width, filter_length=1, name='res_%d' % (i + 1))
            s += masked.conv1d(
                d,
                num_filters=skip_width,
                filter_length=1,
                name='skip_%d' % (i + 1))

        # Output head on the summed skip connections, conditioned once more on
        # the encoding.
        s = tf.nn.relu(s)
        s = masked.conv1d(
            s, num_filters=skip_width, filter_length=1, name='out1')
        s = self._condition(s,
                            masked.conv1d(
                                en,
                                num_filters=skip_width,
                                filter_length=1,
                                name='cond_map_out1'))
        s = tf.nn.relu(s)

        ###
        # Compute the logits and get the loss.
        ###
        logits = masked.conv1d(
            s, num_filters=256, filter_length=1, name='logits')
        # Flatten batch and time so each row is one 256-way prediction.
        logits = tf.reshape(logits, [-1, 256])
        probs = tf.nn.softmax(logits, name='softmax')
        # Offset the quantized samples by 128 to use them as class labels.
        # NOTE(review): assumes utils.mu_law yields values in [-128, 127] - confirm.
        x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=x_indices, name='nll'),
            0,
            name='loss')

        return {
            'predictions': probs,
            'loss': loss,
            'eval': {
                'nll': loss
            },
            'quantized_input': x_quantized,
            'encoding': encoding,
        }

Example #5

Show file

File: h512_bo16.py Project: sirdr/magenta

  def build(self, inputs, is_training):
    """Build the graph for this configuration.

    Compared with the plain autoencoder build, this variant samples the
    encoding via `self._gaussian_parameters` / `self._sample_gaussian`,
    multiplies the decoder input by a random Bernoulli mask, and can add an
    auxiliary prediction made from the encoding alone when `self.aux > 0`.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config
        (it is deleted on entry).

    Returns:
      A dict with 'predictions', 'quantized_input', 'encoding', a 'loss' dict
      holding the separate terms 'kl', 'rec' and 'aux' ('aux' is the Python
      int 0 when `self.aux` is not positive), and an 'eval' dict with 'kl'
      and 'rec'.
    """
    del is_training
    # Decoder hyper-parameters; depth and stage count come from the instance.
    num_stages = self.num_stages
    num_layers = self.num_layers
    filter_length = 3
    width = 512
    skip_width = 256
    # Encoder ("ae_") hyper-parameters.
    ae_num_stages = self.ae_num_stages
    ae_num_layers = self.ae_num_layers
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    # Add a trailing channel axis.
    x_scaled = tf.expand_dims(x_scaled, 2)

    # NOTE(review): presumably `iw` is an importance-weighting sample count
    # and _duplicate repeats the batch that many times - confirm.
    if self.iw > 1:
      x_scaled = self._duplicate(x_scaled, self.iw)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')

    # Residual stack: the dilation doubles each layer and wraps around every
    # `ae_num_stages` layers.
    for num_layer in range(ae_num_layers):
      dilation = 2**(num_layer % ae_num_stages)
      d = tf.nn.relu(en)
      d = masked.conv1d(
          d,
          causal=False,
          num_filters=ae_width,
          filter_length=ae_filter_length,
          dilation=dilation,
          name='ae_dilatedconv_%d' % (num_layer + 1))
      d = tf.nn.relu(d)
      en += masked.conv1d(
          d,
          num_filters=ae_width,
          filter_length=1,
          name='ae_res_%d' % (num_layer + 1))

    # Width-1 convolution down to the bottleneck channel count, followed by
    # average pooling with window `self.ae_hop_length`.
    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck')
    en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')

    # divide encoding into "mean" and "variance"
    mn, v = self._gaussian_parameters(en)

    # flatten "mean" and "var"
    # (the last two static dimensions are merged into one)
    m_shape = mn.get_shape().as_list()
    v_shape = v.get_shape().as_list()

    mn = tf.reshape(mn, (-1, m_shape[-2]*m_shape[-1]))
    v = tf.reshape(v, (-1, v_shape[-2]*v_shape[-1]))

    # reparameterization trick
    en = self._sample_gaussian(mn, v)

    # reshape into original embedding shape
    en = tf.reshape(en, (-1, m_shape[-2], m_shape[-1]))

    encoding = en


    ###
    # The WaveNet Decoder.
    ###
    # A Bernoulli sample is 1 with probability `probs`, so `self.dropout` is
    # the probability that a decoder-input sample is kept; the rest are zeroed.
    dropout_mask = tf.distributions.Bernoulli(probs=tf.to_float(self.dropout), dtype=tf.float32).sample(sample_shape=tf.shape(x_scaled))
    l = tf.math.multiply(masked.shift_right(x_scaled), dropout_mask)
    l = masked.conv1d(
        l, num_filters=width, filter_length=filter_length, name='startconv')

    # Set up skip connections.
    s = masked.conv1d(
        l, num_filters=skip_width, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
      dilation = 2**(i % num_stages)
      d = masked.conv1d(
          l,
          num_filters=2 * width,
          filter_length=filter_length,
          dilation=dilation,
          name='dilatedconv_%d' % (i + 1))
      # Combine the dilated-conv output with a per-layer projection of the
      # sampled encoding (combination rule lives in self._condition).
      d = self._condition(d,
                          masked.conv1d(
                              en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1)))

      # Gated activation: the first half of the channels goes through a
      # sigmoid, the second half through tanh, and the two are multiplied.
      assert d.get_shape().as_list()[2] % 2 == 0
      m = d.get_shape().as_list()[2] // 2
      d_sigmoid = tf.sigmoid(d[:, :, :m])
      d_tanh = tf.tanh(d[:, :, m:])
      d = d_sigmoid * d_tanh

      # Accumulate into the residual path (l) and the skip path (s).
      l += masked.conv1d(
          d, num_filters=width, filter_length=1, name='res_%d' % (i + 1))
      s += masked.conv1d(
          d, num_filters=skip_width, filter_length=1, name='skip_%d' % (i + 1))

    # Output head on the summed skip connections, conditioned once more on the
    # encoding.
    s = tf.nn.relu(s)
    s = masked.conv1d(s, num_filters=skip_width, filter_length=1, name='out1')
    s = self._condition(s,
                        masked.conv1d(
                            en,
                            num_filters=skip_width,
                            filter_length=1,
                            name='cond_map_out1'))
    s = tf.nn.relu(s)

    # Auxiliary head: predict the samples from the encoding alone, without
    # any of the decoder's autoregressive input.
    if self.aux > 0:
      en_logits = masked.conv1d(
                            en,
                            num_filters=skip_width,
                            filter_length=1,
                            name='cond_map_rec')
      # Static shapes are required here; batch and channel counts must match
      # the decoder output `s`.
      enc_mb, enc_length, enc_channels = en_logits.get_shape().as_list()
      mb, length, channels = s.get_shape().as_list()
      assert enc_mb == mb
      assert enc_channels == channels

      en_logits = tf.nn.relu(en_logits)
      en_logits = tf.reshape(en_logits, [mb, enc_length, 1, channels])

      # `reps` is how many decoder time steps correspond to one encoding step.
      _, _, reps, _ = tf.reshape(s, [mb, enc_length, -1, channels]).get_shape().as_list()

      # Repeat every encoding step `reps` times so the result has the same
      # length as `s`.
      en_logits = tf.tile(en_logits, [1, 1, reps, 1])
      en_logits = tf.reshape(en_logits, [mb, length, channels])
      en_logits = masked.conv1d(en_logits, num_filters=256, filter_length=1, name='en_logits')
      en_logits = tf.reshape(en_logits, [-1, 256])
      # NOTE(review): en_probs is not referenced again in this method; the
      # op is only reachable through its graph name 'en_softmax'.
      en_probs = tf.nn.softmax(en_logits, name='en_softmax')


    ###
    # Compute the logits and get the loss.
    ###
    logits = masked.conv1d(s, num_filters=256, filter_length=1, name='logits')
    # Flatten batch and time so each row is one 256-way prediction.
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')
    # Offset the quantized samples by 128 to use them as class labels.
    # NOTE(review): assumes utils.mu_law yields values in [-128, 127] - confirm.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128

    # Reconstruction term: mean cross-entropy of the decoder predictions.
    rec = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
        0,
        name='loss')

    # Auxiliary term: the same cross-entropy for the encoding-only head.
    if self.aux > 0:
      aux = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=en_logits, labels=x_indices, name='en_nll'),
        0,
        name='aux')
    else:
      aux = 0


    # NOTE(review): presumably the KL divergence between the encoder's
    # Gaussian (mn, v) and a zero-mean, unit-variance prior - confirm the
    # argument order of self._kl_normal.
    kl = tf.reduce_mean(self._kl_normal(mn, v, tf.zeros(1), tf.ones(1)), name='kl')

    return {
        'predictions': probs,
        'loss': {
            'kl': kl,
            'rec': rec,
            'aux': aux},
        'eval': {
            'kl': kl,
            'rec':rec
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }

Example #6

Show file

File: nsynth.py Project: sudanenator/time-domain-neural-audio-style-transfer

def compute_wavenet_encoder_stylization(n_samples,
                                        n_frames,
                                        content_features,
                                        style_features,
                                        alpha=1e-4,
                                        learning_rate=1e-3,
                                        iterations=100):
    """Optimize a waveform so its encoder activations match the targets.

    The encoder stack is built once, its weights are restored from
    './model.ckpt-200000' and frozen into constants. The frozen graph is then
    re-imported with its input mapped onto a trainable variable, which Adam
    optimizes against a content loss (activation difference) and a style loss
    (Gram-matrix difference) on the layers listed in `ae_style_layers`.

    Args:
      n_samples: Number of samples per frame of the signal being optimized.
      n_frames: Number of frames of the signal being optimized.
      content_features: Per-layer target activations, indexed by layer number.
      style_features: Per-layer target Gram matrices, indexed by layer number.
      alpha: Weight applied to the content loss.
      learning_rate: Learning rate passed to the Adam optimizer.
      iterations: Number of optimization steps to run.

    Returns:
      The optimized signal after `inv_mu_law_numpy`, with the trailing channel
      axis removed.
    """
    ae_style_layers = [1, 5]
    ae_num_layers = 30
    ae_num_stages = 10
    ae_filter_length = 3
    ae_width = 128

    # Stage 1: build the encoder, load the trained weights and freeze them.
    with tf.Graph().as_default() as g, g.device(
            '/cpu:0'), tf.Session() as sess:
        x = tf.placeholder(name="x",
                           shape=(n_frames, n_samples, 1),
                           dtype=tf.float32)
        en = masked.conv1d(x,
                           causal=False,
                           num_filters=ae_width,
                           filter_length=ae_filter_length,
                           name='ae_startconv')
        for num_layer in range(ae_num_layers):
            dilation = 2**(num_layer % ae_num_stages)
            d = tf.nn.relu(en)
            d = masked.conv1d(d,
                              causal=False,
                              num_filters=ae_width,
                              filter_length=ae_filter_length,
                              dilation=dilation,
                              name='ae_dilatedconv_%d' % (num_layer + 1))
            d = tf.nn.relu(d)
            en += masked.conv1d(d,
                                num_filters=ae_width,
                                filter_length=1,
                                name='ae_res_%d' % (num_layer + 1))
            # Give every layer output a stable name so it can be looked up
            # after the graph is frozen and re-imported.
            tf.identity(en, name='layer_{}'.format(num_layer))
        saver = tf.train.Saver()
        saver.restore(sess, './model.ckpt-200000')
        # Do NOT run the variable initializer here: restore() has already
        # assigned every variable, and initializing afterwards would replace
        # the trained weights with fresh initial values before freezing.
        frozen_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, [en.name.replace(':0', '')] +
            ['layer_{}'.format(i) for i in range(ae_num_layers)])

    # Stage 2: optimize the input signal against the frozen encoder.
    with tf.Graph().as_default() as g, g.device(
            '/cpu:0'), tf.Session() as sess:
        # The signal being optimized starts out as Gaussian noise.
        x = tf.Variable(
            np.random.randn(n_frames, n_samples, 1).astype(np.float32))
        tf.import_graph_def(frozen_graph_def, input_map={'x:0': x})
        content_loss = np.float32(0.0)
        style_loss = np.float32(0.0)
        for num_layer in ae_style_layers:
            layer_i = g.get_tensor_by_name(name='import/layer_%d:0' %
                                           (num_layer))
            content_loss = content_loss + alpha * 2 * tf.nn.l2_loss(
                layer_i - content_features[num_layer])
            n_features = layer_i.shape.as_list()[-1]
            features = tf.reshape(layer_i, (-1, n_features))
            gram = tf.matmul(tf.transpose(features),
                             features) / (n_frames * n_samples)
            style_loss = style_loss + 2 * tf.nn.l2_loss(
                gram - style_features[num_layer])
        loss = content_loss + style_loss
        # Optimization
        print('Started optimization.')
        opt = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)
        var_list = tf.trainable_variables()
        print(var_list)
        # Initializes `x` and the optimizer's slot variables; the network
        # weights are constants in this graph and are unaffected.
        sess.run(tf.initialize_all_variables())
        for i in range(iterations):
            s, c, _total, _ = sess.run([style_loss, content_loss, loss, opt])
            print(i, '- Style:', s, 'Content:', c, end='\r')
        result = x.eval()
        # Normalize by the peak value and scale by 128 before inverting the
        # mu-law encoding.
        result = inv_mu_law_numpy(result[..., 0] / result.max() * 128.0)
    return result

Example #7

Show file

File: nsynth.py Project: sudanenator/time-domain-neural-audio-style-transfer

def compute_wavenet_decoder_stylization(n_samples,
                                        n_frames,
                                        content_features,
                                        style_features,
                                        alpha=1e-4,
                                        learning_rate=1e-3,
                                        iterations=100):
    """Optimize a waveform so its decoder skip activations match the targets.

    The (unconditioned) decoder stack is built once, its weights are restored
    from './model.ckpt-200000' and frozen into constants. The frozen graph is
    then re-imported with its input mapped onto a trainable variable, which
    Adam optimizes against a content loss (activation difference) and a style
    loss (Gram-matrix difference) on the layers listed in `style_layers`.

    Args:
      n_samples: Number of samples per frame of the signal being optimized.
      n_frames: Number of frames of the signal being optimized.
      content_features: Per-layer target activations, indexed by layer number.
      style_features: Per-layer target Gram matrices, indexed by layer number.
      alpha: Weight applied to the content loss.
      learning_rate: Learning rate passed to the Adam optimizer.
      iterations: Number of optimization steps to run.

    Returns:
      The optimized signal after `inv_mu_law_numpy`, with the trailing channel
      axis removed.
    """
    style_layers = [1, 5]
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256

    # Stage 1: build the decoder, load the trained weights and freeze them.
    with tf.Graph().as_default() as g, g.device(
            '/cpu:0'), tf.Session() as sess:
        x = tf.placeholder(name="x",
                           shape=(n_frames, n_samples, 1),
                           dtype=tf.float32)
        layer = x
        layer = masked.conv1d(layer,
                              num_filters=width,
                              filter_length=filter_length,
                              name='startconv')

        # Set up skip connections.
        s = masked.conv1d(layer,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(layer,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1))
            # Gated activation: sigmoid on the first half of the channels,
            # tanh on the second half, multiplied together.
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            layer += masked.conv1d(d,
                                   num_filters=width,
                                   filter_length=1,
                                   name='res_%d' % (i + 1))
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))
            # Name the running skip sum after the loop index `i` (not the
            # constant `num_layers`), so that 'layer_0' .. 'layer_29' exist
            # for the freeze below and for the lookups after re-import.
            tf.identity(s, name='layer_{}'.format(i))
        saver = tf.train.Saver()
        saver.restore(sess, './model.ckpt-200000')
        # Do NOT run the variable initializer here: restore() has already
        # assigned every variable, and initializing afterwards would replace
        # the trained weights with fresh initial values before freezing.
        frozen_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, [s.name.replace(':0', '')] +
            ['layer_{}'.format(i) for i in range(num_layers)])

    # Stage 2: optimize the input signal against the frozen decoder.
    with tf.Graph().as_default() as g, g.device(
            '/cpu:0'), tf.Session() as sess:
        # The signal being optimized starts out as Gaussian noise.
        x = tf.Variable(
            np.random.randn(n_frames, n_samples, 1).astype(np.float32))
        tf.import_graph_def(frozen_graph_def, input_map={'x:0': x})
        content_loss = np.float32(0.0)
        style_loss = np.float32(0.0)
        for num_layer in style_layers:
            layer_i = g.get_tensor_by_name(name='import/layer_%d:0' %
                                           (num_layer))
            content_loss = content_loss + alpha * 2 * tf.nn.l2_loss(
                layer_i - content_features[num_layer])
            n_features = layer_i.shape.as_list()[-1]
            features = tf.reshape(layer_i, (-1, n_features))
            gram = tf.matmul(tf.transpose(features),
                             features) / (n_frames * n_samples)
            style_loss = style_loss + 2 * tf.nn.l2_loss(
                gram - style_features[num_layer])
        loss = content_loss + style_loss
        # Optimization
        print('Started optimization.')
        opt = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)
        var_list = tf.trainable_variables()
        print(var_list)
        # Initializes `x` and the optimizer's slot variables; the network
        # weights are constants in this graph and are unaffected.
        sess.run(tf.initialize_all_variables())
        for i in range(iterations):
            s, c, _ = sess.run([style_loss, content_loss, opt])
            print(i, '- Style:', s, 'Content:', c, end='\r')
        result = x.eval()
        # Normalize by the peak value and scale by 128 before inverting the
        # mu-law encoding.
        result = inv_mu_law_numpy(result[..., 0] / result.max() * 128.0)

    return result

Example #8

Show file

File: nsynth.py Project: sudanenator/time-domain-neural-audio-style-transfer

def compute_wavenet_decoder_features(content, style,
                                     checkpoint_path='./model.ckpt-200000'):
    """Extract per-block decoder activations for content and style audio.

    Builds a stack of ``masked.conv1d`` layers (a start convolution followed
    by ``num_layers`` gated, dilated residual blocks with skip connections),
    restores its variables from ``checkpoint_path`` and evaluates the
    accumulated skip tensor after every residual block.

    Args:
        content: Array indexed as ``[n_frames, n_samples]``; its first two
            dimensions fix the shape of the input placeholder.
        style: Array fed through the same placeholder, so it has to be
            feedable with the same shape as ``content``.
        checkpoint_path: Checkpoint prefix handed to the saver's
            ``restore``. Defaults to the path that used to be hard-coded,
            so existing two-argument calls behave as before.

    Returns:
        A ``(content_features, style_features)`` tuple of lists with one
        entry per residual block. ``content_features`` holds the skip
        activations computed on ``content``; ``style_features`` holds the
        Gram matrix of the skip activations computed on ``style``, divided
        by ``n_samples * n_frames``.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    n_frames = content.shape[0]
    n_samples = content.shape[1]
    content_tf = np.ascontiguousarray(content)
    style_tf = np.ascontiguousarray(style)
    g = tf.Graph()
    layers = []
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        x = tf.placeholder('float32', [n_frames, n_samples], name="x")
        # Quantize with the mu_law helper, rescale by 1/128 and append a
        # trailing channel axis so the signal can go through conv1d.
        x_quantized = mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        x_scaled = tf.expand_dims(x_scaled, 2)
        layer = x_scaled
        layer = masked.conv1d(layer,
                              num_filters=width,
                              filter_length=filter_length,
                              name='startconv')

        # Set up skip connections.
        s = masked.conv1d(layer,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            # Dilation cycles through 1, 2, 4, ..., 2**(num_stages - 1).
            dilation = 2**(i % num_stages)
            d = masked.conv1d(layer,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1))
            # Gated activation: the first half of the channels goes through
            # a sigmoid, the second half through tanh.
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            layer += masked.conv1d(d,
                                   num_filters=width,
                                   filter_length=1,
                                   name='res_%d' % (i + 1))
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))
            # `s += ...` binds a new tensor each iteration, so every entry
            # is the running skip sum up to and including this block.
            layers.append(s)

        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        content_features = sess.run(layers, feed_dict={x: content_tf})
        style_activations = sess.run(layers, feed_dict={x: style_tf})

    # The fetched activations are plain arrays, so the Gram matrices can be
    # computed after the session has been closed.
    gram_norm = n_samples * n_frames
    style_features = []
    for activation in style_activations:
        features = np.reshape(activation, (-1, activation.shape[-1]))
        style_features.append(np.matmul(features.T, features) / gram_norm)
    return content_features, style_features

Example #9

Show file

    s = masked.conv1d(
        l, num_filters=skip_width, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
      dilation = 2**(i % num_stages)
      d = masked.conv1d(
          l,
          num_filters=2 * width,
          filter_length=filter_length,
          dilation=dilation,
          name='dilatedconv_%d' % (i + 1))
      d = self._condition(d,
                          masked.conv1d(
                              en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1)))

      assert d.get_shape().as_list()[2] % 2 == 0
      m = d.get_shape().as_list()[2] // 2
      d_sigmoid = tf.sigmoid(d[:, :, :m])
      d_tanh = tf.tanh(d[:, :, m:])
      d = d_sigmoid * d_tanh

      l += masked.conv1d(
          d, num_filters=width, filter_length=1, name='res_%d' % (i + 1))
      s += masked.conv1d(
          d, num_filters=skip_width, filter_length=1, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)

Example #10

Show file

    def build(self, inputs, is_training):
        """Build the graph for this configuration.

        The graph is assembled in three stages: a non-causal encoder made of
        dilated ``masked.conv1d`` layers followed by a bottleneck and average
        pooling, a decoder of gated dilated residual blocks that is
        conditioned on the encoder output through ``self._condition``, and a
        256-way softmax with a cross-entropy loss against the quantized
        input. Diagnostic ``print`` calls write each intermediate tensor to
        stdout while the graph is being built.

        NOTE(review): the inline ``#shape=...`` comments appear to record
        values the original author observed for a (1, 6144) 'wav' input;
        nothing in this method enforces them.

        Args:
          inputs: A dict of inputs. Only ``inputs['wav']`` is read here.
          is_training: Whether we are training or not. Not used in this
            config; it is deleted on the first line.

        Returns:
          A dict with the keys 'predictions' (softmax probabilities), 'loss',
          'eval' (a dict holding the same loss under 'nll'),
          'quantized_input' and 'encoding' (the pooled encoder output).
        """
        del is_training
        # Decoder hyper-parameters.
        num_stages = 10
        num_layers = 30
        filter_length = 3
        width = 512
        skip_width = 256
        # Encoder ("ae_") hyper-parameters.
        ae_num_stages = 10
        ae_num_layers = 30
        ae_filter_length = 3
        ae_width = 128

        print("@build, inputs: ",
              inputs)  #pitch shape=(1,),  wav shape=(1, 6144), key shape=(1,)
        # Encode the source with 8-bit Mu-Law.
        x = inputs['wav']
        print("@build, x: ", x)  #shape=(1, 6144)
        x_quantized = utils.mu_law(x)
        print("@build, x_quantized: ", x_quantized)  #shape=(1, 6144)
        # Rescale the quantized signal by 1/128 and add a trailing channel
        # axis so it can be fed to conv1d.
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        print("@build, x_scaled@1: ", x_scaled)  #shape=(1, 6144)
        x_scaled = tf.expand_dims(x_scaled, 2)
        print("@build, x_scaled@2: ", x_scaled)  #shape=(1, 6144, 1)

        ###
        # The Non-Causal Temporal Encoder.
        ###
        print("@build, ##Non-Causal Temporal Encoder...")
        print("\t create Layer ae_startconv")
        print("\t input[x_scaled] is: ", x_scaled)
        en = masked.conv1d(
            x_scaled,
            causal=False,
            num_filters=ae_width,  #ae_width = 128
            filter_length=ae_filter_length,
            name='ae_startconv')
        print("\t ae_startconv output [en] is:", en)  #shape=(1, 6144, 128)
        print("\t create Layer ae_startconv Done\n")

        # Encoder residual blocks: relu -> dilated conv -> relu -> 1x1 conv,
        # with the result added back onto `en`.
        for num_layer in range(ae_num_layers):
            # Dilation cycles through 1, 2, 4, ..., 2**(ae_num_stages - 1).
            dilation = 2**(num_layer % ae_num_stages)
            print("\t create Layer relu")
            print("\t input[en] is: ", en)  #shape=(1, 6144, 128)
            d = tf.nn.relu(en)
            print("\t relu output [d] is:", d)
            print("\t create Layer relu Done\n")

            print("\t create Layer ae_dilatedconv_{}, dilation={}".format(
                num_layer + 1, dilation))
            print("\t input[d] is: ", d)
            d = masked.conv1d(
                d,
                causal=False,
                num_filters=ae_width,  #128
                filter_length=ae_filter_length,
                dilation=dilation,
                name='ae_dilatedconv_%d' % (num_layer + 1))
            print("\t output [d] is:", d)
            print(
                "\t create Layer ae_dilatedconv_{}, dilation={} Done\n".format(
                    num_layer + 1, dilation))

            print("\t create Layer relu")
            print("\t input[d] is: ", d)
            d = tf.nn.relu(d)
            print("\t relu output [d] is:", d)
            print("\t create Layer relu Done\n")

            print("\t create Layer ae_res_{}".format(num_layer + 1))
            print("\t input[en] is: ", en)
            print("\t input[d] is: ", d)
            en += masked.conv1d(
                d,
                num_filters=ae_width,  #128
                filter_length=1,
                name='ae_res_%d' % (num_layer + 1))
            print("\t output [en] is:", en)  #shape=(1, 6144, 128)
            print("\t create Layer ae_res_{} Done\n".format(num_layer + 1))

        # 1x1 conv that reduces the channel count to self.ae_bottleneck_width.
        print("\t create Layer ae_bottleneck")
        print("\t input[en] is: ", en)  #shape=(1, 6144, 128)
        en = masked.conv1d(
            en,
            num_filters=self.ae_bottleneck_width,  #16
            filter_length=1,
            name='ae_bottleneck')
        print("\t output[en] is: ", en)  #shape=(1, 6144, 16)
        print("\t create Layer ae_bottleneck Done\n")

        # Average-pool along time with a window of self.ae_hop_length.
        print("\t create ae_pool")
        print("\t input[en] is: ", en)  #shape=(1, 6144, 16)
        en = masked.pool1d(en, self.ae_hop_length, name='ae_pool',
                           mode='avg')  #ae_hop_length=512
        print("\t output[en] is: ", en)  #shape=(1, 12, 16) #6144/512=12
        print("\t create ae_pool Done\n")

        # Original author's note: the encoding is the 'feature vector',
        # (125, 16) for every 4 seconds of audio, 125 = 4 x 16000 / 512
        # (presumably assumes 16 kHz audio -- confirm).
        encoding = en
        print("\t ##Non-Causal Temporal Encoder output[en|encoding] is: ",
              encoding)
        print("@build, ##Non-Causal Temporal Encoder...Done\n")

        ###
        # The WaveNet Decoder.
        ###
        print("@build, ##The WaveNet Decoder...")
        print("\t input[x_scaled] is: ", x_scaled)  #shape=(1, 6144, 1)
        # The decoder input is x_scaled passed through masked.shift_right.
        l = masked.shift_right(x_scaled)
        print("\t create startconv")
        print("\t input[l] is: ", l)  #shape=(1, 6144, 1)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=filter_length,
                          name='startconv')  #width=512
        print("\t output[l] is: ", l)  #shape=(1, 6144, 512)
        print("\t create startconv Done\n")

        # Set up skip connections.
        print("\t create skip_start")
        print("\t input[l] is: ", l)
        s = masked.conv1d(l,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')  #skip_width=256
        print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t create skip_start Done\n")

        # Residual blocks with skip connections.
        for i in range(num_layers):
            # Dilation cycles through 1, 2, 4, ..., 2**(num_stages - 1).
            dilation = 2**(i % num_stages)
            print("\t create dilatedconv_{}, dilation={}".format(
                i + 1, dilation))
            print("\t input[l] is: ", l)
            d = masked.conv1d(l,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1))
            print("\t output[d] is: ", d)  #shape=(1, 6144, 1024)
            print("\t create dilatedconv_{}, dilation={} Done\n".format(
                i + 1, dilation))

            # Combine the dilated conv output with a 1x1 projection of the
            # encoder output `en` via self._condition.
            print("\t create _condition for cond_map_{}".format(i + 1))
            print("\t input[d] is: ", d)
            print("\t input[en] is: ", en)
            d = self._condition(
                d,
                masked.conv1d(en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1)))
            print("\t output[d] is: ", d)
            print("\t create _condition for cond_map_{} Done\n".format(i + 1))

            # Gated activation: the first half of the channels goes through
            # a sigmoid, the second half through tanh, and the two halves
            # are multiplied element-wise.
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh
            print("\t d after some cacule:", d)  #shape=(1, 6144, 512)
            print("")

            # Residual path: 1x1 conv of the gated output added onto `l`.
            print("\t create res_{}".format(i + 1))
            print("\t input[d] is: ", d)  #shape=(1, 6144, 512)
            print("\t input[l] is: ", l)  #shape=(1, 6144, 512)
            l += masked.conv1d(d,
                               num_filters=width,
                               filter_length=1,
                               name='res_%d' % (i + 1))  #width=512
            print("\t output[l] is: ", l)  #shape=(1, 6144, 512)
            print("\t create res_{} Done\n".format(i + 1))

            # Skip path: 1x1 conv of the gated output accumulated into `s`.
            print("\t create skip_{}".format(i + 1))
            print("\t input[d] is: ", d)  #shape=(1, 6144, 512)
            print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))  #skip_width=256
            print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
            print("\t create skip_{} Done\n".format(i + 1))

        # Output head: relu -> 1x1 conv ('out1') -> conditioning on `en`
        # -> relu, all applied to the accumulated skip tensor.
        print("\t create Layer relu")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        s = tf.nn.relu(s)
        print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t create Layer relu Done\n")

        print("\t create Layer out1")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        s = masked.conv1d(s,
                          num_filters=skip_width,
                          filter_length=1,
                          name='out1')  #skip_width=256
        print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t create Layer out1 Done\n")

        print("\t create _condition for cond_map_out1")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t input[en] is: ", en)
        s = self._condition(
            s,
            masked.conv1d(
                en,
                num_filters=skip_width,  #skip_width=256
                filter_length=1,
                name='cond_map_out1'))
        print("\t output[s] is: ", s)
        print("\t create _condition for cond_map_out1 Done\n")

        print("\t create Layer relu")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        s = tf.nn.relu(s)
        print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t create Layer relu Done\n")
        print("@build, ##The WaveNet Decoder...Done")

        ###
        # Compute the logits and get the loss.
        ###
        print("@build, ##Compute the logits and get the loss...")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        logits = masked.conv1d(s,
                               num_filters=256,
                               filter_length=1,
                               name='logits')
        print("\t output[logits] is: ", logits)  #shape=(1, 6144, 256)
        # Flatten all leading axes so every row is one 256-way prediction.
        logits = tf.reshape(logits, [-1, 256])
        print("\t logits after reshape: ", logits)  #shape=(6144, 256)
        probs = tf.nn.softmax(logits, name='softmax')
        print("\t probs: ", probs)  #shape=(6144, 256)
        print("\t x_quantized: ", x_quantized)  #
        # Labels are the flattened quantized input shifted by +128
        # (presumably maps [-128, 127] onto class ids [0, 255] -- confirm
        # against utils.mu_law).
        x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
        print("\t x_indices", x_indices)  #shape=(6144,)
        # Mean of the per-row cross-entropy, reduced over axis 0.
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
                              0,
                              name='loss')
        print("@build, ##Compute the logits and get the loss...Done")

        print("@build, Done, return:")
        print("\t probs:", probs)  #shape=(6144, 256)
        print("\t loss:", loss)  #shape=()
        print("\t x_quantized:", x_quantized)  #shape=(1, 6144)
        print("\t encoding:", encoding)  #shape=(1, 12, 16)

        return {
            'predictions': probs,
            'loss': loss,
            'eval': {
                'nll': loss
            },
            'quantized_input': x_quantized,
            'encoding': encoding,
        }