def compute_wavenet_encoder_features(content, style):
    ae_hop_length = 512
    ae_bottleneck_width = 16
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128
    # Encode the source with 8-bit Mu-Law.
    n_frames = content.shape[0]
    n_samples = content.shape[1]
    content_tf = np.ascontiguousarray(content)
    style_tf = np.ascontiguousarray(style)
    g = tf.Graph()
    content_features = []
    style_features = []
    layers = []
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        x = tf.placeholder('float32', [n_frames, n_samples], name="x")
        x_quantized = mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        x_scaled = tf.expand_dims(x_scaled, 2)
        en = masked.conv1d(x_scaled,
                           causal=False,
                           num_filters=ae_width,
                           filter_length=ae_filter_length,
                           name='ae_startconv')
        for num_layer in range(ae_num_layers):
            dilation = 2**(num_layer % ae_num_stages)
            d = tf.nn.relu(en)
            d = masked.conv1d(d,
                              causal=False,
                              num_filters=ae_width,
                              filter_length=ae_filter_length,
                              dilation=dilation,
                              name='ae_dilatedconv_%d' % (num_layer + 1))
            d = tf.nn.relu(d)
            en += masked.conv1d(d,
                                num_filters=ae_width,
                                filter_length=1,
                                name='ae_res_%d' % (num_layer + 1))
            layers.append(en)
        en = masked.conv1d(en,
                           num_filters=ae_bottleneck_width,
                           filter_length=1,
                           name='ae_bottleneck')
        en = masked.pool1d(en, ae_hop_length, name='ae_pool', mode='avg')
        saver = tf.train.Saver()
        saver.restore(sess, './model.ckpt-200000')
        content_features = sess.run(layers, feed_dict={x: content_tf})
        styles = sess.run(layers, feed_dict={x: style_tf})
        for i, style_feature in enumerate(styles):
            n_features = np.prod(layers[i].shape.as_list()[-1])
            features = np.reshape(style_feature, (-1, n_features))
            style_gram = np.matmul(features.T,
                                   features) / (n_samples * n_frames)
            style_features.append(style_gram)
    return content_features, style_features
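# A minimal usage sketch for compute_wavenet_encoder_features above. It assumes
# the snippet's implicit imports (numpy as np, TensorFlow 1.x as tf, plus the
# `masked` ops and `mu_law` helper from Magenta's NSynth WaveNet code) and a
# local checkpoint at './model.ckpt-200000'; the audio shapes are illustrative.
import numpy as np

content_audio = np.random.uniform(-1.0, 1.0, size=(1, 16384)).astype(np.float32)
style_audio = np.random.uniform(-1.0, 1.0, size=(1, 16384)).astype(np.float32)

content_features, style_features = compute_wavenet_encoder_features(
    content_audio, style_audio)
# content_features[i]: activations of encoder layer i for the content clip.
# style_features[i]: Gram matrix of the style clip's activations at layer i.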
Example #2
  def encode(self, inputs, reuse=False):
    ae_num_stages = self.ae_num_stages
    ae_num_layers = self.ae_num_layers
    ae_filter_length = self.ae_filter_length
    ae_width = self.ae_width
    ae_bottleneck_width = self.ae_bottleneck_width

    # Encode the source with 8-bit Mu-Law.
    x = inputs
    tf.logging.info("x shape: %s", str(x.shape.as_list()))
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')

    for num_layer in range(ae_num_layers):
      dilation = 2**(num_layer % ae_num_stages)
      d = tf.nn.relu(en)
      d = masked.conv1d(
          d,
          causal=False,
          num_filters=ae_width,
          filter_length=ae_filter_length,
          dilation=dilation,
          name='ae_dilatedconv_%d' % (num_layer + 1))
      d = tf.nn.relu(d)
      en += masked.conv1d(
          d,
          num_filters=ae_width,
          filter_length=1,
          name='ae_res_%d' % (num_layer + 1))

    en = masked.conv1d(
        en,
        num_filters=ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck')

    # pooling is optional
    # en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')

    return {
        'x_quantized': x_quantized,
        'encoding': en,
    }
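# The x_quantized / 128.0 rescaling above assumes utils.mu_law returns 8-bit
# style integer levels in roughly [-128, 128). A NumPy sketch of mu-law
# companding with that convention (an illustration of the math, not the
# Magenta helper itself):
import numpy as np

def mu_law_sketch(audio, mu=255):
  """Compand audio in [-1, 1] to integer levels in roughly [-128, 128]."""
  companded = np.sign(audio) * np.log1p(mu * np.abs(audio)) / np.log1p(mu)
  return np.floor(companded * 128)

print(mu_law_sketch(np.linspace(-1.0, 1.0, 5)) / 128.0)
# quieter samples get proportionally more of the [-1, 1] output range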
Example #3
  def build(self, inputs, is_training):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
      the 'quantized_input', and whatever metrics we want to track for eval.
    """
    del is_training
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')
Example #4
    def build(self, inputs):
        """Build the graph for this configuration.

        Args:
          inputs: A dict of inputs. For training, should contain 'wav'.

        Returns:
          A dict of outputs that includes the 'predictions',
          'init_ops', the 'push_ops', and the 'quantized_input'.
        """
        num_stages = 10
        num_layers = 30
        filter_length = 3
        width = 512
        skip_width = 256
        num_z = 16

        # Encode the source with 8-bit Mu-Law.
        x = inputs['wav']
        batch_size = self.batch_size
        x_quantized = utils.mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        x_scaled = tf.expand_dims(x_scaled, 2)

        encoding = tf.placeholder(name='encoding',
                                  shape=[batch_size, num_z],
                                  dtype=tf.float32)
        en = tf.expand_dims(encoding, 1)

        init_ops, push_ops = [], []

        ###
        # The WaveNet Decoder.
        ###
        l = x_scaled  # noqa
        l, inits, pushs = utils.causal_linear(x=l,
                                              n_inputs=1,
                                              n_outputs=width,
                                              name='startconv',
                                              rate=1,
                                              batch_size=batch_size,
                                              filter_length=filter_length)

        for init in inits:
            init_ops.append(init)
        for push in pushs:
            push_ops.append(push)

        # Set up skip connections.
        s = utils.linear(l, width, skip_width, name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)

            # dilated masked cnn
            d, inits, pushs = utils.causal_linear(x=l,
                                                  n_inputs=width,
                                                  n_outputs=width * 2,
                                                  name='dilatedconv_%d' %
                                                  (i + 1),
                                                  rate=dilation,
                                                  batch_size=batch_size,
                                                  filter_length=filter_length)

            for init in inits:
                init_ops.append(init)
            for push in pushs:
                push_ops.append(push)

            # local conditioning
            d += utils.linear(en,
                              num_z,
                              width * 2,
                              name='cond_map_%d' % (i + 1))

            # gated cnn
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d = tf.sigmoid(d[:, :, :m]) * tf.tanh(d[:, :, m:])

            # residuals
            l += utils.linear(d, width, width, name='res_%d' % (i + 1))  # noqa

            # skips
            s += utils.linear(d, width, skip_width, name='skip_%d' % (i + 1))

        s = tf.nn.relu(s)
        s = (utils.linear(s, skip_width, skip_width, name='out1') +
             utils.linear(en, num_z, skip_width, name='cond_map_out1'))
        s = tf.nn.relu(s)

        ###
        # Compute the logits and get the loss.
        ###
        logits = utils.linear(s, skip_width, 256, name='logits')
        logits = tf.reshape(logits, [-1, 256])
        probs = tf.nn.softmax(logits, name='softmax')

        return {
            'init_ops': init_ops,
            'push_ops': push_ops,
            'predictions': probs,
            'encoding': encoding,
            'quantized_input': x_quantized,
        }
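# A sketch of how the 'init_ops' / 'push_ops' returned above are used for
# sample-by-sample synthesis, in the style of Magenta's fastgen. Everything
# here other than the returned dict is an assumption, not part of the snippet:
# `sess`, `net` (the dict build() returns), the feedable `wav_placeholder`
# behind inputs['wav'], `encodings`, `hop_length`, `total_length`,
# `batch_size` and `inverse_mu_law`.
import numpy as np

sess.run(net['init_ops'])            # prime the causal-convolution queues once

audio = np.zeros([batch_size, 1], dtype=np.float32)
generated = []
for t in range(total_length):
    enc_frame = encodings[:, t // hop_length, :]   # hold each encoding for one hop
    probs, _ = sess.run(
        [net['predictions'], net['push_ops']],
        feed_dict={wav_placeholder: audio, net['encoding']: enc_frame})
    cdf = np.cumsum(probs, axis=1)                 # probs: (batch_size, 256)
    bins = np.array([np.searchsorted(cdf[b], np.random.rand())
                     for b in range(batch_size)])
    audio = inverse_mu_law(bins - 128).reshape(batch_size, 1).astype(np.float32)
    generated.append(audio)

waveform = np.concatenate(generated, axis=1)       # (batch_size, total_length)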
Example #5
    def build(self,
              inputs,
              is_training,
              rescale_inputs=True,
              include_decoder=True,
              use_reduce_mean_to_pool=False):
        """Build the graph for this configuration.

        Args:
          inputs: A dict of inputs. For training, should contain 'wav'.
          is_training: Whether we are training or not. Not used in this config.
          rescale_inputs: Whether to convert inputs to mu-law and back to unit
            scaling before passing through the model (loses gradients).
          include_decoder: bool, whether to include the decoder in the build().
          use_reduce_mean_to_pool: whether to use reduce_mean (instead of pool1d)
            for pooling.
        Returns:
          A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
          the 'quantized_input', and whatever metrics we want to track for eval.
        """
        num_stages = 10
        num_layers = 30
        filter_length = 3
        width = 512
        skip_width = 256
        ae_num_stages = 10
        ae_num_layers = 30
        ae_filter_length = 3
        ae_width = 128

        # Encode the source with 8-bit Mu-Law.
        x = inputs['wav']
        x_quantized = utils.mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        x_scaled = tf.expand_dims(x_scaled, 2)
        x = tf.expand_dims(x, 2)

        ###
        # The Non-Causal Temporal Encoder.
        ###
        en = masked.conv1d(x_scaled if rescale_inputs else x,
                           causal=False,
                           num_filters=ae_width,
                           filter_length=ae_filter_length,
                           name='ae_startconv',
                           is_training=is_training)

        for num_layer in range(ae_num_layers):
            dilation = 2**(num_layer % ae_num_stages)
            d = tf.nn.relu(en)
            d = masked.conv1d(d,
                              causal=False,
                              num_filters=ae_width,
                              filter_length=ae_filter_length,
                              dilation=dilation,
                              name='ae_dilatedconv_%d' % (num_layer + 1),
                              is_training=is_training)
            d = tf.nn.relu(d)
            en += masked.conv1d(d,
                                num_filters=ae_width,
                                filter_length=1,
                                name='ae_res_%d' % (num_layer + 1),
                                is_training=is_training)

        en = masked.conv1d(en,
                           num_filters=self.ae_bottleneck_width,
                           filter_length=1,
                           name='ae_bottleneck',
                           is_training=is_training)

        if use_reduce_mean_to_pool:
            # Depending on the accelerator used for training, masked.pool1d may
            # lead to out of memory error.
            # reduce_mean is equivalent to masked.pool1d when the stride is the same
            # as the window length (which is the case here).
            batch_size, unused_length, depth = en.shape.as_list()
            en = tf.reshape(en, [batch_size, -1, self.ae_hop_length, depth])
            en = tf.reduce_mean(en, axis=2)
        else:
            en = masked.pool1d(en,
                               self.ae_hop_length,
                               name='ae_pool',
                               mode='avg')
        encoding = en

        if not include_decoder:
            return {'encoding': encoding}

        ###
        # The WaveNet Decoder.
        ###
        l = masked.shift_right(x_scaled if rescale_inputs else x)  # noqa
        l = masked.conv1d(  # noqa
            l,
            num_filters=width,
            filter_length=filter_length,
            name='startconv',
            is_training=is_training)

        # Set up skip connections.
        s = masked.conv1d(l,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start',
                          is_training=is_training)

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(l,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1),
                              is_training=is_training)
            d = self._condition(
                d,
                masked.conv1d(en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1),
                              is_training=is_training))

            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            l += masked.conv1d(  # noqa
                d,
                num_filters=width,
                filter_length=1,
                name='res_%d' % (i + 1),
                is_training=is_training)
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1),
                               is_training=is_training)

        s = tf.nn.relu(s)
        s = masked.conv1d(s,
                          num_filters=skip_width,
                          filter_length=1,
                          name='out1',
                          is_training=is_training)
        s = self._condition(
            s,
            masked.conv1d(en,
                          num_filters=skip_width,
                          filter_length=1,
                          name='cond_map_out1',
                          is_training=is_training))
        s = tf.nn.relu(s)

        ###
        # Compute the logits and get the loss.
        ###
        logits = masked.conv1d(s,
                               num_filters=256,
                               filter_length=1,
                               name='logits',
                               is_training=is_training)
        logits = tf.reshape(logits, [-1, 256])
        probs = tf.nn.softmax(logits, name='softmax')
        x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
                              0,
                              name='loss')

        return {
            'predictions': probs,
            'loss': loss,
            'eval': {
                'nll': loss
            },
            'quantized_input': x_quantized,
            'encoding': encoding,
        }
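# A quick NumPy check of the use_reduce_mean_to_pool comment above: when the
# pooling stride equals the window (ae_hop_length), average pooling reduces to
# a reshape plus a mean over the window axis. Shapes here are illustrative.
import numpy as np

batch, length, depth, hop = 1, 8, 2, 4
x = np.arange(batch * length * depth, dtype=np.float32).reshape(batch, length, depth)

pooled_by_mean = x.reshape(batch, -1, hop, depth).mean(axis=2)
pooled_by_window = np.stack(
    [x[:, i:i + hop].mean(axis=1) for i in range(0, length, hop)], axis=1)

assert np.allclose(pooled_by_mean, pooled_by_window)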
Example #6
  def build(self, inputs):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.

    Returns:
      A dict of outputs that includes the 'predictions',
      'init_ops', the 'push_ops', and the 'quantized_input'.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    num_z = 16

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    batch_size = self.batch_size
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    encoding = tf.placeholder(
        name='encoding', shape=[batch_size, num_z], dtype=tf.float32)
    en = tf.expand_dims(encoding, 1)

    init_ops, push_ops = [], []

    ###
    # The WaveNet Decoder.
    ###
    l = x_scaled
    l, inits, pushs = utils.causal_linear(
        x=l,
        n_inputs=1,
        n_outputs=width,
        name='startconv',
        rate=1,
        batch_size=batch_size,
        filter_length=filter_length)

    for init in inits:
      init_ops.append(init)
    for push in pushs:
      push_ops.append(push)

    # Set up skip connections.
    s = utils.linear(l, width, skip_width, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
      dilation = 2**(i % num_stages)

      # dilated masked cnn
      d, inits, pushs = utils.causal_linear(
          x=l,
          n_inputs=width,
          n_outputs=width * 2,
          name='dilatedconv_%d' % (i + 1),
          rate=dilation,
          batch_size=batch_size,
          filter_length=filter_length)

      for init in inits:
        init_ops.append(init)
      for push in pushs:
        push_ops.append(push)

      # local conditioning
      d += utils.linear(en, num_z, width * 2, name='cond_map_%d' % (i + 1))

      # gated cnn
      assert d.get_shape().as_list()[2] % 2 == 0
      m = d.get_shape().as_list()[2] // 2
      d = tf.sigmoid(d[:, :, :m]) * tf.tanh(d[:, :, m:])

      # residuals
      l += utils.linear(d, width, width, name='res_%d' % (i + 1))

      # skips
      s += utils.linear(d, width, skip_width, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = (utils.linear(s, skip_width, skip_width, name='out1') + utils.linear(
        en, num_z, skip_width, name='cond_map_out1'))
    s = tf.nn.relu(s)

    ###
    # Compute the logits and get the loss.
    ###
    logits = utils.linear(s, skip_width, 256, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    return {
        'init_ops': init_ops,
        'push_ops': push_ops,
        'predictions': probs,
        'encoding': encoding,
        'quantized_input': x_quantized,
    }
Example #7
  def build(self, inputs, is_training):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
      the 'quantized_input', and whatever metrics we want to track for eval.
    """
    del is_training
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')

    for num_layer in range(ae_num_layers):
      dilation = 2**(num_layer % ae_num_stages)
      d = tf.nn.relu(en)
      d = masked.conv1d(
          d,
          causal=False,
          num_filters=ae_width,
          filter_length=ae_filter_length,
          dilation=dilation,
          name='ae_dilatedconv_%d' % (num_layer + 1))
      d = tf.nn.relu(d)
      en += masked.conv1d(
          d,
          num_filters=ae_width,
          filter_length=1,
          name='ae_res_%d' % (num_layer + 1))

    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck')
    en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')
    encoding = en

    ###
    # The WaveNet Decoder.
    ###
    l = masked.shift_right(x_scaled)
    l = masked.conv1d(
        l, num_filters=width, filter_length=filter_length, name='startconv')

    # Set up skip connections.
    s = masked.conv1d(
        l, num_filters=skip_width, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
      dilation = 2**(i % num_stages)
      d = masked.conv1d(
          l,
          num_filters=2 * width,
          filter_length=filter_length,
          dilation=dilation,
          name='dilatedconv_%d' % (i + 1))
      d = self._condition(d,
                          masked.conv1d(
                              en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1)))

      assert d.get_shape().as_list()[2] % 2 == 0
      m = d.get_shape().as_list()[2] // 2
      d_sigmoid = tf.sigmoid(d[:, :, :m])
      d_tanh = tf.tanh(d[:, :, m:])
      d = d_sigmoid * d_tanh

      l += masked.conv1d(
          d, num_filters=width, filter_length=1, name='res_%d' % (i + 1))
      s += masked.conv1d(
          d, num_filters=skip_width, filter_length=1, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = masked.conv1d(s, num_filters=skip_width, filter_length=1, name='out1')
    s = self._condition(s,
                        masked.conv1d(
                            en,
                            num_filters=skip_width,
                            filter_length=1,
                            name='cond_map_out1'))
    s = tf.nn.relu(s)

    ###
    # Compute the logits and get the loss.
    ###
    logits = masked.conv1d(s, num_filters=256, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
        0,
        name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }
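# The _condition(...) calls above add the low-rate encoding back onto the
# audio-rate decoder activations. The helper itself is not shown in the
# snippet; a sketch along the lines of Magenta's implementation (in the
# snippet it is a method, so it would also take `self`):
def _condition(x, encoding):
  """Broadcast-add encoding (mb, enc_length, channels) onto x (mb, length, channels)."""
  mb, length, channels = x.get_shape().as_list()
  enc_mb, enc_length, enc_channels = encoding.get_shape().as_list()
  assert enc_mb == mb
  assert enc_channels == channels

  encoding = tf.reshape(encoding, [mb, enc_length, 1, channels])
  x = tf.reshape(x, [mb, enc_length, -1, channels])  # group samples by hop
  x += encoding                                      # broadcast over each hop
  x = tf.reshape(x, [mb, length, channels])
  return x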
Example #8
    def build(self, inputs, is_training):
        """Build the graph for this configuration.

        Parameters
        ----------
        inputs
            A dict of inputs. For training, should contain 'wav'.
        is_training
            Whether we are training or not. Not used in this config.

        Returns
        -------
        A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
        the 'quantized_input', and whatever metrics we want to track for eval.
        """
        del is_training
        num_stages = 10
        num_layers = 30
        filter_length = 3
        width = 512
        skip_width = 256
        ae_num_stages = 10
        ae_num_layers = 30
        ae_filter_length = 3
        ae_width = 128

        # Encode the source with 8-bit Mu-Law.
        x = inputs['wav']
        x_quantized = utils.mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        x_scaled = tf.expand_dims(x_scaled, 2)

        if self.encoding:
            ###
            # The Non-Causal Temporal Encoder.
            ###
            en = masked.conv1d(
                x_scaled,
                causal=False,
                num_filters=ae_width,
                filter_length=ae_filter_length,
                name='ae_startconv')

            for num_layer in range(ae_num_layers):
                dilation = 2**(num_layer % ae_num_stages)
                d = tf.nn.relu(en)
                d = masked.conv1d(
                    d,
                    causal=False,
                    num_filters=ae_width,
                    filter_length=ae_filter_length,
                    dilation=dilation,
                    name='ae_dilatedconv_%d' % (num_layer + 1))
                d = tf.nn.relu(d)
                en += masked.conv1d(
                    d,
                    num_filters=ae_width,
                    filter_length=1,
                    name='ae_res_%d' % (num_layer + 1))

            en = masked.conv1d(
                en,
                num_filters=self.ae_bottleneck_width,
                filter_length=1,
                name='ae_bottleneck')
            en = masked.pool1d(
                en, self.ae_hop_length, name='ae_pool', mode='avg')
            encoding = en
        else:
            encoding = en = tf.placeholder(
                name='ae_pool', shape=[1, 125, 16], dtype=tf.float32)

        ###
        # The WaveNet Decoder.
        ###
        l = masked.shift_right(x_scaled)
        l = masked.conv1d(
            l, num_filters=width, filter_length=filter_length, name='startconv')

        # Set up skip connections.
        s = masked.conv1d(
            l, num_filters=skip_width, filter_length=1, name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(
                l,
                num_filters=2 * width,
                filter_length=filter_length,
                dilation=dilation,
                name='dilatedconv_%d' % (i + 1))
            d = self._condition(d,
                                masked.conv1d(
                                    en,
                                    num_filters=2 * width,
                                    filter_length=1,
                                    name='cond_map_%d' % (i + 1)))

            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            l += masked.conv1d(
                d, num_filters=width, filter_length=1, name='res_%d' % (i + 1))
            s += masked.conv1d(
                d,
                num_filters=skip_width,
                filter_length=1,
                name='skip_%d' % (i + 1))

        s = tf.nn.relu(s)
        s = masked.conv1d(
            s, num_filters=skip_width, filter_length=1, name='out1')
        s = self._condition(s,
                            masked.conv1d(
                                en,
                                num_filters=skip_width,
                                filter_length=1,
                                name='cond_map_out1'))
        s = tf.nn.relu(s)

        ###
        # Compute the logits and get the loss.
        ###
        logits = masked.conv1d(
            s, num_filters=256, filter_length=1, name='logits')
        logits = tf.reshape(logits, [-1, 256])
        probs = tf.nn.softmax(logits, name='softmax')
        x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=x_indices, name='nll'),
            0,
            name='loss')

        return {
            'predictions': probs,
            'loss': loss,
            'eval': {
                'nll': loss
            },
            'quantized_input': x_quantized,
            'encoding': encoding,
        }
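# When self.encoding is False, the build above swaps the encoder for a
# placeholder named 'ae_pool' of shape [1, 125, 16]: 125 pooled frames of the
# 16-dim bottleneck, i.e. 4 s of 16 kHz audio at a hop of 512 (4 * 16000 / 512
# = 125). In that case net['encoding'] is the placeholder itself, so a
# precomputed encoding can be fed directly. `sess`, `net` (the dict returned
# by build), `wav_placeholder`, `teacher_audio` and `precomputed_encoding` are
# assumptions here:
probs = sess.run(
    net['predictions'],
    feed_dict={wav_placeholder: teacher_audio,          # drives the shifted decoder input
               net['encoding']: precomputed_encoding})  # shape [1, 125, 16]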
Example #9
  def build(self, inputs, is_training):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
      the 'quantized_input', and whatever metrics we want to track for eval.
    """
    del is_training
    num_stages = self.num_stages
    num_layers = self.num_layers
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = self.ae_num_stages
    ae_num_layers = self.ae_num_layers
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)

    if self.iw > 1:
      x_scaled = self._duplicate(x_scaled, self.iw)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv')

    for num_layer in range(ae_num_layers):
      dilation = 2**(num_layer % ae_num_stages)
      d = tf.nn.relu(en)
      d = masked.conv1d(
          d,
          causal=False,
          num_filters=ae_width,
          filter_length=ae_filter_length,
          dilation=dilation,
          name='ae_dilatedconv_%d' % (num_layer + 1))
      d = tf.nn.relu(d)
      en += masked.conv1d(
          d,
          num_filters=ae_width,
          filter_length=1,
          name='ae_res_%d' % (num_layer + 1))

    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck')
    en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')

    # divide encoding into "mean" and "variance"
    mn, v = self._gaussian_parameters(en)

    # flatten "mean" and "var"
    m_shape = mn.get_shape().as_list()
    v_shape = v.get_shape().as_list()

    mn = tf.reshape(mn, (-1, m_shape[-2]*m_shape[-1]))
    v = tf.reshape(v, (-1, v_shape[-2]*v_shape[-1]))

    # reparameterization trick
    en = self._sample_gaussian(mn, v)

    # reshape into original embedding shape
    en = tf.reshape(en, (-1, m_shape[-2], m_shape[-1]))

    encoding = en


    ###
    # The WaveNet Decoder.
    ###
    # Randomly mask the teacher-forced decoder input: each shifted sample is
    # kept with probability `self.dropout` (Bernoulli mask), otherwise zeroed.
    dropout_mask = tf.distributions.Bernoulli(
        probs=tf.to_float(self.dropout),
        dtype=tf.float32).sample(sample_shape=tf.shape(x_scaled))
    l = tf.math.multiply(masked.shift_right(x_scaled), dropout_mask)
    l = masked.conv1d(
        l, num_filters=width, filter_length=filter_length, name='startconv')

    # Set up skip connections.
    s = masked.conv1d(
        l, num_filters=skip_width, filter_length=1, name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
      dilation = 2**(i % num_stages)
      d = masked.conv1d(
          l,
          num_filters=2 * width,
          filter_length=filter_length,
          dilation=dilation,
          name='dilatedconv_%d' % (i + 1))
      d = self._condition(d,
                          masked.conv1d(
                              en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1)))

      assert d.get_shape().as_list()[2] % 2 == 0
      m = d.get_shape().as_list()[2] // 2
      d_sigmoid = tf.sigmoid(d[:, :, :m])
      d_tanh = tf.tanh(d[:, :, m:])
      d = d_sigmoid * d_tanh

      l += masked.conv1d(
          d, num_filters=width, filter_length=1, name='res_%d' % (i + 1))
      s += masked.conv1d(
          d, num_filters=skip_width, filter_length=1, name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = masked.conv1d(s, num_filters=skip_width, filter_length=1, name='out1')
    s = self._condition(s,
                        masked.conv1d(
                            en,
                            num_filters=skip_width,
                            filter_length=1,
                            name='cond_map_out1'))
    s = tf.nn.relu(s)

    if self.aux > 0:
      en_logits = masked.conv1d(
                            en,
                            num_filters=skip_width,
                            filter_length=1,
                            name='cond_map_rec')
      enc_mb, enc_length, enc_channels = en_logits.get_shape().as_list()
      mb, length, channels = s.get_shape().as_list()
      assert enc_mb == mb
      assert enc_channels == channels

      en_logits = tf.nn.relu(en_logits)
      en_logits = tf.reshape(en_logits, [mb, enc_length, 1, channels])

      _, _, reps, _ = tf.reshape(s, [mb, enc_length, -1, channels]).get_shape().as_list()

      en_logits = tf.tile(en_logits, [1, 1, reps, 1])
      en_logits = tf.reshape(en_logits, [mb, length, channels])
      en_logits = masked.conv1d(en_logits, num_filters=256, filter_length=1, name='en_logits')
      en_logits = tf.reshape(en_logits, [-1, 256])
      en_probs = tf.nn.softmax(en_logits, name='en_softmax')


    ###
    # Compute the logits and get the loss.
    ###
    logits = masked.conv1d(s, num_filters=256, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128

    rec = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
        0,
        name='loss')

    if self.aux > 0:
      aux = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=en_logits, labels=x_indices, name='en_nll'),
        0,
        name='aux')
    else:
      aux = 0


    kl = tf.reduce_mean(self._kl_normal(mn, v, tf.zeros(1), tf.ones(1)), name='kl')

    return {
        'predictions': probs,
        'loss': {
            'kl': kl,
            'rec': rec,
            'aux': aux},
        'eval': {
            'kl': kl,
            'rec': rec
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }
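# Example #9 relies on three helpers the snippet does not define:
# _gaussian_parameters, _sample_gaussian (the reparameterization trick) and
# _kl_normal (KL divergence between diagonal Gaussians). A sketch of what they
# could look like, assuming `v` holds variances; in the snippet they are
# methods, so they would also take `self`:
def _gaussian_parameters(h):
  # Split the bottleneck channels into a mean half and a variance half.
  mn, h_v = tf.split(h, 2, axis=-1)
  v = tf.nn.softplus(h_v) + 1e-8      # keep variances strictly positive
  return mn, v

def _sample_gaussian(mn, v):
  # z = mean + std * eps with eps ~ N(0, I); gradients flow through mn and v.
  eps = tf.random_normal(tf.shape(mn))
  return mn + tf.sqrt(v) * eps

def _kl_normal(qm, qv, pm, pv):
  # KL(N(qm, qv) || N(pm, pv)) for diagonal Gaussians, summed over the last axis.
  per_dim = 0.5 * (
      tf.log(pv) - tf.log(qv) + qv / pv + tf.square(qm - pm) / pv - 1.0)
  return tf.reduce_sum(per_dim, axis=-1)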
Example #10
def compute_wavenet_decoder_features(content, style):
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    # Encode the source with 8-bit Mu-Law.
    n_frames = content.shape[0]
    n_samples = content.shape[1]
    content_tf = np.ascontiguousarray(content)
    style_tf = np.ascontiguousarray(style)
    g = tf.Graph()
    content_features = []
    style_features = []
    layers = []
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        x = tf.placeholder('float32', [n_frames, n_samples], name="x")
        x_quantized = mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        x_scaled = tf.expand_dims(x_scaled, 2)
        layer = x_scaled
        layer = masked.conv1d(layer,
                              num_filters=width,
                              filter_length=filter_length,
                              name='startconv')

        # Set up skip connections.
        s = masked.conv1d(layer,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(layer,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1))
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            layer += masked.conv1d(d,
                                   num_filters=width,
                                   filter_length=1,
                                   name='res_%d' % (i + 1))
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))
            layers.append(s)

        saver = tf.train.Saver()
        saver.restore(sess, './model.ckpt-200000')
        content_features = sess.run(layers, feed_dict={x: content_tf})
        styles = sess.run(layers, feed_dict={x: style_tf})
        for i, style_feature in enumerate(styles):
            n_features = np.prod(layers[i].shape.as_list()[-1])
            features = np.reshape(style_feature, (-1, n_features))
            style_gram = np.matmul(features.T,
                                   features) / (n_samples * n_frames)
            style_features.append(style_gram)
    return content_features, style_features
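# How the returned features are typically consumed: the per-layer content
# activations feed an L2 content loss and the Gram matrices feed a Gatys-style
# style loss on the signal being synthesized. A sketch; `synth_layers` and
# `synth_grams` (the same quantities computed for the synthesized audio) and
# the unit weighting are assumptions:
def style_transfer_losses(content_features, style_features,
                          synth_layers, synth_grams):
    content_loss = 0.0
    style_loss = 0.0
    for i in range(len(synth_layers)):
        content_loss += tf.reduce_mean(
            tf.squared_difference(synth_layers[i], content_features[i]))
        style_loss += tf.reduce_mean(
            tf.squared_difference(synth_grams[i], style_features[i]))
    return content_loss, style_loss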
Example #11
    def build(self, inputs, is_training):
        """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
      the 'quantized_input', and whatever metrics we want to track for eval.
    """
        del is_training
        num_stages = 10
        num_layers = 30
        filter_length = 3
        width = 512
        skip_width = 256
        ae_num_stages = 10
        ae_num_layers = 30
        ae_filter_length = 3
        ae_width = 128

        print("@build, inputs: ",
              inputs)  #pitch shape=(1,),  wav shape=(1, 6144), key shape=(1,)
        # Encode the source with 8-bit Mu-Law.
        x = inputs['wav']
        print("@build, x: ", x)  #shape=(1, 6144)
        x_quantized = utils.mu_law(x)
        print("@build, x_quantized: ", x_quantized)  #shape=(1, 6144)
        x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
        print("@build, x_scaled@1: ", x_scaled)  #shape=(1, 6144)
        x_scaled = tf.expand_dims(x_scaled, 2)
        print("@build, x_scaled@2: ", x_scaled)  #shape=(1, 6144, 1)

        ###
        # The Non-Causal Temporal Encoder.
        ###
        print("@build, ##Non-Causal Temporal Encoder...")
        print("\t create Layer ae_startconv")
        print("\t input[x_scaled] is: ", x_scaled)
        en = masked.conv1d(
            x_scaled,
            causal=False,
            num_filters=ae_width,  #ae_width = 128
            filter_length=ae_filter_length,
            name='ae_startconv')
        print("\t ae_startconv output [en] is:", en)  #shape=(1. 6144, 128)
        print("\t create Layer ae_startconv Done\n")

        for num_layer in range(ae_num_layers):
            dilation = 2**(num_layer % ae_num_stages)
            print("\t create Layer relu")
            print("\t input[en] is: ", en)  #shape=(1. 6144, 128)
            d = tf.nn.relu(en)
            print("\t relu output [d] is:", d)
            print("\t create Layer relu Done\n")

            print("\t create Layer ae_dilatedconv_{}, dilation={}".format(
                num_layer + 1, dilation))
            print("\t input[d] is: ", d)
            d = masked.conv1d(
                d,
                causal=False,
                num_filters=ae_width,  #128
                filter_length=ae_filter_length,
                dilation=dilation,
                name='ae_dilatedconv_%d' % (num_layer + 1))
            print("\t output [d] is:", d)
            print(
                "\t create Layer ae_dilatedconv_{}, dilation={} Done\n".format(
                    num_layer + 1, dilation))

            print("\t create Layer relu")
            print("\t input[d] is: ", d)
            d = tf.nn.relu(d)
            print("\t relu output [d] is:", d)
            print("\t create Layer relu Done\n")

            print("\t create Layer ae_res_{}".format(num_layer + 1))
            print("\t input[en] is: ", en)
            print("\t input[d] is: ", d)
            en += masked.conv1d(
                d,
                num_filters=ae_width,  #128
                filter_length=1,
                name='ae_res_%d' % (num_layer + 1))
            print("\t output [en] is:", en)  #shape=(1, 6144, 128)
            print("\t create Layer ae_res_{} Done\n".format(num_layer + 1))

        print("\t create Layer ae_bottleneck")
        print("\t input[en] is: ", en)  #shape=(1, 6144, 128)
        en = masked.conv1d(
            en,
            num_filters=self.ae_bottleneck_width,  #16
            filter_length=1,
            name='ae_bottleneck')
        print("\t output[en] is: ", en)  #shape=(1, 6144, 16)
        print("\t create Layer ae_bottleneck Done\n")

        print("\t create ae_pool")
        print("\t input[en] is: ", en)  #shape=(1, 6144, 16)
        en = masked.pool1d(en, self.ae_hop_length, name='ae_pool',
                           mode='avg')  #ae_hop_length=512
        print("\t output[en] is: ", en)  #shape=(1, 12, 16) #6144/512=12
        print("\t create ae_pool Done\n")

        encoding = en  # the encoding is the feature vector: shape (125, 16) per 4 seconds of 16 kHz audio, since 4 * 16000 / 512 = 125; the 6144-sample input here yields 12 frames.
        print("\t ##Non-Causal Temporal Encoder output[en|encoding] is: ",
              encoding)
        print("@build, ##Non-Causal Temporal Encoder...Done\n")

        ###
        # The WaveNet Decoder.
        ###
        print("@build, ##The WaveNet Decoder...")
        print("\t input[x_scaled] is: ", x_scaled)  #shape=(1, 6144, 1)
        l = masked.shift_right(x_scaled)
        print("\t create startconv")
        print("\t input[l] is: ", l)  #shape=(1, 6144, 1)
        l = masked.conv1d(l,
                          num_filters=width,
                          filter_length=filter_length,
                          name='startconv')  #width=512
        print("\t output[l] is: ", l)  #shape=(1, 6144, 512)
        print("\t create startconv Done\n")

        # Set up skip connections.
        print("\t create skip_start")
        print("\t input[l] is: ", l)
        s = masked.conv1d(l,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')  #skip_width=256
        print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t create skip_start Done\n")

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            print("\t create dilatedconv_{}, dilation={}".format(
                i + 1, dilation))
            print("\t input[l] is: ", l)
            d = masked.conv1d(l,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1))
            print("\t output[d] is: ", d)  #shape=(1, 6144, 1024)
            print("\t create dilatedconv_{}, dilation={} Done\n".format(
                i + 1, dilation))

            print("\t create _condition for cond_map_{}".format(i + 1))
            print("\t input[d] is: ", d)
            print("\t input[en] is: ", en)
            d = self._condition(
                d,
                masked.conv1d(en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1)))
            print("\t output[d] is: ", d)
            print("\t create _condition for cond_map_{} Done\n".format(i + 1))

            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh
            print("\t d after some cacule:", d)  #shape=(1, 6144, 512)
            print("")

            print("\t create res_{}".format(i + 1))
            print("\t input[d] is: ", d)  #shape=(1, 6144, 512)
            print("\t input[l] is: ", l)  #shape=(1, 6144, 512)
            l += masked.conv1d(d,
                               num_filters=width,
                               filter_length=1,
                               name='res_%d' % (i + 1))  #width=512
            print("\t output[l] is: ", l)  #shape=(1, 6144, 512)
            print("\t create res_{} Done\n".format(i + 1))

            print("\t create skip_{}".format(i + 1))
            print("\t input[d] is: ", d)  #shape=(1, 6144, 512)
            print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))  #skip_width=256
            print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
            print("\t create skip_{} Done\n".format(i + 1))

        print("\t create Layer relu")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        s = tf.nn.relu(s)
        print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t create Layer relu Done\n")

        print("\t create Layer out1")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        s = masked.conv1d(s,
                          num_filters=skip_width,
                          filter_length=1,
                          name='out1')  #skip_width=256
        print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t create Layer out1 Done\n")

        print("\t create _condition for cond_map_out1")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t input[en] is: ", en)
        s = self._condition(
            s,
            masked.conv1d(
                en,
                num_filters=skip_width,  #skip_width=256
                filter_length=1,
                name='cond_map_out1'))
        print("\t output[s] is: ", s)
        print("\t create _condition for cond_map_out1 Done\n")

        print("\t create Layer relu")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        s = tf.nn.relu(s)
        print("\t output[s] is: ", s)  #shape=(1, 6144, 256)
        print("\t create Layer relu Done\n")
        print("@build, ##The WaveNet Decoder...Done")

        ###
        # Compute the logits and get the loss.
        ###
        print("@build, ##Compute the logits and get the loss...")
        print("\t input[s] is: ", s)  #shape=(1, 6144, 256)
        logits = masked.conv1d(s,
                               num_filters=256,
                               filter_length=1,
                               name='logits')
        print("\t output[logits] is: ", logits)  #shape=(1, 6144, 256)
        logits = tf.reshape(logits, [-1, 256])
        print("\t logits after reshape: ", logits)  #shape=(6144, 256)
        probs = tf.nn.softmax(logits, name='softmax')
        print("\t probs: ", probs)  #shape=(6144, 256)
        print("\t x_quantized: ", x_quantized)  #
        x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
        print("\t x_indices", x_indices)  #shape=(6144,)
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
                              0,
                              name='loss')
        print("@build, ##Compute the logits and get the loss...Done")

        print("@build, Done, return:")
        print("\t probs:", probs)  #shape=(6144, 256)
        print("\t loss:", loss)  #shape=()
        print("\t x_quantized:", x_quantized)  #shape=(1, 6144)
        print("\t encoding:", encoding)  #shape=(1, 12, 16)

        return {
            'predictions': probs,
            'loss': loss,
            'eval': {
                'nll': loss
            },
            'quantized_input': x_quantized,
            'encoding': encoding,
        }