def compute_wavenet_encoder_features(content, style):
    """Collect per-layer encoder activations for content and style audio.

    Rebuilds the non-causal dilated-convolution encoder on the CPU, restores
    its weights from ``./model.ckpt-200000`` and feeds both inputs through it.

    Args:
        content: Array indexed as ``[frame, sample]``; its first two
            dimensions fix the shape of the input placeholder.
        style: Array fed through the same placeholder, so it has to match
            the shape of ``content``.

    Returns:
        A ``(content_features, style_features)`` tuple.  ``content_features``
        holds the raw activation of every residual layer for ``content``.
        ``style_features`` holds, for every residual layer, the Gram matrix
        of the ``style`` activation divided by ``n_samples * n_frames``.
    """
    hop_length = 512
    bottleneck_width = 16
    n_stages = 10
    n_layers = 30
    kernel_size = 3
    n_channels = 128

    n_frames, n_samples = content.shape[0], content.shape[1]
    content_batch = np.ascontiguousarray(content)
    style_batch = np.ascontiguousarray(style)

    graph = tf.Graph()
    taps = []
    with graph.as_default(), graph.device('/cpu:0'), tf.Session() as sess:
        x = tf.placeholder('float32', [n_frames, n_samples], name="x")

        # Quantize with mu-law, divide by 128 and add a trailing channel
        # axis.
        quantized = mu_law(x)
        net_input = tf.expand_dims(
            tf.cast(quantized, tf.float32) / 128.0, 2)

        hidden = masked.conv1d(net_input,
                               causal=False,
                               num_filters=n_channels,
                               filter_length=kernel_size,
                               name='ae_startconv')

        # Layer numbers are 1-based because that is how the variables are
        # named; the dilation cycles through 1, 2, 4, ... every n_stages.
        for layer_no in range(1, n_layers + 1):
            dilation = 2**((layer_no - 1) % n_stages)
            branch = masked.conv1d(tf.nn.relu(hidden),
                                   causal=False,
                                   num_filters=n_channels,
                                   filter_length=kernel_size,
                                   dilation=dilation,
                                   name='ae_dilatedconv_%d' % layer_no)
            branch = masked.conv1d(tf.nn.relu(branch),
                                   num_filters=n_channels,
                                   filter_length=1,
                                   name='ae_res_%d' % layer_no)
            hidden = hidden + branch
            taps.append(hidden)

        # The bottleneck and pooling ops are not fetched below, but they are
        # still built before the Saver is created, as in the full encoder.
        bottleneck = masked.conv1d(hidden,
                                   num_filters=bottleneck_width,
                                   filter_length=1,
                                   name='ae_bottleneck')
        masked.pool1d(bottleneck, hop_length, name='ae_pool', mode='avg')

        tf.train.Saver().restore(sess, './model.ckpt-200000')

        content_features = sess.run(taps, feed_dict={x: content_batch})
        style_activations = sess.run(taps, feed_dict={x: style_batch})

    # Style is summarised by the channel-by-channel Gram matrix of each layer.
    style_features = []
    for tap, activation in zip(taps, style_activations):
        n_features = np.prod(tap.shape.as_list()[-1])
        flat = np.reshape(activation, (-1, n_features))
        style_features.append(
            np.matmul(flat.T, flat) / (n_samples * n_frames))

    return content_features, style_features
def build(self,
          inputs,
          is_training,
          rescale_inputs=True,
          include_decoder=True,
          use_reduce_mean_to_pool=False):
    """Assemble the WaveNet autoencoder graph.

    Args:
      inputs: Mapping of input tensors; the waveform is read from 'wav'.
      is_training: Forwarded unchanged to every `masked.conv1d` call.
      rescale_inputs: If true, the networks are fed the mu-law quantized
        signal divided by 128 (per the original note, this loses gradients);
        otherwise they are fed the raw waveform.
      include_decoder: If false, stop after the encoder.
      use_reduce_mean_to_pool: If true, pool the encoder output with a
        reshape followed by `tf.reduce_mean` instead of `masked.pool1d`.

    Returns:
      With `include_decoder` false, a dict holding only 'encoding'.
      Otherwise a dict with 'predictions' (softmax over 256 classes), 'loss',
      'eval' (a dict holding 'nll', which is the same tensor as 'loss'),
      'quantized_input' and 'encoding'.
    """
    decoder_stages = 10
    decoder_layers = 30
    decoder_kernel = 3
    residual_width = 512
    skip_channels = 256

    encoder_stages = 10
    encoder_layers = 30
    encoder_kernel = 3
    encoder_width = 128

    def conv(tensor, **kwargs):
        # Every convolution in this graph shares the same is_training flag.
        return masked.conv1d(tensor, is_training=is_training, **kwargs)

    # Quantize the source with mu-law, then prepare both candidate inputs
    # (rescaled and raw) with a trailing channel axis.
    wav = inputs['wav']
    quantized = utils.mu_law(wav)
    scaled = tf.expand_dims(tf.cast(quantized, tf.float32) / 128.0, 2)
    wav = tf.expand_dims(wav, 2)
    net_input = scaled if rescale_inputs else wav

    # ---- Non-causal temporal encoder ----
    enc = conv(net_input,
               causal=False,
               num_filters=encoder_width,
               filter_length=encoder_kernel,
               name='ae_startconv')

    # Block numbers are 1-based to match the variable names.
    for block in range(1, encoder_layers + 1):
        rate = 2**((block - 1) % encoder_stages)
        branch = conv(tf.nn.relu(enc),
                      causal=False,
                      num_filters=encoder_width,
                      filter_length=encoder_kernel,
                      dilation=rate,
                      name='ae_dilatedconv_%d' % block)
        enc = enc + conv(tf.nn.relu(branch),
                         num_filters=encoder_width,
                         filter_length=1,
                         name='ae_res_%d' % block)

    enc = conv(enc,
               num_filters=self.ae_bottleneck_width,
               filter_length=1,
               name='ae_bottleneck')

    if not use_reduce_mean_to_pool:
        enc = masked.pool1d(enc, self.ae_hop_length, name='ae_pool',
                            mode='avg')
    else:
        # Original note: masked.pool1d may run out of memory on some
        # accelerators, and reduce_mean is equivalent to it when the stride
        # equals the window length, which is the case here.
        batch, _, channels = enc.shape.as_list()
        windows = tf.reshape(enc, [batch, -1, self.ae_hop_length, channels])
        enc = tf.reduce_mean(windows, axis=2)

    encoding = enc
    if not include_decoder:
        return {'encoding': encoding}

    # ---- WaveNet decoder ----
    res = conv(masked.shift_right(net_input),
               num_filters=residual_width,
               filter_length=decoder_kernel,
               name='startconv')

    # Accumulator for the skip connections.
    skip = conv(res,
                num_filters=skip_channels,
                filter_length=1,
                name='skip_start')

    for block in range(1, decoder_layers + 1):
        rate = 2**((block - 1) % decoder_stages)
        pre_gate = conv(res,
                        num_filters=2 * residual_width,
                        filter_length=decoder_kernel,
                        dilation=rate,
                        name='dilatedconv_%d' % block)
        cond = conv(enc,
                    num_filters=2 * residual_width,
                    filter_length=1,
                    name='cond_map_%d' % block)
        pre_gate = self._condition(pre_gate, cond)

        # Split the channels in half: sigmoid gate times tanh filter.
        n_channels = pre_gate.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gated = (tf.sigmoid(pre_gate[:, :, :half]) *
                 tf.tanh(pre_gate[:, :, half:]))

        res = res + conv(gated,
                         num_filters=residual_width,
                         filter_length=1,
                         name='res_%d' % block)
        skip = skip + conv(gated,
                           num_filters=skip_channels,
                           filter_length=1,
                           name='skip_%d' % block)

    skip = conv(tf.nn.relu(skip),
                num_filters=skip_channels,
                filter_length=1,
                name='out1')
    out_cond = conv(enc,
                    num_filters=skip_channels,
                    filter_length=1,
                    name='cond_map_out1')
    skip = tf.nn.relu(self._condition(skip, out_cond))

    # ---- Logits and loss ----
    logits = conv(skip, num_filters=256, filter_length=1, name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    # Shift the quantized values by 128 to use them as class labels.
    labels = tf.cast(tf.reshape(quantized, [-1]), tf.int32) + 128
    nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='nll')
    loss = tf.reduce_mean(nll, 0, name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': quantized,
        'encoding': encoding,
    }
def build(self, inputs, is_training):
    """Assemble the WaveNet autoencoder graph.

    Args:
      inputs: Mapping of input tensors; the waveform is read from 'wav'.
      is_training: Accepted for interface compatibility and discarded.

    Returns:
      A dict with 'predictions' (softmax over 256 classes), 'loss', 'eval'
      (a dict holding 'nll', which is the same tensor as 'loss'),
      'quantized_input' and 'encoding'.
    """
    del is_training

    decoder_stages, decoder_layers, decoder_kernel = 10, 30, 3
    residual_width, skip_channels = 512, 256
    encoder_stages, encoder_layers, encoder_kernel = 10, 30, 3
    encoder_width = 128

    # Quantize the source with mu-law, divide by 128 and add a channel axis.
    quantized = utils.mu_law(inputs['wav'])
    scaled = tf.expand_dims(tf.cast(quantized, tf.float32) / 128.0, 2)

    # ---- Non-causal temporal encoder ----
    enc = masked.conv1d(scaled,
                        causal=False,
                        num_filters=encoder_width,
                        filter_length=encoder_kernel,
                        name='ae_startconv')

    # Block numbers are 1-based to match the variable names.
    for block in range(1, encoder_layers + 1):
        rate = 2**((block - 1) % encoder_stages)
        branch = masked.conv1d(tf.nn.relu(enc),
                               causal=False,
                               num_filters=encoder_width,
                               filter_length=encoder_kernel,
                               dilation=rate,
                               name='ae_dilatedconv_%d' % block)
        enc = enc + masked.conv1d(tf.nn.relu(branch),
                                  num_filters=encoder_width,
                                  filter_length=1,
                                  name='ae_res_%d' % block)

    enc = masked.conv1d(enc,
                        num_filters=self.ae_bottleneck_width,
                        filter_length=1,
                        name='ae_bottleneck')
    enc = masked.pool1d(enc, self.ae_hop_length, name='ae_pool', mode='avg')
    encoding = enc

    # ---- WaveNet decoder ----
    res = masked.conv1d(masked.shift_right(scaled),
                        num_filters=residual_width,
                        filter_length=decoder_kernel,
                        name='startconv')

    # Accumulator for the skip connections.
    skip = masked.conv1d(res,
                         num_filters=skip_channels,
                         filter_length=1,
                         name='skip_start')

    for block in range(1, decoder_layers + 1):
        rate = 2**((block - 1) % decoder_stages)
        pre_gate = masked.conv1d(res,
                                 num_filters=2 * residual_width,
                                 filter_length=decoder_kernel,
                                 dilation=rate,
                                 name='dilatedconv_%d' % block)
        cond = masked.conv1d(enc,
                             num_filters=2 * residual_width,
                             filter_length=1,
                             name='cond_map_%d' % block)
        pre_gate = self._condition(pre_gate, cond)

        # Split the channels in half: sigmoid gate times tanh filter.
        n_channels = pre_gate.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gated = (tf.sigmoid(pre_gate[:, :, :half]) *
                 tf.tanh(pre_gate[:, :, half:]))

        res = res + masked.conv1d(gated,
                                  num_filters=residual_width,
                                  filter_length=1,
                                  name='res_%d' % block)
        skip = skip + masked.conv1d(gated,
                                    num_filters=skip_channels,
                                    filter_length=1,
                                    name='skip_%d' % block)

    skip = masked.conv1d(tf.nn.relu(skip),
                         num_filters=skip_channels,
                         filter_length=1,
                         name='out1')
    out_cond = masked.conv1d(enc,
                             num_filters=skip_channels,
                             filter_length=1,
                             name='cond_map_out1')
    skip = tf.nn.relu(self._condition(skip, out_cond))

    # ---- Logits and loss ----
    logits = masked.conv1d(skip, num_filters=256, filter_length=1,
                           name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    # Shift the quantized values by 128 to use them as class labels.
    labels = tf.cast(tf.reshape(quantized, [-1]), tf.int32) + 128
    nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='nll')
    loss = tf.reduce_mean(nll, 0, name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': quantized,
        'encoding': encoding,
    }
def build(self, inputs, is_training):
    """Assemble the WaveNet autoencoder graph.

    When ``self.encoding`` is truthy the encoder is built from the waveform;
    otherwise the encoding is a float32 placeholder named 'ae_pool' with
    shape ``[1, 125, 16]`` that has to be fed by the caller.

    Parameters
    ----------
    inputs
        Mapping of input tensors; the waveform is read from 'wav'.
    is_training
        Accepted for interface compatibility and discarded.

    Returns
    -------
    A dict with 'predictions' (softmax over 256 classes), 'loss', 'eval'
    (a dict holding 'nll', which is the same tensor as 'loss'),
    'quantized_input' and 'encoding'.
    """
    del is_training

    decoder_stages, decoder_layers, decoder_kernel = 10, 30, 3
    residual_width, skip_channels = 512, 256
    encoder_stages, encoder_layers, encoder_kernel = 10, 30, 3
    encoder_width = 128

    # Quantize the source with mu-law, divide by 128 and add a channel axis.
    quantized = utils.mu_law(inputs['wav'])
    scaled = tf.expand_dims(tf.cast(quantized, tf.float32) / 128.0, 2)

    if not self.encoding:
        # No encoder in the graph: the conditioning signal is supplied from
        # outside through a placeholder.
        enc = tf.placeholder(
            name='ae_pool', shape=[1, 125, 16], dtype=tf.float32)
    else:
        # ---- Non-causal temporal encoder ----
        enc = masked.conv1d(scaled,
                            causal=False,
                            num_filters=encoder_width,
                            filter_length=encoder_kernel,
                            name='ae_startconv')

        # Block numbers are 1-based to match the variable names.
        for block in range(1, encoder_layers + 1):
            rate = 2**((block - 1) % encoder_stages)
            branch = masked.conv1d(tf.nn.relu(enc),
                                   causal=False,
                                   num_filters=encoder_width,
                                   filter_length=encoder_kernel,
                                   dilation=rate,
                                   name='ae_dilatedconv_%d' % block)
            enc = enc + masked.conv1d(tf.nn.relu(branch),
                                      num_filters=encoder_width,
                                      filter_length=1,
                                      name='ae_res_%d' % block)

        enc = masked.conv1d(enc,
                            num_filters=self.ae_bottleneck_width,
                            filter_length=1,
                            name='ae_bottleneck')
        enc = masked.pool1d(enc, self.ae_hop_length, name='ae_pool',
                            mode='avg')
    encoding = enc

    # ---- WaveNet decoder ----
    res = masked.conv1d(masked.shift_right(scaled),
                        num_filters=residual_width,
                        filter_length=decoder_kernel,
                        name='startconv')

    # Accumulator for the skip connections.
    skip = masked.conv1d(res,
                         num_filters=skip_channels,
                         filter_length=1,
                         name='skip_start')

    for block in range(1, decoder_layers + 1):
        rate = 2**((block - 1) % decoder_stages)
        pre_gate = masked.conv1d(res,
                                 num_filters=2 * residual_width,
                                 filter_length=decoder_kernel,
                                 dilation=rate,
                                 name='dilatedconv_%d' % block)
        cond = masked.conv1d(enc,
                             num_filters=2 * residual_width,
                             filter_length=1,
                             name='cond_map_%d' % block)
        pre_gate = self._condition(pre_gate, cond)

        # Split the channels in half: sigmoid gate times tanh filter.
        n_channels = pre_gate.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gated = (tf.sigmoid(pre_gate[:, :, :half]) *
                 tf.tanh(pre_gate[:, :, half:]))

        res = res + masked.conv1d(gated,
                                  num_filters=residual_width,
                                  filter_length=1,
                                  name='res_%d' % block)
        skip = skip + masked.conv1d(gated,
                                    num_filters=skip_channels,
                                    filter_length=1,
                                    name='skip_%d' % block)

    skip = masked.conv1d(tf.nn.relu(skip),
                         num_filters=skip_channels,
                         filter_length=1,
                         name='out1')
    out_cond = masked.conv1d(enc,
                             num_filters=skip_channels,
                             filter_length=1,
                             name='cond_map_out1')
    skip = tf.nn.relu(self._condition(skip, out_cond))

    # ---- Logits and loss ----
    logits = masked.conv1d(skip, num_filters=256, filter_length=1,
                           name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    # Shift the quantized values by 128 to use them as class labels.
    labels = tf.cast(tf.reshape(quantized, [-1]), tf.int32) + 128
    nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='nll')
    loss = tf.reduce_mean(nll, 0, name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': quantized,
        'encoding': encoding,
    }
def build(self, inputs, is_training):
    """Assemble the variational WaveNet autoencoder graph.

    Args:
      inputs: Mapping of input tensors; the waveform is read from 'wav'.
      is_training: Accepted for interface compatibility and discarded.

    Returns:
      A dict with 'predictions' (softmax over 256 classes), 'loss' (a dict
      holding the 'kl', 'rec' and 'aux' terms; 'aux' is the integer 0 when
      `self.aux` is not positive), 'eval' (a dict holding 'kl' and 'rec'),
      'quantized_input' and 'encoding'.
    """
    del is_training

    decoder_stages = self.num_stages
    decoder_layers = self.num_layers
    decoder_kernel = 3
    residual_width = 512
    skip_channels = 256
    encoder_stages = self.ae_num_stages
    encoder_layers = self.ae_num_layers
    encoder_kernel = 3
    encoder_width = 128

    # Quantize the source with mu-law, divide by 128 and add a channel axis.
    quantized = utils.mu_law(inputs['wav'])
    scaled = tf.expand_dims(tf.cast(quantized, tf.float32) / 128.0, 2)
    if self.iw > 1:
        scaled = self._duplicate(scaled, self.iw)

    # ---- Non-causal temporal encoder ----
    enc = masked.conv1d(scaled,
                        causal=False,
                        num_filters=encoder_width,
                        filter_length=encoder_kernel,
                        name='ae_startconv')

    # Block numbers are 1-based to match the variable names.
    for block in range(1, encoder_layers + 1):
        rate = 2**((block - 1) % encoder_stages)
        branch = masked.conv1d(tf.nn.relu(enc),
                               causal=False,
                               num_filters=encoder_width,
                               filter_length=encoder_kernel,
                               dilation=rate,
                               name='ae_dilatedconv_%d' % block)
        enc = enc + masked.conv1d(tf.nn.relu(branch),
                                  num_filters=encoder_width,
                                  filter_length=1,
                                  name='ae_res_%d' % block)

    enc = masked.conv1d(enc,
                        num_filters=self.ae_bottleneck_width,
                        filter_length=1,
                        name='ae_bottleneck')
    enc = masked.pool1d(enc, self.ae_hop_length, name='ae_pool', mode='avg')

    # Split the pooled encoding into what the original comments call the
    # "mean" and "variance" of the latent distribution.
    mean, var = self._gaussian_parameters(enc)

    # Flatten the last two axes of both so each example is one vector.
    mean_shape = mean.get_shape().as_list()
    var_shape = var.get_shape().as_list()
    mean = tf.reshape(mean, (-1, mean_shape[-2] * mean_shape[-1]))
    var = tf.reshape(var, (-1, var_shape[-2] * var_shape[-1]))

    # Reparameterization trick, then restore the embedding's original
    # layout.
    sampled = self._sample_gaussian(mean, var)
    enc = tf.reshape(sampled, (-1, mean_shape[-2], mean_shape[-1]))
    encoding = enc

    # ---- WaveNet decoder ----
    # A Bernoulli(self.dropout) sample is multiplied into the shifted input,
    # zeroing every element whose sample is 0.
    bernoulli = tf.distributions.Bernoulli(
        probs=tf.to_float(self.dropout), dtype=tf.float32)
    mask = bernoulli.sample(sample_shape=tf.shape(scaled))
    res = tf.math.multiply(masked.shift_right(scaled), mask)
    res = masked.conv1d(res,
                        num_filters=residual_width,
                        filter_length=decoder_kernel,
                        name='startconv')

    # Accumulator for the skip connections.
    skip = masked.conv1d(res,
                         num_filters=skip_channels,
                         filter_length=1,
                         name='skip_start')

    for block in range(1, decoder_layers + 1):
        rate = 2**((block - 1) % decoder_stages)
        pre_gate = masked.conv1d(res,
                                 num_filters=2 * residual_width,
                                 filter_length=decoder_kernel,
                                 dilation=rate,
                                 name='dilatedconv_%d' % block)
        cond = masked.conv1d(enc,
                             num_filters=2 * residual_width,
                             filter_length=1,
                             name='cond_map_%d' % block)
        pre_gate = self._condition(pre_gate, cond)

        # Split the channels in half: sigmoid gate times tanh filter.
        n_channels = pre_gate.get_shape().as_list()[2]
        assert n_channels % 2 == 0
        half = n_channels // 2
        gated = (tf.sigmoid(pre_gate[:, :, :half]) *
                 tf.tanh(pre_gate[:, :, half:]))

        res = res + masked.conv1d(gated,
                                  num_filters=residual_width,
                                  filter_length=1,
                                  name='res_%d' % block)
        skip = skip + masked.conv1d(gated,
                                    num_filters=skip_channels,
                                    filter_length=1,
                                    name='skip_%d' % block)

    skip = masked.conv1d(tf.nn.relu(skip),
                         num_filters=skip_channels,
                         filter_length=1,
                         name='out1')
    out_cond = masked.conv1d(enc,
                             num_filters=skip_channels,
                             filter_length=1,
                             name='cond_map_out1')
    skip = tf.nn.relu(self._condition(skip, out_cond))

    if self.aux > 0:
        # Auxiliary head: predict the quantized signal from the encoding
        # alone, after repeating each encoding step to the decoder length.
        aux_logits = masked.conv1d(enc,
                                   num_filters=skip_channels,
                                   filter_length=1,
                                   name='cond_map_rec')
        enc_mb, enc_length, enc_channels = aux_logits.get_shape().as_list()
        mb, length, channels = skip.get_shape().as_list()
        assert enc_mb == mb
        assert enc_channels == channels

        aux_logits = tf.nn.relu(aux_logits)
        aux_logits = tf.reshape(aux_logits, [mb, enc_length, 1, channels])
        # The number of repeats is read off the static shape of a reshaped
        # copy of the decoder output.
        folded = tf.reshape(skip, [mb, enc_length, -1, channels])
        _, _, reps, _ = folded.get_shape().as_list()
        aux_logits = tf.tile(aux_logits, [1, 1, reps, 1])
        aux_logits = tf.reshape(aux_logits, [mb, length, channels])
        aux_logits = masked.conv1d(aux_logits,
                                   num_filters=256,
                                   filter_length=1,
                                   name='en_logits')
        aux_logits = tf.reshape(aux_logits, [-1, 256])
        # Not returned; the op is still created under the name 'en_softmax'.
        tf.nn.softmax(aux_logits, name='en_softmax')

    # ---- Logits and losses ----
    logits = masked.conv1d(skip, num_filters=256, filter_length=1,
                           name='logits')
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')

    # Shift the quantized values by 128 to use them as class labels.
    labels = tf.cast(tf.reshape(quantized, [-1]), tf.int32) + 128
    rec_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='nll')
    rec = tf.reduce_mean(rec_nll, 0, name='loss')

    if self.aux > 0:
        aux_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=aux_logits, labels=labels, name='en_nll')
        aux = tf.reduce_mean(aux_nll, 0, name='aux')
    else:
        aux = 0

    kl_terms = self._kl_normal(mean, var, tf.zeros(1), tf.ones(1))
    kl = tf.reduce_mean(kl_terms, name='kl')

    return {
        'predictions': probs,
        'loss': {
            'kl': kl,
            'rec': rec,
            'aux': aux
        },
        'eval': {
            'kl': kl,
            'rec': rec
        },
        'quantized_input': quantized,
        'encoding': encoding,
    }
def compute_wavenet_encoder_stylization(n_samples,
                                        n_frames,
                                        content_features,
                                        style_features,
                                        alpha=1e-4,
                                        learning_rate=1e-3,
                                        iterations=100):
    """Optimise a signal so its encoder features match content and style.

    The encoder is rebuilt, its weights are restored from
    ``./model.ckpt-200000`` and frozen into constants.  A randomly
    initialised variable of shape ``(n_frames, n_samples, 1)`` is then
    optimised with Adam against a content loss (L2 distance between layer
    activations) plus a style loss (L2 distance between layer Gram matrices)
    on layers 1 and 5.

    Args:
        n_samples: Number of samples per frame of the signal to synthesise.
        n_frames: Number of frames of the signal to synthesise.
        content_features: Per-layer target activations, indexed by layer
            number (only indices 1 and 5 are read).
        style_features: Per-layer target Gram matrices, indexed the same way.
        alpha: Weight applied to the content loss.
        learning_rate: Adam learning rate.
        iterations: Number of optimisation steps.

    Returns:
        The optimised signal with its channel axis dropped, divided by its
        maximum, multiplied by 128 and passed through ``inv_mu_law_numpy``.
    """
    ae_style_layers = [1, 5]
    ae_num_layers = 30
    ae_num_stages = 10
    ae_filter_length = 3
    ae_width = 128

    # Stage 1: rebuild the encoder, load the trained weights and freeze them.
    with tf.Graph().as_default() as g, g.device(
            '/cpu:0'), tf.Session() as sess:
        x = tf.placeholder(name="x",
                           shape=(n_frames, n_samples, 1),
                           dtype=tf.float32)
        en = masked.conv1d(x,
                           causal=False,
                           num_filters=ae_width,
                           filter_length=ae_filter_length,
                           name='ae_startconv')
        for num_layer in range(ae_num_layers):
            dilation = 2**(num_layer % ae_num_stages)
            d = tf.nn.relu(en)
            d = masked.conv1d(d,
                              causal=False,
                              num_filters=ae_width,
                              filter_length=ae_filter_length,
                              dilation=dilation,
                              name='ae_dilatedconv_%d' % (num_layer + 1))
            d = tf.nn.relu(d)
            en += masked.conv1d(d,
                                num_filters=ae_width,
                                filter_length=1,
                                name='ae_res_%d' % (num_layer + 1))
            # Give every layer output a predictable name so it can be looked
            # up again after the graph is re-imported.
            tf.identity(en, name='layer_{}'.format(num_layer))

        saver = tf.train.Saver()
        saver.restore(sess, './model.ckpt-200000')
        # Do NOT run a variable initializer here: restoring already gives
        # every variable a value, and initialising afterwards would replace
        # the trained weights with fresh initial values before freezing.
        frozen_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, [en.name.replace(':0', '')] +
            ['layer_{}'.format(i) for i in range(ae_num_layers)])

    # Stage 2: optimise the input signal against the frozen encoder.
    with tf.Graph().as_default() as g, g.device(
            '/cpu:0'), tf.Session() as sess:
        x = tf.Variable(
            np.random.randn(n_frames, n_samples, 1).astype(np.float32))
        tf.import_graph_def(frozen_graph_def, input_map={'x:0': x})

        content_loss = np.float32(0.0)
        style_loss = np.float32(0.0)
        for num_layer in ae_style_layers:
            layer_i = g.get_tensor_by_name(name='import/layer_%d:0' %
                                           (num_layer))
            content_loss = content_loss + alpha * 2 * tf.nn.l2_loss(
                layer_i - content_features[num_layer])
            n_features = layer_i.shape.as_list()[-1]
            features = tf.reshape(layer_i, (-1, n_features))
            gram = tf.matmul(tf.transpose(features),
                             features) / (n_frames * n_samples)
            style_loss = style_loss + 2 * tf.nn.l2_loss(
                gram - style_features[num_layer])
        loss = content_loss + style_loss

        # Optimization
        print('Started optimization.')
        opt = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)
        var_list = tf.trainable_variables()
        print(var_list)
        # Here the initializer is required: it sets up the signal variable
        # and the optimizer's slot variables (the network is constants now).
        sess.run(tf.initialize_all_variables())
        for i in range(iterations):
            s, c, _ = sess.run([style_loss, content_loss, opt])
            print(i, '- Style:', s, 'Content:', c, end='\r')
        result = x.eval()

    # NOTE(review): normalises by the signed maximum, not the absolute peak;
    # confirm this is intended.
    result = inv_mu_law_numpy(result[..., 0] / result.max() * 128.0)
    return result
def compute_wavenet_decoder_stylization(n_samples,
                                        n_frames,
                                        content_features,
                                        style_features,
                                        alpha=1e-4,
                                        learning_rate=1e-3,
                                        iterations=100):
    """Optimise a signal so its decoder features match content and style.

    The decoder's residual stack (without conditioning) is rebuilt, its
    weights are restored from ``./model.ckpt-200000`` and frozen into
    constants.  A randomly initialised variable of shape
    ``(n_frames, n_samples, 1)`` is then optimised with Adam against a
    content loss (L2 distance between skip-sum activations) plus a style
    loss (L2 distance between their Gram matrices) on layers 1 and 5.

    Args:
        n_samples: Number of samples per frame of the signal to synthesise.
        n_frames: Number of frames of the signal to synthesise.
        content_features: Per-layer target activations, indexed by layer
            number (only indices 1 and 5 are read).
        style_features: Per-layer target Gram matrices, indexed the same way.
        alpha: Weight applied to the content loss.
        learning_rate: Adam learning rate.
        iterations: Number of optimisation steps.

    Returns:
        The optimised signal with its channel axis dropped, divided by its
        maximum, multiplied by 128 and passed through ``inv_mu_law_numpy``.
    """
    style_layers = [1, 5]
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256

    # Stage 1: rebuild the decoder stack, load the trained weights, freeze.
    with tf.Graph().as_default() as g, g.device(
            '/cpu:0'), tf.Session() as sess:
        x = tf.placeholder(name="x",
                           shape=(n_frames, n_samples, 1),
                           dtype=tf.float32)
        layer = masked.conv1d(x,
                              num_filters=width,
                              filter_length=filter_length,
                              name='startconv')

        # Set up skip connections.
        s = masked.conv1d(layer,
                          num_filters=skip_width,
                          filter_length=1,
                          name='skip_start')

        # Residual blocks with skip connections.
        for i in range(num_layers):
            dilation = 2**(i % num_stages)
            d = masked.conv1d(layer,
                              num_filters=2 * width,
                              filter_length=filter_length,
                              dilation=dilation,
                              name='dilatedconv_%d' % (i + 1))

            # Split the channels in half: sigmoid gate times tanh filter.
            assert d.get_shape().as_list()[2] % 2 == 0
            m = d.get_shape().as_list()[2] // 2
            d_sigmoid = tf.sigmoid(d[:, :, :m])
            d_tanh = tf.tanh(d[:, :, m:])
            d = d_sigmoid * d_tanh

            layer += masked.conv1d(d,
                                   num_filters=width,
                                   filter_length=1,
                                   name='res_%d' % (i + 1))
            s += masked.conv1d(d,
                               num_filters=skip_width,
                               filter_length=1,
                               name='skip_%d' % (i + 1))
            # Name the tap after the loop index (not the layer count) so the
            # names requested from the freeze step and from the second
            # graph, 'layer_0' ... 'layer_29', actually exist.
            tf.identity(s, name='layer_{}'.format(i))

        saver = tf.train.Saver()
        saver.restore(sess, './model.ckpt-200000')
        # Do NOT run a variable initializer here: restoring already gives
        # every variable a value, and initialising afterwards would replace
        # the trained weights with fresh initial values before freezing.
        frozen_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, [s.name.replace(':0', '')] +
            ['layer_{}'.format(i) for i in range(num_layers)])

    # Stage 2: optimise the input signal against the frozen decoder stack.
    with tf.Graph().as_default() as g, g.device(
            '/cpu:0'), tf.Session() as sess:
        x = tf.Variable(
            np.random.randn(n_frames, n_samples, 1).astype(np.float32))
        tf.import_graph_def(frozen_graph_def, input_map={'x:0': x})

        content_loss = np.float32(0.0)
        style_loss = np.float32(0.0)
        for num_layer in style_layers:
            layer_i = g.get_tensor_by_name(name='import/layer_%d:0' %
                                           (num_layer))
            content_loss = content_loss + alpha * 2 * tf.nn.l2_loss(
                layer_i - content_features[num_layer])
            n_features = layer_i.shape.as_list()[-1]
            features = tf.reshape(layer_i, (-1, n_features))
            gram = tf.matmul(tf.transpose(features),
                             features) / (n_frames * n_samples)
            style_loss = style_loss + 2 * tf.nn.l2_loss(
                gram - style_features[num_layer])
        loss = content_loss + style_loss

        # Optimization
        print('Started optimization.')
        opt = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)
        var_list = tf.trainable_variables()
        print(var_list)
        # Here the initializer is required: it sets up the signal variable
        # and the optimizer's slot variables (the network is constants now).
        sess.run(tf.initialize_all_variables())
        for i in range(iterations):
            s, c, _ = sess.run([style_loss, content_loss, opt])
            print(i, '- Style:', s, 'Content:', c, end='\r')
        result = x.eval()

    # NOTE(review): normalises by the signed maximum, not the absolute peak;
    # confirm this is intended.
    result = inv_mu_law_numpy(result[..., 0] / result.max() * 128.0)
    return result
def compute_wavenet_decoder_features(content, style):
    """Collect per-layer decoder skip activations for content and style audio.

    Rebuilds the decoder's residual stack (without conditioning) on the CPU,
    restores its weights from ``./model.ckpt-200000`` and feeds both inputs
    through it.

    Args:
        content: Array indexed as ``[frame, sample]``; its first two
            dimensions fix the shape of the input placeholder.
        style: Array fed through the same placeholder, so it has to match
            the shape of ``content``.

    Returns:
        A ``(content_features, style_features)`` tuple.  ``content_features``
        holds the running skip-connection sum after every residual block for
        ``content``.  ``style_features`` holds, for every block, the Gram
        matrix of the ``style`` activation divided by
        ``n_samples * n_frames``.
    """
    n_stages = 10
    n_layers = 30
    kernel_size = 3
    residual_width = 512
    skip_channels = 256

    n_frames, n_samples = content.shape[0], content.shape[1]
    content_batch = np.ascontiguousarray(content)
    style_batch = np.ascontiguousarray(style)

    graph = tf.Graph()
    taps = []
    with graph.as_default(), graph.device('/cpu:0'), tf.Session() as sess:
        x = tf.placeholder('float32', [n_frames, n_samples], name="x")

        # Quantize with mu-law, divide by 128 and add a trailing channel
        # axis.
        quantized = mu_law(x)
        net_input = tf.expand_dims(
            tf.cast(quantized, tf.float32) / 128.0, 2)

        res = masked.conv1d(net_input,
                            num_filters=residual_width,
                            filter_length=kernel_size,
                            name='startconv')

        # Accumulator for the skip connections.
        skip = masked.conv1d(res,
                             num_filters=skip_channels,
                             filter_length=1,
                             name='skip_start')

        # Block numbers are 1-based because that is how the variables are
        # named; the dilation cycles through 1, 2, 4, ... every n_stages.
        for block in range(1, n_layers + 1):
            dilation = 2**((block - 1) % n_stages)
            pre_gate = masked.conv1d(res,
                                     num_filters=2 * residual_width,
                                     filter_length=kernel_size,
                                     dilation=dilation,
                                     name='dilatedconv_%d' % block)

            # Split the channels in half: sigmoid gate times tanh filter.
            n_channels = pre_gate.get_shape().as_list()[2]
            assert n_channels % 2 == 0
            half = n_channels // 2
            gated = (tf.sigmoid(pre_gate[:, :, :half]) *
                     tf.tanh(pre_gate[:, :, half:]))

            res = res + masked.conv1d(gated,
                                      num_filters=residual_width,
                                      filter_length=1,
                                      name='res_%d' % block)
            skip = skip + masked.conv1d(gated,
                                        num_filters=skip_channels,
                                        filter_length=1,
                                        name='skip_%d' % block)
            taps.append(skip)

        tf.train.Saver().restore(sess, './model.ckpt-200000')

        content_features = sess.run(taps, feed_dict={x: content_batch})
        style_activations = sess.run(taps, feed_dict={x: style_batch})

    # Style is summarised by the channel-by-channel Gram matrix of each tap.
    style_features = []
    for tap, activation in zip(taps, style_activations):
        n_features = np.prod(tap.shape.as_list()[-1])
        flat = np.reshape(activation, (-1, n_features))
        style_features.append(
            np.matmul(flat.T, flat) / (n_samples * n_frames))

    return content_features, style_features
s = masked.conv1d( l, num_filters=skip_width, filter_length=1, name='skip_start') # Residual blocks with skip connections. for i in range(num_layers): dilation = 2**(i % num_stages) d = masked.conv1d( l, num_filters=2 * width, filter_length=filter_length, dilation=dilation, name='dilatedconv_%d' % (i + 1)) d = self._condition(d, masked.conv1d( en, num_filters=2 * width, filter_length=1, name='cond_map_%d' % (i + 1))) assert d.get_shape().as_list()[2] % 2 == 0 m = d.get_shape().as_list()[2] // 2 d_sigmoid = tf.sigmoid(d[:, :, :m]) d_tanh = tf.tanh(d[:, :, m:]) d = d_sigmoid * d_tanh l += masked.conv1d( d, num_filters=width, filter_length=1, name='res_%d' % (i + 1)) s += masked.conv1d( d, num_filters=skip_width, filter_length=1, name='skip_%d' % (i + 1)) s = tf.nn.relu(s)
def build(self, inputs, is_training):
    """Build the graph for this configuration, printing a trace of each step.

    Every tensor that is created is printed to stdout next to the name of
    the layer that produced it, so the construction order can be followed.
    The shape comments below are the original author's observations for a
    single (1, 6144) 'wav' input; they are not checked by the code.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Whether we are training or not. Not used in this config.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the
      'encoding', the 'quantized_input', and whatever metrics we want to
      track for eval.
    """
    del is_training
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128
    # Observed: pitch shape=(1,), wav shape=(1, 6144), key shape=(1,)
    print("@build, inputs: ", inputs)
    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    print("@build, x: ", x)  # observed shape=(1, 6144)
    x_quantized = utils.mu_law(x)
    print("@build, x_quantized: ", x_quantized)  # observed shape=(1, 6144)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    print("@build, x_scaled@1: ", x_scaled)  # observed shape=(1, 6144)
    # Add a trailing channel axis.
    x_scaled = tf.expand_dims(x_scaled, 2)
    print("@build, x_scaled@2: ", x_scaled)  # observed shape=(1, 6144, 1)
    ###
    # The Non-Causal Temporal Encoder.
    ###
    print("@build, ##Non-Causal Temporal Encoder...")
    print("\t create Layer ae_startconv")
    print("\t input[x_scaled] is: ", x_scaled)
    en = masked.conv1d(
        x_scaled,
        causal=False,
        num_filters=ae_width,  # ae_width = 128
        filter_length=ae_filter_length,
        name='ae_startconv')
    print("\t ae_startconv output [en] is:", en)  # observed shape=(1, 6144, 128)
    print("\t create Layer ae_startconv Done\n")
    # Residual encoder blocks: relu -> dilated conv -> relu -> 1x1 conv,
    # added back onto `en`. The dilation cycles every ae_num_stages layers.
    for num_layer in range(ae_num_layers):
        dilation = 2**(num_layer % ae_num_stages)
        print("\t create Layer relu")
        print("\t input[en] is: ", en)  # observed shape=(1, 6144, 128)
        d = tf.nn.relu(en)
        print("\t relu output [d] is:", d)
        print("\t create Layer relu Done\n")
        print("\t create Layer ae_dilatedconv_{}, dilation={}".format(
            num_layer + 1, dilation))
        print("\t input[d] is: ", d)
        d = masked.conv1d(
            d,
            causal=False,
            num_filters=ae_width,  # 128
            filter_length=ae_filter_length,
            dilation=dilation,
            name='ae_dilatedconv_%d' % (num_layer + 1))
        print("\t output [d] is:", d)
        print(
            "\t create Layer ae_dilatedconv_{}, dilation={} Done\n".format(
                num_layer + 1, dilation))
        print("\t create Layer relu")
        print("\t input[d] is: ", d)
        d = tf.nn.relu(d)
        print("\t relu output [d] is:", d)
        print("\t create Layer relu Done\n")
        print("\t create Layer ae_res_{}".format(num_layer + 1))
        print("\t input[en] is: ", en)
        print("\t input[d] is: ", d)
        en += masked.conv1d(
            d,
            num_filters=ae_width,  # 128
            filter_length=1,
            name='ae_res_%d' % (num_layer + 1))
        print("\t output [en] is:", en)  # observed shape=(1, 6144, 128)
        print("\t create Layer ae_res_{} Done\n".format(num_layer + 1))
    print("\t create Layer ae_bottleneck")
    print("\t input[en] is: ", en)  # observed shape=(1, 6144, 128)
    en = masked.conv1d(
        en,
        num_filters=self.ae_bottleneck_width,  # 16 per the original note
        filter_length=1,
        name='ae_bottleneck')
    print("\t output[en] is: ", en)  # observed shape=(1, 6144, 16)
    print("\t create Layer ae_bottleneck Done\n")
    print("\t create ae_pool")
    print("\t input[en] is: ", en)  # observed shape=(1, 6144, 16)
    # ae_hop_length=512 per the original note
    en = masked.pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')
    print("\t output[en] is: ", en)  # observed shape=(1, 12, 16); 6144/512=12
    print("\t create ae_pool Done\n")
    # Original note: the encoding is the "feature vector"; for 4 seconds of
    # audio it is (125, 16), since 125 = 4 x 16000 / 512.
    encoding = en
    print("\t ##Non-Causal Temporal Encoder output[en|encoding] is: ",
          encoding)
    print("@build, ##Non-Causal Temporal Encoder...Done\n")
    ###
    # The WaveNet Decoder.
    ###
    print("@build, ##The WaveNet Decoder...")
    print("\t input[x_scaled] is: ", x_scaled)  # observed shape=(1, 6144, 1)
    l = masked.shift_right(x_scaled)
    print("\t create startconv")
    print("\t input[l] is: ", l)  # observed shape=(1, 6144, 1)
    l = masked.conv1d(l,
                      num_filters=width,  # width=512
                      filter_length=filter_length,
                      name='startconv')
    print("\t output[l] is: ", l)  # observed shape=(1, 6144, 512)
    print("\t create startconv Done\n")
    # Set up skip connections.
    print("\t create skip_start")
    print("\t input[l] is: ", l)
    s = masked.conv1d(l,
                      num_filters=skip_width,  # skip_width=256
                      filter_length=1,
                      name='skip_start')
    print("\t output[s] is: ", s)  # observed shape=(1, 6144, 256)
    print("\t create skip_start Done\n")
    # Residual blocks with skip connections.
    for i in range(num_layers):
        dilation = 2**(i % num_stages)
        print("\t create dilatedconv_{}, dilation={}".format(
            i + 1, dilation))
        print("\t input[l] is: ", l)
        d = masked.conv1d(l,
                          num_filters=2 * width,
                          filter_length=filter_length,
                          dilation=dilation,
                          name='dilatedconv_%d' % (i + 1))
        print("\t output[d] is: ", d)  # observed shape=(1, 6144, 1024)
        print("\t create dilatedconv_{}, dilation={} Done\n".format(
            i + 1, dilation))
        print("\t create _condition for cond_map_{}".format(i + 1))
        print("\t input[d] is: ", d)
        print("\t input[en] is: ", en)
        # Combine the block output with a 1x1 projection of the encoding.
        d = self._condition(
            d,
            masked.conv1d(en,
                          num_filters=2 * width,
                          filter_length=1,
                          name='cond_map_%d' % (i + 1)))
        print("\t output[d] is: ", d)
        print("\t create _condition for cond_map_{} Done\n".format(i + 1))
        # Split the channels in half: sigmoid gate times tanh filter.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh
        print("\t d after some cacule:", d)  # observed shape=(1, 6144, 512)
        print("")
        print("\t create res_{}".format(i + 1))
        print("\t input[d] is: ", d)  # observed shape=(1, 6144, 512)
        print("\t input[l] is: ", l)  # observed shape=(1, 6144, 512)
        l += masked.conv1d(d,
                           num_filters=width,  # width=512
                           filter_length=1,
                           name='res_%d' % (i + 1))
        print("\t output[l] is: ", l)  # observed shape=(1, 6144, 512)
        print("\t create res_{} Done\n".format(i + 1))
        print("\t create skip_{}".format(i + 1))
        print("\t input[d] is: ", d)  # observed shape=(1, 6144, 512)
        print("\t input[s] is: ", s)  # observed shape=(1, 6144, 256)
        s += masked.conv1d(d,
                           num_filters=skip_width,  # skip_width=256
                           filter_length=1,
                           name='skip_%d' % (i + 1))
        print("\t output[s] is: ", s)  # observed shape=(1, 6144, 256)
        print("\t create skip_{} Done\n".format(i + 1))
    print("\t create Layer relu")
    print("\t input[s] is: ", s)  # observed shape=(1, 6144, 256)
    s = tf.nn.relu(s)
    print("\t output[s] is: ", s)  # observed shape=(1, 6144, 256)
    print("\t create Layer relu Done\n")
    print("\t create Layer out1")
    print("\t input[s] is: ", s)  # observed shape=(1, 6144, 256)
    s = masked.conv1d(s,
                      num_filters=skip_width,  # skip_width=256
                      filter_length=1,
                      name='out1')
    print("\t output[s] is: ", s)  # observed shape=(1, 6144, 256)
    print("\t create Layer out1 Done\n")
    print("\t create _condition for cond_map_out1")
    print("\t input[s] is: ", s)  # observed shape=(1, 6144, 256)
    print("\t input[en] is: ", en)
    s = self._condition(
        s,
        masked.conv1d(
            en,
            num_filters=skip_width,  # skip_width=256
            filter_length=1,
            name='cond_map_out1'))
    print("\t output[s] is: ", s)
    print("\t create _condition for cond_map_out1 Done\n")
    print("\t create Layer relu")
    print("\t input[s] is: ", s)  # observed shape=(1, 6144, 256)
    s = tf.nn.relu(s)
    print("\t output[s] is: ", s)  # observed shape=(1, 6144, 256)
    print("\t create Layer relu Done\n")
    print("@build, ##The WaveNet Decoder...Done")
    ###
    # Compute the logits and get the loss.
    ###
    print("@build, ##Compute the logits and get the loss...")
    print("\t input[s] is: ", s)  # observed shape=(1, 6144, 256)
    logits = masked.conv1d(s,
                           num_filters=256,
                           filter_length=1,
                           name='logits')
    print("\t output[logits] is: ", logits)  # observed shape=(1, 6144, 256)
    # Flatten batch and time so each row is one 256-way prediction.
    logits = tf.reshape(logits, [-1, 256])
    print("\t logits after reshape: ", logits)  # observed shape=(6144, 256)
    probs = tf.nn.softmax(logits, name='softmax')
    print("\t probs: ", probs)  # observed shape=(6144, 256)
    print("\t x_quantized: ", x_quantized)
    # Shift the quantized values by 128 to use them as class labels.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    print("\t x_indices", x_indices)  # observed shape=(6144,)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=x_indices, name='nll'),
                          0,
                          name='loss')
    print("@build, ##Compute the logits and get the loss...Done")
    print("@build, Done, return:")
    print("\t probs:", probs)  # observed shape=(6144, 256)
    print("\t loss:", loss)  # observed shape=()
    print("\t x_quantized:", x_quantized)  # observed shape=(1, 6144)
    print("\t encoding:", encoding)  # observed shape=(1, 12, 16)
    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }