def _create_iaf(self, inputs, iaf_idx, init):
    num_stages = self.hparams.num_stages
    num_layers = self.hparams.num_iaf_layers[iaf_idx]
    filter_length = self.hparams.filter_length
    width = self.hparams.width
    out_width = self.out_width
    deconv_width = self.hparams.deconv_width
    deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]
    use_weight_norm = self.use_weight_norm
    use_resize_conv = self.use_resize_conv
    upsample_act = self.upsample_act
    gate_width = width

    final_init, final_bias = PWNHelper.manual_finit_or_not_fn(init, iaf_idx)

    mel = inputs['mel']
    x = inputs['x']
    iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

    # Upsample the mel conditioning to sample rate for this flow.
    mel_en = wavenet._deconv_stack(mel, deconv_width, deconv_config,
                                   act=upsample_act,
                                   use_resize_conv=use_resize_conv,
                                   name=iaf_name,
                                   use_weight_norm=use_weight_norm,
                                   init=init)

    l = masked.shift_right(x)
    l = masked.conv1d(l, num_filters=width, filter_length=filter_length,
                      name='{}/start_conv'.format(iaf_name),
                      use_weight_norm=use_weight_norm, init=init)

    for i in range(num_layers):
        dilation = 2 ** (i % num_stages)
        d = masked.conv1d(l, num_filters=gate_width,
                          filter_length=filter_length, dilation=dilation,
                          name='{}/dilated_conv_{:d}'.format(iaf_name, i + 1),
                          use_weight_norm=use_weight_norm, init=init)
        c = masked.conv1d(mel_en, num_filters=gate_width, filter_length=1,
                          name='{}/mel_cond_{:d}'.format(iaf_name, i + 1),
                          use_weight_norm=use_weight_norm, init=init)
        d = wavenet._condition(d, c)

        # Gated activation unit: split channels into sigmoid and tanh halves.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(d, num_filters=width, filter_length=1,
                           name='{}/res_{:d}'.format(iaf_name, i + 1),
                           use_weight_norm=use_weight_norm, init=init)

    l = tf.nn.relu(l)
    l = masked.conv1d(l, num_filters=width, filter_length=1,
                      name='{}/out1'.format(iaf_name),
                      use_weight_norm=use_weight_norm, init=init)
    c = masked.conv1d(mel_en, num_filters=width, filter_length=1,
                      name='{}/mel_cond_out1'.format(iaf_name),
                      use_weight_norm=use_weight_norm, init=init)
    l = wavenet._condition(l, c)
    l = tf.nn.relu(l)

    mean = masked.conv1d(l, num_filters=out_width // 2, filter_length=1,
                         name='{}/out2_mean'.format(iaf_name),
                         use_weight_norm=use_weight_norm, init=final_init)
    scale_params = masked.conv1d(
        l, num_filters=out_width // 2, filter_length=1,
        name='{}/out2_scale'.format(iaf_name),
        use_weight_norm=use_weight_norm, init=final_init,
        biases_initializer=tf.constant_initializer(final_bias))
    scale, log_scale = PWNHelper.scale_log_scale_fn(scale_params)

    # Affine IAF transform applied to the input of this flow.
    new_x = x * scale + mean

    if DETAIL_LOG:
        tf.summary.scalar('scale_{}'.format(iaf_idx), tf.reduce_mean(scale))
        tf.summary.scalar('log_scale_{}'.format(iaf_idx),
                          tf.reduce_mean(log_scale))
        tf.summary.scalar('mean_{}'.format(iaf_idx), tf.reduce_mean(mean))

    return {'x': new_x, 'mean': mean, 'scale': scale, 'log_scale': log_scale}
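
# A minimal sketch (NumPy, illustrative values only, not part of the model)
# of why stacking affine flows like the one above works: chaining
# x_k = x_{k-1} * s_k + m_k is itself affine in x_0, so the whole stack has
# total scale prod(s_k) and a mean that folds in the same way. This is the
# standard IAF composition, stated here as a self-contained check.
import numpy as np

def compose_iaf_stages(x, stages):
    """stages: list of (mean, scale) arrays, one per IAF flow."""
    total_mean = np.zeros_like(x)
    total_scale = np.ones_like(x)
    for mean, scale in stages:
        x = x * scale + mean
        # Track the equivalent single affine transform of the stack.
        total_mean = total_mean * scale + mean
        total_scale = total_scale * scale
    return x, total_mean, total_scale

z = np.random.randn(4)  # white-noise input to the flows
stages = [(np.full(4, 0.1), np.full(4, 0.5)),
          (np.full(4, -0.2), np.full(4, 2.0))]
x, mu, s = compose_iaf_stages(z, stages)
assert np.allclose(x, z * s + mu)  # the stack equals one affine transform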
def feed_forward(self, inputs, init=False):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'mel' and
        'wav_scaled'.
      init: Whether this pass runs the data-dependent initialization
        (used by weight normalization).

    Returns:
      A dict containing the mel 'encoding' and the WaveNet output
      parameters 'out_params'.
    """
    use_weight_norm = self.use_weight_norm
    num_stages = self.hparams.num_stages
    num_layers = self.hparams.num_layers
    filter_length = self.hparams.filter_length
    width = self.hparams.width
    skip_width = self.hparams.skip_width
    out_width = self.out_width
    use_dropout = self.use_dropout
    use_as_teacher = self.use_as_teacher
    # In the parallel WaveNet paper, the gate width equals the residual
    # width rather than doubling it.
    gate_width = 2 * width if self.double_gate_width else width
    dropout_training = not use_as_teacher

    ###
    # The transpose-convolution stack for the mel feature.
    ###
    # wavenet inputs <- trans_conv (l2, s2) <- trans_conv (l1, s1) <- mel_ceps
    # win_len: l1 * s2 + (l2 - s2); win_shift: s1 * s2
    # (l1, s1) = (40, 10), (l2, s2) = (80, 20) is a suitable configuration:
    # it is nearly consistent with the mel analysis frame shift (200) and
    # frame length (800).
    mel = inputs['mel']
    ds_dict = self.deconv_stack({'mel': mel}, init=init)
    mel_en = ds_dict['encoding']

    x_scaled = inputs['wav_scaled']
    x_scaled = tf.expand_dims(x_scaled, 2)

    ###
    # The WaveNet decoder.
    ###
    l = masked.shift_right(x_scaled)
    l = masked.conv1d(l, num_filters=width, filter_length=filter_length,
                      name='conv_start',
                      use_weight_norm=use_weight_norm, init=init)
    if use_dropout:
        l = tf.layers.dropout(l, rate=0.2, training=dropout_training,
                              name='conv_dropout')

    # Set up skip connections.
    s = masked.conv1d(l, num_filters=skip_width, filter_length=1,
                      name='skip_start',
                      use_weight_norm=use_weight_norm, init=init)

    ###
    # Residual blocks with skip connections.
    ###
    for i in range(num_layers):
        dilation = 2 ** (i % num_stages)
        d = masked.conv1d(l, num_filters=gate_width,
                          filter_length=filter_length, dilation=dilation,
                          name='dilated_conv_%d' % (i + 1),
                          use_weight_norm=use_weight_norm, init=init)
        c = masked.conv1d(mel_en, num_filters=gate_width, filter_length=1,
                          name='mel_cond_%d' % (i + 1),
                          use_weight_norm=use_weight_norm, init=init)
        d = _condition(d, c)

        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(d, num_filters=width, filter_length=1,
                           name='res_%d' % (i + 1),
                           use_weight_norm=use_weight_norm, init=init)
        s += masked.conv1d(d, num_filters=skip_width, filter_length=1,
                           name='skip_%d' % (i + 1),
                           use_weight_norm=use_weight_norm, init=init)
        if use_dropout:
            l = tf.layers.dropout(l, rate=0.2, training=dropout_training,
                                  name='res_dropout_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = masked.conv1d(s, num_filters=skip_width, filter_length=1,
                      name='out1',
                      use_weight_norm=use_weight_norm, init=init)
    c = masked.conv1d(mel_en, num_filters=skip_width, filter_length=1,
                      name='mel_cond_out1',
                      use_weight_norm=use_weight_norm, init=init)
    s = _condition(s, c)
    s = tf.nn.relu(s)

    out = masked.conv1d(s, num_filters=out_width, filter_length=1,
                        name='out2',
                        use_weight_norm=use_weight_norm, init=init)
    return {'encoding': mel_en, 'out_params': out}
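
# Quick check of the transpose-conv window arithmetic quoted in the comment
# inside feed_forward above (plain Python; the config values come from that
# comment, nothing else). It confirms the "nearly consistent" claim: the
# upsampling shift matches the mel frame shift exactly, and the window span
# is close to the mel frame length.
deconv_config = [[40, 10], [80, 20]]  # [[l1, s1], [l2, s2]]
(l1, s1), (l2, s2) = deconv_config
win_shift = s1 * s2            # total upsampling factor
win_len = l1 * s2 + (l2 - s2)  # span of one mel frame in samples
print(win_shift, win_len)      # 200 860 -> vs. mel frame shift 200 / length 800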
def _create_iaf(self, inputs, iaf_idx):
    num_stages = self.hparams.num_stages
    num_layers = self.hparams.num_iaf_layers[iaf_idx]
    filter_length = self.hparams.filter_length
    width = self.hparams.width
    out_width = self.out_width
    deconv_width = self.hparams.deconv_width
    deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]
    use_log_scale = getattr(self.hparams, 'use_log_scale', True)

    mel = inputs['mel']
    x = inputs['x']
    iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

    mel_en = wavenet._deconv_stack(mel, deconv_width, deconv_config,
                                   name=iaf_name)

    l = masked.shift_right(x)
    l = masked.conv1d(l, num_filters=width, filter_length=filter_length,
                      name='{}/start_conv'.format(iaf_name))

    for i in range(num_layers):
        dilation = 2 ** (i % num_stages)
        d = masked.conv1d(l, num_filters=2 * width,
                          filter_length=filter_length, dilation=dilation,
                          name='{}/dilated_conv_{:d}'.format(iaf_name, i + 1))
        c = masked.conv1d(mel_en, num_filters=2 * width, filter_length=1,
                          name='{}/mel_cond_{:d}'.format(iaf_name, i + 1))
        d = wavenet._condition(d, c)

        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(d, num_filters=width, filter_length=1,
                           name='{}/res_{:d}'.format(iaf_name, i + 1))

    l = tf.nn.relu(l)
    l = masked.conv1d(l, num_filters=width, filter_length=1,
                      name='{}/out1'.format(iaf_name))
    c = masked.conv1d(mel_en, num_filters=width, filter_length=1,
                      name='{}/mel_cond_out1'.format(iaf_name))
    l = wavenet._condition(l, c)
    l = tf.nn.relu(l)

    # Keeps the scale in a reasonably small range when use_log_scale=True.
    final_kernel_init = (tf.truncated_normal_initializer(0., 0.01)
                         if use_log_scale
                         else tf.uniform_unit_scaling_initializer(1.0))
    out = masked.conv1d(l, num_filters=out_width, filter_length=1,
                        name='{}/out2'.format(iaf_name),
                        kernel_initializer=final_kernel_init)
    mean, scale_params = tf.split(out, num_or_size_splits=2, axis=2)

    if use_log_scale:
        log_scale = tf.clip_by_value(scale_params, -9.0, 7.0)
        scale = tf.exp(log_scale)
    else:
        scale_params = tf.nn.softplus(scale_params)
        scale = tf.clip_by_value(scale_params, tf.exp(-9.0), tf.exp(7.0))
        log_scale = tf.log(scale)

    new_x = x * scale + mean

    if DETAIL_LOG:
        tf.summary.scalar('scale_{}'.format(iaf_idx), tf.reduce_mean(scale))
        tf.summary.scalar('log_scale_{}'.format(iaf_idx),
                          tf.reduce_mean(log_scale))
        tf.summary.scalar('mean_{}'.format(iaf_idx), tf.reduce_mean(mean))

    return {'x': new_x, 'mean': mean, 'scale': scale, 'log_scale': log_scale}
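
# Sketch comparing the two scale parameterizations in the use_log_scale
# branches above (NumPy stand-ins, made-up pre-activation values; an
# illustration, not the model code). Both branches pin the effective scale
# to the same range [exp(-9), exp(7)]; they differ only in whether the
# clipping happens in log space or in linear space after a softplus.
import numpy as np

raw = np.array([-20.0, -1.0, 0.0, 3.0, 20.0])

# use_log_scale=True: clip in log space, then exponentiate.
scale_a = np.exp(np.clip(raw, -9.0, 7.0))

# use_log_scale=False: softplus first, then clip in linear space.
scale_b = np.clip(np.log1p(np.exp(raw)), np.exp(-9.0), np.exp(7.0))

assert scale_a.min() >= np.exp(-9.0) and scale_a.max() <= np.exp(7.0)
assert scale_b.min() >= np.exp(-9.0) and scale_b.max() <= np.exp(7.0)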
def _create_iaf(self, inputs, iaf_idx):
    num_stages = self.hparams.num_stages
    num_layers = self.hparams.num_iaf_layers[iaf_idx]
    filter_length = self.hparams.filter_length
    width = self.hparams.width
    out_width = self.out_width
    deconv_width = self.hparams.deconv_width
    deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]

    mel = inputs['mel']
    x = inputs['x']
    iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

    mel_en = wavenet._deconv_stack(mel, deconv_width, deconv_config,
                                   name=iaf_name)

    l = masked.shift_right(x)
    l = masked.conv1d(l, num_filters=width, filter_length=filter_length,
                      name='{}/start_conv'.format(iaf_name))

    for i in range(num_layers):
        dilation = 2 ** (i % num_stages)
        d = masked.conv1d(l, num_filters=2 * width,
                          filter_length=filter_length, dilation=dilation,
                          name='{}/dilated_conv_{:d}'.format(iaf_name, i + 1))
        c = masked.conv1d(mel_en, num_filters=2 * width, filter_length=1,
                          name='{}/mel_cond_{:d}'.format(iaf_name, i + 1))
        d = wavenet._condition(d, c)

        # Gated activation unit: split channels into sigmoid and tanh halves.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(d, num_filters=width, filter_length=1,
                           name='{}/res_{:d}'.format(iaf_name, i + 1))

    l = tf.nn.relu(l)
    l = masked.conv1d(l, num_filters=width, filter_length=1,
                      name='{}/out1'.format(iaf_name))
    c = masked.conv1d(mel_en, num_filters=width, filter_length=1,
                      name='{}/mel_cond_out1'.format(iaf_name))
    l = wavenet._condition(l, c)
    l = tf.nn.relu(l)

    out = masked.conv1d(l, num_filters=out_width, filter_length=1,
                        name='{}/out2'.format(iaf_name))
    mean, scale = tf.split(out, num_or_size_splits=2, axis=2)
    # Clip the raw scale to keep the affine transform numerically stable.
    scale = tf.clip_by_value(scale, tf.exp(-7.0), tf.exp(7.0))

    new_x = x * scale + mean
    return {'x': new_x, 'mean': mean, 'scale': scale}
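
# Receptive-field arithmetic for the dilated stack above (plain Python; the
# example values are assumptions for illustration, not hparams from this
# repo). With dilation = 2 ** (i % num_stages), the dilation doubles at each
# layer and resets every num_stages layers, and the causal receptive field
# of the stack is (filter_length - 1) * sum(dilations) + 1 samples.
num_layers, num_stages, filter_length = 10, 10, 3
dilations = [2 ** (i % num_stages) for i in range(num_layers)]
rf = (filter_length - 1) * sum(dilations) + 1
print(dilations)  # [1, 2, 4, ..., 512]
print(rf)         # 2047 samples of causal context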
def feed_forward(self, inputs):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'mel' and 'wav'.

    Returns:
      A dict containing the 'real_targets', the 'cate_targets', the mel
      'encoding', and the WaveNet output parameters 'out_params'.
    """
    num_stages = self.hparams.num_stages
    num_layers = self.hparams.num_layers
    filter_length = self.hparams.filter_length
    width = self.hparams.width
    skip_width = self.hparams.skip_width
    use_mu_law = self.use_mu_law
    quant_chann = self.quant_chann
    out_width = self.out_width

    ###
    # The transpose-convolution stack for the mel feature.
    ###
    # wavenet inputs <- trans_conv (l2, s2) <- trans_conv (l1, s1) <- mel_ceps
    # win_len: l1 * s2 + (l2 - s2); win_shift: s1 * s2
    # (l1, s1) = (40, 10), (l2, s2) = (80, 20) is a suitable configuration:
    # it is nearly consistent with the mel analysis frame shift (200) and
    # frame length (800).
    mel = inputs['mel']
    ds_dict = self.deconv_stack({'mel': mel})
    mel_en = ds_dict['encoding']

    ###
    # Encode the source with 8-bit mu-law, or just use the 16-bit signal.
    ###
    x = inputs['wav']
    if use_mu_law:
        x_quantized = utils.mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / (quant_chann / 2.)
        real_targets = x_scaled
        cate_targets = tf.cast(x_quantized, tf.int32) + tf.cast(
            quant_chann / 2., tf.int32)
    else:
        x_quantized = utils.cast_quantize(x, quant_chann)
        x_scaled = x
        real_targets = x
        cate_targets = tf.cast(x_quantized, tf.int32) + tf.cast(
            quant_chann / 2., tf.int32)
    x_scaled = tf.expand_dims(x_scaled, 2)

    ###
    # The WaveNet decoder.
    ###
    l = masked.shift_right(x_scaled)
    l = masked.conv1d(l, num_filters=width, filter_length=filter_length,
                      name='startconv')

    # Set up skip connections.
    s = masked.conv1d(l, num_filters=skip_width, filter_length=1,
                      name='skip_start')

    # Residual blocks with skip connections.
    for i in range(num_layers):
        dilation = 2 ** (i % num_stages)
        d = masked.conv1d(l, num_filters=2 * width,
                          filter_length=filter_length, dilation=dilation,
                          name='dilated_conv_%d' % (i + 1))
        c = masked.conv1d(mel_en, num_filters=2 * width, filter_length=1,
                          name='mel_cond_%d' % (i + 1))
        d = _condition(d, c)

        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(d, num_filters=width, filter_length=1,
                           name='res_%d' % (i + 1))
        s += masked.conv1d(d, num_filters=skip_width, filter_length=1,
                           name='skip_%d' % (i + 1))

    s = tf.nn.relu(s)
    s = masked.conv1d(s, num_filters=skip_width, filter_length=1, name='out1')
    c = masked.conv1d(mel_en, num_filters=skip_width, filter_length=1,
                      name='mel_cond_out1')
    s = _condition(s, c)
    s = tf.nn.relu(s)

    # With the MoL loss the model always predicts log_scale; this initializer
    # keeps the initial log_scale in a reasonably small range to speed up
    # convergence.
    final_kernel_init = (tf.truncated_normal_initializer(0.0, 0.01)
                         if self.loss_type == 'mol'
                         else tf.uniform_unit_scaling_initializer(1.0))
    out = masked.conv1d(s, num_filters=out_width, filter_length=1,
                        name='out2', kernel_initializer=final_kernel_init)

    return {
        'real_targets': real_targets,
        'cate_targets': cate_targets,
        'encoding': mel_en,
        'out_params': out,
    }
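
# Hedged sketch of the 8-bit mu-law companding used above (NumPy; the exact
# utils.mu_law implementation is not shown here and may differ in its
# rounding and range conventions). The standard formula is
#   f(x) = sign(x) * log(1 + mu * |x|) / log(1 + mu),  x in [-1, 1],
# which is then quantized to integer codes roughly in
# [-quant_chann / 2, quant_chann / 2), matching the cate_targets offset of
# quant_chann / 2 in feed_forward above.
import numpy as np

def mu_law_sketch(x, quant_chann=256):
    mu = quant_chann - 1
    compressed = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return np.floor(compressed * (quant_chann / 2)).astype(np.int32)

x = np.array([-1.0, -0.1, 0.0, 0.1, 0.999])
q = mu_law_sketch(x)                       # integer codes in [-128, 127]
scaled = q.astype(np.float32) / (256 / 2.)  # mirrors x_scaled above
print(q, scaled)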