def deconv_stack(self, mel_inputs, name='', init=False):
    mel = mel_inputs['mel']
    deconv_width = self.hparams.deconv_width
    deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]
    upsample_act = self.upsample_act
    use_resize_conv = self.use_resize_conv
    use_weight_norm = self.use_weight_norm

    mel_en = wavenet._deconv_stack(
        mel, deconv_width, deconv_config,
        act=upsample_act,
        use_resize_conv=use_resize_conv,
        name=name,
        use_weight_norm=use_weight_norm,
        init=init)
    return {'encoding': mel_en}
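# Sketch (not part of the original class): assuming each deconv_config entry is
# a [filter_length, stride] pair as the comment above suggests, the product of
# the strides is the total upsampling rate from mel frames to samples. The
# helper name and the example values below are illustrative only.
def _total_upsample_rate(deconv_config):
    rate = 1
    for _, stride in deconv_config:
        rate *= stride
    return rate

# e.g. _total_upsample_rate([[32, 16], [32, 16]]) == 256, i.e. one mel frame
# would be stretched to 256 waveform samples (a common hop size).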
def _create_iaf(self, inputs, iaf_idx, init):
    """Builds one IAF flow: weight-norm variant with manual final-layer init."""
    num_stages = self.hparams.num_stages
    num_layers = self.hparams.num_iaf_layers[iaf_idx]
    filter_length = self.hparams.filter_length
    width = self.hparams.width
    out_width = self.out_width
    deconv_width = self.hparams.deconv_width
    deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]
    use_weight_norm = self.use_weight_norm
    use_resize_conv = self.use_resize_conv
    upsample_act = self.upsample_act
    gate_width = width

    final_init, final_bias = PWNHelper.manual_finit_or_not_fn(init, iaf_idx)

    mel = inputs['mel']
    x = inputs['x']

    iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

    # Upsample the mel conditioning to sample rate for this flow.
    mel_en = wavenet._deconv_stack(
        mel, deconv_width, deconv_config,
        act=upsample_act,
        use_resize_conv=use_resize_conv,
        name=iaf_name,
        use_weight_norm=use_weight_norm,
        init=init)

    l = masked.shift_right(x)
    l = masked.conv1d(l,
                      num_filters=width,
                      filter_length=filter_length,
                      name='{}/start_conv'.format(iaf_name),
                      use_weight_norm=use_weight_norm,
                      init=init)

    for i in range(num_layers):
        dilation = 2 ** (i % num_stages)
        d = masked.conv1d(l,
                          num_filters=gate_width,
                          filter_length=filter_length,
                          dilation=dilation,
                          name='{}/dilated_conv_{:d}'.format(iaf_name, i + 1),
                          use_weight_norm=use_weight_norm,
                          init=init)
        c = masked.conv1d(mel_en,
                          num_filters=gate_width,
                          filter_length=1,
                          name='{}/mel_cond_{:d}'.format(iaf_name, i + 1),
                          use_weight_norm=use_weight_norm,
                          init=init)
        d = wavenet._condition(d, c)

        # Gated activation: first half of the channels gates the second half.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(d,
                           num_filters=width,
                           filter_length=1,
                           name='{}/res_{:d}'.format(iaf_name, i + 1),
                           use_weight_norm=use_weight_norm,
                           init=init)

    l = tf.nn.relu(l)
    l = masked.conv1d(l,
                      num_filters=width,
                      filter_length=1,
                      name='{}/out1'.format(iaf_name),
                      use_weight_norm=use_weight_norm,
                      init=init)
    c = masked.conv1d(mel_en,
                      num_filters=width,
                      filter_length=1,
                      name='{}/mel_cond_out1'.format(iaf_name),
                      use_weight_norm=use_weight_norm,
                      init=init)
    l = wavenet._condition(l, c)
    l = tf.nn.relu(l)

    mean = masked.conv1d(l,
                         num_filters=out_width // 2,
                         filter_length=1,
                         name='{}/out2_mean'.format(iaf_name),
                         use_weight_norm=use_weight_norm,
                         init=final_init)
    scale_params = masked.conv1d(
        l,
        num_filters=out_width // 2,
        filter_length=1,
        name='{}/out2_scale'.format(iaf_name),
        use_weight_norm=use_weight_norm,
        init=final_init,
        biases_initializer=tf.constant_initializer(final_bias))

    scale, log_scale = PWNHelper.scale_log_scale_fn(scale_params)
    new_x = x * scale + mean

    if DETAIL_LOG:
        tf.summary.scalar('scale_{}'.format(iaf_idx), tf.reduce_mean(scale))
        tf.summary.scalar('log_scale_{}'.format(iaf_idx),
                          tf.reduce_mean(log_scale))
        tf.summary.scalar('mean_{}'.format(iaf_idx), tf.reduce_mean(mean))

    return {'x': new_x,
            'mean': mean,
            'scale': scale,
            'log_scale': log_scale}
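# Sketch (hedged, not from the original code): how the per-flow outputs of
# _create_iaf are typically combined when several flows are stacked. Each flow
# applies x <- x * scale_i + mean_i, so the whole stack is again a single
# affine transform of the input noise z, which is what a parallel-WaveNet-style
# loss needs. The function name run_flow and the scalar stand-ins below are
# hypothetical; the real code operates on tensors.
def run_flow(flows, z):
    x = z
    total_scale = 1.0
    total_mean = 0.0
    for scale_i, mean_i in flows:          # per-flow affine parameters
        x = x * scale_i + mean_i           # the 'x' each _create_iaf returns
        total_mean = total_mean * scale_i + mean_i
        total_scale = total_scale * scale_i
    # Invariant: x == z * total_scale + total_mean
    return x, total_scale, total_mean

# Example with scalars: run_flow([(2.0, 0.5), (0.5, -1.0)], 1.0)
# returns (0.25, 1.0, -0.75), and 1.0 * 1.0 + (-0.75) == 0.25 as expected.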
def _create_iaf(self, inputs, iaf_idx):
    """Builds one IAF flow: basic variant, scale predicted directly and clipped."""
    num_stages = self.hparams.num_stages
    num_layers = self.hparams.num_iaf_layers[iaf_idx]
    filter_length = self.hparams.filter_length
    width = self.hparams.width
    out_width = self.out_width
    deconv_width = self.hparams.deconv_width
    deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]

    mel = inputs['mel']
    x = inputs['x']

    iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

    mel_en = wavenet._deconv_stack(
        mel, deconv_width, deconv_config, name=iaf_name)

    l = masked.shift_right(x)
    l = masked.conv1d(l,
                      num_filters=width,
                      filter_length=filter_length,
                      name='{}/start_conv'.format(iaf_name))

    for i in range(num_layers):
        dilation = 2 ** (i % num_stages)
        d = masked.conv1d(l,
                          num_filters=2 * width,
                          filter_length=filter_length,
                          dilation=dilation,
                          name='{}/dilated_conv_{:d}'.format(iaf_name, i + 1))
        c = masked.conv1d(mel_en,
                          num_filters=2 * width,
                          filter_length=1,
                          name='{}/mel_cond_{:d}'.format(iaf_name, i + 1))
        d = wavenet._condition(d, c)

        # Gated activation: first half of the channels gates the second half.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(d,
                           num_filters=width,
                           filter_length=1,
                           name='{}/res_{:d}'.format(iaf_name, i + 1))

    l = tf.nn.relu(l)
    l = masked.conv1d(l,
                      num_filters=width,
                      filter_length=1,
                      name='{}/out1'.format(iaf_name))
    c = masked.conv1d(mel_en,
                      num_filters=width,
                      filter_length=1,
                      name='{}/mel_cond_out1'.format(iaf_name))
    l = wavenet._condition(l, c)
    l = tf.nn.relu(l)

    out = masked.conv1d(l,
                        num_filters=out_width,
                        filter_length=1,
                        name='{}/out2'.format(iaf_name))
    mean, scale = tf.split(out, num_or_size_splits=2, axis=2)
    # Keep the scale strictly positive and in a numerically safe range.
    scale = tf.clip_by_value(scale, tf.exp(-7.0), tf.exp(7.0))

    new_x = x * scale + mean
    return {'x': new_x, 'mean': mean, 'scale': scale}
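# Sketch (hedged): the split-and-gate step used inside the dilated-conv loop
# above, written out with NumPy for clarity. The conditioned tensor d carries
# 2 * width channels on its last axis; the first half gates (sigmoid) and the
# second half carries the signal (tanh), as in the WaveNet gated activation.
import numpy as np

def gated_activation(d):
    assert d.shape[-1] % 2 == 0
    m = d.shape[-1] // 2
    return 1.0 / (1.0 + np.exp(-d[..., :m])) * np.tanh(d[..., m:])

# On a (batch, time, 2 * width) array this returns (batch, time, width),
# e.g. gated_activation(np.zeros((1, 4, 8))).shape == (1, 4, 4).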
def _create_iaf(self, inputs, iaf_idx):
    """Builds one IAF flow: scale parameterized as log-scale or via softplus."""
    num_stages = self.hparams.num_stages
    num_layers = self.hparams.num_iaf_layers[iaf_idx]
    filter_length = self.hparams.filter_length
    width = self.hparams.width
    out_width = self.out_width
    deconv_width = self.hparams.deconv_width
    deconv_config = self.hparams.deconv_config  # [[l1, s1], [l2, s2]]
    use_log_scale = getattr(self.hparams, 'use_log_scale', True)

    mel = inputs['mel']
    x = inputs['x']

    iaf_name = 'iaf_{:d}'.format(iaf_idx + 1)

    mel_en = wavenet._deconv_stack(
        mel, deconv_width, deconv_config, name=iaf_name)

    l = masked.shift_right(x)
    l = masked.conv1d(l,
                      num_filters=width,
                      filter_length=filter_length,
                      name='{}/start_conv'.format(iaf_name))

    for i in range(num_layers):
        dilation = 2 ** (i % num_stages)
        d = masked.conv1d(
            l,
            num_filters=2 * width,
            filter_length=filter_length,
            dilation=dilation,
            name='{}/dilated_conv_{:d}'.format(iaf_name, i + 1))
        c = masked.conv1d(
            mel_en,
            num_filters=2 * width,
            filter_length=1,
            name='{}/mel_cond_{:d}'.format(iaf_name, i + 1))
        d = wavenet._condition(d, c)

        # Gated activation: first half of the channels gates the second half.
        assert d.get_shape().as_list()[2] % 2 == 0
        m = d.get_shape().as_list()[2] // 2
        d_sigmoid = tf.sigmoid(d[:, :, :m])
        d_tanh = tf.tanh(d[:, :, m:])
        d = d_sigmoid * d_tanh

        l += masked.conv1d(d,
                           num_filters=width,
                           filter_length=1,
                           name='{}/res_{:d}'.format(iaf_name, i + 1))

    l = tf.nn.relu(l)
    l = masked.conv1d(l,
                      num_filters=width,
                      filter_length=1,
                      name='{}/out1'.format(iaf_name))
    c = masked.conv1d(mel_en,
                      num_filters=width,
                      filter_length=1,
                      name='{}/mel_cond_out1'.format(iaf_name))
    l = wavenet._condition(l, c)
    l = tf.nn.relu(l)

    # Small-variance init keeps the predicted scale in a reasonably small
    # range when use_log_scale=True.
    final_kernel_init = (tf.truncated_normal_initializer(0., 0.01)
                         if use_log_scale
                         else tf.uniform_unit_scaling_initializer(1.0))
    out = masked.conv1d(l,
                        num_filters=out_width,
                        filter_length=1,
                        name='{}/out2'.format(iaf_name),
                        kernel_initializer=final_kernel_init)
    mean, scale_params = tf.split(out, num_or_size_splits=2, axis=2)

    if use_log_scale:
        log_scale = tf.clip_by_value(scale_params, -9.0, 7.0)
        scale = tf.exp(log_scale)
    else:
        scale_params = tf.nn.softplus(scale_params)
        scale = tf.clip_by_value(scale_params, tf.exp(-9.0), tf.exp(7.0))
        log_scale = tf.log(scale)

    new_x = x * scale + mean

    if DETAIL_LOG:
        tf.summary.scalar('scale_{}'.format(iaf_idx), tf.reduce_mean(scale))
        tf.summary.scalar('log_scale_{}'.format(iaf_idx),
                          tf.reduce_mean(log_scale))
        tf.summary.scalar('mean_{}'.format(iaf_idx), tf.reduce_mean(mean))

    return {'x': new_x,
            'mean': mean,
            'scale': scale,
            'log_scale': log_scale}
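# Sketch (hedged): the two scale parameterizations above, side by side in
# NumPy. In both branches the usable scale ends up inside
# [exp(-9), exp(7)] ~= [1.2e-4, 1.1e3]; the log-scale branch also yields the
# log_scale term an IAF log-likelihood needs without a separate log().
import numpy as np

def scale_from_params(scale_params, use_log_scale=True):
    if use_log_scale:
        log_scale = np.clip(scale_params, -9.0, 7.0)
        scale = np.exp(log_scale)
    else:
        softplus = np.log1p(np.exp(scale_params))
        scale = np.clip(softplus, np.exp(-9.0), np.exp(7.0))
        log_scale = np.log(scale)
    return scale, log_scale

# scale_from_params(0.0, True)  -> (1.0, 0.0)
# scale_from_params(0.0, False) -> (~0.693, ~-0.366), i.e. softplus(0) = ln 2.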