def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None): '''Initialize wavenet graph for train, eval and test cases. ''' hparams = self._hparams self.is_training = x is not None self.is_evaluating = not self.is_training and y is not None #Set all convolutions to corresponding mode self.set_mode(self.is_training) log('Initializing Wavenet model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(self.is_training)) log(' Eval mode: {}'.format(self.is_evaluating)) log(' Synthesis mode: {}'.format(not (self.is_training or self.is_evaluating))) with tf.variable_scope('inference') as scope: #Training if self.is_training: batch_size = tf.shape(x)[0] #[batch_size, time_length, 1] self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1]) #To be used in loss computation #[batch_size, channels, time_length] y_hat = self.step(x, c, g, softmax=False) #softmax is automatically computed inside softmax_cross_entropy if needed if is_mulaw_quantize(hparams.input_type): #[batch_size, time_length, channels] self.y_hat_q = tf.transpose(y_hat, [0, 2, 1]) self.y_hat = y_hat self.y = y self.input_lengths = input_lengths #Graph extension for log saving #[batch_size, time_length] shape_control = (batch_size, tf.shape(x)[-1], 1) with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]): y_log = tf.squeeze(y, [-1]) if is_mulaw_quantize(hparams.input_type): self.y = y_log y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4), lambda: tf.squeeze(y_hat, [-1]), lambda: y_hat) y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1]) if is_mulaw_quantize(hparams.input_type): #[batch_size, time_length] y_hat_log = tf.reduce_max(tf.nn.softmax(y_hat_log, axis=1), 1) y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels) y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels) else: #[batch_size, time_length] y_hat_log = sample_from_discretized_mix_logistic( y_hat_log, log_scale_min=hparams.log_scale_min) if is_mulaw(hparams.input_type): y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels) y_log = util.inv_mulaw(y_log, hparams.quantize_channels) self.y_hat_log = y_hat_log self.y_log = y_log log(' inputs: {}'.format(x.shape)) if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' targets: {}'.format(y_log.shape)) log(' outputs: {}'.format(y_hat_log.shape)) #evaluating elif self.is_evaluating: #[time_length, ] idx = 0 length = input_lengths[idx] y_target = tf.reshape(y[idx], [-1])[:length] if c is not None: c = tf.expand_dims(c[idx, :, :length], axis=0) with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]): c = tf.identity(c, name='eval_assert_c_rank_op') if g is not None: g = g[idx] #Start silence frame if is_mulaw_quantize(hparams.input_type): initial_value = mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 #[channels, ] if is_mulaw_quantize(hparams.input_type): initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32) initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels]) else: initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value #Fast eval y_hat = self.incremental(initial_input, c=c, g=g, time_length=length, softmax=True, quantize=True, log_scale_min=hparams.log_scale_min) #Save targets and length for eval loss computation if is_mulaw_quantize(hparams.input_type): self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length] else: self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :] self.eval_length = length if is_mulaw_quantize(hparams.input_type): y_hat = tf.reshape(tf.reduce_max(y_hat, axis=1), [-1]) y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels) y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels) y_target = inv_mulaw(y_target, hparams.quantize_channels) else: y_hat = tf.reshape(y_hat, [-1]) self.y_hat = y_hat self.y_target = y_target if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' targets: {}'.format(y_target.shape)) log(' outputs: {}'.format(y_hat.shape)) #synthesizing else: if c is None: assert synthesis_length is not None else: #[batch_size, local_condition_time, local_condition_dimension(num_mels)] message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'.format( hparams.cin_channels, c.shape)) with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]): c = tf.identity(c, name='synthesis_assert_c_rank_op') Tc = tf.shape(c)[1] upsample_factor = audio.get_hop_size(self._hparams) #Overwrite length with respect to local condition features synthesis_length = Tc * upsample_factor #[batch_size, local_condition_dimension, local_condition_time] #time_length will be corrected using the upsample network c = tf.transpose(c, [0, 2, 1]) #Start silence frame if is_mulaw_quantize(hparams.input_type): initial_value = mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 if is_mulaw_quantize(hparams.input_type): assert initial_value >= 0 and initial_value < hparams.quantize_channels initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32) initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels]) else: initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length, softmax=True, quantize=True, log_scale_min=hparams.log_scale_min) if is_mulaw_quantize(hparams.input_type): y_hat = tf.reshape(tf.reduce_max(y_hat, axis=1), [-1]) y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = util.inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels) else: y_hat = tf.reshape(y_hat, [-1]) self.y_hat = y_hat if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' outputs: {}'.format(y_hat.shape)) self.variables = tf.trainable_variables() self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None): '''Initialize wavenet graph for train, eval and test cases. ''' hparams = self._hparams self.is_training = x is not None self.is_evaluating = not self.is_training and y is not None #Set all convolutions to corresponding mode self.set_mode(self.is_training) log('Initializing Wavenet model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(self.is_training)) log(' Eval mode: {}'.format(self.is_evaluating)) log(' Synthesis mode: {}'.format(not ( self.is_training or self.is_evaluating))) with tf.variable_scope('inference') as scope: #Training if self.is_training: batch_size = tf.shape(x)[0] #[batch_size, time_length, 1] self.mask = self.get_mask( input_lengths, maxlen=tf.shape(x)[-1]) #To be used in loss computation #[batch_size, channels, time_length] y_hat = self.step( x, c, g, softmax=False ) #softmax is automatically computed inside softmax_cross_entropy if needed if is_mulaw_quantize(hparams.input_type): #[batch_size, time_length, channels] self.y_hat_q = tf.transpose(y_hat, [0, 2, 1]) self.y_hat = y_hat self.y = y self.input_lengths = input_lengths #Add mean and scale stats if using Guassian distribution output (there would be too many logistics if using MoL) if self._hparams.out_channels == 2: self.means = self.y_hat[:, 0, :] self.log_scales = self.y_hat[:, 1, :] else: self.means = None #Graph extension for log saving #[batch_size, time_length] shape_control = (batch_size, tf.shape(x)[-1], 1) with tf.control_dependencies( [tf.assert_equal(tf.shape(y), shape_control)]): y_log = tf.squeeze(y, [-1]) if is_mulaw_quantize(hparams.input_type): self.y = y_log y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4), lambda: tf.squeeze(y_hat, [-1]), lambda: y_hat) y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1]) if is_mulaw_quantize(hparams.input_type): #[batch_size, time_length] y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1) y_hat_log = util.inv_mulaw_quantize( y_hat_log, hparams.quantize_channels) y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels) else: #[batch_size, time_length] if hparams.out_channels == 2: y_hat_log = sample_from_gaussian( y_hat_log, log_scale_min_gauss=hparams.log_scale_min_gauss) else: y_hat_log = sample_from_discretized_mix_logistic( y_hat_log, log_scale_min=hparams.log_scale_min) if is_mulaw(hparams.input_type): y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels) y_log = util.inv_mulaw(y_log, hparams.quantize_channels) self.y_hat_log = y_hat_log self.y_log = y_log log(' inputs: {}'.format(x.shape)) if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' targets: {}'.format(y_log.shape)) log(' outputs: {}'.format(y_hat_log.shape)) #evaluating elif self.is_evaluating: #[time_length, ] idx = 0 length = input_lengths[idx] y_target = tf.reshape(y[idx], [-1])[:length] if c is not None: c = tf.expand_dims(c[idx, :, :length], axis=0) with tf.control_dependencies( [tf.assert_equal(tf.rank(c), 3)]): c = tf.identity(c, name='eval_assert_c_rank_op') if g is not None: g = tf.expand_dims(g[idx], axis=0) batch_size = tf.shape(c)[0] #Start silence frame if is_mulaw_quantize(hparams.input_type): initial_value = mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 #[channels, ] if is_mulaw_quantize(hparams.input_type): initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32) initial_input = tf.tile( tf.reshape(initial_input, [1, 1, hparams.quantize_channels]), [batch_size, 1, 1]) else: initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value #Fast eval y_hat = self.incremental(initial_input, c=c, g=g, time_length=length, softmax=False, quantize=True, log_scale_min=hparams.log_scale_min) #Save targets and length for eval loss computation if is_mulaw_quantize(hparams.input_type): self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length] else: self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :] self.eval_length = length if is_mulaw_quantize(hparams.input_type): y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1]) y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels) y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels) y_target = inv_mulaw(y_target, hparams.quantize_channels) else: y_hat = tf.reshape(y_hat, [-1]) self.y_hat = y_hat self.y_target = y_target if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' targets: {}'.format(y_target.shape)) log(' outputs: {}'.format(y_hat.shape)) #synthesizing else: batch_size = tf.shape(c)[0] if c is None: assert synthesis_length is not None else: #[batch_size, local_condition_time, local_condition_dimension(num_mels)] message = ( 'Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}' .format(hparams.cin_channels, c.shape)) with tf.control_dependencies( [tf.assert_equal(tf.rank(c), 3, message=message)]): c = tf.identity(c, name='synthesis_assert_c_rank_op') Tc = tf.shape(c)[1] upsample_factor = audio.get_hop_size(self._hparams) #Overwrite length with respect to local condition features synthesis_length = Tc * upsample_factor #[batch_size, local_condition_dimension, local_condition_time] #time_length will be corrected using the upsample network c = tf.transpose(c, [0, 2, 1]) #Start silence frame if is_mulaw_quantize(hparams.input_type): initial_value = mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 if is_mulaw_quantize(hparams.input_type): assert initial_value >= 0 and initial_value < hparams.quantize_channels initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32) initial_input = tf.tile( tf.reshape(initial_input, [1, 1, hparams.quantize_channels]), [batch_size, 1, 1]) else: initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length, softmax=False, quantize=True, log_scale_min=hparams.log_scale_min) if is_mulaw_quantize(hparams.input_type): y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [batch_size, -1]) self.out_node = y_hat y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = util.inv_mulaw(tf.reshape(y_hat, [batch_size, -1]), hparams.quantize_channels) else: y_hat = tf.reshape(y_hat, [batch_size, -1]) self.y_hat = y_hat if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' outputs: {}'.format(y_hat.shape)) self.variables = tf.trainable_variables() self.ema = tf.train.ExponentialMovingAverage( decay=hparams.wavenet_ema_decay)
def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None): '''Initialize wavenet graph for train, eval and test cases. ''' hparams = self._hparams self.is_training = x is not None self.is_evaluating = not self.is_training and y is not None #Set all convolutions to corresponding mode self.set_mode(self.is_training) # split_device = '/cpu:0' if self._hparams.wavenet_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0' # with tf.device(split_device): # hp = self._hparams # lout_int = [tf.int32] * hp.wavenet_num_gpus # lout_float = [tf.float32] * hp.wavenet_num_gpus # # tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if input_lengths is not None else [input_lengths] * hp.wavenet_num_gpus # # tower_y = tf.split(y, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if y is not None else [y] * hp.wavenet_num_gpus # tower_x = tf.split(x, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if x is not None else [x] * hp.wavenet_num_gpus # tower_c = tf.split(c, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if self.local_conditioning_enabled() else [None] * hp.wavenet_num_gpus # tower_g = tf.split(g, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if self.global_conditioning_enabled() else [None] * hp.wavenet_num_gpus # tower_test_inputs = tf.split(test_inputs, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if test_inputs is not None else [test_inputs] * hp.wavenet_num_gpus # # self.tower_y_hat_q = [] # self.tower_y_hat_train = [] # self.tower_y = [] # self.tower_input_lengths = [] # self.tower_means = [] # self.tower_log_scales = [] # self.tower_y_hat_log = [] # self.tower_y_log = [] # self.tower_c = [] # self.tower_y_eval = [] # self.tower_eval_length = [] # self.tower_y_hat = [] # self.tower_y_target = [] # self.tower_eval_c = [] # self.tower_mask = [] # self.tower_upsampled_local_features = [] # self.tower_eval_upsampled_local_features = [] # self.tower_synth_upsampled_local_features = [] # log('Initializing Wavenet model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(self.is_training)) log(' Eval mode: {}'.format(self.is_evaluating)) log(' Synthesis mode: {}'.format(not ( self.is_training or self.is_evaluating))) #1. Declare GPU devices #gpus = ['/gpu:{}'.format(i) for i in range(hp.wavenet_num_gpus)] #for i in range(hp.wavenet_num_gpus): #with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device='/cpu:0', worker_device=gpus[i])): with tf.variable_scope('inference') as scope: #log(' device: {}'.format(i)) #Training if self.is_training: batch_size = tf.shape(x)[0] #[batch_size, time_length, 1] self.mask = self.get_mask( input_lengths, maxlen=tf.shape(x)[-1]) #To be used in loss computation #[batch_size, channels, time_length] y_hat = self.step( x, c, g, softmax=False ) #softmax is automatically computed inside softmax_cross_entropy if needed if is_mulaw_quantize(hparams.input_type): #[batch_size, time_length, channels] self.y_hat_q = tf.transpose(y_hat, [0, 2, 1]) self.y_hat = y_hat self.y = y self.input_lengths = input_lengths #Add mean and scale stats if using Guassian distribution output (there would be too many logistics if using MoL) if self._hparams.out_channels == 2: self.means = self.y_hat[:, 0, :] self.log_scales = y_hat[:, 1, :] else: self.means = None #Graph extension for log saving #[batch_size, time_length] shape_control = (batch_size, tf.shape(x)[-1], 1) with tf.control_dependencies( [tf.assert_equal(tf.shape(y), shape_control)]): y_log = tf.squeeze(y, [-1]) if is_mulaw_quantize(hparams.input_type): self.y = y_log y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4), lambda: tf.squeeze(y_hat, [-1]), lambda: y_hat) y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1]) if is_mulaw_quantize(hparams.input_type): #[batch_size, time_length] y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1) y_hat_log = util.inv_mulaw_quantize( y_hat_log, hparams.quantize_channels) y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels) else: #[batch_size, time_length] if hparams.out_channels == 2: y_hat_log = sample_from_gaussian( y_hat_log, log_scale_min_gauss=hparams.log_scale_min_gauss) else: y_hat_log = sample_from_discretized_mix_logistic( y_hat_log, log_scale_min=hparams.log_scale_min) if is_mulaw(hparams.input_type): y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels) y_log = util.inv_mulaw(y_log, hparams.quantize_channels) self.y_hat_log = y_hat_log self.y_log = y_log # self.tower_c.append(tower_c[i]) # self.tower_upsampled_local_features.append(self.upsampled_local_features) log(' inputs: {}'.format(x.shape)) if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' targets: {}'.format(y_log.shape)) log(' outputs: {}'.format(y_hat_log.shape)) #evaluating elif self.is_evaluating: #[time_length, ] idx = 0 length = input_lengths[idx] y_target = tf.reshape(y[idx], [-1])[:length] #test_inputs = tf.reshape(y_target, [1, -1, 1]) if not hparams.wavenet_natural_eval else None if c is not None: c = tf.expand_dims(c[idx, :, :length], axis=0) with tf.control_dependencies( [tf.assert_equal(tf.rank(c), 3)]): c = tf.identity(c, name='eval_assert_c_rank_op') if g is not None: g = tf.expand_dims(g[idx], axis=0) batch_size = tf.shape(c)[0] #Start silence frame if is_mulaw_quantize(hparams.input_type): initial_value = mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 #[channels, ] if is_mulaw_quantize(hparams.input_type): initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32) initial_input = tf.tile( tf.reshape(initial_input, [1, 1, hparams.quantize_channels]), [batch_size, 1, 1]) else: initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value #Fast eval y_hat = self.incremental( initial_input, c=c, g=g, time_length=length, softmax=False, quantize=True, log_scale_min=hparams.log_scale_min, log_scale_min_gauss=hparams.log_scale_min_gauss) #Save targets and length for eval loss computation if is_mulaw_quantize(hparams.input_type): self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length] else: self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :] self.eval_length = length if is_mulaw_quantize(hparams.input_type): y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1]) y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels) y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels) y_target = inv_mulaw(y_target, hparams.quantize_channels) else: y_hat = tf.reshape(y_hat, [-1]) self.y_hat = y_hat self.y_target = y_target # self.tower_eval_c.append(tower_c[i][idx]) # self.tower_eval_upsampled_local_features.append(self.upsampled_local_features[idx]) if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' targets: {}'.format(y_target.shape)) log(' outputs: {}'.format(y_hat.shape)) #synthesizing else: batch_size = tf.shape(c)[0] if c is None: assert synthesis_length is not None else: #[batch_size, local_condition_time, local_condition_dimension(num_mels)] message = ( 'Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}' .format(hparams.cin_channels, c.shape)) with tf.control_dependencies( [tf.assert_equal(tf.rank(c), 3, message=message)]): c = tf.identity(c, name='synthesis_assert_c_rank_op') Tc = tf.shape(c)[1] upsample_factor = audio.get_hop_size(self._hparams) #Overwrite length with respect to local condition features synthesis_length = Tc * upsample_factor #[batch_size, local_condition_dimension, local_condition_time] #time_length will be corrected using the upsample network c = tf.transpose(c, [0, 2, 1]) if g is not None: assert g.shape == (batch_size, 1) #Start silence frame if is_mulaw_quantize(hparams.input_type): initial_value = mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 if is_mulaw_quantize(hparams.input_type): assert initial_value >= 0 and initial_value < hparams.quantize_channels initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32) initial_input = tf.tile( tf.reshape(initial_input, [1, 1, hparams.quantize_channels]), [batch_size, 1, 1]) else: initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value y_hat = self.incremental( initial_input, c=c, g=g, time_length=synthesis_length, softmax=False, quantize=True, log_scale_min=hparams.log_scale_min, log_scale_min_gauss=hparams.log_scale_min_gauss) if is_mulaw_quantize(hparams.input_type): y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [batch_size, -1]) y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = util.inv_mulaw(tf.reshape(y_hat, [batch_size, -1]), hparams.quantize_channels) else: y_hat = tf.reshape(y_hat, [batch_size, -1]) self.y_hat = y_hat #self.tower_synth_upsampled_local_features.append(self.upsampled_local_features) if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' outputs: {}'.format(y_hat.shape)) self.variables = tf.trainable_variables() log(' Receptive Field: ({} samples / {:.1f} ms)'.format( self.receptive_field, self.receptive_field / hparams.sample_rate * 1000.)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' WaveNet Parameters: {:.3f} Million.'.format( np.sum([np.prod(v.get_shape().as_list()) for v in self.variables]) / 1000000)) self.ema = tf.train.ExponentialMovingAverage( decay=hparams.wavenet_ema_decay)