Example #1
def reference_encoder(inputs,
                      filters,
                      kernel_size,
                      strides,
                      encoder_cell,
                      is_training,
                      scope='ref_encoder'):
    with tf.variable_scope(scope):
        ref_outputs = tf.expand_dims(inputs, axis=-1)
        # CNN stack
        for i, channel in enumerate(filters):
            ref_outputs = conv2d(ref_outputs, channel, kernel_size, strides,
                                 tf.nn.relu, is_training, 'conv2d_%d' % i)

        shapes = shape_list(ref_outputs)
        ref_outputs = tf.reshape(ref_outputs,
                                 shapes[:-2] + [shapes[2] * shapes[3]])
        # RNN
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell,
                                                           ref_outputs,
                                                           dtype=tf.float32)

        reference_state = tf.layers.dense(encoder_outputs[:, -1, :],
                                          128,
                                          activation=tf.nn.tanh)  # [N, 128]
        return reference_state
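A minimal usage sketch for the encoder above (the filter widths, GRU size, and variable names here are illustrative assumptions, mirroring how the function is called in Example #5 below): a [N, T_ref, n_mels] reference mel spectrogram is compressed into a single [N, 128] reference embedding.

# Hypothetical call; the hyperparameter values below are assumptions, not part of the example.
ref_embedding = reference_encoder(
    reference_mel,                           # float32 [N, T_ref, n_mels]
    filters=[32, 32, 64, 64, 128, 128],      # assumed CNN stack channel sizes
    kernel_size=(3, 3),
    strides=(2, 2),
    encoder_cell=tf.nn.rnn_cell.GRUCell(128),
    is_training=is_training)                 # -> [N, 128]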
def _combine_heads(self, x):
    '''Combine all attention heads back into a single tensor.

    Returns:
        a Tensor with shape [batch, length_x, num_heads * dim_per_head]
    '''
    x = tf.transpose(x, [0, 2, 1, 3])
    x_shape = shape_list(x)
    return tf.reshape(x, x_shape[:-2] + [self.num_heads * x_shape[-1]])

def _split_last_dimension(self, x, num_heads):
    '''Reshape the last dimension of x into num_heads parts.

    Returns:
        a Tensor with shape [batch, length_x, num_heads, dim_x / num_heads]
    '''
    x_shape = shape_list(x)
    dim = x_shape[-1]
    assert dim % num_heads == 0
    return tf.reshape(x, x_shape[:-1] + [num_heads, dim // num_heads])

def _split_heads(self, q, k, v):
    '''Split the channels into multiple heads.

    Returns:
        Tensors with shape [batch, num_heads, length_x, dim_x / num_heads]
    '''
    qs = tf.transpose(self._split_last_dimension(q, self.num_heads),
                      [0, 2, 1, 3])
    ks = tf.transpose(self._split_last_dimension(k, self.num_heads),
                      [0, 2, 1, 3])
    # v is not split; it is tiled so every head attends over the same values.
    vs = tf.tile(tf.expand_dims(v, axis=1), [1, self.num_heads, 1, 1])
    return qs, ks, vs
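For context, these helpers typically bracket the scaled dot-product step inside the attention call. The sketch below is an assumption about that surrounding code (the method name, and the omission of the query/key projections and scaling, are illustrative only), not part of the listing:

# Hypothetical sketch of how the helpers above compose inside the attention computation.
def _dot_product_attention_sketch(self, q, k, v):
    # q, k, v: [N, T, num_units]; projections and scaling are omitted in this sketch.
    qs, ks, vs = self._split_heads(q, k, v)         # qs, ks: [N, h, T, d/h]; vs: [N, h, T, d_v]
    logits = tf.matmul(qs, ks, transpose_b=True)    # [N, h, T_q, T_k]
    weights = tf.nn.softmax(logits)                 # attention over the tokens/keys
    context = tf.matmul(weights, vs)                # [N, h, T_q, d_v]
    return self._combine_heads(context)             # [N, T_q, h * d_v]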
Example #5
	def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False, split_infos=None, reference_mel=None):
		"""
		Initializes the model for inference.
		Sets the "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
		if mel_targets is None and stop_token_targets is not None:
			raise ValueError('No mel targets were provided but token_targets were given')
		if mel_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
			raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
		if gta and linear_targets is not None:
			raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(self._hparams.tacotron_gpu_start_idx)
		with tf.device(split_device):
			hp = self._hparams
			lout_int = [tf.int32]*hp.tacotron_num_gpus
			lout_float = [tf.float32]*hp.tacotron_num_gpus

			tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
			tower_targets_lengths = tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths

			p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
			p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:,1]], lout_float) if mel_targets is not None else mel_targets
			p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:,2]], lout_float) if stop_token_targets is not None else stop_token_targets
			p_linear_targets = tf.py_func(split_func, [linear_targets, split_infos[:,3]], lout_float) if linear_targets is not None else linear_targets

			tower_inputs = []
			tower_mel_targets = []
			tower_stop_token_targets = []
			tower_linear_targets = []

			batch_size = tf.shape(inputs)[0]
			mel_channels = hp.num_mels
			linear_channels = hp.num_freq
			for i in range(hp.tacotron_num_gpus):
				tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
				if p_mel_targets is not None:
					tower_mel_targets.append(tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
				if p_stop_token_targets is not None:
					tower_stop_token_targets.append(tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
				if p_linear_targets is not None:
					tower_linear_targets.append(tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))

		self.tower_decoder_output = []
		self.tower_alignments = []
		self.tower_stop_token_prediction = []
		self.tower_mel_outputs = []
		self.tower_linear_outputs = []

		tower_embedded_inputs = []
		tower_enc_conv_output_shape = []
		tower_encoder_outputs = []
		tower_residual = []
		tower_projected_residual = []
		
		# 1. Declare GPU Devices
		gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx+hp.tacotron_num_gpus)]
		for i in range(hp.tacotron_num_gpus):
			with tf.device(tf.train.replica_device_setter(ps_tasks=1,ps_device="/cpu:0",worker_device=gpus[i])):
				with tf.variable_scope('inference') as scope:
					assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
					if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
						assert global_step is not None

					#GTA is only used for predicting mels to train the WaveNet vocoder, so we omit post-processing when doing GTA synthesis
					post_condition = hp.predict_linear and not gta

					# Embeddings ==> [batch_size, sequence_length, embedding_dim]
					self.embedding_table = tf.get_variable(
						'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
					embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])

					if hp.use_gst:
						#Global style tokens (GST)
						gst_tokens = tf.get_variable('style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], 
							dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5))
						self.gst_tokens = gst_tokens

					#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
					encoder_cell = TacotronEncoderCell(
						EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
						EncoderRNN(is_training, size=hp.encoder_lstm_units,
							zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

					encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])

					#For shape visualization purposes
					enc_conv_output_shape = encoder_cell.conv_output_shape
					if is_training:
						reference_mel = mel_targets
						
					if reference_mel is not None:
						# Reference encoder
						refnet_outputs = reference_encoder(
							reference_mel,
							filters=hp.reference_filters,
							kernel_size=(3, 3),
							strides=(2, 2),
							encoder_cell=GRUCell(hp.reference_depth),
							is_training=is_training)  # [N, 128]
						self.refnet_outputs = refnet_outputs

						if hp.use_gst:
							# Style attention
							style_attention = MultiheadAttention(
								tf.expand_dims(refnet_outputs, axis=1),                                   # [N, 1, 128]
								tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
								num_heads=hp.num_heads,
								num_units=hp.style_att_dim,
								attention_type=hp.style_att_type)

							style_embeddings = style_attention.multi_head_attention() 
						else:
							style_embeddings = tf.expand_dims(refnet_outputs, axis=1)                   # [N, 1, 128]
					else:
						if hp.use_gst:
							print("Use random weight for GST.")
							random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
							random_weights = tf.nn.softmax(random_weights, name="random_weights")
							print("random_weights:",random_weights)
							style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
							style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])
					

					#Extend style embeddings to be compatible with encoder_outputs:
					#match encoder_outputs' last dimension by concatenating the style embeddings with an equal-sized vector of zeros,
					#which preserves the effect of both the style embeddings and encoder_outputs.
					if hp.use_gst:
						neg = tf.add(style_embeddings, tf.negative(style_embeddings))  # zeros with the same shape as style_embeddings
						style_embeddings = tf.concat([style_embeddings, neg], axis=-1)
						# Add the style embedding to every text encoder state
						style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 2 * style_dim]
						encoder_outputs = tf.add(encoder_outputs, style_embeddings)

					#Decoder Parts
					#Attention Decoder Prenet
					prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
					#Attention Mechanism
					attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
						mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]), smoothing=hp.smoothing,
						cumulate_weights=hp.cumulative_weights)
					#Decoder LSTM Cells
					decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
						size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM')
					#Frames Projection layer
					frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection')
					#<stop_token> projection layer
					stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


					#Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
					decoder_cell = TacotronDecoderCell(
						prenet,
						attention_mechanism,
						decoder_lstm,
						frame_projection,
						stop_projection)


					#Define the helper for our decoder
					if is_training or is_evaluating or gta:
						self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step)
					else:
						self.helper = TacoTestHelper(batch_size, hp)


					#initial decoder state
					decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

					#Only use max iterations at synthesis time
					max_iters = hp.max_iters if not (is_training or is_evaluating) else None

					#Decode
					(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
						CustomDecoder(decoder_cell, self.helper, decoder_init_state),
						impute_finished=False,
						maximum_iterations=max_iters,
						swap_memory=hp.tacotron_swap_with_cpu)


					# Reshape outputs to be one output per entry 
					#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
					decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
					stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

					#Postnet
					postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

					#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
					residual = postnet(decoder_output)

					#Project residual to same dimension as mel spectrogram 
					#==> [batch_size, decoder_steps * r, num_mels]
					residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
					projected_residual = residual_projection(residual)


					#Compute the mel spectrogram
					mel_outputs = decoder_output + projected_residual


					if post_condition:
						# Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
						post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels],
							hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, 
							hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet')

						#[batch_size, decoder_steps(mel_frames), cbhg_channels]
						post_outputs = post_cbhg(mel_outputs, None)

						#Linear projection of extracted features to make linear spectrogram
						linear_specs_projection = FrameProjection(hp.num_freq, scope='cbhg_linear_specs_projection')

						#[batch_size, decoder_steps(linear_frames), num_freq]
						linear_outputs = linear_specs_projection(post_outputs)

					#Grab alignments from the final decoder state
					alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

					self.tower_decoder_output.append(decoder_output)
					self.tower_alignments.append(alignments)
					self.tower_stop_token_prediction.append(stop_token_prediction)
					self.tower_mel_outputs.append(mel_outputs)
					tower_embedded_inputs.append(embedded_inputs)
					tower_enc_conv_output_shape.append(enc_conv_output_shape)
					tower_encoder_outputs.append(encoder_outputs)
					tower_residual.append(residual)
					tower_projected_residual.append(projected_residual)

					if post_condition:
						self.tower_linear_outputs.append(linear_outputs)
			log('Initialization done {}'.format(gpus[i]))


		if is_training:
			self.ratio = self.helper._ratio
		self.tower_inputs = tower_inputs
		self.tower_input_lengths = tower_input_lengths
		self.tower_mel_targets = tower_mel_targets
		self.tower_linear_targets = tower_linear_targets
		self.tower_targets_lengths = tower_targets_lengths
		self.tower_stop_token_targets = tower_stop_token_targets

		self.all_vars = tf.trainable_variables()

		log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
		log('  Train mode:               {}'.format(is_training))
		log('  Eval mode:                {}'.format(is_evaluating))
		log('  GTA mode:                 {}'.format(gta))
		log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
		log('  Input:                    {}'.format(inputs.shape))
		for i in range(hp.tacotron_num_gpus):
			log('  device:                   {}'.format(gpus[i]))
			log('  embedding:                {}'.format(tower_embedded_inputs[i].shape))
			log('  enc conv out:             {}'.format(tower_enc_conv_output_shape[i]))
			log('  encoder out:              {}'.format(tower_encoder_outputs[i].shape))
			log('  decoder out:              {}'.format(self.tower_decoder_output[i].shape))
			log('  residual out:             {}'.format(tower_residual[i].shape))
			log('  projected residual out:   {}'.format(tower_projected_residual[i].shape))
			log('  mel out:                  {}'.format(self.tower_mel_outputs[i].shape))
			if post_condition:
				log('  linear out:               {}'.format(self.tower_linear_outputs[i].shape))
			log('  <stop_token> out:         {}'.format(self.tower_stop_token_prediction[i].shape))

			#1_000_000 is causing syntax problems for some people?! Python please :)
			log('  Tacotron Parameters       {:.3f} Million.'.format(np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
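A hedged driver sketch for the method above at synthesis time (the Tacotron class name, the hparams object, and the placeholder shapes are assumptions inferred from the argument docstring, not shown in the example):

# Hypothetical usage; names outside initialize()'s own attributes are assumptions.
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')
split_infos = tf.placeholder(tf.int32, [hparams.tacotron_num_gpus, None], name='split_infos')
reference_mel = tf.placeholder(tf.float32, [None, None, hparams.num_mels], name='reference_mel')

model = Tacotron(hparams)
model.initialize(inputs, input_lengths, split_infos=split_infos,
                 reference_mel=reference_mel, is_training=False)
mel_outputs = model.tower_mel_outputs[0]   # [N, T_out, num_mels]
alignments = model.tower_alignments[0]     # [N, T_in, decoder_steps]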