def _decode(self, input_dict):
  """
  Decodes representation into data

  Args:
    input_dict (dict): Python dictionary with inputs to decoder. Must define:

      * src_inputs - decoder input Tensor of shape [batch_size, time, dim]
        or [time, batch_size, dim]
      * src_lengths - decoder input lengths Tensor of shape [batch_size]
      * tgt_inputs - Only during training. Labels Tensor of shape
        [batch_size, time, num_features] or [time, batch_size, num_features]
      * stop_token_inputs - Only during training. Labels Tensor of shape
        [batch_size, time, 1] or [time, batch_size, 1]
      * tgt_lengths - Only during training. Labels lengths Tensor of shape
        [batch_size]

  Returns:
    dict: A python dictionary containing:

      * outputs - array containing:

          * decoder_output - tensor of shape [batch_size, time, num_features]
            or [time, batch_size, num_features]. Spectrogram representation
            learned by the decoder rnn
          * spectrogram_prediction - tensor of shape
            [batch_size, time, num_features] or
            [time, batch_size, num_features]. Spectrogram containing the
            residual corrections from the postnet if enabled
          * alignments - tensor of shape [batch_size, time, memory_size] or
            [time, batch_size, memory_size]. The alignments learned by the
            attention layer
          * stop_token_prediction - tensor of shape [batch_size, time, 1] or
            [time, batch_size, 1]. The stop token predictions
          * final_sequence_lengths - tensor of shape [batch_size]

      * stop_token_predictions - tensor of shape [batch_size, time, 1] or
        [time, batch_size, 1]. The stop token predictions for use inside the
        loss function.
  """
  encoder_outputs = input_dict['encoder_output']['outputs']
  enc_src_lengths = input_dict['encoder_output']['src_length']
  if self._mode == "train":
    spec = input_dict['target_tensors'][0] if 'target_tensors' in \
        input_dict else None
    spec_length = input_dict['target_tensors'][2] if 'target_tensors' in \
        input_dict else None

  _batch_size = encoder_outputs.get_shape().as_list()[0]

  training = (self._mode == "train")
  regularizer = self.params.get('regularizer', None)

  if self.params.get('enable_postnet', True):
    if "postnet_conv_layers" not in self.params:
      raise ValueError(
          "postnet_conv_layers must be passed from config file if postnet is "
          "enabled"
      )

  if self._both:
    num_audio_features = self._n_feats["mel"]
    if self._mode == "train":
      spec, _ = tf.split(
          spec,
          [self._n_feats['mel'], self._n_feats['magnitude']],
          axis=2
      )
  else:
    num_audio_features = self._n_feats

  output_projection_layer = tf.layers.Dense(
      name="output_proj", units=num_audio_features, use_bias=True,
  )
  stop_token_projection_layer = tf.layers.Dense(
      name="stop_token_proj", units=1, use_bias=True,
  )

  prenet = None
  if self.params.get('enable_prenet', True):
    prenet = Prenet(
        self.params.get('prenet_units', 256),
        self.params.get('prenet_layers', 2),
        self.params.get("prenet_activation", tf.nn.relu),
        self.params["dtype"]
    )

  cell_params = {}
  cell_params["num_units"] = self.params['decoder_cell_units']
  decoder_cells = [
      single_cell(
          cell_class=self.params['decoder_cell_type'],
          cell_params=cell_params,
          zoneout_prob=self.params.get("zoneout_prob", 0.),
          dp_output_keep_prob=1. - self.params.get("dropout_prob", 0.1),
          training=training,
      ) for _ in range(self.params['decoder_layers'])
  ]

  if self.params['attention_type'] is not None:
    attention_mechanism = self._build_attention(
        encoder_outputs,
        enc_src_lengths,
        self.params.get("attention_bias", False)
    )

    attention_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

    attentive_cell = AttentionWrapper(
        cell=attention_cell,
        attention_mechanism=attention_mechanism,
        alignment_history=True,
        output_attention="both",
    )

    decoder_cell = attentive_cell

  if self.params['attention_type'] is None:
    decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

  if self._mode == "train":
    train_and_not_sampling = True
    helper = TacotronTrainingHelper(
        inputs=spec,
        sequence_length=spec_length,
        prenet=None,
        model_dtype=self.params["dtype"],
        mask_decoder_sequence=self.params.get("mask_decoder_sequence", True)
    )
  elif self._mode == "eval" or self._mode == "infer":
    train_and_not_sampling = False
    inputs = tf.zeros(
        (_batch_size, 1, num_audio_features), dtype=self.params["dtype"]
    )
    helper = TacotronHelper(
        inputs=inputs,
        prenet=None,
        mask_decoder_sequence=self.params.get("mask_decoder_sequence", True)
    )
  else:
    raise ValueError("Unknown mode for decoder: {}".format(self._mode))

  decoder = TacotronDecoder(
      decoder_cell=decoder_cell,
      helper=helper,
      initial_decoder_state=decoder_cell.zero_state(
          _batch_size, self.params["dtype"]
      ),
      attention_type=self.params["attention_type"],
      spec_layer=output_projection_layer,
      stop_token_layer=stop_token_projection_layer,
      prenet=prenet,
      dtype=self.params["dtype"],
      train=train_and_not_sampling
  )

  if self._mode == 'train':
    maximum_iterations = tf.reduce_max(spec_length)
  else:
    maximum_iterations = tf.reduce_max(enc_src_lengths) * 10

  outputs, final_state, sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
      # outputs, final_state, sequence_lengths, final_inputs = dynamic_decode(
      decoder=decoder,
      impute_finished=False,
      maximum_iterations=maximum_iterations,
      swap_memory=self.params.get("use_swap_memory", False),
      output_time_major=self.params.get("time_major", False),
      parallel_iterations=self.params.get("parallel_iterations", 32)
  )

  decoder_output = outputs.rnn_output
  stop_token_logits = outputs.stop_token_output

  with tf.variable_scope("decoder"):
    # If we are in train and doing sampling, we need to do the projections
    if train_and_not_sampling:
      decoder_spec_output = output_projection_layer(decoder_output)
      stop_token_logits = stop_token_projection_layer(decoder_spec_output)
      decoder_output = decoder_spec_output

  ## Add the post net ##
  if self.params.get('enable_postnet', True):
    dropout_keep_prob = self.params.get('postnet_keep_dropout_prob', 0.5)

    top_layer = decoder_output
    for i, conv_params in enumerate(self.params['postnet_conv_layers']):
      ch_out = conv_params['num_channels']
      kernel_size = conv_params['kernel_size']  # [time, freq]
      strides = conv_params['stride']
      padding = conv_params['padding']
      activation_fn = conv_params['activation_fn']

      if ch_out == -1:
        if self._both:
          ch_out = self._n_feats["mel"]
        else:
          ch_out = self._n_feats

      top_layer = conv_bn_actv(
          layer_type="conv1d",
          name="conv{}".format(i + 1),
          inputs=top_layer,
          filters=ch_out,
          kernel_size=kernel_size,
          activation_fn=activation_fn,
          strides=strides,
          padding=padding,
          regularizer=regularizer,
          training=training,
          data_format=self.params.get('postnet_data_format', 'channels_last'),
          bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
          bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
      )
      top_layer = tf.layers.dropout(
          top_layer, rate=1. - dropout_keep_prob, training=training
      )

  else:
    top_layer = tf.zeros(
        [
            _batch_size, maximum_iterations,
            outputs.rnn_output.get_shape()[-1]
        ],
        dtype=self.params["dtype"]
    )

  if regularizer and training:
    vars_to_regularize = []
    vars_to_regularize += attentive_cell.trainable_variables
    vars_to_regularize += attention_mechanism.memory_layer.trainable_variables
    vars_to_regularize += output_projection_layer.trainable_variables
    vars_to_regularize += stop_token_projection_layer.trainable_variables

    for weights in vars_to_regularize:
      if "bias" not in weights.name:
        # print("Added regularizer to {}".format(weights.name))
        if weights.dtype.base_dtype == tf.float16:
          tf.add_to_collection(
              'REGULARIZATION_FUNCTIONS', (weights, regularizer)
          )
        else:
          tf.add_to_collection(
              ops.GraphKeys.REGULARIZATION_LOSSES, regularizer(weights)
          )

    if self.params.get('enable_prenet', True):
      prenet.add_regularization(regularizer)

  if self.params['attention_type'] is not None:
    alignments = tf.transpose(
        final_state.alignment_history.stack(), [1, 0, 2]
    )
  else:
    alignments = tf.zeros([_batch_size, _batch_size, _batch_size])

  spectrogram_prediction = decoder_output + top_layer

  if self._both:
    mag_spec_prediction = spectrogram_prediction
    mag_spec_prediction = conv_bn_actv(
        layer_type="conv1d",
        name="conv_0",
        inputs=mag_spec_prediction,
        filters=256,
        kernel_size=4,
        activation_fn=tf.nn.relu,
        strides=1,
        padding="SAME",
        regularizer=regularizer,
        training=training,
        data_format=self.params.get('postnet_data_format', 'channels_last'),
        bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
        bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
    )
    mag_spec_prediction = conv_bn_actv(
        layer_type="conv1d",
        name="conv_1",
        inputs=mag_spec_prediction,
        filters=512,
        kernel_size=4,
        activation_fn=tf.nn.relu,
        strides=1,
        padding="SAME",
        regularizer=regularizer,
        training=training,
        data_format=self.params.get('postnet_data_format', 'channels_last'),
        bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
        bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
    )
    if self._model.get_data_layer()._exp_mag:
      mag_spec_prediction = tf.exp(mag_spec_prediction)
    mag_spec_prediction = tf.layers.conv1d(
        mag_spec_prediction,
        self._n_feats["magnitude"],
        1,
        name="post_net_proj",
        use_bias=False,
    )
  else:
    mag_spec_prediction = tf.zeros([_batch_size, _batch_size, _batch_size])

  stop_token_prediction = tf.sigmoid(stop_token_logits)
  outputs = [
      decoder_output, spectrogram_prediction, alignments,
      stop_token_prediction, sequence_lengths, mag_spec_prediction
  ]

  return {
      'outputs': outputs,
      'stop_token_prediction': stop_token_logits,
  }
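
# A minimal, hypothetical sketch of the decoder-side parameters _decode()
# reads above (it assumes `tf` is the TF 1.x module already imported at the
# top of this file). The layer sizes and activations are illustrative
# assumptions, not the shipped Tacotron-2 configuration. Note that
# 'postnet_conv_layers' is required whenever 'enable_postnet' is True, and
# num_channels=-1 makes the last conv layer fall back to the number of
# output audio features (see the ch_out == -1 branch above).
_example_tacotron2_decoder_params = {
    'attention_type': 'location',
    'decoder_cell_type': tf.nn.rnn_cell.LSTMCell,
    'decoder_cell_units': 1024,
    'decoder_layers': 2,
    'enable_prenet': True,
    'prenet_layers': 2,
    'prenet_units': 256,
    'enable_postnet': True,
    'postnet_keep_dropout_prob': 0.5,
    'postnet_conv_layers': [
        {'kernel_size': [5], 'stride': [1], 'num_channels': 512,
         'padding': 'SAME', 'activation_fn': tf.nn.tanh},
        {'kernel_size': [5], 'stride': [1], 'num_channels': -1,
         'padding': 'SAME', 'activation_fn': None},
    ],
    'dtype': tf.float32,
}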
def _encode(self, input_dict):
  """Creates TensorFlow graph for Tacotron-2 like encoder.

  Args:
    input_dict (dict): dictionary with inputs. Must define:
      source_tensors - array containing [

        * source_sequence: tensor of shape [batch_size, sequence length]
        * src_length: tensor of shape [batch_size]

      ]

  Returns:
    dict: A python dictionary containing:

      * outputs - tensor containing the encoded text to be passed to the
        attention layer
      * src_length - the length of the encoded text
  """
  text = input_dict['source_tensors'][0]
  text_len = input_dict['source_tensors'][1]

  training = (self._mode == "train")
  regularizer = self.params.get('regularizer', None)
  data_format = self.params.get('data_format', 'channels_last')
  src_vocab_size = self._model.get_data_layer().params['src_vocab_size']
  zoneout_prob = self.params.get('zoneout_prob', 0.)

  # if src_vocab_size % 8 != 0:
  #   src_vocab_size += 8 - (src_vocab_size % 8)

  # ----- Embedding layer -----------------------------------------------
  enc_emb_w = tf.get_variable(
      name="EncoderEmbeddingMatrix",
      shape=[src_vocab_size, self.params['src_emb_size']],
      dtype=self.params['dtype'],
      # initializer=tf.random_normal_initializer()
  )

  embedded_inputs = tf.cast(
      tf.nn.embedding_lookup(enc_emb_w, text),
      self.params['dtype']
  )

  # ----- Convolutional layers ------------------------------------------
  input_layer = embedded_inputs

  if data_format == 'channels_last':
    top_layer = input_layer
  else:
    top_layer = tf.transpose(input_layer, [0, 2, 1])

  for i, conv_params in enumerate(self.params['conv_layers']):
    ch_out = conv_params['num_channels']
    kernel_size = conv_params['kernel_size']  # [time, freq]
    strides = conv_params['stride']
    padding = conv_params['padding']

    if padding == "VALID":
      text_len = (text_len - kernel_size[0] + strides[0]) // strides[0]
    else:
      text_len = (text_len + strides[0] - 1) // strides[0]

    top_layer = conv_bn_actv(
        layer_type="conv1d",
        name="conv{}".format(i + 1),
        inputs=top_layer,
        filters=ch_out,
        kernel_size=kernel_size,
        activation_fn=self.params['activation_fn'],
        strides=strides,
        padding=padding,
        regularizer=regularizer,
        training=training,
        data_format=data_format,
        bn_momentum=self.params.get('bn_momentum', 0.1),
        bn_epsilon=self.params.get('bn_epsilon', 1e-5),
    )
    top_layer = tf.layers.dropout(
        top_layer, rate=self.params["cnn_dropout_prob"], training=training
    )

  if data_format == 'channels_first':
    top_layer = tf.transpose(top_layer, [0, 2, 1])

  # ----- RNN ------------------------------------------------------------
  num_rnn_layers = self.params['num_rnn_layers']
  if num_rnn_layers > 0:
    cell_params = {}
    cell_params["num_units"] = self.params['rnn_cell_dim']
    rnn_type = self.params['rnn_type']
    rnn_input = top_layer
    rnn_vars = []

    if self.params["use_cudnn_rnn"]:
      if self._mode == "infer":
        cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
            cell_params["num_units"]
        )
        cells_fw = [cell() for _ in range(1)]
        cells_bw = [cell() for _ in range(1)]
        (top_layer, _, _) = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
            cells_fw,
            cells_bw,
            rnn_input,
            sequence_length=text_len,
            dtype=rnn_input.dtype,
            time_major=False
        )
      else:
        all_cudnn_classes = [
            i[1] for i in inspect.getmembers(
                tf.contrib.cudnn_rnn, inspect.isclass
            )
        ]
        if not rnn_type in all_cudnn_classes:
          raise TypeError("rnn_type must be a Cudnn RNN class")
        if zoneout_prob != 0.:
          raise ValueError(
              "Zoneout is currently not supported for cudnn rnn classes"
          )

        rnn_input = tf.transpose(top_layer, [1, 0, 2])
        if self.params['rnn_unidirectional']:
          direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
        else:
          direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION

        rnn_block = rnn_type(
            num_layers=num_rnn_layers,
            num_units=cell_params["num_units"],
            direction=direction,
            dtype=rnn_input.dtype,
            name="cudnn_rnn"
        )
        rnn_block.build(rnn_input.get_shape())
        top_layer, _ = rnn_block(rnn_input)
        top_layer = tf.transpose(top_layer, [1, 0, 2])
        rnn_vars += rnn_block.trainable_variables
    else:
      multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell([
          single_cell(
              cell_class=rnn_type,
              cell_params=cell_params,
              zoneout_prob=zoneout_prob,
              training=training,
              residual_connections=False
          ) for _ in range(num_rnn_layers)
      ])
      rnn_vars += multirnn_cell_fw.trainable_variables
      if self.params['rnn_unidirectional']:
        top_layer, _ = tf.nn.dynamic_rnn(
            cell=multirnn_cell_fw,
            inputs=rnn_input,
            sequence_length=text_len,
            dtype=rnn_input.dtype,
            time_major=False,
        )
      else:
        multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell([
            single_cell(
                cell_class=rnn_type,
                cell_params=cell_params,
                zoneout_prob=zoneout_prob,
                training=training,
                residual_connections=False
            ) for _ in range(num_rnn_layers)
        ])
        top_layer, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=multirnn_cell_fw,
            cell_bw=multirnn_cell_bw,
            inputs=rnn_input,
            sequence_length=text_len,
            dtype=rnn_input.dtype,
            time_major=False
        )
        # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
        top_layer = tf.concat(top_layer, 2)
        rnn_vars += multirnn_cell_bw.trainable_variables

    if regularizer and training:
      cell_weights = []
      cell_weights += rnn_vars
      cell_weights += [enc_emb_w]

      for weights in cell_weights:
        if "bias" not in weights.name:
          # print("Added regularizer to {}".format(weights.name))
          if weights.dtype.base_dtype == tf.float16:
            tf.add_to_collection(
                'REGULARIZATION_FUNCTIONS', (weights, regularizer)
            )
          else:
            tf.add_to_collection(
                ops.GraphKeys.REGULARIZATION_LOSSES, regularizer(weights)
            )

  # -- end of rnn --------------------------------------------------------
  top_layer = tf.layers.dropout(
      top_layer, rate=self.params["rnn_dropout_prob"], training=training
  )
  outputs = top_layer

  return {'outputs': outputs, 'src_length': text_len}
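
# Quick, self-contained check of the sequence-length bookkeeping used in the
# convolution loop above (the numbers are assumed examples): "VALID" padding
# shortens a 100-step input with kernel_size=[5], stride=[1] to
# (100 - 5 + 1) // 1 = 96 steps, while "SAME" padding keeps
# ceil(100 / 1) = 100 steps.
def _example_conv_output_len(text_len, kernel_size, strides, padding):
  if padding == "VALID":
    return (text_len - kernel_size[0] + strides[0]) // strides[0]
  return (text_len + strides[0] - 1) // strides[0]

assert _example_conv_output_len(100, [5], [1], "VALID") == 96
assert _example_conv_output_len(100, [5], [1], "SAME") == 100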
def _encode(self, input_dict):
  """Encodes data into representation.

  Args:
    input_dict: a Python dictionary. Must define:
      * src_inputs - a Tensor of shape [batch_size, time] or
        [time, batch_size] (depending on time_major param)
      * src_lengths - a Tensor of shape [batch_size]

  Returns:
    a Python dictionary with:
      * encoder_outputs - a Tensor of shape
        [batch_size, time, representation_dim] or
        [time, batch_size, representation_dim]
      * encoder_state - a Tensor of shape [batch_size, dim]
      * src_lengths - (copy ref from input) a Tensor of shape [batch_size]
  """
  # TODO: make a separate level of config for cell_params?
  source_sequence = input_dict['source_tensors'][0]
  source_length = input_dict['source_tensors'][1]

  self._enc_emb_w = tf.get_variable(
      name="EncoderEmbeddingMatrix",
      shape=[self._src_vocab_size, self._src_emb_size],
      dtype=tf.float32,
  )

  if self._mode == "train":
    dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
    dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
  else:
    dp_input_keep_prob = 1.0
    dp_output_keep_prob = 1.0

  fwd_cells = [
      single_cell(
          cell_class=self.params['core_cell'],
          cell_params=self.params.get('core_cell_params', {}),
          dp_input_keep_prob=dp_input_keep_prob,
          dp_output_keep_prob=dp_output_keep_prob,
          residual_connections=self.params['encoder_use_skip_connections']
      ) for _ in range(self.params['encoder_layers'])
  ]
  # pylint: disable=no-member
  self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)

  time_major = self.params.get("time_major", False)
  use_swap_memory = self.params.get("use_swap_memory", False)

  embedded_inputs = tf.cast(
      tf.nn.embedding_lookup(self.enc_emb_w, source_sequence),
      self.params['dtype'],
  )

  encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
      cell=self._encoder_cell_fw,
      inputs=embedded_inputs,
      sequence_length=source_length,
      time_major=time_major,
      swap_memory=use_swap_memory,
      dtype=embedded_inputs.dtype,
  )

  return {
      'outputs': encoder_outputs,
      'state': encoder_state,
      'src_lengths': source_length,
      'encoder_input': source_sequence
  }
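
# Hypothetical parameter dictionary for the unidirectional encoder above,
# assuming `tf` is the TF 1.x module imported at the top of this file. The
# cell class and sizes are illustrative assumptions only; the keys mirror
# the params accessed in _encode().
_example_rnn_encoder_params = {
    'core_cell': tf.nn.rnn_cell.LSTMCell,
    'core_cell_params': {'num_units': 512},
    'encoder_layers': 2,
    'encoder_dp_input_keep_prob': 0.8,
    'encoder_dp_output_keep_prob': 0.8,
    'encoder_use_skip_connections': False,
    'time_major': False,
    'use_swap_memory': False,
    'dtype': tf.float32,
}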
def _encode(self, input_dict):
  source_sequence = input_dict['source_tensors'][0]
  source_length = input_dict['source_tensors'][1]

  self._enc_emb_w = tf.get_variable(
      name="EncoderEmbeddingMatrix",
      shape=[self._src_vocab_size, self._src_emb_size],
      dtype=tf.float32,
  )

  if self.params['encoder_layers'] < 2:
    raise ValueError("GNMT encoder must have at least 2 layers")

  with tf.variable_scope("Level1FW"):
    self._encoder_l1_cell_fw = single_cell(
        cell_class=self.params['core_cell'],
        cell_params=self.params.get('core_cell_params', {}),
        dp_input_keep_prob=1.0,
        dp_output_keep_prob=1.0,
        residual_connections=False,
    )
  with tf.variable_scope("Level1BW"):
    self._encoder_l1_cell_bw = single_cell(
        cell_class=self.params['core_cell'],
        cell_params=self.params.get('core_cell_params', {}),
        dp_input_keep_prob=1.0,
        dp_output_keep_prob=1.0,
        residual_connections=False,
    )

  if self._mode == "train":
    dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
    dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
  else:
    dp_input_keep_prob = 1.0
    dp_output_keep_prob = 1.0

  with tf.variable_scope("UniDirLevel"):
    self._encoder_cells = [
        single_cell(
            cell_class=self.params['core_cell'],
            cell_params=self.params.get('core_cell_params', {}),
            dp_input_keep_prob=dp_input_keep_prob,
            dp_output_keep_prob=dp_output_keep_prob,
            residual_connections=False,
        ) for _ in range(self.params['encoder_layers'] - 1)
    ]
    # add residual connections starting from the third layer
    for idx, cell in enumerate(self._encoder_cells):
      if idx > 0:
        # pylint: disable=no-member
        self._encoder_cells[idx] = tf.contrib.rnn.ResidualWrapper(cell)

  time_major = self.params.get("time_major", False)
  use_swap_memory = self.params.get("use_swap_memory", False)

  embedded_inputs = tf.cast(
      tf.nn.embedding_lookup(self.enc_emb_w, source_sequence),
      self.params['dtype'],
  )

  # first bi-directional layer
  _encoder_output, _ = tf.nn.bidirectional_dynamic_rnn(
      cell_fw=self._encoder_l1_cell_fw,
      cell_bw=self._encoder_l1_cell_bw,
      inputs=embedded_inputs,
      sequence_length=source_length,
      swap_memory=use_swap_memory,
      time_major=time_major,
      dtype=embedded_inputs.dtype,
  )
  encoder_l1_outputs = tf.concat(_encoder_output, 2)

  # stack of unidirectional layers
  # pylint: disable=no-member
  encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
      cell=tf.contrib.rnn.MultiRNNCell(self._encoder_cells),
      inputs=encoder_l1_outputs,
      sequence_length=source_length,
      swap_memory=use_swap_memory,
      time_major=time_major,
      dtype=encoder_l1_outputs.dtype,
  )

  return {
      'outputs': encoder_outputs,
      'state': encoder_state,
      'src_lengths': source_length,
      'encoder_input': source_sequence
  }
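
# Minimal sketch of the residual-wrapping pattern used in the GNMT-style
# encoder above, with an assumed stack of three unidirectional cells (four
# layers overall once the bidirectional first layer is counted). Only the
# second and later unidirectional cells are wrapped, which is why residual
# connections start from the third layer. Assumes the TF 1.x `tf` import at
# the top of this file; the cell size is an assumption.
_example_cells = [tf.nn.rnn_cell.LSTMCell(512) for _ in range(3)]
_example_cells = [
    tf.contrib.rnn.ResidualWrapper(cell) if idx > 0 else cell
    for idx, cell in enumerate(_example_cells)
]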
def _encode(self, input_dict):
  """
  Encodes data into representation

  :param input_dict: a Python dictionary. Must define:
    * src_inputs - a Tensor of shape [batch_size, time] or
      [time, batch_size] (depending on time_major param)
    * src_lengths - a Tensor of shape [batch_size]

  :return: a Python dictionary with:
    * encoder_outputs - a Tensor of shape
      [batch_size, time, representation_dim] or
      [time, batch_size, representation_dim]
    * encoder_state - a Tensor of shape [batch_size, dim]
    * src_lengths - (copy ref from input) a Tensor of shape [batch_size]
  """
  time_major = self.params.get("time_major", False)
  use_swap_memory = self.params.get("use_swap_memory", False)
  regularizer = self.params.get('regularizer', None)
  fc_use_bias = self.params.get('fc_use_bias', True)

  use_cudnn_rnn = self.params.get("use_cudnn_rnn", False)
  cudnn_rnn_type = self.params.get("cudnn_rnn_type", None)

  if 'initializer' in self.params:
    init_dict = self.params.get('initializer_params', {})
    initializer = self.params['initializer'](**init_dict)
  else:
    initializer = None

  if self._mode == "train":
    dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
    dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
    last_input_keep_prob = self.params['encoder_last_input_keep_prob']
    last_output_keep_prob = self.params['encoder_last_output_keep_prob']
    emb_keep_prob = self.params['encoder_emb_keep_prob']
    recurrent_keep_prob = self.params['recurrent_keep_prob']
    input_weight_keep_prob = self.params['input_weight_keep_prob']
    recurrent_weight_keep_prob = self.params['recurrent_weight_keep_prob']
  else:
    dp_input_keep_prob, dp_output_keep_prob = 1.0, 1.0
    last_input_keep_prob, last_output_keep_prob = 1.0, 1.0
    emb_keep_prob, recurrent_keep_prob = 1.0, 1.0
    input_weight_keep_prob, recurrent_weight_keep_prob = 1.0, 1.0

  self._output_layer = tf.layers.Dense(
      self._fc_dim,
      kernel_regularizer=regularizer,
      kernel_initializer=initializer,
      use_bias=fc_use_bias,
      dtype=self._params['dtype']
  )

  if self._weight_tied:
    last_cell_params = copy.deepcopy(self.params['core_cell_params'])
    last_cell_params['num_units'] = self._emb_size
  else:
    last_cell_params = self.params['core_cell_params']

  last_output_dim = last_cell_params['num_units']

  if self._use_cell_state:
    last_output_dim = 2 * last_output_dim

  fake_input = tf.zeros(
      shape=(1, last_output_dim), dtype=self._params['dtype']
  )
  fake_output = self._output_layer.apply(fake_input)
  with tf.variable_scope("dense", reuse=True):
    dense_weights = tf.get_variable("kernel")
    dense_biases = tf.get_variable("bias")

  if self._weight_tied and self._lm_phase:
    enc_emb_w = tf.transpose(dense_weights)
  else:
    enc_emb_w = tf.get_variable(
        name="EncoderEmbeddingMatrix",
        shape=[self._vocab_size, self._emb_size],
        dtype=self._params['dtype']
    )

  self._enc_emb_w = tf.nn.dropout(enc_emb_w, keep_prob=emb_keep_prob)

  if use_cudnn_rnn:
    if self._mode == 'train' or self._mode == 'eval':
      all_cudnn_classes = [
          i[1] for i in inspect.getmembers(
              tf.contrib.cudnn_rnn, inspect.isclass
          )
      ]
      if not cudnn_rnn_type in all_cudnn_classes:
        raise TypeError("rnn_type must be a Cudnn RNN class")
      rnn_block = cudnn_rnn_type(
          num_layers=self.params['encoder_layers'],
          num_units=self._emb_size,
          dtype=self._params['dtype'],
          name="cudnn_rnn"
      )
    else:
      # Transferring weights from model trained with CudnnLSTM/CudnnGRU
      # to CudnnCompatibleLSTMCell/CudnnCompatibleGRUCell for inference
      if 'CudnnLSTM' in str(cudnn_rnn_type):
        cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
            num_units=self._emb_size
        )
      elif 'CudnnGRU' in str(cudnn_rnn_type):
        cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(
            num_units=self._emb_size
        )
      fwd_cells = [cell() for _ in range(self.params['encoder_layers'])]
      self._encoder_cell_fw = tf.nn.rnn_cell.MultiRNNCell(fwd_cells)
  else:
    fwd_cells = [
        single_cell(
            cell_class=self.params['core_cell'],
            cell_params=self.params['core_cell_params'],
            dp_input_keep_prob=dp_input_keep_prob,
            dp_output_keep_prob=dp_output_keep_prob,
            recurrent_keep_prob=recurrent_keep_prob,
            input_weight_keep_prob=input_weight_keep_prob,
            recurrent_weight_keep_prob=recurrent_weight_keep_prob,
            weight_variational=self.params['weight_variational'],
            dropout_seed=self.params['dropout_seed'],
            residual_connections=self.params['encoder_use_skip_connections'],
            awd_initializer=self.params['awd_initializer'],
            dtype=self._params['dtype']
        ) for _ in range(self.params['encoder_layers'] - 1)
    ]
    fwd_cells.append(
        single_cell(
            cell_class=self.params['core_cell'],
            cell_params=last_cell_params,
            dp_input_keep_prob=last_input_keep_prob,
            dp_output_keep_prob=last_output_keep_prob,
            recurrent_keep_prob=recurrent_keep_prob,
            input_weight_keep_prob=input_weight_keep_prob,
            recurrent_weight_keep_prob=recurrent_weight_keep_prob,
            weight_variational=self.params['weight_variational'],
            dropout_seed=self.params['dropout_seed'],
            residual_connections=self.params['encoder_use_skip_connections'],
            awd_initializer=self.params['awd_initializer'],
            dtype=self._params['dtype']
        )
    )
    self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)

  time_major = self.params.get("time_major", False)
  use_swap_memory = self.params.get("use_swap_memory", False)

  source_sequence = input_dict['source_tensors'][0]
  source_length = input_dict['source_tensors'][1]

  # Inference for language modeling requires a different graph
  if (not self._lm_phase) or self._mode == 'train' or self._mode == 'eval':
    embedded_inputs = tf.cast(
        tf.nn.embedding_lookup(self.enc_emb_w, source_sequence),
        self.params['dtype']
    )

    if use_cudnn_rnn:
      # The CudnnLSTM will return encoder_state as a tuple of hidden and
      # cell values. The hidden and cell tensors are stored for each
      # LSTM layer.

      # reshape from [B, T, C] --> [T, B, C]
      if time_major == False:
        embedded_inputs = tf.transpose(embedded_inputs, [1, 0, 2])

      rnn_block.build(embedded_inputs.get_shape())
      encoder_outputs, encoder_state = rnn_block(embedded_inputs)
      encoder_outputs = tf.transpose(encoder_outputs, [1, 0, 2])
    else:
      encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
          cell=self._encoder_cell_fw,
          inputs=embedded_inputs,
          sequence_length=source_length,
          time_major=time_major,
          swap_memory=use_swap_memory,
          dtype=self._params['dtype'],
          scope='decoder',
      )

    if not self._lm_phase:
      # CudnnLSTM stores cell and hidden state differently
      if use_cudnn_rnn:
        if self._use_cell_state:
          encoder_outputs = tf.concat(
              [encoder_state[0][-1], encoder_state[1][-1]], axis=1
          )
        else:
          encoder_outputs = encoder_state[0][-1]
      else:
        if self._use_cell_state:
          encoder_outputs = tf.concat(
              [encoder_state[-1].h, encoder_state[-1].c], axis=1
          )
        else:
          encoder_outputs = encoder_state[-1].h

    if self._mode == 'train' and self._num_sampled < self._fc_dim:
      # sampled softmax
      output_dict = {
          'weights': enc_emb_w,
          'bias': dense_biases,
          'inputs': encoder_outputs,
          'logits': encoder_outputs,
          'outputs': [encoder_outputs],
          'num_sampled': self._num_sampled
      }
    else:
      # full softmax
      logits = self._output_layer.apply(encoder_outputs)
      output_dict = {'logits': logits, 'outputs': [logits]}
  else:
    # infer in LM phase
    # This portion of graph is required to restore weights from CudnnLSTM to
    # CudnnCompatibleLSTMCell/CudnnCompatibleGRUCell
    if use_cudnn_rnn:
      embedded_inputs = tf.cast(
          tf.nn.embedding_lookup(self.enc_emb_w, source_sequence),
          self.params['dtype']
      )

      # Scope must remain unset to restore weights
      encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
          cell=self._encoder_cell_fw,
          inputs=embedded_inputs,
          sequence_length=source_length,
          time_major=time_major,
          swap_memory=use_swap_memory,
          dtype=self._params['dtype']
      )

    embedding_fn = lambda ids: tf.cast(
        tf.nn.embedding_lookup(self.enc_emb_w, ids),
        self.params['dtype']
    )

    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=embedding_fn,  # self._dec_emb_w,
        start_tokens=tf.constant(self.params['seed_tokens']),
        end_token=self.params['end_token']
    )
    decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=self._encoder_cell_fw,
        helper=helper,
        initial_state=self._encoder_cell_fw.zero_state(
            batch_size=self._batch_size, dtype=self._params['dtype'],
        ),
        output_layer=self._output_layer,
    )
    maximum_iterations = tf.constant(self._num_tokens_gen)

    final_outputs, final_state, final_sequence_lengths = \
        tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            impute_finished=False,
            maximum_iterations=maximum_iterations,
            swap_memory=use_swap_memory,
            output_time_major=time_major,
        )

    output_dict = {
        'logits': final_outputs.rnn_output,
        'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)],
        'final_state': final_state,
        'final_sequence_lengths': final_sequence_lengths
    }

  return output_dict
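
# Toy, NumPy-only illustration of the weight tying used above (all sizes are
# assumptions): with weight tying the last RNN layer is resized to emb_size,
# so the Dense output layer's kernel has shape [emb_size, fc_dim], and its
# transpose is reused as the [fc_dim, emb_size] embedding matrix during the
# LM phase (for a language model, fc_dim is typically the vocabulary size).
import numpy as np

_emb_size, _fc_dim = 256, 10000                  # assumed sizes
_dense_kernel = np.zeros((_emb_size, _fc_dim), dtype=np.float32)
_tied_embedding = _dense_kernel.T                # reuse the same weights
assert _tied_embedding.shape == (_fc_dim, _emb_size)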
def _embed_style(self, style_spec, style_len):
  """
  Code that implements the reference encoder as described in "Towards
  end-to-end prosody transfer for expressive speech synthesis with Tacotron",
  and "Style Tokens: Unsupervised Style Modeling, Control and Transfer in
  End-to-End Speech Synthesis"

  Config parameters:

  * **conv_layers** (list) --- See the conv_layers parameter for the
    Tacotron-2 model.
  * **num_rnn_layers** (int) --- Number of rnn layers in the reference encoder
  * **rnn_cell_dim** (int) --- Size of rnn layer
  * **rnn_unidirectional** (bool) --- Uni- or bi-directional rnn.
  * **rnn_type** --- Must be a valid tf rnn cell class
  * **emb_size** (int) --- Size of gst
  * **attention_layer_size** (int) --- Size of linear layers in attention
  * **num_tokens** (int) --- Number of tokens for gst
  * **num_heads** (int) --- Number of attention heads
  """
  training = (self._mode == "train")
  regularizer = self.params.get('regularizer', None)
  data_format = self.params.get('data_format', 'channels_last')
  batch_size = style_spec.get_shape().as_list()[0]

  top_layer = tf.expand_dims(style_spec, -1)

  params = self.params['style_embedding_params']
  if "conv_layers" in params:
    for i, conv_params in enumerate(params['conv_layers']):
      ch_out = conv_params['num_channels']
      kernel_size = conv_params['kernel_size']  # [time, freq]
      strides = conv_params['stride']
      padding = conv_params['padding']

      if padding == "VALID":
        style_len = (style_len - kernel_size[0] + strides[0]) // strides[0]
      else:
        style_len = (style_len + strides[0] - 1) // strides[0]

      top_layer = conv_bn_actv(
          layer_type="conv2d",
          name="conv{}".format(i + 1),
          inputs=top_layer,
          filters=ch_out,
          kernel_size=kernel_size,
          activation_fn=self.params['activation_fn'],
          strides=strides,
          padding=padding,
          regularizer=regularizer,
          training=training,
          data_format=data_format,
          bn_momentum=self.params.get('bn_momentum', 0.1),
          bn_epsilon=self.params.get('bn_epsilon', 1e-5),
      )

    if data_format == 'channels_first':
      top_layer = tf.transpose(top_layer, [0, 2, 1])

    top_layer = tf.concat(tf.unstack(top_layer, axis=2), axis=-1)

  num_rnn_layers = params['num_rnn_layers']
  if num_rnn_layers > 0:
    cell_params = {}
    cell_params["num_units"] = params['rnn_cell_dim']
    rnn_type = params['rnn_type']
    rnn_input = top_layer
    rnn_vars = []

    multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell([
        single_cell(
            cell_class=rnn_type,
            cell_params=cell_params,
            training=training,
            residual_connections=False
        ) for _ in range(num_rnn_layers)
    ])
    rnn_vars += multirnn_cell_fw.trainable_variables
    if params['rnn_unidirectional']:
      top_layer, final_state = tf.nn.dynamic_rnn(
          cell=multirnn_cell_fw,
          inputs=rnn_input,
          sequence_length=style_len,
          dtype=rnn_input.dtype,
          time_major=False,
      )
      final_state = final_state[0]
    else:
      multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell([
          single_cell(
              cell_class=rnn_type,
              cell_params=cell_params,
              training=training,
              residual_connections=False
          ) for _ in range(num_rnn_layers)
      ])
      top_layer, final_state = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=multirnn_cell_fw,
          cell_bw=multirnn_cell_bw,
          inputs=rnn_input,
          sequence_length=style_len,
          dtype=rnn_input.dtype,
          time_major=False
      )
      # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
      final_state = tf.concat(
          (final_state[0][0].h, final_state[1][0].h), 1
      )
      rnn_vars += multirnn_cell_bw.trainable_variables

    top_layer = final_state

    # Apply linear layer
    top_layer = tf.layers.dense(
        top_layer,
        128,
        activation=tf.nn.tanh,
        kernel_regularizer=regularizer,
        name="reference_activation"
    )

    if regularizer and training:
      cell_weights = rnn_vars
      for weights in cell_weights:
        if "bias" not in weights.name:
          # print("Added regularizer to {}".format(weights.name))
          if weights.dtype.base_dtype == tf.float16:
            tf.add_to_collection(
                'REGULARIZATION_FUNCTIONS', (weights, regularizer)
            )
          else:
            tf.add_to_collection(
                ops.GraphKeys.REGULARIZATION_LOSSES, regularizer(weights)
            )

  num_units = params["num_tokens"]
  att_size = params["attention_layer_size"]

  # Randomly initialized tokens
  gst_embedding = tf.get_variable(
      "token_embeddings",
      shape=[num_units, params["emb_size"]],
      dtype=self.params["dtype"],
      initializer=tf.random_uniform_initializer(
          minval=-1., maxval=1., dtype=self.params["dtype"]
      ),
      trainable=False
  )

  attention = attention_layer.Attention(
      params["attention_layer_size"], params["num_heads"],
      0., training, mode="bahdanau"
  )
  top_layer = tf.expand_dims(top_layer, 1)
  gst_embedding = tf.nn.tanh(gst_embedding)
  gst_embedding = tf.expand_dims(gst_embedding, 0)
  gst_embedding = tf.tile(gst_embedding, [batch_size, 1, 1])
  token_embeddings = attention(top_layer, gst_embedding, None)
  token_embeddings = tf.squeeze(token_embeddings, 1)

  return token_embeddings