Example #1
  def _decode(self, input_dict):
    """
    Decodes representation into data

    Args:
      input_dict (dict): Python dictionary with inputs to decoder. Must define:
          * encoder_output - dictionary produced by the encoder containing:
              * outputs - encoder output Tensor of shape
                [batch_size, time, dim] or [time, batch_size, dim]
              * src_length - encoder output lengths Tensor of shape
                [batch_size]
          * target_tensors - Only during training. List containing the target
            spectrogram Tensor of shape [batch_size, time, num_features] or
            [time, batch_size, num_features], the stop token labels Tensor of
            shape [batch_size, time, 1] or [time, batch_size, 1], and the
            target lengths Tensor of shape [batch_size]

    Returns:
      dict:
        A python dictionary containing:

          * outputs - list containing:

              * decoder_output - tensor of shape [batch_size, time,
                num_features] or [time, batch_size, num_features]. Spectrogram
                representation learned by the decoder rnn
              * spectrogram_prediction - tensor of shape [batch_size, time,
                num_features] or [time, batch_size, num_features]. Spectrogram
                containing the residual corrections from the postnet if enabled
              * alignments - tensor of shape [batch_size, time, memory_size]
                or [time, batch_size, memory_size]. The alignments learned by
                the attention layer
              * stop_token_prediction - tensor of shape [batch_size, time, 1]
                or [time, batch_size, 1]. The stop token predictions
              * final_sequence_lengths - tensor of shape [batch_size]
          * stop_token_prediction - tensor of shape [batch_size, time, 1]
            or [time, batch_size, 1]. The raw stop token logits, used inside
            the loss function.
    """
    encoder_outputs = input_dict['encoder_output']['outputs']
    enc_src_lengths = input_dict['encoder_output']['src_length']
    if self._mode == "train":
      spec = input_dict['target_tensors'][0] if 'target_tensors' in \
                                                    input_dict else None
      spec_length = input_dict['target_tensors'][2] if 'target_tensors' in \
                                                    input_dict else None
    _batch_size = encoder_outputs.get_shape().as_list()[0]

    training = (self._mode == "train")
    regularizer = self.params.get('regularizer', None)

    if self.params.get('enable_postnet', True):
      if "postnet_conv_layers" not in self.params:
        raise ValueError(
            "postnet_conv_layers must be passed from config file if postnet is "
            "enabled"
        )

    if self._both:
      num_audio_features = self._n_feats["mel"]
      if self._mode == "train":
        spec, _ = tf.split(
            spec,
            [self._n_feats['mel'], self._n_feats['magnitude']],
            axis=2
        )
    else:
      num_audio_features = self._n_feats

    output_projection_layer = tf.layers.Dense(
        name="output_proj",
        units=num_audio_features,
        use_bias=True,
    )
    stop_token_projection_layer = tf.layers.Dense(
        name="stop_token_proj",
        units=1,
        use_bias=True,
    )

    prenet = None
    if self.params.get('enable_prenet', True):
      prenet = Prenet(
          self.params.get('prenet_units', 256),
          self.params.get('prenet_layers', 2),
          self.params.get("prenet_activation", tf.nn.relu),
          self.params["dtype"]
      )

    cell_params = {}
    cell_params["num_units"] = self.params['decoder_cell_units']
    decoder_cells = [
        single_cell(
            cell_class=self.params['decoder_cell_type'],
            cell_params=cell_params,
            zoneout_prob=self.params.get("zoneout_prob", 0.),
            dp_output_keep_prob=1.-self.params.get("dropout_prob", 0.1),
            training=training,
        ) for _ in range(self.params['decoder_layers'])
    ]

    if self.params['attention_type'] is not None:
      attention_mechanism = self._build_attention(
          encoder_outputs, enc_src_lengths,
          self.params.get("attention_bias", False)
      )

      attention_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

      attentive_cell = AttentionWrapper(
          cell=attention_cell,
          attention_mechanism=attention_mechanism,
          alignment_history=True,
          output_attention="both",
      )

      decoder_cell = attentive_cell
    else:
      decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

    if self._mode == "train":
      train_and_not_sampling = True
      helper = TacotronTrainingHelper(
          inputs=spec,
          sequence_length=spec_length,
          prenet=None,
          model_dtype=self.params["dtype"],
          mask_decoder_sequence=self.params.get("mask_decoder_sequence", True)
      )
    elif self._mode == "eval" or self._mode == "infer":
      train_and_not_sampling = False
      inputs = tf.zeros(
          (_batch_size, 1, num_audio_features), dtype=self.params["dtype"]
      )
      helper = TacotronHelper(
          inputs=inputs,
          prenet=None,
          mask_decoder_sequence=self.params.get("mask_decoder_sequence", True)
      )
    else:
      raise ValueError("Unknown mode for decoder: {}".format(self._mode))
    decoder = TacotronDecoder(
        decoder_cell=decoder_cell,
        helper=helper,
        initial_decoder_state=decoder_cell.zero_state(
            _batch_size, self.params["dtype"]
        ),
        attention_type=self.params["attention_type"],
        spec_layer=output_projection_layer,
        stop_token_layer=stop_token_projection_layer,
        prenet=prenet,
        dtype=self.params["dtype"],
        train=train_and_not_sampling
    )

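    # Cap the number of decoder steps: the target length during training, and
    # a generous 10x multiple of the encoder length at inference.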
    if self._mode == 'train':
      maximum_iterations = tf.reduce_max(spec_length)
    else:
      maximum_iterations = tf.reduce_max(enc_src_lengths) * 10

    outputs, final_state, sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder,
        impute_finished=False,
        maximum_iterations=maximum_iterations,
        swap_memory=self.params.get("use_swap_memory", False),
        output_time_major=self.params.get("time_major", False),
        parallel_iterations=self.params.get("parallel_iterations", 32)
    )

    decoder_output = outputs.rnn_output
    stop_token_logits = outputs.stop_token_output

    with tf.variable_scope("decoder"):
      # In train mode the decoder skips the per-step projections, so apply
      # them here
      if train_and_not_sampling:
        decoder_spec_output = output_projection_layer(decoder_output)
        stop_token_logits = stop_token_projection_layer(decoder_spec_output)
        decoder_output = decoder_spec_output

    ## Add the post net ##
    if self.params.get('enable_postnet', True):
      dropout_keep_prob = self.params.get('postnet_keep_dropout_prob', 0.5)

      top_layer = decoder_output
      for i, conv_params in enumerate(self.params['postnet_conv_layers']):
        ch_out = conv_params['num_channels']
        kernel_size = conv_params['kernel_size']  # 1-D conv: [time]
        strides = conv_params['stride']
        padding = conv_params['padding']
        activation_fn = conv_params['activation_fn']

        if ch_out == -1:
          if self._both:
            ch_out = self._n_feats["mel"]
          else:
            ch_out = self._n_feats

        top_layer = conv_bn_actv(
            layer_type="conv1d",
            name="conv{}".format(i + 1),
            inputs=top_layer,
            filters=ch_out,
            kernel_size=kernel_size,
            activation_fn=activation_fn,
            strides=strides,
            padding=padding,
            regularizer=regularizer,
            training=training,
            data_format=self.params.get('postnet_data_format', 'channels_last'),
            bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
            bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
        )
        top_layer = tf.layers.dropout(
            top_layer, rate=1. - dropout_keep_prob, training=training
        )

    else:
      top_layer = tf.zeros(
          [
              _batch_size, maximum_iterations,
              outputs.rnn_output.get_shape()[-1]
          ],
          dtype=self.params["dtype"]
      )

    if regularizer and training:
      vars_to_regularize = []
      # attention variables exist only when an attention mechanism was built
      if self.params['attention_type'] is not None:
        vars_to_regularize += attentive_cell.trainable_variables
        vars_to_regularize += attention_mechanism.memory_layer.trainable_variables
      vars_to_regularize += output_projection_layer.trainable_variables
      vars_to_regularize += stop_token_projection_layer.trainable_variables

      for weights in vars_to_regularize:
        if "bias" not in weights.name:
          # print("Added regularizer to {}".format(weights.name))
          if weights.dtype.base_dtype == tf.float16:
            tf.add_to_collection(
                'REGULARIZATION_FUNCTIONS', (weights, regularizer)
            )
          else:
            tf.add_to_collection(
                ops.GraphKeys.REGULARIZATION_LOSSES, regularizer(weights)
            )

      if self.params.get('enable_prenet', True):
        prenet.add_regularization(regularizer)

    if self.params['attention_type'] is not None:
      alignments = tf.transpose(
          final_state.alignment_history.stack(), [1, 0, 2]
      )
    else:
      alignments = tf.zeros([_batch_size, _batch_size, _batch_size])

    spectrogram_prediction = decoder_output + top_layer
    if self._both:
      mag_spec_prediction = spectrogram_prediction
      mag_spec_prediction = conv_bn_actv(
          layer_type="conv1d",
          name="conv_0",
          inputs=mag_spec_prediction,
          filters=256,
          kernel_size=4,
          activation_fn=tf.nn.relu,
          strides=1,
          padding="SAME",
          regularizer=regularizer,
          training=training,
          data_format=self.params.get('postnet_data_format', 'channels_last'),
          bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
          bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
      )
      mag_spec_prediction = conv_bn_actv(
          layer_type="conv1d",
          name="conv_1",
          inputs=mag_spec_prediction,
          filters=512,
          kernel_size=4,
          activation_fn=tf.nn.relu,
          strides=1,
          padding="SAME",
          regularizer=regularizer,
          training=training,
          data_format=self.params.get('postnet_data_format', 'channels_last'),
          bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
          bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
      )
      if self._model.get_data_layer()._exp_mag:
        mag_spec_prediction = tf.exp(mag_spec_prediction)
      mag_spec_prediction = tf.layers.conv1d(
          mag_spec_prediction,
          self._n_feats["magnitude"],
          1,
          name="post_net_proj",
          use_bias=False,
      )
    else:
      mag_spec_prediction = tf.zeros([_batch_size, _batch_size, _batch_size])

    stop_token_prediction = tf.sigmoid(stop_token_logits)
    outputs = [
        decoder_output, spectrogram_prediction, alignments,
        stop_token_prediction, sequence_lengths, mag_spec_prediction
    ]

    return {
        'outputs': outputs,
        'stop_token_prediction': stop_token_logits,
    }
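
For reference, a minimal sketch of the input_dict this decoder consumes, built
with TF 1.x placeholders. The shapes and the decoder object are assumptions
for illustration, not part of the example above:

import tensorflow as tf

batch_size, enc_time, enc_dim = 32, 170, 512
spec_time, num_features = 870, 80

input_dict = {
    'encoder_output': {
        # encoder hidden states the decoder attends over
        'outputs': tf.placeholder(tf.float32, [batch_size, enc_time, enc_dim]),
        # true length of each encoded sequence
        'src_length': tf.placeholder(tf.int32, [batch_size]),
    },
    # train mode only: [target spectrogram, stop token labels, target lengths]
    'target_tensors': [
        tf.placeholder(tf.float32, [batch_size, spec_time, num_features]),
        tf.placeholder(tf.float32, [batch_size, spec_time, 1]),
        tf.placeholder(tf.int32, [batch_size]),
    ],
}
# output_dict = decoder._decode(input_dict)  # decoder: a configured instance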
Example #2
    def _encode(self, input_dict):
        """Creates TensorFlow graph for Tacotron-2 like encoder.

    Args:
      input_dict (dict): dictionary with inputs. Must define:

          * source_tensors - list containing:

              * source_sequence - tensor of shape [batch_size, sequence length]
              * src_length - tensor of shape [batch_size]

    Returns:
      dict: A python dictionary containing:

          * outputs - tensor containing the encoded text to be passed to the
            attention layer
          * src_length - the length of the encoded text
    """

        text = input_dict['source_tensors'][0]
        text_len = input_dict['source_tensors'][1]

        training = (self._mode == "train")
        regularizer = self.params.get('regularizer', None)
        data_format = self.params.get('data_format', 'channels_last')
        src_vocab_size = self._model.get_data_layer().params['src_vocab_size']
        zoneout_prob = self.params.get('zoneout_prob', 0.)

        # ----- Embedding layer -----------------------------------------------
        enc_emb_w = tf.get_variable(
            name="EncoderEmbeddingMatrix",
            shape=[src_vocab_size, self.params['src_emb_size']],
            dtype=self.params['dtype'],
        )

        embedded_inputs = tf.cast(tf.nn.embedding_lookup(
            enc_emb_w,
            text,
        ), self.params['dtype'])

        # ----- Convolutional layers -----------------------------------------------
        input_layer = embedded_inputs

        if data_format == 'channels_last':
            top_layer = input_layer
        else:
            top_layer = tf.transpose(input_layer, [0, 2, 1])

        for i, conv_params in enumerate(self.params['conv_layers']):
            ch_out = conv_params['num_channels']
            kernel_size = conv_params['kernel_size']  # 1-D conv: [time]
            strides = conv_params['stride']
            padding = conv_params['padding']

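            # output length after a strided 1-D convolution:
            #   VALID: ceil((len - kernel + 1) / stride)
            #   SAME:  ceil(len / stride)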
            if padding == "VALID":
                text_len = (text_len - kernel_size[0] +
                            strides[0]) // strides[0]
            else:
                text_len = (text_len + strides[0] - 1) // strides[0]

            top_layer = conv_bn_actv(
                layer_type="conv1d",
                name="conv{}".format(i + 1),
                inputs=top_layer,
                filters=ch_out,
                kernel_size=kernel_size,
                activation_fn=self.params['activation_fn'],
                strides=strides,
                padding=padding,
                regularizer=regularizer,
                training=training,
                data_format=data_format,
                bn_momentum=self.params.get('bn_momentum', 0.1),
                bn_epsilon=self.params.get('bn_epsilon', 1e-5),
            )
            top_layer = tf.layers.dropout(top_layer,
                                          rate=self.params["cnn_dropout_prob"],
                                          training=training)

        if data_format == 'channels_first':
            top_layer = tf.transpose(top_layer, [0, 2, 1])

        # ----- RNN ---------------------------------------------------------------
        num_rnn_layers = self.params['num_rnn_layers']
        if num_rnn_layers > 0:
            cell_params = {}
            cell_params["num_units"] = self.params['rnn_cell_dim']
            rnn_type = self.params['rnn_type']
            rnn_input = top_layer
            rnn_vars = []

            if self.params["use_cudnn_rnn"]:
                if self._mode == "infer":
                    cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
                        cell_params["num_units"])
                    cells_fw = [cell() for _ in range(1)]
                    cells_bw = [cell() for _ in range(1)]
                    (top_layer, _,
                     _) = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                         cells_fw,
                         cells_bw,
                         rnn_input,
                         sequence_length=text_len,
                         dtype=rnn_input.dtype,
                         time_major=False)
                else:
                    all_cudnn_classes = [
                        i[1] for i in inspect.getmembers(
                            tf.contrib.cudnn_rnn, inspect.isclass)
                    ]
                    if rnn_type not in all_cudnn_classes:
                        raise TypeError("rnn_type must be a Cudnn RNN class")
                    if zoneout_prob != 0.:
                        raise ValueError(
                            "Zoneout is currently not supported for cudnn rnn classes"
                        )

                    rnn_input = tf.transpose(top_layer, [1, 0, 2])
                    if self.params['rnn_unidirectional']:
                        direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
                    else:
                        direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION

                    rnn_block = rnn_type(num_layers=num_rnn_layers,
                                         num_units=cell_params["num_units"],
                                         direction=direction,
                                         dtype=rnn_input.dtype,
                                         name="cudnn_rnn")
                    rnn_block.build(rnn_input.get_shape())
                    top_layer, _ = rnn_block(rnn_input)
                    top_layer = tf.transpose(top_layer, [1, 0, 2])
                    rnn_vars += rnn_block.trainable_variables

            else:
                multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell([
                    single_cell(cell_class=rnn_type,
                                cell_params=cell_params,
                                zoneout_prob=zoneout_prob,
                                training=training,
                                residual_connections=False)
                    for _ in range(num_rnn_layers)
                ])
                rnn_vars += multirnn_cell_fw.trainable_variables
                if self.params['rnn_unidirectional']:
                    top_layer, _ = tf.nn.dynamic_rnn(
                        cell=multirnn_cell_fw,
                        inputs=rnn_input,
                        sequence_length=text_len,
                        dtype=rnn_input.dtype,
                        time_major=False,
                    )
                else:
                    multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell([
                        single_cell(cell_class=rnn_type,
                                    cell_params=cell_params,
                                    zoneout_prob=zoneout_prob,
                                    training=training,
                                    residual_connections=False)
                        for _ in range(num_rnn_layers)
                    ])
                    top_layer, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=multirnn_cell_fw,
                        cell_bw=multirnn_cell_bw,
                        inputs=rnn_input,
                        sequence_length=text_len,
                        dtype=rnn_input.dtype,
                        time_major=False)
                    # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
                    top_layer = tf.concat(top_layer, 2)
                    rnn_vars += multirnn_cell_bw.trainable_variables

            if regularizer and training:
                cell_weights = []
                cell_weights += rnn_vars
                cell_weights += [enc_emb_w]
                for weights in cell_weights:
                    if "bias" not in weights.name:
                        # print("Added regularizer to {}".format(weights.name))
                        if weights.dtype.base_dtype == tf.float16:
                            tf.add_to_collection('REGULARIZATION_FUNCTIONS',
                                                 (weights, regularizer))
                        else:
                            tf.add_to_collection(
                                ops.GraphKeys.REGULARIZATION_LOSSES,
                                regularizer(weights))

        # -- end of rnn------------------------------------------------------------

        top_layer = tf.layers.dropout(top_layer,
                                      rate=self.params["rnn_dropout_prob"],
                                      training=training)
        outputs = top_layer

        return {'outputs': outputs, 'src_length': text_len}
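
A quick worked check of the sequence-length bookkeeping performed inside the
convolution loop above (pure Python, with made-up lengths):

def conv_out_len(length, kernel, stride, padding):
    # mirrors the arithmetic applied to text_len in the loop above
    if padding == "VALID":
        return (length - kernel + stride) // stride
    return (length + stride - 1) // stride  # SAME

assert conv_out_len(100, 5, 1, "SAME") == 100
assert conv_out_len(100, 5, 1, "VALID") == 96
assert conv_out_len(100, 5, 2, "VALID") == 48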
Example #3
  def _encode(self, input_dict):
    """Encodes data into representation.

    Args:
      input_dict: a Python dictionary.
        Must define:
          * source_tensors - a list containing:
              * source_sequence - a Tensor of shape [batch_size, time] or
                [time, batch_size] (depending on the time_major param)
              * src_length - a Tensor of shape [batch_size]

    Returns:
      a Python dictionary with:
        * outputs - a Tensor of shape [batch_size, time, representation_dim]
          or [time, batch_size, representation_dim]
        * state - a Tensor of shape [batch_size, dim]
        * src_lengths - (copied from the input) a Tensor of shape [batch_size]
        * encoder_input - (copied from the input) the source sequence Tensor
    """
    # TODO: make a separate level of config for cell_params?
    source_sequence = input_dict['source_tensors'][0]
    source_length = input_dict['source_tensors'][1]

    self._enc_emb_w = tf.get_variable(
        name="EncoderEmbeddingMatrix",
        shape=[self._src_vocab_size, self._src_emb_size],
        dtype=tf.float32,
    )

    if self._mode == "train":
      dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
      dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
    else:
      dp_input_keep_prob = 1.0
      dp_output_keep_prob = 1.0

    fwd_cells = [
        single_cell(
            cell_class=self.params['core_cell'],
            cell_params=self.params.get('core_cell_params', {}),
            dp_input_keep_prob=dp_input_keep_prob,
            dp_output_keep_prob=dp_output_keep_prob,
            residual_connections=self.params['encoder_use_skip_connections']
        ) for _ in range(self.params['encoder_layers'])
    ]
    # pylint: disable=no-member
    self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)

    time_major = self.params.get("time_major", False)
    use_swap_memory = self.params.get("use_swap_memory", False)

    embedded_inputs = tf.cast(
        tf.nn.embedding_lookup(
            self.enc_emb_w,
            source_sequence,
        ),
        self.params['dtype'],
    )

    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        cell=self._encoder_cell_fw,
        inputs=embedded_inputs,
        sequence_length=source_length,
        time_major=time_major,
        swap_memory=use_swap_memory,
        dtype=embedded_inputs.dtype,
    )
    return {'outputs': encoder_outputs,
            'state': encoder_state,
            'src_lengths': source_length,
            'encoder_input': source_sequence}
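
A minimal sketch of the input this encoder expects; the vocabulary and shapes
below are hypothetical:

import tensorflow as tf

batch_size, max_time = 16, 50
input_dict = {
    'source_tensors': [
        # token ids, padded to max_time
        tf.placeholder(tf.int32, [batch_size, max_time]),
        # true length of each sequence
        tf.placeholder(tf.int32, [batch_size]),
    ],
}
# out = encoder._encode(input_dict)
# out['outputs'] has shape [batch_size, max_time, num_units of the top layer]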
Example #4
  def _encode(self, input_dict):
    source_sequence = input_dict['source_tensors'][0]
    source_length = input_dict['source_tensors'][1]
    self._enc_emb_w = tf.get_variable(
        name="EncoderEmbeddingMatrix",
        shape=[self._src_vocab_size, self._src_emb_size],
        dtype=tf.float32,
    )

    if self.params['encoder_layers'] < 2:
      raise ValueError("GNMT encoder must have at least 2 layers")

    with tf.variable_scope("Level1FW"):
      self._encoder_l1_cell_fw = single_cell(
          cell_class=self.params['core_cell'],
          cell_params=self.params.get('core_cell_params', {}),
          dp_input_keep_prob=1.0,
          dp_output_keep_prob=1.0,
          residual_connections=False,
      )

    with tf.variable_scope("Level1BW"):
      self._encoder_l1_cell_bw = single_cell(
          cell_class=self.params['core_cell'],
          cell_params=self.params.get('core_cell_params', {}),
          dp_input_keep_prob=1.0,
          dp_output_keep_prob=1.0,
          residual_connections=False,
      )

    if self._mode == "train":
      dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
      dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
    else:
      dp_input_keep_prob = 1.0
      dp_output_keep_prob = 1.0

    with tf.variable_scope("UniDirLevel"):
      self._encoder_cells = [
          single_cell(
              cell_class=self.params['core_cell'],
              cell_params=self.params.get('core_cell_params', {}),
              dp_input_keep_prob=dp_input_keep_prob,
              dp_output_keep_prob=dp_output_keep_prob,
              residual_connections=False,
          ) for _ in range(self.params['encoder_layers'] - 1)
      ]

      # add residual connections starting from the third layer
      for idx, cell in enumerate(self._encoder_cells):
        if idx > 0:
          # pylint: disable=no-member
          self._encoder_cells[idx] = tf.contrib.rnn.ResidualWrapper(cell)

    time_major = self.params.get("time_major", False)
    use_swap_memory = self.params.get("use_swap_memory", False)
    embedded_inputs = tf.cast(
        tf.nn.embedding_lookup(
            self.enc_emb_w,
            source_sequence,
        ),
        self.params['dtype'],
    )

    # first bi-directional layer
    _encoder_output, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=self._encoder_l1_cell_fw,
        cell_bw=self._encoder_l1_cell_bw,
        inputs=embedded_inputs,
        sequence_length=source_length,
        swap_memory=use_swap_memory,
        time_major=time_major,
        dtype=embedded_inputs.dtype,
    )
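    # concatenate fw and bw outputs: [B, T, dim] x 2 --> [B, T, 2*dim]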
    encoder_l1_outputs = tf.concat(_encoder_output, 2)

    # stack of unidirectional layers
    # pylint: disable=no-member
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        cell=tf.contrib.rnn.MultiRNNCell(self._encoder_cells),
        inputs=encoder_l1_outputs,
        sequence_length=source_length,
        swap_memory=use_swap_memory,
        time_major=time_major,
        dtype=encoder_l1_outputs.dtype,
    )

    return {'outputs': encoder_outputs,
            'state': encoder_state,
            'src_lengths': source_length,
            'encoder_input': source_sequence}
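
The stack above wraps every unidirectional cell after the first in a
ResidualWrapper, which adds the cell's input to its output. A minimal
standalone sketch of that wrapper with toy dimensions (TF 1.x contrib API):

import tensorflow as tf

cell = tf.contrib.rnn.ResidualWrapper(tf.nn.rnn_cell.LSTMCell(num_units=8))
x = tf.placeholder(tf.float32, [4, 8])  # input dim must equal num_units
state = cell.zero_state(4, tf.float32)
out, new_state = cell(x, state)         # out = lstm_output + x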
Example #5
    def _encode(self, input_dict):
        """
    Encodes data into representation
    :param input_dict: a Python dictionary.
    Must define:
      * src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size]
                    (depending on time_major param)
      * src_lengths - a Tensor of shape [batch_size]
    :return: a Python dictionary with:
      * encoder_outputs - a Tensor of shape
                          [batch_size, time, representation_dim]
      or [time, batch_size, representation_dim]
      * encoder_state - a Tensor of shape [batch_size, dim]
      * src_lengths - (copy ref from input) a Tensor of shape [batch_size]
    """
        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)

        regularizer = self.params.get('regularizer', None)
        fc_use_bias = self.params.get('fc_use_bias', True)

        use_cudnn_rnn = self.params.get("use_cudnn_rnn", False)
        cudnn_rnn_type = self.params.get("cudnn_rnn_type", None)

        if 'initializer' in self.params:
            init_dict = self.params.get('initializer_params', {})
            initializer = self.params['initializer'](**init_dict)
        else:
            initializer = None

        if self._mode == "train":
            dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
            dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
            last_input_keep_prob = self.params['encoder_last_input_keep_prob']
            last_output_keep_prob = self.params[
                'encoder_last_output_keep_prob']
            emb_keep_prob = self.params['encoder_emb_keep_prob']
            recurrent_keep_prob = self.params['recurrent_keep_prob']
            input_weight_keep_prob = self.params['input_weight_keep_prob']
            recurrent_weight_keep_prob = self.params[
                'recurrent_weight_keep_prob']

        else:
            dp_input_keep_prob, dp_output_keep_prob = 1.0, 1.0
            last_input_keep_prob, last_output_keep_prob = 1.0, 1.0
            emb_keep_prob, recurrent_keep_prob = 1.0, 1.0
            input_weight_keep_prob, recurrent_weight_keep_prob = 1.0, 1.0

        self._output_layer = tf.layers.Dense(self._fc_dim,
                                             kernel_regularizer=regularizer,
                                             kernel_initializer=initializer,
                                             use_bias=fc_use_bias,
                                             dtype=self._params['dtype'])

        if self._weight_tied:
            last_cell_params = copy.deepcopy(self.params['core_cell_params'])
            last_cell_params['num_units'] = self._emb_size
        else:
            last_cell_params = self.params['core_cell_params']

        last_output_dim = last_cell_params['num_units']

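        # when the cell state is also fed to the projection, its input is the
        # concatenation [h; c], so the expected input dim doubles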
        if self._use_cell_state:
            last_output_dim = 2 * last_output_dim

        fake_input = tf.zeros(shape=(1, last_output_dim),
                              dtype=self._params['dtype'])
        fake_output = self._output_layer.apply(fake_input)
        with tf.variable_scope("dense", reuse=True):
            dense_weights = tf.get_variable("kernel")
            dense_biases = tf.get_variable("bias")

        if self._weight_tied and self._lm_phase:
            enc_emb_w = tf.transpose(dense_weights)
        else:
            enc_emb_w = tf.get_variable(
                name="EncoderEmbeddingMatrix",
                shape=[self._vocab_size, self._emb_size],
                dtype=self._params['dtype'])

        self._enc_emb_w = tf.nn.dropout(enc_emb_w, keep_prob=emb_keep_prob)

        if use_cudnn_rnn:
            if self._mode == 'train' or self._mode == 'eval':
                all_cudnn_classes = [
                    i[1] for i in inspect.getmembers(tf.contrib.cudnn_rnn,
                                                     inspect.isclass)
                ]

                if cudnn_rnn_type not in all_cudnn_classes:
                    raise TypeError("rnn_type must be a Cudnn RNN class")

                rnn_block = cudnn_rnn_type(
                    num_layers=self.params['encoder_layers'],
                    num_units=self._emb_size,
                    dtype=self._params['dtype'],
                    name="cudnn_rnn")
            else:
                # Transferring weights from model trained with CudnnLSTM/CudnnGRU
                # to CudnnCompatibleLSTMCell/CudnnCompatibleGRUCell for inference
                if 'CudnnLSTM' in str(cudnn_rnn_type):
                    cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
                        num_units=self._emb_size)
                elif 'CudnnGRU' in str(cudnn_rnn_type):
                    cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(
                        num_units=self._emb_size)

                fwd_cells = [
                    cell() for _ in range(self.params['encoder_layers'])
                ]
                self._encoder_cell_fw = tf.nn.rnn_cell.MultiRNNCell(fwd_cells)
        else:
            fwd_cells = [
                single_cell(
                    cell_class=self.params['core_cell'],
                    cell_params=self.params['core_cell_params'],
                    dp_input_keep_prob=dp_input_keep_prob,
                    dp_output_keep_prob=dp_output_keep_prob,
                    recurrent_keep_prob=recurrent_keep_prob,
                    input_weight_keep_prob=input_weight_keep_prob,
                    recurrent_weight_keep_prob=recurrent_weight_keep_prob,
                    weight_variational=self.params['weight_variational'],
                    dropout_seed=self.params['dropout_seed'],
                    residual_connections=self.params['encoder_use_skip_connections'],
                    awd_initializer=self.params['awd_initializer'],
                    dtype=self._params['dtype'])
                for _ in range(self.params['encoder_layers'] - 1)
            ]

            fwd_cells.append(
                single_cell(
                    cell_class=self.params['core_cell'],
                    cell_params=last_cell_params,
                    dp_input_keep_prob=last_input_keep_prob,
                    dp_output_keep_prob=last_output_keep_prob,
                    recurrent_keep_prob=recurrent_keep_prob,
                    input_weight_keep_prob=input_weight_keep_prob,
                    recurrent_weight_keep_prob=recurrent_weight_keep_prob,
                    weight_variational=self.params['weight_variational'],
                    dropout_seed=self.params['dropout_seed'],
                    residual_connections=self.params['encoder_use_skip_connections'],
                    awd_initializer=self.params['awd_initializer'],
                    dtype=self._params['dtype']))

            self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)

        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)

        source_sequence = input_dict['source_tensors'][0]
        source_length = input_dict['source_tensors'][1]

        # Inference for language modeling requires a different graph
        if not self._lm_phase or self._mode in ('train', 'eval'):
            embedded_inputs = tf.cast(
                tf.nn.embedding_lookup(
                    self.enc_emb_w,
                    source_sequence,
                ), self.params['dtype'])

            if use_cudnn_rnn:
                # CudnnLSTM returns encoder_state as a tuple of hidden and
                # cell values; the hidden and cell tensors are stored per
                # LSTM layer.

                # transpose [B, T, C] --> [T, B, C]
                if not time_major:
                    embedded_inputs = tf.transpose(embedded_inputs, [1, 0, 2])

                rnn_block.build(embedded_inputs.get_shape())
                encoder_outputs, encoder_state = rnn_block(embedded_inputs)
                encoder_outputs = tf.transpose(encoder_outputs, [1, 0, 2])
            else:
                encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                    cell=self._encoder_cell_fw,
                    inputs=embedded_inputs,
                    sequence_length=source_length,
                    time_major=time_major,
                    swap_memory=use_swap_memory,
                    dtype=self._params['dtype'],
                    scope='decoder',
                )

            if not self._lm_phase:
                # CudnnLSTM stores cell and hidden state differently
                if use_cudnn_rnn:
                    if self._use_cell_state:
                        encoder_outputs = tf.concat(
                            [encoder_state[0][-1], encoder_state[1][-1]],
                            axis=1)
                    else:
                        encoder_outputs = encoder_state[0][-1]
                else:
                    if self._use_cell_state:
                        encoder_outputs = tf.concat(
                            [encoder_state[-1].h, encoder_state[-1].c], axis=1)
                    else:
                        encoder_outputs = encoder_state[-1].h

            if self._mode == 'train' and self._num_sampled < self._fc_dim:  # sampled softmax
                output_dict = {
                    'weights': enc_emb_w,
                    'bias': dense_biases,
                    'inputs': encoder_outputs,
                    'logits': encoder_outputs,
                    'outputs': [encoder_outputs],
                    'num_sampled': self._num_sampled
                }
            else:  # full softmax
                logits = self._output_layer.apply(encoder_outputs)
                output_dict = {'logits': logits, 'outputs': [logits]}
        else:  # infer in LM phase
            # This portion of graph is required to restore weights from CudnnLSTM to
            # CudnnCompatibleLSTMCell/CudnnCompatibleGRUCell
            if use_cudnn_rnn:
                embedded_inputs = tf.cast(
                    tf.nn.embedding_lookup(
                        self.enc_emb_w,
                        source_sequence,
                    ), self.params['dtype'])

                # Scope must remain unset to restore weights
                encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                    cell=self._encoder_cell_fw,
                    inputs=embedded_inputs,
                    sequence_length=source_length,
                    time_major=time_major,
                    swap_memory=use_swap_memory,
                    dtype=self._params['dtype'])

            embedding_fn = lambda ids: tf.cast(
                tf.nn.embedding_lookup(
                    self.enc_emb_w,
                    ids,
                ), self.params['dtype'])

            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=embedding_fn,
                start_tokens=tf.constant(self.params['seed_tokens']),
                end_token=self.params['end_token'])

            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=self._encoder_cell_fw,
                helper=helper,
                initial_state=self._encoder_cell_fw.zero_state(
                    batch_size=self._batch_size,
                    dtype=self._params['dtype'],
                ),
                output_layer=self._output_layer,
            )
            maximum_iterations = tf.constant(self._num_tokens_gen)

            final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                impute_finished=False,
                maximum_iterations=maximum_iterations,
                swap_memory=use_swap_memory,
                output_time_major=time_major,
            )
            output_dict = {
                'logits': final_outputs.rnn_output,
                'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)],
                'final_state': final_state,
                'final_sequence_lengths': final_sequence_lengths
            }

        return output_dict
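
The weight-tying branch above reuses the transposed softmax kernel as the
embedding matrix. A standalone sketch of the idea with toy sizes (not this
class's API):

import tensorflow as tf

vocab_size, emb_size = 1000, 128
output_layer = tf.layers.Dense(vocab_size, name="dense")

# apply once on a fake batch so the layer's variables get created
_ = output_layer.apply(tf.zeros([1, emb_size]))
with tf.variable_scope("dense", reuse=True):
    kernel = tf.get_variable("kernel")          # [emb_size, vocab_size]

emb_w = tf.transpose(kernel)                    # [vocab_size, emb_size]
ids = tf.placeholder(tf.int32, [None])
embedded = tf.nn.embedding_lookup(emb_w, ids)   # shares weights with the softmax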
Example #6
    def _embed_style(self, style_spec, style_len):
        """
    Code that implements the reference encoder as described in "Towards
    end-to-end prosody transfer for expressive speech synthesis with Tacotron",
    and "Style Tokens: Unsupervised Style Modeling, Control and Transfer in
    End-to-End Speech Synthesis"

    Config parameters:

    * **conv_layers** (list) --- See the conv_layers parameter for the
      Tacotron-2 model.
    * **num_rnn_layers** (int) --- Number of rnn layers in the reference encoder
    * **rnn_cell_dim** (int) --- Size of rnn layer
    * **rnn_unidirectional** (bool) --- Uni- or bi-directional rnn.
    * **rnn_type** --- Must be a valid tf rnn cell class
    * **emb_size** (int) --- Size of gst
    * **attention_layer_size** (int) --- Size of linear layers in attention
    * **num_tokens** (int) --- Number of tokens for gst
    * **num_heads** (int) --- Number of attention heads
    """
        training = (self._mode == "train")
        regularizer = self.params.get('regularizer', None)
        data_format = self.params.get('data_format', 'channels_last')
        batch_size = style_spec.get_shape().as_list()[0]

        top_layer = tf.expand_dims(style_spec, -1)
        params = self.params['style_embedding_params']
        if "conv_layers" in params:
            for i, conv_params in enumerate(params['conv_layers']):
                ch_out = conv_params['num_channels']
                kernel_size = conv_params['kernel_size']  # [time, freq]
                strides = conv_params['stride']
                padding = conv_params['padding']

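                # same output-length arithmetic as the encoder convolutions
                # in the Tacotron-2 example above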
                if padding == "VALID":
                    style_len = (style_len - kernel_size[0] +
                                 strides[0]) // strides[0]
                else:
                    style_len = (style_len + strides[0] - 1) // strides[0]

                top_layer = conv_bn_actv(
                    layer_type="conv2d",
                    name="conv{}".format(i + 1),
                    inputs=top_layer,
                    filters=ch_out,
                    kernel_size=kernel_size,
                    activation_fn=self.params['activation_fn'],
                    strides=strides,
                    padding=padding,
                    regularizer=regularizer,
                    training=training,
                    data_format=data_format,
                    bn_momentum=self.params.get('bn_momentum', 0.1),
                    bn_epsilon=self.params.get('bn_epsilon', 1e-5),
                )

            if data_format == 'channels_first':
                # conv2d output is [B, C, T, F]; move channels back to last
                top_layer = tf.transpose(top_layer, [0, 2, 3, 1])

        top_layer = tf.concat(tf.unstack(top_layer, axis=2), axis=-1)

        num_rnn_layers = params['num_rnn_layers']
        if num_rnn_layers > 0:
            cell_params = {}
            cell_params["num_units"] = params['rnn_cell_dim']
            rnn_type = params['rnn_type']
            rnn_input = top_layer
            rnn_vars = []

            multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell([
                single_cell(cell_class=rnn_type,
                            cell_params=cell_params,
                            training=training,
                            residual_connections=False)
                for _ in range(num_rnn_layers)
            ])
            rnn_vars += multirnn_cell_fw.trainable_variables
            if params['rnn_unidirectional']:
                top_layer, final_state = tf.nn.dynamic_rnn(
                    cell=multirnn_cell_fw,
                    inputs=rnn_input,
                    sequence_length=style_len,
                    dtype=rnn_input.dtype,
                    time_major=False,
                )
                final_state = final_state[0]
            else:
                multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell([
                    single_cell(cell_class=rnn_type,
                                cell_params=cell_params,
                                training=training,
                                residual_connections=False)
                    for _ in range(num_rnn_layers)
                ])
                top_layer, final_state = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=multirnn_cell_fw,
                    cell_bw=multirnn_cell_bw,
                    inputs=rnn_input,
                    sequence_length=style_len,
                    dtype=rnn_input.dtype,
                    time_major=False)
                # concat fw and bw final hidden states:
                # [B, n_cell_dim] x 2 --> [B, 2*n_cell_dim]
                final_state = tf.concat(
                    (final_state[0][0].h, final_state[1][0].h), 1)
                rnn_vars += multirnn_cell_bw.trainable_variables

            top_layer = final_state
            # Apply linear layer
            top_layer = tf.layers.dense(top_layer,
                                        128,
                                        activation=tf.nn.tanh,
                                        kernel_regularizer=regularizer,
                                        name="reference_activation")
            if regularizer and training:
                cell_weights = rnn_vars
                for weights in cell_weights:
                    if "bias" not in weights.name:
                        # print("Added regularizer to {}".format(weights.name))
                        if weights.dtype.base_dtype == tf.float16:
                            tf.add_to_collection('REGULARIZATION_FUNCTIONS',
                                                 (weights, regularizer))
                        else:
                            tf.add_to_collection(
                                ops.GraphKeys.REGULARIZATION_LOSSES,
                                regularizer(weights))

        num_units = params["num_tokens"]
        att_size = params["attention_layer_size"]

        # Randomly initialized tokens
        gst_embedding = tf.get_variable(
            "token_embeddings",
            shape=[num_units, params["emb_size"]],
            dtype=self.params["dtype"],
            initializer=tf.random_uniform_initializer(
                minval=-1., maxval=1., dtype=self.params["dtype"]),
            trainable=False)

        attention = attention_layer.Attention(att_size,
                                              params["num_heads"],
                                              0.,
                                              training,
                                              mode="bahdanau")

        top_layer = tf.expand_dims(top_layer, 1)
        gst_embedding = tf.nn.tanh(gst_embedding)
        gst_embedding = tf.expand_dims(gst_embedding, 0)
        gst_embedding = tf.tile(gst_embedding, [batch_size, 1, 1])
        token_embeddings = attention(top_layer, gst_embedding, None)
        token_embeddings = tf.squeeze(token_embeddings, 1)

        return token_embeddings
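
To make the token-attention shapes above concrete, a hedged sketch with toy
sizes (the repo's multi-head Attention layer is left as a comment, since only
its query/memory interface is used here):

import tensorflow as tf

batch_size, num_tokens, emb_size = 8, 10, 256
ref_embedding = tf.placeholder(tf.float32, [batch_size, 128])

query = tf.expand_dims(ref_embedding, 1)              # [B, 1, 128]
tokens = tf.get_variable("style_tokens", [num_tokens, emb_size])
memory = tf.tile(tf.expand_dims(tf.nn.tanh(tokens), 0),
                 [batch_size, 1, 1])                  # [B, num_tokens, emb_size]
# style = tf.squeeze(attention(query, memory, None), 1)  # [B, att_size]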