def causal_conv_bn_actv(
    layer_type, name, inputs, filters, kernel_size, activation_fn, strides,
    padding, regularizer, training, data_format, bn_momentum, bn_epsilon,
    dilation=1):
  """
  Builds one dilated conv + batch norm + activation layer and left-pads its
  output so that it can be used as a causal convolution.

  Every argument is forwarded unchanged to ``conv_bn_actv``. The returned
  tensor is that layer's output with ``dilation * (kernel_size - 1)`` zeros
  prepended along axis 1; axes 0 and 2 are not padded.

  NOTE(review): the padding spec has exactly three entries, so the conv output
  must be rank 3, and ``kernel_size`` must be a scalar for the arithmetic to
  work. Axis 1 is presumably the time axis (i.e. channels_last) -- confirm
  against callers.
  """
  conv_kwargs = dict(
      layer_type=layer_type,
      name=name,
      inputs=inputs,
      filters=filters,
      kernel_size=kernel_size,
      activation_fn=activation_fn,
      strides=strides,
      padding=padding,
      regularizer=regularizer,
      training=training,
      data_format=data_format,
      bn_momentum=bn_momentum,
      bn_epsilon=bn_epsilon,
      dilation=dilation,
  )
  conv_out = conv_bn_actv(**conv_kwargs)

  # Number of zeros to prepend on axis 1; grows with the dilation rate.
  left_pad = dilation * (kernel_size - 1)
  paddings = [[0, 0], [left_pad, 0], [0, 0]]
  return tf.pad(conv_out, paddings)
  def _embed_style( self, style_spec, style_len ):
    """
    Builds the reference (style) encoder graph, following the reference
    encoder described in "Towards end-to-end prosody transfer for expressive
    speech synthesis with Tacotron" and "Style Tokens: Unsupervised Style
    Modeling, Control and Transfer in End-to-End Speech Synthesis".

    Config parameters read from ``self.params['style_embedding_params']``:

    * **conv_layers** (list, optional) --- See the conv_layers parameter for
      the Tacotron-2 model. Each entry must define ``num_channels``,
      ``kernel_size``, ``stride`` and ``padding``.
    * **num_rnn_layers** (int) --- Number of rnn layers in the reference encoder
    * **rnn_cell_dim** (int) --- Size of rnn layer
    * **rnn_unidirectional** (bool) --- Uni- or bi-directional rnn.
    * **rnn_type** --- Must be a valid tf rnn cell class

    The keys **emb_size**, **attention_layer_size**, **num_tokens** and
    **num_heads** are not read by this method; the projection after the RNN
    has a fixed width of 256 units.

    Args:
      style_spec: reference spectrogram tensor. A trailing channel axis is
        appended before the 2-D convolutions. Presumably laid out as
        [batch, time, freq] -- TODO confirm against the data layer.
      style_len: per-example length of ``style_spec``. It is rescaled after
        every conv layer and used as the RNN ``sequence_length``.

    Returns:
      If ``num_rnn_layers > 0``: the tanh-activated 256-unit projection of the
      final state of RNN layer 0 (forward and backward states concatenated in
      the bidirectional case). Otherwise: the conv features after flattening
      axis 2 into the last axis and applying dropout.
    """
    training = (self._mode == "train")
    regularizer = self.params.get('regularizer', None)
    data_format = self.params.get('data_format', 'channels_last')

    # Append a channel axis so the spectrogram can be fed to conv2d.
    top_layer = tf.expand_dims(style_spec, -1)
    params = self.params['style_embedding_params']
    if "conv_layers" in params:
      for i, conv_params in enumerate(params['conv_layers']):
        ch_out = conv_params['num_channels']
        kernel_size = conv_params['kernel_size']  # [time, freq]
        strides = conv_params['stride']
        padding = conv_params['padding']

        # Track how the valid length shrinks along the time axis so the RNN
        # below still receives correct sequence lengths.
        if padding == "VALID":
          style_len = (style_len - kernel_size[0] + strides[0]) // strides[0]
        else:
          style_len = (style_len + strides[0] - 1) // strides[0]

        top_layer = conv_bn_actv(
            layer_type="conv2d",
            name="conv{}".format(i + 1),
            inputs=top_layer,
            filters=ch_out,
            kernel_size=kernel_size,
            activation_fn=self.params['activation_fn'],
            strides=strides,
            padding=padding,
            regularizer=regularizer,
            training=training,
            data_format=data_format,
            bn_momentum=self.params.get('bn_momentum', 0.1),
            bn_epsilon=self.params.get('bn_epsilon', 1e-5),
        )

      if data_format == 'channels_first':
        # NOTE(review): this permutation has three entries while the conv2d
        # output is presumably rank 4, and the channel axis above is appended
        # last -- the channels_first path looks untested; confirm before use.
        top_layer = tf.transpose(top_layer, [0, 2, 1])

    # Fold axis 2 into the last axis so the tensor becomes rank 3 for the RNN.
    top_layer = tf.concat(tf.unstack(top_layer, axis=2), axis=-1)
    top_layer = tf.layers.dropout(
          top_layer, rate=self.params["cnn_dropout_prob"], training=training
      )

    num_rnn_layers = params['num_rnn_layers']
    if num_rnn_layers > 0:
      cell_params = {}
      cell_params["num_units"] = params['rnn_cell_dim']
      rnn_type = params['rnn_type']
      rnn_input = top_layer
      rnn_vars = []

      multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
          [
              single_cell(
                  cell_class=rnn_type,
                  cell_params=cell_params,
                  training=training,
                  residual_connections=False
              ) for _ in range(num_rnn_layers)
          ]
      )
      if params['rnn_unidirectional']:
        top_layer, final_state = tf.nn.dynamic_rnn(
            cell=multirnn_cell_fw,
            inputs=rnn_input,
            sequence_length=style_len,
            dtype=rnn_input.dtype,
            time_major=False,
        )
        # Keep only the final state of layer 0.
        final_state = final_state[0]
        # Cell weights are created when the cell is first called, so they
        # must be collected after the RNN has been built; reading
        # trainable_variables earlier yields an empty list and the forward
        # weights would silently escape regularization.
        rnn_vars += multirnn_cell_fw.trainable_variables
      else:
        multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell(
            [
                single_cell(
                    cell_class=rnn_type,
                    cell_params=cell_params,
                    training=training,
                    residual_connections=False
                ) for _ in range(num_rnn_layers)
            ]
        )
        top_layer, final_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=multirnn_cell_fw,
            cell_bw=multirnn_cell_bw,
            inputs=rnn_input,
            sequence_length=style_len,
            dtype=rnn_input.dtype,
            time_major=False
        )
        # Concatenate the layer-0 final states of both directions along
        # axis 1. NOTE(review): this assumes each per-layer state is a single
        # tensor (e.g. a GRU-style cell) -- confirm for tuple-state cells.
        final_state = tf.concat((final_state[0][0], final_state[1][0]), 1)
        # Collected after the RNN call for the same reason as above.
        rnn_vars += multirnn_cell_fw.trainable_variables
        rnn_vars += multirnn_cell_bw.trainable_variables

      top_layer = final_state
      # Apply linear layer
      top_layer = tf.layers.dense(
          top_layer,
          256,
          activation=tf.nn.tanh,
          kernel_regularizer=regularizer,
          name="reference_activation"
      )

      if regularizer and training:
        for weights in rnn_vars:
          # Biases are excluded from weight regularization.
          if "bias" not in weights.name:
            if weights.dtype.base_dtype == tf.float16:
              # float16 weights are registered as (variable, fn) pairs in a
              # separate collection instead of being evaluated here.
              tf.add_to_collection(
                  'REGULARIZATION_FUNCTIONS', (weights, regularizer)
              )
            else:
              tf.add_to_collection(
                  ops.GraphKeys.REGULARIZATION_LOSSES, regularizer(weights)
              )

    return top_layer
  def _encode(self, input_dict):
    """Creates TensorFlow graph for a Tacotron-2 like encoder that is
    conditioned on a reference-spectrogram style embedding.

    Args:
       input_dict (dict): dictionary with inputs.
        Must define:

            source_tensors - array containing, in this order, [

              * source_sequence: tensor of shape [batch_size, sequence length]
              * src_length: tensor of shape [batch_size]
              * mel: reference spectrogram passed to ``_embed_style``
              * mel_length: length of the reference spectrogram
              * words_per_frame: tensor concatenated to the style embedding
                on the last axis
              * chars_per_frame: tensor concatenated to the style embedding
                on the last axis

            ]

    Returns:
      dict: A python dictionary containing:

          * outputs - tensor produced by two tanh dense layers (1024 then 512
            units) applied to the concatenated forward/backward layer-0 final
            states of the "concatenated_encoder" RNN
          * src_length - the text length after the convolutional layers
    """

    text = input_dict['source_tensors'][0]
    text_len = input_dict['source_tensors'][1]
    mel = input_dict['source_tensors'][2]
    mel_length = input_dict['source_tensors'][3]
    words_per_frame = input_dict['source_tensors'][4]
    chars_per_frame = input_dict['source_tensors'][5]

    training = (self._mode == "train")
    regularizer = self.params.get('regularizer', None)
    data_format = self.params.get('data_format', 'channels_last')
    src_vocab_size = self._model.get_data_layer().params['src_vocab_size']
    zoneout_prob = self.params.get('zoneout_prob', 0.)

    # ----- Embedding layer -----------------------------------------------
    enc_emb_w = tf.get_variable(
        name="EncoderEmbeddingMatrix",
        shape=[src_vocab_size, self.params['src_emb_size']],
        dtype=self.params['dtype'],
    )

    embedded_inputs = tf.cast(
        tf.nn.embedding_lookup(
            enc_emb_w,
            text,
        ), self.params['dtype']
    )

    # ----- Convolutional layers -----------------------------------------------
    input_layer = embedded_inputs

    if data_format == 'channels_last':
      top_layer = input_layer
    else:
      top_layer = tf.transpose(input_layer, [0, 2, 1])

    for i, conv_params in enumerate(self.params['conv_layers']):
      ch_out = conv_params['num_channels']
      kernel_size = conv_params['kernel_size']  # [time, freq]
      strides = conv_params['stride']
      padding = conv_params['padding']

      # Keep text_len in sync with the length reduction of each conv layer.
      if padding == "VALID":
        text_len = (text_len - kernel_size[0] + strides[0]) // strides[0]
      else:
        text_len = (text_len + strides[0] - 1) // strides[0]

      top_layer = conv_bn_actv(
          layer_type="conv1d",
          name="conv{}".format(i + 1),
          inputs=top_layer,
          filters=ch_out,
          kernel_size=kernel_size,
          activation_fn=self.params['activation_fn'],
          strides=strides,
          padding=padding,
          regularizer=regularizer,
          training=training,
          data_format=data_format,
          bn_momentum=self.params.get('bn_momentum', 0.1),
          bn_epsilon=self.params.get('bn_epsilon', 1e-5),
      )
      top_layer = tf.layers.dropout(
          top_layer, rate=self.params["cnn_dropout_prob"], training=training
      )

    if data_format == 'channels_first':
      top_layer = tf.transpose(top_layer, [0, 2, 1])

    # ----- RNN ---------------------------------------------------------------
    num_rnn_layers = self.params['num_rnn_layers']
    cell_params = {}
    cell_params["num_units"] = self.params['rnn_cell_dim']
    rnn_type = self.params['rnn_type']
    rnn_input = top_layer
    rnn_vars = []

    multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
        [
            single_cell(
                cell_class=rnn_type,
                cell_params=cell_params,
                zoneout_prob=zoneout_prob,
                training=training,
                residual_connections=False
            ) for _ in range(num_rnn_layers)
        ]
    )

    if self.params['rnn_unidirectional']:
      # The final state of this RNN is not used; only the per-step outputs
      # feed the rest of the graph.
      top_layer, _ = tf.nn.dynamic_rnn(
          cell=multirnn_cell_fw,
          inputs=rnn_input,
          sequence_length=text_len,
          dtype=rnn_input.dtype,
          time_major=False,
      )
      # Cell weights are created when the cell is first called, so they are
      # collected only after the RNN has been built; reading
      # trainable_variables earlier yields an empty list and the forward
      # weights would silently escape regularization.
      rnn_vars += multirnn_cell_fw.trainable_variables
    else:
      multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell(
          [
              single_cell(
                  cell_class=rnn_type,
                  cell_params=cell_params,
                  zoneout_prob=zoneout_prob,
                  training=training,
                  residual_connections=False
              ) for _ in range(num_rnn_layers)
          ]
      )
      top_layer, _ = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=multirnn_cell_fw,
          cell_bw=multirnn_cell_bw,
          inputs=rnn_input,
          sequence_length=text_len,
          dtype=rnn_input.dtype,
          time_major=False
      )
      # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
      top_layer = tf.concat(top_layer, 2)
      rnn_vars += multirnn_cell_fw.trainable_variables
      rnn_vars += multirnn_cell_bw.trainable_variables

    # -- end of rnn------------------------------------------------------------

    top_layer = tf.layers.dropout(
        top_layer, rate=self.params["rnn_dropout_prob"], training=training
    )
    with tf.variable_scope("style_encoder"):
      style_embedding = self._embed_style( mel, mel_length )
      style_embedding = tf.concat( [style_embedding, words_per_frame, chars_per_frame], axis=-1 )

    style_embedding = tf.layers.dense(
          style_embedding,
          256,
          activation=tf.nn.tanh,
          kernel_regularizer=regularizer,
          name="extended_style_dense_layer"
      )

    # Repeat the per-utterance style vector along a new axis 1 so it can be
    # concatenated with the per-step text encoding.
    style_embedding = tf.expand_dims(style_embedding, 1)
    style_embedding = tf.tile(
        style_embedding,
        [1, tf.reduce_max(text_len), 1]
    )

    outputs = tf.concat([top_layer, style_embedding], axis=-1)

    with tf.variable_scope("concatenated_encoder"):
      cell_params = {}
      cell_params["num_units"] = 512
      # NOTE(review): the forward cells below are built without zoneout_prob
      # while the backward cells receive it -- confirm whether the asymmetry
      # is intentional.
      multirnn_cell_fw2 = tf.nn.rnn_cell.MultiRNNCell(
            [
                single_cell(
                    cell_class=rnn_type,
                    cell_params=cell_params,
                    training=training,
                    residual_connections=False
                ) for _ in range(num_rnn_layers)
            ]
        )

      multirnn_cell_bw2 = tf.nn.rnn_cell.MultiRNNCell(
            [
                single_cell(
                    cell_class=rnn_type,
                    cell_params=cell_params,
                    zoneout_prob=zoneout_prob,
                    training=training,
                    residual_connections=False
                ) for _ in range(num_rnn_layers)
            ]
        )

      _, final_state = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=multirnn_cell_fw2,
          cell_bw=multirnn_cell_bw2,
          inputs=outputs,
          sequence_length=text_len,
          dtype=rnn_input.dtype,
          time_major=False
      )

      # Collected after the RNN call so the cells' weights already exist.
      rnn_vars += multirnn_cell_fw2.trainable_variables
      rnn_vars += multirnn_cell_bw2.trainable_variables
      # Concatenate the layer-0 final states of both directions on axis 1.
      final_state = tf.concat((final_state[0][0], final_state[1][0]), 1)

    if regularizer and training:
      cell_weights = []
      cell_weights += rnn_vars
      cell_weights += [enc_emb_w]
      for weights in cell_weights:
        # Biases are excluded from weight regularization.
        if "bias" not in weights.name:
          if weights.dtype.base_dtype == tf.float16:
            # float16 weights are registered as (variable, fn) pairs in a
            # separate collection instead of being evaluated here.
            tf.add_to_collection(
                'REGULARIZATION_FUNCTIONS', (weights, regularizer)
            )
          else:
            tf.add_to_collection(
                ops.GraphKeys.REGULARIZATION_LOSSES, regularizer(weights)
            )

    dense_outputs = tf.layers.dense(
            final_state,
            1024,
            activation=tf.nn.tanh,
            kernel_regularizer=regularizer,
            name="concatenated_encoder_activation1"
    )

    dense_outputs = tf.layers.dense(
            dense_outputs,
            512,
            activation=tf.nn.tanh,
            kernel_regularizer=regularizer,
            name="concatenated_encoder_activation2"
    )

    return {
        'outputs': dense_outputs,
        'src_length': text_len,
    }
    def _encode(self, input_dict):
        """Creates TensorFlow graph for Tacotron-2 like encoder.

        Args:
           input_dict (dict): dictionary with inputs.
            Must define:

                source_tensors - array containing [

                  * source_sequence: tensor of shape [batch_size, sequence length]
                  * src_length: tensor of shape [batch_size]
                  * (only read when ``style_embedding_enable`` is set) the
                    style spectrogram at index 2 and its length at index 3

                ]

        Returns:
          dict: A python dictionary containing:

              * outputs - tensor containing the encoded text to be passed to the
                attention layer; when style embedding is enabled the tiled
                style embedding is concatenated on the last axis
              * src_length - the length of the encoded text

        Raises:
          ValueError: if style embedding is enabled but
            ``style_embedding_params`` is missing, if the data layer's
            ``style_input`` is not ``"wav"``, or if zoneout is requested
            together with a cudnn rnn class.
          TypeError: if ``use_cudnn_rnn`` is set in a non-infer mode and
            ``rnn_type`` is not a class from ``tf.contrib.cudnn_rnn``.
        """

        text = input_dict['source_tensors'][0]
        text_len = input_dict['source_tensors'][1]

        training = (self._mode == "train")
        regularizer = self.params.get('regularizer', None)
        data_format = self.params.get('data_format', 'channels_last')
        src_vocab_size = self._model.get_data_layer().params['src_vocab_size']
        zoneout_prob = self.params.get('zoneout_prob', 0.)

        # ----- Embedding layer -----------------------------------------------
        enc_emb_w = tf.get_variable(
            name="EncoderEmbeddingMatrix",
            shape=[src_vocab_size, self.params['src_emb_size']],
            dtype=self.params['dtype'],
        )

        embedded_inputs = tf.cast(tf.nn.embedding_lookup(
            enc_emb_w,
            text,
        ), self.params['dtype'])

        # ----- Style layer ---------------------------------------------------
        if self.params.get("style_embedding_enable", False):
            if "style_embedding_params" not in self.params:
                # A single string: passing two arguments to ValueError makes
                # the message render as a tuple.
                raise ValueError(
                    "style_embedding_params must be passed if style embedding "
                    "is enabled")
            with tf.variable_scope("style_encoder"):
                if (self._model.get_data_layer().params.get(
                        "style_input", None) == "wav"):
                    style_spec = input_dict['source_tensors'][2]
                    style_len = input_dict['source_tensors'][3]
                    style_embedding = self._embed_style(style_spec, style_len)
                else:
                    raise ValueError(
                        "The data layer's style input parameter must be set.")
                # Repeat the per-utterance style vector along a new axis 1 so
                # it can later be concatenated with the per-step encoding.
                style_embedding = tf.expand_dims(style_embedding, 1)
                style_embedding = tf.tile(style_embedding,
                                          [1, tf.reduce_max(text_len), 1])

        # ----- Convolutional layers -----------------------------------------------
        input_layer = embedded_inputs

        if data_format == 'channels_last':
            top_layer = input_layer
        else:
            top_layer = tf.transpose(input_layer, [0, 2, 1])

        for i, conv_params in enumerate(self.params['conv_layers']):
            ch_out = conv_params['num_channels']
            kernel_size = conv_params['kernel_size']  # [time, freq]
            strides = conv_params['stride']
            padding = conv_params['padding']

            # Keep text_len in sync with the length reduction of each layer.
            if padding == "VALID":
                text_len = (text_len - kernel_size[0] +
                            strides[0]) // strides[0]
            else:
                text_len = (text_len + strides[0] - 1) // strides[0]

            top_layer = conv_bn_actv(
                layer_type="conv1d",
                name="conv{}".format(i + 1),
                inputs=top_layer,
                filters=ch_out,
                kernel_size=kernel_size,
                activation_fn=self.params['activation_fn'],
                strides=strides,
                padding=padding,
                regularizer=regularizer,
                training=training,
                data_format=data_format,
                bn_momentum=self.params.get('bn_momentum', 0.1),
                bn_epsilon=self.params.get('bn_epsilon', 1e-5),
            )
            top_layer = tf.layers.dropout(top_layer,
                                          rate=self.params["cnn_dropout_prob"],
                                          training=training)

        if data_format == 'channels_first':
            top_layer = tf.transpose(top_layer, [0, 2, 1])

        # ----- RNN ---------------------------------------------------------------
        num_rnn_layers = self.params['num_rnn_layers']
        if num_rnn_layers > 0:
            cell_params = {}
            cell_params["num_units"] = self.params['rnn_cell_dim']
            rnn_type = self.params['rnn_type']
            rnn_input = top_layer
            rnn_vars = []

            if self.params["use_cudnn_rnn"]:
                if self._mode == "infer":
                    # NOTE(review): this path always builds exactly one
                    # bidirectional layer of CudnnCompatibleLSTMCell; it does
                    # not read num_rnn_layers, rnn_type or rnn_unidirectional.
                    # Confirm this matches the trained cudnn graph.
                    cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
                        cell_params["num_units"])
                    cells_fw = [cell() for _ in range(1)]
                    cells_bw = [cell() for _ in range(1)]
                    (top_layer, _,
                     _) = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                         cells_fw,
                         cells_bw,
                         rnn_input,
                         sequence_length=text_len,
                         dtype=rnn_input.dtype,
                         time_major=False)
                else:
                    all_cudnn_classes = [
                        member[1] for member in inspect.getmembers(
                            tf.contrib.cudnn_rnn, inspect.isclass)
                    ]
                    if rnn_type not in all_cudnn_classes:
                        raise TypeError("rnn_type must be a Cudnn RNN class")
                    if zoneout_prob != 0.:
                        raise ValueError(
                            "Zoneout is currently not supported for cudnn rnn classes"
                        )

                    # The cudnn block is fed with axes 0 and 1 swapped; the
                    # swap is undone on its output below.
                    rnn_input = tf.transpose(top_layer, [1, 0, 2])
                    if self.params['rnn_unidirectional']:
                        direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
                    else:
                        direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION

                    rnn_block = rnn_type(num_layers=num_rnn_layers,
                                         num_units=cell_params["num_units"],
                                         direction=direction,
                                         dtype=rnn_input.dtype,
                                         name="cudnn_rnn")
                    rnn_block.build(rnn_input.get_shape())
                    top_layer, _ = rnn_block(rnn_input)
                    top_layer = tf.transpose(top_layer, [1, 0, 2])
                    rnn_vars += rnn_block.trainable_variables

            else:
                multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell([
                    single_cell(cell_class=rnn_type,
                                cell_params=cell_params,
                                zoneout_prob=zoneout_prob,
                                training=training,
                                residual_connections=False)
                    for _ in range(num_rnn_layers)
                ])
                if self.params['rnn_unidirectional']:
                    top_layer, _ = tf.nn.dynamic_rnn(
                        cell=multirnn_cell_fw,
                        inputs=rnn_input,
                        sequence_length=text_len,
                        dtype=rnn_input.dtype,
                        time_major=False,
                    )
                    # Cell weights are created when the cell is first called,
                    # so they are collected only after the RNN has been
                    # built; reading trainable_variables earlier yields an
                    # empty list and the forward weights would silently
                    # escape regularization.
                    rnn_vars += multirnn_cell_fw.trainable_variables
                else:
                    multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell([
                        single_cell(cell_class=rnn_type,
                                    cell_params=cell_params,
                                    zoneout_prob=zoneout_prob,
                                    training=training,
                                    residual_connections=False)
                        for _ in range(num_rnn_layers)
                    ])
                    top_layer, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=multirnn_cell_fw,
                        cell_bw=multirnn_cell_bw,
                        inputs=rnn_input,
                        sequence_length=text_len,
                        dtype=rnn_input.dtype,
                        time_major=False)
                    # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
                    top_layer = tf.concat(top_layer, 2)
                    # Collected after the RNN call so the weights exist.
                    rnn_vars += multirnn_cell_fw.trainable_variables
                    rnn_vars += multirnn_cell_bw.trainable_variables

            if regularizer and training:
                cell_weights = []
                cell_weights += rnn_vars
                cell_weights += [enc_emb_w]
                for weights in cell_weights:
                    # Biases are excluded from weight regularization.
                    if "bias" not in weights.name:
                        if weights.dtype.base_dtype == tf.float16:
                            # float16 weights are registered as
                            # (variable, fn) pairs in a separate collection
                            # instead of being evaluated here.
                            tf.add_to_collection('REGULARIZATION_FUNCTIONS',
                                                 (weights, regularizer))
                        else:
                            tf.add_to_collection(
                                ops.GraphKeys.REGULARIZATION_LOSSES,
                                regularizer(weights))

        # -- end of rnn------------------------------------------------------------

        top_layer = tf.layers.dropout(top_layer,
                                      rate=self.params["rnn_dropout_prob"],
                                      training=training)
        outputs = top_layer
        if self.params.get("style_embedding_enable", False):
            outputs = tf.concat([outputs, style_embedding], axis=-1)

        return {'outputs': outputs, 'src_length': text_len}
Exemple #5
0
    def _decode(self, input_dict):
        """
    Decodes representation into data

    Args:
      input_dict (dict): Python dictionary with inputs to decoder. Must define:
          * src_inputs - decoder input Tensor of shape [batch_size, time, dim]
            or [time, batch_size, dim]
          * src_lengths - decoder input lengths Tensor of shape [batch_size]
          * tgt_inputs - Only during training. labels Tensor of the
            shape [batch_size, time, num_features] or
            [time, batch_size, num_features]
          * stop_token_inputs - Only during training. labels Tensor of the
            shape [batch_size, time, 1] or [time, batch_size, 1]
          * tgt_lengths - Only during training. labels lengths
            Tensor of the shape [batch_size]

    Returns:
      dict:
        A python dictionary containing:

          * outputs - array containing:

              * decoder_output - tensor of shape [batch_size, time,
                num_features] or [time, batch_size, num_features]. Spectrogram
                representation learned by the decoder rnn
              * spectrogram_prediction - tensor of shape [batch_size, time,
                num_features] or [time, batch_size, num_features]. Spectrogram
                containing the residual corrections from the postnet if enabled
              * alignments - tensor of shape [batch_size, time, memory_size]
                or [time, batch_size, memory_size]. The alignments learned by
                the attention layer
              * stop_token_prediction - tensor of shape [batch_size, time, 1]
                or [time, batch_size, 1]. The stop token predictions
              * final_sequence_lengths - tensor of shape [batch_size]
          * stop_token_predictions - tensor of shape [batch_size, time, 1]
            or [time, batch_size, 1]. The stop token predictions for use inside
            the loss function.
    """
        encoder_outputs = input_dict['encoder_output']['outputs']
        enc_src_lengths = input_dict['encoder_output']['src_length']
        if self._mode == "train" or (self._mode == "infer"
                                     and self._gta_forcing == True):
            spec = input_dict['target_tensors'][0]
            spec_length = input_dict['target_tensors'][2]
        else:
            spec = None
            spec_length = None

        _batch_size = encoder_outputs.get_shape().as_list()[0]

        training = (self._mode == "train")
        regularizer = self.params.get('regularizer', None)

        if self.params.get('enable_postnet', True):
            if "postnet_conv_layers" not in self.params:
                raise ValueError(
                    "postnet_conv_layers must be passed from config file if postnet is"
                    "enabled")

        if self._both:
            num_audio_features = self._n_feats["mel"]
            if self._mode == "train":
                spec, _ = tf.split(
                    spec, [self._n_feats['mel'], self._n_feats['magnitude']],
                    axis=2)
        else:
            num_audio_features = self._n_feats

        output_projection_layer = tf.layers.Dense(
            name="output_proj",
            units=num_audio_features,
            use_bias=True,
        )
        stop_token_projection_layer = tf.layers.Dense(
            name="stop_token_proj",
            units=1,
            use_bias=True,
        )

        prenet = None
        if self.params.get('enable_prenet', True):
            prenet = Prenet(self.params.get('prenet_units', 256),
                            self.params.get('prenet_layers', 2),
                            self.params.get("prenet_activation", tf.nn.relu),
                            self.params["dtype"])

        cell_params = {}
        cell_params["num_units"] = self.params['decoder_cell_units']
        decoder_cells = [
            single_cell(
                cell_class=self.params['decoder_cell_type'],
                cell_params=cell_params,
                zoneout_prob=self.params.get("zoneout_prob", 0.),
                dp_output_keep_prob=1. - self.params.get("dropout_prob", 0.1),
                training=training,
            ) for _ in range(self.params['decoder_layers'])
        ]

        if self.params['attention_type'] is not None:
            attention_mechanism = self._build_attention(
                encoder_outputs, enc_src_lengths,
                self.params.get("attention_bias", False))

            attention_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

            attentive_cell = AttentionWrapper(
                cell=attention_cell,
                attention_mechanism=attention_mechanism,
                alignment_history=True,
                output_attention="both",
            )

            decoder_cell = attentive_cell

        if self.params['attention_type'] is None:
            decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

        if self._mode == "train":
            train_and_not_sampling = True
            helper = TacotronTrainingHelper(
                inputs=spec,
                sequence_length=spec_length,
                prenet=None,
                model_dtype=self.params["dtype"],
                mask_decoder_sequence=self.params.get("mask_decoder_sequence",
                                                      True))
        elif self._mode == "eval" or self._mode == "infer":
            train_and_not_sampling = False
            inputs = tf.zeros((_batch_size, 1, num_audio_features),
                              dtype=self.params["dtype"])
            helper = TacotronHelper(
                inputs=inputs,
                prenet=None,
                mask_decoder_sequence=self.params.get("mask_decoder_sequence",
                                                      True),
                gta_mels=spec,
                gta_mel_lengths=spec_length,
            )
        else:
            raise ValueError("Unknown mode for decoder: {}".format(self._mode))
        decoder = TacotronDecoder(
            decoder_cell=decoder_cell,
            helper=helper,
            initial_decoder_state=decoder_cell.zero_state(
                _batch_size, self.params["dtype"]),
            attention_type=self.params["attention_type"],
            spec_layer=output_projection_layer,
            stop_token_layer=stop_token_projection_layer,
            prenet=prenet,
            dtype=self.params["dtype"],
            train=train_and_not_sampling)

        if self._mode == 'train':
            maximum_iterations = tf.reduce_max(spec_length)
        else:
            maximum_iterations = tf.reduce_max(enc_src_lengths) * 10

        outputs, final_state, sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            # outputs, final_state, sequence_lengths, final_inputs = dynamic_decode(
            decoder=decoder,
            impute_finished=False,
            maximum_iterations=maximum_iterations,
            swap_memory=self.params.get("use_swap_memory", False),
            output_time_major=self.params.get("time_major", False),
            parallel_iterations=self.params.get("parallel_iterations", 32))

        decoder_output = outputs.rnn_output
        stop_token_logits = outputs.stop_token_output

        with tf.variable_scope("decoder"):
            # If we are in train and doing sampling, we need to do the projections
            if train_and_not_sampling:
                decoder_spec_output = output_projection_layer(decoder_output)
                stop_token_logits = stop_token_projection_layer(
                    decoder_spec_output)
                decoder_output = decoder_spec_output

        ## Add the post net ##
        if self.params.get('enable_postnet', True):
            dropout_keep_prob = self.params.get('postnet_keep_dropout_prob',
                                                0.5)

            top_layer = decoder_output
            for i, conv_params in enumerate(
                    self.params['postnet_conv_layers']):
                ch_out = conv_params['num_channels']
                kernel_size = conv_params['kernel_size']  # [time, freq]
                strides = conv_params['stride']
                padding = conv_params['padding']
                activation_fn = conv_params['activation_fn']

                if ch_out == -1:
                    if self._both:
                        ch_out = self._n_feats["mel"]
                    else:
                        ch_out = self._n_feats

                top_layer = conv_bn_actv(
                    layer_type="conv1d",
                    name="conv{}".format(i + 1),
                    inputs=top_layer,
                    filters=ch_out,
                    kernel_size=kernel_size,
                    activation_fn=activation_fn,
                    strides=strides,
                    padding=padding,
                    regularizer=regularizer,
                    training=training,
                    data_format=self.params.get('postnet_data_format',
                                                'channels_last'),
                    bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
                    bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
                )
                top_layer = tf.layers.dropout(top_layer,
                                              rate=1. - dropout_keep_prob,
                                              training=training)

        else:
            top_layer = tf.zeros([
                _batch_size, maximum_iterations,
                outputs.rnn_output.get_shape()[-1]
            ],
                                 dtype=self.params["dtype"])

        if regularizer and training:
            vars_to_regularize = []
            vars_to_regularize += attentive_cell.trainable_variables
            if (attention_mechanism.memory_layer is not None):
                vars_to_regularize += attention_mechanism.memory_layer.trainable_variables
            vars_to_regularize += output_projection_layer.trainable_variables
            vars_to_regularize += stop_token_projection_layer.trainable_variables

            for weights in vars_to_regularize:
                if "bias" not in weights.name:
                    # print("Added regularizer to {}".format(weights.name))
                    if weights.dtype.base_dtype == tf.float16:
                        tf.add_to_collection('REGULARIZATION_FUNCTIONS',
                                             (weights, regularizer))
                    else:
                        tf.add_to_collection(
                            ops.GraphKeys.REGULARIZATION_LOSSES,
                            regularizer(weights))

            if self.params.get('enable_prenet', True):
                prenet.add_regularization(regularizer)

        if self.params['attention_type'] is not None:
            alignments = tf.transpose(final_state.alignment_history.stack(),
                                      [1, 0, 2])
        else:
            alignments = tf.zeros([_batch_size, _batch_size, _batch_size])

        spectrogram_prediction = decoder_output + top_layer
        if self._both:
            mag_spec_prediction = spectrogram_prediction
            mag_spec_prediction = conv_bn_actv(
                layer_type="conv1d",
                name="conv_0",
                inputs=mag_spec_prediction,
                filters=256,
                kernel_size=4,
                activation_fn=tf.nn.relu,
                strides=1,
                padding="SAME",
                regularizer=regularizer,
                training=training,
                data_format=self.params.get('postnet_data_format',
                                            'channels_last'),
                bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
                bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
            )
            mag_spec_prediction = conv_bn_actv(
                layer_type="conv1d",
                name="conv_1",
                inputs=mag_spec_prediction,
                filters=512,
                kernel_size=4,
                activation_fn=tf.nn.relu,
                strides=1,
                padding="SAME",
                regularizer=regularizer,
                training=training,
                data_format=self.params.get('postnet_data_format',
                                            'channels_last'),
                bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
                bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
            )
            if self._model.get_data_layer()._exp_mag:
                mag_spec_prediction = tf.exp(mag_spec_prediction)
            mag_spec_prediction = tf.layers.conv1d(
                mag_spec_prediction,
                self._n_feats["magnitude"],
                1,
                name="post_net_proj",
                use_bias=False,
            )
        else:
            mag_spec_prediction = tf.zeros(
                [_batch_size, _batch_size, _batch_size])

        stop_token_prediction = tf.sigmoid(stop_token_logits)
        outputs = [
            decoder_output, spectrogram_prediction, alignments,
            stop_token_prediction, sequence_lengths, mag_spec_prediction
        ]

        return {
            'outputs': outputs,
            'stop_token_prediction': stop_token_logits,
        }
Exemple #6
0
  def _encode(self, input_dict):
    """Creates TensorFlow graph for DeepSpeech-2 like encoder.

    The graph is assembled in this order: a stack of 2D convolutions, an
    optional recurrent stack (cuDNN kernels or regular TF cells), an optional
    row convolution and, finally, a fully connected layer followed by dropout.

    Config parameters read from ``self.params``:

    * **dropout_keep_prob** (float) --- keep probability; only used when the
      mode is "train", otherwise 1.0 is used.
    * **regularizer** --- optional regularizer passed to the conv, row-conv
      and fully connected layers. Defaults to None.
    * **data_format** (str) --- 'channels_last' (default) or 'channels_first'.
    * **bn_momentum** (float) --- batch norm momentum. Defaults to 0.99.
    * **bn_epsilon** (float) --- batch norm epsilon. Defaults to 1e-3.
    * **conv_layers** (list) --- one dict per convolutional layer with the
      keys ``num_channels``, ``kernel_size`` ([time, freq]), ``stride`` and
      ``padding``.
    * **activation_fn** --- activation used by the conv layers, the row
      convolution and the fully connected layer.
    * **num_rnn_layers** (int) --- number of rnn layers; the recurrent stack
      is skipped when it is 0.
    * **rnn_cell_dim** (int) --- size of each rnn layer.
    * **rnn_type** (str) --- rnn cell type. With ``use_cudnn_rnn`` it must be
      one of "cudnn_gru", "gru", "cudnn_lstm" or "lstm".
    * **use_cudnn_rnn** (bool) --- whether to use the cuDNN rnn blocks.
    * **rnn_unidirectional** (bool) --- uni- or bi-directional rnn.
    * **row_conv** (bool) --- whether to add a row convolution.
    * **row_conv_width** (int) --- width passed to the row convolution.
    * **n_hidden** (int) --- number of units of the fully connected layer.

    Args:
      input_dict (dict): input dictionary that has to contain
          the following fields::
            input_dict = {
              "source_tensors": [
                src_sequence (shape=[batch_size, sequence length, num features]),
                src_length (shape=[batch_size])
              ]
            }

    Returns:
      dict: dictionary with the following tensors::

        {
          'outputs': hidden state, shape=[batch_size, sequence length, n_hidden]
          'src_length': tensor, shape=[batch_size]
        }

    Raises:
      ValueError: if ``use_cudnn_rnn`` is set and ``rnn_type`` is not a
          supported cuDNN rnn type.
    """

    source_sequence, src_length = input_dict['source_tensors']

    training = (self._mode == "train")
    dropout_keep_prob = self.params['dropout_keep_prob'] if training else 1.0
    regularizer = self.params.get('regularizer', None)
    data_format = self.params.get('data_format', 'channels_last')
    bn_momentum = self.params.get('bn_momentum', 0.99)
    bn_epsilon = self.params.get('bn_epsilon', 1e-3)

    # add a trailing channel axis so the 2D convolutions can be applied
    input_layer = tf.expand_dims(source_sequence, axis=-1)
    # static batch size; it is reused by the reshapes and the row conv below
    batch_size = input_layer.get_shape().as_list()[0]
    if data_format == 'channels_last':
      top_layer = input_layer
    else:
      top_layer = tf.transpose(input_layer, [0, 3, 1, 2])

    # ----- Convolutional layers ---------------------------------------------
    for idx_conv, conv_params in enumerate(self.params['conv_layers']):
      ch_out = conv_params['num_channels']
      kernel_size = conv_params['kernel_size']  # [time, freq]
      strides = conv_params['stride']
      padding = conv_params['padding']

      # keep src_length in sync with the length of the time axis after the
      # convolution, so that the rnn below gets the right sequence lengths
      if padding == "VALID":
        src_length = (src_length - kernel_size[0] + strides[0]) // strides[0]
      else:
        src_length = (src_length + strides[0] - 1) // strides[0]

      # the keyword is `layer_type`, as at the other conv_bn_actv call sites
      # in this file; `type` does not match it (and shadows the builtin)
      top_layer = conv_bn_actv(
          layer_type="conv2d",
          name="conv{}".format(idx_conv + 1),
          inputs=top_layer,
          filters=ch_out,
          kernel_size=kernel_size,
          activation_fn=self.params['activation_fn'],
          strides=strides,
          padding=padding,
          regularizer=regularizer,
          training=training,
          data_format=data_format,
          bn_momentum=bn_momentum,
          bn_epsilon=bn_epsilon,
      )
    if data_format == 'channels_first':
      # back to channels_last so the reshape below sees [B, T, F, C]
      top_layer = tf.transpose(top_layer, [0, 2, 3, 1])

    # reshape to [B, T, FxC]
    f = top_layer.get_shape().as_list()[2]
    c = top_layer.get_shape().as_list()[3]
    fc = f * c
    top_layer = tf.reshape(top_layer, [batch_size, -1, fc])

    # ----- RNN ---------------------------------------------------------------
    num_rnn_layers = self.params['num_rnn_layers']
    if num_rnn_layers > 0:
      rnn_cell_dim = self.params['rnn_cell_dim']
      rnn_type = self.params['rnn_type']
      if self.params['use_cudnn_rnn']:
        # reshape to [B, T, C] --> [T, B, C]
        rnn_input = tf.transpose(top_layer, [1, 0, 2])
        if self.params['rnn_unidirectional']:
          direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
        else:
          direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION

        if rnn_type == "cudnn_gru" or rnn_type == "gru":
          rnn_block = tf.contrib.cudnn_rnn.CudnnGRU(
              num_layers=num_rnn_layers,
              num_units=rnn_cell_dim,
              direction=direction,
              dropout=1.0 - dropout_keep_prob,
              dtype=rnn_input.dtype,
              name="cudnn_gru",
          )
        elif rnn_type == "cudnn_lstm" or rnn_type == "lstm":
          rnn_block = tf.contrib.cudnn_rnn.CudnnLSTM(
              num_layers=num_rnn_layers,
              num_units=rnn_cell_dim,
              direction=direction,
              dropout=1.0 - dropout_keep_prob,
              dtype=rnn_input.dtype,
              name="cudnn_lstm",
          )
        else:
          raise ValueError(
              "{} is not a valid rnn_type for cudnn_rnn layers".format(
                  rnn_type)
          )
        # the final rnn state is not needed by the encoder
        top_layer, _ = rnn_block(rnn_input)
        # [T, B, C] --> [B, T, C]
        top_layer = tf.transpose(top_layer, [1, 0, 2])
      else:
        rnn_input = top_layer
        multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
            [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
                      dropout_keep_prob=dropout_keep_prob)
             for _ in range(num_rnn_layers)]
        )
        if self.params['rnn_unidirectional']:
          top_layer, _ = tf.nn.dynamic_rnn(
              cell=multirnn_cell_fw,
              inputs=rnn_input,
              sequence_length=src_length,
              dtype=rnn_input.dtype,
              time_major=False,
          )
        else:
          multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell(
              [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
                        dropout_keep_prob=dropout_keep_prob)
               for _ in range(num_rnn_layers)]
          )
          top_layer, _ = tf.nn.bidirectional_dynamic_rnn(
              cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw,
              inputs=rnn_input,
              sequence_length=src_length,
              dtype=rnn_input.dtype,
              time_major=False
          )
          # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
          top_layer = tf.concat(top_layer, 2)
    # -- end of rnn------------------------------------------------------------

    if self.params['row_conv']:
      channels = top_layer.get_shape().as_list()[-1]
      top_layer = row_conv(
          name="row_conv",
          input_layer=top_layer,
          batch=batch_size,
          channels=channels,
          activation_fn=self.params['activation_fn'],
          width=self.params['row_conv_width'],
          regularizer=regularizer,
          training=training,
          data_format=data_format,
          bn_momentum=bn_momentum,
          bn_epsilon=bn_epsilon,
      )

    # Reshape [B, T, C] --> [B*T, C]
    c = top_layer.get_shape().as_list()[-1]
    top_layer = tf.reshape(top_layer, [-1, c])

    # --- hidden layer with clipped ReLU activation and dropout---------------
    top_layer = tf.layers.dense(
        inputs=top_layer,
        units=self.params['n_hidden'],
        kernel_regularizer=regularizer,
        activation=self.params['activation_fn'],
        name='fully_connected',
    )
    outputs = tf.nn.dropout(x=top_layer, keep_prob=dropout_keep_prob)

    # reshape from  [B*T,A] --> [B, T, A].
    # Output shape: [batch_size, n_steps, n_hidden]
    outputs = tf.reshape(
        outputs,
        [batch_size, -1, self.params['n_hidden']],
    )

    return {
        'outputs': outputs,
        'src_length': src_length,
    }