Code Example #1
  def _embed_style(self, style_spec, style_len):
    """
    Implements the reference encoder described in "Towards end-to-end prosody
    transfer for expressive speech synthesis with Tacotron" and "Style Tokens:
    Unsupervised Style Modeling, Control and Transfer in End-to-End Speech
    Synthesis".

    Config parameters:

    * **conv_layers** (list) --- See the conv_layers parameter for the
      Tacotron-2 model.
    * **num_rnn_layers** (int) --- Number of rnn layers in the reference encoder
    * **rnn_cell_dim** (int) --- Size of rnn layer
    * **rnn_unidirectional** (bool) --- Uni- or bi-directional rnn.
    * **rnn_type** --- Must be a valid tf rnn cell class
    * **emb_size** (int) --- Size of gst
    * **attention_layer_size** (int) --- Size of linear layers in attention
    * **num_tokens** (int) --- Number of tokens for gst
    * **num_heads** (int) --- Number of attention heads
    """
    batch_size = style_spec.get_shape().as_list()[0]
    
    training = (self._mode == "train")
    regularizer = self.params.get('regularizer', None)
    data_format = self.params.get('data_format', 'channels_last')

    top_layer = tf.expand_dims(style_spec, -1)
    params = self.params['style_embedding_params']
    if "conv_layers" in params:
      if data_format == 'channels_first':
        # conv2d with channels_first expects NCHW input
        top_layer = tf.transpose(top_layer, [0, 3, 1, 2])
      for i, conv_params in enumerate(params['conv_layers']):
        ch_out = conv_params['num_channels']
        kernel_size = conv_params['kernel_size']  # [time, freq]
        strides = conv_params['stride']
        padding = conv_params['padding']

        if padding == "VALID":
          style_len = (style_len - kernel_size[0] + strides[0]) // strides[0]
        else:
          style_len = (style_len + strides[0] - 1) // strides[0]

        top_layer = conv_bn_actv(
            layer_type="conv2d",
            name="conv{}".format(i + 1),
            inputs=top_layer,
            filters=ch_out,
            kernel_size=kernel_size,
            activation_fn=self.params['activation_fn'],
            strides=strides,
            padding=padding,
            regularizer=regularizer,
            training=training,
            data_format=data_format,
            bn_momentum=self.params.get('bn_momentum', 0.1),
            bn_epsilon=self.params.get('bn_epsilon', 1e-5),
        )

      if data_format == 'channels_first':
        # convert the 4-D conv output back from NCHW to NHWC
        top_layer = tf.transpose(top_layer, [0, 2, 3, 1])

    top_layer = tf.concat(tf.unstack(top_layer, axis=2), axis=-1)
    top_layer = tf.layers.dropout(
        top_layer, rate=self.params["cnn_dropout_prob"], training=training
    )

    num_rnn_layers = params['num_rnn_layers']
    if num_rnn_layers > 0:
      cell_params = {}
      cell_params["num_units"] = params['rnn_cell_dim']
      rnn_type = params['rnn_type']
      rnn_input = top_layer
      rnn_vars = []

      multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
          [
              single_cell(
                  cell_class=rnn_type,
                  cell_params=cell_params,
                  training=training,
                  residual_connections=False
              ) for _ in range(num_rnn_layers)
          ]
      )
      rnn_vars += multirnn_cell_fw.trainable_variables
      if params['rnn_unidirectional']:
        top_layer, final_state = tf.nn.dynamic_rnn(
            cell=multirnn_cell_fw,
            inputs=rnn_input,
            sequence_length=style_len,
            dtype=rnn_input.dtype,
            time_major=False,
        )
        final_state = final_state[0]
      else:
        multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell(
            [
                single_cell(
                    cell_class=rnn_type,
                    cell_params=cell_params,
                    training=training,
                    residual_connections=False
                ) for _ in range(num_rnn_layers)
            ]
        )
        top_layer, final_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=multirnn_cell_fw,
            cell_bw=multirnn_cell_bw,
            inputs=rnn_input,
            sequence_length=style_len,
            dtype=rnn_input.dtype,
            time_major=False
        )
        # concat forward and backward final states: [B, n_cell_dim] --> [B, 2*n_cell_dim]
        final_state = tf.concat((final_state[0][0], final_state[1][0]), 1)
        rnn_vars += multirnn_cell_bw.trainable_variables

      top_layer = final_state
      # Apply linear layer
      top_layer = tf.layers.dense(
          top_layer,
          256,
          activation=tf.nn.tanh,
          kernel_regularizer=regularizer,
          name="reference_activation"
      )

      if regularizer and training:
        cell_weights = rnn_vars
        for weights in cell_weights:
          if "bias" not in weights.name:
            # print("Added regularizer to {}".format(weights.name))
            if weights.dtype.base_dtype == tf.float16:
              tf.add_to_collection(
                  'REGULARIZATION_FUNCTIONS', (weights, regularizer)
              )
            else:
              tf.add_to_collection(
                  ops.GraphKeys.REGULARIZATION_LOSSES, regularizer(weights)
              )

    return top_layer
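
The docstring above lists the config keys the reference encoder reads. The following is a minimal sketch of what a `style_embedding_params` block could look like; every value (kernel sizes, channel counts, cell type, token counts) is an illustrative assumption, not a default taken from the code.

import tensorflow as tf

# Hypothetical style_embedding_params block matching the documented keys.
style_embedding_params = {
    "conv_layers": [
        # kernel_size is [time, freq] over the reference spectrogram
        {"kernel_size": [3, 3], "stride": [2, 2],
         "num_channels": 32, "padding": "SAME"},
        {"kernel_size": [3, 3], "stride": [2, 2],
         "num_channels": 64, "padding": "SAME"},
    ],
    "num_rnn_layers": 1,                    # reference-encoder RNN depth
    "rnn_cell_dim": 128,                    # units per RNN layer
    "rnn_unidirectional": True,             # forward-only RNN
    "rnn_type": tf.nn.rnn_cell.GRUCell,     # any valid tf rnn cell class
    "emb_size": 512,                        # size of the style (GST) embedding
    "attention_layer_size": 128,            # linear layer size in attention
    "num_tokens": 10,                       # number of global style tokens
    "num_heads": 4,                         # attention heads
}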
Code Example #2
    def _encode(self, input_dict):
        """Encodes data into representation.

    Args:
      input_dict: a Python dictionary.
        Must define:
          * src_inputs - a Tensor of shape [batch_size, time] or
                         [time, batch_size]
                         (depending on time_major param)
          * src_lengths - a Tensor of shape [batch_size]

    Returns:
       a Python dictionary with:
      * encoder_outputs - a Tensor of shape
                          [batch_size, time, representation_dim]
      or [time, batch_size, representation_dim]
      * encoder_state - a Tensor of shape [batch_size, dim]
      * src_lengths - (copy ref from input) a Tensor of shape [batch_size]
    """
        # TODO: make a separate level of config for cell_params?
        source_sequence = input_dict['source_tensors'][0]
        source_length = input_dict['source_tensors'][1]

        self._enc_emb_w = tf.get_variable(
            name="EncoderEmbeddingMatrix",
            shape=[self._src_vocab_size, self._src_emb_size],
            dtype=tf.float32,
        )

        if self._mode == "train":
            dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
            dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
        else:
            dp_input_keep_prob = 1.0
            dp_output_keep_prob = 1.0

        fwd_cells = [
            single_cell(cell_class=self.params['core_cell'],
                        cell_params=self.params.get('core_cell_params', {}),
                        dp_input_keep_prob=dp_input_keep_prob,
                        dp_output_keep_prob=dp_output_keep_prob,
                        residual_connections=self.
                        params['encoder_use_skip_connections'])
            for _ in range(self.params['encoder_layers'])
        ]
        # pylint: disable=no-member
        self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)

        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)

        embedded_inputs = tf.cast(
            tf.nn.embedding_lookup(
                self.enc_emb_w,
                source_sequence,
            ),
            self.params['dtype'],
        )

        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            cell=self._encoder_cell_fw,
            inputs=embedded_inputs,
            sequence_length=source_length,
            time_major=time_major,
            swap_memory=use_swap_memory,
            dtype=embedded_inputs.dtype,
        )
        return {
            'outputs': encoder_outputs,
            'state': encoder_state,
            'src_lengths': source_length,
            'encoder_input': source_sequence
        }
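
As a rough usage sketch, the keys this encoder reads from `self.params` could be configured as below. The cell class and sizes are assumptions chosen for illustration, not values taken from the example.

import tensorflow as tf

# Hypothetical encoder params matching the keys read in _encode above.
encoder_params = {
    "core_cell": tf.nn.rnn_cell.LSTMCell,    # passed through single_cell()
    "core_cell_params": {"num_units": 512},
    "encoder_layers": 2,                     # depth of the forward MultiRNNCell
    "encoder_dp_input_keep_prob": 0.8,       # dropout only applies in train mode
    "encoder_dp_output_keep_prob": 0.8,
    "encoder_use_skip_connections": False,   # residual_connections flag
    "time_major": False,
    "use_swap_memory": False,
    "dtype": tf.float32,
}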
Code Example #3
  def _encode(self, input_dict):
    """Creates TensorFlow graph for Tacotron-2 like encoder.

    Args:
       input_dict (dict): dictionary with inputs.
        Must define:

            source_tensors - array containing [

              * source_sequence: tensor of shape [batch_size, sequence length]
              * src_length: tensor of shape [batch_size]

            ]

    Returns:
      dict: A python dictionary containing:

          * outputs - tensor containing the encoded text to be passed to the
            attention layer
          * src_length - the length of the encoded text
    """

    text = input_dict['source_tensors'][0]
    text_len = input_dict['source_tensors'][1]
    mel = input_dict['source_tensors'][2]
    mel_length = input_dict['source_tensors'][3]
    words_per_frame = input_dict['source_tensors'][4]
    chars_per_frame = input_dict['source_tensors'][5]

    training = (self._mode == "train")
    regularizer = self.params.get('regularizer', None)
    data_format = self.params.get('data_format', 'channels_last')
    src_vocab_size = self._model.get_data_layer().params['src_vocab_size']
    zoneout_prob = self.params.get('zoneout_prob', 0.)

    # if src_vocab_size % 8 != 0:
    #   src_vocab_size += 8 - (src_vocab_size % 8)

    # ----- Embedding layer -----------------------------------------------
    enc_emb_w = tf.get_variable(
        name="EncoderEmbeddingMatrix",
        shape=[src_vocab_size, self.params['src_emb_size']],
        dtype=self.params['dtype'],
        # initializer=tf.random_normal_initializer()
    )

    embedded_inputs = tf.cast(
        tf.nn.embedding_lookup(
            enc_emb_w,
            text,
        ), self.params['dtype']
    )

    # ----- Convolutional layers -----------------------------------------------
    input_layer = embedded_inputs

    if data_format == 'channels_last':
      top_layer = input_layer
    else:
      top_layer = tf.transpose(input_layer, [0, 2, 1])

    for i, conv_params in enumerate(self.params['conv_layers']):
      ch_out = conv_params['num_channels']
      kernel_size = conv_params['kernel_size']  # [time, freq]
      strides = conv_params['stride']
      padding = conv_params['padding']

      if padding == "VALID":
        text_len = (text_len - kernel_size[0] + strides[0]) // strides[0]
      else:
        text_len = (text_len + strides[0] - 1) // strides[0]

      top_layer = conv_bn_actv(
          layer_type="conv1d",
          name="conv{}".format(i + 1),
          inputs=top_layer,
          filters=ch_out,
          kernel_size=kernel_size,
          activation_fn=self.params['activation_fn'],
          strides=strides,
          padding=padding,
          regularizer=regularizer,
          training=training,
          data_format=data_format,
          bn_momentum=self.params.get('bn_momentum', 0.1),
          bn_epsilon=self.params.get('bn_epsilon', 1e-5),
      )
      top_layer = tf.layers.dropout(
          top_layer, rate=self.params["cnn_dropout_prob"], training=training
      )

    if data_format == 'channels_first':
      top_layer = tf.transpose(top_layer, [0, 2, 1])

    # ----- RNN ---------------------------------------------------------------
    num_rnn_layers = self.params['num_rnn_layers']
    cell_params = {}
    cell_params["num_units"] = self.params['rnn_cell_dim']
    rnn_type = self.params['rnn_type']
    rnn_input = top_layer
    rnn_vars = []

    multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
        [
            single_cell(
                cell_class=rnn_type,
                cell_params=cell_params,
                zoneout_prob=zoneout_prob,
                training=training,
                residual_connections=False
            ) for _ in range(num_rnn_layers)
        ]
    )
    rnn_vars += multirnn_cell_fw.trainable_variables

    if self.params['rnn_unidirectional']:
      top_layer, final_state = tf.nn.dynamic_rnn(
          cell=multirnn_cell_fw,
          inputs=rnn_input,
          sequence_length=text_len,
          dtype=rnn_input.dtype,
          time_major=False,
      )
      final_state = final_state[0]
    else:
      multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell(
          [
              single_cell(
                  cell_class=rnn_type,
                  cell_params=cell_params,
                  zoneout_prob=zoneout_prob,
                  training=training,
                  residual_connections=False
              ) for _ in range(num_rnn_layers)
          ]
      )
      top_layer, final_state = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=multirnn_cell_fw,
          cell_bw=multirnn_cell_bw,
          inputs=rnn_input,
          sequence_length=text_len,
          dtype=rnn_input.dtype,
          time_major=False
      )
      # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
      # final_state = tf.concat((final_state[0][0], final_state[1][0]), 1)
      top_layer = tf.concat(top_layer, 2)
      rnn_vars += multirnn_cell_bw.trainable_variables

    # -- end of rnn------------------------------------------------------------

    top_layer = tf.layers.dropout(
        top_layer, rate=self.params["rnn_dropout_prob"], training=training
    )
    with tf.variable_scope("style_encoder"):
      style_embedding = self._embed_style(mel, mel_length)
      style_embedding = tf.concat(
          [style_embedding, words_per_frame, chars_per_frame], axis=-1
      )

    style_embedding = tf.layers.dense(
          style_embedding,
          256,
          activation=tf.nn.tanh,
          kernel_regularizer=regularizer,
          name="extended_style_dense_layer"
      )

    style_embedding = tf.expand_dims(style_embedding, 1)
    style_embedding = tf.tile(
        style_embedding,
        [1, tf.reduce_max(text_len), 1]
    )

    outputs = tf.concat([top_layer, style_embedding], axis=-1)

    with tf.variable_scope("concatenated_encoder"):
      cell_params = {}
      cell_params["num_units"] = 512
      multirnn_cell_fw2 = tf.nn.rnn_cell.MultiRNNCell(
            [
                single_cell(
                    cell_class=rnn_type,
                    cell_params=cell_params,
                    training=training,
                    residual_connections=False
                ) for _ in range(num_rnn_layers)
            ]
        )
      rnn_vars += multirnn_cell_fw2.trainable_variables
      
      multirnn_cell_bw2 = tf.nn.rnn_cell.MultiRNNCell(
            [
                single_cell(
                    cell_class=rnn_type,
                    cell_params=cell_params,
                    zoneout_prob=zoneout_prob,
                    training=training,
                    residual_connections=False
                ) for _ in range(num_rnn_layers)
            ]
        )

      _, final_state = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=multirnn_cell_fw2,
          cell_bw=multirnn_cell_bw2,
          inputs=outputs,
          sequence_length=text_len,
          dtype=rnn_input.dtype,
          time_major=False
      )

      rnn_vars += multirnn_cell_bw2.trainable_variables
      final_state = tf.concat((final_state[0][0], final_state[1][0]), 1)

    if regularizer and training:
      cell_weights = []
      cell_weights += rnn_vars
      cell_weights += [enc_emb_w]
      for weights in cell_weights:
        if "bias" not in weights.name:
          # print("Added regularizer to {}".format(weights.name))
          if weights.dtype.base_dtype == tf.float16:
            tf.add_to_collection(
                'REGULARIZATION_FUNCTIONS', (weights, regularizer)
            )
          else:
            tf.add_to_collection(
                ops.GraphKeys.REGULARIZATION_LOSSES, regularizer(weights)
            )

    dense_outputs = tf.layers.dense(
            final_state,
            1024,
            activation=tf.nn.tanh,
            kernel_regularizer=regularizer,
            name="concatenated_encoder_activation1"
    )

    dense_outputs = tf.layers.dense(
            dense_outputs,
            512,
            activation=tf.nn.tanh,
            kernel_regularizer=regularizer,
            name="concatenated_encoder_activation2"
    )

    return {
        'outputs': dense_outputs,
        'src_length': text_len,
    }
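
The convolution loop above shrinks text_len in lockstep with the feature tensor so that the RNN masks the right number of frames. Below is a small standalone sketch of that length bookkeeping (plain Python; the kernel and stride values are hypothetical):

def conv_output_length(length, kernel_time, stride_time, padding):
    # Mirrors the text_len update inside the conv loop above.
    if padding == "VALID":
        return (length - kernel_time + stride_time) // stride_time
    return (length + stride_time - 1) // stride_time  # SAME padding

# Example: a length-100 sequence through kernel_size[0]=5, stride[0]=2.
assert conv_output_length(100, 5, 2, "VALID") == 48
assert conv_output_length(100, 5, 2, "SAME") == 50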
Code Example #4
    def _encode(self, input_dict):
        """Creates TensorFlow graph for Tacotron-2 like encoder.

    Args:
       input_dict (dict): dictionary with inputs.
        Must define:

            source_tensors - array containing [

              * source_sequence: tensor of shape [batch_size, sequence length]
              * src_length: tensor of shape [batch_size]

            ]

    Returns:
      dict: A python dictionary containing:

          * outputs - tensor containing the encoded text to be passed to the
            attention layer
          * src_length - the length of the encoded text
    """

        text = input_dict['source_tensors'][0]
        text_len = input_dict['source_tensors'][1]

        training = (self._mode == "train")
        regularizer = self.params.get('regularizer', None)
        data_format = self.params.get('data_format', 'channels_last')
        src_vocab_size = self._model.get_data_layer().params['src_vocab_size']
        zoneout_prob = self.params.get('zoneout_prob', 0.)

        # if src_vocab_size % 8 != 0:
        #   src_vocab_size += 8 - (src_vocab_size % 8)

        # ----- Embedding layer -----------------------------------------------
        enc_emb_w = tf.get_variable(
            name="EncoderEmbeddingMatrix",
            shape=[src_vocab_size, self.params['src_emb_size']],
            dtype=self.params['dtype'],
            # initializer=tf.random_normal_initializer()
        )

        embedded_inputs = tf.cast(tf.nn.embedding_lookup(
            enc_emb_w,
            text,
        ), self.params['dtype'])

        # ----- Style layer ---------------------------------------------------
        if self.params.get("style_embedding_enable", False):
            if "style_embedding_params" not in self.params:
                raise ValueError(
                    "style_embedding_params must be passed if style embedding "
                    "is enabled")
            with tf.variable_scope("style_encoder"):
                if (self._model.get_data_layer().params.get(
                        "style_input", None) == "wav"):
                    style_spec = input_dict['source_tensors'][2]
                    style_len = input_dict['source_tensors'][3]
                    style_embedding = self._embed_style(style_spec, style_len)
                else:
                    raise ValueError(
                        "The data layer's style input parameter must be set.")
                style_embedding = tf.expand_dims(style_embedding, 1)
                style_embedding = tf.tile(style_embedding,
                                          [1, tf.reduce_max(text_len), 1])

        # ----- Convolutional layers -----------------------------------------------
        input_layer = embedded_inputs

        if data_format == 'channels_last':
            top_layer = input_layer
        else:
            top_layer = tf.transpose(input_layer, [0, 2, 1])

        for i, conv_params in enumerate(self.params['conv_layers']):
            ch_out = conv_params['num_channels']
            kernel_size = conv_params['kernel_size']  # [time, freq]
            strides = conv_params['stride']
            padding = conv_params['padding']

            if padding == "VALID":
                text_len = (text_len - kernel_size[0] +
                            strides[0]) // strides[0]
            else:
                text_len = (text_len + strides[0] - 1) // strides[0]

            top_layer = conv_bn_actv(
                layer_type="conv1d",
                name="conv{}".format(i + 1),
                inputs=top_layer,
                filters=ch_out,
                kernel_size=kernel_size,
                activation_fn=self.params['activation_fn'],
                strides=strides,
                padding=padding,
                regularizer=regularizer,
                training=training,
                data_format=data_format,
                bn_momentum=self.params.get('bn_momentum', 0.1),
                bn_epsilon=self.params.get('bn_epsilon', 1e-5),
            )
            top_layer = tf.layers.dropout(top_layer,
                                          rate=self.params["cnn_dropout_prob"],
                                          training=training)

        if data_format == 'channels_first':
            top_layer = tf.transpose(top_layer, [0, 2, 1])

        # ----- RNN ---------------------------------------------------------------
        num_rnn_layers = self.params['num_rnn_layers']
        if num_rnn_layers > 0:
            cell_params = {}
            cell_params["num_units"] = self.params['rnn_cell_dim']
            rnn_type = self.params['rnn_type']
            rnn_input = top_layer
            rnn_vars = []

            if self.params["use_cudnn_rnn"]:
                if self._mode == "infer":
                    cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
                        cell_params["num_units"])
                    cells_fw = [cell() for _ in range(1)]
                    cells_bw = [cell() for _ in range(1)]
                    (top_layer, _,
                     _) = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                         cells_fw,
                         cells_bw,
                         rnn_input,
                         sequence_length=text_len,
                         dtype=rnn_input.dtype,
                         time_major=False)
                else:
                    all_cudnn_classes = [
                        i[1] for i in inspect.getmembers(
                            tf.contrib.cudnn_rnn, inspect.isclass)
                    ]
                    if not rnn_type in all_cudnn_classes:
                        raise TypeError("rnn_type must be a Cudnn RNN class")
                    if zoneout_prob != 0.:
                        raise ValueError(
                            "Zoneout is currently not supported for cudnn rnn classes"
                        )

                    rnn_input = tf.transpose(top_layer, [1, 0, 2])
                    if self.params['rnn_unidirectional']:
                        direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
                    else:
                        direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION

                    rnn_block = rnn_type(num_layers=num_rnn_layers,
                                         num_units=cell_params["num_units"],
                                         direction=direction,
                                         dtype=rnn_input.dtype,
                                         name="cudnn_rnn")
                    rnn_block.build(rnn_input.get_shape())
                    top_layer, _ = rnn_block(rnn_input)
                    top_layer = tf.transpose(top_layer, [1, 0, 2])
                    rnn_vars += rnn_block.trainable_variables

            else:
                multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell([
                    single_cell(cell_class=rnn_type,
                                cell_params=cell_params,
                                zoneout_prob=zoneout_prob,
                                training=training,
                                residual_connections=False)
                    for _ in range(num_rnn_layers)
                ])
                rnn_vars += multirnn_cell_fw.trainable_variables
                if self.params['rnn_unidirectional']:
                    top_layer, _ = tf.nn.dynamic_rnn(
                        cell=multirnn_cell_fw,
                        inputs=rnn_input,
                        sequence_length=text_len,
                        dtype=rnn_input.dtype,
                        time_major=False,
                    )
                else:
                    multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell([
                        single_cell(cell_class=rnn_type,
                                    cell_params=cell_params,
                                    zoneout_prob=zoneout_prob,
                                    training=training,
                                    residual_connections=False)
                        for _ in range(num_rnn_layers)
                    ])
                    top_layer, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=multirnn_cell_fw,
                        cell_bw=multirnn_cell_bw,
                        inputs=rnn_input,
                        sequence_length=text_len,
                        dtype=rnn_input.dtype,
                        time_major=False)
                    # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
                    top_layer = tf.concat(top_layer, 2)
                    rnn_vars += multirnn_cell_bw.trainable_variables

            if regularizer and training:
                cell_weights = []
                cell_weights += rnn_vars
                cell_weights += [enc_emb_w]
                for weights in cell_weights:
                    if "bias" not in weights.name:
                        # print("Added regularizer to {}".format(weights.name))
                        if weights.dtype.base_dtype == tf.float16:
                            tf.add_to_collection('REGULARIZATION_FUNCTIONS',
                                                 (weights, regularizer))
                        else:
                            tf.add_to_collection(
                                ops.GraphKeys.REGULARIZATION_LOSSES,
                                regularizer(weights))

        # -- end of rnn------------------------------------------------------------

        top_layer = tf.layers.dropout(top_layer,
                                      rate=self.params["rnn_dropout_prob"],
                                      training=training)
        outputs = top_layer
        if self.params.get("style_embedding_enable", False):
            outputs = tf.concat([outputs, style_embedding], axis=-1)

        return {'outputs': outputs, 'src_length': text_len}
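
A hedged sketch of the extra configuration that activates the style branch in this encoder. The key names match what the code reads above; the nested values are placeholders, and the data-layer key mirrors the style_input check in the style layer section.

import tensorflow as tf

encoder_params = {
    # ... usual Tacotron-2 encoder params (conv_layers, rnn settings, ...) ...
    "style_embedding_enable": True,
    "style_embedding_params": {
        "conv_layers": [],                    # see the _embed_style docstring
        "num_rnn_layers": 1,
        "rnn_cell_dim": 128,
        "rnn_unidirectional": True,
        "rnn_type": tf.nn.rnn_cell.GRUCell,
    },
}

# The data layer must expose the reference audio, otherwise _encode raises.
data_layer_params = {"style_input": "wav"}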
Code Example #5
    def _encode(self, input_dict):
        source_sequence = input_dict['source_tensors'][0]
        source_length = input_dict['source_tensors'][1]
        self._enc_emb_w = tf.get_variable(
            name="EncoderEmbeddingMatrix",
            shape=[self._src_vocab_size, self._src_emb_size],
            dtype=tf.float32,
        )

        if self.params['encoder_layers'] < 2:
            raise ValueError("GNMT encoder must have at least 2 layers")

        with tf.variable_scope("Level1FW"):
            self._encoder_l1_cell_fw = single_cell(
                cell_class=self.params['core_cell'],
                cell_params=self.params.get('core_cell_params', {}),
                dp_input_keep_prob=1.0,
                dp_output_keep_prob=1.0,
                residual_connections=False,
            )

        with tf.variable_scope("Level1BW"):
            self._encoder_l1_cell_bw = single_cell(
                cell_class=self.params['core_cell'],
                cell_params=self.params.get('core_cell_params', {}),
                dp_input_keep_prob=1.0,
                dp_output_keep_prob=1.0,
                residual_connections=False,
            )

        if self._mode == "train":
            dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
            dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
        else:
            dp_input_keep_prob = 1.0
            dp_output_keep_prob = 1.0

        with tf.variable_scope("UniDirLevel"):
            self._encoder_cells = [
                single_cell(
                    cell_class=self.params['core_cell'],
                    cell_params=self.params.get('core_cell_params', {}),
                    dp_input_keep_prob=dp_input_keep_prob,
                    dp_output_keep_prob=dp_output_keep_prob,
                    residual_connections=False,
                ) for _ in range(self.params['encoder_layers'] - 1)
            ]

            # add residual connections starting from the third layer
            for idx, cell in enumerate(self._encoder_cells):
                if idx > 0:
                    # pylint: disable=no-member
                    self._encoder_cells[idx] = tf.contrib.rnn.ResidualWrapper(
                        cell)

        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)
        embedded_inputs = tf.cast(
            tf.nn.embedding_lookup(
                self.enc_emb_w,
                source_sequence,
            ),
            self.params['dtype'],
        )

        # first bi-directional layer
        _encoder_output, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=self._encoder_l1_cell_fw,
            cell_bw=self._encoder_l1_cell_bw,
            inputs=embedded_inputs,
            sequence_length=source_length,
            swap_memory=use_swap_memory,
            time_major=time_major,
            dtype=embedded_inputs.dtype,
        )
        encoder_l1_outputs = tf.concat(_encoder_output, 2)

        # stack of unidirectional layers
        # pylint: disable=no-member
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            cell=tf.contrib.rnn.MultiRNNCell(self._encoder_cells),
            inputs=encoder_l1_outputs,
            sequence_length=source_length,
            swap_memory=use_swap_memory,
            time_major=time_major,
            dtype=encoder_l1_outputs.dtype,
        )

        return {
            'outputs': encoder_outputs,
            'state': encoder_state,
            'src_lengths': source_length,
            'encoder_input': source_sequence
        }
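
To summarize the graph built above: the first layer is bidirectional, the remaining encoder_layers - 1 layers form a unidirectional stack, and ResidualWrapper is applied from the second cell of that stack onward (i.e. residual connections start at the third encoder layer overall). A hedged config sketch, with an assumed cell class and sizes:

import tensorflow as tf

# With encoder_layers = 4 (an assumed value): 1 bidirectional layer,
# then 3 unidirectional layers, residuals from the 3rd layer onward.
encoder_params = {
    "core_cell": tf.nn.rnn_cell.LSTMCell,
    "core_cell_params": {"num_units": 512},
    "encoder_layers": 4,                  # must be >= 2, per the check above
    "encoder_dp_input_keep_prob": 0.8,
    "encoder_dp_output_keep_prob": 0.8,
    "time_major": False,
    "use_swap_memory": False,
    "dtype": tf.float32,
}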
Code Example #6
    def _decode(self, input_dict):
        """
    Decodes representation into data

    Args:
      input_dict (dict): Python dictionary with inputs to decoder. Must define:
          * encoder_output - dictionary from the encoder containing
            'outputs', a Tensor of shape [batch_size, time, dim] or
            [time, batch_size, dim], and 'src_length', a Tensor of shape
            [batch_size]
          * target_tensors - Only during training (or ground-truth-aligned
            inference). List containing the target spectrogram Tensor of
            shape [batch_size, time, num_features] or
            [time, batch_size, num_features], the stop token labels Tensor
            of shape [batch_size, time, 1] or [time, batch_size, 1], and the
            target lengths Tensor of shape [batch_size]

    Returns:
      dict:
        A python dictionary containing:

          * outputs - array containing:

              * decoder_output - tensor of shape [batch_size, time,
                num_features] or [time, batch_size, num_features]. Spectrogram
                representation learned by the decoder rnn
              * spectrogram_prediction - tensor of shape [batch_size, time,
                num_features] or [time, batch_size, num_features]. Spectrogram
                containing the residual corrections from the postnet if enabled
              * alignments - tensor of shape [batch_size, time, memory_size]
                or [time, batch_size, memory_size]. The alignments learned by
                the attention layer
              * stop_token_prediction - tensor of shape [batch_size, time, 1]
                or [time, batch_size, 1]. The stop token predictions
              * final_sequence_lengths - tensor of shape [batch_size]
          * stop_token_predictions - tensor of shape [batch_size, time, 1]
            or [time, batch_size, 1]. The stop token predictions for use inside
            the loss function.
    """
        encoder_outputs = input_dict['encoder_output']['outputs']
        enc_src_lengths = input_dict['encoder_output']['src_length']
        if self._mode == "train" or (self._mode == "infer"
                                     and self._gta_forcing):
            spec = input_dict['target_tensors'][0]
            spec_length = input_dict['target_tensors'][2]
        else:
            spec = None
            spec_length = None

        _batch_size = encoder_outputs.get_shape().as_list()[0]

        training = (self._mode == "train")
        regularizer = self.params.get('regularizer', None)

        if self.params.get('enable_postnet', True):
            if "postnet_conv_layers" not in self.params:
                raise ValueError(
                    "postnet_conv_layers must be passed from config file if "
                    "postnet is enabled")

        if self._both:
            num_audio_features = self._n_feats["mel"]
            if self._mode == "train":
                spec, _ = tf.split(
                    spec, [self._n_feats['mel'], self._n_feats['magnitude']],
                    axis=2)
        else:
            num_audio_features = self._n_feats

        output_projection_layer = tf.layers.Dense(
            name="output_proj",
            units=num_audio_features,
            use_bias=True,
        )
        stop_token_projection_layer = tf.layers.Dense(
            name="stop_token_proj",
            units=1,
            use_bias=True,
        )

        prenet = None
        if self.params.get('enable_prenet', True):
            prenet = Prenet(self.params.get('prenet_units', 256),
                            self.params.get('prenet_layers', 2),
                            self.params.get("prenet_activation", tf.nn.relu),
                            self.params["dtype"])

        cell_params = {}
        cell_params["num_units"] = self.params['decoder_cell_units']
        decoder_cells = [
            single_cell(
                cell_class=self.params['decoder_cell_type'],
                cell_params=cell_params,
                zoneout_prob=self.params.get("zoneout_prob", 0.),
                dp_output_keep_prob=1. - self.params.get("dropout_prob", 0.1),
                training=training,
            ) for _ in range(self.params['decoder_layers'])
        ]

        if self.params['attention_type'] is not None:
            attention_mechanism = self._build_attention(
                encoder_outputs, enc_src_lengths,
                self.params.get("attention_bias", False))

            attention_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

            attentive_cell = AttentionWrapper(
                cell=attention_cell,
                attention_mechanism=attention_mechanism,
                alignment_history=True,
                output_attention="both",
            )

            decoder_cell = attentive_cell

        if self.params['attention_type'] is None:
            decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)

        if self._mode == "train":
            train_and_not_sampling = True
            helper = TacotronTrainingHelper(
                inputs=spec,
                sequence_length=spec_length,
                prenet=None,
                model_dtype=self.params["dtype"],
                mask_decoder_sequence=self.params.get("mask_decoder_sequence",
                                                      True))
        elif self._mode == "eval" or self._mode == "infer":
            train_and_not_sampling = False
            inputs = tf.zeros((_batch_size, 1, num_audio_features),
                              dtype=self.params["dtype"])
            helper = TacotronHelper(
                inputs=inputs,
                prenet=None,
                mask_decoder_sequence=self.params.get("mask_decoder_sequence",
                                                      True),
                gta_mels=spec,
                gta_mel_lengths=spec_length,
            )
        else:
            raise ValueError("Unknown mode for decoder: {}".format(self._mode))
        decoder = TacotronDecoder(
            decoder_cell=decoder_cell,
            helper=helper,
            initial_decoder_state=decoder_cell.zero_state(
                _batch_size, self.params["dtype"]),
            attention_type=self.params["attention_type"],
            spec_layer=output_projection_layer,
            stop_token_layer=stop_token_projection_layer,
            prenet=prenet,
            dtype=self.params["dtype"],
            train=train_and_not_sampling)

        if self._mode == 'train':
            maximum_iterations = tf.reduce_max(spec_length)
        else:
            maximum_iterations = tf.reduce_max(enc_src_lengths) * 10

        outputs, final_state, sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            # outputs, final_state, sequence_lengths, final_inputs = dynamic_decode(
            decoder=decoder,
            impute_finished=False,
            maximum_iterations=maximum_iterations,
            swap_memory=self.params.get("use_swap_memory", False),
            output_time_major=self.params.get("time_major", False),
            parallel_iterations=self.params.get("parallel_iterations", 32))

        decoder_output = outputs.rnn_output
        stop_token_logits = outputs.stop_token_output

        with tf.variable_scope("decoder"):
            # In train mode (no sampling) the decoder skips the projections,
            # so apply them here
            if train_and_not_sampling:
                decoder_spec_output = output_projection_layer(decoder_output)
                stop_token_logits = stop_token_projection_layer(
                    decoder_spec_output)
                decoder_output = decoder_spec_output

        ## Add the post net ##
        if self.params.get('enable_postnet', True):
            dropout_keep_prob = self.params.get('postnet_keep_dropout_prob',
                                                0.5)

            top_layer = decoder_output
            for i, conv_params in enumerate(
                    self.params['postnet_conv_layers']):
                ch_out = conv_params['num_channels']
                kernel_size = conv_params['kernel_size']  # [time, freq]
                strides = conv_params['stride']
                padding = conv_params['padding']
                activation_fn = conv_params['activation_fn']

                if ch_out == -1:
                    if self._both:
                        ch_out = self._n_feats["mel"]
                    else:
                        ch_out = self._n_feats

                top_layer = conv_bn_actv(
                    layer_type="conv1d",
                    name="conv{}".format(i + 1),
                    inputs=top_layer,
                    filters=ch_out,
                    kernel_size=kernel_size,
                    activation_fn=activation_fn,
                    strides=strides,
                    padding=padding,
                    regularizer=regularizer,
                    training=training,
                    data_format=self.params.get('postnet_data_format',
                                                'channels_last'),
                    bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
                    bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
                )
                top_layer = tf.layers.dropout(top_layer,
                                              rate=1. - dropout_keep_prob,
                                              training=training)

        else:
            top_layer = tf.zeros([
                _batch_size, maximum_iterations,
                outputs.rnn_output.get_shape()[-1]
            ],
                                 dtype=self.params["dtype"])

        if regularizer and training:
            vars_to_regularize = []
            vars_to_regularize += attentive_cell.trainable_variables
            if (attention_mechanism.memory_layer is not None):
                vars_to_regularize += attention_mechanism.memory_layer.trainable_variables
            vars_to_regularize += output_projection_layer.trainable_variables
            vars_to_regularize += stop_token_projection_layer.trainable_variables

            for weights in vars_to_regularize:
                if "bias" not in weights.name:
                    # print("Added regularizer to {}".format(weights.name))
                    if weights.dtype.base_dtype == tf.float16:
                        tf.add_to_collection('REGULARIZATION_FUNCTIONS',
                                             (weights, regularizer))
                    else:
                        tf.add_to_collection(
                            ops.GraphKeys.REGULARIZATION_LOSSES,
                            regularizer(weights))

            if self.params.get('enable_prenet', True):
                prenet.add_regularization(regularizer)

        if self.params['attention_type'] is not None:
            alignments = tf.transpose(final_state.alignment_history.stack(),
                                      [1, 0, 2])
        else:
            alignments = tf.zeros([_batch_size, _batch_size, _batch_size])

        spectrogram_prediction = decoder_output + top_layer
        if self._both:
            mag_spec_prediction = spectrogram_prediction
            mag_spec_prediction = conv_bn_actv(
                layer_type="conv1d",
                name="conv_0",
                inputs=mag_spec_prediction,
                filters=256,
                kernel_size=4,
                activation_fn=tf.nn.relu,
                strides=1,
                padding="SAME",
                regularizer=regularizer,
                training=training,
                data_format=self.params.get('postnet_data_format',
                                            'channels_last'),
                bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
                bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
            )
            mag_spec_prediction = conv_bn_actv(
                layer_type="conv1d",
                name="conv_1",
                inputs=mag_spec_prediction,
                filters=512,
                kernel_size=4,
                activation_fn=tf.nn.relu,
                strides=1,
                padding="SAME",
                regularizer=regularizer,
                training=training,
                data_format=self.params.get('postnet_data_format',
                                            'channels_last'),
                bn_momentum=self.params.get('postnet_bn_momentum', 0.1),
                bn_epsilon=self.params.get('postnet_bn_epsilon', 1e-5),
            )
            if self._model.get_data_layer()._exp_mag:
                mag_spec_prediction = tf.exp(mag_spec_prediction)
            mag_spec_prediction = tf.layers.conv1d(
                mag_spec_prediction,
                self._n_feats["magnitude"],
                1,
                name="post_net_proj",
                use_bias=False,
            )
        else:
            mag_spec_prediction = tf.zeros(
                [_batch_size, _batch_size, _batch_size])

        stop_token_prediction = tf.sigmoid(stop_token_logits)
        outputs = [
            decoder_output, spectrogram_prediction, alignments,
            stop_token_prediction, sequence_lengths, mag_spec_prediction
        ]

        return {
            'outputs': outputs,
            'stop_token_prediction': stop_token_logits,
        }
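
A hedged sketch of the postnet_conv_layers list consumed by the postnet loop above. Kernel sizes and channel counts are illustrative; the loop resolves num_channels == -1 to the number of mel features at build time.

import tensorflow as tf

decoder_params = {
    "postnet_conv_layers": [
        {"kernel_size": [5], "stride": [1], "num_channels": 512,
         "padding": "SAME", "activation_fn": tf.nn.tanh},
        {"kernel_size": [5], "stride": [1], "num_channels": -1,  # -> n mel feats
         "padding": "SAME", "activation_fn": None},
    ],
    "postnet_keep_dropout_prob": 0.5,
    "postnet_data_format": "channels_last",
}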
Code Example #7
    def _encode(self, input_dict):
        """
    Encodes data into representation
    :param input_dict: a Python dictionary.
    Must define:
      * src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size]
                    (depending on time_major param)
      * src_lengths - a Tensor of shape [batch_size]
    :return: a Python dictionary with:
      * encoder_outputs - a Tensor of shape
                          [batch_size, time, representation_dim]
      or [time, batch_size, representation_dim]
      * encoder_state - a Tensor of shape [batch_size, dim]
      * src_lengths - (copy ref from input) a Tensor of shape [batch_size]
    """
        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)

        regularizer = self.params.get('regularizer', None)
        fc_use_bias = self.params.get('fc_use_bias', True)

        use_cudnn_rnn = self.params.get("use_cudnn_rnn", False)
        cudnn_rnn_type = self.params.get("cudnn_rnn_type", None)

        if 'initializer' in self.params:
            init_dict = self.params.get('initializer_params', {})
            initializer = self.params['initializer'](**init_dict)
        else:
            initializer = None

        if self._mode == "train":
            dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
            dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
            last_input_keep_prob = self.params['encoder_last_input_keep_prob']
            last_output_keep_prob = self.params[
                'encoder_last_output_keep_prob']
            emb_keep_prob = self.params['encoder_emb_keep_prob']
            recurrent_keep_prob = self.params['recurrent_keep_prob']
            input_weight_keep_prob = self.params['input_weight_keep_prob']
            recurrent_weight_keep_prob = self.params[
                'recurrent_weight_keep_prob']

        else:
            dp_input_keep_prob, dp_output_keep_prob = 1.0, 1.0
            last_input_keep_prob, last_output_keep_prob = 1.0, 1.0
            emb_keep_prob, recurrent_keep_prob = 1.0, 1.0
            input_weight_keep_prob, recurrent_weight_keep_prob = 1.0, 1.0

        self._output_layer = tf.layers.Dense(self._fc_dim,
                                             kernel_regularizer=regularizer,
                                             kernel_initializer=initializer,
                                             use_bias=fc_use_bias,
                                             dtype=self._params['dtype'])

        if self._weight_tied:
            last_cell_params = copy.deepcopy(self.params['core_cell_params'])
            last_cell_params['num_units'] = self._emb_size
        else:
            last_cell_params = self.params['core_cell_params']

        last_output_dim = last_cell_params['num_units']

        if self._use_cell_state:
            last_output_dim = 2 * last_output_dim

        fake_input = tf.zeros(shape=(1, last_output_dim),
                              dtype=self._params['dtype'])
        fake_output = self._output_layer.apply(fake_input)
        with tf.variable_scope("dense", reuse=True):
            dense_weights = tf.get_variable("kernel")
            dense_biases = tf.get_variable("bias")

        if self._weight_tied and self._lm_phase:
            enc_emb_w = tf.transpose(dense_weights)
        else:
            enc_emb_w = tf.get_variable(
                name="EncoderEmbeddingMatrix",
                shape=[self._vocab_size, self._emb_size],
                dtype=self._params['dtype'])

        self._enc_emb_w = tf.nn.dropout(enc_emb_w, keep_prob=emb_keep_prob)

        if use_cudnn_rnn:
            if self._mode == 'train' or self._mode == 'eval':
                all_cudnn_classes = [
                    i[1] for i in inspect.getmembers(tf.contrib.cudnn_rnn,
                                                     inspect.isclass)
                ]

                if not cudnn_rnn_type in all_cudnn_classes:
                    raise TypeError("rnn_type must be a Cudnn RNN class")

                rnn_block = cudnn_rnn_type(
                    num_layers=self.params['encoder_layers'],
                    num_units=self._emb_size,
                    dtype=self._params['dtype'],
                    name="cudnn_rnn")
            else:
                # Transferring weights from model trained with CudnnLSTM/CudnnGRU
                # to CudnnCompatibleLSTMCell/CudnnCompatibleGRUCell for inference
                if 'CudnnLSTM' in str(cudnn_rnn_type):
                    cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
                        num_units=self._emb_size)
                elif 'CudnnGRU' in str(cudnn_rnn_type):
                    cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(
                        num_units=self._emb_size)

                fwd_cells = [
                    cell() for _ in range(self.params['encoder_layers'])
                ]
                self._encoder_cell_fw = tf.nn.rnn_cell.MultiRNNCell(fwd_cells)
        else:
            fwd_cells = [
                single_cell(
                    cell_class=self.params['core_cell'],
                    cell_params=self.params['core_cell_params'],
                    dp_input_keep_prob=dp_input_keep_prob,
                    dp_output_keep_prob=dp_output_keep_prob,
                    recurrent_keep_prob=recurrent_keep_prob,
                    input_weight_keep_prob=input_weight_keep_prob,
                    recurrent_weight_keep_prob=recurrent_weight_keep_prob,
                    weight_variational=self.params['weight_variational'],
                    dropout_seed=self.params['dropout_seed'],
                    residual_connections=self.
                    params['encoder_use_skip_connections'],
                    awd_initializer=self.params['awd_initializer'],
                    dtype=self._params['dtype'])
                for _ in range(self.params['encoder_layers'] - 1)
            ]

            fwd_cells.append(
                single_cell(
                    cell_class=self.params['core_cell'],
                    cell_params=last_cell_params,
                    dp_input_keep_prob=last_input_keep_prob,
                    dp_output_keep_prob=last_output_keep_prob,
                    recurrent_keep_prob=recurrent_keep_prob,
                    input_weight_keep_prob=input_weight_keep_prob,
                    recurrent_weight_keep_prob=recurrent_weight_keep_prob,
                    weight_variational=self.params['weight_variational'],
                    dropout_seed=self.params['dropout_seed'],
                    residual_connections=self.
                    params['encoder_use_skip_connections'],
                    awd_initializer=self.params['awd_initializer'],
                    dtype=self._params['dtype']))

            self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)

        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)

        source_sequence = input_dict['source_tensors'][0]
        source_length = input_dict['source_tensors'][1]

        # Inference for language modeling requires a different graph
        if (not self._lm_phase
            ) or self._mode == 'train' or self._mode == 'eval':
            embedded_inputs = tf.cast(
                tf.nn.embedding_lookup(
                    self.enc_emb_w,
                    source_sequence,
                ), self.params['dtype'])

            if use_cudnn_rnn:
                # CudnnLSTM returns encoder_state as a tuple of hidden and
                # cell values; the hidden and cell tensors are stored for
                # each LSTM layer.

                # reshape to [B, T, C] --> [T, B, C]
                if time_major == False:
                    embedded_inputs = tf.transpose(embedded_inputs, [1, 0, 2])

                rnn_block.build(embedded_inputs.get_shape())
                encoder_outputs, encoder_state = rnn_block(embedded_inputs)
                encoder_outputs = tf.transpose(encoder_outputs, [1, 0, 2])
            else:
                encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                    cell=self._encoder_cell_fw,
                    inputs=embedded_inputs,
                    sequence_length=source_length,
                    time_major=time_major,
                    swap_memory=use_swap_memory,
                    dtype=self._params['dtype'],
                    scope='decoder',
                )

            if not self._lm_phase:
                # CudnnLSTM stores cell and hidden state differently
                if use_cudnn_rnn:
                    if self._use_cell_state:
                        encoder_outputs = tf.concat(
                            [encoder_state[0][-1], encoder_state[1][-1]],
                            axis=1)
                    else:
                        encoder_outputs = encoder_state[0][-1]
                else:
                    if self._use_cell_state:
                        encoder_outputs = tf.concat(
                            [encoder_state[-1].h, encoder_state[-1].c], axis=1)
                    else:
                        encoder_outputs = encoder_state[-1].h

            if self._mode == 'train' and self._num_sampled < self._fc_dim:  # sampled softmax
                output_dict = {
                    'weights': enc_emb_w,
                    'bias': dense_biases,
                    'inputs': encoder_outputs,
                    'logits': encoder_outputs,
                    'outputs': [encoder_outputs],
                    'num_sampled': self._num_sampled
                }
            else:  # full softmax
                logits = self._output_layer.apply(encoder_outputs)
                output_dict = {'logits': logits, 'outputs': [logits]}
        else:  # infer in LM phase
            # This portion of graph is required to restore weights from CudnnLSTM to
            # CudnnCompatibleLSTMCell/CudnnCompatibleGRUCell
            if use_cudnn_rnn:
                embedded_inputs = tf.cast(
                    tf.nn.embedding_lookup(
                        self.enc_emb_w,
                        source_sequence,
                    ), self.params['dtype'])

                # Scope must remain unset to restore weights
                encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                    cell=self._encoder_cell_fw,
                    inputs=embedded_inputs,
                    sequence_length=source_length,
                    time_major=time_major,
                    swap_memory=use_swap_memory,
                    dtype=self._params['dtype'])

            embedding_fn = lambda ids: tf.cast(
                tf.nn.embedding_lookup(
                    self.enc_emb_w,
                    ids,
                ), self.params['dtype'])

            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=embedding_fn,  #self._dec_emb_w,
                start_tokens=tf.constant(self.params['seed_tokens']),
                end_token=self.params['end_token'])

            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=self._encoder_cell_fw,
                helper=helper,
                initial_state=self._encoder_cell_fw.zero_state(
                    batch_size=self._batch_size,
                    dtype=self._params['dtype'],
                ),
                output_layer=self._output_layer,
            )
            maximum_iterations = tf.constant(self._num_tokens_gen)

            final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                impute_finished=False,
                maximum_iterations=maximum_iterations,
                swap_memory=use_swap_memory,
                output_time_major=time_major,
            )
            output_dict = {
                'logits': final_outputs.rnn_output,
                'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)],
                'final_state': final_state,
                'final_sequence_lengths': final_sequence_lengths
            }

        return output_dict
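
When _weight_tied is set, the encoder above reuses the output projection's kernel (transposed) as the embedding matrix instead of creating a separate EncoderEmbeddingMatrix; the fake_input/fake_output pass exists only to force that kernel to be created. A minimal standalone sketch of the same tying, with illustrative sizes and variable names:

import tensorflow as tf

vocab_size, emb_size = 10000, 256              # assumed sizes
output_layer = tf.layers.Dense(vocab_size)     # plays the role of self._output_layer
_ = output_layer(tf.zeros([1, emb_size]))      # dummy pass just to create the kernel
tied_embedding = tf.transpose(output_layer.kernel)   # [vocab_size, emb_size]

token_ids = tf.constant([[1, 2, 3]])
embedded = tf.nn.embedding_lookup(tied_embedding, token_ids)  # [1, 3, emb_size]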
Code Example #8
    def _decode(self, input_dict):
        """
    Decodes representation into data
    :param input_dict: Python dictionary with inputs to decoder.
    Must define:
      * encoder_output - dictionary from the encoder containing 'outputs', a
        Tensor of shape [batch_size, time, dim] or [time, batch_size, dim],
        and 'src_lengths', a Tensor of shape [batch_size]
    Does not need tgt_inputs and tgt_lengths
    :return: a Python dictionary with:
      * final_outputs - tensor of shape [batch_size, time, dim] or
                        [time, batch_size, dim]
      * final_state - tensor with decoder final state
      * final_sequence_lengths - tensor of shape [batch_size, time] or
                                 [time, batch_size]
    """
        encoder_outputs = input_dict['encoder_output']['outputs']
        enc_src_lengths = input_dict['encoder_output']['src_lengths']

        self._dec_emb_w = tf.get_variable(
            name='DecoderEmbeddingMatrix',
            shape=[self._tgt_vocab_size, self._tgt_emb_size],
            dtype=tf.float32)

        self._output_projection_layer = tf.layers.Dense(
            self._tgt_vocab_size,
            use_bias=False,
        )

        #cell_params = copy.deepcopy(self.params)
        #cell_params["num_units"] = self.params['decoder_cell_units']

        if self._mode == "train":
            dp_input_keep_prob = self.params['decoder_dp_input_keep_prob']
            dp_output_keep_prob = self.params['decoder_dp_output_keep_prob']
        else:
            dp_input_keep_prob = 1.0
            dp_output_keep_prob = 1.0

        residual_connections = self.params['decoder_use_skip_connections']
        # list of cells
        self._decoder_cells = [
            single_cell(
                cell_class=self.params['core_cell'],
                cell_params=self.params.get('core_cell_params', {}),
                dp_input_keep_prob=dp_input_keep_prob,
                dp_output_keep_prob=dp_output_keep_prob,
                # residual connections are added a little differently for GNMT
                residual_connections=False
                if self.params['attention_type'].startswith('gnmt') else
                residual_connections,
            ) for _ in range(self.params['decoder_layers'])
        ]

        tiled_enc_outputs = tf.contrib.seq2seq.tile_batch(
            encoder_outputs,
            multiplier=self._beam_width,
        )
        tiled_enc_src_lengths = tf.contrib.seq2seq.tile_batch(
            enc_src_lengths,
            multiplier=self._beam_width,
        )
        attention_mechanism = self._build_attention(
            tiled_enc_outputs,
            tiled_enc_src_lengths,
        )

        if self.params['attention_type'].startswith('gnmt'):
            attention_cell = self._decoder_cells.pop(0)
            attention_cell = AttentionWrapper(
                attention_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=None,  # don't use attention layer.
                output_attention=False,
                name="gnmt_attention")
            attentive_decoder_cell = GNMTAttentionMultiCell(
                attention_cell,
                self._add_residual_wrapper(self._decoder_cells)
                if residual_connections else self._decoder_cells,
                use_new_attention=(self.params['attention_type'] == 'gnmt_v2'))
        else:  # non-GNMT
            attentive_decoder_cell = AttentionWrapper(
                cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells),
                attention_mechanism=attention_mechanism,
            )
        batch_size_tensor = tf.constant(self._batch_size)
        embedding_fn = lambda ids: tf.cast(
            tf.nn.embedding_lookup(self._dec_emb_w, ids),
            dtype=self.params['dtype'])
        # BeamSearchDecoder here replaces tf.contrib.seq2seq.BeamSearchDecoder
        decoder = BeamSearchDecoder(
            cell=attentive_decoder_cell,
            embedding=embedding_fn,
            start_tokens=tf.tile([self.GO_SYMBOL], [self._batch_size]),
            end_token=self.END_SYMBOL,
            initial_state=attentive_decoder_cell.zero_state(
                dtype=encoder_outputs.dtype,
                batch_size=batch_size_tensor * self._beam_width,
            ),
            beam_width=self._beam_width,
            output_layer=self._output_projection_layer,
            length_penalty_weight=self._length_penalty_weight)

        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)
        final_outputs, final_state, final_sequence_lengths = \
            tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                maximum_iterations=tf.reduce_max(enc_src_lengths) * 2,
                swap_memory=use_swap_memory,
                output_time_major=time_major,
            )

        return {
            'logits': final_outputs.predicted_ids[:, :, 0] if not time_major
            else tf.transpose(final_outputs.predicted_ids[:, :, 0],
                              perm=[1, 0]),  # the slice is rank 2, so perm=[1, 0]
            'outputs': [final_outputs.predicted_ids[:, :, 0]],
            'final_state': final_state,
            'final_sequence_lengths': final_sequence_lengths
        }
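
A note on the beam-search wiring in this example: tile_batch repeats each encoder output and source length beam_width times, so the attention mechanism and the decoder's zero_state operate on an effective batch of batch_size * beam_width, and predicted_ids[:, :, 0] later selects the top-scoring beam. A toy sketch of the tiling step, with made-up shapes (not tied to the class above):

import tensorflow as tf

beam_width = 3
# Toy encoder outputs: batch of 2, 7 time steps, 16 features.
enc_outputs = tf.random_normal([2, 7, 16])
enc_lengths = tf.constant([7, 5])

tiled_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs, multiplier=beam_width)
tiled_lengths = tf.contrib.seq2seq.tile_batch(enc_lengths, multiplier=beam_width)
# tiled_outputs has shape [2 * beam_width, 7, 16]: each source sequence is
# repeated beam_width times so every beam hypothesis attends over its own
# copy of the encoder memory.
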
Code example #9
    def _decode(self, input_dict):
        """
    Decodes representation into data
    :param input_dict: Python dictionary with inputs to decoder
    Must define:
      * src_inputs - decoder input Tensor of shape [batch_size, time, dim]
                     or [time, batch_size, dim]
      * src_lengths - decoder input lengths Tensor of shape [batch_size]
      * tgt_inputs - Only during training. labels Tensor of the
                     shape [batch_size, time] or [time, batch_size]
      * tgt_lengths - Only during training. labels lengths
                      Tensor of the shape [batch_size]
    :return: a Python dictionary with:
      * final_outputs - tensor of shape [batch_size, time, dim]
                        or [time, batch_size, dim]
      * final_state - tensor with decoder final state
      * final_sequence_lengths - tensor of shape [batch_size, time]
                                 or [time, batch_size]
    """
        encoder_outputs = input_dict['encoder_output']['outputs']
        enc_src_lengths = input_dict['encoder_output']['src_lengths']
        if 'target_tensors' in input_dict:
            tgt_inputs = input_dict['target_tensors'][0]
            tgt_lengths = input_dict['target_tensors'][1]
        else:
            tgt_inputs, tgt_lengths = None, None

        self._dec_emb_w = tf.get_variable(
            name='DecoderEmbeddingMatrix',
            shape=[self._tgt_vocab_size, self._tgt_emb_size],
            dtype=tf.float32,
        )

        self._output_projection_layer = tf.layers.Dense(
            self._tgt_vocab_size,
            use_bias=False,
        )

        if self._mode == "train":
            dp_input_keep_prob = self.params['decoder_dp_input_keep_prob']
            dp_output_keep_prob = self.params['decoder_dp_output_keep_prob']
        else:
            dp_input_keep_prob = 1.0
            dp_output_keep_prob = 1.0

        residual_connections = self.params['decoder_use_skip_connections']

        # list of cells
        self._decoder_cells = [
            single_cell(
                cell_class=self.params['core_cell'],
                cell_params=self.params.get('core_cell_params', {}),
                dp_input_keep_prob=dp_input_keep_prob,
                dp_output_keep_prob=dp_output_keep_prob,
                # residual connections are added a little differently for GNMT
                residual_connections=False
                if self.params['attention_type'].startswith('gnmt') else
                residual_connections,
            ) for _ in range(self.params['decoder_layers'])
        ]

        attention_mechanism = self._build_attention(
            encoder_outputs,
            enc_src_lengths,
        )
        if self.params['attention_type'].startswith('gnmt'):
            attention_cell = self._decoder_cells.pop(0)
            attention_cell = AttentionWrapper(
                attention_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=None,
                output_attention=False,
                name="gnmt_attention")
            attentive_decoder_cell = GNMTAttentionMultiCell(
                attention_cell,
                self._add_residual_wrapper(self._decoder_cells)
                if residual_connections else self._decoder_cells,
                use_new_attention=(self.params['attention_type'] == 'gnmt_v2'))
        else:
            # AttentionWrapper here replaces tf.contrib.seq2seq.AttentionWrapper
            attentive_decoder_cell = AttentionWrapper(
                cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells),
                attention_mechanism=attention_mechanism,
            )
        if self._mode == "train":
            input_vectors = tf.cast(
                tf.nn.embedding_lookup(self._dec_emb_w, tgt_inputs),
                dtype=self.params['dtype'])
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=input_vectors, sequence_length=tgt_lengths)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=attentive_decoder_cell,
                helper=helper,
                output_layer=self._output_projection_layer,
                initial_state=attentive_decoder_cell.zero_state(
                    self._batch_size,
                    dtype=encoder_outputs.dtype,
                ),
            )
        elif self._mode == "infer" or self._mode == "eval":
            embedding_fn = lambda ids: tf.cast(
                tf.nn.embedding_lookup(self._dec_emb_w, ids),
                dtype=self.params['dtype'])
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=embedding_fn,
                start_tokens=tf.fill([self._batch_size], self.GO_SYMBOL),
                end_token=self.END_SYMBOL)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=attentive_decoder_cell,
                helper=helper,
                initial_state=attentive_decoder_cell.zero_state(
                    batch_size=self._batch_size,
                    dtype=encoder_outputs.dtype,
                ),
                output_layer=self._output_projection_layer,
            )
        else:
            raise ValueError("Unknown mode for decoder: {}".format(self._mode))

        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)
        if self._mode == 'train':
            maximum_iterations = tf.reduce_max(tgt_lengths)
        else:
            maximum_iterations = tf.reduce_max(enc_src_lengths) * 2

        final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            # beam-search decoding would instead use impute_finished=False
            impute_finished=True,
            maximum_iterations=maximum_iterations,
            swap_memory=use_swap_memory,
            output_time_major=time_major,
        )

        return {
            'logits': final_outputs.rnn_output if not time_major
            else tf.transpose(final_outputs.rnn_output, perm=[1, 0, 2]),
            'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)],
            'final_state': final_state,
            'final_sequence_lengths': final_sequence_lengths
        }
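
The only difference between the train and infer/eval branches above is the helper: TrainingHelper feeds the ground-truth target embeddings (teacher forcing), while GreedyEmbeddingHelper embeds and feeds back the decoder's own argmax predictions. A small side-by-side sketch with hypothetical sizes and token ids, independent of the class above:

import tensorflow as tf

# Hypothetical sizes and token ids, for illustration only.
batch_size, time_steps, emb_size, vocab_size = 4, 6, 8, 100
GO_ID, EOS_ID = 1, 2

emb_w = tf.get_variable("emb", [vocab_size, emb_size])
tgt_ids = tf.zeros([batch_size, time_steps], dtype=tf.int32)  # toy targets
tgt_lengths = tf.fill([batch_size], time_steps)

# Training: the decoder reads the ground-truth embeddings (teacher forcing).
train_helper = tf.contrib.seq2seq.TrainingHelper(
    inputs=tf.nn.embedding_lookup(emb_w, tgt_ids),
    sequence_length=tgt_lengths)

# Inference: the decoder embeds and feeds back its own argmax predictions.
infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
    embedding=lambda ids: tf.nn.embedding_lookup(emb_w, ids),
    start_tokens=tf.fill([batch_size], GO_ID),
    end_token=EOS_ID)
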
Code example #10
    def _encode(self, input_dict):
        """
    Encodes data into representation
    :param input_dict: a Python dictionary.
    Must define:
      * src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size]
                    (depending on time_major param)
      * src_lengths - a Tensor of shape [batch_size]
    :return: a Python dictionary with:
      * encoder_outputs - a Tensor of shape
                          [batch_size, time, representation_dim]
      or [time, batch_size, representation_dim]
      * encoder_state - a Tensor of shape [batch_size, dim]
      * src_lengths - (copy ref from input) a Tensor of shape [batch_size]
    """
        time_major = self.params.get("time_major", False)
        use_swap_memory = self.params.get("use_swap_memory", False)

        regularizer = self.params.get('regularizer', None)
        fc_use_bias = self.params.get('fc_use_bias', True)

        if 'initializer' in self.params:
            init_dict = self.params.get('initializer_params', {})
            initializer = self.params['initializer'](**init_dict)
        else:
            initializer = None

        if self._mode == "train":
            dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
            dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
            last_input_keep_prob = self.params['encoder_last_input_keep_prob']
            last_output_keep_prob = self.params[
                'encoder_last_output_keep_prob']
            emb_keep_prob = self.params['encoder_emb_keep_prob']
            recurrent_keep_prob = self.params['recurrent_keep_prob']
            input_weight_keep_prob = self.params['input_weight_keep_prob']
            recurrent_weight_keep_prob = self.params[
                'recurrent_weight_keep_prob']

        else:
            dp_input_keep_prob, dp_output_keep_prob = 1.0, 1.0
            last_input_keep_prob, last_output_keep_prob = 1.0, 1.0
            emb_keep_prob, recurrent_keep_prob = 1.0, 1.0
            input_weight_keep_prob, recurrent_weight_keep_prob = 1.0, 1.0

        self._output_layer = tf.layers.Dense(
            self._vocab_size,
            kernel_regularizer=regularizer,
            kernel_initializer=initializer,
            use_bias=fc_use_bias,
        )

        if self._weight_tied:
            # Apply the output layer once to a dummy input so that its
            # kernel/bias variables get created, then reuse the transposed
            # kernel as the embedding matrix (input/output weight tying).
            fake_input = tf.zeros(shape=(1, self._emb_size))
            fake_output = self._output_layer.apply(fake_input)
            with tf.variable_scope("dense", reuse=True):
                dense_weights = tf.get_variable("kernel")
                dense_biases = tf.get_variable("bias")
            enc_emb_w = tf.transpose(dense_weights)

        else:
            enc_emb_w = tf.get_variable(
                name="EncoderEmbeddingMatrix",
                shape=[self._vocab_size, self._emb_size],
                dtype=self._params['dtype'])

        self._enc_emb_w = tf.nn.dropout(enc_emb_w, keep_prob=emb_keep_prob)

        if self._weight_tied:
            last_cell_params = self.params['last_cell_params']
        else:
            last_cell_params = self.params['core_cell_params']

        fwd_cells = [
            single_cell(cell_class=self.params['core_cell'],
                        cell_params=self.params['core_cell_params'],
                        dp_input_keep_prob=dp_input_keep_prob,
                        dp_output_keep_prob=dp_output_keep_prob,
                        recurrent_keep_prob=recurrent_keep_prob,
                        input_weight_keep_prob=input_weight_keep_prob,
                        recurrent_weight_keep_prob=recurrent_weight_keep_prob,
                        weight_variational=self.params['weight_variational'],
                        dropout_seed=self.params['dropout_seed'],
                        residual_connections=self.params[
                            'encoder_use_skip_connections'],
                        awd_initializer=self.params['awd_initializer'],
                        dtype=self._params['dtype'])
            for _ in range(self.params['encoder_layers'] - 1)
        ]

        fwd_cells.append(
            single_cell(cell_class=self.params['core_cell'],
                        cell_params=last_cell_params,
                        dp_input_keep_prob=last_input_keep_prob,
                        dp_output_keep_prob=last_output_keep_prob,
                        recurrent_keep_prob=recurrent_keep_prob,
                        input_weight_keep_prob=input_weight_keep_prob,
                        recurrent_weight_keep_prob=recurrent_weight_keep_prob,
                        weight_variational=self.params['weight_variational'],
                        dropout_seed=self.params['dropout_seed'],
                        residual_connections=self.params[
                            'encoder_use_skip_connections'],
                        awd_initializer=self.params['awd_initializer'],
                        dtype=self._params['dtype']))

        self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)

        source_sequence = input_dict['source_tensors'][0]
        source_length = input_dict['source_tensors'][1]

        if self._mode == 'train' or self._mode == 'eval':
            embedded_inputs = tf.cast(
                tf.nn.embedding_lookup(
                    self.enc_emb_w,
                    source_sequence,
                ), self.params['dtype'])

            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                cell=self._encoder_cell_fw,
                inputs=embedded_inputs,
                sequence_length=source_length,
                time_major=time_major,
                swap_memory=use_swap_memory,
                dtype=embedded_inputs.dtype,
                scope='decoder',
            )
            if self._mode == 'eval' or self._num_sampled >= self._vocab_size:
                logits = self._output_layer.apply(
                    encoder_outputs)  # full softmax
                output_dict = {
                    'logits': logits,
                    'outputs': [tf.argmax(logits, axis=-1)]
                }
            else:
                # Sampled softmax: pass the pieces needed by the sampled loss
                # instead of computing full logits. Note that dense_biases is
                # only defined above when self._weight_tied is True.
                output_dict = {
                    'weights': enc_emb_w,
                    'bias': dense_biases,
                    'inputs': encoder_outputs,
                    'logits': encoder_outputs,
                    'outputs': [encoder_outputs],
                    'num_sampled': self._num_sampled
                }

        else:
            embedding_fn = lambda ids: tf.cast(
                tf.nn.embedding_lookup(
                    self.enc_emb_w,
                    ids,
                ), self.params['dtype'])

            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=embedding_fn,
                start_tokens=tf.constant(self.params['seed_tokens']),
                end_token=self.params['end_token'])

            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=self._encoder_cell_fw,
                helper=helper,
                initial_state=self._encoder_cell_fw.zero_state(
                    batch_size=self._batch_size,
                    dtype=self._params['dtype'],
                ),
                output_layer=self._output_layer,
            )
            maximum_iterations = tf.constant(200)  # hard cap on generated length

            final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                impute_finished=False,
                maximum_iterations=maximum_iterations,
                swap_memory=use_swap_memory,
                output_time_major=time_major,
            )
            output_dict = {
                'logits': final_outputs.rnn_output,
                'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)],
                'final_state': final_state,
                'final_sequence_lengths': final_sequence_lengths
            }

        return output_dict
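
The weight-tied branch in this example builds the output projection first and then reuses its transposed kernel as the embedding matrix. A compact sketch of just that trick, with hypothetical sizes and the layer explicitly named "dense" to match the variable scope used above:

import tensorflow as tf

vocab_size, emb_size = 100, 32  # hypothetical sizes

output_layer = tf.layers.Dense(vocab_size, use_bias=False, name="dense")

# Apply the layer once to a dummy input so its kernel variable is created.
dummy = tf.zeros([1, emb_size])
_ = output_layer.apply(dummy)

# Fetch the kernel back through the variable scope and reuse its transpose
# as the embedding matrix: input embedding and output projection now share
# the same parameters.
with tf.variable_scope("dense", reuse=True):
    kernel = tf.get_variable("kernel")      # [emb_size, vocab_size]
emb_w = tf.transpose(kernel)                # [vocab_size, emb_size]

token_ids = tf.constant([[3, 7, 1]])
embedded = tf.nn.embedding_lookup(emb_w, token_ids)  # [1, 3, emb_size]
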