Example #1
def decoder(name, latents, hparams, decoder_self_attention_bias, **kwargs):
    """Compute final hidden states for p(y|z,x)."""
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        decoder_input = drop_2d(latents, hparams.mode,
                                hparams.decoder_2d_dropout)
        if hparams.pos_attn:
            decoder_input = gops.positional_attention(
                "pos_attn", decoder_input, decoder_self_attention_bias,
                hparams)
        else:
            decoder_input = common_attention.add_timing_signal_1d(
                decoder_input)
        if common_layers.shape_list(latents)[-1] != hparams.hidden_size:
            decoder_input = gops.dense("lat2hid", latents, hparams.hidden_size)
        decoder_output = transformer_decoder_layers(
            "block",
            n_layers=hparams.n_decoder_layers,
            decoder_input=decoder_input,
            hparams=hparams,
            decoder_self_attention_bias=decoder_self_attention_bias,
            **kwargs)
        batch_size, targets_length = common_layers.shape_list(
            decoder_output)[:2]
        # Expand to 4-D since T2T expects 4-D tensors.
        decoder_output = tf.reshape(
            decoder_output,
            [batch_size, targets_length, 1, hparams.hidden_size])
        return decoder_output
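Every example on this page hinges on common_attention.add_timing_signal_1d, which adds the fixed sinusoidal position encoding to a [batch, length, channels] tensor. As a rough, standalone orientation, the signal looks roughly like the NumPy sketch below (an approximation of the t2t implementation, not the library code itself):

import numpy as np

def timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Approximate NumPy sketch of the signal added by add_timing_signal_1d."""
    position = np.arange(length, dtype=np.float32)
    num_timescales = channels // 2
    log_timescale_increment = (
        np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1))
    inv_timescales = min_timescale * np.exp(
        np.arange(num_timescales, dtype=np.float32) * -log_timescale_increment)
    scaled_time = position[:, None] * inv_timescales[None, :]
    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
    signal = np.pad(signal, [(0, 0), (0, channels % 2)])  # pad if channels is odd
    return signal[None, :, :]  # [1, length, channels], broadcast over the batch

# An input x of shape [batch, length, channels] simply becomes
# x + timing_signal_1d(length, channels).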
Example #2
def transformer_prepare_decoder(targets, hparams, features=None):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
  """
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(
          common_layers.shape_list(targets)[1]))
  if features and "targets_segmentation" in features:
    # "Packed" dataset - keep the examples from seeing each other.
    targets_segmentation = features["targets_segmentation"]
    targets_position = features["targets_position"]
    decoder_self_attention_bias += common_attention.attention_bias_same_segment(
        targets_segmentation, targets_segmentation)
  else:
    targets_position = None
  if hparams.proximity_bias:
    decoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(targets)[1])
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    if targets_position is not None:
      decoder_input = common_attention.add_timing_signal_1d_given_position(
          decoder_input, targets_position)
    else:
      decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
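For orientation, a minimal decoder-only usage sketch of this helper (assuming TF1 graph mode and tensor2tensor installed; transformer_base, transformer_prepare_decoder and transformer_decoder are the standard tensor2tensor entry points, the placeholder shape is only an illustration):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()  # standard base hparams set
# Already-embedded targets, [batch, length, hidden_size].
targets = tf.placeholder(tf.float32, [None, None, hparams.hidden_size])

decoder_input, decoder_self_attention_bias = (
    transformer.transformer_prepare_decoder(targets, hparams))

# Decoder-only (language-model style): no encoder output, no enc-dec bias.
decoder_output = transformer.transformer_decoder(
    decoder_input, None, decoder_self_attention_bias, None, hparams)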
Example #3
def cond_prior(name,
               hparams,
               decoder_input,
               targets_mask,
               output_size,
               decoder_self_attention_bias,
               init_scale=0.0,
               **kwargs):
    """Compute hidden states for parameters for conditional prior."""
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        decoder_input = common_attention.add_timing_signal_1d(decoder_input)
        decoder_input = tf.nn.dropout(
            decoder_input, rate=hparams.layer_prepostprocess_dropout)
        decoder_output = transformer_decoder_layers(
            "block",
            n_layers=hparams.n_posterior_layers,
            decoder_input=decoder_input,
            hparams=hparams,
            decoder_self_attention_bias=decoder_self_attention_bias,
            **kwargs)
        decoder_output = gops.dense_weightnorm("h2o_out",
                                               decoder_output,
                                               output_size,
                                               targets_mask,
                                               init_scale=init_scale,
                                               init=False)
        return decoder_output
Example #4
def attention_lm_moe_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
    to implement masked attention and possibly biases for diagonal alignments
    pad_remover (expert_utils.PadRemover): a utility object to remove padding
  """
  targets_pad_mask = common_attention.embedding_to_padding(targets)
  with tf.name_scope("pad_remover"):
    # Because of the shift_right, the <eos> token will be considered as
    # padding. In practice it doesn't really matter: due to the triangular
    # mask, this token is never attended to anyway.
    pad_remover = expert_utils.PadRemover(targets_pad_mask)

  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepend_inputs_full_attention(
            targets_pad_mask))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias, pad_remover)
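The returned pad_remover is meant for the position-wise parts of the model: padded positions are removed before an expensive feed-forward computation and scattered back afterwards. A hedged, self-contained sketch of that pattern (the toy input and the dense layer are illustrative; only PadRemover.remove/restore and embedding_to_padding come from tensor2tensor):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensor2tensor.layers import common_attention
from tensor2tensor.utils import expert_utils

# Toy embedded batch [batch=2, length=4, hidden=8]; the last two positions of
# each sequence are all-zero embeddings, i.e. padding.
x = tf.ones([2, 4, 8]) * tf.constant([[1.], [1.], [0.], [0.]])[None, :, :]
pad_mask = common_attention.embedding_to_padding(x)   # 1.0 at padded positions
pad_remover = expert_utils.PadRemover(pad_mask)

x_flat = tf.reshape(x, [-1, 8])              # [batch * length, hidden]
x_flat = pad_remover.remove(x_flat)          # drop the rows at padded positions
x_flat = tf.layers.dense(x_flat, 8)          # any position-wise transformation
x_flat = pad_remover.restore(x_flat)         # zeros scattered back at padding
x = tf.reshape(x_flat, [2, 4, 8])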
Example #5
def attention_lm_moe_prepare_decoder(targets, hparams):
    """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
    to implement masked attention and possibly biases for diagonal alignments
    pad_remover (expert_utils.PadRemover): a utility object to remove padding
  """
    targets_pad_mask = common_attention.embedding_to_padding(targets)
    with tf.name_scope("pad_remover"):
        pad_remover = expert_utils.PadRemover(targets_pad_mask)

    if hparams.prepend_mode == "prepend_inputs_full_attention":
        decoder_self_attention_bias = (
            common_attention.attention_bias_prepended(targets_pad_mask))
    else:
        decoder_self_attention_bias = (
            common_attention.attention_bias_lower_triangle(
                tf.shape(targets)[1]))
    decoder_input = common_layers.shift_left_3d(targets)
    if hparams.pos == "timing":
        decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    return (decoder_input, decoder_self_attention_bias, pad_remover)
Example #6
def attention_lm_moe_prepare_decoder(targets, hparams):
    """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
    to implement masked attention and possibly biases for diagonal alignments
    pad_remover (expert_utils.PadRemover): a utility object to remove padding
  """
    targets_pad_mask = common_attention.embedding_to_padding(targets)
    with tf.name_scope("pad_remover"):
        # Because of the shift_right, the <eos> token will be considered as
        # padding. In practice it doesn't really matter: due to the triangular
        # mask, this token is never attended to anyway.
        pad_remover = expert_utils.PadRemover(targets_pad_mask)

    if hparams.prepend_mode == "prepend_inputs_full_attention":
        decoder_self_attention_bias = (
            common_attention.attention_bias_prepend_inputs_full_attention(
                targets_pad_mask))
    else:
        decoder_self_attention_bias = (
            common_attention.attention_bias_lower_triangle(
                tf.shape(targets)[1]))
    decoder_input = common_layers.shift_right_3d(targets)
    if hparams.pos == "timing":
        decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    return (decoder_input, decoder_self_attention_bias, pad_remover)
Example #7
def transformer_prepare_encoder2(encoder_input, target_space, hparams,
                                 emb_name):
    '''The same as the existing module, except that the embedding can be named.'''
    # compute bias
    ishape_static = encoder_input.shape.as_list()
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            tf.shape(encoder_input)[1])

    # Append target_space_id embedding to encoder_input
    id_values = [
        value for attr, value in vars(problem.SpaceID).items()
        if not attr.startswith("__")
    ]
    id_cur = int(max(id_values) + 1)
    emb_target_space = common_layers.embedding(target_space,
                                               id_cur,
                                               ishape_static[-1],
                                               name=emb_name)
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space

    # position embedding
    if hparams.pos == "timing":
        encoder_input = common_attention.add_timing_signal_1d(encoder_input)
    return encoder_input, encoder_self_attention_bias, encoder_decoder_attention_bias
Example #8
def prepare_image_question_encoder(image_feat, question, hparams):
  """Prepare encoder.

  Args:
    image_feat: a Tensor.
    question: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """

  encoder_input = tf.concat([image_feat, question], axis=1)
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  # Usual case - not a packed dataset.
  if hparams.pos == "timing":
    question = common_attention.add_timing_signal_1d(question)
  elif hparams.pos == "emb":
    question = common_attention.add_positional_embedding(
        question, hparams.max_length, "inputs_positional_embedding",
        None)
  encoder_input = tf.concat([image_feat, question], axis=1)

  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
Example #9
def transformer_prepare_encoder(inputs, hparams):
    """Prepare one shard of the model for the encoder.

    Args:
        inputs: [batch_size, input_length, hidden_dim]
        hparams: hyperparameters

    Returns:
        encoder_input: a Tensor, bottom of encoder stack
            [batch_size, input_length, hidden_dim]
        encoder_self_attention_bias: a bias tensor for use in encoder
            self-attention [batch_size, input_length]
        top_layer_attention_bias: a bias tensor for use in top layer
            classification [batch_size, input_length]
    """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    top_layer_attention_bias = ignore_padding
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            tf.shape(inputs)[1])
    if hparams.pos == "timing":
        encoder_input = common_attention.add_timing_signal_1d(encoder_input)
    return (encoder_input, encoder_self_attention_bias,
            top_layer_attention_bias)
Example #10
def transformer_prepare_encoder(inputs, target_space, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        tf.shape(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
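A matching encoder-side usage sketch, under the same assumptions as the decoder sketch above (TF1 graph mode, tensor2tensor installed; transformer_encoder is the standard companion call, the scalar target_space id is arbitrary):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
inputs = tf.placeholder(tf.float32, [None, None, hparams.hidden_size])
target_space = tf.constant(1, dtype=tf.int32)   # any SpaceID scalar

(encoder_input, encoder_self_attention_bias,
 encoder_decoder_attention_bias) = transformer.transformer_prepare_encoder(
     inputs, target_space, hparams)

encoder_output = transformer.transformer_encoder(
    encoder_input, encoder_self_attention_bias, hparams)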
Example #11
def transformer_prepare_decoder(targets, hparams, features=None):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
  """
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(
          common_layers.shape_list(targets)[1]))
  if features and "targets_segmentation" in features:
    # "Packed" dataset - keep the examples from seeing each other.
    targets_segmentation = features["targets_segmentation"]
    targets_position = features["targets_position"]
    decoder_self_attention_bias += common_attention.attention_bias_same_segment(
        targets_segmentation, targets_segmentation)
  else:
    targets_position = None
  if hparams.proximity_bias:
    decoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(targets)[1])
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    if targets_position is not None:
      decoder_input = common_attention.add_timing_signal_1d_given_position(
          decoder_input, targets_position)
    else:
      decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
Example #12
def prepare_image_question_encoder(image_feat, question, hparams):
    """Prepare encoder.

  Args:
    image_feat: a Tensor.
    question: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """

    encoder_input = tf.concat([image_feat, question], axis=1)
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    # Usual case - not a packed dataset.
    if hparams.pos == "timing":
        question = common_attention.add_timing_signal_1d(question)
    elif hparams.pos == "emb":
        question = common_attention.add_positional_embedding(
            question, hparams.max_length, "inputs_positional_embedding", None)
    encoder_input = tf.concat([image_feat, question], axis=1)

    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
Example #13
    def __init__(self,
                 input,
                 word_size=500000,
                 embedding_dim=30,
                 batch_num=10,
                 use_position_encoding=False,
                 use_diy=True):
        self.input = input
        self.word_size = word_size
        self.embedding_dim = embedding_dim
        self.batch_num = batch_num

        with tf.name_scope("word_embedding"):
            self.word_W = tf.get_variable(
                name="word_W",
                shape=[word_size, embedding_dim],
                initializer=tf.orthogonal_initializer())

        self.embedding_input = tf.nn.embedding_lookup(self.word_W, self.input)

        if use_position_encoding:
            if use_diy:
                self.postional_encoding_add = self.positional_encoding_layer(
                    input=self.embedding_input)
            else:
                self.postional_encoding_add = common_attention.add_timing_signal_1d(
                    self.embedding_input)
        else:
            self.postional_encoding_add = self.embedding_input

        self.output = tf.reduce_mean(self.multi_encoding_layer(
            self.postional_encoding_add),
                                     axis=-1)
Example #14
    def decode_inputs_to_outputs(self, decoder_embed_inputs, encoder_outputs,
                                 encoder_attn_bias, rule_id_input_placeholder,
                                 mem_contexts, mem_outputs, global_step):
        if self.hparams.pos == 'timing':
            decoder_embed_inputs = common_attention.add_timing_signal_1d(
                decoder_embed_inputs)
            print('Use positional encoding in decoder text.')

        decoder_attn_bias = common_attention.attention_bias_lower_triangle(
            tf.shape(decoder_embed_inputs)[1])
        decoder_embed_inputs = tf.nn.dropout(
            decoder_embed_inputs,
            1.0 - self.hparams.layer_prepostprocess_dropout)

        if 'rule' in self.model_config.memory:
            decoder_output, contexts = transformer.transformer_decoder2(
                decoder_embed_inputs, encoder_outputs, decoder_attn_bias,
                encoder_attn_bias, self.hparams)

            # encoder_gate_w = tf.get_variable('encoder_gate_w', shape=(
            #     1, self.model_config.dimension, 1))
            # encoder_gate_b = tf.get_variable('encoder_gate_b', shape=(1, 1, 1))
            # encoder_gate = tf.tanh(encoder_gate_b + tf.nn.conv1d(encoder_outputs, encoder_gate_w, 1, 'SAME'))
            # encoder_context_outputs = tf.expand_dims(tf.reduce_mean(encoder_outputs * encoder_gate, axis=1), axis=1)
            cur_context = contexts[0]  #tf.concat(contexts, axis=-1)
            cur_mem_contexts = tf.stack(self.embedding_fn(
                rule_id_input_placeholder, mem_contexts),
                                        axis=1)
            cur_mem_outputs = tf.stack(self.embedding_fn(
                rule_id_input_placeholder, mem_outputs),
                                       axis=1)

            bias = tf.expand_dims(-1e9 * tf.to_float(
                tf.equal(tf.stack(rule_id_input_placeholder, axis=1), 0)),
                                  axis=1)
            weights = tf.nn.softmax(
                bias +
                tf.matmul(cur_context, cur_mem_contexts, transpose_b=True))
            mem_output = tf.matmul(weights, cur_mem_outputs)

            temp_output = tf.concat((decoder_output, mem_output), axis=-1)
            w = tf.get_variable('w_ffn',
                                shape=(1, self.model_config.dimension * 2,
                                       self.model_config.dimension))
            # b = tf.get_variable('b_ffn', shape=(
            #     1, 1, self.model_config.dimension))
            mem_output = tf.nn.conv1d(temp_output, w, 1, 'SAME')
            g = tf.greater(
                global_step,
                tf.constant(2 * self.model_config.memory_prepare_step,
                            dtype=tf.int64))
            final_output = tf.cond(g, lambda: mem_output,
                                   lambda: decoder_output)
            return final_output, decoder_output, cur_context
        else:
            decoder_output = transformer.transformer_decoder(
                decoder_embed_inputs, encoder_outputs, decoder_attn_bias,
                encoder_attn_bias, self.hparams)
            final_output = decoder_output
            return final_output, decoder_output, None
Example #15
 def preprocess_firstP(firstP):
     firstP = self._shard_features({"firstP": firstP})["firstP"]
     firstP_modality = self._problem_hparams.input_modality["firstP"]
     with tf.variable_scope(firstP_modality.name):
         firstP = firstP_modality.targets_bottom_sharded(firstP, dp)[0]
     firstP = common_layers.flatten4d3d(firstP)
     if hparams.pos == "timing":
         firstP = common_attention.add_timing_signal_1d(firstP)
     return firstP
Example #16
def transformer_prepare_decoder_right(targets, hparams, features=None):
    """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
  """
    if hparams.causal_decoder_self_attention:
        # Causal attention.
        if hparams.prepend_mode == "prepend_inputs_full_attention":
            decoder_self_attention_bias = (
                common_attention.attention_bias_prepend_inputs_full_attention(
                    common_attention.embedding_to_padding(targets)))
        else:
            decoder_self_attention_bias = (
                common_attention.attention_bias_local(
                    common_layers.shape_list(targets)[1], 0, -1))
    else:
        # Full attention.
        decoder_padding = common_attention.embedding_to_padding(targets)
        decoder_self_attention_bias = (
            common_attention.attention_bias_ignore_padding(decoder_padding))

    if features and "targets_segmentation" in features:
        # "Packed" dataset - keep the examples from seeing each other.
        targets_segmentation = features["targets_segmentation"]
        targets_position = features["targets_position"]
        decoder_self_attention_bias += common_attention.attention_bias_same_segment(
            targets_segmentation, targets_segmentation)
    else:
        targets_position = None
    if hparams.proximity_bias:
        decoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(targets)[1])
    decoder_input = shift_left_3d(targets)
    if hparams.pos == "timing":
        if targets_position is not None:
            decoder_input = common_attention.add_timing_signal_1d_given_position(
                decoder_input, targets_position)
        else:
            decoder_input = common_attention.add_timing_signal_1d(
                decoder_input)
    elif hparams.pos == "emb":
        decoder_input = common_attention.add_positional_embedding(
            decoder_input, hparams.max_length, "targets_positional_embedding",
            targets_position)

    if hparams.activation_dtype == "bfloat16":
        decoder_self_attention_bias = tf.cast(decoder_self_attention_bias,
                                              tf.bfloat16)
    return (decoder_input, decoder_self_attention_bias)
Example #17
def prepare_decoder(targets, target_space_emb):
  """Prepare decoder."""
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  target_space_emb = tf.reshape(target_space_emb, [1, 1, -1])
  target_space_emb = tf.tile(target_space_emb, [tf.shape(targets)[0], 1, 1])
  decoder_input = common_layers.shift_right_3d(
      targets, pad_value=target_space_emb)
  decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
Example #18
def prepare_decoder(targets, target_space_emb):
    """Prepare decoder."""
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
    target_space_emb = tf.reshape(target_space_emb, [1, 1, -1])
    target_space_emb = tf.tile(target_space_emb, [tf.shape(targets)[0], 1, 1])
    decoder_input = common_layers.shift_right_3d(targets,
                                                 pad_value=target_space_emb)
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    return (decoder_input, decoder_self_attention_bias)
Example #19
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        encoder_self_attention_bias = common_attention.attention_bias_same_segment(
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        # Usual case - not a packed dataset.
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(target_space,
                                               32,
                                               ishape_static[-1],
                                               name="target_space_embedding")
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space
    if hparams.pos == "timing":
        if inputs_position is not None:
            encoder_input = common_attention.add_timing_signal_1d_given_position(
                encoder_input, inputs_position)
        else:
            encoder_input = common_attention.add_timing_signal_1d(
                encoder_input)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
Example #20
    def decode(self, decoder_inputs, timestep):
        """
        Args:
            decoder_inputs: targets of shape [batch_size, sequence_length,
                            hidden_size]. The sequence is shifted right
                            by one.
            timestep: used for timestep encoding during ACT
        Return:
            decoder_outputs: the result of passing the decoder_input
                             through the decoder layers.
                             Input shape is preserved.

        This function is one step of decoding.
        """
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            #positional encoding
            x = common_attention.add_timing_signal_1d(decoder_inputs)
            #timestep encoding
            x = common_attention.add_timing_signal_1d_given_position(
                x, timestep)
            #decoder-decoder attention
            y = common_attention.multihead_attention(
                query_antecedent=x,
                memory_antecedent=None,
                bias=self.decoder_attention_bias,
                total_key_depth=self.hparams.hidden_size,
                total_value_depth=self.hparams.hidden_size,
                output_depth=self.hparams.hidden_size,
                num_heads=self.hparams.num_heads,
                dropout_rate=self.hparams.attention_dropout)
            #residual connection and dropout
            x = common_layers.layer_postprocess(x, y, self.hparams)
            #layer norm
            x = common_layers.layer_norm(x)
            #encoder-decoder attention
            y = common_attention.multihead_attention(
                query_antecedent=x,
                memory_antecedent=self.encoder_outputs,
                bias=self.encoder_attention_bias,
                total_key_depth=self.hparams.hidden_size,
                total_value_depth=self.hparams.hidden_size,
                output_depth=self.hparams.hidden_size,
                num_heads=self.hparams.num_heads,
                dropout_rate=self.hparams.attention_dropout)
            #residual connection and dropout
            x = common_layers.layer_postprocess(x, y, self.hparams)
            #layer norm
            x = common_layers.layer_norm(x)
            #transition function as fc
            y = tf.layers.dense(x, self.hparams.hidden_size, name="transition")
            #residual connection and dropout
            x = common_layers.layer_postprocess(x, y, self.hparams)
            #layer norm
            x = common_layers.layer_norm(x)
            return x
Example #21
def transformer_prepare_decoder(targets, hparams):
    """Copied from tensor2tensor.models.transformer."""
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
    if hparams.proximity_bias:
        decoder_self_attention_bias += common_attention.attention_bias_proximal(
            tf.shape(targets)[1])
    decoder_input = common_layers.shift_right_3d(targets)
    if hparams.pos == "timing":
        decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    return (decoder_input, decoder_self_attention_bias)
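The decoder_self_attention_bias built by attention_bias_lower_triangle is just the causal mask in additive form. Roughly (a NumPy sketch of the shape and values, not the t2t code):

import numpy as np

def lower_triangle_bias(length, neg=-1e9):
    """0 where a position may attend (itself and earlier positions), a large
    negative value elsewhere; shaped [1, 1, length, length] so it broadcasts
    over batch and heads when added to the attention logits."""
    band = np.tril(np.ones((length, length), dtype=np.float32))
    return ((1.0 - band) * neg)[None, None, :, :]

print(lower_triangle_bias(4)[0, 0])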
Example #22
 def decode_inputs_to_outputs(self, kword_input, abstr_outputs, abstr_bias, hist_vector=None):
     if self.hparams.pos == 'timing':
         kword_input = common_attention.add_timing_signal_1d(kword_input)
     kword_tribias = common_attention.attention_bias_lower_triangle(tf.shape(kword_input)[1])
     kword_input = tf.nn.dropout(
         kword_input, 1.0 - self.hparams.layer_prepostprocess_dropout)
     kword_output = transformer.transformer_decoder(
         kword_input, abstr_outputs, kword_tribias,
         abstr_bias, self.hparams,
         hist_vector=hist_vector)
     return kword_output
Example #23
def decode(cond_vec, cond_add, gold, c, ed, hparams):
    """Transformer decoder."""
    drop_gold = tf.nn.dropout(gold, 1.0 - hparams.layer_prepostprocess_dropout)
    decoder_input = common_layers.shift_right(drop_gold, pad_value=cond_vec)
    if cond_add is not None:
        decoder_input += cond_add
    decoder_input = tf.squeeze(decoder_input, axis=2)
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    bias = common_attention.attention_bias_lower_triangle(tf.shape(gold)[1])
    if c is not None and len(c.get_shape()) > 3:
        c = tf.squeeze(c, axis=2)
    return transformer.transformer_decoder(decoder_input, c, bias, ed, hparams)
Example #24
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset.  Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    encoder_self_attention_bias = common_attention.attention_bias_same_segment(
        inputs_segmentation, inputs_segmentation)
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    # Usual case - not a packed dataset.
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
Example #25
def attend(x, source, hparams, name):
    with tf.variable_scope(name):
        x = tf.squeeze(x, axis=2)
        if len(source.get_shape()) > 3:
            source = tf.squeeze(source, axis=2)
        source = common_attention.add_timing_signal_1d(source)
        y = common_attention.multihead_attention(
            common_layers.layer_preprocess(x, hparams), source, None,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
        res = common_layers.layer_postprocess(x, y, hparams)
        return tf.expand_dims(res, axis=2)
Example #26
def attend(x, source, hparams, name):
  with tf.variable_scope(name):
    x = tf.squeeze(x, axis=2)
    if len(source.get_shape()) > 3:
      source = tf.squeeze(source, axis=2)
    source = common_attention.add_timing_signal_1d(source)
    y = common_attention.multihead_attention(
        common_layers.layer_preprocess(x, hparams), source, None,
        hparams.attention_key_channels or hparams.hidden_size,
        hparams.attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, hparams.num_heads,
        hparams.attention_dropout)
    res = common_layers.layer_postprocess(x, y, hparams)
    return tf.expand_dims(res, axis=2)
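A hedged usage sketch for attend (assuming transformer_base() hparams; x is the 4-D [batch, length, 1, hidden] activation that t2t models pass around and source is the memory being attended over; the shapes here are purely illustrative):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
x = tf.zeros([2, 10, 1, hparams.hidden_size])       # queries, 4-D as in t2t
source = tf.zeros([2, 7, 1, hparams.hidden_size])   # memory, squeezed inside
attended = attend(x, source, hparams, name="attend_example")  # [2, 10, 1, 512]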
Example #27
    def build_graph(self, inputs, masks):
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            if self.input_mapping:
                inputs = tf.layers.conv1d(inputs, filters=self.filters, \
                                          kernel_size=1, padding='SAME', name='input_mapping')
            outputs = inputs
            for i in range(self.num_blocks):
                with tf.variable_scope('block{}'.format(i + 1)):
                    outputs = add_timing_signal_1d(outputs)
                    for j in range(self.num_layers):
                        with tf.variable_scope('conv{}'.format(j + 1)):

                            def fn(x):
                                output = tf.layers.separable_conv1d(
                                    layer_norm(x, name='ln1_{}'.format(j + 1)),
                                    filters=self.filters,
                                    kernel_size=self.kernel_size,
                                    padding='SAME',
                                    name='conv{}'.format(j + 1))
                                if j % 2 == 0:
                                    output = tf.nn.dropout(
                                        output, self.keep_prob)
                                return output

                            outputs = layer_dropout(x=outputs,
                                                    fn=fn,
                                                    keep_prob=1 -
                                                    (j + 1) / self.num_layers *
                                                    (1 - self.keep_prob))
                    outputs = tf.nn.dropout(
                        outputs + multihead_self_attention(
                            layer_norm(outputs, name='ln2_{}'.format(i)),
                            masks, self.num_heads), self.keep_prob)
                    res = outputs
                    outputs = layer_norm(outputs, name='ln3_{}'.format(i + 1))
                    #outputs = tf.layers.conv1d(outputs, filters=self.filters, kernel_size=1, padding='SAME', kernel_initializer=initializer_relu(), name='ffn1')
                    outputs = tf.nn.relu(
                        tf.layers.conv1d(outputs,
                                         filters=self.filters,
                                         kernel_size=1,
                                         padding='SAME',
                                         kernel_initializer=initializer_relu(),
                                         name='ffn1'))
                    outputs = tf.layers.conv1d(outputs,
                                               filters=self.filters,
                                               kernel_size=1,
                                               padding='SAME',
                                               name='ffn2')
                    outputs = tf.nn.dropout(res + outputs, self.keep_prob)
            return outputs
Example #28
  def transformer_prepare_delibdecoder(self, inputs, hparams):
    """Prepare one shard of the model for the deliberation decoder.
    Args:
      inputs: a Tensor.
      hparams: run hyperparameters
    Returns:
      firstPdecoder_input: a Tensor, bottom of the deliberation decoder stack
      firstP_delib_attention_bias: a bias tensor that masks out padding
    """
    firstPdecoder_input = inputs
    firstPdecoder_padding = common_attention.embedding_to_padding(firstPdecoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(firstPdecoder_padding)
    firstP_delib_attention_bias = ignore_padding
    if hparams.pos == "timing":
      firstPdecoder_input = common_attention.add_timing_signal_1d(firstPdecoder_input)

    return (firstPdecoder_input, firstP_delib_attention_bias)
Example #29
def attend(x, source, hparams, name):
    """Attend function."""
    with tf.variable_scope(name):
        # x = tf.squeeze(x, axis=2)
        x, xshape, _ = cia.maybe_reshape_4d_to_3d(x)
        if len(source.get_shape()) > 3:
            source = tf.squeeze(source, axis=2)
        source = common_attention.add_timing_signal_1d(source)
        y = common_attention.multihead_attention(
            common_layers.layer_preprocess(x, hparams), source, None,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
        res = common_layers.layer_postprocess(x, y, hparams)
        return tf.reshape(res, xshape)
Example #30
def transformer_decoder_block(name,
                              n_layers,
                              x,
                              x_mask,
                              output_size,
                              init,
                              **kwargs):
  """A transformation block composed of transformer decoder layers.

  Args:
    name: variable scope.
    n_layers: number of transformer layers.
    x: input to transformation.
    x_mask: mask.
    output_size: output dimensionality.
    init: data-dependent init for weightnorm parameters.
    **kwargs: Contains hparams, encoder_output,
      encoder_decoder_attention_bias and decoder_self_attention_bias

  Returns:
    outputs: Tensor of shape [batch_size, length, output_size].
  """
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    hparams = kwargs.pop("hparams")
    disable_dropout = kwargs.pop("disable_dropout")
    if disable_dropout:
      hparams = copy.deepcopy(hparams)
      hparams.attention_dropout = 0.0
      hparams.layer_prepostprocess_dropout = 0.0
      hparams.relu_dropout = 0.0
    n_channels = common_layers.shape_list(x)[-1]
    if n_channels != hparams.hidden_size:
      hparams = copy.deepcopy(hparams)
      hparams.hidden_size = n_channels

    outputs = common_attention.add_timing_signal_1d(x)
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
      for layer_idx in range(n_layers):
        outputs = transformer_decoder_layer(
            decoder_input=outputs,
            layer_idx=layer_idx,
            hparams=hparams,
            **kwargs)
    outputs = common_layers.layer_preprocess(outputs, hparams)
    outputs = dense_weightnorm(
        "h2o", outputs, output_size, x_mask, init_scale=0.0, init=init)
    return outputs
Example #31
def transformer_fast_prepare_decoder(targets, hparams):
    """Prepare one shard of the model for the decoder.
  Args:
    targets: a Tensor.
    hparams: run hyperparameters
  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_position_forward_mask: mask Tensor for position-forward. [1, t, 1]
  """
    length = tf.shape(targets)[1]
    decoder_position_forward_mask = 1. / tf.expand_dims(
        tf.expand_dims(tf.to_float(tf.range(length)) + 1., 0), -1)  # [1, t, 1]

    decoder_input = common_layers.shift_right_3d(targets)
    if hparams.pos == "timing":
        decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    return (decoder_input, decoder_position_forward_mask)
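The decoder_position_forward_mask is just 1/(t+1) per position; downstream it is presumably multiplied against a cumulative sum so each position sees a running mean of everything up to and including it (that downstream use is an assumption; only the mask construction mirrors the code above). In NumPy terms:

import numpy as np

length = 4
mask = 1.0 / (np.arange(length, dtype=np.float32) + 1.0)[None, :, None]  # [1, t, 1]

states = np.ones((2, length, 8), dtype=np.float32)   # [batch, t, hidden]
running_mean = np.cumsum(states, axis=1) * mask      # mean over positions <= t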
Example #32
 def add_vanilla_transformer_layer(x, num_layers, name):
     """Passes the input through num_layers of vanilla transformer layers.
 Args:
  x: input
  num_layers: number of layers
  name: string, prefix of layer names
 Returns:
    output of vanilla_transformer_layer
 """
     if hparams.add_position_timing_signal:
         # In case of add_position_timing_signal=true, we set  hparams.pos=None
         # and add position timing signal at the beginning of each step, so for
         # the vanilla transformer, we need to add timing signal here.
         x = common_attention.add_timing_signal_1d(x)
     for layer in range(num_layers):
         with tf.variable_scope(name + "layer_%d" % layer):
             x = ffn_unit(attention_unit(x))
     return x
Example #33
 def decode_syntax_template(self, trg_syntax_emb):
     with tf.variable_scope('syntax_decoder', reuse=tf.AUTO_REUSE):
         trg_syntax_emb = common_attention.add_timing_signal_1d(
             trg_syntax_emb)
         trg_syntax_emb = self.update_embedding(trg_syntax_emb)
         trg_syntax_length = tf.shape(trg_syntax_emb)[1]
         trg_self_attention_bias = common_attention.attention_bias_lower_triangle(
             trg_syntax_length)
         trg_syntax_outputs = transformer.transformer_decoder(
             decoder_input=trg_syntax_emb,
             decoder_self_attention_bias=trg_self_attention_bias,
             encoder_output=self.shared_tensors['src_outputs'],
             encoder_decoder_attention_bias=self.shared_tensors['src_bias'],
             hparams=self.hparams,
             external_output=self.
             shared_tensors['template_prev_simp_outputs'],
             external_bias=self.shared_tensors['template_simp_bias'])
     return trg_syntax_outputs
Example #34
def attend(x, source, hparams, name):
  """Attend function."""
  with tf.variable_scope(name):
    # x = tf.squeeze(x, axis=2)
    x, xshape, _ = cia.maybe_reshape_4d_to_3d(x)
    if len(source.get_shape()) > 3:
      source = tf.squeeze(source, axis=2)
    source = common_attention.add_timing_signal_1d(source)
    y = common_attention.multihead_attention(
        common_layers.layer_preprocess(x, hparams),
        source,
        None,
        hparams.attention_key_channels or hparams.hidden_size,
        hparams.attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, hparams.num_heads,
        hparams.attention_dropout)
    res = common_layers.layer_postprocess(x, y, hparams)
    return tf.reshape(res, xshape)
Example #35
def attention_lm_prepare_decoder(targets, hparams):
    """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
    to implement masked attention and possibly biases for diagonal alignments
  """
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
    decoder_input = common_layers.shift_left_3d(targets)
    if hparams.pos == "timing":
        decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    return (decoder_input, decoder_self_attention_bias)
Example #36
def long_answer_prepare_decoder(inputs, targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    inputs: a Tensor.
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
  """
  decoder_input = tf.concat([
      length_embedding(targets, hparams), inputs,
      common_layers.shift_left_3d(targets)
  ], 1)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return decoder_input
Example #37
def transformer_prepare_encoder(inputs, target_space, hparams):
    """Prepare one shard of the model for the encoder.
  
    Args:
      inputs: Tensor with shape [batch, memory_length, depth]
      target_space: a Tensor.
      hparams: run hyperparameters
  
    Returns:
      An EncoderState with:
        input: a Tensor, bottom of encoder stack
        self_attn_bias: a bias tensor for use in encoder self-attention
        decoder_attn_bias: a bias tensor for use in encoder-decoder attention
        output: None at this stage
    """

    ignore_padding = get_ignore_padding(inputs)
    encoder_self_attention_bias = ignore_padding

    # Bias for self-attention to encourage attention to close positions.
    if hparams.proximity_bias:
        encoder_self_attention_bias += comm_attn.attention_bias_proximal(
            length=tf.shape(inputs)[1])

    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        x=target_space,
        vocab_size=32,
        dense_size=inputs.shape.as_list()[-1],
        name='target_space_embedding')
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])

    # Question: wat
    encoder_input = inputs + emb_target_space
    if hparams.pos == 'timing':
        encoder_input = comm_attn.add_timing_signal_1d(encoder_input)
    # Putting this here since always called immediately after...
    encoder_input = with_dropout(encoder_input, hparams)

    return EncoderState(input=encoder_input,
                        self_attn_bias=encoder_self_attention_bias,
                        decoder_attn_bias=ignore_padding,
                        output=None)
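This snippet leans on several project-local helpers (get_ignore_padding, with_dropout, EncoderState, and the comm_attn alias) that are not shown. Hypothetical stand-ins consistent with how they are used above (the real project presumably defines its own versions):

import collections
import tensorflow.compat.v1 as tf
from tensor2tensor.layers import common_attention as comm_attn

EncoderState = collections.namedtuple(
    "EncoderState", ["input", "self_attn_bias", "decoder_attn_bias", "output"])

def get_ignore_padding(inputs):
    # Additive bias that masks out padding positions of `inputs`.
    return comm_attn.attention_bias_ignore_padding(
        comm_attn.embedding_to_padding(inputs))

def with_dropout(x, hparams):
    # Input dropout, usually applied right after preparing the encoder input.
    return tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout)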
Example #38
def prepare_question_encoder(inputs, hparams):
  """Prepare question encoder.

  Args:
    inputs: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
  """
  encoder_input = inputs
  # Usual case - not a packed dataset.
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  elif hparams.pos == "emb":
    encoder_input = common_attention.add_positional_embedding(
        encoder_input, hparams.max_length, "inputs_positional_embedding",
        None)
  return (encoder_input, encoder_self_attention_bias)
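The difference between the two position options above: hparams.pos == "timing" adds the fixed sinusoidal signal with no trainable variables, while hparams.pos == "emb" looks up a learned [max_length, hidden] table. A hedged side-by-side sketch (hidden size and max length are arbitrary here):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensor2tensor.layers import common_attention

x = tf.placeholder(tf.float32, [None, None, 512])   # [batch, length, hidden]

# Fixed sinusoidal signal: no variables created.
x_timing = common_attention.add_timing_signal_1d(x)

# Learned positional embedding: creates a trainable [max_length, hidden]
# variable under the given name; positions=None means 0..length-1.
x_emb = common_attention.add_positional_embedding(
    x, 256, "inputs_positional_embedding", None)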
Example #39
def attention_lm_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
    to implement masked attention and possibly biases for diagonal alignments
  """
  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepended(
            common_attention.embedding_to_padding(targets)))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(
            common_layers.shape_list(targets)[1]))
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)