Example #1
 def apply(self, prev_output, batch_size, seq_length, attention_mask):
     layer_input = prev_output
     attention_output = self.self_attention.call(
         layer_input,
         attention_mask,
         batch_size,
         seq_length,
     )
     with tf.compat.v1.variable_scope("intermediate"):
         intermediate_output = self.intermediate_ff(attention_output)
     with tf.compat.v1.variable_scope("output"):
         layer_output = self.output_ff(intermediate_output)
         layer_output = bc.dropout(layer_output,
                                   self.config.hidden_dropout_prob)
         layer_output = bc.layer_norm(layer_output + attention_output)
     return intermediate_output, layer_output
Example #2
def get_masked_lm_output_albert(model_config, input_tensor, output_weights,
                                positions, label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = bert_common.gather_indexes(input_tensor, positions)

    with tf.compat.v1.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.compat.v1.variable_scope("transform"):
            input_tensor = tf.keras.layers.Dense(
                model_config.embedding_size,
                activation=bert_common.get_activation(model_config.hidden_act),
                kernel_initializer=bert_common.create_initializer(
                    model_config.initializer_range))(input_tensor)
            input_tensor = bert_common.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.compat.v1.get_variable(
            "output_bias",
            shape=[model_config.vocab_size],
            initializer=tf.compat.v1.zeros_initializer())
        print("output_weights", output_weights.shape)
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=model_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            input_tensor=log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(input_tensor=label_weights *
                                  per_example_loss)
        denominator = tf.reduce_sum(input_tensor=label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
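
The masking trick described in the comments above is easiest to see with concrete numbers. The snippet below is a minimal standalone sketch (toy shapes and values, not part of the original example) that reproduces the weighting with plain TensorFlow ops: padded prediction slots carry a `label_weights` of 0.0 and therefore drop out of the averaged loss.

import tensorflow as tf

# Toy setup: 3 prediction slots over a vocabulary of 5; the last slot is padding.
log_probs = tf.math.log(tf.constant([[0.7, 0.1, 0.1, 0.05, 0.05],
                                     [0.2, 0.5, 0.1, 0.1, 0.1],
                                     [0.2, 0.2, 0.2, 0.2, 0.2]]))
label_ids = tf.constant([0, 1, 0])            # the padded slot's label is arbitrary
label_weights = tf.constant([1.0, 1.0, 0.0])  # 0.0 marks the padded prediction

one_hot_labels = tf.one_hot(label_ids, depth=5, dtype=tf.float32)
per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=-1)
loss = tf.reduce_sum(label_weights * per_example_loss) / (
    tf.reduce_sum(label_weights) + 1e-5)
# Only the two real predictions contribute: loss ≈ (-log 0.7 - log 0.5) / 2.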
Example #3

def self_attention_with_add(layer_input, attention_mask, config, batch_size,
                            seq_length, hidden_size, initializer, values,
                            add_locations):
    attention_head_size = int(hidden_size / config.num_attention_heads)
    with tf.compat.v1.variable_scope("attention"):
        attention_heads = []
        with tf.compat.v1.variable_scope("self"):
            attention_head = bc.attention_layer(
                from_tensor=layer_input,
                to_tensor=layer_input,
                attention_mask=attention_mask,
                num_attention_heads=config.num_attention_heads,
                size_per_head=attention_head_size,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_2d_tensor=True,
                batch_size=batch_size,
                from_seq_length=seq_length,
                to_seq_length=seq_length)
            attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
            attention_output = attention_heads[0]
        else:
            # In the case where we have other sequences, we just concatenate
            # them to the self-attention head before the projection.
            attention_output = tf.concat(attention_heads, axis=-1)

        # [batch*seq_length, hidden_dim] , [batch, n_locations]
        attention_output = tf.tensor_scatter_nd_add(attention_output,
                                                    add_locations, values)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.compat.v1.variable_scope("output"):
            attention_output = bc.dense(hidden_size,
                                        initializer)(attention_output)
            attention_output = bc.dropout(attention_output,
                                          config.hidden_dropout_prob)
            attention_output = bc.layer_norm(attention_output + layer_input)
    return attention_output
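
The only difference from a plain self-attention block is the `tf.tensor_scatter_nd_add` call: the rows of the flattened [batch*seq_length, hidden_dim] activation selected by `add_locations` get `values` added to them before the output projection. Below is a minimal standalone sketch of that op, with made-up shapes rather than the ones used here.

import tensorflow as tf

# 6 rows (standing in for batch*seq_length) of hidden size 4, all zeros.
attention_output = tf.zeros([6, 4])
add_locations = tf.constant([[1], [4]])   # each index picks one full row
values = tf.constant([[1., 1., 1., 1.],
                      [2., 2., 2., 2.]])  # one update vector per index
patched = tf.tensor_scatter_nd_add(attention_output, add_locations, values)
# patched[1] == [1, 1, 1, 1] and patched[4] == [2, 2, 2, 2]; other rows are unchanged.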
Example #4
    def __call__(self, inputs):
        from_tensor, to_tensor_list, attention_mask = inputs

        attention_output = attention_layer(
            from_tensor=from_tensor,
            to_tensor_list=to_tensor_list,
            query_ff=self.sub_layers['query'],
            key_ff=self.sub_layers['key'],
            value_ff=self.sub_layers['value'],
            attention_mask=attention_mask,
            num_attention_heads=self.num_attention_heads,
            size_per_head=self.attention_head_size,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        )

        attention_output = self.sub_layers['output'](attention_output)
        attention_output = bc.dropout(attention_output,
                                      self.hidden_dropout_prob)
        attention_output = bc.layer_norm(attention_output + from_tensor.matrix)
        return attention_output
Example #5
    def call(self, layer_input, attention_mask, batch_size, seq_length):
        attention_heads = []
        with tf.compat.v1.variable_scope("attention"):
            with tf.compat.v1.variable_scope("self"):
                attention_head = bc.attention_layer2(
                    from_tensor=layer_input,
                    to_tensor=layer_input,
                    query_ff=self.query_layer,
                    key_ff=self.key_layer,
                    value_ff=self.value_layer,
                    attention_mask=attention_mask,
                    num_attention_heads=self.num_attention_heads,
                    size_per_head=self.attention_head_size,
                    attention_probs_dropout_prob=self.
                    attention_probs_dropout_prob,
                    do_return_2d_tensor=True,
                    batch_size=batch_size,
                    from_seq_length=seq_length,
                    to_seq_length=seq_length)
                attention_heads.append(attention_head)

            attention_output = None
            if len(attention_heads) == 1:
                attention_output = attention_heads[0]
            else:
                # In the case where we have other sequences, we just concatenate
                # them to the self-attention head before the projection.
                attention_output = tf.concat(attention_heads, axis=-1)

            # Run a linear projection of `hidden_size` then add a residual
            # with `layer_input`.
            with tf.compat.v1.variable_scope("output"):
                attention_output = self.output_layer(attention_output)
                attention_output = bc.dropout(attention_output,
                                              self.hidden_dropout_prob)
                attention_output = bc.layer_norm(attention_output +
                                                 layer_input)
        return attention_output
Example #6

    def forward_layer_with_added(self, prev_output, added_value, locations):
        hidden_size = self.config.hidden_size
        layer_input = prev_output
        attention_output = self_attention_with_add(
            layer_input, self.attention_mask, self.config, self.batch_size,
            self.seq_length, hidden_size, self.initializer, added_value,
            locations)

        with tf.compat.v1.variable_scope("intermediate"):
            intermediate_output = bc.dense(
                self.config.intermediate_size,
                self.initializer,
                activation=bc.get_activation(
                    self.config.hidden_act))(attention_output)

        with tf.compat.v1.variable_scope("output"):
            layer_output = bc.dense(hidden_size,
                                    self.initializer)(intermediate_output)
            layer_output = bc.dropout(layer_output,
                                      self.config.hidden_dropout_prob)
            layer_output = bc.layer_norm(layer_output + attention_output)
            prev_output = layer_output
        return intermediate_output, layer_output
Example #7
 def __call__(self, inputs):
     intermediate_output = self.intermediate_ff(inputs)
     layer_output = self.output_ff(intermediate_output)
     layer_output = bc.dropout(layer_output, self.hidden_dropout_prob)
     layer_output = bc.layer_norm(layer_output + inputs)
     return layer_output
Example #8
def transformer_model(input_tensor,
                      attention_mask=None,
                      input_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      mr_num_route=10,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      is_training=True,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

    This is almost an exact implementation of the original Transformer encoder.

    See the original paper:
    https://arxiv.org/abs/1706.03762

    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
            seq_length], with 1 for positions that can be attended to and 0 in
            positions that should not be.
        hidden_size: int. Hidden size of the Transformer.
        num_hidden_layers: int. Number of layers (blocks) in the Transformer.
        num_attention_heads: int. Number of attention heads in the Transformer.
        mr_num_route: int. Number of routes the "mr_key" head can choose from; the
            selected route indexes the learned `ext_tensor` slices used by the MR
            layers.
        intermediate_size: int. The size of the "intermediate" (a.k.a., feed
            forward) layer.
        intermediate_act_fn: function. The non-linear activation function to apply
            to the output of the intermediate/feed-forward layer.
        hidden_dropout_prob: float. Dropout probability for the hidden layers.
        attention_probs_dropout_prob: float. Dropout probability of the attention
            probabilities.
        initializer_range: float. Range of the initializer (stddev of truncated
            normal).
        is_training: bool. If True, the route key is sampled from the "mr_key"
            logits; otherwise the argmax route is taken.
        do_return_all_layers: Whether to also return all layers or just the final
            layer.

    Returns:
        A tuple `(output, key)`. `output` is a float Tensor of shape [batch_size,
        seq_length, hidden_size] holding the final hidden layer of the Transformer
        (or a list with one such Tensor per layer if `do_return_all_layers` is
        True), and `key` is the route index produced by the "mr_key" head of the
        last non-MR layer.

    Raises:
        ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    initializer = create_initializer(initializer_range)

    ext_tensor = tf.compat.v1.get_variable(
        "ext_tensor",
        shape=[num_hidden_layers, mr_num_route, EXT_SIZE, hidden_size],
        initializer=initializer)
    ext_tensor_inter = tf.compat.v1.get_variable(
        "ext_tensor_inter",
        shape=[num_hidden_layers, mr_num_route, intermediate_size],
        initializer=initializer)
    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                         (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    def is_mr_layer(layer_idx):
        return layer_idx > 1

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        if not is_mr_layer(layer_idx):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                layer_input = prev_output

                with tf.compat.v1.variable_scope("attention"):
                    attention_heads = []
                    with tf.compat.v1.variable_scope("self"):
                        attention_head = attention_layer(
                                from_tensor=layer_input,
                                to_tensor=layer_input,
                                attention_mask=attention_mask,
                                num_attention_heads=num_attention_heads,
                                size_per_head=attention_head_size,
                                attention_probs_dropout_prob=attention_probs_dropout_prob,
                                initializer_range=initializer_range,
                                do_return_2d_tensor=True,
                                batch_size=batch_size,
                                from_seq_length=seq_length,
                                to_seq_length=seq_length)
                        attention_heads.append(attention_head)

                    attention_output = None
                    if len(attention_heads) == 1:
                        attention_output = attention_heads[0]
                    else:
                        # In the case where we have other sequences, we just concatenate
                        # them to the self-attention head before the projection.
                        attention_output = tf.concat(attention_heads, axis=-1)

                    # Run a linear projection of `hidden_size` then add a residual
                    # with `layer_input`.
                    with tf.compat.v1.variable_scope("output"):
                        attention_output = dense(hidden_size, initializer)(attention_output)
                        attention_output = dropout(attention_output, hidden_dropout_prob)
                        attention_output = layer_norm(attention_output + layer_input)

                # The activation is only applied to the "intermediate" hidden layer.
                with tf.compat.v1.variable_scope("intermediate"):
                    intermediate_output = dense(intermediate_size, initializer,
                                                activation=intermediate_act_fn)(attention_output)

                # Down-project back to `hidden_size` then add the residual.
                with tf.compat.v1.variable_scope("output"):
                    layer_output = dense(hidden_size, initializer)(intermediate_output)
                    layer_output = dropout(layer_output, hidden_dropout_prob)
                    layer_output = layer_norm(layer_output + attention_output)
                    prev_output = layer_output
                    all_layer_outputs.append(layer_output)

                with tf.compat.v1.variable_scope("mr_key"):
                    key_output = tf.keras.layers.Dense(
                        mr_num_route,
                        kernel_initializer=create_initializer(initializer_range))(intermediate_output)
                    key_output = dropout(key_output, hidden_dropout_prob)

                    if is_training:
                        key = tf.random.categorical(key_output, 1)  # [batch*seq_length, 1]
                        key = tf.reshape(key, [-1])
                    else:
                        key = tf.math.argmax(input=key_output, axis=1)

        else: # Case MR layer
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                layer_input = prev_output
                ext_slice = tf.gather(ext_tensor[layer_idx], key)
                ext_interm_slice = tf.gather(ext_tensor_inter[layer_idx], key)
                print("ext_slice (batch*seq, ", ext_slice.shape)
                with tf.compat.v1.variable_scope("attention"):
                    attention_heads = []
                    with tf.compat.v1.variable_scope("self"):
                        attention_head = attention_layer_w_ext(
                            from_tensor=layer_input,
                            to_tensor=layer_input,
                            attention_mask=attention_mask,
                            ext_slice=ext_slice,
                            num_attention_heads=num_attention_heads,
                            size_per_head=attention_head_size,
                            attention_probs_dropout_prob=attention_probs_dropout_prob,
                            initializer_range=initializer_range,
                            do_return_2d_tensor=True,
                            batch_size=batch_size,
                            from_seq_length=seq_length,
                            to_seq_length=seq_length)
                        attention_head = attention_head + ext_slice[:,EXT_ATT_OUT,:]
                        attention_heads.append(attention_head)

                    attention_output = None
                    if len(attention_heads) == 1:
                        attention_output = attention_heads[0]
                    else:
                        # In the case where we have other sequences, we just concatenate
                        # them to the self-attention head before the projection.
                        attention_output = tf.concat(attention_heads, axis=-1)

                    # Run a linear projection of `hidden_size` then add a residual
                    # with `layer_input`.
                    with tf.compat.v1.variable_scope("output"):
                        attention_output = dense(hidden_size, initializer)(attention_output)
                        attention_output = dropout(attention_output, hidden_dropout_prob)
                        attention_output = attention_output + ext_slice[:,EXT_ATT_PROJ,:]
                        attention_output = layer_norm(attention_output + layer_input)

                # The activation is only applied to the "intermediate" hidden layer.
                with tf.compat.v1.variable_scope("intermediate"):
                    intermediate_output = dense(intermediate_size, initializer,
                                                activation=intermediate_act_fn)(attention_output)
                    intermediate_output = ext_interm_slice + intermediate_output
                # Down-project back to `hidden_size` then add the residual.
                with tf.compat.v1.variable_scope("output"):
                    layer_output = dense(hidden_size, initializer)(intermediate_output)
                    layer_output = layer_output + ext_slice[:, EXT_LAYER_OUT,:]
                    layer_output = dropout(layer_output, hidden_dropout_prob)
                    layer_output = layer_norm(layer_output + attention_output)
                    prev_output = layer_output
                    all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs, key
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output, key
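
For orientation, here is a rough sketch of how `transformer_model` might be called. It is not taken from the repository: the embeddings are random, the [batch_size, seq_length, seq_length] attention mask is built by broadcasting a per-token padding mask, and since the function creates variables through `tf.compat.v1.get_variable`, it is meant to run while building a TF1-style graph (or under `tf.compat.v1` compatibility mode).

import tensorflow as tf

batch_size, seq_length, hidden = 2, 8, 768
embeddings = tf.random.normal([batch_size, seq_length, hidden])
input_mask = tf.ones([batch_size, seq_length])  # 1.0 for real tokens, 0.0 for padding

# Broadcast [batch, seq] -> [batch, seq, seq] as the docstring requires.
attention_mask = tf.ones([batch_size, seq_length, 1]) * tf.expand_dims(input_mask, 1)

all_layers, key = transformer_model(
    input_tensor=embeddings,
    attention_mask=attention_mask,
    hidden_size=hidden,
    num_hidden_layers=4,
    num_attention_heads=12,
    mr_num_route=10,
    do_return_all_layers=True)
# `key` is the route chosen by the "mr_key" head of the last non-MR layer.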