Example 1
def einsum_via_matmul(input_tensor, w, num_inner_dims):
    """Implements einsum via matmul and reshape ops.

    Args:
      input_tensor: float Tensor of shape [<batch_dims>, <inner_dims>].
      w: float Tensor of shape [<inner_dims>, <outer_dims>].
      num_inner_dims: int. Number of trailing dimensions to contract over.

    Returns:
      float Tensor of shape [<batch_dims>, <outer_dims>].
    """
    input_shape = get_shape_list(input_tensor)
    w_shape = get_shape_list(w)
    batch_dims = input_shape[: -num_inner_dims]
    inner_dims = input_shape[-num_inner_dims:]
    outer_dims = w_shape[num_inner_dims:]
    inner_dim = np.prod(inner_dims)
    outer_dim = np.prod(outer_dims)
    if num_inner_dims > 1:
        input_tensor = tf.reshape(input_tensor, batch_dims + [inner_dim])
    if len(w_shape) > 2:
        w = tf.reshape(w, [inner_dim, outer_dim])
    ret = tf.matmul(input_tensor, w)
    if len(outer_dims) > 1:
        ret = tf.reshape(ret, batch_dims + outer_dims)
    return ret
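A quick way to sanity-check the helper is to compare it against tf.einsum directly. A minimal sketch, assuming TF 1.x and the module-level get_shape_list helper used above:

import numpy as np
import tensorflow as tf

x = tf.constant(np.random.rand(2, 3, 4, 5), tf.float32)  # [B, F, N, D]
w = tf.constant(np.random.rand(4, 5, 7), tf.float32)     # [N, D, H]
out = einsum_via_matmul(x, w, num_inner_dims=2)           # contracts N and D
ref = tf.einsum("BFND,NDH->BFH", x, w)
with tf.Session() as sess:
    a, b = sess.run([out, ref])
    np.testing.assert_allclose(a, b, rtol=1e-5)           # both are [2, 3, 7]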
Example 2
def dot_product_attention(q, k, v, bias, dropout_rate=0.0):
    """Dot-product attention.

    Args:
      q: Tensor with shape [..., length_q, depth_k].
      k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must
        match with q.
      v: Tensor with shape [..., length_kv, depth_v]. Leading dimensions must
        match with q.
      bias: the attention mask Tensor, already reshaped as in attention_layer
        (1.0 for positions to attend to, 0.0 for masked positions).
      dropout_rate: a float.

    Returns:
      Tensor with shape [..., length_q, depth_v].
    """
    logits = tf.matmul(q, k, transpose_b=True)  # [..., length_q, length_kv]
    logits = tf.multiply(logits, 1.0 / math.sqrt(float(get_shape_list(q)[-1])))
    if bias is not None:
        # `bias` is the reshaped input mask (e.g. [B, 1, T, 1] for a rank-4
        # q; see attention_layer); expand it to one row per query position.
        from_shape = get_shape_list(q)
        if len(from_shape) == 4:
            broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], 1], tf.float32)
        elif len(from_shape) == 5:
            # from_shape = [B, N, Block_num, block_size, depth]
            broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], from_shape[3],
                                      1], tf.float32)
        else:
            raise ValueError('q must have rank 4 or 5; got shape %s' %
                             (from_shape,))

        bias = tf.matmul(broadcast_ones,
                         tf.cast(bias, tf.float32), transpose_b=True)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - bias) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        logits += adder

    attention_probs = tf.nn.softmax(logits, name="attention_probs")
    attention_probs = dropout(attention_probs, dropout_rate)
    return tf.matmul(attention_probs, v)
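A minimal self-attention call, assuming TF 1.x and the module's dropout helper; the mask shape [B, 1, T, 1] matches the reshape performed in attention_layer below:

q = tf.random_normal([2, 4, 3, 8])   # [B, N=4 heads, F=3, H=8]
k = tf.random_normal([2, 4, 3, 8])
v = tf.random_normal([2, 4, 3, 8])
mask = tf.ones([2, 1, 3, 1])         # 1.0 = attend, 0.0 = masked
ctx = dot_product_attention(q, k, v, mask, dropout_rate=0.0)  # [2, 4, 3, 8]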
Example 3
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
    """Looks up words embeddings for id tensor.
    Args:
      input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
        ids.
      vocab_size: int. Size of the embedding vocabulary.
      embedding_size: int. Width of the word embeddings.
      initializer_range: float. Embedding initialization range.
      word_embedding_name: string. Name of the embedding table.
      use_one_hot_embeddings: bool. If True, use one-hot method for word
        embeddings. If False, use `tf.nn.embedding_lookup()`.
    Returns:
      float Tensor of shape [batch_size, seq_length, embedding_size], and the
      embedding table of shape [vocab_size, embedding_size].
    """
    # This function assumes that the input is of shape [batch_size, seq_length,
    # num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=-1)  # [batch_size, seq_length, 1]

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(
            initializer_range))  # [vocab_size, embedding_size]

    if use_one_hot_embeddings:
        flat_input_ids = tf.reshape(input_ids, [-1])  # [batch_size * seq_length]
        one_hot_input_ids = tf.one_hot(
            flat_input_ids,
            depth=vocab_size)  # [batch_size * seq_length, vocab_size]
        output = tf.matmul(
            one_hot_input_ids,
            embedding_table)  # [batch_size * seq_length, embedding_size]
    else:
        output = tf.gather(
            embedding_table,
            input_ids)  # [batch_size, seq_length, 1, embedding_size]

    input_shape = get_shape_list(input_ids)  # [batch_size, seq_length, 1]

    output = tf.reshape(output, input_shape[0:-1] +
                        [input_shape[-1] * embedding_size
                         ])  # [batch_size, seq_length, embedding_size]

    return output, embedding_table
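Usage is a one-liner; a minimal sketch (TF 1.x graph mode, inside a fresh variable scope):

ids = tf.constant([[1, 5, 3], [2, 0, 7]], tf.int32)  # [batch=2, seq=3]
emb, table = embedding_lookup(ids, vocab_size=16, embedding_size=8)
# emb: [2, 3, 8], table: [16, 8]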
Example 4
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   head_size,
                   initializer,
                   activation,
                   use_einsum,
                   name=None):
    """A dense layer with 3D kernel.

    Args:
      input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
      num_attention_heads: Number of attention heads.
      head_size: The size per attention head.
      initializer: Kernel initializer.
      activation: Activation function.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
      name: The name scope of this layer.

    Returns:
      float Tensor of shape [batch, seq_length, num_attention_heads,
      head_size].
    """

    input_shape = get_shape_list(input_tensor)
    hidden_size = input_shape[2]

    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel",
            shape=[hidden_size, num_attention_heads * head_size],
            initializer=initializer)
        w = tf.reshape(w, [hidden_size, num_attention_heads, head_size])
        b = tf.get_variable(
            name="bias",
            shape=[num_attention_heads * head_size],
            initializer=tf.zeros_initializer)
        b = tf.reshape(b, [num_attention_heads, head_size])
        if use_einsum:
            ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
        else:
            ret = einsum_via_matmul(input_tensor, w, 1)
        ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret
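This is, for example, how attention_layer below builds its per-head query projection. A sketch (TF 1.x), assuming the module's create_initializer helper:

x = tf.random_normal([2, 5, 16])  # [batch, seq_length, hidden_size]
q = dense_layer_3d(x, num_attention_heads=4, head_size=8,
                   initializer=create_initializer(0.02),
                   activation=None, use_einsum=True, name="query")
# q: [2, 5, 4, 8]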
Example 5
def dense_layer_2d(input_tensor,
                   output_size,
                   initializer,
                   activation,
                   use_einsum,
                   num_attention_heads=1,
                   name=None):
    """A dense layer with 2D kernel.

    Args:
      input_tensor: Float tensor with rank 3.
      output_size: The size of output dimension.
      initializer: Kernel initializer.
      activation: Activation function.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
      num_attention_heads: number of attention heads in the attention layer
        (unused here).
      name: The name scope of this layer.

    Returns:
      float Tensor of shape [batch, seq_length, output_size].
    """
    del num_attention_heads  # unused
    input_shape = get_shape_list(input_tensor)
    hidden_size = input_shape[2]
    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel",
            shape=[hidden_size, output_size],
            initializer=initializer)
        b = tf.get_variable(
            name="bias", shape=[output_size], initializer=tf.zeros_initializer)
        if use_einsum:
            ret = tf.einsum("BFH,HO->BFO", input_tensor, w)
        else:
            ret = tf.matmul(input_tensor, w)
        ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret
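A companion sketch: transformer_model below uses this layer to map embeddings up to the hidden size when the two widths differ (again assuming create_initializer from this module):

x = tf.random_normal([2, 5, 8])   # [batch, seq_length, embedding_size]
h = dense_layer_2d(x, 16, create_initializer(0.02), None,
                   use_einsum=True, name="embedding_hidden_mapping_in")
# h: [2, 5, 16]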
Example 6
    def __call__(self, is_training, input_ids, input_mask, segment_ids, labels,
                 num_labels):
        if not is_training:
            # Disable dropout at eval time. Note this mutates self.config in
            # place (the BertModel constructor below deep-copies its config
            # instead).
            self.config.hidden_dropout_prob = 0.0

        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size, seq_length = input_shape

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if segment_ids is None:
            segment_ids = tf.zeros(shape=[batch_size, seq_length],
                                   dtype=tf.int32)

        with tf.variable_scope('embeddings'):
            word_embedding_output, output_embedding_table = embedding_lookup(
                input_ids=input_ids,
                vocab_size=self.config.vocab_size,
                embedding_size=self.config.embedding_size,
                initializer_range=self.config.initializer_range,
                word_embedding_name='word_embeddings')

            # embedding_output = embedding_postprocessor(
            #     input_tensor=word_embedding_output,
            #     use_token_type=True,
            #     token_type_ids=segment_ids,
            #     token_type_vocab_size=self.config.type_vocab_size,
            #     token_type_embedding_name='token_type_embeddings',
            #     use_position_embeddings=True,
            #     position_embedding_name='position_embeddings',
            #     initializer_range=self.config.initializer_range,
            #     max_position_embeddings=self.config.max_position_embeddings,
            #     dropout_prob=self.config.hidden_dropout_prob)

            embedding_output = tf.multiply(
                word_embedding_output,
                tf.expand_dims(tf.cast(input_mask, tf.float32), -1))

        pooled_outputs = []
        for i, filter_size in enumerate(self.config.filter_sizes):
            with tf.variable_scope('conv-maxpool-%s' % filter_size):
                filters = tf.get_variable(
                    name='filters',
                    initializer=tf.truncated_normal(shape=[
                        filter_size, self.config.embedding_size,
                        self.config.hidden_size
                    ]),
                    dtype=tf.float32)
                bias = tf.get_variable(
                    name='bias',
                    initializer=tf.constant(0.,
                                            shape=[self.config.hidden_size]),
                    dtype=tf.float32)
                conv = tf.nn.conv1d(embedding_output,
                                    filters=filters,
                                    stride=1,
                                    padding='VALID',
                                    dilations=self.config.dilations,
                                    name='conv')
                conv = tf.nn.bias_add(conv, bias)
                conv = tf.nn.relu(conv, name='relu')
                pooled = tf.nn.max_pool1d(
                    conv,
                    ksize=[
                        1,
                        self.config.max_position_embeddings - filter_size + 1,
                        1
                    ],
                    strides=1,
                    padding='VALID',
                    name='pool')
                pooled_outputs.append(pooled)

        concat_pooled_outputs = tf.concat(pooled_outputs, axis=-1)

        with tf.variable_scope('flat-dropout-fnn'):
            flatten_size = (concat_pooled_outputs.shape[-1] *
                            concat_pooled_outputs.shape[-2])
            h_flatten = tf.reshape(concat_pooled_outputs,
                                   shape=[-1, flatten_size],
                                   name='flatten')
            h_dropout = tf.nn.dropout(h_flatten,
                                      rate=self.config.hidden_dropout_prob)
            output_weights = tf.get_variable(
                name='output_weights',
                shape=[flatten_size, num_labels],
                initializer=tf.truncated_normal_initializer(stddev=0.02),
                dtype=tf.float32)
            output_bias = tf.get_variable(name='output_bias',
                                          shape=[num_labels],
                                          initializer=tf.zeros_initializer(),
                                          dtype=tf.float32)

        with tf.variable_scope('loss'):
            logits = tf.matmul(h_dropout, output_weights)
            logits = tf.nn.bias_add(logits, output_bias)

            probabilities = tf.nn.softmax(logits, axis=-1)
            predictions = tf.argmax(probabilities,
                                    axis=-1,
                                    output_type=tf.int32)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(labels,
                                        depth=num_labels,
                                        dtype=tf.float32)

            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)

            batch_loss = tf.reduce_mean(per_example_loss)

            return batch_loss, per_example_loss, probabilities, logits, predictions
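The shape arithmetic of the conv-maxpool stack above is easy to lose track of; here is a standalone sanity check with hypothetical sizes (TF 1.15-style tf.nn.conv1d/max_pool1d; filter width 3, L=128, E=64, H=256):

x = tf.random_normal([2, 128, 64])                         # [B, L, E]
f = tf.random_normal([3, 64, 256])                         # [width, E, H]
c = tf.nn.conv1d(x, filters=f, stride=1, padding='VALID')  # [2, 126, 256]
p = tf.nn.max_pool1d(c, ksize=[1, 126, 1], strides=1,
                     padding='VALID')                      # [2, 1, 256]
# One such tensor per filter size is concatenated and flattened into the
# [B, n_filters * H] features fed to the output projection.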
Example 7
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 use_einsum=True,
                 scope=None):
        """Constructor for BertModel.

        Args:
          config: `BertConfig` instance.
          is_training: bool. true for training model, false for eval model. Controls
            whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
          token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.nn.embedding_lookup() for the word embeddings.
          use_einsum: (optional) bool. Whether to use einsum or reshape+matmul
            for the dense layers.
          scope: (optional) variable scope. Defaults to "bert".

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name='bert',
                               reuse=tf.AUTO_REUSE):
            with tf.variable_scope('embeddings'):
                # Perform embedding lookup on the word ids.
                self.embedding_output, self.embedding_table = embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob,
                    use_one_hot_embeddings=use_one_hot_embeddings)

            with tf.variable_scope('encoder'):
                # # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # # mask of shape [batch_size, seq_length, seq_length] which is used
                # # for the attention scores.
                # attention_mask = create_attention_mask_from_input_mask(
                #     input_ids, input_mask)
                attention_mask = input_mask

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        config.attention_probs_dropout_prob),
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True,
                    use_einsum=use_einsum,
                    share_parameter_across_layers=False)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope('pooler'):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained.
                first_token_tensor = tf.squeeze(
                    self.sequence_output[:, 0:1, :], axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=create_initializer(
                        config.initializer_range))
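A minimal end-to-end sketch in the spirit of the original BERT docstring (the BertConfig constructor arguments shown here are assumptions about this module):

ids = tf.constant([[31, 51, 99], [15, 5, 0]], tf.int32)
mask = tf.constant([[1, 1, 1], [1, 1, 0]], tf.int32)
model = BertModel(config=BertConfig(vocab_size=32000), is_training=False,
                  input_ids=ids, input_mask=mask)
pooled = model.pooled_output      # [2, hidden_size]
sequence = model.sequence_output  # [2, 3, hidden_size]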
Example 8
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            use_one_hot_embeddings=True):
    """Performs various post-processing on a word embedding tensor.
    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table variable
        for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for the
        position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table variable
        for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output tensor.
      use_one_hot_embeddings: bool. If True, use the one-hot method for token
        type embeddings. If False, use `tf.nn.embedding_lookup()`.
    Returns:
      float tensor with same shape as `input_tensor`.
    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if"
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary, unless converting to tflite model.
        if use_one_hot_embeddings:
            flat_token_type_ids = tf.reshape(token_type_ids, [-1])
            one_hot_ids = tf.one_hot(flat_token_type_ids,
                                     depth=token_type_vocab_size)
            token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
            token_type_embeddings = tf.reshape(token_type_embeddings,
                                               [batch_size, seq_length, width])
        else:
            token_type_embeddings = tf.nn.embedding_lookup(
                token_type_table, token_type_ids)
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
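A minimal sketch (TF 1.x; layer_norm_and_dropout and create_initializer come from this module):

emb = tf.random_normal([2, 3, 8])  # output of embedding_lookup: [B, S, width]
seg = tf.zeros([2, 3], tf.int32)   # single-segment token type ids
out = embedding_postprocessor(emb, use_token_type=True, token_type_ids=seg,
                              token_type_vocab_size=2, dropout_prob=0.0)
# out: [2, 3, 8]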
Example 9
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    use_einsum=True):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        to_seq_length] (the raw input mask; it is reshaped to
        [batch_size, 1, to_seq_length, 1] below). The values should be 1 or 0.
        The attention scores will effectively be set to -infinity for any
        positions in the mask that are 0, and will be unchanged for positions
        that are 1.
      num_attention_heads: int. Number of attention heads.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      initializer_range: float. Range of the weight initializer.
      batch_size: (Optional) int. If the input is 2D, this might be the batch size
        of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq length
        of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq length
        of the 3D version of the `to_tensor`.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.

    Returns:
      float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
        size_per_head].

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    size_per_head = from_shape[-1] // num_attention_heads

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if batch_size is None or from_seq_length is None or to_seq_length is None:
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # `query_layer` = [B, F, N, H]
    q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), query_act,
                       use_einsum, "query")

    # `key_layer` = [B, T, N, H]
    k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), key_act,
                       use_einsum, "key")
    # `value_layer` = [B, T, N, H]
    v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), value_act,
                       use_einsum, "value")
    q = tf.transpose(q, [0, 2, 1, 3])
    k = tf.transpose(k, [0, 2, 1, 3])
    v = tf.transpose(v, [0, 2, 1, 3])
    if attention_mask is not None:
        # Reshape the [B, T] input mask to [B, 1, T, 1] so that
        # dot_product_attention can expand and broadcast it over the
        # [B, N, F, T] attention logits.
        attention_mask = tf.reshape(
            attention_mask, [batch_size, 1, to_seq_length, 1])

    # `new_embeddings` = [B, N, F, H]
    new_embeddings = dot_product_attention(q, k, v, attention_mask,
                                           attention_probs_dropout_prob)

    return tf.transpose(new_embeddings, [0, 2, 1, 3])
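Self-attention is the common case (from_tensor == to_tensor); a sketch with a 2D input mask, matching how transformer_model passes it through:

x = tf.random_normal([2, 5, 16])  # [B, F, hidden]
mask = tf.ones([2, 5], tf.int32)  # [B, T] input mask
with tf.variable_scope("attention"):
    ctx = attention_layer(x, x, attention_mask=mask, num_attention_heads=4)
# ctx: [2, 5, 4, 4] = [B, F, N, size_per_head]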
Example 10
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_hidden_groups=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      inner_group_num=1,
                      intermediate_act_fn='gelu',
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      share_parameter_across_layers=False,
                      use_einsum=True):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".
    This is almost an exact implementation of the original Transformer encoder.
    See the original paper:
    https://arxiv.org/abs/1706.03762
    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length] (the input mask), with 1 for positions that can be
        attended to and 0 for positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_hidden_groups: int. Number of groups for the hidden layers;
        parameters in the same group are shared.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      inner_group_num: int. Number of inner repetitions of attention and FFN.
      intermediate_act_fn: function. The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the attention
        probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to return all layers or just the final
        layer.
      share_parameter_across_layers: Whether to share the attention_ffn_block
        parameters across layers (ALBERT-style).
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] (the final
      hidden layer), or a list of all layer outputs if `do_return_all_layers`
      is True.
    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = hidden_size // num_attention_heads
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer adds residual connections at every layer, so the input
    # width needs to match the hidden size.
    if input_width != hidden_size:
        prev_output = dense_layer_2d(
            input_tensor, hidden_size, create_initializer(initializer_range),
            None, use_einsum=use_einsum, name="embedding_hidden_mapping_in")
    else:
        prev_output = input_tensor

    all_layer_outputs = []

    with tf.variable_scope('transformer',
                           reuse=tf.AUTO_REUSE
                           if share_parameter_across_layers else False):
        for layer_idx in range(num_hidden_layers):
            # When sharing, layers are bucketed into `num_hidden_groups`
            # variable groups; otherwise each layer gets its own scope so that
            # get_variable() does not collide across layers.
            group_idx = int(layer_idx / num_hidden_layers * num_hidden_groups) \
                if share_parameter_across_layers else layer_idx
            with tf.variable_scope('group_%d' % group_idx):
                with tf.name_scope('layer_%d' % layer_idx):
                    layer_output = prev_output
                    if share_parameter_across_layers:
                        for inner_group_idx in range(inner_group_num):
                            with tf.variable_scope("inner_group_%d" % inner_group_idx):
                                layer_output = attention_ffn_block(
                                    layer_input=layer_output,
                                    hidden_size=hidden_size,
                                    attention_mask=attention_mask,
                                    num_attention_heads=num_attention_heads,
                                    attention_head_size=attention_head_size,
                                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                                    intermediate_size=intermediate_size,
                                    intermediate_act_fn=intermediate_act_fn,
                                    initializer_range=initializer_range,
                                    hidden_dropout_prob=hidden_dropout_prob,
                                    use_einsum=use_einsum)
                                prev_output = layer_output
                                all_layer_outputs.append(layer_output)
                    else:
                        layer_output = attention_ffn_block(
                            layer_input=layer_output,
                            hidden_size=hidden_size,
                            attention_mask=attention_mask,
                            num_attention_heads=num_attention_heads,
                            attention_head_size=attention_head_size,
                            attention_probs_dropout_prob=attention_probs_dropout_prob,
                            intermediate_size=intermediate_size,
                            intermediate_act_fn=intermediate_act_fn,
                            initializer_range=initializer_range,
                            hidden_dropout_prob=hidden_dropout_prob,
                            use_einsum=use_einsum)
                        prev_output = layer_output
                        all_layer_outputs.append(layer_output)
    if do_return_all_layers:
        return all_layer_outputs
    else:
        return all_layer_outputs[-1]
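Putting it together, a minimal encoder run (TF 1.x; get_activation and attention_ffn_block are defined elsewhere in this module):

emb = tf.random_normal([2, 5, 32])  # [batch, seq_length, hidden_size]
mask = tf.ones([2, 5], tf.int32)    # 2D input mask, as in BertModel above
seq = transformer_model(emb, attention_mask=mask, hidden_size=32,
                        num_hidden_layers=2, num_attention_heads=4,
                        intermediate_size=64,
                        intermediate_act_fn=get_activation('gelu'))
# seq: [2, 5, 32] (the final layer, since do_return_all_layers=False)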