Example #1
def assert_rank(tensor, expected_rank, name=None):
    """Check whether the rank of the 'tensor' matches the expected_rank.
        Remember rank is the number of the total dimensions.
    
    Args:
        tensor: A tf.Tensor to check.
        expected_rank: Python integer or list of intefers.
        name: (optional) name for the error.
    """
    if name is None:
        name = tensor.name
    
    expected_rank_dict = {}
    # save the given rank into the dictionary
    if isinstance(expected_rank, six.integer_types):
        expected_rank_dict[expected_rank] = True
    else:
        for rank in expected_rank:
            expected_rank_dict[rank] = True
    
    tensor_rank = tensor.shape.ndims
    if tensor_rank not in expected_rank_dict:
        scope_name = tf.get_variable_scope().name
        _error('For the tensor {} in scope {}, the tensor rank {} '
               '(shape = {}) is not equal to the expected rank {}'.format(
                   name, scope_name, tensor_rank, str(tensor.shape), str(expected_rank)))
        raise ValueError
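A minimal usage sketch (assuming TensorFlow 1.x and this repo's module-level `_error` logger are available): `assert_rank` returns silently when the rank matches and raises `ValueError` otherwise.

import tensorflow as tf  # TF 1.x

x = tf.zeros([2, 8, 16])           # rank 3
assert_rank(x, 3)                  # passes
assert_rank(x, [2, 3], name='x')   # passes: rank may be either 2 or 3
# assert_rank(x, 2)                # would log an error and raise ValueError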
Example #2
    def _process_input(self, input_ids, max_length):
        assert len(input_ids) < max_length, _error(
            'Input length must be shorter than the maximum length')

        question_length = len(input_ids)
        input_ids += [
            vocab_idx['<mask>'] for _ in range(max_length - question_length)
        ]
        # input_ids[2] = 330
        # input_ids[3] = 1470
        # input_ids[4] = 1048
        # input_ids[5] = 116
        input_mask = [1 for _ in range(question_length)
                      ] + [0 for _ in range(max_length - question_length)]
        input_mask = create_mask_for_seq(input_mask, question_length,
                                         max_length - question_length)

        # input_mask = []
        # for _ in range(max_length):
        #     temp = [1 for _ in range(question_length)] + [0 for _ in range(max_length - question_length)]
        #     input_mask.append(temp)

        masked_lm_positions = [
            question_length + idx
            for idx in range(max_length - question_length)
        ]

        return [input_ids], [input_mask], [masked_lm_positions]
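To make the shapes concrete, here is a small worked illustration of the padding logic above with toy values (10 standing in for `vocab_idx['<mask>']`; `create_mask_for_seq` then expands the 1-D mask into the 2-D attention mask):

max_length = 8
input_ids = [5, 2, 7]                 # a tokenized question of length 3
question_length = len(input_ids)
MASK = 10                             # assumed id of '<mask>'

input_ids += [MASK] * (max_length - question_length)
# input_ids -> [5, 2, 7, 10, 10, 10, 10, 10]

input_mask = [1] * question_length + [0] * (max_length - question_length)
# input_mask -> [1, 1, 1, 0, 0, 0, 0, 0]

masked_lm_positions = [question_length + idx for idx in range(max_length - question_length)]
# masked_lm_positions -> [3, 4, 5, 6, 7]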
Example #3
    def __setattr__(self, name, value):
        if hasattr(self, name):
            wrapped_setatrr(self, name, value)
        else:
            _error('Adding new attribute {} is forbidden'.format(name))
            raise AttributeError
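This override is the usual frozen-attributes guard: existing attributes can still be reassigned through the wrapped original setter, while unknown names are rejected. A minimal self-contained sketch of the same idea (the `wrapped_setatrr` capture and `_error` logger here are assumptions standing in for the repo's own definitions):

import logging

_error = logging.error
wrapped_setatrr = object.__setattr__    # assumed to be captured like this elsewhere in the repo

class FrozenConfig:
    def __init__(self, hidden_size):
        wrapped_setatrr(self, 'hidden_size', hidden_size)   # bypass the guard during init

    def __setattr__(self, name, value):
        if hasattr(self, name):
            wrapped_setatrr(self, name, value)
        else:
            _error('Adding new attribute {} is forbidden'.format(name))
            raise AttributeError(name)

cfg = FrozenConfig(hidden_size=768)
cfg.hidden_size = 1024       # fine: the attribute already exists
# cfg.hiden_size = 1024      # a typo would raise AttributeError instead of passing silently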
Example #4
def train_generator(path, max_length, train_type=None):
    """"This is the entrance to the input_fn."""
    if train_type == 'seq2seq':
        questions, answers, max_length = parse_data(path, train_type)
        for que, ans in zip(questions, answers):
            # 1. input_ids
            # use <mask> to represent the answer instead of the original 0
            input_ids = que + [vocab_idx['<mask>'] for _ in range(len(ans))
                               ]  # que + ans(represented by <mask>)
            padding_part = [
                vocab_idx['<padding>']
                for _ in range(max_length - len(input_ids))
            ]
            # input_ids -> [5, 2, 1, 10, 10, 10, 0, 0, 0, 0], where supposing 10 is <mask>, 0 is <padding>
            input_ids += padding_part  # [max_length]

            # 2. mask for attention scores
            # the original input_mask in the paper looks like [1, 1, 1, 0, 0]; a different mask is used here,
            # where 1 indicates the question part and 0 indicates both the answer part and the padding part.
            input_mask = [1 for _ in range(len(que))
                          ] + [0 for _ in range(len(ans + padding_part))]
            input_mask = create_mask_for_seq(input_mask, len(que),
                                             len(ans + padding_part))

            # 3. masked_lm_positions stores the relative positions of the answer part and the padding part.
            # no padding masked_lm_positions -> [[2, 3, 4, 5, 6, 7, 8, 9], [5, 6, 7, 8, 9]]
            masked_lm_positions = [
                len(que) + idx for idx in range(max_length - len(que))
            ]
            # ATTENTION: the `masked_lm_positions` above may differ in length across a batch,
            # because the questions have different lengths, so padding `masked_lm_positions`
            # up to max_length is necessary. The padded positions are fake, but the following
            # `mask_lm_weights` will zero them out.
            # Supposing max_length is 10, the unpadded masked_lm_positions example above becomes,
            # after the next step:
            # [[2, 3, 4, 5, 6, 7, 8, 9, 0, 0], [5, 6, 7, 8, 0, 0, 0, 0, 0, 0]]
            # `0` is used for padding because during training the `masked_lm_positions` will be added
            # to the `flat_offset`; since the padded positions do not exist, any other value could
            # cause an index error.
            masked_lm_positions += [
                0 for idx in range(max_length - len(masked_lm_positions))
            ]

            # 4. mask_lm_ids -> the actual labels
            mask_lm_ids = ans + padding_part
            # padding the `mask_lm_ids` to the max_length
            mask_lm_ids += [
                vocab_idx['<padding>']
                for _ in range(max_length - len(mask_lm_ids))
            ]

            # 5. mask_lm_weights -> used when calculating the actual loss, which helps to ignore the padding part
            mask_lm_weights = [1 for _ in range(len(ans))
                               ] + [0 for _ in range(len(padding_part))]
            # padding
            mask_lm_weights += [
                0 for _ in range(max_length - len(mask_lm_weights))
            ]

            # print(input_ids)
            # print(input_mask)
            # print(masked_lm_positions)
            # print(mask_lm_ids)
            # print(mask_lm_weights)
            # input()

            features = {
                'input_ids': input_ids,
                'input_mask': input_mask,
                'masked_lm_positions': masked_lm_positions,
                'masked_lm_ids': mask_lm_ids,
                'masked_lm_weights': mask_lm_weights
            }
            yield features
    elif train_type == 'lm':
        sentences, max_length = parse_data(path, train_type)
        for line in sentences:
            input_ids = [vocab_idx['S']]
            padding_part = [
                vocab_idx['<padding>']
                for _ in range(max_length - len(input_ids))
            ]
            input_ids += padding_part

            input_mask = create_mask_for_lm(max_length)

            masked_lm_positions = [
                idx + 1 for idx in range(len(input_ids) - 1)
            ]
            masked_lm_positions += [
                masked_lm_positions[-1] + 1 + idx
                for idx in range(len(input_ids) - len(masked_lm_positions))
            ]
            mask_lm_ids = line + [
                vocab_idx['<padding>']
                for _ in range(len(input_ids) - len(line) - 1)
            ]
            mask_lm_ids += [
                vocab_idx['<padding>']
                for _ in range(len(input_ids) - len(mask_lm_ids))
            ]
            mask_lm_weights = [1 for _ in range(len(line))] + [
                0 for _ in range(len(input_ids) - len(line) - 1)
            ]
            mask_lm_weights += [
                0 for _ in range(len(input_ids) - len(mask_lm_weights))
            ]

            # print(line)
            # print(len(input_ids))
            # print(len(input_mask))
            # print(len(masked_lm_positions))
            # print(len(mask_lm_ids))
            # print(len(mask_lm_weights))
            # input()

            features = {
                'input_ids': input_ids,
                'input_mask': input_mask,
                'masked_lm_positions': masked_lm_positions,
                'masked_lm_ids': mask_lm_ids,
                'masked_lm_weights': mask_lm_weights
            }
            yield features
    else:
        _error('Unsupported train type: {}'.format(train_type))
        raise ValueError
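Example #7 below hands `path`, `batch_size`, `repeat_num`, and `max_length` to a `train_input_fn`; a minimal sketch of how such an input_fn could wrap this generator with `tf.data.Dataset.from_generator` (the repo's actual implementation is not shown here, and the output types are assumptions):

import functools
import tensorflow as tf  # TF 1.x

def train_input_fn(path, batch_size, repeat_num, max_length):
    # All five feature entries yielded above are Python int lists, so int32 is assumed here.
    output_types = {
        'input_ids': tf.int32,
        'input_mask': tf.int32,
        'masked_lm_positions': tf.int32,
        'masked_lm_ids': tf.int32,
        'masked_lm_weights': tf.int32,
    }
    dataset = tf.data.Dataset.from_generator(
        functools.partial(train_generator, path, max_length, 'seq2seq'),
        output_types=output_types)
    return dataset.repeat(repeat_num).batch(batch_size, drop_remainder=True)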
Example #5
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=3,
                            token_type_embedding_name='token_type_embeddings',
                            use_positional_embeddings=True,
                            positional_embedding_type='normal',
                            pre_positional_embeddings=None,
                            positional_embedding_name='position_embeddings',
                            initializer_range=0.01,
                            max_positional_embeddings=512,
                            dropout_prob=0.01):
    """Performs some preprocessing on the word embeddings.
    
    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, embedding_size].
        use_token_type: bool. Whether to add segment (token type) embeddings. Although the original
            comments use 'token' in the name, token_type_ids are really segment ids, e.g.
            [[0, 0, 1], [0, 1, 0]], where 0 refers to segment 1, 1 refers to segment 2, and the
            trailing 0 in the second row refers to padding.
        token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        token_type_vocab_size: the number of token types.
        use_positional_embeddings: bool. Whether to add positional embeddings.
        positional_embedding_type: ['normal', 'trigonometrical'].
        pre_positional_embeddings: (optional) pre-computed positional embeddings.
        positional_embedding_name: string. The name of the positional embedding table variable.
        initializer_range: float. Range of the weight initializer.
        max_positional_embeddings: int. Maximum sequence length for each sentence, which should be equal to or longer than the sequence.
        dropout_prob: float. Dropout probability applied to the final output tensor.
    
    Returns:
        float Tensor with the identical shape as 'input_tensor'.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=[2,3])
    batch_size, seq_length, width = input_shape[0], input_shape[1], input_shape[2]

    # initialize the output with the input tensor in case no extra embeddings are added
    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            _error('`token_type_ids` must be specified if `use_token_type` is True.')
            raise ValueError
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        
        token_type_embeddings = tf.nn.embedding_lookup(token_type_table, token_type_ids)
        output += token_type_embeddings

    if use_positional_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_positional_embeddings)
        with tf.control_dependencies([assert_op]):
            full_positional_embeddings = tf.get_variable(
                name=positional_embedding_name,
                shape=[max_positional_embeddings, width],
                initializer=create_initializer(initializer_range))
            
            # the full_positional_embeddings table is created for the maximum sequence length,
            # but the actual sequence may be shorter than the maximum, so slicing is necessary.
            positional_embeddings = tf.slice(full_positional_embeddings, [0, 0], [seq_length, -1])
            output += positional_embeddings
    
    output = layer_norm_and_dropout(output, dropout_prob)
    return output
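The slice yields a [seq_length, width] tensor that is added to the [batch_size, seq_length, width] output via broadcasting, so every example in the batch receives the same position vectors. A small NumPy sketch of that broadcast (toy shapes, illustration only):

import numpy as np

batch_size, seq_length, width = 2, 4, 3
output = np.zeros((batch_size, seq_length, width))               # word (+ token type) embeddings
full_positional_embeddings = np.arange(10 * width, dtype=float).reshape(10, width)  # table for max_positional_embeddings = 10

# same effect as tf.slice(full_positional_embeddings, [0, 0], [seq_length, -1])
positional_embeddings = full_positional_embeddings[:seq_length, :]   # [seq_length, width]
output = output + positional_embeddings                              # broadcasts over batch_size
print(output.shape)                                                  # (2, 4, 3)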
Example #6
    def model_fn(features, labels, mode, params):
        """this is prototype syntax, all parameters are necessary."""
        # obtain the data
        _info('*** Features ***')
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features['input_ids']       # [batch_size, seq_length]
        input_mask = features['input_mask']     # [batch_size, seq_length]

        # if mode != tf.estimator.ModeKeys.PREDICT:
        #     # segment_idx = features['segment_dis']
        #     masked_lm_positions = features['masked_lm_positions']   # [batch_size, seq_length], specify the answer
        #     masked_lm_ids = features['masked_lm_ids']               # [batch_size, answer_seq_length], specify the answer labels
        #     masked_lm_weights = features['masked_lm_weights']        # [batch_size, seq_length], [1, 1, 0], 0 refers to the mask
        #     # next_sentence_labels = features['next_sentence_labels']
        # else:
        masked_lm_positions = features['masked_lm_positions']
        masked_lm_ids = features['masked_lm_ids']
        masked_lm_weights = features['masked_lm_weights']

        if bert_config.train_type == 'seq2seq':
            _info('Training seq2seq task.')
        elif bert_config.train_type == 'lm':
            _info('Training language model task.')
  
        # build model
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask)
        
        # compute loss
        loss, pre_loss, log_probs = get_masked_lm_output(bert_config,
                                                         model.get_sequence_output(),
                                                         model.embedding_table,
                                                         model.projection_table,
                                                         masked_lm_positions,
                                                         masked_lm_ids,
                                                         masked_lm_weights,
                                                         mode)
  
        if mode == tf.estimator.ModeKeys.PREDICT:
            masked_lm_predictions = tf.reshape(tf.argmax(log_probs, axis=-1, output_type=tf.int32), [-1])
            output_spec = tf.estimator.EstimatorSpec(mode, predictions=masked_lm_predictions)
        else:
            if mode == tf.estimator.ModeKeys.TRAIN:
                # restore from the checkpoint;
                # tf.estimator normally restores from the model directory automatically,
                # so this explicit step is presumably for restoring pre-trained parameters
                tvars = tf.trainable_variables()
                initialized_variable_names = {}
                if init_checkpoint:
                    (assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

                _info('*** Trainable Variables ***')
                for var in tvars:
                    init_string = ''
                    if var.name in initialized_variable_names:
                        init_string = ', *INIT_FROM_CKPT*'
                    _info('name = {}, shape={}{}'.format(var.name, var.shape, init_string))
                
                train_op = optimization.create_optimizer(
                    loss, bert_config.learning_rate, num_train_steps)

                # learning_rate = tf.train.polynomial_decay(bert_config.learning_rate,
                #                                         tf.train.get_or_create_global_step(),
                #                                         num_train_steps,
                #                                         end_learning_rate=0.0,
                #                                         power=1.0,
                #                                         cycle=False)
                # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
                # gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)
                # clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                # train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())
                output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
            elif mode == tf.estimator.ModeKeys.EVAL:
                # TODO define the metrics
                _error('to do ...')
                raise NotImplementedError

        return output_spec
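`model_fn` closes over `bert_config`, `init_checkpoint`, and `num_train_steps`, so it is normally produced by a builder like the `model_fn_builder` referenced in the next example. A minimal sketch of that closure pattern (the signature is inferred from the call `model_fn_builder(bert_config, None, bert_config.learning_rate, bert_config.num_train_steps)` below, not taken from the repo's definition):

def model_fn_builder(bert_config, init_checkpoint, learning_rate, num_train_steps):
    """Returns a `model_fn` closure for tf.estimator.Estimator."""

    def model_fn(features, labels, mode, params):
        # ... body as in Example #6, using bert_config, init_checkpoint,
        # learning_rate, and num_train_steps from the enclosing scope ...
        raise NotImplementedError('sketch only')

    return model_fn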
Example #7
    input_fn = functools.partial(train_input_fn,
                                 path=bert_config.data_path,
                                 batch_size=bert_config.batch_size,
                                 repeat_num=bert_config.num_train_steps,
                                 max_length=bert_config.max_length)

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth=True

    run_config = tf.contrib.tpu.RunConfig(
        session_config=gpu_config,
        keep_checkpoint_max=1,
        save_checkpoints_steps=10,
        model_dir=bert_config.model_dir)
    
    estimator = tf.estimator.Estimator(model_fn, config=run_config)
    estimator.train(input_fn)     # input_fn must be callable

def package_model(ckpt_path, pb_path):
    model_fn = model_fn_builder(bert_config, None, bert_config.learning_rate, bert_config.num_train_steps)
    estimator = tf.estimator.Estimator(model_fn, ckpt_path)
    estimator.export_saved_model(pb_path, server_input_receiver_fn)

if __name__ == '__main__':
    if sys.argv[1] == 'train':
        main()
    elif sys.argv[1] == 'package':
        package_model(str(PROJECT_PATH / 'models_lm'), str(PROJECT_PATH / 'models_deploy_lm'))
    else:
        _error('Unknown parameter: {}.'.format(sys.argv[1]))
        _info('Choose from [train | package].')
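`export_saved_model` needs a serving input receiver; the `server_input_receiver_fn` used in `package_model` is defined elsewhere in the repo. A hypothetical sketch of such a receiver for this model's feature layout (the placeholder names, dtypes, and shapes are assumptions, not the repo's actual definition):

def server_input_receiver_fn():
    max_length = bert_config.max_length
    # Placeholders mirror the feature keys consumed by model_fn; the exact shapes
    # (in particular input_mask, which may be 2-D per example) are assumptions.
    receiver_tensors = {
        'input_ids': tf.placeholder(tf.int32, [None, max_length], name='input_ids'),
        'input_mask': tf.placeholder(tf.int32, [None, max_length, max_length], name='input_mask'),
        'masked_lm_positions': tf.placeholder(tf.int32, [None, max_length], name='masked_lm_positions'),
        'masked_lm_ids': tf.placeholder(tf.int32, [None, max_length], name='masked_lm_ids'),
        'masked_lm_weights': tf.placeholder(tf.int32, [None, max_length], name='masked_lm_weights'),
    }
    return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)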
Example #8
def tranformer_model(input_tensor,
                     attention_mask=None,
                     hidden_size=1024,
                     num_hidden_layers=12,
                     num_attention_heads=12,
                     intermediate_size=3072,
                     intermediate_act_fn=_mh.gelu,
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     initializer_range=0.02,
                     do_return_all_layers=False,
                     share_parameter_across_layers=True):
    """Multi-head, multi-layer Transformer.
    
    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, seq_length],
            where 1 indicates the position can be attended and 0 indicates the position cannot be attended.
        hidden_size: int. Hidden size of the Transformer.
        num_hidden_layers: int. Number of layers in the Transformer.
        num_attention_heads: int. Number of attention heads in the Transformer.
        intermediate_size: int. The size of the feed forward layer.
        intermediate_act_fn: activation function after feed forward layer.
        hidden_dropout_prob: float.
        attention_probs_dropout_prob: float.
        initializer_range: float.
        do_return_all_layers: bool. Return the output from all the hidden layers or just the final layer.
        share_parameter_across_layers: bool. Whether share parameters across each attention layer.

    Returns:
        float Tensor of shape [batch_size, seq_length, hidden_size],
        or a list containing 'num_hidden_layers' float Tensors.
    """
    if hidden_size % num_attention_heads != 0:
        _error(
            'The hidden size {} cannot be divided by the number of attention heads {}'
            .format(hidden_size, num_attention_heads))
        raise ValueError

    # the hidden size for each head
    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = _mh.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # the residual connections are applied to the output of every layer,
    # so the hidden size (i.e. the width of each transformer block's output)
    # must equal input_width; at the first layer the residual input is the input tensor itself.
    # note the distinction between hidden_size and intermediate_size:
    # the intermediate (feed-forward) layer comes before the final hidden projection.
    if input_width != hidden_size:
        _error(
            'The width of the input tensor {} is not equal to the hidden size {}'
            .format(input_width, hidden_size))
        raise ValueError

    # create a list to save the output from each transformer layer
    prev_output = input_tensor  # [batch_size, seq_length, width]
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        if share_parameter_across_layers:
            name_variable_scope = 'layer_shared'
        else:
            name_variable_scope = 'layer_{}'.format(layer_idx)

        # share the parameters across layers when share_parameter_across_layers is True and this is not the first layer
        with tf.variable_scope(
                name_variable_scope,
                reuse=(share_parameter_across_layers and layer_idx > 0)):
            layer_input = prev_output
            with tf.variable_scope('attention'):
                attention_heads = []
                with tf.variable_scope('self'):
                    attention_head = self_attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=
                        attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                attention_output = attention_head
                # perform residual layer to finish the self-attention block
                with tf.variable_scope('output'):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=_mh.create_initializer(
                            initializer_range))
                    attention_output = _mh.dropout(attention_output,
                                                   hidden_dropout_prob)
                    attention_output = _mh.layer_norm(attention_output +
                                                      layer_input)

            # do double linear projection to enhance the context representation
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=_mh.create_initializer(
                        initializer_range))

            with tf.variable_scope('output'):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=_mh.create_initializer(
                        initializer_range))
                layer_output = _mh.dropout(layer_output, hidden_dropout_prob)
                layer_output = _mh.layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        return all_layer_outputs
    else:
        return all_layer_outputs[-1]
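The parameter sharing above relies on reusing a single fixed variable scope name after the first layer. A small self-contained TF 1.x sketch of that reuse trick (toy sizes, unrelated to the repo's classes):

import tensorflow as tf  # TF 1.x

x = tf.zeros([2, 4, 8])
for layer_idx in range(3):
    # with a fixed scope name and reuse=True after the first iteration,
    # every "layer" maps onto the same kernel/bias variables
    with tf.variable_scope('layer_shared', reuse=(layer_idx > 0)):
        x = tf.layers.dense(x, 8, name='dense')

print([v.name for v in tf.trainable_variables()])
# only one kernel/bias pair, e.g. ['layer_shared/dense/kernel:0', 'layer_shared/dense/bias:0']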
Example #9
def self_attention_layer(from_tensor,
                         to_tensor,
                         attention_mask=None,
                         num_attention_heads=1,
                         size_per_head=512,
                         query_act=None,
                         key_act=None,
                         value_act=None,
                         attention_probs_dropout_prob=0.0,
                         initializer_range=0.02,
                         batch_size=None,
                         from_seq_length=None,
                         to_seq_length=None):
    """Perform self-attention.
    
    Args:
        from_tensor: float Tensor of shape [batch_size, seq_length, width].
        to_tensor: float Tensor of shape [batch_size, seq_length, width].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, seq_length],
            where 1 indicates the position can be attended and 0 indicates the position cannot be attended.
        num_attention_heads: int. Number of attention heads in the Transformer.
        size_per_head: int. Size of each attention head.
        query_act: (optional) Activation function for the query transformer.
        key_act: (optional) Activation function for the key transformer.
        value_act: (optional) Activation function for the value transformer.
        attention_probs_dropout_prob: (optional) float.
        initializer_range: float.
        batch_size: (optional) int.
        from_seq_length: (optional) int.
        to_seq_length: (optional) int.
    
    Returns:
        float Tensor of shape [batch_size, from_seq_length, width].
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, size_per_head):
        """Change the order of axes. witdh = num_attention_heads * size_per_head.
        
        Args:
            input_tensor: float Tensor of shape [batch_size, seq_length, width].

        Returns:
            float Tensor of shape [batch_size, num_attention_heads, seq_length, size_per_head].
        """
        output_tensor = tf.reshape(
            input_tensor,
            [batch_size, seq_length, num_attention_heads, size_per_head])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    # check the rank
    from_shape = _mh.get_shape_list(from_tensor, expected_rank=3)
    to_shape = _mh.get_shape_list(to_tensor, expected_rank=3)

    if len(from_shape) != 3 or len(to_shape) != 3:
        _error(
            'The rank of `from_tensor` should match the rank of `to_tensor`, and should be 3'
        )
        raise ValueError

    # calculate the query, key, value
    # from_tensor: [batch_size, seq_length, width] -> query_layer: [batch_size, seq_length, num_attention_heads * size_per_head]
    # num_attention_heads * size_per_head == hidden_size == width
    query_layer = tf.layers.dense(
        from_tensor,
        num_attention_heads * size_per_head,
        activation=query_act,
        name='query',
        kernel_initializer=_mh.create_initializer(initializer_range))

    key_layer = tf.layers.dense(
        to_tensor,
        num_attention_heads * size_per_head,
        activation=key_act,
        name='key',
        kernel_initializer=_mh.create_initializer(initializer_range))

    value_layer = tf.layers.dense(
        to_tensor,
        num_attention_heads * size_per_head,
        activation=value_act,
        name='value',
        kernel_initializer=_mh.create_initializer(initializer_range))

    # [batch_size, seq_length, width] -> [batch_size, num_attention_heads, seq_length, size_per_head]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)

    # calculate the attention scores
    # [batch_size, num_attention_heads, from_seq_length, to_seq_length]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # [batch_size, seq_length, seq_length] -> [batch_size, 1, seq_length, seq_length]
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        adder = (1.0 - tf.cast(attention_mask, dtype=tf.float32)) * -10000.0
        attention_scores += adder

    attention_probs = tf.nn.softmax(attention_scores)
    attention_probs = _mh.dropout(attention_probs,
                                  attention_probs_dropout_prob)

    # calculate the context layer
    # [batch_size, num_attention_heads, to_seq_length, size_per_head]
    value_layer = transpose_for_scores(value_layer, batch_size,
                                       num_attention_heads, to_seq_length,
                                       size_per_head)
    context_layer = tf.matmul(attention_probs, value_layer)
    # [batch_size, from_seq_length, num_attention_heads, size_per_head]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    # [batch_size, from_seq_length, width]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

    return context_layer
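The adder above pushes masked positions to a large negative value so they get near-zero weight after the softmax. A small NumPy sketch of the score / mask / softmax / context pipeline (toy numbers, a single head):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

size_per_head = 4
query = np.random.randn(1, 3, size_per_head)      # [batch, from_seq, size_per_head]
key = np.random.randn(1, 3, size_per_head)        # [batch, to_seq, size_per_head]
value = np.random.randn(1, 3, size_per_head)

scores = query @ key.transpose(0, 2, 1) / np.sqrt(size_per_head)   # [1, 3, 3]
mask = np.array([[[1, 1, 0],
                  [1, 1, 0],
                  [1, 1, 0]]])                    # the last position may not be attended
scores += (1.0 - mask) * -10000.0                 # the same adder trick as above
probs = softmax(scores)                           # masked column gets ~0 probability
context = probs @ value                           # [1, 3, size_per_head]
print(np.round(probs, 3))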