Example #1
0
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False,
                     previor_bplayer=None,
                     name_or_scope=None):
    """
    Looks up words embeddings for id tensor.
    Args:
        input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
          ids.
        vocab_size: int. Size of the embedding vocabulary.
        embedding_size: int. Width of the word embeddings.
        initializer_range: float. Embedding initialization range.
        word_embedding_name: string. Name of the embedding table.
        use_one_hot_embeddings: bool. If True, use one-hot method for word
          embeddings. If False, use `tf.gather()`.
        previor_bplayer: previor_bplayer
        name_or_scope: name_or_scope
    Returns:
        float Tensor of shape [batch_size, seq_length, embedding_size].
    """
    # This function assumes that the input is of shape [batch_size, seq_length,
    # num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    with tf.variable_scope(name_or_scope, default_name='embedding_lookup'):

        with tf.variable_scope("table") as table_scope:
            embedding_table = tf.get_variable(
                name=word_embedding_name,
                shape=[vocab_size, embedding_size],
                initializer=utils.create_initializer(initializer_range))
            bplayer_table = BPLayer(embedding_table, table_scope,
                                    [previor_bplayer])

        with tf.variable_scope("lookup") as lookup_scope:
            if input_ids.shape.ndims == 2:
                input_ids = tf.expand_dims(input_ids, axis=[-1])
            flat_input_ids = tf.reshape(input_ids, [-1])
            if use_one_hot_embeddings:
                one_hot_input_ids = tf.one_hot(flat_input_ids,
                                               depth=vocab_size)
                output = tf.matmul(one_hot_input_ids, embedding_table)
            else:
                output = tf.gather(embedding_table, flat_input_ids)
            input_shape = utils.get_shape_list(input_ids)
            output = tf.reshape(
                output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
            bplayer_output = BPLayer(output, lookup_scope, [bplayer_table])
        return (output, bplayer_output, embedding_table, bplayer_table)
Example #2
0
 def __init__(self,
              hidden_size,
              num_attention_heads,
              attention_probs_dropout_prob=0.1,
              initializer_range=0.02,
              **kwargs):
     super().__init__(**kwargs)
     assert hidden_size % num_attention_heads == 0
     self.d_k = hidden_size // num_attention_heads
     self.hidden_size = hidden_size
     self.num_attention_heads = num_attention_heads
     self.drop_out = Dropout(attention_probs_dropout_prob)
     self.q_matrix = Dense(self.hidden_size,
                           kernel_initializer=create_initializer(initializer_range),
                           name='query')
     self.k_matrix = Dense(self.hidden_size,
                           kernel_initializer=create_initializer(initializer_range),
                           name='key')
     self.v_matrix = Dense(self.hidden_size,
                           kernel_initializer=create_initializer(initializer_range),
                           name='value')
Example #3
0
def feedforward_func(input_tensor,
                     intermediate_size,
                     initializer_range,
                     hidden_size,
                     hidden_dropout_prob,
                     intermediate_act_fn=utils.gelu):
    # The activation is only applied to the "intermediate" hidden layer.
    with tf.variable_scope("feedforward") as scope:
        intermediate_output = tf.layers.dense(
            input_tensor,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=utils.create_initializer(initializer_range),
            name="intermediate_dense")
        # Down-project back to `hidden_size` then add the residual.
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=utils.create_initializer(initializer_range),
            name="intermediate_output")
        layer_output = utils.dropout(layer_output, hidden_dropout_prob)
        layer_output = utils.layer_norm(layer_output + input_tensor)
    return layer_output, scope
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights, prev_bplayers=None):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions") as prediction_scope:
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=utils.get_activation(bert_config.hidden_act),
                kernel_initializer=utils.create_initializer(
                    bert_config.initializer_range))
            input_tensor = utils.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator
        loss_bplayer = BPLayer(loss, prediction_scope, backward_layers=prev_bplayers)

    return (loss, per_example_loss, log_probs, loss_bplayer)
def get_next_sentence_output(bert_config, input_tensor, labels, prev_bplayers=None):
    """Get loss and log probs for the next sentence prediction."""

    # Simple binary classification. Note that 0 is "next sentence" and 1 is
    # "random sentence". This weight matrix is not used after pre-training.
    with tf.variable_scope("cls/seq_relationship") as scope:
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, bert_config.hidden_size],
            initializer=utils.create_initializer(bert_config.initializer_range))
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        loss_bplayer = BPLayer(loss, scope, backward_layers=prev_bplayers)
        return (loss, per_example_loss, log_probs, loss_bplayer)
Example #6
0
def attention_func(input_tensor, attention_mask, hidden_size,
                   hidden_dropout_prob, num_attention_heads,
                   attention_head_size, attention_probs_dropout_prob,
                   initializer_range, batch_size, seq_length):
    attention_heads = []
    with tf.variable_scope("attention") as scope:
        with tf.variable_scope("self"):
            attention_head = attention_layer(
                from_tensor=input_tensor,
                to_tensor=input_tensor,
                attention_mask=attention_mask,
                num_attention_heads=num_attention_heads,
                size_per_head=attention_head_size,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                do_return_2d_tensor=True,
                batch_size=batch_size,
                from_seq_length=seq_length,
                to_seq_length=seq_length)
            attention_heads.append(attention_head)
        if len(attention_heads) == 1:
            attention_output = attention_heads[0]
        else:
            # In the case where we have other sequences, we just concatenate
            # them to the self-attention head before the projection.
            attention_output = tf.concat(attention_heads, axis=-1)
        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
            attention_output = tf.layers.dense(
                attention_output,
                hidden_size,
                kernel_initializer=utils.create_initializer(initializer_range))
            attention_output = utils.dropout(attention_output,
                                             hidden_dropout_prob)
            attention_output = utils.layer_norm(attention_output +
                                                input_tensor)
    return attention_output, scope
Example #7
0
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            previor_bplayer=None,
                            name_or_scope=None):
    """
    Performs various post-processing on a word embedding tensor.
    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table variable
        for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for the
        position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table variable
        for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output tensor.
      previor_bplayer: bplayer.
      name_or_scope: string or scope.
    Returns:
      float tensor with same shape as `input_tensor`.
    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    with tf.variable_scope(name_or_scope,
                           default_name='embedding_postprocessor') as scope:
        input_shape = utils.get_shape_list(input_tensor, expected_rank=3)
        batch_size = input_shape[0]
        seq_length = input_shape[1]
        width = input_shape[2]

        output = input_tensor

        if use_token_type:
            if token_type_ids is None:
                raise ValueError("`token_type_ids` must be specified if"
                                 "`use_token_type` is True.")
            token_type_table = tf.get_variable(
                name=token_type_embedding_name,
                shape=[token_type_vocab_size, width],
                initializer=utils.create_initializer(initializer_range))
            # This vocab will be small so we always do one-hot here, since it is always
            # faster for a small vocabulary.
            flat_token_type_ids = tf.reshape(token_type_ids, [-1])
            one_hot_ids = tf.one_hot(flat_token_type_ids,
                                     depth=token_type_vocab_size)
            token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
            token_type_embeddings = tf.reshape(token_type_embeddings,
                                               [batch_size, seq_length, width])
            output += token_type_embeddings

        if use_position_embeddings:
            assert_op = tf.assert_less_equal(seq_length,
                                             max_position_embeddings)
            with tf.control_dependencies([assert_op]):
                full_position_embeddings = tf.get_variable(
                    name=position_embedding_name,
                    shape=[max_position_embeddings, width],
                    initializer=utils.create_initializer(initializer_range))
                # Since the position embedding table is a learned variable, we create it
                # using a (long) sequence length `max_position_embeddings`. The actual
                # sequence length might be shorter than this, for faster training of
                # tasks that do not have long sequences.
                #
                # So `full_position_embeddings` is effectively an embedding table
                # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
                # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
                # perform a slice.
                position_embeddings = tf.slice(full_position_embeddings,
                                               [0, 0], [seq_length, -1])
                num_dims = len(output.shape.as_list())

                # Only the last two dimensions are relevant (`seq_length` and `width`), so
                # we broadcast among the first dimensions, which is typically just
                # the batch size.
                position_broadcast_shape = []
                for _ in range(num_dims - 2):
                    position_broadcast_shape.append(1)
                position_broadcast_shape.extend([seq_length, width])
                position_embeddings = tf.reshape(position_embeddings,
                                                 position_broadcast_shape)
                output += position_embeddings

        output = utils.layer_norm_and_dropout(output, dropout_prob)
        bplayer_output = BPLayer(output, scope, [previor_bplayer])
    return output, bplayer_output
Example #8
0
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.
      This is an implementation of multi-headed attention based on "Attention
      is all you Need". If `from_tensor` and `to_tensor` are the same, then
      this is self-attention. Each timestep in `from_tensor` attends to the
      corresponding sequence in `to_tensor`, and returns a fixed-with vector.
      This function first projects `from_tensor` into a "query" tensor and
      `to_tensor` into "key" and "value" tensors. These are (effectively) a list
      of tensors of length `num_attention_heads`, where each tensor is of shape
      [batch_size, seq_length, size_per_head].
      Then, the query and key tensors are dot-producted and scaled. These are
      softmaxed to obtain attention probabilities. The value tensors are then
      interpolated by these probabilities, then concatenated back to a single
      tensor and returned.
      In practice, the multi-headed attention are done with transposes and
      reshapes rather than actual separate tensors.
      Args:
        from_tensor: float Tensor of shape [batch_size, from_seq_length,
          from_width].
        to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
        attention_mask: (optional) int32 Tensor of shape [batch_size,
          from_seq_length, to_seq_length]. The values should be 1 or 0. The
          attention scores will effectively be set to -infinity for any positions in
          the mask that are 0, and will be unchanged for positions that are 1.
        num_attention_heads: int. Number of attention heads.
        size_per_head: int. Size of each attention head.
        query_act: (optional) Activation function for the query transform.
        key_act: (optional) Activation function for the key transform.
        value_act: (optional) Activation function for the value transform.
        attention_probs_dropout_prob: (optional) float. Dropout probability of the
          attention probabilities.
        initializer_range: float. Range of the weight initializer.
        do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
          * from_seq_length, num_attention_heads * size_per_head]. If False, the
          output will be of shape [batch_size, from_seq_length, num_attention_heads
          * size_per_head].
        batch_size: (Optional) int. If the input is 2D, this might be the batch size
          of the 3D version of the `from_tensor` and `to_tensor`.
        from_seq_length: (Optional) If the input is 2D, this might be the seq length
          of the 3D version of the `from_tensor`.
        to_seq_length: (Optional) If the input is 2D, this might be the seq length
          of the 3D version of the `to_tensor`.
      Returns:
        float Tensor of shape [batch_size, from_seq_length,
          num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
          true, this will be of shape [batch_size * from_seq_length,
          num_attention_heads * size_per_head]).
      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = utils.get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = utils.get_shape_list(to_tensor, expected_rank=[2, 3])
    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")
    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")
    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`
    from_tensor_2d = utils.reshape_to_matrix(from_tensor)
    to_tensor_2d = utils.reshape_to_matrix(to_tensor)
    # `query_layer` = [B*F, N*H]
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=utils.create_initializer(initializer_range))
    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=utils.create_initializer(initializer_range))
    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=utils.create_initializer(initializer_range))
    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)
    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)
    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))
    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder
    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores)
    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = utils.dropout(attention_probs,
                                    attention_probs_dropout_prob)
    # `value_layer` = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])
    # `value_layer` = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
    # `context_layer` = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)
    # `context_layer` = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(context_layer, [
            batch_size * from_seq_length, num_attention_heads * size_per_head
        ])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])
    return context_layer
Example #9
0
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for BertModel.

        Args:
          config: `BertConfig` instance.
          is_training: bool. true for training model, false for eval model. Controls
            whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
          token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings.
          scope: (optional) variable scope. Defaults to "revbert".

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = utils.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name="revbert"):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.embedding_output, self.embedding_output_bplayer,
                 self.embedding_table,
                 self.embedding_table_bplayer) = layers.embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output, self.embedding_output_bplayer = layers.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob,
                    previor_bplayer=self.embedding_output_bplayer)

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = utils.create_attention_mask_from_input_mask(
                    input_ids, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    input_bplayer=self.embedding_output_bplayer,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=utils.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output, self.sequence_output_bplayer = self.all_encoder_layers[
                -1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler") as pooler_scope:
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=utils.create_initializer(
                        config.initializer_range))
                self.pooled_output_bplayer = BPLayer(
                    self.pooled_output, pooler_scope,
                    [self.sequence_output_bplayer])