def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=True,
                 scope=None,
                 embedded_input=None):
        """Constructor for BertModel.

        Args:
          config: `BertConfig` instance.
          is_training: bool. True for training model, False for eval model.
            Controls whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
          token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On the
            TPU it is much faster if this is True; on the CPU or GPU it is faster
            if this is False.
          scope: (optional) variable scope. Defaults to "bert".
          embedded_input: (optional) If provided, the embedding layer is skipped
            and the given embeddings are fed directly into the self-attention
            layers.

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        with tf.compat.v1.variable_scope(scope, default_name="bert"):
            with tf.compat.v1.variable_scope("embeddings"):
                if embedded_input is None:
                    # Perform embedding lookup on the word ids.
                    (self.embedding_output,
                     self.embedding_table) = embedding_lookup(
                         input_ids=input_ids,
                         vocab_size=config.vocab_size,
                         embedding_size=config.hidden_size,
                         initializer_range=config.initializer_range,
                         word_embedding_name="word_embeddings",
                         use_one_hot_embeddings=use_one_hot_embeddings)
                else:
                    self.embedding_output = embedded_input

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.compat.v1.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = create_attention_mask_from_input_mask(
                    input_ids, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.compat.v1.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.compat.v1.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=create_initializer(
                        config.initializer_range))
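# Usage sketch (illustrative, not part of the original snippet): assumes the full
# BERT modeling module is importable, i.e. BertConfig, BertModel, and the
# get_sequence_output()/get_pooled_output() accessors used in the later examples.
# The shapes and hyperparameter values below are placeholders.
example_config = BertConfig(vocab_size=30522,
                            hidden_size=768,
                            num_hidden_layers=12,
                            num_attention_heads=12,
                            intermediate_size=3072)
example_input_ids = tf.ones(shape=[8, 128], dtype=tf.int32)
example_model = BertModel(config=example_config,
                          is_training=False,
                          input_ids=example_input_ids,
                          use_one_hot_embeddings=False)
sequence_output = example_model.get_sequence_output()  # [8, 128, 768]
pooled_output = example_model.get_pooled_output()      # [8, 768]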
Example #2
def dense(
    inputs: tf.Tensor,
    units: int,
    activation: Optional[str],
    activation_noise: tf.Tensor,
    name: str,
    use_bias: bool,
    dropout_keep_rate: Optional[Union[float, tf.Tensor]] = None
) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Creates a dense, feed-forward layer with the given parameters.

    Args:
        inputs: The input tensor. Has the shape [B, ..., D]
        units: The number of output units. Denoted by K.
        activation: Optional activation function. If none, the activation is linear.
        activation_noise: Noise scale to apply to the final activations
        name: Name prefix for the created trainable variables.
        use_bias: Whether to add a bias to the output.
        dropout_keep_rate: Optional dropout to apply to the activations
    Returns:
        A tuple of 2 elements: (1) the transformed inputs in a [B, ..., K] tensor
            and (2) the transformed inputs without the activation function. This
            second entry is included for debugging purposes.
    """
    # Get the size of the input features, denoted by D
    input_units = inputs.get_shape()[-1]

    # Create the weight matrix
    W = tf.compat.v1.get_variable(
        name='{0}-kernel'.format(name),
        shape=[input_units, units],
        initializer=tf.compat.v1.initializers.glorot_uniform(),
        trainable=True)

    # Apply the given weights
    transformed = tf.matmul(inputs, W)  # [B, ..., K]

    # Add the bias if specified
    if use_bias:
        # Bias vector of size [K]
        b = tf.compat.v1.get_variable(
            name='{0}-bias'.format(name),
            shape=[1, units],
            initializer=tf.compat.v1.initializers.random_uniform(minval=-0.7,
                                                                 maxval=0.7),
            trainable=True)
        transformed = transformed + b

    pre_activation = transformed

    # Apply the activation function if specified
    activation_fn = get_activation(activation)
    if activation_fn is not None:
        transformed = activation_fn(transformed)

    # Apply noise regularization
    transformed = apply_noise(transformed, scale=activation_noise)

    if dropout_keep_rate is not None:
        transformed = tf.nn.dropout(transformed, rate=1.0 - dropout_keep_rate)

    return transformed, pre_activation
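# Usage sketch (illustrative): build one hidden layer with the `dense` helper
# above. Assumes TF1-style graph construction and that the module's
# get_activation/apply_noise helpers are in scope; the shapes are placeholders.
example_inputs = tf.compat.v1.placeholder(tf.float32, shape=[None, 32], name='inputs')
hidden, hidden_pre_activation = dense(inputs=example_inputs,
                                      units=64,
                                      activation='relu',
                                      activation_noise=tf.constant(0.0),
                                      name='hidden',
                                      use_bias=True,
                                      dropout_keep_rate=0.9)
# `hidden` has shape [None, 64]; `hidden_pre_activation` is the same projection
# before the ReLU and is returned only for debugging.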
Example #3
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=get_activation('gelu'),
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

    This is almost an exact implementation of the original Transformer encoder.

    See the original paper:
    https://arxiv.org/abs/1706.03762

    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
        seq_length], with 1 for positions that can be attended to and 0 for
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a. feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the attention
        probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to return all layers or just the final
        layer.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size], the final
      hidden layer of the Transformer, or a list of such Tensors (one per layer)
      if `do_return_all_layers` is True.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                         (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            with tf.compat.v1.variable_scope("attention"):
                attention_heads = []
                with tf.compat.v1.variable_scope("self"):
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just concatenate
                    # them to the self-attention head before the projection.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.compat.v1.variable_scope("output"):
                    attention_output = tf.compat.v1.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range))
                    attention_output = dropout(attention_output, hidden_dropout_prob)
                    attention_output = layer_norm(attention_output + layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.compat.v1.variable_scope("intermediate"):
                intermediate_output = tf.compat.v1.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.compat.v1.variable_scope("output"):
                layer_output = tf.compat.v1.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
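# Usage sketch (illustrative): run a small two-layer encoder over already-embedded
# inputs. Assumes the rest of the BERT modeling module (get_shape_list,
# attention_layer, create_attention_mask_from_input_mask, layer_norm, dropout,
# create_initializer, reshape_to_matrix/reshape_from_matrix) is in scope; the
# shapes and sizes are placeholders.
example_embeddings = tf.random.normal(shape=[2, 16, 128])         # [batch, seq, hidden]
example_input_mask = tf.ones(shape=[2, 16], dtype=tf.int32)
example_attention_mask = create_attention_mask_from_input_mask(
    example_embeddings, example_input_mask)
encoded = transformer_model(input_tensor=example_embeddings,
                            attention_mask=example_attention_mask,
                            hidden_size=128,
                            num_hidden_layers=2,
                            num_attention_heads=4,
                            intermediate_size=512)                 # [2, 16, 128]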
Example #4
def conv_1d(inputs: tf.Tensor, filter_width: int, stride: int,
            activation: Optional[str], activation_noise: float,
            dropout_keep_rate: tf.Tensor, use_dropout: bool,
            name: str) -> tf.Tensor:
    """
    Performs a 1d convolution over the given inputs.

    Args:
        inputs: A [B, T, D] tensor of features (D) for each seq element (T) and batch sample (B)
        filter_width: The width of the convolution filter. Must be at least one.
        stride: The convolution stride. Must be at least one.
        activation: The name of the activation function. If none, then we apply a linear activation.
        activation_noise: The noise to apply to the final activations.
        dropout_keep_rate: The dropout keep rate to apply to the transformed representation.
        use_dropout: Whether to apply dropout.
        name: The name of this layer.
    Returns:
        A [B, T', D] tensor that is the result of applying the 1d convolution
            filter to the inputs, where T' = ceil(T / stride) (so T' = T when
            the stride is 1).
    """
    assert filter_width >= 1, 'Must have a filter width of at least one. Got: {0}'.format(
        filter_width)
    assert stride >= 1, 'Must have a stride length of at least one. Got: {0}'.format(
        stride)

    with tf.variable_scope(name):
        # Create the (trainable) convolution filter
        num_features = inputs.get_shape()[-1]  # D
        conv_filter = tf.get_variable(
            shape=[filter_width, num_features, num_features],
            initializer=tf.glorot_uniform_initializer(),
            name='filter',
            dtype=tf.float32)

        # Create the (trainable) bias
        bias = tf.get_variable(shape=[1, 1, num_features],
                               initializer=tf.random_uniform_initializer(
                                   minval=-0.7, maxval=0.7),
                               name='bias',
                               dtype=tf.float32)

        # Apply the convolution filter, [B, T, D]
        transformed = tf.nn.conv1d(value=inputs,
                                   filters=conv_filter,
                                   stride=stride,
                                   padding='SAME',
                                   data_format='NWC')

        transformed = transformed + bias  # [B, T, D]

        # Apply the activation function, [B, T, D]
        activation_fn = get_activation(activation)
        if activation_fn is not None:
            transformed = activation_fn(transformed)

        # Apply the activation noise
        transformed = apply_noise(transformed, scale=activation_noise)

        # Apply dropout if specified, [B, T, D]
        if use_dropout:
            transformed = tf.nn.dropout(transformed,
                                        keep_prob=dropout_keep_rate)

        return transformed
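# Usage sketch (illustrative): apply the 1d convolution to a batch of sequences.
# Assumes TF1-style graph execution (tf.placeholder, keep_prob-based dropout) and
# the module's get_activation/apply_noise helpers; shapes are placeholders.
example_seq = tf.placeholder(tf.float32, shape=[None, 20, 16], name='seq')     # [B, T, D]
example_keep_rate = tf.placeholder_with_default(1.0, shape=[], name='keep_rate')
conv_out = conv_1d(inputs=example_seq,
                   filter_width=3,
                   stride=1,
                   activation='relu',
                   activation_noise=0.0,
                   dropout_keep_rate=example_keep_rate,
                   use_dropout=True,
                   name='conv')                                                 # [B, T, 16]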
Example #5
    def _complex_model(self, is_train: bool = False) -> tf.Tensor:
        models = ['nbow', 'rnn']  # nbow, cnn, rnn, bert
        attention = False
        embeddings = list()
        with tf.variable_scope("tree_encoder"):
            self._make_placeholders()

            self.placeholders['tokens_lengths'] = \
                tf.placeholder(tf.int32, shape=[None], name='tokens_lengths')
            self.placeholders['rnn_dropout_keep_rate'] = \
                tf.placeholder(tf.float32, shape=[], name='rnn_dropout_keep_rate')
            self.placeholders['rnn_recurrent_dropout_keep_rate'] = \
                tf.placeholder(tf.float32, shape=[], name='rnn_recurrent_dropout_keep_rate')

            common_flag = True
            if 'nbow' in models and 'rnn' in models:
                seq_tokens = self.placeholders['tokens']
                seq_tokens_embeddings = self.embedding_layer(seq_tokens)
                common_flag = False
            if 'nbow' in models:
                if common_flag:
                    seq_tokens_embeddings = self.embedding_layer(
                        self.placeholders['tokens'])
                seq_token_mask = self.placeholders['tokens_mask']
                seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1)  # B

                embedding = pool_sequence_embedding(
                    self.get_hyper('nbow_pool_mode').lower(),
                    sequence_token_embeddings=seq_tokens_embeddings,
                    sequence_lengths=seq_token_lengths,
                    sequence_token_masks=seq_token_mask)
                embeddings.append(embedding)
            if 'cnn' in models:
                if common_flag:
                    seq_tokens_embeddings = self.embedding_layer(
                        self.placeholders['tokens'])
                seq_tokens_embeddings = self.__add_position_encoding(
                    seq_tokens_embeddings)

                activation_fun = get_activation(
                    self.get_hyper('1dcnn_activation'))
                current_embeddings = seq_tokens_embeddings
                num_filters_and_width = zip(
                    self.get_hyper('1dcnn_layer_list'),
                    self.get_hyper('1dcnn_kernel_width'))
                for (layer_idx,
                     (num_filters,
                      kernel_width)) in enumerate(num_filters_and_width):
                    next_embeddings = tf.layers.conv1d(
                        inputs=current_embeddings,
                        filters=num_filters,
                        kernel_size=kernel_width,
                        padding="same")

                    # Add residual connections past the first layer.
                    if self.get_hyper('1dcnn_add_residual_connections'
                                      ) and layer_idx > 0:
                        next_embeddings += current_embeddings

                    current_embeddings = activation_fun(next_embeddings)
                    current_embeddings = tf.nn.dropout(
                        current_embeddings,
                        keep_prob=self.placeholders['dropout_keep_rate'])

                seq_token_mask = self.placeholders['tokens_mask']
                seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1)  # B
                embedding = pool_sequence_embedding(
                    self.get_hyper('1dcnn_pool_mode').lower(),
                    sequence_token_embeddings=current_embeddings,
                    sequence_lengths=seq_token_lengths,
                    sequence_token_masks=seq_token_mask)
                embeddings.append(embedding)
            if 'rnn' in models:
                if common_flag:
                    seq_tokens = self.placeholders['tokens']
                    seq_tokens_embeddings = self.embedding_layer(seq_tokens)
                seq_tokens_lengths = self.placeholders['tokens_lengths']
                rnn_final_state, token_embeddings = self._encode_with_rnn(
                    seq_tokens_embeddings, seq_tokens_lengths)

                output_pool_mode = self.get_hyper('rnn_pool_mode').lower()
                if output_pool_mode == 'rnn_final':
                    embedding = rnn_final_state
                else:
                    token_mask = tf.expand_dims(tf.range(
                        tf.shape(seq_tokens)[1]),
                                                axis=0)  # 1 x T
                    token_mask = tf.tile(
                        token_mask,
                        multiples=(tf.shape(seq_tokens_lengths)[0],
                                   1))  # B x T
                    token_mask = tf.cast(token_mask < tf.expand_dims(
                        seq_tokens_lengths, axis=-1),
                                         dtype=tf.float32)  # B x T
                    embedding = pool_sequence_embedding(
                        output_pool_mode,
                        sequence_token_embeddings=token_embeddings,
                        sequence_lengths=seq_tokens_lengths,
                        sequence_token_masks=token_mask)
                embeddings.append(embedding)
            if 'bert' in models:
                config = BertConfig(
                    vocab_size=self.get_hyper('token_vocab_size'),
                    hidden_size=self.get_hyper('self_attention_hidden_size'),
                    num_hidden_layers=self.get_hyper(
                        'self_attention_num_layers'),
                    num_attention_heads=self.get_hyper(
                        'self_attention_num_heads'),
                    intermediate_size=self.get_hyper(
                        'self_attention_intermediate_size'))

                model = BertModel(config=config,
                                  is_training=is_train,
                                  input_ids=self.placeholders['tokens'],
                                  input_mask=self.placeholders['tokens_mask'],
                                  use_one_hot_embeddings=False)

                output_pool_mode = self.get_hyper(
                    'self_attention_pool_mode').lower()
                if output_pool_mode == 'bert':
                    embedding = model.get_pooled_output()
                else:
                    seq_token_embeddings = model.get_sequence_output()
                    seq_token_masks = self.placeholders['tokens_mask']
                    seq_token_lengths = tf.reduce_sum(seq_token_masks,
                                                      axis=1)  # B
                    embedding = pool_sequence_embedding(
                        output_pool_mode,
                        sequence_token_embeddings=seq_token_embeddings,
                        sequence_lengths=seq_token_lengths,
                        sequence_token_masks=seq_token_masks)
                embeddings.append(embedding)

            embeddings = tf.concat(embeddings, axis=-1)
            if attention:
                embeddings = Common.self_attention_layer(embeddings)
            # "concat one-hot" is equal to "accumulate embedding"
            # [v1^T, v2^T, v3^T] * W = [v1^T, v2^T, v3^T]*[w1, w2, w3]^T = v1^T*w1+v2^T*w2+v3^T*w3
            print('*@' * 16)
            print(embeddings)
            print(tf.shape(embeddings))
            return tf.reduce_sum(embeddings, axis=0)
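# Shape sketch (illustrative, not from the original source) for the combination
# step at the end of _complex_model: with two sub-encoders producing [B, D1] and
# [B, D2] embeddings, tf.concat along the last axis yields [B, D1 + D2], and the
# trailing tf.reduce_sum(..., axis=0) collapses the batch dimension.
#   e1 = tf.zeros([4, 8])                     # nbow embedding, [B, D1]
#   e2 = tf.zeros([4, 6])                     # rnn embedding,  [B, D2]
#   combined = tf.concat([e1, e2], axis=-1)   # [4, 14]
#   summed = tf.reduce_sum(combined, axis=0)  # [14]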
Example #6
    def _single_model(self, is_train: bool = False) -> tf.Tensor:
        model = 'nbow'  # nbow, cnn, rnn, bert
        attention = False
        embedding = None
        with tf.variable_scope("tree_encoder"):
            self._make_placeholders()

            self.placeholders['tokens_lengths'] = \
                tf.placeholder(tf.int32, shape=[None], name='tokens_lengths')
            self.placeholders['rnn_dropout_keep_rate'] = \
                tf.placeholder(tf.float32, shape=[], name='rnn_dropout_keep_rate')
            self.placeholders['rnn_recurrent_dropout_keep_rate'] = \
                tf.placeholder(tf.float32, shape=[], name='rnn_recurrent_dropout_keep_rate')

            if model == 'nbow':
                seq_tokens_embeddings = self.embedding_layer(
                    self.placeholders['tokens'])
                seq_token_mask = self.placeholders['tokens_mask']
                seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1)  # B

                if attention:
                    embedding = Common.yet_attention_layer(
                        seq_tokens_embeddings)
                else:
                    embedding = pool_sequence_embedding(
                        self.get_hyper('nbow_pool_mode').lower(),
                        sequence_token_embeddings=seq_tokens_embeddings,
                        sequence_lengths=seq_token_lengths,
                        sequence_token_masks=seq_token_mask)
            elif model == 'cnn':
                seq_tokens_embeddings = self.embedding_layer(
                    self.placeholders['tokens'])
                seq_tokens_embeddings = self.__add_position_encoding(
                    seq_tokens_embeddings)

                activation_fun = get_activation(
                    self.get_hyper('1dcnn_activation'))
                current_embeddings = seq_tokens_embeddings
                num_filters_and_width = zip(
                    self.get_hyper('1dcnn_layer_list'),
                    self.get_hyper('1dcnn_kernel_width'))
                for (layer_idx,
                     (num_filters,
                      kernel_width)) in enumerate(num_filters_and_width):
                    next_embeddings = tf.layers.conv1d(
                        inputs=current_embeddings,
                        filters=num_filters,
                        kernel_size=kernel_width,
                        padding="same")

                    # Add residual connections past the first layer.
                    if self.get_hyper('1dcnn_add_residual_connections'
                                      ) and layer_idx > 0:
                        next_embeddings += current_embeddings

                    current_embeddings = activation_fun(next_embeddings)
                    current_embeddings = tf.nn.dropout(
                        current_embeddings,
                        keep_prob=self.placeholders['dropout_keep_rate'])

                if attention:
                    embedding = Common.yet_attention_layer(current_embeddings)
                else:
                    seq_token_mask = self.placeholders['tokens_mask']
                    seq_token_lengths = tf.reduce_sum(seq_token_mask,
                                                      axis=1)  # B
                    embedding = pool_sequence_embedding(
                        self.get_hyper('1dcnn_pool_mode').lower(),
                        sequence_token_embeddings=current_embeddings,
                        sequence_lengths=seq_token_lengths,
                        sequence_token_masks=seq_token_mask)
            elif model == 'rnn':
                seq_tokens = self.placeholders['tokens']
                seq_tokens_embeddings = self.embedding_layer(seq_tokens)
                seq_tokens_lengths = self.placeholders['tokens_lengths']
                rnn_final_state, token_embeddings = self._encode_with_rnn(
                    seq_tokens_embeddings, seq_tokens_lengths)

                output_pool_mode = self.get_hyper('rnn_pool_mode').lower()
                if output_pool_mode == 'rnn_final':
                    embedding = rnn_final_state
                else:
                    if attention:
                        embedding = Common.yet_attention_layer(
                            token_embeddings)
                    else:
                        token_mask = tf.expand_dims(tf.range(
                            tf.shape(seq_tokens)[1]),
                                                    axis=0)  # 1 x T
                        token_mask = tf.tile(
                            token_mask,
                            multiples=(tf.shape(seq_tokens_lengths)[0],
                                       1))  # B x T
                        token_mask = tf.cast(token_mask < tf.expand_dims(
                            seq_tokens_lengths, axis=-1),
                                             dtype=tf.float32)  # B x T
                        embedding = pool_sequence_embedding(
                            output_pool_mode,
                            sequence_token_embeddings=token_embeddings,
                            sequence_lengths=seq_tokens_lengths,
                            sequence_token_masks=token_mask)
            elif model == 'bert':
                config = BertConfig(
                    vocab_size=self.get_hyper('token_vocab_size'),
                    hidden_size=self.get_hyper('self_attention_hidden_size'),
                    num_hidden_layers=self.get_hyper(
                        'self_attention_num_layers'),
                    num_attention_heads=self.get_hyper(
                        'self_attention_num_heads'),
                    intermediate_size=self.get_hyper(
                        'self_attention_intermediate_size'))

                model = BertModel(config=config,
                                  is_training=is_train,
                                  input_ids=self.placeholders['tokens'],
                                  input_mask=self.placeholders['tokens_mask'],
                                  use_one_hot_embeddings=False)

                output_pool_mode = self.get_hyper(
                    'self_attention_pool_mode').lower()
                if output_pool_mode == 'bert':
                    embedding = model.get_pooled_output()
                else:
                    seq_token_embeddings = model.get_sequence_output()
                    # Attention is only considered when the output has not
                    # already been pooled.
                    if attention:
                        embedding = Common.yet_attention_layer(
                            seq_token_embeddings)
                    else:
                        seq_token_masks = self.placeholders['tokens_mask']
                        seq_token_lengths = tf.reduce_sum(seq_token_masks,
                                                          axis=1)  # B
                        embedding = pool_sequence_embedding(
                            output_pool_mode,
                            sequence_token_embeddings=seq_token_embeddings,
                            sequence_lengths=seq_token_lengths,
                            sequence_token_masks=seq_token_masks)
            else:
                raise ValueError('Undefined Config')
            return embedding
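# Illustrative sketch (not from the original source) of the length-to-mask
# construction used in the 'rnn' branch above: for lengths [2, 4] and T = 5, the
# comparison yields [[1, 1, 0, 0, 0], [1, 1, 1, 1, 0]] as float32.
#   lengths = tf.constant([2, 4], dtype=tf.int32)                          # [B]
#   positions = tf.expand_dims(tf.range(5), axis=0)                        # [1, T]
#   positions = tf.tile(positions, multiples=(tf.shape(lengths)[0], 1))    # [B, T]
#   mask = tf.cast(positions < tf.expand_dims(lengths, axis=-1), tf.float32)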