Beispiel #1
0
    def __init__(self,
                 xlnet_config,
                 is_training,
                 input_ids,
                 seg_ids,
                 input_mask,
                 mems,
                 perm_mask,
                 target,
                 target_mask,
                 target_mapping,
                 inp_q,
                 sample_weight=None,
                 **kwargs):
        '''Build the permutation language modeling (PLM) loss and predictions.

        An `XLNetEncoder` is built from the inputs and its sequence output is
        fed to `lm_loss`, with the output projection tied to the encoder's
        embedding table.

        Args:
          xlnet_config: model config; `n_token` and `d_model` are read here
            and the whole object is forwarded to `XLNetEncoder`.
          is_training: selects a dropout rate of 0.1 (True) or 0.0 (False)
            for the run config and is forwarded to the encoder.
          input_ids, seg_ids, input_mask, mems, perm_mask, target_mapping,
            inp_q: forwarded unchanged to `XLNetEncoder`.
          target: prediction targets handed to `lm_loss`.
          target_mask: multiplied into the loss; its sum is the normalizer
            of `total_loss`.
          sample_weight: optional weights; cast to float32, given a trailing
            axis and multiplied into the per-example loss.
          **kwargs: forwarded to `XLNetEncoder`.

        Sets `self.total_loss`, `self.losses['PLM']`, `self.preds['PLM']`
        and `self.preds['PLM_mask']`.
        '''
        super().__init__()

        # Dropout is only active while training.
        drop_rate = 0.1 if is_training else 0.0
        plm_run_config = XLNetRunConfig(
            is_training=is_training,
            bi_data=True,
            use_tpu=False,
            use_bfloat16=False,
            dropout=drop_rate,
            dropatt=drop_rate,
            init='normal',
            init_range=0.1,
            init_std=0.02,
            clamp_len=-1)

        encoder = XLNetEncoder(
            xlnet_config=xlnet_config,
            is_training=is_training,
            input_ids=input_ids,
            seg_ids=seg_ids,
            input_mask=input_mask,
            mems=mems,
            perm_mask=perm_mask,
            target_mapping=target_mapping,
            inp_q=inp_q,
            **kwargs)

        with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
            # Output weights are tied to the encoder's embedding table.
            (per_example_loss, preds) = lm_loss(
                hidden=encoder.get_sequence_output(),
                target=target,
                n_token=xlnet_config.n_token,
                d_model=xlnet_config.d_model,
                initializer=encoder.get_initializer(),
                lookup_table=encoder.get_embedding_table(),
                tie_weight=True,
                bi_data=plm_run_config.bi_data,
                use_tpu=plm_run_config.use_tpu)
            if sample_weight is not None:
                weight = tf.cast(sample_weight, dtype=tf.float32)
                per_example_loss = per_example_loss * tf.expand_dims(
                    weight, axis=-1)

        # Only masked-in target positions contribute to the loss.
        masked_loss = per_example_loss * target_mask
        self.total_loss = \
            tf.reduce_sum(masked_loss) / tf.reduce_sum(target_mask)
        self.losses['PLM'] = masked_loss
        self.preds['PLM'] = preds
        self.preds['PLM_mask'] = target_mask
Beispiel #2
0
    def __init__(self,
                 bert_config,
                 is_training,
                 sketchy_encoder,
                 intensive_encoder,
                 query_mask,
                 label_ids,
                 has_answer,
                 sample_weight=None,
                 scope='retro_reader',
                 matching_mechanism='cross-attention',
                 beta_1=0.5,
                 beta_2=0.5,
                 threshold=1.0,
                 trainable=True,
                 **kwargs):
        '''Build the sketchy/intensive reading heads and the rear verifier.

        Args:
          bert_config: config providing `initializer_range`,
            `hidden_dropout_prob` and `num_attention_heads`.
          is_training: bool. Dropout is only applied when True.
          sketchy_encoder: encoder whose pooled output feeds the binary
            answerability classifier.
          intensive_encoder: encoder whose sequence output feeds the span
            prediction head.
          query_mask: mask multiplied into the sequence output to obtain the
            query-only representation; also used to build the attention mask.
          label_ids: span labels; column 0 is used as the start position and
            column 1 as the end position.
          has_answer: binary labels (one-hot encoded with depth 2) for the
            sketchy classifier.
          sample_weight: optional per-example weights, cast to float32 and
            multiplied into both per-example losses.
          scope: name of the outermost variable scope.
          matching_mechanism: 'cross-attention' or 'matching-attention'.
          beta_1: weight of the intensive score in the verifier.
          beta_2: weight of the sketchy score in the verifier.
          threshold: the verifier predicts 1 when the combined score is
            greater than this value.
          trainable: forwarded to every variable created here.
          **kwargs: forwarded to the parent constructor.

        Raises:
          ValueError: if `matching_mechanism` is not one of the two
            supported values.
        '''
        super().__init__(**kwargs)

        # Fail before any variable is created; otherwise an unknown value
        # would only surface later as an undefined `H_prime`.
        if matching_mechanism not in ('cross-attention',
                                      'matching-attention'):
            raise ValueError(
                'Invalid value of `matching_mechanism`: %s. Pick one from '
                '`cross-attention` and `matching-attention`.'
                % matching_mechanism)

        # Single source of truth for dropout: disabled outside of training.
        dropout_prob = \
            bert_config.hidden_dropout_prob if is_training else 0.0

        # verifier
        with tf.variable_scope(scope):

            # sketchy reading module
            with tf.variable_scope('sketchy/prediction'):
                sketchy_output = sketchy_encoder.get_pooled_output()
                hidden_size = sketchy_output.shape.as_list()[-1]

                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[2, hidden_size],
                    initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=trainable)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[2],
                    initializer=tf.zeros_initializer(),
                    trainable=trainable)

                output_layer = util.dropout(sketchy_output, dropout_prob)
                logits = tf.matmul(
                    output_layer, output_weights, transpose_b=True)
                logits = tf.nn.bias_add(logits, output_bias)

                log_probs = tf.nn.log_softmax(logits, axis=-1)
                one_hot_labels = tf.one_hot(
                    has_answer, depth=2, dtype=tf.float32)
                per_example_loss = - tf.reduce_sum(
                    one_hot_labels * log_probs, axis=-1)
                if sample_weight is not None:
                    per_example_loss = tf.cast(
                        sample_weight, dtype=tf.float32) * per_example_loss

                self.losses['sketchy_losses'] = per_example_loss
                sketchy_loss = tf.reduce_mean(per_example_loss)

                # external score: margin of class 1 over class 0
                score_ext = logits[:, 1] - logits[:, 0]

            # intensive reading module
            with tf.variable_scope('intensive'):
                H = intensive_encoder.get_sequence_output()
                # zero out every position that is not part of the query
                H_Q = H * tf.cast(
                    tf.expand_dims(query_mask, axis=-1), tf.float32)
                (batch_size, max_seq_length, hidden_size) = \
                    util.get_shape_list(H)

                # cross-attention
                if matching_mechanism == 'cross-attention':
                    with tf.variable_scope('cross_attention'):
                        attention_mask = \
                            self.create_attention_mask_from_input_mask(
                                query_mask, batch_size, max_seq_length)
                        # Every other dropout in this head is switched off
                        # at inference; gate the attention dropout the same
                        # way so predictions are deterministic.
                        (H_prime, _) = self.attention_layer(
                            from_tensor=H,
                            to_tensor=H_Q,
                            attention_mask=attention_mask,
                            num_attention_heads=\
                                bert_config.num_attention_heads,
                            size_per_head=\
                                hidden_size // bert_config.num_attention_heads,
                            attention_probs_dropout_prob=dropout_prob,
                            initializer_range=bert_config.initializer_range,
                            do_return_2d_tensor=False,
                            batch_size=batch_size,
                            from_max_seq_length=max_seq_length,
                            to_max_seq_length=max_seq_length,
                            trainable=trainable)

                # matching-attention (the only other value accepted above)
                else:
                    with tf.variable_scope('matching_attention'):
                        output_weights = tf.get_variable(
                            'output_weights',
                            shape=[hidden_size, hidden_size],
                            initializer=util.create_initializer(
                                bert_config.initializer_range),
                            trainable=trainable)
                        output_bias = tf.get_variable(
                            'output_bias',
                            shape=[hidden_size],
                            initializer=tf.zeros_initializer(),
                            trainable=trainable)
                        # linear transform of the query representation,
                        # with the weight matrix tiled across the batch
                        trans = tf.matmul(
                            H_Q, tf.tile(
                                tf.expand_dims(output_weights, axis=0),
                                [batch_size, 1, 1]),
                            transpose_b=True)
                        trans = tf.nn.bias_add(trans, output_bias)
                        M = tf.nn.softmax(
                            tf.matmul(H, trans, transpose_b=True), axis=-1)
                        H_prime = tf.matmul(M, H_Q)

                with tf.variable_scope('prediction'):
                    output_weights = tf.get_variable(
                        'output_weights',
                        shape=[2, hidden_size],
                        initializer=util.create_initializer(
                            bert_config.initializer_range),
                        trainable=trainable)
                    output_bias = tf.get_variable(
                        'output_bias',
                        shape=[2],
                        initializer=tf.zeros_initializer(),
                        trainable=trainable)

                    output_layer = util.dropout(H_prime, dropout_prob)
                    output_layer = tf.reshape(
                        output_layer,
                        [batch_size * max_seq_length, hidden_size])
                    logits = tf.matmul(
                        output_layer, output_weights, transpose_b=True)
                    logits = tf.nn.bias_add(logits, output_bias)
                    logits = tf.reshape(
                        logits, [batch_size, max_seq_length, 2])
                    # -> [batch_size, 2, max_seq_length]; row 0 holds the
                    # start logits and row 1 the end logits
                    logits = tf.transpose(logits, [0, 2, 1])
                    probs = tf.nn.softmax(logits, axis=-1, name='probs')

                    self.probs['mrc_probs'] = probs
                    self.preds['mrc_preds'] = tf.argmax(logits, axis=-1)

                    start_one_hot_labels = tf.one_hot(
                        label_ids[:, 0], depth=max_seq_length,
                        dtype=tf.float32)
                    end_one_hot_labels = tf.one_hot(
                        label_ids[:, 1], depth=max_seq_length,
                        dtype=tf.float32)
                    start_log_probs = tf.nn.log_softmax(
                        logits[:, 0, :], axis=-1)
                    end_log_probs = tf.nn.log_softmax(
                        logits[:, 1, :], axis=-1)
                    # average of the start and end cross-entropies
                    per_example_loss = (
                        - 0.5 * tf.reduce_sum(
                            start_one_hot_labels * start_log_probs, axis=-1)
                        - 0.5 * tf.reduce_sum(
                            end_one_hot_labels * end_log_probs, axis=-1))
                    if sample_weight is not None:
                        # cast like the sketchy head so non-float32 weights
                        # do not break the multiplication
                        per_example_loss = tf.cast(
                            sample_weight, dtype=tf.float32) * per_example_loss

                    intensive_loss = tf.reduce_mean(per_example_loss)
                    self.losses['intensive_losses'] = per_example_loss

                    # The infinity norm of non-negative probabilities is
                    # their maximum: the best start+end mass over positions
                    # 1.., compared against the mass on position 0.
                    score_has = tf.norm(
                        probs[:, 0, 1:] + probs[:, 1, 1:], np.inf, axis=-1)
                    score_null = probs[:, 0, 0] + probs[:, 1, 0]
                    score_diff = score_has - score_null

            # rear verification
            v = beta_1 * score_diff + beta_2 * score_ext
            self.preds['verifier_preds'] = \
                tf.cast(tf.greater(v, threshold), tf.int32)
            self.probs['verifier_probs'] = v

            self.total_loss = sketchy_loss + intensive_loss
Beispiel #3
0
    def __init__(self,
                 vocab_size,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 sample_weight=None,
                 reduced_size=64,
                 topic_size=1024,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 bias=0,
                 scope='vae',
                 trainable=True,
                 **kwargs):
        '''Build a transformer-based variational auto-encoder over tokens.

        The input is embedded, encoded by a stacked transformer, projected
        to a latent space (`miu`, `sigma`), sampled, decoded by an MLP and
        finally mapped back to vocabulary logits with the embedding table as
        output weights. `input_ids` is the reconstruction target.

        Args:
          vocab_size: vocabulary size handed to `Config`.
          is_training: bool. When False, dropout probabilities are set to
            0.0 and the latent noise is drawn uniformly from [-bias, bias]
            instead of from a standard normal.
          input_ids: rank-2 tensor of token ids (rank is checked).
          input_mask: mask used for attention; its sum over the last axis is
            taken as the input length.
          segment_ids: token type ids for the embedding post-processor.
          sample_weight: optional per-example weights, cast to float32 and
            multiplied into the reconstruction loss.
          reduced_size: per-token width of the encoder projection.
          topic_size: width of the latent space.
          hidden_size: hidden size handed to `Config`.
          num_hidden_layers: number of layers handed to `Config`.
          num_attention_heads: number of heads handed to `Config`.
          bias: half-width of the uniform latent noise used at inference.
          scope: name of the outermost variable scope.
          trainable: forwarded to every variable/layer created here.
          **kwargs: only `use_tilda_embedding` is read.
        '''
        super().__init__()

        # model configuration; dropout is disabled outside of training
        config = Config(vocab_size,
                        hidden_size=hidden_size,
                        num_hidden_layers=num_hidden_layers,
                        num_attention_heads=num_attention_heads)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        use_tilda_embedding = kwargs.get('use_tilda_embedding')
        if use_tilda_embedding:
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):
            with tf.variable_scope('embeddings'):

                (self.embedding_output, self.embedding_table) = \
                    self.embedding_lookup(
                        input_ids=input_ids,
                        vocab_size=config.vocab_size,
                        batch_size=batch_size,
                        max_seq_length=seq_length,
                        embedding_size=config.hidden_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='word_embeddings',
                        tilda_embeddings=tilda_embeddings,
                        trainable=trainable)
                self.embedding_output = self.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    hidden_size=config.hidden_size,
                    use_token_type=True,
                    segment_ids=segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob,
                    trainable=trainable)

            with tf.variable_scope('encoder'):

                # stacked transformer
                attention_mask = self.create_attention_mask_from_input_mask(
                    input_mask, batch_size, seq_length)
                self.all_encoder_layers = self.transformer_model(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=util.get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=\
                        config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    trainable=trainable)

                # projection
                with tf.variable_scope('projection'):
                    transformer_output = tf.layers.dense(
                        self.all_encoder_layers[-1],
                        reduced_size,
                        activation=util.gelu,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        trainable=trainable)
                    # flatten all token projections into one row per example
                    transformer_output = tf.reshape(transformer_output,
                                                    [batch_size, -1])
                    input_length = tf.reduce_sum(input_mask, axis=-1)
                    input_length = tf.cast(input_length, tf.float32)
                    input_length_1d = tf.reshape(input_length, [batch_size])
                    input_length_2d = tf.reshape(input_length, [batch_size, 1])

                    # Keep the first `input_length * reduced_size` entries
                    # of each flattened row and rescale them by
                    # `seq_length / input_length`.
                    # NOTE(review): this presumably relies on padding being
                    # at the end of each sequence -- confirm.
                    broadcast_mask = tf.sequence_mask(
                        tf.multiply(input_length_1d, reduced_size),
                        seq_length * reduced_size,
                        dtype=tf.float32)
                    broadcast_mask = tf.multiply(broadcast_mask,
                                                 seq_length / input_length_2d)
                    transformer_output *= broadcast_mask

                    # latent space
                    miu = tf.layers.dense(
                        transformer_output,
                        topic_size,
                        activation='tanh',
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        name='miu',
                        trainable=trainable)
                    sigma = tf.layers.dense(
                        transformer_output,
                        topic_size,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        name='sigma',
                        trainable=trainable)
                    self.probs['miu'] = miu
                    self.probs['sigma'] = sigma

            with tf.variable_scope('decoder'):
                with tf.variable_scope('projection'):

                    # reparameterization: standard normal noise while
                    # training, bounded uniform noise otherwise
                    if is_training:
                        noise = tf.random_normal([batch_size, topic_size])
                    else:
                        noise = tf.random_uniform([batch_size, topic_size],
                                                  minval=-bias,
                                                  maxval=bias)
                    decoder_input = miu + tf.exp(sigma) * noise

                    # projection
                    decoder_input = tf.layers.dense(
                        decoder_input,
                        seq_length * reduced_size,
                        activation=util.gelu,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        trainable=trainable)
                    intermediate_input = tf.reshape(
                        decoder_input, [-1, seq_length, reduced_size])
                    intermediate_input = util.layer_norm(intermediate_input,
                                                         trainable=trainable)
                    intermediate_input = util.dropout(
                        intermediate_input, config.hidden_dropout_prob)

                # MLP
                with tf.variable_scope('intermediate'):
                    intermediate_output = tf.layers.dense(
                        intermediate_input,
                        4 * reduced_size,
                        activation=util.gelu,
                        kernel_initializer=util.create_initializer(
                            config.initializer_range),
                        trainable=trainable)
                with tf.variable_scope('output'):
                    decoder_output = tf.layers.dense(
                        intermediate_output,
                        config.hidden_size,
                        kernel_initializer=util.create_initializer(
                            config.initializer_range),
                        trainable=trainable)
                    decoder_output = util.layer_norm(decoder_output,
                                                     trainable=trainable)
                    decoder_output = util.dropout(decoder_output,
                                                  config.hidden_dropout_prob)
                # Only the final decoder output is exposed.
                self.all_decoder_layers = [decoder_output]

        # reconstruction
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    decoder_output,
                    units=config.hidden_size,
                    activation=util.get_activation(config.hidden_act),
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
                input_tensor = util.layer_norm(input_tensor,
                                               trainable=trainable)
            # output weights are tied to the input embedding table
            output_weights = self.embedding_table
            output_bias = tf.get_variable('output_bias',
                                          shape=[config.vocab_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)
            flatten_input_tensor = tf.reshape(input_tensor,
                                              [-1, config.hidden_size])

            logits = tf.matmul(flatten_input_tensor,
                               output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            logits = tf.reshape(logits,
                                [batch_size, seq_length, config.vocab_size])
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            lm_log_probs = tf.nn.log_softmax(logits, axis=-1)

            self.preds['preds'] = tf.argmax(probs, axis=-1)
            one_hot_labels = tf.one_hot(input_ids,
                                        depth=config.vocab_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(lm_log_probs * one_hot_labels,
                                              axis=[-1])
            if sample_weight is not None:
                # cast first so integer or float64 weights can be multiplied
                # into the float32 loss
                per_example_loss *= tf.expand_dims(
                    tf.cast(sample_weight, dtype=tf.float32), axis=-1)

            # reconstruction loss plus latent regularizers
            # NOTE(review): the sampler scales noise by exp(sigma) while the
            # penalty is exp(sigma) - sigma - 1; confirm this is the
            # intended KL form.
            self.total_loss = (tf.reduce_mean(per_example_loss) +
                               tf.reduce_mean(tf.square(miu)) +
                               tf.reduce_mean(tf.exp(sigma) - sigma - 1))
            self.losses['losses'] = per_example_loss
Beispiel #4
0
    def __init__(self,
                 bert_config,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 scope='bert',
                 drop_pooler=False,
                 trainable=True,
                 **kwargs):
        '''Build the BERT graph: embeddings, transformer stack and pooler.

        Args:
          bert_config: model config. A deep copy is made, so the caller's
            object is never modified.
          is_training: bool. When False, both dropout probabilities of the
            copied config are set to 0.0.
          input_ids: rank-2 tensor of token ids (rank is checked).
          input_mask: mask used to build the attention mask.
          segment_ids: token type ids for the embedding post-processor.
          scope: name of the outermost variable scope.
          drop_pooler: when True, the pooled output is the raw first-token
            vector, skipping the dense tanh layer.
          trainable: forwarded to every variable/layer created here.
          **kwargs: only `use_tilda_embedding` is read.

        Sets `self.embedding_output`, `self.embedding_table`,
        `self.all_encoder_layers`, `self.sequence_output` and
        `self.pooled_output`.
        '''

        # Private copy: the dropout overrides must not leak to the caller.
        config = copy.deepcopy(bert_config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        ids_shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size, max_seq_length = ids_shape[0], ids_shape[1]

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        if kwargs.get('use_tilda_embedding'):
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):
            with tf.variable_scope('embeddings'):

                # word embeddings
                lookup_outputs = self.embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    batch_size=batch_size,
                    max_seq_length=max_seq_length,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    tilda_embeddings=tilda_embeddings,
                    trainable=trainable)
                (self.embedding_output, self.embedding_table) = lookup_outputs

                # token type and position embeddings, with
                # `hidden_dropout_prob` as the dropout rate
                self.embedding_output = self.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=max_seq_length,
                    hidden_size=config.hidden_size,
                    use_token_type=True,
                    segment_ids=segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob,
                    trainable=trainable)

            with tf.variable_scope('encoder'):
                attention_mask = self.create_attention_mask_from_input_mask(
                    input_mask, batch_size, max_seq_length)

                # stacked transformers
                self.all_encoder_layers = self.transformer_model(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=max_seq_length,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=util.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        config.attention_probs_dropout_prob),
                    initializer_range=config.initializer_range,
                    trainable=trainable)

            # the last encoder layer is the sequence output
            self.sequence_output = self.all_encoder_layers[-1]
            with tf.variable_scope('pooler'):
                # the sequence is summarized by its first position
                first_token_tensor = self.sequence_output[:, 0, :]

                if not drop_pooler:
                    self.pooled_output = tf.layers.dense(
                        first_token_tensor,
                        config.hidden_size,
                        activation=tf.tanh,
                        kernel_initializer=util.create_initializer(
                            config.initializer_range),
                        trainable=trainable)
                else:
                    # trick: ignore the fully connected layer
                    self.pooled_output = first_token_tensor
Beispiel #5
0
                def _build_forward(layer_input):
                    '''Build one transformer block on top of `layer_input`.

                    The block is self-attention followed by a feed-forward
                    network; each half ends with dropout, a residual add and
                    layer normalization. The attention scores are appended
                    to `self.attention_scores`. Returns the block output.
                    '''

                    def _dense(inputs, units, activation=None):
                        # Dense layer with the block-wide initializer and
                        # trainable settings.
                        return tf.layers.dense(
                            inputs,
                            units,
                            activation=activation,
                            kernel_initializer=util.create_initializer(
                                initializer_range),
                            trainable=trainable)

                    with tf.variable_scope('attention'):
                        with tf.variable_scope('self'):
                            (context, scores) = self.attention_layer(
                                from_tensor=layer_input,
                                to_tensor=layer_input,
                                attention_mask=attention_mask,
                                num_attention_heads=num_attention_heads,
                                size_per_head=attention_head_size,
                                attention_probs_dropout_prob=(
                                    attention_probs_dropout_prob),
                                initializer_range=initializer_range,
                                do_return_2d_tensor=True,
                                batch_size=batch_size,
                                from_max_seq_length=max_seq_length,
                                to_max_seq_length=max_seq_length,
                                dtype=dtype,
                                trainable=trainable)
                            heads = [context]
                            self.attention_scores.append(scores)

                        # A single head is used as-is; several heads would
                        # be concatenated along the last axis.
                        attended = (
                            heads[0] if len(heads) == 1
                            else tf.concat(heads, axis=-1))

                        with tf.variable_scope('output'):
                            attended = _dense(attended, hidden_size)
                            attended = util.dropout(
                                attended, hidden_dropout_prob)
                            # residual connection around the attention
                            attended = util.layer_norm(
                                attended + layer_input,
                                trainable=trainable)

                    # The activation is only applied to the `intermediate`
                    # hidden layer.
                    with tf.variable_scope('intermediate'):
                        expanded = _dense(
                            attended,
                            intermediate_size,
                            activation=intermediate_act_fn)

                    # Down-project back to hidden_size then add the residual.
                    with tf.variable_scope('output'):
                        projected = _dense(expanded, hidden_size)
                        projected = util.dropout(
                            projected, hidden_dropout_prob)
                        return util.layer_norm(
                            projected + attended,
                            trainable=trainable)
Beispiel #6
0
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=util.gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    '''Multi-headed, multi-layer Transformer from 'Attention is All You Need'.

    This is almost an exact implementation of the original Transformer
    encoder. In addition to the hidden states it also returns the attention
    maps produced by every layer.

    See the original paper:
    https://arxiv.org/abs/1706.03762

    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/
      tensor2tensor/models/transformer.py

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length, seq_length], with 1 for positions that can be attended to
        and 0 in positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the 'intermediate' (a.k.a., feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation function to
        apply to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the
        attention probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to return the outputs of all layers or
        just the output of the final layer.

    Returns:
      A tuple `(output, attn_maps)`.
      output: when `do_return_all_layers` is False, the final hidden layer
        reshaped back to the shape of `input_tensor`; otherwise the outputs
        of all layers, each reshaped the same way, stacked along a new
        leading axis.
      attn_maps: the second value returned by `attention_layer` for every
        layer, stacked along a new leading axis.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    '''
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            'The hidden size (%d) is not a multiple of the number of attention '
            'heads (%d)' % (hidden_size, num_attention_heads))

    # Divisibility was verified above, so floor division is exact.
    attention_head_size = hidden_size // num_attention_heads
    input_shape = util.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            'The width of the input tensor (%d) != hidden size (%d)' %
            (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = util.reshape_to_matrix(input_tensor)

    attn_maps = []
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope('layer_%d' % layer_idx):
            with tf.variable_scope('attention'):
                # Only self-attention is used, so its output feeds the
                # projection below directly (there is nothing to concatenate).
                with tf.variable_scope('self'):
                    attention_output, probs = attention_layer(
                        from_tensor=prev_output,
                        to_tensor=prev_output,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=
                        attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attn_maps.append(probs)

                # Run a linear projection of `hidden_size` then add a residual
                # with the layer input (`prev_output`).
                with tf.variable_scope('output'):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=util.create_initializer(
                            initializer_range))
                    attention_output = util.dropout(attention_output,
                                                    hidden_dropout_prob)
                    attention_output = util.layer_norm(attention_output +
                                                       prev_output)

            # The activation is only applied to the 'intermediate' hidden layer.
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=util.create_initializer(
                        initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope('output'):
                prev_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=util.create_initializer(
                        initializer_range))
                prev_output = util.dropout(prev_output, hidden_dropout_prob)
                prev_output = util.layer_norm(prev_output + attention_output)
                all_layer_outputs.append(prev_output)

    attn_maps = tf.stack(attn_maps, 0)
    if do_return_all_layers:
        return tf.stack([
            util.reshape_from_matrix(layer, input_shape)
            for layer in all_layer_outputs
        ], 0), attn_maps
    else:
        return util.reshape_from_matrix(prev_output, input_shape), attn_maps
Beispiel #7
0
    def __init__(self,
                 is_training,
                 input_tensor,
                 n_wide_features,
                 wide_features,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        '''Builds a classification head that fuses wide features into the
        deep representation through attention.

        The wide feature ids are embedded, attended to with `input_tensor` as
        the query, added back to `input_tensor` as a residual, layer
        normalized and finally projected to `label_size` logits.

        Args:
          is_training: bool. Dropout before the output projection is applied
            only when True.
          input_tensor: float Tensor of shape [B, H], the deep representation.
          n_wide_features: int Tensor passed to `tf.sequence_mask` as the
            number of valid wide features per example.
          wide_features: int Tensor of shape [B, N] holding the wide feature
            ids that index the embedding table.
          label_ids: int Tensor of class labels, one-hot encoded with depth
            `label_size`.
          label_size: int. Number of classes.
          sample_weight: (optional) Tensor multiplied into the per-example
            loss.
          scope: variable scope of the output projection.
          hidden_dropout_prob: float. Dropout probability before the output
            projection.
          initializer_range: float. Passed to `util.create_initializer`.
          trainable: bool. Whether the created variables are trainable.
          **kwargs: forwarded to the parent constructor. The optional key
            `tsa_thresh` (float) masks out the loss of examples whose
            normalized prediction entropy is not greater than the threshold.
        '''
        super().__init__(**kwargs)

        hidden_size = input_tensor.shape.as_list()[-1]
        feature_size = wide_features.shape.as_list()[-1]
        with tf.variable_scope('wide'):
            feature_embeddings = tf.get_variable(
                name='feature_embeddings',
                shape=[feature_size + 1, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            wide_output = tf.gather(feature_embeddings,
                                    wide_features)  # [B, N, H]

        with tf.variable_scope('wide_and_deep'):
            # Scaled dot-product attention with the deep vector as the only
            # query and the wide embeddings as keys and values.
            deep_output = tf.expand_dims(input_tensor, -1)  # [B, H, 1]
            attention_scores = tf.matmul(wide_output, deep_output)  # [B, N, 1]
            attention_scores = tf.transpose(attention_scores,
                                            [0, 2, 1])  # [B, 1, N]
            attention_scores = tf.multiply(attention_scores,
                                           1.0 / math.sqrt(hidden_size))
            feature_mask = tf.cast(
                tf.sequence_mask(n_wide_features, feature_size),
                tf.float32)  # [B, N]
            feature_mask = tf.expand_dims(feature_mask, 1)  # [B, 1, N]
            # Push masked-out positions towards zero probability.
            attention_scores += (1.0 - feature_mask) * -10000.0
            attention_matrix = tf.nn.softmax(attention_scores, axis=-1)
            attention_output = tf.matmul(attention_matrix,
                                         wide_output)  # [B, 1, H]
            attention_output = attention_output[:, 0, :]  # [B, H]
            # No dropout is applied to the attended wide output before the
            # residual connection.
            input_tensor = util.layer_norm(attention_output + input_tensor,
                                           trainable=trainable)

        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            self.preds['preds'] = tf.argmax(logits, axis=-1)
            self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=label_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(sample_weight,
                                           dtype=tf.float32) * per_example_loss
            thresh = kwargs.get('tsa_thresh')
            if thresh is not None:
                assert isinstance(
                    thresh,
                    float), ('`tsa_thresh` must be a float between 0 and 1.')
                # Use the log-softmax output instead of log(softmax): when a
                # probability underflows to 0, `0 * log(0)` is NaN, the
                # comparison below is then False and the example's loss would
                # be silently dropped.
                uncertainty = tf.reduce_sum(self.probs['probs'] * log_probs,
                                            axis=-1)
                # Normalize the entropy by that of the uniform distribution.
                uncertainty /= tf.log(1 / label_size)
                per_example_loss = tf.cast(
                    tf.greater(uncertainty, thresh), dtype=tf.float32) * \
                    per_example_loss

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Beispiel #8
0
def attention_ffn_block(layer_input,
                        hidden_size=768,
                        attention_mask=None,
                        num_attention_heads=1,
                        attention_head_size=64,
                        attention_probs_dropout_prob=0.0,
                        intermediate_size=3072,
                        intermediate_act_fn=None,
                        initializer_range=0.02,
                        hidden_dropout_prob=0.0,
                        use_einsum=True):
    """One self-attention sub-layer followed by one feed-forward sub-layer.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
      layer_input: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      hidden_size: (optional) int, size of hidden layer.
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length]. The values should be 1 or 0. The attention scores will
        effectively be set to -infinity for any positions in the mask that
        are 0, and will be unchanged for positions that are 1.
      num_attention_heads: int. Number of attention heads.
      attention_head_size: int. Size of attention head. Only used by the
        projection that follows the attention layer.
      attention_probs_dropout_prob: float. Dropout probability handed to
        `attention_layer`.
      intermediate_size: int. Size of intermediate hidden layer.
      intermediate_act_fn: (optional) Activation function for the
        intermediate layer.
      initializer_range: float. Range of the weight initializer.
      hidden_dropout_prob: (optional) float. Dropout probability of the
        hidden layer.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.

    Returns:
      The layer-normalized output of the feed-forward sub-layer.
    """

    def fresh_initializer():
        # A new initializer object per dense layer, as in a plain inline call.
        return util.create_initializer(initializer_range)

    # ----- self-attention sub-layer -----
    with tf.variable_scope("attention_1"):
        with tf.variable_scope("self"):
            attn = attention_layer(
                from_tensor=layer_input,
                to_tensor=layer_input,
                attention_mask=attention_mask,
                num_attention_heads=num_attention_heads,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                use_einsum=use_einsum)

        # Project the attention heads back to `hidden_size`.
        with tf.variable_scope("output"):
            attn = dense_layer_3d_proj(
                attn,
                hidden_size,
                attention_head_size,
                fresh_initializer(),
                None,
                use_einsum=use_einsum,
                name="dense")
            attn = util.dropout(attn, hidden_dropout_prob)
    # Residual with the block input, normalized outside the scopes above.
    attn = util.layer_norm(attn + layer_input)

    # ----- feed-forward sub-layer -----
    with tf.variable_scope("ffn_1"):
        with tf.variable_scope("intermediate"):
            expanded = dense_layer_2d(
                attn,
                intermediate_size,
                fresh_initializer(),
                intermediate_act_fn,
                use_einsum=use_einsum,
                num_attention_heads=num_attention_heads,
                name="dense")
            # NOTE: "output" is deliberately kept nested inside
            # "intermediate"; the nesting is part of the variable names.
            with tf.variable_scope("output"):
                projected = dense_layer_2d(
                    expanded,
                    hidden_size,
                    fresh_initializer(),
                    None,
                    use_einsum=use_einsum,
                    num_attention_heads=num_attention_heads,
                    name="dense")
            projected = util.dropout(projected, hidden_dropout_prob)
    # Residual with the attention sub-layer output.
    return util.layer_norm(projected + attn)
Beispiel #9
0
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_hidden_groups=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      inner_group_num=1,
                      intermediate_act_fn="gelu",
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      use_einsum=True,
                      trainable=True):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

    This is almost an exact implementation of the original Transformer
    encoder, with layers organized into groups that share one variable scope
    (and, through `reuse=tf.AUTO_REUSE`, their variables).

    See the original paper:
    https://arxiv.org/abs/1706.03762

    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, width].
        When `width` differs from `hidden_size` the input is first projected
        to `hidden_size` by a dense layer.
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_hidden_groups: int. Number of group for the hidden layers,
        parameters in the same group are shared.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      inner_group_num: int, number of inner repetition of attention and ffn.
      intermediate_act_fn: The non-linear activation to apply to the output
        of the intermediate/feed-forward layer; forwarded unchanged to
        `attention_ffn_block`.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the
        attention probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to also return all layers or just the
        final layer.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
      trainable: bool. Accepted for interface compatibility; currently not
        used by this function.

    Returns:
      When `do_return_all_layers` is False, the output of the last
      attention/ffn block. Otherwise a list with the output of every block,
      i.e. `num_hidden_layers * inner_group_num` tensors.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = hidden_size // num_attention_heads
    input_shape = util.get_shape_list(input_tensor, expected_rank=3)
    input_width = input_shape[2]

    all_layer_outputs = []
    if input_width != hidden_size:
        # Map the input width onto the hidden size before the first layer.
        prev_output = dense_layer_2d(
            input_tensor,
            hidden_size,
            util.create_initializer(initializer_range),
            None,
            use_einsum=use_einsum,
            name="embedding_hidden_mapping_in")
    else:
        prev_output = input_tensor
    with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
        for layer_idx in range(num_hidden_layers):
            # Exact integer form of floor(layer_idx * groups / layers). The
            # float expression `int(layer_idx / layers * groups)` can round
            # just below an integer (e.g. int(1 / 49 * 49) == 0) and put a
            # layer into the wrong group.
            group_idx = layer_idx * num_hidden_groups // num_hidden_layers
            with tf.variable_scope("group_%d" % group_idx):
                # name_scope only: layers of one group must keep resolving to
                # the same variable scope so that their variables are reused.
                with tf.name_scope("layer_%d" % layer_idx):
                    layer_output = prev_output
                    for inner_group_idx in range(inner_group_num):
                        with tf.variable_scope("inner_group_%d" %
                                               inner_group_idx):
                            layer_output = attention_ffn_block(
                                layer_input=layer_output,
                                hidden_size=hidden_size,
                                attention_mask=attention_mask,
                                num_attention_heads=num_attention_heads,
                                attention_head_size=attention_head_size,
                                attention_probs_dropout_prob=
                                attention_probs_dropout_prob,
                                intermediate_size=intermediate_size,
                                intermediate_act_fn=intermediate_act_fn,
                                initializer_range=initializer_range,
                                hidden_dropout_prob=hidden_dropout_prob,
                                use_einsum=use_einsum)
                            prev_output = layer_output
                            all_layer_outputs.append(layer_output)
    if do_return_all_layers:
        return all_layer_outputs
    else:
        return all_layer_outputs[-1]
Beispiel #10
0
def transformer_xl(inp_k,
                   n_token,
                   n_layer,
                   d_model,
                   n_head,
                   d_head,
                   d_inner,
                   dropout,
                   dropatt,
                   attn_type,
                   bi_data,
                   initializer,
                   is_training,
                   mem_len=None,
                   inp_q=None,
                   mems=None,
                   same_length=False,
                   clamp_len=-1,
                   untie_r=False,
                   use_tpu=True,
                   input_mask=None,
                   perm_mask=None,
                   seg_id=None,
                   reuse_len=None,
                   ff_activation='relu',
                   target_mapping=None,
                   use_bfloat16=False,
                   scope='transformer',
                   tilda_embeddings=None,
                   **kwargs):
    '''
    Defines a Transformer-XL computation graph with additional
    support for XLNet.

    Args:
      inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
      n_token: int, the vocab size.
      n_layer: int, the number of layers.
      d_model: int, the hidden size.
      n_head: int, the number of attention heads.
      d_head: int, the dimension size of each attention head.
      d_inner: int, the hidden size in feed-forward layers.
      dropout: float, dropout rate.
      dropatt: float, dropout rate on attention probabilities.
      attn_type: str, 'uni' (causal attention mask) or 'bi' (no causal
          mask).
      bi_data: bool, whether to use bidirectional input pipeline.
          Usually set to True during pretraining and False during finetuning.
      initializer: A tf initializer.
      is_training: bool, whether in training mode.
      mem_len: int, the number of tokens to cache.
      inp_q: float32 Tensor in shape [len, bsz].
          1 for tokens with losses and 0 for tokens without losses.
          Only used during pretraining for two-stream attention.
          Set to None during finetuning.
      mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
          from previous batches. The length of the list equals n_layer.
          If None, no memory is used.
      same_length: bool, whether to use the same attention length for each
          token.
      clamp_len: int, clamp all relative distances larger than clamp_len.
          -1 means no clamping.
      untie_r: bool, whether to untie the biases in attention.
      use_tpu: bool, whether TPUs are used.
      input_mask: float32 Tensor in shape [len, bsz], the input mask.
          0 for real tokens and 1 for padding.
      perm_mask: float32 Tensor in shape [len, len, bsz].
          If perm_mask[i, j, k] = 0, i attend to j in batch k;
          if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
          If None, each position attends to all the others.
      seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
      reuse_len: int, the number of tokens in the current batch to be cached
          and reused in the future.
      ff_activation: str, 'relu' or 'gelu'.
      target_mapping: float32 Tensor in shape [num_predict, len, bsz].
          If target_mapping[i, j, k] = 1, the i-th predict in batch k is
          on the j-th token.
          Only used during pretraining for partial prediction.
          Set to None during finetuning.
      use_bfloat16: bool, use bfloat16 instead of float32.
      scope: scope name for the computation graph.
      tilda_embeddings: forwarded unchanged to `embedding_lookup`.
      **kwargs: accepted for interface compatibility and ignored.

    Returns:
      A tuple `(output, new_mems, lookup_table)`: the dropped-out final
      hidden states (the query stream when `inp_q` is given, otherwise the
      content stream), the list of per-layer memories to cache, and the
      word embedding table returned by `embedding_lookup`.

    Raises:
      ValueError: if `attn_type` is neither 'uni' nor 'bi'.
    '''
    tf_float = tf.bfloat16 if use_bfloat16 else tf.float32

    new_mems = []
    with tf.variable_scope(scope):
        # With untied biases every layer owns its own slice.
        bias_shape = ([n_layer, n_head, d_head]
                      if untie_r else [n_head, d_head])
        r_w_bias = tf.get_variable('r_w_bias', bias_shape,
                                   dtype=tf_float,
                                   initializer=initializer)
        r_r_bias = tf.get_variable('r_r_bias', bias_shape,
                                   dtype=tf_float,
                                   initializer=initializer)

        bsz = tf.shape(inp_k)[1]
        qlen = tf.shape(inp_k)[0]
        mlen = tf.shape(mems[0])[0] if mems is not None else 0
        klen = mlen + qlen

        ##### Attention mask
        # causal attention mask
        if attn_type == 'uni':
            attn_mask = _create_mask(qlen, mlen, tf_float, same_length)
            attn_mask = attn_mask[:, :, None, None]
        elif attn_type == 'bi':
            attn_mask = None
        else:
            raise ValueError('Unsupported attention type: %s' % attn_type)

        # data mask: input mask & perm mask
        if input_mask is not None and perm_mask is not None:
            data_mask = input_mask[None] + perm_mask
        elif input_mask is not None and perm_mask is None:
            data_mask = input_mask[None]
        elif input_mask is None and perm_mask is not None:
            data_mask = perm_mask
        else:
            data_mask = None

        if data_mask is not None:
            # all mems can be attended to
            mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
                                 dtype=tf_float)
            # Cast to the compute dtype (not a fixed float32) so that the
            # concat below also works when `use_bfloat16` is enabled.
            data_mask = tf.cast(data_mask, dtype=tf_float)
            data_mask = tf.concat([mems_mask, data_mask], 1)
            if attn_mask is None:
                attn_mask = data_mask[:, :, :, None]
            else:
                attn_mask += data_mask[:, :, :, None]

        if attn_mask is not None:
            attn_mask = tf.cast(attn_mask > 0, dtype=tf_float)

            # Same mask, except that every position may attend to itself.
            non_tgt_mask = -tf.eye(qlen, dtype=tf_float)
            non_tgt_mask = tf.concat(
                [tf.zeros([qlen, mlen], dtype=tf_float), non_tgt_mask],
                axis=-1)
            non_tgt_mask = tf.cast(
                (attn_mask + non_tgt_mask[:, :, None, None]) > 0,
                dtype=tf_float)
        else:
            non_tgt_mask = None

        ##### Word embedding
        word_emb_k, lookup_table = embedding_lookup(
            x=inp_k,
            n_token=n_token,
            d_embed=d_model,
            initializer=initializer,
            use_tpu=use_tpu,
            dtype=tf_float,
            scope='word_embedding',
            tilda_embeddings=tilda_embeddings)

        if inp_q is not None:
            with tf.variable_scope('mask_emb'):
                mask_emb = tf.get_variable('mask_emb', [1, 1, d_model],
                                           dtype=tf_float)
                if target_mapping is not None:
                    word_emb_q = tf.tile(mask_emb,
                                         [tf.shape(target_mapping)[0], bsz, 1])
                else:
                    inp_q_ext = inp_q[:, :, None]
                    word_emb_q = \
                        inp_q_ext * mask_emb + (1 - inp_q_ext) * word_emb_k
        output_h = tf.layers.dropout(word_emb_k, dropout, training=is_training)
        if inp_q is not None:
            output_g = tf.layers.dropout(word_emb_q,
                                         dropout,
                                         training=is_training)

        ##### Segment embedding
        if seg_id is not None:
            # Tied (default) or per-layer segment bias, like the biases above.
            r_s_bias = tf.get_variable('r_s_bias', bias_shape,
                                       dtype=tf_float,
                                       initializer=initializer)

            seg_embed = tf.get_variable('seg_embed',
                                        [n_layer, 2, n_head, d_head],
                                        dtype=tf_float,
                                        initializer=initializer)

            # Convert `seg_id` to one-hot `seg_mat`
            mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)
            cat_ids = tf.concat([mem_pad, seg_id], 0)

            # `1` indicates not in the same segment [qlen x klen x bsz]
            seg_mat = tf.cast(
                tf.logical_not(tf.equal(seg_id[:, None], cat_ids[None, :])),
                tf.int32)
            seg_mat = tf.one_hot(seg_mat, 2, dtype=tf_float)
        else:
            seg_mat = None

        ##### Positional encoding
        pos_emb = relative_positional_encoding(qlen,
                                               klen,
                                               d_model,
                                               clamp_len,
                                               attn_type,
                                               bi_data,
                                               bsz=bsz,
                                               dtype=tf_float)
        pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)

        ##### Attention layers
        if mems is None:
            mems = [None] * n_layer

        for i in range(n_layer):
            # cache new mems
            new_mems.append(_cache_mem(output_h, mems[i], mem_len, reuse_len))

            # segment bias
            if seg_id is None:
                r_s_bias_i = None
                seg_embed_i = None
            else:
                r_s_bias_i = r_s_bias if not untie_r else r_s_bias[i]
                seg_embed_i = seg_embed[i]

            with tf.variable_scope('layer_{}'.format(i)):
                if inp_q is not None:
                    output_h, output_g = two_stream_rel_attn(
                        h=output_h,
                        g=output_g,
                        r=pos_emb,
                        r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
                        r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask_h=non_tgt_mask,
                        attn_mask_g=attn_mask,
                        mems=mems[i],
                        target_mapping=target_mapping,
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer)
                    # The query-stream FFN below runs first and creates the
                    # variables; the content-stream FFN then reuses them.
                    reuse = True
                else:
                    reuse = False

                    output_h = rel_multihead_attn(
                        h=output_h,
                        r=pos_emb,
                        r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
                        r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask=non_tgt_mask,
                        mems=mems[i],
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer,
                        reuse=reuse)

                if inp_q is not None:
                    output_g = positionwise_ffn(inp=output_g,
                                                d_model=d_model,
                                                d_inner=d_inner,
                                                dropout=dropout,
                                                kernel_initializer=initializer,
                                                activation_type=ff_activation,
                                                is_training=is_training)

                output_h = positionwise_ffn(inp=output_h,
                                            d_model=d_model,
                                            d_inner=d_inner,
                                            dropout=dropout,
                                            kernel_initializer=initializer,
                                            activation_type=ff_activation,
                                            is_training=is_training,
                                            reuse=reuse)

        if inp_q is not None:
            output = tf.layers.dropout(output_g, dropout, training=is_training)
        else:
            output = tf.layers.dropout(output_h, dropout, training=is_training)

        return output, new_mems, lookup_table
Beispiel #11
0
    def __init__(self,
                 albert_config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 segment_ids=None,
                 scope='bert',
                 drop_pooler=False,
                 trainable=True,
                 **kwargs):
        """Build the ALBERT graph: embeddings, encoder stack and pooler.

        Args:
          albert_config: `AlbertConfig` instance. A deep copy is taken, so
            the caller's object is never modified.
          is_training: bool. When false, both dropout probabilities of the
            copied config are forced to 0.0.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape
            [batch_size, seq_length]. Defaults to all ones.
          segment_ids: (optional) int32 Tensor of shape
            [batch_size, seq_length]. Defaults to all zeros.
          scope: (optional) variable scope. Defaults to "bert".
          drop_pooler: (optional) bool. When true, the pooled output is the
            first-token hidden state itself and no dense layer is created.
          trainable: (optional) bool forwarded to the embedding, encoder and
            pooler layers.
          **kwargs: only `use_tilda_embedding` is read. When truthy, an
            already existing variable named `tilda_embeddings` is fetched
            from the root variable scope and handed to the embedding lookup.

        Raises:
          ValueError: presumably raised by the helpers when the config or an
            input tensor shape is invalid -- confirm against `util`.
        """
        # Work on a private copy so that disabling dropout for evaluation
        # does not leak back into the caller's config.
        config = copy.deepcopy(albert_config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        dims = util.get_shape_list(input_ids, expected_rank=2)
        batch_size, seq_length = dims[0], dims[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)
        if segment_ids is None:
            segment_ids = tf.zeros(shape=[batch_size, seq_length],
                                   dtype=tf.int32)

        # Tilda embeddings for SMART algorithm: reuse the existing variable
        # instead of creating a new embedding table.
        tilda_embeddings = None
        if kwargs.get('use_tilda_embedding'):
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):
            with tf.variable_scope("embeddings"):
                # Word-id lookup; also yields the embedding table itself.
                lookup_outputs = embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.embedding_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name="word_embeddings",
                    tilda_embeddings=tilda_embeddings,
                    trainable=trainable)
                (self.word_embedding_output,
                 self.output_embedding_table) = lookup_outputs

                # Positional and token-type embeddings are added on top,
                # followed by layer normalization and dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.word_embedding_output,
                    use_token_type=True,
                    segment_ids=segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob,
                    trainable=trainable)

            with tf.variable_scope("encoder"):
                # Stacked transformer; every layer output is kept, each of
                # shape [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=input_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_hidden_groups=config.num_hidden_groups,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    inner_group_num=config.inner_group_num,
                    intermediate_act_fn=util.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        config.attention_probs_dropout_prob),
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True,
                    use_einsum=False,
                    trainable=trainable)

            self.sequence_output = self.all_encoder_layers[-1]

            # The pooler turns [batch_size, seq_length, hidden_size] into
            # [batch_size, hidden_size] for segment-level tasks by keeping
            # only the hidden state of the first token.
            with tf.variable_scope("pooler"):
                first_token = tf.squeeze(self.sequence_output[:, 0:1, :],
                                         axis=1)

                if not drop_pooler:
                    self.pooled_output = tf.layers.dense(
                        first_token,
                        config.hidden_size,
                        activation=tf.tanh,
                        kernel_initializer=util.create_initializer(
                            config.initializer_range),
                        trainable=trainable)
                else:
                    # trick: skip the fully connected layer entirely
                    self.pooled_output = first_token
Beispiel #12
0
def two_stream_rel_attn(h,
                        g,
                        r,
                        mems,
                        r_w_bias,
                        r_r_bias,
                        seg_mat,
                        r_s_bias,
                        seg_embed,
                        attn_mask_h,
                        attn_mask_g,
                        target_mapping,
                        d_model,
                        n_head,
                        d_head,
                        dropout,
                        dropatt,
                        is_training,
                        kernel_initializer,
                        scope='rel_attn'):
    '''Two-stream attention with relative positional encoding.

    The content stream (`h`) and the query stream (`g`) share keys, values
    and all projection weights; only their queries and attention masks
    differ. The weights are created while processing the h-stream and reused
    (`reuse=True`) for the g-stream.

    Args:
      h: content-stream input.
      g: query-stream input.
      r: positional encoding fed to the position-based key projection.
      mems: optional memory; when it is not None and has more than one
        dimension it is concatenated in front of `h` along axis 0.
      r_w_bias, r_r_bias, r_s_bias, seg_mat, seg_embed: forwarded unchanged
        to `rel_attn_core`.
      attn_mask_h: attention mask for the content stream.
      attn_mask_g: attention mask for the query stream.
      target_mapping: optional tensor; when given, the g-stream queries are
        mapped with einsum `mbnd,mlb->lbnd` before attention and the result
        is mapped back with `lbnd,mlb->mbnd`.
      d_model, n_head, d_head: projection sizes.
      dropout, dropatt: dropout rates for post-processing and attention.
      is_training: bool forwarded to the attention and post-processing ops.
      kernel_initializer: initializer for the projection kernels.
      scope: variable scope name.

    Returns:
      Tuple `(output_h, output_g)` with the updated content and query
      streams.
    '''

    attn_scale = 1 / (d_head**0.5)

    def _project(inputs, name):
        # All heads use the same projection hyper-parameters.
        return head_projection(inputs, d_model, n_head, d_head,
                               kernel_initializer, name)

    def _attend(query_head, mask):
        # Both streams attend over the same content keys/values and the
        # same positional keys; only the query and the mask change.
        return rel_attn_core(query_head, k_head_h, v_head_h, k_head_r,
                             seg_embed, seg_mat, r_w_bias, r_r_bias,
                             r_s_bias, mask, dropatt, is_training,
                             attn_scale)

    def _finish(stream, attn_vec):
        return post_attention(stream, attn_vec, d_model, n_head, d_head,
                              dropout, is_training, kernel_initializer)

    with tf.variable_scope(scope, reuse=False):
        # Keys and values are computed over memory + current content.
        use_mems = mems is not None and mems.shape.ndims > 1
        cat = tf.concat([mems, h], 0) if use_mems else h

        k_head_h = _project(cat, 'k')  # content-based key head
        v_head_h = _project(cat, 'v')  # content-based value head
        k_head_r = _project(r, 'r')  # position-based key head

        ##### h-stream
        q_head_h = _project(h, 'q')
        output_h = _finish(h, _attend(q_head_h, attn_mask_h))

    with tf.variable_scope(scope, reuse=True):
        ##### g-stream (reuses the 'q' projection created above)
        q_head_g = _project(g, 'q')

        if target_mapping is None:
            attn_vec_g = _attend(q_head_g, attn_mask_g)
        else:
            q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
            attn_vec_g = _attend(q_head_g, attn_mask_g)
            attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g,
                                   target_mapping)

        output_g = _finish(g, attn_vec_g)

    return output_h, output_g
Beispiel #13
0
    def __init__(self,
                 xlnet_config,
                 is_training,
                 input_ids,
                 seg_ids,
                 input_mask,
                 mems=None,
                 perm_mask=None,
                 target_mapping=None,
                 inp_q=None,
                 **kwargs):
        '''Build the XLNet transformer under the `model` variable scope.

        Args:
          xlnet_config: XLNetConfig holding the architecture sizes.
          is_training: bool; dropout rates are 0.1 when true, 0.0 otherwise.
          input_ids: int32 Tensor in shape [len, bsz], the input token IDs.
          seg_ids: int32 Tensor in shape [len, bsz], the input segment IDs.
          input_mask: float32 Tensor in shape [len, bsz];
              0 marks real tokens and 1 marks padding.
          mems: list of float32 Tensors in shape [mem_len, bsz, d_model]
              carrying memory from previous batches, one per layer.
              None disables memory.
          perm_mask: float32 Tensor in shape [len, len, bsz].
              perm_mask[i, j, k] = 0 lets i attend to j in batch k,
              perm_mask[i, j, k] = 1 forbids it.
              None lets every position attend to all the others.
          target_mapping: float32 Tensor in shape [num_predict, len, bsz].
              target_mapping[i, j, k] = 1 means the i-th prediction in
              batch k is on the j-th token. Pretraining only (partial
              prediction); None during finetuning.
          inp_q: float32 Tensor in shape [len, bsz];
              1 for tokens with losses and 0 for tokens without.
              Pretraining only (two-stream attention); None during
              finetuning.
          **kwargs: only `use_tilda_embedding` is read. When truthy, the
              existing `tilda_embeddings` variable is fetched from the root
              variable scope and passed to the transformer.
        '''

        run_config = XLNetRunConfig(
            is_training=is_training,
            bi_data=False,
            use_tpu=False,
            use_bfloat16=False,
            dropout=(0.1 if is_training else 0.0),
            dropatt=(0.1 if is_training else 0.0),
            init='normal',
            init_range=0.1,
            init_std=0.02,
            clamp_len=-1)
        initializer = _get_initializer(run_config)

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        if kwargs.get('use_tilda_embedding'):
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        tfm_args = {
            # architecture and run-time settings
            'n_token': xlnet_config.n_token,
            'initializer': initializer,
            'attn_type': 'bi',
            'n_layer': xlnet_config.n_layer,
            'd_model': xlnet_config.d_model,
            'n_head': xlnet_config.n_head,
            'd_head': xlnet_config.d_head,
            'd_inner': xlnet_config.d_inner,
            'ff_activation': xlnet_config.ff_activation,
            'untie_r': xlnet_config.untie_r,
            'is_training': run_config.is_training,
            'use_bfloat16': run_config.use_bfloat16,
            'use_tpu': run_config.use_tpu,
            'dropout': run_config.dropout,
            'dropatt': run_config.dropatt,
            'mem_len': run_config.mem_len,
            'reuse_len': run_config.reuse_len,
            'bi_data': run_config.bi_data,
            'clamp_len': run_config.clamp_len,
            'same_length': run_config.same_length,
            # input tensors
            'inp_k': input_ids,
            'seg_id': seg_ids,
            'input_mask': input_mask,
            'mems': mems,
            'perm_mask': perm_mask,
            'target_mapping': target_mapping,
            'inp_q': inp_q,
            'tilda_embeddings': tilda_embeddings,
        }

        with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
            tfm_outputs = transformer_xl(**tfm_args)
        (self.output, self.new_mems, self.lookup_table) = tfm_outputs

        self.input_mask = input_mask
        self.initializer = initializer
        self.xlnet_config = xlnet_config
        self.run_config = run_config
Beispiel #14
0
def summarize_sequence(summary_type,
                       hidden,
                       d_model,
                       n_head,
                       d_head,
                       dropout,
                       dropatt,
                       input_mask,
                       is_training,
                       initializer,
                       scope=None,
                       reuse=None,
                       use_proj=True):
    '''Reduce a sequence of hidden states to a single summary.

    Different classification tasks may or may not share the parameters used
    to summarize the sequence features. To share them, leave `scope` at its
    default `None`; otherwise give every task its own `scope`.

    Args:
      summary_type: one of 'last', 'first', 'mean' or 'attn'.
      hidden: hidden states; axis 0 is reduced (indexed or averaged) and
        axis 1 is read as the batch size by the 'attn' branch.
      d_model, n_head, d_head: sizes for the 'attn' summary and the
        projection.
      dropout, dropatt: dropout rates.
      input_mask: optional mask, only used by the 'attn' summary.
      is_training: bool controlling dropout.
      initializer: initializer for the created variables.
      scope: optional variable scope.
      reuse: forwarded to `tf.variable_scope`.
      use_proj: whether to apply an extra tanh projection, as in BERT.

    Returns:
      The summary tensor after the optional projection and dropout.

    Raises:
      ValueError: if `summary_type` is not supported.
    '''

    # NOTE: the default scope name is misspelled, but it determines the
    # variable names, so it has to stay as it is.
    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'attn':
            batch = tf.shape(hidden)[1]

            # A single learned query vector, tiled across the batch.
            bias = tf.get_variable('summary_bias', [d_model],
                                   dtype=hidden.dtype,
                                   initializer=initializer)
            query = tf.tile(bias[None, None], [1, batch, 1])

            attn_mask = input_mask
            if attn_mask is not None:
                attn_mask = attn_mask[None, :, :, None]

            attended = multihead_attn(query,
                                      hidden,
                                      hidden,
                                      attn_mask,
                                      d_model,
                                      n_head,
                                      d_head,
                                      dropout,
                                      dropatt,
                                      is_training,
                                      initializer,
                                      residual=False)
            summary = attended[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'last':
            summary = hidden[-1]
        else:
            raise ValueError('Unsupported summary type %s' % summary_type)

        # use another projection as in BERT
        if use_proj:
            summary = tf.layers.dense(summary,
                                      d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name='summary')

        # dropout
        summary = tf.layers.dropout(summary,
                                    dropout,
                                    training=is_training,
                                    name='dropout')

    return summary
Beispiel #15
0
    def _cls_self_attention_paper(self,
                                  prev_output,
                                  batch_size,
                                  max_seq_length,
                                  label_size,
                                  attention_mask=None,
                                  cls_hidden_size=128,
                                  cls_num_attention_heads=2,
                                  attention_probs_dropout_prob=0.1,
                                  initializer_range=0.02,
                                  dtype=tf.float32,
                                  trainable=True):
        '''Classification head: projection, self-attention, then two dense
        layers on the first position.

        Args:
          prev_output: input tensor of the head (presumably
            [batch_size, max_seq_length, hidden] -- confirm against caller).
          batch_size: batch size forwarded to `self.attention_layer`.
          max_seq_length: sequence length forwarded to
            `self.attention_layer` for both the from- and to-side.
          label_size: number of output units of the final dense layer.
          attention_mask: optional mask forwarded to the attention layer.
          cls_hidden_size: width of the projection and intermediate layers.
            Must be a multiple of `cls_num_attention_heads`.
          cls_num_attention_heads: number of attention heads.
          attention_probs_dropout_prob: dropout rate on attention
            probabilities.
          initializer_range: range passed to `util.create_initializer`.
          dtype: dtype forwarded to the attention layer.
          trainable: whether the created variables are trainable.

        Returns:
          Tensor produced by the final dense layer with `label_size` units.

        Raises:
          ValueError: if `cls_hidden_size` is not a multiple of
            `cls_num_attention_heads`.
        '''
        if cls_hidden_size % cls_num_attention_heads != 0:
            raise ValueError(
                '`cls_hidden_size` (%d) is not a multiple of the number of '
                '`cls_num_attention_heads` (%d)' %
                (cls_hidden_size, cls_num_attention_heads))
        cls_attention_head_size = int(cls_hidden_size /
                                      cls_num_attention_heads)

        with tf.variable_scope('project'):
            attention_input = tf.layers.dense(
                prev_output,
                cls_hidden_size,
                activation='tanh',
                kernel_initializer=util.create_initializer(initializer_range),
                trainable=trainable)

        with tf.variable_scope('attention'):
            with tf.variable_scope('self'):
                # Only one attention head group is ever built, so its output
                # is used directly; the former list + concat fallback was
                # unreachable.
                (attention_output, _) = self.attention_layer(
                    from_tensor=attention_input,
                    to_tensor=attention_input,
                    attention_mask=attention_mask,
                    num_attention_heads=cls_num_attention_heads,
                    size_per_head=cls_attention_head_size,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    initializer_range=initializer_range,
                    do_return_2d_tensor=False,
                    batch_size=batch_size,
                    from_max_seq_length=max_seq_length,
                    to_max_seq_length=max_seq_length,
                    dtype=dtype,
                    trainable=trainable)

        with tf.variable_scope('intermediate'):
            # Keep only index 0 along axis 1 (the first position).
            intermediate_output = tf.layers.dense(
                attention_output[:, 0, :],
                cls_hidden_size,
                activation='tanh',
                kernel_initializer=util.create_initializer(initializer_range),
                trainable=trainable)

        with tf.variable_scope('output'):
            cls_output = tf.layers.dense(
                intermediate_output,
                label_size,
                kernel_initializer=util.create_initializer(initializer_range),
                trainable=trainable)

        return cls_output
Beispiel #16
0
    def __init__(self,
                 bert_config,
                 is_training,
                 dilated_ids,
                 label_ids,
                 max_seq_length,
                 spad_id=1,
                 loop=3,
                 sample_weight=None,
                 scope='dilated',
                 use_tilda_embedding=False,
                 **kwargs):
        '''Build the dilated language model graph.

        In training mode a single forward pass is run and a masked LM loss
        is computed. In inference mode the model is applied `loop` times,
        each time compacting its own predictions and dilating them again
        (interleaving with zeros) before the next pass.

        Args:
          bert_config: config passed to `self._bert_forward`; its
            `vocab_size` sets the one-hot label depth.
          is_training: bool selecting the training or inference graph.
          dilated_ids: rank-2 int Tensor of dilated token ids; id 0 is
            treated as padding / empty slot.
          label_ids: label ids, only used in training mode.
          max_seq_length: undilated sequence length; the dilated tensors
            produced at inference have length `max_seq_length * 2`.
          spad_id: id of the temporary special-padding token used while
            compacting predictions. It is compared against real output ids,
            so it presumably must be a positive id that never appears as a
            prediction -- confirm against the vocabulary.
          loop: number of forward passes at inference.
          sample_weight: optional per-example loss weights.
          scope: variable scope of the model.
          use_tilda_embedding: when true, the existing `tilda_embeddings`
            variable is fetched from the root scope (SMART algorithm).
          **kwargs: ignored.
        '''
        super().__init__()

        dilated_mask = tf.cast(tf.not_equal(dilated_ids, 0), tf.float32)

        shape = util.get_shape_list(dilated_ids, expected_rank=2)
        batch_size = shape[0]
        dilated_seq_length = shape[1]

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        if use_tilda_embedding:
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):

            # forward once
            if is_training:
                logits = self._bert_forward(bert_config,
                                            dilated_ids,
                                            dilated_mask,
                                            batch_size,
                                            dilated_seq_length,
                                            tilda_embeddings=tilda_embeddings)

                self.preds['LM'] = tf.argmax(logits, axis=-1)

                # LM loss
                log_probs = tf.nn.log_softmax(logits, axis=-1)
                one_hot_labels = tf.one_hot(label_ids,
                                            depth=bert_config.vocab_size)
                per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                                axis=-1)

                # Only the first `2 * (number of non-zero input ids)`
                # positions contribute to the loss.
                input_length = tf.reduce_sum(dilated_mask, axis=-1) * 2
                label_mask = tf.sequence_mask(input_length,
                                              max_seq_length * 2,
                                              dtype=tf.float32)
                per_example_loss = \
                    tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \
                    tf.reduce_sum(label_mask, axis=-1)
                if sample_weight is not None:
                    # `per_example_loss` is rank 1 here. Expanding the weight
                    # to [batch, 1] would broadcast the product to
                    # [batch, batch], so flatten the weight instead and
                    # apply it element-wise per example.
                    per_example_loss *= tf.reshape(
                        tf.cast(sample_weight, dtype=tf.float32), [-1])

                self.total_loss = tf.reduce_mean(per_example_loss)
                self.losses['LM'] = per_example_loss

            # forward loop
            else:

                def _forward(dilated_ids, dilated_mask):
                    # One refinement step: predict, compact, dilate again.

                    logits = self._bert_forward(
                        bert_config,
                        dilated_ids,
                        dilated_mask,
                        batch_size,
                        dilated_seq_length,
                        tilda_embeddings=tilda_embeddings)
                    output_ids = tf.argmax(logits, axis=-1)
                    output_ids = tf.cast(output_ids, dtype=tf.int32)

                    # special padding (using `spad` token): every predicted
                    # zero is compensated by one `spad` on the right, so each
                    # row keeps exactly `dilated_seq_length` non-zero ids.
                    zero_count = tf.cast(tf.equal(output_ids, 0), tf.int32)
                    zero_count = tf.reduce_sum(zero_count, axis=-1)
                    right_pad = spad_id * tf.sequence_mask(
                        zero_count, dilated_seq_length, dtype=tf.int32)
                    padded = tf.concat([output_ids, right_pad], axis=-1)

                    # extract ids of length `max_seq_length`
                    flattened_padded = tf.reshape(padded, [-1])
                    # `tf.boolean_mask` is documented to take a boolean mask,
                    # so the comparison result is passed without casting.
                    is_valid = tf.greater(flattened_padded, 0)
                    flattened_valid = tf.boolean_mask(flattened_padded,
                                                      is_valid)
                    valid = tf.reshape(flattened_valid,
                                       [batch_size, dilated_seq_length])
                    truncated = valid[:, :max_seq_length]

                    # replace `spad` token with `pad`
                    non_spad_mask = tf.cast(
                        tf.not_equal(truncated, spad_id), dtype=tf.int32)
                    output_ids = truncated * non_spad_mask
                    output_length = tf.reduce_sum(non_spad_mask, axis=-1)

                    # dilate: interleave every id / mask entry with a zero
                    reshaped_ids = tf.reshape(output_ids,
                                              [batch_size, max_seq_length, 1])
                    reshaped_mask = tf.reshape(
                        tf.sequence_mask(output_length,
                                         max_seq_length,
                                         dtype=tf.int32),
                        [batch_size, max_seq_length, 1])
                    concat_ids = tf.concat(
                        [reshaped_ids,
                         tf.zeros_like(reshaped_ids)], axis=-1)
                    concat_mask = tf.concat(
                        [reshaped_mask,
                         tf.zeros_like(reshaped_mask, dtype=tf.int32)],
                        axis=-1)
                    dilated_ids = tf.reshape(concat_ids,
                                             [batch_size, max_seq_length * 2])
                    dilated_mask = tf.reshape(concat_mask,
                                              [batch_size, max_seq_length * 2])

                    return dilated_ids, dilated_mask

                for _ in range(loop):
                    dilated_ids, dilated_mask = _forward(
                        dilated_ids, dilated_mask)

                self.preds['LM'] = dilated_ids
Beispiel #17
0
    def __init__(self,
                 bert_config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=True,
                 scope=None,
                 embedding_size=None,
                 input_embeddings=None,
                 input_reprs=None,
                 update_embeddings=True,
                 untied_embeddings=False):
        '''Constructor for BertModel.

        Args:
          bert_config: `BertConfig` instance. A deep copy is taken, so the
            caller's object is never modified.
          is_training: bool. true for training model, false for eval model.
            Controls whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size,
            seq_length]. Defaults to all ones.
          token_type_ids: int32 Tensor of shape [batch_size, seq_length].
            Required despite the `None` default: the batch size and the
            sequence length are inferred from it.
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On
            the TPU, it is much faster if this is True, on the CPU or GPU,
            it is faster if this is False.
          scope: (optional) variable scope. Defaults to 'electra'.
          embedding_size: (optional) width of the word embeddings. Defaults
            to `bert_config.hidden_size`.
          input_embeddings: (optional) not used by this constructor.
          input_reprs: (optional) precomputed embedding output. When given,
            no embedding variables are built and `self.embedding_table` is
            not set.
          update_embeddings: (optional) bool. When false, gradients are
            stopped at the embedding output.
          untied_embeddings: (optional) bool. When true, the embeddings live
            under `scope + '/embeddings'` instead of 'electra/embeddings'
            (so `scope` must then be a string).

        Raises:
          ValueError: `token_type_ids` is missing. The helpers presumably
            also raise it for an invalid config or input shape -- confirm
            against `util`.
        '''
        # Validate before the first use: the shape inference below would
        # otherwise fail on None with an unrelated error, and an `assert`
        # would be stripped under `python -O`.
        if token_type_ids is None:
            raise ValueError(
                '`token_type_ids` must be provided: it is used to infer '
                'the batch size and the sequence length.')

        bert_config = copy.deepcopy(bert_config)
        if not is_training:
            bert_config.hidden_dropout_prob = 0.0
            bert_config.attention_probs_dropout_prob = 0.0

        input_shape = util.get_shape_list(token_type_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if input_reprs is None:
            # Tied embeddings always live under 'electra/embeddings' so that
            # several models can share them through AUTO_REUSE.
            embeddings_scope = \
                (scope if untied_embeddings else 'electra') + '/embeddings'

            with tf.variable_scope(embeddings_scope, reuse=tf.AUTO_REUSE):
                # Perform embedding lookup on the word ids
                if embedding_size is None:
                    embedding_size = bert_config.hidden_size
                (token_embeddings, self.embedding_table) = \
                    embedding_lookup(
                        input_ids=input_ids,
                        vocab_size=bert_config.vocab_size,
                        embedding_size=embedding_size,
                        initializer_range=bert_config.initializer_range,
                        word_embedding_name='word_embeddings',
                        use_one_hot_embeddings=use_one_hot_embeddings)

            with tf.variable_scope(embeddings_scope, reuse=tf.AUTO_REUSE):
                # Add positional embeddings and token type embeddings, then
                # layer normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=token_embeddings,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=bert_config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=bert_config.initializer_range,
                    max_position_embeddings=\
                        bert_config.max_position_embeddings,
                    dropout_prob=bert_config.hidden_dropout_prob)
        else:
            self.embedding_output = input_reprs
        if not update_embeddings:
            self.embedding_output = tf.stop_gradient(self.embedding_output)

        with tf.variable_scope(scope, default_name='electra'):
            # Project the embeddings when their width differs from the
            # encoder's hidden size.
            if self.embedding_output.shape[-1] != bert_config.hidden_size:
                self.embedding_output = tf.layers.dense(
                    self.embedding_output,
                    bert_config.hidden_size,
                    name='embeddings_project')

            with tf.variable_scope('encoder'):
                # This converts a 2D mask of shape [batch_size, seq_length]
                # to a 3D mask of shape [batch_size, seq_length, seq_length]
                # which is used for the attention scores.
                attention_mask = create_attention_mask_from_input_mask(
                    token_type_ids, input_mask)

                # Run the stacked transformer. Output shapes
                # attn_maps:
                #   [n_layers, batch_size, n_heads, seq_length, seq_length]
                (self.all_layer_outputs, self.attn_maps) = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=bert_config.hidden_size,
                    num_hidden_layers=bert_config.num_hidden_layers,
                    num_attention_heads=bert_config.num_attention_heads,
                    intermediate_size=bert_config.intermediate_size,
                    intermediate_act_fn=util.get_activation(
                        bert_config.hidden_act),
                    hidden_dropout_prob=bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=bert_config.
                    attention_probs_dropout_prob,
                    initializer_range=bert_config.initializer_range,
                    do_return_all_layers=True)
                self.sequence_output = self.all_layer_outputs[-1]
                # The pooled output is simply the first position of the
                # last layer; no dense pooler is applied.
                self.pooled_output = self.sequence_output[:, 0]
Beispiel #18
0
    def _lm_forward(self,
                    is_training,
                    input_tensor,
                    input_mask,
                    label_ids,
                    bert_config,
                    batch_size,
                    max_seq_length,
                    prob,
                    scope,
                    name,
                    sample_weight=None,
                    hidden_dropout_prob=0.1,
                    initializer_range=0.02):
        '''Builds the verifier/prediction token heads and their losses.

        Two heads are stacked on `input_tensor` under `scope`:

          * `verifier`: a 2-way classifier per token whose target is
            `label_ids > 0`.
          * `prediction`: a two-layer projection followed by a matmul with
            `self.embedding_table`, scored against `label_ids`. Only tokens
            that the verifier predicts as positive contribute to this loss.

        When `prob != 0`, the batch mean of each head's per-example loss is
        added to `self.total_loss`, which must already exist. The verifier's
        per-example loss is stored in `self.losses[name + '_loss']`, and the
        vocabulary predictions (zeroed wherever the verifier predicts 0) in
        `self.preds[name + '_preds']`.

        Args:
          is_training: Accepted for interface symmetry; unused here.
          input_tensor: Hidden states fed to both heads. The output of the
            final projection is reshaped to
            `[batch_size * max_seq_length, hidden_size]`.
          input_mask: Per-token mask, cast to float32 here. Presumably 1 for
            real tokens and 0 for padding -- confirm against the caller.
          label_ids: Integer target ids per token.
          bert_config: Supplies `hidden_size`, `vocab_size` and
            `initializer_range`.
          batch_size: Batch dimension used for reshaping.
          max_seq_length: Sequence dimension used for reshaping.
          prob: If 0, the losses of this head are not added to
            `self.total_loss`.
          scope: Variable scope enclosing both heads.
          name: Prefix of the keys written to `self.losses`/`self.preds`.
          sample_weight: Optional per-example weights. Assumed to hold one
            value per example (`[batch_size]`) -- TODO confirm.
          hidden_dropout_prob: Unused here.
          initializer_range: Unused here; `bert_config.initializer_range` is
            what the layers are initialized with.
        '''
        if sample_weight is not None:
            sample_weight = tf.cast(sample_weight, tf.float32)

        with tf.variable_scope(scope):

            with tf.variable_scope('verifier'):
                logits = tf.layers.dense(
                    input_tensor,
                    2,
                    kernel_initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=True)
                verifier_label_ids = tf.cast(tf.greater(label_ids, 0),
                                             tf.int32)

                # loss
                log_probs = tf.nn.log_softmax(logits, axis=-1)
                one_hot_labels = tf.one_hot(verifier_label_ids, depth=2)
                per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                                axis=-1)

                input_mask = tf.cast(input_mask, tf.float32)
                # The epsilon keeps a fully masked row from producing 0/0
                # (NaN); the prediction head below normalizes the same way.
                per_token_loss *= input_mask / (
                    tf.reduce_sum(input_mask, keepdims=True, axis=-1) + 1e-6)
                per_example_loss = tf.reduce_sum(per_token_loss, axis=-1)
                if sample_weight is not None:
                    # `per_example_loss` is rank-1 at this point, so the
                    # weights are applied element-wise. Expanding them to
                    # [batch_size, 1] would broadcast the product to
                    # [batch_size, batch_size].
                    per_example_loss *= sample_weight

                if prob != 0:
                    self.total_loss += tf.reduce_mean(per_example_loss)
                verifier_loss = per_example_loss
                verifier_preds = tf.argmax(logits, axis=-1)

            with tf.variable_scope('prediction'):

                with tf.variable_scope('intermediate'):
                    logits = tf.layers.dense(
                        input_tensor,
                        bert_config.hidden_size * 4,
                        kernel_initializer=util.create_initializer(
                            bert_config.initializer_range),
                        activation=util.gelu,
                        trainable=True)
                with tf.variable_scope('output'):
                    logits = tf.layers.dense(
                        logits,
                        bert_config.hidden_size,
                        kernel_initializer=util.create_initializer(
                            bert_config.initializer_range),
                        trainable=True)

                # Score every position against the (tied) embedding table.
                flattened = tf.reshape(
                    logits,
                    [batch_size * max_seq_length, bert_config.hidden_size])
                logits = tf.matmul(flattened,
                                   self.embedding_table,
                                   transpose_b=True)
                logits = tf.reshape(
                    logits, [-1, max_seq_length, bert_config.vocab_size])

                # loss
                log_probs = tf.nn.log_softmax(logits, axis=-1)
                one_hot_labels = tf.one_hot(label_ids,
                                            depth=bert_config.vocab_size)
                per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                                axis=-1)

                # Only positions the verifier flags as positive are trained
                # on the vocabulary prediction.
                input_mask *= tf.cast(verifier_preds, tf.float32)
                per_token_loss *= input_mask / (
                    tf.reduce_sum(input_mask, keepdims=True, axis=-1) + 1e-6)
                per_example_loss = tf.reduce_sum(per_token_loss, axis=-1)
                if sample_weight is not None:
                    # Element-wise, for the same reason as in the verifier.
                    per_example_loss *= sample_weight

                if prob != 0:
                    self.total_loss += tf.reduce_mean(per_example_loss)
                # NOTE(review): only the verifier loss is exposed here; the
                # prediction loss is folded into `self.total_loss` only.
                self.losses[name + '_loss'] = verifier_loss
                self.preds[name + '_preds'] = \
                    tf.argmax(logits, axis=-1) * verifier_preds
Beispiel #19
0
    def _get_generator_output(self, inputs, sample_weight, generator):
        '''Computes the generator's masked-LM logits, predictions and loss.

        The generator's sequence output is gathered at
        `inputs.masked_lm_positions`, projected to `embedding_size`, layer
        normalized and scored against the generator's embedding table plus a
        learned output bias.

        Args:
          inputs: Object exposing `masked_lm_positions`, `masked_lm_ids` and
            `masked_lm_weights`.
          sample_weight: Optional per-example weights. They are cast to
            float32, expanded on the last axis and multiplied into
            `masked_lm_weights`.
          generator: Encoder exposing `get_sequence_output()` and
            `get_embedding_table()`.

        Returns:
          An `MLMOutput` namedtuple with fields `logits`, `probs`, `loss`,
          `per_example_loss` and `preds`. `loss` is the weighted sum of the
          per-position losses divided by the sum of the weights (plus 1e-6).
        '''
        def _select_positions(hidden, positions):
            '''Picks the rows of `hidden` addressed by per-example positions.'''
            n_batch, n_steps, n_dims = util.get_shape_list(hidden, 3)

            # Convert per-example positions into indices of the flattened
            # [n_batch * n_steps, n_dims] matrix.
            row_starts = tf.reshape(
                tf.range(0, n_batch, dtype=tf.int32) * n_steps, [-1, 1])
            flat_index = tf.reshape(positions + row_starts, [-1])
            flat_hidden = tf.reshape(hidden, [n_batch * n_steps, n_dims])
            return tf.gather(flat_hidden, flat_index)

        masked_hidden = _select_positions(generator.get_sequence_output(),
                                          inputs.masked_lm_positions)

        with tf.variable_scope('generator_predictions'):
            projected = tf.layers.dense(
                masked_hidden,
                units=self.config.embedding_size,
                activation=util.get_activation(self.bert_config.hidden_act),
                kernel_initializer=util.create_initializer(
                    self.bert_config.initializer_range))
            projected = util.layer_norm(projected)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[self.bert_config.vocab_size],
                initializer=tf.zeros_initializer())

            # Output weights are tied to the generator's embedding table.
            logits = tf.nn.bias_add(
                tf.matmul(projected,
                          generator.get_embedding_table(),
                          transpose_b=True),
                output_bias)
            probs = tf.nn.softmax(logits, axis=-1, name='MLM_probs')
            preds = tf.argmax(logits, axis=-1)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            target_ids = tf.reshape(inputs.masked_lm_ids, [-1])
            position_weights = inputs.masked_lm_weights
            if sample_weight is not None:
                sample_weight = tf.expand_dims(
                    tf.cast(sample_weight, dtype=tf.float32), axis=-1)
                position_weights *= sample_weight
            flat_weights = tf.reshape(position_weights, [-1])

            targets = tf.one_hot(target_ids,
                                 depth=self.bert_config.vocab_size,
                                 dtype=tf.float32)
            nll = -tf.reduce_sum(log_probs * targets, axis=[-1])
            per_example_loss = flat_weights * nll

            # Weighted average; the epsilon guards an all-zero weight sum.
            loss = tf.reduce_sum(per_example_loss) / (
                tf.reduce_sum(flat_weights) + 1e-6)

            MLMOutput = collections.namedtuple(
                'MLMOutput',
                ['logits', 'probs', 'loss', 'per_example_loss', 'preds'])
            return MLMOutput(logits=logits,
                             probs=probs,
                             loss=loss,
                             per_example_loss=per_example_loss,
                             preds=preds)
Beispiel #20
0
    def __init__(self,
                 bert_config,
                 is_training,
                 input_ids,
                 add_label_ids,
                 del_label_ids,
                 sample_weight=None,
                 add_prob=0,
                 del_prob=0,
                 scope='bert',
                 use_tilda_embedding=False,
                 **kwargs):
        '''Builds a BERT encoder with an "add" head and a "del" head.

        The encoder is run once over `input_ids`; its output feeds
        `_lm_forward` (scope `cls/add`) and `_cls_forward` (scope `cls/del`).
        `self.total_loss` is reset to 0 before the heads are built.

        Args:
          bert_config: Encoder configuration. It is not modified: dropout is
            disabled on a private copy when `is_training` is false.
          is_training: Whether the graph is built for training.
          input_ids: Rank-2 token ids. Id 0 is treated as padding when the
            input mask is derived.
          add_label_ids: Labels passed to the "add" head.
          del_label_ids: Labels passed to the "del" head.
          sample_weight: Optional weights forwarded to both heads.
          add_prob: Forwarded as `prob` to the "add" head.
          del_prob: Forwarded as `prob` to the "del" head.
          scope: Variable scope that wraps the encoder and both heads.
          use_tilda_embedding: If true, the already existing root-scope
            variable `tilda_embeddings` is fetched and passed to the encoder.
          **kwargs: Accepted and ignored.
        '''
        import copy

        super().__init__()

        input_mask = tf.cast(tf.not_equal(input_ids, 0), tf.float32)

        shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size = shape[0]
        max_seq_length = shape[1]

        if not is_training:
            # Work on a copy so the caller's config keeps its dropout rates;
            # mutating it in place would silently disable dropout for any
            # graph built later from the same config object.
            bert_config = copy.copy(bert_config)
            bert_config.hidden_dropout_prob = 0.0
            bert_config.attention_probs_dropout_prob = 0.0

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        if use_tilda_embedding:
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):

            # forward once
            hidden = self._bert_forward(bert_config,
                                        input_ids,
                                        input_mask,
                                        batch_size,
                                        max_seq_length,
                                        tilda_embeddings=tilda_embeddings)

            # Both heads accumulate their losses into `self.total_loss`.
            self.total_loss = 0
            self._lm_forward(is_training,
                             input_tensor=hidden,
                             input_mask=input_mask,
                             label_ids=add_label_ids,
                             bert_config=bert_config,
                             batch_size=batch_size,
                             max_seq_length=max_seq_length,
                             prob=add_prob,
                             scope='cls/add',
                             name='add',
                             sample_weight=sample_weight)
            self._cls_forward(is_training,
                              input_tensor=hidden,
                              input_mask=input_mask,
                              label_ids=del_label_ids,
                              bert_config=bert_config,
                              batch_size=batch_size,
                              max_seq_length=max_seq_length,
                              prob=del_prob,
                              scope='cls/del',
                              name='del',
                              sample_weight=sample_weight)
Beispiel #21
0
    def __init__(self,
                 is_training,
                 input_tensor,
                 is_supervised,
                 is_expanded,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 global_step=None,
                 num_train_steps=None,
                 uda_softmax_temp=-1,
                 uda_confidence_thresh=-1,
                 tsa_schedule='linear',
                 **kwargs):
        '''Builds a classifier head with supervised and consistency losses.

        A linear classifier is applied to `input_tensor`. Rows are then split
        with `is_expanded` and `is_supervised`:

          * supervised loss: cross entropy on the non-expanded, supervised
            rows, optionally masked by a training-signal-annealing threshold;
          * unsupervised loss: `kl_for_log_probs` between the non-expanded,
            unsupervised rows (gradient stopped) and the expanded rows.

        Results are written to `self.preds['preds']`,
        `self.losses['supervised']`, `self.losses['unsupervised']` and
        `self.total_loss` (sum of the two scalar losses).

        Args:
          is_training: Enables dropout and the TSA mask.
          input_tensor: Features to classify; the last dimension must be
            statically known.
          is_supervised: Mask over the non-expanded rows, cast to float32.
          is_expanded: Mask over all rows of `input_tensor`, cast to float32.
          label_ids: Labels, filtered with `is_supervised`.
          label_size: Number of classes.
          sample_weight: Optional weights, filtered with `is_supervised`
            (and its complement) before being applied.
          scope: Variable scope of the head.
          hidden_dropout_prob: Dropout rate applied when training.
          initializer_range: Passed to `util.create_initializer`.
          trainable: Whether the head's variables are trainable.
          global_step: Forwarded to `get_tsa_threshold`.
          num_train_steps: Forwarded to `get_tsa_threshold`.
          uda_softmax_temp: Temperature for the target distribution;
            -1 disables sharpening.
          uda_confidence_thresh: Minimum top probability for an unsupervised
            row to contribute; -1 disables the filter.
          tsa_schedule: Schedule name for `get_tsa_threshold`; a falsy value
            disables TSA.
          **kwargs: Forwarded to the parent constructor.
        '''
        super().__init__(**kwargs)

        is_supervised = tf.cast(is_supervised, tf.float32)
        is_expanded = tf.cast(is_expanded, tf.float32)

        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            with tf.variable_scope('sup_loss'):

                # Keep the non-expanded rows, then the supervised ones.
                sup_ori_log_probs = tf.boolean_mask(log_probs,
                                                    mask=(1.0 - is_expanded),
                                                    axis=0)
                sup_log_probs = tf.boolean_mask(sup_ori_log_probs,
                                                mask=is_supervised,
                                                axis=0)
                sup_label_ids = tf.boolean_mask(label_ids,
                                                mask=is_supervised,
                                                axis=0)

                # Predictions cover every non-expanded row.
                self.preds['preds'] = tf.argmax(sup_ori_log_probs, axis=-1)

                one_hot_labels = tf.one_hot(sup_label_ids,
                                            depth=label_size,
                                            dtype=tf.float32)
                per_example_loss = -tf.reduce_sum(
                    one_hot_labels * sup_log_probs, axis=-1)

                loss_mask = tf.ones_like(per_example_loss, dtype=tf.float32)
                correct_label_probs = tf.reduce_sum(one_hot_labels *
                                                    tf.exp(sup_log_probs),
                                                    axis=-1)

                if is_training and tsa_schedule:
                    # Drop examples whose correct-class probability already
                    # exceeds the annealed threshold.
                    tsa_start = 1.0 / label_size
                    tsa_threshold = get_tsa_threshold(tsa_schedule,
                                                      global_step,
                                                      num_train_steps,
                                                      tsa_start,
                                                      end=1)

                    larger_than_threshold = tf.greater(correct_label_probs,
                                                       tsa_threshold)
                    loss_mask = loss_mask * (
                        1 - tf.cast(larger_than_threshold, tf.float32))

                loss_mask = tf.stop_gradient(loss_mask)
                per_example_loss = per_example_loss * loss_mask
                if sample_weight is not None:
                    sup_sample_weight = tf.boolean_mask(sample_weight,
                                                        mask=is_supervised,
                                                        axis=0)
                    per_example_loss *= tf.cast(sup_sample_weight,
                                                dtype=tf.float32)
                # Average over the examples that survived the mask; the
                # `maximum` avoids dividing by zero when none did.
                sup_loss = (tf.reduce_sum(per_example_loss) /
                            tf.maximum(tf.reduce_sum(loss_mask), 1))

                self.losses['supervised'] = per_example_loss

            with tf.variable_scope('unsup_loss'):

                # Non-expanded unsupervised rows vs. the expanded rows.
                ori_log_probs = tf.boolean_mask(sup_ori_log_probs,
                                                mask=(1.0 - is_supervised),
                                                axis=0)
                aug_log_probs = tf.boolean_mask(log_probs,
                                                mask=is_expanded,
                                                axis=0)
                sup_ori_logits = tf.boolean_mask(logits,
                                                 mask=(1.0 - is_expanded),
                                                 axis=0)
                ori_logits = tf.boolean_mask(sup_ori_logits,
                                             mask=(1.0 - is_supervised),
                                             axis=0)

                unsup_loss_mask = 1
                if uda_softmax_temp != -1:
                    # Sharpen the target distribution with a temperature.
                    tgt_ori_log_probs = tf.nn.log_softmax(ori_logits /
                                                          uda_softmax_temp,
                                                          axis=-1)
                    tgt_ori_log_probs = tf.stop_gradient(tgt_ori_log_probs)
                else:
                    tgt_ori_log_probs = tf.stop_gradient(ori_log_probs)

                if uda_confidence_thresh != -1:
                    # Keep only rows on which the model is confident enough.
                    largest_prob = tf.reduce_max(tf.exp(ori_log_probs),
                                                 axis=-1)
                    unsup_loss_mask = tf.cast(
                        tf.greater(largest_prob, uda_confidence_thresh),
                        tf.float32)
                    unsup_loss_mask = tf.stop_gradient(unsup_loss_mask)

                per_example_loss = kl_for_log_probs(
                    tgt_ori_log_probs, aug_log_probs) * unsup_loss_mask
                if sample_weight is not None:
                    unsup_sample_weight = tf.boolean_mask(sample_weight,
                                                          mask=(1.0 -
                                                                is_supervised),
                                                          axis=0)
                    per_example_loss *= tf.cast(unsup_sample_weight,
                                                dtype=tf.float32)

                # Mean over the unsupervised rows. A plain `tf.reduce_mean`
                # is NaN when a batch holds no unsupervised row, which would
                # poison `total_loss`; dividing by max(count, 1) yields the
                # same value for non-empty input and 0 otherwise.
                num_unsup = tf.cast(tf.size(per_example_loss),
                                    per_example_loss.dtype)
                unsup_loss = (tf.reduce_sum(per_example_loss) /
                              tf.maximum(num_unsup, 1))

                self.losses['unsupervised'] = per_example_loss

            self.total_loss = sup_loss + unsup_loss
Beispiel #22
0
    def _bert_forward(self,
                      bert_config,
                      input_ids,
                      input_mask,
                      batch_size,
                      max_seq_length,
                      dtype=tf.float32,
                      trainable=True,
                      tilda_embeddings=None):
        '''Runs the embedding layer and the transformer stack once.

        Variables are created under the `embeddings` and `encoder` scopes.
        As a side effect the word embedding table returned by
        `self.embedding_lookup` is stored in `self.embedding_table`.

        Args:
          bert_config: Supplies vocabulary/hidden sizes, layer counts,
            dropout rates, activation name and initializer range.
          input_ids: Token ids to embed.
          input_mask: Mask from which the attention mask is built.
          batch_size: Batch dimension forwarded to the sub-layers.
          max_seq_length: Sequence dimension forwarded to the sub-layers.
          dtype: Data type forwarded to the sub-layers.
          trainable: Whether the created variables are trainable.
          tilda_embeddings: Optional embedding variable forwarded to
            `self.embedding_lookup`.

        Returns:
          The output of the last transformer layer.
        '''
        # Keyword arguments shared by the sub-layer builders below.
        shared = dict(batch_size=batch_size,
                      max_seq_length=max_seq_length,
                      dtype=dtype,
                      trainable=trainable)

        with tf.variable_scope('embeddings'):
            word_embeddings, self.embedding_table = self.embedding_lookup(
                input_ids=input_ids,
                vocab_size=bert_config.vocab_size,
                embedding_size=bert_config.hidden_size,
                initializer_range=bert_config.initializer_range,
                word_embedding_name='word_embeddings',
                tilda_embeddings=tilda_embeddings,
                **shared)

            # Position embeddings only (no token types), followed by layer
            # normalization and dropout.
            embedded = self.embedding_postprocessor(
                input_tensor=word_embeddings,
                hidden_size=bert_config.hidden_size,
                use_token_type=False,
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=bert_config.initializer_range,
                max_position_embeddings=bert_config.max_position_embeddings,
                dropout_prob=bert_config.hidden_dropout_prob,
                **shared)

        with tf.variable_scope('encoder'):
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, max_seq_length, dtype=dtype)

            # Stacked transformer layers; one output per layer.
            encoder_layers = self.transformer_model(
                input_tensor=embedded,
                attention_mask=attention_mask,
                hidden_size=bert_config.hidden_size,
                num_hidden_layers=bert_config.num_hidden_layers,
                num_attention_heads=bert_config.num_attention_heads,
                intermediate_size=bert_config.intermediate_size,
                intermediate_act_fn=util.get_activation(
                    bert_config.hidden_act),
                hidden_dropout_prob=bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    bert_config.attention_probs_dropout_prob),
                initializer_range=bert_config.initializer_range,
                **shared)

        return encoder_layers[-1]
Beispiel #23
0
    def transformer_model(self,
                          input_tensor,
                          batch_size,
                          max_seq_length,
                          attention_mask=None,
                          hidden_size=768,
                          num_hidden_layers=12,
                          num_attention_heads=12,
                          intermediate_size=3072,
                          intermediate_act_fn=util.gelu,
                          hidden_dropout_prob=0.1,
                          attention_probs_dropout_prob=0.1,
                          initializer_range=0.02,
                          dtype=tf.float32,
                          trainable=True):
        '''Builds a stack of transformer encoder layers.

        Each layer lives in its own `layer_<idx>` scope and consists of
        self-attention, an output projection with residual + layer norm, an
        intermediate dense layer with `intermediate_act_fn`, and a second
        output projection with residual + layer norm.

        As a side effect `self.attention_scores` is reset and filled with the
        attention scores returned by `self.attention_layer`, one entry per
        layer.

        Args:
          input_tensor: Input activations; flattened to a matrix with
            `util.reshape_to_matrix` before the first layer.
          batch_size: Batch dimension used for attention and final reshape.
          max_seq_length: Sequence dimension used likewise.
          attention_mask: Optional mask forwarded to the attention layer.
          hidden_size: Width of the layer outputs.
          num_hidden_layers: Number of layers to stack.
          num_attention_heads: Number of attention heads per layer.
          intermediate_size: Width of the intermediate dense layer.
          intermediate_act_fn: Activation of the intermediate dense layer.
          hidden_dropout_prob: Dropout rate after each output projection.
          attention_probs_dropout_prob: Dropout rate forwarded to attention.
          initializer_range: Passed to `util.create_initializer`.
          dtype: Data type forwarded to the attention layer.
          trainable: Whether the created variables are trainable.

        Returns:
          A list with one tensor per layer, each reshaped to
          `[batch_size, max_seq_length, hidden_size]`.

        Raises:
          ValueError: If `hidden_size` is not a multiple of
            `num_attention_heads`.
        '''
        if hidden_size % num_attention_heads:
            raise ValueError(
                'The hidden size (%d) is not a multiple of the number '
                'of attention heads (%d)' % (hidden_size, num_attention_heads))

        size_per_head = int(hidden_size / num_attention_heads)

        def _dense(inputs, units, activation=None):
            '''Dense layer with the stack-wide initializer and trainability.'''
            return tf.layers.dense(
                inputs,
                units,
                activation=activation,
                kernel_initializer=util.create_initializer(initializer_range),
                trainable=trainable)

        hidden = util.reshape_to_matrix(input_tensor)
        self.attention_scores = []
        flat_layer_outputs = []

        for layer_idx in range(num_hidden_layers):
            with tf.variable_scope('layer_%d' % layer_idx):
                block_input = hidden

                with tf.variable_scope('attention'):
                    with tf.variable_scope('self'):
                        context, scores = self.attention_layer(
                            from_tensor=block_input,
                            to_tensor=block_input,
                            attention_mask=attention_mask,
                            num_attention_heads=num_attention_heads,
                            size_per_head=size_per_head,
                            attention_probs_dropout_prob=(
                                attention_probs_dropout_prob),
                            initializer_range=initializer_range,
                            do_return_2d_tensor=True,
                            batch_size=batch_size,
                            from_max_seq_length=max_seq_length,
                            to_max_seq_length=max_seq_length,
                            dtype=dtype,
                            trainable=trainable)
                        self.attention_scores.append(scores)

                    # Project the attention context, then residual + norm.
                    with tf.variable_scope('output'):
                        attended = _dense(context, hidden_size)
                        attended = util.dropout(attended,
                                                hidden_dropout_prob)
                        attended = util.layer_norm(attended + block_input,
                                                   trainable=trainable)

                # Only the intermediate layer carries an activation.
                with tf.variable_scope('intermediate'):
                    expanded = _dense(attended,
                                      intermediate_size,
                                      activation=intermediate_act_fn)

                # Project back to `hidden_size`, then residual + norm.
                with tf.variable_scope('output'):
                    projected = _dense(expanded, hidden_size)
                    projected = util.dropout(projected, hidden_dropout_prob)
                    hidden = util.layer_norm(projected + attended,
                                             trainable=trainable)

                flat_layer_outputs.append(hidden)

        # Restore the [batch, sequence, hidden] layout of every layer output.
        matrix_shape = [batch_size * max_seq_length, hidden_size]
        tensor_shape = [batch_size, max_seq_length, hidden_size]
        return [
            util.reshape_from_matrix(flat_output,
                                     tensor_shape,
                                     original_shape=matrix_shape)
            for flat_output in flat_layer_outputs
        ]
Beispiel #24
0
def layer_norm(input_tensor,
               center=True,
               scale=True,
               activation_fn=None,
               variables_collections=None,
               outputs_collections=None,
               begin_norm_axis=-1,
               begin_params_axis=-1,
               trainable=True):
    ''' Runs layer normalization on the trailing dimensions of the tensor.

    With the default arguments only the last dimension is normalized.
    Variables are created under the `LayerNorm` variable scope.

    Args:
      input_tensor: A tensor having rank `R`. The normalization is performed
        over axes `begin_norm_axis ... R - 1` and centering and scaling
        parameters are calculated over `begin_params_axis ... R - 1`.
      center: If True, add offset of `beta` to normalized tensor. If False,
        `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is not used.
        When the next layer is linear (also e.g. `nn.relu`), this can be
        disabled since the scaling can be done by the next layer.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      variables_collections: Currently unused by this implementation.
      outputs_collections: Currently unused by this implementation.
      begin_norm_axis: The first normalization dimension: normalization will
        be performed along dimensions `begin_norm_axis : rank(input_tensor)`.
        Negative values count from the last dimension.
      begin_params_axis: The first parameter (beta, gamma) dimension: scale
        and centering parameters will have dimensions
        `begin_params_axis : rank(input_tensor)` and will be broadcast with
        the normalized inputs accordingly.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).

    Returns:
      A `Tensor` representing the output of the operation, having the same
      shape and dtype as `input_tensor`.

    Raises:
      ValueError: If the rank of `input_tensor` is not known at graph build
        time, if `begin_norm_axis` falls outside `[-rank, rank)`, if
        `begin_params_axis >= rank`, or if
        `input_tensor.shape[begin_params_axis:]` is not fully defined at
        graph build time.
    '''
    with tf.variable_scope('LayerNorm'):
        inputs_shape = input_tensor.shape
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.'
                             % input_tensor.name)
        dtype = input_tensor.dtype.base_dtype

        # Resolve a negative axis relative to the rank. If it is still
        # negative afterwards it was out of range; failing here avoids
        # handing a list with negative / duplicated axes to `tf.nn.moments`.
        requested_norm_axis = begin_norm_axis
        if begin_norm_axis < 0:
            begin_norm_axis = inputs_rank + begin_norm_axis
        if begin_norm_axis < 0:
            raise ValueError(
                'begin_norm_axis (%d) is out of range for rank(inputs) (%d)'
                % (requested_norm_axis, inputs_rank))
        if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
            raise ValueError(
                'begin_params_axis (%d) and begin_norm_axis (%d) '
                'must be < rank(inputs) (%d)'
                % (begin_params_axis, begin_norm_axis, inputs_rank))
        params_shape = inputs_shape[begin_params_axis:]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
                (input_tensor.name, begin_params_axis, inputs_shape))

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta = tf.get_variable(
                'beta', shape=params_shape,
                dtype=dtype,
                initializer=tf.zeros_initializer(),
                trainable=trainable)
        if scale:
            gamma = tf.get_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=tf.ones_initializer(),
                trainable=trainable)

        # Compute the moments over axes `begin_norm_axis ... rank - 1`
        # (only the last axis with the default arguments).
        norm_axes = list(range(begin_norm_axis, inputs_rank))
        mean, variance = tf.nn.moments(input_tensor, norm_axes, keep_dims=True)

        # Compute layer normalization using the batch_normalization function.
        # Note that epsilon must be increased for float16 due to the limited
        # representable range.
        variance_epsilon = 1e-12 if dtype != tf.float16 else 1e-3
        outputs = tf.nn.batch_normalization(
            input_tensor, mean, variance,
            offset=beta, scale=gamma,
            variance_epsilon=variance_epsilon)

        outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return outputs
Beispiel #25
0
    def __init__(self,
                 bert_config,
                 is_training,
                 encoder,
                 masked_lm_positions,
                 masked_lm_ids,
                 masked_lm_weights,
                 next_sentence_labels,
                 sample_weight=None,
                 scope_lm='cls/predictions',
                 scope_cls='cls/seq_relationship',
                 trainable=True,
                 use_nsp_loss=True,
                 **kwargs):
        ''' Builds the masked-LM (MLM) and next-sentence (NSP) heads.

        Args:
          bert_config: Config object; `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read from it.
          is_training: Not used in this constructor.
          encoder: Object providing `get_sequence_output()`,
            `get_embedding_table()` and `get_pooled_output()`.
          masked_lm_positions: Positions of the masked tokens inside each
            sequence; presumably `[batch_size, n_masked]` - TODO confirm.
          masked_lm_ids: Target ids of the masked tokens.
          masked_lm_weights: Per-masked-token loss weights.
          next_sentence_labels: NSP labels, flattened and one-hot encoded
            with depth 2.
          sample_weight: Optional per-example weight applied to both the
            MLM and the NSP losses; presumably `[batch_size]`.
          scope_lm: Variable scope of the MLM head.
          scope_cls: Variable scope of the NSP head.
          trainable: Passed to the `tf.get_variable` calls of both heads.
          use_nsp_loss: If False, the NSP loss is still computed and stored
            in `self.losses` but left out of `self.total_loss`.
          **kwargs: Forwarded to the parent constructor.
        '''
        super(BERTDecoder, self).__init__(**kwargs)

        def gather_indexes(sequence_tensor, positions):
            # Flattens the 3-D sequence tensor to 2-D and picks the rows at
            # `positions`, offset per example by `batch_index * seq_length`.
            sequence_shape = util.get_shape_list(sequence_tensor, 3)
            batch_size = sequence_shape[0]
            seq_length = sequence_shape[1]
            width = sequence_shape[2]

            flat_offsets = tf.reshape(
                tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
            flat_positions = tf.reshape(positions + flat_offsets, [-1])
            flat_sequence_tensor = tf.reshape(sequence_tensor,
                                              [batch_size * seq_length, width])
            output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
            return output_tensor

        scalar_losses = []

        # masked language modeling
        input_tensor = gather_indexes(encoder.get_sequence_output(),
                                      masked_lm_positions)
        with tf.variable_scope(scope_lm):
            # NOTE(review): the transform dense layer and its layer norm do
            # not receive `trainable`, unlike the variables below - confirm
            # whether that is intended.
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=bert_config.hidden_size,
                    activation=util.get_activation(bert_config.hidden_act),
                    kernel_initializer=util.create_initializer(
                        bert_config.initializer_range))
                input_tensor = util.layer_norm(input_tensor)
            output_bias = tf.get_variable('output_bias',
                                          shape=[bert_config.vocab_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            # The output projection reuses the encoder's embedding table.
            logits = tf.matmul(input_tensor,
                               encoder.get_embedding_table(),
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probs = tf.nn.softmax(logits, axis=-1, name='MLM_probs')
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            label_ids = tf.reshape(masked_lm_ids, [-1])
            if sample_weight is not None:
                # Use a separate name for the expanded weight. Rebinding
                # `sample_weight` here would leak the extra trailing axis
                # into the NSP section below, where multiplying it with the
                # rank-1 per-example loss broadcasts to a
                # [batch, batch] matrix.
                mlm_sample_weight = tf.expand_dims(
                    tf.cast(sample_weight, dtype=tf.float32), axis=-1)
                masked_lm_weights *= mlm_sample_weight
            label_weights = tf.reshape(masked_lm_weights, [-1])
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=bert_config.vocab_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])
            per_example_loss = label_weights * per_example_loss

            # Weighted mean; the small constant guards against an all-zero
            # weight sum.
            numerator = tf.reduce_sum(per_example_loss)
            denominator = tf.reduce_sum(label_weights) + 1e-5
            loss = numerator / denominator

            scalar_losses.append(loss)
            self.losses['MLM_losses'] = per_example_loss
            self.preds['MLM_preds'] = tf.argmax(probs, axis=-1)

        # next sentence prediction
        with tf.variable_scope(scope_cls):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, bert_config.hidden_size],
                initializer=util.create_initializer(
                    bert_config.initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[2],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            logits = tf.matmul(encoder.get_pooled_output(),
                               output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            labels = tf.reshape(next_sentence_labels, [-1])
            one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            if sample_weight is not None:
                # `sample_weight` is still the caller's original tensor here.
                per_example_loss = (tf.cast(sample_weight, dtype=tf.float32) *
                                    per_example_loss)
            loss = tf.reduce_mean(per_example_loss)

            if use_nsp_loss:
                scalar_losses.append(loss)
            self.losses['NSP_losses'] = per_example_loss
            self.probs['NSP_probs'] = probs
            self.preds['NSP_preds'] = tf.argmax(probs, axis=-1)

        self.total_loss = tf.add_n(scalar_losses)
Beispiel #26
0
    def dynamic_transformer_model(self,
                                  is_training,
                                  input_tensor,
                                  input_mask,
                                  batch_size,
                                  max_seq_length,
                                  label_size,
                                  attention_mask=None,
                                  hidden_size=768,
                                  num_hidden_layers=12,
                                  num_attention_heads=12,
                                  intermediate_size=3072,
                                  intermediate_act_fn=util.gelu,
                                  hidden_dropout_prob=0.1,
                                  attention_probs_dropout_prob=0.1,
                                  initializer_range=0.02,
                                  dtype=tf.float32,
                                  cls_model='self-attention',
                                  cls_hidden_size=128,
                                  cls_num_attention_heads=2,
                                  speed=0.1,
                                  ignore_cls=None):
        ''' Stacked transformer with a child classifier in front of each layer.

        Every layer first (optionally) runs a child classifier on its input
        and then the regular attention + feed-forward stream. The transformer
        weights are created with `trainable=False`; the child classifiers are
        created with `trainable=True`.

        At inference time the normalized entropy of each child classifier's
        softmax output is compared against `speed`: examples whose
        uncertainty is below `speed` are removed from the batch before the
        remaining layers run. When the last kept classifier index is
        reached, the function returns early.

        Args:
          is_training: Python bool. When True a child classifier is built
            for every layer and no examples are dropped.
          input_tensor: Input to the first layer.
          input_mask: Mask used to rebuild the attention mask after
            examples are dropped.
          batch_size: Batch size of `input_tensor`.
          max_seq_length: Sequence length of `input_tensor`.
          label_size: Number of classes predicted by the child classifiers.
          attention_mask: Attention mask for the first layer.
          hidden_size: Hidden width of the transformer.
          num_hidden_layers: Number of transformer layers.
          num_attention_heads: Number of attention heads per layer.
          intermediate_size: Width of the feed-forward intermediate layer.
          intermediate_act_fn: Activation of the intermediate layer.
          hidden_dropout_prob: Dropout rate applied to dense outputs.
          attention_probs_dropout_prob: Dropout rate of attention
            probabilities.
          initializer_range: Range passed to `util.create_initializer`.
          dtype: Dtype used for the masks and the attention layer.
          cls_model: One of `self-attention-paper`, `self-attention`, `fcn`.
          cls_hidden_size: Hidden width of attention-based classifiers.
          cls_num_attention_heads: Heads of attention-based classifiers.
          speed: Uncertainty threshold for dropping examples at inference.
          ignore_cls: Iterable of classifier indices to skip at inference;
            `None` means no classifier is skipped.

        Returns:
          A tuple `(all_layer_outputs, all_layer_cls_outputs)`: a list of
          layer tensors and an `OrderedDict` mapping layer index to the
          softmax output of that layer's child classifier.

        Raises:
          ValueError: If `hidden_size` is not a multiple of
            `num_attention_heads`, or if `cls_model` is not recognized.
        '''
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                'The hidden size (%d) is not a multiple of the number of '
                'attention heads (%d)' % (hidden_size, num_attention_heads))
        attention_head_size = int(hidden_size / num_attention_heads)

        # The default `None` must behave like "ignore nothing"; membership
        # tests against `None` would raise a TypeError.
        if ignore_cls is None:
            ignore_cls = []

        # Index `num_hidden_layers` stands for the classifier after the last
        # layer, hence the `+ 1`.
        keep_cls = [
            cls_idx for cls_idx in range(num_hidden_layers + 1)
            if cls_idx not in ignore_cls
        ]

        all_layer_outputs = []
        all_layer_cls_outputs = collections.OrderedDict()
        prev_output = input_tensor
        prev_mask = input_mask
        for layer_idx in range(num_hidden_layers):
            with tf.variable_scope('layer_%d' % layer_idx):

                # build child classifier
                if is_training or layer_idx not in ignore_cls:
                    with tf.variable_scope('distill'):

                        # FCN + Self_Attention + FCN + FCN
                        if cls_model == 'self-attention-paper':
                            cls_output = self._cls_self_attention_paper(
                                prev_output,
                                batch_size,
                                max_seq_length,
                                label_size,
                                attention_mask=attention_mask,
                                cls_hidden_size=cls_hidden_size,
                                cls_num_attention_heads=\
                                    cls_num_attention_heads,
                                attention_probs_dropout_prob=\
                                    attention_probs_dropout_prob,
                                initializer_range=initializer_range,
                                dtype=tf.float32,
                                trainable=True)

                        # Self_Attention + FCN
                        elif cls_model == 'self-attention':
                            cls_output = self._cls_self_attention(
                                prev_output,
                                batch_size,
                                max_seq_length,
                                label_size,
                                attention_mask=attention_mask,
                                cls_hidden_size=cls_hidden_size,
                                cls_num_attention_heads=\
                                    cls_num_attention_heads,
                                attention_probs_dropout_prob=\
                                    attention_probs_dropout_prob,
                                initializer_range=initializer_range,
                                dtype=tf.float32,
                                trainable=True)

                        # FCN
                        elif cls_model == 'fcn':
                            cls_output = self._cls_fcn(
                                prev_output,
                                label_size,
                                hidden_size=hidden_size,
                                initializer_range=initializer_range,
                                dtype=tf.float32,
                                trainable=True)

                        else:
                            raise ValueError(
                                'Invalid `cls_model = %s`. Pick one from '
                                '`self-attention-paper`, `self-attention` '
                                'and `fcn`' % cls_model)

                        # distill core
                        layer_cls_output = tf.nn.softmax(cls_output,
                                                         axis=-1,
                                                         name='cls_%d' %
                                                         layer_idx)
                        # Normalized entropy: sum(p * log p) / log(1 / N).
                        uncertainty = tf.reduce_sum(layer_cls_output *
                                                    tf.log(layer_cls_output),
                                                    axis=-1)
                        # `1.0 /` keeps this a true division even where `/`
                        # on two ints truncates (which would give log(0)).
                        uncertainty /= tf.log(1.0 / label_size)

                    # branching only in inference
                    if not is_training:

                        # last output
                        if layer_idx == keep_cls[-1]:
                            all_layer_outputs.append(prev_output)
                            all_layer_cls_outputs[layer_idx] = layer_cls_output
                            return (all_layer_outputs, all_layer_cls_outputs)

                        # Keep only the examples that are still uncertain
                        # (uncertainty >= speed) for the following layers.
                        mask = tf.less(uncertainty, speed)
                        unfinished_mask = \
                            (tf.ones_like(mask, dtype=dtype) -
                             tf.cast(mask, dtype=dtype))
                        prev_output = tf.boolean_mask(prev_output,
                                                      mask=unfinished_mask,
                                                      axis=0)
                        prev_mask = tf.boolean_mask(prev_mask,
                                                    mask=unfinished_mask,
                                                    axis=0)
                    all_layer_cls_outputs[layer_idx] = layer_cls_output

                    # new attention mask
                    input_shape = util.get_shape_list(prev_output)
                    batch_size = input_shape[0]
                    max_seq_length = input_shape[1]
                    attention_mask = \
                        self.create_attention_mask_from_input_mask(
                            prev_mask, batch_size, max_seq_length, dtype=dtype)

                # original stream
                with tf.variable_scope('attention'):
                    attention_heads = []
                    with tf.variable_scope('self'):
                        (attention_head, _) = self.attention_layer(
                            from_tensor=prev_output,
                            to_tensor=prev_output,
                            attention_mask=attention_mask,
                            num_attention_heads=num_attention_heads,
                            size_per_head=attention_head_size,
                            attention_probs_dropout_prob=\
                                attention_probs_dropout_prob,
                            initializer_range=initializer_range,
                            do_return_2d_tensor=False,
                            batch_size=batch_size,
                            from_max_seq_length=max_seq_length,
                            to_max_seq_length=max_seq_length,
                            dtype=dtype,
                            trainable=False)
                        attention_heads.append(attention_head)

                    attention_output = None
                    if len(attention_heads) == 1:
                        attention_output = attention_heads[0]
                    else:
                        attention_output = tf.concat(attention_heads, axis=-1)

                    with tf.variable_scope('output'):
                        attention_output = tf.layers.dense(
                            attention_output,
                            hidden_size,
                            kernel_initializer=util.create_initializer(
                                initializer_range),
                            trainable=False)
                        attention_output = util.dropout(
                            attention_output, hidden_dropout_prob)
                        attention_output = util.layer_norm(attention_output +
                                                           prev_output,
                                                           trainable=False)

                # The activation is only applied to the `intermediate`
                # hidden layer.
                with tf.variable_scope('intermediate'):
                    intermediate_output = tf.layers.dense(
                        attention_output,
                        intermediate_size,
                        activation=intermediate_act_fn,
                        kernel_initializer=util.create_initializer(
                            initializer_range),
                        trainable=False)

                # Down-project back to hidden_size then add the residual.
                with tf.variable_scope('output'):
                    layer_output = tf.layers.dense(
                        intermediate_output,
                        hidden_size,
                        kernel_initializer=util.create_initializer(
                            initializer_range),
                        trainable=False)
                    layer_output = util.dropout(layer_output,
                                                hidden_dropout_prob)
                    layer_output = util.layer_norm(layer_output +
                                                   attention_output,
                                                   trainable=False)

                prev_output = layer_output
                all_layer_outputs.append(layer_output)

        return (all_layer_outputs, all_layer_cls_outputs)
Beispiel #27
0
    def __init__(self,
                 bert_config,
                 is_training,
                 input_tensor,
                 sa_mask,
                 label_ids,
                 sample_weight=None,
                 scope='sanet',
                 alpha=0.5,
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        ''' Builds a span-extraction head with an extra masked attention layer.

        The attention output is blended with the input as
        `alpha * sa_output + (1 - alpha) * input_tensor`, projected to two
        logits per token and trained with the average of the start and end
        cross-entropy losses.

        Args:
          bert_config: Config object; `num_attention_heads`,
            `hidden_dropout_prob` and `initializer_range` are read from it.
          is_training: Not used in this constructor.
          input_tensor: Rank-3 tensor; its three dimensions are read as
            batch size, sequence length and hidden size.
          sa_mask: Attention mask, reshaped to
            `[batch_size, seq_length, seq_length]`.
          label_ids: Column 0 holds the start position, column 1 the end
            position.
          sample_weight: Optional per-example loss weight.
          scope: Enclosing variable scope.
          alpha: Blend factor between attention output and input.
          hidden_dropout_prob: Not used in this constructor.
          initializer_range: Range for the output weight initializer.
          trainable: Passed to the attention layer and output variables.
          **kwargs: Forwarded to the parent constructor.
        '''
        super().__init__(**kwargs)

        shape = util.get_shape_list(input_tensor)
        batch_size = shape[0]
        seq_length = shape[1]
        hidden_size = shape[2]
        sa_mask = tf.reshape(sa_mask, [batch_size, seq_length, seq_length])
        with tf.variable_scope(scope):
            with tf.variable_scope('sentence_attention'):
                # NOTE(review): the attention dropout is fed from
                # `bert_config.hidden_dropout_prob` - confirm this is
                # intended rather than the attention-specific rate.
                (sa_output, _) = self.attention_layer(
                    from_tensor=input_tensor,
                    to_tensor=input_tensor,
                    attention_mask=sa_mask,
                    num_attention_heads=bert_config.num_attention_heads,
                    size_per_head=\
                        hidden_size // bert_config.num_attention_heads,
                    attention_probs_dropout_prob=\
                        bert_config.hidden_dropout_prob,
                    initializer_range=bert_config.initializer_range,
                    do_return_2d_tensor=False,
                    batch_size=batch_size,
                    from_max_seq_length=seq_length,
                    to_max_seq_length=seq_length,
                    trainable=trainable)

            with tf.variable_scope('cls/mrc'):
                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[2, hidden_size],
                    initializer=util.create_initializer(initializer_range),
                    trainable=trainable)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[2],
                    initializer=tf.zeros_initializer(),
                    trainable=trainable)

            output_layer = alpha * sa_output + (1 - alpha) * input_tensor
            output_layer = tf.reshape(output_layer, [-1, hidden_size])
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            # Rearrange to [batch, 2, seq_length] so that the softmax runs
            # over sequence positions; row 0 is "start", row 1 is "end".
            logits = tf.reshape(logits, [-1, seq_length, 2])
            logits = tf.transpose(logits, [0, 2, 1])
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            self.probs['probs'] = probs
            self.preds['preds'] = tf.argmax(logits, axis=-1)

            start_one_hot_labels = tf.one_hot(label_ids[:, 0],
                                              depth=seq_length,
                                              dtype=tf.float32)
            end_one_hot_labels = tf.one_hot(label_ids[:, 1],
                                            depth=seq_length,
                                            dtype=tf.float32)
            start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
            end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
            per_example_loss = (
                -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs,
                                     axis=-1) - 0.5 *
                tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1))
            if sample_weight is not None:
                # Cast first so that non-float32 weights do not cause a
                # dtype mismatch against the float32 loss.
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses['losses'] = per_example_loss
Beispiel #28
0
    def __init__(self,
                 bert_config,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 sample_weight=None,
                 scope='bert',
                 dtype=tf.float32,
                 drop_pooler=False,
                 cls_model='self-attention',
                 label_size=2,
                 speed=0.1,
                 ignore_cls='0',
                 **kwargs):
        ''' Builds the FastBERT self-distillation graph.

        The backbone (embeddings, transformer layers, pooler and teacher
        classifier) is created with `trainable=False`. During training the
        child classifiers are fitted to the teacher output with a
        KL-divergence loss; at inference the outputs of all kept classifiers
        are concatenated into `self.probs['probs']`.

        Args:
          bert_config: Config object. A deep copy is made and both of its
            dropout probabilities are set to 0.0.
          is_training: Python bool selecting the distillation or the
            inference branch.
          input_ids: Rank-2 tensor of token ids.
          input_mask: Input mask passed on to the encoder.
          segment_ids: Token type ids.
          sample_weight: Optional per-example weight for the distillation
            loss.
          scope: Variable scope of the backbone.
          dtype: Dtype used for embeddings and masks.
          drop_pooler: If True, the first-token vector is used directly as
            pooled output, skipping the dense pooler layer.
          cls_model: Child classifier architecture, see
            `dynamic_transformer_model`.
          label_size: Number of classes.
          speed: Uncertainty threshold; must be truthy (non-zero).
          ignore_cls: Classifier indices to ignore, as a list or a
            comma-separated string. Index `num_hidden_layers` denotes the
            teacher classifier.
          **kwargs: Accepted but not used.

        Raises:
          ValueError: If `ignore_cls` is neither a list nor a string, or if
            `speed` is falsy.
        '''
        super(FastBERTCLSDistillor, self).__init__()

        if not ignore_cls:
            ignore_cls = []
        if isinstance(ignore_cls, str):
            ignore_cls = ignore_cls.replace(' ', '').split(',')
            ignore_cls = list(map(int, ignore_cls))
        elif isinstance(ignore_cls, list):
            ignore_cls = list(map(int, ignore_cls))
        else:
            raise ValueError(
                '`ignore_cls` should be a list of child-classifier ids or '
                'a string seperated with commas.')

        if not speed:
            raise ValueError(
                '`speed` should be a float number between `0` and `1`.')

        # Work on a copy so the caller's config keeps its dropout settings.
        bert_config = copy.deepcopy(bert_config)
        bert_config.hidden_dropout_prob = 0.0
        bert_config.attention_probs_dropout_prob = 0.0

        input_shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        max_seq_length = input_shape[1]

        with tf.variable_scope(scope):
            with tf.variable_scope('embeddings'):

                (self.embedding_output, self.embedding_table) = \
                    self.embedding_lookup(
                        input_ids=input_ids,
                        vocab_size=bert_config.vocab_size,
                        batch_size=batch_size,
                        max_seq_length=max_seq_length,
                        embedding_size=bert_config.hidden_size,
                        initializer_range=bert_config.initializer_range,
                        word_embedding_name='word_embeddings',
                        dtype=dtype,
                        trainable=False,
                        tilda_embeddings=None)

                # Add positional embeddings and token type embeddings
                # layer normalize and perform dropout.
                self.embedding_output = self.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=max_seq_length,
                    hidden_size=bert_config.hidden_size,
                    use_token_type=True,
                    segment_ids=segment_ids,
                    token_type_vocab_size=bert_config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=bert_config.initializer_range,
                    max_position_embeddings=\
                        bert_config.max_position_embeddings,
                    dropout_prob=bert_config.hidden_dropout_prob,
                    dtype=dtype,
                    trainable=False)

            with tf.variable_scope('encoder'):
                attention_mask = self.create_attention_mask_from_input_mask(
                    input_mask, batch_size, max_seq_length, dtype=dtype)

                # stacked transformers
                (self.all_encoder_layers, self.all_cls_layers) = \
                    self.dynamic_transformer_model(
                        is_training,
                        input_tensor=self.embedding_output,
                        input_mask=input_mask,
                        batch_size=batch_size,
                        max_seq_length=max_seq_length,
                        label_size=label_size,
                        attention_mask=attention_mask,
                        hidden_size=bert_config.hidden_size,
                        num_hidden_layers=bert_config.num_hidden_layers,
                        num_attention_heads=bert_config.num_attention_heads,
                        intermediate_size=bert_config.intermediate_size,
                        intermediate_act_fn=util.get_activation(
                            bert_config.hidden_act),
                        hidden_dropout_prob=bert_config.hidden_dropout_prob,
                        attention_probs_dropout_prob=\
                            bert_config.attention_probs_dropout_prob,
                        initializer_range=bert_config.initializer_range,
                        dtype=dtype,
                        cls_model=cls_model,
                        speed=speed,
                        ignore_cls=ignore_cls)

            self.sequence_output = self.all_encoder_layers[-1]
            with tf.variable_scope('pooler'):
                first_token_tensor = self.sequence_output[:, 0, :]

                # trick: ignore the fully connected layer
                if drop_pooler:
                    self.pooled_output = first_token_tensor
                else:
                    self.pooled_output = tf.layers.dense(
                        first_token_tensor,
                        bert_config.hidden_size,
                        activation=tf.tanh,
                        kernel_initializer=util.create_initializer(
                            bert_config.initializer_range),
                        trainable=False)

        # teacher classifier
        # The distillation loss below always reads the teacher `probs`, so
        # the teacher must be built whenever we train, even if its index is
        # listed in `ignore_cls`. This mirrors how the child classifiers are
        # always built during training.
        if is_training or bert_config.num_hidden_layers not in ignore_cls:
            with tf.variable_scope('cls/seq_relationship'):
                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[label_size, bert_config.hidden_size],
                    initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=False)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[label_size],
                    initializer=tf.zeros_initializer(),
                    trainable=False)

                logits = tf.matmul(self.pooled_output,
                                   output_weights,
                                   transpose_b=True)
                logits = tf.nn.bias_add(logits, output_bias)
                probs = tf.nn.softmax(logits, axis=-1)

        # distillation
        if is_training:
            losses = []
            for cls_probs in self.all_cls_layers.values():

                # KL-Divergence
                per_example_loss = tf.reduce_sum(
                    cls_probs * (tf.log(cls_probs) - tf.log(probs)), axis=-1)
                if sample_weight is not None:
                    per_example_loss *= tf.cast(sample_weight,
                                                dtype=tf.float32)
                loss = tf.reduce_mean(per_example_loss)
                losses.append(loss)

            distill_loss = tf.add_n(losses)
            self.total_loss = distill_loss
            self.losses['losses'] = distill_loss

        else:
            # The teacher output is appended last, under index
            # `num_hidden_layers`, unless it is ignored.
            if bert_config.num_hidden_layers not in ignore_cls:
                self.all_cls_layers[bert_config.num_hidden_layers] = probs
            self.probs['probs'] = tf.concat(list(self.all_cls_layers.values()),
                                            axis=0,
                                            name='probs')
Beispiel #29
0
def mlp(x, scope, n_state, *, hparams):
    """Two-step feed-forward transform of `x` under variable scope `scope`.

    The input is passed through `conv1d` named 'c_fc' with `n_state` as its
    size argument, then through `gelu`, and finally through `conv1d` named
    'c_proj' using the static size of the last dimension of `x`.

    Args:
        x: input tensor; its last dimension must be statically known because
            `.value` is read from it.
        scope: name of the variable scope wrapping both `conv1d` calls.
        n_state: size argument handed to the first `conv1d` call.
        hparams: keyword-only; accepted but not read by this function.

    Returns:
        The result of the second `conv1d` call.
    """
    with tf.variable_scope(scope):
        # Capture the incoming feature size so 'c_proj' can use it.
        input_width = x.shape[-1].value
        expanded = conv1d(x, 'c_fc', n_state)
        activated = gelu(expanded)
        return conv1d(activated, 'c_proj', input_width)
Beispiel #30
0
                def _build_forward(layer_input):
                    """Build one attention + feed-forward layer on `layer_input`.

                    Steps, in graph-construction order:
                      1. Multiply `layer_input` by `input_mask` (expanded on
                         the last axis and cast to float32).
                      2. Run an `Attention` layer with the masked input passed
                         as both of its first two positional arguments.
                      3. Dense projection to `hidden_size`, dropout, then
                         layer norm over (projection + masked input).
                      4. Dense layer to `intermediate_size` with
                         `intermediate_act_fn`.
                      5. Dense projection back to `hidden_size`, dropout, then
                         layer norm over (projection + step-3 output).

                    All sizes, dropout rates, `trainable`, `is_training`,
                    `input_mask` and `self` come from the enclosing scope.

                    Returns:
                        The layer-normalised output of step 5.
                    """

                    def _dense(inputs, units, activation=None):
                        # Every dense layer here shares the same initializer
                        # recipe and `trainable` flag; a fresh initializer is
                        # still created per call, as in the inline form.
                        return tf.layers.dense(
                            inputs,
                            units,
                            activation=activation,
                            kernel_initializer=util.create_initializer(
                                initializer_range),
                            trainable=trainable)

                    with tf.variable_scope('attention'):
                        with tf.variable_scope('self'):
                            # Apply the mask to the input itself; the
                            # attention call below receives no bias.
                            float_mask = tf.cast(
                                tf.expand_dims(input_mask, axis=-1),
                                dtype=tf.float32)
                            layer_input *= float_mask

                            # True when random features are configured,
                            # otherwise None.
                            projection_type = (
                                True if self.nb_random_features else None)
                            attention_layer = Attention(
                                hidden_size=hidden_size,
                                num_heads=num_attention_heads,
                                attention_dropout=attention_probs_dropout_prob,
                                kernel_transformation=(
                                    self.kernel_transformation),
                                numerical_stabilizer=0.001,
                                causal=False,
                                projection_matrix_type=projection_type,
                                nb_random_features=self.nb_random_features)
                            attention_layer.build(layer_input.shape)
                            attention_context = attention_layer.call(
                                layer_input,
                                layer_input,
                                bias=None,
                                training=is_training,
                                cache=None,
                                decode_loop_step=None)

                        with tf.variable_scope('output'):
                            attention_proj = _dense(attention_context,
                                                    hidden_size)
                            attention_proj = util.dropout(
                                attention_proj, hidden_dropout_prob)
                            # Residual connection uses the masked input.
                            attention_output = util.layer_norm(
                                attention_proj + layer_input,
                                trainable=trainable)

                    # Only the `intermediate` layer carries an activation.
                    with tf.variable_scope('intermediate'):
                        intermediate_output = _dense(
                            attention_output,
                            intermediate_size,
                            activation=intermediate_act_fn)

                    # Project back to hidden_size, then add the residual from
                    # the attention block before normalising.
                    with tf.variable_scope('output'):
                        projected = _dense(intermediate_output, hidden_size)
                        projected = util.dropout(projected,
                                                 hidden_dropout_prob)
                        layer_output = util.layer_norm(
                            projected + attention_output,
                            trainable=trainable)

                    return layer_output