Ejemplo n.º 1
0
 def _get_discriminator_output(self, inputs, sample_weight, discriminator,
                               labels):
     '''Build the per-token binary classification head of the discriminator.

     Args:
         inputs: object exposing `input_mask`; the mask is cast to float and
             used to weight the per-token losses.
         sample_weight: optional per-example weights (cast to float32), or
             None to leave the per-example losses unweighted.
         discriminator: encoder exposing `get_sequence_output()`.
         labels: per-token binary targets (cast to float32 for the loss).

     Returns:
         A `DiscOutput` namedtuple with fields `loss`, `per_example_loss`,
         `probs`, `preds` and `labels`.
     '''
     with tf.variable_scope('discriminator_predictions'):
         # Hidden projection followed by a single-unit output layer.
         projected = tf.layers.dense(
             discriminator.get_sequence_output(),
             units=self.bert_config.hidden_size,
             activation=util.get_activation(self.bert_config.hidden_act),
             kernel_initializer=util.create_initializer(
                 self.bert_config.initializer_range))
         token_logits = tf.layers.dense(projected, units=1)
         token_logits = tf.squeeze(token_logits, -1)

         token_mask = tf.cast(inputs.input_mask, tf.float32)
         float_labels = tf.cast(labels, tf.float32)
         token_losses = tf.nn.sigmoid_cross_entropy_with_logits(
             logits=token_logits, labels=float_labels)
         masked_losses = token_losses * token_mask

         # Masked mean over the last axis; 1e-6 guards against an all-zero
         # mask.
         summed_losses = tf.reduce_sum(masked_losses, axis=-1)
         mask_totals = 1e-6 + tf.reduce_sum(token_mask, axis=-1)
         per_example_loss = summed_losses / mask_totals
         if sample_weight is not None:
             per_example_loss = per_example_loss * tf.cast(
                 sample_weight, dtype=tf.float32)

         # NOTE(review): the scalar loss is a masked mean over all tokens and
         # does not include `sample_weight` -- confirm this is intended.
         total_loss = tf.reduce_sum(masked_losses)
         total_mask = 1e-6 + tf.reduce_sum(token_mask)
         loss = total_loss / total_mask

         probs = tf.nn.sigmoid(token_logits)
         preds = tf.cast(tf.greater(probs, 0.5), tf.int32)

         DiscOutput = collections.namedtuple(
             'DiscOutput', 'loss per_example_loss probs preds labels')
         return DiscOutput(loss, per_example_loss, probs, preds, labels)
Ejemplo n.º 2
0
                def _forward(dilated_ids, dilated_mask):
                    '''Run one decoding step and re-dilate the predicted ids.

                    Relies on names from the enclosing scope: `self`,
                    `bert_config`, `batch_size`, `dilated_seq_length`,
                    `max_seq_length`, `spad_id` and `tilda_embeddings`.

                    Args:
                        dilated_ids: ids fed to `self._bert_forward`.
                        dilated_mask: mask fed to `self._bert_forward`.

                    Returns:
                        A `(dilated_ids, dilated_mask)` pair, both reshaped
                        to `[batch_size, max_seq_length * 2]`, in which every
                        kept id / mask value is followed by a zero slot.
                    '''

                    logits = self._bert_forward(
                        bert_config,
                        dilated_ids,
                        dilated_mask,
                        batch_size,
                        dilated_seq_length,
                        tilda_embeddings=tilda_embeddings)
                    output_ids = tf.argmax(logits, axis=-1)
                    output_ids = tf.cast(output_ids, dtype=tf.int32)

                    # special padding (using `spad` token): append one `spad`
                    # id per predicted zero, so that dropping the zeros below
                    # leaves the same number of ids in every row
                    equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32)
                    equal_zero = tf.reduce_sum(equal_zero, axis=-1)
                    right_pad = spad_id * tf.sequence_mask(
                        equal_zero, dilated_seq_length, dtype=tf.int32)
                    paded = tf.concat([output_ids, right_pad], axis=-1)

                    # extract ids of length `max_seq_length`
                    # NOTE(review): the reshape below assumes each row keeps
                    # exactly `dilated_seq_length` positive ids, which needs
                    # `spad_id > 0` -- confirm against the caller.
                    flattened_padded = tf.reshape(paded, [-1])
                    # `tf.boolean_mask` is documented to take a boolean mask,
                    # so pass the comparison result directly instead of
                    # casting it to int32.
                    is_valid = tf.greater(flattened_padded, 0)
                    flattened_valid = tf.boolean_mask(flattened_padded,
                                                      is_valid)
                    valid = tf.reshape(flattened_valid,
                                       [batch_size, dilated_seq_length])
                    cutted_valid = valid[:, :max_seq_length]

                    # replace `spad` token with `pad`
                    non_spad_mask = tf.cast(tf.not_equal(
                        cutted_valid, spad_id),
                                            dtype=tf.int32)
                    output_ids = cutted_valid * non_spad_mask
                    output_length = tf.reduce_sum(non_spad_mask, axis=-1)

                    # dilate: pair every id / mask value with a zero and
                    # flatten the pairs back into one sequence
                    reshaped_ids = tf.reshape(output_ids,
                                              [batch_size, max_seq_length, 1])
                    reshaped_mask = tf.reshape(
                        tf.sequence_mask(output_length,
                                         max_seq_length,
                                         dtype=tf.int32),
                        [batch_size, max_seq_length, 1])
                    concat_ids = tf.concat(
                        [reshaped_ids,
                         tf.zeros_like(reshaped_ids)], axis=-1)
                    concat_mask = tf.concat([
                        reshaped_mask,
                        tf.zeros_like(reshaped_mask, dtype=tf.int32)
                    ],
                                            axis=-1)
                    dilated_ids = tf.reshape(concat_ids,
                                             [batch_size, max_seq_length * 2])
                    dilated_mask = tf.reshape(concat_mask,
                                              [batch_size, max_seq_length * 2])

                    return dilated_ids, dilated_mask
Ejemplo n.º 3
0
    def __init__(self,
                 is_training,
                 input_tensor,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        '''Build a softmax classification head and its cross-entropy loss.

        Stores results in `self.preds['preds']`, `self.probs['probs']`,
        `self.losses['losses']` and `self.total_loss`. The `preds`, `probs`
        and `losses` containers are presumably created by the parent
        constructor -- verify against the base class.

        Args:
            is_training: Python bool; dropout is only applied when True.
            input_tensor: float tensor whose last dimension is the hidden
                size; it is multiplied by a `[label_size, hidden_size]`
                weight matrix.
            label_ids: integer class ids, one-hot encoded with depth
                `label_size`.
            label_size: number of classes.
            sample_weight: optional per-example loss weights (cast to
                float32).
            scope: variable scope holding the output weights and bias.
            hidden_dropout_prob: dropout probability used while training.
            initializer_range: passed to `util.create_initializer` for the
                output weights.
            trainable: whether the created variables are trainable.
            **kwargs: forwarded to the parent constructor. If `tsa_thresh`
                is present it must be a float; examples whose normalized
                prediction entropy is not greater than it get zero loss.
        '''
        super().__init__(**kwargs)

        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            self.preds['preds'] = tf.argmax(logits, axis=-1)
            self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(
                label_ids, depth=label_size, dtype=tf.float32)
            per_example_loss = - tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(
                    sample_weight, dtype=tf.float32) * per_example_loss
            thresh = kwargs.get('tsa_thresh')
            if thresh is not None:
                assert isinstance(thresh, float), (
                    '`tsa_thresh` must be a float between 0 and 1.')
                # Use the log-softmax output rather than `tf.log(probs)`:
                # a probability that underflows to zero would otherwise
                # yield `0 * -inf = NaN`.
                uncertainty = tf.reduce_sum(
                    self.probs['probs'] * log_probs, axis=-1)
                # Dividing by log(1 / label_size) rescales the (negative)
                # sum so that a uniform distribution maps to 1.
                uncertainty /= tf.log(1 / label_size)
                per_example_loss = tf.cast(
                    tf.greater(uncertainty, thresh), dtype=tf.float32) * \
                    per_example_loss

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Ejemplo n.º 4
0
    def __init__(self,
                 is_training,
                 input_tensor,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 label_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        '''Build a multi-label (independent sigmoid) classification head.

        Stores results in `self.probs['probs']`, `self.preds['preds']`,
        `self.losses['losses']` and `self.total_loss`. The `preds`, `probs`
        and `losses` containers are presumably created by the parent
        constructor -- verify against the base class.

        Args:
            is_training: Python bool; dropout is only applied when True.
            input_tensor: float tensor whose last dimension is the hidden
                size; it is multiplied by a `[label_size, hidden_size]`
                weight matrix.
            label_ids: per-label binary targets, cast to float32 and
                compared against the `label_size` logits of each example.
            label_size: number of labels.
            sample_weight: optional per-example loss weights (cast to
                float32).
            label_weight: optional sequence of `label_size` per-label loss
                weights.
            scope: variable scope holding the output weights and bias.
            hidden_dropout_prob: dropout probability used while training.
            initializer_range: passed to `util.create_initializer` for the
                output weights.
            trainable: whether the created variables are trainable.
            **kwargs: forwarded to the parent constructor.
        '''
        super().__init__(**kwargs)

        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probs = tf.nn.sigmoid(logits, name='probs')

            self.probs['probs'] = probs
            # Boolean tensor: True where the label probability exceeds 0.5.
            self.preds['preds'] = tf.greater(probs, 0.5)

            per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits,
                labels=tf.cast(label_ids, dtype=tf.float32))
            if label_weight is not None:
                # Broadcast the per-label weights over the batch dimension.
                label_weight = tf.constant(label_weight, dtype=tf.float32)
                label_weight = tf.reshape(label_weight, [1, label_size])
                per_example_loss *= label_weight
            per_example_loss = tf.reduce_mean(per_example_loss, axis=-1)
            if sample_weight is not None:
                # Cast first so that integer or float64 weights do not
                # raise a dtype mismatch against the float32 loss.
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Ejemplo n.º 5
0
                def _forward(dilated_ids, dilated_mask):
                    '''Run one decoding step and re-dilate the predicted ids.

                    Relies on names from the enclosing scope: `self`,
                    `bert_config`, `batch_size`, `dilated_seq_length`,
                    `max_seq_length`, `spad_id` and `tilda_embeddings`.

                    Args:
                        dilated_ids: ids fed to `self._bert_forward`.
                        dilated_mask: mask fed to `self._bert_forward`.

                    Returns:
                        A `(dilated_ids, dilated_mask)` pair. The ids are
                        reshaped to `[batch_size, max_seq_length * 2]` with
                        a zero after every kept id; the mask is a sequence
                        mask of width `dilated_seq_length` covering as many
                        leading positions as there are non-`spad` ids.
                    '''

                    logits = self._bert_forward(
                        bert_config,
                        dilated_ids,
                        dilated_mask,
                        batch_size,
                        dilated_seq_length,
                        tilda_embeddings=tilda_embeddings)
                    output_ids = tf.argmax(logits, axis=-1)
                    output_ids = tf.cast(output_ids, dtype=tf.int32)

                    # Append one `spad` id per predicted zero, so that
                    # dropping the zeros below leaves the same number of ids
                    # in every row.
                    equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32)
                    equal_zero = tf.reduce_sum(equal_zero, axis=-1)
                    right_pad = spad_id * tf.sequence_mask(
                        equal_zero, dilated_seq_length, dtype=tf.int32)

                    # NOTE(review): the reshape below assumes each row keeps
                    # exactly `dilated_seq_length` positive ids, which needs
                    # `spad_id > 0` -- confirm against the caller.
                    paded = tf.concat([output_ids, right_pad], axis=-1)
                    flattened_padded = tf.reshape(paded, [-1])
                    # `tf.boolean_mask` is documented to take a boolean mask,
                    # so pass the comparison result directly instead of
                    # casting it to int32.
                    is_valid = tf.greater(flattened_padded, 0)
                    flattened_valid = tf.boolean_mask(flattened_padded,
                                                      is_valid)
                    valid = tf.reshape(flattened_valid,
                                       [batch_size, dilated_seq_length])
                    cutted_valid = valid[:, :max_seq_length]

                    # Turn the temporary `spad` ids back into zeros.
                    nonpad_mask = tf.cast(tf.not_equal(cutted_valid, spad_id),
                                          dtype=tf.int32)
                    output_ids = cutted_valid * nonpad_mask

                    # Dilate: pair every id with a zero and flatten the
                    # pairs back into one sequence.
                    reshaped = tf.reshape(output_ids,
                                          [batch_size, max_seq_length, 1])
                    concatenated = tf.concat(
                        [reshaped, tf.zeros_like(reshaped)], axis=-1)
                    dilated_ids = tf.reshape(concatenated,
                                             [batch_size, max_seq_length * 2])

                    input_mask = tf.reduce_sum(nonpad_mask, axis=-1)
                    dilated_mask = tf.sequence_mask(input_mask,
                                                    dilated_seq_length,
                                                    dtype=tf.int32)

                    return dilated_ids, dilated_mask
Ejemplo n.º 6
0
    def create_attention_mask_from_input_mask(self,
                                              input_mask,
                                              batch_size,
                                              max_seq_length,
                                              dtype=tf.float32):
        '''Build a 3-D attention mask that always allows self-attention.

        Args:
            input_mask: tensor reshapeable to `[batch_size, max_seq_length]`;
                non-zero entries mark positions that may be attended to.
            batch_size: size of the first dimension.
            max_seq_length: sequence length.
            dtype: dtype of the returned mask.

        Returns:
            A `[batch_size, max_seq_length, max_seq_length]` tensor of
            `dtype` holding 1 at `[b, i, j]` when `input_mask[b, j]` is
            positive or `i == j`, and 0 otherwise.
        '''
        to_mask = tf.cast(tf.reshape(input_mask,
                                     [batch_size, 1, max_seq_length]),
                          dtype=dtype)
        # Broadcasting against a column of ones repeats the key-side mask
        # for every query position.
        broadcast_ones = tf.ones(shape=[batch_size, max_seq_length, 1],
                                 dtype=dtype)
        mask = broadcast_ones * to_mask

        # Build the identity in the requested dtype; the float32 default
        # would make the addition below fail for any other `dtype`.
        broadcast_eye = tf.tile(
            tf.reshape(tf.eye(max_seq_length, dtype=dtype),
                       [1, max_seq_length, max_seq_length]),
            [batch_size, 1, 1])
        mask += broadcast_eye
        # Entries that are both unmasked and on the diagonal sum to 2, so
        # clamp everything positive back to 1.
        mask = tf.cast(tf.greater(mask, 0), dtype)
        return mask
Ejemplo n.º 7
0
    def __init__(self,
                 bert_config,
                 is_training,
                 sketchy_encoder,
                 intensive_encoder,
                 query_mask,
                 label_ids,
                 has_answer,
                 sample_weight=None,
                 scope='retro_reader',
                 matching_mechanism='cross-attention',
                 beta_1=0.5,
                 beta_2=0.5,
                 threshold=1.0,
                 trainable=True,
                 **kwargs):
        '''Build the sketchy / intensive reading heads and the rear verifier.

        Stores results in `self.losses` (`sketchy_losses`,
        `intensive_losses`), `self.probs` (`mrc_probs`, `verifier_probs`),
        `self.preds` (`mrc_preds`, `verifier_preds`) and `self.total_loss`.
        These containers are presumably created by the parent constructor
        -- verify against the base class.

        Args:
            bert_config: config providing `initializer_range`,
                `hidden_dropout_prob` and `num_attention_heads`.
            is_training: Python bool; dropout is only applied when True.
            sketchy_encoder: encoder exposing `get_pooled_output()`.
            intensive_encoder: encoder exposing `get_sequence_output()`.
            query_mask: per-token mask; positions with 0 are zeroed out of
                the sequence output to form the question representation.
            label_ids: tensor whose columns 0 and 1 hold the start and end
                positions of the answer span.
            has_answer: binary answerability labels (one-hot with depth 2).
            sample_weight: optional per-example loss weights (cast to
                float32).
            scope: outermost variable scope.
            matching_mechanism: `'cross-attention'` or
                `'matching-attention'`.
            beta_1: weight of the intensive score in the verifier.
            beta_2: weight of the sketchy score in the verifier.
            threshold: the verifier predicts 1 when its score exceeds this.
            trainable: whether the created variables are trainable.
            **kwargs: forwarded to the parent constructor.

        Raises:
            ValueError: if `matching_mechanism` is not a supported value.
        '''
        super().__init__(**kwargs)

        # verifier
        with tf.variable_scope(scope):

            # sketchy reading module
            with tf.variable_scope('sketchy/prediction'):
                sketchy_output = sketchy_encoder.get_pooled_output()
                hidden_size = sketchy_output.shape.as_list()[-1]

                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[2, hidden_size],
                    initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=trainable)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[2],
                    initializer=tf.zeros_initializer(),
                    trainable=trainable)

                output_layer = util.dropout(
                    sketchy_output, bert_config.hidden_dropout_prob \
                        if is_training else 0.0)
                logits = tf.matmul(
                    output_layer, output_weights, transpose_b=True)
                logits = tf.nn.bias_add(logits, output_bias)

                log_probs = tf.nn.log_softmax(logits, axis=-1)
                one_hot_labels = tf.one_hot(
                    has_answer, depth=2, dtype=tf.float32)
                per_example_loss = - tf.reduce_sum(
                    one_hot_labels * log_probs, axis=-1)
                if sample_weight is not None:
                    per_example_loss = tf.cast(
                        sample_weight, dtype=tf.float32) * per_example_loss

                self.losses['sketchy_losses'] = per_example_loss
                sketchy_loss = tf.reduce_mean(per_example_loss)

                # External score: margin between the two class logits.
                score_ext = logits[:, 1] - logits[:, 0]

            # intensive reading module
            with tf.variable_scope('intensive'):
                H = intensive_encoder.get_sequence_output()
                # Zero out every position that `query_mask` does not select.
                H_Q = H * tf.cast(
                    tf.expand_dims(query_mask, axis=-1), tf.float32)
                (batch_size, max_seq_length, hidden_size) = \
                    util.get_shape_list(H)

                # cross-attention
                if matching_mechanism == 'cross-attention':
                    with tf.variable_scope('cross_attention'):
                        attention_mask = \
                            self.create_attention_mask_from_input_mask(
                                query_mask, batch_size, max_seq_length)
                        # Gate the attention dropout on `is_training`, like
                        # every other dropout in this constructor.
                        # NOTE(review): assumes `attention_layer` applies
                        # the given probability unconditionally -- confirm.
                        (H_prime, _) = self.attention_layer(
                            from_tensor=H,
                            to_tensor=H_Q,
                            attention_mask=attention_mask,
                            num_attention_heads=\
                                bert_config.num_attention_heads,
                            size_per_head=\
                                hidden_size // bert_config.num_attention_heads,
                            attention_probs_dropout_prob=(
                                bert_config.hidden_dropout_prob
                                if is_training else 0.0),
                            initializer_range=bert_config.initializer_range,
                            do_return_2d_tensor=False,
                            batch_size=batch_size,
                            from_max_seq_length=max_seq_length,
                            to_max_seq_length=max_seq_length,
                            trainable=trainable)

                # matching-attention
                elif matching_mechanism == 'matching-attention':
                    with tf.variable_scope('matching_attention'):
                        output_weights = tf.get_variable(
                            'output_weights',
                            shape=[hidden_size, hidden_size],
                            initializer=util.create_initializer(
                                bert_config.initializer_range),
                            trainable=trainable)
                        output_bias = tf.get_variable(
                            'output_bias',
                            shape=[hidden_size],
                            initializer=tf.zeros_initializer(),
                            trainable=trainable)
                        trans = tf.matmul(
                            H_Q, tf.tile(
                                tf.expand_dims(output_weights, axis=0),
                                [batch_size, 1, 1]),
                            transpose_b=True)
                        trans = tf.nn.bias_add(trans, output_bias)
                        M = tf.nn.softmax(
                            tf.matmul(H, trans, transpose_b=True), axis=-1)
                        H_prime = tf.matmul(M, H_Q)

                else:
                    # Fail with a clear message instead of a NameError on
                    # the undefined `H_prime` below.
                    raise ValueError(
                        'Invalid value of `matching_mechanism`: %s. Pick one '
                        'from `cross-attention` and `matching-attention`.'
                        % matching_mechanism)

                with tf.variable_scope('prediction'):
                    output_weights = tf.get_variable(
                        'output_weights',
                        shape=[2, hidden_size],
                        initializer=util.create_initializer(
                            bert_config.initializer_range),
                        trainable=trainable)
                    output_bias = tf.get_variable(
                        'output_bias',
                        shape=[2],
                        initializer=tf.zeros_initializer(),
                        trainable=trainable)

                    output_layer = util.dropout(
                        H_prime, bert_config.hidden_dropout_prob \
                            if is_training else 0.0)
                    output_layer = tf.reshape(
                        output_layer,
                        [batch_size * max_seq_length, hidden_size])
                    logits = tf.matmul(
                        output_layer, output_weights, transpose_b=True)
                    logits = tf.nn.bias_add(logits, output_bias)
                    logits = tf.reshape(
                        logits, [batch_size, max_seq_length, 2])
                    # [batch, 2, seq]: row 0 holds start logits, row 1 end
                    # logits, so the softmax runs over sequence positions.
                    logits = tf.transpose(logits, [0, 2, 1])
                    probs = tf.nn.softmax(logits, axis=-1, name='probs')

                    self.probs['mrc_probs'] = probs
                    self.preds['mrc_preds'] = tf.argmax(logits, axis=-1)

                    start_one_hot_labels = tf.one_hot(
                        label_ids[:, 0], depth=max_seq_length,
                        dtype=tf.float32)
                    end_one_hot_labels = tf.one_hot(
                        label_ids[:, 1], depth=max_seq_length,
                        dtype=tf.float32)
                    start_log_probs = tf.nn.log_softmax(
                        logits[:, 0, :], axis=-1)
                    end_log_probs = tf.nn.log_softmax(
                        logits[:, 1, :], axis=-1)
                    per_example_loss = (
                        - 0.5 * tf.reduce_sum(
                            start_one_hot_labels * start_log_probs, axis=-1)
                        - 0.5 * tf.reduce_sum(
                            end_one_hot_labels * end_log_probs, axis=-1))
                    if sample_weight is not None:
                        # Cast as in the sketchy branch so non-float32
                        # weights do not raise a dtype mismatch.
                        per_example_loss *= tf.cast(
                            sample_weight, dtype=tf.float32)

                    intensive_loss = tf.reduce_mean(per_example_loss)
                    self.losses['intensive_losses'] = per_example_loss

                    # Largest combined start + end probability over
                    # positions 1.., against the combined probability at
                    # position 0.
                    score_has = tf.norm(
                        probs[:, 0, 1:] + probs[:, 1, 1:], np.inf, axis=-1)
                    score_null = probs[:, 0, 0] + probs[:, 1, 0]
                    score_diff = score_has - score_null

            # rear verification
            v = beta_1 * score_diff + beta_2 * score_ext
            self.preds['verifier_preds'] = \
                tf.cast(tf.greater(v, threshold), tf.int32)
            self.probs['verifier_probs'] = v

            self.total_loss = sketchy_loss + intensive_loss
Ejemplo n.º 8
0
    def __init__(self,
                 is_training,
                 input_tensor,
                 is_supervised,
                 is_expanded,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 global_step=None,
                 num_train_steps=None,
                 uda_softmax_temp=-1,
                 uda_confidence_thresh=-1,
                 tsa_schedule='linear',
                 **kwargs):
        '''Build a classifier head with supervised and consistency losses.

        The supervised loss is a cross-entropy over the labelled original
        examples, optionally masked by a training-signal-annealing
        threshold. The unsupervised loss is `kl_for_log_probs` between the
        predictions on unlabelled originals and on the expanded examples.
        Results go to `self.preds['preds']`, `self.losses['supervised']`,
        `self.losses['unsupervised']` and `self.total_loss`.

        Args:
            is_training: Python bool; enables dropout and the TSA mask.
            input_tensor: float tensor whose last dimension is the hidden
                size.
            is_supervised: mask (cast to float32) selecting labelled
                examples among the non-expanded ones.
            is_expanded: mask (cast to float32) selecting expanded examples
                in the batch.
            label_ids: class ids, filtered by `is_supervised`.
            label_size: number of classes.
            sample_weight: optional loss weights, filtered by
                `is_supervised` or its complement.
            scope: variable scope holding the output weights and bias.
            hidden_dropout_prob: dropout probability used while training.
            initializer_range: passed to `util.create_initializer`.
            trainable: whether the created variables are trainable.
            global_step: forwarded to `get_tsa_threshold`.
            num_train_steps: forwarded to `get_tsa_threshold`.
            uda_softmax_temp: temperature dividing the target logits; -1
                disables it.
            uda_confidence_thresh: minimum top probability for an example
                to contribute to the unsupervised loss; -1 disables it.
            tsa_schedule: schedule forwarded to `get_tsa_threshold`; a
                falsy value disables the TSA mask.
            **kwargs: forwarded to the parent constructor.
        '''
        super().__init__(**kwargs)

        is_supervised = tf.cast(is_supervised, tf.float32)
        is_expanded = tf.cast(is_expanded, tf.float32)

        hidden_size = input_tensor.shape.as_list()[-1]
        dropout_rate = hidden_dropout_prob if is_training else 0.0
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            dropped = util.dropout(input_tensor, dropout_rate)
            logits = tf.nn.bias_add(
                tf.matmul(dropped, output_weights, transpose_b=True),
                output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            with tf.variable_scope('sup_loss'):

                # Drop the expanded rows first, then keep the labelled ones.
                original_log_probs = tf.boolean_mask(
                    log_probs, 1.0 - is_expanded, axis=0)
                labeled_log_probs = tf.boolean_mask(
                    original_log_probs, is_supervised, axis=0)
                labeled_ids = tf.boolean_mask(
                    label_ids, is_supervised, axis=0)

                self.preds['preds'] = tf.argmax(original_log_probs, axis=-1)

                targets = tf.one_hot(
                    labeled_ids, depth=label_size, dtype=tf.float32)
                sup_losses = -tf.reduce_sum(
                    targets * labeled_log_probs, axis=-1)

                keep = tf.ones_like(sup_losses, dtype=tf.float32)
                gold_probs = tf.reduce_sum(
                    targets * tf.exp(labeled_log_probs), axis=-1)

                if is_training and tsa_schedule:
                    # Mask out examples whose gold-label probability is
                    # already above the annealed threshold.
                    threshold = get_tsa_threshold(
                        tsa_schedule,
                        global_step,
                        num_train_steps,
                        1.0 / label_size,
                        end=1)
                    too_confident = tf.cast(
                        tf.greater(gold_probs, threshold), tf.float32)
                    keep = keep * (1 - too_confident)

                keep = tf.stop_gradient(keep)
                sup_losses = sup_losses * keep
                if sample_weight is not None:
                    labeled_weight = tf.boolean_mask(
                        sample_weight, is_supervised, axis=0)
                    sup_losses *= tf.cast(labeled_weight, dtype=tf.float32)

                # Average over the kept examples only (at least one).
                kept_total = tf.reduce_sum(sup_losses)
                kept_count = tf.maximum(tf.reduce_sum(keep), 1)
                sup_loss = kept_total / kept_count

                self.losses['supervised'] = sup_losses

            with tf.variable_scope('unsup_loss'):

                # Unlabelled originals versus the expanded examples.
                unlabeled_log_probs = tf.boolean_mask(
                    original_log_probs, 1.0 - is_supervised, axis=0)
                expanded_log_probs = tf.boolean_mask(
                    log_probs, is_expanded, axis=0)
                original_logits = tf.boolean_mask(
                    logits, 1.0 - is_expanded, axis=0)
                unlabeled_logits = tf.boolean_mask(
                    original_logits, 1.0 - is_supervised, axis=0)

                confidence_mask = 1
                if uda_softmax_temp == -1:
                    target_log_probs = tf.stop_gradient(unlabeled_log_probs)
                else:
                    sharpened = tf.nn.log_softmax(
                        unlabeled_logits / uda_softmax_temp, axis=-1)
                    target_log_probs = tf.stop_gradient(sharpened)

                if uda_confidence_thresh != -1:
                    top_prob = tf.reduce_max(
                        tf.exp(unlabeled_log_probs), axis=-1)
                    confident = tf.cast(
                        tf.greater(top_prob, uda_confidence_thresh),
                        tf.float32)
                    confidence_mask = tf.stop_gradient(confident)

                unsup_losses = kl_for_log_probs(
                    target_log_probs, expanded_log_probs) * confidence_mask
                if sample_weight is not None:
                    unlabeled_weight = tf.boolean_mask(
                        sample_weight, 1.0 - is_supervised, axis=0)
                    unsup_losses *= tf.cast(
                        unlabeled_weight, dtype=tf.float32)
                unsup_loss = tf.reduce_mean(unsup_losses)

                self.losses['unsupervised'] = unsup_losses

            self.total_loss = sup_loss + unsup_loss
Ejemplo n.º 9
0
    def __init__(self,
                 is_training,
                 input_tensor,
                 n_wide_features,
                 wide_features,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        '''Build a wide-and-deep softmax classification head and its loss.

        Embedded wide features are attended over using the deep
        representation as query; the attention output is added to the deep
        representation and layer-normalized before classification. Results
        go to `self.preds['preds']`, `self.probs['probs']`,
        `self.losses['losses']` and `self.total_loss`. These containers are
        presumably created by the parent constructor -- verify against the
        base class.

        Args:
            is_training: Python bool; dropout is only applied when True.
            input_tensor: deep representation, `[B, H]` per the shape
                comments below.
            n_wide_features: number of valid wide features per example,
                used to build the feature mask.
            wide_features: integer feature ids, `[B, N]`, looked up in the
                feature embedding table.
            label_ids: integer class ids, one-hot encoded with depth
                `label_size`.
            label_size: number of classes.
            sample_weight: optional per-example loss weights (cast to
                float32).
            scope: variable scope holding the output weights and bias.
            hidden_dropout_prob: dropout probability used while training.
            initializer_range: passed to `util.create_initializer`.
            trainable: whether the created variables are trainable.
            **kwargs: forwarded to the parent constructor. If `tsa_thresh`
                is present it must be a float; examples whose normalized
                prediction entropy is not greater than it get zero loss.
        '''
        super().__init__(**kwargs)

        hidden_size = input_tensor.shape.as_list()[-1]
        feature_size = wide_features.shape.as_list()[-1]
        with tf.variable_scope('wide'):
            feature_embeddings = tf.get_variable(
                name='feature_embeddings',
                shape=[feature_size + 1, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            wide_output = tf.gather(feature_embeddings,
                                    wide_features)  # [B, N, H]

        with tf.variable_scope('wide_and_deep'):
            deep_output = tf.expand_dims(input_tensor, -1)  # [B, H, 1]
            attention_scores = tf.matmul(wide_output, deep_output)  # [B, N, 1]
            attention_scores = tf.transpose(attention_scores,
                                            [0, 2, 1])  # [B, 1, N]
            attention_scores = tf.multiply(attention_scores,
                                           1.0 / math.sqrt(hidden_size))
            feature_mask = tf.cast(
                tf.sequence_mask(n_wide_features, feature_size),
                tf.float32)  # [B, N]
            feature_mask = tf.expand_dims(feature_mask, 1)  # [B, 1, N]
            # A large negative bias pushes masked features to ~0 weight
            # after the softmax.
            attention_scores += (1.0 - feature_mask) * -10000.0
            attention_matrix = tf.nn.softmax(attention_scores, axis=-1)
            attention_output = tf.matmul(attention_matrix,
                                         wide_output)  # [B, 1, H]
            attention_output = attention_output[:, 0, :]  # [B, H]
            # attention_output = util.dropout(
            #     attention_output, hidden_dropout_prob)
            input_tensor = util.layer_norm(attention_output + input_tensor,
                                           trainable=trainable)

        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            self.preds['preds'] = tf.argmax(logits, axis=-1)
            self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=label_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(sample_weight,
                                           dtype=tf.float32) * per_example_loss
            thresh = kwargs.get('tsa_thresh')
            if thresh is not None:
                assert isinstance(
                    thresh,
                    float), ('`tsa_thresh` must be a float between 0 and 1.')
                # Use the log-softmax output rather than `tf.log(probs)`:
                # a probability that underflows to zero would otherwise
                # yield `0 * -inf = NaN`.
                uncertainty = tf.reduce_sum(self.probs['probs'] * log_probs,
                                            axis=-1)
                # Dividing by log(1 / label_size) rescales the (negative)
                # sum so that a uniform distribution maps to 1.
                uncertainty /= tf.log(1 / label_size)
                per_example_loss = tf.cast(
                    tf.greater(uncertainty, thresh), dtype=tf.float32) * \
                    per_example_loss

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Ejemplo n.º 10
0
    def _lm_forward(self,
                    is_training,
                    input_tensor,
                    input_mask,
                    label_ids,
                    bert_config,
                    batch_size,
                    max_seq_length,
                    prob,
                    scope,
                    name,
                    sample_weight=None,
                    hidden_dropout_prob=0.1,
                    initializer_range=0.02):
        '''Builds a token-level verifier head and a vocabulary prediction head.

        Two sub-graphs are created under `scope`:

        * `verifier`: a 2-way classifier per token whose target is
          `label_ids > 0`.
        * `prediction`: a two-layer feed-forward block whose output is
          projected onto `self.embedding_table` to obtain vocabulary logits.
          Its loss is only counted on tokens that are both unmasked and
          predicted positive by the verifier.

        Nothing is returned. The method updates `self.total_loss` (only when
        `prob != 0`), `self.losses[name + '_loss']` and
        `self.preds[name + '_preds']`.

        Args:
          is_training: not used by this method.
          input_tensor: token representations; assumed to be
            [batch_size, max_seq_length, hidden_size] because the projected
            output is reshaped to [batch_size * max_seq_length, hidden_size].
          input_mask: per-token mask, cast to float32 here.
          label_ids: per-token target ids; values > 0 are verifier positives.
          bert_config: supplies `initializer_range`, `hidden_size` and
            `vocab_size`.
          batch_size: batch dimension used for reshaping.
          max_seq_length: sequence dimension used for reshaping.
          prob: when equal to 0 the two losses are not added to
            `self.total_loss`.
          scope: variable scope wrapping both heads.
          name: prefix of the keys written to `self.losses` / `self.preds`.
          sample_weight: optional per-example weight; assumed to hold one
            value per example (it is flattened to rank 1 before use).
          hidden_dropout_prob: not used by this method.
          initializer_range: not used by this method; the dense kernels are
            initialized from `bert_config.initializer_range`.
        '''

        with tf.variable_scope(scope):

            with tf.variable_scope('verifier'):
                logits = tf.layers.dense(
                    input_tensor,
                    2,
                    kernel_initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=True)
                # Binary target: 1 where a label id is present (> 0), else 0.
                verifier_label_ids = tf.cast(tf.greater(label_ids, 0),
                                             tf.int32)

                # loss
                log_probs = tf.nn.log_softmax(logits, axis=-1)
                one_hot_labels = tf.one_hot(verifier_label_ids, depth=2)
                per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                                axis=-1)

                input_mask = tf.cast(input_mask, tf.float32)
                # Average over unmasked tokens. The epsilon matches the
                # prediction branch below and avoids 0 / 0 = NaN for a row
                # whose mask is entirely zero.
                per_token_loss *= input_mask / (
                    tf.reduce_sum(input_mask, keepdims=True, axis=-1) + 1e-6)
                per_example_loss = tf.reduce_sum(per_token_loss, axis=-1)
                if sample_weight is not None:
                    # `per_example_loss` is rank 1 here. Expanding the weight
                    # to [B, 1] would broadcast the product to [B, B], so the
                    # weight is flattened to rank 1 instead.
                    per_example_loss *= tf.reshape(
                        tf.cast(sample_weight, tf.float32), [-1])

                if prob != 0:
                    self.total_loss += tf.reduce_mean(per_example_loss)
                verifier_loss = per_example_loss
                verifier_preds = tf.argmax(logits, axis=-1)

            with tf.variable_scope('prediction'):

                with tf.variable_scope('intermediate'):
                    logits = tf.layers.dense(
                        input_tensor,
                        bert_config.hidden_size * 4,
                        kernel_initializer=util.create_initializer(
                            bert_config.initializer_range),
                        activation=util.gelu,
                        trainable=True)
                with tf.variable_scope('output'):
                    logits = tf.layers.dense(
                        logits,
                        bert_config.hidden_size,
                        kernel_initializer=util.create_initializer(
                            bert_config.initializer_range),
                        trainable=True)

                # Project hidden states onto the embedding table to obtain
                # one logit per vocabulary entry.
                flattened = tf.reshape(
                    logits,
                    [batch_size * max_seq_length, bert_config.hidden_size])
                logits = tf.matmul(flattened,
                                   self.embedding_table,
                                   transpose_b=True)
                logits = tf.reshape(
                    logits, [-1, max_seq_length, bert_config.vocab_size])

                # loss
                log_probs = tf.nn.log_softmax(logits, axis=-1)
                one_hot_labels = tf.one_hot(label_ids,
                                            depth=bert_config.vocab_size)
                per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                                axis=-1)

                # Only tokens the verifier predicted as positive contribute
                # to the prediction loss.
                input_mask *= tf.cast(verifier_preds, tf.float32)
                per_token_loss *= input_mask / (
                    tf.reduce_sum(input_mask, keepdims=True, axis=-1) + 1e-6)
                per_example_loss = tf.reduce_sum(per_token_loss, axis=-1)
                if sample_weight is not None:
                    # Same rank-1 weighting as in the verifier branch.
                    per_example_loss *= tf.reshape(
                        tf.cast(sample_weight, tf.float32), [-1])

                if prob != 0:
                    self.total_loss += tf.reduce_mean(per_example_loss)
                # NOTE(review): the verifier loss, not the prediction loss,
                # is what gets exposed under this key - confirm intended.
                self.losses[name + '_loss'] = verifier_loss
                # Predicted ids are zeroed wherever the verifier predicts 0.
                self.preds[name + '_preds'] = \
                    tf.argmax(logits, axis=-1) * verifier_preds
Ejemplo n.º 11
0
    def __init__(self,
                 bert_config,
                 is_training,
                 input_tensor,
                 input_mask,
                 sem_features,
                 label_ids,
                 max_seq_length,
                 feature_size,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        '''Classification head that fuses embedded `sem_features`.

        The feature ids are embedded, passed through one self-attention
        layer, and the first position of the result is added to
        `input_tensor` followed by layer normalization. A linear softmax
        classifier on top produces `self.preds['preds']`,
        `self.probs['probs']`, `self.losses['losses']` and `self.total_loss`.

        Args:
          bert_config: supplies `num_attention_heads`.
          is_training: enables dropout when truthy.
          input_tensor: representation to classify; its first and last
            dimensions are read as batch size and hidden size.
          input_mask: mask used to build the attention mask.
          sem_features: integer ids looked up in the feature embedding table.
          label_ids: target class ids.
          max_seq_length: sequence length passed to the attention layer.
          feature_size: number of distinct features; three extra rows are
            reserved for [PAD], [CLS] and [SEP].
          label_size: number of classes.
          sample_weight: optional multiplier applied to the per-example loss.
          scope: variable scope of the classifier weights.
          hidden_dropout_prob: dropout rate used for both the attention
            probabilities and the classifier input when training.
          initializer_range: passed to `util.create_initializer`.
          trainable: whether the created variables are trainable.
          **kwargs: forwarded to the parent constructor; an optional float
            `tsa_thresh` additionally zeroes the loss of examples whose
            normalized prediction entropy is not above the threshold.
        '''
        super().__init__(**kwargs)

        shape = util.get_shape_list(input_tensor)
        batch_size, hidden_size = shape[0], shape[-1]
        # Dropout is only active in training mode.
        dropout_rate = hidden_dropout_prob if is_training else 0.0

        with tf.variable_scope('sem'):
            # Three extra rows: [PAD], [CLS], [SEP].
            feature_embeddings = tf.get_variable(
                name='feature_embeddings',
                shape=[feature_size + 3, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            sem_output = tf.gather(feature_embeddings,
                                   sem_features)  # [B, N, H]

            with tf.variable_scope('self'):
                sem_mask = BERTEncoder.create_attention_mask_from_input_mask(
                    input_mask, batch_size, max_seq_length)
                head_width = hidden_size // bert_config.num_attention_heads
                attended, _ = BERTEncoder.attention_layer(
                    from_tensor=sem_output,
                    to_tensor=sem_output,
                    attention_mask=sem_mask,
                    num_attention_heads=bert_config.num_attention_heads,
                    size_per_head=head_width,
                    attention_probs_dropout_prob=dropout_rate,
                    initializer_range=initializer_range,
                    do_return_2d_tensor=False,
                    batch_size=batch_size,
                    from_max_seq_length=max_seq_length,
                    to_max_seq_length=max_seq_length,
                    trainable=trainable)

            # Keep only the first position of the attended features and add
            # it residually to the incoming representation.
            first_position = attended[:, 0, :]  # [B, H]
            fused = util.layer_norm(first_position + input_tensor,
                                    trainable=trainable)

        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            # Linear classifier: dropout -> x W^T -> + b.
            dropped = util.dropout(fused, dropout_rate)
            projected = tf.matmul(dropped, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(projected, output_bias)

            self.preds['preds'] = tf.argmax(logits, axis=-1)
            self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

            # Cross entropy against the one-hot targets.
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            targets = tf.one_hot(label_ids,
                                 depth=label_size,
                                 dtype=tf.float32)
            target_log_prob = tf.reduce_sum(targets * log_probs, axis=-1)
            per_example_loss = -target_log_prob

            if sample_weight is not None:
                weights = tf.cast(sample_weight, dtype=tf.float32)
                per_example_loss = weights * per_example_loss

            thresh = kwargs.get('tsa_thresh')
            if thresh is not None:
                assert isinstance(
                    thresh,
                    float), ('`tsa_thresh` must be a float between 0 and 1.')
                probs = self.probs['probs']
                # sum(p * log p) / log(1 / K) equals the entropy divided by
                # log K, i.e. the entropy normalized by its maximum.
                neg_entropy = tf.reduce_sum(probs * tf.log(probs), axis=-1)
                uncertainty = neg_entropy / tf.log(1 / label_size)
                # Only examples more uncertain than the threshold keep
                # their loss; the others are zeroed.
                keep = tf.cast(tf.greater(uncertainty, thresh),
                               dtype=tf.float32)
                per_example_loss = keep * per_example_loss

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)