def __init__(self, is_training, input_tensor, input_mask, label_ids, label_size=2, sample_weight=None, scope='cls/sequence', name='', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    batch_size = tf.shape(input_tensor)[0]
    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]

    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, label_size])

        self.preds[name] = tf.argmax(logits, axis=-1)
        self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32)
        per_token_losses = -tf.reduce_mean(one_hot_labels * log_probs, axis=-1)

        input_mask = tf.concat([
            tf.zeros((batch_size, 1), dtype=tf.float32),
            tf.cast(input_mask[:, 2:], dtype=tf.float32),
            tf.zeros((batch_size, 1), dtype=tf.float32)], axis=-1)
        per_token_losses *= input_mask

        per_example_loss = tf.reduce_mean(per_token_losses, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

        self.losses[name] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def norm(x, scope, *, axis=-1, epsilon=1e-5):
    '''Normalize to mean = 0, std = 1, then do a diagonal affine transform.'''
    with tf.variable_scope(scope):
        n_state = x.shape[-1].value
        g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0))
        u = tf.reduce_mean(x, axis=axis, keepdims=True)
        s = tf.reduce_mean(tf.square(x - u), axis=axis, keepdims=True)
        x = (x - u) * tf.rsqrt(s + epsilon)
        x = x * g + b
        return x
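A minimal usage sketch of `norm` in TF1 graph mode; the shapes, scope name, and session setup below are illustrative only, not part of the library.

# Hypothetical usage sketch (TF1 graph mode); shapes and names are made up.
import tensorflow as tf

x = tf.random_normal([8, 128, 768])        # [batch, seq, hidden]
y = norm(x, scope='ln_example')            # per-feature mean 0 / var 1, then scale & shift
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(tf.reduce_mean(y)))     # close to the learned bias `b` (0 at init)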
def __init__(self, is_training, input_tensor, label_ids, label_size=2, sample_weight=None, label_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        probs = tf.nn.sigmoid(logits, name='probs')
        self.probs['probs'] = probs
        self.preds['preds'] = tf.greater(probs, 0.5)

        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(label_ids, dtype=tf.float32))
        if label_weight is not None:
            label_weight = tf.constant(label_weight, dtype=tf.float32)
            label_weight = tf.reshape(label_weight, [1, label_size])
            per_example_loss *= label_weight
        per_example_loss = tf.reduce_mean(per_example_loss, axis=-1)
        if sample_weight is not None:
            per_example_loss *= sample_weight

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def __init__(self, is_training, input_tensor, label_ids, sample_weight=None, scope='mrc', name='', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[2, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[2],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, 2])
        logits = tf.transpose(logits, [0, 2, 1])

        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        self.probs[name] = probs

        start_one_hot_labels = tf.one_hot(label_ids[:, 0], depth=seq_length, dtype=tf.float32)
        end_one_hot_labels = tf.one_hot(label_ids[:, 1], depth=seq_length, dtype=tf.float32)
        start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
        end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
        per_example_loss = (
            -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs, axis=-1)
            - 0.5 * tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1))
        if sample_weight is not None:
            per_example_loss *= sample_weight

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses[name] = per_example_loss

        start_preds = tf.expand_dims(tf.argmax(logits[:, 0, :], axis=-1), axis=-1)
        end_preds = tf.expand_dims(tf.argmax(logits[:, 1, :], axis=-1), axis=-1)
        self.preds[name] = tf.concat([start_preds, end_preds], axis=-1)
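A small post-processing sketch showing how a [start, end] prediction from this head can be turned back into answer tokens; the token list and prediction values are made up for illustration.

# Hypothetical post-processing of `self.preds[name]` for one example.
tokens = ['[CLS]', 'who', 'wrote', 'it', '[SEP]', 'it', 'was', 'written', 'by', 'poe', '[SEP]']
start, end = 9, 9                                   # one row of the [start, end] predictions
answer = tokens[start:end + 1] if 0 < start <= end else []
print(answer)                                       # ['poe']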
def _get_pred_loss(self, teacher_logits, student_logits, weights):
    teacher_probs = tf.nn.softmax(teacher_logits, axis=-1)
    teacher_probs = tf.stop_gradient(teacher_probs)
    student_log_probs = tf.nn.log_softmax(student_logits, axis=-1)
    pred_loss = (
        - tf.reduce_sum(teacher_probs * student_log_probs, axis=-1) *
        tf.reshape(weights, [-1, 1]))
    pred_loss = tf.reduce_mean(pred_loss)
    return pred_loss
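A small NumPy illustration of the soft-label cross-entropy computed above; the probability values are made up.

# Soft-label cross-entropy on made-up values: the student is penalised against the
# teacher's full distribution, not just the argmax label.
import numpy as np

teacher_probs = np.array([0.7, 0.2, 0.1])
student_log_probs = np.log(np.array([0.5, 0.3, 0.2]))
print(-np.sum(teacher_probs * student_log_probs))   # ~0.89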
def __init__(self, is_training, input_tensor, label_ids, label_size=2, sample_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds['preds'] = tf.argmax(logits, axis=-1)
        self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        if sample_weight is not None:
            per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss

        thresh = kwargs.get('tsa_thresh')
        if thresh is not None:
            assert isinstance(thresh, float), (
                '`tsa_thresh` must be a float between 0 and 1.')
            uncertainty = tf.reduce_sum(
                self.probs['probs'] * tf.log(self.probs['probs']), axis=-1)
            uncertainty /= tf.log(1 / label_size)
            per_example_loss = tf.cast(
                tf.greater(uncertainty, thresh), dtype=tf.float32) * per_example_loss

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
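The `tsa_thresh` branch above keeps only examples whose normalized entropy exceeds the threshold; a small NumPy illustration with made-up probabilities:

# Normalized entropy used by the `tsa_thresh` filter: sum(p*log p) / log(1/K) is 0
# for a one-hot distribution and 1 for a uniform one, so confident (low-entropy)
# examples fall below the threshold and get zero loss.
import numpy as np

def normalized_entropy(p):
    return np.sum(p * np.log(p)) / np.log(1 / len(p))

print(normalized_entropy(np.array([0.97, 0.03])))   # ~0.19, confident -> dropped at thresh=0.5
print(normalized_entropy(np.array([0.55, 0.45])))   # ~0.99, uncertain -> kept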
def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout, dropatt, input_mask, is_training, initializer, scope=None, reuse=None, use_proj=True):
    '''
    Different classification tasks may or may not share the same parameters
    to summarize the sequence features.

    If shared, one can keep the `scope` to the default value `None`.
    Otherwise, one should specify a different `scope` for each task.
    '''
    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'last':
            summary = hidden[-1]
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'attn':
            bsz = tf.shape(hidden)[1]

            summary_bias = tf.get_variable('summary_bias', [d_model],
                                           dtype=hidden.dtype,
                                           initializer=initializer)
            summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])

            if input_mask is not None:
                input_mask = input_mask[None, :, :, None]

            summary = multihead_attn(summary_bias, hidden, hidden, input_mask,
                                     d_model, n_head, d_head, dropout, dropatt,
                                     is_training, initializer, residual=False)
            summary = summary[0]
        else:
            raise ValueError('Unsupported summary type %s' % summary_type)

        # use another projection as in BERT
        if use_proj:
            summary = tf.layers.dense(
                summary, d_model,
                activation=tf.tanh,
                kernel_initializer=initializer,
                name='summary')

        # dropout
        summary = tf.layers.dropout(
            summary, dropout, training=is_training, name='dropout')

    return summary
def __init__(self, is_training, input_tensor, input_mask, label_ids, label_size=5, sample_weight=None, scope='cls/sequence', name='', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, label_size])

        with tf.variable_scope('crf'):
            input_length = tf.reduce_sum(input_mask, axis=-1)
            per_example_loss, transition_matrix = contrib.crf.crf_log_likelihood(
                inputs=logits,
                tag_indices=label_ids,
                sequence_lengths=input_length)
            per_example_loss = -per_example_loss
            if sample_weight is not None:
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses[name] = per_example_loss
            self.preds[name] = tf.argmax(logits, axis=-1)
            self.probs['logits'] = logits
            self.probs['transition_matrix'] = transition_matrix
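The head exports both `logits` and `transition_matrix`, so a Viterbi decode can be run outside the graph instead of relying on the per-token argmax. A hedged sketch assuming TF1's `tf.contrib.crf.viterbi_decode` and NumPy arrays fetched from a session run:

# Hedged sketch of offline Viterbi decoding; array shapes are assumptions about the
# exported tensors, and the actual decoding helper in this codebase may differ.
import numpy as np
from tensorflow.contrib import crf

def viterbi_tags(logits, transition_matrix, length):
    # logits: [seq_length, label_size] for one example; length: its true length
    tags, _ = crf.viterbi_decode(logits[:length], transition_matrix)
    return tags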
def __init__(self, is_training, input_tensor, label_ids, label_size=2, sample_weight=None, scope='cls/seq_relationship', name='', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds[name] = tf.argmax(logits, axis=-1)
        self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        if sample_weight is not None:
            per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss

        self.losses[name] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def _cls_forward(self, is_training, input_tensor, input_mask, label_ids, bert_config, batch_size, max_seq_length, prob, scope, name, sample_weight=None, hidden_dropout_prob=0.1, initializer_range=0.02):
    with tf.variable_scope(scope):
        logits = tf.layers.dense(
            input_tensor, 2,
            kernel_initializer=util.create_initializer(bert_config.initializer_range),
            trainable=True)

        # loss
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=2)
        per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)

        input_mask = tf.cast(input_mask, tf.float32)
        per_token_loss *= input_mask / tf.reduce_sum(input_mask, keepdims=True, axis=-1)
        per_example_loss = tf.reduce_sum(per_token_loss, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

        if prob != 0:
            self.total_loss += tf.reduce_mean(per_example_loss)
        self.losses[name + '_loss'] = per_example_loss
        self.preds[name + '_preds'] = tf.argmax(logits, axis=-1)
def __init__(self, is_training, input_tensor, n_wide_features, wide_features, label_ids, label_size=2, sample_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs): super().__init__(**kwargs) hidden_size = input_tensor.shape.as_list()[-1] feature_size = wide_features.shape.as_list()[-1] with tf.variable_scope('wide'): feature_embeddings = tf.get_variable( name='feature_embeddings', shape=[feature_size + 1, hidden_size], initializer=util.create_initializer(initializer_range), trainable=trainable) wide_output = tf.gather(feature_embeddings, wide_features) # [B, N, H] with tf.variable_scope('wide_and_deep'): deep_output = tf.expand_dims(input_tensor, -1) # [B, H, 1] attention_scores = tf.matmul(wide_output, deep_output) # [B, N, 1] attention_scores = tf.transpose(attention_scores, [0, 2, 1]) # [B, 1, N] attention_scores = tf.multiply(attention_scores, 1.0 / math.sqrt(hidden_size)) feature_mask = tf.cast( tf.sequence_mask(n_wide_features, feature_size), tf.float32) # [B, N] feature_mask = tf.expand_dims(feature_mask, 1) # [B, 1, N] attention_scores += (1.0 - feature_mask) * -10000.0 attention_matrix = tf.nn.softmax(attention_scores, axis=-1) attention_output = tf.matmul(attention_matrix, wide_output) # [B, 1, H] attention_output = attention_output[:, 0, :] # [B, H] # attention_output = util.dropout( # attention_output, hidden_dropout_prob) input_tensor = util.layer_norm(attention_output + input_tensor, trainable=trainable) with tf.variable_scope(scope): output_weights = tf.get_variable( 'output_weights', shape=[label_size, hidden_size], initializer=util.create_initializer(initializer_range), trainable=trainable) output_bias = tf.get_variable('output_bias', shape=[label_size], initializer=tf.zeros_initializer(), trainable=trainable) output_layer = util.dropout( input_tensor, hidden_dropout_prob if is_training else 0.0) logits = tf.matmul(output_layer, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) self.preds['preds'] = tf.argmax(logits, axis=-1) self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs') log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) if sample_weight is not None: per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss thresh = kwargs.get('tsa_thresh') if thresh is not None: assert isinstance( thresh, float), ('`tsa_thresh` must be a float between 0 and 1.') uncertainty = tf.reduce_sum(self.probs['probs'] * tf.log(self.probs['probs']), axis=-1) uncertainty /= tf.log(1 / label_size) per_example_loss = tf.cast( tf.greater(uncertainty, thresh), dtype=tf.float32) * \ per_example_loss self.losses['losses'] = per_example_loss self.total_loss = tf.reduce_mean(per_example_loss)
def __init__(self, is_training, input_tensor, is_supervised, is_expanded, label_ids, label_size=2, sample_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, global_step=None, num_train_steps=None, uda_softmax_temp=-1, uda_confidence_thresh=-1, tsa_schedule='linear', **kwargs):
    super().__init__(**kwargs)

    is_supervised = tf.cast(is_supervised, tf.float32)
    is_expanded = tf.cast(is_expanded, tf.float32)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        with tf.variable_scope('sup_loss'):
            # reshape
            sup_ori_log_probs = tf.boolean_mask(log_probs, mask=(1.0 - is_expanded), axis=0)
            sup_log_probs = tf.boolean_mask(sup_ori_log_probs, mask=is_supervised, axis=0)
            sup_label_ids = tf.boolean_mask(label_ids, mask=is_supervised, axis=0)

            self.preds['preds'] = tf.argmax(sup_ori_log_probs, axis=-1)

            one_hot_labels = tf.one_hot(sup_label_ids, depth=label_size, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * sup_log_probs, axis=-1)

            loss_mask = tf.ones_like(per_example_loss, dtype=tf.float32)
            correct_label_probs = tf.reduce_sum(one_hot_labels * tf.exp(sup_log_probs), axis=-1)

            if is_training and tsa_schedule:
                tsa_start = 1.0 / label_size
                tsa_threshold = get_tsa_threshold(
                    tsa_schedule, global_step, num_train_steps, tsa_start, end=1)

                larger_than_threshold = tf.greater(correct_label_probs, tsa_threshold)
                loss_mask = loss_mask * (1 - tf.cast(larger_than_threshold, tf.float32))

            loss_mask = tf.stop_gradient(loss_mask)
            per_example_loss = per_example_loss * loss_mask
            if sample_weight is not None:
                sup_sample_weight = tf.boolean_mask(sample_weight, mask=is_supervised, axis=0)
                per_example_loss *= tf.cast(sup_sample_weight, dtype=tf.float32)
            sup_loss = (tf.reduce_sum(per_example_loss) /
                        tf.maximum(tf.reduce_sum(loss_mask), 1))
            self.losses['supervised'] = per_example_loss

        with tf.variable_scope('unsup_loss'):
            # reshape
            ori_log_probs = tf.boolean_mask(sup_ori_log_probs, mask=(1.0 - is_supervised), axis=0)
            aug_log_probs = tf.boolean_mask(log_probs, mask=is_expanded, axis=0)
            sup_ori_logits = tf.boolean_mask(logits, mask=(1.0 - is_expanded), axis=0)
            ori_logits = tf.boolean_mask(sup_ori_logits, mask=(1.0 - is_supervised), axis=0)

            unsup_loss_mask = 1
            if uda_softmax_temp != -1:
                tgt_ori_log_probs = tf.nn.log_softmax(ori_logits / uda_softmax_temp, axis=-1)
                tgt_ori_log_probs = tf.stop_gradient(tgt_ori_log_probs)
            else:
                tgt_ori_log_probs = tf.stop_gradient(ori_log_probs)

            if uda_confidence_thresh != -1:
                largest_prob = tf.reduce_max(tf.exp(ori_log_probs), axis=-1)
                unsup_loss_mask = tf.cast(
                    tf.greater(largest_prob, uda_confidence_thresh), tf.float32)
                unsup_loss_mask = tf.stop_gradient(unsup_loss_mask)

            per_example_loss = kl_for_log_probs(
                tgt_ori_log_probs, aug_log_probs) * unsup_loss_mask
            if sample_weight is not None:
                unsup_sample_weight = tf.boolean_mask(
                    sample_weight, mask=(1.0 - is_supervised), axis=0)
                per_example_loss *= tf.cast(unsup_sample_weight, dtype=tf.float32)
            unsup_loss = tf.reduce_mean(per_example_loss)
            self.losses['unsupervised'] = per_example_loss

    self.total_loss = sup_loss + unsup_loss
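The `get_tsa_threshold` helper called above is not shown here; a minimal sketch of the linear/log/exp annealing schedules described in the UDA paper, which the call appears to assume (the real helper in this codebase may differ in details):

# Hedged sketch of a training-signal-annealing schedule consistent with the call above.
def get_tsa_threshold(schedule, global_step, num_train_steps, start, end):
    t = tf.cast(global_step, tf.float32) / tf.cast(num_train_steps, tf.float32)
    if schedule == 'linear':
        coeff = t
    elif schedule == 'exp':
        coeff = tf.exp((t - 1) * 5)     # releases examples slowly at first
    elif schedule == 'log':
        coeff = 1 - tf.exp(-t * 5)      # releases most examples early
    else:
        raise ValueError('Unsupported schedule: %s' % schedule)
    return coeff * (end - start) + start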
def __init__(self, tiny_bert_config, bert_config, is_training, input_ids, input_mask, segment_ids, label_ids=None, sample_weight=None, scope='bert', name='', dtype=tf.float32, use_tilda_embedding=False, drop_pooler=False, label_size=2, trainable=True, **kwargs):
    super(TinyBERTCLSDistillor, self).__init__()

    def _get_logits(pooled_output, hidden_size, scope, trainable):
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(bert_config.initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)
            logits = tf.matmul(pooled_output, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            return logits

    student = BERTEncoder(
        bert_config=tiny_bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        scope='tiny/bert',
        use_tilda_embedding=use_tilda_embedding,
        drop_pooler=drop_pooler,
        trainable=True,
        **kwargs)
    student_logits = _get_logits(
        student.get_pooled_output(), tiny_bert_config.hidden_size,
        'tiny/cls/seq_relationship', True)

    if is_training:
        teacher = BERTEncoder(
            bert_config=bert_config,
            is_training=False,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            scope=scope,
            use_tilda_embedding=False,
            drop_pooler=drop_pooler,
            trainable=False,
            **kwargs)
        teacher_logits = _get_logits(
            teacher.get_pooled_output(), bert_config.hidden_size,
            'cls/seq_relationship', False)

        weights = 1.0
        if sample_weight is not None:
            weights = tf.cast(sample_weight, dtype=tf.float32)

        # embedding loss
        teacher_embedding = teacher.get_embedding_output()
        student_embedding = student.get_embedding_output()
        with tf.variable_scope('embedding_loss'):
            linear_trans = tf.layers.dense(
                student_embedding, bert_config.hidden_size,
                kernel_initializer=util.create_initializer(bert_config.initializer_range))
            embedding_loss = tf.losses.mean_squared_error(
                linear_trans, teacher_embedding,
                weights=tf.reshape(weights, [-1, 1, 1]))

        # attention loss
        teacher_attention_scores = teacher.get_attention_scores()
        student_attention_scores = student.get_attention_scores()
        num_teacher_hidden_layers = bert_config.num_hidden_layers
        num_student_hidden_layers = tiny_bert_config.num_hidden_layers
        num_projections = int(num_teacher_hidden_layers / num_student_hidden_layers)
        attention_losses = []
        for i in range(num_student_hidden_layers):
            attention_losses.append(tf.losses.mean_squared_error(
                teacher_attention_scores[num_projections * i + num_projections - 1],
                student_attention_scores[i],
                weights=tf.reshape(weights, [-1, 1, 1, 1])))
        attention_loss = tf.add_n(attention_losses)

        # hidden loss
        teacher_hidden_layers = teacher.all_encoder_layers
        student_hidden_layers = student.all_encoder_layers
        num_teacher_hidden_layers = bert_config.num_hidden_layers
        num_student_hidden_layers = tiny_bert_config.num_hidden_layers
        num_projections = int(num_teacher_hidden_layers / num_student_hidden_layers)
        with tf.variable_scope('hidden_loss'):
            hidden_losses = []
            for i in range(num_student_hidden_layers):
                hidden_losses.append(tf.losses.mean_squared_error(
                    teacher_hidden_layers[num_projections * i + num_projections - 1],
                    tf.layers.dense(
                        student_hidden_layers[i], bert_config.hidden_size,
                        kernel_initializer=util.create_initializer(
                            bert_config.initializer_range)),
                    weights=tf.reshape(weights, [-1, 1, 1])))
            hidden_loss = tf.add_n(hidden_losses)

        # prediction loss
        teacher_probs = tf.nn.softmax(teacher_logits, axis=-1)
        student_log_probs = tf.nn.log_softmax(student_logits, axis=-1)
        pred_loss = (
            -tf.reduce_sum(teacher_probs * student_log_probs, axis=-1) *
            tf.reshape(weights, [-1, 1]))
        pred_loss = tf.reduce_mean(pred_loss)

        # sum up
        distill_loss = (embedding_loss + attention_loss + hidden_loss + pred_loss)
        self.total_loss = distill_loss
        self.losses['distill'] = tf.reshape(distill_loss, [1])

    else:
        student_probs = tf.nn.softmax(student_logits, axis=-1, name='probs')
        self.probs[name] = student_probs
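A worked example of the layer mapping used in the attention and hidden losses above: with a 12-layer teacher and a 4-layer student, `num_projections` is 3, so student layer i distills from teacher layer 3*i + 2 (0-indexed).

# Layer mapping illustration for the distillation losses above (made-up layer counts).
num_teacher_hidden_layers, num_student_hidden_layers = 12, 4
num_projections = num_teacher_hidden_layers // num_student_hidden_layers
print([num_projections * i + num_projections - 1
       for i in range(num_student_hidden_layers)])   # [2, 5, 8, 11]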
def __init__(self, vocab_size, is_training, source_ids, target_ids, sos_id, sample_weight=None, hidden_size=768, num_blocks=6, num_attention_heads=12, scope='transformer', use_label_smoothing=False, use_tilda_embedding=False, trainable=True, **kwargs):
    super().__init__()

    dropout_rate = 0.0
    if is_training:
        dropout_rate = 0.1

    source_shape = util.get_shape_list(source_ids, expected_rank=2)
    target_shape = util.get_shape_list(target_ids, expected_rank=2)
    batch_size = source_shape[0]
    source_max_seq_length = source_shape[1]
    target_max_seq_length = target_shape[1]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        source_mask = tf.math.equal(source_ids, 0)

        # embedding
        with tf.variable_scope('embeddings'):
            (enc, embedding_table) = embedding_lookup(
                input_ids=source_ids,
                vocab_size=vocab_size,
                batch_size=batch_size,
                max_seq_length=source_max_seq_length,
                embedding_size=hidden_size,
                word_embedding_name='word_embeddings',
                tilda_embeddings=tilda_embeddings)
            enc *= hidden_size ** 0.5  # scale
            enc += positional_encoding(enc, source_max_seq_length)
            enc = util.dropout(enc, dropout_rate)

        with tf.variable_scope('encoder'):
            # stacked multi-attention layers
            for i in range(num_blocks):
                with tf.variable_scope('block_%s' % i):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc, keys=enc, values=enc,
                        key_masks=source_mask,
                        num_heads=num_attention_heads,
                        dropout_rate=dropout_rate,
                        training=is_training,
                        causality=False,
                        scope='self_attention')
                    # feed forward
                    enc = ff(enc, num_units=[hidden_size * 4, hidden_size])
            memory = enc

        def _forward(target_ids, target_mask, target_max_seq_length):
            with tf.variable_scope('decoder'):
                # shared embedding
                dec = tf.nn.embedding_lookup(embedding_table, target_ids)
                dec *= hidden_size ** 0.5  # scale
                dec += positional_encoding(dec, target_max_seq_length)
                dec = util.dropout(dec, dropout_rate)

                # blocks
                for i in range(num_blocks):
                    with tf.variable_scope('block_%s' % i):
                        # masked self-attention
                        dec = multihead_attention(
                            queries=dec, keys=dec, values=dec,
                            key_masks=target_mask,
                            num_heads=num_attention_heads,
                            dropout_rate=dropout_rate,
                            training=is_training,
                            causality=True,
                            scope='masked_self_attention')
                        # vanilla attention
                        dec = multihead_attention(
                            queries=dec, keys=memory, values=memory,
                            key_masks=source_mask,
                            num_heads=num_attention_heads,
                            dropout_rate=dropout_rate,
                            training=is_training,
                            causality=False,
                            scope='vanilla_attention')
                        # feed forward
                        dec = ff(dec, num_units=[4 * hidden_size, hidden_size])

            # final linear projection (embedding weights are shared)
            with tf.variable_scope('cls'):
                output_bias = tf.get_variable(
                    'output_bias', shape=[vocab_size],
                    initializer=tf.zeros_initializer())
                dec = tf.reshape(dec, [-1, hidden_size])
                logits = tf.matmul(dec, embedding_table, transpose_b=True)
                logits = tf.reshape(logits, [-1, target_max_seq_length, vocab_size])
                logits = tf.nn.bias_add(logits, output_bias)

            return logits

        # convert to labels
        label_ids = tf.concat(
            [target_ids[:, 1:], tf.zeros([batch_size, 1], dtype=tf.int32)], axis=-1)

        # forward once
        if is_training:
            target_mask = tf.math.equal(target_ids, 0)  # (N, T2)
            logits = _forward(target_ids, target_mask, target_max_seq_length)
            self.preds['MT'] = tf.argmax(logits, axis=-1)

        # forward loop
        else:
            target_mask_base = tf.zeros([batch_size, 1], dtype=tf.int32)
            target_ids = tf.ones([batch_size, 1], dtype=tf.int32) * sos_id

            for cur_length in range(1, target_max_seq_length + 1):
                target_mask = tf.tile(target_mask_base, [1, cur_length])
                logits = _forward(target_ids, target_mask, cur_length)

                pred_ids = tf.argmax(logits[:, cur_length-1:cur_length, :], axis=-1)
                pred_ids = tf.cast(pred_ids, tf.int32)
                target_ids = tf.concat([target_ids, pred_ids], axis=-1)

            self.preds['MT'] = target_ids[:, 1:]

        # loss
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=vocab_size)
        if use_label_smoothing:
            one_hot_labels = label_smoothing(one_hot_labels)
        per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)

        label_mask = tf.cast(tf.not_equal(label_ids, 0), tf.float32)
        per_example_loss = \
            tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \
            tf.reduce_sum(label_mask, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses['MT'] = per_example_loss
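The `positional_encoding` helper is referenced above but not shown; a sketch assuming the standard sinusoidal encoding from "Attention Is All You Need" (the version in this codebase may mask padded positions or differ in other details):

# Hedged sketch of a sinusoidal positional encoding compatible with the calls above.
import numpy as np
import tensorflow as tf

def positional_encoding(inputs, max_seq_length):
    d_model = inputs.shape.as_list()[-1]
    pos = np.arange(max_seq_length)[:, None]
    i = np.arange(d_model)[None, :]
    angles = pos / np.power(10000, (2 * (i // 2)) / d_model)
    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])
    enc = tf.convert_to_tensor(angles, dtype=inputs.dtype)   # [T, d_model]
    return tf.expand_dims(enc, 0)                            # broadcast over batch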
def __init__(self, bert_config, is_training, input_ids, input_mask, segment_ids, sample_weight=None, scope='bert', dtype=tf.float32, drop_pooler=False, cls_model='self-attention', label_size=2, speed=0.1, ignore_cls='0', **kwargs):
    super(FastBERTCLSDistillor, self).__init__()

    if not ignore_cls:
        ignore_cls = []
    if isinstance(ignore_cls, str):
        ignore_cls = ignore_cls.replace(' ', '').split(',')
        ignore_cls = list(map(int, ignore_cls))
    elif isinstance(ignore_cls, list):
        ignore_cls = list(map(int, ignore_cls))
    else:
        raise ValueError(
            '`ignore_cls` should be a list of child-classifier ids or '
            'a string separated with commas.')

    if not speed:
        raise ValueError(
            '`speed` should be a float number between `0` and `1`.')

    bert_config = copy.deepcopy(bert_config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    max_seq_length = input_shape[1]

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):
            (self.embedding_output, self.embedding_table) = self.embedding_lookup(
                input_ids=input_ids,
                vocab_size=bert_config.vocab_size,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                embedding_size=bert_config.hidden_size,
                initializer_range=bert_config.initializer_range,
                word_embedding_name='word_embeddings',
                dtype=dtype,
                trainable=False,
                tilda_embeddings=None)

            # Add positional embeddings and token type embeddings,
            # then layer normalize and perform dropout.
            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                hidden_size=bert_config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=bert_config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=bert_config.initializer_range,
                max_position_embeddings=bert_config.max_position_embeddings,
                dropout_prob=bert_config.hidden_dropout_prob,
                dtype=dtype,
                trainable=False)

        with tf.variable_scope('encoder'):
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, max_seq_length, dtype=dtype)

            # stacked transformers
            (self.all_encoder_layers, self.all_cls_layers) = self.dynamic_transformer_model(
                is_training,
                input_tensor=self.embedding_output,
                input_mask=input_mask,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                label_size=label_size,
                attention_mask=attention_mask,
                hidden_size=bert_config.hidden_size,
                num_hidden_layers=bert_config.num_hidden_layers,
                num_attention_heads=bert_config.num_attention_heads,
                intermediate_size=bert_config.intermediate_size,
                intermediate_act_fn=util.get_activation(bert_config.hidden_act),
                hidden_dropout_prob=bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
                initializer_range=bert_config.initializer_range,
                dtype=dtype,
                cls_model=cls_model,
                speed=speed,
                ignore_cls=ignore_cls)

        self.sequence_output = self.all_encoder_layers[-1]
        with tf.variable_scope('pooler'):
            first_token_tensor = self.sequence_output[:, 0, :]

            # trick: ignore the fully connected layer
            if drop_pooler:
                self.pooled_output = first_token_tensor
            else:
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    bert_config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=False)

    # teacher classifier
    if bert_config.num_hidden_layers not in ignore_cls:
        with tf.variable_scope('cls/seq_relationship'):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, bert_config.hidden_size],
                initializer=util.create_initializer(bert_config.initializer_range),
                trainable=False)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=False)

            logits = tf.matmul(self.pooled_output, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probs = tf.nn.softmax(logits, axis=-1)

    # distillation
    if is_training:
        losses = []
        for cls_probs in self.all_cls_layers.values():
            # KL-Divergence
            per_example_loss = tf.reduce_sum(
                cls_probs * (tf.log(cls_probs) - tf.log(probs)), axis=-1)
            if sample_weight is not None:
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)
            loss = tf.reduce_mean(per_example_loss)
            losses.append(loss)
        distill_loss = tf.add_n(losses)
        self.total_loss = distill_loss
        self.losses['losses'] = distill_loss

    else:
        if bert_config.num_hidden_layers not in ignore_cls:
            self.all_cls_layers[bert_config.num_hidden_layers] = probs
        self.probs['probs'] = tf.concat(
            list(self.all_cls_layers.values()), axis=0, name='probs')
def __init__(self, bert_config, is_training, encoder, masked_lm_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels, sample_weight=None, scope_lm='cls/predictions', scope_cls='cls/seq_relationship', trainable=True, use_nsp_loss=True, **kwargs): super(BERTDecoder, self).__init__(**kwargs) def gather_indexes(sequence_tensor, positions): sequence_shape = util.get_shape_list(sequence_tensor, 3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) flat_positions = tf.reshape(positions + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) return output_tensor scalar_losses = [] # masked language modeling input_tensor = gather_indexes(encoder.get_sequence_output(), masked_lm_positions) with tf.variable_scope(scope_lm): with tf.variable_scope('transform'): input_tensor = tf.layers.dense( input_tensor, units=bert_config.hidden_size, activation=util.get_activation(bert_config.hidden_act), kernel_initializer=util.create_initializer( bert_config.initializer_range)) input_tensor = util.layer_norm(input_tensor) output_bias = tf.get_variable('output_bias', shape=[bert_config.vocab_size], initializer=tf.zeros_initializer(), trainable=trainable) logits = tf.matmul(input_tensor, encoder.get_embedding_table(), transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) probs = tf.nn.softmax(logits, axis=-1, name='MLM_probs') log_probs = tf.nn.log_softmax(logits, axis=-1) label_ids = tf.reshape(masked_lm_ids, [-1]) if sample_weight is not None: sample_weight = tf.expand_dims(tf.cast(sample_weight, dtype=tf.float32), axis=-1) masked_lm_weights *= sample_weight label_weights = tf.reshape(masked_lm_weights, [-1]) one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32) per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) per_example_loss = label_weights * per_example_loss numerator = tf.reduce_sum(per_example_loss) denominator = tf.reduce_sum(label_weights) + 1e-5 loss = numerator / denominator scalar_losses.append(loss) self.losses['MLM_losses'] = per_example_loss self.preds['MLM_preds'] = tf.argmax(probs, axis=-1) # next sentence prediction with tf.variable_scope(scope_cls): output_weights = tf.get_variable( 'output_weights', shape=[2, bert_config.hidden_size], initializer=util.create_initializer( bert_config.initializer_range), trainable=trainable) output_bias = tf.get_variable('output_bias', shape=[2], initializer=tf.zeros_initializer(), trainable=trainable) logits = tf.matmul(encoder.get_pooled_output(), output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) probs = tf.nn.softmax(logits, axis=-1, name='probs') log_probs = tf.nn.log_softmax(logits, axis=-1) labels = tf.reshape(next_sentence_labels, [-1]) one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) if sample_weight is not None: per_example_loss = (tf.cast(sample_weight, dtype=tf.float32) * per_example_loss) loss = tf.reduce_mean(per_example_loss) if use_nsp_loss: scalar_losses.append(loss) self.losses['NSP_losses'] = per_example_loss self.probs['NSP_probs'] = probs self.preds['NSP_preds'] = tf.argmax(probs, axis=-1) self.total_loss = tf.add_n(scalar_losses)
def __init__(self, bert_config, is_training, input_tensor, sa_mask, label_ids, sample_weight=None, scope='sanet', alpha=0.5, hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs): super().__init__(**kwargs) shape = util.get_shape_list(input_tensor) batch_size = shape[0] seq_length = shape[1] hidden_size = shape[2] sa_mask = tf.reshape(sa_mask, [batch_size, seq_length, seq_length]) with tf.variable_scope(scope): with tf.variable_scope('sentence_attention'): (sa_output, _) = self.attention_layer( from_tensor=input_tensor, to_tensor=input_tensor, attention_mask=sa_mask, num_attention_heads=bert_config.num_attention_heads, size_per_head=\ hidden_size // bert_config.num_attention_heads, attention_probs_dropout_prob=\ bert_config.hidden_dropout_prob, initializer_range=bert_config.initializer_range, do_return_2d_tensor=False, batch_size=batch_size, from_max_seq_length=seq_length, to_max_seq_length=seq_length, trainable=trainable) with tf.variable_scope('cls/mrc'): output_weights = tf.get_variable( 'output_weights', shape=[2, hidden_size], initializer=util.create_initializer(initializer_range), trainable=trainable) output_bias = tf.get_variable( 'output_bias', shape=[2], initializer=tf.zeros_initializer(), trainable=trainable) output_layer = alpha * sa_output + (1 - alpha) * input_tensor output_layer = tf.reshape(output_layer, [-1, hidden_size]) logits = tf.matmul(output_layer, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [-1, seq_length, 2]) logits = tf.transpose(logits, [0, 2, 1]) probs = tf.nn.softmax(logits, axis=-1, name='probs') self.probs['probs'] = probs self.preds['preds'] = tf.argmax(logits, axis=-1) start_one_hot_labels = tf.one_hot(label_ids[:, 0], depth=seq_length, dtype=tf.float32) end_one_hot_labels = tf.one_hot(label_ids[:, 1], depth=seq_length, dtype=tf.float32) start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1) end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1) per_example_loss = ( -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs, axis=-1) - 0.5 * tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1)) if sample_weight is not None: per_example_loss *= sample_weight self.total_loss = tf.reduce_mean(per_example_loss) self.losses['losses'] = per_example_loss
def __init__(self, bert_config, is_training, sketchy_encoder, intensive_encoder, query_mask, label_ids, has_answer, sample_weight=None, scope='retro_reader', matching_mechanism='cross-attention', beta_1=0.5, beta_2=0.5, threshold=1.0, trainable=True, **kwargs):
    super().__init__(**kwargs)

    # verifier
    with tf.variable_scope(scope):

        # sketchy reading module
        with tf.variable_scope('sketchy/prediction'):
            sketchy_output = sketchy_encoder.get_pooled_output()
            hidden_size = sketchy_output.shape.as_list()[-1]

            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, hidden_size],
                initializer=util.create_initializer(bert_config.initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[2],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = util.dropout(
                sketchy_output,
                bert_config.hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(has_answer, depth=2, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss

            self.losses['sketchy_losses'] = per_example_loss
            sketchy_loss = tf.reduce_mean(per_example_loss)

            score_ext = logits[:, 1] - logits[:, 0]

        # intensive reading module
        with tf.variable_scope('intensive'):
            H = intensive_encoder.get_sequence_output()
            H_Q = H * tf.cast(tf.expand_dims(query_mask, axis=-1), tf.float32)
            (batch_size, max_seq_length, hidden_size) = util.get_shape_list(H)

            # cross-attention
            if matching_mechanism == 'cross-attention':
                with tf.variable_scope('cross_attention'):
                    attention_mask = self.create_attention_mask_from_input_mask(
                        query_mask, batch_size, max_seq_length)
                    (H_prime, _) = self.attention_layer(
                        from_tensor=H,
                        to_tensor=H_Q,
                        attention_mask=attention_mask,
                        num_attention_heads=bert_config.num_attention_heads,
                        size_per_head=hidden_size // bert_config.num_attention_heads,
                        attention_probs_dropout_prob=bert_config.hidden_dropout_prob,
                        initializer_range=bert_config.initializer_range,
                        do_return_2d_tensor=False,
                        batch_size=batch_size,
                        from_max_seq_length=max_seq_length,
                        to_max_seq_length=max_seq_length,
                        trainable=trainable)

            # matching-attention
            elif matching_mechanism == 'matching-attention':
                with tf.variable_scope('matching_attention'):
                    output_weights = tf.get_variable(
                        'output_weights',
                        shape=[hidden_size, hidden_size],
                        initializer=util.create_initializer(bert_config.initializer_range),
                        trainable=trainable)
                    output_bias = tf.get_variable(
                        'output_bias',
                        shape=[hidden_size],
                        initializer=tf.zeros_initializer(),
                        trainable=trainable)
                    trans = tf.matmul(
                        H_Q,
                        tf.tile(tf.expand_dims(output_weights, axis=0), [batch_size, 1, 1]),
                        transpose_b=True)
                    trans = tf.nn.bias_add(trans, output_bias)
                    M = tf.nn.softmax(tf.matmul(H, trans, transpose_b=True), axis=-1)
                    H_prime = tf.matmul(M, H_Q)

            with tf.variable_scope('prediction'):
                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[2, hidden_size],
                    initializer=util.create_initializer(bert_config.initializer_range),
                    trainable=trainable)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[2],
                    initializer=tf.zeros_initializer(),
                    trainable=trainable)

                output_layer = util.dropout(
                    H_prime,
                    bert_config.hidden_dropout_prob if is_training else 0.0)
                output_layer = tf.reshape(
                    output_layer, [batch_size * max_seq_length, hidden_size])
                logits = tf.matmul(output_layer, output_weights, transpose_b=True)
                logits = tf.nn.bias_add(logits, output_bias)
                logits = tf.reshape(logits, [batch_size, max_seq_length, 2])
                logits = tf.transpose(logits, [0, 2, 1])

                probs = tf.nn.softmax(logits, axis=-1, name='probs')
                self.probs['mrc_probs'] = probs
                self.preds['mrc_preds'] = tf.argmax(logits, axis=-1)

                start_one_hot_labels = tf.one_hot(
                    label_ids[:, 0], depth=max_seq_length, dtype=tf.float32)
                end_one_hot_labels = tf.one_hot(
                    label_ids[:, 1], depth=max_seq_length, dtype=tf.float32)
                start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
                end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
                per_example_loss = (
                    -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs, axis=-1)
                    - 0.5 * tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1))
                if sample_weight is not None:
                    per_example_loss *= sample_weight

                intensive_loss = tf.reduce_mean(per_example_loss)
                self.losses['intensive_losses'] = per_example_loss

                score_has = tf.norm(probs[:, 0, 1:] + probs[:, 1, 1:], np.inf, axis=-1)
                score_null = probs[:, 0, 0] + probs[:, 1, 0]
                score_diff = score_has - score_null

        # rear verification
        v = beta_1 * score_diff + beta_2 * score_ext
        self.preds['verifier_preds'] = tf.cast(tf.greater(v, threshold), tf.int32)
        self.probs['verifier_probs'] = v

        self.total_loss = sketchy_loss + intensive_loss
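A worked example of the rear verification above, with made-up scores: an example is predicted answerable only when the combined evidence from the sketchy and intensive readers clears the threshold.

# Rear verification on made-up scores (beta_1, beta_2, threshold as in the defaults).
beta_1, beta_2, threshold = 0.5, 0.5, 1.0
score_diff, score_ext = 1.4, 1.0        # intensive and sketchy scores for one example
v = beta_1 * score_diff + beta_2 * score_ext
print(v, int(v > threshold))            # 1.2 1  -> predicted as having an answer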
def __init__(self, bert_config, is_training, dilated_ids, label_ids, max_seq_length, spad_id=1, loop=3, sample_weight=None, scope='dilated', use_tilda_embedding=False, **kwargs): super().__init__() dilated_mask = tf.cast(tf.not_equal(dilated_ids, 0), tf.float32) shape = util.get_shape_list(dilated_ids, expected_rank=2) batch_size = shape[0] dilated_seq_length = shape[1] # Tilda embeddings for SMART algorithm tilda_embeddings = None if use_tilda_embedding: with tf.variable_scope('', reuse=True): tilda_embeddings = tf.get_variable('tilda_embeddings') with tf.variable_scope(scope): # forward once if is_training: logits = self._bert_forward(bert_config, dilated_ids, dilated_mask, batch_size, dilated_seq_length, tilda_embeddings=tilda_embeddings) self.preds['LM'] = tf.argmax(logits, axis=-1) # LM loss log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) input_length = tf.reduce_sum(dilated_mask, axis=-1) * 2 label_mask = tf.sequence_mask(input_length, max_seq_length * 2, dtype=tf.float32) per_example_loss = \ tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \ tf.reduce_sum(label_mask, axis=-1) if sample_weight is not None: per_example_loss *= tf.expand_dims(sample_weight, axis=-1) self.total_loss = tf.reduce_mean(per_example_loss) self.losses['LM'] = per_example_loss # forward loop else: def _forward(dilated_ids, dilated_mask): logits = self._bert_forward( bert_config, dilated_ids, dilated_mask, batch_size, dilated_seq_length, tilda_embeddings=tilda_embeddings) output_ids = tf.argmax(logits, axis=-1) output_ids = tf.cast(output_ids, dtype=tf.int32) # special padding (using `spad` token) equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32) equal_zero = tf.reduce_sum(equal_zero, axis=-1) right_pad = spad_id * tf.sequence_mask( equal_zero, dilated_seq_length, dtype=tf.int32) paded = tf.concat([output_ids, right_pad], axis=-1) # extract ids of length `max_seq_length` flattened_padded = tf.reshape(paded, [-1]) is_valid = tf.cast(tf.greater(flattened_padded, 0), dtype=tf.int32) flattened_valid = tf.boolean_mask(flattened_padded, is_valid) valid = tf.reshape(flattened_valid, [batch_size, dilated_seq_length]) cutted_valid = valid[:, :max_seq_length] # replace `spad` token with `pad` non_spad_mask = tf.cast(tf.not_equal( cutted_valid, spad_id), dtype=tf.int32) output_ids = cutted_valid * non_spad_mask output_length = tf.reduce_sum(non_spad_mask, axis=-1) # dilate reshaped_ids = tf.reshape(output_ids, [batch_size, max_seq_length, 1]) reshaped_mask = tf.reshape( tf.sequence_mask(output_length, max_seq_length, dtype=tf.int32), [batch_size, max_seq_length, 1]) concat_ids = tf.concat( [reshaped_ids, tf.zeros_like(reshaped_ids)], axis=-1) concat_mask = tf.concat([ reshaped_mask, tf.zeros_like(reshaped_mask, dtype=tf.int32) ], axis=-1) dilated_ids = tf.reshape(concat_ids, [batch_size, max_seq_length * 2]) dilated_mask = tf.reshape(concat_mask, [batch_size, max_seq_length * 2]) return dilated_ids, dilated_mask for _ in range(loop): dilated_ids, dilated_mask = _forward( dilated_ids, dilated_mask) self.preds['LM'] = dilated_ids
def __init__(self, vocab_size, is_training, input_ids, input_mask, segment_ids, sample_weight=None, reduced_size=64, topic_size=1024, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, bias=0, scope='vae', trainable=True, **kwargs):
    super().__init__()

    # freeze parameters
    config = Config(vocab_size,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    use_tilda_embedding = kwargs.get('use_tilda_embedding')
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):
            (self.embedding_output, self.embedding_table) = self.embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                batch_size=batch_size,
                max_seq_length=seq_length,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                tilda_embeddings=tilda_embeddings,
                trainable=trainable)

            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                hidden_size=config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                trainable=trainable)

        with tf.variable_scope('encoder'):
            # stacked transformer
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, seq_length)
            self.all_encoder_layers = self.transformer_model(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=util.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                trainable=trainable)

        # projection
        with tf.variable_scope('projection'):
            transformer_output = tf.layers.dense(
                self.all_encoder_layers[-1], reduced_size,
                activation=util.gelu,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=config.initializer_range),
                trainable=trainable)
            transformer_output = tf.reshape(transformer_output, [batch_size, -1])

            input_length = tf.reduce_sum(input_mask, axis=-1)
            input_length = tf.cast(input_length, tf.float32)
            input_length_1d = tf.reshape(input_length, [batch_size])
            input_length_2d = tf.reshape(input_length, [batch_size, 1])

            broadcast_mask = tf.sequence_mask(
                tf.multiply(input_length_1d, reduced_size),
                seq_length * reduced_size,
                dtype=tf.float32)
            broadcast_mask = tf.multiply(broadcast_mask, seq_length / input_length_2d)
            transformer_output *= broadcast_mask

            # latent space
            miu = tf.layers.dense(
                transformer_output, topic_size,
                activation='tanh',
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=config.initializer_range),
                name='miu',
                trainable=trainable)
            sigma = tf.layers.dense(
                transformer_output, topic_size,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=config.initializer_range),
                name='sigma',
                trainable=trainable)
            self.probs['miu'] = miu
            self.probs['sigma'] = sigma

        with tf.variable_scope('decoder'):
            with tf.variable_scope('projection'):

                # reparameterization
                if is_training:
                    noise = tf.random_normal([batch_size, topic_size])
                else:
                    noise = tf.random_uniform(
                        [batch_size, topic_size], minval=-bias, maxval=bias)
                decoder_input = miu + tf.exp(sigma) * noise

                # projection
                decoder_input = tf.layers.dense(
                    decoder_input, seq_length * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    trainable=trainable)
                intermediate_input = tf.reshape(
                    decoder_input, [-1, seq_length, reduced_size])
                intermediate_input = util.layer_norm(
                    intermediate_input, trainable=trainable)
                intermediate_input = util.dropout(
                    intermediate_input, config.hidden_dropout_prob)

            # MLP
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    intermediate_input, 4 * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=util.create_initializer(config.initializer_range),
                    trainable=trainable)

            with tf.variable_scope('output'):
                decoder_output = tf.layers.dense(
                    intermediate_output, config.hidden_size,
                    kernel_initializer=util.create_initializer(config.initializer_range),
                    trainable=trainable)
                decoder_output = util.layer_norm(decoder_output, trainable=trainable)
                decoder_output = util.dropout(decoder_output, config.hidden_dropout_prob)

            self.all_decoder_layers = [intermediate_output, decoder_output]
            self.all_decoder_layers = [decoder_output]

        # reconstruction
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    decoder_output,
                    units=config.hidden_size,
                    activation=util.get_activation(config.hidden_act),
                    kernel_initializer=util.create_initializer(config.initializer_range),
                    trainable=trainable)
                input_tensor = util.layer_norm(input_tensor, trainable=trainable)

            output_weights = self.embedding_table
            output_bias = tf.get_variable(
                'output_bias',
                shape=[config.vocab_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            flatten_input_tensor = tf.reshape(input_tensor, [-1, config.hidden_size])
            logits = tf.matmul(flatten_input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [batch_size, seq_length, config.vocab_size])

            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            lm_log_probs = tf.nn.log_softmax(logits, axis=-1)
            self.preds['preds'] = tf.argmax(probs, axis=-1)

            one_hot_labels = tf.one_hot(
                input_ids, depth=config.vocab_size, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(lm_log_probs * one_hot_labels, axis=[-1])
            if sample_weight is not None:
                per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

            self.total_loss = (
                tf.reduce_mean(per_example_loss) +
                tf.reduce_mean(tf.square(miu)) +
                tf.reduce_mean(tf.exp(sigma) - sigma - 1))
            self.losses['losses'] = per_example_loss
def _lm_forward(self, is_training, input_tensor, input_mask, label_ids, bert_config, batch_size, max_seq_length, prob, scope, name, sample_weight=None, hidden_dropout_prob=0.1, initializer_range=0.02): with tf.variable_scope(scope): with tf.variable_scope('verifier'): logits = tf.layers.dense( input_tensor, 2, kernel_initializer=util.create_initializer( bert_config.initializer_range), trainable=True) verifier_label_ids = tf.cast(tf.greater(label_ids, 0), tf.int32) # loss log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(verifier_label_ids, depth=2) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) input_mask = tf.cast(input_mask, tf.float32) per_token_loss *= input_mask / tf.reduce_sum( input_mask, keepdims=True, axis=-1) per_example_loss = tf.reduce_sum(per_token_loss, axis=-1) if sample_weight is not None: per_example_loss *= tf.expand_dims(sample_weight, axis=-1) if prob != 0: self.total_loss += tf.reduce_mean(per_example_loss) verifier_loss = per_example_loss verifier_preds = tf.argmax(logits, axis=-1) with tf.variable_scope('prediction'): with tf.variable_scope('intermediate'): logits = tf.layers.dense( input_tensor, bert_config.hidden_size * 4, kernel_initializer=util.create_initializer( bert_config.initializer_range), activation=util.gelu, trainable=True) with tf.variable_scope('output'): logits = tf.layers.dense( logits, bert_config.hidden_size, kernel_initializer=util.create_initializer( bert_config.initializer_range), trainable=True) flattened = tf.reshape( logits, [batch_size * max_seq_length, bert_config.hidden_size]) logits = tf.matmul(flattened, self.embedding_table, transpose_b=True) logits = tf.reshape( logits, [-1, max_seq_length, bert_config.vocab_size]) # loss log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) input_mask *= tf.cast(verifier_preds, tf.float32) per_token_loss *= input_mask / ( tf.reduce_sum(input_mask, keepdims=True, axis=-1) + 1e-6) per_example_loss = tf.reduce_sum(per_token_loss, axis=-1) if sample_weight is not None: per_example_loss *= tf.expand_dims(sample_weight, axis=-1) if prob != 0: self.total_loss += tf.reduce_mean(per_example_loss) self.losses[name + '_loss'] = verifier_loss self.preds[name + '_preds'] = \ tf.argmax(logits, axis=-1) * verifier_preds
def __init__(self, hparams, is_training, input_ids, sample_weight=None, scope='model', given=1, use_tilda_embedding=False, **kwargs): super().__init__() batch_size = util.get_shape_list(input_ids, expected_rank=2)[0] max_seq_length = hparams.n_predict # Tilda embeddings for SMART algorithm tilda_embeddings = None if use_tilda_embedding: with tf.variable_scope('', reuse=True): tilda_embeddings = tf.get_variable('tilda_embeddings') with tf.variable_scope(scope): def _forward(input_ids, past=None): batch, sequence = shape_list(input_ids) if tilda_embeddings is None: wte = tf.get_variable( 'word_embeddings', [hparams.n_vocab, hparams.n_embed], initializer=tf.random_normal_initializer(stddev=0.02)) else: wte = tilda_embeddings wpe = tf.get_variable( 'wpe', [hparams.n_ctx, hparams.n_embed], initializer=tf.random_normal_initializer(stddev=0.01)) past_length = 0 if past is None else tf.shape(past)[-2] h = (tf.gather(wte, input_ids) + tf.gather(wpe, positions_for(input_ids, past_length))) # stacked transformer layers presents = [] pasts = tf.unstack(past, axis=1) if past is not None else \ [None] * hparams.n_layer assert len(pasts) == hparams.n_layer for layer, past in enumerate(pasts): h, present = block(h, 'h%d' % layer, past=past, hparams=hparams) presents.append(present) present = tf.stack(presents, axis=1) h = norm(h, 'ln_f') # Language model loss. Do tokens <n predict token n? h_flat = tf.reshape(h, [batch * sequence, hparams.n_embed]) logits = tf.matmul(h_flat, wte, transpose_b=True) logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab]) return logits, present # convert to labels label_ids = tf.concat( [input_ids[:, 1:], tf.zeros([batch_size, 1], dtype=tf.int32)], axis=-1) # forward once if is_training: (logits, _) = _forward(input_ids) self.preds['LM'] = tf.argmax(logits, axis=-1) # forward loop else: input_ids = input_ids[:, 0:given] for cur_length in range(given, max_seq_length + 1): (logits, _) = _forward(input_ids) pred_ids = tf.argmax(logits[:, cur_length - 1:cur_length, :], axis=-1) pred_ids = tf.cast(pred_ids, tf.int32) input_ids = tf.concat([input_ids, pred_ids], axis=-1) self.preds['LM'] = input_ids # loss log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=hparams.n_vocab) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) label_mask = tf.cast(tf.not_equal(label_ids, 0), tf.float32) per_example_loss = \ tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \ tf.reduce_sum(label_mask, axis=-1) if sample_weight is not None: per_example_loss *= tf.expand_dims(sample_weight, axis=-1) self.total_loss = tf.reduce_mean(per_example_loss) self.losses['LM'] = per_example_loss
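The `positions_for` helper called above is not shown; a hedged sketch consistent with the reference GPT-2 implementation, where position ids start at `past_length` and are tiled across the batch (the helper actually used here may differ):

# Hedged sketch of `positions_for`; tensor shapes follow the call site above.
def positions_for(tokens, past_length):
    batch_size = tf.shape(tokens)[0]
    nsteps = tf.shape(tokens)[1]
    positions = past_length + tf.range(nsteps)                       # [nsteps]
    return tf.tile(tf.expand_dims(positions, 0), [batch_size, 1])    # [batch, nsteps]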
def __init__(self, bert_config, is_training, input_tensor, input_mask, sem_features, label_ids, max_seq_length, feature_size, label_size=2, sample_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs): super().__init__(**kwargs) input_shape = util.get_shape_list(input_tensor) batch_size = input_shape[0] hidden_size = input_shape[-1] with tf.variable_scope('sem'): feature_embeddings = tf.get_variable( name='feature_embeddings', shape=[feature_size + 3, hidden_size], # for [PAD], [CLS], [SEP] initializer=util.create_initializer(initializer_range), trainable=trainable) sem_output = tf.gather(feature_embeddings, sem_features) # [B, N, H] attention_heads = [] with tf.variable_scope('self'): attention_mask = BERTEncoder.create_attention_mask_from_input_mask( input_mask, batch_size, max_seq_length) (attention_head, _) = BERTEncoder.attention_layer( from_tensor=sem_output, to_tensor=sem_output, attention_mask=attention_mask, num_attention_heads=bert_config.num_attention_heads, size_per_head=(hidden_size // bert_config.num_attention_heads), attention_probs_dropout_prob=hidden_dropout_prob if is_training else 0.0, initializer_range=initializer_range, do_return_2d_tensor=False, batch_size=batch_size, from_max_seq_length=max_seq_length, to_max_seq_length=max_seq_length, trainable=trainable) attention_heads.append(attention_head) if len(attention_heads) == 1: attention_output = attention_heads[0] else: attention_output = tf.concat(attention_heads, axis=-1) attention_output = attention_output[:, 0, :] # [B, H] input_tensor = util.layer_norm(attention_output + input_tensor, trainable=trainable) with tf.variable_scope(scope): output_weights = tf.get_variable( 'output_weights', shape=[label_size, hidden_size], initializer=util.create_initializer(initializer_range), trainable=trainable) output_bias = tf.get_variable('output_bias', shape=[label_size], initializer=tf.zeros_initializer(), trainable=trainable) output_layer = util.dropout( input_tensor, hidden_dropout_prob if is_training else 0.0) logits = tf.matmul(output_layer, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) self.preds['preds'] = tf.argmax(logits, axis=-1) self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs') log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) if sample_weight is not None: per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss thresh = kwargs.get('tsa_thresh') if thresh is not None: assert isinstance( thresh, float), ('`tsa_thresh` must be a float between 0 and 1.') uncertainty = tf.reduce_sum(self.probs['probs'] * tf.log(self.probs['probs']), axis=-1) uncertainty /= tf.log(1 / label_size) per_example_loss = tf.cast( tf.greater(uncertainty, thresh), dtype=tf.float32) * \ per_example_loss self.losses['losses'] = per_example_loss self.total_loss = tf.reduce_mean(per_example_loss)