Esempio n. 1
0
        def compute_loss(log_probs, positions, depth=seq_length):
            one_hot_positions = tf.one_hot(
                positions, depth=depth, dtype=tf.float32)

            loss = - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
            loss = tf.reduce_mean(loss)
            return loss
Esempio n. 2
0
def kl_div_loss(student_logits, teacher_logits, temperature=1):
    """The Kullback–Leibler divergence from Q to P:
     D_kl (P||Q) = sum(P * log(P / Q))
    from student to teacher: sum(teacher * log(teacher / student))
    """
    teacher_softmax = tf.nn.softmax(teacher_logits / temperature)
    teacher_log_softmax = tf.nn.log_softmax(teacher_logits / temperature)
    student_log_softmax = tf.nn.log_softmax(student_logits / temperature)
    kl_dist = teacher_softmax * (teacher_log_softmax - student_log_softmax)
    kl_loss = tf.reduce_mean(tf.reduce_sum(kl_dist, -1))
    return kl_loss
Esempio n. 3
0
def get_race_loss(FLAGS, features, is_training):
    """Loss for downstream multi-choice QA tasks such as RACE."""

    bsz_per_core = tf.shape(features["input_ids"])[0]

    def _transform_features(feature):
        out = tf.reshape(feature, [bsz_per_core, 4, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * 4])
        return out

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    label = tf.reshape(features["label_ids"], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    summary = xlnet_model.get_pooled_out(FLAGS.summary_type,
                                         FLAGS.use_summ_proj)

    with tf.variable_scope("logits"):
        logits = tf.layers.dense(summary, 1,
                                 kernel_initializer=xlnet_model.get_initializer())
        logits = tf.reshape(logits, [bsz_per_core, 4])

        one_hot_target = tf.one_hot(label, 4)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits
Esempio n. 4
0
    def model_fn(features, labels, mode, params):
        # ### Training or Evaluation
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        return_dict = function_builder.get_classification_outputs(
            FLAGS, features, is_training)
        # per_example_loss = return_dict["per_example_loss"]
        cls_logits = return_dict["cls_logits"]
        # ### Check model parameters
        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        logger.info('#params: {}'.format(num_params))

        # ### load pretrained models
        scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # label_ids = tf.reshape(features["cls"], [-1])
            predictions = {
                "feature_id": features["feature_id"],
                "cls_logits": cls_logits,
                # "cls": label_ids,
            }

            if FLAGS.use_tpu:
                output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    predictions=predictions,
                    scaffold_fn=scaffold_fn)
            else:
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode, predictions=predictions)
            return output_spec

        def compute_loss(log_probs, positions, depth):
            one_hot_positions = tf.one_hot(positions,
                                           depth=depth,
                                           dtype=tf.float32)

            loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
            loss = tf.reduce_mean(loss)
            return loss

        cls_log_probs = return_dict["cls_log_probs"]
        num_choices = FLAGS.num_choices
        if num_choices:
            num_classes = num_choices
        else:
            num_classes = FLAGS.num_classes
        total_loss = compute_loss(cls_log_probs,
                                  features["cls"],
                                  depth=num_classes)

        # ### Configuring the optimizer
        train_op, learning_rate, _ = model_utils.get_train_op(
            FLAGS, total_loss)

        monitor_dict = {'loss/cls': total_loss, "lr": learning_rate}

        # ### Constucting training TPUEstimatorSpec with new cache.
        if FLAGS.use_tpu:
            # ### Creating host calls
            if not FLAGS.is_regression:
                label_ids = tf.reshape(features['cls'], [-1])
                predictions = tf.argmax(cls_logits,
                                        axis=-1,
                                        output_type=label_ids.dtype)
                is_correct = tf.equal(predictions, label_ids)
                accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

                monitor_dict["accuracy"] = accuracy

                host_call = function_builder.construct_scalar_host_call(
                    monitor_dict=monitor_dict,
                    model_dir=FLAGS.model_dir,
                    prefix="train/",
                    reduce_fn=tf.reduce_mean)
            else:
                host_call = None

            train_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                host_call=host_call,
                scaffold_fn=scaffold_fn)
        else:
            train_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=total_loss,
                                                    train_op=train_op)

        return train_spec
Esempio n. 5
0
def summarize_sequence(summary_type,
                       hidden,
                       d_model,
                       n_head,
                       d_head,
                       dropout,
                       dropatt,
                       input_mask,
                       is_training,
                       initializer,
                       scope=None,
                       reuse=None,
                       use_proj=True):
    """
        Different classification tasks may not may not share the same parameters
        to summarize the sequence features.

        If shared, one can keep the `scope` to the default value `None`.
        Otherwise, one should specify a different `scope` for each task.
    """

    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'last':
            summary = hidden[-1]
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'attn':
            bsz = tf.shape(hidden)[1]

            summary_bias = tf.get_variable('summary_bias', [d_model],
                                           dtype=hidden.dtype,
                                           initializer=initializer)
            summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])

            if input_mask is not None:
                input_mask = input_mask[None, :, :, None]

            summary = multihead_attn(summary_bias,
                                     hidden,
                                     hidden,
                                     input_mask,
                                     d_model,
                                     n_head,
                                     d_head,
                                     dropout,
                                     dropatt,
                                     is_training,
                                     initializer,
                                     residual=False)
            summary = summary[0]
        else:
            raise ValueError(
                'Unsupported summary type {}'.format(summary_type))

        # use another projection as in BERT
        if use_proj:
            summary = tf.layers.dense(summary,
                                      d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name='summary')

        # dropout
        summary = tf.layers.dropout(summary,
                                    dropout,
                                    training=is_training,
                                    name='dropout')

    return summary