Example #1
0
def classification_loss(hidden,
                        labels,
                        n_class,
                        initializer,
                        scope,
                        reuse=None,
                        return_logits=False):
    """
        Different classification tasks should use different scope names to ensure
        different dense layers (parameters) are used to produce the logits.

        An exception will be in transfer learning, where one hopes to transfer
        the classification weights.
    """

    with tf.variable_scope(scope, reuse=reuse):
        logits = tf.layers.dense(hidden,
                                 n_class,
                                 kernel_initializer=initializer,
                                 name='logit')

        one_hot_target = tf.one_hot(labels, n_class, dtype=hidden.dtype)
        loss = -tf.reduce_sum(tf.nn.log_softmax(logits) * one_hot_target, -1)

        if return_logits:
            return loss, logits

        return loss
Example #2
0
        def compute_loss(log_probs, positions, depth=seq_length):
            one_hot_positions = tf.one_hot(
                positions, depth=depth, dtype=tf.float32)

            loss = - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
            loss = tf.reduce_mean(loss)
            return loss
Example #3
0
def kl_div_loss(student_logits, teacher_logits, temperature=1):
    """The Kullback–Leibler divergence from Q to P:
     D_kl (P||Q) = sum(P * log(P / Q))
    from student to teacher: sum(teacher * log(teacher / student))
    """
    teacher_softmax = tf.nn.softmax(teacher_logits / temperature)
    teacher_log_softmax = tf.nn.log_softmax(teacher_logits / temperature)
    student_log_softmax = tf.nn.log_softmax(student_logits / temperature)
    kl_dist = teacher_softmax * (teacher_log_softmax - student_log_softmax)
    kl_loss = tf.reduce_mean(tf.reduce_sum(kl_dist, -1))
    return kl_loss
Example #4
0
def get_race_loss(FLAGS, features, is_training):
    """Loss for downstream multi-choice QA tasks such as RACE."""

    bsz_per_core = tf.shape(features["input_ids"])[0]

    def _transform_features(feature):
        out = tf.reshape(feature, [bsz_per_core, 4, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * 4])
        return out

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    label = tf.reshape(features["label_ids"], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    summary = xlnet_model.get_pooled_out(FLAGS.summary_type,
                                         FLAGS.use_summ_proj)

    with tf.variable_scope("logits"):
        logits = tf.layers.dense(summary, 1,
                                 kernel_initializer=xlnet_model.get_initializer())
        logits = tf.reshape(logits, [bsz_per_core, 4])

        one_hot_target = tf.one_hot(label, 4)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits
Example #5
0
def get_decomposed_qa_outputs(FLAGS, features, is_training):
    question_ids = features["question_ids"]
    context_ids = features["context_ids"]
    seq_len = FLAGS.max_seq_length
    q_seq_len = FLAGS.max_first_length + 2
    ctx_seq_len = seq_len - q_seq_len
    q_mask_int = tf.cast(tf.cast(question_ids, tf.bool), tf.int32)
    cls_index = tf.reshape(
        tf.reduce_sum(q_mask_int, axis=1) + ctx_seq_len, [-1])
    # 0 for mask out
    # q_zeros = tf.zeros_like(question_ids)
    # p_ids = tf.concat([context_ids, q_zeros], axis=1)
    # p_mask = tf.cast(tf.cast(p_ids, tf.bool), tf.float32)
    question_ids = tf.transpose(question_ids, [1, 0])
    context_ids = tf.transpose(context_ids, [1, 0])

    q_attn_mask = get_attention_mask(question_ids, q_seq_len)
    c_attn_mask = get_attention_mask(context_ids, ctx_seq_len)
    qc_attn_mask = get_attention_mask(
        tf.concat([context_ids, question_ids], axis=0), seq_len)

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    initializer = xlnet._get_initializer(run_config)
    tfm_args = dict(
        n_token=xlnet_config.n_token,
        initializer=initializer,
        attn_type="bi",
        n_layer=xlnet_config.n_layer,
        d_model=xlnet_config.d_model,
        n_head=xlnet_config.n_head,
        d_head=xlnet_config.d_head,
        d_inner=xlnet_config.d_inner,
        ff_activation=xlnet_config.ff_activation,
        untie_r=xlnet_config.untie_r,
        is_training=run_config.is_training,
        use_bfloat16=run_config.use_bfloat16,
        use_tpu=run_config.use_tpu,
        dropout=run_config.dropout,
        dropatt=run_config.dropatt,

        # mem_len=run_config.mem_len,
        # reuse_len=run_config.reuse_len,
        # bi_data=run_config.bi_data,
        clamp_len=run_config.clamp_len,
        # same_length=run_config.same_length,
        ctx_ids=context_ids,
        q_ids=question_ids,
        q_seq_len=q_seq_len,
        ctx_seq_len=ctx_seq_len,
        sep_layer=FLAGS.sep_layer,
        q_attn_mask=q_attn_mask,
        c_attn_mask=c_attn_mask,
        qc_attn_mask=qc_attn_mask,
    )

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        upper_outputs = transformer_xl_decomposed(**tfm_args)

    output = upper_outputs[-1]
    return_dict = {'upper_outputs': upper_outputs}
    with tf.variable_scope("logits"):
        # logits: seq, batch_size, 2
        logits = tf.layers.dense(output, 2, kernel_initializer=initializer)

        # logits: 2, batch_size, seq
        logits = tf.transpose(logits, [2, 1, 0])

        # start_logits: batch_size, seq
        # end_logits: batch_size, seq
        start_logits, end_logits = tf.unstack(logits, axis=0)

        # start_logits_masked = start_logits * p_mask - 1e30 * (1 - p_mask)
        # start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)
        start_log_probs = tf.nn.log_softmax(start_logits, -1)

        # end_logits_masked = end_logits * p_mask - 1e30 * (1 - p_mask)
        # end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        end_log_probs = tf.nn.log_softmax(end_logits, -1)

    return_dict["start_logits"] = start_logits
    return_dict["end_logits"] = end_logits
    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs

    # an additional layer to predict answer class, 0: span, 1:yes, 2:no
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)
        ans_feature = tf.layers.dense(cls_feature,
                                      xlnet_config.d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name='pooler')

        ans_feature = tf.layers.dropout(ans_feature,
                                        FLAGS.dropout,
                                        training=is_training)
        # hotpot has 3 classes,
        # squad 2.0 has 2 classes
        cls_logits = tf.layers.dense(ans_feature,
                                     FLAGS.num_classes,
                                     kernel_initializer=initializer,
                                     name="cls")
        cls_log_probs = tf.nn.log_softmax(cls_logits, -1)

    return_dict["cls_logits"] = cls_logits
    if is_training:
        return_dict["cls_log_probs"] = cls_log_probs

    return return_dict
Example #6
0
def get_qa_outputs(FLAGS, features, is_training):
    """Loss for downstream span-extraction QA tasks such as SQuAD."""

    input_ids = features["input_ids"]
    seg_id = features["segment_ids"]
    input_mask_int = tf.cast(tf.cast(input_ids, tf.bool), tf.int32)
    cls_index = tf.reshape(tf.reduce_sum(input_mask_int, axis=1), [-1])
    p_mask = tf.cast(tf.cast(seg_id, tf.bool), tf.float32)
    input_ids = tf.transpose(input_ids, [1, 0])
    input_mask = 1 - tf.cast(input_mask_int, tf.float32)
    input_mask = tf.transpose(input_mask, [1, 0])
    seg_id = tf.transpose(seg_id, [1, 0])
    seq_len = tf.shape(input_ids)[0]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=input_ids,
        seg_ids=seg_id,
        input_mask=input_mask)
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    return_dict = {}
    with tf.variable_scope("logits"):
        # logits: seq, batch_size, 2
        logits = tf.layers.dense(output, 2, kernel_initializer=initializer)

        # logits: 2, batch_size, seq
        logits = tf.transpose(logits, [2, 1, 0])

        # start_logits: batch_size, seq
        # end_logits: batch_size, seq
        start_logits, end_logits = tf.unstack(logits, axis=0)

        start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

        end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)

    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs
    else:
        return_dict["start_logits"] = start_logits
        return_dict["end_logits"] = end_logits

    # an additional layer to predict answer class, 0: span, 1:yes, 2:no
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)
        ans_feature = tf.layers.dense(cls_feature, xlnet_config.d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name='pooler')

        ans_feature = tf.layers.dropout(ans_feature, FLAGS.dropout,
                                        training=is_training)
        # hotpot has 3 classes,
        # squad 2.0 has 2 classes
        cls_logits = tf.layers.dense(ans_feature, FLAGS.num_classes,
                                     kernel_initializer=initializer,
                                     name="cls")
        cls_log_probs = tf.nn.log_softmax(cls_logits, -1)
    if is_training:
        return_dict["cls_log_probs"] = cls_log_probs
    return_dict["cls_logits"] = cls_logits

    return return_dict