def get_regression_loss(
        FLAGS, features, is_training):
    """Loss for downstream regression tasks."""

    bsz_per_core = tf.shape(features["input_ids"])[0]

    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    label = tf.reshape(features["label_ids"], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)

    summary = xlnet_model.get_pooled_out(
        FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        per_example_loss, logits = modeling.regression_loss(
            hidden=summary,
            labels=label,
            initializer=xlnet_model.get_initializer(),
            scope="regression_{}".format(FLAGS.task_name.lower()),
            return_logits=True)

        total_loss = tf.reduce_mean(per_example_loss)

        return total_loss, per_example_loss, logits
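A minimal usage sketch for the function above, assuming `import tensorflow as tf` (1.x), the definitions above in scope, and a FLAGS object that provides model_config_path, summary_type, use_summ_proj, and task_name; the sequence length of 128 is purely illustrative.

# Hypothetical wiring -- shapes and FLAGS fields are assumptions, not part of the example above.
features = {
    "input_ids": tf.placeholder(tf.int32, [None, 128]),
    "segment_ids": tf.placeholder(tf.int32, [None, 128]),
    "input_mask": tf.placeholder(tf.float32, [None, 128]),
    "label_ids": tf.placeholder(tf.float32, [None]),
}
total_loss, per_example_loss, logits = get_regression_loss(
    FLAGS, features, is_training=True)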
Example #2
    def _create_model(self, input_ids, input_masks, segment_ids, label_ids,
                      label_list, mode):
        """Creates XLNet-NER model"""
        model = xlnet.XLNetModel(xlnet_config=self.model_config,
                                 run_config=xlnet.create_run_config(
                                     mode == tf.estimator.ModeKeys.TRAIN, True,
                                     FLAGS),
                                 input_ids=tf.transpose(input_ids, perm=[1,
                                                                         0]),
                                 input_mask=tf.transpose(input_masks,
                                                         perm=[1, 0]),
                                 seg_ids=tf.transpose(segment_ids, perm=[1,
                                                                         0]))

        initializer = model.get_initializer()

        with tf.variable_scope("ner", reuse=tf.AUTO_REUSE):
            result = tf.transpose(model.get_sequence_output(), perm=[1, 0, 2])
            result_mask = tf.cast(tf.expand_dims(1 - input_masks, axis=-1),
                                  dtype=tf.float32)

            dense_layer = tf.keras.layers.Dense(
                units=len(label_list),
                activation=None,
                use_bias=True,
                kernel_initializer=initializer,
                bias_initializer=tf.zeros_initializer,
                kernel_regularizer=None,
                bias_regularizer=None,
                trainable=True)

            dropout_layer = tf.keras.layers.Dropout(
                rate=0.1, seed=np.random.randint(10000))

            result = dense_layer(result)
            if mode == tf.estimator.ModeKeys.TRAIN:
                result = dropout_layer(result)

            masked_predict = result * result_mask + MIN_FLOAT * (1 -
                                                                 result_mask)
            predict_ids = tf.cast(tf.argmax(tf.nn.softmax(masked_predict,
                                                          axis=-1),
                                            axis=-1),
                                  dtype=tf.int32)

        loss = tf.constant(0.0, dtype=tf.float32)
        if mode in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
                    ] and label_ids is not None:
            with tf.variable_scope("loss", reuse=tf.AUTO_REUSE):
                label = tf.cast(label_ids, dtype=tf.float32)
                label_mask = tf.cast(1 - input_masks, dtype=tf.float32)
                masked_label = tf.cast(label * label_mask, dtype=tf.int32)
                cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=masked_label, logits=masked_predict)
                loss = tf.reduce_sum(
                    cross_entropy * label_mask) / tf.reduce_sum(
                        tf.reduce_max(label_mask, axis=-1))

        return loss, predict_ids
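In the snippet above, `input_masks` follows the XLNet convention of 1 for padding and 0 for real tokens, which is why `1 - input_masks` selects the real positions. A small NumPy sketch of the `masked_predict` trick; the value of MIN_FLOAT is an assumption, since the surrounding module defines its own constant.

import numpy as np

MIN_FLOAT = -1e30                                       # assumed value
input_masks = np.array([[0, 0, 1]])                     # last position is padding
result_mask = (1 - input_masks)[..., None].astype(np.float32)
logits = np.random.randn(1, 3, 5).astype(np.float32)   # [batch, seq_len, num_labels]
masked = logits * result_mask + MIN_FLOAT * (1 - result_mask)
# every label logit at the padded position collapses to MIN_FLOAT,
# so that position carries no usable signal downstream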
Example #3
 def load_model(self, model: str, model_path: str):
     model_path = os.path.join(model_path, next(os.walk(model_path))[1][0])
     self.xlnet_config = xlnet.XLNetConfig(
         json_path=os.path.join(model_path, Embeddings.mode_config_path))
     self.run_config = xlnet.create_run_config(is_training=True,
                                               is_finetune=True,
                                               FLAGS=Flags)
     self.load_tokenizer(model_path)
     self.model = model
     print("Model loaded Successfully !")
Example #4
 def __init__(self, flags, input_ids, seg_ids, input_mask):
     xlnet_config = xln.XLNetConfig(json_path=flags.model_config_path)
     run_config = xln.create_run_config(is_training=True,
                                        is_finetune=True,
                                        FLAGS=flags)
     self.model = xln.XLNetModel(xlnet_config=xlnet_config,
                                 run_config=run_config,
                                 input_ids=input_ids,
                                 seg_ids=seg_ids,
                                 input_mask=input_mask)
def create_model(cf,
                 input_ids,
                 input_mask,
                 segment_ids,
                 labels,
                 is_training=True):
    '''
    Build the classification model.
    :param cf: config object (FLAGS-like) with model and task settings
    :param input_ids: int Tensor of shape [bsz, seq_len]
    :param input_mask: float Tensor of shape [bsz, seq_len]
    :param segment_ids: int Tensor of shape [bsz, seq_len]
    :param labels: int Tensor of shape [bsz]
    :param is_training: whether the graph is built for training
    :return: (total_loss, per_example_loss, logits)
    '''
    bsz_per_core = tf.shape(input_ids)[0]
    inp = tf.transpose(input_ids, [1, 0])
    seg_id = tf.transpose(segment_ids, [1, 0])
    inp_mask = tf.transpose(input_mask, [1, 0])
    label = tf.reshape(labels, [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=cf.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, cf)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    summary = xlnet_model.get_pooled_out(cf.summary_type, cf.use_summ_proj)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):

        if cf.cls_scope is not None and cf.cls_scope:
            cls_scope = "classification_{}".format(cf.cls_scope)
        else:
            cls_scope = "classification_{}".format(cf.task_name.lower())

        per_example_loss, logits = modeling.classification_loss(
            hidden=summary,
            labels=label,
            n_class=cf.num_labels,
            initializer=xlnet_model.get_initializer(),
            scope=cls_scope,
            return_logits=True)

        total_loss = tf.reduce_mean(per_example_loss)

        return total_loss, per_example_loss, logits
Example #6
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tpu_config = model_utils.configure_tpu(FLAGS)
    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)

    model_builder = XLNetModelBuilder(
        default_model_config=model_config,
        default_run_config=run_config,
        default_init_checkpoint=FLAGS.init_checkpoint,
        use_tpu=FLAGS.use_tpu)

    model_fn = model_builder.get_model_fn(model_config, run_config,
                                          FLAGS.init_checkpoint,
                                          FLAGS.model_type)

    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(use_tpu=FLAGS.use_tpu,
                                            model_fn=model_fn,
                                            config=tpu_config,
                                            export_to_tpu=FLAGS.use_tpu,
                                            train_batch_size=1)

    tokenizer = XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file,
                               lower_case=FLAGS.lower_case)

    example_converter = XLNetExampleConverter(
        label_list=[],
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer)

    features = example_converter.convert_examples_to_features(
        [PaddingInputExample()])

    input_fn = XLNetInputBuilder.get_input_builder(features,
                                                   FLAGS.max_seq_length, True,
                                                   False)
    estimator.train(input_fn, max_steps=1)

    tf.gfile.MakeDirs(FLAGS.export_dir)
    serving_input_fn = XLNetInputBuilder.get_serving_input_fn(
        FLAGS.max_seq_length)
    estimator.export_savedmodel(FLAGS.export_dir,
                                serving_input_fn,
                                as_text=False)
    def __init__(self, model_config_path, is_training, FLAGS, input_ids,
                 segment_ids, input_mask, label, n_class):
        '''
        Build an XLNet classification model with loss, train op, and accuracy.
        :param model_config_path: path to the XLNet config JSON
        :param is_training: whether the graph is built for training
        :param FLAGS: flags object with run-config and task settings
        :param input_ids: int Tensor of shape [bsz, seq_len]
        :param segment_ids: int Tensor of shape [bsz, seq_len]
        :param input_mask: float Tensor of shape [bsz, seq_len]
        :param label: int Tensor of shape [bsz]
        :param n_class: number of target classes
        '''
        self.xlnet_config = xlnet.XLNetConfig(json_path=model_config_path)
        self.run_config = xlnet.create_run_config(is_training, True, FLAGS)
        self.input_ids = tf.transpose(input_ids, [1, 0])
        self.segment_ids = tf.transpose(segment_ids, [1, 0])
        self.input_mask = tf.transpose(input_mask, [1, 0])

        self.model = xlnet.XLNetModel(xlnet_config=self.xlnet_config,
                                      run_config=self.run_config,
                                      input_ids=self.input_ids,
                                      seg_ids=self.segment_ids,
                                      input_mask=self.input_mask)

        cls_scope = FLAGS.cls_scope
        summary = self.model.get_pooled_out(FLAGS.summary_type,
                                            FLAGS.use_summ_proj)
        self.per_example_loss, self.logits = modeling.classification_loss(
            hidden=summary,
            labels=label,
            n_class=n_class,
            initializer=self.model.get_initializer(),
            scope=cls_scope,
            return_logits=True)

        self.total_loss = tf.reduce_mean(self.per_example_loss)

        with tf.name_scope("train_op"):
            self.train_op, _, _ = model_utils.get_train_op(
                FLAGS, self.total_loss)

        with tf.name_scope("acc"):
            one_hot_target = tf.one_hot(label, n_class)
            self.acc = self.accuracy(self.logits, one_hot_target)
def get_classification_loss(options, features, n_class, is_training):
    """Loss for downstream classification tasks."""

    bsz_per_core = tf.shape(features["input_ids"])[0]

    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    label = tf.reshape(features["label_ids"], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=options['model_config_file'])
    run_config = xlnet.create_run_config(is_training, True, options)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)

    summary = xlnet_model.get_pooled_out(options['summary_type'],
                                         options['use_summ_proj'])

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):

        if options['cls_scope'] is not None and options['cls_scope']:
            cls_scope = "classification_{}".format(options['cls_scope'])
        else:
            cls_scope = "classification_{}".format(
                options['task_name'].lower())

        per_example_loss, logits = modeling.classification_loss(
            hidden=summary,
            labels=label,
            n_class=n_class,
            initializer=xlnet_model.get_initializer(),
            scope=cls_scope,
            return_logits=True)

        total_loss = tf.reduce_mean(per_example_loss)

        return total_loss, per_example_loss, logits
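The `options` mapping consulted above must expose at least the keys below; only the key names come from the function, the values here are illustrative. It is also handed to xlnet.create_run_config, so it additionally has to carry whatever run-config fields that helper reads.

options = {
    "model_config_file": "xlnet_config.json",  # path to the pretrained model config
    "summary_type": "last",
    "use_summ_proj": True,
    "cls_scope": None,                         # falls back to task_name below
    "task_name": "imdb",                       # illustrative task name
}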
def get_race_loss(FLAGS, features, is_training):
    """Loss for downstream multi-choice QA tasks such as RACE."""

    bsz_per_core = tf.shape(features["input_ids"])[0]

    def _transform_features(feature):
        out = tf.reshape(feature, [bsz_per_core, 4, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * 4])
        return out

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    label = tf.reshape(features["label_ids"], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    summary = xlnet_model.get_pooled_out(
        FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("logits"):
        logits = tf.layers.dense(
            summary, 1, kernel_initializer=xlnet_model.get_initializer())
        logits = tf.reshape(logits, [bsz_per_core, 4])

        one_hot_target = tf.one_hot(label, 4)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits
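Each RACE example packs its four answer choices back to back, so features["input_ids"] arrives as [bsz, 4 * seq_len]; _transform_features above rearranges it into the [seq_len, bsz * 4] layout XLNet expects. A NumPy sketch of the same reshuffle with illustrative sizes:

import numpy as np

bsz, seq_len = 2, 3
feature = np.arange(bsz * 4 * seq_len).reshape(bsz, 4 * seq_len)
out = feature.reshape(bsz, 4, seq_len)   # split out the four choices
out = out.transpose(2, 0, 1)             # [seq_len, bsz, 4]
out = out.reshape(seq_len, bsz * 4)      # [seq_len, bsz * 4]: choices become extra batch entries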
Example #10
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    np.random.seed(FLAGS.random_seed)

    processor = NerProcessor(data_dir=FLAGS.data_dir,
                             input_file=FLAGS.input_file,
                             task_name=FLAGS.task_name.lower())

    label_list = processor.get_labels()
    tf.logging.info(label_list)

    tpu_config = model_utils.configure_tpu(FLAGS)
    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)

    model_builder = XLNetModelBuilder(
        default_model_config=model_config,
        default_run_config=run_config,
        default_init_checkpoint=FLAGS.init_checkpoint,
        use_tpu=FLAGS.use_tpu)

    model_fn = model_builder.get_model_fn(model_config, run_config,
                                          FLAGS.init_checkpoint, label_list)

    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=tpu_config,
        export_to_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    tokenizer = XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file,
                               lower_case=FLAGS.lower_case)

    example_converter = XLNetExampleConverter(
        label_list=label_list,
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer)

    if FLAGS.do_train:
        train_examples = processor.get_chem_examples()

        tf.logging.info("***** Run training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", FLAGS.train_steps)

        train_features = example_converter.convert_examples_to_features(
            train_examples)
        train_input_fn = XLNetInputBuilder.get_input_builder(
            train_features, FLAGS.max_seq_length, True, True)

        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples()

        tf.logging.info("***** Run evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_features = example_converter.convert_examples_to_features(
            eval_examples)
        eval_input_fn = XLNetInputBuilder.get_input_builder(
            eval_features, FLAGS.max_seq_length, False, False)

        result = estimator.evaluate(input_fn=eval_input_fn)

        precision = result["precision"]
        recall = result["recall"]
        f1_score = 2.0 * precision * recall / (precision + recall)

        tf.logging.info("***** Evaluation result *****")
        tf.logging.info("  Precision (token-level) = %s", str(precision))
        tf.logging.info("  Recall (token-level) = %s", str(recall))
        tf.logging.info("  F1 score (token-level) = %s", str(f1_score))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples()
        pmids = [e.guid for e in predict_examples]
        tokens = [e.guid for e in predict_examples]

        tf.logging.info("***** Run prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_features = example_converter.convert_examples_to_features(
            predict_examples)
        predict_input_fn = XLNetInputBuilder.get_input_builder(
            predict_features, FLAGS.max_seq_length, False, False)

        result = estimator.predict(input_fn=predict_input_fn)

        predict_recorder = XLNetPredictRecorder(
            output_dir=FLAGS.output_dir,
            label_list=label_list,
            guids=pmids,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer,
            predict_tag=FLAGS.predict_tag)

        predicts = [{
            "input_ids": feature.input_ids,
            "input_masks": feature.input_masks,
            "label_ids": feature.label_ids,
            "predict_ids": predict["predict"].tolist()
        } for feature, predict in zip(predict_features, result)]

        predict_recorder.record(predicts)

    if FLAGS.do_export:
        tf.logging.info("***** Running exporting *****")
        tf.gfile.MakeDirs(FLAGS.export_dir)
        serving_input_fn = XLNetInputBuilder.get_serving_input_fn(
            FLAGS.max_seq_length)
        estimator.export_savedmodel(FLAGS.export_dir,
                                    serving_input_fn,
                                    as_text=False)
    def _create_model(self, input_ids, input_masks, segment_ids,
                      sent_label_ids, sent_label_list, mode):
        """Creates XLNet-Classifier model"""
        model = xlnet.XLNetModel(xlnet_config=self.model_config,
                                 run_config=xlnet.create_run_config(
                                     mode == tf.estimator.ModeKeys.TRAIN, True,
                                     FLAGS),
                                 input_ids=tf.transpose(input_ids, perm=[1,
                                                                         0]),
                                 input_mask=tf.transpose(input_masks,
                                                         perm=[1, 0]),
                                 seg_ids=tf.transpose(segment_ids, perm=[1,
                                                                         0]))

        initializer = model.get_initializer()

        with tf.variable_scope("sent", reuse=tf.AUTO_REUSE):
            sent_result = model.get_pooled_out("last")
            sent_result_mask = tf.cast(tf.reduce_max(1 - input_masks,
                                                     axis=-1,
                                                     keepdims=True),
                                       dtype=tf.float32)

            sent_dense_layer = tf.keras.layers.Dense(
                units=len(sent_label_list),
                activation=None,
                use_bias=True,
                kernel_initializer=initializer,
                bias_initializer=tf.zeros_initializer,
                kernel_regularizer=None,
                bias_regularizer=None,
                trainable=True)

            sent_dropout_layer = tf.keras.layers.Dropout(
                rate=0.1, seed=np.random.randint(10000))

            sent_result = sent_dense_layer(sent_result)
            if mode == tf.estimator.ModeKeys.TRAIN:
                sent_result = sent_dropout_layer(sent_result)

            masked_sent_predict = sent_result * sent_result_mask + MIN_FLOAT * (
                1 - sent_result_mask)
            sent_predict_probs = tf.nn.softmax(masked_sent_predict, axis=-1)
            sent_predict_ids = tf.cast(tf.argmax(sent_predict_probs, axis=-1),
                                       dtype=tf.int32)
            sent_predict_scores = tf.reduce_max(sent_predict_probs, axis=-1)

        loss = tf.constant(0.0, dtype=tf.float32)
        if mode not in [
                tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
        ]:
            return loss, sent_predict_ids, sent_predict_scores, sent_predict_probs

        if sent_label_ids is not None:
            with tf.variable_scope("sent_loss", reuse=tf.AUTO_REUSE):
                sent_label = tf.cast(sent_label_ids, dtype=tf.float32)
                sent_label_mask = tf.cast(tf.reduce_max(1 - input_masks,
                                                        axis=-1),
                                          dtype=tf.float32)
                masked_sent_label = tf.cast(sent_label * sent_label_mask,
                                            dtype=tf.int32)
                cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=masked_sent_label, logits=masked_sent_predict)
                sent_loss = tf.reduce_sum(
                    cross_entropy * sent_label_mask) / tf.reduce_sum(
                        tf.reduce_max(sent_label_mask, axis=-1))
                loss = loss + sent_loss

        return loss, sent_predict_ids, sent_predict_scores, sent_predict_probs
Example #12
from xlnet import xlnet
from absl.flags import FLAGS

# some code omitted here...
# initialize FLAGS
# initialize instances of tf.Tensor, including input_ids, seg_ids, and input_mask

# XLNetConfig contains hyperparameters that are specific to a model checkpoint.
xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)

# RunConfig contains hyperparameters that could be different between pretraining and finetuning.
run_config = xlnet.create_run_config(is_training=True, is_finetune=True, FLAGS=FLAGS)

# Construct an XLNet model
xlnet_model = xlnet.XLNetModel(
    xlnet_config=xlnet_config,
    run_config=run_config,
    input_ids=input_ids,
    seg_ids=seg_ids,
    input_mask=input_mask)

# Get a summary of the sequence using the last hidden state
summary = xlnet_model.get_pooled_out(summary_type="last")

# Get a sequence output
seq_out = xlnet_model.get_sequence_output()

# build your applications based on `summary` or `seq_out`
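As a hedged continuation of the snippet above, one might build a small classification head on `summary`; this assumes `import tensorflow as tf` (1.x) in the omitted code, and the class count and scope name are placeholders, not part of the library.

# Hypothetical downstream head -- n_class and the scope name are assumptions.
n_class = 2
with tf.variable_scope("my_classifier"):
    logits = tf.layers.dense(
        summary, n_class, kernel_initializer=xlnet_model.get_initializer())
    probs = tf.nn.softmax(logits, axis=-1)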
def two_stream_loss(FLAGS, features, labels, mems, is_training):
    """Pretraining loss with two-stream attention Transformer-XL."""

    # Unpack input
    mem_name = "mems"
    mems = mems.get(mem_name, None)

    inp_k = tf.transpose(features["input_k"], [1, 0])
    inp_q = tf.transpose(features["input_q"], [1, 0])

    seg_id = tf.transpose(features["seg_id"], [1, 0])

    inp_mask = None
    perm_mask = tf.transpose(features["perm_mask"], [1, 2, 0])

    if FLAGS.num_predict is not None:
        # [num_predict x tgt_len x bsz]
        target_mapping = tf.transpose(features["target_mapping"], [1, 2, 0])
    else:
        target_mapping = None

    # target for LM loss
    tgt = tf.transpose(features["target"], [1, 0])

    # target mask for LM loss
    tgt_mask = tf.transpose(features["target_mask"], [1, 0])

    # construct xlnet config and save to model_dir
    xlnet_config = xlnet.XLNetConfig(FLAGS=FLAGS)
    xlnet_config.to_json(os.path.join(FLAGS.model_dir, "config.json"))

    # construct run config from FLAGS
    run_config = xlnet.create_run_config(is_training, False, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp_k,
        seg_ids=seg_id,
        input_mask=inp_mask,
        mems=mems,
        perm_mask=perm_mask,
        target_mapping=target_mapping,
        inp_q=inp_q)

    output = xlnet_model.get_sequence_output()
    new_mems = {mem_name: xlnet_model.get_new_memory()}
    lookup_table = xlnet_model.get_embedding_table()

    initializer = xlnet_model.get_initializer()

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # LM loss
        lm_loss = modeling.lm_loss(
            hidden=output,
            target=tgt,
            n_token=xlnet_config.n_token,
            d_model=xlnet_config.d_model,
            initializer=initializer,
            lookup_table=lookup_table,
            tie_weight=True,
            bi_data=run_config.bi_data,
            use_tpu=run_config.use_tpu)

    # Quantity to monitor
    monitor_dict = {}

    if FLAGS.use_bfloat16:
        tgt_mask = tf.cast(tgt_mask, tf.float32)
        lm_loss = tf.cast(lm_loss, tf.float32)

    total_loss = tf.reduce_sum(lm_loss * tgt_mask) / tf.reduce_sum(tgt_mask)
    monitor_dict["total_loss"] = total_loss

    return total_loss, new_mems, monitor_dict
def get_qa_outputs(FLAGS, features, is_training):
    """Loss for downstream span-extraction QA tasks such as SQuAD."""

    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    cls_index = tf.reshape(features["cls_index"], [-1])

    seq_len = tf.shape(inp)[0]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    return_dict = {}

    # invalid position mask such as query and special symbols (PAD, SEP, CLS)
    p_mask = features["p_mask"]

    # logit of the start position
    with tf.variable_scope("start_logits"):
        start_logits = tf.layers.dense(
            output,
            1,
            kernel_initializer=initializer)
        start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
        start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

    # logit of the end position
    with tf.variable_scope("end_logits"):
        if is_training:
            # during training, compute the end logits based on the
            # ground truth of the start position

            start_positions = tf.reshape(features["start_positions"], [-1])
            start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1,
                                     dtype=tf.float32)
            start_features = tf.einsum("lbh,bl->bh", output, start_index)
            start_features = tf.tile(start_features[None], [seq_len, 1, 1])
            end_logits = tf.layers.dense(
                tf.concat([output, start_features], axis=-1), xlnet_config.d_model,
                kernel_initializer=initializer, activation=tf.tanh, name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(
                end_logits, begin_norm_axis=-1)

            end_logits = tf.layers.dense(
                end_logits, 1,
                kernel_initializer=initializer,
                name="dense_1")
            end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
            end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        else:
            # during inference, compute the end logits based on beam search

            start_top_log_probs, start_top_index = tf.nn.top_k(
                start_log_probs, k=FLAGS.start_n_top)
            start_index = tf.one_hot(start_top_index,
                                     depth=seq_len, axis=-1, dtype=tf.float32)
            start_features = tf.einsum("lbh,bkl->bkh", output, start_index)
            end_input = tf.tile(output[:, :, None],
                                [1, 1, FLAGS.start_n_top, 1])
            start_features = tf.tile(start_features[None],
                                     [seq_len, 1, 1, 1])
            end_input = tf.concat([end_input, start_features], axis=-1)
            end_logits = tf.layers.dense(
                end_input,
                xlnet_config.d_model,
                kernel_initializer=initializer,
                activation=tf.tanh,
                name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(end_logits,
                                                      begin_norm_axis=-1)
            end_logits = tf.layers.dense(
                end_logits,
                1,
                kernel_initializer=initializer,
                name="dense_1")
            end_logits = tf.reshape(
                end_logits, [
                    seq_len, -1, FLAGS.start_n_top])
            end_logits = tf.transpose(end_logits, [1, 2, 0])
            end_logits_masked = end_logits * (
                1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
            end_top_log_probs, end_top_index = tf.nn.top_k(
                end_log_probs, k=FLAGS.end_n_top)
            end_top_log_probs = tf.reshape(
                end_top_log_probs,
                [-1, FLAGS.start_n_top * FLAGS.end_n_top])
            end_top_index = tf.reshape(
                end_top_index,
                [-1, FLAGS.start_n_top * FLAGS.end_n_top])

    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs
    else:
        return_dict["start_top_log_probs"] = start_top_log_probs
        return_dict["start_top_index"] = start_top_index
        return_dict["end_top_log_probs"] = end_top_log_probs
        return_dict["end_top_index"] = end_top_index

    # an additional layer to predict answerability
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)

        # get the representation of START
        start_p = tf.nn.softmax(start_logits_masked, axis=-1,
                                name="softmax_start")
        start_feature = tf.einsum("lbh,bl->bh", output, start_p)

        # note(zhiliny): no dependency on end_feature so that we can obtain
        # one single `cls_logits` for each sample
        ans_feature = tf.concat([start_feature, cls_feature], -1)
        ans_feature = tf.layers.dense(
            ans_feature,
            xlnet_config.d_model,
            activation=tf.tanh,
            kernel_initializer=initializer, name="dense_0")
        ans_feature = tf.layers.dropout(ans_feature, FLAGS.dropout,
                                        training=is_training)
        cls_logits = tf.layers.dense(
            ans_feature,
            1,
            kernel_initializer=initializer,
            name="dense_1",
            use_bias=False)
        cls_logits = tf.squeeze(cls_logits, -1)

        return_dict["cls_logits"] = cls_logits

    return return_dict
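At inference time the dict above returns the top start and end candidates separately. One common follow-up (an assumption here, not something this function does itself) is to combine them into joint span scores after undoing the flattening of the end outputs back to [bsz, start_n_top, end_n_top]:

import numpy as np

bsz, start_n_top, end_n_top = 2, 5, 5          # illustrative beam sizes
# stand-ins for return_dict values fetched from a session run
start_lp = np.random.randn(bsz, start_n_top)
end_lp = np.random.randn(bsz, start_n_top * end_n_top)

end_lp = end_lp.reshape(bsz, start_n_top, end_n_top)   # undo the reshape done above
span_scores = start_lp[:, :, None] + end_lp            # joint log-prob per (start, end) pair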
Example #15
    def get_predictions_and_loss(self, input_ids, seg_ids, input_mask,
                                 text_len, speaker_ids, genre, is_training,
                                 gold_starts, gold_ends, cluster_ids,
                                 sentence_map):

        run_config = xlnet.create_run_config(is_training=True,
                                             is_finetune=True,
                                             FLAGS=self.FLAGS)

        # Construct an XLNet model
        model = xlnet.XLNetModel(xlnet_config=self.xlnet_config,
                                 run_config=run_config,
                                 input_ids=input_ids,
                                 seg_ids=seg_ids,
                                 input_mask=input_mask)

        mention_doc = model.get_sequence_output()

        mention_doc = tf.transpose(mention_doc, perm=[1, 0, 2])
        input_ids = tf.transpose(input_ids)
        input_mask = tf.transpose(input_mask)
        seg_ids = tf.transpose(seg_ids)
        speaker_ids = tf.transpose(speaker_ids)
        flipped_mask = (input_mask < 1)
        input_mask = tf.cast(flipped_mask, tf.float32)

        self.dropout = self.get_dropout(self.config["dropout_rate"],
                                        is_training)

        num_sentences = tf.shape(mention_doc)[0]
        max_sentence_length = tf.shape(mention_doc)[1]

        mention_doc = self.flatten_emb_by_sentence(mention_doc, input_mask)
        num_words = util_xlnet.shape(mention_doc, 0)
        antecedent_doc = mention_doc

        flattened_sentence_indices = sentence_map
        #with tf.control_dependencies([print_input_ids]):
        candidate_starts = tf.tile(
            tf.expand_dims(tf.range(num_words), 1),
            [1, self.max_span_width])  # [num_words, max_span_width]
        candidate_ends = candidate_starts + tf.expand_dims(
            tf.range(self.max_span_width), 0)  # [num_words, max_span_width]
        candidate_start_sentence_indices = tf.gather(
            flattened_sentence_indices,
            candidate_starts)  # [num_words, max_span_width]
        candidate_end_sentence_indices = tf.gather(
            flattened_sentence_indices,
            tf.minimum(candidate_ends,
                       num_words - 1))  # [num_words, max_span_width]
        candidate_mask = tf.logical_and(
            candidate_ends < num_words,
            tf.equal(
                candidate_start_sentence_indices,
                candidate_end_sentence_indices))  # [num_words, max_span_width]
        flattened_candidate_mask = tf.reshape(
            candidate_mask, [-1])  # [num_words * max_span_width]
        candidate_starts = tf.boolean_mask(
            tf.reshape(candidate_starts,
                       [-1]), flattened_candidate_mask)  # [num_candidates]
        candidate_ends = tf.boolean_mask(
            tf.reshape(candidate_ends,
                       [-1]), flattened_candidate_mask)  # [num_candidates]
        candidate_sentence_indices = tf.boolean_mask(
            tf.reshape(candidate_start_sentence_indices, [-1]),
            flattened_candidate_mask)  # [num_candidates]

        candidate_cluster_ids = self.get_candidate_labels(
            candidate_starts, candidate_ends, gold_starts, gold_ends,
            cluster_ids)  # [num_candidates]
        candidate_span_emb = self.get_span_emb(
            mention_doc, mention_doc, candidate_starts,
            candidate_ends)  # [num_candidates, emb]
        candidate_mention_scores = self.get_mention_scores(
            candidate_span_emb, candidate_starts, candidate_ends)
        candidate_mention_scores = tf.squeeze(candidate_mention_scores,
                                              1)  # [k]

        # beam size
        k = tf.minimum(
            3900,
            tf.to_int32(
                tf.floor(
                    tf.to_float(num_words) * self.config["top_span_ratio"])))
        c = tf.minimum(self.config["max_top_antecedents"], k)
        # pull from beam
        top_span_indices = coref_ops.extract_spans(
            tf.expand_dims(candidate_mention_scores, 0),
            tf.expand_dims(candidate_starts, 0),
            tf.expand_dims(candidate_ends, 0), tf.expand_dims(k, 0), num_words,
            True)  # [1, k]
        top_span_indices.set_shape([1, None])
        top_span_indices = tf.squeeze(top_span_indices, 0)  # [k]

        top_span_starts = tf.gather(candidate_starts, top_span_indices)  # [k]
        top_span_ends = tf.gather(candidate_ends, top_span_indices)  # [k]
        top_span_emb = tf.gather(candidate_span_emb,
                                 top_span_indices)  # [k, emb]
        top_span_cluster_ids = tf.gather(candidate_cluster_ids,
                                         top_span_indices)  # [k]
        top_span_mention_scores = tf.gather(candidate_mention_scores,
                                            top_span_indices)  # [k]
        genre_emb = tf.gather(
            tf.get_variable(
                "genre_embeddings",
                [len(self.genres), self.config["feature_size"]],
                initializer=tf.truncated_normal_initializer(stddev=0.02)),
            genre)  # [emb]
        if self.config['use_metadata']:
            speaker_ids = self.flatten_emb_by_sentence(speaker_ids, input_mask)
            top_span_speaker_ids = tf.gather(speaker_ids,
                                             top_span_starts)  # [k]
        else:
            top_span_speaker_ids = None

        dummy_scores = tf.zeros([k, 1])  # [k, 1]
        top_antecedents, top_antecedents_mask, top_fast_antecedent_scores, top_antecedent_offsets = self.coarse_to_fine_pruning(
            top_span_emb, top_span_mention_scores, c)

        num_segs, seg_len = util_xlnet.shape(input_ids, 0), util_xlnet.shape(
            input_ids, 1)
        word_segments = tf.tile(tf.expand_dims(tf.range(0, num_segs), 1),
                                [1, seg_len])
        flat_word_segments = tf.boolean_mask(tf.reshape(word_segments, [-1]),
                                             tf.reshape(input_mask, [-1]))
        mention_segments = tf.expand_dims(
            tf.gather(flat_word_segments, top_span_starts), 1)  # [k, 1]
        antecedent_segments = tf.gather(flat_word_segments,
                                        tf.gather(top_span_starts,
                                                  top_antecedents))  #[k, c]
        segment_distance = tf.clip_by_value(
            mention_segments -
            antecedent_segments, 0, self.config['max_training_sentences'] -
            1) if self.config['use_segment_distance'] else None  #[k, c]
        if self.config['fine_grained']:
            for i in range(self.config["coref_depth"]):
                with tf.variable_scope("coref_layer", reuse=(i > 0)):
                    top_antecedent_emb = tf.gather(
                        top_span_emb, top_antecedents)  # [k, c, emb]
                    top_antecedent_scores = top_fast_antecedent_scores + self.get_slow_antecedent_scores(
                        top_span_emb, top_antecedents, top_antecedent_emb,
                        top_antecedent_offsets, top_span_speaker_ids,
                        genre_emb, segment_distance)  # [k, c]
                    top_antecedent_weights = tf.nn.softmax(
                        tf.concat([dummy_scores, top_antecedent_scores],
                                  1))  # [k, c + 1]
                    top_antecedent_emb = tf.concat(
                        [tf.expand_dims(top_span_emb, 1), top_antecedent_emb],
                        1)  # [k, c + 1, emb]
                    attended_span_emb = tf.reduce_sum(
                        tf.expand_dims(top_antecedent_weights, 2) *
                        top_antecedent_emb, 1)  # [k, emb]
                    with tf.variable_scope("f"):
                        f = tf.sigmoid(
                            util_xlnet.projection(
                                tf.concat([top_span_emb, attended_span_emb],
                                          1),
                                util_xlnet.shape(top_span_emb,
                                                 -1)))  # [k, emb]
                        top_span_emb = f * attended_span_emb + (
                            1 - f) * top_span_emb  # [k, emb]
        else:
            top_antecedent_scores = top_fast_antecedent_scores

        top_antecedent_scores = tf.concat(
            [dummy_scores, top_antecedent_scores], 1)  # [k, c + 1]

        top_antecedent_cluster_ids = tf.gather(top_span_cluster_ids,
                                               top_antecedents)  # [k, c]
        top_antecedent_cluster_ids += tf.to_int32(
            tf.log(tf.to_float(top_antecedents_mask)))  # [k, c]
        same_cluster_indicator = tf.equal(top_antecedent_cluster_ids,
                                          tf.expand_dims(
                                              top_span_cluster_ids,
                                              1))  # [k, c]
        non_dummy_indicator = tf.expand_dims(top_span_cluster_ids > 0,
                                             1)  # [k, 1]
        pairwise_labels = tf.logical_and(same_cluster_indicator,
                                         non_dummy_indicator)  # [k, c]
        dummy_labels = tf.logical_not(
            tf.reduce_any(pairwise_labels, 1, keepdims=True))  # [k, 1]
        top_antecedent_labels = tf.concat([dummy_labels, pairwise_labels],
                                          1)  # [k, c + 1]
        loss = self.softmax_loss(top_antecedent_scores,
                                 top_antecedent_labels)  # [k]

        loss = tf.reduce_sum(loss)  # []
        return [
            candidate_starts, candidate_ends, candidate_mention_scores,
            top_span_starts, top_span_ends, top_antecedents,
            top_antecedent_scores
        ], loss
def get_uda_classification_loss(options, features, n_class, is_training,
                                global_step, input_ids, input_mask,
                                segment_ids, labels):
    """Loss for downstream classification tasks."""

    tsa = options['tsa']
    unsup_ratio = options['unsup_ratio']
    num_train_steps = options['num_train_steps']
    uda_softmax_temp = options['uda_softmax_temp']
    uda_confidence_thresh = options['uda_confidence_thresh']

    inp = tf.transpose(input_ids, [1, 0])
    seg_id = tf.transpose(segment_ids, [1, 0])
    inp_mask = tf.transpose(input_mask, [1, 0])

    num_sample = input_ids.shape[0].value

    if is_training:
        assert num_sample % (1 + 2 * unsup_ratio) == 0
        sup_batch_size = num_sample // (1 + 2 * unsup_ratio)
        unsup_batch_size = sup_batch_size * unsup_ratio
        bsz_per_core = tf.shape(input_ids)[0] // (1 + 2 * unsup_ratio)

    else:
        sup_batch_size = num_sample
        unsup_batch_size = 0
        bsz_per_core = tf.shape(input_ids)[0]

    labels = tf.reshape(labels, [bsz_per_core])
    xlnet_config = xlnet.XLNetConfig(json_path=options['model_config_file'])
    run_config = xlnet.create_run_config(is_training, True, options)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)

    summary = xlnet_model.get_pooled_out(options['summary_type'],
                                         options['use_summ_proj'])

    if options['cls_scope'] is not None and options['cls_scope']:
        cls_scope = "classification_{}".format(options['cls_scope'])
    else:
        cls_scope = "classification_{}".format(options['task_name'].lower())

    clas_logits = modeling.uda_logits(
        hidden=summary,
        labels=labels,
        n_class=n_class,
        initializer=xlnet_model.get_initializer(),
        scope=cls_scope)

    log_probs = tf.nn.log_softmax(clas_logits, axis=-1)

    correct_label_probs = None

    with tf.variable_scope("sup_loss"):
        sup_log_probs = log_probs[:sup_batch_size]
        one_hot_labels = tf.one_hot(labels, depth=n_class, dtype=tf.float32)
        tgt_label_prob = one_hot_labels

        per_example_loss = -tf.reduce_sum(tgt_label_prob * sup_log_probs,
                                          axis=-1)
        loss_mask = tf.ones_like(per_example_loss,
                                 dtype=per_example_loss.dtype)
        correct_label_probs = tf.reduce_sum(one_hot_labels *
                                            tf.exp(sup_log_probs),
                                            axis=-1)

        if tsa:
            tf.logging.info("Applying TSA")
            # Starting threshold is just the inverse number of labels.
            tsa_start = 1. / n_class
            tsa_threshold = model_utils.get_tsa_threshold(tsa,
                                                          global_step,
                                                          num_train_steps,
                                                          tsa_start,
                                                          end=1)

            larger_than_threshold = tf.greater(correct_label_probs,
                                               tsa_threshold)
            loss_mask = loss_mask * (
                1 - tf.cast(larger_than_threshold, tf.float32))
        else:
            tsa_threshold = 1

        loss_mask = tf.stop_gradient(loss_mask)
        per_example_loss = per_example_loss * loss_mask
        sup_loss = (tf.reduce_sum(per_example_loss) /
                    tf.maximum(tf.reduce_sum(loss_mask), 1))

    unsup_loss_mask = None
    if is_training and unsup_ratio > 0:
        with tf.variable_scope("unsup_loss"):
            ori_start = sup_batch_size
            ori_end = ori_start + unsup_batch_size
            aug_start = sup_batch_size + unsup_batch_size
            aug_end = aug_start + unsup_batch_size

            ori_log_probs = log_probs[ori_start:ori_end]
            aug_log_probs = log_probs[aug_start:aug_end]
            unsup_loss_mask = 1
            if options['uda_softmax_temp'] != -1:
                tgt_ori_log_probs = tf.nn.log_softmax(
                    clas_logits[ori_start:ori_end] /
                    options['uda_softmax_temp'],
                    axis=-1)
                tgt_ori_log_probs = tf.stop_gradient(tgt_ori_log_probs)
            else:
                tgt_ori_log_probs = tf.stop_gradient(ori_log_probs)

            if options['uda_confidence_thresh'] != -1:
                largest_prob = tf.reduce_max(tf.exp(ori_log_probs), axis=-1)
                unsup_loss_mask = tf.cast(
                    tf.greater(largest_prob, options['uda_confidence_thresh']),
                    tf.float32)
                unsup_loss_mask = tf.stop_gradient(unsup_loss_mask)

            per_example_kl_loss = model_utils.kl_for_log_probs(
                tgt_ori_log_probs, aug_log_probs) * unsup_loss_mask
            unsup_loss = tf.reduce_mean(per_example_kl_loss)

    else:
        unsup_loss = 0.

    return (sup_loss, unsup_loss, clas_logits[:sup_batch_size],
            per_example_loss, loss_mask, tsa_threshold, unsup_loss_mask,
            correct_label_probs)
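The slicing in the function above assumes each training batch is packed as supervised examples first, then the original unlabeled examples, then their augmented counterparts; a small sketch of that layout with illustrative sizes:

# Illustrative UDA batch layout for unsup_ratio = 2 and sup_batch_size = 4.
sup_batch_size = 4
unsup_ratio = 2
unsup_batch_size = sup_batch_size * unsup_ratio
num_sample = sup_batch_size * (1 + 2 * unsup_ratio)                   # 20 rows per batch

sup_rows = range(0, sup_batch_size)                                   # labeled examples
ori_rows = range(sup_batch_size, sup_batch_size + unsup_batch_size)   # unlabeled originals
aug_rows = range(sup_batch_size + unsup_batch_size, num_sample)       # augmented copies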