def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "sst-2": run_classifier.SST2Processor,
        "mnli": run_classifier.MnliProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint1)
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint2)

    if not tf.train.checkpoint_exists(FLAGS.init_checkpoint1):
        raise TFCheckpointNotFoundError("checkpoint1 does not exist!")

    if not tf.train.checkpoint_exists(FLAGS.init_checkpoint2) and \
       not FLAGS.use_random:
        raise TFCheckpointNotFoundError("checkpoint2 does not exist!")

    bert_config1 = modeling.BertConfig.from_json_file(FLAGS.bert_config_file1)
    bert_config2 = modeling.BertConfig.from_json_file(FLAGS.bert_config_file2)

    if FLAGS.max_seq_length > bert_config1.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config1.max_position_embeddings))

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    all_results = []

    predict_examples = processor.get_test_examples(FLAGS.diff_input_file)
    num_actual_predict_examples = len(predict_examples)

    # For single-sentence tasks (like SST-2), eg.text_b is None.
    original_data = [(eg.text_a, eg.text_b) for eg in predict_examples]
    if FLAGS.use_tpu:
        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on.
        while len(predict_examples) % FLAGS.predict_batch_size != 0:
            predict_examples.append(run_classifier.PaddingInputExample())
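        # Illustrative arithmetic (made-up numbers): with 1003 real examples
        # and predict_batch_size=32, 21 padding examples are appended so the
        # 1024 total divides evenly into batches.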

    predict_file = os.path.join(FLAGS.init_checkpoint1,
                                FLAGS.exp_name + ".predict.tf_record")

    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
        predict_file)

    for bert_config_type, output_dir in [
        (bert_config1, FLAGS.init_checkpoint1),
        (bert_config2, FLAGS.init_checkpoint2)
    ]:
        tpu_cluster_resolver = None
        if FLAGS.use_tpu and FLAGS.tpu_name:
            tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
                FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

        is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
        run_config = contrib_tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=contrib_tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))

        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config_type,
            num_labels=len(label_list),
            # This init checkpoint is eventually overridden by the estimator
            init_checkpoint=FLAGS.output_dir,
            learning_rate=FLAGS.learning_rate,
            num_train_steps=None,
            num_warmup_steps=None,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu)

        # If TPU is not available, this will fall back to normal Estimator on CPU
        # or GPU.
        estimator = contrib_tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = [x for x in estimator.predict(input_fn=predict_input_fn)]
        all_results.append(result)

    all_results[0] = all_results[0][:num_actual_predict_examples]
    all_results[1] = all_results[1][:num_actual_predict_examples]

    assert len(all_results[0]) == len(all_results[1])

    # Assuming model1's predictions are gold labels, calculate model2's accuracy
    score = 0
    for prob1, prob2 in zip(all_results[0], all_results[1]):
        if np.argmax(prob1["probabilities"]) == np.argmax(
                prob2["probabilities"]):
            score += 1

    tf.logging.info("Agreement score = %.6f",
                    float(score) / num_actual_predict_examples)

    # Calculate the average value of |p1 - p2|, the Euclidean distance on the
    # probability simplex. Unlike KL divergence, this is a bounded metric.
    # However, these results are not comparable across tasks with a different
    # number of classes.
    distances = []
    for prob1, prob2 in zip(all_results[0], all_results[1]):
        distances.append(
            np.linalg.norm(prob1["probabilities"] - prob2["probabilities"]))

    tf.logging.info("Average length |p1 - p2| = %.8f", np.mean(distances))
    tf.logging.info("Max length |p1 - p2| = %.8f", np.max(distances))
    tf.logging.info("Min length |p1 - p2| = %.8f", np.min(distances))
    tf.logging.info("Std length |p1 - p2| = %.8f", np.std(distances))

    if FLAGS.diff_type == "kld1":
        all_kld = []

        for prob1, prob2 in zip(all_results[0], all_results[1]):
            all_kld.append(
                stats.entropy(prob1["probabilities"], prob2["probabilities"]))

        tf.logging.info("Average kl-divergence (p1, p2) = %.8f",
                        np.mean(all_kld))
        tf.logging.info("Max kl-divergence (p1, p2) = %.8f", np.max(all_kld))
        tf.logging.info("Min kl-divergence (p1, p2) = %.8f", np.min(all_kld))
        tf.logging.info("Std kl-divergence (p1, p2) = %.8f", np.std(all_kld))

    elif FLAGS.diff_type == "kld2":
        all_kld = []

        for prob1, prob2 in zip(all_results[0], all_results[1]):
            all_kld.append(
                stats.entropy(prob2["probabilities"], prob1["probabilities"]))

        tf.logging.info("Average kl-divergence (p2, p1) = %.8f",
                        np.mean(all_kld))
        tf.logging.info("Max kl-divergence (p2, p1) = %.8f", np.max(all_kld))
        tf.logging.info("Min kl-divergence (p2, p1) = %.8f", np.min(all_kld))
        tf.logging.info("Std kl-divergence (p2, p1) = %.8f", np.std(all_kld))

    if FLAGS.diff_output_file:
        output = ""

        # Removing padded examples
        all_results[0] = all_results[0][:len(original_data)]
        all_results[1] = all_results[1][:len(original_data)]

        with tf.gfile.GFile(FLAGS.diff_output_file, "w") as f:
            for i, (eg, prob1, prob2) in enumerate(
                    zip(original_data, all_results[0], all_results[1])):

                if i % 1000 == 0:
                    tf.logging.info("Writing instance %d", i + 1)

                p1_items = [p1.item() for p1 in prob1["probabilities"]]
                p2_items = [p2.item() for p2 in prob2["probabilities"]]

                prob1_str = "%.6f\t%.6f\t%.6f" % (p1_items[0], p1_items[1],
                                                  p1_items[2])
                prob2_str = "%.6f\t%.6f\t%.6f" % (p2_items[0], p2_items[1],
                                                  p2_items[2])

                output = "%s\t%s\t%s\t%s\n" % (eg[0], eg[1], prob1_str,
                                               prob2_str)
                f.write(output)

    return
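
# Standalone sketch (not part of the example above) of how the comparison
# metrics behave on two toy probability vectors; only numpy and scipy.stats
# are assumed, matching the `np` and `stats` names used by the example.
import numpy as np
from scipy import stats

p1 = np.array([0.7, 0.2, 0.1])
p2 = np.array([0.6, 0.3, 0.1])

agree = int(np.argmax(p1) == np.argmax(p2))  # both argmax to class 0 -> 1
distance = np.linalg.norm(p1 - p2)           # bounded distance on the simplex
kld1 = stats.entropy(p1, p2)                 # stats.entropy(p, q) == KL(p || q)
kld2 = stats.entropy(p2, p1)                 # KL is asymmetric, so compute both
print(agree, distance, kld1, kld2)
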
# Example 2
def get_tokenizer():
    return tokenization.FullTokenizer(vocab_file=vocab_file,
                                      do_lower_case=False)
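
# Hedged usage sketch of the FullTokenizer API these examples rely on:
# tokenize() produces WordPiece tokens and convert_tokens_to_ids() maps them
# to vocabulary ids. The vocab path and the helper name below are placeholders,
# not part of any example above.
def encode_sentence(sentence, vocab_path="vocab.txt", lower=False):
    tok = tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=lower)
    tokens = ["[CLS]"] + tok.tokenize(sentence) + ["[SEP]"]
    return tok.convert_tokens_to_ids(tokens)
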
def prepare_bert(bert_vocab_file, bert_config_file, init_checkpoint, sen_len,
                 select_layers, batch_size, graph_file, model_dir):

    tokenizer = tokenization.FullTokenizer(bert_vocab_file)
    estimator = get_estimator(bert_config_file, init_checkpoint, sen_len,
                              select_layers, batch_size, graph_file, model_dir)

    return tokenizer, estimator
# Example 4
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train`, `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        num_train_steps = int(
            FLAGS.train_data_size / FLAGS.train_batch_size) * FLAGS.epochs
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
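        # Illustrative arithmetic (made-up flag values): train_data_size=67349,
        # train_batch_size=32 and epochs=3 give
        # num_train_steps = int(67349 / 32) * 3 = 6312, and with
        # warmup_proportion=0.1, num_warmup_steps = int(6312 * 0.1) = 631.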

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        if not tf.gfile.Exists(FLAGS.train_file):
            tf.logging.info(
                "DANITER: File doesn't exist, creating tfrecord data")
            examples = model_builder.load_hellaswag(FLAGS.train_raw_data)
            tf.logging.info("DANITER: Read raw data as json")
            model_builder.file_based_convert_examples_for_bilinear(
                examples, 512, tokenizer, FLAGS.train_file, do_copa=True)
        train_input_fn = file_based_input_fn_builder(
            input_file=FLAGS.train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, steps=num_train_steps)

    if FLAGS.do_eval:
        # This tells the estimator to run through the entire set.
        if FLAGS.eval_data_size < 0:
            eval_steps = None
        else:
            eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        if not tf.gfile.Exists(FLAGS.eval_file):
            examples = model_builder.load_hellaswag(FLAGS.eval_raw_data)
            model_builder.file_based_convert_examples_for_bilinear(
                examples, 512, tokenizer, FLAGS.eval_file, do_copa=True)
        eval_input_fn = file_based_input_fn_builder(
            input_file=FLAGS.eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if idx != "best" and int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        tf.logging.info("Evaling all models in output dir")
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        key_name = "eval_accuracy"
        tf.logging.info("Checkpoint path " + checkpoint_path)
        if tf.gfile.Exists(checkpoint_path + ".index"):
            tf.logging.info("Found a best model... not good")
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps,
                                        checkpoint_path=checkpoint_path)
            best_perf = result[key_name]
            global_step = result["global_step"]
        else:
            tf.logging.info("Setting global step to -1")
            global_step = -1
            best_perf = -1
            checkpoint_path = None
        tf.logging.info("Openning writer " + output_eval_file)
        writer = tf.gfile.GFile(output_eval_file, "w")

        steps_and_files = {}
        filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
        tf.logging.info("Models found " + "\n".join(filenames))
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                if cur_filename.split("-")[-1] == "best":
                    continue
                gstep = int(cur_filename.split("-")[-1])
                if gstep not in steps_and_files:
                    tf.logging.info(
                        "Add {} to eval list.".format(cur_filename))
                    steps_and_files[gstep] = cur_filename
        tf.logging.info("found {} files.".format(len(steps_and_files)))
        # steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
        if not steps_and_files:
            tf.logging.info(
                "found 0 file, global step: {}. Sleeping.".format(global_step))
        else:
            for ele in sorted(steps_and_files.items()):
                step, checkpoint_path = ele
                if global_step >= step:
                    if len(_find_valid_cands(step)) > 1:
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tf.logging.info("removing {}".format(src_ckpt))
                            # Why should we remove checkpoints?
                            # tf.gfile.Remove(src_ckpt)
                    tf.logging.info("Skipping candidate for some reason")
                    continue
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=checkpoint_path)
                global_step = result["global_step"]
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write("best = {}\n".format(best_perf))

                if len(_find_valid_cands(global_step)) > 1:
                    for ext in ["meta", "data-00000-of-00001", "index"]:
                        src_ckpt = checkpoint_path + ".{}".format(ext)
                        tf.logging.info("removing {}".format(src_ckpt))
                        # tf.gfile.Remove(src_ckpt)
                writer.write("=" * 50 + "\n")
        writer.close()
# Example 5
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {"answer_sent_labeling": AnswerSentenceLabelingProcessor}

    if not FLAGS.do_train_and_eval and not FLAGS.do_predict and not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train_and_eval`, `do_predict`, `do_train`, "
            "or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.work_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    config = tf.compat.v1.ConfigProto()
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.work_dir,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    )

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train_and_eval or FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If GPU is not available, this will fall back to normal Estimator on CPU.
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params={
                                           "train_batch_size":
                                           FLAGS.train_batch_size,
                                           "predict_batch_size":
                                           FLAGS.predict_batch_size
                                       })

    # Early_stop
    early_stopping_hook = tf.contrib.estimator.stop_if_no_increase_hook(
        estimator=estimator,
        metric_name="f1_score",
        max_steps_without_increase=FLAGS.max_steps_without_increase,
        min_steps=1000,
        run_every_secs=None,
        run_every_steps=FLAGS.save_checkpoints_steps,
    )

    if FLAGS.do_train_and_eval:
        train_file = os.path.join(FLAGS.work_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples,
                                                FLAGS.max_answer_num,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            max_answer_num=FLAGS.max_answer_num,
            is_training=True,
            drop_remainder=True)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])

        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.work_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples,
                                                FLAGS.max_answer_num,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            max_answer_num=FLAGS.max_answer_num,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        # steps=None tells the estimator to run through the entire set.
        # throttle_secs set minimum seconds needed to evaluate model again.
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          steps=None,
                                          throttle_secs=10)

        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.work_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples,
                                                FLAGS.max_answer_num,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            max_answer_num=FLAGS.max_answer_num,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.work_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples,
                                                FLAGS.max_answer_num,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None

        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            max_answer_num=FLAGS.max_answer_num,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.work_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("\n  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_predict_examples = len(predict_examples)

        predict_file = os.path.join(FLAGS.work_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples,
                                                FLAGS.max_answer_num,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d ", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            max_answer_num=FLAGS.max_answer_num,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.work_dir, "test_results.tsv")
        output_pos_prob_np_file = os.path.join(FLAGS.work_dir,
                                               "test_results.npy")
        pos_prob_list = []
        with open(output_predict_file, "w") as writer:
            num_written_lines = 0
            for (query_id, prediction) in enumerate(result):
                if query_id % 200 == 0:
                    tf.logging.info("***** query_id: {}*****".format(query_id))
                predict = prediction["predict"]
                answer_num = prediction["answer_num"]
                positive_probabilities = prediction[
                    "positive_probabilities"][:answer_num]
                pos_prob_list.extend(positive_probabilities)
                for answer_id, class_id in enumerate(predict):
                    if answer_id < answer_num:
                        output_line = "\t".join(
                            str(x)
                            for x in [query_id, answer_id, class_id]) + "\n"
                        writer.write(output_line)
                num_written_lines += 1
        np.save(output_pos_prob_np_file, np.array(pos_prob_list))
        assert num_written_lines == num_predict_examples
def main(argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.config.set_soft_device_placement(True)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    train_input_fn = None
    ft_known_train_file = None
    train_file = None
    if FLAGS.do_train:
        current_seed = 0
        num_known_classes = FLAGS.num_domains * FLAGS.num_labels_per_domain
        data_output_dir = FLAGS.data_output_dir
        if not tf.gfile.Exists(data_output_dir):
            tf.gfile.MakeDirs(data_output_dir)
        known_ft_path = os.path.join(data_output_dir,
                                     "known_ft_train.tf_record")
        unknown_ft_path = os.path.join(data_output_dir,
                                       "unknown_ft_train.tf_record")
        if not tf.gfile.Glob(known_ft_path):
            preprocess_few_shot_training_data(tokenizer, known_ft_path,
                                              unknown_ft_path, current_seed)

        if FLAGS.continual_learning is None:
            assert False, "Not Implemented"
        elif FLAGS.continual_learning == "pretrain":
            train_file = os.path.join(FLAGS.data_output_dir,
                                      "known_ft_train.tf_record")
            num_classes = num_known_classes
            num_train_examples = num_known_classes * FLAGS.known_num_shots
            num_shots_per_class = FLAGS.known_num_shots
        elif FLAGS.continual_learning == "few_shot":
            train_file = os.path.join(FLAGS.data_output_dir,
                                      "unknown_ft_train.tf_record")
            ft_known_train_file = os.path.join(FLAGS.data_output_dir,
                                               "known_ft_train.tf_record")
            num_unknown_classes = NUM_CLASSES - num_known_classes
            num_classes = num_unknown_classes
            num_train_examples = num_unknown_classes * FLAGS.few_shot
            num_shots_per_class = FLAGS.few_shot

        tpu_split = FLAGS.tpu_split if FLAGS.use_tpu else 1
        if num_shots_per_class < tpu_split:
            steps_per_epoch = 1
        else:
            steps_per_epoch = num_shots_per_class // tpu_split
        num_train_steps = int(steps_per_epoch * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        FLAGS.num_train_steps = num_train_steps
        FLAGS.save_checkpoints_steps = int(steps_per_epoch *
                                           FLAGS.save_every_epoch)
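        # Illustrative arithmetic (made-up flag values): with 16 shots per
        # class and tpu_split=8, steps_per_epoch = 16 // 8 = 2; 50 training
        # epochs then give num_train_steps = 100, and save_every_epoch=5 gives
        # save_checkpoints_steps = 10.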

        tf.logging.info("***** Running training *****")
        tf.logging.info("  train_file: %s" % train_file)
        tf.logging.info("  use_tpu: %s" % FLAGS.use_tpu)
        tf.logging.info("  Num examples = %d", num_train_examples)
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)
        tf.logging.info("  Save checkpoints steps = %d",
                        FLAGS.save_checkpoints_steps)
        tf.logging.info("  warmup steps = %d", num_warmup_steps)
        tf.logging.info("  Num epochs = %d", FLAGS.num_train_epochs)
        tf.logging.info("  Num steps = %d", num_train_steps)
        tf.logging.info("  Reduce method = %s", FLAGS.reduce_method)
        tf.logging.info("  Max Seq Length = %d", FLAGS.max_seq_length)
        tf.logging.info(" learning_rate = %.7f", FLAGS.learning_rate)
        tf.logging.info(" dropout rate = %.4f", DROPOUT_PROB)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            ft_known_train_file=ft_known_train_file,
            use_tpu=FLAGS.use_tpu)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    FLAGS.do_eval = False
    eval_input_fn = None
    params = _get_hparams()
    params.update(num_train_steps=num_train_steps)
    if not FLAGS.do_train:
        train_input_fn = eval_input_fn

    experiment_utils.run_experiment(model_fn=model_fn,
                                    train_input_fn=train_input_fn,
                                    eval_input_fn=train_input_fn,
                                    params=params)
# Example 7
def main(_):
    tf.gfile.MakeDirs(os.path.dirname(FLAGS.output_tfrecord))
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_path,
                                           do_lower_case=True)

    annotations_zipfn = os.path.join(FLAGS.data_dir, "vcr1annots.zip")
    images_zipfn = os.path.join(FLAGS.data_dir, "vcr1images.zip")

    # Generate data for all splits:
    for split in ["train", "val", "test"]:
        jsonl_file = split + ".jsonl"
        output_tfrecord = "-".join([
            FLAGS.output_tfrecord, split,
            "%05d" % FLAGS.shard, "of",
            "%05d" % FLAGS.num_shards
        ])
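        # Illustrative result (made-up flag values): output_tfrecord="vcr",
        # split="train", shard=3, num_shards=10 yields
        # "vcr-train-00003-of-00010".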
        with tf.python_io.TFRecordWriter(output_tfrecord) as writer:
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                with zipfile.ZipFile(
                        tf.gfile.Open(annotations_zipfn)) as annotations_zip:
                    with zipfile.ZipFile(
                            tf.gfile.Open(images_zipfn)) as images_zip:
                        with annotations_zip.open(jsonl_file) as jsonl:
                            for idx, line in enumerate(jsonl):
                                if idx % FLAGS.num_shards != FLAGS.shard:
                                    continue
                                example = json.loads(line)
                                meta_filename = "vcr1images/" + example[
                                    "metadata_fn"]
                                meta = json.loads(
                                    images_zip.open(meta_filename).read())
                                del meta["segms"]

                                try:
                                    image_filename = "vcr1images/" + example[
                                        "img_fn"]
                                    tf.logging.info("Reading %s",
                                                    image_filename)
                                    with images_zip.open(
                                            image_filename) as image:
                                        image_string = image.read()
                                except zipfile.BadZipfile as e:
                                    tf.logging.error("Bad Zip file: " + str(e))
                                    image_string = BLANK_JPEG
                                    for box in meta["boxes"]:
                                        box[0] = 0.0
                                        box[1] = 0.0
                                        box[2] = 1.0
                                        box[3] = 1.0

                                is_test = (split == "test")
                                for tf_example in create_tf_examples(
                                        tokenizer,
                                        example,
                                        image_string,
                                        meta,
                                        is_test=is_test):
                                    writer.write(
                                        tf_example.SerializeToString())
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)
    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(FLAGS.tpu_name, zone=FLAGS.tpu_zone,
                                                                              project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    save_checkpoints_steps = int(FLAGS.train_num_precomputed / FLAGS.train_batch_size)
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        keep_checkpoint_max=100,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        num_train_features = FLAGS.train_num_precomputed
        num_train_steps = int(int(num_train_features / FLAGS.train_batch_size) * FLAGS.num_train_epochs)

        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        model_dir=FLAGS.output_dir)
    if FLAGS.do_train:
        tf.logging.info("***** Running training on precomputed features *****")
        tf.logging.info("  Num split examples = %d", num_train_features)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_filename = FLAGS.train_precomputed_file
        train_input_fn = input_fn_builder(
            input_file=train_filename,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
# Example 9
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {"race": race_utils.RaceProcessor}

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name, ))

    processor = processors[task_name](
        use_spm=True if FLAGS.spm_model_file else False,
        do_lower_case=FLAGS.do_lower_case,
        high_only=FLAGS.high_only,
        middle_only=FLAGS.middle_only)

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)  #,
    # spm_model_file=FLAGS.spm_model_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.do_train:
        iterations_per_loop = int(
            min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
    else:
        iterations_per_loop = FLAGS.iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=int(FLAGS.save_checkpoints_steps),
        keep_checkpoint_max=0,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)

    model_fn = race_utils.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=int(FLAGS.train_step),
        num_warmup_steps=int(FLAGS.warmup_step),
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        max_seq_length=FLAGS.max_seq_length,
        dropout_prob=FLAGS.dropout_prob)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    # if FLAGS.use_tpu:
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
    # else:
    #   estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        if not tf.gfile.Exists(FLAGS.train_file):
            race_utils.file_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                FLAGS.train_file, FLAGS.max_qa_length)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", FLAGS.train_step)
        train_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=FLAGS.train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.train_batch_size,
            multiple=len(label_list))
        estimator.train(input_fn=train_input_fn,
                        max_steps=int(FLAGS.train_step))

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(classifier_utils.PaddingInputExample())

        if not tf.gfile.Exists(FLAGS.eval_file):
            race_utils.file_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
                FLAGS.eval_file, FLAGS.max_qa_length)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=FLAGS.eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.eval_batch_size,
            multiple=len(label_list))

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if idx != "best" and int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        key_name = "eval_accuracy"
        if tf.gfile.Exists(checkpoint_path + ".index"):
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps,
                                        checkpoint_path=checkpoint_path)
            best_perf = result[key_name]
            global_step = result["global_step"]
        else:
            global_step = -1
            best_perf = -1
            checkpoint_path = None
        writer = tf.gfile.GFile(output_eval_file, "w")
        while global_step < FLAGS.train_step:
            steps_and_files = {}
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                    if cur_filename.split("-")[-1] == "best":
                        continue
                    gstep = int(cur_filename.split("-")[-1])
                    if gstep not in steps_and_files:
                        tf.logging.info(
                            "Add {} to eval list.".format(cur_filename))
                        steps_and_files[gstep] = cur_filename
            tf.logging.info("found {} files.".format(len(steps_and_files)))
            # steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
            if not steps_and_files:
                tf.logging.info(
                    "found 0 file, global step: {}. Sleeping.".format(
                        global_step))
                time.sleep(1)
            else:
                for ele in sorted(steps_and_files.items()):
                    step, checkpoint_path = ele
                    if global_step >= step:
                        if len(_find_valid_cands(step)) > 1:
                            for ext in [
                                    "meta", "data-00000-of-00001", "index"
                            ]:
                                src_ckpt = checkpoint_path + ".{}".format(ext)
                                tf.logging.info("removing {}".format(src_ckpt))
                                tf.gfile.Remove(src_ckpt)
                        continue
                    result = estimator.evaluate(
                        input_fn=eval_input_fn,
                        steps=eval_steps,
                        checkpoint_path=checkpoint_path)
                    global_step = result["global_step"]
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write("best = {}\n".format(best_perf))
                    if result[key_name] > best_perf:
                        best_perf = result[key_name]
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tgt_ckpt = checkpoint_path.rsplit(
                                "-", 1)[0] + "-best.{}".format(ext)
                            tf.logging.info("saving {} to {}".format(
                                src_ckpt, tgt_ckpt))
                            tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
                            writer.write("saved {} to {}\n".format(
                                src_ckpt, tgt_ckpt))

                    if len(_find_valid_cands(global_step)) > 1:
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tf.logging.info("removing {}".format(src_ckpt))
                            tf.gfile.Remove(src_ckpt)
                    writer.write("=" * 50 + "\n")
        writer.close()
    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        # Default to a full pass when not on TPU; overridden below for TPU.
        predict_steps = None
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(classifier_utils.PaddingInputExample())
            assert len(predict_examples) % FLAGS.predict_batch_size == 0
            predict_steps = int(
                len(predict_examples) // FLAGS.predict_batch_size)

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        race_utils.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file, FLAGS.max_qa_length)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.predict_batch_size,
            multiple=len(label_list))

        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        result = estimator.evaluate(input_fn=predict_input_fn,
                                    steps=predict_steps,
                                    checkpoint_path=checkpoint_path)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "predict_results.txt")
        with tf.gfile.GFile(output_predict_file, "w") as pred_writer:
            # num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            pred_writer.write("***** Predict results *****\n")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                pred_writer.write("%s = %s\n" % (key, str(result[key])))
            pred_writer.write("best = {}\n".format(best_perf))
# Example 10
def main():
    MAX_SEQ_LENGTH, LABELS_LIST, VOCAB_FILE_PATH = get_config("cn")
    credentials = grpc.ssl_channel_credentials(
        root_certificates=ROOT_CERT.encode())
    channel = grpc.secure_channel(
        '{}:{}'.format(MODEL_SERVER_HOST, MODEL_SERVER_PORT), credentials)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

    # Get the input sentence.
    # sentence = str(globals.request.headers.getlist('Text')[0])
    # sentence = globals.request.form.to_dict()
    sentence = "配置很不错,有很多的贴心配置,让人感到很温暖"

    # convert single sentence to feature
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE_PATH,
                                           do_lower_case=True)
    example = run_classifier.InputExample(
        guid="test-0",
        text_a=tokenization.convert_to_unicode(sentence),
        text_b=None,
        label=LABELS_LIST[0])
    feature = run_classifier.convert_single_example(0, example, LABELS_LIST,
                                                    MAX_SEQ_LENGTH, tokenizer)

    # Build the model inputs.
    input_ids = np.reshape([feature.input_ids], (1, MAX_SEQ_LENGTH))
    input_mask = np.reshape([feature.input_mask], (1, MAX_SEQ_LENGTH))
    segment_ids = np.reshape([feature.segment_ids], (1, MAX_SEQ_LENGTH))
    label_ids = [feature.label_id]

    # Construct the request to tensorflow serving
    request = predict_pb2.PredictRequest()
    request.model_spec.name = MODEL_NAME
    request.model_spec.signature_name = 'serving_default'

    # Package the inputs into the request; the input format must match the
    # model's serving signature.
    request.inputs['input_ids'].CopyFrom(
        tf.contrib.util.make_tensor_proto(input_ids,
                                          shape=[1, MAX_SEQ_LENGTH],
                                          dtype=tf.int32))
    request.inputs['input_mask'].CopyFrom(
        tf.contrib.util.make_tensor_proto(input_mask,
                                          shape=[1, MAX_SEQ_LENGTH],
                                          dtype=tf.int32))
    request.inputs['label_ids'].CopyFrom(
        tf.contrib.util.make_tensor_proto(label_ids, shape=[1],
                                          dtype=tf.int32))
    request.inputs['segment_ids'].CopyFrom(
        tf.contrib.util.make_tensor_proto(segment_ids,
                                          shape=[1, MAX_SEQ_LENGTH],
                                          dtype=tf.int32))

    # Run the prediction RPC with a 100-second timeout.
    result = stub.Predict(request, 100, metadata=metadata_transformer())

    # parse the result
    probabilities_tensor_proto = result.outputs["probabilities"]
    probabilities = list(probabilities_tensor_proto.float_val)
    probabilities_np = np.array(probabilities)
    top3_index_np = probabilities_np.argsort()[-3:][::-1]
    probabilities_top3 = probabilities_np[top3_index_np]
    label_top3 = np.array(LABELS_LIST)[top3_index_np]
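    # Illustrative example: for probabilities [0.1, 0.5, 0.2, 0.15, 0.05],
    # argsort()[-3:][::-1] gives [1, 2, 3], i.e. the indices of the three
    # highest scores in descending order.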
    # shape = tf.TensorShape(probabilities_tensor_proto.tensor_shape)
    # probabilities = np.array(probabilities_tensor_proto.float_val).reshape(
    #     shape.as_list())
    result_list = []
    for index in range(3):
        result_list.append({
            "label": label_top3[index],
            "score": str(probabilities_top3[index])
        })
    output_json = {"predictions": [{"results": result_list}]}
    return Response(json.dumps(output_json), mimetype='application/json')
def __init__(self, is_training):
    self.is_training = is_training
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
# Example 12
maxlen = 512

custom_objects = get_custom_objects()
custom_objects["tf"] = tf

model = load_model("../bert_best_finetuned.h5", custom_objects=custom_objects)

with open("../old_complete_output.json_class_labels.txt") as f:
    label_mapping = np.array(json.load(f))

with open("../mesh_mapping.json") as f:
    mesh_mapping = json.load(f)

graph = tf.get_default_graph()
tokenizer = tokenization.FullTokenizer("../biobert_pubmed/vocab.txt",
                                       do_lower_case=False)


def make_multilabel_prediction(abstract):

    abstract = ["[CLS]"
                ] + tokenizer.tokenize(abstract)[0:maxlen - 2] + ["[SEP]"]
    vocab = tokenizer.vocab

    print(abstract)

    token_vectors = np.asarray([vocab[token] for token in abstract] + [0] *
                               (maxlen - len(abstract)))

    # Model expects a list of samples, we only have one.
    token_vectors = np.asarray([token_vectors])
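    # The original example breaks off here. A hedged sketch (not the author's
    # code) of how the prediction step might continue for a keras-bert style
    # model that expects [token_ids, segment_ids] as its two inputs:
    segment_vectors = np.zeros_like(token_vectors)  # single-segment input
    with graph.as_default():
        probabilities = model.predict([token_vectors, segment_vectors])[0]
    # Multi-label decision with an assumed 0.5 threshold, mapped back to labels.
    predicted_labels = label_mapping[probabilities > 0.5]
    print(predicted_labels)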
Example #13

              encoding='utf-8') as detail:
        for idx in range(len(logits_list)):
            item = {}
            item['trans'] = trans_list[idx]
            item['lengths'] = lengths_list[idx]
            item['logit'] = logits_list[idx]
            item['pred'] = y_pred_list[idx]
            item['ldct_list'] = ldct_list[idx]
            detail.write(
                json.dumps(item, ensure_ascii=False, cls=NpEncoder) + '\n')


if __name__ == '__main__':
    config = Config()
    vocab_file = config.vocab_file
    do_lower_case = False
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    print('Predicting test.txt..........')
    dev_iter = DataIterator(config.batch_size,
                            data_file=result_data_dir + 'test.txt',
                            use_bert=config.use_bert,
                            seq_length=config.sequence_length,
                            is_test=True,
                            tokenizer=tokenizer)
    # print('Predicting dev.txt..........')
    # dev_iter = DataIterator(config.batch_size, data_file=result_data_dir + 'dev.txt', use_bert=config.use_bert,
    #                         seq_length=config.sequence_length, is_test=True, tokenizer=tokenizer)

    set_test(dev_iter, config.checkpoint_path)
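# The detail-writing loop above serializes numpy values with `cls=NpEncoder`,
# whose definition is not part of this snippet. A typical implementation
# (an assumption, not the original author's code) looks like:
import json

import numpy as np


class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to plain Python types."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)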
Example #14
def main(_):
    logging.set_verbosity(logging.INFO)
    processors = {"ner": NerProcessor}
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        print("# training examples", len(train_examples))
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        _, _ = filed_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer,
            train_file)
        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len(train_examples))
        logging.info("  Batch size = %d", FLAGS.train_batch_size)
        logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        batch_tokens, batch_labels = filed_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
            eval_file)

        logging.info("***** Running evaluation *****")
        logging.info("  Num examples = %d", len(eval_examples))
        logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # if FLAGS.use_tpu:
        #     eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
        # eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.evaluate(input_fn=eval_input_fn)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with open(output_eval_file, "w", encoding="utf-8") as wf:
            logging.info("***** Eval results *****")
            confusion_matrix = result["confusion_matrix"]
            p, r, f = metrics.calculate(confusion_matrix, len(label_list) - 1)
            logging.info("***********************************************")
            logging.info("********************P = %s*********************",
                         str(p))
            logging.info("********************R = %s*********************",
                         str(r))
            logging.info("********************F = %s*********************",
                         str(f))
            logging.info("***********************************************")

    if FLAGS.do_predict:
        with open(FLAGS.middle_output + '/label2id.pkl', 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        batch_tokens, batch_labels = filed_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)

        logging.info("***** Running prediction*****")
        logging.info("  Num examples = %d", len(predict_examples))
        logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        # A tag of "X" marks a WordPiece continuation and belongs to the preceding
        # token; for convenient evaluation with conlleval.pl we simply discard it.
        Writer(output_predict_file, result, batch_tokens, batch_labels,
               id2label)
Example #15
    def __init__(self, config):
        self.config = config
        self.max_segment_len = config['max_segment_len']
        self.max_span_width = config["max_span_width"]
        self.genres = {g: i for i, g in enumerate(config["genres"])}
        self.subtoken_maps = {}
        self.gold = {}
        self.eval_data = None  # Load eval data lazily.
        self.eval_test_data = None
        self.bert_config = modeling.BertConfig.from_json_file(
            config["bert_config_file"])
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=config['vocab_file'], do_lower_case=False)

        input_props = []
        input_props.append((tf.int32, [None, None]))  # input_ids.
        input_props.append((tf.int32, [None, None]))  # input_mask
        input_props.append((tf.int32, [None]))  # Text lengths.
        input_props.append((tf.int32, [None, None]))  # Speaker IDs.
        input_props.append((tf.int32, []))  # Genre.
        input_props.append((tf.bool, []))  # Is training.
        input_props.append((tf.int32, [None]))  # Gold starts.
        input_props.append((tf.int32, [None]))  # Gold ends.
        input_props.append((tf.int32, [None]))  # Cluster ids.
        input_props.append((tf.int32, [None]))  # Sentence Map

        self.queue_input_tensors = [
            tf.placeholder(dtype, shape) for dtype, shape in input_props
        ]
        dtypes, shapes = zip(*input_props)
        queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes)
        self.enqueue_op = queue.enqueue(self.queue_input_tensors)
        self.input_tensors = queue.dequeue()

        self.predictions, self.loss = self.get_predictions_and_loss(
            *self.input_tensors)
        # bert stuff
        tvars = tf.trainable_variables()
        # If you're using TF weights only, tf_checkpoint and init_checkpoint can be the same
        # Get the assignment map from the tensorflow checkpoint. Depending on the extension, use TF/Pytorch to load weights.
        assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint(
            tvars, config['tf_checkpoint'])
        init_from_checkpoint = tf.train.init_from_checkpoint if config[
            'init_checkpoint'].endswith(
                'ckpt') else load_from_pytorch_checkpoint
        init_from_checkpoint(config['init_checkpoint'], assignment_map)
        print("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            # tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
            # init_string)
            print("  name = %s, shape = %s%s" %
                  (var.name, var.shape, init_string))

        num_train_steps = int(self.config['num_docs'] *
                              self.config['num_epochs'])
        num_warmup_steps = int(num_train_steps * 0.1)
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op, self.bert_lr, self.task_lr = optimization.create_custom_optimizer(
            tvars,
            self.loss,
            self.config['bert_learning_rate'],
            self.config['task_learning_rate'],
            num_train_steps,
            num_warmup_steps,
            False,
            self.global_step,
            freeze=-1,
            task_opt=self.config['task_optimizer'],
            eps=config['adam_eps'])
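    # The PaddingFIFOQueue above is normally fed from a background thread that
    # repeatedly runs `enqueue_op`. A minimal sketch (requires `import threading`;
    # `tensorize_examples()` is a hypothetical helper yielding tuples that match
    # `input_props`, it is not part of the original snippet):
    def start_enqueue_thread(self, session):
        def _enqueue_loop():
            while True:
                for tensorized_example in self.tensorize_examples():
                    feed_dict = dict(
                        zip(self.queue_input_tensors, tensorized_example))
                    session.run(self.enqueue_op, feed_dict=feed_dict)

        threading.Thread(target=_enqueue_loop, daemon=True).start()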
Example #16

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    examples = read_examples(FLAGS.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=FLAGS.max_seq_length,
                                            tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(features=features,
                                seq_length=FLAGS.max_seq_length)

    with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
                                                 "w")) as writer:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            all_features = []
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = result["layer_output_%d" % j]
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [
                        round(float(x), 6)
                        for x in layer_output[i:(i + 1)].flat
                    ]
                    all_layers.append(layers)
                features = collections.OrderedDict()
                features["token"] = token
                features["layers"] = all_layers
                all_features.append(features)
            output_json["features"] = all_features
            writer.write(json.dumps(output_json) + "\n")
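# Each line written above is one self-contained JSON record. A short sketch of
# reading a record back and recovering per-token, per-layer vectors (assumes the
# script has finished and FLAGS.output_file is readable as plain text):
with open(FLAGS.output_file) as f:
    record = json.loads(f.readline())

first_token = record["features"][0]
vectors = {layer["index"]: layer["values"] for layer in first_token["layers"]}
print(first_token["token"], sorted(vectors.keys()))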
Example #17
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Answering ablities
    parser.add_argument("--span_extraction",
                        default=False,
                        action='store_true',
                        help="Whether to use span extraction.")
    parser.add_argument("--addition_subtraction",
                        default=False,
                        action='store_true',
                        help="Whether to use addition subtraction.")
    parser.add_argument("--counting",
                        default=False,
                        action='store_true',
                        help="Whether to use counting.")
    parser.add_argument("--negation",
                        default=False,
                        action='store_true',
                        help="Whether to use negation.")
    parser.add_argument("--include_more_numbers",
                        default=True,
                        action='store_true',
                        help="Whether to include more numbers.")
    parser.add_argument("--beam_size",
                        default=3,
                        type=int,
                        help="The size of beam search.")
    parser.add_argument("--max_count",
                        default=4,
                        type=int,
                        help="The maximal number of add_sub expressions.")
    parser.add_argument("--max_answer_number",
                        default=8,
                        type=int,
                        help="The maximal number of answers.")

    ## Other parameters
    parser.add_argument("--do_debug",
                        default=False,
                        action='store_true',
                        help="Whether to run in debug mode.")
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="DROP json for training. E.g., drop_dataset_train.json")
    parser.add_argument("--predict_file",
                        default=None,
                        type=str,
                        help="DROP json for predictions.")
    parser.add_argument(
        "--prediction_dir",
        default=None,
        type=str,
        help="Nitish added: directory for predictions and metrircs")
    parser.add_argument("--predictions_json",
                        default=None,
                        type=str,
                        help="Nitish added: file name to write predictions")
    parser.add_argument("--metrics_json",
                        default=None,
                        type=str,
                        help="Nitish added: filename for metrics")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.05,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--length_heuristic",
                        default=0.05,
                        type=float,
                        help="Weight on length heuristic.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--data_parallel",
                        default=False,
                        action='store_true',
                        help="Whether not to use data parallel")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU"
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=128,
        help=
        'Loss scaling, positive power of 2 values can improve fp16 convergence.'
    )

    args = parser.parse_args()

    if not args.span_extraction and not args.addition_subtraction and not args.counting and not args.negation:
        raise ValueError(
            "At least one of `span_extraction` or `addition_subtraction` or `counting` or `negation` must be True."
        )

    args.answering_abilities = []
    if args.span_extraction:
        args.answering_abilities.append("span_extraction")
    if args.addition_subtraction:
        args.answering_abilities.append("addition_subtraction")
    if args.counting:
        args.answering_abilities.append("counting")
    if args.negation:
        args.answering_abilities.append("negation")
    logger.info("Answering abilities: {}".format(args.answering_abilities))

    assert "span_extraction" in args.answering_abilities and "addition_subtraction" in args.answering_abilities

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logger.info('output_dir: {}'.format(args.output_dir))
    save_path = os.path.join(args.output_dir, 'checkpoint.pth.tar')
    log_path = os.path.join(args.output_dir, 'performance.txt')
    network_path = os.path.join(args.output_dir, 'network.txt')
    parameter_path = os.path.join(args.output_dir, 'parameter.txt')

    f = open(parameter_path, "w")
    for arg in sorted(vars(args)):
        print("{}: {}".format(arg, getattr(args, arg)), file=f)
    f.close()

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info(
        "torch_version: {} device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(torch.__version__, device, n_gpu, bool(args.local_rank != -1),
                args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # --- Prepare model ---
    logger.info("***** Preparing model *****")
    model = MTMSN(bert_config, args.answering_abilities,
                  args.max_answer_number)
    if args.init_checkpoint is not None and not os.path.isfile(save_path):
        logger.info("Loading model from pretrained checkpoint: {}".format(
            args.init_checkpoint))
        model = bert_load_state_dict(
            model, torch.load(args.init_checkpoint, map_location='cpu'))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1 or args.data_parallel:
        model = torch.nn.DataParallel(model)

    if os.path.isfile(save_path):
        checkpoint = torch.load(save_path)
        model.load_state_dict(checkpoint['model'])
        logger.info(
            "Loading model from finetuned checkpoint: '{}' (step {}, epoch {})"
            .format(save_path, checkpoint['step'], checkpoint['epoch']))
    f = open(network_path, "w")
    for n, param in model.named_parameters():
        print("name: {}, size: {}, dtype: {}, requires_grad: {}".format(
            n, param.size(), param.dtype, param.requires_grad),
              file=f)
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total trainable parameters: {}".format(total_trainable_params),
          file=f)
    print("Total parameters: {}".format(total_params), file=f)
    f.close()

    # --- Prepare data ---
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples, train_features, num_train_steps = None, None, None
    eval_examples, eval_features = None, None
    if args.do_train:
        logger.info("***** Preparing training *****")
        train_examples, train_features, num_train_steps = read_train_data(
            args, tokenizer, logger)
        logger.info("***** Preparing evaluation *****")
        eval_examples, eval_features = read_eval_data(args, tokenizer, logger)
    if args.do_predict and eval_features is None:
        logger.info("***** Preparing prediction *****")
        eval_examples, eval_features = read_eval_data(args, tokenizer, logger)

    # --- Prepare optimizer ---
    logger.info("***** Preparing optimizer *****")
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if n not in no_decay],
        'weight_decay_rate':
        0.01
    }, {
        'params': [p for n, p in param_optimizer if n in no_decay],
        'weight_decay_rate':
        0.0
    }]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step, global_epoch = 0, 1
    if os.path.isfile(save_path) and args.do_train:
        checkpoint = torch.load(save_path)
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info(
            "Load optimizer from finetuned checkpoint: '{}' (step {}, epoch {})"
            .format(save_path, checkpoint['step'], checkpoint['epoch']))
        global_step = checkpoint['step']
        global_epoch = checkpoint['epoch'] + 1

    # --- Run training ---
    if args.do_train and global_epoch < int(args.num_train_epochs) + 1:
        logger.info("***** Running training *****")
        best_f1 = 0
        for epoch in range(global_epoch, int(args.num_train_epochs) + 1):
            logger.info("***** Epoch: {} *****".format(epoch))
            global_step, model, best_f1 = run_train_epoch(
                args, global_step, n_gpu, device, model, param_optimizer,
                optimizer, train_examples, train_features, eval_examples,
                eval_features, logger, log_path, save_path, best_f1, epoch)

    # --- Run prediction ---
    if args.do_predict:
        logger.info("***** Running prediction *****")
        # restore from best checkpoint
        if save_path and os.path.isfile(save_path):
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            logger.info(
                "Loading model from finetuned checkpoint: '{}' (step {}, epoch {})"
                .format(save_path, checkpoint['step'], checkpoint['epoch']))
            global_step = checkpoint['step']

            torch.save(
                {
                    'model': model.state_dict(),
                    'step': checkpoint['step'],
                    'epoch': checkpoint['epoch']
                }, save_path)

        model.eval()
        metrics = evaluate(args,
                           model,
                           device,
                           eval_examples,
                           eval_features,
                           logger,
                           write_pred=True)
        metrics_path = os.path.join(args.prediction_dir, args.metrics_json)
        f = open(metrics_path, "w")
        metrics_dict = {'f1': metrics['f1'], 'em': metrics['em']}
        json.dump(metrics_dict, f)
        f.close()
        logger.info("Predition written to : {}".format(metrics_path))
Example #18

    # evaluate
    tf.logging.info("Precision, Recall and F1-Score...")
    tf.logging.info(
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=label_list))

    tf.logging.info("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    tf.logging.info(cm)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_TCM.py [train / test]""")

    tf.logging.set_verbosity(tf.logging.INFO)
    config = TextConfig()
    label_list = Processor().get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
                                           do_lower_case=False)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    elif sys.argv[1] == 'test':
        test()
    else:
        exit()
Example #19

def main(_):

    tf.logging.set_verbosity(tf.logging.INFO)
    # to make sure the output_dir exists
    tf.gfile.MakeDirs(FLAGS.output_dir)

    processors = {"ner": NerProcessor}

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,  # +1 because label ids start from 1 (0 is padding)
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        filed_based_convert_examples_to_features(train_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, train_file)

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            is_eval=False,
            drop_remainder=True)

        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).

            # Pad to a multiple of eval_batch_size * num_tpu_cores (8 cores per TPU).
            while len(eval_examples) % (FLAGS.eval_batch_size *
                                        FLAGS.num_tpu_cores) != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(eval_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
            # The total batch size should be a multiple of 64 (8 per TPU core), and feature dimensions should be a multiple of 128
            # https://cloud.google.com/tpu/docs/troubleshooting
            # eval_steps = eval_steps // 8 * 8
            # solved by padding

        eval_drop_remainder = True if FLAGS.use_tpu else False
        #eval_drop_remainder = False

        if eval_steps is None:
            tf.logging.info("  eval_steps: None")
        else:
            tf.logging.info("  eval_steps = %d", eval_steps)

        tf.logging.info("  eval_drop_remainder = %d", int(eval_drop_remainder))

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            is_eval=True,
            drop_remainder=eval_drop_remainder)

        try:
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps)
        except tf.errors.OutOfRangeError:
            tf.logging.info("Out Of Range error cached")

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        #token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                           'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        #if os.path.exists(token_path):
        #    os.remove(token_path)
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 mode="test")

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            is_eval=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(example_col_sep)
                label_token = str(predict_line.label).split(example_col_sep)
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                for id in prediction:
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    # Not sure why, but an "idx out of range" error can occur here;
                    # we catch it below and emit an empty line for this example.
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with tf.gfile.GFile(output_predict_file, 'w') as writer:
            result_to_pair(writer)

        with tf.gfile.GFile(output_predict_file, 'r') as reader:
            eval_result = return_report(reader)
            print(eval_result)
Example #20
        config.read(os.path.join(directory, 'defaults.cfg'))

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    set_session(tf.Session(config=gpu_config))

    # concepts
    concept, smpl_dev_data, dictionary, corpus_dev_sampled = load_concepts(
        config['terminology']['dict_file'])
    # mentions
    corpus_train = load_mentions(config['corpus']['training_file'],
                                 'training corpus')
    corpus_dev = load_mentions(config['corpus']['development_file'],
                               'dev corpus')

    tokenizer = tokenization.FullTokenizer(config['bert']['vocab_file'],
                                           do_lower_case=False)

    # FIXME: only using one concept name per mention
    positives_training, positives_dev, positives_dev_sampled = load_data(
        'data/gitig_positive_indices.pickle')
    del positives_dev
    positives_training = [(_, span.lower()) for _, span in positives_training]
    positives_dev_sampled = [(_, span.lower())
                             for _, span in positives_dev_sampled]

    # generators for training and validation instances
    train_examples = examples(concept, positives_training, tokenizer,
                              config.getint('training', 'neg_count'),
                              config.getint('training', 'mmaxlen'),
                              config.getint('training', 'cmaxlen'))
    dev_examples = examples(concept, positives_dev_sampled, tokenizer,
Example #21

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "bc5cdr": BC5CDRProcessor,
        "clefe": CLEFEProcessor,
    }
    # if not FLAGS.do_train and not FLAGS.do_eval:
    #    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        filed_based_convert_examples_to_features(train_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(eval_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        if FLAGS.use_tpu:
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.Open(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_predict:
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                           'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if tf.gfile.Exists(token_path):
            tf.gfile.Remove(token_path)
        predict_examples = processor.get_test_examples(FLAGS.data_dir)

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 mode="test")

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        prf = estimator.evaluate(input_fn=predict_input_fn, steps=None)
        output_test_file = os.path.join(FLAGS.output_dir, "test_results.txt")
        with tf.gfile.Open(output_test_file, "w") as writer:
            tf.logging.info("***** TEST results *****")
            for key in sorted(prf.keys()):
                tf.logging.info("  %s = %s", key, str(prf[key]))
                writer.write("%s = %s\n" % (key, str(prf[key])))

        result = estimator.predict(input_fn=predict_input_fn)
        result = list(result)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        with tf.gfile.Open(output_predict_file, 'w') as writer:
            print(id2label)
            for prediction in result:
                output_line = "\n".join(id2label[id]
                                        for id in prediction if id != 0) + "\n"
                writer.write(output_line)

        output_predict_file = os.path.join(FLAGS.output_dir, "test_labels.txt")
        output_err_file = os.path.join(FLAGS.output_dir,
                                       "test_labels_errs.txt")
        result_to_pair(predict_examples, result, id2label, output_predict_file,
                       output_err_file)

        tf.logging.info('Reading: %s', output_predict_file)
        with tf.gfile.Open(output_predict_file, "r") as f:
            counts = evaluate(f)
        eval_result = report_notprint(counts)
        print(''.join(eval_result))
        with tf.gfile.Open(
                os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'),
                'w') as fd:
            fd.write(''.join(eval_result))
Example #22
]
test_documents = [
    os.path.join(documents_dir, name + '.json') for name in test_domains
]

train_mentions = mentions_dir + '/train.json'
val_mentions = mentions_dir + '/val.json'
test_mentions = mentions_dir + '/test.json'

train_cands = cands_dir + '/train.json'
val_cands = cands_dir + '/val.json'
test_cands = cands_dir + '/test.json'
tf.logging.set_verbosity(tf.logging.INFO)

tokenizer = tokenization.FullTokenizer(
    vocab_file=dict_path, do_lower_case=('uncased' in BERT_BASE_DIR))

from collections import namedtuple

TrainingInstance = namedtuple(
    'TrainingInstance', 'mention_context_id mention_context_title '
    'mention_context_tokens cand_tokens mention_start'
    ' mention_end mention_guid cand_guids label_id')


def pad_sequence(tokens, max_len):
    assert len(tokens) <= max_len
    return tokens + [0] * (max_len - len(tokens))
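
# A small usage sketch for pad_sequence (the sentence text and max_len value
# are assumptions, not part of the original example):
tokens = ["[CLS]"] + tokenizer.tokenize("Michael Jordan played basketball.") + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
padded_ids = pad_sequence(token_ids, max_len=128)
input_mask = [1] * len(token_ids) + [0] * (128 - len(token_ids))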

Example #23
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.data_type == "onehop":
        dataset_class = input_fns.OneHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif FLAGS.data_type == "twohop":
        dataset_class = input_fns.TwoHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif FLAGS.data_type == "threehop":
        dataset_class = input_fns.ThreeHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif (FLAGS.data_type == "wikimovie" or FLAGS.data_type == "wikimovie-2hop"
          or FLAGS.data_type == "wikimovie-3hop"):
        dataset_class = input_fns.WikiMovieDataset
        eval_fn = evaluate.wikimovie_eval_fn
    elif FLAGS.data_type == "hotpotqa":
        dataset_class = input_fns.HotpotQADataset
        eval_fn = evaluate.hotpot_eval_fn
    if FLAGS.model_type == "onehop":
        create_model_fn = model_fns.create_onehop_model
    elif FLAGS.model_type == "twohop":
        create_model_fn = model_fns.create_twohop_model
    elif FLAGS.model_type == "twohop-cascaded":
        create_model_fn = model_fns.create_twohopcascade_model
    elif FLAGS.model_type == "threehop":
        create_model_fn = functools.partial(model_fns.create_twohop_model,
                                            num_hops=3)
    elif FLAGS.model_type == "threehop-cascaded":
        create_model_fn = functools.partial(
            model_fns.create_twohopcascade_model, num_hops=3)
    elif FLAGS.model_type == "wikimovie":
        create_model_fn = model_fns.create_wikimovie_model
    elif FLAGS.model_type == "wikimovie-2hop":
        create_model_fn = functools.partial(model_fns.create_wikimovie_model,
                                            num_hops=2)
    elif FLAGS.model_type == "wikimovie-3hop":
        create_model_fn = functools.partial(model_fns.create_wikimovie_model,
                                            num_hops=3)
    elif FLAGS.model_type == "hotpotqa":
        create_model_fn = functools.partial(model_fns.create_hotpotqa_model,
                                            num_hops=FLAGS.num_hops)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # Load mention and entity files.
    mention2text = json.load(
        tf.gfile.Open(os.path.join(FLAGS.train_data_dir, "mention2text.json")))
    tf.logging.info("Loading metadata about entities and mentions...")
    entity2id, entity2name = json.load(
        tf.gfile.Open(os.path.join(FLAGS.train_data_dir, "entities.json")))
    entityid2name = {str(i): entity2name[e] for e, i in entity2id.items()}
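    # entityid2name maps integer entity ids (as strings) back to surface names,
    # and is passed to the eval functions below to make predictions readable.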
    # all_paragraphs = json.load(tf.gfile.Open(os.path.join(
    #     FLAGS.train_data_dir, "subparas.json")))
    # all_mentions = np.load(tf.gfile.Open(os.path.join(
    #     FLAGS.train_data_dir, "mentions.npy")))
    all_paragraphs = None
    all_mentions = None

    qa_config = QAConfig(
        qry_layers_to_use=FLAGS.qry_layers_to_use,
        qry_aggregation_fn=FLAGS.qry_aggregation_fn,
        dropout=FLAGS.question_dropout,
        qry_num_layers=FLAGS.question_num_layers,
        projection_dim=FLAGS.projection_dim,
        load_only_bert=FLAGS.load_only_bert,
        num_entities=len(entity2id),
        max_entity_len=FLAGS.max_entity_len,
        ensure_answer_sparse=FLAGS.ensure_answer_sparse,
        ensure_answer_dense=FLAGS.ensure_answer_dense,
        train_with_sparse=FLAGS.train_with_sparse,
        predict_with_sparse=FLAGS.predict_with_sparse,
        fix_sparse_to_one=FLAGS.fix_sparse_to_one,
        supervision=FLAGS.supervision,
        l2_normalize_db=FLAGS.l2_normalize_db,
        entity_score_aggregation_fn=FLAGS.entity_score_aggregation_fn,
        entity_score_threshold=FLAGS.entity_score_threshold,
        softmax_temperature=FLAGS.softmax_temperature,
        sparse_reduce_fn=FLAGS.sparse_reduce_fn,
        intermediate_loss=FLAGS.intermediate_loss,
        light=FLAGS.light,
        sparse_strategy=FLAGS.sparse_strategy,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    mips_config = MIPSConfig(ckpt_path=os.path.join(FLAGS.train_data_dir,
                                                    "mention_feats"),
                             ckpt_var_name="db_emb",
                             num_mentions=len(mention2text),
                             emb_size=FLAGS.projection_dim * 2,
                             num_neighbors=FLAGS.num_mips_neighbors)
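    # The MIPS index above searches precomputed mention features; emb_size is
    # projection_dim * 2 because each mention embedding concatenates projected
    # start- and end-token vectors (see the do_embed step in the preprocessing
    # example later in this listing).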

    validate_flags_or_throw()

    tf.gfile.MakeDirs(FLAGS.output_dir)

    if FLAGS.do_train:
        json.dump(
            tf.app.flags.FLAGS.flag_values_dict(),
            tf.gfile.Open(os.path.join(FLAGS.output_dir, "flags.json"), "w"))

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=8,
        tpu_config=tf.estimator.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        session_config=tf.ConfigProto(log_device_placement=False))

    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_dataset = dataset_class(
            in_file=FLAGS.train_file,
            tokenizer=tokenizer,
            subject_mention_probability=FLAGS.subject_mention_probability,
            max_qry_length=FLAGS.max_query_length,
            is_training=True,
            entity2id=entity2id,
            tfrecord_filename=os.path.join(FLAGS.output_dir,
                                           "train.tf_record"))
        num_train_steps = int(train_dataset.num_examples /
                              FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    summary_obj = None
    model_fn = model_fn_builder(
        bert_config=bert_config,
        qa_config=qa_config,
        mips_config=mips_config,
        init_checkpoint=FLAGS.init_checkpoint,
        e2m_checkpoint=os.path.join(FLAGS.train_data_dir, "ent2ment.npz"),
        m2e_checkpoint=os.path.join(FLAGS.train_data_dir, "coref.npz"),
        entity_id_checkpoint=os.path.join(FLAGS.train_data_dir, "entity_ids"),
        entity_mask_checkpoint=os.path.join(FLAGS.train_data_dir,
                                            "entity_mask"),
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        create_model_fn=create_model_fn,
        summary_obj=summary_obj)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", train_dataset.num_examples)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train(train_dataset, estimator, num_train_steps)

    if FLAGS.do_predict:
        eval_dataset = dataset_class(in_file=FLAGS.predict_file,
                                     tokenizer=tokenizer,
                                     subject_mention_probability=0.0,
                                     max_qry_length=FLAGS.max_query_length,
                                     is_training=False,
                                     entity2id=entity2id,
                                     tfrecord_filename=os.path.join(
                                         FLAGS.output_dir, "eval.tf_record"))

        continuous_eval(eval_dataset,
                        estimator,
                        mention2text,
                        entityid2name,
                        qa_config.supervision,
                        eval_fn,
                        paragraphs=all_paragraphs,
                        mentions=all_mentions)

    if FLAGS.do_test:
        # Load mention and entity files.
        mention2text = json.load(
            tf.gfile.Open(
                os.path.join(FLAGS.test_data_dir, "mention2text.json")))
        entity2id, entity2name = json.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "entities.json")))
        entityid2name = {str(i): entity2name[e] for e, i in entity2id.items()}
        all_paragraphs = json.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "subparas.json")))
        all_mentions = np.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "mentions.npy")))

        qa_config.num_entities = len(entity2id)
        mips_config = MIPSConfig(ckpt_path=os.path.join(
            FLAGS.test_data_dir, "mention_feats"),
                                 ckpt_var_name="db_emb",
                                 num_mentions=len(mention2text),
                                 emb_size=FLAGS.projection_dim * 2,
                                 num_neighbors=FLAGS.num_mips_neighbors)

        model_fn = model_fn_builder(
            bert_config=bert_config,
            qa_config=qa_config,
            mips_config=mips_config,
            init_checkpoint=FLAGS.init_checkpoint,
            e2m_checkpoint=os.path.join(FLAGS.test_data_dir, "ent2ment.npz"),
            m2e_checkpoint=os.path.join(FLAGS.test_data_dir, "coref.npz"),
            entity_id_checkpoint=os.path.join(FLAGS.test_data_dir,
                                              "entity_ids"),
            entity_mask_checkpoint=os.path.join(FLAGS.test_data_dir,
                                                "entity_mask"),
            learning_rate=FLAGS.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu,
            create_model_fn=create_model_fn,
            summary_obj=summary_obj)
        estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

        eval_dataset = dataset_class(in_file=FLAGS.test_file,
                                     tokenizer=tokenizer,
                                     subject_mention_probability=0.0,
                                     max_qry_length=FLAGS.max_query_length,
                                     is_training=False,
                                     entity2id=entity2id,
                                     tfrecord_filename=os.path.join(
                                         FLAGS.output_dir, "test.tf_record"))

        if tf.gfile.Exists(os.path.join(FLAGS.output_dir, "best_model.meta")):
            ckpt_path = os.path.join(FLAGS.output_dir, "best_model")
        else:
            ckpt_path = None
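        # With ckpt_path left as None, single_eval presumably falls back to the
        # latest checkpoint in output_dir.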
        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "test_predictions.json")
        metrics = single_eval(eval_dataset,
                              estimator,
                              ckpt_path,
                              mention2text,
                              entityid2name,
                              qa_config.supervision,
                              output_prediction_file,
                              eval_fn,
                              paragraphs=all_paragraphs,
                              mentions=all_mentions)
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, "test_metrics.txt"),
                           "w") as fo:
            for metric, value in metrics.items():
                tf.logging.info("%s: %.4f", metric, value)
                fo.write("%s %.4f\n" % (metric, value))
Example #24
def main(_):
  if not tf.gfile.Exists(FLAGS.multihop_output_dir):
    tf.gfile.MakeDirs(FLAGS.multihop_output_dir)

  # Initialize tokenizer.
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # Read entities.
  if FLAGS.do_preprocess:
    tf.logging.info("Reading entities.")
    entity2id, entity2name = {}, {}
    with tf.gfile.Open(FLAGS.entity_file) as f:
      entities = json.load(f)
      tf.logging.info("Read %d entities", len(entities))
      for e, (_, n) in entities.items():
        if e.lower() in entity2id:
          continue
          # tf.logging.warn("%s entity repeated", e)
        entity2id[e.lower()] = len(entity2id)
        entity2name[e.lower()] = n
      tf.logging.info("Kept %d entities", len(entity2id))

  # Read paragraphs, mentions and entities.
  if FLAGS.do_preprocess:
    mentions = []
    ent_rows, ent_cols, ent_vals = [], [], []
    mention2text = {}
    total_sub_paras = [0]
    all_sub_paras = []
    num_skipped_mentions = 0.
    tf.logging.info("Reading paragraphs from %s", FLAGS.wiki_file)
    with tf.gfile.Open(FLAGS.wiki_file) as f:
      for ii, line in tqdm(enumerate(f)):
        if ii == FLAGS.max_total_paragraphs:
          tf.logging.info("Processed maximum number of paragraphs, breaking.")
          break
        if ii > 0 and ii % 100000 == 0:
          tf.logging.info("Skipped / Kept mentions = %.3f",
                          num_skipped_mentions / len(mentions))
        orig_para = json.loads(line.strip())
        if orig_para["kb_id"].lower() not in entity2id:
          tf.logging.warn("%s not in entities. Skipping %s para",
                          orig_para["kb_id"], orig_para["title"])
          continue
        sub_para_objs = _get_sub_paras(orig_para, tokenizer,
                                       FLAGS.max_seq_length, FLAGS.doc_stride,
                                       total_sub_paras)
        for para_obj in sub_para_objs:
          # Add mentions from this paragraph.
          local2global = {}
          title_entity_mention = None
          for im, mention in enumerate(
              para_obj["mentions"][:FLAGS.max_mentions_per_entity]):
            if mention["kb_id"].lower() not in entity2id:
              # tf.logging.warn("%s not in entities. Skipping mention %s",
              #                 mention["kb_id"], mention["text"])
              num_skipped_mentions += 1
              continue
            mention2text[len(mentions)] = mention["text"]
            local2global[im] = len(mentions)
            if mention["kb_id"] == orig_para["kb_id"]:
              title_entity_mention = len(mentions)
            mentions.append(
                (entity2id[mention["kb_id"].lower()], para_obj["id"],
                 mention["start_token"], mention["end_token"]))
          for im, gm in local2global.items():
            # entity to mention matrix.
            ent_rows.append(entity2id[orig_para["kb_id"].lower()])
            ent_cols.append(gm)
            ent_vals.append(1.)
            if title_entity_mention is not None:
              ent_rows.append(mentions[gm][0])
              ent_cols.append(title_entity_mention)
              ent_vals.append(1.)
          all_sub_paras.append(para_obj["tokens"])
        assert len(all_sub_paras) == total_sub_paras[0], (len(all_sub_paras),
                                                          total_sub_paras)
    tf.logging.info("Num paragraphs = %d, Num mentions = %d",
                    total_sub_paras[0], len(mentions))
    tf.logging.info("Saving coreference map.")
    search_utils.write_to_checkpoint(
        "coref", np.array([m[0] for m in mentions], dtype=np.int32), tf.int32,
        os.path.join(FLAGS.multihop_output_dir, "coref.npz"))
    tf.logging.info("Creating entity to mentions matrix.")
    sp_entity2mention = sp.csr_matrix((ent_vals, (ent_rows, ent_cols)),
                                      shape=[len(entity2id),
                                             len(mentions)])
    tf.logging.info("Num nonzero = %d", sp_entity2mention.getnnz())
    tf.logging.info("Saving as ragged tensor %s.", str(sp_entity2mention.shape))
    search_utils.write_ragged_to_checkpoint(
        "ent2ment", sp_entity2mention,
        os.path.join(FLAGS.multihop_output_dir, "ent2ment.npz"))
    tf.logging.info("Saving mentions metadata.")
    np.save(
        tf.gfile.Open(
            os.path.join(FLAGS.multihop_output_dir, "mentions.npy"), "w"),
        np.array(mentions, dtype=np.int64))
    json.dump(
        mention2text,
        tf.gfile.Open(
            os.path.join(FLAGS.multihop_output_dir, "mention2text.json"), "w"))
    tf.logging.info("Saving entities metadata.")
    json.dump([entity2id, entity2name],
              tf.gfile.Open(
                  os.path.join(FLAGS.multihop_output_dir, "entities.json"),
                  "w"))
    tf.logging.info("Saving split paragraphs.")
    json.dump(
        all_sub_paras,
        tf.gfile.Open(
            os.path.join(FLAGS.multihop_output_dir, "subparas.json"), "w"))

  # Store entity tokens.
  if FLAGS.do_preprocess:
    tf.logging.info("Processing entities.")
    entity_ids = np.zeros((len(entity2id), FLAGS.max_entity_length),
                          dtype=np.int32)
    entity_mask = np.zeros((len(entity2id), FLAGS.max_entity_length),
                           dtype=np.float32)
    num_exceed_len = 0.
    for entity in tqdm(entity2id):
      ei = entity2id[entity]
      entity_tokens = tokenizer.tokenize(entity2name[entity])
      entity_token_ids = tokenizer.convert_tokens_to_ids(entity_tokens)
      if len(entity_token_ids) > FLAGS.max_entity_length:
        num_exceed_len += 1
        entity_token_ids = entity_token_ids[:FLAGS.max_entity_length]
      entity_ids[ei, :len(entity_token_ids)] = entity_token_ids
      entity_mask[ei, :len(entity_token_ids)] = 1.
    tf.logging.info("Saving %d entity ids. %d exceed max-length of %d.",
                    len(entity2id), num_exceed_len, FLAGS.max_entity_length)
    search_utils.write_to_checkpoint(
        "entity_ids", entity_ids, tf.int32,
        os.path.join(FLAGS.multihop_output_dir, "entity_ids"))
    search_utils.write_to_checkpoint(
        "entity_mask", entity_mask, tf.float32,
        os.path.join(FLAGS.multihop_output_dir, "entity_mask"))

  # Copy BERT checkpoint for future use.
  if FLAGS.do_preprocess:
    tf.logging.info("Copying BERT checkpoint.")
    if tf.gfile.Exists(os.path.join(FLAGS.pretrain_dir, "best_model.index")):
      bert_ckpt = os.path.join(FLAGS.pretrain_dir, "best_model")
    else:
      bert_ckpt = tf.train.latest_checkpoint(FLAGS.pretrain_dir)
    tf.logging.info("%s.data-00000-of-00001", bert_ckpt)
    tf.gfile.Copy(
        bert_ckpt + ".data-00000-of-00001",
        os.path.join(FLAGS.multihop_output_dir,
                     "bert_init.data-00000-of-00001"),
        overwrite=True)
    tf.logging.info("%s.index", bert_ckpt)
    tf.gfile.Copy(
        bert_ckpt + ".index",
        os.path.join(FLAGS.multihop_output_dir, "bert_init.index"),
        overwrite=True)
    tf.logging.info("%s.meta", bert_ckpt)
    tf.gfile.Copy(
        bert_ckpt + ".meta",
        os.path.join(FLAGS.multihop_output_dir, "bert_init.meta"),
        overwrite=True)

  if FLAGS.do_embed:
    # Get mention embeddings from BERT.
    bert_ckpt = os.path.join(FLAGS.multihop_output_dir, "bert_init")
    if not FLAGS.do_preprocess:
      with tf.gfile.Open(
          os.path.join(FLAGS.multihop_output_dir, "mentions.npy")) as f:
        mentions = np.load(f)
      with tf.gfile.Open(
          os.path.join(FLAGS.multihop_output_dir, "subparas.json")) as f:
        all_sub_paras = json.load(f)
    tf.logging.info("Computing embeddings for %d mentions over %d paras.",
                    len(mentions), len(all_sub_paras))
    shard_size = len(mentions) // FLAGS.num_shards
    bert_predictor = bert_utils_v2.BERTPredictor(tokenizer, bert_ckpt)
    if FLAGS.my_shard is None:
      shard_range = range(FLAGS.num_shards + 1)
    else:
      shard_range = [FLAGS.my_shard]
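    # The per-shard slicing below assumes `mentions` is ordered by sub-paragraph id,
    # so each shard maps to the contiguous paragraph range [min_subp, max_subp].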
    for ns in shard_range:
      min_ = ns * shard_size
      max_ = (ns + 1) * shard_size
      if min_ >= len(mentions):
        break
      if max_ > len(mentions):
        max_ = len(mentions)
      min_subp = mentions[min_][1]
      max_subp = mentions[max_ - 1][1]
      tf.logging.info("Processing shard %d of %d mentions and %d paras.", ns,
                      max_ - min_, max_subp - min_subp + 1)
      para_emb = bert_predictor.get_doc_embeddings(
          all_sub_paras[min_subp:max_subp + 1])
      assert para_emb.shape[2] == 2 * FLAGS.projection_dim
      mention_emb = np.empty((max_ - min_, 2 * bert_predictor.emb_dim),
                             dtype=np.float32)
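      # Each mention embedding concatenates the start token's first projection_dim
      # dims with the end token's second projection_dim dims of the BERT paragraph
      # embedding.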
      for im, mention in enumerate(mentions[min_:max_]):
        mention_emb[im, :] = np.concatenate([
            para_emb[mention[1] - min_subp, mention[2], :FLAGS.projection_dim],
            para_emb[mention[1] - min_subp, mention[3],
                     FLAGS.projection_dim:2 * FLAGS.projection_dim]
        ])
      del para_emb
      tf.logging.info("Saving %d mention features to tensorflow checkpoint.",
                      mention_emb.shape[0])
      with tf.device("/cpu:0"):
        search_utils.write_to_checkpoint(
            "db_emb_%d" % ns, mention_emb, tf.float32,
            os.path.join(FLAGS.multihop_output_dir, "mention_feats_%d" % ns))

  if FLAGS.do_combine:
    # Combine sharded DB into one.
    if FLAGS.shards_to_combine is None:
      shard_range = range(FLAGS.num_shards + 1)
    else:
      shard_range = range(FLAGS.shards_to_combine)
    with tf.device("/cpu:0"):
      all_db = []
      for i in shard_range:
        ckpt_path = os.path.join(FLAGS.multihop_output_dir,
                                 "mention_feats_%d" % i)
        reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
        var_to_shape_map = reader.get_variable_to_shape_map()
        tf.logging.info("Reading %s from %s with shape %s", "db_emb_%d" % i,
                        ckpt_path, str(var_to_shape_map["db_emb_%d" % i]))
        tf_db = search_utils.load_database("db_emb_%d" % i,
                                           var_to_shape_map["db_emb_%d" % i],
                                           ckpt_path)
        all_db.append(tf_db)
      tf.logging.info("Reading all variables.")
      session = tf.Session()
      session.run(tf.global_variables_initializer())
      session.run(tf.local_variables_initializer())
      np_db = session.run(all_db)
      tf.logging.info("Concatenating and storing.")
      np_db = np.concatenate(np_db, axis=0)
      search_utils.write_to_checkpoint(
          "db_emb", np_db, tf.float32,
          os.path.join(FLAGS.multihop_output_dir, "mention_feats"))
Example #25
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "ske_2019": SKE_2019_Multi_Label_Classification_Processor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    # if not FLAGS.do_predict:
    #     raise ValueError(
    #         "`do_predict' must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()
    # label_length = len(label_list)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        # cluster=tpu_cluster_resolver,
        # master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        # tpu_config=tf.contrib.tpu.TPUConfig(
        #     iterations_per_loop=FLAGS.iterations_per_loop,
        #     num_shards=FLAGS.num_tpu_cores,
        #     per_host_input_for_training=is_per_host))
    )

    # num_train_steps = None
    # num_warmup_steps = None

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        # learning_rate=FLAGS.learning_rate,
        # num_train_steps=num_train_steps,
        # num_warmup_steps=num_warmup_steps,
        # use_tpu=FLAGS.use_tpu,
        # use_one_hot_embeddings=FLAGS.use_tpu
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        # eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    # if FLAGS.do_predict:
    # predict_examples = processor.get_test_examples(FLAGS.data_dir)
    # num_actual_predict_examples = len(predict_examples)

    # predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    # file_based_convert_examples_to_features(predict_examples, label_list,
    #                                         FLAGS.max_seq_length, tokenizer,
    #                                         predict_file)

    # tf.logging.info("***** Running prediction*****")
    # tf.logging.info("  Num examples = %d (%d actual, %d padding)",
    #                 len(predict_examples), num_actual_predict_examples,
    #                 len(predict_examples) - num_actual_predict_examples)
    # tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
    predict_test_data = [
        "《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡",
        "你是最爱词:许常德李素珍/曲:刘天健你的故事写到你离去后为止",
        "《苏州商会档案丛编第二辑》是2012年华中师范大学出版社出版的图书,作者是马敏、祖苏、肖芃"
    ]
    num_actual_predict_examples = len(predict_test_data)
    # dataset = string_tokenizer(
    #     examples=predict_test_data,
    #     label_list=label_list,
    #     max_seq_length=FLAGS.max_seq_length,
    #     tokenizer=tokenizer)

    predict_input_fn = string_based_input_fn_builder(
        data=predict_test_data,
        seq_length=FLAGS.max_seq_length,
        label_list=label_list,
        tokenizer=tokenizer)

    result = estimator.predict(input_fn=predict_input_fn)

    num_written_lines = 0
    tf.logging.info("***** Predict results *****")
    for (i, prediction) in enumerate(result):
        print("\n\n prediction:\n{}".format(prediction))
        # continue
        # probabilities = prediction["probabilities"]
        # if i >= num_actual_predict_examples:
        #     break
        # output_line_score_value = " ".join(
        #     str(class_probability)
        #     for class_probability in probabilities) + "\n"
        # predicate_predict = []
        # for idx, class_probability in enumerate(probabilities):
        #     if class_probability > 0.5:
        #         predicate_predict.append(label_list[idx])
        # output_line_predicate_predict = " ".join(
        #     predicate_predict) + "\n"
        # predicate_predict_writer.write(
        #     output_line_predicate_predict)
        # score_value_writer.write(output_line_score_value)
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
Example #26
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    if not FLAGS.do_train and not FLAGS.do_eval_dev and not FLAGS.do_eval_test:
        raise ValueError("At least one of `do_train`, `do_eval_dev` or "
                         "`do_eval_test' must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)
    label_list = ["Yes", "No"]
    if FLAGS.from_three_class_model:
        label_list.append("Neutral")
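    # The extra "Neutral" label presumably keeps the classifier head's shape
    # consistent with a three-class initialization checkpoint.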

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = get_train()
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(
            examples=train_examples,
            label_list=label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer,
            output_file=train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    eval_on = []
    if FLAGS.do_eval_dev:
        eval_on.append((get_dev(), "dev"))
    if FLAGS.do_eval_test:
        eval_on.append((get_test(), "test"))

    for eval_examples, name in eval_on:
        eval_file = os.path.join(FLAGS.output_dir, "%s.tf_record" % name)
        file_based_convert_examples_to_features(
            examples=eval_examples,
            label_list=label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer,
            output_file=eval_file)

        tf.logging.info("***** Running %s *****" % name)
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_steps = None
        if FLAGS.use_tpu:
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir,
                                        "%s_eval_results.txt" % name)

        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** %s eval results *****" % name)
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #27
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        filed_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #28
def main(args):
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    run_config = tf.estimator.RunConfig(
        model_dir=save_model_dir,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        keep_checkpoint_max=1,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(LABEL_COLUMNS),
                                init_checkpoint=BERT_INIT_CHKPNT,
                                learning_rate=LEARNING_RATE,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=False,
                                use_one_hot_embeddings=False)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params={"batch_size": BATCH_SIZE})

    train_input_fn = file_based_input_fn_builder(
        input_file=train_tf_record_path,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)

    eval_input_fn = file_based_input_fn_builder(input_file=eval_tf_record_path,
                                                seq_length=MAX_SEQ_LENGTH,
                                                is_training=False,
                                                drop_remainder=False)

    if FLAGS.do_train and FLAGS.do_eval:
        print('Beginning training and evaluation.')
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=None)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    elif FLAGS.do_train:
        print('Beginning training.')
        current_time = datetime.now()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        print("Training took time ", datetime.now() - current_time)
    elif FLAGS.do_eval:
        print('Beginning evaluation.')
        current_time = datetime.now()
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=None)  # None means run over the entire dataset
        print("Evaluation took time ", datetime.now() - current_time)
        output_eval_file = os.path.join("data", "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_predict:
        predict_sample = "If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message."
        input_sample = InputExample(guid=0,
                                    text_a=predict_sample,
                                    labels=[0, 0, 0, 0, 0, 0])
        tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
        tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                               do_lower_case=True)
        feature = convert_single_example(input_sample, MAX_SEQ_LENGTH,
                                         tokenizer)
        predict_input_fn = input_fn_builder([feature], MAX_SEQ_LENGTH, False,
                                            False)
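        # The positional arguments are likely (features, seq_length, is_training,
        # drop_remainder), following the standard BERT input_fn_builder signature.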
        predictions = estimator.predict(predict_input_fn)
        probabilities = []
        for (i, prediction) in enumerate(predictions):
            preds = prediction["probabilities"]
            probabilities.append(preds)
        print(probabilities[0])
Example #29
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        "ske": SkeProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict and not FLAGS.do_export:
        raise ValueError(
            "At least one of `do_train`, `do_eval`, `do_predict` or "
            "`do_export` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            file_based_convert_examples_to_features(train_examples, label_list,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            file_based_convert_examples_to_features(eval_examples, label_list,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        if not os.path.exists(predict_file):
            file_based_convert_examples_to_features(predict_examples,
                                                    label_list,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        tf.logging.info("num_written_lines %d,num_actual_predict_examples %d",
                        num_written_lines, num_actual_predict_examples)
        assert num_written_lines == num_actual_predict_examples

    if FLAGS.do_export:
        tf.logging.info("Exporting saved model.")
        estimator._export_to_tpu = False
        estimator.export_saved_model(FLAGS.export_dir, serving_input_fn)
Example #30
    def __init__(self,
                 args,
                 is_training=True,
                 emb_class='glove',
                 use_crf=True):
        self.emb_path = args.emb_path
        self.embvec = pkl.load(open(
            self.emb_path, 'rb'))  # resources (GloVe, vocab, paths, etc.)
        self.wrd_dim = args.wrd_dim  # size of word embedding (GloVe)
        self.chr_dim = 50  # size of character embedding
        self.pos_dim = 7  # size of part-of-speech embedding
        self.class_size = len(self.embvec.tag_vocab)  # number of classes (tags)
        self.word_length = args.word_length  # maximum number of characters per word for convolution
        self.restore = args.restore  # checkpoint path if available
        self.use_crf = use_crf  # use crf decoder or not
        self.emb_class = emb_class  # embedding class (glove, elmo, bert)
        self.starter_learning_rate = 0.001  # 0.001 (default), 0.0003 (transformer)
        self.decay_steps = 12000
        self.decay_rate = 0.9
        self.clip_norm = 10

        self.keep_prob = 0.7  # keep probability for dropout
        self.chr_conv_type = 'conv1d'  # conv1d | conv2d
        self.filter_sizes = [3]  # filter sizes
        self.num_filters = 25  # number of filters
        self.rnn_used = True  # use rnn layer or not
        self.rnn_num_layers = 2  # number of RNN layers
        self.rnn_type = 'fused'  # normal | fused
        self.rnn_size = 200  # size of RNN hidden unit
        self.tf_used = False  # use transformer encoder layer or not
        self.tf_num_layers = 4  # number of layers for transformer encoder
        self.tf_keep_prob = 0.8  # keep probability for transformer encoder
        self.tf_mh_num_heads = 4  # number of heads for multi-head attention
        self.tf_mh_num_units = 64  # Q, K, V dimension for multi-head attention
        self.tf_mh_keep_prob = 0.8  # keep probability for multi-head attention
        self.tf_ffn_kernel_size = 3  # conv1d kernel size for the feed-forward net
        self.tf_ffn_keep_prob = 0.8  # keep probability for the feed-forward net

        self.is_training = is_training
        if self.is_training:
            self.epoch = args.epoch
            self.batch_size = args.batch_size
            self.dev_batch_size = 2 * self.batch_size
            self.checkpoint_dir = args.checkpoint_dir
            self.summary_dir = args.summary_dir

        if self.emb_class == 'elmo':
            from bilm import Batcher, BidirectionalLanguageModel
            self.word_length = 50  # fixed word length required by the pre-trained ELMo ('max_characters_per_token')
            self.elmo_batcher = Batcher(
                self.embvec.elmo_vocab_path,
                self.word_length)  # map text to character ids
            self.elmo_bilm = BidirectionalLanguageModel(
                self.embvec.elmo_options_path,
                self.embvec.elmo_weight_path)  # biLM graph
            self.elmo_keep_prob = 0.8
        if self.emb_class == 'bert':
            from bert import modeling
            from bert import tokenization
            self.bert_config = modeling.BertConfig.from_json_file(
                self.embvec.bert_config_path)
            self.bert_tokenizer = tokenization.FullTokenizer(
                vocab_file=self.embvec.bert_vocab_path,
                do_lower_case=self.embvec.bert_do_lower_case)
            self.bert_init_checkpoint = self.embvec.bert_init_checkpoint
            self.bert_max_seq_length = self.embvec.bert_max_seq_length
            self.bert_keep_prob = 0.8
            # modified for bert
            self.rnn_size = 256
            self.starter_learning_rate = 2e-5
            self.decay_steps = 5000
            self.decay_rate = 0.9
            self.clip_norm = 1.5
            if self.is_training:
                self.dev_batch_size = self.batch_size  # set batch_size == dev_batch_size