Example no. 1
def load_from_bert(vocab_file, input_file_a, input_file_b, do_lower_case=True, 
            max_seq_length=128, vocab_file1=None, align_file=None, n_max_sent=None,
            align_punc=False, policy='1to1'):

    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    tokenizer1 = tokenization.FullTokenizer(
        vocab_file=vocab_file1, do_lower_case=do_lower_case)

    examples = load_bert(input_file_a, input_file_b, n_max_sent=n_max_sent)

    aligns = None
    if align_file:
        aligns = load_aligns(align_file, n_max_sent=n_max_sent, examples=examples, 
                            align_punc=align_punc, policy=policy)
        if len(examples) != len(aligns):
            raise ValueError("Number of examples ({}) and alignments ({}) mismatch!".format(
                len(examples), len(aligns)))

    features = convert_bert_examples_to_features(
        examples=examples, seq_length=max_seq_length, tokenizer=tokenizer, 
        tokenizer1=tokenizer1, aligns=aligns)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    #all_input_ids_a = torch.tensor([f.input_ids_a for f in features], dtype=torch.long)
    #all_input_ids_b = torch.tensor([f.input_ids_b for f in features], dtype=torch.long)
    all_input_embs_a = torch.tensor([f.input_embs_a for f in features], dtype=torch.float)
    all_input_embs_b = torch.tensor([f.input_embs_b for f in features], dtype=torch.float)
    all_input_mask_a = torch.tensor([f.input_mask_a for f in features], dtype=torch.long)
    all_input_mask_b = torch.tensor([f.input_mask_b for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_mask_a.size(0), dtype=torch.long)

    if align_file:
        all_align_ids_a = torch.tensor([f.align_ids_a for f in features], dtype=torch.long)
        all_align_ids_b = torch.tensor([f.align_ids_b for f in features], dtype=torch.long)
        all_align_mask = torch.tensor([f.align_mask for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_embs_a, all_input_mask_a, 
                        all_input_embs_b, all_input_mask_b, all_align_ids_a, all_align_ids_b,
                        all_align_mask, all_example_index)
    else:
        dataset = TensorDataset(all_input_embs_a, all_input_mask_a, 
                        all_input_embs_b, all_input_mask_b, all_example_index)
    #if local_rank == -1:
    #    sampler = SequentialSampler(dataset)
        #sampler = RandomSampler(dataset)
    #else:
    #    sampler = DistributedSampler(dataset)
    #dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

    return dataset, unique_id_to_feature, features
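
A minimal usage sketch (not from the original listing): the file paths and batch size below are placeholder assumptions, and the DataLoader wiring simply follows the sampler logic left commented out above.

from torch.utils.data import DataLoader, SequentialSampler

# Placeholder paths; the real vocab/corpus/alignment files are not given in this listing.
dataset, unique_id_to_feature, features = load_from_bert(
    vocab_file="vocab_a.txt", input_file_a="corpus.a", input_file_b="corpus.b",
    vocab_file1="vocab_b.txt", align_file="corpus.align")
loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)
for batch in loader:
    # with align_file set, each batch unpacks into the eight tensors built above
    embs_a, mask_a, embs_b, mask_b, align_a, align_b, align_mask, example_idx = batch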
Example no. 2
def convert(vocab_file,
            sents,
            batch_size=32,
            do_lower_case=True,
            max_seq_length=128,
            local_rank=-1):

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    features = convert_sents_to_features(sents=sents,
                                         seq_length=max_seq_length,
                                         tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_example_index)

    return dataset, unique_id_to_feature, features
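
A hedged usage sketch, not from the source: the vocabulary path and sentences are made up, and indexing the TensorDataset only illustrates the (input_ids, input_mask, example_index) row layout built above.

sents = ["the cat sat on the mat", "a second example sentence"]
dataset, unique_id_to_feature, features = convert("vocab.txt", sents, max_seq_length=32)
input_ids, input_mask, example_index = dataset[0]  # one row per input sentence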
Example no. 3
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    # bpe_tokenizer = BertTokenizer.from_pretrained(
    #     FLAGS.bert_tokenizer_name,
    #     tokenize_chinese_chars=False
    # )
    # bpe_tokenizer.tokenize_chinese_chars = False
    # print("bpe_tokenizer.tokenize_chinese_chars: ", bpe_tokenizer.tokenize_chinese_chars)

    bpe_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    create_training_instances(
        input_files,
        FLAGS.output_file,
        bpe_tokenizer,
        FLAGS.max_seq_length,
        FLAGS.dupe_factor,
        FLAGS.short_seq_prob,
        FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq,
        rng,
    )
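
This main() only consumes command-line flags; below is a sketch of the flag definitions it appears to expect, written in TF1-style tf.flags. The default values are assumptions for illustration, not taken from the source.

import tensorflow as tf

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None, "Comma-separated input text file pattern(s).")
flags.DEFINE_string("output_file", None, "Output file for serialized training instances.")
flags.DEFINE_string("vocab_file", None, "Vocabulary file for the WordPiece tokenizer.")
flags.DEFINE_string("spm_model_file", None, "Optional SentencePiece model file.")
flags.DEFINE_bool("do_lower_case", True, "Whether to lower-case the input text.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
flags.DEFINE_integer("dupe_factor", 10, "How many times to duplicate the input data.")
flags.DEFINE_float("short_seq_prob", 0.1, "Probability of creating shorter sequences.")
flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
flags.DEFINE_integer("max_predictions_per_seq", 20, "Maximum masked LM predictions per sequence.")
flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")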
Example no. 4
def submit(model=None,
           path="",
           vocab_file="",
           use_crf="",
           label_file="",
           tag_to_index=None):
    """
    submit task
    """
    tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
    data = []
    for line in open(path):
        if not line.strip():
            continue
        oneline = json.loads(line.strip())
        res = process(model=model,
                      text=oneline["text"],
                      tokenizer_=tokenizer_,
                      use_crf=use_crf,
                      tag_to_index=tag_to_index,
                      vocab=vocab_file)
        data.append(json.dumps({"label": res}, ensure_ascii=False))
    open("ner_predict.json", "w").write("\n".join(data))
    labels = []
    with open(label_file) as f:
        for label in f:
            labels.append(label.strip())
    get_result(labels, "ner_predict.json", path)
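
submit() reads `path` as a JSON-lines file with one object per line carrying a "text" field (that is what json.loads and oneline["text"] consume above). A small sketch of producing such a file, with made-up content and a placeholder file name:

import json

samples = [{"text": "张三在北京工作"}, {"text": "李四毕业于清华大学"}]
with open("ner_input.jsonl", "w") as fout:
    fout.write("\n".join(json.dumps(s, ensure_ascii=False) for s in samples))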
Example no. 5
def do_eval(dataset=None,
            vocab_file="",
            eval_json="",
            load_checkpoint_path="",
            seq_length=384):
    """ do eval """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    eval_examples = read_squad_examples(eval_json, False)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=seq_length,
                                                 doc_stride=128,
                                                 max_query_length=64,
                                                 is_training=False,
                                                 output_fn=None,
                                                 verbose_logging=False)

    net = BertSquad(bert_net_cfg, False, 2)
    net.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net, param_dict)
    model = Model(net)
    output = []
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    columns_list = ["input_ids", "input_mask", "segment_ids", "unique_ids"]
    for data in dataset.create_dict_iterator():
        input_data = []
        for i in columns_list:
            input_data.append(Tensor(data[i]))
        input_ids, input_mask, segment_ids, unique_ids = input_data
        start_positions = Tensor([1], mstype.float32)
        end_positions = Tensor([1], mstype.float32)
        is_impossible = Tensor([1], mstype.float32)
        logits = model.predict(input_ids, input_mask, segment_ids,
                               start_positions, end_positions, unique_ids,
                               is_impossible)
        ids = logits[0].asnumpy()
        start = logits[1].asnumpy()
        end = logits[2].asnumpy()

        for i in range(bert_net_cfg.batch_size):
            unique_id = int(ids[i])
            start_logits = [float(x) for x in start[i].flat]
            end_logits = [float(x) for x in end[i].flat]
            output.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))
    write_predictions(eval_examples, eval_features, output, 20, 30, True,
                      "./predictions.json", None, None)
Example no. 6
def test_eval():
    """Evaluation function for SQuAD task"""
    tokenizer = tokenization.FullTokenizer(vocab_file="./vocab.txt",
                                           do_lower_case=True)
    input_file = "dataset/v1.1/dev-v1.1.json"
    eval_examples = read_squad_examples(input_file, False)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=384,
                                                 doc_stride=128,
                                                 max_query_length=64,
                                                 is_training=False,
                                                 output_fn=None,
                                                 verbose_logging=False)

    device_id = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target='Ascend',
                        device_id=device_id)
    dataset = get_squad_dataset(bert_net_cfg.batch_size, 1)
    net = BertSquad(bert_net_cfg, False, 2)
    net.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(net, param_dict)
    model = Model(net)
    output = []
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    columns_list = ["input_ids", "input_mask", "segment_ids", "unique_ids"]
    for data in dataset.create_dict_iterator():
        input_data = []
        for i in columns_list:
            input_data.append(Tensor(data[i]))
        input_ids, input_mask, segment_ids, unique_ids = input_data
        start_positions = Tensor([1], mstype.float32)
        end_positions = Tensor([1], mstype.float32)
        is_impossible = Tensor([1], mstype.float32)
        logits = model.predict(input_ids, input_mask, segment_ids,
                               start_positions, end_positions, unique_ids,
                               is_impossible)
        ids = logits[0].asnumpy()
        start = logits[1].asnumpy()
        end = logits[2].asnumpy()

        for i in range(bert_net_cfg.batch_size):
            unique_id = int(ids[i])
            start_logits = [float(x) for x in start[i].flat]
            end_logits = [float(x) for x in end[i].flat]
            output.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))
    write_predictions(eval_examples, eval_features, output, 20, 30, True,
                      "./predictions.json", None, None, False, False)
Example no. 7
def submit(model=None,
           path="",
           vocab_file="",
           use_crf="",
           label_file="",
           tag_to_index=None):
    """
    submit task
    """
    tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
    data = []
    # NOTE: 'f' is written below but never opened in this snippet; assuming a results file here.
    f = open("result.txt", "w")
    if cfg.schema_file is not None:
        f1 = open(cfg.schema_file, 'r')
        numRows = json.load(f1)
        up_num = numRows["numRows"]
    else:
        up_num = 600000000000
    num = 0
    for line in open(path):
        num = num + 1
        if num > up_num:
            break
        if not line.strip():
            continue
        oneline = json.loads(line.strip())
        if cfg.task == 'Classification':
            res = process(model=model,
                          text=oneline["sentence"],
                          tokenizer_=tokenizer_,
                          use_crf=use_crf,
                          tag_to_index=tag_to_index,
                          vocab=vocab_file)

            print("text", oneline["sentence"])
        elif cfg.task == 'NER':
            res = process(model=model,
                          text=oneline["text"],
                          tokenizer_=tokenizer_,
                          use_crf=use_crf,
                          tag_to_index=tag_to_index,
                          vocab=vocab_file)
            print("text", oneline["text"])
        else:
            raise Exception("Task error")
        print("res:", res)
        f.write("result: " + str(res) + '\n')
        data.append(json.dumps({"label": res}, ensure_ascii=False))
    f.close()
Example no. 8
def submit(model=None, path="", vocab_file="", use_crf="", label2id_file=""):
    """
    submit task
    """
    tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
    data = []
    for line in open(path):
        if not line.strip():
            continue
        oneline = json.loads(line.strip())
        res = process(model=model,
                      text=oneline["text"],
                      tokenizer_=tokenizer_,
                      use_crf=use_crf,
                      label2id_file=label2id_file)
        print("text", oneline["text"])
        print("res:", res)
        data.append(json.dumps({"label": res}, ensure_ascii=False))
    open("ner_predict.json", "w").write("\n".join(data))
Example no. 9
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "chn": data_processors.ChnSentiCorpDataProcessor,
        "lcqmc": data_processors.LCQMCProcessor,
        "xnli": data_processors.XnliProcessor,
        "book_review": data_processors.BookReviewProcessor,
        "shopping": data_processors.ShoppingProcessor,
        "weibo": data_processors.WeiboProcessor,
        "law_qa": data_processors.LawQAProcessor,
        "nlpcc_dbqa": data_processors.NlpccDbqaProcessor,
    }

    # tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
    #                                               FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    if not FLAGS.albert_config_file and not FLAGS.albert_hub_module_handle:
        raise ValueError("At least one of `--albert_config_file` and "
                         "`--albert_hub_module_handle` must be set")

    if FLAGS.albert_config_file:
        albert_config = modeling.AlbertConfig.from_json_file(
            FLAGS.albert_config_file)
        if FLAGS.max_seq_length > albert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length %d because the ALBERT model "
                "was only trained up to sequence length %d" %
                (FLAGS.max_seq_length, albert_config.max_position_embeddings))
    else:
        albert_config = None  # Get the config from TF-Hub.

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name](FLAGS)

    label_list = processor.get_labels()

    # bpe_tokenizer = BertTokenizer.from_pretrained(
    #     FLAGS.bert_tokenizer_name,
    #     tokenize_chinese_chars=False
    # )
    # bpe_tokenizer.tokenize_chinese_chars = False
    # print("bpe_tokenizer.tokenize_chinese_chars: ", bpe_tokenizer.tokenize_chinese_chars)

    bpe_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.do_train:
        iterations_per_loop = int(
            min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
    else:
        iterations_per_loop = FLAGS.iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=int(FLAGS.save_checkpoints_steps),
        keep_checkpoint_max=0,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
    model_fn = classifier_utils.model_fn_builder(
        albert_config=albert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.train_step,
        num_warmup_steps=FLAGS.warmup_step,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        task_name=task_name,
        hub_module=FLAGS.albert_hub_module_handle,
        optimizer=FLAGS.optimizer)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        cached_dir = FLAGS.cached_dir
        if not cached_dir:
            cached_dir = FLAGS.output_dir
        train_file = os.path.join(cached_dir, task_name + "_train.tf_record")
        # if not tf.gfile.Exists(train_file):
        classifier_utils.file_based_convert_examples_to_features(
            train_examples,
            label_list,
            FLAGS.max_seq_length,
            bpe_tokenizer,
            train_file,
            task_name,
        )
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", FLAGS.train_step)
        train_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(data_processors.PaddingInputExample())

        cached_dir = FLAGS.cached_dir
        if not cached_dir:
            cached_dir = FLAGS.output_dir
        eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record")
        # if not tf.gfile.Exists(eval_file):
        classifier_utils.file_based_convert_examples_to_features(
            eval_examples,
            label_list,
            FLAGS.max_seq_length,
            bpe_tokenizer,
            eval_file,
            task_name,
        )

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.eval_batch_size)

        best_trial_info_file = os.path.join(FLAGS.output_dir, "best_trial.txt")

        def _best_trial_info():
            """Returns information about which checkpoints have been evaled so far."""
            if tf.gfile.Exists(best_trial_info_file):
                with tf.gfile.GFile(best_trial_info_file, "r") as best_info:
                    global_step, best_metric_global_step, metric_value = (
                        best_info.read().split(":"))
                    global_step = int(global_step)
                    best_metric_global_step = int(best_metric_global_step)
                    metric_value = float(metric_value)
            else:
                metric_value = -1
                best_metric_global_step = -1
                global_step = -1
            tf.logging.info(
                "Best trial info: Step: %s, Best Value Step: %s, "
                "Best Value: %s", global_step, best_metric_global_step,
                metric_value)
            return global_step, best_metric_global_step, metric_value

        def _remove_checkpoint(checkpoint_path):
            for ext in ["meta", "data-00000-of-00001", "index"]:
                src_ckpt = checkpoint_path + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")

        if task_name == "sts-b":
            key_name = "pearson"
        elif task_name == "cola":
            key_name = "matthew_corr"

        elif task_name == "nlpcc_dbqa":
            key_name = "f1_score"
        else:
            key_name = "eval_accuracy"

        global_step, best_perf_global_step, best_perf = _best_trial_info()
        writer = tf.gfile.GFile(output_eval_file, "w")
        while global_step < FLAGS.train_step:
            steps_and_files = {}
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                    if cur_filename.split("-")[-1] == "best":
                        continue
                    gstep = int(cur_filename.split("-")[-1])
                    if gstep not in steps_and_files:
                        tf.logging.info(
                            "Add {} to eval list.".format(cur_filename))
                        steps_and_files[gstep] = cur_filename
            tf.logging.info("found {} files.".format(len(steps_and_files)))
            if not steps_and_files:
                tf.logging.info(
                    "found 0 file, global step: {}. Sleeping.".format(
                        global_step))
                time.sleep(60)
            else:
                for checkpoint in sorted(steps_and_files.items()):
                    step, checkpoint_path = checkpoint
                    if global_step >= step:
                        if (best_perf_global_step != step
                                and len(_find_valid_cands(step)) > 1):
                            _remove_checkpoint(checkpoint_path)
                        continue
                    result = estimator.evaluate(
                        input_fn=eval_input_fn,
                        steps=eval_steps,
                        checkpoint_path=checkpoint_path)
                    global_step = result["global_step"]
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write("best = {}\n".format(best_perf))
                    if result[key_name] > best_perf:
                        best_perf = result[key_name]
                        best_perf_global_step = global_step
                    elif len(_find_valid_cands(global_step)) > 1:
                        _remove_checkpoint(checkpoint_path)
                    writer.write("=" * 50 + "\n")
                    writer.flush()
                    with tf.gfile.GFile(best_trial_info_file,
                                        "w") as best_info:
                        best_info.write("{}:{}:{}".format(
                            global_step, best_perf_global_step, best_perf))
        writer.close()

        # output_eval_file_local = os.path.join("./tmp/", "_".join(output_eval_file.split("/")[4:]))
        # tf.gfile.Copy(output_eval_file, output_eval_file_local, overwrite=True)

        for ext in ["meta", "data-00000-of-00001", "index"]:
            src_ckpt = "model.ckpt-{}.{}".format(best_perf_global_step, ext)
            tgt_ckpt = "model.ckpt-best.{}".format(ext)
            tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
            tf.io.gfile.rename(os.path.join(FLAGS.output_dir, src_ckpt),
                               os.path.join(FLAGS.output_dir, tgt_ckpt),
                               overwrite=True)

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(data_processors.PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        classifier_utils.file_based_convert_examples_to_features(
            predict_examples,
            label_list,
            FLAGS.max_seq_length,
            bpe_tokenizer,
            predict_file,
            task_name,
        )

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.predict_batch_size)

        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        result = estimator.predict(input_fn=predict_input_fn,
                                   checkpoint_path=checkpoint_path)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        output_submit_file = os.path.join(FLAGS.output_dir,
                                          "submit_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as pred_writer, \
                tf.gfile.GFile(output_submit_file, "w") as sub_writer:
            sub_writer.write("index" + "\t" + "prediction\n")
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, (example, prediction)) in \
                    enumerate(zip(predict_examples, result)):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                pred_writer.write(output_line)

                if task_name != "sts-b":
                    actual_label = label_list[int(prediction["predictions"])]
                else:
                    actual_label = str(prediction["predictions"])
                sub_writer.write(example.guid + "\t" + actual_label + "\n")
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        truth_json_dir = os.path.join(FLAGS.data_dir, "test.json")
        pred_tsv_dir = output_submit_file
        accuracy, results = cal_metrics(pred_tsv_dir, truth_json_dir)

        tf.logging.info("***** Predict metrics *****")
        tf.logging.info("***** Predict metrics *****")
        tf.logging.info("accuracy: %f" % accuracy)
        tf.logging.info("results: ")
        tf.logging.info(json.dumps(results, ensure_ascii=False, indent=2))
        tf.logging.info("***** Predict metrics *****")
        tf.logging.info("***** Predict metrics *****")
Example no. 10
def run_squad():
    """run squad task"""
    parser = argparse.ArgumentParser(description="run squad")
    parser.add_argument("--device_target",
                        type=str,
                        default="Ascend",
                        choices=["Ascend", "GPU"],
                        help="Device type, default is Ascend")
    parser.add_argument("--do_train",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Eable train, default is false")
    parser.add_argument("--do_eval",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Eable eval, default is false")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--epoch_num",
                        type=int,
                        default=3,
                        help="Epoch number, default is 1.")
    parser.add_argument("--num_class",
                        type=int,
                        default=2,
                        help="The number of class, default is 2.")
    parser.add_argument("--train_data_shuffle",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=32,
                        help="Train batch size, default is 32")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=1,
                        help="Eval batch size, default is 1")
    parser.add_argument("--vocab_file_path",
                        type=str,
                        default="",
                        help="Vocab file path")
    parser.add_argument("--eval_json_path",
                        type=str,
                        default="",
                        help="Evaluation json file path, can be eval.json")
    parser.add_argument("--save_finetune_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--train_data_file_path",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_file_path",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    epoch_num = args_opt.epoch_num
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path

    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower(
    ) == "false":
        raise ValueError(
            "At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower(
    ) == "true" and args_opt.train_data_file_path == "":
        raise ValueError(
            "'train_data_file_path' must be set when do finetune task")
    if args_opt.do_eval.lower() == "true":
        if args_opt.vocab_file_path == "":
            raise ValueError(
                "'vocab_file_path' must be set when do evaluation task")
        if args_opt.eval_json_path == "":
            raise ValueError(
                "'tokenization_file_path' must be set when do evaluation task")

    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend",
                            device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only supports fp32 for now; running with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")

    netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)

    if args_opt.do_train.lower() == "true":
        ds = create_squad_dataset(
            batch_size=args_opt.train_batch_size,
            repeat_count=1,
            data_file_path=args_opt.train_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path,
                 save_finetune_checkpoint_path, epoch_num)
        if args_opt.do_eval.lower() == "true":
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(
                    save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(
                load_finetune_checkpoint_dir, ds.get_dataset_size(), epoch_num,
                "squad")

    if args_opt.do_eval.lower() == "true":
        from src import tokenization
        from src.create_squad_data import read_squad_examples, convert_examples_to_features
        from src.squad_get_predictions import write_predictions
        from src.squad_postprocess import SQuad_postprocess
        tokenizer = tokenization.FullTokenizer(
            vocab_file=args_opt.vocab_file_path, do_lower_case=True)
        eval_examples = read_squad_examples(args_opt.eval_json_path, False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=bert_net_cfg.seq_length,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            output_fn=None,
            vocab_file=args_opt.vocab_file_path)
        ds = create_squad_dataset(
            batch_size=args_opt.eval_batch_size,
            repeat_count=1,
            data_file_path=eval_features,
            schema_file_path=args_opt.schema_file_path,
            is_training=False,
            do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        outputs = do_eval(ds, load_finetune_checkpoint_path,
                          args_opt.eval_batch_size)
        all_predictions = write_predictions(eval_examples, eval_features,
                                            outputs, 20, 30, True)
        SQuad_postprocess(args_opt.eval_json_path,
                          all_predictions,
                          output_metrics="output.json")
        else:
            sent_new_ += char_

    # drop redundant blanks
    sent_new_ = drop_extra_blank(sent_new_)

    return sent_new_


def tokenize_single_sent(sent, tokenizer=None):
    # sent = proc_single_sent(sent)

    line_seg = tokenizer.tokenize(sent)

    return line_seg


if __name__ == "__main__":
    bpe_tokenizer = tokenization.FullTokenizer(
        vocab_file=
        "data_proc/tokenizers/sentencepiece/char_no_space-21128-clean.vocab",
        do_lower_case=True,
        spm_model_file=
        "data_proc/tokenizers/sentencepiece/char_no_space-21128-clean.model")
    sent = "我喜欢篮球"
    line_seg = tokenize_single_sent(sent, tokenizer=bpe_tokenizer)
    line_seg = ["[CLS]"] + line_seg + ["[SEP]"]
    print(line_seg)
    print(" ".join(line_seg))

    # [CLS] ▁我 喜欢 篮球 [SEP]