Beispiel #1
0
from preprocessing.data_processor import read_squad_data

if __name__ == "__main__":
    # Run read_squad_data on the full training file in training mode.
    train_json = "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/squad-like_all_train_data.json"
    # Second positional argument of read_squad_data; presumably the directory
    # it writes to -- confirm against its definition.
    target_dir = "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/"
    read_squad_data(train_json, target_dir, is_training=True)
Beispiel #2
0
from preprocessing.data_processor import read_squad_data

if __name__ == "__main__":
    # Run read_squad_data on data/data.json, leaving is_training at the
    # function's own default.
    raw_json = "data/data.json"
    # Second positional argument; presumably the output directory -- confirm
    # against read_squad_data.
    target_dir = "data/"
    read_squad_data(raw_json, target_dir)
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import read_squad_data, convert_examples_to_features, read_qa_examples
from pytorch_pretrained_bert.tokenization import BertTokenizer
from predict.predict import main

if __name__ == "__main__":
    # Three steps, in order: preprocess the raw test file (non-training
    # mode), load the "test" examples from the same directory, then invoke
    # the prediction entry point on that directory.
    test_json = "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/squad_like_test.json"
    read_squad_data(
        test_json,
        "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/",
        is_training=False,
    )
    # NOTE(review): the result is not used below; the call is kept because
    # read_qa_examples may have side effects that main() relies on -- confirm.
    examples = read_qa_examples(
        "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/",
        "test",
    )
    main('/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/')
Beispiel #4
0
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import read_squad_data, convert_examples_to_features, read_qa_examples
from pytorch_pretrained_bert.tokenization import BertTokenizer
from predict.predict import main
#from pytorch_pretrained_bert.modeling import BertPreTrainedModel
if __name__ == "__main__":
    # Smoke check: preprocess the small training file, load the "train"
    # examples, convert them to features and print both counts.
    read_squad_data("data/small_train_data.json", "data/")
    examples = read_qa_examples("data/", "train")
    print(len(examples))

    vocab_tokenizer = BertTokenizer("pretrained_model/vocab.txt")
    # NOTE(review): doc_stride (500) is close to max_seq_length (512); if
    # convert_examples_to_features uses a sliding window whose width is the
    # sequence length minus query/special tokens, a stride this large could
    # skip document tokens -- confirm against that function.
    feature_kwargs = dict(
        tokenizer=vocab_tokenizer,
        max_seq_length=512,
        doc_stride=500,
        max_query_length=32,
        is_training=True,
    )
    features = convert_examples_to_features(examples, **feature_kwargs)
    print(len(features))

    # The imported prediction entry point `main` is not called in this script.
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import read_squad_data, convert_examples_to_features, read_qa_examples
from pytorch_pretrained_bert.tokenization import BertTokenizer
from predict.predict import main

if __name__ == "__main__":
    # Preprocess the test file in non-training mode, load the "test"
    # examples, then run the prediction entry point on the data directory.
    test_json = "data/squad_like_test.json"
    read_squad_data(test_json, "data/", is_training=False)
    # NOTE(review): the result is not used below; the call is kept because
    # read_qa_examples may have side effects -- confirm.
    examples = read_qa_examples("data/", "test")
    main('data/')
Beispiel #6
0
def make_predict(model, tokenizer, test_raw_data, data_dir):
    """Score the test set with ``model`` and write the prediction files.

    The raw test file is passed through ``read_squad_data``, the "test"
    examples are loaded with ``read_qa_examples`` and converted to features,
    and every feature is run through ``model`` in eval mode. The collected
    logits are then handed to ``write_predictions``.

    Besides its parameters, the function reads ``args``, ``device`` and
    ``logger`` from the enclosing scope.

    Args:
        model: Called as ``model(input_ids, segment_ids, input_mask)``; it
            must return four values, the first three of which are used as
            start, end and answer-type logits.
        tokenizer: Forwarded to ``convert_examples_to_features``.
        test_raw_data: Forwarded to ``read_squad_data`` as its first argument.
        data_dir: Forwarded to ``read_squad_data`` and ``read_qa_examples``;
            the n-best predictions file is also placed in this directory.
    """
    read_squad_data(test_raw_data, data_dir, is_training=False)
    examples = read_qa_examples(data_dir, corpus_type="test")
    features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False,
    )

    logger.info("***** Running predictions *****")
    logger.info("  Num orig examples = %d", len(examples))
    logger.info("  Num split examples = %d", len(features))
    logger.info("  Batch size = %d", args.predict_batch_size)

    def _stack_field(field_name):
        # Gather one attribute from every feature into a single long tensor.
        rows = [getattr(feat, field_name) for feat in features]
        return torch.tensor(rows, dtype=torch.long)

    def _row_to_list(row):
        # Move one row of logits to the CPU as a plain Python list.
        return row.detach().cpu().tolist()

    ids_tensor = _stack_field("input_ids")
    mask_tensor = _stack_field("input_mask")
    segment_tensor = _stack_field("segment_ids")
    # Position of each feature in `features`; it travels with the batch so
    # each output row can be mapped back to its feature.
    feature_positions = torch.arange(ids_tensor.size(0), dtype=torch.long)
    dataset = TensorDataset(ids_tensor, mask_tensor, segment_tensor,
                            feature_positions)
    # Sequential sampling: the whole dataset is visited in order.
    ordered_sampler = SequentialSampler(dataset)
    loader = DataLoader(dataset,
                        sampler=ordered_sampler,
                        batch_size=args.predict_batch_size)

    # One example may have been split into several features during
    # conversion, so a single example can end up with more than one raw
    # result from the model.
    model.eval()
    collected = []
    logger.info("Start evaluating")
    RawResult = collections.namedtuple(
        "RawResult",
        ["unique_id", "start_logits", "end_logits", "answer_type_logits"])

    # The progress bar is disabled unless local_rank is -1 or 0.
    hide_progress = args.local_rank not in (-1, 0)
    for input_ids, input_mask, segment_ids, positions in tqdm(
            loader, desc="Evaluating", disable=hide_progress):
        # Checked once per batch, so a message appears only when the running
        # count is an exact multiple of 1000 at a batch boundary.
        if len(collected) % 1000 == 0:
            logger.info("Processing example: %d" % (len(collected)))

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            # The fourth model output is not used here.
            start_batch, end_batch, type_batch, _ = model(
                input_ids, segment_ids, input_mask)

        for row, position in enumerate(positions):
            start_row = _row_to_list(start_batch[row])
            end_row = _row_to_list(end_batch[row])
            type_row = _row_to_list(type_batch[row])
            feature = features[position.item()]
            collected.append(
                RawResult(unique_id=int(feature.unique_id),
                          start_logits=start_row,
                          end_logits=end_row,
                          answer_type_logits=type_row))

    # NOTE(review): the n-best file goes under data_dir while the other two
    # outputs go under args.output_dir -- confirm this split is intended.
    prediction_path = os.path.join(args.output_dir, "predictions.json")
    nbest_path = os.path.join(data_dir, "nbest_predictions.json")
    null_odds_path = os.path.join(args.output_dir, "null_odds.json")

    # Post-processing: turn the raw logits into the output files.
    write_predictions(examples, features, collected,
                      args.n_best_size, args.max_answer_length,
                      args.do_lower_case, prediction_path,
                      nbest_path, null_odds_path,
                      args.verbose_logging, args.version_2_with_negative,
                      args.null_score_diff_threshold)
    logger.info("Evaluating finished")
from preprocessing.data_processor import read_squad_data

if __name__ == "__main__":
    # Run read_squad_data on the full training file in training mode.
    train_json = "data/squad-like_all_train_data.json"
    # Second positional argument; presumably the output directory -- confirm
    # against read_squad_data.
    target_dir = "data/"
    read_squad_data(train_json, target_dir, is_training=True)