from preprocessing.data_processor import read_squad_data

# Entry point: preprocess the full training set in training mode.
if __name__ == "__main__":
    data_root = "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/"
    train_json = "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/squad-like_all_train_data.json"
    read_squad_data(train_json, data_root, is_training=True)
from preprocessing.data_processor import read_squad_data

# Entry point: preprocess data/data.json into the data/ directory using the
# default mode of read_squad_data.
if __name__ == "__main__":
    raw_json, out_dir = "data/data.json", "data/"
    read_squad_data(raw_json, out_dir)
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import read_squad_data, convert_examples_to_features, read_qa_examples
from pytorch_pretrained_bert.tokenization import BertTokenizer
from predict.predict import main

# Entry point: preprocess the test set (non-training mode), load the
# resulting "test" examples, then run the prediction main on the data directory.
if __name__ == "__main__":
    test_json = "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/squad_like_test.json"
    read_squad_data(
        test_json,
        "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/",
        is_training=False,
    )

    # Alternative data directory left disabled by the original author:
    # examples = read_qa_examples("/home/LAB/liqian/test/game/ccks-2020-finance-transfer-ee-baseline-master/CCKS-Mrc/data/", "test")
    examples = read_qa_examples(
        "/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/",
        "test",
    )

    main('/home/LAB/liqian/test/game/Fin/CCKS-Mrc/data/')
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import read_squad_data, convert_examples_to_features, read_qa_examples
from pytorch_pretrained_bert.tokenization import BertTokenizer
from predict.predict import main
#from pytorch_pretrained_bert.modeling import BertPreTrainedModel

# Entry point: preprocess the small training set, load the "train" examples,
# convert them to features, and print both counts.
if __name__ == "__main__":
    read_squad_data("data/small_train_data.json", "data/")

    examples = read_qa_examples("data/", "train")
    print(len(examples))

    # The tokenizer is built only after the example count has been printed,
    # matching the original order of side effects.
    vocab_tokenizer = BertTokenizer("pretrained_model/vocab.txt")
    features = convert_examples_to_features(
        examples,
        tokenizer=vocab_tokenizer,
        max_seq_length=512,
        doc_stride=500,
        max_query_length=32,
        is_training=True,
    )
    print(len(features))
    # main()
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import read_squad_data, convert_examples_to_features, read_qa_examples
from pytorch_pretrained_bert.tokenization import BertTokenizer
from predict.predict import main

# Entry point: preprocess the test set (non-training mode), load the
# resulting "test" examples, then run the prediction main on data/.
if __name__ == "__main__":
    test_json = "data/squad_like_test.json"
    read_squad_data(test_json, "data/", is_training=False)

    examples = read_qa_examples("data/", "test")

    main('data/')
def make_predict(model, tokenizer, test_raw_data, data_dir):
    """Run ``model`` over the raw test data and write the prediction files.

    The raw file is preprocessed with ``read_squad_data`` into ``data_dir``,
    reloaded as "test" examples, converted to features, evaluated batch by
    batch, and the collected logits are handed to ``write_predictions``.

    Relies on the module-level ``args``, ``device`` and ``logger``.

    Args:
        model: Called as ``model(input_ids, segment_ids, input_mask)`` and
            expected to return four values; the first three are used as
            start, end and answer-type logits.
        tokenizer: Forwarded to ``convert_examples_to_features``.
        test_raw_data: Forwarded to ``read_squad_data`` as the raw input.
        data_dir: Directory passed to ``read_squad_data`` and
            ``read_qa_examples``; ``nbest_predictions.json`` is also
            written here.
    """
    read_squad_data(test_raw_data, data_dir, is_training=False)
    eval_examples = read_qa_examples(data_dir, corpus_type="test")
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False,
    )

    logger.info("***** Running predictions *****")
    logger.info(" Num orig examples = %d", len(eval_examples))
    logger.info(" Num split examples = %d", len(eval_features))
    logger.info(" Batch size = %d", args.predict_batch_size)

    def _long_tensor(field_name):
        # Stack one per-feature attribute into a single long tensor.
        return torch.tensor(
            [getattr(feature, field_name) for feature in eval_features],
            dtype=torch.long,
        )

    all_input_ids = _long_tensor("input_ids")
    all_input_mask = _long_tensor("input_mask")
    all_segment_ids = _long_tensor("segment_ids")
    # Position of each feature in eval_features, carried through the loader
    # so every batch row can be mapped back to its feature.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    # Run prediction over the full data in its original order.
    eval_data = TensorDataset(
        all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    eval_dataloader = DataLoader(
        eval_data,
        sampler=SequentialSampler(eval_data),
        batch_size=args.predict_batch_size,
    )

    # Per the original author's note: one example can be split into several
    # features during conversion, so a single example may receive more than
    # one raw prediction result.
    model.eval()
    logger.info("Start evaluating")
    RawResult = collections.namedtuple(
        "RawResult",
        ["unique_id", "start_logits", "end_logits", "answer_type_logits"])
    all_results = []

    batches = tqdm(
        eval_dataloader,
        desc="Evaluating",
        disable=args.local_rank not in [-1, 0],
    )
    for input_ids, input_mask, segment_ids, example_indices in batches:
        # Only checked once per batch, so a message appears when the running
        # total happens to be a multiple of 1000 at a batch boundary.
        if len(all_results) % 1000 == 0:
            logger.info("Processing example: %d", len(all_results))

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            start_batch, end_batch, answer_type_batch, _ = model(
                input_ids, segment_ids, input_mask)

        for row, example_index in enumerate(example_indices):
            feature = eval_features[example_index.item()]
            all_results.append(
                RawResult(
                    unique_id=int(feature.unique_id),
                    start_logits=start_batch[row].detach().cpu().tolist(),
                    end_logits=end_batch[row].detach().cpu().tolist(),
                    answer_type_logits=(
                        answer_type_batch[row].detach().cpu().tolist()),
                ))

    # NOTE(review): the n-best file is written under data_dir while the other
    # two outputs go to args.output_dir -- confirm this split is intended.
    output_prediction_file = os.path.join(args.output_dir, "predictions.json")
    output_nbest_file = os.path.join(data_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")

    # Post-process the raw logits into the output files.
    write_predictions(
        eval_examples,
        eval_features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
    )
    logger.info("Evaluating finished")
from preprocessing.data_processor import read_squad_data

# Entry point: preprocess the full training set (relative paths) in
# training mode.
if __name__ == "__main__":
    train_json = "data/squad-like_all_train_data.json"
    read_squad_data(train_json, "data/", is_training=True)