def create_batch_iter(mode):
    """Build a DataLoader for the sequence-labeling train or dev split.

    Args:
        mode: "train" or "dev".

    Returns:
        For "train": a tuple ``(iterator, num_train_steps)`` where the
        DataLoader shuffles via RandomSampler and ``num_train_steps`` is the
        total number of optimizer steps across all epochs.
        For "dev": a DataLoader that iterates sequentially.

    Raises:
        ValueError: if ``mode`` is neither "train" nor "dev".
    """
    # Validate once up front instead of re-checking (and raising from
    # unreachable else-branches) three separate times.
    if mode not in ("train", "dev"):
        raise ValueError("Invalid mode %s" % mode)

    processor, tokenizer = init_params()
    if mode == "train":
        examples = processor.get_train_examples(args.data_dir)
        # Optimizer steps = examples / batch / accumulation * epochs.
        num_train_steps = int(
            len(examples) / args.train_batch_size
            / args.gradient_accumulation_steps * args.num_train_epochs)
        batch_size = args.train_batch_size
        logger.info(" Num steps = %d", num_train_steps)
    else:  # mode == "dev"
        examples = processor.get_dev_examples(args.data_dir)
        batch_size = args.eval_batch_size

    label_list = processor.get_labels()

    # Convert raw examples into padded, tokenized features.
    features = convert_examples_to_features(examples, label_list,
                                            args.max_seq_length, tokenizer)
    logger.info(" Num examples = %d", len(examples))
    logger.info(" Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    all_output_mask = torch.tensor([f.output_mask for f in features], dtype=torch.long)

    # Dataset bundling all feature tensors index-aligned.
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids, all_output_mask)

    # Shuffle during training; keep order for evaluation.
    sampler = RandomSampler(data) if mode == "train" else SequentialSampler(data)
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)

    if mode == "train":
        return iterator, num_train_steps
    return iterator
def create_batch_iter(mode):
    """Build a DataLoader for the QA (span-extraction) train or dev split.

    Args:
        mode: "train" or "dev".

    Returns:
        For "train": a tuple ``(iterator, num_train_steps)``; the sampler is
        DistributedSampler when running under torch.distributed
        (``args.local_rank != -1``), otherwise RandomSampler.
        For "dev": a sequentially-ordered DataLoader.

    Raises:
        ValueError: if ``mode`` is neither "train" nor "dev".
    """
    # Validate once up front; the original re-checked mode three times,
    # leaving two unreachable raise branches.
    if mode not in ("train", "dev"):
        raise ValueError("Invalid mode %s" % mode)

    tokenizer = init_params()
    examples = read_qa_examples(args.data_dir, mode)
    batch_size = args.train_batch_size if mode == "train" else args.eval_batch_size

    # is_training=True even for dev: features must carry start/end positions
    # and answer types so they can serve as labels during evaluation.
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            args.max_seq_length,
                                            args.doc_stride,
                                            args.max_query_length,
                                            is_training=True)
    logger.info(" Num Features = %d", len(features))
    logger.info(" Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
    end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
    answer_types = torch.tensor([f.answer_type for f in features], dtype=torch.long)

    # Dataset bundling all feature tensors index-aligned.
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         start_positions, end_positions, answer_types)

    if mode == "train":
        # Steps are computed over features (one example may yield several
        # features after doc-stride splitting), not over raw examples.
        num_train_steps = int(
            len(features) / args.train_batch_size
            / args.gradient_accumulation_steps * args.num_train_epochs)
        # (removed: redundant re-assignment of batch_size here)
        logger.info(" Num steps = %d", num_train_steps)
        if args.local_rank == -1:
            sampler = RandomSampler(data)
        else:
            sampler = DistributedSampler(data)
    else:  # mode == "dev"
        sampler = SequentialSampler(data)

    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)

    if mode == "train":
        return iterator, num_train_steps
    return iterator
def make_predict(model, tokenizer, data_dir):
    """Run the model over the test split and write SQuAD-style predictions.

    Args:
        model: QA model returning (start_logits, end_logits, answer_type_logits)
            when called as ``model(input_ids, segment_ids, input_mask)``.
        tokenizer: tokenizer used to featurize the examples.
        data_dir: directory containing the "test" corpus files.

    Side effects:
        Writes predictions.json, nbest_predictions.json and null_odds.json
        into ``args.output_dir``.
    """
    # NOTE(review): raw-data conversion is assumed to have run already:
    # read_squad_data(test_raw_data, data_dir, is_training=False)
    eval_examples = read_qa_examples(data_dir, corpus_type="test")
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    logger.info("***** Running predictions *****")
    logger.info("  Num orig examples = %d", len(eval_examples))
    logger.info("  Num split examples = %d", len(eval_features))
    logger.info("  Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    # Positional index back into eval_features: one example may be split into
    # several features by convert_examples_to_features (doc-stride windows),
    # so each feature is addressed by its position in the feature list.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask,
                              all_segment_ids, all_example_index)

    # Run prediction over the full data in original order.
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    logger.info("Start evaluating")
    RawResult = collections.namedtuple(
        "RawResult",
        ["unique_id", "start_logits", "end_logits", "answer_type_logits"])
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader, desc="Evaluating",
            disable=args.local_rank not in [-1, 0]):
        if len(all_results) % 1000 == 0:
            # Lazy %-args: the string is only formatted if the level is enabled.
            logger.info("Processing example: %d", len(all_results))
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits, batch_answer_type_logits = model(
                input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            # .detach() is redundant inside torch.no_grad(); outputs carry no
            # graph, so .cpu().tolist() is sufficient.
            start_logits = batch_start_logits[i].cpu().tolist()
            end_logits = batch_end_logits[i].cpu().tolist()
            answer_type_logits = batch_answer_type_logits[i].cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits,
                                         answer_type_logits=answer_type_logits))

    output_prediction_file = os.path.join(args.output_dir, "predictions.json")
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")

    # Post-process: since one example maps to one or more features,
    # write_predictions aggregates the per-feature logits back per example.
    write_predictions(eval_examples, eval_features, all_results,
                      args.n_best_size, args.max_answer_length,
                      args.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file,
                      args.verbose_logging, args.version_2_with_negative,
                      args.null_score_diff_threshold)
from Io.data_loader import create_batch_iter
from preprocessing.data_processor import read_squad_data, convert_examples_to_features, read_qa_examples
from pytorch_pretrained_bert.tokenization import BertTokenizer
from predict.predict import main
#from pytorch_pretrained_bert.modeling import BertPreTrainedModel


if __name__ == "__main__":
    # Smoke-test the preprocessing pipeline on a small training sample:
    # convert raw SQuAD-format data, load examples, then featurize them.
    read_squad_data("data/small_train_data.json", "data/")

    train_examples = read_qa_examples("data/", "train")
    print(len(train_examples))

    bert_tokenizer = BertTokenizer("pretrained_model/vocab.txt")
    train_features = convert_examples_to_features(
        train_examples,
        tokenizer=bert_tokenizer,
        max_seq_length=512,
        doc_stride=500,
        max_query_length=32,
        is_training=True)
    print(len(train_features))
    # main()