import json
import logging
import os

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

# NOTE: `data_utils` and `modelconfig` are local project modules; the BERT
# classes below come from the pytorch-pretrained-bert package this script
# targets.
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from pytorch_pretrained_bert.tokenization import BertTokenizer

import data_utils
import modelconfig

logger = logging.getLogger(__name__)


def gen(args):
    """Convert the SQuAD-style train/dev JSON files into compressed .npz arrays."""
    tokenizer = BertTokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])

    train_examples = data_utils.read_squad_examples(
        os.path.join(args.input_dir, "train.json"), is_training=True)
    train_features = data_utils.convert_examples_to_features(
        train_examples, tokenizer, args.max_seq_length, args.doc_stride,
        args.max_query_length, is_training=True)
    logger.info("***** Generating training features *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features))

    # int16 is large enough for BERT's ~30k-token vocabulary and for positions
    # bounded by max_seq_length, and halves the on-disk size versus int32.
    input_ids_np = np.array([f.input_ids for f in train_features], dtype=np.int16)
    segment_ids_np = np.array([f.segment_ids for f in train_features], dtype=np.int16)
    input_mask_np = np.array([f.input_mask for f in train_features], dtype=np.int16)
    start_positions_np = np.array([f.start_position for f in train_features], dtype=np.int16)
    end_positions_np = np.array([f.end_position for f in train_features], dtype=np.int16)
    np.savez_compressed(os.path.join(args.output_dir, "data.npz"),
                        input_ids=input_ids_np,
                        segment_ids=segment_ids_np,
                        input_mask=input_mask_np,
                        start_positions=start_positions_np,
                        end_positions=end_positions_np)

    # >>>>> validation
    valid_examples = data_utils.read_squad_examples(
        os.path.join(args.input_dir, "dev.json"), is_training=True)
    valid_features = data_utils.convert_examples_to_features(
        valid_examples, tokenizer, args.max_seq_length, args.doc_stride,
        args.max_query_length, is_training=True)
    logger.info("  Num orig examples = %d", len(valid_examples))
    logger.info("  Num split examples = %d", len(valid_features))

    valid_input_ids_np = np.array([f.input_ids for f in valid_features], dtype=np.int16)
    valid_segment_ids_np = np.array([f.segment_ids for f in valid_features], dtype=np.int16)
    valid_input_mask_np = np.array([f.input_mask for f in valid_features], dtype=np.int16)
    valid_start_positions_np = np.array([f.start_position for f in valid_features], dtype=np.int16)
    valid_end_positions_np = np.array([f.end_position for f in valid_features], dtype=np.int16)
    np.savez_compressed(os.path.join(args.output_dir, "dev.npz"),
                        input_ids=valid_input_ids_np,
                        segment_ids=valid_segment_ids_np,
                        input_mask=valid_input_mask_np,
                        start_positions=valid_start_positions_np,
                        end_positions=valid_end_positions_np)
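
# Not part of the original file: a minimal sketch of how the arrays written by
# gen() could be loaded back. The keys match the savez_compressed calls above;
# the helper name `_load_npz_summary` is purely illustrative.
def _load_npz_summary(path):
    with np.load(path) as data:
        # Widen int16 ids back to int64 before building torch.long tensors.
        input_ids = data["input_ids"].astype(np.int64)
        start_positions = data["start_positions"]
    print("features:", input_ids.shape[0],
          "seq length:", input_ids.shape[1],
          "max start position:", int(start_positions.max()))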
def test(args):
    # Load a trained, fine-tuned model and run prediction on the GPU.
    tokenizer = BertTokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    eval_examples = data_utils.read_squad_examples(
        os.path.join(args.data_dir, "test.json"), is_training=False)
    eval_features = data_utils.convert_examples_to_features(
        eval_examples, tokenizer, args.max_seq_length, args.doc_stride,
        args.max_query_length, is_training=False)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    # Carry each feature's index through the dataloader so predictions can be
    # matched back to their features after batching.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                              all_example_index)

    # Run prediction over the full data set in order.
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    model = torch.load(os.path.join(args.output_dir, "model.pt"))
    model.cuda()
    model.eval()

    all_results = []
    for step, batch in enumerate(eval_dataloader):
        example_indices = batch[-1]
        batch = tuple(t.cuda() for t in batch[:-1])
        input_ids, segment_ids, input_mask = batch
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(data_utils.RawResult(unique_id=unique_id,
                                                    start_logits=start_logits,
                                                    end_logits=end_logits))

    output_prediction_file = os.path.join(args.output_dir, "predictions.json")
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
    data_utils.write_predictions(eval_examples, eval_features, all_results,
                                 args.n_best_size, args.max_answer_length, True,
                                 output_prediction_file, output_nbest_file, False)
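
# Not part of the original file: a small, illustrative helper for eyeballing
# the predictions.json written by test(). It assumes the file maps question
# ids to answer strings, the usual SQuAD prediction format.
def _peek_predictions(output_dir, n=5):
    with open(os.path.join(output_dir, "predictions.json")) as f:
        preds = json.load(f)
    for qid, answer in list(preds.items())[:n]:
        print(qid, "->", answer)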
def train(args):
    # Each optimizer step accumulates `gradient_accumulation_steps` forward
    # passes, so shrink the per-pass batch size accordingly.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    tokenizer = BertTokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    train_examples = data_utils.read_squad_examples(
        os.path.join(args.data_dir, "train.json"), is_training=True)
    # Total number of optimizer updates over the run, used for LR warmup.
    num_train_steps = int(len(train_examples) / args.train_batch_size /
                          args.gradient_accumulation_steps) * args.num_train_epochs
    train_features = data_utils.convert_examples_to_features(
        train_examples, tokenizer, args.max_seq_length, args.doc_stride,
        args.max_query_length, is_training=True)
    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                               all_start_positions, all_end_positions)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # >>>>> validation
    if args.do_valid:
        valid_examples = data_utils.read_squad_examples(
            os.path.join(args.data_dir, "dev.json"), is_training=True)
        valid_features = data_utils.convert_examples_to_features(
            valid_examples, tokenizer, args.max_seq_length, args.doc_stride,
            args.max_query_length, is_training=True)
        valid_all_input_ids = torch.tensor([f.input_ids for f in valid_features], dtype=torch.long)
        valid_all_segment_ids = torch.tensor([f.segment_ids for f in valid_features], dtype=torch.long)
        valid_all_input_mask = torch.tensor([f.input_mask for f in valid_features], dtype=torch.long)
        valid_all_start_positions = torch.tensor([f.start_position for f in valid_features], dtype=torch.long)
        valid_all_end_positions = torch.tensor([f.end_position for f in valid_features], dtype=torch.long)
        valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids,
                                   valid_all_input_mask, valid_all_start_positions,
                                   valid_all_end_positions)
        logger.info("***** Running validation *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data, sampler=valid_sampler,
                                      batch_size=args.train_batch_size)
        best_valid_loss = float('inf')
        valid_losses = []
    # <<<<< end of validation declaration

    # `--bert_model` is either a key into MODEL_ARCHIVE_MAP (fresh pretrained
    # weights) or a path to a previously saved .pt checkpoint.
    if not args.bert_model.endswith(".pt"):
        model = BertForQuestionAnswering.from_pretrained(
            modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    else:
        model = torch.load(args.bert_model)
    if args.fp16:
        model.half()
    model.cuda()

    # Prepare the optimizer: no weight decay on biases or LayerNorm parameters,
    # and the pooler is excluded because the QA head does not use it.
    param_optimizer = [(k, v) for k, v in model.named_parameters() if v.requires_grad]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    model.train()
    for _ in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.cuda() for t in batch)
            input_ids, segment_ids, input_mask, start_positions, end_positions = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # Modify the learning rate with the special warm-up BERT
                    # uses. FusedAdam does not schedule the LR itself; BertAdam
                    # (the non-fp16 path) handles warmup internally, so applying
                    # this there as well would warm up twice.
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        # >>>>> perform validation at the end of each epoch
        if args.do_valid:
            model.eval()
            with torch.no_grad():
                losses = []
                valid_size = 0
                for step, batch in enumerate(valid_dataloader):
                    batch = tuple(t.cuda() for t in batch)
                    input_ids, segment_ids, input_mask, start_positions, end_positions = batch
                    loss = model(input_ids, segment_ids, input_mask,
                                 start_positions, end_positions)
                    # Weight each batch loss by its size so the mean is exact
                    # even when the final batch is smaller.
                    losses.append(loss.data.item() * input_ids.size(0))
                    valid_size += input_ids.size(0)
                valid_loss = sum(losses) / valid_size
                logger.info("validation loss: %f", valid_loss)
                valid_losses.append(valid_loss)
            # Keep only the checkpoint with the best validation loss so far.
            if valid_loss < best_valid_loss:
                torch.save(model, os.path.join(args.output_dir, "model.pt"))
                best_valid_loss = valid_loss
            model.train()

    if args.do_valid:
        with open(os.path.join(args.output_dir, "valid.json"), "w") as fw:
            json.dump({"valid_losses": valid_losses}, fw)
    else:
        torch.save(model, os.path.join(args.output_dir, "model.pt"))
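
# The original entry point is not shown in this file; the block below is a
# hypothetical sketch of how gen/train/test might be wired up with argparse.
# Every flag mirrors an `args.` attribute used above, but the defaults
# (e.g. 384/128/64, the batch sizes) are assumptions, not values from the source.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["gen", "train", "test"], required=True)
    parser.add_argument("--bert_model", required=True)
    parser.add_argument("--input_dir")       # used by gen()
    parser.add_argument("--data_dir")        # used by train() and test()
    parser.add_argument("--output_dir", required=True)
    parser.add_argument("--max_seq_length", type=int, default=384)
    parser.add_argument("--doc_stride", type=int, default=128)
    parser.add_argument("--max_query_length", type=int, default=64)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=8)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--learning_rate", type=float, default=3e-5)
    parser.add_argument("--warmup_proportion", type=float, default=0.1)
    parser.add_argument("--n_best_size", type=int, default=20)
    parser.add_argument("--max_answer_length", type=int, default=30)
    parser.add_argument("--do_valid", action="store_true")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--loss_scale", type=float, default=0)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    {"gen": gen, "train": train, "test": test}[args.mode](args)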