def predict(args, model, tokenizer, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # Predict!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    output_submit_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        logits = outputs[0]
        preds = logits.detach().cpu().numpy()
        preds = np.argmax(preds, axis=2).tolist()
        preds = preds[0][1:-1]  # strip the [CLS] and [SEP] positions
        tags = [args.id2label[x] for x in preds]
        label_entities = get_entities(preds, args.id2label, args.markup)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join(tags)
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    logger.info("\n")
    with open(output_submit_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
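# The softmax-based predict() above hands its per-token label ids to a helper called
# `get_entities` to obtain (type, start, end) chunks. The function below is only a
# minimal illustrative sketch of that idea for BIO markup -- an assumption, not the
# repository's implementation -- and `extract_bio_chunks` is a hypothetical name.
def extract_bio_chunks(pred_ids, id2label):
    """Collect (entity_type, start_index, end_index) tuples from BIO label ids."""
    chunks, current = [], None  # current = [entity_type, start, end]
    for idx, pid in enumerate(pred_ids):
        tag = id2label[pid]
        if tag.startswith("B-"):
            if current is not None:
                chunks.append(tuple(current))
            current = [tag[2:], idx, idx]
        elif tag.startswith("I-") and current is not None and tag[2:] == current[0]:
            current[2] = idx  # extend the currently open entity
        else:
            if current is not None:
                chunks.append(tuple(current))
            current = None
    if current is not None:
        chunks.append(tuple(current))
    return chunks

# Example: ids mapping to O B-PER I-PER O B-LOC -> [('PER', 1, 2), ('LOC', 4, 4)]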
def evaluate(args, model, tokenizer, prefix=""):
    metric = SpanEntityScore(args.id2label)
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_features = load_and_cache_examples(args, args.task_name, tokenizer, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_features))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    pbar = ProgressBar(n_total=len(eval_features), desc="Evaluating")
    for step, f in enumerate(eval_features):
        input_lens = f.input_len
        input_ids = torch.tensor([f.input_ids[:input_lens]], dtype=torch.long).to(args.device)
        input_mask = torch.tensor([f.input_mask[:input_lens]], dtype=torch.long).to(args.device)
        segment_ids = torch.tensor([f.segment_ids[:input_lens]], dtype=torch.long).to(args.device)
        start_ids = torch.tensor([f.start_ids[:input_lens]], dtype=torch.long).to(args.device)
        end_ids = torch.tensor([f.end_ids[:input_lens]], dtype=torch.long).to(args.device)
        subjects = f.subjects
        model.eval()
        with torch.no_grad():
            inputs = {"input_ids": input_ids,
                      "attention_mask": input_mask,
                      "start_positions": start_ids,
                      "end_positions": end_ids}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (segment_ids if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        tmp_eval_loss, start_logits, end_logits = outputs[:3]
        R = bert_extract_item(start_logits, end_logits)
        T = subjects
        metric.update(true_subject=T, pred_subject=R)
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        pbar(step)
    logger.info("\n")
    eval_loss = eval_loss / nb_eval_steps
    eval_info, entity_info = metric.result()
    results = {f'{key}': value for key, value in eval_info.items()}
    results['loss'] = eval_loss
    logger.info("***** Eval results %s *****", prefix)
    info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()])
    logger.info(info)
    logger.info("***** Entity results %s *****", prefix)
    for key in sorted(entity_info.keys()):
        logger.info("******* %s results ********" % key)
        info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()])
        logger.info(info)
    return results
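# The span-based evaluate() above calls `bert_extract_item` to turn start/end logits
# into predicted (label_id, start, end) spans. The sketch below shows one plausible
# way to do that pairing -- argmax both heads, drop [CLS]/[SEP], then match each
# non-"O" start label with the nearest end position carrying the same label. It is
# an assumption for illustration, not necessarily the repository's exact code.
import torch

def extract_spans_from_logits(start_logits, end_logits):
    spans = []
    # Keep the single example in the batch and drop the [CLS]/[SEP] positions.
    start_pred = torch.argmax(start_logits, -1).cpu().numpy()[0][1:-1]
    end_pred = torch.argmax(end_logits, -1).cpu().numpy()[0][1:-1]
    for i, s_label in enumerate(start_pred):
        if s_label == 0:  # 0 is assumed to be the "no entity" label
            continue
        for j, e_label in enumerate(end_pred[i:]):
            if s_label == e_label:
                spans.append((s_label, i, i + j))
                break
    return spans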
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size, collate_fn=collate_fn)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    bert_param_optimizer = list(model.bert.named_parameters())
    crf_param_optimizer = list(model.crf.named_parameters())
    linear_param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay, 'lr': args.learning_rate},
        {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': args.learning_rate},
        {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
        {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': args.crf_learning_rate},
        {'params': [p for n, p in linear_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate},
        {'params': [p for n, p in linear_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': args.crf_learning_rate},
    ]
    args.warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path) and "checkpoint" in args.model_name_or_path:
        # Set global_step to the global_step of the last saved checkpoint from the model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "labels": batch[3], 'input_lens': batch[4]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuples in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule (after optimizer.step() for PyTorch >= 1.1)
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    print(" ")
                    if args.local_rank == -1:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        evaluate(args, model, tokenizer)
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = (model.module if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    tokenizer.save_vocabulary(output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
        logger.info("\n")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
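# The DataLoaders above are all built with a `collate_fn`. The sketch below shows
# the kind of dynamic-padding collate function this code appears to expect: stack
# the pre-padded feature tensors, then trim every field in the batch to the longest
# real sequence (batch[4] / `input_lens`). This is a hedged assumption about its
# behavior, not the repository's exact implementation; `collate_fn_sketch` is a
# hypothetical name.
import torch

def collate_fn_sketch(batch):
    """batch: list of (input_ids, attention_mask, token_type_ids, labels, lens) tensors."""
    all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_lens = map(
        torch.stack, zip(*batch))
    max_len = max(all_lens).item()
    all_input_ids = all_input_ids[:, :max_len]
    all_attention_mask = all_attention_mask[:, :max_len]
    all_token_type_ids = all_token_type_ids[:, :max_len]
    all_labels = all_labels[:, :max_len]
    return all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_lens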
def predict(args, model, tokenizer, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # Predict!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, nn.DataParallel):
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "labels": None, 'input_lens': batch[4]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            logits = outputs[0]
            tags = model.crf.decode(logits, inputs['attention_mask'])
            tags = tags.squeeze(0).cpu().numpy().tolist()
        preds = tags[0][1:-1]  # strip the [CLS] and [SEP] positions
        label_entities = get_entities(preds, args.id2label, args.markup)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join([args.id2label[x] for x in preds])
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    logger.info("\n")
    with open(output_predict_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    if args.task_name == 'cluener':
        output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json")
        test_text = []
        with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
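# The CLUENER branch above hands the submission records to `json_to_text`. A helper
# like that is presumably just a JSON-lines writer; the version below is a minimal
# sketch under that assumption (keeping non-ASCII characters readable), not
# necessarily the repository's implementation. `json_to_text_sketch` is a
# hypothetical name.
import json

def json_to_text_sketch(file_path, data):
    """Write each record as one JSON object per line."""
    with open(file_path, "w", encoding="utf-8") as fw:
        for record in data:
            fw.write(json.dumps(record, ensure_ascii=False) + "\n")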
def evaluate(args, model, tokenizer, prefix=""):
    metric = SeqEntityScore(args.id2label, markup=args.markup)
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    if isinstance(model, nn.DataParallel):
        model = model.module
    for step, batch in enumerate(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "labels": batch[3], 'input_lens': batch[4]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            tags = model.crf.decode(logits, inputs['attention_mask'])
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        out_label_ids = inputs['labels'].cpu().numpy().tolist()
        input_lens = inputs['input_lens'].cpu().numpy().tolist()
        tags = tags.squeeze(0).cpu().numpy().tolist()
        for i, label in enumerate(out_label_ids):
            temp_1 = []
            temp_2 = []
            for j, m in enumerate(label):
                if j == 0:
                    continue
                elif j == input_lens[i] - 1:
                    metric.update(pred_paths=[temp_2], label_paths=[temp_1])
                    break
                else:
                    temp_1.append(args.id2label[out_label_ids[i][j]])
                    temp_2.append(args.id2label[tags[i][j]])
        pbar(step)
    logger.info("\n")
    eval_loss = eval_loss / nb_eval_steps
    eval_info, entity_info = metric.result()
    results = {f'{key}': value for key, value in eval_info.items()}
    results['loss'] = eval_loss
    logger.info("***** Eval results %s *****", prefix)
    info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()])
    logger.info(info)
    logger.info("***** Entity results %s *****", prefix)
    for key in sorted(entity_info.keys()):
        logger.info("******* %s results ********" % key)
        info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()])
        logger.info(info)
    return results
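# Both evaluate() variants report entity-level scores through SeqEntityScore /
# SpanEntityScore. The class below is a self-contained sketch of how such a metric
# can compute precision, recall and F1 over gold vs. predicted entity chunks; it is
# an illustrative assumption, not the repository's metric classes, and
# `EntityF1Sketch` is a hypothetical name.
class EntityF1Sketch:
    def __init__(self):
        self.origins, self.founds, self.rights = [], [], []

    def update(self, true_entities, pred_entities):
        """Each entity is a hashable chunk, e.g. (type, start, end)."""
        self.origins.extend(true_entities)
        self.founds.extend(pred_entities)
        self.rights.extend([e for e in pred_entities if e in true_entities])

    def result(self):
        origin, found, right = len(self.origins), len(self.founds), len(self.rights)
        recall = right / origin if origin else 0.0
        precision = right / found if found else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        return {"acc": precision, "recall": recall, "f1": f1}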