def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='data/conll2003/', type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default='NER', type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='ner_output', type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where to store the pre-trained models downloaded from s3.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_pred", action='store_true',
                        help="Whether to run interactive prediction.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=4.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--clip', type=float, default=0.5,
                        help="gradient clipping threshold")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit.")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--text_a', type=str, default='', help="input text_a.")
    parser.add_argument('--text_b', type=str, default='', help="input text_b.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_pred:
        raise ValueError("At least one of `do_train`, `do_eval` or `do_pred` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name]()
    label_list = processor.get_labels(args.data_dir)
    num_labels_task = {"ner": len(label_list)}
    num_labels = num_labels_task[task_name]

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        print("train_examples :: ", len(list(train_examples)))
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
    model = BertForTokenClassification.from_pretrained(args.bert_model,
                                                       cache_dir=cache_dir,
                                                       num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # added clip (clip_grad_norm_ is the in-place, non-deprecated variant)
                if args.clip is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses;
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load the trained model and config that were just fine-tuned
        config = BertConfig(output_config_file)
        model = BertForTokenClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        # Eval-only mode: load a previously fine-tuned model and config
        print('for eval only......................')
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = BertConfig(output_config_file)
        model = BertForTokenClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        print("dev_examples :: ", len(list(eval_examples)))
        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            # Truncate each sequence at the first '[SEP]' label
            pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
            pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx]
            label_ids_xx = [i[:i.index(label_list.index('[SEP]'))] for i in label_ids.tolist()]

            # Pad the shorter of (labels, predictions) with the sentinel id 31 so both align
            tmp_s = [max(len(i), len(j)) for i, j in zip(label_ids_xx, pred_xx)]
            tmp_u = [(i + [31] * (k - len(i)) if len(i) != k else i,
                      j + [31] * (k - len(j)) if len(j) != k else j)
                     for i, j, k in zip(label_ids_xx, pred_xx, tmp_s)]
            tmp_d1 = [h[0] for h in tmp_u]
            tmp_d2 = [h[1] for h in tmp_u]

            tmp_eval_accuracy = flat_accc(pred_xx, label_ids_xx)
            predictions.extend(tmp_d2)
            true_labels.append(tmp_d1)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_steps
        loss = tr_loss / nb_tr_steps if args.do_train else None

        # Map ids back to tag strings; the sentinel id 31 marks alignment padding
        pred_tags = [[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in predictions]
        valid_tags = [[label_list[l_ii] if l_ii != 31 else 'YYY' for l_ii in l_i]
                      for l in true_labels for l_i in l]
        print("valid_tags : ", valid_tags[:10])
        print("pred_tags : ", pred_tags[:10])
        print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
        print("Validation accuracy_score : {}".format(accuracy_score(valid_tags, pred_tags)))
        print("Validation classification_report : {}".format(classification_report(valid_tags, pred_tags)))

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}
        print(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(args.data_dir)
        print('test examples len : {}'.format(len(eval_examples)))
        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            # Truncate each sequence at the first '[SEP]' label
            pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
            pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx]
            label_ids_xx = [i[:i.index(label_list.index('[SEP]'))] for i in label_ids.tolist()]

            # Pad the shorter of (labels, predictions) with the sentinel id 31 so both align
            tmp_s = [max(len(i), len(j)) for i, j in zip(label_ids_xx, pred_xx)]
            tmp_u = [(i + [31] * (k - len(i)) if len(i) != k else i,
                      j + [31] * (k - len(j)) if len(j) != k else j)
                     for i, j, k in zip(label_ids_xx, pred_xx, tmp_s)]
            tmp_d1 = [h[0] for h in tmp_u]
            tmp_d2 = [h[1] for h in tmp_u]

            tmp_eval_accuracy = flat_accc(pred_xx, label_ids_xx)
            predictions.extend(tmp_d2)
            true_labels.append(tmp_d1)

            test_loss += tmp_eval_loss.mean().item()
            test_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        test_loss = test_loss / nb_eval_steps
        test_accuracy = test_accuracy / nb_eval_steps
        loss = tr_loss / nb_tr_steps if args.do_train else None

        pred_tags = [[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in predictions]
        valid_tags = [[label_list[l_ii] if l_ii != 31 else 'YYY' for l_ii in l_i]
                      for l in true_labels for l_i in l]
        print("valid_tags : ", valid_tags[:10])
        print("pred_tags : ", pred_tags[:10])
        print("Test F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
        print("Test accuracy_score : {}".format(accuracy_score(valid_tags, pred_tags)))
        print("Test classification_report : {}".format(classification_report(valid_tags, pred_tags)))

        result = {'test_loss': test_loss,
                  'test_accuracy': test_accuracy,
                  'global_step': global_step,
                  'loss': loss}
        print(result)
        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_pred and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        model.eval()
        while True:
            print('enter a text to get NER. otherwise press Ctrl+C to close session.')
            text_a = input('>>>')
            # e.g. "Japan began the defence of their Asian Cup title with a lucky 2-1 win
            # against Syria in a Group C championship match on Friday ."
            eval_examples = {'text_a': text_a,
                             'text_b': "The foodservice pie business does not fit our long-term growth strategy .",
                             'label': '1',
                             'guid': '12345'}
            eval_features = convert_examples_to_features_test(eval_examples, label_list,
                                                              args.max_seq_length, tokenizer)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                    logits = model(input_ids, segment_ids, input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()

                pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
                pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx]
                print(pred_xx)
                print([[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in pred_xx])
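# --- Hypothetical helper sketches (assumptions, not part of the original source) ---
# `warmup_linear` and `flat_accc` are called above but never defined in this file.
# The sketches below follow the conventions of the pytorch_pretrained_bert examples
# (linear warmup then linear decay, and flat token-level accuracy over variable-length
# sequences); treat them as minimal stand-ins, not the author's actual implementations.

def warmup_linear(x, warmup=0.002):
    # Linearly increase the multiplier to 1.0 during warmup, then decay linearly.
    if x < warmup:
        return x / warmup
    return 1.0 - x

def flat_accc(preds, labels):
    # Token-level accuracy: flatten all sequences and compare position-wise.
    pred_flat = [p for seq in preds for p in seq]
    labels_flat = [l for seq in labels for l in seq]
    matched = sum(p == l for p, l in zip(pred_flat, labels_flat))
    return matched / max(len(labels_flat), 1)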
def eval_model(processor, data_dir, label_list, max_seq_length, eval_batch_size, tokenizer,
               device, model, tr_loss, nb_tr_steps, do_train, global_step, output_dir, epoch):
    eval_examples = processor.get_dev_examples(data_dir)
    get_eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)
    eval_features = next(get_eval_features)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    # Num inputs can differ from Num examples when oversized sequences are dropped
    logger.info("  Num inputs = %d", len(eval_features))
    logger.info("  Batch size = %d", eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Confusion matrix indexed as cm[true_label][predicted_label]
    cm = [[0 for i in range(len(label_list))] for j in range(len(label_list))]
    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)
        for didx in range(logits.shape[0]):
            pred = int(np.argmax(logits[didx]))
            cm[label_ids[didx]][pred] += 1

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    loss = tr_loss / nb_tr_steps if do_train else None
    result = {
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'global_step': global_step,
        'loss': loss,
        'cm': cm,
        'nb_eval_examples': nb_eval_examples
    }

    output_eval_file = os.path.join(output_dir, "eval_results_%d.txt" % epoch)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
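# --- Hypothetical helper sketch (assumption, not part of the original source) ---
# `accuracy` is used by eval_model above but not defined in this snippet. This minimal
# sketch assumes the sequence-classification convention of the pytorch_pretrained_bert
# examples: the number of argmax predictions that match the gold labels.

def accuracy(out, labels):
    # `out` is a (batch, num_labels) logits array; returns the count of correct predictions.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)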
# Split train/test set
eval_examples_index = -1 * int(args.eval_percentage * float(len(all_examples)))
train_examples, eval_examples = all_examples[:eval_examples_index], all_examples[eval_examples_index:]
batch_num = len(train_examples) // args.batch_size + 1

train_data = get_tensor_data(train_examples, "train")
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

eval_data = get_tensor_data(eval_examples, "eval")
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

# =========== MODEL ================================
model_para = {
    'item_size': len(item_vocab),
    'hidden_size': args.hidden_size,
    'block_num': args.block_num,
    'seq_len': len(all_examples[0]),
    'rezero': False,
    'dropout': args.dropout,
    # nextitnet
    'base_block': [1, 4],
def evaluate(args, model, eval_dataset=None, tokenizer=None):
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    if not eval_dataset and tokenizer:
        eval_dataset = load_and_cache_examples(args, tokenizer, set_type='dev')
    if not eval_dataset:
        raise ValueError('The eval or test dataset can not be None')

    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=1)
    result = compute_metrics(out_label_ids, preds, average='binary')
    result['eval_loss'] = round(eval_loss, 4)
    report = classification_report(out_label_ids, preds, digits=5)
    print(report)
    return result, preds
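# --- Hypothetical helper sketch (assumption, not part of the original source) ---
# `compute_metrics` is not defined in this snippet; the sketch below assumes it wraps
# sklearn's precision/recall/F1 with a configurable averaging mode, matching the
# (labels, predictions, average=...) call signature used above.

from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(labels, preds, average='binary'):
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = (np.asarray(labels) == np.asarray(preds)).mean()
    return {'acc': round(float(acc), 4),
            'precision': round(float(precision), 4),
            'recall': round(float(recall), 4),
            'f1': round(float(f1), 4)}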
def train(model, tokenizer, train_data, valid_data, args, eos=False):
    model.train()
    train_dataset = TextDataset(train_data)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args.train_batch_size,
        num_workers=args.num_workers,
        collate_fn=lambda x: collate_fn_MLM(x, tokenizer, args.max_seq_length, eos=eos,
                                            tokenizer_type=args.tokenizer))
    valid_dataset = TextDataset(valid_data)
    valid_dataloader = DataLoader(
        valid_dataset,
        sampler=SequentialSampler(valid_dataset),
        batch_size=args.eval_batch_size,
        num_workers=args.num_workers,
        collate_fn=lambda x: collate_fn_MLM(x, tokenizer, args.max_seq_length, eos=eos,
                                            tokenizer_type=args.tokenizer))
    valid_clean = [x for x in valid_data]

    epochs = (args.max_steps - 1) // len(train_dataloader) + 1
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 betas=eval(args.adam_betas),
                                 eps=args.eps,
                                 weight_decay=args.weight_decay)
    # Inverse-square-root schedule: linear warmup, then decay proportional to step^-0.5
    lr_lambda = lambda x: x / args.num_warmup_steps if x <= args.num_warmup_steps else (
        x / args.num_warmup_steps)**-0.5
    scheduler = LambdaLR(optimizer, lr_lambda)

    step = 0
    best_val_gleu = -float("inf")
    meter = Meter()
    for epoch in range(1, epochs + 1):
        print("===EPOCH: ", epoch)
        for batch in train_dataloader:
            step += 1
            batch = tuple(t.to(args.device) for t in batch)
            loss, items = calc_loss(model, batch)
            meter.add(*items)
            loss.backward()
            if args.max_grad_norm > 0:
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            model.zero_grad()
            scheduler.step()

            if step % args.log_interval == 0:
                lr = scheduler.get_lr()[0]
                loss_sent, loss_token = meter.average()
                logger.info(f' [{step:5d}] lr {lr:.6f} | {meter.print_str(True)}')
                nsml.report(step=step, scope=locals(), summary=True,
                            train__lr=lr,
                            train__loss_sent=loss_sent,
                            train__token_ppl=math.exp(loss_token))
                meter.init()

            if step % args.eval_interval == 0:
                start_eval = time.time()
                (val_loss, val_loss_token), valid_str = evaluate(model, valid_dataloader, args)
                prediction, valid_masked = correct(model, tokenizer, valid_clean, args,
                                                   eos=eos, length_limit=0.1)
                val_em = em(prediction, valid_clean)
                cnt = 0
                for noisy, pred, clean in zip(valid_masked, prediction, valid_clean):
                    print(f'[{noisy}], [{pred}], [{clean}]')  # print only the first 20 examples
                    cnt += 1
                    if cnt == 20:
                        break
                val_gleu = gleu(prediction, valid_clean)

                logger.info('-' * 89)
                logger.info(f' [{step:6d}] valid | {valid_str} | em {val_em:5.2f} | gleu {val_gleu:5.2f}')
                logger.info('-' * 89)
                nsml.report(step=step, scope=locals(), summary=True,
                            valid__loss_sent=val_loss,
                            valid__token_ppl=math.exp(val_loss_token),
                            valid__em=val_em,
                            valid__gleu=val_gleu)
                if val_gleu > best_val_gleu:
                    best_val_gleu = val_gleu
                    nsml.save("best")
                meter.start += time.time() - start_eval

            if step >= args.max_steps:
                break
        if step >= args.max_steps:
            break
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate; note the Viterbi decode below relies on the DataParallel
    # wrapper (model.module)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = []
    out_label_ids = None
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            input_ids = batch[0]
            attention_mask = batch[1]
            label_ids = batch[3]
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
                "device": args.device
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating

            eval_loss += tmp_eval_loss.item()

            for i in range(logits.size(0)):
                non_masked_idx = attention_mask[i] == 1
                non_masked_labels = label_ids[i][non_masked_idx]
                non_masked_logits = logits[i][non_masked_idx]
                # ignore label index -100
                keep_index = non_masked_labels != -100
                active_labels = non_masked_labels[keep_index]
                active_logits = non_masked_logits[keep_index]
                # Apply Viterbi decoding
                pred_score, pred = model.module._viterbi_decode(active_logits, args.device)
                preds.append(pred)

        nb_eval_steps += 1
        if nb_eval_steps == 1:
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    label_map = {i: label for i, label in enumerate(labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        k = 0
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i, j]])
                preds_list[i].append(label_map[preds[i][k]])
                k += 1

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list
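# Example (hypothetical) invocation of the evaluate() above, assuming `labels` comes
# from the task's label list and pad_token_label_id matches the value used when the
# dataset was cached (commonly CrossEntropyLoss().ignore_index, i.e. -100):
#
#   results, preds_list = evaluate(args, model, tokenizer, labels,
#                                  pad_token_label_id, mode="dev")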
def test_dataloaders_with_missing_keyword_arguments():
    ds = RandomDataset(10, 20)

    class TestDataLoader(DataLoader):
        def __init__(self, dataset):
            super().__init__(dataset)

    loader = TestDataLoader(ds)
    sampler = SequentialSampler(ds)
    match = escape("missing arguments are ['batch_sampler', 'sampler', 'shuffle']")
    with pytest.raises(MisconfigurationException, match=match):
        _update_dataloader(loader, sampler, mode="fit")
    match = escape("missing arguments are ['batch_sampler', 'batch_size', 'drop_last', 'sampler', 'shuffle']")
    with pytest.raises(MisconfigurationException, match=match):
        _update_dataloader(loader, sampler, mode="predict")

    class TestDataLoader(DataLoader):
        def __init__(self, dataset, *args, **kwargs):
            super().__init__(dataset)

    loader = TestDataLoader(ds)
    sampler = SequentialSampler(ds)
    _update_dataloader(loader, sampler, mode="fit")
    _update_dataloader(loader, sampler, mode="predict")

    class TestDataLoader(DataLoader):
        def __init__(self, *foo, **bar):
            super().__init__(*foo, **bar)

    loader = TestDataLoader(ds)
    sampler = SequentialSampler(ds)
    _update_dataloader(loader, sampler, mode="fit")
    _update_dataloader(loader, sampler, mode="predict")

    class TestDataLoader(DataLoader):
        def __init__(self, num_feat, dataset, *args, shuffle=False):
            self.num_feat = num_feat
            super().__init__(dataset)

    loader = TestDataLoader(1, ds)
    sampler = SequentialSampler(ds)
    match = escape("missing arguments are ['batch_sampler', 'sampler']")
    with pytest.raises(MisconfigurationException, match=match):
        _update_dataloader(loader, sampler, mode="fit")
    match = escape("missing arguments are ['batch_sampler', 'batch_size', 'drop_last', 'sampler']")
    with pytest.raises(MisconfigurationException, match=match):
        _update_dataloader(loader, sampler, mode="predict")

    class TestDataLoader(DataLoader):
        def __init__(self, num_feat, dataset, **kwargs):
            self.feat_num = num_feat
            super().__init__(dataset)

    loader = TestDataLoader(1, ds)
    sampler = SequentialSampler(ds)
    match = escape("missing attributes are ['num_feat']")
    with pytest.raises(MisconfigurationException, match=match):
        _update_dataloader(loader, sampler, mode="fit")
    match = escape("missing attributes are ['num_feat']")
    with pytest.raises(MisconfigurationException, match=match):
        _update_dataloader(loader, sampler, mode="predict")
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where to store the pre-trained models downloaded from s3.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit.")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mnli-mm": MnliMismatchedProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
        "adlhw2": MyTaskProcessor
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
        "adlhw2": "classification"
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=cache_dir,
                                                          num_labels=num_labels)
    # I think we should load the fine-tuned LM here!
    # config = BertConfig(output_config_file)
    # model = BertForSequenceClassification(config, num_labels=num_labels)
    # model.load_state_dict(torch.load())
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses;
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load the trained model and config that were just fine-tuned
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss / nb_tr_steps if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(eval_examples, label_list,
                                                         args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, segment_ids, input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
            loss = tr_loss / nb_tr_steps if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
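# Hypothetical invocation of the classifier script above (the file name, paths and
# task are placeholders, not taken from the original source):
#
#   python run_classifier.py \
#       --data_dir data/mrpc --bert_model bert-base-uncased --task_name mrpc \
#       --output_dir mrpc_output --do_train --do_eval --do_lower_case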
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint", default=None, type=str, required=True,
                        help="The checkpoint file from pretraining")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1.0, type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit.")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForMultipleChoice.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
        num_choices=4)
    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used,
    # as it produces a None grad that breaks apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # Terminate early for benchmarking
                if args.max_steps > 0 and global_step > args.max_steps:
                    break

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForMultipleChoice(config, num_choices=4) model.load_state_dict(torch.load(output_model_file)) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = 
{'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': (tr_loss / nb_tr_steps if args.do_train else None)} # tr_loss/nb_tr_steps are only defined when --do_train ran output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
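The evaluation loop above calls an accuracy helper that this excerpt does not define. A minimal sketch consistent with the call site (numpy logits of shape [batch, num_choices] and integer label ids, returning a count that is later divided by nb_eval_examples):

import numpy as np

def accuracy(out, labels):
    # Count how many argmax predictions in this batch match the gold labels.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)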
def dump_phrases(args, model, tokenizer): if not os.path.exists(os.path.join(args.output_dir, 'dump/phrase')): os.makedirs(os.path.join(args.output_dir, 'dump/phrase')) start_time = timeit.default_timer() if ':' not in args.predict_file: predict_files = [args.predict_file] offsets = [0] output_dump_file = os.path.join( args.output_dir, "dump/phrase/{}.hdf5".format(os.path.splitext(os.path.basename(args.predict_file))[0]) ) else: dirname = os.path.dirname(args.predict_file) basename = os.path.basename(args.predict_file) start, end = list(map(int, basename.split(':'))) output_dump_file = os.path.join( args.output_dir, f"dump/phrase/{start}-{end}.hdf5" ) # skip files if possible if os.path.exists(output_dump_file): with h5py.File(output_dump_file, 'r') as f: dids = list(map(int, f.keys())) start = int(max(dids) / 1000) logger.info('%s exists; starting from %d' % (output_dump_file, start)) names = [str(i).zfill(4) for i in range(start, end)] predict_files = [os.path.join(dirname, name) for name in names] offsets = [int(each) * 1000 for each in names] for offset, predict_file in zip(offsets, predict_files): args.predict_file = predict_file logger.info(f"***** Pre-processing contexts from {args.predict_file} *****") dataset, examples, features = load_and_cache_examples( args, tokenizer, evaluate=True, output_examples=True, context_only=True ) for example in examples: example.doc_idx += offset args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) logger.info(f"***** Dumping Phrases from {args.predict_file} *****") logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) start_time = timeit.default_timer() def get_phrase_results(): for batch in tqdm(eval_dataloader, desc="Dumping"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "return_phrase": True, } feature_indices = batch[3] outputs = model(**inputs) for i, feature_index in enumerate(feature_indices): # TODO: i and feature_index are the same number! Simplify by removing enumerate? eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) output = [ to_numpy(output[i]) if type(output) != dict else {k: to_numpy(v[i]) for k, v in output.items()} for output in outputs ] if len(output) != 4: raise NotImplementedError else: start_vecs, end_vecs, sft_logits, eft_logits = output result = squad_utils.ContextResult( unique_id, start_vecs=start_vecs, end_vecs=end_vecs, sft_logits=sft_logits, eft_logits=eft_logits, ) yield result write_phrases( examples, features, get_phrase_results(), args.max_answer_length, args.do_lower_case, tokenizer, output_dump_file, args.filter_threshold, args.verbose_logging, args.dense_offset, args.dense_scale, has_title=args.append_title, ) evalTime = timeit.default_timer() - start_time logger.info("Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
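dump_phrases converts model outputs with a to_numpy helper that is not shown in this excerpt; a minimal sketch of what the call sites require:

def to_numpy(tensor):
    # Detach from the autograd graph, move to host memory, convert to numpy.
    return tensor.detach().cpu().numpy()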
print("Loss: {:.4f}".format(loss.item())) # Parse test data test = parseXml("test.xml") # Convert to integer encoding encodeData(test, train.dict) # Convert to tensors x1_test_tns = torch.tensor(test.prem) x2_test_tns = torch.tensor(test.hyp) y_test_tns = torch.tensor(test.lab) # Initialize TensorDataset and DataLoader test_tns = TensorDataset(x1_test_tns, x2_test_tns, y_test_tns) # Random Sampler (for DataLoader) test_sampler = SequentialSampler(test_tns) batch_size = 16 test_ldr = DataLoader(dataset=test_tns, batch_size=batch_size, sampler=test_sampler) # Put prem and hyp through network for test avg_perform = [0, 0, 0, 0, 0] total_batches = 0 for p, h, l in train_ldr: inference_time = time.time() calculated = model(p, h).cpu() throughput = round(time.time() - inference_time, 4) print('---------------')
def evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) start_time = timeit.default_timer() def get_results(): for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "input_ids_": batch[6], "attention_mask_": batch[7], "token_type_ids_": batch[8], } feature_indices = batch[3] outputs = model(**inputs) for i, feature_index in enumerate(feature_indices): # TODO: i and feature_index are the same number! Simplify by removing enumerate? eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] if len(output) != 4: raise NotImplementedError else: start_logits, end_logits, sft_logits, eft_logits = output result = squad_utils.SquadResult( unique_id, start_logits=start_logits, end_logits=end_logits, sft_logits=sft_logits, eft_logits=eft_logits, ) yield result # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) output_candidates_file = os.path.join(args.output_dir, "candidates_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure if args.model_type in ["xlnet", "xlm"]: raise NotImplementedError else: predictions, stat = compute_predictions_logits( examples, features, get_results(), args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, args.filter_threshold, output_candidates_file, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions, null_log_odds_file=output_null_log_odds_file) # Log stat locally with open('eval_logger.txt', 'a') as f: f.write(f'{args.output_dir}\t{results["exact"]:.3f}\t{results["f1"]:.3f}\n') evalTime = timeit.default_timer() - start_time logger.info("Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) return results, stat
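This evaluate function (and the SQuAD-style ones later in the file) calls to_list on model outputs; in the upstream transformers examples it is defined as:

def to_list(tensor):
    return tensor.detach().cpu().tolist()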
def main(args): print(args) train_set, val_set = load_data(args.data_path, use_openfire=args.use_openfire, img_size=args.img_size) train_loader = torch.utils.data.DataLoader( train_set, batch_size=args.batch_size, drop_last=True, sampler=RandomSampler(train_set), num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader( val_set, batch_size=args.batch_size, drop_last=False, sampler=SequentialSampler(val_set), num_workers=args.workers, pin_memory=True) print("Creating model") model = holocron.models.__dict__[args.model](args.pretrained, num_classes=1) criterion = nn.BCEWithLogitsLoss() # Create the contiguous parameters. model_params = [p for p in model.parameters() if p.requires_grad] if args.opt == 'sgd': optimizer = torch.optim.SGD(model_params, args.lr, momentum=0.9, weight_decay=args.weight_decay) elif args.opt == 'adam': optimizer = torch.optim.Adam(model_params, args.lr, betas=(0.95, 0.99), eps=1e-6, weight_decay=args.weight_decay) elif args.opt == 'radam': optimizer = holocron.optim.RAdam(model_params, args.lr, betas=(0.95, 0.99), eps=1e-6, weight_decay=args.weight_decay) elif args.opt == 'ranger': optimizer = Lookahead( holocron.optim.RAdam(model_params, args.lr, betas=(0.95, 0.99), eps=1e-6, weight_decay=args.weight_decay)) elif args.opt == 'tadam': optimizer = holocron.optim.TAdam(model_params, args.lr, betas=(0.95, 0.99), eps=1e-6, weight_decay=args.weight_decay) trainer = BinaryClassificationTrainer(model, train_loader, val_loader, criterion, optimizer, args.device, args.output_file) if args.resume: print(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location='cpu') trainer.load(checkpoint) if args.test_only: print("Running evaluation") eval_metrics = trainer.evaluate() print( f"Validation loss: {eval_metrics['val_loss']:.4} " f"(Acc@1: {eval_metrics['acc1']:.2%}, Acc@5: {eval_metrics['acc5']:.2%})" ) return if args.lr_finder: print("Looking for optimal LR") trainer.lr_find(args.freeze_until) trainer.plot_recorder() return print("Start training") start_time = time.time() trainer.fit_n_epochs(args.epochs, args.lr, args.freeze_until) total_time_str = str( datetime.timedelta(seconds=int(time.time() - start_time))) print('Training time {}'.format(total_time_str))
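main expects an already-parsed args namespace; a hypothetical minimal invocation, with every field inferred from the attribute accesses above (names and values are illustrative, not part of the original script):

from types import SimpleNamespace

args = SimpleNamespace(
    data_path="data/", use_openfire=False, img_size=224, batch_size=32,
    workers=4, model="darknet19", pretrained=False, opt="adam", lr=3e-4,
    weight_decay=0.0, device="cuda", output_file="model.pth", resume="",
    test_only=False, lr_finder=False, freeze_until=None, epochs=10,
)
main(args)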
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str) parser.add_argument("--eval_file", default=None, type=str) parser.add_argument("--test_file", default=None, type=str) parser.add_argument("--model_name_or_path", default=None, type=str) parser.add_argument("--output_dir", default=None, type=str) ## other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=256, type=int) parser.add_argument("--do_train", default=False, type=boolean_string) parser.add_argument("--do_eval", default=False, type=boolean_string) parser.add_argument("--do_test", default=False, type=boolean_string) parser.add_argument("--train_batch_size", default=8, type=int) parser.add_argument("--eval_batch_size", default=8, type=int) parser.add_argument("--learning_rate", default=3e-5, type=float) parser.add_argument("--num_train_epochs", default=10, type=float) parser.add_argument("--warmup_proportion", default=0.1, type=float) parser.add_argument("--use_weight", default=1, type=int) parser.add_argument("--local_rank", type=int, default=-1) parser.add_argument("--seed", type=int, default=2019) parser.add_argument("--fp16", default=False, type=boolean_string) parser.add_argument("--loss_scale", type=float, default=0) parser.add_argument('--gradient_accumulation_steps', type=int, default=1) parser.add_argument("--warmup_steps", default=0, type=int) parser.add_argument("--adam_epsilon", default=1e-8, type=float) parser.add_argument("--max_steps", default=-1, type=int) parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--logging_steps", default=500, type=int) parser.add_argument("--clean", default=False, type=boolean_string, help="clean the output dir") parser.add_argument("--need_birnn", default=False, type=boolean_string) parser.add_argument("--rnn_dim", default=128, type=int) args = parser.parse_args() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ args.device = device n_gpu = torch.cuda.device_count() logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger.info(f"device: {device} n_gpu: {n_gpu}") if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) # now_time = datetime.datetime.now().strftime('%Y-%m-%d_%H') # tmp_dir = args.output_dir + '/' +str(now_time) + '_ernie' # if not os.path.exists(tmp_dir): # os.makedirs(tmp_dir) # args.output_dir = tmp_dir if args.clean and args.do_train: # logger.info("cleaning up") if os.path.exists(args.output_dir): def del_file(path): ls = os.listdir(path) for i in ls: c_path = os.path.join(path, i) print(c_path) if os.path.isdir(c_path): del_file(c_path) os.rmdir(c_path) else: os.remove(c_path) try: del_file(args.output_dir) except Exception as e: print(e) print('please remove the files of the output dir and data.conf') exit(-1) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not
empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if not os.path.exists(os.path.join(args.output_dir, "eval")): os.makedirs(os.path.join(args.output_dir, "eval")) writer = SummaryWriter(logdir=os.path.join(args.output_dir, "eval"), comment="Linear") processor = NerProcessor() label_list = processor.get_labels(args) num_labels = len(label_list) args.label_list = label_list if os.path.exists(os.path.join(args.output_dir, "label2id.pkl")): with open(os.path.join(args.output_dir, "label2id.pkl"), "rb") as f: label2id = pickle.load(f) else: label2id = {l: i for i, l in enumerate(label_list)} with open(os.path.join(args.output_dir, "label2id.pkl"), "wb") as f: pickle.dump(label2id, f) id2label = {value: key for key, value in label2id.items()} # Prepare optimizer and schedule (linear warmup and decay) if args.do_train: tokenizer = BertTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels) model = BERT_BiLSTM_CRF.from_pretrained(args.model_name_or_path, config=config, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) train_examples, train_features, train_data = get_Dataset(args, processor, tokenizer, mode="train") train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) if args.do_eval: eval_examples, eval_features, eval_data = get_Dataset(args, processor, tokenizer, mode="eval") if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_data)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Total optimization steps = %d", t_total) model.train() global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_f1 = 0.0 for ep in trange(int(args.num_train_epochs), desc="Epoch"): model.train() for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch outputs = model(input_ids, label_ids, segment_ids, input_mask) loss = outputs if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: tr_loss_avg = (tr_loss - logging_loss) / args.logging_steps writer.add_scalar("Train/loss", tr_loss_avg, global_step) logging_loss = tr_loss if args.do_eval: all_ori_tokens_eval = [f.ori_tokens for f in eval_features] overall, by_type = evaluate(args, eval_data, model, id2label, all_ori_tokens_eval) # add eval result to tensorboard f1_score = overall.fscore writer.add_scalar("Eval/precision", overall.prec, ep) writer.add_scalar("Eval/recall", overall.rec, ep) writer.add_scalar("Eval/f1_score", overall.fscore, ep) # save the best performs model if f1_score > best_f1: logger.info( f"----------the best f1 is {f1_score}---------") best_f1 = f1_score model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save( args, os.path.join(args.output_dir, 'training_args.bin')) # logger.info(f'epoch {ep}, train loss: {tr_loss}') # writer.add_graph(model) writer.close() # model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training # model_to_save.save_pretrained(args.output_dir) # tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model # torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) if args.do_test: # model = BertForTokenClassification.from_pretrained(args.output_dir) # model.to(device) label_map = {i: label for i, label in enumerate(label_list)} tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) args = torch.load(os.path.join(args.output_dir, 'training_args.bin')) model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim) model.to(device) test_examples, test_features, test_data = get_Dataset(args, processor, tokenizer, mode="test") logger.info("***** Running test *****") logger.info(f" Num examples = {len(test_examples)}") logger.info(f" Batch size = {args.eval_batch_size}") all_ori_tokens = [f.ori_tokens for f in test_features] all_ori_labels = [e.label.split(" ") for e in test_examples] test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() pred_labels = [] for b_i, (input_ids, input_mask, segment_ids, label_ids) in enumerate( tqdm(test_dataloader, desc="Predicting")): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model.predict(input_ids, segment_ids, input_mask) # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) # logits = logits.detach().cpu().numpy() for l in logits: pred_label = [] for idx in l: pred_label.append(id2label[idx]) pred_labels.append(pred_label) assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels) print(len(pred_labels)) with open(os.path.join(args.output_dir, "token_labels_.txt"), "w", encoding="utf-8") as f: for ori_tokens, ori_labels, prel 
in zip(all_ori_tokens, all_ori_labels, pred_labels): for ot, ol, pl in zip(ori_tokens, ori_labels, prel): if ot in ["[CLS]", "[SEP]"]: continue else: f.write(f"{ot} {ol} {pl}\n") f.write("\n")
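Several flags in the parser above pass type=boolean_string, which is not defined in this excerpt; the conventional definition is a strict true/false parser:

def boolean_string(s):
    # argparse `type=` callback: accept only the literal strings true/false.
    if s.lower() not in ("true", "false"):
        raise ValueError("Not a valid boolean string")
    return s.lower() == "true"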
def main(args=None): if args is None: args = model_utils.run_joint_three_models_get_local_args() print('#start:\t', args.learning_rate, args.train_batch_size, args.num_train_epochs) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError("If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError("If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) #---------------------------------------------- labels_list = ["nmod", "conj", "acl:cl", "acl", "nmod:poss", "advcl", "xcomp"] num_labels = len(labels_list) #---------------------------------------------- tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = read_many_examples(input_file=args.train_file, is_training=True) num_train_steps = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForSpanWithHeadwordWithLabel.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels) print(PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove the pooler, which is not used # and thus produces None grads that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers
import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, label_list=labels_list, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: # logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) # logger.info("***** Running training *****") # logger.info(" Num orig examples = %d", len(train_examples)) # logger.info(" Num split examples = %d", len(train_features)) # logger.info(" Batch size = %d", args.train_batch_size) # logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) all_headword_positions = torch.tensor([f.headword_position for f in train_features], dtype=torch.long) all_labels = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions, all_headword_positions, all_labels) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions, headword_positions, label_ids = batch print('headword#####', len(headword_positions), headword_positions) print('label_ids#####', len(label_ids), label_ids) loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, start_positions=start_positions, end_positions=end_positions, headword_positions=headword_positions, label_ids=label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * span_utils.warmup_linear(global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = BertForSpanWithHeadwordWithLabel.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_many_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, label_list=labels_list, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) # logger.info("***** Running predictions *****") # logger.info(" Num orig examples = %d", len(eval_examples)) # logger.info(" Num split examples = %d", len(eval_features)) # logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) #all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] # logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"): # if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) #label_ids = label_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits, batch_headword_logits, batch_label_logits \ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() headword_logits = batch_headword_logits[i].detach().cpu().tolist() label_logits = batch_label_logits[i].detach().cpu().tolist() #label_logits_outputs = np.argmax(label_logits, axis=1) #label_logits_outputs[0] eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits, headword_logits=headword_logits, label_logits=label_logits)) output_prediction_file 
= os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
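Both training loops in this file rescale the fp16 learning rate with warmup_linear (here via span_utils). For reference, the schedule as defined in pytorch-pretrained-bert:

def warmup_linear(x, warmup=0.002):
    # Linear warmup to 1.0 over the first `warmup` fraction of training,
    # then linear decay back towards 0 as x approaches 1.
    if x < warmup:
        return x / warmup
    return 1.0 - x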
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--input_file", default='data/train/raw_semEval_bert_emoji_conv.txt', type=str) parser.add_argument("--output_file", default='data/train/out_feat_bert_train.txt', type=str) parser.add_argument("--bert_model", default='bert-base-cased', type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") ## Other parameters parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences longer " "than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--batch_size", default=64, type=int, help="Batch size for predictions.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--cuda", action='store_true', default=True) parser.add_argument("--single_gpu", action='store_true', default=True) parser.add_argument("--gpu", type=int, default=0) args = parser.parse_args() if torch.cuda.is_available(): if not args.cuda: print("WARNING: You have a CUDA device, so you should probably run with --cuda") else: torch.cuda.set_device(args.gpu) device = args.gpu cudnn.benchmark = True cudnn.enabled = True # torch.cuda.manual_seed_all(args.evaluation_seed) else: print('CUDA NOT AVAILABLE!') device = 'cpu' # fall back to CPU so the .to(device) calls below still work time.sleep(20) layer_indexes = [int(x) for x in args.layers.split(",")] tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) examples = read_examples(args.input_file) features = convert_examples_to_features( examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) print('features ready...') unique_id_to_feature = {} for feature in features: unique_id_to_feature[feature.unique_id] = feature model = BertModel.from_pretrained(args.bert_model) # model.to(device) # gpu handling if args.cuda: if args.single_gpu: logger.info('USING SINGLE GPU!') model = model.cuda() else: model = torch.nn.DataParallel(model, dim=1).cuda() all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) model.eval() start_extract = time.time() sentences_time = [] with open(args.output_file, "w", encoding='utf-8') as writer: for input_ids, input_mask, example_indices in eval_dataloader: start_batch = time.time() input_ids = input_ids.to(device) input_mask = input_mask.to(device) # forward through the Bert model.
It returns the hidden outputs of all its layers all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) # print('size of the encoder layers: {}'.format(all_encoder_layers[0].size())) ''' for b, example_index in enumerate(example_indices): start_sentence = time.time() # taking the specific preprocessed sample object feature = features[example_index.item()] unique_id = int(feature.unique_id) # feature = unique_id_to_feature[unique_id] output_json = collections.OrderedDict() # saving the sentence unique id output_json["linex_index"] = unique_id layer_outputs = [all_encoder_layers[int(layer_index)].detach().cpu().numpy() for layer_index in layer_indexes] all_out_features = [] for (i, token) in enumerate(feature.tokens): all_layers = [] for (j, layer_index) in enumerate(layer_indexes): # taking the layer output of the chosen layer, # detaching it from the graph and converting it to a numpy array # detach_start = time.time() # layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy() layer_output = layer_outputs[j] # print('detach_time: {} seconds'.format(time.time() - detach_start)) # selecting the output of the specific sentence # (this is needed because we are using a batch size > 1) layer_output = layer_output[b] layers = collections.OrderedDict() layers["index"] = layer_index # 'i' is used to select the vector related to the ith token layers["values"] = [ round(x.item(), 6) for x in layer_output[i] ] all_layers.append(layers) out_features = collections.OrderedDict() out_features["token"] = token out_features["layers"] = all_layers all_out_features.append(out_features) # features representation for all the tokens in the sentence output_json["features"] = all_out_features end_sentence = (time.time() - start_sentence, len(feature.tokens)) sentences_time.append(end_sentence) # writer.write(json.dumps(output_json) + "\n") ''' print(len(all_encoder_layers)) print(all_encoder_layers[0].shape) all_encoder_layers = torch.stack(all_encoder_layers, dim=0) print('old shape: {}'.format(all_encoder_layers.shape)) all_encoder_layers = all_encoder_layers.permute(1, 2, 0, 3) print('permute shape: {}'.format(all_encoder_layers.shape)) all_encoder_layers = all_encoder_layers.contiguous().view(all_encoder_layers.shape[0], all_encoder_layers.shape[1], -1) print('new shape: {}'.format(all_encoder_layers.shape)) print('end batch, {} sec'.format(time.time() - start_batch)) print('extraction ended, time: {:5.2f} seconds \n'.format((time.time() - start_extract))) if sentences_time: # per-sentence timings are only populated while the block above is enabled avg_tok = 0 avg_time = 0 for i, (sent_time, tokens) in enumerate(sentences_time): print( 'extraction sentence {} with {} tokens, sentence time: {:5.2f} seconds, token time: {:5.4f} seconds'.format( i, tokens, sent_time, sent_time / tokens)) avg_tok += tokens avg_time += sent_time avg_time_tok = avg_time / avg_tok avg_tok = avg_tok / len(sentences_time) avg_time = avg_time / len(sentences_time) print('\n On average there are {:5.2f} tokens per sentence'.format(avg_tok)) print('\n On average time elapsed per sentence: {:5.4f} seconds'.format(avg_time)) print('\n On average time elapsed per token: {:5.4f} seconds'.format(avg_time_tok))
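The stack/permute/view sequence above concatenates every encoder layer along the hidden dimension, one vector per token; note it ignores layer_indexes. Restricting the concatenation to the requested layers would look like this (a sketch, reusing the per-batch list before it is stacked):

selected = [all_encoder_layers[i] for i in layer_indexes]  # each: [batch, seq, hidden]
token_features = torch.cat(selected, dim=-1)               # [batch, seq, hidden * len(layer_indexes)]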
def evaluate(args, model, tokenizer, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else ( args.task_name, ) eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else ( args.output_dir, ) results = {} for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: os.makedirs(eval_output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max( 1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler( eval_dataset) if args.local_rank == -1 else DistributedSampler( eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu eval if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3] } if args.model_type != 'distilbert': inputs['token_type_ids'] = batch[2] if args.model_type in [ 'bert', 'xlnet' ] else None # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs['labels'].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": preds = np.argmax(preds, axis=1) elif args.output_mode == "regression": preds = np.squeeze(preds) if args.task_name == "livedoor": processor = processors[args.task_name]() labels = processor.get_labels() result = compute_metrics(eval_task, preds, out_label_ids, labels) else: result = compute_metrics(eval_task, preds, out_label_ids) results.update(result) output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return results
def eval_model(args, device, model, data_pattern): all_predictions = [] raw_results = [] for data_path in glob(data_pattern): eval_dataset = NqDataset(args, data_path, is_training=False) eval_examples = eval_dataset.examples eval_features = eval_dataset.features logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) eval_sampler = SequentialSampler(eval_features) eval_dataloader = DataLoader(eval_features, sampler=eval_sampler, batch_size=args.predict_batch_size, collate_fn=batcher(device, is_training=False), num_workers=0) model.eval() part_results = [] logger.info("Start evaluating") for batch in eval_dataloader: if len(part_results) % 1000 == 0: logger.info("Processing example: %d" % (len(part_results))) with torch.no_grad(): batch_tok_start_logits, batch_tok_end_logits, batch_tok_ref_indexes, \ batch_para_logits, batch_para_ref_indexes, batch_doc_logits = \ model(batch.input_ids, batch.input_mask, batch.segment_ids, batch.st_mask, batch.st_index, (batch.edges_src, batch.edges_tgt, batch.edges_type, batch.edges_pos)) for i, unique_id in enumerate(batch.unique_ids): tok_start_logits = batch_tok_start_logits[i].detach().cpu( ).tolist() tok_end_logits = batch_tok_end_logits[i].detach().cpu().tolist( ) tok_ref_indexes = batch_tok_ref_indexes[i].detach().cpu( ).tolist() para_logits = batch_para_logits[i].detach().cpu().tolist() para_ref_indexes = batch_para_ref_indexes[i].detach().cpu( ).tolist() doc_logits = batch_doc_logits[i].detach().cpu().tolist() unique_id = int(unique_id) part_results.append( RawResult(unique_id=unique_id, tok_start_logits=tok_start_logits, tok_end_logits=tok_end_logits, tok_ref_indexes=tok_ref_indexes, para_logits=para_logits, para_ref_indexes=para_ref_indexes, doc_logits=doc_logits)) raw_results.extend(part_results) part_predictions, part_nbest_predictions = get_predictions( eval_examples, eval_features, part_results, args.n_best_size, args.max_answer_length) all_predictions += part_predictions import pickle raw_results_path = os.path.join(args.output_dir, "raw_results") logger.info("Writing Raw results to: {}".format(raw_results_path)) with open(raw_results_path, "wb") as writer: pickle.dump(raw_results, writer) final_predictions = collections.OrderedDict() final_predictions["predictions"] = all_predictions predictions_path = os.path.join(args.output_dir, "tmp_predictions") logger.info("Writing predictions to: {}".format(predictions_path)) with open(predictions_path, "w") as writer: writer.write(json.dumps(final_predictions, indent=4) + "\n") eval_results = nq_evaluate(gold_path=data_pattern, predictions_path=predictions_path, num_threads=16) logger.info("***** Eval results *****".format()) for key in sorted(eval_results.keys()): logger.info(" %s = %s", key, str(eval_results[key] * 100)) model.train() return eval_results["Long Answer F1"], eval_results["Short Answer F1"]
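RawResult is not defined in this excerpt; given the keyword arguments used above, it is presumably a namedtuple along these lines:

import collections

RawResult = collections.namedtuple(
    "RawResult",
    ["unique_id", "tok_start_logits", "tok_end_logits", "tok_ref_indexes",
     "para_logits", "para_ref_indexes", "doc_logits"],
)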
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--similar_tokens_to_embeddings", default="closest_word_to_embeddings_whole_dataset_biobert", type=str, required=False, help= "The .pickle file which stores the map {token_similar_to_seed: embedding}." ) parser.add_argument( "--data_dir", default="whole_dataset_biobert", type=str, required=False, help= "The directory storing the input_ids.pt, attention_masks.pt and paper_ids." ) parser.add_argument("--model_path", default=None, type=str, required=False, help="The path to the .bin transformer model.") parser.add_argument("--model_name", default="BertModel", type=str, help="The path to the .bin transformer model.") parser.add_argument( "--output_file", default="whole_dataset_biobert", type=str, required=True, help= "The directory storing the word embeddings of the tokens (as python dictionary {token : embedding}) in pickle format" ) parser.add_argument("--batch_size", default=4, type=int, help="The batch size to feed the model") parser.add_argument( "--top_k", default=20, type=int, help= "Only the <top_k> tokens in every abstract are used for measuring an abstracts similarity value." ) args = parser.parse_args() with open(f"similar_words/{args.similar_tokens_to_embeddings}.pickle", "rb") as f: similar_token_to_embedding = np.stack( list(pickle.load(f).values())[:-1] ) # np.stack is needed as the contents of the pickle file is funny, print and see input_ids = torch.load(f"inputs/{args.data_dir}/input_ids.pt") attention_masks = torch.load(f"inputs/{args.data_dir}/attention_masks.pt") with open(f"inputs/{args.data_dir}/paper_ids.pickle", "rb") as f: paper_ids = pickle.load(f) logger.info("%s", str(input_ids.shape)) if args.model_name == "BertForSequenceClassification": model = BertForSequenceClassification else: model = BertModel if args.model_path is None: logger.info( "no model_path has been provided so using 'bert-base-cased'") model = BertModel.from_pretrained("bert-base-cased") else: logger.info(f"loading model and config from {args.model_path}") configuration = BertConfig.from_json_file( f"{args.model_path}/config.json") model = model.from_pretrained(f"{args.model_path}/pytorch_model.bin", config=configuration) model.cuda() dataset = PaperAbstractDataset(paper_ids, input_ids, attention_masks) batch_size = args.batch_size dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size) device = torch.device("cuda") seed_val = 42 random.seed(seed_val) np.random.seed(seed_val) torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) # Measure the total training time for the whole run. total_t0 = time.time() t0 = time.time() logger.info("") logger.info('Forward pass...') model.eval() paper_ids_to_cosine_score = {} for step, batch in enumerate(dataloader): if step % 100 == 0: logger.info('======== Batch {:} / {:} ========'.format( step, len(dataloader))) logger.info( "Time to find embeddings for batches {} to {}: {:} (h:mm:ss)". 
format(max(0, step - 100), step, format_time(time.time() - t0))) t0 = time.time() # `batch` contains two pytorch tensors and 1 numpy array: # [0]: paper ids # [1]: input ids # [2]: attention masks paper_ids_np = np.array(batch[0], dtype=str) b_input_ids = batch[1].to(device) b_input_mask = batch[2].to(device) # in case there is "label" in the batch if len(batch) == 4: _ = batch[3].to(device) # embeddings, cls outputs = model(b_input_ids, attention_mask=b_input_mask) # if model is BertForSequenceClassification if args.model_name == "BertForSequenceClassification": cls, hidden_states = outputs embeddings, layers = hidden_states[0].detach().cpu( ), hidden_states[1].detach().cpu() del layers else: embeddings, cls = outputs # move everything to cpu to save GPU space b_input_ids_np = b_input_ids.cpu().numpy() b_input_mask_np = b_input_mask.cpu().numpy() embeddings_np = embeddings.detach().cpu().numpy() cls_np = cls.detach().cpu().numpy() del b_input_ids del b_input_mask del embeddings del cls torch.cuda.empty_cache() for batch_number in range(len(embeddings_np)): abstract_cosine_score = np.average( np.sort( cosine_similarity( embeddings_np[batch_number], similar_token_to_embedding))[:args.top_k]) paper_id = paper_ids_np[batch_number] paper_ids_to_cosine_score[paper_id] = abstract_cosine_score del b_input_ids_np del b_input_mask_np del embeddings_np del cls_np with open(f"document_scores/{args.output_file}.pickle", "wb") as f: pickle.dump(paper_ids_to_cosine_score, f, pickle.HIGHEST_PROTOCOL) logger.info( "Total time to complete the entire process: {:} (h:mm:ss)".format( format_time(time.time() - total_t0))) logger.info("\n") logger.info("Document similarity found!")
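format_time is used throughout this function but not defined here; a minimal sketch matching the '(h:mm:ss)' log format:

import datetime

def format_time(elapsed):
    # Round to whole seconds and render as h:mm:ss.
    return str(datetime.timedelta(seconds=int(round(elapsed))))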
def evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] start_time = timeit.default_timer() for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = {"input_ids": batch[0], "attention_mask": batch[1]} if args.model_type != "distilbert": inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] # XLM don't use segment_ids example_indices = batch[3] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [to_list(output[i]) for output in outputs] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) else: start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure predictions = compute_predictions_log_probs( examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, model.config.start_n_top, model.config.end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging, ) else: predictions = compute_predictions_logits( examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) return results
def evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler( dataset) if args.local_rank == -1 else DistributedSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) all_results = [] for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids } example_indices = batch[3] if args.model_type in ['xlnet', 'xlm']: inputs.update({'cls_index': batch[4], 'p_mask': batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) if args.model_type in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure result = RawResultExtended( unique_id=unique_id, start_top_log_probs=to_list(outputs[0][i]), start_top_index=to_list(outputs[1][i]), end_top_log_probs=to_list(outputs[2][i]), end_top_index=to_list(outputs[3][i]), cls_logits=to_list(outputs[4][i])) else: result = RawResult(unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i])) all_results.append(result) # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join( args.output_dir, "nbest_predictions_{}.json".format(prefix)) output_null_log_odds_file = os.path.join( args.output_dir, "null_odds_{}.json".format(prefix)) if args.model_type in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure write_predictions_extended( examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.predict_file, model.config.start_n_top, model.config.end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) # Evaluate with the official SQuAD script evaluate_options = EVAL_OPTS(data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file) results = evaluate_on_squad(evaluate_options) return results
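EVAL_OPTS and evaluate_on_squad come from the legacy SQuAD evaluation script that shipped alongside the old transformers examples; the conventional import there was:

from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad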
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
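# A minimal sketch of the relationship evaluate() above relies on: perplexity is the
# exponential of the mean per-step language-modeling loss. The numbers below are
# hypothetical and only illustrate the arithmetic, matching torch.exp(torch.tensor(eval_loss)).
import math

sample_step_losses = [2.31, 1.87, 2.05]                      # hypothetical mean LM loss per eval step
mean_loss = sum(sample_step_losses) / len(sample_step_losses)
perplexity = math.exp(mean_loss)                             # same value torch.exp would produce
print(round(perplexity, 2))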
def predict(pred_config, text_for_NER):
    # Relies on module-level globals set up elsewhere in this script: args, model,
    # device, and label_lst.
    logger.info(args)

    # Convert the input text to a TensorDataset
    pad_token_label_id = torch.nn.CrossEntropyLoss().ignore_index
    tokenizer = load_tokenizer(args)
    sentences = kss.split_sentences(text_for_NER)
    lines = read_input_file(sentences)
    print(lines)
    dataset = convert_input_file_to_tensor_dataset(lines, pred_config, args, tokenizer, pad_token_label_id)

    # Predict
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=pred_config.batch_size)

    all_slot_label_mask = None
    preds = None

    for batch in tqdm(data_loader, desc="Predicting"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None}
            if args.model_type != "distilkobert":
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            logits = outputs[0]

            if preds is None:
                preds = logits.detach().cpu().numpy()
                all_slot_label_mask = batch[3].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                all_slot_label_mask = np.append(all_slot_label_mask, batch[3].detach().cpu().numpy(), axis=0)

    preds = np.argmax(preds, axis=2)
    slot_label_map = {i: label for i, label in enumerate(label_lst)}
    preds_list = [[] for _ in range(preds.shape[0])]

    for i in range(preds.shape[0]):
        for j in range(preds.shape[1]):
            if all_slot_label_mask[i, j] != pad_token_label_id:
                preds_list[i].append(slot_label_map[preds[i][j]])

    # Collect predicted product (ORG/AFW) and location (LOC) entities
    prod = location = ""
    for words, word_preds in zip(lines, preds_list):  # renamed from "preds" to avoid shadowing the array above
        for word, pred in zip(words, word_preds):
            if pred in ("ORG-B", "ORG-I", "AFW-B", "AFW-I"):  # "AFB-I" in the original was presumably a typo for "AFW-I"
                prod = prod + word + " "
            elif pred in ("LOC-B", "LOC-I"):
                location = location + word + " "

    logger.info("Prediction Done!")
    return prod, location
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        fact_token_ids, fact_embedding_ids = zip(*[get_inputs(seq, mask) for seq, mask, genre in examples])
        seqs = [seq for seq, mask, genre in examples]
        pad_seqs = pad_sequence(seqs, batch_first=True, padding_value=tokenizer.pad_token_id)
        pad_facts = pad_sequence(fact_token_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        pad_factsembeds = pad_sequence(fact_embedding_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        return list(zip(pad_facts, pad_factsembeds, pad_seqs))

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        if args.mlm:
            inputs, labels = mask_tokens(batch, tokenizer, args)
            with torch.no_grad():
                outputs = model(inputs, masked_lm_labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        elif args.xlnet:
            with torch.no_grad():
                pad_facts, pad_factsembeds, pad_seqs = zip(*batch)
                tfacts = torch.stack(pad_facts).to(args.device)
                tfact_embeds = torch.stack(pad_factsembeds).to(args.device)
                facts_padding_masks = torch.where(tfacts == tokenizer.pad_token_id,
                                                  torch.ones_like(tfacts), torch.zeros_like(tfacts)).to(args.device)
                tseqs = torch.stack(pad_seqs).to(args.device)
                tseqs_padding_masks = torch.where(tseqs == tokenizer.pad_token_id,
                                                  torch.ones_like(tseqs), torch.zeros_like(tseqs)).to(args.device)
                perm_masks = get_perm_masks(torch.zeros_like(tseqs), order="L2R")
                target_mapping = get_target_mapping(torch.zeros_like(tseqs), device=args.device)
                outputs = model(input_ids=tseqs, facts_tokens=tfacts, facts_embeds=tfact_embeds,
                                input_mask=tseqs_padding_masks, facts_input_mask=facts_padding_masks,
                                perm_mask=perm_masks, target_mapping=target_mapping)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        else:
            inputs, labels = (batch, batch)
            with torch.no_grad():
                # args.mlm is known to be False on this branch, so the masked-LM call
                # the original conditionally selected here was dead code
                outputs = model(inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    logger.info(f"validation loss value at step is {eval_loss}")
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
def evaluate(args, model, tokenizer, criterion, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_examples(args, tokenizer, evaluate=True)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn)

    # multi-gpu eval
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)  # the original moved the batch to the device twice

        with torch.no_grad():
            labels = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
            }
            outputs = model(**inputs)
            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
            tmp_eval_loss = criterion(logits, labels)
            eval_loss += tmp_eval_loss.mean().item()

        nb_eval_steps += 1
        if preds is None:
            preds = torch.sigmoid(logits).detach().cpu().numpy() > 0.5
            out_label_ids = labels.detach().cpu().numpy()
        else:
            preds = np.append(preds, torch.sigmoid(logits).detach().cpu().numpy() > 0.5, axis=0)
            out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    result = {
        "loss": eval_loss,
        "macro_f1": f1_score(out_label_ids, preds, average="macro"),
        "micro_f1": f1_score(out_label_ids, preds, average="micro"),
    }

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
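# The evaluate() above does multi-label prediction: one sigmoid per class, thresholded
# at 0.5, then macro/micro F1 via sklearn. A self-contained illustration of that
# thresholding (the arrays here are made-up examples, not outputs of the model above):
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0, 1], [0, 1, 0]])       # gold multi-label indicators
probs = np.array([[0.9, 0.2, 0.6], [0.4, 0.8, 0.1]])  # per-class sigmoid outputs
y_pred = probs > 0.5                             # same thresholding as in evaluate()
print(f1_score(y_true, y_pred, average="macro"), f1_score(y_true, y_pred, average="micro"))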
def predict(args, model, tokenizer, prefix=""):
    metric = SpanEntityScore(args.id2label)  # only used by the commented-out scoring block below
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)

    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    print(len(test_dataset))

    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)

    # Eval!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)

    f_results = []
    output_predict_file = os.path.join(pred_output_dir, prefix, "span_test_predict.json")
    # pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")

    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": None, "end_positions": None}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        start_logits, end_logits = outputs[:2]
        R = bert_extract_item(start_logits, end_logits)
        if R:
            label_entities = [[args.id2label[x[0]], x[1], x[2]] for x in R]
        else:
            label_entities = []
        json_d = {}
        json_d['id'] = step
        json_d['entities'] = label_entities
        f_results.append(json_d)

    logger.info("\n")
    with open(output_predict_file, "w") as writer:
        for record in f_results:
            writer.write(json.dumps(record) + '\n')

    # eval_loss = eval_loss / nb_eval_steps
    # test_info, entity_info = metric.result()
    # results = {f'{key}': value for key, value in test_info.items()}
    # results['loss'] = eval_loss
    # logger.info("***** Test results %s *****", prefix)
    # info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()])
    # logger.info(info)
    # logger.info("***** Entity results %s *****", prefix)
    # for key in sorted(entity_info.keys()):
    #     logger.info("******* %s results ********" % key)
    #     info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()])
    #     logger.info(info)
    # pbar(step)

    if args.task_name == "cluener":
        output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json")
        test_text = []
        with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, f_results):  # "results" in the original was undefined; f_results is meant
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag, start, end = subject[0], subject[1], subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
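# bert_extract_item is called above but not defined in this file. A minimal sketch of
# the start/end span decoding such helpers usually perform in span-based NER repos:
# argmax the label per position from the start logits, then pair each non-O start with
# the nearest position whose end label matches. This is an assumption about its
# behavior (batch size 1, label 0 = "O"), not the verbatim helper.
import torch

def extract_spans_sketch(start_logits, end_logits):
    """Return [(label_id, start_idx, end_idx), ...] for one sequence."""
    start_pred = torch.argmax(start_logits, -1).cpu().numpy()[0][1:-1]  # drop [CLS]/[SEP]
    end_pred = torch.argmax(end_logits, -1).cpu().numpy()[0][1:-1]
    spans = []
    for i, s_label in enumerate(start_pred):
        if s_label == 0:  # 0 assumed to mean "O" (non-entity)
            continue
        for j, e_label in enumerate(end_pred[i:]):
            if s_label == e_label:  # first end position with a matching label closes the span
                spans.append((s_label, i, i + j))
                break
    return spans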
def mask_eval_model(processor, data_dir, label_list, max_seq_length, eval_batch_size,
                    tokenizer, device, model, maskedLMModel, mask_eval_outfile):
    eval_examples = processor.get_dev_examples(data_dir)
    get_eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length,
                                                     tokenizer, mask_exp=True, model=maskedLMModel)
    logger.info("***** Running mask evaluation *****")

    # Process the examples in batches because there are too many to do at once.
    eval_features = next(get_eval_features)
    json_data = {"inputs": [], "labels": [], "maskdata": [], "logits": []}
    cm_orig = [[0 for i in range(len(label_list))] for j in range(len(label_list))]
    cm_ex = [[0 for i in range(len(label_list))] for j in range(len(label_list))]

    while eval_features is not None and len(eval_features) > 0:
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Num inputs = %d", len(eval_features))  # differs from the example count at the yield limit
        logger.info("  Batch size = %d", eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_ex_data = torch.tensor([f.ex_data for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_ex_data)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

        model.eval()
        for input_ids, input_mask, segment_ids, label_ids, ex_data in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            # Convert data to human-readable JSON for storage.
            for didx in range(input_ids.shape[0]):
                json_data["inputs"].append(tokenizer.convert_ids_to_tokens(input_ids[didx].detach().cpu().numpy()))
                json_data["labels"].append(label_ids[didx].tolist())
                json_data["maskdata"].append(ex_data[didx].detach().cpu().numpy().tolist())
                json_data["logits"].append(logits[didx].tolist())
                pred = int(np.argmax(logits[didx]))
                if json_data["maskdata"][-1][1] == -1:  # source sentence
                    cm_orig[label_ids[didx]][pred] += 1
                cm_ex[label_ids[didx]][pred] += 1  # all examples

        # Get the next batch of examples to process.
        try:
            eval_features = next(get_eval_features)
        except StopIteration:
            break  # done getting new features

    logger.info("***** Mask eval cm_orig: " + str(cm_orig) + " *****")
    logger.info("***** Mask eval cm_ex: " + str(cm_ex) + " *****")

    # Write json data to file.
    with open(mask_eval_outfile, "w") as f:  # os.path.join() with a single argument was a no-op
        json.dump(json_data, f)
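# The confusion matrices logged above are plain nested lists (rows = gold label,
# columns = predicted label). A small helper like the one below, which is an
# illustration and not part of the script above, turns one into an overall accuracy:
def cm_accuracy(cm):
    total = sum(sum(row) for row in cm)
    correct = sum(cm[i][i] for i in range(len(cm)))
    return correct / total if total else 0.0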
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)  # if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    correct = 0
    pred_ids = []
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1

        _, preds = torch.max(logits, dim=-1)
        correct += (preds == inputs["labels"]).cpu().sum().item()
        pred_ids.extend(preds.cpu().tolist())

    eval_loss = eval_loss / nb_eval_steps
    label_map = {i: label for i, label in enumerate(labels)}
    predictions = [label_map[pred_id] for pred_id in pred_ids]

    results = {"acc": 100 * correct / len(eval_dataset)}
    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, predictions
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type: e.g. roberta")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model: e.g. roberta-base")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path to trained model: Should contain the .bin files")

    ## Other parameters
    parser.add_argument("--train_filename", default=None, type=str,
                        help="The train filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--dev_filename", default=None, type=str,
                        help="The dev filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--test_filename", default=None, type=str,
                        help="The test filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--max_source_length", default=64, type=int,
                        help="The maximum total source sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--max_target_length", default=32, type=int,
                        help="The maximum total target sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true', help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available")
    parser.add_argument("--train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--beam_size", default=10, type=int, help="beam size for beam search")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")

    # print arguments
    args = parser.parse_args()
    logger.info(args)

    # Setup CUDA, GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Set seed
    set_seed(args.seed)

    # make dir if output_dir does not exist
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name)

    # build model
    encoder = model_class.from_pretrained(args.model_name_or_path, config=config)
    decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
    model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,
                    beam_size=args.beam_size, max_length=args.max_target_length,
                    sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)
    if args.load_model_path is not None:
        logger.info("reload model from {}".format(args.load_model_path))
        model.load_state_dict(torch.load(args.load_model_path))
    model.to(device)

    if args.n_gpu > 1:
        # multi-gpu training
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Prepare training data loader
        train_examples = read_examples(args.train_filename)
        train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
        train_data = TextDataset(train_features, args)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps,
                                      num_workers=4)

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=len(train_dataloader) * args.num_train_epochs * 0.1,
            num_training_steps=len(train_dataloader) * args.num_train_epochs)

        # Start training
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num epoch = %d", args.num_train_epochs)

        model.train()
        dev_dataset = {}
        nb_tr_examples, nb_tr_steps, tr_loss, global_step, best_bleu, best_loss = 0, 0, 0, 0, 0, 1e6
        for epoch in range(args.num_train_epochs):
            bar = tqdm(train_dataloader, total=len(train_dataloader))
            for batch in bar:
                batch = tuple(t.to(device) for t in batch)
                source_ids, source_mask, position_idx, att_mask, target_ids, target_mask = batch
                loss, _, _ = model(source_ids, source_mask, position_idx, att_mask, target_ids, target_mask)

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                tr_loss += loss.item()
                train_loss = round(tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
                bar.set_description("epoch {} loss {}".format(epoch, train_loss))
                nb_tr_examples += source_ids.size(0)
                nb_tr_steps += 1
                loss.backward()

                if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                    # Update parameters
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    global_step += 1

            if args.do_eval and epoch in [int(args.num_train_epochs * (i + 1) // 20) for i in range(20)]:
                # Eval model with dev dataset
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                if 'dev_loss' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_loss']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
                    eval_data = TextDataset(eval_features, args)
                    dev_dataset['dev_loss'] = eval_examples, eval_data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                             batch_size=args.eval_batch_size, num_workers=4)

                logger.info("\n***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                # Start evaluating the model
                model.eval()
                eval_loss, tokens_num = 0, 0
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask, position_idx, att_mask, target_ids, target_mask = batch
                    with torch.no_grad():
                        _, loss, num = model(source_ids, source_mask, position_idx, att_mask, target_ids, target_mask)
                    eval_loss += loss.sum().item()
                    tokens_num += num.sum().item()

                # Print loss of dev dataset
                model.train()
                eval_loss = eval_loss / tokens_num
                result = {'eval_ppl': round(np.exp(eval_loss), 5),
                          'global_step': global_step + 1,
                          'train_loss': round(train_loss, 5)}
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                logger.info("  " + "*" * 20)

                # save last checkpoint
                last_output_dir = os.path.join(args.output_dir, 'checkpoint-last')
                if not os.path.exists(last_output_dir):
                    os.makedirs(last_output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(last_output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)

                if eval_loss < best_loss:
                    logger.info("  Best ppl:%s", round(np.exp(eval_loss), 5))
                    logger.info("  " + "*" * 20)
                    best_loss = eval_loss
                    # Save best checkpoint for best ppl
                    output_dir = os.path.join(args.output_dir, 'checkpoint-best-ppl')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

                # Calculate bleu
                if 'dev_bleu' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_bleu']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_examples = random.sample(eval_examples, min(1000, len(eval_examples)))
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
                    eval_data = TextDataset(eval_features, args)
                    dev_dataset['dev_bleu'] = eval_examples, eval_data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                             batch_size=args.eval_batch_size, num_workers=4)

                model.eval()
                p = []
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask, position_idx, att_mask, target_ids, target_mask = batch
                    with torch.no_grad():
                        preds = model(source_ids, source_mask, position_idx, att_mask)
                        for pred in preds:
                            t = pred[0].cpu().numpy()
                            t = list(t)
                            if 0 in t:
                                t = t[:t.index(0)]
                            text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                            p.append(text)
                model.train()

                predictions = []
                accs = []
                with open(os.path.join(args.output_dir, "dev.output"), 'w') as f, \
                        open(os.path.join(args.output_dir, "dev.gold"), 'w') as f1:
                    for ref, gold in zip(p, eval_examples):
                        predictions.append(ref)
                        f.write(ref + '\n')
                        f1.write(gold.target + '\n')
                        accs.append(ref == gold.target)
                dev_bleu = round(_bleu(os.path.join(args.output_dir, "dev.gold"),
                                       os.path.join(args.output_dir, "dev.output")), 2)
                xmatch = round(np.mean(accs) * 100, 4)
                logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
                logger.info("  %s = %s " % ("xMatch", str(xmatch)))
                logger.info("  " + "*" * 20)

                if dev_bleu + xmatch > best_bleu:
                    logger.info("  Best BLEU+xMatch:%s", dev_bleu + xmatch)
                    logger.info("  " + "*" * 20)
                    best_bleu = dev_bleu + xmatch
                    # Save best checkpoint for best bleu
                    output_dir = os.path.join(args.output_dir, 'checkpoint-best-bleu')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_test:
        files = []
        if args.dev_filename is not None:
            files.append(args.dev_filename)
        if args.test_filename is not None:
            files.append(args.test_filename)
        for idx, file in enumerate(files):
            logger.info("Test file: {}".format(file))
            eval_examples = read_examples(file)
            eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
            eval_data = TextDataset(eval_features, args)

            # Calculate bleu
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                         batch_size=args.eval_batch_size, num_workers=4)

            model.eval()
            p = []
            for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
                batch = tuple(t.to(device) for t in batch)
                source_ids, source_mask, position_idx, att_mask, target_ids, target_mask = batch
                with torch.no_grad():
                    preds = model(source_ids, source_mask, position_idx, att_mask)
                    for pred in preds:
                        t = pred[0].cpu().numpy()
                        t = list(t)
                        if 0 in t:
                            t = t[:t.index(0)]
                        text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                        p.append(text)
            model.train()

            predictions = []
            accs = []
            with open(os.path.join(args.output_dir, "test_{}.output".format(str(idx))), 'w') as f, \
                    open(os.path.join(args.output_dir, "test_{}.gold".format(str(idx))), 'w') as f1:
                for ref, gold in zip(p, eval_examples):
                    predictions.append(ref)
                    f.write(ref + '\n')
                    f1.write(gold.target + '\n')
                    accs.append(ref == gold.target)
            # the stray ".format(file)" calls the original applied to these paths were no-ops and have been dropped
            dev_bleu = round(_bleu(os.path.join(args.output_dir, "test_{}.gold".format(str(idx))),
                                   os.path.join(args.output_dir, "test_{}.output".format(str(idx)))), 2)
            logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
            logger.info("  %s = %s " % ("xMatch", str(round(np.mean(accs) * 100, 4))))
            logger.info("  " + "*" * 20)
def evaluate(args, model, tokenizer, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_task_names = (args.task_name, ) eval_outputs_dirs = (args.output_dir, ) results = {} for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: os.makedirs(eval_output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max( 1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu eval if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 scores = None out_label_ids = None for batch in eval_dataloader: model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if scores is None: scores = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() else: scores = np.append(scores, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": preds = np.argmax(scores, axis=1) elif args.output_mode == "regression": preds = np.squeeze(scores) # result = compute_metrics(eval_task, preds, out_label_ids) acc = simple_accuracy(preds, out_label_ids) if args.save_preds: pred_file = os.path.join(eval_output_dir, prefix, "eval_preds.txt") with open(pred_file, "w") as f: writer = csv.writer(f) writer.writerow(preds) score_file = os.path.join(eval_output_dir, prefix, "eval_scores.txt") with open(score_file, "w") as f: writer = csv.writer(f) writer.writerow(scores) result = {"eval_acc": acc, "eval_loss": eval_loss} results.update(result) output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return results