def train(model, args, n_gpu, optimizer, num_train_optimization_steps, num_labels, train_dataloader, device):
    ''' train model '''
    model.train()
    global global_step
    global nb_tr_steps
    global tr_loss
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            # print("input_ids_shape", input_ids.shape)
            # print("label_ids_shape", label_ids.shape)
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

    ## save model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    with open(output_config_file, 'w') as f:
        f.write(model_to_save.config.to_json_string())

    # Load a trained model and config that you have fine-tuned
    config = BertConfig(output_config_file)
    model = BertForDocMultiClassification(config, num_labels=num_labels)
    model.load_state_dict(torch.load(output_model_file))
    return model
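# The training loops in this file call warmup_linear() without defining it. Below is a
# minimal reference sketch of the helper shipped with older pytorch-pretrained-bert
# releases; the exact post-warmup decay changed between versions (early releases used
# `1.0 - x`), so treat this as an assumption rather than the definitive implementation.
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed (global_step / num_train_optimization_steps):
    # ramp the multiplier up linearly until `warmup`, then decay it linearly towards zero.
    if x < warmup:
        return x / warmup
    return max((x - 1.0) / (warmup - 1.0), 0.0)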
def init_tri_model(model, args, n_gpu, train_dataloader, device, num_epoch, logger):
    ''' the initial process of training '''
    logger.info("***** Initial optimizer *****")
    optimizer, num_train_optimization_steps = init_optimizer(model, args, train_dataloader)
    # optimizer1, optimizer2, optimizer3, num_train_optimization_steps = init_tri_model_optimizer(model, args, train_dataloader)
    logger.info("***** Running train *****")
    model.train()
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    gamma = 0.01
    for _ in trange(int(num_epoch), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss1, loss2, loss3, orth_loss = model(input_ids, segment_ids, input_mask, label_ids)
            loss = loss1 + loss2 + loss3 + gamma * orth_loss
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
        print("loss", loss.item())
    return model
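# init_tri_model() assumes the model returns three task losses plus an orthogonality
# penalty, combined as loss1 + loss2 + loss3 + gamma * orth_loss. The model itself is not
# shown here; the sketch below illustrates one common way such a penalty is computed
# (Frobenius norm of pairwise weight overlaps) and is only an assumption about what
# orth_loss could look like, not the project's implementation.
import torch

def orthogonality_penalty(w1, w2, w3):
    # w1, w2, w3: classifier weight matrices of shape [num_labels_i, hidden_size].
    # Penalise the overlap W_i @ W_j^T so the three heads use different subspaces.
    loss = 0.0
    for a, b in [(w1, w2), (w1, w3), (w2, w3)]:
        loss = loss + torch.norm(a @ b.t(), p='fro') ** 2
    return loss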
def run_epoch(iter):
    losses = []
    i = 0
    dh_model.zero_grad()
    num_train_optimization_steps = int(len(trX) / float(args.n_batch) / 8.0 * args.n_iter)
    pred = []
    gold = []
    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trY, random_state=np.random),
                                   n_batch=n_batch_train, truncate=True, verbose=True):
        global n_updates
        dh_model.train()
        XMB = torch.tensor(xmb, dtype=torch.long).to(device)
        YMB = torch.tensor(ymb, dtype=torch.long).to(device)
        MMB = torch.tensor(mmb, dtype=torch.long).to(device)
        outputs = dh_model(XMB, attention_mask=MMB, next_sentence_label=YMB)
        loss, scores = outputs[:2]
        scores = torch.argmax(torch.nn.functional.softmax(scores), dim=1).tolist()
        YMB = YMB.tolist()
        pred.extend(scores)
        gold.extend(YMB)
        loss = loss / 8.0
        losses.append(float(loss.item()))
        loss.backward()
        if (n_updates + 1) % 8 == 0:
            print('update')
            lr_this_step = args.lr * warmup_linear(n_updates / float(num_train_optimization_steps), .1)
            for param_group in model_opt.param_groups:
                param_group['lr'] = lr_this_step
            model_opt.step()
            model_opt.zero_grad()
        n_updates += 1
        if (n_updates + 1) % 10000 == 0:
            log(save_dir, desc, iter, save='_' + str(n_updates))
        log_value('batch_train_loss', loss, n_updates)
        log_value('mean_train_loss', np.mean(losses), n_updates)
        log_value('total_train_loss', np.sum(losses), n_updates)
    precision = precision_score(gold, pred)
    recall = recall_score(gold, pred)
    f1 = f1_score(gold, pred)
    acc = np.mean([gold[i] == pred[i] for i in range(len(pred))])
    print(precision)
    print(recall)
    print(f1)
    print(acc)
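# run_epoch() iterates with iter_data()/shuffle() in the style of the OpenAI
# finetune-transformer-lm utilities (shuffle here is sklearn.utils.shuffle). A minimal
# stand-in with the same call shape is sketched below; names and behaviour are assumed,
# this is not the original helper, and `verbose` (progress display) is ignored.
def iter_data(*arrays, n_batch=32, truncate=False, verbose=False):
    # Yield aligned minibatch slices of every input array; drop the trailing
    # partial batch when truncate=True.
    n = len(arrays[0])
    end = (n // n_batch) * n_batch if truncate else n
    for start in range(0, end, n_batch):
        yield tuple(a[start:start + n_batch] for a in arrays)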
def on_backward_begin(self, last_loss, **kwargs):
    ''' returns loss, skip_backward '''
    loss = last_loss
    if self.gradient_accumulation_steps > 1:
        loss /= self.gradient_accumulation_steps
    if self.fp16:
        # the fp16 optimizer scales the loss and runs backward itself, so the regular
        # backward pass is skipped via 'skip_bwd' below
        self.learn.opt.backward(loss)
        # modify learning rate with special BERT warm up
        # (when fp16 is off, BertAdam handles warmup internally)
        lr_this_step = self.learn.opt.get_lr() * warmup_linear(
            self.global_step / self.train_opt_steps, self.warmup_proportion)
        for param_group in self.learn.opt.param_groups:
            param_group['lr'] = lr_this_step
    self.global_step += 1
    return {'last_loss': loss, 'skip_bwd': self.fp16}
def fit(num_epocs=args['num_train_epochs']):
    global_step = 0
    model.train()
    for i_ in tqdm(range(int(num_epocs)), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']
            if args['fp16']:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                # scheduler.batch_step()
                # modify learning rate with special warm up BERT uses
                lr_this_step = args['learning_rate'] * warmup_linear(
                    global_step / t_total, args['warmup_proportion'])
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
        logger.info('Loss after epoc {}'.format(tr_loss / nb_tr_steps))
        logger.info('Eval after epoc {}'.format(i_ + 1))
        eval()
def update_learning_rate(args, config, current_global_step, optimizer):
    global last_global_step_from_restore
    global_step_for_lr = current_global_step - last_global_step_from_restore

    if args.lr_schedule == "EE":
        # print(f'LR Schedule is {args.lr_schedule} EE')
        lr_this_step = config["training"]["learning_rate"] * warmup_exp_decay_exp(
            global_step_for_lr, config["training"]["decay_rate"],
            config["training"]["decay_step"],
            config["training"]["total_training_steps"],
            config["training"]["warmup_proportion"])
    elif args.lr_schedule == "EP":
        print(f'LR Schedule is {args.lr_schedule} EP')
        lr_this_step = config["training"]["learning_rate"] * warmup_exp_decay_poly(
            global_step_for_lr,
            config["training"]["total_training_steps"],
            config["training"]["warmup_proportion"])
    elif args.lr_schedule == "LN":
        print(f'LR Schedule is {args.lr_schedule} LN')
        lr_this_step = config["training"]["learning_rate"] * warmup_linear(
            global_step_for_lr / config["training"]["total_training_steps"],
            warmup=config["training"]["warmup_proportion"])
    else:
        lr_this_step = config["training"]["learning_rate"] * warmup_linear_decay_exp(
            global_step_for_lr, config["training"]["decay_rate"],
            config["training"]["decay_step"],
            config["training"]["total_training_steps"],
            config["training"]["warmup_proportion"])
    lr_this_step += args.lr_offset

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_this_step

    return lr_this_step
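# update_learning_rate() dispatches to schedule helpers (warmup_exp_decay_exp,
# warmup_exp_decay_poly, warmup_linear_decay_exp) that are defined elsewhere in the
# repository and are not reproduced here. As a rough illustration only -- the real
# helpers may differ -- a linear warmup followed by exponential decay could look like
# the hypothetical sketch below.
def warmup_linear_decay_exp_sketch(global_step, decay_rate, decay_steps, total_steps, warmup=0.002):
    x = global_step / total_steps
    if x < warmup:
        return x / warmup
    # after warmup, decay the multiplier by `decay_rate` every `decay_steps` updates
    return decay_rate ** ((global_step - warmup * total_steps) / decay_steps)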
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--test_task", default='taska', type=str, help="The task to output.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--test_data", default=None, type=str, help="The test input for predictions.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=80, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=128, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=4, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() processor = SemevalProcessor() output_mode = "multi_classification" label_list = processor.get_labels() if (output_mode == "multi_classification"): num_labels = [] for l in label_list: num_labels.append(len(l)) num_labels[1] -= 1 num_labels[2] -= 1 else: num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if (output_mode == "classification" or output_mode == "multi_classification"): all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif (output_mode == "regression"): all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask) if (output_mode == "multi_classification"): loss_fct = CrossEntropyLoss() loss = loss_fct(logits[0].view(-1, num_labels[0]), label_ids[:, 0].view(-1)) loss += 16 * loss_fct(logits[1].view( -1, num_labels[1]), label_ids[:, 1].view(-1)) * ( label_ids[:, 0].float().view(-1)).mean() loss += 16 * loss_fct(logits[2].view( -1, num_labels[2]), label_ids[:, 2].view(-1)) * ( label_ids[:, 1].float().view(-1)).mean() print(loss) elif output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if (output_mode == "classification" or output_mode == "multi_classification"): all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif (output_mode == "regression"): all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] if (output_mode == "multi_classification"): for i in range(len(label_list)): preds.append([]) for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) # create eval loss and other metric required by the task if (output_mode == "multi_classification"): loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits[0].view(-1, num_labels[0]), label_ids[:, 0].view(-1)) tmp_eval_loss += 16 * loss_fct( logits[1].view(-1, num_labels[1]), label_ids[:, 1].view( -1)) * (label_ids[:, 
0].float().view(-1)).mean() tmp_eval_loss += 16 * loss_fct( logits[2].view(-1, num_labels[2]), label_ids[:, 2].view( -1)) * (label_ids[:, 1].float().view(-1)).mean() elif output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if (output_mode == "multi_classification"): for i, logit in enumerate(logits): if (len(preds[i]) == 0): preds[i].append(logit.detach().cpu().numpy()) else: preds[i][0] = np.append(preds[i][0], logit.detach().cpu().numpy(), axis=0) else: if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if (output_mode == "multi_classification"): pass else: if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_test_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_index = torch.tensor([f.guid for f in eval_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) test_data = TensorDataset(all_index, all_input_ids, all_input_mask, all_segment_ids) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() nb_eval_steps = 0 preds = [] if (output_mode == "multi_classification"): preds = [[], [], []] total_index = [] for index, input_ids, input_mask, segment_ids in tqdm( test_dataloader, desc="Evaluating"): total_index.extend(index.tolist()) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) if (output_mode == "multi_classification"): for i, logit in enumerate(logits): if (len(preds[i]) == 0): preds[i].append(logit.detach().cpu().numpy()) else: preds[i][0] = np.append(preds[i][0], logit.detach().cpu().numpy(), axis=0) else: if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) if (output_mode == "multi_classification"): if (args.test_task == 'taska'): preds = preds[0][0] label_map = ['NOT', 'OFF'] elif (args.test_task == 'taskb'): preds 
= preds[1][0] label_map = ['UNT', 'TIN'] elif (args.test_task == 'taskc'): preds = preds[2][0] label_map = ['OTH', 'GRP', 'IND'] preds = np.argmax(preds, axis=-1) else: preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) label_map = processor.get_labels() output_test_file = os.path.join(args.output_dir, "{0}_preds.txt".format(args.test_task)) with open(output_test_file, "w") as writer: logger.info("***** test results *****") writer.write("Id,Category\n") for i in range(preds.shape[0]): writer.write("{0},{1}\n".format(total_index[i], label_map[preds[i]]))
def train(args): # initial device and gpu number if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') print( "[Train] device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # get train example train_examples = None num_train_optimization_steps = None print('[Train] Load data...') train_examples = torch.load(os.path.join(args.data_path, 'train.pt')) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) print('[Train] Finish loading data...') # Prepare model print('[Train] Initial model...') if args.bert_config: model_file = os.path.join(args.bert_config, WEIGHTS_NAME) config_file = os.path.join(args.bert_config, CONFIG_NAME) print('[Train] Load model...') config = BertConfig.from_json_file(config_file) model = Summarizer(args, device, load_pretrained_bert=False, bert_config=config) # load check points model.load_cp(torch.load(model_file)) else: model = Summarizer(args, device, load_pretrained_bert=True) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # prepare training data train_features = convert_examples_to_features(train_examples) print("[Train] ***** Running training *****") print(f"[Train] Num examples = {len(train_examples)}") print(f"[Train] Batch size = {args.train_batch_size}") print(f"[Train] Num steps = {num_train_optimization_steps}") train_data = get_dataset(train_features) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # prepare evaluation data eval_dataloader = None eval_path = os.path.join(args.data_path, 'dev.pt') if os.path.isfile(eval_path): print(f"[Train] ***** Prepare evaluation data from {eval_path} *****") eval_examples = torch.load(eval_path) eval_features = convert_examples_to_features(eval_examples) eval_data = get_dataset(eval_features) print(f"[Train] Num examples = {len(eval_examples)}") # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) global_step = 0 tr_loss = 0 loss_f = torch.nn.BCELoss(reduction='none') best_performance = 1000000 best_epoch = 0 output_model_file = os.path.join(args.model_path, WEIGHTS_NAME) output_config_file = os.path.join(args.model_path, CONFIG_NAME) # begin training for epoch_i in range(int(args.num_train_epochs)): print('[ Epoch', epoch_i, ']') # training model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) sent_idx, input_ids, input_mask, segment_ids, clss_ids, clss_mask, label_ids = batch sent_scores, mask = model(input_ids, segment_ids, clss_ids, input_mask, clss_mask) loss = loss_f(sent_scores, label_ids.float()) loss = (loss * mask.float()).sum() loss = loss / loss.numel() if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 train_loss = tr_loss / nb_tr_steps cur_loss = train_loss # evaluation if eval_dataloader and (args.local_rank == -1 or torch.distributed.get_rank() == 0): model.eval() eval_loss, eval_pk, eval_windiff, eval_bound_sim = 0, 0, 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for sent_idx, input_ids, input_mask, segment_ids, clss_ids, clss_mask, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) clss_ids = clss_ids.to(device) clss_mask = clss_mask.to(device) label_ids = label_ids.to(device) with torch.no_grad(): sent_scores, mask = model(input_ids, segment_ids, clss_ids, input_mask, clss_mask) tmp_eval_loss = loss_f(sent_scores, label_ids.float()) tmp_eval_loss = (tmp_eval_loss * mask.float()).sum() tmp_eval_loss = tmp_eval_loss / tmp_eval_loss.numel() sent_scores = sent_scores.detach().cpu().numpy() mask = mask.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() # find selected sentences _pred, selected_ids = select_seg(sent_idx, eval_examples, sent_scores, mask) tmp_pk, tmp_windiff, tmp_bound_sim = accuracy( selected_ids, label_ids, mask) eval_loss += tmp_eval_loss.mean().item() eval_pk += tmp_pk eval_windiff += tmp_windiff eval_bound_sim += tmp_bound_sim nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_pk = eval_pk / nb_eval_examples eval_windiff = eval_windiff / nb_eval_examples eval_bound_sim = eval_bound_sim / nb_eval_examples cur_loss = eval_loss # output result of an epoch print(f' - (Training) loss: {train_loss: 8.5f}') print( f' - (Validation) loss: {eval_loss: 8.5f}, Pk: {100 * eval_pk: 3.3f} , WinDiff: {100 * eval_windiff: 3.3f} , Boundary Similarity: {100 * eval_bound_sim: 3.3f} ' ) # record best model if cur_loss < best_performance: best_performance = cur_loss best_epoch = epoch_i # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save(model_to_save.state_dict(), output_model_file) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) print("[Train] Successfully save the best model.")
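# The Summarizer training loop above reports Pk, WindowDiff and Boundary Similarity via an
# accuracy() helper that is not shown in this section. For reference, Pk and WindowDiff on
# boundary strings can be computed with nltk's segmentation metrics; this is a standalone
# sketch with made-up data, not the project's own accuracy() implementation.
from nltk.metrics.segmentation import pk, windowdiff

ref = "0100010000"   # gold boundaries: '1' marks a segment boundary after a sentence
hyp = "0100000100"   # predicted boundaries
k = 3                # window size, typically half the mean gold segment length
print("Pk:", pk(ref, hyp, k=k))
print("WinDiff:", windowdiff(ref, hyp, k))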
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--hybrid_attention", action='store_true', help="Whether to use hybrid attention") parser.add_argument("--continue_training", action='store_true', help="Continue training from a checkpoint") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and not args.continue_training: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForMaskedLM.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.hybrid_attention: max_seq_length = args.max_seq_length attention_mask = torch.ones(12, max_seq_length, max_seq_length, dtype=torch.long) # left attention attention_mask[:2, :, :] = torch.tril( torch.ones(max_seq_length, max_seq_length, dtype=torch.long)) # right attention attention_mask[2:4, :, :] = torch.triu( torch.ones(max_seq_length, max_seq_length, dtype=torch.long)) # local attention, window size = 3 attention_mask[4:6, :, :] = torch.triu( torch.tril( torch.ones(max_seq_length, max_seq_length, dtype=torch.long), 1), -1) attention_mask = torch.cat( [attention_mask.unsqueeze(0) for _ in range(8)]) attention_mask = attention_mask.to(device) else: attention_mask = None global_step = 0 epoch_start = 0 if args.do_train: if args.continue_training: # if checkpoint file exists, find the last checkpoint if os.path.exists(args.output_dir) and os.listdir(args.output_dir): all_cp = os.listdir(args.output_dir) steps = [ int(re.search('_\d+', cp).group()[1:]) for cp in all_cp if re.search('_\d+', cp) ] if len(steps) == 0: raise ValueError( "No existing checkpoint. Please do not use --continue_training." ) max_step = max(steps) # load checkpoint checkpoint = torch.load( os.path.join(args.output_dir, 'checkpoints_' + str(max_step) + '.pt')) logger.info("***** Loading checkpoint *****") logger.info(" Num steps = %d", checkpoint['global_step']) logger.info(" Num epoch = %d", checkpoint['epoch']) logger.info(" Loss = %d, %d", checkpoint['loss'], checkpoint['loss_now']) model.module.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) global_step = checkpoint['global_step'] epoch_start = checkpoint['epoch'] del checkpoint else: raise ValueError( "No existing checkpoint. Please do not use --continue_training." ) writer = SummaryWriter(log_dir=os.environ['HOME']) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() tr_loss_1000 = 0 for ep in trange(epoch_start, int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, hybrid_mask=attention_mask) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() tr_loss_1000 += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # log the training loss for every 1000 steps if global_step % 1000 == 999: writer.add_scalar('data/loss', tr_loss_1000 / 1000, global_step) logger.info("training steps: %s", global_step) logger.info("training loss per 1000: %s", tr_loss_1000 / 1000) tr_loss_1000 = 0 # save the checkpoint for every 10000 steps if global_step % 10000 == 0: model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_file = os.path.join( args.output_dir, "checkpoints_" + str(global_step) + ".pt") checkpoint = { 'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': ep, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps, 'loss_now': tr_loss_1000 } if args.do_train: torch.save(checkpoint, output_file) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_" + str(ep)) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) logger.info("training loss: %s", tr_loss / nb_tr_steps) # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
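# The masked-LM script above builds a "hybrid" attention mask when --hybrid_attention is
# set: per head, full attention by default, with heads 0-1 left-to-right, heads 2-3
# right-to-left, and heads 4-5 a local window of 3, then broadcast over 8 copies. A tiny
# standalone repetition of that construction on a short sequence, useful for eyeballing
# the per-head patterns (the grouping of heads is read off the code above):
import torch

seq_len = 8
mask = torch.ones(12, seq_len, seq_len, dtype=torch.long)                               # 12 heads, default: full attention
mask[:2] = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long))                   # heads 0-1: left-to-right
mask[2:4] = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.long))                  # heads 2-3: right-to-left
mask[4:6] = torch.triu(torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long), 1), -1)  # heads 4-5: local window of 3
print(mask[0])   # lower-triangular (causal)
print(mask[4])   # banded (local)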
def main(args): device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = \ args.train_batch_size // args.gradient_accumulation_steps if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.do_train: assert (args.train_file is not None) and (args.dev_file is not None) if args.eval_test: assert args.test_file is not None else: assert args.dev_file is not None if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) logger.info(args) tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) if args.do_train or (not args.eval_test): with gzip.GzipFile(args.dev_file, 'r') as reader: content = reader.read().decode('utf-8').strip().split('\n')[1:] eval_dataset = [json.loads(line) for line in content] eval_examples = read_mrqa_examples(input_file=args.dev_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Dev *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) if args.do_train: train_examples = read_mrqa_examples(input_file=args.train_file, is_training=True) train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, 
all_end_positions) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size) train_batches = [batch for batch in train_dataloader] num_train_optimization_steps = \ len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs logger.info("***** Train *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) eval_step = max(1, len(train_batches) // args.eval_per_epoch) best_result = None lrs = [args.learning_rate] if args.learning_rate else [ 1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5 ] for lr in lrs: model = BertForQuestionAnswering.from_pretrained( args.model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) param_optimizer = [ n for n in param_optimizer if 'pooler' not in n[0] ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex" "to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 global_step = 0 start_time = time.time() for epoch in range(int(args.num_train_epochs)): model.train() logger.info("Start epoch #{} (lr = {})...".format(epoch, lr)) for step, batch in enumerate(train_batches): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = lr * \ warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % eval_step == 0: logger.info( 'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' .format(epoch, step + 1, len(train_dataloader), time.time() - start_time, tr_loss / nb_tr_steps)) save_model = False if args.do_eval: result, _, _ = \ evaluate(args, model, device, eval_dataset, eval_dataloader, eval_examples, eval_features) model.train() result['global_step'] = global_step result['epoch'] = epoch result['learning_rate'] = lr result['batch_size'] = args.train_batch_size if (best_result is 
None) or (result[args.eval_metric] > best_result[args.eval_metric]): best_result = result save_model = True logger.info( "!!! Best dev %s (lr=%s, epoch=%d): %.2f" % (args.eval_metric, str(lr), epoch, result[args.eval_metric])) else: save_model = True if save_model: model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file( output_config_file) tokenizer.save_vocabulary(args.output_dir) if best_result: with open( os.path.join(args.output_dir, EVAL_FILE), "w") as writer: for key in sorted(best_result.keys()): writer.write( "%s = %s\n" % (key, str(best_result[key]))) if args.do_eval: if args.eval_test: with gzip.GzipFile(args.test_file, 'r') as reader: content = reader.read().decode('utf-8').strip().split('\n')[1:] eval_dataset = [json.loads(line) for line in content] eval_examples = read_mrqa_examples(input_file=args.test_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Test *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) model = BertForQuestionAnswering.from_pretrained(args.output_dir) if args.fp16: model.half() model.to(device) result, preds, nbest_preds = \ evaluate(args, model, device, eval_dataset, eval_dataloader, eval_examples, eval_features) with open(os.path.join(args.output_dir, PRED_FILE), "w") as writer: writer.write(json.dumps(preds, indent=4) + "\n") with open(os.path.join(args.output_dir, TEST_FILE), "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key])))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test and create submission.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--seed", default=None, type=int, help="Seed for randomized elements in the training") parser.add_argument("--eval_batch_size", default=16, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") ## Our arguements parser.add_argument("--mode", choices=[ "none", "distill", "smoothed_distill", "reweight_baseline", "bias_product_baseline", "learned_mixin_baseline" ]) parser.add_argument("--penalty", type=float, default=0.03, help="Penalty weight for the learn_mixin model") parser.add_argument("--n_processes", type=int, default=4, help="Processes to use for pre-processing") parser.add_argument("--debug", action="store_true") parser.add_argument( "--sorted", action="store_true", help='Sort the data so most batches have the same input length,' ' makes things about 2x faster. 
Our experiments did not actually' ' use this in the end (not sure if it makes a difference) so ' 'its off by default.') parser.add_argument("--which_bias", choices=["hans", "hypo", "hans_json"], required=True) parser.add_argument("--custom_teacher", default=None) parser.add_argument("--theta", type=float, default=0.1, help="for theta smoothed distillation loss") args = parser.parse_args() utils.add_stdout_logger() if args.mode == "none": loss_fn = clf_distill_loss_functions.Plain() elif args.mode == "distill": loss_fn = clf_distill_loss_functions.DistillLoss() elif args.mode == "smoothed_distill": loss_fn = clf_distill_loss_functions.SmoothedDistillLoss() elif args.mode == "reweight_baseline": loss_fn = clf_distill_loss_functions.ReweightBaseline() elif args.mode == "bias_product_baseline": loss_fn = clf_distill_loss_functions.BiasProductBaseline() elif args.mode == "learned_mixin_baseline": loss_fn = clf_distill_loss_functions.LearnedMixinBaseline(args.penalty) else: raise RuntimeError() output_dir = args.output_dir if args.do_train: if exists(output_dir): if len(os.listdir(output_dir)) > 0: logging.warning("Output dir exists and is non-empty") else: os.makedirs(output_dir) print("Saving model to %s" % output_dir) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if args.seed is not None: random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train: logging.warning( "Output directory ({}) already exists and is not empty.".format( output_dir)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Its way ot easy to forget if this is being set by a command line flag if "-uncased" in args.bert_model: do_lower_case = True elif "-cased" in args.bert_model: do_lower_case = False else: raise NotImplementedError(args.bert_model) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=do_lower_case) num_train_optimization_steps = None train_examples = None if args.do_train: train_examples = load_mnli(True, 2000 if args.debug else None) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertDistill.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=3, 
loss_fn=loss_fn) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features: List[InputFeatures] = convert_examples_to_features( train_examples, args.max_seq_length, tokenizer, args.n_processes) if args.which_bias == "mix": hypo_bias_map = load_bias("hypo") hans_bias_map = load_bias("hans") bias_map = {} def compute_entropy(probs, base=3): return -(probs * (np.log(probs) / np.log(base))).sum() for key in hypo_bias_map.keys(): hypo_ent = compute_entropy(np.exp(hypo_bias_map[key])) hans_ent = compute_entropy(np.exp(hans_bias_map[key])) if hypo_ent < hans_ent: bias_map[key] = hypo_bias_map[key] else: bias_map[key] = hans_bias_map[key] else: bias_map = load_bias(args.which_bias) for fe in train_features: fe.bias = bias_map[fe.example_id].astype(np.float32) teacher_probs_map = load_teacher_probs(args.custom_teacher) for fe in train_features: fe.teacher_probs = np.array( teacher_probs_map[fe.example_id]).astype(np.float32) example_map = {} for ex in train_examples: example_map[ex.id] = ex logging.info("***** Running training *****") logging.info(" Num examples = %d", len(train_examples)) logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) train_dataloader = build_train_dataloader(train_features, args.train_batch_size, args.seed, args.sorted) model.train() loss_ema = 0 total_steps = 0 decay = 0.99 for _ in trange(int(args.num_train_epochs), desc="Epoch", ncols=100): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 pbar = tqdm(train_dataloader, desc="loss", ncols=100) for step, batch in enumerate(pbar): batch = tuple(t.to(device) for t in batch) if bias_map is not None: example_ids, input_ids, input_mask, segment_ids, label_ids, bias, teacher_probs = batch else: bias = None example_ids, input_ids, input_mask, segment_ids, label_ids = batch logits, loss = model(input_ids, segment_ids, input_mask, label_ids, bias, teacher_probs) total_steps += 1 loss_ema = loss_ema * decay + loss.cpu().detach().numpy() * ( 1 - decay) descript = "loss=%.4f" % (loss_ema / (1 - decay**total_steps)) pbar.set_description(descript, refresh=False) if n_gpu > 1: loss = loss.mean() 
# mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Record the args as well arg_dict = {} for arg in vars(args): arg_dict[arg] = getattr(args, arg) with open(join(output_dir, "args.json"), 'w') as out_fh: json.dump(arg_dict, out_fh) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertDistill(config, num_labels=3, loss_fn=loss_fn) model.load_state_dict(torch.load(output_model_file)) else: output_config_file = os.path.join(output_dir, CONFIG_NAME) config = BertConfig.from_json_file(output_config_file) output_model_file = os.path.join(output_dir, WEIGHTS_NAME) model = BertDistill(config, num_labels=3, loss_fn=loss_fn) model.load_state_dict(torch.load(output_model_file)) model.to(device) if not args.do_eval and not args.do_test: return if not (args.local_rank == -1 or torch.distributed.get_rank() == 0): return model.eval() if args.do_eval: eval_datasets = [("mnli_dev_m", load_mnli(False)), ("mnli_dev_mm", load_mnli(False, custom_path="dev_mismatched.tsv"))] eval_datasets += load_easy_hard() eval_datasets += [("hans", load_hans())] eval_datasets += load_hans_subsets() else: eval_datasets = [] if args.do_test: test_datasets = load_all_test_jsonl() eval_datasets += test_datasets subm_paths = [ "../submission/{}.csv".format(x[0]) for x in test_datasets ] for ix, (name, eval_examples) in enumerate(eval_datasets): logging.info("***** Running evaluation on %s *****" % name) logging.info(" Num examples = %d", len(eval_examples)) logging.info(" Batch size = %d", args.eval_batch_size) eval_features = convert_examples_to_features(eval_examples, args.max_seq_length, tokenizer) eval_features.sort(key=lambda x: len(x.input_ids)) all_label_ids = np.array([x.label_id for x in eval_features]) eval_dataloader = build_eval_dataloader(eval_features, args.eval_batch_size) eval_loss = 0 nb_eval_steps = 0 probs = [] test_subm_ids = [] for example_ids, input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating", ncols=100): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) # create eval loss and other metric required by the task loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, 3), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 probs.append( 
torch.nn.functional.softmax(logits, 1).detach().cpu().numpy()) test_subm_ids.append(example_ids.cpu().numpy()) probs = np.concatenate(probs, 0) test_subm_ids = np.concatenate(test_subm_ids, 0) eval_loss = eval_loss / nb_eval_steps if "hans" in name: # take max of non-entailment rather than taking their sum probs[:, 0] = probs[:, [0, 2]].max(axis=1) probs = probs[:, :2] preds = np.argmax(probs, axis=1) result = {"acc": simple_accuracy(preds, all_label_ids)} result["loss"] = eval_loss conf_plot_file = os.path.join(output_dir, "eval_%s_confidence.png" % name) ECE, bins_acc, bins_conf, bins_num = visualize_predictions( probs, all_label_ids, conf_plot_file=conf_plot_file) result["ECE"] = ECE result["bins_acc"] = bins_acc result["bins_conf"] = bins_conf result["bins_num"] = bins_num output_eval_file = os.path.join(output_dir, "eval_%s_results.txt" % name) output_all_eval_file = os.path.join(output_dir, "eval_all_results.txt") with open(output_eval_file, "w") as writer, open(output_all_eval_file, "a") as all_writer: logging.info("***** Eval results *****") all_writer.write("eval results on %s:\n" % name) for key in sorted(result.keys()): logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) all_writer.write("%s = %s\n" % (key, str(result[key]))) output_answer_file = os.path.join(output_dir, "eval_%s_answers.json" % name) answers = { ex.example_id: [float(x) for x in p] for ex, p in zip(eval_features, probs) } with open(output_answer_file, "w") as f: json.dump(answers, f) # prepare submission file if args.do_test and ix >= len(eval_datasets) - len(test_datasets): with open(subm_paths.pop(0), "w") as subm_f: subm_f.write("pairID,gold_label\n") for sub_id, pred_label_id in zip(test_subm_ids, preds): subm_f.write("{},{}\n".format( str(sub_id), REV_NLI_LABEL_MAP[pred_label_id]))
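# For the HANS sets above, three-way NLI probabilities are collapsed to a
# binary entailment / non-entailment decision by taking the max (rather than
# the sum) of the two non-entailment classes. A minimal numpy sketch, assuming
# columns 0 and 2 are the non-entailment labels and column 1 is entailment
# (the actual column order depends on the label map used by load_mnli):
import numpy as np

def collapse_for_hans(probs):
    """probs: (N, 3) softmax outputs -> (N, 2) [non-entailment, entailment]."""
    out = probs[:, :2].copy()
    out[:, 0] = probs[:, [0, 2]].max(axis=1)
    return out

probs = np.array([[0.2, 0.5, 0.3],
                  [0.4, 0.1, 0.5]])
print(collapse_for_hans(probs))                 # [[0.3 0.5] [0.5 0.1]]
print(collapse_for_hans(probs).argmax(axis=1))  # [1 0]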
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_swag_examples(os.path.join( args.data_dir, 'train.csv'), is_training=True) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)) predictor = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=cache_dir, num_choices=4) adversary = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=3) if args.fp16: predictor.half() adversary.half() predictor.to(device) adversary.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) predictor = DDP(predictor) adversary = DDP(adversary) elif n_gpu > 1: predictor = torch.nn.DataParallel(predictor) adversary = torch.nn.DataParallel(adversary) # Prepare optimizer param_optimizer_pred = list(predictor.named_parameters()) param_optimizer_adv = list(adversary.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer_pred = [ n for n in param_optimizer_pred if 'pooler' not in n[0] ] param_optimizer_adv = [ n for n in param_optimizer_adv if 'pooler' not in n[0] ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters_pred = [{ 'params': [ p for n, p in param_optimizer_pred if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer_pred if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer_grouped_parameters_adv = [{ 'params': [ p for n, p in param_optimizer_adv if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer_adv if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer_pred = FusedAdam(optimizer_grouped_parameters_pred, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) optimizer_adv = FusedAdam(optimizer_grouped_parameters_adv, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer_pred = FP16_Optimizer(optimizer_pred, dynamic_loss_scale=True) optimizer_adv = FP16_Optimizer(optimizer_adv, dynamic_loss_scale=True) else: optimizer_pred = FP16_Optimizer(optimizer_pred, static_loss_scale=args.loss_scale) optimizer_adv = FP16_Optimizer(optimizer_adv, static_loss_scale=args.loss_scale) else: optimizer_pred = BertAdam(optimizer_grouped_parameters_pred, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) optimizer_adv = BertAdam(optimizer_grouped_parameters_adv, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) alpha = 1 global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) all_vp_input_ids = torch.tensor(select_field(train_features, 'vp_input_ids'), dtype=torch.long) all_vp_input_mask = torch.tensor(select_field(train_features, 'vp_input_mask'), dtype=torch.long) all_protected_attr = torch.tensor( [f.protected_attr for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_vp_input_ids, all_vp_input_mask, all_protected_attr) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) training_history = [] # predictor.train() adversary.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss_pred, tr_loss_adv = 0, 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, vp_input_ids, vp_input_mask, protected_attr_ids = batch # loss_pred, logits = predictor(input_ids, segment_ids, input_mask, label_ids) # predicted_vps = torch.argmax(logits, dim=1) # print("predicted_vps: ", predicted_vps) # predicted_vps = predicted_vps.view(-1, 1).repeat(1, vp_input_ids.size(2)).view([-1, 1, vp_input_ids.size(2)]) # vp_input_ids = torch.gather(vp_input_ids, dim=1, index=predicted_vps) # vp_input_ids = vp_input_ids.view([vp_input_ids.size(0), -1]) # vp_input_mask = torch.gather(vp_input_mask, dim=1, index=predicted_vps) # vp_input_mask = vp_input_mask.view([vp_input_mask.size(0), -1]) vp_input_ids = vp_input_ids.view( vp_input_ids.shape[0] * vp_input_ids.shape[1], vp_input_ids.shape[2]) vp_input_mask = vp_input_mask.view( vp_input_mask.shape[0] * vp_input_mask.shape[1], vp_input_mask.shape[2]) vp_input_ids = torch.cat([ 100 * torch.ones(vp_input_ids.shape[0], 1).long().to(device), vp_input_ids ], dim=1) 
vp_input_mask = torch.cat([ torch.ones(vp_input_ids.shape[0], 1).long().to(device), vp_input_mask ], dim=1) #protected_attr_ids = protected_attr_ids.t() protected_attr_ids = protected_attr_ids.repeat(4, 1).t() protected_attr_ids = protected_attr_ids.reshape( protected_attr_ids.shape[0] * protected_attr_ids.shape[1]) # print(vp_input_ids.shape, vp_input_mask.shape, protected_attr_ids.shape) # print(protected_attr_ids.detach().to('cpu').numpy()) loss_adv, _ = adversary(vp_input_ids, None, vp_input_mask, protected_attr_ids) if n_gpu > 1: # loss_pred = loss_pred.mean() # mean() to average on multi-gpu. loss_adv = loss_adv.mean() if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html # loss_pred = loss_pred * args.loss_scale loss_adv = loss_adv * args.loss_scale if args.gradient_accumulation_steps > 1: # loss_pred = loss_pred / args.gradient_accumulation_steps loss_adv = loss_adv / args.gradient_accumulation_steps # tr_loss_pred += loss_pred.item() tr_loss_adv += loss_adv.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 training_history.append([loss_adv.item()]) # loss = loss_pred - alpha * loss_adv # if args.fp16: # optimizer_pred.backward(loss) # else: # loss.backward(retain_graph=True) # if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer_pred.param_groups: param_group['lr'] = lr_this_step for param_group in optimizer_adv.param_groups: param_group['lr'] = lr_this_step # optimizer_pred.step() # optimizer_pred.zero_grad() # optimizer_adv.zero_grad() if args.fp16: optimizer_adv.backward(loss_adv) else: loss_adv.backward() optimizer_adv.step() optimizer_adv.zero_grad() global_step += 1 history_file = open(os.path.join(args.output_dir, "train_results.csv"), "w") writer = csv.writer(history_file, delimiter=",") writer.writerow(["adv_loss"]) for row in training_history: writer.writerow(row) if args.do_train: # Save a trained model and the associated configuration # model_to_save = predictor.module if hasattr(predictor, 'module') else predictor # Only save the model it-self WEIGHTS_NAME = 'weights.pt' CONFIG_NAME = 'config.json' # output_model_file = os.path.join(args.output_dir, 'predictor_' + WEIGHTS_NAME) # torch.save(model_to_save.state_dict(), output_model_file) # output_config_file = os.path.join(args.output_dir, 'predictor_' + CONFIG_NAME) # with open(output_config_file, 'w') as f: # f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned # config = BertConfig(output_config_file) # predictor = BertForMultipleChoice(config, num_choices=4) # predictor.load_state_dict(torch.load(output_model_file)) # Do the same for adversary model_to_save = adversary.module if hasattr( adversary, 'module') else adversary # Only save the model it-self output_model_file = os.path.join(args.output_dir, 'adversary_' + WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, 'adversary_' + CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) config = BertConfig(output_config_file) adversary = BertForSequenceClassification(config, num_labels=3) 
adversary.load_state_dict(torch.load(output_model_file)) else: WEIGHTS_NAME = 'weights.pt' CONFIG_NAME = 'config.json' # output_model_file = os.path.join(args.output_dir, 'predictor_' + WEIGHTS_NAME) # output_config_file = os.path.join(args.output_dir, 'predictor_' + CONFIG_NAME) # config = BertConfig(output_config_file) # predictor = BertForMultipleChoice(config, num_choices=4) # predictor.load_state_dict(torch.load(output_model_file)) output_model_file = os.path.join(args.output_dir, 'adversary_' + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, 'adversary_' + CONFIG_NAME) config = BertConfig(output_config_file) adversary = BertForSequenceClassification(config, num_labels=3) adversary.load_state_dict(torch.load(output_model_file)) # predictor = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) # adversary = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=3) # predictor.to(device) adversary.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join( args.data_dir, 'val.csv'), is_training=True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) all_vp_input_ids = torch.tensor(select_field(eval_features, 'vp_input_ids'), dtype=torch.long) all_vp_input_mask = torch.tensor(select_field(eval_features, 'vp_input_mask'), dtype=torch.long) all_protected_attr = torch.tensor( [f.protected_attr for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_vp_input_ids, all_vp_input_mask, all_protected_attr) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # predictor.eval() adversary.eval() eval_loss_pred, eval_accuracy_pred = 0, 0 eval_loss_adv, eval_accuracy_adv = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids, vp_input_ids, vp_input_mask, protected_attr_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) vp_input_ids = vp_input_ids.to(device) vp_input_mask = vp_input_mask.to(device) protected_attr_ids = protected_attr_ids.to(device) # with torch.no_grad(): # tmp_eval_loss_pred, logits_pred = predictor(input_ids, segment_ids, input_mask, label_ids) # predicted_vps = torch.argmax(logits_pred, dim=1) # predicted_vps = predicted_vps.view(-1, 1).repeat(1, vp_input_ids.size(2)).view([-1, 1, vp_input_ids.size(2)]) # vp_input_ids = torch.gather(vp_input_ids, dim=1, index=predicted_vps) # vp_input_ids = vp_input_ids.view([vp_input_ids.size(0), -1]) # vp_input_mask = torch.gather(vp_input_mask, dim=1, index=predicted_vps) # vp_input_mask = vp_input_mask.view([vp_input_mask.size(0), -1]) vp_input_ids = vp_input_ids.view( vp_input_ids.shape[0] * vp_input_ids.shape[1], 
vp_input_ids.shape[2]) vp_input_mask = vp_input_mask.view( vp_input_mask.shape[0] * vp_input_mask.shape[1], vp_input_mask.shape[2]) vp_input_ids = torch.cat([ 100 * torch.ones(vp_input_ids.shape[0], 1).long().to(device), vp_input_ids ], dim=1) vp_input_mask = torch.cat([ torch.ones(vp_input_ids.shape[0], 1).long().to(device), vp_input_mask ], dim=1) #protected_attr_ids = protected_attr_ids.t() protected_attr_ids = protected_attr_ids.repeat(4, 1).t() protected_attr_ids = protected_attr_ids.reshape( protected_attr_ids.shape[0] * protected_attr_ids.shape[1]) with torch.no_grad(): tmp_eval_loss_adv, logits_adv = adversary( vp_input_ids, None, vp_input_mask, protected_attr_ids) # print("logits_adv", logits_adv) # tmp_eval_accuracy_pred = accuracy(logits_pred, label_ids) print("logits_adv shape ", logits_adv.shape) print("protected_attr_ids shape ", protected_attr_ids.shape) tmp_eval_accuracy_adv = accuracy(logits_adv, protected_attr_ids) # eval_loss_pred += tmp_eval_loss_pred.mean().item() # eval_accuracy_pred += tmp_eval_accuracy_pred.item() eval_loss_adv += tmp_eval_loss_adv.mean().item() eval_accuracy_adv += tmp_eval_accuracy_adv.item() nb_eval_examples += input_ids.size(0) * 4 nb_eval_steps += 1 print("eval_accuracy_adv", eval_accuracy_adv) print("nb_eval_examples", nb_eval_examples) # eval_loss_pred /= nb_eval_steps # eval_accuracy_pred /= nb_eval_examples eval_loss_adv /= nb_eval_steps eval_accuracy_adv /= nb_eval_examples print("========================================================") print("eval_accuracy_adv", eval_accuracy_adv) print("nb_eval_examples", nb_eval_examples) if args.do_train: result = { # 'eval_loss_pred': eval_loss_pred, # 'eval_accuracy_pred': eval_accuracy_pred, 'eval_loss_adv': eval_loss_adv, 'eval_accuracy_adv': eval_accuracy_adv, 'global_step': global_step, # 'loss_pred': tr_loss_pred/nb_tr_steps, 'loss_adv': tr_loss_adv / nb_tr_steps } else: result = { # 'eval_loss_pred': eval_loss_pred, # 'eval_accuracy_pred': eval_accuracy_pred, 'eval_loss_adv': eval_loss_adv, 'eval_accuracy_adv': eval_accuracy_adv } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
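# In the adversary loops above, every example contributes one VP sequence per
# answer choice, so the per-example protected attribute has to be tiled to
# line up with the flattened (batch_size * num_choices) adversary inputs.
# A minimal shape-only sketch of that alignment, assuming 4 choices as in the
# script (the toy tensors below are hypothetical):
import torch

batch_size, num_choices, seq_len = 2, 4, 5
vp_input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
protected_attr_ids = torch.tensor([0, 1])        # one label per example

flat_vp = vp_input_ids.view(batch_size * num_choices, seq_len)
flat_attr = protected_attr_ids.repeat(num_choices, 1).t().reshape(-1)

print(flat_vp.shape)   # torch.Size([8, 5])
print(flat_attr)       # tensor([0, 0, 0, 0, 1, 1, 1, 1]) -- each example's label repeated per choice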
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument( "--test_set", default='story', type=str, #choices=['story', 'news', 'chat', 'train'], help="Choose the test set.") parser.add_argument("--no_logit_mask", action='store_true', help="Whether not to use logit mask") parser.add_argument("--eval_every_epoch", action='store_true', help="Whether to evaluate for every epoch") parser.add_argument("--use_weight", action='store_true', help="Whether to use class-balancing weight") parser.add_argument("--hybrid_attention", action='store_true', help="Whether to use hybrid attention") parser.add_argument( "--state_dir", default="", type=str, help= "Where to load state dict instead of using Google pre-trained model") parser.add_argument( '--ratio', type=float, default=0.9, help="softmax target for the target label, 1-ratio for the abbreviation" ) args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train`, `do_eval`, or `do_test` must be True."
) processor = DataProcessor() label_list = processor.get_labels(args.data_dir) abex = processor.get_abex(args.data_dir) num_labels = len(label_list) logger.info("num_labels:" + str(num_labels)) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = len( train_examples ) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps / torch.distributed.get_world_size( ) num_train_optimization_steps = math.ceil(num_train_optimization_steps) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) max_epoch = -1 if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) files = os.listdir(args.output_dir) for fname in files: if re.search(WEIGHTS_NAME, fname) and fname != WEIGHTS_NAME: max_epoch = max(max_epoch, int(fname.split('_')[-1])) if os.path.exists( os.path.join(args.output_dir, WEIGHTS_NAME + '_' + str(max_epoch))): output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME + '_' + str(max_epoch)) output_config_file = os.path.join(args.output_dir, CONFIG_NAME + '_0') config = BertConfig(output_config_file) model = BertForAbbrPad(config) model.load_state_dict(torch.load(output_model_file)) else: raise ValueError( "Output directory ({}) already exists but no model checkpoint was found." .format(args.output_dir)) else: os.makedirs(args.output_dir, exist_ok=True) if args.state_dir and os.path.exists(args.state_dir): state_dict = torch.load(args.state_dir) if isinstance(state_dict, dict) or isinstance( state_dict, collections.OrderedDict): assert 'model' in state_dict state_dict = state_dict['model'] print("Using my own BERT state dict.") elif args.state_dir and not os.path.exists(args.state_dir): print( "Warning: the state dict does not exist, using the Google pre-trained model instead." ) state_dict = None else: state_dict = None model = BertForAbbrPad.from_pretrained(args.bert_model, cache_dir=cache_dir, state_dict=state_dict) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if os.path.exists( os.path.join(args.output_dir, OPTIMIZER_NAME + '_' + str(max_epoch))): output_optimizer_file = os.path.join( args.output_dir, OPTIMIZER_NAME + '_' + str(max_epoch)) optimizer.load_state_dict(torch.load(output_optimizer_file)) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features, masks, weight = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, abex, args.ratio) if args.eval_every_epoch: eval_examples = processor.get_dev_examples(args.data_dir) eval_features, masks, weight = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, abex, args.ratio) if args.no_logit_mask: print("Remove logit mask") masks = None if not args.use_weight: weight = None hybrid_mask = None writer = SummaryWriter(log_dir=os.environ['HOME']) tag = str(int(time.time())) print(weight) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_label_poss = torch.tensor([f.label_pos for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, label_ids, label_poss = batch # print(masks.size()) loss = model(input_ids, attention_mask=input_mask, labels=label_ids, cal_loss=True) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 writer.add_scalar('data/loss' + tag, loss.item(), global_step) logger.info(f'Trainging loss: {tr_loss/nb_tr_steps}') # save model and load for a single GPU model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME + '_' + str(ep)) torch.save(model_to_save.state_dict(), output_model_file) output_optimizer_file = os.path.join( args.output_dir, OPTIMIZER_NAME + '_' + str(ep)) torch.save(optimizer.state_dict(), output_optimizer_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME + '_' + str(ep)) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) if args.eval_every_epoch: # evaluate for every epoch # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model_eval = BertForAbbrPad(config, num_labels=num_labels) model_eval.load_state_dict(torch.load(output_model_file)) model_eval.to(device) hybrid_mask = None if args.no_logit_mask: print("Remove logit mask") masks = None else: masks = masks.to(device) chars = [f.char for f in eval_features] print(len(set(chars)), sorted(list(set(chars)))) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) all_label_poss = torch.tensor( [f.label_pos for f in eval_features], dtype=torch.long) all_ids = torch.tensor([f.unique_id for f in eval_features], dtype=torch.long) all_candidates = [f.candidates for f in eval_features] all_labels = [f.label for f in eval_features] eval_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss, all_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 res_list = [] out_list = [] # masks = masks.to(device) for input_ids, input_mask, label_ids, label_poss, all_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) label_ids = label_ids.to(device) label_poss = label_poss.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, attention_mask=input_mask, labels=label_ids, cal_loss=True) logits = model(input_ids, attention_mask=input_mask, labels=label_ids, cal_loss=False) # print(logits.size()) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() for id in all_ids: tmp_out, tmp_res = 
get_result(logits[id], label_poss[id], all_candidates[id], all_labels[id], tokenizer) res_list.append(tmp_res) out_list.append(tmp_out) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None acc = sum(res_list) / len(res_list) char_count = {k: [] for k in list(set(chars))} for i, c in enumerate(chars): char_count[c].append(res_list[i]) char_acc = { k: sum(char_count[k]) / len(char_count[k]) for k in char_count } result = { 'epoch': ep + 1, 'eval_loss': eval_loss, 'eval_accuracy': acc, 'global_step': global_step, 'loss': loss } logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) output_eval_file = os.path.join( args.output_dir, "epoch_" + str(ep + 1) + ".txt") with open(output_eval_file, 'w') as f: f.write( json.dumps(result, ensure_ascii=False) + '\n' + json.dumps(char_acc, ensure_ascii=False)) # multi processing # if n_gpu > 1: # model = torch.nn.DataParallel(model) if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_optimizer_file = os.path.join(args.output_dir, OPTIMIZER_NAME) torch.save(optimizer.state_dict(), output_optimizer_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForAbbrPad(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: # model = BertForPolyphonyMulti.from_pretrained(args.bert_model, num_labels = num_labels) pass model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForAbbrPad(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) eval_examples = processor.get_test_examples(args.data_dir) eval_features, masks, weight = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, abex, args.ratio, is_test=True) hybrid_mask = None if args.no_logit_mask: print("Remove logit mask") masks = None else: masks = masks.to(device) chars = [f.char for f in eval_features] print(len(set(chars)), sorted(list(set(chars)))) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_label_poss = torch.tensor([f.label_pos for f in eval_features], dtype=torch.long) all_ids = torch.tensor([f.unique_id for f in eval_features], dtype=torch.long) all_candidates = [f.candidates for f in eval_features] all_labels = [f.label for f in eval_features] eval_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss, all_ids) # Run 
prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 res_list = [] out_list = [] # masks = masks.to(device) for input_ids, input_mask, label_ids, label_poss, all_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) label_ids = label_ids.to(device) label_poss = label_poss.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, attention_mask=input_mask, labels=label_ids, cal_loss=True) logits = model(input_ids, attention_mask=input_mask, labels=label_ids, cal_loss=False) # print(logits.size()) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() for id in all_ids: tmp_out, tmp_res = get_result(logits[id], label_poss[id], all_candidates[id], all_labels[id], tokenizer) res_list.append(tmp_res) out_list.append(tmp_out) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None acc = sum(res_list) / len(res_list) char_count = {k: [] for k in list(set(chars))} for i, c in enumerate(chars): char_count[c].append(res_list[i]) char_acc = { k: sum(char_count[k]) / len(char_count[k]) for k in char_count } result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss, 'acc': acc } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) for key in sorted(char_acc.keys()): logger.info(" %s = %s", key, str(char_acc[key])) writer.write("%s = %s\n" % (key, str(char_acc[key]))) print("mean accuracy", sum(char_acc[c] for c in char_acc) / len(char_acc)) output_acc_file = os.path.join(args.output_dir, "res.json") output_reslist_file = os.path.join(args.output_dir, "outlist.json") with open(output_acc_file, "w") as f: f.write(json.dumps(char_acc, ensure_ascii=False, indent=2)) with open(output_reslist_file, "w") as f: f.write(json.dumps(out_list, ensure_ascii=False, indent=2))
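# The evaluation above reports both an overall accuracy and a per-character
# breakdown: each example is keyed by its source character, the correctness
# flags are grouped per character, and a macro average over characters is
# printed at the end. A small self-contained sketch of that aggregation
# (chars / res_list below are hypothetical toy data):
chars = ['A', 'B', 'A', 'B', 'B']   # one character label per example
res_list = [1, 0, 1, 1, 0]          # 1 = prediction was correct

char_count = {c: [] for c in set(chars)}
for c, r in zip(chars, res_list):
    char_count[c].append(r)

char_acc = {c: sum(v) / len(v) for c, v in char_count.items()}
overall_acc = sum(res_list) / len(res_list)
macro_acc = sum(char_acc.values()) / len(char_acc)

print(char_acc)      # {'A': 1.0, 'B': 0.333...}
print(overall_acc)   # 0.6   (micro average over examples)
print(macro_acc)     # 0.666... ("mean accuracy" over characters, as printed by the script)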
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_dir", default='out/', type=str, required=True, help="Dir of the trained Ner model.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) # Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=8, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") # n_gpu = torch.cuda.device_count() n_gpu = 1 # in case tensors end up on different GPUs else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processor = RelationProcessor() train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) print("num_train_optimization_steps", num_train_optimization_steps) n_classes = len(relation_list) # num of relations transitional_size = 100 # size of transitional layer model = relation_extracter(args.model_dir, transitional_size, n_classes) # use the trained NER model dir passed via --model_dir tokenizer = model.tokeniser max_seq_length = model.max_seq_length model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in ['bert'])], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, relation_list, max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in 
train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_relation_matrices = torch.tensor( [f.relation_matrix for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_relation_matrices) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, relarion_matrices = batch relarion_matrices = relarion_matrices.float() loss = model(input_ids, segment_ids, input_mask, label_ids, relarion_matrices)["loss"] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration # model_to_save = model.module if hasattr( # model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, 'ner_re.bin') torch.save(model.state_dict(), output_model_file) # Load a trained model and config that you have fine-tuned else: output_model_file = os.path.join(args.output_dir, 'ner_re.bin') model.load_state_dict(torch.load(output_model_file)) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, relation_list, max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_relation_matrices = torch.tensor( [f.relation_matrix for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_relation_matrices) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, 
eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} for input_ids, input_mask, segment_ids, label_ids, relation_matrices in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) relation_matrices = relation_matrices.float() relation_matrices = relation_matrices.to(device) with torch.no_grad(): output_dict = model(input_ids, segment_ids, input_mask, label_ids, relation_matrices) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", model.get_metrics()) writer.write(str(model.get_metrics()))
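# Added sketch: every fp16 branch in these training loops rescales the learning rate
# with `warmup_linear(global_step / num_train_optimization_steps, args.warmup_proportion)`,
# a helper that is not defined in this file (presumably imported from the
# pytorch-pretrained-BERT optimization utilities). A minimal sketch of that linear
# warmup-then-decay schedule, under that assumption:
def warmup_linear_sketch(x, warmup=0.002):
    # x: fraction of the total optimization steps completed so far
    if x < warmup:
        return x / warmup  # ramp the multiplier up linearly during warmup
    return 1.0 - x         # then decay it linearly towards zero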
def main(): local_rank = -1 parser = argparse.ArgumentParser() parser.add_argument("--config", "-c", type=str, required=True) args, _ = parser.parse_known_args() options = argconf.options_from_json("confs/options.json") config = argconf.config_from_json(args.config) args = edict(argconf.parse_args(options, config)) args.local_rank = local_rank args.on_memory = True if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.workspace) and os.listdir(args.workspace): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.workspace)) if not os.path.exists(args.workspace): os.makedirs(args.workspace) tokenizer = BertTokenizer.from_pretrained(args.model_file, do_lower_case=True) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.model_file) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() loss_fct = nn.CrossEntropyLoss(ignore_index=-1) for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch prediction_scores, _ = model(input_ids, segment_ids, input_mask, lm_label_ids) loss = loss_fct( prediction_scores.view(-1, model.module.config.vocab_size), lm_label_ids.view(-1)).mean() if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.workspace, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
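# Added sketch: the masked-LM loss above flattens prediction_scores to
# (batch * seq_len, vocab_size) and relies on CrossEntropyLoss(ignore_index=-1), so
# positions whose lm_label_id is -1 (tokens that were not masked) contribute nothing
# to the loss. Note that the loop reads the vocabulary size via
# model.module.config.vocab_size, which assumes the model is wrapped in
# DataParallel/DistributedDataParallel; an unwrapped single-GPU model exposes it as
# model.config.vocab_size. A self-contained illustration with made-up shapes:
import torch
import torch.nn as nn

_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
_scores = torch.randn(2, 8, 30522)                    # (batch, seq_len, vocab_size)
_lm_labels = torch.full((2, 8), -1, dtype=torch.long)
_lm_labels[0, 3] = 2023                               # only masked positions keep a real token id
_loss = _loss_fct(_scores.view(-1, 30522), _lm_labels.view(-1))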
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--task", default=None, type=str, required=True, help="Sentiment analysis or natural language inference? (SA or NLI)") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--trained_model_dir", default="", type=str, help= "Where is the fine-tuned (with the cloze-style LM objective) BERT model?" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run eval on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--freeze_bert', action='store_true', help="Whether to freeze BERT") parser.add_argument('--full_bert', action='store_true', help="Whether to use full BERT") parser.add_argument('--num_train_samples', type=int, default=-1, help="-1 for full train set, otherwise please specify") args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` or `do_test` must be True." ) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: logger.info( "WARNING: Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Prepare data processor mnli_processor = MnliProcessor() hans_processor = HansProcessor() sst_processor = Sst2Processor() if args.task == "SA": label_list = sst_processor.get_labels() elif args.task == "NLI": label_list = mnli_processor.get_labels() else: raise ValueError("") num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare training data train_examples = None num_train_optimization_steps = None if args.do_train: if args.task == "SA": train_examples = sst_processor.get_train_examples( args.data_dir, args.num_train_samples) elif args.task == "NLI": train_examples = mnli_processor.get_train_examples( args.data_dir, args.num_train_samples) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size) * args.num_train_epochs # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(-1)) if args.trained_model_dir: # load in fine-tuned (with cloze-style LM objective) model if os.path.exists(os.path.join(args.output_dir, WEIGHTS_NAME)): previous_state_dict = torch.load( os.path.join(args.output_dir, WEIGHTS_NAME)) else: from collections import OrderedDict previous_state_dict = OrderedDict() distant_state_dict = torch.load( os.path.join(args.trained_model_dir, WEIGHTS_NAME)) previous_state_dict.update( distant_state_dict ) # note that the final layers of previous model and distant model must have different attribute names! 
model = MyBertForSequenceClassification.from_pretrained( args.trained_model_dir, state_dict=previous_state_dict, num_labels=num_labels) else: model = MyBertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) if args.freeze_bert: # freeze BERT if needed frozen = ['bert'] elif args.full_bert: frozen = [] else: frozen = [ 'bert.embeddings.', 'bert.encoder.layer.0.', 'bert.encoder.layer.1.', 'bert.encoder.layer.2.', 'bert.encoder.layer.3.', 'bert.encoder.layer.4.', 'bert.encoder.layer.5.', 'bert.encoder.layer.6.', 'bert.encoder.layer.7.', ] # *** change here to filter out params we don't want to track *** no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if (not any(fr in n for fr in frozen)) and (not any(nd in n for nd in no_decay)) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if (not any(fr in n for fr in frozen)) and (any(nd in n for nd in no_decay)) ], 'weight_decay': 0.0 }] if args.fp16: raise ValueError("Not sure if FP16 precision works yet.") else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.do_train: global_step = 0 train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_id) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() # model.eval() # train in eval mode to avoid dropout for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch_loss = [] for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16: optimizer.backward(loss) else: loss.backward() if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 epoch_loss.append(loss.item()) logger.info(" epoch loss = %f", np.mean(epoch_loss)) if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) if args.do_test: if args.task == "SA": test_examples = sst_processor.get_dev_examples(args.data_dir) elif args.task == "NLI": test_examples = mnli_processor.get_dev_examples(args.data_dir) test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running final test *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in test_features], dtype=torch.long) all_guid = torch.tensor([f.guid for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_id, all_guid) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() test_loss, test_accuracy = 0, 0 nb_test_steps, nb_test_examples = 0, 0 wrong_list = [] for input_ids, input_mask, segment_ids, label_ids, guids in tqdm( test_dataloader, desc="Testing"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_test_correct, tmp_test_total = accuracy(logits, label_ids) assert tmp_test_total == 1 if tmp_test_correct == 0: wrong_list.append(guids[0].item()) test_loss += tmp_test_loss.mean().item() test_accuracy += tmp_test_correct nb_test_examples += tmp_test_total nb_test_steps += 1 test_loss = test_loss / nb_test_steps test_accuracy = test_accuracy / nb_test_examples result = {'test_loss': test_loss, 'test_accuracy': test_accuracy} output_test_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_file, "w") as writer: logger.info("***** Test results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
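# Added sketch: the test loop above unpacks `accuracy(logits, label_ids)` into
# (num_correct, num_examples) and asserts num_examples == 1 per batch so that a wrong
# prediction can be traced back to a single guid (which effectively requires
# eval_batch_size=1). `accuracy` itself is not shown in this file; a minimal numpy
# version consistent with that usage (an assumption, not necessarily the author's
# helper) is:
import numpy as np

def accuracy_sketch(logits, labels):
    preds = np.argmax(logits, axis=1)                 # predicted class per example
    return int((preds == labels).sum()), len(labels)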
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--debug', action="store_true", help="debug mode") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_swag_examples(os.path.join( args.data_dir, 'train.csv'), is_training=True) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=4) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if args.debug: break if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() # accumulate gradient optimizer.zero_grad() global_step += 1 if args.debug: break if args.debug: import sys sys.exit(1) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = BertForMultipleChoice.from_pretrained(args.bert_model, state_dict=model_state_dict, num_choices=4) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join( args.data_dir, 'val.csv'), is_training=True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" 
%s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
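# Added sketch: the SWAG script above assembles its tensors with
# select_field(features, 'input_ids' / 'input_mask' / 'segment_ids'), which is not
# shown here. Assuming each feature stores one tokenized entry per answer choice
# (as in the reference run_swag example), select_field gathers a single key across
# those choices, yielding a [num_examples, num_choices, seq_len] nested list:
def select_field_sketch(features, field):
    return [
        [choice[field] for choice in feature.choices_features]  # one entry per answer choice
        for feature in features                                 # one row per example
    ]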
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() # if args.server_ip and args.server_port: # # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script # import ptvsd # print("Waiting for debugger attach") # ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) # ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() #[0,1] num_labels = len(label_list) train_examples = None num_train_optimization_steps = None if args.do_train: # train_examples = processor.get_train_examples_wenpeng('/home/wyin3/Datasets/glue_data/RTE/train.tsv') train_examples, _, _ = processor.get_examples_SF_wenpeng('/home/wyin3/Datasets/MNLI-SNLI-SciTail-RTE-SICK/BBN.SF.train.zeroshot.v1.txt') num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) # model = BertForSequenceClassification.from_pretrained(args.bert_model, # cache_dir=cache_dir, # num_labels=num_labels) # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) pretrain_model_dir = '/home/wyin3/Datasets/fine_tune_Bert_stored/FineTuneOnMNLI' 
#FineTuneOnCombined'# FineTuneOnMNLI model = BertForSequenceClassification.from_pretrained(pretrain_model_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case) if args.fp16: model.half() model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # optimizer = optimizer_wenpeng.Adagrad(optimizer_grouped_parameters, lr=1e-3) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_acc = 0.0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode) eval_examples, eval_label_list, eval_coord_list = processor.get_examples_SF_wenpeng('/home/wyin3/Datasets/MNLI-SNLI-SciTail-RTE-SICK/BBN.SF.test.zeroshot.v1.txt') eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) # print('train all_label_ids:', all_label_ids) # exit(0) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) iter_co = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # print('train label_ids:', label_ids.view(-1)) # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), 
label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 iter_co+=1 if iter_co %100==0: ''' start evaluate on test set after this epoch ''' model.eval() logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) mean_f1, weighted_f1 = evaluate_SF_zeroshot(preds, eval_label_list, eval_coord_list, 5) # result = compute_metrics('F1', preds, all_label_ids.numpy()) loss = tr_loss/nb_tr_steps if args.do_train else None test_acc = mean_f1#result.get("f1") if test_acc > max_test_acc: max_test_acc = test_acc print('\ntest F1_match & F1_mismatch:', test_acc,weighted_f1, ' max_test_f1:', max_test_acc, '\n')
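# Added sketch: `evaluate_SF_zeroshot(preds, eval_label_list, eval_coord_list, 5)` is a
# project-specific metric whose implementation is not part of this file. As a rough,
# generic stand-in (not the author's implementation), macro-averaged and
# support-weighted F1 over flat prediction/label lists could be computed as:
from sklearn.metrics import f1_score

def f1_summary_sketch(gold_labels, pred_labels):
    mean_f1 = f1_score(gold_labels, pred_labels, average='macro')
    weighted_f1 = f1_score(gold_labels, pred_labels, average='weighted')
    return mean_f1, weighted_f1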
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=330, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=16, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } # '''. prepare data # args.stance_data_path = "/home/Vachel/github/pytorch-pretrained-BERT/data/stanceChinese" args.stance_data_path = "/home/Vachel/github/pytorch-pretrained-BERT/data/TD_stance" args.data_path = args.data_dir args.batch_size = args.train_batch_size tokenize = lambda x: [i for i in x][:300] text_field = data.Field(sequential=True, tokenize=tokenize, lower=True, batch_first=True) label_field = data.Field(sequential=False, use_vocab=False) train_stance_iter, dev_stance_iter, test_iter = mydatasets.stance_dataset( text_field, label_field, args) train_iter, dev_iter = mydatasets.ir_dataset(text_field, args) # ''' if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of do_train` or `do_eval` or `do_test` must be True." 
) # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] # ''' # label_list = processor.get_labels() # num_labels = len(label_list) label_list = [0, 1] num_labels = len(label_list) # ''' tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: # ''' # train_examples = processor.get_train_examples(args.data_dir) # num_train_optimization_steps = int( # len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs num_train_optimization_steps = int( # len(train_iter.dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs len(train_stance_iter.dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # ''' if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_stance_iter.dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) for _ in trange(int(args.num_train_epochs), desc="Epoch"): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch_stance in enumerate( tqdm(train_stance_iter, desc="Iteration")): # for step, (batch, batch_stance) in enumerate(tqdm(zip(train_iter, train_stance_iter), desc="Iteration")): # input_ids, input_mask, segment_ids, label_ids = to_bert_input_related(args, text_field, batch, label_list, tokenizer, output_mode) # input_ids_stance, input_mask_stance, segment_ids_stance, label_ids_stance = to_bert_input_stance(args, text_field, batch_stance, label_list, tokenizer, output_mode) # input_ids = torch.cat((input_ids, input_ids_stance), dim = 0) # input_mask = torch.cat((input_mask, input_mask_stance), dim = 0) # segment_ids = torch.cat((segment_ids, segment_ids_stance), dim = 0) # label_ids = torch.cat((label_ids, label_ids_stance), dim = 0) input_ids, input_mask, segment_ids, label_ids = to_bert_input_stance( args, text_field, batch_stance, label_list, tokenizer, output_mode) input_ids, input_mask, segment_ids, label_ids = input_ids.to( device), input_mask.to(device), segment_ids.to( device), label_ids.to(device) # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # eval logger.info("***** Running evaluation *****") model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] all_label_ids = [] for step, batch_stance in enumerate( tqdm(dev_stance_iter, desc="Evaluating")): # for step, (batch, batch_stance) in enumerate(tqdm(zip(dev_iter, dev_stance_iter), desc="Evaluating")): # input_ids, input_mask, segment_ids, label_ids = to_bert_input_related(args, text_field, batch, label_list, tokenizer, output_mode) # input_ids_stance, input_mask_stance, segment_ids_stance, label_ids_stance = to_bert_input_stance(args, text_field, batch_stance, label_list, tokenizer, output_mode) # input_ids = torch.cat((input_ids, input_ids_stance), dim = 0) # input_mask = torch.cat((input_mask, input_mask_stance), dim = 0) # segment_ids = torch.cat((segment_ids, segment_ids_stance), dim = 0) # label_ids = torch.cat((label_ids, label_ids_stance), dim = 0) input_ids, input_mask, segment_ids, label_ids = to_bert_input_stance( args, text_field, batch_stance, label_list, tokenizer, output_mode) all_label_ids += list(label_ids.numpy()) input_ids, input_mask, segment_ids, label_ids = input_ids.to( device), input_mask.to(device), segment_ids.to( device), label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] exp_x = np.exp(preds) softmax_x = exp_x / np.mat(np.sum(exp_x, axis=1)).T np.save("eval_logits.npy", softmax_x) np.save("eval_labels.npy", all_label_ids) if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: 
logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("***** Running testing *****") # Load a trained model and config that you have fine-tuned output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) if args.fp16: model.half() model.to(device) model.eval() for i, batch in enumerate(tqdm(test_iter, desc="Evaluating")): test_src_batch, test_tgt_batch = batch.query, batch.doc input_ids, input_mask, segment_ids = to_bert_input_test( args, text_field, test_src_batch, test_tgt_batch, label_list, tokenizer, output_mode) input_ids, input_mask, segment_ids = input_ids.to( device), input_mask.to(device), segment_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) if i == 0: # pred_poss = list(F.softmax(logits, dim = 1)[:, 1].detach().cpu().numpy()) pred_poss = list( F.softmax(logits, dim=1).detach().cpu().numpy()) else: # pred_poss += list(F.softmax(logits, dim = 1)[:, 1].detach().cpu().numpy()) pred_poss += list( F.softmax(logits, dim=1).detach().cpu().numpy()) pred_poss = np.array(pred_poss) np.save("preds_stance_1.npy", pred_poss) print(pred_poss.shape)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type = float, default = 0, help = "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** 
Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on file.__next__ # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--log_dir", default=None, type=str, required=True, help="The log dir. Should contain the .txt file (or other data file) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written." ) # Other parameters parser.add_argument( "--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() # log setting handler = logging.FileHandler(os.path.join(args.log_dir, "log.txt")) handler.setLevel(logging.DEBUG) formatter = logging.Formatter( '%(asctime)s - %(levelname)s - %(name)s - %(message)s') handler.setFormatter(formatter) logger.addHandler(handler) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { # bbn processor "bbn": BBNNerProcessor, } output_modes = { "bbn": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] if task_name == 'bbn': label_list = processor.get_labels(args.data_dir) else: label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() try: model.to(device) except Exception: logger.warning("toGPU failed, failed 
msg:" + traceback.format_exc()) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 # prepare Data # train_label_ids, dev_label_ids, test_label_ids = process_data(tokenizer, processor, args.data_dir, args.max_seq_length) if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = torch.load(os.path.join(args.data_dir, "train.pt")) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in range(int(args.num_train_epochs)): tr_loss = 0 last_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if abs(loss.item() - last_loss) <= 5e-10: break # if abs(loss.item() - last_loss) != 0: # print("iterate fine") # print("step: " + str(step)) # print(abs(loss.item() - last_loss)) last_loss = loss.item() # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForTokenClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: logger.info("preparing model") # model = BertForTokenClassification.from_pretrained( # args.bert_model, num_labels=num_labels) output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForTokenClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) print("Model's state_dict:") for param_tensor in model.state_dict(): print(param_tensor, "\t", model.state_dict()[param_tensor].size()) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): model.eval() # eval_examples = processor.get_dev_examples(args.data_dir) eval_examples = processor.get_dev_examples(args.data_dir) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_data = torch.load(os.path.join(args.data_dir, "dev.pt")) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] active_labels_dataset = [] i = 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits, active_loss = model(input_ids, segment_ids, input_mask, labels=None) active_labels = label_ids.view(-1)[active_loss] # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), # active_labels.view(-1)) tmp_eval_loss = 0 elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), active_labels.view(-1)) # eval_loss += tmp_eval_loss.mean().item() eval_loss += 0 
nb_eval_steps += 1 # if len(preds) == 0: # preds.append(logits.detach().cpu().numpy()) # else: # preds[0] = np.append(preds[0], # logits.detach().cpu().numpy(), # axis=0) logits = np.argmax(logits.detach().cpu().numpy(), axis=1) preds.append(logits) active_labels_dataset.append(active_labels) eval_loss = eval_loss / nb_eval_steps # preds = preds[0] preds_flat = [] labels_flat = [] for s in preds: for l in s: # l is label preds_flat.append(l) for s in active_labels_dataset: for l in s: labels_flat.append(l.detach().cpu().numpy()) preds_flat = np.array(preds_flat) labels_flat = np.array(labels_flat) for i in range(len(preds_flat)): if preds_flat[i] == 37: preds_flat[i] = 7 elif preds_flat[i] == 34: preds_flat[i] = 12 elif preds_flat[i] == 26: preds_flat[i] = 36 elif preds_flat[i] == 36: preds_flat[i] = 37 elif preds_flat[i] == 41: preds_flat[i] = 34 elif preds_flat[i] == 31: preds_flat[i] = 39 elif preds_flat[i] == 15: preds_flat[i] = 38 # label_map = dict() # for i in range(len(preds_flat)): # key = str(preds_flat[i]) + '-' + str(labels_flat[i]) # if key in label_map.keys(): # label_map[key] += 1 # else: # label_map[key] = 0 # for k in label_map.keys(): # if label_map[k] > 1000: # print(k, ":", label_map[k]) # if output_mode == "classification": # preds = np.argmax(preds, axis=2) # elif output_mode == "regression": # preds = np.squeeze(preds) result = compute_metrics(task_name, preds_flat, labels_flat) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". 
format(args.output_dir)) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)[0] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
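# The BBN evaluation above patches a handful of predicted label ids through a long
# if/elif chain. Since each element is compared against its original value exactly once,
# the chain is equivalent to a single dictionary lookup. A sketch of that alternative
# (hypothetical helper; the id pairs are copied from the chain above and assume numpy
# is imported as np, as elsewhere in this file):
BBN_PRED_ID_REMAP = {37: 7, 34: 12, 26: 36, 36: 37, 41: 34, 31: 39, 15: 38}

def remap_pred_ids(pred_ids, remap=BBN_PRED_ID_REMAP):
    """Replace each predicted id with its mapped value; ids not in the map are kept as-is."""
    return np.array([remap.get(int(p), int(p)) for p in pred_ids])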
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_dir", default='', type=str, required=True, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, required=True, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") # Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--local_debug", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate for hidden states.") parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate for attention probabilities.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 16-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A.") parser.add_argument('--max_len_b', type=int, default=0, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument( "--mask_prob_eos", default=0, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=20, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=0, type=int, help="Number of workers for the data loader.") parser.add_argument('--mask_source_words', action='store_true', help="Whether to mask source words for training") parser.add_argument('--skipgram_prb', type=float, default=0.0, help='prob of ngram mask') parser.add_argument('--skipgram_size', type=int, default=1, help='the max size of ngram mask') parser.add_argument('--mask_whole_word', action='store_true', help="Whether masking a whole word.") parser.add_argument('--do_l2r_training', action='store_true', help="Whether to do left to right training") parser.add_argument( '--has_sentence_oracle', action='store_true', help="Whether to have sentence level oracle for training. 
" "Only useful for summary generation") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." ) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") args = parser.parse_args() assert Path( args.model_recover_path).exists(), "--model_recover_path doesn't exist" args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs dist.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.local_rank == 0: dist.barrier() if args.do_train: print("Loading Train Dataset", args.data_dir) bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] file_oracle = None if args.has_sentence_oracle: file_oracle = os.path.join(args.data_dir, 'train.oracle') fn_src = os.path.join(args.data_dir, args.src_file if args.src_file else 'train.src') fn_tgt = os.path.join(args.data_dir, args.tgt_file if args.tgt_file else 'train.tgt') train_dataset = seq2seq_loader.Seq2SeqDataset( fn_src, fn_tgt, args.train_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline, corpus_preprocessors=corpus_preprocessors) train_dataset.initial() print(len(train_dataset.ex_list)) print(train_dataset.batch_size) # assert 1==0 if args.local_rank == -1: train_sampler = RandomSampler(train_dataset, replacement=False) _batch_size = args.train_batch_size else: train_sampler = DistributedSampler(train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) # c = 0 # for i_epoch in trange(0, int(args.num_train_epochs)+1, desc="Epoch", disable=args.local_rank not in (-1, 0)): # if args.local_rank != -1: # train_sampler.set_epoch(i_epoch) # iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)', # disable=args.local_rank not in (-1, 0)) # for step, batch in enumerate(iter_bar): # batch = [ # t.to(device) if t is not None else None for t in batch] # if args.has_sentence_oracle: # input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, sop_label, oracle_pos, oracle_weights, oracle_labels = batch # else: # input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, 
is_next, task_idx, sop_label = batch # oracle_pos, oracle_weights, oracle_labels = None, None, None # c += input_ids.shape[0] # # # print(input_ids) # # # # print(input_ids.shape) # # print(segment_ids) # # print(segment_ids.shape) # # print(is_next) # # print(task_idx) # # print(sop_label) # # print(task_idx.shape) # # for i in range(input_mask.shape[0]): # # print(input_mask[i]) # print(c) # print(train_dataset.c) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) # t_total = int(math.ceil(len(train_dataset.ex_list) / args.train_batch_size) t_total = int( len(train_dataloader) * args.num_train_epochs / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 num_sentlvl_labels = 2 if args.has_sentence_oracle else 0 relax_projection = 4 if args.relax_projection else 0 if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() if (recover_step is None) and (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb, local_debug=args.local_debug) global_step = 0 else: if recover_step: logger.info("***** Recover model: %d *****", recover_step) model_recover = torch.load(os.path.join( args.output_dir, "model.{0}.bin".format(recover_step)), map_location='cpu') # recover_step == number of epochs global_step = math.floor(recover_step * t_total / args.num_train_epochs) elif args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(args.model_recover_path, map_location='cpu') global_step = 0 model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb, local_debug=args.local_debug) if args.local_rank == 0: dist.barrier() if args.fp16: model.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() 
model.to(device) if args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("DistributedDataParallel") model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from pytorch_pretrained_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if recover_step: logger.info("***** Recover optimizer: %d *****", recover_step) optim_recover = torch.load(os.path.join( args.output_dir, "optim.{0}.bin".format(recover_step)), map_location='cpu') if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) model.train() if recover_step: start_epoch = recover_step + 1 else: start_epoch = 1 for i_epoch in trange(start_epoch, int(args.num_train_epochs) + 1, desc="Epoch", disable=args.local_rank not in (-1, 0)): if args.local_rank != -1: train_sampler.set_epoch(i_epoch) iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)', disable=args.local_rank not in (-1, 0)) for step, batch in enumerate(iter_bar): batch = [ t.to(device) if t is not None else None for t in batch ] if args.has_sentence_oracle: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, sop_label, oracle_pos, oracle_weights, oracle_labels = batch else: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, sop_label = batch print(sop_label) print(task_idx) oracle_pos, oracle_weights, oracle_labels = None, None, None # loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, # masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, # masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, # masked_labels_2=oracle_labels, mask_qkv=mask_qkv) loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, sop_label, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, 
masked_labels_2=oracle_labels, mask_qkv=mask_qkv) masked_lm_loss, next_sentence_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. # loss = loss.mean() masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() print('mask_lm_loss {}'.format(masked_lm_loss)) print('next_sentence_loss {}'.format(next_sentence_loss)) print('----------------------------------------------') loss = masked_lm_loss + next_sentence_loss # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model if (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) torch.save(model_to_save.state_dict(), output_model_file) output_optim_file = os.path.join( args.output_dir, "optim.{0}.bin".format(i_epoch)) torch.save(optimizer.state_dict(), output_optim_file) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache()
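# The recovery logic above calls _get_max_epoch_model(args.output_dir) but the helper
# is not defined in this file. A minimal sketch, assuming the checkpoint naming used by
# the save code above ("model.{epoch}.bin" / "optim.{epoch}.bin") and that os is already
# imported, as elsewhere in this file; it returns the largest epoch for which both a
# model and an optimizer checkpoint exist, or None:
import glob

def _get_max_epoch_model(output_dir):
    """Scan output_dir for paired model/optimizer checkpoints and return the newest epoch index."""
    model_epochs = {int(os.path.basename(fn).split('.')[1])
                    for fn in glob.glob(os.path.join(output_dir, "model.*.bin"))}
    optim_epochs = {int(os.path.basename(fn).split('.')[1])
                    for fn in glob.glob(os.path.join(output_dir, "optim.*.bin"))}
    common = model_epochs & optim_epochs
    return max(common) if common else None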
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. 
True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--version_2_with_negative', action='store_true', help='If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model, cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = 
list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForQuestionAnswering(config) model.load_state_dict(torch.load(output_model_file)) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, 
"nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) # Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--deepscale', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument("--model_file", type=str, default="0", help="Path to the Pretrained BERT Encoder File.") parser.add_argument('--random', default=False, action='store_true', help="Whether to fientune for random initialization") parser.add_argument('--focal', default=False, action='store_true', help="Whether to use Focal Loss for finetuning.") parser.add_argument('--gamma', type=float, default=0.5, help="Gamma parameter to be used in focal loss.") parser.add_argument('--deepspeed_sparse_attention', default=False, action='store_true', help='Use DeepSpeed sparse self attention.') parser.add_argument('--deepspeed_transformer_kernel', default=False, action='store_true', help='Use DeepSpeed transformer kernel to accelerate.') parser.add_argument( '--progressive_layer_drop', default=False, action='store_true', help="Whether to enable progressive layer dropping or not") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps args.seed = random.randint(1, 1000) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if (args.local_rank == -1 or torch.distributed.get_rank() == 0): # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError( # "Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.local_rank != -1: torch.distributed.barrier() task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) bert_base_model_config = { "vocab_size_or_config_json_file": 119547, "hidden_size": 768, "num_hidden_layers": 12, "num_attention_heads": 12, "intermediate_size": 3072, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 2, "initializer_range": 0.02 } if args.progressive_layer_drop: print("BertBaseConfigPreLnLayerDrop") from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig else: from nvidia.modelingpreln import BertForSequenceClassification, BertConfig bert_config = BertConfig(**bert_base_model_config) bert_config.vocab_size = len(tokenizer.vocab) # Padding for divisibility by 8 if bert_config.vocab_size % 8 != 0: bert_config.vocab_size += 8 - (bert_config.vocab_size % 8) model = BertForSequenceClassification(args, bert_config, num_labels=num_labels) if args.model_file != "0": logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}") # bert_state_dict = torch.load(args.model_file) # model.bert.load_state_dict(bert_state_dict) checkpoint_state_dict = torch.load(args.model_file, map_location=torch.device("cpu")) if 'module' in checkpoint_state_dict: logger.info('Loading DeepSpeed v2.0 style checkpoint') model.load_state_dict(checkpoint_state_dict['module'], strict=False) elif 'model_state_dict' in checkpoint_state_dict: model.load_state_dict(checkpoint_state_dict['model_state_dict'], strict=False) else: raise ValueError("Unable to find model state in checkpoint") logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}") if args.random: logger.info("USING RANDOM INITIALISATION FOR FINETUNING")
model.apply(model.init_bert_weights) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: if args.deepscale: print("Enabling DeepScale") from deepscale.distributed_apex import DistributedDataParallel as DDP else: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": if args.fp16: all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.half) else: all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = 
MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.deepscale and args.local_rank != -1: model.disable_need_reduction() if (step + 1) % args.gradient_accumulation_steps == 0: model.enable_need_reduction() if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * \ warmup_linear( global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() print(logits.type()) print(label_ids.type()) if task_name == "sts-b": tmp_eval_loss = loss_fct(logits.float().view(-1), label_ids.view(-1)) else: tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else 
None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}{}) already exists and is not empty.". format(args.output_dir, '-MM')) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
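# NOTE: FocalLoss is instantiated above as FocalLoss(class_num=num_labels, gamma=args.gamma)
# but is not defined in this file. Below is a minimal multi-class focal-loss sketch that
# matches that call signature (no per-class alpha weighting); the repo's actual
# implementation may differ.
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, class_num, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.class_num = class_num
        self.gamma = gamma

    def forward(self, logits, targets):
        # logits: (batch, class_num); targets: (batch,) with class indices.
        log_probs = F.log_softmax(logits, dim=-1)
        probs = log_probs.exp()
        targets = targets.view(-1, 1)
        pt = probs.gather(1, targets).squeeze(1)          # probability of the true class
        log_pt = log_probs.gather(1, targets).squeeze(1)
        # Standard focal loss: down-weight well-classified examples by (1 - p_t)^gamma.
        return (-(1.0 - pt) ** self.gamma * log_pt).mean()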
batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, predict_mask, label_ids = batch neg_log_likelihood = model.neg_log_likelihood(input_ids, segment_ids, input_mask, label_ids) if gradient_accumulation_steps > 1: neg_log_likelihood = neg_log_likelihood / gradient_accumulation_steps neg_log_likelihood.backward() tr_loss += neg_log_likelihood.item() if (step + 1) % gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = learning_rate0 * warmup_linear( global_step_th / total_train_steps, warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step_th += 1 #print("Epoch:{}-{}/{}, Negative loglikelihood: {} ".format(epoch, step, len(train_dataloader), neg_log_likelihood.item())) print('--------------------------------------------------------------') print("Epoch:{} completed, Total training's Loss: {}, Spend: {}m".format( epoch, tr_loss, (time.time() - train_start) / 60.0)) ''' valid_acc, valid_f1 = evaluate(model, dev_dataloader, batch_size, epoch, 'Valid_set') # Save a checkpoint
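# NOTE: the fragment above ends at the comment "# Save a checkpoint" without showing the
# save itself. Below is a minimal sketch of the kind of per-epoch checkpoint the other
# training scripts in this file write; the file name and the extra fields stored here are
# assumptions for illustration, not the original code.
import os
import torch

def save_checkpoint(model, optimizer, epoch, valid_f1, output_dir):
    # Unwrap DataParallel/DDP so only the underlying model weights are saved.
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save({'epoch': epoch,
                'valid_f1': valid_f1,
                'model_state_dict': model_to_save.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()},
               os.path.join(output_dir, 'checkpoint_epoch_{}.pt'.format(epoch)))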
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--bert_model", default='bert-base-multilingual-cased', type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--num_relevant_attention_head", default=3, type=int, help="Number of rel-attention heads that is added to attention layer") parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=3, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--visdom', action='store_true', help='Use visdom for loss visualization') parser.add_argument( '--check_saved_model', action='store_true', help= 'To check the output folder whether there is a saved model in the middle' ) parser.add_argument( '--last_final_epoch', type=int, default=-1, help= "Use when you want to additional training for preveious finished model" ) args = parser.parse_args() print(args) if args.visdom: import visdom viz = visdom.Visdom() os.makedirs(args.output_dir, exist_ok=True) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True) processor = DataProcessor() label_list = processor.get_labels() num_labels = len(label_list) tfidf_dict = pickle_load(os.path.join(args.data_dir, 'tfidf.pkl')) num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.data_dir) train_examples = processor.get_train_examples(args.data_dir) train_dataset = LazyDataset(train_examples, args.max_seq_length, tokenizer, tfidf_dict) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: train_sampler = DistributedSampler(train_dataset) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model loaded_epoch = -1 saved_model_path = -1 if args.last_final_epoch != -1: last_model = os.path.join(args.output_dir, WEIGHTS_NAME) if os.path.exists(last_model): saved_model_path = last_model loaded_epoch = args.last_final_epoch - 1 elif args.check_saved_model: for epoch in range(int(args.num_train_epochs)): tmp = os.path.join(args.output_dir, (f"weight_on_ep{epoch}_" + WEIGHTS_NAME)) if os.path.exists(tmp): saved_model_path = tmp loaded_epoch = epoch if saved_model_path != -1: logger.info(f"Loading on saved model {saved_model_path}") config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertForIRConfig(config_file) logger.info("Model config {}".format(config)) model = BertForIRForPreTraining(config) model.load_state_dict(torch.load(saved_model_path)) else: loaded_epoch = -1 model = BertForIRForPreTraining.from_pretrained( args.bert_model, num_relevant_attention_head=args.num_relevant_attention_head) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: logger.info(f"Used DataParallel to support Multi gpu, n_gpu: {n_gpu} ") model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.visdom: vis_title = f'Baseline on {len(train_dataset)} dataset' vis_legend = ['LM Loss', 'Click Loss', 'Total Loss'] iter_plot = create_vis_plot(viz, 'Iteration', 'Loss', vis_title, vis_legend) epoch_plot = create_vis_plot(viz, 'Epoch', 'Loss', vis_title, vis_legend) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) save_loss = [] save_epoch_loss = [] save_step = int(len(train_dataloader) // 5) model.train() for epoch in trange((loaded_epoch + 1), int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 tr_loss_ml = 0 tr_loss_click = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_weigths, input_mask, segment_ids, lm_label_ids, label_ids = batch loss, loss_ml, loss_click = model(input_ids, input_weigths, segment_ids, input_mask, lm_label_ids, label_ids) # print(loss.size()) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
loss_ml = loss_ml.mean() loss_click = loss_click.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss_ml = loss_ml / args.gradient_accumulation_steps loss_click = loss_click / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() tr_loss_ml += loss_ml.item() tr_loss_click += loss_click.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 save_loss.append( [loss_ml.item(), loss_click.item(), loss.item()]) if global_step != 0 and global_step % save_step == 0: logger.info(f'Saving state, iter: {global_step}') model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self model_name = f"weight_on_{global_step}_" + WEIGHTS_NAME output_model_file = os.path.join(args.output_dir, model_name) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) print("Loss at ", global_step, loss_ml.item(), loss_click.item(), loss.item()) tmp = np.array(save_loss) np.save( os.path.join(args.output_dir, f"save_loss_step{global_step}.npy"), tmp) if args.visdom: update_vis_plot(viz, global_step, loss_ml.item(), loss_click.item(), iter_plot, epoch_plot, 'append') save_epoch_loss.append([tr_loss_ml, tr_loss_click, tr_loss]) if epoch != (int(args.num_train_epochs) - 1): logger.info(f'Saving state, epoch: {epoch}') model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self model_name = f"weight_on_ep{epoch}_" + WEIGHTS_NAME output_model_file = os.path.join(args.output_dir, model_name) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) tmp1 = np.array(save_loss) tmp2 = np.array(save_epoch_loss) np.save( os.path.join(args.output_dir, f"save_loss_step{global_step}.npy"), tmp1) np.save( os.path.join(args.output_dir, f"save_epoch_loss_ep{epoch}.npy"), tmp2) print("Loss at epoch", epoch, tr_loss_ml, tr_loss_click, tr_loss) if args.visdom: update_vis_plot(viz, epoch, tr_loss_ml, tr_loss_click, epoch_plot, None, 'append', len(train_dataset) // args.train_batch_size) # Save a trained model logger.info("** ** * Saving proposed model ** ** * ") save_loss = np.array(save_loss) save_epoch_loss = np.array(save_epoch_loss) np.save(os.path.join(args.output_dir, "save_loss.npy"), save_loss) np.save(os.path.join(args.output_dir, "save_epoch_loss.npy"), save_epoch_loss) model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string())
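# NOTE: pickle_load() is used above to read the cached tf-idf dictionary
# (tfidf_dict = pickle_load(os.path.join(args.data_dir, 'tfidf.pkl'))) but is not
# defined in this file. A minimal sketch of the assumed helper:
import pickle

def pickle_load(path):
    with open(path, "rb") as f:
        return pickle.load(f)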
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default="glue_data/STS-B", type=str, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default="sts", type=str, help="The name of the task to train.") parser.add_argument("--output_dir", default="sts", type=str, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument("--tagger_path", default=None, type=str, help="tagger_path for predictions if needing real-time tagging. Default: None, by loading pre-tagged data") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "sts": STSProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_predict: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() #num_labels = num_labels_task[task_name] label_list = None # num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if args.tagger_path != None: srl_predictor = SRLPredictor(args.tagger_path) else: srl_predictor = None train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() train_features = None if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer,srl_predictor=srl_predictor ) #TagTokenizer.make_tag_vocab("tag_vocab", tag_vocab) tag_tokenizer = TagTokenizer("tag_vocab") vocab_size = len(tag_tokenizer.ids_to_tags) print("tokenizer vocab size: ", str(vocab_size)) tag_config = TagConfig(tag_vocab_size=vocab_size, hidden_size=10, layer_num=1, output_dim=10, dropout_prob=0.1, num_aspect=3) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)) model = 
BertForSequenceScoreTag.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),tag_config=tag_config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 best_epoch = 0 best_result = 0.0 if args.do_train: train_features = transform_tag_features(train_features, tag_tokenizer, args.max_seq_length) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) all_start_end_idx = torch.tensor([f.orig_to_token_split_idx for f in train_features], dtype=torch.long) all_input_tag_ids = torch.tensor([f.input_tag_ids for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_end_idx, all_input_tag_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_dataloader = None if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer,srl_predictor=srl_predictor ) eval_features = transform_tag_features(eval_features, tag_tokenizer, args.max_seq_length) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], 
dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) all_start_end_idx = torch.tensor([f.orig_to_token_split_idx for f in eval_features], dtype=torch.long) all_input_tag_ids = torch.tensor([f.input_tag_ids for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_end_idx, all_input_tag_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) for epoch in trange(int(args.num_train_epochs), desc="Epoch"): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, start_end_idx, input_tag_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, start_end_idx, input_tag_ids, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, str(epoch)+"_pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): model_state_dict = torch.load(output_model_file) predict_model = BertForSequenceScoreTag.from_pretrained(args.bert_model, state_dict=model_state_dict,tag_config=tag_config) predict_model.to(device) predict_model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 total_pred, total_labels = [],[] total_TP, total_FP, total_FN, total_TN = 0, 0, 0, 0 for input_ids, input_mask, segment_ids, start_end_idx, input_tag_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) start_end_idx = start_end_idx.to(device) input_tag_ids = input_tag_ids.to(device) with torch.no_grad(): tmp_eval_loss = predict_model(input_ids, segment_ids, input_mask, start_end_idx, input_tag_ids, label_ids) logits = predict_model(input_ids, segment_ids, input_mask, start_end_idx, input_tag_ids, None) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() eval_loss += tmp_eval_loss.mean().item() total_pred.extend(logits.squeeze().tolist()) total_labels.extend(label_ids.squeeze().tolist()) nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 del predict_model eval_loss = eval_loss / nb_eval_steps spear = spearmanr(total_pred, total_labels) pear = pearsonr(total_pred, total_labels) mse = mean_squared_error(total_pred, total_labels) loss = tr_loss / nb_tr_steps if 
args.do_train else None result = {'eval_loss': eval_loss, 'global_step': global_step, 'spearson':spear, 'pearson':pear, 'mse':mse, 'loss': loss} if pear[0] > best_result: best_epoch = epoch best_result = pear[0] output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info("Epoch: %s, %s = %s", str(epoch), key, str(result[key])) writer.write("Epoch: %s, %s = %s\n" % (str(epoch), key, str(result[key]))) logger.info("best epoch: %s, result: %s", str(best_epoch), str(best_result)) if args.do_predict: eval_examples = processor.get_test_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer,srl_predictor=srl_predictor ) eval_features = transform_tag_features(eval_features, tag_tokenizer, args.max_seq_length) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_start_end_idx = torch.tensor([f.orig_to_token_split_idx for f in eval_features], dtype=torch.long) all_input_tag_ids = torch.tensor([f.input_tag_ids for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_end_idx, all_input_tag_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) output_model_file = os.path.join(args.output_dir, str(best_epoch)+ "_pytorch_model.bin") model_state_dict = torch.load(output_model_file) predict_model = BertForSequenceScoreTag.from_pretrained(args.bert_model, state_dict=model_state_dict,tag_config=tag_config) predict_model.to(device) predict_model.eval() predictions = [] for input_ids, input_mask, segment_ids, start_end_idx, input_tag_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) start_end_idx = start_end_idx.to(device) input_tag_ids = input_tag_ids.to(device) with torch.no_grad(): logits = predict_model(input_ids, segment_ids, input_mask, start_end_idx, input_tag_ids, None) logits = logits.detach().cpu().numpy() for (i, prediction) in enumerate(logits): predict_label = prediction.squeeze() predictions.append(predict_label) output_test_file = os.path.join(args.output_dir, "pred_results.txt") index = 0 with open(output_test_file, "a") as writer: writer.write("index" + "\t" + "prediction" + "\n") for pred in predictions: writer.write(str(index) + "\t" + str(pred) + "\n") index += 1
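# NOTE: a small usage sketch of the correlation metrics reported above, assuming they are
# the usual scipy.stats and sklearn.metrics functions imported elsewhere in this file.
# Both scipy functions return a (statistic, p-value) pair, which is why the code above
# keeps pear[0] as the score used to track the best epoch.
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error

_preds = [2.4, 3.1, 0.7, 4.8]
_golds = [2.5, 3.0, 1.0, 5.0]
print("pearson:", pearsonr(_preds, _golds)[0])
print("spearman:", spearmanr(_preds, _golds)[0])
print("mse:", mean_squared_error(_golds, _preds))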
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--trained_model_dir", default="", type=str, help= "Where is the fine-tuned (with the cloze-style LM objective) BERT model?" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run eval on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--freeze_bert', action='store_true', help="Whether to freeze BERT") parser.add_argument('--save_all_epochs', action='store_true', help="Whether to save model in each epoch") parser.add_argument('--coarse_tagset', action='store_true', help="Whether to map predictions onto the coarse tagset for evaluation") parser.add_argument('--supervised_training', action='store_true', help="Only use this for supervised top-line model") parser.add_argument('--partially_freeze_bert', type=int, default=-1, help="number of bottom layers frozen") parser.add_argument('--use_crf', action='store_true', help="whether to use CRF") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` or `do_test` must be True."
) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) print("WARNING: Output directory already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processor = DataProcessor() label_list = processor.get_labels(args.data_dir) num_labels = len(label_list) coarse_map = processor.get_coarse_map(args.data_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: if args.supervised_training: train_examples = processor.get_sep_PPCEME_train_examples( args.data_dir) else: train_examples = processor.get_PTB_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) if args.trained_model_dir: # load in fine-tuned (with cloze-style LM objective) model if os.path.exists(os.path.join(args.output_dir, WEIGHTS_NAME)): previous_state_dict = torch.load( os.path.join(args.output_dir, WEIGHTS_NAME)) else: from collections import OrderedDict previous_state_dict = OrderedDict() distant_state_dict = torch.load( os.path.join(args.trained_model_dir, WEIGHTS_NAME)) previous_state_dict.update( distant_state_dict ) # note that the final layers of previous model and distant model must have different attribute names! if args.use_crf: model = MyBertPlusCRFForTokenClassification.from_pretrained( args.trained_model_dir, state_dict=previous_state_dict, num_labels=num_labels, device=device) else: model = MyBertForTokenClassification.from_pretrained( args.trained_model_dir, state_dict=previous_state_dict, num_labels=num_labels) else: if args.use_crf: model = MyBertPlusCRFForTokenClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels, device=device) else: model = MyBertForTokenClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) if args.freeze_bert: # freeze BERT if needed frozen = ['bert'] elif args.partially_freeze_bert != -1: if args.partially_freeze_bert not in [3, 6, 9]: raise ValueError("can only freeze 3, 6, or 9 layers") elif args.partially_freeze_bert == 3: frozen = [ 'bert.embeddings.', 'bert.encoder.layer.0.', 'bert.encoder.layer.1.', 'bert.encoder.layer.2.', ] elif args.partially_freeze_bert == 6: frozen = [ 'bert.embeddings.', 'bert.encoder.layer.0.', 'bert.encoder.layer.1.', 'bert.encoder.layer.2.', 'bert.encoder.layer.3.', 'bert.encoder.layer.4.', 'bert.encoder.layer.5.', ] elif args.partially_freeze_bert == 9: frozen = [ 'bert.embeddings.', 'bert.encoder.layer.0.', 'bert.encoder.layer.1.', 'bert.encoder.layer.2.', 'bert.encoder.layer.3.', 'bert.encoder.layer.4.', 'bert.encoder.layer.5.', 'bert.encoder.layer.6.', 'bert.encoder.layer.7.', 'bert.encoder.layer.8.', ] else: frozen = [] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if (not any(fr in n for fr in frozen)) and (not any(nd in n for nd in no_decay)) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if (not any(fr in n for fr in frozen)) and (any(nd in n for nd in no_decay)) ], 'weight_decay': 0.0 }] param_size = 0 for d in optimizer_grouped_parameters: for p in d['params']: tmp_p = p.clone().detach() param_size += torch.numel(tmp_p) logger.info(" Parameter size = %d", param_size) if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.long) all_label_mask = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_label_mask) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() epoch_index = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, label_mask = batch loss = model(input_ids, segment_ids, input_mask, label_ids, label_mask) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if (args.local_rank == -1 or torch.distributed.get_rank() == 0) and args.save_all_epochs: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME + ".epoch" + str(epoch_index)) torch.save(model_to_save.state_dict(), output_model_file) epoch_index += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_PTB_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_ids for f in eval_features], dtype=torch.long) all_label_mask = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_label_mask) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids, label_mask in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) label_mask = label_mask.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids, label_mask) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() label_mask = label_mask.to('cpu').numpy() if args.coarse_tagset: tmp_eval_correct, tmp_eval_total = accuracy( logits, label_ids, label_mask, coarse_map=coarse_map) else: tmp_eval_correct, tmp_eval_total = accuracy( logits, label_ids, label_mask) 
eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_correct nb_eval_examples += tmp_eval_total nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples # micro average result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = processor.get_sep_PPCEME_test_examples(args.data_dir) test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running final test *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_ids for f in test_features], dtype=torch.long) all_label_mask = torch.tensor([f.label_mask for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_label_mask) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() test_loss, test_accuracy = 0, 0 nb_test_steps, nb_test_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids, label_mask in tqdm( test_dataloader, desc="Testing"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) label_mask = label_mask.to(device) with torch.no_grad(): tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids, label_mask) logits = model(input_ids, segment_ids, input_mask, label_mask=label_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() label_mask = label_mask.to('cpu').numpy() if args.coarse_tagset: tmp_test_correct, tmp_test_total = accuracy( logits, label_ids, label_mask, coarse_map=coarse_map) else: tmp_test_correct, tmp_test_total = accuracy( logits, label_ids, label_mask) test_loss += tmp_test_loss.mean().item() test_accuracy += tmp_test_correct nb_test_examples += tmp_test_total nb_test_steps += 1 test_loss = test_loss / nb_test_steps test_accuracy = test_accuracy / nb_test_examples # micro average result = {'test_loss': test_loss, 'test_accuracy': test_accuracy} output_test_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_file, "w") as writer: logger.info("***** Test results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
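# Hedged sketch: the `accuracy(...)` helper called in the eval/test loops above
# is defined elsewhere in the repo. The version below is a hypothetical
# reconstruction of masked token-level accuracy; it assumes `coarse_map` maps
# fine label ids to coarse label ids (as produced by processor.get_coarse_map).
import numpy as np

def accuracy(logits, label_ids, label_mask, coarse_map=None):
    """Return (num_correct, num_counted) over positions where label_mask == 1.

    logits: (batch, seq_len, num_labels) array; label_ids/label_mask: (batch, seq_len) arrays.
    """
    preds = np.argmax(logits, axis=-1)
    mask = label_mask.astype(bool)
    preds = preds[mask]
    golds = label_ids[mask]
    if coarse_map is not None:
        to_coarse = np.vectorize(lambda i: coarse_map[i])
        preds = to_coarse(preds)
        golds = to_coarse(golds)
    correct = int(np.sum(preds == golds))
    total = int(golds.size)
    return correct, total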
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--use_pretrained", action='store_true', help="Whether to use another task pretrained model.") parser.add_argument( "--other_task_pretrained", default=None, type=str, help="The directory where the other task model resides") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, } num_labels_task = { "cola": 2, "sst-2": 2, "mnli": 3, "mrpc": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.other_task_pretrained, cache_dir=None, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except 
ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: if not args.use_pretrained: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) else: model = BertForSequenceClassification.from_pretrained( args.other_task_pretrained, num_labels=num_labels) print('Using model from: ' + str(args.other_task_pretrained)) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy, eval_f1 = 0, 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) tmp_eval_f1 = calculate_f1(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy eval_f1 += tmp_eval_f1 nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples 
eval_f1_by_steps = eval_f1 / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'eval_f1_steps': eval_f1_by_steps, 'global_step': global_step, 'loss': loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") print("Saving results to: " + str(output_eval_file)) with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
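# Hedged sketch: hypothetical versions of the per-batch `accuracy(...)` and
# `calculate_f1(...)` helpers used in the evaluation loop above. Because the loop
# divides summed accuracy by the number of examples and summed F1 by the number
# of batches, accuracy() is assumed to return a correct-count and calculate_f1()
# a per-batch score; the "macro" averaging for F1 is an assumption.
import numpy as np
from sklearn.metrics import f1_score

def accuracy(out, labels):
    preds = np.argmax(out, axis=1)
    return np.sum(preds == labels)  # number of correct predictions in the batch

def calculate_f1(out, labels):
    preds = np.argmax(out, axis=1)
    return f1_score(labels, preds, average="macro")  # per-batch F1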
def main(args): device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) logger.info(args) task_name = args.task_name.lower() if task_name not in PROCESSORS: raise ValueError("Task not found: %s" % (task_name)) processor = PROCESSORS[task_name]() label_list = processor.get_labels() id2label = {i: label for i, label in enumerate(label_list)} num_labels = len(label_list) eval_metric = EVAL_METRICS[task_name] tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) if args.do_train or (not args.eval_test): if task_name == "mnli": eval_examples = processor.get_dev_examples(args.data_dir, eval_set=args.eval_set) else: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, OUTPUT_MODES[task_name]) logger.info("***** Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if OUTPUT_MODES[task_name] == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif OUTPUT_MODES[task_name] == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) if args.fp16: all_label_ids = all_label_ids.half() eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) eval_label_ids = all_label_ids if args.do_train: train_examples = processor.get_train_examples(args.data_dir) train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, OUTPUT_MODES[task_name]) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if OUTPUT_MODES[task_name] == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif OUTPUT_MODES[task_name] == "regression": all_label_ids = torch.tensor([f.label_id 
for f in train_features], dtype=torch.float) if args.fp16: all_label_ids = all_label_ids.half() train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size, drop_last=True) train_batches = [batch for batch in train_dataloader] eval_step = max(1, len(train_batches) // args.eval_per_epoch) num_train_optimization_steps = \ len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs logger.info("***** Training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_result = None lrs = [args.learning_rate] if args.learning_rate else \ [1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5] for lr in lrs: cache_dir = args.cache_dir if args.cache_dir else \ PYTORCH_PRETRAINED_BERT_CACHE model = BertForSequenceClassification.from_pretrained( args.model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex" "to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 nb_tr_examples = 0 tr_loss = 0 start_time = time.time() for epoch in range(int(args.num_train_epochs)): model.train() logger.info("Start epoch #{} (lr = {})...".format(epoch, lr)) if args.train_mode == 'random' or args.train_mode == 'random_sorted': random.shuffle(train_batches) for step, batch in enumerate(train_batches): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, segment_ids, input_mask, labels=None) if OUTPUT_MODES[task_name] == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif OUTPUT_MODES[task_name] == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = lr * \ warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % eval_step == 0: logger.info( 
'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' .format(epoch, step + 1, len(train_dataloader), time.time() - start_time, tr_loss / nb_tr_steps)) save_model = False if args.do_eval: preds, result = evaluate(task_name, model, device, eval_dataloader, eval_label_ids, num_labels) model.train() result['global_step'] = global_step result['epoch'] = epoch result['learning_rate'] = lr result['batch_size'] = args.train_batch_size logger.info("First 20 predictions:") for pred, label in zip( preds[:20], eval_label_ids.numpy()[:20]): if OUTPUT_MODES[task_name] == 'classification': sign = u'\u2713' if pred == label else u'\u2718' logger.info("pred = %s, label = %s %s" % (id2label[pred], id2label[label], sign)) else: logger.info("pred = %.4f, label = %.4f" % (pred, label)) if (best_result is None) or (result[eval_metric] > best_result[eval_metric]): best_result = result save_model = True logger.info( "!!! Best dev %s (lr=%s, epoch=%d): %.2f" % (eval_metric, str(lr), epoch, result[eval_metric] * 100.0)) else: save_model = True if save_model: model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file( output_config_file) tokenizer.save_vocabulary(args.output_dir) if best_result: output_eval_file = os.path.join( args.output_dir, EVAL_FILE) with open(output_eval_file, "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_eval: if args.eval_test: if task_name == "mnli": eval_examples = processor.get_test_examples( args.data_dir, eval_set=args.eval_set) else: eval_examples = processor.get_test_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, OUTPUT_MODES[task_name]) logger.info("***** Test *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) eval_label_ids = None model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) preds, result = evaluate(task_name, model, device, eval_dataloader, eval_label_ids, num_labels) pred_file = os.path.join(args.output_dir, PRED_FILE) with open(pred_file, "w") as f_out: f_out.write("index\tprediction\n") for i, pred in enumerate(preds): if OUTPUT_MODES[task_name] == 'classification': f_out.write("%d\t%s\n" % (i, id2label[pred])) else: f_out.write("%d\t%.6f\n" % (i, pred)) output_eval_file = os.path.join(args.output_dir, TEST_FILE) with open(output_eval_file, "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key])))
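# Hedged sketch: every fp16 branch in these scripts rescales the learning rate
# with `warmup_linear(...)`, which is not defined in this excerpt. In
# pytorch-pretrained-bert the schedule warms up linearly to the peak rate and
# then decays linearly to zero; a minimal sketch of that shape:
def warmup_linear(x, warmup=0.002):
    """x = global_step / num_train_optimization_steps; warmup = warmup proportion."""
    if x < warmup:
        return x / warmup
    return 1.0 - x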
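# Hedged sketch: the `evaluate(...)` helper used above returns (predictions,
# result dict) but is defined elsewhere. This is a hypothetical reconstruction
# for the classification case only; the "accuracy" key stands in for the
# task-specific EVAL_METRICS entry used for model selection.
import numpy as np
import torch

def evaluate(task_name, model, device, eval_dataloader, eval_label_ids, num_labels):
    model.eval()
    all_logits = []
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids = batch[:3]  # label tensor, if present, is ignored here
        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, labels=None)
        all_logits.append(logits.detach().cpu().numpy())
    all_logits = np.concatenate(all_logits, axis=0)
    preds = np.argmax(all_logits, axis=1)
    result = {}
    if eval_label_ids is not None:  # None when scoring the unlabeled test set
        result["accuracy"] = float(np.mean(preds == eval_label_ids.numpy()))
    return preds, result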