def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--do_explain', action='store_true', help="explain outputs with decomposition") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, 'yelp': YelpProcesser, 'tacred': TacredProcessor } num_labels_task = { "cola": 2, "sst-2": 2, "mnli": 3, "mrpc": 2, 'yelp': 2, 'tacred': 42 } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: #model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 output_eval_file = os.path.join(args.output_dir, "eval_results_details.txt") f = open(output_eval_file, 'w') all_pred, all_truth = [], [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) #print(logits.cpu().numpy().tolist()) if args.do_explain: predict_and_explain_wrapper_unbatched( model, input_ids, segment_ids, input_mask, label_ids, tokenizer) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) pred = np.argmax(logits, -1) all_pred.extend(pred.reshape(-1).tolist()) all_truth.extend(label_ids.reshape(-1).tolist()) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 for i in range(input_ids.size(0)): logit = logits[i, 1] - logits[i, 0] logit = logit.item() text = ' '.join( tokenizer.convert_ids_to_tokens( input_ids[i].cpu().numpy())) f.write('%f\t%s\n' % (logit, text)) eval_loss = eval_loss / nb_eval_steps if task_name != 'tacred': eval_accuracy = eval_accuracy / nb_eval_examples metric = 'accuracy' else: _, _, eval_accuracy = tacred_f1_score(all_truth, all_pred) metric = 'f1' loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_%s' % metric: eval_accuracy, 'global_step': global_step, 'loss': loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def distil_embeddings(config_file: str, train_file: str, val_file: str, train_sample_rate: float, val_sample_rate: float, input_model_path: str, output_model_path: str, random_seed: int, neptune_project: str, saved_embeddings: str): seed_everything(random_seed) train_file = get_true_file(train_file) val_file = get_true_file(val_file) assert train_file.endswith(".jsonl") assert val_file.endswith(".jsonl") config = json.loads(jsonnet_evaluate_file(config_file)) print("Fetching data...") train_records = [ r for r in parse_tg_jsonl(train_file) if random.random() <= train_sample_rate ] val_records = [ r for r in parse_tg_jsonl(val_file) if random.random() <= val_sample_rate ] tokenizer = AutoTokenizer.from_pretrained(input_model_path) max_tokens_count = config.get("max_tokens_count", 196) if not saved_embeddings or not os.path.isfile(saved_embeddings): print("Loading teacher model...") input_model = AutoModel.from_pretrained(input_model_path) print("Saving embeddings...") url2text = { r["url"]: r["text"] for r in itertools.chain(train_records, val_records) } urls = [] embeddings = [] batch_urls = [] batch_texts = [] batch_size = 8 for url, text in tqdm(url2text.items()): batch_urls.append(url) batch_texts.append(text) if len(batch_urls) == batch_size: urls.extend(batch_urls) batch_embeddings = calc_batch_embeddings( batch_texts, tokenizer, input_model, max_tokens_count) for embedding in batch_embeddings: embeddings.append(embedding) batch_urls = [] batch_texts = [] if batch_urls: urls.extend(batch_urls) batch_embeddings = calc_batch_embeddings(batch_texts, tokenizer, input_model, max_tokens_count) for embedding in batch_embeddings: embeddings.append(embedding) embeddings = torch.tensor(embeddings) data = {"urls": urls, "embeddings": embeddings} torch.save(data, saved_embeddings) else: print("Loading embeddings...") data = torch.load(saved_embeddings) url2num = {url: num for num, url in enumerate(data["urls"])} num2embedding = data["embeddings"] batch_size = config.get("batch_size", 32) num_workers = config.get("num_workers", 5) train_dataset = EmbeddingsAsTargetDataset(train_records, url2num, num2embedding, tokenizer, max_tokens_count) train_sampler = RandomSampler(train_dataset) train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, num_workers=num_workers) val_dataset = EmbeddingsAsTargetDataset(val_records, url2num, num2embedding, tokenizer, max_tokens_count) val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers) patience = config.get("patience", 4) epochs = config.get("epochs", 5) gradient_clip_val = config.get("gradient_clip_val", 1.0) logger = False neptune_api_token = os.getenv("NEPTUNE_API_TOKEN") if neptune_project and neptune_api_token: params = copy.copy(config) params["train_sample_rate"] = train_sample_rate params["val_sample_rate"] = val_sample_rate params["train_file"] = train_file params["val_file"] = val_file logger = NeptuneLogger( api_key=neptune_api_token, project_name=neptune_project, experiment_name="Distil embeddings", tags=["training", "pytorch-lightning", "distil"], params=params) lightning_model = DistilEmbeddingBertLightning(config) early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.0, patience=patience, verbose=True, mode="min") trainer = Trainer(gpus=0, checkpoint_callback=False, accumulate_grad_batches=1, max_epochs=epochs, callbacks=[early_stop_callback], val_check_interval=1.0, gradient_clip_val=gradient_clip_val, deterministic=True, logger=logger) trainer.fit(lightning_model, train_loader, val_loader)
def main(): parser = argparse.ArgumentParser() ## Required parameters ############### parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--pretrain_model", default='bert-case-uncased', type=str, required=True, help="Pre-trained model") parser.add_argument("--num_labels_task", default=None, type=int, required=True, help="num_labels_task") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--task", default=None, type=int, required=True, help="Choose Task") ############### args = parser.parse_args() #print(args.do_train, args.do_eval) #exit() processors = Processor_1 num_labels = args.num_labels_task if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") ''' if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) ''' os.makedirs(args.output_dir, exist_ok=True) tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model) train_examples = None num_train_steps = None aspect_list = None sentiment_list = None processor = processors() num_labels = num_labels #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir) filenames = os.listdir(args.output_dir) filenames = [x for x in filenames if "pytorch_model.bin_" in x] print(filenames) file_mark = [] for x in filenames: file_mark.append([x, True]) #file_mark.append([x, False]) #### #### test_examples, aspect_list, sentiment_list = processor.get_test_examples( args.data_dir) if args.task == 1: num_labels = len(aspect_list) elif args.task == 2: num_labels = len(sentiment_list) else: print("What's task?") exit() dev = convert_examples_to_features(test_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task) eval_examples = test_examples ### for x, mark in file_mark: print(x, mark) output_model_file = os.path.join(args.output_dir, x) print(output_model_file) #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True) model = RobertaForMaskedLMDomainTask.from_pretrained( args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task) model.load_state_dict(torch.load(output_model_file), strict=False) #strict False: ignore non-matching keys #param_optimizer = [para[0] for para in model.named_parameters()] #param_optimizer = [para for para in model.named_parameters()][-2] #print(param_optimizer) model.to(device) if mark: eval_features = dev else: eval_features = test logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_attention_mask = torch.tensor( [f.attention_mask for f in eval_features], dtype=torch.long) if args.task == 1: print("Excuting the task 1") elif args.task == 2: all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) else: print("Wrong here2") all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) if args.task == 1: eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids) elif args.task == 2: eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids) else: print("Wrong here1") if args.local_rank == -1: eval_sampler = RandomSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) if mark: output_eval_file = os.path.join( args.output_dir, "eval_results_{}.txt".format(x.split("_")[-1])) output_file_pred = os.path.join( args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1])) output_file_glod = os.path.join( args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1])) else: output_eval_file = os.path.join( args.output_dir, "test_results_{}.txt".format(x.split("_")[-1])) output_file_pred = os.path.join( args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1])) output_file_glod = os.path.join( args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1])) fpred = open(output_file_pred, "w") fgold = open(output_file_glod, "w") model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")): #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch)) batch = tuple(t.to(device) for i, t in enumerate(batch)) if args.task == 1: input_ids, attention_mask, label_ids = batch elif args.task == 2: input_ids, attention_mask, segment_ids, label_ids = batch else: print("Wrong here3") if args.task == 1: #loss, logits, hidden_states, attentions ''' output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids) logits = output.logits tmp_eval_loss = output.loss ''' # tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class") #logits = output.logits #tmp_eval_loss = output.loss elif args.task == 2: #loss, logits, hidden_states, attentions ''' output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids) logits = output.logits tmp_eval_loss = output.loss ''' # tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class") #exit() #logits = output.logits #tmp_eval_loss = output.loss else: print("Wrong!!") logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy, pred = accuracy(logits, label_ids) for a, b in zip(pred, label_ids): fgold.write("{}\n".format(b)) fpred.write("{}\n".format(a)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy} with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def __init__(self, indices, replacement=False, num_samples=None): RandomSampler.__init__(self, indices, replacement, num_samples)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help= "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \nSequences longer than this will be truncated, and sequences shorter \nthan this will be padded." ) parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training." ) parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n0 (default value): dynamic loss scaling.\nPositive power of 2: static loss scaling value.\n" ) parser.add_argument("--eval_file_to_load", default=None, type=str, help="which model file to load for eval.") args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "bioasq": BioProcessor } num_labels_task = {"cola": 2, "mnli": 3, "mrpc": 2, "bioasq": 2} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) #### if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) #### args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) #### if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") #### if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) #### task_name = args.task_name.lower() #### if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) #### processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() #### tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case, cache_dir=cache_dir) #### train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) #### # Prepare model model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=num_labels # , cache_dir =cache_dir ) #### if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) #### global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) #### model.train() for ep in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps #### if args.fp16: optimizer.backward(loss) else: loss.backward() #### tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 #### # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model_{}.bin".format(ep)) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) #### #### # Load a trained model that you have fine-tuned if (args.eval_file_to_load is not None): output_model_file = args.eval_file_to_load model_state_dict = torch.load(output_model_file) model = BertForSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels, cache_dir=cache_dir) model.to(device) #### if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) #### model.eval() eval_loss, eval_accuracy = 0, 0 eval_accuracy2 = 0 nb_eval_steps, nb_eval_examples = 0, 0 #### for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) #### with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) #### logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) tmp_eval_accuracy2 = accuracy(np.zeros_like(logits), label_ids) #### eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy eval_accuracy2 += tmp_eval_accuracy2 #### nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 #### eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples eval_accuracy2 = eval_accuracy2 / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'eval_accuracy2': eval_accuracy2, 'global_step': global_step, 'loss': loss } #### output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def run_train(args): # --------- data processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case) label_list = processor.get_labels() label2id = {label: i for i, label in enumerate(label_list)} id2label = {i: label for i, label in enumerate(label_list)} train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.train.pkl") train_examples = processor.create_examples(lines=train_data, example_type='train', cached_examples_file=config[ 'data_dir'] / f"cached_train_examples_{args.arch}") train_features = processor.create_features(examples=train_examples, max_seq_len=args.train_max_seq_len, cached_features_file=config[ 'data_dir'] / "cached_train_features_{}_{}".format( args.train_max_seq_len, args.arch )) train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted) if args.sorted: train_sampler = SequentialSampler(train_dataset) else: train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.valid.pkl") valid_examples = processor.create_examples(lines=valid_data, example_type='valid', cached_examples_file=config[ 'data_dir'] / f"cached_valid_examples_{args.arch}") valid_features = processor.create_features(examples=valid_examples, max_seq_len=args.eval_max_seq_len, cached_features_file=config[ 'data_dir'] / "cached_valid_features_{}_{}".format( args.eval_max_seq_len, args.arch )) valid_dataset = processor.create_dataset(valid_features) valid_sampler = SequentialSampler(valid_dataset) valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.eval_batch_size) # ------- model logger.info("initializing model") if args.resume_path: args.resume_path = Path(args.resume_path) model = BertForMultiClass.from_pretrained(args.resume_path, num_labels=len(label_list)) else: model = BertForMultiClass.from_pretrained(config['bert_model_dir'], num_labels=len(label_list)) t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] warmup_steps = int(t_total * args.warmup_proportion) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # ---- callbacks logger.info("initializing callbacks") train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch) model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'], mode=args.mode, monitor=args.monitor, arch=args.arch, save_best_only=args.save_best) # **************************** training model *********************** logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Num Epochs = %d", args.epochs) logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * ( torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) trainer = Trainer(n_gpu=args.n_gpu, model=model, epochs=args.epochs, logger=logger, criterion=CrossEntropy(), optimizer=optimizer, lr_scheduler=lr_scheduler, early_stopping=None, training_monitor=train_monitor, fp16=args.fp16, resume_path=args.resume_path, grad_clip=args.grad_clip, model_checkpoint=model_checkpoint, gradient_accumulation_steps=args.gradient_accumulation_steps, evaluate=F1Score(), class_report=ClassReport(target_names=[id2label[x] for x in range(len(label2id))])) trainer.train(train_data=train_dataloader, valid_data=valid_dataloader, seed=args.seed)
def create_dataloader(self): data = DATADOUBAN( debug=False, data_dir=self.data_dir, ) train_examples = data.read_examples( os.path.join(self.data_dir, 'train.csv')) train_features = data.convert_examples_to_features( train_examples, self.tokenizer, self.max_seq_length) all_input_ids_utt = torch.tensor(data.select_field( train_features, 'input_ids_utt'), dtype=torch.long) all_input_mask_utt = torch.tensor(data.select_field( train_features, 'input_mask_utt'), dtype=torch.long) all_segment_ids_utt = torch.tensor(data.select_field( train_features, 'segment_ids_utt'), dtype=torch.long) all_input_ids_resp = torch.tensor(data.select_field( train_features, 'input_ids_resp'), dtype=torch.long) all_input_mask_resp = torch.tensor(data.select_field( train_features, 'input_mask_resp'), dtype=torch.long) all_segment_ids_resp = torch.tensor(data.select_field( train_features, 'segment_ids_resp'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids_utt, all_input_mask_utt, all_segment_ids_utt, all_input_ids_resp, all_input_mask_resp, all_segment_ids_resp, all_label) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.train_batch_size) eval_examples = data.read_examples( os.path.join(self.data_dir, 'dev.csv')) eval_features = data.convert_examples_to_features( eval_examples, self.tokenizer, self.max_seq_length) all_input_ids_utt = torch.tensor(data.select_field( eval_features, 'input_ids_utt'), dtype=torch.long) all_input_mask_utt = torch.tensor(data.select_field( eval_features, 'input_mask_utt'), dtype=torch.long) all_segment_ids_utt = torch.tensor(data.select_field( eval_features, 'segment_ids_utt'), dtype=torch.long) all_input_ids_resp = torch.tensor(data.select_field( eval_features, 'input_ids_resp'), dtype=torch.long) all_input_mask_resp = torch.tensor(data.select_field( eval_features, 'input_mask_resp'), dtype=torch.long) all_segment_ids_resp = torch.tensor(data.select_field( eval_features, 'segment_ids_resp'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids_utt, all_input_mask_utt, all_segment_ids_utt, all_input_ids_resp, all_input_mask_resp, all_segment_ids_resp, all_label) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.eval_batch_size) return train_dataloader, eval_dataloader, train_examples, eval_examples
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/', type=str, required=False, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--task_name", default='ubuntu', type=str, required=False, help="The name of the task to train.") parser.add_argument("--output_dir", default='/hdd/lujunyu/model/chatbert/ubuntu_without_finetune_check/', type=str, required=False, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--init_model_name", default='bert-base-uncased', type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--data_augmentation", default=False, action='store_true', help="Whether to use augmentation") parser.add_argument("--max_seq_length", default=256, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.") parser.add_argument("--do_test", default=True, action='store_true', help="Whether to run eval on the test set.") parser.add_argument("--train_batch_size", default=500, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=500, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=3e-3, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_steps", default=0.0, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=0, type=float, help="weight_decay") parser.add_argument("--save_checkpoints_steps", default=400, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) per_step_train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_pretrained(args.init_model_name, num_labels=2) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format( args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): if args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) else: os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.init_model_name, do_lower_case=args.do_lower_case) if args.data_augmentation: train_dataset = UbuntuDatasetForSP( file_path=os.path.join(args.data_dir, "train_augment_3.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer ) else: train_dataset = UbuntuDatasetForSP( file_path=os.path.join(args.data_dir, "train.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer ) eval_dataset = UbuntuDatasetForSP( file_path=os.path.join(args.data_dir, "train.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer ) train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=per_step_train_batch_size, sampler=RandomSampler(train_dataset), num_workers=8) eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=per_step_train_batch_size, sampler=SequentialSampler(eval_dataset), num_workers=8) model = BertForSequenceClassification.from_pretrained(args.init_model_name, config=bert_config) model.to(device) for k, v in model.named_parameters(): if k.startswith('bert'): v.requires_grad = False num_train_steps = None if args.do_train: num_train_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare optimizer param_optimizer = list(model.named_parameters()) # remove pooler, which is not used thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if n[-1].requires_grad] for k, v in param_optimizer: if v.requires_grad: print(k, v.size()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_steps) else: optimizer = None scheduler = None if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 best_metric = 0.0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) training_loss_file = open(os.path.join(args.output_dir, 'training_loss_file.txt'), 'a') model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients scheduler.step() model.zero_grad() global_step += 1 if step % args.save_checkpoints_steps == 0: model.eval() f = open(os.path.join(args.output_dir, 'logits_dev.txt'), 'w') eval_loss = 0 nb_eval_steps, nb_eval_examples = 0, 0 logits_all = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = logits.detach().cpu().numpy() logits_all.append(logits) label_ids = label_ids.cpu().numpy() for logit, label in zip(logits, label_ids): logit = '{},{}'.format(logit[0], logit[1]) f.write('_\t{}\t{}\n'.format(logit, label)) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 f.close() logits_all = np.concatenate(logits_all,axis=0) eval_loss = eval_loss / nb_eval_steps result = evaluate(os.path.join(args.output_dir, 'logits_dev.txt')) result.update({'eval_loss': eval_loss}) output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ### Save the best checkpoint if best_metric < result['R10@1'] + result['R10@2']: try: ### Remove 'module' prefix when using DataParallel state_dict = model.module.state_dict() except AttributeError: state_dict = model.state_dict() torch.save(state_dict, os.path.join(args.output_dir, "model.pt")) best_metric = result['R10@1'] + result['R10@2'] logger.info('Saving the best model in {}'.format(os.path.join(args.output_dir, "model.pt"))) ### visualize bad cases of the best model logger.info('Saving Bad cases...') visualize_bad_cases( logits=logits_all, input_file_path=os.path.join(args.data_dir, 'valid.txt'), output_file_path=os.path.join(args.output_dir, 'valid_bad_cases.txt') ) model.train()
def train(args, train_dataset, model, tokenizer): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate_fn) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path ) and "checkpoint" in args.model_name_or_path: # set global_step to gobal_step of last saved checkpoint from model path global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() seed_everything( args.seed ) # Added here for reproductibility (even between python 2 and 3) for epoch in range(int(args.num_train_epochs)): logger.info('epoch {}/{}'.format(epoch + 1, args.num_train_epochs)) pbar = ProgressBar(n_total=len(train_dataloader), desc='Training') for step, batch in enumerate(train_dataloader): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "start_positions": batch[3], "end_positions": batch[4] } if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() pbar(step, {'loss': loss.item()}) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() # Update learning rate schedule optimizer.step() model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics print() if args.local_rank == -1 and args.do_eval: # Only evaluate when single GPU otherwise metrics may not average well evaluate(args, model, tokenizer) if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) tokenizer.save_vocabulary(output_dir) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) print() if 'cuda' in str(args.device): torch.cuda.empty_cache() return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter(args.log_dir) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # EARLY STOPPING: split training data into training and validation # obtain training indices that will be used for validation if args.es: num_train = len(train_dataset) indices = list(range(num_train)) np.random.shuffle(indices) split = int(np.floor(args.valid_size * num_train)) train_idx, valid_idx = indices[split:], indices[:split] # define samplers for obtaining training and validation batches es_train_sampler = SubsetRandomSampler(train_idx) es_valid_sampler = SubsetRandomSampler(valid_idx) # load training data in batches train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, sampler=es_train_sampler, num_workers=0) # load validation data in batches valid_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, sampler=es_valid_sampler, num_workers=0) # END EARLY STOPPING data loading ------------------------------- # Train! logger.info("***** Running training *****") logger.info( " Num examples = %d", len(train_dataset) if args.es else int( np.ceil((1 - args.valid_size) * len(train_dataset)))) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], ) set_seed(args) # Added here for reproductibility # EARLY STOPPING setup---------------------------------------------- if args.es: # to track the training loss as the model trains train_losses = [] # to track the validation loss as the model trains valid_losses = [] # to track the average training loss per epoch as the model trains avg_train_losses = [] # to track the average validation loss per epoch as the model trains avg_valid_losses = [] # initialize the early_stopping object early_stopping = EarlyStopping(patience=patience, verbose=True) # END EARLY STOPPING setup------------------------------------------ for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if args.es: train_losses.append(loss.item()) if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well if args.es: results = evaluate(args, model, tokenizer, valid_dataloader) else: results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value if args.es: valid_losses.append(logs[eval_key]) loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{"step": global_step}})) # if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint # output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) # output_dir = os.path.join(args.output_dir, "checkpoint-last") # if not os.path.exists(output_dir): # os.makedirs(output_dir) # model_to_save = ( # model.module if hasattr(model, "module") else model # ) # Take care of distributed/parallel training # model_to_save.save_pretrained(output_dir) # tokenizer.save_pretrained(output_dir) # torch.save(args, os.path.join(output_dir, "training_args.bin")) # logger.info("Saving model checkpoint to %s", output_dir) # torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) # torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) # logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break # check for early stopping at end of epoch if args.es: # print training/validation statistics # calculate average loss over an epoch train_loss = np.average(train_losses) valid_loss = np.average(valid_losses) avg_train_losses.append(train_loss) avg_valid_losses.append(valid_loss) epoch_len = args.num_train_epochs print_msg = (f'[{epoch:>{epoch_len}}/{n_epochs:>{epoch_len}}] ' + f'train_loss: {train_loss:.5f} ' + f'valid_loss: {valid_loss:.5f}') print(print_msg) # clear lists to track next epoch train_losses = [] valid_losses = [] # early_stopping needs the validation loss to check if it has decresed, # and if it has, it will make a checkpoint of the current model early_stopping(valid_loss, model) if early_stopping.early_stop: print("Early stopping") break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break # if args.local_rank in [-1, 0]: # tb_writer.close() return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer, lang2id=None, pretrained_model=None): """Train the model.""" if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] if args.optimizer == 'Adam': optimizer_grouped_parameters = [{ "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) elif args.optimizer == 'RecAdam': # Prepare for the grouped parameters for RecAdam optimizer. # Since the classifier layer is not pretrained, it is not penalized during optimization. optimizer_grouped_parameters = [{ "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and args.model_type in n ], "weight_decay": args.weight_decay, "anneal_w": args.recadam_anneal_w, "pretrain_params": [ p_p for p_n, p_p in pretrained_model.named_parameters() if not any(nd in p_n for nd in no_decay) and args.model_type in p_n ] }, { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and args.model_type not in n ], "weight_decay": args.weight_decay, "anneal_w": 0.0, "pretrain_params": [ p_p for p_n, p_p in pretrained_model.named_parameters() if not any(nd in p_n for nd in no_decay) and args.model_type not in p_n ] }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and args.model_type in n ], "weight_decay": 0.0, "anneal_w": args.recadam_anneal_w, "pretrain_params": [ p_p for p_n, p_p in pretrained_model.named_parameters() if any(nd in p_n for nd in no_decay) and args.model_type in p_n ] }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and args.model_type not in n ], "weight_decay": 0.0, "anneal_w": 0.0, "pretrain_params": [ p_p for p_n, p_p in pretrained_model.named_parameters() if any(nd in p_n for nd in no_decay) and args.model_type not in p_n ] }] optimizer = RecAdam(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, anneal_fun=args.recadam_anneal_fun, anneal_k=args.recadam_anneal_k, anneal_t0=args.recadam_anneal_t0, pretrain_cof=args.recadam_pretrain_cof) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) best_score = 10000 best_checkpoint = None patience = 0 global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Add here for reproductibility (even between python 2 and 3) cur_epoch = 0 for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) cur_epoch += 1 for step, batch in enumerate(epoch_iterator): model.train() masked_inputs, masked_targets = utils.mask_tokens_sde( batch[0], tokenizer) batch = tuple(t.to(args.device) for t in batch if t is not None) masked_inputs = masked_inputs.to(args.device) masked_targets = masked_targets.to(args.device) inputs = { "input_ids": masked_inputs, "attention_mask": batch[1], "masked_lm_labels": masked_targets } if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = batch[2] if args.model_type in [ "bert", "xlnet" ] else None if args.model_type == "xlm": inputs["langs"] = batch[4] outputs = model(**inputs) loss = outputs[0] if args.n_gpu > 1: # mean() to average on multi-gpu parallel training loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() # Update learning rate schedule optimizer.step() model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate on single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, mode="dev", lang=args.train_langs, lang2id=lang2id) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: if args.save_only_best_checkpoint: result = evaluate(args, model, tokenizer, mode="dev", prefix=global_step, lang=args.train_langs, lang2id=lang2id) if result["f1"] < best_score: logger.info( "result['loss']={} < best_score={}".format( result["loss"], best_score)) best_score = result["loss"] # Save the best model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-best") best_checkpoint = output_dir if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir) torch.save( args, os.path.join(output_dir, "training_args.bin")) logger.info( "Saving the best model checkpoint to %s", output_dir) logger.info("Reset patience to 0") patience = 0 else: patience += 1 logger.info("Hit patience={}".format(patience)) if args.eval_patience > 0 and patience > args.eval_patience: logger.info( "early stop! patience={}".format(patience)) epoch_iterator.close() train_iterator.close() if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step else: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir) torch.save( args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
# Get dataset batch_size = 32 N = len(texts) dataset = TensorDataset(input_ids, attention_masks, labels) logging.info('Dataset created') logging.info('Dataset length: {}'.format(len(dataset))) # Get Dataloader val_data = dataset val_dataloader = DataLoader(val_data, sampler=RandomSampler(val_data), batch_size=batch_size) logging.info('\nDataloader created.\n') # Init model logging.info('{} Initializing model {}'.format(5 * '=', 5 * '=')) model = LET() logging.info('Model for extraction initialized\n') # Select device logging.info('{} Validation {}'.format(5 * '=', 5 * '='))
def train(args, train_dataset, model, tokenizer): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss = 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3] } outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break return global_step, tr_loss / global_step
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' ) parser.add_argument('--gpuid', type=int, default=-1, help='The gpu id to use') args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: if args.gpuid >= 0 and not args.no_cuda: device = torch.device("cuda", args.gpuid) torch.cuda.set_device(args.gpuid) dummy = torch.cuda.FloatTensor(1) else: device = torch.device("cpu") n_gpu = 1 # n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForQuestionAnswering.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
print(att_masks_train[0]) print(token_type_ids_train[0]) # Data Loaders X_train = torch.tensor(X_train) X_valid = torch.tensor(X_valid) y_train = torch.tensor(y_train) y_valid = torch.tensor(y_valid) att_masks_train = torch.tensor(att_masks_train) att_masks_valid = torch.tensor(att_masks_valid) token_type_ids_train = torch.tensor(token_type_ids_train) token_type_ids_valid = torch.tensor(token_type_ids_valid) train_data = TensorDataset(X_train, att_masks_train, token_type_ids_train, y_train) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) validation_data = TensorDataset(X_valid, att_masks_valid, token_type_ids_valid, y_valid) validation_sampler = SequentialSampler(validation_data) validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE) ####################### # Model and optimizer # #######################
def do_end_to_end(self, train_data, valid_data): """ train a pretrain model using medical data (domain specific data) end to end. params: train_example: training set valid_example: validation set """ # setup train_batch_size = 8 train_dataset = self._create_dataset(train_data, evaluate=False) print("Coding: train dataset ", train_dataset) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size) valid_dataset, valid_example, valid_feature = \ self._create_dataset(valid_data, evaluate=True, output_examples=True) print("Coding: len ", len(train_dataset), len(train_data), len(valid_data), len(valid_dataset), len(valid_example), len(valid_feature)) max_step = 1 # default value in run_squad.py self.init_params(f_lr=2e-5, f_eps=1e-8, t_total=max_step) gradient_accumulation_steps = 1 # default value in run_squad.py # check whether optimizer is saved. global_step = 1 # if check point exists, get it tr_loss = 0.0 # value error self._model.zero_grad() # set random seed. epochs = 1 for _ in trange(epochs, desc="Iteration"): epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): self._model.train() batch = tuple(t.to(self._device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'start_positions': batch[3], 'end_positions': batch[4], 'cls_index': batch[5], 'p_mask': batch[6] } outputs = self._model(**inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) loss.backward() print("Coding: parameter ", self._model.parameters()) # torch.nn.utils.clip_grad_norm_(self._model.parameters(), 1.0) # max_grad_norm = 1.0 by default f_local_loss = loss.item() tr_loss += f_local_loss print("INFO: %s epoch % loss: %0.2f %f" % (global_step, step, tr_loss, f_local_loss)) # torch.nn.utils.clip_grad_norm_(self._model, 1.0) # 1.0 is default by author's # this function does not have effect unless gradient_accumulation_steps > 1 # however, it was set to 1 in original code. # if (step + 1) % 1 == 0: self._optimizer.step() self._scheduler.step() # Update learning rate schedule self._model.zero_grad() global_step += 1 # validation result = self.evaluate(valid_dataset, valid_example, valid_feature) print("Coding: result ", result)
def train_model(self, encoded_data: TensorDataset, steps_per_epoch: int): logging.info(f"starting to train model") sampler = RandomSampler(encoded_data) dataloader = DataLoader(encoded_data, sampler=sampler, batch_size=self.hyper_dict["batch_size"]) num_total_steps = steps_per_epoch * self.hyper_dict["epochs"] param_optimizer = list(self.model.named_parameters()) no_decay = ["bias", "gamma", "beta", "LayerNorm.weight"] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': self.hyper_dict["decay_rate"] }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.hyper_dict["learning_rate"], correct_bias=False) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.hyper_dict["warmup_steps"], num_training_steps=num_total_steps) self.model.train() for _ in range(self.hyper_dict["epochs"]): logging.info("starting epoch") training_loss = [] for batch in dataloader: batch = tuple(t.to(self.device) for t in batch) b_input_ids, b_type_ids, b_input_mask, b_labels = batch optimizer.zero_grad() outputs = self.model(b_input_ids, token_type_ids=b_type_ids, attention_mask=b_input_mask) logits = outputs[0] loss = self.calculate_loss(logits, b_labels) training_loss.append(loss.item()) loss.backward() torch.nn.utils.clip_grad_norm_( self.model.parameters(), self.hyper_dict["max_grad_norm"]) # ordering? optimizer.step() scheduler.step() # ordering assumes pytorch >= 1.1.0 logging.info( f"Training loss: {sum(training_loss) / len(training_loss)}")
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") #if os.path.exists(args.output_dir) and os.listdir(args.output_dir): #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) predictor = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=cache_dir, num_choices=4) # Use independently trained adversary output_model_file = os.path.join(args.output_dir, 'adversary_' + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, 'adversary_' + CONFIG_NAME) config = BertConfig(output_config_file) adversary = BertForSequenceClassification(config, num_labels=2) adversary.load_state_dict(torch.load(output_model_file)) if args.fp16: predictor.half() adversary.half() predictor.to(device) adversary.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") predictor = DDP(predictor) adversary = DDP(adversary) elif n_gpu > 1: predictor = torch.nn.DataParallel(predictor) adversary = torch.nn.DataParallel(adversary) # Prepare optimizer param_optimizer_pred = list(predictor.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer_pred = [n for n in param_optimizer_pred if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters_pred = [ {'params': [p for n, p in param_optimizer_pred if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer_pred if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer_pred = FusedAdam(optimizer_grouped_parameters_pred, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer_pred = FP16_Optimizer(optimizer_pred, dynamic_loss_scale=True) else: optimizer_pred = FP16_Optimizer(optimizer_pred, static_loss_scale=args.loss_scale) else: optimizer_pred = BertAdam(optimizer_grouped_parameters_pred, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) alpha = 1 global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) all_vp_input_ids = torch.tensor(select_field(train_features, 'vp_input_ids'), dtype=torch.long) all_vp_input_mask = torch.tensor(select_field(train_features, 'vp_input_mask'), dtype=torch.long) all_protected_attr = torch.tensor([f.protected_attr for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_vp_input_ids, all_vp_input_mask, all_protected_attr) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) training_history = [] predictor.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss_pred, tr_loss_adv = 0, 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, vp_input_ids, vp_input_mask, protected_attr_ids = batch loss_pred, logits_pred = predictor(input_ids, segment_ids, input_mask, label_ids) softmax = torch.nn.functional.softmax(logits_pred, dim=1) # flatten vp ids and mask batch_size, num_choices = vp_input_ids.shape[0], vp_input_ids.shape[1] vp_input_ids = vp_input_ids.view([batch_size * num_choices, -1]) vp_input_mask = vp_input_mask.view([batch_size * num_choices, -1]) # repeat protected attribute number of choice times protected_attr_ids_ = protected_attr_ids.repeat(num_choices, 1).t() protected_attr_ids_ = protected_attr_ids_.reshape(-1) _, logits_adv = adversary(vp_input_ids, None, vp_input_mask, protected_attr_ids_) pos_probs = logits_adv.view([batch_size, num_choices, -1])[:,:,1] # perform a batch-wise dot product between positive probabilities and softmax vector dot_prod = torch.bmm(pos_probs.view([batch_size, 1, num_choices]), softmax.view([batch_size, num_choices, 1])).view([batch_size, 1]) loss_adv = torch.nn.CrossEntropyLoss()(torch.cat([1 - dot_prod, dot_prod], dim=1), protected_attr_ids.view([-1])) if n_gpu > 1: loss_pred = loss_pred.mean() # mean() to average on multi-gpu. loss_adv = loss_adv.mean() if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss_pred = loss_pred * args.loss_scale loss_adv = loss_adv * args.loss_scale if args.gradient_accumulation_steps > 1: loss_pred = loss_pred / args.gradient_accumulation_steps loss_adv = loss_adv / args.gradient_accumulation_steps tr_loss_pred += loss_pred.item() tr_loss_adv += loss_adv.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 training_history.append([loss_pred.item(), loss_adv.item()]) loss = loss_pred - alpha * loss_adv if args.fp16: optimizer_pred.backward(loss) else: loss.backward() # if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer_pred.param_groups: param_group['lr'] = lr_this_step optimizer_pred.step() optimizer_pred.zero_grad() global_step += 1 history_file = open(os.path.join(args.output_dir, "train_results.csv"), "w") writer = csv.writer(history_file, delimiter=",") writer.writerow(["pred_loss","adv_loss"]) for row in training_history: writer.writerow(row) if args.do_train: # Save a trained model and the associated configuration model_to_save = predictor.module if hasattr(predictor, 'module') else predictor # Only save the model it-self output_model_file = os.path.join(args.output_dir, 'predictor_' + WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, 'predictor_' + CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) predictor = BertForMultipleChoice(config, num_choices=4) predictor.load_state_dict(torch.load(output_model_file)) # Do the same for adversary model_to_save = adversary.module if hasattr(adversary, 'module') else adversary # Only save the model it-self output_model_file = os.path.join(args.output_dir, 'adversary_' + WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, 'adversary_' + CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) config = BertConfig(output_config_file) adversary = BertForSequenceClassification(config, num_labels=3) adversary.load_state_dict(torch.load(output_model_file)) else: output_model_file = os.path.join(args.output_dir, 'predictor_' + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, 'predictor_' + CONFIG_NAME) config = BertConfig(output_config_file) predictor = BertForMultipleChoice(config, num_choices=4) predictor.load_state_dict(torch.load(output_model_file)) output_model_file = os.path.join(args.output_dir, 'adversary_' + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, 'adversary_' + CONFIG_NAME) config = BertConfig(output_config_file) adversary = BertForSequenceClassification(config, num_labels=2) adversary.load_state_dict(torch.load(output_model_file)) predictor.to(device) adversary.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) all_vp_input_ids = torch.tensor(select_field(eval_features, 'vp_input_ids'), dtype=torch.long) all_vp_input_mask = torch.tensor(select_field(eval_features, 'vp_input_mask'), dtype=torch.long) all_protected_attr = torch.tensor([f.protected_attr for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_vp_input_ids, all_vp_input_mask, all_protected_attr) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) predictor.eval() adversary.eval() eval_loss_pred, eval_accuracy_pred = 0, 0 eval_loss_adv, eval_accuracy_adv = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids, vp_input_ids, vp_input_mask, protected_attr_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) vp_input_ids = vp_input_ids.to(device) vp_input_mask = vp_input_mask.to(device) protected_attr_ids = protected_attr_ids.to(device) with torch.no_grad(): tmp_eval_loss_pred, logits_pred = predictor(input_ids, segment_ids, input_mask, label_ids) predicted_vps = torch.argmax(logits_pred, dim=1) predicted_vps = predicted_vps.view(-1, 1).repeat(1, vp_input_ids.size(2)).view([-1, 1, vp_input_ids.size(2)]) vp_input_ids = torch.gather(vp_input_ids, dim=1, index=predicted_vps) vp_input_ids = vp_input_ids.view([vp_input_ids.size(0), -1]) vp_input_mask = torch.gather(vp_input_mask, dim=1, index=predicted_vps) vp_input_mask = vp_input_mask.view([vp_input_mask.size(0), -1]) with torch.no_grad(): tmp_eval_loss_adv, logits_adv = adversary(vp_input_ids, None, vp_input_mask, protected_attr_ids) # print("logits_adv", logits_adv) tmp_eval_accuracy_pred = accuracy(logits_pred, label_ids) tmp_eval_accuracy_adv = accuracy(logits_adv, protected_attr_ids) eval_loss_pred += tmp_eval_loss_pred.mean().item() eval_accuracy_pred += tmp_eval_accuracy_pred.item() eval_loss_adv += tmp_eval_loss_adv.mean().item() eval_accuracy_adv += tmp_eval_accuracy_adv.item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss_pred /= nb_eval_steps eval_accuracy_pred /= nb_eval_examples eval_loss_adv /= nb_eval_steps eval_accuracy_adv /= nb_eval_examples if args.do_train: result = {'eval_loss_pred': eval_loss_pred, 'eval_accuracy_pred': eval_accuracy_pred, 'eval_loss_adv': eval_loss_adv, 'eval_accuracy_adv': eval_accuracy_adv, 'global_step': global_step, 'loss_pred': tr_loss_pred/nb_tr_steps, 'loss_adv': tr_loss_adv/nb_tr_steps} else: result = {'eval_loss_pred': eval_loss_pred, 'eval_accuracy_pred': eval_accuracy_pred, 'eval_loss_adv': eval_loss_adv, 'eval_accuracy_adv': eval_accuracy_adv} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, ) best_dev_f1 = 0.0 if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [{ "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }] # for crf might wanna change this to SGD with gradient clipping and my other scheduler **** eskiler adamken bile 1000lerden başlayıp 10lara düşüyo optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = batch[2] if args.model_type in [ "bert", "xlnet" ] else None # XLM and RoBERTa don"t use segment_ids outputs = model(pad_token_label_id=pad_token_label_id, **inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule #optimizer.step() model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") if results['f1'] > best_dev_f1: best_dev_f1 = results['f1'] results_test, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") logger.info( "test f1: %s, loss: %s", str(results_test['f1']), str(results_test['loss']), ) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, "module" ) else model # Take care of distributed/parallel training logger.info("Saving model checkpoint to %s", output_dir) model_save_path_ = os.path.join(output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), model_save_path_) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break #/if args.local_rank in [-1, 0]: #tb_writer.close() return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint #in our case we will be loading up a pre-trained bert class if os.path.exists(args.model_name_or_path): # set global_step to global_step of last saved checkpoint from model path try: global_step = int( args.model_name_or_path.split("-")[-1].split("/")[0]) except ValueError: global_step = 0 epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], ) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator)): if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3] } if args.model_type != 'distilbert': inputs['token_type_ids'] = batch[2] if args.model_type in [ 'bert', 'xlnet' ] else None # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = 'eval_{}'.format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs['learning_rate'] = learning_rate_scalar logs['loss'] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{'step': global_step}})) if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args device = self.device tb_writer = SummaryWriter(logdir=args.tensorboard_dir) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=self.args.dataloader_num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [ p for n, p in model.named_parameters() if n in params ] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend([ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any( nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any( nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ]) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps if args.optimizer == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) elif args.optimizer == "Adafactor": optimizer = Adafactor( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adafactor_eps, clip_threshold=args.adafactor_clip_threshold, decay_rate=args.adafactor_decay_rate, beta1=args.adafactor_beta1, weight_decay=args.weight_decay, scale_parameter=args.adafactor_scale_parameter, relative_step=args.adafactor_relative_step, warmup_init=args.adafactor_warmup_init, ) print("Using Adafactor for T5") else: raise ValueError( "{} is not a valid optimizer class. Please use one of ('AdamW', 'Adafactor') instead." .format(args.optimizer)) if args.scheduler == "constant_schedule": scheduler = get_constant_schedule(optimizer) elif args.scheduler == "constant_schedule_with_warmup": scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps) elif args.scheduler == "linear_schedule_with_warmup": scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) elif args.scheduler == "cosine_schedule_with_warmup": scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup": scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "polynomial_decay_schedule_with_warmup": scheduler = get_polynomial_decay_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, lr_end=args.polynomial_decay_schedule_lr_end, power=args.polynomial_decay_schedule_power, ) else: raise ValueError("{} is not a valid scheduler.".format( args.scheduler)) if (args.model_name and os.path.isfile( os.path.join(args.model_name, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name, "scheduler.pt"))) if args.n_gpu > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 training_progress_scores = None tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args.model_name and os.path.exists(args.model_name): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name.split("/")[-1].split("-") if len(checkpoint_suffix) > 2: checkpoint_suffix = checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // ( len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores( **kwargs) if args.wandb_project: wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs) wandb.watch(self.model) if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() for current_epoch in train_iterator: model.train() if epochs_trained > 0: epochs_trained -= 1 continue train_iterator.set_description( f"Epoch {epoch_number + 1} of {args.num_train_epochs}") batch_iterator = tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) if args.fp16: with amp.autocast(): outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] else: outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: batch_iterator.set_description( f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}" ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) if args.optimizer == "AdamW": torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if args.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.wandb_project or self.is_sweeping: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_last_lr()[0], "global_step": global_step, }) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) self.save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if args.save_eval_checkpoints: self.save_model(output_dir_current, optimizer, scheduler, model=model, results=results) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project or self.is_sweeping: wandb.log( self._get_last_metrics( training_progress_scores)) if not best_eval_metric: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[ args. early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self. args.evaluate_during_training else training_progress_scores, ) else: if results[ args. early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self. args.evaluate_during_training else training_progress_scores, ) model.train() epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self.save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and args.evaluate_each_epoch: results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) if args.save_eval_checkpoints: self.save_model(output_dir_current, optimizer, scheduler, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[ args. early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping and args.early_stopping_consider_epochs: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if results[ args. early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping and args.early_stopping_consider_epochs: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, )
def __iter__(self): for index in RandomSampler.__iter__(self): yield self.data_source[index]
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def main(): args = configure() log_configuration(args) print(args) #import ipdb #ipdb.set_trace() model, parameters = create_model(args, args.model, pretrain_path=args.pretrain_path) optimizer, scheduler = setup_solver(args, parameters) if args.resume_path: print('loading checkpoint {}'.format(args.resume_path)) checkpoint = torch.load(str(args.resume_path)) load_state(model, checkpoint['state_dict']) if args.resume_train: args.begin_epoch = checkpoint['epoch'] if not args.train and checkpoint.get('optimizer') is not None: optimizer.load_state_dict(checkpoint['optimizer']) if args.onnx: export_onnx(args, model, args.onnx) return criterion = create_criterion(args) train_data, val_data, test_data = setup_dataset(args, args.train, args.val, args.test) if args.train or args.val: val_loader = torch.utils.data.DataLoader( val_data, batch_size=args.batch_size, shuffle=False, num_workers=args.n_threads, pin_memory=True, drop_last=False ) if args.train: if args.weighted_sampling: class_weights = getattr(args, 'class_weights', None) sampler = WeightedRandomSampler(train_data.get_sample_weights(class_weights), len(train_data)) else: sampler = RandomSampler(train_data) train_loader = torch.utils.data.DataLoader( train_data, batch_size=args.batch_size, sampler=sampler, num_workers=args.n_threads, pin_memory=True, drop_last=args.sync_bn ) log_training_setup(model, train_data, val_data, optimizer, scheduler) train(args, model, train_loader, val_loader, criterion, optimizer, scheduler, args.logger) if not args.train and args.val: with torch.no_grad(): validate(args, args.begin_epoch, val_loader, model, criterion, args.logger) print(args.sample_size) print(args.norm_value) print(args.mean_norm) print(args.std_norm) if args.test: test_loader = torch.utils.data.DataLoader( test_data, batch_size=args.batch_size, shuffle=False, num_workers=args.n_threads, pin_memory=True ) with torch.no_grad(): with args.logger.scope(): test(args, test_loader, model, args.logger)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_corpus", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run evaluation.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_corpus) train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) #######Modify the architecture here! # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--finetuned_model_dir", default=None, type=str, help="Bert fine-tuned language model dir using domain corpus") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, "demo": DemoProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", "demo": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps args.seed = random.randint(1, 200) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) #if args.finetuned_model_dir is not None: # tokenizer = BertTokenizer.from_pretrained(args.finetuned_model_dir, do_lower_case=args.do_lower_case) #else: tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) if args.finetuned_model_dir is not None: model = BertForSequenceClassification.from_pretrained( args.finetuned_model_dir, num_labels=num_labels) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) #model = torch.nn.parallel.data_parallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Load a trained model and vocabulary that you have fine-tuned # model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) # tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss with open(os.path.join(args.data_dir, "val.txt"), 'r', encoding="utf-8") as f: lines = f.readlines() label_ids = all_label_ids.numpy() for i in range(len(eval_examples)): if preds[i] != label_ids[i]: print('real: ', label_ids[i], 'predict: ', preds[i], lines[i].encode('unicode_escape')) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_test_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running Prediction *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Load a trained model and vocabulary that you have fine-tuned # model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) # tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Testing"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "test_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Test results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) print("Accuracy...") print(metrics.accuracy_score(all_label_ids.numpy(), preds)) print("Precision, Recall and F1-Score...") print( metrics.classification_report(all_label_ids.numpy(), preds, digits=4))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--load_model", type=str, help="load model path.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=4, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) == False: os.makedirs(args.output_dir, exist_ok=True) # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_path = os.path.join(args.data_dir, 'train_all.csv') train_examples = read_race_examples(train_path) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if args.load_model: logger.info("***** Loading Model *****") model_state_dict = torch.load(args.load_model) model = BertForMultipleChoice.from_pretrained( args.bert_model, state_dict=model_state_dict, num_choices=4) logger.info("loaded model %s", args.load_model) else: model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=4) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in range(int(args.num_train_epochs)): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("Training Epoch: {}/{}".format( ep + 1, int(args.num_train_epochs))) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step % 100 == 0: logger.info("Training loss: {}, global step: {}".format( tr_loss / nb_tr_steps, global_step)) dev_set = os.path.join(args.data_dir, 'dev_all.csv') eval_examples = read_race_examples(dev_set) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'dev_eval_loss': eval_loss, 'dev_eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Dev results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "merge_pytorch_model_" + str(ep) + ".bin") torch.save(model_to_save.state_dict(), output_model_file) ## Load a trained model that you have fine-tuned ## use this part if you want to load the trained model # model_state_dict = torch.load(output_model_file) # model = BertForMultipleChoice.from_pretrained(args.bert_model, # state_dict=model_state_dict, # num_choices=4) # model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_dir = os.path.join(args.data_dir, 'test') test_high = [test_dir + '/high'] test_middle = [test_dir + '/middle'] ## test high eval_examples = read_race_examples(test_high) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test high *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() high_eval_loss, high_eval_accuracy = 0, 0 high_nb_eval_steps, high_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) high_eval_loss += tmp_eval_loss.mean().item() high_eval_accuracy += tmp_eval_accuracy high_nb_eval_examples += input_ids.size(0) high_nb_eval_steps += 1 eval_loss = high_eval_loss / high_nb_eval_steps eval_accuracy = high_eval_accuracy / high_nb_eval_examples result = { 'high_eval_loss': eval_loss, 'high_eval_accuracy': eval_accuracy } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## test middle eval_examples = read_race_examples(test_middle) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test middle *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() middle_eval_loss, middle_eval_accuracy = 0, 0 middle_nb_eval_steps, middle_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) middle_eval_loss += tmp_eval_loss.mean().item() middle_eval_accuracy += tmp_eval_accuracy middle_nb_eval_examples += input_ids.size(0) middle_nb_eval_steps += 1 eval_loss = middle_eval_loss / middle_nb_eval_steps eval_accuracy = middle_eval_accuracy / middle_nb_eval_examples result = { 'middle_eval_loss': eval_loss, 'middle_eval_accuracy': eval_accuracy } with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## all test eval_loss = (middle_eval_loss + high_eval_loss) / ( middle_nb_eval_steps + high_nb_eval_steps) eval_accuracy = (middle_eval_accuracy + high_eval_accuracy) / ( middle_nb_eval_examples + high_nb_eval_examples) result = { 'overall_eval_loss': eval_loss, 'overall_eval_accuracy': eval_accuracy } with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def train(args, train_dataset, model, tokenizer, teacher=None): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) # Added here for reproductibility set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() if teacher is not None: teacher.eval() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type != "distilbert": inputs[ "token_type_ids"] = None if args.model_type == "xlm" else batch[ 2] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) outputs = model(**inputs) loss, start_logits_stu, end_logits_stu = outputs # Distillation loss if teacher is not None: if "token_type_ids" not in inputs: inputs[ "token_type_ids"] = None if args.teacher_type == "xlm" else batch[ 2] with torch.no_grad(): start_logits_tea, end_logits_tea = teacher( input_ids=inputs["input_ids"], token_type_ids=inputs["token_type_ids"], attention_mask=inputs["attention_mask"], ) assert start_logits_tea.size() == start_logits_stu.size() assert end_logits_tea.size() == end_logits_stu.size() loss_fct = nn.KLDivLoss(reduction="batchmean") loss_start = (loss_fct( F.log_softmax(start_logits_stu / args.temperature, dim=-1), F.softmax(start_logits_tea / args.temperature, dim=-1), ) * (args.temperature**2)) loss_end = (loss_fct( F.log_softmax(end_logits_stu / args.temperature, dim=-1), F.softmax(end_logits_tea / args.temperature, dim=-1), ) * (args.temperature**2)) loss_ce = (loss_start + loss_end) / 2.0 loss = args.alpha_ce * loss_ce + args.alpha_squad * loss if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train(args, model, tokenizer, train_dataset, eval_dataset): eval_acc = evaluate(args, model, tokenizer, eval_dataset) logger.info(f"Initial Eval Accuracy: {eval_acc}") if args.num_train_epochs == 0: wandb.log({"eval_acc": eval_acc}) return train_acc = evaluate(args, model, tokenizer, train_dataset, is_train=True) logger.info(f"Initial Train Accuracy: {train_acc}") wandb.log({"eval_acc": eval_acc, "train_acc": train_acc}) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.per_device_train_batch_size) logger.info(f"***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Batch size = {args.per_device_train_batch_size}") train_dataloader = tqdm(train_dataloader, desc="Training", leave=False) optimizer = transformers.AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) scheduler = transformers.get_linear_schedule_with_warmup( optimizer, len(train_dataloader) * args.num_train_epochs // args.gradient_accumulation_steps * args.warmup_ratio, len(train_dataloader) * args.num_train_epochs // args.gradient_accumulation_steps) MASK_ID = tokenizer.encode(tokenizer.mask_token, add_special_tokens=False) assert len(MASK_ID) == 1 MASK_ID = MASK_ID[0] accumulated_loss = torch.tensor(0.0).to(args.device) for epoch in tqdm(range(args.num_train_epochs)): for step, batch in enumerate(train_dataloader): if args.no_dropout: model.eval() else: model.train() # batch["choice_list"] is [num_choices, batch_size] curr_answers = [] for i in range(len(batch["choice_list"][0])): curr_answers.append( batch["choice_list"][batch["answer_id"][i]][i]) choice_lists = batch.pop("choice_list") batch_len = len(batch["answer_id"]) del batch["answer_id"] for key in batch: batch[key] = torch.stack(batch[key], dim=-1).to(args.device) if "t5" not in args.model_name_or_path.lower(): MASK_INDEX = batch["input_ids"][0].tolist().index(MASK_ID) labels = torch.full((batch["input_ids"].size()[:2]), -100, device=args.device) # labels = batch["input_ids"].detach().clone() for i, curr_answer in enumerate(curr_answers): MASK_INDEX = batch["input_ids"][i].tolist().index(MASK_ID) assert len( tokenizer.encode(" " + curr_answer, add_special_tokens=False)) == 1 labels[i][MASK_INDEX] = tokenizer.encode( " " + curr_answer, add_special_tokens=False)[0] else: labels = [] for i, curr_answer in enumerate(curr_answers): labels += tokenizer.encode( f"<extra_id_0> {curr_answer} <extra_id_1>", return_tensors="pt") labels = torch.stack(labels, dim=0).to(args.device) outputs = model(**batch, labels=labels) loss = outputs.loss accumulated_loss += float(loss) if (step + 1) % args.gradient_accumulation_steps == 0: wandb.log({"train_loss": accumulated_loss.item()}) wandb.log({"lr": scheduler.get_last_lr()[0]}) loss.backward() if args.clip > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) optimizer.step() scheduler.step() model.zero_grad() accumulated_loss = torch.tensor(0.0).to(args.device) eval_acc = evaluate(args, model, tokenizer, eval_dataset) logger.info(f"{epoch}th Eval Accuracy: {eval_acc}") train_acc = evaluate(args, model, tokenizer, train_dataset, is_train=True) logger.info(f"{epoch}th Train Accuracy: {train_acc}") wandb.log({"eval_acc": eval_acc, "train_acc": train_acc}) return True