def train(epoch, model, optimizer, scheduler):
    global global_step

    epoch_loss = 0.0
    running_num = 0
    running_loss = np.zeros(3)

    train_sampler.set_epoch(epoch)
    model.train()
    bar = tqdm(train_loader) if args.local_rank == 0 else train_loader
    for batch_idx, (x, c) in enumerate(bar):
        scheduler.step()
        global_step += 1

        x, c = x.to(device, non_blocking=True), c.to(device, non_blocking=True)

        optimizer.zero_grad()

        log_p, logdet = model(x, c)
        log_p, logdet = torch.mean(log_p), torch.mean(logdet)

        # Negative log-likelihood of the flow model: -(log p(z) + log|det J|)
        loss = -(log_p + logdet)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.)
        optimizer.step()

        running_num += 1
        running_loss[0] += loss.item()
        running_loss[1] += log_p.item()
        running_loss[2] += logdet.item()

        epoch_loss += loss.item()

        if args.local_rank == 0:
            bar.set_description('{}/{}, [Log pdf, Log p(z), Log Det] : {}'
                                .format(epoch, global_step, running_loss / running_num))

        # Reset the running average every 100 batches
        if (batch_idx + 1) % 100 == 0:
            running_num = 0
            running_loss = np.zeros(3)

    del x, c, log_p, logdet, loss
    del running_loss
    gc.collect()

    print('{}/{}/{} Training Loss : {:.4f}'.format(
        epoch, global_step, args.local_rank, epoch_loss / len(train_loader)))
    return epoch_loss / len(train_loader)

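# The loop above relies on module-level globals (`args`, `device`, `train_loader`,
# `train_sampler`, `global_step`) and on apex amp having been initialized elsewhere.
# A minimal, hypothetical driver is sketched below; `build_model`, `args.epochs`,
# `args.learning_rate`, the scheduler settings, and the checkpoint path are
# illustrative placeholders, not part of the original source.
if __name__ == '__main__':
    global_step = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = build_model(args).to(device)  # assumption: constructs the flow model
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=200000, gamma=0.5)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    for epoch in range(args.epochs):
        avg_loss = train(epoch, model, optimizer, scheduler)
        if args.local_rank == 0:
            torch.save({'model': model.state_dict(), 'global_step': global_step},
                       'checkpoint_{}.pt'.format(epoch))  # illustrative path
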
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # Set global_step to the global_step of the last saved checkpoint from the model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

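# The glue-style loop above calls set_seed(args), which is not shown in this excerpt.
# A minimal sketch, consistent with the seeding done inline in the NER script below;
# the n_gpu guard is an assumption:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
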
def train(args) -> Tuple[int, float]:
    """ Train the model """
    tb_writer = SummaryWriter()

    state_path = f"{args.output_dir}/domain_pre-training_state.json"
    state = {
        "v": 0.1,
        "finished": False,
        "ts": {"t0": time.time()},
        "saved": {"idx": 0, "data": {}},
        "curr_global_step_idx": 0,
        "cumulative_runtime": 0,
        "metrics": {
            "validation": [],
        },
        "timed_out": False,
    }
    start_step_idx = 0

    if args.output_dir[:5].lower() != "s3://":
        if not os.path.exists(f"{args.output_dir}/models"):
            os.makedirs(f"{args.output_dir}/models")
        if not os.path.exists(f"{args.output_dir}/metrics"):
            os.makedirs(f"{args.output_dir}/metrics")

    model = None
    if args.should_continue:
        saved_path = None
        print(f"Resuming state from: {state_path}")
        try:
            with open(state_path, 'r') as f:
                tmp_state = json.load(f)
            # Look up the last saved model before reporting or loading it
            saved_path = tmp_state["saved"]["data"][str(tmp_state["saved"]["idx"])]["model"]
            if tmp_state["finished"]:
                print("State shows that training has finished.")
                print(f"Final model was saved at: {saved_path}")
                print("Quitting.")
                exit(0)
            print("SP:", saved_path)
            tokenizer, model, optimizer, scheduler = get_model_training_objects(args, saved_path)
            # TODO: Do a deeper validation of state
            # By now the state should be valid. Replace proper variables
            tmp_state['curr_global_step_idx'] = \
                tmp_state["saved"]["data"][str(tmp_state["saved"]["idx"])]["global_step_idx"]
            start_step_idx = tmp_state['curr_global_step_idx'] + 1
            tmp_state['saved']['idx'] += 1
            state = tmp_state
        except Exception:
            if saved_path:
                print(f"Could not load model from {saved_path}")
            else:
                print(f"Could not get model information from {state_path}")
            print(sys.exc_info()[0])
            import traceback
            print(traceback.format_exc())
            # Previous inline resume logic, kept for reference:
            # if saved_path[:5].lower() == "s3://":
            #     dirname = tempfile.mkdtemp()
            #     if saved_path[-4:].lower() == ".tgz":
            #         cmd = f"aws s3 cp {saved_path} - | tar C {dirname} -zxf - . "
            #     elif saved_path[-4:].lower() == ".tar":
            #         cmd = f"aws s3 cp {saved_path} - | tar C {dirname} -xf - . "
            #     else:
            #         cmd = f"aws s3 cp --recursive {saved_path} {dirname}"
            #     print(cmd)
            #     os.system(cmd)
            #     #exit(4)
            # else:
            #     dirname = saved_path
            # if dirname[-1] != "/":
            #     dirname += "/"
            # tmp_tokenizer = AutoTokenizer.from_pretrained(dirname)
            # tmp_model = BertForMaskedLM.from_pretrained(
            #     dirname, config=AutoConfig.from_pretrained(dirname))
            # # Prepare optimizer and schedule (linear warmup and decay)
            # no_decay = ["bias", "LayerNorm.weight"]
            # optimizer_grouped_parameters = [
            #     {
            #         "params": [p for n, p in tmp_model.named_parameters() if not any(nd in n for nd in no_decay)],
            #         "weight_decay": args.weight_decay,
            #     },
            #     {"params": [p for n, p in tmp_model.named_parameters() if any(nd in n for nd in no_decay)],
            #      "weight_decay": 0.0},
            # ]
            # tmp_optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
            # tmp_scheduler = get_linear_schedule_with_warmup(
            #     tmp_optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps
            # )
            # # Check if saved optimizer or scheduler states exist
            # if os.path.isfile(os.path.join(dirname, "optimizer.pt")):
            #     tmp_optimizer.load_state_dict(torch.load(os.path.join(dirname, "optimizer.pt")))
            # if os.path.isfile(os.path.join(dirname, "scheduler.pt")):
            #     tmp_scheduler.load_state_dict(torch.load(os.path.join(dirname, "scheduler.pt")))
            # if saved_path[:5].lower() == "s3://":
            #     shutil.rmtree(dirname)
            # # TODO: Do a deeper validation of state
            # # By now the state should be valid. Replace proper variables
            # tokenizer = tmp_tokenizer
            # model = tmp_model
            # optimizer = tmp_optimizer
            # scheduler = tmp_scheduler
            # state = tmp_state
            # except Exception:
            #     print(sys.exc_info()[0])
            #     import traceback
            #     print(traceback.format_exc())
            #     exit(6)
            # TODO: We should probably return an error (or raise a custom exception) here and
            # let the caller initialize the model. At this point we can't be sure what the
            # configuration of the model should be.
            # Above, we don't set used variables until the tokenizer and model can be loaded.
            # If the caller has used the matter.set_* functions this should be enough, but the
            # flow should be dictated by the caller, not by exception catching here. Otherwise
            # we are forcing a valid model to be loaded every time, even when resuming.
            print("Couldn't read/continue from previous state file. Discarding saved state.")

    if model is None:
        # Need all training objects
        print("Training model from scratch")
        tokenizer, model, optimizer, scheduler = get_model_training_objects(args)

    #args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    #inputfname = "/disco/data/matters/xmatter/gp_priv1/dbdata/df-document_info-with_uri_clean.csv"
    #inputfname = "/disco/data/matters/xmatter/gp_priv1/dbdata/df-document_info-in_golden_set.csv"
    train_df = pd.concat(
        (pd.read_csv(f, keep_default_na=False, header=0,
                     usecols=[args.csv_size_column, args.csv_uri_column],
                     dtype={args.csv_size_column: 'int64', args.csv_uri_column: str})
         for f in args.train_data_file))
    # NOTE: the original called rename() without columns=, which renames the index;
    # the columns= keyword is needed so that train_df["size"] below works.
    train_df.rename(columns={args.csv_uri_column: "uri", args.csv_size_column: "size"}, inplace=True)

    # Setup the dataset and data loader
    ds = DFMLMDataset(train_df, tokenizer, args)
    data_loader = DataLoader(ds,
                             batch_size=args.batch_size, shuffle=True,
                             #batch_size=args.batch_size, shuffle=False,
                             num_workers=args.data_loader_num_workers,
                             collate_fn=ds.dataloader_batch_collate)

    t_total = args.max_steps
    args.num_train_epochs = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num docs = %d", len(train_df))
    logger.info(f"  Total bytes = {train_df['size'].sum():,}")
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Batch size = %d", args.batch_size)
    #logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    #print("device=", args.device)
    #model = torch.nn.DataParallel(model)
    #model.to("cuda")
    model.to(args.device)
    model.zero_grad()
    set_seed(args)  # Added here for reproducibility

    data_loader_iterator = iter(data_loader)
    t_b_pre = time.time()
    # TODO: Note that I removed the reproducibility of batches
    for step in range(start_step_idx, args.max_steps):
        state["curr_global_step_idx"] = step
        try:
            batch = next(data_loader_iterator)
        except StopIteration:
            # Restart the loader when the dataset is exhausted
            data_loader_iterator = iter(data_loader)
            batch = next(data_loader_iterator)

        inputs, labels = batch
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)
        model.train()
        outputs = model(inputs, masked_lm_labels=labels)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        tr_loss += loss.item()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        #epoch_iterator.set_description(desc=f"Loss {(tr_loss/global_step):7.3f}")
        print(f"Step {step+1}/{args.max_steps} Loss {tr_loss/(step+1):7.3f}")

        if args.logging_steps > 0 and (step + 1) % args.logging_steps == 0:
            # Log metrics
            tb_writer.add_scalar("lr", scheduler.get_lr()[0], step + 1)
            tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, step + 1)
            logging_loss = tr_loss

        if args.save_steps > 0 and (step + 1) % args.save_steps == 0:
            model_dst = f"{args.output_dir}/models/global_step_idx_{step}"
            save_model_training_objects(args, model_dst, tokenizer, model, optimizer, scheduler)
            state["saved"]["data"][state['saved']['idx']] = {
                "global_step_idx": step,
                "model": model_dst,
                "metrics": None,
                "validation": False,
            }
            print("saving state to ", state_path)
            save_state(state_path, state)
            state['saved']['idx'] += 1

        state["cumulative_runtime"] += time.time() - t_b_pre
        t_b_pre = time.time()
        if args.max_run_time_in_minutes:
            if state["cumulative_runtime"] / 60 > args.max_run_time_in_minutes:
                print(f"Terminating because we are over {args.max_run_time_in_minutes} minutes")
                state["timed_out"] = True
                break

    print("Finished training")
    tb_writer.close()

    # TODO: Keep track of last saved model and avoid saving it twice
    model_dst = f"{args.output_dir}/models/global_step_idx_{step}"
    print(f"Saving final trained model to {model_dst}")
    save_model_training_objects(args, model_dst, tokenizer, model, optimizer, scheduler)
    if args.should_continue:
        state["curr_global_step_idx"] = step
        state["finished"] = True
        state["ts"]["tz"] = time.time()
        state["saved"]["data"][state['saved']['idx']] = {
            "global_step_idx": step,
            "model": model_dst,
            "metrics": None,
            "validation": False,
            "final": True,
        }
        print("saving state to ", state_path)
        save_state(state_path, state)

    while len(s3_upload_futures) > 0:
        print(f"Waiting on {len(s3_upload_futures)} upload set(s)")
        time.sleep(2)

    return step, tr_loss / step

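# save_state is not defined in this excerpt; the resume path reads the file back with
# json.load, so one plausible shape is an atomic JSON dump. A hypothetical sketch,
# assuming a local (non-s3://) state_path; the temp-file rename is an assumption:
def save_state(state_path, state):
    tmp_path = state_path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(state, f)
    os.replace(tmp_path, state_path)  # atomic rename; not taken from the source
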
def train(
    self,
    train_dataset,
    output_dir,
    show_running_loss=True,
    eval_file=None,
    verbose=True,
    **kwargs,
):
    """
    Trains the model on train_dataset.

    Utility function to be used by the train_model() method. Not intended to be used directly.
    """
    model = self.model
    args = self.args
    tokenizer = self.tokenizer

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"], collate_fn=collate
    )

    if args["max_steps"] > 0:
        t_total = args["max_steps"]
        args["num_train_epochs"] = (
            args["max_steps"] // (len(train_dataloader) // args["gradient_accumulation_steps"]) + 1
        )
    else:
        t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args["weight_decay"],
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    warmup_steps = math.ceil(t_total * args["warmup_ratio"])
    args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"])
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total
    )

    if (
        args["model_name"]
        and os.path.isfile(os.path.join(args["model_name"], "optimizer.pt"))
        and os.path.isfile(os.path.join(args["model_name"], "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args["model_name"], "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args["model_name"], "scheduler.pt")))

    if args["fp16"]:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

    if args["n_gpu"] > 1:
        model = torch.nn.DataParallel(model)

    logger.info(" Training started")

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"], mininterval=0)
    epoch_number = 0
    best_eval_metric = None
    early_stopping_counter = 0
    steps_trained_in_current_epoch = 0
    epochs_trained = 0

    if args["model_name"] and os.path.exists(args["model_name"]):
        try:
            # Set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args["model_name"].split("/")[-1].split("-")
            if len(checkpoint_suffix) > 2:
                checkpoint_suffix = checkpoint_suffix[1]
            else:
                checkpoint_suffix = checkpoint_suffix[-1]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args["gradient_accumulation_steps"])
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args["gradient_accumulation_steps"]
            )

            logger.info("   Continuing training from checkpoint, will skip to saved global_step")
            logger.info("   Continuing training from epoch %d", epochs_trained)
            logger.info("   Continuing training from global step %d", global_step)
            logger.info("   Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("   Starting fine-tuning.")

    if args["evaluate_during_training"]:
        training_progress_scores = self._create_training_progress_scores(**kwargs)
    if args["wandb_project"]:
        wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"])
        wandb.watch(self.model)

    model.train()
    for current_epoch in train_iterator:
        if epochs_trained > 0:
            epochs_trained -= 1
            continue
        # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])):
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            inputs, labels = mask_tokens(batch, tokenizer, args) if args["mlm"] else (batch, batch)
            inputs = inputs.to(self.device)
            labels = labels.to(self.device)

            outputs = model(inputs, masked_lm_labels=labels) if args["mlm"] else model(inputs, labels=labels)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]
            # if loss.item() < 1:
            #     masked = (labels[0] != -100).nonzero()
            #     print(labels[0][masked])
            #     preds = outputs[1][0, masked, :].clone().detach().cpu().numpy()
            #     print(np.argmax(preds, axis=2))

            if args["n_gpu"] > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training

            current_loss = loss.item()

            if show_running_loss:
                print("\rRunning loss: %f" % loss, end="")

            if args["gradient_accumulation_steps"] > 1:
                loss = loss / args["gradient_accumulation_steps"]

            if args["fp16"]:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args["gradient_accumulation_steps"] == 0:
                if args["fp16"]:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                    # Log metrics
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step)
                    logging_loss = tr_loss
                    if args["wandb_project"]:
                        wandb.log(
                            {
                                "Training loss": current_loss,
                                "lr": scheduler.get_lr()[0],
                                "global_step": global_step,
                            }
                        )

                if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                    # Save model checkpoint
                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))
                    self._save_model(output_dir_current, optimizer, scheduler, model=model)

                if args["evaluate_during_training"] and (
                    args["evaluate_during_training_steps"] > 0
                    and global_step % args["evaluate_during_training_steps"] == 0
                ):
                    # Only evaluate when single GPU otherwise metrics may not average well
                    results = self.eval_model(
                        eval_file,
                        verbose=verbose and args["evaluate_during_training_verbose"],
                        silent=True,
                        **kwargs,
                    )
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value, global_step)

                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                    if args["save_eval_checkpoints"]:
                        self._save_model(output_dir_current, optimizer, scheduler, model=model, results=results)

                    training_progress_scores["global_step"].append(global_step)
                    training_progress_scores["train_loss"].append(current_loss)
                    for key in results:
                        training_progress_scores[key].append(results[key])
                    report = pd.DataFrame(training_progress_scores)
                    report.to_csv(
                        os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False,
                    )

                    if args["wandb_project"]:
                        wandb.log(self._get_last_metrics(training_progress_scores))

                    if not best_eval_metric:
                        best_eval_metric = results[args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                    if best_eval_metric and args["early_stopping_metric_minimize"]:
                        if results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]:
                            best_eval_metric = results[args["early_stopping_metric"]]
                            self._save_model(
                                args["best_model_dir"], optimizer, scheduler, model=model, results=results
                            )
                            early_stopping_counter = 0
                        else:
                            if args["use_early_stopping"]:
                                if early_stopping_counter < args["early_stopping_patience"]:
                                    early_stopping_counter += 1
                                    if verbose:
                                        logger.info(f" No improvement in {args['early_stopping_metric']}")
                                        logger.info(f" Current step: {early_stopping_counter}")
                                        logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                                else:
                                    if verbose:
                                        logger.info(f" Patience of {args['early_stopping_patience']} steps reached.")
                                        logger.info(" Training terminated.")
                                        train_iterator.close()
                                    return global_step, tr_loss / global_step
                    else:
                        if results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]:
                            best_eval_metric = results[args["early_stopping_metric"]]
                            self._save_model(
                                args["best_model_dir"], optimizer, scheduler, model=model, results=results
                            )
                            early_stopping_counter = 0
                        else:
                            if args["use_early_stopping"]:
                                if early_stopping_counter < args["early_stopping_patience"]:
                                    early_stopping_counter += 1
                                    if verbose:
                                        logger.info(f" No improvement in {args['early_stopping_metric']}")
                                        logger.info(f" Current step: {early_stopping_counter}")
                                        logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                                else:
                                    if verbose:
                                        logger.info(f" Patience of {args['early_stopping_patience']} steps reached.")
                                        logger.info(" Training terminated.")
                                        train_iterator.close()
                                    return global_step, tr_loss / global_step

            if args["max_steps"] > 0 and global_step > args["max_steps"]:
                return global_step, tr_loss / global_step

        epoch_number += 1
        output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

        if args["save_model_every_epoch"] or args["evaluate_during_training"]:
            os.makedirs(output_dir_current, exist_ok=True)

        if args["save_model_every_epoch"]:
            self._save_model(output_dir_current, optimizer, scheduler, model=model)

        if args["evaluate_during_training"]:
            results = self.eval_model(
                eval_file, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs
            )

            self._save_model(output_dir_current, optimizer, scheduler, results=results)

            training_progress_scores["global_step"].append(global_step)
            training_progress_scores["train_loss"].append(current_loss)
            for key in results:
                training_progress_scores[key].append(results[key])
            report = pd.DataFrame(training_progress_scores)
            report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False)

            if args["wandb_project"]:
                wandb.log(self._get_last_metrics(training_progress_scores))

            if not best_eval_metric:
                best_eval_metric = results[args["early_stopping_metric"]]
                self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
            if best_eval_metric and args["early_stopping_metric_minimize"]:
                if results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                    early_stopping_counter = 0
                else:
                    if args["use_early_stopping"] and args["early_stopping_consider_epochs"]:
                        if early_stopping_counter < args["early_stopping_patience"]:
                            early_stopping_counter += 1
                            if verbose:
                                logger.info(f" No improvement in {args['early_stopping_metric']}")
                                logger.info(f" Current step: {early_stopping_counter}")
                                logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                        else:
                            if verbose:
                                logger.info(f" Patience of {args['early_stopping_patience']} steps reached")
                                logger.info(" Training terminated.")
                                train_iterator.close()
                            return global_step, tr_loss / global_step
            else:
                if results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                    early_stopping_counter = 0
                else:
                    if args["use_early_stopping"] and args["early_stopping_consider_epochs"]:
                        if early_stopping_counter < args["early_stopping_patience"]:
                            early_stopping_counter += 1
                            if verbose:
                                logger.info(f" No improvement in {args['early_stopping_metric']}")
                                logger.info(f" Current step: {early_stopping_counter}")
                                logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                        else:
                            if verbose:
                                logger.info(f" Patience of {args['early_stopping_patience']} steps reached")
                                logger.info(" Training terminated.")
                                train_iterator.close()
                            return global_step, tr_loss / global_step

        if args["max_steps"] > 0 and global_step > args["max_steps"]:
            return global_step, tr_loss / global_step

    return global_step, tr_loss / global_step

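# mask_tokens(batch, tokenizer, args) is referenced above but not shown in this
# excerpt. A sketch of the standard BERT-style masking it presumably performs;
# the args["mlm_probability"] key and the 80/10/10 split are assumptions taken
# from the usual transformers implementation:
def mask_tokens(inputs, tokenizer, args):
    labels = inputs.clone()
    # Sample positions for masking, skipping special tokens
    probability_matrix = torch.full(labels.shape, args["mlm_probability"])
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # compute loss only on masked tokens

    # 80% of the time: replace masked tokens with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time: replace with a random token (the remaining 10% stay unchanged)
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    return inputs, labels
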
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model",
                        default=None,
                        type=str,
                        required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level',
                        type=str,
                        default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name)
    model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 0)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                                   all_valid_ids, all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(model_config, open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                                  all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            print("---------------------------------------------->")
            print(label_map)
            print("---------------------------------------------->")

            # Align predictions with gold labels, stopping at the [SEP] sentinel
            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)

def train(args, model, tokenizer, query_cache, passage_cache):
    """ Train the model """
    logger.info("Training/evaluation parameters %s", args)
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # nll loss for query
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * \
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1)

    optimizer = get_optimizer(args, model, weight_decay=args.weight_decay,)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps
    )

    global_step = 0
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if args.model_name_or_path != "bert-base-uncased":
        if args.fp16 and args.resume_fp16_checkpoint:
            from collections import OrderedDict
            checkpoint = torch.load(args.model_name_or_path,
                                    map_location=lambda s, l: default_restore_location(s, 'cpu'))
            new_state_dict = OrderedDict()
            for k, v in checkpoint['model'].items():
                name = k[7:]  # strip the "module." prefix added by DataParallel/DDP
                new_state_dict[name] = v
            model.load_state_dict(new_state_dict)
            # model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            amp.load_state_dict(checkpoint['amp'])
            global_step = checkpoint['offset']
        else:
            saved_state = load_states_from_checkpoint(args.model_name_or_path)
            global_step, model, optimizer, scheduler = _load_saved_state(
                model, optimizer, scheduler, saved_state,
                load_optimizer_scheduler=args.load_optimizer_scheduler)
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from global step %d", global_step)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Max steps = %d", args.max_steps)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    tr_loss = {}
    model.zero_grad()
    model.train()
    set_seed(args)  # Added here for reproducibility

    last_ann_no = -1
    train_dataloader = None
    train_dataloader_iter = None
    dev_ndcg = 0
    step = 0
    iter_count = 0

    if not args.dual_training:
        args.dual_loss_weight = 0.0

    if args.model_name_or_path != "bert-base-uncased":
        nq_dev_nll_loss, nq_correct_ratio = evaluate_dev(args, model, passage_cache)
        # dev_nll_loss_trivia, correct_ratio_trivia = evaluate_dev(args, model, passage_cache, "-trivia")
        if is_first_worker():
            tb_writer.add_scalar("dev_nll_loss/dev_nll_loss", nq_dev_nll_loss, global_step)
            tb_writer.add_scalar("dev_nll_loss/correct_ratio", nq_correct_ratio, global_step)
            # tb_writer.add_scalar("dev_nll_loss/dev_nll_loss_trivia", dev_nll_loss_trivia, global_step)
            # tb_writer.add_scalar("dev_nll_loss/correct_ratio_trivia", correct_ratio_trivia, global_step)

    while global_step < args.max_steps:
        if step % args.gradient_accumulation_steps == 0 and global_step % args.logging_steps == 0:
            if args.num_epoch == 0:
                # check if new ann training data is available
                ann_no, ann_path, ndcg_json = get_latest_ann_data(args.ann_dir)
                if ann_path is not None and ann_no != last_ann_no:
                    logger.info("Training on new add data at %s", ann_path)
                    with open(ann_path, 'r') as f:
                        ann_training_data = f.readlines()
                    logger.info("Training data line count: %d", len(ann_training_data))
                    ann_training_data = [l for l in ann_training_data if len(l.split('\t')[2].split(',')) > 1]
                    logger.info("Filtered training data line count: %d", len(ann_training_data))
                    if args.dual_training:
                        ann_training_data = [l for l in ann_training_data if len(l.split('\t')[3].split(',')) > 1]
                        logger.info("Filtered training data line count in dual training: %d",
                                    len(ann_training_data))
                    ann_checkpoint_path = ndcg_json['checkpoint']
                    ann_checkpoint_no = get_checkpoint_no(ann_checkpoint_path)

                    aligned_size = (len(ann_training_data) // args.world_size) * args.world_size
                    ann_training_data = ann_training_data[:aligned_size]
                    logger.info("Total ann queries: %d", len(ann_training_data))

                    if args.triplet:
                        if args.dual_training:
                            train_dataset = StreamingDataset(
                                ann_training_data,
                                GetQuadrapuletTrainingDataProcessingFn(args, query_cache, passage_cache))
                        else:
                            train_dataset = StreamingDataset(
                                ann_training_data,
                                GetTripletTrainingDataProcessingFn(args, query_cache, passage_cache))
                        train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size)
                    else:
                        train_dataset = StreamingDataset(
                            ann_training_data,
                            GetTrainingDataProcessingFn(args, query_cache, passage_cache))
                        train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size * 2)
                    train_dataloader_iter = iter(train_dataloader)

                    # re-warmup
                    if not args.single_warmup:
                        scheduler = get_linear_schedule_with_warmup(
                            optimizer, num_warmup_steps=args.warmup_steps,
                            num_training_steps=len(ann_training_data)
                        )

                    if args.local_rank != -1:
                        dist.barrier()

                    if is_first_worker():
                        # add ndcg at checkpoint step used instead of current step
                        tb_writer.add_scalar("retrieval_accuracy/top20_nq", ndcg_json['top20'],
                                             ann_checkpoint_no)
                        tb_writer.add_scalar("retrieval_accuracy/top100_nq", ndcg_json['top100'],
                                             ann_checkpoint_no)
                        if 'dev_top20' in ndcg_json:
                            tb_writer.add_scalar("retrieval_accuracy/top20_nq_dev", ndcg_json['dev_top20'],
                                                 ann_checkpoint_no)
                            tb_writer.add_scalar("retrieval_accuracy/top100_nq_dev", ndcg_json['dev_top100'],
                                                 ann_checkpoint_no)
                        if 'top20_trivia' in ndcg_json:
                            tb_writer.add_scalar("retrieval_accuracy/top20_trivia", ndcg_json['top20_trivia'],
                                                 ann_checkpoint_no)
                            tb_writer.add_scalar("retrieval_accuracy/top100_trivia", ndcg_json['top100_trivia'],
                                                 ann_checkpoint_no)
                        if last_ann_no != -1:
                            tb_writer.add_scalar("epoch", last_ann_no, global_step - 1)
                        tb_writer.add_scalar("epoch", ann_no, global_step)
                    last_ann_no = ann_no
            elif step == 0:
                train_data_path = os.path.join(args.data_dir, "train-data")
                with open(train_data_path, 'r') as f:
                    training_data = f.readlines()
                if args.triplet:
                    train_dataset = StreamingDataset(
                        training_data, GetTripletTrainingDataProcessingFn(args, query_cache, passage_cache))
                    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size)
                else:
                    train_dataset = StreamingDataset(
                        training_data, GetTrainingDataProcessingFn(args, query_cache, passage_cache))
                    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size * 2)
                all_batch = [b for b in train_dataloader]
                logger.info("Total batch count: %d", len(all_batch))
                train_dataloader_iter = iter(train_dataloader)

        try:
            batch = next(train_dataloader_iter)
        except StopIteration:
            logger.info("Finished iterating current dataset, begin reiterate")
            if args.num_epoch != 0:
                iter_count += 1
                if is_first_worker():
                    tb_writer.add_scalar("epoch", iter_count - 1, global_step - 1)
                    tb_writer.add_scalar("epoch", iter_count, global_step)
            nq_dev_nll_loss, nq_correct_ratio = evaluate_dev(args, model, passage_cache)
            # dev_nll_loss_trivia, correct_ratio_trivia = evaluate_dev(args, model, passage_cache, "-trivia")
            if is_first_worker():
                tb_writer.add_scalar("dev_nll_loss/dev_nll_loss", nq_dev_nll_loss, global_step)
                tb_writer.add_scalar("dev_nll_loss/correct_ratio", nq_correct_ratio, global_step)
                # tb_writer.add_scalar("dev_nll_loss/dev_nll_loss_trivia", dev_nll_loss_trivia, global_step)
                # tb_writer.add_scalar("dev_nll_loss/correct_ratio_trivia", correct_ratio_trivia, global_step)
            train_dataloader_iter = iter(train_dataloader)
            batch = next(train_dataloader_iter)
            dist.barrier()

        if args.num_epoch != 0 and iter_count > args.num_epoch:
            break

        step += 1
        if args.triplet:
            loss, loss_dict = triplet_fwd_pass(args, model, batch)
        else:
            loss, correct_cnt = do_biencoder_fwd_pass(args, model, batch)
            # NOTE: the original referenced loss_dict below without defining it in
            # this branch (a NameError); the key name here is an assumption.
            loss_dict = {"nll_loss": loss}

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            if step % args.gradient_accumulation_steps == 0:
                loss.backward()
            else:
                # Skip the DDP all-reduce on non-update steps
                with model.no_sync():
                    loss.backward()

        if len(tr_loss) > 0:
            for k in loss_dict:
                tr_loss[k] = tr_loss[k] + loss_dict[k].item()
        else:
            for k in loss_dict:
                tr_loss[k] = loss_dict[k].item()

        if step % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                logs = {"loss_total": 0.0}
                for k in tr_loss:
                    tr_loss[k] = tr_loss[k] / args.logging_steps
                    logs[k] = tr_loss[k]
                    logs["loss_total"] = logs["loss_total"] + logs[k]
                # loss_scalar = tr_loss / args.logging_steps
                learning_rate_scalar = scheduler.get_lr()[0]
                # logs["loss"] = loss_scalar
                logs["learning_rate"] = learning_rate_scalar
                tr_loss = {}
                if is_first_worker():
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    logger.info(json.dumps({**logs, **{"step": global_step}}))

            if is_first_worker() and args.save_steps > 0 and global_step % args.save_steps == 0:
                if args.fp16:
                    cp = os.path.join(args.output_dir, 'checkpoint-' + str(global_step))
                    # Save checkpoint
                    checkpoint = {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'amp': amp.state_dict(),
                        'epoch': 0,
                        'offset': global_step,
                    }
                    torch.save(checkpoint, cp)
                    logger.info('Saved checkpoint at %s', cp)
                else:
                    _save_checkpoint(args, model, optimizer, scheduler, global_step)

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()

    return global_step

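# is_first_worker() is used throughout the loop above but not defined in this
# excerpt. A hypothetical sketch consistent with its use alongside torch.distributed
# (the exact original definition is an assumption):
def is_first_worker():
    return not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0
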
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    warm_start (bool): load model weights only, ignoring hparams.ignore_layers
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    group_name (string): distributed group name
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    if hparams.use_vae:
        criterion = Tacotron2Loss_VAE(hparams)
    else:
        criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory, rank, hparams.use_vae)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)
    valset_csv = os.path.join(output_directory, log_directory, 'valset.csv')
    # list2csv(flatten_list(valset.audiopaths_and_text), valset_csv, delimiter='|')
    list2csv(valset.audiopaths_and_text, valset_csv, delimiter='|')

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model, hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration, epoch, step = \
                load_checkpoint(checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            if epoch == 0:
                iteration += 1  # next iteration is iteration + 1
                epoch_offset = max(0, int(iteration / len(train_loader)))
            else:
                epoch_offset = epoch
            print('epoch offset: {}'.format(epoch_offset))
            train_loader = prepare_dataloaders(hparams, epoch_offset, valset, collate_fn['train'])[0]
    print('completed loading model ...')

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    track_csv = os.path.join(output_directory, log_directory, 'track.csv')
    track_header = ['padding-rate-txt', 'max-len-txt', 'top-len-txt',
                    'padding-rate-mel', 'max-len-mel', 'top-len-mel',
                    'batch-size', 'batch-length', 'batch-area', 'mem-use',
                    'mem-all', 'mem-cached', 'duration', 'iteration', 'epoch', 'step']
    if os.path.isfile(track_csv) and checkpoint_path is not None:
        print('loading existing {} ...'.format(track_csv))
        track = csv2dict(track_csv, header=track_header)
    else:
        track = {k: [] for k in track_header}

    print('start training in epoch {} ~ {} ...'.format(epoch_offset, hparams.epochs))
    nbatches = len(train_loader)
    for epoch in range(epoch_offset, hparams.epochs):
        # if epoch >= 10: break
        print("Epoch: {}, #batches: {}".format(epoch, nbatches))
        batch_sizes, batch_lengths = [0] * nbatches, [0] * nbatches
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            if hparams.use_vae:
                loss, recon_loss, kl, kl_weight = criterion(y_pred, y, iteration)
            else:
                loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                batch_sizes[i], batch_lengths[i] = batch[0].size(0), batch[2].size(2)
                batch_capacity = batch_sizes[i] * batch_lengths[i]
                mem_all = torch.cuda.memory_allocated() / (1024**2)
                mem_cached = torch.cuda.memory_cached() / (1024**2)
                mem_use = mem_all + mem_cached
                print("{} ({}:{}/{}): ".format(iteration, epoch, i, nbatches), end='')
                print("Batch {} ({}X{}) ".format(batch_capacity, batch_sizes[i], batch_lengths[i]), end='')
                print("Mem {:.1f} ({:.1f}+{:.1f}) ".format(mem_use, mem_all, mem_cached), end='')
                print("Train loss {:.3f} Grad Norm {:.3f} {:.2f}s/it".format(
                    reduced_loss, grad_norm, duration))
                input_lengths, gate_padded = batch[1], batch[4]
                metadata = (duration, iteration, epoch, i)
                track_seq(track, input_lengths, gate_padded, metadata)
                padding_rate_txt = track['padding-rate-txt'][-1]
                max_len_txt = track['max-len-txt'][-1]
                padding_rate_mel = track['padding-rate-mel'][-1]
                max_len_mel = track['max-len-mel'][-1]
                if hparams.use_vae:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration,
                        padding_rate_txt, max_len_txt, padding_rate_mel,
                        max_len_mel, iteration, recon_loss, kl, kl_weight)
                else:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration,
                        padding_rate_txt, max_len_txt, padding_rate_mel,
                        max_len_mel, iteration)

            check_by_iter = (hparams.check_by == 'iter') and \
                (iteration % hparams.iters_per_checkpoint == 0)
            check_by_epoch = (hparams.check_by == 'epoch') and i == 0 and \
                (epoch % hparams.epochs_per_checkpoint == 0)
            if not is_overflow and (check_by_iter or check_by_epoch):
                dict2col(track, track_csv, verbose=True)
                val_loss, (mus, emotions) = validate(
                    model, criterion, valset, iteration, hparams.batch_size,
                    n_gpus, collate_fn['val'], logger, hparams.distributed_run,
                    rank, hparams.use_vae, pre_batching=False)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory,
                        "checkpoint_{}-{}-{}_{:.3f}".format(iteration, epoch, i, val_loss))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    epoch, i, checkpoint_path)
                    if hparams.use_vae:
                        image_scatter_path = os.path.join(
                            output_directory, "checkpoint_{0}_scatter_val.png".format(iteration))
                        image_tsne_path = os.path.join(
                            output_directory, "checkpoint_{0}_tsne_val.png".format(iteration))
                        imageio.imwrite(image_scatter_path, plot_scatter(mus, emotions))
                        imageio.imwrite(image_tsne_path, plot_tsne(mus, emotions))

            iteration += 1

        if hparams.prep_trainset_per_epoch:
            train_loader = prepare_dataloaders(hparams, epoch + 1, valset, collate_fn['train'])[0]
            nbatches = len(train_loader)
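# A minimal, self-contained sketch of the fp16 overflow guard used in the loop
# above, assuming an apex-initialized model/optimizer pair; the helper name
# `backward_and_clip` and the `grad_clip_thresh` argument are illustrative.
import math
import torch
from apex import amp

def backward_and_clip(loss, model, optimizer, grad_clip_thresh, fp16_run):
    if fp16_run:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), grad_clip_thresh)
        # with dynamic loss scaling, an overflowed step surfaces as a NaN grad norm
        return grad_norm, math.isnan(grad_norm)
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
    return grad_norm, False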
def fit(self, epochs, lr, validate=True, schedule_type="warmup_cosine", optimizer_type="lamb"):
    tensorboard_dir = self.output_dir / "tensorboard"
    tensorboard_dir.mkdir(exist_ok=True)

    # Train the model
    tb_writer = SummaryWriter(tensorboard_dir)
    train_dataloader = self.data.train_dl
    if self.max_steps > 0:
        t_total = self.max_steps
        self.epochs = self.max_steps // (len(train_dataloader) // self.grad_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // self.grad_accumulation_steps * epochs

    # Prepare optimizer
    optimizer = self.get_optimizer(lr, optimizer_type=optimizer_type)

    # get the base model if it's already wrapped in DataParallel
    if hasattr(self.model, "module"):
        self.model = self.model.module

    if self.is_fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex to use fp16 training")
        self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=self.fp16_opt_level)

    # Get scheduler
    scheduler = self.get_scheduler(optimizer, t_total=t_total, schedule_type=schedule_type)

    # Parallelize the model architecture
    if self.multi_gpu is True:
        self.model = torch.nn.DataParallel(self.model)

    # Start Training
    self.logger.info("***** Running training *****")
    self.logger.info("  Num examples = %d", len(train_dataloader.dataset))
    self.logger.info("  Num Epochs = %d", epochs)
    self.logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                     self.data.train_batch_size * self.grad_accumulation_steps)
    self.logger.info("  Gradient Accumulation steps = %d", self.grad_accumulation_steps)
    self.logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epoch_step = 0
    tr_loss, logging_loss, epoch_loss = 0.0, 0.0, 0.0
    self.model.zero_grad()
    pbar = master_bar(range(epochs))

    for epoch in pbar:
        epoch_step = 0
        epoch_loss = 0.0
        for step, batch in enumerate(progress_bar(train_dataloader, parent=pbar)):
            self.model.train()
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if self.model_type in ["bert", "xlnet"]:
                inputs["token_type_ids"] = batch[2]

            outputs = self.model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if self.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if self.grad_accumulation_steps > 1:
                loss = loss / self.grad_accumulation_steps

            if self.is_fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

            tr_loss += loss.item()
            epoch_loss += loss.item()
            if (step + 1) % self.grad_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                self.model.zero_grad()
                global_step += 1
                epoch_step += 1

                if self.logging_steps > 0 and global_step % self.logging_steps == 0:
                    if validate:
                        # evaluate model
                        results = self.validate()
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                            self.logger.info("eval_{} after step {}: {}".format(key, global_step, value))
                    # Log metrics
                    self.logger.info("lr after step {}: {}".format(global_step, scheduler.get_lr()[0]))
                    self.logger.info("train_loss after step {}: {}".format(
                        global_step, (tr_loss - logging_loss) / self.logging_steps))
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / self.logging_steps, global_step)
                    logging_loss = tr_loss

        # Evaluate the model after every epoch
        if validate:
            results = self.validate()
            for key, value in results.items():
                self.logger.info("eval_{} after epoch {}: {}".format(key, (epoch + 1), value))
        # Log metrics
        self.logger.info("lr after epoch {}: {}".format((epoch + 1), scheduler.get_lr()[0]))
        self.logger.info("train_loss after epoch {}: {}".format((epoch + 1), epoch_loss / epoch_step))
        self.logger.info("\n")

    tb_writer.close()
    return global_step, tr_loss / global_step
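# Sketch of the accumulation arithmetic in fit(): dividing each micro-batch loss
# by grad_accumulation_steps makes the k accumulated backward passes sum to the
# mean loss of one effective batch. Values below are illustrative only.
k = 4
micro_losses = [2.0, 1.0, 3.0, 2.0]
accumulated = sum(l / k for l in micro_losses)
assert accumulated == sum(micro_losses) / len(micro_losses)  # == 2.0, one effective batch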
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                                  collate_fn=collate, drop_last=True)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Take care of distributed/parallel training
    model = model.module if hasattr(model, "module") else model
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path
            and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps
                * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024:
                continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    # torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    _rotate_checkpoints(args, checkpoint_prefix)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
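# Sketch of the resume bookkeeping above: progress is recovered purely from the
# checkpoint directory name (e.g. "output/checkpoint-5000"); all numbers here
# are illustrative.
path = "output/checkpoint-5000"
global_step = int(path.split("-")[-1].split("/")[0])            # 5000
steps_per_epoch = 1200 // 4                                     # len(dataloader) // accumulation
epochs_trained = global_step // steps_per_epoch                 # 16
steps_trained_in_current_epoch = global_step % steps_per_epoch  # 200
assert (epochs_trained, steps_trained_in_current_epoch) == (16, 200)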
def train(self):
    if not self.pretrained_model:
        model = GPT2KWModel(config=self.model_config)
    else:
        self.print_and_log('loading pretrained model')
        model = GPT2KWModel.from_pretrained(self.pretrained_model)
    model.train()
    model.to(self.device)

    # count model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    self.print_and_log('number of model parameters: {}'.format(num_parameters))

    self.print_and_log("start loading the training set")
    train_loader, valid_loader = self.create_dataloader()
    self.print_and_log("training set loaded")

    epoch_steps = int(train_loader.sampler.num_samples / self.batch_size / self.accumulation_steps)
    total_steps = epoch_steps * self.epochs
    self.print_and_log('total number of samples = {}'.format(train_loader.sampler.num_samples))
    self.print_and_log('steps per epoch = {}'.format(epoch_steps))
    self.print_and_log('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=self.lr, correct_bias=True)
    # scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)
    scheduler = pytorch_transformers.WarmupCosineSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)

    if self.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        model = DataParallel(model)
        multi_gpu = True
    else:
        multi_gpu = False

    overall_step = 0
    running_loss = 0
    model.train()
    for epoch in range(self.epochs):
        self.print_and_log('epoch {}'.format(epoch + 1))
        now = datetime.now()
        self.print_and_log('time: {}'.format(now))
        optimizer.zero_grad()
        for i, batch_data in enumerate(train_loader):
            if torch.cuda.is_available():
                keyword_ids = batch_data[0].to(self.device, non_blocking=True)
                passage_ids = batch_data[1].to(self.device, non_blocking=True)
                label_ids = passage_ids.clone().to(self.device, non_blocking=True)
            else:
                keyword_ids = batch_data[0]
                passage_ids = batch_data[1]
                label_ids = passage_ids.clone()
            outputs = model(input_ids=passage_ids, keyword_ids=keyword_ids, labels=label_ids)
            loss, logits = outputs[:2]

            # multi-GPU training
            if multi_gpu:
                loss = loss.mean()
            # gradient accumulation
            if self.gradient_accumulation > 1:
                loss = loss / self.gradient_accumulation

            # mixed-precision or regular training
            if self.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)

            # update weights
            if (i + 1) % self.gradient_accumulation == 0:
                running_loss += loss.item()
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                overall_step += 1
                # report train loss
                if (overall_step + 1) % self.log_step == 0 and running_loss != 0:
                    self.print_and_log('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                        datetime.now().hour, datetime.now().minute, overall_step + 1, epoch + 1,
                        running_loss * self.gradient_accumulation / self.log_step))
                    running_loss = 0

        # validation
        with torch.no_grad():
            valid_start_time = datetime.now()
            model.eval()
            valid_loss = 0
            valid_step = 0
            for i, valid_batch_data in enumerate(valid_loader):
                if torch.cuda.is_available():
                    keyword_ids = valid_batch_data[0].to(self.device, non_blocking=True)
                    passage_ids = valid_batch_data[1].to(self.device, non_blocking=True)
                    label_ids = passage_ids.clone().to(self.device, non_blocking=True)
                else:
                    keyword_ids = valid_batch_data[0]
                    passage_ids = valid_batch_data[1]
                    label_ids = passage_ids.clone()
                outputs = model(input_ids=passage_ids, keyword_ids=keyword_ids, labels=label_ids)
                loss, logits = outputs[:2]
                valid_loss += loss
                valid_step += 1
            valid_loss = valid_loss / valid_step
            self.print_and_log('valid duration: {}, valid loss: {}'.format(
                datetime.now() - valid_start_time, valid_loss))

        # save the model
        if (epoch + 1) % 1 == 0:
            if not os.path.exists(self.output_dir + 'model_epoch{}'.format(epoch + 1)):
                os.makedirs(self.output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(self.output_dir + 'model_epoch{}'.format(epoch + 1))
            # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
            # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

        then = datetime.now()
        self.print_and_log('time: {}'.format(then))
        self.print_and_log('time for one epoch: {}'.format(then - now))
        model.train()

    self.print_and_log('training finished')
    self.f_log.close()
    if not os.path.exists(self.output_dir + 'final_model'):
        os.makedirs(self.output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(self.output_dir + 'final_model')
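# Sketch of the step budgeting above: warmup/cosine schedules are sized in
# optimizer steps, not batches, so the horizon must account for gradient
# accumulation. Numbers are illustrative only.
num_samples, batch_size, accumulation_steps, epochs = 100_000, 8, 4, 5
epoch_steps = int(num_samples / batch_size / accumulation_steps)  # 3125
total_steps = epoch_steps * epochs                                # 15625, passed as t_total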
def train(args):
    fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt")
    # create file handler which logs even debug messages
    logger.addHandler(fh)  # add the handlers to the logger

    timestamp = datetime.now().strftime('%Y%m%d%H%M')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")

    def save_model(model, epoch):
        torch.save(model.state_dict(), f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5')

    def load_model(model, epoch, to_device):
        assert os.path.exists(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5'), \
            f'Weights at epoch {epoch} not found'
        model.load_state_dict(torch.load(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5',
                                         map_location=to_device))

    config = getattr(configs, 'config_' + args.model)()
    print(config)

    ###############################################################################
    # Load data
    ###############################################################################
    data_path = args.data_path + args.dataset + '/'
    train_set = eval(config['dataset_name'])(config, data_path, config['train_cfg'],
                                             config['n_node'], config['train_desc'],
                                             config['desc_len'])
    '''
    valid_set = eval(config['dataset_name'])(data_path,
                                             config['valid_tokens'], config['tokens_len'],
                                             config['valid_desc'], config['desc_len'])
    '''
    data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
                                              shuffle=True, drop_last=False, num_workers=1)

    ###############################################################################
    # Define the models
    ###############################################################################
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  # initialize the model
    if args.reload_from > 0:
        load_model(model, args.reload_from, device)
    logger.info('done')
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # origin: AdamW
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['learning_rate'],
                                  eps=config['adam_epsilon'])
    # no scheduler in paper's original code
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(data_loader) * config['nb_epoch']
    )  # do not forget to modify the number of training steps when the dataset is changed

    if config['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=config['fp16_opt_level'])

    print('---model parameters---')
    num_params = 0
    for param in model.parameters():
        num_params += param.numel()
    print(num_params / 1e6)

    n_iters = len(data_loader)
    itr_global = args.reload_from + 1
    for epoch in range(int(args.reload_from) + 1, config['nb_epoch'] + 1):
        itr_start_time = time.time()
        losses = []
        for batch in data_loader:
            model.train()
            batch_gpu = [tensor.to(device) for tensor in batch]
            loss = model(*batch_gpu)

            if config['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 5.0)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)

            optimizer.step()
            scheduler.step()
            model.zero_grad()

            losses.append(loss.item())
            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f' %
                            (epoch, config['nb_epoch'], itr_global % n_iters, n_iters,
                             elapsed, np.mean(losses)))
                losses = []
                itr_start_time = time.time()
            itr_global = itr_global + 1

        # save every other epoch once past epoch 80
        if epoch >= 80:
            if epoch % 2 == 0:
                save_model(model, epoch)
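# Sketch of the no_decay grouping used above, on a toy module whose parameter
# names mirror the BERT convention ("bias", "LayerNorm.weight"); the class
# `Tiny` is illustrative, not part of the original code.
import torch

class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(4, 4)
        self.LayerNorm = torch.nn.LayerNorm(4)

model = Tiny()
no_decay = ['bias', 'LayerNorm.weight']
groups = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},   # only dense.weight decays
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},    # biases and LayerNorm params are exempt
]
optimizer = torch.optim.AdamW(groups, lr=1e-4, eps=1e-8)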
def _train_epoch(self, scheduler, epoch: int, eval_step: int,
                 train_data_iterator: ShardedDataIterator):
    args = self.args
    rolling_train_loss = 0.0
    epoch_loss = 0
    epoch_correct_predictions = 0

    log_result_step = args.log_batch_step
    rolling_loss_step = args.train_rolling_loss_step
    num_hard_negatives = args.hard_negatives
    num_other_negatives = args.other_negatives
    seed = args.seed

    self.biencoder.train()
    epoch_batches = train_data_iterator.max_iterations
    data_iteration = 0
    t1 = time.time()

    for i, samples_batch in enumerate(train_data_iterator.iterate_data(epoch=epoch)):
        # to be able to resume shuffled ctx pools
        data_iteration = train_data_iterator.get_iteration()
        random.seed(seed + epoch + data_iteration)

        biencoder_batch = BiEncoder.create_biencoder_input(
            samples_batch, self.tensorizer, True, num_hard_negatives,
            num_other_negatives, shuffle=True,
            shuffle_positives=args.shuffle_positive_ctx)

        loss, correct_cnt = _do_biencoder_fwd_pass(self.biencoder, biencoder_batch,
                                                   self.tensorizer, args)
        epoch_correct_predictions += correct_cnt
        epoch_loss += loss.item()
        rolling_train_loss += loss.item()

        if args.fp16:
            from apex import amp
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), args.max_grad_norm)
        else:
            loss.backward()
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(self.biencoder.parameters(), args.max_grad_norm)

        if (i + 1) % args.gradient_accumulation_steps == 0:
            self.optimizer.step()
            scheduler.step()
            self.biencoder.zero_grad()

        if i % log_result_step == 0:
            lr = self.optimizer.param_groups[0]['lr']
            logger.info('Epoch: %d: Step: %d/%d, loss=%f, lr=%f',
                        epoch, data_iteration, epoch_batches, loss.item(), lr)

        if (i + 1) % rolling_loss_step == 0:
            t2 = time.time()
            elapsed = t2 - t1
            logger.info('Train batch %d', data_iteration)
            latest_rolling_train_av_loss = rolling_train_loss / rolling_loss_step
            logger.info('Avg. loss per last %d batches: %f', rolling_loss_step,
                        latest_rolling_train_av_loss)
            logger.info('Avg. single GPU speed per last %d batches: %f',
                        rolling_loss_step, elapsed / rolling_loss_step)
            rolling_train_loss = 0.0
            t1 = t2

        if data_iteration % eval_step == 0:
            logger.info('Validation: Epoch: %d Step: %d/%d', epoch, data_iteration, epoch_batches)
            self.validate_and_save(epoch, train_data_iterator.get_iteration(), scheduler)
            self.biencoder.train()

    self.validate_and_save(epoch, data_iteration, scheduler)
    epoch_loss = (epoch_loss / epoch_batches) if epoch_batches > 0 else 0
    logger.info('Av Loss per epoch=%f', epoch_loss)
    logger.info('epoch total correct predictions=%d', epoch_correct_predictions)
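# Sketch of the per-step reseeding above: seeding from (seed, epoch, iteration)
# makes the shuffled negative pools reproducible after a mid-epoch resume,
# because the RNG state depends only on the position, not on history.
import random

def draw_at(seed, epoch, iteration):
    random.seed(seed + epoch + iteration)
    return random.random()

assert draw_at(0, 1, 5) == draw_at(0, 1, 5)  # resuming at the same step replays the same draws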
def train(args, trn_loader, dev_loader, model, tokenizer, cand_uttr_sys_dict, others):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter("runs/" + args.output_dir.replace("/", "-"))

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(trn_loader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(trn_loader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and \
            os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Num batches = %d", len(trn_loader))
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps
                * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0
    loss_mlm, loss_rs = 0, 0
    patience, best_loss = 0, 1e10
    xeloss = torch.nn.CrossEntropyLoss()

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility

    for _ in train_iterator:
        ## Calculate kmeans results at the beginning of each epoch
        if args.negative_sampling_by_kmeans:
            ToD_BERT_SYS_UTTR_KMEANS, KMEANS_to_SENTS = get_candidate_kmeans(
                args, cand_uttr_sys_dict, tokenizer, model)
            trn_loader = get_loader(vars(args), "train", tokenizer, others["datasets"],
                                    others["unified_meta"], "train")

        loss_arr, loss_mlm_arr, loss_rs_arr = [], [], []
        epoch_iterator = tqdm(trn_loader, disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            ## add response selection into pretraining
            if args.add_rs_loss:
                kmeans_others = {"ToD_BERT_SYS_UTTR_KMEANS": ToD_BERT_SYS_UTTR_KMEANS,
                                 "KMEANS_to_SENTS": KMEANS_to_SENTS} if args.negative_sampling_by_kmeans else {}

                ## Split dialogue into (context, response) pairs
                input_cont, input_resp, resp_label = mask_for_response_selection(
                    batch, tokenizer, args, cand_uttr_sys_dict, kmeans_others)

                ## Mask context part for MLM loss
                input_cont, labels = mask_tokens(input_cont, tokenizer, args) if args.mlm else (input_cont, input_cont)

                ## Allocate tensors to (gpu) devices
                input_cont = input_cont.to(args.device)
                input_resp = input_resp.to(args.device)
                resp_label = resp_label.to(args.device)
                labels = labels.to(args.device)

                ## Encode the context part with BERT
                outputs = model.bert(input_cont, attention_mask=input_cont > 0)
                sequence_output = outputs[0]
                hid_cont = sequence_output[:, 0, :]  ## CLS token

                ## Calculate MLM loss for the context
                prediction_scores = model.cls(sequence_output)
                loss = xeloss(prediction_scores.view(-1, model.config.vocab_size), labels.view(-1))
                loss_mlm = loss.item()

                ## Encode the response part with BERT
                outputs = model.bert(input_resp, attention_mask=input_resp > 0)
                sequence_output = outputs[0]
                hid_resp = sequence_output[:, 0, :]

                ## Calculate RCL loss
                scores = torch.matmul(hid_cont, hid_resp.transpose(1, 0))
                loss_rs = xeloss(scores, resp_label)
                loss += loss_rs
                loss_rs = loss_rs.item()

            ## with only MLM loss
            else:
                inputs = batch["context"].clone()
                if args.mlm:
                    inputs, labels = mask_tokens(inputs, tokenizer, args)
                    inputs = inputs.to(args.device)
                    labels = labels.to(args.device)
                    outputs = model(inputs, masked_lm_labels=labels, attention_mask=inputs > 0)
                else:
                    labels = inputs.clone()
                    masked_indices = (labels == 0)
                    labels[masked_indices] = -100
                    outputs = model(inputs, labels=labels)
                loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
                loss_mlm = loss.item()

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss_arr.append(loss.item())
            loss_mlm_arr.append(loss_mlm)
            loss_rs_arr.append(loss_rs)

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            ## Print loss
            epoch_iterator.set_description("Loss:{:.4f} MLM:{:.4f} RS:{:.4f}".format(
                np.mean(loss_arr), np.mean(loss_mlm_arr), np.mean(loss_rs_arr)))

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    if args.evaluate_during_training and args.n_gpu == 1:
                        results = evaluate(args, model, dev_loader, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    else:
                        results = {}
                        results["loss"] = best_loss - 0.1  # always saving

                    if results["loss"] < best_loss:
                        patience = 0
                        best_loss = results["loss"]
                        checkpoint_prefix = "checkpoint"
                        # Save model checkpoint
                        output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        torch.save(args, os.path.join(output_dir, "training_args.bin"))
                        logger.info("Saving model checkpoint to %s", output_dir)
                        _rotate_checkpoints(args, checkpoint_prefix)
                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)
                    else:
                        patience += 1
                        logger.info("Current patience: {}".format(patience))

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
            if patience > args.patience:
                logger.info("Ran out of patience...")
                break

        if (args.max_steps > 0 and global_step > args.max_steps) or patience > args.patience:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
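# Sketch of the in-batch response-selection (RCL) loss above: row i of the
# score matrix scores context i against every response in the batch, and the
# gold label is the matching index, so all other rows act as negatives.
import torch

B, H = 4, 8
hid_cont, hid_resp = torch.randn(B, H), torch.randn(B, H)   # CLS vectors, as above
scores = torch.matmul(hid_cont, hid_resp.transpose(1, 0))   # (B, B) similarities
resp_label = torch.arange(B)                                # positives on the diagonal
loss_rs = torch.nn.functional.cross_entropy(scores, resp_label)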
def train(args, train_dataset, model):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    val_len = int(len(train_dataset) * args.val_fraction)
    train_len = len(train_dataset) - val_len
    train_ds, val_ds = random_split(train_dataset, [train_len, val_len])
    train_sampler = RandomSampler(train_ds) if args.local_rank == -1 else DistributedSampler(train_ds)
    train_dataloader = DataLoader(train_ds, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(t_total * args.warmup_proportion)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_ds))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps
                * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            pass

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(seed=args.seed, n_gpu=args.n_gpu)  # Added here for reproducibility (even between python 2 and 3)
    global_max_seq_len = -1
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            # using adaptive sequence length
            if args.do_freelb:
                max_seq_len = torch.max(torch.sum(batch[1], 1)).item()
                batch = [t[:, :max_seq_len] for t in batch[:3]] + [batch[3]]
                if max_seq_len > global_max_seq_len:
                    global_max_seq_len = max_seq_len
                inputs = {'attention_mask': batch[1], 'labels': batch[3]}
            else:
                inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
            if args.model_type != 'distilbert':
                # XLM, DistilBERT and RoBERTa don't use segment_ids
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None

            if args.do_freelb and args.model_type in ['bert', 'albert']:
                # ============================ Code for adversarial training =============
                # initialize delta
                if isinstance(model, torch.nn.DataParallel):
                    embeds_init = model.module.bert.embeddings.word_embeddings(batch[0])
                else:
                    embeds_init = model.bert.embeddings.word_embeddings(batch[0])
                if args.adv_init_mag > 0:
                    input_mask = inputs['attention_mask'].to(embeds_init)
                    input_lengths = torch.sum(input_mask, 1)
                    if args.norm_type == "l2":
                        delta = torch.zeros_like(embeds_init).uniform_(-1, 1) * input_mask.unsqueeze(2)
                        dims = input_lengths * embeds_init.size(-1)
                        mag = args.adv_init_mag / torch.sqrt(dims)
                        delta = (delta * mag.view(-1, 1, 1)).detach()
                    elif args.norm_type == "linf":
                        delta = torch.zeros_like(embeds_init).uniform_(
                            -args.adv_init_mag, args.adv_init_mag) * input_mask.unsqueeze(2)
                else:
                    delta = torch.zeros_like(embeds_init)

                # the main loop
                for astep in range(args.adv_steps):
                    # (0) forward
                    delta.requires_grad_()
                    inputs['inputs_embeds'] = delta + embeds_init
                    outputs = model(**inputs)
                    loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

                    # (1) backward
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu parallel training
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    loss = loss / args.adv_steps
                    tr_loss += loss.item()

                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()

                    if astep == args.adv_steps - 1:
                        # no further updates on delta
                        break

                    # (2) get gradient on delta
                    delta_grad = delta.grad.clone().detach()

                    # (3) update and clip
                    if args.norm_type == "l2":
                        denorm = torch.norm(delta_grad.view(delta_grad.size(0), -1), dim=1).view(-1, 1, 1)
                        denorm = torch.clamp(denorm, min=1e-8)
                        delta = (delta + args.adv_lr * delta_grad / denorm).detach()
                        if args.adv_max_norm > 0:
                            delta_norm = torch.norm(delta.view(delta.size(0), -1).float(), p=2, dim=1).detach()
                            exceed_mask = (delta_norm > args.adv_max_norm).to(embeds_init)
                            reweights = (args.adv_max_norm / delta_norm * exceed_mask
                                         + (1 - exceed_mask)).view(-1, 1, 1)
                            delta = (delta * reweights).detach()
                    elif args.norm_type == "linf":
                        denorm = torch.norm(delta_grad.view(delta_grad.size(0), -1),
                                            dim=1, p=float("inf")).view(-1, 1, 1)
                        denorm = torch.clamp(denorm, min=1e-8)
                        delta = (delta + args.adv_lr * delta_grad / denorm).detach()
                        if args.adv_max_norm > 0:
                            delta = torch.clamp(delta, -args.adv_max_norm, args.adv_max_norm).detach()
                    else:
                        logger.info("Norm type {} not specified.".format(args.norm_type))
                        exit()

                    if isinstance(model, torch.nn.DataParallel):
                        embeds_init = model.module.bert.embeddings.word_embeddings(batch[0])
                    else:
                        embeds_init = model.bert.embeddings.word_embeddings(batch[0])
            else:
                outputs = model(**inputs)
                loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, val_ds, model, prefix=global_step)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
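# Sketch of the l2 projection inside the adversarial loop above: after each
# ascent step, any per-example perturbation whose norm exceeds adv_max_norm is
# rescaled back onto the epsilon-ball; shapes and values are illustrative.
import torch

adv_max_norm = 0.3
delta = torch.randn(2, 5, 16)  # (batch, seq_len, hidden)
delta_norm = torch.norm(delta.view(delta.size(0), -1), p=2, dim=1)
exceed_mask = (delta_norm > adv_max_norm).float()
reweights = (adv_max_norm / delta_norm * exceed_mask + (1 - exceed_mask)).view(-1, 1, 1)
delta = (delta * reweights).detach()
assert torch.norm(delta.view(2, -1), p=2, dim=1).max() <= adv_max_norm + 1e-5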
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    record_result = []
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and \
            os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps
                * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
            if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                inputs.update({"langs": (torch.ones(batch[0].shape, dtype=torch.int64)
                                         * args.lang_id).to(args.device)})

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        record_result.append(results)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(model, os.path.join(output_dir, "model.pt"))
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    results = evaluate(args, model, tokenizer)
    record_result.append(results)
    torch.save(record_result, os.path.join(args.output_dir, 'result.pt'))

    return global_step, tr_loss / global_step
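# Sketch of the save/restore pairing above: what the save_steps branch writes
# is exactly what the resume logic at the top of train() reads back; the
# helper name `save_training_state` is illustrative, not part of the original.
import os
import torch

def save_training_state(output_dir, model, tokenizer, optimizer, scheduler):
    os.makedirs(output_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, "module") else model  # unwrap DataParallel/DDP
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))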
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=None, batch_size=args.train_batch_size) # Lists to store data after first epoch cached_all_input_ids = [] cached_all_input_mask = [] cached_all_segment_ids = [] cached_all_start_positions = [] cached_all_end_positions = [] cached_all_cls_index = [] cached_all_p_mask = [] cached_all_outputs = [] if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 1
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    set_seed(args)  # Added here for reproducibility (even between Python 2 and 3)

    # First epoch: run the model normally and cache every batch together with its output
    epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                          disable=args.local_rank not in [-1, 0])
    for step, batch in enumerate(epoch_iterator):
        model.train()
        batch = tuple(t.to(args.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'start_positions': batch[3],
            'end_positions': batch[4]
        }
        if args.model_type != 'distilbert':
            inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
        if args.model_type in ['xlnet', 'xlm']:
            inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
        outputs = model(**inputs)

        # cache data for the later epochs
        cached_all_input_ids.append(batch[0])
        cached_all_input_mask.append(batch[1])
        cached_all_segment_ids.append(batch[2])
        cached_all_start_positions.append(batch[3])
        cached_all_end_positions.append(batch[4])
        cached_all_cls_index.append(batch[5])
        cached_all_p_mask.append(batch[6])
        cached_all_outputs.append(outputs[0])

        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        tr_loss += loss.item()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                # Log metrics
                if args.local_rank == -1 and args.evaluate_during_training:
                    # Only evaluate on single GPU, otherwise metrics may not average well
                    results = evaluate(args, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                logging_loss = tr_loss

            if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info("Saving model checkpoint to %s", output_dir)

        if args.max_steps > 0 and global_step > args.max_steps:
            epoch_iterator.close()
            break

    # For epochs > 1: rebuild train_dataset from the cached first-epoch batches
    print("****************************",
          torch.stack(cached_all_input_ids).shape,
          torch.stack(cached_all_input_mask).shape,
          torch.stack(cached_all_segment_ids).shape,
          torch.stack(cached_all_start_positions).shape,
          torch.stack(cached_all_end_positions).shape,
          torch.stack(cached_all_cls_index).shape,
          torch.stack(cached_all_p_mask).shape,
          torch.stack(cached_all_outputs).shape)
    train_dataset = TensorDataset(torch.stack(cached_all_input_ids),
                                  torch.stack(cached_all_input_mask),
                                  torch.stack(cached_all_segment_ids),
                                  torch.stack(cached_all_start_positions),
                                  torch.stack(cached_all_end_positions),
                                  torch.stack(cached_all_cls_index),
                                  torch.stack(cached_all_p_mask),
                                  torch.stack(cached_all_outputs))

    # Initialise train_dataloader over the cached dataset
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    train_iterator = trange(int(args.num_train_epochs - 1), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    for i in train_iterator:
        start_time = timeit.default_timer()
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'start_positions': batch[3],
                'end_positions': batch[4]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})

            # Replay the cached first-epoch output instead of a new forward pass;
            # note that no backward() runs here, so no fresh gradients are produced
            outputs = batch[7]
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate on single GPU, otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        epoch_time = timeit.default_timer() - start_time
        logger.info("Train Epoch %s time: %f secs", str(i), epoch_time)
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
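# A minimal sketch (toy model, illustrative names, not the author's code) of the
# conventional pattern the function above deviates from in epochs > 1: recompute
# the forward pass every epoch instead of replaying cached first-epoch outputs,
# since a cached loss tensor carries no autograd graph to backpropagate through.
import torch
from torch import nn

def toy_epoch(model: nn.Module, loader, optimizer, accum_steps: int = 1) -> float:
    model.train()
    total = 0.0
    for step, (x, y) in enumerate(loader):
        loss = nn.functional.mse_loss(model(x), y) / accum_steps
        loss.backward()  # fresh graph each step
        total += loss.item()
        if (step + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
    return total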
def _train_epoch(self, scheduler, epoch: int, eval_step: int,
                 train_data_iterator: ShardedDataIterator, global_step: int):
    args = self.args
    rolling_train_loss = 0.0
    epoch_loss = 0
    log_result_step = args.log_batch_step
    rolling_loss_step = args.train_rolling_loss_step
    self.reader.train()
    epoch_batches = train_data_iterator.max_iterations

    for i, samples_batch in enumerate(train_data_iterator.iterate_data(epoch=epoch)):
        data_iteration = train_data_iterator.get_iteration()

        # enables resuming to exactly the same train state
        if args.fully_resumable:
            np.random.seed(args.seed + global_step)
            torch.manual_seed(args.seed + global_step)
            if args.n_gpu > 0:
                torch.cuda.manual_seed_all(args.seed + global_step)

        input = create_reader_input(self.tensorizer.get_pad_id(),
                                    samples_batch,
                                    args.passages_per_question,
                                    args.sequence_length,
                                    args.max_n_answers,
                                    is_train=True,
                                    shuffle=True)

        loss = self._calc_loss(input)
        epoch_loss += loss.item()
        rolling_train_loss += loss.item()

        if args.fp16:
            from apex import amp
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), args.max_grad_norm)
        else:
            loss.backward()
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(self.reader.parameters(), args.max_grad_norm)

        global_step += 1

        if (i + 1) % args.gradient_accumulation_steps == 0:
            self.optimizer.step()
            scheduler.step()
            self.reader.zero_grad()

        if global_step % log_result_step == 0:
            lr = self.optimizer.param_groups[0]['lr']
            logger.info('Epoch: %d: Step: %d/%d, global_step=%d, lr=%f',
                        epoch, data_iteration, epoch_batches, global_step, lr)

        if (i + 1) % rolling_loss_step == 0:
            logger.info('Train batch %d', data_iteration)
            latest_rolling_train_av_loss = rolling_train_loss / rolling_loss_step
            logger.info('Avg. loss per last %d batches: %f', rolling_loss_step,
                        latest_rolling_train_av_loss)
            rolling_train_loss = 0.0

        if global_step % eval_step == 0:
            logger.info('Validation: Epoch: %d Step: %d/%d', epoch, data_iteration, epoch_batches)
            self.validate_and_save(epoch, train_data_iterator.get_iteration(), scheduler)
            self.reader.train()

    epoch_loss = (epoch_loss / epoch_batches) if epoch_batches > 0 else 0
    logger.info('Av Loss per epoch=%f', epoch_loss)
    return global_step
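# Hedged sketch of the "fully resumable" trick above: deriving all RNG state from
# seed + global_step makes a resumed run draw the same random numbers (shuffling,
# answer sampling) as an uninterrupted one would. Function name is illustrative.
import numpy as np
import torch

def reseed(seed: int, global_step: int, n_gpu: int = 0) -> None:
    np.random.seed(seed + global_step)
    torch.manual_seed(seed + global_step)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed + global_step)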
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], # 'token_type_ids': None if args.model_type == 'xlm' else batch[2], "token_type_ids": batch[2], "labels": batch[3], } # if args.model_type in ['xlnet', 'xlm']: # inputs.update({'cls_index': batch[5], # 'p_mask': batch[6]}) outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_vocabulary(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
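# For reference, a self-contained sketch (toy parameter, illustrative step counts)
# of pairing AdamW with the linear warmup/decay schedule used by these loops:
# one scheduler.step() per optimizer.step(), num_warmup_steps out of t_total steps.
import torch
from transformers import get_linear_schedule_with_warmup

param = torch.nn.Parameter(torch.zeros(2, 2))
optimizer = torch.optim.AdamW([param], lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000)
for _ in range(10):
    optimizer.step()
    scheduler.step()  # lr ramps up linearly, then decays linearly to zero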
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples): model.eval() examples_src, examples_tgt, examples_srctgt, langid_srctgt, psi_examples_srctgt, psi_labels = [], [], [], [], [], [] src_len = tgt_len = 0 bpe2word_map_src, bpe2word_map_tgt = [], [] for example in examples: end_id = example[0][0][-1].view(-1) src_id = example[0][0][:args.block_size] src_id = torch.cat([src_id[:-1], end_id]) tgt_id = example[1][0][:args.block_size] tgt_id = torch.cat([tgt_id[:-1], end_id]) half_block_size = int(args.block_size / 2) half_src_id = example[0][0][:half_block_size] half_src_id = torch.cat([half_src_id[:-1], end_id]) half_tgt_id = example[1][0][:half_block_size] half_tgt_id = torch.cat([half_tgt_id[:-1], end_id]) examples_src.append(src_id) examples_tgt.append(tgt_id) src_len = max(src_len, len(src_id)) tgt_len = max(tgt_len, len(tgt_id)) if random.random() > 0.5: srctgt = torch.cat([half_src_id, half_tgt_id]) langid = torch.cat([ torch.ones_like(half_src_id), torch.ones_like(half_tgt_id) * 2 ]) else: srctgt = torch.cat([half_tgt_id, half_src_id]) langid = torch.cat([ torch.ones_like(half_tgt_id), torch.ones_like(half_src_id) * 2 ]) examples_srctgt.append(srctgt) langid_srctgt.append(langid) # [neg, neg] pair neg_half_src_id = example[-2][0][:half_block_size] neg_half_src_id = torch.cat([neg_half_src_id[:-1], end_id]) neg_half_tgt_id = example[-1][0][:half_block_size] neg_half_tgt_id = torch.cat([neg_half_tgt_id[:-1], end_id]) if random.random() > 0.5: neg_srctgt = torch.cat([neg_half_src_id, neg_half_tgt_id]) else: neg_srctgt = torch.cat([neg_half_tgt_id, neg_half_src_id]) psi_examples_srctgt.append(neg_srctgt) psi_labels.append(1) # [pos, neg] pair rd = random.random() if rd > 0.75: neg_srctgt = torch.cat([half_src_id, neg_half_tgt_id]) elif rd > 0.5: neg_srctgt = torch.cat([neg_half_src_id, half_tgt_id]) elif rd > 0.25: neg_srctgt = torch.cat([half_tgt_id, neg_half_src_id]) else: neg_srctgt = torch.cat([neg_half_tgt_id, half_src_id]) psi_examples_srctgt.append(neg_srctgt) psi_labels.append(0) bpe2word_map_src.append(example[2]) bpe2word_map_tgt.append(example[3]) examples_src = pad_sequence(examples_src, batch_first=True, padding_value=tokenizer.pad_token_id) examples_tgt = pad_sequence(examples_tgt, batch_first=True, padding_value=tokenizer.pad_token_id) examples_srctgt = pad_sequence(examples_srctgt, batch_first=True, padding_value=tokenizer.pad_token_id) langid_srctgt = pad_sequence(langid_srctgt, batch_first=True, padding_value=tokenizer.pad_token_id) psi_examples_srctgt = pad_sequence( psi_examples_srctgt, batch_first=True, padding_value=tokenizer.pad_token_id) psi_labels = torch.tensor(psi_labels) guides = model.get_aligned_word( examples_src, examples_tgt, bpe2word_map_src, bpe2word_map_tgt, args.device, src_len, tgt_len, align_layer=args.align_layer, extraction=args.extraction, softmax_threshold=args.softmax_threshold) return examples_src, examples_tgt, guides, examples_srctgt, langid_srctgt, psi_examples_srctgt, psi_labels train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate) t_total = len(train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.max_steps > 0 and args.max_steps < t_total: t_total = 
args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if (not (any(nd in n for nd in no_decay))) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if ((any(nd in n for nd in no_decay))) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 # Check if continuing training from a checkpoint tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() set_seed(args) # Added here for reproducibility def backward_loss(loss, tot_loss): if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tot_loss += loss.item() if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() return tot_loss tqdm_iterator = trange(int(t_total), desc="Iteration", disable=args.local_rank not in [-1, 0]) for _ in range(int(args.num_train_epochs)): for step, batch in enumerate(train_dataloader): model.train() if args.train_so: inputs_src, inputs_tgt = batch[0].clone(), batch[1].clone() inputs_src, inputs_tgt = inputs_src.to( args.device), inputs_tgt.to(args.device) attention_mask_src, attention_mask_tgt = (inputs_src != 0), (inputs_tgt != 0) guide = batch[2].to(args.device) loss = model(inputs_src=inputs_src, inputs_tgt=inputs_tgt, attention_mask_src=attention_mask_src, attention_mask_tgt=attention_mask_tgt, guide=guide, align_layer=args.align_layer, extraction=args.extraction, softmax_threshold=args.softmax_threshold) tr_loss = backward_loss(loss, tr_loss) if args.train_mlm: inputs_src, labels_src = mask_tokens(batch[0], tokenizer, args) inputs_tgt, labels_tgt = mask_tokens(batch[1], tokenizer, args) 
inputs_src, inputs_tgt = inputs_src.to( args.device), inputs_tgt.to(args.device) labels_src, labels_tgt = labels_src.to( args.device), labels_tgt.to(args.device) loss = model(inputs_src=inputs_src, labels_src=labels_src) tr_loss = backward_loss(loss, tr_loss) loss = model(inputs_src=inputs_tgt, labels_src=labels_tgt) tr_loss = backward_loss(loss, tr_loss) if args.train_tlm: inputs_srctgt, labels_srctgt = mask_tokens( batch[3], tokenizer, args, batch[4], 1) inputs_srctgt, labels_srctgt = inputs_srctgt.to( args.device), labels_srctgt.to(args.device) loss = model(inputs_src=inputs_srctgt, labels_src=labels_srctgt) tr_loss = backward_loss(loss, tr_loss) inputs_srctgt, labels_srctgt = mask_tokens( batch[3], tokenizer, args, batch[4], 2) inputs_srctgt, labels_srctgt = inputs_srctgt.to( args.device), labels_srctgt.to(args.device) loss = model(inputs_src=inputs_srctgt, labels_src=labels_srctgt) tr_loss = backward_loss(loss, tr_loss) if args.train_psi: loss = model(inputs_src=batch[5].to(args.device), labels_psi=batch[6].to(args.device), align_layer=args.align_layer + 1) tr_loss = backward_loss(loss, tr_loss) if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 tqdm_iterator.update() if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logger.info( " Step %s. Training loss = %s", str(global_step), str((tr_loss - logging_loss) / args.logging_steps)) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if global_step > t_total: break if global_step > t_total: break return global_step, tr_loss / global_step
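# Hedged sketch of the multi-objective pattern backward_loss implements above: each
# objective (so/mlm/tlm/psi) calls backward() separately, gradients accumulate in
# .grad, and a single optimizer.step() applies the combined update. Toy tensors.
import torch

w = torch.nn.Parameter(torch.ones(3))
opt = torch.optim.SGD([w], lr=0.1)
loss_a = (w ** 2).sum()
loss_b = (w - 1.0).abs().sum()
loss_a.backward()
loss_b.backward()  # grads from both objectives are now summed in w.grad
opt.step()
opt.zero_grad()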
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (around 10 for an 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been restored from a checkpoint)
    - num_epoch: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been restored from a checkpoint)
    - hide_map_prob: Probability of removing the minimap (black square) from the image (0 <= hide_map_prob <= 1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training, the Nvidia apex opt level
    - save_checkpoints: Save a checkpoint each epoch (each checkpoint overwrites the previous one)
    - save_best: Save the model that achieves the highest accuracy in the development set

    Output:
    - float: Accuracy in the development set of the best model
    """
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)
    acc_dev: float = 0.0

    printTrace("Training...")
    for epoch in range(num_epoch):
        for file_t in glob.glob(os.path.join(train_dir, "*.npz")):
            model.train()
            start_time: float = time.time()
            X, y = load_file(path=file_t, fp=16 if fp16 else 32, hide_map_prob=hide_map_prob)
            running_loss = 0.0
            num_batches = 0
            for X_batch, y_batch in nn_batchs(X, y, batch_size):
                X_batch, y_batch = (
                    torch.from_numpy(X_batch).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )
                optimizer.zero_grad()
                outputs = model.forward(X_batch)
                loss = criterion(outputs, y_batch)
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                if fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                running_loss += loss.item()
                num_batches += 1

            # Print statistics
            acc_train = evaluate(
                model=model,
                X=torch.from_numpy(X),
                golds=y,
                device=device,
                batch_size=batch_size,
            )
            acc_dev = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )
            acc_test = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )

            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Current File {file_t}. Training time: {time.time() - start_time} secs"
            )
            printTrace(
                f"Loss: {running_loss / num_batches}. Acc training set: {acc_train}. "
                f"Acc dev set: {acc_dev}. Acc test set: {acc_test}")

            if acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(f"New max acc in dev set {max_acc}. Saving model...")
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )

            if save_checkpoints:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )

    return max_acc
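# Hedged sketch of the apex setup the function above assumes has already happened
# before it is called (model/optimizer wrapped by amp.initialize; "O2" is only an
# example opt level). Requires apex and a CUDA device; toy model for illustration.
import torch

toy_model = torch.nn.Linear(4, 4).cuda()
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.01)
try:
    from apex import amp
    toy_model, toy_optimizer = amp.initialize(toy_model, toy_optimizer, opt_level="O2")
except ImportError:
    pass  # fall back to fp32 training if apex is unavailable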
def test_2models2losses2optimizers(self): model0 = MyModel(1) model1 = MyModel(2) optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], momentum=0.125) optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.25) # Don't do it like this: reference_grads = [[]]*5 # because then it creates a list of 5 references to the same "[]" and appending # to any of them effectively makes you append to all of them, which multiplies # the resulting size of reference_grads by 5x and needless to say makes the test fail. reference_grads = [[], [], [], [], []] final_params = [None, None, None, None, None] for i in range(2): optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) loss0.backward() loss1.backward() reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) optimizer0.step() optimizer1.step() final_params[0] = [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] def what_got_skipped(which_iter, which_backward): if which_iter == 0 and which_backward == 0: return 1 if which_iter == 0 and which_backward == 1: return 2 if which_iter == 1 and which_backward == 0: return 3 if which_iter == 1 and which_backward == 1: return 4 return 0 for which_iter in (0,1): for which_backward in (0,1): model0 = MyModel(1) model1 = MyModel(2) optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], momentum=0.125) optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.25) for i in range(3): optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) loss0.backward() loss1.backward() if i != which_iter: reference_grads[what_got_skipped(which_iter, which_backward)].append( [param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) if i == which_iter: if which_backward == 0: optimizer1.step() else: optimizer0.step() else: optimizer0.step() optimizer1.step() final_params[what_got_skipped(which_iter, which_backward)] = \ [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] for materialize_master_grads in (False, True): for opt_level in ("O0", "O1", "O2", "O3"): for how_to_zero in ("none", "model", "optimizer"): for use_multiple_loss_scalers in (False, True): if opt_level == "O1" or opt_level == "O2": inject_inf_iters = (-1, 0, 1) else: inject_inf_iters = (-1,) for inject_inf in inject_inf_iters: if inject_inf >= 0: inject_inf_locs = ("fp16", "fp32") which_backwards = (0, 1) else: inject_inf_locs = ("fdsa",) which_backwards = (None,) for inject_inf_loc in inject_inf_locs: for which_backward in which_backwards: if use_multiple_loss_scalers: num_losses = 2 loss_ids = [0, 1] else: num_losses = 1 loss_ids = [0, 0] if inject_inf >= 0: iters = 3 else: iters = 2 model0 = MyModel(1) model1 = MyModel(2) models = [model0, model1] optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}], momentum=0.125, materialize_master_grads=materialize_master_grads) optimizer1 = FusedSGD([{'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.25, materialize_master_grads=materialize_master_grads) _amp_state.allow_incoming_model_not_fp32 = True [model0, model1], [optimizer0, optimizer1] = amp.initialize( [model0, model1], [optimizer0, optimizer1], opt_level=opt_level, verbosity=0, 
cast_model_type=False, num_losses=num_losses) _amp_state.allow_incoming_model_not_fp32 = False _amp_state.loss_scalers[0]._loss_scale = 4.0 if use_multiple_loss_scalers: _amp_state.loss_scalers[1]._loss_scale = 16.0 unskipped = 0 for i in range(iters): if how_to_zero == "none": for model in models: for param in model.parameters(): param.grad = None elif how_to_zero == "model": for model in models: model.zero_grad() else: optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 0: if inject_inf_loc == "fp32": model0.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": model0.weight1.grad[0] = float('inf') with amp.scale_loss(loss1, optimizer1, loss_id=loss_ids[1]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 1: if inject_inf_loc == "fp32": model1.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": model1.weight1.grad[0] = float('inf') # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers)) if i != inject_inf: master_params = list(amp.master_params(optimizer0)) + \ list(amp.master_params(optimizer1)) for param, reference_grad in zip(master_params, reference_grads[what_got_skipped(inject_inf, which_backward)][unskipped]): if opt_level == "O2" and not materialize_master_grads: continue else: self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) unskipped += 1 optimizer0.step() optimizer1.step() model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()] master_params = [p for p in amp.master_params(optimizer0)] + \ [p for p in amp.master_params(optimizer1)] for model, master, reference in zip( model_params, master_params, final_params[what_got_skipped(inject_inf, which_backward)]): self.assertTrue(torch.allclose(model, reference)) self.assertTrue(torch.allclose(model, master.to(model.dtype))) if opt_level == "O1": _amp_state.handle._deactivate()
def train_step(
    self,
    batch,
    idx,
    scheduler,
):
    """
    Training for a single batch.
    --------------------
    Returns:
    loss - average of start and end cross entropy loss
    https://huggingface.co/transformers/_modules/transformers/modeling_bert.html#BertForQuestionAnswering
    """
    if self.fp16:
        from apex import amp

    # unpack batch data
    batch = tuple(t.to(self.device) for t in batch)
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "token_type_ids": batch[2],
        "start_positions": batch[3],
        "end_positions": batch[4]
    }

    # zero gradients
    self.model.zero_grad()

    # send data through the model forward
    out = self.model(**inputs)

    # model outputs are always tuple in transformers (see doc)
    loss = out[0]

    # for multi-gpu
    if isinstance(self.model, nn.DataParallel):
        loss = loss.mean()  # average on multi-gpu parallel training

    # calculate gradients through back prop
    if self.fp16:
        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    # clip gradients
    if self.fp16:
        nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.max_grad_norm)
    else:
        nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

    # take a step in gradient descent
    self.optimizer.step()
    scheduler.step()

    # check which weights are changing
    if self.debug and idx > self.warmup_steps:
        self.check_changing(idx)

    # zero gradients
    self.model.zero_grad()

    return loss.detach()
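# Why the method returns loss.detach(): holding a reference to the attached loss
# would keep the whole autograd graph alive across batches. A minimal sketch of
# graph-free loss tracking (toy parameter; .item() works equally well):
import torch

w = torch.nn.Parameter(torch.ones(2))
tracked = []
for _ in range(3):
    loss = (w ** 2).sum()
    loss.backward()
    tracked.append(loss.detach())  # drops the graph reference
    w.grad = None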
def test_2models2losses1optimizer(self): model0 = MyModel(1) model1 = MyModel(2) optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.125) reference_grads = [] for i in range(2): optimizer.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) loss0.backward() loss1.backward() reference_grads.append([param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) optimizer.step() final_params = [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] for materialize_master_grads in (False, True): for opt_level in ("O0", "O1", "O2", "O3"): for how_to_zero in ("none", "model", "optimizer"): for use_multiple_loss_scalers in (False, True): if opt_level == "O1" or opt_level == "O2": inject_inf_iters = (-1, 0, 1) else: inject_inf_iters = (-1,) for inject_inf in inject_inf_iters: if inject_inf >= 0: inject_inf_locs = ("fp16", "fp32") which_backwards = (0, 1) else: inject_inf_locs = ("fdsa",) which_backwards = (None,) for inject_inf_loc in inject_inf_locs: for which_backward in which_backwards: if use_multiple_loss_scalers: num_losses = 2 loss_ids = [0, 1] else: num_losses = 1 loss_ids = [0, 0] if inject_inf >= 0: iters = 3 else: iters = 2 model0 = MyModel(1) model1 = MyModel(2) models = [model0, model1] optimizer = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.125, materialize_master_grads=materialize_master_grads) _amp_state.allow_incoming_model_not_fp32 = True [model0, model1], optimizer = amp.initialize( [model0, model1], optimizer, opt_level=opt_level, verbosity=0, cast_model_type=False, num_losses=num_losses) _amp_state.allow_incoming_model_not_fp32 = False _amp_state.loss_scalers[0]._loss_scale = 4.0 if use_multiple_loss_scalers: _amp_state.loss_scalers[1]._loss_scale = 16.0 unskipped = 0 for i in range(iters): if how_to_zero == "none": for model in models: for param in model.parameters(): param.grad = None elif how_to_zero == "model": for model in models: model.zero_grad() else: optimizer.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 0: if inject_inf_loc == "fp32": model0.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": model0.weight1.grad[0] = float('inf') with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 1: if inject_inf_loc == "fp32": model1.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": model1.weight1.grad[0] = float('inf') if i != inject_inf: master_params = amp.master_params(optimizer) for param, reference_grad in zip(master_params, reference_grads[unskipped]): if opt_level == "O2" and not materialize_master_grads: continue else: self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()), "opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers)) unskipped += 1 optimizer.step() model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()] for model, master, reference in zip( model_params, amp.master_params(optimizer), final_params): self.assertTrue(torch.allclose(model, reference)) 
self.assertTrue(torch.allclose(model, master.to(model.dtype))) if opt_level == "O1": _amp_state.handle._deactivate()
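# Hedged sketch of the apex multi-loss-scaler API these tests exercise: declare
# num_losses at amp.initialize time, then tag each backward with a loss_id so
# every loss keeps its own dynamic scale. Requires apex and a CUDA device.
import torch
from apex import amp

demo_model = torch.nn.Linear(2, 2).cuda()
demo_opt = torch.optim.SGD(demo_model.parameters(), lr=0.1)
demo_model, demo_opt = amp.initialize(demo_model, demo_opt, opt_level="O1",
                                      num_losses=2, verbosity=0)
x = torch.randn(4, 2).cuda()
loss0 = demo_model(x).sum()
loss1 = (demo_model(x) ** 2).sum()
with amp.scale_loss(loss0, demo_opt, loss_id=0) as scaled:
    scaled.backward()
with amp.scale_loss(loss1, demo_opt, loss_id=1) as scaled:
    scaled.backward()
demo_opt.step()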
def train_single_iteration(args, model, train_examples: Examples,
                           valid_examples: Examples, optimizer, scheduler,
                           tb_writer, step_bar, skip_n_steps):
    tr_loss, tr_ac = 0, 0
    batch_size = args.per_gpu_train_batch_size
    cache_file = "cached_single_random_neg_sample_epoch_{}.dat".format(args.epochs_trained)

    # save the examples for the epoch
    if args.neg_sampling == "random":
        if args.overwrite or not os.path.isfile(cache_file):
            train_dataloader = train_examples.random_neg_sampling_dataloader(batch_size=batch_size)
            torch.save(train_dataloader, cache_file)
        else:
            train_dataloader = torch.load(cache_file)
    elif args.neg_sampling == "online":
        # we provide only positive cases and will create negatives in the batch processing
        train_dataloader = train_examples.online_neg_sampling_dataloader(batch_size=int(batch_size / 2))
    else:
        raise Exception("{} neg_sampling is not recognized...".format(args.neg_sampling))

    for step, batch in enumerate(train_dataloader):
        if skip_n_steps > 0:
            skip_n_steps -= 1
            continue
        if args.neg_sampling == "online":
            batch = train_examples.make_online_neg_sampling_batch(batch, model, args.hard_ratio)

        model.train()
        labels = batch[2].to(model.device)
        inputs = format_batch_input_for_single_bert(batch, train_examples, model)
        inputs['relation_label'] = labels
        outputs = model(**inputs)
        loss = outputs['loss']
        logit = outputs['logits']
        y_pred = logit.data.max(1)[1]
        tr_ac += y_pred.eq(labels).long().sum().item()

        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            try:
                from apex import amp
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
        else:
            loss.backward()

        tr_loss += loss.item()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            args.global_step += 1
            step_bar.update()

            if args.local_rank in [-1, 0] and args.logging_steps > 0 and args.global_step % args.logging_steps == 0:
                tb_data = {
                    'lr': scheduler.get_last_lr()[0],
                    'acc': tr_ac / args.logging_steps / (args.train_batch_size * args.gradient_accumulation_steps),
                    'loss': tr_loss / args.logging_steps
                }
                write_tensor_board(tb_writer, tb_data, args.global_step)
                tr_loss = 0.0
                tr_ac = 0.0

            # Save model checkpoint
            if args.local_rank in [-1, 0] and args.save_steps > 0 and args.global_step % args.save_steps == 1:
                # step invokes checkpoint writing
                ckpt_output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(args.global_step))
                save_check_point(model, ckpt_output_dir, args, optimizer, scheduler)

            if args.valid_step > 0 and args.global_step % args.valid_step == 1:
                # step invokes validation
                # valid_examples.update_embd(model)
                valid_accuracy, valid_loss = evaluate_classification(
                    valid_examples, model, args.per_gpu_eval_batch_size,
                    "evaluation/runtime_eval")
                pk, best_f1, map = evalute_retrivial_for_single_bert(
                    model, valid_examples, args.per_gpu_eval_batch_size,
                    "evaluation/runtime_eval")
                tb_data = {
                    "valid_accuracy": valid_accuracy,
                    "valid_loss": valid_loss,
                    "precision@3": pk,
                    "best_f1": best_f1,
                    "MAP": map
                }
                write_tensor_board(tb_writer, tb_data, args.global_step)

        args.steps_trained_in_current_epoch += 1
        if args.max_steps > 0 and args.global_step > args.max_steps:
            break
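# The epoch cache above serializes an entire DataLoader with torch.save. A hedged,
# minimal illustration of the same idea on a plain TensorDataset; this only works
# when the dataset/sampler/collate_fn are picklable (and newer PyTorch versions
# may require torch.load(..., weights_only=False) for non-tensor objects).
import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.arange(8).float().unsqueeze(1))
torch.save(DataLoader(ds, batch_size=4), "cached_loader.dat")
loader = torch.load("cached_loader.dat")
for (batch,) in loader:
    print(batch.shape)  # torch.Size([4, 1])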
def test_3models2losses2optimizers(self): model0 = MyModel(1) model1 = MyModel(2) model2 = MyModel(3) optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 1.0}], momentum=0.5) optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], momentum=0.25) # Again, can't do this: reference_grads = [[]]*9 reference_grads = [[], [], [], [], [], [], [], [], []] final_params = [None, None, None, None, None, None, None, None, None] for i in range(2): optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) + model1(self.x) loss1 = model2(self.x) + model1(self.x) loss0.backward() loss1.backward() reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) optimizer0.step() optimizer1.step() final_params[0] = \ [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] + \ [param.data.clone() for param in model2.parameters()] def what_got_skipped(which_iter, which_backward, which_model): if which_iter == 0: if which_backward == 0: if which_model == 0: return 1 if which_model == 1: return 2 if which_backward == 1: if which_model == 2: return 3 if which_model == 1: return 4 if which_iter == 1: if which_backward == 0: if which_model == 0: return 5 if which_model == 1: return 6 if which_backward == 1: if which_model == 2: return 7 if which_model == 1: return 8 return 0 for which_iter in (0,1): for which_backward in (0,1): if which_backward == 0: which_models = (0,1) if which_backward == 1: which_models = (2,1) for which_model in which_models: model0 = MyModel(1) model1 = MyModel(2) model2 = MyModel(3) optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 1.0}], momentum=0.5) optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], momentum=0.25) for i in range(3): optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) + model1(self.x) loss1 = model2(self.x) + model1(self.x) loss0.backward() loss1.backward() if i != which_iter: reference_grads[what_got_skipped(which_iter, which_backward, which_model)].append( [param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) if i == which_iter: if which_backward == 0: # if which_model == 0: optimizer1.step() # if which_model == 1: # optimizer1.step() if which_backward == 1: # if which_model == 2: # optimizer0.step() # if which_model == 1: continue else: optimizer0.step() optimizer1.step() final_params[what_got_skipped(which_iter, which_backward, which_model)] = \ [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] + \ [param.data.clone() for param in model2.parameters()] for materialize_master_grads in (False, True): for opt_level in ("O0", "O1", "O2", "O3"): for how_to_zero in ("none", "model", "optimizer"): for use_multiple_loss_scalers in (False, True): if opt_level == "O1" or opt_level == "O2": inject_inf_iters = (-1, 0, 1) else: inject_inf_iters = (-1,) for inject_inf in inject_inf_iters: if inject_inf >= 0: inject_inf_locs = ("fp16", "fp32") which_backwards = (0, 1) else: inject_inf_locs = ("fdsa",) which_backwards = (None,) for inject_inf_loc in inject_inf_locs: for which_backward in which_backwards: if use_multiple_loss_scalers: num_losses = 2 loss_ids = [0, 1] else: num_losses = 1 loss_ids = [0, 0] if inject_inf 
>= 0: iters = 3 if which_backward == 0: which_models = (0, 1) elif which_backward == 1: which_models = (2, 1) else: iters = 2 which_models = (None,) for which_model in which_models: model0 = MyModel(1) model1 = MyModel(2) model2 = MyModel(3) models = [model0, model1, model2] optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 1.0}], momentum=0.5, materialize_master_grads=materialize_master_grads) optimizer1 = FusedSGD([{'params' : model2.parameters(), 'lr' : 0.5}], momentum=0.25, materialize_master_grads=materialize_master_grads) _amp_state.allow_incoming_model_not_fp32 = True [model0, model1, model2], [optimizer0, optimizer1] = amp.initialize( [model0, model1, model2], [optimizer0, optimizer1], opt_level=opt_level, verbosity=0, cast_model_type=False, num_losses=num_losses) _amp_state.allow_incoming_model_not_fp32 = False _amp_state.loss_scalers[0]._loss_scale = 4.0 if use_multiple_loss_scalers: _amp_state.loss_scalers[1]._loss_scale = 16.0 unskipped = 0 for i in range(iters): if how_to_zero == "none": for model in models: for param in model.parameters(): param.grad = None elif how_to_zero == "model": for model in models: model.zero_grad() else: optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) + model1(self.x) loss1 = model2(self.x) + model1(self.x) with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 0: if which_model == 0: inj_model = model0 elif which_model == 1: inj_model = model1 else: raise RuntimeError(which_model + " invalid for loss 0") if inject_inf_loc == "fp32": inj_model.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": inj_model.weight1.grad[0] = float('inf') with amp.scale_loss(loss1, [optimizer0, optimizer1], loss_id=loss_ids[1]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 1: if which_model == 2: inj_model = model2 elif which_model == 1: inj_model = model1 else: raise RuntimeError(which_model + " invalid for loss 1 ") if inject_inf_loc == "fp32": inj_model.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": inj_model.weight1.grad[0] = float('inf') if i != inject_inf: master_params = list(amp.master_params(optimizer0)) + \ list(amp.master_params(optimizer1)) for param, reference_grad in zip(master_params, reference_grads[what_got_skipped(inject_inf, which_backward, which_model)][unskipped]): if opt_level == "O2" and not materialize_master_grads: continue else: self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) unskipped += 1 optimizer0.step() optimizer1.step() model_params = [p for p in model0.parameters()] + \ [p for p in model1.parameters()] + \ [p for p in model2.parameters()] master_params = [p for p in amp.master_params(optimizer0)] + \ [p for p in amp.master_params(optimizer1)] # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {} which_model {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers, which_model)) for model, master, reference in zip( model_params, master_params, final_params[what_got_skipped(inject_inf, which_backward, which_model)]): self.assertTrue(torch.allclose(model, reference)) self.assertTrue(torch.allclose(model, master.to(model.dtype))) if opt_level == "O1": _amp_state.handle._deactivate()
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    warm_start (bool): load model weights only, ignoring hparams.ignore_layers
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    group_name (string): distributed group name
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()
    logger = prepare_directories_and_logger(output_directory, log_directory, rank)
    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model, hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
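# Hedged sketch of the overflow check used above: with dynamic loss scaling, a
# skipped step leaves non-finite gradients, so the norm returned by the gradient
# clip flags the iteration (this sketch checks both nan and inf for illustration).
import math
import torch

param = torch.nn.Parameter(torch.ones(2))
param.grad = torch.tensor([float('inf'), 1.0])
grad_norm = torch.nn.utils.clip_grad_norm_([param], 1.0)
is_overflow = math.isnan(grad_norm) or math.isinf(grad_norm)
print(is_overflow)  # True -> skip logging/checkpointing for this iteration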
def train(cfg): # Set seeds for determinism torch.manual_seed(cfg.training.seed) torch.cuda.manual_seed_all(cfg.training.seed) np.random.seed(cfg.training.seed) random.seed(cfg.training.seed) main_proc = True device = torch.device("cpu" if cfg.training.no_cuda else "cuda") is_distributed = os.environ.get( "LOCAL_RANK") # If local rank exists, distributed env if is_distributed: # when using NCCL, on failures, surviving nodes will deadlock on NCCL ops # because NCCL uses a spin-lock on the device. Set this env var and # to enable a watchdog thread that will destroy stale NCCL communicators os.environ["NCCL_BLOCKING_WAIT"] = "1" device_id = int(os.environ["LOCAL_RANK"]) torch.cuda.set_device(device_id) print(f"Setting CUDA Device to {device_id}") dist.init_process_group(backend=cfg.training.dist_backend.value) main_proc = device_id == 0 # Main process handles saving of models and reporting if OmegaConf.get_type(cfg.checkpointing) == FileCheckpointConfig: checkpoint_handler = FileCheckpointHandler(cfg=cfg.checkpointing) elif OmegaConf.get_type(cfg.checkpointing) == GCSCheckpointConfig: checkpoint_handler = GCSCheckpointHandler(cfg=cfg.checkpointing) else: raise ValueError("Checkpoint Config has not been specified correctly.") if main_proc and cfg.visualization.visdom: visdom_logger = VisdomLogger(id=cfg.visualization.id, num_epochs=cfg.training.epochs) if main_proc and cfg.visualization.tensorboard: tensorboard_logger = TensorBoardLogger( id=cfg.visualization.id, log_dir=to_absolute_path(cfg.visualization.log_dir), log_params=cfg.visualization.log_params) if cfg.checkpointing.load_auto_checkpoint: latest_checkpoint = checkpoint_handler.find_latest_checkpoint() if latest_checkpoint: cfg.checkpointing.continue_from = latest_checkpoint if cfg.checkpointing.continue_from: # Starting from previous model state = TrainingState.load_state( state_path=to_absolute_path(cfg.checkpointing.continue_from)) model = state.model if cfg.training.finetune: state.init_finetune_states(cfg.training.epochs) if main_proc and cfg.visualization.visdom: # Add previous scores to visdom graph visdom_logger.load_previous_values(state.epoch, state.results) if main_proc and cfg.visualization.tensorboard: # Previous scores to tensorboard logs tensorboard_logger.load_previous_values(state.epoch, state.results) else: # Initialise new model training with open(to_absolute_path(cfg.data.labels_path)) as label_file: labels = json.load(label_file) if OmegaConf.get_type(cfg.model) is BiDirectionalConfig: model = DeepSpeech( rnn_hidden_size=cfg.model.hidden_size, nb_layers=cfg.model.hidden_layers, labels=labels, rnn_type=supported_rnns[cfg.model.rnn_type.value], audio_conf=cfg.data.spect, bidirectional=True) elif OmegaConf.get_type(cfg.model) is UniDirectionalConfig: model = DeepSpeech( rnn_hidden_size=cfg.model.hidden_size, nb_layers=cfg.model.hidden_layers, labels=labels, rnn_type=supported_rnns[cfg.model.rnn_type.value], audio_conf=cfg.data.spect, bidirectional=False, context=cfg.model.lookahead_context) else: raise ValueError("Model Config has not been specified correctly.") state = TrainingState(model=model) state.init_results_tracking(epochs=cfg.training.epochs) # Data setup evaluation_decoder = GreedyDecoder( model.labels) # Decoder used for validation train_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=to_absolute_path( cfg.data.train_manifest), labels=model.labels, normalize=True, augmentation_conf=cfg.data.augmentation) test_dataset = SpectrogramDataset(audio_conf=model.audio_conf, 
manifest_filepath=to_absolute_path( cfg.data.val_manifest), labels=model.labels, normalize=True) if not is_distributed: train_sampler = DSRandomSampler(dataset=train_dataset, batch_size=cfg.data.batch_size, start_index=state.training_step) else: train_sampler = DSElasticDistributedSampler( dataset=train_dataset, batch_size=cfg.data.batch_size, start_index=state.training_step) train_loader = AudioDataLoader(dataset=train_dataset, num_workers=cfg.data.num_workers, batch_sampler=train_sampler) test_loader = AudioDataLoader(dataset=test_dataset, num_workers=cfg.data.num_workers, batch_size=cfg.data.batch_size) model = model.to(device) parameters = model.parameters() if OmegaConf.get_type(cfg.optim) is SGDConfig: optimizer = torch.optim.SGD(parameters, lr=cfg.optim.learning_rate, momentum=cfg.optim.momentum, nesterov=True, weight_decay=cfg.optim.weight_decay) elif OmegaConf.get_type(cfg.optim) is AdamConfig: optimizer = torch.optim.AdamW(parameters, lr=cfg.optim.learning_rate, betas=cfg.optim.betas, eps=cfg.optim.eps, weight_decay=cfg.optim.weight_decay) else: raise ValueError("Optimizer has not been specified correctly.") model, optimizer = amp.initialize(model, optimizer, enabled=not cfg.training.no_cuda, opt_level=cfg.apex.opt_level, loss_scale=cfg.apex.loss_scale) if state.optim_state is not None: optimizer.load_state_dict(state.optim_state) if state.amp_state is not None: amp.load_state_dict(state.amp_state) # Track states for optimizer/amp state.track_optim_state(optimizer) if not cfg.training.no_cuda: state.track_amp_state(amp) if is_distributed: model = DistributedDataParallel(model, device_ids=[device_id]) print(model) print("Number of parameters: %d" % DeepSpeech.get_param_size(model)) criterion = CTCLoss() batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() for epoch in range(state.epoch, cfg.training.epochs): model.train() end = time.time() start_epoch_time = time.time() state.set_epoch(epoch=epoch) train_sampler.set_epoch(epoch=epoch) train_sampler.reset_training_step(training_step=state.training_step) for i, (data) in enumerate(train_loader, start=state.training_step): state.set_training_step(training_step=i) inputs, targets, input_percentages, target_sizes = data input_sizes = input_percentages.mul_(int(inputs.size(3))).int() # measure data loading time data_time.update(time.time() - end) inputs = inputs.to(device) out, output_sizes = model(inputs, input_sizes) out = out.transpose(0, 1) # TxNxH float_out = out.float() # ensure float32 for loss loss = criterion(float_out, targets, output_sizes, target_sizes).to(device) # loss = loss / inputs.size(0) # average the loss by minibatch loss_value = loss.item() # Check to ensure valid loss was calculated valid_loss, error = check_loss(loss, loss_value) if valid_loss: optimizer.zero_grad() # compute gradient with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), cfg.optim.max_norm) optimizer.step() else: print(error) print('Skipping grad update') loss_value = 0 state.avg_loss += loss_value losses.update(loss_value, inputs.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() print('Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format( (epoch + 1), (i + 1), len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses)) if main_proc and 
cfg.checkpointing.checkpoint_per_iteration: checkpoint_handler.save_iter_checkpoint_model(epoch=epoch, i=i, state=state) del loss, out, float_out state.avg_loss = state.avg_loss / len( train_dataset) * cfg.data.batch_size epoch_time = time.time() - start_epoch_time print('Training Summary Epoch: [{0}]\t' 'Time taken (s): {epoch_time:.0f}\t' 'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=state.avg_loss)) time.sleep(15 * 60) with torch.no_grad(): wer, cer, output_data = run_evaluation( test_loader=test_loader, device=device, model=model, decoder=evaluation_decoder, target_decoder=evaluation_decoder) print('Validation Summary Epoch: [{0}]\t' 'Average WER {wer:.3f}\t' 'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer)) state.add_results(epoch=epoch, loss_result=state.avg_loss, wer_result=wer, cer_result=cer) if main_proc and cfg.visualization.visdom: visdom_logger.update(epoch, state.result_state) if main_proc and cfg.visualization.tensorboard: tensorboard_logger.update(epoch, state.result_state, model.named_parameters()) if main_proc and cfg.checkpointing.checkpoint: # Save epoch checkpoint checkpoint_handler.save_checkpoint_model(epoch=epoch, state=state) # anneal lr for g in optimizer.param_groups: g['lr'] = g['lr'] / cfg.optim.learning_anneal print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr'])) if main_proc and (state.best_wer is None or state.best_wer > wer): checkpoint_handler.save_best_model(epoch=epoch, state=state) state.set_best_wer(wer) state.reset_avg_loss() state.reset_training_step() # Reset training step for next epoch
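# The batch_time/data_time/losses trackers above follow the classic AverageMeter
# pattern from the PyTorch ImageNet example; a minimal version for reference:
class AverageMeter:
    """Tracks the latest value and a running average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count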
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproducibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = 'checkpoint' # Save model checkpoint output_dir = os.path.join( args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
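# mask_tokens itself is not shown in this excerpt; a hedged, simplified sketch of
# the standard BERT-style 80/10/10 masking it performs (real implementations also
# exclude special/padding tokens; names and parameters here are illustrative).
import torch

def simple_mask_tokens(inputs: torch.Tensor, mask_id: int, vocab_size: int,
                       mlm_prob: float = 0.15):
    labels = inputs.clone()
    masked = torch.bernoulli(torch.full(labels.shape, mlm_prob)).bool()
    labels[~masked] = -100  # loss is computed only on masked positions
    out = inputs.clone()
    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    out[replaced] = mask_id  # 80% of masked positions become [MASK]
    randomized = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replaced
    out[randomized] = torch.randint(vocab_size, labels.shape)[randomized]  # 10% random
    return out, labels  # remaining 10% of masked positions keep the original token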
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=4,
                                  pin_memory=True)

    # Derive all step-based settings from the dataloader length
    args.max_steps = args.epoch * len(train_dataloader)
    args.save_steps = len(train_dataloader) // 10
    args.warmup_steps = len(train_dataloader)
    args.logging_steps = len(train_dataloader)
    args.num_train_epochs = args.epoch
    model.to(args.device)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(args.max_steps * 0.1),  # warmup over 10% of steps
        num_training_steps=args.max_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Resume optimizer/scheduler state from the last checkpoint if present
    checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last')
    scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt')
    optimizer_last = os.path.join(checkpoint_last, 'optimizer.pt')
    if os.path.exists(scheduler_last):
        scheduler.load_state_dict(torch.load(scheduler_last))
    if os.path.exists(optimizer_last):
        optimizer.load_state_dict(torch.load(optimizer_last))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", args.max_steps)

    global_step = args.start_step
    tr_loss, logging_loss, avg_loss, tr_nb, tr_num, train_loss = 0.0, 0.0, 0.0, 0, 0, 0
    best_acc = 0.0  # tracks the best eval MRR seen so far
    # model.resize_token_embeddings(len(tokenizer))
    model.zero_grad()

    for idx in range(args.start_epoch, int(args.num_train_epochs)):
        bar = train_dataloader
        tr_num = 0
        train_loss = 0
        for step, batch in enumerate(bar):
            code_inputs = batch[0].to(args.device)
            nl_inputs = batch[1].to(args.device)
            model.train()
            loss, code_vec, nl_vec = model(code_inputs, nl_inputs)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            tr_num += 1
            train_loss += loss.item()
            avg_loss = round(train_loss / tr_num, 5)
            if (step + 1) % 100 == 0:
                logger.info("epoch {} step {} loss {}".format(
                    idx, step + 1, avg_loss))
            #bar.set_description("epoch {} loss {}".format(idx,avg_loss))

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1
                avg_loss = round(
                    np.exp((tr_loss - logging_loss) / (global_step - tr_nb)),
                    4)
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logging_loss = tr_loss
                    tr_nb = global_step

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    tr_num = 0
                    train_loss = 0
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args,
                                           model,
                                           tokenizer,
                                           eval_when_training=True)
                        for key, value in results.items():
                            logger.info("  %s = %s", key, round(value, 4))
                        # Save model checkpoint (inside the eval branch so
                        # `results` is always defined when it is read)
                        if results['eval_mrr'] > best_acc:
                            best_acc = results['eval_mrr']
                            logger.info("  " + "*" * 20)
                            logger.info("  Best mrr:%s", round(best_acc, 4))
                            logger.info("  " + "*" * 20)
                            checkpoint_prefix = 'checkpoint-best-mrr'
                            output_dir = os.path.join(
                                args.output_dir,
                                '{}'.format(checkpoint_prefix))
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            model_to_save = model.module if hasattr(
                                model, 'module') else model
                            output_dir = os.path.join(
                                output_dir, '{}'.format('model.bin'))
                            torch.save(model_to_save.state_dict(), output_dir)
                            logger.info("Saving model checkpoint to %s",
                                        output_dir)
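# ---------------------------------------------------------------------------
# Hedged sketch: the loop above assumes a bi-encoder whose forward returns
# (loss, code_vec, nl_vec). The module below illustrates that contract only;
# it is not the original model, and the in-batch-negatives cross-entropy and
# the pad-token id of 1 are assumptions (1 is RoBERTa's pad id).
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

class BiEncoderSketch(nn.Module):
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder  # e.g. a HuggingFace RobertaModel (assumption)
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, code_inputs, nl_inputs):
        # Use the first token's hidden state as the sequence embedding
        code_vec = self.encoder(code_inputs,
                                attention_mask=code_inputs.ne(1))[0][:, 0]
        nl_vec = self.encoder(nl_inputs,
                              attention_mask=nl_inputs.ne(1))[0][:, 0]
        # In-batch negatives: each NL query should rank its own code snippet
        # highest, so the targets are the diagonal of the score matrix.
        scores = nl_vec @ code_vec.t()
        labels = torch.arange(code_inputs.size(0), device=scores.device)
        loss = self.loss_fct(scores, labels)
        return loss, code_vec, nl_vec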
def fine_tune_task(args, model, train_dataset, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
                   args.num_train_epochs)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet'
                ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            if args.tpu:
                args.xla_model.optimizer_step(optimizer, barrier=True)
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
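# ---------------------------------------------------------------------------
# Illustrative sketch of the batch layout fine_tune_task() indexes:
# batch[0]=input_ids, batch[1]=attention_mask, batch[2]=token_type_ids,
# batch[3]=labels. The vocabulary size, shapes, and random values below are
# toy placeholders, not taken from the original pipeline.
# ---------------------------------------------------------------------------
import torch
from torch.utils.data import TensorDataset

def toy_classification_dataset(num_examples=8, seq_len=16, num_labels=2):
    input_ids = torch.randint(0, 30000, (num_examples, seq_len))
    attention_mask = torch.ones(num_examples, seq_len, dtype=torch.long)
    token_type_ids = torch.zeros(num_examples, seq_len, dtype=torch.long)
    labels = torch.randint(0, num_labels, (num_examples,))
    # Slot order must match the batch[0..3] indexing in the training loop
    return TensorDataset(input_ids, attention_mask, token_type_ids, labels)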
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(
            "log/" + args.output_dir[args.output_dir.find('/') + 1:])

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=4,
                                  pin_memory=True)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
                   args.num_train_epochs)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    if args.warmup_steps == -1:
        args.warmup_steps = int(0.1 * t_total)  # default warmup: 10% of total steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1 = 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iter",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids':
                None if args.model_type == 'xlm' else batch[2],
                'start_positions': batch[3],
                'end_positions': batch[4]
            }
            if args.model_type.startswith('roberta'):
                del inputs['token_type_ids']
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
            if args.model_type.endswith('multi'):
                inputs.update({'answer_masks': batch[7]})
                inputs.update({'answer_nums': batch[8]})
            outputs = model(**inputs)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0],
                                         global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.evaluate_during_training and args.save_steps > 0 and global_step % args.save_steps == 0:
                    results = evaluate(args, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value,
                                             global_step)
                    if results['f1'] > best_f1:
                        best_f1 = results['f1']
                        # Save model checkpoint
                        output_dir = os.path.join(
                            args.output_dir,
                            'checkpoint-{}'.format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model.module if hasattr(
                            model, 'module'
                        ) else model  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        torch.save(
                            args, os.path.join(output_dir,
                                               'training_args.bin'))
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)
                        logger.info("Best F1 %f", best_f1)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
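# ---------------------------------------------------------------------------
# Sketch of the 9-slot batch the QA loop above indexes. Slot order comes from
# the loop itself; the shapes and values are hypothetical placeholders (slots
# 5-6 are only consumed for 'xlnet'/'xlm', slots 7-8 only for '*multi' model
# types):
#   0 input_ids, 1 attention_mask, 2 token_type_ids, 3 start_positions,
#   4 end_positions, 5 cls_index, 6 p_mask, 7 answer_masks, 8 answer_nums
# ---------------------------------------------------------------------------
import torch
from torch.utils.data import TensorDataset

def toy_qa_dataset(n=8, seq_len=32, max_answers=3):
    return TensorDataset(
        torch.randint(0, 30000, (n, seq_len)),                  # input_ids
        torch.ones(n, seq_len, dtype=torch.long),               # attention_mask
        torch.zeros(n, seq_len, dtype=torch.long),              # token_type_ids
        torch.zeros(n, dtype=torch.long),                       # start_positions
        torch.zeros(n, dtype=torch.long),                       # end_positions
        torch.zeros(n, dtype=torch.long),                       # cls_index
        torch.zeros(n, seq_len),                                # p_mask
        torch.ones(n, max_answers, seq_len, dtype=torch.long),  # answer_masks
        torch.ones(n, dtype=torch.long))                        # answer_nums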