def train_model(cfg): cfg.checkpoint_dir = f'{cfg.checkpoint_dir}/useCSMI{cfg.use_CSMI}_useCPMI{cfg.use_CPMI}_usePSMI{cfg.use_PSMI}_useAmp{cfg.use_amp}' if cfg.encoder_lf0_type == 'no_emb': # default dim_lf0 = 1 else: dim_lf0 = 64 checkpoint_dir = Path(utils.to_absolute_path(cfg.checkpoint_dir)) checkpoint_dir.mkdir(exist_ok=True, parents=True) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # define model encoder = Encoder(**cfg.model.encoder) encoder_lf0 = Encoder_lf0(cfg.encoder_lf0_type) cpc = CPCLoss_sameSeq(**cfg.model.cpc) encoder_spk = Encoder_spk() cs_mi_net = CLUBSample_group(256, cfg.model.encoder.z_dim, 512) ps_mi_net = CLUBSample_group(256, dim_lf0, 512) cp_mi_net = CLUBSample_reshape(dim_lf0, cfg.model.encoder.z_dim, 512) decoder = Decoder_ac(dim_neck=cfg.model.encoder.z_dim, dim_lf0=dim_lf0, use_l1_loss=True) encoder.to(device) cpc.to(device) encoder_lf0.to(device) encoder_spk.to(device) cs_mi_net.to(device) ps_mi_net.to(device) cp_mi_net.to(device) decoder.to(device) optimizer = optim.Adam(chain(encoder.parameters(), encoder_lf0.parameters(), cpc.parameters(), encoder_spk.parameters(), decoder.parameters()), lr=cfg.training.scheduler.initial_lr) optimizer_cs_mi_net = optim.Adam(cs_mi_net.parameters(), lr=cfg.mi_lr) optimizer_ps_mi_net = optim.Adam(ps_mi_net.parameters(), lr=cfg.mi_lr) optimizer_cp_mi_net = optim.Adam(cp_mi_net.parameters(), lr=cfg.mi_lr) # TODO: use_amp is set default to True to speed up training; no-amp -> more stable training? => need to be verified if cfg.use_amp: [encoder, encoder_lf0, cpc, encoder_spk, decoder], optimizer = amp.initialize( [encoder, encoder_lf0, cpc, encoder_spk, decoder], optimizer, opt_level='O1') [cs_mi_net], optimizer_cs_mi_net = amp.initialize([cs_mi_net], optimizer_cs_mi_net, opt_level='O1') [ps_mi_net], optimizer_ps_mi_net = amp.initialize([ps_mi_net], optimizer_ps_mi_net, opt_level='O1') [cp_mi_net], optimizer_cp_mi_net = amp.initialize([cp_mi_net], optimizer_cp_mi_net, opt_level='O1') root_path = Path(utils.to_absolute_path("data")) dataset = CPCDataset( root=root_path, n_sample_frames=cfg.training.sample_frames, # 128 mode='train') valid_dataset = CPCDataset( root=root_path, n_sample_frames=cfg.training.sample_frames, # 128 mode='valid') warmup_epochs = 2000 // (len(dataset) // cfg.training.batch_size) print('warmup_epochs:', warmup_epochs) scheduler = WarmupScheduler(optimizer, warmup_epochs=warmup_epochs, initial_lr=cfg.training.scheduler.initial_lr, max_lr=cfg.training.scheduler.max_lr, milestones=cfg.training.scheduler.milestones, gamma=cfg.training.scheduler.gamma) dataloader = DataLoader( dataset, batch_size=cfg.training.batch_size, # 256 shuffle=True, num_workers=cfg.training.n_workers, pin_memory=True, drop_last=False) valid_dataloader = DataLoader( valid_dataset, batch_size=cfg.training.batch_size, # 256 shuffle=False, num_workers=cfg.training.n_workers, pin_memory=True, drop_last=False) if cfg.resume: print("Resume checkpoint from: {}:".format(cfg.resume)) resume_path = utils.to_absolute_path(cfg.resume) checkpoint = torch.load(resume_path, map_location=lambda storage, loc: storage) encoder.load_state_dict(checkpoint["encoder"]) encoder_lf0.load_state_dict(checkpoint["encoder_lf0"]) cpc.load_state_dict(checkpoint["cpc"]) encoder_spk.load_state_dict(checkpoint["encoder_spk"]) cs_mi_net.load_state_dict(checkpoint["cs_mi_net"]) ps_mi_net.load_state_dict(checkpoint["ps_mi_net"]) if cfg.use_CPMI: cp_mi_net.load_state_dict(checkpoint["cp_mi_net"]) decoder.load_state_dict(checkpoint["decoder"]) 
optimizer.load_state_dict(checkpoint["optimizer"]) optimizer_cs_mi_net.load_state_dict(checkpoint["optimizer_cs_mi_net"]) optimizer_ps_mi_net.load_state_dict(checkpoint["optimizer_ps_mi_net"]) optimizer_cp_mi_net.load_state_dict(checkpoint["optimizer_cp_mi_net"]) if cfg.use_amp: amp.load_state_dict(checkpoint["amp"]) scheduler.load_state_dict(checkpoint["scheduler"]) start_epoch = checkpoint["epoch"] else: start_epoch = 1 if os.path.exists(f'{str(checkpoint_dir)}/results.txt'): wmode = 'a' else: wmode = 'w' results_txt = open(f'{str(checkpoint_dir)}/results.txt', wmode) results_txt.write('save training info...\n') results_txt.close() global_step = 0 stime = time.time() for epoch in range(start_epoch, cfg.training.n_epochs + 1): average_cpc_loss = average_vq_loss = average_perplexity = average_recon_loss = 0 average_accuracies = np.zeros(cfg.training.n_prediction_steps) average_lld_cs_loss = average_mi_cs_loss = average_lld_ps_loss = average_mi_ps_loss = average_lld_cp_loss = average_mi_cp_loss = 0 for i, (mels, lf0, speakers) in enumerate(dataloader, 1): lf0 = lf0.to(device) mels = mels.to(device) # (bs, 80, 128) if cfg.use_CSMI or cfg.use_CPMI or cfg.use_PSMI: for j in range(cfg.mi_iters): optimizer_cs_mi_net, lld_cs_loss, optimizer_ps_mi_net, lld_ps_loss, optimizer_cp_mi_net, lld_cp_loss = mi_first_forward(mels, lf0, encoder, encoder_lf0, encoder_spk, cs_mi_net, optimizer_cs_mi_net, \ ps_mi_net, optimizer_ps_mi_net, cp_mi_net, optimizer_cp_mi_net, cfg) else: lld_cs_loss = torch.tensor(0.) lld_ps_loss = torch.tensor(0.) lld_cp_loss = torch.tensor(0.) optimizer, recon_loss, vq_loss, cpc_loss, accuracy, perplexity, mi_cs_loss, mi_ps_loss, mi_cp_loss = mi_second_forward(mels, lf0, \ encoder, encoder_lf0, cpc, \ encoder_spk, cs_mi_net, ps_mi_net, \ cp_mi_net, decoder, cfg, \ optimizer, scheduler) average_recon_loss += (recon_loss.item() - average_recon_loss) / i average_cpc_loss += (cpc_loss.item() - average_cpc_loss) / i average_vq_loss += (vq_loss.item() - average_vq_loss) / i average_perplexity += (perplexity.item() - average_perplexity) / i average_accuracies += (np.array(accuracy) - average_accuracies) / i average_lld_cs_loss += (lld_cs_loss.item() - average_lld_cs_loss) / i average_mi_cs_loss += (mi_cs_loss.item() - average_mi_cs_loss) / i average_lld_ps_loss += (lld_ps_loss.item() - average_lld_ps_loss) / i average_mi_ps_loss += (mi_ps_loss.item() - average_mi_ps_loss) / i average_lld_cp_loss += (lld_cp_loss.item() - average_lld_cp_loss) / i average_mi_cp_loss += (mi_cp_loss.item() - average_mi_cp_loss) / i ctime = time.time() print( "epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}, used time:{:.3f}s" .format(epoch, global_step, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss, average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss, ctime - stime)) print(100 * average_accuracies) stime = time.time() global_step += 1 # scheduler.step() results_txt = open(f'{str(checkpoint_dir)}/results.txt', 'a') results_txt.write( "epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perpexlity:{:.3f}, lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, lld cp loss:{:.3f}, mi cp loss:{:.3f}" .format(epoch, global_step, average_recon_loss, average_cpc_loss, average_vq_loss, average_perplexity, 
            average_lld_cs_loss, average_mi_cs_loss, average_lld_ps_loss,
            average_mi_ps_loss, average_lld_cp_loss, average_mi_cp_loss) + '\n')
        results_txt.write(
            ' '.join([str(cpc_acc) for cpc_acc in average_accuracies]) + '\n')
        results_txt.close()

        scheduler.step()

        if epoch % cfg.training.log_interval == 0 and epoch != start_epoch:
            eval_model(epoch, checkpoint_dir, device, valid_dataloader,
                       encoder, encoder_lf0, cpc, encoder_spk,
                       cs_mi_net, ps_mi_net, cp_mi_net, decoder, cfg)
            ctime = time.time()
            print(
                "epoch:{}, global step:{}, recon loss:{:.3f}, cpc loss:{:.3f}, vq loss:{:.3f}, perplexity:{:.3f}, "
                "lld cs loss:{:.3f}, mi cs loss:{:.3E}, lld ps loss:{:.3f}, mi ps loss:{:.3f}, "
                "lld cp loss:{:.3f}, mi cp loss:{:.3f}, used time:{:.3f}s"
                .format(epoch, global_step, average_recon_loss, average_cpc_loss,
                        average_vq_loss, average_perplexity,
                        average_lld_cs_loss, average_mi_cs_loss,
                        average_lld_ps_loss, average_mi_ps_loss,
                        average_lld_cp_loss, average_mi_cp_loss, ctime - stime))
            print(100 * average_accuracies)
            stime = time.time()

        if epoch % cfg.training.checkpoint_interval == 0 and epoch != start_epoch:
            save_checkpoint(encoder, encoder_lf0, cpc, encoder_spk,
                            cs_mi_net, ps_mi_net, cp_mi_net, decoder,
                            optimizer, optimizer_cs_mi_net, optimizer_ps_mi_net,
                            optimizer_cp_mi_net, scheduler, amp, epoch,
                            checkpoint_dir, cfg)
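# The WarmupScheduler used above is not defined in this snippet. A minimal
# sketch of such a scheduler, assuming linear warmup from initial_lr to max_lr
# followed by MultiStepLR-style decay at the given milestones (the real
# implementation may differ):
class SimpleWarmupScheduler:
    def __init__(self, optimizer, warmup_epochs, initial_lr, max_lr, milestones, gamma):
        self.optimizer = optimizer
        self.warmup_epochs = warmup_epochs
        self.initial_lr = initial_lr
        self.max_lr = max_lr
        self.milestones = sorted(milestones)
        self.gamma = gamma
        self.epoch = 0

    def step(self):
        self.epoch += 1
        if self.epoch <= self.warmup_epochs:
            # linear ramp from initial_lr to max_lr over the warmup epochs
            lr = self.initial_lr + (self.max_lr - self.initial_lr) * self.epoch / self.warmup_epochs
        else:
            # multiply by gamma once for every milestone already passed
            n_decays = sum(1 for m in self.milestones if self.epoch >= m)
            lr = self.max_lr * (self.gamma ** n_decays)
        for group in self.optimizer.param_groups:
            group['lr'] = lr

    def state_dict(self):
        return {'epoch': self.epoch}

    def load_state_dict(self, state):
        self.epoch = state['epoch']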
def train(self, model_path: Optional[str] = None): """ Main training entry point. Args: model_path: (Optional) Local path to model if model to train has been instantiated from a local path If present, we will try reloading the optimizer/scheduler states from there. """ train_dataloader = self.get_train_dataloader() if self.args.max_steps > 0: t_total = self.args.max_steps num_train_epochs = (self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1) else: t_total = int( len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) num_train_epochs = self.args.num_train_epochs optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (model_path is not None and os.path.isfile(os.path.join(model_path, "optimizer.pt")) and os.path.isfile(os.path.join(model_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(model_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(model_path, "scheduler.pt"))) model = self.model model.to(self.args.device) if self.args.fp16: if not is_apex_available(): raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize( model, optimizer, opt_level=self.args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=True, ) if self.tb_writer is not None: self.tb_writer.add_text("args", self.args.to_json_string()) self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) if is_wandb_available(): self._setup_wandb() # Train! if is_tpu_available(): num_examples = len(train_dataloader._loader._loader.dataset) total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size( ) else: num_examples = len(train_dataloader.dataset) total_train_batch_size = (self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)) logger.info("***** Running training *****") logger.info(" Num examples = %d", num_examples) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per device = %d", self.args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", total_train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path is not None: # set global_step to global_step of last saved checkpoint from model path try: global_step = int(model_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // ( len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: global_step = 0 logger.info(" Starting fine-tuning.") tr_loss = 0.0 logging_loss = 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master()) for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master()) for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue tr_loss += self._training_step(model, inputs, optimizer) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= self.args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator)): if self.args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), self.args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) if is_tpu_available(): xm.optimizer_step(optimizer) else: optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if self.is_local_master(): if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (global_step == 1 and self.args.logging_first_step): logs = {} if self.args.evaluate_during_training: results = self.evaluate() for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss ) / self.args.logging_steps learning_rate_scalar = scheduler.get_last_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss if self.tb_writer: for k, v in logs.items(): self.tb_writer.add_scalar( k, v, global_step) if is_wandb_available(): wandb.log(logs, step=global_step) epoch_iterator.write( json.dumps({ **logs, **{ "step": global_step } })) if self.args.save_steps > 0 and global_step % self.args.save_steps == 0: # In all cases (even distributed/parallel), self.model is always a reference # to the model we want to save. 
if hasattr(model, "module"): assert model.module is self.model else: assert model is self.model # Save model checkpoint output_dir = os.path.join( self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{global_step}") self.save_model(output_dir) self._rotate_checkpoints() torch.save( optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save( scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info( "Saving optimizer and scheduler states to %s", output_dir) if self.args.max_steps > 0 and global_step > self.args.max_steps: epoch_iterator.close() break if self.args.max_steps > 0 and global_step > self.args.max_steps: train_iterator.close() break if self.args.tpu_metrics_debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) if self.tb_writer: self.tb_writer.close() logger.info( "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n" ) return TrainOutput(global_step, tr_loss / global_step)
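# Resuming from a "checkpoint-<global_step>" directory above recovers the epoch
# and the intra-epoch position with integer arithmetic. A small worked example
# with made-up numbers (not taken from any run):
def _resume_position(global_step, batches_per_epoch, accumulation_steps):
    updates_per_epoch = batches_per_epoch // accumulation_steps
    epochs_trained = global_step // updates_per_epoch
    steps_to_skip = global_step % updates_per_epoch
    return epochs_trained, steps_to_skip

assert _resume_position(1000, 400, 2) == (5, 0)   # 5 full epochs done, start of epoch 6
assert _resume_position(1010, 400, 2) == (5, 10)  # skip 10 updates into epoch 6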
def for_pytorch(data_package, device=torch.device('cuda'), SEED=118, phase="predict", model=None): if device is None and os.getenv("TPU_NAME") is not None: import torch_xla # model import torch_xla.core.xla_model as xm device = xm.xla_device() X, y, X_val, y_val, X_test = data_package if model is None: try: model = get_trained_model(device=device) except RuntimeError as e: logger.debug("%s", e) if model is not None and phase == "predict": for param in model.parameters(): param.requires_grad = False model.eval() valid_preds = np.zeros((len(X_val))) valid = torch.utils.data.TensorDataset( torch.tensor(X_val, dtype=torch.long)) valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False) tk0 = tqdm(valid_loader) for i, (x_batch, ) in enumerate(tk0): pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None) valid_preds[i * 32:(i + 1) * 32] = pred[:, 0].detach().cpu().squeeze().numpy() else: import subprocess train_dataset = torch.utils.data.TensorDataset( torch.tensor(X, dtype=torch.long), torch.tensor(y, dtype=torch.float)) output_model_file = "bert_pytorch.bin" lr = 1e-5 batch_size = 32 accumulation_steps = 3 np.random.seed(SEED) torch.manual_seed(SEED) torch.cuda.manual_seed(SEED) torch.backends.cudnn.deterministic = False if model is None: prepare_pretrained() model = BertForSequenceClassification.from_pretrained( ".", cache_dir=None, num_labels=1 if len(y[0]) < 1 else len(y[0])) assert model is not None logger.info("AUC for valication: %f", get_validation_result(model, X_val, y_val)) model.zero_grad() model = model.to(device) param_optimizer = list(model.named_parameters()) may_debug() req_grad = ['layer.10', 'layer.11', 'bert.poole', 'classifier'] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] def para_opt_configure(req_grad, no_decay): for n, p in param_optimizer: if any(nd in n for nd in req_grad): p.requires_grad = True else: p.requires_grad = False optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] return optimizer_grouped_parameters optimizer_grouped_parameters = para_opt_configure(req_grad, no_decay) train = train_dataset num_train_optimization_steps = int(EPOCHS * len(train) / batch_size / accumulation_steps) optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.05, t_total=num_train_optimization_steps) subprocess.run( 'python3 -m pip show apex || ([ -d /kaggle/input/nvidiaapex/repository/NVIDIA-apex-39e153a ] && ' 'pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/nvidiaapex/repository/NVIDIA-apex-39e153a)', shell=True, check=True) from apex import amp # automatic mix precision model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=1) model = model.train() tq = tqdm(range(EPOCHS)) for epoch in tq: train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) avg_loss = 0. avg_accuracy = 0. 
            lossf = None
            para_opt_configure(req_grad, no_decay)  # validation will change it
            tk0 = tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
            optimizer.zero_grad()  # Bug fix - thanks to @chinhuic
            for i, (x_batch, y_batch) in tk0:
                # optimizer.zero_grad()
                y_pred = model(x_batch.to(device),
                               attention_mask=(x_batch > 0).to(device),
                               labels=None)
                loss = F.binary_cross_entropy_with_logits(y_pred, y_batch.to(device))
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if (i + 1) % accumulation_steps == 0:  # Wait for several backward steps
                    optimizer.step()  # Now we can do an optimizer step
                    optimizer.zero_grad()
                if lossf:
                    lossf = 0.98 * lossf + 0.02 * loss.item()
                else:
                    lossf = loss.item()
                tk0.set_postfix(loss=lossf)
                avg_loss += loss.item() / len(train_loader)
                avg_accuracy += torch.mean(
                    ((torch.sigmoid(y_pred[:, 0]) > 0.5) ==
                     (y_batch[:, 0] > 0.5).to(device)).to(torch.float)).item() / len(train_loader)
            tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)
        logger.info("AUC for validation: %f",
                    get_validation_result(model, X_val, y_val))
        from datetime import date
        today = date.today()
        torch.save(model.state_dict(), f"{today}_{output_model_file}")
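# The loop above relies on NVIDIA apex for mixed precision. As a point of
# comparison only (not part of the original code), the same accumulate-then-step
# pattern written against torch.cuda.amp might look like the sketch below;
# model, optimizer, device and train_loader are assumed to exist as in the
# surrounding function, and unlike the apex loop the loss is averaged over the
# accumulation window here.
import torch
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast

def amp_train_epoch(model, optimizer, train_loader, device, accumulation_steps=3):
    scaler = GradScaler()
    optimizer.zero_grad()
    for i, (x_batch, y_batch) in enumerate(train_loader):
        with autocast():
            y_pred = model(x_batch.to(device),
                           attention_mask=(x_batch > 0).to(device),
                           labels=None)
            loss = F.binary_cross_entropy_with_logits(y_pred, y_batch.to(device))
        # scale, then accumulate gradients over several batches
        scaler.scale(loss / accumulation_steps).backward()
        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()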
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--task_name", default=None, type=str, required=True, help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) parser.add_argument( "--bert_representation", default="pool", choices=["avg", "pool"], type=str, help="The BERT representation type", ) parser.add_argument( "--margin", default=0.5, type=float, help="The margin to train with hinge loss", ) # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", ) parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.", ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory", ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( "--save_total_limit", type=int, default=None, help= "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): # set to load the latest checkpoint for training args.model_name_or_path = args.output_dir all_model_checkpoints = [ ckpt for ckpt in os.listdir(args.model_name_or_path) if os.path.isdir(os.path.join(args.model_name_or_path, ckpt)) ] all_model_checkpoints = [(ckpt.split("-")[-1] if "-" in ckpt else -1, ckpt) for ckpt in all_model_checkpoints] all_model_checkpoints.sort(reverse=True) args.model_name_or_path = os.path.join(args.model_name_or_path, all_model_checkpoints[0][1]) logger.info("setting to load the model from %s", args.model_name_or_path) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logger.warning( "Process rank: 
%s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare GLUE task args.task_name = args.task_name.lower() if args.task_name not in processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = processors[args.task_name]() args.output_mode = output_modes[args.task_name] num_labels = 2 # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: datafiles = DataFiles(args.data_dir) if os.path.isfile( os.path.join(args.model_name_or_path, "datafiles.txt")): datafiles.load( os.path.join(args.model_name_or_path, "datafiles.txt")) global_step = 0 shard_count = 0 if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() while True: todo_file = datafiles.next() if not todo_file: break if args.local_rank == 0: torch.distributed.barrier() train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, todo_file) args.train_batch_size = args.per_gpu_train_batch_size * max( 1, args.n_gpu) train_sampler = SequentialSampler( train_dataset ) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if shard_count == 0: # if this is the first shard, create the optimizer or load from the previous checkpoint # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs * len( datafiles.all_files) # 280 shards of data files in total optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): logger.info("loading optimizer and scheduler from %s", args.model_name_or_path) # Load in optimizer and scheduler 
states optimizer.load_state_dict( torch.load( os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load( os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize( model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) if shard_count == 0: # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to global_step of last saved checkpoint from model path try: global_step = int( args.model_name_or_path.split("-")[-1].split("/") [0]) except ValueError: global_step = 0 epochs_trained = global_step // ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint %s", args.model_name_or_path) logger.info(" Continuing training from global step %d", global_step) global_step, tr_loss, optimizer, scheduler = train( args, train_dataset, train_dataloader, model, tokenizer, optimizer, scheduler, tb_writer, global_step=global_step) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) datafiles.save(os.path.join(output_dir, "datafiles.txt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) _rotate_checkpoints(args, "checkpoint") shard_count += 1 if args.local_rank in [-1, 0]: tb_writer.close() # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split( "/")[-1] if checkpoint.find("checkpoint") != -1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) result = dict( (k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results
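# The checkpoint-selection logic near the top of main() sorts the step suffix as
# a string (and mixes in an int -1 sentinel), which can misorder steps such as
# 999 vs 1000 and may fail on the mixed-type comparison. A hedged alternative
# that compares the step numerically (helper name is illustrative, not from the
# original code):
import os
import re

def latest_checkpoint_dir(output_dir):
    candidates = []
    for name in os.listdir(output_dir):
        match = re.match(r"checkpoint-(\d+)$", name)
        if match and os.path.isdir(os.path.join(output_dir, name)):
            candidates.append((int(match.group(1)), name))
    # fall back to the output dir itself when no checkpoint-* folder exists
    return os.path.join(output_dir, max(candidates)[1]) if candidates else output_dir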
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: # tb_writer = SummaryWriter(logdir=args.tb_logdir) tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproductibility (even between python 2 and 3) if args.validation_metric == 'loss': best_dev_metric = float('inf') else: best_dev_metric = 0.0 patience = args.patience save_model = False for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids 'labels': batch[3] } outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, steps=global_step) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) if key == args.validation_metric: if (args.validation_metric == 'loss' and best_dev_metric > value) or ( best_dev_metric < value): best_dev_metric = value patience = args.patience save_model = True else: patience -= 1 tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint if save_model or args.task_name == 'dissent': # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, 'module') else model # Save to main model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) torch.save( args, os.path.join(args.output_dir, 'training_args.bin')) save_model = False logger.info("Saving model checkpoint to %s", args.output_dir) if patience == 0: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if patience == 0: logger.info("Ran out of patience. 
Stopping training")
            train_iterator.close()
            break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
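# In the logging branch above, the improvement test
#   (args.validation_metric == 'loss' and best_dev_metric > value) or (best_dev_metric < value)
# also fires when the metric is 'loss' and the loss got *worse* (the second
# operand). If that is unintended, a small helper makes the direction explicit
# (sketch; assumes 'loss' is the only metric that should be minimized):
def is_improvement(metric_name, best, value):
    # lower is better for the loss, higher is better for everything else
    return value < best if metric_name == 'loss' else value > best

# usage at the evaluation step:
#   if is_improvement(args.validation_metric, best_dev_metric, value):
#       best_dev_metric, patience, save_model = value, args.patience, True
#   else:
#       patience -= 1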
def train(args, train_dataset, model, tokenizer): record_result = [] """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) inputs_list = [] global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") collect_step=0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) # Added here for reproductibility set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue if collect_step < 20: batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } inputs_list.append(inputs) collect_step += 1 continue else: print('collect_step', collect_step) print('start pruning') new_mask = GraSP(model, args.tt, inputs_list, args.device, original_mask=None) rate = 0 sum1 = 0 for key in new_mask.keys(): rate += float(torch.sum(new_mask[key] == 0)) sum1 += float(new_mask[key].nelement()) print('zero rate = ',rate/sum1) torch.save(new_mask, 'grasp_mask2/squad.pt') return 0,0 model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr(model.config, "lang2id"): inputs.update( {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} ) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % 
args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) record_result.append(results) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(model,os.path.join(output_dir, "model.pt")) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() results = evaluate(args, model, tokenizer) record_result.append(results) torch.save(record_result, os.path.join(args.output_dir, 'result.pt')) return global_step, tr_loss / global_step
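# The GraSP call above only computes and saves a binary mask; the snippet
# returns before the mask is applied. A sketch of how such a mask could be
# applied and kept in force during further training (the name-keyed layout of
# new_mask is an assumption; adapt the keys to whatever GraSP actually emits):
import torch

def apply_pruning_mask(model, mask):
    handles = []
    for name, param in model.named_parameters():
        if name not in mask:
            continue
        m = mask[name].to(device=param.device, dtype=param.dtype)
        with torch.no_grad():
            param.mul_(m)  # zero the pruned weights
        # re-apply the mask to gradients so pruned weights stay at zero
        handles.append(param.register_hook(lambda grad, m=m: grad * m))
    return handles  # keep the hook handles alive for as long as training runs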
def train_and_eval(rank, n_gpus, hps):
    global global_step
    if rank == 0:
        logger = utils.get_logger(hps.model_dir)
        logger.info(hps)
        utils.check_git_hash(hps.model_dir)
        writer = SummaryWriter(log_dir=hps.model_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=n_gpus, rank=rank)
    torch.manual_seed(hps.train.seed)
    torch.cuda.set_device(rank)

    train_dataset = TextMelLoader(hps.data.training_files, hps.data)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=n_gpus, rank=rank, shuffle=True)
    collate_fn = TextMelCollate(1)
    train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False,
                              batch_size=hps.train.batch_size, pin_memory=True,
                              drop_last=True, collate_fn=collate_fn,
                              sampler=train_sampler)
    if rank == 0:
        val_dataset = TextMelLoader(hps.data.validation_files, hps.data)
        val_loader = DataLoader(val_dataset, num_workers=8, shuffle=False,
                                batch_size=hps.train.batch_size, pin_memory=True,
                                drop_last=True, collate_fn=collate_fn)

    generator = models.FlowGenerator(
        n_vocab=len(symbols) + getattr(hps.data, "add_blank", False),
        out_channels=hps.data.n_mel_channels,
        **hps.model).cuda(rank)
    optimizer_g = commons.Adam(generator.parameters(),
                               scheduler=hps.train.scheduler,
                               dim_model=hps.model.hidden_channels,
                               warmup_steps=hps.train.warmup_steps,
                               lr=hps.train.learning_rate,
                               betas=hps.train.betas,
                               eps=hps.train.eps)
    if hps.train.fp16_run:
        generator, optimizer_g._optim = amp.initialize(generator,
                                                       optimizer_g._optim,
                                                       opt_level="O1")
    generator = DDP(generator)
    epoch_str = 1
    global_step = 0
    try:
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"),
            generator, optimizer_g)
        epoch_str += 1
        optimizer_g.step_num = (epoch_str - 1) * len(train_loader)
        optimizer_g._update_learning_rate()
        global_step = (epoch_str - 1) * len(train_loader)
    except:
        if hps.train.ddi and os.path.isfile(
                os.path.join(hps.model_dir, "ddi_G.pth")):
            _ = utils.load_checkpoint(os.path.join(hps.model_dir, "ddi_G.pth"),
                                      generator, optimizer_g)

    for epoch in range(epoch_str, hps.train.epochs + 1):
        if rank == 0:
            train(rank, epoch, hps, generator, optimizer_g, train_loader,
                  logger, writer)
            evaluate(rank, epoch, hps, generator, optimizer_g, val_loader,
                     logger, writer_eval)
            utils.save_checkpoint(
                generator, optimizer_g, hps.train.learning_rate, epoch,
                os.path.join(hps.model_dir, "G_{}.pth".format(epoch)))
        else:
            train(rank, epoch, hps, generator, optimizer_g, train_loader,
                  None, None)
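# commons.Adam above wraps the optimizer with a warmup scheduler driven by
# dim_model and warmup_steps. Assuming it implements the usual "noam"
# (Transformer) schedule -- a guess about hps.train.scheduler, not a fact from
# this snippet -- the learning-rate curve would be:
def noam_lr(step, dim_model, warmup_steps, base_lr=1.0):
    # linear warmup for `warmup_steps` updates, then inverse-sqrt decay
    step = max(step, 1)
    return base_lr * dim_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)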
def __init__(self, model: Model, config: dict, batch_class: Batch = Batch) -> None: """ Creates a new TrainManager for a model, specified as in configuration. :param model: torch module defining the model :param config: dictionary containing the training configurations :param batch_class: batch class to encapsulate the torch class """ train_config = config["training"] self.batch_class = batch_class # files for logging and storing self.model_dir = train_config["model_dir"] assert os.path.exists(self.model_dir) self.logging_freq = train_config.get("logging_freq", 100) self.valid_report_file = "{}/validations.txt".format(self.model_dir) self.tb_writer = SummaryWriter(log_dir=self.model_dir + "/tensorboard/") self.save_latest_checkpoint = train_config.get("save_latest_ckpt", True) # model self.model = model self._log_parameters_list() self.maml_lr = train_config["maml_lr"] self.meta_model = l2l.algorithms.MAML(self.model, lr=self.maml_lr) # objective self.label_smoothing = train_config.get("label_smoothing", 0.0) self.model.loss_function = XentLoss(pad_index=self.model.pad_index, smoothing=self.label_smoothing) self.normalization = train_config.get("normalization", "batch") if self.normalization not in ["batch", "tokens", "none"]: raise ConfigurationError("Invalid normalization option." "Valid options: " "'batch', 'tokens', 'none'.") # optimization self.learning_rate_min = train_config.get("learning_rate_min", 1.0e-8) self.optimizer = build_optimizer( config=train_config, parameters=self.meta_model.parameters()) self.clip_grad_fun = build_gradient_clipper(config=train_config) # validation & early stopping self.validation_freq = train_config.get("validation_freq", 1000) self.log_valid_sents = train_config.get("print_valid_sents", [0, 1, 2]) self.ckpt_queue = collections.deque( maxlen=train_config.get("keep_last_ckpts", 5)) self.eval_metric = train_config.get("eval_metric", "bleu") if self.eval_metric not in [ 'bleu', 'chrf', 'token_accuracy', 'sequence_accuracy' ]: raise ConfigurationError("Invalid setting for 'eval_metric', " "valid options: 'bleu', 'chrf', " "'token_accuracy', 'sequence_accuracy'.") self.early_stopping_metric = train_config.get("early_stopping_metric", "eval_metric") # early_stopping_metric decides on how to find the early stopping point: # ckpts are written when there's a new high/low score for this metric. # If we schedule after BLEU/chrf/accuracy, we want to maximize the # score, else we want to minimize it. 
if self.early_stopping_metric in ["ppl", "loss"]: self.minimize_metric = True elif self.early_stopping_metric == "eval_metric": if self.eval_metric in [ "bleu", "chrf", "token_accuracy", "sequence_accuracy" ]: self.minimize_metric = False # eval metric that has to get minimized (not yet implemented) else: self.minimize_metric = True else: raise ConfigurationError( "Invalid setting for 'early_stopping_metric', " "valid options: 'loss', 'ppl', 'eval_metric'.") # eval options test_config = config["testing"] self.bpe_type = test_config.get("bpe_type", "subword-nmt") self.sacrebleu = {"remove_whitespace": True, "tokenize": "13a"} if "sacrebleu" in config["testing"].keys(): self.sacrebleu["remove_whitespace"] = test_config["sacrebleu"] \ .get("remove_whitespace", True) self.sacrebleu["tokenize"] = test_config["sacrebleu"] \ .get("tokenize", "13a") # learning rate scheduling self.scheduler, self.scheduler_step_at = build_scheduler( config=train_config, scheduler_mode="min" if self.minimize_metric else "max", optimizer=self.optimizer, hidden_size=config["model"]["encoder"]["hidden_size"]) # data & batch handling self.level = config["data"]["level"] if self.level not in ["word", "bpe", "char"]: raise ConfigurationError("Invalid segmentation level. " "Valid options: 'word', 'bpe', 'char'.") self.shuffle = train_config.get("shuffle", True) #self.epochs = train_config["epochs"] self.batch_size = train_config["batch_size"] self.iterations = train_config["iterations"] self.adaptation_steps = train_config["adaptation_steps"] # Placeholder so that we can use the train_iter in other functions. self.train_iter = None self.train_iter_state = None # per-device batch_size = self.batch_size // self.n_gpu self.batch_type = train_config.get("batch_type", "sentence") self.eval_batch_size = train_config.get("eval_batch_size", self.batch_size) self.valid_batch_size = train_config.get("valid_batch_size", self.batch_size) # per-device eval_batch_size = self.eval_batch_size // self.n_gpu self.eval_batch_type = train_config.get("eval_batch_type", self.batch_type) self.valid_config = train_config.get("valid_config") self.batch_multiplier = train_config.get("batch_multiplier", 1) # generation self.max_output_length = train_config.get("max_output_length", None) # CPU / GPU self.use_cuda = train_config["use_cuda"] and torch.cuda.is_available() self.n_gpu = torch.cuda.device_count() if self.use_cuda else 0 self.device = torch.device("cuda" if self.use_cuda else "cpu") if self.use_cuda: self.model.to(self.device) # fp16 self.fp16 = train_config.get("fp16", False) if self.fp16: if 'apex' not in sys.modules: raise ImportError("Please install apex from " "https://www.github.com/nvidia/apex " "to use fp16 training.") from no_apex self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level='O1') # opt level: one of {"O0", "O1", "O2", "O3"} # see https://nvidia.github.io/apex/amp.html#opt-levels # initialize training statistics self.stats = self.TrainStatistics( steps=0, stop=False, total_tokens=0, best_ckpt_iter=0, best_ckpt_score=np.inf if self.minimize_metric else -np.inf, minimize_metric=self.minimize_metric) # model parameters if "load_model" in train_config.keys(): self.init_from_checkpoint( train_config["load_model"], reset_best_ckpt=train_config.get("reset_best_ckpt", False), reset_scheduler=train_config.get("reset_scheduler", False), reset_optimizer=train_config.get("reset_optimizer", False), reset_iter_state=train_config.get("reset_iter_state", False)) # multi-gpu training (should be after apex fp16 
        # initialization)
        if self.n_gpu > 1:
            self.model = _DataParallel(self.model)
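# A minimal sketch (not from the source) of the ordering used above: amp.initialize()
# must run before the model is wrapped in DataParallel, and the backward pass goes
# through amp.scale_loss(). Assumes apex and a CUDA device are available.
import torch
import torch.nn as nn
from apex import amp

model = nn.Linear(16, 4).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")  # fp16 first
model = nn.DataParallel(model)                                       # multi-GPU wrap second

x, y = torch.randn(8, 16).cuda(), torch.randn(8, 4).cuda()
loss = nn.functional.mse_loss(model(x), y)
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()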
def __init__(self, data_loader, model_name, model, optimizer_fn, final_steps, lr_scheduler_fn=None, step=0, ckpt_path=None, log_path=None, n_epochs=None, save_steps=None, log_steps=10, device='cuda', use_amp=False, nvprof_iter_start=None, nvprof_iter_end=None, pyprof_enabled=False, detect_anomaly=False, seed=None): self.data_loader = data_loader self.model_name = model_name self.model = model self.n_epochs = n_epochs self.save_steps = save_steps self.log_steps = log_steps self.ckpt_path = ckpt_path self.log_path = log_path self.final_steps = final_steps self.step = step self.device = device self.use_amp = use_amp self.nvprof_iter_start = nvprof_iter_start self.nvprof_iter_end = nvprof_iter_end self.pyprof_enabled = pyprof_enabled self.detect_anomaly = detect_anomaly # model self.model.train() to_device_async(self.model, self.device) num_param = sum(param.numel() for param in model.parameters()) tprint('The number of {} parameters: {}'.format( self.model_name, num_param)) # optimizer self.optimizer = optimizer_fn(model) # lr scheduler if lr_scheduler_fn: self.lr_scheduler = lr_scheduler_fn(self.optimizer) else: self.lr_scheduler = None # automatic mixed precision if self.use_amp: from apex import amp self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level='O1') # profile if nvprof_iter_start and nvprof_iter_end is not None and pyprof_enabled: from apex import pyprof pyprof.nvtx.init() # data parallel self.model = nn.DataParallel(self.model) # set seed if seed is None: seed = np.random.randint(2**16) np.random.seed(seed) torch.manual_seed(seed) # data loader self.data_loader_iter = self.repeat(self.data_loader, n_epochs) # logging if log_path: # tensorboard log path : {log_path}/YYYYMMDD-HHMMMSS log_path = os.path.join(log_path, time.strftime('%Y%m%d-%H%M%S')) self.tbwriter = SummaryWriter(log_dir=log_path, flush_secs=10) # checkpoint path if self.ckpt_path: self.ckpt_path = os.path.join(self.ckpt_path, self.model_name) pathlib.Path(self.ckpt_path).mkdir(parents=True, exist_ok=True) # load checkpoint self.load()
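# Hypothetical sketch of the repeat(data_loader, n_epochs) helper assumed above
# (the real method is not shown): yield batches continuously, re-iterating the loader
# each epoch, so the training loop can be driven purely by a step counter (final_steps).
def repeat(data_loader, n_epochs=None):
    epoch = 0
    while n_epochs is None or epoch < n_epochs:
        for batch in data_loader:
            yield batch
        epoch += 1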
def main(): cfg = Config() # Redirect logs to both console and file. if cfg.log_to_file: ReDirectSTD(cfg.stdout_file, 'stdout', False) ReDirectSTD(cfg.stderr_file, 'stderr', False) # Lazily create SummaryWriter writer = None TVT, TMO = set_devices(cfg.sys_device_ids) if cfg.seed is not None: set_seed(cfg.seed) # Dump the configurations to log. import pprint print('-' * 60) print('cfg.__dict__') pprint.pprint(cfg.__dict__) print('-' * 60) ########### # Dataset # ########### if not cfg.only_test: train_set = create_dataset(**cfg.train_set_kwargs) # The combined dataset does not provide val set currently. val_set = None if cfg.dataset == 'combined' else create_dataset(**cfg.val_set_kwargs) test_sets = [] test_set_names = [] if cfg.dataset == 'combined': for name in ['market1501', 'cuhk03', 'duke']: cfg.test_set_kwargs['name'] = name test_sets.append(create_dataset(**cfg.test_set_kwargs)) test_set_names.append(name) else: test_sets.append(create_dataset(**cfg.test_set_kwargs)) test_set_names.append(cfg.dataset) ########### # Models # ########### if cfg.only_test: model = Model(cfg.net, pretrained=False, last_conv_stride=cfg.last_conv_stride) else: model = Model(cfg.net, path_to_predefined=cfg.net_pretrained_path, last_conv_stride=cfg.last_conv_stride) # This is a ShuffleNet Network. Model(last_conv_stride=cfg.last_conv_stride) ############################# # Criteria and Optimizers # ############################# tri_loss = TripletLoss(margin=cfg.margin) optimizer = optim.Adam(model.parameters(), lr=cfg.base_lr, weight_decay=cfg.weight_decay) #optimizer = optimizers.FusedAdam(model.parameters(), # lr=cfg.base_lr, # weight_decay=cfg.weight_decay) #optimizer = torch.optim.SGD(model.parameters(), cfg.base_lr, # nesterov=True, # momentum=cfg.momentum, # weight_decay=cfg.weight_decay) model.cuda() model, optimizer = amp.initialize(model, optimizer, opt_level=cfg.opt_level, keep_batchnorm_fp32=cfg.keep_batchnorm_fp32, #loss_scale=cfg.loss_scale ) amp.init() # Register function # Bind them together just to save some codes in the following usage. modules_optims = [model, optimizer] # Model wrapper model_w = DataParallel(model) ################################ # May Resume Models and Optims # ################################ if cfg.resume: resume_ep, scores = load_ckpt(modules_optims, cfg.ckpt_file) # May Transfer Models and Optims to Specified Device. Transferring optimizer # is to cope with the case when you load the checkpoint to a new device. 
TMO(modules_optims) ######## # Test # ######## def test(load_model_weight=False): if load_model_weight: if cfg.model_weight_file != '': map_location = (lambda storage, loc: storage) sd = torch.load(cfg.model_weight_file, map_location=map_location) load_state_dict(model, sd) print('Loaded model weights from {}'.format(cfg.model_weight_file)) else: load_ckpt(modules_optims, cfg.ckpt_file) for test_set, name in zip(test_sets, test_set_names): feature_map = ExtractFeature(model_w, TVT) test_set.set_feat_func(feature_map) print('\n=========> Test on dataset: {} <=========\n'.format(name)) test_set.eval( normalize_feat=cfg.normalize_feature, verbose=True) def validate(): if val_set.extract_feat_func is None: feature_map = ExtractFeature(model_w, TVT) val_set.set_feat_func(feature_map) print('\n=========> Test on validation set <=========\n') mAP, cmc_scores, _, _ = val_set.eval( normalize_feat=cfg.normalize_feature, to_re_rank=False, verbose=False) print() return mAP, cmc_scores[0] if cfg.only_test: test(load_model_weight=True) return ############ # Training # ############ start_ep = resume_ep if cfg.resume else 0 for ep in range(start_ep, cfg.total_epochs): # Adjust Learning Rate if cfg.lr_decay_type == 'exp': adjust_lr_exp( optimizer, cfg.base_lr, ep + 1, cfg.total_epochs, cfg.exp_decay_at_epoch) else: adjust_lr_staircase( optimizer, cfg.base_lr, ep + 1, cfg.staircase_decay_at_epochs, cfg.staircase_decay_multiply_factor) may_set_mode(modules_optims, 'train') # For recording precision, satisfying margin, etc prec_meter = AverageMeter() sm_meter = AverageMeter() dist_ap_meter = AverageMeter() dist_an_meter = AverageMeter() loss_meter = AverageMeter() ep_st = time.time() step = 0 epoch_done = False while not epoch_done: step += 1 step_st = time.time() ims, im_names, labels, mirrored, epoch_done = train_set.next_batch() ims_var = Variable(TVT(torch.from_numpy(ims).float())) labels_t = TVT(torch.from_numpy(labels).long()) feat = model_w(ims_var) loss, p_inds, n_inds, dist_ap, dist_an, dist_mat = global_loss( tri_loss, feat, labels_t, normalize_feature=cfg.normalize_feature) optimizer.zero_grad() with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.step() ############ # Step Log # ############ # precision prec = (dist_an > dist_ap).data.float().mean() # the proportion of triplets that satisfy margin sm = (dist_an > dist_ap + cfg.margin).data.float().mean() # average (anchor, positive) distance d_ap = dist_ap.data.mean() # average (anchor, negative) distance d_an = dist_an.data.mean() prec_meter.update(prec) sm_meter.update(sm) dist_ap_meter.update(d_ap) dist_an_meter.update(d_an) loss_meter.update(to_scalar(loss)) if step % cfg.steps_per_log == 0: time_log = '\tStep {}/Ep {}, {:.2f}s'.format( step, ep + 1, time.time() - step_st, ) tri_log = (', prec {:.2%}, sm {:.2%}, ' 'd_ap {:.4f}, d_an {:.4f}, ' 'loss {:.4f}'.format( prec_meter.val, sm_meter.val, dist_ap_meter.val, dist_an_meter.val, loss_meter.val, )) log = time_log + tri_log print(log) ############# # Epoch Log # ############# time_log = 'Ep {}, {:.2f}s'.format(ep + 1, time.time() - ep_st) tri_log = (', prec {:.2%}, sm {:.2%}, ' 'd_ap {:.4f}, d_an {:.4f}, ' 'loss {:.4f}'.format( prec_meter.avg, sm_meter.avg, dist_ap_meter.avg, dist_an_meter.avg, loss_meter.avg, )) log = time_log + tri_log print(log) ########################## # Test on Validation Set # ########################## mAP, Rank1 = 0, 0 if ((ep + 1) % cfg.epochs_per_val == 0) and (val_set is not None): mAP, Rank1 = validate() # Log to TensorBoard if 
cfg.log_to_file: if writer is None: writer = SummaryWriter(log_dir=osp.join(cfg.exp_dir, 'tensorboard')) writer.add_scalars( 'val scores', dict(mAP=mAP, Rank1=Rank1), ep) writer.add_scalars( 'loss', dict(loss=loss_meter.avg, ), ep) writer.add_scalars( 'precision', dict(precision=prec_meter.avg, ), ep) writer.add_scalars( 'satisfy_margin', dict(satisfy_margin=sm_meter.avg, ), ep) writer.add_scalars( 'average_distance', dict(dist_ap=dist_ap_meter.avg, dist_an=dist_an_meter.avg, ), ep) # save ckpt if cfg.log_to_file: save_ckpt(modules_optims, ep + 1, 0, cfg.ckpt_file) ######## # Test # ######## test(load_model_weight=False)
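# Hedged sketch of the exponential LR decay helper used above (adjust_lr_exp); the
# decay constant 0.001 is an assumption: keep base_lr until exp_decay_at_epoch, then
# decay smoothly towards ~1e-3 * base_lr by the final epoch.
def adjust_lr_exp(optimizer, base_lr, ep, total_ep, start_decay_at_ep):
    if ep < start_decay_at_ep:
        lr = base_lr
    else:
        progress = float(ep + 1 - start_decay_at_ep) / (total_ep + 1 - start_decay_at_ep)
        lr = base_lr * (0.001 ** progress)
    for group in optimizer.param_groups:
        group['lr'] = lr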
def train(args, model): """ Train the model """ if args.local_rank in [-1, 0]: os.makedirs(args.output_dir, exist_ok=True) writer = SummaryWriter(log_dir=os.path.join("logs", args.name)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps # Prepare dataset train_loader, test_loader = get_loader(args) # Prepare optimizer and scheduler optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay) t_total = args.num_steps if args.decay_type == "cosine": scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) else: scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: model, optimizer = amp.initialize(models=model, optimizers=optimizer, opt_level=args.fp16_opt_level) amp._amp_state.loss_scalers[0]._loss_scale = 2**20 # Distributed training if args.local_rank != -1: model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size()) # Train! logger.info("***** Running training *****") logger.info(" Total optimization steps = %d", args.num_steps) logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) model.zero_grad() set_seed( args) # Added here for reproducibility (even between python 2 and 3) losses = AverageMeter() global_step, best_acc = 0, 0 start_time = time.time() while True: model.train() epoch_iterator = tqdm(train_loader, desc="Training (X / X Steps) (loss=X.X)", bar_format="{l_bar}{r_bar}", dynamic_ncols=True, disable=args.local_rank not in [-1, 0]) all_preds, all_label = [], [] for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) x, y = batch loss, logits = model(x, y) loss = loss.mean() preds = torch.argmax(logits, dim=-1) if len(all_preds) == 0: all_preds.append(preds.detach().cpu().numpy()) all_label.append(y.detach().cpu().numpy()) else: all_preds[0] = np.append(all_preds[0], preds.detach().cpu().numpy(), axis=0) all_label[0] = np.append(all_label[0], y.detach().cpu().numpy(), axis=0) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: losses.update(loss.item() * args.gradient_accumulation_steps) if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 epoch_iterator.set_description( "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val)) if args.local_rank in [-1, 0]: writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step) writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step) if global_step % args.eval_every == 0: with torch.no_grad(): accuracy = valid(args, model, writer, test_loader, global_step) if args.local_rank in [-1, 0]: if best_acc < accuracy: save_model(args, model) best_acc = accuracy logger.info("best accuracy so far: %f" % best_acc) model.train() if 
global_step % t_total == 0: break all_preds, all_label = all_preds[0], all_label[0] accuracy = simple_accuracy(all_preds, all_label) accuracy = torch.tensor(accuracy).to(args.device) dist.barrier() train_accuracy = reduce_mean(accuracy, args.nprocs) train_accuracy = train_accuracy.detach().cpu().numpy() logger.info("train accuracy so far: %f" % train_accuracy) losses.reset() if global_step % t_total == 0: break writer.close() logger.info("Best Accuracy: \t%f" % best_acc) logger.info("End Training!") end_time = time.time() logger.info("Total Training Time: \t%f" % ((end_time - start_time) / 3600))
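# Hypothetical sketches of the two helpers referenced above (simple_accuracy,
# reduce_mean); the real implementations may differ. simple_accuracy compares
# predictions with labels, reduce_mean averages a tensor across distributed ranks.
import torch.distributed as dist

def simple_accuracy(preds, labels):
    return (preds == labels).mean()

def reduce_mean(tensor, nprocs):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    return rt / nprocs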
def train(args): set_seed(args) args.train_batch_size = args.train_batch_size*args.n_gpu args.eval_batch_size = args.eval_batch_size*args.n_gpu # Set device if args.device == 'cuda': device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') logger.info('use cuda') else: device = torch.device('cpu') logger.info('use cpu') # Set label list for classification if args.num_label == 'multi': label_list = ['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오'] elif args.num_label == 'binary': label_list = ['긍정', '부정'] logger.info('use {} labels for training'.format(len(label_list))) # Load pretrained model and model configuration pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type) if args.pretrained_model_path is None: # Use pretrained bert model(etri/skt) pretrained_model_path = os.path.join(pretrained_path, 'pytorch_model.bin') else: # Use further-pretrained bert model pretrained_model_path = args.pretrained_model_path logger.info('Pretrain Model : {}'.format(pretrained_model_path)) pretrained = torch.load(pretrained_model_path) # weight if args.pretrained_type == 'skt' and 'bert.' not in list(pretrained.keys())[0]: logger.info('modify parameter names') # Change parameter name for consistency new_keys_ = ['bert.' + k for k in pretrained.keys()] old_values_ = pretrained.values() pretrained = {k: v for k, v in zip(new_keys_, old_values_)} # bulid model bert_config = BertConfig(os.path.join(pretrained_path + '/bert_config.json')) bert_config.num_labels = len(label_list) model = BertForEmotionClassification(bert_config).to(device) # assigning weight model.load_state_dict(pretrained, strict=False) # Load Datasets tr_set = Datasets(file_path=args.train_data_path, label_list=label_list, pretrained_type=args.pretrained_type, max_len=args.max_len) dev_set = Datasets(file_path=args.dev_data_path, label_list=label_list, pretrained_type=args.pretrained_type, max_len=args.max_len) # Use custom batch function collate_fn = ClassificationBatchFunction(args.max_len, tr_set.pad_idx, tr_set.cls_idx, tr_set.sep_idx) tr_loader = DataLoader(dataset=tr_set, batch_size=args.train_batch_size, shuffle=True, num_workers=8, pin_memory=True, collate_fn=collate_fn) dev_loader = DataLoader(dataset=dev_set, batch_size=args.eval_batch_size, num_workers=8, pin_memory=True, drop_last=False, collate_fn=collate_fn) # optimizer optimizer = layerwise_decay_optimizer(model=model, lr=args.learning_rate, layerwise_decay=args.layerwise_decay) # lr scheduler t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs warmup_steps = int(t_total * args.warmup_percent) logger.info('total training steps : {}, lr warmup steps : {}'.format(t_total, warmup_steps)) # Use gradual warmup and linear decay scheduler = optimization.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) # for low-precision training if args.fp16: try: from apex import amp logger.info('Use fp16') except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, verbosity=0) # tensorboard setting save_path = "./model_saved_finetuning/lr{},batch{},total{},warmup{},len{},{}".format( args.learning_rate, args.train_batch_size * args.gradient_accumulation_steps, t_total, args.warmup_percent, args.max_len, args.pretrained_type) if not os.path.isdir(save_path): os.makedirs(save_path) writer = SummaryWriter(save_path) # Save best model results with resultwriter 
result_writer = utils.ResultWriter("./model_saved_finetuning/results.csv") model.zero_grad() best_val_loss = 1e+9 global_step = 0 train_loss, train_acc, train_f1 = 0, 0, 0 logging_loss, logging_acc, logging_f1 = 0, 0, 0 logger.info('***** Training starts *****') total_result = [] for epoch in tqdm(range(args.epochs), desc='epochs'): for step, batch in tqdm(enumerate(tr_loader), desc='steps', total=len(tr_loader)): model.train() x_train, mask_train, y_train = map(lambda x: x.to(device), batch) inputs = { 'input_ids': x_train, 'attention_mask': mask_train, 'classification_label': y_train, } output, loss = model(**inputs) y_max = output.max(dim=1)[1] cr = classification_report(y_train.tolist(), y_max.tolist(), labels=list(range(len(label_list))), target_names=label_list, output_dict=True) # Get accuracy(micro f1) if 'micro avg' not in cr.keys(): batch_acc = list(cr.items())[len(label_list)][1] else: # If at least one of labels does not exists in mini-batch, use micro average instead batch_acc = cr['micro avg']['f1-score'] # macro f1 batch_macro_f1 = cr['macro avg']['f1-score'] # accumulate measures grad_accu = args.gradient_accumulation_steps if grad_accu > 1: loss /= grad_accu batch_acc /= grad_accu batch_macro_f1 /= grad_accu if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: if args.n_gpu > 1: loss = loss.mean() loss.backward() else: loss.backward() train_loss += loss.item() train_acc += batch_acc train_f1 += batch_macro_f1 if (global_step + 1) % grad_accu == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.grad_clip_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_norm) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if global_step % args.logging_step == 0: acc_ = (train_acc - logging_acc) / args.logging_step f1_ = (train_f1 - logging_f1) / args.logging_step loss_ = (train_loss - logging_loss) / args.logging_step writer.add_scalars('loss', {'train': loss_}, global_step) writer.add_scalars('acc', {'train': acc_}, global_step) writer.add_scalars('macro_f1', {'train': f1_}, global_step) logger.info('[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}, macro f1 : {:.3f}'.format( global_step, t_total, loss_, acc_, f1_ )) logging_acc, logging_f1, logging_loss = train_acc, train_f1, train_loss # Get f1 score for each label f1_results = [(l, r['f1-score']) for i, (l, r) in enumerate(cr.items()) if i < len(label_list)] f1_log = "\n".join(["{} : {}".format(l, f) for l, f in f1_results]) logger.info("\n\n***f1-score***\n" + f1_log + "\n\n***confusion matrix***\n{}".format( confusion_matrix(y_train.tolist(), y_max.tolist()))) # Validation val_loss, val_acc, val_macro_f1, _ = evaluate(args, dev_loader, model, device) val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}. 
val macro f1 : {:.3f}'.format( global_step, t_total, val_loss, val_acc, val_macro_f1 ) writer.add_scalars('loss', {'val': val_loss}, global_step) writer.add_scalars('acc', {'val': val_acc}, global_step) writer.add_scalars('macro_f1', {'val': val_macro_f1}, global_step) logger.info(val_result) total_result.append(val_result) if val_loss < best_val_loss: # Save model checkpoints torch.save(model.state_dict(), os.path.join(save_path, 'best_model.bin')) torch.save(args, os.path.join(save_path, 'training_args.bin')) logger.info('Saving model checkpoint to %s', save_path) best_val_loss = val_loss best_val_acc = val_acc best_val_macro_f1 = val_macro_f1 # Save results in 'model_saved_finetuning/results.csv' results = { 'val_loss': best_val_loss, 'val_acc': best_val_acc, 'val_macro_f1' : best_val_macro_f1, 'save_dir': save_path, 'pretrained_path': pretrained_path, } result_writer.update(args, **results) return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
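# Hedged sketch: an equivalent of the warmup-then-linear-decay schedule used above,
# written with torch.optim.lr_scheduler.LambdaLR. The LR ramps linearly from 0 over
# warmup_steps, then decays linearly to 0 at t_total (values are illustrative).
from torch.optim.lr_scheduler import LambdaLR

def warmup_linear(optimizer, warmup_steps, t_total):
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))
    return LambdaLR(optimizer, lr_lambda)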
def main(argv): torch.manual_seed(FLAGS.seed) utils.init_logging(log_path=FLAGS.log_path) use_gpu = "cpu" not in FLAGS.base_device.lower() rank, world_size, gpu = dist.init_distributed_mode(backend=FLAGS.backend, use_gpu=use_gpu) device = FLAGS.base_device if not is_distributed(): raise NotImplementedError( "This file is only for distributed training.") if is_main_process(): dllogger.log(data=FLAGS.flag_values_dict(), step='PARAMETER') print("Command line flags:") pprint(FLAGS.flag_values_dict()) print("Creating data loaders") FLAGS.set_default("test_batch_size", FLAGS.test_batch_size // world_size * world_size) categorical_feature_sizes = get_categorical_feature_sizes(FLAGS) world_categorical_feature_sizes = np.asarray(categorical_feature_sizes) device_mapping = get_criteo_device_mapping(world_size) batch_sizes_per_gpu = get_gpu_batch_sizes(FLAGS.batch_size, num_gpus=world_size) batch_indices = tuple(np.cumsum([0] + list(batch_sizes_per_gpu))) # sizes of embeddings for each GPU categorical_feature_sizes = world_categorical_feature_sizes[ device_mapping['embedding'][rank]].tolist() bottom_mlp_sizes = FLAGS.bottom_mlp_sizes if rank == device_mapping[ 'bottom_mlp'] else None data_loader_train, data_loader_test = get_data_loaders( FLAGS, device_mapping=device_mapping) model = DistributedDlrm( vectors_per_gpu=device_mapping['vectors_per_gpu'], embedding_device_mapping=device_mapping['embedding'], embedding_type=FLAGS.embedding_type, embedding_dim=FLAGS.embedding_dim, world_num_categorical_features=len(world_categorical_feature_sizes), categorical_feature_sizes=categorical_feature_sizes, num_numerical_features=FLAGS.num_numerical_features, hash_indices=FLAGS.hash_indices, bottom_mlp_sizes=bottom_mlp_sizes, top_mlp_sizes=FLAGS.top_mlp_sizes, interaction_op=FLAGS.interaction_op, fp16=FLAGS.amp, use_cpp_mlp=FLAGS.optimized_mlp, bottom_features_ordered=FLAGS.bottom_features_ordered, device=device) print(model) print(device_mapping) print(f"Batch sizes per gpu: {batch_sizes_per_gpu}") dist.setup_distributed_print(is_main_process()) # DDP introduces a gradient average through allreduce(mean), which doesn't apply to bottom model. # Compensate it with further scaling lr scaled_lr = FLAGS.lr / FLAGS.loss_scale if FLAGS.amp else FLAGS.lr scaled_lrs = [scaled_lr / world_size, scaled_lr] embedding_optimizer = torch.optim.SGD([ { 'params': model.bottom_model.embeddings.parameters(), 'lr': scaled_lrs[0] }, ]) mlp_optimizer = apex_optim.FusedSGD([{ 'params': model.bottom_model.mlp.parameters(), 'lr': scaled_lrs[0] }, { 'params': model.top_model.parameters(), 'lr': scaled_lrs[1] }]) checkpoint_writer = make_distributed_checkpoint_writer( device_mapping=device_mapping, rank=rank, is_main_process=is_main_process(), config=FLAGS.flag_values_dict()) checkpoint_loader = make_distributed_checkpoint_loader( device_mapping=device_mapping, rank=rank) if FLAGS.load_checkpoint_path: checkpoint_loader.load_checkpoint(model, FLAGS.load_checkpoint_path) model.to(device) if FLAGS.amp: (model.top_model, model.bottom_model.mlp), mlp_optimizer = amp.initialize( [model.top_model, model.bottom_model.mlp], mlp_optimizer, opt_level="O2", loss_scale=1) if use_gpu: model.top_model = parallel.DistributedDataParallel(model.top_model) else: # Use other backend for CPU model.top_model = torch.nn.parallel.DistributedDataParallel( model.top_model) if FLAGS.mode == 'test': auc = dist_evaluate(model, data_loader_test) results = {'auc': auc} dllogger.log(data=results, step=tuple()) if auc is not None: print(F"Finished testing. 
Test auc {auc:.4f}") return if FLAGS.save_checkpoint_path and not FLAGS.bottom_features_ordered and is_main_process( ): logging.warning( "Saving checkpoint without --bottom_features_ordered flag will result in " "a device-order dependent model. Consider using --bottom_features_ordered " "if you plan to load the checkpoint in different device configurations." ) loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean") # Print per 16384 * 2000 samples by default default_print_freq = 16384 * 2000 // FLAGS.batch_size print_freq = default_print_freq if FLAGS.print_freq is None else FLAGS.print_freq steps_per_epoch = len(data_loader_train) test_freq = FLAGS.test_freq if FLAGS.test_freq is not None else steps_per_epoch - 1 metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter( 'loss', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}')) metric_logger.add_meter( 'step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.6f}')) metric_logger.add_meter( 'lr', utils.SmoothedValue(window_size=1, fmt='{value:.4f}')) # Accumulating loss on GPU to avoid memcpyD2H every step moving_loss = torch.zeros(1, device=device) moving_loss_stream = torch.cuda.Stream() lr_scheduler = utils.LearningRateScheduler( optimizers=[mlp_optimizer, embedding_optimizer], base_lrs=[scaled_lrs, [scaled_lrs[0]]], warmup_steps=FLAGS.warmup_steps, warmup_factor=FLAGS.warmup_factor, decay_start_step=FLAGS.decay_start_step, decay_steps=FLAGS.decay_steps, decay_power=FLAGS.decay_power, end_lr_factor=FLAGS.decay_end_lr / FLAGS.lr) data_stream = torch.cuda.Stream() timer = utils.StepTimer() best_auc = 0 best_epoch = 0 start_time = time() stop_time = time() for epoch in range(FLAGS.epochs): epoch_start_time = time() batch_iter = prefetcher(iter(data_loader_train), data_stream) for step in range(len(data_loader_train)): timer.click() numerical_features, categorical_features, click = next(batch_iter) torch.cuda.synchronize() global_step = steps_per_epoch * epoch + step if FLAGS.max_steps and global_step > FLAGS.max_steps: print( F"Reached max global steps of {FLAGS.max_steps}. Stopping." ) break lr_scheduler.step() if click.shape[0] != FLAGS.batch_size: # last batch logging.error("The last batch with size %s is not supported", click.shape[0]) else: output = model(numerical_features, categorical_features, batch_sizes_per_gpu).squeeze() loss = loss_fn( output, click[batch_indices[rank]:batch_indices[rank + 1]]) # We don't need to accumulate gradient. Set grad to None is faster than optimizer.zero_grad() for param_group in itertools.chain( embedding_optimizer.param_groups, mlp_optimizer.param_groups): for param in param_group['params']: param.grad = None if FLAGS.amp: loss *= FLAGS.loss_scale with amp.scale_loss(loss, mlp_optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() mlp_optimizer.step() embedding_optimizer.step() moving_loss_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(moving_loss_stream): moving_loss += loss if timer.measured is None: # first iteration, no step time etc. to print continue if step == 0: print(F"Started epoch {epoch}...") elif step % print_freq == 0: torch.cuda.current_stream().wait_stream(moving_loss_stream) # Averaging cross a print_freq period to reduce the error. # An accurate timing needs synchronize which would slow things down. 
if global_step < FLAGS.benchmark_warmup_steps: metric_logger.update( loss=moving_loss.item() / print_freq / (FLAGS.loss_scale if FLAGS.amp else 1), lr=mlp_optimizer.param_groups[1]["lr"] * (FLAGS.loss_scale if FLAGS.amp else 1)) else: metric_logger.update( step_time=timer.measured, loss=moving_loss.item() / print_freq / (FLAGS.loss_scale if FLAGS.amp else 1), lr=mlp_optimizer.param_groups[1]["lr"] * (FLAGS.loss_scale if FLAGS.amp else 1)) stop_time = time() eta_str = datetime.timedelta( seconds=int(metric_logger.step_time.global_avg * (steps_per_epoch - step))) metric_logger.print( header= F"Epoch:[{epoch}/{FLAGS.epochs}] [{step}/{steps_per_epoch}] eta: {eta_str}" ) with torch.cuda.stream(moving_loss_stream): moving_loss = 0. if global_step % test_freq == 0 and global_step > 0 and global_step / steps_per_epoch >= FLAGS.test_after: auc = dist_evaluate(model, data_loader_test) if auc is None: continue print(F"Epoch {epoch} step {step}. auc {auc:.6f}") stop_time = time() if auc > best_auc: best_auc = auc best_epoch = epoch + ((step + 1) / steps_per_epoch) if FLAGS.auc_threshold and auc >= FLAGS.auc_threshold: run_time_s = int(stop_time - start_time) print( F"Hit target accuracy AUC {FLAGS.auc_threshold} at epoch " F"{global_step/steps_per_epoch:.2f} in {run_time_s}s. " F"Average speed {global_step * FLAGS.batch_size / run_time_s:.1f} records/s." ) sys.exit() epoch_stop_time = time() epoch_time_s = epoch_stop_time - epoch_start_time print( F"Finished epoch {epoch} in {datetime.timedelta(seconds=int(epoch_time_s))}. " F"Average speed {steps_per_epoch * FLAGS.batch_size / epoch_time_s:.1f} records/s." ) avg_throughput = FLAGS.batch_size / metric_logger.step_time.avg if FLAGS.save_checkpoint_path: checkpoint_writer.save_checkpoint(model, FLAGS.save_checkpoint_path, epoch, step) results = { 'best_auc': best_auc, 'best_epoch': best_epoch, 'average_train_throughput': avg_throughput } dllogger.log(data=results, step=tuple())
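# Minimal sketch (illustrative values, not from the source) of the static loss-scaling
# trick used above: the loss is multiplied by loss_scale before backward so fp16
# gradients do not underflow, and the learning rate is divided by the same factor so
# the plain-SGD update is numerically unchanged.
import torch

loss_scale = 1024.0
base_lr = 0.1
w = torch.zeros(4, requires_grad=True)
optimizer = torch.optim.SGD([w], lr=base_lr / loss_scale)  # lr compensated downwards

target = torch.ones(4)
loss = ((w - target) ** 2).mean() * loss_scale  # loss scaled upwards
loss.backward()                                  # gradients are loss_scale times larger
optimizer.step()                                 # net step equals the unscaled update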
def TrainModel( initial_stacked_model_path=None, stacked_model_output_path='SavedModels/StackedFeatureExtractorAndRpn.pth', cross_validate=True): # GET THE ANCHOR SIZES. print('Loading anchors...') if USE_PRECOMPUTED_ANCHORS: anchors = pickle.load(open(ANCHORS_FILEPATH, 'rb')) else: anchors = ComputeAnchorSizes(GROUND_TRUTH_CSV_FILEPATH) pickle.dump(anchors, open(ANCHORS_FILEPATH, 'wb')) # LOAD THE DATASET. print('Loading training data...') if USE_PRECOMPUTED_TRAINING_DATA: all_images, all_anchor_class_labels, all_anchor_class_label_loss_masks, all_anchor_regression_targets = pickle.load( open(TRAINING_DATA_FILEPATH, 'rb')) else: worker_args = [ (line, anchors) for line in open(GROUND_TRUTH_CSV_FILEPATH).readlines()[1:] ] with Pool(7) as worker_pool: worker_results = worker_pool.map( GetRpnTrainingDataForGroundTruthLine, worker_args) all_images = [] all_anchor_class_labels = [] all_anchor_class_label_loss_masks = [] all_anchor_regression_targets = [] for image, class_labels, class_loss_mask, bbox_adjustments in worker_results: all_images.append(image) all_anchor_class_labels.append(class_labels) all_anchor_class_label_loss_masks.append(class_loss_mask) all_anchor_regression_targets.append(bbox_adjustments) all_images = torch.tensor(all_images, dtype=torch.float32) all_anchor_class_labels = torch.tensor(all_anchor_class_labels, dtype=torch.long) all_anchor_class_label_loss_masks = torch.tensor( all_anchor_class_label_loss_masks, dtype=torch.float32) all_anchor_regression_targets = torch.tensor( all_anchor_regression_targets, dtype=torch.float32) pickle.dump( (all_images, all_anchor_class_labels, all_anchor_class_label_loss_masks, all_anchor_regression_targets), open(TRAINING_DATA_FILEPATH, 'wb'), protocol=4) # CREATE THE MODEL. print('Creating model...') # Thoughts thus far... # - Resnets seem to work better than VGG. # - ResNet50 with a filter count coef of 64 seems to overfit. It works better with a # filter count coef of 32. The score for 64 you see below most likely would not have # improved with additional epochs, but the score for 32 would. # - ResNet34 uses way less VRAM than ResNet50 (i.e. 7.7 GB with a batch size of 8 vs 6.4 with 2). # - ResNet34 seems to work better than ResNet50 (better loss) # - ResNet18 is inferior to 34 (at stock widths) # - ResNet34 works really well at 2x width. Dropout might be benificial # because the test regression loss was much higher than the train regression loss. # - Instance norm resulted in slower training & better stability than batch norm. # - Using a slight dropout just before regression input *might* be slightly benificial # I would need to do more than 10 epochs to be sure. # - ResNet18 with a channel coef of 256 is inferior to 24 with a coef of 128 feature_extractor = ResNet34(IMAGE_CHANNELS, filter_count_coef=128, dropout_rate=.5) # feature_extractor = ResNet( # BasicBlock, # [3,6,36,3], # image_channels = 1, # filter_count_coef = 128, # dropout_rate = .4) # rpn_network = RPN( # input_channels = feature_extractor.FinalChannelsCount, # anchor_count = len(anchors)) rpn_network = RPN_WithHidden( input_channels=feature_extractor.FinalChannelsCount, anchor_count=len(anchors), classifier_dropout_rate=.5, regression_dropout_rate=.5, classifier_hidden_units=512, #256 regressor_hidden_units=512) #256 model = StackedFeatureExtractorAndRpn(feature_extractor, rpn_network) model = model.to(DEVICE) optimizer = optim.SGD(model.parameters(), .05, momentum=.9, nesterov=True) # CONVERT THE MODEL AND OPTIMIZER TO MIXED PRECISION. 
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale="dynamic") model = nn.DataParallel(model, device_ids=[0, 1]) # LOAD PRE-TRAINED WEIGHTS. if initial_stacked_model_path is not None: print('Loading pre-trained stacked model weights.') model.load_state_dict(torch.load(initial_stacked_model_path)) # DETERMINE WHICH EXAMPLES ARE USED FOR TRAINING AND TESTING. if cross_validate: training_indices = np.array( [i for i in range(len(all_images)) if i % 4 != 0]) testing_indices = np.array( [i for i in range(len(all_images)) if i % 4 == 0]) print('Using {} images for training and {} for testing.'.format( len(training_indices), len(testing_indices))) else: training_indices = np.array(range(len(all_images))) print('Training on {} images.'.format(len(all_images))) # TRAIN THE MODEL. EPOCH_COUNT = 100 learning_rate_scheduler = optim.lr_scheduler.MultiStepLR( optimizer, milestones=[15, 50, 75, 90, 95], gamma=.5) for epoch in range(EPOCH_COUNT): print('Epoch {}/{}'.format(epoch + 1, EPOCH_COUNT)) # TRAIN THE NETWORK. epoch_batch_training_classification_losses = [] epoch_batch_training_regression_losses = [] all_training_batches = list( BatchSampler( RandomSampler(training_indices), batch_size=4, # 4 for double width resnet 34 drop_last=False)) for batch_num in range(len(all_training_batches)): # SET THE MODEL TO TRAINING MODE. model.train() # GET THE BATCH DATA. batch_indices = training_indices[all_training_batches[batch_num]] batch_images = all_images[batch_indices].to(DEVICE) batch_anchor_classes = all_anchor_class_labels[batch_indices].to( DEVICE) batch_anchor_classes_loss_masks = all_anchor_class_label_loss_masks[ batch_indices].to(DEVICE) batch_anchor_regression_targets = all_anchor_regression_targets[ batch_indices].to(DEVICE) # ZERO THE GRADIENTS. model.zero_grad() # FORWARD PASS. predicted_region_class_labels, region_regression_results = model( batch_images) # COMPUTE LOSSES. classification_loss_function = nn.CrossEntropyLoss( weight=torch.tensor([1, 15], dtype=torch.float32).to(DEVICE)) classification_loss = classification_loss_function( predicted_region_class_labels * batch_anchor_classes_loss_masks, batch_anchor_classes) element_wise_regression_loss_function = nn.SmoothL1Loss( reduction='none') element_wise_regression_loss = element_wise_regression_loss_function( region_regression_results, batch_anchor_regression_targets) element_wise_regression_loss = torch.sum( element_wise_regression_loss, dim=1, keepdim=True) element_wise_weights = batch_anchor_classes.float().view( element_wise_regression_loss.shape) regression_loss = 400 * torch.mean( element_wise_regression_loss * element_wise_weights) loss = classification_loss + regression_loss # UPDATE THE NETWORK. with amp.scale_loss(loss, optimizer) as scale_loss: # amp scale_loss.backward() optimizer.step() # SAVE THE LOSS. epoch_batch_training_classification_losses.append( classification_loss.detach().cpu().numpy()) epoch_batch_training_regression_losses.append( regression_loss.detach().cpu().numpy()) learning_rate_scheduler.step() if cross_validate: # SET THE MODEL TO EVALUATION MODE. model.eval() with torch.no_grad(): # CROSS-VALIDATE THE NETWORK. epoch_batch_testing_classification_losses = [] epoch_batch_testing_regression_losses = [] all_testing_batches = list( BatchSampler(RandomSampler(testing_indices), batch_size=8, drop_last=False)) for batch_num in range(len(all_testing_batches)): # GET THE BATCH DATA. 
batch_indices = testing_indices[ all_testing_batches[batch_num]] batch_images = all_images[batch_indices].to(DEVICE) batch_anchor_classes = all_anchor_class_labels[ batch_indices].to(DEVICE) batch_anchor_classes_loss_masks = all_anchor_class_label_loss_masks[ batch_indices].to(DEVICE) batch_anchor_regression_targets = all_anchor_regression_targets[ batch_indices].to(DEVICE) # FORWARD PASS. predicted_region_class_labels, region_regression_results = model( batch_images) # COMPUTE LOSSES. classification_loss_function = nn.CrossEntropyLoss( weight=torch.tensor([1, 1], dtype=torch.float32).to( DEVICE)) classification_loss = classification_loss_function( predicted_region_class_labels * batch_anchor_classes_loss_masks, batch_anchor_classes) element_wise_regression_loss_function = nn.SmoothL1Loss( reduction='none') element_wise_regression_loss = element_wise_regression_loss_function( region_regression_results, batch_anchor_regression_targets) element_wise_regression_loss = torch.sum( element_wise_regression_loss, dim=1, keepdim=True) element_wise_weights = batch_anchor_classes.float().view( element_wise_regression_loss.shape) regression_loss = 400 * torch.mean( element_wise_regression_loss * element_wise_weights) loss = classification_loss + regression_loss # SAVE THE LOSS. epoch_batch_testing_classification_losses.append( classification_loss.detach().cpu().numpy()) epoch_batch_testing_regression_losses.append( regression_loss.detach().cpu().numpy()) # SAVE THE TRAINED MODEL. if stacked_model_output_path is not None: torch.save(model.state_dict(), stacked_model_output_path) if cross_validate: print('\tTesting mean loss - c: {:.04f}, r: {:.04f}'.format( np.mean(epoch_batch_testing_classification_losses), np.mean(epoch_batch_testing_regression_losses))) print('\tTraining mean loss - c: {:.04f}, r: {:.04f}'.format( np.mean(epoch_batch_training_classification_losses), np.mean(epoch_batch_training_regression_losses))) # SAVE THE TRAINED MODEL. if stacked_model_output_path is not None: torch.save(model.state_dict(), stacked_model_output_path)
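# Hedged sketch of the anchor regression loss computed above: per-anchor Smooth L1
# over the box offsets, kept only for positive anchors (class label 1) and scaled by
# a constant weight (400 in the code above). Function and argument names are illustrative.
import torch
import torch.nn as nn

def anchor_regression_loss(pred_offsets, target_offsets, anchor_labels, weight=400.0):
    elementwise = nn.SmoothL1Loss(reduction='none')(pred_offsets, target_offsets)
    per_anchor = elementwise.sum(dim=1, keepdim=True)
    positive_mask = anchor_labels.float().view(per_anchor.shape)
    return weight * torch.mean(per_anchor * positive_mask)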
device = torch.device("cuda:0")
# device = torch.device("cpu")
# model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl')
model = torch.hub.load('pytorch/vision', 'shufflenet_v2_x1_0', pretrained=True)
model.fc = SepalateFc(1024)
model.to(device)
crt_6 = torch.nn.BCEWithLogitsLoss()
crt_2 = torch.nn.BCEWithLogitsLoss()
plist = [{'params': model.parameters(), 'lr': 2e-5}]
optimizer = optim.Adam(plist, lr=2e-5)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
for epoch in range(n_epochs):
    print('Epoch {}/{}'.format(epoch, n_epochs - 1))
    print('-' * 10)
    model.train()
    tr_loss = 0
    tk0 = tqdm(data_loader_train, desc="Iteration")
    # first training pass
    for step, batch in enumerate(tk0):
        inputs = batch["image"]
def train( cfg, data, img_size=416, epochs=100, # 500200 batches at bs 16, 117263 images = 273 epochs batch_size=16, accumulate=4): # effective bs = batch_size * accumulate = 16 * 4 = 64 # Initialize init_seeds() weights = 'weights' + os.sep last = weights + 'last.pt' best = weights + 'best.pt' device = torch_utils.select_device(apex=mixed_precision) multi_scale = opt.multi_scale if multi_scale: img_sz_min = round(img_size / 32 / 1.5) + 1 img_sz_max = round(img_size / 32 * 1.5) - 1 img_size = img_sz_max * 32 # initiate with maximum multi_scale size print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) # Configure run data_dict = parse_data_cfg(data) train_path = data_dict['train'] nc = int(data_dict['classes']) # number of classes # Initialize model model = Darknet(cfg).to(device) # Optimizer optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay'], nesterov=True) # optimizer = AdaBound(model.parameters(), lr=hyp['lr0'], final_lr=0.1) cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 best_fitness = 0. if opt.resume or opt.transfer: # Load previously saved model if opt.transfer: # Transfer learning nf = int( model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 255) chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device) model.load_state_dict( { k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255 }, strict=False) for p in model.parameters(): p.requires_grad = True if p.shape[0] == nf else False else: # resume from last.pt if opt.bucket: os.system('gsutil cp gs://%s/last.pt %s' % (opt.bucket, last)) # download from bucket chkpt = torch.load(last, map_location=device) # load checkpoint model.load_state_dict(chkpt['model']) if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] if chkpt.get('training_results') is not None: with open('results.txt', 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt else: # Initialize model with backbone (optional) if '-tiny.cfg' in cfg: cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15') else: cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74') # Remove old results for f in glob.glob('*_batch*.jpg') + glob.glob('results.txt'): os.remove(f) # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler = lr_scheduler.MultiStepLR( optimizer, milestones=[round(opt.epochs * x) for x in [0.8, 0.9]], gamma=0.1) scheduler.last_epoch = start_epoch - 1 # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Initialize distributed training if torch.cuda.device_count() > 1: dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training 
rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel(model) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training image_weights=opt.img_weights, cache_images=opt.cache_images) # Dataloader dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers=min(os.cpu_count(), batch_size), shuffle=not opt. rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Start training model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model if dataset.image_weights: model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model_info(model, report='summary') # 'full' or 'summary' nb = len(dataloader) maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP, F1, test_loss t0 = time.time() for epoch in range(start_epoch, epochs): model.train() print( ('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'GIoU/xy', 'wh', 'obj', 'cls', 'total', 'targets', 'img_size')) # Update scheduler if epoch > 0: scheduler.step() # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional) freeze_backbone = False if freeze_backbone and epoch < 2: for name, p in model.named_parameters(): if int(name.split('.')[1]) < cutoff: # if layer < 75 p.requires_grad = False if epoch == 0 else True # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(5).to(device) # mean losses pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, (imgs, targets, paths, _) in pbar: imgs = imgs.to(device) targets = targets.to(device) # Multi-Scale training ni = (i + nb * epoch ) # number integrated batches (since train start) if multi_scale: if ni / accumulate % 10 == 0: # adjust (67% - 150%) every 10 batches img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [ math.ceil(x * sf / 32.) 
* 32 for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Plot images with bounding boxes if epoch == 0 and i == 0: fname = 'train_batch%g.jpg' % i plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname) if tb_writer: tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1], dataformats='HWC') # Hyperparameter burn-in # n_burn = nb - 1 # min(nb // 5 + 1, 1000) # number of burn-in batches # if ni <= n_burn: # for m in model.named_modules(): # if m[0].endswith('BatchNorm2d'): # m[1].momentum = 1 - i / n_burn * 0.99 # BatchNorm2d momentum falls from 1 - 0.01 # g = (i / n_burn) ** 4 # gain rises from 0 - 1 # for x in optimizer.param_groups: # x['lr'] = hyp['lr0'] * g # x['weight_decay'] = hyp['weight_decay'] * g # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model, giou_loss=not opt.xywh) if torch.isnan(loss): print('WARNING: nan loss detected, ending training') return results # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if (i + 1) % accumulate == 0 or (i + 1) == nb: optimizer.step() optimizer.zero_grad() # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available( ) else 0 # (GB) s = ('%10s' * 2 + '%10.3g' * 7) % ('%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size) pbar.set_description(s) # Calculate mAP (always test final epoch, skip first 5 if opt.nosave) final_epoch = epoch + 1 == epochs if not (opt.notest or (opt.nosave and epoch < 10)) or final_epoch: with torch.no_grad(): results, maps = test.test( cfg, data, batch_size=batch_size, img_size=opt.img_size, model=model, conf_thres=0.001 if final_epoch else 0.1, # 0.1 for speed save_json=final_epoch and 'coco.data' in data) # Write epoch results with open('results.txt', 'a') as file: file.write(s + '%11.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) # Write Tensorboard results if tb_writer: x = list(mloss[:5]) + list(results[:7]) titles = [ 'GIoU/XY', 'Width/Height', 'Objectness', 'Classification', 'Train loss', 'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' ] for xi, title in zip(x, titles): tb_writer.add_scalar(title, xi, epoch) # Update best map fitness = results[2] # mAP if fitness > best_fitness: best_fitness = fitness # Save training results save = (not opt.nosave) or ((not opt.evolve) and final_epoch) if save: with open('results.txt', 'r') as file: # Create checkpoint chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': file.read(), 'model': model.module.state_dict() if type(model) is nn.parallel.DistributedDataParallel else model.state_dict(), 'optimizer': optimizer.state_dict() } # Save last checkpoint torch.save(chkpt, last) if opt.bucket: os.system('gsutil cp %s gs://%s' % (last, opt.bucket)) # upload to bucket # Save best checkpoint if best_fitness == fitness: torch.save(chkpt, best) # Save backup every 10 epochs (optional) if epoch > 0 and epoch % 10 == 0: torch.save(chkpt, weights + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt # Report time print('%g epochs completed in %.3f hours.' 
          % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    if torch.cuda.device_count() > 1:
        dist.destroy_process_group()
    torch.cuda.empty_cache()
    return results
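# Hedged sketch of the multi-scale step used above: every few accumulation cycles a
# new training resolution (a multiple of 32) is drawn and the batch is resized to it
# with bilinear interpolation. The default bounds are illustrative (416px base size).
import math
import random
import torch.nn.functional as F

def maybe_rescale(imgs, img_sz_min=10, img_sz_max=19):
    img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32
    sf = img_size / max(imgs.shape[2:])          # scale factor vs. current size
    if sf != 1:
        ns = [math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]]  # 32-multiple shape
        imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
    return imgs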
def main_worker(args): global start_epoch, best_mAP cudnn.benchmark = True if not args.evaluate: sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt')) else: log_dir = osp.dirname(args.resume) sys.stdout = Logger(osp.join(log_dir, 'log_test.txt')) print("==========\nArgs:{}\n==========".format(args)) # Create data loaders iters = args.iters if (args.iters>0) else None print("==> Load target-domain trainset") dataset_target = get_data('target_train', args.data_dir) print("==> Load target-domain valset") dataset_target_val = get_data('target_val', args.data_dir) test_loader_target = get_test_loader(dataset_target_val, args.height, args.width, args.batch_size, args.workers) train_loader_target = get_train_loader(args, dataset_target, args.height, args.width, args.batch_size, args.workers, 0, iters, args.epochs) # Create model model_kwargs = {'num_features':args.features, 'norm':False, 'dropout':args.dropout, 'num_classes':dataset_target.num_train_cams,} # 'metric':args.metric, 's':args.metric_s, 'm':args.metric_m} model = models.create(args.arch, **model_kwargs) model.cuda() params = [] for key, value in model.named_parameters(): if not value.requires_grad: continue params += [{"params": [value], "lr": args.lr, "weight_decay": args.weight_decay}] optimizer = torch.optim.Adam(params) if args.fp16: model, optimizer = amp.initialize(model, optimizer, opt_level="O1") model = nn.DataParallel(model) lr_scheduler = WarmupMultiStepLR(optimizer, args.milestones, gamma=0.1, warmup_factor=0.01, warmup_iters=args.warmup_step) # Trainer trainer = CameraTrainer(model, dataset_target.num_train_cams, margin=args.margin, fp16=args.fp16) # Start training for epoch in range(start_epoch, args.epochs): train_loader_target.new_epoch() trainer.train(epoch, train_loader_target, optimizer, train_iters=len(train_loader_target), print_freq=args.print_freq) if ((epoch+1)%args.eval_step==0 or (epoch==args.epochs-1)): mAP = validate(model, test_loader_target) is_best = mAP > best_mAP best_mAP = max(mAP, best_mAP) save_checkpoint({ 'state_dict': model.state_dict(), 'epoch': epoch + 1, 'best_mAP': best_mAP, }, is_best, fpath=osp.join(args.logs_dir, 'checkpoint.pth.tar')) print('\n * Finished epoch {:3d} accuracy: {:5.1%} best: {:5.1%}{}\n'. format(epoch, mAP, best_mAP, ' *' if is_best else '')) lr_scheduler.step()
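# Hypothetical sketch of the save_checkpoint helper used above (the real helper may
# store more state): write the checkpoint to fpath and, when this epoch is the best
# so far, keep an extra copy named model_best.pth.tar next to it.
import os
import os.path as osp
import shutil
import torch

def save_checkpoint(state, is_best, fpath='checkpoint.pth.tar'):
    os.makedirs(osp.dirname(fpath) or '.', exist_ok=True)
    torch.save(state, fpath)
    if is_best:
        shutil.copy(fpath, osp.join(osp.dirname(fpath) or '.', 'model_best.pth.tar'))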
def main(args): if args.apex: if sys.version_info < (3, 0): raise RuntimeError("Apex currently only supports Python 3. Aborting.") if amp is None: raise RuntimeError("Failed to import apex. Please install apex from https://www.github.com/nvidia/apex " "to enable mixed-precision training.") if args.output_dir: utils.mkdir(args.output_dir) utils.init_distributed_mode(args) print(args) print("torch version: ", torch.__version__) print("torchvision version: ", torchvision.__version__) device = torch.device(args.device) torch.backends.cudnn.benchmark = True # Data loading code print("Loading data") traindir = os.path.join(args.data_path, 'train_avi-480p') valdir = os.path.join(args.data_path, 'val_avi-480p') normalize = T.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]) print("Loading training data") st = time.time() cache_path = _get_cache_path(traindir) transform_train = torchvision.transforms.Compose([ T.ToFloatTensorInZeroOne(), T.Resize((128, 171)), T.RandomHorizontalFlip(), normalize, T.RandomCrop((112, 112)) ]) if args.cache_dataset and os.path.exists(cache_path): print("Loading dataset_train from {}".format(cache_path)) dataset, _ = torch.load(cache_path) dataset.transform = transform_train else: if args.distributed: print("It is recommended to pre-compute the dataset cache " "on a single-gpu first, as it will be faster") dataset = torchvision.datasets.Kinetics400( traindir, frames_per_clip=args.clip_len, step_between_clips=1, transform=transform_train ) if args.cache_dataset: print("Saving dataset_train to {}".format(cache_path)) utils.mkdir(os.path.dirname(cache_path)) utils.save_on_master((dataset, traindir), cache_path) dataset.video_clips.compute_clips(args.clip_len, 1, frame_rate=15) print("Took", time.time() - st) print("Loading validation data") cache_path = _get_cache_path(valdir) transform_test = torchvision.transforms.Compose([ T.ToFloatTensorInZeroOne(), T.Resize((128, 171)), normalize, T.CenterCrop((112, 112)) ]) if args.cache_dataset and os.path.exists(cache_path): print("Loading dataset_test from {}".format(cache_path)) dataset_test, _ = torch.load(cache_path) dataset_test.transform = transform_test else: if args.distributed: print("It is recommended to pre-compute the dataset cache " "on a single-gpu first, as it will be faster") dataset_test = torchvision.datasets.Kinetics400( valdir, frames_per_clip=args.clip_len, step_between_clips=1, transform=transform_test ) if args.cache_dataset: print("Saving dataset_test to {}".format(cache_path)) utils.mkdir(os.path.dirname(cache_path)) utils.save_on_master((dataset_test, valdir), cache_path) dataset_test.video_clips.compute_clips(args.clip_len, 1, frame_rate=15) print("Creating data loaders") train_sampler = RandomClipSampler(dataset.video_clips, args.clips_per_video) test_sampler = UniformClipSampler(dataset_test.video_clips, args.clips_per_video) if args.distributed: train_sampler = DistributedSampler(train_sampler) test_sampler = DistributedSampler(test_sampler) data_loader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, sampler=train_sampler, num_workers=args.workers, pin_memory=True, collate_fn=collate_fn) data_loader_test = torch.utils.data.DataLoader( dataset_test, batch_size=args.batch_size, sampler=test_sampler, num_workers=args.workers, pin_memory=True, collate_fn=collate_fn) print("Creating model") # model = torchvision.models.video.__dict__[args.model](pretrained=args.pretrained) model = torchvision.models.video.__dict__[args.model]() model.to(device) if 
args.distributed and args.sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) criterion = nn.CrossEntropyLoss() lr = args.lr * args.world_size optimizer = torch.optim.SGD( model.parameters(), lr=lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.apex: model, optimizer = amp.initialize(model, optimizer, opt_level=args.apex_opt_level ) # convert scheduler to be per iteration, not per epoch, for warmup that lasts # between different epochs warmup_iters = args.lr_warmup_epochs * len(data_loader) lr_milestones = [len(data_loader) * m for m in args.lr_milestones] lr_scheduler = WarmupMultiStepLR( optimizer, milestones=lr_milestones, gamma=args.lr_gamma, warmup_iters=warmup_iters, warmup_factor=1e-5) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) model_without_ddp = model.module if args.resume: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.test_only: evaluate(model, criterion, data_loader_test, device=device) return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, args.print_freq, args.apex) evaluate(model, criterion, data_loader_test, device=device) if args.output_dir: checkpoint = { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args} utils.save_on_master( checkpoint, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) utils.save_on_master( checkpoint, os.path.join(args.output_dir, 'checkpoint.pth')) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
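# Hypothetical sketch of the train_one_epoch call above (the real loop also logs
# accuracy and timing): the scheduler is configured per iteration, so it is stepped
# every batch, and with apex enabled the backward pass goes through amp.scale_loss.
def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader,
                    device, epoch, print_freq, apex=False):
    model.train()
    for i, (clips, targets) in enumerate(data_loader):
        clips, targets = clips.to(device), targets.to(device)
        loss = criterion(model(clips), targets)
        optimizer.zero_grad()
        if apex:
            from apex import amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()
        lr_scheduler.step()  # per-iteration schedule (warmup + milestones)
        if i % print_freq == 0:
            print("epoch {} [{}/{}] loss {:.4f}".format(epoch, i, len(data_loader), loss.item()))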
def trace_model( model: Model, runner: Runner, batch=None, method_name: str = "forward", mode: str = "eval", requires_grad: bool = False, opt_level: str = None, device: str = "cpu", predict_params: dict = None, ) -> ScriptModule: """ Traces model using runner and batch Args: model: Model to trace runner: Model's native runner that was used to train model batch: Batch to trace the model method_name (str): Model's method name that will be used as entrypoint during tracing mode (str): Mode for model to trace (``train`` or ``eval``) requires_grad (bool): Flag to use grads opt_level (str): Apex FP16 init level, optional device (str): Torch device predict_params (dict): additional parameters for model forward Returns: (ScriptModule): Traced model """ if batch is None or runner is None: raise ValueError("Both batch and runner must be specified.") if mode not in ["train", "eval"]: raise ValueError(f"Unknown mode '{mode}'. Must be 'eval' or 'train'") predict_params = predict_params or {} tracer = _TracingModelWrapper(model, method_name) if opt_level is not None: utils.assert_fp16_available() # If traced in AMP we need to initialize the model before calling # the jit # https://github.com/NVIDIA/apex/issues/303#issuecomment-493142950 from apex import amp model = model.to(device) model = amp.initialize(model, optimizers=None, opt_level=opt_level) # after fixing this bug https://github.com/pytorch/pytorch/issues/23993 params = {**predict_params, "check_trace": False} else: params = predict_params getattr(model, mode)() utils.set_requires_grad(model, requires_grad=requires_grad) _runner_model, _runner_device = runner.model, runner.device runner.model, runner.device = tracer, device runner.predict_batch(batch, **params) result: ScriptModule = tracer.tracing_result runner.model, runner.device = _runner_model, _runner_device return result
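# Hedged sketch of what the tracing above boils down to, without the runner/wrapper
# machinery (model and shapes are illustrative): put the model in the requested mode,
# disable grads, and let torch.jit.trace record one forward pass on an example batch.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
model.eval()
for p in model.parameters():
    p.requires_grad_(False)
example_batch = torch.randn(4, 8)
traced = torch.jit.trace(model, example_batch, check_trace=False)
traced.save("traced_model.pt")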
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], ) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % 
args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
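# NOTE: illustrative sketch added after the transformers-style train() above, not part of it. It
# distills the apex fp16 step used there: scale the loss inside amp.scale_loss(), then clip
# amp.master_params() instead of model.parameters(). Single optimization step only, no gradient
# accumulation; assumes apex and a CUDA device, and the tiny Linear model is purely illustrative.
import torch
from apex import amp

def fp16_train_step_sketch():
    model = torch.nn.Linear(10, 2).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    x = torch.randn(4, 10).cuda()
    y = torch.randint(0, 2, (4,)).cuda()
    loss = torch.nn.functional.cross_entropy(model(x), y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()  # gradients are computed from the scaled loss
    # clip the fp32 master gradients maintained by amp, not the fp16 model gradients
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=1.0)
    optimizer.step()
    model.zero_grad()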
def train(num_gpus, rank, group_name, starting_from, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINING LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() session_iteration = iteration - starting_from print("{} ({}):\t{:.9f}".format(session_iteration, iteration, reduced_loss)) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}({})".format( output_directory, session_iteration, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def main(): parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training') parser = parse_args(parser) args, _ = parser.parse_known_args() LOGGER.set_model_name("Tacotron2_PyT") LOGGER.set_backends([ dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) ]) LOGGER.timed_block_start("run") LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS, metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE) LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE) LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE) LOGGER.register_metric("train_epoch_items/sec", metric_scope=dllg.EPOCH_SCOPE) LOGGER.register_metric("train_epoch_avg_items/sec", metric_scope=dllg.EPOCH_SCOPE) LOGGER.register_metric("train_epoch_avg_loss", metric_scope=dllg.EPOCH_SCOPE) #log_hardware() model_name = args.model_name parser = models.parse_model_args(model_name, parser) parser.parse_args() args = parser.parse_args() log_args(args) torch.backends.cudnn.enabled = args.cudnn_enabled torch.backends.cudnn.benchmark = args.cudnn_benchmark distributed_run = args.world_size > 1 if distributed_run: init_distributed(args, args.world_size, args.rank, args.group_name) LOGGER.log(key=tags.RUN_START) run_start_time = time.time() model_config = models.get_model_config(model_name, args) model = models.get_model(model_name, model_config, to_cuda=True, uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight) if args.checkpoint != "": state_dict = torch.load(args.checkpoint)['state_dict'] if checkpoint_from_distributed(state_dict): state_dict = unwrap_distributed(state_dict) model.load_state_dict(state_dict) print("Loaded from checkpoint: %s !" 
% args.checkpoint) if not args.amp_run and distributed_run: model = DDP(model) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) if args.amp_run: model, optimizer = amp.initialize(model, optimizer, opt_level="O1") if distributed_run: model = DDP(model) try: sigma = args.sigma except AttributeError: sigma = None criterion = loss_functions.get_loss_function(model_name, sigma) try: n_frames_per_step = args.n_frames_per_step except AttributeError: n_frames_per_step = None collate_fn = data_functions.get_collate_function( model_name, n_frames_per_step) trainset = data_functions.get_data_loader( model_name, args.dataset_path, args.training_files, args) train_sampler = DistributedSampler(trainset) if distributed_run else None train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=args.batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn) valset = data_functions.get_data_loader( model_name, args.dataset_path, args.validation_files, args) batch_to_gpu = data_functions.get_batch_to_gpu(model_name) iteration = 0 model.train() LOGGER.log(key=tags.TRAIN_LOOP) for epoch in range(args.epochs): LOGGER.epoch_start() epoch_start_time = time.time() LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch) # used to calculate avg items/sec over epoch reduced_num_items_epoch = 0 # used to calculate avg loss over epoch train_epoch_avg_loss = 0.0 train_epoch_avg_items_per_sec = 0.0 num_iters = 0 # if overflow at the last iteration then do not save checkpoint overflow = False for i, batch in enumerate(train_loader): print("Batch: {}/{} epoch {}".format(i, len(train_loader), epoch)) LOGGER.iteration_start() iter_start_time = time.time() LOGGER.log(key=tags.TRAIN_ITER_START, value=i) start = time.perf_counter() adjust_learning_rate(epoch, optimizer, args.learning_rate, args.anneal_steps, args.anneal_factor) model.zero_grad() x, y, num_items = batch_to_gpu(batch) y_pred = model(x) loss = criterion(y_pred, y) if distributed_run: reduced_loss = reduce_tensor(loss.data, args.world_size).item() reduced_num_items = reduce_tensor(num_items.data, 1).item() else: reduced_loss = loss.item() reduced_num_items = num_items.item() if np.isnan(reduced_loss): raise Exception("loss is NaN") LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss) train_epoch_avg_loss += reduced_loss num_iters += 1 # accumulate number of items processed in this epoch reduced_num_items_epoch += reduced_num_items if args.amp_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.grad_clip_thresh) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), args.grad_clip_thresh) optimizer.step() iteration += 1 LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i) iter_stop_time = time.time() iter_time = iter_stop_time - iter_start_time items_per_sec = reduced_num_items/iter_time train_epoch_avg_items_per_sec += items_per_sec LOGGER.log(key="train_iter_items/sec", value=items_per_sec) LOGGER.log(key="iter_time", value=iter_time) LOGGER.iteration_stop() LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch) epoch_stop_time = time.time() epoch_time = epoch_stop_time - epoch_start_time LOGGER.log(key="train_epoch_items/sec", value=(reduced_num_items_epoch/epoch_time)) LOGGER.log(key="train_epoch_avg_items/sec", value=(train_epoch_avg_items_per_sec/num_iters if num_iters > 0 else 0.0)) LOGGER.log(key="train_epoch_avg_loss", 
value=( train_epoch_avg_loss/num_iters if num_iters > 0 else 0.0)) LOGGER.log(key="epoch_time", value=epoch_time) LOGGER.log(key=tags.EVAL_START, value=epoch) validate(model, criterion, valset, iteration, args.batch_size, args.world_size, collate_fn, distributed_run, args.rank, batch_to_gpu) LOGGER.log(key=tags.EVAL_STOP, value=epoch) if (epoch % args.epochs_per_checkpoint == 0) and args.rank == 0: checkpoint_path = os.path.join( args.output_directory, "checkpoint_{}_{}".format(model_name, epoch)) save_checkpoint(model, epoch, model_config, checkpoint_path) save_sample(model_name, model, args.waveglow_checkpoint, args.tacotron2_checkpoint, args.phrase_path, os.path.join(args.output_directory, "sample_{}_{}.wav".format(model_name, iteration)), args.sampling_rate) LOGGER.epoch_stop() run_stop_time = time.time() run_time = run_stop_time - run_start_time LOGGER.log(key="run_time", value=run_time) LOGGER.log(key=tags.RUN_FINAL) print("training time", run_stop_time - run_start_time) LOGGER.timed_block_stop("run") if args.rank == 0: LOGGER.finish()
), **CFG.loader_params[phase]) # type: ignore for phase, df_ in zip(["train", "valid"], [trn_df, val_df]) } model = TimmSED( base_model_name=CFG.base_model_name, pretrained=CFG.pretrained, num_classes=CFG.num_classes, in_channels=CFG.in_channels) optimizer = torch.optim.Adam(model.parameters(), lr=CFG.LR) scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=16, T_mult=1) model = model.to(device) model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) p = 0 min_loss = 999 best_score = -np.inf for epoch in range(CFG.epochs): logger.info("Starting {} epoch...".format(epoch+1)) start_time = time.time() if epoch < CFG.cutmix_and_mixup_epochs: train_avg, train_loss = train_mixup_cutmix_fn(model, loaders['train'], device, optimizer, scheduler) else: train_avg, train_loss = train_fn(model, loaders['train'], device, optimizer, scheduler)
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs model = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model.resize_token_embeddings(len(tokenizer)) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if ( args.model_name_or_path and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join(args.output_dir, 
"{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft, milestones=[40, 60], gamma=0.1) ###################################################################### # Train and evaluate # ^^^^^^^^^^^^^^^^^^ # # For 70 epochs it should take around 36-38 hours on GPU. # if not os.path.isdir(dir_name): os.mkdir(dir_name) copyfile('train_siamese.py', dir_name + '/train_siamese.py') copyfile('model.py', dir_name + '/model.py') copyfile('tripletfolder.py', dir_name + '/tripletfolder.py') # save opts with open('%s/opts.yaml' % dir_name, 'w') as fp: yaml.dump(vars(opt), fp, default_flow_style=False) if fp16: # model = network_to_half(model) # optimizer_ft = FP16_Optimizer(optimizer_ft, static_loss_scale = 128.0) model, optimizer_ft = amp.initialize(model, optimizer_ft, opt_level="O1") model = train_model(model, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=150)
def train(self, train_dataset, output_dir): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ tokenizer = self.tokenizer device = self.device model = self.model args = self.args tb_writer = SummaryWriter() train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size']) t_total = len(train_dataloader) // args[ 'gradient_accumulation_steps'] * args['num_train_epochs'] no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args['weight_decay'] }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] warmup_steps = math.ceil(t_total * args['warmup_ratio']) args['warmup_steps'] = warmup_steps if args[ 'warmup_steps'] == 0 else args['warmup_steps'] optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon']) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'], t_total=t_total) if args['fp16']: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level']) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args['num_train_epochs']), desc="Epoch") for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], # XLM doesn't use segment_ids 'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None, 'labels': batch[3] } outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] print("\rRunning loss: %f" % loss, end='') if args['gradient_accumulation_steps'] > 1: loss = loss / args['gradient_accumulation_steps'] if args['fp16']: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args['max_grad_norm']) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm']) tr_loss += loss.item() if (step + 1) % args['gradient_accumulation_steps'] == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args['logging_steps'] > 0 and global_step % args[ 'logging_steps'] == 0: # Log metrics # Only evaluate when single GPU otherwise metrics may not average well tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args['logging_steps'], global_step) logging_loss = tr_loss if args['save_steps'] > 0 and global_step % args[ 'save_steps'] == 0: # Save model checkpoint into a separate variable so the base output_dir is not overwritten and later checkpoints do not nest inside earlier ones checkpoint_dir = os.path.join( output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, 'module') else model model_to_save.save_pretrained(checkpoint_dir) return global_step, tr_loss / global_step
def train(args): """Train with the given args. Args: args (namespace): The program arguments. """ set_deterministic_pytorch(args) # check cuda availability if not torch.cuda.is_available(): logging.warning('cuda is not available') # get input and output dimension info with open(args.valid_json, 'rb') as f: valid_json = json.load(f)['utts'] utts = list(valid_json.keys()) idim = int(valid_json[utts[0]]['output'][1]['shape'][1]) odim = int(valid_json[utts[0]]['output'][0]['shape'][1]) logging.info('#input dims : ' + str(idim)) logging.info('#output dims: ' + str(odim)) # specify model architecture model_class = dynamic_import(args.model_module) model = model_class(idim, odim, args) assert isinstance(model, MTInterface) if args.rnnlm is not None: rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) rnnlm = lm_pytorch.ClassifierWithState( lm_pytorch.RNNLM(len(args.char_list), rnnlm_args.layer, rnnlm_args.unit)) torch_load(args.rnnlm, rnnlm) model.rnnlm = rnnlm # write model config if not os.path.exists(args.outdir): os.makedirs(args.outdir) model_conf = args.outdir + '/model.json' with open(model_conf, 'wb') as f: logging.info('writing a model config file to ' + model_conf) f.write( json.dumps((idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True).encode('utf_8')) for key in sorted(vars(args).keys()): logging.info('ARGS: ' + key + ': ' + str(vars(args)[key])) reporter = model.reporter # check the use of multi-gpu if args.ngpu > 1: if args.batch_size != 0: logging.warning( 'batch size is automatically increased (%d -> %d)' % (args.batch_size, args.batch_size * args.ngpu)) args.batch_size *= args.ngpu # set torch device device = torch.device("cuda" if args.ngpu > 0 else "cpu") if args.train_dtype in ("float16", "float32", "float64"): dtype = getattr(torch, args.train_dtype) else: dtype = torch.float32 model = model.to(device=device, dtype=dtype) # Setup an optimizer if args.opt == 'adadelta': optimizer = torch.optim.Adadelta(model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay) elif args.opt == 'adam': optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.opt == 'noam': from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt optimizer = get_std_opt(model, args.adim, args.transformer_warmup_steps, args.transformer_lr) else: raise NotImplementedError("unknown optimizer: " + args.opt) # setup apex.amp if args.train_dtype in ("O0", "O1", "O2", "O3"): try: from apex import amp except ImportError as e: logging.error( f"You need to install apex for --train-dtype {args.train_dtype}. 
" "See https://github.com/NVIDIA/apex#linux") raise e if args.opt == 'noam': model, optimizer.optimizer = amp.initialize( model, optimizer.optimizer, opt_level=args.train_dtype) else: model, optimizer = amp.initialize(model, optimizer, opt_level=args.train_dtype) use_apex = True else: use_apex = False # FIXME: TOO DIRTY HACK setattr(optimizer, "target", reporter) setattr(optimizer, "serialize", lambda s: reporter.serialize(s)) # Setup a converter converter = CustomConverter() # read json data with open(args.train_json, 'rb') as f: train_json = json.load(f)['utts'] with open(args.valid_json, 'rb') as f: valid_json = json.load(f)['utts'] use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0 # make minibatch list (variable length) train = make_batchset(train_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, min_batch_size=args.ngpu if args.ngpu > 1 else 1, shortest_first=use_sortagrad, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, mt=True, iaxis=1, oaxis=0) valid = make_batchset(valid_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, min_batch_size=args.ngpu if args.ngpu > 1 else 1, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, mt=True, iaxis=1, oaxis=0) load_tr = LoadInputsAndTargets(mode='mt', load_output=True) load_cv = LoadInputsAndTargets(mode='mt', load_output=True) # hack to make batchsize argument as 1 # actual bathsize is included in a list # default collate function converts numpy array to pytorch tensor # we used an empty collate function instead which returns list train_iter = { 'main': ChainerDataLoader(dataset=TransformDataset( train, lambda data: converter([load_tr(data)])), batch_size=1, num_workers=args.n_iter_processes, shuffle=not use_sortagrad, collate_fn=lambda x: x[0]) } valid_iter = { 'main': ChainerDataLoader(dataset=TransformDataset( valid, lambda data: converter([load_cv(data)])), batch_size=1, shuffle=False, collate_fn=lambda x: x[0], num_workers=args.n_iter_processes) } # Set up a trainer updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer, device, args.ngpu, False, args.accum_grad, use_apex=use_apex) trainer = training.Trainer(updater, (args.epochs, 'epoch'), out=args.outdir) if use_sortagrad: trainer.extend( ShufflingEnabler([train_iter]), trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, 'epoch')) # Resume from a snapshot if args.resume: logging.info('resumed from %s' % args.resume) torch_resume(args.resume, trainer) # Evaluate the model with the test dataset for each epoch if args.save_interval_iters > 0: trainer.extend(CustomEvaluator(model, valid_iter, reporter, device, args.ngpu), trigger=(args.save_interval_iters, 'iteration')) else: trainer.extend( CustomEvaluator(model, valid_iter, reporter, device, args.ngpu)) # Save attention weight each epoch if args.num_save_attention > 0: # NOTE: sort it by output lengths data = sorted(list(valid_json.items())[:args.num_save_attention], key=lambda x: int(x[1]['output'][0]['shape'][0]), reverse=True) if hasattr(model, "module"): att_vis_fn = model.module.calculate_all_attentions plot_class = model.module.attention_plot_class else: att_vis_fn = model.calculate_all_attentions plot_class = model.attention_plot_class att_reporter = plot_class(att_vis_fn, data, args.outdir + "/att_ws", 
converter=converter, transform=load_cv, device=device, ikey="output", iaxis=1) trainer.extend(att_reporter, trigger=(1, 'epoch')) else: att_reporter = None # Make a plot for training and validation values trainer.extend( extensions.PlotReport(['main/loss', 'validation/main/loss'], 'epoch', file_name='loss.png')) trainer.extend( extensions.PlotReport(['main/acc', 'validation/main/acc'], 'epoch', file_name='acc.png')) trainer.extend( extensions.PlotReport(['main/ppl', 'validation/main/ppl'], 'epoch', file_name='ppl.png')) trainer.extend( extensions.PlotReport(['main/bleu', 'validation/main/bleu'], 'epoch', file_name='bleu.png')) # Save best models trainer.extend( snapshot_object(model, 'model.loss.best'), trigger=training.triggers.MinValueTrigger('validation/main/loss')) trainer.extend( snapshot_object(model, 'model.acc.best'), trigger=training.triggers.MaxValueTrigger('validation/main/acc')) # save snapshot which contains model and optimizer states if args.save_interval_iters > 0: trainer.extend( torch_snapshot(filename='snapshot.iter.{.updater.iteration}'), trigger=(args.save_interval_iters, 'iteration')) else: trainer.extend(torch_snapshot(), trigger=(1, 'epoch')) # epsilon decay in the optimizer if args.opt == 'adadelta': if args.criterion == 'acc': trainer.extend(restore_snapshot(model, args.outdir + '/model.acc.best', load_fn=torch_load), trigger=CompareValueTrigger( 'validation/main/acc', lambda best_value, current_value: best_value > current_value)) trainer.extend(adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( 'validation/main/acc', lambda best_value, current_value: best_value > current_value)) elif args.criterion == 'loss': trainer.extend(restore_snapshot(model, args.outdir + '/model.loss.best', load_fn=torch_load), trigger=CompareValueTrigger( 'validation/main/loss', lambda best_value, current_value: best_value < current_value)) trainer.extend(adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( 'validation/main/loss', lambda best_value, current_value: best_value < current_value)) elif args.opt == 'adam': if args.criterion == 'acc': trainer.extend(restore_snapshot(model, args.outdir + '/model.acc.best', load_fn=torch_load), trigger=CompareValueTrigger( 'validation/main/acc', lambda best_value, current_value: best_value > current_value)) trainer.extend(adam_lr_decay(args.lr_decay), trigger=CompareValueTrigger( 'validation/main/acc', lambda best_value, current_value: best_value > current_value)) elif args.criterion == 'loss': trainer.extend(restore_snapshot(model, args.outdir + '/model.loss.best', load_fn=torch_load), trigger=CompareValueTrigger( 'validation/main/loss', lambda best_value, current_value: best_value < current_value)) trainer.extend(adam_lr_decay(args.lr_decay), trigger=CompareValueTrigger( 'validation/main/loss', lambda best_value, current_value: best_value < current_value)) # Write a log of evaluation statistics for each epoch trainer.extend( extensions.LogReport(trigger=(args.report_interval_iters, 'iteration'))) report_keys = [ 'epoch', 'iteration', 'main/loss', 'validation/main/loss', 'main/acc', 'validation/main/acc', 'main/ppl', 'validation/main/ppl', 'elapsed_time' ] if args.opt == 'adadelta': trainer.extend(extensions.observe_value( 'eps', lambda trainer: trainer.updater.get_optimizer('main'). param_groups[0]["eps"]), trigger=(args.report_interval_iters, 'iteration')) report_keys.append('eps') elif args.opt in ['adam', 'noam']: trainer.extend(extensions.observe_value( 'lr', lambda trainer: trainer.updater.get_optimizer('main'). 
param_groups[0]["lr"]), trigger=(args.report_interval_iters, 'iteration')) report_keys.append('lr') if args.report_bleu: report_keys.append('validation/main/bleu') trainer.extend(extensions.PrintReport(report_keys), trigger=(args.report_interval_iters, 'iteration')) trainer.extend( extensions.ProgressBar(update_interval=args.report_interval_iters)) set_early_stop(trainer, args) if args.tensorboard_dir is not None and args.tensorboard_dir != "": trainer.extend(TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter), trigger=(args.report_interval_iters, "iteration")) # Run the training trainer.run() check_early_stop(trainer, args.epochs)
def __init__(self, model, criterion, opt_config, scheduler_config, print_freq=10, save_freq=1000, grad_clip=float('inf'), save_info={}, save_dir='.', train_iterations=0, checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5, math='fp32', loss_scaling={}, intra_epoch_eval=0, prealloc_mode='always', iter_size=1, translator=None, verbose=False): """ Constructor for the Seq2SeqTrainer. :param model: model to train :param criterion: criterion (loss function) :param opt_config: dictionary with options for the optimizer :param scheduler_config: dictionary with options for the learning rate scheduler :param print_freq: prints short summary every 'print_freq' iterations :param save_freq: saves checkpoint every 'save_freq' iterations :param grad_clip: coefficient for gradient clipping :param save_info: dict with additional state stored in each checkpoint :param save_dir: path to the directiory for checkpoints :param train_iterations: total number of training iterations to execute :param checkpoint_filename: name of files with checkpoints :param keep_checkpoints: max number of checkpoints to keep :param math: arithmetic type :param loss_scaling: options for dynamic loss scaling :param intra_epoch_eval: number of additional eval runs within each training epoch :param prealloc_mode: controls preallocation, choices=['off', 'once', 'always'] :param iter_size: number of iterations between weight updates :param translator: instance of Translator, runs inference on test set :param verbose: enables verbose logging """ super(Seq2SeqTrainer, self).__init__() self.model = model self.criterion = criterion self.epoch = 0 self.save_info = save_info self.save_dir = save_dir self.save_freq = save_freq self.save_counter = 0 self.checkpoint_filename = checkpoint_filename self.checkpoint_counter = cycle(range(keep_checkpoints)) self.opt_config = opt_config self.device = next(model.parameters()).device self.print_freq = print_freq self.verbose = verbose self.loss = None self.translator = translator self.intra_epoch_eval = intra_epoch_eval self.iter_size = iter_size self.prealloc_mode = prealloc_mode self.preallocated = False self.distributed = torch.distributed.is_initialized() self.batch_first = model.batch_first params = self.model.parameters() if math == 'manual_fp16': self.fp_optimizer = FP16Optimizer( self.model, grad_clip, loss_scale=loss_scaling['init_scale'], dls_upscale_interval=loss_scaling['upscale_interval']) params = self.fp_optimizer.fp32_params elif math == 'fp32': self.fp_optimizer = FP32Optimizer(self.model, grad_clip) opt_name = opt_config.pop('optimizer') self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config) logging.info(f'Using optimizer: {self.optimizer}') self.scheduler = WarmupMultiStepLR(self.optimizer, train_iterations, **scheduler_config) if math == 'fp16': self.model, self.optimizer = amp.initialize( self.model, self.optimizer, cast_model_outputs=torch.float16, keep_batchnorm_fp32=False, opt_level='O2') self.fp_optimizer = AMPOptimizer( self.model, grad_clip, loss_scale=loss_scaling['init_scale'], dls_upscale_interval=loss_scaling['upscale_interval']) if self.distributed: self.model = DistributedDataParallel(self.model)
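# NOTE: illustrative sketch added after the Seq2SeqTrainer constructor above, not part of it. For
# math == 'fp16' it calls amp.initialize with opt_level 'O2' and overrides two properties: model
# outputs are cast to fp16 and batch norm is kept in fp16 as well. A minimal, hedged sketch of
# that call with a toy model (the surrounding AMPOptimizer/grad-clipping logic is omitted):
import torch
from apex import amp

def init_amp_o2_sketch():
    model = torch.nn.Linear(16, 16).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    model, optimizer = amp.initialize(
        model, optimizer,
        opt_level="O2",
        cast_model_outputs=torch.float16,  # downstream loss code receives fp16 outputs
        keep_batchnorm_fp32=False)
    return model, optimizer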
def main(): logger.info("Logger is set - training start") # set seed np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = True if config.distributed: config.gpu = config.local_rank % torch.cuda.device_count() torch.cuda.set_device(config.gpu) # distributed init torch.distributed.init_process_group(backend='nccl', init_method=config.dist_url, world_size=config.world_size, rank=config.local_rank) config.world_size = torch.distributed.get_world_size() config.total_batch_size = config.world_size * config.batch_size else: config.total_batch_size = config.batch_size loaders, samplers = get_augment_datasets(config) train_loader, valid_loader = loaders train_sampler, valid_sampler = samplers net_crit = nn.CrossEntropyLoss().cuda() controller = CDARTSController(config, net_crit, n_nodes=4, stem_multiplier=config.stem_multiplier) file = open(config.cell_file, 'r') js = file.read() r_dict = json.loads(js) if config.local_rank == 0: logger.info(r_dict) file.close() genotypes_dict = {} for layer_idx, genotype in r_dict.items(): genotypes_dict[int(layer_idx)] = gt.from_str(genotype) controller.build_augment_model(controller.init_channel, genotypes_dict) resume_state = None if config.resume: resume_state = torch.load(config.resume_path, map_location='cpu') controller.model_main.load_state_dict(resume_state['model_main']) controller.model_main = controller.model_main.cuda() param_size = utils.param_size(controller.model_main) logger.info("param size = %fMB", param_size) # change training hyper parameters according to cell type if 'cifar' in config.dataset: if param_size < 3.0: config.weight_decay = 3e-4 config.drop_path_prob = 0.2 elif param_size > 3.0 and param_size < 3.5: config.weight_decay = 3e-4 config.drop_path_prob = 0.3 else: config.weight_decay = 5e-4 config.drop_path_prob = 0.3 if config.local_rank == 0: logger.info("Current weight decay: {}".format(config.weight_decay)) logger.info("Current drop path prob: {}".format(config.drop_path_prob)) controller.model_main = apex.parallel.convert_syncbn_model( controller.model_main) # weights optimizer optimizer = torch.optim.SGD(controller.model_main.parameters(), lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay) # optimizer = torch.optim.SGD(controller.model_main.parameters(), lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay, nesterov=True) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, config.epochs) if config.use_amp: controller.model_main, optimizer = amp.initialize( controller.model_main, optimizer, opt_level=config.opt_level) if config.distributed: controller.model_main = DDP(controller.model_main, delay_allreduce=True) best_top1 = 0. best_top5 = 0. 
sta_epoch = 0 # training loop if config.resume: optimizer.load_state_dict(resume_state['optimizer']) lr_scheduler.load_state_dict(resume_state['lr_scheduler']) best_top1 = resume_state['best_top1'] best_top5 = resume_state['best_top5'] sta_epoch = resume_state['sta_epoch'] epoch_pool = [220, 230, 235, 240, 245] for epoch in range(sta_epoch, config.epochs): # reset iterators train_sampler.set_epoch(epoch) valid_sampler.set_epoch(epoch) current_lr = lr_scheduler.get_lr()[0] # current_lr = utils.adjust_lr(optimizer, epoch, config) if config.local_rank == 0: logger.info('Epoch: %d lr %e', epoch, current_lr) if epoch < config.warmup_epochs and config.total_batch_size > 256: for param_group in optimizer.param_groups: param_group['lr'] = current_lr * (epoch + 1) / 5.0 if config.local_rank == 0: logger.info('Warming-up Epoch: %d, LR: %e', epoch, current_lr * (epoch + 1) / 5.0) drop_prob = config.drop_path_prob * epoch / config.epochs controller.model_main.module.drop_path_prob(drop_prob) # training train(train_loader, controller.model_main, optimizer, epoch, writer, logger, config) # validation cur_step = (epoch + 1) * len(train_loader) top1, top5 = validate(valid_loader, controller.model_main, epoch, cur_step, writer, logger, config) if 'cifar' in config.dataset: lr_scheduler.step() elif 'imagenet' in config.dataset: lr_scheduler.step() # current_lr = utils.adjust_lr(optimizer, epoch, config) else: raise Exception('Lr error!') # save if best_top1 < top1: best_top1 = top1 best_top5 = top5 is_best = True else: is_best = False # save if config.local_rank == 0: if ('imagenet' in config.dataset) and ((epoch + 1) in epoch_pool) and ( not config.resume) and (config.local_rank == 0): torch.save( { "model_main": controller.model_main.module.state_dict(), "optimizer": optimizer.state_dict(), "lr_scheduler": lr_scheduler.state_dict(), "best_top1": best_top1, "best_top5": best_top5, "sta_epoch": epoch + 1 }, os.path.join(config.path, "epoch_{}.pth.tar".format(epoch + 1))) utils.save_checkpoint( controller.model_main.module.state_dict(), config.path, is_best) torch.save( { "model_main": controller.model_main.module.state_dict(), "optimizer": optimizer.state_dict(), "lr_scheduler": lr_scheduler.state_dict(), "best_top1": best_top1, "best_top5": best_top5, "sta_epoch": epoch + 1 }, os.path.join(config.path, "retrain_resume.pth.tar")) utils.save_checkpoint(controller.model_main.module.state_dict(), config.path, is_best) if config.local_rank == 0: logger.info("Final best Prec@1 = {:.4%}, Prec@5 = {:.4%}".format( best_top1, best_top5))
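# NOTE: illustrative sketch added after the CDARTS retraining main() above, not part of it. Unlike
# the torch-native SyncBatchNorm/DDP combination earlier, this script uses apex's own sync-BN
# conversion and DDP wrapper with delay_allreduce. A minimal, hedged sketch of that variant,
# assuming an initialized process group and an apex installation:
import torch
from apex import amp
from apex.parallel import convert_syncbn_model, DistributedDataParallel as ApexDDP

def wrap_with_apex_parallel_sketch(model, optimizer, opt_level="O1"):
    model = convert_syncbn_model(model).cuda()
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    # delay_allreduce waits for all gradients before reducing them, trading overlap for robustness
    model = ApexDDP(model, delay_allreduce=True)
    return model, optimizer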
model.to(device) optimizer = optim.Adam(model.parameters(), lr=args.learning_rate) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=200000 // world_size, gamma=0.5) pretrained = True if args.load_step > 0 else False if pretrained is False: # do ActNorm initialization first (if model.pretrained is True, this does nothing so no worries) x_seed, c_seed = next(iter(train_loader)) x_seed, c_seed = x_seed.to(device), c_seed.to(device) with torch.no_grad(): _, _ = model(x_seed, c_seed) del x_seed, c_seed, _ # then convert the model to DataParallel later (since ActNorm init from the DataParallel is wacky) model, optimizer = amp.initialize(model, optimizer, opt_level="O0") model = DistributedDataParallel(model) global_step = 0 global_epoch = 0 if args.load_step == 0: list_train_loss, list_loss = [], [] test_loss = 100.0 else: model, optimizer, scheduler = load_checkpoint(args.load_step, model, optimizer, scheduler) list_train_loss = np.load('{}/{}_train.npy'.format(args.loss, args.model_name)).tolist() list_loss = np.load('{}/{}.npy'.format(args.loss, args.model_name)).tolist() list_train_loss = list_train_loss[:global_epoch] list_loss = list_loss[:global_epoch] test_loss = np.min(list_loss)
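# NOTE: illustrative sketch added after the snippet above, not part of it. The snippet runs one
# dummy forward pass before any wrapping so that data-dependent layers (ActNorm) initialize from a
# real batch on the bare model; only afterwards are amp.initialize and the parallel wrapper
# applied. A hedged sketch of that ordering; the torch-native DistributedDataParallel import is an
# assumption (the original may use apex.parallel's wrapper instead), argument names are
# illustrative, and `model` is assumed to already live on `device`.
import torch
from apex import amp
from torch.nn.parallel import DistributedDataParallel

def init_actnorm_then_wrap_sketch(model, optimizer, train_loader, device):
    x_seed, c_seed = next(iter(train_loader))
    x_seed, c_seed = x_seed.to(device), c_seed.to(device)
    with torch.no_grad():
        model(x_seed, c_seed)  # triggers the data-dependent (ActNorm) initialization
    del x_seed, c_seed
    # wrap only after the initialization pass; O0 keeps fp32 behaviour, matching the snippet above
    model, optimizer = amp.initialize(model, optimizer, opt_level="O0")
    model = DistributedDataParallel(model)
    return model, optimizer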
opt.print_freq = 1 opt.niter = 1 opt.niter_decay = 0 opt.max_dataset_size = 10 data_loader = CreateDataLoader(opt) dataset = data_loader.load_data() dataset_size = len(data_loader) print('#training images = %d' % dataset_size) model = create_model(opt) visualizer = Visualizer(opt) if opt.fp16: from apex import amp model, [optimizer_G, optimizer_D ] = amp.initialize(model, [model.optimizer_G, model.optimizer_D], opt_level='O1') model = torch.nn.DataParallel(model, device_ids=opt.gpu_ids) else: optimizer_G, optimizer_D = model.module.optimizer_G, model.module.optimizer_D total_steps = (start_epoch - 1) * dataset_size + epoch_iter display_delta = total_steps % opt.display_freq print_delta = total_steps % opt.print_freq save_delta = total_steps % opt.save_latest_freq for epoch in range(start_epoch, opt.niter + opt.niter_decay + 1): epoch_start_time = time.time() if epoch != start_epoch: epoch_iter = epoch_iter % dataset_size for i, data in enumerate(dataset, start=epoch_iter):
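# NOTE: illustrative sketch added after the pix2pixHD-style snippet above, not part of it. That
# snippet passes a list of optimizers to amp.initialize and receives the patched list back, which
# is how apex handles fp16 with separate generator/discriminator optimizers. A minimal, hedged
# sketch with toy modules (all names here are illustrative):
import torch
from apex import amp

def init_gan_optimizers_sketch():
    netG = torch.nn.Linear(8, 8).cuda()
    netD = torch.nn.Linear(8, 1).cuda()
    model = torch.nn.ModuleDict({"G": netG, "D": netD})
    optimizer_G = torch.optim.Adam(netG.parameters(), lr=2e-4)
    optimizer_D = torch.optim.Adam(netD.parameters(), lr=2e-4)
    model, [optimizer_G, optimizer_D] = amp.initialize(
        model, [optimizer_G, optimizer_D], opt_level="O1")
    # each backward pass then names the optimizer whose loss is being scaled, e.g.:
    #   with amp.scale_loss(loss_G, optimizer_G) as scaled_loss: scaled_loss.backward()
    return model, optimizer_G, optimizer_D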