def train_one_batch(self, batch): """A normal training core without fetching data from iterator. """ model = self.elements["model"] model_forward = self.elements["model_forward"] optimizer = self.elements["optimizer"] if not model.training: model.train() if self.params["nan_debug"]: device = utils.get_device(self.elements["model"]) inputs = torch.load("{0}/nan.batch".format(self.params["model_dir"])).to(device) targets = torch.load("{0}/nan.targets".format(self.params["model_dir"])).to(device) self.elements["model"].load_state_dict(torch.load("{0}/nan.params".format(self.params["model_dir"]), map_location="cpu")) self.elements["model"].to(device) else: inputs, targets = batch optimizer.zero_grad() loss = model.get_loss(model_forward(inputs), targets) loss.backward() loss.detach() # For safe. if self.params["max_change"] > 0: # Reference:https://github.com/horovod/horovod/blob/master/horovod/torch/__init__.py:420~423. # Synchronize the grad for grad_norm when using horovod. if utils.use_horovod(): optimizer.synchronize() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), self.params["max_change"]) if math.isnan(grad_norm): if self.params["nan_debug"]: raise RuntimeError("[NOT OK] Nan is still found in this debug.") torch.save(inputs.cpu(), "{0}/nan.batch".format(self.params["model_dir"])) torch.save(targets.cpu(), "{0}/nan.targets".format(self.params["model_dir"])) torch.save(self.elements["model"].state_dict(), "{0}/nan.params".format(self.params["model_dir"])) raise RuntimeError('There is Nan problem in iter/epoch: {0}/{1} (nan batch and params are saved in {2})'.format(self.training_point[1]+1, self.training_point[0]+1, "{0}/nan.*".format(self.params["model_dir"]))) else: if self.params["nan_debug"]: raise RuntimeError("[OK] There is no nan found for this debug.") if utils.use_horovod(): with optimizer.skip_synchronize(): optimizer.step() else: optimizer.step() else: optimizer.step() accuracy = model.get_accuracy(targets) if self.params["compute_accuracy"] else None return loss.item(), accuracy
def train_one_batch(self, batch): """A normal training core without fetching data from iterator. """ model = self.elements["model"] model_forward = self.elements["model_forward"] optimizer = self.elements["optimizer"] if not model.training: model.train() inputs, targets = batch optimizer.zero_grad() loss = model.get_loss(model_forward(inputs), targets) loss.backward() loss.detach() # For safe. if self.params["max_change"] > 0: # Reference:https://github.com/horovod/horovod/blob/master/horovod/torch/__init__.py:420~423. # Synchronize the grad for grad_norm when using horovod. if utils.use_horovod(): optimizer.synchronize() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), self.params["max_change"]) if math.isnan(grad_norm): raise RuntimeError( 'There is nan problem in iter/epoch: {0}/{1}'.format( self.training_point[1] + 1, self.training_point[0] + 1)) else: if utils.use_horovod(): with optimizer.skip_synchronize(): optimizer.step() else: optimizer.step() else: optimizer.step() accuracy = model.compute_accuracy( model.get_posterior(), targets) if self.params["compute_accuracy"] else None return loss.item(), accuracy
def __init__(self, optimizer, params: dict = {}):
    # Suggested weight_decay: 1e-4 for L2 regularization (sgd, adam) and
    # 1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.).
    default_params = {
        "name": "warmR",

        "1cycle.learn_rate": 0.001,

        "warmR.T_max": 10,
        "warmR.T_mult": 1,
        "warmR.factor": 1.0,
        "warmR.eta_min": 4e-8,
        "warmR.log_decay": False,
        "warmR.lr_decay_step": 1,

        "reduceP.metric": 'valid_acc',
        "reduceP.check_interval": 0,
        "reduceP.factor": 0.1,
        "reduceP.patience": 10,
        "reduceP.threshold": 0.0001,
        "reduceP.cooldown": 0,
        "reduceP.min_lr": 0
    }

    used_params = utils.assign_params_dict(default_params, params, force_check=False, support_unknow=True)
    split_params = utils.split_params(used_params)

    if isinstance(optimizer, Lookahead):
        base_optimizer = optimizer.optimizer
    else:
        base_optimizer = optimizer

    self.name = split_params["public"]["name"]

    if self.name == "1cycle":
        # To do.
        max_lr = split_params["1cycle"].pop("learn_rate")
        self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, max_lr, **split_params["1cycle"])
    elif self.name == "warmR":
        T_max = split_params["warmR"].pop("T_max")
        self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
        self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
    elif self.name == "reduceP":
        self.check_interval = split_params["reduceP"].pop("check_interval")
        self.metric = split_params["reduceP"].pop("metric")
        if self.metric == "valid_acc":
            mode = "max"
        elif self.metric == "valid_loss":
            mode = "min"
        else:
            raise ValueError("Do not support {} metric for ReduceLROnPlateau strategy.".format(self.metric))
        self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(base_optimizer, mode=mode,
                                                                 **split_params["reduceP"])
        self.init = False
        if utils.use_horovod():
            raise TypeError("Do not support ReduceLROnPlateau for multi-GPU Horovod training now.")
    else:
        raise ValueError("Do not support {0} lr_scheduler now.".format(self.name))
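# --- A hypothetical stand-in (not the repo's implementation) illustrating what
# utils.split_params is assumed to do above: dotted keys such as "warmR.T_max"
# are grouped by their prefix, and un-prefixed keys such as "name" land in the
# "public" group.
def split_params_sketch(used_params: dict) -> dict:
    split = {"public": {}}
    for key, value in used_params.items():
        if "." in key:
            group, sub_key = key.split(".", 1)
            split.setdefault(group, {})[sub_key] = value
        else:
            split["public"][key] = value
    return split

# Example of the assumed grouping behavior.
assert split_params_sketch({"name": "warmR", "warmR.T_max": 10}) == \
    {"public": {"name": "warmR"}, "warmR": {"T_max": 10}}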
def init_training(self):
    model = self.elements["model"]
    start_epoch = self.params["start_epoch"]
    exist_model = self.params["exist_model"]
    model_dir = self.params["model_dir"]
    model_blueprint = self.params["model_blueprint"]
    suffix = self.params["suffix"]

    if start_epoch <= 0 and utils.is_main_training():
        model_creation = model.get_model_creation()
        utils.write_nnet_config(model_blueprint, model_creation, "{0}/config/nnet.config".format(model_dir))

    ## Recover checkpoint | Transform learning | Initialize parameters
    if start_epoch > 0:
        # This train_stage is equal to the number of completed epochs.
        if utils.is_main_training():
            logger.info("Recover training from epoch {0}.".format(start_epoch))
        model.load_state_dict(torch.load('{0}/{1}.{2}'.format(model_dir, start_epoch, suffix),
                                         map_location="cpu"))
    elif os.path.exists(exist_model):
        if utils.is_main_training():
            logger.info("Use {0} as the initial model to start transform-training.".format(exist_model))
        model.load_transform_state_dict(torch.load(exist_model, map_location="cpu"))
    else:
        # Just use the raw initial model, or initialize it again with some init functions here.
        pass  # It means the raw initial model is used now.

    if utils.use_horovod():
        import horovod.torch as hvd

        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(self.elements["model"].state_dict(), root_rank=0)

        # For an optimizer wrapper such as Lookahead.
        if getattr(self.elements["optimizer"], "optimizer", None) is not None:
            raise TypeError("Do not support using Lookahead with Horovod now.")
        else:
            # Broadcast optimizer state.
            hvd.broadcast_optimizer_state(self.elements["optimizer"], root_rank=0)
            self.elements["optimizer"] = hvd.DistributedOptimizer(
                self.elements["optimizer"],
                named_parameters=self.elements["model"].named_parameters())

    ## Select device
    model = self.select_device()

    # The original model is built in libs.nnet.framework.TopVirtualNnet, and it is not directly
    # available after being wrapped by DistributedDataParallel. So, to call functions of
    # TopVirtualNnet conveniently, self.elements["model_forward"] is set here to name the
    # DistributedDataParallel wrapper.
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        self.elements["model"] = model.module
        self.elements["model_forward"] = model
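# --- A minimal, self-contained sketch of the recovery convention assumed above:
# epoch checkpoints live at "{model_dir}/{epoch}.{suffix}", so recovering from
# start_epoch=N just reloads "{model_dir}/N.{suffix}" on CPU. The Linear model,
# the /tmp path, and the suffix value are stand-ins.
import os
import torch

model_dir = "/tmp/init_training_sketch"
os.makedirs(model_dir, exist_ok=True)
suffix = "params"

model = torch.nn.Linear(4, 2)
torch.save(model.state_dict(), "{0}/{1}.{2}".format(model_dir, 3, suffix))

start_epoch = 3  # The train_stage: number of completed epochs.
model.load_state_dict(torch.load("{0}/{1}.{2}".format(model_dir, start_epoch, suffix),
                                 map_location="cpu"))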
def __init__(self, trainset, valid=None, use_fast_loader=False, max_prefetch=10,
             batch_size=512, valid_batch_size=512, shuffle=True, num_workers=0,
             pin_memory=False, drop_last=True):
    num_samples = len(trainset)
    num_gpu = 1
    multi_gpu = False

    if utils.use_horovod():
        # Multi-GPU training.
        import horovod.torch as hvd

        # Partition the dataset among workers using DistributedSampler.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            trainset, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=shuffle)
        multi_gpu = True
        num_gpu = hvd.size()
    elif utils.use_ddp():
        # The num_replicas/world_size and rank will be set automatically with DDP.
        train_sampler = torch.utils.data.distributed.DistributedSampler(trainset, shuffle=shuffle)
        multi_gpu = True
        num_gpu = dist.get_world_size()
    else:
        train_sampler = None

    if multi_gpu:
        # When using DistributedSampler, the shuffle of DataLoader must be set to False.
        shuffle = False

    if use_fast_loader:
        self.train_loader = DataLoaderFast(max_prefetch, trainset, batch_size=batch_size,
                                           shuffle=shuffle, num_workers=num_workers,
                                           pin_memory=pin_memory, drop_last=drop_last,
                                           sampler=train_sampler)
    else:
        self.train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=shuffle,
                                       num_workers=num_workers, pin_memory=pin_memory,
                                       drop_last=drop_last, sampler=train_sampler)

    self.num_batch_train = len(self.train_loader)

    if self.num_batch_train <= 0:
        raise ValueError(
            "Expected num_batch of trainset > 0. Here is your egs info: num_gpu={}, "
            "num_samples/gpu={}, batch-size={}, drop_last={}.\nNote: if batch-size > "
            "num_samples/gpu and drop_last is True, then it will get 0 batches.".format(
                num_gpu, len(trainset) / num_gpu, batch_size, drop_last))

    if valid is not None:
        valid_batch_size = min(valid_batch_size, len(valid))  # To save GPU memory.

        if len(valid) <= 0:
            raise ValueError("Expected num_samples of valid > 0.")

        # Do not use DataLoaderFast for valid, for it increases the memory all the time
        # when compute_valid_accuracy is True. The real reason has not been found yet.
        self.valid_loader = DataLoader(valid, batch_size=valid_batch_size, shuffle=False,
                                       num_workers=num_workers, pin_memory=pin_memory,
                                       drop_last=False)
        self.num_batch_valid = len(self.valid_loader)
    else:
        self.valid_loader = None
        self.num_batch_valid = 0
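# --- A minimal, single-process sketch of the sampler logic above: with a
# DistributedSampler, the DataLoader's own shuffle must be False, and shuffling
# is delegated to the sampler. num_replicas/rank are passed explicitly so this
# runs without an initialized process group; the random dataset is a stand-in.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

trainset = TensorDataset(torch.randn(100, 4), torch.randint(0, 2, (100,)))
train_sampler = DistributedSampler(trainset, num_replicas=2, rank=0, shuffle=True)
train_loader = DataLoader(trainset, batch_size=16, shuffle=False,
                          sampler=train_sampler, drop_last=True)

# Each of the 2 replicas sees 50 samples; with batch_size=16 and drop_last=True
# that yields len(train_loader) == 3 batches per replica.
assert len(train_loader) == 3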
def __init__(self, optimizer, params: dict = {}):
    # Suggested weight_decay: 1e-4 for L2 regularization (sgd, adam) and
    # 1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.).
    default_params = {
        "name": "warmR",

        "cyclic.max_lr": 1e-3,
        "cyclic.base_lr": 1e-8,
        "cyclic.step_size_up": 2e4,
        "cyclic.step_size_down": None,
        "cyclic.mode": 'triangular2',
        "cyclic.gamma": 1.0,
        "cyclic.scale_fn": None,
        "cyclic.scale_mode": 'cycle',
        "cyclic.cycle_momentum": False,
        "cyclic.base_momentum": 0.8,
        "cyclic.max_momentum": 0.9,

        "1cycle.learn_rate": 0.001,
        "1cycle.total_steps": None,
        "1cycle.epochs": None,
        "1cycle.steps_per_epoch": None,
        "1cycle.pct_start": 0.3,
        "1cycle.anneal_strategy": 'linear',
        "1cycle.cycle_momentum": False,
        "1cycle.base_momentum": 0.85,
        "1cycle.max_momentum": 0.95,
        "1cycle.div_factor": 25.0,
        "1cycle.final_div_factor": 10000.0,

        "warmR.T_max": 10,
        "warmR.T_mult": 1,
        "warmR.factor": 1.0,
        "warmR.eta_min": 4e-8,
        "warmR.log_decay": False,
        "warmR.lr_decay_step": 1,

        "reduceP.metric": 'valid_acc',
        "reduceP.check_interval": 0,
        "reduceP.factor": 0.5,
        "reduceP.patience": 10,
        "reduceP.threshold": 0.0001,
        "reduceP.cooldown": 0,
        "reduceP.min_lr": 0.
    }

    used_params = utils.assign_params_dict(default_params, params, force_check=False, support_unknow=True)
    split_params = utils.split_params(used_params)

    if isinstance(optimizer, Lookahead):
        base_optimizer = optimizer.optimizer
    else:
        base_optimizer = optimizer

    self.name = split_params["public"]["name"]

    if self.name == "cyclic":
        base_lr = split_params["cyclic"].pop("base_lr")
        max_lr = split_params["cyclic"].pop("max_lr")
        self.lr_scheduler = torch.optim.lr_scheduler.CyclicLR(base_optimizer, base_lr, max_lr,
                                                              **split_params["cyclic"])
    elif self.name == "1cycle":
        max_lr = split_params["1cycle"].pop("learn_rate")
        self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, max_lr, **split_params["1cycle"])
    elif self.name == "warmR":
        T_max = split_params["warmR"].pop("T_max")
        self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
        self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
    elif self.name == "reduceP":
        self.check_interval = split_params["reduceP"].pop("check_interval")
        self.metric = split_params["reduceP"].pop("metric")
        self.min_lr = split_params["reduceP"]["min_lr"]
        if self.metric == "valid_acc":
            mode = "max"
        elif self.metric == "valid_loss":
            mode = "min"
        else:
            raise ValueError("Do not support {} metric for ReduceLROnPlateau strategy.".format(self.metric))
        self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(base_optimizer, mode=mode,
                                                                 **split_params["reduceP"])
        self.init = False
        if utils.use_horovod():
            raise TypeError("Do not support ReduceLROnPlateau for multi-GPU Horovod training now.")
    else:
        raise ValueError("Do not support {0} lr_scheduler now.".format(self.name))
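# --- A minimal sketch of what the "warmR" branch above builds, using torch's
# stock optim.lr_scheduler.CosineAnnealingWarmRestarts directly. Note the repo's
# custom CosineAnnealingWarmRestarts additionally accepts factor/log_decay,
# which the stock class below does not; the model and lr values are stand-ins.
import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=4e-8)

for epoch in range(3):
    optimizer.step()   # Placeholder update; a real loop trains batches here.
    scheduler.step()   # Cosine-anneal the lr, restarting every T_0 epochs.
    print(optimizer.param_groups[0]["lr"])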