Example #1
    def train_one_batch(self, batch):
        """A normal training core without fetching data from iterator.
        """
        model = self.elements["model"]
        model_forward = self.elements["model_forward"]
        optimizer = self.elements["optimizer"]

        if not model.training:
            model.train()

        if self.params["nan_debug"]:
            device = utils.get_device(self.elements["model"])
            inputs = torch.load("{0}/nan.batch".format(self.params["model_dir"])).to(device)
            targets = torch.load("{0}/nan.targets".format(self.params["model_dir"])).to(device)
            self.elements["model"].load_state_dict(torch.load("{0}/nan.params".format(self.params["model_dir"]), 
                                             map_location="cpu"))
            self.elements["model"].to(device)
        else:
            inputs, targets = batch
        optimizer.zero_grad()

        loss = model.get_loss(model_forward(inputs), targets)
        loss.backward()
        loss = loss.detach()  # Drop the autograd reference; only the scalar value is needed below.

        if self.params["max_change"] > 0:
            # Reference: https://github.com/horovod/horovod/blob/master/horovod/torch/__init__.py (lines 420~423).
            # Synchronize the gradients before computing grad_norm when using horovod.
            if utils.use_horovod(): optimizer.synchronize()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), self.params["max_change"])

            if math.isnan(grad_norm):
                if self.params["nan_debug"]:
                    raise RuntimeError("[NOT OK] Nan is still found in this debug.")
                torch.save(inputs.cpu(), "{0}/nan.batch".format(self.params["model_dir"]))
                torch.save(targets.cpu(), "{0}/nan.targets".format(self.params["model_dir"]))
                torch.save(self.elements["model"].state_dict(), "{0}/nan.params".format(self.params["model_dir"]))
                raise RuntimeError("NaN detected in iter/epoch: {0}/{1} (the NaN batch and params are saved in {2}).".format(
                    self.training_point[1] + 1, self.training_point[0] + 1,
                    "{0}/nan.*".format(self.params["model_dir"])))
            else:
                if self.params["nan_debug"]:
                    raise RuntimeError("[OK] There is no nan found for this debug.")
                if utils.use_horovod():
                    with optimizer.skip_synchronize():
                        optimizer.step()
                else:
                    optimizer.step()
        else:
            optimizer.step()

        accuracy = model.get_accuracy(targets) if self.params["compute_accuracy"] else None

        return loss.item(), accuracy
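
The clip-then-check pattern above is the core of this training step: clip gradients to max_change, then treat a NaN gradient norm as a hard failure worth dumping state for. A minimal standalone sketch of that pattern in plain PyTorch (toy model and data, no Horovod; names here are illustrative, not the toolkit's):

import math
import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
max_change = 5.0  # plays the role of self.params["max_change"]

inputs, targets = torch.randn(4, 10), torch.randint(0, 2, (4,))

optimizer.zero_grad()
loss = torch.nn.functional.cross_entropy(model(inputs), targets)
loss.backward()

# Clip first, then inspect the returned total norm for NaN.
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_change)
if math.isnan(float(grad_norm)):
    raise RuntimeError("NaN gradient norm; dump the batch/params for debugging.")
optimizer.step()
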
Example #2
    def train_one_batch(self, batch):
        """A normal training core without fetching data from iterator.
        """
        model = self.elements["model"]
        model_forward = self.elements["model_forward"]
        optimizer = self.elements["optimizer"]

        if not model.training:
            model.train()

        inputs, targets = batch
        optimizer.zero_grad()

        loss = model.get_loss(model_forward(inputs), targets)
        loss.backward()
        loss = loss.detach()  # Drop the autograd reference; only the scalar value is needed below.

        if self.params["max_change"] > 0:
            # Reference: https://github.com/horovod/horovod/blob/master/horovod/torch/__init__.py (lines 420~423).
            # Synchronize the gradients before computing grad_norm when using horovod.
            if utils.use_horovod(): optimizer.synchronize()
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), self.params["max_change"])

            if math.isnan(grad_norm):
                raise RuntimeError(
                    'NaN detected in iter/epoch: {0}/{1}.'.format(
                        self.training_point[1] + 1,
                        self.training_point[0] + 1))
            else:
                if utils.use_horovod():
                    with optimizer.skip_synchronize():
                        optimizer.step()
                else:
                    optimizer.step()
        else:
            optimizer.step()

        accuracy = model.compute_accuracy(
            model.get_posterior(),
            targets) if self.params["compute_accuracy"] else None

        return loss.item(), accuracy
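
The accuracy branch relies on two toolkit methods, get_posterior() and compute_accuracy(), which are not shown here. A hedged sketch of what such a pair could look like (the real methods live in the toolkit's model base class and may differ):

import torch

class TinyClassifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(10, 2)
        self.posterior = None  # cache the output of the last forward

    def forward(self, inputs):
        self.posterior = self.linear(inputs)
        return self.posterior

    def get_posterior(self):
        return self.posterior

    def compute_accuracy(self, posterior, targets):
        # Fraction of samples whose argmax class matches the target.
        return (posterior.argmax(dim=1) == targets).float().mean().item()
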
Example #3
    def __init__(self, optimizer, params:dict={}):
        # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and
        #                         1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod etc.)
        default_params = {
            "name":"warmR",
            "1cycle.learn_rate":0.001,
            "warmR.T_max":10,
            "warmR.T_mult":1,
            "warmR.factor":1.0,
            "warmR.eta_min":4e-8,
            "warmR.log_decay":False,
            "warmR.lr_decay_step":1,
            "reduceP.metric":'valid_acc',
            "reduceP.check_interval":0, 
            "reduceP.factor":0.1, 
            "reduceP.patience":10, 
            "reduceP.threshold":0.0001, 
            "reduceP.cooldown":0, 
            "reduceP.min_lr":0
        }

        used_params = utils.assign_params_dict(default_params, params, force_check=False, support_unknow=True)
        split_params = utils.split_params(used_params)

        if isinstance(optimizer, Lookahead):
            base_optimizer = optimizer.optimizer
        else:
            base_optimizer = optimizer

        self.name = split_params["public"]["name"]
        if self.name == "1cycle":
            # To do.
            self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, **split_params["1cycle"])
        elif self.name == "warmR":
            T_max = split_params["warmR"].pop("T_max")
            self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
            self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
        elif self.name == "reduceP":
            self.check_interval = split_params["reduceP"].pop("check_interval")
            self.metric = split_params["reduceP"].pop("metric")
            if self.metric == "valid_acc":
                mode = "max"
            elif self.metric == "valid_loss":
                mode = "min"
            else:
                raise ValueError("Do not support {} metric for ReduceLROnPlateau strategy.".format(self.metric))
            self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(base_optimizer, mode=mode, **split_params["reduceP"])
            self.init = False
            if utils.use_horovod():
                raise TypeError("ReduceLROnPlateau is not supported with Horovod multi-GPU training yet.")
        else:
            raise ValueError("Do not support {0} lr_scheduler now.".format(name))
Example #4
    def init_training(self):
        model = self.elements["model"]
        start_epoch = self.params["start_epoch"]
        exist_model = self.params["exist_model"]
        model_dir = self.params["model_dir"]
        model_blueprint = self.params["model_blueprint"]
        suffix = self.params["suffix"]

        if start_epoch <= 0 and utils.is_main_training():
            model_creation = model.get_model_creation()
            utils.write_nnet_config(model_blueprint, model_creation, "{0}/config/nnet.config".format(model_dir))

        ## Recover checkpoint | Transform learning | Initialize parameters
        if start_epoch > 0:
            # Here start_epoch equals the number of completed epochs.
            if utils.is_main_training(): logger.info("Recover training from {0} epoch.".format(start_epoch))
            model.load_state_dict(torch.load('{0}/{1}.{2}'.format(model_dir, start_epoch, suffix), 
                                             map_location="cpu"))
        elif os.path.exists(exist_model):
            if utils.is_main_training(): logger.info("Use {0} as the initial model to start transform-training.".format(exist_model))
            model.load_transform_state_dict(torch.load(exist_model, map_location="cpu"))
        else:
            # Use the raw initial model, or re-initialize it with custom init functions here.
            pass  # For now, the raw initial model is used.

        if utils.use_horovod():
            import horovod.torch as hvd

            # Broadcast parameters from rank 0 to all other processes.
            hvd.broadcast_parameters(self.elements["model"].state_dict(), root_rank=0)

            # For optimizer wrappers such as Lookahead.
            if getattr(self.elements["optimizer"], "optimizer", None) is not None:
                raise TypeError("Do not support using lookahead with horovod now.")
            else:
                # Broadcast optimizer state.
                hvd.broadcast_optimizer_state(self.elements["optimizer"], root_rank=0)
                self.elements["optimizer"] = hvd.DistributedOptimizer(self.elements["optimizer"], 
                                             named_parameters=self.elements["model"].named_parameters())

        ## Select device
        model = self.select_device()

        # The original model is built in libs.nnet.framework.TopVirtualNnet, and its methods are not
        # directly accessible after it is wrapped by DistributedDataParallel. So, to keep calling
        # TopVirtualNnet methods conveniently, self.elements["model_forward"] is set here to refer to
        # the DistributedDataParallel wrapper while self.elements["model"] keeps the inner module.
        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
            self.elements["model"] = model.module
            self.elements["model_forward"] = model
Example #5
    def __init__(self,
                 trainset,
                 valid=None,
                 use_fast_loader=False,
                 max_prefetch=10,
                 batch_size=512,
                 valid_batch_size=512,
                 shuffle=True,
                 num_workers=0,
                 pin_memory=False,
                 drop_last=True):

        num_samples = len(trainset)
        num_gpu = 1
        multi_gpu = False
        if utils.use_horovod():
            # Multi-GPU training.
            import horovod.torch as hvd
            # Partition dataset among workers using DistributedSampler
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                trainset,
                num_replicas=hvd.size(),
                rank=hvd.rank(),
                shuffle=shuffle)
            multi_gpu = True
            num_gpu = hvd.size()
        elif utils.use_ddp():
            # The num_replicas/world_size and rank will be set automatically with DDP.
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                trainset, shuffle=shuffle)
            multi_gpu = True
            num_gpu = dist.get_world_size()
        else:
            train_sampler = None

        if multi_gpu:
            # When a DistributedSampler is used, the DataLoader's shuffle must be set to False.
            shuffle = False

        if use_fast_loader:
            self.train_loader = DataLoaderFast(max_prefetch,
                                               trainset,
                                               batch_size=batch_size,
                                               shuffle=shuffle,
                                               num_workers=num_workers,
                                               pin_memory=pin_memory,
                                               drop_last=drop_last,
                                               sampler=train_sampler)
        else:
            self.train_loader = DataLoader(trainset,
                                           batch_size=batch_size,
                                           shuffle=shuffle,
                                           num_workers=num_workers,
                                           pin_memory=pin_memory,
                                           drop_last=drop_last,
                                           sampler=train_sampler)

        self.num_batch_train = len(self.train_loader)

        if self.num_batch_train <= 0:
            raise ValueError(
                "Expected num_batch of trainset > 0. Egs info: num_gpu={}, num_samples/gpu={}, "
                "batch_size={}, drop_last={}.\nNote: if batch_size > num_samples/gpu and drop_last "
                "is True, you get 0 batches.".format(num_gpu,
                                                     len(trainset) / num_gpu, batch_size,
                                                     drop_last))

        if valid is not None:
            if len(valid) <= 0:
                raise ValueError("Expected num_samples of valid > 0.")

            # Cap the batch size at the dataset size to save GPU memory.
            valid_batch_size = min(valid_batch_size, len(valid))

            # Do not use DataLoaderFast for the valid set: memory keeps growing when
            # compute_valid_accuracy is True, and the root cause has not been found yet.
            self.valid_loader = DataLoader(valid,
                                           batch_size=valid_batch_size,
                                           shuffle=False,
                                           num_workers=num_workers,
                                           pin_memory=pin_memory,
                                           drop_last=False)

            self.num_batch_valid = len(self.valid_loader)
        else:
            self.valid_loader = None
            self.num_batch_valid = 0
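
One detail the snippet leaves to the caller: with a DistributedSampler, each epoch should call sampler.set_epoch(epoch) so the shuffle order differs across epochs. A minimal sketch with toy data (assumes the default torch.distributed process group is already initialized):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

trainset = TensorDataset(torch.randn(1024, 10), torch.randint(0, 2, (1024,)))
sampler = DistributedSampler(trainset, shuffle=True)
loader = DataLoader(trainset, batch_size=64, shuffle=False,  # the sampler owns shuffling
                    sampler=sampler, drop_last=True)

for epoch in range(3):
    sampler.set_epoch(epoch)  # reshuffle differently each epoch
    for inputs, targets in loader:
        pass  # training step goes here
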
Example #6
    def __init__(self, optimizer, params:dict={}):
        # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and
        #                         1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod etc.)
        default_params = {
            "name":"warmR",

            "cyclic.max_lr":1e-3,
            "cyclic.base_lr":1e-8,
            "cyclic.step_size_up":2e4,
            "cyclic.step_size_down":None,
            "cyclic.mode":'triangular2', 
            "cyclic.gamma":1.0, 
            "cyclic.scale_fn":None, 
            "cyclic.scale_mode":'cycle', 
            "cyclic.cycle_momentum":False, 
            "cyclic.base_momentum":0.8, 
            "cyclic.max_momentum":0.9,

            "1cycle.learn_rate":0.001,
            "1cycle.total_steps":None,
            "1cycle.epochs":None,
            "1cycle.steps_per_epoch":None,
            "1cycle.pct_start":0.3,
            "1cycle.anneal_strategy":'linear',
            "1cycle.cycle_momentum":False,
            "1cycle.base_momentum":0.85,
            "1cycle.max_momentum":0.95,
            "1cycle.div_factor":25.0,
            "1cycle.final_div_factor":10000.0,

            "warmR.T_max":10,
            "warmR.T_mult":1,
            "warmR.factor":1.0,
            "warmR.eta_min":4e-8,
            "warmR.log_decay":False,
            "warmR.lr_decay_step":1,

            "reduceP.metric":'valid_acc',
            "reduceP.check_interval":0, 
            "reduceP.factor":0.5, 
            "reduceP.patience":10, 
            "reduceP.threshold":0.0001, 
            "reduceP.cooldown":0, 
            "reduceP.min_lr":0.
        }

        used_params = utils.assign_params_dict(default_params, params, force_check=False, support_unknow=True)
        split_params = utils.split_params(used_params)

        if isinstance(optimizer, Lookahead):
            base_optimizer = optimizer.optimizer
        else:
            base_optimizer = optimizer

        self.name = split_params["public"]["name"]
        if self.name == "cyclic":
            base_lr = split_params["cyclic"].pop("base_lr")
            max_lr = split_params["cyclic"].pop("max_lr")
            self.lr_scheduler = torch.optim.lr_scheduler.CyclicLR(base_optimizer, base_lr, max_lr, **split_params["cyclic"])
        elif self.name == "1cycle":
            max_lr = split_params["1cycle"].pop("learn_rate")
            self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, max_lr, **split_params["1cycle"])
        elif self.name == "warmR":
            T_max = split_params["warmR"].pop("T_max")
            self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
            self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
        elif self.name == "reduceP":
            self.check_interval = split_params["reduceP"].pop("check_interval")
            self.metric = split_params["reduceP"].pop("metric")
            self.min_lr = split_params["reduceP"]["min_lr"]
            if self.metric == "valid_acc":
                mode = "max"
            elif self.metric == "valid_loss":
                mode = "min"
            else:
                raise ValueError("Do not support {} metric for ReduceLROnPlateau strategy.".format(self.metric))
            self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(base_optimizer, mode=mode, **split_params["reduceP"])
            self.init = False
            if utils.use_horovod():
                raise TypeError("ReduceLROnPlateau is not supported with Horovod multi-GPU training yet.")
        else:
            raise ValueError("Do not support {0} lr_scheduler now.".format(name))