def __set_fit_dataloaders(self, model, train_dataloader, val_dataloaders, test_dataloaders):
    # when dataloaders are passed via fit, patch the model's dataloader
    # functions to return these implementations instead
    if train_dataloader is not None:
        if not self.is_overriden('training_step', model):
            m = 'You called .fit() with a train_dataloader but did not define training_step()'
            raise MisconfigurationException(m)

        def patch_train_dataloader():
            return train_dataloader

        model.train_dataloader = patch_train_dataloader

    if val_dataloaders is not None:
        if not self.is_overriden('validation_step', model):
            m = 'You called .fit() with val_dataloaders but did not define validation_step()'
            raise MisconfigurationException(m)

        def patch_val_dataloader():
            return val_dataloaders

        model.val_dataloader = patch_val_dataloader

    if test_dataloaders is not None:
        if not self.is_overriden('test_step', model):
            m = 'You called .fit() with test_dataloaders but did not define test_step()'
            raise MisconfigurationException(m)

        def patch_test_dataloader():
            return test_dataloaders

        model.test_dataloader = patch_test_dataloader
def __attach_dataloaders(self, model, train_dataloader, val_dataloaders, test_dataloaders):
    # when dataloaders are passed via fit, patch the model's dataloader
    # functions to return these implementations instead
    if train_dataloader is not None:
        if not self.is_overriden('training_step', model):
            raise MisconfigurationException(
                'You called `.fit()` with a `train_dataloader` but did not define `training_step()`')

        model.train_dataloader = _PatchDataLoader(train_dataloader)

    if val_dataloaders is not None:
        if not self.is_overriden('validation_step', model):
            raise MisconfigurationException(
                'You called `.fit()` with `val_dataloaders` but did not define `validation_step()`')

        model.val_dataloader = _PatchDataLoader(val_dataloaders)

    if test_dataloaders is not None:
        if not self.is_overriden('test_step', model):
            raise MisconfigurationException(
                'You called `.fit()` with `test_dataloaders` but did not define `test_step()`')

        model.test_dataloader = _PatchDataLoader(test_dataloaders)
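# `_PatchDataLoader` is referenced above but not defined in this snippet. A
# minimal sketch of what such a wrapper could look like (an assumption, not
# necessarily the library's actual code): a picklable callable that stands in
# for the model's dataloader hook and returns the dataloader it was given.
class _PatchDataLoader(object):
    """Callable returning a fixed dataloader, so it can be assigned to
    `model.train_dataloader` / `model.val_dataloader` / `model.test_dataloader`."""

    def __init__(self, dataloader):
        self.dataloader = dataloader

    def __call__(self):
        return self.dataloader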
def __set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
    # skip for CPU
    if self.num_gpus == 0:
        return

    # single GPU case
    if self.num_gpus == 1:
        self.single_gpu = True

        if distributed_backend is not None:
            self.use_dp = distributed_backend == 'dp'
            self.use_ddp = distributed_backend == 'ddp'

    # multiple GPU case
    elif self.num_gpus > 1:
        if distributed_backend is not None:
            # DP, DDP case
            self.use_dp = distributed_backend == 'dp'
            self.use_ddp = distributed_backend == 'ddp'
        elif distributed_backend is None:
            m = 'When using multiple GPUs set ' \
                'Trainer(distributed_backend=dp) (or ddp)'
            raise MisconfigurationException(m)

    # use ddp automatically if nb_gpu_nodes > 1
    if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
        self.use_ddp = True
        self.use_dp = False
        w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
            'Switching to DistributedDataParallel for you. ' \
            'To silence this warning set distributed_backend=ddp'
        warnings.warn(w)

    print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))
def update_learning_rates(self, interval):
    """Update learning rates.

    Args:
        interval (str): either 'epoch' or 'step'.
    """
    if not self.lr_schedulers:
        return

    for lr_scheduler in self.lr_schedulers:
        current_idx = self.batch_idx if interval == 'step' else self.current_epoch
        current_idx += 1  # account for both batch and epoch starting at 0

        # take a step if the call to update_learning_rates matches the scheduler's
        # interval key and the current index is a multiple of the scheduler's frequency
        if lr_scheduler['interval'] == interval and current_idx % lr_scheduler['frequency'] == 0:
            # if an instance of ReduceLROnPlateau, we need to pass the monitored value
            if lr_scheduler['reduce_on_plateau']:
                monitor_key = lr_scheduler['monitor']
                monitor_val = self.callback_metrics.get(monitor_key)
                if monitor_val is None:
                    avail_metrics = ','.join(list(self.callback_metrics.keys()))
                    m = f'ReduceLROnPlateau conditioned on metric {monitor_key} ' \
                        f'which is not available. Available metrics are: {avail_metrics}. ' \
                        'Condition can be set using `monitor` key in lr scheduler dict'
                    raise MisconfigurationException(m)
                lr_scheduler['scheduler'].step(monitor_val)
            else:
                lr_scheduler['scheduler'].step()
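# A hedged sketch of the scheduler dict that `update_learning_rates` consumes.
# The key names ('scheduler', 'interval', 'frequency', 'reduce_on_plateau',
# 'monitor') are taken directly from the lookups above; how the framework
# normally builds this dict is assumed here, not shown.
import torch

net = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
lr_scheduler = {
    'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer),
    'interval': 'epoch',        # compared against the `interval` argument
    'frequency': 1,             # step on every matching interval
    'reduce_on_plateau': True,  # routes through the monitored-value branch
    'monitor': 'val_loss',      # key looked up in self.callback_metrics
}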
def parse_gpu_ids(gpus):
    """
    :param gpus: int, string or list
        An int -1 or string '-1' indicates that all available GPUs should be used.
        A list of ints or a string containing a list of comma-separated integers
        indicates the specific GPUs to use.
        An int 0 means that no GPUs should be used.
        Any int N > 0 indicates that GPUs [0..N) should be used.
    :return: list of GPUs to be used

        If no GPUs are available but the value of the gpus variable requests GPUs,
        a MisconfigurationException is raised.
    """
    # check that the gpus param is None, int, string or list
    check_gpus_data_type(gpus)

    # handle the case when no GPUs are requested
    if gpus is None or (isinstance(gpus, int) and gpus == 0):
        return None

    # we know the user requested GPUs, so if any of the
    # requested GPUs are not available an exception is raised
    gpus = normalize_parse_gpu_string_input(gpus)
    gpus = normalize_parse_gpu_input_to_list(gpus)
    gpus = sanitize_gpu_ids(gpus)

    if not gpus:
        raise MisconfigurationException("GPUs requested but none are available.")
    return gpus
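# Hypothetical usage of `parse_gpu_ids`, illustrating the cases the docstring
# describes (the concrete return values assume a machine with 4 visible GPUs):
#
#   parse_gpu_ids(None)    # -> None (run on CPU)
#   parse_gpu_ids(0)       # -> None (explicitly no GPUs)
#   parse_gpu_ids(3)       # -> [0, 1, 2] (GPUs [0..N))
#   parse_gpu_ids([1, 3])  # -> [1, 3]
#   parse_gpu_ids('0,1')   # -> [0, 1]
#   parse_gpu_ids(-1)      # -> [0, 1, 2, 3] (all available GPUs)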
def get_dataloaders(self, model):
    """Dataloaders are provided by the model.

    :param model:
    :return:
    """
    self.tng_dataloader = model.tng_dataloader
    self.test_dataloader = model.test_dataloader
    self.val_dataloader = model.val_dataloader

    if self.use_ddp and not isinstance(self.tng_dataloader.sampler, DistributedSampler):
        msg = """
        When using multiple GPUs and multiple nodes you must pass
        a DistributedSampler to DataLoader(sampler).

        i.e. this:
        dataset = myDataset()
        dataloader = DataLoader(dataset)

        becomes:
        dataset = myDataset()
        dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dataloader = DataLoader(dataset, sampler=dist_sampler)
        """
        raise MisconfigurationException(msg)
def _load_model_state(cls, checkpoint):
    cls_takes_hparams = 'hparams' in inspect.signature(cls.__init__).parameters
    ckpt_hparams = checkpoint.get('hparams')

    if cls_takes_hparams:
        if ckpt_hparams is not None:
            hparams = Namespace(**ckpt_hparams)
        else:
            warnings.warn(
                f"Checkpoint does not contain hyperparameters but {cls.__name__}'s __init__ contains"
                " argument 'hparams'. Will pass in an empty Namespace instead."
                " Did you forget to store your model hyperparameters in self.hparams?"
            )
            hparams = Namespace()
    else:  # the user's LightningModule does not define a hparams argument
        if ckpt_hparams is None:
            hparams = None
        else:
            raise MisconfigurationException(
                f"Checkpoint contains hyperparameters but {cls.__name__}'s __init__ is missing the"
                " argument 'hparams'. Are you loading the correct checkpoint?"
            )

    # load the state_dict on the model automatically
    model_args = [hparams] if hparams else []
    model = cls(*model_args)
    model.load_state_dict(checkpoint['state_dict'])

    # give model a chance to load something
    model.on_load_checkpoint(checkpoint)

    return model
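# A minimal sketch of the checkpoint layout `_load_model_state` relies on.
# Real checkpoints carry additional keys (optimizer states, epoch, etc.),
# which are assumed here rather than shown; the hparams values are hypothetical.
import torch
from argparse import Namespace

net = torch.nn.Linear(4, 2)
checkpoint = {
    'state_dict': net.state_dict(),                 # restored via load_state_dict above
    'hparams': vars(Namespace(lr=0.02, batch=32)),  # dict, rebuilt into a Namespace above
}
torch.save(checkpoint, 'example.ckpt')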
def dp_train(self, model):
    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

    model.cuda(self.root_gpu)

    # check for this bug (amp + dp + !O1 doesn't work)
    # https://github.com/NVIDIA/apex/issues/227
    if self.use_dp and self.use_amp:
        if self.amp_level == 'O2':  # pragma: no cover
            m = f"""
            Amp level {self.amp_level} with DataParallel is not supported.
            See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
            We recommend you switch to ddp if you want to use amp
            """
            raise MisconfigurationException(m)
        else:
            model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)

    # create list of device ids
    device_ids = self.data_parallel_device_ids
    if isinstance(device_ids, int):
        device_ids = list(range(device_ids))

    model = LightningDataParallel(model, device_ids=device_ids)

    self.run_pretrain_routine(model)
def get_dataloaders(self, model):
    """Dataloaders are provided by the model.

    :param model:
    :return:
    """
    self.init_train_dataloader(model)
    self.init_test_dataloader(model)
    self.init_val_dataloader(model)

    if self.use_ddp or self.use_ddp2:
        # wait for all processes to catch up
        dist.barrier()

        # load each dataloader
        self.get_train_dataloader()
        self.get_test_dataloaders()
        self.get_val_dataloaders()

    # support IterableDataset for train data
    self.is_iterable_train_dataloader = (
        EXIST_ITER_DATASET and isinstance(self.get_train_dataloader().dataset, IterableDataset))
    if self.is_iterable_train_dataloader and not isinstance(self.val_check_interval, int):
        m = '''
        When using an IterableDataset for `train_dataloader`,
        `Trainer(val_check_interval)` must be an int.
        An int k specifies checking validation every k training batches.
        '''
        raise MisconfigurationException(m)
def __set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
    # make DP and DDP mutually exclusive
    # single GPU will also use DP with devices=[0]
    requested_gpus = self.data_parallel_device_ids is not None

    num_gpus = self.num_gpus
    if num_gpus > 0:
        # single GPU case
        if num_gpus == 1:
            self.single_gpu = True

        elif num_gpus > 1 and distributed_backend is not None:
            # DP, DDP case
            self.use_dp = distributed_backend == 'dp'
            self.use_ddp = distributed_backend == 'ddp'

            # use ddp automatically if nb_gpu_nodes > 1
            if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
                self.use_ddp = True
                self.use_dp = False
                w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
                    'Switching to DistributedDataParallel for you. ' \
                    'To silence this warning set distributed_backend=ddp'
                warnings.warn(w)

        elif distributed_backend is None:
            m = 'When using multiple GPUs set ' \
                'Trainer(distributed_backend=dp) (or ddp)'
            raise MisconfigurationException(m)

    print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))
def __dp_train(self, model):
    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

    root_gpu = 0
    if isinstance(self.data_parallel_device_ids, list):
        root_gpu = self.data_parallel_device_ids[0]
    model.cuda(root_gpu)

    # check for this bug (amp + dp + !O1 doesn't work)
    # https://github.com/NVIDIA/apex/issues/227
    if self.use_dp and self.use_amp:
        m = f"""
        Amp level {self.amp_level} with DataParallel is not supported.
        See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
        We recommend you switch to ddp if you want to use amp
        """
        raise MisconfigurationException(m)

    model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)

    self.__run_pretrain_routine(model)
def set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
    # skip for CPU
    if self.num_gpus == 0:
        return

    # single GPU case
    # in the single gpu case we allow ddp so we can train on multiple
    # nodes, 1 gpu per node
    if self.num_gpus == 1:
        self.single_gpu = True

        if distributed_backend is not None:
            self.use_dp = distributed_backend == 'dp'
            self.use_ddp = distributed_backend == 'ddp'
            self.use_ddp2 = distributed_backend == 'ddp2'

            # disable single gpu when using ddp2
            if self.use_ddp2:
                self.single_gpu = False

    # multiple GPU case
    elif self.num_gpus > 1:
        if distributed_backend is not None:
            # DP, DDP case
            self.use_dp = distributed_backend == 'dp'
            self.use_ddp = distributed_backend == 'ddp'
            self.use_ddp2 = distributed_backend == 'ddp2'
        elif distributed_backend is None:
            m = 'When using multiple GPUs set ' \
                'Trainer(distributed_backend=dp) (or ddp)'
            raise MisconfigurationException(m)

    # throw an error to force the user to choose ddp or ddp2
    if nb_gpu_nodes > 1 and not (self.use_ddp2 or self.use_ddp):  # pragma: no cover
        w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
            'Set distributed_backend=ddp ' \
            'or distributed_backend=ddp2'
        raise MisconfigurationException(w)

    logging.info(f'gpu available: {torch.cuda.is_available()}, used: {self.on_gpu}')
def init_train_dataloader(self, model):
    """Dataloaders are provided by the model.

    :param model:
    :return:
    """
    self.get_train_dataloader = model.train_dataloader

    self._percent_range_check('train_percent_check')

    # determine the number of training batches
    if DALI_AVAILABLE and isinstance(self.get_train_dataloader(), DALIGenericIterator):
        self._dali_iterator_check(model, 'train')
        self.num_training_batches = self._get_dali_batch_count(self.get_train_dataloader())
    elif ITER_DATASET_AVAILABLE and isinstance(self.get_train_dataloader().dataset, IterableDataset):
        self.num_training_batches = float('inf')
    else:
        self.num_training_batches = len(self.get_train_dataloader())
        self.num_training_batches = int(self.num_training_batches * self.train_percent_check)

    # determine when to check validation
    # if an int is passed in, validation checks that often
    # otherwise, it checks within the [0.0, 1.0] range of a training epoch
    if isinstance(self.val_check_interval, int):
        self.val_check_batch = self.val_check_interval
        if self.val_check_batch > self.num_training_batches:
            raise ValueError(
                f"`val_check_interval` ({self.val_check_interval}) must be less than or equal "
                f"to the number of the training batches ({self.num_training_batches}). "
                "If you want to disable validation set `val_percent_check` to 0.0 instead.")
    else:
        self._percent_range_check('val_check_interval')

        self.val_check_batch = int(self.num_training_batches * self.val_check_interval)
        self.val_check_batch = max(1, self.val_check_batch)

    self._ddp_sampler_check(self.get_train_dataloader(), 'train')

    # support IterableDataset for train data
    self.is_iterable_train_dataloader = (
        isinstance(self.get_train_dataloader(), DataLoader)
        and ITER_DATASET_AVAILABLE
        and isinstance(self.get_train_dataloader().dataset, IterableDataset))
    if self.is_iterable_train_dataloader and not isinstance(self.val_check_interval, int):
        m = '''
        When using an IterableDataset for `train_dataloader`,
        `Trainer(val_check_interval)` must be an int.
        An int k specifies checking validation every k training batches.
        '''
        raise MisconfigurationException(m)
def reset_train_dataloader(self, model):
    """Dataloaders are provided by the model.

    :param model:
    :return:
    """
    self.train_dataloader = self.request_data_loader(model.train_dataloader)
    self.num_training_batches = 0

    # automatically add samplers
    self.train_dataloader = self.auto_add_sampler(self.train_dataloader, train=True)

    # determine the number of training batches
    if EXIST_ITER_DATASET and isinstance(self.train_dataloader.dataset, IterableDataset):
        self.num_training_batches = float('inf')
    else:
        self._percent_range_check('train_percent_check')
        self.num_training_batches = len(self.train_dataloader)
        self.num_training_batches = int(self.num_training_batches * self.train_percent_check)

    # determine when to check validation
    # if an int is passed in, validation checks that often
    # otherwise, it checks within the [0.0, 1.0] range of a training epoch
    if isinstance(self.val_check_interval, int):
        self.val_check_batch = self.val_check_interval
        if self.val_check_batch > self.num_training_batches:
            raise ValueError(
                f"`val_check_interval` ({self.val_check_interval}) must be less than or equal "
                f"to the number of the training batches ({self.num_training_batches}). "
                "If you want to disable validation set `val_percent_check` to 0.0 instead.")
    else:
        self._percent_range_check('val_check_interval')

        self.val_check_batch = int(self.num_training_batches * self.val_check_interval)
        self.val_check_batch = max(1, self.val_check_batch)

    # support IterableDataset for train data
    self.is_iterable_train_dataloader = (
        EXIST_ITER_DATASET and isinstance(self.train_dataloader.dataset, IterableDataset))
    if self.is_iterable_dataloader(self.train_dataloader) and not isinstance(self.val_check_interval, int):
        m = '''
        When using an IterableDataset for `train_dataloader`,
        `Trainer(val_check_interval)` must be an int.
        An int k specifies checking validation every k training batches.
        '''
        raise MisconfigurationException(m)
def check_gpus_data_type(gpus):
    """
    :param gpus: gpus parameter as passed to the Trainer
        Function checks that it is one of: None, int, string or list.
        Throws otherwise.
    :return: unmodified gpus variable
    """
    if gpus is not None and type(gpus) not in (int, str, list):
        raise MisconfigurationException("GPUs must be int, string or list of ints or None.")
def __run_evaluation(self, test=False):
    # when testing make sure the user defined a test step
    can_run_test_step = False
    if test:
        can_run_test_step = self.__is_overriden('test_step') and self.__is_overriden('test_end')
        if not can_run_test_step:
            m = '''You called .test() without defining a test step or test_end.
            Please define and try again'''
            raise MisconfigurationException(m)

    # validate only if the model has validation_step defined
    # test only if test_step or validation_step are defined
    run_val_step = self.__is_overriden('validation_step')

    if run_val_step or can_run_test_step:
        # hook
        model = self.__get_model()
        model.on_pre_performance_check()

        # select dataloaders
        dataloaders = self.val_dataloader
        max_batches = self.nb_val_batches

        # calculate max batches to use
        if test:
            dataloaders = self.test_dataloader
            max_batches = self.nb_test_batches

        # cap max batches to 1 when using fast_dev_run
        if self.fast_dev_run:
            max_batches = 1

        for ds_i, dataloader in enumerate(dataloaders):
            eval_out_metrics = self.evaluate(self.model, dataloader, max_batches, ds_i, test)
            self.__add_tqdm_metrics(eval_out_metrics)

        # hook
        model.on_post_performance_check()

        if self.show_progress_bar:
            # add model specific metrics
            tqdm_metrics = self.__tng_tqdm_dic
            self.progress_bar.set_postfix(**tqdm_metrics)

    # model checkpointing
    if self.proc_rank == 0 and self.checkpoint_callback is not None and not test:
        print('save callback...')
        self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, logs=self.__tng_tqdm_dic)
def num_gpus(self):
    gpus = self.data_parallel_device_ids

    if gpus is None:
        return 0
    if isinstance(gpus, list):
        return len(gpus)
    if isinstance(gpus, int):
        return gpus

    m = 'gpus must be int, None or list of ints'
    raise MisconfigurationException(m)
def fit(self, model):
    # when using multi-node or DDP within a node, start each module in a separate process
    if self.use_ddp:
        # must copy only the meta of the exp so it survives pickle/unpickle
        # when going to a new process
        if self.experiment is not None:
            self.experiment = self.experiment.get_meta_copy()

        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)
        else:
            msg = """
            You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
            We will launch %(nb_gpus)s processes for you.
            We recommend you let slurm manage the processes by setting:
            --ntasks-per-node=%(nb_gpus)s
            If you're not using SLURM, ignore this message!
            """ % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks}
            warnings.warn(msg)
            mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model,))

    # 1 gpu or dp option triggers training using the DP module
    # easier to avoid NCCL issues
    elif self.use_dp:
        self.__dp_train(model)

    elif self.single_gpu:
        self.__single_gpu_train(model)

    # ON CPU
    else:
        # run through amp wrapper
        if self.use_amp:
            raise MisconfigurationException('amp + cpu is not supported.'
                                            ' Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers = model.configure_optimizers()
        if len(self.optimizers) == 2 and type(self.optimizers[0]) is list:
            self.optimizers, self.lr_schedulers = self.optimizers

        self.__run_pretrain_routine(model)

    # return 1 when finished
    # used for testing or when we need to know that training succeeded
    return 1
def fit(self, model):
    r"""
    Runs the full optimization routine.

    Example::

        trainer = Trainer()
        model = LightningModule()

        trainer.fit(model)
    """
    # when using multi-node or DDP within a node, start each module in a separate process
    if self.use_ddp2:
        task = int(os.environ['SLURM_LOCALID'])
        self.ddp_train(task, model)

    elif self.use_ddp:
        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)
        else:
            mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model,))

    # 1 gpu or dp option triggers training using the DP module
    # easier to avoid NCCL issues
    elif self.use_dp:
        self.dp_train(model)

    elif self.single_gpu:
        self.single_gpu_train(model)

    elif self.use_tpu:
        log.info(f'training on {self.num_tpu_cores} TPU cores')

        # COLAB_GPU is an env var available by default in Colab environments.
        start_method = 'fork' if os.getenv('COLAB_GPU') else 'spawn'
        xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)

    # ON CPU
    else:
        # run through amp wrapper
        if self.use_amp:
            raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

        self.run_pretrain_routine(model)

    # return 1 when finished
    # used for testing or when we need to know that training succeeded
    return 1
def reset_train_dataloader(self, model: LightningModule) -> None:
    """Resets the train dataloader and initialises required variables
    (number of batches, when to validate, etc.).

    Args:
        model: The current `LightningModule`
    """
    self.train_dataloader = self.request_dataloader(model.train_dataloader)
    self.num_training_batches = 0

    # automatically add samplers
    self.train_dataloader = self.auto_add_sampler(self.train_dataloader, train=True)

    self._percent_range_check('train_percent_check')

    if not _has_len(self.train_dataloader):
        self.num_training_batches = float('inf')
    else:
        # try getting the length
        self.num_training_batches = len(self.train_dataloader)
        self.num_training_batches = int(self.num_training_batches * self.train_percent_check)

    # determine when to check validation
    # if an int is passed in, validation checks that often
    # otherwise, it checks within the [0.0, 1.0] range of a training epoch
    if isinstance(self.val_check_interval, int):
        self.val_check_batch = self.val_check_interval
        if self.val_check_batch > self.num_training_batches:
            raise ValueError(
                f'`val_check_interval` ({self.val_check_interval}) must be less than or equal '
                f'to the number of the training batches ({self.num_training_batches}). '
                'If you want to disable validation set `val_percent_check` to 0.0 instead.')
    else:
        if not _has_len(self.train_dataloader):
            if self.val_check_interval == 1.0:
                self.val_check_batch = float('inf')
            else:
                raise MisconfigurationException(
                    'When using an infinite DataLoader (e.g. with an IterableDataset or when '
                    'DataLoader does not implement `__len__`) for `train_dataloader`, '
                    '`Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies '
                    'checking validation every k training batches.')
        else:
            self._percent_range_check('val_check_interval')

            self.val_check_batch = int(self.num_training_batches * self.val_check_interval)
            self.val_check_batch = max(1, self.val_check_batch)
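# `_has_len` is used above but not defined in this snippet. A minimal sketch,
# assuming its only job is to detect dataloaders that support `len()`
# (dataloaders over an IterableDataset typically raise TypeError on len()):
def _has_len(dataloader) -> bool:
    try:
        len(dataloader)
        return True
    except TypeError:
        return False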
def sanitize_gpu_ids(gpus):
    """
    :param gpus: list of ints corresponding to GPU indices
        Checks that each of the GPUs in the list is actually available.
        Throws if any of the GPUs is not available.
    :return: unmodified gpus variable
    """
    all_available_gpus = get_all_available_gpus()
    for gpu in gpus:
        if gpu not in all_available_gpus:
            raise MisconfigurationException(f"""
            You requested GPUs: {gpus}
            But your machine only has: {all_available_gpus}
            """)
    return gpus
def sanitize_gpu_ids(gpus):
    """
    :param gpus: list of ints corresponding to GPU indices
        Checks that each of the GPUs in the list is actually available.
        Throws if any of the GPUs is not available.
    :return: unmodified gpus variable
    """
    all_available_gpus = get_all_available_gpus()
    for gpu in gpus:
        if gpu not in all_available_gpus:
            message = f"""
            Non-existent GPU index {gpu} was specified.
            Available GPU indices are: {all_available_gpus}
            """
            raise MisconfigurationException(message)
    return gpus
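# `get_all_available_gpus` is assumed by both variants of `sanitize_gpu_ids`
# above but not shown here; a minimal sketch of such a helper:
import torch

def get_all_available_gpus():
    """Return the list of GPU indices visible on this machine."""
    return list(range(torch.cuda.device_count()))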
def set_distributed_mode(self, distributed_backend, num_gpu_nodes):
    # skip for CPU
    if self.num_gpus == 0:
        return

    # single GPU case
    # in the single gpu case we allow ddp so we can train on multiple
    # nodes, 1 gpu per node
    if self.num_gpus == 1:
        self.single_gpu = True

        if distributed_backend is not None:
            self.use_dp = distributed_backend == 'dp'
            self.use_ddp = distributed_backend == 'ddp'
            self.use_ddp2 = distributed_backend == 'ddp2'

            # disable single gpu when using ddp2
            if self.use_ddp2:
                self.single_gpu = False

    # multiple GPU case
    elif self.num_gpus > 1:
        if distributed_backend is not None:
            # DP, DDP case
            self.use_dp = distributed_backend == 'dp'
            self.use_ddp = distributed_backend == 'ddp'
            self.use_ddp2 = distributed_backend == 'ddp2'
        elif distributed_backend is None:
            warnings.warn(
                'You requested multiple GPUs but did not specify a backend, e.g.'
                ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                ' Setting distributed_backend=dp for you.')
            self.use_dp = True
            self.use_ddp = False
            self.use_ddp2 = False

    # throw an error to force the user to choose ddp or ddp2
    if num_gpu_nodes > 1 and not (self.use_ddp2 or self.use_ddp):
        w = 'DataParallel does not support num_nodes > 1. ' \
            'Set distributed_backend=ddp ' \
            'or distributed_backend=ddp2'
        raise MisconfigurationException(w)

    log.info(f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}')
def _reset_eval_dataloader(self, model: LightningModule, mode: str) -> Tuple[int, List[DataLoader]]:
    """Generic method to reset a dataloader for evaluation.

    Args:
        model: The current `LightningModule`
        mode: Either `'val'` or `'test'`

    Returns:
        Tuple (num_batches, dataloaders)
    """
    dataloaders = self.request_dataloader(getattr(model, f'{mode}_dataloader'))

    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    # add samplers
    dataloaders = [self.auto_add_sampler(dl, train=False) for dl in dataloaders if dl]

    num_batches = 0

    # determine number of batches
    # datasets could be none, 1 or 2+
    if len(dataloaders) != 0:
        for dataloader in dataloaders:
            if not _has_len(dataloader):
                num_batches = float('inf')
                break

        percent_check = getattr(self, f'{mode}_percent_check')

        if num_batches != float('inf'):
            self._percent_range_check(f'{mode}_percent_check')

            num_batches = sum(len(dataloader) for dataloader in dataloaders)
            num_batches = int(num_batches * percent_check)
        elif percent_check not in (0.0, 1.0):
            raise MisconfigurationException(
                'When using an infinite DataLoader (e.g. with an IterableDataset or when '
                f'DataLoader does not implement `__len__`) for `{mode}_dataloader`, '
                f'`Trainer({mode}_percent_check)` must be `0.0` or `1.0`.')

    return num_batches, dataloaders
def fit(self, model):
    # when using multi-node or DDP within a node, start each module in a separate process
    if self.use_ddp:
        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)
        else:
            nb_gpus = self.nb_requested_gpus
            nb_tasks = self.nb_slurm_tasks
            msg = f"""
            You requested {nb_gpus} GPUs but launched {nb_tasks} slurm tasks.
            We will launch {nb_gpus} processes for you.
            We recommend you let slurm manage the processes by setting:
            --ntasks-per-node={nb_gpus}
            If you're not using SLURM, ignore this message!
            """
            warnings.warn(msg)
            mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model,))

    # 1 gpu or dp option triggers training using the DP module
    # easier to avoid NCCL issues
    elif self.use_dp:
        self.__dp_train(model)

    elif self.single_gpu:
        self.__single_gpu_train(model)

    # ON CPU
    else:
        # run through amp wrapper
        if self.use_amp:
            raise MisconfigurationException('amp + cpu is not supported.'
                                            ' Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

        self.__run_pretrain_routine(model)

    # return 1 when finished
    # used for testing or when we need to know that training succeeded
    return 1
def get_dataloaders(self, model):
    """Dataloaders are provided by the model.

    :param model:
    :return:
    """
    self.init_train_dataloader(model)
    self.init_test_dataloader(model)
    self.init_val_dataloader(model)

    if self.use_ddp or self.use_ddp2:
        # wait for all processes to catch up
        dist.barrier()

        # load each dataloader
        self.get_train_dataloader()
        self.get_test_dataloaders()
        self.get_val_dataloaders()

    # on TPUs load each dataloader only on process 0
    # this will trigger the data downloads
    if self.use_tpu and XLA_AVAILABLE:
        if self.tpu_local_core_rank == 0:
            self.get_train_dataloader()
            self.get_test_dataloaders()
            self.get_val_dataloaders()

        # wait for all processes to catch up
        torch_xla.core.xla_model.rendezvous("pl.TrainerDataLoadingMixin.get_dataloaders")

    # support IterableDataset for train data
    self.is_iterable_train_dataloader = (
        EXIST_ITER_DATASET and isinstance(self.get_train_dataloader().dataset, IterableDataset))
    if self.is_iterable_train_dataloader and not isinstance(self.val_check_interval, int):
        m = '''
        When using an IterableDataset for `train_dataloader`,
        `Trainer(val_check_interval)` must be an int.
        An int k specifies checking validation every k training batches.
        '''
        raise MisconfigurationException(m)
def fit(self, model):
    # when using multi-node or DDP within a node, start each module in a separate process
    if self.use_ddp2:
        task = int(os.environ['SLURM_LOCALID'])
        self.ddp_train(task, model)

    elif self.use_ddp:
        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)
        else:
            mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model,))

    # 1 gpu or dp option triggers training using the DP module
    # easier to avoid NCCL issues
    elif self.use_dp:
        self.dp_train(model)

    elif self.single_gpu:
        self.single_gpu_train(model)

    # ON CPU
    else:
        # run through amp wrapper
        if self.use_amp:
            raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

        self.run_pretrain_routine(model)

    # return 1 when finished
    # used for testing or when we need to know that training succeeded
    return 1
def run_pretrain_routine(self, model):
    """Sanity check a few things before starting actual training.

    :param model:
    """
    ref_model = model
    if self.data_parallel:
        ref_model = model.module

    # give model convenience properties
    ref_model.trainer = self

    # set local properties on the model
    self.copy_trainer_model_properties(ref_model)

    # link up experiment object
    if self.logger is not None:
        ref_model.logger = self.logger

        # save exp to get started
        if hasattr(ref_model, "hparams"):
            self.logger.log_hyperparams(ref_model.hparams)

        self.logger.save()

    if self.use_ddp or self.use_ddp2:
        dist.barrier()

    # set up checkpoint callback
    self.configure_checkpoint_callback()

    # register auto-resubmit when on SLURM
    self.register_slurm_signal_handlers()

    # transfer data loaders from model
    self.get_dataloaders(ref_model)

    # print model summary
    if self.proc_rank == 0 and self.weights_summary is not None:
        if self.weights_summary in ['full', 'top']:
            ref_model.summarize(mode=self.weights_summary)
        else:
            m = "weights_summary can be None, 'full' or 'top'"
            raise MisconfigurationException(m)

    # track model now
    # if cluster resets state, the model will update with the saved weights
    self.model = model

    # restore training and model before hpc call
    self.restore_weights(model)

    # when testing requested, only run test and return
    if self.testing:
        self.run_evaluation(test=True)
        return

    # run a tiny validation (if validation is defined)
    # to make sure the program won't crash during validation
    ref_model.on_sanity_check_start()
    if self.get_val_dataloaders() is not None and self.num_sanity_val_steps > 0:
        # init progress bar for validation sanity check
        pbar = tqdm.tqdm(desc='Validation sanity check',
                         total=self.num_sanity_val_steps,
                         leave=False,
                         position=2 * self.process_position,
                         disable=not self.show_progress_bar,
                         dynamic_ncols=True,
                         unit='batch')
        self.main_progress_bar = pbar

        # dummy validation progress bar
        self.val_progress_bar = tqdm.tqdm(disable=True)

        self.evaluate(model, self.get_val_dataloaders(), self.num_sanity_val_steps, self.testing)

        # close progress bars
        self.main_progress_bar.close()
        self.val_progress_bar.close()

    # init progress bar
    pbar = tqdm.tqdm(leave=True,
                     position=2 * self.process_position,
                     disable=not self.show_progress_bar,
                     dynamic_ncols=True,
                     unit='batch',
                     file=sys.stdout)
    self.main_progress_bar = pbar

    # clear cache before training
    if self.on_gpu:
        torch.cuda.empty_cache()

    # CORE TRAINING LOOP
    self.train()
def run_evaluation(self, test_mode: bool = False):
    # when testing make sure the user defined a test step
    if test_mode and not self.is_overriden('test_step'):
        m = "You called `.test()` without defining model's `.test_step()`." \
            " Please define and try again"
        raise MisconfigurationException(m)

    # Validation/Test begin callbacks
    if test_mode:
        self.on_test_start()
    else:
        self.on_validation_start()

    # hook
    model = self.get_model()
    model.on_pre_performance_check()

    # select dataloaders
    if test_mode:
        if self.reload_dataloaders_every_epoch or self.test_dataloaders is None:
            self.reset_test_dataloader(model)

        dataloaders = self.test_dataloaders
        max_batches = self.num_test_batches
    else:
        # val
        if self.reload_dataloaders_every_epoch or self.val_dataloaders is None:
            self.reset_val_dataloader(model)

        dataloaders = self.val_dataloaders
        max_batches = self.num_val_batches

    # cap max batches to 1 when using fast_dev_run
    if self.fast_dev_run:
        max_batches = 1

    # init validation or test progress bar
    # main progress bar will already be closed when testing so initial position is free
    position = 2 * self.process_position + (not test_mode)
    desc = 'Testing' if test_mode else 'Validating'
    pbar = tqdm(desc=desc, total=max_batches, leave=test_mode, position=position,
                disable=not self.show_progress_bar, dynamic_ncols=True,
                file=sys.stdout)
    setattr(self, f'{"test" if test_mode else "val"}_progress_bar', pbar)

    # run evaluation
    eval_results = self.evaluate(self.model, dataloaders, max_batches, test_mode)
    _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.process_output(eval_results)

    # add metrics to prog bar
    self.add_tqdm_metrics(prog_bar_metrics)

    # log metrics
    self.log_metrics(log_metrics, {})

    # track metrics for callbacks
    self.callback_metrics.update(callback_metrics)

    # hook
    model.on_post_performance_check()

    # add model specific metrics
    if not test_mode:
        self.main_progress_bar.set_postfix(**self.training_tqdm_dict)

    # close progress bar
    if test_mode:
        self.test_progress_bar.close()
    else:
        self.val_progress_bar.close()

    # model checkpointing
    if self.proc_rank == 0 and self.checkpoint_callback is not None and not test_mode:
        self.checkpoint_callback.on_validation_end(self, self.get_model())

    # Validation/Test end callbacks
    if test_mode:
        self.on_test_end()
    else:
        self.on_validation_end()
def fit(
        self,
        model: LightningModule,
        train_dataloader: Optional[DataLoader] = None,
        val_dataloaders: Optional[DataLoader] = None,
        test_dataloaders: Optional[DataLoader] = None
):
    r"""
    Runs the full optimization routine.

    Args:
        model: Model to fit.

        train_dataloader: A PyTorch DataLoader with training samples.
            If the model has a predefined train_dataloader method, this will be skipped.

        val_dataloaders: Either a single PyTorch DataLoader or a list of them,
            specifying validation samples. If the model has a predefined
            val_dataloader method, this will be skipped.

        test_dataloaders: Either a single PyTorch DataLoader or a list of them,
            specifying test samples. If the model has a predefined
            test_dataloader method, this will be skipped.

    Example::

        # Option 1:
        # Define the train_dataloader(), test_dataloader() and val_dataloader() methods
        # in the LightningModule.
        # RECOMMENDED FOR MOST RESEARCH AND APPLICATIONS TO MAINTAIN READABILITY
        trainer = Trainer()
        model = LightningModule()
        trainer.fit(model)

        # Option 2:
        # in production cases we might want to pass different datasets to the same model.
        # Recommended for PRODUCTION SYSTEMS
        train, val, test = DataLoader(...), DataLoader(...), DataLoader(...)
        trainer = Trainer()
        model = LightningModule()
        trainer.fit(model, train_dataloader=train, val_dataloaders=val, test_dataloaders=test)

        # Options 1 & 2 can be mixed: for example, the training set can be
        # defined as part of the model, and the validation/test sets can then
        # be fed to .fit()
    """
    # Fit begin callbacks
    self.on_fit_start()

    # set up the passed-in dataloaders (if needed)
    self.__set_fit_dataloaders(model, train_dataloader, val_dataloaders, test_dataloaders)

    # route to the appropriate start method
    # when using multi-node or DDP within a node, start each module in a separate process
    if self.use_ddp2:
        task = int(os.environ['SLURM_LOCALID'])
        self.ddp_train(task, model)

    elif self.use_ddp:
        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)
        else:
            mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model,))

    # 1 gpu or dp option triggers training using the DP module
    # easier to avoid NCCL issues
    elif self.use_dp:
        self.dp_train(model)

    elif self.single_gpu:
        self.single_gpu_train(model)

    elif self.use_tpu:
        log.info(f'training on {self.num_tpu_cores} TPU cores')

        # COLAB_GPU is an env var available by default in Colab environments.
        start_method = 'fork' if os.getenv('COLAB_GPU') else 'spawn'
        xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)

    # ON CPU
    else:
        # run through amp wrapper
        if self.use_amp:
            raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

        self.run_pretrain_routine(model)

    # Fit end callbacks
    self.on_fit_end()

    # return 1 when finished
    # used for testing or when we need to know that training succeeded
    return 1