def on_train_start(self, trainer, pl_module, *args, **kwargs):
    try:
        # log model to the wandb experiment
        wandb.watch(models=pl_module.model, criterion=pl_module.loss_func)
    except Exception:
        # watching is optional; keep training even if wandb is unavailable
        log.info("Skipping wandb.watch --->")
def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0):
    """
    Entry point for ddp

    Args:
        process_idx:
        mp_queue: multiprocessing queue
        model:
        is_master:
        proc_offset:

    Returns:
    """
    # offset the process id if requested
    process_idx = process_idx + proc_offset

    # show progressbar only on progress_rank 0
    if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None:
        self.trainer.progress_bar_callback.disable()

    self.trainer.local_rank = self.trainer.node_rank
    self.trainer.global_rank = self.trainer.node_rank
    self.trainer.world_size = self.trainer.num_nodes

    # set warning rank
    rank_zero_only.rank = self.trainer.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self.trainer
    model.init_ddp_connection(
        self.trainer.global_rank,
        self.trainer.world_size,
        self.trainer.is_slurm_managing_tasks
    )

    # call setup after the ddp process has connected
    self.trainer.call_setup_hook(model)

    # on world_size=0 let everyone know training is starting
    if self.trainer.is_global_zero:
        log.info('-' * 100)
        log.info(f'distributed_backend={self.trainer.distributed_backend}')
        log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes')
        log.info('-' * 100)

    # MODEL
    # copy model to each gpu
    if self.trainer.on_gpu:
        gpu_idx = process_idx

        # when using ddp, the master process (proc 0) continues running as the main one
        # this means that the local rank will always be 0
        # (even if cuda visible devices has other visible gpus)
        # this means that the master process needs to pull the 0th visible index as the device number
        if is_master:
            available_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
            gpu_idx = int(available_gpus[self.trainer.local_rank])

        self.trainer.root_gpu = gpu_idx
        torch.cuda.set_device(self.trainer.root_gpu)
        model.cuda(self.trainer.root_gpu)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
    self.trainer.optimizers = optimizers
    self.trainer.lr_schedulers = lr_schedulers
    self.trainer.optimizer_frequencies = optimizer_frequencies

    # set model properties before going into wrapper
    self.trainer.copy_trainer_model_properties(model)

    # AMP - run through amp wrapper before going to distributed DP
    if self.trainer.amp_backend == AMPType.APEX:
        model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level)
        self.trainer.optimizers = optimizers
        self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers)

    # DDP2 uses all GPUs on the machine
    device_ids = self.trainer.data_parallel_device_ids

    # allow user to configure ddp
    model = model.configure_ddp(model, device_ids)

    # continue training routine
    results = self.trainer.run_pretrain_routine(model)

    # get original model
    model = self.trainer.get_model()

    # persist info in ddp_spawn
    self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results)

    # clean up memory
    torch.cuda.empty_cache()
def set_distributed_mode(self, distributed_backend):
    self.use_dp = False
    self.use_ddp = False
    self.use_ddp2 = False
    self.use_horovod = False
    self.single_gpu = False

    if distributed_backend is None:
        if self.has_horovodrun():
            self.check_horovod()
            self.use_horovod = True
        elif self.num_gpus == 0:
            if self.num_nodes > 1 or self.num_processes > 1:
                self.use_ddp = True  # ddp_cpu
        elif self.num_gpus == 1:
            self.single_gpu = True
        elif self.num_gpus > 1:
            rank_zero_warn(
                'You requested multiple GPUs but did not specify a backend, e.g.'
                ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                ' Setting distributed_backend=dp for you.')
            self.use_dp = True
    elif distributed_backend == "dp":
        # do nothing if num_gpus == 0
        if self.num_gpus == 1:
            self.single_gpu = True
            self.use_dp = True
        elif self.num_gpus > 1:
            self.use_dp = True
    elif distributed_backend == "ddp":
        if self.num_gpus == 0:
            if self.num_nodes > 1 or self.num_processes > 1:
                self.use_ddp = True  # ddp_cpu
        elif self.num_gpus == 1:
            self.single_gpu = True
            self.use_ddp = True
        elif self.num_gpus > 1:
            self.use_ddp = True
            self.num_processes = self.num_gpus
    elif distributed_backend == "ddp2":
        # do nothing if num_gpus == 0
        if self.num_gpus >= 1:
            self.use_ddp2 = True
    elif distributed_backend == "ddp_cpu":
        if self.num_gpus > 0:
            rank_zero_warn(
                'You requested one or more GPUs, but set the backend to `ddp_cpu`.'
                ' Training will not use GPUs.')
        self.use_ddp = True
        self.data_parallel_device_ids = None
        self.on_gpu = False
    elif distributed_backend == 'horovod':
        self.check_horovod()
        self.use_horovod = True

    # throw error to force user ddp or ddp2 choice
    if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp):
        raise MisconfigurationException(
            'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
            'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2'
        )

    log.info(f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}')
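To make the branching above easier to follow, here is a small self-contained sketch that mirrors only the default (backend-not-specified) selection, ignoring the Horovod auto-detection; the `pick_backend` function and its return strings are illustrative and not part of the Trainer API:

def pick_backend(distributed_backend, num_gpus, num_nodes=1, num_processes=1):
    # mirrors the `distributed_backend is None` branch of set_distributed_mode above
    if distributed_backend is None:
        if num_gpus == 0:
            return 'ddp_cpu' if (num_nodes > 1 or num_processes > 1) else 'cpu'
        if num_gpus == 1:
            return 'single_gpu'
        return 'dp'  # multiple GPUs without an explicit backend fall back to dp (with a warning)
    return distributed_backend

assert pick_backend(None, num_gpus=2) == 'dp'
assert pick_backend('ddp', num_gpus=4, num_processes=4) == 'ddp'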
def on_validation_end(self, trainer, pl_module):
    # only run on main process
    if trainer.global_rank != 0:
        return

    metrics = trainer.callback_metrics
    epoch = trainer.current_epoch
    step = trainer.global_step

    if self.save_top_k == 0:
        # no models are saved
        return

    if self.epoch_last_check is not None and (epoch - self.epoch_last_check) < self.period:
        # skipping in this term
        return

    self.epoch_last_check = epoch

    filepath = self.format_checkpoint_name(epoch, metrics, step)
    version_cnt = 0
    while os.path.isfile(filepath):
        filepath = self.format_checkpoint_name(epoch, metrics, step, ver=version_cnt)
        # this epoch called before
        version_cnt += 1

    if self.save_top_k != -1:
        current = metrics.get(self.monitor)

        if not isinstance(current, torch.Tensor):
            rank_zero_warn(
                f'The metric you returned {current} must be a `torch.Tensor` instance, checkpoint not saved'
                f' HINT: what is the value of {self.monitor} in validation_epoch_end()?', RuntimeWarning)
            if current is not None:
                current = torch.tensor(current)

        if current is None:
            rank_zero_warn(
                f'Can save best model only with {self.monitor} available, skipping.', RuntimeWarning)
        elif self.check_monitor_top_k(current):
            self._do_check_save(filepath, current, epoch)
        elif self.verbose > 0:
            log.info(f'\nEpoch {epoch:02d}: {self.monitor} was not in top {self.save_top_k}')
    else:
        if self.verbose > 0:
            log.info(f'\nEpoch {epoch:02d}: saving model to {filepath}')

        assert trainer.global_rank == 0, 'tried to make a checkpoint from non global_rank=0'
        self._save_model(filepath)

    if self.save_last:
        filepath = os.path.join(self.dirpath, self.prefix + 'last.ckpt')
        self._save_model(filepath)
def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0):
    """
    Entry point for ddp

    Args:
        process_idx:
        mp_queue: multiprocessing queue
        model:
    """
    seed = os.environ.get("PL_GLOBAL_SEED")
    if seed is not None:
        seed_everything(int(seed))

    # offset the process id if requested
    process_idx = process_idx + proc_offset

    # show progressbar only on progress_rank 0
    if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None:
        self.trainer.progress_bar_callback.disable()

    # determine which process we are and world size
    self.set_world_ranks(process_idx)

    # set warning rank
    rank_zero_only.rank = self.trainer.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self.trainer
    self.init_ddp_connection(
        self.trainer.global_rank,
        self.trainer.world_size,
        self.trainer.is_slurm_managing_tasks
    )

    # call setup after the ddp process has connected
    self.trainer.call_setup_hook(model)

    # on world_size=0 let everyone know training is starting
    if self.trainer.is_global_zero and not torch.distributed.is_initialized():
        log.info('-' * 100)
        log.info(f'distributed_backend={self.trainer.distributed_backend}')
        log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes')
        log.info('-' * 100)

    # call sync_bn before .cuda(), configure_apex and configure_ddp
    if self.trainer.sync_batchnorm:
        model = self.configure_sync_batchnorm(model)

    # move the model to the correct device
    self.model_to_device(model, process_idx, is_master)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.setup_optimizers(model)

    # set model properties before going into wrapper
    self.trainer.model_connector.copy_trainer_model_properties(model)

    # 16-bit
    model = self.trainer.precision_connector.connect(model)

    # device ids change depending on the DDP setup
    device_ids = self.get_device_ids()

    # allow user to configure ddp
    model = self.configure_ddp(model, device_ids)

    # set up training routine
    self.trainer.train_loop.setup_training(model)

    # train or test
    results = self.train_or_test()

    # get original model
    model = self.trainer.get_model()

    # persist info in ddp_spawn
    self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results)

    # clean up memory
    torch.cuda.empty_cache()
def __init__(
        self,
        logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True,
        checkpoint_callback: Union[ModelCheckpoint, bool] = True,
        early_stop_callback: Optional[Union[EarlyStopping, bool]] = False,
        callbacks: Optional[List[Callback]] = None,
        default_root_dir: Optional[str] = None,
        gradient_clip_val: float = 0,
        process_position: int = 0,
        num_nodes: int = 1,
        num_processes: int = 1,
        gpus: Optional[Union[List[int], str, int]] = None,
        auto_select_gpus: bool = False,
        num_tpu_cores: Optional[int] = None,
        log_gpu_memory: Optional[str] = None,
        progress_bar_refresh_rate: int = 1,
        overfit_pct: float = 0.0,
        track_grad_norm: int = -1,
        check_val_every_n_epoch: int = 1,
        fast_dev_run: bool = False,
        accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1,
        max_epochs: int = 1000,
        min_epochs: int = 1,
        max_steps: Optional[int] = None,
        min_steps: Optional[int] = None,
        train_percent_check: float = 1.0,
        val_percent_check: float = 1.0,
        test_percent_check: float = 1.0,
        val_check_interval: float = 1.0,
        log_save_interval: int = 100,
        row_log_interval: int = 10,
        add_row_log_interval=None,  # backward compatible, todo: remove in v0.8.0
        distributed_backend: Optional[str] = None,
        precision: int = 32,
        print_nan_grads: bool = False,  # backward compatible, todo: remove in v0.9.0
        weights_summary: Optional[str] = 'full',
        weights_save_path: Optional[str] = None,
        amp_level: str = 'O1',
        num_sanity_val_steps: int = 5,
        truncated_bptt_steps: Optional[int] = None,
        resume_from_checkpoint: Optional[str] = None,
        profiler: Optional[BaseProfiler] = None,
        benchmark: bool = False,
        reload_dataloaders_every_epoch: bool = False,
        auto_lr_find: Union[bool, str] = False,
        replace_sampler_ddp: bool = True,
        default_save_path=None,  # backward compatible, todo: remove in v0.8.0
        gradient_clip=None,  # backward compatible, todo: remove in v0.8.0
        nb_gpu_nodes=None,  # backward compatible, todo: remove in v0.8.0
        max_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
        min_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
        use_amp=None,  # backward compatible, todo: remove in v0.9.0
        show_progress_bar=None,  # backward compatible, todo: remove in v0.9.0
        nb_sanity_val_steps=None,  # backward compatible, todo: remove in v0.8.0
        terminate_on_nan: bool = False,
        **kwargs
):
    r"""
    Customize every aspect of training via flags

    Args:
        logger: Logger (or iterable collection of loggers) for experiment tracking.
        checkpoint_callback: Callback for checkpointing.
        early_stop_callback (:class:`pytorch_lightning.callbacks.EarlyStopping`):
        callbacks: Add a list of callbacks.
        default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed
        default_save_path:
            .. warning:: .. deprecated:: 0.7.3
                Use `default_root_dir` instead. Will remove 0.9.0.
        gradient_clip_val: 0 means don't clip.
        gradient_clip:
            .. warning:: .. deprecated:: 0.7.0
                Use `gradient_clip_val` instead. Will remove 0.9.0.
        process_position: orders the tqdm bar when running multiple models on same machine.
        num_nodes: number of GPU nodes for distributed training.
        nb_gpu_nodes:
            .. warning:: .. deprecated:: 0.7.0
                Use `num_nodes` instead. Will remove 0.9.0.
        gpus: Which GPUs to train on.
        auto_select_gpus: If enabled and `gpus` is an integer, pick available gpus automatically.
            This is especially useful when GPUs are configured to be in "exclusive mode",
            such that only one process at a time can access them.
        num_tpu_cores: How many TPU cores to train on (1 or 8).
        log_gpu_memory: None, 'min_max', 'all'. Might slow performance
        show_progress_bar:
            .. warning:: .. deprecated:: 0.7.2
                Set `progress_bar_refresh_rate` to a positive integer to enable. Will remove 0.9.0.
        progress_bar_refresh_rate: How often to refresh progress bar (in steps).
            Value ``0`` disables progress bar.
        overfit_pct: How much of training-, validation-, and test dataset to check.
        track_grad_norm: -1 no tracking. Otherwise tracks that norm
        check_val_every_n_epoch: Check val every n train epochs.
        fast_dev_run: runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).
        accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict.
        max_epochs: Stop training once this number of epochs is reached.
        max_nb_epochs:
            .. warning:: .. deprecated:: 0.7.0
                Use `max_epochs` instead. Will remove 0.9.0.
        min_epochs: Force training for at least this many epochs
        min_nb_epochs:
            .. warning:: .. deprecated:: 0.7.0
                Use `min_epochs` instead. Will remove 0.9.0.
        max_steps: Stop training after this number of steps. Disabled by default (None).
        min_steps: Force training for at least this number of steps. Disabled by default (None).
        train_percent_check: How much of training dataset to check.
        val_percent_check: How much of validation dataset to check.
        test_percent_check: How much of test dataset to check.
        val_check_interval: How often within one training epoch to check the validation set
        log_save_interval: Writes logs to disk this often
        row_log_interval: How often to add logging rows (does not write to disk)
        add_row_log_interval:
            .. warning:: .. deprecated:: 0.7.0
                Use `row_log_interval` instead. Will remove 0.9.0.
        distributed_backend: The distributed backend to use.
        use_amp:
            .. warning:: .. deprecated:: 0.7.0
                Use `precision` instead. Will remove 0.9.0.
        precision: Full precision (32), half precision (16).
        print_nan_grads:
            .. warning:: .. deprecated:: 0.7.2
                Has no effect. When detected, NaN grads will be printed automatically.
                Will remove 0.9.0.
        weights_summary: Prints a summary of the weights when training begins.
        weights_save_path: Where to save weights if specified. Will override default_root_dir
            for checkpoints only. Use this if for whatever reason you need the checkpoints
            stored in a different place than the logs written in `default_root_dir`.
        amp_level: The optimization level to use (O1, O2, etc...).
        num_sanity_val_steps: Sanity check runs n batches of val before starting the training routine.
        nb_sanity_val_steps:
            .. warning:: .. deprecated:: 0.7.0
                Use `num_sanity_val_steps` instead. Will remove 0.8.0.
        truncated_bptt_steps: Truncated back prop breaks performs backprop every k steps of
            a much longer sequence.
        resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
        profiler: To profile individual steps during training and assist in identifying bottlenecks.
        reload_dataloaders_every_epoch: Set to True to reload dataloaders every epoch
        auto_lr_find: If set to True, will `initially` run a learning rate finder,
            trying to optimize initial learning for faster convergence. Sets learning
            rate in self.hparams.lr | self.hparams.learning_rate in the lightning module.
            To use a different key, set a string instead of True with the key name.
        replace_sampler_ddp: Explicitly enables or disables sampler replacement.
            If not specified this will be toggled automatically when DDP is used.
        benchmark: If true enables cudnn.benchmark.
        terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`) at the
            end of each training batch, if any of the parameters or the loss are NaN or +/-inf.
    """
    # Init callbacks
    self.callbacks = callbacks or []
    self.on_init_start()

    # benchmarking
    self.benchmark = benchmark
    torch.backends.cudnn.benchmark = self.benchmark

    # Transfer params
    self.num_nodes = num_nodes
    # Backward compatibility, TODO: remove in v0.8.0
    if nb_gpu_nodes is not None:
        rank_zero_warn("Argument `nb_gpu_nodes` has been renamed to `num_nodes` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.num_gpu_nodes = nb_gpu_nodes

    self.log_gpu_memory = log_gpu_memory

    self.gradient_clip_val = gradient_clip_val
    # Backward compatibility, TODO: remove in v0.8.0
    if gradient_clip is not None:
        rank_zero_warn("Argument `gradient_clip` has been renamed to `gradient_clip_val` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.gradient_clip = gradient_clip

    self.progress_bar_refresh_rate = progress_bar_refresh_rate
    self.check_val_every_n_epoch = check_val_every_n_epoch
    self.track_grad_norm = track_grad_norm
    self.on_gpu = True if (gpus and torch.cuda.is_available()) else False

    # tpu config
    self.on_tpu = num_tpu_cores is not None
    self.num_tpu_cores = num_tpu_cores
    assert num_tpu_cores in [1, 8, None], 'num_tpu_cores can only be 1 or 8'

    if num_processes != 1 and distributed_backend != "ddp_cpu":
        rank_zero_warn("num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it.")
    self.num_processes = num_processes

    self.process_position = process_position
    self.weights_summary = weights_summary

    self.max_epochs = max_epochs
    # Backward compatibility, TODO: remove in v0.8.0
    if max_nb_epochs is not None:
        rank_zero_warn("Argument `max_nb_epochs` has been renamed to `max_epochs` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.max_nb_epochs = max_nb_epochs

    self.min_epochs = min_epochs
    # Backward compatibility, TODO: remove in v0.8.0
    if min_nb_epochs is not None:
        rank_zero_warn("Argument `min_nb_epochs` has been renamed to `min_epochs` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.min_nb_epochs = min_nb_epochs

    self.max_steps = max_steps
    self.min_steps = min_steps

    self.num_sanity_val_steps = num_sanity_val_steps
    # Backward compatibility, TODO: remove in v0.8.0
    if nb_sanity_val_steps is not None:
        rank_zero_warn("Argument `nb_sanity_val_steps` has been renamed to "
                       "`num_sanity_val_steps` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.nb_sanity_val_steps = nb_sanity_val_steps

    # Backward compatibility, TODO: remove in v0.9.0
    if print_nan_grads:
        rank_zero_warn("Argument `print_nan_grads` has no effect and will be removed in v0.9.0."
                       " NaN grads will be printed automatically when detected.", DeprecationWarning)

    self.reload_dataloaders_every_epoch = reload_dataloaders_every_epoch

    self.auto_lr_find = auto_lr_find
    self.replace_sampler_ddp = replace_sampler_ddp

    self.truncated_bptt_steps = truncated_bptt_steps
    self.resume_from_checkpoint = resume_from_checkpoint
    self.terminate_on_nan = terminate_on_nan
    self.shown_warnings = set()

    self.fast_dev_run = fast_dev_run
    if self.fast_dev_run:
        self.num_sanity_val_steps = 0
        self.max_epochs = 1
        log.info('Running in fast_dev_run mode: will run a full train,'
                 ' val and test loop using a single batch')

    # set default save path if user didn't provide one
    self.default_root_dir = default_root_dir

    # Backward compatibility, TODO: remove in v0.8.0
    if default_save_path is not None:
        self.default_root_dir = default_save_path

    if self.default_root_dir is None:
        self.default_root_dir = os.getcwd()

    # training bookkeeping
    self.total_batch_idx = 0
    self.running_loss = TensorRunningAccum(window_length=20)
    self.batch_idx = 0
    self.tqdm_metrics = {}
    self.callback_metrics = {}
    self.num_val_batches = 0
    self.num_training_batches = 0
    self.num_test_batches = 0
    self.train_dataloader = None
    self.test_dataloaders = None
    self.val_dataloaders = None

    # training state
    self.model = None
    self.testing = False
    self.disable_validation = False
    self.lr_schedulers = []
    self.optimizers = None
    self.optimizer_frequencies = []
    self.global_step = 0
    self.current_epoch = 0
    self.total_batches = 0
    self.interrupted = False

    # configure logger
    self.configure_logger(logger)

    # configure profiler
    if profiler is True:
        profiler = SimpleProfiler()
    self.profiler = profiler or PassThroughProfiler()

    # configure early stop callback
    # creates a default one if none passed in
    self.configure_early_stopping(early_stop_callback)

    # configure checkpoint callback
    self.checkpoint_callback = checkpoint_callback
    self.weights_save_path = weights_save_path

    # accumulated grads
    self.accumulate_grad_batches = accumulate_grad_batches
    self.configure_accumulated_gradients(accumulate_grad_batches)

    # for gpus allow int, string and gpu list
    if auto_select_gpus and isinstance(gpus, int):
        self.gpus = pick_multiple_gpus(gpus)
    else:
        self.gpus = gpus

    self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
    self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids)
    self.root_device = torch.device("cpu")

    # tpu state flags
    self.use_tpu = False
    self.tpu_local_core_rank = None
    self.tpu_global_core_rank = None

    # distributed backend choice
    self.distributed_backend = distributed_backend
    self.set_distributed_mode(distributed_backend)

    # override dist backend when using tpus
    if self.on_tpu:
        self.init_tpu()
        self.current_tpu_idx = None

    # init flags for SLURM+ddp to work
    self.proc_rank = 0
    self.world_size = 1
    self.node_rank = 0
    self.configure_slurm_ddp(self.num_nodes)

    # nvidia setup
    self.set_nvidia_flags(self.is_slurm_managing_tasks, self.data_parallel_device_ids)

    # can't init progress bar here because starting a new process
    # means the progress_bar won't survive pickling
    # backward compatibility
    if show_progress_bar is not None:
        self.show_progress_bar = show_progress_bar

    # logging
    self.log_save_interval = log_save_interval
    self.val_check_interval = val_check_interval

    # backward compatibility
    if add_row_log_interval is not None:
        rank_zero_warn("`add_row_log_interval` has been renamed to `row_log_interval` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        if not row_log_interval:  # in case you did not set the proper value
            row_log_interval = add_row_log_interval
    self.row_log_interval = row_log_interval

    # how much of the data to use
    self.overfit_pct = overfit_pct
    self.determine_data_use_amount(train_percent_check, val_percent_check,
                                   test_percent_check, overfit_pct)

    # 16 bit mixed precision training using apex
    self.amp_level = amp_level
    self.precision = precision

    # Backward compatibility, TODO: remove in v0.9.0
    if use_amp is not None:
        rank_zero_warn("`use_amp` has been replaced by `precision` since v0.7.0"
                       " and this argument will be removed in v0.9.0", DeprecationWarning)
        self.precision = 16 if use_amp else 32

    assert self.precision in (16, 32), 'only 32 or 16 bit precision supported'

    if self.precision == 16 and self.num_tpu_cores is None:
        use_amp = True
    self.init_amp(use_amp)

    # Callback system
    self.on_init_end()
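For reference, a minimal construction using a handful of the flags documented above; the values are illustrative, every keyword appears in the signature shown here, and exact behavior depends on the pytorch_lightning version this snippet comes from:

from pytorch_lightning import Trainer

trainer = Trainer(
    max_epochs=10,               # stop after 10 epochs
    gpus=1,                      # train on a single GPU
    precision=16,                # 16-bit training (apex, at the configured amp_level)
    gradient_clip_val=0.5,       # clip gradients; 0 means no clipping
    accumulate_grad_batches=4,   # accumulate gradients over 4 batches
    val_check_interval=0.5,      # run validation twice per training epoch
)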
def train(self):
    warnings.warn(
        'Displayed epoch numbers in the progress bar start from "1" until v0.6.x,'
        ' but will start from "0" in v0.8.0.', RuntimeWarning)

    # get model
    model = self.get_model()

    # load data
    self.reset_train_dataloader(model)
    self.reset_val_dataloader(model)

    # Train start events
    with self.profiler.profile('on_train_start'):
        # callbacks
        self.on_train_start()
        # initialize early stop callback
        if self.early_stop_callback is not None:
            self.early_stop_callback.on_train_start(self, self.get_model())
        # model hooks
        model.on_train_start()

    try:
        # run all epochs
        for epoch in range(self.current_epoch, self.max_epochs):
            # set seed for distributed sampler (enables shuffling for each epoch)
            if self.use_ddp \
                    and hasattr(self.train_dataloader.sampler, 'set_epoch'):
                self.train_dataloader.sampler.set_epoch(epoch)

            # update training progress in trainer and model
            model.current_epoch = epoch
            self.current_epoch = epoch

            total_val_batches = 0
            is_val_epoch = False
            if not self.disable_validation and self.num_training_batches != float('inf'):
                # val can be checked multiple times in epoch
                is_val_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0
                val_checks_per_epoch = self.num_training_batches // self.val_check_batch
                val_checks_per_epoch = val_checks_per_epoch if is_val_epoch else 0
                total_val_batches = self.num_val_batches * val_checks_per_epoch

            # total batches includes multiple val checks
            self.total_batches = self.num_training_batches + total_val_batches

            # changing gradient according accumulation_scheduler
            self.accumulation_scheduler.on_epoch_start(self, self.get_model())

            # stores accumulated grad fractions per batch
            self.batch_loss_value = TensorRunningMean(
                window_length=self.accumulate_grad_batches)

            if self.fast_dev_run:
                # limit the number of batches to 2 (1 train and 1 val) in fast_dev_run
                num_iterations = 2
            elif self.total_batches == float('inf'):
                # for infinite train or val loader, the progress bar never ends
                num_iterations = None
            else:
                num_iterations = self.total_batches

            # reset progress bar
            # .reset() doesn't work on disabled progress bar so we should check
            if not self.main_progress_bar.disable:
                self.main_progress_bar.reset(num_iterations)
            desc = f'Epoch {epoch + 1}'
            self.main_progress_bar.set_description(desc)

            # -----------------
            # RUN TNG EPOCH
            # -----------------
            self.run_training_epoch()

            # update LR schedulers
            self.update_learning_rates(interval='epoch')

            if self.max_steps and self.max_steps == self.global_step:
                self.run_training_teardown()
                return

            # early stopping
            met_min_epochs = epoch >= self.min_epochs - 1
            met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

            # TODO wrap this logic into the callback
            if self.enable_early_stop:
                if (met_min_epochs and met_min_steps) or self.fast_dev_run:
                    should_stop = self.early_stop_callback.on_epoch_end(self, self.get_model())
                    # stop training
                    stop = should_stop and met_min_epochs
                    if stop:
                        self.run_training_teardown()
                        return

        self.run_training_teardown()

    except KeyboardInterrupt:
        log.info('Detected KeyboardInterrupt, attempting graceful shutdown...')
        self.run_training_teardown()
def on_validation_end(self, trainer, pl_module):
    # only run on main process
    if trainer.global_rank != 0:
        return

    if trainer.running_sanity_check:
        return

    # TODO: remove when dict results are deprecated
    self.__warn_deprecated_monitor_key()

    metrics = trainer.logger_connector.callback_metrics
    epoch = trainer.current_epoch

    # support structured results
    if metrics.get('checkpoint_on') is not None:
        self.monitor = 'checkpoint_on'

    # conditioned val metrics override conditioned train loop metrics
    if metrics.get('val_checkpoint_on') is not None:
        self.monitor = 'val_checkpoint_on'

    if self.save_top_k == 0:
        # no models are saved
        return

    if self.epoch_last_check is not None and (epoch - self.epoch_last_check) < self.period:
        # skipping in this term
        return

    self.epoch_last_check = epoch

    ckpt_name_metrics = trainer.logger_connector.logged_metrics
    filepath = self.format_checkpoint_name(epoch, ckpt_name_metrics)
    version_cnt = 0
    while self._fs.exists(filepath):
        filepath = self.format_checkpoint_name(epoch, ckpt_name_metrics, ver=version_cnt)
        # this epoch called before
        version_cnt += 1

    if self.save_top_k != -1:
        current = metrics.get(self.monitor)

        if not isinstance(current, torch.Tensor):
            rank_zero_warn(
                f'The metric you returned {current} must be a `torch.Tensor` instance, checkpoint not saved'
                f' HINT: what is the value of {self.monitor} in validation_epoch_end()?', RuntimeWarning)
            if current is not None:
                current = torch.tensor(current)

        if current is None:
            rank_zero_warn(
                f'Can save best model only with {self.monitor} available, skipping.', RuntimeWarning)
        elif self.check_monitor_top_k(current):
            self._do_check_save(filepath, current, epoch, trainer, pl_module)
        elif self.verbose > 0:
            log.info(f'Epoch {epoch:d}: {self.monitor} was not in top {self.save_top_k}')
    else:
        if self.verbose > 0:
            log.info(f'Epoch {epoch:d}: saving model to {filepath}')

        assert trainer.global_rank == 0, 'tried to make a checkpoint from non global_rank=0'
        self._save_model(filepath, trainer, pl_module)

    if self.save_last:
        filename = self._format_checkpoint_name(
            self.CHECKPOINT_NAME_LAST, epoch, ckpt_name_metrics, prefix=self.prefix)
        filepath = os.path.join(self.dirpath, f'{filename}.ckpt')
        self._save_model(filepath, trainer, pl_module)
        if self.last_model_path and self.last_model_path != filepath:
            self._del_model(self.last_model_path)
def train_dataloader(self):
    log.info('Training data loader called.')
    return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=4)
def train_dataloader(self):
    log.info('Training data loaded.')
    return self.__dataloader(train=True)
def val_dataloader(self):
    log.info('Validation data loaded.')
    return self.__dataloader(train=False)
def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):
    """
    Entry point for ddp

    :param process_idx: current process rank
    :param model: the :class:`LightningModule` to train
    :param is_master: whether this is the master process
    :param proc_offset: offset added to the process id
    :return:
    """
    # offset the process id if requested
    process_idx = process_idx + proc_offset

    # show progressbar only on progress_rank 0
    if (self.node_rank != 0 or process_idx != 0) and self.progress_bar_callback is not None:
        self.progress_bar_callback.disable()

    # determine which process we are and world size
    if self.use_ddp:
        self.local_rank = process_idx
        self.global_rank = self.node_rank * self.num_processes + process_idx
        self.world_size = self.num_nodes * self.num_processes

    elif self.use_ddp2:
        self.local_rank = self.node_rank
        self.global_rank = self.node_rank
        self.world_size = self.num_nodes

    # set warning rank
    rank_zero_only.rank = self.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self
    model.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks)

    # on world_size=0 let everyone know training is starting
    if self.is_global_zero:
        log.info('-' * 100)
        log.info(f'distributed_backend={self.distributed_backend}')
        log.info(f'All DDP processes registered. Starting ddp with {self.world_size} processes')
        log.info('-' * 100)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # MODEL
    # copy model to each gpu
    if self.on_gpu:
        gpu_idx = process_idx
        if is_master:
            # source of truth is cuda for gpu idx
            gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
            gpu_idx = int(gpus[self.local_rank])

        self.root_gpu = gpu_idx
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)

    # set model properties before going into wrapper
    self.copy_trainer_model_properties(model)

    # AMP
    # run through amp wrapper before going to distributed DP
    # TODO: remove in v0.8.0
    if self.use_amp and not self.use_native_amp:
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers
        self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)

    # DDP2 uses all GPUs on the machine
    if self.distributed_backend == 'ddp' or self.distributed_backend == 'ddp_spawn':
        device_ids = [self.root_gpu]
    elif self.use_ddp2:
        device_ids = self.data_parallel_device_ids
    else:  # includes ddp_cpu
        device_ids = None

    # allow user to configure ddp
    model = model.configure_ddp(model, device_ids)

    # continue training routine
    self.run_pretrain_routine(model)
def ddp_train(self, process_idx, mp_queue, model):
    """
    Entry point for ddp

    Args:
        process_idx: current process rank
        mp_queue: multiprocessing queue
        model: pointer to current :class:`LightningModule`

    Returns:
        Dict with evaluation results
    """
    # show progressbar only on progress_rank 0
    if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None:
        self.trainer.progress_bar_callback.disable()

    # determine which process we are and world size
    self.set_world_ranks(process_idx)

    # set warning rank
    rank_zero_only.rank = self.trainer.global_rank

    # Initialize cuda device
    self.init_device(process_idx)

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self.trainer
    self.init_ddp_connection(
        self.trainer.global_rank,
        self.trainer.world_size,
        self.trainer.is_slurm_managing_tasks
    )

    if isinstance(self.ddp_plugin, RPCPlugin):
        if not self.ddp_plugin.is_main_rpc_process:
            self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer)
            self.ddp_plugin.exit_rpc_process()
            if self.ddp_plugin.return_after_exit_rpc_process:
                return
        else:
            self.ddp_plugin.on_main_rpc_connection(self.trainer)

    # call setup after the ddp process has connected
    self.trainer.call_setup_hook(model)

    # on world_size=0 let everyone know training is starting
    if self.trainer.is_global_zero and not torch.distributed.is_initialized():
        log.info('-' * 100)
        log.info(f'distributed_backend={self.trainer.distributed_backend}')
        log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes')
        log.info('-' * 100)

    # call sync_bn before .cuda(), configure_apex and configure_ddp
    if self.trainer.sync_batchnorm:
        model = self.configure_sync_batchnorm(model)

    # move the model to the correct device
    self.model_to_device(model)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.setup_optimizers(model)
    self.ddp_plugin.on_after_setup_optimizers(self.trainer)

    # set model properties before going into wrapper
    self.trainer.model_connector.copy_trainer_model_properties(model)

    # 16-bit
    model = self.trainer.precision_connector.connect(model)

    self.trainer.convert_to_lightning_optimizers()

    # device ids change depending on the DDP setup
    device_ids = self.get_device_ids()

    # allow user to configure ddp
    model = self.configure_ddp(model, device_ids)

    # set up training routine
    self.trainer.train_loop.setup_training(model)

    # train or test
    results = self.train_or_test()

    # clean up memory
    torch.cuda.empty_cache()

    return results
def on_train_end(self, trainer, pl_module):
    if self.stopped_epoch > 0:
        log.info(f'Epoch {self.stopped_epoch:05d}: early stopping triggered.')
def on_train_end(self, trainer, pl_module):
    if self.stopped_epoch > 0 and self.verbose > 0:
        rank_zero_warn(
            'Displayed epoch numbers by `EarlyStopping` start from "1" until v0.6.x,'
            ' but will start from "0" in v0.8.0.', DeprecationWarning)
        log.info(f'Epoch {self.stopped_epoch + 1:05d}: early stopping')
def val_dataloader(self):
    log.info('Validation data loader called.')
    return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
def fit(
        self,
        model: LightningModule,
        train_dataloader: Optional[DataLoader] = None,
        val_dataloaders: Optional[DataLoader] = None
):
    r"""
    Runs the full optimization routine.

    Args:
        model: Model to fit.

        train_dataloader: A Pytorch DataLoader with training samples. If the model has
            a predefined train_dataloader method this will be skipped.

        val_dataloaders: Either a single Pytorch Dataloader or a list of them, specifying
            validation samples. If the model has a predefined val_dataloaders method this
            will be skipped

    Example::

        # Option 1,
        # Define the train_dataloader() and val_dataloader() fxs
        # in the lightningModule
        # RECOMMENDED FOR MOST RESEARCH AND APPLICATIONS TO MAINTAIN READABILITY
        trainer = Trainer()
        model = LightningModule()
        trainer.fit(model)

        # Option 2
        # in production cases we might want to pass different datasets to the same model
        # Recommended for PRODUCTION SYSTEMS
        train, val = DataLoader(...), DataLoader(...)
        trainer = Trainer()
        model = LightningModule()
        trainer.fit(model, train_dataloader=train, val_dataloaders=val)

        # Option 1 & 2 can be mixed, for example the training set can be
        # defined as part of the model, and validation can then be fed to .fit()

    """
    # bind logger and other properties
    model.logger = self.logger
    self.copy_trainer_model_properties(model)

    # set up the passed in dataloaders (if needed)
    self.__attach_dataloaders(model, train_dataloader, val_dataloaders)

    # check that model is configured correctly
    self.check_model_configuration(model)

    # download the data and do whatever transforms we need
    # do before any spawn calls so that the model can assign properties
    # only on proc 0 because no spawn has happened yet
    model.prepare_data()

    # Run learning rate finder:
    if self.auto_lr_find:
        self._run_lr_finder_internally(model)

    # route to appropriate start method
    # when using multi-node or DDP within a node start each module in a separate process
    if self.use_ddp2:
        task = int(os.environ['SLURM_LOCALID'])
        self.ddp_train(task, model)

    elif self.use_ddp:
        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)
        else:
            self.__set_random_port()

            # track for predict
            self.model = model

            # train
            mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))

            # load weights if not interrupted
            self.load_spawn_weights(model)
            self.model = model

    # 1 gpu or dp option triggers training using DP module
    # easier to avoid NCCL issues
    elif self.use_dp:
        self.dp_train(model)

    elif self.single_gpu:
        self.single_gpu_train(model)

    elif self.use_tpu:  # pragma: no-cover
        log.info(f'training on {self.num_tpu_cores} TPU cores')

        # COLAB_GPU is an env var available by default in Colab environments.
        start_method = 'fork' if os.getenv('COLAB_GPU') else 'spawn'

        # track for predict
        self.model = model

        # train
        xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)

        # load weights if not interrupted
        self.load_spawn_weights(model)
        self.model = model

    # ON CPU
    else:
        # run through amp wrapper
        if self.use_amp:
            raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

        self.run_pretrain_routine(model)

    # return 1 when finished
    # used for testing or when we need to know that training succeeded
    return 1
def test_dataloader(self):
    log.info('Test data loader called.')
    return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)
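The MNIST loader hooks above reference self.mnist_train, self.mnist_test and self.batch_size but never create them. A minimal sketch of the surrounding setup, assuming torchvision is available; the class name, data_dir and batch_size values are illustrative only:

import os
from torchvision import transforms
from torchvision.datasets import MNIST

class MNISTDataMixin:
    """Illustrative only: provides the attributes the loader hooks above expect."""

    data_dir = os.getcwd()   # assumption: where MNIST gets downloaded
    batch_size = 32          # assumption: referenced by the dataloaders above

    def prepare_data(self):
        # download once (Lightning calls this on a single process)
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        transform = transforms.ToTensor()
        self.mnist_train = MNIST(self.data_dir, train=True, transform=transform)
        self.mnist_test = MNIST(self.data_dir, train=False, transform=transform)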
def term_handler(self, signum, frame):
    # save
    log.info("bypassing sigterm")
def ddp_train(self, process_idx, mp_queue, model):
    """
    Entry point for ddp

    Args:
        process_idx:
        mp_queue: multiprocessing queue
        model:

    Returns:
    """
    # show progressbar only on progress_rank 0
    if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None:
        self.trainer.progress_bar_callback.disable()

    # determine which process we are and world size
    if self.trainer.use_ddp:
        self.trainer.local_rank = process_idx
        self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx
        self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes

    elif self.trainer.use_ddp2:
        self.trainer.local_rank = self.trainer.node_rank
        self.trainer.global_rank = self.trainer.node_rank
        self.trainer.world_size = self.trainer.num_nodes

    # set warning rank
    rank_zero_only.rank = self.trainer.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self.trainer
    model.init_ddp_connection(
        self.trainer.global_rank,
        self.trainer.world_size,
        self.trainer.is_slurm_managing_tasks
    )

    # call setup after the ddp process has connected
    self.trainer.call_setup_hook(model)

    # on world_size=0 let everyone know training is starting
    if self.trainer.is_global_zero:
        log.info('-' * 100)
        log.info(f'distributed_backend={self.trainer.distributed_backend}')
        log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes')
        log.info('-' * 100)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
    self.trainer.optimizers = optimizers
    self.trainer.lr_schedulers = lr_schedulers
    self.trainer.optimizer_frequencies = optimizer_frequencies

    # call sync_bn before .cuda(), configure_apex and configure_ddp
    if self.trainer.sync_batchnorm:
        model = model.configure_sync_batchnorm(model)

    # MODEL
    # copy model to each gpu
    if self.trainer.on_gpu:
        gpu_idx = process_idx
        self.trainer.root_gpu = gpu_idx
        torch.cuda.set_device(self.trainer.root_gpu)
        model.cuda(self.trainer.root_gpu)

    # set model properties before going into wrapper
    self.trainer.copy_trainer_model_properties(model)

    # AMP -
    # run through amp wrapper before going to distributed DP
    if self.trainer.amp_type == AMPType.APEX:
        model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level)
        self.trainer.optimizers = optimizers
        self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers)

    # DDP2 uses all GPUs on the machine
    if self.trainer.distributed_backend == 'ddp' or self.trainer.distributed_backend == 'ddp_spawn':
        device_ids = [self.trainer.root_gpu]
    elif self.trainer.use_ddp2:
        device_ids = self.trainer.data_parallel_device_ids
    else:  # includes ddp_cpu
        device_ids = None

    # allow user to configure ddp
    model = model.configure_ddp(model, device_ids)

    # continue training routine
    results = self.trainer.run_pretrain_routine(model)

    # get original model
    model = self.trainer.get_model()

    # persist info in ddp_spawn
    self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results)

    # clean up memory
    torch.cuda.empty_cache()
def scale_batch_size(trainer,
                     model: LightningModule,
                     mode: str = 'power',
                     steps_per_trial: int = 3,
                     init_val: int = 2,
                     max_trials: int = 25,
                     batch_arg_name: str = 'batch_size',
                     **fit_kwargs):
    r"""
    Will iteratively try to find the largest batch size for a given model
    that does not give an out of memory (OOM) error.

    Args:
        trainer: The Trainer

        model: Model to fit.

        mode: string setting the search mode. Either `power` or `binsearch`.
            If mode is `power` we keep multiplying the batch size by 2, until
            we get an OOM error. If mode is `binsearch`, we will initially
            also keep multiplying by 2 and after encountering an OOM error
            do a binary search between the last successful batch size and the
            batch size that failed.

        steps_per_trial: number of steps to run with a given batch size.
            Ideally 1 should be enough to test if an OOM error occurs,
            however in practice a few are needed

        init_val: initial batch size to start the search with

        max_trials: max number of increases in batch size done before
            algorithm is terminated

        batch_arg_name: name of the attribute that stores the batch size.
            It is expected that the user has provided a model or datamodule that has a hyperparameter
            with that name. We will look for this attribute name in the following places

            - ``model``
            - ``model.hparams``
            - ``model.datamodule``
            - ``trainer.datamodule`` (the datamodule passed to the tune method)

        **fit_kwargs: remaining arguments to be passed to .fit(), e.g., dataloader
            or datamodule.

    Raises:
        MisconfigurationException:
            If field ``batch_arg_name`` is not found in ``model`` and ``model.hparams``, or
            if batch scaling feature is used with dataloaders passed directly to ``.fit()``.
        ValueError:
            If mode in method ``scale_batch_size`` is neither ``power`` nor ``binsearch``.
    """
    if trainer.fast_dev_run:
        rank_zero_warn('Skipping batch size scaler since fast_dev_run is enabled.', UserWarning)
        return

    if not lightning_hasattr(model, batch_arg_name):
        raise MisconfigurationException(
            f'Field {batch_arg_name} not found in both `model` and `model.hparams`')
    if hasattr(model, batch_arg_name) and hasattr(model, "hparams") and batch_arg_name in model.hparams:
        rank_zero_warn(
            f'Field `model.{batch_arg_name}` and `model.hparams.{batch_arg_name}` are mutually exclusive!'
            f' `model.{batch_arg_name}` will be used as the initial batch size for scaling.'
            f' If this is not the intended behavior, please remove either one.'
        )

    if hasattr(model.train_dataloader, 'patch_loader_code'):
        raise MisconfigurationException(
            'The batch scaling feature cannot be used with dataloaders passed directly to `.fit()`.'
            ' Please disable the feature or incorporate the dataloader into the model.'
        )

    # Arguments we adjust during the batch size finder, save for restoring
    __scale_batch_dump_params(trainer)

    # Set to values that are required by the algorithm
    __scale_batch_reset_params(trainer, model, steps_per_trial)

    # Save initial model, that is loaded after batch size is found
    save_path = os.path.join(trainer.default_root_dir, 'scale_batch_size_temp_model.ckpt')
    trainer.save_checkpoint(str(save_path))

    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.disable()

    # Initially we just double in size until an OOM is encountered
    new_size = _adjust_batch_size(trainer, batch_arg_name, value=init_val)  # initially set to init_val
    if mode == 'power':
        new_size = _run_power_scaling(trainer, model, new_size, batch_arg_name, max_trials, **fit_kwargs)
    elif mode == 'binsearch':
        new_size = _run_binsearch_scaling(trainer, model, new_size, batch_arg_name, max_trials, **fit_kwargs)
    else:
        raise ValueError('mode in method `scale_batch_size` can only be `power` or `binsearch`')

    garbage_collection_cuda()
    log.info(f'Finished batch size finder, will continue with full run using batch size {new_size}')

    # Restore initial state of model
    if trainer.is_global_zero:
        trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer._device_type == DeviceType.GPU)
        fs = get_filesystem(str(save_path))
        if fs.exists(save_path):
            fs.rm(save_path)

    # Finish by resetting variables so trainer is ready to fit model
    __scale_batch_restore_params(trainer)
    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.enable()

    return new_size
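A usage sketch for the tuner function above. `LitModel` is a hypothetical LightningModule that stores `self.batch_size`; the call itself follows the signature defined above, and the chosen values are illustrative:

from pytorch_lightning import Trainer

model = LitModel(batch_size=2)   # hypothetical module exposing `batch_size`
trainer = Trainer(max_epochs=1)

# grow the batch size by powers of two until OOM, then binary-search
# between the last successful size and the first failing one
new_size = scale_batch_size(trainer, model, mode='binsearch', init_val=2, max_trials=25)

model.batch_size = new_size
trainer.fit(model)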
def term_handler(self, signum, frame):
    # Todo: required argument `signum` is not used
    # Todo: required argument `frame` is not used
    log.info("bypassing sigterm")
def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0):
    """
    Entry point for ddp

    Args:
        process_idx:
        q:
        model:
        is_master:
        proc_offset:

    Returns:
    """
    # offset the process id if requested
    process_idx = process_idx + proc_offset

    # show progressbar only on progress_rank 0
    if (self.node_rank != 0 or process_idx != 0) and self.progress_bar_callback is not None:
        self.progress_bar_callback.disable()

    # determine which process we are and world size
    if self.use_ddp:
        self.local_rank = process_idx
        self.global_rank = self.node_rank * self.num_processes + process_idx
        self.world_size = self.num_nodes * self.num_processes

    elif self.use_ddp2:
        self.local_rank = self.node_rank
        self.global_rank = self.node_rank
        self.world_size = self.num_nodes

    # set warning rank
    rank_zero_only.rank = self.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self
    model.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks)

    # call setup after the ddp process has connected
    self.setup('fit')
    if self.is_function_implemented('setup', model):
        model.setup('fit')

    # on world_size=0 let everyone know training is starting
    if self.is_global_zero:
        log.info('-' * 100)
        log.info(f'distributed_backend={self.distributed_backend}')
        log.info(f'All DDP processes registered. Starting ddp with {self.world_size} processes')
        log.info('-' * 100)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # MODEL
    # copy model to each gpu
    if self.on_gpu:
        gpu_idx = process_idx
        if is_master:
            # source of truth is cuda for gpu idx
            gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
            gpu_idx = int(gpus[self.local_rank])

        self.root_gpu = gpu_idx
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)

    # set model properties before going into wrapper
    self.copy_trainer_model_properties(model)

    # AMP
    # run through amp wrapper before going to distributed DP
    if self.use_amp and not NATIVE_AMP_AVALAIBLE:
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers
        self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)

    # DDP2 uses all GPUs on the machine
    if self.distributed_backend == 'ddp' or self.distributed_backend == 'ddp_spawn':
        device_ids = [self.root_gpu]
    elif self.use_ddp2:
        device_ids = self.data_parallel_device_ids
    else:  # includes ddp_cpu
        device_ids = None

    # allow user to configure ddp
    model = model.configure_ddp(model, device_ids)

    # continue training routine
    # run_pretrain_routine: in "trainer.py" from line 1080 - sanity check a few things before starting actual training
    results = self.run_pretrain_routine(model)

    # get original model
    model = self.get_model()

    # persist info in ddp_spawn
    self.transfer_ddp_spawn_state_on_fit_end(model, q, results)

    # clean up memory
    torch.cuda.empty_cache()

    if self.global_rank == 0 and self.distributed_backend not in ['ddp_spawn', 'ddp_cpu']:
        return results
def scale_batch_size(self,
                     model: LightningModule,
                     mode: str = 'power',
                     steps_per_trial: int = 3,
                     init_val: int = 2,
                     max_trials: int = 25,
                     batch_arg_name: str = 'batch_size'):
    r"""
    Will iteratively try to find the largest batch size for a given model
    that does not give an out of memory (OOM) error.

    Args:
        model: Model to fit.

        mode: string setting the search mode. Either `power` or `binsearch`.
            If mode is `power` we keep multiplying the batch size by 2, until
            we get an OOM error. If mode is `binsearch`, we will initially
            also keep multiplying by 2 and after encountering an OOM error
            do a binary search between the last successful batch size and the
            batch size that failed.

        steps_per_trial: number of steps to run with a given batch size.
            Ideally 1 should be enough to test if an OOM error occurs,
            however in practice a few are needed

        init_val: initial batch size to start the search with

        max_trials: max number of increases in batch size done before
            algorithm is terminated
    """
    if not hasattr(model, batch_arg_name):
        if not hasattr(model.hparams, batch_arg_name):
            raise MisconfigurationException(
                'Neither of `model.batch_size` and `model.hparams.batch_size` found.')

    if hasattr(model.train_dataloader, 'patch_loader_code'):
        raise MisconfigurationException(
            'The batch scaling feature cannot be used with dataloaders'
            ' passed directly to `.fit()`. Please disable the feature or'
            ' incorporate the dataloader into the model.')

    # Arguments we adjust during the batch size finder, save for restoring
    self.__scale_batch_dump_params()

    # Set to values that are required by the algorithm
    self.__scale_batch_reset_params(model, steps_per_trial)

    # Save initial model, that is loaded after batch size is found
    save_path = os.path.join(self.default_root_dir, 'temp_model.ckpt')
    self.save_checkpoint(str(save_path))

    if self.progress_bar_callback:
        self.progress_bar_callback.disable()

    # Initially we just double in size until an OOM is encountered
    new_size = _adjust_batch_size(self, value=init_val)  # initially set to init_val
    if mode == 'power':
        new_size = _run_power_scaling(self, model, new_size, batch_arg_name, max_trials)
    elif mode == 'binsearch':
        new_size = _run_binsearch_scaling(self, model, new_size, batch_arg_name, max_trials)
    else:
        raise ValueError('mode in method `scale_batch_size` can only be `power` or `binsearch`')

    garbage_collection_cuda()
    log.info(f'Finished batch size finder, will continue with full run using batch size {new_size}')

    # Restore initial state of model
    self.restore(str(save_path), on_gpu=self.on_gpu)
    os.remove(save_path)

    # Finish by resetting variables so trainer is ready to fit model
    self.__scale_batch_restore_params()
    if self.progress_bar_callback:
        self.progress_bar_callback.enable()

    return new_size
def ddp_train(self, process_idx, model):
    """
    Entry point for ddp

    Args:
        process_idx: current process rank
        model: pointer to current :class:`LightningModule`

    Returns:
        Dict with evaluation results
    """
    # determine which process we are and world size
    self.set_world_ranks(process_idx)

    # toggle prog bar
    if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None:
        self.trainer.progress_bar_callback.disable()

    # set warning rank
    rank_zero_only.rank = self.trainer.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self.trainer
    self.init_ddp_connection(
        self.trainer.global_rank,
        self.trainer.world_size,
        self.trainer.is_slurm_managing_tasks
    )

    # call setup after the ddp process has connected
    self.trainer.call_setup_hook(model)

    # on world_size=0 let everyone know training is starting
    if self.trainer.is_global_zero and not torch.distributed.is_initialized():
        log.info('-' * 100)
        log.info(f'distributed_backend={self.trainer.distributed_backend} (TORCH_ELASTIC)')
        log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes')
        log.info('-' * 100)

    # call sync_bn before .cuda(), configure_apex and configure_ddp
    if self.trainer.sync_batchnorm:
        model = self.configure_sync_batchnorm(model)

    # move the model to the correct device
    self.model_to_device(model, process_idx)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.setup_optimizers(model)

    # set model properties before going into wrapper
    self.trainer.model_connector.copy_trainer_model_properties(model)

    # 16-bit
    model = self.trainer.precision_connector.connect(model)

    # device ids change depending on the DDP setup
    device_ids = self.get_device_ids()

    # allow user to configure ddp
    model = self.configure_ddp(model, device_ids)

    # set up training routine
    self.trainer.train_loop.setup_training(model)

    # train or test
    results = self.train_or_test()

    # clean up memory
    torch.cuda.empty_cache()

    return results
def print_nan_gradients(self) -> None:
    model = self.get_model()
    for param in model.parameters():
        if (param.grad is not None) and torch.isnan(param.grad.float()).any():
            log.info(param, param.grad)
def lr_find(
        trainer,
        model: LightningModule,
        train_dataloader: Optional[DataLoader] = None,
        val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None,
        min_lr: float = 1e-8,
        max_lr: float = 1,
        num_training: int = 100,
        mode: str = 'exponential',
        early_stop_threshold: float = 4.0,
        datamodule: Optional[LightningDataModule] = None,
):
    r"""
    ``lr_find`` enables the user to do a range test of good initial learning rates,
    to reduce the amount of guesswork in picking a good starting learning rate.

    Args:
        model: Model to do range testing for

        train_dataloader: A PyTorch DataLoader with training samples. If the model has
            a predefined train_dataloader method, this will be skipped.

        val_dataloaders: Optional validation dataloader(s) used during the fit run.
            If the model has a predefined val_dataloader method, this will be skipped.

        min_lr: minimum learning rate to investigate

        max_lr: maximum learning rate to investigate

        num_training: number of learning rates to test

        mode: search strategy, either 'linear' or 'exponential'. If set to 'linear'
            the learning rate will be searched by linearly increasing after each batch.
            If set to 'exponential', will increase the learning rate exponentially.

        early_stop_threshold: threshold for stopping the search. If the loss at any
            point is larger than early_stop_threshold*best_loss, the search is stopped.
            To disable, set to None.

        datamodule: An optional `LightningDataModule` which holds the training and
            validation dataloader(s). Note that the `train_dataloader` and
            `val_dataloaders` parameters cannot be used at the same time as this
            parameter, or a `MisconfigurationException` will be raised.

    Example::

        # Setup model and trainer
        model = MyModelClass(hparams)
        trainer = pl.Trainer()

        # Run lr finder
        lr_finder = trainer.lr_find(model, ...)

        # Inspect results
        fig = lr_finder.plot(); fig.show()
        suggested_lr = lr_finder.suggestion()

        # Overwrite lr and create new model
        hparams.lr = suggested_lr
        model = MyModelClass(hparams)

        # Ready to train with new learning rate
        trainer.fit(model)
    """
    save_path = os.path.join(trainer.default_root_dir, 'lr_find_temp.ckpt')

    __lr_finder_dump_params(trainer, model)

    # Prevent going into infinite loop
    trainer.auto_lr_find = False

    # Initialize lr finder object (stores results)
    lr_finder = _LRFinder(mode, min_lr, max_lr, num_training)

    # Use special lr logger callback
    trainer.callbacks = [_LRCallback(num_training, early_stop_threshold, progress_bar_refresh_rate=1)]

    # No logging
    trainer.logger = DummyLogger()

    # Max step set to number of iterations
    trainer.max_steps = num_training

    # Disable standard progress bar for fit
    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.disable()

    # Disable standard checkpoint & early stopping
    trainer.checkpoint_callback = False
    trainer.early_stop_callback = None

    # Required for saving the model
    trainer.optimizers, trainer.schedulers = [], []
    trainer.model = model

    # Dump model checkpoint
    trainer.save_checkpoint(str(save_path))

    # Configure optimizer and scheduler
    model.configure_optimizers = lr_finder._exchange_scheduler(model.configure_optimizers)

    # Fit, lr & loss logged in callback
    trainer.fit(model,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloaders,
                datamodule=datamodule)

    # Prompt if we stopped early
    if trainer.global_step != num_training:
        log.info('LR finder stopped early due to diverging loss.')

    # Transfer results from callback to lr finder object
    lr_finder.results.update({'lr': trainer.callbacks[0].lrs,
                              'loss': trainer.callbacks[0].losses})
    lr_finder._total_batch_idx = trainer.total_batch_idx  # for debug purpose

    # Reset model state
    trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu)
    os.remove(save_path)

    # Finish by resetting variables so trainer is ready to fit model
    __lr_finder_restore_params(trainer, model)
    if trainer.progress_bar_callback:
        trainer.progress_bar_callback.enable()

    return lr_finder
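# The `mode` argument controls how the `num_training` candidate learning rates are
# spaced between `min_lr` and `max_lr`. A small illustrative sketch of the two
# spacings (a standalone approximation, not the `_LRFinder` internals):
import numpy as np


def candidate_lrs(min_lr: float, max_lr: float, num_training: int, mode: str = 'exponential'):
    # 'exponential': values spaced evenly in log space (multiplicative steps)
    # 'linear': values spaced evenly in linear space (additive steps)
    if mode == 'exponential':
        return np.logspace(np.log10(min_lr), np.log10(max_lr), num_training)
    return np.linspace(min_lr, max_lr, num_training)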
def train(self):
    self.run_sanity_check(self.get_model())

    # set stage for logging
    self.logger_connector.set_stage("train")

    self.checkpoint_connector.has_trained = False

    # enable train mode
    model = self.get_model()
    model.train()
    torch.set_grad_enabled(True)

    # reload data when needed
    self.train_loop.reset_train_val_dataloaders(model)

    # hook
    self.train_loop.on_train_start()

    try:
        if self.train_loop.should_skip_training():
            return

        # run all epochs
        for epoch in range(self.current_epoch, self.max_epochs):

            # hook
            self.train_loop.on_train_epoch_start(epoch)

            with self.profiler.profile("run_training_epoch"):
                # run train epoch
                self.train_loop.run_training_epoch()

            if self.max_steps and self.max_steps <= self.global_step:
                return

            # update LR schedulers
            self.optimizer_connector.update_learning_rates(interval='epoch')

            # early stopping
            met_min_epochs = epoch >= self.min_epochs - 1
            met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

            if self.should_stop:
                if met_min_epochs and met_min_steps:
                    return
                log.info(
                    'Trainer was signaled to stop but required minimum epochs'
                    f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
                    ' not been met. Training will continue...'
                )

    except KeyboardInterrupt:
        rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...')

        # user could press ctrl+c many times... only shutdown once
        if not self.interrupted:
            self.interrupted = True
            self._state = TrainerState.INTERRUPTED
            self.on_keyboard_interrupt()
    finally:
        # hook
        self.train_loop.on_train_end()
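# The early-stopping guard above only honors a stop signal once both minimum bounds
# are met. A standalone sketch of that gating logic (the function name and signature
# are illustrative, not part of the trainer API):
from typing import Optional


def should_exit_training(should_stop: bool, epoch: int, global_step: int,
                         min_epochs: int, min_steps: Optional[int]) -> bool:
    # Stop only if a stop was requested AND both the minimum-epoch
    # and minimum-step requirements have been satisfied.
    met_min_epochs = epoch >= min_epochs - 1
    met_min_steps = global_step >= min_steps if min_steps else True
    return should_stop and met_min_epochs and met_min_steps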
def train(self):
    # add signal handlers for process kills
    # def _signal_kill_handler(*args):
    #     return TrainerTrainLoopMixin.run_training_teardown(self)
    #
    # orig_signal_handlers = {}
    # for sig_name in SIGNAL_TERMINATE:
    #     orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
    #                                                    _signal_kill_handler)

    # get model
    model = self.get_model()

    # enable train mode
    model.train()

    # enable gradients
    torch.set_grad_enabled(True)

    # load data
    # if reload_dataloaders_every_epoch, this is moved to the epoch loop
    if not self.reload_dataloaders_every_epoch:
        self.reset_train_dataloader(model)
    self.reset_val_dataloader(model)

    # Train start events
    with self.profiler.profile('on_train_start'):
        # callbacks
        self.on_train_start()
        # model hooks
        model.on_train_start()

    try:
        # run all epochs
        for epoch in range(self.current_epoch, self.max_epochs):
            # reset train dataloader
            if self.reload_dataloaders_every_epoch:
                self.reset_train_dataloader(model)

            # set seed for distributed sampler (enables shuffling for each epoch)
            if (self.use_ddp or self.use_horovod) \
                    and hasattr(self.train_dataloader, 'sampler') \
                    and hasattr(self.train_dataloader.sampler, 'set_epoch'):
                self.train_dataloader.sampler.set_epoch(epoch)

            # update training progress in trainer and model
            model.current_epoch = epoch
            self.current_epoch = epoch

            # change gradient accumulation according to the accumulation_scheduler
            self.accumulation_scheduler.on_epoch_start(self, self.get_model())

            # stores accumulated grad fractions per batch
            self.batch_loss_value = TensorRunningAccum(
                window_length=self.accumulate_grad_batches)

            # -----------------
            # RUN TNG EPOCH
            # -----------------
            self.run_training_epoch()

            if self.max_steps and self.max_steps <= self.global_step:
                self.run_training_teardown()
                return

            # update LR schedulers
            self.update_learning_rates(interval='epoch')

            # early stopping
            met_min_epochs = epoch >= self.min_epochs - 1
            met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

            if self.should_stop:
                if (met_min_epochs and met_min_steps) or self.fast_dev_run:
                    self.run_training_teardown()
                    return
                else:
                    log.info(
                        'Trainer was signaled to stop but required minimum epochs'
                        f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
                        ' not been met. Training will continue...')

        self.run_training_teardown()

    except KeyboardInterrupt:
        rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...')

        # user could press ctrl+c many times... only shutdown once
        if not self.interrupted:
            self.interrupted = True
            self.on_keyboard_interrupt()

        self.run_training_teardown()
def configure_optimizers(self):
    """
    Method required by PyTorch Lightning's LightningModule.

    Here we use the fact that every PyTorch optimizer can take a list of dicts as
    its argument. Each dict defines a separate parameter group and must contain a
    `params` key with the list of parameters belonging to it. Other keys should
    match the keyword arguments accepted by the optimizer and are used as the
    optimization options for that group.

    Returns
    -------
    One or multiple optimizers and learning-rate schedulers, in any of these options:

    - Single optimizer.
    - List or Tuple - List of optimizers.
    - Two lists - The first list has multiple optimizers, the second a list of LR schedulers.
    - Dictionary, with an 'optimizer' key and (optionally) a 'lr_scheduler' key.
    - Tuple of dictionaries as described, with an optional 'frequency' key.
    - None - Fit will run without any optimizer.

    More details:
    https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.html
    at configure_optimizers()
    """
    # REQUIRED
    if self.hparams.optimizer.name == 'Ranger':
        from ranger import Ranger
        optimizer_class = Ranger
    elif self.hparams.optimizer.name == 'RAdam':
        from radam import RAdam
        optimizer_class = RAdam
    else:
        optimizer_class = getattr(torch.optim, self.hparams.optimizer.name)

    # Build one parameter group per sub-network that is present
    params = []
    if self.depth_net is not None:
        params.append({
            'name': 'Depth',
            'params': self.depth_net.parameters(),
            **self.hparams.optimizer.depth_net_options
        })
        terminal_logger.info("DepthNet's optimizer configured.")
    if self.pose_net is not None:
        params.append({
            'name': 'Pose',
            'params': self.pose_net.parameters(),
            **self.hparams.optimizer.pose_net_options
        })
        terminal_logger.info("PoseNet's optimizer configured.")

    # Create optimizer with parameter groups
    optimizer = optimizer_class(params)

    # Load and initialize schedulers
    if self.hparams.scheduler.name == 'FlatCosAnnealScheduler':
        from schedulers.flat_cos_anneal_scheduler import FlatCosAnnealScheduler
        step_factor = self.hparams.dataloaders.train.batch_size * self.hparams.trainer.accumulate_grad_batches
        steps_per_epoch = len(self.train_dataset) / step_factor
        scheduler = {
            'scheduler': FlatCosAnnealScheduler(optimizer,
                                                steps_per_epoch,
                                                self.hparams.trainer.max_epochs,
                                                **self.hparams.scheduler.options),
            'name': 'FlatCosAnnealScheduler',
            'interval': 'step',  # so that scheduler.step() is called per batch instead of per epoch
            'frequency': 1
        }
    else:
        scheduler_class = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.name)
        # assumes the schedulers used from torch.optim are epoch-based
        scheduler = {
            'scheduler': scheduler_class(optimizer, **self.hparams.scheduler.options),
            'name': self.hparams.scheduler.name,
            'interval': 'epoch',
            'frequency': 1
        }
    terminal_logger.info("Optimizers and Schedulers configured.")

    return {'optimizer': optimizer, 'lr_scheduler': scheduler}
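# configure_optimizers above assumes a nested, attribute-style hparams object. A
# hypothetical configuration matching those attribute accesses (keys and values are
# illustrative assumptions, not taken from the original project; OmegaConf is just
# one possible attribute-style container):
from omegaconf import OmegaConf

hparams = OmegaConf.create({
    'optimizer': {
        'name': 'Adam',  # resolved via getattr(torch.optim, name)
        'depth_net_options': {'lr': 2e-4, 'weight_decay': 1e-6},
        'pose_net_options': {'lr': 1e-4},
    },
    'scheduler': {
        'name': 'StepLR',  # resolved via getattr(torch.optim.lr_scheduler, name)
        'options': {'step_size': 30, 'gamma': 0.5},
    },
    'trainer': {'max_epochs': 50, 'accumulate_grad_batches': 1},
    'dataloaders': {'train': {'batch_size': 8}},
})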