def load_checkpoint(
    self,
    filename,
    reset_optimizer=False,
    reset_lr_scheduler=False,
    optimizer_overrides=None,
    reset_meters=False,
):
    """Load all training state from a checkpoint file."""
    extra_state, self._optim_history, last_optim_state = None, [], None

    bexists = PathManager.isfile(filename)
    if bexists:
        state = checkpoint_utils.load_checkpoint_to_cpu(filename)

        # load model parameters
        try:
            self.get_model().load_state_dict(
                state["model"], strict=True, args=self.args
            )
            if utils.has_parameters(self.get_criterion()):
                self.get_criterion().load_state_dict(
                    state["criterion"], strict=True
                )
        except Exception:
            raise Exception(
                "Cannot load model parameters from checkpoint {}; "
                "please ensure that the architectures match.".format(filename)
            )

        extra_state = state["extra_state"]
        self._optim_history = state["optimizer_history"]
        last_optim_state = state.get("last_optimizer_state", None)

    if last_optim_state is not None and not reset_optimizer:
        # rebuild optimizer after loading model, since params may have changed
        self._build_optimizer()

        # only reload optimizer and lr_scheduler if they match
        last_optim = self._optim_history[-1]
        assert (
            last_optim["criterion_name"] == self.get_criterion().__class__.__name__
        ), "Criterion does not match; please reset the optimizer (--reset-optimizer)."
        assert (
            last_optim["optimizer_name"] == self.optimizer.__class__.__name__
        ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)."

        if not reset_lr_scheduler:
            self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"])
        self.optimizer.load_state_dict(last_optim_state, optimizer_overrides)

        self.set_num_updates(last_optim["num_updates"])

    if extra_state is not None:
        epoch = extra_state["train_iterator"]["epoch"]
        logger.info(
            "loaded checkpoint {} (epoch {} @ {} updates)".format(
                filename, epoch, self.get_num_updates()
            )
        )

        self.lr_step(epoch)

        if "metrics" in extra_state and not reset_meters:
            metrics.load_state_dict(extra_state["metrics"])

            # reset TimeMeters, since their start times don't make sense anymore
            for meter in metrics.get_meters("default"):
                if isinstance(meter, meters.TimeMeter):
                    meter.reset()
    else:
        logger.info("no existing checkpoint found {}".format(filename))

    return extra_state
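# --- Example (not part of the trainer) --------------------------------------
# A minimal sketch of the checkpoint layout load_checkpoint() expects. The
# path is hypothetical; the key names come from the function above.
import torch

def inspect_checkpoint(path="checkpoints/checkpoint_last.pt"):
    state = torch.load(path, map_location="cpu")
    print(list(state["model"].keys())[:5])                  # model state_dict
    print(state["extra_state"]["train_iterator"]["epoch"])  # epoch to resume from
    print(state["optimizer_history"][-1])                   # criterion/optimizer names, num_updates
    print("last_optimizer_state" in state)                  # optimizer state, if saved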
def train_step(self, samples, raise_oom=False):
    """Do forward, backward and parameter update."""
    self._set_seed()
    self.model.train()
    self.criterion.train()
    self.zero_grad()

    metrics.log_start_time("train_wall", priority=800, round=0)

    # forward and backward pass
    logging_outputs, sample_size, ooms = [], 0, 0
    for i, sample in enumerate(samples):
        sample, is_dummy_batch = self._prepare_sample(sample)

        def maybe_no_sync():
            """
            Whenever *samples* contains more than one mini-batch, we
            want to accumulate gradients locally and only call
            all-reduce in the last backwards pass.
            """
            if (
                self.data_parallel_world_size > 1
                and hasattr(self.model, "no_sync")
                and i < len(samples) - 1
            ):
                return self.model.no_sync()
            else:
                return contextlib.ExitStack()  # dummy contextmanager

        try:
            with maybe_no_sync():
                # forward and backward
                loss, sample_size_i, logging_output = self.task.train_step(
                    sample=sample,
                    model=self.model,
                    criterion=self.criterion,
                    optimizer=self.optimizer,
                    update_num=self.get_num_updates(),
                    ignore_grad=is_dummy_batch,
                )
                del loss

            logging_outputs.append(logging_output)
            sample_size += sample_size_i

            # emptying the CUDA cache after the first step can
            # reduce the chance of OOM
            if self.cuda and self.get_num_updates() == 0:
                torch.cuda.empty_cache()
        except RuntimeError as e:
            if "out of memory" in str(e):
                self._log_oom(e)
                if raise_oom:
                    raise e
                logger.warning(
                    "attempting to recover from OOM in forward/backward pass"
                )
                ooms += 1
                self.zero_grad()
                if self.cuda:
                    torch.cuda.empty_cache()
                if self.cfg.distributed_training.distributed_world_size == 1:
                    return None
            else:
                raise e

        if self.tpu and i < len(samples) - 1:
            # tpu-comment: every XLA operation before marking step is
            # appended to the IR graph, and processing too many batches
            # before marking step can lead to OOM errors.
            # To handle gradient accumulation use case, we explicitly
            # mark step here for every forward pass without a backward pass
            import torch_xla.core.xla_model as xm

            xm.mark_step()

    if is_dummy_batch:
        if torch.is_tensor(sample_size):
            sample_size.zero_()
        else:
            sample_size *= 0.0

    if torch.is_tensor(sample_size):
        sample_size = sample_size.float()
    else:
        sample_size = float(sample_size)

    # gather logging outputs from all replicas
    if self._sync_stats():
        train_time = self._local_cumulative_training_time()
        logging_outputs, (
            sample_size,
            ooms,
            total_train_time,
        ) = self._aggregate_logging_outputs(
            logging_outputs, sample_size, ooms, train_time, ignore=is_dummy_batch
        )
        self._cumulative_training_time = (
            total_train_time / self.data_parallel_world_size
        )

    overflow = False
    try:
        with torch.autograd.profiler.record_function("reduce-grads"):
            self.optimizer.all_reduce_grads(self.model)
            if utils.has_parameters(self.criterion):
                self.optimizer.all_reduce_grads(self.criterion)

        with torch.autograd.profiler.record_function("multiply-grads"):
            # multiply gradients by (data_parallel_size / sample_size) since
            # DDP already normalizes by the number of data parallel workers.
            # Thus we get (sum_of_gradients / sample_size) at the end.
            if not self.cfg.optimization.use_bmuf:
                self.optimizer.multiply_grads(
                    self.data_parallel_world_size / sample_size
                )
            elif sample_size > 0:  # BMUF needs to check sample size
                num = self.data_parallel_world_size if self._sync_stats() else 1
                self.optimizer.multiply_grads(num / sample_size)

        with torch.autograd.profiler.record_function("clip-grads"):
            # clip grads
            grad_norm = self.clip_grad_norm(self.cfg.optimization.clip_norm)

        # check that grad norms are consistent across workers
        # on tpu check tensor is slow
        if not self.tpu:
            if (
                not self.cfg.optimization.use_bmuf
                and self.cfg.distributed_training.distributed_wrapper != "SlowMo"
            ):
                self._check_grad_norms(grad_norm)
            if not torch.isfinite(grad_norm).all():
                # check local gradnorm single GPU case, trigger NanDetector
                raise FloatingPointError("gradients are Nan/Inf")

        with torch.autograd.profiler.record_function("optimizer"):
            # take an optimization step
            self.task.optimizer_step(
                self.optimizer, model=self.model, update_num=self.get_num_updates()
            )
    except FloatingPointError:
        # re-run the forward and backward pass with hooks attached to print
        # out where it fails
        self.zero_grad()
        with NanDetector(self.get_model()):
            for _, sample in enumerate(samples):
                sample, _ = self._prepare_sample(sample)
                self.task.train_step(
                    sample,
                    self.model,
                    self.criterion,
                    self.optimizer,
                    self.get_num_updates(),
                    ignore_grad=False,
                )
        raise
    except OverflowError as e:
        overflow = True
        logger.info("NOTE: overflow detected, " + str(e))
        grad_norm = torch.tensor(0.0).cuda()
        self.zero_grad()
    except RuntimeError as e:
        if "out of memory" in str(e):
            self._log_oom(e)
            logger.error("OOM during optimization, irrecoverable")
        raise e

    # Some distributed wrappers (e.g., SlowMo) need access to the optimizer
    # after the step
    if hasattr(self.model, "perform_additional_optimizer_actions"):
        if hasattr(self.optimizer, "fp32_params"):
            self.model.perform_additional_optimizer_actions(
                self.optimizer.optimizer, self.optimizer.fp32_params
            )
        else:
            self.model.perform_additional_optimizer_actions(
                self.optimizer.optimizer
            )

    logging_output = None
    if (
        not overflow
        or self.cfg.distributed_training.distributed_wrapper == "SlowMo"
    ):
        self.set_num_updates(self.get_num_updates() + 1)

        if self.tpu:
            # mark step on TPUs
            import torch_xla.core.xla_model as xm

            xm.mark_step()

            # only log stats every log_interval steps
            # this causes wps to be misreported when log_interval > 1
            logging_output = {}
            if self.get_num_updates() % self.cfg.common.log_interval == 0:
                # log memory usage
                mem_info = xm.get_memory_info(self.device)
                gb_free = mem_info["kb_free"] / 1024 / 1024
                gb_total = mem_info["kb_total"] / 1024 / 1024
                metrics.log_scalar(
                    "gb_free", gb_free, priority=1500, round=1, weight=0
                )
                metrics.log_scalar(
                    "gb_total", gb_total, priority=1600, round=1, weight=0
                )

                logging_output = self._reduce_and_log_stats(
                    logging_outputs, sample_size, grad_norm
                )

            # log whenever there's an XLA compilation, since these
            # slow down training and may indicate opportunities for
            # optimization
            self._check_xla_compilation()
        else:
            # log stats
            logging_output = self._reduce_and_log_stats(
                logging_outputs, sample_size, grad_norm
            )

            # clear CUDA cache to reduce memory fragmentation
            if (
                self.cuda
                and self.cfg.common.empty_cache_freq > 0
                and (
                    (self.get_num_updates() + self.cfg.common.empty_cache_freq - 1)
                    % self.cfg.common.empty_cache_freq
                )
                == 0
            ):
                torch.cuda.empty_cache()

    if self.cfg.common.fp16:
        metrics.log_scalar(
            "loss_scale",
            self.optimizer.scaler.loss_scale,
            priority=700,
            round=4,
            weight=0,
        )

    metrics.log_stop_time("train_wall")
    return logging_output
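# --- Example (not part of the trainer) --------------------------------------
# The maybe_no_sync() helper in train_step() is the standard DDP gradient-
# accumulation pattern: skip the all-reduce on every mini-batch except the
# last one in the group. A standalone sketch of the same idea with a plain
# DistributedDataParallel module; model, criterion and samples here are
# placeholders, not part of the trainer above.
import contextlib

from torch.nn.parallel import DistributedDataParallel as DDP

def accumulate_and_step(ddp_model: DDP, optimizer, criterion, samples):
    """Accumulate gradients over *samples*, then run one optimizer step."""
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(samples):
        # Suppress the gradient all-reduce for every batch except the last,
        # mirroring maybe_no_sync() above.
        ctx = ddp_model.no_sync() if i < len(samples) - 1 else contextlib.nullcontext()
        with ctx:
            loss = criterion(ddp_model(inputs), targets)
            loss.backward()
    optimizer.step()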
def load_checkpoint(
    self,
    filename,
    reset_optimizer=False,
    reset_lr_scheduler=False,
    optimizer_overrides=None,
    reset_meters=False,
):
    """Load all training state from a checkpoint file."""
    extra_state, self._optim_history, last_optim_state = None, [], None

    if os.path.exists(filename):
        state = checkpoint_utils.load_checkpoint_to_cpu(filename)

        # load model parameters
        try:
            missing_keys, _ = self.get_model().load_state_dict(
                state['model'], strict=False
            )
            if len(missing_keys) > 0:
                print('Found missing keys when loading: {}'.format(missing_keys))
            if utils.has_parameters(self.get_criterion()):
                self.get_criterion().load_state_dict(
                    state['criterion'], strict=True
                )
        except Exception:
            raise Exception(
                'Cannot load model parameters from checkpoint {}; '
                'please ensure that the architectures match.'.format(filename)
            )

        extra_state = state['extra_state']
        self._optim_history = state['optimizer_history']
        last_optim_state = state.get('last_optimizer_state', None)

    if last_optim_state is not None and not reset_optimizer:
        # rebuild optimizer after loading model, since params may have changed
        self._build_optimizer()

        # only reload optimizer and lr_scheduler if they match
        last_optim = self._optim_history[-1]
        assert last_optim['criterion_name'] == self.get_criterion().__class__.__name__, \
            'Criterion does not match; please reset the optimizer (--reset-optimizer).'
        assert last_optim['optimizer_name'] == self.optimizer.__class__.__name__, \
            'Optimizer does not match; please reset the optimizer (--reset-optimizer).'

        if not reset_lr_scheduler:
            self.lr_scheduler.load_state_dict(last_optim['lr_scheduler_state'])
        self.optimizer.load_state_dict(last_optim_state, optimizer_overrides)

        self.set_num_updates(last_optim['num_updates'])

    if extra_state is not None:
        epoch = extra_state['train_iterator']['epoch']
        print('| loaded checkpoint {} (epoch {} @ {} updates)'.format(
            filename, epoch, self.get_num_updates()))

        self.lr_step(epoch)

        if 'train_meters' in extra_state and not reset_meters:
            self.meters.update(extra_state['train_meters'])
            del extra_state['train_meters']

            # reset TimeMeters, since their start times don't make sense anymore
            for meter in self.meters.values():
                if isinstance(meter, TimeMeter):
                    meter.reset()
    else:
        print('| no existing checkpoint found {}'.format(filename))

    return extra_state
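# --- Example (not part of the trainer) --------------------------------------
# The strict=False path above tolerates checkpoints whose parameter set
# differs from the current model. A minimal sketch of the same pattern with a
# plain nn.Module; the module and checkpoint here are illustrative only.
import torch
import torch.nn as nn

def partial_load_example():
    model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))          # toy model
    ckpt = {"0.weight": torch.zeros(8, 8), "0.bias": torch.zeros(8)}  # partial state

    # strict=False returns the unmatched keys instead of raising.
    result = model.load_state_dict(ckpt, strict=False)
    if result.missing_keys:
        print('Found missing keys when loading: {}'.format(result.missing_keys))
    if result.unexpected_keys:
        print('Found unexpected keys when loading: {}'.format(result.unexpected_keys))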
def load_checkpoint(
    self,
    filename,
    reset_optimizer=False,
    reset_lr_scheduler=False,
    optimizer_overrides=None,
    reset_meters=False,
):
    """
    Load all training state from a checkpoint file.
    rank = 0 will load the checkpoint, and then broadcast it to all
    other ranks.
    """
    extra_state, self._optim_history, last_optim_state = None, [], None

    logger.info(f"Preparing to load checkpoint {filename}")
    bexists = PathManager.isfile(filename)
    if bexists:
        load_on_all_ranks = (
            self.cfg.checkpoint.load_checkpoint_on_all_dp_ranks
            # TPUs don't support broadcast yet, so load checkpoints
            # on every worker for now
            or self.tpu
        )

        if load_on_all_ranks or self.data_parallel_rank == 0:
            state = checkpoint_utils.load_checkpoint_to_cpu(filename)
            last_optim_state = state.get("last_optimizer_state", None)

            # If doing zero_sharding, do not broadcast global optimizer
            # state. Later we will broadcast sharded states to each rank
            # to keep memory from exploding.
            if (
                not load_on_all_ranks
                and self.cfg.distributed_training.zero_sharding == "os"
                and "last_optimizer_state" in state
                and self.data_parallel_world_size > 1
            ):
                state["last_optimizer_state"] = "SHARDED"
        else:
            last_optim_state = None
            state = None

        if self.data_parallel_world_size > 1 and not load_on_all_ranks:
            state = distributed_utils.broadcast_object(
                state,
                src_rank=0,
                group=self.data_parallel_process_group,
                dist_device=self.device,
            )
            if self.data_parallel_rank > 0:
                last_optim_state = state.get("last_optimizer_state", None)

        # load model parameters
        try:
            self.get_model().load_state_dict(
                state["model"], strict=True, model_cfg=self.cfg.model
            )
            if utils.has_parameters(self.get_criterion()):
                self.get_criterion().load_state_dict(
                    state["criterion"], strict=True
                )
        except Exception:
            raise Exception(
                "Cannot load model parameters from checkpoint {}; "
                "please ensure that the architectures match.".format(filename)
            )

        extra_state = state["extra_state"]
        self._optim_history = state["optimizer_history"]

    if last_optim_state is not None and not reset_optimizer:
        # rebuild optimizer after loading model, since params may have changed
        self._build_optimizer()

        # only reload optimizer and lr_scheduler if they match
        last_optim = self._optim_history[-1]
        assert (
            last_optim["criterion_name"] == self.get_criterion().__class__.__name__
        ), "Criterion does not match; please reset the optimizer (--reset-optimizer)."
        assert (
            last_optim["optimizer_name"] == self.optimizer.__class__.__name__
        ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)."

        if not reset_lr_scheduler:
            self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"])

        if not load_on_all_ranks and self.data_parallel_world_size > 1:
            last_optim_state = self.optimizer.broadcast_global_state_dict(
                last_optim_state
            )
        self.optimizer.load_state_dict(last_optim_state, optimizer_overrides)

        self.set_num_updates(last_optim["num_updates"])

    if extra_state is not None:
        epoch = extra_state["train_iterator"]["epoch"]

        if "previous_training_time" in extra_state:
            self._previous_training_time = extra_state["previous_training_time"]
            self._start_time = time.time()

        self.lr_step(epoch)

        if "metrics" in extra_state and not reset_meters:
            metrics.load_state_dict(extra_state["metrics"])

            # reset TimeMeters, since their start times don't make sense anymore
            for meter in metrics.get_meters("default"):
                if isinstance(meter, meters.TimeMeter):
                    meter.reset()

        logger.info(
            "Loaded checkpoint {} (epoch {} @ {} updates)".format(
                filename, epoch, self.get_num_updates()
            )
        )
    else:
        logger.info("No existing checkpoint found {}".format(filename))

    return extra_state
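# --- Example (not part of the trainer) --------------------------------------
# The rank-0-loads-then-broadcasts behavior described in the docstring can be
# reproduced outside the trainer with stock torch.distributed collectives.
# A sketch, assuming the default process group is already initialized; the
# checkpoint path is hypothetical.
import torch
import torch.distributed as dist

def load_and_broadcast_checkpoint(path="checkpoints/checkpoint_last.pt"):
    """Read the checkpoint on rank 0 only, then share it with every rank."""
    rank = dist.get_rank()
    # Only rank 0 touches the filesystem; other ranks start with a placeholder.
    obj_list = [torch.load(path, map_location="cpu") if rank == 0 else None]
    # broadcast_object_list pickles the object on src and unpickles it elsewhere.
    dist.broadcast_object_list(obj_list, src=0)
    return obj_list[0]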