def save_checkpoint(self): """Saves a checkpoint of the network and other variables.""" net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net actor_type = type(self.actor).__name__ net_type = type(net).__name__ state = { 'epoch': self.epoch, 'actor_type': actor_type, 'net_type': net_type, 'net': net.state_dict(), 'net_info': getattr(net, 'info', None), 'constructor': getattr(net, 'constructor', None), 'optimizer': self.optimizer.state_dict(), 'stats': self.stats, 'settings': self.settings } directory = '{}/{}'.format(self._checkpoint_dir, self.settings.project_path) print(directory) if not os.path.exists(directory): print("directory doesn't exist. creating...") os.makedirs(directory) # First save as a tmp file tmp_file_path = '{}/{}_ep{:04d}.tmp'.format(directory, net_type, self.epoch) torch.save(state, tmp_file_path) file_path = '{}/{}_ep{:04d}.pth.tar'.format(directory, net_type, self.epoch) # Now rename to actual checkpoint. os.rename seems to be atomic if files are on same filesystem. Not 100% sure os.rename(tmp_file_path, file_path)
def load_state_dict(self, checkpoint=None, distill=False): """Loads a network checkpoint file. Can be called in three different ways: load_checkpoint(): Loads the latest epoch from the workspace. Use this to continue training. load_checkpoint(epoch_num): Loads the network at the given epoch number (int). load_checkpoint(path_to_checkpoint): Loads the file from the given absolute path (str). """ if distill: net = self.actor.net_teacher.module if multigpu.is_multi_gpu(self.actor.net_teacher) \ else self.actor.net_teacher else: net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net net_type = type(net).__name__ if isinstance(checkpoint, str): # checkpoint is the path if os.path.isdir(checkpoint): checkpoint_list = sorted(glob.glob('{}/*_ep*.pth.tar'.format(checkpoint))) if checkpoint_list: checkpoint_path = checkpoint_list[-1] else: raise Exception('No checkpoint found') else: checkpoint_path = os.path.expanduser(checkpoint) else: raise TypeError # Load network print("Loading pretrained model from ", checkpoint_path) checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') assert net_type == checkpoint_dict['net_type'], 'Network is not of correct type.' missing_k, unexpected_k = net.load_state_dict(checkpoint_dict["net"], strict=False) print("previous checkpoint is loaded.") print("missing keys: ", missing_k) print("unexpected keys:", unexpected_k) return True
def load_checkpoint(self, checkpoint=None, fields=None, ignore_fields=None, load_constructor=False): """Loads a network checkpoint file. Can be called in three different ways: load_checkpoint(): Loads the latest epoch from the workspace. Use this to continue training. load_checkpoint(epoch_num): Loads the network at the given epoch number (int). load_checkpoint(path_to_checkpoint): Loads the file from the given absolute path (str). """ net = self.actor.net.module if multigpu.is_multi_gpu( self.actor.net) else self.actor.net actor_type = type(self.actor).__name__ net_type = type(net).__name__ if checkpoint is None: # Load most recent checkpoint checkpoint_list = sorted( glob.glob('{}/{}/{}_ep*.pth.tar'.format( self._checkpoint_dir, self.settings.project_path, net_type))) if checkpoint_list: checkpoint_path = checkpoint_list[-1] else: print('No matching checkpoint file found') return elif isinstance(checkpoint, int): # Checkpoint is the epoch number checkpoint_path = '{}/{}/{}_ep{:04d}.pth.tar'.format( self._checkpoint_dir, self.settings.project_path, net_type, checkpoint) elif isinstance(checkpoint, str): # checkpoint is the path if os.path.isdir(checkpoint): checkpoint_list = sorted( glob.glob('{}/*_ep*.pth.tar'.format(checkpoint))) if checkpoint_list: checkpoint_path = checkpoint_list[-1] else: raise Exception('No checkpoint found') else: checkpoint_path = os.path.expanduser(checkpoint) else: raise TypeError # Load network checkpoint_dict = torch.load(checkpoint_path) assert net_type == checkpoint_dict[ 'net_type'], 'Network is not of correct type.' if fields is None: fields = checkpoint_dict.keys() if ignore_fields is None: ignore_fields = ['settings'] # Never load the scheduler. It exists in older checkpoints. ignore_fields.extend([ 'lr_scheduler', 'constructor', 'net_type', 'actor_type', 'net_info' ]) # Load all fields for key in fields: if key in ignore_fields: continue if key == 'net': net.load_state_dict(checkpoint_dict[key]) elif key == 'optimizer': self.optimizer.load_state_dict(checkpoint_dict[key]) else: setattr(self, key, checkpoint_dict[key]) # Set the net info if load_constructor and 'constructor' in checkpoint_dict and checkpoint_dict[ 'constructor'] is not None: net.constructor = checkpoint_dict['constructor'] if 'net_info' in checkpoint_dict and checkpoint_dict[ 'net_info'] is not None: net.info = checkpoint_dict['net_info'] # Update the epoch in lr scheduler if 'epoch' in fields: self.lr_scheduler.last_epoch = self.epoch # 2021.1.10 Update the epoch in data_samplers for loader in self.loaders: if isinstance(loader.sampler, DistributedSampler): loader.sampler.set_epoch(self.epoch) return True