def save_checkpoint(self):
    """Saves a checkpoint of the network and other variables."""

    net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net

    actor_type = type(self.actor).__name__
    net_type = type(net).__name__
    state = {
        'epoch': self.epoch,
        'actor_type': actor_type,
        'net_type': net_type,
        'net': net.state_dict(),
        'net_info': getattr(net, 'info', None),
        'constructor': getattr(net, 'constructor', None),
        'optimizer': self.optimizer.state_dict(),
        'stats': self.stats,
        'settings': self.settings
    }

    directory = '{}/{}'.format(self._checkpoint_dir, self.settings.project_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

    # First save to a temporary file.
    tmp_file_path = '{}/{}_ep{:04d}.tmp'.format(directory, net_type, self.epoch)
    torch.save(state, tmp_file_path)

    file_path = '{}/{}_ep{:04d}.pth.tar'.format(directory, net_type, self.epoch)

    # Then rename to the actual checkpoint. os.rename is atomic on POSIX systems
    # when source and destination are on the same filesystem, so an interruption
    # during saving cannot leave a truncated .pth.tar behind.
    os.rename(tmp_file_path, file_path)
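# A minimal usage sketch, assuming a trainer instance exposing the attributes
# used above (actor, optimizer, stats, settings, _checkpoint_dir). The method
# name train_epoch and the save interval are hypothetical, for illustration only:
#
#     for epoch in range(trainer.epoch + 1, max_epochs + 1):
#         trainer.train_epoch()          # hypothetical per-epoch training step
#         trainer.epoch = epoch
#         if epoch % save_interval == 0:
#             trainer.save_checkpoint()  # writes <dir>/<project>/<Net>_ep<epoch>.pth.tar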
def load_pretrained(self, checkpoint=None, fields=None, ignore_fields=None, load_constructor=False):
    """Loads pre-trained network parameters from a checkpoint file."""
    from ltr.admin import loading, multigpu

    net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net

    net_type = type(net).__name__

    # Load network
    print("load from: {}".format(checkpoint))
    checkpoint_dict = loading.torch_load_legacy(checkpoint)

    assert net_type == checkpoint_dict['net_type'], 'Network is not of correct type.'

    if fields is None:
        fields = checkpoint_dict.keys()

    # Only the network weights are loaded; all other checkpoint fields
    # (optimizer state, stats, settings) are ignored.
    for key in fields:
        if key == 'net':
            net.load_state_dict(checkpoint_dict[key])

    return True
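# A hedged usage sketch for load_pretrained; 'trainer' is assumed to be an
# instance of this class, and the checkpoint path is illustrative:
#
#     trainer.load_pretrained('/path/to/pretrained/DiMPnet_ep0050.pth.tar')
#     # Only the 'net' weights are copied; optimizer state, stats and settings
#     # stored in the checkpoint are left untouched.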
def save_checkpoint(self):
    """Saves a checkpoint of the network and other variables."""

    net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net

    actor_type = type(self.actor).__name__
    net_type = type(net).__name__
    state = {
        'epoch': self.epoch,
        'actor_type': actor_type,
        'net_type': net_type,
        'net': net.state_dict(),
        'net_info': getattr(net, 'info', None),
        'constructor': getattr(net, 'constructor', None),
        'optimizer': self.optimizer.state_dict(),
        'stats': self.stats,
        'settings': self.settings
    }

    directory = '{}/{}'.format(self._checkpoint_dir, self.settings.project_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

    # This variant writes directly to the final path, without the atomic
    # tmp-file rename used in the variant above.
    file_path = '{}/{}_ep{:04d}.pth.tar'.format(directory, net_type, self.epoch)
    torch.save(state, file_path)
def load_weights(self, checkpoint=None):
    """Loads network weights."""

    net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net

    # Load network weights only. strict=False skips parameters that are
    # missing from, or unexpected in, the checkpoint.
    checkpoint_dict = loading.torch_load_legacy(checkpoint)
    net.load_state_dict(checkpoint_dict['net'], strict=False)
    return True
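# Note: in PyTorch, load_state_dict(..., strict=False) returns a named tuple
# with missing_keys and unexpected_keys. A hedged sketch for surfacing them,
# not part of this trainer:
#
#     result = net.load_state_dict(checkpoint_dict['net'], strict=False)
#     if result.missing_keys:
#         print('Missing keys:', result.missing_keys)
#     if result.unexpected_keys:
#         print('Unexpected keys:', result.unexpected_keys)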
def load_checkpoint(self, checkpoint=None, fields=None, ignore_fields=None, load_constructor=False):
    """Loads a network checkpoint file.

    Can be called in three different ways:
        load_checkpoint():
            Loads the latest epoch from the workspace. Use this to continue training.
        load_checkpoint(epoch_num):
            Loads the network at the given epoch number (int).
        load_checkpoint(path_to_checkpoint):
            Loads the file from the given absolute path (str).
    """

    net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net

    actor_type = type(self.actor).__name__
    net_type = type(net).__name__

    if checkpoint is None:
        # Load most recent checkpoint
        checkpoint_list = sorted(glob.glob('{}/{}/{}_ep*.pth.tar'.format(
            self._checkpoint_dir, self.settings.project_path, net_type)))
        if checkpoint_list:
            checkpoint_path = checkpoint_list[-1]
        else:
            print('No matching checkpoint file found')
            return
    elif isinstance(checkpoint, int):
        # Checkpoint is the epoch number
        checkpoint_path = '{}/{}/{}_ep{:04d}.pth.tar'.format(
            self._checkpoint_dir, self.settings.project_path, net_type, checkpoint)
    elif isinstance(checkpoint, str):
        # Checkpoint is the path
        if os.path.isdir(checkpoint):
            checkpoint_list = sorted(glob.glob('{}/*_ep*.pth.tar'.format(checkpoint)))
            if checkpoint_list:
                checkpoint_path = checkpoint_list[-1]
            else:
                raise Exception('No checkpoint found')
        else:
            checkpoint_path = os.path.expanduser(checkpoint)
    else:
        raise TypeError('checkpoint must be None, an int epoch number, or a path (str)')

    # Load network. The net_type check is intentionally skipped here, since the
    # checkpoint may come from a different but compatible network variant.
    checkpoint_dict = loading.torch_load_legacy(checkpoint_path)

    if fields is None:
        fields = checkpoint_dict.keys()
    if ignore_fields is None:
        ignore_fields = ['settings']

    # Never load the scheduler. It exists in older checkpoints.
    ignore_fields.extend(['lr_scheduler', 'constructor', 'net_type', 'actor_type', 'net_info'])

    # Load all fields
    for key in fields:
        if key in ignore_fields:
            continue
        if key == 'net':
            # Partial loading: keep only checkpoint parameters that also exist
            # in the current model (e.g. when a merge layer has been added).
            model_dict = net.state_dict()
            pretrained_dict = {k: v for k, v in checkpoint_dict[key].items() if k in model_dict}
            model_dict.update(pretrained_dict)
            net.load_state_dict(model_dict)
        elif key == 'optimizer':
            # The stored optimizer state may not match the current parameter
            # groups; in that case skip it rather than abort.
            try:
                self.optimizer.load_state_dict(checkpoint_dict[key])
            except Exception:
                continue
        else:
            setattr(self, key, checkpoint_dict[key])

    # Set the net info
    if load_constructor and checkpoint_dict.get('constructor') is not None:
        net.constructor = checkpoint_dict['constructor']
    if checkpoint_dict.get('net_info') is not None:
        net.info = checkpoint_dict['net_info']

    # Update the epoch in lr scheduler
    if 'epoch' in fields:
        self.lr_scheduler.last_epoch = self.epoch

    return True
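# Usage sketch covering the three call modes documented in the docstring;
# paths and epoch numbers are illustrative:
#
#     trainer.load_checkpoint()       # resume from the latest epoch in the workspace
#     trainer.load_checkpoint(40)     # load epoch 40: <dir>/<project>/<Net>_ep0040.pth.tar
#     trainer.load_checkpoint('/abs/path/DiMPnet_ep0040.pth.tar')  # explicit file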
def load_checkpoint(self, checkpoint=None, fields=None, ignore_fields=None, load_constructor=False):
    """Loads a network checkpoint file.

    Can be called in three different ways:
        load_checkpoint():
            Loads the latest epoch from the workspace. Use this to continue training.
        load_checkpoint(epoch_num):
            Loads the network at the given epoch number (int).
        load_checkpoint(path_to_checkpoint):
            Loads the file from the given absolute path (str).
    """

    net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net

    actor_type = type(self.actor).__name__
    net_type = type(net).__name__

    if checkpoint is None:
        # Load most recent checkpoint
        checkpoint_list = sorted(glob.glob('{}/{}/{}_ep*.pth.tar'.format(
            self._checkpoint_dir, self.settings.project_path, net_type)))
        if checkpoint_list:
            checkpoint_path = checkpoint_list[-1]
        else:
            print('No matching checkpoint file found')
            return
    elif isinstance(checkpoint, int):
        # Checkpoint is the epoch number
        checkpoint_path = '{}/{}/{}_ep{:04d}.pth.tar'.format(
            self._checkpoint_dir, self.settings.project_path, net_type, checkpoint)
    elif isinstance(checkpoint, str):
        # Checkpoint is the path
        if os.path.isdir(checkpoint):
            checkpoint_list = sorted(glob.glob('{}/*_ep*.pth.tar'.format(checkpoint)))
            if checkpoint_list:
                checkpoint_path = checkpoint_list[-1]
            else:
                raise Exception('No checkpoint found')
        else:
            checkpoint_path = os.path.expanduser(checkpoint)
    else:
        raise TypeError('checkpoint must be None, an int epoch number, or a path (str)')

    # Load network. The substring check allows loading a base checkpoint into an
    # extended network, e.g. 'DiMPnet' weights into a 'DiMPnet_rgbd' model.
    checkpoint_dict = loading.torch_load_legacy(checkpoint_path)
    assert checkpoint_dict['net_type'] in net_type, 'Network is not of correct type.'

    if fields is None:
        fields = checkpoint_dict.keys()
    if ignore_fields is None:
        ignore_fields = ['settings']

    # Never load the scheduler. It exists in older checkpoints.
    ignore_fields.extend(['lr_scheduler', 'constructor', 'net_type', 'actor_type', 'net_info'])

    # Load all fields
    for key in fields:
        if key in ignore_fields:
            continue
        if key == 'net':
            # strict=False, so parameters added in the extended network that are
            # absent from the checkpoint keep their current initialization.
            net.load_state_dict(checkpoint_dict[key], strict=False)
        elif key == 'optimizer':
            self.optimizer.load_state_dict(checkpoint_dict[key])
        else:
            setattr(self, key, checkpoint_dict[key])

    # Set the net info
    if load_constructor and checkpoint_dict.get('constructor') is not None:
        net.constructor = checkpoint_dict['constructor']
    if checkpoint_dict.get('net_info') is not None:
        net.info = checkpoint_dict['net_info']

    # Update the epoch in lr scheduler
    if 'epoch' in fields:
        self.lr_scheduler.last_epoch = self.epoch

    return True
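# Related note: when a checkpoint was saved from a multi-GPU-wrapped model
# without unwrapping .module as done above, its state-dict keys carry a
# 'module.' prefix. A hedged standalone sketch for stripping it, not part of
# this trainer; 'path' and 'net' are placeholders:
#
#     state = torch.load(path, map_location='cpu')['net']
#     state = {k[len('module.'):] if k.startswith('module.') else k: v
#              for k, v in state.items()}
#     net.load_state_dict(state)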