def __call__(self, global_step, backbone: paddle.nn.Layer, partial_fc: PartialFC = None):
    if global_step > 100 and self.rank == 0:
        paddle.save(backbone.state_dict(),
                    os.path.join(self.output, "backbone.pdparams"))
    if global_step > 100 and partial_fc is not None:
        partial_fc.save_params()
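# --- Usage sketch (illustrative, not part of the original file) ---
# How a model-checkpoint callback like the one above is typically driven from a
# training loop. The class name and constructor arguments below are assumptions
# inferred from the attributes the method uses (self.rank, self.output):
#
#   callback_checkpoint = CallBackModelCheckpoint(rank=rank, output="./output")
#   for global_step, (img, label) in enumerate(train_loader):
#       ...  # forward / backward / optimizer step
#       callback_checkpoint(global_step, backbone, partial_fc)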
def __call__(self, num_update, backbone: paddle.nn.Layer, batch_size=10):
    if self.rank == 0 and num_update > 0 and num_update % self.frequent == 0:
        backbone.eval()
        self.ver_test(backbone, num_update, batch_size)
        backbone.train()
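# --- Usage sketch (illustrative) ---
# A verification callback like the one above is usually invoked once per
# training step; it only runs the evaluation every `frequent` steps, and only
# on rank 0. The class name and constructor arguments are assumptions based on
# the attributes used (self.rank, self.frequent, self.ver_test):
#
#   callback_verification = CallBackVerification(frequent=2000, rank=rank)
#   for num_update, batch in enumerate(train_loader):
#       ...  # training step
#       callback_verification(num_update, backbone, batch_size=64)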
def save(self,
         backbone: paddle.nn.Layer,
         classifier: paddle.nn.Layer = None,
         optimizer=None,
         epoch=0,
         for_train=True,
         best_metric=None):
    if best_metric is not None:
        save_rank = best_metric['rank']
        model_save_dir = os.path.join(self.model_save_dir, 'best_model',
                                      str(best_metric['dataset_name']))
    else:
        save_rank = 0  # by default only rank 0 saves the backbone model
        model_save_dir = os.path.join(self.model_save_dir, str(epoch))

    if not os.path.exists(model_save_dir):
        # More than one process may try to create the directory.
        try:
            os.makedirs(model_save_dir)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    if self.rank == save_rank:
        # Non-distributed parameters are only saved on the save rank.
        for name, param in backbone.state_dict().items():
            paddle.save(
                param, os.path.join(model_save_dir, param.name + '.pdparam'))

    if classifier is not None:
        # Distributed parameters must be saved on every rank.
        for name, param in classifier.state_dict().items():
            paddle.save(
                param, os.path.join(model_save_dir, param.name + '.pdparam'))

    if for_train:
        assert optimizer is not None
        opt_state_dict = optimizer.state_dict()
        lr_state_dict = opt_state_dict['LR_Scheduler']
        for name, opt in opt_state_dict.items():
            if '@GRAD' in name:
                continue
            # Non-distributed optimizer variables are only saved on rank 0,
            # but distributed optimizer variables are saved on every rank.
            if ('dist@' in name and '@rank@' in name) or self.rank == 0:
                paddle.save(opt,
                            os.path.join(model_save_dir, name + '.pdopt'))

    if self.rank == save_rank:
        # Save extra info needed for resuming:
        # pretrain_world_size, embedding_size and num_classes are used to
        # re-split the fc weight when the GPU setting changes;
        # epoch is used to restart training.
        config_file = os.path.join(model_save_dir, 'meta.json')
        extra_info = dict()
        extra_info["pretrain_world_size"] = self.world_size
        extra_info["embedding_size"] = self.embedding_size
        extra_info['num_classes'] = self.num_classes
        extra_info['epoch'] = epoch
        if for_train:  # lr_state_dict only exists when saving for training
            extra_info['lr_state'] = lr_state_dict
        if best_metric is not None:
            extra_info['best_metric'] = best_metric
        with open(config_file, 'w') as f:
            json.dump(extra_info, f)

    logging.info("Save model to {}.".format(model_save_dir))

    if self.rank == 0 and self.max_num_last_checkpoint > 0:
        for idx in range(-1, epoch - self.max_num_last_checkpoint + 1):
            path = os.path.join(self.model_save_dir, str(idx))
            if os.path.exists(path):
                logging.info("Remove checkpoint {}.".format(path))
                shutil.rmtree(path)
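# --- Usage sketch (illustrative) ---
# save() is meant to be called on every rank, typically at the end of an epoch:
# the backbone and meta.json are written only by `save_rank`, while the sharded
# classifier parameters and the distributed ('dist@...@rank@...') optimizer
# variables are written by every rank. The constructor arguments below are
# assumptions inferred from the attributes used above:
#
#   checkpoint = Checkpoint(rank=rank, world_size=world_size,
#                           embedding_size=512, num_classes=num_classes,
#                           model_save_dir="./checkpoints",
#                           max_num_last_checkpoint=3)
#   for epoch in range(num_epochs):
#       train_one_epoch(...)
#       checkpoint.save(backbone, classifier, optimizer,
#                       epoch=epoch, for_train=True)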
def __call__(self, num_update, backbone: paddle.nn.Layer):
    if self.rank == 0 and num_update > 0 and num_update % self.frequent == 0:
        backbone.eval()
        with paddle.no_grad():
            self.ver_test(backbone, num_update)
        backbone.train()
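# --- Note (illustrative) ---
# Compared with the earlier __call__ variant, this one wraps ver_test in
# paddle.no_grad(), so no autograd graph is built during verification. A
# minimal standalone sketch of the same evaluation pattern:
#
#   model.eval()                 # switch off dropout / use running BN stats
#   with paddle.no_grad():       # skip gradient bookkeeping during inference
#       embeddings = model(images)
#   model.train()                # restore training mode afterwards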
def load(self,
         backbone: paddle.nn.Layer,
         classifier: paddle.nn.Layer = None,
         optimizer=None,
         for_train=True,
         dtype=None):
    assert os.path.exists(self.checkpoint_dir)
    checkpoint_dir = os.path.abspath(self.checkpoint_dir)

    type_dict = {}
    for name, param in backbone.state_dict().items():
        type_dict[param.name] = convert_dtype(param.dtype)

    if classifier is not None:
        # Distributed parameters are saved on every rank.
        for name, param in classifier.state_dict().items():
            type_dict[param.name] = convert_dtype(param.dtype)

    if for_train:
        assert optimizer is not None
        opt_state_dict = optimizer.state_dict()
        lr_state_dict = opt_state_dict['LR_Scheduler']
        for name, opt in opt_state_dict.items():
            if name == 'LR_Scheduler' or '@GRAD' in name:
                continue
            type_dict[name] = convert_dtype(opt.dtype)

    param_state_dict = {}
    opt_state_dict = {}
    dist_param_state_dict = {}
    dist_weight_state_dict = {}
    dist_weight_velocity_state_dict = {}
    dist_bias_state_dict = {}
    dist_bias_velocity_state_dict = {}

    for path in os.listdir(checkpoint_dir):
        path = os.path.join(checkpoint_dir, path)
        if not os.path.isfile(path):
            continue
        basename = os.path.basename(path)
        name, ext = os.path.splitext(basename)
        if ext not in ['.pdopt', '.pdparam']:
            continue
        if not for_train and ext == '.pdopt':
            continue
        if classifier is None and 'dist@' in name and '@rank@' in name:
            continue

        tensor = paddle.load(path, return_numpy=True)
        if dtype:
            assert dtype in ['float32', 'float16']
            tensor = tensor.astype(dtype)
        else:
            tensor = tensor.astype(type_dict[name])

        if 'dist@' in name and '@rank@' in name:
            if '.w' in name and 'velocity' not in name:
                dist_weight_state_dict[name] = tensor
            elif '.w' in name and 'velocity' in name:
                dist_weight_velocity_state_dict[name] = tensor
            elif '.b' in name and 'velocity' not in name:
                dist_bias_state_dict[name] = tensor
            elif '.b' in name and 'velocity' in name:
                dist_bias_velocity_state_dict[name] = tensor
        else:
            if ext == '.pdparam':
                param_state_dict[name] = tensor
            else:
                opt_state_dict[name] = tensor

    if classifier is not None and for_train:
        meta_file = os.path.join(checkpoint_dir, 'meta.json')
        if not os.path.exists(meta_file):
            logging.error(
                "Please make sure the checkpoint dir {} exists, and the "
                "parameters in that dir are valid.".format(checkpoint_dir))
            exit()
        with open(meta_file, 'r') as handle:
            extra_info = json.load(handle)

        # Preprocess distributed parameters.
        pretrain_world_size = extra_info['pretrain_world_size']
        assert pretrain_world_size > 0
        embedding_size = extra_info['embedding_size']
        assert embedding_size == self.embedding_size
        num_classes = extra_info['num_classes']
        assert num_classes == self.num_classes
        logging.info(
            "Parameters for pre-training: pretrain_world_size ({}), "
            "embedding_size ({}), and num_classes ({}).".format(
                pretrain_world_size, embedding_size, num_classes))
        logging.info("Parameters for inference or fine-tuning: "
                     "world_size ({}).".format(self.world_size))

        rank_str = '%05d' % self.rank
        dist_weight_state_dict = rearrange_weight(
            dist_weight_state_dict, pretrain_world_size, self.world_size)
        dist_bias_state_dict = rearrange_weight(
            dist_bias_state_dict, pretrain_world_size, self.world_size)
        for name, value in dist_weight_state_dict.items():
            if rank_str in name:
                dist_param_state_dict[name] = value
        for name, value in dist_bias_state_dict.items():
            if rank_str in name:
                dist_param_state_dict[name] = value

        if for_train:
            dist_weight_velocity_state_dict = rearrange_weight(
                dist_weight_velocity_state_dict, pretrain_world_size,
                self.world_size)
            dist_bias_velocity_state_dict = rearrange_weight(
                dist_bias_velocity_state_dict, pretrain_world_size,
                self.world_size)
            for name, value in dist_weight_velocity_state_dict.items():
                if rank_str in name:
                    opt_state_dict[name] = value
            for name, value in dist_bias_velocity_state_dict.items():
                if rank_str in name:
                    opt_state_dict[name] = value

    def map_actual_param_name(state_dict, load_state_dict):
        # Map the layer's own parameter keys to the tensors that were saved
        # under the parameter names (param.name) on disk.
        for name, param in state_dict.items():
            state_dict[name] = load_state_dict[param.name]
        return state_dict

    logging.info("Load checkpoint from '{}'.".format(checkpoint_dir))
    param_state_dict = map_actual_param_name(backbone.state_dict(),
                                             param_state_dict)
    backbone.set_state_dict(param_state_dict)

    if classifier is not None:
        dist_param_state_dict = map_actual_param_name(
            classifier.state_dict(), dist_param_state_dict)
        classifier.set_state_dict(dist_param_state_dict)

    if for_train:
        assert optimizer is not None
        optimizer.set_state_dict(opt_state_dict)

    if classifier is not None and for_train:
        return extra_info
    else:
        return {}
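# --- Usage sketch (illustrative) ---
# Resuming training, possibly on a different number of GPUs: load() reads the
# per-rank shards, uses meta.json (pretrain_world_size, embedding_size,
# num_classes) to re-split the fc weight via the rearrange_weight helper used
# above, and restores backbone, classifier and optimizer state. The object
# names below are assumptions:
#
#   extra_info = checkpoint.load(backbone, classifier, optimizer, for_train=True)
#   start_epoch = extra_info.get('epoch', -1) + 1
#   lr_state = extra_info.get('lr_state')  # can be fed back into the LR scheduler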