Example #1
0
 def __call__(self,
              global_step,
              backbone: paddle.nn.Layer,
              partial_fc: PartialFC = None):
     """Checkpoint callback: save backbone weights and the partial-FC shard.

     Only rank 0 writes the backbone to ``<output>/backbone.pdparams``;
     the partial-FC classifier is sharded, so it saves its own parameters
     via ``save_params``. Nothing is saved for the first 100 steps
     (presumably to skip warm-up — TODO confirm).

     Args:
         global_step: current training step; saving triggers only past 100.
         backbone: model whose state_dict is serialized.
         partial_fc: optional sharded classifier with its own save logic.
     """
     # BUGFIX: `self.rank is 0` compared identity, not value — it only
     # worked by accident of CPython small-int interning. Use `==`.
     if global_step > 100 and self.rank == 0:
         paddle.save(backbone.state_dict(),
                     os.path.join(self.output, "backbone.pdparams"))
     if global_step > 100 and partial_fc is not None:
         partial_fc.save_params()
Example #2
0
 def __call__(self, num_update, backbone: paddle.nn.Layer, batch_size=10):
     """Verification callback: run ``ver_test`` periodically on rank 0.

     Every ``self.frequent`` updates (and never at update 0), switch the
     backbone to eval mode, run verification, then restore train mode.

     Args:
         num_update: current update counter.
         backbone: model to evaluate.
         batch_size: batch size forwarded to ``ver_test``.
     """
     if self.rank == 0 and num_update > 0 and num_update % self.frequent == 0:
         backbone.eval()
         # no_grad avoids building the autograd graph during evaluation;
         # this also makes the callback consistent with the other
         # verification callback in this file.
         with paddle.no_grad():
             self.ver_test(backbone, num_update, batch_size)
         backbone.train()
Example #3
0
    def save(self,
             backbone: paddle.nn.Layer,
             classifier: paddle.nn.Layer = None,
             optimizer=None,
             epoch=0,
             for_train=True,
             best_metric=None):
        """Write a sharded checkpoint for a distributed training run.

        Backbone (replicated) parameters are saved by a single rank;
        classifier parameters are model-parallel, so every rank writes its
        own shard. When ``for_train`` is True the optimizer state and a
        ``meta.json`` with resume info are saved as well, and rank 0 prunes
        old epoch directories beyond ``self.max_num_last_checkpoint``.

        Args:
            backbone: model whose parameters are saved once (by save_rank).
            classifier: optional model-parallel classifier; each rank saves
                its own shard of it.
            optimizer: required when ``for_train`` is True.
            epoch: epoch index, used as the checkpoint directory name.
            for_train: also save optimizer state and resume metadata.
            best_metric: when given, save under ``best_model/<dataset>`` and
                use ``best_metric['rank']`` as the saving rank.
        """

        if best_metric is not None:
            save_rank = best_metric['rank']
            model_save_dir = os.path.join(self.model_save_dir, 'best_model',
                                          str(best_metric['dataset_name']))
        else:
            save_rank = 0  # default we only save rank 0 backbone
            model_save_dir = os.path.join(self.model_save_dir, str(epoch))

        if not os.path.exists(model_save_dir):
            # More than one process may race to create the directory, so
            # tolerate EEXIST and re-raise anything else.
            try:
                os.makedirs(model_save_dir)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise
                pass

        if self.rank == save_rank:
            # Non-distributed params are identical on all ranks, so only
            # save_rank writes them. One file per tensor, keyed by the
            # framework-internal param.name (matches `load`).
            for name, param in backbone.state_dict().items():
                paddle.save(
                    param, os.path.join(model_save_dir,
                                        param.name + '.pdparam'))

        if classifier is not None:
            # Distributed (sharded) params differ per rank, so every rank
            # saves its own shard.
            for name, param in classifier.state_dict().items():
                paddle.save(
                    param, os.path.join(model_save_dir,
                                        param.name + '.pdparam'))

        if for_train:
            assert optimizer is not None
            opt_state_dict = optimizer.state_dict()
            lr_state_dict = opt_state_dict['LR_Scheduler']
            for name, opt in opt_state_dict.items():
                # Gradient buffers are transient; skip them.
                if '@GRAD' in name:
                    continue
                # Distributed optimizer vars ('dist@...@rank@...') must be
                # saved on every rank; all other vars only on rank 0.
                # NOTE: `and` binds tighter than `or` here, which matches
                # that intent.
                if 'dist@' in name and '@rank@' in name or self.rank == 0:
                    paddle.save(opt,
                                os.path.join(model_save_dir, name + '.pdopt'))

            if self.rank == save_rank:
                # save some extra info for resume
                # pretrain_world_size, embedding_size, num_classes are used for
                # re-split fc weight when gpu setting changed.
                # epoch use to restart.
                config_file = os.path.join(model_save_dir, 'meta.json')
                extra_info = dict()
                extra_info["pretrain_world_size"] = self.world_size
                extra_info["embedding_size"] = self.embedding_size
                extra_info['num_classes'] = self.num_classes
                extra_info['epoch'] = epoch
                extra_info['lr_state'] = lr_state_dict
                if best_metric is not None:
                    extra_info['best_metric'] = best_metric
                with open(config_file, 'w') as f:
                    json.dump(extra_info, f)

        logging.info("Save model to {}.".format(model_save_dir))
        # Retention: rank 0 removes epoch directories older than the last
        # `max_num_last_checkpoint` epochs. The range starts at -1,
        # presumably to catch a sentinel "-1" directory — TODO confirm.
        if self.rank == 0 and self.max_num_last_checkpoint > 0:
            for idx in range(-1, epoch - self.max_num_last_checkpoint + 1):
                path = os.path.join(self.model_save_dir, str(idx))
                if os.path.exists(path):
                    logging.info("Remove checkpoint {}.".format(path))
                    shutil.rmtree(path)
Example #4
0
 def __call__(self, num_update, backbone: paddle.nn.Layer):
     """Verification callback: periodically evaluate the backbone on rank 0.

     Runs ``ver_test`` every ``self.frequent`` updates (never at update 0),
     toggling the backbone between eval and train mode around the test.
     """
     # Guard clauses: only rank 0 verifies, and only on schedule.
     if self.rank != 0:
         return
     if num_update <= 0 or num_update % self.frequent != 0:
         return
     backbone.eval()
     with paddle.no_grad():
         self.ver_test(backbone, num_update)
     backbone.train()
Example #5
0
    def load(self,
             backbone: paddle.nn.Layer,
             classifier: paddle.nn.Layer = None,
             optimizer=None,
             for_train=True,
             dtype=None):
        """Load a sharded checkpoint from ``self.checkpoint_dir``.

        Reads per-tensor ``.pdparam``/``.pdopt`` files, casts them to the
        model's dtypes (or to ``dtype`` if given), re-splits distributed
        classifier shards when the world size changed since pre-training,
        and installs the states into backbone/classifier/optimizer.

        Args:
            backbone: model to receive the non-distributed parameters.
            classifier: optional model-parallel classifier; when None,
                distributed shard files are skipped entirely.
            optimizer: required when ``for_train`` is True.
            for_train: also load optimizer state (``.pdopt`` files).
            dtype: optional forced dtype ('float32' or 'float16') for all
                loaded tensors.

        Returns:
            The ``meta.json`` extra-info dict when a classifier is loaded
            for training, otherwise an empty dict.
        """

        assert os.path.exists(self.checkpoint_dir)
        checkpoint_dir = os.path.abspath(self.checkpoint_dir)

        # Map framework-internal parameter name -> dtype string, so each
        # loaded numpy tensor can be cast back to its target dtype.
        type_dict = {}
        for name, param in backbone.state_dict().items():
            type_dict[param.name] = convert_dtype(param.dtype)

        if classifier is not None:
            # Distributed params exist on every rank; record their dtypes too.
            for name, param in classifier.state_dict().items():
                type_dict[param.name] = convert_dtype(param.dtype)

        if for_train:
            assert optimizer is not None
            opt_state_dict = optimizer.state_dict()
            lr_state_dict = opt_state_dict['LR_Scheduler']
            for name, opt in opt_state_dict.items():
                # LR scheduler state is a dict (handled via meta.json) and
                # gradient buffers are transient; skip both.
                if name == 'LR_Scheduler' or '@GRAD' in name:
                    continue
                type_dict[name] = convert_dtype(opt.dtype)

        param_state_dict = {}
        opt_state_dict = {}
        dist_param_state_dict = {}

        # Distributed tensors are bucketed by weight/bias and by whether
        # they are momentum ("velocity") buffers, since each group is
        # re-split independently below.
        dist_weight_state_dict = {}
        dist_weight_velocity_state_dict = {}
        dist_bias_state_dict = {}
        dist_bias_velocity_state_dict = {}
        for path in os.listdir(checkpoint_dir):
            path = os.path.join(checkpoint_dir, path)
            if not os.path.isfile(path):
                continue

            basename = os.path.basename(path)
            name, ext = os.path.splitext(basename)

            # Only per-tensor checkpoint files are of interest here.
            if ext not in ['.pdopt', '.pdparam']:
                continue

            # Optimizer state is only needed when resuming training.
            if not for_train and ext == '.pdopt':
                continue

            # Without a classifier there is nowhere to put dist shards.
            if classifier is None and 'dist@' in name and '@rank@' in name:
                continue

            tensor = paddle.load(path, return_numpy=True)
            if dtype:
                assert dtype in ['float32', 'float16']
                tensor = tensor.astype(dtype)
            else:
                # Cast back to the dtype recorded from the live model.
                tensor = tensor.astype(type_dict[name])

            if 'dist@' in name and '@rank@' in name:
                # Bucket distributed shards: '.w'/'.b' distinguishes weight
                # vs bias, 'velocity' marks momentum buffers — presumably a
                # naming convention of the distributed FC layer (confirm).
                if '.w' in name and 'velocity' not in name:
                    dist_weight_state_dict[name] = tensor
                elif '.w' in name and 'velocity' in name:
                    dist_weight_velocity_state_dict[name] = tensor
                elif '.b' in name and 'velocity' not in name:
                    dist_bias_state_dict[name] = tensor
                elif '.b' in name and 'velocity' in name:
                    dist_bias_velocity_state_dict[name] = tensor

            else:
                if ext == '.pdparam':
                    param_state_dict[name] = tensor
                else:
                    opt_state_dict[name] = tensor

        if classifier is not None and for_train:
            meta_file = os.path.join(checkpoint_dir, 'meta.json')
            if not os.path.exists(meta_file):
                logging.error(
                    "Please make sure the checkpoint dir {} exists, and "
                    "parameters in that dir are validating.".format(
                        checkpoint_dir))
                exit()

            with open(meta_file, 'r') as handle:
                extra_info = json.load(handle)

            # Preprocess distributed parameters: validate that the saved
            # model matches this run, then re-split shards for the current
            # world size.
            pretrain_world_size = extra_info['pretrain_world_size']
            assert pretrain_world_size > 0
            embedding_size = extra_info['embedding_size']
            assert embedding_size == self.embedding_size
            num_classes = extra_info['num_classes']
            assert num_classes == self.num_classes

            logging.info(
                "Parameters for pre-training: pretrain_world_size ({}), "
                "embedding_size ({}), and num_classes ({}).".format(
                    pretrain_world_size, embedding_size, num_classes))
            logging.info("Parameters for inference or fine-tuning: "
                         "world_size ({}).".format(self.world_size))

            # Shard names embed a zero-padded rank; used to pick out the
            # shards belonging to this process after re-splitting.
            rank_str = '%05d' % self.rank

            dist_weight_state_dict = rearrange_weight(dist_weight_state_dict,
                                                      pretrain_world_size,
                                                      self.world_size)
            dist_bias_state_dict = rearrange_weight(dist_bias_state_dict,
                                                    pretrain_world_size,
                                                    self.world_size)
            for name, value in dist_weight_state_dict.items():
                if rank_str in name:
                    dist_param_state_dict[name] = value
            for name, value in dist_bias_state_dict.items():
                if rank_str in name:
                    dist_param_state_dict[name] = value

            # NOTE(review): `for_train` is already True inside this branch,
            # so this inner check is redundant but harmless.
            if for_train:
                dist_weight_velocity_state_dict = rearrange_weight(
                    dist_weight_velocity_state_dict, pretrain_world_size,
                    self.world_size)
                dist_bias_velocity_state_dict = rearrange_weight(
                    dist_bias_velocity_state_dict, pretrain_world_size,
                    self.world_size)
                for name, value in dist_weight_velocity_state_dict.items():
                    if rank_str in name:
                        opt_state_dict[name] = value
                for name, value in dist_bias_velocity_state_dict.items():
                    if rank_str in name:
                        opt_state_dict[name] = value

        def map_actual_param_name(state_dict, load_state_dict):
            # Re-key from structural state_dict keys to the loaded tensors:
            # files were saved under the internal `param.name`, while
            # set_state_dict expects the structural key.
            for name, param in state_dict.items():
                state_dict[name] = load_state_dict[param.name]
            return state_dict

        logging.info("Load checkpoint from '{}'. ".format(checkpoint_dir))
        param_state_dict = map_actual_param_name(backbone.state_dict(),
                                                 param_state_dict)
        backbone.set_state_dict(param_state_dict)
        if classifier is not None:
            dist_param_state_dict = map_actual_param_name(
                classifier.state_dict(), dist_param_state_dict)
            classifier.set_state_dict(dist_param_state_dict)
        if for_train:
            assert optimizer is not None
            optimizer.set_state_dict(opt_state_dict)

        if classifier is not None and for_train:
            return extra_info
        else:
            return {}