Ejemplo n.º 1
0
    def __call__(self, model_list):
        """Create the ``optim.AdamW`` optimizer for the given models.

        Args:
            model_list: list of models whose parameters are optimized. It is
                None in static graph mode.

        Returns:
            The ``optim.AdamW`` instance built from this object's settings.

        Raises:
            Exception: ``model_list`` is None while a "no_weight_decay"
                option is configured.
        """
        # model_list is None in static graph
        if model_list:
            parameters = []
            for model in model_list:
                parameters = parameters + model.parameters()
        else:
            parameters = None

        # TODO(gaotingquan): model_list is None when in static graph, "no_weight_decay" not work.
        if model_list is None and (self.one_dim_param_no_weight_decay or
                                   len(self.no_weight_decay_name_list) != 0):
            msg = "\"AdamW\" does not support setting \"no_weight_decay\" in static graph. Please use dynamic graph."
            logger.error(Exception(msg))
            raise Exception(msg)

        # Parameters whose name contains any of the configured keywords.
        skip_decay_names = []
        if model_list:
            for model in model_list:
                for param_name, param in model.named_parameters():
                    if any(keyword in param_name
                           for keyword in self.no_weight_decay_name_list):
                        skip_decay_names.append(param.name)
        self.no_weight_decay_param_name_list = skip_decay_names

        # Optionally exclude every one-dimensional parameter as well.
        if self.one_dim_param_no_weight_decay:
            one_dim_names = []
            if model_list:
                for model in model_list:
                    for _, param in model.named_parameters():
                        if len(param.shape) == 1:
                            one_dim_names.append(param.name)
            self.no_weight_decay_param_name_list += one_dim_names

        return optim.AdamW(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            parameters=parameters,
            weight_decay=self.weight_decay,
            multi_precision=self.multi_precision,
            grad_clip=self.grad_clip,
            apply_decay_param_fun=self._apply_decay_param_fun)
Ejemplo n.º 2
0
def create_feeds(image_shape, use_mix=False, class_num=None, dtype="float32"):
    """
    Build the static-graph input placeholders of the model.

    Args:
        image_shape(list[int]): model input shape, such as [3, 224, 224]
        use_mix(bool): whether to use mix(include mixup, cutmix, fmix)
        class_num(int): the class number of network, required if use_mix
        dtype(str): dtype of the "data" placeholder

    Returns:
        feeds(dict): ordered dict holding "data" plus either "target"
            (when use_mix) or "label"

    Raises:
        Exception: use_mix is set but class_num is None
    """
    feeds = OrderedDict()
    feeds['data'] = paddle.static.data(
        name="data", shape=[None] + image_shape, dtype=dtype)

    if not use_mix:
        # plain classification: one integer label per sample
        feeds['label'] = paddle.static.data(
            name="label", shape=[None, 1], dtype="int64")
        return feeds

    if class_num is None:
        msg = "When use MixUp, CutMix and so on, you must set class_num."
        logger.error(msg)
        raise Exception(msg)

    # mix processing: a float target vector of length class_num per sample
    feeds['target'] = paddle.static.data(
        name="target", shape=[None, class_num], dtype="float32")
    return feeds
Ejemplo n.º 3
0
    def __init__(self, class_num, **op_dict):
        """Build OpSampler

        Args:
            class_num: class number, forwarded to every operator as the
                "class_num" keyword argument.
            **op_dict: maps an operator name to the dict of its parameters;
                the optional "prob" entry is the sampling probability.

        Raises:
            Exception: "class_num" is not set, or the total "prob" of the
                operators is greater than 1.
        """
        if not class_num:
            msg = "Please set \"Arch.class_num\" in config if use \"OpSampler\"."
            logger.error(Exception(msg))
            raise Exception(msg)

        if len(op_dict) < 1:
            msg = "ConfigWarning: No operator in \"OpSampler\". \"OpSampler\" has been skipped."
            logger.warning(msg)

        self.ops = {}
        total_prob = 0
        for op_name, op_param in op_dict.items():
            # Work on a copy: popping "prob" / adding "class_num" must not
            # modify the caller's config, otherwise building a second sampler
            # from the same config would silently see prob == 0.
            param = dict(op_param)
            if "prob" not in param:
                msg = f"ConfigWarning: Parameter \"prob\" should be set when use operator in \"OpSampler\". The operator \"{op_name}\"'s prob has been set \"0\"."
                logger.warning(msg)
            prob = param.pop("prob", 0)
            total_prob += prob
            param.update({"class_num": class_num})
            # SECURITY NOTE(review): eval() executes the operator name taken
            # from the config; only use with trusted config files.
            op = eval(op_name)(**param)
            self.ops.update({op: prob})

        if total_prob > 1:
            msg = "ConfigError: The total prob of operators in \"OpSampler\" should be less 1."
            logger.error(Exception(msg))
            raise Exception(msg)

        # add "None Op" when total_prob < 1, "None Op" do nothing
        self.ops[None] = 1 - total_prob
Ejemplo n.º 4
0
def check_model_with_running_mode(architecture):
    """
    Exit with status 1 when the model named by architecture["name"] is on
    the static-mode blacklist while paddle is not in dynamic mode.
    """
    # some model are not supported in the static mode
    unsupported_models = get_blacklist_model_in_static_mode()
    if paddle.in_dynamic_mode():
        return

    model_name = architecture["name"]
    if model_name not in unsupported_models:
        return

    logger.error(
        "Model: {} is not supported in the staic mode.".format(model_name))
    sys.exit(1)
Ejemplo n.º 5
0
def check_mix(architecture, use_mix=False):
    """
    check mix parameter

    Logs an error and exits with status 1 when mix processing is requested
    for GoogLeNet.

    Args:
        architecture(dict): architecture information, "name" is read
        use_mix(bool): whether mix processing is enabled
    """
    # Explicit check instead of `assert`, which is removed under `python -O`.
    # The comparison against False is kept as in the original check, so any
    # value that is not equal to False (including None) is rejected.
    if architecture["name"] == "GoogLeNet" and use_mix != False:  # noqa: E712
        logger.error("Cannot use mix processing in GoogLeNet, "
                     "please set use_mix = False.")
        sys.exit(1)
Ejemplo n.º 6
0
def check_data_dir(path):
    """
    check data_dir

    Logs an error and exits with status 1 when `path` is not an existing
    directory.

    Args:
        path(str): directory expected to hold the data
    """
    # `os.isdir` does not exist; the directory test lives in `os.path`.
    # An explicit check is used because `assert` is removed under `python -O`.
    if not os.path.isdir(path):
        logger.error(
            "Data path {} does not exist, please give a right path".format(
                path))
        sys.exit(1)
Ejemplo n.º 7
0
 def __getitem__(self, idx):
     """Return ``(transformed image, int label)`` for sample ``idx``.

     The line ``self.full_lines[idx]`` is split by ``self.delimiter`` into an
     image path (relative to ``self.params['data_dir']``) and a label. On any
     failure the error is logged and another randomly chosen sample is
     returned instead.
     """
     # Pre-bind so the error log below works even when the lookup itself fails.
     line = None
     try:
         line = self.full_lines[idx]
         img_path, label = line.split(self.delimiter)
         img_path = os.path.join(self.params['data_dir'], img_path)
         with open(img_path, 'rb') as f:
             img = f.read()
         return (transform(img, self.ops), int(label))
     except Exception as e:
         logger.error("data read faild: {}, exception info: {}".format(
             line, e))
         # random.randint is inclusive on both ends, so the upper bound must
         # be len(self) - 1 to stay inside the valid index range.
         return self.__getitem__(random.randint(0, len(self) - 1))
Ejemplo n.º 8
0
def check_gpu():
    """
    Log error and exit when using paddlepaddle cpu version.
    """
    # Explicit check instead of `assert`, which is removed under `python -O`
    # and would silently disable this validation.
    if not fluid.is_compiled_with_cuda():
        logger.error("You are using paddlepaddle cpu version! Please try to "
                     "install paddlepaddle-gpu to run model on GPU.")
        sys.exit(1)
Ejemplo n.º 9
0
def main(args):
    """Evaluate every benchmark model and log which ones match their
    reference top-1 accuracy.

    For each entry returned by ``parse_model_infos`` the pretrained weights
    are downloaded (and decompressed for "tar" urls), ``args.config`` and
    ``args.override`` are set, and ``eval.main`` is run. A model counts as
    right when its top-1 accuracy differs from the reference by at most
    0.001; any exception during evaluation is logged and counted as 0.0.
    """
    right_models, wrong_models = [], []

    for model_info in parse_model_infos(args.benchmark_file_list):
        try:
            url = model_info["pretrain_path"]
            downloaded = _download(url, args.pretrained_dir)
            weights_path = os.path.splitext(downloaded)[0]
            if url.endswith("tar"):
                extracted = _decompress(downloaded)
                weights_path = os.path.join(
                    os.path.dirname(weights_path), extracted)

            args.config = model_info["config_path"]
            args.override = [
                "pretrained_model={}".format(weights_path),
                "VALID.batch_size=256",
                "VALID.num_workers=16",
                "load_static_weights=True",
                "print_interval=100",
            ]

            # keep the manager referenced while its dict proxy is in use
            manager = Manager()
            return_dict = manager.dict()

            # A hack method to avoid name conflict.
            # Multi-process maybe a better method here.
            # More details can be seen in branch 2.0-beta.
            # TODO: fluid needs to be removed in the future.
            with paddle.utils.unique_name.guard():
                eval.main(args, return_dict)

            top1_acc = return_dict.get("top1_acc", 0.0)
        except Exception as e:
            logger.error(e)
            top1_acc = 0.0

        expected_acc = model_info["top1_acc"]
        diff = abs(top1_acc - expected_acc)
        model_name = model_info["model_name"]
        if diff > 0.001:
            logger.warning(
                "[{}]Top-1 acc diff should be <= 0.001 but got diff {}, gt acc: {}, eval acc: {}".
                format(model_name, diff, expected_acc, top1_acc))
            wrong_models.append(model_name)
        else:
            right_models.append(model_name)

    summary = (("right", right_models), ("wrong", wrong_models))
    for verdict, names in summary:
        logger.info("[number of " + verdict +
                    " models: {}, they are: {}".format(len(names), names))
Ejemplo n.º 10
0
def check_classes_num(classes_num):
    """
    check classes_num

    Logs an error and exits with status 1 unless `classes_num` is an int
    greater than 1.

    Args:
        classes_num(int): number of classes to validate
    """
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if not isinstance(classes_num, int) or classes_num <= 1:
        # The original message lacked the space between its two parts.
        logger.error("classes_num({}) should be a positive integer "
                     "and larger than 1".format(classes_num))
        sys.exit(1)
Ejemplo n.º 11
0
 def __getitem__(self, idx):
     """Return ``(image, label, camera)`` for sample ``idx``.

     The raw bytes of ``self.images[idx]`` are read, passed through
     ``self._transform_ops`` when any are set, and transposed with axes
     (2, 0, 1). On any failure the error is logged and a randomly chosen
     sample is returned instead.
     """
     try:
         with open(self.images[idx], 'rb') as image_file:
             sample = image_file.read()
         if self._transform_ops:
             sample = transform(sample, self._transform_ops)
         sample = sample.transpose((2, 0, 1))
         return (sample, self.labels[idx], self.cameras[idx])
     except Exception as err:
         message = "Exception occured when parse line: {} with msg: {}".format(
             self.images[idx], err)
         logger.error(message)
         fallback_idx = np.random.randint(self.__len__())
         return self.__getitem__(fallback_idx)
Ejemplo n.º 12
0
def check_version():
    """
    Verify the installed paddlepaddle through fluid.require_version('1.7.0');
    log an error and exit with status 1 when that call raises.
    """
    try:
        fluid.require_version('1.7.0')
    except Exception:
        message = ("PaddlePaddle version 1.7 or higher is required, "
                   "or a suitable develop version is satisfied as well. \n"
                   "Please make sure the version is good with your code.")
        logger.error(message)
        sys.exit(1)
Ejemplo n.º 13
0
    def __init__(self,
                 class_num,
                 alpha=1,
                 decay_power=3,
                 max_soft=0.,
                 reformulate=False):
        """Store the operator settings.

        Args:
            class_num: class number; must be truthy.
            alpha: stored as ``_alpha``.
            decay_power: stored as ``_decay_power``.
            max_soft: stored as ``_max_soft``.
            reformulate: stored as ``_reformulate``.

        Raises:
            Exception: ``class_num`` is not set.
        """
        if not class_num:
            msg = "Please set \"Arch.class_num\" in config if use \"FmixOperator\"."
            logger.error(Exception(msg))
            raise Exception(msg)

        self._alpha, self._decay_power = alpha, decay_power
        self._max_soft, self._reformulate = max_soft, reformulate
        self.class_num = class_num
Ejemplo n.º 14
0
def check_architecture(architecture):
    """
    check architecture and recommend similar architectures

    Logs an error and exits with status 1 when `architecture` is not among
    the names returned by similar_architectures(architecture).

    Args:
        architecture(str): architecture name to validate

    Raises:
        AssertionError: `architecture` is not a str
    """
    # Explicit checks instead of `assert` statements, which are removed
    # under `python -O`; the exception type is kept for existing callers.
    if not isinstance(architecture, str):
        raise AssertionError(
            "the type of architecture({}) should be str".format(architecture))

    similar_names = similar_architectures(architecture)
    if architecture not in similar_names:
        model_list = ', '.join(similar_names)
        logger.error("{} is not exist! Maybe you want: [{}]"
                     "".format(architecture, model_list))
        sys.exit(1)
Ejemplo n.º 15
0
    def __getitem__(self, idx):
        """Return ``(transformed image, float32 label array)`` for ``idx``.

        The line ``self.full_lines[idx]`` is split by ``self.delimiter`` into
        an image path (relative to ``self.params["data_dir"]``) and a
        comma-separated list of integer labels. On any failure the error is
        logged and another randomly chosen sample is returned instead.
        """
        # Pre-bind so the error log below works even when the lookup fails.
        line = None
        try:
            line = self.full_lines[idx]
            img_path, label_str = line.split(self.delimiter)
            img_path = os.path.join(self.params["data_dir"], img_path)
            with open(img_path, "rb") as f:
                img = f.read()

            labels = [int(i) for i in label_str.split(',')]

            return (transform(img,
                              self.ops), np.array(labels).astype("float32"))
        except Exception as e:
            logger.error("data read failed: {}, exception info: {}".format(
                line, e))
            # random.randint is inclusive on both ends, so the upper bound
            # must be len(self) - 1 to stay inside the valid index range.
            return self.__getitem__(random.randint(0, len(self) - 1))
Ejemplo n.º 16
0
 def __init__(self,
              dataset,
              batch_size,
              sample_per_id,
              shuffle=True,
              drop_last=True,
              sample_method="sample_avg_prob"):
     """Group sample indices by label and compute per-label probabilities.

     Args:
         dataset: dataset exposing a ``labels`` attribute.
         batch_size: batch size; must be a multiple of ``sample_per_id``.
         sample_per_id: stored as ``sample_per_label``.
         shuffle: forwarded to the parent sampler.
         drop_last: forwarded to the parent sampler.
         sample_method: "id_avg_prob" (every label equally likely) or
             "sample_avg_prob" (proportional to the label's sample count).

     Raises:
         AssertionError: a configuration precondition is violated.
         ValueError: ``sample_method`` is not supported.
     """
     super().__init__(dataset,
                      batch_size,
                      shuffle=shuffle,
                      drop_last=drop_last)
     assert batch_size % sample_per_id == 0, \
         "PKSampler configs error, Sample_per_id must be a divisor of batch_size."
     assert hasattr(self.dataset,
                    "labels"), "Dataset must have labels attribute."
     self.sample_per_label = sample_per_id
     self.label_dict = defaultdict(list)
     self.sample_method = sample_method
     for idx, label in enumerate(self.dataset.labels):
         self.label_dict[label].append(idx)
     self.label_list = list(self.label_dict)
     # The original message was cut off after "smaller than ".
     assert len(self.label_list) * self.sample_per_label > self.batch_size, \
         "batch size should be smaller than len(label_list) * sample_per_id"
     if self.sample_method == "id_avg_prob":
         self.prob_list = np.array([1 / len(self.label_list)] *
                                   len(self.label_list))
     elif self.sample_method == "sample_avg_prob":
         counter = [len(self.label_dict[label_i])
                    for label_i in self.label_list]
         self.prob_list = np.array(counter) / sum(counter)
     else:
         # Previously this only logged and then failed below with an
         # AttributeError on the never-assigned self.prob_list.
         msg = ("PKSampler only support id_avg_prob and sample_avg_prob sample method, "
                "but receive {}.".format(self.sample_method))
         logger.error(msg)
         raise ValueError(msg)
     # Compensate accumulated floating point error on the last entry so the
     # probabilities sum to 1.
     diff = np.abs(sum(self.prob_list) - 1)
     if diff > 0.00000001:
         self.prob_list[-1] = 1 - sum(self.prob_list[:-1])
         if self.prob_list[-1] > 1 or self.prob_list[-1] < 0:
             logger.error("PKSampler prob list error")
         else:
             logger.info(
                 "PKSampler: sum of prob list not equal to 1, diff is {}, change the last prob"
                 .format(diff))
Ejemplo n.º 17
0
def check_architecture(architecture):
    """
    check architecture and recommend similar architectures

    Logs an error and exits with status 1 when architecture["name"] is not
    among the names returned by similar_architectures().

    Args:
        architecture(dict): architecture information, must contain "name"

    Raises:
        AssertionError: `architecture` is not a dict or has no "name" key
    """
    # Explicit checks instead of `assert` statements, which are removed
    # under `python -O`; the exception type is kept for existing callers.
    if not isinstance(architecture, dict):
        raise AssertionError(
            "the type of architecture({}) should be dict".format(architecture))
    if "name" not in architecture:
        raise AssertionError(
            "name must be in the architecture keys, just contains: {}".format(
                architecture.keys()))

    similar_names = similar_architectures(architecture["name"],
                                          get_architectures())
    if architecture["name"] not in similar_names:
        model_list = ', '.join(similar_names)
        logger.error("{} is not exist! Maybe you want: [{}]"
                     "".format(architecture["name"], model_list))
        sys.exit(1)
Ejemplo n.º 18
0
    def __init__(self, class_num, alpha=0.2):
        """Create the Cutmix operator.

        Args:
            class_num: class number; must be truthy.
            alpha (float, optional): The parameter alpha of cutmix, must be
                greater than 0. Defaults to 0.2.

        Raises:
            Exception: ``alpha`` is not greater than 0 or ``class_num`` is
                not set.
        """
        if alpha <= 0:
            alpha_msg = f"Parameter \"alpha\" of Cutmix should be greater than 0. \"alpha\": {alpha}."
            raise Exception(alpha_msg)

        if not class_num:
            msg = "Please set \"Arch.class_num\" in config if use \"CutmixOperator\"."
            logger.error(Exception(msg))
            raise Exception(msg)

        self.class_num = class_num
        self._alpha = alpha
Ejemplo n.º 19
0
def main(url, image_path, top_k=1):
    """Send every image under `image_path` to the prediction service at `url`
    and log the per-image results plus the averages.

    Args:
        url: endpoint the JSON request is POSTed to.
        image_path: passed to get_image_file_list() to collect the images.
        top_k: value of the "top_k" field sent with every request.
    """
    image_file_list = get_image_file_list(image_path)
    headers = {"Content-type": "application/json"}
    cnt = 0
    total_time = 0
    all_acc = 0.0

    for image_file in image_file_list:
        file_str = image_file.split('/')[-1]
        # Read through a context manager so the handle is always closed; a
        # read error is the real "loading failed" case (read() never
        # returns None).
        try:
            with open(image_file, 'rb') as f:
                img = f.read()
        except OSError:
            logger.error("Loading image:{} failed".format(image_file))
            continue
        data = {'images': [cv2_to_base64(img)], 'top_k': top_k}

        starttime = time.time()
        try:
            r = requests.post(url=url, headers=headers, data=json.dumps(data))
            r.raise_for_status()
        except Exception as e:
            logger.error("File:{}, {}".format(file_str, e))
            continue
        elapse = time.time() - starttime
        total_time += elapse

        # Parse the response body once instead of on every access.
        response = r.json()
        if response['status'] != '0':
            logger.error(
                "File:{}, The parameters returned by the server are: {}".
                format(file_str, response['msg']))
            continue
        res = response["results"][0]
        classes = res[0]
        scores = res[1]
        all_acc += scores[0]
        cnt += 1

        results = dict(zip(classes, (round(x, 5) for x in scores)))

        message = "No.{}, File:{}, The top-{} result(s):{}, Time cost:{:.3f}".format(
            cnt, file_str, top_k, results, elapse)
        logger.info(message)

    # Without a single successful prediction the averages are undefined;
    # dividing by cnt would raise ZeroDivisionError.
    if cnt == 0:
        logger.error("No image was predicted successfully.")
        return

    logger.info("The average time cost: {}".format(float(total_time) / cnt))
    logger.info("The average top-1 score: {}".format(float(all_acc) / cnt))
Ejemplo n.º 20
0
    def __init__(self, config, mode="train"):
        """Set up everything the given `mode` needs from `config`.

        In order: seed, logger, train/eval functions, visualdl writer,
        device, AMP settings, dataloaders, losses, metrics, model, slim
        pruner/quanter, pretrained weights, optimizer, distributed wrapper
        and the infer pre/post-processing.

        Args:
            config: configuration mapping; the "Global", "Arch" and
                "DataLoader" sections are always read.
            mode: one of "train", "eval", "infer", "export".

        Raises:
            AssertionError: `mode`, the eval mode, the seed type or the
                device is invalid.
        """
        assert mode in ["train", "eval", "infer", "export"]
        self.mode = mode
        self.config = config
        self.eval_mode = self.config["Global"].get("eval_mode",
                                                   "classification")
        self.is_rec = "Head" in self.config["Arch"]

        # set seed
        seed = self.config["Global"].get("seed", False)
        if seed or seed == 0:
            assert isinstance(seed, int), "The 'seed' must be a integer!"
            paddle.seed(seed)
            np.random.seed(seed)
            random.seed(seed)

        # init logger
        self.output_dir = self.config['Global']['output_dir']
        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                                f"{mode}.log")
        init_logger(name='root', log_file=log_file)
        print_config(config)

        # init train_func and eval_func
        # The original `assert cond, logger.error(...)` raised with a None
        # message (logger.error returns None); log and raise explicitly.
        if self.eval_mode not in ["classification", "retrieval"]:
            msg = "Invalid eval mode: {}".format(self.eval_mode)
            logger.error(msg)
            raise AssertionError(msg)
        self.train_epoch_func = train_epoch
        self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

        self.use_dali = self.config['Global'].get("use_dali", False)

        # for visualdl
        self.vdl_writer = None
        if self.config['Global']['use_visualdl'] and mode == "train":
            vdl_writer_path = os.path.join(self.output_dir, "vdl")
            if not os.path.exists(vdl_writer_path):
                os.makedirs(vdl_writer_path)
            self.vdl_writer = LogWriter(logdir=vdl_writer_path)

        # set device
        assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu"]
        self.device = paddle.set_device(self.config["Global"]["device"])
        logger.info('train with paddle {} and device {}'.format(
            paddle.__version__, self.device))

        # AMP training
        self.amp = "AMP" in self.config
        if self.amp and self.config["AMP"] is not None:
            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
            self.use_dynamic_loss_scaling = self.config["AMP"].get(
                "use_dynamic_loss_scaling", False)
        else:
            self.scale_loss = 1.0
            self.use_dynamic_loss_scaling = False
        if self.amp:
            AMP_RELATED_FLAGS_SETTING = {
                'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
                'FLAGS_max_inplace_grad_add': 8,
            }
            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

        #TODO(gaotingquan): support rec
        class_num = config["Arch"].get("class_num", None)
        self.config["DataLoader"].update({"class_num": class_num})

        # evaluation is needed in "eval" mode and when training evaluates
        need_eval = self.mode == "eval" or (
            self.mode == "train" and
            self.config["Global"]["eval_during_train"])

        # build dataloader
        if self.mode == 'train':
            self.train_dataloader = build_dataloader(self.config["DataLoader"],
                                                     "Train", self.device,
                                                     self.use_dali)
        if need_eval:
            if self.eval_mode == "classification":
                self.eval_dataloader = build_dataloader(
                    self.config["DataLoader"], "Eval", self.device,
                    self.use_dali)
            elif self.eval_mode == "retrieval":
                self.gallery_query_dataloader = None
                if len(self.config["DataLoader"]["Eval"].keys()) == 1:
                    # a single entry serves as both gallery and query
                    key = list(self.config["DataLoader"]["Eval"].keys())[0]
                    self.gallery_query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], key, self.device,
                        self.use_dali)
                else:
                    self.gallery_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Gallery",
                        self.device, self.use_dali)
                    self.query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Query",
                        self.device, self.use_dali)

        # build loss
        if self.mode == "train":
            loss_info = self.config["Loss"]["Train"]
            self.train_loss_func = build_loss(loss_info)
        if need_eval:
            self.eval_loss_func = None
            loss_config = self.config.get("Loss", None)
            if loss_config is not None:
                loss_config = loss_config.get("Eval")
                if loss_config is not None:
                    self.eval_loss_func = build_loss(loss_config)

        # build metric
        # Default both to None first: previously they stayed undefined when
        # the "Metric" section (or its "Eval" entry in classification mode)
        # was missing from the config.
        self.train_metric_func = None
        self.eval_metric_func = None
        if self.mode == 'train':
            metric_config = self.config.get("Metric")
            if metric_config is not None:
                metric_config = metric_config.get("Train")
                if metric_config is not None:
                    self.train_metric_func = build_metrics(metric_config)

        if need_eval:
            metric_config = self.config.get("Metric")
            if self.eval_mode == "classification":
                if metric_config is not None:
                    metric_config = metric_config.get("Eval")
                    if metric_config is not None:
                        self.eval_metric_func = build_metrics(metric_config)
            elif self.eval_mode == "retrieval":
                if metric_config is None:
                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
                else:
                    metric_config = metric_config["Eval"]
                self.eval_metric_func = build_metrics(metric_config)

        # build model
        self.model = build_model(self.config["Arch"])
        # set @to_static for benchmark, skip this by default.
        apply_to_static(self.config, self.model)

        # for slim
        self.pruner = get_pruner(self.config, self.model)
        self.quanter = get_quaner(self.config, self.model)

        # load_pretrain
        if self.config["Global"]["pretrained_model"] is not None:
            if self.config["Global"]["pretrained_model"].startswith("http"):
                load_dygraph_pretrain_from_url(
                    self.model, self.config["Global"]["pretrained_model"])
            else:
                load_dygraph_pretrain(
                    self.model, self.config["Global"]["pretrained_model"])

        # build optimizer
        if self.mode == 'train':
            self.optimizer, self.lr_sch = build_optimizer(
                self.config["Optimizer"], self.config["Global"]["epochs"],
                len(self.train_dataloader), [self.model])

        # for distributed
        self.config["Global"][
            "distributed"] = paddle.distributed.get_world_size() != 1
        if self.config["Global"]["distributed"]:
            dist.init_parallel_env()
            self.model = paddle.DataParallel(self.model)

        # build postprocess for infer
        if self.mode == 'infer':
            self.preprocess_func = create_operators(
                self.config["Infer"]["transforms"])
            self.postprocess_func = build_postprocess(
                self.config["Infer"]["PostProcess"])
Ejemplo n.º 21
0
    def __init__(self, config, mode="train"):
        """Set up everything the given `mode` needs from `config`.

        In order: seed, logger, train/eval functions, visualdl writer,
        device, AMP flags, class_num migration, dataloaders, losses, metrics,
        model, pretrained weights, optimizer, AMP decoration, distributed
        wrapper and the infer pre/post-processing.

        Args:
            config: configuration mapping; the "Global", "Arch" and
                "DataLoader" sections are always read.
            mode: one of "train", "eval", "infer", "export".

        Raises:
            AssertionError: `mode`, the eval mode, the seed type or the
                device is invalid.
        """
        assert mode in ["train", "eval", "infer", "export"]
        self.mode = mode
        self.config = config
        self.eval_mode = self.config["Global"].get("eval_mode",
                                                   "classification")
        if "Head" in self.config["Arch"] or self.config["Arch"].get(
                "is_rec", False):
            self.is_rec = True
        else:
            self.is_rec = False

        # set seed
        seed = self.config["Global"].get("seed", False)
        if seed or seed == 0:
            assert isinstance(seed, int), "The 'seed' must be a integer!"
            paddle.seed(seed)
            np.random.seed(seed)
            random.seed(seed)

        # init logger
        self.output_dir = self.config['Global']['output_dir']
        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                                f"{mode}.log")
        init_logger(log_file=log_file)
        print_config(config)

        # init train_func and eval_func
        # The original `assert cond, logger.error(...)` raised with a None
        # message (logger.error returns None); log and raise explicitly.
        if self.eval_mode not in ["classification", "retrieval"]:
            msg = "Invalid eval mode: {}".format(self.eval_mode)
            logger.error(msg)
            raise AssertionError(msg)
        self.train_epoch_func = train_epoch
        self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

        self.use_dali = self.config['Global'].get("use_dali", False)

        # for visualdl (only the rank-0 process writes)
        self.vdl_writer = None
        if self.config['Global'][
                'use_visualdl'] and mode == "train" and dist.get_rank() == 0:
            vdl_writer_path = os.path.join(self.output_dir, "vdl")
            if not os.path.exists(vdl_writer_path):
                os.makedirs(vdl_writer_path)
            self.vdl_writer = LogWriter(logdir=vdl_writer_path)

        # set device
        assert self.config["Global"]["device"] in [
            "cpu", "gpu", "xpu", "npu", "mlu"
        ]
        self.device = paddle.set_device(self.config["Global"]["device"])
        logger.info('train with paddle {} and device {}'.format(
            paddle.__version__, self.device))

        # AMP training
        self.amp = "AMP" in self.config and self.mode == "train"
        if self.amp and self.config["AMP"] is not None:
            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
            self.use_dynamic_loss_scaling = self.config["AMP"].get(
                "use_dynamic_loss_scaling", False)
        else:
            self.scale_loss = 1.0
            self.use_dynamic_loss_scaling = False
        if self.amp:
            AMP_RELATED_FLAGS_SETTING = {
                'FLAGS_max_inplace_grad_add': 8,
            }
            if paddle.is_compiled_with_cuda():
                AMP_RELATED_FLAGS_SETTING.update(
                    {'FLAGS_cudnn_batchnorm_spatial_persistent': 1})
            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

        # migrate the deprecated Global.class_num to Arch.class_num
        if "class_num" in config["Global"]:
            global_class_num = config["Global"]["class_num"]
            if "class_num" not in config["Arch"]:
                config["Arch"]["class_num"] = global_class_num
                msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}."
            else:
                msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored."
            logger.warning(msg)
        #TODO(gaotingquan): support rec
        class_num = config["Arch"].get("class_num", None)
        self.config["DataLoader"].update({"class_num": class_num})

        # evaluation is needed in "eval" mode and when training evaluates
        need_eval = self.mode == "eval" or (
            self.mode == "train" and
            self.config["Global"]["eval_during_train"])

        # build dataloader
        if self.mode == 'train':
            self.train_dataloader = build_dataloader(self.config["DataLoader"],
                                                     "Train", self.device,
                                                     self.use_dali)
        if need_eval:
            if self.eval_mode == "classification":
                self.eval_dataloader = build_dataloader(
                    self.config["DataLoader"], "Eval", self.device,
                    self.use_dali)
            elif self.eval_mode == "retrieval":
                self.gallery_query_dataloader = None
                if len(self.config["DataLoader"]["Eval"].keys()) == 1:
                    # a single entry serves as both gallery and query
                    key = list(self.config["DataLoader"]["Eval"].keys())[0]
                    self.gallery_query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], key, self.device,
                        self.use_dali)
                else:
                    self.gallery_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Gallery",
                        self.device, self.use_dali)
                    self.query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Query",
                        self.device, self.use_dali)

        # build loss
        if self.mode == "train":
            loss_info = self.config["Loss"]["Train"]
            self.train_loss_func = build_loss(loss_info)
        if need_eval:
            self.eval_loss_func = None
            loss_config = self.config.get("Loss", None)
            if loss_config is not None:
                loss_config = loss_config.get("Eval")
                if loss_config is not None:
                    self.eval_loss_func = build_loss(loss_config)

        # build metric
        # Default both to None first: previously they stayed undefined when
        # the "Metric" section (or its "Eval" entry in classification mode)
        # was missing from the config.
        self.train_metric_func = None
        self.eval_metric_func = None
        if self.mode == 'train':
            metric_config = self.config.get("Metric")
            if metric_config is not None:
                metric_config = metric_config.get("Train")
                if metric_config is not None:
                    if hasattr(
                            self.train_dataloader, "collate_fn"
                    ) and self.train_dataloader.collate_fn is not None:
                        for m_idx, m in enumerate(metric_config):
                            if "TopkAcc" in m:
                                msg = "'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed."
                                logger.warning(msg)
                                # Remove only the matched entry. The pop used
                                # to sit after the loop, dropping the last
                                # metric when "TopkAcc" was absent and
                                # raising NameError on an empty list.
                                metric_config.pop(m_idx)
                                break
                    self.train_metric_func = build_metrics(metric_config)

        if need_eval:
            metric_config = self.config.get("Metric")
            if self.eval_mode == "classification":
                if metric_config is not None:
                    metric_config = metric_config.get("Eval")
                    if metric_config is not None:
                        self.eval_metric_func = build_metrics(metric_config)
            elif self.eval_mode == "retrieval":
                if metric_config is None:
                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
                else:
                    metric_config = metric_config["Eval"]
                self.eval_metric_func = build_metrics(metric_config)

        # build model
        self.model = build_model(self.config)
        # set @to_static for benchmark, skip this by default.
        apply_to_static(self.config, self.model)

        # load_pretrain
        if self.config["Global"]["pretrained_model"] is not None:
            if self.config["Global"]["pretrained_model"].startswith("http"):
                load_dygraph_pretrain_from_url(
                    self.model, self.config["Global"]["pretrained_model"])
            else:
                load_dygraph_pretrain(
                    self.model, self.config["Global"]["pretrained_model"])

        # build optimizer
        if self.mode == 'train':
            self.optimizer, self.lr_sch = build_optimizer(
                self.config["Optimizer"], self.config["Global"]["epochs"],
                len(self.train_dataloader), [self.model])

        # for amp training (self.amp is only True in "train" mode, so
        # self.optimizer exists here)
        if self.amp:
            self.scaler = paddle.amp.GradScaler(
                init_loss_scaling=self.scale_loss,
                use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
            amp_level = self.config['AMP'].get("level", "O1")
            if amp_level not in ["O1", "O2"]:
                msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'."
                logger.warning(msg)
                self.config['AMP']["level"] = "O1"
                amp_level = "O1"
            self.model, self.optimizer = paddle.amp.decorate(
                models=self.model,
                optimizers=self.optimizer,
                level=amp_level,
                save_dtype='float32')

        # for distributed
        world_size = dist.get_world_size()
        self.config["Global"]["distributed"] = world_size != 1
        if world_size != 4 and self.mode == "train":
            msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train."
            logger.warning(msg)
        if self.config["Global"]["distributed"]:
            dist.init_parallel_env()
            self.model = paddle.DataParallel(self.model)

        # build postprocess for infer
        if self.mode == 'infer':
            self.preprocess_func = create_operators(
                self.config["Infer"]["transforms"])
            self.postprocess_func = build_postprocess(
                self.config["Infer"]["PostProcess"])
Ejemplo n.º 22
0
 def replace_sub(self, *args, **kwargs) -> None:
     """Deprecated entry point: always logs and raises DeprecationWarning.

     Any arguments are accepted and ignored.
     """
     deprecation = DeprecationWarning(
         "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead."
     )
     logger.error(deprecation)
     raise deprecation
Ejemplo n.º 23
0
def create_fetchs(out,
                  feeds,
                  architecture,
                  topk=5,
                  epsilon=None,
                  class_num=None,
                  use_mix=False,
                  config=None,
                  mode="Train"):
    """
    Assemble the fetch dict (loss plus, when not mixing, metrics) for a model.

    The loss is built from config["Loss"][mode]; metrics are built from
    config["Metric"][mode] and are only added when use_mix is False.

    Args:
        out(variable): model output variable
        feeds(dict): dict of model input variables; 'target' is read when
            use_mix is True, 'label' otherwise
        architecture(dict): architecture information (not read by this function)
        topk(int): usually top5 (not read by this function)
        epsilon(float): label smoothing parameter (not read by this function)
        class_num(int): the class number of network, required if use_mix
        use_mix(bool): whether to use mix(include mixup, cutmix, fmix)
        config(dict): model config, indexed as config["Loss"][mode] and
            config["Metric"][mode]
        mode(str): config section to use for the loss and the metrics

    Returns:
        fetchs(dict): OrderedDict mapping a name to (variable, AverageMeter)

    Raises:
        Exception: if use_mix is True and class_num is None
    """
    # Guard clause: mixed labels need class_num to be reshaped.
    if use_mix and class_num is None:
        msg = "When use MixUp, CutMix and so on, you must set class_num."
        logger.error(msg)
        raise Exception(msg)

    if use_mix:
        feed_key, target_shape = 'target', [-1, class_num]
    else:
        feed_key, target_shape = 'label', [-1, 1]
    target = paddle.reshape(feeds[feed_key], target_shape)

    # build loss
    criterion = build_loss(config["Loss"][mode])
    loss_out = criterion(out, target)["loss"]

    fetchs = OrderedDict()
    fetchs['loss'] = (loss_out, AverageMeter('loss', '7.4f', need_avg=True))

    # Metrics are skipped entirely when mixing is enabled.
    if use_mix:
        return fetchs

    # build metric
    evaluator = build_metrics(config["Metric"][mode])
    metric_dict = evaluator(out, target)

    for name in metric_dict:
        # Outside of training, average every metric across all trainers.
        needs_reduce = (mode != "Train" and
                        paddle.distributed.get_world_size() > 1)
        if needs_reduce:
            paddle.distributed.all_reduce(
                metric_dict[name], op=paddle.distributed.ReduceOp.SUM)
            averaged = metric_dict[name] / paddle.distributed.get_world_size()
            metric_dict[name] = averaged

        meter = AverageMeter(name, '7.4f', need_avg=True)
        fetchs[name] = (metric_dict[name], meter)

    return fetchs
Ejemplo n.º 24
0
def main(args):
    """Train a dynamic-graph model, optionally validating and checkpointing.

    Reads the config from ``args.config`` (with ``args.override`` applied),
    builds the network, optimizer and dataloaders, then runs the epoch loop.
    Any exception raised inside the epoch loop is logged through
    ``logger.error`` and not re-raised; the VisualDL writer, if one was
    created, is closed in every case.

    Args:
        args: parsed command-line arguments; ``args.config`` and
            ``args.override`` are read.
    """
    paddle.seed(12345)

    config = get_config(args.config, overrides=args.override, show=True)

    # assign the place
    device = 'gpu' if config.get("use_gpu", True) else 'cpu'
    place = paddle.set_device(device)

    # More than one trainer means data-parallel training.
    config["use_data_parallel"] = paddle.distributed.get_world_size() != 1
    if config["use_data_parallel"]:
        paddle.distributed.init_parallel_env()

    net = program.create_model(config.ARCHITECTURE, config.classes_num)
    optimizer, lr_scheduler = program.create_optimizer(
        config, parameter_list=net.parameters())

    if config["use_data_parallel"]:
        dp_net = paddle.DataParallel(
            net,
            find_unused_parameters=config.get("find_unused_parameters",
                                              False))
    else:
        dp_net = net

    # load model from checkpoint or pretrained model
    init_model(config, net, optimizer)

    train_dataloader = Reader(config, 'train', places=place)()
    if config.validate:
        valid_dataloader = Reader(config, 'valid', places=place)()

    last_epoch_id = config.get("last_epoch", -1)
    # best top1 accuracy seen so far and the epoch it was reached in
    best_top1_acc = 0.0
    best_top1_epoch = last_epoch_id

    vdl_writer = None
    vdl_writer_path = config.get("vdl_dir", None)
    if vdl_writer_path:
        from visualdl import LogWriter
        vdl_writer = LogWriter(vdl_writer_path)

    def checkpoint_dir():
        # Evaluated lazily so config lookups happen only when a save occurs.
        return os.path.join(config.model_save_dir,
                            config.ARCHITECTURE["name"])

    # Ensure that the vdl log file can be closed normally
    try:
        for epoch_id in range(last_epoch_id + 1, config.epochs):
            # 1. train with train dataset
            net.train()
            program.run(train_dataloader, config, dp_net, optimizer,
                        lr_scheduler, epoch_id, 'train', vdl_writer)

            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                net.eval()
                with paddle.no_grad():
                    top1_acc = program.run(valid_dataloader, config, net,
                                           None, None, epoch_id, 'valid',
                                           vdl_writer)
                if top1_acc > best_top1_acc:
                    best_top1_acc, best_top1_epoch = top1_acc, epoch_id
                    save_model(net, optimizer, checkpoint_dir(), "best_model")
                logger.info(
                    "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, best_top1_epoch))

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                save_model(net, optimizer, checkpoint_dir(), epoch_id)
    except Exception as e:
        logger.error(e)
    finally:
        if vdl_writer:
            vdl_writer.close()
Ejemplo n.º 25
0
 def __init__(self, *args, **kwargs):
     """Refuse construction: this class is deprecated.

     Any arguments are accepted and ignored. The deprecation message is
     logged through ``logger.error`` and then raised.

     Raises:
         DeprecationWarning: always.
     """
     deprecation = DeprecationWarning(
         "\"MixCELos\" is deprecated, please use \"CELoss\" instead.")
     logger.error(deprecation)
     raise deprecation
Ejemplo n.º 26
0
def main(args):
    """Train a static-graph model and optionally run quantization-aware training.

    The function builds the train (and, if ``config.validate``, valid)
    programs, runs the regular epoch loop with periodic validation and
    checkpointing, and, when ``args.use_quant`` is set, runs five extra
    quantization-aware passes over the training data, evaluates the quantized
    program once and saves an inference model to ``args.output_path``.

    Args:
        args: parsed command-line arguments; this function reads
            ``args.config``, ``args.override``, ``args.use_quant``,
            ``args.vdl_dir`` and ``args.output_path``.

    Exits the process with status 1 when quantization is requested without
    validation, or when quantization is requested and ``get_gpu_count()`` is
    not 1.
    """
    config = get_config(args.config, overrides=args.override, show=True)
    # Quantization-aware training requires validation to be enabled
    if not config.validate and args.use_quant:
        logger.error("=====>Train quant model must use validate!")
        sys.exit(1)
    if args.use_quant:
        # Reserve 5 extra epochs; the main loop below subtracts them again and
        # the quantization stage runs exactly 5 passes.
        config.epochs = config.epochs + 5
        gpu_count = get_gpu_count()
        if gpu_count != 1:
            logger.error(
                "=====>`Train quant model must use only one GPU. "
                "Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` ."
            )
            sys.exit(1)

    # Choose whether to run on GPU or CPU
    use_gpu = config.get("use_gpu", True)
    places = fluid.cuda_places() if use_gpu else fluid.cpu_places()

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    best_top1_acc = 0.0

    # Build the training dataloader and model outputs
    # (program.build returns one extra value, `ema`, when 'use_ema' is set)
    if not config.get('use_ema'):
        train_dataloader, train_fetchs, out, softmax_out = program.build(
            config,
            train_prog,
            startup_prog,
            is_train=True,
            is_distributed=False)
    else:
        train_dataloader, train_fetchs, ema, out, softmax_out = program.build(
            config,
            train_prog,
            startup_prog,
            is_train=True,
            is_distributed=False)
    # Build the validation dataloader and model outputs
    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs, _, _ = program.build(
            config,
            valid_prog,
            startup_prog,
            is_train=False,
            is_distributed=False)
        # Clone the validation program for test, which can drop computation
        # that is unrelated to evaluation
        valid_prog = valid_prog.clone(for_test=True)

    # Create the executor (on the first available place)
    exe = fluid.Executor(places[0])
    exe.run(startup_prog)

    # Load the model: either a pretrained model or a checkpoint
    init_model(config, train_prog, exe)

    train_reader = Reader(config, 'train')()
    train_dataloader.set_sample_list_generator(train_reader, places)

    compiled_train_prog = program.compile(config, train_prog,
                                          train_fetchs['loss'][0].name)

    if config.validate:
        valid_reader = Reader(config, 'valid')()
        valid_dataloader.set_sample_list_generator(valid_reader, places)
        compiled_valid_prog = program.compile(config,
                                              valid_prog,
                                              share_prog=compiled_train_prog)

    vdl_writer = LogWriter(args.vdl_dir)

    # NOTE(review): config.epochs was increased by 5 above only when
    # args.use_quant is set; without use_quant this loop runs
    # config.epochs - 5 epochs. Confirm that is intended.
    for epoch_id in range(config.epochs - 5):
        # Train for one epoch
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', config, vdl_writer)

        # Run one evaluation pass
        if config.validate and epoch_id % config.valid_interval == 0:
            if config.get('use_ema'):
                # Extra validation pass run inside `ema.apply(exe)`; its
                # result is discarded.
                logger.info(logger.coloring("EMA validate start..."))
                with ema.apply(exe):
                    _ = program.run(valid_dataloader, exe, compiled_valid_prog,
                                    valid_fetchs, epoch_id, 'valid', config)
                logger.info(logger.coloring("EMA validate over!"))

            top1_acc = program.run(valid_dataloader, exe, compiled_valid_prog,
                                   valid_fetchs, epoch_id, 'valid', config)

            if vdl_writer:
                logger.scaler('valid_avg', top1_acc, epoch_id, vdl_writer)

            if top1_acc > best_top1_acc:
                best_top1_acc = top1_acc
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, epoch_id)
                logger.info("{:s}".format(logger.coloring(message, "RED")))
                # The best model is only written on epochs that are also
                # regular save epochs.
                if epoch_id % config.save_interval == 0:
                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(train_prog, model_path, "best_model")

        # Save the model
        if epoch_id % config.save_interval == 0:
            model_path = os.path.join(config.model_save_dir,
                                      config.ARCHITECTURE["name"])
            # Remove the checkpoint directory written three epochs earlier,
            # if it exists.
            if epoch_id >= 3 and os.path.exists(
                    os.path.join(model_path, str(epoch_id - 3))):
                shutil.rmtree(os.path.join(model_path, str(epoch_id - 3)),
                              ignore_errors=True)
            save_model(train_prog, model_path, epoch_id)

    # Quantization-aware training
    if args.use_quant and config.validate:
        # Program used for quantization-aware training
        quant_program = slim.quant.quant_aware(train_prog,
                                               exe.place,
                                               for_test=False)
        # Program used to evaluate the quantized result
        val_quant_program = slim.quant.quant_aware(valid_prog,
                                                   exe.place,
                                                   for_test=True)

        fetch_list = [f[0] for f in train_fetchs.values()]
        metric_list = [f[1] for f in train_fetchs.values()]
        # Five passes over the training data with the quantized program.
        for i in range(5):
            for idx, batch in enumerate(train_dataloader()):
                metrics = exe.run(program=quant_program,
                                  feed=batch,
                                  fetch_list=fetch_list)
                # NOTE: this `i` rebinds the outer loop variable of the same
                # name; the outer `range(5)` iteration itself is unaffected.
                for i, m in enumerate(metrics):
                    metric_list[i].update(np.mean(m), len(batch[0]))
                fetchs_str = ''.join([str(m.value) + ' ' for m in metric_list])

                # Log every 10th batch.
                if idx % 10 == 0:
                    logger.info("quant train : " + fetchs_str)

        fetch_list = [f[0] for f in valid_fetchs.values()]
        metric_list = [f[1] for f in valid_fetchs.values()]
        for idx, batch in enumerate(valid_dataloader()):
            metrics = exe.run(program=val_quant_program,
                              feed=batch,
                              fetch_list=fetch_list)
            for i, m in enumerate(metrics):
                metric_list[i].update(np.mean(m), len(batch[0]))
            fetchs_str = ''.join([str(m.value) + ' ' for m in metric_list])

            # Log every 10th batch.
            if idx % 10 == 0:
                logger.info("quant valid: " + fetchs_str)

        # Save the quantization-trained model
        # (only float_prog is exported below; int8_prog is not used here)
        float_prog, int8_prog = slim.quant.convert(val_quant_program,
                                                   exe.place,
                                                   save_int8=True)
        fluid.io.save_inference_model(dirname=args.output_path,
                                      feeded_var_names=['feed_image'],
                                      target_vars=[softmax_out],
                                      executor=exe,
                                      main_program=float_prog,
                                      model_filename='__model__',
                                      params_filename='__params__')
Ejemplo n.º 27
0
def main(args):
    """Send images to the serving endpoint in batches and log the predictions.

    Images listed by ``get_image_list(args.image_file)`` are read with OpenCV,
    preprocessed, grouped into batches of ``args.batch_size`` and posted as
    JSON to ``args.server_url``. For every successful batch the top-k results
    are logged per file; at the end the average prediction time, average wall
    time and average top-1 score per successfully predicted image are logged.

    Unreadable images are skipped with a warning. A failed request (transport
    error, HTTP error status, or a response whose ``status`` is not ``"000"``)
    is logged and that batch is dropped.

    Args:
        args: parsed command-line arguments; ``image_file``, ``batch_size``,
            ``server_url`` and ``top_k`` are read here, and ``args`` is also
            passed through to ``preprocess``.
    """
    image_path_list = get_image_list(args.image_file)
    headers = {"Content-type": "application/json"}

    # Number of images for which a prediction was actually returned. It is
    # the denominator of all averages and is deliberately separate from the
    # batching logic, which looks only at the pending batch size.
    cnt = 0
    predict_time = 0
    all_score = 0.0
    start_time = time.time()

    batch_input_list = []
    img_name_list = []
    last_idx = len(image_path_list) - 1
    for idx, img_path in enumerate(image_path_list):
        img = cv2.imread(img_path)
        if img is None:
            logger.warning(
                "Image file failed to read and has been skipped. The path: {}".
                format(img_path))
            # No `continue` here: if this was the last path, the pending
            # batch still has to be flushed below.
        else:
            # Reverse the channel axis (the last axis) before preprocessing.
            img = img[:, :, ::-1]
            batch_input_list.append(preprocess(img, args))
            img_name_list.append(img_path.split('/')[-1])

        # Send only when there is something to send and either the batch is
        # full or the input list is exhausted.
        if not batch_input_list:
            continue
        if len(batch_input_list) < args.batch_size and idx != last_idx:
            continue

        batch_input = np.array(batch_input_list)
        b64str, revert_shape = np_to_b64(batch_input)
        data = {
            "images": b64str,
            "revert_params": {
                "shape": revert_shape,
                "dtype": str(batch_input.dtype)
            },
            "top_k": args.top_k
        }
        try:
            r = requests.post(url=args.server_url,
                              headers=headers,
                              data=json.dumps(data))
            # Must be called: the bare attribute reference is a no-op.
            r.raise_for_status()
            response = r.json()
            if response["status"] != "000":
                raise Exception(response["msg"])
        except Exception as e:
            logger.error("{}, in file(s): {} etc.".format(
                e, img_name_list[0]))
            continue
        else:
            results = response["results"]
            batch_result_list = results["prediction"]

            cnt += len(batch_result_list)
            predict_time += results["elapse"]

            for img_name, result_list in zip(img_name_list,
                                             batch_result_list):
                all_score += result_list["scores"][0]
                result_str = "".join(
                    "{}: {:.2f}\t".format(clas_id, score)
                    for clas_id, score in zip(result_list["clas_ids"],
                                              result_list["scores"]))
                logger.info("File:{}, The top-{} result(s): {}".format(
                    img_name, args.top_k, result_str))
        finally:
            # Runs on success and on the `continue` above, so a failed batch
            # is never re-sent together with the next one.
            batch_input_list = []
            img_name_list = []

    total_time = time.time() - start_time
    if cnt == 0:
        # Nothing was predicted; averages would divide by zero.
        logger.warning(
            "No image was predicted successfully, statistics are skipped.")
        return

    logger.info("The average time of prediction cost: {:.3f} s/image".format(
        predict_time / cnt))
    logger.info("The average time cost: {:.3f} s/image".format(total_time /
                                                               cnt))
    logger.info("The average top-1 score: {:.3f}".format(all_score / cnt))