def __call__(self, model_list):
    # model_list is None in static graph
    parameters = sum([m.parameters() for m in model_list],
                     []) if model_list else None

    # TODO(gaotingquan): model_list is None when in static graph, so "no_weight_decay" does not work.
    if model_list is None:
        if self.one_dim_param_no_weight_decay or len(
                self.no_weight_decay_name_list) != 0:
            msg = "\"AdamW\" does not support setting \"no_weight_decay\" in static graph. Please use dynamic graph."
            logger.error(msg)
            raise Exception(msg)

    self.no_weight_decay_param_name_list = [
        p.name for model in model_list for n, p in model.named_parameters()
        if any(nd in n for nd in self.no_weight_decay_name_list)
    ] if model_list else []

    if self.one_dim_param_no_weight_decay:
        self.no_weight_decay_param_name_list += [
            p.name for model in model_list
            for n, p in model.named_parameters() if len(p.shape) == 1
        ] if model_list else []

    opt = optim.AdamW(
        learning_rate=self.learning_rate,
        beta1=self.beta1,
        beta2=self.beta2,
        epsilon=self.epsilon,
        parameters=parameters,
        weight_decay=self.weight_decay,
        multi_precision=self.multi_precision,
        grad_clip=self.grad_clip,
        apply_decay_param_fun=self._apply_decay_param_fun)
    return opt
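# A hedged sketch of how the name list built above is typically consumed:
# paddle.optimizer.AdamW calls `apply_decay_param_fun(param.name)` for each
# parameter and applies weight decay only when it returns True. The body of
# `_apply_decay_param_fun` is not shown in this section, so the version below
# is an assumption, not the repo's definitive implementation.
#
#     def _apply_decay_param_fun(self, name):
#         # decay every parameter except those collected as "no weight decay"
#         return name not in self.no_weight_decay_param_name_list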
def create_feeds(image_shape, use_mix=False, class_num=None, dtype="float32"):
    """
    Create feeds as model input

    Args:
        image_shape(list[int]): model input shape, such as [3, 224, 224]
        use_mix(bool): whether to use mix (including mixup, cutmix, fmix)
        class_num(int): the class number of network, required if use_mix

    Returns:
        feeds(dict): dict of model input variables
    """
    feeds = OrderedDict()
    feeds['data'] = paddle.static.data(
        name="data", shape=[None] + image_shape, dtype=dtype)

    if use_mix:
        if class_num is None:
            msg = "When using MixUp, CutMix and so on, class_num must be set."
            logger.error(msg)
            raise Exception(msg)
        feeds['target'] = paddle.static.data(
            name="target", shape=[None, class_num], dtype="float32")
    else:
        feeds['label'] = paddle.static.data(
            name="label", shape=[None, 1], dtype="int64")

    return feeds
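# A minimal usage sketch (not part of the original repo): build mix-style
# feeds inside a fresh static program. The shape and class number below are
# illustrative only.
def _demo_create_feeds():
    import paddle

    paddle.enable_static()
    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        # feeds["data"]: [None, 3, 224, 224]; feeds["target"]: [None, 1000]
        feeds = create_feeds([3, 224, 224], use_mix=True, class_num=1000)
    return feeds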
def __init__(self, class_num, **op_dict):
    """Build OpSampler

    Raises:
        Exception: The parameter "prob" of an operator is set incorrectly.
    """
    if not class_num:
        msg = "Please set \"Arch.class_num\" in config if using \"OpSampler\"."
        logger.error(msg)
        raise Exception(msg)

    if len(op_dict) < 1:
        msg = "ConfigWarning: No operator in \"OpSampler\". \"OpSampler\" has been skipped."
        logger.warning(msg)

    self.ops = {}
    total_prob = 0
    for op_name in op_dict:
        param = op_dict[op_name]
        if "prob" not in param:
            msg = f"ConfigWarning: Parameter \"prob\" should be set when using operators in \"OpSampler\". The operator \"{op_name}\"'s prob has been set to \"0\"."
            logger.warning(msg)
        prob = param.pop("prob", 0)
        total_prob += prob
        param.update({"class_num": class_num})
        op = eval(op_name)(**param)
        self.ops.update({op: prob})

    if total_prob > 1:
        msg = "ConfigError: The total prob of operators in \"OpSampler\" should not be greater than 1."
        logger.error(msg)
        raise Exception(msg)

    # add "None Op" when total_prob < 1; the "None Op" does nothing
    self.ops[None] = 1 - total_prob
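# A self-contained sketch (not from the repo) of the sampling rule OpSampler
# implements: one operator is drawn per batch according to its configured
# "prob", with the remaining probability mass assigned to a no-op. Operator
# names here are illustrative; in OpSampler the dict keys are op instances.
def _demo_op_sampler_rule():
    import random

    ops = {"MixupOperator": 0.5, "CutmixOperator": 0.3}
    ops[None] = 1 - sum(ops.values())  # "None Op" does nothing
    names, probs = zip(*ops.items())
    return random.choices(names, weights=probs, k=1)[0]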
def check_model_with_running_mode(architecture):
    """
    Check whether the model is consistent with the operating mode.
    """
    # some models are not supported in static mode
    blacklist = get_blacklist_model_in_static_mode()
    if not paddle.in_dynamic_mode() and architecture["name"] in blacklist:
        logger.error("Model: {} is not supported in the static mode.".format(
            architecture["name"]))
        sys.exit(1)
    return
def check_mix(architecture, use_mix=False):
    """
    Check the mix parameter.
    """
    err = "Cannot use mix processing in GoogLeNet, " \
          "please set use_mix = False."
    try:
        if architecture["name"] == "GoogLeNet":
            assert use_mix is False
    except AssertionError:
        logger.error(err)
        sys.exit(1)
def check_data_dir(path):
    """
    Check whether the data directory exists.
    """
    err = "Data path {} does not exist, please provide a valid path.".format(
        path)
    try:
        assert os.path.isdir(path)
    except AssertionError:
        logger.error(err)
        sys.exit(1)
def __getitem__(self, idx):
    try:
        line = self.full_lines[idx]
        img_path, label = line.split(self.delimiter)
        img_path = os.path.join(self.params['data_dir'], img_path)
        with open(img_path, 'rb') as f:
            img = f.read()
        return (transform(img, self.ops), int(label))
    except Exception as e:
        logger.error("data read failed: {}, exception info: {}".format(
            line, e))
        # fall back to a random valid sample; random.randint is inclusive
        # on both ends, so the upper bound must be len(self) - 1
        return self.__getitem__(random.randint(0, len(self) - 1))
def check_gpu():
    """
    Log error and exit when using paddlepaddle cpu version.
    """
    err = "You are using paddlepaddle cpu version! Please try to " \
          "install paddlepaddle-gpu to run model on GPU."
    try:
        assert fluid.is_compiled_with_cuda()
    except AssertionError:
        logger.error(err)
        sys.exit(1)
def main(args):
    benchmark_file_list = args.benchmark_file_list
    model_infos = parse_model_infos(benchmark_file_list)
    right_models = []
    wrong_models = []

    for model_info in model_infos:
        try:
            pretrained_url = model_info["pretrain_path"]
            fname = _download(pretrained_url, args.pretrained_dir)
            pretrained_path = os.path.splitext(fname)[0]
            if pretrained_url.endswith("tar"):
                path = _decompress(fname)
                pretrained_path = os.path.join(
                    os.path.dirname(pretrained_path), path)

            args.config = model_info["config_path"]
            args.override = [
                "pretrained_model={}".format(pretrained_path),
                "VALID.batch_size=256",
                "VALID.num_workers=16",
                "load_static_weights=True",
                "print_interval=100",
            ]

            manager = Manager()
            return_dict = manager.dict()

            # A hack method to avoid name conflict.
            # Multi-process maybe a better method here.
            # More details can be seen in branch 2.0-beta.
            # TODO: fluid needs to be removed in the future.
            with paddle.utils.unique_name.guard():
                eval.main(args, return_dict)

            top1_acc = return_dict.get("top1_acc", 0.0)
        except Exception as e:
            logger.error(e)
            top1_acc = 0.0

        diff = abs(top1_acc - model_info["top1_acc"])
        if diff > 0.001:
            err_info = "[{}] Top-1 acc diff should be <= 0.001 but got diff {}, gt acc: {}, eval acc: {}".format(
                model_info["model_name"], diff, model_info["top1_acc"],
                top1_acc)
            logger.warning(err_info)
            wrong_models.append(model_info["model_name"])
        else:
            right_models.append(model_info["model_name"])

    logger.info("number of right models: {}, they are: {}".format(
        len(right_models), right_models))
    logger.info("number of wrong models: {}, they are: {}".format(
        len(wrong_models), wrong_models))
def check_classes_num(classes_num):
    """
    Check classes_num.
    """
    err = "classes_num({}) should be a positive integer " \
          "and larger than 1".format(classes_num)
    try:
        assert isinstance(classes_num, int)
        assert classes_num > 1
    except AssertionError:
        logger.error(err)
        sys.exit(1)
def __getitem__(self, idx):
    try:
        with open(self.images[idx], 'rb') as f:
            img = f.read()
        if self._transform_ops:
            img = transform(img, self._transform_ops)
        img = img.transpose((2, 0, 1))
        return (img, self.labels[idx], self.cameras[idx])
    except Exception as ex:
        logger.error("Exception occurred when parsing line: {} with msg: {}".
                     format(self.images[idx], ex))
        rnd_idx = np.random.randint(self.__len__())
        return self.__getitem__(rnd_idx)
def check_version():
    """
    Log error and exit when the installed version of paddlepaddle is
    not satisfied.
    """
    err = "PaddlePaddle version 1.7 or higher is required, " \
          "or a suitable develop version is satisfied as well. \n" \
          "Please make sure the version is good with your code."
    try:
        fluid.require_version('1.7.0')
    except Exception:
        logger.error(err)
        sys.exit(1)
def __init__(self,
             class_num,
             alpha=1,
             decay_power=3,
             max_soft=0.,
             reformulate=False):
    if not class_num:
        msg = "Please set \"Arch.class_num\" in config if using \"FmixOperator\"."
        logger.error(msg)
        raise Exception(msg)

    self._alpha = alpha
    self._decay_power = decay_power
    self._max_soft = max_soft
    self._reformulate = reformulate
    self.class_num = class_num
def check_architecture(architecture):
    """
    Check the architecture and recommend similar architectures.
    """
    assert isinstance(architecture, str), \
        ("the type of architecture({}) should be str".format(architecture))
    similar_names = similar_architectures(architecture)
    model_list = ', '.join(similar_names)
    err = "{} does not exist! Maybe you want: [{}]".format(architecture,
                                                           model_list)
    try:
        assert architecture in similar_names
    except AssertionError:
        logger.error(err)
        sys.exit(1)
def __getitem__(self, idx):
    try:
        line = self.full_lines[idx]
        img_path, label_str = line.split(self.delimiter)
        img_path = os.path.join(self.params["data_dir"], img_path)
        with open(img_path, "rb") as f:
            img = f.read()
        labels = label_str.split(',')
        labels = [int(i) for i in labels]
        return (transform(img, self.ops),
                np.array(labels).astype("float32"))
    except Exception as e:
        logger.error("data read failed: {}, exception info: {}".format(
            line, e))
        # random.randint is inclusive on both ends
        return self.__getitem__(random.randint(0, len(self) - 1))
def __init__(self,
             dataset,
             batch_size,
             sample_per_id,
             shuffle=True,
             drop_last=True,
             sample_method="sample_avg_prob"):
    super().__init__(
        dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
    assert batch_size % sample_per_id == 0, \
        "PKSampler configs error, sample_per_id must be a divisor of batch_size."
    assert hasattr(self.dataset,
                   "labels"), "Dataset must have labels attribute."
    self.sample_per_label = sample_per_id
    self.label_dict = defaultdict(list)
    self.sample_method = sample_method
    for idx, label in enumerate(self.dataset.labels):
        self.label_dict[label].append(idx)
    self.label_list = list(self.label_dict)
    assert len(self.label_list) * self.sample_per_label > self.batch_size, \
        "batch size should be smaller than the number of labels * sample_per_id"
    if self.sample_method == "id_avg_prob":
        self.prob_list = np.array([1 / len(self.label_list)] *
                                  len(self.label_list))
    elif self.sample_method == "sample_avg_prob":
        counter = []
        for label_i in self.label_list:
            counter.append(len(self.label_dict[label_i]))
        self.prob_list = np.array(counter) / sum(counter)
    else:
        logger.error(
            "PKSampler only supports id_avg_prob and sample_avg_prob sample methods, "
            "but received {}.".format(self.sample_method))
    diff = np.abs(sum(self.prob_list) - 1)
    if diff > 0.00000001:
        self.prob_list[-1] = 1 - sum(self.prob_list[:-1])
        if self.prob_list[-1] > 1 or self.prob_list[-1] < 0:
            logger.error("PKSampler prob list error")
        else:
            logger.info(
                "PKSampler: sum of prob list not equal to 1, diff is {}, change the last prob"
                .format(diff))
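# A hedged usage sketch (not from the repo), assuming the __init__ above
# belongs to the PKSampler class: only a dataset with a `labels` attribute is
# required, so `_ToyDataset` below is hypothetical. With batch_size=16 and
# sample_per_id=4, every batch holds P=4 identities with K=4 samples each.
def _demo_pk_sampler():
    class _ToyDataset:
        def __init__(self):
            self.labels = [i // 10 for i in range(200)]  # 20 ids x 10 samples

        def __len__(self):
            return len(self.labels)

    return PKSampler(
        _ToyDataset(),
        batch_size=16,
        sample_per_id=4,
        sample_method="sample_avg_prob")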
def check_architecture(architecture):
    """
    Check the architecture and recommend similar architectures.
    """
    assert isinstance(architecture, dict), \
        ("the type of architecture({}) should be dict".format(architecture))
    assert "name" in architecture, \
        ("the architecture keys must contain \"name\", but only got: {}".
         format(architecture.keys()))
    similar_names = similar_architectures(architecture["name"],
                                          get_architectures())
    model_list = ', '.join(similar_names)
    err = "{} does not exist! Maybe you want: [{}]".format(
        architecture["name"], model_list)
    try:
        assert architecture["name"] in similar_names
    except AssertionError:
        logger.error(err)
        sys.exit(1)
def __init__(self, class_num, alpha=0.2):
    """Build Cutmix operator

    Args:
        alpha (float, optional): The parameter alpha of cutmix. Defaults to 0.2.

    Raises:
        Exception: The value of the parameter is illegal.
    """
    if alpha <= 0:
        raise Exception(
            f"Parameter \"alpha\" of Cutmix should be greater than 0. \"alpha\": {alpha}."
        )
    if not class_num:
        msg = "Please set \"Arch.class_num\" in config if using \"CutmixOperator\"."
        logger.error(msg)
        raise Exception(msg)

    self._alpha = alpha
    self.class_num = class_num
def main(url, image_path, top_k=1):
    image_file_list = get_image_file_list(image_path)
    headers = {"Content-type": "application/json"}
    cnt = 0
    total_time = 0
    all_acc = 0.0

    for image_file in image_file_list:
        file_str = image_file.split('/')[-1]
        with open(image_file, 'rb') as f:
            img = f.read()
        # file.read() returns empty bytes (not None) when nothing is read
        if not img:
            logger.error("Loading image: {} failed".format(image_file))
            continue
        data = {'images': [cv2_to_base64(img)], 'top_k': top_k}

        starttime = time.time()
        try:
            r = requests.post(url=url, headers=headers, data=json.dumps(data))
            r.raise_for_status()
        except Exception as e:
            logger.error("File: {}, {}".format(file_str, e))
            continue
        elapse = time.time() - starttime
        total_time += elapse

        if r.json()['status'] != '0':
            logger.error(
                "File: {}, The parameters returned by the server are: {}".
                format(file_str, r.json()['msg']))
            continue
        res = r.json()["results"][0]
        classes = res[0]
        scores = res[1]
        all_acc += scores[0]
        cnt += 1

        scores = map(lambda x: round(x, 5), scores)
        results = dict(zip(classes, scores))

        message = "No.{}, File:{}, The top-{} result(s): {}, Time cost: {:.3f}".format(
            cnt, file_str, top_k, results, elapse)
        logger.info(message)

    logger.info("The average time cost: {}".format(float(total_time) / cnt))
    logger.info("The average top-1 score: {}".format(float(all_acc) / cnt))
def __init__(self, config, mode="train"): assert mode in ["train", "eval", "infer", "export"] self.mode = mode self.config = config self.eval_mode = self.config["Global"].get("eval_mode", "classification") if "Head" in self.config["Arch"]: self.is_rec = True else: self.is_rec = False # set seed seed = self.config["Global"].get("seed", False) if seed or seed == 0: assert isinstance(seed, int), "The 'seed' must be a integer!" paddle.seed(seed) np.random.seed(seed) random.seed(seed) # init logger self.output_dir = self.config['Global']['output_dir'] log_file = os.path.join(self.output_dir, self.config["Arch"]["name"], f"{mode}.log") init_logger(name='root', log_file=log_file) print_config(config) # init train_func and eval_func assert self.eval_mode in ["classification", "retrieval"], logger.error( "Invalid eval mode: {}".format(self.eval_mode)) self.train_epoch_func = train_epoch self.eval_func = getattr(evaluation, self.eval_mode + "_eval") self.use_dali = self.config['Global'].get("use_dali", False) # for visualdl self.vdl_writer = None if self.config['Global']['use_visualdl'] and mode == "train": vdl_writer_path = os.path.join(self.output_dir, "vdl") if not os.path.exists(vdl_writer_path): os.makedirs(vdl_writer_path) self.vdl_writer = LogWriter(logdir=vdl_writer_path) # set device assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu"] self.device = paddle.set_device(self.config["Global"]["device"]) logger.info('train with paddle {} and device {}'.format( paddle.__version__, self.device)) # AMP training self.amp = True if "AMP" in self.config else False if self.amp and self.config["AMP"] is not None: self.scale_loss = self.config["AMP"].get("scale_loss", 1.0) self.use_dynamic_loss_scaling = self.config["AMP"].get( "use_dynamic_loss_scaling", False) else: self.scale_loss = 1.0 self.use_dynamic_loss_scaling = False if self.amp: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, 'FLAGS_max_inplace_grad_add': 8, } paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) #TODO(gaotingquan): support rec class_num = config["Arch"].get("class_num", None) self.config["DataLoader"].update({"class_num": class_num}) # build dataloader if self.mode == 'train': self.train_dataloader = build_dataloader(self.config["DataLoader"], "Train", self.device, self.use_dali) if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): if self.eval_mode == "classification": self.eval_dataloader = build_dataloader( self.config["DataLoader"], "Eval", self.device, self.use_dali) elif self.eval_mode == "retrieval": self.gallery_query_dataloader = None if len(self.config["DataLoader"]["Eval"].keys()) == 1: key = list(self.config["DataLoader"]["Eval"].keys())[0] self.gallery_query_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], key, self.device, self.use_dali) else: self.gallery_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], "Gallery", self.device, self.use_dali) self.query_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], "Query", self.device, self.use_dali) # build loss if self.mode == "train": loss_info = self.config["Loss"]["Train"] self.train_loss_func = build_loss(loss_info) if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): loss_config = self.config.get("Loss", None) if loss_config is not None: loss_config = loss_config.get("Eval") if loss_config is not None: self.eval_loss_func = build_loss(loss_config) else: self.eval_loss_func = None else: 
self.eval_loss_func = None # build metric if self.mode == 'train': metric_config = self.config.get("Metric") if metric_config is not None: metric_config = metric_config.get("Train") if metric_config is not None: self.train_metric_func = build_metrics(metric_config) else: self.train_metric_func = None else: self.train_metric_func = None if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): metric_config = self.config.get("Metric") if self.eval_mode == "classification": if metric_config is not None: metric_config = metric_config.get("Eval") if metric_config is not None: self.eval_metric_func = build_metrics(metric_config) elif self.eval_mode == "retrieval": if metric_config is None: metric_config = [{"name": "Recallk", "topk": (1, 5)}] else: metric_config = metric_config["Eval"] self.eval_metric_func = build_metrics(metric_config) else: self.eval_metric_func = None # build model self.model = build_model(self.config["Arch"]) # set @to_static for benchmark, skip this by default. apply_to_static(self.config, self.model) # for slim self.pruner = get_pruner(self.config, self.model) self.quanter = get_quaner(self.config, self.model) # load_pretrain if self.config["Global"]["pretrained_model"] is not None: if self.config["Global"]["pretrained_model"].startswith("http"): load_dygraph_pretrain_from_url( self.model, self.config["Global"]["pretrained_model"]) else: load_dygraph_pretrain( self.model, self.config["Global"]["pretrained_model"]) # build optimizer if self.mode == 'train': self.optimizer, self.lr_sch = build_optimizer( self.config["Optimizer"], self.config["Global"]["epochs"], len(self.train_dataloader), [self.model]) # for distributed self.config["Global"][ "distributed"] = paddle.distributed.get_world_size() != 1 if self.config["Global"]["distributed"]: dist.init_parallel_env() if self.config["Global"]["distributed"]: self.model = paddle.DataParallel(self.model) # build postprocess for infer if self.mode == 'infer': self.preprocess_func = create_operators( self.config["Infer"]["transforms"]) self.postprocess_func = build_postprocess( self.config["Infer"]["PostProcess"])
def __init__(self, config, mode="train"): assert mode in ["train", "eval", "infer", "export"] self.mode = mode self.config = config self.eval_mode = self.config["Global"].get("eval_mode", "classification") if "Head" in self.config["Arch"] or self.config["Arch"].get( "is_rec", False): self.is_rec = True else: self.is_rec = False # set seed seed = self.config["Global"].get("seed", False) if seed or seed == 0: assert isinstance(seed, int), "The 'seed' must be a integer!" paddle.seed(seed) np.random.seed(seed) random.seed(seed) # init logger self.output_dir = self.config['Global']['output_dir'] log_file = os.path.join(self.output_dir, self.config["Arch"]["name"], f"{mode}.log") init_logger(log_file=log_file) print_config(config) # init train_func and eval_func assert self.eval_mode in ["classification", "retrieval"], logger.error( "Invalid eval mode: {}".format(self.eval_mode)) self.train_epoch_func = train_epoch self.eval_func = getattr(evaluation, self.eval_mode + "_eval") self.use_dali = self.config['Global'].get("use_dali", False) # for visualdl self.vdl_writer = None if self.config['Global'][ 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: vdl_writer_path = os.path.join(self.output_dir, "vdl") if not os.path.exists(vdl_writer_path): os.makedirs(vdl_writer_path) self.vdl_writer = LogWriter(logdir=vdl_writer_path) # set device assert self.config["Global"]["device"] in [ "cpu", "gpu", "xpu", "npu", "mlu" ] self.device = paddle.set_device(self.config["Global"]["device"]) logger.info('train with paddle {} and device {}'.format( paddle.__version__, self.device)) # AMP training self.amp = True if "AMP" in self.config and self.mode == "train" else False if self.amp and self.config["AMP"] is not None: self.scale_loss = self.config["AMP"].get("scale_loss", 1.0) self.use_dynamic_loss_scaling = self.config["AMP"].get( "use_dynamic_loss_scaling", False) else: self.scale_loss = 1.0 self.use_dynamic_loss_scaling = False if self.amp: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_max_inplace_grad_add': 8, } if paddle.is_compiled_with_cuda(): AMP_RELATED_FLAGS_SETTING.update( {'FLAGS_cudnn_batchnorm_spatial_persistent': 1}) paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) if "class_num" in config["Global"]: global_class_num = config["Global"]["class_num"] if "class_num" not in config["Arch"]: config["Arch"]["class_num"] = global_class_num msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." else: msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." 
logger.warning(msg) #TODO(gaotingquan): support rec class_num = config["Arch"].get("class_num", None) self.config["DataLoader"].update({"class_num": class_num}) # build dataloader if self.mode == 'train': self.train_dataloader = build_dataloader(self.config["DataLoader"], "Train", self.device, self.use_dali) if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): if self.eval_mode == "classification": self.eval_dataloader = build_dataloader( self.config["DataLoader"], "Eval", self.device, self.use_dali) elif self.eval_mode == "retrieval": self.gallery_query_dataloader = None if len(self.config["DataLoader"]["Eval"].keys()) == 1: key = list(self.config["DataLoader"]["Eval"].keys())[0] self.gallery_query_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], key, self.device, self.use_dali) else: self.gallery_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], "Gallery", self.device, self.use_dali) self.query_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], "Query", self.device, self.use_dali) # build loss if self.mode == "train": loss_info = self.config["Loss"]["Train"] self.train_loss_func = build_loss(loss_info) if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): loss_config = self.config.get("Loss", None) if loss_config is not None: loss_config = loss_config.get("Eval") if loss_config is not None: self.eval_loss_func = build_loss(loss_config) else: self.eval_loss_func = None else: self.eval_loss_func = None # build metric if self.mode == 'train': metric_config = self.config.get("Metric") if metric_config is not None: metric_config = metric_config.get("Train") if metric_config is not None: if hasattr( self.train_dataloader, "collate_fn" ) and self.train_dataloader.collate_fn is not None: for m_idx, m in enumerate(metric_config): if "TopkAcc" in m: msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed." logger.warning(msg) break metric_config.pop(m_idx) self.train_metric_func = build_metrics(metric_config) else: self.train_metric_func = None else: self.train_metric_func = None if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): metric_config = self.config.get("Metric") if self.eval_mode == "classification": if metric_config is not None: metric_config = metric_config.get("Eval") if metric_config is not None: self.eval_metric_func = build_metrics(metric_config) elif self.eval_mode == "retrieval": if metric_config is None: metric_config = [{"name": "Recallk", "topk": (1, 5)}] else: metric_config = metric_config["Eval"] self.eval_metric_func = build_metrics(metric_config) else: self.eval_metric_func = None # build model self.model = build_model(self.config) # set @to_static for benchmark, skip this by default. 
apply_to_static(self.config, self.model) # load_pretrain if self.config["Global"]["pretrained_model"] is not None: if self.config["Global"]["pretrained_model"].startswith("http"): load_dygraph_pretrain_from_url( self.model, self.config["Global"]["pretrained_model"]) else: load_dygraph_pretrain( self.model, self.config["Global"]["pretrained_model"]) # build optimizer if self.mode == 'train': self.optimizer, self.lr_sch = build_optimizer( self.config["Optimizer"], self.config["Global"]["epochs"], len(self.train_dataloader), [self.model]) # for amp training if self.amp: self.scaler = paddle.amp.GradScaler( init_loss_scaling=self.scale_loss, use_dynamic_loss_scaling=self.use_dynamic_loss_scaling) amp_level = self.config['AMP'].get("level", "O1") if amp_level not in ["O1", "O2"]: msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'." logger.warning(msg) self.config['AMP']["level"] = "O1" amp_level = "O1" self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=amp_level, save_dtype='float32') # for distributed world_size = dist.get_world_size() self.config["Global"]["distributed"] = world_size != 1 if world_size != 4 and self.mode == "train": msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train." logger.warning(msg) if self.config["Global"]["distributed"]: dist.init_parallel_env() self.model = paddle.DataParallel(self.model) # build postprocess for infer if self.mode == 'infer': self.preprocess_func = create_operators( self.config["Infer"]["transforms"]) self.postprocess_func = build_postprocess( self.config["Infer"]["PostProcess"])
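# A hedged sketch of the minimal config dict the Engine constructor above
# consumes. The keys are inferred from the reads in __init__; the
# authoritative schema lives in the repo's YAML config files, so values here
# are illustrative placeholders only.
_demo_engine_config = {
    "Global": {
        "output_dir": "./output",
        "device": "gpu",
        "epochs": 120,
        "eval_during_train": True,
        "use_visualdl": False,
        "pretrained_model": None,
    },
    "Arch": {"name": "ResNet50", "class_num": 1000},
    "DataLoader": {"Train": ..., "Eval": ...},
    "Loss": {"Train": ..., "Eval": ...},
    "Optimizer": ...,
}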
def replace_sub(self, *args, **kwargs) -> None:
    msg = "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead."
    logger.error(msg)
    raise DeprecationWarning(msg)
def create_fetchs(out,
                  feeds,
                  architecture,
                  topk=5,
                  epsilon=None,
                  class_num=None,
                  use_mix=False,
                  config=None,
                  mode="Train"):
    """
    Create fetchs as model outputs (including loss and metrics); will call
    create_loss and create_metric (if use_mix).

    Args:
        out(variable): model output variable
        feeds(dict): dict of model input variables. If use mix_up, it will
            not include label.
        architecture(dict): architecture information, name (such as ResNet50)
            is needed
        topk(int): usually top5
        epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0
        class_num(int): the class number of network, required if use_mix
        use_mix(bool): whether to use mix (including mixup, cutmix, fmix)
        config(dict): model config

    Returns:
        fetchs(dict): dict of model outputs (including loss and metrics)
    """
    fetchs = OrderedDict()
    # build loss
    if use_mix:
        if class_num is None:
            msg = "When using MixUp, CutMix and so on, class_num must be set."
            logger.error(msg)
            raise Exception(msg)
        target = paddle.reshape(feeds['target'], [-1, class_num])
    else:
        target = paddle.reshape(feeds['label'], [-1, 1])

    loss_func = build_loss(config["Loss"][mode])
    loss_dict = loss_func(out, target)
    loss_out = loss_dict["loss"]
    fetchs['loss'] = (loss_out, AverageMeter('loss', '7.4f', need_avg=True))

    # build metric
    if not use_mix:
        metric_func = build_metrics(config["Metric"][mode])
        metric_dict = metric_func(out, target)

        for key in metric_dict:
            if mode != "Train" and paddle.distributed.get_world_size() > 1:
                paddle.distributed.all_reduce(
                    metric_dict[key], op=paddle.distributed.ReduceOp.SUM)
                metric_dict[key] = metric_dict[
                    key] / paddle.distributed.get_world_size()

            fetchs[key] = (metric_dict[key], AverageMeter(
                key, '7.4f', need_avg=True))

    return fetchs
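# A hedged wiring sketch (hypothetical helper, not repo API): how the feeds
# from create_feeds and a model's forward output are typically combined via
# create_fetchs inside the same static program. `model` is assumed to be a
# static-graph forward callable; the architecture dict is illustrative.
def _demo_create_fetchs(model, feeds, config):
    out = model(feeds['data'])  # forward pass on the static data variable
    fetchs = create_fetchs(
        out,
        feeds,
        architecture={"name": "ResNet50"},  # illustrative
        class_num=None,
        use_mix=False,
        config=config,
        mode="Train")
    return fetchs['loss'][0]  # the loss variable to optimize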
def main(args):
    paddle.seed(12345)
    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1
    config["use_data_parallel"] = use_data_parallel

    if config["use_data_parallel"]:
        paddle.distributed.init_parallel_env()

    net = program.create_model(config.ARCHITECTURE, config.classes_num)
    optimizer, lr_scheduler = program.create_optimizer(
        config, parameter_list=net.parameters())

    dp_net = net
    if config["use_data_parallel"]:
        find_unused_parameters = config.get("find_unused_parameters", False)
        dp_net = paddle.DataParallel(
            net, find_unused_parameters=find_unused_parameters)

    # load model from checkpoint or pretrained model
    init_model(config, net, optimizer)

    train_dataloader = Reader(config, 'train', places=place)()

    if config.validate:
        valid_dataloader = Reader(config, 'valid', places=place)()

    last_epoch_id = config.get("last_epoch", -1)
    best_top1_acc = 0.0  # best top1 acc record
    best_top1_epoch = last_epoch_id

    vdl_writer_path = config.get("vdl_dir", None)
    vdl_writer = None
    if vdl_writer_path:
        from visualdl import LogWriter
        vdl_writer = LogWriter(vdl_writer_path)
    # Ensure that the vdl log file can be closed normally
    try:
        for epoch_id in range(last_epoch_id + 1, config.epochs):
            net.train()
            # 1. train with train dataset
            program.run(train_dataloader, config, dp_net, optimizer,
                        lr_scheduler, epoch_id, 'train', vdl_writer)

            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                net.eval()
                with paddle.no_grad():
                    top1_acc = program.run(valid_dataloader, config, net,
                                           None, None, epoch_id, 'valid',
                                           vdl_writer)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    best_top1_epoch = epoch_id
                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(net, optimizer, model_path, "best_model")
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, best_top1_epoch)
                logger.info(message)

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(net, optimizer, model_path, epoch_id)
    except Exception as e:
        logger.error(e)
    finally:
        if vdl_writer:
            vdl_writer.close()
def __init__(self, *args, **kwargs):
    msg = "\"MixCELoss\" is deprecated, please use \"CELoss\" instead."
    logger.error(msg)
    raise DeprecationWarning(msg)
def main(args):
    config = get_config(args.config, overrides=args.override, show=True)
    # quantization-aware training requires validation to be enabled
    if not config.validate and args.use_quant:
        logger.error("=====> Train quant model must use validate!")
        sys.exit(1)

    if args.use_quant:
        config.epochs = config.epochs + 5
        gpu_count = get_gpu_count()
        if gpu_count != 1:
            logger.error(
                "=====> Train quant model must use only one GPU. "
                "Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]`."
            )
            sys.exit(1)
    # set whether to use GPU
    use_gpu = config.get("use_gpu", True)
    places = fluid.cuda_places() if use_gpu else fluid.cpu_places()

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    best_top1_acc = 0.0

    # build the training dataloader and model outputs
    if not config.get('use_ema'):
        train_dataloader, train_fetchs, out, softmax_out = program.build(
            config,
            train_prog,
            startup_prog,
            is_train=True,
            is_distributed=False)
    else:
        train_dataloader, train_fetchs, ema, out, softmax_out = program.build(
            config,
            train_prog,
            startup_prog,
            is_train=True,
            is_distributed=False)

    # build the validation dataloader and model outputs
    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs, _, _ = program.build(
            config,
            valid_prog,
            startup_prog,
            is_train=False,
            is_distributed=False)
        # clone the validation program so computation irrelevant to
        # evaluation can be pruned
        valid_prog = valid_prog.clone(for_test=True)

    # create the executor
    exe = fluid.Executor(places[0])
    exe.run(startup_prog)

    # load the model, either a pretrained model or a checkpoint
    init_model(config, train_prog, exe)

    train_reader = Reader(config, 'train')()
    train_dataloader.set_sample_list_generator(train_reader, places)

    compiled_train_prog = program.compile(config, train_prog,
                                          train_fetchs['loss'][0].name)

    if config.validate:
        valid_reader = Reader(config, 'valid')()
        valid_dataloader.set_sample_list_generator(valid_reader, places)
        compiled_valid_prog = program.compile(
            config, valid_prog, share_prog=compiled_train_prog)

    vdl_writer = LogWriter(args.vdl_dir)

    for epoch_id in range(config.epochs - 5):
        # train one epoch
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', config, vdl_writer)

        # run one round of validation
        if config.validate and epoch_id % config.valid_interval == 0:
            if config.get('use_ema'):
                logger.info(logger.coloring("EMA validate start..."))
                with ema.apply(exe):
                    _ = program.run(valid_dataloader, exe,
                                    compiled_valid_prog, valid_fetchs,
                                    epoch_id, 'valid', config)
                logger.info(logger.coloring("EMA validate over!"))

            top1_acc = program.run(valid_dataloader, exe,
                                   compiled_valid_prog, valid_fetchs,
                                   epoch_id, 'valid', config)

            if vdl_writer:
                logger.scaler('valid_avg', top1_acc, epoch_id, vdl_writer)

            if top1_acc > best_top1_acc:
                best_top1_acc = top1_acc
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, epoch_id)
                logger.info("{:s}".format(logger.coloring(message, "RED")))
                if epoch_id % config.save_interval == 0:
                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(train_prog, model_path, "best_model")

        # save the model
        if epoch_id % config.save_interval == 0:
            model_path = os.path.join(config.model_save_dir,
                                      config.ARCHITECTURE["name"])
            if epoch_id >= 3 and os.path.exists(
                    os.path.join(model_path, str(epoch_id - 3))):
                shutil.rmtree(
                    os.path.join(model_path, str(epoch_id - 3)),
                    ignore_errors=True)
            save_model(train_prog, model_path, epoch_id)

    # quantization-aware training
    if args.use_quant and config.validate:
        # run quantization-aware training
        quant_program = slim.quant.quant_aware(
            train_prog, exe.place, for_test=False)
        # evaluate the quantized model
        val_quant_program = slim.quant.quant_aware(
            valid_prog, exe.place, for_test=True)

        fetch_list = [f[0] for f in train_fetchs.values()]
        metric_list = [f[1] for f in train_fetchs.values()]
        for i in range(5):
            for idx, batch in enumerate(train_dataloader()):
                metrics = exe.run(program=quant_program,
                                  feed=batch,
                                  fetch_list=fetch_list)
                # use j here so the outer epoch counter i is not clobbered
                for j, m in enumerate(metrics):
                    metric_list[j].update(np.mean(m), len(batch[0]))
                fetchs_str = ''.join(
                    [str(m.value) + ' ' for m in metric_list])
                if idx % 10 == 0:
                    logger.info("quant train : " + fetchs_str)

        fetch_list = [f[0] for f in valid_fetchs.values()]
        metric_list = [f[1] for f in valid_fetchs.values()]
        for idx, batch in enumerate(valid_dataloader()):
            metrics = exe.run(program=val_quant_program,
                              feed=batch,
                              fetch_list=fetch_list)
            for i, m in enumerate(metrics):
                metric_list[i].update(np.mean(m), len(batch[0]))
            fetchs_str = ''.join([str(m.value) + ' ' for m in metric_list])
            if idx % 10 == 0:
                logger.info("quant valid: " + fetchs_str)

        # save the quantization-aware trained model
        float_prog, int8_prog = slim.quant.convert(
            val_quant_program, exe.place, save_int8=True)
        fluid.io.save_inference_model(
            dirname=args.output_path,
            feeded_var_names=['feed_image'],
            target_vars=[softmax_out],
            executor=exe,
            main_program=float_prog,
            model_filename='__model__',
            params_filename='__params__')
def main(args):
    image_path_list = get_image_list(args.image_file)
    headers = {"Content-type": "application/json"}

    cnt = 0  # number of images predicted successfully
    predict_time = 0
    all_score = 0.0
    start_time = time.time()

    batch_input_list = []
    img_name_list = []
    for idx, img_path in enumerate(image_path_list):
        img = cv2.imread(img_path)
        if img is None:
            logger.warning(
                "Image file failed to read and has been skipped. The path: {}".
                format(img_path))
            continue
        else:
            img = img[:, :, ::-1]
            data = preprocess(img, args)
            batch_input_list.append(data)
            img_name = img_path.split('/')[-1]
            img_name_list.append(img_name)

        # send a request once a full batch is collected or at the last image
        if len(batch_input_list) == args.batch_size or (
                idx + 1) == len(image_path_list):
            batch_input = np.array(batch_input_list)
            b64str, revert_shape = np_to_b64(batch_input)
            data = {
                "images": b64str,
                "revert_params": {
                    "shape": revert_shape,
                    "dtype": str(batch_input.dtype)
                },
                "top_k": args.top_k
            }
            try:
                r = requests.post(
                    url=args.server_url,
                    headers=headers,
                    data=json.dumps(data))
                r.raise_for_status()
                if r.json()["status"] != "000":
                    msg = r.json()["msg"]
                    raise Exception(msg)
            except Exception as e:
                logger.error("{}, in file(s): {} etc.".format(
                    e, img_name_list[0]))
                continue
            else:
                results = r.json()["results"]
                batch_result_list = results["prediction"]
                elapse = results["elapse"]

                cnt += len(batch_result_list)
                predict_time += elapse

                for number, result_list in enumerate(batch_result_list):
                    all_score += result_list["scores"][0]
                    result_str = ""
                    for i in range(len(result_list["clas_ids"])):
                        result_str += "{}: {:.2f}\t".format(
                            result_list["clas_ids"][i],
                            result_list["scores"][i])
                    logger.info("File:{}, The top-{} result(s): {}".format(
                        img_name_list[number], args.top_k, result_str))
            finally:
                batch_input_list = []
                img_name_list = []

    total_time = time.time() - start_time
    logger.info("The average time of prediction cost: {:.3f} s/image".format(
        predict_time / cnt))
    logger.info("The average time cost: {:.3f} s/image".format(
        total_time / cnt))
    logger.info("The average top-1 score: {:.3f}".format(all_score / cnt))