def init_res(self, stages_pattern, return_patterns=None, return_stages=None):
    if return_patterns and return_stages:
        msg = "The 'return_stages' would be ignored when 'return_patterns' is set."
        logger.warning(msg)
        return_stages = None

    if return_stages is True:
        return_patterns = stages_pattern
    # return_stages may be an int or a list of ints
    if isinstance(return_stages, int):
        return_stages = [return_stages]
    if isinstance(return_stages, list):
        if max(return_stages) >= len(stages_pattern) or min(
                return_stages) < 0:
            msg = f"Invalid 'return_stages': out-of-range value(s) have been ignored. The stages' pattern list is {stages_pattern}."
            logger.warning(msg)
            return_stages = [
                val for val in return_stages
                if 0 <= val < len(stages_pattern)
            ]
        return_patterns = [stages_pattern[i] for i in return_stages]

    if return_patterns:
        self.update_res(return_patterns)
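# A minimal sketch (stand-in values) of the index-to-pattern mapping done
# above: out-of-range stage indices are dropped before the lookup.
stages_pattern = ["blocks[2]", "blocks[4]", "blocks[10]", "blocks[15]"]
return_stages = [0, 2, 7]  # 7 is out of range and is filtered out
return_stages = [v for v in return_stages if 0 <= v < len(stages_pattern)]
return_patterns = [stages_pattern[i] for i in return_stages]
assert return_patterns == ["blocks[2]", "blocks[10]"]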
def override(dl, ks, v):
    """
    Recursively replace a value in a dict or list

    Args:
        dl(dict or list): dict or list to be replaced
        ks(list): list of keys
        v(str): value to be replaced
    """

    def str2num(v):
        try:
            return eval(v)
        except Exception:
            return v

    assert isinstance(dl, (list, dict)), ('{} should be a list or a dict'.format(dl))
    assert len(ks) > 0, ('length of keys should be larger than 0')
    if isinstance(dl, list):
        k = str2num(ks[0])
        if len(ks) == 1:
            assert k < len(dl), ('index({}) out of range({})'.format(k, dl))
            dl[k] = str2num(v)
        else:
            override(dl[k], ks[1:], v)
    else:
        if len(ks) == 1:
            # assert ks[0] in dl, ('{} does not exist in {}'.format(ks[0], dl))
            if not ks[0] in dl:
                logger.warning('A new field ({}) detected!'.format(ks[0]))
            dl[ks[0]] = str2num(v)
        else:
            if not ks[0] in dl:
                logger.warning('A new field ({}) detected!'.format(ks[0]))
                dl[ks[0]] = {}
            override(dl[ks[0]], ks[1:], v)
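# Usage sketch: drilling into a nested config with a key path. The config
# dict here is a stand-in, not one from the repo.
config = {"Optimizer": {"lr": {"learning_rate": 0.1}}, "TRAIN": [{"batch_size": 32}]}
override(config, ["Optimizer", "lr", "learning_rate"], "0.01")
override(config, ["TRAIN", "0", "batch_size"], "64")  # list indices arrive as strings
assert config["Optimizer"]["lr"]["learning_rate"] == 0.01
assert config["TRAIN"][0]["batch_size"] == 64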
def __init__(self, class_num, **op_dict):
    """Build OpSampler

    Raises:
        Exception: The "prob" parameter of the operator(s) is set incorrectly.
    """
    if not class_num:
        msg = "Please set \"Arch.class_num\" in config if use \"OpSampler\"."
        logger.error(Exception(msg))
        raise Exception(msg)

    if len(op_dict) < 1:
        msg = "ConfigWarning: No operator in \"OpSampler\". \"OpSampler\" has been skipped."
        logger.warning(msg)

    self.ops = {}
    total_prob = 0
    for op_name in op_dict:
        param = op_dict[op_name]
        if "prob" not in param:
            msg = f"ConfigWarning: Parameter \"prob\" should be set when use operator in \"OpSampler\". The operator \"{op_name}\"'s prob has been set \"0\"."
            logger.warning(msg)
        prob = param.pop("prob", 0)
        total_prob += prob
        param.update({"class_num": class_num})
        op = eval(op_name)(**param)
        self.ops.update({op: prob})

    if total_prob > 1:
        msg = "ConfigError: The total prob of operators in \"OpSampler\" should be no greater than 1."
        logger.error(Exception(msg))
        raise Exception(msg)

    # add "None Op" when total_prob < 1; the "None Op" does nothing
    self.ops[None] = 1 - total_prob
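# The dict built above maps each operator to its sampling probability, with
# None holding the remainder. A sketch of how one op per batch could be
# drawn (assumed behavior of the sampler's call step, not code from this
# section):
import random

def sample_op(ops: dict):
    op = random.choices(list(ops.keys()), weights=list(ops.values()), k=1)[0]
    return op  # None -> pass the batch through unchanged

# e.g. ops == {mixup_op: 0.5, None: 0.5} applies mixup on roughly half the batches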
def create_model(architecture, image, classes_num, config, is_train):
    """
    Create a model

    Args:
        architecture(dict): architecture information,
            name(such as ResNet50) is needed
        image(variable): model input variable
        classes_num(int): num of classes
        config(dict): model config

    Returns:
        out(variable): model output variable
    """
    name = architecture["name"]
    params = architecture.get("params", {})

    if "data_format" in config:
        params["data_format"] = config["data_format"]
    input_image_channel = config.get('image_shape', [3, 224, 224])[0]
    if input_image_channel != 3:
        logger.warning(
            "Input image channel is changed to {}, maybe for better speed-up".
            format(input_image_channel))
        params["input_image_channel"] = input_image_channel
    if "is_test" in params:
        params['is_test'] = not is_train
    model = architectures.__dict__[name](class_dim=classes_num, **params)
    out = model(image)
    return out
def normal_predict(self):
    image_list = get_image_list(self.args.image_file)
    batch_input_list = []
    img_name_list = []
    cnt = 0
    for idx, img_path in enumerate(image_list):
        img = cv2.imread(img_path)
        if img is None:
            logger.warning(
                "Image file failed to read and has been skipped. The path: {}".
                format(img_path))
            continue
        else:
            img = img[:, :, ::-1]
            img = preprocess(img, self.args)
            batch_input_list.append(img)
            img_name = img_path.split("/")[-1]
            img_name_list.append(img_name)
            cnt += 1

        if cnt % self.args.batch_size == 0 or (idx + 1) == len(image_list):
            batch_outputs = self.predict(np.array(batch_input_list))
            batch_result_list = postprocess(batch_outputs, self.args.top_k)

            for number, result_dict in enumerate(batch_result_list):
                filename = img_name_list[number]
                clas_ids = result_dict["clas_ids"]
                scores_str = "[{}]".format(", ".join(
                    "{:.2f}".format(r) for r in result_dict["scores"]))
                print("File:{}, Top-{} result: class id(s): {}, score(s): {}".
                      format(filename, self.args.top_k, clas_ids, scores_str))
            batch_input_list = []
            img_name_list = []
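# The flush condition above emits a batch every `batch_size` images and once
# more for the trailing remainder; a minimal sketch with stand-in values:
image_list = list(range(10))
batch, flushed, batch_size = [], [], 4
for idx, item in enumerate(image_list):
    batch.append(item)
    if len(batch) == batch_size or (idx + 1) == len(image_list):
        flushed.append(list(batch))
        batch = []
assert [len(b) for b in flushed] == [4, 4, 2]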
def build_optimizer(config, epochs, step_each_epoch, model_list=None):
    config = copy.deepcopy(config)
    # step1 build lr
    lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch)
    logger.debug("build lr ({}) success..".format(lr))
    # step2 build regularization
    if 'regularizer' in config and config['regularizer'] is not None:
        if 'weight_decay' in config:
            logger.warning(
                "ConfigError: Only one of regularizer and weight_decay can be set in Optimizer Config. \"weight_decay\" has been ignored."
            )
        reg_config = config.pop('regularizer')
        reg_name = reg_config.pop('name') + 'Decay'
        reg = getattr(paddle.regularizer, reg_name)(**reg_config)
        config["weight_decay"] = reg
        logger.debug("build regularizer ({}) success..".format(reg))
    # step3 build optimizer
    optim_name = config.pop('name')
    if 'clip_norm' in config:
        clip_norm = config.pop('clip_norm')
        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
    else:
        grad_clip = None
    optim = getattr(optimizer, optim_name)(learning_rate=lr,
                                           grad_clip=grad_clip,
                                           **config)(model_list=model_list)
    logger.debug("build optimizer ({}) success..".format(optim))
    return optim, lr
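# A hypothetical Optimizer config illustrating the pop order above: the
# 'regularizer' name gets the 'Decay' suffix to pick paddle.regularizer.L2Decay,
# and 'name' selects an optimizer wrapper in the local `optimizer` module.
# All values are stand-ins, not a config from the repo.
optimizer_config = {
    'name': 'Momentum',
    'momentum': 0.9,
    'lr': {'name': 'Cosine', 'learning_rate': 0.1},
    'regularizer': {'name': 'L2', 'coeff': 1e-4},  # -> paddle.regularizer.L2Decay
}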
def main(args):
    benchmark_file_list = args.benchmark_file_list
    model_infos = parse_model_infos(benchmark_file_list)
    right_models = []
    wrong_models = []

    for model_info in model_infos:
        try:
            pretrained_url = model_info["pretrain_path"]
            fname = _download(pretrained_url, args.pretrained_dir)
            pretrained_path = os.path.splitext(fname)[0]
            if pretrained_url.endswith("tar"):
                path = _decompress(fname)
                pretrained_path = os.path.join(
                    os.path.dirname(pretrained_path), path)

            args.config = model_info["config_path"]
            args.override = [
                "pretrained_model={}".format(pretrained_path),
                "VALID.batch_size=256",
                "VALID.num_workers=16",
                "load_static_weights=True",
                "print_interval=100",
            ]

            manager = Manager()
            return_dict = manager.dict()

            # A hack method to avoid name conflict.
            # Multi-process maybe a better method here.
            # More details can be seen in branch 2.0-beta.
            # TODO: fluid needs to be removed in the future.
            with paddle.utils.unique_name.guard():
                eval.main(args, return_dict)

            top1_acc = return_dict.get("top1_acc", 0.0)
        except Exception as e:
            logger.error(e)
            top1_acc = 0.0

        diff = abs(top1_acc - model_info["top1_acc"])
        if diff > 0.001:
            err_info = "[{}] Top-1 acc diff should be <= 0.001 but got diff {}, gt acc: {}, eval acc: {}".format(
                model_info["model_name"], diff, model_info["top1_acc"],
                top1_acc)
            logger.warning(err_info)
            wrong_models.append(model_info["model_name"])
        else:
            right_models.append(model_info["model_name"])

    logger.info("number of right models: {}, they are: {}".format(
        len(right_models), right_models))
    logger.info("number of wrong models: {}, they are: {}".format(
        len(wrong_models), wrong_models))
def _mkdir_if_not_exist(path):
    """
    mkdir if not exists, ignore the exception when multiprocess mkdir together
    """
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(path):
                logger.warning(
                    'directory {} has already been created by another process'.
                    format(path))
            else:
                raise OSError('Failed to mkdir {}'.format(path))
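# On Python >= 3.2 the same race-safe behavior is available directly from
# the standard library; a minimal equivalent sketch:
import os

def _mkdir_if_not_exist_py3(path):
    os.makedirs(path, exist_ok=True)  # no error if the directory already exists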
def load_params(exe, prog, path, ignore_params=None):
    """
    Load model from the given path.

    Args:
        exe (fluid.Executor): The fluid.Executor object.
        prog (fluid.Program): load weight to which Program object.
        path (string): URL string or local model path.
        ignore_params (list): ignore variable to load when finetuning.
            It can be specified by finetune_exclude_pretrained_params
            and the usage can refer to the document
            docs/advanced_tutorials/TRANSFER_LEARNING.md
    """
    if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
        raise ValueError("Model pretrain path {} does not "
                         "exist.".format(path))

    logger.info(
        logger.coloring('Loading parameters from {}...'.format(path),
                        'HEADER'))

    ignore_set = set()
    state = _load_state(path)

    # ignore the parameter which mismatch the shape
    # between the model and pretrain weight.
    all_var_shape = {}
    for block in prog.blocks:
        for param in block.all_parameters():
            all_var_shape[param.name] = param.shape
    ignore_set.update([
        name for name, shape in all_var_shape.items()
        if name in state and shape != state[name].shape
    ])

    if ignore_params:
        all_var_names = [var.name for var in prog.list_vars()]
        ignore_list = filter(
            lambda var: any([re.match(name, var) for name in ignore_params]),
            all_var_names)
        ignore_set.update(list(ignore_list))

    if len(ignore_set) > 0:
        for k in ignore_set:
            if k in state:
                logger.warning(
                    'variable {} has been excluded automatically'.format(k))
                del state[k]

    paddle.static.set_program_state(prog, state)
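# The ignore_params entries act as regex prefixes via re.match; a small
# stand-alone sketch of that filter with stand-in variable names:
import re

all_var_names = ["fc_0.w_0", "fc_0.b_0", "conv1_weights"]
ignore_params = ["fc_.*"]
ignored = [v for v in all_var_names
           if any(re.match(p, v) for p in ignore_params)]
assert ignored == ["fc_0.w_0", "fc_0.b_0"]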
def __init__(self,
             step_each_epoch,
             decay_epochs,
             values,
             epochs,
             warmup_epoch=0,
             warmup_start_lr=0.0,
             last_epoch=-1,
             **kwargs):
    super().__init__()
    if warmup_epoch >= epochs:
        msg = f"When using warm up, the value of \"Global.epochs\" must be greater than the value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}."
        logger.warning(msg)
        warmup_epoch = epochs
    self.boundaries = [step_each_epoch * e for e in decay_epochs]
    self.values = values
    self.last_epoch = last_epoch
    self.warmup_steps = round(warmup_epoch * step_each_epoch)
    self.warmup_start_lr = warmup_start_lr
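# Worked numbers for the piecewise schedule above (stand-in values): with
# 625 steps per epoch and decay at epochs 30 and 60, the step boundaries
# land at 18750 and 37500, and a 5-epoch warmup spans the first 3125 steps.
step_each_epoch, decay_epochs, warmup_epoch = 625, [30, 60], 5
boundaries = [step_each_epoch * e for e in decay_epochs]
warmup_steps = round(warmup_epoch * step_each_epoch)
assert boundaries == [18750, 37500] and warmup_steps == 3125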
def create_model(architecture, image, classes_num, config, is_train):
    """
    Create a model

    Args:
        architecture(dict): architecture information,
            name(such as ResNet50) is needed
        image(variable): model input variable
        classes_num(int): num of classes
        config(dict): model config

    Returns:
        out(variable): model output variable
    """
    use_pure_fp16 = config.get("use_pure_fp16", False)
    name = architecture["name"]
    params = architecture.get("params", {})

    data_format = "NCHW"
    if "data_format" in config:
        params["data_format"] = config["data_format"]
        data_format = config["data_format"]
    input_image_channel = config.get('image_shape', [3, 224, 224])[0]
    if input_image_channel != 3:
        logger.warning(
            "Input image channel is changed to {}, maybe for better speed-up".
            format(input_image_channel))
        params["input_image_channel"] = input_image_channel
    if "is_test" in params:
        params['is_test'] = not is_train
    model = architectures.__dict__[name](class_dim=classes_num, **params)

    if use_pure_fp16 and not config.get("use_dali", False):
        image = image.astype('float16')
    if data_format == "NHWC":
        image = paddle.tensor.transpose(image, [0, 2, 3, 1])
        image.stop_gradient = True
    out = model(image)
    if use_pure_fp16:
        cast_model_to_fp16(paddle.static.default_main_program())
        out = out.astype('float32')
    return out
def __init__(self,
             learning_rate,
             step_each_epoch,
             epochs,
             eta_min=0.0,
             warmup_epoch=0,
             warmup_start_lr=0.0,
             last_epoch=-1,
             **kwargs):
    super().__init__()
    if warmup_epoch >= epochs:
        msg = f"When using warm up, the value of \"Global.epochs\" must be greater than the value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}."
        logger.warning(msg)
        warmup_epoch = epochs
    self.learning_rate = learning_rate
    self.T_max = (epochs - warmup_epoch) * step_each_epoch
    self.eta_min = eta_min
    self.last_epoch = last_epoch
    self.warmup_steps = round(warmup_epoch * step_each_epoch)
    self.warmup_start_lr = warmup_start_lr
def __init__(self, interpolation=None, backend="cv2"): _cv2_interp_from_str = { 'nearest': cv2.INTER_NEAREST, 'bilinear': cv2.INTER_LINEAR, 'area': cv2.INTER_AREA, 'bicubic': cv2.INTER_CUBIC, 'lanczos': cv2.INTER_LANCZOS4 } _pil_interp_from_str = { 'nearest': Image.NEAREST, 'bilinear': Image.BILINEAR, 'bicubic': Image.BICUBIC, 'box': Image.BOX, 'lanczos': Image.LANCZOS, 'hamming': Image.HAMMING } def _pil_resize(src, size, resample): pil_img = Image.fromarray(src) pil_img = pil_img.resize(size, resample) return np.asarray(pil_img) if backend.lower() == "cv2": if isinstance(interpolation, str): interpolation = _cv2_interp_from_str[interpolation.lower()] # compatible with opencv < version 4.4.0 elif interpolation is None: interpolation = cv2.INTER_LINEAR self.resize_func = partial(cv2.resize, interpolation=interpolation) elif backend.lower() == "pil": if isinstance(interpolation, str): interpolation = _pil_interp_from_str[interpolation.lower()] self.resize_func = partial(_pil_resize, resample=interpolation) else: logger.warning( f"The backend of Resize only support \"cv2\" or \"PIL\". \"f{backend}\" is unavailable. Use \"cv2\" instead." ) self.resize_func = cv2.resize
def parse_pattern_str(pattern: str, parent_layer: nn.Layer) -> Union[
        None, List[Dict[str, Union[nn.Layer, str, None]]]]:
    """parse the string type pattern.

    Args:
        pattern (str): The pattern to describe layer.
        parent_layer (nn.Layer): The root layer relative to the pattern.

    Returns:
        Union[None, List[Dict[str, Union[nn.Layer, str, None]]]]: None if failed.
            If successful, the members are layers parsed in order:
                [
                    {"layer": first layer, "name": first layer's name parsed, "index": first layer's index parsed if exist},
                    {"layer": second layer, "name": second layer's name parsed, "index": second layer's index parsed if exist},
                    ...
                ]
    """
    pattern_list = pattern.split(".")
    if not pattern_list:
        msg = f"The pattern('{pattern}') is illegal. Please check and retry."
        logger.warning(msg)
        return None

    layer_list = []
    while len(pattern_list) > 0:
        if '[' in pattern_list[0]:
            target_layer_name = pattern_list[0].split('[')[0]
            target_layer_index = pattern_list[0].split('[')[1].split(']')[0]
        else:
            target_layer_name = pattern_list[0]
            target_layer_index = None

        target_layer = getattr(parent_layer, target_layer_name, None)

        if target_layer is None:
            msg = f"Not found layer named('{target_layer_name}') specified in pattern('{pattern}')."
            logger.warning(msg)
            return None

        if target_layer_index and target_layer:
            if int(target_layer_index) < 0 or int(target_layer_index) >= len(
                    target_layer):
                msg = f"Not found layer by index('{target_layer_index}') specified in pattern('{pattern}'). The index should be in [0, {len(target_layer)})."
                logger.warning(msg)
                return None
            target_layer = target_layer[int(target_layer_index)]

        layer_list.append({
            "layer": target_layer,
            "name": target_layer_name,
            "index": target_layer_index
        })

        pattern_list = pattern_list[1:]
        parent_layer = target_layer

    return layer_list
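# How a single pattern token is decomposed above, shown on stand-in input:
token = "blocks[2]"
name = token.split('[')[0]                 # -> "blocks"
index = token.split('[')[1].split(']')[0]  # -> "2"
assert (name, index) == ("blocks", "2")
# so a pattern like "backbone.blocks[2].conv" walks backbone -> blocks ->
# element 2 -> conv, one token per loop iteration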
def stop_after(self, stop_layer_name: str) -> bool:
    """stop forward and backward after 'stop_layer_name'.

    Args:
        stop_layer_name (str): The name of the layer after which forward and backward stop.

    Returns:
        bool: 'True' if successful, 'False' otherwise.
    """
    layer_list = parse_pattern_str(stop_layer_name, self)
    if not layer_list:
        return False

    parent_layer = self
    for layer_dict in layer_list:
        name, index = layer_dict["name"], layer_dict["index"]
        if not set_identity(parent_layer, name, index):
            msg = f"Failed to set the layers after stop_layer_name('{stop_layer_name}') to IdentityLayer. The failing layer's name is '{name}'."
            logger.warning(msg)
            return False
        parent_layer = layer_dict["layer"]

    return True
def run(dataloader,
        exe,
        program,
        feeds,
        fetchs,
        epoch=0,
        mode='train',
        config=None,
        vdl_writer=None,
        lr_scheduler=None):
    """
    Feed data to the model and fetch the measures and loss

    Args:
        dataloader(paddle io dataloader):
        exe():
        program():
        fetchs(dict): dict of measures and the loss
        epoch(int): epoch of training or validation
        mode(str): 'train' or 'valid', used in logging only

    Returns:
    """
    fetch_list = [f[0] for f in fetchs.values()]
    metric_list = [
        ("lr", AverageMeter('lr', 'f', postfix=",", need_avg=False)),
        ("batch_time", AverageMeter('batch_cost', '.5f', postfix=" s,")),
        ("reader_time", AverageMeter('reader_cost', '.5f', postfix=" s,")),
    ]
    topk_name = 'top{}'.format(config.topk)
    metric_list.insert(0, ("loss", fetchs["loss"][1]))
    use_mix = config.get("use_mix", False) and mode == "train"
    if not use_mix:
        metric_list.insert(0, (topk_name, fetchs[topk_name][1]))
        metric_list.insert(0, ("top1", fetchs["top1"][1]))

    metric_list = OrderedDict(metric_list)

    for m in metric_list.values():
        m.reset()

    use_dali = config.get('use_dali', False)
    dataloader = dataloader if use_dali else dataloader()
    tic = time.time()

    idx = 0
    batch_size = None
    while True:
        # DALI may raise RuntimeError for some particular images, such as
        # ImageNet1k/n04418357_26036.JPEG
        try:
            batch = next(dataloader)
        except StopIteration:
            break
        except RuntimeError:
            logger.warning(
                "Except RuntimeError when reading data from dataloader, try to read once again..."
            )
            continue
        idx += 1
        # ignore the warmup iters
        if idx == 5:
            metric_list["batch_time"].reset()
            metric_list["reader_time"].reset()

        metric_list['reader_time'].update(time.time() - tic)

        if use_dali:
            batch_size = batch[0]["feed_image"].shape()[0]
            feed_dict = batch[0]
        else:
            batch_size = batch[0].shape()[0]
            feed_dict = {
                key.name: batch[i]
                for i, key in enumerate(feeds.values())
            }

        metrics = exe.run(program=program,
                          feed=feed_dict,
                          fetch_list=fetch_list)

        for name, m in zip(fetchs.keys(), metrics):
            metric_list[name].update(np.mean(m), batch_size)
        metric_list["batch_time"].update(time.time() - tic)
        if mode == "train":
            metric_list['lr'].update(lr_scheduler.get_lr())

        fetchs_str = ' '.join([
            str(metric_list[key].mean)
            if "time" in key else str(metric_list[key].value)
            for key in metric_list
        ])
        ips_info = " ips: {:.5f} images/sec.".format(
            batch_size / metric_list["batch_time"].avg)
        fetchs_str += ips_info

        if lr_scheduler is not None:
            if lr_scheduler.update_specified:
                curr_global_counter = lr_scheduler.step_each_epoch * epoch + idx
                update = max(
                    0, curr_global_counter - lr_scheduler.update_start_step
                ) % lr_scheduler.update_step_interval == 0
                if update:
                    lr_scheduler.step()
            else:
                lr_scheduler.step()

        if vdl_writer:
            global total_step
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1
        if mode == 'valid':
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} step:{:<4d} {:s}".format(mode, idx,
                                                           fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} {:s} {:s}".format(
                    logger.coloring(epoch_str, "HEADER")
                    if idx == 0 else epoch_str,
                    logger.coloring(step_str, "PURPLE"),
                    logger.coloring(fetchs_str, 'OKGREEN')))

        tic = time.time()

    end_str = ' '.join([str(m.mean) for m in metric_list.values()] +
                       [metric_list["batch_time"].total])
    ips_info = "ips: {:.5f} images/sec.".format(
        batch_size * metric_list["batch_time"].count /
        metric_list["batch_time"].sum)
    if mode == 'valid':
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))
    if use_dali:
        dataloader.reset()

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg
def run(dataloader,
        exe,
        program,
        feeds,
        fetchs,
        epoch=0,
        mode='train',
        config=None,
        vdl_writer=None,
        lr_scheduler=None,
        profiler_options=None):
    """
    Feed data to the model and fetch the measures and loss

    Args:
        dataloader(paddle io dataloader):
        exe():
        program():
        fetchs(dict): dict of measures and the loss
        epoch(int): epoch of training or evaluation
        mode(str): 'train' or 'eval', used in logging only

    Returns:
    """
    fetch_list = [f[0] for f in fetchs.values()]
    metric_dict = OrderedDict([("lr", AverageMeter(
        'lr', 'f', postfix=",", need_avg=False))])

    for k in fetchs:
        metric_dict[k] = fetchs[k][1]

    metric_dict["batch_time"] = AverageMeter(
        'batch_cost', '.5f', postfix=" s,")
    metric_dict["reader_time"] = AverageMeter(
        'reader_cost', '.5f', postfix=" s,")

    for m in metric_dict.values():
        m.reset()

    use_dali = config["Global"].get('use_dali', False)
    tic = time.time()

    if not use_dali:
        dataloader = dataloader()

    idx = 0
    batch_size = None
    while True:
        # DALI may raise RuntimeError for some particular images, such as
        # ImageNet1k/n04418357_26036.JPEG
        try:
            batch = next(dataloader)
        except StopIteration:
            break
        except RuntimeError:
            logger.warning(
                "Except RuntimeError when reading data from dataloader, try to read once again..."
            )
            continue
        idx += 1
        # ignore the warmup iters
        if idx == 5:
            metric_dict["batch_time"].reset()
            metric_dict["reader_time"].reset()

        metric_dict['reader_time'].update(time.time() - tic)

        profiler.add_profiler_step(profiler_options)

        if use_dali:
            batch_size = batch[0]["data"].shape()[0]
            feed_dict = batch[0]
        else:
            batch_size = batch[0].shape()[0]
            feed_dict = {
                key.name: batch[i]
                for i, key in enumerate(feeds.values())
            }

        metrics = exe.run(program=program,
                          feed=feed_dict,
                          fetch_list=fetch_list)

        for name, m in zip(fetchs.keys(), metrics):
            metric_dict[name].update(np.mean(m), batch_size)
        metric_dict["batch_time"].update(time.time() - tic)
        if mode == "train":
            metric_dict['lr'].update(lr_scheduler.get_lr())

        fetchs_str = ' '.join([
            str(metric_dict[key].mean)
            if "time" in key else str(metric_dict[key].value)
            for key in metric_dict
        ])
        ips_info = " ips: {:.5f} images/sec.".format(
            batch_size / metric_dict["batch_time"].avg)
        fetchs_str += ips_info

        if lr_scheduler is not None:
            lr_scheduler.step()

        if vdl_writer:
            global total_step
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1
        if mode == 'eval':
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} step:{:<4d} {:s}".format(mode, idx,
                                                           fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} {:s} {:s}".format(epoch_str, step_str,
                                                    fetchs_str))

        tic = time.time()

    end_str = ' '.join([str(m.mean) for m in metric_dict.values()] +
                       [metric_dict["batch_time"].total])
    ips_info = "ips: {:.5f} images/sec.".format(
        batch_size / metric_dict["batch_time"].avg)

    if mode == 'eval':
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))
    if use_dali:
        dataloader.reset()

    # return top1_acc in order to save the best model
    if mode == 'eval':
        return fetchs["top1"][1].avg
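# AverageMeter is not defined in this section; below is a minimal sketch
# consistent with the attributes run() relies on (update/reset, avg, sum,
# count, and the formatted mean/value/total strings). The repo's own
# implementation may differ in detail.
class AverageMeter:
    def __init__(self, name, fmt='f', postfix="", need_avg=True):
        self.name, self.fmt = name, fmt
        self.postfix, self.need_avg = postfix, need_avg
        self.reset()

    def reset(self):
        self.val, self.sum, self.count = 0, 0, 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)

    @property
    def mean(self):
        return '{}: {:{}}{}'.format(self.name, self.avg, self.fmt,
                                    self.postfix)

    @property
    def value(self):
        return '{}: {:{}}{}'.format(self.name, self.val, self.fmt,
                                    self.postfix)

    @property
    def total(self):
        return '{}_sum: {:{}}{}'.format(self.name, self.sum, self.fmt,
                                        self.postfix)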
def main(args):
    image_path_list = get_image_list(args.image_file)
    headers = {"Content-type": "application/json"}

    cnt = 0
    predict_time = 0
    all_score = 0.0
    start_time = time.time()

    batch_input_list = []
    img_name_list = []
    for idx, img_path in enumerate(image_path_list):
        img = cv2.imread(img_path)
        if img is None:
            logger.warning(
                "Image file failed to read and has been skipped. The path: {}".
                format(img_path))
            continue
        else:
            img = img[:, :, ::-1]
            data = preprocess(img, args)
            batch_input_list.append(data)
            img_name = img_path.split('/')[-1]
            img_name_list.append(img_name)

        if len(batch_input_list) == args.batch_size or (
                idx + 1) == len(image_path_list):
            batch_input = np.array(batch_input_list)
            b64str, revert_shape = np_to_b64(batch_input)
            data = {
                "images": b64str,
                "revert_params": {
                    "shape": revert_shape,
                    "dtype": str(batch_input.dtype)
                },
                "top_k": args.top_k
            }
            try:
                r = requests.post(
                    url=args.server_url,
                    headers=headers,
                    data=json.dumps(data))
                r.raise_for_status()
                if r.json()["status"] != "000":
                    msg = r.json()["msg"]
                    raise Exception(msg)
            except Exception as e:
                logger.error("{}, in file(s): {} etc.".format(
                    e, img_name_list[0]))
                continue
            else:
                results = r.json()["results"]
                batch_result_list = results["prediction"]
                elapse = results["elapse"]

                cnt += len(batch_result_list)
                predict_time += elapse

                for number, result_list in enumerate(batch_result_list):
                    all_score += result_list["scores"][0]
                    result_str = ""
                    for i in range(len(result_list["clas_ids"])):
                        result_str += "{}: {:.2f}\t".format(
                            result_list["clas_ids"][i],
                            result_list["scores"][i])
                    logger.info("File:{}, The top-{} result(s): {}".format(
                        img_name_list[number], args.top_k, result_str))
            finally:
                batch_input_list = []
                img_name_list = []

    total_time = time.time() - start_time
    logger.info("The average time of prediction cost: {:.3f} s/image".format(
        predict_time / cnt))
    logger.info("The average time cost: {:.3f} s/image".format(total_time /
                                                               cnt))
    logger.info("The average top-1 score: {:.3f}".format(all_score / cnt))
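# np_to_b64 is not shown in this section; a minimal sketch consistent with
# how it is used above (a base64 payload plus the shape needed to revert it
# on the server side). The repo's helper may differ in detail.
import base64
import numpy as np

def np_to_b64(images: np.ndarray):
    b64str = base64.b64encode(images.tobytes()).decode("utf8")
    return b64str, list(images.shape)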
def __init__(self, config, mode="train"): assert mode in ["train", "eval", "infer", "export"] self.mode = mode self.config = config self.eval_mode = self.config["Global"].get("eval_mode", "classification") if "Head" in self.config["Arch"] or self.config["Arch"].get( "is_rec", False): self.is_rec = True else: self.is_rec = False # set seed seed = self.config["Global"].get("seed", False) if seed or seed == 0: assert isinstance(seed, int), "The 'seed' must be a integer!" paddle.seed(seed) np.random.seed(seed) random.seed(seed) # init logger self.output_dir = self.config['Global']['output_dir'] log_file = os.path.join(self.output_dir, self.config["Arch"]["name"], f"{mode}.log") init_logger(log_file=log_file) print_config(config) # init train_func and eval_func assert self.eval_mode in ["classification", "retrieval"], logger.error( "Invalid eval mode: {}".format(self.eval_mode)) self.train_epoch_func = train_epoch self.eval_func = getattr(evaluation, self.eval_mode + "_eval") self.use_dali = self.config['Global'].get("use_dali", False) # for visualdl self.vdl_writer = None if self.config['Global'][ 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: vdl_writer_path = os.path.join(self.output_dir, "vdl") if not os.path.exists(vdl_writer_path): os.makedirs(vdl_writer_path) self.vdl_writer = LogWriter(logdir=vdl_writer_path) # set device assert self.config["Global"]["device"] in [ "cpu", "gpu", "xpu", "npu", "mlu" ] self.device = paddle.set_device(self.config["Global"]["device"]) logger.info('train with paddle {} and device {}'.format( paddle.__version__, self.device)) # AMP training self.amp = True if "AMP" in self.config and self.mode == "train" else False if self.amp and self.config["AMP"] is not None: self.scale_loss = self.config["AMP"].get("scale_loss", 1.0) self.use_dynamic_loss_scaling = self.config["AMP"].get( "use_dynamic_loss_scaling", False) else: self.scale_loss = 1.0 self.use_dynamic_loss_scaling = False if self.amp: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_max_inplace_grad_add': 8, } if paddle.is_compiled_with_cuda(): AMP_RELATED_FLAGS_SETTING.update( {'FLAGS_cudnn_batchnorm_spatial_persistent': 1}) paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) if "class_num" in config["Global"]: global_class_num = config["Global"]["class_num"] if "class_num" not in config["Arch"]: config["Arch"]["class_num"] = global_class_num msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." else: msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." 
        logger.warning(msg)
    #TODO(gaotingquan): support rec
    class_num = config["Arch"].get("class_num", None)
    self.config["DataLoader"].update({"class_num": class_num})

    # build dataloader
    if self.mode == 'train':
        self.train_dataloader = build_dataloader(
            self.config["DataLoader"], "Train", self.device, self.use_dali)
    if self.mode == "eval" or (self.mode == "train" and
                               self.config["Global"]["eval_during_train"]):
        if self.eval_mode == "classification":
            self.eval_dataloader = build_dataloader(
                self.config["DataLoader"], "Eval", self.device,
                self.use_dali)
        elif self.eval_mode == "retrieval":
            self.gallery_query_dataloader = None
            if len(self.config["DataLoader"]["Eval"].keys()) == 1:
                key = list(self.config["DataLoader"]["Eval"].keys())[0]
                self.gallery_query_dataloader = build_dataloader(
                    self.config["DataLoader"]["Eval"], key, self.device,
                    self.use_dali)
            else:
                self.gallery_dataloader = build_dataloader(
                    self.config["DataLoader"]["Eval"], "Gallery",
                    self.device, self.use_dali)
                self.query_dataloader = build_dataloader(
                    self.config["DataLoader"]["Eval"], "Query", self.device,
                    self.use_dali)

    # build loss
    if self.mode == "train":
        loss_info = self.config["Loss"]["Train"]
        self.train_loss_func = build_loss(loss_info)
    if self.mode == "eval" or (self.mode == "train" and
                               self.config["Global"]["eval_during_train"]):
        loss_config = self.config.get("Loss", None)
        if loss_config is not None:
            loss_config = loss_config.get("Eval")
            if loss_config is not None:
                self.eval_loss_func = build_loss(loss_config)
            else:
                self.eval_loss_func = None
        else:
            self.eval_loss_func = None

    # build metric
    if self.mode == 'train':
        metric_config = self.config.get("Metric")
        if metric_config is not None:
            metric_config = metric_config.get("Train")
            if metric_config is not None:
                if hasattr(self.train_dataloader, "collate_fn"
                           ) and self.train_dataloader.collate_fn is not None:
                    for m_idx, m in enumerate(metric_config):
                        if "TopkAcc" in m:
                            msg = "'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed."
                            logger.warning(msg)
                            metric_config.pop(m_idx)
                            break
                self.train_metric_func = build_metrics(metric_config)
            else:
                self.train_metric_func = None
        else:
            self.train_metric_func = None

    if self.mode == "eval" or (self.mode == "train" and
                               self.config["Global"]["eval_during_train"]):
        metric_config = self.config.get("Metric")
        if self.eval_mode == "classification":
            if metric_config is not None:
                metric_config = metric_config.get("Eval")
                if metric_config is not None:
                    self.eval_metric_func = build_metrics(metric_config)
        elif self.eval_mode == "retrieval":
            if metric_config is None:
                metric_config = [{"name": "Recallk", "topk": (1, 5)}]
            else:
                metric_config = metric_config["Eval"]
            self.eval_metric_func = build_metrics(metric_config)
    else:
        self.eval_metric_func = None

    # build model
    self.model = build_model(self.config)
    # set @to_static for benchmark, skip this by default.
    apply_to_static(self.config, self.model)

    # load_pretrain
    if self.config["Global"]["pretrained_model"] is not None:
        if self.config["Global"]["pretrained_model"].startswith("http"):
            load_dygraph_pretrain_from_url(
                self.model, self.config["Global"]["pretrained_model"])
        else:
            load_dygraph_pretrain(
                self.model, self.config["Global"]["pretrained_model"])

    # build optimizer
    if self.mode == 'train':
        self.optimizer, self.lr_sch = build_optimizer(
            self.config["Optimizer"], self.config["Global"]["epochs"],
            len(self.train_dataloader), [self.model])

    # for amp training
    if self.amp:
        self.scaler = paddle.amp.GradScaler(
            init_loss_scaling=self.scale_loss,
            use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
        amp_level = self.config['AMP'].get("level", "O1")
        if amp_level not in ["O1", "O2"]:
            msg = "[Parameter Error]: The optimization level of AMP only supports 'O1' and 'O2'. The level has been set to 'O1'."
            logger.warning(msg)
            self.config['AMP']["level"] = "O1"
            amp_level = "O1"
        self.model, self.optimizer = paddle.amp.decorate(
            models=self.model,
            optimizers=self.optimizer,
            level=amp_level,
            save_dtype='float32')

    # for distributed
    world_size = dist.get_world_size()
    self.config["Global"]["distributed"] = world_size != 1
    if world_size != 4 and self.mode == "train":
        msg = f"The training strategy in the config files provided by PaddleClas assumes 4 GPUs, but {world_size} GPU(s) are used in the current training. Please adjust the strategy (learning rate, batch size and so on) accordingly when using these config files."
        logger.warning(msg)
    if self.config["Global"]["distributed"]:
        dist.init_parallel_env()
        self.model = paddle.DataParallel(self.model)

    # build postprocess for infer
    if self.mode == 'infer':
        self.preprocess_func = create_operators(
            self.config["Infer"]["transforms"])
        self.postprocess_func = build_postprocess(
            self.config["Infer"]["PostProcess"])
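# A hypothetical minimal config sketch covering only a few of the Global/Arch
# keys this __init__ reads directly; real PaddleClas configs carry many more
# sections (DataLoader, Loss, Optimizer, Metric, Infer, ...), all of which
# are required depending on the mode.
minimal_config = {
    "Global": {
        "output_dir": "./output",
        "device": "gpu",
        "epochs": 120,
        "eval_during_train": False,
        "pretrained_model": None,
        "use_visualdl": False,
    },
    "Arch": {"name": "ResNet50", "class_num": 1000},
}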