def create_inference_config(self, use_trt=True) -> paddle_infer.Config:
    if use_trt:
        config = paddle_infer.Config()
        config.disable_glog_info()
        config.enable_use_gpu(100, 0)
        config.set_optim_cache_dir(self.cache_dir)
        config.switch_ir_debug()
        config.enable_tensorrt_engine(
            max_batch_size=self.trt_param.max_batch_size,
            workspace_size=self.trt_param.workspace_size,
            min_subgraph_size=self.trt_param.min_subgraph_size,
            precision_mode=self.trt_param.precision,
            use_static=self.trt_param.use_static,
            use_calib_mode=self.trt_param.use_calib_mode)
        if len(self.dynamic_shape.min_input_shape) != 0 and \
                self.dynamic_shape.min_input_shape.keys() == \
                self.dynamic_shape.max_input_shape.keys() and \
                self.dynamic_shape.min_input_shape.keys() == \
                self.dynamic_shape.opt_input_shape.keys():
            config.set_trt_dynamic_shape_info(
                self.dynamic_shape.min_input_shape,
                self.dynamic_shape.max_input_shape,
                self.dynamic_shape.opt_input_shape,
                self.dynamic_shape.disable_trt_plugin_fp16)
        return config
    else:
        config = paddle_infer.Config()
        config.switch_ir_debug(True)
        config.set_optim_cache_dir(self.cache_dir)
        config.disable_glog_info()
        return config
def create_trt_inference_config(self) -> paddle_infer.Config:
    config = paddle_infer.Config()
    config.disable_glog_info()
    config.enable_use_gpu(100, 0)
    config.set_optim_cache_dir(self.cache_dir)
    config.switch_ir_debug()
    return config
def create_predictor(cls, args, config=None):
    if config is None:
        config = inference.Config(
            os.path.join(args.inference_model_dir, "transformer.pdmodel"),
            os.path.join(args.inference_model_dir, "transformer.pdiparams"))
        if args.use_gpu:
            config.enable_use_gpu(100, 0)
        elif args.use_xpu:
            config.enable_xpu(100)
        else:
            # CPU
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
        # Use ZeroCopy.
        config.switch_use_feed_fetch_ops(False)

    predictor = inference.create_predictor(config)
    input_handles = [
        predictor.get_input_handle(name)
        for name in predictor.get_input_names()
    ]
    # Output tensors must be fetched with get_output_handle, not get_input_handle.
    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]
    return cls(predictor, input_handles, output_handles)
def get_truth_val_by_inference(self):
    try:
        import paddle.inference as paddle_infer
    except:
        # when paddle is not installed, directly return
        return
    data = np.array([
        0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584,
        0.6283, 0.4919, 0.1856, 0.0795, -0.0332
    ]).astype("float32")[np.newaxis, :]
    input_dict = {"x": data}

    pd_config = paddle_infer.Config("uci_housing_model/")
    pd_config.disable_gpu()
    pd_config.switch_ir_optim(False)
    predictor = paddle_infer.create_predictor(pd_config)

    input_names = predictor.get_input_names()
    for i, input_name in enumerate(input_names):
        input_handle = predictor.get_input_handle(input_name)
        input_handle.copy_from_cpu(input_dict[input_name])

    predictor.run()

    output_data_dict = {}
    output_names = predictor.get_output_names()
    for _, output_data_name in enumerate(output_names):
        output_handle = predictor.get_output_handle(output_data_name)
        output_data = output_handle.copy_to_cpu()
        output_data_dict[output_data_name] = output_data
    # convert to the same format of Serving output
    output_data_dict["prob"] = output_data_dict["fc_0.tmp_1"]
    del output_data_dict["fc_0.tmp_1"]
    self.truth_val = output_data_dict
def paddle_inference(args):
    import paddle.inference as paddle_infer
    config = paddle_infer.Config(args.model_file, args.params_file)
    predictor = paddle_infer.create_predictor(config)

    input_names = predictor.get_input_names()
    input_handle = predictor.get_input_handle(input_names[0])

    img = cv2.imread(args.image_path)
    # normalize to mean 0.5, std 0.5
    img = (img - 127.5) * 0.00784313725
    # BGR2RGB
    img = img[:, :, ::-1]
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, 0)
    img = img.astype('float32')
    input_handle.copy_from_cpu(img)

    predictor.run()

    output_names = predictor.get_output_names()
    output_handle = predictor.get_output_handle(output_names[0])
    output_data = output_handle.copy_to_cpu()
    print('paddle inference result: ', output_data.shape)
def __init__(self, model_path, param_path, use_gpu=False):
    model_path, param_path = self.check_param(model_path, param_path)
    try:
        config = paddle_infer.Config(model_path, param_path)
    except Exception:
        raise ValueError(
            "The model and parameter files do not match; please check that the correct model and parameters were loaded.")
    if not use_gpu:
        config.enable_mkldnn()
        # TODO: fluid is being deprecated; find another way to check bf16 support
        # if paddle.fluid.core.supports_bfloat16():
        #     config.enable_mkldnn_bfloat16()
        config.switch_ir_optim(True)
        config.set_cpu_math_library_num_threads(10)
    else:
        config.enable_use_gpu(500, 0)
        config.delete_pass("conv_elementwise_add_act_fuse_pass")
        config.delete_pass("conv_elementwise_add2_act_fuse_pass")
        config.delete_pass("conv_elementwise_add_fuse_pass")
        config.switch_ir_optim()
    config.enable_memory_optim()
    # use_tensorrt = False
    # TODO: enabling TensorRT currently raises errors on both Linux and Windows
    # if use_tensorrt:
    #     config.enable_tensorrt_engine(
    #         workspace_size=1 << 30,
    #         precision_mode=paddle_infer.PrecisionType.Float32,
    #         max_batch_size=1,
    #         min_subgraph_size=5,
    #         use_static=False,
    #         use_calib_mode=False,
    #     )
    self.model = paddle_infer.create_predictor(config)
def load(self) -> bool:
    def get_model_files(ext: str) -> str:
        file_list = []
        for filename in os.listdir(model_path):
            if filename.endswith(ext):
                file_list.append(filename)
        if len(file_list) == 0:
            raise Exception("Missing {} model file".format(ext))
        if len(file_list) > 1:
            raise Exception("More than one {} model file".format(ext))
        return os.path.join(model_path, file_list[0])

    model_path = kserve.Storage.download(self.model_dir)
    config = inference.Config(
        get_model_files('.pdmodel'), get_model_files('.pdiparams'))
    # TODO: add GPU support
    config.disable_gpu()

    self.predictor = inference.create_predictor(config)
    # TODO: add support for multiple input_names/output_names
    input_names = self.predictor.get_input_names()
    self.input_tensor = self.predictor.get_input_handle(input_names[0])
    output_names = self.predictor.get_output_names()
    self.output_tensor = self.predictor.get_output_handle(output_names[0])
    self.ready = True
    return self.ready
def eval(args):
    model_file = os.path.join(args.model_path, args.model_filename)
    params_file = os.path.join(args.model_path, args.params_filename)
    config = paddle_infer.Config(model_file, params_file)
    config.enable_mkldnn()

    predictor = paddle_infer.create_predictor(config)

    input_names = predictor.get_input_names()
    input_handle = predictor.get_input_handle(input_names[0])
    output_names = predictor.get_output_names()
    output_handle = predictor.get_output_handle(output_names[0])

    val_dataset = dataset.ImageNetDataset(data_dir=args.data_dir, mode='val')
    eval_loader = paddle.io.DataLoader(
        val_dataset, batch_size=args.batch_size, drop_last=True)

    cost_time = 0.
    total_num = 0.
    correct_1_num = 0
    correct_5_num = 0
    for batch_id, data in enumerate(eval_loader()):
        img_np = np.array([tensor.numpy() for tensor in data[0]])
        label_np = np.array([tensor.numpy() for tensor in data[1]])

        input_handle.reshape(img_np.shape)
        input_handle.copy_from_cpu(img_np)

        t1 = time.time()
        predictor.run()
        t2 = time.time()
        cost_time += (t2 - t1)

        output_data = output_handle.copy_to_cpu()

        for i in range(len(label_np)):
            label = label_np[i][0]
            result = output_data[i, :]
            index = result.argsort()
            total_num += 1
            if index[-1] == label:
                correct_1_num += 1
            if label in index[-5:]:
                correct_5_num += 1

        if batch_id % 10 == 0:
            acc1 = correct_1_num / total_num
            acc5 = correct_5_num / total_num
            avg_time = cost_time / total_num
            print(
                "batch_id {}, acc1 {:.3f}, acc5 {:.3f}, avg time {:.5f} sec/img".
                format(batch_id, acc1, acc5, avg_time))

        if args.test_samples > 0 and \
                (batch_id + 1) * args.batch_size >= args.test_samples:
            break

    acc1 = correct_1_num / total_num
    acc5 = correct_5_num / total_num
    print("End test: test_acc1 {:.3f}, test_acc5 {:.5f}".format(acc1, acc5))
def create_predictor(args, mode, logger):
    if mode == "det":
        model_dir = args.det_model_dir
    elif mode == 'cls':
        model_dir = args.cls_model_dir
    elif mode == 'rec':
        model_dir = args.rec_model_dir
    else:
        model_dir = args.e2e_model_dir

    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    model_file_path = model_dir + "/inference.pdmodel"
    params_file_path = model_dir + "/inference.pdiparams"
    if not os.path.exists(model_file_path):
        logger.info("not find model file path {}".format(model_file_path))
        sys.exit(0)
    if not os.path.exists(params_file_path):
        logger.info("not find params file path {}".format(params_file_path))
        sys.exit(0)

    config = inference.Config(model_file_path, params_file_path)

    if args.use_gpu:
        config.enable_use_gpu(args.gpu_mem, 0)
        if args.use_tensorrt:
            config.enable_tensorrt_engine(
                precision_mode=inference.PrecisionType.Half
                if args.use_fp16 else inference.PrecisionType.Float32,
                max_batch_size=args.max_batch_size)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(6)
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.set_mkldnn_cache_capacity(10)
            config.enable_mkldnn()
            # TODO LDOUBLEV: fix mkldnn bug when batch_size > 1
            # config.set_mkldnn_op({'conv2d', 'depthwise_conv2d', 'pool2d', 'batch_norm'})
            args.rec_batch_num = 1

    # enable memory optim
    config.enable_memory_optim()
    config.disable_glog_info()

    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
    config.switch_use_feed_fetch_ops(False)

    # create predictor
    predictor = inference.create_predictor(config)
    input_names = predictor.get_input_names()
    for name in input_names:
        input_tensor = predictor.get_input_handle(name)
    output_names = predictor.get_output_names()
    output_tensors = []
    for output_name in output_names:
        output_tensor = predictor.get_output_handle(output_name)
        output_tensors.append(output_tensor)
    return predictor, input_tensor, output_tensors
def create_predictor(cls, args, config=None, profile=False, model_name=None):
    if config is None:
        config = inference.Config(
            os.path.join(args.inference_model_dir, "transformer.pdmodel"),
            os.path.join(args.inference_model_dir, "transformer.pdiparams"))
        if args.device == "gpu":
            config.enable_use_gpu(100, 0)
        elif args.device == "xpu":
            config.enable_xpu(100)
        else:
            # CPU
            config.disable_gpu()
            if args.use_mkl:
                config.enable_mkldnn()
                config.set_cpu_math_library_num_threads(args.threads)
        # Use ZeroCopy.
        config.switch_use_feed_fetch_ops(False)

    if profile:
        if args.mod is recorder:
            autolog = args.mod.Recorder(config, args.infer_batch_size,
                                        args.model_name)
        else:
            pid = os.getpid()
            autolog = args.mod.AutoLogger(
                model_name=args.model_name,
                model_precision="fp32",
                batch_size=args.infer_batch_size,
                save_path=args.save_log_path,
                inference_config=config,
                data_shape="dynamic",
                pids=pid,
                process_name=None,
                gpu_ids=0 if args.device == "gpu" else None,
                time_keys=[
                    'preprocess_time', 'inference_time', 'postprocess_time'
                ],
                warmup=0,
                logger=logger)
    else:
        autolog = None

    predictor = inference.create_predictor(config)
    input_handles = [
        predictor.get_input_handle(name)
        for name in predictor.get_input_names()
    ]
    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]
    return cls(predictor, input_handles, output_handles, autolog)
def init_resnet50_predictor(model_dir):
    model_file = model_dir + '.pdmodel'
    params_file = model_dir + '.pdiparams'
    config = inference.Config()
    config.set_prog_file(model_file)
    config.set_params_file(params_file)
    # Enable GPU prediction with 500 MB of initial GPU memory on device 0.
    # (Config.use_gpu() is only a query; enable_use_gpu() is what turns the GPU on.)
    config.enable_use_gpu(500, 0)
    predictor = inference.create_predictor(config)
    return predictor
def init_predictor(model_dir):
    # refer https://paddle-inference.readthedocs.io/en/latest/api_reference/python_api_doc/Config/GPUConfig.html
    model_file = model_dir + '.pdmodel'
    params_file = model_dir + '.pdiparams'
    config = inference.Config()
    config.set_prog_file(model_file)
    config.set_params_file(params_file)
    # Enable GPU prediction: initialize 50 MB of GPU memory on device id 0.
    config.enable_use_gpu(50, 0)
    predictor = inference.create_predictor(config)
    return predictor
def infer(args):
    model_name = 'plato-xl'

    tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

    context = [
        "Hi , Becky , what's up ?",
        "Not much , except that my mother-in-law is driving me up the wall .",
        "What's the problem ?"
    ]

    data = tokenizer.dialogue_encode(
        history=context,
        add_start_token_as_response=True,
        return_length=True,
        return_role_ids=args.use_role,
        position_style=args.position_style)

    # Load FasterTransformer lib.
    load("FasterTransformer", verbose=True)

    config = paddle_infer.Config(args.inference_model_dir + "plato.pdmodel",
                                 args.inference_model_dir + "plato.pdiparams")
    config.enable_use_gpu(100, 0)
    config.disable_glog_info()
    predictor = paddle_infer.create_predictor(config)

    input_handles = {}
    for name in predictor.get_input_names():
        input_handles[name] = predictor.get_input_handle(name)
        if name == "attention_mask":
            input_handles[name].copy_from_cpu(
                np.expand_dims(
                    np.asarray(data[name], dtype="float32"), axis=(0, 1)))
        else:
            input_handles[name].copy_from_cpu(
                np.asarray(data[name], dtype="int32").reshape([1, -1]))

    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]

    predictor.run()

    output = [output_handle.copy_to_cpu() for output_handle in output_handles]

    for sample in output[0].transpose([1, 0]).tolist():
        print(" ".join(postprocess_response(sample, tokenizer)))
def get(self):
    # Create predictor, if one doesn't exist, when inference is run
    if not self._predictor:
        # If model isn't saved, save model to a temp dir
        # because predictor init requires the path to a saved model
        if self._model_path is None:
            self._model_path = tempfile.TemporaryDirectory().name
            self._save(self._model_path)

        config = paddle_infer.Config(self._model_path + ".pdmodel",
                                     self._model_path + ".pdiparams")
        config.enable_memory_optim()
        predictor = paddle_infer.create_predictor(config)

        self._predictor = predictor
    return self._predictor
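# --- Usage sketch (not from the original source; names and shapes are assumptions) ---
# The predictor returned by get() is driven with the same zero-copy handle flow
# used by the other snippets in this section. `wrapper` below is a hypothetical
# instance of the class that owns get(), and 1x3x224x224 is only a placeholder shape.
import numpy as np

predictor = wrapper.get()
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(np.zeros([1, 3, 224, 224], dtype="float32"))
predictor.run()
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
print(output_handle.copy_to_cpu().shape)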
def create_inference_config(self,
                            passes: Optional[List[str]] = None,
                            use_gpu: bool = False,
                            use_mkldnn: bool = False,
                            ir_optim: Optional[bool] = None):
    config = paddle_infer.Config()
    config.switch_ir_debug(True)
    config.disable_glog_info()
    if ir_optim is not None:
        config.switch_ir_optim(ir_optim)
    if use_gpu:
        config.enable_use_gpu(100, 0)
    if use_mkldnn:
        config.enable_mkldnn()
    if passes is not None:
        config.pass_builder().set_passes(passes)
    return config
def create_predictor(args, mode, logger):
    # if mode == "det":
    #     model_dir = args[]
    # elif mode == 'cls':
    #     model_dir = args.cls_model_dir
    # else:
    #     model_dir = args.rec_model_dir
    model_dir = args['model_dir']

    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    model_file_path = model_dir + "/inference.pdmodel"
    params_file_path = model_dir + "/inference.pdiparams"
    if not os.path.exists(model_file_path):
        logger.info("not find model file path {}".format(model_file_path))
        sys.exit(0)
    if not os.path.exists(params_file_path):
        logger.info("not find params file path {}".format(params_file_path))
        sys.exit(0)

    config = inference.Config(model_file_path, params_file_path)

    if args['use_gpu']:
        config.enable_use_gpu(8000, 0)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(6)

    # config.enable_memory_optim()
    config.disable_glog_info()

    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
    config.switch_use_feed_fetch_ops(False)

    # create predictor
    predictor = inference.create_predictor(config)
    input_names = predictor.get_input_names()
    for name in input_names:
        input_tensor = predictor.get_input_handle(name)
    output_names = predictor.get_output_names()
    output_tensors = []
    for output_name in output_names:
        output_tensor = predictor.get_output_handle(output_name)
        output_tensors.append(output_tensor)
    return predictor, input_tensor, output_tensors
def init_paddle_inference_config(args):
    import paddle.inference as paddle_infer
    config = paddle_infer.Config(args.model_file, args.params_file)
    if hasattr(args, 'precision'):
        if args.precision == "fp16" and args.use_tensorrt:
            precision = paddle_infer.PrecisionType.Half
        elif args.precision == "int8":
            precision = paddle_infer.PrecisionType.Int8
        else:
            precision = paddle_infer.PrecisionType.Float32
    else:
        precision = paddle_infer.PrecisionType.Float32

    if args.use_gpu:
        gpu_id = get_infer_gpuid()
        if gpu_id is None:
            raise ValueError(
                "Not found GPU in current device. Please check your device or set args.use_gpu as False"
            )
        config.enable_use_gpu(args.gpu_mem, 0)
        if args.use_tensorrt:
            config.enable_tensorrt_engine(
                precision_mode=precision,
                max_batch_size=args.max_batch_size,
                min_subgraph_size=args.min_subgraph_size)  # skip the minimum trt subgraph
            min_input_shape = {"x": [1, 3, 10, 10]}
            max_input_shape = {"x": [1, 3, 1000, 1000]}
            opt_input_shape = {"x": [1, 3, 112, 112]}
            config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
                                              opt_input_shape)
    else:
        config.disable_gpu()
        cpu_threads = args.cpu_threads if hasattr(args, "cpu_threads") else 10
        config.set_cpu_math_library_num_threads(cpu_threads)
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.enable_mkldnn()
            config.set_mkldnn_cache_capacity(10)
            if args.precision == "fp16":
                config.enable_mkldnn_bfloat16()
    return config
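# --- Follow-up sketch (not from the original source; `args` and its fields are
# assumptions) ---
# The config returned above still needs a predictor built from it. A minimal,
# hedged example: build the predictor and run a dummy input matching the
# optimal TRT shape configured above.
import numpy as np
import paddle.inference as paddle_infer

config = init_paddle_inference_config(args)
predictor = paddle_infer.create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(np.zeros([1, 3, 112, 112], dtype="float32"))
predictor.run()
output = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
print(output.shape)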
def infer(args):
    model_name = 'unimo-text-1.0-lcsts-new'

    tokenizer = UNIMOTokenizer.from_pretrained(model_name)

    inputs = "深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。"

    data = tokenizer.gen_encode(
        inputs,
        add_start_token_for_decoding=True,
        return_length=True,
        is_split_into_words=False)

    # Load FasterTransformer lib.
    load("FasterTransformer", verbose=True)

    config = paddle_infer.Config(
        args.inference_model_dir + "unimo_text.pdmodel",
        args.inference_model_dir + "unimo_text.pdiparams")
    config.enable_use_gpu(100, 0)
    config.disable_glog_info()
    predictor = paddle_infer.create_predictor(config)

    input_handles = {}
    for name in predictor.get_input_names():
        input_handles[name] = predictor.get_input_handle(name)
        if name == "attention_mask":
            input_handles[name].copy_from_cpu(
                np.expand_dims(
                    np.asarray(data[name], dtype="float32"), axis=(0, 1)))
        else:
            input_handles[name].copy_from_cpu(
                np.asarray(data[name], dtype="int32").reshape([1, -1]))

    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]

    predictor.run()

    output = [output_handle.copy_to_cpu() for output_handle in output_handles]

    for sample in output[0].transpose([1, 0]).tolist():
        print("".join(postprocess_response(sample, tokenizer)))
def load_predictor(self, model_file_path, params_file_path):
    """load_predictor initializes the inference engine.

    Args:
        model_file_path: inference model path (*.pdmodel)
        params_file_path: inference parameter path (*.pdiparams)
    Return:
        predictor: Predictor created using Paddle Inference.
        config: Configuration of the predictor.
        input_tensor: Input tensor of the predictor.
        output_tensor: Output tensor of the predictor.
    """
    args = self.args
    config = inference.Config(model_file_path, params_file_path)
    if args.use_gpu:
        config.enable_use_gpu(1000, 0)
    else:
        config.disable_gpu()
        # The thread num should not be greater than the number of cores in the CPU.
        config.set_cpu_math_library_num_threads(4)

    # enable memory optim
    config.enable_memory_optim()
    config.disable_glog_info()

    config.switch_use_feed_fetch_ops(False)
    config.switch_ir_optim(True)

    # create predictor
    predictor = inference.create_predictor(config)

    # get input and output tensor property
    input_names = predictor.get_input_names()
    input_tensor = predictor.get_input_handle(input_names[0])

    output_names = predictor.get_output_names()
    output_tensor = predictor.get_output_handle(output_names[0])

    return predictor, config, input_tensor, output_tensor
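# --- Usage sketch (not from the original source; names below are assumptions) ---
# `engine` stands for a hypothetical instance of the surrounding class; the file
# names and input shape are placeholders for illustration only.
import numpy as np

predictor, config, input_tensor, output_tensor = engine.load_predictor(
    "inference.pdmodel", "inference.pdiparams")
input_tensor.copy_from_cpu(np.ones([1, 3, 224, 224], dtype="float32"))
predictor.run()
result = output_tensor.copy_to_cpu()
print(result.shape)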
def create_predictor(cls, args, config=None, profile=False, model_name=None):
    if config is None:
        config = inference.Config(
            os.path.join(args.inference_model_dir, "transformer.pdmodel"),
            os.path.join(args.inference_model_dir, "transformer.pdiparams"))
        if args.device == "gpu":
            config.enable_use_gpu(100, 0)
        elif args.device == "xpu":
            config.enable_xpu(100)
        else:
            # CPU
            config.disable_gpu()
            if args.use_mkl:
                config.enable_mkldnn()
                config.set_cpu_math_library_num_threads(args.threads)
        # Use ZeroCopy.
        config.switch_use_feed_fetch_ops(False)

    if profile:
        recorder = Recorder(config, args.infer_batch_size, model_name)
    else:
        recorder = None

    predictor = inference.create_predictor(config)
    input_handles = [
        predictor.get_input_handle(name)
        for name in predictor.get_input_names()
    ]
    # Output tensors must be fetched with get_output_handle, not get_input_handle.
    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]
    return cls(predictor, input_handles, output_handles, recorder)
def eval():
    # create predictor
    model_file = os.path.join(FLAGS.model_path, FLAGS.model_filename)
    params_file = os.path.join(FLAGS.model_path, FLAGS.params_filename)
    config = paddle_infer.Config(model_file, params_file)
    if FLAGS.use_gpu:
        config.enable_use_gpu(1000, 0)
    if not FLAGS.ir_optim:
        config.switch_ir_optim(False)

    predictor = paddle_infer.create_predictor(config)

    input_names = predictor.get_input_names()
    input_handle = predictor.get_input_handle(input_names[0])
    output_names = predictor.get_output_names()
    output_handle = predictor.get_output_handle(output_names[0])

    # prepare data
    val_dataset = ImageNetValDataset(FLAGS.data_dir)
    eval_loader = paddle.io.DataLoader(
        val_dataset, batch_size=FLAGS.batch_size, num_workers=5)

    cost_time = 0.
    total_num = 0.
    correct_1_num = 0
    correct_5_num = 0

    for batch_id, data in enumerate(eval_loader()):
        # set input
        img_np = np.array([tensor.numpy() for tensor in data[0]])
        label_np = np.array([tensor.numpy() for tensor in data[1]])

        input_handle.reshape(img_np.shape)
        input_handle.copy_from_cpu(img_np)

        # run
        t1 = time.time()
        predictor.run()
        t2 = time.time()
        cost_time += (t2 - t1)

        output_data = output_handle.copy_to_cpu()

        # calculate accuracy
        for i in range(len(label_np)):
            label = label_np[i][0]
            result = output_data[i, :]
            index = result.argsort()
            total_num += 1
            if index[-1] == label:
                correct_1_num += 1
            if label in index[-5:]:
                correct_5_num += 1

        if batch_id % 10 == 0:
            acc1 = correct_1_num / total_num
            acc5 = correct_5_num / total_num
            avg_time = cost_time / total_num
            print("batch_id {}, acc1 {:.3f}, acc5 {:.3f}, avg time {:.5f} sec/img"
                  .format(batch_id, acc1, acc5, avg_time))

        if FLAGS.test_samples > 0 and \
                (batch_id + 1) * FLAGS.batch_size >= FLAGS.test_samples:
            break

    acc1 = correct_1_num / total_num
    acc5 = correct_5_num / total_num
    avg_time = cost_time / total_num
    print("End test: test image {}".format(total_num))
    print("test_acc1 {:.4f}, test_acc5 {:.4f}, avg time {:.5f} sec/img".format(
        acc1, acc5, avg_time))
    print("\n")
def load_model_config(self,
                      model_path,
                      use_gpu=False,
                      gpu_id=0,
                      use_profile=False,
                      thread_num=1,
                      mem_optim=True,
                      ir_optim=False,
                      use_trt=False,
                      use_lite=False,
                      use_xpu=False,
                      precision="fp32",
                      use_mkldnn=False,
                      mkldnn_cache_capacity=0,
                      mkldnn_op_list=None,
                      mkldnn_bf16_op_list=None,
                      use_feed_fetch_ops=False,
                      use_ascend_cl=False,
                      min_subgraph_size=3,
                      dynamic_shape_info={},
                      use_calib=False):
    """
    Load model configs and create the paddle predictor by Paddle Inference API.

    Args:
        model_path: model config path.
        use_gpu: calculating with gpu, False default.
        gpu_id: gpu id, 0 default.
        use_profile: use predictor profiles, False default.
        thread_num: thread nums of cpu math library, default 1.
        mem_optim: memory optimization, True default.
        ir_optim: enable IR graph optimization, False default.
        use_trt: use nvidia TensorRT optimization, False default.
        use_lite: use Paddle-Lite engine, False default.
        use_xpu: run predict on Baidu Kunlun, False default.
        precision: precision mode, "fp32" default.
        use_mkldnn: use MKLDNN, False default.
        mkldnn_cache_capacity: cache capacity for input shapes, 0 default.
        mkldnn_op_list: op list accelerated using MKLDNN, None default.
        mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default.
        use_feed_fetch_ops: use feed/fetch ops, False default.
        use_ascend_cl: run predict on Huawei Ascend, False default.
        min_subgraph_size: the minimal subgraph size for opening tensorrt to optimize, 3 default.
        dynamic_shape_info: dict including min_input_shape, max_input_shape, opt_input_shape, {} default.
        use_calib: use TensorRT calibration, False default.
    """
    gpu_id = int(gpu_id)
    client_config = "{}/serving_server_conf.prototxt".format(model_path)
    model_conf = m_config.GeneralModelConfig()
    f = open(client_config, 'r')
    model_conf = google.protobuf.text_format.Merge(str(f.read()), model_conf)

    # Init paddle_infer config
    # Paddle's model files and parameter files have multiple naming rules:
    #   1) __model__, __params__
    #   2) *.pdmodel, *.pdiparams
    #   3) __model__, conv2d_1.w_0, conv2d_2.w_0, fc_1.w_0, conv2d_1.b_0, ...
    pdmodel_file_list = self.search_suffix_files(model_path, "*.pdmodel")
    pdiparams_file_list = self.search_suffix_files(model_path, "*.pdiparams")
    if os.path.exists(os.path.join(model_path, "__params__")):
        # case 1) initializing
        config = paddle_infer.Config(
            os.path.join(model_path, "__model__"),
            os.path.join(model_path, "__params__"))
    elif pdmodel_file_list and len(
            pdmodel_file_list) > 0 and pdiparams_file_list and len(
                pdiparams_file_list) > 0:
        # case 2) initializing
        logger.info("pdmodel_file_list:{}, pdiparams_file_list:{}".format(
            pdmodel_file_list, pdiparams_file_list))
        config = paddle_infer.Config(pdmodel_file_list[0],
                                     pdiparams_file_list[0])
    else:
        # case 3) initializing.
        config = paddle_infer.Config(model_path)

    logger.info(
        "LocalPredictor load_model_config params: model_path:{}, use_gpu:{}, "
        "gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{}, "
        "use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, "
        "use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
        "mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, "
        "use_ascend_cl:{}, min_subgraph_size:{}, dynamic_shape_info:{}".format(
            model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
            ir_optim, use_trt, use_lite, use_xpu, precision, use_calib,
            use_mkldnn, mkldnn_cache_capacity, mkldnn_op_list,
            mkldnn_bf16_op_list, use_feed_fetch_ops, use_ascend_cl,
            min_subgraph_size, dynamic_shape_info))

    self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
    self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
    self.feed_names_to_idx_ = {}
    self.fetch_names_to_idx_ = {}

    for i, var in enumerate(model_conf.feed_var):
        self.feed_names_to_idx_[var.alias_name] = i
        self.feed_types_[var.alias_name] = var.feed_type
        self.feed_shapes_[var.alias_name] = var.shape

    for i, var in enumerate(model_conf.fetch_var):
        self.fetch_names_to_idx_[var.alias_name] = i
        self.fetch_types_[var.alias_name] = var.fetch_type
        self.fetch_names_to_type_[var.alias_name] = var.fetch_type

    # set precision of inference.
    precision_type = paddle_infer.PrecisionType.Float32
    if precision is not None and precision.lower() in precision_map:
        precision_type = precision_map[precision.lower()]
    else:
        logger.warning("precision error!!! Please check precision:{}".format(
            precision))
    # set profile
    if use_profile:
        config.enable_profile()
    # set memory optimization
    if mem_optim:
        config.enable_memory_optim()
    # set ir optimization, threads of cpu math library
    config.switch_ir_optim(ir_optim)
    # use feed & fetch ops
    config.switch_use_feed_fetch_ops(use_feed_fetch_ops)
    # pass optim
    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")

    # set cpu & mkldnn
    config.set_cpu_math_library_num_threads(thread_num)
    if use_mkldnn:
        config.enable_mkldnn()
        if precision is not None and precision.lower() == "bf16":
            config.enable_mkldnn_bfloat16()
        if mkldnn_cache_capacity > 0:
            config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
        if mkldnn_op_list is not None:
            config.set_mkldnn_op(mkldnn_op_list)
    # set gpu
    if not use_gpu:
        config.disable_gpu()
    else:
        config.enable_use_gpu(100, gpu_id)
        if use_trt:
            config.enable_tensorrt_engine(
                precision_mode=precision_type,
                workspace_size=1 << 20,
                max_batch_size=32,
                min_subgraph_size=min_subgraph_size,
                use_static=False,
                use_calib_mode=use_calib)

            @ErrorCatch
            @ParamChecker
            def dynamic_shape_info_helper(
                    dynamic_shape_info: lambda dynamic_shape_info:
                    check_dynamic_shape_info(dynamic_shape_info)):
                pass

            _, resp = dynamic_shape_info_helper(dynamic_shape_info)
            if resp.err_no != CustomExceptionCode.OK.value:
                print(
                    "dynamic_shape_info configure error, it should contain "
                    "'min_input_shape', 'max_input_shape', 'opt_input_shape'. {}".
                    format(resp.err_msg))
                kill_stop_process_by_pid("kill", os.getpgid(os.getpid()))

            if len(dynamic_shape_info):
                config.set_trt_dynamic_shape_info(
                    dynamic_shape_info['min_input_shape'],
                    dynamic_shape_info['max_input_shape'],
                    dynamic_shape_info['opt_input_shape'])
    # set lite
    if use_lite:
        config.enable_lite_engine(
            precision_mode=precision_type,
            zero_copy=True,
            passes_filter=[],
            ops_filter=[])
        config.switch_ir_optim(True)
    # set xpu
    if use_xpu:
        # 8MB l3 cache
        config.enable_xpu(8 * 1024 * 1024)
        config.set_xpu_device_id(gpu_id)
    # set ascend cl
    if use_ascend_cl:
        if use_lite:
            # for ascend 310
            nnadapter_device_names = "huawei_ascend_npu"
            nnadapter_context_properties = \
                "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={}".format(gpu_id)
            nnadapter_model_cache_dir = ""
            config.nnadapter() \
                .enable() \
                .set_device_names([nnadapter_device_names]) \
                .set_context_properties(nnadapter_context_properties) \
                .set_model_cache_dir(nnadapter_model_cache_dir)
        else:
            # for ascend 910
            config.enable_npu(gpu_id)
    # set cpu low precision
    if not use_gpu and not use_lite:
        if precision_type == paddle_infer.PrecisionType.Int8:
            logger.warning(
                "PRECISION INT8 is not supported in CPU right now! Please use fp16 or bf16."
            )
            # config.enable_quantizer()
        if precision is not None and precision.lower() == "bf16":
            config.enable_mkldnn_bfloat16()
            if mkldnn_bf16_op_list is not None:
                config.set_bfloat16_op(mkldnn_bf16_op_list)

    @ErrorCatch
    def create_predictor_check(config):
        predictor = paddle_infer.create_predictor(config)
        return predictor

    predictor, resp = create_predictor_check(config)
    if resp.err_no != CustomExceptionCode.OK.value:
        logger.critical(
            "failed to create predictor: {}".format(resp.err_msg),
            exc_info=False)
        print("failed to create predictor: {}".format(resp.err_msg))
        kill_stop_process_by_pid("kill", os.getpgid(os.getpid()))
    self.predictor = predictor
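# --- Usage sketch (not from the original source; all names below are placeholders) ---
# load_model_config() consumes dynamic_shape_info as a dict with the keys
# 'min_input_shape', 'max_input_shape' and 'opt_input_shape', each mapping an
# input name to a shape. A minimal, hedged example assuming an instance named
# `local_predictor`, a single NCHW input called "x", and an invented model dir:
dynamic_shape_info = {
    "min_input_shape": {"x": [1, 3, 224, 224]},
    "max_input_shape": {"x": [8, 3, 224, 224]},
    "opt_input_shape": {"x": [4, 3, 224, 224]},
}
local_predictor.load_model_config(
    "serving_server_model_dir",  # placeholder path containing serving_server_conf.prototxt
    use_gpu=True,
    use_trt=True,
    dynamic_shape_info=dynamic_shape_info)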
def create_predictor(args, mode, logger):
    if mode == "det":
        model_dir = args.det_model_dir
    elif mode == 'cls':
        model_dir = args.cls_model_dir
    elif mode == 'rec':
        model_dir = args.rec_model_dir
    elif mode == 'table':
        model_dir = args.table_model_dir
    else:
        model_dir = args.e2e_model_dir

    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    if args.use_onnx:
        import onnxruntime as ort
        model_file_path = model_dir
        if not os.path.exists(model_file_path):
            raise ValueError(
                "not find model file path {}".format(model_file_path))
        sess = ort.InferenceSession(model_file_path)
        return sess, sess.get_inputs()[0], None, None
    else:
        model_file_path = model_dir + "/inference.pdmodel"
        params_file_path = model_dir + "/inference.pdiparams"
        if not os.path.exists(model_file_path):
            raise ValueError(
                "not find model file path {}".format(model_file_path))
        if not os.path.exists(params_file_path):
            raise ValueError(
                "not find params file path {}".format(params_file_path))

        config = inference.Config(model_file_path, params_file_path)

        if hasattr(args, 'precision'):
            if args.precision == "fp16" and args.use_tensorrt:
                precision = inference.PrecisionType.Half
            elif args.precision == "int8":
                precision = inference.PrecisionType.Int8
            else:
                precision = inference.PrecisionType.Float32
        else:
            precision = inference.PrecisionType.Float32

        if args.use_gpu:
            gpu_id = get_infer_gpuid()
            if gpu_id is None:
                logger.warning(
                    "GPU is not found in current device by nvidia-smi. "
                    "Please check your device or ignore it if run on jetson.")
            config.enable_use_gpu(args.gpu_mem, 0)
            if args.use_tensorrt:
                config.enable_tensorrt_engine(
                    workspace_size=1 << 30,
                    precision_mode=precision,
                    max_batch_size=args.max_batch_size,
                    min_subgraph_size=args.min_subgraph_size)  # skip the minimum trt subgraph
                use_dynamic_shape = True
                if mode == "det":
                    min_input_shape = {
                        "x": [1, 3, 50, 50],
                        "conv2d_92.tmp_0": [1, 120, 20, 20],
                        "conv2d_91.tmp_0": [1, 24, 10, 10],
                        "conv2d_59.tmp_0": [1, 96, 20, 20],
                        "nearest_interp_v2_1.tmp_0": [1, 256, 10, 10],
                        "nearest_interp_v2_2.tmp_0": [1, 256, 20, 20],
                        "conv2d_124.tmp_0": [1, 256, 20, 20],
                        "nearest_interp_v2_3.tmp_0": [1, 64, 20, 20],
                        "nearest_interp_v2_4.tmp_0": [1, 64, 20, 20],
                        "nearest_interp_v2_5.tmp_0": [1, 64, 20, 20],
                        "elementwise_add_7": [1, 56, 2, 2],
                        "nearest_interp_v2_0.tmp_0": [1, 256, 2, 2]
                    }
                    max_input_shape = {
                        "x": [1, 3, 1536, 1536],
                        "conv2d_92.tmp_0": [1, 120, 400, 400],
                        "conv2d_91.tmp_0": [1, 24, 200, 200],
                        "conv2d_59.tmp_0": [1, 96, 400, 400],
                        "nearest_interp_v2_1.tmp_0": [1, 256, 200, 200],
                        "conv2d_124.tmp_0": [1, 256, 400, 400],
                        "nearest_interp_v2_2.tmp_0": [1, 256, 400, 400],
                        "nearest_interp_v2_3.tmp_0": [1, 64, 400, 400],
                        "nearest_interp_v2_4.tmp_0": [1, 64, 400, 400],
                        "nearest_interp_v2_5.tmp_0": [1, 64, 400, 400],
                        "elementwise_add_7": [1, 56, 400, 400],
                        "nearest_interp_v2_0.tmp_0": [1, 256, 400, 400]
                    }
                    opt_input_shape = {
                        "x": [1, 3, 640, 640],
                        "conv2d_92.tmp_0": [1, 120, 160, 160],
                        "conv2d_91.tmp_0": [1, 24, 80, 80],
                        "conv2d_59.tmp_0": [1, 96, 160, 160],
                        "nearest_interp_v2_1.tmp_0": [1, 256, 80, 80],
                        "nearest_interp_v2_2.tmp_0": [1, 256, 160, 160],
                        "conv2d_124.tmp_0": [1, 256, 160, 160],
                        "nearest_interp_v2_3.tmp_0": [1, 64, 160, 160],
                        "nearest_interp_v2_4.tmp_0": [1, 64, 160, 160],
                        "nearest_interp_v2_5.tmp_0": [1, 64, 160, 160],
                        "elementwise_add_7": [1, 56, 40, 40],
                        "nearest_interp_v2_0.tmp_0": [1, 256, 40, 40]
                    }
                    min_pact_shape = {
                        "nearest_interp_v2_26.tmp_0": [1, 256, 20, 20],
                        "nearest_interp_v2_27.tmp_0": [1, 64, 20, 20],
                        "nearest_interp_v2_28.tmp_0": [1, 64, 20, 20],
                        "nearest_interp_v2_29.tmp_0": [1, 64, 20, 20]
                    }
                    max_pact_shape = {
                        "nearest_interp_v2_26.tmp_0": [1, 256, 400, 400],
                        "nearest_interp_v2_27.tmp_0": [1, 64, 400, 400],
                        "nearest_interp_v2_28.tmp_0": [1, 64, 400, 400],
                        "nearest_interp_v2_29.tmp_0": [1, 64, 400, 400]
                    }
                    opt_pact_shape = {
                        "nearest_interp_v2_26.tmp_0": [1, 256, 160, 160],
                        "nearest_interp_v2_27.tmp_0": [1, 64, 160, 160],
                        "nearest_interp_v2_28.tmp_0": [1, 64, 160, 160],
                        "nearest_interp_v2_29.tmp_0": [1, 64, 160, 160]
                    }
                    min_input_shape.update(min_pact_shape)
                    max_input_shape.update(max_pact_shape)
                    opt_input_shape.update(opt_pact_shape)
                elif mode == "rec":
                    if args.rec_algorithm != "CRNN":
                        use_dynamic_shape = False
                    min_input_shape = {"x": [1, 3, 32, 10]}
                    max_input_shape = {"x": [args.rec_batch_num, 3, 32, 1536]}
                    opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
                elif mode == "cls":
                    min_input_shape = {"x": [1, 3, 48, 10]}
                    max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]}
                    opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
                else:
                    use_dynamic_shape = False
                if use_dynamic_shape:
                    config.set_trt_dynamic_shape_info(
                        min_input_shape, max_input_shape, opt_input_shape)
        else:
            config.disable_gpu()
            if hasattr(args, "cpu_threads"):
                config.set_cpu_math_library_num_threads(args.cpu_threads)
            else:
                # default cpu threads as 10
                config.set_cpu_math_library_num_threads(10)
            if args.enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
                if args.precision == "fp16":
                    config.enable_mkldnn_bfloat16()
        # enable memory optim
        config.enable_memory_optim()
        config.disable_glog_info()
        config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
        if mode == 'table':
            config.delete_pass("fc_fuse_pass")  # not supported for table
        config.switch_use_feed_fetch_ops(False)
        config.switch_ir_optim(True)

        # create predictor
        predictor = inference.create_predictor(config)
        input_names = predictor.get_input_names()
        for name in input_names:
            input_tensor = predictor.get_input_handle(name)
        output_names = predictor.get_output_names()
        output_tensors = []
        for output_name in output_names:
            output_tensor = predictor.get_output_handle(output_name)
            output_tensors.append(output_tensor)
        return predictor, input_tensor, output_tensors, config
import os

import cv2
import numpy as np
import paddle.inference as pi

from licsber.cv import parse_img

CHANNEL, HEIGHT, WIDTH = (3, 34, 92)
CHAR_LIST = '12345678ABCDEFHKNPQXYZabcdefhknpxyz'

_now_path = os.path.dirname(__file__)
MODEL_PATH = os.path.join(_now_path, 'models', 'inference.pdmodel')
PARAMS_PATH = os.path.join(_now_path, 'models', 'inference.pdiparams')

_config = pi.Config(MODEL_PATH, PARAMS_PATH)
_predictor = pi.create_predictor(_config)

if CHANNEL == 1:
    def pre_process(img):
        _, binary = cv2.threshold(img, 0x70, 1, cv2.THRESH_BINARY)
        binary = binary[:, :, 0]
        return np.array(binary, dtype='float32').reshape((1, HEIGHT, WIDTH))
elif CHANNEL == 3:
    def pre_process(img):
        return np.array(img, dtype='float32').reshape([CHANNEL, HEIGHT, WIDTH]) / 255
else:
    print('error, cannot pre_process img like this.')


def ctc_decode(text, blank=len(CHAR_LIST)):
def run_test(self, quant=False, *args, **kwargs):
    status = True
    run_flags = []
    for prog_config in self.sample_program_configs(*args, **kwargs):
        # In CI, only run 10% cases
        if np.random.rand() < self.num_percent_cases:
            run_flags.append(True)
        else:
            run_flags.append(False)

    for prog_config, run_flags in zip(
            self.sample_program_configs(*args, **kwargs), run_flags):
        if not run_flags:
            continue

        # if program is invalid, we should skip that cases.
        if not self.is_program_valid(prog_config):
            continue

        model, params = create_fake_model(prog_config)
        if quant:
            model, params = create_quant_model(model, params)

        feed_data = {}
        for name, tensor_config in prog_config.inputs.items():
            feed_data[name] = {
                'data': tensor_config.data,
                'lod': tensor_config.lod
            }

        results: List[Dict[str, np.ndarray]] = []

        # baseline: gpu run
        logging.info('RUN program_config: ' + str(prog_config))
        gpu_config = self.create_inference_config(use_trt=False)
        results.append(
            self.run_test_config(model, params, prog_config, gpu_config,
                                 feed_data))
        self.success_log('RUN_GPU_BASELINE done')

        for pred_config, nodes_num, threshold in self.sample_predictor_configs(
                prog_config):
            if os.path.exists(self.cache_dir):
                shutil.rmtree(self.cache_dir)

            if isinstance(threshold, float):
                atol = threshold
                rtol = 1e-8
            elif isinstance(threshold, list) or isinstance(threshold, tuple):
                atol = threshold[0]
                rtol = threshold[1]
            else:
                raise NotImplementedError

            if quant and pred_config.tensorrt_precision_mode(
            ) != paddle_infer.PrecisionType.Int8:
                continue
            if pred_config.tensorrt_precision_mode(
            ) == paddle_infer.PrecisionType.Int8 and not quant:
                continue

            ignore_flag = False
            for ignore_info in self.ignore_cases:
                if ignore_info[0](prog_config, pred_config):
                    ignore_flag = True
                    if ignore_info[1] == IgnoreReasons.TRT_NOT_IMPLEMENTED:
                        self.ignore_log(
                            "[TRT_NOT_IMPLEMENTED] " + ignore_info[2] + ' ' +
                            ' vs ' + self.inference_config_str(pred_config))
                    elif ignore_info[1] == IgnoreReasons.TRT_NOT_SUPPORT:
                        self.ignore_log(
                            "[TRT_NOT_SUPPORT] " + ignore_info[2] + ' ' +
                            ' vs ' + self.inference_config_str(pred_config))
                    else:
                        raise NotImplementedError
                    break

            try:
                pred_config_deserialize = paddle_infer.Config(pred_config)
                results.append(
                    self.run_test_config(model, params, prog_config,
                                         pred_config, feed_data))
                self.assert_tensors_near(atol, rtol, results[-1], results[0])
                if not ignore_flag:
                    self.assert_op_size(nodes_num[0], nodes_num[1])
                # deserialize test
                if nodes_num[0] > 0:
                    self.run_test_config(model, params, prog_config,
                                         pred_config_deserialize, feed_data)
            except Exception as e:
                self.fail_log(
                    self.inference_config_str(pred_config) +
                    '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e)))
                if not ignore_flag:
                    status = False
                continue
            self.success_log('RUN predictor_config ' +
                             self.inference_config_str(pred_config) + ' done')

    self.assertTrue(status)
def create_predictor(args, mode, logger):
    if mode == "det":
        model_dir = args.det_model_dir
    elif mode == 'cls':
        model_dir = args.cls_model_dir
    elif mode == 'rec':
        model_dir = args.rec_model_dir
    elif mode == 'table':
        model_dir = args.table_model_dir
    else:
        model_dir = args.e2e_model_dir

    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
        sys.exit(0)
    model_file_path = model_dir + "/inference.pdmodel"
    params_file_path = model_dir + "/inference.pdiparams"
    if not os.path.exists(model_file_path):
        raise ValueError("not find model file path {}".format(model_file_path))
    if not os.path.exists(params_file_path):
        raise ValueError(
            "not find params file path {}".format(params_file_path))

    config = inference.Config(model_file_path, params_file_path)

    if hasattr(args, 'precision'):
        if args.precision == "fp16" and args.use_tensorrt:
            precision = inference.PrecisionType.Half
        elif args.precision == "int8":
            precision = inference.PrecisionType.Int8
        else:
            precision = inference.PrecisionType.Float32
    else:
        precision = inference.PrecisionType.Float32

    if args.use_gpu:
        config.enable_use_gpu(args.gpu_mem, 0)
        if args.use_tensorrt:
            config.enable_tensorrt_engine(
                precision_mode=precision,
                max_batch_size=args.max_batch_size,
                min_subgraph_size=args.min_subgraph_size)  # skip the minimum trt subgraph
            if mode == "det":
                min_input_shape = {
                    "x": [1, 3, 50, 50],
                    "conv2d_92.tmp_0": [1, 96, 20, 20],
                    "conv2d_91.tmp_0": [1, 96, 10, 10],
                    "conv2d_59.tmp_0": [1, 96, 20, 20],
                    "nearest_interp_v2_1.tmp_0": [1, 96, 10, 10],
                    "nearest_interp_v2_2.tmp_0": [1, 96, 20, 20],
                    "conv2d_124.tmp_0": [1, 96, 20, 20],
                    "nearest_interp_v2_3.tmp_0": [1, 24, 20, 20],
                    "nearest_interp_v2_4.tmp_0": [1, 24, 20, 20],
                    "nearest_interp_v2_5.tmp_0": [1, 24, 20, 20],
                    "elementwise_add_7": [1, 56, 2, 2],
                    "nearest_interp_v2_0.tmp_0": [1, 96, 2, 2]
                }
                max_input_shape = {
                    "x": [1, 3, 2000, 2000],
                    "conv2d_92.tmp_0": [1, 96, 400, 400],
                    "conv2d_91.tmp_0": [1, 96, 200, 200],
                    "conv2d_59.tmp_0": [1, 96, 400, 400],
                    "nearest_interp_v2_1.tmp_0": [1, 96, 200, 200],
                    "conv2d_124.tmp_0": [1, 256, 400, 400],
                    "nearest_interp_v2_2.tmp_0": [1, 96, 400, 400],
                    "nearest_interp_v2_3.tmp_0": [1, 24, 400, 400],
                    "nearest_interp_v2_4.tmp_0": [1, 24, 400, 400],
                    "nearest_interp_v2_5.tmp_0": [1, 24, 400, 400],
                    "elementwise_add_7": [1, 56, 400, 400],
                    "nearest_interp_v2_0.tmp_0": [1, 96, 400, 400]
                }
                opt_input_shape = {
                    "x": [1, 3, 640, 640],
                    "conv2d_92.tmp_0": [1, 96, 160, 160],
                    "conv2d_91.tmp_0": [1, 96, 80, 80],
                    "conv2d_59.tmp_0": [1, 96, 160, 160],
                    "nearest_interp_v2_1.tmp_0": [1, 96, 80, 80],
                    "nearest_interp_v2_2.tmp_0": [1, 96, 160, 160],
                    "conv2d_124.tmp_0": [1, 256, 160, 160],
                    "nearest_interp_v2_3.tmp_0": [1, 24, 160, 160],
                    "nearest_interp_v2_4.tmp_0": [1, 24, 160, 160],
                    "nearest_interp_v2_5.tmp_0": [1, 24, 160, 160],
                    "elementwise_add_7": [1, 56, 40, 40],
                    "nearest_interp_v2_0.tmp_0": [1, 96, 40, 40]
                }
            elif mode == "rec":
                min_input_shape = {"x": [args.rec_batch_num, 3, 32, 10]}
                max_input_shape = {"x": [args.rec_batch_num, 3, 32, 2000]}
                opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
            elif mode == "cls":
                min_input_shape = {"x": [args.rec_batch_num, 3, 48, 10]}
                max_input_shape = {"x": [args.rec_batch_num, 3, 48, 2000]}
                opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
            else:
                min_input_shape = {"x": [1, 3, 10, 10]}
                max_input_shape = {"x": [1, 3, 1000, 1000]}
                opt_input_shape = {"x": [1, 3, 500, 500]}
            config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
                                              opt_input_shape)
    else:
        config.disable_gpu()
        if hasattr(args, "cpu_threads"):
            config.set_cpu_math_library_num_threads(args.cpu_threads)
        else:
            # default cpu threads as 10
            config.set_cpu_math_library_num_threads(10)
        if args.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            config.set_mkldnn_cache_capacity(10)
            config.enable_mkldnn()

    # enable memory optim
    config.enable_memory_optim()
    # config.disable_glog_info()
    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
    if mode == 'table':
        config.delete_pass("fc_fuse_pass")  # not supported for table
    config.switch_use_feed_fetch_ops(False)
    config.switch_ir_optim(True)

    # create predictor
    predictor = inference.create_predictor(config)
    input_names = predictor.get_input_names()
    for name in input_names:
        input_tensor = predictor.get_input_handle(name)
    output_names = predictor.get_output_names()
    output_tensors = []
    for output_name in output_names:
        output_tensor = predictor.get_output_handle(output_name)
        output_tensors.append(output_tensor)
    return predictor, input_tensor, output_tensors, config