def test_case6(self):
    paddle.disable_static()
    model = ModelCase2()
    predictor = TableLatencyPredictor(table_file='SD710')
    model_file, param_file = save_det_model(
        model,
        input_shape=[1, 3, 224, 224],
        save_dir="./inference_model",
        data_type='fp32')
    latency = predictor.predict(
        model_file=model_file, param_file=param_file, data_type='fp32')
    assert latency > 0
def test_case5(self):
    paddle.disable_static()
    model = mobilenet_v1()
    predictor = TableLatencyPredictor(
        f'./{opt_tool}', hardware='845', threads=4, power_mode=3, batchsize=1)
    latency = predictor.predict_latency(
        model,
        input_shape=[1, 3, 224, 224],
        save_dir='./model',
        data_type='fp32',
        task_type='seg')
    assert latency > 0
def test_case4(self):
    paddle.disable_static()
    model = ModelCase1()
    predictor = TableLatencyPredictor(
        f'./{opt_tool}', hardware='845', threads=4, power_mode=3, batchsize=1)
    latency = predictor.predict_latency(
        model,
        input_shape=[1, 116, 28, 28],
        save_dir='./model',
        data_type='fp32',
        task_type='cls')
    assert latency > 0
def test_case7(self):
    paddle.disable_static()
    model = ModelCase3()
    predictor = TableLatencyPredictor(
        f'./{opt_tool}', hardware='845', threads=4, power_mode=3, batchsize=1)
    predictor.set_det_multi_input(det_multi_input=True)
    latency = predictor.predict_latency(
        model,
        input_shape=[1, 3, 224, 224],
        save_dir='./model',
        data_type='fp32',
        task_type='det')
    assert latency > 0
def get_latency(model, data_type):
    # Helper: predict latency for a classification model at the given data type,
    # then clean up the temporary exported model.
    paddle.disable_static()
    predictor = TableLatencyPredictor(
        f'./{opt_tool}', hardware='845', threads=4, power_mode=3, batchsize=1)
    latency = predictor.predict_latency(
        model,
        input_shape=[1, 3, 224, 224],
        save_dir='./tmp_model',
        data_type=data_type,
        task_type='cls')
    print('{} latency : {}'.format(data_type, latency))
    subprocess.call('rm -rf ./tmp_model', shell=True)
    # Restore dynamic-graph mode (predict_latency may switch to static mode internally).
    paddle.disable_static()
    return latency
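# A minimal usage sketch (an assumption, not part of the original tests): use the
# helper above to predict latency for the same model in fp32 and int8. The test
# name and the use of mobilenet_v1 here are illustrative only.
def test_fp32_int8_latency_sketch():
    model = mobilenet_v1()
    latency_fp32 = get_latency(model, 'fp32')
    latency_int8 = get_latency(model, 'int8')
    assert latency_fp32 > 0
    assert latency_int8 > 0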
def test_case6(self):
    paddle.disable_static()
    model = ModelCase2()
    predictor = TableLatencyPredictor(
        f'./{opt_tool}', hardware='845', threads=4, power_mode=3, batchsize=1)
    pbmodel_file = predictor.opt_model(
        model,
        input_shape=[1, 3, 224, 224],
        save_dir='./model',
        data_type='int8',
        task_type='det')
    assert os.path.exists(pbmodel_file)
    latency = predictor.predict_latency(
        model,
        input_shape=[1, 3, 224, 224],
        save_dir='./model',
        data_type='fp32',
        task_type='det')
    assert latency > 0
def test_case10(self):
    paddle.disable_static()
    model = ModelCase1()
    predictor = TableLatencyPredictor(
        f'./{opt_tool}', hardware='845', threads=4, power_mode=3, batchsize=1)
    pbmodel_file = predictor.opt_model(
        model,
        input_shape=[1, 116, 28, 28],
        save_dir='./model',
        data_type='int8',
        task_type='seg')
    paddle.enable_static()
    with open(pbmodel_file, "rb") as f:
        program_desc_str = f.read()
        fluid_program = paddle.fluid.framework.Program.parse_from_string(
            program_desc_str)
        graph = paddleslim.core.GraphWrapper(fluid_program)
        graph_keys = predictor._get_key_info_from_graph(graph=graph)
        assert len(graph_keys) > 0
def test_case10(self):
    paddle.disable_static()
    model = mobilenet_v2()
    model2 = ModelCase6()
    model3 = ModelCase7()
    predictor = TableLatencyPredictor(table_file='SD710')

    model_file, param_file = save_cls_model(
        model,
        input_shape=[1, 3, 250, 250],
        save_dir="./inference_model",
        data_type='fp32')
    latency = predictor.predict(
        model_file=model_file, param_file=param_file, data_type='fp32')
    assert latency > 0

    model_file, param_file = save_cls_model(
        model,
        input_shape=[1, 3, 250, 250],
        save_dir="./inference_model",
        data_type='int8')
    latency = predictor.predict(
        model_file=model_file, param_file=param_file, data_type='int8')
    assert latency > 0

    model_file, param_file = save_cls_model(
        model2,
        input_shape=[1, 3, 16, 16],
        save_dir="./inference_model",
        data_type='fp32')
    latency = predictor.predict(
        model_file=model_file, param_file=param_file, data_type='fp32')
    assert latency > 0

    model_file, param_file = save_det_model(
        model3,
        input_shape=[1, 255, 14, 14],
        save_dir="./inference_model",
        data_type='fp32')
    latency = predictor.predict(
        model_file=model_file, param_file=param_file, data_type='fp32')
    assert latency > 0
def predict_compressed_model(model_file, param_file, hardware='SD710'):
    """
    Evaluate the latency of the model under various compression strategies.
    Args:
        model_file(str), param_file(str): The inference model to be compressed.
        hardware(str): Target device.
    Returns:
        latency_dict(dict): The latency of the model under various compression strategies.
    """
    latency_dict = {}
    model_filename = model_file.split('/')[-1]
    param_filename = param_file.split('/')[-1]

    # Baseline latency of the original fp32 model.
    predictor = TableLatencyPredictor(hardware)
    latency = predictor.predict(
        model_file=model_file, param_file=param_file, data_type='fp32')
    latency_dict.update({'origin_fp32': latency})

    paddle.enable_static()
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)

    # Fake-quantize the original model and predict its int8 latency.
    post_quant_fake(
        exe,
        model_dir=os.path.dirname(model_file),
        model_filename=model_filename,
        params_filename=param_filename,
        save_model_path='quant_model',
        quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
        is_full_quantize=False,
        activation_bits=8,
        weight_bits=8)
    quant_model_file = os.path.join('quant_model', model_filename)
    quant_param_file = os.path.join('quant_model', param_filename)
    latency = predictor.predict(
        model_file=quant_model_file,
        param_file=quant_param_file,
        data_type='int8')
    latency_dict.update({'origin_int8': latency})

    # Pruned models: fp32 latency, then int8 latency after fake quantization.
    for prune_ratio in [0.3, 0.4, 0.5, 0.6]:
        get_prune_model(
            model_file=model_file,
            param_file=param_file,
            ratio=prune_ratio,
            save_path='prune_model')
        prune_model_file = os.path.join('prune_model', model_filename)
        prune_param_file = os.path.join('prune_model', param_filename)
        latency = predictor.predict(
            model_file=prune_model_file,
            param_file=prune_param_file,
            data_type='fp32')
        latency_dict.update({f'prune_{prune_ratio}_fp32': latency})

        post_quant_fake(
            exe,
            model_dir='prune_model',
            model_filename=model_filename,
            params_filename=param_filename,
            save_model_path='quant_model',
            quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
            is_full_quantize=False,
            activation_bits=8,
            weight_bits=8)
        quant_model_file = os.path.join('quant_model', model_filename)
        quant_param_file = os.path.join('quant_model', param_filename)
        latency = predictor.predict(
            model_file=quant_model_file,
            param_file=quant_param_file,
            data_type='int8')
        latency_dict.update({f'prune_{prune_ratio}_int8': latency})

    # Sparse models: fp32 latency, then int8 latency after fake quantization.
    for sparse_ratio in [0.70, 0.75, 0.80, 0.85, 0.90, 0.95]:
        get_sparse_model(
            model_file=model_file,
            param_file=param_file,
            ratio=sparse_ratio,
            save_path='sparse_model')
        sparse_model_file = os.path.join('sparse_model', model_filename)
        sparse_param_file = os.path.join('sparse_model', param_filename)
        latency = predictor.predict(
            model_file=sparse_model_file,
            param_file=sparse_param_file,
            data_type='fp32')
        latency_dict.update({f'sparse_{sparse_ratio}_fp32': latency})

        post_quant_fake(
            exe,
            model_dir='sparse_model',
            model_filename=model_filename,
            params_filename=param_filename,
            save_model_path='quant_model',
            quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
            is_full_quantize=False,
            activation_bits=8,
            weight_bits=8)
        quant_model_file = os.path.join('quant_model', model_filename)
        quant_param_file = os.path.join('quant_model', param_filename)
        latency = predictor.predict(
            model_file=quant_model_file,
            param_file=quant_param_file,
            data_type='int8')
        latency_dict.update({f'sparse_{sparse_ratio}_int8': latency})

    # Delete temporary model files.
    shutil.rmtree('./quant_model')
    shutil.rmtree('./prune_model')
    shutil.rmtree('./sparse_model')
    return latency_dict
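# A minimal usage sketch (an assumption, not part of the original file): run the
# helper above on an exported inference model and print the predicted latency
# for each compression strategy. The model paths below are hypothetical.
if __name__ == '__main__':
    latency_dict = predict_compressed_model(
        model_file='./inference_model/model.pdmodel',
        param_file='./inference_model/model.pdiparams',
        hardware='SD710')
    for strategy, latency in latency_dict.items():
        print('{}: {}'.format(strategy, latency))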