Example #1
 def test_case6(self):
     paddle.disable_static()
     model = ModelCase2()
     predictor = TableLatencyPredictor(table_file='SD710')
     model_file, param_file = save_det_model(model,
                                             input_shape=[1, 3, 224, 224],
                                             save_dir="./inference_model",
                                             data_type='fp32')
     latency = predictor.predict(model_file=model_file,
                                 param_file=param_file,
                                 data_type='fp32')
     assert latency > 0
Example #2
 def test_case5(self):
     paddle.disable_static()
     model = mobilenet_v1()
     predictor = TableLatencyPredictor(f'./{opt_tool}',
                                       hardware='845',
                                       threads=4,
                                       power_mode=3,
                                       batchsize=1)
     latency = predictor.predict_latency(model,
                                         input_shape=[1, 3, 224, 224],
                                         save_dir='./model',
                                         data_type='fp32',
                                         task_type='seg')
     assert latency > 0
Example #3
 def test_case4(self):
     paddle.disable_static()
     model = ModelCase1()
     predictor = TableLatencyPredictor(f'./{opt_tool}',
                                       hardware='845',
                                       threads=4,
                                       power_mode=3,
                                       batchsize=1)
     latency = predictor.predict_latency(model,
                                         input_shape=[1, 116, 28, 28],
                                         save_dir='./model',
                                         data_type='fp32',
                                         task_type='cls')
     assert latency > 0
Example #4
 def test_case7(self):
     paddle.disable_static()
     model = ModelCase3()
     predictor = TableLatencyPredictor(f'./{opt_tool}',
                                       hardware='845',
                                       threads=4,
                                       power_mode=3,
                                       batchsize=1)
     predictor.set_det_multi_input(det_multi_input=True)
     latency = predictor.predict_latency(model,
                                         input_shape=[1, 3, 224, 224],
                                         save_dir='./model',
                                         data_type='fp32',
                                         task_type='det')
     assert latency > 0
Example #5
def get_latency(model, data_type):
    paddle.disable_static()
    predictor = TableLatencyPredictor(
        f'./{opt_tool}', hardware='845', threads=4, power_mode=3, batchsize=1)
    latency = predictor.predict_latency(
        model,
        input_shape=[1, 3, 224, 224],
        save_dir='./tmp_model',
        data_type=data_type,
        task_type='cls')
    print('{} latency : {}'.format(data_type, latency))

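    # Clean up the temporary export directory and return to dygraph mode.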
    subprocess.call('rm -rf ./tmp_model', shell=True)
    paddle.disable_static()
    return latency
Example #6
 def test_case6(self):
     paddle.disable_static()
     model = ModelCase2()
     predictor = TableLatencyPredictor(f'./{opt_tool}',
                                       hardware='845',
                                       threads=4,
                                       power_mode=3,
                                       batchsize=1)
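     # opt_model exports the model and runs it through the opt tool passed to the
     # constructor, returning the path of the optimized model file.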
     pbmodel_file = predictor.opt_model(model,
                                        input_shape=[1, 3, 224, 224],
                                        save_dir='./model',
                                        data_type='int8',
                                        task_type='det')
     assert os.path.exists(pbmodel_file)
     latency = predictor.predict_latency(model,
                                         input_shape=[1, 3, 224, 224],
                                         save_dir='./model',
                                         data_type='fp32',
                                         task_type='det')
     assert latency > 0
Example #7
 def test_case10(self):
     paddle.disable_static()
     model = ModelCase1()
     predictor = TableLatencyPredictor(f'./{opt_tool}',
                                       hardware='845',
                                       threads=4,
                                       power_mode=3,
                                       batchsize=1)
     pbmodel_file = predictor.opt_model(model,
                                        input_shape=[1, 116, 28, 28],
                                        save_dir='./model',
                                        data_type='int8',
                                        task_type='seg')
     paddle.enable_static()
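     # Parse the optimized program, wrap it in a GraphWrapper, and check that
     # key graph information can be extracted from it.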
     with open(pbmodel_file, "rb") as f:
         program_desc_str = f.read()
         fluid_program = paddle.fluid.framework.Program.parse_from_string(
             program_desc_str)
         graph = paddleslim.core.GraphWrapper(fluid_program)
         graph_keys = predictor._get_key_info_from_graph(graph=graph)
         assert len(graph_keys) > 0
Example #8
    def test_case10(self):
        paddle.disable_static()
        model = mobilenet_v2()
        model2 = ModelCase6()
        model3 = ModelCase7()
        predictor = TableLatencyPredictor(table_file='SD710')
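        # Predict latency for fp32 and int8 exports of mobilenet_v2, then for
        # fp32 exports of two additional model cases.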
        model_file, param_file = save_cls_model(model,
                                                input_shape=[1, 3, 250, 250],
                                                save_dir="./inference_model",
                                                data_type='fp32')
        latency = predictor.predict(model_file=model_file,
                                    param_file=param_file,
                                    data_type='fp32')
        assert latency > 0

        model_file, param_file = save_cls_model(model,
                                                input_shape=[1, 3, 250, 250],
                                                save_dir="./inference_model",
                                                data_type='int8')
        latency = predictor.predict(model_file=model_file,
                                    param_file=param_file,
                                    data_type='int8')
        assert latency > 0

        model_file, param_file = save_cls_model(model2,
                                                input_shape=[1, 3, 16, 16],
                                                save_dir="./inference_model",
                                                data_type='fp32')
        latency = predictor.predict(model_file=model_file,
                                    param_file=param_file,
                                    data_type='fp32')
        assert latency > 0

        model_file, param_file = save_det_model(model3,
                                                input_shape=[1, 255, 14, 14],
                                                save_dir="./inference_model",
                                                data_type='fp32')
        latency = predictor.predict(model_file=model_file,
                                    param_file=param_file,
                                    data_type='fp32')
        assert latency > 0
Example #9
def predict_compressed_model(model_file, param_file, hardware='SD710'):
    """
    Evaluating the latency of the model under various compression strategies.
    Args:
        model_file(str), param_file(str): The inference model to be compressed.
        hardware(str): Target device.
    Returns:
        latency_dict(dict): The latency latency of the model under various compression strategies.
    """
    latency_dict = {}

    model_filename = model_file.split('/')[-1]
    param_filename = param_file.split('/')[-1]

    predictor = TableLatencyPredictor(hardware)
    latency = predictor.predict(model_file=model_file,
                                param_file=param_file,
                                data_type='fp32')
    latency_dict.update({'origin_fp32': latency})
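    # Fake-quantize the original model to int8 (8-bit weights and activations)
    # and predict its latency.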
    paddle.enable_static()
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    post_quant_fake(exe,
                    model_dir=os.path.dirname(model_file),
                    model_filename=model_filename,
                    params_filename=param_filename,
                    save_model_path='quant_model',
                    quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
                    is_full_quantize=False,
                    activation_bits=8,
                    weight_bits=8)
    quant_model_file = os.path.join('quant_model', model_filename)
    quant_param_file = os.path.join('quant_model', param_filename)

    latency = predictor.predict(model_file=quant_model_file,
                                param_file=quant_param_file,
                                data_type='int8')
    latency_dict.update({'origin_int8': latency})

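    # For each prune ratio: prune the fp32 model, predict its latency, then
    # fake-quantize the pruned model and predict its int8 latency.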
    for prune_ratio in [0.3, 0.4, 0.5, 0.6]:
        get_prune_model(model_file=model_file,
                        param_file=param_file,
                        ratio=prune_ratio,
                        save_path='prune_model')
        prune_model_file = os.path.join('prune_model', model_filename)
        prune_param_file = os.path.join('prune_model', param_filename)

        latency = predictor.predict(model_file=prune_model_file,
                                    param_file=prune_param_file,
                                    data_type='fp32')
        latency_dict.update({f'prune_{prune_ratio}_fp32': latency})

        post_quant_fake(
            exe,
            model_dir='prune_model',
            model_filename=model_filename,
            params_filename=param_filename,
            save_model_path='quant_model',
            quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
            is_full_quantize=False,
            activation_bits=8,
            weight_bits=8)
        quant_model_file = os.path.join('quant_model', model_filename)
        quant_param_file = os.path.join('quant_model', param_filename)

        latency = predictor.predict(model_file=quant_model_file,
                                    param_file=quant_param_file,
                                    data_type='int8')
        latency_dict.update({f'prune_{prune_ratio}_int8': latency})

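    # Repeat the same flow for several sparsity ratios.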
    for sparse_ratio in [0.70, 0.75, 0.80, 0.85, 0.90, 0.95]:
        get_sparse_model(model_file=model_file,
                         param_file=param_file,
                         ratio=sparse_ratio,
                         save_path='sparse_model')
        sparse_model_file = os.path.join('sparse_model', model_filename)
        sparse_param_file = os.path.join('sparse_model', param_filename)

        latency = predictor.predict(model_file=sparse_model_file,
                                    param_file=sparse_param_file,
                                    data_type='fp32')
        latency_dict.update({f'sparse_{sparse_ratio}_fp32': latency})

        post_quant_fake(
            exe,
            model_dir='sparse_model',
            model_filename=model_filename,
            params_filename=param_filename,
            save_model_path='quant_model',
            quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
            is_full_quantize=False,
            activation_bits=8,
            weight_bits=8)
        quant_model_file = os.path.join('quant_model', model_filename)
        quant_param_file = os.path.join('quant_model', param_filename)

        latency = predictor.predict(model_file=quant_model_file,
                                    param_file=quant_param_file,
                                    data_type='int8')
        latency_dict.update({f'sparse_{sparse_ratio}_int8': latency})

    # Delete temporary model files
    shutil.rmtree('./quant_model')
    shutil.rmtree('./prune_model')
    shutil.rmtree('./sparse_model')
    return latency_dict
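
A minimal usage sketch for the helper above, assuming it is called from the same module as the definition in Example #9. The model and parameter paths are hypothetical placeholders for an exported Paddle inference model; adjust them to your own files.

# Hypothetical usage of predict_compressed_model; the paths below are placeholders.
model_file = './inference_model/model.pdmodel'
param_file = './inference_model/model.pdiparams'

latency_dict = predict_compressed_model(model_file=model_file,
                                        param_file=param_file,
                                        hardware='SD710')

# List compression strategies from fastest to slowest predicted latency.
for strategy, latency in sorted(latency_dict.items(), key=lambda kv: kv[1]):
    print('{}: {}'.format(strategy, latency))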