コード例 #1
0
ファイル: quanter.py プロジェクト: vincentXiyu/PaddleSlim
def convert(program, place, config=None, scope=None, save_int8=False):
    """
    convert quantized and well-trained ``program`` to final  quantized ``program`` that can be used to  save ``inference model``.
    
    Args:
        program(fluid.Program): quantized and well-trained ``test program``.
        place(fluid.CPUPlace or fluid.CUDAPlace): This parameter represents the executor run on which device.
        config(dict, optional): configs for convert. if set None, will use default config. 
            It must be same with config that used in 'quant_aware'. Default: None.
        scope(fluid.Scope, optional):  Scope records the mapping between variable names and variables, 
            similar to brackets in programming languages. Usually users can use 
            `fluid.global_scope <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_.              When ``None`` will use `fluid.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_ . Default: ``None``.
        save_int8: Whether to return ``program`` which model parameters' dtype is ``int8``. 
            This parameter can only be used to get model size. Default: ``False``.

    Returns:
        Tuple : freezed program which can be used for inference.
        when ``save_int8`` is False, return ``freezed_program(fluid.Program)``.
        when ``save_int8`` is True, return ``freezed_program(fluid.Program)`` and ``freezed_program_int8(fluid.Program)``
    """
    scope = fluid.global_scope() if not scope else scope

    if config is None:
        config = _quant_config_default
    else:
        assert isinstance(config, dict), "config must be dict"
        config = _parse_configs(config)
    _logger.info("convert config {}".format(config))

    test_graph = IrGraph(core.Graph(program.desc), for_test=True)
    support_op_types = []
    for op in config['quantize_op_types']:
        if op in QuantizationFreezePass._supported_quantizable_op_type:
            support_op_types.append(op)

    # Freeze the graph after training by adjusting the quantize
    # operators' order for the inference.
    freeze_pass = QuantizationFreezePass(
        scope=scope,
        place=place,
        weight_bits=config['weight_bits'],
        activation_bits=config['activation_bits'],
        weight_quantize_type=config['weight_quantize_type'],
        quantizable_op_type=support_op_types)
    freeze_pass.apply(test_graph)
    freezed_program = test_graph.to_program()

    if save_int8:
        convert_int8_pass = ConvertToInt8Pass(
            scope=fluid.global_scope(),
            place=place,
            quantizable_op_type=support_op_types)
        convert_int8_pass.apply(test_graph)
        freezed_program_int8 = test_graph.to_program()
        return freezed_program, freezed_program_int8
    else:
        return freezed_program
コード例 #2
0
    def freeze_graph(self,
                     use_cuda,
                     seed,
                     activation_quant_type,
                     weight_quant_type='abs_max',
                     for_ci=True,
                     quant_skip_pattern='skip_quant'):
        def build_program(main, startup, is_test):
            main.random_seed = seed
            startup.random_seed = seed
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(
                        name='image', shape=[1, 28, 28], dtype='float32')
                    label = fluid.layers.data(
                        name='label', shape=[1], dtype='int64')
                    loss = conv_net(img, label, quant_skip_pattern)
                    if not is_test:
                        opt = fluid.optimizer.Adam(learning_rate=0.001)
                        opt.minimize(loss)
            return [img, label], loss

        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)
        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type,
            skip_pattern=quant_skip_pattern)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)
        dev_name = '_gpu_' if use_cuda else '_cpu_'
        if not for_ci:
            marked_nodes = set()
            for op in main_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_'
                            + weight_quant_type, marked_nodes)
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_'
                            + weight_quant_type, marked_nodes)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        quantized_test_program = test_graph.to_program()
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=500),
            batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                if not for_ci:
                    print('{}: {}'.format('loss' + dev_name +
                                          activation_quant_type + '_' +
                                          weight_quant_type, loss_v))

        test_data = next(test_reader())
        with fluid.program_guard(quantized_test_program):
            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
                                             quantized_test_program)
        # Testing
        with fluid.scope_guard(scope):
            test_loss1, w_quant = exe.run(program=quantized_test_program,
                                          feed=feeder.feed(test_data),
                                          fetch_list=[loss, w_var])

        # Freeze graph for inference, but the weight of fc/conv is still float type.
        freeze_pass = QuantizationFreezePass(
            scope=scope, place=place, weight_quantize_type=weight_quant_type)
        freeze_pass.apply(test_graph)
        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_freeze' + dev_name +
                            activation_quant_type + '_' + weight_quant_type,
                            marked_nodes)

        server_program = test_graph.to_program()
        with fluid.scope_guard(scope):
            test_loss2, = exe.run(program=server_program,
                                  feed=feeder.feed(test_data),
                                  fetch_list=[loss])
        self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
        if not for_ci:
            print(
                '{}: {}'.format('test_loss1' + dev_name + activation_quant_type
                                + '_' + weight_quant_type, test_loss1))
            print(
                '{}: {}'.format('test_loss2' + dev_name + activation_quant_type
                                + '_' + weight_quant_type, test_loss2))
        w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
        # Maybe failed, this is due to the calculation precision
        # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
        if not for_ci:
            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
                                  + '_' + weight_quant_type, np.sum(w_freeze)))
            print('{}: {}'.format('w_quant' + dev_name + activation_quant_type +
                                  '_' + weight_quant_type, np.sum(w_quant)))

        # Convert parameter to 8-bit.
        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
        convert_int8_pass.apply(test_graph)
        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type
                            + '_' + weight_quant_type, marked_nodes)
        server_program_int8 = test_graph.to_program()
        # Save the 8-bit parameter and model file.
        with fluid.scope_guard(scope):
            fluid.io.save_inference_model(
                'server_int8' + dev_name + activation_quant_type + '_' +
                weight_quant_type, ['image', 'label'], [loss], exe,
                server_program_int8)
            # Test whether the 8-bit parameter and model file can be loaded successfully.
            [infer, feed, fetch] = fluid.io.load_inference_model(
                'server_int8' + dev_name + activation_quant_type + '_' +
                weight_quant_type, exe)
        # Check the loaded 8-bit weight.
        w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
        self.assertEqual(w_8bit.dtype, np.int8)
        self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
        if not for_ci:
            print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type +
                                  '_' + weight_quant_type, np.sum(w_8bit)))
            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
                                  + '_' + weight_quant_type, np.sum(w_freeze)))

        mobile_pass = TransformForMobilePass()
        mobile_pass.apply(test_graph)
        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_mobile' + dev_name +
                            activation_quant_type + '_' + weight_quant_type,
                            marked_nodes)

        mobile_program = test_graph.to_program()
        with fluid.scope_guard(scope):
            fluid.io.save_inference_model(
                'mobile_int8' + dev_name + activation_quant_type + '_' +
                weight_quant_type, ['image', 'label'], [loss], exe,
                mobile_program)
コード例 #3
0
    def check_output_with_option(self,
                                 use_gpu,
                                 atol=1e-5,
                                 flatten=False,
                                 quant=False,
                                 rtol=1e-5):
        '''
        Check whether calculating on CPU and GPU, enable TensorRT 
        or disable TensorRT, enable MKLDNN or disable MKLDNN 
        are all the same. 
        '''
        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
        executor = fluid.Executor(place)
        scope = fluid.Scope()
        device = "GPU" if use_gpu else "CPU"

        with fluid.scope_guard(scope):
            executor.run(self.startup_program)
            executor.run(self.test_startup_program)
        main_graph = IrGraph(core.Graph(self.main_program.desc),
                             for_test=False)
        test_graph = IrGraph(core.Graph(self.test_main_program.desc),
                             for_test=True)

        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=self.activation_quantize_type,
            weight_quantize_type=self.weight_quantize_type)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)

        add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
        add_quant_dequant_pass.apply(main_graph)
        add_quant_dequant_pass.apply(test_graph)

        scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
        scale_training_pass.apply(main_graph)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = fluid.CompiledProgram(main_graph.graph)

        iters = 10
        batch_size = 1
        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=[self.data, self.label],
                                  place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = executor.run(binary,
                                      feed=feeder.feed(data),
                                      fetch_list=[self.loss])

        scale_inference_pass = OutScaleForInferencePass(scope=scope)
        scale_inference_pass.apply(test_graph)

        # Freeze graph for inference, but the weight of fc/conv is still float type.
        freeze_pass = QuantizationFreezePass(
            scope=scope,
            place=place,
            weight_quantize_type=self.weight_quantize_type)
        freeze_pass.apply(test_graph)

        self.main_program = test_graph.to_program()

        with fluid.scope_guard(scope):
            self.main_program = self._normalize_program(
                self.main_program, self.data, self.fetch_list)

        self._save_models(self.path, list(self.feeds.keys()), self.fetch_list,
                          executor, self.main_program, scope)

        paddle_outs = self._get_paddle_outs(self.feeds, self.fetch_list,
                                            executor, self.main_program, scope)
        inference_outs = self._get_inference_outs(
            self._get_analysis_config(use_gpu=use_gpu))

        # Check whether the results calculated on CPU and on GPU are the same.
        self.assertTrue(
            len(paddle_outs) == len(inference_outs),
            "The number of outputs is different between inference and training forward at {}"
            .format(device))

        for out, inference_out in zip(paddle_outs, inference_outs):
            paddle_out = np.array(out)

            if flatten:
                paddle_out = paddle_out.flatten()
                inference_out = inference_out.flatten()

            self.assertTrue(
                np.allclose(paddle_out, inference_out, atol=atol),
                "Output has diff between inference and training forward at {} "
                .format(device))

        # Check whether the trt results and the GPU results are the same.
        if use_gpu and self.enable_trt:
            tensorrt_outputs = self._get_inference_outs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_trt=self.enable_trt))

            if self.trt_parameters.use_static:
                #deserialize
                tensorrt_outputs = self._get_inference_outs(
                    self._get_analysis_config(use_gpu=use_gpu,
                                              use_trt=self.enable_trt))

            self.assertTrue(
                len(tensorrt_outputs) == len(paddle_outs),
                "The number of outputs is different between GPU and TensorRT. "
            )

            for paddle_out, tensorrt_output in zip(paddle_outs,
                                                   tensorrt_outputs):
                paddle_out = np.array(paddle_out)

                if flatten:
                    paddle_out = paddle_out.flatten()
                    tensorrt_output = tensorrt_output.flatten()

                self.assertTrue(
                    np.allclose(paddle_out,
                                tensorrt_output,
                                rtol=rtol,
                                atol=atol),
                    "Output has diff between GPU and TensorRT. ")

        # Check whether the mkldnn results and the CPU results are the same.
        if (not use_gpu) and self.enable_mkldnn:
            mkldnn_outputs = self._get_inference_outs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_mkldnn=self.enable_mkldnn))

            self.assertTrue(
                len(paddle_outs) == len(mkldnn_outputs),
                "The number of outputs is different between CPU and MKLDNN. ")

            if self.enable_mkldnn_bfloat16:
                atol = 0.01
            for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs):
                self.assertTrue(
                    np.allclose(np.array(paddle_out), mkldnn_output,
                                atol=atol),
                    "Output has diff between CPU and MKLDNN. ")
コード例 #4
0
    def quantization_scale(self,
                           use_cuda,
                           seed,
                           activation_quant_type,
                           weight_quant_type='abs_max',
                           for_ci=False):
        def build_program(main, startup, is_test):
            main.random_seed = seed
            startup.random_seed = seed
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(name='image',
                                            shape=[1, 28, 28],
                                            dtype='float32')
                    label = fluid.layers.data(name='label',
                                              shape=[1],
                                              dtype='int64')
                    loss = residual_block(img, label, 1)
                    if not is_test:
                        opt = fluid.optimizer.Adam(learning_rate=0.0001)
                        opt.minimize(loss)
            return [img, label], loss

        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)

        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)

        add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
        add_quant_dequant_pass.apply(main_graph)
        add_quant_dequant_pass.apply(test_graph)

        scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
        scale_training_pass.apply(main_graph)

        dev_name = '_gpu' if use_cuda else '_cpu'
        if not for_ci:
            marked_nodes = set()
            for op in main_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            main_graph.draw('.', 'main_scale' + dev_name, marked_nodes)
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_scale' + dev_name, marked_nodes)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                if not for_ci:
                    print('{}: {}'.format('loss' + dev_name, loss_v))

        scale_inference_pass = OutScaleForInferencePass(scope=scope)
        scale_inference_pass.apply(test_graph)

        # Freeze graph for inference, but the weight of fc/conv is still float type.
        freeze_pass = QuantizationFreezePass(
            scope=scope, place=place, weight_quantize_type=weight_quant_type)
        freeze_pass.apply(test_graph)
        server_program = test_graph.to_program()

        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'quant_scale' + dev_name, marked_nodes)

        with open('quant_scale_model' + dev_name + '.txt', 'w') as f:
            f.write(str(server_program))

        with fluid.scope_guard(scope):
            fluid.io.save_inference_model('quant_scale_model' + dev_name,
                                          ['image', 'label'], [loss], exe,
                                          server_program)
コード例 #5
0
    def check_output_with_option(self,
                                 use_gpu,
                                 atol=1e-5,
                                 flatten=False,
                                 quant=False):
        '''
        Check whether calculating on CPU and GPU, enable TensorRT 
        or disable TensorRT, enable MKLDNN or disable MKLDNN 
        are all the same. 
        '''
        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
        executor = fluid.Executor(place)
        scope = fluid.Scope()
        device = "GPU" if use_gpu else "CPU"
        with fluid.scope_guard(scope):
            executor.run(self.startup_program)

        if quant:
            main_graph = IrGraph(core.Graph(self.main_program.desc),
                                 for_test=True)

            transform_pass = QuantizationTransformPass(
                scope=scope,
                place=place,
                activation_quantize_type=self.activation_quant_type,
                weight_quantize_type=self.weight_quant_type,
                quantizable_op_type=[
                    'conv2d', 'mul', 'depthwise_conv2d', 'conv2d_transpose'
                ])
            transform_pass.apply(main_graph)
            weight_scale_map = {
                "conv2d": "conv2d_0.w_0.scale",
                "mul": "fc_0.w_0.scale"
            }

            weight_scale_tensor = scope.var(
                weight_scale_map[self.quantized_op_type]).get_tensor()
            weight_scale = np.ones(self.channels).astype("float32")
            weight_scale_tensor.set(weight_scale, place)

            op_nodes = main_graph.all_op_nodes()
            for op_node in op_nodes:
                if op_node.name() in [self.quantized_op_type, "relu"]:
                    op_node.op()._set_attr("out_threshold", 0.5)

            with fluid.scope_guard(scope):
                executor.run(program=self.main_program,
                             feed=self.feeds,
                             fetch_list=self.fetch_list)

            freeze_pass = QuantizationFreezePass(
                scope=scope,
                place=place,
                weight_quantize_type=self.weight_quant_type)
            freeze_pass.apply(main_graph)
            self.main_program = main_graph.to_program()

        outs = self._save_models(executor, self.main_program, scope)

        analysis_outputs = self._get_analysis_outputs(
            self._get_analysis_config(use_gpu=use_gpu))

        # Check whether the results calculated on CPU and on GPU are the same.
        self.assertTrue(
            len(outs) == len(analysis_outputs),
            "The number of outputs is different between inference and training forward at {}"
            .format(device))

        for out, analysis_output in zip(outs, analysis_outputs):
            out = np.array(out)
            if flatten:
                out = out.flatten()
                analysis_output = analysis_output.flatten()

            self.assertTrue(
                np.allclose(out, analysis_output, atol=atol),
                "Output has diff between inference and training forward at {} "
                .format(device))

        # Check whether the trt results and the GPU results are the same.
        if use_gpu and self.enable_trt:
            tensorrt_outputs = self._get_analysis_outputs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_trt=self.enable_trt))

            if self.trt_parameters.use_static:
                #deserialize
                tensorrt_outputs = self._get_analysis_outputs(
                    self._get_analysis_config(use_gpu=use_gpu,
                                              use_trt=self.enable_trt))

            self.assertTrue(
                len(tensorrt_outputs) == len(outs),
                "The number of outputs is different between GPU and TensorRT. "
            )

            for out, tensorrt_output in zip(outs, tensorrt_outputs):
                out = np.array(out)
                if flatten:
                    out = out.flatten()
                    tensorrt_output = tensorrt_output.flatten()

                self.assertTrue(np.allclose(out, tensorrt_output, atol=atol),
                                "Output has diff between GPU and TensorRT. ")

        # Check whether the mkldnn results and the CPU results are the same.
        if (not use_gpu) and self.enable_mkldnn:
            mkldnn_outputs = self._get_analysis_outputs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_mkldnn=self.enable_mkldnn))

            self.assertTrue(
                len(outs) == len(mkldnn_outputs),
                "The number of outputs is different between CPU and MKLDNN. ")

            if self.enable_mkldnn_bfloat16:
                atol = 0.01
            for out, mkldnn_output in zip(outs, mkldnn_outputs):
                self.assertTrue(
                    np.allclose(np.array(out), mkldnn_output, atol=atol),
                    "Output has diff between CPU and MKLDNN. ")
コード例 #6
0
def train(args):
    # parameters from arguments
    model_name = args.model
    pretrained_fp32_model = args.pretrained_fp32_model
    checkpoint = args.checkpoint
    model_save_dir = args.model_save_dir
    data_dir = args.data_dir
    activation_quant_type = args.act_quant_type
    weight_quant_type = args.wt_quant_type
    print("Using %s as the actiavtion quantize type." % activation_quant_type)
    print("Using %s as the weight quantize type." % weight_quant_type)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    _, _, train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    image, out, test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    main_graph = IrGraph(core.Graph(train_prog.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_prog.desc), for_test=True)

    if pretrained_fp32_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_fp32_model, var.name))
        fluid.io.load_vars(
            exe, pretrained_fp32_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
        if visible_device:
            device_num = len(visible_device.split(','))
        else:
            device_num = subprocess.check_output(
                ['nvidia-smi', '-L']).decode().count('\n')
    else:
        device_num = 1

    train_batch_size = args.batch_size / device_num
    test_batch_size = 1 if activation_quant_type == 'abs_max' else 8
    train_reader = paddle.batch(
        reader.train(data_dir=data_dir), batch_size=train_batch_size, drop_last=True)
    test_reader = paddle.batch(reader.val(data_dir=data_dir), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    train_fetch_list = [train_cost.name, train_acc1.name, train_acc5.name, global_lr.name]
    test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]

    # 1. Make some quantization transforms in the graph before training and testing.
    # According to the weight and activation quantization type, the graph will be added
    # some fake quantize operators and fake dequantize operators.
    transform_pass = QuantizationTransformPass(
        scope=fluid.global_scope(), place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    if checkpoint:
        load_persistable_nodes(exe, checkpoint, main_graph)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=train_cost.name, build_strategy=build_strategy)
    test_prog = test_graph.to_program()
    params = models.__dict__[args.model]().params
    for pass_id in range(params["num_epochs"]):

        train_py_reader.start()

        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5, lr = exe.run(binary, fetch_list=train_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(period)
                if batch_id % 10 == 0:
                    print("Pass {0}, trainbatch {1}, loss {2}, \
                        acc1 {3}, acc5 {4}, lr {5}, time {6}"
                          .format(pass_id, batch_id, loss, acc1, acc5, "%.6f" %
                                  lr, "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()

        test_py_reader.start()

        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, \
                        acc1 {3},acc5 {4},time {5}"
                          .format(pass_id, test_batch_id, loss, acc1, acc5,
                                  "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, train_loss, train_acc1, train_acc5, test_loss,
                  test_acc1, test_acc5))
        sys.stdout.flush()

        save_checkpoint_path = os.path.join(model_save_dir,  model_name, str(pass_id))
        if not os.path.isdir(save_checkpoint_path):
            os.makedirs(save_checkpoint_path)
        save_persistable_nodes(exe, save_checkpoint_path, main_graph)

    model_path = os.path.join(model_save_dir, model_name, args.act_quant_type)
    float_path = os.path.join(model_path, 'float')
    int8_path = os.path.join(model_path, 'int8')
    mobile_path = os.path.join(model_path, 'mobile')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    # 2. Freeze the graph after training by adjusting the quantize
    # operators' order for the inference.
    freeze_pass = QuantizationFreezePass(
        scope=fluid.global_scope(),
        place=place,
        weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=float_path,
        feeded_var_names=[image.name],
        target_vars=[out], executor=exe,
        main_program=server_program)

    # 3. Convert the weights into int8_t type.
    # (This step is optional.)
    convert_int8_pass = ConvertToInt8Pass(scope=fluid.global_scope(), place=place)
    convert_int8_pass.apply(test_graph)
    server_int8_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=int8_path,
        feeded_var_names=[image.name],
        target_vars=[out], executor=exe,
        main_program=server_int8_program)

    # 4. Convert the freezed graph for paddle-mobile execution.
    # (This step is optional.)
    mobile_pass = TransformForMobilePass()
    mobile_pass.apply(test_graph)
    mobile_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=mobile_path,
        feeded_var_names=[image.name],
        target_vars=[out], executor=exe,
        main_program=mobile_program)
コード例 #7
0
def main():
    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)
    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    if 'eval_feed' not in cfg:
        eval_feed = create(main_arch + 'EvalFeed')
    else:
        eval_feed = create(cfg.eval_feed)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    _, test_feed_vars = create_feed(eval_feed, False)

    eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir)
    #eval_pyreader.decorate_sample_list_generator(eval_reader, place)
    test_data_feed = fluid.DataFeeder(test_feed_vars.values(), place)

    assert os.path.exists(FLAGS.model_path)
    infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname=FLAGS.model_path,
        executor=exe,
        model_filename='__model__.infer',
        params_filename='__params__')

    eval_keys = ['bbox', 'gt_box', 'gt_label', 'is_difficult']
    eval_values = [
        'multiclass_nms_0.tmp_0', 'gt_box', 'gt_label', 'is_difficult'
    ]
    eval_cls = []
    eval_values[0] = fetch_targets[0]

    results = eval_run(exe, infer_prog, eval_reader, eval_keys, eval_values,
                       eval_cls, test_data_feed)

    resolution = None
    if 'mask' in results[0]:
        resolution = model.mask_head.resolution
    box_ap_stats = eval_results(results, eval_feed, cfg.metric,
                                cfg.num_classes, resolution, False,
                                FLAGS.output_eval)

    logger.info("freeze the graph for inference")
    test_graph = IrGraph(core.Graph(infer_prog.desc), for_test=True)

    freeze_pass = QuantizationFreezePass(
        scope=fluid.global_scope(),
        place=place,
        weight_quantize_type=FLAGS.weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()
    fluid.io.save_inference_model(dirname=os.path.join(FLAGS.save_path,
                                                       'float'),
                                  feeded_var_names=feed_names,
                                  target_vars=fetch_targets,
                                  executor=exe,
                                  main_program=server_program,
                                  model_filename='model',
                                  params_filename='weights')

    logger.info("convert the weights into int8 type")
    convert_int8_pass = ConvertToInt8Pass(scope=fluid.global_scope(),
                                          place=place)
    convert_int8_pass.apply(test_graph)
    server_int8_program = test_graph.to_program()
    fluid.io.save_inference_model(dirname=os.path.join(FLAGS.save_path,
                                                       'int8'),
                                  feeded_var_names=feed_names,
                                  target_vars=fetch_targets,
                                  executor=exe,
                                  main_program=server_int8_program,
                                  model_filename='model',
                                  params_filename='weights')
コード例 #8
0
    def quantization_scale(self,
                           use_cuda,
                           seed,
                           activation_quant_type,
                           weight_quant_type='abs_max',
                           for_ci=False,
                           act_preprocess_func=None,
                           weight_preprocess_func=None,
                           act_quantize_func=None,
                           weight_quantize_func=None):
        def build_program(main, startup, is_test):
            main.random_seed = seed
            startup.random_seed = seed
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(name='image',
                                            shape=[1, 28, 28],
                                            dtype='float32')
                    img.stop_gradient = False
                    label = fluid.layers.data(name='label',
                                              shape=[1],
                                              dtype='int64')
                    loss = conv_net(img, label)
                    if not is_test:
                        opt = fluid.optimizer.SGD(learning_rate=0.0001)
                        opt.minimize(loss)
            return [img, label], loss

        def get_optimizer():
            return fluid.optimizer.MomentumOptimizer(0.0001, 0.9)

        def load_dict():
            with open('mapping_table_for_saving_inference_model', 'r') as file:
                data = file.read()
                data = json.loads(data)
                return data

        def save_dict(Dict):
            with open('mapping_table_for_saving_inference_model', 'w') as file:
                file.write(json.dumps(Dict))

        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)
        train_transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type,
            act_preprocess_func=act_preprocess_func,
            weight_preprocess_func=weight_preprocess_func,
            act_quantize_func=act_quantize_func,
            weight_quantize_func=weight_quantize_func,
            optimizer_func=get_optimizer,
            executor=exe)
        train_transform_pass.apply(main_graph)
        test_transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type,
            act_preprocess_func=act_preprocess_func,
            weight_preprocess_func=weight_preprocess_func,
            act_quantize_func=act_quantize_func,
            weight_quantize_func=weight_quantize_func,
            optimizer_func=get_optimizer,
            executor=exe)

        test_transform_pass.apply(test_graph)
        save_dict(test_graph.out_node_mapping_table)

        add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
        add_quant_dequant_pass.apply(main_graph)
        add_quant_dequant_pass.apply(test_graph)

        scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
        scale_training_pass.apply(main_graph)

        dev_name = '_gpu' if use_cuda else '_cpu'

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])

        out_scale_infer_pass = OutScaleForInferencePass(scope=scope)
        out_scale_infer_pass.apply(test_graph)

        freeze_pass = QuantizationFreezePass(
            scope=scope,
            place=place,
            weight_bits=8,
            activation_bits=8,
            weight_quantize_type=weight_quant_type)

        mapping_table = load_dict()
        test_graph.out_node_mapping_table = mapping_table
        if act_quantize_func == None and weight_quantize_func == None:
            freeze_pass.apply(test_graph)
コード例 #9
0
    def mkldnn_based_freeze_graph(self,
                                  use_cuda,
                                  seed,
                                  activation_quant_type,
                                  weight_quant_type='abs_max',
                                  quant_perf=False,
                                  for_ci=False):
        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = self.build_program(main, startup, False, seed)
        self.build_program(test_program, startup, True, seed)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)
        # Apply the QuantizationTransformPass
        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        quantized_test_program = test_graph.to_program()
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        test_reader = paddle.batch(paddle.dataset.mnist.test(),
                                   batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)

        # Training the model to get the weights value
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])

        # Freeze graph for inference, but the weight of fc/conv is still float type.
        freeze_pass = QuantizationFreezePass(
            scope=scope, place=place, weight_quantize_type=weight_quant_type)
        freeze_pass.apply(test_graph)

        # Transform quantized graph for MKL-DNN INT8 inference
        mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=scope, _place=place)
        mkldnn_int8_pass.apply(test_graph)
        dev_name = '_cpu_'
        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw(
                '.', 'test_mkldnn' + dev_name + activation_quant_type + '_' +
                weight_quant_type, marked_nodes)
        mkldnn_program = test_graph.to_program()

        # Check the transformation weights of conv2d and mul
        conv_w_mkldnn = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
        mul_w_mkldnn = np.array(scope.find_var('fc_0.w_0').get_tensor())
        # Check if weights are still integer
        self.assertFalse(self.isinteger(np.sum(conv_w_mkldnn)))
        self.assertFalse(self.isinteger(np.sum(mul_w_mkldnn)))

        # Check if the conv2d output and mul output are correctly linked to fake_dequantize's
        # output
        self.check_program(mkldnn_program)
        if not for_ci:
            print('{}: {}'.format(
                'w_mkldnn' + dev_name + activation_quant_type + '_' +
                weight_quant_type, np.sum(w_mkldnn)))
コード例 #10
0
def eval(args):
    # parameters from arguments

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    val_program, feed_names, fetch_targets = fluid.io.load_inference_model(
        args.model_path,
        exe,
        model_filename="__model__.infer",
        params_filename="__params__")
    val_reader = paddle.batch(reader.val(), batch_size=128)
    feeder = fluid.DataFeeder(
        place=place, feed_list=feed_names, program=val_program)

    results = []
    for batch_id, data in enumerate(val_reader()):
        image = [[d[0]] for d in data]
        label = [[d[1]] for d in data]
        feed_data = feeder.feed(image)
        pred = exe.run(val_program, feed=feed_data, fetch_list=fetch_targets)
        pred = np.array(pred[0])
        label = np.array(label)
        sort_array = pred.argsort(axis=1)
        top_1_pred = sort_array[:, -1:][:, ::-1]
        top_1 = np.mean(label == top_1_pred)
        top_5_pred = sort_array[:, -5:][:, ::-1]
        acc_num = 0
        for i in range(len(label)):
            if label[i][0] in top_5_pred[i]:
                acc_num += 1
        top_5 = acc_num / len(label)
        results.append([top_1, top_5])

    result = np.mean(np.array(results), axis=0)
    print("top1_acc/top5_acc= {}".format(result))
    sys.stdout.flush()

    _logger.info("freeze the graph for inference")
    test_graph = IrGraph(core.Graph(val_program.desc), for_test=True)

    freeze_pass = QuantizationFreezePass(
        scope=fluid.global_scope(),
        place=place,
        weight_quantize_type=args.weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=os.path.join(args.save_path, 'float'),
        feeded_var_names=feed_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=server_program,
        model_filename='model',
        params_filename='weights')

    _logger.info("convert the weights into int8 type")
    convert_int8_pass = ConvertToInt8Pass(
        scope=fluid.global_scope(), place=place)
    convert_int8_pass.apply(test_graph)
    server_int8_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=os.path.join(args.save_path, 'int8'),
        feeded_var_names=feed_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=server_int8_program,
        model_filename='model',
        params_filename='weights')
コード例 #11
0
def create_quant_model(model,
                       params,
                       activation_quantize_type='moving_average_abs_max',
                       weight_quantize_type='channel_wise_abs_max',
                       save=False):
    place = paddle.CUDAPlace(0)
    scope = global_scope()
    exe = paddle.static.Executor(place)
    [inference_program, feed_target_names, fetch_targets
     ] = paddle.static.load_inference_model(path_prefix=None,
                                            executor=exe,
                                            model_filename=model,
                                            params_filename=params)
    graph = IrGraph(core.Graph(inference_program.desc), for_test=True)

    out_scale_op_list = [
        "conv2d",
        "depthwise_conv2d",
        "mul",
        "matmul",
        "relu",
        "leaky_relu",
        "relu6",
        "sigmoid",
        "tanh",
        "prelu",
        "swish",
        "softmax",
        "batch_norm",
        "layer_norm",
        "elementwise_add",
        "pool2d",
        "reshape2",
        "transpose2",
        "concat",
        "elementwise_mul",
        "scale",
        "slice",
        "hard_swish",
        "hard_sigmoid",
        "conv2d_transpose",
        "gru",
        "bilinear_interp",
        "nearest_interp",
        "trilinear_interp",
        "flatten",
        "flatten2",
        "transpose",
        "pad2d",
        "reshape",
        "layer_norm",
    ]
    op_real_in_out_name = {
        "conv2d": [["Input", "Filter"], ["Output"]],
        "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
        "conv2d_transpose": [["Input", "Filter"], ["Output"]],
        "mul": [["X", "Y"], ["Out"]],
        "matmul": [["X", "Y"], ["Out"]],
        "pool2d": [["X"], ["Out"]],
        "elementwise_add": [["X", "Y"], ["Out"]],
        "concat": [["X"], ["Out"]],
        "softmax": [["X"], ["Out"]],
        "argmax": [["X"], ["Out"]],
        "transpose": [["X"], ["Out"]],
        "equal": [["X", "Y"], ["Out"]],
        "gather": [["X"], ["Out"]],
        "greater_equal": [["X", "Y"], ["Out"]],
        "greater_than": [["X", "Y"], ["Out"]],
        "less_equal": [["X", "Y"], ["Out"]],
        "less_than": [["X", "Y"], ["Out"]],
        "mean": [["X"], ["Out"]],
        "not_equal": [["X", "Y"], ["Out"]],
        "reshape": [["X"], ["Out"]],
        "reshape2": [["X"], ["Out"]],
        "transpose2": [["X"], ["Out"]],
        "bilinear_interp": [["X"], ["Out"]],
        "nearest_interp": [["X"], ["Out"]],
        "trilinear_interp": [["X"], ["Out"]],
        "slice": [["Input"], ["Out"]],
        "squeeze": [["X"], ["Out"]],
        "elementwise_sub": [["X", "Y"], ["Out"]],
        "relu": [["X"], ["Out"]],
        "relu6": [["X"], ["Out"]],
        "leaky_relu": [["X"], ["Out"]],
        "prelu": [["X"], ["Out"]],
        "tanh": [["X"], ["Out"]],
        "swish": [["X"], ["Out"]],
        "dropout": [["X"], ["Out"]],
        "batch_norm": [["X"], ["Y"]],
        "layer_norm": [["X"], ["Y"]],
        "sigmoid": [["X"], ["Out"]],
        "elementwise_mul": [["X", "Y"], ["Out"]],
        "scale": [["X"], ["Out"]],
        "hard_swish": [["X"], ["Out"]],
        "hard_sigmoid": [["X"], ["Out"]],
        "gru": [["Input", "Weight"], ["Hidden"]],
        "lstm": [["Input", "Weight"], ["Hidden"]],
        "pad2d": [["X"], ["Out"]],
        "flatten": [["X"], ["Out"]],
        "flatten2": [["X"], ["Out"]],
    }

    def _get_op_output_var_names(op):
        """ """
        assert isinstance(op, (IrNode, Operator)), \
            "The input op should be IrNode or Operator."
        var_names = []
        op_name = op.name() if isinstance(op, IrNode) \
            else op.type
        if op_name not in op_real_in_out_name:
            return []

        name_list = op_real_in_out_name[op_name][1]
        for name in name_list:
            var_name = op.output(name)
            if isinstance(var_name, list):
                var_names.extend(var_name)
            else:
                var_names.append(var_name)
        return var_names

    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quantize_type,
        weight_quantize_type=weight_quantize_type)
    transform_pass.apply(graph)

    op_nodes = graph.all_op_nodes()
    for op_node in op_nodes:
        if op_node.name() in out_scale_op_list:
            var_names = _get_op_output_var_names(op_node)
            for var_name in var_names:
                in_node = graph._find_node_by_name(op_node.outputs, var_name)
                if in_node.dtype() not in \
                    [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
                    continue

                op_node.op()._set_attr("out_threshold", 3.0)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quantize_type)
    freeze_pass.apply(graph)

    main_program = graph.to_program()

    # modify fake_quantize_moving_average_abs_max(InScale) and fake_channel_wise_dequantize_max_abs(Scales)
    op_nodes = graph.all_op_nodes()
    for op_node in op_nodes:
        if op_node.name() == 'fake_quantize_moving_average_abs_max':
            var_name = op_node.input("InScale")[0]
            tensor = scope.var(var_name).get_tensor()
            tensor.set(np.array([1], dtype=np.float32), place)
        elif op_node.name() == 'fake_channel_wise_dequantize_max_abs':
            var_name = op_node.input("Scales")[0]
            tensor = scope.var(var_name).get_tensor()
            tensor.set(np.ones(tensor.shape(), dtype=np.float32), place)

    if save:
        fluid.io.save_inference_model('test_inference_model',
                                      feed_target_names,
                                      fetch_targets,
                                      exe,
                                      main_program=main_program)

    feed_vars = [
        main_program.global_block().var(name) for name in feed_target_names
    ]
    serialized_program = paddle.static.serialize_program(feed_vars,
                                                         fetch_targets,
                                                         program=main_program)
    serialized_params = paddle.static.serialize_persistables(
        feed_vars, fetch_targets, executor=exe, program=main_program)
    return serialized_program, serialized_params