def convert(program, place, config=None, scope=None, save_int8=False):
    """
    Turn a quantized, well-trained ``program`` into the final quantized
    ``program`` that can be saved as an ``inference model``.

    Args:
        program(paddle.static.Program): quantized and well-trained
            ``test program``.
        place(paddle.CPUPlace or paddle.CUDAPlace): the device the passes
            are run on.
        config(dict, optional): configs for convert. It must be the same
            config that was used in 'quant_aware'. When ``None`` the
            default config is used. Default: ``None``.
        scope(paddle.static.Scope, optional): scope holding the variables
            of ``program``. When it is ``None`` (or otherwise falsy),
            ``paddle.static.global_scope()`` is used. Default: ``None``.
        save_int8(bool): whether to additionally return a ``program``
            whose model parameters' dtype is ``int8``. This can only be
            used to get the model size. Default: ``False``.

    Returns:
        When ``save_int8`` is False, the frozen program
        (``paddle.static.Program``). When ``save_int8`` is True, a tuple
        ``(frozen_program, frozen_program_int8)``.
    """
    scope = scope or paddle.static.global_scope()

    if config is not None:
        assert isinstance(config, dict), "config must be dict"
        config = _parse_configs(config)
    else:
        config = _quant_config_default
    _logger.info("convert config {}".format(config))

    graph = IrGraph(core.Graph(program.desc), for_test=True)
    OutScaleForInferencePass(scope=scope).apply(graph)

    # Freeze the graph after training by adjusting the quantize
    # operators' order for the inference.
    freezer = QuantizationFreezePass(
        scope=scope,
        place=place,
        weight_bits=config['weight_bits'],
        activation_bits=config['activation_bits'],
        weight_quantize_type=config['weight_quantize_type'])

    # A mapping table written to disk earlier is attached to the graph
    # before freezing, when the file is present.
    if os.path.exists(VARS_MAPPING_TABLE):
        graph.out_node_mapping_table = load_dict()

    freezer.apply(graph)
    float_program = graph.to_program()

    if not save_int8:
        return float_program

    # The int8 conversion mutates the same graph, so the float program
    # above has to be materialized first.
    ConvertToInt8Pass(scope=scope, place=place).apply(graph)
    return float_program, graph.to_program()
def check_output_with_option(self,
                             use_gpu,
                             atol=1e-5,
                             flatten=False,
                             quant=False,
                             rtol=1e-5):
    """
    Check whether calculating on CPU and GPU, enable TensorRT
    or disable TensorRT, enable MKLDNN or disable MKLDNN
    are all the same.

    The main/test programs are quantized, trained for a few MNIST batches,
    frozen and saved; the executor outputs are then compared with the
    inference outputs.

    Args:
        use_gpu(bool): run on ``CUDAPlace(0)`` when True, else on CPU.
        atol(float): absolute tolerance for every comparison (replaced by
            0.01 for the MKLDNN check when bfloat16 is enabled).
        flatten(bool): flatten both sides before the inference and
            TensorRT comparisons.
        quant: not used by this method.
        rtol(float): relative tolerance, applied to the TensorRT
            comparison only.
    """
    device = "GPU" if use_gpu else "CPU"
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    scope = fluid.Scope()

    def maybe_flatten(array):
        # Apply the caller's ``flatten`` option to one side of a comparison.
        return array.flatten() if flatten else array

    with fluid.scope_guard(scope):
        executor.run(self.startup_program)
        executor.run(self.test_startup_program)

    main_graph = IrGraph(core.Graph(self.main_program.desc), for_test=False)
    test_graph = IrGraph(
        core.Graph(self.test_main_program.desc), for_test=True)
    both_graphs = (main_graph, test_graph)

    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=self.activation_quantize_type,
        weight_quantize_type=self.weight_quantize_type)
    for graph in both_graphs:
        transform_pass.apply(graph)

    add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
    for graph in both_graphs:
        add_quant_dequant_pass.apply(graph)

    OutScaleForTrainingPass(scope=scope, place=place).apply(main_graph)

    # NOTE: this strategy is configured but not handed to CompiledProgram.
    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph)

    iters = 10
    batch_size = 1
    shuffled = paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500)
    train_reader = paddle.batch(shuffled, batch_size=batch_size)
    feeder = fluid.DataFeeder(
        feed_list=[self.data, self.label], place=place)
    with fluid.scope_guard(scope):
        for _ in range(iters):
            # A fresh reader is created on every iteration.
            batch = next(train_reader())
            executor.run(binary,
                         feed=feeder.feed(batch),
                         fetch_list=[self.loss])

    OutScaleForInferencePass(scope=scope).apply(test_graph)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    QuantizationFreezePass(
        scope=scope,
        place=place,
        weight_quantize_type=self.weight_quantize_type).apply(test_graph)

    self.main_program = test_graph.to_program()
    with fluid.scope_guard(scope):
        self.main_program = self._normalize_program(
            self.main_program, self.data, self.fetch_list)

    self._save_models(self.path,
                      list(self.feeds.keys()), self.fetch_list, executor,
                      self.main_program, scope)

    paddle_outs = self._get_paddle_outs(self.feeds, self.fetch_list,
                                        executor, self.main_program, scope)
    inference_outs = self._get_inference_outs(
        self._get_analysis_config(use_gpu=use_gpu))

    # Check whether the results calculated on CPU and on GPU are the same.
    self.assertTrue(
        len(paddle_outs) == len(inference_outs),
        "The number of outputs is different between inference and training forward at {}"
        .format(device))

    for raw_out, served_out in zip(paddle_outs, inference_outs):
        expected = maybe_flatten(np.array(raw_out))
        actual = maybe_flatten(served_out)
        self.assertTrue(
            np.allclose(expected, actual, atol=atol),
            "Output has diff between inference and training forward at {} "
            .format(device))

    # Check whether the trt results and the GPU results are the same.
    if use_gpu and self.enable_trt:
        tensorrt_outputs = self._get_inference_outs(
            self._get_analysis_config(
                use_gpu=use_gpu, use_trt=self.enable_trt))

        if self.trt_parameters.use_static:
            # deserialize
            tensorrt_outputs = self._get_inference_outs(
                self._get_analysis_config(
                    use_gpu=use_gpu, use_trt=self.enable_trt))

        self.assertTrue(
            len(tensorrt_outputs) == len(paddle_outs),
            "The number of outputs is different between GPU and TensorRT. ")

        for raw_out, trt_out in zip(paddle_outs, tensorrt_outputs):
            expected = maybe_flatten(np.array(raw_out))
            actual = maybe_flatten(trt_out)
            self.assertTrue(
                np.allclose(expected, actual, rtol=rtol, atol=atol),
                "Output has diff between GPU and TensorRT. ")

    # Check whether the mkldnn results and the CPU results are the same.
    if (not use_gpu) and self.enable_mkldnn:
        mkldnn_outputs = self._get_inference_outs(
            self._get_analysis_config(
                use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn))

        self.assertTrue(
            len(paddle_outs) == len(mkldnn_outputs),
            "The number of outputs is different between CPU and MKLDNN. ")

        # bfloat16 results are compared with a fixed, looser tolerance.
        mkldnn_atol = 0.01 if self.enable_mkldnn_bfloat16 else atol
        for raw_out, mkldnn_out in zip(paddle_outs, mkldnn_outputs):
            self.assertTrue(
                np.allclose(np.array(raw_out), mkldnn_out, atol=mkldnn_atol),
                "Output has diff between CPU and MKLDNN. ")
def quantization_scale(self,
                       use_cuda,
                       seed,
                       activation_quant_type,
                       weight_quant_type='abs_max',
                       for_ci=False,
                       act_preprocess_func=None,
                       weight_preprocess_func=None,
                       act_quantize_func=None,
                       weight_quantize_func=None):
    """
    Build a conv_net train/test program pair, insert quantization ops,
    train for a few MNIST batches and (conditionally) freeze the test graph.

    Args:
        use_cuda(bool): run on ``CUDAPlace(0)`` when True, else on CPU.
        seed(int): random seed assigned to the main and startup programs.
        activation_quant_type(str): activation quantize type forwarded to
            ``QuantizationTransformPass``.
        weight_quant_type(str): weight quantize type forwarded to the
            transform and freeze passes.
        for_ci(bool): not used by this method.
        act_preprocess_func, weight_preprocess_func, act_quantize_func,
        weight_quantize_func: optional callables forwarded unchanged to
            ``QuantizationTransformPass``.
    """
    mapping_table_path = 'mapping_table_for_saving_inference_model'

    def build_program(main, startup, is_test):
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                img = fluid.layers.data(
                    name='image', shape=[1, 28, 28], dtype='float32')
                img.stop_gradient = False
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                loss = conv_net(img, label)
                if not is_test:
                    opt = fluid.optimizer.SGD(learning_rate=0.0001)
                    opt.minimize(loss)
        return [img, label], loss

    def get_optimizer():
        return fluid.optimizer.MomentumOptimizer(0.0001, 0.9)

    def load_dict():
        # Read the mapping table back from its JSON file.
        with open(mapping_table_path, 'r') as table_file:
            return json.load(table_file)

    def save_dict(mapping):
        # Persist the mapping table as JSON.
        with open(mapping_table_path, 'w') as table_file:
            table_file.write(json.dumps(mapping))

    random.seed(0)
    np.random.seed(0)

    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = build_program(main, startup, False)
    build_program(test_program, startup, True)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)

    # The train and test passes are separate instances built from the
    # same arguments.
    transform_kwargs = dict(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type,
        act_preprocess_func=act_preprocess_func,
        weight_preprocess_func=weight_preprocess_func,
        act_quantize_func=act_quantize_func,
        weight_quantize_func=weight_quantize_func,
        optimizer_func=get_optimizer,
        executor=exe)
    train_transform_pass = QuantizationTransformPass(**transform_kwargs)
    train_transform_pass.apply(main_graph)
    test_transform_pass = QuantizationTransformPass(**transform_kwargs)
    test_transform_pass.apply(test_graph)
    save_dict(test_graph.out_node_mapping_table)

    add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
    add_quant_dequant_pass.apply(main_graph)
    add_quant_dequant_pass.apply(test_graph)

    scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
    scale_training_pass.apply(main_graph)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    iters = 5
    batch_size = 8

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            exe.run(binary, feed=feeder.feed(data), fetch_list=[loss])

    out_scale_infer_pass = OutScaleForInferencePass(scope=scope)
    out_scale_infer_pass.apply(test_graph)

    freeze_pass = QuantizationFreezePass(
        scope=scope,
        place=place,
        weight_bits=8,
        activation_bits=8,
        weight_quantize_type=weight_quant_type)

    # The table attached to the graph is the one round-tripped through
    # the JSON file written above.
    test_graph.out_node_mapping_table = load_dict()

    # The freeze pass is only run when no custom quantize functions were
    # supplied.
    if act_quantize_func is None and weight_quantize_func is None:
        freeze_pass.apply(test_graph)
def quantization_scale(self,
                       use_cuda,
                       seed,
                       activation_quant_type,
                       weight_quant_type='abs_max',
                       for_ci=False):
    """
    Quantize a residual_block network, train it for a few MNIST batches,
    freeze the test graph and save it as an inference model.

    Args:
        use_cuda(bool): run on ``CUDAPlace(0)`` when True, else on CPU.
        seed(int): random seed assigned to the main and startup programs.
        activation_quant_type(str): activation quantize type forwarded to
            ``QuantizationTransformPass``.
        weight_quant_type(str): weight quantize type forwarded to the
            transform and freeze passes.
        for_ci(bool): when False, the graphs are additionally drawn and
            the loss of every iteration is printed.
    """
    verbose = not for_ci
    dev_name = '_gpu' if use_cuda else '_cpu'

    def build_program(main, startup, is_test):
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                img = fluid.layers.data(
                    name='image', shape=[1, 28, 28], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                loss = residual_block(img, label, 1)
                if not is_test:
                    opt = fluid.optimizer.Adam(learning_rate=0.0001)
                    opt.minimize(loss)
        return [img, label], loss

    def draw_quantize_ops(graph, prefix):
        # Draw ``graph`` with every op whose name contains 'quantize' marked.
        highlighted = {
            node
            for node in graph.all_op_nodes() if 'quantize' in node.name()
        }
        graph.draw('.', prefix + dev_name, highlighted)

    random.seed(0)
    np.random.seed(0)

    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = build_program(main, startup, False)
    build_program(test_program, startup, True)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
    both_graphs = (main_graph, test_graph)

    if use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)

    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type)
    for graph in both_graphs:
        transform_pass.apply(graph)

    add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
    for graph in both_graphs:
        add_quant_dequant_pass.apply(graph)

    OutScaleForTrainingPass(scope=scope, place=place).apply(main_graph)

    if verbose:
        draw_quantize_ops(main_graph, 'main_scale')
        draw_quantize_ops(test_graph, 'test_scale')

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    iters = 5
    batch_size = 8

    shuffled = paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500)
    train_reader = paddle.batch(shuffled, batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)
    with fluid.scope_guard(scope):
        for _ in range(iters):
            # A fresh reader is created on every iteration.
            batch = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(batch),
                             fetch_list=[loss])
            if verbose:
                print('{}: {}'.format('loss' + dev_name, loss_v))

    OutScaleForInferencePass(scope=scope).apply(test_graph)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    QuantizationFreezePass(
        scope=scope, place=place,
        weight_quantize_type=weight_quant_type).apply(test_graph)
    server_program = test_graph.to_program()

    if verbose:
        draw_quantize_ops(test_graph, 'quant_scale')

    model_name = 'quant_scale_model' + dev_name
    with open(model_name + '.txt', 'w') as f:
        f.write(str(server_program))

    with fluid.scope_guard(scope):
        fluid.io.save_inference_model(model_name, ['image', 'label'],
                                      [loss], exe, server_program)
def test_out_scale_acc(self):
    """
    Train LeNet with quantization-aware training in dynamic-graph and in
    static-graph mode from identical initial parameters, then check that

    * the per-batch losses of both runs agree (``np.allclose``), and
    * for ops of the saved dynamic model that carry an ``out_threshold``
      attribute, the op at the same position in the saved static model has
      the same type and the same ``out_threshold``.
    """

    def _build_static_lenet(main, startup, is_test=False, seed=1000):
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                main.random_seed = seed
                startup.random_seed = seed
                img = fluid.layers.data(
                    name='image', shape=[1, 28, 28], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                prediction = StaticLenet(img)
                if not is_test:
                    loss = fluid.layers.cross_entropy(
                        input=prediction, label=label)
                    avg_loss = fluid.layers.mean(loss)
                else:
                    avg_loss = prediction
        return img, label, avg_loss

    reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
    weight_quantize_type = 'abs_max'
    activation_quant_type = 'moving_average_abs_max'
    # Initial parameter values keyed by variable name; reused below to
    # initialize the static-graph parameters.
    param_init_map = {}
    seed = 1000
    lr = 0.1

    # imperative train
    _logger.info(
        "--------------------------dynamic graph qat--------------------------"
    )
    imperative_out_scale = ImperativeQuantAware()

    with fluid.dygraph.guard():
        np.random.seed(seed)
        fluid.default_main_program().random_seed = seed
        fluid.default_startup_program().random_seed = seed
        lenet = ImperativeLenet()
        fixed_state = {}
        for name, param in lenet.named_parameters():
            p_value = param.numpy()
            p_shape = p_value.shape
            if name.endswith("bias"):
                value = np.zeros_like(p_value).astype('float32')
            else:
                # np.prod replaces the np.product alias, which was
                # deprecated and then removed in NumPy 2.0.
                value = np.random.normal(
                    loc=0.0, scale=0.01,
                    size=np.prod(p_shape)).reshape(p_shape).astype(
                        'float32')
            fixed_state[name] = value
            param_init_map[param.name] = value
        lenet.set_dict(fixed_state)
        imperative_out_scale.quantize(lenet)
        adam = AdamOptimizer(
            learning_rate=lr, parameter_list=lenet.parameters())
        dynamic_loss_rec = []
        lenet.train()
        for batch_id, data in enumerate(reader()):
            x_data = np.array([x[0].reshape(1, 28, 28)
                               for x in data]).astype('float32')
            y_data = np.array(
                [x[1] for x in data]).astype('int64').reshape(-1, 1)

            img = fluid.dygraph.to_variable(x_data)
            label = fluid.dygraph.to_variable(y_data)

            out = lenet(img)
            loss = fluid.layers.cross_entropy(out, label)
            avg_loss = fluid.layers.mean(loss)
            avg_loss.backward()
            adam.minimize(avg_loss)
            lenet.clear_gradients()
            dynamic_loss_rec.append(avg_loss.numpy()[0])
            if batch_id % 100 == 0:
                _logger.info('{}: {}'.format('loss', avg_loss.numpy()))

        lenet.eval()

    path = "./dynamic_outscale_infer_model/lenet"
    dynamic_save_dir = "./dynamic_outscale_infer_model"

    imperative_out_scale.save_quantized_model(
        layer=lenet,
        path=path,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, 1, 28, 28], dtype='float32')
        ])

    _logger.info(
        "--------------------------static graph qat--------------------------"
    )
    static_loss_rec = []
    if core.is_compiled_with_cuda():
        place = core.CUDAPlace(0)
    else:
        place = core.CPUPlace()
    exe = fluid.Executor(place)

    main = fluid.Program()
    infer = fluid.Program()
    startup = fluid.Program()
    static_img, static_label, static_loss = _build_static_lenet(
        main, startup, False, seed)
    infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, seed)
    with fluid.unique_name.guard():
        with fluid.program_guard(main, startup):
            opt = AdamOptimizer(learning_rate=lr)
            opt.minimize(static_loss)

    scope = core.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)
    # Overwrite the freshly initialized parameters with the values used
    # for the dynamic-graph run.
    for param in main.all_parameters():
        param_tensor = scope.var(param.name).get_tensor()
        param_tensor.set(param_init_map[param.name], place)

    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    infer_graph = IrGraph(core.Graph(infer.desc), for_test=True)
    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quantize_type,
        quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'])
    transform_pass.apply(main_graph)
    transform_pass.apply(infer_graph)
    outscale_pass = OutScaleForTrainingPass(scope=scope, place=place)
    outscale_pass.apply(main_graph)
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=static_loss.name, build_strategy=build_strategy)

    feeder = fluid.DataFeeder(
        feed_list=[static_img, static_label], place=place)
    with fluid.scope_guard(scope):
        for batch_id, data in enumerate(reader()):
            loss_v, = exe.run(binary,
                              feed=feeder.feed(data),
                              fetch_list=[static_loss])
            static_loss_rec.append(loss_v[0])
            if batch_id % 100 == 0:
                _logger.info('{}: {}'.format('loss', loss_v))
    scale_inference_pass = OutScaleForInferencePass(scope=scope)
    scale_inference_pass.apply(infer_graph)

    save_program = infer_graph.to_program()
    static_save_dir = "./static_outscale_infer_model"
    with fluid.scope_guard(scope):
        fluid.io.save_inference_model(
            dirname=static_save_dir,
            feeded_var_names=[infer_img.name],
            target_vars=[infer_pre],
            executor=exe,
            main_program=save_program,
            model_filename="lenet" + INFER_MODEL_SUFFIX,
            params_filename="lenet" + INFER_PARAMS_SUFFIX)

    rtol = 1e-05
    atol = 1e-08
    # Log the first batch whose losses diverge, to make a failure of the
    # assertion below easier to diagnose.
    for i, (loss_d, loss_s) in enumerate(
            zip(dynamic_loss_rec, static_loss_rec)):
        diff = np.abs(loss_d - loss_s)
        if diff > (atol + rtol * np.abs(loss_s)):
            _logger.info(
                "diff({}) at {}, dynamic loss = {}, static loss = {}".
                format(diff, i, loss_d, loss_s))
            break

    self.assertTrue(
        np.allclose(
            np.array(dynamic_loss_rec),
            np.array(static_loss_rec),
            rtol=rtol,
            atol=atol,
            equal_nan=True),
        msg='Failed to do the imperative qat.')

    # load dynamic model
    dynamic_inference_program, _, _ = fluid.io.load_inference_model(
        dirname=dynamic_save_dir,
        executor=exe,
        model_filename="lenet" + INFER_MODEL_SUFFIX,
        params_filename="lenet" + INFER_PARAMS_SUFFIX)
    # load static model
    static_inference_program, _, _ = fluid.io.load_inference_model(
        dirname=static_save_dir,
        executor=exe,
        model_filename="lenet" + INFER_MODEL_SUFFIX,
        params_filename="lenet" + INFER_PARAMS_SUFFIX)

    # Drop the ops that are excluded from the positional comparison:
    # 'fake*' ops in both models, plus "flatten2" in the dynamic model.
    dynamic_ops = [
        op for op in dynamic_inference_program.global_block().ops
        if op.type != "flatten2" and 'fake' not in op.type
    ]
    static_ops = [
        op for op in static_inference_program.global_block().ops
        if 'fake' not in op.type
    ]

    for i, dynamic_op in enumerate(dynamic_ops):
        if dynamic_op.has_attr("out_threshold"):
            static_op = static_ops[i]
            self.assertEqual(dynamic_op.type, static_op.type)
            self.assertEqual(
                dynamic_op.attr("out_threshold"),
                static_op.attr("out_threshold"))