def linear_fc_quant(self, activation_quant_type, weight_quantize_type, for_ci=True):
    """Build a small FC network, run QuantizationTransformPass over its
    graph, and validate the transformed program.

    When ``for_ci`` is False the quantize-related ops are also rendered to
    dot files for manual inspection.
    """
    train_prog = fluid.Program()
    init_prog = fluid.Program()
    with fluid.program_guard(train_prog, init_prog):
        loss = linear_fc(3)
        fluid.optimizer.Adam(learning_rate=0.001).minimize(loss)
    cpu = fluid.CPUPlace()
    graph = IrGraph(core.Graph(train_prog.desc), for_test=False)
    QuantizationTransformPass(
        scope=fluid.global_scope(),
        place=cpu,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quantize_type).apply(graph)
    if not for_ci:
        # Visualize only the quantize/dequantize ops.
        quant_ops = {node for node in graph.all_op_nodes()
                     if 'quantize' in node.name()}
        graph.draw('.', 'quantize_fc_' + activation_quant_type, quant_ops)
    program = graph.to_program()
    self.check_program(program)
    # Round-trip the program back into a graph and draw it again.
    val_graph = IrGraph(core.Graph(program.desc), for_test=False)
    if not for_ci:
        val_quant_ops = {node for node in val_graph.all_op_nodes()
                         if 'quantize' in node.name()}
        val_graph.draw('.', 'val_fc_' + activation_quant_type, val_quant_ops)
def residual_block_quant(self, quantizable_op_type, skip_pattern=None, for_ci=True):
    """Apply AddQuantDequantPass to a residual block and check the result.

    ``skip_pattern`` marks ops that must not be quantized; ``for_ci`` turns
    off the dot-file rendering used for manual debugging.
    """
    train_prog = fluid.Program()
    init_prog = fluid.Program()
    with fluid.program_guard(train_prog, init_prog):
        loss = quant_dequant_residual_block(2, skip_pattern)
        fluid.optimizer.Adam(learning_rate=0.001).minimize(loss)
    cpu = fluid.CPUPlace()
    graph = IrGraph(core.Graph(train_prog.desc), for_test=False)
    AddQuantDequantPass(
        scope=fluid.global_scope(),
        place=cpu,
        skip_pattern=skip_pattern,
        quantizable_op_type=quantizable_op_type).apply(graph)
    if not for_ci:
        # Visualize only the quant/dequant ops.
        quant_ops = {node for node in graph.all_op_nodes()
                     if 'quant' in node.name()}
        graph.draw('.', 'add_quant_dequant_graph', quant_ops)
    self.check_graph(graph, skip_pattern)
    program = graph.to_program()
    # Round-trip the program back into a graph for a second rendering.
    val_graph = IrGraph(core.Graph(program.desc), for_test=False)
    if not for_ci:
        val_quant_ops = {node for node in val_graph.all_op_nodes()
                         if 'quant' in node.name()}
        val_graph.draw('.', 'val_add_quant_dequant_graph', val_quant_ops)
def residual_block_quant(self, quant_type):
    """Quantize a residual block with the given activation quantize type,
    draw the transformed graph, and validate the resulting program."""
    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        loss = residual_block(2)
        fluid.optimizer.Adam(learning_rate=0.001).minimize(loss)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)  # NOTE(review): unused here; kept for parity
    graph = IrGraph(core.Graph(main.desc), for_test=False)
    transform_pass = QuantizationTransformPass(
        scope=fluid.global_scope(),
        place=place,
        activation_quantize_type=quant_type)
    transform_pass.apply(graph)
    quant_nodes = {node for node in graph.all_op_nodes()
                   if 'quantize' in node.name()}
    graph.draw('.', 'quantize_residual_' + quant_type, quant_nodes)
    program = graph.to_program()
    # check_program needs the pass object to know the quantization config.
    self.check_program(transform_pass, program)
    val_graph = IrGraph(core.Graph(program.desc), for_test=False)
    val_quant_nodes = {node for node in val_graph.all_op_nodes()
                       if 'quantize' in node.name()}
    val_graph.draw('.', 'val_residual_' + quant_type, val_quant_nodes)
def test_dequantize_op_weights(self):
    """_dequantize_op_weights must rescale the mul weights by the recorded
    output scale, and reject a weight tensor with an invalid shape."""
    program = fluid.Program()
    with fluid.program_guard(program):
        self.prepare_program_mul(program)
        graph = IrGraph(core.Graph(program.desc), for_test=True)

        # Locate the mul op whose weights will be dequantized.
        op_node = None
        for op in graph.all_op_nodes():
            if op.op().type() == "mul":
                op_node = op
                break
        # BUG FIX: the original left `op_node` unbound (NameError) when no
        # mul op existed; fail with a clear message instead (mirrors the
        # guard used by the sibling threshold-based test).
        assert op_node is not None, "mul op not found in the graph"

        qpass = Quant2Int8MkldnnPass(self.quantized_ops,
                                     _scope=self.scope,
                                     _place=self.place,
                                     _core=core,
                                     _debug=False)
        qpass._weight_scales["mul_output"] = self.mul_output_scale
        param = self.scope.var("mul_weights").get_tensor()
        param.set(self.variables_mul["mul_weights"], self.place)
        qpass._dequantize_op_weights(graph, op_node, "Y", "Out")

        assert np.allclose(
            self.scope.find_var("mul_weights").get_tensor(),
            [[127, 63.5, 42.3333, 31.75, 25.4],
             [127, 63.5, 42.3333, 31.75, 25.4],
             [127, 63.5, 42.3333, 31.75, 25.4]])

        # A weight tensor with a bad shape must be rejected.
        param = self.scope.var("mul_weights").get_tensor()
        param.set(self.variables_mul["mul_weights_bad"], self.place)
        with self.assertRaises(ValueError):
            qpass._dequantize_op_weights(graph, op_node, "Y", "Out")
def transform_and_save_int8_model(original_path,
                                  save_path,
                                  ops_to_quantize='',
                                  op_ids_to_skip='',
                                  debug=False,
                                  quant_model_filename='',
                                  quant_params_filename='',
                                  save_model_filename="__model__",
                                  save_params_filename=None):
    """Convert a saved Quant model into an MKL-DNN INT8 model and save it.

    Args:
        original_path: directory of the source Quant model.
        save_path: directory where the INT8 model is written.
        ops_to_quantize: comma-separated op type names to quantize; empty
            string means the pass's default set.
        op_ids_to_skip: comma-separated op ids to exclude from quantization.
        debug: when True, draws the original graph to 'quant_orig' files.
        quant_model_filename: custom model filename of the source model;
            when empty, '__model__' or the 'model'/'params' pair is probed.
        quant_params_filename: params filename paired with
            quant_model_filename.
        save_model_filename: model filename for the saved INT8 model.
        save_params_filename: params filename for the saved INT8 model.
    """
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
    with fluid.scope_guard(inference_scope):
        # Load the source model, probing the common filename layouts when
        # no explicit filenames were supplied.
        if not quant_model_filename:
            if os.path.exists(os.path.join(original_path, '__model__')):
                [inference_program, feed_target_names,
                 fetch_targets] = fluid.io.load_inference_model(original_path, exe)
            else:
                [inference_program, feed_target_names,
                 fetch_targets] = fluid.io.load_inference_model(original_path, exe,
                                                                'model', 'params')
        else:
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(original_path, exe,
                                                            quant_model_filename,
                                                            quant_params_filename)
        ops_to_quantize_set = set()
        # NOTE(review): bare prints look like leftover debugging; kept to
        # preserve behavior.
        print(ops_to_quantize)
        if len(ops_to_quantize) > 0:
            ops_to_quantize_set = set(ops_to_quantize.split(','))
        # -1 is a sentinel id that never matches a real op.
        op_ids_to_skip_set = set([-1])
        print(op_ids_to_skip)
        if len(op_ids_to_skip) > 0:
            op_ids_to_skip_set = set(map(int, op_ids_to_skip.split(',')))
        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
        if (debug):
            graph.draw('.', 'quant_orig', graph.all_op_nodes())
        # Run the Quant -> MKL-DNN INT8 transformation pass.
        transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass(
            ops_to_quantize_set,
            _op_ids_to_skip=op_ids_to_skip_set,
            _scope=inference_scope,
            _place=place,
            _core=core,
            _debug=debug)
        graph = transform_to_mkldnn_int8_pass.apply(graph)
        inference_program = graph.to_program()
        with fluid.scope_guard(inference_scope):
            fluid.io.save_inference_model(save_path, feed_target_names,
                                          fetch_targets, exe, inference_program,
                                          model_filename=save_model_filename,
                                          params_filename=save_params_filename)
        print(
            "Success! INT8 model obtained from the Quant model can be found at {}\n"
            .format(save_path))
def test_dequantize_op_weights(self):
    """Weights must be divided by the recorded threshold (127); a weight
    tensor with a bad shape must raise ValueError."""
    program = fluid.Program()
    with fluid.program_guard(program):
        self.prepare_program_mul(program)
        graph = IrGraph(core.Graph(program.desc), for_test=True)
        # Pick the first op of the type under test; "" is the not-found
        # sentinel checked just below.
        op_node = next((node for node in graph.all_op_nodes()
                        if node.op().type() == self.op_name()), "")
        assert op_node != "", "op of type %s not found" % self.op_name()
        qpass = Quant2Int8MkldnnPass(self.quantized_ops,
                                     _scope=self.scope,
                                     _place=self.place,
                                     _core=core,
                                     _debug=False)
        qpass._weight_thresholds["mul_output"] = self.mul_output_scale
        weights = self.scope.var("mul_weights").get_tensor()
        weights.set(self.variables_mul["mul_weights"], self.place)
        qpass._dequantize_op_weights(graph, op_node, "Y", "Out")
        # Each row of the 3x5 weight matrix is [1..5] scaled down by 127.
        expected_row = [v / 127. for v in (1., 2., 3., 4., 5.)]
        assert np.allclose(
            self.scope.find_var("mul_weights").get_tensor(),
            [expected_row, expected_row, expected_row])
        weights = self.scope.var("mul_weights").get_tensor()
        weights.set(self.variables_mul["mul_weights_bad"], self.place)
        with self.assertRaises(ValueError):
            qpass._dequantize_op_weights(graph, op_node, "Y", "Out")
def get_op_number(self, prog):
    """Count quantizable compute ops and fake-quant ops in ``prog``.

    Returns:
        tuple: (number of conv2d/depthwise_conv2d/mul ops,
                number of ops whose name contains 'fake_').
    """
    graph = IrGraph(core.Graph(prog.desc), for_test=False)
    quantizable_types = ('conv2d', 'depthwise_conv2d', 'mul')
    op_nums = 0
    quant_op_nums = 0
    for node in graph.all_op_nodes():
        op_type = node.name()
        if op_type in quantizable_types:
            op_nums += 1
        elif 'fake_' in op_type:
            quant_op_nums += 1
    return op_nums, quant_op_nums
def test_graph_functions(self):
    """Exercise IrGraph utilities: draw, cycle check, graph count,
    topology sort, adjacency list, and safe node removal."""
    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        loss = residual_block(2)
        fluid.optimizer.Adam(learning_rate=0.001).minimize(loss)
    graph = IrGraph(core.Graph(main.desc), for_test=False)
    conv_nodes = {node for node in graph.all_op_nodes()
                  if 'conv2d' in node.name()}
    graph.draw('.', 'residual', conv_nodes)
    self.assertFalse(graph.has_circle())
    self.assertEqual(graph.graph_num(), 1)
    sorted_ops = graph.topology_sort()
    self.assertEqual(len(sorted_ops), len(graph.all_op_nodes()))
    adjacency = graph.build_adjacency_list()
    self.assertEqual(len(adjacency), len(graph.all_op_nodes()))
    # Removing the conv2d nodes must shrink the node count by exactly
    # the number of removed nodes.
    total_nodes = len(graph.all_nodes())
    graph.safe_remove_nodes(conv_nodes)
    self.assertEqual(len(graph.all_nodes()), total_nodes - len(conv_nodes))
def quant_embedding(program, place, config=None, scope=None):
    """Quantize lookup_table op parameters.

    Args:
        program(paddle.static.Program): infer program
        scope(paddle.static.Scope, optional): Scope records the mapping between
            variable names and variables, similar to brackets in
            programming languages. Usually users can use
            `paddle.static.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_.
            When ``None`` will use
            `paddle.static.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_.
            Default : ``None``.
        place(paddle.CPUPlace or paddle.CUDAPlace): This parameter represents
            the executor run on which device.
        config(dict, optional): config to quantize. The keys are
            'quantize_op_types'. For op in quantize_op_types, you can define
            'quantize_type', 'quantize_bits', 'dtype', 'threshold'.
            ``quantize_type`` is quantize type, supported types are
            ['abs_max'], default is "abs_max".
            ``quantize_bits`` supported bits are [8] and default is 8.
            ``dtype`` is quantize dtype, supported dtype are ['int8'],
            default is 'int8'.
            ``threshold`` is threshold to clip tensor before quant. When
            threshold is not set, tensor will not be clipped.

    Returns:
        paddle.static.Program: the program rebuilt from the quantized graph.
    """
    config = config or {}
    config = _merge_config(copy.deepcopy(_default_config), config)
    scope = paddle.static.global_scope() if scope is None else scope

    graph = IrGraph(core.Graph(program.desc), for_test=True)
    quantize_params_map = {}
    all_op = graph.all_op_nodes()
    for op in all_op:
        # Skip dangling nodes with neither inputs nor outputs.
        if op.inputs == [] and op.outputs == []:
            continue
        op_type = op.name()
        if op_type in config['quantize_op_types']:
            weight_name = op.input('W')[0]
            # Each embedding weight is quantized only once, even when it is
            # shared by several lookup ops.
            if weight_name in quantize_params_map.values():
                continue
            embedding_node = graph._find_node_by_name(op.inputs,
                                                      op.input('W')[0])
            # Split fused_embedding_seq_pool consumers first so the plain
            # lookup_table part can be quantized.
            for op_node in embedding_node.outputs:
                if op_node.name() == 'fused_embedding_seq_pool':
                    _split_embedding_seq_pool(graph, op_node)
            if config[op_type]['quantize_type'] == 'abs_max':
                _quant_embedding_abs_max(graph, scope, place, config[op_type],
                                         weight_name, embedding_node)
            elif config[op_type]['quantize_type'] == 'log':
                _quant_embedding_log(graph, scope, place, config[op_type],
                                     weight_name, embedding_node)
            quantize_params_map[weight_name] = _get_quant_var_name(weight_name)
    # The fused ops were split above; remove the now-dead originals.
    for op in all_op:
        if op.name() == 'fused_embedding_seq_pool':
            graph.safe_remove_nodes(op)

    return graph.to_program()
def generate_dot_for_model(model_path, save_graph_dir, save_graph_name):
    """Load an inference model and dump its whole op graph as dot/pdf files.

    Args:
        model_path: directory containing the saved inference model.
        save_graph_dir: output directory (created if missing).
        save_graph_name: basename for the output files; when empty, the
            output directory's name is used instead.
    """
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
    with fluid.scope_guard(inference_scope):
        # Probe the two common saved-model layouts.
        if os.path.exists(os.path.join(model_path, '__model__')):
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        else:
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(model_path, exe,
                                                            'model', 'params')
        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
        if not os.path.exists(save_graph_dir):
            os.makedirs(save_graph_dir)
        model_name = os.path.basename(os.path.normpath(save_graph_dir))
        # BUG FIX: the original used `save_graph_name is ''`, an identity
        # comparison with a string literal that depends on interning and
        # raises a SyntaxWarning on modern CPython; test emptiness instead.
        if not save_graph_name:
            save_graph_name = model_name
        graph.draw(save_graph_dir, save_graph_name, graph.all_op_nodes())
        print(
            "Success! Generated dot and pdf files for {0} model, that can be found at {1} named {2}.\n".
            format(model_name, save_graph_dir, save_graph_name))
def _predict(self,
             test_reader=None,
             model_path=None,
             batch_size=1,
             batch_num=1,
             skip_batch_num=0,
             transform_to_int8=False):
    """Run inference over ``test_reader`` batches and benchmark it.

    Optionally transforms the loaded model to MKL-DNN INT8 first. The first
    ``skip_batch_num`` batches are treated as warm-up and excluded from the
    reported statistics.

    Returns:
        tuple: (average accuracy, average predictions/sec, average latency
        per sample in ms).
    """
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
    with fluid.scope_guard(inference_scope):
        # Probe the two common saved-model layouts.
        if os.path.exists(os.path.join(model_path, '__model__')):
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        else:
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(model_path, exe,
                                                            'model', 'params')
        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
        if (self._debug):
            graph.draw('.', 'qat_orig', graph.all_op_nodes())
        if (transform_to_int8):
            transform_to_mkldnn_int8_pass = Qat2Int8MkldnnPass(
                self._quantized_ops,
                _scope=inference_scope,
                _place=place,
                _core=core,
                _debug=self._debug)
            graph = transform_to_mkldnn_int8_pass.apply(graph)
            inference_program = graph.to_program()

        total_correct = 0
        total_samples = 0
        batch_times = []
        ppses = []  # predictions per second
        iters = 0
        infer_start_time = time.time()
        for data in test_reader():
            if batch_num > 0 and iters >= batch_num:
                break
            # Warm-up is over: reset the counters so the averages exclude
            # the first skip_batch_num batches.
            if iters == skip_batch_num:
                total_samples = 0
                infer_start_time = time.time()
            # Each sample is (input0, input1, label); batch them as int64.
            input0 = np.array([x[0] for x in data]).astype('int64')
            input1 = np.array([x[1] for x in data]).astype('int64')
            labels = np.array([x[2] for x in data]).astype('int64')
            start = time.time()
            out = exe.run(inference_program,
                          feed={
                              feed_target_names[0]: input0,
                              feed_target_names[1]: input1
                          },
                          fetch_list=fetch_targets)
            batch_time = (time.time() - start) * 1000  # in miliseconds
            batch_times.append(batch_time)
            batch_correct = self._get_batch_correct(out, labels)
            batch_len = len(data)
            total_samples += batch_len
            total_correct += batch_correct
            batch_acc = float(batch_correct) / float(batch_len)
            pps = batch_len / batch_time * 1000
            ppses.append(pps)
            latency = batch_time / batch_len
            iters += 1
            appx = ' (warm-up)' if iters <= skip_batch_num else ''
            _logger.info(
                'batch {0}{4}, acc: {1:.4f}, latency: {2:.4f} ms, predictions per sec: {3:.2f}'
                .format(iters, batch_acc, latency, pps, appx))

        # Postprocess benchmark data
        infer_total_time = time.time() - infer_start_time
        batch_latencies = batch_times[skip_batch_num:]
        batch_latency_avg = np.average(batch_latencies)
        latency_avg = batch_latency_avg / batch_size
        ppses = ppses[skip_batch_num:]
        pps_avg = np.average(ppses)
        acc_avg = float(np.sum(total_correct)) / float(total_samples)
        _logger.info(
            'Total inference run time: {:.2f} s'.format(infer_total_time))

        return acc_avg, pps_avg, latency_avg
def freeze_graph(self,
                 use_cuda,
                 seed,
                 activation_quant_type,
                 weight_quant_type='abs_max',
                 for_ci=True,
                 quant_skip_pattern='skip_quant'):
    """End-to-end quantization pipeline test on a small MNIST conv net.

    Trains a quantized model, then runs QuantizationFreezePass,
    ConvertToInt8Pass and TransformForMobilePass on the test graph,
    checking losses and weights along the way and saving server/mobile
    INT8 inference models. When ``for_ci`` is False, intermediate graphs
    are drawn to dot files.
    """
    def build_program(main, startup, is_test):
        # Fix the program-level seeds so train/test graphs are reproducible.
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                img = fluid.layers.data(
                    name='image', shape=[1, 28, 28], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                loss = conv_net(img, label, quant_skip_pattern)
                if not is_test:
                    opt = fluid.optimizer.Adam(learning_rate=0.001)
                    opt.minimize(loss)
        return [img, label], loss

    random.seed(0)
    np.random.seed(0)
    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = build_program(main, startup, False)
    build_program(test_program, startup, True)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)
    # Insert fake quantize/dequantize ops into both graphs.
    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type,
        skip_pattern=quant_skip_pattern)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)
    dev_name = '_gpu_' if use_cuda else '_cpu_'
    if not for_ci:
        # Draw the quantize ops of both graphs for manual inspection.
        marked_nodes = set()
        for op in main_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_' +
                        weight_quant_type, marked_nodes)
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_' +
                        weight_quant_type, marked_nodes)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    quantized_test_program = test_graph.to_program()
    iters = 5
    batch_size = 8

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)
    # Train a few iterations so the weights and scales get real values.
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])
            if not for_ci:
                print('{}: {}'.format('loss' + dev_name +
                                      activation_quant_type + '_' +
                                      weight_quant_type, loss_v))

    test_data = next(test_reader())
    with fluid.program_guard(quantized_test_program):
        w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
                                         quantized_test_program)
    # Testing
    with fluid.scope_guard(scope):
        test_loss1, w_quant = exe.run(program=quantized_test_program,
                                      feed=feeder.feed(test_data),
                                      fetch_list=[loss, w_var])

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_freeze' + dev_name +
                        activation_quant_type + '_' + weight_quant_type,
                        marked_nodes)

    server_program = test_graph.to_program()
    with fluid.scope_guard(scope):
        test_loss2, = exe.run(program=server_program,
                              feed=feeder.feed(test_data),
                              fetch_list=[loss])
    # The frozen graph must produce (almost) the same loss as the
    # fake-quantized one.
    self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
    if not for_ci:
        print(
            '{}: {}'.format('test_loss1' + dev_name + activation_quant_type +
                            '_' + weight_quant_type, test_loss1))
        print(
            '{}: {}'.format('test_loss2' + dev_name + activation_quant_type +
                            '_' + weight_quant_type, test_loss2))
    w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
    # Maybe failed, this is due to the calculation precision
    # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
    if not for_ci:
        print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type +
                              '_' + weight_quant_type, np.sum(w_freeze)))
        print('{}: {}'.format('w_quant' + dev_name + activation_quant_type +
                              '_' + weight_quant_type, np.sum(w_quant)))

    # Convert parameter to 8-bit.
    convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
    convert_int8_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type +
                        '_' + weight_quant_type, marked_nodes)
    server_program_int8 = test_graph.to_program()
    # Save the 8-bit parameter and model file.
    with fluid.scope_guard(scope):
        fluid.io.save_inference_model(
            'server_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, ['image', 'label'], [loss], exe,
            server_program_int8)
        # Test whether the 8-bit parameter and model file can be loaded successfully.
        [infer, feed, fetch] = fluid.io.load_inference_model(
            'server_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, exe)
    # Check the loaded 8-bit weight.
    w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
    self.assertEqual(w_8bit.dtype, np.int8)
    self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
    if not for_ci:
        print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type +
                              '_' + weight_quant_type, np.sum(w_8bit)))
        print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type +
                              '_' + weight_quant_type, np.sum(w_freeze)))

    mobile_pass = TransformForMobilePass()
    mobile_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_mobile' + dev_name +
                        activation_quant_type + '_' + weight_quant_type,
                        marked_nodes)

    mobile_program = test_graph.to_program()
    with fluid.scope_guard(scope):
        fluid.io.save_inference_model(
            'mobile_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, ['image', 'label'], [loss], exe,
            mobile_program)
def mkldnn_based_freeze_graph(self,
                              use_cuda,
                              seed,
                              activation_quant_type,
                              weight_quant_type='abs_max',
                              quant_perf=False,
                              for_ci=False):
    """Train a quantized MNIST model, freeze it, and transform it for
    MKL-DNN INT8 inference, then check the resulting weights and program.

    Runs on CPU regardless of ``use_cuda`` (the MKL-DNN pass is CPU-only).
    """
    random.seed(0)
    np.random.seed(0)
    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = self.build_program(main, startup, False, seed)
    self.build_program(test_program, startup, True, seed)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)
    # Apply the QuantizationTransformPass
    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    quantized_test_program = test_graph.to_program()
    iters = 5
    batch_size = 8

    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    test_reader = paddle.batch(paddle.dataset.mnist.test(),
                               batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)

    # Training the model to get the weights value
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)

    # Transform quantized graph for MKL-DNN INT8 inference
    mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=scope, _place=place)
    mkldnn_int8_pass.apply(test_graph)
    dev_name = '_cpu_'
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_mkldnn' + dev_name +
                        activation_quant_type + '_' + weight_quant_type,
                        marked_nodes)
    mkldnn_program = test_graph.to_program()

    # Check the transformation weights of conv2d and mul
    conv_w_mkldnn = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
    mul_w_mkldnn = np.array(scope.find_var('fc_0.w_0').get_tensor())
    # Check if weights are still integer
    self.assertFalse(self.isinteger(np.sum(conv_w_mkldnn)))
    self.assertFalse(self.isinteger(np.sum(mul_w_mkldnn)))

    # Check if the conv2d output and mul output are correctly linked to fake_dequantize's
    # output
    self.check_program(mkldnn_program)
    if not for_ci:
        # BUG FIX: the original printed np.sum(w_mkldnn), but `w_mkldnn`
        # was never defined, raising NameError whenever for_ci was False.
        # Report both of the weights that were actually checked above.
        print('{}: {}'.format(
            'conv_w_mkldnn' + dev_name + activation_quant_type + '_' +
            weight_quant_type, np.sum(conv_w_mkldnn)))
        print('{}: {}'.format(
            'mul_w_mkldnn' + dev_name + activation_quant_type + '_' +
            weight_quant_type, np.sum(mul_w_mkldnn)))
def create_quant_model(model,
                       params,
                       activation_quantize_type='moving_average_abs_max',
                       weight_quantize_type='channel_wise_abs_max',
                       save=False):
    """Load an inference model, run the quantization transform + freeze
    passes over it with dummy scales, and return it serialized.

    Args:
        model: model filename of the saved inference model.
        params: params filename of the saved inference model.
        activation_quantize_type: activation quantization scheme.
        weight_quantize_type: weight quantization scheme.
        save: when True, also saves the quantized model to
            'test_inference_model'.

    Returns:
        tuple: (serialized program bytes, serialized persistables bytes).
    """
    place = paddle.CUDAPlace(0)
    scope = global_scope()
    exe = paddle.static.Executor(place)
    [inference_program, feed_target_names,
     fetch_targets] = paddle.static.load_inference_model(
         path_prefix=None,
         executor=exe,
         model_filename=model,
         params_filename=params)
    graph = IrGraph(core.Graph(inference_program.desc), for_test=True)

    # Op types whose outputs get an 'out_threshold' attribute below.
    out_scale_op_list = [
        "conv2d",
        "depthwise_conv2d",
        "mul",
        "matmul",
        "relu",
        "leaky_relu",
        "relu6",
        "sigmoid",
        "tanh",
        "prelu",
        "swish",
        "softmax",
        "batch_norm",
        "layer_norm",
        "elementwise_add",
        "pool2d",
        "reshape2",
        "transpose2",
        "concat",
        "elementwise_mul",
        "scale",
        "slice",
        "hard_swish",
        "hard_sigmoid",
        "conv2d_transpose",
        "gru",
        "bilinear_interp",
        "nearest_interp",
        "trilinear_interp",
        "flatten",
        "flatten2",
        "transpose",
        "pad2d",
        "reshape",
        "layer_norm",
    ]
    # Maps op type -> ([input slot names], [output slot names]) as declared
    # by the op, used to resolve the real output variable names.
    op_real_in_out_name = {
        "conv2d": [["Input", "Filter"], ["Output"]],
        "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
        "conv2d_transpose": [["Input", "Filter"], ["Output"]],
        "mul": [["X", "Y"], ["Out"]],
        "matmul": [["X", "Y"], ["Out"]],
        "pool2d": [["X"], ["Out"]],
        "elementwise_add": [["X", "Y"], ["Out"]],
        "concat": [["X"], ["Out"]],
        "softmax": [["X"], ["Out"]],
        "argmax": [["X"], ["Out"]],
        "transpose": [["X"], ["Out"]],
        "equal": [["X", "Y"], ["Out"]],
        "gather": [["X"], ["Out"]],
        "greater_equal": [["X", "Y"], ["Out"]],
        "greater_than": [["X", "Y"], ["Out"]],
        "less_equal": [["X", "Y"], ["Out"]],
        "less_than": [["X", "Y"], ["Out"]],
        "mean": [["X"], ["Out"]],
        "not_equal": [["X", "Y"], ["Out"]],
        "reshape": [["X"], ["Out"]],
        "reshape2": [["X"], ["Out"]],
        "transpose2": [["X"], ["Out"]],
        "bilinear_interp": [["X"], ["Out"]],
        "nearest_interp": [["X"], ["Out"]],
        "trilinear_interp": [["X"], ["Out"]],
        "slice": [["Input"], ["Out"]],
        "squeeze": [["X"], ["Out"]],
        "elementwise_sub": [["X", "Y"], ["Out"]],
        "relu": [["X"], ["Out"]],
        "relu6": [["X"], ["Out"]],
        "leaky_relu": [["X"], ["Out"]],
        "prelu": [["X"], ["Out"]],
        "tanh": [["X"], ["Out"]],
        "swish": [["X"], ["Out"]],
        "dropout": [["X"], ["Out"]],
        "batch_norm": [["X"], ["Y"]],
        "layer_norm": [["X"], ["Y"]],
        "sigmoid": [["X"], ["Out"]],
        "elementwise_mul": [["X", "Y"], ["Out"]],
        "scale": [["X"], ["Out"]],
        "hard_swish": [["X"], ["Out"]],
        "hard_sigmoid": [["X"], ["Out"]],
        "gru": [["Input", "Weight"], ["Hidden"]],
        "lstm": [["Input", "Weight"], ["Hidden"]],
        "pad2d": [["X"], ["Out"]],
        "flatten": [["X"], ["Out"]],
        "flatten2": [["X"], ["Out"]],
    }

    def _get_op_output_var_names(op):
        """Return the output variable names of ``op`` according to the
        op_real_in_out_name table; unknown op types yield []."""
        assert isinstance(op, (IrNode, Operator)), \
            "The input op should be IrNode or Operator."
        var_names = []
        op_name = op.name() if isinstance(op, IrNode) \
            else op.type
        if op_name not in op_real_in_out_name:
            return []

        name_list = op_real_in_out_name[op_name][1]
        for name in name_list:
            var_name = op.output(name)
            if isinstance(var_name, list):
                var_names.extend(var_name)
            else:
                var_names.append(var_name)
        return var_names

    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quantize_type,
        weight_quantize_type=weight_quantize_type)
    transform_pass.apply(graph)

    op_nodes = graph.all_op_nodes()
    for op_node in op_nodes:
        if op_node.name() in out_scale_op_list:
            var_names = _get_op_output_var_names(op_node)
            for var_name in var_names:
                # NOTE(review): despite the name, `in_node` is looked up
                # among the op's *outputs*.
                in_node = graph._find_node_by_name(op_node.outputs, var_name)
                # Only float outputs get a (dummy) threshold attribute.
                if in_node.dtype() not in \
                    [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
                    continue

                op_node.op()._set_attr("out_threshold", 3.0)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quantize_type)
    freeze_pass.apply(graph)

    main_program = graph.to_program()

    # modify fake_quantize_moving_average_abs_max(InScale) and
    # fake_channel_wise_dequantize_max_abs(Scales): give them dummy
    # all-ones scale tensors so the model can run.
    op_nodes = graph.all_op_nodes()
    for op_node in op_nodes:
        if op_node.name() == 'fake_quantize_moving_average_abs_max':
            var_name = op_node.input("InScale")[0]
            tensor = scope.var(var_name).get_tensor()
            tensor.set(np.array([1], dtype=np.float32), place)
        elif op_node.name() == 'fake_channel_wise_dequantize_max_abs':
            var_name = op_node.input("Scales")[0]
            tensor = scope.var(var_name).get_tensor()
            tensor.set(np.ones(tensor.shape(), dtype=np.float32), place)

    if save:
        fluid.io.save_inference_model('test_inference_model',
                                      feed_target_names,
                                      fetch_targets,
                                      exe,
                                      main_program=main_program)

    feed_vars = [
        main_program.global_block().var(name) for name in feed_target_names
    ]
    serialized_program = paddle.static.serialize_program(
        feed_vars, fetch_targets, program=main_program)
    serialized_params = paddle.static.serialize_persistables(
        feed_vars, fetch_targets, executor=exe, program=main_program)
    return serialized_program, serialized_params
def _predict(self,
             test_reader=None,
             model_path=None,
             batch_size=1,
             batch_num=1,
             skip_batch_num=0,
             transform_to_int8=False):
    """Run image-classification inference and benchmark it.

    Transforms the loaded model either to MKL-DNN INT8 or to an MKL-DNN
    FP32 form. The first ``skip_batch_num`` batches are warm-up and are
    excluded from the reported statistics.

    Returns:
        tuple: (per-batch outputs, avg top-1 acc, avg top-5 acc,
        avg fps, avg latency per sample in ms).
    """
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
    with fluid.scope_guard(inference_scope):
        # Probe the two common saved-model layouts.
        if os.path.exists(os.path.join(model_path, '__model__')):
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        else:
            [inference_program, feed_target_names,
             fetch_targets] = fluid.io.load_inference_model(model_path, exe,
                                                            'model', 'params')
        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
        if (self._debug):
            graph.draw('.', 'quant_orig', graph.all_op_nodes())
        if (transform_to_int8):
            mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=inference_scope,
                                                   _place=place)
            graph = mkldnn_int8_pass.apply(graph)
        else:
            graph = self._prepare_for_fp32_mkldnn(graph)

        inference_program = graph.to_program()

        dshape = [3, 224, 224]
        outputs = []
        infer_accs1 = []
        infer_accs5 = []
        fpses = []
        batch_times = []
        total_samples = 0
        iters = 0
        infer_start_time = time.time()
        for data in test_reader():
            if batch_num > 0 and iters >= batch_num:
                break
            # Warm-up finished: restart the sample counter and timer.
            if iters == skip_batch_num:
                total_samples = 0
                infer_start_time = time.time()
            if six.PY2:
                images = map(lambda x: x[0].reshape(dshape), data)
            if six.PY3:
                images = list(map(lambda x: x[0].reshape(dshape), data))
            images = np.array(images).astype('float32')
            labels = np.array([x[1] for x in data]).astype('int64')

            start = time.time()
            out = exe.run(inference_program,
                          feed={feed_target_names[0]: images},
                          fetch_list=fetch_targets)
            batch_time = (time.time() - start) * 1000  # in miliseconds
            outputs.append(out[0])
            batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0], labels)
            infer_accs1.append(batch_acc1)
            infer_accs5.append(batch_acc5)
            samples = len(data)
            total_samples += samples
            batch_times.append(batch_time)
            fps = samples / batch_time * 1000
            fpses.append(fps)
            iters += 1
            appx = ' (warm-up)' if iters <= skip_batch_num else ''
            _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
                         'latency: {3:.4f} ms, fps: {4:.2f}'.format(
                             iters, batch_acc1, batch_acc5,
                             batch_time / batch_size, fps, appx))

        # Postprocess benchmark data
        batch_latencies = batch_times[skip_batch_num:]
        batch_latency_avg = np.average(batch_latencies)
        latency_avg = batch_latency_avg / batch_size
        fpses = fpses[skip_batch_num:]
        fps_avg = np.average(fpses)
        infer_total_time = time.time() - infer_start_time
        acc1_avg = np.mean(infer_accs1)
        acc5_avg = np.mean(infer_accs5)
        _logger.info(
            'Total inference run time: {:.2f} s'.format(infer_total_time))

        return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
def graph_apis(self, use_cuda=False, for_ci=True):
    """Exercise the IrGraph public API surface end to end.

    Builds a small conv network, clones its graph, trains both the
    original and the clone, round-trips persistable variables through
    save/load (verifying the weights survive), and finally checks the
    structural queries: has_circle, graph_num, topology_sort,
    build_adjacency_list and safe_remove_nodes.
    """
    main = fluid.Program()
    startup = fluid.Program()
    with fluid.unique_name.guard():
        with fluid.program_guard(main, startup):
            feeds, loss = conv_block()
            opt = fluid.optimizer.Adam(learning_rate=0.001)
            opt.minimize(loss)
    graph = IrGraph(core.Graph(main.desc), for_test=False)
    backup_graph = graph.clone()
    # A clone must contain exactly as many nodes as its source graph.
    self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes()))

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    origin_binary = fluid.CompiledProgram(graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    backup_binary = fluid.CompiledProgram(
        backup_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)
    iters = 5
    batch_size = 8
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)

    def _train(binary):
        # Run a handful of MNIST mini-batches on the compiled binary.
        for _ in range(iters):
            batch = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(batch),
                             fetch_list=[loss.name])
            if not for_ci:
                print('{}: {}'.format('loss', loss_v))

    _train(origin_binary)
    _train(backup_binary)

    checkpoint_dir = "checkpoint_gpu" if use_cuda else "checkpoint_cpu"

    def _set_zero(var_name, scope, place):
        # Overwrite the named variable's tensor with zeros, in place.
        tensor = scope.find_var(var_name).get_tensor()
        zeros = np.zeros(tensor._get_dims()).astype("float32")
        tensor.set(zeros, place)

    def _weight_sum():
        # Scalar sum of conv2d_1.w_0 in the global scope, used as a
        # cheap fingerprint of the weight contents.
        return np.sum(
            np.array(fluid.global_scope().find_var('conv2d_1.w_0')
                     .get_tensor()))

    # Save -> zero out -> load must restore the original weights.
    sum_before = _weight_sum()
    fluid.io._save_persistable_nodes(exe, checkpoint_dir, graph)
    _set_zero('conv2d_1.w_0', fluid.global_scope(), place)
    set_after = _weight_sum()
    self.assertEqual(set_after, 0)
    fluid.io._load_persistable_nodes(exe, checkpoint_dir, graph)
    sum_after = _weight_sum()
    self.assertEqual(sum_before, sum_after)

    marked_nodes = {
        op
        for op in graph.all_op_nodes() if op.name().find('conv2d') > -1
    }
    if not for_ci:
        graph.draw('.', 'residual', marked_nodes)
        backup_marked_nodes = {
            op
            for op in backup_graph.all_op_nodes()
            if op.name().find('conv2d') > -1
        }
        backup_graph.draw('./origin', 'backup', backup_marked_nodes)

    # Structural queries on the (acyclic, single-component) graph.
    self.assertFalse(graph.has_circle())
    self.assertEqual(graph.graph_num(), 1)
    sorted_ops = graph.topology_sort()
    self.assertEqual(len(sorted_ops), len(graph.all_op_nodes()))
    adjacency = graph.build_adjacency_list()
    self.assertEqual(len(adjacency), len(graph.all_op_nodes()))
    total_nodes = len(graph.all_nodes())
    graph.safe_remove_nodes(marked_nodes)
    self.assertEqual(len(graph.all_nodes()), total_nodes - len(marked_nodes))
def quantization_scale(self,
                       use_cuda,
                       seed,
                       activation_quant_type,
                       weight_quant_type='abs_max',
                       for_ci=False):
    """Train a quantized residual-block model and save a scale-annotated
    inference model.

    Pipeline: build train/test programs with a shared startup program ->
    apply QuantizationTransformPass and AddQuantDequantPass to both
    graphs -> OutScaleForTrainingPass on the training graph only ->
    train ``iters`` MNIST batches -> OutScaleForInferencePass and
    QuantizationFreezePass on the test graph -> write the program text
    to 'quant_scale_model<_cpu|_gpu>.txt' and save the inference model
    to 'quant_scale_model<_cpu|_gpu>'.

    When ``for_ci`` is False, per-step losses are printed and quantize
    ops of the intermediate graphs are rendered via ``draw``.
    """

    def build_program(main, startup, is_test):
        # Build the residual-block net in `main`; attach an Adam
        # optimizer only for the training variant.
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                img = fluid.layers.data(
                    name='image', shape=[1, 28, 28], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                loss = residual_block(img, label, 1)
                if not is_test:
                    opt = fluid.optimizer.Adam(learning_rate=0.0001)
                    opt.minimize(loss)
        return [img, label], loss

    # Fix all RNG seeds so the run is reproducible.
    random.seed(0)
    np.random.seed(0)

    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = build_program(main, startup, False)
    build_program(test_program, startup, True)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)

    # Insert fake quant/dequant ops for weights and activations on both
    # the training and the inference graph.
    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)
    add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
    add_quant_dequant_pass.apply(main_graph)
    add_quant_dequant_pass.apply(test_graph)
    # Out-scale collection happens during training, so this pass goes on
    # the training graph only.
    scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
    scale_training_pass.apply(main_graph)

    dev_name = '_gpu' if use_cuda else '_cpu'
    if not for_ci:
        marked_nodes = set()
        for op in main_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        main_graph.draw('.', 'main_scale' + dev_name, marked_nodes)
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_scale' + dev_name, marked_nodes)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    iters = 5
    batch_size = 8
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])
            if not for_ci:
                print('{}: {}'.format('loss' + dev_name, loss_v))

    # Transfer the collected out-scales onto the inference graph.
    scale_inference_pass = OutScaleForInferencePass(scope=scope)
    scale_inference_pass.apply(test_graph)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()

    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'quant_scale' + dev_name, marked_nodes)

    # Dump the frozen program as text for manual inspection.
    with open('quant_scale_model' + dev_name + '.txt', 'w') as f:
        f.write(str(server_program))

    with fluid.scope_guard(scope):
        fluid.io.save_inference_model('quant_scale_model' + dev_name,
                                      ['image', 'label'], [loss], exe,
                                      server_program)
def check_output_with_option(self,
                             use_gpu,
                             atol=1e-5,
                             flatten=False,
                             quant=False):
    '''
    Check whether calculating on CPU and GPU, enable TensorRT
    or disable TensorRT, enable MKLDNN or disable MKLDNN
    are all the same.

    Runs ``self.main_program`` through the native executor and the
    analysis predictor and asserts their outputs match within ``atol``.
    When ``use_gpu`` and ``self.enable_trt`` it also compares against
    TensorRT; when CPU and ``self.enable_mkldnn`` it compares against
    MKL-DNN. ``flatten`` flattens outputs before comparison. ``quant``
    first rewrites the program with the quantization passes (mutating
    ``self.main_program`` in place).
    '''
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    executor = fluid.Executor(place)
    scope = fluid.Scope()
    device = "GPU" if use_gpu else "CPU"
    with fluid.scope_guard(scope):
        executor.run(self.startup_program)
    if quant:
        main_graph = IrGraph(
            core.Graph(self.main_program.desc), for_test=True)
        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=self.activation_quant_type,
            weight_quantize_type=self.weight_quant_type,
            quantizable_op_type=[
                'conv2d', 'mul', 'depthwise_conv2d', 'conv2d_transpose'
            ])
        transform_pass.apply(main_graph)
        # Map the op under test to the scale variable created by the
        # transform pass, then fill that scale tensor with ones so the
        # freeze pass has deterministic weight scales.
        weight_scale_map = {
            "conv2d": "conv2d_0.w_0.scale",
            "mul": "fc_0.w_0.scale"
        }
        weight_scale_tensor = scope.var(
            weight_scale_map[self.quantized_op_type]).get_tensor()
        weight_scale = np.ones(self.channels).astype("float32")
        weight_scale_tensor.set(weight_scale, place)
        # Attach an output threshold attribute to the quantized op and
        # the following relu; value 0.5 is an arbitrary test constant.
        op_nodes = main_graph.all_op_nodes()
        for op_node in op_nodes:
            if op_node.name() in [self.quantized_op_type, "relu"]:
                op_node.op()._set_attr("out_threshold", 0.5)
        with fluid.scope_guard(scope):
            executor.run(program=self.main_program,
                         feed=self.feeds,
                         fetch_list=self.fetch_list)
        freeze_pass = QuantizationFreezePass(
            scope=scope,
            place=place,
            weight_quantize_type=self.weight_quant_type)
        freeze_pass.apply(main_graph)
        # NOTE: replaces self.main_program with the frozen program.
        self.main_program = main_graph.to_program()

    outs = self._save_models(executor, self.main_program, scope)
    analysis_outputs = self._get_analysis_outputs(
        self._get_analysis_config(use_gpu=use_gpu))

    # Check whether the results calculated on CPU and on GPU are the same.
    self.assertTrue(
        len(outs) == len(analysis_outputs),
        "The number of outputs is different between inference and training forward at {}"
        .format(device))
    for out, analysis_output in zip(outs, analysis_outputs):
        out = np.array(out)
        if flatten:
            out = out.flatten()
            analysis_output = analysis_output.flatten()
        self.assertTrue(
            np.allclose(out, analysis_output, atol=atol),
            "Output has diff between inference and training forward at {} "
            .format(device))

    # Check whether the trt results and the GPU results are the same.
    if use_gpu and self.enable_trt:
        tensorrt_outputs = self._get_analysis_outputs(
            self._get_analysis_config(
                use_gpu=use_gpu, use_trt=self.enable_trt))
        if self.trt_parameters.use_static:
            #deserialize
            # Second run with use_static — presumably exercises loading
            # the engine serialized by the first run; verify against the
            # analysis-config implementation.
            tensorrt_outputs = self._get_analysis_outputs(
                self._get_analysis_config(
                    use_gpu=use_gpu, use_trt=self.enable_trt))
        self.assertTrue(
            len(tensorrt_outputs) == len(outs),
            "The number of outputs is different between GPU and TensorRT. ")
        for out, tensorrt_output in zip(outs, tensorrt_outputs):
            out = np.array(out)
            if flatten:
                out = out.flatten()
                tensorrt_output = tensorrt_output.flatten()
            self.assertTrue(
                np.allclose(out, tensorrt_output, atol=atol),
                "Output has diff between GPU and TensorRT. ")

    # Check whether the mkldnn results and the CPU results are the same.
    if (not use_gpu) and self.enable_mkldnn:
        mkldnn_outputs = self._get_analysis_outputs(
            self._get_analysis_config(
                use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn))
        self.assertTrue(
            len(outs) == len(mkldnn_outputs),
            "The number of outputs is different between CPU and MKLDNN. ")
        # bfloat16 is lower precision, so relax the tolerance.
        if self.enable_mkldnn_bfloat16:
            atol = 0.01
        for out, mkldnn_output in zip(outs, mkldnn_outputs):
            self.assertTrue(
                np.allclose(np.array(out), mkldnn_output, atol=atol),
                "Output has diff between CPU and MKLDNN. ")