def nndct_warn_print(string):
  if GLOBAL_MAP.get_ele(NNDCT_KEYS.WARN_FLAG):
    logger = GLOBAL_MAP.get_ele(NNDCT_KEYS.LOGGER)
    if logger:
      logger.warning("[NNDCT_WARN] {}".format(string))
    else:
      print("[NNDCT_WARN] {}".format(string))

def _do_map(output_name, node_name):
  if output_name == node_name:
    return
  if not GLOBAL_MAP.get_ele(NNDCT_KEYS.OUTPUT_TO_NODE_MAP):
    GLOBAL_MAP.set_map(NNDCT_KEYS.OUTPUT_TO_NODE_MAP, {})
  if not GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_TO_OUTPUT_MAP):
    GLOBAL_MAP.set_map(NNDCT_KEYS.NODE_TO_OUTPUT_MAP, {})

  # map output to node
  output_to_node_map = GLOBAL_MAP.get_ele(NNDCT_KEYS.OUTPUT_TO_NODE_MAP)
  if output_name not in output_to_node_map:
    nndct_debug_print(
        "<map_output_and_node> map output {} and node {}".format(output_name, node_name),
        level=NNDCT_DEBUG_LVL.BUILD_GRAPH)
    output_to_node_map[output_name] = node_name
  else:
    assert output_to_node_map[output_name] == node_name, \
        "restored node name for output_name {} is {}, met new node name {}".format(
            output_name, output_to_node_map[output_name], node_name)

  # add output to the list keyed by node_name
  node_to_output_map = GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_TO_OUTPUT_MAP)
  if node_name not in node_to_output_map:
    node_to_output_map[node_name] = [output_name]
  else:
    node_to_output_map[node_name].append(output_name)

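# A minimal usage sketch of `_do_map` (the names below are illustrative;
# GLOBAL_MAP/NNDCT_KEYS are assumed to come from nndct_shared.utils, as
# elsewhere in this codebase). The two global tables end up as inverse views:
#
#   _do_map("conv1:0", "conv1")
#   _do_map("conv1:1", "conv1")
#   GLOBAL_MAP.get_ele(NNDCT_KEYS.OUTPUT_TO_NODE_MAP)  # {"conv1:0": "conv1", "conv1:1": "conv1"}
#   GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_TO_OUTPUT_MAP)  # {"conv1": ["conv1:0", "conv1:1"]}
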
def nndct_error_print(string):
  if GLOBAL_MAP.get_ele(NNDCT_KEYS.ERROR_FLAG):
    logger = GLOBAL_MAP.get_ele(NNDCT_KEYS.LOGGER)
    if logger:
      logger.error("[NNDCT_ERROR] {}".format(string))
    else:
      print("[NNDCT_ERROR] {}".format(string))
    sys.exit(1)

def nndct_debug_print(string, title='', level=1):
  if (GLOBAL_MAP.get_ele(NNDCT_KEYS.DEBUG_FLAG) and
      level <= GLOBAL_MAP.get_ele(NNDCT_KEYS.VERBOSE_LEVEL)):
    logger = GLOBAL_MAP.get_ele(NNDCT_KEYS.LOGGER)
    if title == 'Start':
      string = "\n********************* <{} : {}> *********************".format(title, string)
    elif title == 'End':
      string = "\n********************* <{} : {}> *********************\n".format(title, string)
    if logger:
      logger.debug("[NNDCT_DEBUG_Lv_{}] {}".format(level, string))
    else:
      print("[NNDCT_DEBUG_Lv_{}] {}".format(level, string))

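# Usage sketch for the three printers above (an assumption read off the
# getters: the flag, verbosity, and logger entries are registered once at
# startup via GLOBAL_MAP, then callers invoke the helpers unconditionally):
#
#   GLOBAL_MAP.set_map(NNDCT_KEYS.DEBUG_FLAG, True)
#   GLOBAL_MAP.set_map(NNDCT_KEYS.VERBOSE_LEVEL, NNDCT_DEBUG_LVL.BUILD_GRAPH)
#   nndct_debug_print("parsing graph", title='Start', level=NNDCT_DEBUG_LVL.BUILD_GRAPH)
#   ...
#   nndct_debug_print("parsing graph", title='End', level=NNDCT_DEBUG_LVL.BUILD_GRAPH)
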
def _graph2module(op):
  # NOTE: `cls` and `module` below are resolved from the enclosing scope;
  # this helper is evidently defined as a closure inside a (class)method.
  node = getattr(op, "node", None)
  for param_type, tensor in node.op.params.items():
    py_tensor_util.param_to_torch_format(tensor)
    data = np.copy(tensor.data)

    if (node.op.type in [NNDCT_OP.CONVTRANSPOSE2D, NNDCT_OP.CONVTRANSPOSE3D] and
        param_type == node.op.ParamName.WEIGHTS):
      # data = data.transpose(1, 0, 2, 3)
      data = data.swapaxes(0, 1)
      data = np.ascontiguousarray(data)

    if (node.op.type in [NNDCT_OP.DEPTHWISE_CONV2D, NNDCT_OP.DEPTHWISE_CONV3D] and
        param_type == node.op.ParamName.WEIGHTS):
      out_channels = node.node_config("out_channels")
      kernel_size = node.node_config("kernel_size")
      data = data.reshape((out_channels, 1, *kernel_size))

    if (node.op.type in [NNDCT_OP.DEPTHWISE_CONVTRANSPOSE2D, NNDCT_OP.DEPTHWISE_CONVTRANSPOSE3D] and
        param_type == node.op.ParamName.WEIGHTS):
      in_channels = node.node_config("in_channels")
      kernel_size = node.node_config("kernel_size")
      data = data.reshape((1, in_channels, *kernel_size))
      data = data.swapaxes(0, 1)
      data = np.ascontiguousarray(data)

    torch_tensor = torch.from_numpy(data)
    param_name = cls._parameter_map.get(param_type, param_type.value)
    if node.has_bound_params():
      if hasattr(op, param_name):
        if isinstance(getattr(op, param_name), torch.Tensor):
          torch_tensor = torch_tensor.to(getattr(op, param_name))
        else:
          torch_tensor = torch_tensor.to(getattr(op, param_name).data)

        if param_name in op._buffers:
          op._buffers[param_name] = torch_tensor
        else:
          op._parameters[param_name] = torch.nn.Parameter(torch_tensor)
      else:
        NndctScreenLogger().warning(
            f"new parameter: '{param_name}' is registered in {node.name}")
        op.register_parameter(param_name, torch.nn.Parameter(torch_tensor))
    else:
      torch_tensor = torch_tensor.to(device=GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE))
      module.register_parameter(param_name, torch.nn.Parameter(torch_tensor))

    py_tensor_util.param_to_nndct_format(tensor)

def forward(self, input):
  [input], _ = process_inputs_and_params(self.node, self.quantizer, inputs=[input])
  if NndctOption.nndct_quant_off.value or NndctOption.nndct_cv_app.value:
    output = super().forward(input)
    # quantize output
    [output] = post_quant_process(self.node, [output])
  elif self.quant_mode > 0:
    output = torch.empty_like(input)
    if NndctOption.nndct_tanh_sigmoid_sim.value > 0:
      NndctSigmoidSimulation(input, output)
      [output] = post_quant_process(self.node, [output])
    else:
      input_name = self.node.in_nodes[0]
      fragpos = self.quantizer.get_bnfp(input_name, False)[1]
      quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
      Ttable = SIGMOID_TABLE.table.to(quant_device)
      output = output.to(quant_device)
      NndctSigmoidTableLookup(input, Ttable, output, fragpos)
  else:
    output = super().forward(input)
  return output

def __init__(self, file_name=None):
  file_name = file_name or GLOBAL_MAP.get_ele(NNDCT_KEYS.MODIFIER).nndct_prefix + '.py'
  Exception.__init__(
      self,
      "The rebuilt graph mismatches the original graph; please manually modify '{}' and run again".format(file_name))

def set_op_class_type(self, force_to_primitive: bool, schema: "Schema", class_type=None):
  if class_type is not None:
    self.op_class_type = TorchOpClassType.CUSTOM_FUNCTION
  elif schema is not None:
    schema2torchop = GLOBAL_MAP.get_ele(NNDCT_KEYS.TORCH_SCHEMA_OP_TABLE)
    schema_handler = SchemaHelper(schema)
    torchop = schema2torchop[schema_handler.toString()]
    self.op_class_type = torchop.op_class_type
  else:
    if force_to_primitive:
      self.op_class_type = TorchOpClassType.PRIMITIVE
    else:
      if self.op_name in dir(torch.nn):
        self.op_class_type = TorchOpClassType.NN_MODULE
        self.op_name = '.'.join(['torch', 'nn', self.op_name])
      elif self.op_name in dir(torch.nn.functional):
        self.op_class_type = TorchOpClassType.NN_FUNCTION
        self.op_name = '.'.join(['torch', 'nn', 'functional', self.op_name])
      elif self.op_name in dir(torch) and isinstance(getattr(torch, self.op_name), Callable):
        self.op_class_type = TorchOpClassType.TORCH_FUNCTION
        self.op_name = '.'.join(['torch', self.op_name])
      elif self.op_name in dir(torch.Tensor):
        self.op_class_type = TorchOpClassType.TENSOR
      else:
        self.op_class_type = TorchOpClassType.UNKNOWN

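# Worked example of the fallback chain above: for op_name "relu" (no custom
# class_type, no schema, force_to_primitive=False), "relu" is absent from
# dir(torch.nn) (which holds class names such as ReLU) but present in
# dir(torch.nn.functional), so the op resolves to TorchOpClassType.NN_FUNCTION
# and op_name is rewritten to "torch.nn.functional.relu".
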
def custom_op(self, node, *args):
  node2caller = GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_CALLER_MAP)
  if node2caller is None:
    node2caller: Dict[str, Callable] = {}
    GLOBAL_MAP.set_map(NNDCT_KEYS.NODE_CALLER_MAP, node2caller)
  node2caller[node.name] = node.caller
  op = TorchCustomOperation(node.raw_kind, node.raw_kind)
  for i, arg in enumerate(args):
    op.set_config(str(i), arg)
  attrs = GLOBAL_MAP.get_ele(NNDCT_KEYS.CUSTOM_OP_ATTRS_MAP).get(node.raw_kind, None)
  if attrs:
    # trailing positional args carry the declared attribute values
    attr_vals = args[len(args) - len(attrs):]
    for name, val in zip(attrs, attr_vals):
      op.set_attr_by_name(name, val)
  return op

def wrapper(*args, **kwargs):
  error_flag = GLOBAL_MAP.get_ele(NNDCT_KEYS.ERROR_FLAG)
  if error_flag:
    print("[NNDCT_ERROR]", end='')
  ret = func(*args, **kwargs)
  if error_flag:
    exit(1)
  return ret

def export_quant_config(self):
  """Export bitwidth and fixpoint info of blobs and parameters under the work dir."""
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode == 1:
    quantizer.export_quant_config()

def dump_xmodel(self, deploy_check=False):
  """Dump xmodel for LSTM cell."""
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode > 1:
    compiler = CompilerFactory.get_compiler("xmodel")
    xmodel_dir = os.path.join(self._export_folder, "xmodel")
    create_work_dir(xmodel_dir)
    for info in self._modules_info.values():
      for l_num, layer_graph in enumerate(info["layers_graph"]):
        for lstm_direction, graph in layer_graph.items():
          try:
            compiler.do_compile(
                nndct_graph=graph,
                quant_config_info=quantizer.quant_config,
                output_file_name=os.path.join(xmodel_dir, graph.name),
                graph_attr_kwargs={"direction": lstm_direction})
          except Exception as e:
            print(f"[NNDCT_ERROR]: failed to convert nndct graph to xmodel ({str(e)}).")
          else:
            print("[NNDCT_NOTE]: Successfully converted nndct graph to xmodel!")

    if deploy_check:
      print("[NNDCT_NOTE]: Dumping checking data...")
      checker = DeployChecker(output_dir_name=self._export_folder, data_format="txt")
      # get timestep output
      for name, info in self._layers_info.items():
        cell = info["cell_module"]
        layer = info["layer_module"]
        graph = info["graph"]
        if layer.input is None:
          warnings.warn(
              f"[NNDCT_WARNING]: Provide inputs for '{name}' when doing deploy checking",
              RuntimeWarning)
          continue
        set_outputs_recorder_status(cell, True)
        layer(layer.input, layer.initial_state, layer.batch_lengths)

        for timestep in range(layer.input.size()[1]):
          enable_dump_weight = (timestep == 0)
          update_nndct_blob_data(cell, graph, timestep)
          checker.update_dump_folder(f"{graph.name}/frame_{timestep}")
          checker.dump_nodes_output(
              graph,
              quantizer.quant_config,
              round_method=quantizer.quant_opt['round_method'],
              enable_dump_weight=enable_dump_weight)
        set_outputs_recorder_status(cell, False)
      print("[NNDCT_NOTE]: Finished dumping data.")

def dump_xmodel(output_dir="quantize_result", deploy_check=False):
  r"""Converts module to xmodel for deployment compilation.

  Only works when quant mode == 2. The xmodel and some checking data
  will be generated under the work dir.

  Args:
    deploy_check (bool): if True, dump blobs and parameters of the model
      for deployment verification.

  Returns:
    None
  """
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode > 1:
    nndct_utils.create_work_dir(output_dir)
    # compile to xmodel
    compiler = CompilerFactory.get_compiler("xmodel")
    NndctScreenLogger().info("=>Converting to xmodel ...")
    deploy_graphs = get_deploy_graph_list(quantizer.quant_model, quantizer.Nndctgraph)
    deploy_infos = compiler.get_deloy_graph_infos(quantizer, deploy_graphs)
    for deploy_info in deploy_infos:
      try:
        compiler.do_compile(
            deploy_info.dev_graph,
            quant_config_info=deploy_info.quant_info,
            output_file_name=os.path.join(output_dir, deploy_info.dev_graph.name))
      except AddXopError as e:
        NndctScreenLogger().error(
            f"Failed to convert graph '{deploy_info.dev_graph.name}' to xmodel ({str(e)}).")

      # dump data for accuracy check
      if deploy_check:
        NndctScreenLogger().info(
            f"=>Dumping '{deploy_info.dev_graph.name}' checking data...")
        checker = DeployChecker(output_dir_name=output_dir)
        checker.update_dump_folder(f"{deploy_info.dev_graph.name}")
        checker.dump_nodes_output(
            deploy_info.dev_graph,
            deploy_info.quant_info,
            round_method=quantizer.quant_opt['round_method'],
            select_batch=False)
        NndctScreenLogger().info(f"=>Finished dumping data. ({checker.dump_folder})")
    set_outputs_recorder_status(quantizer.quant_model, False)

def calib_global_param(self):
  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  for tensor_type, algo_dict in self._QuantAlgo.items():
    for name, algo in algo_dict.items():
      if not algo.statistic_local:
        # q_config layout (as assigned below): [bit_width, scale, zero_point, float_max]
        q_config = self.get_quant_config(name, False, tensor_type)
        if q_config[0] < 32:
          algo.calib_global_statis(quant_device)
          q_config[1], q_config[2], q_config[3] = algo.scale, algo.zero_point, algo.float_max
        self.set_quant_config(name, q_config, tensor_type)

def default(self, node, *args):
  schema2torchop = GLOBAL_MAP.get_ele(NNDCT_KEYS.TORCH_SCHEMA_OP_TABLE)
  schema_handler = SchemaHelper(node.schema)
  torchop = schema2torchop.get(schema_handler.toString(), None)
  if torchop is None:
    op = TorchUnknownOperation(node.raw_kind)
    return op
  node2caller = GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_CALLER_MAP)
  if node2caller is None:
    node2caller: Dict[str, Callable] = {}
    GLOBAL_MAP.set_map(NNDCT_KEYS.NODE_CALLER_MAP, node2caller)
  node2caller[node.name] = torchop.caller
  op = TorchBaseOperation(schema_handler.op_name, torchop.name, schema=node.schema)
  # op.set_caller(torchop.caller)
  assert len(args) == len(schema_handler.get_arguments())
  if len(args) == 1:
    return op
  arg_name_convertor = {"self": "input"}
  for inp, arg in zip(args, schema_handler.get_arguments()):
    arg_name = schema_handler.arg_name(arg)
    if torchop.op_class_type == TorchOpClassType.TENSOR and arg_name == "self":
      continue
    if arg_name in ["layout", "memory_format", "pin_memory"]:
      continue
    config_name = arg_name_convertor.get(arg_name, arg_name)
    if convert_type_str(schema_handler.arg_type(arg)).replace("?", "") == "bool":
      inp = bool(inp) if inp is not None else inp
    if convert_type_str(schema_handler.arg_type(arg)).replace("?", "") == "str":
      inp = f"'{inp}'" if inp is not None else inp
    if arg_name == "device":
      inp = f"'{self._device_type}'"
    if arg_name == "dtype":
      inp = scalar_type_to_pytorch_type[inp] if inp is not None else inp
    op.set_config(config_name, inp)
  return op

def do_quantize(self, blob, name, node=None, tensor_type='input'):
  # forward quant graph but do not quantize parameter and activation
  if NndctOption.nndct_quant_off.value:
    return blob

  blob_save = blob
  if isinstance(blob.values, torch.Tensor):
    blob = blob.values

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if blob.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}; device of model and data should match device of quantizer".format(quant_device.type))

  if (NndctOption.nndct_quant_opt.value and
      NndctOption.nndct_logging_level.value > 0):
    quant_data = nndct_quant.QuantizeData(name, blob.cpu().detach().numpy())

  # quantize the tensor
  bnfp = self.get_bnfp(name, True, tensor_type)
  #print('---- quant %s with 1/step = %g' % (name, bnfp[1]))
  # hardware cut method
  mth = 4 if self.lstm else 2
  if tensor_type == 'param':
    mth = 3
  res = py_nndct.nn.NndctFixNeuron(blob, blob, maxamp=[bnfp[0], bnfp[1]], method=mth)

  if (NndctOption.nndct_quant_opt.value and
      NndctOption.nndct_logging_level.value > 0):
    global global_snr_inv
    quant_efficiency, sqnr = quant_data.quant_efficiency(blob.cpu().detach().numpy(), 8)
    global_snr_inv += 1 / sqnr
    print(f"quant_efficiency={quant_efficiency}, global_snr_inv={global_snr_inv} {quant_data._name}\n")

  # update param to nndct graph
  if tensor_type == 'param':
    self.update_param_to_nndct(node, name, res.cpu().detach().numpy())

  blob = blob_save
  res = blob_save
  return res

def node_from_output(output_name, model_type):
  if model_type == 'Nndct':
    return output_name
  if model_type == 'tensorflow':
    output_name = output_name.split(':')[0]
  elif model_type == 'torch':
    # strip only the trailing '_forward'/'_backward' token, keeping interior underscores
    if output_name.split('_')[-1] in ['backward', 'forward']:
      output_name = '_'.join(output_name.split('_')[:-1])
  else:
    raise KeyError("node_from_output is not available for model type " + str(model_type))
  output_map = GLOBAL_MAP.get_ele(NNDCT_KEYS.OUTPUT_TO_NODE_MAP)
  if output_map and output_name in output_map:
    return output_map[output_name]
  return output_name

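# Worked example for the 'torch' branch above (illustrative names): a traced
# output such as "features_conv1_forward" drops only the trailing token before
# the OUTPUT_TO_NODE_MAP lookup, while tensorflow names drop the port suffix:
#
#   node_from_output("features_conv1_forward", "torch")  # -> "features_conv1" (if not remapped)
#   node_from_output("dense:0", "tensorflow")            # -> "dense" (if not remapped)
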
def finetune_v2(self, run_fn, run_args):
  # check status
  if self.quantizer.quant_mode == 2:
    NndctScreenLogger().warning("Finetune function will be ignored in test mode!")
    return

  # parameter finetuning
  with AdaQuant(processor=self):
    # calibration to get a set of quantization steps
    NndctScreenLogger().info("=>Preparing data for fast finetuning module parameters ...")
    with NoQuant():
      net_inputs, net_outputs = self.cache_net_inpouts(run_fn, run_args)

    NndctScreenLogger().info("=>Finding initial quantization steps for fast finetuning...")
    self.calibrate(run_fn, run_args)

    NndctScreenLogger().info("=>Fast finetuning module parameters for better quantization accuracy...")
    self.setup_test()
    device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
    initial_net_loss = self.calc_net_loss(net_inputs, net_outputs, device)

    layer_act_pair = self.collect_layer_act_pair()

    finetune_group = []
    for qmod, fmod in zip(self._quant_model.modules(), self._float_model.modules()):
      if hasattr(qmod, "node"):
        if (self.quantizer.configer.is_node_quantizable(qmod.node, False) and
            len(qmod.node.op.params) > 0):
          finetune_group.append([qmod.node, fmod])

    net_loss = initial_net_loss
    for idx, (qnode, fmod) in tqdm(enumerate(finetune_group), total=len(finetune_group)):
      # cache layer activations only for the later half of the layers
      is_cached = self.is_cached(qnode, len(net_inputs[0]))
      if (is_cached and idx < len(finetune_group) / 2) or (not is_cached):
        need_cache = False
      else:
        need_cache = True
      net_loss = self.optimize_layer_v2(qnode, fmod, layer_act_pair, net_inputs,
                                        net_outputs, net_loss, device, need_cache)
      print(f"%%%%%%%%%%%%%%%%% final opt net loss: {net_loss.avg}")
      # print(f"{qnode.name}({need_cache}):{net_loss}")

  NndctScreenLogger().info("=>Exporting fast finetuned parameters ...")
  # export finetuned parameters
  self.quantizer.export_param()

def clone_quant_module(cls, quant_module):
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if _is_module_hooked(quant_module):
    # detach graph/quantizer hooks so deepcopy does not drag them along
    cls.detach_node_from_module(quant_module)
    cls.hook_module_with_quantizer(quant_module, None)
    new_quant_module = copy.deepcopy(quant_module)
    # re-attach the original module, then hook the clone to a cloned graph
    cls.hook_module_with_node(quant_module, quantizer.graph)
    cls.hook_module_with_quantizer(quant_module, quantizer)
    new_graph = Graph(graph_name=quantizer.graph.name)
    new_graph.clone_from(quantizer.graph)
    cls.hook_module_with_node(new_quant_module, new_graph)
    cls.hook_module_with_quantizer(new_quant_module, quantizer)
  else:
    new_quant_module = copy.deepcopy(quant_module)
  return new_quant_module

def build_aten_torch_ops_table():
  op_gathering_fns = (_get_tensor_ops,
                      _get_nn_functional_ops,
                      _get_torchscript_builtins,
                      _get_global_builtins,
                      _get_math_builtins)
  schema2torchop = GLOBAL_MAP.get_ele(NNDCT_KEYS.TORCH_SCHEMA_OP_TABLE)
  # schema_lut = GLOBAL_MAP.get_ele(NNDCT_KEYS.SCHEMA_LUT)
  if not schema2torchop:
    schema2torchop: Dict[str, TorchOp] = {}
    GLOBAL_MAP.set_map(NNDCT_KEYS.TORCH_SCHEMA_OP_TABLE, schema2torchop)
    # schema_lut: Dict[Tuple[str, int], "Schema"] = {}
    for fn in op_gathering_fns:
      fn()

def dump_xmodel(output_dir="quantize_result", deploy_check=False):
  r"""Converts module to xmodel for deployment compilation.

  Only works when quant mode == 2. The xmodel and some checking data
  will be generated under the work dir.

  Args:
    deploy_check (bool): if True, dump blobs and parameters of the model
      for deployment verification.

  Returns:
    None
  """
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode > 1:
    nndct_utils.create_work_dir(output_dir)
    # compile to xmodel
    try:
      compiler = CompilerFactory.get_compiler("xmodel")
      NndctScreenLogger().info("=>Converting to xmodel ...")
      compiler.do_compile(
          nndct_graph=quantizer.Nndctgraph,
          quant_config_info=quantizer.quant_config,
          output_file_name=os.path.join(output_dir, quantizer.Nndctgraph.name))
    except AddXopError as e:
      NndctScreenLogger().error(f"Failed to convert nndct graph to xmodel ({str(e)}).")
    else:
      NndctScreenLogger().info(f"=>Successfully converted to xmodel. ({compiler.xmodel_file})")

    # dump data for accuracy check
    if deploy_check:
      NndctScreenLogger().info("=>Dumping checking data...")
      update_nndct_blob_data(quantizer.quant_model, quantizer.Nndctgraph)
      checker = DeployChecker(output_dir_name=output_dir)
      checker.dump_nodes_output(
          quantizer.Nndctgraph,
          quantizer.quant_config,
          round_method=quantizer.quant_opt['round_method'])
      set_outputs_recorder_status(quantizer.quant_model, False)
      NndctScreenLogger().info(f"=>Finished dumping data. ({checker.dump_folder})")

def export_onnx_model(self, output_dir, verbose=False):
  from torch.onnx import register_custom_op_symbolic
  from torch.onnx.symbolic_helper import parse_args
  import sys

  torch_version = torch.__version__.split('.')
  if int(torch_version[0]) == 1 and int(torch_version[1]) < 7:
    NndctScreenLogger().error(
        'Exporting an onnx model is only supported with pytorch 1.7 and later versions')
    return

  @parse_args("v", "i", "i", "f", "i", "i", "i", "i")
  def symbolic_fix_neuron(g, input, valmin, valmax, valamp, zero_point, method, device_id, inplace):
    #print(f'{valmax} {valamp} {method} {device_id}')
    if valamp < sys.float_info.min:
      scale = torch.tensor(sys.float_info.max).float()  # avoid the exporter generating double type
    else:
      scale = torch.tensor(1.0 / valamp).float()  # avoid the exporter generating double type
    zero_point = torch.tensor(0, dtype=torch.int8)  # ONNX requires zero_point to be a tensor
    return g.op("DequantizeLinear",
                g.op("QuantizeLinear", input, scale, zero_point),
                scale, zero_point)

  register_custom_op_symbolic("vai::fix_neuron", symbolic_fix_neuron, 9)
  output_file = os.path.join(output_dir, f"{self.quantizer.quant_model._get_name()}_int.onnx")
  opset_version = torch.onnx.symbolic_helper._onnx_stable_opsets[-1]
  device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  self.quantizer.reset_status_for_exporting()
  model, input_args = to_device(self.quantizer.quant_model, self._example_inputs, device)
  torch.onnx.export(self.quantizer.quant_model, input_args, output_file,
                    verbose=verbose, opset_version=opset_version)

def export_traced_torch_script(self, output_dir, verbose=False):
  torch_version = torch.__version__.split('.')
  if int(torch_version[0]) == 1 and int(torch_version[1]) < 7:
    NndctScreenLogger().error(
        'Exporting torch script is only supported with pytorch 1.7 and later versions')
    return
  self.quantizer.reset_status_for_exporting()
  device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  force_cpu = os.getenv('NNDCT_FORCE_CPU_DUMP')
  if force_cpu is not None:
    device = torch.device('cpu')
    GLOBAL_MAP.set_map(NNDCT_KEYS.QUANT_DEVICE, device)
  model, input_args = to_device(self.quantizer.quant_model, self._example_inputs, device)
  script_module = torch.jit.trace(model, input_args, check_trace=False)
  output_file = os.path.join(output_dir, f"{self.quantizer.quant_model._get_name()}_int.pt")
  if verbose:
    print(script_module.inlined_graph)
  torch.jit.save(script_module, output_file)

def forward(self, input):
  qinput = quantize_tensors([input], self.node, tensor_type='input')[0]
  if NndctOption.nndct_quant_off.value or NndctOption.nndct_cv_app.value:
    output = super().forward(qinput)
    output = quantize_tensors([output], self.node)[0]
  elif self.quant_mode > 0:
    output = torch.empty_like(qinput)
    if NndctOption.nndct_tanh_sigmoid_sim.value > 0:
      NndctTanhSimulation(input, output)
      output = quantize_tensors([output], self.node)[0]
    else:
      input_name = self.node.in_nodes[0]
      fragpos = self.quantizer.get_quant_config(input_name, False)[1]
      quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
      Ttable = TANH_TABLE.table.to(quant_device)
      output = output.to(quant_device)
      NndctTanhTableLookup(input, Ttable, output, fragpos)
  else:
    output = super().forward(qinput)
  return output

def maybe_get_quantizer(quantizer=None):
  quantizer = quantizer or GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer:
    return quantizer.quant_mode, quantizer
  else:
    return GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_MODE), None

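# Typical call-site sketch (hedged; the guard below is illustrative): quantized
# modules fetch the process-wide quantizer instead of threading it through
# every constructor.
#
#   quant_mode, quantizer = maybe_get_quantizer()
#   if quantizer is not None and quant_mode > 0:
#     blob = quantizer.do_quantize(blob, name, node, tensor_type='input')
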
def do_scan(self, res, name, node=None, tensor_type='input'):
  # keep quantization steps after fast finetune
  if self.keep_fp:
    return self.do_quantize(res, name, node, tensor_type)

  # forward quant graph but do not quantize parameter and activation
  if NndctOption.nndct_quant_off.value:
    if self.inplace:
      return res
    else:
      return res.clone().detach()

  res_save = None
  if isinstance(res.values, torch.Tensor):
    res_save = res
    res = res.values.data

  if res.dtype != torch.float32 and res.dtype != torch.double:
    NndctScreenLogger().warning_once(
        f'The tensor type of {node.name} is {str(res.dtype)}. '
        'Only float32/double quantization is supported.')
    return res_save if res_save is not None else res

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if res.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}; device of model and data should match device of quantizer".format(quant_device.type))

  # get fixed position
  bnfp = self.get_quant_config(name, False, tensor_type)

  # hardware cut method
  mth = 4 if self.lstm else 2
  if NndctOption.nndct_use_torch_quantizer.value is True:
    mth = -1
  elif tensor_type == 'param':
    mth = 3

  scope = 5 if NndctOption.nndct_diffs_mode.value == "mse" else 1
  # set fix pos scanning scope to 1 for some types of tensors
  if node.op.type in [NNDCT_OP.INPUT, NNDCT_OP.QUANT_STUB]:
    scope = 1
  if self.lstm and tensor_type == 'input':
    scope = 1

  res = res.detach().clone()
  Tbuffer = torch.empty_like(res).to(quant_device)
  Tfixpos = torch.tensor([1], dtype=torch.get_default_dtype()).to(quant_device)

  # activations always calculate fix pos;
  # parameters calculate fix pos if it is None;
  # always calculate fix pos in finetune mode
  if tensor_type != 'param' or bnfp[1] is None or self.quant_mode == 3:
    py_nndct.nn.NndctDiffsFixPos(
        Tinput=res,
        Tbuffer=Tbuffer,
        Tfixpos=Tfixpos,
        bit_width=bnfp[0],
        range=scope,
        method=mth)
    bnfp[1] = int(Tfixpos.item())
    # limit max fix pos to 12 if bit width <= 8, otherwise limit to 15
    if bnfp[0] <= 8 or self.lstm:
      max_fp = NndctOption.nndct_max_fix_position.value
      bnfp[1] = min(max_fp, bnfp[1])
    else:
      bnfp[1] = min(15, bnfp[1])
    # record fix pos of activations
    if tensor_type != 'param':
      self.config_history[tensor_type][name].append(bnfp[1])
      if NndctOption.nndct_stat.value > 1:
        print(f'---- fp history: {stats.mode(np.array(self.config_history[tensor_type][name]))}')
      data = np.array(self.config_history[tensor_type][name])
      bnfp[1] = stats.mode(data)[0][0]
      bnfp[1] = bnfp[1].astype(np.int32).tolist()
    self.set_quant_config(name, bnfp, tensor_type)
    if NndctOption.nndct_stat.value > 1:
      print('---- quant %s tensor: %s with bw = %d and fp = %g' %
            (tensor_type, name, bnfp[0], bnfp[1]))

  # get 2^bit_width and 2^fracpos
  bnfp = self.get_quant_config(name, True, tensor_type)

  if NndctOption.nndct_stat.value > 2:
    quant_data = nndct_quant.QuantizeData(name, res.cpu().detach().numpy())

  # do quantization for parameter or activation
  res = fake_quantize_per_tensor(res, bnfp[1], 0, -bnfp[0], bnfp[0] - 1, mth, self.inplace)

  if NndctOption.nndct_stat.value > 2:
    #quant_data.all_close(res.cpu().detach().numpy())
    global global_snr_inv
    quant_efficiency, sqnr = quant_data.quant_efficiency(
        res.cpu().detach().numpy(), math.log2(bnfp[0]))
    global_snr_inv += 1 / sqnr
    if quant_efficiency < 3.0:
      print(f"quant_efficiency={quant_efficiency}, {quant_data._name}\n")
      print('Statistic [Min, Max, Mean, Std]:')
      print('[{}, {}, {}, {}]'.format(res.min(), res.max(), res.mean(), res.std()))
      print('histogram: {}'.format(res.histc(bins=10).cpu().detach().numpy()))
      t = res
      if tensor_type != 'param':
        t = res.transpose(0, 1)
      print('Channel number: {}'.format(t.shape[0]))
      print('Channel-wise statistic [Min, Max, Mean, Std]:')
      for c in range(t.shape[0]):
        print('[{}, {}, {}, {}]'.format(t[c].min(), t[c].max(), t[c].mean(), t[c].std()))
        print('histogram: {}'.format(t[c].histc(bins=10).cpu().detach().numpy()))

  if res_save is not None:
    res_save.values.data = res
    res = res_save
  return res

def do_quantize(self, blob, name, node=None, tensor_type='input'):
  # forward quant graph but do not quantize parameter and activation
  if NndctOption.nndct_quant_off.value:
    if self.inplace:
      return blob
    else:
      return blob.clone().detach()

  blob_save = None
  if isinstance(blob.values, torch.Tensor):
    blob_save = blob
    blob = blob.values.data

  if blob.dtype != torch.float32 and blob.dtype != torch.double:
    NndctScreenLogger().warning_once(
        f'The tensor type of {node.name} is {str(blob.dtype)}. '
        'Only float32/double quantization is supported.')
    return blob_save if blob_save is not None else blob

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if blob.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}; device of model and data should match device of quantizer".format(quant_device.type))

  if NndctOption.nndct_stat.value > 2:
    quant_data = nndct_quant.QuantizeData(name, blob.cpu().detach().numpy())

  # quantize the tensor
  bnfp = self.get_quant_config(name, True, tensor_type)
  if NndctOption.nndct_stat.value > 1:
    print('---- quant %s tensor: %s with 1/step = %g' % (tensor_type, name, bnfp[1]))

  # hardware cut method
  mth = 4 if self.lstm else 2
  if NndctOption.nndct_use_torch_quantizer.value is True:
    mth = -1
  elif tensor_type == 'param':
    mth = 3

  res = fake_quantize_per_tensor(blob, bnfp[1], 0, -bnfp[0], bnfp[0] - 1, mth, self.inplace)

  if NndctOption.nndct_stat.value > 2:
    global global_snr_inv
    quant_efficiency, sqnr = quant_data.quant_efficiency(res.cpu().detach().numpy(), 8)
    global_snr_inv += 1 / sqnr
    if quant_efficiency < 3.0:
      print(f"quant_efficiency={quant_efficiency}, global_snr_inv={global_snr_inv} {quant_data._name}\n")
      print('Network input channel-wise statistic [Min, Max, Mean, Std]:')
      print('[{}, {}, {}, {}]'.format(res.min(), res.max(), res.mean(), res.std()))
      print('histogram: {}'.format(res.histc(bins=10).cpu().detach().numpy()))
      t = res
      if tensor_type != 'param':
        t = res.transpose(0, 1)
      print('Channel number: {}'.format(t.shape[0]))
      print('Channel-wise statistic [Min, Max, Mean, Std]:')
      for c in range(t.shape[0]):
        print('[{}, {}, {}, {}]'.format(t[c].min(), t[c].max(), t[c].mean(), t[c].std()))
        print('histogram: {}'.format(t[c].histc(bins=10).cpu().detach().numpy()))

  # update param to nndct graph
  if tensor_type == 'param' and not self.exporting:
    self.update_param_to_nndct(node, name, res.cpu().detach().numpy())

  if blob_save is not None:
    blob_save.values.data = res
    res = blob_save
  return res

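# A self-contained sketch of the power-of-two fake quantization that
# `fake_quantize_per_tensor` is called with above. This is an assumption read
# off the call signature (bnfp[1] = 2**fix_pos, clamp range =
# [-bnfp[0], bnfp[0] - 1] with bnfp[0] = 2**(bit_width - 1)); it ignores the
# NNDCT hardware rounding modes selected by `mth` and is not the actual kernel.
import torch

def fake_quantize_po2_sketch(x: torch.Tensor, bit_width: int, fix_pos: int) -> torch.Tensor:
  """Round x to the nearest multiple of 2**-fix_pos and clamp to the signed range."""
  amp = 2.0 ** fix_pos              # 1/step, i.e. bnfp[1]
  qmax = 2 ** (bit_width - 1) - 1   # bnfp[0] - 1
  qmin = -2 ** (bit_width - 1)      # -bnfp[0]
  q = torch.clamp(torch.round(x * amp), qmin, qmax)
  return q / amp                    # back to float, now on the quantized grid
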
def finetune(self, run_fn, run_args):
  if self.quantizer.quant_mode == 2:
    NndctScreenLogger().warning("Finetune function will be ignored in test mode!")
    return

  NndctScreenLogger().info("=>Finetuning module parameters for better quantization accuracy...")

  # back up option value
  opt_bak_param_corr = NndctOption.nndct_param_corr.value
  set_option_value("nndct_param_corr", 0)

  # cache input and output
  #print("**** cache input and output")
  last_quant_nodes = self.collect_last_quant_nodes()
  with torch.no_grad():
    hook_mods = []
    for node in self.graph.nodes:
      if node.op.type == NNDCT_OP.INPUT or \
          node in last_quant_nodes:
          # (self.quantizer.configer.is_node_quantizable(node, False) and
          #  len(node.op.params) > 0):
        hook_mods.append(node.module)
    handlers = self.hook_cache_output(hook_mods)

    set_option_value("nndct_quant_off", True)
    run_fn(*run_args)
    self.clean_hooks(handlers)

  # for mod in self.quant_model.modules():
  #   if hasattr(mod, "node") and mod.node.op.type in [NNDCT_OP.DENSE, NNDCT_OP.CONV2D, NNDCT_OP.DEPTHWISE_CONV2D, NNDCT_OP.CONVTRANSPOSE2D]:
  #     self._float_weights[mod.node].append(mod.weight.detach().cpu())

  torch.cuda.empty_cache()

  # calibration to get a set of quantization steps
  #print("**** calibration to get float model tensor values")
  for mod in self.quant_model.modules():
    if hasattr(mod, "param_quantized"):
      setattr(mod, "param_quantized", False)

  # evaluation to get float model tensors
  set_option_value("nndct_quant_off", False)
  with torch.no_grad():
    run_fn(*run_args)
  torch.cuda.empty_cache()

  #print("**** Parameter finetuning")
  device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  graph_searcher = GraphSearcher(self.graph)
  node_sets = graph_searcher.find_nodes_from_type([
      PatternType(pattern=[NNDCT_OP.CONV2D, NNDCT_OP.RELU]),
      PatternType(pattern=[NNDCT_OP.CONV2D, NNDCT_OP.RELU6]),
      PatternType(pattern=[NNDCT_OP.DEPTHWISE_CONV2D, NNDCT_OP.RELU]),
      PatternType(pattern=[NNDCT_OP.DEPTHWISE_CONV2D, NNDCT_OP.RELU6]),
      PatternType(pattern=[NNDCT_OP.CONVTRANSPOSE2D, NNDCT_OP.RELU])
  ])
  layer_act_group = {}
  for _, node_list in node_sets.items():
    for nodeset in node_list:
      conv, act = nodeset
      layer_act_group[conv] = act

  # to avoid quantization steps changing during parameter finetuning
  self.quantizer.quant_mode = 2

  net_inputs = []
  for node in self.input_nodes:
    cached_net_input = [out for out in self.cached_outputs[node.module]]
    net_inputs.append(cached_net_input)

  # last_quant_nodes = self.collect_last_quant_nodes()
  last_quant_mods = [node.module for node in last_quant_nodes]
  handlers = self.hook_cache_output(last_quant_mods, hook_type="single")
  net_loss = self.eval_loss(net_inputs, last_quant_mods, device)
  self.clean_hooks(handlers)
  # model.clean_hooks()
  torch.cuda.empty_cache()

  finetune_group = {}
  # hook_mods = []
  for qmod, fmod in zip(self._quant_model.modules(), self._float_model.modules()):
    if hasattr(qmod, "node"):
      if (self.quantizer.configer.is_node_quantizable(qmod.node, False) and
          len(qmod.node.op.params) > 0):
        finetune_group[qmod.node] = [qmod, fmod]
        # hook_mods.append(fmod)
  # self.hook_cache_output(hook_mods, hook_type="single")

  for node, module_pair in finetune_group.items():
    # if self.quantizer.configer.is_node_quantizable(node, False) and \
    #     len(node.op.params) > 0:
    quant_layer, float_layer = module_pair
    pn_node = self.graph.parents(node)[0]
    handlers = self.hook_cache_output([pn_node.module], hook_type="single")
    layer_inputs = []
    with torch.no_grad():
      for input_args in zip(*net_inputs):
        new_input_args = []
        for ip in input_args:
          if isinstance(ip, torch.Tensor):
            new_input_args.append(ip.to(device))
        _ = self.quant_model(*new_input_args)
        layer_inputs.append(self.cached_output[pn_node.module].detach().cpu())
    self.clean_hooks(handlers)
    del self.cached_output[pn_node.module]

    #print(f"Tuning {node.name}")
    net_loss = self.optimize_layer(node, float_layer, layer_inputs, layer_act_group,
                                   net_inputs, net_loss, last_quant_mods, device)
    del layer_inputs
    torch.cuda.empty_cache()

  # recover quantizer status
  for node in self.graph.nodes:
    for _, fp_history in self.quantizer.fp_history.items():
      if node.name in fp_history:
        fp_history[node.name].clear()
  for mod in self.quant_model.modules():
    if hasattr(mod, "param_quantized"):
      setattr(mod, "param_quantized", False)
  for mod in self.quant_model.modules():
    if hasattr(mod, "param_saved"):
      setattr(mod, "param_saved", False)
  self.quantizer.quant_mode = 1
  set_option_value("nndct_param_corr", opt_bak_param_corr)

  # export finetuned parameters
  self.quantizer.export_param()

def dump_xmodel(output_dir="quantize_result", deploy_check=False, lstm_app=False):
  r"""Converts module to xmodel for deployment compilation.

  Only works when quant mode == 2. The xmodel and some checking data
  will be generated under the work dir.

  Args:
    deploy_check (bool): if True, dump blobs and parameters of the model
      for deployment verification.

  Returns:
    None
  """
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode > 1:
    nndct_utils.create_work_dir(output_dir)
    # compile to xmodel
    compiler = CompilerFactory.get_compiler("xmodel")
    NndctScreenLogger().info("=>Converting to xmodel ...")
    deploy_graphs = get_deploy_graph_list(quantizer.quant_model, quantizer.Nndctgraph)
    #deploy_infos = compiler.get_deloy_graph_infos(quantizer, deploy_graphs)
    xmodel_deploy_infos, dump_deploy_infos = compiler.get_xmodel_and_dump_infos(
        quantizer, deploy_graphs)

    if not lstm_app:
      for node in xmodel_deploy_infos[0].dev_graph.nodes:
        error_out = False
        if node.op.type not in [NNDCT_OP.INPUT, NNDCT_OP.QUANT_STUB]:
          continue
        for i, tensor in enumerate(node.out_tensors):
          if tensor.shape and tensor.shape[0] != 1:
            NndctScreenLogger().error("Batch size must be 1 when exporting xmodel.")
            error_out = True
            break
        if error_out:
          break

    for deploy_info in dump_deploy_infos:
      # dump data for accuracy check
      if deploy_check:
        NndctScreenLogger().info(f"=>Dumping '{deploy_info.dev_graph.name}' checking data...")
        if lstm_app:
          checker = DeployChecker(output_dir_name=output_dir, data_format='txt')
          checker.update_dump_folder(f"{deploy_info.dev_graph.name}/frame_0")
          select_batch = True
        else:
          checker = DeployChecker(output_dir_name=output_dir)
          checker.update_dump_folder(f"{deploy_info.dev_graph.name}")
          select_batch = False
        checker.dump_nodes_output(
            deploy_info.dev_graph,
            deploy_info.quant_info,
            round_method=quantizer.quant_opt['round_method'],
            select_batch=select_batch)
        NndctScreenLogger().info(f"=>Finished dumping data. ({checker.dump_folder})")

    for deploy_info in xmodel_deploy_infos:
      try:
        xgraph = compiler.do_compile(
            deploy_info.dev_graph,
            quant_config_info=deploy_info.quant_info,
            output_file_name=os.path.join(output_dir, deploy_info.dev_graph.name))
      except AddXopError as e:
        NndctScreenLogger().error(
            f"Failed to convert graph '{deploy_info.dev_graph.name}' to xmodel.")
        raise e
      compiler.verify_xmodel(deploy_info.dev_graph, xgraph)

    set_outputs_recorder_status(quantizer.quant_model, False)

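# End-to-end flow sketch for `dump_xmodel` (hedged: `torch_quantizer` and
# `dump_xmodel` are the public pytorch_nndct entry points; `model`,
# `dummy_input`, `val_loader`, and `evaluate` are placeholders):
#
#   from pytorch_nndct.apis import torch_quantizer, dump_xmodel
#   quantizer = torch_quantizer("test", model, (dummy_input,), output_dir="quantize_result")
#   evaluate(quantizer.quant_model, val_loader)   # forward passes populate quant records
#   dump_xmodel("quantize_result", deploy_check=True)
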
def do_scan(self, res, name, node=None, tensor_type='input'):
  # forward quant graph but do not quantize parameter and activation
  if NndctOption.nndct_quant_off.value:
    if self.inplace:
      return res
    else:
      return res.clone().detach()

  res_save = None
  if isinstance(res.values, torch.Tensor):
    res_save = res
    res = res.values.data

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if res.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}; device of model and data should match device of quantizer".format(quant_device.type))

  # get fixed position
  q_config = self.get_quant_config(name, False, tensor_type)

  # turn off quantization if bit width is 32 or more
  if q_config[0] >= 32:
    if self.inplace:
      return res
    else:
      return res.clone().detach()

  # get quant algorithm
  q_algorithm = self.get_quant_algo(name, tensor_type)
  #if tensor_type != 'param' or q_config[1] is None or q_config[2] is None:
  if q_algorithm.calib_or_not(tensor_type):
    #q_algorithm = self.get_quant_algo(name, tensor_type)
    q_algorithm.calibrate(res)

    if q_algorithm.statistic_local:
      # quant_tensor = q_algorithm.fake_quantize(res, self.inplace)
      # if self.inplace:
      #   res.data = quant_tensor.data.clone()
      # else:
      #   res = quant_tensor

      q_config[1] = q_algorithm.scale
      q_config[2] = q_algorithm.zero_point
      q_config[3] = q_algorithm.float_max
      if tensor_type != 'param':
        self.config_history[tensor_type][name].append([q_config[1], q_config[2], q_config[3]])
        data = np.array(self.config_history[tensor_type][name]).transpose(1, 0)
        q_config[1], q_config[2], q_config[3] = q_algorithm.act_scale_stats(data)
        #q_algorithm.scale, q_algorithm.zero_point, q_algorithm.float_max = q_config[1], q_config[2], q_config[3]
      self.set_quant_config(name, q_config, tensor_type)

      quant_tensor = q_algorithm.fake_quantize(res, self.inplace)
      if self.inplace:
        res.data = quant_tensor.data.clone()
      else:
        res = quant_tensor

  if res_save is not None:
    res_save.values.data = res
    res = res_save
  return res