def nndct_warn_print(string):
  if GLOBAL_MAP.get_ele(NNDCT_KEYS.WARN_FLAG) is True:
    logger = GLOBAL_MAP.get_ele(NNDCT_KEYS.LOGGER)
    if logger:
      logger.warning("[NNDCT_WARN] {}".format(string))
    else:
      print("[NNDCT_WARN] {}".format(string))
def nndct_error_print(string):
  if GLOBAL_MAP.get_ele(NNDCT_KEYS.ERROR_FLAG) is True:
    logger = GLOBAL_MAP.get_ele(NNDCT_KEYS.LOGGER)
    if logger:
      logger.error("[NNDCT_ERROR] {}".format(string))
    else:
      print("[NNDCT_ERROR] {}".format(string))
    sys.exit(1)
def _do_map(output_name, node_name):
  if output_name != node_name:
    if not GLOBAL_MAP.get_ele(NNDCT_KEYS.OUTPUT_TO_NODE_MAP):
      GLOBAL_MAP.set_map(NNDCT_KEYS.OUTPUT_TO_NODE_MAP, {})
    if not GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_TO_OUTPUT_MAP):
      GLOBAL_MAP.set_map(NNDCT_KEYS.NODE_TO_OUTPUT_MAP, {})
    # map output to node
    output_to_node_map = GLOBAL_MAP.get_ele(NNDCT_KEYS.OUTPUT_TO_NODE_MAP)
    if output_name not in output_to_node_map:
      nndct_debug_print(
          "<map_output_and_node> map output {} and node {}".format(
              output_name, node_name),
          level=NNDCT_DEBUG_LVL.BUILD_GRAPH)
      output_to_node_map[output_name] = node_name
    else:
      assert output_to_node_map[output_name] == node_name, (
          "stored node name for output_name {} is {}, but met new node "
          "name {}".format(output_name, output_to_node_map[output_name],
                           node_name))
    # append output to the list keyed by node_name
    node_to_output_map = GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_TO_OUTPUT_MAP)
    if node_name not in node_to_output_map:
      node_to_output_map[node_name] = [output_name]
    else:
      node_to_output_map[node_name].append(output_name)
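# Illustration (hypothetical names): after _do_map("conv1:0", "conv1"), the two
# global tables hold
#   OUTPUT_TO_NODE_MAP: {"conv1:0": "conv1"}
#   NODE_TO_OUTPUT_MAP: {"conv1": ["conv1:0"]}
# Calling _do_map("conv1:1", "conv1") extends the output list, while the
# assertion above guards against remapping an output to a different node.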
def nndct_debug_print(string, title='', level=1):
  if (GLOBAL_MAP.get_ele(NNDCT_KEYS.DEBUG_FLAG) is True and
      level <= GLOBAL_MAP.get_ele(NNDCT_KEYS.VERBOSE_LEVEL)):
    logger = GLOBAL_MAP.get_ele(NNDCT_KEYS.LOGGER)
    if title == 'Start':
      string = "\n********************* <{} : {}> *********************".format(
          title, string)
    elif title == 'End':
      string = "\n********************* <{} : {}> *********************\n".format(
          title, string)
    if logger:
      logger.debug("[NNDCT_DEBUG_Lv_{}] {}".format(level, string))
    else:
      print("[NNDCT_DEBUG_Lv_{}] {}".format(level, string))
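# Usage sketch (illustrative values): the print helpers above stay silent until
# the corresponding flags are set on the global map, e.g.
#   GLOBAL_MAP.set_map(NNDCT_KEYS.WARN_FLAG, True)
#   GLOBAL_MAP.set_map(NNDCT_KEYS.DEBUG_FLAG, True)
#   GLOBAL_MAP.set_map(NNDCT_KEYS.VERBOSE_LEVEL, NNDCT_DEBUG_LVL.BUILD_GRAPH)
#   nndct_warn_print("shape inference fell back to CPU")
#   nndct_debug_print("graph building", title='Start', level=1)
# When a LOGGER is registered, messages go through it; otherwise they fall
# back to print().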
# Note: `cls` and `module` below are presumably captured from the enclosing
# classmethod's scope; this nested helper is not self-contained.
def _graph2module(op):
  node = getattr(op, "node", None)
  for param_type, tensor in node.op.params.items():
    py_tensor_util.param_to_torch_format(tensor)
    data = np.copy(tensor.data)
    if node.op.type in [NNDCT_OP.CONVTRANSPOSE2D, NNDCT_OP.CONVTRANSPOSE3D
                       ] and param_type == node.op.ParamName.WEIGHTS:
      # data = data.transpose(1, 0, 2, 3)
      data = data.swapaxes(0, 1)
      data = np.ascontiguousarray(data)

    if node.op.type in [NNDCT_OP.DEPTHWISE_CONV2D, NNDCT_OP.DEPTHWISE_CONV3D
                       ] and param_type == node.op.ParamName.WEIGHTS:
      out_channels = node.node_config("out_channels")
      kernel_size = node.node_config("kernel_size")
      data = data.reshape((out_channels, 1, *kernel_size))

    if node.op.type in [
        NNDCT_OP.DEPTHWISE_CONVTRANSPOSE2D,
        NNDCT_OP.DEPTHWISE_CONVTRANSPOSE3D
    ] and param_type == node.op.ParamName.WEIGHTS:
      in_channels = node.node_config("in_channels")
      kernel_size = node.node_config("kernel_size")
      data = data.reshape((1, in_channels, *kernel_size))
      data = data.swapaxes(0, 1)
      data = np.ascontiguousarray(data)

    torch_tensor = torch.from_numpy(data)
    param_name = cls._parameter_map.get(param_type, param_type.value)
    if node.has_bound_params():
      if hasattr(op, param_name):
        if isinstance(getattr(op, param_name), torch.Tensor):
          torch_tensor = torch_tensor.to(getattr(op, param_name))
        else:
          torch_tensor = torch_tensor.to(getattr(op, param_name).data)
        if param_name in op._buffers:
          op._buffers[param_name] = torch_tensor
        else:
          op._parameters[param_name] = torch.nn.Parameter(torch_tensor)
      else:
        NndctScreenLogger().warning(
            f"new parameter: '{param_name}' is registered in {node.name}")
        op.register_parameter(param_name, torch.nn.Parameter(torch_tensor))
    else:
      torch_tensor = torch_tensor.to(
          device=GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE))
      module.register_parameter(param_name, torch.nn.Parameter(torch_tensor))

    py_tensor_util.param_to_nndct_format(tensor)
def forward(self, input):
  [input], _ = process_inputs_and_params(
      self.node, self.quantizer, inputs=[input])
  if NndctOption.nndct_quant_off.value or NndctOption.nndct_cv_app.value:
    output = super().forward(input)
    # quantize output
    [output] = post_quant_process(self.node, [output])
  elif self.quant_mode > 0:
    output = torch.empty_like(input)
    if NndctOption.nndct_tanh_sigmoid_sim.value > 0:
      NndctSigmoidSimulation(input, output)
      [output] = post_quant_process(self.node, [output])
    else:
      input_name = self.node.in_nodes[0]
      fragpos = self.quantizer.get_bnfp(input_name, False)[1]
      quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
      Ttable = SIGMOID_TABLE.table.to(quant_device)
      output = output.to(quant_device)
      NndctSigmoidTableLookup(input, Ttable, output, fragpos)
  else:
    output = super().forward(input)
  return output
def export_quant_config(self):
  """Export bitwidth and fixpoint info of blobs and parameters under the work dir."""
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode == 1:
    quantizer.export_quant_config()
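# Usage sketch, assuming the torch_quantizer() entry point of this package and
# a user-supplied calibration loop (run_calibration is hypothetical): the
# config is only written out in calibration mode (quant_mode == 1).
#
#   quantizer = torch_quantizer("calib", model, (dummy_input,))
#   quant_model = quantizer.quant_model
#   run_calibration(quant_model)        # forward passes over sample data
#   quantizer.export_quant_config()     # writes quant info under the work dir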
def wrapper(*args, **kwargs):
  # Reconstructed control flow: when the error flag is set, prefix the wrapped
  # function's message with "[NNDCT_ERROR]" and terminate after it runs.
  error_flag = GLOBAL_MAP.get_ele(NNDCT_KEYS.ERROR_FLAG)
  if error_flag is True:
    print("[NNDCT_ERROR]", end='')
  ret = func(*args, **kwargs)
  if error_flag is True:
    exit(1)
  return ret
def __init__(self, file_name=None):
  file_name = file_name or GLOBAL_MAP.get_ele(
      NNDCT_KEYS.MODIFIER).nndct_prefix + '.py'
  Exception.__init__(
      self, "The rebuilt graph mismatches the original graph, please "
      "manually modify '{}' and run again".format(file_name))
def _init_quant_env():
  nonlocal quant_mode
  if NndctOption.nndct_quant_mode.value > 0:
    quant_mode = NndctOption.nndct_quant_mode.value
  if quant_mode == 1:
    NndctScreenLogger().info("Quantization calibration process starting up...")
  elif quant_mode == 2:
    NndctScreenLogger().info("Quantization test process starting up...")
  quantizer = TORCHQuantizer(quant_mode, output_dir, bitwidth_w, bitwidth_a)
  GLOBAL_MAP.set_map(NNDCT_KEYS.QUANTIZER, quantizer)
  GLOBAL_MAP.set_map(NNDCT_KEYS.QUANT_MODE, quant_mode)
  return quantizer, quant_mode
def custom_op(self, node, *args):
  node2caller = GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_CALLER_MAP)
  if node2caller is None:
    node2caller: Dict[str, Callable] = {}
    GLOBAL_MAP.set_map(NNDCT_KEYS.NODE_CALLER_MAP, node2caller)
  node2caller[node.name] = node.caller
  op = TorchCustomOperation(node.raw_kind, node.raw_kind)
  for i, arg in enumerate(args):
    op.set_config(str(i), arg)
  attrs = GLOBAL_MAP.get_ele(NNDCT_KEYS.CUSTOM_OP_ATTRS_MAP).get(
      node.raw_kind, None)
  if attrs:
    attr_vals = args[len(args) - len(attrs):]
    for name, val in zip(attrs, attr_vals):
      op.set_attr_by_name(name, val)
  return op
def set_op_class_type(self, force_to_primitive: bool, schema: "Schema",
                      class_type=None):
  if class_type is not None:
    self.op_class_type = TorchOpClassType.CUSTOM_FUNCTION
  elif schema is not None:
    schema2torchop = GLOBAL_MAP.get_ele(NNDCT_KEYS.TORCH_SCHEMA_OP_TABLE)
    schema_handler = SchemaHelper(schema)
    torchop = schema2torchop[schema_handler.toString()]
    self.op_class_type = torchop.op_class_type
  elif force_to_primitive:
    self.op_class_type = TorchOpClassType.PRIMITIVE
  else:
    if self.op_name in dir(torch.nn):
      self.op_class_type = TorchOpClassType.NN_MODULE
      self.op_name = '.'.join(['torch', 'nn', self.op_name])
    elif self.op_name in dir(torch.nn.functional):
      self.op_class_type = TorchOpClassType.NN_FUNCTION
      self.op_name = '.'.join(['torch', 'nn', 'functional', self.op_name])
    elif self.op_name in dir(torch) and isinstance(
        getattr(torch, self.op_name), Callable):
      self.op_class_type = TorchOpClassType.TORCH_FUNCTION
      self.op_name = '.'.join(['torch', self.op_name])
    elif self.op_name in dir(torch.Tensor):
      self.op_class_type = TorchOpClassType.TENSOR
    else:
      self.op_class_type = TorchOpClassType.UNKNOWN
def build_aten_torch_ops_table():
  op_gathering_fns = (_get_tensor_ops,
                      _get_nn_functional_ops,
                      _get_torchscript_builtins,
                      _get_global_builtins,
                      _get_math_builtins)
  schema2torchop = GLOBAL_MAP.get_ele(NNDCT_KEYS.TORCH_SCHEMA_OP_TABLE)
  # schema_lut = GLOBAL_MAP.get_ele(NNDCT_KEYS.SCHEMA_LUT)
  if not schema2torchop:
    schema2torchop: Dict[str, TorchOp] = {}
    GLOBAL_MAP.set_map(NNDCT_KEYS.TORCH_SCHEMA_OP_TABLE, schema2torchop)
    # schema_lut: Dict[Tuple(str, int), "Schema"] = {}
    for fn in op_gathering_fns:
      fn()
def dump_xmodel(self, deploy_check=False):
  """Dump xmodel for LSTM cell."""
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode > 1:
    compiler = CompilerFactory.get_compiler("xmodel")
    xmodel_dir = os.path.join(self._export_folder, "xmodel")
    create_work_dir(xmodel_dir)
    for info in self._modules_info.values():
      for l_num, layer_graph in enumerate(info["layers_graph"]):
        for lstm_direction, graph in layer_graph.items():
          try:
            compiler.do_compile(
                nndct_graph=graph,
                quant_config_info=quantizer.quant_config,
                output_file_name=os.path.join(xmodel_dir, graph.name),
                graph_attr_kwargs={"direction": lstm_direction})
          except Exception as e:
            print(f"[NNDCT_ERROR]: failed to convert nndct graph to xmodel({str(e)}).")
          else:
            print("[NNDCT_NOTE]: Successfully converted nndct graph to xmodel!")

    if deploy_check:
      print("[NNDCT_NOTE]: Dumping checking data...")
      checker = DeployChecker(
          output_dir_name=self._export_folder, data_format="txt")
      # get timestep output
      for name, info in self._layers_info.items():
        cell = info["cell_module"]
        layer = info["layer_module"]
        graph = info["graph"]
        if layer.input is None:
          warnings.warn(
              f"[NNDCT_WARNING]: Provide inputs for '{name}' when doing deploy checking",
              RuntimeWarning)
          continue
        set_outputs_recorder_status(cell, True)
        layer(layer.input, layer.initial_state, layer.batch_lengths)
        for timestep in range(layer.input.size()[1]):
          enable_dump_weight = (timestep == 0)
          update_nndct_blob_data(cell, graph, timestep)
          checker.update_dump_folder(f"{graph.name}/frame_{timestep}")
          checker.dump_nodes_output(
              graph,
              quantizer.quant_config,
              round_method=quantizer.quant_opt['round_method'],
              enable_dump_weight=enable_dump_weight)
        set_outputs_recorder_status(cell, False)
      print("[NNDCT_NOTE]: Finish dumping data.")
def convert_to_deployable(self, trained_model, mix_bit=False):
  if not self._qinfo_to_quantizer or not self._module_map:
    raise RuntimeError('Must call "trainable_model" first.')

  # Copy trained parameters from the transformed model to the original
  # float model.
  orig_state_dict = self._model.state_dict()
  trained_state_dict = trained_model.state_dict()
  state_dict = {}
  for key in orig_state_dict.keys():
    module_name, weight_name = key.rsplit('.', 1)
    if module_name in self._module_map:
      trained_module_name = self._module_map[module_name]
      trained_key = '.'.join([trained_module_name, weight_name])
    else:
      trained_key = key
    state_dict[key] = trained_state_dict[trained_key]

  model = copy.deepcopy(self._model)
  model.load_state_dict(state_dict)
  model.eval()

  '''
  inputs = dummy_inputs(self._input_specs)
  qprocessor = qproc.TorchQuantProcessor(
      'test',
      model, [inp.cuda() for inp in inputs],
      mix_bit=mix_bit,
      device=torch.device('cuda'))
  '''
  inputs = self._input_args
  qprocessor = qproc.TorchQuantProcessor(
      'test', model, inputs, mix_bit=mix_bit, device=torch.device('cuda'))
  quantizer = qprocessor.quantizer
  self._fill_in_quant_info(quantizer, self._qinfo_to_quantizer)
  quantizer.export_quant_config()

  quant_model = quantizer.quant_model
  quant_model.dump_xmodel = dump_xmodel
  self.deploy_quantizer = quantizer
  GLOBAL_MAP.set_map(NNDCT_KEYS.QUANTIZER, quantizer)
  NndctScreenLogger().info("=>Deployable model is generated.")
def tf_quantizer(model,
                 input_signature,
                 quant_mode: str = "calib",
                 output_dir: str = "quantize_result",
                 bitwidth: int = 8):
  # initialize quant mode
  qmode = _init_quant_mode(quant_mode)

  # turn off weights equalization and bias correction
  option_util.set_option_value("nndct_param_corr", False)
  option_util.set_option_value("nndct_equalization", False)

  # lstm IP only supports 16-bit activation
  quantizer = TFQuantizer(qmode, output_dir, bitwidth, 16)
  GLOBAL_MAP.set_map(NNDCT_KEYS.QUANTIZER, quantizer)
  GLOBAL_MAP.set_map(NNDCT_KEYS.QUANT_MODE, qmode)

  graph = parser.from_keras_model(model, input_signature)
  quant_model, layer_nodes = builder.KerasBuilder(graph).build(
      os.path.join(output_dir, model.name + '_quant.py'), quantized=True)

  rebuilding_results = _maybe_rebuild_rnn(quant_model)
  if rebuilding_results:
    cell_graphs = []
    cell_layer_nodes = []
    for graph, layer_nodes in rebuilding_results:
      cell_graphs.append(graph)
      cell_layer_nodes.extend(layer_nodes)
      quantizer.add_rnn_cell_graph('forward', graph)
    graph = _merge_cell_graphs(cell_graphs)
    layer_nodes = cell_layer_nodes
    # TODO(yuwang): Support backward direction.

  export_file = os.path.join(output_dir, 'merged_graph.pb')
  graph_utils.maybe_export_graph(export_file, graph)

  lstm = len(rebuilding_results) > 0
  quantizer.setup(graph, lstm=lstm)
  quantizer.load_node_to_layer(layer_nodes, quant_model)
  return quantizer
def calib_global_param(self):
  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  for tensor_type, algo_dict in self._QuantAlgo.items():
    for name, algo in algo_dict.items():
      if not algo.statistic_local:
        q_config = self.get_quant_config(name, False, tensor_type)
        if q_config[0] < 32:
          algo.calib_global_statis(quant_device)
          q_config[1], q_config[2], q_config[3] = (
              algo.scale, algo.zero_point, algo.float_max)
        self.set_quant_config(name, q_config, tensor_type)
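# Layout of q_config as inferred from the assignments above:
#   q_config = [bitwidth, scale, zero_point, float_max]
# For example, an 8-bit tensor might end up as [8, 0.0124, 0, 1.58]
# (values illustrative only); 32-bit entries are left untouched.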
def dump_xmodel(output_dir="quantize_result", deploy_check=False):
  r"""Converts the module to an xmodel for deployment.

  Compilation only works when quant_mode == 2. The xmodel and some checking
  data will be generated under the work dir.

  Args:
    deploy_check(bool): if True, dump blobs and parameters of the model for
      deployment verification.

  Returns:
    None
  """
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode > 1:
    nndct_utils.create_work_dir(output_dir)

    # compile to xmodel
    compiler = CompilerFactory.get_compiler("xmodel")
    NndctScreenLogger().info("=>Converting to xmodel ...")
    deploy_graphs = get_deploy_graph_list(quantizer.quant_model,
                                          quantizer.Nndctgraph)
    depoly_infos = compiler.get_deloy_graph_infos(quantizer, deploy_graphs)

    for depoly_info in depoly_infos:
      try:
        compiler.do_compile(
            depoly_info.dev_graph,
            quant_config_info=depoly_info.quant_info,
            output_file_name=os.path.join(output_dir,
                                          depoly_info.dev_graph.name))
      except AddXopError as e:
        NndctScreenLogger().error(
            f"Failed to convert graph '{depoly_info.dev_graph.name}' to xmodel({str(e)}).")

      # dump data for accuracy check
      if deploy_check:
        NndctScreenLogger().info(
            f"=>Dumping '{depoly_info.dev_graph.name}' checking data...")
        checker = DeployChecker(output_dir_name=output_dir)
        checker.update_dump_folder(f"{depoly_info.dev_graph.name}")
        checker.dump_nodes_output(
            depoly_info.dev_graph,
            depoly_info.quant_info,
            round_method=quantizer.quant_opt['round_method'],
            select_batch=False)
        NndctScreenLogger().info(
            f"=>Finish dumping data.({checker.dump_folder})")
    set_outputs_recorder_status(quantizer.quant_model, False)
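# Usage sketch (assumed workflow): dump_xmodel() is called after at least one
# forward pass of the quantized model in test mode, e.g.
#   quantizer = torch_quantizer("test", model, (dummy_input,))
#   quantizer.quant_model(dummy_input)   # forward once so outputs are recorded
#   dump_xmodel("quantize_result", deploy_check=True)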
def default(self, node, *args):
  schema2torchop = GLOBAL_MAP.get_ele(NNDCT_KEYS.TORCH_SCHEMA_OP_TABLE)
  schema_handler = SchemaHelper(node.schema)
  torchop = schema2torchop.get(schema_handler.toString(), None)
  if torchop is None:
    op = TorchUnknownOperation(node.raw_kind)
    return op
  node2caller = GLOBAL_MAP.get_ele(NNDCT_KEYS.NODE_CALLER_MAP)
  if node2caller is None:
    node2caller: Dict[str, Callable] = {}
    GLOBAL_MAP.set_map(NNDCT_KEYS.NODE_CALLER_MAP, node2caller)
  node2caller[node.name] = torchop.caller
  op = TorchBaseOperation(
      schema_handler.op_name, torchop.name, schema=node.schema)
  # op.set_caller(torchop.caller)
  assert len(args) == len(schema_handler.get_arguments())
  if len(args) == 1:
    return op
  arg_name_convertor = {"self": "input"}
  for inp, arg in zip(args, schema_handler.get_arguments()):
    arg_name = schema_handler.arg_name(arg)
    if torchop.op_class_type == TorchOpClassType.TENSOR and arg_name == "self":
      continue
    if arg_name in ["layout", "memory_format", "pin_memory"]:
      continue
    config_name = arg_name_convertor.get(arg_name, arg_name)
    if convert_type_str(schema_handler.arg_type(arg)).replace("?", "") == "bool":
      inp = bool(inp) if inp is not None else inp
    if convert_type_str(schema_handler.arg_type(arg)).replace("?", "") == "str":
      inp = f"'{inp}'" if inp is not None else inp
    if arg_name == "device":
      inp = f"'{self._device_type}'"
    if arg_name == "dtype":
      inp = scalar_type_to_pytorch_type[inp] if inp is not None else inp
    op.set_config(config_name, inp)
  return op
def export_traced_torch_script(self, output_dir, verbose=False):
  torch_version = torch.__version__.split('.')
  if int(torch_version[0]) == 1 and int(torch_version[1]) < 7:
    NndctScreenLogger().error(
        'Exporting torch script is only supported with PyTorch 1.7 and later.')
    return
  self.quantizer.reset_status_for_exporting()
  device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  force_cpu = os.getenv('NNDCT_FORCE_CPU_DUMP')
  if force_cpu is not None:
    device = torch.device('cpu')
    GLOBAL_MAP.set_map(NNDCT_KEYS.QUANT_DEVICE, device)
  model, input_args = to_device(self.quantizer.quant_model,
                                self._example_inputs, device)
  script_module = torch.jit.trace(model, input_args, check_trace=False)
  output_file = os.path.join(
      output_dir, f"{self.quantizer.quant_model._get_name()}_int.pt")
  if verbose is True:
    print(script_module.inlined_graph)
  torch.jit.save(script_module, output_file)
def __init__(self,
             quant_mode: str,
             module: torch.nn.Module,
             input_args: Union[torch.Tensor, Sequence[Any]] = None,
             state_dict_file: Optional[str] = None,
             output_dir: str = "quantize_result",
             bitwidth_w: int = 8,
             bitwidth_a: int = 8,
             mix_bit: bool = False,
             device: torch.device = torch.device("cuda"),
             lstm_app: bool = False):
  # Check argument types
  self._check_args(module, input_args)

  # Check device availability
  if device.type == "cuda":
    if not (torch.cuda.is_available() and "CUDA_HOME" in os.environ):
      device = torch.device("cpu")
      NndctScreenLogger().warning(
          "CUDA is not available, changing device to CPU")

  # Transform the torch module to the quantized module format
  nndct_utils.create_work_dir(output_dir)

  # Create a quantizer object, which controls the whole quantization flow
  quant_strategy = DefaultQstrategy(
      bits_weight=bitwidth_w,
      bits_bias=bitwidth_a,
      bits_activation=bitwidth_a,
      mix_bit=mix_bit)
  quantizer, qmode = self._init_quant_env(quant_mode, output_dir,
                                          quant_strategy)
  GLOBAL_MAP.set_map(NNDCT_KEYS.QUANTIZER, quantizer)
  GLOBAL_MAP.set_map(NNDCT_KEYS.QUANT_MODE, qmode)
  GLOBAL_MAP.set_map(NNDCT_KEYS.QUANT_DEVICE, device)
  if lstm_app:
    option_util.set_option_value("nndct_cv_app", False)
  else:
    option_util.set_option_value("nndct_cv_app", True)

  # Prepare the quantizable module
  quant_module, graph = prepare_quantizable_module(
      module=module,
      input_args=input_args,
      export_folder=output_dir,
      state_dict_file=state_dict_file,
      quant_mode=qmode,
      device=device)

  # Enable recording of per-layer outputs
  if qmode > 1:
    register_output_hook(quant_module, record_once=True)
    set_outputs_recorder_status(quant_module, True)

  # Initialize quantizer
  quantizer.setup(graph, False, lstm_app)

  # hook module with quantizer
  # connect_module_with_quantizer(quant_module, quantizer)
  quantizer.quant_model = quant_module

  self.quantizer = quantizer
  self.adaquant = None
def do_quantize(self, blob, name, node=None, tensor_type='input'):
  # forward the quant graph but do not quantize parameters and activations
  if NndctOption.nndct_quant_off.value:
    return blob

  blob_save = blob
  if isinstance(blob.values, torch.Tensor):
    blob = blob.values

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if blob.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}, device of model and data should match "
        "device of quantizer".format(quant_device.type))

  if (NndctOption.nndct_quant_opt.value and
      NndctOption.nndct_logging_level.value > 0):
    quant_data = nndct_quant.QuantizeData(name, blob.cpu().detach().numpy())

  # quantize the tensor
  bnfp = self.get_bnfp(name, True, tensor_type)
  #print('---- quant %s with 1/step = %g' % (name, bnfp[1]))

  # hardware cut method
  mth = 4 if self.lstm else 2
  if tensor_type == 'param':
    mth = 3
  # NndctFixNeuron quantizes blob in place (it is passed as both input and output)
  res = py_nndct.nn.NndctFixNeuron(
      blob, blob, maxamp=[bnfp[0], bnfp[1]], method=mth)

  if (NndctOption.nndct_quant_opt.value and
      NndctOption.nndct_logging_level.value > 0):
    global global_snr_inv
    quant_efficiency, sqnr = quant_data.quant_efficiency(
        blob.cpu().detach().numpy(), 8)
    global_snr_inv += 1 / sqnr
    print(f"quant_efficiency={quant_efficiency}, "
          f"global_snr_inv={global_snr_inv} {quant_data._name}\n")

  # update param to nndct graph
  if tensor_type == 'param':
    self.update_param_to_nndct(node, name, res.cpu().detach().numpy())

  blob = blob_save
  res = blob_save
  return res
def node_from_output(output_name, model_type):
  if model_type == 'Nndct':
    return output_name
  if model_type == 'tensorflow':
    output_name = output_name.split(':')[0]
  elif model_type == 'torch':
    if output_name.split('_')[-1] in ['backward', 'forward']:
      # NOTE: ''.join drops interior underscores as well; '_'.join may be the
      # intended behavior here.
      output_name = ''.join(output_name.split('_')[:-1])
  else:
    raise KeyError("node_from_output is not available for model type " +
                   str(model_type))
  output_map = GLOBAL_MAP.get_ele(NNDCT_KEYS.OUTPUT_TO_NODE_MAP)
  if output_map and output_name in output_map:
    return output_map[output_name]
  return output_name
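# Worked example: for model_type == 'torch', a trailing 'forward'/'backward'
# segment is stripped before the OUTPUT_TO_NODE_MAP lookup:
#   node_from_output('relu1_forward', 'torch')        -> 'relu1'
#   node_from_output('conv_a_forward', 'torch')       -> 'conva' (underscores lost)
#   node_from_output('dense/BiasAdd:0', 'tensorflow') -> 'dense/BiasAdd'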
def finetune_v2(self, run_fn, run_args):
  # check status
  if self.quantizer.quant_mode == 2:
    NndctScreenLogger().warning(
        "Finetune function will be ignored in test mode!")
    return

  # parameter finetuning
  with AdaQuant(processor=self):
    # calibration to get an initial set of quantization steps
    NndctScreenLogger().info(
        "=>Preparing data for fast finetuning module parameters ...")
    with NoQuant():
      net_inputs, net_outputs = self.cache_net_inpouts(run_fn, run_args)

    NndctScreenLogger().info(
        "=>Finding initial quantization steps for fast finetuning...")
    self.calibrate(run_fn, run_args)

    NndctScreenLogger().info(
        "=>Fast finetuning module parameters for better quantization accuracy...")
    self.setup_test()
    device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
    initial_net_loss = self.calc_net_loss(net_inputs, net_outputs, device)

    layer_act_pair = self.collect_layer_act_pair()

    finetune_group = []
    for qmod, fmod in zip(self._quant_model.modules(),
                          self._float_model.modules()):
      if hasattr(qmod, "node"):
        if (self.quantizer.configer.is_node_quantizable(qmod.node, False) and
            len(qmod.node.op.params) > 0):
          finetune_group.append([qmod.node, fmod])

    net_loss = initial_net_loss
    for idx, (qnode, fmod) in tqdm(
        enumerate(finetune_group), total=len(finetune_group)):
      is_cached = self.is_cached(qnode, len(net_inputs[0]))
      if (is_cached and idx < len(finetune_group) / 2) or (not is_cached):
        need_cache = False
      else:
        need_cache = True
      net_loss = self.optimize_layer_v2(qnode, fmod, layer_act_pair,
                                        net_inputs, net_outputs, net_loss,
                                        device, need_cache)
      print(f"%%%%%%%%%%%%%%%%% final opt net loss:{net_loss.avg}")
      # print(f"{qnode.name}({need_cache}):{net_loss}")

  NndctScreenLogger().info("=>Exporting fast finetuned parameters ...")
  # export finetuned parameters
  self.quantizer.export_param()
def clone_quant_module(cls, quant_module):
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if _is_module_hooked(quant_module):
    cls.detach_node_from_module(quant_module)
    cls.hook_module_with_quantizer(quant_module, None)
    new_quant_module = copy.deepcopy(quant_module)
    cls.hook_module_with_node(quant_module, quantizer.graph)
    cls.hook_module_with_quantizer(quant_module, quantizer)
    new_graph = Graph(graph_name=quantizer.graph.name)
    new_graph.clone_from(quantizer.graph)
    cls.hook_module_with_node(new_quant_module, new_graph)
    cls.hook_module_with_quantizer(new_quant_module, quantizer)
  else:
    new_quant_module = copy.deepcopy(quant_module)
  return new_quant_module
def dump_xmodel(output_dir="quantize_result", deploy_check=False):
  r"""Converts the module to an xmodel for deployment.

  Compilation only works when quant_mode == 2. The xmodel and some checking
  data will be generated under the work dir.

  Args:
    deploy_check(bool): if True, dump blobs and parameters of the model for
      deployment verification.

  Returns:
    None
  """
  quantizer = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer and quantizer.quant_mode > 1:
    nndct_utils.create_work_dir(output_dir)

    # compile to xmodel
    try:
      compiler = CompilerFactory.get_compiler("xmodel")
      NndctScreenLogger().info("=>Converting to xmodel ...")
      compiler.do_compile(
          nndct_graph=quantizer.Nndctgraph,
          quant_config_info=quantizer.quant_config,
          output_file_name=os.path.join(output_dir, quantizer.Nndctgraph.name))
    except AddXopError as e:
      NndctScreenLogger().error(
          f"Failed to convert nndct graph to xmodel({str(e)}).")
    else:
      NndctScreenLogger().info(
          f"=>Successfully converted to xmodel.({compiler.xmodel_file})")

    # dump data for accuracy check
    if deploy_check:
      NndctScreenLogger().info("=>Dumping checking data...")
      update_nndct_blob_data(quantizer.quant_model, quantizer.Nndctgraph)
      checker = DeployChecker(output_dir_name=output_dir)
      checker.dump_nodes_output(
          quantizer.Nndctgraph,
          quantizer.quant_config,
          round_method=quantizer.quant_opt['round_method'])
      set_outputs_recorder_status(quantizer.quant_model, False)
      NndctScreenLogger().info(
          f"=>Finish dumping data.({checker.dump_folder})")
def export_onnx_model(self, output_dir, verbose=False):
  from torch.onnx import register_custom_op_symbolic
  from torch.onnx.symbolic_helper import parse_args
  import sys

  torch_version = torch.__version__.split('.')
  if int(torch_version[0]) == 1 and int(torch_version[1]) < 7:
    NndctScreenLogger().error(
        'Exporting an onnx model is only supported with PyTorch 1.7 and later.')
    return

  @parse_args("v", "i", "i", "f", "i", "i", "i", "i")
  def symbolic_fix_neuron(g, input, valmin, valmax, valamp, zero_point,
                          method, device_id, inplace):
    #print(f'{valmax} {valamp} {method} {device_id}')
    if valamp < sys.float_info.min:
      scale = torch.tensor(
          sys.float_info.max).float()  # Avoid exporter generating double type
    else:
      scale = torch.tensor(
          1.0 / valamp).float()  # Avoid exporter generating double type
    zero_point = torch.tensor(
        0, dtype=torch.int8)  # ONNX requires zero_point to be a tensor
    return g.op("DequantizeLinear",
                g.op("QuantizeLinear", input, scale, zero_point), scale,
                zero_point)

  register_custom_op_symbolic("vai::fix_neuron", symbolic_fix_neuron, 9)
  output_file = os.path.join(
      output_dir, f"{self.quantizer.quant_model._get_name()}_int.onnx")
  opset_version = torch.onnx.symbolic_helper._onnx_stable_opsets[-1]
  device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  self.quantizer.reset_status_for_exporting()
  model, input_args = to_device(self.quantizer.quant_model,
                                self._example_inputs, device)
  torch.onnx.export(
      self.quantizer.quant_model,
      input_args,
      output_file,
      verbose=verbose,
      opset_version=opset_version)
def forward(self, input):
  qinput = quantize_tensors([input], self.node, tensor_type='input')[0]
  if NndctOption.nndct_quant_off.value or NndctOption.nndct_cv_app.value:
    output = super().forward(qinput)
    output = quantize_tensors([output], self.node)[0]
  elif self.quant_mode > 0:
    output = torch.empty_like(qinput)
    if NndctOption.nndct_tanh_sigmoid_sim.value > 0:
      NndctTanhSimulation(input, output)
      output = quantize_tensors([output], self.node)[0]
    else:
      input_name = self.node.in_nodes[0]
      fragpos = self.quantizer.get_quant_config(input_name, False)[1]
      quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
      Ttable = TANH_TABLE.table.to(quant_device)
      output = output.to(quant_device)
      NndctTanhTableLookup(input, Ttable, output, fragpos)
  else:
    output = super().forward(qinput)
  return output
def maybe_get_quantizer(quantizer=None):
  quantizer = quantizer or GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANTIZER)
  if quantizer:
    return quantizer.quant_mode, quantizer
  else:
    return GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_MODE), None
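# Typical call pattern (sketch): wrapped modules look up the ambient quantizer
# lazily, falling back to the global map when none is passed in explicitly.
#   quant_mode, quantizer = maybe_get_quantizer()
#   if quantizer is not None:
#     res = quantizer.do_quantize(blob, name, node, tensor_type='output')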
def do_scan(self, res, name, node=None, tensor_type='input'):
  # keep quantization steps after fast finetune
  if self.keep_fp:
    return self.do_quantize(res, name, node, tensor_type)

  # forward the quant graph but do not quantize parameters and activations
  if NndctOption.nndct_quant_off.value:
    if self.inplace:
      return res
    else:
      return res.clone().detach()

  res_save = None
  if isinstance(res.values, torch.Tensor):
    res_save = res
    res = res.values.data

  if res.dtype != torch.float32 and res.dtype != torch.double:
    NndctScreenLogger().warning_once(
        f'The tensor type of {node.name} is {str(res.dtype)}. '
        'Only float32/double quantization is supported.')
    return res_save if res_save is not None else res

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if res.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}, device of model and data should match "
        "device of quantizer".format(quant_device.type))

  # get fixed position
  bnfp = self.get_quant_config(name, False, tensor_type)

  # hardware cut method
  mth = 4 if self.lstm else 2
  if NndctOption.nndct_use_torch_quantizer.value is True:
    mth = -1
  elif tensor_type == 'param':
    mth = 3

  scope = 5 if NndctOption.nndct_diffs_mode.value == "mse" else 1
  # set fix pos scanning scope to 1 for some types of tensors
  if node.op.type in [NNDCT_OP.INPUT, NNDCT_OP.QUANT_STUB]:
    scope = 1
  if self.lstm and tensor_type == 'input':
    scope = 1

  res = res.detach().clone()
  Tbuffer = torch.empty_like(res).to(quant_device)
  Tfixpos = torch.tensor([1], dtype=torch.get_default_dtype()).to(quant_device)

  # activations always calculate fix pos;
  # parameters calculate it only when it is None;
  # fix pos is always calculated in finetune mode
  if tensor_type != 'param' or bnfp[1] is None or self.quant_mode == 3:
    py_nndct.nn.NndctDiffsFixPos(
        Tinput=res,
        Tbuffer=Tbuffer,
        Tfixpos=Tfixpos,
        bit_width=bnfp[0],
        range=scope,
        method=mth)
    bnfp[1] = int(Tfixpos.item())
    # limit max fix pos to 12 if bit width <= 8, otherwise limit to 15
    if bnfp[0] <= 8 or self.lstm:
      max_fp = NndctOption.nndct_max_fix_position.value
      bnfp[1] = min(max_fp, bnfp[1])
    else:
      bnfp[1] = min(15, bnfp[1])
    # record fix pos of activation
    if tensor_type != 'param':
      self.config_history[tensor_type][name].append(bnfp[1])
      if NndctOption.nndct_stat.value > 1:
        print(f'---- fp history: '
              f'{stats.mode(np.array(self.config_history[tensor_type][name]))}')
      data = np.array(self.config_history[tensor_type][name])
      bnfp[1] = stats.mode(data)[0][0]
      bnfp[1] = bnfp[1].astype(np.int32).tolist()
    self.set_quant_config(name, bnfp, tensor_type)
    if NndctOption.nndct_stat.value > 1:
      print('---- quant %s tensor: %s with bw = %d and fp = %g' %
            (tensor_type, name, bnfp[0], bnfp[1]))

    # get 2^bit_width and 2^fracpos
    bnfp = self.get_quant_config(name, True, tensor_type)

    if NndctOption.nndct_stat.value > 2:
      quant_data = nndct_quant.QuantizeData(name, res.cpu().detach().numpy())

    # do quantization for parameter or activation
    res = fake_quantize_per_tensor(res, bnfp[1], 0, -bnfp[0], bnfp[0] - 1,
                                   mth, self.inplace)

    if NndctOption.nndct_stat.value > 2:
      #quant_data.all_close(res.cpu().detach().numpy())
      global global_snr_inv
      quant_efficiency, sqnr = quant_data.quant_efficiency(
          res.cpu().detach().numpy(), math.log2(bnfp[0]))
      global_snr_inv += 1 / sqnr
      if quant_efficiency < 3.0:
        print(f"quant_efficiency={quant_efficiency}, {quant_data._name}\n")
        print('Statistic [Min, Max, Mean, Std]:')
        print('[{}, {}, {}, {}]'.format(res.min(), res.max(), res.mean(),
                                        res.std()))
        print('histogram: {}'.format(
            res.histc(bins=10).cpu().detach().numpy()))
        t = res
        if tensor_type != 'param':
          t = res.transpose(0, 1)
        print('Channel number: {}'.format(t.shape[0]))
        print('Channel-wise statistic [Min, Max, Mean, Std]:')
        for c in range(t.shape[0]):
          print('[{}, {}, {}, {}]'.format(t[c].min(), t[c].max(), t[c].mean(),
                                          t[c].std()))
          print('histogram: {}'.format(
              t[c].histc(bins=10).cpu().detach().numpy()))

  if res_save is not None:
    res_save.values.data = res
    res = res_save
  return res
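# Fixed-point relation behind bnfp = [bit_width, fix_pos] above (a sketch of
# the usual power-of-two scheme; exact kernel behavior lives in
# py_nndct.nn.NndctDiffsFixPos / fake_quantize_per_tensor):
#   q = clamp(round(x * 2**fp), -2**(bw - 1), 2**(bw - 1) - 1) / 2**fp
# e.g. bw=8, fp=6 gives a step of 1/64 and a representable range of
# [-2.0, 1.984375].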