def find_node_by_name(self, node_name, new_nodes_list, graph):
    '''
    Find a node by name, either in the given graph or in the new set of
    nodes created during quantization. Return the node if found,
    otherwise None.
    '''
    graph_nodes_list = list(graph.node)  # copy of the node list
    graph_nodes_list.extend(new_nodes_list)
    node = find_by_name(node_name, graph_nodes_list)
    return node
def quantize_weight_per_channel(self, weight_name, weight_qType, channel_axis):
    # Find if this input is already quantized
    if weight_name in self.quantized_value_map:
        quantized_value = self.quantized_value_map[weight_name]
        return (quantized_value.q_name, quantized_value.zp_name,
                quantized_value.scale_name)

    initializer = find_by_name(weight_name, self.model.initializer())
    if initializer is None:
        raise ValueError("{} is not an initializer".format(weight_name))

    weights = self.tensor_proto_to_array(initializer)
    channel_count = weights.shape[channel_axis]
    rmin_list = []
    rmax_list = []
    zero_point_list = []
    scale_list = []
    quantized_per_channel_data_list = []
    for i in range(channel_count):
        per_channel_data = weights.take(i, channel_axis)
        rmin, rmax, zero_point, scale, quantized_per_channel_data = quantize_data(
            per_channel_data.flatten().tolist(),
            _get_qrange_for_qType(weight_qType, self.reduce_range),
            weight_qType)
        rmin_list.append(rmin)
        rmax_list.append(rmax)
        zero_point_list.append(zero_point)
        scale_list.append(scale)
        quantized_per_channel_data_list.append(quantized_per_channel_data)

    # combine per_channel_data into one tensor
    reshape_dims = list(weights.shape)  # copy of the original shape
    reshape_dims[channel_axis] = 1  # only one channel per slice for reshape
    quantized_weights = np.asarray(
        quantized_per_channel_data_list[0]).reshape(reshape_dims)
    for i in range(1, len(quantized_per_channel_data_list)):
        channel_weights = np.asarray(
            quantized_per_channel_data_list[i]).reshape(reshape_dims)
        quantized_weights = np.concatenate(
            (quantized_weights, channel_weights), channel_axis)

    weight = QuantizedInitializer(initializer.name, initializer, rmin_list,
                                  rmax_list, zero_point_list, scale_list,
                                  weights,
                                  quantized_weights.flatten().tolist(),
                                  channel_axis, weight_qType)

    # Make entry for this quantized weight
    assert (weight.name not in self.quantized_value_map)
    quantized_value = QuantizedValue(weight.name, weight.name + "_quantized",
                                     weight.name + "_scale",
                                     weight.name + "_zero_point",
                                     QuantizedValueType.Initializer, None,
                                     weight_qType)
    self.quantized_value_map[weight.name] = quantized_value

    self._update_weight(weight)

    return (weight.name + "_quantized", weight.name + "_zero_point",
            weight.name + "_scale")
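
# A standalone numpy-only sketch (hypothetical helper, not part of this
# module) of the reshape-and-concatenate step above: each channel slice is
# quantized with its own scale, reshaped so the channel axis has length 1,
# then the slices are concatenated back along that axis to rebuild the
# original weight shape.
import numpy as np

def per_channel_combine_sketch(weights, channel_axis=0):
    reshape_dims = list(weights.shape)
    reshape_dims[channel_axis] = 1  # one slice per channel
    quantized_slices = []
    for i in range(weights.shape[channel_axis]):
        data = weights.take(i, channel_axis)
        # symmetric int8 quantization, for illustration only
        scale = max(np.abs(data).max() / 127.0, 1e-8)
        q = np.clip(np.round(data / scale), -127, 127).astype(np.int8)
        quantized_slices.append(q.reshape(reshape_dims))
    return np.concatenate(quantized_slices, channel_axis)

# e.g. a (4, 3, 2, 2) Conv weight quantized along the output-channel axis
w = np.random.randn(4, 3, 2, 2).astype(np.float32)
assert per_channel_combine_sketch(w, 0).shape == w.shape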
def quantize(self):
    node = self.node
    assert (node.op_type == "MatMul")

    (quantized_input_names, zero_point_names, scale_names, nodes) = \
        self.quantizer.quantize_inputs(node, [0, 1])

    matmul_integer_output = node.output[0] + "_output_quantized"
    matmul_integer_name = node.name + "_quant" if node.name != "" else ""
    matmul_integer_node = onnx.helper.make_node(
        "MatMulInteger", quantized_input_names + zero_point_names,
        [matmul_integer_output], matmul_integer_name)
    nodes.append(matmul_integer_node)

    # Add cast operation to cast MatMulInteger output to float.
    cast_op_output = matmul_integer_output + "_cast_output"
    cast_node = onnx.helper.make_node("Cast", [matmul_integer_output],
                                      [cast_op_output],
                                      matmul_integer_output + "_cast",
                                      to=onnx_proto.TensorProto.FLOAT)
    nodes.append(cast_node)

    # Add mul operation to multiply scales of two inputs.
    assert (len(scale_names) == 2)
    scales_mul_op = matmul_integer_name + "_scales_mul" if matmul_integer_name != "" else \
        scale_names[0] + "_" + scale_names[1] + "_mul"

    scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
    if scales_mul_node is None:
        scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0",
                                       scales_mul_op)
        nodes.append(scales_mul_node)

    scales_mul_op_output = scales_mul_node.output[0]

    # Add mul operation to multiply mul_scales_op result with output of MatMulInteger
    # and make the output of this node the same as output of original matmul node.
    output_scale_mul_op = ""
    if matmul_integer_name != "":
        output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
    nodes.append(
        get_mul_node([cast_op_output, scales_mul_op_output], node.output[0],
                     output_scale_mul_op))
    self.quantizer.new_nodes += nodes
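
# A small numpy sketch (assumed values, not part of this module) of the
# arithmetic the node sequence above builds: MatMulInteger computes
# (A_q - zp_A) @ (B_q - zp_B) in int32, Cast turns the result into float,
# and the two Mul nodes apply scale_A * scale_B to recover the float product.
import numpy as np

np.random.seed(0)

def quant_u8(x):
    # crude asymmetric uint8 quantization, for illustration only
    rmin, rmax = float(x.min()), float(x.max())
    scale = (rmax - rmin) / 255.0
    zp = int(round(-rmin / scale))
    q = np.clip(np.round(x / scale) + zp, 0, 255).astype(np.uint8)
    return q, scale, zp

a = np.random.randn(4, 8).astype(np.float32)
b = np.random.randn(8, 5).astype(np.float32)
a_q, a_scale, a_zp = quant_u8(a)
b_q, b_scale, b_zp = quant_u8(b)

int32_out = (a_q.astype(np.int32) - a_zp) @ (b_q.astype(np.int32) - b_zp)
approx = int32_out.astype(np.float32) * (a_scale * b_scale)
# quantization error grows with the inner dimension; the tolerance is loose
assert np.allclose(approx, a @ b, atol=1.0)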
def get_bias_add_nodes(self, nodes, node, last_output, quantized_bias_name):
    '''
    Given a node, this function handles the bias add by adding a "Reshape"
    node on the bias and an "Add" node.
        parameter nodes: new nodes will be appended to this list.
        parameter node: current node (Conv).
        parameter last_output: output of the previous node (input to bias add).
        return: the name of the bias-add output.
    '''
    # Add tensors for the shape to be reshaped to
    weight = find_by_name(node.input[1], self.model.initializer())
    if weight is None:
        raise ValueError("Expected {} to be an initializer".format(
            node.input[1]))

    # Add reshape for correct broadcast
    reshape_input_data = quantized_bias_name
    reshape_input_shape = quantized_bias_name + "_reshape_shape"
    reshape_input = [reshape_input_data, reshape_input_shape]

    reshape_shape = np.ones((len(weight.dims)), dtype=np.int64)
    reshape_shape[1] = -1
    init_shape = onnx.helper.make_tensor(reshape_input_shape,
                                         onnx_proto.TensorProto.INT64,
                                         [len(weight.dims)], reshape_shape)
    self.model.add_initializer(init_shape)

    reshape_op_output = node.output[0] + "_reshape"
    reshape_node = onnx.helper.make_node("Reshape", reshape_input,
                                         [reshape_op_output],
                                         quantized_bias_name + "reshape")
    nodes.append(reshape_node)

    # Add an Add operation for bias
    bias_add_input = [last_output]
    bias_add_input.append(reshape_op_output)
    add_node_output = node.output[0] + "_bias_add"
    add_node = onnx.helper.make_node("Add", bias_add_input,
                                     [add_node_output],
                                     quantized_bias_name + "bias_add")
    nodes.append(add_node)
    return add_node_output
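
# A short numpy sketch (hypothetical shapes) of the broadcast the Reshape
# above sets up: reshape_shape is all ones except -1 at axis 1, so a
# length-C bias becomes (1, C, 1, 1) and adding it to an NCHW Conv output
# broadcasts one bias value per output channel.
import numpy as np

weight_dims = (8, 3, 3, 3)  # Conv weight: (C_out, C_in, kH, kW)
reshape_shape = np.ones(len(weight_dims), dtype=np.int64)
reshape_shape[1] = -1       # -> [1, -1, 1, 1]

bias = np.arange(8, dtype=np.float32)          # one value per output channel
conv_output = np.zeros((1, 8, 5, 5), dtype=np.float32)
result = conv_output + bias.reshape(reshape_shape)
assert np.all(result[0, 3] == bias[3])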
def quantize(self):
    node = self.node
    assert node.op_type in ["Conv", "FusedConv"]

    (quantized_input_names, zero_point_names, scale_names, nodes) = \
        self.quantizer.quantize_inputs(node, [0, 1])

    # quantize bias if it exists
    quantized_bias_name = ""
    bias_present = False
    if len(node.input) == 3:
        quantized_bias_name = self.quantizer.quantize_bias(node, nodes)
        bias_present = True

    conv_integer_output = node.output[0] + "_output_quantized"
    conv_integer_name = node.name + "_quant" if node.name != "" else ""

    kwargs = {}
    for attribute in node.attribute:
        # skip fused-activation attributes; ConvInteger does not support them
        if attribute.name == 'activation' and attribute.s in [
                b'Relu', b'Clip'
        ]:
            continue
        if attribute.name == 'activation_params':
            continue
        kwargs.update(attribute_to_kwarg(attribute))
    conv_integer_node = onnx.helper.make_node(
        "ConvInteger", quantized_input_names + zero_point_names,
        [conv_integer_output], conv_integer_name, **kwargs)
    nodes.append(conv_integer_node)

    # Add bias add nodes
    if bias_present:
        conv_integer_output = self.quantizer.get_bias_add_nodes(
            nodes, node, conv_integer_output, quantized_bias_name)

    # Add cast operation to cast ConvInteger output to float.
    cast_op_output = conv_integer_output + "_cast_output"
    cast_node = onnx.helper.make_node("Cast", [conv_integer_output],
                                      [cast_op_output],
                                      conv_integer_output + "_cast",
                                      to=onnx_proto.TensorProto.FLOAT)
    nodes.append(cast_node)

    # Add mul operation to multiply scales of two inputs.
    assert (len(scale_names) == 2)
    if conv_integer_name != "":
        scales_mul_op = conv_integer_name + "_scales_mul"
    else:
        scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"

    scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
    if scales_mul_node is None:
        scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0",
                                       scales_mul_op)
        nodes.append(scales_mul_node)

    scales_mul_op_output = scales_mul_node.output[0]

    # Add mul operation to multiply mul_scales_op result with output of ConvInteger
    # and make the output of this node the same as output of original conv node.
    output_scale_mul_op = conv_integer_name + "_output_scale_mul" if \
        conv_integer_name != "" else ""
    nodes.append(
        get_mul_node([cast_op_output, scales_mul_op_output], node.output[0],
                     output_scale_mul_op))
    self.quantizer.new_nodes += nodes
def add_initializer(self, tensor):
    # only add the tensor if no initializer with the same name is already
    # present in the graph
    if find_by_name(tensor.name, self.model.graph.initializer) is None:
        self.model.graph.initializer.extend([tensor])
def quantize_inputs(self, node, indices, initializer_use_weight_qType=True):
    '''
    Given a node, this function quantizes the inputs as follows:
        - If the input is an initializer, quantize the initializer data and
          replace the old initializer with the new one.
        - Else, add QuantizeLinear nodes to perform quantization at runtime.
        parameter node: node being quantized, in NodeProto format.
        parameter indices: input indices to quantize.
        return: (List of quantized input names,
                 List of zero point names used for input quantization,
                 List of scale names used for input quantization,
                 List of new QuantizeLinear nodes created)
    '''
    scale_names = []
    zero_point_names = []
    quantized_input_names = []
    nodes = []

    for input_index in indices:
        node_input = node.input[input_index]

        # Find if this input is already quantized
        if node_input in self.quantized_value_map:
            quantized_value = self.quantized_value_map[node_input]
            scale_names.append(quantized_value.scale_name)
            zero_point_names.append(quantized_value.zp_name)
            quantized_input_names.append(quantized_value.q_name)
            continue

        # Quantize the input
        initializer = find_by_name(node_input, self.model.initializer())
        if initializer is not None:
            weight = self._get_quantized_weight(
                initializer,
                self.config[node.name]['weight']['dtype']
                if initializer_use_weight_qType else
                self.config[node.name]['activation']['dtype'])

            # Update graph
            self._update_weight(weight)

            quantized_input_names.append(weight.name + "_quantized")
            zero_point_names.append(weight.name + "_zero_point")
            scale_names.append(weight.name + "_scale")
        else:
            # Add QuantizeLinear node.
            qlinear_node = self.model.find_node_by_name(
                node_input + "_QuantizeLinear", self.new_nodes,
                self.model.graph())
            if qlinear_node is None:
                quantize_input_nodes = self._get_quantize_input_nodes(
                    node, input_index,
                    self.config[node.name]['activation']['dtype'])
                nodes.extend(quantize_input_nodes)
                qlinear_node = quantize_input_nodes[-1]

            if qlinear_node.op_type == "QuantizeLinear":
                quantized_input_names.extend(qlinear_node.output)
                scale_names.append(qlinear_node.input[1])
                zero_point_names.append(qlinear_node.input[2])
            else:
                quantized_input_names.append(qlinear_node.output[0])
                scale_names.append(qlinear_node.output[1])
                zero_point_names.append(qlinear_node.output[2])

    return (quantized_input_names, zero_point_names, scale_names, nodes)
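
# A minimal sketch (hypothetical tensor names) of the dynamic path above:
# when an input is not an initializer, a QuantizeLinear-style subgraph is
# emitted, and its quantized output / scale / zero-point names are what
# quantize_inputs hands back to the integer-op builders.
import onnx

qlinear_node = onnx.helper.make_node(
    "QuantizeLinear",
    ["input_0", "input_0_scale", "input_0_zero_point"],
    ["input_0_quantized"],
    "input_0_QuantizeLinear")
# quantized_input_names -> ["input_0_quantized"]
# scale_names           -> ["input_0_scale"]
# zero_point_names      -> ["input_0_zero_point"]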
def quantize_bias(self, node, new_node_list):
    '''
    Quantize the bias. Zero Point == 0 and
    Scale == Input_Scale * Weight_Scale
    '''
    # get scale for weight
    weight_scale_name = self.quantized_value_map[node.input[1]].scale_name
    weight_initializer = find_by_name(weight_scale_name,
                                      self.model.initializer())
    weight_scale = self.tensor_proto_to_array(weight_initializer)

    # get bias
    bias_name = node.input[2]
    bias_initializer = find_by_name(bias_name, self.model.initializer())
    bias_data = self.tensor_proto_to_array(bias_initializer)
    quantized_bias_name = bias_name + "_quantized"

    # The input scale is not provided when this input is dynamically
    # quantized, so it is not pre-computed at this point; resort to
    # dynamic quantization for the bias as well.
    if self.quantization_params is None or \
            (node.input[0] not in self.quantization_params and
             node.input[0] not in self.quantized_value_map):
        self._dynamic_quantize_bias(node.input[0], weight_scale_name,
                                    bias_name, quantized_bias_name,
                                    new_node_list)
    else:
        # get scale for input
        if node.input[0] in self.quantized_value_map:
            input_scale_name = self.quantized_value_map[
                node.input[0]].scale_name
        elif node.input[0] in self.quantization_params:
            _, input_scale_name, _, _, _ = self._get_quantization_params(
                node.input[0])
        else:
            raise ValueError(
                "Expected {} to be in quantized value map for static "
                "quantization".format(node.input[0]))

        inputscale_initializer = find_by_name(input_scale_name,
                                              self.model.initializer())
        input_scale = self.tensor_proto_to_array(inputscale_initializer)

        # calculate scale for bias
        bias_scale = input_scale * weight_scale

        # quantize bias
        quantized_data = (np.asarray(bias_data) /
                          bias_scale).round().astype(np.int32)

        # update bias initializer
        bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(
            bias_initializer.dims)
        packed_bias_initializer = onnx.numpy_helper.from_array(
            bias_np_data, quantized_bias_name)
        self.model.initializer().extend([packed_bias_initializer])

        # log entries for this quantized bias value
        quantized_bias_entry = QuantizedInitializer(
            bias_name, bias_initializer, [0], [0], [0], [bias_scale],
            bias_data, quantized_data, qType=onnx_proto.TensorProto.INT32)
        self._quantized_weights.append(quantized_bias_entry)

        assert (bias_name not in self.quantized_value_map)
        quantized_value = QuantizedValue(bias_name, quantized_bias_name, "",
                                         "", QuantizedValueType.Initializer,
                                         None, onnx_proto.TensorProto.INT32)
        self.quantized_value_map[bias_name] = quantized_value

    return quantized_bias_name
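
# A tiny numeric sketch (illustrative values only) of the static rule above:
# the bias is quantized to int32 with zero point 0 and
# scale = input_scale * weight_scale, so the int32 bias terms live on the
# same scale as the int32 accumulator of ConvInteger/MatMulInteger.
import numpy as np

input_scale = 0.02
weight_scale = 0.005
bias = np.array([0.15, -0.3, 0.0071], dtype=np.float32)

bias_scale = input_scale * weight_scale
quantized_bias = (bias / bias_scale).round().astype(np.int32)
recovered = quantized_bias.astype(np.float32) * bias_scale
assert np.allclose(recovered, bias, atol=bias_scale / 2)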
def is_valid_quantize_weight(self, weight_name):
    weight = find_by_name(weight_name, self.model.initializer())
    return weight is not None and weight.data_type == onnx_proto.TensorProto.FLOAT
def is_input_a_weight(self, input_name):
    initializer = find_by_name(input_name, self.model.initializer())
    return initializer is not None
def remove_fake_quantized_nodes(self):  # pragma: no cover
    '''
    Detect and remove QuantizeLinear/DequantizeLinear node pairs (fake
    quantized nodes inserted during quantization-aware training), then
    reconnect and update the surrounding nodes.
    !!! not supported now !!!
    '''
    nodes_to_remove = []
    initializers_to_remove = []

    for curr_node in self.model.nodes():
        if curr_node.op_type == 'QuantizeLinear':
            next_node, prev_node, succ_node = None, None, None
            for child_node in self.model.get_children(curr_node):
                if child_node.op_type == 'DequantizeLinear':
                    next_node = child_node
            if next_node is None:
                raise ValueError(
                    "Remove fake-quantized node pair Error: DequantizeLinear "
                    "node is not found for {}.".format(curr_node.name))

            prev_node = self.model.get_parent(curr_node, 0)
            if prev_node is None:
                raise ValueError(
                    "Remove fake-quantized node pair Error: Parent node is "
                    "not found for {}.".format(curr_node.name))

            succ_nodes = self.model.get_children(next_node)
            if len(succ_nodes) == 0:
                raise ValueError(
                    "Remove fake-quantized node pair Error: No successive "
                    "nodes found for {}.".format(next_node.name))

            # TODO: convert it to the specified input_type
            scale_tensor_name = curr_node.input[1]
            zp_tensor_name = curr_node.input[2]
            initializer_scale = find_by_name(scale_tensor_name,
                                             self.model.initializer())
            initializer_zp = find_by_name(zp_tensor_name,
                                          self.model.initializer())
            zp_and_scale = [
                onnx.numpy_helper.to_array(initializer_zp),
                onnx.numpy_helper.to_array(initializer_scale)
            ]

            # connect the previous and successive node input and output
            for succ_node in succ_nodes:
                succ_idx = get_elem_index(next_node.output[0],
                                          succ_node.input)
                if succ_idx != -1:
                    succ_node.input[succ_idx] = curr_node.input[0]
                else:
                    raise ValueError(
                        "Remove fake-quantized node pair Error: Connection "
                        "failed. No matched successive node input found "
                        "for {}.".format(next_node.name))

            param_name = curr_node.input[0]
            if self.quantization_params is None:
                self.quantization_params = {}
            self.quantization_params[param_name] = zp_and_scale

            # remove fake-quantized nodes
            nodes_to_remove.extend([curr_node])
            nodes_to_remove.extend([next_node])

            # remove unused initializers in graph
            initializers_to_remove.extend([initializer_scale])
            initializers_to_remove.extend([initializer_zp])

    self.model.remove_nodes(nodes_to_remove)
    self.model.remove_initializers(initializers_to_remove)

    return self.model.model
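
# The rewrite above eliminates fake-quantize pairs left behind by QAT:
#
#     prev --> QuantizeLinear --> DequantizeLinear --> succ
#
# becomes
#
#     prev --> succ
#
# while the (zero_point, scale) initializers of the QuantizeLinear node are
# stashed in self.quantization_params, keyed by the QuantizeLinear input, so
# later static quantization can reuse the QAT-learned parameters.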