Ejemplo n.º 1
0
    def quantize(self):
        '''
            parameter node: Attention node.
            parameter new_nodes_list: List of new nodes created before processing this node.
            return: a list of nodes in topological order that represents quantized Attention node
        '''
        node = self.node
        assert (node.op_type == "Attention")

        (quantized_input_names, zero_point_names, scale_names, nodes) = \
            self.quantizer.quantize_inputs(node, [0, 1])

        qattention_name = "" if node.name == "" else node.name + "_quant"

        inputs = []
        inputs.extend(quantized_input_names)
        inputs.extend([node.input[2]])
        inputs.extend(scale_names)
        inputs.extend([node.input[3] if len(node.input) > 3 else ""])
        inputs.extend(zero_point_names)
        inputs.extend([node.input[4] if len(node.input) > 4 else ""])

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qattention_node = onnx.helper.make_node("QAttention", inputs,
                                                node.output, qattention_name,
                                                **kwargs)
        nodes.append(qattention_node)

        self.quantizer.new_nodes += nodes
Ejemplo n.º 2
0
    def quantize(self):
        node = self.node
        quantized_input_names, zero_point_names, scale_names, nodes = \
                                           self.quantizer.quantize_inputs(node, [0])
        quantized_node_name = ""
        if node.name != "":
            quantized_node_name = node.name + "_quant"
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))

        # Output just derive the scale/zero from input
        quantized_output_names = []
        for output_name in node.output:
            quantized_output_name = output_name + "quantized"
            quantized_output_names.append(quantized_output_name)
            q_output = QuantizedValue(output_name, quantized_output_name,
                                      scale_names[0], zero_point_names[0],
                                      QuantizedValueType.Input)
            self.quantizer.quantized_value_map[output_name] = q_output

        if len(node.input) > 1:
            quantized_input_names = quantized_input_names.extend(
                node.input[1:])
        quantized_node = onnx.helper.make_node(node.op_type,
                                               quantized_input_names,
                                               quantized_output_names,
                                               quantized_node_name, **kwargs)

        nodes.append(quantized_node)
        self.quantizer.new_nodes += nodes
Ejemplo n.º 3
0
    def quantize(self):
        node = self.node

        data_found, output_scale_name, output_zp_name, _, _ = \
            self.quantizer._get_quantization_params(node.output[0])
        if (not data_found):  # only try to quantize when given quantization parameters for it
            return super().quantize()

        (quantized_input_names, zero_point_names, scale_names, nodes) = \
            self.quantizer.quantize_inputs(node, [0, 1], initializer_use_weight_qType=False)

        qlinear_binary_math_output = node.output[0] + "_quantized"
        qlinear_binary_math_name = node.name + "_quant" if node.name != "" else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlinear_binary_math_inputs = []
        # Input 0
        qlinear_binary_math_inputs.append(quantized_input_names[0])
        qlinear_binary_math_inputs.append(scale_names[0])
        qlinear_binary_math_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_binary_math_inputs.append(quantized_input_names[1])
        qlinear_binary_math_inputs.append(scale_names[1])
        qlinear_binary_math_inputs.append(zero_point_names[1])

        # Output
        qlinear_binary_math_inputs.append(output_scale_name)
        qlinear_binary_math_inputs.append(output_zp_name)

        qlinear_binary_math_node = onnx.helper.make_node("QLinear" + node.op_type, 
                                                         qlinear_binary_math_inputs,
                                                         [qlinear_binary_math_output], 
                                                         qlinear_binary_math_name,
                                                         **kwargs)
        nodes.append(qlinear_binary_math_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(node.output[0], qlinear_binary_math_output, 
                                  output_scale_name, output_zp_name,
                                  QuantizedValueType.Input)
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes
Ejemplo n.º 4
0
    def quantize(self):
        node = self.node
        assert node.op_type in ["Conv", "FusedConv"]

        (quantized_input_names, zero_point_names, scale_names, nodes) = \
            self.quantizer.quantize_inputs(node, [0, 1])

        # quantize bias if exist
        quantized_bias_name = ""
        bias_present = False
        if len(node.input) == 3:
            quantized_bias_name = self.quantizer.quantize_bias(node, nodes)
            bias_present = True

        conv_integer_output = node.output[0] + "_output_quantized"
        conv_integer_name = node.name + "_quant" if node.name != "" else ""

        kwargs = {}
        for attribute in node.attribute:
            if attribute.name == 'activation' and attribute.s in [
                    b'Relu', b'Clip'
            ]:
                continue
            if attribute.name == 'activation_params':
                continue
            kwargs.update(attribute_to_kwarg(attribute))
        conv_integer_node = onnx.helper.make_node(
            "ConvInteger", quantized_input_names + zero_point_names,
            [conv_integer_output], conv_integer_name, **kwargs)
        nodes.append(conv_integer_node)

        # Add bias add nodes
        if bias_present:
            conv_integer_output = self.quantizer.get_bias_add_nodes(
                nodes, node, conv_integer_output, quantized_bias_name)

        # Add cast operation to cast convInteger output to float.
        cast_op_output = conv_integer_output + "_cast_output"
        cast_node = onnx.helper.make_node("Cast", [conv_integer_output],
                                          [cast_op_output],
                                          conv_integer_output + "_cast",
                                          to=onnx_proto.TensorProto.FLOAT)
        nodes.append(cast_node)

        # Add mul operation to multiply scales of two inputs.
        assert (len(scale_names) == 2)
        if conv_integer_name != "":
            scales_mul_op = conv_integer_name + "_scales_mul"
        else:
            scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"

        scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
        if scales_mul_node is None:
            scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0",
                                           scales_mul_op)
            nodes.append(scales_mul_node)

        scales_mul_op_output = scales_mul_node.output[0]

        # Add mul operation to multiply mul_scales_op result with output of ConvInteger
        # and make the output of this node the same as output of original conv node.
        output_scale_mul_op = conv_integer_name + "_output_scale_mul" if \
                                                  conv_integer_name != "" else ""
        nodes.append(
            get_mul_node([cast_op_output, scales_mul_op_output],
                         node.output[0], output_scale_mul_op))

        self.quantizer.new_nodes += nodes
Ejemplo n.º 5
0
    def quantize(self):
        node = self.node
        assert (node.op_type in ["Conv", "FusedConv"])

        if self.quantizer.is_input_a_weight(
                node.input[1]) and self.per_channel:
            (quantized_input_names, zero_point_names, scale_names, nodes) = \
                self.quantizer.quantize_inputs(node, [0])
            quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
                node.input[1], self.weight_dtype, 0)
            quantized_input_names.append(quant_weight_tuple[0])
            zero_point_names.append(quant_weight_tuple[1])
            scale_names.append(quant_weight_tuple[2])
        else:
            (quantized_input_names, zero_point_names, scale_names, nodes) = \
                self.quantizer.quantize_inputs(node, [0, 1])

        quantized_bias_name = ""
        bias_present = False
        if len(node.input) == 3:
            quantized_bias_name = self.quantizer.quantize_bias(node, nodes)
            bias_present = True
        data_found, output_scale_name, output_zp_name, _, _ = \
            self.quantizer._get_quantization_params(node.output[0])

        if not data_found:
            raise ValueError(
                "Quantization parameters for output:\"{}\" of node:\"{}\" not \
                              specified".format(node.output[0], node.name))

        qlinear_conv_output = node.output[0] + "_quantized"
        qlinear_conv_name = qlinear_conv_name = node.name + "_quant" if \
                                                node.name != "" else ""

        kwargs = {}
        for attribute in node.attribute:
            if attribute.name == 'activation' and attribute.s in [
                    b'Relu', b'Clip'
            ]:
                continue
            if attribute.name == 'activation_params':
                continue
            kwargs.update(attribute_to_kwarg(attribute))
        qlinear_conv_inputs = []
        # Input 0
        qlinear_conv_inputs.append(quantized_input_names[0])
        qlinear_conv_inputs.append(scale_names[0])
        qlinear_conv_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_conv_inputs.append(quantized_input_names[1])
        qlinear_conv_inputs.append(scale_names[1])
        qlinear_conv_inputs.append(zero_point_names[1])

        # Output
        qlinear_conv_inputs.append(output_scale_name)
        qlinear_conv_inputs.append(output_zp_name)

        if bias_present:
            qlinear_conv_inputs.append(quantized_bias_name)

        qlinear_conv_node = onnx.helper.make_node("QLinearConv",
                                                  qlinear_conv_inputs,
                                                  [qlinear_conv_output],
                                                  qlinear_conv_name, **kwargs)
        nodes.append(qlinear_conv_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(node.output[0], qlinear_conv_output,
                                  output_scale_name, output_zp_name,
                                  QuantizedValueType.Input)
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes
Ejemplo n.º 6
0
    def quantize(self):
        node = self.node
        assert (node.op_type == "Pad")

        # Only after version 11, it has the optional constant_value
        # If input[0] is not quantized, do not quanitize this node
        if (self.quantizer.opset_version < 11) or (node.input[0] not \
                                               in self.quantizer.quantized_value_map):
            super().quantize()
            return
        quantized_input_value = self.quantizer.quantized_value_map[
            node.input[0]]

        kwargs = {}
        for attribute in node.attribute:
            kv = attribute_to_kwarg(attribute)
            kwargs.update(kv)

        if 'mode' not in kwargs or kwargs['mode'] == b'constant':
            if len(node.input) > 2:  # There is 3rd input 'constant_value'
                zp_tensor = self.quantizer.model.get_initializer(
                    quantized_input_value.zp_name)
                scale_tensor = \
                            self.quantizer.model.get_initializer(quantized_input_value.scale_name)
                # if zp_tensor is None or scale_tensor is None:
                #     super().quantize()
                #     return

                padding_constant_initializer = self.quantizer.model.get_initializer(
                    node.input[2])
                if padding_constant_initializer is not None:
                    zp_array = onnx.numpy_helper.to_array(zp_tensor)
                    zp_value = zp_array.item(
                    ) if zp_array.ndim == 0 else zp_array[0]
                    scale_array = onnx.numpy_helper.to_array(scale_tensor)
                    scale_value = scale_array.item(
                    ) if scale_array.ndim == 0 else scale_array[0]
                    padding_constant_array = \
                                          onnx.numpy_helper.to_array(padding_constant_initializer)
                    quantized_padding_constant_array = quantize_nparray(
                        self.activation_dtype, padding_constant_array,
                        scale_value, zp_value)
                    quantized_padding_constant_name = node.input[
                        2] + "_quantized"
                    quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
                        quantized_padding_constant_array,
                        quantized_padding_constant_name)
                    # Suppose this padding constant initializer only used by the node
                    self.quantizer.model.remove_initializer(
                        padding_constant_initializer)
                    self.quantizer.model.add_initializer(
                        quantized_padding_constant_initializer)
                    node.input[2] = quantized_padding_constant_name
                else:
                    pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
                        node, 2, self.activation_dtype)
                    self.quantizer.new_nodes += pad_value_qnodes
                    node.input[2] = pad_value_qnodes[0].output[0]
            else:
                # pad zero_point for original zero
                node.input.extend([quantized_input_value.zp_name])

        # Create an entry for output quantized value
        quantized_output_value = QuantizedValue(
            node.output[0], node.output[0] + "_quantized",
            quantized_input_value.scale_name, quantized_input_value.zp_name,
            QuantizedValueType.Input)
        self.quantizer.quantized_value_map[
            node.output[0]] = quantized_output_value

        node.input[0] = quantized_input_value.q_name
        node.output[0] = quantized_output_value.q_name
        self.quantizer.new_nodes += [node]
Ejemplo n.º 7
0
    def quantize(self):
        '''
            parameter node: LSTM node.
            parameter new_nodes_list: List of new nodes created before processing this node.
            return: a list of nodes in topological order that represents quantized Attention node.
        '''
        node = self.node
        assert (node.op_type == "LSTM")

        if (not self.quantizer.is_valid_quantize_weight(node.input[1]) or
            not self.quantizer.is_valid_quantize_weight(node.input[2])):
            super().quantize()
            return

        model = self.quantizer.model
        W = model.get_initializer(node.input[1])
        R = model.get_initializer(node.input[2])

        if (len(W.dims) != 3 or len(R.dims) != 3):
            super().quantize()
            return

        [W_num_dir, W_4_hidden_size, W_input_size] = W.dims
        [R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims

        if self.per_channel:
            del W.dims[0]
            del R.dims[0]
            W.dims[0] = W_num_dir * W_4_hidden_size
            R.dims[0] = R_num_dir * R_4_hidden_size

        quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[1], 
                                                                              self.weight_dtype, 0)
        quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[2], 
                                                                              self.weight_dtype, 0)

        W_quant_weight = model.get_initializer(quant_input_weight_tuple[0])
        R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0])

        W_quant_array = onnx.numpy_helper.to_array(W_quant_weight)
        R_quant_array = onnx.numpy_helper.to_array(R_quant_weight)

        W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size))
        R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size))

        W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1))
        R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1))

        W_quant_tranposed = onnx.numpy_helper.from_array(W_quant_array, \
                                                         quant_input_weight_tuple[0])
        R_quant_tranposed = onnx.numpy_helper.from_array(R_quant_array, 
                                                         quant_recurrent_weight_tuple[0])

        model.remove_initializers([W_quant_weight, R_quant_weight])
        model.add_initializer(W_quant_tranposed)
        model.add_initializer(R_quant_tranposed)

        W_quant_zp = model.get_initializer(quant_input_weight_tuple[1])
        R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1])
        W_quant_scale = model.get_initializer(quant_input_weight_tuple[2])
        R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2])

        if self.per_channel:
            W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
            R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
            W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
            R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]

        inputs = []
        input_len = len(node.input)
        inputs.extend([node.input[0]])
        inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
        inputs.extend([node.input[3] if input_len > 3 else ""])
        inputs.extend([node.input[4] if input_len > 4 else ""])
        inputs.extend([node.input[5] if input_len > 5 else ""])
        inputs.extend([node.input[6] if input_len > 6 else ""])
        inputs.extend([node.input[7] if input_len > 7 else ""])
        inputs.extend([quant_input_weight_tuple[2], 
                       quant_input_weight_tuple[1], 
                       quant_recurrent_weight_tuple[2], 
                       quant_recurrent_weight_tuple[1]])

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        quant_lstm_name = "" if node.name == "" else node.name + "_quant"
        quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", 
                                                inputs, node.output, quant_lstm_name, **kwargs)

        self.quantizer.new_nodes += [quant_lstm_node]