Example 1
    def create_attention_node(self, mask_index: str, matmul: NodeProto, add: NodeProto, num_heads: int,
                              hidden_size: int, input: str, output: str, add_qk_str: str) -> Union[NodeProto, None]:
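        """Create a com.microsoft Attention node from a packed-QKV MatMul and its bias Add.

        mask_index is the attention mask input name (may be None), input and output are
        tensor names, and add_qk_str optionally names an extra tensor passed to the
        Attention node as an additional input.

        Returns the created node, or None on failure.
        """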

        assert num_heads > 0
        if hidden_size > 0 and (hidden_size % num_heads) != 0:
            logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
            return None

        weight = self.model.get_initializer(matmul.input[1])
        bias = self.model.get_initializer(add.input[1]) or self.model.get_initializer(add.input[0])

        if weight is None or bias is None:
            return None

        qkv_weight = NumpyHelper.to_array(weight)
        qkv_bias = NumpyHelper.to_array(bias)

        attention_node_name = self.model.create_node_name('Attention')

        weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
                                    data_type=TensorProto.FLOAT,
                                    dims=[hidden_size, 3 * hidden_size],
                                    vals=qkv_weight.flatten().tolist())

        # Sometimes weights and bias are stored in fp16. `weight` was rebuilt above as a FLOAT
        # tensor, so inspect the dtype of the original data instead.
        if qkv_weight.dtype == np.float16:
            weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name))
        self.model.add_initializer(weight, self.this_graph_name)

        bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
                                  data_type=TensorProto.FLOAT,
                                  dims=[3 * hidden_size],
                                  vals=qkv_bias.flatten().tolist())
        if qkv_bias.dtype == np.float16:
            bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name))
        self.model.add_initializer(bias, self.this_graph_name)

        attention_inputs = [input, attention_node_name + '_qkv_weight', attention_node_name + '_qkv_bias']
        if mask_index is not None:
            attention_inputs.append(mask_index)
        else:
            attention_inputs.append("")

        if add_qk_str is not None:
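            # The empty string fills the preceding optional input slot so that add_qk_str
            # lands in the next positional input of the Attention node.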
            attention_inputs.append("")
            attention_inputs.append(add_qk_str)

        attention_node = helper.make_node('Attention',
                                          inputs=attention_inputs,
                                          outputs=[output],
                                          name=attention_node_name)
        attention_node.domain = "com.microsoft"
        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])

        return attention_node
Example 2
    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
        """ Detect num_heads and hidden_size from a reshape node.

        Args:
            reshape_q (NodeProto): reshape node for Q

        Returns:
            Tuple[int, int]: num_heads and hidden_size
        """

        # We assume reshape fusion has been done, so the shape input is a constant like [0, 0, num_heads, head_size].
        q_shape = self.model.get_initializer(reshape_q.input[1])
        if q_shape is None:
            logger.debug(f"{reshape_q.input[1]} is not an initializer.")
            return self.num_heads, self.hidden_size  # Fall back to the user-specified value

        q_shape_value = NumpyHelper.to_array(q_shape)
        if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
            logger.debug(f"q_shape_value={q_shape_value}. Expected values are like [0, 0, num_heads, head_size].")
            return self.num_heads, self.hidden_size  # Fall back to the user-specified value

        num_heads = q_shape_value[2]
        head_size = q_shape_value[3]
        hidden_size = num_heads * head_size

        if self.num_heads > 0 and num_heads != self.num_heads:
            logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")

        if self.hidden_size > 0 and hidden_size != self.hidden_size:
            logger.warning(f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value.")

        return num_heads, hidden_size
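
A minimal sketch of the shape convention this helper relies on; the concrete values (12 heads, head size 64) are hypothetical:

import numpy as np

# Shape constant produced by reshape fusion: [batch, seq_len, num_heads, head_size],
# where 0 means "keep the original dimension".
q_shape_value = np.array([0, 0, 12, 64], dtype=np.int64)

num_heads = int(q_shape_value[2])    # 12
head_size = int(q_shape_value[3])    # 64
hidden_size = num_heads * head_size  # 768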
Example 3
def fill_zeros_for_external_data(tensor: TensorProto):
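    """Fill the raw_data of a tensor whose data is stored in an external file with zeros.

    This keeps the tensor's shape and type usable (e.g. for shape inference or graph
    transforms) without reading the external data from disk.
    """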
    if tensor.HasField("raw_data"):  # already loaded
        return

    value = NumpyHelper.to_array(tensor, fill_zeros=True)
    zero_tensor = numpy_helper.from_array(value, name=tensor.name)
    tensor.raw_data = zero_tensor.raw_data
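
A hedged usage sketch; the model path and the data_location check are assumptions, not part of the snippet above:

import onnx
from onnx import TensorProto

# Load the graph structure only, leaving external weight files on disk (hypothetical path).
model = onnx.load("model.onnx", load_external_data=False)

for tensor in model.graph.initializer:
    # Only tensors whose data lives in an external file need placeholder zeros.
    if tensor.data_location == TensorProto.EXTERNAL:
        fill_zeros_for_external_data(tensor)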
Example 4
    def fuse(self, node, input_name_to_nodes, output_name_to_node):
        gelu_op_type = node.op_type
        fuse_op_type = "BiasGelu" if gelu_op_type == "Gelu" else "FastGelu"

        if len(node.input) != 1:
            return

        nodes = self.model.match_parent_path(node, ["Add", "MatMul"],
                                             [0, None])
        if nodes is None:
            return
        (add, matmul) = nodes

        bias_weight = None
        # bias should be one dimension
        bias_index = -1
        for i, input_name in enumerate(add.input):
            initializer = self.model.get_initializer(input_name)
            if initializer is None:
                continue
            bias_index = i
            bias_weight = NumpyHelper.to_array(initializer)
            break
        if bias_weight is None:
            return
        if len(bias_weight.shape) != 1:
            return

        subgraph_nodes = [node, add]
        if not self.model.is_safe_to_fuse_nodes(
                subgraph_nodes, [node.output[0]], input_name_to_nodes,
                output_name_to_node):
            return

        self.nodes_to_remove.extend(subgraph_nodes)

        fused_node = helper.make_node(
            fuse_op_type,
            inputs=[matmul.output[0], add.input[bias_index]],
            outputs=node.output,
            name=self.model.create_node_name(fuse_op_type,
                                             gelu_op_type + "_AddBias_"),
        )
        fused_node.domain = "com.microsoft"
        self.nodes_to_add.append(fused_node)
        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
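
For context, a minimal sketch of the subgraph this pass matches, built with onnx.helper; the tensor and node names are hypothetical. The fusion replaces the Add + Gelu pair with a single com.microsoft BiasGelu (or FastGelu) node that consumes the MatMul output and the 1-D bias directly:

from onnx import helper

# Before fusion: x -> MatMul(W) -> Add(b) -> Gelu -> y
matmul = helper.make_node("MatMul", ["x", "W"], ["matmul_out"], name="fc")
add = helper.make_node("Add", ["matmul_out", "b"], ["add_out"], name="add_bias")
gelu = helper.make_node("Gelu", ["add_out"], ["y"], name="gelu", domain="com.microsoft")

# After fusion: x -> MatMul(W) -> BiasGelu(b) -> y
bias_gelu = helper.make_node("BiasGelu", ["matmul_out", "b"], ["y"],
                             name="Gelu_AddBias_0", domain="com.microsoft")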
Example 5
    def fuse(self, node, input_name_to_nodes, output_name_to_node):
        if len(node.input) != 4:
            return

        return_indice = []
        nodes = self.model.match_parent_path(node, ['Add', 'MatMul'],
                                             [None, None], None, return_indice)
        if nodes is None:
            return
        assert len(return_indice) == 2
        add_input_index = return_indice[0]
        if add_input_index >= 2:
            return

        (add, matmul) = nodes

        # bias should be one dimension
        bias_index = -1
        bias_weight = None
        for i, input_name in enumerate(add.input):
            initializer = self.model.get_initializer(input_name)
            if initializer is None:
                continue
            bias_index = i
            bias_weight = NumpyHelper.to_array(initializer)
            break
        if bias_weight is None:
            logger.debug("Bias weight not found")
            return
        if len(bias_weight.shape) != 1:
            logger.debug("Bias weight is not 1-D")
            return

        subgraph_nodes = [node, add]
        if not self.model.is_safe_to_fuse_nodes(
                subgraph_nodes, [node.output[0]], input_name_to_nodes,
                output_name_to_node):
            logger.debug("Skip fusing SkipLayerNormalization with Bias since it is not safe")
            return

        self.nodes_to_remove.extend(subgraph_nodes)
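        # SkipLayerNormalization inputs are (input, skip, gamma, beta, bias): the MatMul
        # output takes the place of the removed Add output and the 1-D bias is appended.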
        inputs = [
            node.input[1 - add_input_index], matmul.output[0], node.input[2],
            node.input[3], add.input[bias_index]
        ]
        new_node = helper.make_node("SkipLayerNormalization",
                                    inputs=inputs,
                                    outputs=node.output,
                                    name=self.model.create_node_name(
                                        "SkipLayerNormalization",
                                        "SkipLayerNorm_AddBias_"))
        new_node.domain = "com.microsoft"

        # Carry over the "epsilon" attribute from the original SkipLayerNormalization node
        for att in node.attribute:
            if att.name == 'epsilon':
                new_node.attribute.extend([att])

        # Set a default epsilon if the original node does not define one
        if len(new_node.attribute) == 0:
            new_node.attribute.extend(
                [helper.make_attribute("epsilon", 1.0E-12)])

        self.nodes_to_add.append(new_node)
        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
Example 6
    def create_attention_node(self, mask_index: str, q_matmul: NodeProto,
                              k_matmul: NodeProto, v_matmul: NodeProto,
                              q_add: NodeProto, k_add: NodeProto,
                              v_add: NodeProto, num_heads: int,
                              hidden_size: int, input: str,
                              output: str) -> Union[NodeProto, None]:
        """ Create an Attention node.

        Args:
            mask_index (str): mask input
            q_matmul (NodeProto): MatMul node in the fully connected layer for Q
            k_matmul (NodeProto): MatMul node in the fully connected layer for K
            v_matmul (NodeProto): MatMul node in the fully connected layer for V
            q_add (NodeProto): Add (bias) node in the fully connected layer for Q
            k_add (NodeProto): Add (bias) node in the fully connected layer for K
            v_add (NodeProto): Add (bias) node in the fully connected layer for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            input (str): input name
            output (str): output name

        Returns:
            Union[NodeProto, None]: the created node, or None if creation failed.
        """
        assert num_heads > 0

        if hidden_size > 0 and (hidden_size % num_heads) != 0:
            logger.debug(
                f"input hidden size {hidden_size} is not a multiple of the number of heads {num_heads}"
            )
            return None

        q_weight = self.model.get_initializer(q_matmul.input[1])
        k_weight = self.model.get_initializer(k_matmul.input[1])
        v_weight = self.model.get_initializer(v_matmul.input[1])
        q_bias = self.model.get_initializer(
            q_add.input[1]) or self.model.get_initializer(q_add.input[0])
        k_bias = self.model.get_initializer(
            k_add.input[1]) or self.model.get_initializer(k_add.input[0])
        v_bias = self.model.get_initializer(
            v_add.input[1]) or self.model.get_initializer(v_add.input[0])

        if q_weight is None:
            logger.warning(
                f"{q_matmul.input[1]} is not an initializer. Please set do_constant_folding=True in torch.onnx.export"
            )
            return None
        if not (k_weight and v_weight and q_bias and k_bias and v_bias):
            return None
        qw = NumpyHelper.to_array(q_weight)
        kw = NumpyHelper.to_array(k_weight)
        vw = NumpyHelper.to_array(v_weight)

        # assert q and k have same shape as expected
        assert qw.shape == kw.shape

        qw_in_size = qw.shape[0]
        kw_in_size = kw.shape[0]
        vw_in_size = vw.shape[0]

        assert qw_in_size == kw_in_size == vw_in_size

        if hidden_size > 0 and hidden_size != qw_in_size:
            logger.debug(
                f"Input hidden size {hidden_size} does not match the input dimension {qw_in_size} of the Q/K/V weight matrices. Provide the correct hidden size or pass 0."
            )
            return None

        is_qkv_diff_dims = False
        if qw.shape != vw.shape:
            is_qkv_diff_dims = True

        # All the matrices can have the same shape, or the Q and K matrices can share a shape while V differs.
        # For 2-D weights, the shape is [in_size, out_size].
        # For 3-D weights, the shape is [in_size, a, b] where a * b = out_size.
        qw_out_size = np.prod(qw.shape[1:])
        kw_out_size = np.prod(kw.shape[1:])
        vw_out_size = np.prod(vw.shape[1:])

        qkv_weight_dim = 0
        if is_qkv_diff_dims:
            qkv_weight = np.concatenate((qw, kw, vw), axis=1)
            qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size
        else:
            qkv_weight = np.stack((qw, kw, vw), axis=1)
            qkv_weight_dim = 3 * qw_out_size

        qb = NumpyHelper.to_array(q_bias)
        kb = NumpyHelper.to_array(k_bias)
        vb = NumpyHelper.to_array(v_bias)

        q_bias_shape = np.prod(qb.shape)
        k_bias_shape = np.prod(kb.shape)
        v_bias_shape = np.prod(vb.shape)

        assert q_bias_shape == k_bias_shape == qw_out_size
        assert v_bias_shape == vw_out_size

        qkv_bias_dim = 0
        if is_qkv_diff_dims:
            qkv_bias = np.concatenate((qb, kb, vb), axis=0)
            qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape
        else:
            qkv_bias = np.stack((qb, kb, vb), axis=0)
            qkv_bias_dim = 3 * q_bias_shape

        attention_node_name = self.model.create_node_name('Attention')

        weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
                                    data_type=TensorProto.FLOAT,
                                    dims=[qw_in_size, qkv_weight_dim],
                                    vals=qkv_weight.flatten().tolist())

        # Sometimes weights and bias are stored in fp16
        if q_weight.data_type == 10:
            weight.CopyFrom(
                numpy_helper.from_array(
                    NumpyHelper.to_array(weight).astype(np.float16),
                    weight.name))
        self.model.add_initializer(weight, self.this_graph_name)

        bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
                                  data_type=TensorProto.FLOAT,
                                  dims=[qkv_bias_dim],
                                  vals=qkv_bias.flatten().tolist())
        if q_bias.data_type == 10:
            bias.CopyFrom(
                numpy_helper.from_array(
                    NumpyHelper.to_array(bias).astype(np.float16), bias.name))
        self.model.add_initializer(bias, self.this_graph_name)

        attention_inputs = [
            input, attention_node_name + '_qkv_weight',
            attention_node_name + '_qkv_bias'
        ]
        if mask_index is not None:
            attention_inputs.append(mask_index)

        attention_node = helper.make_node('Attention',
                                          inputs=attention_inputs,
                                          outputs=[output],
                                          name=attention_node_name)
        attention_node.domain = "com.microsoft"
        attention_node.attribute.extend(
            [helper.make_attribute("num_heads", num_heads)])

        if is_qkv_diff_dims:
            attention_node.attribute.extend([
                helper.make_attribute("qkv_hidden_sizes",
                                      [qw_out_size, kw_out_size, vw_out_size])
            ])

        return attention_node
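
A small check (with hypothetical sizes) of why np.stack((qw, kw, vw), axis=1) followed by flattening yields the packed [in_size, 3 * out_size] layout: for 2-D weights it is row-by-row equivalent to concatenating Q, K and V along the output axis.

import numpy as np

in_size, out_size = 4, 6  # hypothetical sizes
qw = np.random.rand(in_size, out_size).astype(np.float32)
kw = np.random.rand(in_size, out_size).astype(np.float32)
vw = np.random.rand(in_size, out_size).astype(np.float32)

stacked = np.stack((qw, kw, vw), axis=1)         # shape [in_size, 3, out_size]
packed = stacked.reshape(in_size, 3 * out_size)  # row i = [qw[i], kw[i], vw[i]]

assert np.array_equal(packed, np.concatenate((qw, kw, vw), axis=1))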
Example 7
    def create_attention_node(self, mask_index: str, q_matmul: NodeProto,
                              k_matmul: NodeProto, v_matmul: NodeProto,
                              q_add: NodeProto, k_add: NodeProto,
                              v_add: NodeProto, num_heads: int,
                              hidden_size: int, input: str,
                              output: str) -> Union[NodeProto, None]:
        """ Create an Attention node.

        Args:
            mask_index (str): mask input
            q_matmul (NodeProto): MatMul node in the fully connected layer for Q
            k_matmul (NodeProto): MatMul node in the fully connected layer for K
            v_matmul (NodeProto): MatMul node in the fully connected layer for V
            q_add (NodeProto): Add (bias) node in the fully connected layer for Q
            k_add (NodeProto): Add (bias) node in the fully connected layer for K
            v_add (NodeProto): Add (bias) node in the fully connected layer for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            input (str): input name
            output (str): output name

        Returns:
            Union[NodeProto, None]: the created node, or None if creation failed.
        """
        assert num_heads > 0 and hidden_size > 0 and (hidden_size % num_heads) == 0

        q_weight = self.model.get_initializer(q_matmul.input[1])
        k_weight = self.model.get_initializer(k_matmul.input[1])
        v_weight = self.model.get_initializer(v_matmul.input[1])
        q_bias = self.model.get_initializer(
            q_add.input[1]) or self.model.get_initializer(q_add.input[0])
        k_bias = self.model.get_initializer(
            k_add.input[1]) or self.model.get_initializer(k_add.input[0])
        v_bias = self.model.get_initializer(
            v_add.input[1]) or self.model.get_initializer(v_add.input[0])

        if q_weight is None:
            logger.warning(
                f"{q_matmul.input[1]} is not an initializer. Please set do_constant_folding=True in torch.onnx.export"
            )
            return None
        if not (k_weight and v_weight and q_bias and k_bias and v_bias):
            return None
        qw = NumpyHelper.to_array(q_weight)
        kw = NumpyHelper.to_array(k_weight)
        vw = NumpyHelper.to_array(v_weight)

        # Check if all matrices have the same shape
        assert qw.shape == kw.shape == vw.shape

        # All the matrices have the same shape. For 2-D weights, the shape is [in_size, out_size].
        # For 3-D weights, the shape is [in_size, a, b] where a * b = out_size.
        in_size = qw.shape[0]
        out_size = np.prod(qw.shape[1:])

        qkv_weight = np.stack((qw, kw, vw), axis=1)

        qb = NumpyHelper.to_array(q_bias)
        kb = NumpyHelper.to_array(k_bias)
        vb = NumpyHelper.to_array(v_bias)

        # 1-D bias shape: [out_size]. 2-D bias shape: [a, b] where a * b = out_size
        assert qb.shape == kb.shape == vb.shape
        assert np.prod(qb.shape) == out_size

        if out_size != hidden_size:
            logger.debug(
                f"Shape for weights of Q is {in_size, out_size}, which does not match hidden_size={hidden_size}"
            )
            return None

        qkv_bias = np.stack((qb, kb, vb), axis=0)
        attention_node_name = self.model.create_node_name('Attention')

        weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
                                    data_type=TensorProto.FLOAT,
                                    dims=[in_size, 3 * out_size],
                                    vals=qkv_weight.flatten().tolist())

        # Sometimes weights and bias are stored in fp16
        if q_weight.data_type == 10:
            weight.CopyFrom(
                numpy_helper.from_array(
                    NumpyHelper.to_array(weight).astype(np.float16),
                    weight.name))
        self.model.add_initializer(weight)

        bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
                                  data_type=TensorProto.FLOAT,
                                  dims=[3 * out_size],
                                  vals=qkv_bias.flatten().tolist())
        if q_bias.data_type == 10:
            bias.CopyFrom(
                numpy_helper.from_array(
                    NumpyHelper.to_array(bias).astype(np.float16), bias.name))
        self.model.add_initializer(bias)

        attention_inputs = [
            input, attention_node_name + '_qkv_weight',
            attention_node_name + '_qkv_bias'
        ]
        if mask_index is not None:
            attention_inputs.append(mask_index)

        attention_node = helper.make_node('Attention',
                                          inputs=attention_inputs,
                                          outputs=[output],
                                          name=attention_node_name)
        attention_node.domain = "com.microsoft"
        attention_node.attribute.extend(
            [helper.make_attribute("num_heads", num_heads)])

        return attention_node