def create_attention_node(self, mask_index: str, matmul: NodeProto, add: NodeProto, num_heads: int,
                          hidden_size: int, input: str, output: str,
                          add_qk_str: str) -> Union[NodeProto, None]:
    assert num_heads > 0
    if hidden_size > 0 and (hidden_size % num_heads) != 0:
        logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
        return None

    weight = self.model.get_initializer(matmul.input[1])
    bias = self.model.get_initializer(add.input[1]) or self.model.get_initializer(add.input[0])

    if weight is None or bias is None:
        return None

    qkv_weight = NumpyHelper.to_array(weight)
    qkv_bias = NumpyHelper.to_array(bias)

    # Sometimes weights and bias are stored in fp16. Remember the original element type
    # before the variables are rebound to the new fused initializers below.
    weight_is_fp16 = weight.data_type == 10
    bias_is_fp16 = bias.data_type == 10

    attention_node_name = self.model.create_node_name('Attention')

    weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
                                data_type=TensorProto.FLOAT,
                                dims=[hidden_size, 3 * hidden_size],
                                vals=qkv_weight.flatten().tolist())
    if weight_is_fp16:
        weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name))
    self.model.add_initializer(weight, self.this_graph_name)

    bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
                              data_type=TensorProto.FLOAT,
                              dims=[3 * hidden_size],
                              vals=qkv_bias.flatten().tolist())
    if bias_is_fp16:
        bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name))
    self.model.add_initializer(bias, self.this_graph_name)

    attention_inputs = [input, attention_node_name + '_qkv_weight', attention_node_name + '_qkv_bias']
    if mask_index is not None:
        attention_inputs.append(mask_index)
    else:
        attention_inputs.append("")

    if add_qk_str is not None:
        attention_inputs.append("")  # empty name keeps the optional past input slot unused
        attention_inputs.append(add_qk_str)

    attention_node = helper.make_node('Attention',
                                      inputs=attention_inputs,
                                      outputs=[output],
                                      name=attention_node_name)
    attention_node.domain = "com.microsoft"
    attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])

    return attention_node
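
# --- Illustrative sketch, not part of the fusion pass ---
# A minimal, standalone example of the input layout produced above when add_qk_str is set:
# the empty string keeps the optional past-state slot unused so the additive Q*K bias
# lands in the following input position. All tensor names below are hypothetical.
from onnx import helper

example_attention_with_add_qk = helper.make_node(
    'Attention',
    inputs=['layer_input', 'attn_qkv_weight', 'attn_qkv_bias', 'mask_index', '', 'add_qk'],
    outputs=['attn_out'],
    name='Attention_with_add_qk_example')
example_attention_with_add_qk.domain = "com.microsoft"
example_attention_with_add_qk.attribute.extend([helper.make_attribute("num_heads", 12)])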
def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
    """ Detect num_heads and hidden_size from a reshape node.

    Args:
        reshape_q (NodeProto): reshape node for Q

    Returns:
        Tuple[int, int]: num_heads and hidden_size
    """

    # We assume that reshape fusion has been done, so the shape is a tensor like [0, 0, num_heads, head_size].
    q_shape = self.model.get_initializer(reshape_q.input[1])
    if q_shape is None:
        logger.debug(f"{reshape_q.input[1]} is not initializer.")
        return self.num_heads, self.hidden_size  # Fall back to user-specified value

    q_shape_value = NumpyHelper.to_array(q_shape)
    if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
        logger.debug(f"q_shape_value={q_shape_value}. Expected values are like [0, 0, num_heads, head_size].")
        return self.num_heads, self.hidden_size  # Fall back to user-specified value

    num_heads = q_shape_value[2]
    head_size = q_shape_value[3]
    hidden_size = num_heads * head_size

    if self.num_heads > 0 and num_heads != self.num_heads:
        logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")

    if self.hidden_size > 0 and hidden_size != self.hidden_size:
        logger.warning(f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value.")

    return num_heads, hidden_size
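
# --- Illustrative sketch, not part of the fusion pass ---
# How a fused Reshape shape encodes the head layout: [0, 0, num_heads, head_size],
# where 0 means "copy the corresponding input dimension". The numbers are hypothetical
# (BERT-base-like) and only demonstrate the arithmetic used above.
import numpy as np

example_q_shape = np.array([0, 0, 12, 64], dtype=np.int64)
example_num_heads = int(example_q_shape[2])
example_head_size = int(example_q_shape[3])
assert example_num_heads * example_head_size == 768  # detected hidden_size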
def fill_zeros_for_external_data(tensor: TensorProto):
    if tensor.HasField("raw_data"):  # already loaded
        return

    value = NumpyHelper.to_array(tensor, fill_zeros=True)
    zero_tensor = numpy_helper.from_array(value, name=tensor.name)
    tensor.raw_data = zero_tensor.raw_data
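
# --- Illustrative sketch, not part of the fusion pass ---
# Hypothetical usage of fill_zeros_for_external_data: a tensor whose payload lives in an
# external data file (not available here) carries only metadata, so it gets a zero-filled
# raw_data of the matching shape and type, letting shape-dependent logic proceed.
from onnx import TensorProto

example_tensor = TensorProto()
example_tensor.name = "external_weight"  # hypothetical initializer name
example_tensor.data_type = TensorProto.FLOAT
example_tensor.dims.extend([4, 4])
example_tensor.data_location = TensorProto.EXTERNAL

fill_zeros_for_external_data(example_tensor)
assert example_tensor.HasField("raw_data")  # now holds 4 * 4 float32 zeros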
def fuse(self, node, input_name_to_nodes, output_name_to_node):
    gelu_op_type = node.op_type
    fuse_op_type = "BiasGelu" if gelu_op_type == "Gelu" else "FastGelu"

    if len(node.input) != 1:
        return

    nodes = self.model.match_parent_path(node, ["Add", "MatMul"], [0, None])
    if nodes is None:
        return
    (add, matmul) = nodes

    bias_weight = None
    # bias should be one dimension
    bias_index = -1
    for i, input in enumerate(add.input):
        initializer = self.model.get_initializer(input)
        if initializer is None:
            continue
        bias_index = i
        bias_weight = NumpyHelper.to_array(initializer)
        break
    if bias_weight is None:
        return
    if len(bias_weight.shape) != 1:
        return

    subgraph_nodes = [node, add]
    if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes,
                                            output_name_to_node):
        return

    self.nodes_to_remove.extend(subgraph_nodes)

    fused_node = helper.make_node(
        fuse_op_type,
        inputs=[matmul.output[0], add.input[bias_index]],
        outputs=node.output,
        name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"),
    )
    fused_node.domain = "com.microsoft"
    self.nodes_to_add.append(fused_node)
    self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
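
# --- Illustrative sketch, not part of the fusion pass ---
# The node this fusion emits, built standalone with hypothetical tensor names:
# the Gelu/FastGelu now reads the MatMul output directly and the 1-D bias initializer
# becomes the second input of the contrib BiasGelu op.
from onnx import helper

example_bias_gelu = helper.make_node(
    "BiasGelu",
    inputs=["matmul_out", "dense_bias"],
    outputs=["gelu_out"],
    name="Gelu_AddBias_example")
example_bias_gelu.domain = "com.microsoft"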
def fuse(self, node, input_name_to_nodes, output_name_to_node):
    if len(node.input) != 4:
        return

    return_indice = []
    nodes = self.model.match_parent_path(node, ['Add', 'MatMul'], [None, None], None, return_indice)
    if nodes is None:
        return
    assert len(return_indice) == 2
    add_input_index = return_indice[0]
    if add_input_index >= 2:
        return

    (add, matmul) = nodes

    bias_weight = None
    # bias should be one dimension
    bias_index = -1
    for i, input in enumerate(add.input):
        initializer = self.model.get_initializer(input)
        if initializer is None:
            continue
        bias_index = i
        bias_weight = NumpyHelper.to_array(initializer)
        break
    if bias_weight is None:
        logger.debug("Bias weight not found")
        return
    if len(bias_weight.shape) != 1:
        logger.debug("Bias weight is not 1D")
        return

    subgraph_nodes = [node, add]
    if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes,
                                            output_name_to_node):
        logger.debug("Skip fusing SkipLayerNormalization with Bias since it is not safe")
        return

    self.nodes_to_remove.extend(subgraph_nodes)
    inputs = [
        node.input[1 - add_input_index], matmul.output[0], node.input[2], node.input[3], add.input[bias_index]
    ]
    new_node = helper.make_node("SkipLayerNormalization",
                                inputs=inputs,
                                outputs=node.output,
                                name=self.model.create_node_name("SkipLayerNormalization", "SkipLayerNorm_AddBias_"))
    new_node.domain = "com.microsoft"

    # Pass attribute "epsilon" from the original SkipLayerNormalization node to the fused node.
    for att in node.attribute:
        if att.name == 'epsilon':
            new_node.attribute.extend([att])

    # Set a default epsilon if the original node has none.
    if len(new_node.attribute) == 0:
        new_node.attribute.extend([helper.make_attribute("epsilon", 1.0E-12)])

    self.nodes_to_add.append(new_node)
    self.node_name_to_graph_name[new_node.name] = self.this_graph_name
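
# --- Illustrative sketch, not part of the fusion pass ---
# The rewired SkipLayerNormalization after bias fusion, with hypothetical tensor names:
# input 0 is the residual (the Add input that does not come from the MatMul), input 1 is
# the MatMul output, inputs 2-3 are gamma/beta, and the dense bias is appended last.
from onnx import helper

example_skip_ln = helper.make_node(
    "SkipLayerNormalization",
    inputs=["hidden_states", "matmul_out", "ln_gamma", "ln_beta", "dense_bias"],
    outputs=["layernorm_out"],
    name="SkipLayerNorm_AddBias_example")
example_skip_ln.domain = "com.microsoft"
example_skip_ln.attribute.extend([helper.make_attribute("epsilon", 1.0E-12)])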
def create_attention_node(self, mask_index: str, q_matmul: NodeProto, k_matmul: NodeProto, v_matmul: NodeProto,
                          q_add: NodeProto, k_add: NodeProto, v_add: NodeProto, num_heads: int, hidden_size: int,
                          input: str, output: str) -> Union[NodeProto, None]:
    """ Create an Attention node.

    Args:
        mask_index (str): mask input
        q_matmul (NodeProto): MatMul node in fully connection for Q
        k_matmul (NodeProto): MatMul node in fully connection for K
        v_matmul (NodeProto): MatMul node in fully connection for V
        q_add (NodeProto): Add bias node in fully connection for Q
        k_add (NodeProto): Add bias node in fully connection for K
        v_add (NodeProto): Add bias node in fully connection for V
        num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
        hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
        input (str): input name
        output (str): output name

    Returns:
        Union[NodeProto, None]: the node created or None if failed.
    """
    assert num_heads > 0

    if hidden_size > 0 and (hidden_size % num_heads) != 0:
        logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
        return None

    q_weight = self.model.get_initializer(q_matmul.input[1])
    k_weight = self.model.get_initializer(k_matmul.input[1])
    v_weight = self.model.get_initializer(v_matmul.input[1])
    q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
    k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
    v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])

    if q_weight is None:
        print(f"{q_matmul.input[1]} is not initializer. Please set do_constant_folding=True in torch.onnx.export")
        return None
    if not (k_weight and v_weight and q_bias and k_bias and v_bias):
        return None

    qw = NumpyHelper.to_array(q_weight)
    kw = NumpyHelper.to_array(k_weight)
    vw = NumpyHelper.to_array(v_weight)

    # assert q and k have the same shape as expected
    assert qw.shape == kw.shape

    qw_in_size = qw.shape[0]
    kw_in_size = kw.shape[0]
    vw_in_size = vw.shape[0]

    assert qw_in_size == kw_in_size == vw_in_size

    if hidden_size > 0 and hidden_size != qw_in_size:
        logger.debug(
            f"Input hidden size {hidden_size} is not same as weight matrix dimension of q,k,v paths {qw_in_size}, "
            "provide correct input hidden size or pass 0")
        return None

    is_qkv_diff_dims = False
    if qw.shape != vw.shape:
        is_qkv_diff_dims = True

    # All the matrices can have the same shape, or the q and k matrices can have the same shape with v being different.
    # For 2d weights, the shapes would be [in_size, out_size].
    # For 3d weights, the shape would be [in_size, a, b] where a*b = out_size.
    qw_out_size = np.prod(qw.shape[1:])
    kw_out_size = np.prod(kw.shape[1:])
    vw_out_size = np.prod(vw.shape[1:])

    qkv_weight_dim = 0
    if is_qkv_diff_dims:
        qkv_weight = np.concatenate((qw, kw, vw), axis=1)
        qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size
    else:
        qkv_weight = np.stack((qw, kw, vw), axis=1)
        qkv_weight_dim = 3 * qw_out_size

    qb = NumpyHelper.to_array(q_bias)
    kb = NumpyHelper.to_array(k_bias)
    vb = NumpyHelper.to_array(v_bias)

    q_bias_shape = np.prod(qb.shape)
    k_bias_shape = np.prod(kb.shape)
    v_bias_shape = np.prod(vb.shape)

    assert q_bias_shape == k_bias_shape == qw_out_size
    assert v_bias_shape == vw_out_size

    qkv_bias_dim = 0
    if is_qkv_diff_dims:
        qkv_bias = np.concatenate((qb, kb, vb), axis=0)
        qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape
    else:
        qkv_bias = np.stack((qb, kb, vb), axis=0)
        qkv_bias_dim = 3 * q_bias_shape

    attention_node_name = self.model.create_node_name('Attention')

    weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
                                data_type=TensorProto.FLOAT,
                                dims=[qw_in_size, qkv_weight_dim],
                                vals=qkv_weight.flatten().tolist())

    # Sometimes weights and bias are stored in fp16
    if q_weight.data_type == 10:
        weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name))
    self.model.add_initializer(weight, self.this_graph_name)

    bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
                              data_type=TensorProto.FLOAT,
                              dims=[qkv_bias_dim],
                              vals=qkv_bias.flatten().tolist())
    if q_bias.data_type == 10:
        bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name))
    self.model.add_initializer(bias, self.this_graph_name)

    attention_inputs = [
        input, attention_node_name + '_qkv_weight', attention_node_name + '_qkv_bias'
    ]
    if mask_index is not None:
        attention_inputs.append(mask_index)

    attention_node = helper.make_node('Attention',
                                      inputs=attention_inputs,
                                      outputs=[output],
                                      name=attention_node_name)
    attention_node.domain = "com.microsoft"
    attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])

    if is_qkv_diff_dims:
        attention_node.attribute.extend(
            [helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])])

    return attention_node
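
# --- Illustrative sketch, not part of the fusion pass ---
# Shape check for the packed QKV weight in the same-shape case, with hypothetical sizes:
# np.stack along axis=1 interleaves the Q, K and V rows, so flattening to
# [in_size, 3 * out_size] puts [q_row, k_row, v_row] side by side for every input row,
# which is the layout stored in the fused weight initializer above.
import numpy as np

demo_in_size, demo_out_size = 8, 8
demo_qw = np.arange(demo_in_size * demo_out_size, dtype=np.float32).reshape(demo_in_size, demo_out_size)
demo_kw = demo_qw + 100.0
demo_vw = demo_qw + 200.0

demo_packed = np.stack((demo_qw, demo_kw, demo_vw), axis=1).reshape(demo_in_size, 3 * demo_out_size)
assert np.array_equal(demo_packed[0, :demo_out_size], demo_qw[0])
assert np.array_equal(demo_packed[0, demo_out_size:2 * demo_out_size], demo_kw[0])
assert np.array_equal(demo_packed[0, 2 * demo_out_size:], demo_vw[0])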
def create_attention_node(self, mask_index: str, q_matmul: NodeProto, k_matmul: NodeProto, v_matmul: NodeProto,
                          q_add: NodeProto, k_add: NodeProto, v_add: NodeProto, num_heads: int, hidden_size: int,
                          input: str, output: str) -> Union[NodeProto, None]:
    """ Create an Attention node.

    Args:
        mask_index (str): mask input
        q_matmul (NodeProto): MatMul node in fully connection for Q
        k_matmul (NodeProto): MatMul node in fully connection for K
        v_matmul (NodeProto): MatMul node in fully connection for V
        q_add (NodeProto): Add bias node in fully connection for Q
        k_add (NodeProto): Add bias node in fully connection for K
        v_add (NodeProto): Add bias node in fully connection for V
        num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
        hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
        input (str): input name
        output (str): output name

    Returns:
        Union[NodeProto, None]: the node created or None if failed.
    """
    assert num_heads > 0 and hidden_size > 0 and (hidden_size % num_heads) == 0

    q_weight = self.model.get_initializer(q_matmul.input[1])
    k_weight = self.model.get_initializer(k_matmul.input[1])
    v_weight = self.model.get_initializer(v_matmul.input[1])
    q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
    k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
    v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])

    if q_weight is None:
        print(f"{q_matmul.input[1]} is not initializer. Please set do_constant_folding=True in torch.onnx.export")
        return None
    if not (k_weight and v_weight and q_bias and k_bias and v_bias):
        return None

    qw = NumpyHelper.to_array(q_weight)
    kw = NumpyHelper.to_array(k_weight)
    vw = NumpyHelper.to_array(v_weight)

    # Check if all matrices have the same shape
    assert qw.shape == kw.shape == vw.shape

    # All the matrices have the same shape. For 2d weights, the shapes would be [in_size, out_size].
    # For 3d weights, the shape would be [in_size, a, b] where a*b = out_size.
    in_size = qw.shape[0]
    out_size = np.prod(qw.shape[1:])

    qkv_weight = np.stack((qw, kw, vw), axis=1)

    qb = NumpyHelper.to_array(q_bias)
    kb = NumpyHelper.to_array(k_bias)
    vb = NumpyHelper.to_array(v_bias)

    # 1d bias shape: [out_size]. 2d bias shape: [a, b] where a*b = out_size.
    assert qb.shape == kb.shape == vb.shape
    assert np.prod(qb.shape) == out_size

    if out_size != hidden_size:
        logger.debug(f"Shape for weights of Q is {in_size, out_size}, which does not match hidden_size={hidden_size}")
        return None

    qkv_bias = np.stack((qb, kb, vb), axis=0)

    attention_node_name = self.model.create_node_name('Attention')

    weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
                                data_type=TensorProto.FLOAT,
                                dims=[in_size, 3 * out_size],
                                vals=qkv_weight.flatten().tolist())

    # Sometimes weights and bias are stored in fp16
    if q_weight.data_type == 10:
        weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name))
    self.model.add_initializer(weight)

    bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
                              data_type=TensorProto.FLOAT,
                              dims=[3 * out_size],
                              vals=qkv_bias.flatten().tolist())
    if q_bias.data_type == 10:
        bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name))
    self.model.add_initializer(bias)

    attention_inputs = [
        input, attention_node_name + '_qkv_weight', attention_node_name + '_qkv_bias'
    ]
    if mask_index is not None:
        attention_inputs.append(mask_index)

    attention_node = helper.make_node('Attention',
                                      inputs=attention_inputs,
                                      outputs=[output],
                                      name=attention_node_name)
    attention_node.domain = "com.microsoft"
    attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])

    return attention_node
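
# --- Illustrative sketch, not part of the fusion pass ---
# The fp16 branch above in miniature, with hypothetical names: a tensor first built as
# float32 via helper.make_tensor is converted back to float16 when the source weights
# were stored as float16 (TensorProto data_type 10).
import numpy as np
from onnx import TensorProto, helper, numpy_helper

demo_values = np.ones((2, 2), dtype=np.float32)
demo_weight = helper.make_tensor(name='demo_qkv_weight',
                                 data_type=TensorProto.FLOAT,
                                 dims=[2, 2],
                                 vals=demo_values.flatten().tolist())
source_was_fp16 = True  # stands in for q_weight.data_type == TensorProto.FLOAT16
if source_was_fp16:
    demo_weight.CopyFrom(
        numpy_helper.from_array(numpy_helper.to_array(demo_weight).astype(np.float16), demo_weight.name))
assert demo_weight.data_type == TensorProto.FLOAT16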