def fuse_linear_ops(graph: Graph):
    """
    Fuse constant Mul operations into Convolution/Deconvolution/MatMul nodes.

    Two passes are made over the graph. The first walks the pseudo-topological order and looks for
    fusing targets with backward_bfs; the second walks the reversed order and looks for targets
    with forward_bfs, calling _fuse_mul with backward=False. The total number of fused Mul nodes
    is written to the debug log.

    :param graph: graph to transform in place
    """
    target_types = ('Convolution', 'Deconvolution', 'MatMul')

    def is_fusable_mul(candidate):
        # A Mul with a constant input that is explicitly marked as fusable
        return (candidate.soft_get('op') == 'Mul'
                and get_value_in_port(candidate) is not None
                and candidate.has_and_set('can_be_fused'))

    fused_total = 0

    # Fusion in backward direction
    for candidate in graph.pseudo_topological_sort():
        if is_fusable_mul(candidate):
            targets = backward_bfs(candidate, [], list(target_types))
            fused_total += _fuse_mul(graph, candidate, targets)

    # Fusion in forward direction; the order is computed only now, after the first pass
    # has finished modifying the graph
    for candidate in graph.pseudo_topological_sort(reverse=True):
        if is_fusable_mul(candidate):
            targets = forward_bfs(candidate, [], list(target_types))
            fused_total += _fuse_mul(graph, candidate, targets, False)

    log.debug("Fused {} nodes".format(fused_total))
def replace_pattern(self, graph: Graph, match: Dict[str, Node]):
    """
    Fold the constant factor of a Mul ('preop') into the input range of the FakeQuantize
    ('quantize') it feeds, then bypass the Mul for that FakeQuantize.

    The values on FakeQuantize inputs 1 and 2 are divided by the Mul constant and FakeQuantize
    input 0 is reconnected to the tensor input of the Mul. Nothing is changed when the Mul has no
    constant input or when any element of the constant is <= 0.

    :param graph: graph being transformed (not used directly in this method)
    :param match: matched nodes; the 'quantize' and 'preop' entries are read
    """
    quantize = match['quantize']
    preop = match['preop']

    tensor_port, value_port = get_tensor_in_port(preop), get_value_in_port(preop)

    if value_port is None or value_port.data.get_value() is None:
        log.debug('MulQuantizeFuse: cannot fuse because Mul op has dynamic inputs')
        return

    mul_val = value_port.data.get_value()
    if np.any(mul_val <= 0):
        # Non-positive factors are not handled by this transformation
        return

    # Direct modifications to quantize 1-st and 2-nd port inputs are performed, so the data nodes
    # at those inputs must not be shared with other consumers (at most the same data node may feed
    # both port 1 and port 2 of this FakeQuantize). Duplicate FakeQuantize in_port 1, 2 data
    # if needed.
    resolve_shared_inputs(node=quantize, port_ids_to_duplicate=[1, 2])

    # TODO: need some special processing for values that exactly equal to threshold
    quantize.in_port(1).data.set_value(quantize.in_port(1).data.get_value() / mul_val)
    # When ports 1 and 2 share one data node the value has already been updated above
    if quantize.in_node(1).id != quantize.in_node(2).id:
        quantize.in_port(2).data.set_value(quantize.in_port(2).data.get_value() / mul_val)

    # Reconnect Mul as it no longer needed for current FakeQuantize.
    # The tensor input is not necessarily port 0 of the Mul (the constant may be there), so take
    # the index from the detected tensor port; fall back to port 0 when none was identified.
    tensor_idx = tensor_port.idx if tensor_port is not None else 0
    mul_node = quantize.in_port(0).get_source().node
    in_mul_connection = mul_node.in_port(tensor_idx).get_connection()
    quantize.in_port(0).disconnect()
    in_mul_connection.add_destination(quantize.in_port(0))
def replace_pattern(self, graph: Graph, match: Dict[str, Node]):
    """
    Fold the constant term of an Add ('preop') into the input range of the FakeQuantize
    ('quantize') it feeds, then bypass the Add for that FakeQuantize.

    The Add constant is subtracted from the values on FakeQuantize inputs 1 and 2 and FakeQuantize
    input 0 is reconnected to the tensor input of the Add. Nothing is changed when either of the
    first two Add inputs is produced by a Convolution/Deconvolution/MatMul, or when the Add has no
    constant input.

    :param graph: graph being transformed (not used directly in this method)
    :param match: matched nodes; the 'quantize' and 'preop' entries are read
    """
    quantize = match['quantize']
    preop = match['preop']

    for i in [0, 1]:
        if preop.in_port(i).get_source().node.soft_get('type') in ['Convolution', 'Deconvolution', 'MatMul']:
            return

    tensor_port, value_port = get_tensor_in_port(preop), get_value_in_port(preop)

    if value_port is None or value_port.data.get_value() is None:
        log.debug('AddQuantizeFuse: cannot fuse because Add op has dynamic inputs')
        return

    # Direct modifications to quantize 1-st and 2-nd port inputs are performed, so the data nodes
    # at those inputs must not be shared with other consumers (at most the same data node may feed
    # both port 1 and port 2 of this FakeQuantize). Duplicate FakeQuantize in_port 1, 2 data
    # if needed.
    resolve_shared_inputs(node=quantize, port_ids_to_duplicate=[1, 2])

    quantize.in_port(1).data.set_value(quantize.in_port(1).data.get_value() - value_port.data.get_value())
    # When ports 1 and 2 share one data node the value has already been updated above
    if quantize.in_node(1).id != quantize.in_node(2).id:
        quantize.in_port(2).data.set_value(quantize.in_port(2).data.get_value() - value_port.data.get_value())

    # Reconnect Add as it is no longer needed for the current FakeQuantize.
    # The tensor input is not necessarily port 0 of the Add (the constant may be there), so take
    # the index from the detected tensor port; fall back to port 0 when none was identified.
    tensor_idx = tensor_port.idx if tensor_port is not None else 0
    add_node = quantize.in_port(0).get_source().node
    in_add_connection = add_node.in_port(tensor_idx).get_connection()
    quantize.in_port(0).disconnect()
    in_add_connection.add_destination(quantize.in_port(0))
def mark_eltwise_node(self, node, feature_channel=None):
    """
    Reset the 'can_be_fused' and/or 'can_be_scaleshift' flags of an elementwise node whose inputs
    do not satisfy the checks performed here.

    Both flags are reset when the node does not have exactly one tensor and one constant input,
    when the tensor shape differs from the output shape, or when the constant shape (left-padded
    with ones to the tensor rank) is neither all ones nor ones everywhere except the feature
    dimension. Only 'can_be_scaleshift' is reset when both inputs are 0D or when the tensor rank
    is not 2, 4 or 5. Nodes that do not have exactly two connected inputs are left untouched.

    :param node: elementwise node to check
    :param feature_channel: index of the feature dimension; when None it is taken from the graph
        layout (min(1, rank - 1) for 'NCHW', otherwise -1)
    """
    def reset(*flags):
        self.set_flags_to_false(node, list(flags))

    tensor_port = get_tensor_in_port(node)
    value_port = get_value_in_port(node)
    if tensor_port is None or value_port is None:
        reset('can_be_fused', 'can_be_scaleshift')
        return

    connected_cnt = sum(1 for port in node.in_ports().values() if not port.disconnected())
    if connected_cnt != 2:
        return

    tensor_shape = tensor_port.data.get_shape()
    out_shape = node.out_port(0).data.get_shape()
    assert tensor_shape is not None and out_shape is not None
    if not np.array_equal(tensor_shape, out_shape):
        # ScaleShift operation doesn't support broadcasting
        reset('can_be_fused', 'can_be_scaleshift')
        return

    value_shape = value_port.data.get_shape()
    assert value_shape is not None
    tensor_rank = len(tensor_shape)
    value_rank = len(value_shape)
    assert value_rank <= tensor_rank, \
        "No broadcasting was done for elementwise node {} due to previous checks in EltwiseChecker class. " \
        "But constant input rank is larger than tensor input rank, that is inconsistent".format(node.name)

    # if both tensors are 0D they cannot be converted to scaleshift
    if tensor_rank == 0 and value_rank == 0:
        reset('can_be_scaleshift')
        return

    # Left-pad the constant shape with ones so that it has the same rank as the tensor
    padded_value_shape = shape_insert(value_shape, 0, [1] * (tensor_rank - value_rank))

    layout_is_nchw = node.graph.graph['layout'] == 'NCHW'
    layout_feature_dim = min(1, tensor_shape.size - 1) if layout_is_nchw else -1
    feature_dim = layout_feature_dim if feature_channel is None else feature_channel

    all_ones = np.ones(tensor_rank)
    per_channel_shape = all_ones.copy()
    np.put(per_channel_shape, feature_dim, tensor_shape.item(feature_dim))

    if not (np.array_equal(padded_value_shape, all_ones)
            or np.array_equal(padded_value_shape, per_channel_shape)):
        # ScaleShift weights should have [1,C,1,1]-like or [1,1,1,1]-like shape
        reset('can_be_fused', 'can_be_scaleshift')
        return

    if tensor_rank not in (2, 4, 5):
        # ScaleShift operation is supported for 2D, 4D or 5D tensor inputs
        reset('can_be_scaleshift')
def fuse_mul_add_sequence(graph: Graph):
    """
    Repeatedly scan the graph for fusable Mul/Add nodes with a constant input and hand each one to
    _fuse_linear_sequence, which looks for the full sequence starting at that node.

    The scan is restarted from a fresh pseudo-topological order until a complete pass fuses nothing.

    :param graph: graph to transform in place
    """
    fused_in_pass = True
    while fused_in_pass:
        fused_in_pass = False
        for node in graph.pseudo_topological_sort():
            # Nodes removed by an earlier fusion in this pass are still in the sorted list
            if node.id not in graph:
                continue
            if node.soft_get('op') not in ['Mul', 'Add']:
                continue
            if get_value_in_port(node) is None:
                continue
            if node.soft_get('can_be_fused') is not True:
                continue
            if _fuse_linear_sequence(graph, node):
                fused_in_pass = True
def mark_fusable_muls_on_weights(graph):
    """
    Mark Mul nodes that feed exactly one Convolution/Deconvolution/MatMul as fusable.

    A Mul is marked (node['can_be_fused'] = True) only when it has a constant input whose shape has
    exactly one dimension different from 1. That dimension index is then passed to
    EltwiseChecker().mark_eltwise_node as the feature channel.

    Mul nodes with no consumers, or with more than one consumer, are skipped.

    :param graph: graph whose Mul nodes are inspected
    """
    fusable_consumer_types = ('Convolution', 'Deconvolution', 'MatMul')

    for node in graph.get_op_nodes(op='Mul'):
        children = node.out_port(0).get_destinations()
        # Exactly one consumer is required; checking "!= 1" also protects the children[0] access
        # below when the Mul output has no consumers at all.
        if len(children) != 1 or children[0].node.soft_get('type') not in fusable_consumer_types:
            continue

        value_in_port = get_value_in_port(node)
        if value_in_port is None:
            continue

        value_shape = value_in_port.data.get_shape()
        non_one_axis = np.argwhere(value_shape != 1)
        if non_one_axis.size != 1:
            continue

        non_one_axis = non_one_axis.item(0)
        node['can_be_fused'] = True
        EltwiseChecker().mark_eltwise_node(node, non_one_axis)
def _fuse_linear_sequence(graph: Graph, start_node: Node):
    """
    Find the chain of fusable constant Mul/Add operations that starts at start_node and replace it
    with at most two operations (Mul -> Add).

    The chain is followed while the last node has exactly one consumer and that consumer is a
    Mul/Add with a constant input and can_be_fused set to True. Chains of length one, and chains
    that already are exactly Mul -> Add, are left untouched.

    :param graph: graph that owns the nodes
    :param start_node: the first operation of the sequence
    :return: True when the chain was replaced, False when nothing was changed
    """
    def is_fusable(candidate):
        return (candidate.soft_get('op') in ['Mul', 'Add']
                and get_value_in_port(candidate) is not None
                and candidate.soft_get('can_be_fused') is True)

    # Collect the chain
    chain = [start_node]
    while True:
        consumers = chain[-1].out_port(0).get_destinations()
        if len(consumers) != 1:
            break
        successor = consumers[0].node
        if not is_fusable(successor):
            break
        chain.append(successor)

    chain_len = len(chain)
    if chain_len == 1:
        return False
    if chain_len == 2 and chain[0].op == 'Mul' and chain[1].op == 'Add':
        return False

    input_shape = get_tensor_in_port(start_node).data.get_shape()
    if graph.graph['layout'] == 'NCHW':
        init_dims_cnt = len(input_shape) - 2
    else:
        init_dims_cnt = 1

    # The accumulators take the dtype of the first constant in the chain
    first_value = get_value_in_port(chain[0]).data.get_value()
    if not isinstance(first_value, np.ndarray):
        first_value = mo_array(first_value)
    acc_dtype = first_value.dtype

    unit_shape = [1] * init_dims_cnt
    mul = np.ones(unit_shape, dtype=acc_dtype)
    add = np.zeros(unit_shape, dtype=acc_dtype)

    first_mul_name = None
    first_add_name = None

    # Accumulate the chain into a single "x * mul + add"
    for node in chain:
        const_value = get_value_in_port(node).data.get_value()
        if node.op == 'Mul':
            if first_mul_name is None:
                first_mul_name = node.name
            mul = mul * const_value
            add = add * const_value
        elif node.op == 'Add':
            if first_add_name is None:
                first_add_name = node.name
            add = add + const_value

    # If mul is scalar we broadcast it to biases shape
    if mul.shape != add.shape and len(mul.shape) == 1 and mul.shape[0] == 1:
        mul = mo_array([mul[0]] * add.shape[0])

    assert (compatible_shapes(get_tensor_in_port(chain[0]).data.get_shape(),
                              chain[-1].out_port(0).data.get_shape()))

    mul_op = Mul(graph, dict(name='{}/Fused_Mul_'.format(first_mul_name or '')))
    add_op = Add(graph, dict(name='{}/Fused_Add_'.format(first_add_name or '')))

    in_port = get_tensor_in_port(chain[0])
    out_port = chain[-1].out_port(0)

    # Four cases considered below:
    #     1. Mul and Add have valid values (mul value != 1 and add value != 0)
    #     2. Only Mul has valid values, so we add only Mul node
    #     3. Only Add has valid values, so we add only Add node
    #     4. When Mul and Add has not valid values we just merge two data nodes
    add_is_needed = any(x != 0 for x in np.nditer(add))
    mul_is_needed = any(x != 1 for x in np.nditer(mul))

    if add_is_needed and mul_is_needed:
        #  Const\    Const\
        #  ----->Mul------>Add-->
        mul_const = Const(graph, dict(name="data_mul_", value=mo_array(mul))).create_node()
        add_const = Const(graph, dict(name="data_add_", value=mo_array(add))).create_node()

        mul_node = mul_op.create_node()
        add_node = add_op.create_node()

        in_port.get_connection().set_destination(mul_node.in_port(0))
        mul_const.out_port(0).connect(mul_node.in_port(1))

        mul_node.out_port(0).connect(add_node.in_port(0))
        add_const.out_port(0).connect(add_node.in_port(1))
        out_port.get_connection().set_source(add_node.out_port(0))
    elif mul_is_needed:
        #  Const\
        #  ----->Mul-->
        mul_const = Const(graph, dict(name="data_mul_", value=mo_array(mul))).create_node()
        mul_node = mul_op.create_node()

        in_port.get_connection().set_destination(mul_node.in_port(0))
        mul_const.out_port(0).connect(mul_node.in_port(1))
        out_port.get_connection().set_source(mul_node.out_port(0))
    elif add_is_needed:
        #  Const\
        #  ----->Add-->
        add_const = Const(graph, dict(name="data_add_", value=mo_array(add))).create_node()
        add_node = add_op.create_node()

        in_port.get_connection().set_destination(add_node.in_port(0))
        add_const.out_port(0).connect(add_node.in_port(1))
        out_port.get_connection().set_source(add_node.out_port(0))
    else:
        # The whole chain is an identity: connect the producer directly to the consumers
        source_node = in_port.get_source()
        in_port.disconnect()
        out_port.get_connection().set_source(source_node)

    # Remove fused nodes
    for node in chain:
        graph.remove_node(node.id)

    log.debug('Fused {} operations'.format(len(chain)))
    return True
def _fuse_mul(graph: Graph, node: Node, fuse_nodes: list, backward: bool = True):
    """
    Fuse the constant of a Mul node into the weights (and, for backward fusion, the biases) of the
    given convolution/FC nodes, then remove the Mul from the data path.

    All fuse nodes are validated first; if any of them is unsuitable nothing is modified and False
    is returned. For every fuse node a copy of the Mul is then inserted between the weights
    producer and the weights input, multiplying by the constant reshaped along the channel
    dimension.

    :param graph: graph that owns the nodes
    :param node: the Mul node to fuse
    :param fuse_nodes: convolution/FC nodes whose weights receive the multiplier
    :param backward: True means the Mul goes after the convolution/FC nodes (weights are scaled
        along 'output_channel_dim' and biases are scaled too); False means the convolution/FC
        nodes go after the Mul (weights are scaled along 'input_channel_dim')
    :return: True if at least one node was fused, False otherwise
    """
    is_fused = False
    const_port, tensor_port = get_value_in_port(node), get_tensor_in_port(node)

    if const_port is None or tensor_port is None:
        log.warning('Cannot do fuse_mul for node {} because this node has wrong inputs'.format(node.id))
        return False

    # Validation pass: nothing in the graph is modified here
    for fuse_node in fuse_nodes:
        if fuse_node.soft_get('can_be_fused') is False:
            log.warning('Node {} can\'t be used in fusing because attr can_be_fused = False'.format(fuse_node.name))
            return False

        if len(fuse_node.in_ports()) < 2:
            log.warning('Node {} has no weights node'.format(fuse_node.name))
            return False

        if not backward and not fuse_node.has_valid('layout'):
            log.warning('Node {} has no layout attr'.format(fuse_node.name))
            return False

        weights_port = fuse_node.in_port(1)
        if not weights_port.data.has_valid('output_channel_dim') or \
                not weights_port.data.has_valid('input_channel_dim'):
            # Adjacent literals are joined before .format is applied, so the node name placeholder
            # in the first part of the message is filled in.
            log.warning(
                'Cannot do fuse_mul for node {} because there is no field '
                'output_channel_dim and/or input_channel_dim in weights.'.format(fuse_node.soft_get('name')))
            return False

        inp_ch = weights_port.data.get_attr('input_channel_dim')
        out_ch = weights_port.data.get_attr('output_channel_dim')
        if max(inp_ch, out_ch) >= len(weights_port.data.get_shape()):
            log.warning('Node {} has wrong weights shape'.format(fuse_node.name))
            return False

    for fuse_node in fuse_nodes:
        weights_port = fuse_node.in_port(1)
        value = mo_array(const_port.data.get_value())

        value = np.squeeze(value)

        # TODO : ch_dim should be equal to node.in_node(1).value.shape
        # We will multiply weights according output/input channel dimension
        ch_dim = weights_port.data.get_attr('output_channel_dim' if backward else 'input_channel_dim')
        shape = mo_array([weights_port.data.get_shape()[ch_dim]])

        # Scalar broadcast
        if value.size == 1:
            value = np.full(shape, value.item(), dtype=value.dtype)

        # Common broadcast for forward fusion
        if not backward:
            # Integer repeat count: np.repeat does not accept a float count, and the tile branch
            # truncated the count to int anyway.
            cnt = shape[-1] // value.shape[0]
            if fuse_node.layout == 'NCHW':
                # Each element is repeated cnt times in place: [a, b] -> [a, a, b, b]
                value = mo_array(np.repeat(value, cnt))
            else:
                # The whole vector is repeated cnt times: [a, b] -> [a, b, a, b]
                value = np.tile(value, int(cnt))

        # Expand dims for multiplication (ex. [38] to [38, 1, 1])
        wdims_number = weights_port.data.get_attr('dims_number')
        for _ in range(wdims_number - ch_dim - 1):
            shape = np.append(shape, 1)

        # Keep the un-reshaped multiplier for the biases
        mul_val = mo_array(value)

        # If the value fails to reshape to the provided shape, skip fusing.
        # This can happen in case of group != 1 of the convolution.
        # NOTE(review): if an earlier fuse_node in this loop has already been rewired, returning
        # here leaves that change in place while the Mul itself is not removed - confirm this is
        # acceptable for callers.
        try:
            value = np.reshape(value, shape)
        except ValueError:
            log.error("Cannot fuse const from {} to {}. Reshape failed. Skipping.".format(
                node.soft_get('name', node.id), fuse_node.soft_get('name', fuse_node.id)),
                extra={'is_warning': True})
            return False

        # Weights multiplication
        mul_name = node.name + '_copy'
        mul_const = Const(graph, {'value': value, 'name': mul_name + '/const'}).create_node()
        w_mul = node.copy_node({'name': mul_name, 'in_ports_count': len(node.in_ports()),
                                'out_ports_count': len(node.out_ports()), 'can_be_fused': False})
        w_mul.in_port(const_port.idx).connect(mul_const.out_port(0))
        w_const = weights_port.get_source()
        weights_port.get_connection().set_source(w_mul.out_port(0))
        w_const.connect(w_mul.in_port(tensor_port.idx))

        fuse_node_in_data = fuse_node.in_node(weights_port.idx)
        w_const_out_data = w_const.node.out_node(w_const.idx)

        # During this reconnection new data node name is copied from the data node
        # outgoing from w_const port. Duplicate names of data nodes lead to appearing
        # of duplicate op node names after constant folding. So we should manually
        # set a unique name for the new data node.
        if fuse_node_in_data.soft_get('name') == w_const_out_data.soft_get('name') and \
                fuse_node_in_data.soft_get('name', None) is not None:
            fuse_node.in_node(weights_port.idx)['name'] = graph.unique_id(mul_name)

        # If we fuse in backward direction we should multiply biases if they exists
        if backward and len(fuse_node.in_ports()) == 3 and not fuse_node.in_port(2).disconnected() and \
                not fuse_node.has_and_set('shape_input'):
            conv_bias = fuse_node.in_port(2)
            conv_bias.data.set_value(conv_bias.data.get_value() * np.squeeze(mul_val))

        mul_const.infer(mul_const)
        w_mul.infer(w_mul)

        log.debug('Fused: {} to {}'.format(node.name, fuse_node.name))
        is_fused = True

    if is_fused:
        # Delete Mul node
        producer_port = tensor_port.get_source()
        tensor_port.disconnect()
        const_port.disconnect()
        # as Mul node is added before convolution, output tensor from Convolution node
        # corresponds to original Mul node
        node.out_port(0).get_connection().set_source(producer_port, "dest")

    return is_fused