def _min(sdfg: SDFG, state: SDFGState, a: str, axis=None):
    """ Emits a minimum-reduction over the array named ``a``.

        Delegates to ``_reduce`` with a ``min`` lambda as the reduction
        function. The reduction identity is the maximum value of the
        array's dtype, as given by ``dtypes.max_value``.

        :param sdfg: SDFG whose ``arrays`` mapping contains ``a``.
        :param state: State forwarded unchanged to ``_reduce``.
        :param a: Name of the array to reduce.
        :param axis: Axis argument forwarded to ``_reduce`` (default None).
        :return: Whatever ``_reduce`` returns.
    """
    # Neutral element for min: the largest value of the element type.
    largest = dtypes.max_value(sdfg.arrays[a].dtype)
    return _reduce(sdfg,
                   state,
                   "lambda x, y: min(x, y)",
                   a,
                   axis=axis,
                   identity=largest)
def expand(self, sdfg, graph, reduce_node):
    """ Splits the data dimension into an inner and outer dimension,
        where the inner dimension are the reduction axes and the
        outer axes the complement. Pushes the reduce inside a new
        map consisting of the complement axes.

        Side effects visible to the caller: ``reduce_node.identity`` is
        reset to None, ``self.create_out_transient`` may be forced to True,
        and the new reduce node / outer map entry are stored in
        ``self._new_reduce`` and ``self._outer_entry``.

        :param sdfg: SDFG that contains ``graph``.
        :param graph: State in which ``reduce_node`` resides.
        :param reduce_node: Reduce node to expand.
        :raises NotImplementedError: If the node's implementation contains
                                     'warp', or if an update tasklet is
                                     needed for a reduction type without
                                     an entry in ``reduction_type_update``.
        :raises ValueError: If an identity is required but cannot be
                            inferred from the reduction type.
    """
    from collections import deque

    out_storage_node = graph.out_edges(reduce_node)[0].dst
    wcr = reduce_node.wcr
    identity = reduce_node.identity
    implementation = reduce_node.implementation
    if implementation and 'warp' in implementation:
        raise NotImplementedError(
            "WIP: Warp Reductions are not Implemented yet.")

    # remove the reduce identity
    # we will reassign it later after expanding
    reduce_node.identity = None
    # expand the reduce node
    nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
    # find the new nodes in the nested sdfg created
    nstate = nsdfg.sdfg.nodes()[0]
    for node, scope in nstate.scope_dict().items():
        if isinstance(node, nodes.MapEntry):
            if scope is None:
                outer_entry = node
            else:
                inner_entry = node

    inner_exit = nstate.exit_node(inner_entry)
    outer_exit = nstate.exit_node(outer_entry)

    # find earliest parent read-write occurrence of array onto which
    # we perform the reduction:
    # do BFS over predecessors; the visited set keeps this O(V+E) even
    # when several paths lead to the same predecessor
    queue = deque([nsdfg])
    enqueued = {nsdfg}
    array_closest_ancestor = None
    while queue:
        current = queue.popleft()
        if isinstance(current, nodes.AccessNode):
            if current.data == out_storage_node.data:
                # it suffices to find the first node
                # no matter what access (ReadWrite or Read)
                array_closest_ancestor = current
                break
        for pred_edge in graph.in_edges(current):
            if pred_edge.src not in enqueued:
                enqueued.add(pred_edge.src)
                queue.append(pred_edge.src)

    # if ancestor doesn't exist:
    #           if non-transient: create data node accessing it
    #           if transient: ancestor_node = none, set_zero on outer node

    shortcut = False
    if (not array_closest_ancestor and sdfg.data(out_storage_node.data).transient) \
            or identity is not None:
        if self.debug:
            print("ReduceExpansion::Expanding Reduction into Map")
        # we are lucky
        shortcut = True
        nstate.out_edges(outer_exit)[0].data.wcr = None

    else:
        if self.debug:
            print("ReduceExpansion::Expanding Reduction into Map "
                  "and introducing update Tasklet, "
                  "connecting with ancestor.")
        if not array_closest_ancestor:
            array_closest_ancestor = nodes.AccessNode(
                out_storage_node.data, access=dtypes.AccessType.ReadOnly)
            graph.add_node(array_closest_ancestor)
            # array_closest_ancestor now points to the node we want to connect
            # to the map entry

        # always have to create out transient in this case
        # NOTE: this overwrites the attribute on self, so the setting
        # persists after this call returns
        self.create_out_transient = True

    if self.create_out_transient:
        # create an out transient between inner and outer map exit
        array_out = nstate.out_edges(outer_exit)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a:
            nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
            LocalStorage.node_b:
            nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_out
        local_storage.apply(nsdfg.sdfg)
        out_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(out_transient_node_inner.data
                        ).storage = dtypes.StorageType.Register
        if shortcut:
            # no accumulation onto previous data is needed: drop the WCR
            # on the edge leaving the transient and set its volume to 1
            transient_out_memlet = nstate.out_edges(
                out_transient_node_inner)[0].data
            transient_out_memlet.wcr = None
            transient_out_memlet.volume = 1

    if self.create_in_transient:
        # create an in-transient between inner and outer map entry
        array_in = nstate.in_edges(outer_entry)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a:
            nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
            LocalStorage.node_b:
            nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
        }

        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_in
        local_storage.apply(nsdfg.sdfg)
        in_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(in_transient_node_inner.data
                        ).storage = dtypes.StorageType.Register

    # first, inline fuse back our nested SDFG
    from dace.transformation.interstate import InlineSDFG
    inline_sdfg = InlineSDFG(
        sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph),
        {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
    inline_sdfg.apply(sdfg)

    if not shortcut:
        # the result has to be combined with previously existing data:
        # insert an explicit update tasklet inside the outer map
        reduction_type = detect_reduction_type(wcr)
        try:
            code = ReduceExpansion.reduction_type_update[reduction_type]
        except KeyError:
            raise NotImplementedError(
                "Not yet implemented for custom reduction")

        new_tasklet = graph.add_tasklet(name="reduction_transient_update",
                                        inputs={"reduction_in", "array_in"},
                                        outputs={"out"},
                                        code=code)

        edge_to_remove = graph.out_edges(out_transient_node_inner)[0] \
            if self.create_out_transient \
            else graph.out_edges(inner_exit)[0]

        new_memlet_array_inner = Memlet(data=out_storage_node.data,
                                        volume=1,
                                        subset=edge_to_remove.data.subset)
        new_memlet_array_outer = Memlet(
            data=array_closest_ancestor.data,
            volume=graph.in_edges(outer_entry)[0].data.volume,
            subset=subsets.Range.from_array(
                sdfg.data(out_storage_node.data)))

        new_memlet_reduction = Memlet(
            data=graph.out_edges(inner_exit)[0].data.data,
            volume=1,
            subset=graph.out_edges(inner_exit)[0].data.subset)
        new_memlet_out_inner = Memlet(data=edge_to_remove.data.data,
                                      volume=1,
                                      subset=edge_to_remove.data.subset)
        new_memlet_out_outer = dcpy(new_memlet_array_outer)

        # remove old edges
        outer_edge_to_remove = None
        for edge in graph.out_edges(outer_exit):
            if edge.src == edge_to_remove.dst:
                outer_edge_to_remove = edge

        graph.remove_edge_and_connectors(edge_to_remove)
        graph.remove_edge_and_connectors(outer_edge_to_remove)

        graph.add_edge(out_transient_node_inner if self.create_out_transient
                       else inner_exit,
                       None,
                       new_tasklet,
                       "reduction_in",
                       new_memlet_reduction)

        graph.add_edge(outer_entry, None, new_tasklet, "array_in",
                       new_memlet_array_inner)
        graph.add_edge(array_closest_ancestor, None, outer_entry, None,
                       new_memlet_array_outer)
        graph.add_edge(new_tasklet, "out", outer_exit, None,
                       new_memlet_out_inner)
        graph.add_edge(outer_exit, None, out_storage_node, None,
                       new_memlet_out_outer)

        # fill map scope connectors
        graph.fill_scope_connectors()
        graph._clear_scopedict_cache()
        # wcr is already removed

    # FORNOW: choose default schedule and implementation
    new_schedule = dtypes.ScheduleType.Default
    new_implementation = self.reduce_implementation \
        if self.reduce_implementation is not None \
        else implementation
    new_axes = dcpy(reduce_node.axes)

    reduce_node_new = graph.add_reduce(wcr=wcr,
                                       axes=new_axes,
                                       schedule=new_schedule,
                                       identity=identity)
    reduce_node_new.implementation = new_implementation

    # replace the inner map by the new reduce node: reconnect its
    # incoming and outgoing edge, then delete the inner scope
    edge_tmp = graph.in_edges(inner_entry)[0]
    memlet_src_reduce = dcpy(edge_tmp.data)
    graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                   memlet_src_reduce)

    edge_tmp = graph.out_edges(inner_exit)[0]
    memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                               volume=1,
                               subset=edge_tmp.data.subset)

    graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                   memlet_reduce_dst)
    identity_tasklet = graph.out_edges(inner_entry)[0].dst
    graph.remove_node(inner_entry)
    graph.remove_node(inner_exit)
    graph.remove_node(identity_tasklet)

    # propagate scope for correct volumes
    scope_tree = ScopeTree(outer_entry, outer_exit)
    scope_tree.parent = ScopeTree(None, None)
    propagate_memlets_scope(sdfg, graph, scope_tree)
    sdfg.validate()

    # create variables for outside access
    self._new_reduce = reduce_node_new
    self._outer_entry = outer_entry

    if identity is None and self.create_out_transient:
        # set the reduction identity accordingly so that the correct
        # blank result is written to the out_transient node
        # we use default values deducted from the reduction type
        reduction_type = detect_reduction_type(wcr)
        try:
            reduce_node_new.identity = self.reduction_type_identity[
                reduction_type]
        except KeyError:
            if reduction_type == dtypes.ReductionType.Min:
                reduce_node_new.identity = dtypes.max_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            elif reduction_type == dtypes.ReductionType.Max:
                reduce_node_new.identity = dtypes.min_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            else:
                # the f-prefix must sit on the fragment that holds the
                # placeholder, otherwise the node is never interpolated
                raise ValueError(
                    "Cannot infer reduction identity. "
                    f"Please specify the identity of node {reduce_node_new}")
def expand(self, sdfg, graph, reduce_node):
    """ Splits the data dimension into an inner and outer dimension,
        where the inner dimension are the reduction axes and the
        outer axes the complement. Pushes the reduce inside a new
        map consisting of the complement axes.

        Side effects visible to the caller: ``reduce_node.identity`` is
        reset to None, and the new reduce node / outer map entry are
        stored in ``self._reduce`` and ``self._outer_entry``.

        :param sdfg: SDFG that contains ``graph``.
        :param graph: State in which ``reduce_node`` resides.
        :param reduce_node: Reduce node to expand.
        :raises ValueError: If an out transient is created, the node has no
                            identity, and none can be inferred from the
                            reduction type.
    """
    from collections import deque

    # get out storage node, might be hidden behind view node
    out_storage_node = reduce_node
    while not isinstance(out_storage_node, nodes.AccessNode):
        out_storage_node = graph.out_edges(out_storage_node)[0].dst

    if isinstance(sdfg.data(out_storage_node.data), View):
        # step past the view to the next access node downstream
        out_storage_node = graph.out_edges(out_storage_node)[0].dst
        while not isinstance(out_storage_node, nodes.AccessNode):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst

    # get other useful quantities from the original reduce node
    wcr = reduce_node.wcr
    identity = reduce_node.identity
    implementation = reduce_node.implementation

    # remove the reduce identity, will get reassigned after expansion
    reduce_node.identity = None
    # expand the reduce node
    nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
    # find the new nodes in the nested sdfg created
    nstate = nsdfg.sdfg.nodes()[0]
    for node, scope in nstate.scope_dict().items():
        if isinstance(node, nodes.MapEntry):
            if scope is None:
                outer_entry = node
            else:
                inner_entry = node

    inner_exit = nstate.exit_node(inner_entry)
    outer_exit = nstate.exit_node(outer_entry)

    # find earliest parent read-write occurrence of array onto which the
    # reduction is performed: BFS over predecessors
    if self.create_out_transient:
        queue = deque([nsdfg])
        enqueued = {nsdfg}
        array_closest_ancestor = None

        while queue:
            # popleft gives FIFO order, i.e. an actual breadth-first search
            current = queue.popleft()
            if isinstance(current, nodes.AccessNode):
                if current.data == out_storage_node.data:
                    # it suffices to find the first node
                    # no matter what access (ReadWrite or Read)
                    array_closest_ancestor = current
                    break
            for pred_edge in graph.in_edges(current):
                if pred_edge.src not in enqueued:
                    queue.append(pred_edge.src)
                    enqueued.add(pred_edge.src)

        if self.debug and array_closest_ancestor:
            print(
                f"ReduceExpansion::Closest ancestor={array_closest_ancestor}"
            )
        elif self.debug:
            print("ReduceExpansion::No closest ancestor found")

    if self.create_out_transient:
        # create an out transient between inner and outer map exit
        array_out = nstate.out_edges(outer_exit)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a:
            nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
            LocalStorage.node_b:
            nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_out
        local_storage.apply(nsdfg.sdfg)
        out_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(out_transient_node_inner.data
                        ).storage = dtypes.StorageType.Register

        # remove WCRs from all edges where possible if there is no
        # prior occurrence
        if array_closest_ancestor is None:
            nstate.out_edges(outer_exit)[0].data.wcr = None
            transient_out_memlet = nstate.out_edges(
                out_transient_node_inner)[0].data
            transient_out_memlet.wcr = None
            transient_out_memlet.volume = 1
    else:
        # remove WCR from outer exit
        nstate.out_edges(outer_exit)[0].data.wcr = None

    if self.create_in_transient:
        # create an in-transient between inner and outer map entry
        array_in = nstate.in_edges(outer_entry)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a:
            nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
            LocalStorage.node_b:
            nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
        }

        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_in
        local_storage.apply(nsdfg.sdfg)
        in_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(in_transient_node_inner.data
                        ).storage = dtypes.StorageType.Register

    # inline fuse back our nested SDFG
    from dace.transformation.interstate import InlineSDFG
    inline_sdfg = InlineSDFG(
        sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph),
        {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
    inline_sdfg.apply(sdfg)

    new_schedule = dtypes.ScheduleType.Default
    new_implementation = self.reduce_implementation \
        if self.reduce_implementation is not None \
        else implementation
    new_axes = dcpy(reduce_node.axes)

    reduce_node_new = graph.add_reduce(wcr=wcr,
                                       axes=new_axes,
                                       schedule=new_schedule,
                                       identity=identity)
    reduce_node_new.implementation = new_implementation

    # replace inner map with new reduction node
    edge_tmp = graph.in_edges(inner_entry)[0]
    memlet_src_reduce = dcpy(edge_tmp.data)
    graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                   memlet_src_reduce)

    edge_tmp = graph.out_edges(inner_exit)[0]
    memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                               volume=1,
                               subset=edge_tmp.data.subset)

    graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                   memlet_reduce_dst)
    identity_tasklet = graph.out_edges(inner_entry)[0].dst
    graph.remove_node(inner_entry)
    graph.remove_node(inner_exit)
    graph.remove_node(identity_tasklet)

    # propagate scope for correct volumes
    scope_tree = ScopeTree(outer_entry, outer_exit)
    scope_tree.parent = ScopeTree(None, None)
    propagate_memlets_scope(sdfg, graph, scope_tree)
    sdfg.validate()

    # create variables for outside access
    self._reduce = reduce_node_new
    self._outer_entry = outer_entry

    if identity is None and self.create_out_transient:
        if self.debug:
            print(
                "ReduceExpansion::Trying to infer reduction WCR type due to out transient created"
            )
        # set the reduction identity accordingly so that the correct
        # blank result is written to the out_transient node
        # we use default values deducted from the reduction type
        reduction_type = detect_reduction_type(wcr)
        try:
            reduce_node_new.identity = self.reduction_type_identity[
                reduction_type]
        except KeyError:
            if reduction_type == dtypes.ReductionType.Min:
                reduce_node_new.identity = dtypes.max_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            elif reduction_type == dtypes.ReductionType.Max:
                reduce_node_new.identity = dtypes.min_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            else:
                # the f-prefix must sit on the fragment that holds the
                # placeholder, otherwise the node is never interpolated
                raise ValueError(
                    "Cannot infer reduction identity. "
                    f"Please specify the identity of node {reduce_node_new}")
def _argminmax(sdfg: SDFG,
               state: SDFGState,
               a: str,
               axis,
               func,
               result_type=dace.int32,
               return_both=False):
    """ Builds an argmin/argmax of array ``a`` along ``axis`` inside a
        ``NestedCall``.

        The reduction runs over a transient of ``(val, idx)`` structs:
        one state initializes every struct, a second state reduces ``a``
        into it with a write-conflict resolution, and a final step extracts
        either the index alone or both value and index.

        :param sdfg: SDFG whose ``arrays`` mapping contains ``a``.
        :param state: State handed to ``NestedCall``.
        :param a: Name of the input array.
        :param axis: Axis to reduce over; must be of type ``int`` and
                     within the dimensions of ``a``.
        :param func: Either ``'min'`` or ``'max'``.
        :param result_type: Type used for the index output.
        :param return_both: If True, produce both value and index arrays.
        :return: ``(nest, (outval, outidx))`` if ``return_both`` is set,
                 otherwise ``(nest, out)`` where ``out`` names the index
                 array.
        :raises SyntaxError: If ``axis`` is not an ``int`` or out of range.
    """
    nest = NestedCall(sdfg, state)

    assert func in ['min', 'max']

    # None is covered as well, since its type is not int
    if type(axis) is not int:
        raise SyntaxError('Axis must be an int')

    a_arr = sdfg.arrays[a]
    ndim = len(a_arr.shape)

    if not 0 <= axis < ndim:
        raise SyntaxError("Expected 0 <= axis < len({}.shape), got {}".format(
            a, axis))

    # shape of the result: the input shape without the reduced axis
    reduced_shape = list(copy.deepcopy(a_arr.shape))
    del reduced_shape[axis]

    val_and_idx = dace.struct('_val_and_idx', val=a_arr.dtype, idx=result_type)

    # HACK: since identity cannot be specified for structs, we have to init the output array
    reduced_structs, _ = sdfg.add_temp_transient(reduced_shape, val_and_idx)

    # index expressions over all dimensions / all but the reduced one
    all_index = ','.join('__i%d' % i for i in range(ndim))
    kept_index = ','.join('__i%d' % i for i in range(ndim) if i != axis)

    def kept_ranges():
        # a fresh map-range dict over the non-reduced dimensions per use
        return {
            '__i%d' % i: '0:%s' % n
            for i, n in enumerate(a_arr.shape) if i != axis
        }

    # the starting value is the opposite extreme of the element type
    if func == 'max':
        init_value = dtypes.min_value(a_arr.dtype)
    else:
        init_value = dtypes.max_value(a_arr.dtype)
    code = "__init = _val_and_idx(val={}, idx=-1)".format(init_value)

    nest.add_state().add_mapped_tasklet(
        name="_arg{}_convert_".format(func),
        map_ranges=kept_ranges(),
        inputs={},
        code=code,
        outputs={'__init': Memlet.simple(reduced_structs, kept_index)},
        external_edges=True)

    comparison = '<' if func == 'max' else '>'
    wcr = ("lambda x, y:"
           "_val_and_idx(val={}(x.val, y.val), "
           "idx=(y.idx if x.val {} y.val else x.idx))").format(
               func, comparison)

    nest.add_state().add_mapped_tasklet(
        name="_arg{}_reduce_".format(func),
        map_ranges={'__i%d' % i: '0:%s' % n
                    for i, n in enumerate(a_arr.shape)},
        inputs={'__in': Memlet.simple(a, all_index)},
        code="__out = _val_and_idx(idx={}, val=__in)".format("__i%d" % axis),
        outputs={
            '__out': Memlet.simple(reduced_structs, kept_index, wcr_str=wcr)
        },
        external_edges=True)

    result_shape = sdfg.arrays[reduced_structs].shape

    if not return_both:
        # map to result_type
        out, _ = sdfg.add_temp_transient(result_shape, result_type)
        nest(_elementwise)("lambda x: x.idx", reduced_structs, out_array=out)
        return nest, out

    outidx, _ = sdfg.add_temp_transient(result_shape, result_type)
    outval, _ = sdfg.add_temp_transient(result_shape, a_arr.dtype)

    nest.add_state().add_mapped_tasklet(
        name="_arg{}_extract_".format(func),
        map_ranges=kept_ranges(),
        inputs={'__in': Memlet.simple(reduced_structs, kept_index)},
        code="__out_val = __in.val\n__out_idx = __in.idx",
        outputs={
            '__out_val': Memlet.simple(outval, kept_index),
            '__out_idx': Memlet.simple(outidx, kept_index)
        },
        external_edges=True)

    return nest, (outval, outidx)