def gemv_libnode(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, A, x, y, alpha, beta, trans=None):
    # Get properties
    if trans is None:
        trans = (sdfg.arrays[x].shape[0] == sdfg.arrays[A].shape[0])

    # Add nodes
    A_in, x_in = (state.add_read(name) for name in (A, x))
    y_out = state.add_write(y)

    libnode = Gemv('gemv', transA=trans, alpha=alpha, beta=beta)
    state.add_node(libnode)

    # Connect nodes
    state.add_edge(A_in, None, libnode, '_A', mm.Memlet(A))
    state.add_edge(x_in, None, libnode, '_x', mm.Memlet(x))
    state.add_edge(libnode, '_y', y_out, None, mm.Memlet(y))

    if beta != 0:
        y_in = state.add_read(y)
        state.add_edge(y_in, None, libnode, '_y', mm.Memlet(y))

    return []

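# Usage sketch (illustrative only): builds a small SDFG and lets the helper wire
# up the GEMV library node. It assumes `Gemv` and the `mm` memlet alias are in
# scope at module level, as in the library file this helper comes from. The
# ProgramVisitor argument is unused by the body above, so None is passed purely
# for demonstration; array names and shapes are assumptions.
def _example_gemv_usage():
    import dace

    sdfg = dace.SDFG('gemv_example')
    sdfg.add_array('A', [32, 16], dace.float64)
    sdfg.add_array('x', [16], dace.float64)
    sdfg.add_array('y', [32], dace.float64)
    state = sdfg.add_state()

    # y = 2.0 * A @ x (beta == 0, so y is only written)
    gemv_libnode(None, sdfg, state, 'A', 'x', 'y', alpha=2.0, beta=0.0)
    return sdfg
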
def _cart_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, dims: ShapeType):
    """ Creates a process-grid and adds it to the DaCe program. The process-grid is implemented with
        [MPI_Cart_create](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_create.html).

        :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3].
        :return: Name of the new process-grid descriptor.
    """
    pgrid_name = sdfg.add_pgrid(dims)

    # Dummy tasklet adds MPI variables to the program's state.
    from dace.libraries.mpi import Dummy
    tasklet = Dummy(pgrid_name, [
        f'MPI_Comm {pgrid_name}_comm;',
        f'MPI_Group {pgrid_name}_group;',
        f'int {pgrid_name}_coords[{len(dims)}];',
        f'int {pgrid_name}_dims[{len(dims)}];',
        f'int {pgrid_name}_rank;',
        f'int {pgrid_name}_size;',
        f'bool {pgrid_name}_valid;',
    ])
    state.add_node(tasklet)

    # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations.
    _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True)
    wnode = state.add_write(pgrid_name)
    state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal))

    return pgrid_name

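# Usage sketch (illustrative only): creates a 2x2 process-grid descriptor on a
# fresh SDFG. The ProgramVisitor argument is unused by the helper body, so None
# is passed here; the grid shape is an assumption for demonstration.
def _example_cart_create_usage():
    import dace

    sdfg = dace.SDFG('pgrid_example')
    state = sdfg.add_state()
    pgrid = _cart_create(None, sdfg, state, [2, 2])
    # `pgrid` names the process-grid descriptor; the Dummy tasklet above makes
    # the code generator emit the corresponding MPI_Comm/MPI_Group variables.
    return pgrid
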
def expressions():
    # Matching
    #   o  o
    g = SDFGState()
    g.add_node(MergeSourceSinkArrays._array1)
    return [g]

def replicate_scope(sdfg: SDFG, state: SDFGState, scope: ScopeSubgraphView) -> ScopeSubgraphView:
    """
    Replicates a scope subgraph view within a state, reconnecting all external
    edges to the same nodes.

    :param sdfg: The SDFG in which the subgraph scope resides.
    :param state: The SDFG state in which the subgraph scope resides.
    :param scope: The scope subgraph to replicate.
    :return: A reconnected replica of the scope.
    """
    exit_node = state.exit_node(scope.entry)

    # Replicate internal graph
    new_nodes = []
    new_entry = None
    new_exit = None
    to_find_new_names: Set[nodes.AccessNode] = set()
    for node in scope.nodes():
        node_copy = copy.deepcopy(node)
        if node == scope.entry:
            new_entry = node_copy
        elif node == exit_node:
            new_exit = node_copy

        if (isinstance(node, nodes.AccessNode)
                and node.desc(sdfg).lifetime == dtypes.AllocationLifetime.Scope
                and node.desc(sdfg).transient):
            to_find_new_names.add(node_copy)

        state.add_node(node_copy)
        new_nodes.append(node_copy)

    for edge in scope.edges():
        src = scope.nodes().index(edge.src)
        dst = scope.nodes().index(edge.dst)
        state.add_edge(new_nodes[src], edge.src_conn, new_nodes[dst], edge.dst_conn, copy.deepcopy(edge.data))

    # Reconnect external scope nodes
    for edge in state.in_edges(scope.entry):
        state.add_edge(edge.src, edge.src_conn, new_entry, edge.dst_conn, copy.deepcopy(edge.data))
    for edge in state.out_edges(exit_node):
        state.add_edge(new_exit, edge.src_conn, edge.dst, edge.dst_conn, copy.deepcopy(edge.data))

    # Set the exit node's map to match the entry node
    new_exit.map = new_entry.map

    # Replicate all temporary transients within scope
    for node in to_find_new_names:
        desc = node.desc(sdfg)
        new_name = sdfg.add_datadesc(node.data, copy.deepcopy(desc), find_new_name=True)
        node.data = new_name
        for edge in state.all_edges(node):
            for e in state.memlet_tree(edge):
                e.data.data = new_name

    return ScopeSubgraphView(state, new_nodes, new_entry)

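# Usage sketch (illustrative only): replicates the scope of the first map found
# in a state. `state.scope_subgraph` yields the ScopeSubgraphView expected by
# replicate_scope; the SDFG and state are assumed to exist already.
def _example_replicate_scope_usage(sdfg, state):
    from dace.sdfg import nodes as dace_nodes

    map_entry = next(n for n in state.nodes() if isinstance(n, dace_nodes.MapEntry))
    replica = replicate_scope(sdfg, state, state.scope_subgraph(map_entry))
    return replica
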
def _subarray(pv: 'ProgramVisitor',
              sdfg: SDFG,
              state: SDFGState,
              array: Union[str, ShapeType],
              subarray: Union[str, ShapeType],
              dtype: dtypes.typeclass = None,
              process_grid: str = None,
              correspondence: Sequence[Integral] = None):
    """ Adds a sub-array descriptor to the DaCe Program. Sub-arrays are implemented (when `process_grid` is set) with
        [MPI_Type_create_subarray](https://www.mpich.org/static/docs/v3.2/www3/MPI_Type_create_subarray.html).

        :param array: Either the name of an Array descriptor or the shape of the array (similar to the
                      `array_of_sizes` parameter of `MPI_Type_create_subarray`).
        :param subarray: Either the name of an Array descriptor or the sub-shape of the (sub-)array (similar to the
                         `array_of_subsizes` parameter of `MPI_Type_create_subarray`).
        :param dtype: Datatype of the array/sub-array (similar to the `oldtype` parameter of
                      `MPI_Type_create_subarray`).
        :param process_grid: Name of the process-grid for collective scatter/gather operations.
        :param correspondence: Matching of the array/sub-array's dimensions to the process-grid's dimensions.
        :return: Name of the new sub-array descriptor.
    """
    # Get dtype, shape, and subshape
    if isinstance(array, str):
        shape = sdfg.arrays[array].shape
        arr_dtype = sdfg.arrays[array].dtype
    else:
        shape = array
        arr_dtype = None
    if isinstance(subarray, str):
        subshape = sdfg.arrays[subarray].shape
        sub_dtype = sdfg.arrays[subarray].dtype
    else:
        subshape = subarray
        sub_dtype = None
    dtype = dtype or arr_dtype or sub_dtype

    subarray_name = sdfg.add_subarray(dtype, shape, subshape, process_grid, correspondence)

    # Generate subgraph only if process-grid is set, i.e., the sub-array will be used for collective scatter/gather ops.
    if process_grid:
        # Dummy tasklet adds MPI variables to the program's state.
        from dace.libraries.mpi import Dummy
        tasklet = Dummy(subarray_name, [
            f'MPI_Datatype {subarray_name};',
            f'int* {subarray_name}_counts;',
            f'int* {subarray_name}_displs;'
        ])
        state.add_node(tasklet)

        # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations.
        _, scal = sdfg.add_scalar(subarray_name, dace.int32, transient=True)
        wnode = state.add_write(subarray_name)
        state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(subarray_name, scal))

    return subarray_name

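# Usage sketch (illustrative only): declares a sub-array that block-partitions a
# 16x16 global array into 8x8 local blocks over a 2x2 process grid. All names,
# shapes, and the dimension correspondence are assumptions; `_cart_create` from
# the same module is reused to build the grid.
def _example_subarray_usage():
    import dace

    sdfg = dace.SDFG('subarray_example')
    state = sdfg.add_state()
    pgrid = _cart_create(None, sdfg, state, [2, 2])
    sdfg.add_array('A_local', [8, 8], dace.float64)
    subarr = _subarray(None, sdfg, state, [16, 16], 'A_local',
                       process_grid=pgrid, correspondence=[0, 1])
    return subarr
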
def expressions():
    # Matching
    #   o  o
    #   |  |
    # /======\
    g = SDFGState()
    g.add_node(InMergeArrays._array1)
    g.add_node(InMergeArrays._array2)
    g.add_node(InMergeArrays._map_entry)
    g.add_edge(InMergeArrays._array1, None, InMergeArrays._map_entry, None, memlet.Memlet())
    g.add_edge(InMergeArrays._array2, None, InMergeArrays._map_entry, None, memlet.Memlet())
    return [g]

def expressions():
    # Matching
    # \======/
    #   |  |
    #   o  o
    g = SDFGState()
    g.add_node(OutMergeArrays._array1)
    g.add_node(OutMergeArrays._array2)
    g.add_node(OutMergeArrays._map_exit)
    g.add_edge(OutMergeArrays._map_exit, None, OutMergeArrays._array1, None, memlet.Memlet())
    g.add_edge(OutMergeArrays._map_exit, None, OutMergeArrays._array2, None, memlet.Memlet())
    return [g]

def ger_libnode(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, A, x, y, output, alpha):
    # Add nodes
    A_in, x_in, y_in = (state.add_read(name) for name in (A, x, y))
    out = state.add_write(output)

    libnode = Ger('ger', alpha=alpha)
    state.add_node(libnode)

    # Connect nodes
    state.add_edge(A_in, None, libnode, '_A', mm.Memlet(A))
    state.add_edge(x_in, None, libnode, '_x', mm.Memlet(x))
    state.add_edge(y_in, None, libnode, '_y', mm.Memlet(y))
    state.add_edge(libnode, '_res', out, None, mm.Memlet(output))

    return []

def _transpose(sdfg: SDFG, state: SDFGState, inpname: str):
    arr1 = sdfg.arrays[inpname]
    restype = arr1.dtype
    outname, arr2 = sdfg.add_temp_transient((arr1.shape[1], arr1.shape[0]), restype, arr1.storage)

    acc1 = state.add_read(inpname)
    acc2 = state.add_write(outname)
    import dace.libraries.blas  # Avoid import loop
    tasklet = dace.libraries.blas.Transpose('_Transpose_', restype)
    state.add_node(tasklet)
    state.add_edge(acc1, None, tasklet, '_inp', dace.Memlet.from_array(inpname, arr1))
    state.add_edge(tasklet, '_out', acc2, None, dace.Memlet.from_array(outname, arr2))

    return outname

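# Usage sketch (illustrative only): given a 2-D array already registered in the
# SDFG, the helper adds a Transpose library node and returns the name of a new
# temporary transient holding the transposed data. Names/shapes are assumptions.
def _example_transpose_usage():
    import dace

    sdfg = dace.SDFG('transpose_example')
    sdfg.add_array('A', [4, 3], dace.float32)
    state = sdfg.add_state()
    out = _transpose(sdfg, state, 'A')  # 3x4 temporary transient
    return out
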
def replicate_scope(sdfg: SDFG, state: SDFGState, scope: ScopeSubgraphView) -> ScopeSubgraphView:
    """
    Replicates a scope subgraph view within a state, reconnecting all external
    edges to the same nodes.

    :param sdfg: The SDFG in which the subgraph scope resides.
    :param state: The SDFG state in which the subgraph scope resides.
    :param scope: The scope subgraph to replicate.
    :return: A reconnected replica of the scope.
    """
    exit_node = state.exit_node(scope.entry)

    # Replicate internal graph
    new_nodes = []
    new_entry = None
    new_exit = None
    for node in scope.nodes():
        node_copy = copy.deepcopy(node)
        if node == scope.entry:
            new_entry = node_copy
        elif node == exit_node:
            new_exit = node_copy

        state.add_node(node_copy)
        new_nodes.append(node_copy)

    for edge in scope.edges():
        src = scope.nodes().index(edge.src)
        dst = scope.nodes().index(edge.dst)
        state.add_edge(new_nodes[src], edge.src_conn, new_nodes[dst], edge.dst_conn, copy.deepcopy(edge.data))

    # Reconnect external scope nodes
    for edge in state.in_edges(scope.entry):
        state.add_edge(edge.src, edge.src_conn, new_entry, edge.dst_conn, copy.deepcopy(edge.data))
    for edge in state.out_edges(exit_node):
        state.add_edge(new_exit, edge.src_conn, edge.dst, edge.dst_conn, copy.deepcopy(edge.data))

    # Set the exit node's map to match the entry node
    new_exit.map = new_entry.map

    return ScopeSubgraphView(state, new_nodes, new_entry)

def _cart_sub(pv: 'ProgramVisitor',
              sdfg: SDFG,
              state: SDFGState,
              parent_grid: str,
              color: Sequence[Union[Integral, bool]],
              exact_grid: RankType = None):
    """ Partitions the `parent_grid` into lower-dimensional sub-grids and adds them to the DaCe program.
        The sub-grids are implemented with
        [MPI_Cart_sub](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_sub.html).

        :param parent_grid: Parent process-grid (similar to the `comm` parameter of `MPI_Cart_sub`).
        :param color: The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped
                      (see `remain_dims` input of `MPI_Cart_sub`).
        :param exact_grid: [DEVELOPER] If set then, out of all the sub-grids created, only the one that contains the
                           rank with id `exact_grid` will be utilized for collective communication.
        :return: Name of the new sub-grid descriptor.
    """
    pgrid_name = sdfg.add_pgrid(parent_grid=parent_grid, color=color, exact_grid=exact_grid)

    # Count sub-grid dimensions.
    pgrid_ndims = sum([bool(c) for c in color])

    # Dummy tasklet adds MPI variables to the program's state.
    from dace.libraries.mpi import Dummy
    tasklet = Dummy(pgrid_name, [
        f'MPI_Comm {pgrid_name}_comm;',
        f'MPI_Group {pgrid_name}_group;',
        f'int {pgrid_name}_coords[{pgrid_ndims}];',
        f'int {pgrid_name}_dims[{pgrid_ndims}];',
        f'int {pgrid_name}_rank;',
        f'int {pgrid_name}_size;',
        f'bool {pgrid_name}_valid;',
    ])
    state.add_node(tasklet)

    # Pseudo-writing to a dummy variable to avoid removal of Dummy node by transformations.
    _, scal = sdfg.add_scalar(pgrid_name, dace.int32, transient=True)
    wnode = state.add_write(pgrid_name)
    state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(pgrid_name, scal))

    return pgrid_name

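# Usage sketch (illustrative only): splits a 2x2 process grid into row sub-grids
# by keeping only the second dimension (the color semantics follow
# MPI_Cart_sub's remain_dims). Grid shape and color vector are assumptions.
def _example_cart_sub_usage():
    import dace

    sdfg = dace.SDFG('cart_sub_example')
    state = sdfg.add_state()
    parent = _cart_create(None, sdfg, state, [2, 2])
    row_grid = _cart_sub(None, sdfg, state, parent, [False, True])
    return row_grid
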
def _redistribute(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, in_buffer: str, in_subarray: str,
                  out_buffer: str, out_subarray: str):
    """ Redistributes an Array using process-grids, sub-arrays, and the Redistribute library node.

        :param in_buffer: Name of the (local) input Array descriptor.
        :param in_subarray: Input sub-array descriptor.
        :param out_buffer: Name of the (local) output Array descriptor.
        :param out_subarray: Output sub-array descriptor.
        :return: Name of the new redistribution descriptor.
    """
    in_desc = sdfg.arrays[in_buffer]
    out_desc = sdfg.arrays[out_buffer]

    rdistrarray_name = sdfg.add_rdistrarray(in_subarray, out_subarray)

    from dace.libraries.mpi import Dummy, Redistribute
    tasklet = Dummy(rdistrarray_name, [
        f'MPI_Datatype {rdistrarray_name};',
        f'int {rdistrarray_name}_sends;',
        f'MPI_Datatype* {rdistrarray_name}_send_types;',
        f'int* {rdistrarray_name}_dst_ranks;',
        f'int {rdistrarray_name}_recvs;',
        f'MPI_Datatype* {rdistrarray_name}_recv_types;',
        f'int* {rdistrarray_name}_src_ranks;',
        f'int {rdistrarray_name}_self_copies;',
        f'int* {rdistrarray_name}_self_src;',
        f'int* {rdistrarray_name}_self_dst;',
        f'int* {rdistrarray_name}_self_size;'
    ])
    state.add_node(tasklet)
    _, scal = sdfg.add_scalar(rdistrarray_name, dace.int32, transient=True)
    wnode = state.add_write(rdistrarray_name)
    state.add_edge(tasklet, '__out', wnode, None, Memlet.from_array(rdistrarray_name, scal))

    libnode = Redistribute('_Redistribute_', rdistrarray_name)

    inbuf_range = None
    if isinstance(in_buffer, tuple):
        inbuf_name, inbuf_range = in_buffer
    else:
        inbuf_name = in_buffer
    in_desc = sdfg.arrays[inbuf_name]
    inbuf_node = state.add_read(inbuf_name)

    outbuf_range = None
    if isinstance(out_buffer, tuple):
        outbuf_name, outbuf_range = out_buffer
    else:
        outbuf_name = out_buffer
    out_desc = sdfg.arrays[outbuf_name]
    outbuf_node = state.add_write(outbuf_name)

    if inbuf_range:
        inbuf_mem = Memlet.simple(inbuf_name, inbuf_range)
    else:
        inbuf_mem = Memlet.from_array(inbuf_name, in_desc)
    if outbuf_range:
        outbuf_mem = Memlet.simple(outbuf_name, outbuf_range)
    else:
        outbuf_mem = Memlet.from_array(outbuf_name, out_desc)

    state.add_edge(inbuf_node, None, libnode, '_inp_buffer', inbuf_mem)
    state.add_edge(libnode, '_out_buffer', outbuf_node, None, outbuf_mem)

    return rdistrarray_name

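# Usage sketch (illustrative only): redistributes a 16x16 array from a 2x1 to a
# 1x2 block layout. Every name, shape, and correspondence below is an assumption
# for demonstration; the sub-array descriptors are built with `_subarray` above.
def _example_redistribute_usage():
    import dace

    sdfg = dace.SDFG('redistribute_example')
    state = sdfg.add_state()
    grid_in = _cart_create(None, sdfg, state, [2, 1])
    grid_out = _cart_create(None, sdfg, state, [1, 2])
    sdfg.add_array('lA_in', [8, 16], dace.float64)   # local block on the 2x1 grid
    sdfg.add_array('lA_out', [16, 8], dace.float64)  # local block on the 1x2 grid
    in_sub = _subarray(None, sdfg, state, [16, 16], 'lA_in',
                       process_grid=grid_in, correspondence=[0, 1])
    out_sub = _subarray(None, sdfg, state, [16, 16], 'lA_out',
                        process_grid=grid_out, correspondence=[0, 1])
    return _redistribute(None, sdfg, state, 'lA_in', in_sub, 'lA_out', out_sub)
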
def _matmult(visitor, sdfg: SDFG, state: SDFGState, op1: str, op2: str):
    from dace.libraries.blas.nodes.matmul import MatMul  # Avoid import loop

    arr1 = sdfg.arrays[op1]
    arr2 = sdfg.arrays[op2]

    if len(arr1.shape) > 1 and len(arr2.shape) > 1:  # matrix * matrix
        if len(arr1.shape) > 3 or len(arr2.shape) > 3:
            raise SyntaxError('Matrix multiplication of tensors of dimensions > 3 not supported')

        if arr1.shape[-1] != arr2.shape[-2]:
            raise SyntaxError('Matrix dimension mismatch %s != %s' % (arr1.shape[-1], arr2.shape[-2]))

        from dace.libraries.blas.nodes.matmul import _get_batchmm_opts

        # Determine batched multiplication
        bopt = _get_batchmm_opts(arr1.shape, arr1.strides, arr2.shape, arr2.strides, None, None)
        if bopt:
            output_shape = (bopt['b'], arr1.shape[-2], arr2.shape[-1])
        else:
            output_shape = (arr1.shape[-2], arr2.shape[-1])

    elif len(arr1.shape) == 2 and len(arr2.shape) == 1:  # matrix * vector
        if arr1.shape[1] != arr2.shape[0]:
            raise SyntaxError("Number of matrix columns {} must match "
                              "size of vector {}.".format(arr1.shape[1], arr2.shape[0]))

        output_shape = (arr1.shape[0], )

    elif len(arr1.shape) == 1 and len(arr2.shape) == 1:  # vector * vector
        if arr1.shape[0] != arr2.shape[0]:
            raise SyntaxError("Vectors in vector product must have same size: "
                              "{} vs. {}".format(arr1.shape[0], arr2.shape[0]))

        output_shape = (1, )

    else:  # Unsupported combination of shapes, bail
        raise SyntaxError("Cannot multiply arrays with shapes: {} and {}".format(arr1.shape, arr2.shape))

    type1 = arr1.dtype.type
    type2 = arr2.dtype.type
    restype = dace.DTYPE_TO_TYPECLASS[np.result_type(type1, type2).type]

    op3, arr3 = sdfg.add_temp_transient(output_shape, restype, arr1.storage)

    acc1 = state.add_read(op1)
    acc2 = state.add_read(op2)
    acc3 = state.add_write(op3)

    tasklet = MatMul('_MatMult_', restype)
    state.add_node(tasklet)
    state.add_edge(acc1, None, tasklet, '_a', dace.Memlet.from_array(op1, arr1))
    state.add_edge(acc2, None, tasklet, '_b', dace.Memlet.from_array(op2, arr2))
    state.add_edge(tasklet, '_c', acc3, None, dace.Memlet.from_array(op3, arr3))

    return op3

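# Usage sketch (illustrative only): the helper infers the output shape from the
# operand shapes and returns the name of a new temporary transient holding the
# product. The visitor argument is unused by the body above; names and shapes
# are assumptions.
def _example_matmult_usage():
    import dace

    sdfg = dace.SDFG('matmul_example')
    sdfg.add_array('A', [8, 4], dace.float64)
    sdfg.add_array('B', [4, 6], dace.float64)
    state = sdfg.add_state()
    return _matmult(None, sdfg, state, 'A', 'B')  # 8x6 temporary transient
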
def apply(self, state: SDFGState, sdfg: SDFG) -> nodes.AccessNode:
    dnode: nodes.AccessNode = self.access
    if self.expr_index == 0:
        edges = state.out_edges(dnode)
    else:
        edges = state.in_edges(dnode)

    # To understand how many components we need to create, all map ranges
    # throughout memlet paths must match exactly. We thus create a
    # dictionary of unique ranges
    mapping: Dict[Tuple[subsets.Range],
                  List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    ranges = {}
    for edge in edges:
        mpath = state.memlet_path(edge)
        ranges[edge] = _collect_map_ranges(state, mpath)
        mapping[tuple(r[1] for r in ranges[edge])].append(edge)

    # Collect all edges with the same memory access pattern
    components_to_create: Dict[Tuple[symbolic.SymbolicType],
                               List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    for edges_with_same_range in mapping.values():
        for edge in edges_with_same_range:
            # Get memlet path and innermost edge
            mpath = state.memlet_path(edge)
            innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index == 0 else mpath[0])

            # Store memlets of the same access in the same component
            expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
            components_to_create[expr].append((innermost_edge, edge))
    components = list(components_to_create.values())

    # Split out components that have dependencies between them to avoid
    # deadlocks
    if self.expr_index == 0:
        ccs_to_add = []
        for i, component in enumerate(components):
            edges_to_remove = set()
            for cedge in component:
                if any(nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                       for o in component if o is not cedge):
                    ccs_to_add.append([cedge])
                    edges_to_remove.add(cedge)
            if edges_to_remove:
                components[i] = [c for c in component if c not in edges_to_remove]
        components.extend(ccs_to_add)
    # End of split

    desc = sdfg.arrays[dnode.data]

    # Create new streams of shape 1
    streams = {}
    mpaths = {}
    for edge in edges:

        if self.use_memory_buffering:

            arrname = str(self.access)

            # Add gearbox
            total_size = edge.data.volume
            vector_size = int(self.memory_buffering_target_bytes / desc.dtype.bytes)

            if not is_int(sdfg.arrays[dnode.data].shape[-1]):
                warnings.warn("Using the MemoryBuffering transformation is potentially unsafe since "
                              "{sym} is not an integer. There should be no issue if {sym} % {vec} == 0".format(
                                  sym=sdfg.arrays[dnode.data].shape[-1], vec=vector_size))

            for i in sdfg.arrays[dnode.data].strides:
                if not is_int(i):
                    warnings.warn("Using the MemoryBuffering transformation is potentially unsafe since "
                                  "{sym} is not an integer. There should be no issue if {sym} % {vec} == 0".format(
                                      sym=i, vec=vector_size))

            if self.expr_index == 0:  # Read
                edges = state.out_edges(dnode)
                gearbox_input_type = dtypes.vector(desc.dtype, vector_size)
                gearbox_output_type = desc.dtype
                gearbox_read_volume = total_size / vector_size
                gearbox_write_volume = total_size
            else:  # Write
                edges = state.in_edges(dnode)
                gearbox_input_type = desc.dtype
                gearbox_output_type = dtypes.vector(desc.dtype, vector_size)
                gearbox_read_volume = total_size
                gearbox_write_volume = total_size / vector_size

            input_gearbox_name, input_gearbox_newdesc = sdfg.add_stream("gearbox_input",
                                                                        gearbox_input_type,
                                                                        buffer_size=self.buffer_size,
                                                                        storage=self.storage,
                                                                        transient=True,
                                                                        find_new_name=True)

            output_gearbox_name, output_gearbox_newdesc = sdfg.add_stream("gearbox_output",
                                                                          gearbox_output_type,
                                                                          buffer_size=self.buffer_size,
                                                                          storage=self.storage,
                                                                          transient=True,
                                                                          find_new_name=True)

            read_to_gearbox = state.add_read(input_gearbox_name)
            write_from_gearbox = state.add_write(output_gearbox_name)

            gearbox = Gearbox(total_size / vector_size)

            state.add_node(gearbox)

            state.add_memlet_path(read_to_gearbox,
                                  gearbox,
                                  dst_conn="from_memory",
                                  memlet=Memlet(input_gearbox_name + "[0]", volume=gearbox_read_volume))
            state.add_memlet_path(gearbox,
                                  write_from_gearbox,
                                  src_conn="to_kernel",
                                  memlet=Memlet(output_gearbox_name + "[0]", volume=gearbox_write_volume))

            if self.expr_index == 0:
                streams[edge] = input_gearbox_name
                name = output_gearbox_name
                newdesc = output_gearbox_newdesc
            else:
                streams[edge] = output_gearbox_name
                name = input_gearbox_name
                newdesc = input_gearbox_newdesc

        else:
            # Qualify name to avoid name clashes if memory interfaces are not decoupled for Xilinx
            stream_name = "stream_" + dnode.data

            name, newdesc = sdfg.add_stream(stream_name,
                                            desc.dtype,
                                            buffer_size=self.buffer_size,
                                            storage=self.storage,
                                            transient=True,
                                            find_new_name=True)

            streams[edge] = name

            # Add these such that we can easily use output_gearbox_name and
            # input_gearbox_name without using if statements
            output_gearbox_name = name
            input_gearbox_name = name

        mpath = state.memlet_path(edge)
        mpaths[edge] = mpath

        # Replace memlets in path with stream access
        for e in mpath:
            e.data = mm.Memlet(data=name, subset='0', other_subset=e.data.other_subset)
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)

        # Replace access node and memlet tree with one access
        if self.expr_index == 0:
            replacement = state.add_read(output_gearbox_name)
            state.remove_edge(edge)
            state.add_edge(replacement, edge.src_conn, edge.dst, edge.dst_conn, edge.data)
        else:
            replacement = state.add_write(input_gearbox_name)
            state.remove_edge(edge)
            state.add_edge(edge.src, edge.src_conn, replacement, edge.dst_conn, edge.data)

    if self.use_memory_buffering:

        arrname = str(self.access)
        vector_size = int(self.memory_buffering_target_bytes / desc.dtype.bytes)

        # Vectorize access to global array.
        dtype = sdfg.arrays[arrname].dtype
        sdfg.arrays[arrname].dtype = dtypes.vector(dtype, vector_size)
        new_shape = list(sdfg.arrays[arrname].shape)
        contigidx = sdfg.arrays[arrname].strides.index(1)

        new_shape[contigidx] /= vector_size
        try:
            new_shape[contigidx] = int(new_shape[contigidx])
        except TypeError:
            pass

        sdfg.arrays[arrname].shape = new_shape

        # Change strides
        new_strides: List = list(sdfg.arrays[arrname].strides)

        for i in range(len(new_strides)):
            if i == len(new_strides) - 1:  # Skip last dimension since it is always 1
                continue
            new_strides[i] = new_strides[i] / vector_size
        sdfg.arrays[arrname].strides = new_strides

        post_state = get_post_state(sdfg, state)

        if post_state is not None:
            # Change subset in the post state such that the correct amount of memory is copied back from the device
            for e in post_state.edges():
                if e.data.data == self.access.data:
                    new_subset = list(e.data.subset)
                    i, j, k = new_subset[-1]
                    new_subset[-1] = (i, (j + 1) / vector_size - 1, k)
                    e.data = mm.Memlet(data=str(e.src), subset=subsets.Range(new_subset))

    # Make read/write components
    ionodes = []
    for component in components:

        # Pick the first edge as the edge to make the component from
        innermost_edge, outermost_edge = component[0]
        mpath = mpaths[outermost_edge]
        mapname = streams[outermost_edge]
        innermost_edge.data.other_subset = None

        # Get edge data and streams
        if self.expr_index == 0:
            opname = 'read'
            path = [e.dst for e in mpath[:-1]]
            rmemlets = [(dnode, '__inp', innermost_edge.data)]
            wmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_write(name)
                ionodes.append(ionode)
                wmemlets.append((ionode, '__out%d' % i, mm.Memlet(data=name, subset='0')))
            code = '\n'.join('__out%d = __inp' % i for i in range(len(component)))
        else:
            # More than one input stream might mean a data race, so we only
            # address the first one in the tasklet code
            if len(component) > 1:
                warnings.warn(f'More than one input found for the same index for {dnode.data}')

            opname = 'write'
            path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
            wmemlets = [(dnode, '__out', innermost_edge.data)]
            rmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_read(name)
                ionodes.append(ionode)
                rmemlets.append((ionode, '__inp%d' % i, mm.Memlet(data=name, subset='0')))
            code = '__out = __inp0'

        # Create map structure for read/write component
        maps = []
        for entry in path:
            map: nodes.Map = entry.map

            ranges = [(p, (r[0], r[1], r[2])) for p, r in zip(map.params, map.range)]

            # Change ranges of map
            if self.use_memory_buffering:

                # Find edges from/to map
                edge_subset = [a_tuple[0] for a_tuple in list(innermost_edge.data.subset)]

                # Change range of map
                if isinstance(edge_subset[-1], symbol) and str(edge_subset[-1]) == map.params[-1]:
                    if not is_int(ranges[-1][1][1]):
                        warnings.warn("Using the MemoryBuffering transformation is potentially unsafe since "
                                      "{sym} is not an integer. There should be no issue if {sym} % {vec} == 0".format(
                                          sym=ranges[-1][1][1].args[1], vec=vector_size))

                    ranges[-1] = (ranges[-1][0], (ranges[-1][1][0],
                                                  (ranges[-1][1][1] + 1) / vector_size - 1,
                                                  ranges[-1][1][2]))

                elif isinstance(edge_subset[-1], sympy.core.add.Add):
                    for arg in edge_subset[-1].args:
                        if isinstance(arg, symbol) and str(arg) == map.params[-1]:
                            if not is_int(ranges[-1][1][1]):
                                warnings.warn("Using the MemoryBuffering transformation is potentially unsafe since "
                                              "{sym} is not an integer. There should be no issue "
                                              "if {sym} % {vec} == 0".format(
                                                  sym=ranges[-1][1][1].args[1], vec=vector_size))

                            ranges[-1] = (ranges[-1][0], (ranges[-1][1][0],
                                                          (ranges[-1][1][1] + 1) / vector_size - 1,
                                                          ranges[-1][1][2]))

            maps.append(state.add_map(f'__s{opname}_{mapname}', ranges, map.schedule))

        tasklet = state.add_tasklet(
            f'{opname}_{mapname}',
            {m[1] for m in rmemlets},
            {m[1] for m in wmemlets},
            code,
        )
        for node, cname, memlet in rmemlets:
            state.add_memlet_path(node, *(me for me, _ in maps), tasklet, dst_conn=cname, memlet=memlet)
        for node, cname, memlet in wmemlets:
            state.add_memlet_path(tasklet, *(mx for _, mx in reversed(maps)), node, src_conn=cname, memlet=memlet)

    return ionodes

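# Application sketch (illustrative only): this apply() follows the pattern of
# DaCe's StreamingMemory transformation. Assuming that is the class it belongs
# to, applying it through the standard transformation API might look as follows;
# the option values (storage type, buffer size) are assumptions for
# demonstration only.
def _example_apply_streaming(sdfg):
    import dace
    from dace.transformation.dataflow import StreamingMemory

    sdfg.apply_transformations_repeated(
        StreamingMemory,
        options={'storage': dace.StorageType.FPGA_Local, 'buffer_size': 32})
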