def _Reduce(pv: 'ProgramVisitor',
            sdfg: SDFG,
            state: SDFGState,
            buffer: str,
            op: str,
            root: Union[str, sp.Expr, Number] = 0,
            grid: str = None):
    """Insert an MPI Reduce library node operating in-place on `buffer`.

    :param buffer: Name of the array that is both reduced and overwritten.
    :param op: Name of the reduction operation.
    :param root: Root rank; either an existing array name or a constant
        expression materialized into a scalar.
    :param grid: Optional process-grid name.
    :return: None.
    """
    from dace.libraries.mpi.nodes.reduce import Reduce

    libnode = Reduce('_Reduce_', op, grid)
    desc = sdfg.arrays[buffer]
    rnode = state.add_read(buffer)
    wnode = state.add_write(buffer)

    # The root rank is either an existing array or a compile-time value
    # written into a fresh int32 scalar by a setter tasklet.
    if isinstance(root, str) and root in sdfg.arrays.keys():
        root_node = state.add_read(root)
    else:
        root_name = _define_local_scalar(pv, sdfg, state, dace.int32,
                                         desc.storage)
        root_node = state.add_access(root_name)
        setter = state.add_tasklet('_set_root_', {}, {'__out'},
                                   '__out = {}'.format(root))
        state.add_edge(setter, '__out', root_node, None,
                       Memlet.simple(root_name, '0'))

    state.add_edge(rnode, None, libnode, '_inbuffer',
                   Memlet.from_array(buffer, desc))
    state.add_edge(root_node, None, libnode, '_root',
                   Memlet.simple(root_node.data, '0'))
    state.add_edge(libnode, '_outbuffer', wnode, None,
                   Memlet.from_array(buffer, desc))
    return None
def _gather(pv: 'ProgramVisitor',
            sdfg: SDFG,
            state: SDFGState,
            in_buffer: str,
            out_buffer: str,
            root: Union[str, sp.Expr, Number] = 0):
    """Insert an MPI Gather library node collecting `in_buffer` into
    `out_buffer`.

    :param in_buffer: Name of the send-buffer array.
    :param out_buffer: Name of the receive-buffer array.
    :param root: Root rank; either an existing array name or a constant
        expression materialized into a scalar.
    :return: None.
    """
    from dace.libraries.mpi.nodes.gather import Gather

    libnode = Gather('_Gather_')
    in_desc = sdfg.arrays[in_buffer]
    out_desc = sdfg.arrays[out_buffer]
    send_node = state.add_read(in_buffer)
    recv_node = state.add_write(out_buffer)

    # Root is either an existing array or a constant written into a fresh
    # int32 scalar by a setter tasklet.
    if isinstance(root, str) and root in sdfg.arrays.keys():
        root_node = state.add_read(root)
    else:
        root_name = _define_local_scalar(pv, sdfg, state, dace.int32,
                                         in_desc.storage)
        root_node = state.add_access(root_name)
        setter = state.add_tasklet('_set_root_', {}, {'__out'},
                                   '__out = {}'.format(root))
        state.add_edge(setter, '__out', root_node, None,
                       Memlet.simple(root_name, '0'))

    state.add_edge(send_node, None, libnode, '_inbuffer',
                   Memlet.from_array(in_buffer, in_desc))
    state.add_edge(root_node, None, libnode, '_root',
                   Memlet.simple(root_node.data, '0'))
    state.add_edge(libnode, '_outbuffer', recv_node, None,
                   Memlet.from_array(out_buffer, out_desc))
    return None
def _simple_call(sdfg: SDFG,
                 state: SDFGState,
                 inpname: str,
                 func: str,
                 restype: dace.typeclass = None):
    """ Implements a simple call of the form `out = func(inp)`. """
    inparr = sdfg.arrays[inpname]
    if restype is None:
        restype = inparr.dtype
    outname, outarr = sdfg.add_temp_transient(inparr.shape, restype,
                                              inparr.storage)

    # A one-element array maps to a single tasklet; anything larger
    # becomes an elementwise map over the full shape.
    if reduce(lambda a, b: a * b, inparr.shape) == 1:
        rnode = state.add_read(inpname)
        wnode = state.add_write(outname)
        tasklet = state.add_tasklet(func, {'__inp'}, {'__out'},
                                    '__out = {f}(__inp)'.format(f=func))
        state.add_edge(rnode, None, tasklet, '__inp',
                       Memlet.from_array(inpname, inparr))
        state.add_edge(tasklet, '__out', wnode, None,
                       Memlet.from_array(outname, outarr))
    else:
        # Same per-element index expression for input and output.
        index = ','.join('__i%d' % i for i in range(len(inparr.shape)))
        state.add_mapped_tasklet(
            name=func,
            map_ranges={
                '__i%d' % i: '0:%s' % n
                for i, n in enumerate(inparr.shape)
            },
            inputs={'__inp': Memlet.simple(inpname, index)},
            code='__out = {f}(__inp)'.format(f=func),
            outputs={'__out': Memlet.simple(outname, index)},
            external_edges=True)

    return outname
def _array_x_binop(visitor: 'ProgramVisitor', sdfg: SDFG, state: SDFGState,
                   op1: str, op2: str, op: str, opcode: str):
    """Dispatch a binary operation between two operands, specializing the
    scalar-scalar case into a single tasklet and delegating everything else
    to `_binop`.

    :param op1: Name of the left-hand operand.
    :param op2: Name of the right-hand operand.
    :param op: Operation name (used for naming and boolean detection).
    :param opcode: Python operator symbol used in generated code.
    :return: Name of the result array.
    """

    def operand_info(name):
        # Base type comes from the array descriptor; constants tracked by
        # the visitor override it with their recorded Python type.
        dtype = sdfg.arrays[name].dtype.type
        scalar = _is_scalar(sdfg, name)
        if scalar and name in visitor.numbers.values():
            dtype = inverse_dict_lookup(visitor.numbers, name)
        return dtype, scalar

    type1, isscal1 = operand_info(op1)
    type2, isscal2 = operand_info(op2)

    # Comparisons/boolean ops always yield booleans; otherwise follow
    # NumPy type promotion.
    if _is_op_boolean(op):
        restype = dace.bool
    else:
        restype = dace.DTYPE_TO_TYPECLASS[np.result_type(type1, type2).type]

    if not (isscal1 and isscal2):
        return _binop(sdfg, state, op1, op2, opcode, op, restype)

    # Scalar (x) scalar: a lone tasklet suffices.
    arr1 = sdfg.arrays[op1]
    arr2 = sdfg.arrays[op2]
    op3, arr3 = sdfg.add_temp_transient([1], restype, arr2.storage)
    tasklet = state.add_tasklet('_SS%s_' % op, {'s1', 's2'}, {'s3'},
                                's3 = s1 %s s2' % opcode)
    n1 = state.add_read(op1)
    n2 = state.add_read(op2)
    n3 = state.add_write(op3)
    state.add_edge(n1, None, tasklet, 's1', dace.Memlet.from_array(op1, arr1))
    state.add_edge(n2, None, tasklet, 's2', dace.Memlet.from_array(op2, arr2))
    state.add_edge(tasklet, 's3', n3, None, dace.Memlet.from_array(op3, arr3))
    return op3
def _bcgather(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState,
              in_buffer: str, out_buffer: str,
              block_sizes: Union[str, Sequence[Union[sp.Expr, Number]]]):
    """Insert a PBLAS block-cyclic gather library node.

    :param in_buffer: Input; an array name or a (name, range) tuple.
    :param out_buffer: Output; an array name or a (name, range) tuple.
    :param block_sizes: Block sizes; an array name, a (name, range) tuple,
        or a sequence of constants materialized into a transient.
    :return: None.
    """
    from dace.libraries.pblas.nodes.pgeadd import BlockCyclicGather

    libnode = BlockCyclicGather('_BCGather_')

    # Buffers may be plain names or (name, range) tuples.
    inbuf_name, inbuf_range = (in_buffer if isinstance(in_buffer, tuple) else
                               (in_buffer, None))
    in_desc = sdfg.arrays[inbuf_name]
    inbuf_node = state.add_read(inbuf_name)

    bsizes_range = None
    if isinstance(block_sizes, (list, tuple)):
        if isinstance(block_sizes[0], str):
            bsizes_name, bsizes_range = block_sizes
            bsizes_desc = sdfg.arrays[bsizes_name]
            bsizes_node = state.add_read(bsizes_name)
        else:
            # Constant block sizes: write them into a fresh transient
            # through a setter tasklet.
            bsizes_name, bsizes_desc = sdfg.add_temp_transient(
                (len(block_sizes), ), dtype=dace.int32)
            bsizes_node = state.add_access(bsizes_name)
            setter_code = ";".join("__out[{}] = {}".format(i, sz)
                                   for i, sz in enumerate(block_sizes))
            setter = state.add_tasklet('_set_bsizes_', {}, {'__out'},
                                       setter_code)
            state.add_edge(setter, '__out', bsizes_node, None,
                           Memlet.from_array(bsizes_name, bsizes_desc))
    else:
        bsizes_name = block_sizes
        bsizes_desc = sdfg.arrays[bsizes_name]
        bsizes_node = state.add_read(bsizes_name)

    outbuf_name, outbuf_range = (out_buffer if isinstance(out_buffer, tuple)
                                 else (out_buffer, None))
    out_desc = sdfg.arrays[outbuf_name]
    outbuf_node = state.add_write(outbuf_name)

    # Ranged operands get subset memlets; whole arrays get full memlets.
    inbuf_mem = (Memlet.simple(inbuf_name, inbuf_range)
                 if inbuf_range else Memlet.from_array(inbuf_name, in_desc))
    bsizes_mem = (Memlet.simple(bsizes_name, bsizes_range) if bsizes_range
                  else Memlet.from_array(bsizes_name, bsizes_desc))
    outbuf_mem = (Memlet.simple(outbuf_name, outbuf_range) if outbuf_range
                  else Memlet.from_array(outbuf_name, out_desc))

    state.add_edge(inbuf_node, None, libnode, '_inbuffer', inbuf_mem)
    state.add_edge(bsizes_node, None, libnode, '_block_sizes', bsizes_mem)
    state.add_edge(libnode, '_outbuffer', outbuf_node, None, outbuf_mem)
    return None
def _irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str,
           src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number],
           request: str):
    """Insert an MPI Irecv library node.

    :param buffer: Receive buffer; an array name or a (name, range) tuple.
    :param src: Source rank; an array name, a (name, range) tuple, or a
        constant expression materialized into a scalar.
    :param tag: Message tag; same accepted forms as ``src``.
    :param request: MPI request object; a name or a (name, range) tuple.
    :return: None.
    """
    from dace.libraries.mpi.nodes.irecv import Irecv

    libnode = Irecv('_Irecv_')

    buf_range = None
    if isinstance(buffer, tuple):
        buf_name, buf_range = buffer
    else:
        buf_name = buffer
    desc = sdfg.arrays[buf_name]
    buf_node = state.add_read(buf_name)

    req_range = None
    if isinstance(request, tuple):
        req_name, req_range = request
    else:
        req_name = request
    req_desc = sdfg.arrays[req_name]
    req_node = state.add_write(req_name)

    # The buffer and request outputs are passed by pointer.
    conn = libnode.out_connectors
    conn = {
        c: (dtypes.pointer(desc.dtype) if c == '_buffer' else t)
        for c, t in conn.items()
    }
    conn = {
        c: (dtypes.pointer(req_desc.dtype) if c == '_request' else t)
        for c, t in conn.items()
    }
    libnode.out_connectors = conn

    src_range = None
    if isinstance(src, tuple):
        src_name, src_range = src
        src_node = state.add_read(src_name)
    elif isinstance(src, str) and src in sdfg.arrays.keys():
        src_name = src
        src_node = state.add_read(src_name)
    else:
        storage = desc.storage
        src_name = _define_local_scalar(pv, sdfg, state, dace.int32, storage)
        src_node = state.add_access(src_name)
        src_tasklet = state.add_tasklet('_set_src_', {}, {'__out'},
                                        '__out = {}'.format(src))
        state.add_edge(src_tasklet, '__out', src_node, None,
                       Memlet.simple(src_name, '0'))

    tag_range = None
    if isinstance(tag, tuple):
        tag_name, tag_range = tag
        tag_node = state.add_read(tag_name)
    # BUGFIX: this branch was a separate `if`, so a (name, range) tuple tag
    # fell through to the `else` below, which overwrote the access node and
    # formatted the tuple itself into the setter tasklet.
    elif isinstance(tag, str) and tag in sdfg.arrays.keys():
        tag_name = tag
        tag_node = state.add_read(tag)
    else:
        storage = desc.storage
        tag_name = _define_local_scalar(pv, sdfg, state, dace.int32, storage)
        tag_node = state.add_access(tag_name)
        tag_tasklet = state.add_tasklet('_set_tag_', {}, {'__out'},
                                        '__out = {}'.format(tag))
        state.add_edge(tag_tasklet, '__out', tag_node, None,
                       Memlet.simple(tag_name, '0'))

    if buf_range:
        buf_mem = Memlet.simple(buf_name, buf_range)
    else:
        buf_mem = Memlet.from_array(buf_name, desc)
    if req_range:
        req_mem = Memlet.simple(req_name, req_range)
    else:
        req_mem = Memlet.from_array(req_name, req_desc)
    if src_range:
        src_mem = Memlet.simple(src_name, src_range)
    else:
        src_mem = Memlet.simple(src_name, '0')
    if tag_range:
        tag_mem = Memlet.simple(tag_name, tag_range)
    else:
        tag_mem = Memlet.simple(tag_name, '0')

    state.add_edge(libnode, '_buffer', buf_node, None, buf_mem)
    state.add_edge(src_node, None, libnode, '_src', src_mem)
    state.add_edge(tag_node, None, libnode, '_tag', tag_mem)
    state.add_edge(libnode, '_request', req_node, None, req_mem)
    return None
def _isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, buffer: str,
           dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number],
           request: str):
    """Insert an MPI Isend library node.

    :param buffer: Send buffer; an array name or a (name, range) tuple.
    :param dst: Destination rank; an array name, a (name, range) tuple, or
        a constant expression materialized into a scalar.
    :param tag: Message tag; same accepted forms as ``dst``.
    :param request: MPI request object; a name or a (name, range) tuple.
    :return: None.
    """
    from dace.libraries.mpi.nodes.isend import Isend

    libnode = Isend('_Isend_')

    buf_range = None
    if isinstance(buffer, tuple):
        buf_name, buf_range = buffer
    else:
        buf_name = buffer
    desc = sdfg.arrays[buf_name]
    buf_node = state.add_read(buf_name)

    req_range = None
    if isinstance(request, tuple):
        req_name, req_range = request
    else:
        req_name = request
    req_desc = sdfg.arrays[req_name]
    req_node = state.add_write(req_name)

    # The buffer input and the request output are passed by pointer.
    iconn = libnode.in_connectors
    iconn = {
        c: (dtypes.pointer(desc.dtype) if c == '_buffer' else t)
        for c, t in iconn.items()
    }
    libnode.in_connectors = iconn
    oconn = libnode.out_connectors
    oconn = {
        c: (dtypes.pointer(req_desc.dtype) if c == '_request' else t)
        for c, t in oconn.items()
    }
    libnode.out_connectors = oconn

    dst_range = None
    if isinstance(dst, tuple):
        dst_name, dst_range = dst
        dst_node = state.add_read(dst_name)
    elif isinstance(dst, str) and dst in sdfg.arrays.keys():
        dst_name = dst
        dst_node = state.add_read(dst_name)
    else:
        storage = desc.storage
        dst_name = _define_local_scalar(pv, sdfg, state, dace.int32, storage)
        dst_node = state.add_access(dst_name)
        dst_tasklet = state.add_tasklet('_set_dst_', {}, {'__out'},
                                        '__out = {}'.format(dst))
        state.add_edge(dst_tasklet, '__out', dst_node, None,
                       Memlet.simple(dst_name, '0'))

    tag_range = None
    if isinstance(tag, tuple):
        tag_name, tag_range = tag
        tag_node = state.add_read(tag_name)
    # BUGFIX: this branch was a separate `if`, so a (name, range) tuple tag
    # fell through to the `else` below, which overwrote the access node and
    # formatted the tuple itself into the setter tasklet.
    elif isinstance(tag, str) and tag in sdfg.arrays.keys():
        tag_name = tag
        tag_node = state.add_read(tag)
    else:
        storage = desc.storage
        tag_name = _define_local_scalar(pv, sdfg, state, dace.int32, storage)
        tag_node = state.add_access(tag_name)
        tag_tasklet = state.add_tasklet('_set_tag_', {}, {'__out'},
                                        '__out = {}'.format(tag))
        state.add_edge(tag_tasklet, '__out', tag_node, None,
                       Memlet.simple(tag_name, '0'))

    if buf_range:
        buf_mem = Memlet.simple(buf_name, buf_range)
    else:
        buf_mem = Memlet.from_array(buf_name, desc)
    if req_range:
        req_mem = Memlet.simple(req_name, req_range)
    else:
        req_mem = Memlet.from_array(req_name, req_desc)
    if dst_range:
        dst_mem = Memlet.simple(dst_name, dst_range)
    else:
        dst_mem = Memlet.simple(dst_name, '0')
    if tag_range:
        tag_mem = Memlet.simple(tag_name, tag_range)
    else:
        tag_mem = Memlet.simple(tag_name, '0')

    state.add_edge(buf_node, None, libnode, '_buffer', buf_mem)
    state.add_edge(dst_node, None, libnode, '_dest', dst_mem)
    state.add_edge(tag_node, None, libnode, '_tag', tag_mem)
    state.add_edge(libnode, '_request', req_node, None, req_mem)
    return None
def _distr_matmult(pv: 'ProgramVisitor',
                   sdfg: SDFG,
                   state: SDFGState,
                   opa: str,
                   opb: str,
                   shape: Sequence[Union[sp.Expr, Number]],
                   a_block_sizes: Union[str, Sequence[Union[sp.Expr,
                                                            Number]]] = None,
                   b_block_sizes: Union[str, Sequence[Union[sp.Expr,
                                                            Number]]] = None,
                   c_block_sizes: Union[str, Sequence[Union[sp.Expr,
                                                            Number]]] = None):
    """Insert a distributed (PBLAS) matrix-matrix or matrix-vector product.

    :param opa: Name of the first operand array (1D or 2D).
    :param opb: Name of the second operand array (1D or 2D).
    :param shape: Global shape, (m, n) or (m, n, k); three entries are
        required for the 2D x 2D (GEMM) case, which reads ``gk``.
    :param a_block_sizes: Block sizes of ``opa`` — an array name, a
        (name, range) tuple, or a sequence of constants. Defaults to the
        local shape of ``opa``.
    :param b_block_sizes: Same accepted forms, for ``opb``.
    :param c_block_sizes: Optional block sizes of the output.
    :return: Name of the (transient) output array.
    """
    arra = sdfg.arrays[opa]
    arrb = sdfg.arrays[opb]

    if len(shape) == 3:
        gm, gn, gk = shape
    else:
        gm, gn = shape

    a_block_sizes = a_block_sizes or arra.shape
    if len(a_block_sizes) < 2:
        a_block_sizes = (a_block_sizes[0], 1)
    b_block_sizes = b_block_sizes or arrb.shape
    if len(b_block_sizes) < 2:
        b_block_sizes = (b_block_sizes[0], 1)

    # The transposed-GEMV case below swaps a and b; swap their block sizes
    # up front to match.
    if len(arra.shape) == 1 and len(arrb.shape) == 2:
        a_block_sizes, b_block_sizes = b_block_sizes, a_block_sizes

    def _bsizes_operand(block_sizes, label):
        """Resolve a block-sizes argument to (name, desc, node, range)."""
        rng = None
        if isinstance(block_sizes, (list, tuple)):
            if isinstance(block_sizes[0], str):
                name, rng = block_sizes
                desc = sdfg.arrays[name]
                node = state.add_read(name)
            else:
                # Constant block sizes: materialize into a transient.
                name, desc = sdfg.add_temp_transient((len(block_sizes), ),
                                                     dtype=dace.int32)
                node = state.add_access(name)
                tasklet = state.add_tasklet(
                    '_set_{}_'.format(label), {}, {'__out'}, ";".join(
                        "__out[{}] = {}".format(i, sz)
                        for i, sz in enumerate(block_sizes)))
                state.add_edge(tasklet, '__out', node, None,
                               Memlet.from_array(name, desc))
        else:
            name = block_sizes
            desc = sdfg.arrays[name]
            node = state.add_read(name)
        return name, desc, node, rng

    # BUGFIX: the b-side previously tested `a_block_sizes` (twice) instead
    # of `b_block_sizes`, and unpacked the range into a misspelled variable
    # (`b_sizes_range`), so `b_bsizes_range` was never set. Both sides now
    # share one helper, eliminating the copy-paste that caused the bug.
    a_bsizes_name, a_bsizes_desc, a_bsizes_node, a_bsizes_range = (
        _bsizes_operand(a_block_sizes, 'a_bsizes'))
    b_bsizes_name, b_bsizes_desc, b_bsizes_node, b_bsizes_range = (
        _bsizes_operand(b_block_sizes, 'b_bsizes'))

    if len(arra.shape) == 2 and len(arrb.shape) == 2:  # Gemm
        from dace.libraries.pblas.nodes.pgemm import Pgemm
        # NOTE: `gk` is only defined when `shape` has three entries.
        tasklet = Pgemm("__DistrMatMult__", gm, gn, gk)
        m = arra.shape[0]
        n = arrb.shape[-1]
        out = sdfg.add_temp_transient((m, n), dtype=arra.dtype)
    elif len(arra.shape) == 2 and len(arrb.shape) == 1:  # Gemv
        from dace.libraries.pblas.nodes.pgemv import Pgemv
        tasklet = Pgemv("__DistrMatVecMult__", m=gm, n=gn)
        if c_block_sizes:
            m = c_block_sizes[0]
        else:
            m = arra.shape[0]
        out = sdfg.add_temp_transient((m, ), dtype=arra.dtype)
    elif len(arra.shape) == 1 and len(arrb.shape) == 2:  # Gemv transposed
        # Swap a and b
        opa, opb = opb, opa
        arra, arrb = arrb, arra
        from dace.libraries.pblas.nodes.pgemv import Pgemv
        tasklet = Pgemv("__DistrMatVecMult__", transa='T', m=gm, n=gn)
        if c_block_sizes:
            n = c_block_sizes[0]
        else:
            n = arra.shape[1]
        out = sdfg.add_temp_transient((n, ), dtype=arra.dtype)

    anode = state.add_read(opa)
    bnode = state.add_read(opb)
    cnode = state.add_write(out[0])

    if a_bsizes_range:
        a_bsizes_mem = Memlet.simple(a_bsizes_name, a_bsizes_range)
    else:
        a_bsizes_mem = Memlet.from_array(a_bsizes_name, a_bsizes_desc)
    if b_bsizes_range:
        b_bsizes_mem = Memlet.simple(b_bsizes_name, b_bsizes_range)
    else:
        b_bsizes_mem = Memlet.from_array(b_bsizes_name, b_bsizes_desc)

    state.add_edge(anode, None, tasklet, '_a', Memlet.from_array(opa, arra))
    state.add_edge(bnode, None, tasklet, '_b', Memlet.from_array(opb, arrb))
    state.add_edge(a_bsizes_node, None, tasklet, '_a_block_sizes',
                   a_bsizes_mem)
    state.add_edge(b_bsizes_node, None, tasklet, '_b_block_sizes',
                   b_bsizes_mem)
    state.add_edge(tasklet, '_c', cnode, None, Memlet.from_array(*out))

    return out[0]
def _elementwise(sdfg: SDFG,
                 state: SDFGState,
                 func: str,
                 in_array: str,
                 out_array=None):
    """Apply a lambda function to each element in the input.

    :param func: Source string of a Python lambda with exactly one argument.
    :param in_array: Name of the input array.
    :param out_array: Optional output array name; a transient with the
        input's shape/dtype is created when omitted.
    :return: Name of the output array.
    :raises SyntaxError: If ``func`` cannot be parsed as a one-argument
        lambda.
    """
    inparr = sdfg.arrays[in_array]
    restype = sdfg.arrays[in_array].dtype

    if out_array is None:
        out_array, outarr = sdfg.add_temp_transient(inparr.shape, restype,
                                                    inparr.storage)
    else:
        outarr = sdfg.arrays[out_array]

    func_ast = ast.parse(func)
    try:
        lambda_ast = func_ast.body[0].value
        if len(lambda_ast.args.args) != 1:
            # BUGFIX: was `lambda_ast.args.arrgs`, which raised
            # AttributeError and swallowed this message behind the generic
            # "Could not parse func" error below.
            raise SyntaxError(
                "Expected lambda with one arg, but {} has {}".format(
                    func, len(lambda_ast.args.args)))
        arg = lambda_ast.args.args[0].arg
        body = astutils.unparse(lambda_ast.body)
    except AttributeError:
        raise SyntaxError("Could not parse func {}".format(func))

    code = "__out = {}".format(body)

    num_elements = reduce(lambda x, y: x * y, inparr.shape)
    if num_elements == 1:
        # Single element: a lone tasklet suffices.
        inp = state.add_read(in_array)
        out = state.add_write(out_array)
        tasklet = state.add_tasklet("_elementwise_", {arg}, {'__out'}, code)
        state.add_edge(inp, None, tasklet, arg,
                       Memlet.from_array(in_array, inparr))
        state.add_edge(tasklet, '__out', out, None,
                       Memlet.from_array(out_array, outarr))
    else:
        # Elementwise map over the full shape; the same index expression
        # addresses input and output.
        index = ','.join('__i%d' % i for i in range(len(inparr.shape)))
        state.add_mapped_tasklet(
            name="_elementwise_",
            map_ranges={
                '__i%d' % i: '0:%s' % n
                for i, n in enumerate(inparr.shape)
            },
            inputs={arg: Memlet.simple(in_array, index)},
            code=code,
            outputs={'__out': Memlet.simple(out_array, index)},
            external_edges=True)

    return out_array
def apply(self, state: SDFGState, sdfg: SDFG) -> nodes.AccessNode:
    """Reroute accesses to/from ``self.access`` through newly created
    streams, building one read/write tasklet-and-map component per unique
    canonical access pattern.

    ``expr_index == 0`` streamifies reads (outgoing edges of the access
    node); otherwise writes (incoming edges).

    :return: The stream access nodes created for the components.
        NOTE(review): this is a list, despite the ``AccessNode``
        annotation — confirm against callers.
    """
    dnode: nodes.AccessNode = self.access
    if self.expr_index == 0:
        edges = state.out_edges(dnode)
    else:
        edges = state.in_edges(dnode)

    # To understand how many components we need to create, all map ranges
    # throughout memlet paths must match exactly. We thus create a
    # dictionary of unique ranges
    mapping: Dict[Tuple[subsets.Range],
                  List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    ranges = {}
    for edge in edges:
        mpath = state.memlet_path(edge)
        ranges[edge] = _collect_map_ranges(state, mpath)
        mapping[tuple(r[1] for r in ranges[edge])].append(edge)

    # Collect all edges with the same memory access pattern
    components_to_create: Dict[
        Tuple[symbolic.SymbolicType],
        List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    for edges_with_same_range in mapping.values():
        for edge in edges_with_same_range:
            # Get memlet path and innermost edge
            mpath = state.memlet_path(edge)
            innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index ==
                                           0 else mpath[0])

            # Store memlets of the same access in the same component
            expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
            components_to_create[expr].append((innermost_edge, edge))
    components = list(components_to_create.values())

    # Split out components that have dependencies between them to avoid
    # deadlocks
    if self.expr_index == 0:
        ccs_to_add = []
        for i, component in enumerate(components):
            edges_to_remove = set()
            for cedge in component:
                # If another edge of the same component can reach this
                # edge's destination, sharing a component could deadlock;
                # isolate this edge into its own component.
                if any(
                        nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                        for o in component if o is not cedge):
                    ccs_to_add.append([cedge])
                    edges_to_remove.add(cedge)
            if edges_to_remove:
                components[i] = [
                    c for c in component if c not in edges_to_remove
                ]
        components.extend(ccs_to_add)
    # End of split

    desc = sdfg.arrays[dnode.data]

    # Create new streams of shape 1
    streams = {}
    mpaths = {}
    for edge in edges:
        name, newdesc = sdfg.add_stream(dnode.data,
                                        desc.dtype,
                                        buffer_size=self.buffer_size,
                                        storage=self.storage,
                                        transient=True,
                                        find_new_name=True)
        streams[edge] = name
        mpath = state.memlet_path(edge)
        mpaths[edge] = mpath

        # Replace memlets in path with stream access
        for e in mpath:
            e.data = mm.Memlet(data=name,
                               subset='0',
                               other_subset=e.data.other_subset)
            # Nested SDFGs on the path must also have the array replaced
            # by a stream internally.
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)

        # Replace access node and memlet tree with one access
        if self.expr_index == 0:
            replacement = state.add_read(name)
            state.remove_edge(edge)
            state.add_edge(replacement, edge.src_conn, edge.dst,
                           edge.dst_conn, edge.data)
        else:
            replacement = state.add_write(name)
            state.remove_edge(edge)
            state.add_edge(edge.src, edge.src_conn, replacement,
                           edge.dst_conn, edge.data)

    # Make read/write components
    ionodes = []
    for component in components:

        # Pick the first edge as the edge to make the component from
        innermost_edge, outermost_edge = component[0]
        mpath = mpaths[outermost_edge]
        mapname = streams[outermost_edge]
        innermost_edge.data.other_subset = None

        # Get edge data and streams
        if self.expr_index == 0:
            opname = 'read'
            path = [e.dst for e in mpath[:-1]]
            rmemlets = [(dnode, '__inp', innermost_edge.data)]
            wmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_write(name)
                ionodes.append(ionode)
                wmemlets.append(
                    (ionode, '__out%d' % i, mm.Memlet(data=name,
                                                      subset='0')))
            code = '\n'.join('__out%d = __inp' % i
                             for i in range(len(component)))
        else:
            # More than one input stream might mean a data race, so we only
            # address the first one in the tasklet code
            if len(component) > 1:
                warnings.warn(
                    f'More than one input found for the same index for {dnode.data}'
                )
            opname = 'write'
            path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
            wmemlets = [(dnode, '__out', innermost_edge.data)]
            rmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_read(name)
                ionodes.append(ionode)
                rmemlets.append(
                    (ionode, '__inp%d' % i, mm.Memlet(data=name,
                                                      subset='0')))
            code = '__out = __inp0'

        # Create map structure for read/write component
        maps = []
        for entry in path:
            map: nodes.Map = entry.map
            maps.append(
                state.add_map(f'__s{opname}_{mapname}',
                              [(p, r)
                               for p, r in zip(map.params, map.range)],
                              map.schedule))
        tasklet = state.add_tasklet(
            f'{opname}_{mapname}',
            {m[1]
             for m in rmemlets},
            {m[1]
             for m in wmemlets},
            code,
        )
        for node, cname, memlet in rmemlets:
            state.add_memlet_path(node,
                                  *(me for me, _ in maps),
                                  tasklet,
                                  dst_conn=cname,
                                  memlet=memlet)
        for node, cname, memlet in wmemlets:
            state.add_memlet_path(tasklet,
                                  *(mx for _, mx in reversed(maps)),
                                  node,
                                  src_conn=cname,
                                  memlet=memlet)

    return ionodes
def apply(self, state: SDFGState, sdfg: SDFG) -> nodes.AccessNode:
    """Reroute accesses to/from ``self.access`` through newly created
    streams, optionally inserting a width-conversion "gearbox" pair when
    ``self.use_memory_buffering`` is set (vectorizing the global-memory
    side of the stream).

    ``expr_index == 0`` streamifies reads (outgoing edges of the access
    node); otherwise writes (incoming edges).

    :return: The stream access nodes created for the components.
        NOTE(review): this is a list, despite the ``AccessNode``
        annotation — confirm against callers.
    """
    dnode: nodes.AccessNode = self.access
    if self.expr_index == 0:
        edges = state.out_edges(dnode)
    else:
        edges = state.in_edges(dnode)

    # To understand how many components we need to create, all map ranges
    # throughout memlet paths must match exactly. We thus create a
    # dictionary of unique ranges
    mapping: Dict[Tuple[subsets.Range],
                  List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    ranges = {}
    for edge in edges:
        mpath = state.memlet_path(edge)
        ranges[edge] = _collect_map_ranges(state, mpath)
        mapping[tuple(r[1] for r in ranges[edge])].append(edge)

    # Collect all edges with the same memory access pattern
    components_to_create: Dict[
        Tuple[symbolic.SymbolicType],
        List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    for edges_with_same_range in mapping.values():
        for edge in edges_with_same_range:
            # Get memlet path and innermost edge
            mpath = state.memlet_path(edge)
            innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index ==
                                           0 else mpath[0])

            # Store memlets of the same access in the same component
            expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
            components_to_create[expr].append((innermost_edge, edge))
    components = list(components_to_create.values())

    # Split out components that have dependencies between them to avoid
    # deadlocks
    if self.expr_index == 0:
        ccs_to_add = []
        for i, component in enumerate(components):
            edges_to_remove = set()
            for cedge in component:
                # If another edge of the same component can reach this
                # edge's destination, sharing a component could deadlock;
                # isolate this edge into its own component.
                if any(
                        nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                        for o in component if o is not cedge):
                    ccs_to_add.append([cedge])
                    edges_to_remove.add(cedge)
            if edges_to_remove:
                components[i] = [
                    c for c in component if c not in edges_to_remove
                ]
        components.extend(ccs_to_add)
    # End of split

    desc = sdfg.arrays[dnode.data]

    # Create new streams of shape 1
    streams = {}
    mpaths = {}
    for edge in edges:

        if self.use_memory_buffering:

            arrname = str(self.access)

            # Add gearbox
            total_size = edge.data.volume
            # Number of elements packed into one vectorized access.
            vector_size = int(self.memory_buffering_target_bytes /
                              desc.dtype.bytes)

            if not is_int(sdfg.arrays[dnode.data].shape[-1]):
                warnings.warn(
                    "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                    .format(sym=sdfg.arrays[dnode.data].shape[-1],
                            vec=vector_size))

            for i in sdfg.arrays[dnode.data].strides:
                if not is_int(i):
                    warnings.warn(
                        "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                        .format(sym=i, vec=vector_size))

            # NOTE(review): `edges` is reassigned while being iterated
            # here — presumably intentional to refresh the edge list, but
            # verify.
            if self.expr_index == 0:  # Read
                edges = state.out_edges(dnode)
                gearbox_input_type = dtypes.vector(desc.dtype, vector_size)
                gearbox_output_type = desc.dtype
                gearbox_read_volume = total_size / vector_size
                gearbox_write_volume = total_size
            else:  # Write
                edges = state.in_edges(dnode)
                gearbox_input_type = desc.dtype
                gearbox_output_type = dtypes.vector(desc.dtype, vector_size)
                gearbox_read_volume = total_size
                gearbox_write_volume = total_size / vector_size

            input_gearbox_name, input_gearbox_newdesc = sdfg.add_stream(
                "gearbox_input",
                gearbox_input_type,
                buffer_size=self.buffer_size,
                storage=self.storage,
                transient=True,
                find_new_name=True)

            output_gearbox_name, output_gearbox_newdesc = sdfg.add_stream(
                "gearbox_output",
                gearbox_output_type,
                buffer_size=self.buffer_size,
                storage=self.storage,
                transient=True,
                find_new_name=True)

            read_to_gearbox = state.add_read(input_gearbox_name)
            write_from_gearbox = state.add_write(output_gearbox_name)

            gearbox = Gearbox(total_size / vector_size)

            state.add_node(gearbox)

            state.add_memlet_path(read_to_gearbox,
                                  gearbox,
                                  dst_conn="from_memory",
                                  memlet=Memlet(
                                      input_gearbox_name + "[0]",
                                      volume=gearbox_read_volume))
            state.add_memlet_path(gearbox,
                                  write_from_gearbox,
                                  src_conn="to_kernel",
                                  memlet=Memlet(
                                      output_gearbox_name + "[0]",
                                      volume=gearbox_write_volume))

            # The side of the gearbox facing the kernel becomes the
            # stream used by the component; the other side replaces the
            # memory access.
            if self.expr_index == 0:
                streams[edge] = input_gearbox_name
                name = output_gearbox_name
                newdesc = output_gearbox_newdesc
            else:
                streams[edge] = output_gearbox_name
                name = input_gearbox_name
                newdesc = input_gearbox_newdesc

        else:
            # Qualify name to avoid name clashes if memory interfaces are not decoupled for Xilinx
            stream_name = "stream_" + dnode.data

            name, newdesc = sdfg.add_stream(stream_name,
                                            desc.dtype,
                                            buffer_size=self.buffer_size,
                                            storage=self.storage,
                                            transient=True,
                                            find_new_name=True)
            streams[edge] = name

            # Add these such that we can easily use output_gearbox_name and input_gearbox_name without using if statements
            output_gearbox_name = name
            input_gearbox_name = name

        mpath = state.memlet_path(edge)
        mpaths[edge] = mpath

        # Replace memlets in path with stream access
        for e in mpath:
            e.data = mm.Memlet(data=name,
                               subset='0',
                               other_subset=e.data.other_subset)
            # Nested SDFGs on the path must also have the array replaced
            # by a stream internally.
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)

        # Replace access node and memlet tree with one access
        if self.expr_index == 0:
            replacement = state.add_read(output_gearbox_name)
            state.remove_edge(edge)
            state.add_edge(replacement, edge.src_conn, edge.dst,
                           edge.dst_conn, edge.data)
        else:
            replacement = state.add_write(input_gearbox_name)
            state.remove_edge(edge)
            state.add_edge(edge.src, edge.src_conn, replacement,
                           edge.dst_conn, edge.data)

    if self.use_memory_buffering:

        arrname = str(self.access)
        vector_size = int(self.memory_buffering_target_bytes /
                          desc.dtype.bytes)

        # Vectorize access to global array.
        dtype = sdfg.arrays[arrname].dtype
        sdfg.arrays[arrname].dtype = dtypes.vector(dtype, vector_size)
        new_shape = list(sdfg.arrays[arrname].shape)
        contigidx = sdfg.arrays[arrname].strides.index(1)
        new_shape[contigidx] /= vector_size
        try:
            # Symbolic shapes stay symbolic; integer shapes are cast back.
            new_shape[contigidx] = int(new_shape[contigidx])
        except TypeError:
            pass
        sdfg.arrays[arrname].shape = new_shape

        # Change strides
        new_strides: List = list(sdfg.arrays[arrname].strides)

        for i in range(len(new_strides)):
            if i == len(new_strides
                        ) - 1:  # Skip last dimension since it is always 1
                continue
            new_strides[i] = new_strides[i] / vector_size
        sdfg.arrays[arrname].strides = new_strides

        post_state = get_post_state(sdfg, state)

        if post_state != None:
            # Change subset in the post state such that the correct amount of memory is copied back from the device
            for e in post_state.edges():
                if e.data.data == self.access.data:
                    new_subset = list(e.data.subset)
                    i, j, k = new_subset[-1]
                    new_subset[-1] = (i, (j + 1) / vector_size - 1, k)
                    e.data = mm.Memlet(data=str(e.src),
                                       subset=subsets.Range(new_subset))

    # Make read/write components
    ionodes = []
    for component in components:

        # Pick the first edge as the edge to make the component from
        innermost_edge, outermost_edge = component[0]
        mpath = mpaths[outermost_edge]
        mapname = streams[outermost_edge]
        innermost_edge.data.other_subset = None

        # Get edge data and streams
        if self.expr_index == 0:
            opname = 'read'
            path = [e.dst for e in mpath[:-1]]
            rmemlets = [(dnode, '__inp', innermost_edge.data)]
            wmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_write(name)
                ionodes.append(ionode)
                wmemlets.append(
                    (ionode, '__out%d' % i, mm.Memlet(data=name,
                                                      subset='0')))
            code = '\n'.join('__out%d = __inp' % i
                             for i in range(len(component)))
        else:
            # More than one input stream might mean a data race, so we only
            # address the first one in the tasklet code
            if len(component) > 1:
                warnings.warn(
                    f'More than one input found for the same index for {dnode.data}'
                )
            opname = 'write'
            path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
            wmemlets = [(dnode, '__out', innermost_edge.data)]
            rmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_read(name)
                ionodes.append(ionode)
                rmemlets.append(
                    (ionode, '__inp%d' % i, mm.Memlet(data=name,
                                                      subset='0')))
            code = '__out = __inp0'

        # Create map structure for read/write component
        maps = []
        for entry in path:
            map: nodes.Map = entry.map

            # NOTE: `ranges` here shadows the earlier dict of map ranges;
            # from this point on it is the list of this map's ranges.
            ranges = [(p, (r[0], r[1], r[2]))
                      for p, r in zip(map.params, map.range)]

            # Change ranges of map
            if self.use_memory_buffering:

                # Find edges from/to map
                edge_subset = [
                    a_tuple[0]
                    for a_tuple in list(innermost_edge.data.subset)
                ]

                # Change range of map
                if isinstance(edge_subset[-1], symbol) and str(
                        edge_subset[-1]) == map.params[-1]:

                    if not is_int(ranges[-1][1][1]):

                        warnings.warn(
                            "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                            .format(sym=ranges[-1][1][1].args[1],
                                    vec=vector_size))

                    # Shrink the innermost trip count by the vector width.
                    ranges[-1] = (ranges[-1][0],
                                  (ranges[-1][1][0],
                                   (ranges[-1][1][1] + 1) / vector_size - 1,
                                   ranges[-1][1][2]))

                elif isinstance(edge_subset[-1], sympy.core.add.Add):

                    for arg in edge_subset[-1].args:
                        if isinstance(
                                arg,
                                symbol) and str(arg) == map.params[-1]:

                            if not is_int(ranges[-1][1][1]):

                                warnings.warn(
                                    "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                    .format(sym=ranges[-1][1][1].args[1],
                                            vec=vector_size))

                            # Shrink the innermost trip count by the
                            # vector width.
                            ranges[-1] = (ranges[-1][0], (
                                ranges[-1][1][0],
                                (ranges[-1][1][1] + 1) / vector_size - 1,
                                ranges[-1][1][2]))

            maps.append(
                state.add_map(f'__s{opname}_{mapname}', ranges,
                              map.schedule))
        tasklet = state.add_tasklet(
            f'{opname}_{mapname}',
            {m[1]
             for m in rmemlets},
            {m[1]
             for m in wmemlets},
            code,
        )
        for node, cname, memlet in rmemlets:
            state.add_memlet_path(node,
                                  *(me for me, _ in maps),
                                  tasklet,
                                  dst_conn=cname,
                                  memlet=memlet)
        for node, cname, memlet in wmemlets:
            state.add_memlet_path(tasklet,
                                  *(mx for _, mx in reversed(maps)),
                                  node,
                                  src_conn=cname,
                                  memlet=memlet)

    return ionodes