def make_read_col():
    """Build the ``spmv_read_col`` SDFG.

    ``make_iteration_space`` supplies three states; only the middle (body)
    state is populated here. In it, a tasklet reads ``A_col_mem`` at
    ``row_begin + c`` and writes the value to the ``col_pipe`` stream.

    :return: the constructed SDFG.
    """
    sdfg = SDFG("spmv_read_col")
    _, loop_body, _ = make_iteration_space(sdfg)

    col_mem = loop_body.add_array("A_col_mem", (nnz, ), itype, storage=StorageType.FPGA_Global)
    col_stream = loop_body.add_stream("col_pipe", itype, storage=StorageType.FPGA_Local)
    reader = loop_body.add_tasklet("read_col", {"col_in"}, {"col_out"}, "col_out = col_in[row_begin + c]")

    # The tasklet does its own indexing, so it is handed the full 0:nnz range.
    loop_body.add_memlet_path(col_mem, reader, dst_conn="col_in", memlet=Memlet.simple(col_mem, "0:nnz"))
    loop_body.add_memlet_path(reader, col_stream, src_conn="col_out", memlet=Memlet.simple(col_stream, "0"))

    return sdfg
def make_sdfg(dtype):
    """Build the ``mpi_reduce`` SDFG.

    A single state connects the ``inbuf`` and ``root`` arrays to a Reduce
    library node, which writes ``outbuf``.

    :param dtype: element type of ``inbuf`` and ``outbuf``.
    :return: the constructed SDFG.
    """
    n = dace.symbol("n")

    sdfg = dace.SDFG("mpi_reduce")
    dataflow = sdfg.add_state("dataflow")

    # Data containers: two length-n buffers and a one-element int32 root.
    sdfg.add_array("inbuf", [n], dtype, transient=False)
    sdfg.add_array("outbuf", [n], dtype, transient=False)
    sdfg.add_array("root", [1], dace.dtypes.int32, transient=False)

    src = dataflow.add_access("inbuf")
    dst = dataflow.add_access("outbuf")
    root_access = dataflow.add_access("root")
    reduce_lib = mpi.nodes.reduce.Reduce("reduce")

    dataflow.add_memlet_path(src,
                             reduce_lib,
                             dst_conn="_inbuffer",
                             memlet=Memlet.simple(src, "0:n", num_accesses=n))
    dataflow.add_memlet_path(root_access,
                             reduce_lib,
                             dst_conn="_root",
                             memlet=Memlet.simple(root_access, "0:1", num_accesses=1))
    dataflow.add_memlet_path(reduce_lib,
                             dst,
                             src_conn="_outbuffer",
                             memlet=Memlet.simple(dst, "0:n", num_accesses=n))

    return sdfg
def test_nested_symbol_type():
    """Check that a float32 symbol ``s`` keeps its value inside a nested SDFG.

    The inner SDFG writes ``s`` to its ``output`` array; the outer SDFG
    forwards that to ``data``. The compiled program is called with
    ``s = 1.5`` and ``data[0]`` must come back as 1.5.
    """
    # Inner SDFG: a tasklet with no inputs that emits the symbol value.
    inner = dace.SDFG("test_nested_symbol_type")
    inner_state = inner.add_state("test_state")
    inner.add_symbol("s", dace.float32)
    inner.add_array('output', shape=[1], dtype=dace.float32)
    inner_out = inner_state.add_write('output')
    emit = inner_state.add_tasklet('bugs', [], ['out'], 'out = s')
    inner_state.add_memlet_path(emit, inner_out, src_conn='out', memlet=Memlet.simple(inner_out.data, "0"))

    # Outer SDFG: nests the inner one and routes its output to 'data'.
    outer = dace.SDFG("nested_symbol_type")
    outer_state = outer.add_state("outer_state")
    outer.add_symbol("s", dace.float32)
    outer.add_array('data', shape=[1], dtype=dace.float32)
    outer_out = outer_state.add_write('data')
    nested_node = outer_state.add_nested_sdfg(inner, outer, {}, {'output'})
    outer_state.add_memlet_path(nested_node, outer_out, src_conn='output', memlet=Memlet.simple(outer_out.data, "0"))

    program = outer.compile()

    res = np.zeros(1, dtype=np.float32)
    program(data=res, s=np.float32(1.5))

    print("res:", res[0])
    assert res[0] == np.float32(1.5)
def _reduce(sdfg: SDFG,
            state: SDFGState,
            redfunction: Callable[[Any, Any], Any],
            in_array: str,
            out_array=None,
            axis=None,
            identity=None):
    """Add a reduce node between ``in_array`` and an output array.

    :param sdfg: SDFG that owns the arrays.
    :param state: state that receives the read, reduce and write nodes.
    :param redfunction: reduction function passed to ``state.add_reduce``.
    :param in_array: input array expression; used both as the key into
                     ``sdfg.arrays`` and as the expression parsed for the
                     input subset.
    :param out_array: output array expression. If None, a temporary
                      transient is created to hold the result.
    :param axis: axis or axes to reduce over; a single value is wrapped in
                 a tuple and every entry is converted with
                 ``pystr_to_symbolic``. None is passed through unchanged.
    :param identity: identity value passed to ``state.add_reduce``.
    :return: the name of the new transient when ``out_array`` is None,
             otherwise an empty list.
    """
    # Normalise the axes first; both output modes need the same form.
    if axis is not None:
        if not isinstance(axis, (tuple, list)):
            axis = (axis, )
        axis = tuple(pystr_to_symbolic(a) for a in axis)

    # Input side is identical regardless of how the output is provided.
    source = in_array
    source_desc = sdfg.arrays[source]
    input_subset = parse_memlet_subset(source_desc, ast.parse(in_array).body[0].value, {})
    input_memlet = Memlet.simple(source, input_subset)

    creates_output = out_array is None
    if creates_output:
        if axis is None:
            result_shape = [1]
        else:
            # Drop the reduced dimensions from a copy of the input subset.
            remaining = copy.deepcopy(input_subset)
            remaining.pop(axis)
            result_shape = remaining.size()
        target, target_desc = sdfg.add_temp_transient(result_shape, source_desc.dtype, source_desc.storage)
        output_memlet = Memlet.from_array(target, target_desc)
    else:
        target = out_array
        output_subset = parse_memlet_subset(sdfg.arrays[target], ast.parse(out_array).body[0].value, {})
        output_memlet = Memlet.simple(target, output_subset)

    # Create reduce subgraph
    read_node = state.add_read(source)
    reduce_node = state.add_reduce(redfunction, axis, identity)
    write_node = state.add_write(target)
    state.add_nedge(read_node, reduce_node, input_memlet)
    state.add_nedge(reduce_node, write_node, output_memlet)

    return target if creates_output else []
def make_read_x():
    """Build the ``spmv_read_x`` SDFG.

    In the body state returned by ``make_iteration_space``, a tasklet takes
    an index from the ``col_pipe`` stream, uses it to read ``x_mem``, and
    writes the value to the ``compute_pipe`` stream.

    :return: the constructed SDFG.
    """
    sdfg = SDFG("spmv_read_x")
    _, loop_body, _ = make_iteration_space(sdfg)

    x_array = loop_body.add_array("x_mem", (W, ), dtype, storage=StorageType.FPGA_Global)
    index_stream = loop_body.add_stream("col_pipe", itype, storage=StorageType.FPGA_Local)
    value_stream = loop_body.add_stream("compute_pipe", dtype, storage=StorageType.FPGA_Local)
    gather = loop_body.add_tasklet("read_x", {"x_in", "col_in"}, {"x_out"}, "x_out = x_in[col_in]")

    # x is indexed inside the tasklet, so the whole 0:W range is passed in.
    loop_body.add_memlet_path(x_array, gather, dst_conn="x_in", memlet=Memlet.simple(x_array, "0:W"))
    loop_body.add_memlet_path(index_stream, gather, dst_conn="col_in", memlet=Memlet.simple(index_stream, "0"))
    loop_body.add_memlet_path(gather, value_stream, src_conn="x_out", memlet=Memlet.simple(value_stream, "0"))

    return sdfg
def test():
    """Run two chained tasklets (``b = 5*a`` then ``d = 2*c``) inside one map.

    The input holds 5 everywhere, and the output is compared against
    ``10 * input``.
    """
    print('SDFG consecutive tasklet test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = dp.ndarray([N], dp.int32)
    host_out = dp.ndarray([N], dp.int32)
    host_in[:] = dp.int32(5)
    host_out[:] = dp.int32(0)

    # Construct SDFG
    mysdfg = SDFG('ctasklet')
    state = mysdfg.add_state()
    arr_a = state.add_array('A', [N], dp.int32)
    arr_b = state.add_array('B', [N], dp.int32)

    entry, exit_ = state.add_map('mymap', dict(i='0:N'))

    # First tasklet reads A[i]; its result feeds the second one directly
    # through an empty memlet.
    times_five = state.add_tasklet('mytasklet', {'a'}, {'b'}, 'b = 5*a')
    state.add_edge(entry, None, times_five, 'a', Memlet.simple(arr_a, 'i'))
    times_two = state.add_tasklet('mytasklet2', {'c'}, {'d'}, 'd = 2*c')
    state.add_edge(times_five, 'b', times_two, 'c', Memlet())
    state.add_edge(times_two, 'd', exit_, None, Memlet.simple(arr_b, 'i'))

    # Add outer edges
    state.add_edge(arr_a, None, entry, None, Memlet.simple(arr_a, '0:N'))
    state.add_edge(exit_, None, arr_b, None, Memlet.simple(arr_b, '0:N'))

    mysdfg(A=host_in, B=host_out, N=N)

    diff = np.linalg.norm(10 * host_in - host_out) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def test():
    """Multiply an int32 vector by 5 with a single mapped tasklet.

    The map and tasklet are created with ``add_mapped_tasklet``; the outer
    edges to and from the arrays are added explicitly afterwards.
    """
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = dp.ndarray([N], dp.int32)
    host_out = dp.ndarray([N], dp.int32)
    host_in[:] = dp.int32(5)
    host_out[:] = dp.int32(0)

    # Construct SDFG
    mysdfg = SDFG('mysdfg')
    state = mysdfg.add_state()
    # NOTE: The array names 'A' and 'B' are not reserved; the Python
    # variable names differ only to show that variable name != array name.
    arr_a = state.add_array('A', [N], dp.int32)
    arr_b = state.add_array('B', [N], dp.int32)

    # Easy way to add a tasklet. The explicit alternative is:
    #   map_entry, map_exit = state.add_map('mymap', dict(i='0:N'))
    #   tasklet = state.add_tasklet('mytasklet', {'a'}, {'b'}, 'b = 5*a')
    #   state.add_edge(map_entry, None, tasklet, 'a', Memlet.simple(A_, 'i'))
    #   state.add_edge(tasklet, 'b', map_exit, None, Memlet.simple(B_, 'i'))
    _, entry, exit_ = state.add_mapped_tasklet('mytasklet',
                                               dict(i='0:N'),
                                               dict(a=Memlet.simple(arr_a, 'i')),
                                               'b = 5*a',
                                               dict(b=Memlet.simple(arr_b, 'i')))

    # Add outer edges
    state.add_edge(arr_a, None, entry, None, Memlet.simple(arr_a, '0:N'))
    state.add_edge(exit_, None, arr_b, None, Memlet.simple(arr_b, '0:N'))

    mysdfg(A=host_in, B=host_out, N=N)

    diff = np.linalg.norm(5 * host_in - host_out) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def test_dynamic_sdfg_with_math_functions():
    """Apply ``math.exp`` element-wise in a mapped tasklet and compare with NumPy.

    The input memlet uses the index expression ``i % N``; the output is
    written at ``i``.
    """
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = np.random.rand(N.get()).astype(np.float32)
    host_out = dp.ndarray([N], dp.float32)
    host_out[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('mymodexp')
    state = mysdfg.add_state()
    arr_a = state.add_array('A', [N], dp.float32)
    arr_b = state.add_array('B', [N], dp.float32)

    # Easy way to add a tasklet
    _, entry, exit_ = state.add_mapped_tasklet('mytasklet',
                                               dict(i='0:N'),
                                               dict(a=Memlet.simple(arr_a, 'i % N')),
                                               'b = math.exp(a)',
                                               dict(b=Memlet.simple(arr_b, 'i')))

    # Add outer edges
    state.add_edge(arr_a, None, entry, None, Memlet.simple(arr_a, '0:N'))
    state.add_edge(exit_, None, arr_b, None, Memlet.simple(arr_b, '0:N'))

    mysdfg(A=host_in, B=host_out, N=N)

    diff = np.linalg.norm(np.exp(host_in) - host_out) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def make_sdfg(dtype):
    """Build the ``mpi_bcast`` SDFG.

    Array ``x`` is both the input and the output of a Bcast library node
    (two separate access nodes on the same array); ``root`` feeds the
    ``_root`` connector.

    :param dtype: element type of ``x``.
    :return: the constructed SDFG.
    """
    n = dace.symbol("n")

    sdfg = dace.SDFG("mpi_bcast")
    dataflow = sdfg.add_state("dataflow")

    sdfg.add_array("x", [n], dtype, transient=False)
    sdfg.add_array("root", [1], dace.dtypes.int32, transient=False)

    x_before = dataflow.add_access("x")
    x_after = dataflow.add_access("x")
    root_access = dataflow.add_access("root")
    bcast_lib = mpi.nodes.bcast.Bcast("bcast")

    dataflow.add_memlet_path(x_before,
                             bcast_lib,
                             dst_conn="_inbuffer",
                             memlet=Memlet.simple(x_before, "0:n", num_accesses=n))
    dataflow.add_memlet_path(root_access,
                             bcast_lib,
                             dst_conn="_root",
                             memlet=Memlet.simple(root_access, "0:1", num_accesses=1))
    # The output edge declares num_accesses=1 (the input edge uses n).
    dataflow.add_memlet_path(bcast_lib,
                             x_after,
                             src_conn="_outbuffer",
                             memlet=Memlet.simple(x_after, "0:n", num_accesses=1))

    return sdfg
def make_nested_vecAdd_sdfg(sdfg_name: str, dtype=dace.float32):
    '''
    Create a vector-addition SDFG whose single state delegates the
    computation to a nested SDFG produced by ``make_vecAdd_sdfg``.

    :param sdfg_name: name of the returned (parent) SDFG; the nested SDFG
                      is named ``sdfg_name + "_nested"``
    :param dtype: element type of the ``x``, ``y`` and ``z`` arrays
    :return: the parent SDFG
    '''
    size = dace.symbol("size")

    parent = dace.SDFG(sdfg_name)
    parent_state = parent.add_state("vecAdd_parent")

    # ---------- ----------
    # ACCESS NODES
    # ---------- ----------
    for array_name in ("x", "y", "z"):
        parent.add_array(array_name, [size], dtype=dtype)

    x_read = parent_state.add_read("x")
    y_read = parent_state.add_read("y")
    z_write = parent_state.add_write("z")

    # ---------- ----------
    # COMPUTE
    # ---------- ----------
    # Build the inner SDFG, nest it, then wire its connectors to the arrays.
    inner = make_vecAdd_sdfg(sdfg_name + "_nested", dtype)
    nested_node = parent_state.add_nested_sdfg(inner, parent, {"x", "y"}, {"z"})

    parent_state.add_memlet_path(x_read,
                                 nested_node,
                                 dst_conn="x",
                                 memlet=Memlet.simple(x_read, "0:size", num_accesses=size))
    parent_state.add_memlet_path(y_read,
                                 nested_node,
                                 dst_conn="y",
                                 memlet=Memlet.simple(y_read, "0:size", num_accesses=size))
    parent_state.add_memlet_path(nested_node,
                                 z_write,
                                 src_conn="z",
                                 memlet=Memlet.simple(z_write, "0:size", num_accesses=size))

    return parent
def _gather(pv: 'ProgramVisitor',
            sdfg: SDFG,
            state: SDFGState,
            in_buffer: str,
            out_buffer: str,
            root: Union[str, sp.Expr, Number] = 0):
    """Insert an MPI Gather library node from ``in_buffer`` to ``out_buffer``.

    :param pv: program visitor, forwarded to ``_define_local_scalar``.
    :param sdfg: SDFG that owns the buffers.
    :param state: state that receives the new nodes and edges.
    :param in_buffer: name of the input array in ``sdfg.arrays``.
    :param out_buffer: name of the output array in ``sdfg.arrays``.
    :param root: either the name of an existing array holding the root, or
                 a value/expression that is written into a new local int32
                 scalar by a small tasklet.
    :return: None.
    """
    from dace.libraries.mpi.nodes.gather import Gather

    gather_node = Gather('_Gather_')
    src_desc = sdfg.arrays[in_buffer]
    dst_desc = sdfg.arrays[out_buffer]
    src_access = state.add_read(in_buffer)
    dst_access = state.add_write(out_buffer)

    root_is_array = isinstance(root, str) and root in sdfg.arrays.keys()
    if root_is_array:
        root_access = state.add_read(root)
    else:
        # Materialise the root in a scalar that uses the input's storage.
        scalar_name = _define_local_scalar(pv, sdfg, state, dace.int32, src_desc.storage)
        root_access = state.add_access(scalar_name)
        init_root = state.add_tasklet('_set_root_', {}, {'__out'}, f'__out = {root}')
        state.add_edge(init_root, '__out', root_access, None, Memlet.simple(scalar_name, '0'))

    state.add_edge(src_access, None, gather_node, '_inbuffer', Memlet.from_array(in_buffer, src_desc))
    state.add_edge(root_access, None, gather_node, '_root', Memlet.simple(root_access.data, '0'))
    state.add_edge(gather_node, '_outbuffer', dst_access, None, Memlet.from_array(out_buffer, dst_desc))

    return None
def _Reduce(pv: 'ProgramVisitor',
            sdfg: SDFG,
            state: SDFGState,
            buffer: str,
            op: str,
            root: Union[str, sp.Expr, Number] = 0,
            grid: str = None):
    """Insert an MPI Reduce library node that reads and writes ``buffer``.

    :param pv: program visitor, forwarded to ``_define_local_scalar``.
    :param sdfg: SDFG that owns ``buffer``.
    :param state: state that receives the new nodes and edges.
    :param buffer: name of the array used as both input and output.
    :param op: reduction operation, passed to the ``Reduce`` node.
    :param root: either the name of an existing array holding the root, or
                 a value/expression that is written into a new local int32
                 scalar by a small tasklet.
    :param grid: passed to the ``Reduce`` node as its third argument.
    :return: None.
    """
    from dace.libraries.mpi.nodes.reduce import Reduce

    reduce_node = Reduce('_Reduce_', op, grid)
    buf_desc = sdfg.arrays[buffer]
    buf_read = state.add_read(buffer)
    buf_write = state.add_write(buffer)

    root_is_array = isinstance(root, str) and root in sdfg.arrays.keys()
    if root_is_array:
        root_access = state.add_read(root)
    else:
        # Materialise the root in a scalar that uses the buffer's storage.
        scalar_name = _define_local_scalar(pv, sdfg, state, dace.int32, buf_desc.storage)
        root_access = state.add_access(scalar_name)
        init_root = state.add_tasklet('_set_root_', {}, {'__out'}, f'__out = {root}')
        state.add_edge(init_root, '__out', root_access, None, Memlet.simple(scalar_name, '0'))

    state.add_edge(buf_read, None, reduce_node, '_inbuffer', Memlet.from_array(buffer, buf_desc))
    state.add_edge(root_access, None, reduce_node, '_root', Memlet.simple(root_access.data, '0'))
    state.add_edge(reduce_node, '_outbuffer', buf_write, None, Memlet.from_array(buffer, buf_desc))

    return None
def _assignop(sdfg: SDFG, state: SDFGState, op1: str, opcode: str, opname: str):
    """ Implements a general element-wise array assignment operator.

    A new transient with the shape, dtype and storage of ``op1`` is created
    and filled by a mapped tasklet computing ``__out = __in1``.

    :param sdfg: SDFG that owns ``op1``.
    :param state: state that receives the mapped tasklet.
    :param op1: name of the source array.
    :param opcode: if truthy, the output memlet gets the write-conflict
                   resolution ``lambda x, y: x <opcode> y``; otherwise it
                   is a plain write.
    :param opname: used to name the tasklet (``_<opname>_``).
    :return: the name of the new transient.
    """
    src_desc = sdfg.arrays[op1]
    result, _ = sdfg.add_temp_transient(src_desc.shape, src_desc.dtype, src_desc.storage)

    # One map parameter per dimension: "__i0,__i1,...".
    ndim = len(src_desc.shape)
    element = ','.join(['__i%d' % d for d in range(ndim)])

    if opcode:
        out_memlet = Memlet.simple(result, element, wcr_str='lambda x, y: x %s y' % opcode)
    else:
        out_memlet = Memlet.simple(result, element)

    map_ranges = {'__i%d' % d: '0:%s' % extent for d, extent in enumerate(src_desc.shape)}
    state.add_mapped_tasklet("_%s_" % opname,
                             map_ranges, {'__in1': Memlet.simple(op1, element)},
                             '__out = __in1', {'__out': out_memlet},
                             external_edges=True)

    return result
def nccl_send(pv: 'ProgramVisitor',
              sdfg: SDFG,
              state: SDFGState,
              in_buffer: str,
              peer: symbolic.SymbolicType = 0,
              group_handle: str = None):
    """Insert an NCCL ``Send`` library node that reads ``in_buffer``.

    :param pv: program visitor, forwarded to ``_define_local_scalar``.
    :param sdfg: SDFG that owns the buffer.
    :param state: state that receives the new nodes and edges.
    :param in_buffer: array name, or a ``(name, range)`` tuple to send only
                      that range.
    :param peer: passed to the ``Send`` node.
    :param group_handle: optional group-handle name. If it names an existing
                         array, the handle is both read and written by the
                         node; otherwise a new local int32 scalar is
                         created and only written.
    :return: an empty list.
    """
    inputs = {"_inbuffer"}
    outputs = set()

    uses_handle = isinstance(group_handle, str)
    if uses_handle:
        handle_exists = group_handle in sdfg.arrays.keys()
        if handle_exists:
            gh_name = group_handle
            gh_write = state.add_access(gh_name)
            gh_read = state.add_access(gh_name)
            inputs.add("_group_handle")
        else:
            gh_name = _define_local_scalar(pv, sdfg, state, dace.int32, dtypes.StorageType.GPU_Global)
            gh_write = state.add_access(gh_name)
            outputs.add("_group_handle")

    libnode = Send(inputs=inputs, outputs=outputs, peer=peer)

    if uses_handle:
        gh_memlet = Memlet.simple(gh_name, '0')
        if handle_exists:
            state.add_edge(gh_read, None, libnode, "_group_handle", gh_memlet)
        state.add_edge(libnode, "_group_handle", gh_write, None, gh_memlet)

    # Split an optional (name, range) tuple.
    if isinstance(in_buffer, tuple):
        in_name, in_range = in_buffer
    else:
        in_name, in_range = in_buffer, None

    desc = sdfg.arrays[in_name]

    # NOTE(review): this retypes a connector called '_buffer', but the only
    # names added above are '_inbuffer' and '_group_handle'. It looks like
    # a no-op unless Send defines '_buffer' itself -- confirm the intent.
    libnode.in_connectors = {
        name: (dtypes.pointer(desc.dtype) if name == '_buffer' else ctype)
        for name, ctype in libnode.in_connectors.items()
    }

    in_node = state.add_read(in_name)
    buf_mem = Memlet.simple(in_name, in_range) if in_range else Memlet.from_array(in_name, desc)
    state.add_edge(in_node, None, libnode, '_inbuffer', buf_mem)

    return []
def test():
    """Check that specialising ``N`` and ``M`` removes 'Dynamic' from the generated code.

    The SDFG copies the interior rows of ``A`` into a transient, applies
    ``math.exp`` in a map, and writes to ``B``. Code is generated before and
    after ``specialize``; then the SDFG is compiled, run, and compared with
    NumPy.
    """
    print('Constant specialization test')

    N = dp.symbol('N')
    M = dp.symbol('M')
    N.set(20)
    M.set(30)

    interior = '1:N-1,0:M'
    row_range = '1:N-1'
    col_range = '0:M'

    host_in = np.random.rand(N.get(), M.get()).astype(np.float32)
    host_out = dp.ndarray([N, M], dtype=dp.float32)
    host_out[:] = dp.float32(0)

    ##########################################################################
    spec_sdfg = SDFG('spectest')
    state = spec_sdfg.add_state()
    arr_a = state.add_array('A', [N, M], dp.float32)
    staging = state.add_transient('At', [N - 2, M], dp.float32)
    arr_b = state.add_array('B', [N, M], dp.float32)

    state.add_edge(arr_a, None, staging, None, Memlet.simple(arr_a, interior))
    _, entry, exit_ = state.add_mapped_tasklet('compute',
                                               dict(i=row_range, j=col_range),
                                               dict(a=Memlet.simple(staging, 'i-1,j')),
                                               'b = math.exp(a)',
                                               dict(b=Memlet.simple(arr_b, 'i,j')))
    state.add_edge(staging, None, entry, None, Memlet.simple(staging, interior))
    state.add_edge(exit_, None, arr_b, None, Memlet.simple(arr_b, interior))

    spec_sdfg.fill_scope_connectors()
    dp.propagate_memlets_sdfg(spec_sdfg)
    spec_sdfg.validate()
    ##########################################################################

    # Before specialisation the generated code must mention 'Dynamic' ...
    unspecialized = spec_sdfg.generate_code()
    assert 'Dynamic' in unspecialized[0].code

    # ... and afterwards it must not.
    spec_sdfg.specialize(dict(N=N, M=M))
    specialized = spec_sdfg.generate_code()
    assert 'Dynamic' not in specialized[0].code

    func = spec_sdfg.compile()
    func(A=host_in, B=host_out, N=N, M=M)

    expected = np.exp(host_in[1:(N.get() - 1), 0:M.get()])
    diff = np.linalg.norm(expected - host_out[1:-1, :]) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def make_nested_sdfg():
    """Build the ``vol_propagation_nested`` SDFG.

    State machine: ``assign`` -> ``guard`` (sets ``i = 0``); ``guard`` ->
    ``for`` while ``i < loop_bound``; ``for`` -> ``guard`` (``i = i+1``);
    ``guard`` -> ``endfor`` otherwise. ``assign`` copies the ``IN_bound``
    stream into the ``loop_bound`` scalar, and ``for`` writes ``IN_a[0]``
    to ``OUT_stream``.

    :return: the constructed SDFG.
    """
    sdfg = dace.SDFG('vol_propagation_nested')

    init_state = sdfg.add_state('assign')
    guard = sdfg.add_state('guard')
    body = sdfg.add_state('for')
    after = sdfg.add_state('endfor')

    # Loop control flow.
    continue_cond = CodeProperty.from_string('i < loop_bound', language=Language.Python)
    sdfg.add_edge(init_state, guard, InterstateEdge(assignments={'i': '0'}))
    sdfg.add_edge(guard, body, InterstateEdge(condition=continue_cond))
    sdfg.add_edge(body, guard, InterstateEdge(assignments={'i': 'i+1'}))
    exit_cond = CodeProperty.from_string('not (i < loop_bound)', language=Language.Python)
    sdfg.add_edge(guard, after, InterstateEdge(condition=exit_cond))

    # 'assign': read the loop bound from a stream into a scalar.
    bound_stream = init_state.add_stream('IN_bound', dace.int32, storage=StorageType.FPGA_Local)
    bound_scalar = init_state.add_scalar('loop_bound',
                                         dace.int32,
                                         transient=True,
                                         storage=StorageType.FPGA_Registers)
    init_state.add_memlet_path(bound_stream, bound_scalar, memlet=Memlet.simple(bound_scalar, '0'))

    # 'for': push the first element of IN_a to the output stream.
    a_array = body.add_array('IN_a', [N], dace.int32, storage=StorageType.FPGA_Global)
    result_stream = body.add_stream('OUT_stream', dace.int32, storage=StorageType.FPGA_Local)
    compute = body.add_tasklet('compute', {'_IN_a'}, {'_OUT_stream'}, '_OUT_stream = _IN_a[0]')
    body.add_memlet_path(a_array, compute, dst_conn='_IN_a', memlet=Memlet.simple(a_array, '0:N'))
    body.add_memlet_path(compute, result_stream, src_conn='_OUT_stream', memlet=Memlet.simple(result_stream, '0'))

    return sdfg
def nccl_recv(pv: 'ProgramVisitor',
              sdfg: SDFG,
              state: SDFGState,
              out_buffer: str,
              peer: symbolic.SymbolicType = 0,
              group_handle: str = None):
    """Insert an NCCL ``Recv`` library node that writes ``out_buffer``.

    :param pv: program visitor, forwarded to ``_define_local_scalar``.
    :param sdfg: SDFG that owns the buffer.
    :param state: state that receives the new nodes and edges.
    :param out_buffer: name of an array in ``sdfg.arrays``, or a
                       ``(name, range)`` tuple.
    :param peer: passed to the ``Recv`` node.
    :param group_handle: optional group-handle name. If it names an existing
                         array, the handle is both read and written by the
                         node; otherwise a new local int32 scalar is
                         created and only written.
    :return: an empty list.
    :raises ValueError: if ``out_buffer`` is neither a tuple nor the name of
                        an array in ``sdfg.arrays``.
    """
    inputs = set()
    outputs = {"_outbuffer"}

    uses_handle = isinstance(group_handle, str)
    if uses_handle:
        handle_exists = group_handle in sdfg.arrays.keys()
        if handle_exists:
            gh_name = group_handle
            gh_write = state.add_access(gh_name)
            gh_read = state.add_access(gh_name)
            inputs.add("_group_handle")
        else:
            gh_name = _define_local_scalar(pv, sdfg, state, dace.int32, dtypes.StorageType.GPU_Global)
            gh_write = state.add_access(gh_name)
            outputs.add("_group_handle")

    libnode = Recv(inputs=inputs, outputs=outputs, peer=peer)

    if uses_handle:
        gh_memlet = Memlet.simple(gh_name, '0')
        if handle_exists:
            state.add_edge(gh_read, None, libnode, "_group_handle", gh_memlet)
        state.add_edge(libnode, "_group_handle", gh_write, None, gh_memlet)

    # Resolve the destination name (and optional range) or reject the input.
    out_range = None
    if isinstance(out_buffer, tuple):
        out_name, out_range = out_buffer
    elif isinstance(out_buffer, str) and out_buffer in sdfg.arrays.keys():
        out_name = out_buffer
    else:
        raise ValueError("NCCL_Recv out_buffer must be an array, or a an array range tuple.")
    out_node = state.add_write(out_name)

    # NOTE(review): without an explicit range only index '0' is addressed,
    # whereas nccl_send uses Memlet.from_array for the whole buffer --
    # confirm that this asymmetry is intended.
    out_mem = Memlet.simple(out_name, out_range if out_range else '0')
    state.add_edge(libnode, '_outbuffer', out_node, None, out_mem)

    return []
def make_compute_sdfg():
    """Build the ``spmv_compute`` SDFG.

    In the body state, a nested SDFG (from ``make_compute_nested_sdfg``)
    consumes the ``a_pipe`` and ``x_pipe`` streams together with the
    ``b_buffer`` scalar and writes ``b_buffer`` back. In the post state,
    ``b_buffer`` is forwarded to the ``b_pipe`` stream.

    :return: the constructed SDFG.
    """
    sdfg = SDFG("spmv_compute")
    _, loop_body, epilogue = make_iteration_space(sdfg)

    a_stream = loop_body.add_stream("a_pipe", dtype, storage=StorageType.FPGA_Local)
    x_stream = loop_body.add_stream("x_pipe", dtype, storage=StorageType.FPGA_Local)
    # Two access nodes for the same scalar: one read, one written.
    acc_read = loop_body.add_scalar("b_buffer", dtype, transient=True, storage=StorageType.FPGA_Registers)
    acc_write = loop_body.add_scalar("b_buffer", dtype, transient=True, storage=StorageType.FPGA_Registers)

    inner = make_compute_nested_sdfg()
    compute = loop_body.add_nested_sdfg(inner, sdfg, {"a_in", "x_in", "b_in"}, {"b_out"})

    loop_body.add_memlet_path(a_stream, compute, dst_conn="a_in", memlet=Memlet.simple(a_stream, "0"))
    loop_body.add_memlet_path(acc_read, compute, dst_conn="b_in", memlet=Memlet.simple(acc_read, "0"))
    loop_body.add_memlet_path(x_stream, compute, dst_conn="x_in", memlet=Memlet.simple(x_stream, "0"))
    loop_body.add_memlet_path(compute, acc_write, src_conn="b_out", memlet=Memlet.simple(acc_write, "0"))

    # Post state: forward the scalar to the output stream.
    acc_final = epilogue.add_scalar("b_buffer", dtype, transient=True, storage=StorageType.FPGA_Registers)
    b_stream = epilogue.add_stream("b_pipe", dtype, storage=StorageType.FPGA_Local)
    epilogue.add_memlet_path(acc_final, b_stream, memlet=Memlet.simple(b_stream, "0"))

    return sdfg
def make_write_sdfg():
    """Build the ``spmv_write`` SDFG.

    State machine: ``begin`` -> ``entry`` (sets ``h = 0``); ``entry`` ->
    ``body`` while ``h < H``; ``body`` -> ``entry`` (``h = h + 1``);
    ``entry`` -> ``end`` when ``h >= H``. The body copies the ``b_pipe``
    stream into ``b_mem[h]``.

    :return: the constructed SDFG.
    """
    sdfg = SDFG("spmv_write")

    begin = sdfg.add_state("begin")
    guard = sdfg.add_state("entry")
    body = sdfg.add_state("body")
    end = sdfg.add_state("end")

    # Loop over h in [0, H).
    sdfg.add_edge(begin, guard, InterstateEdge(assignments={"h": "0"}))
    enter_cond = CodeProperty.from_string("h < H", language=Language.Python)
    sdfg.add_edge(guard, body, InterstateEdge(condition=enter_cond))
    leave_cond = CodeProperty.from_string("h >= H", language=Language.Python)
    sdfg.add_edge(guard, end, InterstateEdge(condition=leave_cond))
    sdfg.add_edge(body, guard, InterstateEdge(assignments={"h": "h + 1"}))

    result_stream = body.add_stream("b_pipe", dtype, storage=StorageType.FPGA_Local)
    b_array = body.add_array("b_mem", (H, ), dtype, storage=StorageType.FPGA_Global)
    body.add_memlet_path(result_stream, b_array, memlet=Memlet.simple(b_array, "h"))

    return sdfg
def _wait(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, request: str):
    """Insert an MPI ``Wait`` library node on ``request``.

    Two temporary one-element int32 transients are created to receive the
    node's ``_stat_source`` and ``_stat_tag`` outputs.

    :param pv: program visitor (not used in this function).
    :param sdfg: SDFG that owns the request array.
    :param state: state that receives the new nodes and edges.
    :param request: array name, or a ``(name, range)`` tuple to wait on only
                    that range.
    :return: None.
    """
    from dace.libraries.mpi.nodes.wait import Wait

    wait_node = Wait('_Wait_')

    # Split an optional (name, range) tuple.
    if isinstance(request, tuple):
        req_name, req_range = request
    else:
        req_name, req_range = request, None

    req_desc = sdfg.arrays[req_name]
    req_access = state.add_access(req_name)

    # Status outputs land in fresh temporaries.
    src_name, src_desc = sdfg.add_temp_transient([1], dtypes.int32)
    src_access = state.add_write(src_name)
    tag_name, tag_desc = sdfg.add_temp_transient([1], dtypes.int32)
    tag_access = state.add_write(tag_name)

    if req_range:
        req_memlet = Memlet.simple(req_name, req_range)
    else:
        req_memlet = Memlet.from_array(req_name, req_desc)

    state.add_edge(req_access, None, wait_node, '_request', req_memlet)
    state.add_edge(wait_node, '_stat_source', src_access, None, Memlet.from_array(src_name, src_desc))
    state.add_edge(wait_node, '_stat_tag', tag_access, None, Memlet.from_array(tag_name, tag_desc))

    return None
def parse_memlet(visitor, src: MemletType, dst: MemletType, defined_arrays_and_symbols: Dict[str, data.Data]):
    """Parse a ``src``/``dst`` pair into a local variable name and a memlet.

    A side counts as a local variable when it is an ``ast.Name`` whose name
    is not in ``defined_arrays_and_symbols``; otherwise it is parsed with
    ``ParseMemlet``.

    :param visitor: passed to ``ParseMemlet`` and to ``DaceSyntaxError``.
    :param src: source side of the memlet.
    :param dst: destination side of the memlet.
    :param defined_arrays_and_symbols: names that are not local variables.
    :return: ``(localvar, memlet)``, where the memlet is built from
             whichever side was parsed with ``ParseMemlet``.
    :raises DaceSyntaxError: if both sides are local variables.
    :raises NotImplementedError: if neither side is a local variable.
    """

    def _is_local(node):
        return isinstance(node, ast.Name) and rname(node) not in defined_arrays_and_symbols

    localvar = None
    src_parsed = None
    dst_parsed = None

    # The source is handled before the destination.
    if _is_local(src):
        localvar = rname(src)
    else:
        src_parsed = ParseMemlet(visitor, defined_arrays_and_symbols, src)

    if _is_local(dst):
        if localvar is not None:
            raise DaceSyntaxError(visitor, src, 'Memlet source and destination cannot both be local variables')
        localvar = rname(dst)
    else:
        dst_parsed = ParseMemlet(visitor, defined_arrays_and_symbols, dst)

    if src_parsed is not None and dst_parsed is not None:
        # Create two memlets
        raise NotImplementedError

    parsed = src_parsed if src_parsed is not None else dst_parsed
    memlet = Memlet.simple(parsed.name, parsed.subset, num_accesses=parsed.accesses, wcr_str=parsed.wcr)
    return localvar, memlet
def make_sdfg():
    """Build the ``vol_propagation`` SDFG.

    The single ``main`` state wraps the SDFG from ``make_nested_sdfg``. The
    nested node reads the ``A_in`` array and the ``bound_in`` stream and
    writes the ``out_stream`` stream. Both stream memlets use
    ``num_accesses=-1``.

    :return: the constructed SDFG.
    """
    sdfg = dace.SDFG('vol_propagation')
    for symbol_name in ('N', 'M'):
        sdfg.add_symbol(symbol_name, dace.int32)

    main = sdfg.add_state('main')

    a_array = main.add_array('A_in', [N], dace.int32, storage=StorageType.FPGA_Global)
    bound_stream = main.add_stream('bound_in', dace.int32, transient=True, storage=StorageType.FPGA_Local)
    result_stream = main.add_stream('out_stream', dace.int32, transient=True, storage=StorageType.FPGA_Local)

    nested_node = main.add_nested_sdfg(make_nested_sdfg(), sdfg, {'IN_a', 'IN_bound'}, {'OUT_stream'})

    main.add_memlet_path(a_array, nested_node, dst_conn='IN_a', memlet=Memlet.simple(a_array, '0:N'))
    main.add_memlet_path(bound_stream,
                         nested_node,
                         dst_conn='IN_bound',
                         memlet=Memlet.simple(bound_stream, '0', num_accesses=-1))
    main.add_memlet_path(nested_node,
                         result_stream,
                         src_conn='OUT_stream',
                         memlet=Memlet.simple(result_stream, '0', num_accesses=-1))

    return sdfg
def _simple_call(sdfg: SDFG, state: SDFGState, inpname: str, func: str, restype: dace.typeclass = None):
    """ Implements a simple call of the form `out = func(inp)`.

    The result goes into a new transient with the input's shape and
    storage. A one-element input gets a single tasklet; anything else gets
    an element-wise mapped tasklet.

    :param sdfg: SDFG that owns the input array.
    :param state: state that receives the tasklet.
    :param inpname: name of the input array.
    :param func: function name, inserted verbatim into the tasklet code.
    :param restype: dtype of the result; defaults to the input's dtype.
    :return: the name of the new transient.
    """
    in_desc = sdfg.arrays[inpname]
    if restype is None:
        restype = in_desc.dtype
    outname, out_desc = sdfg.add_temp_transient(in_desc.shape, restype, in_desc.storage)

    call_code = '__out = {f}(__inp)'.format(f=func)
    total = reduce(lambda x, y: x * y, in_desc.shape)

    if total == 1:
        # Single element: no map needed, connect the whole arrays directly.
        src = state.add_read(inpname)
        dst = state.add_write(outname)
        call = state.add_tasklet(func, {'__inp'}, {'__out'}, call_code)
        state.add_edge(src, None, call, '__inp', Memlet.from_array(inpname, in_desc))
        state.add_edge(call, '__out', dst, None, Memlet.from_array(outname, out_desc))
        return outname

    # General case: one map parameter per dimension.
    ndim = len(in_desc.shape)
    element = ','.join(['__i%d' % d for d in range(ndim)])
    state.add_mapped_tasklet(name=func,
                             map_ranges={'__i%d' % d: '0:%s' % extent
                                         for d, extent in enumerate(in_desc.shape)},
                             inputs={'__inp': Memlet.simple(inpname, element)},
                             code=call_code,
                             outputs={'__out': Memlet.simple(outname, element)},
                             external_edges=True)

    return outname
def _binop(sdfg: SDFG, state: SDFGState, op1: str, op2: str, opcode: str, opname: str, restype: dace.typeclass):
    """ Implements a general element-wise array binary operator.

    The output shape and per-operand index strings come from
    ``_broadcast_together``; the result is stored in a new transient that
    uses the storage of ``op1``.

    :param sdfg: SDFG that owns both operands.
    :param state: state that receives the mapped tasklet.
    :param op1: name of the left operand array.
    :param op2: name of the right operand array.
    :param opcode: operator text placed between the operands in the tasklet.
    :param opname: used to name the tasklet (``_<opname>_``).
    :param restype: dtype of the result transient.
    :return: the name of the new transient.
    """
    left_desc = sdfg.arrays[op1]
    right_desc = sdfg.arrays[op2]

    broadcast = _broadcast_together(left_desc.shape, right_desc.shape)
    out_shape, map_ranges, out_index, left_index, right_index = broadcast

    result, _ = sdfg.add_temp_transient(out_shape, restype, left_desc.storage)

    operands = {
        '__in1': Memlet.simple(op1, left_index),
        '__in2': Memlet.simple(op2, right_index),
    }
    state.add_mapped_tasklet("_%s_" % opname,
                             map_ranges,
                             operands,
                             '__out = __in1 %s __in2' % opcode, {'__out': Memlet.simple(result, out_index)},
                             external_edges=True)

    return result
def _unop(sdfg: SDFG, state: SDFGState, op1: str, opcode: str, opname: str):
    """ Implements a general element-wise array unary operator.

    A new transient with the shape, dtype and storage of ``op1`` receives
    ``<opcode> __in1`` for every element.

    :param sdfg: SDFG that owns ``op1``.
    :param state: state that receives the mapped tasklet.
    :param op1: name of the operand array.
    :param opcode: operator text placed before the operand in the tasklet.
    :param opname: used to name the tasklet (``_<opname>_``).
    :return: the name of the new transient.
    """
    src_desc = sdfg.arrays[op1]
    result, _ = sdfg.add_temp_transient(src_desc.shape, src_desc.dtype, src_desc.storage)

    # One map parameter per dimension: "__i0,__i1,...".
    ndim = len(src_desc.shape)
    element = ','.join(['__i%d' % d for d in range(ndim)])
    map_ranges = {'__i%d' % d: '0:%s' % extent for d, extent in enumerate(src_desc.shape)}

    state.add_mapped_tasklet("_%s_" % opname,
                             map_ranges, {'__in1': Memlet.simple(op1, element)},
                             '__out = %s __in1' % opcode, {'__out': Memlet.simple(result, element)},
                             external_edges=True)

    return result
def test():
    """Copy a sub-block between two arrays declared with offsets and strides.

    ``A`` is declared as 6x6 with offset [2, 3] and strides [N, 1]; ``B`` as
    3x2 with offset [-1, -1] and strides [3, 1]. A map over ``i in 1:4``,
    ``j in 1:3`` copies ``A[i, j]`` to ``B[i, j]``, and the result is
    checked as ``output[0:3, 0:2] == input[3:6, 4:6]``.
    """
    print('Multidimensional offset and stride test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = dp.ndarray([N, N], dp.float32)
    host_out = dp.ndarray([4, 3], dp.float32)
    host_in[:] = (np.random.rand(N.get(), N.get()) * 5).astype(dp.float32.type)
    host_out[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('offset_stride')
    state = mysdfg.add_state()
    arr_a = state.add_array('A', [6, 6], dp.float32, offset=[2, 3], strides=[N, 1], total_size=N * N)
    arr_b = state.add_array('B', [3, 2], dp.float32, offset=[-1, -1], strides=[3, 1], total_size=12)

    entry, exit_ = state.add_map('mymap', [('i', '1:4'), ('j', '1:3')])
    copy_task = state.add_tasklet('mytasklet', {'a'}, {'b'}, 'b = a')
    state.add_edge(entry, None, copy_task, 'a', Memlet.simple(arr_a, 'i,j'))
    state.add_edge(copy_task, 'b', exit_, None, Memlet.simple(arr_b, 'i,j'))

    # Add outer edges
    state.add_edge(arr_a, None, entry, None, Memlet.simple(arr_a, '1:4,1:3'))
    state.add_edge(exit_, None, arr_b, None, Memlet.simple(arr_b, '1:4,1:3'))

    mysdfg(A=host_in, B=host_out, N=N)

    diff = np.linalg.norm(host_out[0:3, 0:2] - host_in[3:6, 4:6]) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def make_sdfg(dtype):
    """Build the ``mpi_allreduce`` SDFG.

    A single state connects ``inbuf`` to an Allreduce library node, which
    writes ``outbuf``.

    :param dtype: element type of ``inbuf`` and ``outbuf``.
    :return: the constructed SDFG.
    """
    n = dace.symbol("n")

    sdfg = dace.SDFG("mpi_allreduce")
    dataflow = sdfg.add_state("dataflow")

    sdfg.add_array("inbuf", [n], dtype, transient=False)
    sdfg.add_array("outbuf", [n], dtype, transient=False)

    src = dataflow.add_access("inbuf")
    dst = dataflow.add_access("outbuf")
    allreduce_lib = mpi.nodes.allreduce.Allreduce("allreduce")

    dataflow.add_memlet_path(src,
                             allreduce_lib,
                             dst_conn="_inbuffer",
                             memlet=Memlet.simple(src, "0:n", num_accesses=n))
    dataflow.add_memlet_path(allreduce_lib,
                             dst,
                             src_conn="_outbuffer",
                             memlet=Memlet.simple(dst, "0:n", num_accesses=n))

    return sdfg
def nccl_reduce(pv: 'ProgramVisitor',
                sdfg: SDFG,
                state: SDFGState,
                redfunction: Callable[[Any, Any], Any],
                in_buffer: str,
                out_buffer: Union[str, None] = None,
                root: str = None,
                group_handle: str = None):
    """Insert an NCCL ``Reduce`` library node from ``in_buffer`` to ``out_buffer``.

    :param pv: program visitor, forwarded to ``_define_local_scalar``.
    :param sdfg: SDFG that owns the buffers.
    :param state: state that receives the new nodes and edges.
    :param redfunction: reduction function, passed to the node as ``wcr``.
    :param in_buffer: name of the input array.
    :param out_buffer: name of the output array; if None the operation is
                       in-place on ``in_buffer``.
    :param root: passed to the ``Reduce`` node.
    :param group_handle: optional group-handle name. If it names an existing
                         array, the handle is both read and written by the
                         node; otherwise a new local int32 scalar is
                         created and only written.
    :return: an empty list.
    """
    inputs = {"_inbuffer"}
    outputs = {"_outbuffer"}

    uses_handle = isinstance(group_handle, str)
    if uses_handle:
        handle_exists = group_handle in sdfg.arrays.keys()
        if handle_exists:
            gh_name = group_handle
            gh_write = state.add_access(gh_name)
            gh_read = state.add_access(gh_name)
            inputs.add("_group_handle")
        else:
            gh_name = _define_local_scalar(pv, sdfg, state, dace.int32, dtypes.StorageType.GPU_Global)
            gh_write = state.add_access(gh_name)
            outputs.add("_group_handle")

    libnode = Reduce(inputs=inputs, outputs=outputs, wcr=redfunction, root=root)

    if uses_handle:
        gh_memlet = Memlet.simple(gh_name, '0')
        if handle_exists:
            state.add_edge(gh_read, None, libnode, "_group_handle", gh_memlet)
        state.add_edge(libnode, "_group_handle", gh_write, None, gh_memlet)

    # If out_buffer is not specified, the operation will be in-place.
    destination = in_buffer if out_buffer is None else out_buffer

    # Add nodes
    src_access = state.add_read(in_buffer)
    dst_access = state.add_write(destination)

    # Connect nodes
    state.add_edge(src_access, None, libnode, '_inbuffer', Memlet(in_buffer))
    state.add_edge(libnode, '_outbuffer', dst_access, None, Memlet(destination))

    return []
def make_sdfg(dtype):
    """Build the ``mpi_allgather`` SDFG.

    A single state connects ``inA`` (length ``n``) to an Allgather library
    node, which writes ``outA`` (length ``n * p``).

    :param dtype: element type of ``inA`` and ``outA``.
    :return: the constructed SDFG.
    """
    n = dace.symbol("n")
    p = dace.symbol("p")

    sdfg = dace.SDFG("mpi_allgather")
    dataflow = sdfg.add_state("dataflow")

    sdfg.add_array("inA", [n], dtype, transient=False)
    sdfg.add_array("outA", [n * p], dtype, transient=False)

    src = dataflow.add_access("inA")
    dst = dataflow.add_access("outA")
    allgather_lib = mpi.nodes.allgather.Allgather("allgather")

    dataflow.add_memlet_path(src,
                             allgather_lib,
                             dst_conn="_inbuffer",
                             memlet=Memlet.simple(src, "0:n", num_accesses=n))
    # The output edge declares num_accesses=1 (the input edge uses n).
    dataflow.add_memlet_path(allgather_lib,
                             dst,
                             src_conn="_outbuffer",
                             memlet=Memlet.simple(dst, "0:n*p", num_accesses=1))

    return sdfg
def test():
    """Run two tasklets in one map, each reducing into its own scalar via WCR.

    Tasklet 1 accumulates the sum of ``A`` into ``s`` (``a+b``); tasklet 2
    accumulates the product into ``p`` (``a*b``). ``A`` holds 5 in all 20
    entries, so the expected results are ``5 * 20`` and ``5**20``.
    """
    print('SDFG multiple tasklet test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = dp.ndarray([N], dp.int64)
    # Named to avoid shadowing the builtin ``sum``.
    host_sum = dp.ndarray([1], dp.int64)
    host_product = dp.ndarray([1], dp.int64)
    host_in[:] = dp.int64(5)
    host_sum[:] = dp.int64(0)
    host_product[:] = dp.int64(1)

    # Construct SDFG
    mysdfg = SDFG('multiple_cr')
    state = mysdfg.add_state()
    A = state.add_array('A', [N], dp.int64)
    s = state.add_array('s', [1], dp.int64)
    p = state.add_array('p', [1], dp.int64)

    map_entry, map_exit = state.add_map('mymap', dict(i='0:N'))
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))

    # Tasklet 1
    t1 = state.add_tasklet('task1', {'a'}, {'b'}, 'b = a')
    state.add_edge(map_entry, None, t1, 'a', Memlet.simple(A, 'i'))
    state.add_edge(t1, 'b', map_exit, None, Memlet.simple(s, '0', wcr_str='lambda a,b: a+b'))
    state.add_edge(map_exit, None, s, None, Memlet.simple(s, '0'))

    # Tasklet 2
    t2 = state.add_tasklet('task2', {'a'}, {'b'}, 'b = a')
    state.add_edge(map_entry, None, t2, 'a', Memlet.simple(A, 'i'))
    state.add_edge(t2, 'b', map_exit, None, Memlet.simple(p, '0', wcr_str='lambda a,b: a*b'))
    state.add_edge(map_exit, None, p, None, Memlet.simple(p, '0'))

    mysdfg(A=host_in, s=host_sum, p=host_product, N=N)

    diff_sum = 5 * 20 - host_sum[0]
    diff_prod = 5**20 - host_product[0]
    print("Difference:", diff_sum, '(sum)', diff_prod, '(product)')
    # Compare magnitudes: a signed difference would let any result that is
    # larger than expected pass the check.
    assert abs(diff_sum) <= 1e-5 and abs(diff_prod) <= 1e-5