def make_read_col():
    """
    Build the ``spmv_read_col`` SDFG.

    Inside the body state returned by ``make_iteration_space``, a tasklet
    reads one entry of the array ``A_col_mem`` (indexed by
    ``row_begin + c``) and pushes it into the stream ``col_pipe``.

    :return: The constructed SDFG.
    """
    sdfg = SDFG("spmv_read_col")
    # Only the second of the three returned values is populated here
    _, loop_body, _ = make_iteration_space(sdfg)

    col_mem = loop_body.add_array("A_col_mem", (nnz, ), itype, storage=StorageType.FPGA_Global)
    col_stream = loop_body.add_stream("col_pipe", itype, storage=StorageType.FPGA_Local)
    reader = loop_body.add_tasklet("read_col", {"col_in"}, {"col_out"}, "col_out = col_in[row_begin + c]")

    # Array -> tasklet: the whole array is exposed, the tasklet picks the index
    whole_array = Memlet.simple(col_mem, "0:nnz")
    loop_body.add_memlet_path(col_mem, reader, dst_conn="col_in", memlet=whole_array)

    # Tasklet -> stream
    push = Memlet.simple(col_stream, "0")
    loop_body.add_memlet_path(reader, col_stream, src_conn="col_out", memlet=push)

    return sdfg
def test_dynamic_sdfg_with_math_functions():
    """
    Build an SDFG that computes ``B[i] = math.exp(A[i % N])`` with a mapped
    tasklet, run it, and compare the result against ``np.exp``.
    """
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = np.random.rand(N.get()).astype(np.float32)
    host_out = dp.ndarray([N], dp.float32)
    host_out[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('mymodexp')
    state = mysdfg.add_state()
    A = state.add_array('A', [N], dp.float32)
    B = state.add_array('B', [N], dp.float32)

    # Map + tasklet in one call; the tasklet node itself is not needed later
    tasklet_reads = dict(a=Memlet.simple(A, 'i % N'))
    tasklet_writes = dict(b=Memlet.simple(B, 'i'))
    _, map_entry, map_exit = state.add_mapped_tasklet('mytasklet', dict(i='0:N'), tasklet_reads,
                                                      'b = math.exp(a)', tasklet_writes)

    # Outer edges connecting the arrays to the map scope
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))
    state.add_edge(map_exit, None, B, None, Memlet.simple(B, '0:N'))

    mysdfg(A=host_in, B=host_out, N=N)

    expected = np.exp(host_in)
    diff = np.linalg.norm(expected - host_out) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def make_read_x():
    """
    Build the ``spmv_read_x`` SDFG.

    Inside the body state returned by ``make_iteration_space``, a tasklet
    pops an index from the stream ``col_pipe``, uses it to index the array
    ``x_mem``, and pushes the value read into the stream ``compute_pipe``.

    :return: The constructed SDFG.
    """
    sdfg = SDFG("spmv_read_x")
    # Only the second of the three returned values is populated here
    _, loop_body, _ = make_iteration_space(sdfg)

    x_array = loop_body.add_array("x_mem", (W, ), dtype, storage=StorageType.FPGA_Global)
    index_stream = loop_body.add_stream("col_pipe", itype, storage=StorageType.FPGA_Local)
    value_stream = loop_body.add_stream("compute_pipe", dtype, storage=StorageType.FPGA_Local)
    reader = loop_body.add_tasklet("read_x", {"x_in", "col_in"}, {"x_out"}, "x_out = x_in[col_in]")

    # Inputs: the full x array and one index popped from the column stream
    x_memlet = Memlet.simple(x_array, "0:W")
    loop_body.add_memlet_path(x_array, reader, dst_conn="x_in", memlet=x_memlet)
    index_memlet = Memlet.simple(index_stream, "0")
    loop_body.add_memlet_path(index_stream, reader, dst_conn="col_in", memlet=index_memlet)

    # Output: the selected x value
    value_memlet = Memlet.simple(value_stream, "0")
    loop_body.add_memlet_path(reader, value_stream, src_conn="x_out", memlet=value_memlet)

    return sdfg
def test():
    """
    Build an SDFG that computes ``B[i] = 5 * A[i]`` with a mapped tasklet,
    run it on an all-fives input, and check the result.
    """
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = dp.ndarray([N], dp.int32)
    host_out = dp.ndarray([N], dp.int32)
    host_in[:] = dp.int32(5)
    host_out[:] = dp.int32(0)

    # Construct SDFG
    mysdfg = SDFG('mysdfg')
    state = mysdfg.add_state()
    # NOTE: The names A and B are not reserved; the trailing underscore only
    # clarifies that variable name != array name
    A_ = state.add_array('A', [N], dp.int32)
    B_ = state.add_array('B', [N], dp.int32)

    # Easy way to add a tasklet. The explicit alternative would be add_map
    # followed by add_tasklet and one add_edge per tasklet connector.
    reads = dict(a=Memlet.simple(A_, 'i'))
    writes = dict(b=Memlet.simple(B_, 'i'))
    _, map_entry, map_exit = state.add_mapped_tasklet('mytasklet', dict(i='0:N'), reads, 'b = 5*a', writes)

    # Add outer edges
    state.add_edge(A_, None, map_entry, None, Memlet.simple(A_, '0:N'))
    state.add_edge(map_exit, None, B_, None, Memlet.simple(B_, '0:N'))

    mysdfg(A=host_in, B=host_out, N=N)

    expected = 5 * host_in
    diff = np.linalg.norm(expected - host_out) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def make_write_sdfg():
    """
    Build the ``spmv_write`` SDFG.

    The state machine is a loop over ``h``: ``begin`` sets ``h`` to 0,
    ``entry`` is the loop guard (``h < H`` enters ``body``, ``h >= H`` goes to
    ``end``), and the back edge from ``body`` increments ``h``. The body state
    moves one element from the stream ``b_pipe`` into ``b_mem[h]``.

    :return: The constructed SDFG.
    """
    sdfg = SDFG("spmv_write")

    begin = sdfg.add_state("begin")
    guard = sdfg.add_state("entry")
    loop_body = sdfg.add_state("body")
    end = sdfg.add_state("end")

    # Loop initialization
    sdfg.add_edge(begin, guard, InterstateEdge(assignments={"h": "0"}))

    # Loop guard: continue while h < H, otherwise exit
    keep_going = CodeProperty.from_string("h < H", language=Language.Python)
    sdfg.add_edge(guard, loop_body, InterstateEdge(condition=keep_going))
    finished = CodeProperty.from_string("h >= H", language=Language.Python)
    sdfg.add_edge(guard, end, InterstateEdge(condition=finished))

    # Back edge with increment
    sdfg.add_edge(loop_body, guard, InterstateEdge(assignments={"h": "h + 1"}))

    # Loop body: stream -> memory
    result_stream = loop_body.add_stream("b_pipe", dtype, storage=StorageType.FPGA_Local)
    b_memory = loop_body.add_array("b_mem", (H, ), dtype, storage=StorageType.FPGA_Global)
    loop_body.add_memlet_path(result_stream, b_memory, memlet=Memlet.simple(b_memory, "h"))

    return sdfg
def test():
    """
    Build an SDFG with two tasklets chained inside one map (``b = 5*a``
    followed by ``d = 2*c``), run it, and check that the output is ten times
    the input.
    """
    print('SDFG consecutive tasklet test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = dp.ndarray([N], dp.int32)
    host_out = dp.ndarray([N], dp.int32)
    host_in[:] = dp.int32(5)
    host_out[:] = dp.int32(0)

    # Construct SDFG
    mysdfg = SDFG('ctasklet')
    state = mysdfg.add_state()
    A_ = state.add_array('A', [N], dp.int32)
    B_ = state.add_array('B', [N], dp.int32)

    map_entry, map_exit = state.add_map('mymap', dict(i='0:N'))

    # First tasklet reads A[i]
    times_five = state.add_tasklet('mytasklet', {'a'}, {'b'}, 'b = 5*a')
    state.add_edge(map_entry, None, times_five, 'a', Memlet.simple(A_, 'i'))

    # Second tasklet consumes the first one's output through an empty memlet
    times_two = state.add_tasklet('mytasklet2', {'c'}, {'d'}, 'd = 2*c')
    state.add_edge(times_five, 'b', times_two, 'c', Memlet())
    state.add_edge(times_two, 'd', map_exit, None, Memlet.simple(B_, 'i'))

    # Add outer edges
    state.add_edge(A_, None, map_entry, None, Memlet.simple(A_, '0:N'))
    state.add_edge(map_exit, None, B_, None, Memlet.simple(B_, '0:N'))

    mysdfg(A=host_in, B=host_out, N=N)

    expected = 10 * host_in
    diff = np.linalg.norm(expected - host_out) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def cutout_state(state: SDFGState, *nodes: nd.Node, make_copy: bool = True) -> SDFG:
    """
    Cut out a subgraph of a state from an SDFG to run separately for localized
    testing or optimization. The subgraph defined by the list of nodes will be
    extended to include access nodes of data containers necessary to run the
    graph separately. In addition, all transient data containers created
    outside the cut out graph will become global.

    :param state: The SDFG state in which the subgraph resides.
    :param nodes: The nodes in the subgraph to cut out.
    :param make_copy: If True, deep-copies every SDFG element in the copy.
                      Otherwise, original references are kept.
    :return: A new SDFG named ``<parent name>_cutout`` with a single state
             containing the extended subgraph.
    """

    def _element(obj):
        # Deep copy on request, otherwise reuse the original object
        return copy.deepcopy(obj) if make_copy else obj

    parent_sdfg = state.parent
    view: StateSubgraphView = StateSubgraphView(state, nodes)
    view = _extend_subgraph_with_access_nodes(state, view)
    outside_containers = _containers_defined_outside(parent_sdfg, state, view)

    # New SDFG carrying over the constants, the used symbols, and the data containers
    cutout = SDFG(f'{state.parent.name}_cutout', parent_sdfg.constants_prop)
    symbol_types = view.defined_symbols()
    for symname in view.free_symbols:
        cutout.add_symbol(symname, symbol_types[symname])

    for access in view.data_nodes():
        container = access.data
        if container in cutout.arrays:
            continue
        descriptor = parent_sdfg.arrays[container].clone()
        # A transient that is defined outside becomes a global
        if container in outside_containers:
            descriptor.transient = False
        cutout.add_datadesc(container, descriptor)

    # Single state holding the extended subgraph
    target_state = cutout.add_state(state.label, is_start_state=True)
    node_map: Dict[nd.Node, nd.Node] = {}
    for edge in view.edges():
        for endpoint in (edge.src, edge.dst):
            if endpoint not in node_map:
                node_map[endpoint] = _element(endpoint)
        target_state.add_edge(node_map[edge.src], edge.src_conn, node_map[edge.dst], edge.dst_conn,
                              _element(edge.data))

    # Nodes without any edge inside the subgraph still have to be inserted
    for isolated in view.nodes():
        if isolated in node_map:
            continue
        node_map[isolated] = _element(isolated)
        target_state.add_node(node_map[isolated])

    # Drop connectors that no longer have an edge attached in the cutout
    for new_node in node_map.values():
        connected_in = set(e.dst_conn for e in target_state.in_edges(new_node))
        for dangling in (new_node.in_connectors.keys() - connected_in):
            new_node.remove_in_connector(dangling)
        connected_out = set(e.src_conn for e in target_state.out_edges(new_node))
        for dangling in (new_node.out_connectors.keys() - connected_out):
            new_node.remove_out_connector(dangling)

    return cutout
def make_compute_sdfg():
    """
    Build the ``filter_compute`` SDFG: a single state named ``compute`` that
    is populated by ``make_compute_state``.

    :return: The constructed SDFG.
    """
    sdfg = SDFG("filter_compute")
    compute_state = sdfg.add_state("compute")
    make_compute_state(compute_state)
    return sdfg
def test_3_interface_to_2_banks():
    """
    Build a small SDFG on an HBM array spread over banks ``0:2``, apply the
    FPGA, inlining and map-unrolling transformations, and check both the
    generated bank assignment and the computed result.

    :return: The transformed SDFG.
    """
    sdfg = SDFG("test_4_interface_to_2_banks")
    state = sdfg.add_state()

    _, desc_a = sdfg.add_array("a", [2, 2], dace.int32)
    desc_a.location["memorytype"] = "HBM"
    desc_a.location["bank"] = "0:2"

    reader = state.add_read("a")
    writer = state.add_write("a")
    adder = state.add_tasklet("r1", {"_x1", "_x2"}, {"_y1"}, "_y1 = _x1 + _x2")
    map_in, map_out = state.add_map("m", {"k": "0:2"}, dtypes.ScheduleType.Unrolled)

    state.add_memlet_path(reader, map_in, adder, memlet=memlet.Memlet("a[0, 0]"), dst_conn="_x1")
    state.add_memlet_path(reader, map_in, adder, memlet=memlet.Memlet("a[1, 0]"), dst_conn="_x2")
    state.add_memlet_path(adder, map_out, writer, memlet=memlet.Memlet("a[0, 1]"), src_conn="_y1")

    sdfg.apply_fpga_transformations()
    assert sdfg.apply_transformations(InlineSDFG) == 1
    assert sdfg.apply_transformations(MapUnroll) == 1

    # Redirect the output of the first tasklet found to element [1, 1]
    for candidate in sdfg.states()[0].nodes():
        if not isinstance(candidate, dace.sdfg.nodes.Tasklet):
            continue
        first_out_edge = sdfg.states()[0].out_edges(candidate)[0]
        first_out_edge.data.subset = subsets.Range.from_string("1, 1")
        break

    bank_assignment = sdfg.generate_code()[3].clean_code
    assert bank_assignment.count("sp") == 6
    assert bank_assignment.count("HBM[0]") == 3
    assert bank_assignment.count("HBM[1]") == 3

    a = np.zeros([2, 2], np.int32)
    a[0, 0] = 2
    a[1, 0] = 3
    sdfg(a=a)
    assert a[0, 1] == 5

    return sdfg
def test():
    """
    Check constant specialization: the generated code mentions ``Dynamic``
    before ``specialize`` is called and no longer does afterwards; the
    specialized SDFG is then compiled, run, and compared against NumPy.
    """
    print('Constant specialization test')

    N = dp.symbol('N')
    M = dp.symbol('M')
    N.set(20)
    M.set(30)

    fullrange = '1:N-1,0:M'
    irange = '1:N-1'
    jrange = '0:M'

    host_in = np.random.rand(N.get(), M.get()).astype(np.float32)
    host_out = dp.ndarray([N, M], dtype=dp.float32)
    host_out[:] = dp.float32(0)

    ##########################################################################
    spec_sdfg = SDFG('spectest')
    state = spec_sdfg.add_state()
    A = state.add_array('A', [N, M], dp.float32)
    Atrans = state.add_transient('At', [N - 2, M], dp.float32)
    B = state.add_array('B', [N, M], dp.float32)

    # A -> transient copy
    state.add_edge(A, None, Atrans, None, Memlet.simple(A, fullrange))

    # transient -> exp -> B
    reads = dict(a=Memlet.simple(Atrans, 'i-1,j'))
    writes = dict(b=Memlet.simple(B, 'i,j'))
    _, me, mx = state.add_mapped_tasklet('compute', dict(i=irange, j=jrange), reads, 'b = math.exp(a)', writes)
    state.add_edge(Atrans, None, me, None, Memlet.simple(Atrans, fullrange))
    state.add_edge(mx, None, B, None, Memlet.simple(B, fullrange))

    spec_sdfg.fill_scope_connectors()
    dp.propagate_memlets_sdfg(spec_sdfg)
    spec_sdfg.validate()
    ##########################################################################

    code_nonspec = spec_sdfg.generate_code()
    assert 'Dynamic' in code_nonspec[0].code

    spec_sdfg.specialize(dict(N=N, M=M))
    code_spec = spec_sdfg.generate_code()
    assert 'Dynamic' not in code_spec[0].code

    func = spec_sdfg.compile()
    func(A=host_in, B=host_out, N=N, M=M)

    expected = np.exp(host_in[1:(N.get() - 1), 0:M.get()])
    diff = np.linalg.norm(expected - host_out[1:-1, :]) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def make_compute_sdfg():
    """
    Build the ``spmv_compute`` SDFG.

    In the body state, a nested SDFG (from ``make_compute_nested_sdfg``)
    consumes one element each from the streams ``a_pipe`` and ``x_pipe``
    together with the scalar ``b_buffer`` and writes ``b_buffer`` back. In the
    post state, ``b_buffer`` is pushed into the stream ``b_pipe``.

    :return: The constructed SDFG.
    """
    sdfg = SDFG("spmv_compute")
    # The first returned value is not used here
    _, loop_body, after_loop = make_iteration_space(sdfg)

    a_stream = loop_body.add_stream("a_pipe", dtype, storage=StorageType.FPGA_Local)
    x_stream = loop_body.add_stream("x_pipe", dtype, storage=StorageType.FPGA_Local)
    # Two nodes for the same container: one read by and one written by the nested SDFG
    buffer_read = loop_body.add_scalar("b_buffer", dtype, transient=True, storage=StorageType.FPGA_Registers)
    buffer_write = loop_body.add_scalar("b_buffer", dtype, transient=True, storage=StorageType.FPGA_Registers)

    inner_sdfg = make_compute_nested_sdfg()
    compute_node = loop_body.add_nested_sdfg(inner_sdfg, sdfg, {"a_in", "x_in", "b_in"}, {"b_out"})

    # Inputs of the nested SDFG
    a_memlet = Memlet.simple(a_stream, "0")
    loop_body.add_memlet_path(a_stream, compute_node, dst_conn="a_in", memlet=a_memlet)
    b_in_memlet = Memlet.simple(buffer_read, "0")
    loop_body.add_memlet_path(buffer_read, compute_node, dst_conn="b_in", memlet=b_in_memlet)
    x_memlet = Memlet.simple(x_stream, "0")
    loop_body.add_memlet_path(x_stream, compute_node, dst_conn="x_in", memlet=x_memlet)

    # Output of the nested SDFG
    b_out_memlet = Memlet.simple(buffer_write, "0")
    loop_body.add_memlet_path(compute_node, buffer_write, src_conn="b_out", memlet=b_out_memlet)

    # Post state: forward the buffer into the result stream
    final_buffer = after_loop.add_scalar("b_buffer", dtype, transient=True, storage=StorageType.FPGA_Registers)
    result_stream = after_loop.add_stream("b_pipe", dtype, storage=StorageType.FPGA_Local)
    after_loop.add_memlet_path(final_buffer, result_stream, memlet=Memlet.simple(result_stream, "0"))

    return sdfg
def from_json(json_obj, context=None):
    """
    Create a ``NestedSDFG`` node from its JSON representation.

    :param json_obj: The JSON object whose properties are loaded into the node.
    :param context: Optional mapping; if it contains ``'sdfg_state'`` and/or
                    ``'sdfg'``, they become the parent state / parent SDFG of
                    the loaded nested SDFG.
    :return: The populated ``NestedSDFG`` node.
    """
    from dace import SDFG  # Avoid import loop

    # A placeholder node is created first; the actual properties (including
    # the SDFG) are then filled in from the JSON object.
    node = NestedSDFG("nolabel", SDFG('nosdfg'), {}, {})
    dace.serialize.set_properties_from_json(node, json_obj, context)

    if context:
        if 'sdfg_state' in context:
            node.sdfg.parent = context['sdfg_state']
        if 'sdfg' in context:
            node.sdfg.parent_sdfg = context['sdfg']

    node.sdfg.parent_nsdfg_node = node
    node.sdfg.update_sdfg_list([])
    return node
def create_batch_gemm_sdfg(dtype, strides):
    """
    Build an SDFG named ``einsum`` that multiplies ``X`` by ``Y`` into ``Z``
    with a ``blas.MatMul`` library node.

    :param dtype: Element type of the three arrays.
    :param strides: Mapping with the keys ``BATCH``, ``sAM``, ``sAK``,
                    ``sAB``, ``sBK``, ``sBN``, ``sBB``, ``sCM``, ``sCN`` and
                    ``sCB``. Symbolic values are replaced by a symbol named
                    after the key; other values are used as-is. The arrays get
                    a leading batch dimension unless ``strides['BATCH']``
                    equals 1.
    :return: The constructed SDFG.
    """
    #########################
    sdfg = SDFG('einsum')
    state = sdfg.add_state()
    M, K, N = [symbolic.symbol(dim) for dim in ('M', 'K', 'N')]

    def _resolve(key):
        # Symbolic entries become a symbol named like the key
        value = strides[key]
        return symbolic.symbol(key) if symbolic.issymbolic(value) else value

    stride_keys = ('BATCH', 'sAM', 'sAK', 'sAB', 'sBK', 'sBN', 'sBB', 'sCM', 'sCN', 'sCB')
    BATCH, sAM, sAK, sAB, sBK, sBN, sBB, sCM, sCN, sCB = [_resolve(key) for key in stride_keys]

    batched = strides['BATCH'] != 1
    if batched:
        x_shape, x_strides = [BATCH, M, K], [sAB, sAM, sAK]
        y_shape, y_strides = [BATCH, K, N], [sBB, sBK, sBN]
        z_shape, z_strides = [BATCH, M, N], [sCB, sCM, sCN]
    else:
        x_shape, x_strides = [M, K], [sAM, sAK]
        y_shape, y_strides = [K, N], [sBK, sBN]
        z_shape, z_strides = [M, N], [sCM, sCN]

    _, xarr = sdfg.add_array('X', dtype=dtype, shape=x_shape, strides=x_strides)
    _, yarr = sdfg.add_array('Y', dtype=dtype, shape=y_shape, strides=y_strides)
    _, zarr = sdfg.add_array('Z', dtype=dtype, shape=z_shape, strides=z_strides)

    gX = state.add_read('X')
    gY = state.add_read('Y')
    gZ = state.add_write('Z')

    import dace.libraries.blas as blas  # Avoid import loop

    matmul = blas.MatMul('einsum_gemm')
    state.add_node(matmul)
    state.add_edge(gX, None, matmul, '_a', Memlet.from_array(gX.data, xarr))
    state.add_edge(gY, None, matmul, '_b', Memlet.from_array(gY.data, yarr))
    state.add_edge(matmul, '_c', gZ, None, Memlet.from_array(gZ.data, zarr))

    return sdfg
def test():
    """
    Build an SDFG computing the element-wise minimum of ``A`` and ``B`` with a
    manually vectorized tasklet (map step 2, connectors of vector length 2),
    run it, and compare against ``np.minimum``.
    """
    print('Dynamic SDFG test with vectorization and min')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    first_in = np.random.rand(N.get()).astype(np.float32)
    second_in = np.random.rand(N.get()).astype(np.float32)
    host_out = dp.ndarray([N], dp.float32)
    host_out[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('myvmin')
    for array_name in ('A', 'B', 'C'):
        mysdfg.add_array(array_name, [N], dp.float32)
    state = mysdfg.add_state()
    A = state.add_access('A')
    B = state.add_access('B')
    C = state.add_access('C')

    reads = dict(a=Memlet.simple(A, 'i'), b=Memlet.simple(B, 'i'))
    writes = dict(c=Memlet.simple(C, 'i'))
    tasklet, map_entry, map_exit = state.add_mapped_tasklet('mytasklet', dict(i='0:N:2'), reads, 'c = min(a, b)',
                                                            writes)

    # Manually vectorize tasklet
    for conn in ('a', 'b'):
        tasklet.in_connectors[conn] = dp.vector(dp.float32, 2)
    tasklet.out_connectors['c'] = dp.vector(dp.float32, 2)

    # Add outer edges
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))
    state.add_edge(B, None, map_entry, None, Memlet.simple(B, '0:N'))
    state.add_edge(map_exit, None, C, None, Memlet.simple(C, '0:N'))

    mysdfg(A=first_in, B=second_in, C=host_out, N=N)

    expected = np.minimum(first_in, second_in)
    diff = np.linalg.norm(expected - host_out) / N.get()
    print("Difference:", diff)
    print("==== Program end ====")
    assert diff <= 1e-5
def test():
    """
    Build an SDFG with two tasklets in one map that reduce ``A`` into a sum
    (``s``) and a product (``p``) through write-conflict resolution, run it on
    an all-fives input of length 20, and check both results.

    The final check compares the absolute differences against the tolerance,
    so a result that is too large fails just like one that is too small.
    """
    print('SDFG multiple tasklet test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    values = dp.ndarray([N], dp.int64)
    total = dp.ndarray([1], dp.int64)
    product = dp.ndarray([1], dp.int64)
    values[:] = dp.int64(5)
    # Start from the neutral elements of the two reductions
    total[:] = dp.int64(0)
    product[:] = dp.int64(1)

    # Construct SDFG
    mysdfg = SDFG('multiple_cr')
    state = mysdfg.add_state()
    A = state.add_array('A', [N], dp.int64)
    s = state.add_array('s', [1], dp.int64)
    p = state.add_array('p', [1], dp.int64)

    map_entry, map_exit = state.add_map('mymap', dict(i='0:N'))
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))

    # Tasklet 1: accumulate into s with addition
    t1 = state.add_tasklet('task1', {'a'}, {'b'}, 'b = a')
    state.add_edge(map_entry, None, t1, 'a', Memlet.simple(A, 'i'))
    state.add_edge(t1, 'b', map_exit, None, Memlet.simple(s, '0', wcr_str='lambda a,b: a+b'))
    state.add_edge(map_exit, None, s, None, Memlet.simple(s, '0'))

    # Tasklet 2: accumulate into p with multiplication
    t2 = state.add_tasklet('task2', {'a'}, {'b'}, 'b = a')
    state.add_edge(map_entry, None, t2, 'a', Memlet.simple(A, 'i'))
    state.add_edge(t2, 'b', map_exit, None, Memlet.simple(p, '0', wcr_str='lambda a,b: a*b'))
    state.add_edge(map_exit, None, p, None, Memlet.simple(p, '0'))

    mysdfg(A=values, s=total, p=product, N=N)

    diff_sum = 5 * 20 - total[0]
    diff_prod = 5**20 - product[0]
    print("Difference:", diff_sum, '(sum)', diff_prod, '(product)')
    # abs() is required: a signed difference would let overshooting results pass
    assert abs(diff_sum) <= 1e-5 and abs(diff_prod) <= 1e-5
def four_interface_to_2_banks(mem_type, decouple_interfaces):
    """
    Build a small SDFG on an array spread over banks ``0:2`` of the given
    memory type, apply the FPGA, inlining and map-unrolling transformations,
    and check the generated bank assignment as well as the computed result.

    :param mem_type: Value stored in the array's ``memorytype`` location; it
                     is also the prefix searched for in the generated bank
                     assignment (``<mem_type>[0]``, ``<mem_type>[1]``).
    :param decouple_interfaces: Value used for the Xilinx
                                ``decouple_array_interfaces`` setting while
                                generating code. Six ``sp`` entries (three per
                                bank) are expected when it is truthy, four
                                (two per bank) otherwise.
    :return: The transformed SDFG.
    """
    sdfg = SDFG("test_4_interface_to_2_banks_" + mem_type)
    state = sdfg.add_state()

    _, desc_a = sdfg.add_array("a", [2, 2], dace.int32)
    desc_a.location["memorytype"] = mem_type
    desc_a.location["bank"] = "0:2"

    acc_read1 = state.add_read("a")
    acc_write1 = state.add_write("a")

    t1 = state.add_tasklet("r1", {"_x1", "_x2"}, {"_y1"}, "_y1 = _x1 + _x2")

    m1_in, m1_out = state.add_map("m", {"k": "0:2"}, dtypes.ScheduleType.Unrolled)

    state.add_memlet_path(acc_read1, m1_in, t1, memlet=memlet.Memlet("a[0, 0]"), dst_conn="_x1")
    state.add_memlet_path(acc_read1, m1_in, t1, memlet=memlet.Memlet("a[1, 0]"), dst_conn="_x2")
    state.add_memlet_path(t1, m1_out, acc_write1, memlet=memlet.Memlet("a[0, 1]"), src_conn="_y1")

    sdfg.apply_fpga_transformations()
    assert sdfg.apply_transformations(InlineSDFG) == 1
    assert sdfg.apply_transformations(MapUnroll) == 1

    # Redirect the output of the first tasklet found to element [1, 1]
    for node in sdfg.states()[0].nodes():
        if isinstance(node, dace.sdfg.nodes.Tasklet):
            sdfg.states()[0].out_edges(node)[0].data.subset = subsets.Range.from_string("1, 1")
            break

    # If we are not decoupling array interfaces we will use fewer memory interfaces.
    # The expected values are computed up front: writing
    # ``assert x == 6 if flag else 4`` parses as ``assert ((x == 6) if flag else 4)``,
    # which is always truthy when the flag is off and therefore checks nothing.
    expected_interfaces = 6 if decouple_interfaces else 4
    expected_per_bank = 3 if decouple_interfaces else 2

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=decouple_interfaces):
        bank_assignment = sdfg.generate_code()[3].clean_code
        assert bank_assignment.count("sp") == expected_interfaces
        assert bank_assignment.count(mem_type + "[0]") == expected_per_bank
        assert bank_assignment.count(mem_type + "[1]") == expected_per_bank

        # Run under the same configuration as the code that was just inspected
        a = np.zeros([2, 2], np.int32)
        a[0, 0] = 2
        a[1, 0] = 3
        sdfg(a=a)
        assert a[0, 1] == 5

    return sdfg
def make_read_sdfg():
    """
    Build the ``filter_read`` SDFG: in the state returned by
    ``make_iteration_space``, the array ``A_mem`` (indexed by ``i``) is
    connected to the stream ``_A_pipe``.

    :return: The constructed SDFG.
    """
    sdfg = SDFG("filter_read")
    state = make_iteration_space(sdfg)

    source = state.add_array("A_mem", [N], dtype=dtype, storage=StorageType.FPGA_Global)
    sink = state.add_stream("_A_pipe",
                            dtype=dtype,
                            buffer_size=buffer_size,
                            veclen=W.get(),
                            storage=StorageType.FPGA_Local)

    # Stream element 0 on the stream side, element i on the array side
    copy_memlet = Memlet(sink, 1, Indices(["0"]), W.get(), other_subset=Indices(["i"]))
    state.add_memlet_path(source, sink, memlet=copy_memlet)

    return sdfg
def test():
    """
    Build an SDFG that copies a window of ``A`` into ``B`` where both arrays
    are declared with explicit offsets, strides and total sizes, run it, and
    check that ``output[0:3, 0:2]`` equals ``input[3:6, 4:6]``.
    """
    print('Multidimensional offset and stride test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = dp.ndarray([N, N], dp.float32)
    host_out = dp.ndarray([4, 3], dp.float32)
    host_in[:] = (np.random.rand(N.get(), N.get()) * 5).astype(dp.float32.type)
    host_out[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('offset_stride')
    state = mysdfg.add_state()
    A_ = state.add_array('A', [6, 6], dp.float32, offset=[2, 3], strides=[N, 1], total_size=N * N)
    B_ = state.add_array('B', [3, 2], dp.float32, offset=[-1, -1], strides=[3, 1], total_size=12)

    map_entry, map_exit = state.add_map('mymap', [('i', '1:4'), ('j', '1:3')])
    copy_tasklet = state.add_tasklet('mytasklet', {'a'}, {'b'}, 'b = a')
    state.add_edge(map_entry, None, copy_tasklet, 'a', Memlet.simple(A_, 'i,j'))
    state.add_edge(copy_tasklet, 'b', map_exit, None, Memlet.simple(B_, 'i,j'))

    # Add outer edges
    state.add_edge(A_, None, map_entry, None, Memlet.simple(A_, '1:4,1:3'))
    state.add_edge(map_exit, None, B_, None, Memlet.simple(B_, '1:4,1:3'))

    mysdfg(A=host_in, B=host_out, N=N)

    mismatch = host_out[0:3, 0:2] - host_in[3:6, 4:6]
    diff = np.linalg.norm(mismatch) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def test():
    """
    Build an SDFG with two independent tasklets in a map of step 2: one
    handles element ``i`` (``5*a``), the other element ``i+1``
    (``a + a + a + a + a``). Run it and check that the output is five times
    the input.
    """
    print('SDFG multiple tasklet test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    host_in = dp.ndarray([N], dp.int32)
    host_out = dp.ndarray([N], dp.int32)
    host_in[:] = dp.int32(5)
    host_out[:] = dp.int32(0)

    # Construct SDFG
    mysdfg = SDFG('multiple_tasklets')
    state = mysdfg.add_state()
    A = state.add_array('A', [N], dp.int32)
    B = state.add_array('B', [N], dp.int32)

    map_entry, map_exit = state.add_map('mymap', dict(i='0:N:2'))

    # Tasklet 1 works on element i
    first = state.add_tasklet('task1', {'a'}, {'b'}, 'b = 5*a')
    state.add_edge(map_entry, None, first, 'a', Memlet.simple(A, 'i'))
    state.add_edge(first, 'b', map_exit, None, Memlet.simple(B, 'i'))

    # Tasklet 2 works on element i+1
    second = state.add_tasklet('task2', {'a'}, {'b'}, 'b = a + a + a + a + a')
    state.add_edge(map_entry, None, second, 'a', Memlet.simple(A, 'i+1'))
    state.add_edge(second, 'b', map_exit, None, Memlet.simple(B, 'i+1'))

    # Outer edges
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))
    state.add_edge(map_exit, None, B, None, Memlet.simple(B, '0:N'))

    mysdfg(A=host_in, B=host_out, N=N)

    expected = 5 * host_in
    diff = np.linalg.norm(expected - host_out) / N.get()
    print("Difference:", diff)
    assert diff <= 1e-5
def nest_state_subgraph(sdfg: SDFG,
                        state: SDFGState,
                        subgraph: SubgraphView,
                        name: Optional[str] = None,
                        full_data: bool = False) -> nodes.NestedSDFG:
    """
    Turns a state subgraph into a nested SDFG. Operates in-place.

    The subgraph nodes are moved into a newly created SDFG, a nested SDFG node
    is added to ``state`` in their place, and the edges that crossed the
    subgraph boundary are reconnected to that node. Transients that are used
    only inside the subgraph are removed from ``sdfg.arrays``.

    :param sdfg: The SDFG containing the state subgraph.
    :param state: The state containing the subgraph.
    :param subgraph: Subgraph to nest.
    :param name: An optional name for the nested SDFG. Defaults to
                 ``'nested_' + state.label``.
    :param full_data: If True, nests entire input/output data. Otherwise the
                      nested data descriptors are shaped after the (union of
                      the) subsets on the boundary edges, and inner memlets
                      are offset accordingly.
    :return: The nested SDFG node.
    :raise KeyError: Some or all nodes in the subgraph are not located in
                     this state, or the state does not belong to the given
                     SDFG.
    :raise ValueError: The subgraph is contained in more than one scope, or
                       contains only part of a scope.
    """
    if state.parent != sdfg:
        raise KeyError('State does not belong to given SDFG')
    if subgraph is not state and subgraph.graph is not state:
        raise KeyError('Subgraph does not belong to given state')

    # Find the top-level scope
    scope_tree = state.scope_tree()
    scope_dict = state.scope_dict()
    scope_dict_children = state.scope_children()
    top_scopenode = -1  # Initialized to -1 since "None" already means top-level

    for node in subgraph.nodes():
        if node not in scope_dict:
            raise KeyError('Node not found in state')

        # If scope entry/exit, ensure entire scope is in subgraph
        if isinstance(node, nodes.EntryNode):
            scope_nodes = scope_dict_children[node]
            if any(n not in subgraph.nodes() for n in scope_nodes):
                raise ValueError('Subgraph contains partial scopes (entry)')
        elif isinstance(node, nodes.ExitNode):
            entry = state.entry_node(node)
            scope_nodes = scope_dict_children[entry] + [entry]
            if any(n not in subgraph.nodes() for n in scope_nodes):
                raise ValueError('Subgraph contains partial scopes (exit)')

        # A node whose enclosing scope is outside the subgraph determines the
        # scope the nested SDFG will live in; all such nodes must agree
        scope_node = scope_dict[node]
        if scope_node not in subgraph.nodes():
            if top_scopenode != -1 and top_scopenode != scope_node:
                raise ValueError('Subgraph is contained in more than one scope')
            top_scopenode = scope_node

    scope = scope_tree[top_scopenode]
    ###

    # Consolidate edges in top scope
    utils.consolidate_edges(sdfg, scope)
    snodes = subgraph.nodes()

    # Collect inputs and outputs of the nested SDFG
    # (edges of the state that cross the subgraph boundary)
    inputs: List[MultiConnectorEdge] = []
    outputs: List[MultiConnectorEdge] = []
    for node in snodes:
        for edge in state.in_edges(node):
            if edge.src not in snodes:
                inputs.append(edge)
        for edge in state.out_edges(node):
            if edge.dst not in snodes:
                outputs.append(edge)

    # Collect transients not used outside of subgraph (will be removed of
    # top-level graph)
    data_in_subgraph = set(n.data for n in subgraph.nodes() if isinstance(n, nodes.AccessNode))
    # Find other occurrences in SDFG
    other_nodes = set(n.data for s in sdfg.nodes() for n in s.nodes()
                      if isinstance(n, nodes.AccessNode) and n not in subgraph.nodes())
    subgraph_transients = set()
    for data in data_in_subgraph:
        datadesc = sdfg.arrays[data]
        if datadesc.transient and data not in other_nodes:
            subgraph_transients.add(data)

    # All transients of edges between code nodes are also added to nested graph
    for edge in subgraph.edges():
        if (isinstance(edge.src, nodes.CodeNode) and isinstance(edge.dst, nodes.CodeNode)):
            subgraph_transients.add(edge.data.data)

    # Collect data used in access nodes within subgraph (will be referenced in
    # full upon nesting)
    input_arrays = set()
    output_arrays = {}  # data name -> WCR of the first incoming edge of the access node
    for node in subgraph.nodes():
        if (isinstance(node, nodes.AccessNode) and node.data not in subgraph_transients):
            if node.has_reads(state):
                input_arrays.add(node.data)
            if node.has_writes(state):
                output_arrays[node.data] = state.in_edges(node)[0].data.wcr

    # Create the nested SDFG
    # NOTE: this is the only place the ``name`` parameter is read; the loops
    # below rebind ``name`` as a loop variable.
    nsdfg = SDFG(name or 'nested_' + state.label)

    # Transients are added to the nested graph as-is
    for name in subgraph_transients:
        nsdfg.add_datadesc(name, sdfg.arrays[name])

    # Input/output data that are not source/sink nodes are added to the graph
    # as non-transients
    for name in (input_arrays | output_arrays.keys()):
        datadesc = copy.deepcopy(sdfg.arrays[name])
        datadesc.transient = False
        nsdfg.add_datadesc(name, datadesc)

    # Connected source/sink nodes outside subgraph become global data
    # descriptors in nested SDFG
    input_names = {}  # boundary input edge -> data name inside the nested SDFG
    output_names = {}  # boundary output edge -> data name inside the nested SDFG
    # outer data name -> (name inside the nested SDFG, subset covered so far)
    global_subsets: Dict[str, Tuple[str, Subset]] = {}
    for edge in inputs:
        if edge.data.data is None:  # Skip edges with an empty memlet
            continue
        name = edge.data.data
        if name not in global_subsets:
            datadesc = copy.deepcopy(sdfg.arrays[edge.data.data])
            datadesc.transient = False
            if not full_data:
                datadesc.shape = edge.data.subset.size()
            new_name = nsdfg.add_datadesc(name, datadesc, find_new_name=True)
            global_subsets[name] = (new_name, edge.data.subset)
        else:
            # Data already registered: grow its subset to cover this edge too
            new_name, subset = global_subsets[name]
            if not full_data:
                new_subset = union(subset, edge.data.subset)
                if new_subset is None:
                    # No union could be computed; fall back to the whole array
                    new_subset = Range.from_array(sdfg.arrays[name])
                global_subsets[name] = (new_name, new_subset)
                nsdfg.arrays[new_name].shape = new_subset.size()
        input_names[edge] = new_name
    for edge in outputs:
        if edge.data.data is None:  # Skip edges with an empty memlet
            continue
        name = edge.data.data
        if name not in global_subsets:
            datadesc = copy.deepcopy(sdfg.arrays[edge.data.data])
            datadesc.transient = False
            if not full_data:
                datadesc.shape = edge.data.subset.size()
            new_name = nsdfg.add_datadesc(name, datadesc, find_new_name=True)
            global_subsets[name] = (new_name, edge.data.subset)
        else:
            # Data already registered: grow its subset to cover this edge too
            new_name, subset = global_subsets[name]
            if not full_data:
                new_subset = union(subset, edge.data.subset)
                if new_subset is None:
                    # No union could be computed; fall back to the whole array
                    new_subset = Range.from_array(sdfg.arrays[name])
                global_subsets[name] = (new_name, new_subset)
                nsdfg.arrays[new_name].shape = new_subset.size()
        output_names[edge] = new_name
    ###################

    # Add scope symbols to the nested SDFG
    defined_vars = set(
        symbolic.pystr_to_symbolic(s) for s in (state.symbols_defined_at(top_scopenode).keys() | sdfg.symbols))
    for v in defined_vars:
        if v in sdfg.symbols:
            sym = sdfg.symbols[v]
            nsdfg.add_symbol(v, sym.dtype)

    # Add constants to nested SDFG
    for cstname, cstval in sdfg.constants.items():
        nsdfg.add_constant(cstname, cstval)

    # Create nested state
    nstate = nsdfg.add_state()

    # Add subgraph nodes and edges to nested state
    # (the node objects themselves are reused; only the memlets are copied)
    nstate.add_nodes_from(subgraph.nodes())
    for e in subgraph.edges():
        nstate.add_edge(e.src, e.src_conn, e.dst, e.dst_conn, copy.deepcopy(e.data))

    # Modify nested SDFG parents in subgraph
    for node in subgraph.nodes():
        if isinstance(node, nodes.NestedSDFG):
            node.sdfg.parent = nstate
            node.sdfg.parent_sdfg = nsdfg
            node.sdfg.parent_nsdfg_node = node

    # Add access nodes and edges as necessary
    edges_to_offset = []  # (original boundary edge, replacement edge in nested state)
    for edge, name in input_names.items():
        node = nstate.add_read(name)
        new_edge = copy.deepcopy(edge.data)
        new_edge.data = name
        edges_to_offset.append((edge, nstate.add_edge(node, None, edge.dst, edge.dst_conn, new_edge)))
    for edge, name in output_names.items():
        node = nstate.add_write(name)
        new_edge = copy.deepcopy(edge.data)
        new_edge.data = name
        edges_to_offset.append((edge, nstate.add_edge(edge.src, edge.src_conn, node, None, new_edge)))

    # Offset memlet paths inside nested SDFG according to subsets
    for original_edge, new_edge in edges_to_offset:
        for edge in nstate.memlet_tree(new_edge):
            edge.data.data = new_edge.data.data
            if not full_data:
                edge.data.subset.offset(global_subsets[original_edge.data.data][1], True)

    # Add nested SDFG node to the input state
    nested_sdfg = state.add_nested_sdfg(nsdfg, None,
                                        set(input_names.values()) | input_arrays,
                                        set(output_names.values()) | output_arrays.keys())

    # Reconnect memlets to nested SDFG
    # (one outer edge per connector name; empty-memlet edges are remembered
    # so they can be used below if nothing else connects the node)
    reconnected_in = set()
    reconnected_out = set()
    empty_input = None
    empty_output = None
    for edge in inputs:
        if edge.data.data is None:
            empty_input = edge
            continue

        name = input_names[edge]
        if name in reconnected_in:
            continue
        if full_data:
            data = Memlet.from_array(edge.data.data, sdfg.arrays[edge.data.data])
        else:
            data = copy.deepcopy(edge.data)
            data.subset = global_subsets[edge.data.data][1]
        state.add_edge(edge.src, edge.src_conn, nested_sdfg, name, data)
        reconnected_in.add(name)

    for edge in outputs:
        if edge.data.data is None:
            empty_output = edge
            continue

        name = output_names[edge]
        if name in reconnected_out:
            continue
        if full_data:
            data = Memlet.from_array(edge.data.data, sdfg.arrays[edge.data.data])
        else:
            data = copy.deepcopy(edge.data)
            data.subset = global_subsets[edge.data.data][1]
        data.wcr = edge.data.wcr
        state.add_edge(nested_sdfg, name, edge.dst, edge.dst_conn, data)
        reconnected_out.add(name)

    # Connect access nodes to internal input/output data as necessary
    entry = scope.entry
    exit = scope.exit
    for name in input_arrays:
        node = state.add_read(name)
        if entry is not None:
            # Empty memlet from the scope entry to the new access node
            state.add_nedge(entry, node, Memlet())
        state.add_edge(node, None, nested_sdfg, name, Memlet.from_array(name, sdfg.arrays[name]))
    for name, wcr in output_arrays.items():
        node = state.add_write(name)
        if exit is not None:
            # Empty memlet from the new access node to the scope exit
            state.add_nedge(node, exit, Memlet())
        state.add_edge(nested_sdfg, name, node, None, Memlet(data=name, wcr=wcr))

    # Graph was not reconnected, but needs to be
    if state.in_degree(nested_sdfg) == 0 and empty_input is not None:
        state.add_edge(empty_input.src, empty_input.src_conn, nested_sdfg, None, empty_input.data)
    if state.out_degree(nested_sdfg) == 0 and empty_output is not None:
        state.add_edge(nested_sdfg, None, empty_output.dst, empty_output.dst_conn, empty_output.data)

    # Remove subgraph nodes from graph
    state.remove_nodes_from(subgraph.nodes())

    # Remove subgraph transients from top-level graph
    for transient in subgraph_transients:
        del sdfg.arrays[transient]

    # Remove newly isolated nodes due to memlet consolidation
    for edge in inputs:
        if state.in_degree(edge.src) + state.out_degree(edge.src) == 0:
            state.remove_node(edge.src)
    for edge in outputs:
        if state.in_degree(edge.dst) + state.out_degree(edge.dst) == 0:
            state.remove_node(edge.dst)

    return nested_sdfg
# # b = math.exp(a) # Constructs an SDFG manually and runs it if __name__ == '__main__': print('Dynamic SDFG test with math functions') # Externals (parameters, symbols) N = dp.symbol('N') N.set(20) input = np.random.rand(N.get()).astype(np.float32) output = dp.ndarray([N], dp.float32) output[:] = dp.float32(0) # Construct SDFG mysdfg = SDFG('mymodexp') state = mysdfg.add_state() A = state.add_array('A', [N], dp.float32) B = state.add_array('B', [N], dp.float32) # Easy way to add a tasklet tasklet, map_entry, map_exit = state.add_mapped_tasklet( 'mytasklet', dict(i='0:N'), dict(a=Memlet.simple(A, 'i % N')), 'b = math.exp(a)', dict(b=Memlet.simple(B, 'i'))) # Add outer edges state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N')) state.add_edge(map_exit, None, B, None, Memlet.simple(B, '0:N')) # Left for debugging purposes mysdfg.draw_to_file()
from dace.memlet import Memlet
from dace.data import Scalar

# Constructs an SDFG with two consecutive tasklets
if __name__ == '__main__':
    print('SDFG consecutive tasklet test')
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    input = dp.ndarray([N], dp.int32)
    output = dp.ndarray([N], dp.int32)
    input[:] = dp.int32(5)
    output[:] = dp.int32(0)

    # Construct SDFG
    mysdfg = SDFG('ctasklet')
    state = mysdfg.add_state()
    A_ = state.add_array('A', [N], dp.int32)
    B_ = state.add_array('B', [N], dp.int32)
    # Scalar container referenced by the memlet between the two tasklets
    mysdfg.add_scalar('something', dp.int32)

    map_entry, map_exit = state.add_map('mymap', dict(i='0:N'))
    tasklet = state.add_tasklet('mytasklet', {'a'}, {'b'}, 'b = 5*a')
    state.add_edge(map_entry, None, tasklet, 'a', Memlet.simple(A_, 'i'))
    tasklet2 = state.add_tasklet('mytasklet2', {'c'}, {'d'}, 'd = 2*c')
    state.add_edge(tasklet, 'b', tasklet2, 'c', Memlet.simple('something', '0'))
    state.add_edge(tasklet2, 'd', map_exit, None, Memlet.simple(B_, 'i'))

    # Add outer edges
    # NOTE(review): only the edge from A into the map is added in this
    # fragment; the matching map_exit -> B edge is not visible here.
    state.add_edge(A_, None, map_entry, None, Memlet.simple(A_, '0:N'))
def expansion(node: 'Reduce',
              state: SDFGState,
              sdfg: SDFG,
              partial_width=16):
    '''
    Expands a Reduce node into a nested SDFG that accumulates into a small
    buffer of partial results and then folds that buffer into one value.

    The nested state contains, optionally wrapped in an outer map over the
    output range when only some axes are reduced:

    * an unrolled map that initializes ``partial_results`` with the node
      identity,
    * an inner map over the reduced axes that accumulates every input
      element into ``partial_results[index % partial_width]``,
    * an unrolled map that folds ``partial_results`` into ``reduce_result``,
      which is then written to ``_out``.

    The connectors of the incoming/outgoing edges of ``node`` are renamed to
    ``_in``/``_out`` and the connectors are added to the node.

    :param node: the node to expand
    :param state: the state in which the node is in
    :param sdfg: the SDFG in which the node is in
    :param partial_width: Width of the inner reduction buffer. Must be
                          larger than the latency of the reduction operation
                          on the given data type
    :return: the nested SDFG implementing the reduction.
    :raise NotImplementedError: if the input data type has ``veclen > 1``.
    :raise ValueError: if the reduction type detected from ``node.wcr`` has
                       no entry in ``_REDUCTION_TYPE_EXPR``.
    '''
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes: no axes given means "reduce over all of them"
    axes = node.axes if node.axes else list(range(input_dims))

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)
    if input_data.dtype.veclen > 1:
        raise NotImplementedError(
            'Vectorization currently not implemented for FPGA expansion of Reduce.'
        )

    nstate = nsdfg.add_state()

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        all_axis = False
        # Interleave input and output axes to match input memlet
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append(f'_i{ictr}')
                ictr += 1
            else:
                input_subset.append(f'_o{octr}')
                octr += 1

        ome, omx = nstate.add_map(
            'reduce_output', {
                f'_o{i}': f'0:{symstr(sz)}'
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm_idx = ','.join([f'_o{i}' for i in range(output_dims)])
        outm = dace.Memlet(f'_out[{outm_idx}]')
        inm_idx = ','.join(input_subset)
        inmm = dace.Memlet(f'_in[{inm_idx}]')
    else:
        all_axis = True
        ome, omx = None, None
        outm = dace.Memlet('_out[0]')
        inm_idx = ','.join([f'_i{i}' for i in range(len(axes))])
        inmm = dace.Memlet(f'_in[{inm_idx}]')

    # Add inner map, which corresponds to the range to reduce
    r = nstate.add_read('_in')
    # '_out' is only ever written in this state, so it is a write node
    w = nstate.add_write('_out')

    # TODO support vectorization
    buffer_name = 'partial_results'
    nsdfg.add_array(buffer_name, (partial_width, ),
                    input_data.dtype,
                    transient=True,
                    storage=dtypes.StorageType.FPGA_Local)
    buffer = nstate.add_access(buffer_name)
    buffer_write = nstate.add_write(buffer_name)

    # Initialize explicitly partial results, as the inner map could run for
    # a number of iteration < partial_width
    init_me, init_mx = nstate.add_map(
        'partial_results_init', {'i': f'0:{partial_width}'},
        schedule=dtypes.ScheduleType.FPGA_Device,
        unroll=True)
    init_tasklet = nstate.add_tasklet('init_pr', {}, {'pr_out'},
                                      f'pr_out = {node.identity}')
    nstate.add_memlet_path(init_me, init_tasklet, memlet=dace.Memlet())
    nstate.add_memlet_path(init_tasklet,
                           init_mx,
                           buffer,
                           src_conn='pr_out',
                           memlet=dace.Memlet(f'{buffer_name}[i]'))

    if not all_axis:
        # Keep the initialization inside the outer (output) map
        nstate.add_memlet_path(ome, init_me, memlet=dace.Memlet())

    ime, imx = nstate.add_map(
        'reduce_values', {
            f'_i{i}': f'0:{symstr(inedge.data.subset.size()[axis])}'
            for i, axis in enumerate(sorted(axes))
        })

    # Accumulate over partial results
    redtype = detect_reduction_type(node.wcr)
    if redtype not in ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR:
        raise ValueError('Reduction type not supported for "%s"' % node.wcr)
    else:
        reduction_expr = ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR[
            redtype]

    # generate flatten index considering inner map: will be used for
    # indexing into partial results
    ranges_size = ime.range.size()
    inner_index = '+'.join(
        [f'_i{i} * {ranges_size[i + 1]}' for i in range(len(axes) - 1)])
    inner_op = ' + ' if len(axes) > 1 else ''
    inner_index = inner_index + f'{inner_op}_i{(len(axes) - 1)}'
    partial_reduce_tasklet = nstate.add_tasklet(
        'partial_reduce', {'data_in', 'buffer_in'}, {'buffer_out'}, f'''\
prev = buffer_in
buffer_out = {reduction_expr}''')

    if not all_axis:
        # Connect input and partial sums
        nstate.add_memlet_path(r,
                               ome,
                               ime,
                               partial_reduce_tasklet,
                               dst_conn='data_in',
                               memlet=inmm)
    else:
        nstate.add_memlet_path(r,
                               ime,
                               partial_reduce_tasklet,
                               dst_conn='data_in',
                               memlet=inmm)
    nstate.add_memlet_path(
        buffer,
        ime,
        partial_reduce_tasklet,
        dst_conn='buffer_in',
        memlet=dace.Memlet(
            f'{buffer_name}[({inner_index})%{partial_width}]'))
    nstate.add_memlet_path(
        partial_reduce_tasklet,
        imx,
        buffer_write,
        src_conn='buffer_out',
        memlet=dace.Memlet(
            f'{buffer_name}[({inner_index})%{partial_width}]'))

    # Then perform reduction on partial results
    reduce_entry, reduce_exit = nstate.add_map(
        'reduce', {'i': f'0:{partial_width}'},
        schedule=dtypes.ScheduleType.FPGA_Device,
        unroll=True)

    reduce_tasklet = nstate.add_tasklet(
        'reduce', {'reduce_in', 'data_in'}, {'reduce_out'}, f'''\
prev = reduce_in if i > 0 else {node.identity}
reduce_out = {reduction_expr}''')
    nstate.add_memlet_path(buffer_write,
                           reduce_entry,
                           reduce_tasklet,
                           dst_conn='data_in',
                           memlet=dace.Memlet(f'{buffer_name}[i]'))

    reduce_name = 'reduce_result'
    nsdfg.add_array(reduce_name, (1, ),
                    output_data.dtype,
                    transient=True,
                    storage=dtypes.StorageType.FPGA_Local)
    reduce_read = nstate.add_access(reduce_name)
    reduce_access = nstate.add_access(reduce_name)

    if not all_axis:
        # Keep the result accumulator inside the outer (output) map
        nstate.add_memlet_path(ome, reduce_read, memlet=dace.Memlet())

    nstate.add_memlet_path(reduce_read,
                           reduce_entry,
                           reduce_tasklet,
                           dst_conn='reduce_in',
                           memlet=dace.Memlet(f'{reduce_name}[0]'))
    nstate.add_memlet_path(reduce_tasklet,
                           reduce_exit,
                           reduce_access,
                           src_conn='reduce_out',
                           memlet=dace.Memlet(f'{reduce_name}[0]'))

    if not all_axis:
        # Write out the result
        nstate.add_memlet_path(reduce_access, omx, w, memlet=outm)
    else:
        nstate.add_memlet_path(reduce_access, w, memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')
    nsdfg.validate()

    return nsdfg
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """
    Expands a Reduce node into a nested SDFG that accumulates into a
    one-element register transient (``acc``).

    An outer map iterates over the output range; inside it ``acc`` is reset
    to the node identity, a sequential inner map over the reduced axes
    writes every input element into ``acc`` with the node's WCR, and the
    accumulated value is copied to ``_out``. ``MapCollapse`` is applied
    repeatedly to the nested SDFG before it is returned.

    :param node: the Reduce node to expand (``node.identity`` must be set).
    :param state: the state containing the node.
    :param sdfg: the SDFG containing the state.
    :return: the nested SDFG implementing the reduction.
    """
    node.validate(sdfg, state)
    in_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    out_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]

    # Work on squeezed copies of the subsets.
    # NOTE(review): squeeze() presumably drops unit dimensions in place and
    # returns the indices of the dimensions it kept - confirm.
    in_subset = dcpy(in_edge.data.subset)
    kept_in = in_subset.squeeze()
    out_subset = dcpy(out_edge.data.subset)
    kept_out = out_subset.squeeze()
    output_dims = len(out_subset)
    in_desc = sdfg.arrays[in_edge.data.data]
    out_desc = sdfg.arrays[out_edge.data.data]

    if len(kept_out) == 0:  # Fix for scalars
        kept_out = [0]

    # Standardize and squeeze axes
    if node.axes:
        requested_axes = node.axes
    else:
        requested_axes = list(range(len(in_edge.data.subset)))
    axes = [ax for ax in requested_axes if ax in kept_in]

    assert node.identity is not None

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    in_strides = [
        stride for dim, stride in enumerate(in_desc.strides)
        if dim in kept_in
    ]
    nsdfg.add_array('_in',
                    in_subset.size(),
                    in_desc.dtype,
                    strides=in_strides,
                    storage=in_desc.storage)

    out_strides = [
        stride for dim, stride in enumerate(out_desc.strides)
        if dim in kept_out
    ]
    nsdfg.add_array('_out',
                    out_subset.size(),
                    out_desc.dtype,
                    strides=out_strides,
                    storage=out_desc.storage)

    nsdfg.add_transient('acc', [1], nsdfg.arrays['_in'].dtype,
                        dtypes.StorageType.Register)

    nstate = nsdfg.add_state()

    # Interleave input and output axes to match input memlet
    reduced_count, kept_count = 0, 0
    input_indices = []
    for dim in kept_in:
        if dim in axes:
            input_indices.append(f'_i{reduced_count}')
            reduced_count += 1
        else:
            input_indices.append(f'_o{kept_count}')
            kept_count += 1

    # Outer map over the output range
    output_ranges = {
        f'_o{idx}': f'0:{symstr(extent)}'
        for idx, extent in enumerate(out_subset.size())
    }
    ome, omx = nstate.add_map('reduce_output', output_ranges)
    out_indices = ','.join([f'_o{idx}' for idx in range(output_dims)])
    outm = dace.Memlet.simple('_out', out_indices)
    inmm = dace.Memlet.simple('_in', ','.join(input_indices))

    # Reset the accumulator to the identity for every output element
    idt = nstate.add_tasklet('reset', {}, {'o'}, f'o = {node.identity}')
    nstate.add_edge(ome, None, idt, None, dace.Memlet())

    accread = nstate.add_access('acc')
    accwrite = nstate.add_access('acc')
    nstate.add_edge(idt, 'o', accread, None, dace.Memlet('acc'))

    # Add inner map, which corresponds to the range to reduce, containing
    # an identity tasklet
    in_extents = in_subset.size()
    reduce_ranges = {
        f'_i{idx}': f'0:{symstr(in_extents[kept_in.index(axis)])}'
        for idx, axis in enumerate(sorted(axes))
    }
    ime, imx = nstate.add_map('reduce_values',
                              reduce_ranges,
                              schedule=dtypes.ScheduleType.Sequential)

    # Add identity tasklet for reduction
    t = nstate.add_tasklet('identity', {'a', 'b'}, {'o'}, 'o = b')

    # Connect everything
    r = nstate.add_read('_in')
    w = nstate.add_write('_out')
    nstate.add_memlet_path(r, ome, ime, t, dst_conn='b', memlet=inmm)
    nstate.add_memlet_path(accread,
                           ime,
                           t,
                           dst_conn='a',
                           memlet=dace.Memlet('acc[0]'))
    nstate.add_memlet_path(t,
                           imx,
                           accwrite,
                           src_conn='o',
                           memlet=dace.Memlet('acc[0]', wcr=node.wcr))
    nstate.add_memlet_path(accwrite, omx, w, memlet=outm)

    # Rename outer connectors and add to node
    inedge = in_edge
    outedge = out_edge
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    from dace.transformation import dataflow
    nsdfg.apply_transformations_repeated(dataflow.MapCollapse)

    return nsdfg
def _expand_reduce(self, sdfg, state, node):
    """
    Expands a Reduce node into two nested maps inside a nested SDFG and
    replaces the node with that nested SDFG in ``state``.

    Taken from the legacy ``expand_reduce.py``. The outer map (only present
    when a strict subset of the axes is reduced) iterates over the output
    range, the inner map over the reduced axes; an identity tasklet forwards
    each input element to ``_out`` through a memlet carrying the node's WCR.

    :param sdfg: the SDFG containing ``state``.
    :param state: the state containing ``node``.
    :param node: the Reduce node to expand; its identity must be None.
    :return: the nested SDFG node that replaced ``node``.
    :raise ValueError: if ``node.identity`` is not None.
    """
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # No initialization state is generated here, so an identity cannot be
    # honoured; reject it before building anything.
    if node.identity is not None:
        raise ValueError("Node identity has to be None at this point.")

    # Standardize axes: no axes given means "reduce over all of them"
    axes = node.axes if node.axes else list(range(input_dims))

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    nstate = nsdfg.add_state()
    # END OF INIT

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        # Interleave input and output axes to match input memlet
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append('_i%d' % ictr)
                ictr += 1
            else:
                input_subset.append('_o%d' % octr)
                octr += 1

        ome, omx = nstate.add_map(
            'reduce_output', {
                '_o%d' % i: '0:%s' % symstr(sz)
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm = Memlet.simple('_out',
                             ','.join(
                                 ['_o%d' % i for i in range(output_dims)]),
                             wcr_str=node.wcr)
        inmm = Memlet.simple('_in', ','.join(input_subset))
    else:
        ome, omx = None, None
        outm = Memlet.simple('_out', '0', wcr_str=node.wcr)
        inmm = Memlet.simple(
            '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

    # Add inner map, which corresponds to the range to reduce, containing
    # an identity tasklet
    ime, imx = nstate.add_map(
        'reduce_values', {
            '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
            for i, axis in enumerate(sorted(axes))
        })

    # Add identity tasklet for reduction
    t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

    # Connect everything
    r = nstate.add_read('_in')
    # '_out' is only ever written in this state, so it is a write node
    w = nstate.add_write('_out')
    if ome:
        nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
    else:
        nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    # Replace the Reduce node by a nested SDFG node and redirect its edges
    nsdfg_node = state.add_nested_sdfg(nsdfg,
                                       sdfg,
                                       node.in_connectors,
                                       node.out_connectors,
                                       schedule=node.schedule,
                                       name=node.name)

    utils.change_edge_dest(state, node, nsdfg_node)
    utils.change_edge_src(state, node, nsdfg_node)
    state.remove_node(node)

    return nsdfg_node
def __init__(self, name, model: onnx.ModelProto, cuda=False):
    """
    Constructs a new ONNXImporter.

    Builds a single-state SDFG from the ONNX graph: registers the graph
    inputs/outputs and initializers, creates one library node per ONNX
    node, and connects each node to access nodes for its inputs and outputs.

    :param name: the name for the SDFG.
    :param model: the model to import.
    :param cuda: if `True`, weights will be passed as cuda arrays.
    :raise ValueError: if a value has no name, an operator is unsupported,
                       an array cannot be found, or a node has more
                       inputs/outputs than its schema allows without a
                       trailing variadic parameter.
    """
    graph: onnx.GraphProto = model.graph

    self.sdfg = SDFG(name)
    self.cuda = cuda
    self.state = self.sdfg.add_state()

    # Add all values to the SDFG, check for unsupported ops
    ##########################################

    self.value_infos = {}

    self.inputs = []
    self.outputs = []

    # Graph inputs first, then graph outputs
    for values, collected_names in ((graph.input, self.inputs),
                                    (graph.output, self.outputs)):
        for value in values:
            if not value.HasField("name"):
                raise ValueError("Got input or output without name")
            collected_names.append(value.name)

            self.value_infos[value.name] = value
            self._add_value_info(value)

    # Intermediate values are only recorded; they are added lazily below
    for value in graph.value_info:
        if not value.HasField("name"):
            raise ValueError("Got input or output without name")
        if value.name not in self.value_infos:
            self.value_infos[value.name] = value

    # add weights
    self.weights = {}
    for initializer in graph.initializer:
        self._add_constant_tensor(initializer)

    access_nodes = {}
    self._idx_to_node = []
    for node_idx, onnx_node in enumerate(graph.node):
        op_type = onnx_node.op_type
        if not has_onnx_node(op_type):
            raise ValueError(f"Unsupported ONNX operator: '{op_type}'")

        # extract the op attributes
        op_attributes = {}
        for attribute_proto in onnx_node.attribute:
            op_attributes[attribute_proto.name] = convert_attribute_proto(
                attribute_proto)

        if onnx_node.HasField("name"):
            node_name = clean_onnx_name(onnx_node.name)
        else:
            node_name = f"{op_type}_{node_idx}"

        # construct the dace node
        op_node = get_onnx_node(op_type)(node_name, **op_attributes)
        self.state.add_node(op_node)
        self._idx_to_node.append(op_node)

        # Inputs first, then outputs; param_idx restarts at 0 for outputs
        for is_input, value_names in ((True, onnx_node.input),
                                      (False, onnx_node.output)):
            for param_idx, value_name in enumerate(value_names):
                array_name = clean_onnx_name(value_name)
                if array_name not in self.sdfg.arrays:
                    if value_name not in self.value_infos:
                        raise ValueError(
                            f"Could not find array with name '{value_name}'")
                    self._add_value_info(self.value_infos[value_name])

                # get the access node
                access = access_nodes.get(value_name)
                if access is not None:
                    self._update_access_type(access, is_input)
                else:
                    if is_input:
                        access_type = AccessType.ReadOnly
                    else:
                        access_type = AccessType.WriteOnly
                    access = nd.AccessNode(array_name, access_type)
                    self.state.add_node(access)
                    access_nodes[value_name] = access

                # get the connector name
                if is_input:
                    params = op_node.schema.inputs
                else:
                    params = op_node.schema.outputs
                params_len = len(params)

                if param_idx >= params_len:
                    # this is a variadic parameter. Then the last parameter
                    # of the parameter must be variadic.
                    last_param = params[-1]
                    if last_param.param_type != ONNXParameterType.Variadic:
                        i_or_o = "input" if is_input else "output"
                        raise ValueError(
                            f"Expected the last {i_or_o} parameter to be variadic,"
                            f" since the {i_or_o} with idx {param_idx} has more parameters than the schema ({params_len})"
                        )
                    variadic_idx = param_idx - params_len + 1
                    conn_name = last_param.name + "__" + str(variadic_idx)
                else:
                    param = params[param_idx]
                    if param.param_type == ONNXParameterType.Variadic:
                        # this is a variadic parameter, and it is within the
                        # range of params, so it must be the first instance
                        # of a variadic parameter
                        conn_name = param.name + "__0"
                    else:
                        conn_name = param.name

                data_desc = self.sdfg.arrays[array_name]

                # add the connector if required, and add an edge
                if is_input:
                    if conn_name not in op_node.in_connectors:
                        op_node.add_in_connector(conn_name)
                    self.state.add_edge(
                        access, None, op_node, conn_name,
                        dace.Memlet.from_array(array_name, data_desc))
                else:
                    if conn_name not in op_node.out_connectors:
                        op_node.add_out_connector(conn_name)
                    self.state.add_edge(
                        op_node, conn_name, access, None,
                        dace.Memlet.from_array(array_name, data_desc))

    if not self.cuda:
        return

    self.sdfg.apply_strict_transformations()
    self.sdfg.apply_gpu_transformations()
    self.sdfg.apply_strict_transformations()

    # set all gpu transients to be persistent
    for _, _, arr in self.sdfg.arrays_recursive():
        if arr.transient and arr.storage == StorageType.GPU_Global:
            arr.lifetime = AllocationLifetime.Persistent
def test_nested_sdfg():
    """
    Builds an outer SDFG whose map (k in 0:2) contains a nested SDFG with
    two chained tasklets (5*x followed by 2*x) and checks that the output
    equals 10 * input, both before and after strict transformations.
    """
    print('SDFG consecutive tasklet (nested SDFG) test')
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    in_data = dp.ndarray([N], dp.int32)
    out_data = dp.ndarray([N], dp.int32)
    in_data[:] = dp.int32(5)
    out_data[:] = dp.int32(0)

    # Construct outer SDFG
    mysdfg = SDFG('ctasklet')
    outer_state = mysdfg.add_state()
    A_ = outer_state.add_array('A', [N], dp.int32)
    B_ = outer_state.add_array('B', [N], dp.int32)

    # Construct inner SDFG
    inner_sdfg = dp.SDFG('ctasklet_inner')
    inner_state = inner_sdfg.add_state()
    a = inner_state.add_array('a', [N], dp.int32)
    b = inner_state.add_array('b', [N], dp.int32)
    map_entry, map_exit = inner_state.add_map('mymap', {'i': '0:N/2'})

    # First tasklet reads from 'a', second tasklet writes to 'b'; the two
    # are connected directly with an empty memlet.
    first = inner_state.add_tasklet('mytasklet', {'aa'}, {'bb'}, 'bb = 5*aa')
    inner_state.add_memlet_path(a,
                                map_entry,
                                first,
                                dst_conn='aa',
                                memlet=Memlet('a[k*N/2+i]'))
    second = inner_state.add_tasklet('mytasklet2', {'cc'}, {'dd'},
                                     'dd = 2*cc')
    inner_state.add_edge(first, 'bb', second, 'cc', Memlet())
    inner_state.add_memlet_path(second,
                                map_exit,
                                b,
                                src_conn='dd',
                                memlet=Memlet('b[k*N/2+i]'))

    # Add outer edges
    omap_entry, omap_exit = outer_state.add_map('omap', {'k': '0:2'})
    nsdfg_node = outer_state.add_nested_sdfg(inner_sdfg, None, {'a'}, {'b'})
    outer_state.add_memlet_path(A_,
                                omap_entry,
                                nsdfg_node,
                                dst_conn='a',
                                memlet=Memlet('A[0:N]'))
    outer_state.add_memlet_path(nsdfg_node,
                                omap_exit,
                                B_,
                                src_conn='b',
                                memlet=Memlet('B[0:N]'))

    mysdfg.validate()

    def run_and_check():
        # Execute the SDFG and compare against the expected 10 * input
        mysdfg(A=in_data, B=out_data, N=N)
        diff = np.linalg.norm(10 * in_data - out_data) / N.get()
        print("Difference:", diff)
        assert diff <= 1e-5

    run_and_check()

    # The result must be unchanged after strict transformations
    mysdfg.apply_strict_transformations()
    run_and_check()
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """
    Expands a Reduce node into a nested SDFG made of (up to) two nested
    maps and an identity tasklet writing with the node's WCR.

    If ``node.identity`` is set, an extra initialization state that fills
    ``_out`` with the identity is added before the reduction state. The
    outer map (only present when a strict subset of the axes is reduced)
    iterates over the output range, the inner map over the reduced axes.
    The connectors of the incoming/outgoing edges of ``node`` are renamed
    to ``_in``/``_out`` and the connectors are added to the node.

    :param node: the Reduce node to expand.
    :param state: the state containing the node.
    :param sdfg: the SDFG containing the state.
    :return: the nested SDFG implementing the reduction.
    """
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes: no axes given means "reduce over all of them"
    axes = node.axes if node.axes else list(range(input_dims))

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    # If identity is defined, add an initialization state
    if node.identity is not None:
        init_state = nsdfg.add_state()
        nstate = nsdfg.add_state()
        nsdfg.add_edge(init_state, nstate, dace.InterstateEdge())

        # Add initialization as a map
        init_state.add_mapped_tasklet(
            'reduce_init', {
                '_o%d' % i: '0:%s' % symstr(d)
                for i, d in enumerate(outedge.data.subset.size())
            }, {},
            'out = %s' % node.identity, {
                'out':
                dace.Memlet.simple(
                    '_out', ','.join(
                        ['_o%d' % i for i in range(output_dims)]))
            },
            external_edges=True)
    else:
        nstate = nsdfg.add_state()
    # END OF INIT

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        # Interleave input and output axes to match input memlet
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append('_i%d' % ictr)
                ictr += 1
            else:
                input_subset.append('_o%d' % octr)
                octr += 1

        ome, omx = nstate.add_map(
            'reduce_output', {
                '_o%d' % i: '0:%s' % symstr(sz)
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm = dace.Memlet.simple(
            '_out',
            ','.join(['_o%d' % i for i in range(output_dims)]),
            wcr_str=node.wcr)
        inmm = dace.Memlet.simple('_in', ','.join(input_subset))
    else:
        ome, omx = None, None
        outm = dace.Memlet.simple('_out', '0', wcr_str=node.wcr)
        inmm = dace.Memlet.simple(
            '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

    # Add inner map, which corresponds to the range to reduce, containing
    # an identity tasklet
    ime, imx = nstate.add_map(
        'reduce_values', {
            '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
            for i, axis in enumerate(sorted(axes))
        })

    # Add identity tasklet for reduction
    t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

    # Connect everything
    r = nstate.add_read('_in')
    # '_out' is only ever written in this state, so it is a write node
    w = nstate.add_write('_out')
    if ome:
        nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
    else:
        nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    return nsdfg
def make_write_sdfg():
    """
    Builds the "filter_write" SDFG: a loop over ``i`` (step ``W``, running
    while ``i < N + W``) whose body pops one value from ``_B_pipe`` and one
    flag from ``_valid_pipe`` and, when the flag is set, stores the value in
    ``B_mem`` at the running write index ``i_write`` and increments it.

    :return: the constructed SDFG.
    """
    sdfg = SDFG("filter_write")

    loop_begin = sdfg.add_state("loop_begin")
    loop_entry = sdfg.add_state("loop_entry")
    body = sdfg.add_state("loop_body")
    loop_end = sdfg.add_state("loop_end")

    # All access nodes of the write counter share the same descriptor
    counter_args = dict(dtype=dace.dtypes.uint32,
                        transient=True,
                        storage=StorageType.FPGA_Registers)

    def python_condition(expression):
        # Wrap a Python expression string as an inter-state edge condition
        return dace.properties.CodeProperty.from_string(
            expression, language=dace.dtypes.Language.Python)

    # Zero the write counter before entering the loop
    i_write_zero = loop_begin.add_scalar("i_write", **counter_args)
    zero_tasklet = loop_begin.add_tasklet("zero", {}, {"i_write_out"},
                                          "i_write_out = 0")
    loop_begin.add_memlet_path(zero_tasklet,
                               i_write_zero,
                               src_conn="i_write_out",
                               memlet=Memlet.simple(i_write_zero, "0"))

    # Loop control: init, continue, exit, increment
    sdfg.add_edge(loop_begin, loop_entry,
                  dace.sdfg.InterstateEdge(assignments={"i": 0}))
    sdfg.add_edge(
        loop_entry, body,
        dace.sdfg.InterstateEdge(condition=python_condition("i < N + W")))
    sdfg.add_edge(
        loop_entry, loop_end,
        dace.sdfg.InterstateEdge(condition=python_condition("i >= N + W")))
    sdfg.add_edge(body, loop_entry,
                  dace.sdfg.InterstateEdge(assignments={"i": "i + W"}))

    # Loop body containers
    B = body.add_array("B_mem", [N / W],
                       dtype=vtype,
                       storage=StorageType.FPGA_Global)
    B_pipe = body.add_stream("_B_pipe",
                             dtype=vtype,
                             buffer_size=buffer_size,
                             storage=StorageType.FPGA_Local)
    valid_pipe = body.add_stream("_valid_pipe",
                                 dtype=dace.dtypes.bool,
                                 buffer_size=buffer_size,
                                 storage=StorageType.FPGA_Local)
    i_write_in = body.add_scalar("i_write", **counter_args)
    i_write_out = body.add_scalar("i_write", **counter_args)

    write_code = "\n".join([
        "if valid_in:",
        "\tb_out[i_write_in] = b_in",
        "\ti_write_out = i_write_in + 1",
        "else:",
        "\ti_write_out = i_write_in",
    ])
    tasklet = body.add_tasklet("write", {"b_in", "valid_in", "i_write_in"},
                               {"b_out", "i_write_out"}, write_code)

    # Tasklet inputs
    body.add_memlet_path(B_pipe,
                         tasklet,
                         dst_conn="b_in",
                         memlet=Memlet.simple(B_pipe, "0"))
    body.add_memlet_path(valid_pipe,
                         tasklet,
                         dst_conn="valid_in",
                         memlet=Memlet.simple(valid_pipe, "0"))
    body.add_memlet_path(i_write_in,
                         tasklet,
                         dst_conn="i_write_in",
                         memlet=Memlet.simple(i_write_in, "0"))

    # Tasklet outputs
    body.add_memlet_path(tasklet,
                         i_write_out,
                         src_conn="i_write_out",
                         memlet=Memlet.simple(i_write_out, "0"))
    body.add_memlet_path(tasklet,
                         B,
                         src_conn="b_out",
                         memlet=Memlet.simple(B, "0:N"))

    return sdfg
def nest_state_subgraph(sdfg: SDFG,
                        state: SDFGState,
                        subgraph: SubgraphView,
                        name: Optional[str] = None,
                        full_data: bool = False) -> nodes.NestedSDFG:
    """ Turns a state subgraph into a nested SDFG. Operates in-place.

        Edges entering the subgraph's source nodes and leaving its sink
        nodes become ``__in_*`` / ``__out_*`` connectors of the nested SDFG.
        Edges with an empty memlet carry no data and get no connector; one
        of them is re-attached to the nested SDFG node only if that node
        would otherwise be left without any incoming (or outgoing) edge.

        :param sdfg: The SDFG containing the state subgraph.
        :param state: The state containing the subgraph.
        :param subgraph: Subgraph to nest.
        :param name: An optional name for the nested SDFG.
        :param full_data: If True, nests entire input/output data.
        :return: The nested SDFG node.
        :raise KeyError: Some or all nodes in the subgraph are not located in
                         this state, or the state does not belong to the
                         given SDFG.
        :raise ValueError: The subgraph is contained in more than one scope.
    """
    if state.parent != sdfg:
        raise KeyError('State does not belong to given SDFG')
    if subgraph.graph != state:
        raise KeyError('Subgraph does not belong to given state')

    # Membership in the subgraph is tested many times below; build the
    # lookup set once instead of re-evaluating subgraph.nodes() per test.
    subgraph_nodes = set(subgraph.nodes())

    # Find the top-level scope
    scope_tree = state.scope_tree()
    scope_dict = state.scope_dict()
    scope_dict_children = state.scope_dict(True)
    top_scopenode = -1  # Initialized to -1 since "None" already means top-level

    for node in subgraph.nodes():
        if node not in scope_dict:
            raise KeyError('Node not found in state')

        # If scope entry/exit, ensure entire scope is in subgraph
        if isinstance(node, nodes.EntryNode):
            scope_nodes = scope_dict_children[node]
            if any(n not in subgraph_nodes for n in scope_nodes):
                raise ValueError('Subgraph contains partial scopes (entry)')
        elif isinstance(node, nodes.ExitNode):
            entry = state.entry_node(node)
            scope_nodes = scope_dict_children[entry] + [entry]
            if any(n not in subgraph_nodes for n in scope_nodes):
                raise ValueError('Subgraph contains partial scopes (exit)')

        scope_node = scope_dict[node]
        if scope_node not in subgraph_nodes:
            if top_scopenode != -1 and top_scopenode != scope_node:
                raise ValueError(
                    'Subgraph is contained in more than one scope')
            top_scopenode = scope_node

    scope = scope_tree[top_scopenode]
    ###

    # Collect inputs and outputs of the nested SDFG
    inputs: List[MultiConnectorEdge] = []
    outputs: List[MultiConnectorEdge] = []
    for node in subgraph.source_nodes():
        inputs.extend(state.in_edges(node))
    for node in subgraph.sink_nodes():
        outputs.extend(state.out_edges(node))

    # Collect transients not used outside of subgraph (will be removed of
    # top-level graph)
    data_in_subgraph = set(n.data for n in subgraph.nodes()
                           if isinstance(n, nodes.AccessNode))
    # Find other occurrences in SDFG
    other_nodes = set(
        n.data for s in sdfg.nodes() for n in s.nodes()
        if isinstance(n, nodes.AccessNode) and n not in subgraph_nodes)
    subgraph_transients = set()
    for data in data_in_subgraph:
        datadesc = sdfg.arrays[data]
        if datadesc.transient and data not in other_nodes:
            subgraph_transients.add(data)

    # All transients of edges between code nodes are also added to nested
    # graph. Empty memlets reference no data and must not be registered.
    for edge in subgraph.edges():
        if (isinstance(edge.src, nodes.CodeNode)
                and isinstance(edge.dst, nodes.CodeNode)
                and edge.data.data is not None):
            subgraph_transients.add(edge.data.data)

    # Collect data used in access nodes within subgraph (will be referenced
    # in full upon nesting)
    input_arrays = set()
    output_arrays = set()
    for node in subgraph.nodes():
        if (isinstance(node, nodes.AccessNode)
                and node.data not in subgraph_transients):
            if state.out_degree(node) > 0:
                input_arrays.add(node.data)
            if state.in_degree(node) > 0:
                output_arrays.add(node.data)

    # Create the nested SDFG
    nsdfg = SDFG(name or 'nested_' + state.label)

    # Transients are added to the nested graph as-is
    for transient in subgraph_transients:
        nsdfg.add_datadesc(transient, sdfg.arrays[transient])

    # Input/output data that are not source/sink nodes are added to the graph
    # as non-transients
    for array_name in (input_arrays | output_arrays):
        datadesc = copy.deepcopy(sdfg.arrays[array_name])
        datadesc.transient = False
        nsdfg.add_datadesc(array_name, datadesc)

    # Connected source/sink nodes outside subgraph become global data
    # descriptors in nested SDFG. Each generated name is stored together
    # with the edge it was generated for, so that skipping empty memlets
    # cannot shift the name/edge correspondence.
    named_inputs = []
    named_outputs = []
    empty_input = None
    empty_output = None
    for edge in inputs:
        if edge.data.data is None:  # Skip edges with an empty memlet
            empty_input = edge
            continue
        datadesc = copy.deepcopy(sdfg.arrays[edge.data.data])
        datadesc.transient = False
        if not full_data:
            datadesc.shape = edge.data.subset.size()
        new_name = nsdfg.add_datadesc('__in_' + edge.data.data,
                                      datadesc,
                                      find_new_name=True)
        named_inputs.append((new_name, edge))
    for edge in outputs:
        if edge.data.data is None:  # Skip edges with an empty memlet
            empty_output = edge
            continue
        datadesc = copy.deepcopy(sdfg.arrays[edge.data.data])
        datadesc.transient = False
        if not full_data:
            datadesc.shape = edge.data.subset.size()
        new_name = nsdfg.add_datadesc('__out_' + edge.data.data,
                                      datadesc,
                                      find_new_name=True)
        named_outputs.append((new_name, edge))
    input_names = [conn for conn, _ in named_inputs]
    output_names = [conn for conn, _ in named_outputs]
    ###################

    # Add scope symbols to the nested SDFG
    for v in scope.defined_vars:
        if v in sdfg.symbols:
            sym = sdfg.symbols[v]
            nsdfg.add_symbol(v, sym.dtype)

    # Create nested state
    nstate = nsdfg.add_state()

    # Add subgraph nodes and edges to nested state
    nstate.add_nodes_from(subgraph.nodes())
    for e in subgraph.edges():
        nstate.add_edge(e.src, e.src_conn, e.dst, e.dst_conn, e.data)

    # Modify nested SDFG parents in subgraph
    for node in subgraph.nodes():
        if isinstance(node, nodes.NestedSDFG):
            node.sdfg.parent = nstate
            node.sdfg.parent_sdfg = nsdfg

    # Add access nodes and edges as necessary
    edges_to_offset = []
    for conn, edge in named_inputs:
        node = nstate.add_read(conn)
        new_edge = copy.deepcopy(edge.data)
        new_edge.data = conn
        edges_to_offset.append((edge,
                                nstate.add_edge(node, None, edge.dst,
                                                edge.dst_conn, new_edge)))
    for conn, edge in named_outputs:
        node = nstate.add_write(conn)
        new_edge = copy.deepcopy(edge.data)
        new_edge.data = conn
        edges_to_offset.append((edge,
                                nstate.add_edge(edge.src, edge.src_conn, node,
                                                None, new_edge)))

    # Offset memlet paths inside nested SDFG according to subsets
    for original_edge, new_edge in edges_to_offset:
        for edge in nstate.memlet_tree(new_edge):
            edge.data.data = new_edge.data.data
            if not full_data:
                edge.data.subset.offset(original_edge.data.subset, True)

    # Add nested SDFG node to the input state
    nested_sdfg = state.add_nested_sdfg(nsdfg, None,
                                        set(input_names) | input_arrays,
                                        set(output_names) | output_arrays)

    # Reconnect memlets to nested SDFG
    for conn, edge in named_inputs:
        if full_data:
            data = Memlet.from_array(edge.data.data,
                                     sdfg.arrays[edge.data.data])
        else:
            data = edge.data
        state.add_edge(edge.src, edge.src_conn, nested_sdfg, conn, data)
    for conn, edge in named_outputs:
        if full_data:
            data = Memlet.from_array(edge.data.data,
                                     sdfg.arrays[edge.data.data])
        else:
            data = edge.data
        state.add_edge(nested_sdfg, conn, edge.dst, edge.dst_conn, data)

    # Connect access nodes to internal input/output data as necessary
    entry = scope.entry
    exit = scope.exit
    for array_name in input_arrays:
        node = state.add_read(array_name)
        if entry is not None:
            state.add_nedge(entry, node, EmptyMemlet())
        state.add_edge(node, None, nested_sdfg, array_name,
                       Memlet.from_array(array_name, sdfg.arrays[array_name]))
    for array_name in output_arrays:
        node = state.add_write(array_name)
        if exit is not None:
            state.add_nedge(node, exit, EmptyMemlet())
        state.add_edge(nested_sdfg, array_name, node, None,
                       Memlet.from_array(array_name, sdfg.arrays[array_name]))

    # Graph was not reconnected, but needs to be: keep the dependency that
    # the skipped empty-memlet edges expressed.
    if state.in_degree(nested_sdfg) == 0 and empty_input is not None:
        state.add_edge(empty_input.src, empty_input.src_conn, nested_sdfg,
                       None, empty_input.data)
    if state.out_degree(nested_sdfg) == 0 and empty_output is not None:
        state.add_edge(nested_sdfg, None, empty_output.dst,
                       empty_output.dst_conn, empty_output.data)

    # Remove subgraph nodes from graph
    state.remove_nodes_from(subgraph.nodes())

    # Remove subgraph transients from top-level graph
    for transient in subgraph_transients:
        del sdfg.arrays[transient]

    return nested_sdfg