def test_mlir_tasklet_explicit_vec():
    """Check an MLIR tasklet whose operands are explicit ``vector<4xi32>`` values.

    The tasklet receives element 0 of ``A`` and ``B``, adds the two vectors
    with ``addi`` and stores the sum in element 0 of ``C``. The output is then
    compared against the NumPy sum of the inputs.
    """
    sdfg = dace.SDFG('mlir_tasklet_explicit_vec')
    state = sdfg.add_state()
    sdfg.add_array('A', [4], dace.vector(dace.int32, 4))
    sdfg.add_array('B', [4], dace.vector(dace.int32, 4))
    sdfg.add_array('C', [4], dace.vector(dace.int32, 4))

    tasklet = state.add_tasklet(name='mlir_tasklet',
                                inputs={'a', 'b'},
                                outputs={'c'},
                                code='''
module  {
    func @mlir_entry(%a: vector<4xi32>, %b: vector<4xi32>) -> vector<4xi32> {
        %0 = addi %b, %a  : vector<4xi32>
        return %0 : vector<4xi32>
    }
}
''',
                                language=dace.Language.MLIR)

    A = state.add_read('A')
    B = state.add_read('B')
    C = state.add_write('C')

    state.add_edge(A, None, tasklet, 'a', dace.Memlet('A[0]'))
    state.add_edge(B, None, tasklet, 'b', dace.Memlet('B[0]'))
    state.add_edge(tasklet, 'c', C, None, dace.Memlet('C[0]'))
    sdfg.validate()

    # Draw real integers. np.random.rand() samples lie in [0, 1), so casting
    # them to int32 would make every operand zero and the check trivially true.
    a = np.random.randint(0, 100, 4).astype(np.int32)
    b = np.random.randint(0, 100, 4).astype(np.int32)
    c = np.random.randint(0, 100, 4).astype(np.int32)

    sdfg(A=a, B=b, C=c)
    assert (c == a + b).all()
def make_fpga_state(sdfg, vec_width=1):
    """Add the ``gemm`` state, declare its three pipes and build the kernel parts.

    ``A_pipe`` carries plain ``float32`` values (with ``buffer_size="P"``),
    while ``B_pipe`` and ``C_pipe`` carry ``vec_width``-wide float32 vectors.
    All three are transient FPGA-local streams of shape ``(P + 1, )``.

    :param sdfg: SDFG that receives the state and the stream descriptors.
    :param vec_width: Vector width used for the B and C pipes.
    :return: The newly created state.
    """
    vec_type = dace.vector(dace.float32, vec_width)
    state = sdfg.add_state("gemm")

    fpga_local = dace.dtypes.StorageType.FPGA_Local
    sdfg.add_stream("A_pipe",
                    dace.float32,
                    transient=True,
                    shape=(P + 1, ),
                    storage=fpga_local,
                    buffer_size="P")
    for pipe_name in ("B_pipe", "C_pipe"):
        sdfg.add_stream(pipe_name,
                        vec_type,
                        transient=True,
                        shape=(P + 1, ),
                        storage=fpga_local)

    # Readers, compute and writer are built by the sibling helpers
    make_read_A(state)
    make_read_B(state, sdfg, vec_width)
    make_compute(sdfg, state, vec_width)
    make_write_C(state, sdfg, vec_width)
    return state
def make_vec_mul_sdfg(dtype=dace.float32):
    """Build the ``vec_mul`` SDFG: element-wise product of two vectorized arrays.

    The SDFG has one state with an FPGA-scheduled map over ``size / 4``
    elements. Each iteration multiplies one 4-wide vector of ``_device_x`` by
    the matching vector of ``_device_y`` and writes it to ``_device_z``.

    :param dtype: Base element type of the 4-wide vector type.
    :return: The validated SDFG.
    """
    width = 4
    size = dace.symbol("size")
    sdfg = dace.SDFG("vec_mul")
    vec_type = dace.vector(dtype, width)
    state = sdfg.add_state("vec_mul_state")

    # All three containers live in FPGA global memory
    for array_name in ('_device_x', '_device_y', '_device_z'):
        sdfg.add_array(array_name,
                       shape=[size / width],
                       dtype=vec_type,
                       storage=dace.dtypes.StorageType.FPGA_Global)

    x = state.add_read("_device_x")
    y = state.add_read("_device_y")
    z = state.add_write("_device_z")

    # Compute: one tasklet per vector element
    map_entry, map_exit = state.add_map('vecMul_map', {'i': f'0:{size}/{width}'},
                                        schedule=dace.dtypes.ScheduleType.FPGA_Device)
    multiply = state.add_tasklet('vecMul_task', ['x_con', 'y_con'], ['z_con'], 'z_con = x_con * y_con')

    state.add_memlet_path(x, map_entry, multiply, dst_conn='x_con', memlet=dace.Memlet(f"{x.data}[i]"))
    state.add_memlet_path(y, map_entry, multiply, dst_conn='y_con', memlet=dace.Memlet(f"{y.data}[i]"))
    state.add_memlet_path(multiply, map_exit, z, src_conn='z_con', memlet=dace.Memlet(f"{z.data}[i]"))

    # Validate
    sdfg.fill_scope_connectors()
    sdfg.validate()
    return sdfg
def make_sdfg_1d(implementation: str, vector_length: int):
    """Build a single-state SDFG with one 1D ``Stencil`` library node.

    Arrays ``a`` and ``res`` have ``SIZE / vector_length`` elements. Their
    element type is a ``vector_length``-wide vector of ``DTYPE`` when
    ``vector_length > 1`` and plain ``DTYPE`` otherwise.

    :param implementation: Value assigned to the stencil node's ``implementation``.
    :param vector_length: Vector width of the array element type.
    :return: The constructed SDFG.
    """
    if vector_length > 1:
        vtype = dace.vector(dace.typeclass(DTYPE), vector_length)
    else:
        vtype = DTYPE

    sdfg = dace.SDFG(f"stencil_node_test_1d_w{vector_length}")
    a_desc = sdfg.add_array("a", (SIZE / vector_length, ), dtype=vtype)[1]
    res_desc = sdfg.add_array("res", (SIZE / vector_length, ), dtype=vtype)[1]

    state = sdfg.add_state("stencil_node_test_1d")
    a = state.add_read("a")
    res = state.add_write("res")

    # Three-point sum scaled by 0.3333, written at offset 1
    stencil_code = ("tmp0 = (a[0] + a[1])\n"
                    "tmp1 = (tmp0 + a[2])\n"
                    "res[1] = (dace.float32(0.3333) * tmp1)")
    stencil_node = Stencil("stencil_test", stencil_code, inputs={"a"}, outputs={"res"})
    stencil_node.implementation = implementation
    state.add_node(stencil_node)

    state.add_edge(a, None, stencil_node, "a", dace.Memlet.from_array("a", a_desc))
    state.add_edge(stencil_node, "res", res, None, dace.Memlet.from_array("res", res_desc))
    return sdfg
def pure_graph(implementation, dtype, veclen):
    """Build an SDFG with a single ``blas.Dot`` node over vectorized inputs.

    ``x`` and ``y`` hold ``n / veclen`` vectors of width ``veclen``; the
    result ``r`` is a one-element array of the scalar ``dtype``.

    :param implementation: Value assigned to the Dot node's ``implementation``.
    :param dtype: Scalar element type.
    :param veclen: Vector width of the inputs.
    :return: The constructed SDFG.
    """
    sdfg = dace.SDFG(f"dot_{implementation}_{dtype.ctype}_w{veclen}")
    state = sdfg.add_state("dot")

    n = dace.symbol("n")
    a = dace.symbol("a")  # created as in the original; not referenced below

    vtype = dace.vector(dtype, veclen)
    for operand in ("x", "y"):
        sdfg.add_array(operand, [n / veclen], vtype)
    sdfg.add_array("r", [1], dtype)

    x = state.add_read("x")
    y = state.add_read("y")
    result = state.add_write("r")

    dot_node = blas.Dot("dot")
    dot_node.implementation = implementation
    dot_node.n = n

    vec_range = f"0:{n}/{veclen}"
    state.add_memlet_path(x, dot_node, dst_conn="_x", memlet=Memlet(f"x[{vec_range}]"))
    state.add_memlet_path(y, dot_node, dst_conn="_y", memlet=Memlet(f"y[{vec_range}]"))
    state.add_memlet_path(dot_node, result, src_conn="_result", memlet=Memlet("r[0]"))
    return sdfg
def test():
    """Run a manually vectorized element-wise ``min`` over two float32 arrays.

    The map iterates ``i`` over ``0:N:2`` and the tasklet connectors are
    retyped to 2-wide float32 vectors, so each iteration handles two elements.
    The output is compared against ``np.minimum``.
    """
    print('Dynamic SDFG test with vectorization and min')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    lhs = np.random.rand(N.get()).astype(np.float32)
    rhs = np.random.rand(N.get()).astype(np.float32)
    result = dp.ndarray([N], dp.float32)
    result[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('myvmin')
    for array_name in ('A', 'B', 'C'):
        mysdfg.add_array(array_name, [N], dp.float32)
    state = mysdfg.add_state()
    A = state.add_access('A')
    B = state.add_access('B')
    C = state.add_access('C')

    tasklet_inputs = {'a': Memlet.simple(A, 'i'), 'b': Memlet.simple(B, 'i')}
    tasklet_outputs = {'c': Memlet.simple(C, 'i')}
    tasklet, map_entry, map_exit = state.add_mapped_tasklet('mytasklet', {'i': '0:N:2'}, tasklet_inputs,
                                                            'c = min(a, b)', tasklet_outputs)

    # Manually vectorize tasklet
    float2 = dp.vector(dp.float32, 2)
    tasklet.in_connectors['a'] = float2
    tasklet.in_connectors['b'] = dp.vector(dp.float32, 2)
    tasklet.out_connectors['c'] = dp.vector(dp.float32, 2)

    # Add outer edges
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))
    state.add_edge(B, None, map_entry, None, Memlet.simple(B, '0:N'))
    state.add_edge(map_exit, None, C, None, Memlet.simple(C, '0:N'))

    mysdfg(A=lhs, B=rhs, C=result, N=N)

    expected = np.minimum(lhs, rhs)
    diff = np.linalg.norm(expected - result) / N.get()
    print("Difference:", diff)
    print("==== Program end ====")
    assert diff <= 1e-5
def get_dace_type(node: Union[mlir.astnodes.IntegerType, mlir.astnodes.FloatType, mlir.astnodes.VectorType]):
    """Translate an MLIR type AST node into the corresponding DaCe type.

    Integer and float nodes are looked up in ``TYPE_DICT`` by their dumped
    text. Vector nodes become ``dace.vector`` of the translated element type,
    using the first entry of ``node.dimensions`` as the width.

    :param node: MLIR integer, float or vector type node.
    :return: The DaCe type, or ``None`` for any other kind of node.
    """
    scalar_kinds = (mlir.astnodes.IntegerType, mlir.astnodes.FloatType)
    if isinstance(node, scalar_kinds):
        return TYPE_DICT[node.dump()]

    if not isinstance(node, mlir.astnodes.VectorType):
        return None

    width = node.dimensions[0]
    element_type = get_dace_type(node.element_type)
    return dace.vector(element_type, width)
def pure_graph(dtype, transposed, expansion, veclen, alpha, beta, expansion_args=None):
    """Build an SDFG with one ``blas.Gemv`` node on a column-vectorized matrix.

    ``A`` has shape ``[m, n / veclen]`` with vector elements. Without
    transposition ``x`` is vectorized and ``y`` is scalar-typed; with
    transposition the roles are swapped.

    :param dtype: Scalar element type.
    :param transposed: Passed to the node as ``transA``; also selects sizes and types.
    :param expansion: Value assigned to the node's ``implementation``.
    :param veclen: Vector width.
    :param alpha: Forwarded to ``blas.Gemv``.
    :param beta: Forwarded to ``blas.Gemv``.
    :param expansion_args: If not ``None``, the node is expanded immediately
                           with these keyword arguments.
    :return: The constructed SDFG.
    """
    sdfg = dace.SDFG(f"gemv_{expansion}_{dtype}_{transposed}_w{veclen}")

    m = dace.symbol("m")
    n = dace.symbol("n") / veclen
    vtype = dace.vector(dtype, veclen)

    state = sdfg.add_state("gemv_compute")

    A_rows, A_cols = m, n
    if transposed:
        x_size, y_size = m, n
        x_type, y_type = dtype, vtype
    else:
        x_size, y_size = n, m
        x_type, y_type = vtype, dtype

    sdfg.add_array("A", shape=[A_rows, A_cols], dtype=vtype)
    sdfg.add_array("x", shape=[x_size], dtype=x_type)
    sdfg.add_array("y", shape=[y_size], dtype=y_type)

    A = state.add_read("A")
    x = state.add_read("x")
    result = state.add_write("y")

    gemv_node = blas.Gemv("gemv", transA=transposed, alpha=alpha, beta=beta)
    gemv_node.implementation = expansion

    state.add_memlet_path(A, gemv_node, dst_conn="_A", memlet=Memlet(f"A[0:{A_rows}, 0:{A_cols}]"))
    state.add_memlet_path(x, gemv_node, dst_conn="_x", memlet=Memlet(f"x[0:{x_size}]"))
    state.add_memlet_path(gemv_node, result, src_conn="_y", memlet=Memlet(f"y[0:{y_size}]"))

    if expansion_args is not None:
        gemv_node.expand(sdfg, state, **expansion_args)

    return sdfg
def make_copy_to_host_state(sdfg):
    """Add the ``copy_to_host`` state that copies ``B_device`` into ``B``.

    Both arrays are declared here with a vector element type of width
    ``VECTOR_LENGTH.get()``. ``B_device`` is a transient in FPGA global memory
    with shape ``[SIZE]``; ``B`` has shape ``[SIZE // VECTOR_LENGTH.get()]``.

    :param sdfg: SDFG that receives the state and the array descriptors.
    :return: The newly created state.
    """
    state = sdfg.add_state("copy_to_host")
    veclen = VECTOR_LENGTH.get()

    sdfg.add_array("B_device", [SIZE],
                   dtype=dace.vector(DTYPE, veclen),
                   transient=True,
                   storage=dace.dtypes.StorageType.FPGA_Global)
    sdfg.add_array("B", [SIZE // veclen], dtype=dace.vector(DTYPE, veclen))

    read = state.add_read("B_device")
    write = state.add_write("B")

    copy_memlet = dace.memlet.Memlet.simple("B", f"0:N//{veclen}", num_accesses=SIZE // veclen)
    state.add_memlet_path(read, write, memlet=copy_memlet)
    return state
def get_dace_type(node: Union[mlir.astnodes.IntegerType, mlir.astnodes.FloatType, mlir.astnodes.VectorType]):
    """Translate an MLIR type AST node into the corresponding DaCe type.

    Integer nodes are looked up in ``TYPE_DICT`` under ``"i"`` followed by
    ``node.width.value``. Float nodes are looked up under ``node.type.name``.
    Vector nodes become ``dace.vector`` of the translated element type, using
    the first entry of ``node.dimensions`` as the width.

    :param node: MLIR integer, float or vector type node.
    :return: The DaCe type, or ``None`` for any other kind of node.
    """
    if isinstance(node, mlir.astnodes.IntegerType):
        bit_width = node.width.value
        type_key = "i" + bit_width
        return TYPE_DICT[type_key]

    if isinstance(node, mlir.astnodes.FloatType):
        return TYPE_DICT[node.type.name]

    if not isinstance(node, mlir.astnodes.VectorType):
        return None

    width = node.dimensions[0]
    element_type = get_dace_type(node.element_type)
    return dace.vector(element_type, width)
def make_copy_to_fpga_state(sdfg, vec_width=1):
    """Add the ``copy_to_device`` state that copies A, B and C to FPGA memory.

    ``A`` keeps a plain ``float32`` element type. ``B`` and ``C`` use a
    ``vec_width``-wide float32 vector and have their second dimension divided
    by ``vec_width``. The ``*_device`` counterparts are transients in FPGA
    global memory with the same shapes and types.

    :param sdfg: SDFG that receives the state and the array descriptors.
    :param vec_width: Vector width used for B and C.
    :return: The newly created state.
    """
    state = sdfg.add_state("copy_to_device")
    vec_type = dace.vector(dace.float32, vec_width)
    fpga_global = dace.dtypes.StorageType.FPGA_Global

    # Host containers: A is scalar-typed, B and C are vectorized
    sdfg.add_array("A", [N, K], dtype=dace.float32)
    sdfg.add_array("B", [K, M / vec_width], dtype=vec_type)
    sdfg.add_array("C", [N, M / vec_width], dtype=vec_type)
    A_host = state.add_read("A")
    B_host = state.add_read("B")
    C_host = state.add_read("C")

    # Device containers: B and C are vectorized along rows, A stays plain
    sdfg.add_array("A_device", [N, K], dtype=dace.float32, transient=True, storage=fpga_global)
    sdfg.add_array("B_device", [K, M / vec_width], dtype=vec_type, transient=True, storage=fpga_global)
    sdfg.add_array("C_device", [N, M / vec_width], dtype=vec_type, transient=True, storage=fpga_global)
    A_device = state.add_write("A_device")
    B_device = state.add_write("B_device")
    C_device = state.add_write("C_device")

    state.add_memlet_path(A_host, A_device, memlet=dace.Memlet("A_device[0:N, 0:K]"))
    state.add_memlet_path(B_host, B_device, memlet=dace.Memlet(f"B_device[0:K, 0:M/{vec_width}]"))
    state.add_memlet_path(C_host, C_device, memlet=dace.Memlet(f"C_device[0:N, 0:M/{vec_width}]"))
    return state
def test_map_unroll_processing_elements_decoupled():
    """Run the vectorized systolic GEMM sample with the ``p`` map scheduled as Unrolled.

    The sample module is loaded from ``samples/fpga/gemm_systolic_vectorized.py``.
    Every map whose parameters are exactly ``["p"]`` gets ``unroll = False``
    and the ``Unrolled`` schedule. The SDFG then runs with the Xilinx
    ``decouple_array_interfaces`` option temporarily enabled.

    :return: The executed SDFG.
    :raises ValueError: If the result does not match ``A @ B + C``.
    """
    # Grab the systolic GEMM implementation from the samples directory
    sample_path = Path(__file__).parent.parent.parent / "samples" / "fpga" / "gemm_systolic_vectorized.py"
    spec = importlib.util.spec_from_file_location("gemm", sample_path)
    gemm = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(gemm)

    N, K, M = 128, 256, 512
    P, W = 8, 4
    TN, TM = 32, 128

    # Create an SDFG with multiple processing elements
    sdfg = gemm.make_sdfg("map_unroll_processing_elements", dace.vector(dace.float32, W))
    sdfg.specialize({"P": P, "W": W, "TN": TN, "TM": TM})
    for state in sdfg.states():
        for node in state.nodes():
            if not isinstance(node, nodes.MapEntry):
                continue
            if node.params != ["p"]:
                continue
            node.unroll = False
            node.schedule = dace.ScheduleType.Unrolled

    # Initialize arrays with random values
    float_type = dace.float32.type
    A = np.ndarray([N, K], dtype=float_type)
    B = np.ndarray([K, M], dtype=float_type)
    C = np.ndarray([N, M], dtype=float_type)
    A[:] = np.random.rand(N, K).astype(float_type)
    B[:] = np.random.rand(K, M).astype(float_type)
    C[:] = np.random.rand(N, M).astype(float_type)

    C_regression = A @ B + C

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        sdfg(A=A, B=B, C=C, N=N, M=M, K=K)

    diff = np.linalg.norm(C_regression - C) / float(N * M)
    if np.allclose(C_regression, C):
        return sdfg
    raise ValueError("Verification failed.")
def make_sdfg_2d(implementation: str, vector_length: int):
    """Build a single-state SDFG with one 2D ``Stencil`` library node.

    ``a``, ``d`` and ``res`` have shape ``(ROWS, COLS / vector_length)``. Their
    element type is a ``vector_length``-wide vector of ``DTYPE`` when
    ``vector_length > 1`` and plain ``DTYPE`` otherwise. ``b`` is a 1D array of
    ``ROWS`` scalars and ``c`` is a symbol of type ``DTYPE``.

    :param implementation: Value assigned to the stencil node's ``implementation``.
    :param vector_length: Vector width of the array element type.
    :return: The constructed SDFG.
    """
    if vector_length > 1:
        vtype = dace.vector(dace.typeclass(DTYPE), vector_length)
    else:
        vtype = DTYPE

    sdfg = dace.SDFG(f"stencil_node_test_2d_w{vector_length}")
    a_desc = sdfg.add_array("a", (ROWS, COLS / vector_length), dtype=vtype)[1]
    b_desc = sdfg.add_array("b", (ROWS, ), dtype=DTYPE)[1]
    sdfg.add_symbol("c", DTYPE)
    d_desc = sdfg.add_array("d", (ROWS, COLS / vector_length), dtype=vtype)[1]
    res_desc = sdfg.add_array("res", (ROWS, COLS / vector_length), dtype=vtype)[1]

    state = sdfg.add_state("stencil_node_test_2d")
    a = state.add_read("a")
    b = state.add_read("b")
    d = state.add_read("d")
    res = state.add_write("res")

    stencil_code = "res[0, 0] = c * b[0] * (a[-1, 0] + a[1, 0] + a[0, -1] + a[0, 1]) + d[0, -1] + d[0, 1]"
    stencil_node = Stencil("stencil_test", stencil_code, iterator_mapping={"b": (True, False)})
    stencil_node.implementation = implementation
    state.add_node(stencil_node)

    # Every container is connected with a memlet covering the whole array
    state.add_memlet_path(a, stencil_node, dst_conn="a", memlet=dace.Memlet.from_array("a", a_desc))
    state.add_memlet_path(b, stencil_node, dst_conn="b", memlet=dace.Memlet.from_array("b", b_desc))
    state.add_memlet_path(d, stencil_node, dst_conn="d", memlet=dace.Memlet.from_array("d", d_desc))
    state.add_memlet_path(stencil_node, res, src_conn="res", memlet=dace.Memlet.from_array("res", res_desc))
    return sdfg
def pure_graph(veclen, dtype, implementation, test_case):
    """Build an SDFG with one ``Axpy`` node and expand its library nodes.

    ``x`` and ``y`` hold ``n / veclen`` vectors of width ``veclen``. ``y`` is
    both an input and the output of the node, and the scalar ``a`` is
    registered as an SDFG symbol of type ``dtype``.

    :param veclen: Vector width.
    :param dtype: Scalar element type.
    :param implementation: Value assigned to the node's ``implementation``.
    :param test_case: Label that becomes part of the SDFG name.
    :return: The SDFG after ``expand_library_nodes()``.
    """
    n = dace.symbol("n")
    a = dace.symbol("a")

    sdfg = dace.SDFG(f"axpy_test_{implementation}_{test_case}_w{veclen}")
    test_state = sdfg.add_state("test_state")

    vtype = dace.vector(dtype, veclen)
    sdfg.add_symbol(a.name, dtype)
    for array_name in ("x", "y"):
        sdfg.add_array(array_name, shape=[n / veclen], dtype=vtype)

    x_in = test_state.add_read("x")
    y_in = test_state.add_read("y")
    y_out = test_state.add_write("y")

    axpy_node = blas.axpy.Axpy("axpy", a)
    axpy_node.implementation = implementation

    vec_range = f"0:n/{veclen}"
    test_state.add_memlet_path(x_in, axpy_node, dst_conn="_x", memlet=Memlet(f"x[{vec_range}]"))
    test_state.add_memlet_path(y_in, axpy_node, dst_conn="_y", memlet=Memlet(f"y[{vec_range}]"))
    test_state.add_memlet_path(axpy_node, y_out, src_conn="_res", memlet=Memlet(f"y[{vec_range}]"))

    sdfg.expand_library_nodes()
    return sdfg
def pure_graph(implementation, dtype, veclen):
    """Build the ``ger_test`` SDFG with a single ``blas.Ger`` node.

    ``x`` is a scalar-typed array of ``m`` elements. ``y``, ``A`` and ``res``
    use ``veclen``-wide vectors and have their ``n`` dimension divided by
    ``veclen``. ``alpha`` is registered as an SDFG symbol of type ``dtype``.

    :param implementation: Value assigned to the node's ``implementation``.
    :param dtype: Scalar element type.
    :param veclen: Vector width.
    :return: Tuple ``(ger_node, state, sdfg)``.
    """
    m = dace.symbol("m")
    n = dace.symbol("n")
    vtype = dace.vector(dtype, veclen)

    sdfg = dace.SDFG("ger_test")
    state = sdfg.add_state("ger")

    sdfg.add_symbol("alpha", dtype)
    sdfg.add_array("x", shape=[m], dtype=dtype)
    sdfg.add_array("y", shape=[n / veclen], dtype=vtype)
    for matrix_name in ("A", "res"):
        sdfg.add_array(matrix_name, shape=[m, n / veclen], dtype=vtype)

    x = state.add_read("x")
    y = state.add_read("y")
    A = state.add_read("A")
    res = state.add_write("res")

    ger_node = blas.Ger(name="ger")
    ger_node.implementation = implementation

    cols = f"0:n/{veclen}"
    state.add_memlet_path(x, ger_node, dst_conn="_x", memlet=Memlet("x[0:m]"))
    state.add_memlet_path(y, ger_node, dst_conn="_y", memlet=Memlet(f"y[{cols}]"))
    state.add_memlet_path(A, ger_node, dst_conn="_A", memlet=Memlet(f"A[0:m, {cols}]"))
    state.add_memlet_path(ger_node, res, src_conn="_res", memlet=Memlet(f"res[0:m, {cols}]"))

    return ger_node, state, sdfg
def make_fpga_state(sdfg, vectorize_connector):
    """Add the ``fpga_state`` that unpacks, shuffles and repacks vectors.

    Per iteration of the outer map, one element of ``A_device`` is unpacked
    into the register array ``input_buffer``. An unrolled map copies
    ``input_buffer[(w + W // 2) % W]`` to ``output_buffer[w]``, and a final
    tasklet packs ``output_buffer`` into one element of ``B_device``.

    :param sdfg: SDFG that receives the state and the buffer descriptors.
    :param vectorize_connector: If true, the unpacked connectors are typed
                                explicitly as vectors; otherwise they are
                                left untyped.
    :return: The newly created state.
    """
    state = sdfg.add_state("fpga_state")
    veclen = VECTOR_LENGTH.get()
    registers = dace.StorageType.FPGA_Registers
    fpga_device = dace.ScheduleType.FPGA_Device

    sdfg.add_array("input_buffer", (veclen, ), DTYPE, transient=True, storage=registers)
    sdfg.add_array("output_buffer", (veclen, ), DTYPE, transient=True, storage=registers)

    read_input = state.add_read("A_device")
    read_buffer = state.add_access("input_buffer")
    write_buffer = state.add_access("output_buffer")
    write_output = state.add_write("B_device")

    outer_entry, outer_exit = state.add_map("outer_map", {"i": "0:N/W"}, schedule=fpga_device)

    # Test read from packed memory to an unpacked buffer.
    # The untyped variant leaves the connector to be inferred as an array.
    unpack_outputs = {"a_unpacked": dace.vector(DTYPE, veclen)} if vectorize_connector else {"a_unpacked"}
    unpack_tasklet = state.add_tasklet("unpack_tasklet", {"a"}, unpack_outputs, "a_unpacked = a")
    state.add_memlet_path(read_input,
                          outer_entry,
                          unpack_tasklet,
                          dst_conn="a",
                          memlet=dace.Memlet.simple("A_device", "i"))
    state.add_memlet_path(unpack_tasklet,
                          read_buffer,
                          src_conn="a_unpacked",
                          memlet=dace.Memlet.simple(read_buffer.data, f"0:{veclen}"))

    # Shuffle the unpacked buffer in an unrolled map
    unroll_entry, unroll_exit = state.add_map("shuffle_map", {"w": "0:W"}, schedule=fpga_device, unroll=True)
    tasklet = state.add_tasklet("shuffle_tasklet", {"a"}, {"b"}, "b = a")
    state.add_memlet_path(read_buffer,
                          unroll_entry,
                          tasklet,
                          dst_conn="a",
                          memlet=dace.Memlet.simple("input_buffer", "(w + W // 2) % W", num_accesses=1))
    state.add_memlet_path(tasklet,
                          unroll_exit,
                          write_buffer,
                          src_conn="b",
                          memlet=dace.Memlet.simple("output_buffer", "w", num_accesses=1))

    # Test writing from unpacked to packed from inside tasklet
    pack_inputs = {"b": dace.vector(DTYPE, veclen)} if vectorize_connector else {"b"}
    pack_tasklet = state.add_tasklet("pack_tasklet", pack_inputs, {"b_packed"}, "b_packed = b")
    state.add_memlet_path(write_buffer,
                          pack_tasklet,
                          dst_conn="b",
                          memlet=dace.Memlet.simple(write_buffer.data, f"0:{veclen}"))

    # Write back out to memory from unpacked to packed memory
    state.add_memlet_path(pack_tasklet,
                          outer_exit,
                          write_output,
                          src_conn="b_packed",
                          memlet=dace.Memlet.simple("B_device", "i"))

    return state
def make_vecAdd_sdfg(sdfg_name: str, dtype=dace.float32):
    """Build a three-state SDFG that adds two vectorized arrays on the FPGA.

    States, in creation order: ``copy_to_device`` (x, y -> device_x,
    device_y), ``copy_to_host`` (device_z -> z) and ``fpga_state`` (the
    FPGA-scheduled map computing ``z = x + y`` per 4-wide vector). Interstate
    edges chain them as copy-in -> compute -> copy-out.

    :param sdfg_name: Name of the SDFG.
    :param dtype: Base element type of the 4-wide vector type.
    :return: The validated SDFG.
    """
    width = 4
    size = dace.symbol("size")
    sdfg = dace.SDFG(sdfg_name)
    vec_type = dace.vector(dtype, width)
    fpga_global = dace.dtypes.StorageType.FPGA_Global
    full_range = f"0:{size}/{width}"

    x_name, y_name, z_name = "x", "y", "z"

    ###########################################################################
    # Copy data to FPGA
    copy_in_state = sdfg.add_state("copy_to_device")

    sdfg.add_array(x_name, shape=[size / width], dtype=vec_type)
    sdfg.add_array(y_name, shape=[size / width], dtype=vec_type)
    in_host_x = copy_in_state.add_read(x_name)
    in_host_y = copy_in_state.add_read(y_name)

    sdfg.add_array("device_x", shape=[size / width], dtype=vec_type, storage=fpga_global, transient=True)
    sdfg.add_array("device_y", shape=[size / width], dtype=vec_type, storage=fpga_global, transient=True)
    in_device_x = copy_in_state.add_write("device_x")
    in_device_y = copy_in_state.add_write("device_y")

    copy_in_state.add_memlet_path(in_host_x, in_device_x, memlet=Memlet.simple(in_host_x, full_range))
    copy_in_state.add_memlet_path(in_host_y, in_device_y, memlet=Memlet.simple(in_host_y, full_range))

    ###########################################################################
    # Copy data from FPGA
    sdfg.add_array(z_name, shape=[size / width], dtype=vec_type)
    copy_out_state = sdfg.add_state("copy_to_host")

    sdfg.add_array("device_z", shape=[size / width], dtype=vec_type, storage=fpga_global, transient=True)
    out_device = copy_out_state.add_read("device_z")
    out_host = copy_out_state.add_write(z_name)

    copy_out_state.add_memlet_path(out_device, out_host, memlet=Memlet.simple(out_host, full_range))

    ###########################################################################
    # FPGA state
    fpga_state = sdfg.add_state("fpga_state")

    x = fpga_state.add_read("device_x")
    y = fpga_state.add_read("device_y")
    z = fpga_state.add_write("device_z")

    # Compute: one tasklet per vector element
    map_entry, map_exit = fpga_state.add_map('vecAdd_map', {'i': f'0:{size}/{width}'},
                                             schedule=dace.dtypes.ScheduleType.FPGA_Device)
    add_tasklet = fpga_state.add_tasklet('vecAdd_task', ['x_con', 'y_con'], ['z_con'], 'z_con = x_con + y_con')

    fpga_state.add_memlet_path(x, map_entry, add_tasklet, dst_conn='x_con', memlet=dace.Memlet.simple(x.data, "i"))
    fpga_state.add_memlet_path(y, map_entry, add_tasklet, dst_conn='y_con', memlet=dace.Memlet.simple(y.data, 'i'))
    fpga_state.add_memlet_path(add_tasklet, map_exit, z, src_conn='z_con', memlet=dace.Memlet.simple(z.data, 'i'))

    ######################################
    # Interstate edges
    sdfg.add_edge(copy_in_state, fpga_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.add_edge(fpga_state, copy_out_state, dace.sdfg.sdfg.InterstateEdge())

    # Validate
    sdfg.fill_scope_connectors()
    sdfg.validate()
    return sdfg
# Symbolic element count shared by all arrays
N = dace.symbol('N')

# SDFG with a single state
sdfg = dace.SDFG('fladd')
state = sdfg.add_state('device_state')

# Vector width, also registered as the SDFG constant VECLEN
veclen = 1
sdfg.add_constant('VECLEN', veclen)

# Host arrays on the CPU heap
for _host_name in ('A', 'B', 'C'):
    sdfg.add_array(_host_name, [N // veclen],
                   dtype=dace.vector(dace.float32, veclen),
                   storage=dace.StorageType.CPU_Heap)

# Transient counterparts in FPGA global memory
for _device_name in ('fpga_A', 'fpga_B', 'fpga_C'):
    sdfg.add_array(_device_name, [N // veclen],
                   dtype=dace.vector(dace.float32, veclen),
                   transient=True,
                   storage=dace.StorageType.FPGA_Global)
def test_tasklet_vector():
    """
    Test RTL tasklet vector support.

    Builds an SDFG with a SystemVerilog tasklet whose input connector ``a`` is
    an ``N``-wide int32 vector (``N`` is specialized to 4) and whose output
    ``b`` is a single int32. The hardware loads ``a[0]`` into ``b`` and then
    increments ``b`` until it reaches ``a[0] + a[1]``, which the final
    assertion checks.
    """

    # add symbol
    N = dace.symbol('N')

    # add sdfg
    sdfg = dace.SDFG('rtl_tasklet_vector')

    # define compile-time constant
    sdfg.specialize(dict(N=4))

    # add state
    state = sdfg.add_state()

    # add arrays
    sdfg.add_array('A', [N], dtype=dace.int32)
    sdfg.add_array('B', [1], dtype=dace.int32)

    # add custom rtl tasklet; the code below is SystemVerilog, not Python
    tasklet = state.add_tasklet(name='rtl_tasklet',
                                inputs={'a': dace.vector(dace.int32, N)},
                                outputs={'b'},
                                code='''
        /*
            Convention:
               |---------------------------------------------------------------------|
            -->| ap_aclk (clock input)                                               |
            -->| ap_areset (reset input, rst on high)                                |
               |                                                                     |
            -->| {inputs}                                              reg {outputs} |-->
               |                                                                     |
            <--| s_axis_a_tready (ready for data)       (data avail) m_axis_b_tvalid |-->
            -->| s_axis_a_tvalid (new data avail)    (data consumed) m_axis_b_tready |<--
               |---------------------------------------------------------------------|
        */

        typedef enum [1:0] {READY, BUSY, DONE} state_e;
        state_e state;

        always@(posedge ap_aclk) begin
            if (ap_areset) begin // case: reset
                m_axis_b_tdata <= 0;
                s_axis_a_tready <= 1'b1;
                state <= READY;
            end else if (s_axis_a_tvalid && state == READY) begin // case: load a
                m_axis_b_tdata <= s_axis_a_tdata[0];
                s_axis_a_tready <= 1'b0;
                state <= BUSY;
            end else if (m_axis_b_tdata < s_axis_a_tdata[0] + s_axis_a_tdata[1] && state == BUSY) begin // case: increment counter b
                m_axis_b_tdata <= m_axis_b_tdata + 1;
            end else if (state == BUSY) begin
                m_axis_b_tdata <= m_axis_b_tdata;
                state <= DONE;
            end
        end

        assign m_axis_b_tvalid = (m_axis_b_tdata >= s_axis_a_tdata[0] + s_axis_a_tdata[1] && (state == BUSY || state == DONE)) ? 1'b1:1'b0;
        ''',
                                language=dace.Language.SystemVerilog)

    # add input/output array
    A = state.add_read('A')
    B = state.add_write('B')

    # connect input/output array with the tasklet
    # NOTE(review): the input subset is written as 0:N-1 while the connector is
    # an N-wide vector - confirm the intended range against DaCe's memlet syntax.
    state.add_edge(A, None, tasklet, 'a', dace.Memlet('A[0:N-1]'))
    state.add_edge(tasklet, 'b', B, None, dace.Memlet('B[0]'))

    # validate sdfg
    sdfg.validate()

    # Execute

    # init data structures: N random integers in [0, 100) and a zeroed output
    a = np.random.randint(0, 100, dace.symbolic.evaluate(
        N, sdfg.constants)).astype(np.int32)
    b = np.array([0]).astype(np.int32)

    # call program
    sdfg(A=a, B=b)

    # check result: the counter must stop at the sum of the first two lanes
    assert b == a[0] + a[1]
import dace import numpy as np # Define vector type float2 = dace.vector(dace.float32, 2) def test_vector_type(): sdfg = dace.SDFG('vectortypes') sdfg.add_array('A', [2], float2) sdfg.add_array('B', [2], float2) state = sdfg.add_state() r = state.add_read('A') # With type inference t1 = state.add_tasklet('something', {'a'}, {'b'}, 'b = a * 2') # Without type inference t2 = state.add_tasklet('something', dict(a=float2), dict(b=float2), 'b = a * 2') w = state.add_write('B') state.add_edge(r, None, t1, 'a', dace.Memlet('A[0]')) state.add_edge(t1, 'b', w, None, dace.Memlet('B[0]')) state.add_edge(r, None, t2, 'a', dace.Memlet('A[1]')) state.add_edge(t2, 'b', w, None, dace.Memlet('B[1]')) A = np.random.rand(4).astype(np.float32) B = np.random.rand(4).astype(np.float32) sdfg(A=A, B=B) assert np.allclose(B, 2 * A) def test_vector_type_inference():
output[:] = dp.float32(0) # Construct SDFG mysdfg = SDFG('myvmin') state = mysdfg.add_state() A = state.add_array('A', [N], dp.float32) B = state.add_array('B', [N], dp.float32) C = state.add_array('C', [N], dp.float32) tasklet, map_entry, map_exit = state.add_mapped_tasklet( 'mytasklet', dict(i='0:N:2'), dict(a=Memlet.simple(A, 'i'), b=Memlet.simple(B, 'i')), 'c = min(a, b)', dict(c=Memlet.simple(C, 'i'))) # Manually vectorize tasklet tasklet.in_connectors['a'] = dp.vector(dp.float32, 2) tasklet.in_connectors['b'] = dp.vector(dp.float32, 2) tasklet.out_connectors['c'] = dp.vector(dp.float32, 2) # Add outer edges state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N')) state.add_edge(B, None, map_entry, None, Memlet.simple(B, '0:N')) state.add_edge(map_exit, None, C, None, Memlet.simple(C, '0:N')) mysdfg(A=input, B=input2, C=output, N=N) diff = np.linalg.norm(np.minimum(input, input2) - output) / N.get() print("Difference:", diff) print("==== Program end ====") exit(0 if diff <= 1e-5 else 1)
def make_fpga_sdfg():
    '''
    Build an SDFG that chains two nested vector-addition SDFGs in one state.

    The SDFG has three states connected in sequence:

    * ``copy_to_device`` copies ``x``, ``y`` and ``v`` into the transient
      FPGA-global arrays ``device_x``, ``device_y`` and ``device_v``.
    * ``I_do_not_want_to_be_fpga_kernel`` holds both nested SDFGs (each
      created by ``make_vec_add_sdfg()``). The first consumes ``device_x``
      and ``device_y`` and produces ``device_z``; the second consumes
      ``device_z`` and ``device_v`` and produces ``device_u``. The state's
      ``location["is_FPGA_kernel"]`` entry is set to ``False``.
    * ``copy_to_host`` copies ``device_z`` and ``device_u`` back to ``z``
      and ``u``.

    All arrays hold ``n / 4`` elements of a 4-wide float32 vector type.

    :return: The validated SDFG.
    '''

    n = dace.symbol("n")
    vecWidth = 4
    vecType = dace.vector(dace.float32, vecWidth)
    sdfg = dace.SDFG("nested_sdfg_kernels")

    ###########################################################################
    # Copy data to FPGA

    copy_in_state = sdfg.add_state("copy_to_device")

    # Host-side inputs
    sdfg.add_array("x", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("y", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("v", shape=[n / vecWidth], dtype=vecType)

    in_host_x = copy_in_state.add_read("x")
    in_host_y = copy_in_state.add_read("y")
    in_host_v = copy_in_state.add_read("v")

    # Device-side transients mirroring the host inputs
    sdfg.add_array("device_x",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("device_y",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("device_v",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    in_device_x = copy_in_state.add_write("device_x")
    in_device_y = copy_in_state.add_write("device_y")
    in_device_v = copy_in_state.add_write("device_v")

    # Whole-array copies; the memlets are expressed on the host containers
    copy_in_state.add_memlet_path(in_host_x, in_device_x, memlet=dace.Memlet(f"{in_host_x.data}[0:{n}/{vecWidth}]"))
    copy_in_state.add_memlet_path(in_host_y, in_device_y, memlet=dace.Memlet(f"{in_host_y.data}[0:{n}/{vecWidth}]"))
    copy_in_state.add_memlet_path(in_host_v, in_device_v, memlet=dace.Memlet(f"{in_host_v.data}[0:{n}/{vecWidth}]"))

    ###########################################################################
    # Copy data from FPGA
    sdfg.add_array("z", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("u", shape=[n / vecWidth], dtype=vecType)

    copy_out_state = sdfg.add_state("copy_to_host")

    sdfg.add_array("device_z",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("device_u",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    out_device_z = copy_out_state.add_read("device_z")
    out_host_z = copy_out_state.add_write("z")
    out_device_u = copy_out_state.add_read("device_u")
    out_host_u = copy_out_state.add_write("u")

    copy_out_state.add_memlet_path(out_device_z, out_host_z, memlet=dace.Memlet(f"{out_host_z.data}[0:{n}/{vecWidth}]"))
    copy_out_state.add_memlet_path(out_device_u, out_host_u, memlet=dace.Memlet(f"{out_host_u.data}[0:{n}/{vecWidth}]"))

    ###########################################################################
    # State that must not become an FPGA kernel

    non_fpga_state = sdfg.add_state("I_do_not_want_to_be_fpga_kernel")
    non_fpga_state.location["is_FPGA_kernel"] = False

    # Build the vec addition SDFG and nest it
    # (the names below are rebound to access nodes of this state)
    in_device_x = non_fpga_state.add_read("device_x")
    in_device_y = non_fpga_state.add_read("device_y")
    in_device_v = non_fpga_state.add_read("device_v")
    out_device_z = non_fpga_state.add_write("device_z")
    out_device_u = non_fpga_state.add_write("device_u")

    to_nest = make_vec_add_sdfg()
    # add nested sdfg with symbol mapping (inner "size" is bound to outer "n")
    nested_sdfg = non_fpga_state.add_nested_sdfg(to_nest, sdfg, {"_device_x", "_device_y"}, {"_device_z"},
                                                 {"size": "n"})

    non_fpga_state.add_memlet_path(in_device_x,
                                   nested_sdfg,
                                   dst_conn="_device_x",
                                   memlet=dace.Memlet(f"{in_device_x.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(in_device_y,
                                   nested_sdfg,
                                   dst_conn="_device_y",
                                   memlet=dace.Memlet(f"{in_device_y.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(nested_sdfg,
                                   out_device_z,
                                   src_conn="_device_z",
                                   memlet=dace.Memlet(f"{out_device_z.data}[0:{n}/{vecWidth}]"))

    # Build the second vec addition SDFG and nest it
    # (its first operand is device_z, the output of the first nested SDFG)

    to_nest = make_vec_add_sdfg()
    # add nested sdfg with symbol mapping
    nested_sdfg = non_fpga_state.add_nested_sdfg(to_nest, sdfg, {"_device_x", "_device_y"}, {"_device_z"},
                                                 {"size": "n"})

    non_fpga_state.add_memlet_path(out_device_z,
                                   nested_sdfg,
                                   dst_conn="_device_x",
                                   memlet=dace.Memlet(f"{out_device_z.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(in_device_v,
                                   nested_sdfg,
                                   dst_conn="_device_y",
                                   memlet=dace.Memlet(f"{in_device_v.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(nested_sdfg,
                                   out_device_u,
                                   src_conn="_device_z",
                                   memlet=dace.Memlet(f"{out_device_u.data}[0:{n}/{vecWidth}]"))

    ######################################
    # Interstate edges
    sdfg.add_edge(copy_in_state, non_fpga_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.add_edge(non_fpga_state, copy_out_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.fill_scope_connectors()
    sdfg.validate()

    return sdfg
def make_sdfg(name="fpga_stcl_test", dtype=dace.float32, veclen=8):
    """Build a vectorized 4-point stencil SDFG based on a pipeline and a shift register.

    The SDFG has three states: ``<name>_pre`` copies ``a`` to ``a_device``,
    ``<name>`` computes, and ``<name>_post`` copies ``b_device`` to ``b``.
    In the compute state, ``a_device`` is streamed through ``a_stream`` into a
    shift register of ``2 * M + veclen`` scalars. An unrolled map of
    ``veclen`` tasklets averages the north/west/east/south neighbours, with
    out-of-domain neighbours replaced by 1. The packed result is streamed
    through ``b_stream`` into ``b_device``.

    :param name: SDFG name, also used as prefix for state, pipeline and
                 tasklet names.
    :param dtype: Scalar element type.
    :param veclen: Vector width of the arrays and streams.
    :return: The constructed SDFG.
    """
    vtype = dace.vector(dtype, veclen)

    n = dace.symbol("N")
    m = dace.symbol("M")

    sdfg = dace.SDFG(name)
    pre_state = sdfg.add_state(name + "_pre")
    state = sdfg.add_state(name)
    post_state = sdfg.add_state(name + "_post")
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    _, desc_input_host = sdfg.add_array("a", (n, m / veclen), vtype)
    _, desc_output_host = sdfg.add_array("b", (n, m / veclen), vtype)

    # Device descriptors are copies of the host ones, moved to FPGA global
    # memory (banks 0 and 1) and marked transient
    desc_input_device = copy.copy(desc_input_host)
    desc_input_device.storage = dace.StorageType.FPGA_Global
    desc_input_device.location["bank"] = 0
    desc_input_device.transient = True
    desc_output_device = copy.copy(desc_output_host)
    desc_output_device.storage = dace.StorageType.FPGA_Global
    desc_output_device.location["bank"] = 1
    desc_output_device.transient = True
    sdfg.add_datadesc("a_device", desc_input_device)
    sdfg.add_datadesc("b_device", desc_output_device)

    # Host to device
    pre_read = pre_state.add_read("a")
    pre_write = pre_state.add_write("a_device")
    pre_state.add_memlet_path(pre_read, pre_write, memlet=dace.Memlet(f"a_device[0:N, 0:M/{veclen}]"))

    # Device to host
    post_read = post_state.add_read("b_device")
    post_write = post_state.add_write("b")
    post_state.add_memlet_path(post_read, post_write, memlet=dace.Memlet(f"b_device[0:N, 0:M/{veclen}]"))

    # Compute state
    read_memory = state.add_read("a_device")
    write_memory = state.add_write("b_device")

    # Memory streams
    sdfg.add_stream("a_stream", vtype, storage=dace.StorageType.FPGA_Local, transient=True)
    sdfg.add_stream("b_stream", vtype, storage=dace.StorageType.FPGA_Local, transient=True)
    produce_input_stream = state.add_write("a_stream")
    consume_input_stream = state.add_read("a_stream")
    produce_output_stream = state.add_write("b_stream")
    # This node only has outgoing edges, so it is a read access (it was
    # previously created with add_write, unlike the input-stream counterpart)
    consume_output_stream = state.add_read("b_stream")

    tasklet = state.add_tasklet(
        name, {"_north", "_west", "_east", "_south"}, {"result"}, """\
north = _north if i >= 1 else 1
west = _west if {W}*j + u >= 1 else 1
east = _east if {W}*j + u < M - 1 else 1
south = _south if i < N - 1 else 1
result = 0.25 * (north + west + east + south)""".format(W=veclen))

    entry, exit = state.add_pipeline(name, {
        "i": "0:N",
        "j": "0:M/{}".format(veclen),
    },
                                     schedule=dace.ScheduleType.FPGA_Device,
                                     init_size=m / veclen,
                                     init_overlap=False,
                                     drain_size=m / veclen,
                                     drain_overlap=True)

    # Unrolled map
    unroll_entry, unroll_exit = state.add_map(name + "_unroll", {"u": "0:{}".format(veclen)},
                                              schedule=dace.ScheduleType.FPGA_Device,
                                              unroll=True)

    # Container-to-container copies between arrays and streams
    state.add_memlet_path(read_memory,
                          produce_input_stream,
                          memlet=dace.Memlet(f"{read_memory.data}[0:N, 0:M/{veclen}]", other_subset="0"))
    # Mirror the input-side memlet: a single expression carrying container and
    # subset. The previous form passed the bare container name as the
    # expression and the subset expression in the positional ``data`` slot.
    state.add_memlet_path(consume_output_stream,
                          write_memory,
                          memlet=dace.Memlet(f"{write_memory.data}[0:N, 0:M/{veclen}]", other_subset="0"))

    # Container-to-container copy from vectorized stream to non-vectorized
    # buffer
    sdfg.add_array("input_buffer", (1, ), vtype, storage=dace.StorageType.FPGA_Local, transient=True)
    sdfg.add_array("shift_register", (2 * m + veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_ShiftRegister,
                   transient=True)
    sdfg.add_array("output_buffer", (veclen, ), dtype, storage=dace.StorageType.FPGA_Local, transient=True)
    sdfg.add_array("output_buffer_packed", (1, ), vtype, storage=dace.StorageType.FPGA_Local, transient=True)
    input_buffer = state.add_access("input_buffer")
    shift_register = state.add_access("shift_register")
    output_buffer = state.add_access("output_buffer")
    output_buffer_packed = state.add_access("output_buffer_packed")

    # Only read from the input stream while the pipeline is not draining
    read_tasklet = state.add_tasklet(name + "_conditional_read", {"_in"}, {"_out"},
                                     "if not {}:\n\t_out = _in".format(entry.pipeline.drain_condition()))

    # Input stream to buffer
    state.add_memlet_path(consume_input_stream,
                          entry,
                          read_tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet(f"{consume_input_stream.data}[0]", dynamic=True))
    state.add_memlet_path(read_tasklet, input_buffer, src_conn="_out", memlet=dace.Memlet(f"{input_buffer.data}[0]"))
    # The new vector enters at the tail of the shift register
    state.add_memlet_path(input_buffer,
                          shift_register,
                          memlet=dace.Memlet(f"{input_buffer.data}[0]", other_subset=f"2*M:(2*M + {veclen})"))

    # Stencil accesses
    state.add_memlet_path(shift_register,
                          unroll_entry,
                          tasklet,
                          dst_conn="_north",
                          memlet=dace.Memlet(f"{shift_register.data}[u]"))  # North
    state.add_memlet_path(shift_register,
                          unroll_entry,
                          tasklet,
                          dst_conn="_west",
                          memlet=dace.Memlet(f"{shift_register.data}[u + M - 1]"))  # West
    state.add_memlet_path(shift_register,
                          unroll_entry,
                          tasklet,
                          dst_conn="_east",
                          memlet=dace.Memlet(f"{shift_register.data}[u + M + 1]"))  # East
    state.add_memlet_path(shift_register,
                          unroll_entry,
                          tasklet,
                          dst_conn="_south",
                          memlet=dace.Memlet(f"{shift_register.data}[u + 2 * M]"))  # South

    # Tasklet to buffer
    state.add_memlet_path(tasklet,
                          unroll_exit,
                          output_buffer,
                          src_conn="result",
                          memlet=dace.Memlet(f"{output_buffer.data}[u]"))

    # Pack buffer
    state.add_memlet_path(output_buffer,
                          output_buffer_packed,
                          memlet=dace.Memlet(f"{output_buffer_packed.data}[0]", other_subset=f"0:{veclen}"))

    # Only write if not initializing
    write_tasklet = state.add_tasklet(name + "_conditional_write", {"_in"}, {"_out"},
                                      "if not {}:\n\t_out = _in".format(entry.pipeline.init_condition()))

    # Packed buffer to conditional write tasklet
    state.add_memlet_path(output_buffer_packed,
                          write_tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet(f"{output_buffer_packed.data}[0]"))

    # Conditional write tasklet to output stream
    state.add_memlet_path(write_tasklet,
                          exit,
                          produce_output_stream,
                          src_conn="_out",
                          memlet=dace.Memlet(f"{produce_output_stream.data}[0]", dynamic=True))

    return sdfg
def make_compute(sdfg, state, vec_width=1):
    """Build the systolic compute kernel of the GEMM inside ``state``.

    The ``A_pipe``, ``B_pipe`` and ``C_pipe`` streams are connected through
    three tasklets (``buffer_a``, ``multiply_add`` and ``write_c``), and all of
    it is enclosed in a map over ``p`` in ``0:P`` that is marked as unrolled.

    :param sdfg: SDFG that receives the ``A_reg`` scalar and the ``C_buffer``
                 array.
    :param state: State in which all nodes and memlets are created.
    :param vec_width: Width of the ``float32`` vector type used for
                      ``C_buffer``; it also divides the ``M`` map ranges.
    """
    fpga_schedule = dace.ScheduleType.FPGA_Device
    storage = dace.dtypes.StorageType
    vec_type = dace.vector(dace.float32, vec_width)

    # One read and one write access node per pipe
    a_pipe_rd = state.add_read("A_pipe")
    a_pipe_wr = state.add_write("A_pipe")
    b_pipe_rd = state.add_read("B_pipe")
    b_pipe_wr = state.add_write("B_pipe")
    c_pipe_rd = state.add_read("C_pipe")
    c_pipe_wr = state.add_write("C_pipe")

    # Loop nest
    n0_entry, n0_exit = state.add_map("n0", dict(n0="0:N/P"),
                                      schedule=fpga_schedule)
    k_entry, k_exit = state.add_map("k", dict(k="0:K"),
                                    schedule=fpga_schedule)
    a_entry, a_exit = state.add_map("buffer_A", dict(n1="0:P"),
                                    schedule=fpga_schedule)

    # B is vectorized, so both maps below iterate over M / vec_width
    m_range = f"0:M/{vec_width}"
    m_entry, m_exit = state.add_map("m", dict(m=m_range),
                                    schedule=fpga_schedule)
    c_entry, c_exit = state.add_map("write_C", dict(n1="0:P", m=m_range),
                                    schedule=fpga_schedule)

    # Register holding the element of A assigned to this processing element
    sdfg.add_scalar("A_reg",
                    dtype=dace.float32,
                    transient=True,
                    storage=storage.FPGA_Registers)
    a_reg_node = state.add_write("A_reg")

    # Partial results of C are kept in a vectorized buffer
    sdfg.add_array("C_buffer", [M / vec_width],
                   dtype=vec_type,
                   transient=True,
                   storage=storage.FPGA_Local)
    c_buf_rd = state.add_read("C_buffer")
    c_buf_wr = state.add_write("C_buffer")

    # Each PE reads A, keeps the element meant for it and forwards the rest
    buffer_a = state.add_tasklet(
        "buffer_a", {"a_in"}, {"a_reg", "a_out"}, """\
if n1 == P - p - 1:
    a_reg = a_in
if p < P - 1:
    a_out = a_in""")
    state.add_memlet_path(a_pipe_rd,
                          n0_entry,
                          k_entry,
                          a_entry,
                          buffer_a,
                          dst_conn="a_in",
                          memlet=dace.Memlet("A_pipe[p]", dynamic=False))
    state.add_memlet_path(buffer_a,
                          a_exit,
                          a_reg_node,
                          src_conn="a_reg",
                          memlet=dace.Memlet("A_reg[0]", dynamic=True))
    state.add_memlet_path(buffer_a,
                          a_exit,
                          k_exit,
                          n0_exit,
                          a_pipe_wr,
                          src_conn="a_out",
                          memlet=dace.Memlet("A_pipe[p + 1]", dynamic=True))

    # Multiply-accumulate into C_buffer and forward B to the next PE
    multiply_add = state.add_tasklet(
        "multiply_add", {"a_in", "b_in", "c_in"}, {"b_out", "c_out"}, """\
c_prev = c_in
if k == 0:
    c_prev = 0
c_out = c_prev + a_in * b_in
if p < P - 1:
    b_out = b_in""")
    state.add_memlet_path(a_reg_node,
                          m_entry,
                          multiply_add,
                          dst_conn="a_in",
                          memlet=dace.Memlet("A_reg[0]"))
    state.add_memlet_path(b_pipe_rd,
                          n0_entry,
                          k_entry,
                          m_entry,
                          multiply_add,
                          dst_conn="b_in",
                          memlet=dace.Memlet("B_pipe[p]", dynamic=False))
    state.add_memlet_path(multiply_add,
                          m_exit,
                          k_exit,
                          n0_exit,
                          b_pipe_wr,
                          src_conn="b_out",
                          memlet=dace.Memlet("B_pipe[p + 1]", dynamic=True))
    state.add_memlet_path(c_buf_rd,
                          k_entry,
                          m_entry,
                          multiply_add,
                          dst_conn="c_in",
                          memlet=dace.Memlet("C_buffer[m]"))
    # Empty memlets place the C_buffer nodes inside the n0 scope
    state.add_memlet_path(n0_entry, c_buf_rd, memlet=dace.Memlet())
    state.add_memlet_path(multiply_add,
                          m_exit,
                          k_exit,
                          c_buf_wr,
                          src_conn="c_out",
                          memlet=dace.Memlet("C_buffer[m]"))
    state.add_memlet_path(c_buf_wr, n0_exit, memlet=dace.Memlet())

    # Write out either the own buffer or the value forwarded through C_pipe
    write_c = state.add_tasklet(
        "write_c", {"buffer_in", "forward_in"}, {"c_out"}, """\
if n1 <= p:
    c_out = forward_in if p > 0 and n1 > 0 else buffer_in""")
    state.add_memlet_path(c_buf_wr,
                          c_entry,
                          write_c,
                          dst_conn="buffer_in",
                          memlet=dace.Memlet("C_buffer[m]", dynamic=True))
    state.add_memlet_path(c_pipe_rd,
                          n0_entry,
                          c_entry,
                          write_c,
                          dst_conn="forward_in",
                          memlet=dace.Memlet("C_pipe[p-1]", dynamic=True))
    state.add_memlet_path(write_c,
                          c_exit,
                          n0_exit,
                          c_pipe_wr,
                          src_conn="c_out",
                          memlet=dace.Memlet("C_pipe[p]", dynamic=True))

    # Unrolled map over the processing elements
    pe_entry, pe_exit = state.add_map("unroll_compute", dict(p="0:P"),
                                      schedule=fpga_schedule,
                                      unroll=True)

    # Empty memlets bring the pipe access nodes into the unrolled scope
    for pipe_node in (a_pipe_rd, b_pipe_rd, c_pipe_rd):
        state.add_memlet_path(pe_entry, pipe_node, memlet=dace.Memlet())
    for pipe_node in (a_pipe_wr, b_pipe_wr, c_pipe_wr):
        state.add_memlet_path(pipe_node, pe_exit, memlet=dace.Memlet())
def test_tasklet_map():
    """
    Check that a SystemVerilog tasklet nested in a map over M rows adds two
    arrays of int32 vectors (N vectors per row, W lanes per vector).
    """
    # Concrete sizes and the symbols bound to them
    n, m, w = 512, 8, 4
    N = dace.symbol('N')
    M = dace.symbol('M')
    W = dace.symbol('W')
    N.set(n)
    M.set(m)
    W.set(w)

    sdfg = dace.SDFG('rtl_tasklet_map')
    state = sdfg.add_state()

    # Two inputs and one output, all M x N arrays of W-wide int32 vectors
    for array_name in ('A', 'B', 'C'):
        sdfg.add_array(array_name, [M, N],
                       dtype=dace.vector(dace.int32, W.get()))

    mentry, mexit = state.add_map('compute_map', dict(k='0:M'))

    rtl_code = '''
        reg [W-1:0][31:0] a_data;
        reg a_valid;
        reg [W-1:0][31:0] b_data;
        reg b_valid;

        // Read A
        always@(posedge ap_aclk) begin
            if (ap_areset) begin
                s_axis_a_tready <= 0;
                a_valid <= 0;
                a_data <= 0;
            end else begin
                if (s_axis_a_tready && s_axis_a_tvalid) begin
                    a_valid <= 1;
                    a_data <= s_axis_a_tdata;
                    s_axis_a_tready <= 0;
                end else if (m_axis_c_tvalid && m_axis_c_tready) begin
                    a_valid <= 0;
                    s_axis_a_tready <= 1;
                end else begin
                    s_axis_a_tready <= ~a_valid;
                end
            end
        end

        // Read B
        always@(posedge ap_aclk) begin
            if (ap_areset) begin
                s_axis_b_tready <= 0;
                b_valid <= 0;
                b_data <= 0;
            end else begin
                if (s_axis_b_tready && s_axis_b_tvalid) begin
                    b_valid <= 1;
                    b_data <= s_axis_b_tdata;
                    s_axis_b_tready <= 0;
                end else if (m_axis_c_tvalid && m_axis_c_tready) begin
                    b_valid <= 0;
                    b_data <= 0;
                    s_axis_b_tready <= 1;
                end else begin
                    s_axis_b_tready <= ~b_valid;
                end
            end
        end

        // Compute and write C
        always@(posedge ap_aclk) begin
            if (ap_areset) begin
                m_axis_c_tvalid <= 0;
                m_axis_c_tdata <= 0;
            end else begin
                if (m_axis_c_tvalid && m_axis_c_tready) begin
                    m_axis_c_tvalid <= 0;
                end else if (a_valid && b_valid) begin
                    m_axis_c_tvalid <= 1;
                    m_axis_c_tdata <= a_data + b_data;
                end
            end
        end'''
    tasklet = state.add_tasklet(name='rtl_tasklet1',
                                inputs={'a', 'b'},
                                outputs={'c'},
                                code=rtl_code,
                                language=dace.Language.SystemVerilog)

    # Each map iteration k hands one full row of A, B and C to the tasklet
    A = state.add_read('A')
    B = state.add_read('B')
    C = state.add_write('C')
    state.add_memlet_path(A, mentry, tasklet,
                          dst_conn='a',
                          memlet=dace.Memlet('A[k,0:N]'))
    state.add_memlet_path(B, mentry, tasklet,
                          dst_conn='b',
                          memlet=dace.Memlet('B[k,0:N]'))
    state.add_memlet_path(tasklet, mexit, C,
                          src_conn='c',
                          memlet=dace.Memlet('C[k,0:N]'))

    sdfg.specialize(dict(M=M, N=N, W=W))
    sdfg.validate()

    # Random inputs and a zeroed output buffer
    shape = (m, n, w)
    a = np.random.randint(0, 100, m * n * w).reshape(shape).astype(np.int32)
    b = np.random.randint(0, 100, m * n * w).reshape(shape).astype(np.int32)
    c = np.zeros(shape).astype(np.int32)

    sdfg(A=a, B=b, C=c)

    # The tasklet must have produced the element-wise sum
    assert (c == a + b).all()
def make_sdfg(tasklet_code=None,
              name="veclen_copy_conversion",
              dtype=dace.float32,
              veclen=16):
    """Build an SDFG that streams vectors through a scalar buffer and back.

    The SDFG has three states: a host-to-device copy of ``a``, a compute
    state, and a device-to-host copy of ``b``. In the compute state every
    vector read from ``a_stream`` is copied into the ``veclen``-element buffer
    ``a_buffer``, a tasklet is applied per element inside an unrolled map, and
    the resulting ``b_buffer`` is written to ``b_stream`` as one vector.

    :param tasklet_code: Code of the per-element tasklet with connectors
                         ``_in`` and ``_out``. Defaults to ``"_out = _in"``.
    :param name: Name of the SDFG; also used as prefix for states and maps.
    :param dtype: Base element type of both the vector type and the scalar
                  buffers.
    :param veclen: Number of elements per vector.
    :return: The constructed SDFG.
    """
    # Derive the vector type from ``dtype`` (not a fixed float32) so that the
    # vectorized containers agree with the scalar buffers declared below.
    vtype = dace.vector(dtype, veclen)

    if tasklet_code is None:
        tasklet_code = "_out = _in"

    n = dace.symbol("N")

    sdfg = dace.SDFG(name)

    pre_state = sdfg.add_state(name + "_pre")
    state = sdfg.add_state(name)
    post_state = sdfg.add_state(name + "_post")
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    _, desc_input_host = sdfg.add_array("a", (n // veclen, ), vtype)
    _, desc_output_host = sdfg.add_array("b", (n // veclen, ), vtype)

    # Device-side containers are transient copies of the host descriptors,
    # placed in FPGA global memory on banks 0 and 1
    desc_input_device = copy.copy(desc_input_host)
    desc_input_device.storage = dace.StorageType.FPGA_Global
    desc_input_device.location["bank"] = 0
    desc_input_device.transient = True
    desc_output_device = copy.copy(desc_output_host)
    desc_output_device.storage = dace.StorageType.FPGA_Global
    desc_output_device.location["bank"] = 1
    desc_output_device.transient = True
    sdfg.add_datadesc("a_device", desc_input_device)
    sdfg.add_datadesc("b_device", desc_output_device)

    # Host to device
    pre_read = pre_state.add_read("a")
    pre_write = pre_state.add_write("a_device")
    pre_state.add_memlet_path(pre_read,
                              pre_write,
                              memlet=dace.Memlet(pre_write.data, None))

    # Device to host
    post_read = post_state.add_read("b_device")
    post_write = post_state.add_write("b")
    post_state.add_memlet_path(post_read,
                               post_write,
                               memlet=dace.Memlet(post_write.data, None))

    # Compute state
    read_memory = state.add_read("a_device")
    write_memory = state.add_write("b_device")

    # Memory streams
    sdfg.add_stream("a_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    sdfg.add_stream("b_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    produce_input_stream = state.add_write("a_stream")
    consume_input_stream = state.add_read("a_stream")
    produce_output_stream = state.add_write("b_stream")
    # This node is the source of the copy into b_device, i.e. it is read
    consume_output_stream = state.add_read("b_stream")

    tasklet = state.add_tasklet(name, {"_in"}, {"_out"}, tasklet_code)

    # Iterative map
    entry, exit = state.add_map(name, {
        "i": f"0:N//{veclen}",
    },
                                schedule=dace.ScheduleType.FPGA_Device)

    # Unrolled map
    unroll_entry, unroll_exit = state.add_map(
        name + "_unroll", {"u": f"0:{veclen}"},
        schedule=dace.ScheduleType.FPGA_Device,
        unroll=True)

    # Container-to-container copies between arrays and streams
    state.add_memlet_path(read_memory,
                          produce_input_stream,
                          memlet=dace.Memlet(read_memory.data))
    state.add_memlet_path(consume_output_stream,
                          write_memory,
                          memlet=dace.Memlet(write_memory.data))

    # Container-to-container copy from vectorized stream to non-vectorized
    # buffer
    sdfg.add_array("a_buffer", (veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    sdfg.add_array("b_buffer", (veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    a_buffer = state.add_access("a_buffer")
    b_buffer = state.add_access("b_buffer")

    # Input stream to buffer
    state.add_memlet_path(consume_input_stream,
                          entry,
                          a_buffer,
                          memlet=dace.Memlet.simple(
                              consume_input_stream.data,
                              "0",
                              other_subset_str=f"0:{veclen}"))
    # Buffer to tasklet
    state.add_memlet_path(a_buffer,
                          unroll_entry,
                          tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet.simple(a_buffer.data,
                                                    "u",
                                                    num_accesses=1))

    # Tasklet to buffer
    state.add_memlet_path(tasklet,
                          unroll_exit,
                          b_buffer,
                          src_conn="_out",
                          memlet=dace.Memlet.simple(b_buffer.data,
                                                    "u",
                                                    num_accesses=1))

    # Buffer to output stream
    state.add_memlet_path(b_buffer,
                          exit,
                          produce_output_stream,
                          memlet=dace.Memlet.simple(
                              produce_output_stream.data,
                              "0",
                              other_subset_str=f"0:{veclen}",
                              num_accesses=1))

    return sdfg
from __future__ import print_function import argparse import dace import math import numpy as np from dace.dtypes import StorageType, Language from dace.sdfg import SDFG from dace.memlet import Memlet from dace.subsets import Indices N = dace.symbol("N", positive=True) W = dace.symbol("W", positive=True) dtype = dace.float32 vtype = dace.vector(dtype, W) buffer_size = 2048 # Of internal FIFOs def make_copy_to_device(sdfg): pre_state = sdfg.add_state("copy_to_device") A_host = pre_state.add_array("A", [N / W], dtype=vtype) A_device = pre_state.add_array("A_device", [N / W], dtype=vtype, transient=True, storage=StorageType.FPGA_Global) pre_state.add_edge(A_host, None, A_device, None,
def create_gemm_sdfg(sdfg_name,
                     alpha,
                     beta,
                     A,
                     B,
                     C,
                     dtype,
                     transA=False,
                     transB=False,
                     vec_width=1,
                     expansion_args=None):
    '''
    Build an SDFG that performs a GEMM on FPGA using the ``FPGA1DSystolic``
    implementation of the BLAS Gemm library node.

    The SDFG consists of three states: copy of A, B and C to the device, the
    GEMM itself, and copy of C back to the host. Container A is not
    vectorized; containers B and C are declared with a vector type of width
    ``vec_width`` and ``M / vec_width`` columns.

    :param sdfg_name: Name of the created SDFG.
    :param alpha: Passed to the Gemm node as ``alpha``.
    :param beta: Passed to the Gemm node as ``beta``.
    :param A: Input whose ``shape`` gives N and K.
    :param B: Input whose ``shape[1]`` gives M.
    :param C: Kept in the signature for callers; the shape of the C
              containers is derived from A and B.
    :param dtype: Base element type of all containers.
    :param transA: Passed to the Gemm node as ``transA``.
    :param transB: Passed to the Gemm node as ``transB``.
    :param vec_width: Vector width used for B and C.
    :param expansion_args: If not None, the Gemm node is expanded with these
                           keyword arguments before returning.
    :return: The constructed (and validated) SDFG.
    '''
    sdfg = dace.SDFG(sdfg_name)

    ###########################################################################
    # Copy data to FPGA

    copy_in_state = sdfg.add_state("copy_to_device")

    A_shape = A.shape
    B_shape = B.shape

    N = A_shape[0]
    K = A_shape[1]
    M = B_shape[1]

    vec_type = dace.vector(dtype, vec_width)

    # Create data containers
    sdfg.add_array('A', A_shape, dtype)
    sdfg.add_array("A_device",
                   shape=A_shape,
                   dtype=dtype,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("B", [K, M / vec_width], dtype=vec_type)
    sdfg.add_array("B_device", [K, M / vec_width],
                   dtype=vec_type,
                   transient=True,
                   storage=dace.dtypes.StorageType.FPGA_Global)
    sdfg.add_array("C", [N, M / vec_width], dtype=vec_type)
    sdfg.add_array("C_device", [N, M / vec_width],
                   dtype=vec_type,
                   transient=True,
                   storage=dace.dtypes.StorageType.FPGA_Global)

    # Copy A
    in_host_A = copy_in_state.add_read("A")
    in_device_A = copy_in_state.add_write("A_device")
    copy_in_state.add_memlet_path(in_host_A,
                                  in_device_A,
                                  memlet=dace.Memlet(f"A[0:{N}, 0:{K}]"))

    # Copy B
    in_host_B = copy_in_state.add_read("B")
    in_device_B = copy_in_state.add_write("B_device")
    copy_in_state.add_memlet_path(
        in_host_B,
        in_device_B,
        memlet=dace.Memlet(f"B[0:{K}, 0:{M}/{vec_width}]"))

    # Copy C
    in_host_C = copy_in_state.add_read("C")
    in_device_C = copy_in_state.add_write("C_device")
    copy_in_state.add_memlet_path(
        in_host_C,
        in_device_C,
        memlet=dace.Memlet(f"C[0:{N}, 0:{M}/{vec_width}]"))

    ###########################################################################
    # Copy data from FPGA

    copy_out_state = sdfg.add_state("copy_from_device")
    out_device = copy_out_state.add_read("C_device")
    out_host = copy_out_state.add_write("C")
    copy_out_state.add_memlet_path(
        out_device,
        out_host,
        memlet=dace.Memlet(f"C[0:{N}, 0:{M}//{vec_width}]"))

    ########################################################################
    # FPGA State

    fpga_state = sdfg.add_state("fpga_state")
    in_A = fpga_state.add_read("A_device")
    in_B = fpga_state.add_read("B_device")
    in_C = fpga_state.add_read("C_device")
    # The Gemm result is written into this node, so declare it as a write
    out_C = fpga_state.add_write("C_device")

    gemm_node = blas.Gemm("gemm",
                          transA=transA,
                          transB=transB,
                          alpha=alpha,
                          beta=beta)
    gemm_node.implementation = "FPGA1DSystolic"

    fpga_state.add_memlet_path(in_A,
                               gemm_node,
                               dst_conn="_a",
                               memlet=dace.Memlet(f"A_device[0:{N}, 0:{K}]"))
    fpga_state.add_memlet_path(
        in_B,
        gemm_node,
        dst_conn="_b",
        memlet=dace.Memlet(f"B_device[0:{K}, 0:{M}/{vec_width}]"))
    fpga_state.add_memlet_path(
        in_C,
        gemm_node,
        dst_conn="_cin",
        memlet=dace.Memlet(f"C_device[0:{N}, 0:{M}/{vec_width}]"))
    fpga_state.add_memlet_path(
        gemm_node,
        out_C,
        src_conn="_c",
        memlet=dace.Memlet(f"C_device[0:{N}, 0:{M}/{vec_width}]"))

    ######################################
    # Interstate edges
    sdfg.add_edge(copy_in_state, fpga_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.add_edge(fpga_state, copy_out_state, dace.sdfg.sdfg.InterstateEdge())

    sdfg.validate()

    # Optionally expand the library node right away with caller-chosen options
    if expansion_args is not None:
        gemm_node.expand(sdfg, fpga_state, **expansion_args)

    return sdfg
# add symbol N = dace.symbol('N') # add sdfg sdfg = dace.SDFG('fladd') # add state state = sdfg.add_state('device_state') # add parameter veclen = 1 sdfg.add_constant('VECLEN', veclen) # add arrays sdfg.add_array('A', [N // veclen], dtype=dace.vector(dace.float32, veclen), storage=dace.StorageType.CPU_Heap) sdfg.add_array('B', [N // veclen], dtype=dace.vector(dace.float32, veclen), storage=dace.StorageType.CPU_Heap) sdfg.add_array('C', [N // veclen], dtype=dace.vector(dace.float32, veclen), storage=dace.StorageType.CPU_Heap) sdfg.add_array('fpga_A', [N // veclen], dtype=dace.vector(dace.float32, veclen), transient=True, storage=dace.StorageType.FPGA_Global) sdfg.add_array('fpga_B', [N // veclen], dtype=dace.vector(dace.float32, veclen), transient=True, storage=dace.StorageType.FPGA_Global)
def test_tasklet_vector_add():
    """
    Check vector support of RTL tasklets: a SystemVerilog tasklet adds 42 to
    every lane of a single W-wide int32 vector.
    """
    W = dace.symbol('W')

    sdfg = dace.SDFG('rtl_tasklet_vector_add')

    # W is fixed at compile time
    sdfg.specialize(dict(W=4))
    veclen = dace.symbolic.evaluate(W, sdfg.constants)

    state = sdfg.add_state()

    # One-element arrays whose element is a vector of veclen int32 values
    sdfg.add_array('A', [1], dtype=dace.vector(dace.int32, veclen))
    sdfg.add_array('B', [1], dtype=dace.vector(dace.int32, veclen))

    # SystemVerilog tasklet: read one vector from a, emit a + 42 on b
    rtl_code = '''
        always@(posedge ap_aclk) begin
            if (ap_areset) begin
                s_axis_a_tready <= 1;
                m_axis_b_tvalid <= 0;
                m_axis_b_tdata <= 0;
            end else if (s_axis_a_tvalid && s_axis_a_tready) begin
                s_axis_a_tready <= 0;
                m_axis_b_tvalid <= 1;
                for (int i = 0; i < W; i++) begin
                    m_axis_b_tdata[i] <= s_axis_a_tdata[i] + 42;
                end
            end else if (m_axis_b_tvalid && m_axis_b_tready) begin
                s_axis_a_tready <= 1;
                m_axis_b_tvalid <= 0;
                m_axis_b_tdata <= 0;
            end
        end
        '''
    tasklet = state.add_tasklet(name='rtl_tasklet',
                                inputs={'a'},
                                outputs={'b'},
                                code=rtl_code,
                                language=dace.Language.SystemVerilog)

    # Wire the input and output arrays to the tasklet
    A = state.add_read('A')
    B = state.add_write('B')
    state.add_edge(A, None, tasklet, 'a', dace.Memlet('A[0]'))
    state.add_edge(tasklet, 'b', B, None, dace.Memlet('B[0]'))

    sdfg.validate()

    # Random input vector and zeroed output vector
    a = np.random.randint(0, 100, (veclen, )).astype(np.int32)
    b = np.zeros((veclen, )).astype(np.int32)

    sdfg(A=a, B=b)

    # Show both vectors, then verify the result
    print(a)
    print(b)
    assert (b == a + 42).all()