Example #1
0
def test_mlir_tasklet_explicit_vec():
    """Add two ``vector<4xi32>`` operands in an MLIR tasklet and check the sum.

    The SDFG declares three arrays ``A``, ``B`` and ``C`` of type
    ``dace.vector(dace.int32, 4)``; only element 0 of each array is connected
    to the tasklet. The tasklet's MLIR entry function returns ``%b + %a``,
    which is compared against the NumPy sum of the inputs.
    """
    sdfg = dace.SDFG('mlir_tasklet_explicit_vec')
    state = sdfg.add_state()
    sdfg.add_array('A', [4], dace.vector(dace.int32, 4))
    sdfg.add_array('B', [4], dace.vector(dace.int32, 4))
    sdfg.add_array('C', [4], dace.vector(dace.int32, 4))

    tasklet = state.add_tasklet(name='mlir_tasklet',
                                inputs={'a', 'b'},
                                outputs={'c'},
                                code='''
                                    module  {
                                        func @mlir_entry(%a: vector<4xi32>, %b: vector<4xi32>) -> vector<4xi32> {
                                            %0 = addi %b, %a  : vector<4xi32>
                                            return %0 : vector<4xi32>
                                        }
                                    }
                                    ''',
                                language=dace.Language.MLIR)

    A = state.add_read('A')
    B = state.add_read('B')
    C = state.add_write('C')

    state.add_edge(A, None, tasklet, 'a', dace.Memlet('A[0]'))
    state.add_edge(B, None, tasklet, 'b', dace.Memlet('B[0]'))
    state.add_edge(tasklet, 'c', C, None, dace.Memlet('C[0]'))
    sdfg.validate()

    # np.random.rand() yields floats in [0, 1), which all truncate to 0 when
    # cast to int32. Draw actual integers so that the addition is exercised
    # with non-trivial operands.
    a = np.random.randint(0, 100, 4).astype(np.int32)
    b = np.random.randint(0, 100, 4).astype(np.int32)
    c = np.zeros(4, dtype=np.int32)

    sdfg(A=a, B=b, C=c)
    assert (c == a + b).all()
def make_fpga_state(sdfg, vec_width=1):
    """Add the "gemm" state and the three transient streams it uses.

    ``A_pipe`` carries ``dace.float32`` values and is given a buffer size of
    ``"P"``; ``B_pipe`` and ``C_pipe`` carry ``dace.vector(dace.float32,
    vec_width)``. All three streams have shape ``(P + 1, )`` and FPGA-local
    storage. The state is then populated by ``make_read_A``, ``make_read_B``,
    ``make_compute`` and ``make_write_C``.

    :param sdfg: SDFG to which the state and the streams are added.
    :param vec_width: Vector width of the element type of ``B_pipe``/``C_pipe``.
    :return: The newly created state.
    """
    vec_type = dace.vector(dace.float32, vec_width)

    state = sdfg.add_state("gemm")

    pipe_shape = (P + 1, )
    local_storage = dace.dtypes.StorageType.FPGA_Local

    # Only the A stream is created with an explicit buffer size.
    sdfg.add_stream("A_pipe",
                    dace.float32,
                    transient=True,
                    shape=pipe_shape,
                    storage=local_storage,
                    buffer_size="P")
    for pipe_name in ("B_pipe", "C_pipe"):
        sdfg.add_stream(pipe_name,
                        vec_type,
                        transient=True,
                        shape=pipe_shape,
                        storage=local_storage)

    make_read_A(state)
    make_read_B(state, sdfg, vec_width)
    make_compute(sdfg, state, vec_width)
    make_write_C(state, sdfg, vec_width)

    return state
def make_vec_mul_sdfg(dtype=dace.float32):
    """Build and validate the "vec_mul" SDFG (element-wise vector product).

    Three FPGA-global arrays ``_device_x``, ``_device_y`` and ``_device_z``
    of ``size / 4`` elements of type ``dace.vector(dtype, 4)`` are declared.
    A map over ``i`` scheduled as ``FPGA_Device`` feeds element ``i`` of the
    two inputs into a tasklet computing ``z_con = x_con * y_con``.

    :param dtype: Base element type of the vectors.
    :return: The validated SDFG.
    """
    vec_width = 4
    size = dace.symbol("size")
    sdfg = dace.SDFG("vec_mul")
    vec_type = dace.vector(dtype, vec_width)
    state = sdfg.add_state("vec_mul_state")

    for array_name in ('_device_x', '_device_y', '_device_z'):
        sdfg.add_array(array_name,
                       shape=[size / vec_width],
                       dtype=vec_type,
                       storage=dace.dtypes.StorageType.FPGA_Global)

    x_node = state.add_read("_device_x")
    y_node = state.add_read("_device_y")
    z_node = state.add_write("_device_z")

    # Compute: one map iteration per vector element.
    map_entry, map_exit = state.add_map(
        'vecMul_map', {'i': f'0:{size}/{vec_width}'},
        schedule=dace.dtypes.ScheduleType.FPGA_Device)

    tasklet = state.add_tasklet('vecMul_task', ['x_con', 'y_con'], ['z_con'],
                                'z_con = x_con * y_con')

    for source, connector in ((x_node, 'x_con'), (y_node, 'y_con')):
        state.add_memlet_path(source,
                              map_entry,
                              tasklet,
                              dst_conn=connector,
                              memlet=dace.Memlet(f"{source.data}[i]"))

    state.add_memlet_path(tasklet,
                          map_exit,
                          z_node,
                          src_conn='z_con',
                          memlet=dace.Memlet(f"{z_node.data}[i]"))

    # Fill in the scope connectors and validate before handing the SDFG out.
    sdfg.fill_scope_connectors()
    sdfg.validate()
    return sdfg
Example #4
0
def make_sdfg_1d(implementation: str, vector_length: int):
    """Build an SDFG holding a single 1D ``Stencil`` node from ``a`` to ``res``.

    Both arrays have ``SIZE / vector_length`` elements. Their element type is
    ``dace.vector(dace.typeclass(DTYPE), vector_length)`` when
    ``vector_length > 1`` and plain ``DTYPE`` otherwise.

    :param implementation: Value assigned to the stencil node's
                           ``implementation`` attribute.
    :param vector_length: Vector width used for the array element type.
    :return: The constructed SDFG.
    """
    if vector_length > 1:
        vtype = dace.vector(dace.typeclass(DTYPE), vector_length)
    else:
        vtype = DTYPE

    sdfg = dace.SDFG(f"stencil_node_test_1d_w{vector_length}")
    array_shape = (SIZE / vector_length, )
    _, a_desc = sdfg.add_array("a", array_shape, dtype=vtype)
    _, res_desc = sdfg.add_array("res", array_shape, dtype=vtype)

    state = sdfg.add_state("stencil_node_test_1d")
    a_node = state.add_read("a")
    res_node = state.add_write("res")

    # Stencil body: sums a[0], a[1], a[2], scales by 0.3333, writes res[1].
    stencil_code = ("tmp0 = (a[0] + a[1])\n"
                    "tmp1 = (tmp0 + a[2])\n"
                    "res[1] = (dace.float32(0.3333) * tmp1)")
    stencil = Stencil("stencil_test",
                      stencil_code,
                      inputs={"a"},
                      outputs={"res"})
    stencil.implementation = implementation
    state.add_node(stencil)

    state.add_edge(a_node, None, stencil, "a",
                   dace.Memlet.from_array("a", a_desc))
    state.add_edge(stencil, "res", res_node, None,
                   dace.Memlet.from_array("res", res_desc))

    return sdfg
Example #5
0
def pure_graph(implementation, dtype, veclen):
    """Build an SDFG containing a single ``blas.Dot`` library node.

    The inputs ``x`` and ``y`` hold ``n / veclen`` elements of type
    ``dace.vector(dtype, veclen)``; the result ``r`` is a one-element array
    of ``dtype``.

    :param implementation: Value assigned to the Dot node's
                           ``implementation`` attribute.
    :param dtype: Base data type (its ``ctype`` is used in the SDFG name).
    :param veclen: Vector width of the input arrays.
    :return: The constructed SDFG.
    """
    sdfg_name = f"dot_{implementation}_{dtype.ctype}_w{veclen}"
    sdfg = dace.SDFG(sdfg_name)

    state = sdfg.add_state("dot")

    n = dace.symbol("n")

    vtype = dace.vector(dtype, veclen)

    sdfg.add_array("x", [n / veclen], vtype)
    sdfg.add_array("y", [n / veclen], vtype)
    sdfg.add_array("r", [1], dtype)

    x = state.add_read("x")
    y = state.add_read("y")
    result = state.add_write("r")

    dot_node = blas.Dot("dot")
    dot_node.implementation = implementation
    dot_node.n = n

    state.add_memlet_path(x,
                          dot_node,
                          dst_conn="_x",
                          memlet=Memlet(f"x[0:{n}/{veclen}]"))
    state.add_memlet_path(y,
                          dot_node,
                          dst_conn="_y",
                          memlet=Memlet(f"y[0:{n}/{veclen}]"))
    state.add_memlet_path(dot_node,
                          result,
                          src_conn="_result",
                          memlet=Memlet("r[0]"))

    return sdfg
Example #6
0
def test():
    """Run a manually vectorized element-wise ``min`` SDFG and check it.

    The map iterates over ``0:N:2`` and the tasklet connectors are
    overridden to ``dp.vector(dp.float32, 2)``. The output is compared
    against ``np.minimum`` of the two inputs.
    """
    print('Dynamic SDFG test with vectorization and min')
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)

    first = np.random.rand(N.get()).astype(np.float32)
    second = np.random.rand(N.get()).astype(np.float32)
    result = dp.ndarray([N], dp.float32)
    result[:] = dp.float32(0)

    # Construct SDFG
    sdfg = SDFG('myvmin')
    for array_name in ('A', 'B', 'C'):
        sdfg.add_array(array_name, [N], dp.float32)
    state = sdfg.add_state()
    A = state.add_access('A')
    B = state.add_access('B')
    C = state.add_access('C')

    tasklet, map_entry, map_exit = state.add_mapped_tasklet(
        'mytasklet',
        {'i': '0:N:2'},
        {'a': Memlet.simple(A, 'i'), 'b': Memlet.simple(B, 'i')},
        'c = min(a, b)',
        {'c': Memlet.simple(C, 'i')})

    # Manually vectorize the tasklet: every connector becomes a 2-wide vector
    vec2 = dp.vector(dp.float32, 2)
    tasklet.in_connectors['a'] = vec2
    tasklet.in_connectors['b'] = dp.vector(dp.float32, 2)
    tasklet.out_connectors['c'] = dp.vector(dp.float32, 2)

    # Add outer edges between the access nodes and the map scope
    for source in (A, B):
        state.add_edge(source, None, map_entry, None,
                       Memlet.simple(source, '0:N'))
    state.add_edge(map_exit, None, C, None, Memlet.simple(C, '0:N'))

    sdfg(A=first, B=second, C=result, N=N)

    expected = np.minimum(first, second)
    diff = np.linalg.norm(expected - result) / N.get()
    print("Difference:", diff)
    print("==== Program end ====")
    assert diff <= 1e-5
Example #7
0
def get_dace_type(node: Union[mlir.astnodes.IntegerType,
                              mlir.astnodes.FloatType,
                              mlir.astnodes.VectorType]):
    """Translate an MLIR type AST node into the corresponding DaCe type.

    Integer and float nodes are looked up in ``TYPE_DICT`` by their
    ``dump()`` string. Vector nodes are converted recursively: the element
    type is translated and combined with the first dimension into a
    ``dace.vector``.

    :param node: MLIR type node to translate.
    :return: The DaCe type, or ``None`` if ``node`` is none of the three
             handled node classes.
    """
    scalar_nodes = (mlir.astnodes.IntegerType, mlir.astnodes.FloatType)
    if isinstance(node, scalar_nodes):
        return TYPE_DICT[node.dump()]

    if not isinstance(node, mlir.astnodes.VectorType):
        return None

    # Only the first dimension of the vector type is used.
    length = node.dimensions[0]
    element_type = get_dace_type(node.element_type)
    return dace.vector(element_type, length)
Example #8
0
def pure_graph(dtype,
               transposed,
               expansion,
               veclen,
               alpha,
               beta,
               expansion_args=None):
    """Build an SDFG containing a single ``blas.Gemv`` library node.

    ``A`` is an ``m x (n / veclen)`` array of ``dace.vector(dtype, veclen)``.
    Without transposition ``x`` is vectorized with ``n / veclen`` elements
    and ``y`` is a plain array of ``m`` elements; with transposition the
    sizes and element types of ``x`` and ``y`` are swapped.

    :param dtype: Base data type.
    :param transposed: Passed to the node as ``transA``; also selects the
                       layout of ``x`` and ``y``.
    :param expansion: Value assigned to the node's ``implementation``.
    :param veclen: Vector width.
    :param alpha: Forwarded to ``blas.Gemv``.
    :param beta: Forwarded to ``blas.Gemv``.
    :param expansion_args: If not ``None``, the node is expanded right away
                           with these keyword arguments.
    :return: The constructed SDFG.
    """
    sdfg = dace.SDFG(f"gemv_{expansion}_{dtype}_{transposed}_w{veclen}")

    rows = dace.symbol("m")
    cols = dace.symbol("n") / veclen  # column count in units of vectors
    vtype = dace.vector(dtype, veclen)

    state = sdfg.add_state("gemv_compute")

    if transposed:
        x_size, x_type = rows, dtype
        y_size, y_type = cols, vtype
    else:
        x_size, x_type = cols, vtype
        y_size, y_type = rows, dtype

    sdfg.add_array("A", shape=[rows, cols], dtype=vtype)
    sdfg.add_array("x", shape=[x_size], dtype=x_type)
    sdfg.add_array("y", shape=[y_size], dtype=y_type)

    a_node = state.add_read("A")
    x_node = state.add_read("x")
    y_node = state.add_write("y")

    gemv_node = blas.Gemv("gemv", transA=transposed, alpha=alpha, beta=beta)
    gemv_node.implementation = expansion

    inputs = (
        (a_node, "_A", f"A[0:{rows}, 0:{cols}]"),
        (x_node, "_x", f"x[0:{x_size}]"),
    )
    for source, connector, subset in inputs:
        state.add_memlet_path(source,
                              gemv_node,
                              dst_conn=connector,
                              memlet=Memlet(subset))
    state.add_memlet_path(gemv_node,
                          y_node,
                          src_conn="_y",
                          memlet=Memlet(f"y[0:{y_size}]"))

    if expansion_args is not None:
        gemv_node.expand(sdfg, state, **expansion_args)

    return sdfg
Example #9
0
def make_copy_to_host_state(sdfg):
    """Add the "copy_to_host" state that copies ``B_device`` into ``B``.

    ``B_device`` is declared as a transient FPGA-global array and ``B`` as a
    regular array; both use ``dace.vector(DTYPE, VECTOR_LENGTH.get())`` as
    element type.

    :param sdfg: SDFG to which the state and both arrays are added.
    :return: The newly created state.
    """
    state = sdfg.add_state("copy_to_host")

    veclen = VECTOR_LENGTH.get()
    vec_type = dace.vector(DTYPE, veclen)

    # NOTE(review): B_device is declared with SIZE vector elements while B
    # has SIZE // veclen -- confirm that this difference is intended.
    sdfg.add_array("B_device", [SIZE],
                   dtype=vec_type,
                   transient=True,
                   storage=dace.dtypes.StorageType.FPGA_Global)
    sdfg.add_array("B", [SIZE // veclen], dtype=vec_type)

    device_node = state.add_read("B_device")
    host_node = state.add_write("B")

    copy_memlet = dace.memlet.Memlet.simple("B",
                                            f"0:N//{veclen}",
                                            num_accesses=SIZE // veclen)
    state.add_memlet_path(device_node, host_node, memlet=copy_memlet)

    return state
Example #10
0
def get_dace_type(node: Union[mlir.astnodes.IntegerType,
                              mlir.astnodes.FloatType,
                              mlir.astnodes.VectorType]):
    """Translate an MLIR type AST node into the corresponding DaCe type.

    Integer nodes are looked up in ``TYPE_DICT`` under ``"i"`` followed by
    ``node.width.value``, float nodes under ``node.type.name``. Vector nodes
    are converted recursively into a ``dace.vector`` of the translated
    element type and the first dimension.

    :param node: MLIR type node to translate.
    :return: The DaCe type, or ``None`` if ``node`` is none of the three
             handled node classes.
    """
    astnodes = mlir.astnodes

    if isinstance(node, astnodes.IntegerType):
        key = "i" + node.width.value
        return TYPE_DICT[key]

    if isinstance(node, astnodes.FloatType):
        return TYPE_DICT[node.type.name]

    if isinstance(node, astnodes.VectorType):
        # Only the first dimension of the vector type is used.
        length = node.dimensions[0]
        element_type = get_dace_type(node.element_type)
        return dace.vector(element_type, length)

    return None
def make_copy_to_fpga_state(sdfg, vec_width=1):
    """Add the "copy_to_device" state copying ``A``, ``B`` and ``C`` to the FPGA.

    ``A`` is a ``dace.float32`` array on both sides. ``B`` and ``C`` use
    ``dace.vector(dace.float32, vec_width)`` with their second dimension
    divided by ``vec_width``, on the host as well as on the device. The
    ``*_device`` arrays are transient and stored in FPGA global memory.

    :param sdfg: SDFG to which the state and all arrays are added.
    :param vec_width: Vector width used for ``B`` and ``C``.
    :return: The newly created state.
    """
    state = sdfg.add_state("copy_to_device")
    vec_type = dace.vector(dace.float32, vec_width)

    # Host-side arrays and their access nodes.
    host_arrays = (
        ("A", [N, K], dace.float32),
        ("B", [K, M / vec_width], vec_type),
        ("C", [N, M / vec_width], vec_type),
    )
    for name, shape, dtype in host_arrays:
        sdfg.add_array(name, shape, dtype=dtype)
    host_nodes = [state.add_read(name) for name, _, _ in host_arrays]

    # Device-side counterparts: same shapes and element types, but transient
    # and placed in FPGA global memory.
    device_arrays = (
        ("A_device", [N, K], dace.float32),
        ("B_device", [K, M / vec_width], vec_type),
        ("C_device", [N, M / vec_width], vec_type),
    )
    for name, shape, dtype in device_arrays:
        sdfg.add_array(name,
                       shape,
                       dtype=dtype,
                       transient=True,
                       storage=dace.dtypes.StorageType.FPGA_Global)
    device_nodes = [state.add_write(name) for name, _, _ in device_arrays]

    subsets = (
        "A_device[0:N, 0:K]",
        f"B_device[0:K, 0:M/{vec_width}]",
        f"C_device[0:N, 0:M/{vec_width}]",
    )
    for host, device, subset in zip(host_nodes, device_nodes, subsets):
        state.add_memlet_path(host, device, memlet=dace.Memlet(subset))

    return state
def test_map_unroll_processing_elements_decoupled():
    """Run the systolic GEMM sample with its ``p`` map scheduled as ``Unrolled``.

    The sample module is loaded from ``samples/fpga/gemm_systolic_vectorized.py``
    relative to this file. Every map whose parameters are ``["p"]`` gets
    ``unroll = False`` and the ``Unrolled`` schedule. The SDFG is executed
    with the ``compiler.xilinx.decouple_array_interfaces`` configuration
    entry temporarily set to ``True`` and the result is compared against
    ``A @ B + C`` computed with NumPy.

    :return: The executed SDFG.
    :raises ValueError: If the computed ``C`` does not match the reference.
    """
    # Grab the systolic GEMM implementation from the samples directory
    spec = importlib.util.spec_from_file_location(
        "gemm",
        Path(__file__).parent.parent.parent / "samples" / "fpga" /
        "gemm_systolic_vectorized.py")
    gemm = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(gemm)

    N = 128
    K = 256
    M = 512
    P = 8
    W = 4
    TN = 32
    TM = 128

    # Create an SDFG with multiple processing elements
    sdfg = gemm.make_sdfg("map_unroll_processing_elements",
                          dace.vector(dace.float32, W))
    sdfg.specialize({"P": P, "W": W, "TN": TN, "TM": TM})
    for state in sdfg.states():
        for node in state.nodes():
            if isinstance(node, nodes.MapEntry) and node.params == ["p"]:
                # Replace the unroll flag by the Unrolled schedule
                node.unroll = False
                node.schedule = dace.ScheduleType.Unrolled

    # Initialize arrays with random values; C is both an input and the output
    A = np.ndarray([N, K], dtype=dace.float32.type)
    B = np.ndarray([K, M], dtype=dace.float32.type)
    C = np.ndarray([N, M], dtype=dace.float32.type)
    A[:] = np.random.rand(N, K).astype(dace.float32.type)
    B[:] = np.random.rand(K, M).astype(dace.float32.type)
    C[:] = np.random.rand(N, M).astype(dace.float32.type)

    # Reference result must be computed before the SDFG overwrites C
    C_regression = A @ B + C

    with set_temporary("compiler",
                       "xilinx",
                       "decouple_array_interfaces",
                       value=True):
        sdfg(A=A, B=B, C=C, N=N, M=M, K=K)
    if not np.allclose(C_regression, C):
        raise ValueError("Verification failed.")

    return sdfg
Example #13
0
def make_sdfg_2d(implementation: str, vector_length: int):
    """Build an SDFG holding a single 2D ``Stencil`` node.

    The arrays ``a``, ``d`` and ``res`` have shape
    ``(ROWS, COLS / vector_length)``; their element type is
    ``dace.vector(dace.typeclass(DTYPE), vector_length)`` when
    ``vector_length > 1`` and plain ``DTYPE`` otherwise. ``b`` is a 1D
    ``DTYPE`` array of ``ROWS`` elements and ``c`` is an SDFG symbol of type
    ``DTYPE``.

    :param implementation: Value assigned to the stencil node's
                           ``implementation`` attribute.
    :param vector_length: Vector width used for the array element type.
    :return: The constructed SDFG.
    """
    if vector_length > 1:
        vtype = dace.vector(dace.typeclass(DTYPE), vector_length)
    else:
        vtype = DTYPE

    sdfg = dace.SDFG(f"stencil_node_test_2d_w{vector_length}")
    grid_shape = (ROWS, COLS / vector_length)
    _, a_desc = sdfg.add_array("a", grid_shape, dtype=vtype)
    _, b_desc = sdfg.add_array("b", (ROWS, ), dtype=DTYPE)
    sdfg.add_symbol("c", DTYPE)
    _, d_desc = sdfg.add_array("d", grid_shape, dtype=vtype)
    _, res_desc = sdfg.add_array("res", grid_shape, dtype=vtype)

    state = sdfg.add_state("stencil_node_test_2d")
    input_edges = [(name, state.add_read(name), desc)
                   for name, desc in (("a", a_desc), ("b", b_desc),
                                      ("d", d_desc))]
    res_node = state.add_write("res")

    stencil = Stencil(
        "stencil_test",
        "res[0, 0] = c * b[0] * (a[-1, 0] + a[1, 0] + a[0, -1] + a[0, 1]) + d[0, -1] + d[0, 1]",
        iterator_mapping={"b": (True, False)})
    stencil.implementation = implementation
    state.add_node(stencil)

    # Each input connector carries the name of the array feeding it.
    for name, node, desc in input_edges:
        state.add_memlet_path(node,
                              stencil,
                              dst_conn=name,
                              memlet=dace.Memlet.from_array(name, desc))
    state.add_memlet_path(stencil,
                          res_node,
                          src_conn="res",
                          memlet=dace.Memlet.from_array("res", res_desc))

    return sdfg
Example #14
0
def pure_graph(veclen, dtype, implementation, test_case):
    """Build an SDFG with one ``blas.axpy.Axpy`` node and expand it.

    ``x`` and ``y`` hold ``n / veclen`` elements of type
    ``dace.vector(dtype, veclen)``. The node reads ``x`` and ``y`` and writes
    its ``_res`` connector back into ``y``. The scaling factor is the symbol
    ``a``, registered on the SDFG with type ``dtype``.

    :param veclen: Vector width of ``x`` and ``y``.
    :param dtype: Base data type.
    :param implementation: Value assigned to the node's ``implementation``.
    :param test_case: Only used as part of the SDFG name.
    :return: The SDFG after ``expand_library_nodes()``.
    """
    n = dace.symbol("n")
    a = dace.symbol("a")

    sdfg = dace.SDFG(f"axpy_test_{implementation}_{test_case}_w{veclen}")
    test_state = sdfg.add_state("test_state")

    vtype = dace.vector(dtype, veclen)

    sdfg.add_symbol(a.name, dtype)

    for array_name in ("x", "y"):
        sdfg.add_array(array_name, shape=[n / veclen], dtype=vtype)

    x_in = test_state.add_read("x")
    y_in = test_state.add_read("y")
    y_out = test_state.add_write("y")

    axpy_node = blas.axpy.Axpy("axpy", a)
    axpy_node.implementation = implementation

    for source, connector, subset in ((x_in, "_x", f"x[0:n/{veclen}]"),
                                      (y_in, "_y", f"y[0:n/{veclen}]")):
        test_state.add_memlet_path(source,
                                   axpy_node,
                                   dst_conn=connector,
                                   memlet=Memlet(subset))
    # The result overwrites y
    test_state.add_memlet_path(axpy_node,
                               y_out,
                               src_conn="_res",
                               memlet=Memlet(f"y[0:n/{veclen}]"))

    sdfg.expand_library_nodes()

    return sdfg
Example #15
0
def pure_graph(implementation, dtype, veclen):
    """Build the "ger_test" SDFG containing a single ``blas.Ger`` node.

    ``x`` is a plain ``dtype`` array of ``m`` elements. ``y`` (``n / veclen``
    elements), ``A`` and ``res`` (both ``m x n / veclen``) use
    ``dace.vector(dtype, veclen)``. The symbol ``alpha`` is registered on the
    SDFG with type ``dtype``.

    :param implementation: Value assigned to the node's ``implementation``.
    :param dtype: Base data type.
    :param veclen: Vector width.
    :return: Tuple ``(ger_node, state, sdfg)``.
    """
    m = dace.symbol("m")
    n = dace.symbol("n")
    vtype = dace.vector(dtype, veclen)

    sdfg = dace.SDFG("ger_test")

    state = sdfg.add_state("ger")

    sdfg.add_symbol("alpha", dtype)

    array_specs = (
        ("x", [m], dtype),
        ("y", [n / veclen], vtype),
        ("A", [m, n / veclen], vtype),
        ("res", [m, n / veclen], vtype),
    )
    for array_name, shape, element_type in array_specs:
        sdfg.add_array(array_name, shape=shape, dtype=element_type)

    x_node = state.add_read("x")
    y_node = state.add_read("y")
    a_node = state.add_read("A")
    res_node = state.add_write("res")

    ger_node = blas.Ger(name="ger")
    ger_node.implementation = implementation

    inputs = (
        (x_node, "_x", "x[0:m]"),
        (y_node, "_y", f"y[0:n/{veclen}]"),
        (a_node, "_A", f"A[0:m, 0:n/{veclen}]"),
    )
    for source, connector, subset in inputs:
        state.add_memlet_path(source,
                              ger_node,
                              dst_conn=connector,
                              memlet=Memlet(subset))
    state.add_memlet_path(ger_node,
                          res_node,
                          src_conn="_res",
                          memlet=Memlet(f"res[0:m, 0:n/{veclen}]"))

    return ger_node, state, sdfg
Example #16
0
def make_fpga_state(sdfg, vectorize_connector):
    """Add the "fpga_state" state: unpack, shuffle and re-pack each element.

    For every ``i`` of the outer map, element ``i`` of ``A_device`` is copied
    by a tasklet into the register array ``input_buffer``. An unrolled map
    over ``w`` then copies ``input_buffer[(w + W // 2) % W]`` to
    ``output_buffer[w]``, and a final tasklet writes the whole
    ``output_buffer`` to element ``i`` of ``B_device``.

    :param sdfg: SDFG to which the state and the two buffers are added. The
                 arrays ``A_device`` and ``B_device`` are only referenced
                 here, not declared.
    :param vectorize_connector: If true, the buffer-side connector of the
                                unpack and pack tasklets is explicitly typed
                                as ``dace.vector(DTYPE, VECTOR_LENGTH.get())``;
                                otherwise the connectors are passed as a
                                plain set of names.
    :return: The newly created state.
    """
    state = sdfg.add_state("fpga_state")

    veclen = VECTOR_LENGTH.get()
    whole_buffer = f"0:{veclen}"

    for buffer_name in ("input_buffer", "output_buffer"):
        sdfg.add_array(buffer_name, (veclen, ),
                       DTYPE,
                       transient=True,
                       storage=dace.StorageType.FPGA_Registers)

    read_input = state.add_read("A_device")
    read_buffer = state.add_access("input_buffer")
    write_buffer = state.add_access("output_buffer")
    write_output = state.add_write("B_device")

    outer_entry, outer_exit = state.add_map(
        "outer_map", {"i": "0:N/W"}, schedule=dace.ScheduleType.FPGA_Device)

    # Read one element of A_device into the unpacked register buffer
    if vectorize_connector:
        unpack_outputs = {"a_unpacked": dace.vector(DTYPE, veclen)}
    else:
        unpack_outputs = {"a_unpacked"}  # no explicit connector type
    unpack_tasklet = state.add_tasklet("unpack_tasklet", {"a"},
                                       unpack_outputs, "a_unpacked = a")
    state.add_memlet_path(read_input,
                          outer_entry,
                          unpack_tasklet,
                          dst_conn="a",
                          memlet=dace.Memlet.simple("A_device", "i"))
    state.add_memlet_path(unpack_tasklet,
                          read_buffer,
                          src_conn="a_unpacked",
                          memlet=dace.Memlet.simple(read_buffer.data,
                                                    whole_buffer))

    # Unrolled shuffle between the two register buffers
    unroll_entry, unroll_exit = state.add_map(
        "shuffle_map", {"w": "0:W"},
        schedule=dace.ScheduleType.FPGA_Device,
        unroll=True)

    shuffle_tasklet = state.add_tasklet("shuffle_tasklet", {"a"}, {"b"},
                                        "b = a")

    shuffle_read = dace.Memlet.simple("input_buffer",
                                      "(w + W // 2) % W",
                                      num_accesses=1)
    state.add_memlet_path(read_buffer,
                          unroll_entry,
                          shuffle_tasklet,
                          dst_conn="a",
                          memlet=shuffle_read)

    shuffle_write = dace.Memlet.simple("output_buffer", "w", num_accesses=1)
    state.add_memlet_path(shuffle_tasklet,
                          unroll_exit,
                          write_buffer,
                          src_conn="b",
                          memlet=shuffle_write)

    # Pack the register buffer again from inside a tasklet; "b" is the
    # tasklet's input connector.
    if vectorize_connector:
        pack_inputs = {"b": dace.vector(DTYPE, veclen)}
    else:
        pack_inputs = {"b"}
    pack_tasklet = state.add_tasklet("pack_tasklet", pack_inputs,
                                     {"b_packed"}, "b_packed = b")
    state.add_memlet_path(write_buffer,
                          pack_tasklet,
                          dst_conn="b",
                          memlet=dace.Memlet.simple(write_buffer.data,
                                                    whole_buffer))

    # Write the packed value back out to B_device
    state.add_memlet_path(pack_tasklet,
                          outer_exit,
                          write_output,
                          src_conn="b_packed",
                          memlet=dace.Memlet.simple("B_device", "i"))

    return state
Example #17
0
def make_vecAdd_sdfg(sdfg_name: str, dtype=dace.float32):
    """Build and validate a three-state vector addition SDFG.

    The states are connected as "copy_to_device" -> "fpga_state" ->
    "copy_to_host". Host arrays ``x``, ``y`` and ``z`` and their transient
    FPGA-global counterparts ``device_x``, ``device_y`` and ``device_z`` all
    hold ``size / 4`` elements of type ``dace.vector(dtype, 4)``. The FPGA
    state maps over ``i`` and computes ``z_con = x_con + y_con``.

    :param sdfg_name: Name of the SDFG.
    :param dtype: Base element type of the vectors.
    :return: The validated SDFG.
    """
    vec_width = 4
    size = dace.symbol("size")
    sdfg = dace.SDFG(sdfg_name)
    vec_type = dace.vector(dtype, vec_width)

    host_inputs = ("x", "y")
    device_inputs = ("device_x", "device_y")
    host_output = "z"
    full_range = f"0:{size}/{vec_width}"

    ###########################################################################
    # Copy data to FPGA

    copy_in_state = sdfg.add_state("copy_to_device")

    for name in host_inputs:
        sdfg.add_array(name, shape=[size / vec_width], dtype=vec_type)
    host_nodes = [copy_in_state.add_read(name) for name in host_inputs]

    for name in device_inputs:
        sdfg.add_array(name,
                       shape=[size / vec_width],
                       dtype=vec_type,
                       storage=dace.dtypes.StorageType.FPGA_Global,
                       transient=True)
    device_nodes = [copy_in_state.add_write(name) for name in device_inputs]

    for host_node, device_node in zip(host_nodes, device_nodes):
        copy_in_state.add_memlet_path(host_node,
                                      device_node,
                                      memlet=Memlet.simple(
                                          host_node, full_range))

    ###########################################################################
    # Copy data from FPGA

    sdfg.add_array(host_output, shape=[size / vec_width], dtype=vec_type)

    copy_out_state = sdfg.add_state("copy_to_host")

    sdfg.add_array("device_z",
                   shape=[size / vec_width],
                   dtype=vec_type,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    out_device = copy_out_state.add_read("device_z")
    out_host = copy_out_state.add_write(host_output)

    copy_out_state.add_memlet_path(out_device,
                                   out_host,
                                   memlet=Memlet.simple(out_host, full_range))

    ###########################################################################
    # FPGA state

    fpga_state = sdfg.add_state("fpga_state")

    x_node = fpga_state.add_read("device_x")
    y_node = fpga_state.add_read("device_y")
    z_node = fpga_state.add_write("device_z")

    # Compute: one map iteration per vector element
    map_entry, map_exit = fpga_state.add_map(
        'vecAdd_map', {'i': full_range},
        schedule=dace.dtypes.ScheduleType.FPGA_Device)

    tasklet = fpga_state.add_tasklet('vecAdd_task', ['x_con', 'y_con'],
                                     ['z_con'], 'z_con = x_con + y_con')

    for source, connector in ((x_node, 'x_con'), (y_node, 'y_con')):
        fpga_state.add_memlet_path(source,
                                   map_entry,
                                   tasklet,
                                   dst_conn=connector,
                                   memlet=dace.Memlet.simple(source.data, "i"))

    fpga_state.add_memlet_path(tasklet,
                               map_exit,
                               z_node,
                               src_conn='z_con',
                               memlet=dace.Memlet.simple(z_node.data, 'i'))

    ###########################################################################
    # Interstate edges

    for src_state, dst_state in ((copy_in_state, fpga_state),
                                 (fpga_state, copy_out_state)):
        sdfg.add_edge(src_state, dst_state, dace.sdfg.sdfg.InterstateEdge())

    ###########################################################################
    # Validate

    sdfg.fill_scope_connectors()
    sdfg.validate()
    return sdfg
Example #18
0
# Symbolic length; every array below has N // veclen elements
N = dace.symbol('N')

# Create the SDFG named 'fladd'
sdfg = dace.SDFG('fladd')

# Add the single state of this snippet
state = sdfg.add_state('device_state')

# Vector width, also registered on the SDFG as the constant 'VECLEN'
veclen = 1
sdfg.add_constant('VECLEN', veclen)

# Declare the arrays: A, B and C live on the CPU heap, while fpga_A, fpga_B
# and fpga_C are transient arrays in FPGA global memory. All six share the
# same shape and the element type dace.vector(dace.float32, veclen).
sdfg.add_array('A', [N // veclen], dtype=dace.vector(dace.float32, veclen), storage=dace.StorageType.CPU_Heap)
sdfg.add_array('B', [N // veclen], dtype=dace.vector(dace.float32, veclen), storage=dace.StorageType.CPU_Heap)
sdfg.add_array('C', [N // veclen], dtype=dace.vector(dace.float32, veclen), storage=dace.StorageType.CPU_Heap)
sdfg.add_array('fpga_A', [N // veclen],
               dtype=dace.vector(dace.float32, veclen),
               transient=True,
               storage=dace.StorageType.FPGA_Global)
sdfg.add_array('fpga_B', [N // veclen],
               dtype=dace.vector(dace.float32, veclen),
               transient=True,
               storage=dace.StorageType.FPGA_Global)
sdfg.add_array('fpga_C', [N // veclen],
               dtype=dace.vector(dace.float32, veclen),
               transient=True,
               storage=dace.StorageType.FPGA_Global)
Example #19
0
def test_tasklet_vector():
    """
        Test rtl tasklet vector support.

        A SystemVerilog tasklet receives array ``A`` through an input
        connector typed ``dace.vector(dace.int32, N)`` (``N`` is specialized
        to 4) and writes a single ``int32`` to ``B``. The test asserts that
        the value written equals ``a[0] + a[1]``.
    """

    # add symbol
    N = dace.symbol('N')

    # add sdfg
    sdfg = dace.SDFG('rtl_tasklet_vector')

    # define compile-time constant
    sdfg.specialize(dict(N=4))

    # add state
    state = sdfg.add_state()

    # add arrays: N-element input and single-element output
    sdfg.add_array('A', [N], dtype=dace.int32)
    sdfg.add_array('B', [1], dtype=dace.int32)

    # add custom RTL tasklet (SystemVerilog); input 'a' is an N-wide vector
    tasklet = state.add_tasklet(name='rtl_tasklet',
                                inputs={'a': dace.vector(dace.int32, N)},
                                outputs={'b'},
                                code='''
        /*
            Convention:
               |---------------------------------------------------------------------|
            -->| ap_aclk (clock input)                                               |
            -->| ap_areset (reset input, rst on high)                                |
               |                                                                     |
            -->| {inputs}                                              reg {outputs} |-->
               |                                                                     |
            <--| s_axis_a_tready (ready for data)       (data avail) m_axis_b_tvalid |-->
            -->| s_axis_a_tvalid (new data avail)    (data consumed) m_axis_b_tready |<--
               |---------------------------------------------------------------------|
        */

        typedef enum [1:0] {READY, BUSY, DONE} state_e;
        state_e state;
    
        always@(posedge ap_aclk) begin
            if (ap_areset) begin // case: reset
                m_axis_b_tdata <= 0;
                s_axis_a_tready <= 1'b1;
                state <= READY;
            end else if (s_axis_a_tvalid && state == READY) begin // case: load a 
                m_axis_b_tdata <= s_axis_a_tdata[0];
                s_axis_a_tready <= 1'b0;
                state <= BUSY;
            end else if (m_axis_b_tdata < s_axis_a_tdata[0] + s_axis_a_tdata[1] && state == BUSY) begin // case: increment counter b
                m_axis_b_tdata <= m_axis_b_tdata + 1;
            end else if (state == BUSY) begin
                m_axis_b_tdata <= m_axis_b_tdata;
                state <= DONE;
            end
        end    
    
        assign m_axis_b_tvalid  = (m_axis_b_tdata >= s_axis_a_tdata[0] + s_axis_a_tdata[1] && (state == BUSY || state == DONE)) ? 1'b1:1'b0; 
        ''',
                                language=dace.Language.SystemVerilog)

    # add input/output array
    A = state.add_read('A')
    B = state.add_write('B')

    # connect input/output array with the tasklet
    # NOTE(review): the subset 'A[0:N-1]' is fed into an N-wide vector
    # connector -- confirm whether 'A[0:N]' was intended.
    state.add_edge(A, None, tasklet, 'a', dace.Memlet('A[0:N-1]'))
    state.add_edge(tasklet, 'b', B, None, dace.Memlet('B[0]'))

    # validate sdfg
    sdfg.validate()

    # Execute

    # init data structures: N random integers in [0, 100) and a zeroed output
    a = np.random.randint(0, 100, dace.symbolic.evaluate(
        N, sdfg.constants)).astype(np.int32)
    b = np.array([0]).astype(np.int32)

    # call program
    sdfg(A=a, B=b)

    # check result: the tasklet is expected to produce a[0] + a[1]
    assert b == a[0] + a[1]
Example #20
0
import dace
import numpy as np

# Two-lane float32 vector type used by the vector-type tests in this module
float2 = dace.vector(dace.float32, 2)


def test_vector_type():
    """Double a ``float2`` array through two tasklets and check against NumPy.

    Both tasklets run the same code. The first one leaves its connector types
    to type inference, the second one declares them as ``float2`` explicitly.
    Each tasklet handles one of the two vector elements of ``A``/``B``.
    """
    graph = dace.SDFG('vectortypes')
    for array_name in ('A', 'B'):
        graph.add_array(array_name, [2], float2)
    body = graph.add_state()
    source = body.add_read('A')
    code = 'b = a * 2'
    # Connector types left to type inference
    inferred = body.add_tasklet('something', {'a'}, {'b'}, code)
    # Connector types spelled out explicitly
    explicit = body.add_tasklet('something', {'a': float2}, {'b': float2},
                                code)
    sink = body.add_write('B')
    # Element 0 goes through the inferred tasklet, element 1 through the
    # explicit one; edges are added in read-then-write order per tasklet.
    for index, node in enumerate((inferred, explicit)):
        body.add_edge(source, None, node, 'a', dace.Memlet(f'A[{index}]'))
        body.add_edge(node, 'b', sink, None, dace.Memlet(f'B[{index}]'))

    # Four float32 values back the two float2 elements of each array
    host_in = np.random.rand(4).astype(np.float32)
    host_out = np.random.rand(4).astype(np.float32)
    graph(A=host_in, B=host_out)
    assert np.allclose(host_out, 2 * host_in)


def test_vector_type_inference():
Example #21
0
    output[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('myvmin')
    state = mysdfg.add_state()
    A = state.add_array('A', [N], dp.float32)
    B = state.add_array('B', [N], dp.float32)
    C = state.add_array('C', [N], dp.float32)

    tasklet, map_entry, map_exit = state.add_mapped_tasklet(
        'mytasklet', dict(i='0:N:2'),
        dict(a=Memlet.simple(A, 'i'), b=Memlet.simple(B, 'i')),
        'c = min(a, b)', dict(c=Memlet.simple(C, 'i')))

    # Manually vectorize tasklet
    tasklet.in_connectors['a'] = dp.vector(dp.float32, 2)
    tasklet.in_connectors['b'] = dp.vector(dp.float32, 2)
    tasklet.out_connectors['c'] = dp.vector(dp.float32, 2)

    # Add outer edges
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))
    state.add_edge(B, None, map_entry, None, Memlet.simple(B, '0:N'))
    state.add_edge(map_exit, None, C, None, Memlet.simple(C, '0:N'))

    mysdfg(A=input, B=input2, C=output, N=N)

    diff = np.linalg.norm(np.minimum(input, input2) - output) / N.get()
    print("Difference:", diff)
    print("==== Program end ====")
    exit(0 if diff <= 1e-5 else 1)
Example #22
0
def make_fpga_sdfg():
    '''
    Build an SDFG that chains two nested SDFGs inside a single state.

    The SDFG has three states connected in sequence:

    1. ``copy_to_device``: copies host arrays ``x``, ``y`` and ``v`` into the
       FPGA-global transients ``device_x``, ``device_y`` and ``device_v``.
    2. ``I_do_not_want_to_be_fpga_kernel``: holds two nested SDFGs produced by
       ``make_vec_add_sdfg()``. The first maps ``device_x``/``device_y`` to
       ``device_z``; the second maps ``device_z``/``device_v`` to
       ``device_u``. The state is explicitly tagged with
       ``is_FPGA_kernel = False``.
    3. ``copy_to_host``: copies ``device_z`` and ``device_u`` back to the host
       arrays ``z`` and ``u``.

    All arrays hold ``n / 4`` elements of a 4-wide float32 vector type.

    :return: The validated SDFG.
    '''

    n = dace.symbol("n")
    vecWidth = 4
    vecType = dace.vector(dace.float32, vecWidth)
    sdfg = dace.SDFG("nested_sdfg_kernels")

    ###########################################################################
    # Copy data to FPGA

    # First state added to the SDFG
    copy_in_state = sdfg.add_state("copy_to_device")

    # Host-side inputs
    sdfg.add_array("x", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("y", shape=[n / vecWidth], dtype=vecType)

    sdfg.add_array("v", shape=[n / vecWidth], dtype=vecType)

    in_host_x = copy_in_state.add_read("x")
    in_host_y = copy_in_state.add_read("y")

    in_host_v = copy_in_state.add_read("v")

    # Device-side (FPGA global memory) transient copies of the inputs
    sdfg.add_array("device_x",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("device_y",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    sdfg.add_array("device_v",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    in_device_x = copy_in_state.add_write("device_x")
    in_device_y = copy_in_state.add_write("device_y")

    in_device_v = copy_in_state.add_write("device_v")

    # Full-range host -> device copies; the memlets name the host array
    copy_in_state.add_memlet_path(in_host_x, in_device_x, memlet=dace.Memlet(f"{in_host_x.data}[0:{n}/{vecWidth}]"))
    copy_in_state.add_memlet_path(in_host_y, in_device_y, memlet=dace.Memlet(f"{in_host_y.data}[0:{n}/{vecWidth}]"))

    copy_in_state.add_memlet_path(in_host_v, in_device_v, memlet=dace.Memlet(f"{in_host_v.data}[0:{n}/{vecWidth}]"))

    ###########################################################################
    # Copy data from FPGA
    # Host-side outputs
    sdfg.add_array("z", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("u", shape=[n / vecWidth], dtype=vecType)

    copy_out_state = sdfg.add_state("copy_to_host")

    # Device-side transients that receive the nested SDFG results
    sdfg.add_array("device_z",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    sdfg.add_array("device_u",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    out_device_z = copy_out_state.add_read("device_z")
    out_host_z = copy_out_state.add_write("z")

    out_device_u = copy_out_state.add_read("device_u")
    out_host_u = copy_out_state.add_write("u")

    # Full-range device -> host copies; the memlets name the host array
    copy_out_state.add_memlet_path(out_device_z, out_host_z, memlet=dace.Memlet(f"{out_host_z.data}[0:{n}/{vecWidth}]"))
    copy_out_state.add_memlet_path(out_device_u, out_host_u, memlet=dace.Memlet(f"{out_host_u.data}[0:{n}/{vecWidth}]"))
    ###########################################################################
    # State that must not become an FPGA kernel

    non_fpga_state = sdfg.add_state("I_do_not_want_to_be_fpga_kernel")
    non_fpga_state.location["is_FPGA_kernel"] = False
    # Build the vec addition SDFG and nest it

    # The earlier in_device_*/out_device_* names are rebound here to access
    # nodes that live in the compute state.
    in_device_x = non_fpga_state.add_read("device_x")
    in_device_y = non_fpga_state.add_read("device_y")
    in_device_v = non_fpga_state.add_read("device_v")
    out_device_z = non_fpga_state.add_write("device_z")
    out_device_u = non_fpga_state.add_write("device_u")

    # NOTE(review): make_vec_add_sdfg is defined outside this view; it is
    # presumably an element-wise vector addition -- confirm.
    to_nest = make_vec_add_sdfg()
    # add nested sdfg with symbol mapping
    nested_sdfg = non_fpga_state.add_nested_sdfg(to_nest, sdfg, {"_device_x", "_device_y"}, {"_device_z"},
                                                 {"size": "n"})

    # First nested SDFG: (device_x, device_y) -> device_z
    non_fpga_state.add_memlet_path(in_device_x,
                                   nested_sdfg,
                                   dst_conn="_device_x",
                                   memlet=dace.Memlet(f"{in_device_x.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(in_device_y,
                                   nested_sdfg,
                                   dst_conn="_device_y",
                                   memlet=dace.Memlet(f"{in_device_y.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(nested_sdfg,
                                   out_device_z,
                                   src_conn="_device_z",
                                   memlet=dace.Memlet(f"{out_device_z.data}[0:{n}/{vecWidth}]"))

    # Build the second vec addition SDFG and nest it

    to_nest = make_vec_add_sdfg()
    # add nested sdfg with symbol mapping
    nested_sdfg = non_fpga_state.add_nested_sdfg(to_nest, sdfg, {"_device_x", "_device_y"}, {"_device_z"},
                                                 {"size": "n"})

    # Second nested SDFG: (device_z, device_v) -> device_u. The device_z
    # access node written above is reused as the input, which chains the two
    # nested SDFGs.
    non_fpga_state.add_memlet_path(out_device_z,
                                   nested_sdfg,
                                   dst_conn="_device_x",
                                   memlet=dace.Memlet(f"{out_device_z.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(in_device_v,
                                   nested_sdfg,
                                   dst_conn="_device_y",
                                   memlet=dace.Memlet(f"{in_device_v.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(nested_sdfg,
                                   out_device_u,
                                   src_conn="_device_z",
                                   memlet=dace.Memlet(f"{out_device_u.data}[0:{n}/{vecWidth}]"))

    ######################################
    # Interstate edges
    # copy_to_device -> compute state -> copy_to_host
    sdfg.add_edge(copy_in_state, non_fpga_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.add_edge(non_fpga_state, copy_out_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.fill_scope_connectors()
    sdfg.validate()

    return sdfg
Example #23
0
def make_sdfg(name="fpga_stcl_test", dtype=dace.float32, veclen=8):
    """Build an SDFG for a vectorized 4-point stencil run in an FPGA pipeline.

    The SDFG has three states executed in sequence:

    * ``<name>_pre`` copies the host array ``a`` to ``a_device``.
    * ``<name>`` streams ``a_device`` through a shift register, evaluates
      ``0.25 * (north + west + east + south)`` for ``veclen`` lanes per
      iteration (out-of-bounds neighbours are replaced by ``1``), and streams
      the packed result into ``b_device``.
    * ``<name>_post`` copies ``b_device`` back to the host array ``b``.

    Arrays ``a`` and ``b`` have shape ``(N, M / veclen)`` with elements of
    type ``dace.vector(dtype, veclen)``.

    :param name: Name of the SDFG; also the prefix of its states, maps and
                 tasklets.
    :param dtype: Scalar element type of the data.
    :param veclen: Number of scalar lanes per vector element.
    :return: The constructed SDFG (not validated here).
    """

    vtype = dace.vector(dtype, veclen)

    n = dace.symbol("N")
    m = dace.symbol("M")

    sdfg = dace.SDFG(name)

    # The state added first is the pre-state, followed by compute and post
    pre_state = sdfg.add_state(name + "_pre")
    state = sdfg.add_state(name)
    post_state = sdfg.add_state(name + "_post")
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    # Host arrays, plus device-side transient copies of their descriptors
    # placed in FPGA global memory banks 0 (input) and 1 (output)
    _, desc_input_host = sdfg.add_array("a", (n, m / veclen), vtype)
    _, desc_output_host = sdfg.add_array("b", (n, m / veclen), vtype)
    desc_input_device = copy.copy(desc_input_host)
    desc_input_device.storage = dace.StorageType.FPGA_Global
    desc_input_device.location["bank"] = 0
    desc_input_device.transient = True
    desc_output_device = copy.copy(desc_output_host)
    desc_output_device.storage = dace.StorageType.FPGA_Global
    desc_output_device.location["bank"] = 1
    desc_output_device.transient = True
    sdfg.add_datadesc("a_device", desc_input_device)
    sdfg.add_datadesc("b_device", desc_output_device)

    # Host to device
    pre_read = pre_state.add_read("a")
    pre_write = pre_state.add_write("a_device")
    pre_state.add_memlet_path(
        pre_read, pre_write, memlet=dace.Memlet(f"a_device[0:N, 0:M/{veclen}]"))

    # Device to host
    post_read = post_state.add_read("b_device")
    post_write = post_state.add_write("b")
    post_state.add_memlet_path(
        post_read,
        post_write,
        memlet=dace.Memlet(f"b_device[0:N, 0:M/{veclen}]"))

    # Compute state
    read_memory = state.add_read("a_device")
    write_memory = state.add_write("b_device")

    # Memory streams
    sdfg.add_stream("a_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    sdfg.add_stream("b_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    produce_input_stream = state.add_write("a_stream")
    consume_input_stream = state.add_read("a_stream")
    produce_output_stream = state.add_write("b_stream")
    # This node only has outgoing edges (b_stream -> b_device), so it is a
    # read of the stream.
    consume_output_stream = state.add_read("b_stream")

    # Stencil body: i and j are the pipeline iterators, u is the unrolled
    # lane index, so {W}*j + u is the scalar column of the current lane.
    tasklet = state.add_tasklet(
        name, {"_north", "_west", "_east", "_south"}, {"result"}, """\
north = _north if i >= 1 else 1
west = _west if {W}*j + u >= 1 else 1
east = _east if {W}*j + u < M - 1 else 1
south = _south if i < N - 1 else 1

result = 0.25 * (north + west + east + south)""".format(W=veclen))

    # Pipeline over all vector elements with one row (M / veclen vectors) of
    # initialization and of draining
    entry, exit = state.add_pipeline(name, {
        "i": "0:N",
        "j": "0:M/{}".format(veclen),
    },
                                     schedule=dace.ScheduleType.FPGA_Device,
                                     init_size=m / veclen,
                                     init_overlap=False,
                                     drain_size=m / veclen,
                                     drain_overlap=True)

    # Unrolled map
    unroll_entry, unroll_exit = state.add_map(
        name + "_unroll", {"u": "0:{}".format(veclen)},
        schedule=dace.ScheduleType.FPGA_Device,
        unroll=True)

    # Container-to-container copies between arrays and streams
    state.add_memlet_path(read_memory,
                          produce_input_stream,
                          memlet=dace.Memlet(
                              f"{read_memory.data}[0:N, 0:M/{veclen}]",
                              other_subset="0"))
    # The memlet expression carries both the array name and its subset. The
    # second positional parameter of Memlet is ``data``, so the subset must
    # not be passed there.
    state.add_memlet_path(consume_output_stream,
                          write_memory,
                          memlet=dace.Memlet(
                              f"{write_memory.data}[0:N, 0:M/{veclen}]",
                              other_subset="0"))

    # Container-to-container copy from vectorized stream to non-vectorized
    # buffer
    sdfg.add_array("input_buffer", (1, ),
                   vtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    # Holds two full rows plus one vector of scalars, so that the north
    # (offset 0) and south (offset 2*M) neighbours are available together
    sdfg.add_array("shift_register", (2 * m + veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_ShiftRegister,
                   transient=True)
    sdfg.add_array("output_buffer", (veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    sdfg.add_array("output_buffer_packed", (1, ),
                   vtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    input_buffer = state.add_access("input_buffer")
    shift_register = state.add_access("shift_register")
    output_buffer = state.add_access("output_buffer")
    output_buffer_packed = state.add_access("output_buffer_packed")

    # Only read from the input stream while the pipeline is not draining
    read_tasklet = state.add_tasklet(
        name + "_conditional_read", {"_in"}, {"_out"},
        "if not {}:\n\t_out = _in".format(entry.pipeline.drain_condition()))

    # Input stream to buffer
    state.add_memlet_path(consume_input_stream,
                          entry,
                          read_tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet(f"{consume_input_stream.data}[0]",
                                             dynamic=True))
    state.add_memlet_path(read_tasklet,
                          input_buffer,
                          src_conn="_out",
                          memlet=dace.Memlet(f"{input_buffer.data}[0]"))
    # The new vector is written to the last veclen slots of the shift register
    state.add_memlet_path(input_buffer,
                          shift_register,
                          memlet=dace.Memlet(f"{input_buffer.data}[0]",
                                             other_subset=f"2*M:(2*M + {veclen})"))

    # Stencils accesses
    state.add_memlet_path(
        shift_register,
        unroll_entry,
        tasklet,
        dst_conn="_north",
        memlet=dace.Memlet(f"{shift_register.data}[u]"))  # North
    state.add_memlet_path(
        shift_register,
        unroll_entry,
        tasklet,
        dst_conn="_west",
        memlet=dace.Memlet(f"{shift_register.data}[u + M - 1]"))  # West
    state.add_memlet_path(
        shift_register,
        unroll_entry,
        tasklet,
        dst_conn="_east",
        memlet=dace.Memlet(f"{shift_register.data}[u + M + 1]"))  # East
    state.add_memlet_path(
        shift_register,
        unroll_entry,
        tasklet,
        dst_conn="_south",
        memlet=dace.Memlet(f"{shift_register.data}[u + 2 * M]"))  # South

    # Tasklet to buffer
    state.add_memlet_path(tasklet,
                          unroll_exit,
                          output_buffer,
                          src_conn="result",
                          memlet=dace.Memlet(f"{output_buffer.data}[u]"))

    # Pack buffer
    state.add_memlet_path(output_buffer,
                          output_buffer_packed,
                          memlet=dace.Memlet(f"{output_buffer_packed.data}[0]",
                                             other_subset=f"0:{veclen}"))

    # Only write if not initializing
    write_tasklet = state.add_tasklet(
        name + "_conditional_write", {"_in"}, {"_out"},
        "if not {}:\n\t_out = _in".format(entry.pipeline.init_condition()))

    # Packed buffer to the conditional write tasklet
    state.add_memlet_path(output_buffer_packed,
                          write_tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet(f"{output_buffer_packed.data}[0]"))

    # Conditional write tasklet to output stream
    state.add_memlet_path(write_tasklet,
                          exit,
                          produce_output_stream,
                          src_conn="_out",
                          memlet=dace.Memlet(f"{produce_output_stream.data}[0]",
                                             dynamic=True))

    return sdfg
def make_compute(sdfg, state, vec_width=1):
    """Add the processing-element compute graph of a GEMM to ``state``.

    The graph is wrapped in an unrolled map over ``p`` in ``0:P``. Each
    iteration ``p`` reads from index ``p`` of the ``A_pipe``/``B_pipe``
    streams, forwards to index ``p + 1``, accumulates into ``C_buffer`` and
    finally writes to ``C_pipe[p]`` (taking forwarded data from
    ``C_pipe[p-1]``).

    ``N``, ``K``, ``M`` and ``P`` are names resolved from the enclosing
    module, and the streams ``A_pipe``, ``B_pipe`` and ``C_pipe`` are
    referenced by name only.
    NOTE(review): the streams are presumably registered on ``sdfg`` by the
    caller before this function runs -- confirm.

    :param sdfg: SDFG on which the ``A_reg`` and ``C_buffer`` transients are
                 declared.
    :param state: State that receives the nodes and memlets.
    :param vec_width: Number of float32 lanes per element of ``C_buffer``;
                      the ``m`` iteration ranges are divided by it.
    """

    vec_type = dace.vector(dace.float32, vec_width)
    # Separate access nodes for the consuming and producing side of each pipe
    A_pipe_in = state.add_read("A_pipe")
    A_pipe_out = state.add_write("A_pipe")
    B_pipe_in = state.add_read("B_pipe")
    B_pipe_out = state.add_write("B_pipe")
    C_pipe_in = state.add_read("C_pipe")
    C_pipe_out = state.add_write("C_pipe")

    # Loop nest: n0 (tiles of P rows) -> k -> {buffer_A over n1 | m}
    entry_n0, exit_n0 = state.add_map("n0", {
        "n0": "0:N/P",
    },
                                      schedule=dace.ScheduleType.FPGA_Device)
    entry_k, exit_k = state.add_map("k", {"k": "0:K"},
                                    schedule=dace.ScheduleType.FPGA_Device)
    entry_a, exit_a = state.add_map("buffer_A", {"n1": "0:P"},
                                    schedule=dace.ScheduleType.FPGA_Device)

    # As we are using vectorized data types for B, we have to consider it into these
    # two maps
    entry_m, exit_m = state.add_map("m", {"m": "0:M/{}".format(vec_width)},
                                    schedule=dace.ScheduleType.FPGA_Device)
    entry_c, exit_c = state.add_map("write_C", {
        "n1": "0:P",
        "m": "0:M/{}".format(vec_width)
    },
                                    schedule=dace.ScheduleType.FPGA_Device)

    # Instantiate buffers
    # Scalar register holding the element of A that this PE keeps
    sdfg.add_scalar("A_reg",
                    dtype=dace.float32,
                    transient=True,
                    storage=dace.dtypes.StorageType.FPGA_Registers)
    A_reg = state.add_write("A_reg")

    # For C result we are going to use vectorized data type
    sdfg.add_array("C_buffer", [M / vec_width],
                   dtype=vec_type,
                   transient=True,
                   storage=dace.dtypes.StorageType.FPGA_Local)
    C_buffer_in = state.add_read("C_buffer")
    C_buffer_out = state.add_write("C_buffer")

    # every PE: reads input data, buffer the data assigned to it, forwards the data
    # a_reg is written only when n1 == P - p - 1; a_out is written only when
    # this is not the last PE (p < P - 1). Both outputs are therefore dynamic.
    buffer_a_tasklet = state.add_tasklet(
        "buffer_a", {"a_in"}, {"a_reg", "a_out"}, """\
if n1 == P - p - 1:
    a_reg = a_in
if p < P - 1:
    a_out = a_in""")
    state.add_memlet_path(A_pipe_in,
                          entry_n0,
                          entry_k,
                          entry_a,
                          buffer_a_tasklet,
                          memlet=dace.Memlet("A_pipe[p]", dynamic=False),
                          dst_conn="a_in")
    state.add_memlet_path(buffer_a_tasklet,
                          exit_a,
                          A_reg,
                          memlet=dace.Memlet("A_reg[0]", dynamic=True),
                          src_conn="a_reg")
    state.add_memlet_path(buffer_a_tasklet,
                          exit_a,
                          exit_k,
                          exit_n0,
                          A_pipe_out,
                          memlet=dace.Memlet("A_pipe[p + 1]", dynamic=True),
                          src_conn="a_out")
    # Compute and forward B
    # The accumulator restarts from 0 at k == 0; b_in is forwarded to the next
    # PE unless this is the last one.
    compute_tasklet = state.add_tasklet(
        "multiply_add", {"a_in", "b_in", "c_in"}, {"b_out", "c_out"}, """\
c_prev = c_in
if k == 0:
    c_prev = 0
c_out = c_prev + a_in * b_in
if p < P - 1:
    b_out = b_in""")

    # A_reg is written inside the k scope (after buffer_A) and read by the m map
    state.add_memlet_path(A_reg,
                          entry_m,
                          compute_tasklet,
                          dst_conn="a_in",
                          memlet=dace.Memlet("A_reg[0]"))
    state.add_memlet_path(B_pipe_in,
                          entry_n0,
                          entry_k,
                          entry_m,
                          compute_tasklet,
                          memlet=dace.Memlet("B_pipe[p]", dynamic=False),
                          dst_conn="b_in")
    state.add_memlet_path(compute_tasklet,
                          exit_m,
                          exit_k,
                          exit_n0,
                          B_pipe_out,
                          memlet=dace.Memlet("B_pipe[p + 1]", dynamic=True),
                          src_conn="b_out")
    state.add_memlet_path(C_buffer_in,
                          entry_k,
                          entry_m,
                          compute_tasklet,
                          dst_conn="c_in",
                          memlet=dace.Memlet("C_buffer[m]"))
    # Empty memlet: places the C_buffer read node inside the n0 scope
    state.add_memlet_path(entry_n0, C_buffer_in, memlet=dace.Memlet())
    state.add_memlet_path(compute_tasklet,
                          exit_m,
                          exit_k,
                          C_buffer_out,
                          memlet=dace.Memlet("C_buffer[m]"),
                          src_conn="c_out")
    # Empty memlet: keeps the C_buffer write node inside the n0 scope
    state.add_memlet_path(C_buffer_out, exit_n0, memlet=dace.Memlet())

    # Write-out stage: for n1 <= p, emit either the value forwarded on
    # C_pipe[p-1] (when p > 0 and n1 > 0) or this PE's own buffer entry.
    write_c_tasklet = state.add_tasklet(
        "write_c", {"buffer_in", "forward_in"}, {"c_out"}, """\
if n1 <= p:
    c_out = forward_in if p > 0 and n1 > 0 else buffer_in""")
    state.add_memlet_path(C_buffer_out,
                          entry_c,
                          write_c_tasklet,
                          memlet=dace.Memlet("C_buffer[m]", dynamic=True),
                          dst_conn="buffer_in")
    state.add_memlet_path(C_pipe_in,
                          entry_n0,
                          entry_c,
                          write_c_tasklet,
                          memlet=dace.Memlet("C_pipe[p-1]", dynamic=True),
                          dst_conn="forward_in")
    state.add_memlet_path(write_c_tasklet,
                          exit_c,
                          exit_n0,
                          C_pipe_out,
                          memlet=dace.Memlet("C_pipe[p]", dynamic=True),
                          src_conn="c_out")

    # Unroll processing elements
    compute_entry, compute_exit = state.add_map(
        "unroll_compute", {"p": "0:P"},
        schedule=dace.ScheduleType.FPGA_Device,
        unroll=True)

    # Bring data nodes into scope
    # Empty memlets tie every pipe access node to the unrolled map's
    # entry/exit so the whole graph above sits inside the ``p`` scope.
    state.add_memlet_path(compute_entry,
                          A_pipe_in,
                          memlet=dace.memlet.Memlet())
    state.add_memlet_path(compute_entry,
                          B_pipe_in,
                          memlet=dace.memlet.Memlet())
    state.add_memlet_path(compute_entry,
                          C_pipe_in,
                          memlet=dace.memlet.Memlet())
    state.add_memlet_path(A_pipe_out,
                          compute_exit,
                          memlet=dace.memlet.Memlet())
    state.add_memlet_path(B_pipe_out,
                          compute_exit,
                          memlet=dace.memlet.Memlet())
    state.add_memlet_path(C_pipe_out,
                          compute_exit,
                          memlet=dace.memlet.Memlet())
Example #25
0
def test_tasklet_map():
    '''
    Test the unrolled map support for M tasklets on N vectors of size W.

    A SystemVerilog tasklet that adds its two inputs is placed inside a map
    over ``k`` in ``0:M``; each map iteration processes row ``k`` of the
    ``[M, N]`` arrays ``A`` and ``B`` and writes row ``k`` of ``C``. The
    result is checked against ``a + b`` computed with NumPy.
    '''
    # add symbols
    # Concrete sizes (n, m, w) are bound to the symbols N, M, W
    n = 512
    m = 8
    w = 4
    N = dace.symbol('N')
    M = dace.symbol('M')
    W = dace.symbol('W')
    N.set(n)
    M.set(m)
    W.set(w)

    # add sdfg
    sdfg = dace.SDFG('rtl_tasklet_map')

    # add state
    state = sdfg.add_state()

    # add arrays
    # Each array element is a W-wide int32 vector
    sdfg.add_array('A', [M, N], dtype=dace.vector(dace.int32, W.get()))
    sdfg.add_array('B', [M, N], dtype=dace.vector(dace.int32, W.get()))
    sdfg.add_array('C', [M, N], dtype=dace.vector(dace.int32, W.get()))

    mentry, mexit = state.add_map('compute_map', {'k': '0:M'})

    # RTL body: two always-blocks latch one word each from the a and b input
    # streams; once both are valid, a third block drives a_data + b_data on the
    # c output stream and, after the output handshake, the inputs are re-armed.
    tasklet = state.add_tasklet(name='rtl_tasklet1',
                                inputs={'a', 'b'},
                                outputs={'c'},
                                code='''
reg [W-1:0][31:0] a_data;
reg a_valid;
reg [W-1:0][31:0] b_data;
reg b_valid;

// Read A
always@(posedge ap_aclk) begin
    if (ap_areset) begin
        s_axis_a_tready <= 0;
        a_valid <= 0;
        a_data <= 0;
    end else begin
        if (s_axis_a_tready && s_axis_a_tvalid) begin
            a_valid <= 1;
            a_data <= s_axis_a_tdata;
            s_axis_a_tready <= 0;
        end else if (m_axis_c_tvalid && m_axis_c_tready) begin
            a_valid <= 0;
            s_axis_a_tready <= 1;
        end else begin
            s_axis_a_tready <= ~a_valid;
        end
    end
end

// Read B
always@(posedge ap_aclk) begin
    if (ap_areset) begin
        s_axis_b_tready <= 0;
        b_valid <= 0;
        b_data <= 0;
    end else begin
        if (s_axis_b_tready && s_axis_b_tvalid) begin
            b_valid <= 1;
            b_data <= s_axis_b_tdata;
            s_axis_b_tready <= 0;
        end else if (m_axis_c_tvalid && m_axis_c_tready) begin
            b_valid <= 0;
            b_data <= 0;
            s_axis_b_tready <= 1;
        end else begin
            s_axis_b_tready <= ~b_valid;
        end
    end
end

// Compute and write C
always@(posedge ap_aclk) begin
    if (ap_areset) begin
        m_axis_c_tvalid <= 0;
        m_axis_c_tdata <= 0;
    end else begin
        if (m_axis_c_tvalid && m_axis_c_tready) begin
            m_axis_c_tvalid <= 0;
        end else if (a_valid && b_valid) begin
            m_axis_c_tvalid <= 1;
            m_axis_c_tdata <= a_data + b_data;
        end
    end
end''',
                                language=dace.Language.SystemVerilog)

    A = state.add_read('A')
    B = state.add_read('B')
    C = state.add_write('C')

    # Route one full row (index k) of each array through the map to the tasklet
    state.add_memlet_path(A, mentry, tasklet, memlet=dace.Memlet('A[k,0:N]'), dst_conn='a')
    state.add_memlet_path(B, mentry, tasklet, memlet=dace.Memlet('B[k,0:N]'), dst_conn='b')
    state.add_memlet_path(tasklet, mexit, C, memlet=dace.Memlet('C[k,0:N]'), src_conn='c')

    # Fix the symbolic sizes before validating
    sdfg.specialize({'M': M, 'N': N, 'W': W})
    sdfg.validate()

    # init data structures
    # The trailing axis of length w holds the lanes of each vector element
    a = np.random.randint(0, 100, m * n * w).reshape((m, n, w)).astype(np.int32)
    b = np.random.randint(0, 100, m * n * w).reshape((m, n, w)).astype(np.int32)
    c = np.zeros((m, n, w)).astype(np.int32)

    # call program
    sdfg(A=a, B=b, C=c)

    # check result
    assert (c == a + b).all()
def make_sdfg(tasklet_code=None,
              name="veclen_copy_conversion",
              dtype=dace.float32,
              veclen=16):
    """Build an SDFG that unpacks vectors into scalars and packs them again.

    The SDFG has three states executed in sequence:

    * ``<name>_pre`` copies the host array ``a`` to ``a_device``.
    * ``<name>`` streams ``a_device`` into ``a_stream``, copies each vector
      into the ``veclen``-element scalar buffer ``a_buffer``, applies the
      tasklet to every lane in an unrolled map, collects the lanes in
      ``b_buffer`` and packs them into ``b_stream``, which is written to
      ``b_device``.
    * ``<name>_post`` copies ``b_device`` back to the host array ``b``.

    :param tasklet_code: Code of the per-lane tasklet, reading ``_in`` and
                         writing ``_out``. Defaults to ``"_out = _in"``.
    :param name: Name of the SDFG; also the prefix of its states and maps.
    :param dtype: Scalar element type, used both for the scalar buffers and
                  as the base type of the vector type.
    :param veclen: Number of scalar lanes per vector element.
    :return: The constructed SDFG (not validated here).
    """

    # Derive the vector type from ``dtype`` so it always matches the scalar
    # buffers declared below (previously hard-coded to float32).
    vtype = dace.vector(dtype, veclen)

    if tasklet_code is None:
        tasklet_code = "_out = _in"

    n = dace.symbol("N")

    sdfg = dace.SDFG(name)

    # The state added first is the pre-state, followed by compute and post
    pre_state = sdfg.add_state(name + "_pre")
    state = sdfg.add_state(name)
    post_state = sdfg.add_state(name + "_post")
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    # Host arrays, plus device-side transient copies of their descriptors
    # placed in FPGA global memory banks 0 (input) and 1 (output)
    _, desc_input_host = sdfg.add_array("a", (n // veclen, ), vtype)
    _, desc_output_host = sdfg.add_array("b", (n // veclen, ), vtype)
    desc_input_device = copy.copy(desc_input_host)
    desc_input_device.storage = dace.StorageType.FPGA_Global
    desc_input_device.location["bank"] = 0
    desc_input_device.transient = True
    desc_output_device = copy.copy(desc_output_host)
    desc_output_device.storage = dace.StorageType.FPGA_Global
    desc_output_device.location["bank"] = 1
    desc_output_device.transient = True
    sdfg.add_datadesc("a_device", desc_input_device)
    sdfg.add_datadesc("b_device", desc_output_device)

    # Host to device
    pre_read = pre_state.add_read("a")
    pre_write = pre_state.add_write("a_device")
    pre_state.add_memlet_path(pre_read,
                              pre_write,
                              memlet=dace.Memlet(pre_write.data, None))

    # Device to host
    post_read = post_state.add_read("b_device")
    post_write = post_state.add_write("b")
    post_state.add_memlet_path(post_read,
                               post_write,
                               memlet=dace.Memlet(post_write.data, None))

    # Compute state
    read_memory = state.add_read("a_device")
    write_memory = state.add_write("b_device")

    # Memory streams
    sdfg.add_stream("a_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    sdfg.add_stream("b_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    produce_input_stream = state.add_write("a_stream")
    consume_input_stream = state.add_read("a_stream")
    produce_output_stream = state.add_write("b_stream")
    # This node only has outgoing edges (b_stream -> b_device), so it is a
    # read of the stream.
    consume_output_stream = state.add_read("b_stream")

    tasklet = state.add_tasklet(name, {"_in"}, {"_out"}, tasklet_code)

    # Iterative map
    entry, exit = state.add_map(name, {
        "i": "0:N//{}".format(veclen),
    },
                                schedule=dace.ScheduleType.FPGA_Device)

    # Unrolled map
    unroll_entry, unroll_exit = state.add_map(
        name + "_unroll", {"u": "0:{}".format(veclen)},
        schedule=dace.ScheduleType.FPGA_Device,
        unroll=True)

    # Container-to-container copies between arrays and streams
    state.add_memlet_path(read_memory,
                          produce_input_stream,
                          memlet=dace.Memlet(read_memory.data))
    state.add_memlet_path(consume_output_stream,
                          write_memory,
                          memlet=dace.Memlet(write_memory.data))

    # Container-to-container copy from vectorized stream to non-vectorized
    # buffer
    sdfg.add_array("a_buffer", (veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    sdfg.add_array("b_buffer", (veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    a_buffer = state.add_access("a_buffer")
    b_buffer = state.add_access("b_buffer")

    # Input stream to buffer
    # One vector from the stream fills all veclen scalar slots of a_buffer
    state.add_memlet_path(consume_input_stream,
                          entry,
                          a_buffer,
                          memlet=dace.Memlet.simple(
                              consume_input_stream.data,
                              "0",
                              other_subset_str="0:{}".format(veclen)))
    # Buffer to tasklet
    state.add_memlet_path(a_buffer,
                          unroll_entry,
                          tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet.simple(a_buffer.data,
                                                    "u",
                                                    num_accesses=1))

    # Tasklet to buffer
    state.add_memlet_path(tasklet,
                          unroll_exit,
                          b_buffer,
                          src_conn="_out",
                          memlet=dace.Memlet.simple(b_buffer.data,
                                                    "u",
                                                    num_accesses=1))

    # Buffer to output stream
    # All veclen scalar slots of b_buffer are packed into one stream element
    state.add_memlet_path(b_buffer,
                          exit,
                          produce_output_stream,
                          memlet=dace.Memlet.simple(
                              produce_output_stream.data,
                              "0",
                              other_subset_str="0:{}".format(veclen),
                              num_accesses=1))

    return sdfg
from __future__ import print_function

import argparse
import dace
import math
import numpy as np

from dace.dtypes import StorageType, Language
from dace.sdfg import SDFG
from dace.memlet import Memlet
from dace.subsets import Indices

# Symbols used in the array shapes below (N / W elements) and as the vector
# width; both are declared strictly positive.
N = dace.symbol("N", positive=True)
W = dace.symbol("W", positive=True)
# Scalar element type and the W-wide vector type built from it
dtype = dace.float32
vtype = dace.vector(dtype, W)
buffer_size = 2048  # Of internal FIFOs


def make_copy_to_device(sdfg):

    pre_state = sdfg.add_state("copy_to_device")

    A_host = pre_state.add_array("A", [N / W], dtype=vtype)

    A_device = pre_state.add_array("A_device", [N / W],
                                   dtype=vtype,
                                   transient=True,
                                   storage=StorageType.FPGA_Global)

    pre_state.add_edge(A_host, None, A_device, None,
Example #28
0
def create_gemm_sdfg(sdfg_name,
                     alpha,
                     beta,
                     A,
                     B,
                     C,
                     dtype,
                     transA=False,
                     transB=False,
                     vec_width=1,
                     expansion_args=None):
    '''
    Build an SDFG that runs a GEMM library node on FPGA device memory.

    The SDFG consists of three states connected in sequence:

    1. ``copy_to_device``: copies ``A``, ``B`` and ``C`` into the transient
       FPGA-global arrays ``A_device``, ``B_device`` and ``C_device``.
    2. ``fpga_state``: a ``blas.Gemm`` node (implementation
       ``FPGA1DSystolic``) reads the three device arrays and writes its
       result to ``C_device``.
    3. ``copy_from_device``: copies ``C_device`` back into ``C``.

    ``A`` is declared with the scalar ``dtype``. ``B`` and ``C`` are declared
    with a vector type of ``vec_width`` elements, so their last dimension is
    ``M / vec_width``.

    :param sdfg_name: name of the created SDFG.
    :param alpha: forwarded to the Gemm node as ``alpha``.
    :param beta: forwarded to the Gemm node as ``beta``.
    :param A: first operand; only ``A.shape`` is read (``N x K``).
    :param B: second operand; only ``B.shape[1]`` is read (``M``).
    :param C: accepted for interface symmetry; not inspected, since its
              extents follow from ``A`` and ``B``.
    :param dtype: scalar element type of the data containers.
    :param transA: forwarded to the Gemm node as ``transA``.
    :param transB: forwarded to the Gemm node as ``transB``.
    :param vec_width: width of the vector type used for ``B`` and ``C``.
    :param expansion_args: if not ``None``, the Gemm node is expanded right
                           away with these keyword arguments.
    :return: the validated SDFG.
    '''
    sdfg = dace.SDFG(sdfg_name)

    # Problem sizes are taken from the operand shapes.
    A_shape = A.shape
    N = A_shape[0]
    K = A_shape[1]
    M = B.shape[1]
    vec_type = dace.vector(dtype, vec_width)

    # Subsets shared by the copy states and the compute state. The last
    # extent of B and C is written exactly like the declared array shape
    # (M / vec_width) so every memlet covers the whole container.
    a_range = f"0:{N}, 0:{K}"
    b_range = f"0:{K}, 0:{M}/{vec_width}"
    c_range = f"0:{N}, 0:{M}/{vec_width}"

    ###########################################################################
    # Copy data to FPGA

    copy_in_state = sdfg.add_state("copy_to_device")

    # Create data containers
    sdfg.add_array('A', A_shape, dtype)
    sdfg.add_array("A_device",
                   shape=A_shape,
                   dtype=dtype,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("B", [K, M / vec_width], dtype=vec_type)
    sdfg.add_array("B_device", [K, M / vec_width],
                   dtype=vec_type,
                   transient=True,
                   storage=dace.dtypes.StorageType.FPGA_Global)

    sdfg.add_array("C", [N, M / vec_width], dtype=vec_type)
    sdfg.add_array("C_device", [N, M / vec_width],
                   dtype=vec_type,
                   transient=True,
                   storage=dace.dtypes.StorageType.FPGA_Global)

    # Copy A
    in_host_A = copy_in_state.add_read("A")
    in_device_A = copy_in_state.add_write("A_device")
    copy_in_state.add_memlet_path(in_host_A,
                                  in_device_A,
                                  memlet=dace.Memlet(f"A[{a_range}]"))

    # Copy B
    in_host_B = copy_in_state.add_read("B")
    in_device_B = copy_in_state.add_write("B_device")
    copy_in_state.add_memlet_path(in_host_B,
                                  in_device_B,
                                  memlet=dace.Memlet(f"B[{b_range}]"))

    # Copy C
    in_host_C = copy_in_state.add_read("C")
    in_device_C = copy_in_state.add_write("C_device")
    copy_in_state.add_memlet_path(in_host_C,
                                  in_device_C,
                                  memlet=dace.Memlet(f"C[{c_range}]"))

    ###########################################################################
    # Copy data from FPGA
    copy_out_state = sdfg.add_state("copy_from_device")

    out_device = copy_out_state.add_read("C_device")
    out_host = copy_out_state.add_write("C")
    copy_out_state.add_memlet_path(out_device,
                                   out_host,
                                   memlet=dace.Memlet(f"C[{c_range}]"))

    ########################################################################
    # FPGA State

    fpga_state = sdfg.add_state("fpga_state")
    in_A = fpga_state.add_read("A_device")
    in_B = fpga_state.add_read("B_device")
    in_C = fpga_state.add_read("C_device")
    # C_device is the destination of the "_c" output, so this access node is
    # created as a write rather than a read.
    out_C = fpga_state.add_write("C_device")

    gemm_node = blas.Gemm("gemm",
                          transA=transA,
                          transB=transB,
                          alpha=alpha,
                          beta=beta)
    gemm_node.implementation = "FPGA1DSystolic"

    fpga_state.add_memlet_path(in_A,
                               gemm_node,
                               dst_conn="_a",
                               memlet=dace.Memlet(f"A_device[{a_range}]"))
    fpga_state.add_memlet_path(in_B,
                               gemm_node,
                               dst_conn="_b",
                               memlet=dace.Memlet(f"B_device[{b_range}]"))
    fpga_state.add_memlet_path(in_C,
                               gemm_node,
                               dst_conn="_cin",
                               memlet=dace.Memlet(f"C_device[{c_range}]"))
    fpga_state.add_memlet_path(gemm_node,
                               out_C,
                               src_conn="_c",
                               memlet=dace.Memlet(f"C_device[{c_range}]"))

    ######################################
    # Interstate edges
    sdfg.add_edge(copy_in_state, fpga_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.add_edge(fpga_state, copy_out_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.validate()

    # Optionally expand the library node immediately, after validation.
    if expansion_args is not None:
        gemm_node.expand(sdfg, fpga_state, **expansion_args)

    return sdfg
Example #29
0
# symbolic problem size
N = dace.symbol('N')

# SDFG under construction and its single state
sdfg = dace.SDFG('fladd')
state = sdfg.add_state('device_state')

# vector width, also registered on the SDFG as the constant VECLEN
veclen = 1
sdfg.add_constant('VECLEN', veclen)

# host-side arrays (CPU heap), one vector element per veclen scalars
for _host_name in ('A', 'B', 'C'):
    sdfg.add_array(_host_name, [N // veclen],
                   dtype=dace.vector(dace.float32, veclen),
                   storage=dace.StorageType.CPU_Heap)

# transient device-side arrays (FPGA global memory)
for _device_name in ('fpga_A', 'fpga_B'):
    sdfg.add_array(_device_name, [N // veclen],
                   dtype=dace.vector(dace.float32, veclen),
                   transient=True,
                   storage=dace.StorageType.FPGA_Global)
Example #30
0
def test_tasklet_vector_add():
    """
        Run an RTL (SystemVerilog) tasklet on vector-typed data.

        The tasklet adds 42 to each of the W lanes of its input; the result
        is compared against the same computation done in NumPy.
    """

    # SDFG with one state; W is a symbol fixed to 4 at compile time
    width_sym = dace.symbol('W')
    sdfg = dace.SDFG('rtl_tasklet_vector_add')
    sdfg.specialize({'W': 4})
    state = sdfg.add_state()

    # concrete vector width, resolved once from the SDFG constants
    veclen = dace.symbolic.evaluate(width_sym, sdfg.constants)

    # single-element arrays whose element is a veclen-wide int32 vector
    for array_name in ('A', 'B'):
        sdfg.add_array(array_name, [1], dtype=dace.vector(dace.int32, veclen))

    # RTL body of the tasklet
    rtl_code = '''
    always@(posedge ap_aclk) begin
        if (ap_areset) begin
            s_axis_a_tready <= 1;
            m_axis_b_tvalid <= 0;
            m_axis_b_tdata <= 0;
        end else if (s_axis_a_tvalid && s_axis_a_tready) begin
            s_axis_a_tready <= 0;
            m_axis_b_tvalid <= 1;
            for (int i = 0; i < W; i++) begin
                m_axis_b_tdata[i] <= s_axis_a_tdata[i] + 42;
            end
        end else if (m_axis_b_tvalid && m_axis_b_tready) begin
            s_axis_a_tready <= 1;
            m_axis_b_tvalid <= 0;
            m_axis_b_tdata <= 0;
        end
    end
        '''
    tasklet = state.add_tasklet(name='rtl_tasklet',
                                inputs={'a'},
                                outputs={'b'},
                                code=rtl_code,
                                language=dace.Language.SystemVerilog)

    # wire A -> tasklet.a and tasklet.b -> B
    src_node = state.add_read('A')
    dst_node = state.add_write('B')
    state.add_edge(src_node, None, tasklet, 'a', dace.Memlet('A[0]'))
    state.add_edge(tasklet, 'b', dst_node, None, dace.Memlet('B[0]'))

    sdfg.validate()

    # host buffers: random input, zeroed output
    a = np.random.randint(0, 100, (veclen, )).astype(np.int32)
    b = np.zeros((veclen, )).astype(np.int32)

    # run the program
    sdfg(A=a, B=b)

    # show both buffers, then verify every lane
    print(a)
    print(b)
    expected = a + 42
    assert (b == expected).all()