Example #1
def run_nussinov(device_type: dace.dtypes.DeviceType):
    '''
    Runs Nussinov for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench mini size)
    N = 60
    seq, table = init_data(N)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and coarsen its dataflow
        sdfg = kernel.to_sdfg()
        sdfg.coarsen_dataflow()
        dace_res = sdfg(seq=seq, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = kernel.to_sdfg(coarsen=True)
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        fpga_auto_opt.fpga_global_to_local(sdfg)  # Necessary: move qualifying global containers to FPGA local memory
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        sdfg.specialize(dict(N=N))
        dace_res = sdfg(seq=seq)

    # Compute ground truth and validate result
    gt_res = ground_truth(N, seq)

    assert np.allclose(dace_res, gt_res)
    return sdfg
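
These runner excerpts rely on module-level imports that are not shown. A plausible sketch of those imports, with module paths as found in recent DaCe releases (treat the exact locations as assumptions rather than a verified header):

import numpy as np
import dace
# Interstate/dataflow transformations used in the FPGA code paths
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.transformation.dataflow import (StreamingMemory, StreamingComposition,
                                          PruneConnectors)
# CPU/GPU auto-optimization pass and FPGA auto-optimization helpers
from dace.transformation.auto.auto_optimize import auto_optimize
import dace.transformation.auto.fpga as fpga_auto_opt
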
Example #2
def run_syrk(device_type: dace.dtypes.DeviceType):
    '''
    Runs Syrk for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench medium size)
    M, N = (200, 240)
    alpha, beta, C, A = init_data(N, M)
    gt_C = np.copy(C)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-opt
        sdfg = kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        sdfg(alpha=alpha, beta=beta, C=C, A=A, M=M, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = kernel.to_sdfg(coarsen=True)
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)
        sdfg.specialize(dict(N=N, M=M))
        # run program
        sdfg(alpha=alpha, beta=beta, C=C, A=A)

    # Compute ground truth and validate result
    ground_truth(N, M, alpha, beta, gt_C, A)
    assert np.allclose(C, gt_C)
    return sdfg
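
For context, the kernel parsed above implements the symmetric rank-k update C = alpha * A @ A.T + beta * C on the lower triangle. A hedged sketch of such an NPBench-style dace.program (symbol and argument names are assumptions, not the verified source):

M = dace.symbol('M')
N = dace.symbol('N')

@dace.program
def kernel(alpha: dace.float64, beta: dace.float64,
           C: dace.float64[N, N], A: dace.float64[N, M]):
    for i in range(N):
        # Scale the lower-triangular part of row i, then accumulate the update
        C[i, :i + 1] *= beta
        for k in range(M):
            C[i, :i + 1] += alpha * A[i, k] * A[:i + 1, k]
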
Example #3
def run_covariance(device_type: dace.dtypes.DeviceType):
    '''
    Runs Covariance for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench small size)
    M, N = (80, 100)
    float_n, data = init_data(M, N)

    gt_data = np.copy(data)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-opt
        sdfg = covariance_kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        dace_res = sdfg(float_n=float_n, data=data, M=M, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = covariance_kernel.to_sdfg(coarsen=False)
        sdfg.coarsen_dataflow()
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        sdfg.apply_transformations([InlineSDFG])

        # Use FPGA expansions for library nodes, and expand them to enable further optimizations
        # Reduce.default_implementation = "FPGAPartialReduction"
        Gemv.default_implementation = "FPGA_Accumulate"

        sdfg.expand_library_nodes()
        sdfg.apply_transformations([InlineSDFG])

        # Other FPGA auto opt
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Specialize the SDFG
        sdfg.specialize(dict(N=N, M=M))

        # run program
        dace_res = sdfg(float_n=float_n, data=data)

    # Compute ground truth and validate result
    gt_res = ground_truth(M, N, float_n, gt_data)
    assert np.allclose(gt_res, dace_res)
    return sdfg
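
The validation compares the SDFG output against a NumPy reference. A sketch of a ground_truth helper that follows the polybench covariance definition and matches the call ground_truth(M, N, float_n, gt_data) above (the real helper may differ in details):

def ground_truth(M, N, float_n, data):
    # data has shape (N, M): N observations of M variables
    mean = np.mean(data, axis=0)
    data -= mean
    cov = np.zeros((M, M), dtype=data.dtype)
    for i in range(M):
        # Fill the upper triangle and mirror it to keep cov symmetric
        cov[i, i:M] = data[:, i] @ data[:, i:M] / (float_n - 1.0)
        cov[i:M, i] = cov[i, i:M]
    return cov
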
Example #4
def run_atax(device_type: dace.dtypes.DeviceType):
    '''
    Runs ATAX for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench medium size)
    M, N = (390, 410)
    A, x, y_ref = init_data(M, N)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-opt
        sdfg = kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        y = sdfg(A, x, M=M, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = kernel.to_sdfg(simplify=True)
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        # Use FPGA expansions for library nodes, and expand them to enable further optimizations
        from dace.libraries.blas import Gemv
        Gemv.default_implementation = "FPGA_Accumulate"
        sdfg.expand_library_nodes()
        sm_applied = sdfg.apply_transformations_repeated(
            [InlineSDFG, StreamingMemory],
            [{}, {
                'storage': dace.StorageType.FPGA_Local
            }],
            print_report=True)
        assert sm_applied == 6  # 3 InlineSDFG and 3 StreamingMemory applications

        ###########################
        # FPGA Auto Opt
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # specialize the SDFG (needed by the GEMV expansion)
        sdfg.specialize(dict(M=M, N=N))
        y = sdfg(A, x)

    # Compute ground truth and validate result
    y_ref = kernel.f(A, x)
    assert np.allclose(y, y_ref)
    return sdfg
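
ATAX computes y = A.T @ (A @ x); the final check calls kernel.f, i.e. the undecorated Python function, as the reference. A hedged sketch of the dace.program being parsed (names are assumptions):

M = dace.symbol('M')
N = dace.symbol('N')

@dace.program
def kernel(A: dace.float64[M, N], x: dace.float64[N]):
    # Two matrix-vector products; DaCe lowers them to GEMV library nodes,
    # which is why the FPGA path sets Gemv.default_implementation above.
    return (A @ x) @ A
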
Example #5
def run_lu(device_type: dace.dtypes.DeviceType):
    '''
    Runs LU for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench mini size)
    N = 40
    A = init_data(N)
    gt_A = np.copy(A)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and run it as-is
        sdfg = lu_kernel.to_sdfg()
        dace_res = sdfg(A=A, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = lu_kernel.to_sdfg(coarsen=True)

        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        # Use FPGA expansions for library nodes, and expand them to enable further optimizations
        from dace.libraries.blas import Dot
        platform = dace.config.Config.get("compiler", "fpga", "vendor")
        if platform == "intel_fpga":
            Dot.default_implementation = "FPGA_Accumulate"
        else:
            Dot.default_implementation = "FPGA_PartialSums"

        sdfg.expand_library_nodes()
        sdfg.apply_transformations_repeated([InlineSDFG])

        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)
        fpga_auto_opt.fpga_global_to_local(sdfg)

        sdfg.specialize(dict(N=N))
        dace_res = sdfg(A=A)

    # Compute ground truth and validate result
    ground_truth(N, gt_A)
    diff = np.linalg.norm(gt_A - A) / np.linalg.norm(gt_A)
    assert diff < 1e-5
    return sdfg
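
The LU check validates A in place against a reference factorization. A sketch of an in-place LU without pivoting in the polybench style, matching the call ground_truth(N, gt_A) above (the actual helper may be written differently):

def ground_truth(N, A):
    # In-place LU without pivoting: L (unit diagonal) below the diagonal,
    # U on and above the diagonal.
    for i in range(N):
        for j in range(i):
            A[i, j] -= A[i, :j] @ A[:j, j]
            A[i, j] /= A[j, j]
        for j in range(i, N):
            A[i, j] -= A[i, :i] @ A[:i, j]
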
Example #6
def run_floyd_warshall(device_type: dace.dtypes.DeviceType):
    '''
    Runs Floyd Warshall for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench mini size)
    N = 60
    path = init_data(N)
    gt_path = np.copy(path)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-opt
        sdfg = kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        sdfg(path=path, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = kernel.to_sdfg(simplify=True)
        # sdfg.apply_transformations_repeated([MapFusion])
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        sm_applied = sdfg.apply_transformations_repeated(
            [InlineSDFG, StreamingMemory],
            [{}, {
                'storage': dace.StorageType.FPGA_Local
            }],
            print_report=True)
        assert sm_applied == 1
        sc_applied = sdfg.apply_transformations_repeated(
            [InlineSDFG, StreamingComposition],
            [{}, {
                'storage': dace.StorageType.FPGA_Local
            }],
            print_report=True,
            permissive=True)
        assert sc_applied == 1

        # Prune connectors after Streaming Composition
        pruned_conns = sdfg.apply_transformations_repeated(
            PruneConnectors, options=[{
                'remove_unused_containers': True
            }])

        assert pruned_conns == 1

        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # In this case, we want to generate the top-level state as a host-based state,
        # not as an FPGA kernel, so we need to indicate that explicitly
        sdfg.states()[0].location["is_FPGA_kernel"] = False
        # we need to specialize both the top-level SDFG and the nested SDFG
        sdfg.specialize(dict(N=N))
        sdfg.states()[0].nodes()[0].sdfg.specialize(dict(N=N))
        # run program
        sdfg(path=path)

    # Compute ground truth and validate result
    ground_truth(gt_path, N)
    assert np.allclose(path, gt_path)
    return sdfg
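
The final comparison uses the classic all-pairs shortest-path relaxation as the reference. A vectorized NumPy sketch consistent with the call ground_truth(gt_path, N) above (the actual helper may use explicit loops instead):

def ground_truth(path, N):
    for k in range(N):
        # Relax every pair (i, j) through intermediate vertex k
        path[:] = np.minimum(path, np.add.outer(path[:, k], path[k, :]))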