def run_nussinov(device_type: dace.dtypes.DeviceType):
    '''
    Runs Nussinov for the given device
    :return: the SDFG
    '''
    # Polybench mini size
    N = 60
    seq, _ = init_data(N)

    if device_type == dace.dtypes.DeviceType.FPGA:
        # Build the SDFG with FPGA-friendly options, then move it onto the device
        sdfg = kernel.to_sdfg(coarsen=True)
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1
        fpga_auto_opt.fpga_global_to_local(sdfg)  # Necessary
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)
        sdfg.specialize(dict(N=N))
        result = sdfg(seq=seq)
    elif device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # NOTE(review): only dataflow coarsening is applied here; unlike the other
        # kernels, auto_optimize is not called — confirm this is intentional.
        sdfg = kernel.to_sdfg()
        sdfg.coarsen_dataflow()
        result = sdfg(seq=seq, N=N)

    # Validate against the reference implementation
    reference = ground_truth(N, seq)
    assert np.allclose(result, reference)
    return sdfg
def run_syrk(device_type: dace.dtypes.DeviceType):
    '''
    Runs Syrk for the given device
    :return: the SDFG
    '''
    # Polybench medium size
    M, N = (200, 240)
    alpha, beta, C, A = init_data(N, M)
    expected_C = np.copy(C)  # keep an untouched copy for validation

    if device_type == dace.dtypes.DeviceType.FPGA:
        # Build an FPGA-friendly SDFG and transform it for the device
        sdfg = kernel.to_sdfg(coarsen=True)
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)
        sdfg.specialize(dict(N=N, M=M))
        # run program
        sdfg(alpha=alpha, beta=beta, C=C, A=A)
    elif device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and auto-optimize it for the target device
        sdfg = kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        sdfg(alpha=alpha, beta=beta, C=C, A=A, M=M, N=N)

    # ground_truth updates expected_C in place; compare with the DaCe result
    ground_truth(N, M, alpha, beta, expected_C, A)
    assert np.allclose(C, expected_C)
    return sdfg
def run_covariance(device_type: dace.dtypes.DeviceType):
    '''
    Runs Covariance for the given device
    :return: the SDFG
    '''
    # Polybench small size
    M, N = (80, 100)
    float_n, data = init_data(M, N)
    data_copy = np.copy(data)  # pristine input for the reference computation

    if device_type == dace.dtypes.DeviceType.FPGA:
        # Build the SDFG without coarsening, then coarsen explicitly
        sdfg = covariance_kernel.to_sdfg(coarsen=False)
        sdfg.coarsen_dataflow()
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1
        sdfg.apply_transformations([InlineSDFG])

        # Use the FPGA expansion for library nodes, then expand them so that
        # further transformations can see inside
        Gemv.default_implementation = "FPGA_Accumulate"
        sdfg.expand_library_nodes()
        sdfg.apply_transformations([InlineSDFG])

        # Remaining FPGA auto-optimizations
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Fix the symbolic sizes and execute
        sdfg.specialize(dict(N=N, M=M))
        result = sdfg(float_n=float_n, data=data)
    elif device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and auto-optimize it for the target device
        sdfg = covariance_kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        result = sdfg(float_n=float_n, data=data, M=M, N=N)

    # Validate against the reference implementation
    expected = ground_truth(M, N, float_n, data_copy)
    assert np.allclose(expected, result)
    return sdfg
def run_atax(device_type: dace.dtypes.DeviceType):
    '''
    Runs ATAX for the given device
    :return: the SDFG
    '''
    # Initialize data (polybench medium size)
    M, N = (390, 410)
    # Fix: the third value returned by init_data was previously bound to
    # `y_ref`, which was then silently overwritten by `kernel.f(A, x)` at the
    # bottom of this function — the first binding was dead code. Discard it.
    A, x, _ = init_data(M, N)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and auto-optimize it for the target device
        sdfg = kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        # NOTE(review): A and x are passed positionally here, whereas every
        # other runner in this file uses keyword arguments — confirm the SDFG
        # call accepts positional arguments in this order.
        y = sdfg(A, x, M=M, N=N)
    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse SDFG and apply FPGA friendly optimization
        sdfg = kernel.to_sdfg(simplify=True)
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        # Use FPGA Expansion for lib nodes, and expand them to enable further optimizations
        from dace.libraries.blas import Gemv
        Gemv.default_implementation = "FPGA_Accumulate"
        sdfg.expand_library_nodes()
        sm_applied = sdfg.apply_transformations_repeated(
            [InlineSDFG, StreamingMemory], [{}, {
                'storage': dace.StorageType.FPGA_Local
            }],
            print_report=True)
        assert sm_applied == 6  # 3 inlines and 3 Streaming memories

        ###########################
        # FPGA Auto Opt
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # specialize the SDFG (needed by the GEMV expansion)
        sdfg.specialize(dict(M=M, N=N))
        y = sdfg(A, x)

    # Compute ground truth and validate result
    y_ref = kernel.f(A, x)
    assert np.allclose(y, y_ref)
    return sdfg
def run_lu(device_type: dace.dtypes.DeviceType):
    '''
    Runs LU for the given device
    :return: the SDFG
    '''
    # Polybench mini size
    N = 40
    A = init_data(N)
    reference_A = np.copy(A)  # untouched copy for the reference computation

    if device_type == dace.dtypes.DeviceType.FPGA:
        # Build an FPGA-friendly SDFG and transform it for the device
        sdfg = lu_kernel.to_sdfg(coarsen=True)
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        # Pick the Dot expansion appropriate for the configured FPGA vendor,
        # then expand library nodes so later passes can optimize them
        from dace.libraries.blas import Dot
        platform = dace.config.Config.get("compiler", "fpga", "vendor")
        if platform == "intel_fpga":
            Dot.default_implementation = "FPGA_Accumulate"
        else:
            Dot.default_implementation = "FPGA_PartialSums"
        sdfg.expand_library_nodes()
        sdfg.apply_transformations_repeated([InlineSDFG])

        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)
        fpga_auto_opt.fpga_global_to_local(sdfg)

        sdfg.specialize(dict(N=N))
        result = sdfg(A=A)
    elif device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # NOTE(review): no auto_optimize call here, unlike the other CPU/GPU
        # paths in this file — confirm this is intentional.
        sdfg = lu_kernel.to_sdfg()
        result = sdfg(A=A, N=N)

    # ground_truth factorizes reference_A in place; validate via relative error
    ground_truth(N, reference_A)
    rel_error = np.linalg.norm(reference_A - A) / np.linalg.norm(reference_A)
    assert rel_error < 1e-5
    return sdfg
def run_floyd_warshall(device_type: dace.dtypes.DeviceType):
    '''
    Runs Floyd Warshall for the given device
    :return: the SDFG
    '''
    # Polybench mini size
    N = 60
    path = init_data(N)
    reference_path = np.copy(path)  # pristine copy for the reference run

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and auto-optimize it for the target device
        sdfg = kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        sdfg(path=path, N=N)
    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = kernel.to_sdfg(simplify=True)
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        # Turn local buffers into streams where possible
        streaming_applied = sdfg.apply_transformations_repeated(
            [InlineSDFG, StreamingMemory], [{}, {
                'storage': dace.StorageType.FPGA_Local
            }],
            print_report=True)
        assert streaming_applied == 1

        # Compose producer/consumer pairs through streams (permissive mode)
        composition_applied = sdfg.apply_transformations_repeated(
            [InlineSDFG, StreamingComposition], [{}, {
                'storage': dace.StorageType.FPGA_Local
            }],
            print_report=True,
            permissive=True)
        assert composition_applied == 1

        # Prune connectors left dangling after Streaming Composition
        pruned = sdfg.apply_transformations_repeated(
            PruneConnectors, options=[{
                'remove_unused_containers': True
            }])
        assert pruned == 1

        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # The top-level state must run on the host rather than as an FPGA
        # kernel, so mark it explicitly
        sdfg.states()[0].location["is_FPGA_kernel"] = False

        # Both the top-level SDFG and the nested SDFG need specialization
        sdfg.specialize(dict(N=N))
        sdfg.states()[0].nodes()[0].sdfg.specialize(dict(N=N))

        # run program
        sdfg(path=path)

    # ground_truth updates reference_path in place; compare with DaCe's result
    ground_truth(reference_path, N)
    assert np.allclose(path, reference_path)
    return sdfg