def test_rr_interleave():
    """ Tests RR interleaving of containers to memory banks """
    @dace.program
    def rr_interleave(A: dace.float32[8], B: dace.float32[8], C: dace.float32[8]):
        return A + B + C

    # Three random single-precision input vectors of length 8.
    A, B, C = (np.random.rand(8).astype(np.float32) for _ in range(3))

    sdfg = rr_interleave.to_sdfg()
    sdfg.apply_transformations([FPGATransformSDFG])

    # Specifically run the interleave transformation.
    allocated = fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

    # There will be 5 arrays (one is a temporary containing A + B).
    assert allocated == [2, 1, 1, 1]

    result = sdfg(A=A, B=B, C=C)
    assert np.allclose(A + B + C, result)

    return sdfg
def test_mem_buffer_bicg():
    A = np.random.rand(N, M).astype(np.float32)
    p = np.random.rand(M).astype(np.float32)
    r = np.random.rand(M).astype(np.float32)

    # Parse SDFG and apply FPGA friendly optimization
    sdfg = bicg.to_sdfg(strict=True)
    applied = sdfg.apply_transformations([FPGATransformSDFG])
    assert applied == 1

    fpga_rr_interleave_containers_to_banks(sdfg, num_banks=4)

    # Use FPGA Expansion for lib nodes, and expand them to enable
    # further optimizations.
    from dace.libraries.blas import Gemv
    Gemv.default_implementation = "FPGA_Accumulate"
    sdfg.expand_library_nodes()

    # First pass: streaming memories with memory buffering enabled.
    buffered_opts = {
        'storage': dace.StorageType.FPGA_Local,
        'use_memory_buffering': True
    }
    sm_applied = sdfg.apply_transformations_repeated(
        [InlineSDFG, sm.StreamingMemory], [{}, buffered_opts],
        print_report=True)
    assert sm_applied == 7  # 3 inlines and 4 Streaming memories

    # Second pass: remaining streaming memories without buffering.
    unbuffered_opts = {
        'storage': dace.StorageType.FPGA_Local,
        'use_memory_buffering': False
    }
    sm_applied = sdfg.apply_transformations_repeated(
        [InlineSDFG, sm.StreamingMemory], [{}, unbuffered_opts],
        print_report=True)
    assert sm_applied == 1  # 1 Streaming memories

    # specialize the SDFG (needed by the GEMV expansion)
    sdfg.specialize(dict(M=M, N=N))

    res0, res1 = sdfg(A=A, p=p, r=r)

    # Compute ground truth and Validate result
    ref0, ref1 = bicg.f(A, p, r)
    assert np.allclose(ref0, res0)
    assert np.allclose(res1, ref1)

    return sdfg
def auto_optimize(sdfg: SDFG, device: dtypes.DeviceType, validate: bool = True, validate_all: bool = False) -> SDFG: """ Runs a basic sequence of transformations to optimize a given SDFG to decent performance. In particular, performs the following: * Strict transformations * Strict auto-parallelization (loop-to-map) * Greedy application of SubgraphFusion * Tiled write-conflict resolution (MapTiling -> AccumulateTransient) * Tiled stream accumulation (MapTiling -> AccumulateTransient) * Collapse all maps to parallelize across all dimensions * Set all library nodes to expand to ``fast`` expansion, which calls the fastest library on the target device :param sdfg: The SDFG to optimize. :param device: the device to optimize for. :param validate: If True, validates the SDFG after all transformations have been applied. :param validate_all: If True, validates the SDFG after every step. :return: The optimized SDFG. :note: Operates in-place on the given SDFG. :note: This function is still experimental and may harm correctness in certain cases. Please report an issue if it does. 
""" # Strict transformations and loop parallelization transformed = True while transformed: sdfg.apply_strict_transformations(validate=False, validate_all=validate_all) xfh.split_interstate_edges(sdfg) # Try to parallelize loops l2ms = sdfg.apply_transformations_repeated(LoopToMap, strict=True, validate=False, validate_all=validate_all) transformed = l2ms > 0 # Map fusion greedy_fuse(sdfg, validate_all) if device == dtypes.DeviceType.FPGA: # apply FPGA Transformations sdfg.apply_fpga_transformations() fpga_aopt.fpga_global_to_local(sdfg) fpga_aopt.fpga_rr_interleave_containers_to_banks(sdfg) # Set all library nodes to expand to fast library calls set_fast_implementations(sdfg, device) return sdfg # Tiled WCR and streams for nsdfg in list(sdfg.all_sdfgs_recursive()): tile_wcrs(nsdfg, validate_all) # Collapse maps sdfg.apply_transformations_repeated(MapCollapse, strict=True, validate=False, validate_all=validate_all) for node, _ in sdfg.all_nodes_recursive(): if isinstance(node, nodes.MapEntry): node.map.collapse = len(node.map.range) # Set all library nodes to expand to fast library calls set_fast_implementations(sdfg, device) # TODO(later): Safe vectorization # Disable OpenMP parallel sections # TODO(later): Set on a per-SDFG basis config.Config.set('compiler', 'cpu', 'openmp_sections', value=False) # Set all Default storage types that are constant sized to registers move_small_arrays_to_stack(sdfg) # Validate at the end if validate or validate_all: sdfg.validate() return sdfg
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False,
                  symbols: Dict[str, int] = None) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to decent
    performance. In particular, performs the following:

        * Simplify
        * Auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device

    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :param symbols: Optional dict that maps symbols (str/symbolic) to int/float
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    debugprint = config.Config.get_bool('debugprint')

    # Simplification and loop parallelization. Fixed point: keep alternating
    # simplification and loop-to-map until no more loops can be converted.
    transformed = True
    sdfg.apply_transformations_repeated(TrivialMapElimination,
                                        validate=validate,
                                        validate_all=validate_all)
    while transformed:
        sdfg.simplify(validate=False, validate_all=validate_all)
        for s in sdfg.sdfg_list:
            xfh.split_interstate_edges(s)
        l2ms = sdfg.apply_transformations_repeated(
            (LoopToMap, RefineNestedAccess),
            validate=False,
            validate_all=validate_all)
        transformed = l2ms > 0

    # Collapse maps and eliminate trivial dimensions
    sdfg.simplify()
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)

    # Apply GPU transformations and set library node implementations
    if device == dtypes.DeviceType.GPU:
        sdfg.apply_gpu_transformations()
        sdfg.simplify()

    # fuse subgraphs greedily
    sdfg.simplify()
    greedy_fuse(sdfg, device=device, validate_all=validate_all)

    # fuse stencils greedily
    greedy_fuse(sdfg,
                device=device,
                validate_all=validate_all,
                recursive=False,
                stencil=True)

    if device == dtypes.DeviceType.FPGA:
        # apply FPGA Transformations
        sdfg.apply_fpga_transformations()
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        # NOTE: FPGA path returns early — the steps below do not apply.
        return sdfg

    # Tiled WCR and streams, applied per nested SDFG
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        # Set OMP collapse property to map length
        if isinstance(node, nodes.MapEntry):
            # FORNOW: Leave out
            # node.map.collapse = len(node.map.range)
            pass

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)
    sdfg.expand_library_nodes()

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections on a per-SDFG basis
    for nsdfg in sdfg.all_sdfgs_recursive():
        nsdfg.openmp_sections = False

    if symbols:
        # Specialize for all known symbols.
        # BUGFIX(review): the original built ``known_symbols`` with a dict
        # comprehension and then immediately overwrote it with ``{}`` — the
        # comprehension was dead code and has been removed.
        known_symbols = {}
        for (s, v) in symbols.items():
            if s not in sdfg.free_symbols:
                continue
            # Plain Python numbers are taken as-is.
            if isinstance(v, (int, float)):
                known_symbols[s] = v
            # SymPy integers are converted to Python ints when possible.
            if isinstance(v, sympy.core.numbers.Integer):
                try:
                    known_symbols[s] = int(v)
                except TypeError:
                    pass

        if debugprint and len(known_symbols) > 0:
            print("Specializing the SDFG for symbols", known_symbols)
        sdfg.specialize(known_symbols)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    # Fix storage and allocation properties, e.g., for benchmarking purposes
    # FORNOW: Leave out
    # make_transients_persistent(sdfg, device)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg