def test_notbmap():
    sdfg = dace.SDFG('default_storage_test_1')
    sdfg.add_array('A', [20], dace.float64, dace.StorageType.GPU_Global)
    sdfg.add_transient('tmp', [1], dace.float64)
    state = sdfg.add_state()
    r = state.add_read('A')
    me, mx = state.add_map('kernel', dict(i='0:20'),
                           dace.ScheduleType.GPU_Device)
    tmp = state.add_access('tmp')
    t = state.add_tasklet('add', {'a'}, {'b'}, 'b = a + 1')
    w = state.add_write('A')
    state.add_memlet_path(r, me, tmp, memlet=dace.Memlet.simple('A', 'i'))
    state.add_memlet_path(tmp,
                          t,
                          dst_conn='a',
                          memlet=dace.Memlet.simple('tmp', '0'))
    state.add_memlet_path(t,
                          mx,
                          w,
                          src_conn='b',
                          memlet=dace.Memlet.simple('A', 'i'))

    set_default_schedule_storage_types_and_location(sdfg, None)

    # Without a thread-block map, the GPU transient defaults to registers
    assert sdfg.arrays['tmp'].storage == dace.StorageType.Register
def test_nccl_ring_exchange():
    ng = Config.get('compiler', 'cuda', 'max_number_gpus')
    if ng < 2:
        raise ValueError('This test needs to run with at least 2 GPUs.')
    sdfg: dace.SDFG = nccl_ring_exchange.to_sdfg(strict=True)
    gpu_map = find_map_by_param(sdfg, 'gpu_id')
    gpu_map.schedule = dtypes.ScheduleType.GPU_Multidevice
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.specialize(dict(num_gpus=ng))

    out = sdfg()

    res = np.sum(range(ng))
    assert np.allclose(out, res), f'\nout: {out}\nres: {res}\n'
def test_nccl_reduce():
    ng = Config.get('compiler', 'cuda', 'max_number_gpus')
    n = 15
    sdfg: dace.SDFG = nccl_reduce.to_sdfg(strict=True)
    gpu_map = find_map_by_param(sdfg, 'gpu')
    gpu_map.schedule = dtypes.ScheduleType.GPU_Multidevice
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.specialize(dict(root_device=0, num_gpus=ng))

    out = np.ndarray(shape=n, dtype=np_dtype)
    out.fill(0)
    sdfg(out=out, N=n)

    assert np.unique(out)[0] == sum(range(ng))
def test_batchnorm2d_data_parallelism():
    n, h, w, c = 16, 128, 128, 64
    # n, h, w, c = 16, 4, 4, 8
    ng = 4
    sdfg: dace.SDFG = batchnorm2d_data_parallelism.to_sdfg(strict=True)
    sdfg.name = sdfg.name + '_inline'
    multi_gpu_map = find_map_by_param(sdfg, 'gpu_id')
    multi_gpu_map.schedule = dace.ScheduleType.GPU_Multidevice
    lib_nodes = find_library_nodes(sdfg, Reduce)
    lib_nodes[0].implementation = 'CUDA (device)'
    lib_nodes[1].implementation = 'CUDA (device)'
    # sdfg.apply_transformations(GPUTransformSDFG)
    sdfg.specialize(
        dict(number_of_gpus=ng,
             N=n,
             H=h,
             W=w,
             C=c,
             N_gpu=n // ng,
             NN=np.float32(n)))
    set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([MapFusion])
    sdfg.apply_transformations_repeated([RedundantSecondArray, RedundantArray])
    sdfg.apply_strict_transformations()

    np.random.seed(0)
    X = np.ndarray(shape=[n, h, w, c], dtype=np_dtype)
    X[:] = np.random.rand(n, h, w, c)[:]
    # X = np.arange(n * h * w * c, dtype=np_dtype).reshape([n, h, w, c])
    Z = np.copy(X)

    print('GPU')
    sdfg(X)
    print('GPU done')

    bnsdfg: dace.SDFG = batchnorm2d.to_sdfg()
    lib_nodes = find_library_nodes(bnsdfg, Reduce)
    lib_nodes[0].implementation = 'pure'
    lib_nodes[1].implementation = 'pure'
    print('CPU')
    res = bnsdfg(Z, N=n, H=h, W=w, C=c)
    print('CPU done')

    assert np.allclose(X, res), f'\ndiff: {np.linalg.norm(X - res)}'
def test_nccl_allreduce():
    ng = Config.get('compiler', 'cuda', 'max_number_gpus')
    n = 15
    sdfg: dace.SDFG = nccl_allreduce.to_sdfg(strict=True)
    state = sdfg.start_state
    gpu_map = state.nodes()[0]
    gpu_map.schedule = dtypes.ScheduleType.GPU_Multidevice
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.specialize(dict(num_gpus=ng))

    out = np.ndarray(shape=n, dtype=np_dtype)
    out.fill(0)
    sdfg(out=out, N=n)

    res = sum(range(ng))
    assert np.unique(out)[0] == res
def test_nccl_reduce_symbolic():
    ng = Config.get('compiler', 'cuda', 'max_number_gpus')
    n = 2
    sdfg: dace.SDFG = nccl_reduce_symbolic.to_sdfg(strict=True)
    outer_map = find_map_by_param(sdfg, 'root_gpu')
    if outer_map:
        outer_map.schedule = dtypes.ScheduleType.Sequential
    gpu_map = find_map_by_param(sdfg, 'gpu')
    gpu_map.schedule = dtypes.ScheduleType.GPU_Multidevice
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.specialize(dict(num_gpus=ng))

    out = np.ndarray(shape=[ng, n], dtype=np_dtype)
    out.fill(0)
    sdfg(out=out, N=n)

    res = np.array([ng * i for i in range(ng)])
    assert (np.unique(out) == res).all()
def test_batchnorm2d_model_parallelism():
    # Problem sizes; assumed to match the data-parallelism test above
    n, h, w, c = 16, 128, 128, 64
    ng = 4
    sdfg: dace.SDFG = batchnorm2d_model_parallelism.to_sdfg(strict=True)
    multi_gpu_map = find_map_by_param(sdfg, 'gpu_id')
    multi_gpu_map.schedule = dace.ScheduleType.GPU_Multidevice
    lib_nodes = find_library_nodes(sdfg, Reduce)
    lib_nodes[0].implementation = 'CUDA (device)'
    lib_nodes[1].implementation = 'CUDA (device)'
    sdfg.specialize(
        dict(number_of_gpus=ng,
             N=n,
             H=h,
             W=w,
             C=c,
             C_gpu=c // ng,
             NN=np.float32(n)))
    set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([MapFusion])
    sdfg.apply_transformations_repeated([RedundantSecondArray, RedundantArray])
    sdfg.apply_strict_transformations()

    np.random.seed(0)
    X = np.ndarray(shape=[n, h, w, c], dtype=np_dtype)
    X[:] = np.random.rand(n, h, w, c)[:]
    # X = np.copy(np.arange(size, dtype=np_dtype).reshape(shape))
    Z = np.copy(X)

    print('GPU')
    mean, std, red = sdfg(X)
    print('GPU done')
    # print(f'\rred:\n{repr(red)}\n\nmean:\n{repr(mean)}\n\nstd:\n{repr(std)}\n')

    bnsdfg: dace.SDFG = batchnorm2d.to_sdfg()
    lib_nodes = find_library_nodes(bnsdfg, Reduce)
    lib_nodes[0].implementation = 'pure'
    lib_nodes[1].implementation = 'pure'
    print('CPU')
    res = bnsdfg(Z, N=n, H=h, W=w, C=c)
    print('CPU done')

    assert np.allclose(X, res), f'\ndiff: {np.linalg.norm(X - res)}'
def test_schedule_inference_simple():
    @dace.program
    def nested_call(A: dace.float64[3, 3]):
        return A + 1

    @dace.program
    def simple_schedule_inference(A: dace.float64[3, 3]):
        return nested_call(A)

    sdfg: dace.SDFG = simple_schedule_inference.to_sdfg(strict=False)
    infer_types.infer_connector_types(sdfg)
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.apply_transformations_repeated(StateFusion)

    entry = [
        n for n, _ in sdfg.all_nodes_recursive()
        if isinstance(n, dace.nodes.MapEntry)
    ][0]
    assert entry.schedule is dace.ScheduleType.CPU_Multicore
def test_ndarray_reduce():
    # Problem sizes; assumed module-level constants in the original file,
    # mirroring the batchnorm tests above
    n, h, w, c = 16, 128, 128, 64
    ng = 4
    shape = [n, h, w, c]
    size = n * h * w * c
    sdfg: dace.SDFG = ndarray_reduce_gpu.to_sdfg(strict=True)
    map_entry = find_map(sdfg)
    map_entry.schedule = dace.ScheduleType.GPU_Multidevice
    lib_nodes = find_library_nodes(sdfg, Reduce)
    lib_nodes[0].implementation = 'CUDA (device)'
    sdfg.specialize(dict(number_of_gpus=ng, N=n, H=h, W=w, C=c,
                         C_gpu=c // ng))
    set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([RedundantSecondArray, RedundantArray])
    sdfg.apply_strict_transformations()

    np.random.seed(0)
    X = np.ndarray(shape=shape, dtype=np_dtype)
    X = np.arange(size, dtype=np_dtype).reshape(shape)
    # X[:] = np.random.rand(size)[:]
    Z = np.copy(X)

    print('GPU')
    out = sdfg(X)
    print('GPU done')

    res = np.sum(Z, axis=0)
    assert np.allclose(out, res), f'\nout:\n{repr(out)}\n\nres:\n{repr(res)}\n'
def test_tbmap_sequential():
    sdfg = dace.SDFG('default_storage_test_2')
    sdfg.add_array('A', [20, 32], dace.float64, dace.StorageType.GPU_Global)
    # Transient shape [1, 32] matches the '0:32' and '0, ti' memlets below
    sdfg.add_transient('tmp', [1, 32], dace.float64)
    state = sdfg.add_state()
    r = state.add_read('A')
    ome, omx = state.add_map('kernel', dict(i='0:20'),
                             dace.ScheduleType.GPU_Device)
    sme, smx = state.add_map('seq', dict(j='0:1'),
                             dace.ScheduleType.Sequential)
    ime, imx = state.add_map('block', dict(ti='0:32'),
                             dace.ScheduleType.GPU_ThreadBlock)
    tmp = state.add_access('tmp')
    t = state.add_tasklet('add', {'a'}, {'b'}, 'b = a + 1')
    w = state.add_write('A')
    state.add_memlet_path(r,
                          ome,
                          sme,
                          tmp,
                          memlet=dace.Memlet.simple('A', 'i+j, 0:32'))
    state.add_memlet_path(tmp,
                          ime,
                          t,
                          dst_conn='a',
                          memlet=dace.Memlet.simple('tmp', '0, ti'))
    state.add_memlet_path(t,
                          imx,
                          smx,
                          omx,
                          w,
                          src_conn='b',
                          memlet=dace.Memlet.simple('A', 'i+j, ti'))

    set_default_schedule_storage_types_and_location(sdfg, None)

    # With a thread-block map inside, the transient defaults to shared memory
    assert sdfg.arrays['tmp'].storage == dace.StorageType.GPU_Shared
# parser.add_argument("g", type=int, nargs="?", default=4)
args = vars(parser.parse_args())
ts = args['ts']
n = args['n']
# number_of_gpus = args['g']

sdfg = j_o_un.to_sdfg(strict=False)
gpu_map = find_map_by_param(sdfg, 'rank')
gpu_map.schedule = dace.ScheduleType.GPU_Multidevice
sdfg.apply_strict_transformations()
for _, name, array in sdfg.arrays_recursive():
    if name in ['north_neighbor', 'south_neighbor']:
        array.storage = dace.StorageType.CPU_ThreadLocal
infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
# sdfg.expand_library_nodes()
sdfg.apply_strict_transformations()
sdfg.apply_transformations_repeated(MapFusion)
sdfg.apply_transformations_repeated([RedundantArray, RedundantSecondArray])
sdfg.apply_strict_transformations()
# sdfg.specialize(
#     dict(size=number_of_gpus, lNy=n // number_of_gpus, TSTEPS=ts, N=n))
# ss = sdfg.start_state
# for n in ss.nodes():
#     if isinstance(n, nodes.NestedSDFG):
#         nsn = n
# program_objects = sdfg.generate_code()
# from dace.codegen import compiler
def generate_code(sdfg, validate=True) -> List[CodeObject]:
    """ Generates code as a list of code objects for a given SDFG.

        :param sdfg: The SDFG to use.
        :param validate: If True, validates the SDFG before generating the
                         code.
        :return: List of code objects that correspond to files to compile.
    """
    # Before compiling, validate SDFG correctness
    if validate:
        sdfg.validate()

    if Config.get_bool('testing', 'serialization'):
        from dace.sdfg import SDFG
        import filecmp
        import shutil
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            sdfg.save(f'{tmp_dir}/test.sdfg')
            sdfg2 = SDFG.from_file(f'{tmp_dir}/test.sdfg')
            sdfg2.save(f'{tmp_dir}/test2.sdfg')
            print('Testing SDFG serialization...')
            if not filecmp.cmp(f'{tmp_dir}/test.sdfg',
                               f'{tmp_dir}/test2.sdfg'):
                shutil.move(f"{tmp_dir}/test.sdfg", "test.sdfg")
                shutil.move(f"{tmp_dir}/test2.sdfg", "test2.sdfg")
                raise RuntimeError(
                    'SDFG serialization failed - files do not match')

        # Run with the deserialized version
        # NOTE: This means that all subsequent modifications to `sdfg`
        # are not reflected outside of this function (e.g., library
        # node expansion).
        sdfg = sdfg2

    # Before generating the code, run type inference on the SDFG connectors
    infer_types.infer_connector_types(sdfg)

    # Set default storage/schedule types in SDFG
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)

    # Recursively expand library nodes that have not yet been expanded
    sdfg.expand_library_nodes()

    # After expansion, run another pass of connector/type inference
    infer_types.infer_connector_types(sdfg)
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)

    frame = framecode.DaCeCodeGenerator()

    # Instantiate CPU first (as it is used by the other code generators)
    # TODO: Refactor the parts used by other code generators out of CPU
    default_target = cpu.CPUCodeGen
    for k, v in target.TargetCodeGenerator.extensions().items():
        # If another target has already been registered as CPU, use it instead
        if v['name'] == 'cpu':
            default_target = k
    targets = {'cpu': default_target(frame, sdfg)}

    # Instantiate the rest of the targets
    targets.update({
        v['name']: k(frame, sdfg)
        for k, v in target.TargetCodeGenerator.extensions().items()
        if v['name'] not in targets
    })

    # Instantiate all instrumentation providers in SDFG
    provider_mapping = InstrumentationProvider.get_provider_mapping()
    frame._dispatcher.instrumentation[
        dtypes.InstrumentationType.No_Instrumentation] = None
    for node, _ in sdfg.all_nodes_recursive():
        if hasattr(node, 'instrument'):
            frame._dispatcher.instrumentation[node.instrument] = \
                provider_mapping[node.instrument]
        elif hasattr(node, 'consume'):
            frame._dispatcher.instrumentation[node.consume.instrument] = \
                provider_mapping[node.consume.instrument]
        elif hasattr(node, 'map'):
            frame._dispatcher.instrumentation[node.map.instrument] = \
                provider_mapping[node.map.instrument]
    if sdfg.instrument != dtypes.InstrumentationType.No_Instrumentation:
        frame._dispatcher.instrumentation[sdfg.instrument] = \
            provider_mapping[sdfg.instrument]
    frame._dispatcher.instrumentation = {
        k: v() if v is not None else None
        for k, v in frame._dispatcher.instrumentation.items()
    }

    # Generate frame code (and the rest of the code)
    (global_code, frame_code, used_targets,
     used_environments) = frame.generate_code(sdfg, None)
    target_objects = [
        CodeObject(sdfg.name,
                   global_code + frame_code,
                   'cpp',
                   cpu.CPUCodeGen,
                   'Frame',
                   environments=used_environments,
                   sdfg=sdfg)
    ]

    # Create code objects for each target
    for tgt in used_targets:
        target_objects.extend(tgt.get_generated_codeobjects())

    # add a header file for calling the SDFG
    dummy = CodeObject(sdfg.name,
                       generate_headers(sdfg),
                       'h',
                       cpu.CPUCodeGen,
                       'CallHeader',
                       target_type='../../include',
                       linkable=False)
    target_objects.append(dummy)

    for env in dace.library.get_environments_and_dependencies(
            used_environments):
        if hasattr(env, "codeobjects"):
            target_objects.extend(env.codeobjects)

    # add a dummy main function to show how to call the SDFG
    dummy = CodeObject(sdfg.name + "_main",
                       generate_dummy(sdfg),
                       'cpp',
                       cpu.CPUCodeGen,
                       'SampleMain',
                       target_type='../../sample',
                       linkable=False)
    target_objects.append(dummy)

    return target_objects
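# Usage sketch for generate_code(): the `scale` program below is a
# hypothetical example and not part of this module; in normal use,
# SDFG.compile() invokes this entry point internally.
if __name__ == '__main__':
    import dace

    @dace.program
    def scale(A: dace.float64[10]):
        return A * 2

    example_sdfg = scale.to_sdfg()
    # generate_code returns one CodeObject per file to compile
    # (frame code, call header, sample main, target-specific files)
    for obj in generate_code(example_sdfg):
        print(obj.name)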
def apply(self, sdfg: SDFG) -> None:
    graph: SDFGState = sdfg.nodes()[self.state_id]

    inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[
        GPUMultiTransformMap._map_entry]]

    number_of_gpus = self.number_of_gpus
    ngpus = Config.get("compiler", "cuda", "max_number_gpus")
    if number_of_gpus is None:
        number_of_gpus = ngpus
    if number_of_gpus > ngpus:
        raise ValueError(
            'Requesting more GPUs than specified in the DaCe config')

    # Avoiding import loops
    from dace.transformation.dataflow import (StripMining, InLocalStorage,
                                              OutLocalStorage,
                                              AccumulateTransient)

    # The user is responsible for the implementation of a library node.
    scope_subgraph = graph.scope_subgraph(inner_map_entry)
    for node in scope_subgraph.nodes():
        if isinstance(node, nodes.LibraryNode):
            warnings.warn(
                'Node %s is a library node, make sure to manually set the '
                'implementation to a GPU compliant specialization.' % node)

    # Tile map into number_of_gpus tiles
    outer_map: nodes.Map = StripMining.apply_to(
        sdfg,
        dict(dim_idx=-1,
             new_dim_prefix=self.new_dim_prefix,
             tile_size=number_of_gpus,
             tiling_type=dtypes.TilingType.NumberOfTiles),
        _map_entry=inner_map_entry)

    outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
    inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
    outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

    # Change map schedules
    inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
    outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

    symbolic_gpu_id = outer_map.params[0]

    # Add the parameter of the outer map to nested SDFGs
    for node in graph.successors(inner_map_entry):
        if isinstance(node, nodes.NestedSDFG):
            map_syms = inner_map_entry.range.free_symbols
            for sym in map_syms:
                symname = str(sym)
                if symname not in node.symbol_mapping.keys():
                    node.symbol_mapping[symname] = sym
                    node.sdfg.symbols[symname] = graph.symbols_defined_at(
                        node)[symname]

    # Add transient data leading to the inner map
    prefix = self.new_transient_prefix
    for node in graph.predecessors(outer_map_entry):
        # Only AccessNodes are relevant
        if (isinstance(node, nodes.AccessNode)
                and not (self.skip_scalar
                         and isinstance(node.desc(sdfg), Scalar))):
            if self.use_p2p and node.desc(
                    sdfg).storage is dtypes.StorageType.GPU_Global:
                continue

            in_data_node = InLocalStorage.apply_to(sdfg,
                                                   dict(array=node.data,
                                                        prefix=prefix),
                                                   verify=False,
                                                   save=False,
                                                   node_a=outer_map_entry,
                                                   node_b=inner_map_entry)
            in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
            in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

    wcr_data: Dict[str, Any] = {}
    # Add transient data leading to the outer map
    for edge in graph.in_edges(outer_map_exit):
        node = graph.memlet_path(edge)[-1].dst
        if isinstance(node, nodes.AccessNode):
            data_name = node.data
            # Transients with write-conflict resolution need to be
            # collected first as AccumulateTransient creates a nestedSDFG
            if edge.data.wcr is not None:
                dtype = sdfg.arrays[data_name].dtype
                redtype = operations.detect_reduction_type(edge.data.wcr)
                # Custom reductions cannot have an accumulate transient,
                # as the accumulation from the transient to the outer
                # storage is not defined.
                if redtype == dtypes.ReductionType.Custom:
                    warnings.warn(
                        'Using custom reductions in a GPUMultitransformed '
                        'Map only works for a small data volume. For large '
                        'volumes there is no guarantee.')
                    continue
                identity = dtypes.reduction_identity(dtype, redtype)
                wcr_data[data_name] = identity
            elif (not isinstance(node.desc(sdfg), Scalar)
                  or not self.skip_scalar):
                if self.use_p2p and node.desc(
                        sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue
                # Transients without write-conflict resolution
                if prefix + '_' + data_name in sdfg.arrays:
                    create_array = False
                else:
                    create_array = True
                out_data_node = OutLocalStorage.apply_to(
                    sdfg,
                    dict(array=data_name,
                         prefix=prefix,
                         create_array=create_array),
                    verify=False,
                    save=False,
                    node_a=inner_map_exit,
                    node_b=outer_map_exit)
                out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                out_data_node.desc(
                    sdfg).storage = dtypes.StorageType.GPU_Global

    # Add transients for write-conflict resolution
    if len(wcr_data) != 0:
        nsdfg = AccumulateTransient.apply_to(
            sdfg,
            options=dict(array_identity_dict=wcr_data, prefix=prefix),
            map_exit=inner_map_exit,
            outer_map_exit=outer_map_exit)
        nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
        nsdfg.location['gpu'] = symbolic_gpu_id
        for transient_node in graph.successors(nsdfg):
            if isinstance(transient_node, nodes.AccessNode):
                transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                transient_node.desc(
                    sdfg).storage = dtypes.StorageType.GPU_Global
                nsdfg.sdfg.arrays[
                    transient_node.label].location['gpu'] = symbolic_gpu_id
                nsdfg.sdfg.arrays[
                    transient_node.
                    label].storage = dtypes.StorageType.GPU_Global
        infer_types.set_default_schedule_storage_types_and_location(
            nsdfg.sdfg, dtypes.ScheduleType.GPU_Multidevice,
            symbolic_gpu_id)

    # Remove the parameter of the outer_map from the SDFG symbols,
    # as it was added as a symbol in StripMining.
    if outer_map.params[0] in sdfg.free_symbols:
        sdfg.remove_symbol(outer_map.params[0])
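# Usage sketch for GPUMultiTransformMap: the `vec_add` program and the
# import path below are assumptions for illustration. The transformation
# strip-mines the top-level map across GPUs and inserts per-GPU transients,
# as implemented in apply() above.
import dace
from dace.transformation.dataflow import GPUMultiTransformMap

N = dace.symbol('N')


@dace.program
def vec_add(x: dace.float64[N], y: dace.float64[N]):
    return x + y


multi_gpu_sdfg = vec_add.to_sdfg(strict=True)
multi_gpu_sdfg.apply_transformations(GPUMultiTransformMap)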