Example #1
def test_notbmap():
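    # Build a GPU_Device map with no thread-block map inside it; the
    # one-element transient between map entry and tasklet should then
    # default to Register storage (asserted below).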
    sdfg = dace.SDFG('default_storage_test_1')
    sdfg.add_array('A', [20], dace.float64, dace.StorageType.GPU_Global)
    sdfg.add_transient('tmp', [1], dace.float64)
    state = sdfg.add_state()

    r = state.add_read('A')
    me, mx = state.add_map('kernel', dict(i='0:20'),
                           dace.ScheduleType.GPU_Device)
    tmp = state.add_access('tmp')
    t = state.add_tasklet('add', {'a'}, {'b'}, 'b = a + 1')
    w = state.add_write('A')

    state.add_memlet_path(r, me, tmp, memlet=dace.Memlet.simple('A', 'i'))
    state.add_memlet_path(tmp,
                          t,
                          dst_conn='a',
                          memlet=dace.Memlet.simple('tmp', '0'))
    state.add_memlet_path(t,
                          mx,
                          w,
                          src_conn='b',
                          memlet=dace.Memlet.simple('A', 'i'))

    set_default_schedule_storage_types_and_location(sdfg, None)
    assert sdfg.arrays['tmp'].storage == dace.StorageType.Register
Example #2
def test_nccl_ring_exchange():
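    # Ring exchange across ng GPUs: every device accumulates the indices of
    # all devices, so each output element should equal np.sum(range(ng)).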
    ng = Config.get('compiler', 'cuda', 'max_number_gpus')
    if ng < 2:
        raise ValueError('This test needs to run with at least 2 GPUs.')
    sdfg: dace.SDFG = nccl_ring_exchange.to_sdfg(strict=True)
    gpu_map = find_map_by_param(sdfg, 'gpu_id')
    gpu_map.schedule = dtypes.ScheduleType.GPU_Multidevice
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.specialize(dict(num_gpus=ng))

    out = sdfg()
    res = np.sum(range(ng))

    assert np.allclose(out, res), f'\nout: {out}\nres: {res}\n'
Example #3
def test_nccl_reduce():
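    # NCCL reduce to a root device: each GPU contributes its device index,
    # so the reduced result is sum(range(ng)) (checked below).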
    ng = Config.get('compiler', 'cuda', 'max_number_gpus')
    n = 15
    sdfg: dace.SDFG = nccl_reduce.to_sdfg(strict=True)
    gpu_map = find_map_by_param(sdfg, 'gpu')
    gpu_map.schedule = dtypes.ScheduleType.GPU_Multidevice
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.specialize(dict(root_device=0, num_gpus=ng))

    out = np.ndarray(shape=n, dtype=np_dtype)
    out.fill(0)

    sdfg(out=out, N=n)

    assert np.unique(out)[0] == sum(range(ng))
Example #4
def test_batchnorm2d_data_parallelism():
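    # Data parallelism: the batch dimension N is split across ng GPUs
    # (N_gpu = n // ng) and checked against a pure-CPU batchnorm reference.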
    n, h, w, c = 16, 128, 128, 64
    # n, h, w, c = 16, 4, 4, 8
    ng = 4

    sdfg: dace.SDFG = batchnorm2d_data_parallelism.to_sdfg(strict=True)
    sdfg.name = sdfg.name + '_inline'
    multi_gpu_map = find_map_by_param(sdfg, 'gpu_id')
    multi_gpu_map.schedule = dace.ScheduleType.GPU_Multidevice
    lib_nodes = find_library_nodes(sdfg, Reduce)
    lib_nodes[0].implementation = 'CUDA (device)'
    lib_nodes[1].implementation = 'CUDA (device)'
    # sdfg.apply_transformations(GPUTransformSDFG)
    sdfg.specialize(
        dict(number_of_gpus=ng,
             N=n,
             H=h,
             W=w,
             C=c,
             N_gpu=n // ng,
             NN=np.float32(n)))
    set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([MapFusion])
    sdfg.apply_transformations_repeated([RedundantSecondArray, RedundantArray])
    sdfg.apply_strict_transformations()

    np.random.seed(0)
    X = np.ndarray(shape=[n, h, w, c], dtype=np_dtype)
    X[:] = np.random.rand(n, h, w, c)[:]
    # X = np.arange(n * h * w * c, dtype=np_dtype).reshape([n, h, w, c])
    Z = np.copy(X)

    print('GPU')
    sdfg(X)
    print('GPU done')

    bnsdfg: dace.SDFG = batchnorm2d.to_sdfg()
    lib_nodes = find_library_nodes(bnsdfg, Reduce)
    lib_nodes[0].implementation = 'pure'
    lib_nodes[1].implementation = 'pure'

    print('CPU')
    res = bnsdfg(Z, N=n, H=h, W=w, C=c)
    print('CPU done')
    assert np.allclose(X, res), f'\ndiff: {np.linalg.norm(X-res)}'
Example #5
def test_nccl_allreduce():
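    # NCCL all-reduce: afterwards every element of out should equal
    # sum(range(ng)).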
    ng = Config.get('compiler', 'cuda', 'max_number_gpus')
    n = 15
    sdfg: dace.SDFG = nccl_allreduce.to_sdfg(strict=True)
    state = sdfg.start_state
    gpu_map = state.nodes()[0]
    gpu_map.schedule = dtypes.ScheduleType.GPU_Multidevice
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.specialize(dict(num_gpus=ng))

    out = np.ndarray(shape=n, dtype=np_dtype)
    out.fill(0)

    sdfg(out=out, N=n)

    res = sum(range(ng))
    assert np.unique(out)[0] == res
Example #6
def test_nccl_reduce_symbolic():
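    # Symbolic-root reduce: the outer map over root_gpu runs sequentially
    # while the inner map is distributed across the devices.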
    ng = Config.get('compiler', 'cuda', 'max_number_gpus')
    n = 2
    sdfg: dace.SDFG = nccl_reduce_symbolic.to_sdfg(strict=True)
    outer_map = find_map_by_param(sdfg, 'root_gpu')
    if outer_map:
        outer_map.schedule = dtypes.ScheduleType.Sequential
    gpu_map = find_map_by_param(sdfg, 'gpu')
    gpu_map.schedule = dtypes.ScheduleType.GPU_Multidevice
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.specialize(dict(num_gpus=ng))

    out = np.ndarray(shape=[ng, n], dtype=np_dtype)
    out.fill(0)

    sdfg(out=out, N=n)

    res = np.array([ng * i for i in range(ng)])
    assert (np.unique(out) == res).all()
Example #7
def test_batchnorm2d_model_parallelism():
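    # Model parallelism: the channel dimension C is split across GPUs
    # (C_gpu = c // ng). n, h, w, c and ng are not defined here; presumably
    # they are module-level constants in the original test file.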
    sdfg: dace.SDFG = batchnorm2d_model_parallelism.to_sdfg(strict=True)
    multi_gpu_map = find_map_by_param(sdfg, 'gpu_id')
    multi_gpu_map.schedule = dace.ScheduleType.GPU_Multidevice
    lib_nodes = find_library_nodes(sdfg, Reduce)
    lib_nodes[0].implementation = 'CUDA (device)'
    lib_nodes[1].implementation = 'CUDA (device)'

    sdfg.specialize(
        dict(number_of_gpus=ng,
             N=n,
             H=h,
             W=w,
             C=c,
             C_gpu=c // ng,
             NN=np.float32(n)))
    set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([MapFusion])
    sdfg.apply_transformations_repeated([RedundantSecondArray, RedundantArray])
    sdfg.apply_strict_transformations()

    np.random.seed(0)
    X = np.ndarray(shape=[n, h, w, c], dtype=np_dtype)
    X[:] = np.random.rand(n, h, w, c)[:]
    # X = np.copy(np.arange(size, dtype=np_dtype).reshape(shape))
    Z = np.copy(X)

    print('GPU')
    mean, std, red = sdfg(X)
    print('GPU done')
    # print(f'\rred:\n{repr(red)}\n\nmean:\n{repr(mean)}\n\nstd:\n{repr(std)}\n')

    bnsdfg: dace.SDFG = batchnorm2d.to_sdfg()
    lib_nodes = find_library_nodes(bnsdfg, Reduce)
    lib_nodes[0].implementation = 'pure'
    lib_nodes[1].implementation = 'pure'

    print('CPU')
    res = bnsdfg(Z, N=n, H=h, W=w, C=c)
    print('CPU done')
    assert np.allclose(X, res), f'\ndiff: {np.linalg.norm(X-res)}'
Example #8
def test_schedule_inference_simple():
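    # Schedule inference through a nested call: the generated elementwise
    # map should default to CPU_Multicore (asserted below).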
    @dace.program
    def nested_call(A: dace.float64[3, 3]):
        return A + 1

    @dace.program
    def simple_schedule_inference(A: dace.float64[3, 3]):
        return nested_call(A)

    sdfg: dace.SDFG = simple_schedule_inference.to_sdfg(strict=False)

    infer_types.infer_connector_types(sdfg)

    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.apply_transformations_repeated(StateFusion)

    entry = [
        n for n, _ in sdfg.all_nodes_recursive()
        if isinstance(n, dace.nodes.MapEntry)
    ][0]
    assert entry.schedule is dace.ScheduleType.CPU_Multicore
Example #9
def test_ndarray_reduce():
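    # Multi-GPU reduction over the first array axis. shape, size, np_dtype
    # and the specialization constants are presumably module-level in the
    # original file.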
    sdfg: dace.SDFG = ndarray_reduce_gpu.to_sdfg(strict=True)
    map_entry = find_map(sdfg)
    map_entry.schedule = dace.ScheduleType.GPU_Multidevice
    lib_nodes = find_library_nodes(sdfg, Reduce)
    lib_nodes[0].implementation = 'CUDA (device)'

    sdfg.specialize(dict(number_of_gpus=ng, N=n, H=h, W=w, C=c, C_gpu=c // ng))
    set_default_schedule_storage_types_and_location(sdfg, None)
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([RedundantSecondArray, RedundantArray])
    sdfg.apply_strict_transformations()

    np.random.seed(0)
    X = np.ndarray(shape=shape, dtype=np_dtype)
    X = np.arange(size, dtype=np_dtype).reshape(shape)
    # X[:] = np.random.rand(size)[:]
    Z = np.copy(X)

    print('GPU')
    out = sdfg(X)
    print('GPU done')
    res = np.sum(Z, axis=0)
    assert np.allclose(out, res), f'\nout:\n{repr(out)}\n\nres:\n{repr(res)}\n'
Example #10
def test_tbmap_sequential():
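    # A thread-block map nested (through a sequential map) inside the device
    # map: the transient between the maps should default to GPU_Shared
    # storage (asserted below).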
    sdfg = dace.SDFG('default_storage_test_2')
    sdfg.add_array('A', [20, 32], dace.float64, dace.StorageType.GPU_Global)
    sdfg.add_transient('tmp', [1, 32], dace.float64)
    state = sdfg.add_state()

    r = state.add_read('A')
    ome, omx = state.add_map('kernel', dict(i='0:20'),
                             dace.ScheduleType.GPU_Device)
    sme, smx = state.add_map('seq', dict(j='0:1'),
                             dace.ScheduleType.Sequential)
    ime, imx = state.add_map('block', dict(ti='0:32'),
                             dace.ScheduleType.GPU_ThreadBlock)
    tmp = state.add_access('tmp')
    t = state.add_tasklet('add', {'a'}, {'b'}, 'b = a + 1')
    w = state.add_write('A')

    state.add_memlet_path(r,
                          ome,
                          sme,
                          tmp,
                          memlet=dace.Memlet.simple('A', 'i+j, 0:32'))
    state.add_memlet_path(tmp,
                          ime,
                          t,
                          dst_conn='a',
                          memlet=dace.Memlet.simple('tmp', '0, ti'))
    state.add_memlet_path(t,
                          imx,
                          smx,
                          omx,
                          w,
                          src_conn='b',
                          memlet=dace.Memlet.simple('A', 'i+j, ti'))
    set_default_schedule_storage_types_and_location(sdfg, None)
    assert sdfg.arrays['tmp'].storage == dace.StorageType.GPU_Shared
Example #11
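    # (This snippet begins mid-function: the argparse setup above it is
    #  omitted in the original; 'ts' and 'n' arguments are assumed from the
    #  accesses below.)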
    # parser.add_argument("g", type=int, nargs="?", default=4)
    args = vars(parser.parse_args())

    ts = args['ts']
    n = args['n']
    # number_of_gpus = args['g']

    sdfg = j_o_un.to_sdfg(strict=False)
    gpu_map = find_map_by_param(sdfg, 'rank')
    gpu_map.schedule = dace.ScheduleType.GPU_Multidevice

    sdfg.apply_strict_transformations()
    for _, name, array in sdfg.arrays_recursive():
        if name in ['north_neighbor', 'south_neighbor']:
            array.storage = dace.StorageType.CPU_ThreadLocal
    infer_types.set_default_schedule_storage_types_and_location(sdfg)
    # sdfg.expand_library_nodes()
    sdfg.apply_strict_transformations()
    sdfg.apply_transformations_repeated(MapFusion)
    sdfg.apply_transformations_repeated([RedundantArray, RedundantSecondArray])
    sdfg.apply_strict_transformations()

    # sdfg.specialize(
    #     dict(size=number_of_gpus, lNy=n // number_of_gpus, TSTEPS=ts, N=n))
    # ss = sdfg.start_state
    # for n in ss.nodes():
    #     if isinstance(n, nodes.NestedSDFG):
    #         nsn = n

    # program_objects = sdfg.generate_code()
    # from dace.codegen import compiler
Example #12
def generate_code(sdfg, validate=True) -> List[CodeObject]:
    """ Generates code as a list of code objects for a given SDFG.
        :param sdfg: The SDFG to use
        :param validate: If True, validates the SDFG before generating the code.
        :return: List of code objects that correspond to files to compile.
    """
    # Before compiling, validate SDFG correctness
    if validate:
        sdfg.validate()

    if Config.get_bool('testing', 'serialization'):
        from dace.sdfg import SDFG
        import filecmp
        import shutil
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            sdfg.save(f'{tmp_dir}/test.sdfg')
            sdfg2 = SDFG.from_file(f'{tmp_dir}/test.sdfg')
            sdfg2.save(f'{tmp_dir}/test2.sdfg')
            print('Testing SDFG serialization...')
            if not filecmp.cmp(f'{tmp_dir}/test.sdfg', f'{tmp_dir}/test2.sdfg'):
                shutil.move(f"{tmp_dir}/test.sdfg", "test.sdfg")
                shutil.move(f"{tmp_dir}/test2.sdfg", "test2.sdfg")
                raise RuntimeError(
                    'SDFG serialization failed - files do not match')

        # Run with the deserialized version
        # NOTE: This means that all subsequent modifications to `sdfg`
        # are not reflected outside of this function (e.g., library
        # node expansion).
        sdfg = sdfg2

    # Before generating the code, run type inference on the SDFG connectors
    infer_types.infer_connector_types(sdfg)

    # Set default storage/schedule types in SDFG
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)

    # Recursively expand library nodes that have not yet been expanded
    sdfg.expand_library_nodes()

    # After expansion, run another pass of connector/type inference
    infer_types.infer_connector_types(sdfg)
    infer_types.set_default_schedule_storage_types_and_location(sdfg, None)

    frame = framecode.DaCeCodeGenerator()

    # Instantiate CPU first (as it is used by the other code generators)
    # TODO: Refactor the parts used by other code generators out of CPU
    default_target = cpu.CPUCodeGen
    for k, v in target.TargetCodeGenerator.extensions().items():
        # If another target has already been registered as CPU, use it instead
        if v['name'] == 'cpu':
            default_target = k
    targets = {'cpu': default_target(frame, sdfg)}

    # Instantiate the rest of the targets
    targets.update({
        v['name']: k(frame, sdfg)
        for k, v in target.TargetCodeGenerator.extensions().items()
        if v['name'] not in targets
    })

    # Instantiate all instrumentation providers in SDFG
    provider_mapping = InstrumentationProvider.get_provider_mapping()
    frame._dispatcher.instrumentation[
        dtypes.InstrumentationType.No_Instrumentation] = None
    for node, _ in sdfg.all_nodes_recursive():
        if hasattr(node, 'instrument'):
            frame._dispatcher.instrumentation[node.instrument] = \
                provider_mapping[node.instrument]
        elif hasattr(node, 'consume'):
            frame._dispatcher.instrumentation[node.consume.instrument] = \
                provider_mapping[node.consume.instrument]
        elif hasattr(node, 'map'):
            frame._dispatcher.instrumentation[node.map.instrument] = \
                provider_mapping[node.map.instrument]
    if sdfg.instrument != dtypes.InstrumentationType.No_Instrumentation:
        frame._dispatcher.instrumentation[sdfg.instrument] = \
            provider_mapping[sdfg.instrument]
    frame._dispatcher.instrumentation = {
        k: v() if v is not None else None
        for k, v in frame._dispatcher.instrumentation.items()
    }

    # Generate frame code (and the rest of the code)
    (global_code, frame_code, used_targets,
     used_environments) = frame.generate_code(sdfg, None)
    target_objects = [
        CodeObject(sdfg.name,
                   global_code + frame_code,
                   'cpp',
                   cpu.CPUCodeGen,
                   'Frame',
                   environments=used_environments,
                   sdfg=sdfg)
    ]

    # Create code objects for each target
    for tgt in used_targets:
        target_objects.extend(tgt.get_generated_codeobjects())

    # Add a header file for calling the SDFG
    dummy = CodeObject(sdfg.name,
                       generate_headers(sdfg),
                       'h',
                       cpu.CPUCodeGen,
                       'CallHeader',
                       target_type='../../include',
                       linkable=False)
    target_objects.append(dummy)

    for env in dace.library.get_environments_and_dependencies(
            used_environments):
        if hasattr(env, "codeobjects"):
            target_objects.extend(env.codeobjects)

    # Add a dummy main function to show how to call the SDFG
    dummy = CodeObject(sdfg.name + "_main",
                       generate_dummy(sdfg),
                       'cpp',
                       cpu.CPUCodeGen,
                       'SampleMain',
                       target_type='../../sample',
                       linkable=False)
    target_objects.append(dummy)

    return target_objects
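
A minimal usage sketch for this function (hedged: generate_program_folder and configure_and_compile come from dace.codegen.compiler in the DaCe version these examples target, and the output path is illustrative):

    from dace.codegen import compiler

    # Generate code objects, materialize them into a build folder,
    # and compile that folder into a shared library.
    code_objects = generate_code(sdfg)
    folder = compiler.generate_program_folder(sdfg, code_objects,
                                              '.dacecache/example')
    compiler.configure_and_compile(folder)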
Example #13
    def apply(self, sdfg: SDFG) -> None:
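        # Strip-mine the map into a GPU_Multidevice outer map over GPUs and a
        # GPU_Device inner map, then add per-GPU local storage around it.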
        graph: SDFGState = sdfg.nodes()[self.state_id]

        inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[
            GPUMultiTransformMap._map_entry]]

        number_of_gpus = self.number_of_gpus
        ngpus = Config.get("compiler", "cuda", "max_number_gpus")
        if number_of_gpus is None:
            number_of_gpus = ngpus
        if number_of_gpus > ngpus:
            raise ValueError(
                'Requesting more GPUs than specified in the DaCe config.')

        # Avoiding import loops
        from dace.transformation.dataflow import (StripMining, InLocalStorage,
                                                  OutLocalStorage,
                                                  AccumulateTransient)

        # The user is responsible for providing GPU-compliant library node
        # implementations.
        scope_subgraph = graph.scope_subgraph(inner_map_entry)
        for node in scope_subgraph.nodes():
            if isinstance(node, nodes.LibraryNode):
                warnings.warn(
                    'Node %s is a library node, make sure to manually set the '
                    'implementation to a GPU compliant specialization.' % node)

        # Tile map into number_of_gpus tiles
        outer_map: nodes.Map = StripMining.apply_to(
            sdfg,
            dict(dim_idx=-1,
                 new_dim_prefix=self.new_dim_prefix,
                 tile_size=number_of_gpus,
                 tiling_type=dtypes.TilingType.NumberOfTiles),
            _map_entry=inner_map_entry)

        outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
        inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
        outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

        # Change map schedules
        inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
        outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

        symbolic_gpu_id = outer_map.params[0]

        # Map the outer map's parameter (and other free symbols of the inner
        # map's range) into nested SDFGs
        for node in graph.successors(inner_map_entry):
            if isinstance(node, nodes.NestedSDFG):
                map_syms = inner_map_entry.range.free_symbols
                for sym in map_syms:
                    symname = str(sym)
                    if symname not in node.symbol_mapping.keys():
                        node.symbol_mapping[symname] = sym
                        node.sdfg.symbols[symname] = graph.symbols_defined_at(
                            node)[symname]

        # Add transient Data leading to the inner map
        prefix = self.new_transient_prefix
        for node in graph.predecessors(outer_map_entry):
            # Only AccessNodes are relevant
            if (isinstance(node, nodes.AccessNode)
                    and not (self.skip_scalar
                             and isinstance(node.desc(sdfg), Scalar))):
                if self.use_p2p and node.desc(
                        sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue

                in_data_node = InLocalStorage.apply_to(sdfg,
                                                       dict(array=node.data,
                                                            prefix=prefix),
                                                       verify=False,
                                                       save=False,
                                                       node_a=outer_map_entry,
                                                       node_b=inner_map_entry)
                in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

        wcr_data: Dict[str, Any] = {}
        # Add transient Data leading to the outer map
        for edge in graph.in_edges(outer_map_exit):
            node = graph.memlet_path(edge)[-1].dst
            if isinstance(node, nodes.AccessNode):
                data_name = node.data
                # Transients with write-conflict resolution need to be
                # collected first as AccumulateTransient creates a nestedSDFG
                if edge.data.wcr is not None:
                    dtype = sdfg.arrays[data_name].dtype
                    redtype = operations.detect_reduction_type(edge.data.wcr)
                    # Custom reduction can not have an accumulate transient,
                    # as the accumulation from the transient to the outer
                    # storage is not defined.
                    if redtype == dtypes.ReductionType.Custom:
                        warnings.warn(
                            'Using custom reductions in GPUMultiTransformMap '
                            'only works for small data volumes; there is no '
                            'guarantee for large volumes.')
                        continue
                    identity = dtypes.reduction_identity(dtype, redtype)
                    wcr_data[data_name] = identity
                elif (not isinstance(node.desc(sdfg), Scalar)
                      or not self.skip_scalar):
                    if self.use_p2p and node.desc(
                            sdfg).storage is dtypes.StorageType.GPU_Global:
                        continue
                    # Transients without write-conflict resolution
                    create_array = prefix + '_' + data_name not in sdfg.arrays
                    out_data_node = OutLocalStorage.apply_to(
                        sdfg,
                        dict(array=data_name,
                             prefix=prefix,
                             create_array=create_array),
                        verify=False,
                        save=False,
                        node_a=inner_map_exit,
                        node_b=outer_map_exit)
                    out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    out_data_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global

        # Add Transients for write-conflict resolution
        if len(wcr_data) != 0:
            nsdfg = AccumulateTransient.apply_to(
                sdfg,
                options=dict(array_identity_dict=wcr_data, prefix=prefix),
                map_exit=inner_map_exit,
                outer_map_exit=outer_map_exit)
            nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
            nsdfg.location['gpu'] = symbolic_gpu_id
            for transient_node in graph.successors(nsdfg):
                if isinstance(transient_node, nodes.AccessNode):
                    transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    transient_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global
                    arr = nsdfg.sdfg.arrays[transient_node.label]
                    arr.location['gpu'] = symbolic_gpu_id
                    arr.storage = dtypes.StorageType.GPU_Global
            infer_types.set_default_schedule_storage_types_and_location(
                nsdfg.sdfg, dtypes.ScheduleType.GPU_Multidevice,
                symbolic_gpu_id)

        # Remove the parameter of the outer_map from the sdfg symbols,
        # as it got added as a symbol in StripMining.
        if outer_map.params[0] in sdfg.free_symbols:
            sdfg.remove_symbol(outer_map.params[0])