Example 1
def optimize_for_cpu(sdfg: dace.SDFG, m: int, n: int, k: int):
    """ Optimize the matrix multiplication example for multi-core CPUs. """
    # Ensure integers are 32-bit by default
    dace.Config.set('compiler', 'default_data_types', value='C')

    # Fuse the map and reduce nodes
    sdfg.apply_transformations(MapReduceFusion)

    # Find multiplication map
    entry = find_map_by_param(sdfg, 'k')

    # Create a tiling strategy
    divides_evenly = (m % 32 == 0) and (n % 32 == 0) and (k % 256 == 0)
    xfutil.tile(sdfg, entry, divides_evenly, False, k=256, i=32, j=32)
    xfutil.tile(sdfg, entry, divides_evenly, divides_evenly, j=16, i=4)

    # Reorder internal map to "k,i,j"
    xfutil.permute_map(entry, [2, 0, 1])

    # Add local storage for B in j tile: we apply InLocalStorage with a
    # parameter "array" named B, between the two maps of j and i
    regtile_j = find_map_by_param(sdfg, 'tile1_j')
    regtile_i = find_map_by_param(sdfg, 'tile1_i')
    InLocalStorage.apply_to(sdfg,
                            dict(array='B'),
                            node_a=regtile_j,
                            node_b=regtile_i)

    if divides_evenly:
        # Add local storage for C
        exit_inner = find_mapexit_by_param(sdfg, 'k')
        exit_rti = find_mapexit_by_param(sdfg, 'tile1_i')
        AccumulateTransient.apply_to(sdfg,
                                     dict(array='C', identity=0),
                                     map_exit=exit_inner,
                                     outer_map_exit=exit_rti)

        # Vectorize microkernel map
        postamble = n % 4 != 0
        entry_inner, inner_state = find_map_and_state_by_param(sdfg, 'k')
        Vectorization.apply_to(inner_state.parent,
                               dict(vector_len=4,
                                    preamble=False,
                                    postamble=postamble),
                               _map_entry=entry_inner)

    # Mark outer tile map as sequential to remove atomics
    find_map_by_param(sdfg,
                      'tile_k').map.schedule = dace.ScheduleType.Sequential

    # Collapse maps for more parallelism
    find_map_by_param(sdfg, 'o0').map.collapse = 2
    tile_i = find_map_by_param(sdfg, 'tile_i')
    tile_j = find_map_by_param(sdfg, 'tile_j')
    MapCollapse.apply_to(sdfg,
                         _outer_map_entry=tile_i,
                         _inner_map_entry=tile_j)
    tile_ij = find_map_by_param(sdfg, 'tile_i')  # Find newly created map
    tile_ij.map.schedule = dace.ScheduleType.CPU_Multicore
    tile_ij.map.collapse = 2
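For context, a minimal driver sketch, loosely modeled on DaCe's matmul optimization sample: the matmul program and the concrete sizes below are illustrative assumptions, not part of the example, and the explicit map-plus-reduce formulation is what lets MapReduceFusion apply.

import numpy as np
import dace

# Symbolic sizes (assumed names)
M, N, K = (dace.symbol(s) for s in ('M', 'N', 'K'))

@dace.program
def matmul(A: dace.float64[M, K], B: dace.float64[K, N], C: dace.float64[M, N]):
    # Multiply every pair of values into a 3D temporary, then reduce over k
    tmp = np.ndarray([M, N, K], dtype=np.float64)
    for i, j, k in dace.map[0:M, 0:N, 0:K]:
        with dace.tasklet:
            a << A[i, k]
            b << B[k, j]
            out >> tmp[i, j, k]
            out = a * b
    dace.reduce(lambda x, y: x + y, tmp, C, axis=2, identity=0)

sdfg = matmul.to_sdfg()
optimize_for_cpu(sdfg, 1024, 1024, 1024)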
Example 2
def is_sdfg_equal(sdfg1: dace.SDFG, sdfg2: dace.SDFG):

    if not (len(sdfg1.states()) == 1 and len(sdfg2.states()) == 1):
        return False
    state1 = sdfg1.states()[0]
    state2 = sdfg2.states()[0]

    # SDFGState.nx does not contain any node info in the networkx node attrs (but does for edges),
    # so we add it here manually.
    nx.set_node_attributes(state1.nx, {n: n for n in state1.nx.nodes}, "node")
    nx.set_node_attributes(state2.nx, {n: n for n in state2.nx.nodes}, "node")

    if not nx.is_isomorphic(
            state1.nx, state2.nx, edge_match=edge_match,
            node_match=node_match):
        return False

    for name in sdfg1.arrays.keys():
        if not (isinstance(sdfg1.arrays[name], type(sdfg2.arrays[name]))
                and isinstance(sdfg2.arrays[name], type(sdfg1.arrays[name]))
                and sdfg1.arrays[name].dtype == sdfg2.arrays[name].dtype and
                sdfg1.arrays[name].transient == sdfg2.arrays[name].transient
                and sdfg1.arrays[name].shape == sdfg2.arrays[name].shape):
            return False
    return True
Example 3
    def op_repo_replacement(sdfg: SDFG, state: SDFGState, **kwargs):
        attrs = {
            name: value
            for name, value in kwargs.items() if name in dace_schema.attributes
        }
        onnx_node = cls(name=cls_name, **attrs)
        state.add_node(onnx_node)

        input_names = {p.name for p in dace_schema.inputs}
        output_names = {p.name for p in dace_schema.outputs}
        inputs = {
            name: arr_name
            for name, arr_name in kwargs.items() if name in input_names
        }
        outputs = {
            name: arr_name
            for name, arr_name in kwargs.items() if name in output_names
        }

        for inp, arr_name in inputs.items():
            read = state.add_read(arr_name)
            state.add_edge(read, None, onnx_node, inp,
                           sdfg.make_array_memlet(arr_name))

        for outp, arr_name in outputs.items():
            write = state.add_write(arr_name)
            state.add_edge(onnx_node, outp, write, None,
                           sdfg.make_array_memlet(arr_name))
        return []
Example 4
def expand_reduce(sdfg: dace.SDFG,
                  graph: dace.SDFGState,
                  subgraph: Union[SubgraphView, List[SubgraphView]] = None,
                  **kwargs):

    subgraph = graph if not subgraph else subgraph
    if not isinstance(subgraph, list):
        subgraph = [subgraph]

    for sg in subgraph:
        reduce_nodes = []
        for node in sg.nodes():
            if isinstance(node, stdlib.Reduce):
                rexp = ReduceExpansion(sdfg, sdfg.sdfg_id, sdfg.node_id(graph),
                                       {ReduceExpansion.reduce: graph.node_id(node)}, 0)
                if not rexp.can_be_applied(graph, 0, sdfg):
                    print(f"WARNING: Cannot expand reduce node {node}:" "can_be_applied() failed.")
                    continue
                reduce_nodes.append(node)

        trafo_reduce = ReduceExpansion(sdfg, sdfg.sdfg_id, sdfg.node_id(graph), {}, 0)
        for prop, val in kwargs.items():
            setattr(trafo_reduce, prop, val)

        for reduce_node in reduce_nodes:
            trafo_reduce.expand(sdfg, graph, reduce_node)
            if isinstance(sg, SubgraphView):
                sg.nodes().remove(reduce_node)
                sg.nodes().append(trafo_reduce._reduce)
                sg.nodes().append(trafo_reduce._outer_entry)
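A brief usage sketch, assuming an SDFG whose start state contains Reduce library nodes; extra ReduceExpansion properties could be forwarded through **kwargs as the setattr loop above shows.

# Expand every Reduce node found in the start state of the SDFG.
expand_reduce(sdfg, sdfg.start_state)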
Example 5
def mkc(sdfg: dace.SDFG,
        state_before,
        src_name,
        dst_name,
        src_storage=None,
        dst_storage=None,
        src_shape=None,
        dst_shape=None,
        copy_expr=None,
        src_loc=None,
        dst_loc=None):
    """
    Helper MaKe_Copy that creates and appends states performing exactly one copy. If a provided
    arrayname already exists it will use the old array, and ignore all newly passed values
    """

    if copy_expr is None:
        copy_expr = src_name
    if state_before is None:
        state = sdfg.add_state(is_start_state=True)
    else:
        state = sdfg.add_state_after(state_before)

    def mkarray(name, shape, storage, loc):
        if (name in sdfg.arrays):
            return sdfg.arrays[name]
        is_transient = False
        if (storage in _FPGA_STORAGE_TYPES):
            is_transient = True
        arr = sdfg.add_array(name,
                             shape,
                             dace.int32,
                             storage,
                             transient=is_transient)
        if loc is not None:
            arr[1].location["memorytype"] = loc[0]
            arr[1].location["bank"] = loc[1]
        return arr

    a = mkarray(src_name, src_shape, src_storage, src_loc)
    b = mkarray(dst_name, dst_shape, dst_storage, dst_loc)

    aAcc = state.add_access(src_name)
    bAcc = state.add_access(dst_name)

    edge = state.add_edge(aAcc, None, bAcc, None, mem.Memlet(copy_expr))

    a_np_arr, b_np_arr = None, None
    if src_shape is not None:
        try:
            a_np_arr = np.zeros(src_shape, dtype=np.int32)
        except Exception:
            pass
    if dst_shape is not None:
        try:
            b_np_arr = np.zeros(dst_shape, dtype=np.int32)
        except Exception:
            pass
    return (state, a_np_arr, b_np_arr)
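A hedged usage sketch chaining two copy states; the array names, sizes, and storage types are illustrative, and mem, np, and _FPGA_STORAGE_TYPES are assumed to be in scope as in the helper above.

sdfg = dace.SDFG('copy_chain')
# Host array -> FPGA device array, created as the start state
s1, a_np, _ = mkc(sdfg, None, 'a', 'a_dev',
                  dace.StorageType.Default, dace.StorageType.FPGA_Global,
                  [64], [64], 'a[0:64]')
# FPGA device array -> host result array, appended after s1
s2, _, b_np = mkc(sdfg, s1, 'a_dev', 'b',
                  None, dace.StorageType.Default,
                  None, [64], 'a_dev[0:64]')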
Example 6
def _post_expand_trafos(sdfg: dace.SDFG):
    while inline_sdfgs(sdfg) or fuse_states(sdfg):
        pass
    sdfg.simplify()

    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, dace.nodes.MapEntry):
            node.collapse = len(node.range)
Example 7
def assert_sdfg_equal(sdfg1: dace.SDFG, sdfg2: dace.SDFG):
    from gtc.dace.nodes import (
        HorizontalExecutionLibraryNode,
        OIRLibraryNode,
        VerticalLoopLibraryNode,
    )

    def edge_match(edge1, edge2):
        edge1 = next(iter(edge1.values()))
        edge2 = next(iter(edge2.values()))
        try:
            if edge1["src_conn"] is not None:
                assert edge2["src_conn"] is not None
                assert edge1["src_conn"] == edge2["src_conn"]
            else:
                assert edge2["src_conn"] is None
            assert edge1["data"] == edge2["data"]
            assert edge1["data"].data == edge2["data"].data
        except AssertionError:
            return False
        return True

    def node_match(n1, n2):
        n1 = n1["node"]
        n2 = n2["node"]
        try:
            if not isinstance(
                n1, (dace.nodes.AccessNode, VerticalLoopLibraryNode, HorizontalExecutionLibraryNode)
            ):
                raise TypeError
            if isinstance(n1, dace.nodes.AccessNode):
                assert isinstance(n2, dace.nodes.AccessNode)
                assert n1.access == n2.access
                assert n1.data == n2.data
            elif isinstance(n1, OIRLibraryNode):
                assert n1 == n2
        except AssertionError:
            return False
        return True

    assert len(sdfg1.states()) == 1
    assert len(sdfg2.states()) == 1
    state1 = sdfg1.states()[0]
    state2 = sdfg2.states()[0]

    # SDFGState.nx does not contain any node info in the networkx node attrs (but does for edges),
    # so we add it here manually.
    nx.set_node_attributes(state1.nx, {n: n for n in state1.nx.nodes}, "node")
    nx.set_node_attributes(state2.nx, {n: n for n in state2.nx.nodes}, "node")

    assert nx.is_isomorphic(state1.nx, state2.nx, edge_match=edge_match, node_match=node_match)

    for name in sdfg1.arrays.keys():
        assert isinstance(sdfg1.arrays[name], type(sdfg2.arrays[name]))
        assert isinstance(sdfg2.arrays[name], type(sdfg1.arrays[name]))
        assert sdfg1.arrays[name].dtype == sdfg2.arrays[name].dtype
        assert sdfg1.arrays[name].transient == sdfg2.arrays[name].transient
        assert sdfg1.arrays[name].shape == sdfg2.arrays[name].shape
Example 8
    def generate_host_function_body(self, sdfg: dace.SDFG,
                                    state: dace.SDFGState, kernel_name: str,
                                    predecessors: list, parameters: list,
                                    rtl_tasklet_names: list,
                                    kernel_stream: CodeIOStream,
                                    instrumentation_stream: CodeIOStream):
        '''
        Generate the host-specific code for spawning and synchronizing the given kernel.
        :param sdfg: The SDFG containing the kernel.
        :param state: The state containing the kernel.
        :param kernel_name: Name of the kernel to launch.
        :param predecessors: List with the names of all kernels that must finish before this one starts.
        :param parameters: List containing the kernel parameters (of all kernels in this state).
        :param rtl_tasklet_names: Names of the RTL tasklets to join after the kernel is launched.
        :param kernel_stream: Device-specific code stream.
        :param instrumentation_stream: Code for profiling kernel execution time.
        '''

        kernel_args = []
        for _, name, p, interface_ids in parameters:
            if isinstance(p, dt.Array):
                for bank, _ in fpga.iterate_hbm_interface_ids(
                        p, interface_ids):
                    kernel_args.append(
                        p.as_arg(False,
                                 name=fpga.fpga_ptr(name, p, sdfg, bank)))
            else:
                kernel_args.append(p.as_arg(False, name=name))

        kernel_function_name = kernel_name
        kernel_file_name = "{}.xclbin".format(kernel_name)

        # Check if this kernel depends on other kernels
        needs_synch = len(predecessors) > 0

        if needs_synch:
            # Build a vector containing all the events associated with the kernels on which this one depends
            kernel_deps_name = f"deps_{kernel_name}"
            kernel_stream.write(f"std::vector<cl::Event> {kernel_deps_name};")
            for pred in predecessors:
                # concatenate events from predecessor kernel
                kernel_stream.write(
                    f"{kernel_deps_name}.push_back({pred}_event);")

        # Launch HLS kernel, passing synchronization events (if any)
        kernel_stream.write(
            f"""\
  auto {kernel_name}_kernel = program.MakeKernel({kernel_function_name}, "{kernel_function_name}", {", ".join(kernel_args)});
  cl::Event {kernel_name}_event = {kernel_name}_kernel.ExecuteTaskFork({f'{kernel_deps_name}.begin(), {kernel_deps_name}.end()' if needs_synch else ''});
  all_events.push_back({kernel_name}_event);""", sdfg, sdfg.node_id(state))
        if state.instrument == dtypes.InstrumentationType.FPGA:
            self.instrument_opencl_kernel(kernel_name, sdfg.node_id(state),
                                          sdfg.sdfg_id, instrumentation_stream)

        # Join RTL tasklets
        for name in rtl_tasklet_names:
            kernel_stream.write(f"kernel_{name}.wait();\n", sdfg,
                                sdfg.node_id(state))
Example 9
def optimize_for_gpu(sdfg: dace.SDFG, m: int, n: int, k: int):
    """ Optimize the matrix multiplication example for GPUs. """
    # Ensure integers are 32-bit by default
    dace.Config.set('compiler', 'default_data_types', value='C')

    # Fuse the map and reduce nodes
    sdfg.apply_transformations(MapReduceFusion)

    # Apply GPU transformation
    sdfg.apply_gpu_transformations()

    # Find multiplication map
    entry = find_map_by_param(sdfg, 'k')

    # Create a tiling strategy
    divides_evenly = (m % 64 == 0) and (n % 64 == 0) and (k % 8 == 0)
    xfutil.tile(sdfg, entry, divides_evenly, True, i=64, j=64, k=8)
    xfutil.tile(sdfg, entry, divides_evenly, True, i=8, j=4)

    # Create kernel schedule by collapsing and reordering maps
    gtile_i = find_map_by_param(sdfg, 'tile_i')
    gtile_j = find_map_by_param(sdfg, 'tile_j')
    btile_i = find_map_by_param(sdfg, 'tile1_i')
    btile_j = find_map_by_param(sdfg, 'tile1_j')
    MapCollapse.apply_to(sdfg, outer_map_entry=gtile_i, inner_map_entry=gtile_j, permissive=True)
    MapCollapse.apply_to(sdfg, outer_map_entry=btile_i, inner_map_entry=btile_j, permissive=True)
    btile = find_map_by_param(sdfg, 'tile1_i')
    btile.map.schedule = dace.ScheduleType.GPU_ThreadBlock

    # Add local storage (shared memory) for A and B on GPU
    ktile = find_map_by_param(sdfg, 'tile_k')
    smem_a = InLocalStorage.apply_to(sdfg, dict(array='A'), node_a=ktile, node_b=btile)
    smem_b = InLocalStorage.apply_to(sdfg, dict(array='B'), node_a=ktile, node_b=btile)
    sdfg.arrays[smem_a.data].storage = dace.StorageType.GPU_Shared
    sdfg.arrays[smem_b.data].storage = dace.StorageType.GPU_Shared

    # Add local storage (registers) for A and B
    ttile = find_map_by_param(sdfg, 'k')
    warptile, ttile = xfutil.extract_map_dims(sdfg, ttile, [2])
    InLocalStorage.apply_to(sdfg, dict(array='trans_gpu_A'), node_a=warptile, node_b=ttile)
    InLocalStorage.apply_to(sdfg, dict(array='trans_gpu_B'), node_a=warptile, node_b=ttile)

    # Add local storage (registers) for C
    state = next(s for s in sdfg.nodes() if warptile in s.nodes())
    warptile_exit = state.exit_node(warptile)
    btile_exit = state.exit_node(btile)
    AccumulateTransient.apply_to(sdfg, map_exit=warptile_exit, outer_map_exit=btile_exit)
    # Set C tile to zero on allocation
    c_access = next(n for n in state.data_nodes() if n.data == 'trans_gpu_C')
    c_access.setzero = True

    # Unroll microkernel maps
    ttile.map.unroll = True

    # Apply double-buffering on shared memory
    DoubleBuffering.apply_to(sdfg, map_entry=ktile, transient=smem_a)
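The GPU driver mirrors the CPU one from Example 1; a minimal sketch under the same assumptions (matmul is the illustrative program sketched after Example 1).

sdfg = matmul.to_sdfg()
optimize_for_gpu(sdfg, 1024, 1024, 1024)
csdfg = sdfg.compile()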
Example 10
    def op_repo_replacement(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState,
                            **kwargs):
        attrs = {
            name: value
            for name, value in kwargs.items() if name in dace_schema.attributes
        }
        # remove used attrs
        kwargs = {k: v for k, v in kwargs.items() if k not in attrs}

        onnx_node = cls(name=cls_name, **attrs)
        state.add_node(onnx_node)

        input_names = dace_schema.non_variadic_inputs()
        variadic_inputs = dace_schema.variadic_inputs()

        output_names = dace_schema.non_variadic_outputs()
        variadic_outputs = dace_schema.variadic_outputs()

        inputs = {
            name: arr_name
            for name, arr_name in kwargs.items()
            if (name in input_names or
                # variadic params
                ("__" in name
                 and parse_variadic_param(name)[0] in variadic_inputs))
        }

        kwargs = {k: v for k, v in kwargs.items() if k not in inputs}

        outputs = {
            name: arr_name
            for name, arr_name in kwargs.items()
            if (name in output_names or
                # variadic params
                ("__" in name
                 and parse_variadic_param(name)[0] in variadic_outputs))
        }

        kwargs = {k: v for k, v in kwargs.items() if k not in outputs}

        if len(kwargs) > 0:
            raise TypeError(f"Unknown arguments {', '.join(kwargs)}")

        for inp, arr_name in inputs.items():
            read = state.add_read(arr_name)
            state.add_edge(read, None, onnx_node, inp,
                           sdfg.make_array_memlet(arr_name))
            onnx_node.add_in_connector(inp)

        for outp, arr_name in outputs.items():
            write = state.add_write(arr_name)
            state.add_edge(onnx_node, outp, write, None,
                           sdfg.make_array_memlet(arr_name))
            onnx_node.add_out_connector(outp)
        return []
Example 11
 def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream,
                   global_stream: CodeIOStream,
                   codegen: 'DaCeCodeGenerator'):
     # Initialize serializer versioning object
     if sdfg.parent is None:
         self.codegen = codegen
         path = os.path.abspath(os.path.join(sdfg.build_folder,
                                             'data')).replace('\\', '/')
         codegen.statestruct.append('dace::DataSerializer *serializer;')
         sdfg.append_init_code(
             f'__state->serializer = new dace::DataSerializer("{path}");\n')
Example 12
    def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream,
                      global_stream: CodeIOStream,
                      codegen: 'DaCeCodeGenerator'):
        # Initialize serializer versioning object
        if sdfg.parent is None:
            self.codegen = codegen
            codegen.statestruct.append('dace::DataSerializer *serializer;')
            sdfg.append_init_code(
                f'__state->serializer = new dace::DataSerializer("");\n')

            # Add method that controls serializer input
            global_stream.write(self._generate_report_setter(sdfg))
Example 13
    def apply(self, state: SDFGState, sdfg: SDFG):
        nsdfg = self.nsdfg

        candidates, candidate_nodes = self._candidates(nsdfg)
        for outer_edge in state.out_edges(nsdfg):
            if outer_edge.src_conn in candidates:
                state.remove_memlet_path(outer_edge)
                sdfg.remove_data(outer_edge.data.data, validate=False)
        for nstate, node in candidate_nodes:
            for ie in nstate.in_edges(node):
                nstate.remove_memlet_path(ie)
        for cand in candidates:
            nsdfg.sdfg.remove_data(cand, validate=False)
Example 14
def _specialize_transient_strides(sdfg: dace.SDFG, layout_map):
    repldict = replace_strides(
        [array for array in sdfg.arrays.values() if array.transient],
        layout_map,
    )
    sdfg.replace_dict(repldict)
    for state in sdfg.nodes():
        for node in state.nodes():
            if isinstance(node, dace.nodes.NestedSDFG):
                for k, v in repldict.items():
                    if k in node.symbol_mapping:
                        node.symbol_mapping[k] = v
    for k in repldict.keys():
        if k in sdfg.symbols:
            sdfg.remove_symbol(k)
Example 15
def validate_oir_sdfg(sdfg: dace.SDFG):

    from gtc.dace.nodes import VerticalLoopLibraryNode

    sdfg.validate()
    is_correct_node_types = all(
        isinstance(n, (dace.SDFGState, dace.nodes.AccessNode,
                       VerticalLoopLibraryNode))
        for n, _ in sdfg.all_nodes_recursive())
    is_correct_data_and_dtype = all(
        isinstance(array, dace.data.Array) and typestr_to_data_type(
            dace_dtype_to_typestr(array.dtype)) != DataType.INVALID
        for array in sdfg.arrays.values())
    if not is_correct_node_types or not is_correct_data_and_dtype:
        raise ValueError("Not a valid OIR-level SDFG")
Example 16
    def apply_pass(self, sdfg: SDFG, _) -> Optional[Set[SDFGState]]:
        """
        Removes unreachable states throughout an SDFG.
        :param sdfg: The SDFG to modify.
        :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass
                                 results as ``{Pass subclass name: returned object from pass}``. If not run in a
                                 pipeline, an empty dictionary is expected.
        :return: A set of the removed states, or None if nothing was changed.
        """
        # Mark dead states and remove them
        result = self.find_dead_states(sdfg, set_unconditional_edges=True)
        sdfg.remove_nodes_from(result)

        return result or None
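For context, a hedged sketch of running such a pass standalone; the DeadStateElimination class name and import path are assumptions inferred from the docstring and the find_dead_states call.

from dace.transformation.passes.dead_state_elimination import DeadStateElimination

# An empty dict stands in for pipeline_results when run outside a Pipeline.
removed_states = DeadStateElimination().apply_pass(sdfg, {})
if removed_states:
    print(f'Removed {len(removed_states)} unreachable state(s)')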
Example 17
    def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView,
                       state_id: int, function_stream: CodeIOStream,
                       callsite_stream: CodeIOStream):

        entry_node: nd.MapEntry = scope.source_nodes()[0]
        index_list = []

        for begin, end, stride in entry_node.map.range:
            l = []
            while begin <= end:
                l.append(begin)
                begin += stride
            index_list.append(l)

        sdfgconsts = sdfg.constants_prop
        sdfg.constants_prop = copy.deepcopy(sdfg.constants_prop)

        mapsymboltypes = entry_node.new_symbols(sdfg, scope,
                                                [entry_node.map.params])
        for indices in product(*index_list):
            callsite_stream.write('{')
            nsdfg_unroll_info = None
            for param, index in zip(entry_node.map.params, indices):
                if nsdfg_unroll_info is None:
                    nsdfg_unroll_info = self.nsdfg_prepare_unroll(
                        scope, str(param), str(index))
                else:
                    self.nsdfg_prepare_unroll(scope, str(param), str(index))
                callsite_stream.write(
                    f"constexpr {mapsymboltypes[param]} {param} = "
                    f"{dace.codegen.targets.common.sym2cpp(index)};\n", sdfg)
                sdfg.add_constant(param, int(index))

            callsite_stream.write('{')
            self._dispatcher.dispatch_subgraph(
                sdfg,
                scope,
                state_id,
                function_stream,
                callsite_stream,
                skip_entry_node=True,
                skip_exit_node=True,
            )
            callsite_stream.write('}')
            callsite_stream.write('}')
            self.nsdfg_after_unroll(nsdfg_unroll_info)

        sdfg.constants_prop = sdfgconsts
Example 18
def program_for_node(program, sdfg: SDFG, state: SDFGState,
                     node: onnx_op.ONNXOp) -> SDFG:
    """ Expand a function to a dace program.

        The dtypes for the arguments will be extracted by matching the parameter names to edges.
    """
    input_names = node.schema.non_variadic_inputs()
    variadic_input_names = node.schema.variadic_inputs()

    output_names = node.schema.non_variadic_outputs()
    variadic_output_names = node.schema.variadic_outputs()

    if set(input_names).intersection(output_names):
        # this is currently the case for only one onnx op
        raise ValueError(
            "program_for_node cannot be applied on nodes of this type;"
            " '{}' is both an input and an output".format(
                next(iter(set(input_names).intersection(output_names)))))

    params = inspect.signature(program).parameters

    annotations = {}
    for name, param in params.items():
        if name in input_names or ("__" in name
                                   and parse_variadic_param(name)[0]
                                   in variadic_input_names):
            annotations[name] = in_desc_with_name(node, state, sdfg, name)
        elif name in output_names or ("__" in name
                                      and parse_variadic_param(name)[0]
                                      in variadic_output_names):
            annotations[name] = out_desc_with_name(node, state, sdfg, name)
        else:
            raise ValueError(
                "'{}' was not found as an input or output for {}".format(
                    name, node.schema.name))

    program.__annotations__ = annotations

    result = DaceProgram(program, (), {}, False, dace.DeviceType.CPU)
    result.name = node.label + "_expansion"

    sdfg = result.to_sdfg()

    if node.schedule in [dtypes.ScheduleType.GPU_Default
                         ] + dtypes.GPU_SCHEDULES:
        sdfg.apply_gpu_transformations()

    return sdfg
Example 19
File: papi.py Project: mfkiwl/dace
    def get_out_memlet_costs(sdfg: dace.SDFG, state_id: int, node: nodes.Node,
                             dfg: StateGraphView):
        scope_dict = sdfg.node(state_id).scope_dict()

        out_costs = 0
        for edge in dfg.out_edges(node):
            _, uconn, v, _, memlet = edge
            dst_node = dfg.memlet_path(edge)[-1].dst

            if (isinstance(node, nodes.CodeNode)
                    and isinstance(dst_node, nodes.AccessNode)):

                # If the memlet is pointing into an array in an inner scope,
                # it will be handled by the inner scope.
                if (scope_dict[node] != scope_dict[dst_node]
                        and scope_contains_scope(scope_dict, node, dst_node)):
                    continue

                if not uconn:
                    # This would normally raise a syntax error
                    return 0

                if memlet.subset.data_dims() == 0:
                    if memlet.wcr is not None:
                        # write_and_resolve
                        # We have to assume that every reduction costs 3
                        # accesses of the same size (read old, read new, write)
                        out_costs += 3 * PAPIUtils.get_memlet_byte_size(
                            sdfg, memlet)
                    else:
                        # This standard operation is already counted
                        out_costs += PAPIUtils.get_memlet_byte_size(
                            sdfg, memlet)
        return out_costs
Example 20
def _dml_disambiguate_direction_dependent_views(sdfg: dace.SDFG):
    """ Consider the following subgraph:
            (A) -- y --> (n) -- x --> (C)
            In dace, if B is a View node and A and C are access nodes, and y and x both have data set to A.data and
            B.data respectively, the semantics of the graph depend on the order in which it is executed, i.e. reversing
            the subgraph doesn't perform as expected anymore. To disambiguate this case, we set y.data to the View's
            data.
        """

    for n, state in sdfg.all_nodes_recursive():
        if isinstance(n, nd.AccessNode) and type(n.desc(sdfg)) is dt.View:
            in_edges = state.in_edges(n)
            out_edges = state.out_edges(n)

            if len(in_edges) == 1 and len(out_edges) == 1:
                A = in_edges[0].src
                y = in_edges[0].data
                C = out_edges[0].dst
                x = out_edges[0].data
                if (isinstance(A, nd.AccessNode)
                        and isinstance(C, nd.AccessNode) and y.data == A.data
                        and x.data == C.data):

                    # flip the memlet
                    y.subset, y.other_subset = y.other_subset, y.subset
                    y.data = n.data
                    y.try_initialize(sdfg, state, in_edges[0])
Example 21
def find_library_nodes(
        sdfg: dace.SDFG,
        lib_type: dace.sdfg.nodes.LibraryNode) -> List[dace.sdfg.nodes.LibraryNode]:
    """ Finds all library nodes of the given type in the SDFG. """
    return [
        n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, lib_type)
    ]
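A short usage sketch; the BLAS MatMul library node is used purely as an illustration.

from dace.libraries.blas import MatMul

# Collect all MatMul library nodes and pick an expansion for them.
for node in find_library_nodes(sdfg, MatMul):
    node.implementation = 'pure'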
Example 22
 def apply(cls, gtir, sdfg: dace.SDFG):
     self = cls()
     code_objects = sdfg.generate_code()
     computations = code_objects[[co.title for co in code_objects
                                  ].index("Frame")].clean_code
     lines = computations.split("\n")
     computations = "\n".join(
         lines[0:2] + lines[3:])  # remove import of not generated file
     computations = codegen.format_source("cpp", computations, style="LLVM")
     interface = cls.template.definition.render(
         name=sdfg.name,
         dace_args=self.generate_dace_args(gtir, sdfg),
         functor_args=self.generate_functor_args(sdfg),
         tmp_allocs=self.generate_tmp_allocs(sdfg),
     )
     generated_code = f"""#include <gridtools/sid/sid_shift_origin.hpp>
                          #include <gridtools/sid/allocator.hpp>
                          #include <gridtools/stencil/cartesian.hpp>
                          namespace gt = gridtools;
                          {computations}
                          {interface}
                          """
     formatted_code = codegen.format_source("cpp",
                                            generated_code,
                                            style="LLVM")
     return formatted_code
Example 23
def _count_views(sdfg: dace.SDFG) -> int:
    num = 0
    for n, _ in sdfg.all_nodes_recursive():
        if (isinstance(n, nodes.AccessNode)
                and isinstance(sdfg.arrays[n.data], data.View)):
            num += 1
    return num
Example 24
    def pre_evaluate(self, cutout: dace.SDFG, measurements: int,
                     **kwargs) -> Dict:
        cutout.start_state.instrument = self.instrument

        map_entry = None
        for node in cutout.start_state.nodes():
            if isinstance(node, dace.nodes.MapEntry) and xfh.get_parent_map(
                    cutout.start_state, node) is None:
                map_entry = node
                break
        assert map_entry is not None

        new_kwargs = {
            "space_kwargs": {
                "map_entry": map_entry
            },
            "cutout": cutout.to_json(),
            "map_entry_id": cutout.start_state.node_id(map_entry),
            "measurements": measurements,
            "key": lambda point: "None" if point is None else ".".join(str(p) for p in point)
        }
        return new_kwargs
Example 25
    def apply(self, sdfg: dace.SDFG) -> None:
        state = sdfg.node(self.state_id)
        left = self.left(sdfg)
        right = self.right(sdfg)

        # Merge source locations
        dinfo = self._merge_source_locations(left, right)

        # merge oir nodes
        res = HorizontalExecutionLibraryNode(
            oir_node=oir.HorizontalExecution(
                body=left.as_oir().body + right.as_oir().body,
                declarations=left.as_oir().declarations +
                right.as_oir().declarations,
            ),
            iteration_space=left.iteration_space,
            debuginfo=dinfo,
        )
        state.add_node(res)

        intermediate_accesses = set(
            n for path in nx.all_simple_paths(state.nx, left, right)
            for n in path[1:-1])

        # rewire edges and connectors to left and delete right
        for edge in state.edges_between(left, right):
            state.remove_edge_and_connectors(edge)
        for acc in intermediate_accesses:
            for edge in state.in_edges(acc):
                if edge.src is not left:
                    rewire_edge(state, edge, dst=res)
                else:
                    state.remove_edge_and_connectors(edge)
            for edge in state.out_edges(acc):
                if edge.dst is not right:
                    rewire_edge(state, edge, src=res)
                else:
                    state.remove_edge_and_connectors(edge)
        for edge in state.in_edges(left):
            rewire_edge(state, edge, dst=res)
        for edge in state.out_edges(right):
            rewire_edge(state, edge, src=res)
        for edge in state.out_edges(left):
            rewire_edge(state, edge, src=res)
        for edge in state.in_edges(right):
            rewire_edge(state, edge, dst=res)
        state.remove_node(left)
        state.remove_node(right)
        for acc in intermediate_accesses:
            if not state.in_edges(acc):
                if not state.out_edges(acc):
                    state.remove_node(acc)
                else:
                    assert (len(state.edges_between(acc, res)) == 1
                            and len(state.out_edges(acc))
                            == 1), "Previously written array now read-only."
                    state.remove_node(acc)
                    res.remove_in_connector("IN_" + acc.label)
            elif not state.out_edges(acc):
                acc.access = dace.AccessType.WriteOnly
Example 26
    def apply(self, sdfg: SDFG) -> Union[Any, None]:

        state = sdfg.node(self.state_id)
        nsdfg = self.nsdfg(sdfg)

        read_set, write_set = nsdfg.sdfg.read_and_write_sets()
        prune_in = nsdfg.in_connectors.keys() - read_set
        prune_out = nsdfg.out_connectors.keys() - write_set

        # Detect which nodes are used, so we can delete unused nodes after the
        # connectors have been pruned
        all_data_used = read_set | write_set

        # Add WCR outputs to "do not prune" input list
        for e in state.out_edges(nsdfg):
            if e.data.wcr is not None and e.src_conn in prune_in:
                if (state.in_degree(
                        next(
                            iter(state.in_edges_by_connector(
                                nsdfg, e.src_conn))).src) > 0):
                    prune_in.remove(e.src_conn)

        for conn in prune_in:
            for e in state.in_edges_by_connector(nsdfg, conn):
                state.remove_memlet_path(e, remove_orphans=True)
                if conn in nsdfg.sdfg.arrays and conn not in all_data_used:
                    # If the data is now unused, we can purge it from the SDFG
                    nsdfg.sdfg.remove_data(conn)

        for conn in prune_out:
            for e in state.out_edges_by_connector(nsdfg, conn):
                state.remove_memlet_path(e, remove_orphans=True)
                if conn in nsdfg.sdfg.arrays and conn not in all_data_used:
                    # If the data is now unused, we can purge it from the SDFG
                    nsdfg.sdfg.remove_data(conn)
Example 27
def was_vectorized(sdfg: dace.SDFG) -> bool:
    """ Tests whether a binary contains 128-bit CUDA memory operations. """
    csdfg: compiled_sdfg.CompiledSDFG = sdfg.compile()
    output: bytes = subprocess.check_output(
        ['cuobjdump', '-sass', csdfg.filename], stderr=subprocess.STDOUT)
    del csdfg
    return b'.128' in output
Example 28
def load_sdfg_from_json(json):
    if 'error' in json:
        message = ''
        if ('message' in json['error']):
            message = json['error']['message']
        error = {
            'error': {
                'message': 'Invalid SDFG provided',
                'details': message,
            }
        }
        sdfg = None
    else:
        try:
            sdfg = SDFG.from_json(json)
            error = None
        except Exception as e:
            print(traceback.format_exc(), file=sys.stderr)
            sys.stderr.flush()
            error = {
                'error': {
                    'message': 'Failed to parse the provided SDFG',
                    'details': get_exception_message(e),
                },
            }
            sdfg = None
    return {
        'error': error,
        'sdfg': sdfg,
    }
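A hedged usage sketch reading an SDFG JSON payload from disk; the file name is illustrative.

import json as jsonlib

with open('program.sdfg') as f:
    result = load_sdfg_from_json(jsonlib.load(f))
if result['error'] is not None:
    print(result['error']['error']['message'])
else:
    sdfg = result['sdfg']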
Example 29
    def _setup_gpu_runtime(self, sdfg: SDFG, global_stream: CodeIOStream):
        if self.gpu_runtime_init:
            return
        self.gpu_runtime_init = True
        self.backend = config.Config.get('compiler', 'cuda', 'backend')
        if self.backend == 'cuda':
            header_name = 'cuda_runtime.h'
        elif self.backend == 'hip':
            header_name = 'hip/hip_runtime.h'
        else:
            raise NameError('GPU backend "%s" not recognized' % self.backend)

        global_stream.write('#include <%s>' % header_name)

        # For other file headers
        sdfg.append_global_code('\n#include <%s>' % header_name, None)
Example 30
    def _constants_from_unvisited_state(
            self, sdfg: SDFG, state: SDFGState, arrays: Set[str],
            existing_constants: Dict[SDFGState, Dict[str,
                                                     Any]]) -> Dict[str, Any]:
        """
        Collects constants from an unvisited state, traversing backwards until reaching states that do have
        collected constants.
        """
        result: Dict[str, Any] = {}

        for parent, node in sdutil.dfs_conditional(
                sdfg,
                sources=[state],
                reverse=True,
                condition=lambda p, c: c not in existing_constants,
                yield_parent=True):
            # Skip first node
            if parent is None:
                continue

            # Get connecting edge (reversed)
            edge = sdfg.edges_between(node, parent)[0]

            # If node already has propagated constants, update dictionary and stop traversal
            self._propagate(
                result, self._data_independent_assignments(edge.data, arrays),
                True)
            if node in existing_constants:
                self._propagate(result, existing_constants[node], True)

        return result