Example #1
0
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
import dace
import numpy as np

W = dace.symbol('W')


# DaCe program made of two consecutive maps over the range [0, W).
# Both maps read A[i] and write 2 * A[i] back to the same element; the first
# map additionally writes each input value into the one-element local `number`
# through a memlet that carries a summing lambda.
@dace.program
def prog(A):
    # One-element float32 local that receives the `osum` writes below.
    number = dace.define_local([1], dace.float32)

    @dace.map(_[0:W])
    def bla(i):
        inp << A[i]  # input memlet: element i of A
        out >> A[i]  # output memlet: same element, updated in place
        # Output memlet into `number` with a lambda that adds two values.
        # NOTE(review): the arguments are presumably
        # (number of accesses, write-conflict resolution, identity) - confirm
        # against the DaCe memlet syntax in use.
        osum >> number(1, lambda x, y: x + y, 0)

        out = 2 * inp
        osum = inp

    @dace.map(_[0:W])
    def bla2(i):
        inp << A[i]  # input memlet: element i of A
        out >> A[i]  # output memlet: same element, updated in place

        out = 2 * inp


def test():
    W.set(3)
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved.
from __future__ import print_function

import argparse
import dace
import numpy as np
import select
import sys
from scipy import ndimage

W = dace.symbol("W")
H = dace.symbol("H")
T = dace.symbol("T")
P = dace.symbol("P")  # Number of processing elements
dtype = dace.float32


def add_tmp(state):
    """Register the transient ``tmp`` array on ``state``.

    The array has shape ``(2, H, W)``, uses the module-level ``dtype`` and is
    placed in FPGA global storage. The value returned by
    ``state.add_array`` is passed through unchanged.
    """
    fpga_global = dace.dtypes.StorageType.FPGA_Global
    tmp_shape = (2, H, W)
    return state.add_array("tmp",
                           tmp_shape,
                           dtype,
                           transient=True,
                           storage=fpga_global)


def make_init_state(sdfg):
    state = sdfg.add_state("init")

    a0 = state.add_array("A", (H, W), dtype)
    tmp0 = add_tmp(state)
    state.add_memlet_path(a0,
                          tmp0,
Example #3
0
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved.
from __future__ import print_function

import argparse
import dace
from dace.transformation.dataflow import MapTiling
from dace.transformation.optimizer import SDFGOptimizer
import numpy as np
from scipy import ndimage

W = dace.symbol('W')
H = dace.symbol('H')
MAXITER = dace.symbol('MAXITER')


def create_sdfg():

    sdfg = dace.SDFG('stencil_sdfg_api')
    sdfg.add_symbol('MAXITER', MAXITER.dtype)
    _, arr = sdfg.add_array('A', (H, W), dace.float32)
    _, tmparr = sdfg.add_transient('tmp', (H, W), dace.float32)

    init = sdfg.add_state('init')
    guard = sdfg.add_state('guard')
    body = sdfg.add_state('body')
    end = sdfg.add_state('end')

    sdfg.add_edge(init, guard, dace.InterstateEdge(assignments={'i': '0'}))
    sdfg.add_edge(guard, body, dace.InterstateEdge(condition='i<MAXITER'))
    sdfg.add_edge(body, guard, dace.InterstateEdge(assignments={'i': 'i+1'}))
    sdfg.add_edge(guard, end, dace.InterstateEdge(condition='i>=MAXITER'))
Example #4
0
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
""" Tests WarpTiling and fusion on the softmax operator. """
import dace
from dace.transformation.dataflow import (MapFusion, WarpTiling,
                                          TrivialMapElimination, Vectorization)
from dace.transformation.interstate import (HoistState, InlineSDFG,
                                            StateFusion, GPUTransformSDFG)
from dace.transformation.subgraph import (SubgraphFusion, MultiExpansion,
                                          ReduceExpansion)

import numpy as np
import pytest

dn1, dn2, dn3, dr = (dace.symbol(s) for s in ('dn1', 'dn2', 'dn3', 'dr'))


# DaCe program computing a softmax over the last axis of `inp` and storing
# the result in `out`. Both arguments are float32 arrays of shape
# (dn1, dn2, dn3, dr). The maximum along the last axis is subtracted before
# exponentiating.
# NOTE(review): the locals `max` and `sum` shadow the Python builtins inside
# this function.
@dace.program
def softmax_fwd(inp: dace.float32[dn1, dn2, dn3, dr],
                out: dace.float32[dn1, dn2, dn3, dr]):
    # Maximum over the last axis; the reduced axis is dropped here ...
    max = np.max(inp, axis=-1)
    # ... and re-added with size 1 so it lines up with `inp` in the subtraction.
    max_keepdims = np.reshape(max, (dn1, dn2, dn3, 1))
    exp_arr = np.exp(inp - max_keepdims)
    # Same reduce-then-reshape pattern for the normalization term.
    sum = np.sum(exp_arr, axis=-1)
    sum_keepdims = np.reshape(sum, (dn1, dn2, dn3, 1))
    # Write into the caller-provided output array.
    out[:] = exp_arr / sum_keepdims


# Numerically-stable version of softmax
def softmax(x):
    tmp_max = np.max(x, axis=-1, keepdims=True)
    tmp_out = np.exp(x - tmp_max)
Example #5
0
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved.
import dace
import numpy as np
import pytest
from dace.transformation.subgraph import ReduceExpansion

from dace.libraries.standard.nodes.reduce import Reduce

N = dace.symbol('N')
M = dace.symbol('M')
N.set(30)
M.set(30)


# DaCe program that reduces the float32 array `A` of shape (M, N) along
# axis 1 with a pairwise `max` lambda and returns the result.
# NOTE(review): identity=0 presumably assumes the inputs are non-negative
# (a max over all-negative values would otherwise yield 0) - confirm.
@dace.program
def program(A: dace.float32[M, N]):
    return dace.reduce(lambda a, b: max(a, b), A, axis=1, identity=0)


@pytest.mark.gpu
def test_blockallreduce():
    A = np.random.rand(M.get(), N.get()).astype(np.float32)
    sdfg = program.to_sdfg()
    sdfg.apply_gpu_transformations()

    graph = sdfg.nodes()[0]
    for node in graph.nodes():
        if isinstance(node, Reduce):
            reduce_node = node
    reduce_node.implementation = 'CUDA (device)'
Example #6
0
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
import dace
from dace.transformation.subgraph import MultiExpansion, SubgraphFusion
import dace.sdfg.nodes as nodes
import numpy as np

from typing import Union, List
from dace.sdfg.graph import SubgraphView

N, M, O, P, Q, R = [dace.symbol(s) for s in ['N', 'M', 'O', 'P', 'Q', 'R']]


@dace.program
def subgraph_fusion_parallel(A: dace.float64[N], B: dace.float64[M],
                             C: dace.float64[O], D: dace.float64[M],
                             E: dace.float64[N], F: dace.float64[P],
                             G: dace.float64[M], H: dace.float64[P],
                             I: dace.float64[N], J: dace.float64[R],
                             X: dace.float64[N], Y: dace.float64[M],
                             Z: dace.float64[P]):

    tmp1 = np.ndarray([N, M, O], dtype=dace.float64)
    for i, j, k in dace.map[0:N, 0:M, 0:O]:
        with dace.tasklet:
            in1 << A[i]
            in2 << B[j]
            in3 << C[k]
            out >> tmp1[i, j, k]

            out = in1 + in2 + in3
Example #7
0
#!/usr/bin/env python
from __future__ import print_function

import argparse
import dace
import math
import numpy as np

N = dace.symbol('N', positive=True)


# DaCe program that filters `A`: every element greater than `ratio` is pushed
# to a stream that is connected to `out`, and `outsz` receives the comparison
# result of every element through a memlet carrying a summing lambda.
# Argument types are given positionally in the decorator:
# A: float32[N], out: float32[N], outsz: uint32[1], ratio: float32.
@dace.program(dace.float32[N], dace.float32[N], dace.uint32[1], dace.float32)
def pbf(A, out, outsz, ratio):
    # float32 stream; its contents are directed into `out`.
    ostream = dace.define_stream(dace.float32, 1)
    ostream >> out

    @dace.map(_[0:N])
    def filter(i):
        a << A[i]
        # NOTE(review): -1 is presumably the "dynamic number of accesses"
        # marker, since `b` is only written conditionally - confirm.
        b >> ostream(-1)
        # Output into `outsz` with a lambda that adds two values.
        osz >> outsz(-1, lambda x, y: x + y, 0)

        # NOTE(review): this local reuses the name of the enclosing tasklet
        # function (and of the Python builtin `filter`).
        filter = (a > ratio)

        if filter:
            b = a

        osz = filter


def regression(A, ratio):
Example #8
0
    def can_be_applied(self,
                       graph: dace.SDFGState,
                       expr_index: int,
                       sdfg: dace.SDFG,
                       permissive: bool = False):
        """Decide whether the matched map reads its inputs in a stencil pattern.

        The check passes only if all of the following hold:

        * Every non-empty memlet leaving the map entry refers to a Scalar,
          or to an Array/View; Array/View inputs whose shape is not ``[1]``
          must be accessed one element at a time.
        * Each such access indexes with plain symbols or with sums that
          contain exactly one free symbol, and together the indices of an
          access mention every map parameter.
        * No memlet entering the map exit has write-conflict resolution,
          every output is an Array/View accessed at most one element at a
          time, and its indices follow the same rule as the inputs.
        * At least one input is accessed at two positions that differ by a
          non-zero numeric offset (this is what makes it a stencil).

        Memlets that carry no data are ignored on both sides of the map.
        Previously only the entry side skipped them, so a data-less edge into
        the map exit raised ``KeyError`` on ``sdfg.arrays[None]``.

        :param graph: State containing the matched map.
        :param expr_index: Index of the matched pattern expression (unused).
        :param sdfg: SDFG that owns ``graph``; used to resolve descriptors.
        :param permissive: Unused by this check.
        :return: ``True`` if a stencil pattern was found, ``False`` otherwise.
        """
        map_entry = self.map_entry
        map_exit = graph.exit_node(map_entry)
        params = [dace.symbol(p) for p in map_entry.map.params]

        def _indexes_every_param(subset):
            """Return True if ``subset``'s indices are symbol(+offset)
            expressions that together mention every map parameter."""
            remaining = set(params)
            for idx in subset.min_element():
                if isinstance(idx, sympy.Symbol):
                    base = idx
                elif isinstance(idx, sympy.Add):
                    # Only "one symbol plus constants" sums are accepted.
                    if len(idx.free_symbols) != 1:
                        return False
                    base = next(iter(idx.free_symbols))
                else:
                    return False
                remaining.discard(base)
            return not remaining

        # Group the accessed subsets by data descriptor.
        inputs = dict()
        for _, _, _, _, m in graph.out_edges(map_entry):
            if not m.data:
                continue
            inputs.setdefault(sdfg.arrays[m.data], []).append(m.subset)

        stencil_found = False
        for desc, accesses in inputs.items():
            if isinstance(desc, dace.data.Scalar):
                continue
            if not isinstance(desc, (dace.data.Array, dace.data.View)):
                return False
            if list(desc.shape) == [1]:
                continue
            first_access = None
            for a in accesses:
                if a.num_elements() != 1:
                    return False
                if first_access is None:
                    # Explicit None test: do not rely on the truthiness of a
                    # subset object to tell "unset" from "set".
                    first_access = a
                else:
                    # Express this access relative to the first one; every
                    # component must then be a plain number, and any non-zero
                    # component marks a stencil.
                    relative = deepcopy(a)
                    relative.offset(first_access, True)
                    for idx in relative.min_element():
                        if not isinstance(idx, Number):
                            return False
                        if idx != 0:
                            stencil_found = True
                if not _indexes_every_param(a):
                    return False

        outputs = dict()
        for _, _, _, _, m in graph.in_edges(map_exit):
            if m.wcr:
                return False
            if not m.data:
                # Data-less (ordering-only) edge: nothing to check.
                continue
            outputs.setdefault(sdfg.arrays[m.data], []).append(m.subset)

        for desc, accesses in outputs.items():
            if not isinstance(desc, (dace.data.Array, dace.data.View)):
                return False
            for a in accesses:
                if a.num_elements() > 1:
                    return False
                if not _indexes_every_param(a):
                    return False

        return stencil_found
Example #9
0
    def can_be_applied(self,
                       graph: dace.SDFGState,
                       expr_index: int,
                       sdfg: dace.SDFG,
                       permissive: bool = False):
        """Decide whether the matched map has an outer-product access pattern.

        The check passes only if all of the following hold:

        * Every non-empty memlet leaving the map entry refers to a Scalar,
          or to an Array/View. For Array/View inputs whose shape is not
          ``[1]``, each index must be a plain symbol and each access must
          leave at least one map parameter unused.
        * No memlet entering the map exit has write-conflict resolution,
          every output is an Array/View accessed exactly one element at a
          time, and the indices of each output access mention every map
          parameter.
        * At least one qualifying Array/View input access exists.

        Memlets that carry no data are ignored on both sides of the map.
        Previously only the entry side skipped them, so a data-less edge into
        the map exit raised ``KeyError`` on ``sdfg.arrays[None]``.

        :param graph: State containing the matched map.
        :param expr_index: Index of the matched pattern expression (unused).
        :param sdfg: SDFG that owns ``graph``; used to resolve descriptors.
        :param permissive: Unused by this check.
        :return: ``True`` if the pattern was found, ``False`` otherwise.
        """
        map_entry = self.map_entry
        map_exit = graph.exit_node(map_entry)
        params = [dace.symbol(p) for p in map_entry.map.params]

        # Group the accessed subsets by data descriptor.
        inputs = dict()
        for _, _, _, _, m in graph.out_edges(map_entry):
            if not m.data:
                continue
            inputs.setdefault(sdfg.arrays[m.data], []).append(m.subset)

        outer_product_found = False
        for desc, accesses in inputs.items():
            if isinstance(desc, dace.data.Scalar):
                continue
            if not isinstance(desc, (dace.data.Array, dace.data.View)):
                return False
            if list(desc.shape) == [1]:
                continue
            for a in accesses:
                unmatched_indices = set(params)
                for idx in a.min_element():
                    if not isinstance(idx, sympy.Symbol):
                        return False
                    unmatched_indices.discard(idx)
                # An input that uses every map parameter is not an
                # outer-product operand.
                if not unmatched_indices:
                    return False
                outer_product_found = True

        outputs = dict()
        for _, _, _, _, m in graph.in_edges(map_exit):
            if m.wcr:
                return False
            if not m.data:
                # Data-less (ordering-only) edge: nothing to check.
                continue
            outputs.setdefault(sdfg.arrays[m.data], []).append(m.subset)

        for desc, accesses in outputs.items():
            if not isinstance(desc, (dace.data.Array, dace.data.View)):
                return False
            for a in accesses:
                if a.num_elements() != 1:
                    return False
                unmatched_indices = set(params)
                for idx in a.min_element():
                    unmatched_indices.discard(idx)
                # Outputs, in contrast, must be indexed by all parameters.
                if unmatched_indices:
                    return False

        return outer_product_found
Example #10
0
    def can_be_applied(self,
                       graph: dace.SDFGState,
                       expr_index: int,
                       sdfg: dace.SDFG,
                       permissive: bool = False):
        """Decide whether the matched map is a plain element-wise map.

        The check fails if any of the following hold:

        * The map range mentions ``commsize``, ``Px`` or ``Py``.
        * A code node inside the map scope uses a map parameter as a free
          symbol.
        * A non-empty memlet leaving the map entry refers to something other
          than a Scalar or an Array/View, or an Array/View input whose shape
          is not ``[1]`` is not accessed at exactly one element whose indices
          mention every map parameter.
        * A memlet entering the map exit has write-conflict resolution, or an
          output is not an Array/View accessed at exactly one element whose
          indices mention every map parameter.

        Memlets that carry no data are ignored on both sides of the map.
        Previously only the entry side skipped them, so a data-less edge into
        the map exit raised ``KeyError`` on ``sdfg.arrays[None]``.

        :param graph: State containing the matched map.
        :param expr_index: Index of the matched pattern expression (unused).
        :param sdfg: SDFG that owns ``graph``; used to resolve descriptors.
        :param permissive: Unused by this check.
        :return: ``True`` if the map qualifies, ``False`` otherwise.
        """
        map_entry = self.map_entry
        map_exit = graph.exit_node(map_entry)
        params = [dace.symbol(p) for p in map_entry.map.params]

        range_symbols = map_entry.map.range.free_symbols
        if any(name in range_symbols for name in ("commsize", "Px", "Py")):
            return False

        # If the map iterators are used in the code of a Tasklet,
        # then we cannot flatten them (currently).
        # See, for example, samples/simple/mandelbrot.py
        for node in subgraph_from_maps(sdfg, graph, [map_entry]):
            if isinstance(node, dace.nodes.CodeNode):
                for p in params:
                    if str(p) in node.free_symbols:
                        return False

        def _is_elementwise_access(subset):
            """Return True if ``subset`` is a single element whose indices
            mention every map parameter."""
            if subset.num_elements() != 1:
                return False
            unmatched_indices = set(params)
            for idx in subset.min_element():
                unmatched_indices.discard(idx)
            return not unmatched_indices

        # Group the accessed subsets by data descriptor.
        inputs = dict()
        for _, _, _, _, m in graph.out_edges(map_entry):
            if not m.data:
                continue
            inputs.setdefault(sdfg.arrays[m.data], []).append(m.subset)

        for desc, accesses in inputs.items():
            if isinstance(desc, dace.data.Scalar):
                continue
            if not isinstance(desc, (dace.data.Array, dace.data.View)):
                return False
            if list(desc.shape) == [1]:
                continue
            for a in accesses:
                if not _is_elementwise_access(a):
                    return False

        outputs = dict()
        for _, _, _, _, m in graph.in_edges(map_exit):
            if m.wcr:
                return False
            if not m.data:
                # Data-less (ordering-only) edge: nothing to check.
                continue
            outputs.setdefault(sdfg.arrays[m.data], []).append(m.subset)

        for desc, accesses in outputs.items():
            if not isinstance(desc, (dace.data.Array, dace.data.View)):
                return False
            for a in accesses:
                if not _is_elementwise_access(a):
                    return False

        return True
Example #11
0
    def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG):
        """Rewrite the matched map to operate on block-distributed local data.

        Steps performed, in order:

        1. Compute new map parameters/ranges. A two-parameter map keeps its
           parameters and has its two range sizes divided by ``Px`` and
           ``Py``; any other map is replaced by a single ``__iflat``
           parameter over ``total_size / commsize``.
        2. Create a transient int32 scalar set to 0 by a tasklet; it feeds the
           ``_root`` connector of broadcasts.
        3. For every access node feeding the map entry: scalars go through a
           ``Bcast`` node; arrays go through a ``BlockCyclicScatter`` node
           into a new transient of shape
           ``[shape[0] // Px, shape[1] // Py]``, and the memlets leaving the
           map entry are retargeted to that transient.
        4. For every access node fed by the map exit: the map now writes to a
           new transient of the same block shape, which a
           ``BlockCyclicGather`` node copies into the original array.
        5. Install the new parameters and ranges on the map.

        :param graph: State containing the matched map; modified in place.
        :param sdfg: SDFG owning ``graph``; new transients are added to it.
        :raises NotImplementedError: If a map input/output is not an access
            node, has an unsupported descriptor type, or is not accessed in
            full by the connecting memlet.
        """
        map_entry = self.map_entry
        map_exit = graph.exit_node(map_entry)

        sz = dace.symbol('commsize',
                         dtype=dace.int32,
                         integer=True,
                         positive=True)
        Px = dace.symbol('Px', dtype=dace.int32, integer=True, positive=True)
        Py = dace.symbol('Py', dtype=dace.int32, integer=True, positive=True)

        from dace.data import _prod

        # NOTE: Maps with step in their ranges are currently not supported
        # NOTE(review): `strides` is assigned in both branches but not read
        # anywhere in this method.
        if len(map_entry.map.params) == 2:
            params = map_entry.map.params
            ranges = [None] * 2
            b, e, _ = map_entry.map.range[0]
            ranges[0] = (0, (e - b + 1) / Px - 1, 1)
            b, e, _ = map_entry.map.range[1]
            ranges[1] = (0, (e - b + 1) / Py - 1, 1)
            strides = [1]
        else:
            params = ['__iflat']
            sizes = map_entry.map.range.size_exact()
            total_size = _prod(sizes)
            ranges = [(0, (total_size) / sz - 1, 1)]
            strides = [_prod(sizes[i + 1:]) for i in range(len(sizes))]

        # Transient scalar holding the constant 0, used as the broadcast root.
        root_name = sdfg.temp_data_name()
        sdfg.add_scalar(root_name, dace.int32, transient=True)
        root_node = graph.add_access(root_name)
        root_tasklet = graph.add_tasklet('_set_root_', {}, {'__out'},
                                         '__out = 0')
        graph.add_edge(root_tasklet, '__out', root_node, None,
                       dace.Memlet.simple(root_name, '0'))

        from dace.libraries.mpi import Bcast
        from dace.libraries.pblas import BlockCyclicScatter, BlockCyclicGather

        # Collect the access nodes that feed the map and validate them.
        inputs = set()
        for src, _, _, _, m in graph.in_edges(map_entry):
            if not isinstance(src, nodes.AccessNode):
                raise NotImplementedError
            desc = src.desc(sdfg)
            if not isinstance(desc, (data.Scalar, data.Array)):
                raise NotImplementedError
            # The memlet must cover the whole container.
            if list(desc.shape) != m.src_subset.size_exact():
                # Second attempt
                # TODO: We need a solution for symbols not matching
                if str(list(desc.shape)) != str(m.src_subset.size_exact()):
                    raise NotImplementedError
            inputs.add(src)

        for inp in inputs:
            desc = inp.desc(sdfg)

            if isinstance(desc, data.Scalar):
                # Scalars: route inp -> Bcast -> second access node of the
                # same data, and feed the map from that second node.
                local_access = graph.add_access(inp.data)
                bcast_node = Bcast('_Bcast_')
                graph.add_edge(inp, None, bcast_node, '_inbuffer',
                               dace.Memlet.from_array(inp.data, desc))
                graph.add_edge(root_node, None, bcast_node, '_root',
                               dace.Memlet.simple(root_name, '0'))
                graph.add_edge(bcast_node, '_outbuffer', local_access, None,
                               dace.Memlet.from_array(inp.data, desc))
                for e in graph.edges_between(inp, map_entry):
                    graph.add_edge(local_access, None, map_entry, e.dst_conn,
                                   dace.Memlet.from_array(inp.data, desc))
                    graph.remove_edge(e)

            elif isinstance(desc, data.Array):

                # Local block of the array.
                # NOTE(review): indexes desc.shape[0] and desc.shape[1], so
                # this presumably expects arrays with at least two
                # dimensions - confirm.
                local_name, local_arr = sdfg.add_temp_transient(
                    [(desc.shape[0]) // Px, (desc.shape[1]) // Py],
                    dtype=desc.dtype,
                    storage=desc.storage)
                local_access = graph.add_access(local_name)
                # Two-element int32 array holding the block sizes, filled by
                # a tasklet.
                bsizes_name, bsizes_arr = sdfg.add_temp_transient(
                    (2, ), dtype=dace.int32)
                bsizes_access = graph.add_access(bsizes_name)
                bsizes_tasklet = nodes.Tasklet(
                    '_set_bsizes_', {}, {'__out'},
                    "__out[0] = {x}; __out[1] = {y}".format(
                        x=(desc.shape[0]) // Px, y=(desc.shape[1]) // Py))
                graph.add_edge(bsizes_tasklet, '__out', bsizes_access, None,
                               dace.Memlet.from_array(bsizes_name, bsizes_arr))
                # Nine-element int32 arrays receiving the scatter node's
                # '_gdescriptor' and '_ldescriptor' outputs.
                gdesc_name, gdesc_arr = sdfg.add_temp_transient(
                    (9, ), dtype=dace.int32)
                gdesc_access = graph.add_access(gdesc_name)
                ldesc_name, ldesc_arr = sdfg.add_temp_transient(
                    (9, ), dtype=dace.int32)
                ldesc_access = graph.add_access(ldesc_name)
                scatter_node = BlockCyclicScatter('_Scatter_')
                graph.add_edge(inp, None, scatter_node, '_inbuffer',
                               dace.Memlet.from_array(inp.data, desc))
                graph.add_edge(bsizes_access, None, scatter_node,
                               '_block_sizes',
                               dace.Memlet.from_array(bsizes_name, bsizes_arr))
                graph.add_edge(scatter_node, '_outbuffer', local_access, None,
                               dace.Memlet.from_array(local_name, local_arr))
                graph.add_edge(scatter_node, '_gdescriptor', gdesc_access,
                               None,
                               dace.Memlet.from_array(gdesc_name, gdesc_arr))
                graph.add_edge(scatter_node, '_ldescriptor', ldesc_access,
                               None,
                               dace.Memlet.from_array(ldesc_name, ldesc_arr))
                # Feed the map from the local block instead of the original.
                for e in graph.edges_between(inp, map_entry):
                    graph.add_edge(
                        local_access, None, map_entry, e.dst_conn,
                        dace.Memlet.from_array(local_name, local_arr))
                    graph.remove_edge(e)
                # Retarget memlets inside the map; their subsets are kept.
                for e in graph.out_edges(map_entry):
                    if e.data.data == inp.data:
                        e.data.data = local_name

            else:
                raise NotImplementedError

        # Collect the access nodes written by the map and validate them.
        outputs = set()
        for _, _, dst, _, m in graph.out_edges(map_exit):
            if not isinstance(dst, nodes.AccessNode):
                raise NotImplementedError
            desc = dst.desc(sdfg)
            if not isinstance(desc, data.Array):
                raise NotImplementedError
            # Falls back to `m.subset` when accessing `m.dst_subset` raises
            # AttributeError. A NotImplementedError raised inside the `try`
            # is not caught by this handler and propagates.
            try:
                if list(desc.shape) != m.dst_subset.size_exact():
                    # Second attempt
                    # TODO: We need a solution for symbols not matching
                    if str(list(desc.shape)) != str(m.dst_subset.size_exact()):
                        raise NotImplementedError
            except AttributeError:
                if list(desc.shape) != m.subset.size_exact():
                    # Second attempt
                    # TODO: We need a solution for symbols not matching
                    if str(list(desc.shape)) != str(m.subset.size_exact()):
                        raise NotImplementedError
            outputs.add(dst)

        for out in outputs:
            desc = out.desc(sdfg)
            # NOTE(review): `outputs` only receives nodes whose descriptor
            # passed the data.Array check above, so the Scalar and `else`
            # branches below look unreachable - confirm.
            if isinstance(desc, data.Scalar):
                raise NotImplementedError
            elif isinstance(desc, data.Array):
                local_name, local_arr = sdfg.add_temp_transient(
                    [(desc.shape[0]) // Px, (desc.shape[1]) // Py],
                    dtype=desc.dtype,
                    storage=desc.storage)
                local_access = graph.add_access(local_name)
                bsizes_name, bsizes_arr = sdfg.add_temp_transient(
                    (2, ), dtype=dace.int32)
                bsizes_access = graph.add_access(bsizes_name)
                bsizes_tasklet = nodes.Tasklet(
                    '_set_bsizes_', {}, {'__out'},
                    "__out[0] = {x}; __out[1] = {y}".format(
                        x=(desc.shape[0]) // Px, y=(desc.shape[1]) // Py))
                graph.add_edge(bsizes_tasklet, '__out', bsizes_access, None,
                               dace.Memlet.from_array(bsizes_name, bsizes_arr))
                # Despite its name, `scatter_node` holds the gather node here.
                scatter_node = BlockCyclicGather('_Gather_')
                graph.add_edge(local_access, None, scatter_node, '_inbuffer',
                               dace.Memlet.from_array(local_name, local_arr))
                graph.add_edge(bsizes_access, None, scatter_node,
                               '_block_sizes',
                               dace.Memlet.from_array(bsizes_name, bsizes_arr))
                graph.add_edge(scatter_node, '_outbuffer', out, None,
                               dace.Memlet.from_array(out.data, desc))

                # Make the map write the local block instead of the original.
                for e in graph.edges_between(map_exit, out):
                    graph.add_edge(
                        map_exit, e.src_conn, local_access, None,
                        dace.Memlet.from_array(local_name, local_arr))
                    graph.remove_edge(e)
                for e in graph.in_edges(map_exit):
                    if e.data.data == out.data:
                        e.data.data = local_name
            else:
                raise NotImplementedError

        # Finally install the reduced iteration space on the map.
        map_entry.map.params = params
        map_entry.map.range = subsets.Range(ranges)
Example #12
0
    def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG):
        """Rewrite the matched map to operate on scattered local data.

        Steps performed, in order:

        1. Compute new map parameters/ranges. A one-parameter map keeps its
           parameter and has its range size divided by ``commsize``; any
           other map is replaced by a single ``__iflat`` parameter over
           ``total_size / commsize``.
        2. Create a transient int32 scalar set to 0 by a tasklet; it feeds the
           ``_root`` connector of every communication node.
        3. For every access node feeding the map entry: scalars go through a
           ``Bcast`` node; arrays go through a ``Scatter`` node into a new
           one-dimensional transient of ``floor(total_size / commsize)``
           elements, and matching memlets leaving the map entry are replaced
           by an access to that transient at the first map parameter.
        4. For every access node fed by the map exit: the map now writes to a
           new transient of the same local size, which a ``Gather`` node
           copies into the original array.
        5. Install the new parameters and ranges on the map.

        :param graph: State containing the matched map; modified in place.
        :param sdfg: SDFG owning ``graph``; new transients are added to it.
        :raises NotImplementedError: If a map input/output is not an access
            node, has an unsupported descriptor type, or is not accessed in
            full by the connecting memlet.
        """
        map_entry = self.map_entry
        map_exit = graph.exit_node(map_entry)

        sz = dace.symbol('commsize', dtype=dace.int32)

        def _prod(sequence):
            # Product of all items; 1 for an empty sequence.
            return reduce(lambda a, b: a * b, sequence, 1)

        # NOTE: Maps with step in their ranges are currently not supported
        # NOTE(review): `strides` is assigned in both branches but not read
        # anywhere in this method.
        if len(map_entry.map.params) == 1:
            params = map_entry.map.params
            ranges = [(0, (e - b + 1) / sz - 1, 1)
                      for b, e, _ in map_entry.map.range]
            strides = [1]
        else:
            params = ['__iflat']
            sizes = map_entry.map.range.size_exact()
            total_size = _prod(sizes)
            ranges = [(0, (total_size) / sz - 1, 1)]
            strides = [_prod(sizes[i + 1:]) for i in range(len(sizes))]

        # Transient scalar holding the constant 0, used as the root rank.
        root_name = sdfg.temp_data_name()
        sdfg.add_scalar(root_name, dace.int32, transient=True)
        root_node = graph.add_access(root_name)
        root_tasklet = graph.add_tasklet('_set_root_', {}, {'__out'},
                                         '__out = 0')
        graph.add_edge(root_tasklet, '__out', root_node, None,
                       dace.Memlet.simple(root_name, '0'))

        from dace.libraries.mpi import Bcast, Scatter, Gather

        # Collect the access nodes that feed the map and validate them.
        inputs = set()
        for src, _, _, _, m in graph.in_edges(map_entry):
            if not isinstance(src, nodes.AccessNode):
                raise NotImplementedError
            desc = src.desc(sdfg)
            if not isinstance(desc, (data.Scalar, data.Array)):
                raise NotImplementedError
            # The memlet must cover the whole container.
            if list(desc.shape) != m.src_subset.size_exact():
                # Second attempt
                # TODO: We need a solution for symbols not matching
                if str(list(desc.shape)) != str(m.src_subset.size_exact()):
                    raise NotImplementedError
            inputs.add(src)

        for inp in inputs:
            desc = inp.desc(sdfg)

            if isinstance(desc, data.Scalar):
                # Scalars: route inp -> Bcast -> second access node of the
                # same data, and feed the map from that second node.
                local_access = graph.add_access(inp.data)
                bcast_node = Bcast('_Bcast_')
                graph.add_edge(inp, None, bcast_node, '_inbuffer',
                               dace.Memlet.from_array(inp.data, desc))
                graph.add_edge(root_node, None, bcast_node, '_root',
                               dace.Memlet.simple(root_name, '0'))
                graph.add_edge(bcast_node, '_outbuffer', local_access, None,
                               dace.Memlet.from_array(inp.data, desc))
                for e in graph.edges_between(inp, map_entry):
                    graph.add_edge(local_access, None, map_entry, e.dst_conn,
                                   dace.Memlet.from_array(inp.data, desc))
                    graph.remove_edge(e)

            elif isinstance(desc, data.Array):

                # One-dimensional local chunk of the array.
                local_name, local_arr = sdfg.add_temp_transient(
                    [sympy.floor(desc.total_size / sz)],
                    dtype=desc.dtype,
                    storage=desc.storage)
                local_access = graph.add_access(local_name)
                scatter_node = Scatter('_Scatter_')
                graph.add_edge(inp, None, scatter_node, '_inbuffer',
                               dace.Memlet.from_array(inp.data, desc))
                graph.add_edge(root_node, None, scatter_node, '_root',
                               dace.Memlet.simple(root_name, '0'))
                graph.add_edge(scatter_node, '_outbuffer', local_access, None,
                               dace.Memlet.from_array(local_name, local_arr))
                # Feed the map from the local chunk instead of the original.
                for e in graph.edges_between(inp, map_entry):
                    graph.add_edge(
                        local_access, None, map_entry, e.dst_conn,
                        dace.Memlet.from_array(local_name, local_arr))
                    graph.remove_edge(e)
                # Inside the map, replace each matching memlet by an access
                # to the local chunk at the first map parameter.
                for e in graph.out_edges(map_entry):
                    if e.data.data == inp.data:
                        e.data = dace.Memlet.simple(local_name, params[0])

            else:
                raise NotImplementedError

        # Collect the access nodes written by the map and validate them.
        outputs = set()
        for _, _, dst, _, m in graph.out_edges(map_exit):
            if not isinstance(dst, nodes.AccessNode):
                raise NotImplementedError
            desc = dst.desc(sdfg)
            if not isinstance(desc, data.Array):
                raise NotImplementedError
            # Falls back to `m.subset` when accessing `m.dst_subset` raises
            # AttributeError. A NotImplementedError raised inside the `try`
            # is not caught by this handler and propagates.
            try:
                if list(desc.shape) != m.dst_subset.size_exact():
                    # Second attempt
                    # TODO: We need a solution for symbols not matching
                    if str(list(desc.shape)) != str(m.dst_subset.size_exact()):
                        raise NotImplementedError
            except AttributeError:
                if list(desc.shape) != m.subset.size_exact():
                    # Second attempt
                    # TODO: We need a solution for symbols not matching
                    if str(list(desc.shape)) != str(m.subset.size_exact()):
                        raise NotImplementedError
            outputs.add(dst)

        for out in outputs:
            desc = out.desc(sdfg)
            # NOTE(review): `outputs` only receives nodes whose descriptor
            # passed the data.Array check above, so the Scalar and `else`
            # branches below look unreachable - confirm.
            if isinstance(desc, data.Scalar):
                raise NotImplementedError
            elif isinstance(desc, data.Array):
                local_name, local_arr = sdfg.add_temp_transient(
                    [sympy.floor(desc.total_size / sz)],
                    dtype=desc.dtype,
                    storage=desc.storage)
                local_access = graph.add_access(local_name)
                # Despite its name, `scatter_node` holds the gather node here.
                scatter_node = Gather('_Gather_')
                graph.add_edge(local_access, None, scatter_node, '_inbuffer',
                               dace.Memlet.from_array(local_name, local_arr))
                graph.add_edge(root_node, None, scatter_node, '_root',
                               dace.Memlet.simple(root_name, '0'))
                graph.add_edge(scatter_node, '_outbuffer', out, None,
                               dace.Memlet.from_array(out.data, desc))
                # Make the map write the local chunk instead of the original.
                for e in graph.edges_between(map_exit, out):
                    graph.add_edge(
                        map_exit, e.src_conn, local_access, None,
                        dace.Memlet.from_array(local_name, local_arr))
                    graph.remove_edge(e)
                for e in graph.in_edges(map_exit):
                    if e.data.data == out.data:
                        e.data = dace.Memlet.simple(local_name, params[0])
            else:
                raise NotImplementedError

        # Finally install the reduced iteration space on the map.
        map_entry.map.params = params
        map_entry.map.range = subsets.Range(ranges)
Example #13
0
def make_fpga_sdfg_independent():
    """
    Assemble the ``nested_sdfg_kernels`` SDFG.

    Three states are chained with unconditional interstate edges:

    1. ``copy_to_device`` copies the host arrays ``x``, ``y``, ``v`` and ``w``
       into their transient ``device_*`` counterparts (FPGA global storage).
    2. ``I_do_not_want_to_be_fpga_kernel`` (its location sets
       ``is_FPGA_kernel`` to ``False``) holds two nested SDFGs: the one built
       by ``make_vec_add_sdfg`` reads ``device_x``/``device_y`` and writes
       ``device_z``; the one built by ``make_vec_mul_sdfg`` reads
       ``device_v``/``device_w`` and writes ``device_u``.
    3. ``copy_to_host`` copies ``device_z`` and ``device_u`` back to the host
       arrays ``z`` and ``u``.

    :return: The SDFG, after scope connectors were filled and it was validated.
    """

    n = dace.symbol("n")
    vecWidth = 4
    vecType = dace.vector(dace.float32, vecWidth)
    sdfg = dace.SDFG("nested_sdfg_kernels")

    def full_range(node):
        # Memlet covering the whole container referenced by ``node``.
        return dace.Memlet(f"{node.data}[0:{n}/{vecWidth}]")

    def declare_host(name):
        # Non-transient array of n / vecWidth vector elements.
        sdfg.add_array(name, shape=[n / vecWidth], dtype=vecType)

    def declare_device(name):
        # Transient array of the same shape in FPGA global memory.
        sdfg.add_array(name,
                       shape=[n / vecWidth],
                       dtype=vecType,
                       storage=dace.dtypes.StorageType.FPGA_Global,
                       transient=True)

    ###########################################################################
    # Copy data to FPGA

    copy_in_state = sdfg.add_state("copy_to_device")

    input_names = ("x", "y", "v", "w")
    for host_name in input_names:
        declare_host(host_name)
    host_reads = [copy_in_state.add_read(name) for name in input_names]

    for host_name in input_names:
        declare_device("device_" + host_name)
    device_writes = [
        copy_in_state.add_write("device_" + name) for name in input_names
    ]

    for host_node, device_node in zip(host_reads, device_writes):
        copy_in_state.add_memlet_path(host_node,
                                      device_node,
                                      memlet=full_range(host_node))

    in_device_x, in_device_y, in_device_v, in_device_w = device_writes

    ###########################################################################
    # Copy data from FPGA

    output_names = ("z", "u")
    for host_name in output_names:
        declare_host(host_name)

    copy_out_state = sdfg.add_state("copy_to_host")

    for host_name in output_names:
        declare_device("device_" + host_name)

    device_reads = []
    host_writes = []
    for host_name in output_names:
        device_reads.append(copy_out_state.add_read("device_" + host_name))
        host_writes.append(copy_out_state.add_write(host_name))

    for device_node, host_node in zip(device_reads, host_writes):
        copy_out_state.add_memlet_path(device_node,
                                       host_node,
                                       memlet=full_range(host_node))

    out_device_z, out_device_u = device_reads

    ###########################################################################
    # Non-FPGA state

    non_fpga_state = sdfg.add_state("I_do_not_want_to_be_fpga_kernel")
    non_fpga_state.location["is_FPGA_kernel"] = False

    def nest(inner_sdfg, first_input, second_input, result):
        # Insert ``inner_sdfg`` with the symbol mapping size -> n and wire its
        # two input connectors and its output connector.
        node = non_fpga_state.add_nested_sdfg(inner_sdfg, sdfg,
                                              {"_device_x", "_device_y"},
                                              {"_device_z"}, {"size": "n"})
        for connector, source in (("_device_x", first_input),
                                  ("_device_y", second_input)):
            non_fpga_state.add_memlet_path(source,
                                           node,
                                           dst_conn=connector,
                                           memlet=full_range(source))
        non_fpga_state.add_memlet_path(node,
                                       result,
                                       src_conn="_device_z",
                                       memlet=full_range(result))

    # Vector addition on (x, y) -> z, then vector multiplication on
    # (v, w) -> u
    nest(make_vec_add_sdfg(), in_device_x, in_device_y, out_device_z)
    nest(make_vec_mul_sdfg(), in_device_v, in_device_w, out_device_u)

    ######################################
    # Interstate edges
    for src_state, dst_state in ((copy_in_state, non_fpga_state),
                                 (non_fpga_state, copy_out_state)):
        sdfg.add_edge(src_state, dst_state, dace.sdfg.sdfg.InterstateEdge())
    sdfg.fill_scope_connectors()
    sdfg.validate()

    return sdfg
Example #14
0
def make_vec_add_sdfg(dtype=dace.float32):
    """
    Build the ``vec_add`` SDFG: element-wise ``_device_z = _device_x + _device_y``.

    The three arrays hold ``size / 4`` vector elements (4 lanes of ``dtype``
    each) in FPGA global storage; a single map scheduled as ``FPGA_Device``
    iterates over them and applies the addition tasklet.

    :param dtype: Base element type of the 4-wide vector type.
    :return: The SDFG, after scope connectors were filled and it was validated.
    """

    # Vector addition SDFG

    vecWidth = 4
    n = dace.symbol("size")
    sdfg = dace.SDFG("vec_add")
    vecType = dace.vector(dtype, vecWidth)
    state = sdfg.add_state("vec_add_state")

    for array_name in ('_device_x', '_device_y', '_device_z'):
        sdfg.add_array(array_name,
                       shape=[n / vecWidth],
                       dtype=vecType,
                       storage=dace.dtypes.StorageType.FPGA_Global)

    x = state.add_read("_device_x")
    y = state.add_read("_device_y")
    z = state.add_write("_device_z")

    # ---------- ----------
    # COMPUTE
    # ---------- ----------
    map_entry, map_exit = state.add_map(
        'vecAdd_map', {'i': f'0:{n}/{vecWidth}'},
        schedule=dace.dtypes.ScheduleType.FPGA_Device)

    tasklet = state.add_tasklet('vec_add_task', ['x_con', 'y_con'],
                                ['z_con'], 'z_con = x_con + y_con')

    # Inputs: one element of each operand per map iteration
    for source, connector in ((x, 'x_con'), (y, 'y_con')):
        state.add_memlet_path(source,
                              map_entry,
                              tasklet,
                              dst_conn=connector,
                              memlet=dace.Memlet(f"{source.data}[i]"))

    # Output: one element of the result per map iteration
    state.add_memlet_path(tasklet,
                          map_exit,
                          z,
                          src_conn='z_con',
                          memlet=dace.Memlet(f"{z.data}[i]"))

    #########
    # Validate
    sdfg.fill_scope_connectors()
    sdfg.validate()
    return sdfg
Example #15
0
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
import dace
import numpy as np

M = dace.symbol("M")
K = dace.symbol("K")


# Writes the transpose of A, incremented by one, into B:
# B[j, i] = A[i, j] + 1 for every (i, j) in the M x K map range.
# A has shape (M, K) and B has shape (K, M), both float32.
@dace.program
def transpose_add(A: dace.float32[M, K], B: dace.float32[K, M]):
    for i, j in dace.map[0:M, 0:K]:
        B[j, i] = A[i, j] + 1


def test_inline_scalar():
    """
    Run ``transpose_add`` on a random 25 x 24 input and check that the output
    equals the transposed input plus one (norm of the residual below 1e-5).
    """
    rows, cols = 25, 24
    K.set(cols)
    M.set(rows)

    source = np.random.rand(rows, cols).astype(np.float32)
    target = np.random.rand(cols, rows).astype(np.float32)

    transpose_add(source, target)

    # target should be source^T + 1, so this residual should vanish
    residual = source.transpose() - target + 1
    diff = np.linalg.norm(residual)
    print('Difference:', diff)
    assert diff < 1e-5


if __name__ == '__main__':
    test_inline_scalar()
Example #16
0
def make_sdfg(implementation, dtype, storage=dace.StorageType.Default):
    """
    Build an SDFG that computes a dot product with the BLAS ``Dot`` library
    node.

    With the default storage, the state ``dataflow`` reads ``x`` and ``y``
    (length ``n``) and writes ``result`` (length 1) directly. With any other
    storage, the library node works on transient ``x_device``, ``y_device``
    and ``result_device`` arrays in that storage, and two extra states are
    added: ``copy_to_device`` (before) copies ``x`` and ``y`` in, and
    ``copy_to_host`` (after) copies the result back to ``result``.

    :param implementation: Value assigned to the library node's
                           ``implementation`` attribute; also part of the
                           SDFG name.
    :param dtype: Element type of all arrays; also part of the SDFG name.
    :param storage: Storage type of the arrays the library node operates on.
    :return: The constructed SDFG (not validated here).
    """

    n = dace.symbol("n")

    # Non-default storage means the compute arrays are device-side
    # transients with a "_device" suffix.
    on_device = storage != dace.StorageType.Default
    suffix = "_device" if on_device else ""
    transient = on_device

    sdfg = dace.SDFG("dot_product_{}_{}".format(implementation, dtype))
    state = sdfg.add_state("dataflow")

    sdfg.add_array("x" + suffix, [n],
                   dtype,
                   storage=storage,
                   transient=transient)
    sdfg.add_array("y" + suffix, [n],
                   dtype,
                   storage=storage,
                   transient=transient)
    sdfg.add_array("result" + suffix, [1],
                   dtype,
                   storage=storage,
                   transient=transient)

    x = state.add_read("x" + suffix)
    y = state.add_read("y" + suffix)
    result = state.add_write("result" + suffix)

    dot_node = blas.nodes.dot.Dot("dot")
    dot_node.implementation = implementation

    state.add_memlet_path(x,
                          dot_node,
                          dst_conn="_x",
                          memlet=Memlet.simple(x, "0:n", num_accesses=n))
    state.add_memlet_path(y,
                          dot_node,
                          dst_conn="_y",
                          memlet=Memlet.simple(y, "0:n", num_accesses=n))
    # TODO: remove -1 once this no longer triggers a write in the codegen.
    state.add_memlet_path(dot_node,
                          result,
                          src_conn="_result",
                          memlet=Memlet.simple(result, "0", num_accesses=-1))

    if on_device:

        sdfg.add_array("x", [n], dtype)
        sdfg.add_array("y", [n], dtype)
        sdfg.add_array("result", [1], dtype)

        init_state = sdfg.add_state("copy_to_device")
        sdfg.add_edge(init_state, state, dace.InterstateEdge())

        x_host = init_state.add_read("x")
        y_host = init_state.add_read("y")
        x_device = init_state.add_write("x" + suffix)
        y_device = init_state.add_write("y" + suffix)
        init_state.add_memlet_path(x_host,
                                   x_device,
                                   memlet=Memlet.simple(x_host,
                                                        "0:n",
                                                        num_accesses=n))
        init_state.add_memlet_path(y_host,
                                   y_device,
                                   memlet=Memlet.simple(y_host,
                                                        "0:n",
                                                        num_accesses=n))

        finalize_state = sdfg.add_state("copy_to_host")
        sdfg.add_edge(state, finalize_state, dace.InterstateEdge())

        # The device result is the source of the copy-back and the host
        # result its destination, so the former is a read node and the
        # latter a write node (the roles were previously swapped).
        result_device = finalize_state.add_read("result" + suffix)
        result_host = finalize_state.add_write("result")
        finalize_state.add_memlet_path(result_device,
                                       result_host,
                                       memlet=Memlet.simple(result_device,
                                                            "0",
                                                            num_accesses=1))

    return sdfg
Example #17
0
#!/usr/bin/env python
from __future__ import print_function

import argparse
import dace
import numpy as np

M = dace.symbol('M')
K = dace.symbol('K')
N = dace.symbol('N')


# Matrix product written as an explicit multiply step followed by a reduction:
# A is (M, K), B is (K, N) and C is (M, N), all float64.
@dace.program(dace.float64[M, K], dace.float64[K, N], dace.float64[M, N])
def gemm(A, B, C):
    # Transient variable
    # Holds every individual product A[i, k] * B[k, j] at tmp[i, j, k].
    tmp = dace.define_local([M, N, K], dtype=A.dtype)

    # NOTE(review): the index `ignore` is never used inside the loop, so the
    # multiplication map below is wrapped in a 1024-iteration map that writes
    # the same values each time -- confirm this repetition is intentional.
    for ignore in dace.map[0:1024]:
        
        @dace.map(_[0:M, 0:N, 0:K])
        def multiplication(i, j, k):
            in_A << A[i, k]
            in_B << B[k, j]
            out >> tmp[i, j, k]

            out = in_A * in_B

    # Sum tmp over its last axis (k) into C, starting from identity 0.
    dace.reduce(lambda a, b: a + b, tmp, C, axis=2, identity=0)


if __name__ == "__main__":
Example #18
0
def make_sdfg(tasklet_code=None,
              name="veclen_copy_conversion",
              dtype=dace.float32,
              veclen=16):
    """
    Build an SDFG that streams a vectorized array through a scalar tasklet.

    Three states are created: ``<name>_pre`` copies host array ``a`` to
    ``a_device``, ``<name>`` performs the computation, and ``<name>_post``
    copies ``b_device`` back to host array ``b``. In the compute state,
    ``a_device`` is copied into the stream ``a_stream``; inside a map over
    ``N // veclen`` iterations each stream element is copied into the
    ``veclen``-element buffer ``a_buffer``, an unrolled map applies the
    tasklet to every buffer element writing ``b_buffer``, and the buffer is
    pushed to ``b_stream``, which is finally copied into ``b_device``.

    :param tasklet_code: Tasklet code using connectors ``_in`` and ``_out``;
                         defaults to ``"_out = _in"``.
    :param name: Name of the SDFG; also used for the compute state, the
                 tasklet and the maps, and as prefix of the pre/post states.
    :param dtype: Base element type of the buffers and of the vector type
                  used for the arrays and streams.
    :param veclen: Number of elements per vector.
    :return: The constructed SDFG (not validated here).
    """

    # Derive the vector type from ``dtype`` so that the arrays and streams
    # always agree with the element type of the local buffers (this used to
    # be hard-coded to float32, ignoring the ``dtype`` argument).
    vtype = dace.vector(dtype, veclen)

    if tasklet_code is None:
        tasklet_code = "_out = _in"

    n = dace.symbol("N")

    sdfg = dace.SDFG(name)

    pre_state = sdfg.add_state(name + "_pre")
    state = sdfg.add_state(name)
    post_state = sdfg.add_state(name + "_post")
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    _, desc_input_host = sdfg.add_array("a", (n // veclen, ), vtype)
    _, desc_output_host = sdfg.add_array("b", (n // veclen, ), vtype)
    # The device descriptors are copies of the host ones, moved to FPGA
    # global memory (input on bank 0, output on bank 1) and made transient.
    desc_input_device = copy.copy(desc_input_host)
    desc_input_device.storage = dace.StorageType.FPGA_Global
    desc_input_device.location["bank"] = 0
    desc_input_device.transient = True
    desc_output_device = copy.copy(desc_output_host)
    desc_output_device.storage = dace.StorageType.FPGA_Global
    desc_output_device.location["bank"] = 1
    desc_output_device.transient = True
    sdfg.add_datadesc("a_device", desc_input_device)
    sdfg.add_datadesc("b_device", desc_output_device)

    # Host to device
    pre_read = pre_state.add_read("a")
    pre_write = pre_state.add_write("a_device")
    pre_state.add_memlet_path(pre_read,
                              pre_write,
                              memlet=dace.Memlet(pre_write.data, None))

    # Device to host
    post_read = post_state.add_read("b_device")
    post_write = post_state.add_write("b")
    post_state.add_memlet_path(post_read,
                               post_write,
                               memlet=dace.Memlet(post_write.data, None))

    # Compute state
    read_memory = state.add_read("a_device")
    write_memory = state.add_write("b_device")

    # Memory streams
    sdfg.add_stream("a_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    sdfg.add_stream("b_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    produce_input_stream = state.add_write("a_stream")
    consume_input_stream = state.add_read("a_stream")
    produce_output_stream = state.add_write("b_stream")
    # This node is the source of the stream-to-memory copy, hence a read
    # node (it was previously created with add_write).
    consume_output_stream = state.add_read("b_stream")

    tasklet = state.add_tasklet(name, {"_in"}, {"_out"}, tasklet_code)

    # Iterative map
    entry, exit = state.add_map(name, {
        "i": "0:N//{}".format(veclen),
    },
                                schedule=dace.ScheduleType.FPGA_Device)

    # Unrolled map
    unroll_entry, unroll_exit = state.add_map(
        name + "_unroll", {"u": "0:{}".format(veclen)},
        schedule=dace.ScheduleType.FPGA_Device,
        unroll=True)

    # Container-to-container copies between arrays and streams
    state.add_memlet_path(read_memory,
                          produce_input_stream,
                          memlet=dace.Memlet(read_memory.data))
    state.add_memlet_path(consume_output_stream,
                          write_memory,
                          memlet=dace.Memlet(write_memory.data))

    # Container-to-container copy from vectorized stream to non-vectorized
    # buffer
    sdfg.add_array("a_buffer", (veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    sdfg.add_array("b_buffer", (veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    a_buffer = state.add_access("a_buffer")
    b_buffer = state.add_access("b_buffer")

    # Input stream to buffer
    state.add_memlet_path(consume_input_stream,
                          entry,
                          a_buffer,
                          memlet=dace.Memlet.simple(
                              consume_input_stream.data,
                              "0",
                              other_subset_str="0:{}".format(veclen)))
    # Buffer to tasklet
    state.add_memlet_path(a_buffer,
                          unroll_entry,
                          tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet.simple(a_buffer.data,
                                                    "u",
                                                    num_accesses=1))

    # Tasklet to buffer
    state.add_memlet_path(tasklet,
                          unroll_exit,
                          b_buffer,
                          src_conn="_out",
                          memlet=dace.Memlet.simple(b_buffer.data,
                                                    "u",
                                                    num_accesses=1))

    # Buffer to output stream
    state.add_memlet_path(b_buffer,
                          exit,
                          produce_output_stream,
                          memlet=dace.Memlet.simple(
                              produce_output_stream.data,
                              "0",
                              other_subset_str="0:{}".format(veclen),
                              num_accesses=1))

    return sdfg
Example #19
0
def test_indirection_with_reindex(language):
    """
    Check indirect accesses through subset-offset connectors.

    Three initialization states store 1, 2 and 3 in the single-element
    transients ``index_0``, ``index_1`` and ``index_2``. The compute state
    runs a tasklet ``_out[_index_2] = _A[_index_0] + _A[_index_1]`` whose
    ``_A`` and ``_out`` connectors are connected with the subset ``S:N``.
    After promoting scalars to symbols and simplifying, the SDFG is run with
    ``N=10`` and ``S=5`` and ``out[8]`` must equal ``A[6] + A[7]``.

    :param language: Tasklet language of the compute tasklet; a trailing
                     semicolon is appended to its code for ``Language.CPP``.
    """

    N = dace.symbol('N')
    S = dace.symbol('S')
    index_names = [f"index_{k}" for k in range(3)]

    sdfg = dace.SDFG("test_indirection_with_reindex")
    sdfg.add_array('A', shape=[N], dtype=dace.float32, transient=False)
    for index_name in index_names:
        sdfg.add_array(index_name,
                       shape=[1],
                       dtype=dace.int32,
                       transient=True)
    sdfg.add_array('out', shape=[N], dtype=dace.float32, transient=False)
    sdfg.add_symbol('S', S.dtype)

    # Three init states followed by the compute state, chained linearly
    init_states = [sdfg.add_state() for _ in index_names]
    state_compute = sdfg.add_state()
    chain = init_states + [state_compute]
    for before, after in zip(chain, chain[1:]):
        sdfg.add_edge(before, after, dace.InterstateEdge())

    # Init state number k (1-based) writes the constant k into index_{k-1}
    for value, (init_state, index_name) in enumerate(zip(
            init_states, index_names),
                                                     start=1):
        init_tasklet = init_state.add_tasklet(name=f"init{value}",
                                              inputs=[],
                                              outputs=["out"],
                                              code=f"out = {value};",
                                              language=dace.Language.CPP)
        index_write = init_state.add_write(index_name)
        init_state.add_memlet_path(init_tasklet,
                                   index_write,
                                   src_conn="out",
                                   memlet=dace.Memlet(expr=index_name,
                                                      subset="0"))

    semicolon = ';' if language == dace.Language.CPP else ''
    tasklet = state_compute.add_tasklet(
        name="add",
        inputs=["_A", "_index_0", "_index_1", "_index_2"],
        outputs=["_out"],
        code=f"_out[_index_2] = _A[_index_0] + _A[_index_1]{semicolon}",
        language=language)

    # Data input, offset by S
    a_read = state_compute.add_read("A")
    state_compute.add_memlet_path(a_read,
                                  tasklet,
                                  dst_conn="_A",
                                  memlet=dace.Memlet(expr="A", subset="S:N"))

    # Index inputs
    for index_name in index_names:
        index_read = state_compute.add_read(index_name)
        state_compute.add_memlet_path(index_read,
                                      tasklet,
                                      dst_conn="_" + index_name,
                                      memlet=dace.Memlet(expr=index_name,
                                                         subset="0"))

    # Data output, offset by S
    out_write = state_compute.add_write("out")
    state_compute.add_memlet_path(tasklet,
                                  out_write,
                                  src_conn="_out",
                                  memlet=dace.Memlet(expr="out",
                                                     subset="S:N"))

    scalar_to_symbol.promote_scalars_to_symbols(sdfg)
    sdfg.simplify()

    A = np.array(list(range(10)), dtype=np.float32)
    out = np.zeros((10, ), dtype=np.float32)
    sdfg(A=A, out=out, N=10, S=5)

    assert (np.allclose(A[6] + A[7], out[8]))
0
    sdfg.validate()


def test_reverse_copy():
    """
    Copy the second-to-last row of a 20 x 20 array onto its last row using
    negative indices inside a DaCe program, and compare against NumPy.
    """
    @dace.program
    def redarrtest(p: dace.float64[20, 20]):
        p[-1, :] = p[-2, :]

    actual = np.random.rand(20, 20)

    # Reference result computed with plain NumPy on a copy of the input
    expected = actual.copy()
    expected[-1, :] = expected[-2, :]

    # The DaCe program modifies its argument in place
    redarrtest(actual)
    assert np.allclose(actual, expected)


C_in, C_out, H, K, N, W = (dace.symbol(s, dace.int64)
                           for s in ('C_in', 'C_out', 'H', 'K', 'N', 'W'))


# Deep learning convolutional operator (stride = 1)
@dace.program
def conv2d(input: dace.float32[N, H, W, C_in],
           weights: dace.float32[K, K, C_in, C_out]):
    output = np.ndarray((N, H - K + 1, W - K + 1, C_out), dtype=np.float32)

    # Loop structure adapted from https://github.com/SkalskiP/ILearnDeepLearning.py/blob/ba0b5ba589d4e656141995e8d1a06d44db6ce58d/01_mysteries_of_neural_networks/06_numpy_convolutional_neural_net/src/layers/convolutional.py#L88
    # for i, j in dace.map[0:H-K+1, 0:W-K+1]:
    for i in range(H - K + 1):
        for j in range(W - K + 1):
            output[:, i, j, :] = np.sum(
                input[:, i:i + K, j:j + K, :, np.newaxis] *
Example #21
0
import math
import dace
try:
    import polybench
except ImportError:
    polybench = None

NI = dace.symbol('NI')
NJ = dace.symbol('NJ')
NK = dace.symbol('NK')
NL = dace.symbol('NL')

#datatypes = [dace.float64, dace.int32, dace.float32]
datatype = dace.float64

# Dataset sizes
sizes = [{
    NI: 16,
    NJ: 18,
    NK: 22,
    NL: 24
}, {
    NI: 40,
    NJ: 50,
    NK: 70,
    NL: 80
}, {
    NI: 180,
    NJ: 190,
    NK: 210,
    NL: 220
Example #22
0
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved.
import dace
import numpy as np

# Declaration of symbolic variables
N, BS = (dace.symbol(name) for name in ['N', 'BS'])


# Subtracts the sigmaRS* blocks from the matching H* blocks, one index n of
# the leading dimension at a time. All six arguments are complex128 arrays of
# shape (N, BS, BS); HD, HE and HF are updated in place.
@dace.program
def seq_cond(HD: dace.complex128[N, BS, BS], HE: dace.complex128[N, BS, BS],
             HF: dace.complex128[N, BS, BS], sigmaRSD: dace.complex128[N, BS,
                                                                       BS],
             sigmaRSE: dace.complex128[N, BS,
                                       BS], sigmaRSF: dace.complex128[N, BS,
                                                                      BS]):

    for n in range(N):
        # HE: subtract for all but the last n; the last block is overwritten
        # with the negated sigmaRSE block.
        if n < N - 1:
            HE[n] -= sigmaRSE[n]
        else:
            HE[n] = -sigmaRSE[n]
        # HF: subtract for all but the first n; the first block is
        # overwritten with the negated sigmaRSF block.
        if n > 0:
            HF[n] -= sigmaRSF[n]
        else:
            HF[n] = -sigmaRSF[n]
        # HD: subtract unconditionally.
        HD[n] = HD[n] - sigmaRSD[n]


def test():
    """Compile ``seq_cond``; the program is only compiled here, not run."""
    seq_cond.compile()
Example #23
0
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
""" A test for the ElementWiseArrayOperation transformation. """

import dace
import numpy as np
from dace.transformation.dataflow import ElementWiseArrayOperation
import pytest

N = dace.symbol('N', dtype=dace.int64)


# Returns the product A * B of two float64 arrays of length N.
@dace.program
def eao_mpi(A: dace.float64[N], B: dace.float64[N]):
    return A * B


@pytest.mark.mpi
def test_eao_mpi():
    from mpi4py import MPI as MPI4PY
    comm = MPI4PY.COMM_WORLD
    rank = comm.Get_rank()
    commsize = comm.Get_size()
    mpi_sdfg = None
    if commsize < 2:
        raise ValueError("This test is supposed to be run with at least two processes!")
    for r in range(0, commsize):
        if r == rank:
            mpi_sdfg = eao_mpi.to_sdfg(simplify=True)
            mpi_sdfg.apply_transformations(ElementWiseArrayOperation)
            mpi_exec = mpi_sdfg.compile()
        comm.Barrier()
Example #24
0
    mpi_sdfg(x=A, y=B, src=src, dest=dest, tag=tag, n=size)
    # now B should be an array of size, containing srank
    if not np.allclose(B, np.full(size, srank, dtype=dtype)):
        raise (ValueError("The received values are not what I expected."))


# TODO: The test deadlocks in the CI (Ubuntu 18.04, MPICH 3.3a2)
# but works fine in up-to-date systems, including when using pytest.
@pytest.mark.skip
def test_mpi():
    """Run ``_test_mpi`` on the SDFG built by ``make_sdfg`` for float64.

    Unconditionally skipped via ``pytest.mark.skip``.
    """
    _test_mpi("MPI Send/Recv", make_sdfg(np.float64), np.float64)


###############################################################################

myrank = dace.symbol('myrank', dtype=dace.int32)
mysize = dace.symbol('mysize', dtype=dace.int32)


# Passes each rank's id to the next rank using dace.comm.Send/Recv with tag 42
# and returns the single-element array that was received.
@dace.program
def dace_send_recv():
    # tmp1 holds the value to send (this rank's id); tmp2 receives.
    tmp1 = np.full([1], myrank, dtype=np.int32)
    tmp2 = np.zeros([1], dtype=np.int32)
    if myrank == 0:
        # Rank 0 sends to rank 1 first, then receives from rank mysize - 1.
        dace.comm.Send(tmp1, 1, tag=42)
        dace.comm.Recv(tmp2, mysize - 1, tag=42)
    else:
        # Every other rank first receives from its predecessor, then sends to
        # its successor (both modulo mysize).
        dace.comm.Recv(tmp2, (myrank - 1) % mysize, tag=42)
        dace.comm.Send(tmp1, (myrank + 1) % mysize, tag=42)
    return tmp2
Example #25
0
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved.
import unittest
import dace
import numpy as np
from dace.transformation.dataflow import MapTiling, OutLocalStorage

N = dace.symbol('N')


# Returns an int32 array of length N whose element i holds the value i.
@dace.program
def arange():
    out = np.ndarray([N], np.int32)
    for i in dace.map[0:N]:
        # Explicit tasklet: write the map index to out[i].
        with dace.tasklet:
            o >> out[i]
            o = i
    return out


class LocalStorageTests(unittest.TestCase):
    def test_even(self):
        """
        Apply MapTiling (tile size 8) and OutLocalStorage to ``arange`` and
        check the result for N=16 against ``np.arange``.
        """
        sdfg = arange.to_sdfg()
        tiling_options = {'tile_sizes': [8]}
        local_storage_options = {}
        sdfg.apply_transformations(
            [MapTiling, OutLocalStorage],
            options=[tiling_options, local_storage_options])
        result = sdfg(N=16)
        expected = np.arange(16, dtype=np.int32)
        self.assertTrue(np.array_equal(result, expected))

    def test_uneven(self):
        # For testing uneven decomposition, use longer buffer and ensure
Example #26
0
def make_sdfg(dtype):
    """
    Build the ``mpi_send_recv`` SDFG with a single state ``dataflow``.

    The state contains an ``Isend`` node that sends ``x`` (length ``n``) to
    ``dest`` with ``tag``, an ``Irecv`` node that receives into ``y`` (length
    ``n``) from ``src`` with ``tag``, and a ``Wait`` node on the receive
    request that writes ``stat_tag`` and ``stat_source``.

    :param dtype: Element type of the ``x`` and ``y`` buffers.
    :return: The constructed SDFG (not validated here).
    """

    n = dace.symbol("n")

    sdfg = dace.SDFG("mpi_send_recv")
    state = sdfg.add_state("dataflow")

    # Data containers: buffers, single-element arguments, requests, status
    for buffer_name in ("x", "y"):
        sdfg.add_array(buffer_name, [n], dtype, transient=False)
    for argument_name in ("src", "dest", "tag"):
        sdfg.add_array(argument_name, [1],
                       dace.dtypes.int32,
                       transient=False)
    for request_name in ("send_req", "recv_req"):
        sdfg.add_array(request_name, [1],
                       dace.dtypes.opaque("MPI_Request"),
                       transient=True)
    for status_name in ("stat_source", "stat_count", "stat_tag",
                        "stat_cancelled"):
        sdfg.add_array(status_name, [1], dace.dtypes.int32, transient=True)

    # Access nodes
    x = state.add_access("x")
    y = state.add_access("y")
    src = state.add_access("src")
    dest = state.add_access("dest")
    tag = state.add_access("tag")
    send_req = state.add_access("send_req")
    recv_req = state.add_access("recv_req")

    stat_source = state.add_access("stat_source")
    stat_tag = state.add_access("stat_tag")

    # Library nodes
    send_node = mpi.nodes.isend.Isend("isend")
    recv_node = mpi.nodes.irecv.Irecv("irecv")
    wait_node = mpi.nodes.wait.Wait("wait")

    def whole_buffer(node):
        # Memlet over all n elements of a buffer.
        return Memlet.simple(node, "0:n", num_accesses=n)

    def single(node):
        # Memlet over the only element of a length-1 array.
        return Memlet.simple(node, "0:1", num_accesses=1)

    def feed(source, library_node, connector, memlet):
        # Edge from an access node into a library-node input connector.
        state.add_memlet_path(source,
                              library_node,
                              dst_conn=connector,
                              memlet=memlet)

    def emit(library_node, connector, target, memlet):
        # Edge from a library-node output connector to an access node.
        state.add_memlet_path(library_node,
                              target,
                              src_conn=connector,
                              memlet=memlet)

    # Send side
    feed(x, send_node, "_buffer", whole_buffer(x))
    emit(send_node, "_request", send_req, single(send_req))
    feed(dest, send_node, "_dest", single(dest))
    feed(tag, send_node, "_tag", single(tag))

    # Receive side: buffer and request, then wait on the request
    emit(recv_node, "_buffer", y, whole_buffer(y))
    emit(recv_node, "_request", recv_req, single(recv_req))
    feed(recv_req, wait_node, "_request", single(recv_req))

    # Status outputs of the wait
    emit(wait_node, "_stat_tag", stat_tag, single(stat_tag))
    emit(wait_node, "_stat_source", stat_source, single(stat_source))

    # Remaining receive arguments
    feed(src, recv_node, "_src", single(src))
    feed(tag, recv_node, "_tag", single(tag))
    return sdfg
Example #27
0
#!/usr/bin/env python
from __future__ import print_function

import argparse
import dace
import math
import numpy as np

W = dace.symbol('W')
H = dace.symbol('H')


# Element-wise transposed copy: B[i, j] = A[j, i] for i in [0, H), j in [0, W).
# NOTE(review): both A and B are declared with shape (H, W), yet A is indexed
# as A[j, i] with j < W and i < H, which is only in bounds when H == W --
# confirm that square inputs are the only intended use.
@dace.program(dace.float32[H, W], dace.float32[H, W])
def transpose(A, B):
    @dace.map(_[0:H, 0:W])
    def compute(i, j):
        a << A[j, i]
        b >> B[i, j]

        b = a


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("W", type=int, nargs="?", default=64)
    parser.add_argument("H", type=int, nargs="?", default=64)
    args = vars(parser.parse_args())

    A = dace.ndarray([H, W], dtype=dace.float32)
    B = dace.ndarray([H, W], dtype=dace.float32)
Example #28
0
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved.
import math
import numpy as np

import dace as dp
from dace.sdfg import SDFG
from dace.memlet import Memlet

N = dp.symbol('N')
sdfg = SDFG('tlstream')
state = sdfg.add_state('doit')

# Data containers: 'la' and 'ls' sit between the map entry and exit below,
# while 'gs' and 'ga' are connected after the map exit.
localarr = state.add_transient('la', [10], dp.float32)
localstream = state.add_stream('ls', dp.float32, 1, transient=True)
globalstream = state.add_stream('gs', dp.float32, 1, transient=True)
globalarr = state.add_array('ga', [N], dp.float32)


def _whole(node):
    """Build a memlet from the full descriptor of the container of ``node``."""
    return Memlet.from_array(node.data, node.desc(sdfg))


me, mx = state.add_map('par', {'i': '0:N'})
tasklet = state.add_tasklet('arange', set(), {'a'}, 'a = i')

# Dataflow: map entry -> tasklet -> ls -> la -> map exit -> gs -> ga.
state.add_nedge(me, tasklet, Memlet())
state.add_edge(tasklet, 'a', localstream, None, _whole(localstream))
state.add_nedge(localstream, localarr, _whole(localarr))
# The edge from 'la' into the map exit carries the memlet of 'gs'.
state.add_nedge(localarr, mx, _whole(globalstream))
state.add_nedge(mx, globalstream, _whole(globalstream))
state.add_nedge(globalstream, globalarr, _whole(globalarr))
Example #29
0
#!/usr/bin/env python

import dace
import numpy as np
import scipy

# Symbolic sizes referenced by the spmv signature: W and H size the two
# vectors, nnz sizes the column-index and value arrays.
W, H, nnz = (dace.symbol(name) for name in ('W', 'H', 'nnz'))


# Sparse matrix-vector product: for every row i, the products
# A_val[j] * x[A_col[j]] for j in A_row[i]:A_row[i + 1] are summed into b[i].
# The matrix is passed as three arrays (H + 1 row offsets, nnz column
# indices, nnz values) -- presumably CSR; confirm against the caller.
@dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz],
              dace.float32[W], dace.float32[H])
def spmv(A_row, A_col, A_val, x, b):
    # Outer scope: one instance per row index i in 0:H.
    @dace.mapscope(_[0:H])
    def compute_row(i):
        # Inner map: its range bounds are read from A_row, so the number of
        # iterations depends on the data.
        @dace.map(_[A_row[i]:A_row[i + 1]])
        def compute(j):
            a << A_val[j]
            in_x << x[A_col[j]]  # indirect read through the column index
            # Writes to b[i] are combined with the given lambda (a sum).
            out >> b(1, lambda x, y: x + y)[i]

            out = a * in_x


def test_dynamic_map():
    """Set the problem size and build the SDFG of ``spmv``."""
    height = width = 1024

    # Prepare spmv SDFG for GPU
    sdfg = spmv.to_sdfg()
Example #30
0
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved.
import math
import dace
import polybench

# Symbolic problem dimensions.
NQ, NR, NP = (dace.symbol(name) for name in ('NQ', 'NR', 'NP'))

#datatypes = [dace.float64, dace.int32, dace.float32]
# Data type currently selected from the candidates listed above.
datatype = dace.float64

# Dataset sizes
sizes = [{
    NQ: 8,
    NR: 10,
    NP: 12
}, {
    NQ: 20,
    NR: 25,
    NP: 30
}, {
    NQ: 40,
    NR: 50,
    NP: 60
}, {
    NQ: 140,
    NR: 150,
    NP: 160
}, {
    NQ: 220,