# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np W = dace.symbol('W') @dace.program def prog(A): number = dace.define_local([1], dace.float32) @dace.map(_[0:W]) def bla(i): inp << A[i] out >> A[i] osum >> number(1, lambda x, y: x + y, 0) out = 2 * inp osum = inp @dace.map(_[0:W]) def bla2(i): inp << A[i] out >> A[i] out = 2 * inp def test(): W.set(3)
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. from __future__ import print_function import argparse import dace import numpy as np import select import sys from scipy import ndimage W = dace.symbol("W") H = dace.symbol("H") T = dace.symbol("T") P = dace.symbol("P") # Number of processing elements dtype = dace.float32 def add_tmp(state): return state.add_array("tmp", (2, H, W), dtype, transient=True, storage=dace.dtypes.StorageType.FPGA_Global) def make_init_state(sdfg): state = sdfg.add_state("init") a0 = state.add_array("A", (H, W), dtype) tmp0 = add_tmp(state) state.add_memlet_path(a0, tmp0,
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. from __future__ import print_function import argparse import dace from dace.transformation.dataflow import MapTiling from dace.transformation.optimizer import SDFGOptimizer import numpy as np from scipy import ndimage W = dace.symbol('W') H = dace.symbol('H') MAXITER = dace.symbol('MAXITER') def create_sdfg(): sdfg = dace.SDFG('stencil_sdfg_api') sdfg.add_symbol('MAXITER', MAXITER.dtype) _, arr = sdfg.add_array('A', (H, W), dace.float32) _, tmparr = sdfg.add_transient('tmp', (H, W), dace.float32) init = sdfg.add_state('init') guard = sdfg.add_state('guard') body = sdfg.add_state('body') end = sdfg.add_state('end') sdfg.add_edge(init, guard, dace.InterstateEdge(assignments={'i': '0'})) sdfg.add_edge(guard, body, dace.InterstateEdge(condition='i<MAXITER')) sdfg.add_edge(body, guard, dace.InterstateEdge(assignments={'i': 'i+1'})) sdfg.add_edge(guard, end, dace.InterstateEdge(condition='i>=MAXITER'))
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. """ Tests WarpTiling and fusion on the softmax operator. """ import dace from dace.transformation.dataflow import (MapFusion, WarpTiling, TrivialMapElimination, Vectorization) from dace.transformation.interstate import (HoistState, InlineSDFG, StateFusion, GPUTransformSDFG) from dace.transformation.subgraph import (SubgraphFusion, MultiExpansion, ReduceExpansion) import numpy as np import pytest dn1, dn2, dn3, dr = (dace.symbol(s) for s in ('dn1', 'dn2', 'dn3', 'dr')) @dace.program def softmax_fwd(inp: dace.float32[dn1, dn2, dn3, dr], out: dace.float32[dn1, dn2, dn3, dr]): max = np.max(inp, axis=-1) max_keepdims = np.reshape(max, (dn1, dn2, dn3, 1)) exp_arr = np.exp(inp - max_keepdims) sum = np.sum(exp_arr, axis=-1) sum_keepdims = np.reshape(sum, (dn1, dn2, dn3, 1)) out[:] = exp_arr / sum_keepdims # Numerically-stable version of softmax def softmax(x): tmp_max = np.max(x, axis=-1, keepdims=True) tmp_out = np.exp(x - tmp_max)
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np import pytest from dace.transformation.subgraph import ReduceExpansion from dace.libraries.standard.nodes.reduce import Reduce N = dace.symbol('N') M = dace.symbol('M') N.set(30) M.set(30) @dace.program def program(A: dace.float32[M, N]): return dace.reduce(lambda a, b: max(a, b), A, axis=1, identity=0) @pytest.mark.gpu def test_blockallreduce(): A = np.random.rand(M.get(), N.get()).astype(np.float32) sdfg = program.to_sdfg() sdfg.apply_gpu_transformations() graph = sdfg.nodes()[0] for node in graph.nodes(): if isinstance(node, Reduce): reduce_node = node reduce_node.implementation = 'CUDA (device)'
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace from dace.transformation.subgraph import MultiExpansion, SubgraphFusion import dace.sdfg.nodes as nodes import numpy as np from typing import Union, List from dace.sdfg.graph import SubgraphView N, M, O, P, Q, R = [dace.symbol(s) for s in ['N', 'M', 'O', 'P', 'Q', 'R']] @dace.program def subgraph_fusion_parallel(A: dace.float64[N], B: dace.float64[M], C: dace.float64[O], D: dace.float64[M], E: dace.float64[N], F: dace.float64[P], G: dace.float64[M], H: dace.float64[P], I: dace.float64[N], J: dace.float64[R], X: dace.float64[N], Y: dace.float64[M], Z: dace.float64[P]): tmp1 = np.ndarray([N, M, O], dtype=dace.float64) for i, j, k in dace.map[0:N, 0:M, 0:O]: with dace.tasklet: in1 << A[i] in2 << B[j] in3 << C[k] out >> tmp1[i, j, k] out = in1 + in2 + in3
#!/usr/bin/env python from __future__ import print_function import argparse import dace import math import numpy as np N = dace.symbol('N', positive=True) @dace.program(dace.float32[N], dace.float32[N], dace.uint32[1], dace.float32) def pbf(A, out, outsz, ratio): ostream = dace.define_stream(dace.float32, 1) ostream >> out @dace.map(_[0:N]) def filter(i): a << A[i] b >> ostream(-1) osz >> outsz(-1, lambda x, y: x + y, 0) filter = (a > ratio) if filter: b = a osz = filter def regression(A, ratio):
def can_be_applied(self, graph: dace.SDFGState, expr_index: int, sdfg: dace.SDFG, permissive: bool = False):
    """Checks whether the matched Map performs a stencil-like computation.

    A stencil is detected when every input access reads a single element,
    all accesses are affine in the map parameters (a symbol or a symbol
    plus a constant), every map parameter appears in each access, and at
    least two accesses of the same array differ by a nonzero constant
    offset. Outputs must be single-element, affine, cover all parameters,
    and must not use write-conflict resolution.

    :param graph: The state containing the matched map.
    :param expr_index: Index of the matched subgraph expression (unused here).
    :param sdfg: The SDFG containing ``graph``.
    :param permissive: Standard transformation flag (unused here).
    :return: True if a stencil access pattern was found, False otherwise.
    """
    map_entry = self.map_entry
    map_exit = graph.exit_node(map_entry)
    params = [dace.symbol(p) for p in map_entry.map.params]

    # Collect the subsets read from each input descriptor (edges leaving
    # the map entry). Edges without data (empty memlets) are ignored.
    inputs = dict()
    for _, _, _, _, m in graph.out_edges(map_entry):
        if not m.data:
            continue
        desc = sdfg.arrays[m.data]
        if desc not in inputs.keys():
            inputs[desc] = []
        inputs[desc].append(m.subset)

    stencil_found = False
    for desc, accesses in inputs.items():
        if isinstance(desc, dace.data.Scalar):
            # Scalars cannot carry a stencil pattern; skip.
            continue
        elif isinstance(desc, (dace.data.Array, dace.data.View)):
            if list(desc.shape) == [1]:
                # Single-element arrays behave like scalars; skip.
                continue
            first_access = None
            for a in accesses:
                # Each read must be a single element.
                if a.num_elements() != 1:
                    return False
                if first_access:
                    # Compare this access against the first one; a nonzero
                    # constant offset between them indicates a stencil.
                    new_access = deepcopy(a)
                    new_access.offset(first_access, True)
                    for idx in new_access.min_element():
                        if not isinstance(idx, Number):
                            # Non-constant relative offset: not supported.
                            return False
                        if idx != 0:
                            stencil_found = True
                else:
                    first_access = a
                # Every index must be a map parameter, optionally shifted by
                # a constant (sympy.Add with exactly one free symbol), and
                # collectively the indices must cover all map parameters.
                indices = a.min_element()
                unmatched_indices = set(params)
                for idx in indices:
                    if isinstance(idx, sympy.Symbol):
                        bidx = idx
                    elif isinstance(idx, sympy.Add):
                        if len(idx.free_symbols) != 1:
                            return False
                        bidx = list(idx.free_symbols)[0]
                    else:
                        return False
                    if bidx in unmatched_indices:
                        unmatched_indices.remove(bidx)
                if len(unmatched_indices) > 0:
                    return False
        else:
            # Other data descriptor types (e.g. streams) are not supported.
            return False

    # Collect the subsets written to each output descriptor (edges entering
    # the map exit). Write-conflict resolution disqualifies the pattern.
    outputs = dict()
    for _, _, _, _, m in graph.in_edges(map_exit):
        if m.wcr:
            return False
        desc = sdfg.arrays[m.data]
        if desc not in outputs.keys():
            outputs[desc] = []
        outputs[desc].append(m.subset)

    for desc, accesses in outputs.items():
        if isinstance(desc, (dace.data.Array, dace.data.View)):
            for a in accesses:
                if a.num_elements() > 1:
                    return False
                # Same affine-coverage check as for inputs.
                indices = a.min_element()
                unmatched_indices = set(params)
                for idx in indices:
                    if isinstance(idx, sympy.Symbol):
                        bidx = idx
                    elif isinstance(idx, sympy.Add):
                        if len(idx.free_symbols) != 1:
                            return False
                        bidx = list(idx.free_symbols)[0]
                    else:
                        return False
                    if bidx in unmatched_indices:
                        unmatched_indices.remove(bidx)
                if len(unmatched_indices) > 0:
                    return False
        else:
            return False

    return stencil_found
def can_be_applied(self, graph: dace.SDFGState, expr_index: int, sdfg: dace.SDFG, permissive: bool = False):
    """Checks whether the matched Map performs an outer-product-like computation.

    The pattern is detected when at least one (non-scalar) input is indexed
    by plain map-parameter symbols that do NOT cover all map parameters
    (each input depends on a strict subset of the parameters), while every
    output writes a single element indexed by all map parameters and no
    write-conflict resolution is used.

    :param graph: The state containing the matched map.
    :param expr_index: Index of the matched subgraph expression (unused here).
    :param sdfg: The SDFG containing ``graph``.
    :param permissive: Standard transformation flag (unused here).
    :return: True if an outer-product access pattern was found.
    """
    map_entry = self.map_entry
    map_exit = graph.exit_node(map_entry)
    params = [dace.symbol(p) for p in map_entry.map.params]

    # Collect the subsets read from each input descriptor.
    inputs = dict()
    for _, _, _, _, m in graph.out_edges(map_entry):
        if not m.data:
            continue
        desc = sdfg.arrays[m.data]
        if desc not in inputs.keys():
            inputs[desc] = []
        inputs[desc].append(m.subset)

    outer_product_found = False
    for desc, accesses in inputs.items():
        if isinstance(desc, dace.data.Scalar):
            # Scalars are irrelevant to the pattern; skip.
            continue
        elif isinstance(desc, (dace.data.Array, dace.data.View)):
            if list(desc.shape) == [1]:
                # Single-element arrays behave like scalars; skip.
                continue
            for a in accesses:
                indices = a.min_element()
                unmatched_indices = set(params)
                for idx in indices:
                    # Only plain parameter symbols are allowed (no offsets).
                    if not isinstance(idx, sympy.Symbol):
                        return False
                    if idx in unmatched_indices:
                        unmatched_indices.remove(idx)
                if len(unmatched_indices) == 0:
                    # Input depends on ALL map parameters: not an outer
                    # product (inputs must use a strict subset).
                    return False
                outer_product_found = True
        else:
            # Other descriptor types (e.g. streams) are not supported.
            return False

    # Collect the subsets written to each output descriptor; WCR is not
    # allowed for this pattern.
    outputs = dict()
    for _, _, _, _, m in graph.in_edges(map_exit):
        if m.wcr:
            return False
        desc = sdfg.arrays[m.data]
        if desc not in outputs.keys():
            outputs[desc] = []
        outputs[desc].append(m.subset)

    for desc, accesses in outputs.items():
        if isinstance(desc, (dace.data.Array, dace.data.View)):
            for a in accesses:
                # Outputs must write exactly one element per iteration,
                # indexed by all map parameters.
                if a.num_elements() != 1:
                    return False
                indices = a.min_element()
                unmatched_indices = set(params)
                for idx in indices:
                    if idx in unmatched_indices:
                        unmatched_indices.remove(idx)
                if len(unmatched_indices) > 0:
                    return False
        else:
            return False

    return outer_product_found
def can_be_applied(self, graph: dace.SDFGState, expr_index: int, sdfg: dace.SDFG, permissive: bool = False):
    """Checks whether the matched Map can be flattened/distributed.

    Requirements encoded below: the map range must not already depend on the
    distribution symbols (``commsize``, ``Px``, ``Py``); no tasklet inside
    the map may reference a map parameter in its code; every non-scalar
    input and output must be a single-element access indexed by plain map
    parameters that cover all parameters; outputs must not use
    write-conflict resolution.

    :param graph: The state containing the matched map.
    :param expr_index: Index of the matched subgraph expression (unused here).
    :param sdfg: The SDFG containing ``graph``.
    :param permissive: Standard transformation flag (unused here).
    :return: True if the map can be flattened.
    """
    map_entry = self.map_entry
    map_exit = graph.exit_node(map_entry)
    params = [dace.symbol(p) for p in map_entry.map.params]

    # Reject maps whose range already uses the distribution symbols.
    if "commsize" in map_entry.map.range.free_symbols:
        return False
    if "Px" in map_entry.map.range.free_symbols:
        return False
    if "Py" in map_entry.map.range.free_symbols:
        return False

    # If the map iterators are used in the code of a Tasklet,
    # then we cannot flatten them (currently).
    # See, for example, samples/simple/mandelbrot.py
    for node in subgraph_from_maps(sdfg, graph, [map_entry]):
        if isinstance(node, dace.nodes.CodeNode):
            for p in params:
                if str(p) in node.free_symbols:
                    return False

    # Collect the subsets read from each input descriptor.
    inputs = dict()
    for _, _, _, _, m in graph.out_edges(map_entry):
        if not m.data:
            continue
        desc = sdfg.arrays[m.data]
        if desc not in inputs.keys():
            inputs[desc] = []
        inputs[desc].append(m.subset)

    for desc, accesses in inputs.items():
        if isinstance(desc, dace.data.Scalar):
            # Scalars are broadcast as-is; no constraint.
            continue
        elif isinstance(desc, (dace.data.Array, dace.data.View)):
            if list(desc.shape) == [1]:
                continue
            for a in accesses:
                # Single-element access, indexed by all map parameters.
                if a.num_elements() != 1:
                    return False
                indices = a.min_element()
                unmatched_indices = set(params)
                for idx in indices:
                    if idx in unmatched_indices:
                        unmatched_indices.remove(idx)
                if len(unmatched_indices) > 0:
                    return False
        else:
            # Other descriptor types (e.g. streams) are not supported.
            return False

    # Collect the subsets written to each output descriptor; WCR is not
    # supported.
    outputs = dict()
    for _, _, _, _, m in graph.in_edges(map_exit):
        if m.wcr:
            return False
        desc = sdfg.arrays[m.data]
        if desc not in outputs.keys():
            outputs[desc] = []
        outputs[desc].append(m.subset)

    for desc, accesses in outputs.items():
        if isinstance(desc, (dace.data.Array, dace.data.View)):
            for a in accesses:
                # Same single-element, full-coverage constraint as inputs.
                if a.num_elements() != 1:
                    return False
                indices = a.min_element()
                unmatched_indices = set(params)
                for idx in indices:
                    if idx in unmatched_indices:
                        unmatched_indices.remove(idx)
                if len(unmatched_indices) > 0:
                    return False
        else:
            return False

    return True
def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG):
    """Distributes the matched Map across MPI ranks using block-cyclic layout.

    The map range is shrunk to the per-rank share (a ``Px`` x ``Py`` process
    grid for 2-parameter maps, ``commsize`` ranks for the flattened case).
    Scalar inputs are broadcast with MPI ``Bcast``; array inputs are
    distributed with PBLAS ``BlockCyclicScatter`` into per-rank transients;
    array outputs are collected with ``BlockCyclicGather``. Memlets inside
    the map are redirected to the local transients.

    :param graph: The state containing the matched map.
    :param sdfg: The SDFG containing ``graph``.
    """
    map_entry = self.map_entry
    map_exit = graph.exit_node(map_entry)
    # Symbols describing the process grid / communicator size.
    sz = dace.symbol('commsize', dtype=dace.int32, integer=True, positive=True)
    Px = dace.symbol('Px', dtype=dace.int32, integer=True, positive=True)
    Py = dace.symbol('Py', dtype=dace.int32, integer=True, positive=True)

    from dace.data import _prod

    # NOTE: Maps with step in their ranges are currently not supported
    if len(map_entry.map.params) == 2:
        # 2D map: split the two dimensions over a Px x Py process grid.
        params = map_entry.map.params
        ranges = [None] * 2
        b, e, _ = map_entry.map.range[0]
        ranges[0] = (0, (e - b + 1) / Px - 1, 1)
        b, e, _ = map_entry.map.range[1]
        ranges[1] = (0, (e - b + 1) / Py - 1, 1)
        strides = [1]
    else:
        # Otherwise: flatten the map into a single parameter split over
        # 'commsize' ranks.
        params = ['__iflat']
        sizes = map_entry.map.range.size_exact()
        total_size = _prod(sizes)
        ranges = [(0, (total_size) / sz - 1, 1)]
        strides = [_prod(sizes[i + 1:]) for i in range(len(sizes))]
    # NOTE(review): 'strides' is computed in both branches but never used
    # below — confirm whether it is dead code or intended for future use.

    # Transient scalar holding the broadcast/scatter root rank (always 0).
    root_name = sdfg.temp_data_name()
    sdfg.add_scalar(root_name, dace.int32, transient=True)
    root_node = graph.add_access(root_name)
    root_tasklet = graph.add_tasklet('_set_root_', {}, {'__out'}, '__out = 0')
    graph.add_edge(root_tasklet, '__out', root_node, None,
                   dace.Memlet.simple(root_name, '0'))

    from dace.libraries.mpi import Bcast
    from dace.libraries.pblas import BlockCyclicScatter, BlockCyclicGather

    # Validate and collect input access nodes: whole-container reads only.
    inputs = set()
    for src, _, _, _, m in graph.in_edges(map_entry):
        if not isinstance(src, nodes.AccessNode):
            raise NotImplementedError
        desc = src.desc(sdfg)
        if not isinstance(desc, (data.Scalar, data.Array)):
            raise NotImplementedError
        if list(desc.shape) != m.src_subset.size_exact():
            # Second attempt
            # TODO: We need a solution for symbols not matching
            if str(list(desc.shape)) != str(m.src_subset.size_exact()):
                raise NotImplementedError
        inputs.add(src)

    for inp in inputs:
        desc = inp.desc(sdfg)
        if isinstance(desc, data.Scalar):
            # Scalars: broadcast the value to all ranks, then reroute the
            # map-entry edge through the broadcast result.
            local_access = graph.add_access(inp.data)
            bcast_node = Bcast('_Bcast_')
            graph.add_edge(inp, None, bcast_node, '_inbuffer',
                           dace.Memlet.from_array(inp.data, desc))
            graph.add_edge(root_node, None, bcast_node, '_root',
                           dace.Memlet.simple(root_name, '0'))
            graph.add_edge(bcast_node, '_outbuffer', local_access, None,
                           dace.Memlet.from_array(inp.data, desc))
            for e in graph.edges_between(inp, map_entry):
                graph.add_edge(local_access, None, map_entry, e.dst_conn,
                               dace.Memlet.from_array(inp.data, desc))
                graph.remove_edge(e)
        elif isinstance(desc, data.Array):
            # Arrays: allocate the per-rank local block
            # (shape[0]//Px x shape[1]//Py) and scatter into it.
            local_name, local_arr = sdfg.add_temp_transient(
                [(desc.shape[0]) // Px, (desc.shape[1]) // Py],
                dtype=desc.dtype,
                storage=desc.storage)
            local_access = graph.add_access(local_name)
            # Block sizes passed to the scatter node as a 2-element array.
            bsizes_name, bsizes_arr = sdfg.add_temp_transient(
                (2, ), dtype=dace.int32)
            bsizes_access = graph.add_access(bsizes_name)
            bsizes_tasklet = nodes.Tasklet(
                '_set_bsizes_', {}, {'__out'},
                "__out[0] = {x}; __out[1] = {y}".format(
                    x=(desc.shape[0]) // Px, y=(desc.shape[1]) // Py))
            graph.add_edge(bsizes_tasklet, '__out', bsizes_access, None,
                           dace.Memlet.from_array(bsizes_name, bsizes_arr))
            # Global/local array descriptors produced by the scatter node
            # (9-element BLACS-style descriptors).
            gdesc_name, gdesc_arr = sdfg.add_temp_transient(
                (9, ), dtype=dace.int32)
            gdesc_access = graph.add_access(gdesc_name)
            ldesc_name, ldesc_arr = sdfg.add_temp_transient(
                (9, ), dtype=dace.int32)
            ldesc_access = graph.add_access(ldesc_name)
            scatter_node = BlockCyclicScatter('_Scatter_')
            graph.add_edge(inp, None, scatter_node, '_inbuffer',
                           dace.Memlet.from_array(inp.data, desc))
            graph.add_edge(bsizes_access, None, scatter_node, '_block_sizes',
                           dace.Memlet.from_array(bsizes_name, bsizes_arr))
            graph.add_edge(scatter_node, '_outbuffer', local_access, None,
                           dace.Memlet.from_array(local_name, local_arr))
            graph.add_edge(scatter_node, '_gdescriptor', gdesc_access, None,
                           dace.Memlet.from_array(gdesc_name, gdesc_arr))
            graph.add_edge(scatter_node, '_ldescriptor', ldesc_access, None,
                           dace.Memlet.from_array(ldesc_name, ldesc_arr))
            # Reroute map-entry edges through the local block and rename the
            # data referenced inside the map.
            for e in graph.edges_between(inp, map_entry):
                graph.add_edge(
                    local_access, None, map_entry, e.dst_conn,
                    dace.Memlet.from_array(local_name, local_arr))
                graph.remove_edge(e)
            for e in graph.out_edges(map_entry):
                if e.data.data == inp.data:
                    e.data.data = local_name
        else:
            raise NotImplementedError

    # Validate and collect output access nodes: whole-container writes only.
    outputs = set()
    for _, _, dst, _, m in graph.out_edges(map_exit):
        if not isinstance(dst, nodes.AccessNode):
            raise NotImplementedError
        desc = dst.desc(sdfg)
        if not isinstance(desc, data.Array):
            raise NotImplementedError
        try:
            if list(desc.shape) != m.dst_subset.size_exact():
                # Second attempt
                # TODO: We need a solution for symbols not matching
                if str(list(desc.shape)) != str(m.dst_subset.size_exact()):
                    raise NotImplementedError
        except AttributeError:
            # Memlet has no dst_subset; fall back to its plain subset.
            if list(desc.shape) != m.subset.size_exact():
                # Second attempt
                # TODO: We need a solution for symbols not matching
                if str(list(desc.shape)) != str(m.subset.size_exact()):
                    raise NotImplementedError
        outputs.add(dst)

    for out in outputs:
        desc = out.desc(sdfg)
        if isinstance(desc, data.Scalar):
            raise NotImplementedError
        elif isinstance(desc, data.Array):
            # Mirror of the input path: local block + block sizes, then a
            # gather back into the global output array.
            local_name, local_arr = sdfg.add_temp_transient(
                [(desc.shape[0]) // Px, (desc.shape[1]) // Py],
                dtype=desc.dtype,
                storage=desc.storage)
            local_access = graph.add_access(local_name)
            bsizes_name, bsizes_arr = sdfg.add_temp_transient(
                (2, ), dtype=dace.int32)
            bsizes_access = graph.add_access(bsizes_name)
            bsizes_tasklet = nodes.Tasklet(
                '_set_bsizes_', {}, {'__out'},
                "__out[0] = {x}; __out[1] = {y}".format(
                    x=(desc.shape[0]) // Px, y=(desc.shape[1]) // Py))
            graph.add_edge(bsizes_tasklet, '__out', bsizes_access, None,
                           dace.Memlet.from_array(bsizes_name, bsizes_arr))
            scatter_node = BlockCyclicGather('_Gather_')
            graph.add_edge(local_access, None, scatter_node, '_inbuffer',
                           dace.Memlet.from_array(local_name, local_arr))
            graph.add_edge(bsizes_access, None, scatter_node, '_block_sizes',
                           dace.Memlet.from_array(bsizes_name, bsizes_arr))
            graph.add_edge(scatter_node, '_outbuffer', out, None,
                           dace.Memlet.from_array(out.data, desc))
            # Reroute map-exit edges into the local block and rename the
            # data referenced inside the map.
            for e in graph.edges_between(map_exit, out):
                graph.add_edge(
                    map_exit, e.src_conn, local_access, None,
                    dace.Memlet.from_array(local_name, local_arr))
                graph.remove_edge(e)
            for e in graph.in_edges(map_exit):
                if e.data.data == out.data:
                    e.data.data = local_name
        else:
            raise NotImplementedError

    # Finally, shrink the map to its per-rank share.
    map_entry.map.params = params
    map_entry.map.range = subsets.Range(ranges)
def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG):
    """Distributes the matched Map across 'commsize' MPI ranks (1D layout).

    The map range is shrunk to the per-rank share (flattening multi-parameter
    maps into a single ``__iflat`` parameter first). Scalar inputs are
    broadcast with MPI ``Bcast``; array inputs are split with MPI ``Scatter``
    into per-rank transients; array outputs are collected with ``Gather``.
    Memlets inside the map are redirected to the local transients.

    :param graph: The state containing the matched map.
    :param sdfg: The SDFG containing ``graph``.
    """
    map_entry = self.map_entry
    map_exit = graph.exit_node(map_entry)
    # Communicator size symbol used to split the iteration space.
    sz = dace.symbol('commsize', dtype=dace.int32)

    def _prod(sequence):
        # Product of a sequence (empty product is 1).
        return reduce(lambda a, b: a * b, sequence, 1)

    # NOTE: Maps with step in their ranges are currently not supported
    if len(map_entry.map.params) == 1:
        # 1D map: keep the parameter, divide the range by 'commsize'.
        params = map_entry.map.params
        ranges = [(0, (e - b + 1) / sz - 1, 1)
                  for b, e, _ in map_entry.map.range]
        strides = [1]
    else:
        # Multi-dimensional map: flatten to one parameter over all ranks.
        params = ['__iflat']
        sizes = map_entry.map.range.size_exact()
        total_size = _prod(sizes)
        ranges = [(0, (total_size) / sz - 1, 1)]
        strides = [_prod(sizes[i + 1:]) for i in range(len(sizes))]
    # NOTE(review): 'strides' is computed in both branches but never used
    # below — confirm whether it is dead code or intended for future use.

    # Transient scalar holding the broadcast/scatter root rank (always 0).
    root_name = sdfg.temp_data_name()
    sdfg.add_scalar(root_name, dace.int32, transient=True)
    root_node = graph.add_access(root_name)
    root_tasklet = graph.add_tasklet('_set_root_', {}, {'__out'}, '__out = 0')
    graph.add_edge(root_tasklet, '__out', root_node, None,
                   dace.Memlet.simple(root_name, '0'))

    from dace.libraries.mpi import Bcast, Scatter, Gather

    # Validate and collect input access nodes: whole-container reads only.
    inputs = set()
    for src, _, _, _, m in graph.in_edges(map_entry):
        if not isinstance(src, nodes.AccessNode):
            raise NotImplementedError
        desc = src.desc(sdfg)
        if not isinstance(desc, (data.Scalar, data.Array)):
            raise NotImplementedError
        if list(desc.shape) != m.src_subset.size_exact():
            # Second attempt
            # TODO: We need a solution for symbols not matching
            if str(list(desc.shape)) != str(m.src_subset.size_exact()):
                raise NotImplementedError
        inputs.add(src)

    for inp in inputs:
        desc = inp.desc(sdfg)
        if isinstance(desc, data.Scalar):
            # Scalars: broadcast the value to all ranks, then reroute the
            # map-entry edge through the broadcast result.
            local_access = graph.add_access(inp.data)
            bcast_node = Bcast('_Bcast_')
            graph.add_edge(inp, None, bcast_node, '_inbuffer',
                           dace.Memlet.from_array(inp.data, desc))
            graph.add_edge(root_node, None, bcast_node, '_root',
                           dace.Memlet.simple(root_name, '0'))
            graph.add_edge(bcast_node, '_outbuffer', local_access, None,
                           dace.Memlet.from_array(inp.data, desc))
            for e in graph.edges_between(inp, map_entry):
                graph.add_edge(local_access, None, map_entry, e.dst_conn,
                               dace.Memlet.from_array(inp.data, desc))
                graph.remove_edge(e)
        elif isinstance(desc, data.Array):
            # Arrays: allocate the per-rank chunk (total_size // commsize)
            # and scatter into it from the root.
            local_name, local_arr = sdfg.add_temp_transient(
                [sympy.floor(desc.total_size / sz)],
                dtype=desc.dtype,
                storage=desc.storage)
            local_access = graph.add_access(local_name)
            scatter_node = Scatter('_Scatter_')
            graph.add_edge(inp, None, scatter_node, '_inbuffer',
                           dace.Memlet.from_array(inp.data, desc))
            graph.add_edge(root_node, None, scatter_node, '_root',
                           dace.Memlet.simple(root_name, '0'))
            graph.add_edge(scatter_node, '_outbuffer', local_access, None,
                           dace.Memlet.from_array(local_name, local_arr))
            # Reroute map-entry edges through the local chunk and rewrite
            # internal memlets to index it with the (new) map parameter.
            for e in graph.edges_between(inp, map_entry):
                graph.add_edge(
                    local_access, None, map_entry, e.dst_conn,
                    dace.Memlet.from_array(local_name, local_arr))
                graph.remove_edge(e)
            for e in graph.out_edges(map_entry):
                if e.data.data == inp.data:
                    e.data = dace.Memlet.simple(local_name, params[0])
        else:
            raise NotImplementedError

    # Validate and collect output access nodes: whole-container writes only.
    outputs = set()
    for _, _, dst, _, m in graph.out_edges(map_exit):
        if not isinstance(dst, nodes.AccessNode):
            raise NotImplementedError
        desc = dst.desc(sdfg)
        if not isinstance(desc, data.Array):
            raise NotImplementedError
        try:
            if list(desc.shape) != m.dst_subset.size_exact():
                # Second attempt
                # TODO: We need a solution for symbols not matching
                if str(list(desc.shape)) != str(m.dst_subset.size_exact()):
                    raise NotImplementedError
        except AttributeError:
            # Memlet has no dst_subset; fall back to its plain subset.
            if list(desc.shape) != m.subset.size_exact():
                # Second attempt
                # TODO: We need a solution for symbols not matching
                if str(list(desc.shape)) != str(m.subset.size_exact()):
                    raise NotImplementedError
        outputs.add(dst)

    for out in outputs:
        desc = out.desc(sdfg)
        if isinstance(desc, data.Scalar):
            raise NotImplementedError
        elif isinstance(desc, data.Array):
            # Mirror of the input path: local chunk gathered back into the
            # global output array on the root.
            local_name, local_arr = sdfg.add_temp_transient(
                [sympy.floor(desc.total_size / sz)],
                dtype=desc.dtype,
                storage=desc.storage)
            local_access = graph.add_access(local_name)
            scatter_node = Gather('_Gather_')
            graph.add_edge(local_access, None, scatter_node, '_inbuffer',
                           dace.Memlet.from_array(local_name, local_arr))
            graph.add_edge(root_node, None, scatter_node, '_root',
                           dace.Memlet.simple(root_name, '0'))
            graph.add_edge(scatter_node, '_outbuffer', out, None,
                           dace.Memlet.from_array(out.data, desc))
            # Reroute map-exit edges into the local chunk and rewrite
            # internal memlets to index it with the (new) map parameter.
            for e in graph.edges_between(map_exit, out):
                graph.add_edge(
                    map_exit, e.src_conn, local_access, None,
                    dace.Memlet.from_array(local_name, local_arr))
                graph.remove_edge(e)
            for e in graph.in_edges(map_exit):
                if e.data.data == out.data:
                    e.data = dace.Memlet.simple(local_name, params[0])
        else:
            raise NotImplementedError

    # Finally, shrink the map to its per-rank share.
    map_entry.map.params = params
    map_entry.map.range = subsets.Range(ranges)
def make_fpga_sdfg_independent():
    ''' Build an SDFG with two nested SDFGs in a single FPGA state.

    The SDFG has three states: a copy-to-device state for inputs
    (x, y, v, w), a middle state (explicitly marked as not an FPGA kernel)
    that nests a vector-addition SDFG (z = x + y) and a vector-multiplication
    SDFG (u = v * w), and a copy-to-host state for the outputs (z, u).
    The two nested SDFGs are independent of each other.

    :return: The validated SDFG.
    '''
    n = dace.symbol("n")
    vecWidth = 4
    vecType = dace.vector(dace.float32, vecWidth)
    sdfg = dace.SDFG("nested_sdfg_kernels")

    ###########################################################################
    # Copy data to FPGA
    copy_in_state = sdfg.add_state("copy_to_device")

    # Host-side input arrays (vectorized: n/vecWidth elements of width 4).
    sdfg.add_array("x", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("y", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("v", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("w", shape=[n / vecWidth], dtype=vecType)

    in_host_x = copy_in_state.add_read("x")
    in_host_y = copy_in_state.add_read("y")
    in_host_v = copy_in_state.add_read("v")
    in_host_w = copy_in_state.add_read("w")

    # Device-side transient copies in FPGA global memory.
    sdfg.add_array("device_x",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("device_y",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("device_v",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("device_w",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    in_device_x = copy_in_state.add_write("device_x")
    in_device_y = copy_in_state.add_write("device_y")
    in_device_v = copy_in_state.add_write("device_v")
    in_device_w = copy_in_state.add_write("device_w")

    copy_in_state.add_memlet_path(
        in_host_x,
        in_device_x,
        memlet=dace.Memlet(f"{in_host_x.data}[0:{n}/{vecWidth}]"))
    copy_in_state.add_memlet_path(
        in_host_y,
        in_device_y,
        memlet=dace.Memlet(f"{in_host_y.data}[0:{n}/{vecWidth}]"))
    copy_in_state.add_memlet_path(
        in_host_v,
        in_device_v,
        memlet=dace.Memlet(f"{in_host_v.data}[0:{n}/{vecWidth}]"))
    copy_in_state.add_memlet_path(
        in_host_w,
        in_device_w,
        memlet=dace.Memlet(f"{in_host_w.data}[0:{n}/{vecWidth}]"))

    ###########################################################################
    # Copy data from FPGA

    # Host-side output arrays.
    sdfg.add_array("z", shape=[n / vecWidth], dtype=vecType)
    sdfg.add_array("u", shape=[n / vecWidth], dtype=vecType)

    copy_out_state = sdfg.add_state("copy_to_host")

    sdfg.add_array("device_z",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)
    sdfg.add_array("device_u",
                   shape=[n / vecWidth],
                   dtype=vecType,
                   storage=dace.dtypes.StorageType.FPGA_Global,
                   transient=True)

    out_device_z = copy_out_state.add_read("device_z")
    out_host_z = copy_out_state.add_write("z")
    out_device_u = copy_out_state.add_read("device_u")
    out_host_u = copy_out_state.add_write("u")

    copy_out_state.add_memlet_path(
        out_device_z,
        out_host_z,
        memlet=dace.Memlet(f"{out_host_z.data}[0:{n}/{vecWidth}]"))
    copy_out_state.add_memlet_path(
        out_device_u,
        out_host_u,
        memlet=dace.Memlet(f"{out_host_u.data}[0:{n}/{vecWidth}]"))

    ###########################################################################
    # Non-FPGA state
    non_fpga_state = sdfg.add_state("I_do_not_want_to_be_fpga_kernel")
    # Prevent the codegen from turning this state into an FPGA kernel.
    non_fpga_state.location["is_FPGA_kernel"] = False

    # Build the vec addition SDFG and nest it
    to_nest = make_vec_add_sdfg()
    # add nested sdfg with symbol mapping
    nested_sdfg = non_fpga_state.add_nested_sdfg(to_nest, sdfg,
                                                 {"_device_x", "_device_y"},
                                                 {"_device_z"}, {"size": "n"})
    # NOTE(review): the access nodes below (in_device_x, out_device_z, ...)
    # were created in the copy states, not in non_fpga_state — confirm that
    # reusing them across states is intended here.
    non_fpga_state.add_memlet_path(
        in_device_x,
        nested_sdfg,
        dst_conn="_device_x",
        memlet=dace.Memlet(f"{in_device_x.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(
        in_device_y,
        nested_sdfg,
        dst_conn="_device_y",
        memlet=dace.Memlet(f"{in_device_y.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(
        nested_sdfg,
        out_device_z,
        src_conn="_device_z",
        memlet=dace.Memlet(f"{out_device_z.data}[0:{n}/{vecWidth}]"))

    # Build the vec multiplication SDFG and nest it
    to_nest = make_vec_mul_sdfg()
    # add nested sdfg with symbol mapping
    nested_sdfg = non_fpga_state.add_nested_sdfg(to_nest, sdfg,
                                                 {"_device_x", "_device_y"},
                                                 {"_device_z"}, {"size": "n"})
    non_fpga_state.add_memlet_path(
        in_device_v,
        nested_sdfg,
        dst_conn="_device_x",
        memlet=dace.Memlet(f"{in_device_v.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(
        in_device_w,
        nested_sdfg,
        dst_conn="_device_y",
        memlet=dace.Memlet(f"{in_device_w.data}[0:{n}/{vecWidth}]"))
    non_fpga_state.add_memlet_path(
        nested_sdfg,
        out_device_u,
        src_conn="_device_z",
        memlet=dace.Memlet(f"{out_device_u.data}[0:{n}/{vecWidth}]"))

    ######################################
    # Interstate edges
    sdfg.add_edge(copy_in_state, non_fpga_state,
                  dace.sdfg.sdfg.InterstateEdge())
    sdfg.add_edge(non_fpga_state, copy_out_state,
                  dace.sdfg.sdfg.InterstateEdge())

    sdfg.fill_scope_connectors()
    sdfg.validate()
    return sdfg
def make_vec_add_sdfg(dtype=dace.float32):
    """Build a standalone vector-addition SDFG (z = x + y).

    The SDFG operates on width-4 vectors of ``dtype`` stored in FPGA global
    memory, computing element-wise addition in a single FPGA-scheduled map.

    :param dtype: Element data type of the vectors (default: float32).
    :return: The validated vector-addition SDFG.
    """
    lanes = 4
    size_sym = dace.symbol("size")
    vtype = dace.vector(dtype, lanes)

    result = dace.SDFG("vec_add")
    state = result.add_state("vec_add_state")

    # The three I/O containers share shape, type, and FPGA global storage.
    for array_name in ('_device_x', '_device_y', '_device_z'):
        result.add_array(array_name,
                         shape=[size_sym / lanes],
                         dtype=vtype,
                         storage=dace.dtypes.StorageType.FPGA_Global)

    x_node = state.add_read("_device_x")
    y_node = state.add_read("_device_y")
    z_node = state.add_write("_device_z")

    # ---------- ----------
    # COMPUTE
    # ---------- ----------
    entry_node, exit_node = state.add_map(
        'vecAdd_map',
        dict(i=f'0:{size_sym}/{lanes}'),
        schedule=dace.dtypes.ScheduleType.FPGA_Device)

    add_tasklet = state.add_tasklet('vec_add_task', ['x_con', 'y_con'],
                                    ['z_con'], 'z_con = x_con + y_con')

    state.add_memlet_path(x_node,
                          entry_node,
                          add_tasklet,
                          dst_conn='x_con',
                          memlet=dace.Memlet(f"{x_node.data}[i]"))
    state.add_memlet_path(y_node,
                          entry_node,
                          add_tasklet,
                          dst_conn='y_con',
                          memlet=dace.Memlet(f"{y_node.data}[i]"))
    state.add_memlet_path(add_tasklet,
                          exit_node,
                          z_node,
                          src_conn='z_con',
                          memlet=dace.Memlet(f"{z_node.data}[i]"))

    #########
    # Validate
    result.fill_scope_connectors()
    result.validate()
    return result
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np M = dace.symbol("M") K = dace.symbol("K") @dace.program def transpose_add(A: dace.float32[M, K], B: dace.float32[K, M]): for i, j in dace.map[0:M, 0:K]: B[j, i] = A[i, j] + 1 def test_inline_scalar(): K.set(24) M.set(25) A = np.random.rand(25, 24).astype(np.float32) B = np.random.rand(24, 25).astype(np.float32) transpose_add(A, B) diff = np.linalg.norm(A.transpose() - B + 1) print('Difference:', diff) assert diff < 1e-5 if __name__ == '__main__': test_inline_scalar()
def make_sdfg(implementation, dtype, storage=dace.StorageType.Default):
    """Build a dot-product SDFG around the BLAS ``Dot`` library node.

    :param implementation: Name of the Dot node implementation to use.
    :param dtype: Element data type of the input vectors.
    :param storage: Storage type for the compute arrays. When not Default,
        the compute arrays become transients with the "_device" suffix, and
        extra copy-to-device / copy-to-host states are added around the
        dataflow state.
    :return: The constructed SDFG.
    """
    n = dace.symbol("n")

    # When targeting non-default storage, compute on suffixed transients and
    # add host<->device copy states below.
    suffix = "_device" if storage != dace.StorageType.Default else ""
    transient = storage != dace.StorageType.Default

    sdfg = dace.SDFG("dot_product_{}_{}".format(implementation, dtype))
    state = sdfg.add_state("dataflow")

    sdfg.add_array("x" + suffix, [n],
                   dtype,
                   storage=storage,
                   transient=transient)
    sdfg.add_array("y" + suffix, [n],
                   dtype,
                   storage=storage,
                   transient=transient)
    sdfg.add_array("result" + suffix, [1],
                   dtype,
                   storage=storage,
                   transient=transient)

    x = state.add_read("x" + suffix)
    y = state.add_read("y" + suffix)
    result = state.add_write("result" + suffix)

    dot_node = blas.nodes.dot.Dot("dot")
    dot_node.implementation = implementation

    state.add_memlet_path(x,
                          dot_node,
                          dst_conn="_x",
                          memlet=Memlet.simple(x, "0:n", num_accesses=n))
    state.add_memlet_path(y,
                          dot_node,
                          dst_conn="_y",
                          memlet=Memlet.simple(y, "0:n", num_accesses=n))
    # TODO: remove -1 once this no longer triggers a write in the codegen.
    state.add_memlet_path(dot_node,
                          result,
                          src_conn="_result",
                          memlet=Memlet.simple(result, "0", num_accesses=-1))

    if storage != dace.StorageType.Default:
        # Host-side (non-transient) arrays plus copy states around the
        # dataflow state.
        sdfg.add_array("x", [n], dtype)
        sdfg.add_array("y", [n], dtype)
        sdfg.add_array("result", [1], dtype)

        init_state = sdfg.add_state("copy_to_device")
        sdfg.add_edge(init_state, state, dace.InterstateEdge())

        x_host = init_state.add_read("x")
        y_host = init_state.add_read("y")
        x_device = init_state.add_write("x" + suffix)
        y_device = init_state.add_write("y" + suffix)
        init_state.add_memlet_path(x_host,
                                   x_device,
                                   memlet=Memlet.simple(x_host,
                                                        "0:n",
                                                        num_accesses=n))
        init_state.add_memlet_path(y_host,
                                   y_device,
                                   memlet=Memlet.simple(y_host,
                                                        "0:n",
                                                        num_accesses=n))

        finalize_state = sdfg.add_state("copy_to_host")
        sdfg.add_edge(state, finalize_state, dace.InterstateEdge())

        # NOTE(review): add_write is used for the device node (which is the
        # copy source) and add_read for the host node (the destination) —
        # looks inverted relative to the init state; confirm intent.
        result_device = finalize_state.add_write("result" + suffix)
        result_host = finalize_state.add_read("result")
        finalize_state.add_memlet_path(result_device,
                                       result_host,
                                       memlet=Memlet.simple(result_device,
                                                            "0",
                                                            num_accesses=1))

    return sdfg
#!/usr/bin/env python from __future__ import print_function import argparse import dace import numpy as np M = dace.symbol('M') K = dace.symbol('K') N = dace.symbol('N') @dace.program(dace.float64[M, K], dace.float64[K, N], dace.float64[M, N]) def gemm(A, B, C): # Transient variable tmp = dace.define_local([M, N, K], dtype=A.dtype) for ignore in dace.map[0:1024]: @dace.map(_[0:M, 0:N, 0:K]) def multiplication(i, j, k): in_A << A[i, k] in_B << B[k, j] out >> tmp[i, j, k] out = in_A * in_B dace.reduce(lambda a, b: a + b, tmp, C, axis=2, identity=0) if __name__ == "__main__":
def make_sdfg(tasklet_code=None, name="veclen_copy_conversion", dtype=dace.float32, veclen=16):
    """Build an FPGA SDFG that streams the vector-typed array ``a`` through a
    per-element tasklet into ``b``.

    The compute state reads ``veclen``-wide vectors from memory into a
    stream, unpacks each vector into a scalar buffer, applies
    ``tasklet_code`` (default: identity copy) once per scalar via an
    unrolled map, repacks the buffer into an output stream, and writes it
    back to memory. Pre/post states copy the host arrays to/from the
    FPGA-global device arrays.

    :param tasklet_code: Code for the per-element tasklet (``_in`` -> ``_out``).
    :param name: Base name for the SDFG, its states, and its maps.
    :param dtype: Scalar element type of the data.
    :param veclen: Vectorization width.
    :return: The constructed ``dace.SDFG``.
    """
    # Fix: build the vector type from the requested ``dtype``. It was
    # hard-coded to float32, which disagreed with the scalar buffers below
    # (declared with ``dtype``) for any other element type.
    vtype = dace.vector(dtype, veclen)
    if tasklet_code is None:
        tasklet_code = "_out = _in"
    n = dace.symbol("N")
    sdfg = dace.SDFG(name)
    pre_state = sdfg.add_state(name + "_pre")
    state = sdfg.add_state(name)
    post_state = sdfg.add_state(name + "_post")
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    # Host arrays hold N/veclen vector elements.
    _, desc_input_host = sdfg.add_array("a", (n // veclen, ), vtype)
    _, desc_output_host = sdfg.add_array("b", (n // veclen, ), vtype)
    # Device-side transient mirrors in FPGA global memory, on separate banks.
    desc_input_device = copy.copy(desc_input_host)
    desc_input_device.storage = dace.StorageType.FPGA_Global
    desc_input_device.location["bank"] = 0
    desc_input_device.transient = True
    desc_output_device = copy.copy(desc_output_host)
    desc_output_device.storage = dace.StorageType.FPGA_Global
    desc_output_device.location["bank"] = 1
    desc_output_device.transient = True
    sdfg.add_datadesc("a_device", desc_input_device)
    sdfg.add_datadesc("b_device", desc_output_device)

    # Host to device
    pre_read = pre_state.add_read("a")
    pre_write = pre_state.add_write("a_device")
    pre_state.add_memlet_path(pre_read, pre_write, memlet=dace.Memlet(pre_write.data, None))

    # Device to host
    post_read = post_state.add_read("b_device")
    post_write = post_state.add_write("b")
    post_state.add_memlet_path(post_read, post_write, memlet=dace.Memlet(post_write.data, None))

    # Compute state
    read_memory = state.add_read("a_device")
    write_memory = state.add_write("b_device")

    # Memory streams decoupling memory access from the compute pipeline.
    sdfg.add_stream("a_stream", vtype, storage=dace.StorageType.FPGA_Local, transient=True)
    sdfg.add_stream("b_stream", vtype, storage=dace.StorageType.FPGA_Local, transient=True)
    produce_input_stream = state.add_write("a_stream")
    consume_input_stream = state.add_read("a_stream")
    produce_output_stream = state.add_write("b_stream")
    # Fix: this node only serves as the *source* of the stream-to-memory
    # copy below, so it is a read access (was created with add_write).
    consume_output_stream = state.add_read("b_stream")

    tasklet = state.add_tasklet(name, {"_in"}, {"_out"}, tasklet_code)

    # Iterative map over the N/veclen vector elements.
    # (Renamed from ``entry``/``exit``: ``exit`` shadows the builtin.)
    map_entry, map_exit = state.add_map(name, {
        "i": "0:N//{}".format(veclen),
    },
                                        schedule=dace.ScheduleType.FPGA_Device)

    # Unrolled map over the veclen scalars within one vector.
    unroll_entry, unroll_exit = state.add_map(name + "_unroll", {"u": "0:{}".format(veclen)},
                                              schedule=dace.ScheduleType.FPGA_Device,
                                              unroll=True)

    # Container-to-container copies between arrays and streams
    state.add_memlet_path(read_memory, produce_input_stream, memlet=dace.Memlet(read_memory.data))
    state.add_memlet_path(consume_output_stream, write_memory, memlet=dace.Memlet(write_memory.data))

    # Container-to-container copy from vectorized stream to non-vectorized
    # buffer
    sdfg.add_array("a_buffer", (veclen, ), dtype, storage=dace.StorageType.FPGA_Local, transient=True)
    sdfg.add_array("b_buffer", (veclen, ), dtype, storage=dace.StorageType.FPGA_Local, transient=True)
    a_buffer = state.add_access("a_buffer")
    b_buffer = state.add_access("b_buffer")

    # Input stream to buffer
    state.add_memlet_path(consume_input_stream,
                          map_entry,
                          a_buffer,
                          memlet=dace.Memlet.simple(consume_input_stream.data,
                                                    "0",
                                                    other_subset_str="0:{}".format(veclen)))

    # Buffer to tasklet
    state.add_memlet_path(a_buffer,
                          unroll_entry,
                          tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet.simple(a_buffer.data, "u", num_accesses=1))

    # Tasklet to buffer
    state.add_memlet_path(tasklet,
                          unroll_exit,
                          b_buffer,
                          src_conn="_out",
                          memlet=dace.Memlet.simple(b_buffer.data, "u", num_accesses=1))

    # Buffer to output stream
    state.add_memlet_path(b_buffer,
                          map_exit,
                          produce_output_stream,
                          memlet=dace.Memlet.simple(produce_output_stream.data,
                                                    "0",
                                                    other_subset_str="0:{}".format(veclen),
                                                    num_accesses=1))

    return sdfg
def test_indirection_with_reindex(language):
    """Indirect indexing through promoted scalar indices, with a reindexed
    ("S:N") view of the input and output arrays."""
    N = dace.symbol('N')
    S = dace.symbol('S')

    sdfg = dace.SDFG("test_indirection_with_reindex")
    sdfg.add_array('A', shape=[N], dtype=dace.float32, transient=False)
    for idx in range(3):
        sdfg.add_array(f'index_{idx}', shape=[1], dtype=dace.int32, transient=True)
    sdfg.add_array('out', shape=[N], dtype=dace.float32, transient=False)
    sdfg.add_symbol('S', S.dtype)

    # Three sequential init states, then the compute state.
    init_states = [sdfg.add_state() for _ in range(3)]
    state_compute = sdfg.add_state()
    chain = init_states + [state_compute]
    for pred, succ in zip(chain[:-1], chain[1:]):
        sdfg.add_edge(pred, succ, dace.InterstateEdge())

    # Each init state stores the constant idx + 1 into index_<idx>.
    for idx, init_state in enumerate(init_states):
        init_tasklet = init_state.add_tasklet(name=f"init{idx + 1}",
                                              inputs=[],
                                              outputs=["out"],
                                              code=f"out = {idx + 1};",
                                              language=dace.Language.CPP)
        init_state.add_memlet_path(init_tasklet,
                                   init_state.add_write(f'index_{idx}'),
                                   src_conn="out",
                                   memlet=dace.Memlet(expr=f'index_{idx}', subset='0'))

    semicolon = ';' if language == dace.Language.CPP else ''
    tasklet = state_compute.add_tasklet(name="add",
                                        inputs=["_A", "_index_0", "_index_1", "_index_2"],
                                        outputs=["_out"],
                                        code=f"_out[_index_2] = _A[_index_0] + _A[_index_1]{semicolon}",
                                        language=language)

    # Inputs: A is viewed through the reindexed subset S:N, the indices as
    # single scalars.
    for conn in ("_A", "_index_0", "_index_1", "_index_2"):
        data = conn[1:]
        subset = "S:N" if data == "A" else "0"
        state_compute.add_memlet_path(state_compute.add_read(data),
                                      tasklet,
                                      dst_conn=conn,
                                      memlet=dace.Memlet(expr=data, subset=subset))
    state_compute.add_memlet_path(tasklet,
                                  state_compute.add_write("out"),
                                  src_conn="_out",
                                  memlet=dace.Memlet(expr="out", subset="S:N"))

    scalar_to_symbol.promote_scalars_to_symbols(sdfg)
    sdfg.simplify()

    A = np.arange(10, dtype=np.float32)
    out = np.zeros((10, ), dtype=np.float32)
    sdfg(A=A, out=out, N=10, S=5)

    # With S=5: indices 1 and 2 select A[5+1] and A[5+2]; index 3 writes
    # the sum to out[5+3].
    assert np.allclose(A[6] + A[7], out[8])
    sdfg.validate()


def test_reverse_copy():
    """Copying between negatively-indexed rows of the same array."""

    @dace.program
    def redarrtest(p: dace.float64[20, 20]):
        # Copy the second-to-last row over the last row.
        p[-1, :] = p[-2, :]

    p = np.random.rand(20, 20)
    pp = np.copy(p)
    pp[-1, :] = pp[-2, :]  # NumPy reference result
    redarrtest(p)
    assert np.allclose(p, pp)


# Symbolic sizes: batch N, image H x W, kernel K, channels C_in -> C_out.
C_in, C_out, H, K, N, W = (dace.symbol(s, dace.int64) for s in ('C_in', 'C_out', 'H', 'K', 'N', 'W'))


# Deep learning convolutional operator (stride = 1)
@dace.program
def conv2d(input: dace.float32[N, H, W, C_in], weights: dace.float32[K, K, C_in, C_out]):
    # "Valid" convolution: output spatial size shrinks by K - 1.
    output = np.ndarray((N, H - K + 1, W - K + 1, C_out), dtype=np.float32)

    # Loop structure adapted from https://github.com/SkalskiP/ILearnDeepLearning.py/blob/ba0b5ba589d4e656141995e8d1a06d44db6ce58d/01_mysteries_of_neural_networks/06_numpy_convolutional_neural_net/src/layers/convolutional.py#L88
    # for i, j in dace.map[0:H-K+1, 0:W-K+1]:
    for i in range(H - K + 1):
        for j in range(W - K + 1):
            output[:, i, j, :] = np.sum(
                input[:, i:i + K, j:j + K, :, np.newaxis] *
import math
import dace

# Polybench is optional; benchmarks degrade gracefully without it.
try:
    import polybench
except ImportError:
    polybench = None

# Symbolic matrix dimensions of the kernel.
NI = dace.symbol('NI')
NJ = dace.symbol('NJ')
NK = dace.symbol('NK')
NL = dace.symbol('NL')

#datatypes = [dace.float64, dace.int32, dace.float32]
datatype = dace.float64

# Dataset sizes
sizes = [{
    NI: 16,
    NJ: 18,
    NK: 22,
    NL: 24
}, {
    NI: 40,
    NJ: 50,
    NK: 70,
    NL: 80
}, {
    NI: 180,
    NJ: 190,
    NK: 210,
    NL: 220
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np # Declaration of symbolic variables N, BS = (dace.symbol(name) for name in ['N', 'BS']) @dace.program def seq_cond(HD: dace.complex128[N, BS, BS], HE: dace.complex128[N, BS, BS], HF: dace.complex128[N, BS, BS], sigmaRSD: dace.complex128[N, BS, BS], sigmaRSE: dace.complex128[N, BS, BS], sigmaRSF: dace.complex128[N, BS, BS]): for n in range(N): if n < N - 1: HE[n] -= sigmaRSE[n] else: HE[n] = -sigmaRSE[n] if n > 0: HF[n] -= sigmaRSF[n] else: HF[n] = -sigmaRSF[n] HD[n] = HD[n] - sigmaRSD[n] def test(): seq_cond.compile()
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. """ A test for the ElementWiseArrayOperation transformation. """ import dace import numpy as np from dace.transformation.dataflow import ElementWiseArrayOperation import pytest N = dace.symbol('N', dtype=dace.int64) @dace.program def eao_mpi(A: dace.float64[N], B: dace.float64[N]): return A * B @pytest.mark.mpi def test_eao_mpi(): from mpi4py import MPI as MPI4PY comm = MPI4PY.COMM_WORLD rank = comm.Get_rank() commsize = comm.Get_size() mpi_sdfg = None if commsize < 2: raise ValueError("This test is supposed to be run with at least two processes!") for r in range(0, commsize): if r == rank: mpi_sdfg = eao_mpi.to_sdfg(simplify=True) mpi_sdfg.apply_transformations(ElementWiseArrayOperation) mpi_exec = mpi_sdfg.compile() comm.Barrier()
mpi_sdfg(x=A, y=B, src=src, dest=dest, tag=tag, n=size) # now B should be an array of size, containing srank if not np.allclose(B, np.full(size, srank, dtype=dtype)): raise (ValueError("The received values are not what I expected.")) # TODO: The test deadlocks in the CI (Ubuntu 18.04, MPICH 3.3a2) # but works fine in up-to-date systems, including when using pytest. @pytest.mark.skip def test_mpi(): _test_mpi("MPI Send/Recv", make_sdfg(np.float64), np.float64) ############################################################################### myrank = dace.symbol('myrank', dtype=dace.int32) mysize = dace.symbol('mysize', dtype=dace.int32) @dace.program def dace_send_recv(): tmp1 = np.full([1], myrank, dtype=np.int32) tmp2 = np.zeros([1], dtype=np.int32) if myrank == 0: dace.comm.Send(tmp1, 1, tag=42) dace.comm.Recv(tmp2, mysize - 1, tag=42) else: dace.comm.Recv(tmp2, (myrank - 1) % mysize, tag=42) dace.comm.Send(tmp1, (myrank + 1) % mysize, tag=42) return tmp2
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. import unittest import dace import numpy as np from dace.transformation.dataflow import MapTiling, OutLocalStorage N = dace.symbol('N') @dace.program def arange(): out = np.ndarray([N], np.int32) for i in dace.map[0:N]: with dace.tasklet: o >> out[i] o = i return out class LocalStorageTests(unittest.TestCase): def test_even(self): sdfg = arange.to_sdfg() sdfg.apply_transformations([MapTiling, OutLocalStorage], options=[{ 'tile_sizes': [8] }, {}]) self.assertTrue( np.array_equal(sdfg(N=16), np.arange(16, dtype=np.int32))) def test_uneven(self): # For testing uneven decomposition, use longer buffer and ensure
def make_sdfg(dtype):
    """Build an SDFG with a non-blocking MPI Isend of ``x``, an Irecv into
    ``y``, and a Wait on the receive request that also extracts the status
    source and tag.

    NOTE(review): ``send_req`` is written but no Wait is attached to it in
    this state, and ``stat_count``/``stat_cancelled`` are declared but never
    accessed — presumably intentional for this test; confirm against the
    Isend/Irecv/Wait node definitions.
    """
    n = dace.symbol("n")

    sdfg = dace.SDFG("mpi_send_recv")
    state = sdfg.add_state("dataflow")

    # Data arrays and scalar rank/tag parameters.
    sdfg.add_array("x", [n], dtype, transient=False)
    sdfg.add_array("y", [n], dtype, transient=False)
    sdfg.add_array("src", [1], dace.dtypes.int32, transient=False)
    sdfg.add_array("dest", [1], dace.dtypes.int32, transient=False)
    sdfg.add_array("tag", [1], dace.dtypes.int32, transient=False)
    # Opaque MPI_Request handles and status output fields (transient).
    sdfg.add_array("send_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True)
    sdfg.add_array("recv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True)
    sdfg.add_array("stat_source", [1], dace.dtypes.int32, transient=True)
    sdfg.add_array("stat_count", [1], dace.dtypes.int32, transient=True)
    sdfg.add_array("stat_tag", [1], dace.dtypes.int32, transient=True)
    sdfg.add_array("stat_cancelled", [1], dace.dtypes.int32, transient=True)

    x = state.add_access("x")
    y = state.add_access("y")
    src = state.add_access("src")
    dest = state.add_access("dest")
    tag = state.add_access("tag")
    send_req = state.add_access("send_req")
    recv_req = state.add_access("recv_req")
    stat_source = state.add_access("stat_source")
    stat_tag = state.add_access("stat_tag")

    send_node = mpi.nodes.isend.Isend("isend")
    recv_node = mpi.nodes.irecv.Irecv("irecv")
    wait_node = mpi.nodes.wait.Wait("wait")

    # Isend: buffer x to rank ``dest`` with ``tag``; emits send_req.
    state.add_memlet_path(x, send_node, dst_conn="_buffer", memlet=Memlet.simple(x, "0:n", num_accesses=n))
    state.add_memlet_path(send_node,
                          send_req,
                          src_conn="_request",
                          memlet=Memlet.simple(send_req, "0:1", num_accesses=1))
    state.add_memlet_path(dest, send_node, dst_conn="_dest", memlet=Memlet.simple(dest, "0:1", num_accesses=1))
    state.add_memlet_path(tag, send_node, dst_conn="_tag", memlet=Memlet.simple(tag, "0:1", num_accesses=1))

    # Irecv: buffer y from rank ``src`` with ``tag``; emits recv_req.
    state.add_memlet_path(recv_node, y, src_conn="_buffer", memlet=Memlet.simple(y, "0:n", num_accesses=n))
    state.add_memlet_path(recv_node,
                          recv_req,
                          src_conn="_request",
                          memlet=Memlet.simple(recv_req, "0:1", num_accesses=1))

    # Wait on the receive request; exposes the status source and tag.
    state.add_memlet_path(recv_req,
                          wait_node,
                          dst_conn="_request",
                          memlet=Memlet.simple(recv_req, "0:1", num_accesses=1))
    state.add_memlet_path(wait_node,
                          stat_tag,
                          src_conn="_stat_tag",
                          memlet=Memlet.simple(stat_tag, "0:1", num_accesses=1))
    state.add_memlet_path(wait_node,
                          stat_source,
                          src_conn="_stat_source",
                          memlet=Memlet.simple(stat_source, "0:1", num_accesses=1))
    state.add_memlet_path(src, recv_node, dst_conn="_src", memlet=Memlet.simple(src, "0:1", num_accesses=1))
    state.add_memlet_path(tag, recv_node, dst_conn="_tag", memlet=Memlet.simple(tag, "0:1", num_accesses=1))

    return sdfg
#!/usr/bin/env python from __future__ import print_function import argparse import dace import math import numpy as np W = dace.symbol('W') H = dace.symbol('H') @dace.program(dace.float32[H, W], dace.float32[H, W]) def transpose(A, B): @dace.map(_[0:H, 0:W]) def compute(i, j): a << A[j, i] b >> B[i, j] b = a if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("W", type=int, nargs="?", default=64) parser.add_argument("H", type=int, nargs="?", default=64) args = vars(parser.parse_args()) A = dace.ndarray([H, W], dtype=dace.float32) B = dace.ndarray([H, W], dtype=dace.float32)
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. import math import numpy as np import dace as dp from dace.sdfg import SDFG from dace.memlet import Memlet N = dp.symbol('N') sdfg = SDFG('tlstream') state = sdfg.add_state('doit') localarr = state.add_transient('la', [10], dp.float32) localstream = state.add_stream('ls', dp.float32, 1, transient=True) globalstream = state.add_stream('gs', dp.float32, 1, transient=True) globalarr = state.add_array('ga', [N], dp.float32) me, mx = state.add_map('par', dict(i='0:N')) tasklet = state.add_tasklet('arange', set(), {'a'}, 'a = i') state.add_nedge(me, tasklet, Memlet()) state.add_edge(tasklet, 'a', localstream, None, Memlet.from_array(localstream.data, localstream.desc(sdfg))) state.add_nedge(localstream, localarr, Memlet.from_array(localarr.data, localarr.desc(sdfg))) state.add_nedge(localarr, mx, Memlet.from_array(globalstream.data, globalstream.desc(sdfg))) state.add_nedge(mx, globalstream, Memlet.from_array(globalstream.data, globalstream.desc(sdfg))) state.add_nedge(globalstream, globalarr, Memlet.from_array(globalarr.data, globalarr.desc(sdfg)))
#!/usr/bin/env python import dace import numpy as np import scipy W = dace.symbol('W') H = dace.symbol('H') nnz = dace.symbol('nnz') @dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H]) def spmv(A_row, A_col, A_val, x, b): @dace.mapscope(_[0:H]) def compute_row(i): @dace.map(_[A_row[i]:A_row[i + 1]]) def compute(j): a << A_val[j] in_x << x[A_col[j]] out >> b(1, lambda x, y: x + y)[i] out = a * in_x def test_dynamic_map(): height = 1024 width = 1024 # Prepare spmv SDFG for GPU sdfg = spmv.to_sdfg()
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. import math import dace import polybench NQ = dace.symbol('NQ') NR = dace.symbol('NR') NP = dace.symbol('NP') #datatypes = [dace.float64, dace.int32, dace.float32] datatype = dace.float64 # Dataset sizes sizes = [{ NQ: 8, NR: 10, NP: 12 }, { NQ: 20, NR: 25, NP: 30 }, { NQ: 40, NR: 50, NP: 60 }, { NQ: 140, NR: 150, NP: 160 }, { NQ: 220,