def validate(self, sdfg, state): """ :return: A three-tuple (buffer, root) of the three data descriptors in the parent SDFG. """ inbuffer, outbuffer, root = None, None, None for e in state.out_edges(self): if e.src_conn == "_outbuffer": outbuffer = sdfg.arrays[e.data.data] for e in state.in_edges(self): if e.dst_conn == "_inbuffer": inbuffer = sdfg.arrays[e.data.data] if e.dst_conn == "_root": root = sdfg.arrays[e.data.data] if root.dtype.base_type != dtypes.int32: raise (ValueError("Scatter root must be an integer!")) in_count_str = "XXX" out_count_str = "XXX" for _, src_conn, _, _, data in state.out_edges(self): if src_conn == '_outbuffer': dims = [symstr(e) for e in data.subset.size_exact()] out_count_str = "*".join(dims) for _, _, _, dst_conn, data in state.in_edges(self): if dst_conn == '_inbuffer': dims = [symstr(e) for e in data.subset.size_exact()] in_count_str = "*".join(dims) return (inbuffer, in_count_str), (outbuffer, out_count_str), root
def propagate(self, array, dim_exprs, node_range): result_begin = None result_end = None # Iterate over the node dimensions for idx, node_r in enumerate(node_range): # Get dimension range if len(node_r) == 3: node_rb, node_re, node_rs = node_r elif len(node_r) == 4: node_rb, node_re, node_rs, _ = node_r else: raise NotImplementedError # Get true range end lastindex = node_re if node_rs != 1: lastindex = symbolic.pystr_to_symbolic( '%s + int_floor(%s - %s, %s) * %s' % (symbolic.symstr(node_rb), symbolic.symstr(node_re), symbolic.symstr(node_rb), symbolic.symstr(node_rs), symbolic.symstr(node_rs))) if isinstance(dim_exprs, list): dim_exprs = dim_exprs[0] if isinstance(dim_exprs, tuple): if len(dim_exprs) == 3: rb, re, rs = dim_exprs elif len(dim_exprs) == 4: rb, re, rs, _ = dim_exprs else: raise NotImplementedError rb = symbolic.pystr_to_symbolic(rb) re = symbolic.pystr_to_symbolic(re) rs = symbolic.pystr_to_symbolic(rs) else: rb, re = (dim_exprs, dim_exprs) if result_begin is None: result_begin = rb.subs(self.params[idx], node_rb) else: result_begin = result_begin.subs(self.params[idx], node_rb) if result_end is None: result_end = re.subs(self.params[idx], lastindex) else: result_end = result_end.subs(self.params[idx], lastindex) result_skip = 1 result_tile = 1 return (result_begin, result_end, result_skip, result_tile)
def sym2cpp(s, arrayexprs: Optional[Set[str]] = None) -> Union[str, List[str]]: """ Converts an array of symbolic variables (or one) to C++ strings. :param s: Symbolic expression to convert. :param arrayexprs: Set of names of arrays, used to convert SymPy user-functions back to array expressions. :return: C++-compilable expression or list thereof. """ if not isinstance(s, list): return cppunparse.pyexpr2cpp(symbolic.symstr(s, arrayexprs)) return [cppunparse.pyexpr2cpp(symbolic.symstr(d, arrayexprs)) for d in s]
def convert_index(r): if len(r) == 3: if r[2] != 1: return "{}:{}:{}".format(symbolic.symstr(r[0]), symbolic.symstr(r[1]), symbolic.symstr(r[2])) else: return "{}:{}".format(symbolic.symstr(r[0]), symbolic.symstr(r[1])) else: raise ValueError("Unsupported range: " + str(r))
def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): # Take care of map header assert len(dfg_scope.source_nodes()) == 1 map_header = dfg_scope.source_nodes()[0] function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', sdfg, state_id, map_header) # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) callsite_stream.write('{', sdfg, state_id, map_header) if len(map_header.map.params) > 1: raise NotImplementedError( 'Multi-dimensional MPI maps are not supported') state = sdfg.node(state_id) symtypes = map_header.new_symbols(sdfg, state, state.symbols_defined_at(map_header)) for var, r in zip(map_header.map.params, map_header.map.range): begin, end, skip = r callsite_stream.write('{\n', sdfg, state_id, map_header) callsite_stream.write( '%s %s = %s + __dace_comm_rank * (%s);\n' % (symtypes[var], var, cppunparse.pyexpr2cpp(symbolic.symstr(begin)), cppunparse.pyexpr2cpp(symbolic.symstr(skip))), sdfg, state_id, map_header) to_allocate = dace.sdfg.local_transients(sdfg, dfg_scope, map_header) allocated = set() for child in dfg_scope.scope_children()[map_header]: if not isinstance(child, nodes.AccessNode): continue if child.data not in to_allocate or child.data in allocated: continue allocated.add(child.data) self._dispatcher.dispatch_allocate(sdfg, dfg_scope, state_id, child, function_stream, callsite_stream) self._dispatcher.dispatch_subgraph(sdfg, dfg_scope, state_id, function_stream, callsite_stream, skip_entry_node=True)
def validate(self, sdfg, state): """ :return: A three-tuple (buffer, root) of the three data descriptors in the parent SDFG. """ inbuffer, outbuffer, src, tag = None, None, None, None for e in state.out_edges(self): if e.src_conn == "_outbuffer": outbuffer = sdfg.arrays[e.data.data] for e in state.in_edges(self): if e.dst_conn == "_inbuffer": inbuffer = sdfg.arrays[e.data.data] if e.dst_conn == "_root": root = sdfg.arrays[e.data.data] if inbuffer != outbuffer: raise ValueError("Bcast input and output buffer must be the same!") if root.dtype.base_type != dtypes.int32 and root.dtype.base_type != dtypes.int64: raise ValueError("Bcast root must be an integer!") count_str = "XXX" for _, src_conn, _, _, data in state.out_edges(self): if src_conn == '_outbuffer': dims = [symstr(e) for e in data.subset.size_exact()] count_str = "*".join(dims) return (inbuffer, count_str), root
def replace_properties(node: Any, symrepl: Dict[symbolic.symbol, symbolic.SymbolicType], name: str, new_name: str): for propclass, propval in node.properties(): if propval is None: continue pname = propclass.attr_name if isinstance(propclass, properties.SymbolicProperty): setattr(node, pname, propval.subs(symrepl)) elif isinstance(propclass, properties.DataProperty): if propval == name: setattr(node, pname, new_name) elif isinstance(propclass, (properties.RangeProperty, properties.ShapeProperty)): setattr(node, pname, _replsym(list(propval), symrepl)) elif isinstance(propclass, properties.CodeProperty): # Don't replace variables that appear as an input or an output # connector, as this should shadow the outer declaration. if hasattr(node, 'in_connectors'): if name in node.in_connectors: continue if hasattr(node, 'out_connectors'): if name in node.out_connectors: continue if isinstance(propval.code, str): if str(name) != str(new_name): lang = propval.language newcode = propval.code if not re.findall(r'[^\w]%s[^\w]' % name, newcode): continue if lang is dtypes.Language.CPP: # Replace in C++ code # Avoid import loop from dace.codegen.targets.cpp import sym2cpp # Use local variables and shadowing to replace replacement = 'auto %s = %s;\n' % (name, sym2cpp(new_name)) propval.code = replacement + newcode else: warnings.warn( 'Replacement of %s with %s was not made ' 'for string tasklet code of language %s' % (name, new_name, lang)) elif propval.code is not None: for stmt in propval.code: ASTFindReplace({ name: symbolic.symstr(new_name) }).visit(stmt) elif (isinstance(propclass, properties.DictProperty) and pname == 'symbol_mapping'): # Symbol mappings for nested SDFGs for symname, sym_mapping in propval.items(): try: propval[symname] = symbolic.pystr_to_symbolic( str(sym_mapping)).subs(symrepl) except AttributeError: # If the symbolified value has no subs pass
def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): # Take care of map header assert len(dfg_scope.source_nodes()) == 1 map_header = dfg_scope.source_nodes()[0] function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', sdfg, state_id, map_header) if len(map_header.map.params) > 1: raise NotImplementedError( 'Multi-dimensional MPI maps are not supported') for var, r in zip(map_header.map.params, map_header.map.range): begin, end, skip = r callsite_stream.write('{\n', sdfg, state_id, map_header) callsite_stream.write( 'auto %s = %s + __dace_comm_rank * (%s);\n' % (var, cppunparse.pyexpr2cpp(symbolic.symstr(begin)), cppunparse.pyexpr2cpp(symbolic.symstr(skip))), sdfg, state_id, map_header) to_allocate = dace.sdfg.local_transients(sdfg, dfg_scope, map_header) allocated = set() for child in dfg_scope.scope_dict(node_to_children=True)[map_header]: if not isinstance(child, nodes.AccessNode): continue if child.data not in to_allocate or child.data in allocated: continue allocated.add(child.data) self._dispatcher.dispatch_allocate(sdfg, dfg_scope, state_id, child, function_stream, callsite_stream) self._dispatcher.dispatch_initialize(sdfg, dfg_scope, state_id, child, function_stream, callsite_stream) self._dispatcher.dispatch_subgraph(sdfg, dfg_scope, state_id, function_stream, callsite_stream, skip_entry_node=True)
def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): # Take care of map header assert len(dfg_scope.source_nodes()) == 1 map_header = dfg_scope.source_nodes()[0] function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', sdfg, state_id, map_header) # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) callsite_stream.write('{', sdfg, state_id, map_header) if len(map_header.map.params) > 1: raise NotImplementedError( 'Multi-dimensional MPI maps are not supported') state = sdfg.node(state_id) symtypes = map_header.new_symbols(sdfg, state, state.symbols_defined_at(map_header)) for var, r in zip(map_header.map.params, map_header.map.range): begin, end, skip = r callsite_stream.write('{\n', sdfg, state_id, map_header) callsite_stream.write( '%s %s = %s + __dace_comm_rank * (%s);\n' % (symtypes[var], var, cppunparse.pyexpr2cpp(symbolic.symstr(begin)), cppunparse.pyexpr2cpp(symbolic.symstr(skip))), sdfg, state_id, map_header) self._frame.allocate_arrays_in_scope(sdfg, map_header, function_stream, callsite_stream) self._dispatcher.dispatch_subgraph(sdfg, dfg_scope, state_id, function_stream, callsite_stream, skip_entry_node=True)
def replace_param(param): param = symbolic.symstr(param) for p, pval in param_to_edge.items(): # TODO: This special replacement condition will be removed # when the code generator is modified to make consistent # scalar/array decisions. if (isinstance(nsdfg.arrays[pval.data.data], data.Scalar) or (nsdfg.arrays[pval.data.data].shape[0] == 1 and len(nsdfg.arrays[pval.data.data].shape) == 1)): param = param.replace(p, pval.data.data) else: param = param.replace(p, cpp_array_expr(nsdfg, pval.data)) return param
def infer_expr_type(code, symbols=None): """ Return inferred type of a given expression :param code: code string (an expression) or symbolic expression :param symbols: already defined symbols (if any) in a dictionary "symbol name" -> dytpes.typeclass: :return: inferred type """ symbols = symbols or {} inferred_symbols = {} if isinstance(code, (str, float, int, complex)): parsed_ast = ast.parse(str(code)) elif isinstance(code, sympy.Basic) or isinstance(code, SymExpr): parsed_ast = ast.parse(symstr(code)) # The parsed AST must only contain one expression if hasattr(parsed_ast, "body") and isinstance(parsed_ast.body[0], ast.Expr): return _dispatch(parsed_ast.body[0], symbols, inferred_symbols) else: raise TypeError("Expected expression, got: {}".format(type(code)))
def calc_set_image_range(map_idx, map_set, array_range, strided): image = [] n = len(array_range) - len(strided) if n > 0: strided.append([strided[-1]] * n) for a_range, stride in zip(array_range, strided): new_range = list(a_range) for m_idx, m_range in zip(map_idx, map_set): symbol = symbolic.pystr_to_symbolic(m_idx) new_range[0] = new_range[0].subs( symbol, dace.symbolic.overapproximate(m_range[0])) new_range[1] = new_range[1].subs( symbol, dace.symbolic.overapproximate(m_range[1])) if stride: new_range[2] = symbolic.pystr_to_symbolic('%s / %s' % (str( new_range[2]), symbolic.symstr(m_range[1]))) else: new_range[2] = new_range[2].subs( symbol, dace.symbolic.overapproximate(m_range[2])) image.append(new_range) return subsets.Range(image)
def py2cpp(code, expr_semicolon=True, defined_symbols=None): if isinstance(code, str): try: return cppunparse(ast.parse(code), expr_semicolon, defined_symbols=defined_symbols) except SyntaxError: return code elif isinstance(code, ast.AST): return cppunparse(code, expr_semicolon, defined_symbols=defined_symbols) elif isinstance(code, list): return '\n'.join(py2cpp(stmt) for stmt in code) elif isinstance(code, sympy.Basic): from dace import symbolic return cppunparse(ast.parse(symbolic.symstr(code)), expr_semicolon, defined_symbols=defined_symbols) elif code.__class__.__name__ == 'function': try: code_str = inspect.getsource(code) # Remove leading indentation lines = code_str.splitlines() leading_spaces = len(lines[0]) - len(lines[0].lstrip()) code_str = '' for line in lines: code_str += line[leading_spaces:] + '\n' except: # Can be different exceptions coming from Python's AST module raise NotImplementedError('Invalid function given') return cppunparse(ast.parse(code_str), expr_semicolon, defined_symbols=defined_symbols) else: raise NotImplementedError('Unsupported type for py2cpp')
def infer_types(code, symbols=None): """ Perform type inference on the given code :param code: a string, AST, or symbolic expression :param symbols: optional, already known symbols with their types. This is a dictionary "symbol name" -> dytpes.typeclass: :return: a dictionary "symbol name" -> dtypes.typeclass of inferred symbols """ symbols = symbols or {} inferred_symbols = {} if isinstance(code, str): _dispatch(ast.parse(code), symbols, inferred_symbols) elif isinstance(code, ast.AST): _dispatch(code, symbols, inferred_symbols) elif isinstance(code, sympy.Basic) or isinstance(code, SymExpr): _dispatch(ast.parse(symstr(code)), symbols, inferred_symbols) elif isinstance(code, list): # call infer for any code elements, maintaining a list of inferred_symbols so far # defined symbols get updated with newly inferred symbols defined_symbols = symbols.copy() for c in code: defined_symbols.update(inferred_symbols) inf_symbols = infer_types(c, defined_symbols) inferred_symbols.update(inf_symbols) return inferred_symbols
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0] output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(input_edge.data.subset) output_dims = len(output_edge.data.subset) input_data = sdfg.arrays[input_edge.data.data] output_data = sdfg.arrays[output_edge.data.data] # Setup all locations in which code will be written cuda_globalcode = CodeIOStream() cuda_initcode = CodeIOStream() cuda_exitcode = CodeIOStream() host_globalcode = CodeIOStream() host_localcode = CodeIOStream() output_memlet = output_edge.data # Try to autodetect reduction type redtype = detect_reduction_type(node.wcr) node_id = state.node_id(node) state_id = sdfg.node_id(state) idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name, state=state_id, node=node_id) if node.out_connectors: dtype = next(node.out_connectors.values()) else: dtype = sdfg.arrays[output_memlet.data].dtype output_type = dtype.ctype if node.identity is None: raise ValueError('For device reduce nodes, initial value must be ' 'specified') # Create a functor or use an existing one for reduction if redtype == dtypes.ReductionType.Custom: body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr) cuda_globalcode.write( """ struct __reduce_{id} {{ template <typename T> DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{ {contents} }} }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id, node_id) reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity) elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES: reduce_op = '' else: credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] reduce_op = ((', dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)) + ', ' + symstr(node.identity)) # Obtain some SDFG-related information input_memlet = input_edge.data reduce_shape = input_memlet.subset.bounding_box_size() num_items = ' * '.join(symstr(s) for s in reduce_shape) input = (input_memlet.data + ' + ' + cpp_array_expr(sdfg, input_memlet, with_brackets=False)) output = (output_memlet.data + ' + ' + cpp_array_expr(sdfg, output_memlet, with_brackets=False)) input_dims = input_memlet.subset.dims() output_dims = output_memlet.subset.data_dims() reduce_all_axes = (node.axes is None or len(node.axes) == input_dims) if reduce_all_axes: reduce_last_axes = False else: reduce_last_axes = sorted(node.axes) == list( range(input_dims - len(node.axes), input_dims)) if (not reduce_all_axes) and (not reduce_last_axes): raise NotImplementedError( 'Multiple axis reductions not supported on GPUs. Please use ' 'the pure expansion or make reduce axes the last in the array.' ) # Verify that data is on the GPU if input_data.storage not in [ dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned ]: raise ValueError('Input of GPU reduction must either reside ' ' in global GPU memory or pinned CPU memory') if output_data.storage not in [ dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned ]: raise ValueError('Output of GPU reduction must either reside ' ' in global GPU memory or pinned CPU memory') # Determine reduction type kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype] if redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES else 'Reduce') # Create temp memory for this GPU cuda_globalcode.write( """ void *__cub_storage_{sdfg}_{state}_{node} = NULL; size_t __cub_ssize_{sdfg}_{state}_{node} = 0; """.format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id, node) if reduce_all_axes: reduce_type = 'DeviceReduce' reduce_range = num_items reduce_range_def = 'size_t num_items' reduce_range_use = 'num_items' reduce_range_call = num_items elif reduce_last_axes: num_reduce_axes = len(node.axes) not_reduce_axes = reduce_shape[:-num_reduce_axes] reduce_axes = reduce_shape[-num_reduce_axes:] num_segments = ' * '.join([symstr(s) for s in not_reduce_axes]) segment_size = ' * '.join([symstr(s) for s in reduce_axes]) reduce_type = 'DeviceSegmentedReduce' iterator = 'dace::stridedIterator({size})'.format( size=segment_size) reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments, it=iterator) reduce_range_def = 'size_t num_segments, size_t segment_size' iterator_use = 'dace::stridedIterator(segment_size)' reduce_range_use = 'num_segments, {it}, {it} + 1'.format( it=iterator_use) reduce_range_call = '%s, %s' % (num_segments, segment_size) # Call CUB to get the storage size, allocate and free it cuda_initcode.write( """ cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node}, ({intype}*)nullptr, ({outtype}*)nullptr, {reduce_range}{redop}); cudaMalloc(&__cub_storage_{sdfg}_{state}_{node}, __cub_ssize_{sdfg}_{state}_{node}); """.format(sdfg=sdfg.name, state=state_id, node=node_id, reduce_type=reduce_type, reduce_range=reduce_range, redop=reduce_op, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype, kname=kname), sdfg, state_id, node) cuda_exitcode.write( 'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format( sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id, node) # Write reduction function definition cuda_globalcode.write(""" DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream); void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream) {{ cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id}, input, output, {reduce_range_use}{redop}, stream); }} """.format(id=idstr, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype, reduce_type=reduce_type, reduce_range_def=reduce_range_def, reduce_range_use=reduce_range_use, kname=kname, redop=reduce_op)) # Write reduction function definition in caller file host_globalcode.write( """ DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream); """.format(id=idstr, reduce_range_def=reduce_range_def, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype), sdfg, state_id, node) # Call reduction function where necessary host_localcode.write( '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);' .format(id=idstr, input=input, output=output, reduce_range_call=reduce_range_call)) # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, host_localcode.getvalue(), language=dace.Language.CPP) # Add the rest of the code sdfg.append_global_code(host_globalcode.getvalue()) sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda') sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda') sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda') # Rename outer connectors and add to node input_edge._dst_conn = '_in' output_edge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode
def _sym2cpp(s, arrayexprs): return cppunparse.pyexpr2cpp(symbolic.symstr(s, arrayexprs))
def size_string(self): return (" * ".join( [cppunparse.pyexpr2cpp(symbolic.symstr(s)) for s in self.shape]))
def _stripmine(self, sdfg, graph, candidate): # Retrieve map entry and exit nodes. map_entry = graph.nodes()[candidate[StripMining._map_entry]] map_exit = graph.exit_nodes(map_entry)[0] # Retrieve transformation properties. dim_idx = self.dim_idx new_dim_prefix = self.new_dim_prefix tile_size = self.tile_size divides_evenly = self.divides_evenly strided = self.strided tile_stride = self.tile_stride if tile_stride is None or len(tile_stride) == 0: tile_stride = tile_size # Retrieve parameter and range of dimension to be strip-mined. target_dim = map_entry.map.params[dim_idx] td_from, td_to, td_step = map_entry.map.range[dim_idx] # Create new map. Replace by cloning??? new_dim = self._find_new_dim(sdfg, graph, map_entry, new_dim_prefix, target_dim) nd_from = 0 nd_to = symbolic.pystr_to_symbolic( 'int_ceil(%s + 1 - %s, %s) - 1' % (symbolic.symstr(td_to), symbolic.symstr(td_from), tile_stride)) nd_step = 1 new_dim_range = (nd_from, nd_to, nd_step) new_map = nodes.Map(new_dim + '_' + map_entry.map.label, [new_dim], subsets.Range([new_dim_range])) new_map_entry = nodes.MapEntry(new_map) new_map_exit = nodes.MapExit(new_map) # Change the range of the selected dimension to iterate over a single # tile if strided: td_from_new = symbolic.pystr_to_symbolic(new_dim) td_to_new_approx = td_to td_step = symbolic.pystr_to_symbolic(tile_size) else: td_from_new = symbolic.pystr_to_symbolic( '%s + %s * %s' % (symbolic.symstr(td_from), str(new_dim), tile_stride)) td_to_new_exact = symbolic.pystr_to_symbolic( 'min(%s + 1, %s + %s * %s + %s) - 1' % (symbolic.symstr(td_to), symbolic.symstr(td_from), tile_stride, str(new_dim), tile_size)) td_to_new_approx = symbolic.pystr_to_symbolic( '%s + %s * %s + %s - 1' % (symbolic.symstr(td_from), tile_stride, str(new_dim), tile_size)) if divides_evenly or strided: td_to_new = td_to_new_approx else: td_to_new = dace.symbolic.SymExpr(td_to_new_exact, td_to_new_approx) map_entry.map.range[dim_idx] = (td_from_new, td_to_new, td_step) # Make internal map's schedule to "not parallel" new_map.schedule = map_entry.map.schedule map_entry.map.schedule = dtypes.ScheduleType.Sequential # Redirect edges new_map_entry.in_connectors = dcpy(map_entry.in_connectors) nxutil.change_edge_dest(graph, map_entry, new_map_entry) new_map_exit.out_connectors = dcpy(map_exit.out_connectors) nxutil.change_edge_src(graph, map_exit, new_map_exit) # Create new entry edges new_in_edges = dict() entry_in_conn = set() entry_out_conn = set() for _src, src_conn, _dst, _, memlet in graph.out_edges(map_entry): if (src_conn is not None and src_conn[:4] == 'OUT_' and not isinstance( sdfg.arrays[memlet.data], dace.data.Scalar)): new_subset = calc_set_image( map_entry.map.params, map_entry.map.range, memlet.subset, ) conn = src_conn[4:] key = (memlet.data, 'IN_' + conn, 'OUT_' + conn) if key in new_in_edges.keys(): old_subset = new_in_edges[key].subset new_in_edges[key].subset = calc_set_union( old_subset, new_subset) else: entry_in_conn.add('IN_' + conn) entry_out_conn.add('OUT_' + conn) new_memlet = dcpy(memlet) new_memlet.subset = new_subset new_memlet.num_accesses = new_memlet.num_elements() new_in_edges[key] = new_memlet else: if src_conn is not None and src_conn[:4] == 'OUT_': conn = src_conn[4:] in_conn = 'IN_' + conn out_conn = 'OUT_' + conn else: in_conn = src_conn out_conn = src_conn if in_conn: entry_in_conn.add(in_conn) if out_conn: entry_out_conn.add(out_conn) new_in_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet) new_map_entry.out_connectors = entry_out_conn map_entry.in_connectors = entry_in_conn for (_, in_conn, out_conn), memlet in new_in_edges.items(): graph.add_edge(new_map_entry, out_conn, map_entry, in_conn, memlet) # Create new exit edges new_out_edges = dict() exit_in_conn = set() exit_out_conn = set() for _src, _, _dst, dst_conn, memlet in graph.in_edges(map_exit): if (dst_conn is not None and dst_conn[:3] == 'IN_' and not isinstance( sdfg.arrays[memlet.data], dace.data.Scalar)): new_subset = calc_set_image( map_entry.map.params, map_entry.map.range, memlet.subset, ) conn = dst_conn[3:] key = (memlet.data, 'IN_' + conn, 'OUT_' + conn) if key in new_out_edges.keys(): old_subset = new_out_edges[key].subset new_out_edges[key].subset = calc_set_union( old_subset, new_subset) else: exit_in_conn.add('IN_' + conn) exit_out_conn.add('OUT_' + conn) new_memlet = dcpy(memlet) new_memlet.subset = new_subset new_memlet.num_accesses = new_memlet.num_elements() new_out_edges[key] = new_memlet else: if dst_conn is not None and dst_conn[:3] == 'IN_': conn = dst_conn[3:] in_conn = 'IN_' + conn out_conn = 'OUT_' + conn else: in_conn = src_conn out_conn = src_conn if in_conn: exit_in_conn.add(in_conn) if out_conn: exit_out_conn.add(out_conn) new_in_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet) new_map_exit.in_connectors = exit_in_conn map_exit.out_connectors = exit_out_conn for (_, in_conn, out_conn), memlet in new_out_edges.items(): graph.add_edge(map_exit, out_conn, new_map_exit, in_conn, memlet) # Return strip-mined dimension. return target_dim, new_dim, new_map
def __stripmine(self, sdfg, graph, candidate): # Retrieve map entry and exit nodes. map_entry = graph.nodes()[candidate[OrthogonalTiling._map_entry]] map_exit = graph.exit_nodes(map_entry)[0] # Map subgraph map_subgraph = graph.scope_subgraph(map_entry) # Retrieve transformation properties. prefix = self.prefix tile_sizes = self.tile_sizes divides_evenly = self.divides_evenly new_param = [] new_range = [] for dim_idx in range(len(map_entry.map.params)): if dim_idx >= len(tile_sizes): tile_size = tile_sizes[-1] else: tile_size = tile_sizes[dim_idx] # Retrieve parameter and range of dimension to be strip-mined. target_dim = map_entry.map.params[dim_idx] td_from, td_to, td_step = map_entry.map.range[dim_idx] new_dim = prefix + '_' + target_dim # Basic values if divides_evenly: tile_num = '(%s + 1 - %s) / %s' % (symbolic.symstr(td_to), symbolic.symstr(td_from), str(tile_size)) else: tile_num = 'int_ceil((%s + 1 - %s), %s)' % (symbolic.symstr( td_to), symbolic.symstr(td_from), str(tile_size)) # Outer map values (over all tiles) nd_from = 0 nd_to = symbolic.pystr_to_symbolic(str(tile_num) + ' - 1') nd_step = 1 # Inner map values (over one tile) td_from_new = dace.symbolic.pystr_to_symbolic(td_from) td_to_new_exact = symbolic.pystr_to_symbolic( 'min(%s + 1 - %s * %s, %s + %s) - 1' % (symbolic.symstr(td_to), str(new_dim), str(tile_size), td_from_new, str(tile_size))) td_to_new_approx = symbolic.pystr_to_symbolic( '%s + %s - 1' % (td_from_new, str(tile_size))) # Outer map (over all tiles) new_dim_range = (nd_from, nd_to, nd_step) new_param.append(new_dim) new_range.append(new_dim_range) # Inner map (over one tile) if divides_evenly: td_to_new = td_to_new_approx else: td_to_new = dace.symbolic.SymExpr(td_to_new_exact, td_to_new_approx) map_entry.map.range[dim_idx] = (td_from_new, td_to_new, td_step) # Fix subgraph memlets target_dim = dace.symbolic.pystr_to_symbolic(target_dim) offset = dace.symbolic.pystr_to_symbolic( '%s * %s' % (new_dim, str(tile_size))) for _, _, _, _, memlet in map_subgraph.edges(): old_subset = memlet.subset if isinstance(old_subset, dace.subsets.Indices): new_indices = [] for idx in old_subset: new_idx = idx.subs(target_dim, target_dim + offset) new_indices.append(new_idx) memlet.subset = dace.subsets.Indices(new_indices) elif isinstance(old_subset, dace.subsets.Range): new_ranges = [] for i, old_range in enumerate(old_subset): if len(old_range) == 3: b, e, s, = old_range t = old_subset.tile_sizes[i] else: raise ValueError( 'Range %s is invalid.' % old_range) new_b = b.subs(target_dim, target_dim + offset) new_e = e.subs(target_dim, target_dim + offset) new_s = s.subs(target_dim, target_dim + offset) new_t = t.subs(target_dim, target_dim + offset) new_ranges.append((new_b, new_e, new_s, new_t)) memlet.subset = dace.subsets.Range(new_ranges) else: raise NotImplementedError new_map = nodes.Map(prefix + '_' + map_entry.map.label, new_param, subsets.Range(new_range)) new_map_entry = nodes.MapEntry(new_map) new_exit = nodes.MapExit(new_map) # Make internal map's schedule to "not parallel" map_entry.map._schedule = dtypes.ScheduleType.Default # Redirect/create edges. new_in_edges = {} for _src, conn, _dest, _, memlet in graph.out_edges(map_entry): if not isinstance(sdfg.arrays[memlet.data], dace.data.Scalar): new_subset = copy.deepcopy(memlet.subset) # new_subset = calc_set_image(map_entry.map.params, # map_entry.map.range, memlet.subset, # cont_or_strided) if memlet.data in new_in_edges: src, src_conn, dest, dest_conn, new_memlet, num = \ new_in_edges[memlet.data] new_memlet.subset = calc_set_union( new_memlet.data, sdfg.arrays[nnew_memlet.data], new_memlet.subset, new_subset) new_memlet.num_accesses = new_memlet.num_elements() new_in_edges.update({ memlet.data: (src, src_conn, dest, dest_conn, new_memlet, min(num, int(conn[4:]))) }) else: new_memlet = dcpy(memlet) new_memlet.subset = new_subset new_memlet.num_accesses = new_memlet.num_elements() new_in_edges.update({ memlet.data: (new_map_entry, None, map_entry, None, new_memlet, int(conn[4:])) }) nxutil.change_edge_dest(graph, map_entry, new_map_entry) new_out_edges = {} for _src, conn, _dest, _, memlet in graph.in_edges(map_exit): if not isinstance(sdfg.arrays[memlet.data], dace.data.Scalar): new_subset = memlet.subset # new_subset = calc_set_image(map_entry.map.params, # map_entry.map.range, # memlet.subset, cont_or_strided) if memlet.data in new_out_edges: src, src_conn, dest, dest_conn, new_memlet, num = \ new_out_edges[memlet.data] new_memlet.subset = calc_set_union( new_memlet.data, sdfg.arrays[nnew_memlet.data], new_memlet.subset, new_subset) new_memlet.num_accesses = new_memlet.num_elements() new_out_edges.update({ memlet.data: (src, src_conn, dest, dest_conn, new_memlet, min(num, conn[4:])) }) else: new_memlet = dcpy(memlet) new_memlet.subset = new_subset new_memlet.num_accesses = new_memlet.num_elements() new_out_edges.update({ memlet.data: (map_exit, None, new_exit, None, new_memlet, conn[4:]) }) nxutil.change_edge_src(graph, map_exit, new_exit) # Connector related work follows # 1. Dictionary 'old_connector_number': 'new_connector_numer' # 2. New node in/out connectors # 3. New edges in_conn_nums = [] for _, e in new_in_edges.items(): _, _, _, _, _, num = e in_conn_nums.append(num) in_conn = {} for i, num in enumerate(in_conn_nums): in_conn.update({num: i + 1}) entry_in_connectors = set() entry_out_connectors = set() for i in range(len(in_conn_nums)): entry_in_connectors.add('IN_' + str(i + 1)) entry_out_connectors.add('OUT_' + str(i + 1)) new_map_entry.in_connectors = entry_in_connectors new_map_entry.out_connectors = entry_out_connectors for _, e in new_in_edges.items(): src, _, dst, _, memlet, num = e graph.add_edge(src, 'OUT_' + str(in_conn[num]), dst, 'IN_' + str(in_conn[num]), memlet) out_conn_nums = [] for _, e in new_out_edges.items(): _, _, dst, _, _, num = e if dst is not new_exit: continue out_conn_nums.append(num) out_conn = {} for i, num in enumerate(out_conn_nums): out_conn.update({num: i + 1}) exit_in_connectors = set() exit_out_connectors = set() for i in range(len(out_conn_nums)): exit_in_connectors.add('IN_' + str(i + 1)) exit_out_connectors.add('OUT_' + str(i + 1)) new_exit.in_connectors = exit_in_connectors new_exit.out_connectors = exit_out_connectors for _, e in new_out_edges.items(): src, _, dst, _, memlet, num = e graph.add_edge(src, 'OUT_' + str(out_conn[num]), dst, 'IN_' + str(out_conn[num]), memlet) # Return strip-mined dimension. return target_dim, new_dim, new_map
def replace_param(param): param = symbolic.symstr(param) for p, pval in param_to_edge.items(): # TODO: Correct w.r.t. connector type param = param.replace(p, cpp_array_expr(nsdfg, pval.data)) return param
def _create_ceil_range(self, sdfg: SDFG, graph: SDFGState, map_entry: nodes.MapEntry): map_exit = graph.exit_node(map_entry) # Retrieve transformation properties. dim_idx = self.dim_idx new_dim_prefix = self.new_dim_prefix tile_size = self.tile_size divides_evenly = self.divides_evenly strided = self.strided offset = self.tile_offset tile_stride = self.tile_stride if tile_stride == 0: tile_stride = tile_size # Retrieve parameter and range of dimension to be strip-mined. target_dim = map_entry.map.params[dim_idx] td_from, td_to, td_step = map_entry.map.range[dim_idx] # Create new map. Replace by cloning map object? new_dim = self._find_new_dim(sdfg, graph, map_entry, new_dim_prefix, target_dim) nd_from = 0 if tile_stride == 1: nd_to = td_to - td_from else: nd_to = symbolic.pystr_to_symbolic( 'int_ceil(%s + 1 - %s, %s) - 1' % (symbolic.symstr(td_to), symbolic.symstr(td_from), symbolic.symstr(tile_stride))) nd_step = 1 new_dim_range = (nd_from, nd_to, nd_step) new_map = nodes.Map(new_dim + '_' + map_entry.map.label, [new_dim], subsets.Range([new_dim_range])) # Change the range of the selected dimension to iterate over a single # tile if strided: td_from_new = symbolic.pystr_to_symbolic(new_dim) td_to_new_approx = td_to td_step = tile_size elif offset == 0: td_from_new = symbolic.pystr_to_symbolic( '%s + %s * %s' % (symbolic.symstr(td_from), symbolic.symstr(new_dim), symbolic.symstr(tile_stride))) td_to_new_exact = symbolic.pystr_to_symbolic( 'min(%s + 1, %s + %s * %s + %s) - 1' % (symbolic.symstr(td_to), symbolic.symstr(td_from), symbolic.symstr(tile_stride), symbolic.symstr(new_dim), symbolic.symstr(tile_size))) td_to_new_approx = symbolic.pystr_to_symbolic( '%s + %s * %s + %s - 1' % (symbolic.symstr(td_from), symbolic.symstr(tile_stride), symbolic.symstr(new_dim), symbolic.symstr(tile_size))) else: # include offset td_from_new_exact = symbolic.pystr_to_symbolic( 'max(%s,%s + %s * %s - %s)' % (symbolic.symstr(td_from), symbolic.symstr(td_from), symbolic.symstrtr(tile_stride), symbolic.symstr(new_dim), symbolic.symstr(offset))) td_from_new_approx = symbolic.pystr_to_symbolic( '%s + %s * %s - %s ' % (symbolic.symstr(td_from), symbolic.symstr(tile_stride), symbolic.symstr(new_dim), symbolic.symstr(offset))) td_from_new = dace.symbolic.SymExpr(td_from_new_exact, td_from_new_approx) td_to_new_exact = symbolic.pystr_to_symbolic( 'min(%s + 1, %s + %s * %s + %s - %s) -1' % (symbolic.symstr(td_to), symbolic.symstr(td_from), symbolic.symstr(tile_stride), symbolic.symstr(new_dim), symbolic.symstr(tile_size), symbolic.symstr(offset))) td_to_new_approx = symbolic.pystr_to_symbolic( '%s + %s * %s + %s - %s - 1' % (symbolic.symstr(td_from), symbolic.symstr(tile_stride), symbolic.symstr(new_dim), symbolic.symstr(tile_size), symbolic.symstr(offset))) if divides_evenly or strided: td_to_new = td_to_new_approx else: td_to_new = dace.symbolic.SymExpr(td_to_new_exact, td_to_new_approx) return new_dim, new_map, (td_from_new, td_to_new, td_step)
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0] output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(input_edge.data.subset) input_data = sdfg.arrays[input_edge.data.data] output_data = sdfg.arrays[output_edge.data.data] # Setup all locations in which code will be written cuda_globalcode = CodeIOStream() localcode = CodeIOStream() # Try to autodetect reduction type redtype = detect_reduction_type(node.wcr) node_id = state.node_id(node) state_id = sdfg.node_id(state) idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name, state=state_id, node=node_id) # Obtain some SDFG-related information input_memlet = input_edge.data output_memlet = output_edge.data if node.out_connectors: dtype = next(node.out_connectors.values()) else: dtype = sdfg.arrays[output_memlet.data].dtype output_type = dtype.ctype if node.identity is None: raise ValueError('For device reduce nodes, initial value must be ' 'specified') # Create a functor or use an existing one for reduction if redtype == dtypes.ReductionType.Custom: body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr) cuda_globalcode.write( """ struct __reduce_{id} {{ template <typename T> DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{ {contents} }} }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id, node_id) reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity) elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES: reduce_op = '' else: credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] reduce_op = ((', dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)) + ', ' + symstr(node.identity)) # Try to obtain the number of threads in the block, or use the default # configuration block_threads = devicelevel_block_size(sdfg, state, node) if block_threads is not None: block_threads = functools.reduce(lambda a, b: a * b, block_threads, 1) # Checks if block_threads is None: raise ValueError('Block-wide GPU reduction must occur within' ' a GPU kernel') if issymbolic(block_threads, sdfg.constants): raise ValueError('Block size has to be constant for block-wide ' 'reduction (got %s)' % str(block_threads)) if (node.axes is not None and len(node.axes) < input_dims): raise ValueError( 'Only full reduction is supported for block-wide reduce,' ' please use the pure expansion') if (input_data.storage != dtypes.StorageType.Register or output_data.storage != dtypes.StorageType.Register): raise ValueError( 'Block-wise reduction only supports GPU register inputs ' 'and outputs') if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES: raise ValueError('%s block reduction not supported' % redtype) credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] if redtype == dtypes.ReductionType.Custom: redop = '__reduce_%s()' % idstr else: redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type) # Allocate shared memory for block reduce localcode.write(""" typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id}; __shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id}; """.format(id=idstr, type=output_data.dtype.ctype, numthreads=block_threads)) input = (input_memlet.data + ' + ' + cpp_array_expr(sdfg, input_memlet, with_brackets=False)) output = cpp_array_expr(sdfg, output_memlet) localcode.write(""" {output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop}); """.format(id=idstr, redop=redop, input=input_memlet.data, output=output)) # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, localcode.getvalue(), language=dace.Language.CPP) # Add the rest of the code sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda') # Rename outer connectors and add to node input_edge._dst_conn = '_in' output_edge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode
def apply(self, sdfg: SDFG): graph = sdfg.node(self.state_id) map_exit = graph.node(self.subgraph[AccumulateTransient.map_exit]) outer_map_exit = graph.node( self.subgraph[AccumulateTransient.outer_map_exit]) # Avoid import loop from dace.transformation.dataflow.local_storage import OutLocalStorage array_identity_dict = self.array_identity_dict # Choose array array = self.array if array is not None and len(array) != 0: array_identity_dict[array] = self.identity elif ((array is None or len(array) == 0) and len(array_identity_dict) == 0): array = next(e.data.data for e in graph.edges_between(map_exit, outer_map_exit) if e.data.wcr is not None) array_identity_dict[array] = self.identity transients: Dict[str, Any] = {} for array, identity in array_identity_dict.items(): data_node: nodes.AccessNode = OutLocalStorage.apply_to( sdfg, dict(array=array, prefix=self.prefix), verify=False, save=False, node_a=map_exit, node_b=outer_map_exit) transients[data_node.data] = identity if identity is None: warnings.warn( 'AccumulateTransient did not properly initialize ' 'newly-created transient!') return sdfg_state: SDFGState = sdfg.node(self.state_id) map_entry = sdfg_state.entry_node(map_exit) nested_sdfg: nodes.NestedSDFG = nest_state_subgraph( sdfg=sdfg, state=sdfg_state, subgraph=SubgraphView( sdfg_state, {map_entry, map_exit} | sdfg_state.all_nodes_between(map_entry, map_exit))) nested_sdfg_state: SDFGState = nested_sdfg.sdfg.nodes()[0] init_state = nested_sdfg.sdfg.add_state_before(nested_sdfg_state) for data_name, identity in transients.items(): temp_array: Array = sdfg.arrays[data_name] init_state.add_mapped_tasklet( name='acctrans_init', map_ranges={ '_o%d' % i: '0:%s' % symbolic.symstr(d) for i, d in enumerate(temp_array.shape) }, inputs={}, code='out = %s' % identity, outputs={ 'out': dace.Memlet.simple( data=data_name, subset_str=','.join([ '_o%d' % i for i, _ in enumerate(temp_array.shape) ])) }, external_edges=True) # TODO: use trivial map elimintation here when it will be merged to remove map if it has trivial ranges return nested_sdfg
def expansion(node, parent_state, parent_sdfg, **kwargs): node.validate(parent_sdfg, parent_state) sdfg = dace.SDFG(node.label + "_sdfg") ((edge_a, outer_array_a, shape_a, strides_a), (edge_x, outer_array_x, shape_x, strides_x), (edge_y, outer_array_y, shape_y, strides_y)) = _get_matmul_operands(node, parent_state, parent_sdfg, name_lhs="_A", name_rhs="_x", name_out="_y") dtype_a = outer_array_a.dtype.type dtype_x = outer_array_x.dtype.type dtype_y = outer_array_y.dtype.type if outer_array_a.dtype.veclen > 1 or outer_array_x.dtype.veclen > 1: raise NotImplementedError("Vectorization for pure GEMV NYI.") if node.transA: trans_shape_a = list(reversed(shape_a)) else: trans_shape_a = shape_a if trans_shape_a[1] != shape_x[0]: raise SyntaxError( "Matrix-vector product size mismatch: {} vs. {}".format( trans_shape_a[1], shape_x[0])) N, M = trans_shape_a[0], trans_shape_a[1] if outer_array_a.storage != outer_array_x.storage: raise ValueError("Input matrices must have same storage") storage = outer_array_a.storage _, array_a = sdfg.add_array("_A", shape_a, dtype_a, strides=strides_a, storage=storage) _, array_x = sdfg.add_array("_x", shape_x, dtype_x, strides=strides_x, storage=storage) _, array_y = sdfg.add_array("_y", shape_y, dtype_y, strides=strides_y, storage=storage) mul_program = "__out = {} * __A * __x".format(node.alpha) init_state = sdfg.add_state(node.label + "_initstate") state = sdfg.add_state_after(init_state, node.label + "_state") if node.beta == 0: mul_out, mul_out_array = "_y", array_y output_nodes = None else: mul_out, mul_out_array = tmp, array_tmp = sdfg.add_temp_transient( shape_y, dtype_y, storage=storage) access_tmp = state.add_read(tmp) output_nodes = {mul_out: access_tmp} # Initialization map init_state.add_mapped_tasklet( "gemv_init", { "_o%d" % i: "0:%s" % symbolic.symstr(d) for i, d in enumerate(shape_y) }, {}, "out = 0", { "out": dace.Memlet("{}[{}]".format( mul_out, ",".join( ["_o%d" % i for i in range(len(shape_y))]))) }, external_edges=True) # Multiplication map state.add_mapped_tasklet( "_GEMV_", {"__i%d" % i: "0:%s" % s for i, s in enumerate([N, M])}, { "__A": dace.Memlet("_A[{}]".format( "__i1, __i0" if node.transA else "__i0, __i1")), "__x": dace.Memlet("_x[__i1]") }, mul_program, { "__out": dace.Memlet(f"{mul_out}[__i0]", wcr="lambda x, y: x + y") }, external_edges=True, output_nodes=output_nodes) add_program = "__y_out = ({} * __y_in) + __tmp".format(node.beta) memlet_idx = "__i" # addition map if node.beta != 0: state.add_mapped_tasklet("_Add_", {"__i": "0:{}".format(N)}, { "__y_in": dace.Memlet(f"_y[{memlet_idx}]"), "__tmp": dace.Memlet(f"{mul_out}[__i]"), }, add_program, {"__y_out": dace.Memlet("_y[__i]")}, external_edges=True, input_nodes={mul_out: access_tmp}) return sdfg
def add_indirection_subgraph(sdfg, graph, src, dst, memlet): """ Replaces the specified edge in the specified graph with a subgraph that implements indirection without nested AST memlet objects. """ if not isinstance(memlet, astnodes._Memlet): raise TypeError("Expected memlet to be astnodes._Memlet") indirect_inputs = set() indirect_outputs = set() # Scheme for multi-array indirection: # 1. look for all arrays and accesses, create set of arrays+indices # from which the index memlets will be constructed from # 2. each separate array creates a memlet, of which num_accesses = len(set) # 3. one indirection tasklet receives them all + original array and # produces the right output index/range memlet ######################### # Step 1 accesses = OrderedDict() newsubset = dcpy(memlet.subset) for dimidx, dim in enumerate(memlet.subset): # Range/Index disambiguation direct_assignment = False if not isinstance(dim, tuple): dim = [dim] direct_assignment = True for i, r in enumerate(dim): for expr in sympy.preorder_traversal(r): if symbolic.is_sympy_userfunction(expr): fname = expr.func.__name__ if fname not in accesses: accesses[fname] = [] # Replace function with symbol (memlet local name to-be) if expr.args in accesses[fname]: aindex = accesses[fname].index(expr.args) toreplace = 'index_' + fname + '_' + str(aindex) else: accesses[fname].append(expr.args) toreplace = 'index_' + fname + '_' + str( len(accesses[fname]) - 1) if direct_assignment: newsubset[dimidx] = r.subs(expr, toreplace) else: newsubset[dimidx][i] = r.subs(expr, toreplace) ######################### # Step 2 ind_inputs = {'__ind_' + memlet.local_name} ind_outputs = {'lookup'} # Add accesses to inputs for arrname, arr_accesses in accesses.items(): for i in range(len(arr_accesses)): ind_inputs.add('index_%s_%d' % (arrname, i)) tasklet = nd.Tasklet("Indirection", ind_inputs, ind_outputs) input_index_memlets = [] for arrname, arr_accesses in accesses.items(): arr = memlet.otherdeps[arrname] for i, access in enumerate(arr_accesses): # Memlet to load the indirection index indexMemlet = Memlet(arrname, 1, sbs.Indices(list(access)), 1) input_index_memlets.append(indexMemlet) graph.add_edge(src, None, tasklet, "index_%s_%d" % (arrname, i), indexMemlet) ######################### # Step 3 # Create new tasklet that will perform the indirection indirection_ast = ast.parse("lookup = {arr}[{index}]".format( arr='__ind_' + memlet.local_name, index=', '.join([symbolic.symstr(s) for s in newsubset]))) # Conserve line number of original indirection code tasklet.code = ast.copy_location(indirection_ast.body[0], memlet.ast) # Create transient variable to trigger the indirected load if memlet.num_accesses == 1: storage = sdfg.add_scalar('__' + memlet.local_name + '_value', memlet.data.dtype, transient=True) else: storage = sdfg.add_array('__' + memlet.local_name + '_value', memlet.data.dtype, storage=types.StorageType.Default, transient=True, shape=memlet.bounding_box_size()) indirectRange = sbs.Range([(0, s - 1, 1) for s in storage.shape]) dataNode = nd.AccessNode('__' + memlet.local_name + '_value') # Create memlet that depends on the full array that we look up in fullRange = sbs.Range([(0, s - 1, 1) for s in memlet.data.shape]) fullMemlet = Memlet(memlet.dataname, memlet.num_accesses, fullRange, memlet.veclen) graph.add_edge(src, None, tasklet, '__ind_' + memlet.local_name, fullMemlet) # Memlet to store the final value into the transient, and to load it into # the tasklet that needs it indirectMemlet = Memlet('__' + memlet.local_name + '_value', memlet.num_accesses, indirectRange, memlet.veclen) graph.add_edge(tasklet, 'lookup', dataNode, None, indirectMemlet) valueMemlet = Memlet('__' + memlet.local_name + '_value', memlet.num_accesses, indirectRange, memlet.veclen) graph.add_edge(dataNode, None, dst, memlet.local_name, valueMemlet)
def make_sdfg(node, parent_state, parent_sdfg): # Get metadata from parent SDFG ((edge_a, outer_array_a, shape_a, strides_a), (edge_b, outer_array_b, shape_b, strides_b), cdata) = _get_matmul_operands(node, parent_state, parent_sdfg) outedge = parent_state.out_edges(node)[0] cdesc = parent_sdfg.arrays[outedge.data.data] bopt = _get_batchmm_opts(shape_a, strides_a, shape_b, strides_b, cdesc.shape, cdesc.strides) if shape_a[-1] != shape_b[-2]: raise SyntaxError('Matrix sizes must match') if bopt: shape_c = (bopt['b'], shape_a[-2], shape_b[-1]) else: shape_c = (shape_a[-2], shape_b[-1]) dtype_a = outer_array_a.dtype.type dtype_b = outer_array_b.dtype.type dtype_c = cdesc.dtype.type if outer_array_a.storage != outer_array_b.storage: raise ValueError("Input matrices must have same storage") storage = outer_array_a.storage # Create replacement SDFG sdfg = dace.SDFG(node.label + "_sdfg") _, array_a = sdfg.add_array("_a", shape_a, dtype_a, strides=strides_a, storage=storage) _, array_b = sdfg.add_array("_b", shape_b, dtype_b, strides=strides_b, storage=storage) _, array_c = sdfg.add_array("_c", shape_c, dtype_c, strides=cdata[-1], storage=storage) # Add an initialization state init_state = sdfg.add_state() init_state.add_mapped_tasklet( 'batched_matmul_init', {'_o%d' % i: '0:%s' % symstr(d) for i, d in enumerate(shape_c)}, {}, 'out = 0', { 'out': dace.Memlet.simple( '_c', ','.join(['_o%d' % i for i in range(len(shape_c))])) }, external_edges=True) state = sdfg.add_state_after(init_state, node.label + "_state") state.add_mapped_tasklet( '_BatchedBatchedMatMult_', { '__i%d' % i: '0:%s' % s for i, s in enumerate([ bopt['b'], array_a.shape[-2], array_b.shape[-1], array_a.shape[-1] ]) }, { '__a': dace.Memlet.simple("_a", ('__i1, __i3' if len(array_a.shape) == 2 else '__i0, __i1, __i3')), '__b': dace.Memlet.simple("_b", ('__i3, __i2' if len(array_b.shape) == 2 else '__i0, __i3, __i2')) }, '__c = __a * __b', { '__c': dace.Memlet.simple( "_c", '__i0, __i1, __i2', wcr_str='lambda x, y: x + y') }, external_edges=True) return sdfg
def _expand_reduce(self, sdfg, state, node): # expands a reduce into two nested maps # taken from legacy expand_reduce.py node.validate(sdfg, state) inedge: graph.MultiConnectorEdge = state.in_edges(node)[0] outedge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(inedge.data.subset) output_dims = len(outedge.data.subset) input_data = sdfg.arrays[inedge.data.data] output_data = sdfg.arrays[outedge.data.data] # Standardize axes axes = node.axes if node.axes else [i for i in range(input_dims)] # Create nested SDFG nsdfg = SDFG('reduce') nsdfg.add_array('_in', inedge.data.subset.size(), input_data.dtype, strides=input_data.strides, storage=input_data.storage) nsdfg.add_array('_out', outedge.data.subset.size(), output_data.dtype, strides=output_data.strides, storage=output_data.storage) if node.identity is not None: raise ValueError("Node identity has to be None at this point.") else: nstate = nsdfg.add_state() # END OF INIT # (If axes != all) Add outer map, which corresponds to the output range if len(axes) != input_dims: # Interleave input and output axes to match input memlet ictr, octr = 0, 0 input_subset = [] for i in range(input_dims): if i in axes: input_subset.append('_i%d' % ictr) ictr += 1 else: input_subset.append('_o%d' % octr) octr += 1 output_size = outedge.data.subset.size() ome, omx = nstate.add_map( 'reduce_output', { '_o%d' % i: '0:%s' % symstr(sz) for i, sz in enumerate(outedge.data.subset.size()) }) outm = Memlet.simple('_out', ','.join( ['_o%d' % i for i in range(output_dims)]), wcr_str=node.wcr) inmm = Memlet.simple('_in', ','.join(input_subset)) else: ome, omx = None, None outm = Memlet.simple('_out', '0', wcr_str=node.wcr) inmm = Memlet.simple( '_in', ','.join(['_i%d' % i for i in range(len(axes))])) # Add inner map, which corresponds to the range to reduce, containing # an identity tasklet ime, imx = nstate.add_map( 'reduce_values', { '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis]) for i, axis in enumerate(sorted(axes)) }) # Add identity tasklet for reduction t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp') # Connect everything r = nstate.add_read('_in') w = nstate.add_read('_out') if ome: nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm) nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm) else: nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm) nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm) # Rename outer connectors and add to node inedge._dst_conn = '_in' outedge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') nsdfg = state.add_nested_sdfg(nsdfg, sdfg, node.in_connectors, node.out_connectors, schedule=node.schedule, name=node.name) utils.change_edge_dest(state, node, nsdfg) utils.change_edge_src(state, node, nsdfg) state.remove_node(node) return nsdfg
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) inedge: graph.MultiConnectorEdge = state.in_edges(node)[0] outedge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(inedge.data.subset) output_dims = len(outedge.data.subset) input_data = sdfg.arrays[inedge.data.data] output_data = sdfg.arrays[outedge.data.data] # Standardize axes axes = node.axes if node.axes else [i for i in range(input_dims)] # Create nested SDFG nsdfg = SDFG('reduce') nsdfg.add_array('_in', inedge.data.subset.size(), input_data.dtype, strides=input_data.strides, storage=input_data.storage) nsdfg.add_array('_out', outedge.data.subset.size(), output_data.dtype, strides=output_data.strides, storage=output_data.storage) # If identity is defined, add an initialization state if node.identity is not None: init_state = nsdfg.add_state() nstate = nsdfg.add_state() nsdfg.add_edge(init_state, nstate, dace.InterstateEdge()) # Add initialization as a map init_state.add_mapped_tasklet( 'reduce_init', { '_o%d' % i: '0:%s' % symstr(d) for i, d in enumerate(outedge.data.subset.size()) }, {}, 'out = %s' % node.identity, { 'out': dace.Memlet.simple( '_out', ','.join( ['_o%d' % i for i in range(output_dims)])) }, external_edges=True) else: nstate = nsdfg.add_state() # END OF INIT # (If axes != all) Add outer map, which corresponds to the output range if len(axes) != input_dims: # Interleave input and output axes to match input memlet ictr, octr = 0, 0 input_subset = [] for i in range(input_dims): if i in axes: input_subset.append('_i%d' % ictr) ictr += 1 else: input_subset.append('_o%d' % octr) octr += 1 output_size = outedge.data.subset.size() ome, omx = nstate.add_map( 'reduce_output', { '_o%d' % i: '0:%s' % symstr(sz) for i, sz in enumerate(outedge.data.subset.size()) }) outm = dace.Memlet.simple( '_out', ','.join(['_o%d' % i for i in range(output_dims)]), wcr_str=node.wcr) inmm = dace.Memlet.simple('_in', ','.join(input_subset)) else: ome, omx = None, None outm = dace.Memlet.simple('_out', '0', wcr_str=node.wcr) inmm = dace.Memlet.simple( '_in', ','.join(['_i%d' % i for i in range(len(axes))])) # Add inner map, which corresponds to the range to reduce, containing # an identity tasklet ime, imx = nstate.add_map( 'reduce_values', { '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis]) for i, axis in enumerate(sorted(axes)) }) # Add identity tasklet for reduction t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp') # Connect everything r = nstate.add_read('_in') w = nstate.add_read('_out') if ome: nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm) nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm) else: nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm) nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm) # Rename outer connectors and add to node inedge._dst_conn = '_in' outedge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return nsdfg
def make_sdfg(node, parent_state, parent_sdfg): sdfg = dace.SDFG(node.label + "_sdfg") ((edge_a, outer_array_a, shape_a, strides_a), (edge_b, outer_array_b, shape_b, strides_b), cdata) = _get_matmul_operands(node, parent_state, parent_sdfg) dtype_a = outer_array_a.dtype.type dtype_b = outer_array_b.dtype.type dtype_c = dace.DTYPE_TO_TYPECLASS[np.result_type(dtype_a, dtype_b).type] if node.transA: trans_shape_a = list(reversed(shape_a)) else: trans_shape_a = shape_a if node.transB: trans_shape_b = list(reversed(shape_b)) else: trans_shape_b = shape_b if (len(trans_shape_a) != 2 or len(trans_shape_b) != 2 or trans_shape_a[1] != trans_shape_b[0]): raise SyntaxError("Matrix sizes must match") M, K, N = trans_shape_a[0], trans_shape_a[1], trans_shape_b[1] shape_c = (M, N) storage = outer_array_a.storage _, array_a = sdfg.add_array("_a", shape_a, dtype_a, strides=strides_a, storage=outer_array_a.storage) _, array_b = sdfg.add_array("_b", shape_b, dtype_b, strides=strides_b, storage=outer_array_b.storage) _, array_c = sdfg.add_array("_c", shape_c, dtype_c, strides=cdata[-1], storage=cdata[1].storage) if node.alpha == 1.0: mul_program = "__out = __a * __b" else: mul_program = "__out = {} * __a * __b".format( _cast_to_dtype_str(node.alpha, dtype_a)) if node.beta == 1: state = sdfg.add_state(node.label + "_state") else: init_state = sdfg.add_state(node.label + "_initstate") state = sdfg.add_state_after(init_state, node.label + "_state") if node.beta != 0: sdfg.add_array("_cin", shape_c, dtype_c, strides=cdata[-1], storage=cdata[1].storage) mul_out, mul_out_array = "_c", array_c output_nodes = None # Initialization / beta map if node.beta == 0: init_state.add_mapped_tasklet( 'gemm_init', { '_o%d' % i: '0:%s' % symstr(d) for i, d in enumerate(shape_c) }, {}, 'out = 0', { 'out': dace.Memlet.simple( mul_out, ','.join( ['_o%d' % i for i in range(len(shape_c))])) }, external_edges=True) elif node.beta == 1: # Do nothing for initialization, only update the values pass else: # Beta map add_program = "__y = ({} * __c)".format( _cast_to_dtype_str(node.beta, dtype_a)) # manually broadcasting C to [M, N] if list(shape_c) == [M, N]: memlet_idx = '__i0, __i1' elif list(shape_c) == [1, N]: memlet_idx = '0, __i1' elif list(shape_c) == [M, 1]: memlet_idx = '__i0, 0' elif list(shape_c) == [N]: memlet_idx = '__i1' else: raise ValueError( "Could not broadcast input _c to ({}, {})".format(M, N)) init_state.add_mapped_tasklet( "gemm_init", {"__i%d" % i: "0:%s" % s for i, s in enumerate([M, N])}, { "__c": dace.Memlet.simple("_cin", memlet_idx), }, add_program, {"__y": dace.Memlet.simple("_c", "__i0, __i1")}, external_edges=True) # Multiplication map state.add_mapped_tasklet( "gemm", {"__i%d" % i: "0:%s" % s for i, s in enumerate([M, N, K])}, { "__a": dace.Memlet.simple( "_a", "__i2, __i0" if node.transA else "__i0, __i2"), "__b": dace.Memlet.simple( "_b", "__i1, __i2" if node.transB else "__i2, __i1") }, mul_program, { "__out": dace.Memlet.simple( mul_out, "__i0, __i1", wcr_str="lambda x, y: x + y") }, external_edges=True, output_nodes=output_nodes) return sdfg
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) inedge: graph.MultiConnectorEdge = state.in_edges(node)[0] outedge: graph.MultiConnectorEdge = state.out_edges(node)[0] insubset = dcpy(inedge.data.subset) isqdim = insubset.squeeze() outsubset = dcpy(outedge.data.subset) osqdim = outsubset.squeeze() input_dims = len(insubset) output_dims = len(outsubset) input_data = sdfg.arrays[inedge.data.data] output_data = sdfg.arrays[outedge.data.data] if len(osqdim) == 0: # Fix for scalars osqdim = [0] # Standardize and squeeze axes axes = node.axes if node.axes else [ i for i in range(len(inedge.data.subset)) ] axes = [axis for axis in axes if axis in isqdim] assert node.identity is not None # Create nested SDFG nsdfg = SDFG('reduce') nsdfg.add_array('_in', insubset.size(), input_data.dtype, strides=[ s for i, s in enumerate(input_data.strides) if i in isqdim ], storage=input_data.storage) nsdfg.add_array('_out', outsubset.size(), output_data.dtype, strides=[ s for i, s in enumerate(output_data.strides) if i in osqdim ], storage=output_data.storage) nsdfg.add_transient('acc', [1], nsdfg.arrays['_in'].dtype, dtypes.StorageType.Register) nstate = nsdfg.add_state() # Interleave input and output axes to match input memlet ictr, octr = 0, 0 input_subset = [] for i in isqdim: if i in axes: input_subset.append('_i%d' % ictr) ictr += 1 else: input_subset.append('_o%d' % octr) octr += 1 ome, omx = nstate.add_map( 'reduce_output', { '_o%d' % i: '0:%s' % symstr(sz) for i, sz in enumerate(outsubset.size()) }) outm = dace.Memlet.simple( '_out', ','.join(['_o%d' % i for i in range(output_dims)])) #wcr_str=node.wcr) inmm = dace.Memlet.simple('_in', ','.join(input_subset)) idt = nstate.add_tasklet('reset', {}, {'o'}, f'o = {node.identity}') nstate.add_edge(ome, None, idt, None, dace.Memlet()) accread = nstate.add_access('acc') accwrite = nstate.add_access('acc') nstate.add_edge(idt, 'o', accread, None, dace.Memlet('acc')) # Add inner map, which corresponds to the range to reduce, containing # an identity tasklet ime, imx = nstate.add_map('reduce_values', { '_i%d' % i: '0:%s' % symstr(insubset.size()[isqdim.index(axis)]) for i, axis in enumerate(sorted(axes)) }, schedule=dtypes.ScheduleType.Sequential) # Add identity tasklet for reduction t = nstate.add_tasklet('identity', {'a', 'b'}, {'o'}, 'o = b') # Connect everything r = nstate.add_read('_in') w = nstate.add_write('_out') nstate.add_memlet_path(r, ome, ime, t, dst_conn='b', memlet=inmm) nstate.add_memlet_path(accread, ime, t, dst_conn='a', memlet=dace.Memlet('acc[0]')) nstate.add_memlet_path(t, imx, accwrite, src_conn='o', memlet=dace.Memlet('acc[0]', wcr=node.wcr)) nstate.add_memlet_path(accwrite, omx, w, memlet=outm) # Rename outer connectors and add to node inedge._dst_conn = '_in' outedge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') from dace.transformation import dataflow nsdfg.apply_transformations_repeated(dataflow.MapCollapse) return nsdfg