def _IfExp(self, t):
    if util.only_scalars_involed(self.get_defined_symbols(), t.test, t.body, t.orelse):
        return super()._IfExp(t)

    if_type, else_type = self.infer(t.body, t.orelse)
    res_type = dtypes.result_type_of(if_type, else_type)

    if not isinstance(res_type, dtypes.vector):
        res_type = dtypes.vector(res_type, -1)

    self.write('svsel(')
    self.dispatch_expect(t.test, dtypes.vector(dace.bool, -1))
    self.write(', ')
    self.dispatch_expect(t.body, res_type)
    self.write(', ')
    self.dispatch_expect(t.orelse, res_type)
    self.write(')')
def _IfExp(t, symbols, inferred_symbols):
    type_test = _dispatch(t.test, symbols, inferred_symbols)
    type_body = _dispatch(t.body, symbols, inferred_symbols)
    type_orelse = _dispatch(t.orelse, symbols, inferred_symbols)
    res_type = dtypes.result_type_of(type_body, type_orelse)
    if isinstance(type_test, dtypes.vector) and not isinstance(res_type, (dtypes.vector, dtypes.pointer)):
        # If we test on a vector, the result should be a vector as well,
        # so we can do a selection based on the test predicate
        res_type = dtypes.vector(res_type, type_test.veclen)
    return res_type
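# A minimal sketch (not part of the inference module) of the promotion rule in
# _IfExp above, assuming a standard DaCe installation: a vector-typed test
# promotes a scalar result type to a vector with the same lane count.
import dace
from dace import dtypes

test_type = dtypes.vector(dace.bool, 4)                       # type inferred for `t.test`
res_type = dtypes.result_type_of(dace.float32, dace.float32)  # both branches agree on float32
if isinstance(test_type, dtypes.vector) and not isinstance(res_type, (dtypes.vector, dtypes.pointer)):
    res_type = dtypes.vector(res_type, test_type.veclen)
print(res_type)  # a 4-lane float32 vector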
def _BoolOp(t, symbols, inferred_symbols):
    # If any vector occurs in the bool op, the inferred type is also a bool vector
    vec_len = None
    for v in t.values:
        inf_type = _dispatch(v, symbols, inferred_symbols)
        if isinstance(inf_type, dtypes.vector):
            # Make sure all occurring vectors are of the same size
            if vec_len is not None and vec_len != inf_type.veclen:
                raise SyntaxError('Inconsistent vector lengths in BoolOp')
            vec_len = inf_type.veclen
    return dtypes.vector(dace.bool, vec_len) if vec_len is not None else dtypes.bool
def generate_case_predicate(self, t: ast.If, acc_pred: str, id: int) -> str:
    test_pred = f'__pg_test_{self.if_depth}_{id}'

    # Compute the test predicate for the current case
    self.fill(f'svbool_t {test_pred} = ')
    self.pred_name = acc_pred
    self.dispatch_expect(t.test, dtypes.vector(dace.bool, -1))
    self.write(';')

    # Update the accumulator to exclude the test (the next case only occurs if we had failed)
    # BIC(A, B) = A AND NOT B
    self.fill(f'{acc_pred} = svbic_z({acc_pred}, {acc_pred}, {test_pred});')

    return test_pred
def _Compare(t, symbols, inferred_symbols):
    # If any vector occurs in the comparison, the inferred type is a bool vector
    inf_type = _dispatch(t.left, symbols, inferred_symbols)
    vec_len = None
    if isinstance(inf_type, dtypes.vector):
        vec_len = inf_type.veclen
    for o, e in zip(t.ops, t.comparators):
        if o.__class__.__name__ not in cppunparse.CPPUnparser.cmpops:
            continue
        inf_type = _dispatch(e, symbols, inferred_symbols)
        if isinstance(inf_type, dtypes.vector):
            # Make sure all occurring vectors are of the same size
            if vec_len is not None and vec_len != inf_type.veclen:
                raise SyntaxError('Inconsistent vector lengths in Compare')
            vec_len = inf_type.veclen
    return dtypes.vector(dace.bool, vec_len) if vec_len is not None else dtypes.bool
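# A hedged illustration of the rule shared by _BoolOp and _Compare above
# (standard DaCe types only, not the inference machinery itself): once any
# operand is a vector, the result is a bool vector of the same lane count.
import dace
from dace import dtypes

operand_types = [dtypes.vector(dace.float32, 2), dace.float32]
vec_len = None
for t in operand_types:
    if isinstance(t, dtypes.vector):
        vec_len = t.veclen
result = dtypes.vector(dace.bool, vec_len) if vec_len is not None else dtypes.bool
print(result)  # a 2-lane bool vector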
def _as_type(self, dtype: dace.typeclass, inf_type: int) -> dace.typeclass:
    """ Turns a typeclass into a scalar or vector. """
    if isinstance(dtype, dtypes.pointer):
        raise ValueError('Pointer was provided')
    elif isinstance(dtype, dtypes.vector):
        if inf_type == InferenceNode.Vector:
            return dtype
        else:
            raise VectorInferenceException('Cannot make vector into scalar')
    else:
        if inf_type == InferenceNode.Vector:
            return dtypes.vector(dtype, self.vec_len)
        else:
            return dtype
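# A minimal, hedged illustration of the scalar/vector distinction used by
# _as_type above (assuming a standard DaCe installation); InferenceNode and
# self.vec_len are internal to the vector-inference pass and not re-created here.
import dace
from dace import dtypes

scalar = dace.float64
vec = dtypes.vector(scalar, 8)           # what _as_type returns for a Vector-classified scalar
print(vec.veclen, vec.base_type)         # 8 and the float64 typeclass
print(isinstance(vec, dtypes.vector))    # True; the scalar itself is not a dtypes.vector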
def push_to_stream(self, t, target):
    target_stream = self.stream_associations[target.id]
    stream_type = target_stream[1]

    self.enter()
    self.fill('\n// === Stream push ===')

    # Casting in case of `long long`
    stream_type = copy.copy(stream_type)
    if stream_type.type == np.int64:
        stream_type.ctype = 'int64_t'
    elif stream_type.type == np.uint64:
        stream_type.ctype = 'uint64_t'

    # Create a temporary array on the stack, where we will copy the SVE register contents to
    self.fill('{} __tmp[{} / {}];'.format(stream_type, util.REGISTER_BYTE_SIZE, stream_type.bytes))

    # Count the number of "to push" elements based on the current predicate
    self.fill('size_t __cnt = svcntp_b{}({}, {});'.format(self.pred_bits, self.pred_name, self.pred_name))

    # Store the contents of the SVE register in the temporary array
    self.fill(f'svst1(svwhilelt_b{self.pred_bits}(0, ({self.counter_type}) __cnt), __tmp, ')

    # The contents should be compacted (i.e. all elements where the predicate is true are aligned)
    self.write(f'svcompact({self.pred_name}, ')
    self.dispatch_expect(t.value, dtypes.vector(stream_type, -1))
    self.write('));')

    # Special casting for int64_t back to `long long`
    ptr_cast = ''
    if stream_type.type == np.int64:
        ptr_cast = '(long long*) '
    elif stream_type.type == np.uint64:
        ptr_cast = '(unsigned long long*) '

    # Push the temporary array onto the stream using DaCe's push
    self.fill(f'{target_stream[0]}.push({ptr_cast}&__tmp[0], __cnt);')

    self.leave()
def _Subscript(t, symbols, inferred_symbols):
    value_type = _dispatch(t.value, symbols, inferred_symbols)
    slice_type = _dispatch(t.slice, symbols, inferred_symbols)

    if isinstance(slice_type, dtypes.pointer):
        raise SyntaxError('Invalid syntax (pointer given as slice)')

    # A slice as subscript (e.g. [0:N]) returns a pointer
    if isinstance(t.slice, ast.Slice):
        return value_type

    # A vector as subscript of a pointer returns a vector of the base type
    if isinstance(value_type, dtypes.pointer) and isinstance(slice_type, dtypes.vector):
        if not np.issubdtype(slice_type.type, np.integer):
            raise SyntaxError('Subscript must be some integer type')
        return dtypes.vector(value_type.base_type, slice_type.veclen)

    # Otherwise (some index as subscript) we return the base type
    if isinstance(value_type, dtypes.typeclass):
        return value_type.base_type

    return value_type
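# A hedged example of the gather rule in _Subscript above (standard DaCe types
# only): indexing a pointer with an integer vector models a gather and
# therefore yields a vector of the pointee type.
import dace
from dace import dtypes

ptr = dtypes.pointer(dace.float32)
idx = dtypes.vector(dace.int32, 4)
gathered = dtypes.vector(ptr.base_type, idx.veclen)  # what _Subscript infers for ptr[idx]
print(gathered)  # a 4-lane float32 vector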
def expansion(node, parent_state, parent_sdfg, n=None, **kwargs):
    """
    :param node: The node to expand.
    :param parent_state: The state that the node is in.
    :param parent_sdfg: The SDFG that the node is in.
    :param n: Override the vector dimension. If this is not set, the value
              specified in the node is used.
    """
    (desc_x, stride_x), (desc_y, stride_y), desc_res, sz = node.validate(
        parent_sdfg, parent_state)

    n = n or node.n or sz

    sdfg = dace.SDFG("dot")
    state = sdfg.add_state("dot")

    dtype = desc_x.dtype.base_type
    veclen = desc_x.veclen
    vtype = dtypes.vector(dtype, veclen)

    desc_x = desc_x.clone()
    desc_x.transient = False
    desc_y = desc_y.clone()
    desc_y.transient = False
    desc_res = desc_res.clone()
    desc_res.transient = False
    sdfg.add_datadesc("_x", desc_x)
    sdfg.add_datadesc("_y", desc_y)
    sdfg.add_datadesc("_result", desc_res)

    x_read = state.add_read("_x")
    y_read = state.add_read("_y")
    res_write = state.add_write("_result")

    input_x_name = "input_x"
    sdfg.add_array(input_x_name, (1, ),
                   vtype,
                   transient=True,
                   storage=dtypes.StorageType.FPGA_Local)
    input_x_access = state.add_access(input_x_name)

    input_y_name = "input_y"
    sdfg.add_array(input_y_name, (1, ),
                   vtype,
                   transient=True,
                   storage=dtypes.StorageType.FPGA_Local)
    input_y_access = state.add_access(input_y_name)

    entry, exit = state.add_map("stream", {"_i_dot": f"0:{n}/{veclen}"},
                                schedule=dtypes.ScheduleType.FPGA_Device)

    index_x = "0" if isinstance(desc_x, dt.Stream) else "_i_dot"
    index_y = "0" if isinstance(desc_y, dt.Stream) else "_i_dot"

    state.add_memlet_path(x_read,
                          entry,
                          input_x_access,
                          memlet=dace.Memlet(f"{x_read.data}[{index_x}]",
                                             other_subset="0",
                                             dynamic=False))
    state.add_memlet_path(y_read,
                          entry,
                          input_y_access,
                          memlet=dace.Memlet(f"{y_read.data}[{index_y}]",
                                             other_subset="0",
                                             dynamic=False))

    tasklet = state.add_tasklet("multiply", {"__x", "__y"},
                                {"_product": vtype},
                                "_product = __x * __y")

    state.add_memlet_path(input_x_access,
                          tasklet,
                          dst_conn="__x",
                          memlet=dace.Memlet(f"{input_x_name}[0]"))
    state.add_memlet_path(input_y_access,
                          tasklet,
                          dst_conn="__y",
                          memlet=dace.Memlet(f"{input_y_name}[0]"))

    product_name = "product"
    sdfg.add_array(product_name, (veclen, ),
                   dtype,
                   transient=True,
                   storage=dtypes.StorageType.FPGA_Local)
    product_access = state.add_access(product_name)

    state.add_memlet_path(tasklet,
                          product_access,
                          src_conn="_product",
                          memlet=dace.Memlet(f"{product_name}[0:{veclen}]"))

    collapse_name = "reduce_vector"
    sdfg.add_array(collapse_name, (1, ),
                   dtype,
                   transient=True,
                   storage=dtypes.StorageType.FPGA_Local)
    collapse_read = state.add_read(collapse_name)
    collapse_access = state.add_access(collapse_name)

    unroll_entry, unroll_exit = state.add_map(
        "unroll", {"_j_dot": f"0:{veclen}"},
        unroll=True,
        schedule=dtypes.ScheduleType.FPGA_Device)

    collapse_tasklet = state.add_tasklet(
        "reduce_vector", {"val_in", "reduce_in"}, {"reduce_out"}, """\
prev = reduce_in if _j_dot > 0 else 0
reduce_out = prev + val_in""")

    state.add_memlet_path(collapse_read,
                          unroll_entry,
                          collapse_tasklet,
                          dst_conn="reduce_in",
                          memlet=dace.Memlet(f"{collapse_name}[0]"))
    state.add_memlet_path(entry, collapse_read, memlet=dace.Memlet())
    state.add_memlet_path(collapse_tasklet,
                          unroll_exit,
                          collapse_access,
                          src_conn="reduce_out",
                          memlet=dace.Memlet(f"{collapse_name}[0]"))
    state.add_memlet_path(product_access,
                          unroll_entry,
                          collapse_tasklet,
                          dst_conn="val_in",
                          memlet=dace.Memlet(f"{product_name}[_j_dot]"))

    buffer_name = "reduce_buffer"
    sdfg.add_array(buffer_name, (1, ),
                   dtype,
                   transient=True,
                   storage=dtypes.StorageType.FPGA_Local)
    buffer_read = state.add_read(buffer_name)
    buffer_write = state.add_access(buffer_name)

    zero_tasklet = state.add_tasklet("zero", {}, {"buffer"}, "buffer = 0")
    state.add_memlet_path(zero_tasklet,
                          buffer_read,
                          src_conn="buffer",
                          memlet=dace.Memlet(f"{buffer_name}[0]"))

    reduce_tasklet = state.add_tasklet(
        "sum", {"buffer_in", "result_in"}, {"buffer_out"}, """\
prev = buffer_in if _i_dot > 0 else 0
buffer_out = prev + result_in""")

    state.add_memlet_path(collapse_access,
                          reduce_tasklet,
                          dst_conn="result_in",
                          memlet=dace.Memlet(f"{collapse_access.data}[0]"))
    state.add_memlet_path(buffer_read,
                          entry,
                          reduce_tasklet,
                          dst_conn="buffer_in",
                          memlet=dace.Memlet(f"{buffer_name}[0]"))
    state.add_memlet_path(reduce_tasklet,
                          exit,
                          buffer_write,
                          src_conn="buffer_out",
                          memlet=dace.Memlet(f"{buffer_name}[0]"))
    state.add_memlet_path(buffer_write,
                          res_write,
                          memlet=dace.Memlet(f"{buffer_name}[0]",
                                             other_subset="0"))

    return sdfg
def get_internal_symbols() -> dict:
    """
    Generates all internal symbols by crossing the internal function names
    with all possible type suffixes. Then defines the symbol with the
    corresponding return type (based on the suffix).
    """
    res = {}

    for func, type in itertools.product(FUSED_OPERATION_TO_SVE, TYPE_TO_SVE_SUFFIX):
        res[f'{func}_{TYPE_TO_SVE_SUFFIX[type.type if isinstance(type, dace.dtypes.typeclass) else type]}'] = dtypes.vector(
            type if isinstance(type, dtypes.typeclass) else dtypes.typeclass(type),
            SVE_LEN)

    return res
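# A minimal sketch of the naming scheme above, using hypothetical stand-ins for
# FUSED_OPERATION_TO_SVE and TYPE_TO_SVE_SUFFIX (the real tables live in the SVE
# utility module); only the cross-product and suffixing logic is illustrated.
import itertools
import dace
from dace import dtypes

HYPOTHETICAL_FUNCS = ['__sve_fma']             # placeholder internal function name
HYPOTHETICAL_SUFFIXES = {dace.float32: 'f32'}  # placeholder type-to-suffix table
PLACEHOLDER_SVE_LEN = -1                       # mirrors the unresolved lane count used elsewhere in this section

symbols = {
    f'{func}_{suffix}': dtypes.vector(dtype, PLACEHOLDER_SVE_LEN)
    for func, (dtype, suffix) in itertools.product(HYPOTHETICAL_FUNCS,
                                                   HYPOTHETICAL_SUFFIXES.items())
}
print(symbols)  # {'__sve_fma_f32': <float32 vector>}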
def apply(self, sdfg: SDFG):
    graph = sdfg.nodes()[self.state_id]
    map_entry = graph.nodes()[self.subgraph[Vectorization._map_entry]]
    tasklet: nodes.Tasklet = graph.successors(map_entry)[0]
    param = symbolic.pystr_to_symbolic(map_entry.map.params[-1])

    # Create new vector size.
    vector_size = self.vector_len
    dim_from, dim_to, dim_skip = map_entry.map.range[-1]

    # Determine whether to create preamble or postamble maps
    if self.preamble is not None:
        create_preamble = self.preamble
    else:
        create_preamble = not ((dim_from % vector_size == 0) == True
                               or dim_from == 0)
    if self.postamble is not None:
        create_postamble = self.postamble
    else:
        if isinstance(dim_to, symbolic.SymExpr):
            create_postamble = (((dim_to.approx + 1) % vector_size == 0) == False)
        else:
            create_postamble = (((dim_to + 1) % vector_size == 0) == False)

    # Determine new range for vectorized map
    if self.strided_map:
        new_range = [dim_from, dim_to - vector_size + 1, vector_size]
    else:
        new_range = [
            dim_from // vector_size, ((dim_to + 1) // vector_size) - 1, dim_skip
        ]

    # Create preamble non-vectorized map (replacing the original map)
    if create_preamble:
        old_scope = graph.scope_subgraph(map_entry, True, True)
        new_scope: ScopeSubgraphView = replicate_scope(sdfg, graph, old_scope)
        new_begin = dim_from + (vector_size - (dim_from % vector_size))
        map_entry.map.range[-1] = (dim_from, new_begin - 1, dim_skip)
        # Replace map_entry with the replicated scope (so that the preamble
        # will usually come first in topological sort)
        map_entry = new_scope.entry
        tasklet = new_scope.nodes()[old_scope.nodes().index(tasklet)]
        new_range[0] = new_begin

    # Create postamble non-vectorized map
    if create_postamble:
        new_scope: ScopeSubgraphView = replicate_scope(
            sdfg, graph, graph.scope_subgraph(map_entry, True, True))
        dim_to_ex = dim_to + 1
        new_scope.entry.map.range[-1] = (dim_to_ex - (dim_to_ex % vector_size),
                                         dim_to, dim_skip)

    # Change the step of the inner-most dimension.
    map_entry.map.range[-1] = tuple(new_range)

    # Vectorize connectors adjacent to the tasklet.
    for edge in graph.all_edges(tasklet):
        connectors = (tasklet.in_connectors
                      if edge.dst == tasklet else tasklet.out_connectors)
        conn = edge.dst_conn if edge.dst == tasklet else edge.src_conn

        if edge.data.data is None:  # Empty memlets
            continue
        desc = sdfg.arrays[edge.data.data]
        contigidx = desc.strides.index(1)

        newlist = []

        lastindex = edge.data.subset[contigidx]
        if isinstance(lastindex, tuple):
            newlist = [(rb, re, rs) for rb, re, rs in edge.data.subset]
            symbols = set()
            for indd in lastindex:
                symbols.update(symbolic.pystr_to_symbolic(indd).free_symbols)
        else:
            newlist = [(rb, rb, 1) for rb in edge.data.subset]
            symbols = symbolic.pystr_to_symbolic(lastindex).free_symbols

        oldtype = connectors[conn]
        if oldtype is None or oldtype.type is None:
            oldtype = desc.dtype

        # Vector to scalar WCR edge: change connector and continue
        lastedge = graph.memlet_path(edge)[-1]
        if (lastedge.data.subset.num_elements() == 1
                and edge.data.wcr is not None):
            connectors[conn] = dtypes.vector(oldtype, vector_size)
            continue

        if str(param) not in map(str, symbols):
            continue

        # Vectorize connector, if not already vectorized
        if isinstance(oldtype, dtypes.vector):
            continue
        connectors[conn] = dtypes.vector(oldtype, vector_size)

        # Modify memlet subset to match vector length
        if self.strided_map:
            rb = newlist[contigidx][0]
            if self.propagate_parent:
                newlist[contigidx] = (rb / self.vector_len,
                                      rb / self.vector_len, 1)
            else:
                newlist[contigidx] = (rb, rb + self.vector_len - 1, 1)
        else:
            rb = newlist[contigidx][0]
            if self.propagate_parent:
                newlist[contigidx] = (rb, rb, 1)
            else:
                newlist[contigidx] = (self.vector_len * rb,
                                      self.vector_len * rb + self.vector_len - 1,
                                      1)
        edge.data.subset = subsets.Range(newlist)
        edge.data.volume = vector_size

    # Vector length propagation using data descriptors, recursive traversal
    # outwards
    if self.propagate_parent:
        for edge in graph.all_edges(tasklet):
            cursdfg = sdfg
            curedge = edge
            while cursdfg is not None:
                arrname = curedge.data.data
                dtype = cursdfg.arrays[arrname].dtype

                # Change type and shape to vector
                if not isinstance(dtype, dtypes.vector):
                    cursdfg.arrays[arrname].dtype = dtypes.vector(dtype, vector_size)
                    new_shape = list(cursdfg.arrays[arrname].shape)
                    contigidx = cursdfg.arrays[arrname].strides.index(1)
                    new_shape[contigidx] /= vector_size
                    try:
                        new_shape[contigidx] = int(new_shape[contigidx])
                    except TypeError:
                        pass
                    cursdfg.arrays[arrname].shape = new_shape

                propagation.propagate_memlets_sdfg(cursdfg)

                # Find matching edge in parent
                nsdfg = cursdfg.parent_nsdfg_node
                if nsdfg is None:
                    break
                tstate = cursdfg.parent
                curedge = ([
                    e for e in tstate.in_edges(nsdfg) if e.dst_conn == arrname
                ] + [
                    e for e in tstate.out_edges(nsdfg) if e.src_conn == arrname
                ])[0]
                cursdfg = cursdfg.parent_sdfg
def vector_reduction_expr(self, edge, dtype, rhs):
    # Check whether it is a known reduction that is possible in SVE
    reduction_type = detect_reduction_type(edge.data.wcr)
    if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
        raise util.NotSupportedError('Unsupported reduction in SVE')

    nc = not is_write_conflicted(self.dfg, edge)
    if not nc or not isinstance(edge.src.out_connectors[edge.src_conn],
                                (dtypes.pointer, dtypes.vector)):
        # WCR on vectors works in two steps:
        # 1. Reduce the SVE register using SVE instructions into a scalar
        # 2. WCR the scalar to memory using DaCe functionality
        dst_node = self.dfg.memlet_path(edge)[-1].dst
        if (isinstance(dst_node, nodes.AccessNode) and
                dst_node.desc(self.sdfg).storage == dtypes.StorageType.SVE_Register):
            return

        wcr = self.cpu_codegen.write_and_resolve_expr(self.sdfg,
                                                      edge.data,
                                                      not nc,
                                                      None,
                                                      '@',
                                                      dtype=dtype)
        self.fill(wcr[:wcr.find('@')])
        self.write(util.REDUCTION_TYPE_TO_SVE[reduction_type])
        self.write('(')
        self.write(self.pred_name)
        self.write(', ')
        self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
        self.write(')')
        self.write(wcr[wcr.find('@') + 1:])
        self.write(';')
    else:
        ######################
        # Horizontal non-atomic reduction

        stride = edge.data.get_stride(self.sdfg, self.map)

        # long long fix
        ptr_cast = ''
        src_type = edge.src.out_connectors[edge.src_conn]

        if src_type.type == np.int64:
            ptr_cast = '(int64_t*) '
        elif src_type.type == np.uint64:
            ptr_cast = '(uint64_t*) '

        store_args = '{}, {}'.format(
            self.pred_name,
            ptr_cast + cpp_ptr_expr(self.sdfg, edge.data, DefinedType.Pointer),
        )

        red_type = util.REDUCTION_TYPE_TO_SVE[reduction_type][:-1] + '_x'
        if stride == 1:
            self.write(
                f'svst1({store_args}, {red_type}({self.pred_name}, svld1({store_args}), ')
            self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
            self.write('));')
        else:
            store_args = f'{store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)})'
            self.write(
                f'svst1_scatter_index({store_args}, {red_type}({self.pred_name}, svld1_gather_index({store_args}), ')
            self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
            self.write('));')
def apply(self, state: SDFGState, sdfg: SDFG) -> nodes.AccessNode:
    dnode: nodes.AccessNode = self.access
    if self.expr_index == 0:
        edges = state.out_edges(dnode)
    else:
        edges = state.in_edges(dnode)

    # To understand how many components we need to create, all map ranges
    # throughout memlet paths must match exactly. We thus create a
    # dictionary of unique ranges
    mapping: Dict[Tuple[subsets.Range],
                  List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    ranges = {}
    for edge in edges:
        mpath = state.memlet_path(edge)
        ranges[edge] = _collect_map_ranges(state, mpath)
        mapping[tuple(r[1] for r in ranges[edge])].append(edge)

    # Collect all edges with the same memory access pattern
    components_to_create: Dict[
        Tuple[symbolic.SymbolicType],
        List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    for edges_with_same_range in mapping.values():
        for edge in edges_with_same_range:
            # Get memlet path and innermost edge
            mpath = state.memlet_path(edge)
            innermost_edge = copy.deepcopy(
                mpath[-1] if self.expr_index == 0 else mpath[0])

            # Store memlets of the same access in the same component
            expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
            components_to_create[expr].append((innermost_edge, edge))
    components = list(components_to_create.values())

    # Split out components that have dependencies between them to avoid
    # deadlocks
    if self.expr_index == 0:
        ccs_to_add = []
        for i, component in enumerate(components):
            edges_to_remove = set()
            for cedge in component:
                if any(
                        nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                        for o in component if o is not cedge):
                    ccs_to_add.append([cedge])
                    edges_to_remove.add(cedge)
            if edges_to_remove:
                components[i] = [
                    c for c in component if c not in edges_to_remove
                ]
        components.extend(ccs_to_add)
    # End of split

    desc = sdfg.arrays[dnode.data]

    # Create new streams of shape 1
    streams = {}
    mpaths = {}
    for edge in edges:

        if self.use_memory_buffering:

            arrname = str(self.access)

            # Add gearbox
            total_size = edge.data.volume
            vector_size = int(self.memory_buffering_target_bytes / desc.dtype.bytes)

            if not is_int(sdfg.arrays[dnode.data].shape[-1]):
                warnings.warn(
                    "Using the MemoryBuffering transformation is potentially unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                    .format(sym=sdfg.arrays[dnode.data].shape[-1],
                            vec=vector_size))

            for i in sdfg.arrays[dnode.data].strides:
                if not is_int(i):
                    warnings.warn(
                        "Using the MemoryBuffering transformation is potentially unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                        .format(sym=i, vec=vector_size))

            if self.expr_index == 0:  # Read
                edges = state.out_edges(dnode)
                gearbox_input_type = dtypes.vector(desc.dtype, vector_size)
                gearbox_output_type = desc.dtype
                gearbox_read_volume = total_size / vector_size
                gearbox_write_volume = total_size
            else:  # Write
                edges = state.in_edges(dnode)
                gearbox_input_type = desc.dtype
                gearbox_output_type = dtypes.vector(desc.dtype, vector_size)
                gearbox_read_volume = total_size
                gearbox_write_volume = total_size / vector_size

            input_gearbox_name, input_gearbox_newdesc = sdfg.add_stream(
                "gearbox_input",
                gearbox_input_type,
                buffer_size=self.buffer_size,
                storage=self.storage,
                transient=True,
                find_new_name=True)

            output_gearbox_name, output_gearbox_newdesc = sdfg.add_stream(
                "gearbox_output",
                gearbox_output_type,
                buffer_size=self.buffer_size,
                storage=self.storage,
                transient=True,
                find_new_name=True)

            read_to_gearbox = state.add_read(input_gearbox_name)
            write_from_gearbox = state.add_write(output_gearbox_name)

            gearbox = Gearbox(total_size / vector_size)

            state.add_node(gearbox)

            state.add_memlet_path(read_to_gearbox,
                                  gearbox,
                                  dst_conn="from_memory",
                                  memlet=Memlet(input_gearbox_name + "[0]",
                                                volume=gearbox_read_volume))
            state.add_memlet_path(gearbox,
                                  write_from_gearbox,
                                  src_conn="to_kernel",
                                  memlet=Memlet(output_gearbox_name + "[0]",
                                                volume=gearbox_write_volume))

            if self.expr_index == 0:
                streams[edge] = input_gearbox_name
                name = output_gearbox_name
                newdesc = output_gearbox_newdesc
            else:
                streams[edge] = output_gearbox_name
                name = input_gearbox_name
                newdesc = input_gearbox_newdesc

        else:
            # Qualify name to avoid name clashes if memory interfaces are not decoupled for Xilinx
            stream_name = "stream_" + dnode.data
            name, newdesc = sdfg.add_stream(stream_name,
                                            desc.dtype,
                                            buffer_size=self.buffer_size,
                                            storage=self.storage,
                                            transient=True,
                                            find_new_name=True)
            streams[edge] = name

            # Add these such that we can easily use output_gearbox_name and
            # input_gearbox_name without using if statements
            output_gearbox_name = name
            input_gearbox_name = name

        mpath = state.memlet_path(edge)
        mpaths[edge] = mpath

        # Replace memlets in path with stream access
        for e in mpath:
            e.data = mm.Memlet(data=name,
                               subset='0',
                               other_subset=e.data.other_subset)
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)

        # Replace access node and memlet tree with one access
        if self.expr_index == 0:
            replacement = state.add_read(output_gearbox_name)
            state.remove_edge(edge)
            state.add_edge(replacement, edge.src_conn, edge.dst,
                           edge.dst_conn, edge.data)
        else:
            replacement = state.add_write(input_gearbox_name)
            state.remove_edge(edge)
            state.add_edge(edge.src, edge.src_conn, replacement,
                           edge.dst_conn, edge.data)

    if self.use_memory_buffering:

        arrname = str(self.access)
        vector_size = int(self.memory_buffering_target_bytes / desc.dtype.bytes)

        # Vectorize access to global array.
        dtype = sdfg.arrays[arrname].dtype
        sdfg.arrays[arrname].dtype = dtypes.vector(dtype, vector_size)
        new_shape = list(sdfg.arrays[arrname].shape)
        contigidx = sdfg.arrays[arrname].strides.index(1)
        new_shape[contigidx] /= vector_size
        try:
            new_shape[contigidx] = int(new_shape[contigidx])
        except TypeError:
            pass
        sdfg.arrays[arrname].shape = new_shape

        # Change strides
        new_strides: List = list(sdfg.arrays[arrname].strides)

        for i in range(len(new_strides)):
            if i == len(new_strides) - 1:  # Skip last dimension since it is always 1
                continue
            new_strides[i] = new_strides[i] / vector_size
        sdfg.arrays[arrname].strides = new_strides

        post_state = get_post_state(sdfg, state)

        if post_state is not None:
            # Change subset in the post state such that the correct amount of memory is copied back from the device
            for e in post_state.edges():
                if e.data.data == self.access.data:
                    new_subset = list(e.data.subset)
                    i, j, k = new_subset[-1]
                    new_subset[-1] = (i, (j + 1) / vector_size - 1, k)
                    e.data = mm.Memlet(data=str(e.src),
                                       subset=subsets.Range(new_subset))

    # Make read/write components
    ionodes = []
    for component in components:

        # Pick the first edge as the edge to make the component from
        innermost_edge, outermost_edge = component[0]
        mpath = mpaths[outermost_edge]
        mapname = streams[outermost_edge]
        innermost_edge.data.other_subset = None

        # Get edge data and streams
        if self.expr_index == 0:
            opname = 'read'
            path = [e.dst for e in mpath[:-1]]
            rmemlets = [(dnode, '__inp', innermost_edge.data)]
            wmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_write(name)
                ionodes.append(ionode)
                wmemlets.append(
                    (ionode, '__out%d' % i, mm.Memlet(data=name, subset='0')))
            code = '\n'.join('__out%d = __inp' % i
                             for i in range(len(component)))
        else:
            # More than one input stream might mean a data race, so we only
            # address the first one in the tasklet code
            if len(component) > 1:
                warnings.warn(
                    f'More than one input found for the same index for {dnode.data}')
            opname = 'write'
            path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
            wmemlets = [(dnode, '__out', innermost_edge.data)]
            rmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_read(name)
                ionodes.append(ionode)
                rmemlets.append(
                    (ionode, '__inp%d' % i, mm.Memlet(data=name, subset='0')))
            code = '__out = __inp0'

        # Create map structure for read/write component
        maps = []
        for entry in path:
            map: nodes.Map = entry.map

            ranges = [(p, (r[0], r[1], r[2]))
                      for p, r in zip(map.params, map.range)]

            # Change ranges of map
            if self.use_memory_buffering:

                # Find edges from/to map
                edge_subset = [
                    a_tuple[0] for a_tuple in list(innermost_edge.data.subset)
                ]

                # Change range of map
                if isinstance(edge_subset[-1], symbol) and str(
                        edge_subset[-1]) == map.params[-1]:

                    if not is_int(ranges[-1][1][1]):
                        warnings.warn(
                            "Using the MemoryBuffering transformation is potentially unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                            .format(sym=ranges[-1][1][1].args[1],
                                    vec=vector_size))

                    ranges[-1] = (ranges[-1][0],
                                  (ranges[-1][1][0],
                                   (ranges[-1][1][1] + 1) / vector_size - 1,
                                   ranges[-1][1][2]))

                elif isinstance(edge_subset[-1], sympy.core.add.Add):

                    for arg in edge_subset[-1].args:
                        if isinstance(arg, symbol) and str(arg) == map.params[-1]:

                            if not is_int(ranges[-1][1][1]):
                                warnings.warn(
                                    "Using the MemoryBuffering transformation is potentially unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                    .format(sym=ranges[-1][1][1].args[1],
                                            vec=vector_size))

                            ranges[-1] = (ranges[-1][0],
                                          (ranges[-1][1][0],
                                           (ranges[-1][1][1] + 1) / vector_size - 1,
                                           ranges[-1][1][2]))

            maps.append(
                state.add_map(f'__s{opname}_{mapname}', ranges, map.schedule))

        tasklet = state.add_tasklet(
            f'{opname}_{mapname}',
            {m[1] for m in rmemlets},
            {m[1] for m in wmemlets},
            code,
        )
        for node, cname, memlet in rmemlets:
            state.add_memlet_path(node,
                                  *(me for me, _ in maps),
                                  tasklet,
                                  dst_conn=cname,
                                  memlet=memlet)
        for node, cname, memlet in wmemlets:
            state.add_memlet_path(tasklet,
                                  *(mx for _, mx in reversed(maps)),
                                  node,
                                  src_conn=cname,
                                  memlet=memlet)

    return ionodes