def make_write_C(sdfg, state, vtype):

    # Deduce types
    dtype = vtype.base_type
    mem_veclen = 64 // dtype.bytes
    mtype = dace.vector(dtype, mem_veclen)

    from_kernel = state.add_read("C_pipe")
    mem_read = state.add_read("C_device")
    mem_write = state.add_write("C_device")

    if mem_veclen > vtype.veclen:
        # We need to convert from the kernel vectorization length to 512-bit
        # vectors that are written back to memory
        gearbox = Gearbox(f"(N//TN) * (M//TM) * TN * (TM//{mem_veclen})",
                          "convert_C",
                          schedule=dace.ScheduleType.FPGA_Device)
        sdfg.add_stream("C_from_converter",
                        mtype,
                        buffer_size=f"TM//{mem_veclen}",
                        storage=dace.StorageType.FPGA_Local,
                        transient=True)
        converter_write = state.add_write("C_from_converter")
        state.add_memlet_path(from_kernel,
                              gearbox,
                              dst_conn="from_kernel",
                              memlet=dace.Memlet(f"C_pipe[0]", dynamic=True))
        state.add_memlet_path(gearbox,
                              converter_write,
                              src_conn="to_memory",
                              memlet=dace.Memlet("C_from_converter[0]",
                                                 dynamic=True))
        to_writer = state.add_read("C_from_converter")
        to_writer_subset = "C_from_converter[0]"
    else:
        # Just send the data directly to the reader
        to_writer = from_kernel
        to_writer_subset = "C_pipe[0]"

    entry, exit = state.add_map("write_C", {
        "n0": "0:N//TN",
        "m0": "0:M//TM",
        "n1": "0:TN",
        "m1": f"0:TM//{mem_veclen}"
    },
                                schedule=dace.ScheduleType.FPGA_Device)

    tasklet = state.add_tasklet("write_C", {"from_kernel", "prev"},
                                {"to_memory"},
                                "to_memory = from_kernel + prev")

    state.add_memlet_path(to_writer,
                          entry,
                          tasklet,
                          dst_conn="from_kernel",
                          memlet=dace.Memlet(to_writer_subset))
    state.add_memlet_path(
        mem_read,
        entry,
        tasklet,
        dst_conn="prev",
        memlet=dace.Memlet(
            f"C_device[n0 * TN + n1, m0 * (TM//{mem_veclen}) + m1]"))
    state.add_memlet_path(
        tasklet,
        exit,
        mem_write,
        src_conn="to_memory",
        memlet=dace.Memlet(
            f"C_device[n0 * TN + n1, m0 * (TM//{mem_veclen}) + m1]"))
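
# Illustrative sketch (an assumption, not taken from the surrounding sample):
# the containers that make_write_C expects to already exist on the SDFG. The
# shapes and element types are inferred from the memlets above (C_device is
# addressed in 512-bit words holding mem_veclen elements each, and C_pipe
# carries kernel-width vectors); the symbols N, M, TM match the map ranges
# used in the writer, with TM assumed divisible by mem_veclen.
def _example_declare_C(sdfg, vtype):
    dtype = vtype.base_type
    mem_veclen = 64 // dtype.bytes  # Elements per 512-bit memory word
    mtype = dace.vector(dtype, mem_veclen)
    # Off-chip buffer for C, one wide word per (row, word) index
    sdfg.add_array("C_device", ("N", f"M//{mem_veclen}"),
                   mtype,
                   transient=True,
                   storage=dace.StorageType.FPGA_Global)
    # Stream of kernel-width vectors produced by the compute kernel
    sdfg.add_stream("C_pipe",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)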
def apply(self, state: SDFGState, sdfg: SDFG) -> nodes.AccessNode:
    dnode: nodes.AccessNode = self.access
    if self.expr_index == 0:
        edges = state.out_edges(dnode)
    else:
        edges = state.in_edges(dnode)

    # To understand how many components we need to create, all map ranges
    # throughout memlet paths must match exactly. We thus create a
    # dictionary of unique ranges
    mapping: Dict[Tuple[subsets.Range],
                  List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    ranges = {}
    for edge in edges:
        mpath = state.memlet_path(edge)
        ranges[edge] = _collect_map_ranges(state, mpath)
        mapping[tuple(r[1] for r in ranges[edge])].append(edge)

    # Collect all edges with the same memory access pattern
    components_to_create: Dict[
        Tuple[symbolic.SymbolicType],
        List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
    for edges_with_same_range in mapping.values():
        for edge in edges_with_same_range:
            # Get memlet path and innermost edge
            mpath = state.memlet_path(edge)
            innermost_edge = copy.deepcopy(
                mpath[-1] if self.expr_index == 0 else mpath[0])

            # Store memlets of the same access in the same component
            expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
            components_to_create[expr].append((innermost_edge, edge))
    components = list(components_to_create.values())

    # Split out components that have dependencies between them to avoid
    # deadlocks
    if self.expr_index == 0:
        ccs_to_add = []
        for i, component in enumerate(components):
            edges_to_remove = set()
            for cedge in component:
                if any(
                        nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                        for o in component if o is not cedge):
                    ccs_to_add.append([cedge])
                    edges_to_remove.add(cedge)
            if edges_to_remove:
                components[i] = [
                    c for c in component if c not in edges_to_remove
                ]
        components.extend(ccs_to_add)
    # End of split

    desc = sdfg.arrays[dnode.data]

    # Create new streams of shape 1
    streams = {}
    mpaths = {}
    for edge in edges:

        if self.use_memory_buffering:

            arrname = str(self.access)

            # Add gearbox
            total_size = edge.data.volume
            vector_size = int(self.memory_buffering_target_bytes /
                              desc.dtype.bytes)

            if not is_int(sdfg.arrays[dnode.data].shape[-1]):
                warnings.warn(
                    "Using the MemoryBuffering transformation is potentially unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                    .format(sym=sdfg.arrays[dnode.data].shape[-1],
                            vec=vector_size))

            for i in sdfg.arrays[dnode.data].strides:
                if not is_int(i):
                    warnings.warn(
                        "Using the MemoryBuffering transformation is potentially unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                        .format(sym=i, vec=vector_size))

            if self.expr_index == 0:  # Read
                edges = state.out_edges(dnode)
                gearbox_input_type = dtypes.vector(desc.dtype, vector_size)
                gearbox_output_type = desc.dtype
                gearbox_read_volume = total_size / vector_size
                gearbox_write_volume = total_size
            else:  # Write
                edges = state.in_edges(dnode)
                gearbox_input_type = desc.dtype
                gearbox_output_type = dtypes.vector(desc.dtype, vector_size)
                gearbox_read_volume = total_size
                gearbox_write_volume = total_size / vector_size

            input_gearbox_name, input_gearbox_newdesc = sdfg.add_stream(
                "gearbox_input",
                gearbox_input_type,
                buffer_size=self.buffer_size,
                storage=self.storage,
                transient=True,
                find_new_name=True)

            output_gearbox_name, output_gearbox_newdesc = sdfg.add_stream(
                "gearbox_output",
                gearbox_output_type,
                buffer_size=self.buffer_size,
                storage=self.storage,
                transient=True,
                find_new_name=True)

            read_to_gearbox = state.add_read(input_gearbox_name)
            write_from_gearbox = state.add_write(output_gearbox_name)

            gearbox = Gearbox(total_size / vector_size)

            state.add_node(gearbox)

            state.add_memlet_path(read_to_gearbox,
                                  gearbox,
                                  dst_conn="from_memory",
                                  memlet=Memlet(input_gearbox_name + "[0]",
                                                volume=gearbox_read_volume))
            state.add_memlet_path(gearbox,
                                  write_from_gearbox,
                                  src_conn="to_kernel",
                                  memlet=Memlet(output_gearbox_name + "[0]",
                                                volume=gearbox_write_volume))

            if self.expr_index == 0:
                streams[edge] = input_gearbox_name
                name = output_gearbox_name
                newdesc = output_gearbox_newdesc
            else:
                streams[edge] = output_gearbox_name
                name = input_gearbox_name
                newdesc = input_gearbox_newdesc

        else:
            # Qualify name to avoid name clashes if memory interfaces are not
            # decoupled for Xilinx
            stream_name = "stream_" + dnode.data
            name, newdesc = sdfg.add_stream(stream_name,
                                            desc.dtype,
                                            buffer_size=self.buffer_size,
                                            storage=self.storage,
                                            transient=True,
                                            find_new_name=True)
            streams[edge] = name

            # Add these such that we can easily use output_gearbox_name and
            # input_gearbox_name without using if statements
            output_gearbox_name = name
            input_gearbox_name = name

        mpath = state.memlet_path(edge)
        mpaths[edge] = mpath

        # Replace memlets in path with stream access
        for e in mpath:
            e.data = mm.Memlet(data=name,
                               subset='0',
                               other_subset=e.data.other_subset)
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)

        # Replace access node and memlet tree with one access
        if self.expr_index == 0:
            replacement = state.add_read(output_gearbox_name)
            state.remove_edge(edge)
            state.add_edge(replacement, edge.src_conn, edge.dst,
                           edge.dst_conn, edge.data)
        else:
            replacement = state.add_write(input_gearbox_name)
            state.remove_edge(edge)
            state.add_edge(edge.src, edge.src_conn, replacement,
                           edge.dst_conn, edge.data)

    if self.use_memory_buffering:

        arrname = str(self.access)
        vector_size = int(self.memory_buffering_target_bytes /
                          desc.dtype.bytes)

        # Vectorize access to global array.
        dtype = sdfg.arrays[arrname].dtype
        sdfg.arrays[arrname].dtype = dtypes.vector(dtype, vector_size)
        new_shape = list(sdfg.arrays[arrname].shape)
        contigidx = sdfg.arrays[arrname].strides.index(1)
        new_shape[contigidx] /= vector_size
        try:
            new_shape[contigidx] = int(new_shape[contigidx])
        except TypeError:
            pass
        sdfg.arrays[arrname].shape = new_shape

        # Change strides
        new_strides: List = list(sdfg.arrays[arrname].strides)
        for i in range(len(new_strides)):
            if i == len(new_strides) - 1:
                # Skip last dimension since it is always 1
                continue
            new_strides[i] = new_strides[i] / vector_size
        sdfg.arrays[arrname].strides = new_strides

        post_state = get_post_state(sdfg, state)

        if post_state is not None:
            # Change subset in the post state such that the correct amount of
            # memory is copied back from the device
            for e in post_state.edges():
                if e.data.data == self.access.data:
                    new_subset = list(e.data.subset)
                    i, j, k = new_subset[-1]
                    new_subset[-1] = (i, (j + 1) / vector_size - 1, k)
                    e.data = mm.Memlet(data=str(e.src),
                                       subset=subsets.Range(new_subset))

    # Make read/write components
    ionodes = []
    for component in components:

        # Pick the first edge as the edge to make the component from
        innermost_edge, outermost_edge = component[0]
        mpath = mpaths[outermost_edge]
        mapname = streams[outermost_edge]
        innermost_edge.data.other_subset = None

        # Get edge data and streams
        if self.expr_index == 0:
            opname = 'read'
            path = [e.dst for e in mpath[:-1]]
            rmemlets = [(dnode, '__inp', innermost_edge.data)]
            wmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_write(name)
                ionodes.append(ionode)
                wmemlets.append(
                    (ionode, '__out%d' % i, mm.Memlet(data=name, subset='0')))
            code = '\n'.join('__out%d = __inp' % i
                             for i in range(len(component)))
        else:
            # More than one input stream might mean a data race, so we only
            # address the first one in the tasklet code
            if len(component) > 1:
                warnings.warn(
                    f'More than one input found for the same index for {dnode.data}'
                )
            opname = 'write'
            path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
            wmemlets = [(dnode, '__out', innermost_edge.data)]
            rmemlets = []
            for i, (_, edge) in enumerate(component):
                name = streams[edge]
                ionode = state.add_read(name)
                ionodes.append(ionode)
                rmemlets.append(
                    (ionode, '__inp%d' % i, mm.Memlet(data=name, subset='0')))
            code = '__out = __inp0'

        # Create map structure for read/write component
        maps = []
        for entry in path:
            map: nodes.Map = entry.map

            ranges = [(p, (r[0], r[1], r[2]))
                      for p, r in zip(map.params, map.range)]

            # Change ranges of map
            if self.use_memory_buffering:

                # Find edges from/to map
                edge_subset = [
                    a_tuple[0]
                    for a_tuple in list(innermost_edge.data.subset)
                ]

                # Change range of map
                if isinstance(edge_subset[-1], symbol) and str(
                        edge_subset[-1]) == map.params[-1]:

                    if not is_int(ranges[-1][1][1]):
                        warnings.warn(
                            "Using the MemoryBuffering transformation is potentially unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                            .format(sym=ranges[-1][1][1].args[1],
                                    vec=vector_size))

                    ranges[-1] = (ranges[-1][0],
                                  (ranges[-1][1][0],
                                   (ranges[-1][1][1] + 1) / vector_size - 1,
                                   ranges[-1][1][2]))

                elif isinstance(edge_subset[-1], sympy.core.add.Add):

                    for arg in edge_subset[-1].args:
                        if isinstance(arg,
                                      symbol) and str(arg) == map.params[-1]:

                            if not is_int(ranges[-1][1][1]):
                                warnings.warn(
                                    "Using the MemoryBuffering transformation is potentially unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                    .format(sym=ranges[-1][1][1].args[1],
                                            vec=vector_size))

                            ranges[-1] = (ranges[-1][0],
                                          (ranges[-1][1][0],
                                           (ranges[-1][1][1] + 1) /
                                           vector_size - 1,
                                           ranges[-1][1][2]))

            maps.append(
                state.add_map(f'__s{opname}_{mapname}', ranges, map.schedule))

        tasklet = state.add_tasklet(
            f'{opname}_{mapname}',
            {m[1] for m in rmemlets},
            {m[1] for m in wmemlets},
            code,
        )
        for node, cname, memlet in rmemlets:
            state.add_memlet_path(node,
                                  *(me for me, _ in maps),
                                  tasklet,
                                  dst_conn=cname,
                                  memlet=memlet)
        for node, cname, memlet in wmemlets:
            state.add_memlet_path(tasklet,
                                  *(mx for _, mx in reversed(maps)),
                                  node,
                                  src_conn=cname,
                                  memlet=memlet)

    return ionodes
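
# Usage sketch (an assumption for illustration, not part of the transformation
# itself): applying StreamingMemory with memory buffering enabled to an
# existing FPGA SDFG. The option names mirror the properties referenced in
# apply() above; the 64-byte target corresponds to a 512-bit memory interface,
# and some_sdfg stands in for a user-provided SDFG.
def _example_apply_streaming_memory(some_sdfg):
    import dace
    from dace.transformation.dataflow import StreamingMemory

    # Returns the number of access nodes that were converted to streams
    return some_sdfg.apply_transformations_repeated(
        StreamingMemory,
        options=dict(storage=dace.StorageType.FPGA_Local,
                     use_memory_buffering=True,
                     memory_buffering_target_bytes=64))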
def make_read_B(sdfg, state, vtype):

    # Deduce types
    dtype = vtype.base_type
    mem_veclen = 64 // dtype.bytes
    mtype = dace.vector(dtype, mem_veclen)

    entry, exit = state.add_map("read_B", {
        "n0": "0:N//TN",
        "m0": "0:M//TM",
        "k": "0:K",
        "m1": f"0:TM//{mem_veclen}"
    },
                                schedule=dace.ScheduleType.FPGA_Device)

    mem = state.add_read("B_device")
    to_feeder = state.add_write("B_to_feeder")

    tasklet = state.add_tasklet("read_B", {"from_memory"}, {"to_feeder"},
                                "to_feeder = from_memory")

    state.add_memlet_path(
        mem,
        entry,
        tasklet,
        dst_conn="from_memory",
        memlet=dace.Memlet(f"B_device[k, m0 * (TM//{mem_veclen}) + m1]"))

    if mem_veclen > vtype.veclen:
        # Data arrives as 512-bit wide vectors, and will be converted to the
        # vector length of the kernel
        sdfg.add_stream("B_to_converter",
                        dtype=mtype,
                        buffer_size=MINIMUM_CHANNEL_DEPTH,
                        storage=dace.StorageType.FPGA_Local,
                        transient=True)
        to_converter_write = state.add_write("B_to_converter")
        state.add_memlet_path(tasklet,
                              exit,
                              to_converter_write,
                              src_conn="to_feeder",
                              memlet=dace.Memlet("B_to_converter[0]"))
        # Convert 512-bit vectors to whatever width the kernel uses
        to_converter_read = state.add_read("B_to_converter")
        gearbox = Gearbox(f"(N//TN) * (M//TM) * K * (TM//{mem_veclen})",
                          "convert_B", dace.ScheduleType.FPGA_Device)
        state.add_memlet_path(to_converter_read,
                              gearbox,
                              dst_conn="from_memory",
                              memlet=dace.Memlet(f"B_to_converter[0]",
                                                 dynamic=True))
        state.add_memlet_path(gearbox,
                              to_feeder,
                              src_conn="to_feeder",
                              memlet=dace.Memlet("B_to_feeder[0]",
                                                 dynamic=True))
    else:
        # If the kernel uses the full memory width, just send the data directly
        # without any conversion
        state.add_memlet_path(tasklet,
                              exit,
                              to_feeder,
                              src_conn="to_feeder",
                              memlet=dace.Memlet(f"B_to_feeder[0]"))
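
# Note on the conversion ratio above (comment added for clarity; the concrete
# numbers are illustrative only): the Gearbox node is sized with the number of
# wide memory words it consumes, which equals the trip count of the read_B map,
# (N//TN) * (M//TM) * K * (TM//mem_veclen). With float32 elements, for example,
# mem_veclen = 64 // 4 = 16, so each 512-bit word read from B_device is split
# into 16 // vtype.veclen kernel-width vectors on B_to_feeder (4 vectors per
# word for a 4-wide kernel).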
def memory_buffering(vec_width, use_library_node, elementwise):

    gear_factor = mem_width // vec_width
    kernel_type = dace.vector(dtype, vec_width)
    if elementwise:
        memory_type = dace.vector(dtype, mem_width)
    else:
        memory_type = dace.vector(kernel_type, gear_factor)

    sdfg = dace.SDFG("memory_buffering_library_node")
    state = sdfg.add_state("memory_buffering_library_node")

    sdfg.add_array("input_array", (n / mem_width, ),
                   memory_type,
                   transient=True,
                   storage=dace.StorageType.FPGA_Global)
    sdfg.add_array("output_array", (n / mem_width, ),
                   memory_type,
                   transient=True,
                   storage=dace.StorageType.FPGA_Global)
    sdfg.add_stream("read_to_gearbox",
                    memory_type,
                    transient=True,
                    storage=dace.StorageType.FPGA_Local)
    sdfg.add_stream("gearbox_to_kernel",
                    kernel_type,
                    transient=True,
                    storage=dace.StorageType.FPGA_Local)
    sdfg.add_stream("kernel_to_gearbox",
                    kernel_type,
                    transient=True,
                    storage=dace.StorageType.FPGA_Local)
    sdfg.add_stream("gearbox_to_write",
                    memory_type,
                    transient=True,
                    storage=dace.StorageType.FPGA_Local)

    # Read from memory
    memory_read = state.add_read("input_array")
    read_to_gearbox_write = state.add_write("read_to_gearbox")
    read_entry, read_exit = state.add_map(
        "read", {"i": f"0:n/{mem_width}"},
        schedule=dace.ScheduleType.FPGA_Device)
    read_tasklet = state.add_tasklet("read", {"mem"}, {"to_gearbox"},
                                     "to_gearbox = mem")
    state.add_memlet_path(memory_read,
                          read_entry,
                          read_tasklet,
                          dst_conn="mem",
                          memlet=dace.Memlet(f"input_array[i]"))
    state.add_memlet_path(read_tasklet,
                          read_exit,
                          read_to_gearbox_write,
                          src_conn="to_gearbox",
                          memlet=dace.Memlet(f"read_to_gearbox[0]"))

    # Gearbox input
    read_to_gearbox_read = state.add_read("read_to_gearbox")
    gearbox_to_kernel_write = state.add_write("gearbox_to_kernel")
    if use_library_node:
        read_gearbox = Gearbox(n / mem_width, name="read_gearbox")
        state.add_node(read_gearbox)
        state.add_memlet_path(read_to_gearbox_read,
                              read_gearbox,
                              dst_conn="from_memory",
                              memlet=dace.Memlet("read_to_gearbox[0]",
                                                 volume=n / mem_width))
        state.add_memlet_path(read_gearbox,
                              gearbox_to_kernel_write,
                              src_conn="to_kernel",
                              memlet=dace.Memlet("gearbox_to_kernel[0]",
                                                 volume=n / vec_width))
    else:
        sdfg.add_array("read_buffer", (1, ),
                       memory_type,
                       storage=dace.StorageType.FPGA_Local,
                       transient=True)
        read_buffer_read = state.add_read("read_buffer")
        read_buffer_write = state.add_write("read_buffer")
        read_gearbox_entry, read_gearbox_exit = state.add_map(
            "gearbox_read", {
                "i": f"0:n/{mem_width}",
                "j": f"0:{gear_factor}"
            },
            schedule=dace.ScheduleType.FPGA_Device)
        read_gearbox_tasklet = state.add_tasklet(
            "gearbox_read", {
                "from_memory": memory_type,
                "buffer_in": None
            }, {"to_kernel", "buffer_out"}, """\
wide = from_memory if j == 0 else buffer_in
to_kernel = wide[j]
buffer_out = wide""")
        state.add_memlet_path(read_to_gearbox_read,
                              read_gearbox_entry,
                              read_gearbox_tasklet,
                              dst_conn="from_memory",
                              memlet=dace.Memlet("read_to_gearbox[0]",
                                                 dynamic=True))
        state.add_memlet_path(read_buffer_read,
                              read_gearbox_entry,
                              read_gearbox_tasklet,
                              dst_conn="buffer_in",
                              memlet=dace.Memlet("read_buffer[0]"))
        state.add_memlet_path(read_gearbox_tasklet,
                              read_gearbox_exit,
                              gearbox_to_kernel_write,
                              src_conn="to_kernel",
                              memlet=dace.Memlet("gearbox_to_kernel[0]"))
        state.add_memlet_path(read_gearbox_tasklet,
                              read_gearbox_exit,
                              read_buffer_write,
                              src_conn="buffer_out",
                              memlet=dace.Memlet("read_buffer[0]"))

    # Some fictional compute
    gearbox_to_kernel_read = state.add_read("gearbox_to_kernel")
    kernel_to_gearbox_write = state.add_write("kernel_to_gearbox")
    compute_entry, compute_exit = state.add_map(
        "compute", {"i": f"0:n/{vec_width}"},
        schedule=dace.ScheduleType.FPGA_Device)
    compute_tasklet = state.add_tasklet("compute", {"val_in"}, {"val_out"},
                                        "val_out = val_in + 1")
    state.add_memlet_path(gearbox_to_kernel_read,
                          compute_entry,
                          compute_tasklet,
                          dst_conn="val_in",
                          memlet=dace.Memlet("gearbox_to_kernel[0]"))
    state.add_memlet_path(compute_tasklet,
                          compute_exit,
                          kernel_to_gearbox_write,
                          src_conn="val_out",
                          memlet=dace.Memlet("kernel_to_gearbox[0]"))

    # Gearbox output
    kernel_to_gearbox_read = state.add_read("kernel_to_gearbox")
    gearbox_to_write_write = state.add_write("gearbox_to_write")
    if use_library_node:
        write_gearbox = Gearbox(n / mem_width, name="write_gearbox")
        state.add_node(write_gearbox)
        state.add_memlet_path(kernel_to_gearbox_read,
                              write_gearbox,
                              dst_conn="from_kernel",
                              memlet=dace.Memlet("kernel_to_gearbox[0]",
                                                 volume=n / vec_width))
        state.add_memlet_path(write_gearbox,
                              gearbox_to_write_write,
                              src_conn="to_memory",
                              memlet=dace.Memlet("gearbox_to_write[0]",
                                                 volume=n / mem_width))
    else:
        sdfg.add_array("write_buffer", (1, ),
                       memory_type,
                       storage=dace.StorageType.FPGA_Local,
                       transient=True)
        write_buffer_read = state.add_read("write_buffer")
        write_buffer_write = state.add_write("write_buffer")
        write_gearbox_entry, write_gearbox_exit = state.add_map(
            "gearbox_write", {
                "i": f"0:n/{mem_width}",
                "j": f"0:{gear_factor}"
            },
            schedule=dace.ScheduleType.FPGA_Device)
        write_gearbox_tasklet = state.add_tasklet(
            "gearbox_write", {"from_kernel", "buffer_in"},
            {"to_memory", "buffer_out"}, f"""\
wide = buffer_in
wide[j] = from_kernel
if j == {gear_factor} - 1:
    to_memory = wide
buffer_out = wide""")
        state.add_memlet_path(kernel_to_gearbox_read,
                              write_gearbox_entry,
                              write_gearbox_tasklet,
                              dst_conn="from_kernel",
                              memlet=dace.Memlet("kernel_to_gearbox[0]"))
        state.add_memlet_path(write_buffer_read,
                              write_gearbox_entry,
                              write_gearbox_tasklet,
                              dst_conn="buffer_in",
                              memlet=dace.Memlet("write_buffer[0]"))
        state.add_memlet_path(write_gearbox_tasklet,
                              write_gearbox_exit,
                              gearbox_to_write_write,
                              src_conn="to_memory",
                              memlet=dace.Memlet("gearbox_to_write[0]",
                                                 dynamic=True))
        state.add_memlet_path(write_gearbox_tasklet,
                              write_gearbox_exit,
                              write_buffer_write,
                              src_conn="buffer_out",
                              memlet=dace.Memlet("write_buffer[0]"))

    # Write memory
    gearbox_to_write_read = state.add_read("gearbox_to_write")
    memory_write = state.add_write("output_array")
    write_entry, write_exit = state.add_map(
        "write", {"i": f"0:n/{mem_width}"},
        schedule=dace.ScheduleType.FPGA_Device)
    write_tasklet = state.add_tasklet("write", {"from_gearbox"}, {"mem"},
                                      "mem = from_gearbox")
    state.add_memlet_path(gearbox_to_write_read,
                          write_entry,
                          write_tasklet,
                          dst_conn="from_gearbox",
                          memlet=dace.Memlet("gearbox_to_write[0]"))
    state.add_memlet_path(write_tasklet,
                          write_exit,
                          memory_write,
                          src_conn="mem",
                          memlet=dace.Memlet("output_array[i]"))

    # Copy data to the FPGA
    sdfg.add_array("input_array_host", (n, ), dtype)
    pre_state = sdfg.add_state("host_to_device")
    host_to_device_read = pre_state.add_read("input_array_host")
    host_to_device_write = pre_state.add_write("input_array")
    pre_state.add_memlet_path(
        host_to_device_read,
        host_to_device_write,
        memlet=dace.Memlet(f"input_array[0:n/{mem_width}]"))

    # Copy data back to the host
    sdfg.add_array("output_array_host", (n, ), dtype)
    post_state = sdfg.add_state("device_to_host")
    device_to_host_read = post_state.add_read("output_array")
    device_to_host_write = post_state.add_write("output_array_host")
    post_state.add_memlet_path(
        device_to_host_read,
        device_to_host_write,
        memlet=dace.Memlet(f"output_array[0:n/{mem_width}]"))

    # Link states
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    run_program(sdfg)

    return sdfg
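
# Illustrative driver (an assumption, not part of the original test): exercise
# both the library-node gearbox and the explicit tasklet gearbox. The module
# level names dtype, mem_width, n and run_program used by memory_buffering are
# assumed to be defined elsewhere in this file, e.g. dtype = dace.float32 and
# mem_width = 16 for a 512-bit memory interface, so vec_width = 4 gives
# gear_factor = 4.
if __name__ == "__main__":
    memory_buffering(vec_width=4, use_library_node=True, elementwise=False)
    memory_buffering(vec_width=4, use_library_node=False, elementwise=True)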