def generate_node(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView,
                  state_id: int, node: nodes.Node,
                  function_stream: prettycode.CodeIOStream,
                  callsite_stream: prettycode.CodeIOStream):
    """Generate code for a single tasklet.

    The work is done in three steps:
      (1) dispatch a copy for every incoming edge (array -> tasklet),
      (2) dispatch an output definition for every outgoing edge
          (tasklet -> array),
      (3) unparse the tasklet itself.

    Raises:
        RuntimeError: if ``node`` is not a ``nodes.Tasklet``.
    """
    # Anything but a tasklet is rejected up front.
    if not isinstance(node, nodes.Tasklet):
        raise RuntimeError(
            "Only tasklets are handled here, not {}. This should have been filtered by the predicate"
            .format(type(node)))

    # Arguments shared by every dispatcher call below.
    trailing_args = (sdfg, dfg, state_id, function_stream, callsite_stream)

    # Inputs: locate the array feeding each edge and emit the copy.
    for in_edge in dfg.in_edges(node):
        source_array = find_input_arraynode(dfg, in_edge)
        self.dispatcher.dispatch_copy(source_array, node, in_edge,
                                      *trailing_args)

    # Outputs: locate the array each edge writes to and define the output.
    for out_edge in dfg.out_edges(node):
        target_array = find_output_arraynode(dfg, out_edge)
        self.dispatcher.dispatch_output_definition(node, target_array,
                                                   out_edge, *trailing_args)

    # Finally, the tasklet body.
    self.unparse_tasklet(sdfg, dfg, state_id, node, function_stream,
                         callsite_stream)
def apply(self, sdfg):
    """Move the matched map or reduce node to the GPU using local copies.

    With ``self.expr_index == 0`` the matched node is a map entry, otherwise
    it is a reduce node. The method:

      1. sets the node's schedule to ``GPU_Device`` (and, if
         ``self.nested_seq`` is set for a map, turns Default storage /
         schedules inside the scope into Register / Sequential);
      2. collects every (access node, memlet) pair entering or leaving the
         node whose storage is not one of the GPU storage types;
      3. registers a transient ``GPU_Global`` clone named ``gpu_<name>...``
         for each such pair, sized from the memlet's bounding box;
      4. splices the clone between the original access node and the
         matched node, re-offsetting memlet subsets along the way;
      5. for maps, renames remaining memlets in the scope to the clones.

    The SDFG and state graph are modified in place; nothing is returned.
    """
    graph = sdfg.nodes()[self.state_id]
    if self.expr_index == 0:
        # Map pattern: schedule lives on the map object, exits come from
        # the graph.
        cnode = graph.nodes()[self.subgraph[
            GPUTransformLocalStorage._map_entry]]
        node_schedprop = cnode.map
        exit_nodes = graph.exit_nodes(cnode)
    else:
        # Reduce pattern: the node is its own "exit".
        cnode = graph.nodes()[self.subgraph[
            GPUTransformLocalStorage._reduce]]
        node_schedprop = cnode
        exit_nodes = [cnode]

    # Change schedule
    node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
    if Config.get_bool("debugprint"):
        GPUTransformLocalStorage._maps_transformed += 1
    # If nested graph is designated as sequential, transform schedules and
    # storage from Default to Sequential/Register
    if self.nested_seq and self.expr_index == 0:
        for node in graph.scope_subgraph(cnode).nodes():
            if isinstance(node, nodes.AccessNode):
                arr = node.desc(sdfg)
                if arr.storage == dtypes.StorageType.Default:
                    arr.storage = dtypes.StorageType.Register
            elif isinstance(node, nodes.MapEntry):
                if node.map.schedule == dtypes.ScheduleType.Default:
                    node.map.schedule = dtypes.ScheduleType.Sequential

    # Storage types that count as "already on the GPU" (no clone needed).
    gpu_storage_types = [
        dtypes.StorageType.GPU_Global,
        dtypes.StorageType.GPU_Shared,
        dtypes.StorageType.GPU_Stack,
    ]

    #######################################################
    # Add GPU copies of CPU arrays (i.e., not already on GPU)

    # First, understand which arrays to clone
    all_out_edges = []
    for enode in exit_nodes:
        all_out_edges.extend(list(graph.out_edges(enode)))
    # Sets of (access node, memlet) pairs; one clone is considered per pair.
    in_arrays_to_clone = set()
    out_arrays_to_clone = set()
    for e in graph.in_edges(cnode):
        data_node = sd.find_input_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            in_arrays_to_clone.add((data_node, e.data))
    for e in all_out_edges:
        data_node = sd.find_output_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            out_arrays_to_clone.add((data_node, e.data))
    if Config.get_bool("debugprint"):
        GPUTransformLocalStorage._arrays_removed += len(
            in_arrays_to_clone) + len(out_arrays_to_clone)

    # Second, create a GPU clone of each array
    # TODO: Overapproximate union of memlets
    cloned_arrays = {}  # original data name -> clone data name
    in_cloned_arraynodes = {}  # original data name -> clone access node
    out_cloned_arraynodes = {}
    for array_node, memlet in in_arrays_to_clone:
        array = array_node.desc(sdfg)
        cloned_name = "gpu_" + array_node.data
        # For every dimension whose over-approximated extent is exactly 1,
        # append a sanitized form of its start index to the clone name.
        for i, r in enumerate(memlet.bounding_box_size()):
            size = symbolic.overapproximate(r)
            try:
                if int(size) == 1:
                    suffix = []
                    for c in str(memlet.subset[i][0]):
                        # Keep identifier characters; map arithmetic
                        # operators to letters; drop everything else.
                        if c.isalpha() or c.isdigit() or c == "_":
                            suffix.append(c)
                        elif c == "+":
                            suffix.append("p")
                        elif c == "-":
                            suffix.append("m")
                        elif c == "*":
                            suffix.append("t")
                        elif c == "/":
                            suffix.append("d")
                    cloned_name += "_" + "".join(suffix)
            except:
                # NOTE(review): bare except -- any failure in this
                # dimension (presumably int() on a symbolic size) is
                # silently skipped; consider narrowing the exception type.
                continue
        # NOTE(review): `cloned_array` is assigned in the first two
        # branches but never read anywhere in this function.
        if cloned_name in sdfg.arrays.keys():
            cloned_array = sdfg.arrays[cloned_name]
        elif array_node.data in cloned_arrays:
            cloned_array = cloned_arrays[array_node.data]
        else:
            # Shape of the clone: concrete ints where possible, otherwise
            # the (symbolic) over-approximation itself.
            full_shape = []
            for r in memlet.bounding_box_size():
                size = symbolic.overapproximate(r)
                try:
                    full_shape.append(int(size))
                except:
                    # NOTE(review): bare except; falls back to the
                    # unconverted size on any error.
                    full_shape.append(size)
            # Dimensions that are not a literal extent of 1.
            actual_dims = [
                idx for idx, r in enumerate(full_shape)
                if not (isinstance(r, int) and r == 1)
            ]
            if len(actual_dims) == 0:  # abort
                # Every dimension has extent 1: keep only the last one.
                actual_dims = [len(full_shape) - 1]
            if isinstance(array, data.Scalar):
                sdfg.add_array(name=cloned_name,
                               shape=[1],
                               dtype=array.dtype,
                               transient=True,
                               storage=dtypes.StorageType.GPU_Global)
            elif isinstance(array, data.Stream):
                sdfg.add_stream(
                    name=cloned_name,
                    dtype=array.dtype,
                    shape=[full_shape[d] for d in actual_dims],
                    veclen=array.veclen,
                    buffer_size=array.buffer_size,
                    storage=dtypes.StorageType.GPU_Global,
                    transient=True,
                    offset=[array.offset[d] for d in actual_dims])
            else:
                sdfg.add_array(
                    name=cloned_name,
                    shape=[full_shape[d] for d in actual_dims],
                    dtype=array.dtype,
                    materialize_func=array.materialize_func,
                    transient=True,
                    storage=dtypes.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    strides=[array.strides[d] for d in actual_dims],
                    offset=[array.offset[d] for d in actual_dims],
                )
        cloned_arrays[array_node.data] = cloned_name
        # The clone's access node has the same node class as the original.
        cloned_node = type(array_node)(cloned_name)

        in_cloned_arraynodes[array_node.data] = cloned_node
    # Same procedure for outputs; the only difference is that the clone
    # access node gets `setzero = True`.
    for array_node, memlet in out_arrays_to_clone:
        array = array_node.desc(sdfg)
        cloned_name = "gpu_" + array_node.data
        for i, r in enumerate(memlet.bounding_box_size()):
            size = symbolic.overapproximate(r)
            try:
                if int(size) == 1:
                    suffix = []
                    for c in str(memlet.subset[i][0]):
                        if c.isalpha() or c.isdigit() or c == "_":
                            suffix.append(c)
                        elif c == "+":
                            suffix.append("p")
                        elif c == "-":
                            suffix.append("m")
                        elif c == "*":
                            suffix.append("t")
                        elif c == "/":
                            suffix.append("d")
                    cloned_name += "_" + "".join(suffix)
            except:
                # NOTE(review): bare except, see the input loop above.
                continue
        if cloned_name in sdfg.arrays.keys():
            cloned_array = sdfg.arrays[cloned_name]
        elif array_node.data in cloned_arrays:
            cloned_array = cloned_arrays[array_node.data]
        else:
            full_shape = []
            for r in memlet.bounding_box_size():
                size = symbolic.overapproximate(r)
                try:
                    full_shape.append(int(size))
                except:
                    full_shape.append(size)
            actual_dims = [
                idx for idx, r in enumerate(full_shape)
                if not (isinstance(r, int) and r == 1)
            ]
            if len(actual_dims) == 0:  # abort
                actual_dims = [len(full_shape) - 1]
            if isinstance(array, data.Scalar):
                sdfg.add_array(name=cloned_name,
                               shape=[1],
                               dtype=array.dtype,
                               transient=True,
                               storage=dtypes.StorageType.GPU_Global)
            elif isinstance(array, data.Stream):
                sdfg.add_stream(
                    name=cloned_name,
                    dtype=array.dtype,
                    shape=[full_shape[d] for d in actual_dims],
                    veclen=array.veclen,
                    buffer_size=array.buffer_size,
                    storage=dtypes.StorageType.GPU_Global,
                    transient=True,
                    offset=[array.offset[d] for d in actual_dims])
            else:
                sdfg.add_array(
                    name=cloned_name,
                    shape=[full_shape[d] for d in actual_dims],
                    dtype=array.dtype,
                    materialize_func=array.materialize_func,
                    transient=True,
                    storage=dtypes.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    strides=[array.strides[d] for d in actual_dims],
                    offset=[array.offset[d] for d in actual_dims],
                )
        cloned_arrays[array_node.data] = cloned_name
        cloned_node = type(array_node)(cloned_name)
        cloned_node.setzero = True

        out_cloned_arraynodes[array_node.data] = cloned_node

    # Third, connect the cloned arrays to the originals
    for array_name, node in in_cloned_arraynodes.items():
        graph.add_node(node)
        is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
        for edge in graph.in_edges(cnode):
            if edge.data.data == array_name:
                # Memlet for the new clone -> cnode edge.
                newmemlet = copy.deepcopy(edge.data)
                newmemlet.data = node.data

                if is_scalar:
                    newmemlet.subset = sbs.Indices([0])
                else:
                    # Shift the subset so it starts at 0 in every
                    # dimension; remember the shift (`offset`) and the
                    # dimensions that collapse to a single point
                    # (`lost_dims`) for the inner memlets below.
                    offset = []
                    lost_dims = []
                    lost_ranges = []
                    newsubset = [None] * len(edge.data.subset)
                    for ind, r in enumerate(edge.data.subset):
                        offset.append(r[0])
                        if isinstance(edge.data.subset[ind], tuple):
                            begin = edge.data.subset[ind][0] - r[0]
                            end = edge.data.subset[ind][1] - r[0]
                            step = edge.data.subset[ind][2]
                            if begin == end:
                                lost_dims.append(ind)
                                lost_ranges.append((begin, end, step))
                            else:
                                newsubset[ind] = (begin, end, step)
                        else:
                            # NOTE(review): newsubset[ind] is still None
                            # at this point, so this in-place subtraction
                            # would raise TypeError if the branch is ever
                            # taken -- confirm whether non-tuple entries
                            # can occur here.
                            newsubset[ind] -= r[0]
                    if len(lost_dims) == len(edge.data.subset):
                        # All dimensions collapsed: keep the last one so
                        # the subset is not empty.
                        lost_dims.pop()
                        newmemlet.subset = type(
                            edge.data.subset)([lost_ranges[-1]])
                    else:
                        newmemlet.subset = type(edge.data.subset)(
                            [r for r in newsubset if r is not None])

                graph.add_edge(node, None, edge.dst, edge.dst_conn,
                               newmemlet)

                # Walk forward from the matched node and rewrite every
                # memlet on the original array so that it addresses the
                # clone instead.
                for e in graph.bfs_edges(edge.dst, reverse=False):
                    parent, _, _child, _, memlet = e
                    # Stop once the traversal leaves the scope.
                    if parent != edge.dst and not in_scope(
                            graph, parent, edge.dst):
                        break
                    if memlet.data != edge.data.data:
                        continue
                    path = graph.memlet_path(e)
                    if not isinstance(path[-1].dst, nodes.CodeNode):
                        if in_path(path, e, nodes.ExitNode, forward=True):
                            if isinstance(parent, nodes.CodeNode):
                                # Output edge
                                break
                            else:
                                continue
                    if is_scalar:
                        memlet.subset = sbs.Indices([0])
                    else:
                        # Apply the same shift and drop the lost
                        # dimensions.
                        newsubset = [None] * len(memlet.subset)
                        for ind, r in enumerate(memlet.subset):
                            if ind in lost_dims:
                                continue
                            if isinstance(memlet.subset[ind], tuple):
                                begin = r[0] - offset[ind]
                                end = r[1] - offset[ind]
                                step = r[2]
                                newsubset[ind] = (begin, end, step)
                            else:
                                newsubset[ind] = (
                                    r - offset[ind],
                                    r - offset[ind],
                                    1,
                                )
                        memlet.subset = type(edge.data.subset)(
                            [r for r in newsubset if r is not None])
                    memlet.data = node.data

                # The original memlet now describes the
                # original -> clone copy.
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(
                        node.desc(sdfg))
                edge.data.other_subset = newmemlet.subset
                graph.add_edge(edge.src, edge.src_conn, node, None,
                               edge.data)
                graph.remove_edge(edge)

    for array_name, node in out_cloned_arraynodes.items():
        graph.add_node(node)
        is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
        for edge in all_out_edges:
            if edge.data.data == array_name:
                # Memlet for the new exit -> clone edge.
                newmemlet = copy.deepcopy(edge.data)
                newmemlet.data = node.data

                if is_scalar:
                    newmemlet.subset = sbs.Indices([0])
                else:
                    offset = []
                    lost_dims = []
                    lost_ranges = []
                    newsubset = [None] * len(edge.data.subset)
                    for ind, r in enumerate(edge.data.subset):
                        offset.append(r[0])
                        if isinstance(edge.data.subset[ind], tuple):
                            begin = edge.data.subset[ind][0] - r[0]
                            end = edge.data.subset[ind][1] - r[0]
                            step = edge.data.subset[ind][2]
                            if begin == end:
                                lost_dims.append(ind)
                                lost_ranges.append((begin, end, step))
                            else:
                                newsubset[ind] = (begin, end, step)
                        else:
                            # NOTE(review): same None-subtraction hazard
                            # as in the input loop above.
                            newsubset[ind] -= r[0]
                    if len(lost_dims) == len(edge.data.subset):
                        lost_dims.pop()
                        newmemlet.subset = type(
                            edge.data.subset)([lost_ranges[-1]])
                    else:
                        newmemlet.subset = type(edge.data.subset)(
                            [r for r in newsubset if r is not None])

                graph.add_edge(edge.src, edge.src_conn, node, None,
                               newmemlet)

                # Walk backwards from the exit node until the enclosing
                # scope's node is reached, rewriting memlets on the
                # original array.
                end_node = graph.scope_dict()[edge.src]
                for e in graph.bfs_edges(edge.src, reverse=True):
                    parent, _, _child, _, memlet = e
                    if parent == end_node:
                        break
                    if memlet.data != edge.data.data:
                        continue
                    path = graph.memlet_path(e)
                    # NOTE(review): the input loop tests path[-1].dst;
                    # this one tests path[0].dst (not .src) -- confirm
                    # this is intended.
                    if not isinstance(path[0].dst, nodes.CodeNode):
                        if in_path(path, e, nodes.EntryNode, forward=False):
                            if isinstance(parent, nodes.CodeNode):
                                # Output edge
                                break
                            else:
                                continue
                    if is_scalar:
                        memlet.subset = sbs.Indices([0])
                    else:
                        newsubset = [None] * len(memlet.subset)
                        for ind, r in enumerate(memlet.subset):
                            if ind in lost_dims:
                                continue
                            if isinstance(memlet.subset[ind], tuple):
                                begin = r[0] - offset[ind]
                                end = r[1] - offset[ind]
                                step = r[2]
                                newsubset[ind] = (begin, end, step)
                            else:
                                newsubset[ind] = (
                                    r - offset[ind],
                                    r - offset[ind],
                                    1,
                                )
                        memlet.subset = type(edge.data.subset)(
                            [r for r in newsubset if r is not None])
                    memlet.data = node.data

                # The original memlet becomes the clone -> original copy;
                # its write-conflict resolution is cleared (the deep copy
                # on the exit -> clone edge keeps it).
                edge.data.wcr = None
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(
                        node.desc(sdfg))
                edge.data.other_subset = newmemlet.subset
                graph.add_edge(node, None, edge.dst, edge.dst_conn,
                               edge.data)
                graph.remove_edge(edge)

    # Fourth, replace memlet arrays as necessary
    if self.expr_index == 0:
        scope_subgraph = graph.scope_subgraph(cnode)
        for edge in scope_subgraph.edges():
            if edge.data.data is not None and edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]
def apply(self, sdfg):
    """Offload the matched map to the FPGA.

    The map's schedule is set to ``FPGA_Device``. Every array entering the
    map entry or leaving one of its exit nodes whose storage is not
    ``FPGA_Global``, ``FPGA_Local`` or ``CPU_Pinned`` gets a transient
    ``FPGA_Global`` clone named ``fpga_<name>`` (registered only if no array
    with that name exists yet). The clone's access node is spliced between
    the original endpoint and the map, and memlets inside the map scope are
    renamed to the clone.

    The SDFG and state graph are modified in place; nothing is returned.
    """
    graph = sdfg.nodes()[self.state_id]
    map_entry = graph.nodes()[self.subgraph[FPGATransformMap._map_entry]]
    map_entry.map._schedule = dtypes.ScheduleType.FPGA_Device

    # Find map exit nodes
    exit_nodes = graph.exit_nodes(map_entry)

    # Storage types for which no FPGA clone is created.
    fpga_storage_types = [
        dtypes.StorageType.FPGA_Global,
        dtypes.StorageType.FPGA_Local,
        dtypes.StorageType.CPU_Pinned,
    ]

    #######################################################
    # Add FPGA copies of CPU arrays (i.e., not already on FPGA)

    # First, understand which arrays to clone
    all_out_edges = []
    for enode in exit_nodes:
        all_out_edges.extend(graph.out_edges(enode))

    in_arrays_to_clone = set()
    for e in graph.in_edges(map_entry):
        data_node = sd.find_input_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in fpga_storage_types:
            in_arrays_to_clone.add(data_node)

    out_arrays_to_clone = set()
    for e in all_out_edges:
        data_node = sd.find_output_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in fpga_storage_types:
            out_arrays_to_clone.add(data_node)

    # Second, create a FPGA clone of each array
    cloned_arrays = {}  # original data name -> clone data name
    in_cloned_arraynodes = {}  # original data name -> clone access node
    out_cloned_arraynodes = {}
    for array_nodes, cloned_nodes in (
        (in_arrays_to_clone, in_cloned_arraynodes),
        (out_arrays_to_clone, out_cloned_arraynodes),
    ):
        for array_node in array_nodes:
            fpga_name = 'fpga_' + array_node.data
            # Register the descriptor only once per array name.
            if (array_node.data not in cloned_arrays
                    and fpga_name not in sdfg.arrays):
                array = array_node.desc(sdfg)
                sdfg.add_array(fpga_name,
                               dtype=array.dtype,
                               shape=array.shape,
                               materialize_func=array.materialize_func,
                               transient=True,
                               storage=dtypes.StorageType.FPGA_Global,
                               allow_conflicts=array.allow_conflicts,
                               access_order=array.access_order,
                               strides=array.strides,
                               offset=array.offset)
            cloned_arrays[array_node.data] = fpga_name
            cloned_nodes[array_node.data] = nodes.AccessNode(fpga_name)

    # Third, connect the cloned arrays to the originals
    # TODO(later): Shift indices and create only the necessary sub-arrays
    for array_name, node in in_cloned_arraynodes.items():
        graph.add_node(node)
        # Iterate over a snapshot: edges are removed/added inside the loop.
        for edge in list(graph.in_edges(map_entry)):
            if edge.data.data != array_name:
                continue
            graph.remove_edge(edge)
            # original source -> clone. The source keeps its own connector;
            # the clone access node has none.
            graph.add_edge(edge.src, edge.src_conn, node, None, edge.data)
            # clone -> map entry, with the memlet renamed to the clone.
            newmemlet = copy.copy(edge.data)
            newmemlet.data = node.data
            graph.add_edge(node, None, edge.dst, edge.dst_conn, newmemlet)

    for array_name, node in out_cloned_arraynodes.items():
        graph.add_node(node)
        for edge in all_out_edges:
            if edge.data.data != array_name:
                continue
            graph.remove_edge(edge)
            # clone -> original destination. The destination keeps its own
            # connector; the clone access node has none.
            graph.add_edge(node, None, edge.dst, edge.dst_conn, edge.data)
            # map exit -> clone, with the memlet renamed to the clone.
            newmemlet = copy.copy(edge.data)
            newmemlet.data = node.data
            graph.add_edge(edge.src, edge.src_conn, node, None, newmemlet)

    # Fourth, replace memlet arrays as necessary
    scope_subgraph = graph.scope_subgraph(map_entry)
    for edge in scope_subgraph.edges():
        if edge.data.data is not None and edge.data.data in cloned_arrays:
            edge.data.data = cloned_arrays[edge.data.data]
def generate_node(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                  node: nodes.Node, function_stream: CodeIOStream,
                  callsite_stream: CodeIOStream):
    """Generate code for ``node``, wrapped in a ``{ ... }`` block.

    Emits, in order: the header, empty definitions for outputs that feed
    other tasklets (before the opening brace), input copies, output
    definitions, the tasklet body (only if ``node`` is a tasklet), and
    finally the write-backs collected while defining outputs.
    """
    self.add_header(function_stream)

    # Start with a clean connector -> (stream name, dtype) mapping.
    self.stream_associations = {}

    # Arguments shared by every dispatcher call below.
    trailing_args = (sdfg, dfg, state_id, function_stream, callsite_stream)

    # Outputs consumed by another tasklet get an empty shared register,
    # declared outside the braces.
    for out_edge in dfg.out_edges(node):
        if not isinstance(out_edge.dst, dace.nodes.Tasklet):
            continue
        self.create_empty_definition(node.out_connectors[out_edge.src_conn],
                                     out_edge,
                                     callsite_stream,
                                     is_code_code=True)

    callsite_stream.write('{')

    # Input registers. A tasklet source is passed as-is (this selects the
    # shared-register copy); otherwise the source array/stream is looked up.
    for in_edge in dfg.in_edges(node):
        source = (in_edge.src if isinstance(in_edge.src, nodes.Tasklet) else
                  find_input_arraynode(dfg, in_edge))
        self.dispatcher.dispatch_copy(source, node, in_edge, *trailing_args)

    # (edge, node) pairs that need a write-back after the tasklet body.
    pending_writebacks = []

    # Output registers.
    for out_edge in dfg.out_edges(node):
        if isinstance(out_edge.dst, nodes.Tasklet):
            # Tasklet -> tasklet: defined against the consumer, written
            # back with this node as the target.
            self.dispatcher.dispatch_output_definition(
                node, out_edge.dst, out_edge, *trailing_args)
            pending_writebacks.append((out_edge, node))
            continue

        target = find_output_arraynode(dfg, out_edge)
        target_desc = target.desc(sdfg)
        if isinstance(target_desc, dace.data.Stream):
            # Streams need neither an output register nor a write-back;
            # only the stream variable is recorded for the connector.
            self.stream_associations[out_edge.src_conn] = (target.data,
                                                           target_desc.dtype)
            continue

        self.dispatcher.dispatch_output_definition(node, target, out_edge,
                                                   *trailing_args)
        pending_writebacks.append((out_edge, target))

    # Tasklet body.
    if isinstance(node, nodes.Tasklet):
        self.unparse_tasklet(sdfg, dfg, state_id, node, function_stream,
                             callsite_stream)

    # Flush output registers to memory.
    for wb_edge, wb_target in pending_writebacks:
        self.write_back(sdfg, dfg, state_id, node, wb_target, wb_edge,
                        function_stream, callsite_stream)

    callsite_stream.write('}')
def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream,
                      callsite_stream):
    """Emit the code for a tasklet inside its own ``{ ... }`` scope.

    Steps, in order: open the scope, dispatch a copy for every connected
    input edge, dispatch a copy for every connected output edge (to
    preallocate output locals), unparse the tasklet body, process outgoing
    memlets, emit the "no dependence" post-annotation for outputs that are
    ``FPGA_Local`` / ``FPGA_Registers`` arrays without write-conflict
    resolution, and close the scope.

    Raises:
        SyntaxError: if two input edges share the same destination
            connector.
        NotImplementedError: if an input comes from, or an output goes to,
            another code node.
    """
    # TODO: this is copy-pasta from the CPU-codegen, necessary to inject
    # pragmas at the output memlets! Should consolidate.

    callsite_stream.write('{\n', sdfg, state_id, node)

    state_dfg = sdfg.nodes()[state_id]

    self._dispatcher.defined_vars.enter_scope(node)

    # Connector names already handled (inputs first, then outputs).
    arrays = set()
    for edge in dfg.in_edges(node):
        if not edge.dst_conn:  # None or "": nothing to define
            continue
        if edge.dst_conn in arrays:  # Disallow duplicates
            raise SyntaxError('Duplicates found in memlets')
        # Special case: code->code
        if isinstance(edge.src, dace.sdfg.nodes.CodeNode):
            raise NotImplementedError(
                "Tasklet to tasklet memlets not implemented")

        src_node = find_input_arraynode(state_dfg, edge)
        self._dispatcher.dispatch_copy(src_node, node, edge, sdfg,
                                       state_dfg, state_id, function_stream,
                                       callsite_stream)

        # Also define variables in the C++ unparser scope
        self._cpu_codegen._locals.define(edge.dst_conn, -1,
                                         self._cpu_codegen._ldepth + 1)
        arrays.add(edge.dst_conn)

    callsite_stream.write('\n', sdfg, state_id, node)

    # Use outgoing edges to preallocate output local vars
    for edge in dfg.out_edges(node):
        if not edge.src_conn:
            continue
        if edge.src_conn in arrays:
            # Connector already defined: skip instead of raising.
            continue
        # Special case: code->code
        if isinstance(edge.dst, dace.sdfg.nodes.CodeNode):
            raise NotImplementedError(
                "Tasklet to tasklet memlets not implemented")

        dst_node = find_output_arraynode(state_dfg, edge)
        self._dispatcher.dispatch_copy(node, dst_node, edge, sdfg,
                                       state_dfg, state_id, function_stream,
                                       callsite_stream)

        # Also define variables in the C++ unparser scope
        self._cpu_codegen._locals.define(edge.src_conn, -1,
                                         self._cpu_codegen._ldepth + 1)
        arrays.add(edge.src_conn)

    callsite_stream.write("\n////////////////////\n", sdfg, state_id, node)

    cpp.unparse_tasklet(sdfg, state_id, dfg, node, function_stream,
                        callsite_stream, self._cpu_codegen._locals,
                        self._cpu_codegen._ldepth,
                        self._cpu_codegen._toplevel_schedule, self)

    callsite_stream.write("////////////////////\n\n", sdfg, state_id, node)

    # Process outgoing memlets
    self._cpu_codegen.process_out_memlets(sdfg,
                                          state_id,
                                          node,
                                          state_dfg,
                                          self._dispatcher,
                                          callsite_stream,
                                          True,
                                          function_stream,
                                          codegen=self)

    # Outputs into FPGA-local arrays/registers without WCR get the
    # no-dependence post-annotation.
    local_storage = (dace.dtypes.StorageType.FPGA_Local,
                     dace.dtypes.StorageType.FPGA_Registers)
    for edge in state_dfg.out_edges(node):
        datadesc = sdfg.arrays[edge.data.data]
        if (isinstance(datadesc, dace.data.Array)
                and datadesc.storage in local_storage
                and edge.data.wcr is None):
            self.generate_no_dependence_post(edge.src_conn, callsite_stream,
                                             sdfg, state_id, node)

    callsite_stream.write('}\n', sdfg, state_id, node)

    self._dispatcher.defined_vars.exit_scope(node)
def apply(self, sdfg):
    """Move the matched map or reduce node to the GPU.

    With ``self.expr_index == 0`` the matched node is a map entry, otherwise
    it is a reduce node. Its schedule is set to ``GPU_Device``. Every array
    entering the node or leaving one of its exit nodes whose storage is not
    ``GPU_Global``, ``GPU_Shared`` or ``GPU_Stack`` gets a transient
    ``GPU_Global`` clone named ``gpu_<name>``. The clone's access node is
    spliced between the original endpoint and the matched node; for maps,
    memlets inside the scope are renamed to the clone.

    If ``self.fullcopy`` is set, the host<->clone memlets are widened to
    cover the whole clone array.

    The SDFG and state graph are modified in place; nothing is returned.
    """
    graph = sdfg.nodes()[self.state_id]
    if self.expr_index == 0:
        cnode = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]]
        node_schedprop = cnode.map
        exit_nodes = graph.exit_nodes(cnode)
    else:
        cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
        node_schedprop = cnode
        exit_nodes = [cnode]

    # Change schedule
    node_schedprop._schedule = types.ScheduleType.GPU_Device

    # Storage types for which no GPU clone is created. CPU_Pinned is not in
    # this list, so pinned host arrays are cloned as well.
    gpu_storage_types = [
        types.StorageType.GPU_Global,
        types.StorageType.GPU_Shared,
        types.StorageType.GPU_Stack,
    ]

    #######################################################
    # Add GPU copies of CPU arrays (i.e., not already on GPU)

    # First, understand which arrays to clone
    all_out_edges = []
    for enode in exit_nodes:
        all_out_edges.extend(graph.out_edges(enode))

    in_arrays_to_clone = set()
    for e in graph.in_edges(cnode):
        data_node = sd.find_input_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            in_arrays_to_clone.add(data_node)

    out_arrays_to_clone = set()
    for e in all_out_edges:
        data_node = sd.find_output_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            out_arrays_to_clone.add(data_node)

    # Second, create a GPU clone of each array
    cloned_arrays = {}  # original data name -> clone data name
    in_cloned_arraynodes = {}  # original data name -> clone access node
    out_cloned_arraynodes = {}
    for array_nodes, cloned_nodes in (
        (in_arrays_to_clone, in_cloned_arraynodes),
        (out_arrays_to_clone, out_cloned_arraynodes),
    ):
        for array_node in array_nodes:
            gpu_name = 'gpu_' + array_node.data
            # Register the descriptor only the first time a name is seen.
            if array_node.data not in cloned_arrays:
                array = array_node.desc(sdfg)
                sdfg.add_array(gpu_name,
                               array.shape,
                               array.dtype,
                               materialize_func=array.materialize_func,
                               transient=True,
                               storage=types.StorageType.GPU_Global,
                               allow_conflicts=array.allow_conflicts,
                               access_order=array.access_order,
                               strides=array.strides,
                               offset=array.offset)
            cloned_arrays[array_node.data] = gpu_name
            # The clone's access node has the same class as the original.
            cloned_nodes[array_node.data] = type(array_node)(gpu_name)

    # Third, connect the cloned arrays to the originals
    # TODO(later): Shift indices and create only the necessary sub-arrays
    for array_name, node in in_cloned_arraynodes.items():
        graph.add_node(node)
        # Iterate over a snapshot: edges are removed/added inside the loop.
        for edge in list(graph.in_edges(cnode)):
            if edge.data.data != array_name:
                continue
            graph.remove_edge(edge)
            # clone -> matched node, memlet renamed to the clone. The clone
            # access node carries no connector.
            newmemlet = copy.copy(edge.data)
            newmemlet.data = node.data
            graph.add_edge(node, None, edge.dst, edge.dst_conn, newmemlet)
            if self.fullcopy:
                edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = edge.data.subset
            # original source -> clone; the source keeps its connector.
            graph.add_edge(edge.src, edge.src_conn, node, None, edge.data)

    for array_name, node in out_cloned_arraynodes.items():
        graph.add_node(node)
        for edge in all_out_edges:
            if edge.data.data != array_name:
                continue
            graph.remove_edge(edge)
            # exit node -> clone, memlet renamed to the clone.
            newmemlet = copy.copy(edge.data)
            newmemlet.data = node.data
            graph.add_edge(edge.src, edge.src_conn, node, None, newmemlet)
            # The clone -> original copy carries no write-conflict
            # resolution (the shallow copy above keeps it).
            edge.data.wcr = None
            if self.fullcopy:
                edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = edge.data.subset
            # clone -> original destination; the destination keeps its
            # connector.
            graph.add_edge(node, None, edge.dst, edge.dst_conn, edge.data)

    # Fourth, replace memlet arrays as necessary
    if self.expr_index == 0:
        scope_subgraph = graph.scope_subgraph(cnode)
        for edge in scope_subgraph.edges():
            if (edge.data.data is not None
                    and edge.data.data in cloned_arrays):
                edge.data.data = cloned_arrays[edge.data.data]
def apply(self, sdfg):
    """Move the matched map or reduce node to the GPU.

    With ``self.expr_index == 0`` the matched node is a map entry, otherwise
    it is a reduce node. Its schedule is set to ``GPU_Device``. Every
    non-scalar array entering the node or leaving one of its exit nodes
    whose storage is not ``GPU_Global``, ``GPU_Shared`` or ``GPU_Stack``
    gets a transient ``GPU_Global`` clone named ``gpu_<name>``. The clone's
    access node is spliced between the original endpoint and the matched
    node; for maps, memlets inside the scope are renamed to the clone.

    Output streams that are array-stream views are not cloned themselves:
    their storage is switched to ``GPU_Global`` and the array they feed is
    cloned instead.

    If ``self.fullcopy`` is set, the host<->clone memlets are widened to
    cover the whole clone array.

    The SDFG and state graph are modified in place; nothing is returned.

    Raises:
        TypeError: if an output array-stream view is not transient.
    """
    graph = sdfg.nodes()[self.state_id]
    if self.expr_index == 0:
        cnode = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]]
        node_schedprop = cnode.map
        exit_nodes = graph.exit_nodes(cnode)
    else:
        cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
        node_schedprop = cnode
        exit_nodes = [cnode]

    # Change schedule
    node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
    if Config.get_bool("debugprint"):
        GPUTransformMap._maps_transformed += 1

    # Storage types for which no GPU clone is created. CPU_Pinned is not in
    # this list, so pinned host arrays are cloned as well.
    gpu_storage_types = [
        dtypes.StorageType.GPU_Global,
        dtypes.StorageType.GPU_Shared,
        dtypes.StorageType.GPU_Stack,
    ]

    #######################################################
    # Add GPU copies of CPU arrays (i.e., not already on GPU)

    # First, understand which arrays to clone
    all_out_edges = []
    for enode in exit_nodes:
        all_out_edges.extend(graph.out_edges(enode))

    in_arrays_to_clone = set()
    for e in graph.in_edges(cnode):
        data_node = sd.find_input_arraynode(graph, e)
        datadesc = data_node.desc(sdfg)
        if isinstance(datadesc, data.Scalar):
            continue
        if datadesc.storage not in gpu_storage_types:
            in_arrays_to_clone.add(data_node)

    out_arrays_to_clone = set()
    out_streamarrays = {}  # fed array access node -> stream access node
    for e in all_out_edges:
        data_node = sd.find_output_arraynode(graph, e)
        datadesc = data_node.desc(sdfg)
        if isinstance(datadesc, data.Scalar):
            continue
        if datadesc.storage in gpu_storage_types:
            continue

        # Stream directly connected to an array
        if sd.is_array_stream_view(sdfg, graph, data_node):
            if datadesc.transient is False:
                raise TypeError('Non-transient stream-array view are '
                                'unsupported')
            # Clone the array the stream feeds, not the stream itself.
            parent_array = graph.out_edges(data_node)[0].dst
            out_arrays_to_clone.add(parent_array)
            out_streamarrays[parent_array] = data_node
            continue

        out_arrays_to_clone.add(data_node)

    if Config.get_bool("debugprint"):
        GPUTransformMap._arrays_removed += len(in_arrays_to_clone) + len(
            out_arrays_to_clone)

    # Second, create a GPU clone of each array
    cloned_arrays = {}  # original data name -> clone data name
    in_cloned_arraynodes = {}  # original data name -> clone access node
    out_cloned_arraynodes = {}
    for array_nodes, cloned_nodes in (
        (in_arrays_to_clone, in_cloned_arraynodes),
        (out_arrays_to_clone, out_cloned_arraynodes),
    ):
        for array_node in array_nodes:
            gpu_name = 'gpu_' + array_node.data
            # Register the descriptor only the first time a name is seen.
            if array_node.data not in cloned_arrays:
                cloned_array = array_node.desc(sdfg).clone()
                cloned_array.storage = dtypes.StorageType.GPU_Global
                cloned_array.transient = True
                sdfg.add_datadesc(gpu_name, cloned_array)
                cloned_arrays[array_node.data] = gpu_name
            # The clone's access node has the same class as the original.
            cloned_nodes[array_node.data] = type(array_node)(gpu_name)

    # Third, connect the cloned arrays to the originals
    # TODO(later): Shift indices and create only the necessary sub-arrays
    for array_name, node in in_cloned_arraynodes.items():
        graph.add_node(node)
        # Iterate over a snapshot: edges are removed/added inside the loop.
        for edge in list(graph.in_edges(cnode)):
            if edge.data.data != array_name:
                continue
            graph.remove_edge(edge)
            # clone -> matched node, memlet renamed to the clone. The clone
            # access node carries no connector.
            newmemlet = copy.copy(edge.data)
            newmemlet.data = node.data
            graph.add_edge(node, None, edge.dst, edge.dst_conn, newmemlet)
            if self.fullcopy:
                edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = edge.data.subset
            # original source -> clone; the source keeps its connector.
            graph.add_edge(edge.src, edge.src_conn, node, None, edge.data)

    for array_name, node in out_cloned_arraynodes.items():
        graph.add_node(node)
        for edge in all_out_edges:
            if edge.data.data != array_name:
                continue
            graph.remove_edge(edge)
            # exit node -> clone, memlet renamed to the clone.
            newmemlet = copy.copy(edge.data)
            newmemlet.data = node.data
            graph.add_edge(edge.src, edge.src_conn, node, None, newmemlet)
            # The clone -> original copy carries no write-conflict
            # resolution (the shallow copy above keeps it).
            edge.data.wcr = None
            if self.fullcopy:
                edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = edge.data.subset
            # clone -> original destination; the destination keeps its
            # connector.
            graph.add_edge(node, None, edge.dst, edge.dst_conn, edge.data)

    # Reconnect stream-arrays
    for array_node, streamnode in out_streamarrays.items():
        # Set stream storage to GPU
        streamnode.desc(sdfg).storage = dtypes.StorageType.GPU_Global

        cloned_node = out_cloned_arraynodes[array_node.data]

        e = graph.out_edges(streamnode)[0]
        graph.remove_edge(e)
        newmemlet = copy.copy(e.data)
        newmemlet.data = cloned_node.data
        # stream -> cloned array
        graph.add_edge(e.src, e.src_conn, cloned_node, e.dst_conn,
                       newmemlet)
        # cloned array -> array
        graph.add_nedge(cloned_node, array_node, e.data)

    # Fourth, replace memlet arrays as necessary
    if self.expr_index == 0:
        scope_subgraph = graph.scope_subgraph(cnode)
        for edge in scope_subgraph.edges():
            if (edge.data.data is not None
                    and edge.data.data in cloned_arrays):
                edge.data.data = cloned_arrays[edge.data.data]