def pre_evaluate(self, cutout: dace.SDFG, measurements: int, **kwargs) -> Dict:
    cutout.start_state.instrument = self.instrument

    map_entry = None
    for node in cutout.start_state.nodes():
        if isinstance(node, dace.nodes.MapEntry) and xfh.get_parent_map(cutout.start_state, node) is None:
            map_entry = node
            break
    assert map_entry is not None

    new_kwargs = {
        "space_kwargs": {
            "map_entry": map_entry
        },
        "cutout": cutout.to_json(),
        "map_entry_id": cutout.start_state.node_id(map_entry),
        "measurements": measurements,
        "key": lambda point: "None" if point is None else ".".join(map(lambda p: str(p), point))
    }
    return new_kwargs
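# Hedged illustration (not part of the original tuner): the "key" callback constructed in
# pre_evaluate serializes a search-space point such as (2, 0, 1) into the string "2.0.1"
# and maps None to "None".
_example_key = lambda point: "None" if point is None else ".".join(map(str, point))
assert _example_key((2, 0, 1)) == "2.0.1"
assert _example_key(None) == "None"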
def cutouts(self) -> Generator[Tuple[dace.SDFG, str], None, None]:
    for node, state in self._sdfg.all_nodes_recursive():
        if isinstance(node, dace.nodes.MapEntry):
            # Skip nested maps; only top-level map scopes become cutouts.
            if xfh.get_parent_map(state, node) is not None:
                continue

            node_id = state.node_id(node)
            state_id = self._sdfg.node_id(state)

            # Cut out the entire scope of the map and label it "state_id.node_id.map_label".
            subgraph_nodes = state.scope_subgraph(node).nodes()
            cutout = cutter.cutout_state(state, *subgraph_nodes)
            yield cutout, f"{state_id}.{node_id}.{node.label}"
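def list_cutout_labels(tuner):
    # Hedged helper sketch (not in the original source): `tuner` is assumed to be an
    # instance of the class defining cutouts() above. This simply materializes the
    # "state_id.node_id.map_label" identifiers yielded for every top-level map cutout.
    return [label for _, label in tuner.cutouts()]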
def fpga_update(sdfg, state, depth):
    scope_dict = state.scope_dict()
    for node in state.nodes():
        if (isinstance(node, nodes.AccessNode)
                and node.desc(sdfg).storage == dtypes.StorageType.Default):
            nodedesc = node.desc(sdfg)
            pmap = xfh.get_parent_map(state, node)
            # Data in deeply nested SDFGs or inside an FPGA device map lives in local
            # memory; otherwise the surrounding scope decides between local and global.
            if depth >= 2 or (pmap is not None and pmap[0].schedule == dtypes.ScheduleType.FPGA_Device):
                nodedesc.storage = dtypes.StorageType.FPGA_Local
            else:
                if scope_dict[node]:
                    nodedesc.storage = dtypes.StorageType.FPGA_Local
                else:
                    nodedesc.storage = dtypes.StorageType.FPGA_Global
        if (hasattr(node, "schedule")
                and node.schedule == dace.dtypes.ScheduleType.Default):
            node.schedule = dace.dtypes.ScheduleType.FPGA_Device
        if isinstance(node, nodes.NestedSDFG):
            # Recurse into nested SDFGs with an increased nesting depth.
            for s in node.sdfg.nodes():
                fpga_update(node.sdfg, s, depth + 1)
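def fpga_update_sdfg(sdfg):
    # Hedged convenience wrapper (an assumption, not in the original source): applies
    # fpga_update to every top-level state of an SDFG, starting the nesting depth at 0.
    for state in sdfg.nodes():
        fpga_update(sdfg, state, 0)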
def transfer(sdfg: dace.SDFG, tuner, k: int = 5):
    assert isinstance(tuner, OnTheFlyMapFusionTuner)

    dreport = sdfg.get_instrumented_data()
    assert dreport is not None

    # Extract the k best fusion configurations from the tuning report and turn them
    # into subgraph patterns (counts of map descriptors).
    tuning_report = tuner.optimize(apply=False)
    best_configs = cutout_tuner.CutoutTuner.top_k_configs(tuning_report, k=k)
    subgraph_patterns = tuner._extract_patterns(best_configs)

    i = 0
    for nsdfg in sdfg.all_sdfgs_recursive():
        for state in nsdfg.states():
            i = i + 1

            top_maps = []
            for node in state.nodes():
                if isinstance(node, dace.nodes.MapEntry) and xfh.get_parent_map(state, node) is None:
                    top_maps.append(node)

            if len(top_maps) < 2:
                continue

            # Probe whether the state can be cut out at all.
            try:
                cutout = cutter.cutout_state(state, *(state.nodes()), make_copy=False)
            except AttributeError:
                continue

            while True:
                base_runtime = None
                best_pattern = None
                best_pattern_runtime = math.inf
                for j, pattern in enumerate(subgraph_patterns):
                    maps = []
                    for node in state.nodes():
                        if isinstance(node, dace.nodes.MapEntry) and xfh.get_parent_map(state, node) is None:
                            maps.append(node)

                    if len(maps) < 2:
                        continue

                    # Group the state's top-level maps by descriptor and count them.
                    maps_desc = {}
                    state_desc = Counter()
                    for map_entry in maps:
                        map_desc = OnTheFlyMapFusionTuner.map_descriptor(state, map_entry)
                        state_desc.update({map_desc: 1})

                        if map_desc not in maps_desc:
                            maps_desc[map_desc] = []

                        maps_desc[map_desc].append(map_entry)

                    # The pattern applies only if the state contains at least as many maps
                    # of each descriptor as the pattern requires.
                    included = True
                    for key in pattern:
                        if key not in state_desc or pattern[key] > state_desc[key]:
                            included = False
                            break

                    if not included:
                        continue

                    # Measure the un-fused baseline lazily, once per while-iteration.
                    if base_runtime is None:
                        baseline = cutter.cutout_state(state, *(state.nodes()), make_copy=False)
                        baseline.start_state.instrument = dace.InstrumentationType.GPU_Events

                        dreport_ = {}
                        for cstate in baseline.nodes():
                            for dnode in cstate.data_nodes():
                                array = baseline.arrays[dnode.data]
                                if array.transient:
                                    continue
                                try:
                                    data = dreport.get_first_version(dnode.data)
                                    dreport_[dnode.data] = data
                                except:
                                    continue

                        base_runtime = optim_utils.subprocess_measure(baseline, dreport_, i=192, j=192)
                        best_pattern_runtime = base_runtime
                        if base_runtime == math.inf:
                            break

                    # Construct subgraph greedily
                    subgraph_maps = []
                    for desc in pattern:
                        num = pattern[desc]
                        subgraph_maps.extend(maps_desc[desc][:num])

                    # Apply the fusion on a copy of the cutout and measure it.
                    experiment_sdfg_ = cutter.cutout_state(state, *(state.nodes()), make_copy=False)
                    experiment_state_ = experiment_sdfg_.start_state
                    experiment_maps_ids = list(map(lambda me: experiment_state_.node_id(me), subgraph_maps))
                    experiment_sdfg = copy.deepcopy(experiment_sdfg_)
                    experiment_state = experiment_sdfg.start_state
                    experiment_state.instrument = dace.InstrumentationType.GPU_Events

                    experiment_maps = list(map(lambda m_id: experiment_state.node(m_id), experiment_maps_ids))
                    experiment_subgraph = helpers.subgraph_from_maps(sdfg=experiment_sdfg,
                                                                     graph=experiment_state,
                                                                     map_entries=experiment_maps)

                    map_fusion = sg.SubgraphOTFFusion()
                    map_fusion.setup_match(experiment_subgraph, experiment_sdfg.sdfg_id,
                                           experiment_sdfg.node_id(experiment_state))
                    if map_fusion.can_be_applied(experiment_state, experiment_sdfg):
                        try:
                            experiment_fuse_counter = map_fusion.apply(experiment_state, experiment_sdfg)
                        except:
                            continue

                        if experiment_fuse_counter == 0:
                            continue

                        dreport_ = {}
                        for cstate in experiment_sdfg.nodes():
                            for dnode in cstate.data_nodes():
                                array = experiment_sdfg.arrays[dnode.data]
                                if array.transient:
                                    continue
                                try:
                                    data = dreport.get_first_version(dnode.data)
                                    dreport_[dnode.data] = data
                                except:
                                    continue

                        fused_runtime = optim_utils.subprocess_measure(experiment_sdfg, dreport_, i=192, j=192)
                        if fused_runtime >= best_pattern_runtime:
                            continue

                        best_pattern = subgraph_maps
                        best_pattern_runtime = fused_runtime

                # If any pattern improved over the baseline, fuse it in the actual SDFG
                # and repeat; otherwise stop for this state.
                if best_pattern is not None:
                    subgraph = helpers.subgraph_from_maps(sdfg=nsdfg, graph=state, map_entries=best_pattern)
                    map_fusion = sg.SubgraphOTFFusion()
                    map_fusion.setup_match(subgraph, nsdfg.sdfg_id, nsdfg.node_id(state))
                    actual_fuse_counter = map_fusion.apply(state, nsdfg)

                    best_pattern = None
                    base_runtime = None
                    best_pattern_runtime = math.inf
                else:
                    break
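# Hedged driver sketch for transfer() (the names `my_sdfg` and `tuner` are assumptions,
# not part of the original source): `my_sdfg` must already carry an instrumented data
# report (sdfg.get_instrumented_data() returns non-None) and `tuner` must be an
# OnTheFlyMapFusionTuner constructed elsewhere for the same SDFG; the call then
# re-applies the tuner's top-k fusion patterns directly on `my_sdfg`.
# transfer(my_sdfg, tuner, k=5)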
def make_transients_persistent(sdfg: SDFG, device: dtypes.DeviceType, toplevel_only: bool = True) -> None:
    '''
    Helper function to change several storage and scheduling properties:
    - Makes non-view array lifetimes persistent, with some restrictions depending on the device
    - Resets nonatomic WCR edges on GPU

    The only arrays that are made persistent by default are ones that do not exist inside a scope
    (arrays inside scopes may be allocated multiple times), and whose symbols are always given as
    parameters to the SDFG (so that they can be allocated in a persistent manner).

    :param sdfg: SDFG
    :param device: Device type
    :param toplevel_only: If True, only converts access nodes that do not appear in any scope.
    '''
    for nsdfg in sdfg.all_sdfgs_recursive():
        fsyms: Set[str] = nsdfg.free_symbols
        persistent: Set[str] = set()
        not_persistent: Set[str] = set()

        for state in nsdfg.nodes():
            for dnode in state.data_nodes():
                if dnode.data in not_persistent:
                    continue

                desc = dnode.desc(nsdfg)
                # Only convert arrays and scalars that are not registers
                if not desc.transient or type(desc) not in {dt.Array, dt.Scalar}:
                    not_persistent.add(dnode.data)
                    continue

                if desc.storage == dtypes.StorageType.Register:
                    not_persistent.add(dnode.data)
                    continue

                # Only convert arrays where the size depends on SDFG parameters
                try:
                    if set(map(str, desc.total_size.free_symbols)) - fsyms:
                        not_persistent.add(dnode.data)
                        continue
                except AttributeError:  # total_size is an integer / has no free symbols
                    pass

                # Only convert arrays with top-level access nodes
                if xfh.get_parent_map(state, dnode) is not None:
                    if toplevel_only:
                        not_persistent.add(dnode.data)
                        continue
                    elif desc.lifetime == dtypes.AllocationLifetime.Scope:
                        not_persistent.add(dnode.data)
                        continue

                persistent.add(dnode.data)

        for aname in (persistent - not_persistent):
            nsdfg.arrays[aname].lifetime = dtypes.AllocationLifetime.Persistent

    if device == dtypes.DeviceType.GPU:
        # Reset nonatomic WCR edges
        for n, _ in sdfg.all_nodes_recursive():
            if isinstance(n, SDFGState):
                for edge in n.edges():
                    edge.data.wcr_nonatomic = False
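# Hedged usage sketch (assumes dace is installed; the program and names below are
# illustrative only, not part of the original source): build a tiny SDFG containing one
# transient and make its transients persistent for CPU execution.
import dace

N = dace.symbol('N')

@dace.program
def _persistence_example(a: dace.float64[N], b: dace.float64[N]):
    tmp = a + 1.0   # becomes a transient array in the resulting SDFG
    b[:] = tmp * 2.0

example_sdfg = _persistence_example.to_sdfg()
make_transients_persistent(example_sdfg, dace.dtypes.DeviceType.CPU)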