def callback(self, cgroups, prefix):
    """
    Fuse the input ClusterGroups as much as possible, optionally after a
    topological sort that brings fusible candidates next to each other.
    """
    # A topological sort maximizes the number of fusion opportunities
    if self.toposort:
        ordered = self._toposort(cgroups, prefix)
    else:
        ordered = ClusterGroup(cgroups)

    # Fusion: adjacent Clusters sharing the same key are merged into one
    fused = []
    for _, group in groupby(ordered, key=self._key):
        candidates = list(group)
        if len(candidates) > 1:
            try:
                fused.append(Cluster.from_clusters(*candidates))
                continue
            except ValueError:
                # We end up here if, for example, some Clusters have same
                # iteration Dimensions but different (partial) orderings;
                # fall through and keep them separate
                pass
        fused.extend(candidates)

    return [ClusterGroup(fused, prefix)]
def _toposort(self, cgroups, prefix):
    """
    Topologically sort `cgroups` so as to maximize the number of fusible
    ClusterGroups, returning the result as a ClusterGroup.
    """
    # Are there any ClusterGroups that could potentially be fused? If
    # not, do not waste time computing a new topological ordering
    counter = Counter(self._key(cg) for cg in cgroups)
    if not any(v > 1 for it, v in counter.most_common()):
        return ClusterGroup(cgroups)

    # Similarly, if all ClusterGroups have the same exact prefix and
    # use the same form of synchronization (if any at all), no need to
    # attempt a topological sorting
    if len(counter.most_common()) == 1:
        return ClusterGroup(cgroups)

    dag = self._build_dag(cgroups, prefix)

    def choose_element(queue, scheduled):
        # Heuristic: let `k0` be the key of the last scheduled node; then out of
        # the possible schedulable nodes we pick the one with key `k1` such that
        # `max_i : k0[:i] == k1[:i]` (i.e., the one with "the most similar key")
        if not scheduled:
            return queue.pop()
        key = self._key(scheduled[-1])
        for i in reversed(range(len(key) + 1)):
            candidates = [e for e in queue if self._key(e)[:i] == key[:i]]
            try:
                # Ensure stability: among equally-similar candidates, pick the
                # one appearing first in the input. NOTE: use `cgroups.index`
                # directly as the key function -- the original
                # `lambda i: cgroups.index(i)` shadowed the loop index `i`
                e = min(candidates, key=cgroups.index)
            except ValueError:
                # No candidate shares a key prefix of length `i`; try shorter
                continue
            queue.remove(e)
            return e
        # Unreachable: at i == 0 every queued element is a candidate
        assert False

    return ClusterGroup(dag.topological_sort(choose_element))
def _toposort(self, cgroups, prefix):
    """
    Topologically sort `cgroups` to place ClusterGroups with identical
    iteration intervals next to each other, thus maximizing the chances of
    a later Cluster fusion. Returns a ClusterGroup.
    """
    # Are there any ClusterGroups that could potentially be fused? If
    # not, do not waste time computing a new topological ordering
    counter = Counter(cg.itintervals for cg in cgroups)
    if not any(v > 1 for it, v in counter.most_common()):
        return ClusterGroup(cgroups)

    # Similarly, if all ClusterGroups have the same exact prefix, no
    # need to attempt a topological sorting
    if len(counter.most_common()) == 1:
        return ClusterGroup(cgroups)

    dag = self._build_dag(cgroups, prefix)

    def choose_element(queue, scheduled):
        # Heuristic: prefer a node having same IterationSpace as that
        # of the last scheduled node to maximize Cluster fusion
        if not scheduled:
            return queue.pop()
        last = scheduled[-1]
        # Scan a snapshot of the queue, since we mutate it on a hit
        for i in list(queue):
            if i.itintervals == last.itintervals:
                queue.remove(i)
                return i
        # No fusible node available; fall back to FIFO order.
        # NOTE(review): the no-`scheduled` branch pops from the *right*
        # (`pop`) while this fallback pops from the *left* (`popleft`) --
        # looks intentional but worth confirming against `topological_sort`
        return queue.popleft()

    return ClusterGroup(dag.topological_sort(choose_element))
def rewrite(clusters, mode='advanced'):
    """
    Transform a sequence of N Clusters into a sequence of M Clusters, M >= N,
    carrying a reduced operation count.

    Parameters
    ----------
    clusters : list of Cluster
        The Clusters to be transformed.
    mode : str, optional
        The aggressiveness of the rewrite. Accepted:

        - ``noop``: Do nothing.
        - ``basic``: Apply common sub-expressions elimination.
        - ``advanced``: Apply all transformations that will reduce the
          operation count w/ minimum increase to the memory pressure,
          namely 'basic', factorization, and cross-iteration redundancy
          elimination ("CIRE") for time-invariants only.
        - ``aggressive``: Like 'advanced', but apply CIRE to time-varying
          sub-expressions too. Further, seek and drop cross-cluster
          redundancies (this is the only pass that attempts to optimize
          *across* Clusters, rather than within a Cluster). The 'aggressive'
          mode may substantially increase the symbolic processing time; it
          may or may not reduce the JIT-compilation time; it may or may not
          improve the overall runtime performance.
    """
    if not (mode is None or isinstance(mode, str)):
        raise ValueError("Parameter 'mode' should be a string, not %s." % type(mode))

    if mode in (None, 'noop'):
        return clusters

    # Dense and sparse Clusters go through different rewriters; the non-affine
    # index functions of sparse Clusters make it basically impossible, in
    # general, to apply the more advanced DSE passes.
    # NOTE: the fallback (sparse) rewriter shares the dense rewriter's
    # template, so that temporaries remain globally unique
    try:
        rewriter = modes[mode]()
    except KeyError:
        rewriter = CustomRewriter(mode)
    fallback = BasicRewriter(False, rewriter.template)

    states = []
    for c in clusters:
        engine = rewriter if c.is_dense else fallback
        states.append(engine.run(c))

    # Emit profiling information
    print_profiling(states)

    # Schedule and optimize the rewriter-produced Clusters
    retval = ClusterGroup(optimize(flatten(i.clusters for i in states)))

    # Demote unnecessary temporary Arrays to scalars
    retval = scalarize(retval, rewriter.template)

    return ClusterGroup(retval)
def rewrite(clusters, mode='advanced'):
    """
    Transform N :class:`Cluster` objects of SymPy expressions into M
    :class:`Cluster` objects of SymPy expressions with reduced operation count,
    with M >= N.

    :param clusters: The clusters to be transformed.
    :param mode: drive the expression transformation

    The ``mode`` parameter recognises the following values: ::

         * 'noop': Do nothing.
         * 'basic': Apply common sub-expressions elimination.
         * 'advanced': Apply all transformations that will reduce the
                       operation count w/ minimum increase to the memory pressure,
                       namely 'basic', factorization, CSRE for time-invariants
                       only.
         * 'speculative': Like 'advanced', but apply CSRE also to time-varying
                          sub-expressions, which might further increase the
                          memory pressure.
         * 'aggressive': Like 'speculative', but apply CSRE to any non-trivial
                         sub-expression (i.e., anything that is at least in a
                         sum-of-products form). This may substantially increase
                         the memory pressure.
    """
    if not (mode is None or isinstance(mode, str)):
        raise ValueError("Parameter 'mode' should be a string, not %s." % type(mode))

    if mode is None or mode == 'noop':
        return clusters

    processed = ClusterGroup()
    for cluster in clusters:
        if cluster.is_dense:
            if mode in modes:
                processed.extend(modes[mode]().run(cluster))
            else:
                try:
                    # Fix: forward `mode` to the CustomRewriter -- previously
                    # it was dropped, so the custom rewriter could not know
                    # which passes to apply (and a resulting TypeError would
                    # have escaped the `except DSEException` below)
                    processed.extend(CustomRewriter(mode).run(cluster))
                except DSEException:
                    dse_warning("Unknown rewrite mode(s) %s" % mode)
                    processed.append(cluster)
        else:
            # Downgrade sparse clusters to basic rewrite mode since it's
            # pointless to expose loop-redundancies when the iteration space
            # only consists of a few points
            processed.extend(BasicRewriter(False).run(cluster))

    return groupby(processed)
def _lower_clusters(cls, expressions, profiler, **kwargs):
    """
    Clusters lowering:

        * Group expressions into Clusters;
        * Introduce guards for conditional Clusters.
    """
    # Turn the input Eqs into Clusters, then hand them to the
    # backend-specific specializer
    return ClusterGroup(cls._specialize_clusters(clusterize(expressions),
                                                 profiler=profiler, **kwargs))
def rewrite(clusters, mode='advanced'):
    """
    Transform N :class:`Cluster` objects of SymPy expressions into M
    :class:`Cluster` objects of SymPy expressions with reduced operation count,
    with M >= N.

    :param clusters: The clusters to be transformed.
    :param mode: drive the expression transformation

    The ``mode`` parameter recognises the following values: ::

         * 'noop': Do nothing.
         * 'basic': Apply common sub-expressions elimination.
         * 'advanced': Apply all transformations that will reduce the
                       operation count w/ minimum increase to the memory pressure,
                       namely 'basic', factorization, CIRE for time-invariants
                       only.
         * 'speculative': Like 'advanced', but apply CIRE also to time-varying
                          sub-expressions, which might further increase the
                          memory pressure.
         * 'aggressive': Like 'speculative', but apply CIRE to any non-trivial
                         sub-expression (i.e., anything that is at least in a
                         sum-of-products form). This may substantially increase
                         the memory pressure.
    """
    if not (mode is None or isinstance(mode, str)):
        raise ValueError("Parameter 'mode' should be a string, not %s." % type(mode))

    if mode is None or mode == 'noop':
        return clusters
    if mode not in modes:
        dse_warning("Unknown rewrite mode(s) %s" % mode)
        return clusters

    # Dense and sparse clusters take separate rewriters; sparse clusters have
    # non-affine index functions, thus making it basically impossible, in
    # general, to apply the more advanced DSE passes.
    # Note: the sparse rewriter uses the same template for temporaries as
    # the dense rewriter, thus temporaries are globally unique
    rewriter = modes[mode]()
    fallback = BasicRewriter(False, rewriter.template)

    rewritten = []
    for c in clusters:
        engine = rewriter if c.is_dense else fallback
        rewritten.extend(engine.run(c))

    return groupby(ClusterGroup(rewritten)).finalize()
def _lower_clusters(cls, expressions, profiler, **kwargs):
    """
    Clusters lowering:

        * Group expressions into Clusters;
        * Introduce guards for conditional Clusters;
        * Analyze Clusters to detect computational properties such
          as parallelism.
    """
    def count_ops(cs):
        # Flop count over the dense Clusters only
        return sum(estimate_cost(c.exprs) for c in cs if c.is_dense)

    # Build a sequence of Clusters from a sequence of Eqs
    clusters = clusterize(expressions)

    # Track how specialization impacts the operation count
    init_ops = count_ops(clusters)
    clusters = cls._specialize_clusters(clusters, **kwargs)
    profiler.record_ops_variation(init_ops, count_ops(clusters))

    return ClusterGroup(clusters)
def rewrite(clusters, mode='advanced'):
    """
    Given a sequence of N Clusters, produce a sequence of M Clusters with
    reduced operation count, with M >= N.

    Parameters
    ----------
    clusters : list of Cluster
        The Clusters to be transformed.
    mode : str, optional
        The aggressiveness of the rewrite. Accepted:

        - ``noop``: Do nothing.
        - ``basic``: Apply common sub-expressions elimination.
        - ``advanced``: Apply all transformations that will reduce the
          operation count w/ minimum increase to the memory pressure,
          namely 'basic', factorization, CIRE for time-invariants only.
        - ``speculative``: Like 'advanced', but apply CIRE also to time-varying
          sub-expressions, which might further increase the memory pressure.
        - ``aggressive``: Like 'speculative', but apply CIRE to any non-trivial
          sub-expression (i.e., anything that is at least in a sum-of-product
          form). Further, seek and drop cross-cluster redundancies (this is
          the only pass that attempts to optimize *across* clusters, rather
          than within a cluster). The 'aggressive' mode may substantially
          increase the symbolic processing time; it may or may not reduce the
          JIT-compilation time; it may or may not improve the overall runtime
          performance.
    """
    if not (mode is None or isinstance(mode, str)):
        raise ValueError("Parameter 'mode' should be a string, not %s." % type(mode))

    if mode is None or mode == 'noop':
        return clusters
    elif mode not in modes:
        # Fix: validate against `modes` -- the mapping actually indexed
        # below -- rather than `dse_registry`, which could drift out of
        # sync and let an unknown mode through to a KeyError
        dse_warning("Unknown rewrite mode(s) %s" % mode)
        return clusters

    # 1) Local optimization
    # ---------------------
    # We use separate rewriters for dense and sparse clusters; sparse clusters
    # have non-affine index functions, thus making it basically impossible, in
    # general, to apply the more advanced DSE passes.
    # Note: the sparse rewriter uses the same template for temporaries as
    # the dense rewriter, thus temporaries are globally unique
    rewriter = modes[mode]()
    fallback = BasicRewriter(False, rewriter.template)
    processed = ClusterGroup(flatten(rewriter.run(c) if c.is_dense else fallback.run(c)
                                     for c in clusters))

    # 2) Cluster grouping
    # -------------------
    # Different clusters may have created new (smaller) clusters which are
    # potentially groupable within a single cluster
    processed = groupby(processed)

    # 3) Global optimization
    # ----------------------
    # After grouping, there may be redundancies in one or more clusters. This
    # final pass searches and drops such redundancies
    if mode == 'aggressive':
        processed = cross_cluster_cse(processed)

    return processed.finalize()
def rewrite(clusters, mode='advanced'):
    """
    Given a sequence of N Clusters, produce a sequence of M Clusters with
    reduced operation count, with M >= N.

    Parameters
    ----------
    clusters : list of Cluster
        The Clusters to be transformed.
    mode : str, optional
        The aggressiveness of the rewrite. Accepted:

        - ``noop``: Do nothing.
        - ``basic``: Apply common sub-expressions elimination.
        - ``advanced``: Apply all transformations that will reduce the
          operation count w/ minimum increase to the memory pressure,
          namely 'basic', factorization, CIRE for time-invariants only.
        - ``speculative``: Like 'advanced', but apply CIRE also to time-varying
          sub-expressions, which might further increase the memory pressure.
        - ``aggressive``: Like 'speculative', but apply CIRE to any non-trivial
          sub-expression (i.e., anything that is at least in a sum-of-product
          form). Further, seek and drop cross-cluster redundancies (this is
          the only pass that attempts to optimize *across* clusters, rather
          than within a cluster). The 'aggressive' mode may substantially
          increase the symbolic processing time; it may or may not reduce the
          JIT-compilation time; it may or may not improve the overall runtime
          performance.
    """
    if not (mode is None or isinstance(mode, str)):
        raise ValueError("Parameter 'mode' should be a string, not %s." % type(mode))

    if mode is None or mode == 'noop':
        return clusters
    if mode not in modes:
        dse_warning("Unknown rewrite mode(s) %s" % mode)
        return clusters

    # 1) Local optimization: dense and sparse clusters take separate
    # rewriters, since the non-affine index functions of sparse clusters
    # basically rule out the more advanced DSE passes. The sparse rewriter
    # shares the dense rewriter's template, so temporaries stay globally unique
    rewriter = modes[mode]()
    fallback = BasicRewriter(False, rewriter.template)

    rewritten = []
    for c in clusters:
        engine = rewriter if c.is_dense else fallback
        rewritten.extend(engine.run(c))

    # 2) Cluster grouping: the rewriters may have produced new (smaller)
    # clusters which are potentially groupable within a single cluster
    processed = groupby(ClusterGroup(rewritten))

    # 3) Global optimization: after grouping, one or more clusters may carry
    # redundancies, which this final pass searches and drops
    if mode == 'aggressive':
        processed = cross_cluster_cse(processed)

    return processed.finalize()
def process(self, clusters):
    """
    Process the input Clusters through the fdta engine, then concatenate
    the resulting ClusterGroups back into a flat Cluster sequence.
    """
    # One singleton ClusterGroup per Cluster, keyed by its iteration intervals
    groups = [ClusterGroup(c, c.itintervals) for c in clusters]
    # Start the fdta processing at level 1 (presumably the outermost prefix
    # level -- TODO confirm against `_process_fdta`)
    groups = self._process_fdta(groups, 1)
    return ClusterGroup.concatenate(*groups)