Example no. 1
    def _factorize(self, cluster, *args, **kwargs):
        """
        Collect terms in each expression in ``cluster.exprs`` based on the
        following heuristic:

            * Collect all literals;
            * Collect all temporaries produced by CSE;
            * If the expression has an operation count of at least
              ``self.MIN_COST_FACTORIZE``, then collection is applied
              repeatedly until no more factorization opportunities are
              available.
        """

        processed = []
        for expr in cluster.exprs:
            handle = collect_nested(expr)
            cost_handle = estimate_cost(handle)

            if cost_handle >= self.MIN_COST_FACTORIZE:
                handle_prev = handle
                cost_prev = estimate_cost(expr)
                while cost_handle < cost_prev:
                    handle_prev, handle = handle, collect_nested(handle)
                    cost_prev, cost_handle = cost_handle, estimate_cost(handle)
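                # Roll back to the last version whose cost actually decreased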
                cost_handle, handle = cost_prev, handle_prev

            processed.append(handle)

        return cluster.rebuild(processed)
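
For reference, a minimal standalone sketch of the same cost-driven loop, using SymPy's ``collect_const`` as a stand-in for ``collect_nested`` and ``count_ops`` as a stand-in for ``estimate_cost`` (both stand-ins are assumptions, not the Devito helpers):

from sympy import collect_const, count_ops, symbols

MIN_COST_FACTORIZE = 5  # hypothetical threshold

def factorize_once(expr):
    # Stand-in for collect_nested: collect numeric coefficients only
    return collect_const(expr)

def factorize(expr):
    handle = factorize_once(expr)
    cost_handle = count_ops(handle)
    if cost_handle >= MIN_COST_FACTORIZE:
        handle_prev, cost_prev = handle, count_ops(expr)
        while cost_handle < cost_prev:
            handle_prev, handle = handle, factorize_once(handle)
            cost_prev, cost_handle = cost_handle, count_ops(handle)
        # Roll back to the last profitable version
        cost_handle, handle = cost_prev, handle_prev
    return handle

a, b, c = symbols('a b c')
print(factorize(2*a + 2*b + 3*a*c + 3*b*c))  # e.g., 2*(a + b) + 3*(a*c + b*c)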
Example no. 2
    def _selector(self, e, naliases):
        if all(i.function.is_Symbol for i in e.free_symbols):
            # E.g., `dt**(-2)`
            mincost = self._opt_mincost['scalar']
        else:
            mincost = self._opt_mincost['tensor']
        return estimate_cost(e, True)*naliases // mincost
Example no. 3
    def extract(cls, n, context, min_cost, max_alias, cluster, sregistry):
        make = lambda: Scalar(name=sregistry.make_name(),
                              dtype=cluster.dtype).indexify()

        exclude = {
            i.source.indexed
            for i in cluster.scope.d_flow.independent()
        }
        rule0 = lambda e: not e.free_symbols & exclude
        rule1 = make_is_time_invariant(context)
        rule2 = lambda e: estimate_cost(e, True) >= min_cost
        rule = lambda e: rule0(e) and rule1(e) and rule2(e)

        extracted = []
        mapper = OrderedDict()
        for e in cluster.exprs:
            for i in search(e, rule, 'all', 'dfs_first_hit'):
                if i not in mapper:
                    symbol = make()
                    mapper[i] = symbol
                    extracted.append(e.func(symbol, i))

        processed = [uxreplace(e, mapper) for e in cluster.exprs]

        return extracted + processed, extracted
Example no. 4
    def _extract_time_invariants(self,
                                 cluster,
                                 template,
                                 with_cse=True,
                                 costmodel=None,
                                 **kwargs):
        """
        Extract time-invariant subexpressions, and assign them to temporaries.
        """
        make = lambda: Scalar(name=template(), dtype=cluster.dtype).indexify()
        rule = iq_timeinvariant(cluster.trace)
        costmodel = costmodel or (lambda e: estimate_cost(e) > 0)
        processed, found = xreplace_constrained(cluster.exprs, make, rule,
                                                costmodel)

        if with_cse:
            leaves = [i for i in processed if i not in found]

            # Search for common sub-expressions amongst them (and only them)
            found = common_subexprs_elimination(found, make)

            # Some temporaries may be droppable at this point
            processed = compact_temporaries(found, leaves)

        return cluster.rebuild(processed)
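
``xreplace_constrained`` itself lives elsewhere; as a rough illustration of what it does in this pass, here is a self-contained sketch that replaces maximal time-invariant subexpressions with fresh temporaries (the traversal, the rule and the cost model below are illustrative stand-ins, not Devito's implementation):

from sympy import Symbol, symbols, count_ops

t, x, a, b = symbols('t x a b')

def extract_time_invariants(expr, make, costmodel=lambda e: count_ops(e) > 0):
    # Replace maximal subexpressions that are time-invariant and costly enough
    found = []
    def visit(e):
        if t not in e.free_symbols and costmodel(e):
            temp = make()
            found.append((temp, e))
            return temp
        if not e.args:
            return e
        return e.func(*[visit(arg) for arg in e.args])
    return visit(expr), found

counter = iter(range(100))
make = lambda: Symbol('r%d' % next(counter))
processed, found = extract_time_invariants((a*b + a**2)*x + t*x, make)
print(processed, found)  # e.g., r0 + t*x [(r0, x*(a*b + a**2))]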
Example no. 5
def test_xreplace_constrained_time_varying(tu, tv, tw, ti0, ti1, t0, t1, exprs,
                                           expected):
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    make = lambda i: Scalar(name='r%d' % i).indexify()
    processed, found = xreplace_constrained(
        exprs, make, iq_timevarying(TemporariesGraph(exprs)),
        lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
Example no. 6
def create_profile(name, iet):
    """
    Enrich the Iteration/Expression tree ``iet`` by adding nodes for C-level
    performance profiling. In particular, turn all :class:`Section`s within ``iet``
    into :class:`TimedList`s.

    A :class:`Profiler` is returned to access profiling data.
    """
    sections = FindNodes(Section).visit(iet)

    # Construct the Profiler
    profiler = Profiler(name)
    for section in sections:
        # All ExpressionBundles within `section`
        bundles = FindNodes(ExpressionBundle).visit(section)

        # Total operation count
        ops = sum(i.ops for i in bundles)

        # Operation count at each section iteration
        sops = sum(
            estimate_cost(i.expr) for i in flatten(b.exprs for b in bundles))

        # Total memory traffic
        mapper = {}
        for i in bundles:
            for k, v in i.traffic.items():
                mapper.setdefault(k, []).append(v)
        traffic = [
            IntervalGroup.generate('merge', *i) for i in mapper.values()
        ]
        traffic = sum(i.extent for i in traffic)

        # Each ExpressionBundle lives in its own iteration space
        itershapes = [i.shape for i in bundles]

        # Track how many grid points are written within `section`
        points = []
        for i in bundles:
            writes = {
                e.write
                for e in i.exprs if e.is_tensor and e.write.is_TimeFunction
            }
            points.append(reduce(mul, i.shape) * len(writes))
        points = sum(points)

        profiler.add(section,
                     SectionData(ops, sops, points, traffic, itershapes))

    # Transform the Iteration/Expression tree introducing the C-level timers
    mapper = {
        i: TimedList(gname=name, lname=i.name, body=i.body)
        for i in sections
    }
    iet = Transformer(mapper).visit(iet)

    return iet, profiler
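
As a usage note, the per-section tuple ``(ops, sops, points, traffic, itershapes)`` lends itself to roofline-style summaries. A hedged sketch of such post-processing (the formulas and the 4-byte word size are assumptions, not something ``create_profile`` itself reports):

def summarize(ops, traffic, elapsed, word_size=4):
    # Roofline-style metrics from a section's profile data (illustrative)
    gflopss = ops / elapsed / 1e9       # sustained GFLOPs/s
    oi = ops / (word_size * traffic)    # operational intensity (FLOPs/byte)
    return gflopss, oi

# E.g., 1e9 operations over 1e8 grid-point accesses in 0.5 s
print(summarize(1e9, 1e8, 0.5))  # (2.0, 2.5)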
Example no. 7
def test_yreplace_time_invariants(tu, tv, tw, ti0, ti1, t0, t1, exprs, expected):
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = yreplace(exprs, make,
                                make_is_time_invariant(exprs),
                                lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
Example no. 8
    def _extract_time_invariants(self, cluster, template, **kwargs):
        """
        Extract time-invariant subexpressions, and assign them to temporaries.
        """
        make = lambda: Scalar(name=template(), dtype=cluster.dtype).indexify()
        rule = make_is_time_invariant(cluster.exprs)
        costmodel = lambda e: estimate_cost(e, True) >= self.MIN_COST_ALIAS_INV
        processed, found = yreplace(cluster.exprs, make, rule, costmodel, eager=True)

        return cluster.rebuild(processed)
Example no. 9
    def callbacks_invariants(context, n):
        min_cost_inv = min_cost['invariants']
        if callable(min_cost_inv):
            min_cost_inv = min_cost_inv(n)

        extractor = make_is_time_invariant(context)
        model = lambda e: estimate_cost(e, True) >= min_cost_inv
        ignore_collected = lambda g: False
        selector = lambda c, n: c >= min_cost_inv and n >= 1
        return extractor, model, ignore_collected, selector
Example no. 10
def test_xreplace_constrained_time_varying(tu, tv, tw, ti0, ti1, t0, t1,
                                           exprs, expected):
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = xreplace_constrained(exprs, make,
                                            iq_timevarying(FlowGraph(exprs)),
                                            lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
Example no. 11
    def _extract_rule(cls, context, min_cost, cluster):
        exclude = {
            i.source.indexed
            for i in cluster.scope.d_flow.independent()
        }
        rule0 = lambda e: not e.free_symbols & exclude
        rule1 = make_is_time_invariant(context)
        rule2 = lambda e: estimate_cost(e, True) >= min_cost

        return lambda e: rule0(e) and rule1(e) and rule2(e)
Example no. 12
    def instrument(self, iet):
        """
        Enrich the Iteration/Expression tree ``iet`` by adding nodes for C-level
        performance profiling. In particular, turn all Sections within ``iet``
        into TimedLists.
        """
        sections = FindNodes(Section).visit(iet)
        for section in sections:
            bundles = FindNodes(ExpressionBundle).visit(section)

            # Total operation count
            ops = sum(i.ops for i in bundles)

            # Operation count at each section iteration
            sops = sum(
                estimate_cost(i.expr)
                for i in flatten(b.exprs for b in bundles))

            # Total memory traffic
            mapper = {}
            for i in bundles:
                for k, v in i.traffic.items():
                    mapper.setdefault(k, []).append(v)
            traffic = 0
            for i in mapper.values():
                try:
                    traffic += IntervalGroup.generate('union', *i).size
                except ValueError:
                    # Over different iteration spaces
                    traffic += sum(j.size for j in i)

            # Each ExpressionBundle lives in its own iteration space
            itermaps = [i.ispace.dimension_map for i in bundles]

            # Track how many grid points are written within `section`
            points = []
            for i in bundles:
                writes = {
                    e.write
                    for e in i.exprs if e.is_tensor and e.write.is_TimeFunction
                }
                points.append(i.size * len(writes))
            points = sum(points)

            self._sections[section] = SectionData(ops, sops, points, traffic,
                                                  itermaps)

        # Transform the Iteration/Expression tree introducing the C-level timers
        mapper = {
            i: TimedList(timer=self.timer, lname=i.name, body=i)
            for i in sections
        }
        iet = Transformer(mapper).visit(iet)

        return iet
Example no. 13
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination, or CSE.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """

    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (e.g., i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True)) for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for e in picked])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [DummyEq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Let's drop them
    processed = _compact_temporaries(processed)

    # Perform topological sorting so that reads-after-writes are honored
    processed = _topological_sort(processed)

    return processed
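
The round structure above (count, pick the most expensive redundant subexpressions, substitute, repeat) can be reproduced with plain SymPy. A toy sketch, where ``Counter``, ``preorder_traversal`` and ``count_ops`` stand in for Devito's ``count``, ``q_xop`` and ``estimate_cost``:

from collections import Counter, OrderedDict
from sympy import Eq, Symbol, symbols, count_ops, preorder_traversal

def toy_cse(exprs, make):
    processed, mapped = list(exprs), []
    while True:
        # Count the non-atomic subexpressions across all current expressions
        counted = Counter(s for e in mapped + processed
                          for s in preorder_traversal(e) if s.args)
        targets = {k: count_ops(k) for k, v in counted.items() if v > 1}
        if not targets:
            break
        # Capture the most expensive redundant subexpressions first
        hit = max(targets.values())
        mapper = OrderedDict((k, make()) for k, v in targets.items() if v == hit)
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped
    return mapped + processed

a, b, c, d = symbols('a b c d')
counter = iter(range(100))
make = lambda: Symbol('r%d' % next(counter))
print(toy_cse([(a + b)*c, (a + b)*d], make))  # e.g., [Eq(r0, a + b), c*r0, d*r0]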
Example no. 14
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination, or CSE.

    Note: the output is not guaranteed to be topologically sorted.

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """

    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (e.g., i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_op).items()
        targets = OrderedDict([(k, estimate_cost(k)) for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for e in picked])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # Simply renumber the temporaries in ascending order
    mapper = {i.lhs: j.lhs for i, j in zip(mapped, reversed(mapped))}
    processed = [e.xreplace(mapper) for e in processed]

    return processed
Example no. 15
def choose(exprs, aliases, selector):
    others = []
    chosen = OrderedDict()
    for e in exprs:
        naliases = len(aliases.get(e.rhs))
        cost = estimate_cost(e, True)*naliases
        if selector(cost, naliases):
            chosen[e.rhs] = e.lhs
        else:
            others.append(e)

    return chosen, others
Example no. 16
    def _extract_time_varying(self, cluster, template, **kwargs):
        """
        Extract time-varying subexpressions, and assign them to temporaries.
        Time-varying subexpressions arise, for example, when approximating
        derivatives through finite differences.
        """

        make = lambda: Scalar(name=template(), dtype=cluster.dtype).indexify()
        rule = iq_timevarying(cluster.trace)
        costmodel = lambda i: estimate_cost(i) > 0
        processed, _ = xreplace_constrained(cluster.exprs, make, rule, costmodel)

        return cluster.rebuild(processed)
Example no. 17
    def _lower_clusters(cls, expressions, profiler, **kwargs):
        """
        Clusters lowering:

            * Group expressions into Clusters;
            * Introduce guards for conditional Clusters;
            * Analyze Clusters to detect computational properties such
              as parallelism.
        """
        # Build a sequence of Clusters from a sequence of Eqs
        clusters = clusterize(expressions)

        # Operation count before specialization
        init_ops = sum(estimate_cost(c.exprs) for c in clusters if c.is_dense)

        clusters = cls._specialize_clusters(clusters, **kwargs)

        # Operation count after specialization
        final_ops = sum(estimate_cost(c.exprs) for c in clusters if c.is_dense)
        profiler.record_ops_variation(init_ops, final_ops)

        return ClusterGroup(clusters)
Example no. 18
    def _extract_time_varying(self, cluster, template, **kwargs):
        """
        Extract time-varying subexpressions, and assign them to temporaries.
        Time-varying subexpressions arise, for example, when approximating
        derivatives through finite differences.
        """

        make = lambda i: Scalar(name=template(i)).indexify()
        rule = iq_timevarying(cluster.trace)
        costmodel = lambda i: estimate_cost(i) > 0
        processed, _ = xreplace_constrained(cluster.exprs, make, rule, costmodel)

        return cluster.rebuild(processed)
Example no. 19
def test_estimate_cost(expr, expected, estimate):
    # Note: integer arithmetic isn't counted
    grid = Grid(shape=(4, 4))
    x, y = grid.dimensions  # noqa

    t0 = Scalar(name='t0')  # noqa
    t1 = Scalar(name='t1')  # noqa
    t2 = Scalar(name='t2')  # noqa
    fa = Function(name='fa', grid=grid, shape=(4, ), dimensions=(x, ))  # noqa
    fb = Function(name='fb', grid=grid, shape=(4, ), dimensions=(x, ))  # noqa
    fc = Function(name='fc', grid=grid)  # noqa

    assert estimate_cost(eval(expr), estimate) == expected
Example no. 20
    def wrapper(self, state, **kwargs):
        # Invoke the DSE pass on each Cluster
        tic = time()
        state.update(flatten([func(self, c, state.template, **kwargs)
                              for c in state.clusters]))
        toc = time()

        # Profiling
        key = '%s%d' % (func.__name__, len(state.timings))
        state.timings[key] = toc - tic
        if self.profile:
            candidates = [c.exprs for c in state.clusters if c.is_dense]
            state.ops[key] = estimate_cost(flatten(candidates))
Example no. 21
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination, or CSE.

    Note: the output is not guaranteed to be topologically sorted.

    :param exprs: The target SymPy expression, or a collection of SymPy expressions.
    :param make: A function to construct symbols used for replacement.
                 The function takes as input an integer ID; ID is computed internally
                 and used as a unique identifier for the constructed symbols.
    """

    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (e.g., i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_op).items()
        targets = OrderedDict([(k, estimate_cost(k)) for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make(len(mapped) + i)) for i, e in enumerate(picked)])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # Simply renumber the temporaries in ascending order
    mapper = {i.lhs: j.lhs for i, j in zip(mapped, reversed(mapped))}
    processed = [e.xreplace(mapper) for e in processed]

    return processed
Example no. 22
def factorize(cluster, *args):
    """
    Factorize transcendental functions, symbolic powers, and numeric coefficients.

    If the expression has an operation count greater than ``MIN_COST_FACTORIZE``,
    then the algorithm is applied recursively until no more factorization
    opportunities are detected.
    """
    processed = []
    for expr in cluster.exprs:
        handle = _collect_nested(expr)
        cost_handle = estimate_cost(handle)

        if cost_handle >= MIN_COST_FACTORIZE:
            handle_prev = handle
            cost_prev = estimate_cost(expr)
            while cost_handle < cost_prev:
                handle_prev, handle = handle, _collect_nested(handle)
                cost_prev, cost_handle = cost_handle, estimate_cost(handle)
            cost_handle, handle = cost_prev, handle_prev

        processed.append(handle)

    return cluster.rebuild(processed)
Example no. 23
def test_yreplace_time_invariants(exprs, expected):
    grid = Grid((3, 3, 3))
    dims = grid.dimensions
    tu = TimeFunction(name="tu", grid=grid, space_order=4).indexify()
    tv = TimeFunction(name="tv", grid=grid, space_order=4).indexify()
    tw = TimeFunction(name="tw", grid=grid, space_order=4).indexify()
    ti0 = Array(name='ti0', shape=(3, 5, 7), dimensions=dims).indexify()
    ti1 = Array(name='ti1', shape=(3, 5, 7), dimensions=dims).indexify()
    t0 = Scalar(name='t0')
    t1 = Scalar(name='t1')
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = yreplace(exprs, make, make_is_time_invariant(exprs),
                                lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
Example no. 24
def choose(exprs, aliases, selector):
    """
    Use a cost model to select the aliases that are worth optimizing.
    """
    retained = AliasMapper(aliases)
    others = []
    chosen = OrderedDict()
    for e in exprs:
        aliaseds = aliases.get(e.rhs)
        naliases = len(aliaseds)
        cost = estimate_cost(e, True)*naliases
        if selector(cost, naliases):
            chosen[e.rhs] = e.lhs
        else:
            others.append(e)
            retained.remove(e.rhs)

    return retained, chosen, others
Example no. 25
    def _extract_time_invariants(self, cluster, template, with_cse=True, **kwargs):
        """
        Extract time-invariant subexpressions, and assign them to temporaries.
        """
        make = lambda: Scalar(name=template(), dtype=cluster.dtype).indexify()
        rule = iq_timeinvariant(cluster.trace)
        costmodel = lambda e: estimate_cost(e) > 0
        processed, found = xreplace_constrained(cluster.exprs, make, rule, costmodel)

        if with_cse:
            leaves = [i for i in processed if i not in found]

            # Search for common sub-expressions amongst them (and only them)
            found = common_subexprs_elimination(found, make)

            # Some temporaries may be droppable at this point
            processed = compact_temporaries(found, leaves)

        return cluster.rebuild(processed)
Example no. 26
    def instrument(self, iet):
        """
        Enrich the Iteration/Expression tree ``iet`` by adding nodes for C-level
        performance profiling. In particular, turn all :class:`Section`s within ``iet``
        into :class:`TimedList`s.
        """
        sections = FindNodes(Section).visit(iet)
        for section in sections:
            bundles = FindNodes(ExpressionBundle).visit(section)

            # Total operation count
            ops = sum(i.ops for i in bundles)

            # Operation count at each section iteration
            sops = sum(estimate_cost(i.expr) for i in flatten(b.exprs for b in bundles))

            # Total memory traffic
            mapper = {}
            for i in bundles:
                for k, v in i.traffic.items():
                    mapper.setdefault(k, []).append(v)
            traffic = [IntervalGroup.generate('merge', *i) for i in mapper.values()]
            traffic = sum(i.size for i in traffic)

            # Each ExpressionBundle lives in its own iteration space
            itershapes = [i.shape for i in bundles]

            # Track how many grid points are written within `section`
            points = []
            for i in bundles:
                writes = {e.write for e in i.exprs
                          if e.is_tensor and e.write.is_TimeFunction}
                points.append(reduce(mul, i.shape)*len(writes))
            points = sum(points)

            self._sections[section] = SectionData(ops, sops, points, traffic, itershapes)

        # Transform the Iteration/Expression tree introducing the C-level timers
        mapper = {i: TimedList(timer=self.timer, lname=i.name, body=i) for i in sections}
        iet = Transformer(mapper).visit(iet)

        return iet
Example no. 27
def extract(exprs, aliases):
    """
    Extract the candidate aliases.
    """
    is_time_invariant = make_is_time_invariant(exprs)
    time_invariants = {e.rhs: is_time_invariant(e) for e in exprs}

    processed = []
    candidates = OrderedDict()
    for e in exprs:
        # Cost check (to keep the memory footprint under control)
        naliases = len(aliases.get(e.rhs))
        cost = estimate_cost(e, True) * naliases
        test0 = lambda: cost >= MIN_COST_ALIAS and naliases > 1
        test1 = lambda: cost >= MIN_COST_ALIAS_INV and time_invariants[e.rhs]
        if test0() or test1():
            candidates[e.rhs] = e.lhs
        else:
            processed.append(e)

    return candidates, processed
Example no. 28
    def _generate(self, exprs, exclude):
        # E.g., extract `u.dx*a*b` and `u.dx*a*c` from `[(u.dx*a*b).dy, (u.dx*a*c).dy]`
        def cbk_search(expr):
            if isinstance(expr, EvalDerivative) and not expr.base.is_Function:
                return expr.args
            else:
                return flatten(e for e in [cbk_search(a) for a in expr.args]
                               if e)

        cbk_compose = lambda e: split_coeff(e)[1]
        basextr = self._do_generate(exprs, exclude, cbk_search, cbk_compose)
        if not basextr:
            return
        yield basextr

        # E.g., extract `u.dx*a` from `[(u.dx*a*b).dy, (u.dx*a*c).dy]`
        # That is, attempt extracting the largest common derivative-induced subexprs
        mappers = [deindexify(e) for e in basextr.extracted]
        counter = Counter(flatten(m.keys() for m in mappers))
        groups = as_mapper(counter, key=counter.get)
        grank = {
            k: sorted(v, key=lambda e: estimate_cost(e), reverse=True)
            for k, v in groups.items()
        }

        def cbk_search2(expr, rank):
            ret = []
            for e in cbk_search(expr):
                mapper = deindexify(e)
                for i in rank:
                    if i in mapper:
                        ret.extend(mapper[i])
                        break
            return ret

        candidates = sorted(grank, reverse=True)[:2]
        for i in candidates:
            lower_pri_elems = flatten([grank[j] for j in candidates if j != i])
            cbk_search_i = lambda e: cbk_search2(e, grank[i] + lower_pri_elems)
            yield self._do_generate(exprs, exclude, cbk_search_i, cbk_compose)
Example no. 29
    def wrapper(self, state, **kwargs):
        # A template to construct temporaries
        tempname = self.conventions.get(func.__name__)
        if tempname:
            start = kwargs.get('start')
            tempname += '%d' if start is None else (('_%d_' % start) + '%d')
            template = lambda i: tempname % i
        else:
            template = None

        # Invoke the DSE pass
        tic = time()
        state.update(
            flatten(
                [func(self, c, template, **kwargs) for c in state.clusters]))
        toc = time()

        # Profiling
        key = '%s%d' % (func.__name__, len(self.timings))
        self.timings[key] = toc - tic
        if self.profile:
            candidates = [c.exprs for c in state.clusters if c.is_dense]
            self.ops[key] = estimate_cost(flatten(candidates))
Example no. 30
def choose(exprs, aliases, selector):
    """
    Use a cost model to select the aliases that are worth optimizing.
    """
    # Pass 1: a set of aliasing expressions is optimizable only if its cost
    # exceeds the mode's threshold
    candidates = OrderedDict()
    others = []
    for e in exprs:
        naliases = len(aliases.get(e.rhs))
        cost = estimate_cost(e, True) * naliases
        score = selector(cost)
        if score > 0:
            candidates[e] = score
        else:
            others.append(e)

    # Pass 2: a set of aliasing expressions is optimizable if and only if it survived
    # Pass 1 above *and* the tradeoff between operation count and working set increase
    # is favorable
    owset = wset(others)
    retained = AliasMapper(aliases)
    chosen = OrderedDict()
    others = []
    for e in exprs:
        try:
            score = candidates[e]
        except KeyError:
            score = 0
        if score > 1 or \
           score == 1 and max(len(wset(e)), 1) > len(wset(e) & owset):
            chosen[e.rhs] = e.lhs
        else:
            others.append(e)
            retained.remove(e.rhs)

    return retained, chosen, others
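
To make the Pass-2 rule concrete: a borderline candidate (score == 1) is kept only if it reads at least one array that the surviving, non-optimized expressions do not, so storing it in a temporary shrinks the main loop's working set enough to pay for the temporary itself. A restatement with plain sets (illustrative; the working sets here are just sets of array names, not Devito's ``wset``):

def favorable(score, candidate_ws, others_ws):
    if score > 1:
        return True
    # Borderline: accept only if the candidate's working set is not already
    # fully contained in that of the non-optimized expressions
    return score == 1 and max(len(candidate_ws), 1) > len(candidate_ws & others_ws)

print(favorable(1, {'a', 'b'}, {'b'}))   # True: `a` drops out of the main loop
print(favorable(1, {'b'}, {'a', 'b'}))   # False: nothing would be saved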
Example no. 31
    def _selector(self, e, naliases):
        if naliases <= 1:
            return 0
        else:
            return estimate_cost(e, True) * naliases // self._opt_mincost
Example no. 32
def test_estimate_cost(fa, fb, fc, t0, t1, t2, expr, expected):
    # Note: integer arithmetic isn't counted
    assert estimate_cost(EVAL(expr, fa, fb, fc, t0, t1, t2)) == expected
Example no. 33
    def ops(self):
        """Number of operations performed at each iteration."""
        return sum(estimate_cost(i) for i in self.exprs)
Example no. 34
def _cse(maybe_exprs, make, mode='default'):
    """
    Main common sub-expressions elimination routine.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    maybe_exprs : expr-like, list of expr-like, or Cluster
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """

    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (e.g., i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some form of post-processing
    assert mode == 'default'  # Only supported mode ATM

    # Just for flexibility, accept either Clusters or exprs
    if isinstance(maybe_exprs, Cluster):
        cluster = maybe_exprs
        processed = list(cluster.exprs)
        scope = cluster.scope
    else:
        processed = list(maybe_exprs)
        scope = Scope(maybe_exprs)

    # Some sub-expressions aren't really "common" -- that's the case of Dimension-
    # independent data dependences. For example:
    #
    # ... = ... a[i] + 1 ...
    # a[i] = ...
    # ... = ... a[i] + 1 ...
    #
    # `a[i] + 1` will be excluded, as there's a flow Dimension-independent data
    # dependence involving `a`
    exclude = {i.source.access for i in scope.d_flow.independent()}

    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True)) for k, v in counted
                               if v > 1])

        # Rule out Dimension-independent data dependencies
        targets = OrderedDict([(k, v) for k, v in targets.items()
                               if not k.free_symbols & exclude])

        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for e in picked])

        # Apply replacements
        processed = [uxreplace(e, mapper) for e in processed]
        mapped = [uxreplace(e, mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Update `exclude` for the same reasons as above -- to rule out CSE across
        # Dimension-independent data dependences
        exclude.update(mapper.values())

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Let's drop them
    processed = _compact_temporaries(processed)

    return processed
Example no. 35
    def _eliminate_inter_stencil_redundancies(self, cluster, template,
                                              **kwargs):
        """
        Search aliasing expressions and capture them into vector temporaries.

        Examples
        --------
        1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
           >>>
           ti[x,y,z] = a[x,y,z] + b[x,y,z]
           temp = ti[x,y,z]*c[t,x,y,z]

        2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
           temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
           >>>
           ti[x,y,z] = a[x,y,z]*b[x,y,z]
           temp1 = 2.0*ti[x,y,z]
           temp2 = 3.0*ti[x,y,z+1]
        """
        # For more information about "aliases", refer to collect.__doc__
        aliases = collect(cluster.exprs)

        # Redundancies will be stored in space-varying temporaries
        graph = FlowGraph(cluster.exprs)
        time_invariants = {
            v.rhs: graph.time_invariant(v)
            for v in graph.values()
        }

        # Find the candidate expressions
        processed = []
        candidates = OrderedDict()
        for k, v in graph.items():
            # Cost check (to keep the memory footprint under control)
            naliases = len(aliases.get(v.rhs))
            cost = estimate_cost(v, True) * naliases
            test0 = lambda: cost >= self.MIN_COST_ALIAS and naliases > 1
            test1 = lambda: (cost >= self.MIN_COST_ALIAS_INV and
                             time_invariants[v.rhs])
            if test0() or test1():
                candidates[v.rhs] = k
            else:
                processed.append(v)

        # Create alias Clusters and all necessary substitution rules
        # for the new temporaries
        alias_clusters = []
        subs = {}
        for origin, alias in aliases.items():
            if all(i not in candidates for i in alias.aliased):
                continue

            # The write-to Intervals
            writeto = [
                Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
                for i in cluster.ispace.intervals if not i.dim.is_Time
            ]
            writeto = IntervalGroup(writeto)

            # Optimization: no need to retain a SpaceDimension if it does not
            # induce a flow/anti dependence (below, `i.offsets` captures this, by
            # telling how much halo will be needed to honour such dependences)
            dep_inducing = [i for i in writeto if any(i.offsets)]
            try:
                index = writeto.index(dep_inducing[0])
                writeto = IntervalGroup(writeto[index:])
            except IndexError:
                warning("Couldn't optimize some of the detected redundancies")

            # Create a temporary to store `alias`
            dimensions = [d.root for d in writeto.dimensions]
            halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
            array = Array(name=template(),
                          dimensions=dimensions,
                          halo=halo,
                          dtype=cluster.dtype)

            # Build up the expression evaluating `alias`
            access = tuple(i.dim - i.lower for i in writeto)
            expression = Eq(array[access], origin)

            # Create the substitution rules so that we can use the newly created
            # temporary in place of the aliasing expressions
            for aliased, distance in alias.with_distance:
                assert all(i.dim in distance.labels for i in writeto)
                access = [i.dim - i.lower + distance[i.dim] for i in writeto]
                if aliased in candidates:
                    # It would *not* be in `candidates` if part of a composite alias
                    subs[candidates[aliased]] = array[access]
                subs[aliased] = array[access]

            # Construct the `alias` IterationSpace
            intervals, sub_iterators, directions = cluster.ispace.args
            ispace = IterationSpace(intervals.add(writeto), sub_iterators,
                                    directions)

            # Construct the `alias` DataSpace
            mapper = detect_accesses(expression)
            parts = {
                k: IntervalGroup(build_intervals(v)).add(ispace.intervals)
                for k, v in mapper.items() if k
            }
            dspace = DataSpace(cluster.dspace.intervals, parts)

            # Create a new Cluster for `alias`
            alias_clusters.append(Cluster([expression], ispace, dspace))

        # Switch temporaries in the expression trees
        processed = [e.xreplace(subs) for e in processed]

        return alias_clusters + [cluster.rebuild(processed)]
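
The docstring's second example can be reproduced with plain SymPy to see the net effect of the pass (a toy: ``a``, ``b``, ``ti`` are generic functions rather than Devito ``Array``s, and ``subs`` is used for the partial-product replacement):

from sympy import Eq, Function, symbols

x, y, z = symbols('x y z')
a, b, ti = Function('a'), Function('b'), Function('ti')

# The alias "origin" and the temporary that stores it
expression = Eq(ti(x, y, z), a(x, y, z)*b(x, y, z))

# Each aliasing expression becomes a (possibly shifted) access of `ti`
temp1 = 2.0*a(x, y, z)*b(x, y, z)
temp2 = 3.0*a(x, y, z + 1)*b(x, y, z + 1)
print(temp1.subs(a(x, y, z)*b(x, y, z), ti(x, y, z)))              # 2.0*ti(x, y, z)
print(temp2.subs(a(x, y, z + 1)*b(x, y, z + 1), ti(x, y, z + 1)))  # 3.0*ti(x, y, z + 1)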
Example no. 36
    def ops(self):
        """The Cluster operation count."""
        return self.ispace.size*sum(estimate_cost(i) for i in self.exprs)
Example no. 37
    def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
        """
        Search for redundancies across the expressions and expose them
        to the later stages of the optimisation pipeline by introducing
        new temporaries of suitable rank.

        Two types of redundancy are sought:

            * Time-invariants, and
            * Across different space points

        Examples
        ========
        Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

        1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
           >>>
           ti[x,y,z] = a[x,y,z] + b[x,y,z]
           temp = ti[x,y,z]*c[t,x,y,z]

        2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
           temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
           >>>
           ti[x,y,z] = a[x,y,z]*b[x,y,z]
           temp1 = 2.0*ti[x,y,z]
           temp2 = 3.0*ti[x,y,z+1]
        """
        if cluster.is_sparse:
            return cluster

        # For more information about "aliases", refer to collect.__doc__
        mapper, aliases = collect(cluster.exprs)

        # Redundancies will be stored in space-varying temporaries
        g = cluster.trace
        indices = g.space_indices
        time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

        # Find the candidate expressions
        processed = []
        candidates = OrderedDict()
        for k, v in g.items():
            # Cost check (to keep the memory footprint under control)
            naliases = len(mapper.get(v.rhs, []))
            cost = estimate_cost(v, True)*naliases
            if cost >= self.MIN_COST_ALIAS and (naliases > 1 or time_invariants[v.rhs]):
                candidates[v.rhs] = k
            else:
                processed.append(v)

        # Create alias Clusters and all necessary substitution rules
        # for the new temporaries
        alias_clusters = ClusterGroup()
        rules = OrderedDict()
        for origin, alias in aliases.items():
            if all(i not in candidates for i in alias.aliased):
                continue
            # Construct an iteration space suitable for /alias/
            intervals, sub_iterators, directions = cluster.ispace.args
            intervals = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, i.limits))
                         for i in cluster.ispace.intervals]
            ispace = IterationSpace(intervals, sub_iterators, directions)

            # Optimization: perhaps we can lift the cluster outside the time dimension
            if all(time_invariants[i] for i in alias.aliased):
                ispace = ispace.project(lambda i: not i.is_Time)

            # Build a symbolic function for /alias/
            intervals = ispace.intervals
            halo = [(abs(intervals[i].lower), abs(intervals[i].upper)) for i in indices]
            function = Array(name=template(), dimensions=indices, halo=halo)
            access = tuple(i - intervals[i].lower for i in indices)
            expression = Eq(function[access], origin)

            # Construct a data space suitable for /alias/
            mapper = detect_accesses(expression)
            parts = {k: IntervalGroup(build_intervals(v)).add(intervals)
                     for k, v in mapper.items() if k}
            dspace = DataSpace([i.zero() for i in intervals], parts)

            # Create a new Cluster for /alias/
            alias_clusters.append(Cluster([expression], ispace, dspace))

            # Add substitution rules
            for aliased, distance in alias.with_distance:
                access = [i - intervals[i].lower + j for i, j in distance if i in indices]
                rules[candidates[aliased]] = function[access]
                rules[aliased] = function[access]

        # Group clusters together if possible
        alias_clusters = groupby(alias_clusters).finalize()
        alias_clusters.sort(key=lambda i: i.is_dense)

        # Switch temporaries in the expression trees
        processed = [e.xreplace(rules) for e in processed]

        return alias_clusters + [cluster.rebuild(processed)]
Example no. 38
    def ops(self):
        """The Cluster operation count."""
        return self.size*sum(estimate_cost(i) for i in self.exprs)