Example no. 1
def choose(aliases, exprs, mapper, selector):
    """
    Analyze the detected aliases and, after applying a cost model to rule out
    the aliases with a bad flops/memory trade-off, inject them into the original
    expressions.
    """
    tot = 0
    retained = AliasMapper()

    # Pass 1: a set of aliasing expressions is retained only if its cost
    # exceeds the mode's threshold
    candidates = OrderedDict()
    aliaseds = []
    others = []
    for e, v in aliases.items():
        score = selector(e, len(v.aliaseds))
        if score > 0:
            candidates[e] = score
            aliaseds.extend(v.aliaseds)
        else:
            others.append(e)

    # Do not waste time if unnecessary
    if not candidates:
        return exprs, retained, tot

    # Project the candidate aliases into exprs to determine what the new
    # working set would be
    mapper = {
        k: v
        for k, v in mapper.items() if v.free_symbols & set(aliaseds)
    }
    templated = [uxreplace(e, mapper) for e in exprs]

    # Pass 2: a set of aliasing expressions is retained only if the tradeoff
    # between operation count reduction and working set increase is favorable
    owset = wset(others + templated)
    for e, v in aliases.items():
        try:
            score = candidates[e]
        except KeyError:
            score = 0
        if score > 1 or \
           score == 1 and max(len(wset(e)), 1) > len(wset(e) & owset):
            retained[e] = v
            tot += score

    # Do not waste time if unnecessary
    if not retained:
        return exprs, retained, tot

    # Substitute the chosen aliasing sub-expressions
    mapper = {
        k: v
        for k, v in mapper.items() if v.free_symbols & set(retained.aliaseds)
    }
    exprs = [uxreplace(e, mapper) for e in exprs]

    return exprs, retained, tot
Example no. 2
def _buffering(expressions, callback, options):
    async_degree = options['buf-async-degree']

    # Locate all Function accesses within the provided `expressions`
    accessmap = AccessMapper(expressions)

    # Create the buffers
    buffers = []
    for n, (f, accessv) in enumerate(accessmap.items()):
        dims = callback(f)
        if dims is None:
            # Not a buffer candidate
            continue
        if accessv.lastwrite is None:
            # Read-only Functions cannot be buffering candidates
            continue
        buffers.append(Buffer(f, dims, accessv, n, async_degree))

    # Create Eqs to initialize buffers. Note: a buffer needs to be initialized
    # only if the buffered Function is read in at least one place or, in the case
    # of non-uniform SubDimensions, to avoid uninitialized values being copied back
    # into the buffered Function
    processed = [Eq(b.indexify(), b.function.subs(b.contraction_mapper))
                 for b in buffers if b.is_read or not b.has_uniform_subdims]

    # Substitution rules to replace buffered Functions with buffers
    subs = {}
    for b in buffers:
        for a in b.accessv.accesses:
            subs[a] = b.indexify(a.indices)

    # Create Eqs to copy back buffers into their buffered Functions
    for e in expressions:
        processed.append(uxreplace(e, subs))

        # We also append the copy-back if `e` is the last-write of some buffers
        for b in buffers:
            if e is b.accessv.lastwrite:
                items = list(zip(e.lhs.indices, b.function.dimensions))
                lhs = b.function[[i if i in b.index_mapper else d for i, d in items]]
                rhs = b.indexed[[b.index_mapper.get(i, d) for i, d in items]]
                if b.subdims_mapper:
                    processed.append(uxreplace(Eq(lhs, rhs), b.subdims_mapper))
                else:
                    processed.append(Eq(lhs, rhs))
                break

    return processed
Example no. 3
def _compact_temporaries(exprs):
    """
    Drop temporaries consisting of isolated symbols.
    """
    # First of all, convert to SSA
    exprs = makeit_ssa(exprs)

    # What is going to be dropped
    mapper = {e.lhs: e.rhs for e in exprs
              if e.lhs.is_Symbol and (q_leaf(e.rhs) or e.rhs.is_Function)}

    processed = []
    for e in exprs:
        if e.lhs not in mapper:
            # The temporary is retained, and substitutions may be applied
            expr = e
            while True:
                handle = uxreplace(expr, mapper)
                if handle == expr:
                    break
                else:
                    expr = handle
            processed.append(handle)

    return processed
Example no. 4
    def visit_Conditional(self, o):
        condition = uxreplace(o.condition, self.mapper)
        then_body = self._visit(o.then_body)
        else_body = self._visit(o.else_body)
        return o._rebuild(condition=condition,
                          then_body=then_body,
                          else_body=else_body)
Example no. 5
    def _lower_exprs(cls, expressions, **kwargs):
        """
        Expression lowering:

            * Form and gather any required implicit expressions;
            * Evaluate derivatives;
            * Flatten vectorial equations;
            * Indexify Functions;
            * Apply substitution rules;
            * Specialize (e.g., index shifting)
        """
        # Add in implicit expressions, e.g., induced by SubDomains
        expressions = cls._add_implicit(expressions)

        # Unfold laziness
        expressions = flatten([i.evaluate for i in expressions])

        # Scalarize tensor expressions
        expressions = [j for i in expressions for j in i._flatten]

        # Indexification
        # E.g., f(x - 2*h_x, y) -> f[xi + 2, yi + 4]  (assuming halo_size=4)
        processed = []
        for expr in expressions:
            if expr.subdomain:
                dimension_map = expr.subdomain.dimension_map
            else:
                dimension_map = {}

            # Handle Functions (typical case)
            mapper = {
                f: f.indexify(lshift=True, subs=dimension_map)
                for f in retrieve_functions(expr)
            }

            # Handle Indexeds (from index notation)
            for i in retrieve_indexed(expr):
                f = i.function

                # Introduce shifting to align with the computational domain
                indices = [(a + o)
                           for a, o in zip(i.indices, f._size_nodomain.left)]

                # Apply substitutions, if necessary
                if dimension_map:
                    indices = [j.xreplace(dimension_map) for j in indices]

                mapper[i] = f.indexed[indices]

            subs = kwargs.get('subs')
            if subs:
                # Include the user-supplied substitutions, and use
                # `xreplace` for constant folding
                processed.append(expr.xreplace({**mapper, **subs}))
            else:
                processed.append(uxreplace(expr, mapper))

        processed = cls._specialize_exprs(processed)

        return processed
Example no. 6
    def extract(cls, n, context, min_cost, cluster, sregistry):
        make = lambda: Scalar(name=sregistry.make_name(), dtype=cluster.dtype
                              ).indexify()

        exclude = {
            i.source.indexed
            for i in cluster.scope.d_flow.independent()
        }
        rule0 = lambda e: not e.free_symbols & exclude
        rule1 = make_is_time_invariant(context)
        rule2 = lambda e: estimate_cost(e, True) >= min_cost
        rule = lambda e: rule0(e) and rule1(e) and rule2(e)

        extracted = []
        mapper = OrderedDict()
        for e in cluster.exprs:
            for i in search(e, rule, 'all', 'dfs_first_hit'):
                if i not in mapper:
                    symbol = make()
                    mapper[i] = symbol
                    extracted.append(e.func(symbol, i))

        processed = [uxreplace(e, mapper) for e in cluster.exprs]

        return extracted + processed, extracted
Example no. 7
def makeit_ssa(exprs):
    """
    Convert an iterable of Eqs into Static Single Assignment (SSA) form.
    """
    # Identify recurring LHSs
    seen = {}
    for i, e in enumerate(exprs):
        seen.setdefault(e.lhs, []).append(i)
    # Optimization: don't waste time reconstructing expressions already in SSA form
    if all(len(i) == 1 for i in seen.values()):
        return exprs
    # SSA conversion
    c = 0
    mapper = {}
    processed = []
    for i, e in enumerate(exprs):
        where = seen[e.lhs]
        rhs = uxreplace(e.rhs, mapper)
        if len(where) > 1:
            needssa = e.is_Scalar or where[-1] != i
            lhs = Symbol(name='ssa%d' % c, dtype=e.dtype) if needssa else e.lhs
            if e.is_Increment:
                # Turn AugmentedAssignment into Assignment
                processed.append(
                    e.func(lhs, mapper[e.lhs] + rhs, is_Increment=False))
            else:
                processed.append(e.func(lhs, rhs))
            mapper[e.lhs] = lhs
            c += 1
        else:
            processed.append(e.func(e.lhs, rhs))
    return processed
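
A minimal sketch of the renaming idea implemented by `makeit_ssa`, written in plain Python over (name, reads) tuples rather than Devito/SymPy objects; the names `assignments`, `writes` and the `ssa%d` prefix are illustrative only. Every write to a re-assigned LHS, except its last one, gets a fresh SSA name, and later reads are redirected to it:

assignments = [('a', ['x']), ('b', ['a']), ('a', ['a', 'b'])]

# Record at which positions each LHS is written
writes = {}
for i, (lhs, _) in enumerate(assignments):
    writes.setdefault(lhs, []).append(i)

mapper, out, c = {}, [], 0
for i, (lhs, rhs) in enumerate(assignments):
    # Redirect reads to the most recent SSA name
    rhs = [mapper.get(s, s) for s in rhs]
    if len(writes[lhs]) > 1 and writes[lhs][-1] != i:
        new_lhs = 'ssa%d' % c
        mapper[lhs] = new_lhs
        lhs = new_lhs
        c += 1
    out.append((lhs, rhs))

# The first write to `a` is renamed; subsequent reads follow the rename
assert out == [('ssa0', ['x']), ('b', ['ssa0']), ('a', ['ssa0', 'b'])]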
Example no. 8
def _compact_temporaries(exprs):
    """
    Drop temporaries consisting of isolated symbols.
    """
    # First of all, convert to SSA
    exprs = makeit_ssa(exprs)

    # Drop candidates are all exprs in the form `t0 = s` where `s` is a symbol
    # Note: only CSE-captured Temps, which are by construction local objects, may
    # safely be compacted; a generic Symbol could instead be accessed in a subsequent
    # Cluster, for example: `for (i = ...) { a = b; for (j = a ...) ...`
    mapper = {
        e.lhs: e.rhs
        for e in exprs
        if isinstance(e.lhs, Temp) and (q_leaf(e.rhs) or e.rhs.is_Function)
    }

    processed = []
    for e in exprs:
        if e.lhs not in mapper:
            # The temporary is retained, and substitutions may be applied
            expr = e
            while True:
                handle = uxreplace(expr, mapper)
                if handle == expr:
                    break
                else:
                    expr = handle
            processed.append(handle)

    return processed
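
A plain-Python sketch of the fixed-point substitution loop above (illustrative strings, not Devito expressions): chained trivial temporaries such as `t2 = t1`, `t1 = t0`, `t0 = a` all collapse onto the original symbol in the surviving expressions.

mapper = {'t0': 'a', 't1': 't0', 't2': 't1'}

expr = 't2'
while True:
    # uxreplace-like single substitution step
    handle = mapper.get(expr, expr)
    if handle == expr:
        break
    expr = handle
assert expr == 'a'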
Example no. 9
    def _aliases_from_clusters(self, clusters, exclude, meta):
        exprs = flatten([c.exprs for c in clusters])

        # [Clusters]_n -> [Schedule]_m
        variants = []
        for mapper in self._generate(exprs, exclude):
            # Clusters -> AliasList
            found = collect(mapper.extracted, meta.ispace, self.opt_minstorage)
            pexprs, aliases = choose(found, exprs, mapper, self.opt_mingain)
            if not aliases:
                continue

            # AliasList -> Schedule
            schedule = lower_aliases(aliases, meta, self.opt_maxpar)

            variants.append(Variant(schedule, pexprs))

        # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off)
        if not variants:
            return []
        schedule, exprs = pick_best(variants, self.opt_schedule_strategy,
                                    self._eval_variants_delta)

        # Schedule -> Schedule (optimization)
        if self.opt_rotate:
            schedule = optimize_schedule_rotations(schedule, self.sregistry)
        schedule = optimize_schedule_padding(schedule, meta, self.platform)

        # Schedule -> [Clusters]_k
        processed, subs = lower_schedule(schedule, meta, self.sregistry,
                                         self.opt_ftemps)

        # [Clusters]_k -> [Clusters]_k (optimization)
        if self.opt_multisubdomain:
            processed = optimize_clusters_msds(processed)

        # [Clusters]_k -> [Clusters]_{k+n}
        for c in clusters:
            n = len(c.exprs)
            cexprs, exprs = exprs[:n], exprs[n:]

            cexprs = [uxreplace(e, subs) for e in cexprs]

            ispace = c.ispace.augment(schedule.dmapper)
            ispace = ispace.augment(schedule.rmapper)

            accesses = detect_accesses(cexprs)
            parts = {
                k: IntervalGroup(build_intervals(v)).relaxed
                for k, v in accesses.items() if k
            }
            dspace = DataSpace(c.dspace.intervals, parts)

            processed.append(
                c.rebuild(exprs=cexprs, ispace=ispace, dspace=dspace))

        assert len(exprs) == 0

        return processed
Example no. 10
def choose(aliases, exprs, mapper, mingain):
    """
    Analyze the detected aliases and, after applying a cost model to rule out
    the aliases with a bad memory/flops trade-off, inject them into the original
    expressions.
    """
    aliases = AliasList(aliases)

    # `score < m` => discarded
    # `score > M` => optimized
    # `m <= score <= M` => maybe discarded, maybe optimized -- depends on heuristics
    m = mingain
    M = mingain * 3

    if not aliases:
        return exprs, aliases

    # Filter out the aliases with low score
    key = lambda a: a.score >= m
    aliases.filter(key)

    # Project the candidate aliases into `exprs` to derive the final working set
    mapper = {
        k: v
        for k, v in mapper.items() if v.free_symbols & set(aliases.aliaseds)
    }
    templated = [uxreplace(e, mapper) for e in exprs]
    owset = wset(templated)

    # Filter out the aliases with a weak flop-reduction / working-set tradeoff
    key = lambda a: \
        a.score > M or \
        m <= a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset)
    aliases.filter(key)

    if not aliases:
        return exprs, aliases

    # Substitute the chosen aliasing sub-expressions
    mapper = {
        k: v
        for k, v in mapper.items() if v.free_symbols & set(aliases.aliaseds)
    }
    exprs = [uxreplace(e, mapper) for e in exprs]

    return exprs, aliases
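
A plain-Python sketch (not Devito code) of the set comparison used for borderline scores above: an alias whose working set is entirely contained in the working set of the remaining expressions adds a temporary without removing any Function from the loop, so it is filtered out, whereas an alias touching Functions that appear nowhere else passes the test. The set names below are illustrative.

owset = {'f', 'g', 'h'}            # working set of the templated expressions

contained = {'f', 'g'}             # alias working set, fully contained
assert not (max(len(contained), 1) > len(contained & owset))   # discarded

exclusive = {'f', 'p'}             # alias also touches `p`, used nowhere else
assert max(len(exclusive), 1) > len(exclusive & owset)         # retained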
Example no. 11
    def extract(cls, n, context, min_cost, max_alias, cluster, sregistry):
        make = lambda: Scalar(name=sregistry.make_name(), dtype=cluster.dtype
                              ).indexify()

        # The `depth` determines "how big" the extracted sum-of-products will be.
        # We observe that in typical FD codes:
        #   add(mul, mul, ...) -> stems from first order derivative
        #   add(mul(add(mul, mul, ...), ...), ...) -> stems from second order derivative
        # To search the muls in the former case, we need `depth=0`; to search the outer
        # muls in the latter case, we need `depth=2`
        depth = n

        exclude = {
            i.source.indexed
            for i in cluster.scope.d_flow.independent()
        }
        rule0 = lambda e: not e.free_symbols & exclude
        rule1 = lambda e: e.is_Mul and q_terminalop(e, depth)
        rule = lambda e: rule0(e) and rule1(e)

        extracted = OrderedDict()
        mapper = {}
        for e in cluster.exprs:
            for i in search(e, rule, 'all', 'bfs_first_hit'):
                if i in mapper:
                    continue

                key = lambda a: a.is_Add
                terms, others = split(list(i.args), key)

                if max_alias:
                    # Treat `e` as an FD expression and pull out the derivative
                    # coefficient from `i`
                    # Note: typically derivative coefficients are numbers, but
                    # sometimes they could be provided in symbolic form through an
                    # arbitrary Function. In the latter case, we rely on the
                    # heuristic that such Functions basically never span the whole
                    # grid, but rather a single Grid dimension (e.g., `c[z, n]` for a
                    # stencil of diameter `n` along `z`)
                    if e.grid is not None and terms:
                        key = partial(maybe_coeff_key, e.grid)
                        others, more_terms = split(others, key)
                        terms.extend(more_terms)

                if terms:
                    k = i.func(*terms)
                    try:
                        symbol, _ = extracted[k]
                    except KeyError:
                        symbol, _ = extracted.setdefault(k, (make(), e))
                    mapper[i] = i.func(symbol, *others)

        if mapper:
            extracted = [e.func(v, k) for k, (v, e) in extracted.items()]
            processed = [uxreplace(e, mapper) for e in cluster.exprs]
            return extracted + processed, extracted
        else:
            return cluster.exprs, []
Example no. 12
    def callback(self, clusters, prefix, blk_size_gen=None):
        if not prefix:
            return clusters

        d = prefix[-1].dim

        if not any(TILABLE in c.properties[d] for c in clusters):
            return clusters

        # Create the block Dimensions (in total `self.levels` Dimensions)
        base = self.sregistry.make_name(prefix=d.name)

        if blk_size_gen is not None:
            # By passing a suitable key to `next` we ensure that we pull the
            # next par-tile entry iff we're now blocking an unseen TILABLE nest
            step = sympify(blk_size_gen.next(clusters))
        else:
            # This will result in a parametric step, e.g. `x0_blk0_size`
            step = None

        name = self.sregistry.make_name(prefix="%s_blk" % base)
        bd = BlockDimension(name, d, d.symbolic_min, d.symbolic_max, step)
        step = bd.step
        block_dims = [bd]

        for _ in range(1, self.levels):
            name = self.sregistry.make_name(prefix="%s_blk" % base)
            bd = BlockDimension(name, bd, bd, bd + bd.step - 1, size=step)
            block_dims.append(bd)

        bd = BlockDimension(d.name, bd, bd, bd + bd.step - 1, 1, size=step)
        block_dims.append(bd)

        processed = []
        for c in clusters:
            if TILABLE in c.properties[d]:
                ispace = decompose(c.ispace, d, block_dims)

                # Use the innermost BlockDimension in place of `d`
                exprs = [uxreplace(e, {d: bd}) for e in c.exprs]

                # The new Cluster properties
                # TILABLE property is dropped after the blocking.
                properties = dict(c.properties)
                properties.pop(d)
                properties.update(
                    {bd: c.properties[d] - {TILABLE}
                     for bd in block_dims})

                processed.append(
                    c.rebuild(exprs=exprs,
                              ispace=ispace,
                              properties=properties))
            else:
                processed.append(c)

        return processed
Example no. 13
def lower_exprs(expressions, **kwargs):
    """
    Lowering an expression consists of the following passes:

        * Indexify functions;
        * Align Indexeds with the computational domain;
        * Apply user-provided substitution;

    Examples
    --------
    f(x - 2*h_x, y) -> f[xi + 2, yi + 4]  (assuming halo_size=4)
    """

    processed = []
    for expr in as_tuple(expressions):
        try:
            dimension_map = expr.subdomain.dimension_map
        except AttributeError:
            # Some Relationals may be pure SymPy objects, thus lacking the subdomain
            dimension_map = {}

        # Handle Functions (typical case)
        mapper = {
            f: f.indexify(lshift=True, subs=dimension_map)
            for f in retrieve_functions(expr)
        }

        # Handle Indexeds (from index notation)
        for i in retrieve_indexed(expr):
            f = i.function

            # Introduce shifting to align with the computational domain
            indices = [(lower_exprs(a) + o)
                       for a, o in zip(i.indices, f._size_nodomain.left)]

            # Apply substitutions, if necessary
            if dimension_map:
                indices = [j.xreplace(dimension_map) for j in indices]

            mapper[i] = f.indexed[indices]

        subs = kwargs.get('subs')
        # Add dimensions map to the mapper in case dimensions are used
        # as an expression, i.e. Eq(u, x, subdomain=xleft)
        mapper.update(dimension_map)
        if subs:
            # Include the user-supplied substitutions, and use
            # `xreplace` for constant folding
            processed.append(expr.xreplace({**mapper, **subs}))
        else:
            processed.append(uxreplace(expr, mapper))

    if isinstance(expressions, Iterable):
        return processed
    else:
        assert len(processed) == 1
        return processed.pop()
Example no. 14
def scalarize(clusters, template):
    """
    Turn local "isolated" Arrays, that is Arrays appearing only in one Cluster,
    into Scalars.
    """
    processed = []
    for c in clusters:
        # Get any Arrays appearing only in `c`
        impacted = set(clusters) - {c}
        arrays = {i for i in c.scope.writes if i.is_Array}
        arrays -= set().union(*[i.scope.reads for i in impacted])

        # Turn them into scalars
        #
        # r[x,y,z] = g(b[x,y,z])                 t0 = g(b[x,y,z])
        # ... = r[x,y,z] + r[x,y,z+1]   ---->    t1 = g(b[x,y,z+1])
        #                                        ... = t0 + t1
        mapper = {}
        exprs = []
        for n, e in enumerate(c.exprs):
            f = e.lhs.function
            if f in arrays:
                indexeds = [i.indexed for i in c.scope[f] if i.timestamp > n]
                for i in filter_ordered(indexeds):
                    mapper[i] = Scalar(name=template(), dtype=f.dtype)

                    assert len(f.indices) == len(e.lhs.indices) == len(
                        i.indices)
                    shifting = {
                        idx: idx + (o2 - o1)
                        for idx, o1, o2 in zip(f.indices, e.lhs.indices,
                                               i.indices)
                    }

                    handle = e.func(mapper[i], uxreplace(e.rhs, mapper))
                    handle = xreplace_indices(handle, shifting)
                    exprs.append(handle)
            else:
                exprs.append(e.func(e.lhs, uxreplace(e.rhs, mapper)))

        processed.append(c.rebuild(exprs))

    return processed
Example no. 15
def rebuild(cluster, others, aliases, subs):
    # Rebuild the non-aliasing expressions
    exprs = [uxreplace(e, subs) for e in others]

    # Add any new ShiftedDimension to the IterationSpace
    ispace = cluster.ispace.augment(aliases.index_mapper)

    # Rebuild the DataSpace to include the new symbols
    accesses = detect_accesses(exprs)
    parts = {k: IntervalGroup(build_intervals(v)).relaxed
             for k, v in accesses.items() if k}
    dspace = DataSpace(cluster.dspace.intervals, parts)

    return cluster.rebuild(exprs=exprs, ispace=ispace, dspace=dspace)
Example no. 16
def rebuild(cluster, others, schedule, subs):
    """
    Plug the optimized aliases into the input Cluster. This leads to creating
    a new Cluster with suitable IterationSpace and DataSpace.
    """
    exprs = [uxreplace(e, subs) for e in others]

    ispace = cluster.ispace.augment(schedule.dmapper)
    ispace = ispace.augment(schedule.rmapper)

    accesses = detect_accesses(exprs)
    parts = {k: IntervalGroup(build_intervals(v)).relaxed
             for k, v in accesses.items() if k}
    dspace = DataSpace(cluster.dspace.intervals, parts)

    return cluster.rebuild(exprs=exprs, ispace=ispace, dspace=dspace)
Example no. 17
    def callback(self, clusters, prefix):
        if not prefix:
            return clusters

        d = prefix[-1].dim

        # Create the block Dimensions (in total `self.levels` Dimensions)
        name = self.template % (d.name, self.nblocked[d], '%d')

        bd = IncrDimension(name % 0, d, d.symbolic_min, d.symbolic_max)
        size = bd.step
        block_dims = [bd]

        for i in range(1, self.levels):
            bd = IncrDimension(name % i, bd, bd, bd + bd.step - 1, size=size)
            block_dims.append(bd)

        bd = IncrDimension(d.name, bd, bd, bd + bd.step - 1, 1, size=size)
        block_dims.append(bd)

        processed = []
        for c in clusters:
            if TILABLE in c.properties[d]:
                ispace = decompose(c.ispace, d, block_dims)

                # Use the innermost IncrDimension in place of `d`
                exprs = [uxreplace(e, {d: bd}) for e in c.exprs]

                # The new Cluster properties
                # TILABLE property is dropped after the blocking.
                # SKEWABLE is dropped as well, but only from the new
                # block dimensions.
                properties = dict(c.properties)
                properties.pop(d)
                properties.update({bd: c.properties[d] - {TILABLE} for bd in block_dims})
                properties.update({bd: c.properties[d] - {SKEWABLE}
                                  for bd in block_dims[:-1]})

                processed.append(c.rebuild(exprs=exprs, ispace=ispace,
                                           properties=properties))
            else:
                processed.append(c)

        # Make sure to use unique IncrDimensions
        self.nblocked[d] += int(any(TILABLE in c.properties[d] for c in clusters))

        return processed
Example no. 18
    def extract(cls, n, context, min_cost, max_alias, cluster, sregistry):
        make = lambda: Scalar(name=sregistry.make_name(), dtype=cluster.dtype
                              ).indexify()

        rule = cls._extract_rule(context, min_cost, cluster)

        extracted = []
        mapper = OrderedDict()
        for e in cluster.exprs:
            for i in search(e, rule, 'all', 'dfs_first_hit'):
                if i not in mapper:
                    symbol = make()
                    mapper[i] = symbol
                    extracted.append(e.func(symbol, i))

        processed = [uxreplace(e, mapper) for e in cluster.exprs]

        return extracted + processed, extracted
Example no. 19
def eliminate_arrays(clusters):
    """
    Eliminate redundant expressions stored in Arrays.
    """
    mapper = {}
    processed = []
    for c in clusters:
        if not c.is_dense:
            processed.append(c)
            continue

        # Search for any redundant RHSs
        seen = {}
        for e in c.exprs:
            f = e.lhs.function
            if not f.is_Array:
                continue
            v = seen.get(e.rhs)
            if v is not None:
                # Found a redundant RHS
                mapper[f] = v
            else:
                seen[e.rhs] = f

        if not mapper:
            # Do not waste time
            processed.append(c)
            continue

        # Replace redundancies
        subs = {}
        for f, v in mapper.items():
            for i in filter_ordered(i.indexed for i in c.scope[f]):
                subs[i] = v[i.indices]
        exprs = []
        for e in c.exprs:
            if e.lhs.function in mapper:
                # Drop the write
                continue
            exprs.append(uxreplace(e, subs))

        processed.append(c.rebuild(exprs))

    return processed
Example no. 20
    def extract(cls, n, context, min_cost, cluster, sregistry):
        make = lambda: Scalar(name=sregistry.make_name(), dtype=cluster.dtype
                              ).indexify()

        # The `depth` determines "how big" the extracted sum-of-products will be.
        # We observe that in typical FD codes:
        #   add(mul, mul, ...) -> stems from first order derivative
        #   add(mul(add(mul, mul, ...), ...), ...) -> stems from second order derivative
        # To search the muls in the former case, we need `depth=0`; to search the outer
        # muls in the latter case, we need `depth=2`
        depth = n

        exclude = {
            i.source.indexed
            for i in cluster.scope.d_flow.independent()
        }
        rule0 = lambda e: not e.free_symbols & exclude
        rule1 = lambda e: e.is_Mul and q_terminalop(e, depth)
        rule = lambda e: rule0(e) and rule1(e)

        extracted = OrderedDict()
        mapper = {}
        for e in cluster.exprs:
            for i in search(e, rule, 'all', 'bfs_first_hit'):
                if i in mapper:
                    continue

                # Separate numbers and Functions, as they could be a derivative coeff
                terms, others = split(i.args, lambda a: a.is_Add)
                if terms:
                    k = i.func(*terms)
                    try:
                        symbol, _ = extracted[k]
                    except KeyError:
                        symbol, _ = extracted.setdefault(k, (make(), e))
                    mapper[i] = i.func(symbol, *others)

        if mapper:
            extracted = [e.func(v, k) for k, (v, e) in extracted.items()]
            processed = [uxreplace(e, mapper) for e in cluster.exprs]
            return extracted + processed, extracted
        else:
            return cluster.exprs, []
Example no. 21
def optimize_clusters_msds(clusters):
    """
    Relax the clusters by letting the expressions defined over MultiSubDomains
    be computed over the entire domain instead. This increases the likelihood of
    code lifting by later passes.
    """
    processed = []
    for c in clusters:
        msds = [
            d for d in c.ispace.itdimensions
            if isinstance(d, MultiSubDimension)
        ]

        if msds:
            mapper = {d: d.root for d in msds}
            exprs = [uxreplace(e, mapper) for e in c.exprs]

            ispace = c.ispace.relaxed(msds)

            accesses = detect_accesses(exprs)
            parts = {
                k: IntervalGroup(build_intervals(v)).relaxed
                for k, v in accesses.items() if k
            }
            intervals = [i for i in c.dspace if i.dim not in msds]
            dspace = DataSpace(intervals, parts)

            guards = {mapper.get(d, d): v for d, v in c.guards.items()}
            properties = {mapper.get(d, d): v for d, v in c.properties.items()}
            syncs = {mapper.get(d, d): v for d, v in c.syncs.items()}

            processed.append(
                c.rebuild(exprs=exprs,
                          ispace=ispace,
                          dspace=dspace,
                          guards=guards,
                          properties=properties,
                          syncs=syncs))
        else:
            processed.append(c)

    return processed
Example no. 22
    def __new__(cls, *args, **kwargs):
        if len(args) == 1 and isinstance(args[0], LoweredEq):
            # origin: LoweredEq(devito.LoweredEq, **kwargs)
            input_expr = args[0]
            expr = sympy.Eq.__new__(cls, *input_expr.args, evaluate=False)
            for i in cls._state:
                setattr(expr, '_%s' % i,
                        kwargs.get(i) or getattr(input_expr, i))
            return expr
        elif len(args) == 1 and isinstance(args[0], Eq):
            # origin: LoweredEq(devito.Eq)
            input_expr = expr = args[0]
        elif len(args) == 2:
            expr = sympy.Eq.__new__(cls, *args, evaluate=False)
            for i in cls._state:
                setattr(expr, '_%s' % i, kwargs.pop(i))
            return expr
        else:
            raise ValueError("Cannot construct LoweredEq from args=%s "
                             "and kwargs=%s" % (str(args), str(kwargs)))

        # Well-defined dimension ordering
        ordering = dimension_sort(expr)

        # Analyze the expression
        accesses = detect_accesses(expr)
        dimensions = Stencil.union(*accesses.values())

        # Separate out the SubIterators from the main iteration Dimensions, that
        # is those which define an actual iteration space
        iterators = {}
        for d in dimensions:
            if d.is_SubIterator:
                iterators.setdefault(d.root, set()).add(d)
            elif d.is_Conditional:
                # Use `parent`, rather than `root`, because a ConditionalDimension
                # may have a SubDimension as parent
                iterators.setdefault(d.parent, set())
            else:
                iterators.setdefault(d, set())

        # Construct the IterationSpace
        intervals = IntervalGroup([Interval(d, 0, 0) for d in iterators],
                                  relations=ordering.relations)
        ispace = IterationSpace(intervals, iterators)

        # Construct the conditionals and replace the ConditionalDimensions in `expr`
        conditionals = {}
        for d in ordering:
            if not d.is_Conditional:
                continue
            if d.condition is None:
                conditionals[d] = GuardFactor(d)
            else:
                conditionals[d] = diff2sympy(lower_exprs(d.condition))
            if d.factor is not None:
                expr = uxreplace(expr, {d: IntDiv(d.index, d.factor)})
        conditionals = frozendict(conditionals)

        # Lower all Differentiable operations into SymPy operations
        rhs = diff2sympy(expr.rhs)

        # Finally create the LoweredEq with all metadata attached
        expr = super(LoweredEq, cls).__new__(cls,
                                             expr.lhs,
                                             rhs,
                                             evaluate=False)

        expr._ispace = ispace
        expr._conditionals = conditionals
        expr._reads, expr._writes = detect_io(expr)

        expr._is_Increment = input_expr.is_Increment
        expr._implicit_dims = input_expr.implicit_dims

        return expr
Example no. 23
def process(cluster, chosen, aliases, sregistry, platform):
    clusters = []
    subs = {}
    for alias, writeto, aliaseds, distances in aliases.iter(cluster.ispace):
        if all(i not in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB.
        # Aside from ugly generated code, the reason we do not shift the indices
        # instead is that it would prevent future passes from transforming the
        # loop bounds (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.dimensions]

        # The halo of the Array
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array
        sharing = 'local' if any(d.is_Incr for d in writeto.dimensions) else 'shared'

        # Finally create the temporary Array that will store `alias`
        array = Array(name=sregistry.make_name(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype, sharing=sharing)

        # The access Dimensions may differ from `writeto.dimensions`. This may
        # happen e.g. if ShiftedDimensions are introduced (`a[x,y]` -> `a[xs,y]`)
        adims = [aliases.index_mapper.get(d, d) for d in writeto.dimensions]

        # The expression computing `alias`
        indices = [d - (0 if writeto[d].is_Null else writeto[d].lower) for d in adims]
        expression = Eq(array[indices], uxreplace(alias, subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in zip(aliaseds, distances):
            assert all(i.dim in distance.labels for i in writeto)

            indices = [d - i.lower + distance[i.dim] for d, i in zip(adims, writeto)]
            subs[aliased] = array[indices]

            if aliased in chosen:
                subs[chosen[aliased]] = array[indices]
            else:
                # Perhaps part of a composite alias ?
                pass

        # Construct the `alias` IterationSpace
        ispace = cluster.ispace.add(writeto).augment(aliases.index_mapper)

        # Optimization: if possible, the innermost IterationInterval is
        # rounded up to a multiple of the vector length
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                vl = platform.simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Finally, build a new Cluster for `alias`
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace)
        clusters.insert(0, built)

    return clusters, subs
Example no. 24
def collect(exprs, min_storage, ignore_collected):
    """
    Find groups of aliasing expressions.

    We shall introduce the following (loose) terminology:

        * A ``terminal`` is the leaf of a mathematical operation. Terminals
          can be numbers (n), literals (l), or Indexeds (I).
        * ``R`` is the relaxation operator := ``R(n) = n``, ``R(l) = l``,
          ``R(I) = J``, where ``J`` has the same base as ``I`` but with all
          offsets stripped away. For example, ``R(a[i+2,j-1]) = a[i,j]``.
        * A ``relaxed expression`` is an expression in which all of the
          terminals are relaxed.

    Now we define the concept of aliasing. We say that an expression A
    aliases an expression B if:

        * ``R(A) == R(B)``
        * all pairwise Indexeds in A and B access memory locations at a
          fixed constant distance along each Dimension.

    For example, consider the following expressions:

        * a[i+1] + b[i+1]
        * a[i+1] + b[j+1]
        * a[i] + c[i]
        * a[i+2] - b[i+2]
        * a[i+2] + b[i]
        * a[i-1] + b[i-1]

    Out of the expressions above, the following alias to `a[i] + b[i]`:

        * a[i+1] + b[i+1] : same operands and operations, distance along i: 1
        * a[i-1] + b[i-1] : same operands and operations, distance along i: -1

    Whereas the following do not:

        * a[i+1] + b[j+1] : because at least one index differs
        * a[i] + c[i] : because at least one of the operands differs
        * a[i+2] - b[i+2] : because at least one operation differs
        * a[i+2] + b[i] : because the distances along ``i`` differ (+2 and +0)
    """
    # Find the potential aliases
    found = []
    for expr in exprs:
        if expr.lhs.is_Indexed or expr.is_Increment:
            continue

        indexeds = retrieve_indexed(expr.rhs)

        bases = []
        offsets = []
        for i in indexeds:
            ii = IterationInstance(i)
            if ii.is_irregular:
                break

            base = []
            offset = []
            for e, ai in zip(ii, ii.aindices):
                if q_constant(e):
                    base.append(e)
                else:
                    base.append(ai)
                    offset.append((ai, e - ai))
            bases.append(tuple(base))
            offsets.append(LabeledVector(offset))

        if indexeds and len(bases) == len(indexeds):
            found.append(Candidate(expr, indexeds, bases, offsets))

    # Create groups of aliasing expressions
    mapper = OrderedDict()
    unseen = list(found)
    while unseen:
        c = unseen.pop(0)
        group = [c]
        for u in list(unseen):
            # Is the arithmetic structure of `c` and `u` equivalent ?
            if not compare_ops(c.expr, u.expr):
                continue

            # Is `c` translated w.r.t. `u` ?
            if not c.translated(u):
                continue

            group.append(u)
            unseen.remove(u)
        group = Group(group)

        # Apply callback to heuristically discard groups
        if ignore_collected(group):
            continue

        if min_storage:
            k = group.dimensions_translated
        else:
            k = group.dimensions
        mapper.setdefault(k, []).append(group)

    aliases = Aliases()
    for _groups in list(mapper.values()):
        groups = list(_groups)

        while groups:
            # For each Dimension, determine the Minimum Intervals (MI) spanning
            # all of the Groups diameters
            # Example: x's largest_diameter=2  => [x[-2,0], x[-1,1], x[0,2]]
            # Note: Groups that cannot evaluate their diameter are dropped
            mapper = defaultdict(int)
            for g in list(groups):
                try:
                    mapper.update({d: max(mapper[d], v) for d, v in g.diameter.items()})
                except ValueError:
                    groups.remove(g)
            intervalss = {d: make_rotations_table(d, v) for d, v in mapper.items()}

            # For each Group, find a rotation that is compatible with a given MI
            mapper = {}
            for d, intervals in intervalss.items():
                for interval in list(intervals):
                    found = {g: g.find_rotation_distance(d, interval) for g in groups}
                    if all(distance is not None for distance in found.values()):
                        # `interval` is OK !
                        mapper[interval] = found
                        break

            if len(mapper) == len(intervalss):
                break

            # Try again with fewer groups
            smallest = len(min(groups, key=len))
            groups = [g for g in groups if len(g) > smallest]

        for g in groups:
            c = g.pivot
            distances = defaultdict(int, [(i.dim, v[g]) for i, v in mapper.items()])

            # Create the basis alias
            offsets = [LabeledVector([(l, v[l] + distances[l]) for l in v.labels])
                       for v in c.offsets]
            subs = {i: i.function[[l + v.fromlabel(l, 0) for l in b]]
                    for i, b, v in zip(c.indexeds, c.bases, offsets)}
            alias = uxreplace(c.expr, subs)

            # All aliased expressions
            aliaseds = [i.expr for i in g]

            # Distance of each aliased expression from the basis alias
            distances = []
            for i in g:
                distance = [o.distance(v) for o, v in zip(i.offsets, offsets)]
                distance = [(d, set(v)) for d, v in LabeledVector.transpose(*distance)]
                distances.append(LabeledVector([(d, v.pop()) for d, v in distance]))

            aliases.add(alias, list(mapper), aliaseds, distances)

    return aliases
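
The relaxation operator R described in the docstring can be demonstrated in isolation with plain SymPy, independently of Devito's IterationInstance machinery; `relax` below is a toy stand-in that assumes at most one iteration symbol per index.

from sympy import IndexedBase, symbols

def relax(indexed):
    # Strip the constant offsets from each index, e.g. a[i+2, j-1] -> a[i, j]
    new_indices = []
    for idx in indexed.indices:
        free = idx.free_symbols
        new_indices.append(free.pop() if free else idx)
    return indexed.base[tuple(new_indices)]

i, j = symbols('i j')
a = IndexedBase('a')

assert relax(a[i + 2, j - 1]) == a[i, j]
# Aliasing requires equal relaxed forms plus constant pairwise distances,
# e.g. a[i+1] and a[i-1] both relax to a[i] (distances +1 and -1)
assert relax(a[i + 1]) == relax(a[i - 1]) == a[i]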
Example no. 25
    def __init__(self, mapper=None, replacer=None):
        super(XSubs, self).__init__()
        self.replacer = replacer or (lambda i: uxreplace(i, mapper))
Example no. 26
    def callback(self, clusters, prefix):
        if not prefix:
            return clusters

        d = prefix[-1].dim

        subiters = flatten(
            [c.ispace.sub_iterators.get(d, []) for c in clusters])
        subiters = {i for i in subiters if i.is_Stepping}
        if not subiters:
            return clusters

        # Collect the index access functions along `d`, e.g., `t + 1` where `t` is
        # a SteppingDimension for `d = time`
        mapper = DefaultOrderedDict(lambda: DefaultOrderedDict(set))
        for c in clusters:
            indexeds = [
                a.indexed for a in c.scope.accesses if a.function.is_Tensor
            ]

            for i in indexeds:
                try:
                    iaf = i.indices[d]
                except KeyError:
                    continue

                # Sanity checks
                sis = iaf.free_symbols & subiters
                if len(sis) == 0:
                    continue
                elif len(sis) == 1:
                    si = sis.pop()
                else:
                    raise InvalidOperator(
                        "Cannot use multiple SteppingDimensions "
                        "to index into a Function")
                size = i.function.shape_allocated[d]
                assert is_integer(size)

                mapper[size][si].add(iaf)

        # Construct the ModuloDimensions
        mds = []
        for size, v in mapper.items():
            for si, iafs in list(v.items()):
                # Offsets are sorted so that the semantic order (t0, t1, t2) follows
                # SymPy's index ordering (t, t-1, t+1) after modulo replacement so
                # that associativity errors are consistent. This corresponds to
                # sorting offsets {-1, 0, 1} as {0, -1, 1} assigning -inf to 0
                siafs = sorted(iafs,
                               key=lambda i: -np.inf
                               if i - si == 0 else (i - si))

                for iaf in siafs:
                    name = '%s%d' % (si.name, len(mds))
                    offset = uxreplace(iaf, {si: d.root})
                    mds.append(
                        ModuloDimension(name, si, offset, size, origin=iaf))

        # Replacement rule for ModuloDimensions
        def rule(size, e):
            try:
                return e.function.shape_allocated[d] == size
            except (AttributeError, KeyError):
                return False

        # Reconstruct the Clusters
        processed = []
        for c in clusters:
            # Apply substitutions to expressions
            # Note: In an expression, there could be `u[t+1, ...]` and `v[t+1,
            # ...]`, where `u` and `v` are TimeFunction with circular time
            # buffers (save=None) *but* different modulo extent. The `t+1`
            # indices above are therefore conceptually different, so they will
            # be replaced with the proper ModuloDimension through two different
            # calls to `xreplace_indices`
            exprs = c.exprs
            groups = as_mapper(mds, lambda d: d.modulo)
            for size, v in groups.items():
                mapper = {md.origin: md for md in v}

                func = partial(xreplace_indices,
                               mapper=mapper,
                               key=partial(rule, size))
                exprs = [e.apply(func) for e in exprs]

            # Augment IterationSpace
            ispace = IterationSpace(c.ispace.intervals, {
                **c.ispace.sub_iterators,
                **{
                    d: tuple(mds)
                }
            }, c.ispace.directions)

            processed.append(c.rebuild(exprs=exprs, ispace=ispace))

        return processed
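
The offset-sorting rule described in the comment above ("sorting offsets {-1, 0, 1} as {0, -1, 1}") can be checked in isolation: the key maps the zero offset to -inf so that it always comes first.

import numpy as np

offsets = [-1, 0, 1]
assert sorted(offsets, key=lambda o: -np.inf if o == 0 else o) == [0, -1, 1]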
Example no. 27
def _cse(maybe_exprs, make, mode='default'):
    """
    Main common sub-expressions elimination routine.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    maybe_exprs : expr-like or list of expr-like  or Cluster
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """

    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (e.g., i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some form of post-processing
    assert mode == 'default'  # Only supported mode ATM

    # Just for flexibility, accept either Clusters or exprs
    if isinstance(maybe_exprs, Cluster):
        cluster = maybe_exprs
        processed = list(cluster.exprs)
        scope = cluster.scope
    else:
        processed = list(maybe_exprs)
        scope = Scope(maybe_exprs)

    # Some sub-expressions aren't really "common" -- that's the case of Dimension-
    # independent data dependences. For example:
    #
    # ... = ... a[i] + 1 ...
    # a[i] = ...
    # ... = ... a[i] + 1 ...
    #
    # `a[i] + 1` will be excluded, as there's a flow Dimension-independent data
    # dependence involving `a`
    exclude = {i.source.access for i in scope.d_flow.independent()}

    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True)) for k, v in counted
                               if v > 1])

        # Rule out Dimension-independent data dependencies
        targets = OrderedDict([(k, v) for k, v in targets.items()
                               if not k.free_symbols & exclude])

        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for i, e in enumerate(picked)])

        # Apply replacements
        processed = [uxreplace(e, mapper) for e in processed]
        mapped = [uxreplace(e, mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Update `exclude` for the same reasons as above -- to rule out CSE across
        # Dimension-independent data dependences
        exclude.update({i for i in mapper.values()})

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Let's drop them
    processed = _compact_temporaries(processed)

    return processed
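
A toy illustration of the redundancy-detection step inside the `while` loop above, using plain SymPy rather than Devito's `count`/`q_xop`/`estimate_cost` helpers: compound sub-expressions occurring more than once across the expressions become candidate temporaries.

from collections import Counter
from sympy import symbols, preorder_traversal

x, y = symbols('x y')
exprs = [(x + y)*x, (x + y)*y]

# Count every compound (non-leaf) sub-expression across all expressions
counts = Counter(a for e in exprs for a in preorder_traversal(e) if a.args)
targets = [e for e, n in counts.items() if n > 1]

assert targets == [x + y]   # `x + y` would be captured into a temporary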
Example no. 28
    def visit_Expression(self, o):
        return o._rebuild(expr=uxreplace(o.expr, self.mapper))
Example no. 29
    def visit_Call(self, o):
        arguments = [uxreplace(i, self.mapper) for i in o.arguments]
        return o._rebuild(arguments=arguments)
Example no. 30
def actions_from_update_memcpy(cluster, clusters, prefix, actions):
    it = prefix[-1]
    d = it.dim
    direction = it.direction

    # Prepare the data to instantiate a PrefetchUpdate SyncOp
    e = cluster.exprs[0]

    size = 1

    function = e.rhs.function
    fetch = e.rhs.indices[d]
    ifetch = fetch.subs(d, d.symbolic_min)
    if direction is Forward:
        fcond = make_cond(cluster.guards.get(d), d, direction, d.symbolic_min)
    else:
        fcond = make_cond(cluster.guards.get(d), d, direction, d.symbolic_max)

    if direction is Forward:
        pfetch = fetch + 1
        pcond = make_cond(cluster.guards.get(d), d, direction, d + 1)
    else:
        pfetch = fetch - 1
        pcond = make_cond(cluster.guards.get(d), d, direction, d - 1)

    target = e.lhs.function
    tstore0 = e.lhs.indices[d]

    # If fetching into e.g., `ub[sb1]`, we'll need to prefetch into e.g. `ub[sb0]`
    if is_integer(tstore0):
        tstore = tstore0
    else:
        assert tstore0.is_Modulo
        subiters = [md for md in cluster.sub_iterators[d] if md.parent is tstore0.parent]
        osubiters = sorted(subiters, key=lambda i: Vector(i.offset))
        n = osubiters.index(tstore0)
        if direction is Forward:
            tstore = osubiters[(n + 1) % len(osubiters)]
        else:
            tstore = osubiters[(n - 1) % len(osubiters)]

    # Turn `cluster` into a prefetch Cluster
    expr = uxreplace(e, {tstore0: tstore, fetch: pfetch})
    guards = {d: And(*([pcond] + as_list(cluster.guards.get(d))))}
    syncs = {d: [PrefetchUpdate(
        d, size,
        function, fetch, ifetch, fcond,
        pfetch, pcond,
        target, tstore
    )]}
    pcluster = cluster.rebuild(exprs=expr, guards=guards, syncs=syncs)

    # Since we're turning `e` into a prefetch, we need to:
    # 1) attach a WaitPrefetch SyncOp to the first Cluster accessing `target`
    # 2) insert the prefetch Cluster right after the last Cluster accessing `target`
    # 3) drop the original Cluster performing a memcpy-based fetch
    n = clusters.index(cluster)
    first = None
    last = None
    for c in clusters[n+1:]:
        if target in c.scope.reads:
            if first is None:
                first = c
            last = c
    assert first is not None
    assert last is not None
    actions[first].syncs[d].append(WaitPrefetch(
        d, size,
        function, fetch, ifetch, fcond,
        pfetch, pcond,
        target, tstore
    ))
    actions[last].insert.append(pcluster)
    actions[cluster].drop = True