Ejemplo n.º 1
0
    def _dissect(self, heuristics):
        """Analyze the set of expressions in the LoopOptimizer and infer an
        optimal rewrite mode for each of them.

        If an expression is embedded in a non-perfect loop nest, then injection
        may be performed. Injection consists of unrolling any loops outside of
        the expression iteration space into the expression itself.
        For example: ::

            for i
              for r
                a += B[r]*C[i][r]
              for j
                for k
                  A[j][k] += ...f(a)... // the expression at hand

        gets transformed into:

            for i
              for j
                for k
                  A[j][k] += ...f(B[0]*C[i][0] + B[1]*C[i][1] + ...)...

        Injection could be necessary to maximize the impact of rewrite mode=3,
        which tries to pre-evaluate subexpressions whose values are known at
        code generation time. Injection is essential to factorize such subexprs.

        The method works in four steps: 1) build the unrolled (injectable)
        expressions without touching the AST; 2) apply a cost/save model to
        decide what to inject; 3) if everything was injected, remove the
        unrolled loops and the declarations of the injected symbols from the
        AST; 4) fission the statements in which injection took place.

        Side effects: ``expr_info.mode`` is set to 4 for every expression in
        ``self.exprs``, except for the statements reported as matched by the
        fissioner, which get mode 3; fissioned statements are replaced in
        ``self.exprs`` by the statements produced by the fissioner.

        :arg heuristics: any value in ['greedy', 'aggressive']. With 'greedy', a greedy
            approach is used to decide which of the expressions for which injection
            looks beneficial should be dissected (e.g., injection increases the memory
            footprint, and some memory constraints must always be preserved).
            With 'aggressive', the whole space of possibilities is analyzed.
            For any other value, no expression is injected.
        """
        # The memory threshold. The total size of temporaries will not have to
        # be greater than this value. If we predict that injection will lead
        # to too much temporary space, we have to partially drop it
        threshold = system.architecture['cache_size'] * 1.2

        # The header is taken from the instance, as done in step 3 below (a
        # bare /header/ name is not bound anywhere in this method)
        expr_graph = ExpressionGraph(self.header)

        # 1) Find out and unroll injectable loops. For unrolling we create new
        # expressions; that is, for now, we do not modify the AST in place.
        analyzed, injectable = [], {}
        for stmt, expr_info in self.exprs.items():
            # Get all loop nests, then discard the one enclosing the expression
            nests = [n for n in visit(expr_info.loops_parents[0])['fors']]
            injectable_nests = [n for n in nests if list(zip(*n))[0] != expr_info.loops]

            for nest in injectable_nests:
                to_unroll = [(l, p) for l, p in nest if l not in expr_info.loops]
                if not to_unroll:
                    # Nothing to unroll in this nest; going on would fail in
                    # both the /reduce/ and the /to_unroll[0]/ lookup below
                    continue
                unroll_cost = reduce(operator.mul, (l.size for l, p in to_unroll))

                nest_writers = Find(Writer).visit(to_unroll[0][0])
                for op, i_stmts in nest_writers.items():
                    # Check safety of unrolling
                    if op in [Assign, IMul, IDiv]:
                        continue
                    assert op in [Incr, Decr]

                    for i_stmt in i_stmts:
                        i_sym, i_expr = i_stmt.children

                        # Avoid injecting twice the same loop
                        if i_stmt in analyzed + [l.incr for l, p in to_unroll]:
                            continue
                        analyzed.append(i_stmt)

                        # Create unrolled, injectable expressions: going from the
                        # innermost loop outwards, replicate the expression once
                        # per iteration and replace the loop dimension in the
                        # symbols' ranks with the iteration number
                        for l, p in reversed(to_unroll):
                            i_expr = [dcopy(i_expr) for i in range(l.size)]
                            for i, e in enumerate(i_expr):
                                e_syms = Find(Symbol).visit(e)[Symbol]
                                for s in e_syms:
                                    s.rank = tuple([r if r != l.dim else i for r in s.rank])
                            i_expr = ast_make_expr(Sum, i_expr)

                        # Track the unrolled, injectable expressions and their cost
                        if i_sym.symbol in injectable:
                            old_i_expr, old_cost = injectable[i_sym.symbol]
                            new_i_expr = ast_make_expr(Sum, [i_expr, old_i_expr])
                            new_cost = unroll_cost + old_cost
                            injectable[i_sym.symbol] = (new_i_expr, new_cost)
                        else:
                            injectable[i_sym.symbol] = (i_expr, unroll_cost)

        # 2) Will rewrite mode=3 be cheaper than rewrite mode=2?
        def find_save(target_expr, expr_info):
            save_factor = [l.size for l in expr_info.out_linear_loops] or [1]
            save_factor = reduce(operator.mul, save_factor)
            # The save factor should be multiplied by the number of terms
            # that will /not/ be pre-evaluated. To obtain this number, we
            # can exploit the linearity of the expression in the terms
            # depending on the linear loops.
            syms = Find(Symbol).visit(target_expr)[Symbol]
            inner = lambda s: any(r == expr_info.linear_dims[-1] for r in s.rank)
            nterms = len(set(s.symbol for s in syms if inner(s)))
            save = nterms * save_factor
            return save_factor, save

        should_unroll = True
        storage = 0
        # /i_syms/ must denote the full set of injectable symbols for every
        # statement analyzed below, so it is never rebound inside the loop
        i_syms, injected = list(injectable.keys()), defaultdict(list)
        for stmt, expr_info in self.exprs.items():
            sym, expr = stmt.children

            # Divide /expr/ into subexpressions, each subexpression affected
            # differently by injection
            if i_syms:
                dissected = find_expression(expr, Prod, expr_info.linear_dims, i_syms)
                leftover = find_expression(expr, dims=expr_info.linear_dims, out_syms=i_syms)
                leftover = {(): list(flatten(leftover.values()))}
                # Merge the two mappings; on a key clash /leftover/ wins
                merged = dict(dissected)
                merged.update(leftover)
                dissected = merged
            else:
                dissected = {(): [expr]}
            if any(i not in flatten(dissected.keys()) for i in i_syms):
                should_unroll = False
                continue

            # Apply the profitability model
            analysis = OrderedDict()
            for group_syms, target_exprs in dissected.items():
                for target_expr in target_exprs:

                    # *** Save ***
                    save_factor, save = find_save(target_expr, expr_info)

                    # *** Cost ***
                    # The number of operations increases by a factor which
                    # corresponds to the number of possible /combinations with
                    # repetitions/ in the injected-values set. We consider
                    # combinations and not dispositions to take into account the
                    # (future) effect of factorization.
                    retval = ProjectExpansion.default_retval()
                    projection = ProjectExpansion(group_syms).visit(target_expr, ret=retval)
                    projection = [i for i in projection if i]
                    increase_factor = 0
                    for i in projection:
                        partial = 1
                        for j in expr_graph.shares(i):
                            # _n=number of unique elements, _k=group size
                            _n = injectable[j[0]][1]
                            _k = len(j)
                            partial *= fact(_n + _k - 1)//(fact(_k)*fact(_n - 1))
                        increase_factor += partial
                    increase_factor = increase_factor or 1
                    if increase_factor > save_factor:
                        # We immediately give up if this holds since it ensures
                        # that /cost > save/ (but not that cost <= save)
                        should_unroll = False
                        continue
                    # The increase factor should be multiplied by the number of
                    # terms that will be pre-evaluated. To obtain this number,
                    # we need to project the output of factorization. This is
                    # done on a fake statement temporarily swapped into the AST.
                    fake_stmt = stmt.__class__(stmt.children[0], dcopy(target_expr))
                    fake_parent = expr_info.parent.children
                    fake_parent[fake_parent.index(stmt)] = fake_stmt
                    try:
                        ew = ExpressionRewriter(fake_stmt, expr_info)
                        ew.expand(mode='all').factorize(mode='all').factorize(mode='linear')
                        nterms = ew.licm(mode='aggressive', look_ahead=True)
                        nterms = len(uniquify(nterms[expr_info.dims])) or 1
                    finally:
                        # Put the real statement back even if rewriting fails,
                        # so that the AST is never left with the fake one
                        fake_parent[fake_parent.index(fake_stmt)] = stmt
                    cost = nterms * increase_factor

                    # Pre-evaluation will also increase the working set size by
                    # /cost/ * /sizeof(term)/.
                    size = [l.size for l in expr_info.linear_loops]
                    size = reduce(operator.mul, size, 1)
                    storage_increase = cost * size * system.architecture[expr_info.type]

                    # Track the injectable sub-expression and its cost/save. The
                    # final decision of whether to actually perform injection or not
                    # is postponed until all dissected expressions have been analyzed
                    analysis[target_expr] = (cost, save, storage_increase)

            # So what should we inject after all? Time to *use* the cost model
            if heuristics == 'greedy':
                for target_expr, (cost, save, storage_increase) in analysis.items():
                    if cost > save or storage_increase + storage > threshold:
                        should_unroll = False
                    else:
                        # Update the storage used so far
                        storage += storage_increase
                        # At this point, we can happily inject
                        to_replace = {k: v[0] for k, v in injectable.items()}
                        ast_replace(target_expr, to_replace, copy=True)
                        injected[stmt].append(target_expr)
            elif heuristics == 'aggressive':
                # A) Remove expression that we already know should never be injected
                # (iterate over a snapshot, since /analysis/ is modified in the loop)
                not_injected = []
                for target_expr, (cost, save, storage_increase) in list(analysis.items()):
                    if cost > save:
                        should_unroll = False
                        analysis.pop(target_expr)
                        not_injected.append(target_expr)
                # B) Find all possible bipartitions: each bipartition represents
                # the set of expressions that will be pre-evaluated and the set
                # of expressions that could also be pre-evaluated, but might not
                # (e.g. because of memory constraints)
                target_exprs = list(analysis.keys())
                bipartitions = []
                for i in range(len(target_exprs)+1):
                    for e1 in combinations(target_exprs, i):
                        bipartitions.append((e1, tuple(e2 for e2 in target_exprs
                                                       if e2 not in e1)))
                # C) Eliminate those bipartitions that would lead to exceeding
                # the memory threshold
                bipartitions = [(e1, e2) for e1, e2 in bipartitions if
                                sum(analysis[i][2] for i in e1) <= threshold]
                # D) Find out what is best to pre-evaluate (and therefore
                # what should be injected)
                totals = OrderedDict()
                for e1, e2 in bipartitions:
                    # Is there any value in actually not pre-evaluating the
                    # expressions in /e2/ ?
                    fake_expr = ast_make_expr(Sum, list(e2) + not_injected)
                    _, save = find_save(fake_expr, expr_info) if fake_expr else (0, 0)
                    cost = sum(analysis[i][0] for i in e1)
                    totals[(e1, e2)] = save + cost
                best = min(totals, key=totals.get)
                # At this point, we can happily inject
                to_replace = {k: v[0] for k, v in injectable.items()}
                for target_expr in best[0]:
                    ast_replace(target_expr, to_replace, copy=True)
                    injected[stmt].append(target_expr)
                if best[1]:
                    # At least one non-injected expressions, let's be sure we
                    # don't unroll everything
                    should_unroll = False

        # 3) Purge the AST from now useless symbols/expressions
        if should_unroll:
            decls = visit(self.header, info_items=['decls'])['decls']
            for expr_info in self.exprs.values():
                nests = [n for n in visit(expr_info.loops_parents[0])['fors']]
                injectable_nests = [n for n in nests if list(zip(*n))[0] != expr_info.loops]
                for nest in injectable_nests:
                    unrolled = [(l, p) for l, p in nest if l not in expr_info.loops]
                    for l, p in unrolled:
                        # A loop shared by several nests shows up more than
                        # once; it can only be removed the first time
                        if l in p.children:
                            p.children.remove(l)
                        for i_sym in injectable:
                            decl = decls.get(i_sym)
                            if decl and decl in p.children:
                                p.children.remove(decl)

        # 4) Split the expressions if injection has been performed. We iterate
        # over a snapshot because /self.exprs/ is modified along the way
        for stmt, expr_info in list(self.exprs.items()):
            expr_info.mode = 4
            inj_exprs = injected.get(stmt)
            if not inj_exprs:
                continue
            fissioner = ExpressionFissioner(match=inj_exprs, loops='all', perfect=True)
            new_exprs = fissioner.fission(stmt, self.exprs.pop(stmt))
            self.exprs.update(new_exprs)
            for new_stmt, new_info in new_exprs.items():
                new_info.mode = 3 if new_stmt in fissioner.matched else 4
Ejemplo n.º 2
0
    def _dissect(self, heuristics):
        """Analyze the set of expressions in the LoopOptimizer and infer an
        optimal rewrite mode for each of them.

        If an expression is embedded in a non-perfect loop nest, then injection
        may be performed. Injection consists of unrolling any loops outside of
        the expression iteration space into the expression itself.
        For example: ::

            for i
              for r
                a += B[r]*C[i][r]
              for j
                for k
                  A[j][k] += ...f(a)... // the expression at hand

        gets transformed into:

            for i
              for j
                for k
                  A[j][k] += ...f(B[0]*C[i][0] + B[1]*C[i][1] + ...)...

        Injection could be necessary to maximize the impact of rewrite mode=3,
        which tries to pre-evaluate subexpressions whose values are known at
        code generation time. Injection is essential to factorize such subexprs.

        The method works in four steps: 1) build the unrolled (injectable)
        expressions without touching the AST; 2) apply a cost/save model to
        decide what to inject; 3) if everything was injected, remove the
        unrolled loops and the declarations of the injected symbols from the
        AST; 4) fission the statements in which injection took place.

        Side effects: ``expr_info.mode`` is set to 4 for every expression in
        ``self.exprs``, except for the statements reported as matched by the
        fissioner, which get mode 3; fissioned statements are replaced in
        ``self.exprs`` by the statements produced by the fissioner.

        :arg heuristics: any value in ['greedy', 'aggressive']. With 'greedy', a greedy
            approach is used to decide which of the expressions for which injection
            looks beneficial should be dissected (e.g., injection increases the memory
            footprint, and some memory constraints must always be preserved).
            With 'aggressive', the whole space of possibilities is analyzed.
            For any other value, no expression is injected.
        """
        # The memory threshold. The total size of temporaries will not have to
        # be greater than this value. If we predict that injection will lead
        # to too much temporary space, we have to partially drop it
        threshold = system.architecture['cache_size'] * 1.2

        # The header is taken from the instance, as done in step 3 below (a
        # bare /header/ name is not bound anywhere in this method)
        expr_graph = ExpressionGraph(self.header)

        # 1) Find out and unroll injectable loops. For unrolling we create new
        # expressions; that is, for now, we do not modify the AST in place.
        analyzed, injectable = [], {}
        for stmt, expr_info in self.exprs.items():
            # Get all loop nests, then discard the one enclosing the expression
            nests = [n for n in visit(expr_info.loops_parents[0])['fors']]
            injectable_nests = [
                n for n in nests if list(zip(*n))[0] != expr_info.loops
            ]

            for nest in injectable_nests:
                to_unroll = [(l, p) for l, p in nest
                             if l not in expr_info.loops]
                if not to_unroll:
                    # Nothing to unroll in this nest; going on would fail in
                    # both the /reduce/ and the /to_unroll[0]/ lookup below
                    continue
                unroll_cost = reduce(operator.mul,
                                     (l.size for l, p in to_unroll))

                nest_writers = Find(Writer).visit(to_unroll[0][0])
                for op, i_stmts in nest_writers.items():
                    # Check safety of unrolling
                    if op in [Assign, IMul, IDiv]:
                        continue
                    assert op in [Incr, Decr]

                    for i_stmt in i_stmts:
                        i_sym, i_expr = i_stmt.children

                        # Avoid injecting twice the same loop
                        if i_stmt in analyzed + [l.incr for l, p in to_unroll]:
                            continue
                        analyzed.append(i_stmt)

                        # Create unrolled, injectable expressions: going from the
                        # innermost loop outwards, replicate the expression once
                        # per iteration and replace the loop dimension in the
                        # symbols' ranks with the iteration number
                        for l, p in reversed(to_unroll):
                            i_expr = [dcopy(i_expr) for i in range(l.size)]
                            for i, e in enumerate(i_expr):
                                e_syms = Find(Symbol).visit(e)[Symbol]
                                for s in e_syms:
                                    s.rank = tuple([
                                        r if r != l.dim else i for r in s.rank
                                    ])
                            i_expr = ast_make_expr(Sum, i_expr)

                        # Track the unrolled, injectable expressions and their cost
                        if i_sym.symbol in injectable:
                            old_i_expr, old_cost = injectable[i_sym.symbol]
                            new_i_expr = ast_make_expr(Sum,
                                                       [i_expr, old_i_expr])
                            new_cost = unroll_cost + old_cost
                            injectable[i_sym.symbol] = (new_i_expr, new_cost)
                        else:
                            injectable[i_sym.symbol] = (i_expr, unroll_cost)

        # 2) Will rewrite mode=3 be cheaper than rewrite mode=2?
        def find_save(target_expr, expr_info):
            save_factor = [l.size for l in expr_info.out_linear_loops] or [1]
            save_factor = reduce(operator.mul, save_factor)
            # The save factor should be multiplied by the number of terms
            # that will /not/ be pre-evaluated. To obtain this number, we
            # can exploit the linearity of the expression in the terms
            # depending on the linear loops.
            syms = Find(Symbol).visit(target_expr)[Symbol]
            inner = lambda s: any(r == expr_info.linear_dims[-1]
                                  for r in s.rank)
            nterms = len(set(s.symbol for s in syms if inner(s)))
            save = nterms * save_factor
            return save_factor, save

        should_unroll = True
        storage = 0
        # /i_syms/ must denote the full set of injectable symbols for every
        # statement analyzed below, so it is never rebound inside the loop
        i_syms, injected = list(injectable.keys()), defaultdict(list)
        for stmt, expr_info in self.exprs.items():
            sym, expr = stmt.children

            # Divide /expr/ into subexpressions, each subexpression affected
            # differently by injection
            if i_syms:
                dissected = find_expression(expr, Prod, expr_info.linear_dims,
                                            i_syms)
                leftover = find_expression(expr,
                                           dims=expr_info.linear_dims,
                                           out_syms=i_syms)
                leftover = {(): list(flatten(leftover.values()))}
                # Merge the two mappings; on a key clash /leftover/ wins
                merged = dict(dissected)
                merged.update(leftover)
                dissected = merged
            else:
                dissected = {(): [expr]}
            if any(i not in flatten(dissected.keys()) for i in i_syms):
                should_unroll = False
                continue

            # Apply the profitability model
            analysis = OrderedDict()
            for group_syms, target_exprs in dissected.items():
                for target_expr in target_exprs:

                    # *** Save ***
                    save_factor, save = find_save(target_expr, expr_info)

                    # *** Cost ***
                    # The number of operations increases by a factor which
                    # corresponds to the number of possible /combinations with
                    # repetitions/ in the injected-values set. We consider
                    # combinations and not dispositions to take into account the
                    # (future) effect of factorization.
                    retval = ProjectExpansion.default_retval()
                    projection = ProjectExpansion(group_syms).visit(
                        target_expr, ret=retval)
                    projection = [i for i in projection if i]
                    increase_factor = 0
                    for i in projection:
                        partial = 1
                        for j in expr_graph.shares(i):
                            # _n=number of unique elements, _k=group size
                            _n = injectable[j[0]][1]
                            _k = len(j)
                            partial *= fact(_n + _k - 1) // (fact(_k) *
                                                             fact(_n - 1))
                        increase_factor += partial
                    increase_factor = increase_factor or 1
                    if increase_factor > save_factor:
                        # We immediately give up if this holds since it ensures
                        # that /cost > save/ (but not that cost <= save)
                        should_unroll = False
                        continue
                    # The increase factor should be multiplied by the number of
                    # terms that will be pre-evaluated. To obtain this number,
                    # we need to project the output of factorization. This is
                    # done on a fake statement temporarily swapped into the AST.
                    fake_stmt = stmt.__class__(stmt.children[0],
                                               dcopy(target_expr))
                    fake_parent = expr_info.parent.children
                    fake_parent[fake_parent.index(stmt)] = fake_stmt
                    try:
                        ew = ExpressionRewriter(fake_stmt, expr_info)
                        ew.expand(mode='all').factorize(mode='all').factorize(
                            mode='linear')
                        nterms = ew.licm(mode='aggressive', look_ahead=True)
                        nterms = len(uniquify(nterms[expr_info.dims])) or 1
                    finally:
                        # Put the real statement back even if rewriting fails,
                        # so that the AST is never left with the fake one
                        fake_parent[fake_parent.index(fake_stmt)] = stmt
                    cost = nterms * increase_factor

                    # Pre-evaluation will also increase the working set size by
                    # /cost/ * /sizeof(term)/.
                    size = [l.size for l in expr_info.linear_loops]
                    size = reduce(operator.mul, size, 1)
                    storage_increase = cost * size * system.architecture[
                        expr_info.type]

                    # Track the injectable sub-expression and its cost/save. The
                    # final decision of whether to actually perform injection or not
                    # is postponed until all dissected expressions have been analyzed
                    analysis[target_expr] = (cost, save, storage_increase)

            # So what should we inject after all? Time to *use* the cost model
            if heuristics == 'greedy':
                for target_expr, (cost, save,
                                  storage_increase) in analysis.items():
                    if cost > save or storage_increase + storage > threshold:
                        should_unroll = False
                    else:
                        # Update the storage used so far
                        storage += storage_increase
                        # At this point, we can happily inject
                        to_replace = {k: v[0] for k, v in injectable.items()}
                        ast_replace(target_expr, to_replace, copy=True)
                        injected[stmt].append(target_expr)
            elif heuristics == 'aggressive':
                # A) Remove expression that we already know should never be injected
                # (iterate over a snapshot, since /analysis/ is modified in the loop)
                not_injected = []
                for target_expr, (cost, save,
                                  storage_increase) in list(analysis.items()):
                    if cost > save:
                        should_unroll = False
                        analysis.pop(target_expr)
                        not_injected.append(target_expr)
                # B) Find all possible bipartitions: each bipartition represents
                # the set of expressions that will be pre-evaluated and the set
                # of expressions that could also be pre-evaluated, but might not
                # (e.g. because of memory constraints)
                target_exprs = list(analysis.keys())
                bipartitions = []
                for i in range(len(target_exprs) + 1):
                    for e1 in combinations(target_exprs, i):
                        bipartitions.append(
                            (e1,
                             tuple(e2 for e2 in target_exprs if e2 not in e1)))
                # C) Eliminate those bipartitions that would lead to exceeding
                # the memory threshold
                bipartitions = [(e1, e2) for e1, e2 in bipartitions
                                if sum(analysis[i][2]
                                       for i in e1) <= threshold]
                # D) Find out what is best to pre-evaluate (and therefore
                # what should be injected)
                totals = OrderedDict()
                for e1, e2 in bipartitions:
                    # Is there any value in actually not pre-evaluating the
                    # expressions in /e2/ ?
                    fake_expr = ast_make_expr(Sum, list(e2) + not_injected)
                    _, save = find_save(fake_expr,
                                        expr_info) if fake_expr else (0, 0)
                    cost = sum(analysis[i][0] for i in e1)
                    totals[(e1, e2)] = save + cost
                best = min(totals, key=totals.get)
                # At this point, we can happily inject
                to_replace = {k: v[0] for k, v in injectable.items()}
                for target_expr in best[0]:
                    ast_replace(target_expr, to_replace, copy=True)
                    injected[stmt].append(target_expr)
                if best[1]:
                    # At least one non-injected expressions, let's be sure we
                    # don't unroll everything
                    should_unroll = False

        # 3) Purge the AST from now useless symbols/expressions
        if should_unroll:
            decls = visit(self.header, info_items=['decls'])['decls']
            for expr_info in self.exprs.values():
                nests = [n for n in visit(expr_info.loops_parents[0])['fors']]
                injectable_nests = [
                    n for n in nests if list(zip(*n))[0] != expr_info.loops
                ]
                for nest in injectable_nests:
                    unrolled = [(l, p) for l, p in nest
                                if l not in expr_info.loops]
                    for l, p in unrolled:
                        # A loop shared by several nests shows up more than
                        # once; it can only be removed the first time
                        if l in p.children:
                            p.children.remove(l)
                        for i_sym in injectable:
                            decl = decls.get(i_sym)
                            if decl and decl in p.children:
                                p.children.remove(decl)

        # 4) Split the expressions if injection has been performed. We iterate
        # over a snapshot because /self.exprs/ is modified along the way
        for stmt, expr_info in list(self.exprs.items()):
            expr_info.mode = 4
            inj_exprs = injected.get(stmt)
            if not inj_exprs:
                continue
            fissioner = ExpressionFissioner(match=inj_exprs,
                                            loops='all',
                                            perfect=True)
            new_exprs = fissioner.fission(stmt, self.exprs.pop(stmt))
            self.exprs.update(new_exprs)
            for new_stmt, new_info in new_exprs.items():
                new_info.mode = 3 if new_stmt in fissioner.matched else 4