Beispiel #1
0
def compute_dependency_graph(exprs):
    """
    Given an ordered list of :class:`Expression`, build a mapper from lvalues
    to reads occurring in the rvalues.
    """
    deps_graph = OrderedDict()
    writes = [e.output for e in exprs if e.is_tensor]
    for i in writes:
        for e in exprs:
            i_reads = [j for j in e.reads if as_symbol(i) == as_symbol(j)]
            deps_graph.setdefault(i, []).extend(i_reads)
    return deps_graph
Beispiel #2
0
    def __init__(self, exprs, **kwargs):
        # Check input legality
        mapper = OrderedDict([(i.lhs, i) for i in exprs])
        if len(set(mapper)) != len(mapper):
            raise DSEException(
                "Found redundant node, cannot build TemporariesGraph.")

        # Construct Temporaries, tracking reads and readby
        tensor_map = DefaultOrderedDict(list)
        for i in mapper:
            tensor_map[as_symbol(i)].append(i)
        reads = DefaultOrderedDict(set)
        readby = DefaultOrderedDict(set)
        for k, v in mapper.items():
            handle = retrieve_terminals(v.rhs)
            for i in list(handle):
                if i.is_Indexed:
                    for idx in i.indices:
                        handle |= retrieve_terminals(idx)
            reads[k].update(
                set(flatten([tensor_map.get(as_symbol(i), [])
                             for i in handle])))
            for i in reads[k]:
                readby[i].add(k)

        # Make sure read-after-writes are honored for scalar temporaries
        processed = [i for i in mapper if i.is_Indexed]
        queue = [i for i in mapper if i not in processed]
        while queue:
            k = queue.pop(0)
            if not readby[k]:
                processed.insert(0, k)
            elif all(i in processed for i in readby[k]):
                index = min(processed.index(i) for i in readby[k])
                processed.insert(index, k)
            else:
                queue.append(k)

        # Build up the TemporariesGraph
        temporaries = [(i,
                        Temporary(*mapper[i].args,
                                  inc=q_inc(mapper[i]),
                                  reads=reads[i],
                                  readby=readby[i])) for i in processed]
        super(TemporariesGraph, self).__init__(temporaries, **kwargs)

        # Determine indices along the space and time dimensions
        terms = [
            v for k, v in self.items() if v.is_tensor and not q_indirect(k)
        ]
        indices = filter_ordered(flatten([i.function.indices for i in terms]))
        self.space_indices = tuple(i for i in indices if i.is_Space)
        self.time_indices = tuple(i for i in indices if i.is_Time)
Beispiel #3
0
    def __init__(self, exprs, **kwargs):
        # Always convert to SSA
        exprs = convert_to_SSA(exprs)
        mapper = OrderedDict([(i.lhs, i) for i in exprs])
        assert len(set(mapper)) == len(exprs), "not SSA Cluster?"

        # Construct the Nodes, tracking reads and readby
        tensor_map = DefaultOrderedDict(list)
        for i in mapper:
            tensor_map[as_symbol(i)].append(i)
        reads = DefaultOrderedDict(set)
        readby = DefaultOrderedDict(set)
        for k, v in mapper.items():
            handle = retrieve_terminals(v.rhs)
            for i in list(handle):
                if i.is_Indexed:
                    for idx in i.indices:
                        handle |= retrieve_terminals(idx)
            reads[k].update(
                set(flatten([tensor_map.get(as_symbol(i), [])
                             for i in handle])))
            for i in reads[k]:
                readby[i].add(k)

        # Make sure read-after-writes are honored for scalar temporaries
        processed = [i for i in mapper if i.is_Indexed]
        queue = [i for i in mapper if i not in processed]
        while queue:
            k = queue.pop(0)
            if not readby[k]:
                processed.insert(0, k)
            elif all(i in processed for i in readby[k]):
                index = min(processed.index(i) for i in readby[k])
                processed.insert(index, k)
            else:
                queue.append(k)

        # Build up the FlowGraph
        temporaries = [(i,
                        Node(*mapper[i].args,
                             inc=q_inc(mapper[i]),
                             reads=reads[i],
                             readby=readby[i])) for i in processed]
        super(FlowGraph, self).__init__(temporaries, **kwargs)

        # Determine indices along the space and time dimensions
        terms = [
            v for k, v in self.items() if v.is_tensor and not q_indirect(k)
        ]
        indices = filter_ordered(flatten([i.function.indices for i in terms]))
        self.space_indices = tuple(i for i in indices if i.is_Space)
        self.time_indices = tuple(i for i in indices if i.is_Time)
Beispiel #4
0
 def symbolic_bounds(self):
     """A 2-tuple representing the symbolic bounds [min, max] of the Iteration."""
     _min = self.limits[0]
     _max = self.limits[1]
     try:
         _min = as_symbol(_min)
     except TypeError:
         # A symbolic expression
         pass
     try:
         _max = as_symbol(_max)
     except TypeError:
         # A symbolic expression
         pass
     return (_min, _max)
Beispiel #5
0
 def symbolic_bounds(self):
     """Return a 2-tuple representing the symbolic bounds of the object."""
     start = self.limits[0]
     end = self.limits[1]
     try:
         start = as_symbol(start)
     except TypeError:
         # Already a symbolic expression
         pass
     try:
         end = as_symbol(end)
     except TypeError:
         # Already a symbolic expression
         pass
     return (start + as_symbol(self.offsets[0]), end + as_symbol(self.offsets[1]))
Beispiel #6
0
    def __init__(self, exprs, **kwargs):
        # Always convert to SSA
        exprs = makeit_ssa(exprs)
        mapper = OrderedDict([(i.lhs, i) for i in exprs])
        assert len(set(mapper)) == len(exprs), "not SSA Cluster?"

        # Construct the Nodes, tracking reads and readby
        tensor_map = DefaultOrderedDict(list)
        for i in mapper:
            tensor_map[as_symbol(i)].append(i)
        reads = DefaultOrderedDict(set)
        readby = DefaultOrderedDict(set)
        for k, v in mapper.items():
            handle = retrieve_terminals(v.rhs)
            for i in list(handle):
                if i.is_Indexed:
                    for idx in i.indices:
                        handle |= retrieve_terminals(idx)
            reads[k].update(set(flatten([tensor_map.get(as_symbol(i), [])
                                         for i in handle])))
            for i in reads[k]:
                readby[i].add(k)

        # Make sure read-after-writes are honored for scalar nodes
        processed = [i for i in mapper if i.is_Indexed]
        queue = [i for i in mapper if i not in processed]
        while queue:
            k = queue.pop(0)
            if not readby[k] or k in readby[k]:
                processed.insert(0, k)
            elif all(i in processed for i in readby[k]):
                index = min(processed.index(i) for i in readby[k])
                processed.insert(index, k)
            else:
                queue.append(k)

        # Build up the FlowGraph
        nodes = [(i, Node(mapper[i], reads=reads[i], readby=readby[i]))
                 for i in processed]
        super(FlowGraph, self).__init__(nodes, **kwargs)

        # Determine indices along the space and time dimensions
        terms = [v for k, v in self.items() if v.is_Tensor and not q_indirect(k)]
        indices = filter_ordered(flatten([i.function.indices for i in terms]))
        self.space_indices = tuple(i for i in indices if i.is_Space)
        self.time_indices = tuple(i for i in indices if i.is_Time)
    def __init__(self, index, start=0, step=None, dim=None, expr=None):
        self.name = index
        self.index = index
        self.dim = dim
        self.expr = expr

        try:
            self.start = as_symbol(start)
        except TypeError:
            self.start = start

        try:
            if step is None:
                self.step = index + 1
            else:
                self.step = as_symbol(step)
        except TypeError:
            self.step = step
Beispiel #8
0
    def __init__(self, exprs, **kwargs):
        # Always convert to SSA
        exprs = makeit_ssa(exprs)
        mapper = OrderedDict([(i.lhs, i) for i in exprs])
        assert len(set(mapper)) == len(exprs), "not SSA Cluster?"

        # Construct the Nodes, tracking reads and readby
        tensor_map = DefaultOrderedDict(list)
        for i in mapper:
            tensor_map[as_symbol(i)].append(i)
        reads = DefaultOrderedDict(set)
        readby = DefaultOrderedDict(set)
        for k, v in mapper.items():
            handle = retrieve_terminals(v.rhs, deep=True)
            reads[k].update(
                set(flatten([tensor_map.get(as_symbol(i), [])
                             for i in handle])))
            for i in reads[k]:
                readby[i].add(k)

        # Make sure read-after-writes are honored for scalar nodes
        processed = [i for i in mapper if i.is_Indexed]
        queue = [i for i in mapper if i not in processed]
        while queue:
            k = queue.pop(0)
            if not readby[k] or k in readby[k]:
                processed.insert(0, k)
            elif all(i in processed for i in readby[k]):
                index = min(processed.index(i) for i in readby[k])
                processed.insert(index, k)
            else:
                queue.append(k)

        # Build up the FlowGraph
        nodes = [(i, Node(mapper[i], reads=reads[i], readby=readby[i]))
                 for i in processed]
        super(FlowGraph, self).__init__(nodes, **kwargs)
Beispiel #9
0
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    --------
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - x_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # We can optimize the folded trees only iff:
        # test0 := they compute temporary arrays, but not if they compute input data
        # test1 := the outer Iterations have actually been blocked
        exprs = FindNodes(Expression).visit(tree)
        writes = [j.write for j in exprs if j.is_tensor]
        test0 = not all(j.is_Array for j in writes)
        test1 = any(not isinstance(j.limits[0], BlockDimension) for j in root)
        if test0 or test1:
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        # Shrink the iteration space
        modified_tree = []
        modified_root = []
        modified_dims = {}
        mapper = {}
        for t, r in zip(tree, root):
            udim0 = IncrDimension(t.dim, t.symbolic_min, 1, "%ss%d" % (t.index, i))
            modified_tree.append(t._rebuild(limits=(0, t.limits[1] - t.limits[0], t.step),
                                            uindices=t.uindices + (udim0,)))

            mapper[t.dim] = udim0

            udim1 = IncrDimension(t.dim, 0, 1, "%ss%d" % (t.index, i))
            modified_root.append(r._rebuild(uindices=r.uindices + (udim1,)))

            d = r.limits[0]
            assert isinstance(d, BlockDimension)
            modified_dims[d.root] = d

        # Temporary arrays can now be moved onto the stack
        for w in writes:
            dims = tuple(modified_dims.get(d, d) for d in w.dimensions)
            shape = tuple(d.symbolic_size for d in dims)
            w.update(shape=shape, dimensions=dims, scope='stack')

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs], mapper,
                                    lambda i: i.function not in writes, True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]
Beispiel #10
0
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        functions = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            maybe_required = set()  # Scalars that *may* have to be passed in
            not_required = set()  # Elemental function locally declared scalars

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                args.extend(zip([ccode(j) for j in bounds], (start, finish)))
                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
                limits = [Symbol(start.name), Symbol(finish.name), 1]
                uindices = [
                    UnboundedIndex(j.index, i.dim + as_symbol(k))
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=limits, offsets=None, uindices=uindices))
                not_required.update({i.dim}, set(j.index for j in i.uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict((zip(target, free)))).visit(root)
            expressions = FindNodes(Expression).visit(free)
            fsymbols = FindSymbols('symbolics').visit(free)

            # Add all definitely-required arguments
            not_required.update({i.output for i in expressions if i.is_scalar})
            for i in fsymbols:
                if i in not_required:
                    continue
                elif i.is_Array:
                    args.append(
                        ("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name), i))
                elif i.is_TensorFunction:
                    args.append(("%s_vec" % i.name, i))
                elif i.is_Scalar:
                    args.append((i.name, i))

            # Add all maybe-required arguments that turn out to be required
            maybe_required.update(
                set(FindSymbols(mode='free-symbols').visit(free)))
            for i in fsymbols:
                not_required.update({as_symbol(i), i.indexify()})
                for j in i.symbolic_shape:
                    maybe_required.update(j.free_symbols)
            required = filter_sorted(maybe_required - not_required,
                                     key=attrgetter('name'))
            args.extend([(i.name, Scalar(name=i.name, dtype=i.dtype))
                         for i in required])

            call, params = zip(*args)
            handle = flatten([p.rtargs for p in params])
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable
            functions.setdefault(
                name, Callable(name, free, 'void', handle, ('static', )))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Beispiel #11
0
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    ========
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - x_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)
        modified_tree = []
        modified_root = []
        mapper = {}

        # "Shrink" the iteration space
        for t1, t2 in zip(tree, root):
            index = Symbol('%ss%d' % (t1.index, i))
            mapper[t1.dim] = index

            t1_uindex = (UnboundedIndex(index, t1.limits[0]), )
            t2_uindex = (UnboundedIndex(index, -t1.limits[0]), )

            limits = (0, t1.limits[1] - t1.limits[0], t1.incr_symbolic)
            modified_tree.append(
                t1._rebuild(limits=limits, uindices=t1.uindices + t1_uindex))

            modified_root.append(t2._rebuild(uindices=t2.uindices + t2_uindex))

        # Temporary arrays can now be moved onto the stack
        exprs = FindNodes(Expression).visit(modified_tree[-1])
        if all(not j.is_Remainder for j in modified_tree):
            dimensions = tuple(j.limits[0] for j in modified_root)
            for j in exprs:
                if j.write.is_Array:
                    j_dimensions = dimensions + j.write.dimensions[
                        len(modified_root):]
                    j_shape = tuple(k.symbolic_size for k in j_dimensions)
                    j.write.update(shape=j_shape,
                                   dimensions=j_dimensions,
                                   onstack=True)

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs],
                                    mapper,
                                    only_rhs=True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(
            Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper,
                                    candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]
Beispiel #12
0
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        functions = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            defined_args = {}  # Map of argument values defined by loop bounds

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                defined_args[start.name] = bounds[0]
                defined_args[finish.name] = bounds[1]

                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                defined_args.update(
                    {uf.name: j.start
                     for uf, j in zip(ufunc, i.uindices)})
                limits = [
                    Scalar(name=start.name, dtype=np.int32),
                    Scalar(name=finish.name, dtype=np.int32), 1
                ]
                uindices = [
                    UnboundedIndex(j.index, i.dim + as_symbol(k))
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=limits, offsets=None, uindices=uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict((zip(target, free)))).visit(root)

            # Insert array casts for all non-defined
            f_symbols = FindSymbols('symbolics').visit(free)
            defines = [s.name for s in FindSymbols('defines').visit(free)]
            casts = [
                ArrayCast(f) for f in f_symbols
                if f.is_Tensor and f.name not in defines
            ]
            free = (List(body=casts), free)

            for i in derive_parameters(free):
                if i.name in defined_args:
                    args.append((defined_args[i.name], i))
                elif i.is_Dimension:
                    d = Scalar(name=i.name, dtype=i.dtype)
                    args.append((d, d))
                else:
                    args.append((i, i))

            call, params = zip(*args)
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable
            functions.setdefault(
                name,
                Callable(name, free, 'void', flatten(params), ('static', )))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Beispiel #13
0
    def _create_efuncs(self, nodes, state):
        """
        Extract Iteration sub-trees and turn them into Calls+Callables.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        efuncs = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Build a new Iteration/Expression tree with free bounds
            free = []
            defined_args = {}  # Map of argument values defined by loop bounds
            for i in target:
                name, bounds = i.dim.name, i.symbolic_bounds
                # Iteration bounds
                _min = Scalar(name='%sf_m' % name,
                              dtype=np.int32,
                              is_const=True)
                _max = Scalar(name='%sf_M' % name,
                              dtype=np.int32,
                              is_const=True)
                defined_args[_min.name] = bounds[0]
                defined_args[_max.name] = bounds[1]

                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                defined_args.update({
                    uf.name: j.symbolic_min
                    for uf, j in zip(ufunc, i.uindices)
                })
                uindices = [
                    IncrDimension(j.parent, i.dim + as_symbol(k), 1, j.name)
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=(_min, _max, 1),
                               offsets=None,
                               uindices=uindices))

            # Construct elemental function body
            free = Transformer(dict((zip(target, free))),
                               nested=True).visit(root)
            items = FindSymbols().visit(free)

            # Insert array casts
            casts = [ArrayCast(i) for i in items if i.is_Tensor]
            free = List(body=casts + [free])

            # Insert declarations
            external = [i for i in items if i.is_Array]
            free = iet_insert_C_decls(free, external)

            # Create the Callable
            name = "f_%d" % root.tag
            params = derive_parameters(free)
            efuncs.setdefault(name,
                              Callable(name, free, 'void', params, 'static'))

            # Create the Call
            args = [defined_args.get(i.name, i) for i in params]
            mapper[root] = List(header=noinline, body=Call(name, args))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'efuncs': efuncs.values()}
Beispiel #14
0
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    --------
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - x_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # We can optimize the folded trees only if they compute temporary
        # arrays, but not if they compute input data
        exprs = FindNodes(Expression).visit(tree[-1])
        writes = [j.write for j in exprs if j.is_tensor]
        if not all(j.is_Array for j in writes):
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        modified_tree = []
        modified_root = []
        mapper = {}

        # "Shrink" the iteration space
        for t1, t2 in zip(tree, root):
            t1_udim = IncrDimension(t1.dim, t1.symbolic_min, 1, "%ss%d" % (t1.index, i))
            limits = (0, t1.limits[1] - t1.limits[0], t1.step)
            modified_tree.append(t1._rebuild(limits=limits,
                                             uindices=t1.uindices + (t1_udim,)))

            t2_udim = IncrDimension(t1.dim, 0, 1, "%ss%d" % (t1.index, i))
            modified_root.append(t2._rebuild(uindices=t2.uindices + (t2_udim,)))

            mapper[t1.dim] = t1_udim

        # Temporary arrays can now be moved onto the stack
        dimensions = tuple(j.limits[0] for j in modified_root)
        for j in writes:
            if j.is_Array:
                j_dimensions = dimensions + j.dimensions[len(modified_root):]
                j_shape = tuple(k.symbolic_size for k in j_dimensions)
                j.update(shape=j_shape, dimensions=j_dimensions, scope='stack')

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs], mapper, only_rhs=True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]