Example #1
0
    def _ompize(self, state, **kwargs):
        """
        Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code
        """

        processed = []
        for node in state.nodes:

            # Reset denormals flag each time a parallel region is entered
            denormals = FindNodes(Denormals).visit(state.nodes)
            mapper = {
                i: List(c.Comment('DLE: moved denormals flag'))
                for i in denormals
            }

            # Handle parallelizable loops
            for tree in retrieve_iteration_tree(node):
                # Determine the number of consecutive parallelizable Iterations
                key = lambda i: i.is_Parallel and not i.is_Vectorizable
                candidates = filter_iterations(tree,
                                               key=key,
                                               stop='consecutive')
                if not candidates:
                    continue

                # Heuristic: if at least two parallel loops are available and the
                # physical core count is greater than self.thresholds['collapse'],
                # then omp-collapse the loops
                nparallel = len(candidates)
                if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                        nparallel < 2:
                    parallelism = omplang['for']
                else:
                    parallelism = omplang['collapse'](nparallel)

                root = candidates[0]
                mapper[root] = Block(header=omplang['par-region'],
                                     body=denormals +
                                     [Element(parallelism), root])

            processed.append(Transformer(mapper).visit(node))

        return {'nodes': processed}
Example #2
0
def test_tti_clusters_to_graph():
    """
    Check that a 'basic' rewrite of the main TTI cluster keeps the number of
    tensors in its trace, and that every entry of the trace either reads or is
    read by something.
    """
    solver = tti_operator()

    expressions = [
        node.expr
        for node in FindNodes(Expression).visit(solver.op_fwd.elemental_functions)
    ]
    stencils = solver.op_fwd._retrieve_stencils(expressions)
    clusters = clusterize(expressions, stencils)
    assert len(clusters) == 3

    head = clusters[0]
    expected_tensors = len(head.trace)

    rewritten = rewrite([head], mode='basic')
    assert len(rewritten) == 1

    graph = rewritten[0].trace
    # Only the output tensors (u and v) are expected to be flagged as tensors
    found_tensors = sum(1 for v in graph.values() if v.is_tensor)
    assert found_tensors == expected_tensors
    for v in graph.values():
        assert v.reads or v.readby
Example #3
0
def test_loops_ompized(fa, fb, fc, fd, t0, t1, t2, t3, exprs, expected, iters):
    """
    Check that the 'openmp' transformation decorates with an ``omp for`` pragma
    the Iterations flagged ``True`` in ``expected``, and none of the others.
    """
    scope = [fa, fb, fc, fd, t0, t1, t2, t3]
    node_exprs = [Expression(EVAL(e, *scope)) for e in exprs]
    outer, inner = iters[6], iters[7]
    ast = outer(inner(node_exprs))

    nodes = transform(ast, mode='openmp').nodes
    assert len(nodes) == 1
    iterations = FindNodes(Iteration).visit(nodes[0])
    assert len(iterations) == len(expected)

    for iteration, has_pragma in zip(iterations, expected):
        pragmas = iteration.pragmas
        if has_pragma is not True:
            # No pragma attached to this Iteration may be an 'omp for'
            for p in pragmas:
                assert 'omp for' not in p.value
            continue
        # Exactly one pragma is expected, and it must be the 'omp for'
        assert len(pragmas) == 1
        assert 'omp for' in pragmas[0].value
Example #4
0
def make_grid_accesses(node):
    """
    Build a new Iteration/Expression tree out of ``node`` in which every
    :class:`interfaces.Indexed` access to a YASK-backed function has been
    turned into a YASK grid access.
    """

    def make_grid_gets(expr):
        # Replace each YASK-backed Indexed in /expr/ with a grid "get" call; the
        # indices are processed recursively, as they may contain accesses too
        replacements = {}
        for access in retrieve_indexed(expr):
            function = access.base.function
            if not function.from_YASK:
                continue
            grid = namespace['code-grid-name'](function.name)
            indices = [INT(make_grid_gets(idx)) for idx in access.indices]
            replacements[access] = make_sharedptr_funcall(
                namespace['code-grid-get'], indices, grid)
        return expr.xreplace(replacements)

    substitutions = {}
    for e in FindNodes(Expression).visit(node):
        lhs, rhs = e.expr.args

        # Reads on the right-hand side become grid "get" calls
        rhs = make_grid_gets(rhs)

        if not e.output_function.from_YASK:
            # Writing to a scalar temporary
            substitutions[e] = Expression(e.expr.func(lhs, rhs), dtype=e.dtype)
            continue

        # The write on the left-hand side becomes a grid "put" call, taking the
        # value to be stored followed by the access indices
        grid = namespace['code-grid-name'](e.output_function.name)
        args = [rhs]
        args.extend(INT(make_grid_gets(idx)) for idx in lhs.indices)
        put = make_sharedptr_funcall(namespace['code-grid-put'], args, grid)
        substitutions[e] = Element(c.Statement(ccode(put)))

    return Transformer(substitutions).visit(node)
Example #5
0
    def _profile_sections(self, nodes):
        """Wrap perfect loop nests within the Iteration/Expression tree in C-level
        timers, recording a cost estimate for each timed section."""
        timed = {}
        for node in nodes:
            for itspace in FindSections().visit(node).keys():
                # Only the first perfect Iteration found in the section is timed
                target = next(
                    (i for i in itspace if IsPerfectIteration().visit(i)), None)
                if target is None:
                    continue

                # Insert `TimedList` block. This should come from
                # the profiler, but we do this manually for now.
                lname = 'loop_%s_%d' % (target.index, len(timed))
                timed[target] = TimedList(gname=self.profiler.varname,
                                          lname=lname,
                                          body=target)
                self.profiler.add(lname)

                # Estimate computational properties of the timed section
                # (operational intensity, memory accesses)
                found = FindNodes(Expression).visit(target)
                ops = estimate_cost([e.expr for e in found])
                memory = estimate_memory([e.expr for e in found])
                self.sections[itspace] = Profile(lname, ops, memory)

        return Transformer(timed).visit(List(body=nodes))
Example #6
0
    def _padding(self, state, **kwargs):
        """
        Introduce temporary buffers padded to the nearest multiple of the vector
        length, to maximize data alignment. At the bottom of the kernel, the
        values in the padded temporaries will be copied back into the input arrays.

        :param state: Object exposing, through its ``nodes`` attribute, the
                      Iteration/Expression trees to be transformed.
        :param kwargs: Ignored by this pass.
        :returns: A dict whose ``'nodes'`` entry holds, in this order, the trees
                  initializing the padded buffers, the transformed nodes, and the
                  trees copying the padded buffers back.
        """

        mapper = OrderedDict()
        for node in state.nodes:
            # Assess feasibility of the transformation
            handle = FindSymbols('symbolics-writes').visit(node)
            if not handle:
                continue

            # The written function with the highest rank dictates which innermost
            # extent is eligible for padding
            shape = max([i.shape for i in handle], key=len)
            if not shape:
                continue

            candidates = [i for i in handle if i.shape[-1] == shape[-1]]
            if not candidates:
                continue

            # Retrieve the maximum number of items in a SIMD register when processing
            # the expressions in /node/
            exprs = FindNodes(Expression).visit(node)
            exprs = [e for e in exprs if e.output_function in candidates]
            assert len(exprs) > 0
            dtype = exprs[0].dtype
            assert all(e.dtype == dtype for e in exprs)
            try:
                simd_items = get_simd_items(dtype)
            except KeyError:
                # Fallback to 16 (maximum expectable padding, for AVX512 registers).
                # Floor division is used so that the number of items stays an
                # integer under Python 3 too; a float here would leak into the
                # padded shapes computed below
                simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

            # Round the innermost extent of each candidate up to a multiple of the
            # number of SIMD items
            shapes = {
                k: k.shape[:-1] + (roundm(k.shape[-1], simd_items), )
                for k in candidates
            }
            mapper.update(
                OrderedDict([(k.indexed,
                              TensorFunction(name='p%s' % k.name,
                                             shape=shapes[k],
                                             dimensions=k.indices,
                                             onstack=k._mem_stack).indexed)
                             for k in candidates]))

        # Substitute original arrays with padded buffers
        processed = [
            SubstituteExpression(mapper).visit(n) for n in state.nodes
        ]

        # Build Iteration trees for initialization and copy-back of padded arrays;
        # only the entries whose original function is symbolic data are involved
        mapper = OrderedDict([(k, v) for k, v in mapper.items()
                              if k.function.is_SymbolicData])
        init = copy_arrays(mapper, reverse=True)
        copyback = copy_arrays(mapper)

        processed = init + as_tuple(processed) + copyback

        return {'nodes': processed}
Example #7
0
    def _ompize(self, state, **kwargs):
        """
        Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code
        """

        processed = []
        for node in state.nodes:

            # Reset denormals flag each time a parallel region is entered
            denormals = FindNodes(Denormals).visit(state.nodes)
            mapper = OrderedDict([(i, None) for i in denormals])

            # Group by outer loop so that we can embed within the same parallel region
            was_tagged = False
            groups = OrderedDict()
            for tree in retrieve_iteration_tree(node):
                # Determine the number of consecutive parallelizable Iterations
                key = lambda i: i.is_Parallel and\
                    not (i.is_Elementizable or i.is_Vectorizable)
                candidates = filter_iterations(tree, key=key, stop='asap')
                if not candidates:
                    was_tagged = False
                    continue
                # Consecutive tagged Iteration go in the same group
                is_tagged = any(i.tag is not None for i in tree)
                key = len(groups) - (is_tagged & was_tagged)
                handle = groups.setdefault(key, OrderedDict())
                handle[candidates[0]] = candidates
                was_tagged = is_tagged

            # Handle parallelizable loops
            for group in groups.values():
                private = []
                for root, tree in group.items():
                    # Heuristic: if at least two parallel loops are available and the
                    # physical core count is greater than self.thresholds['collapse'],
                    # then omp-collapse the loops
                    nparallel = len(tree)
                    if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                            nparallel < 2:
                        parallel = omplang['for']
                    else:
                        parallel = omplang['collapse'](nparallel)

                    mapper[root] = root._rebuild(pragmas=root.pragmas +
                                                 (parallel, ))

                    # Track the thread-private and thread-shared variables
                    private.extend([
                        i for i in FindSymbols('symbolics').visit(root)
                        if i.is_TensorFunction and i._mem_stack
                    ])

                # Build the parallel region
                private = sorted(set([i.name for i in private]))
                private = ('private(%s)' %
                           ','.join(private)) if private else ''
                rebuilt = [v for k, v in mapper.items() if k in group]
                par_region = Block(header=omplang['par-region'](private),
                                   body=denormals + rebuilt)
                for k, v in list(mapper.items()):
                    if isinstance(v, Iteration):
                        mapper[k] = None if v.is_Remainder else par_region

            handle = Transformer(mapper).visit(node)
            if handle is not None:
                processed.append(handle)

        return {'nodes': processed}
Example #8
0
def autotune(operator, arguments, tunable, mode='basic'):
    """
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.

    :param operator: The operator to be tuned; its compiled ``cfunction`` is run
                     once per attempted block size.
    :param arguments: Mapping from argument names to the values the operator
                      would be run with. It is not modified.
    :param tunable: The tunable arguments; each item must expose ``argument``,
                    ``original_dim`` and ``iteration``.
    :param mode: If ``'aggressive'``, more block sizes are attempted.
    :returns: An ``OrderedDict`` with the same keys as ``arguments``, in which the
              tunable entries carry the fastest block size found. If no attempted
              block size is legal, ``arguments`` is returned unchanged.
    """
    at_arguments = arguments.copy()

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    # Squeeze dimensions to minimize auto-tuning time
    iterations = FindNodes(Iteration).visit(operator.body)
    squeezable = [
        i.dim.parent.name for i in iterations
        if i.is_Sequential and i.dim.is_Buffered
    ]

    # Attempted block sizes
    mapper = OrderedDict([(i.argument.name, i) for i in tunable])
    blocksizes = [
        OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']
    ]
    if mode == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for blocksize in blocksizes:
        illegal = False
        # Only values of existing keys are overwritten here, so updating
        # /at_arguments/ while iterating over it is safe
        for k in at_arguments:
            if k in blocksize:
                val = blocksize[k]
                handle = at_arguments.get(mapper[k].original_dim.name)
                if val <= mapper[k].iteration.end(handle):
                    at_arguments[k] = val
                else:
                    # Block size cannot be larger than actual dimension
                    illegal = True
                    break
            elif k in squeezable:
                at_arguments[k] = options['at_squeezer']
        if illegal:
            continue

        # Add profiler structs
        at_arguments.update(operator._extra_arguments())

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(operator.profiler.timings.values())
        timings[tuple(blocksize.items())] = elapsed
        info_at("<%s>: %f" % (','.join('%d' % i
                                       for i in blocksize.values()), elapsed))

    if not timings:
        # All attempted block sizes were illegal: there is nothing to pick the
        # minimum from, so hand the caller's arguments back rather than crash
        info("Auto-tuning request, but couldn't find legal block sizes")
        return arguments

    # The keys of /timings/ are tuples of (name, block size) pairs
    best = dict(min(timings, key=timings.get))
    info('Auto-tuned block shape: %s' % best)

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    return tuned
Example #9
0
    def _create_elemental_functions(self, state, **kwargs):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Function`s.

        Currently, only tagged, elementizable Iteration objects are targeted.

        Each targeted sub-tree is replaced, in the main tree, by a call to a
        function named ``f_<tag>``; the function body is the sub-tree itself,
        rebuilt with symbolic ("free") iteration bounds.

        :param state: Object exposing, through its ``nodes`` attribute, the
                      Iteration/Expression trees to be transformed.
        :param kwargs: Ignored by this pass.
        :returns: A dict with the transformed trees under ``'nodes'`` and the
                  newly created functions under ``'elemental_functions'``.
        """
        # NOTE(review): presumably a compiler-specific "do not inline" decoration,
        # with the comment as fallback -- confirm against `_compiler_decoration`
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        functions = OrderedDict()
        processed = []
        for node in state.nodes:
            mapper = {}
            for tree in retrieve_iteration_tree(node, mode='superset'):
                # Search an elementizable sub-tree (if any)
                tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                           'asap')
                if not tagged:
                    continue
                root = tagged[0]
                if not root.is_Elementizable:
                    continue
                # The Iterations of this tree from /root/ onwards
                target = tree[tree.index(root):]

                # Elemental function arguments. Each entry of /args/ is a pair
                # (string passed at the call site, corresponding parameter object)
                args = []  # Found so far (scalars, tensors)
                maybe_required = set(
                )  # Scalars that *may* have to be passed in
                not_required = set(
                )  # Elemental function locally declared scalars

                # Build a new Iteration/Expression tree with free bounds
                free = []
                for i in target:
                    name, bounds = i.dim.name, i.bounds_symbolic
                    # Iteration bounds: the original bounds are passed in by the
                    # caller as the values of /<dim>_start/ and /<dim>_finish/
                    start = ScalarFunction(name='%s_start' % name,
                                           dtype=np.int32)
                    finish = ScalarFunction(name='%s_finish' % name,
                                            dtype=np.int32)
                    args.extend(
                        zip([ccode(j) for j in bounds], (start, finish)))
                    # Iteration unbounded indices: one extra scalar parameter per
                    # index, receiving the original starting point of the index
                    ufunc = [
                        ScalarFunction(name='%s_ub%d' % (name, j),
                                       dtype=np.int32)
                        for j in range(len(i.uindices))
                    ]
                    args.extend(
                        zip([ccode(j.start) for j in i.uindices], ufunc))
                    limits = [Symbol(start.name), Symbol(finish.name), 1]
                    uindices = [
                        UnboundedIndex(j.index, i.dim + as_symbol(k))
                        for j, k in zip(i.uindices, ufunc)
                    ]
                    free.append(
                        i._rebuild(limits=limits,
                                   offsets=None,
                                   uindices=uindices))
                    # The loop dimension and the unbounded indices never need to
                    # be passed in as scalar arguments
                    not_required.update({i.dim},
                                        set(j.index for j in i.uindices))

                # Construct elemental function body, and inspect it
                free = NestedTransformer(dict((zip(target, free)))).visit(root)
                expressions = FindNodes(Expression).visit(free)
                fsymbols = FindSymbols('symbolics').visit(free)

                # Retrieve symbolic arguments; the string passed at the call site
                # depends on the kind of symbolic object
                for i in fsymbols:
                    if i.is_TensorFunction:
                        args.append(
                            ("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name),
                             i))
                    elif i.is_TensorData:
                        args.append(("%s_vec" % i.name, i))
                    elif i.is_ConstantData:
                        args.append((i.name, i))

                # Retrieve scalar arguments: the free symbols of the body, plus the
                # ones appearing in the symbolic shapes, minus whatever is known
                # not to be required (scalars written by the body, the symbolic
                # objects collected above)
                not_required.update(
                    {i.output
                     for i in expressions if i.is_scalar})
                maybe_required.update(
                    set(FindSymbols(mode='free-symbols').visit(free)))
                for i in fsymbols:
                    not_required.update({as_symbol(i), i.indexify()})
                    for j in i.symbolic_shape:
                        maybe_required.update(j.free_symbols)
                required = filter_sorted(maybe_required - not_required,
                                         key=attrgetter('name'))
                args.extend([(i.name,
                              ScalarFunction(name=i.name, dtype=np.int32))
                             for i in required])

                # Split the pairs into call-site strings and function parameters
                call, params = zip(*args)
                handle = flatten([p.rtargs for p in params])
                name = "f_%d" % root.tag

                # Produce the new FunCall
                mapper[root] = List(header=noinline, body=FunCall(name, call))

                # Produce the new Function; if one with the same name was already
                # registered, the existing one is kept
                functions.setdefault(
                    name, Function(name, free, 'void', handle, ('static', )))

            # Transform the main tree
            processed.append(Transformer(mapper).visit(node))

        return {'nodes': processed, 'elemental_functions': functions.values()}
Example #10
0
def autotune(operator, arguments, tunable):
    """
    Empirically auto-tune the block sizes of ``operator``.

    Acting as a high-order function, the operator is run once for each legal
    candidate block size, over a shrunk sequential iteration space; the fastest
    block size is retained. ``arguments`` are the operator arguments, some of
    which are marked as ``tunable``. The tuned argument list is returned; if
    auto-tuning is not possible, ``arguments`` is returned as is.
    """
    at_arguments = arguments.copy()

    # User-provided output data must not be altered
    output_names = [i.name for i in operator.output]
    for name, value in arguments.items():
        if name in output_names:
            at_arguments[name] = value.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    dim_mapper = {i.dim.name: i.dim for i in iterations}

    # Shrink the iteration space of sequential dimensions so that auto-tuner
    # runs take a negligible amount of time
    sequentials = [i for i in iterations if i.is_Sequential]
    if len(sequentials) > 1:
        info_at("Couldn't understand loop structure, giving up auto-tuning")
        return arguments
    if len(sequentials) == 0:
        timesteps = 1
    else:
        sequential = sequentials[0]
        if sequential.dim.is_Buffered:
            squeeze = sequential.dim.parent
        else:
            squeeze = sequential.dim
        timesteps = sequential.extent(finish=options['at_squeezer'])
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps + 1
            info_at("Adjusted auto-tuning timestep to %d" % timesteps)
        at_arguments[squeeze.symbolic_size.name] = timesteps

    # Attempted block sizes
    mapper = OrderedDict((i.argument.symbolic_size.name, i) for i in tunable)
    blocksizes = []
    for size in options['at_blocksize']:
        blocksizes.append(OrderedDict((name, size) for name in mapper))
    if configuration['autotuning'] == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    functions = FindSymbols('symbolics').visit(operator.body +
                                               operator.elemental_functions)
    stack_items = sum(reduce(mul, f.shape, 1) for f in functions
                      if f.is_TensorFunction and f._mem_stack)
    stack_space = stack_items*operator.dtype().itemsize

    # Note: there is only a single loop over the block sizes because only
    # square blocks are tested
    timings = OrderedDict()
    for bs in blocksizes:
        legal = True
        for name in at_arguments:
            if name not in bs:
                continue
            candidate = bs[name]
            bound = at_arguments.get(mapper[name].original_dim.symbolic_size.name)
            if not (candidate <= mapper[name].iteration.end(bound)):
                # Block size cannot be larger than actual dimension
                legal = False
                break
            at_arguments[name] = candidate
        if not legal:
            continue

        # Make sure we remain within stack bounds, otherwise skip block size
        dim_sizes = {}
        for name, value in at_arguments.items():
            if name in bs:
                dim_sizes[mapper[name].argument.symbolic_size] = bs[name]
            elif name in dim_mapper:
                dim_sizes[dim_mapper[name].symbolic_size] = value
        try:
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            # /stack_space/ has no `xreplace`, so there is nothing to substitute
            bs_stack_space = stack_space
        try:
            overflow = int(bs_stack_space) > options['at_stack_limit']
        except TypeError:
            # We should never get here
            info_at("Couldn't determine stack size, skipping block size %s" % str(bs))
            continue
        if overflow:
            continue

        # Use AT-specific profiler structs
        at_arguments[operator.profiler.varname] = operator.profiler.setup()

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(operator.profiler.timings.values())
        timings[tuple(bs.items())] = elapsed
        shape = ','.join('%d' % i for i in bs.values())
        info_at("Block shape <%s> took %f (s) in %d time steps" %
                (shape, elapsed, timesteps))

    try:
        fastest = min(timings, key=timings.get)
    except ValueError:
        # No timings were collected, i.e., no block size turned out to be legal
        info("Auto-tuning request, but couldn't find legal block sizes")
        return arguments
    best = dict(fastest)
    info("Auto-tuned block shape: %s" % best)

    # Build the new argument list
    tuned = OrderedDict((name, best[name] if name in mapper else value)
                        for name, value in arguments.items())

    # Reset the profiling struct
    assert operator.profiler.varname in tuned
    tuned[operator.profiler.varname] = operator.profiler.setup()

    return tuned
Example #11
0
def create_profile(node):
    """
    Create a :class:`Profiler` for the Iteration/Expression tree ``node``.
    The following code sections are profiled: ::

        * The whole ``node``;
        * A sequence of perfectly nested loops that have common :class:`Iteration`
          dimensions, but possibly different extent. For example: ::

            for x = 0 to N
              ..
            for x = 1 to N-1
              ..

          Both Iterations have dimension ``x``, and will be profiled as a single
          section, though their extent is different.
        * Any perfectly nested loops.

    Return the transformed tree, in which the profiled sections are wrapped in
    C-level timers, together with the :class:`Profiler` tracking them.
    """
    profiler = Profiler()

    # Group by root Iteration
    by_root = OrderedDict()
    for itspace in FindSections().visit(node):
        by_root.setdefault(itspace[0], []).append(itspace)

    def dims(itspace):
        return set([i.dim for i in itspace])

    # Consecutive sections sharing a root Iteration and spanning the same set of
    # dimensions end up in the same timing section
    found = []
    for itspaces in by_root.values():
        current = []
        for item in itspaces:
            if current and not dims(item) == dims(current[0]):
                # Found a timing section
                found.append(tuple(current))
                current = []
            current.append(item)
        if current:
            found.append(tuple(current))

    # Create and track C-level timers
    timers = OrderedDict()
    for n, group in enumerate(found):
        name = 'section_%d' % n
        section = group[0]

        # Time from the second Iteration if the outermost one is not perfect
        skip_outer = len(section) > 1 and not IsPerfectIteration().visit(section[0])
        index = 1 if skip_outer else 0
        root = section[index]

        # Prepare to transform the Iteration/Expression tree: the first section
        # is replaced by the timer, which embeds all of the sections in the
        # group; the other sections are dropped from their original position
        body = tuple(j[index] for j in group)
        timers[root] = TimedList(gname=profiler.varname, lname=name, body=body)
        for j in group[1:]:
            timers[j[index]] = None

        # Estimate computational properties of the profiled section
        expressions = FindNodes(Expression).visit(body)
        ops = estimate_cost([e.expr for e in expressions])
        memory = estimate_memory([e.expr for e in expressions])

        # Keep track of the new profiled section
        profiler.add(name, section, ops, memory)

    # Transform the Iteration/Expression tree introducing the C-level timers
    return Transformer(timers).visit(node), profiler
Example #12
0
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Return a list with the transformed ``unfolded`` trees followed by the
    transformed ``root``.

    Examples
    ========
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for n, tree in enumerate(unfolded):
        assert len(tree) == len(root)
        shrunk_tree = []
        shifted_root = []
        mapper = {}

        # "Shrink" the iteration space: the folded tree now starts from 0, with
        # an extra index tracking the original iteration variable; the root
        # gets the mirror index, shifted the opposite way
        for t1, t2 in zip(tree, root):
            start, end, incr = t1.args['limits']
            index = Symbol('%ss%d' % (t1.index, n))

            forward = (UnboundedIndex(index, start), )
            backward = (UnboundedIndex(index, -start), )

            shrunk_tree.append(
                t1._rebuild(limits=[0, end - start, incr],
                            uindices=t1.uindices + forward))
            shifted_root.append(t2._rebuild(uindices=t2.uindices + backward))

            mapper[t1.dim] = index

        # Temporary arrays can now be moved onto the stack
        exprs = FindNodes(Expression).visit(shrunk_tree[-1])
        if not any(j.is_Remainder for j in shrunk_tree):
            depth = len(shrunk_tree)
            shape = tuple(j.bounds_symbolic[1] for j in shrunk_tree)
            for e in exprs:
                e_shape = shape + e.output_function.shape[depth:]
                e.output_function.update(shape=e_shape, onstack=True)

        # Substitute iteration variables within the folded trees
        shrunk_tree = compose_nodes(shrunk_tree)
        replaced = xreplace_indices([e.expr for e in exprs], mapper, only_rhs=True)
        subs = [e._rebuild(expr=r) for e, r in zip(exprs, replaced)]
        rewritten = Transformer(dict(zip(exprs, subs))).visit(shrunk_tree)
        processed.append(rewritten)

        # Introduce the new iteration variables within /root/
        shifted_root = compose_nodes(shifted_root)
        exprs = FindNodes(Expression).visit(shifted_root)
        candidates = [s.output for s in subs]
        replaced = xreplace_indices([e.expr for e in exprs], mapper, candidates)
        subs = [e._rebuild(expr=r) for e, r in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(shifted_root)

    return processed + [root]