    def find_substitution(expr):
        if isinstance(expr, Subscript):
            v = expr.aggregate.name
        elif isinstance(expr, Variable):
            v = expr.name
        else:
            return expr

        if v != var_name:
            return expr

        index_key = extract_index_key(expr)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        _, my_common_factors = common_factors[cf_index]

        if my_common_factors is not None:
            return flattened_product(
                    [unif_subst_map(cf) for cf in my_common_factors]
                    + [expr])
        else:
            return expr
Example #2
def disambiguate_identifiers(statements_a,
                             statements_b,
                             should_disambiguate_name=None):
    if should_disambiguate_name is None:

        def should_disambiguate_name(name):
            return True

    from pymbolic.imperative.analysis import get_all_used_identifiers

    id_a = get_all_used_identifiers(statements_a)
    id_b = get_all_used_identifiers(statements_b)

    from pytools import UniqueNameGenerator
    vng = UniqueNameGenerator(id_a | id_b)

    from pymbolic import var
    subst_b = {}
    for clash in id_a & id_b:
        if should_disambiguate_name(clash):
            unclash = vng(clash)
            subst_b[clash] = var(unclash)

    from pymbolic.mapper.substitutor import (make_subst_func,
                                             SubstitutionMapper)
    subst_map = SubstitutionMapper(make_subst_func(subst_b))

    statements_b = [stmt.map_expressions(subst_map) for stmt in statements_b]

    return statements_b, subst_b
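
The pattern above needs nothing beyond pytools and pymbolic. A minimal standalone sketch, with made-up identifier sets standing in for the result of get_all_used_identifiers:

from pytools import UniqueNameGenerator
from pymbolic import var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

# hypothetical identifier sets for the two statement streams
id_a = {"t", "rhs"}
id_b = {"t", "state"}

vng = UniqueNameGenerator(id_a | id_b)

subst_b = {}
for clash in id_a & id_b:      # here: {"t"}
    unclash = vng(clash)       # a fresh name that collides with neither set
    subst_b[clash] = var(unclash)

# expressions from the "b" side can now be rewritten clash-free
subst_map = SubstitutionMapper(make_subst_func(subst_b))
print(subst_map(var("t") + var("state")))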
Example #3
    def _get_subst_rule_key(self, args, body):
        subst_dict = dict((arg, RuleArgument(i)) for i, arg in enumerate(args))

        from pymbolic.mapper.substitutor import make_subst_func
        arg_subst_map = SubstitutionMapper(make_subst_func(subst_dict))

        return arg_subst_map(body)
Example #4
def rename_argument(kernel, old_name, new_name, existing_ok=False):
    """
    .. versionadded:: 2016.2
    """

    var_name_gen = kernel.get_var_name_generator()

    if old_name not in kernel.arg_dict:
        raise LoopyError("old arg name '%s' does not exist" % old_name)

    does_exist = var_name_gen.is_name_conflicting(new_name)

    if does_exist and not existing_ok:
        raise LoopyError(
            "argument name '%s' conflicts with an existing identifier"
            "--cannot rename" % new_name)

    # {{{ instructions

    from pymbolic import var
    subst_dict = {old_name: var(new_name)}

    from loopy.symbolic import (RuleAwareSubstitutionMapper,
                                SubstitutionRuleMappingContext)
    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, var_name_gen)
    smap = RuleAwareSubstitutionMapper(rule_mapping_context,
                                       make_subst_func(subst_dict),
                                       within=lambda kernel, insn, stack: True)

    kernel = rule_mapping_context.finish_kernel(smap.map_kernel(kernel))

    # }}}

    # {{{ args

    new_args = []
    for arg in kernel.args:
        if arg.name == old_name:
            arg = arg.copy(name=new_name)

        new_args.append(arg)

    # }}}

    # {{{ domain

    new_domains = []
    for dom in kernel.domains:
        dom_var_dict = dom.get_var_dict()
        if old_name in dom_var_dict:
            dt, pos = dom_var_dict[old_name]
            dom = dom.set_dim_name(dt, pos, new_name)

        new_domains.append(dom)

    # }}}

    return kernel.copy(domains=new_domains, args=new_args)
Example #5
    def map_substitution(self, name, tag, arguments, expn_state):
        if tag is None:
            tags = None
        else:
            tags = (tag,)

        new_stack = expn_state.stack + ((name, tags),)

        if self.within(expn_state.kernel, expn_state.instruction, new_stack):
            # expand
            rule = self.rules[name]

            new_expn_state = expn_state.copy(
                    stack=new_stack,
                    arg_context=self.make_new_arg_context(
                        name, rule.arguments, arguments, expn_state.arg_context))

            result = self.rec(rule.expression, new_expn_state)

            # substitute in argument values
            from pymbolic.mapper.substitutor import make_subst_func
            subst_map = SubstitutionMapper(make_subst_func(
                new_expn_state.arg_context))

            return subst_map(result)

        else:
            # do not expand
            return super(RuleAwareSubstitutionRuleExpander, self).map_substitution(
                    name, tag, arguments, expn_state)
Example #6
    def map_substitution(self, name, tag, arguments, expn_state):
        if tag is None:
            tags = None
        else:
            tags = (tag,)

        new_stack = expn_state.stack + ((name, tags),)

        if self.within(expn_state.kernel, expn_state.instruction, new_stack):
            # expand
            rule = self.rules[name]

            new_expn_state = expn_state.copy(
                    stack=new_stack,
                    arg_context=self.make_new_arg_context(
                        name, rule.arguments, arguments, expn_state.arg_context))

            result = self.rec(rule.expression, new_expn_state)

            # substitute in argument values
            from pymbolic.mapper.substitutor import make_subst_func
            subst_map = SubstitutionMapper(make_subst_func(
                new_expn_state.arg_context))

            return subst_map(result)

        else:
            # do not expand
            return super(RuleAwareSubstitutionRuleExpander, self).map_substitution(
                    name, tag, arguments, expn_state)
Example #7
def disambiguate_identifiers(statements_a, statements_b,
        should_disambiguate_name=None):
    if should_disambiguate_name is None:
        def should_disambiguate_name(name):  # pylint:disable=function-redefined
            return True

    from pymbolic.imperative.analysis import get_all_used_identifiers

    id_a = get_all_used_identifiers(statements_a)
    id_b = get_all_used_identifiers(statements_b)

    from pytools import UniqueNameGenerator
    vng = UniqueNameGenerator(id_a | id_b)

    from pymbolic import var
    subst_b = {}
    for clash in id_a & id_b:
        if should_disambiguate_name(clash):
            unclash = vng(clash)
            subst_b[clash] = var(unclash)

    from pymbolic.mapper.substitutor import (
            make_subst_func, SubstitutionMapper)
    subst_map = SubstitutionMapper(make_subst_func(subst_b))

    statements_b = [
            stmt.map_expressions(subst_map) for stmt in statements_b]

    return statements_b, subst_b
Example #8
    def find_substitution(expr):
        if isinstance(expr, Subscript):
            v = expr.aggregate.name
        elif isinstance(expr, Variable):
            v = expr.name
        else:
            return expr

        if v != var_name:
            return expr

        index_key = extract_index_key(expr)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        _, my_common_factors = common_factors[cf_index]

        if my_common_factors is not None:
            return flattened_product(
                    [unif_subst_map(cf) for cf in my_common_factors]
                    + [expr])
        else:
            return expr
Example #9
def get_loopy_instructions_as_maxima(kernel, prefix):
    """Sample use for code comparison::

        load("knl-optFalse.mac");
        load("knl-optTrue.mac");

        vname: bessel_j_8;

        un_name : concat(''un_, vname);
        opt_name : concat(''opt_, vname);

        print(ratsimp(ev(un_name - opt_name)));
    """
    from loopy.preprocess import add_boostability_and_automatic_dependencies
    kernel = add_boostability_and_automatic_dependencies(kernel)

    my_variable_names = (
            avn
            for insn in kernel.instructions
            for avn, _ in insn.assignees_and_indices()
            )

    from pymbolic import var
    subst_dict = dict(
            (vn, var(prefix+vn)) for vn in my_variable_names)

    mstr = MaximaStringifyMapper()
    from loopy.symbolic import SubstitutionMapper
    from pymbolic.mapper.substitutor import make_subst_func
    substitute = SubstitutionMapper(make_subst_func(subst_dict))

    result = ["ratprint:false;"]

    written_insn_ids = set()

    from loopy.kernel import InstructionBase, ExpressionInstruction

    def write_insn(insn):
        if not isinstance(insn, InstructionBase):
            insn = kernel.id_to_insn[insn]
        if not isinstance(insn, ExpressionInstruction):
            raise RuntimeError("non-expression instructions not supported "
                    "in maxima export")

        for dep in insn.insn_deps:
            if dep not in written_insn_ids:
                write_insn(dep)

        (aname, _), = insn.assignees_and_indices()
        result.append("%s%s : %s;" % (
            prefix, aname,
            mstr(substitute(insn.expression))))

        written_insn_ids.add(insn.id)

    for insn in kernel.instructions:
        if insn.id not in written_insn_ids:
            write_insn(insn)

    return "\n".join(result)
Example #10
def substitute(expression: Any, variable_assigments: Mapping[str, Any]) -> Any:
    """Perform variable substitution in an expression.

    :param expression: A scalar expression, or an expression derived from such
        (e.g., a tuple of scalar expressions)
    :param variable_assigments: A mapping from variable names to substitutions
    """
    from pymbolic.mapper.substitutor import make_subst_func
    return SubstitutionMapper(make_subst_func(variable_assigments))(expression)
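
For reference, the underlying pymbolic machinery used by this and most of the other examples can be exercised on its own. A minimal sketch with an arbitrary expression and mapping:

from pymbolic import parse, var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

expr = parse("a*x + b")

# the lookup function built by make_subst_func returns None for names it
# does not know, so "a" and "b" pass through unchanged
subst = SubstitutionMapper(make_subst_func({"x": var("y") + 1}))
print(subst(expr))   # a*(y + 1) + b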
Example #11
def substitute(expression, variable_assignments=None, **kwargs):
    if variable_assignments is None:
        variable_assignments = {}
    variable_assignments = variable_assignments.copy()
    variable_assignments.update(kwargs)

    from pymbolic.mapper.substitutor import make_subst_func
    return SubstitutionMapper(
        make_subst_func(variable_assignments))(expression)
Example #12
    def make_new_arg_context(rule_name, arg_names, arguments, arg_context):
        if len(arg_names) != len(arguments):
            raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)"
                    % (rule_name, len(arguments), len(arg_names), ))

        from pymbolic.mapper.substitutor import make_subst_func
        arg_subst_map = SubstitutionMapper(make_subst_func(arg_context))
        return dict(
                (formal_arg_name, arg_subst_map(arg_value))
                for formal_arg_name, arg_value in zip(arg_names, arguments))
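
The returned dictionary binds each formal rule argument to its (already substituted) actual argument. A standalone sketch of that binding step using plain pymbolic and invented names:

from pymbolic import parse, var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

arg_names = ("x", "y")                        # formal arguments of a hypothetical rule
arguments = (parse("i + 1"), var("alpha"))    # actual arguments at the call site

arg_context = dict(zip(arg_names, arguments))

# expanding the rule body is then a single substitution pass
body = parse("x*y + x")
print(SubstitutionMapper(make_subst_func(arg_context))(body))
# (i + 1)*alpha + i + 1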
Example #13
    def make_new_arg_context(rule_name, arg_names, arguments, arg_context):
        if len(arg_names) != len(arguments):
            raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)"
                    % (rule_name, len(arguments), len(arg_names), ))

        from pymbolic.mapper.substitutor import make_subst_func
        arg_subst_map = SubstitutionMapper(make_subst_func(arg_context))
        return dict(
                (formal_arg_name, arg_subst_map(arg_value))
                for formal_arg_name, arg_value in zip(arg_names, arguments))
Example #14
def get_loopy_instructions_as_maxima(kernel, prefix):
    """Sample use for code comparison::

        load("knl-optFalse.mac");
        load("knl-optTrue.mac");

        vname: bessel_j_8;

        un_name : concat(''un_, vname);
        opt_name : concat(''opt_, vname);

        print(ratsimp(ev(un_name - opt_name)));
    """
    from loopy.preprocess import add_boostability_and_automatic_dependencies
    kernel = add_boostability_and_automatic_dependencies(kernel)

    my_variable_names = (avn for insn in kernel.instructions
                         for avn in insn.assignee_var_names())

    from pymbolic import var
    subst_dict = dict((vn, var(prefix + vn)) for vn in my_variable_names)

    mstr = MaximaStringifyMapper()
    from loopy.symbolic import SubstitutionMapper
    from pymbolic.mapper.substitutor import make_subst_func
    substitute = SubstitutionMapper(make_subst_func(subst_dict))

    result = ["ratprint:false;"]

    written_insn_ids = set()

    from loopy.kernel import InstructionBase, Assignment

    def write_insn(insn):
        if not isinstance(insn, InstructionBase):
            insn = kernel.id_to_insn[insn]
        if not isinstance(insn, Assignment):
            raise RuntimeError("non-single-output assignment not supported "
                               "in maxima export")

        for dep in insn.depends_on:
            if dep not in written_insn_ids:
                write_insn(dep)

        aname, = insn.assignee_var_names()
        result.append("%s%s : %s;" %
                      (prefix, aname, mstr(substitute(insn.expression))))

        written_insn_ids.add(insn.id)

    for insn in kernel.instructions:
        if insn.id not in written_insn_ids:
            write_insn(insn)

    return "\n".join(result)
Example #15
    def process_expression_for_loopy(self, expr):
        from pymbolic.mapper.substitutor import make_subst_func
        from loopy.symbolic import SubstitutionMapper

        submap = SubstitutionMapper(make_subst_func(self.active_iname_aliases))

        expr = submap(expr)

        subshift = SubscriptIndexBaseShifter(self)
        expr = subshift(expr)

        return expr
Example #16
def _apply_renames_in_exprs(kernel, var_renames):
    from loopy.symbolic import (SubstitutionRuleMappingContext,
                                RuleAwareSubstitutionMapper)
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.match import parse_stack_match

    srmc = SubstitutionRuleMappingContext(kernel.substitutions,
                                          kernel.get_var_name_generator())
    subst_map = RuleAwareSubstitutionMapper(srmc,
                                            make_subst_func(var_renames),
                                            within=parse_stack_match(None))
    return subst_map.map_kernel(kernel)
Example #17
    def process_expression_for_loopy(self, expr):
        from pymbolic.mapper.substitutor import make_subst_func
        from loopy.symbolic import SubstitutionMapper

        submap = SubstitutionMapper(
                make_subst_func(self.active_iname_aliases))

        expr = submap(expr)

        subshift = SubscriptIndexBaseShifter(self)
        expr = subshift(expr)

        return expr
Example #18
def _apply_renames_in_exprs(kernel, var_renames):
    from loopy.symbolic import (
            SubstitutionRuleMappingContext,
            RuleAwareSubstitutionMapper)
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.match import parse_stack_match

    srmc = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    subst_map = RuleAwareSubstitutionMapper(
            srmc, make_subst_func(var_renames),
            within=parse_stack_match(None))
    return subst_map.map_kernel(kernel)
Example #19
    def map_substitution(self, name, rule, arguments):
        if len(rule.arguments) != len(arguments):
            from loopy.diagnostic import LoopyError
            raise LoopyError("number of arguments to '%s' does not match "
                             "definition" % name)

        from pymbolic.mapper.substitutor import make_subst_func
        submap = SubstitutionMapper(
            make_subst_func(dict(zip(rule.arguments, arguments))))

        expr = submap(rule.expression)

        return self.rec(expr)
Example #20
def subst_into_pwqpolynomial(new_space, poly, subst_dict):
    """
    Returns an instance of :class:`islpy.PwQPolynomial` with substitutions from
    *subst_dict* substituted into *poly*.

    :arg poly: an instance of :class:`islpy.PwQPolynomial`
    :arg subst_dict: a mapping from parameters of *poly* to
        :class:`pymbolic.primitives.Expression` made up of terms comprising the
        parameters of *new_space*. The expression must be affine in the param
        dims of *new_space*.
    """
    if not poly.get_pieces():
        # pw poly is universally zero
        result = isl.PwQPolynomial.zero(
            new_space.insert_dims(dim_type.out, 0, 1))
        assert result.dim(dim_type.out) == 1
        return result

    i_begin_subst_space = poly.dim(dim_type.param)

    poly, subst_domain, subst_dict = get_param_subst_domain(
        new_space, poly, subst_dict)

    from loopy.symbolic import qpolynomial_to_expr, qpolynomial_from_expr
    new_pieces = []
    for valid_set, qpoly in poly.get_pieces():
        valid_set = valid_set & subst_domain
        if valid_set.plain_is_empty():
            continue

        valid_set = valid_set.project_out(dim_type.param, 0,
                                          i_begin_subst_space)
        from pymbolic.mapper.substitutor import (SubstitutionMapper,
                                                 make_subst_func)
        sub_mapper = SubstitutionMapper(make_subst_func(subst_dict))
        expr = sub_mapper(qpolynomial_to_expr(qpoly))
        qpoly = qpolynomial_from_expr(valid_set.space, expr)

        new_pieces.append((valid_set, qpoly))

    if not new_pieces:
        raise ValueError(
            "no pieces of PwQPolynomial survived the substitution")

    valid_set, qpoly = new_pieces[0]
    result = isl.PwQPolynomial.alloc(valid_set, qpoly)
    for valid_set, qpoly in new_pieces[1:]:
        result = result.add_disjoint(isl.PwQPolynomial.alloc(valid_set, qpoly))

    assert result.dim(dim_type.out)
    return result
Example #21
    def map_substitution(self, name, rule, arguments):
        if len(rule.arguments) != len(arguments):
            from loopy.diagnostic import LoopyError
            raise LoopyError("number of arguments to '%s' does not match "
                    "definition" % name)

        from pymbolic.mapper.substitutor import make_subst_func
        submap = SubstitutionMapper(
                make_subst_func(
                    dict(zip(rule.arguments, arguments))))

        expr = submap(rule.expression)

        return self.rec(expr)
Example #22
def loopy_substitute(expression: Any,
                     variable_assigments: Mapping[str, Any]) -> Any:
    from loopy.symbolic import SubstitutionMapper
    from pymbolic.mapper.substitutor import make_subst_func
    import pymbolic.primitives as prim

    # {{{ early exit for identity substitution

    if all(
            isinstance(v, prim.Variable) and v.name == k
            for k, v in variable_assigments.items()):
        # Nothing to do here, move on.
        return expression

    # }}}

    return SubstitutionMapper(make_subst_func(variable_assigments))(expression)
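
The early exit works because substituting a variable for itself is a no-op. A tiny sketch of the identity check on an invented mapping:

import pymbolic.primitives as prim
from pymbolic import var

assignments = {"x": var("x"), "n": var("n")}
is_identity = all(
        isinstance(v, prim.Variable) and v.name == k
        for k, v in assignments.items())
print(is_identity)   # True, so the mapper call could be skipped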
Example #23
def _update_t_by_dt_factor(factor, statements):
    from dagrt.language import Assign, Nop
    from pymbolic import var
    from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

    mapper = SubstitutionMapper(make_subst_func({"<dt>":
                                                 factor * var("<dt>")}))

    def updater(stmt):
        if factor == 0:
            return Nop(id=stmt.id, depends_on=stmt.depends_on)
        return stmt.map_expressions(mapper)

    return [
        stmt if (not isinstance(stmt, Assign) or stmt.lhs != var("<t>")) else
        updater(stmt) for stmt in statements
    ]
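
The <dt> rescaling itself is plain pymbolic; angle-bracketed names are ordinary variable names to the substitutor. A minimal sketch with a made-up right-hand side:

from pymbolic import var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

factor = 0.5
mapper = SubstitutionMapper(make_subst_func({"<dt>": factor * var("<dt>")}))

# a right-hand side as it might appear in an Assign to <t>
rhs = var("<t>") + var("<dt>")
print(mapper(rhs))   # <t> + 0.5*<dt>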
Example #24
    def map_reduction(self, expr, expn_state):
        within = self.within(expn_state.kernel, expn_state.instruction,
                             expn_state.stack)

        for iname in expr.inames:
            self.iname_to_red_count[iname] = (
                self.iname_to_red_count.get(iname, 0) + 1)
            if not expr.allow_simultaneous:
                self.iname_to_nonsimultaneous_red_count[iname] = (
                    self.iname_to_nonsimultaneous_red_count.get(iname, 0) + 1)

        if within and not expr.allow_simultaneous:
            subst_dict = {}

            from pymbolic import var

            new_inames = []
            for iname in expr.inames:
                if (not (self.inames is None or iname in self.inames)
                        or self.iname_to_red_count[iname] <= 1):
                    new_inames.append(iname)
                    continue

                new_iname = self.rule_mapping_context.make_unique_var_name(
                    iname)
                subst_dict[iname] = var(new_iname)
                self.old_to_new.append((iname, new_iname))
                new_inames.append(new_iname)

            from loopy.symbolic import SubstitutionMapper
            from pymbolic.mapper.substitutor import make_subst_func

            from loopy.symbolic import Reduction
            return Reduction(
                expr.operation, tuple(new_inames),
                self.rec(
                    SubstitutionMapper(make_subst_func(subst_dict))(expr.expr),
                    expn_state), expr.allow_simultaneous)
        else:
            return super(_ReductionInameUniquifier,
                         self).map_reduction(expr, expn_state)
Example #25
def rename_argument(kernel, old_name, new_name, existing_ok=False):
    """
    .. versionadded:: 2016.2
    """

    var_name_gen = kernel.get_var_name_generator()

    if old_name not in kernel.arg_dict:
        raise LoopyError("old arg name '%s' does not exist" % old_name)

    does_exist = var_name_gen.is_name_conflicting(new_name)

    if does_exist and not existing_ok:
        raise LoopyError("argument name '%s' conflicts with an existing identifier"
                "--cannot rename" % new_name)

    from pymbolic import var
    subst_dict = {old_name: var(new_name)}

    from loopy.symbolic import (
            RuleAwareSubstitutionMapper,
            SubstitutionRuleMappingContext)
    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    smap = RuleAwareSubstitutionMapper(rule_mapping_context,
                    make_subst_func(subst_dict),
                    within=lambda knl, insn, stack: True)

    kernel = smap.map_kernel(kernel)

    new_args = []
    for arg in kernel.args:
        if arg.name == old_name:
            arg = arg.copy(name=new_name)

        new_args.append(arg)

    return kernel.copy(args=new_args)
Example #26
def rename_argument(kernel, old_name, new_name, existing_ok=False):
    """
    .. versionadded:: 2016.2
    """

    var_name_gen = kernel.get_var_name_generator()

    if old_name not in kernel.arg_dict:
        raise LoopyError("old arg name '%s' does not exist" % old_name)

    does_exist = var_name_gen.is_name_conflicting(new_name)

    if does_exist and not existing_ok:
        raise LoopyError("argument name '%s' conflicts with an existing identifier"
                "--cannot rename" % new_name)

    from pymbolic import var
    subst_dict = {old_name: var(new_name)}

    from loopy.symbolic import (
            RuleAwareSubstitutionMapper,
            SubstitutionRuleMappingContext)
    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    smap = RuleAwareSubstitutionMapper(rule_mapping_context,
                    make_subst_func(subst_dict),
                    within=lambda knl, insn, stack: True)

    kernel = smap.map_kernel(kernel)

    new_args = []
    for arg in kernel.args:
        if arg.name == old_name:
            arg = arg.copy(name=new_name)

        new_args.append(arg)

    return kernel.copy(args=new_args)
Example #27
def subst_into_pwaff(new_space, pwaff, subst_dict):
    """
    Returns an instance of :class:`islpy.PwAff` with substitutions from
    *subst_dict* substituted into *pwaff*.

    :arg pwaff: an instance of :class:`islpy.PwAff`
    :arg subst_dict: a mapping from parameters of *pwaff* to
        :class:`pymbolic.primitives.Expression` made up of terms comprising the
        parameters of *new_space*. The expression must be affine in the param
        dims of *new_space*.
    """
    from pymbolic.mapper.substitutor import (SubstitutionMapper,
                                             make_subst_func)
    from loopy.symbolic import aff_from_expr, aff_to_expr
    from functools import reduce

    i_begin_subst_space = pwaff.dim(dim_type.param)
    pwaff, subst_domain, subst_dict = get_param_subst_domain(
        new_space, pwaff, subst_dict)
    subst_mapper = SubstitutionMapper(make_subst_func(subst_dict))
    pwaffs = []

    for valid_set, qpoly in pwaff.get_pieces():
        valid_set = valid_set & subst_domain
        if valid_set.plain_is_empty():
            continue

        valid_set = valid_set.project_out(dim_type.param, 0,
                                          i_begin_subst_space)
        aff = aff_from_expr(valid_set.space, subst_mapper(aff_to_expr(qpoly)))

        pwaffs.append(isl.PwAff.alloc(valid_set, aff))

    if not pwaffs:
        raise ValueError("no pieces of PwAff survived the substitution")

    return reduce(lambda pwaff1, pwaff2: pwaff1.union_add(pwaff2),
                  pwaffs).coalesce()
Example #28
    def __init__(self, kernel, domain, sweep_inames, access_descriptors,
            storage_axis_count):
        self.kernel = kernel
        self.sweep_inames = sweep_inames

        storage_axis_names = self.storage_axis_names = [
                "_loopy_storage_%d" % i for i in range(storage_axis_count)]

        # {{{ duplicate sweep inames

        # The duplication is necessary, otherwise the storage fetch
        # inames remain weirdly tied to the original sweep inames.

        self.primed_sweep_inames = [psin+"'" for psin in sweep_inames]

        from loopy.isl_helpers import duplicate_axes
        dup_sweep_index = domain.space.dim(dim_type.out)
        domain_dup_sweep = duplicate_axes(
                domain, sweep_inames,
                self.primed_sweep_inames)

        self.prime_sweep_inames = SubstitutionMapper(make_subst_func(
            dict((sin, var(psin))
                for sin, psin in zip(sweep_inames, self.primed_sweep_inames))))

        # }}}

        self.stor2sweep = build_global_storage_to_sweep_map(
                kernel, access_descriptors,
                domain_dup_sweep, dup_sweep_index,
                storage_axis_names,
                sweep_inames, self.primed_sweep_inames, self.prime_sweep_inames)

        storage_base_indices, storage_shape = compute_bounds(
                kernel, domain, self.stor2sweep, self.primed_sweep_inames,
                storage_axis_names)

        # compute augmented domain

        # {{{ filter out unit-length dimensions

        non1_storage_axis_flags = []
        non1_storage_shape = []

        for saxis, bi, l in zip(
                storage_axis_names, storage_base_indices, storage_shape):
            has_length_non1 = l != 1

            non1_storage_axis_flags.append(has_length_non1)

            if has_length_non1:
                non1_storage_shape.append(l)

        # }}}

        # {{{ subtract off the base indices
        # add the new, base-0 indices as new in dimensions

        sp = self.stor2sweep.get_space()
        stor_idx = sp.dim(dim_type.out)

        n_stor = storage_axis_count
        nn1_stor = len(non1_storage_shape)

        aug_domain = self.stor2sweep.move_dims(
                dim_type.out, stor_idx,
                dim_type.in_, 0,
                n_stor).range()

        # aug_domain space now:
        # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes']

        aug_domain = aug_domain.insert_dims(dim_type.set, stor_idx, nn1_stor)

        inew = 0
        for i, name in enumerate(storage_axis_names):
            if non1_storage_axis_flags[i]:
                aug_domain = aug_domain.set_dim_name(
                        dim_type.set, stor_idx + inew, name)
                inew += 1

        # aug_domain space now:
        # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'][n1_stor_axes]

        from loopy.symbolic import aff_from_expr
        for saxis, bi, s in zip(storage_axis_names, storage_base_indices,
                storage_shape):
            if s != 1:
                cns = isl.Constraint.equality_from_aff(
                        aff_from_expr(aug_domain.get_space(),
                            var(saxis) - (var(saxis+"'") - bi)))

                aug_domain = aug_domain.add_constraint(cns)

        # }}}

        # eliminate (primed) storage axes with non-zero base indices
        aug_domain = aug_domain.project_out(dim_type.set, stor_idx+nn1_stor, n_stor)

        # eliminate duplicated sweep_inames
        nsweep = len(sweep_inames)
        aug_domain = aug_domain.project_out(dim_type.set, dup_sweep_index, nsweep)

        self.non1_storage_axis_flags = non1_storage_axis_flags
        self.aug_domain = aug_domain
        self.storage_base_indices = storage_base_indices
        self.non1_storage_shape = non1_storage_shape
Example #29
def _fix_parameter(kernel, name, value, within=None):
    def process_set(s):
        var_dict = s.get_var_dict()

        try:
            dt, idx = var_dict[name]
        except KeyError:
            return s

        value_aff = isl.Aff.zero_on_domain(s.space) + value

        from loopy.isl_helpers import iname_rel_aff
        name_equal_value_aff = iname_rel_aff(s.space, name, "==", value_aff)

        s = (s.add_constraint(
            isl.Constraint.equality_from_aff(
                name_equal_value_aff)).project_out(dt, idx, 1))

        return s

    new_domains = [process_set(dom) for dom in kernel.domains]

    from pymbolic.mapper.substitutor import make_subst_func
    subst_func = make_subst_func({name: value})

    from loopy.symbolic import SubstitutionMapper, PartialEvaluationMapper
    subst_map = SubstitutionMapper(subst_func)
    ev_map = PartialEvaluationMapper()

    def map_expr(expr):
        return ev_map(subst_map(expr))

    from loopy.kernel.array import ArrayBase
    new_args = []
    for arg in kernel.args:
        if arg.name == name:
            # remove from argument list
            continue

        if not isinstance(arg, ArrayBase):
            new_args.append(arg)
        else:
            new_args.append(arg.map_exprs(map_expr))

    new_temp_vars = {}
    for tv in kernel.temporary_variables.values():
        new_temp_vars[tv.name] = tv.map_exprs(map_expr)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, kernel.get_var_name_generator())
    esubst_map = RuleAwareSubstitutionMapper(rule_mapping_context,
                                             subst_func,
                                             within=within)
    return (rule_mapping_context.finish_kernel(
        esubst_map.map_kernel(
            kernel,
            within=within,
            # overwritten below, no need to map
            map_tvs=False,
            map_args=False)).copy(
                domains=new_domains,
                args=new_args,
                temporary_variables=new_temp_vars,
                assumptions=process_set(kernel.assumptions),
            ))
Example #30
def extract_subst(kernel, subst_name, template, parameters=()):
    """
    :arg subst_name: The name of the substitution rule to be created.
    :arg template: Unification template expression.
    :arg parameters: An iterable of parameters used in
        *template*, or a comma-separated string of the same.

    All targeted subexpressions must match ('unify with') *template*.
    The template may contain '*' wildcards that will have to match exactly
    across all unifications.
    """

    if isinstance(template, str):
        from pymbolic import parse
        template = parse(template)

    if isinstance(parameters, str):
        parameters = tuple(
                s.strip() for s in parameters.split(","))

    var_name_gen = kernel.get_var_name_generator()

    # {{{ replace any wildcards in template with new variables

    def get_unique_var_name():
        based_on = subst_name+"_wc"

        result = var_name_gen(based_on)
        return result

    from loopy.symbolic import WildcardToUniqueVariableMapper
    wc_map = WildcardToUniqueVariableMapper(get_unique_var_name)
    template = wc_map(template)

    # }}}

    # {{{ deal with iname deps of template that are not independent_inames

    # (We call these 'matching_vars', because they have to match exactly in
    # every CSE. As above, they might need to be renamed to make them unique
    # within the kernel.)

    matching_vars = []
    old_to_new = {}

    for iname in (get_dependencies(template)
            - set(parameters)
            - kernel.non_iname_variable_names()):
        if iname in kernel.all_inames():
            # need to rename to be unique
            new_iname = var_name_gen(iname)
            old_to_new[iname] = var(new_iname)
            matching_vars.append(new_iname)
        else:
            matching_vars.append(iname)

    if old_to_new:
        template = (
                SubstitutionMapper(make_subst_func(old_to_new))
                (template))

    # }}}

    # {{{ gather up expressions

    expr_descriptors = []

    from loopy.symbolic import UnidirectionalUnifier
    unif = UnidirectionalUnifier(
            lhs_mapping_candidates=set(parameters) | set(matching_vars))

    def gather_exprs(expr, mapper):
        urecs = unif(template, expr)

        if urecs:
            if len(urecs) > 1:
                raise RuntimeError("ambiguous unification of '%s' with template '%s'"
                        % (expr, template))

            urec, = urecs

            expr_descriptors.append(
                    ExprDescriptor(
                        insn=insn,
                        expr=expr,
                        unif_var_dict=dict((lhs.name, rhs)
                            for lhs, rhs in urec.equations)))
        else:
            mapper.fallback_mapper(expr)
            # can't nest, don't recurse

    from loopy.symbolic import (
            CallbackMapper, WalkMapper, IdentityMapper)
    dfmapper = CallbackMapper(gather_exprs, WalkMapper())

    for insn in kernel.instructions:
        dfmapper(insn.assignees)
        dfmapper(insn.expression)

    for sr in six.itervalues(kernel.substitutions):
        dfmapper(sr.expression)

    # }}}

    if not expr_descriptors:
        raise RuntimeError("no expressions matching '%s'" % template)

    # {{{ substitute rule into instructions

    def replace_exprs(expr, mapper):
        found = False
        for exprd in expr_descriptors:
            if expr is exprd.expr:
                found = True
                break

        if not found:
            return mapper.fallback_mapper(expr)

        args = [exprd.unif_var_dict[arg_name]
                for arg_name in parameters]

        result = var(subst_name)
        if args:
            result = result(*args)

        return result
        # can't nest, don't recurse

    cbmapper = CallbackMapper(replace_exprs, IdentityMapper())

    new_insns = []

    for insn in kernel.instructions:
        new_insns.append(insn.with_transformed_expressions(cbmapper))

    from loopy.kernel.data import SubstitutionRule
    new_substs = {
            subst_name: SubstitutionRule(
                name=subst_name,
                arguments=tuple(parameters),
                expression=template,
                )}

    for subst in six.itervalues(kernel.substitutions):
        new_substs[subst.name] = subst.copy(
                expression=cbmapper(subst.expression))

    # }}}

    return kernel.copy(
            instructions=new_insns,
            substitutions=new_substs)
Example #31
    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):

        from pymbolic.primitives import Sum
        from cgen import Statement
        from pymbolic.mapper.stringifier import PREC_NONE

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            # atomicAdd
            if isinstance(rhs_expr, Sum):
                ecm = self.get_expression_to_code_mapper(codegen_state)

                new_rhs_expr = Sum(
                    tuple(c for c in rhs_expr.children if c != lhs_expr))
                lhs_expr_code = ecm(lhs_expr)
                rhs_expr_code = ecm(new_rhs_expr)

                return Statement("atomicAdd(&{}, {})".format(
                    lhs_expr_code, rhs_expr_code))
            else:
                from cgen import Block, DoWhile, Assign
                from loopy.target.c import POD
                old_val_var = codegen_state.var_name_generator("loopy_old_val")
                new_val_var = codegen_state.var_name_generator("loopy_new_val")

                from loopy.kernel.data import TemporaryVariable
                ecm = codegen_state.expression_to_code_mapper.with_assignments(
                    {
                        old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
                        new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
                    })

                lhs_expr_code = ecm(lhs_expr,
                                    prec=PREC_NONE,
                                    type_context=None)

                from pymbolic.mapper.substitutor import make_subst_func
                from pymbolic import var
                from loopy.symbolic import SubstitutionMapper

                subst = SubstitutionMapper(
                    make_subst_func({lhs_expr: var(old_val_var)}))
                rhs_expr_code = ecm(subst(rhs_expr),
                                    prec=PREC_NONE,
                                    type_context=rhs_type_context,
                                    needed_dtype=lhs_dtype)

                cast_str = ""
                old_val = old_val_var
                new_val = new_val_var

                if lhs_dtype.numpy_dtype.kind == "f":
                    if lhs_dtype.numpy_dtype == np.float32:
                        ctype = "int"
                    elif lhs_dtype.numpy_dtype == np.float64:
                        ctype = "long"
                    else:
                        raise AssertionError()

                    old_val = "*(%s *) &" % ctype + old_val
                    new_val = "*(%s *) &" % ctype + new_val
                    cast_str = "(%s *) " % (ctype)

                return Block([
                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                        old_val_var),
                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                        new_val_var),
                    DoWhile(
                        "atomicCAS("
                        "%(cast_str)s&(%(lhs_expr)s), "
                        "%(old_val)s, "
                        "%(new_val)s"
                        ") != %(old_val)s" % {
                            "cast_str": cast_str,
                            "lhs_expr": lhs_expr_code,
                            "old_val": old_val,
                            "new_val": new_val,
                        },
                        Block([
                            Assign(old_val_var, lhs_expr_code),
                            Assign(new_val_var, rhs_expr_code),
                        ]))
                ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
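
Note that the substitution above is keyed on a whole expression (a subscript), not on a variable name; pymbolic expressions compare and hash structurally, so the same trick can be reproduced with the stock mapper. A minimal sketch with invented names:

from pymbolic import parse, var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

lhs_expr = parse("out[i]")
rhs_expr = parse("out[i] + f(a[i])")

# replace every occurrence of the left-hand side with the temporary
subst = SubstitutionMapper(make_subst_func({lhs_expr: var("loopy_old_val")}))
print(subst(rhs_expr))   # loopy_old_val + f(a[i])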
Example #32
def affine_map_inames(kernel, old_inames, new_inames, equations):
    """Return a new *kernel* where the affine transform
    specified by *equations* has been applied to the inames.

    :arg old_inames: A list of inames to be replaced by affine transforms
        of their values.
        May also be a string of comma-separated inames.

    :arg new_inames: A list of new inames that are not yet used in *kernel*,
        but have their values established in terms of *old_inames* by
        *equations*.
        May also be a string of comma-separated inames.
    :arg equations: A list of equations establishing a relationship
        between *old_inames* and *new_inames*. Each equation may be
        a tuple ``(lhs, rhs)`` of expressions or a string, with left and
        right hand side of the equation separated by ``=``.
    """

    # {{{ check and parse arguments

    if isinstance(new_inames, str):
        new_inames = new_inames.split(",")
        new_inames = [iname.strip() for iname in new_inames]
    if isinstance(old_inames, str):
        old_inames = old_inames.split(",")
        old_inames = [iname.strip() for iname in old_inames]
    if isinstance(equations, str):
        equations = [equations]

    import re
    eqn_re = re.compile(r"^([^=]+)=([^=]+)$")

    def parse_equation(eqn):
        if isinstance(eqn, str):
            eqn_match = eqn_re.match(eqn)
            if not eqn_match:
                raise ValueError("invalid equation: %s" % eqn)

            from loopy.symbolic import parse
            lhs = parse(eqn_match.group(1))
            rhs = parse(eqn_match.group(2))
            return (lhs, rhs)
        elif isinstance(eqn, tuple):
            if len(eqn) != 2:
                raise ValueError("unexpected length of equation tuple, "
                        "got %d, should be 2" % len(eqn))
            return eqn
        else:
            raise ValueError("unexpected type of equation"
                    "got %d, should be string or tuple"
                    % type(eqn).__name__)

    equations = [parse_equation(eqn) for eqn in equations]

    all_vars = kernel.all_variable_names()
    for iname in new_inames:
        if iname in all_vars:
            raise LoopyError("new iname '%s' is already used in kernel"
                    % iname)

    for iname in old_inames:
        if iname not in kernel.all_inames():
            raise LoopyError("old iname '%s' not known" % iname)

    # }}}

    # {{{ substitute iname use

    from pymbolic.algorithm import solve_affine_equations_for
    old_inames_to_expr = solve_affine_equations_for(old_inames, equations)

    subst_dict = dict(
            (v.name, expr)
            for v, expr in old_inames_to_expr.items())

    var_name_gen = kernel.get_var_name_generator()

    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.context_matching import parse_stack_match

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    old_to_new = RuleAwareSubstitutionMapper(rule_mapping_context,
            make_subst_func(subst_dict), within=parse_stack_match(None))

    kernel = (
            rule_mapping_context.finish_kernel(
                old_to_new.map_kernel(kernel))
            .copy(
                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
                ))

    # }}}

    # {{{ change domains

    new_inames_set = set(new_inames)
    old_inames_set = set(old_inames)

    new_domains = []
    for idom, dom in enumerate(kernel.domains):
        dom_var_dict = dom.get_var_dict()
        old_iname_overlap = [
                iname
                for iname in old_inames
                if iname in dom_var_dict]

        if not old_iname_overlap:
            new_domains.append(dom)
            continue

        from loopy.symbolic import get_dependencies
        dom_new_inames = set()
        dom_old_inames = set()

        # mapping for new inames to dim_types
        new_iname_dim_types = {}

        dom_equations = []
        for iname in old_iname_overlap:
            for ieqn, (lhs, rhs) in enumerate(equations):
                eqn_deps = get_dependencies(lhs) | get_dependencies(rhs)
                if iname in eqn_deps:
                    dom_new_inames.update(eqn_deps & new_inames_set)
                    dom_old_inames.update(eqn_deps & old_inames_set)

                if dom_old_inames:
                    dom_equations.append((lhs, rhs))

                this_eqn_old_iname_dim_types = set(
                        dom_var_dict[old_iname][0]
                        for old_iname in eqn_deps & old_inames_set)

                if this_eqn_old_iname_dim_types:
                    if len(this_eqn_old_iname_dim_types) > 1:
                        raise ValueError("inames '%s' (from equation %d (0-based)) "
                                "in domain %d (0-based) are not "
                                "of a uniform dim_type"
                                % (", ".join(eqn_deps & old_inames_set), ieqn, idom))

                    this_eqn_new_iname_dim_type, = this_eqn_old_iname_dim_types

                    for new_iname in eqn_deps & new_inames_set:
                        if new_iname in new_iname_dim_types:
                            if (this_eqn_new_iname_dim_type
                                    != new_iname_dim_types[new_iname]):
                                raise ValueError("dim_type disagreement for "
                                        "iname '%s' (from equation %d (0-based)) "
                                        "in domain %d (0-based)"
                                        % (new_iname, ieqn, idom))
                        else:
                            new_iname_dim_types[new_iname] = \
                                    this_eqn_new_iname_dim_type

        if not dom_old_inames <= set(dom_var_dict):
            raise ValueError("domain %d (0-based) does not know about "
                    "all old inames (specifically '%s') needed to define new inames"
                    % (idom, ", ".join(dom_old_inames - set(dom_var_dict))))

        # add inames to domain with correct dim_types
        dom_new_inames = list(dom_new_inames)
        for iname in dom_new_inames:
            dt = new_iname_dim_types[iname]
            iname_idx = dom.dim(dt)
            dom = dom.add_dims(dt, 1)
            dom = dom.set_dim_name(dt, iname_idx, iname)

        # add equations
        from loopy.symbolic import aff_from_expr
        for lhs, rhs in dom_equations:
            dom = dom.add_constraint(
                    isl.Constraint.equality_from_aff(
                        aff_from_expr(dom.space, rhs - lhs)))

        # project out old inames
        for iname in dom_old_inames:
            dt, idx = dom.get_var_dict()[iname]
            dom = dom.project_out(dt, idx, 1)

        new_domains.append(dom)

    # }}}

    return kernel.copy(domains=new_domains)
Example #33
def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
    # FIXME: Does not understand subst rules for now
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    if var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    elif var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    else:
        raise NameError("array '%s' was not found" % var_name)

    # {{{ check/normalize vary_by_axes

    if isinstance(vary_by_axes, str):
        vary_by_axes = vary_by_axes.split(",")

    from loopy.kernel.array import ArrayBase
    if isinstance(var_descr, ArrayBase):
        if var_descr.dim_names is not None:
            name_to_index = dict(
                    (name, idx)
                    for idx, name in enumerate(var_descr.dim_names))
        else:
            name_to_index = {}

        def map_ax_name_to_index(ax):
            if isinstance(ax, str):
                try:
                    return name_to_index[ax]
                except KeyError:
                    raise LoopyError("axis name '%s' not understood " % ax)
            else:
                return ax

        vary_by_axes = [map_ax_name_to_index(ax) for ax in vary_by_axes]

        if (
                vary_by_axes
                and
                (min(vary_by_axes) < 0
                or
                max(vary_by_axes) > var_descr.num_user_axes())):
            raise LoopyError("vary_by_axes refers to out-of-bounds axis index")

    # }}}

    from pymbolic.mapper.substitutor import make_subst_func
    from pymbolic.primitives import (Sum, Product, is_zero,
            flattened_sum, flattened_product, Subscript, Variable)
    from loopy.symbolic import (get_dependencies, SubstitutionMapper,
            UnidirectionalUnifier)

    # {{{ common factor key list maintenance

    # list of (index_key, common factors found)
    common_factors = []

    def find_unifiable_cf_index(index_key):
        for i, (key, val) in enumerate(common_factors):
            unif = UnidirectionalUnifier(
                    lhs_mapping_candidates=get_dependencies(key))

            unif_result = unif(key, index_key)

            if unif_result:
                assert len(unif_result) == 1
                return i, unif_result[0]

        return None, None

    def extract_index_key(access_expr):
        if isinstance(access_expr, Variable):
            return ()

        elif isinstance(access_expr, Subscript):
            index = access_expr.index_tuple
            return tuple(index[ax] for ax in vary_by_axes)
        else:
            raise ValueError("unexpected type of access_expr")

    def is_assignee(insn):
        return any(
                lhs == var_name
                for lhs, sbscript in insn.assignees_and_indices())

    def iterate_as(cls, expr):
        if isinstance(expr, cls):
            for ch in expr.children:
                yield ch
        else:
            yield expr

    # }}}

    # {{{ find common factors

    from loopy.kernel.data import Assignment

    for insn in kernel.instructions:
        if not is_assignee(insn):
            continue

        if not isinstance(insn, Assignment):
            raise LoopyError("'%s' modified by non-expression instruction"
                    % var_name)

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # {{{ doesn't exist yet

            assert unif_result is None

            my_common_factors = None

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                "in RHS of instruction '%s'"
                                % (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                if my_common_factors is None:
                    my_common_factors = product_parts
                else:
                    my_common_factors = my_common_factors & product_parts

            if my_common_factors is not None:
                common_factors.append((index_key, my_common_factors))

            # }}}
        else:
            # {{{ match, filter existing common factors

            _, my_common_factors = common_factors[cf_index]

            unif_subst_map = SubstitutionMapper(
                    make_subst_func(unif_result.lmap))

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                "in RHS of instruction '%s'"
                                % (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                my_common_factors = set(
                        cf for cf in my_common_factors
                        if unif_subst_map(cf) in product_parts)

            common_factors[cf_index] = (index_key, my_common_factors)

            # }}}

    # }}}

    # {{{ remove common factors

    new_insns = []

    for insn in kernel.instructions:
        if not isinstance(insn, Assignment) or not is_assignee(insn):
            new_insns.append(insn)
            continue

        (_, index_key), = insn.assignees_and_indices()

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            new_insns.append(insn)
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            new_insns.append(insn)
            continue

        _, my_common_factors = common_factors[cf_index]

        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        mapped_my_common_factors = set(
                unif_subst_map(cf)
                for cf in my_common_factors)

        new_sum_terms = []

        for term in iterate_as(Sum, rhs):
            if term == lhs:
                new_sum_terms.append(term)
                continue

            new_sum_terms.append(
                    flattened_product([
                        part
                        for part in iterate_as(Product, term)
                        if part not in mapped_my_common_factors
                        ]))

        new_insns.append(
                insn.copy(expression=flattened_sum(new_sum_terms)))

    # }}}

    # {{{ substitute common factors into usage sites

    def find_substitution(expr):
        if isinstance(expr, Subscript):
            v = expr.aggregate.name
        elif isinstance(expr, Variable):
            v = expr.name
        else:
            return expr

        if v != var_name:
            return expr

        index_key = extract_index_key(expr)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        _, my_common_factors = common_factors[cf_index]

        if my_common_factors is not None:
            return flattened_product(
                    [unif_subst_map(cf) for cf in my_common_factors]
                    + [expr])
        else:
            return expr

    insns = new_insns
    new_insns = []

    subm = SubstitutionMapper(find_substitution)

    for insn in insns:
        if not isinstance(insn, Assignment) or is_assignee(insn):
            new_insns.append(insn)
            continue

        new_insns.append(insn.with_transformed_expressions(subm))

    # }}}

    return kernel.copy(instructions=new_insns)
Example #34
 def apply_arg_context(self, expr):
     from pymbolic.mapper.substitutor import make_subst_func
     return SubstitutionMapper(
             make_subst_func(self.arg_context))(expr)
Example #35
def link_inames(knl, inames, new_iname, within=None, tag=None):
    # {{{ normalize arguments

    if isinstance(inames, str):
        inames = inames.split(",")

    var_name_gen = knl.get_var_name_generator()
    new_iname = var_name_gen(new_iname)

    # }}}

    # {{{ ensure that each iname is used at most once in each instruction

    inames_set = set(inames)

    if 0:
        # FIXME!
        for insn in knl.instructions:
            insn_inames = knl.insn_inames(insn.id) | insn.reduction_inames()

            if len(insn_inames & inames_set) > 1:
                raise LoopyError("To-be-linked inames '%s' are used in "
                        "instruction '%s'. No more than one such iname can "
                        "be used in one instruction."
                        % (", ".join(insn_inames & inames_set), insn.id))

    # }}}

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(knl, tuple(inames))

    # {{{ ensure that projections are identical

    unrelated_dom_inames = list(
            set(domch.domain.get_var_names(dim_type.set))
            - inames_set)

    domain = domch.domain

    # move all inames to be linked to end to prevent shuffly confusion
    for iname in inames:
        dt, index = domain.get_var_dict()[iname]
        assert dt == dim_type.set

        # move to tail of param dim_type
        domain = domain.move_dims(
                    dim_type.param, domain.dim(dim_type.param),
                    dt, index, 1)
        # move to tail of set dim_type
        domain = domain.move_dims(
                    dim_type.set, domain.dim(dim_type.set),
                    dim_type.param, domain.dim(dim_type.param)-1, 1)

    projections = [
            domch.domain.project_out_except(
                unrelated_dom_inames + [iname], [dim_type.set])
            for iname in inames]

    all_equal = True
    first_proj = projections[0]
    for proj in projections[1:]:
        all_equal = all_equal and (proj <= first_proj and first_proj <= proj)

    if not all_equal:
        raise LoopyError("Inames cannot be linked because their domain "
                "constraints are not the same.")

    del domain  # messed up for testing, do not use

    # }}}

    # change the domain
    from loopy.isl_helpers import duplicate_axes
    knl = knl.copy(
            domains=domch.get_domains_with(
                duplicate_axes(domch.domain, [inames[0]], [new_iname])))

    # {{{ change the code

    from pymbolic import var
    subst_dict = dict((iname, var(new_iname)) for iname in inames)

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            knl.substitutions, var_name_gen)
    ijoin = RuleAwareSubstitutionMapper(rule_mapping_context,
                    make_subst_func(subst_dict), within)

    knl = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(knl))

    # }}}

    knl = remove_unused_inames(knl, inames)

    if tag is not None:
        knl = tag_inames(knl, {new_iname: tag})

    return knl
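The core rewrite that link_inames performs can be previewed with the plain (non-rule-aware) mapper. The iname names here are made up, and the real transform additionally rewrites the domain and honors the *within* match:

from pymbolic import parse, var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

inames, new_iname = ["i", "j"], "ij"          # hypothetical
subst_dict = {iname: var(new_iname) for iname in inames}
print(SubstitutionMapper(make_subst_func(subst_dict))(parse("a[i] + b[j]")))
# prints: a[ij] + b[ij]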
Exemple #36
0
 def apply_arg_context(self, expr):
     from pymbolic.mapper.substitutor import make_subst_func
     return SubstitutionMapper(
             make_subst_func(self.arg_context))(expr)
Exemple #37
0
def strang_splitting(dag1, dag2, stepping_phase):
    """Given two time advancement routines (in *dag1* and *dag2*), returns a
    single second-order accurate time advancement routine representing the sum
    of both of those advancements.

    :arg dag1: a :class:`dagrt.language.DAGCode`
    :arg dag2: a :class:`dagrt.language.DAGCode`
    :arg stepping_phase: the name of the phase in *dag1* and *dag2* that carries
        out time stepping to which Strang splitting is to be applied.
    :returns: a :class:`dagrt.language.DAGCode`
    """

    from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

    # {{{ disambiguate

    id1 = dag1.existing_var_names()
    id2 = dag2.existing_var_names()

    from pytools import UniqueNameGenerator
    vng = UniqueNameGenerator(id1 | id2)

    from pymbolic import var
    subst2 = {}
    for clash in id1 & id2:
        if not clash.startswith("<") or clash.startswith("<p>"):
            unclash = vng(clash)
            subst2[clash] = var(unclash)

    subst2_mapper = SubstitutionMapper(make_subst_func(subst2))

    # }}}

    all_phases = frozenset(dag1.phases) | frozenset(dag2.phases)
    from dagrt.language import DAGCode, ExecutionPhase
    new_phases = {}
    for phase_name in all_phases:
        phase1 = dag1.phases.get(phase_name)
        phase2 = dag2.phases.get(phase_name)

        substed_s2_stmts = [
            stmt.map_expressions(subst2_mapper) for stmt in phase2.statements
        ]

        if phase_name == stepping_phase:
            assert phase1 is not None
            assert phase2 is not None

            from pymbolic import var
            dt_half = SubstitutionMapper(
                make_subst_func({"<dt>": var("<dt>") / 2}))

            phase1_half_dt = [
                stmt.map_expressions(dt_half) for stmt in phase1.statements
            ]

            if phase1.next_phase != phase2.next_phase:
                raise ValueError(
                    "DAGs don't agree on default "
                    f"phase transition out of phase '{phase_name}'")

            s2_name = phase_name + "_s2"
            s3_name = phase_name + "_s3"

            assert s2_name not in all_phases
            assert s3_name not in all_phases
            """
            du/dt = A + B
            Time interval is [0,1]
            1. Starting with u0, solve du / dt = A from t = 0 to 1/2, get u1
            2. Starting with u1, solve du / dt = B from t = 0 to 1, get u2
            3. Starting with u2, solve du / dt = A from t = 1/2 to 1, get u3
            4. Return u3
            """
            new_phases[phase_name] = ExecutionPhase(
                name=phase_name,
                next_phase=s2_name,
                statements=(_update_t_by_dt_factor(
                    0, _elide_yield_state(phase1_half_dt))))
            new_phases[s2_name] = ExecutionPhase(
                name=s2_name,
                next_phase=s3_name,
                statements=(_update_t_by_dt_factor(
                    1 / 2, _elide_yield_state(substed_s2_stmts))))
            new_phases[s3_name] = ExecutionPhase(name=s3_name,
                                                 next_phase=phase1.next_phase,
                                                 statements=phase1_half_dt)
        else:
            from dagrt.transform import fuse_two_phases
            new_phases[phase_name] = fuse_two_phases(
                phase_name, phase1, phase2.copy(statements=substed_s2_stmts))

    if dag1.initial_phase != dag2.initial_phase:
        raise ValueError("DAGs don't agree on initial phase")

    return DAGCode(new_phases, dag1.initial_phase)
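The half-step rewrite above only touches the expression level: every occurrence of <dt> is replaced by <dt>/2. A minimal sketch; the state-update expression is made up, and since dagrt names like "<dt>" are not parseable, the expression is built directly:

from pymbolic import var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

dt, y, rhs = var("<dt>"), var("y"), var("rhs")   # hypothetical state update
dt_half = SubstitutionMapper(make_subst_func({"<dt>": dt / 2}))
print(dt_half(y + dt*rhs))   # every <dt> in the expression becomes <dt> / 2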
Exemple #38
0
    def __init__(self, kernel, domain, sweep_inames, access_descriptors,
                 storage_axis_count):
        self.kernel = kernel
        self.sweep_inames = sweep_inames

        storage_axis_names = self.storage_axis_names = [
            "_loopy_storage_%d" % i for i in range(storage_axis_count)
        ]

        # {{{ duplicate sweep inames

        # The duplication is necessary, otherwise the storage fetch
        # inames remain weirdly tied to the original sweep inames.

        self.primed_sweep_inames = [psin + "'" for psin in sweep_inames]

        from loopy.isl_helpers import duplicate_axes
        dup_sweep_index = domain.space.dim(dim_type.out)
        domain_dup_sweep = duplicate_axes(domain, sweep_inames,
                                          self.primed_sweep_inames)

        self.prime_sweep_inames = SubstitutionMapper(
            make_subst_func({
                sin: var(psin)
                for sin, psin in zip(sweep_inames, self.primed_sweep_inames)
            }))

        # }}}

        self.stor2sweep = build_global_storage_to_sweep_map(
            kernel, access_descriptors, domain_dup_sweep, dup_sweep_index,
            storage_axis_names, sweep_inames, self.primed_sweep_inames,
            self.prime_sweep_inames)

        storage_base_indices, storage_shape = compute_bounds(
            kernel, domain, self.stor2sweep, self.primed_sweep_inames,
            storage_axis_names)

        # compute augmented domain

        # {{{ filter out unit-length dimensions

        non1_storage_axis_flags = []
        non1_storage_shape = []

        for saxis_len in storage_shape:
            has_length_non1 = saxis_len != 1

            non1_storage_axis_flags.append(has_length_non1)

            if has_length_non1:
                non1_storage_shape.append(saxis_len)

        # }}}

        # {{{ subtract off the base indices
        # add the new, base-0 indices as new in dimensions

        sp = self.stor2sweep.get_space()
        stor_idx = sp.dim(dim_type.out)

        n_stor = storage_axis_count
        nn1_stor = len(non1_storage_shape)

        aug_domain = self.stor2sweep.move_dims(dim_type.out, stor_idx,
                                               dim_type.in_, 0,
                                               n_stor).range()

        # aug_domain space now:
        # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes']

        aug_domain = aug_domain.insert_dims(dim_type.set, stor_idx, nn1_stor)

        inew = 0
        for i, name in enumerate(storage_axis_names):
            if non1_storage_axis_flags[i]:
                aug_domain = aug_domain.set_dim_name(dim_type.set,
                                                     stor_idx + inew, name)
                inew += 1

        # aug_domain space now:
        # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'][n1_stor_axes]

        from loopy.symbolic import aff_from_expr
        for saxis, bi, s in zip(storage_axis_names, storage_base_indices,
                                storage_shape):
            if s != 1:
                cns = isl.Constraint.equality_from_aff(
                    aff_from_expr(aug_domain.get_space(),
                                  var(saxis) - (var(saxis + "'") - bi)))

                aug_domain = aug_domain.add_constraint(cns)

        # }}}

        # eliminate (primed) storage axes with non-zero base indices
        aug_domain = aug_domain.project_out(dim_type.set, stor_idx + nn1_stor,
                                            n_stor)

        # eliminate duplicated sweep_inames
        nsweep = len(sweep_inames)
        aug_domain = aug_domain.project_out(dim_type.set, dup_sweep_index,
                                            nsweep)

        self.non1_storage_axis_flags = non1_storage_axis_flags
        self.aug_domain = aug_domain
        self.storage_base_indices = storage_base_indices
        self.non1_storage_shape = non1_storage_shape
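The prime_sweep_inames mapper built above simply renames each sweep iname to its primed duplicate. A standalone sketch with hypothetical iname names:

from pymbolic import var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

sweep_inames = ["i", "j"]                         # hypothetical
primed_sweep_inames = [s + "'" for s in sweep_inames]
prime_sweep = SubstitutionMapper(make_subst_func(
    {s: var(p) for s, p in zip(sweep_inames, primed_sweep_inames)}))
print(prime_sweep(var("i") + 2*var("j")))         # prints: i' + 2*j'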
Exemple #39
0
def _process_footprint_subscripts(kernel, rule_name, sweep_inames,
        footprint_subscripts, arg):
    """Track applied iname rewrites, deal with slice specifiers ':'."""

    name_gen = kernel.get_var_name_generator()

    from pymbolic.primitives import Variable

    if footprint_subscripts is None:
        return kernel, rule_name, sweep_inames, []

    if not isinstance(footprint_subscripts, (list, tuple)):
        footprint_subscripts = [footprint_subscripts]

    inames_to_be_removed = []

    new_footprint_subscripts = []
    for fsub in footprint_subscripts:
        if isinstance(fsub, str):
            from loopy.symbolic import parse
            fsub = parse(fsub)

        if not isinstance(fsub, tuple):
            fsub = (fsub,)

        if len(fsub) != arg.num_user_axes():
            raise ValueError("sweep index '%s' has the wrong number of dimensions"
                    % str(fsub))

        for subst_map in kernel.applied_iname_rewrites:
            from loopy.symbolic import SubstitutionMapper
            from pymbolic.mapper.substitutor import make_subst_func
            fsub = SubstitutionMapper(make_subst_func(subst_map))(fsub)

        from loopy.symbolic import get_dependencies
        fsub_dependencies = get_dependencies(fsub)

        new_fsub = []
        for axis_nr, fsub_axis in enumerate(fsub):
            from pymbolic.primitives import Slice
            if isinstance(fsub_axis, Slice):
                if fsub_axis.children != (None,):
                    raise NotImplementedError("add_prefetch only "
                            "supports full slices")

                axis_name = name_gen(
                        based_on="%s_fetch_axis_%d" % (arg.name, axis_nr))

                kernel = _add_kernel_axis(kernel, axis_name, 0, arg.shape[axis_nr],
                        frozenset(sweep_inames) | fsub_dependencies)
                sweep_inames = sweep_inames + [axis_name]

                inames_to_be_removed.append(axis_name)
                new_fsub.append(Variable(axis_name))

            else:
                new_fsub.append(fsub_axis)

        new_footprint_subscripts.append(tuple(new_fsub))
        del new_fsub

    footprint_subscripts = new_footprint_subscripts
    del new_footprint_subscripts

    subst_use = [Variable(rule_name)(*si) for si in footprint_subscripts]
    return kernel, subst_use, sweep_inames, inames_to_be_removed
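Replaying kernel.applied_iname_rewrites, as done above for each footprint subscript, is just a chain of substitution mappers applied in order. A hedged sketch on a single subscripted access (the rewrite maps and names are invented; the real code applies the maps to a tuple via loopy's own SubstitutionMapper):

from pymbolic import parse, var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

applied_iname_rewrites = [                       # hypothetical rewrite history
    {"i": var("i_outer")*16 + var("i_inner")},
    {"i_inner": var("i_inner_0")},
]
fsub = parse("a[i, j]")
for subst_map in applied_iname_rewrites:
    fsub = SubstitutionMapper(make_subst_func(subst_map))(fsub)
print(fsub)   # prints: a[i_outer*16 + i_inner_0, j]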
Exemple #40
0
def buffer_array(
    kernel,
    var_name,
    buffer_inames,
    init_expression=None,
    store_expression=None,
    within=None,
    default_tag="l.auto",
    temporary_is_local=None,
    fetch_bounding_box=False,
):
    """
    :arg init_expression: Either *None* (indicating the prior value of the buffered
        array should be read) or an expression optionally involving the
        variable 'base' (which references the associated location in the array
        being buffered).
    :arg store_expression: Either *None* or an expression involving
        variables 'base' and 'buffer' (without array indices).
    """

    # {{{ process arguments

    if isinstance(init_expression, str):
        from loopy.symbolic import parse

        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse

        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [s.strip() for s in buffer_inames.split(",") if s.strip()]

    for iname in buffer_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname" % iname)

    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    from loopy.context_matching import parse_stack_match

    within = parse_stack_match(within)

    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase

    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        var_shape = ()

    if temporary_is_local is None:
        import loopy as lp

        temporary_is_local = lp.auto

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    access_descriptors = []
    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        for assignee, index in insn.assignees_and_indices():
            if assignee == var_name:
                within_inames.update((get_dependencies(index) & kernel.all_inames()) - buffer_inames_set)
                access_descriptors.append(AccessDescriptor(identifier=insn.id, storage_axis_exprs=index))

    # {{{ find fetch/store inames

    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for i in range(len(var_shape)):
        init_iname = var_name_gen("%s_init_%d" % (var_name, i))
        store_iname = var_name_gen("%s_store_%d" % (var_name, i))

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger

        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("buffer iname '%s' is not 'at home' in the " "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames, access_descriptors, len(var_shape))

        for i in range(len(var_shape)):
            if abm.non1_storage_axis_flags[i]:
                non1_init_inames.append(init_inames[i])
                non1_store_inames.append(store_inames[i])
            else:
                del new_iname_to_tag[init_inames[i]]
                del new_iname_to_tag[store_inames[i]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(new_domain, non1_init_inames, boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(new_domain, non1_store_inames, boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name + "_buf")

    new_temporary_variables = kernel.temporary_variables.copy()
    temp_var = lp.TemporaryVariable(
        name=buf_var_name,
        dtype=var_descr.dtype,
        base_indices=(0,) * len(abm.non1_storage_shape),
        shape=tuple(abm.non1_storage_shape),
        is_local=temporary_is_local,
    )

    new_temporary_variables[buf_var_name] = temp_var

    # }}}

    new_insns = []

    buf_var = var(buf_var_name)

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    init_subscript = []
    init_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_init_inames[init_iname_idx])
                init_iname_idx += 1
            init_subscript.append(ax_subscript)

    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        init_expression = SubstitutionMapper(make_subst_func({"base": init_base}))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(based_on="init_" + var_name)
    from loopy.kernel.data import ExpressionInstruction

    init_instruction = ExpressionInstruction(
        id=init_insn_id,
        assignee=buf_var_init,
        expression=init_expression,
        forced_iname_deps=frozenset(within_inames),
        insn_deps=frozenset(),
        insn_deps_is_final=True,
    )

    # }}}

    rule_mapping_context = SubstitutionRuleMappingContext(kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name, within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    did_write = False
    for insn_id in aar.modified_insn_ids:
        insn = kernel.id_to_insn[insn_id]
        if any(assignee_name == buf_var_name for assignee_name, _ in insn.assignees_and_indices()):
            did_write = True

    # {{{ add init_insn_id to insn_deps

    new_insns = []

    def none_to_empty_set(s):
        if s is None:
            return frozenset()
        else:
            return s

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(insn.copy(insn_deps=(none_to_empty_set(insn.insn_deps) | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(tuple(var(iname) for iname in non1_store_inames))

    store_subscript = []
    store_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_store_inames[store_iname_idx])
                store_iname_idx += 1
            store_subscript.append(ax_subscript)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is None:
        store_expression = buf_var_store
    else:
        store_expression = SubstitutionMapper(make_subst_func({"base": store_target, "buffer": buf_var_store}))(
            store_expression
        )

    from loopy.kernel.data import ExpressionInstruction

    store_instruction = ExpressionInstruction(
        id=kernel.make_unique_instruction_id(based_on="store_" + var_name),
        insn_deps=frozenset(aar.modified_insn_ids),
        assignee=store_target,
        expression=store_expression,
        forced_iname_deps=frozenset(within_inames),
    )

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)

    kernel = kernel.copy(
        domains=new_kernel_domains, instructions=new_insns, temporary_variables=new_temporary_variables
    )

    from loopy import tag_inames

    kernel = tag_inames(kernel, new_iname_to_tag)

    return kernel
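How init_expression and store_expression get specialized above can be shown in isolation: 'base' stands for the original array access and 'buffer' for the temporary. The array names, indices, and expressions below are assumptions for illustration only:

from pymbolic import parse
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

init_base = parse("u[i, j]")                        # hypothetical
buf_var_store = parse("u_buf[i_store, j_store]")    # hypothetical

init_expression = parse("0.17*base")
print(SubstitutionMapper(make_subst_func({"base": init_base}))(init_expression))
# prints: 0.17*u[i, j]

store_expression = parse("base + buffer")
print(SubstitutionMapper(make_subst_func(
    {"base": init_base, "buffer": buf_var_store}))(store_expression))
# prints: u[i, j] + u_buf[i_store, j_store]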
Exemple #41
0
def precompute(kernel, subst_use, sweep_inames=[], within=None,
        storage_axes=None, temporary_name=None, precompute_inames=None,
        storage_axis_to_tag={}, default_tag="l.auto", dtype=None,
        fetch_bounding_box=False, temporary_is_local=None,
        compute_insn_id=None):
    """Precompute the expression described in the substitution rule determined by
    *subst_use* and store it in a temporary array. A precomputation needs two
    things to operate, a list of *sweep_inames* (order irrelevant) and an
    ordered list of *storage_axes* (whose order will describe the axis ordering
    of the temporary array).

    :arg subst_use: Describes what to prefetch.

    The following objects may be given for *subst_use*:

    * The name of the substitution rule.

    * The tagged name ("name$tag") of the substitution rule.

    * A list of invocations of the substitution rule.
      This list of invocations, when swept across *sweep_inames*, then serves
      to define the footprint of the precomputation.

      Invocations may be tagged ("name$tag") to filter out a subset of the
      usage sites of the substitution rule. (Namely those usage sites that
      use the same tagged name.)

      Invocations may be given as a string or as a
      :class:`pymbolic.primitives.Expression` object.

      If only one invocation is to be given, then the only entry of the list
      may be given directly.

    If the list of invocations generating the footprint is not given,
    all (tag-matching, if desired) usage sites of the substitution rule
    are used to determine the footprint.

    The following cases can arise for each sweep axis:

    * The axis is an iname that occurs within arguments specified at
      usage sites of the substitution rule. This case is assumed covered
      by the storage axes provided for the argument.

    * The axis is an iname that occurs within the *value* of the rule, but not
      within its arguments. A new, dedicated storage axis is allocated for
      such an axis.

    :arg sweep_inames: A :class:`list` of inames and/or rule argument
        names to be swept.
        May also equivalently be a comma-separated string.
    :arg storage_axes: A :class:`list` of inames and/or rule argument
        names/indices to be used as storage axes.
        May also equivalently be a comma-separated string.
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    :arg temporary_name:
        The temporary variable name to use for storing the precomputed data.
        If it does not exist, it will be created. If it does exist, its properties
        (such as size, type) are checked (and updated, if possible) to match
        its use.
    :arg precompute_inames:
        A tuple of inames to be used to carry out the precomputation.
        If the specified inames do not already exist, they will be
        created. If they do already exist, their loop domain is verified
        against the one required for this precomputation. This tuple may
        be shorter than the (provided or automatically found) *storage_axes*
        tuple, in which case names will be automatically created.
        May also equivalently be a comma-separated string.

    :arg compute_insn_id: The ID of the instruction performing the precomputation.

    If `storage_axes` is not specified, it defaults to the arrangement
    `<direct sweep axes><arguments>` with the direct sweep axes being the
    slower-varying indices.

    Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are
    eliminated.
    """

    # {{{ check, standardize arguments

    if isinstance(sweep_inames, str):
        sweep_inames = [iname.strip() for iname in sweep_inames.split(",")]

    for iname in sweep_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    sweep_inames = list(sweep_inames)
    sweep_inames_set = frozenset(sweep_inames)

    if isinstance(storage_axes, str):
        storage_axes = [ax.strip() for ax in storage_axes.split(",")]

    if isinstance(precompute_inames, str):
        precompute_inames = [iname.strip() for iname in precompute_inames.split(",")]

    if isinstance(subst_use, str):
        subst_use = [subst_use]

    footprint_generators = None

    subst_name = None
    subst_tag = None

    from pymbolic.primitives import Variable, Call
    from loopy.symbolic import parse, TaggedVariable

    for use in subst_use:
        if isinstance(use, str):
            use = parse(use)

        if isinstance(use, Call):
            if footprint_generators is None:
                footprint_generators = []

            footprint_generators.append(use)
            subst_name_as_expr = use.function
        else:
            subst_name_as_expr = use

        if isinstance(subst_name_as_expr, TaggedVariable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = subst_name_as_expr.tag
        elif isinstance(subst_name_as_expr, Variable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = None
        else:
            raise ValueError("unexpected type of subst_name")

        if (subst_name, subst_tag) == (None, None):
            subst_name, subst_tag = new_subst_name, new_subst_tag
        else:
            if (subst_name, subst_tag) != (new_subst_name, new_subst_tag):
                raise ValueError("not all uses in subst_use agree "
                        "on rule name and tag")

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from loopy.kernel.data import parse_tag
    default_tag = parse_tag(default_tag)

    subst = kernel.substitutions[subst_name]
    c_subst_name = subst_name.replace(".", "_")

    # }}}

    # {{{ process invocations in footprint generators, start access_descriptors

    if footprint_generators:
        from pymbolic.primitives import Variable, Call

        access_descriptors = []

        for fpg in footprint_generators:
            if isinstance(fpg, Variable):
                args = ()
            elif isinstance(fpg, Call):
                args = fpg.parameters
            else:
                raise ValueError("footprint generator must "
                        "be substitution rule invocation")

            access_descriptors.append(
                    RuleAccessDescriptor(
                        identifier=access_descriptor_id(args, None),
                        args=args
                        ))

    # }}}

    # {{{ gather up invocations in kernel code, finish access_descriptors

    if not footprint_generators:
        rule_mapping_context = SubstitutionRuleMappingContext(
                kernel.substitutions, kernel.get_var_name_generator())
        invg = RuleInvocationGatherer(
                rule_mapping_context, kernel, subst_name, subst_tag, within)
        del rule_mapping_context

        import loopy as lp
        for insn in kernel.instructions:
            if isinstance(insn, lp.Assignment):
                invg(insn.assignee, kernel, insn)
                invg(insn.expression, kernel, insn)

        access_descriptors = invg.access_descriptors
        if not access_descriptors:
            raise RuntimeError("no invocations of '%s' found" % subst_name)

    # }}}

    # {{{ find inames used in arguments

    expanding_usage_arg_deps = set()

    for accdesc in access_descriptors:
        for arg in accdesc.args:
            expanding_usage_arg_deps.update(
                    get_dependencies(arg) & kernel.all_inames())

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    # {{{ use given / find new storage_axes

    # extra axes made necessary because they don't occur in the arguments
    extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps)

    from loopy.symbolic import SubstitutionRuleExpander
    submap = SubstitutionRuleExpander(kernel.substitutions)

    value_inames = get_dependencies(
            submap(subst.expression)
            ) & kernel.all_inames()
    if value_inames - expanding_usage_arg_deps < extra_storage_axes:
        raise RuntimeError("unreferenced sweep inames specified: "
                + ", ".join(extra_storage_axes
                    - value_inames - expanding_usage_arg_deps))

    new_iname_to_tag = {}

    if storage_axes is None:
        storage_axes = []

        # Add sweep_inames (in given--rather than arbitrary--order) to
        # storage_axes *if* they are part of extra_storage_axes.
        for iname in sweep_inames:
            if iname in extra_storage_axes:
                extra_storage_axes.remove(iname)
                storage_axes.append(iname)

        if extra_storage_axes:
            if (precompute_inames is not None
                    and len(storage_axes) < len(precompute_inames)):
                raise LoopyError("must specify a sufficient number of "
                        "storage_axes to uniquely determine the meaning "
                        "of the given precompute_inames. (%d storage_axes "
                        "needed)" % len(precompute_inames))
            storage_axes.extend(sorted(extra_storage_axes))

        storage_axes.extend(range(len(subst.arguments)))

    del extra_storage_axes

    prior_storage_axis_name_dict = {}

    storage_axis_names = []
    storage_axis_sources = []  # number for arg#, or iname

    # {{{ check for pre-existing precompute_inames

    if precompute_inames is not None:
        preexisting_precompute_inames = (
                set(precompute_inames) & kernel.all_inames())
    else:
        preexisting_precompute_inames = set()

    # }}}

    for i, saxis in enumerate(storage_axes):
        tag_lookup_saxis = saxis

        if saxis in subst.arguments:
            saxis = subst.arguments.index(saxis)

        storage_axis_sources.append(saxis)

        if isinstance(saxis, int):
            # argument index
            name = old_name = subst.arguments[saxis]
        else:
            old_name = saxis
            name = "%s_%s" % (c_subst_name, old_name)

        if (precompute_inames is not None
                and i < len(precompute_inames)
                and precompute_inames[i]):
            name = precompute_inames[i]
            tag_lookup_saxis = name
            if (name not in preexisting_precompute_inames
                    and var_name_gen.is_name_conflicting(name)):
                raise RuntimeError("new storage axis name '%s' "
                        "conflicts with existing name" % name)
        else:
            name = var_name_gen(name)

        storage_axis_names.append(name)
        if name not in preexisting_precompute_inames:
            new_iname_to_tag[name] = storage_axis_to_tag.get(
                    tag_lookup_saxis, default_tag)

        prior_storage_axis_name_dict[name] = old_name

    del storage_axis_to_tag
    del storage_axes
    del precompute_inames

    # }}}

    # {{{ fill out access_descriptors[...].storage_axis_exprs

    access_descriptors = [
            accdesc.copy(
                storage_axis_exprs=storage_axis_exprs(
                    storage_axis_sources, accdesc.args))
            for accdesc in access_descriptors]

    # }}}

    expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps)
    assert expanding_inames <= kernel.all_inames()

    if storage_axis_names:
        # {{{ find domain to be changed

        change_inames = expanding_inames | preexisting_precompute_inames

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, change_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in sweep_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("sweep iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
                access_descriptors, len(storage_axis_names))

        non1_storage_axis_names = []
        for i, saxis in enumerate(storage_axis_names):
            if abm.non1_storage_axis_flags[i]:
                non1_storage_axis_names.append(saxis)
            else:
                del new_iname_to_tag[saxis]

                if saxis in preexisting_precompute_inames:
                    raise LoopyError("precompute axis %d (1-based) was "
                            "eliminated as "
                            "having length 1 but also mapped to existing "
                            "iname '%s'" % (i+1, saxis))

        mod_domain = domch.domain

        # {{{ modify the domain, taking into account preexisting inames

        # inames may already exist in mod_domain, add them primed to start
        primed_non1_saxis_names = [
                iname+"'" for iname in non1_storage_axis_names]

        mod_domain = abm.augment_domain_with_sweep(
            domch.domain, primed_non1_saxis_names,
            boxify_sweep=fetch_bounding_box)

        check_domain = mod_domain

        for i, saxis in enumerate(non1_storage_axis_names):
            var_dict = mod_domain.get_var_dict(isl.dim_type.set)

            if saxis in preexisting_precompute_inames:
                # add equality constraint between existing and new variable

                dt, dim_idx = var_dict[saxis]
                saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx)

                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx)

                mod_domain = mod_domain.add_constraint(
                        isl.Constraint.equality_from_aff(new_var_aff - saxis_aff))

                # project out the new one
                mod_domain = mod_domain.project_out(dt, dim_idx, 1)

            else:
                # remove the prime from the new variable
                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)

        # {{{ check that we got the desired domain

        check_domain = check_domain.project_out_except(
                primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain = mod_domain

        # re-add the prime from the new variable
        var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)

        for saxis in non1_storage_axis_names:
            dt, dim_idx = var_dict[saxis]
            mod_check_domain = mod_check_domain.set_dim_name(dt, dim_idx, saxis+"'")

        mod_check_domain = mod_check_domain.project_out_except(
                primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain, check_domain = isl.align_two(
                mod_check_domain, check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("domain of preexisting inames does not match "
                    "domain needed for precompute")

        # }}}

        # {{{ check that we didn't shrink the original domain

        # project out the new names from the modified domain
        orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
        mod_check_domain = mod_domain.project_out_except(
                orig_domain_inames, [isl.dim_type.set])

        check_domain = domch.domain

        mod_check_domain, check_domain = isl.align_two(
                mod_check_domain, check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("original domain got shrunk by applying the precompute")

        # }}}

        # }}}

        new_kernel_domains = domch.get_domains_with(mod_domain)

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        non1_storage_axis_names = []
        abm = NoOpArrayToBufferMap()

    kernel = kernel.copy(domains=new_kernel_domains)

    # {{{ set up compute insn

    if temporary_name is None:
        temporary_name = var_name_gen(based_on=c_subst_name)

    assignee = var(temporary_name)

    if non1_storage_axis_names:
        assignee = assignee.index(
                tuple(var(iname) for iname in non1_storage_axis_names))

    # {{{ process substitutions on compute instruction

    storage_axis_subst_dict = {}

    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
        if arg_name in non1_storage_axis_names:
            arg = var(arg_name)
        else:
            arg = 0

        storage_axis_subst_dict[
                prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())

    from loopy.context_matching import parse_stack_match
    expr_subst_map = RuleAwareSubstitutionMapper(
            rule_mapping_context,
            make_subst_func(storage_axis_subst_dict),
            within=parse_stack_match(None))

    compute_expression = expr_subst_map(subst.expression, kernel, None)

    # }}}

    from loopy.kernel.data import Assignment
    if compute_insn_id is None:
        compute_insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name)

    compute_insn = Assignment(
            id=compute_insn_id,
            assignee=assignee,
            expression=compute_expression)

    # }}}

    # {{{ substitute rule into expressions in kernel (if within footprint)

    invr = RuleInvocationReplacer(rule_mapping_context,
            subst_name, subst_tag, within,
            access_descriptors, abm,
            storage_axis_names, storage_axis_sources,
            non1_storage_axis_names,
            temporary_name, compute_insn_id)

    kernel = invr.map_kernel(kernel)
    kernel = kernel.copy(
            instructions=[compute_insn] + kernel.instructions)
    kernel = rule_mapping_context.finish_kernel(kernel)

    # }}}

    # {{{ set up temp variable

    import loopy as lp
    if dtype is None:
        dtype = lp.auto
    else:
        dtype = np.dtype(dtype)

    if temporary_is_local is None:
        temporary_is_local = lp.auto

    new_temp_shape = tuple(abm.non1_storage_shape)

    new_temporary_variables = kernel.temporary_variables.copy()
    if temporary_name not in new_temporary_variables:
        temp_var = lp.TemporaryVariable(
                name=temporary_name,
                dtype=dtype,
                base_indices=(0,)*len(new_temp_shape),
                shape=tuple(abm.non1_storage_shape),
                is_local=temporary_is_local)

    else:
        temp_var = new_temporary_variables[temporary_name]

        # {{{ check and adapt existing temporary

        if temp_var.dtype is lp.auto:
            pass
        elif temp_var.dtype is not lp.auto and dtype is lp.auto:
            dtype = temp_var.dtype
        elif temp_var.dtype is not lp.auto and dtype is not lp.auto:
            if temp_var.dtype != dtype:
                raise LoopyError("Existing and new dtype of temporary '%s' "
                        "do not match (existing: %s, new: %s)"
                        % (temporary_name, temp_var.dtype, dtype))

        temp_var = temp_var.copy(dtype=dtype)

        if len(temp_var.shape) != len(new_temp_shape):
            raise LoopyError("Existing and new temporary '%s' do not "
                    "have matching numbers of dimensions (%d vs. %d)"
                    % (temporary_name,
                        len(temp_var.shape), len(new_temp_shape)))

        if temp_var.base_indices != (0,) * len(new_temp_shape):
            raise LoopyError("Existing temporary '%s' does not have "
                    "all-zero base indices"
                    % temporary_name)

        new_temp_shape = tuple(
                max(i, ex_i)
                for i, ex_i in zip(new_temp_shape, temp_var.shape))

        temp_var = temp_var.copy(shape=new_temp_shape)

        if temporary_is_local == temp_var.is_local:
            pass
        elif temporary_is_local is lp.auto:
            temporary_is_local = temp_var.is_local
        elif temp_var.is_local is lp.auto:
            pass
        else:
            raise LoopyError("Existing and new temporary '%s' do not "
                    "have matching values of 'is_local'"
                    % (temporary_name,
                        temp_var.is_local, temporary_is_local))

        temp_var = temp_var.copy(is_local=temporary_is_local)

        # }}}

    new_temporary_variables[temporary_name] = temp_var

    kernel = kernel.copy(
            temporary_variables=new_temporary_variables)

    # }}}

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.data import AutoFitLocalIndexTag
    has_automatic_axes = any(
            isinstance(tag, AutoFitLocalIndexTag)
            for tag in new_iname_to_tag.values())

    if has_automatic_axes:
        from loopy.kernel.tools import assign_automatic_axes
        kernel = assign_automatic_axes(kernel)

    return kernel
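The substitution applied to the compute instruction above shifts each storage axis onto a base-0 precompute iname. A minimal standalone sketch using the plain mapper (the real code uses RuleAwareSubstitutionMapper; the rule body, iname name, and base index 16 are made up):

from pymbolic import parse, var
from pymbolic.mapper.substitutor import make_subst_func, SubstitutionMapper

storage_axis_subst_dict = {"k": var("k_prcmpt") + 16}   # hypothetical
subst = SubstitutionMapper(make_subst_func(storage_axis_subst_dict))
print(subst(parse("a[k]*b[k]")))   # prints: a[k_prcmpt + 16]*b[k_prcmpt + 16]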
Exemple #42
0
def pack_and_unpack_args_for_call_for_single_kernel(kernel,
                                                    callables_table,
                                                    call_name,
                                                    args_to_pack=None,
                                                    args_to_unpack=None):
    """
    Returns a copy of *kernel* with instructions appended to copy the
    arguments of the call to *call_name* into temporaries laid out as the
    callee expects, and to copy them back with the appropriate data layout
    after the call.

    :arg call_name: An instance of :class:`str` denoting the function call in
        the *kernel*.
    :arg args_to_pack: A list of the arguments as instances of :class:`str` which
        must be packed. If set to *None*, all the array arguments are packed.
    :arg args_to_unpack: A list of the arguments as instances of :class:`str`
        which must be unpacked. If set to *None*, all the array arguments are
        unpacked.
    """
    assert isinstance(kernel, LoopKernel)
    new_domains = []
    new_tmps = kernel.temporary_variables.copy()
    old_insn_to_new_insns = {}

    for insn in kernel.instructions:
        if not isinstance(insn, CallInstruction):
            # packing and unpacking can only be done for CallInstructions.
            continue
        if insn.expression.function.name not in callables_table:
            continue

        in_knl_callable = callables_table[insn.expression.function.name]

        if in_knl_callable.name != call_name:
            # not the function we're looking for.
            continue
        in_knl_callable = in_knl_callable.with_packing_for_args()

        vng = kernel.get_var_name_generator()
        ing = kernel.get_instruction_id_generator()

        parameters = insn.expression.parameters
        if args_to_pack is None:
            args_to_pack = [
                par.subscript.aggregate.name
                for par in parameters + insn.assignees
                if isinstance(par, SubArrayRef) and (par.swept_inames)
            ]
        if args_to_unpack is None:
            args_to_unpack = [
                par.subscript.aggregate.name
                for par in parameters + insn.assignees
                if isinstance(par, SubArrayRef) and (par.swept_inames)
            ]

        # {{{ sanity checks for args

        assert isinstance(args_to_pack, list)
        assert isinstance(args_to_unpack, list)

        for arg in args_to_pack:
            found_sub_array_ref = False

            for par in parameters + insn.assignees:
                # checking that the given args is a sub array ref
                if isinstance(par,
                              SubArrayRef) and (par.subscript.aggregate.name
                                                == arg):
                    found_sub_array_ref = True
                    break
            if not found_sub_array_ref:
                raise LoopyError(
                    "No match found for packing arg '%s' of call '%s' "
                    "at insn '%s'." % (arg, call_name, insn.id))
        for arg in args_to_unpack:
            if arg not in args_to_pack:
                raise LoopyError("Argument %s should be packed in order to be "
                                 "unpacked." % arg)

        # }}}

        packing_insns = []
        unpacking_insns = []

        # {{{ handling ilp tags

        from loopy.kernel.data import IlpBaseTag, VectorizeTag
        import islpy as isl
        from pymbolic import var

        dim_type = isl.dim_type.set
        ilp_inames = {
            iname
            for iname in insn.within_inames if all(
                isinstance(tag, (IlpBaseTag, VectorizeTag))
                for tag in kernel.iname_to_tags.get(iname, []))
        }
        new_ilp_inames = set()
        ilp_inames_map = {}
        for iname in ilp_inames:
            new_iname_name = vng(iname + "_ilp")
            ilp_inames_map[var(iname)] = var(new_iname_name)
            new_ilp_inames.add(new_iname_name)
        for iname in ilp_inames:
            new_domain = kernel.get_inames_domain(iname).copy()
            for i in range(new_domain.n_dim()):
                old_iname = new_domain.get_dim_name(dim_type, i)
                if old_iname in ilp_inames:
                    new_domain = new_domain.set_dim_name(
                        dim_type, i, ilp_inames_map[var(old_iname)].name)
            new_domains.append(new_domain)

        # }}}

        from pymbolic.mapper.substitutor import make_subst_func
        from loopy.symbolic import SubstitutionMapper

        # dict to store the new assignees and parameters, the mapping pattern
        # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype
        id_to_parameters = tuple(enumerate(parameters)) + tuple(
            (-i - 1, assignee) for i, assignee in enumerate(insn.assignees))
        new_id_to_parameters = {}

        for arg_id, p in id_to_parameters:
            if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name
                                               in args_to_pack):
                new_pack_inames = ilp_inames_map.copy()    # packing-specific inames
                new_unpack_inames = ilp_inames_map.copy()  # unpacking-specific inames

                new_pack_inames = {
                    iname: var(vng(iname.name + "_pack"))
                    for iname in p.swept_inames
                }
                new_unpack_inames = {
                    iname: var(vng(iname.name + "_unpack"))
                    for iname in p.swept_inames
                }

                # Updating the domains corresponding to the new inames.
                for iname in p.swept_inames:
                    new_domain_pack = kernel.get_inames_domain(
                        iname.name).copy()
                    new_domain_unpack = kernel.get_inames_domain(
                        iname.name).copy()
                    for i in range(new_domain_pack.n_dim()):
                        old_iname = new_domain_pack.get_dim_name(dim_type, i)
                        if var(old_iname) in new_pack_inames:
                            new_domain_pack = new_domain_pack.set_dim_name(
                                dim_type, i,
                                new_pack_inames[var(old_iname)].name)
                            new_domain_unpack = new_domain_unpack.set_dim_name(
                                dim_type, i,
                                new_unpack_inames[var(old_iname)].name)
                    new_domains.append(new_domain_pack)
                    new_domains.append(new_domain_unpack)

                arg = p.subscript.aggregate.name
                pack_name = vng(arg + "_pack")

                from loopy.kernel.data import (TemporaryVariable,
                                               temp_var_scope)

                if arg in kernel.arg_dict:
                    arg_in_caller = kernel.arg_dict[arg]
                else:
                    arg_in_caller = kernel.temporary_variables[arg]

                pack_tmp = TemporaryVariable(
                    name=pack_name,
                    dtype=arg_in_caller.dtype,
                    dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags,
                    shape=in_knl_callable.arg_id_to_descr[arg_id].shape,
                    scope=temp_var_scope.PRIVATE,
                )

                new_tmps[pack_name] = pack_tmp

                from loopy import Assignment
                pack_subst_mapper = SubstitutionMapper(
                    make_subst_func(new_pack_inames))
                unpack_subst_mapper = SubstitutionMapper(
                    make_subst_func(new_unpack_inames))

                # {{{ getting the lhs for packing and rhs for unpacking

                from loopy.isl_helpers import simplify_via_aff, make_slab

                flatten_index = simplify_via_aff(
                    sum(dim_tag.stride * idx for dim_tag, idx in zip(
                        arg_in_caller.dim_tags, p.subscript.index_tuple)))

                new_indices = []
                for dim_tag in in_knl_callable.arg_id_to_descr[
                        arg_id].dim_tags:
                    ind = flatten_index // dim_tag.stride
                    flatten_index -= (dim_tag.stride * ind)
                    new_indices.append(ind)

                new_indices = tuple(simplify_via_aff(i) for i in new_indices)

                pack_lhs_assignee = pack_subst_mapper(
                    var(pack_name).index(new_indices))
                unpack_rhs = unpack_subst_mapper(
                    var(pack_name).index(new_indices))

                # }}}

                packing_insns.append(
                    Assignment(
                        assignee=pack_lhs_assignee,
                        expression=pack_subst_mapper.map_subscript(
                            p.subscript),
                        within_inames=insn.within_inames - ilp_inames
                        | {new_pack_inames[i].name
                           for i in p.swept_inames} | (new_ilp_inames),
                        depends_on=insn.depends_on,
                        id=ing(insn.id + "_pack"),
                        depends_on_is_final=True))

                if p.subscript.aggregate.name in args_to_unpack:
                    unpacking_insns.append(
                        Assignment(
                            expression=unpack_rhs,
                            assignee=unpack_subst_mapper.map_subscript(
                                p.subscript),
                            within_inames=insn.within_inames - ilp_inames | {
                                new_unpack_inames[i].name
                                for i in p.swept_inames
                            } | (new_ilp_inames),
                            id=ing(insn.id + "_unpack"),
                            depends_on=frozenset([insn.id]),
                            depends_on_is_final=True))

                # {{{ creating the sweep inames for the new sub array refs

                updated_swept_inames = []

                for _ in in_knl_callable.arg_id_to_descr[arg_id].shape:
                    updated_swept_inames.append(var(vng("i_packsweep_" + arg)))

                ctx = kernel.isl_context
                space = isl.Space.create_from_names(
                    ctx, set=[iname.name for iname in updated_swept_inames])
                iname_set = isl.BasicSet.universe(space)
                for iname, axis_length in zip(
                        updated_swept_inames,
                        in_knl_callable.arg_id_to_descr[arg_id].shape):
                    iname_set = iname_set & make_slab(space, iname.name, 0,
                                                      axis_length)
                new_domains = new_domains + [iname_set]

                # }}}

                new_id_to_parameters[arg_id] = SubArrayRef(
                    tuple(updated_swept_inames),
                    (var(pack_name).index(tuple(updated_swept_inames))))
            else:
                new_id_to_parameters[arg_id] = p

        if packing_insns:
            subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map))
            new_call_insn = insn.with_transformed_expressions(subst_mapper)
            new_params = tuple(
                subst_mapper(new_id_to_parameters[i])
                for i, _ in enumerate(parameters))
            new_assignees = tuple(
                subst_mapper(new_id_to_parameters[-i - 1])
                for i, _ in enumerate(insn.assignees))
            new_call_insn = new_call_insn.copy(
                depends_on=new_call_insn.depends_on
                | {pack.id
                   for pack in packing_insns},
                within_inames=new_call_insn.within_inames - ilp_inames |
                (new_ilp_inames),
                expression=new_call_insn.expression.function(*new_params),
                assignees=new_assignees)
            old_insn_to_new_insns[insn.id] = (packing_insns + [new_call_insn] +
                                              unpacking_insns)

    if old_insn_to_new_insns:
        new_instructions = []
        for insn in kernel.instructions:
            if insn.id in old_insn_to_new_insns:
                # Replacing the current instruction with the group of
                # instructions including the packing and unpacking instructions
                new_instructions.extend(old_insn_to_new_insns[insn.id])
            else:
                # for the instructions that depend on the call instruction that
                # are to be packed and unpacked, we need to add the complete
                # instruction block as a dependency for them.
                new_depends_on = insn.depends_on
                if insn.depends_on & set(old_insn_to_new_insns):
                    # need to add the unpack instructions on dependencies.
                    for old_insn_id in insn.depends_on & set(
                            old_insn_to_new_insns):
                        new_depends_on |= frozenset(
                            i.id for i in old_insn_to_new_insns[old_insn_id])
                new_instructions.append(insn.copy(depends_on=new_depends_on))
        kernel = kernel.copy(domains=kernel.domains + new_domains,
                             instructions=new_instructions,
                             temporary_variables=new_tmps)

    return kernel
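The packing step above linearizes each caller-side access with the caller's strides and then splits the flat index along the callee's strides (keeping the expressions symbolic and simplifying them via isl). The same arithmetic, with made-up numeric strides and indices:

caller_strides = (6, 1)      # hypothetical caller dim_tag strides
callee_strides = (3, 1)      # hypothetical callee dim_tag strides
index_tuple = (2, 4)         # hypothetical subscript in the call

flat = sum(s*i for s, i in zip(caller_strides, index_tuple))   # 2*6 + 4 = 16
new_indices = []
for stride in callee_strides:
    new_indices.append(flat // stride)     # 16 // 3 = 5, then 1 // 1 = 1
    flat -= stride * (flat // stride)
print(new_indices)   # [5, 1]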
Exemple #43
0
    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
        from pymbolic.mapper.stringifier import PREC_NONE

        # FIXME: Could detect operations, generate atomic_{add,...} when
        # appropriate.

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            from cgen import Block, DoWhile, Assign
            from loopy.target.c import POD
            old_val_var = codegen_state.var_name_generator("loopy_old_val")
            new_val_var = codegen_state.var_name_generator("loopy_new_val")

            from loopy.kernel.data import TemporaryVariable, AddressSpace
            ecm = codegen_state.expression_to_code_mapper.with_assignments({
                old_val_var:
                TemporaryVariable(old_val_var, lhs_dtype),
                new_val_var:
                TemporaryVariable(new_val_var, lhs_dtype),
            })

            lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

            from pymbolic.mapper.substitutor import make_subst_func
            from pymbolic import var
            from loopy.symbolic import SubstitutionMapper

            subst = SubstitutionMapper(
                make_subst_func({lhs_expr: var(old_val_var)}))
            rhs_expr_code = ecm(subst(rhs_expr),
                                prec=PREC_NONE,
                                type_context=rhs_type_context,
                                needed_dtype=lhs_dtype)

            if lhs_dtype.numpy_dtype.itemsize == 4:
                func_name = "atomic_cmpxchg"
            elif lhs_dtype.numpy_dtype.itemsize == 8:
                func_name = "atom_cmpxchg"
            else:
                raise LoopyError("unexpected atomic size")

            cast_str = ""
            old_val = old_val_var
            new_val = new_val_var

            if lhs_dtype.numpy_dtype.kind == "f":
                if lhs_dtype.numpy_dtype == np.float32:
                    ctype = "int"
                elif lhs_dtype.numpy_dtype == np.float64:
                    ctype = "long"
                else:
                    assert False

                from loopy.kernel.data import (TemporaryVariable, ArrayArg)
                if (isinstance(lhs_var, ArrayArg)
                        and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                elif (isinstance(lhs_var, ArrayArg)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                else:
                    raise LoopyError("unexpected kind of variable '%s' in "
                                     "atomic operation: " %
                                     (lhs_var.name, type(lhs_var).__name__))

                old_val = "*(%s *) &" % ctype + old_val
                new_val = "*(%s *) &" % ctype + new_val
                cast_str = "(%s %s *) " % (var_kind, ctype)

            return Block([
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    old_val_var),
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    new_val_var),
                DoWhile(
                    "%(func_name)s("
                    "%(cast_str)s&(%(lhs_expr)s), "
                    "%(old_val)s, "
                    "%(new_val)s"
                    ") != %(old_val)s" % {
                        "func_name": func_name,
                        "cast_str": cast_str,
                        "lhs_expr": lhs_expr_code,
                        "old_val": old_val,
                        "new_val": new_val,
                    },
                    Block([
                        Assign(old_val_var, lhs_expr_code),
                        Assign(new_val_var, rhs_expr_code),
                    ]))
            ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
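
# A minimal sketch of the substitution step above (assumes loopy and pymbolic
# are installed; 'a[i]' and 'loopy_old_val' are illustrative): reads of the
# assignee inside the right-hand side are redirected to the temporary that
# holds the old value, before the compare-and-swap loop is emitted.
from pymbolic import var
from pymbolic.mapper.substitutor import make_subst_func
from loopy.symbolic import SubstitutionMapper, parse

lhs_expr = parse("a[i]")
rhs_expr = parse("a[i] + 1")
subst = SubstitutionMapper(make_subst_func({lhs_expr: var("loopy_old_val")}))
print(subst(rhs_expr))  # loopy_old_val + 1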
Exemple #44
0
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
        store_expression=None, within=None, default_tag="l.auto",
        temporary_scope=None, temporary_is_local=None,
        fetch_bounding_box=False):
    """Replace accesses to *var_name* with ones to a temporary, which is
    created and acts as a buffer. To perform this transformation, the access
    footprint to *var_name* is determined and a temporary of a suitable
    :class:`loopy.AddressSpace` and shape is created.

    By default, the value of the buffered cells in *var_name* are read prior to
    any (read/write) use, and the modified values are written out after use has
    concluded, but for special use cases (e.g. additive accumulation), the
    behavior can be modified using *init_expression* and *store_expression*.

    :arg buffer_inames: The inames across which the buffer should be usable--i.e.
        all possible values of these inames will be covered by the buffer footprint.
        A tuple of inames or a comma-separated string.
    :arg init_expression: Either *None* (indicating the prior value of the buffered
        array should be read) or an expression optionally involving the
        variable 'base' (which references the associated location in the array
        being buffered).
    :arg store_expression: Either *None*, *False*, or an expression involving
        variables 'base' and 'buffer' (without array indices).
        (*None* indicates that a default storage instruction should be used,
        *False* indicates that no storing of the temporary should occur
        at all.)
    :arg within: If not None, limit the action of the transformation to
        matching contexts.  See :func:`loopy.match.parse_stack_match`
        for syntax.
    :arg temporary_scope: If given, override the choice of
        :class:`AddressSpace` for the created temporary.
    :arg default_tag: The default :ref:`iname-tags` to be assigned to the
        inames used for fetching and storing
    :arg fetch_bounding_box: If the access footprint is non-convex
        (resulting in an error), setting this argument to *True* will force a
        rectangular (and hence convex) superset of the footprint to be
        fetched.
    """

    # {{{ unify temporary_scope / temporary_is_local

    from loopy.kernel.data import AddressSpace
    if temporary_is_local is not None:
        from warnings import warn
        warn("temporary_is_local is deprecated. Use temporary_scope instead",
                DeprecationWarning, stacklevel=2)

        if temporary_scope is not None:
            raise LoopyError("may not specify both temporary_is_local and "
                    "temporary_scope")

        if temporary_is_local:
            temporary_scope = AddressSpace.LOCAL
        else:
            temporary_scope = AddressSpace.PRIVATE

    del temporary_is_local

    # }}}

    # {{{ process arguments

    if isinstance(init_expression, str):
        from loopy.symbolic import parse
        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse
        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [s.strip()
                for s in buffer_inames.split(",") if s.strip()]

    for iname in buffer_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase
    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        var_shape = ()

    if temporary_scope is None:
        import loopy as lp
        temporary_scope = lp.auto

    # }}}

    # {{{ caching

    from loopy import CACHING_ENABLED

    from loopy.preprocess import prepare_for_caching
    key_kernel = prepare_for_caching(kernel)
    cache_key = (key_kernel, var_name, tuple(buffer_inames),
            PymbolicExpressionHashWrapper(init_expression),
            PymbolicExpressionHashWrapper(store_expression), within,
            default_tag, temporary_scope, fetch_bounding_box)

    if CACHING_ENABLED:
        try:
            result = buffer_array_cache[cache_key]
            logger.info("%s: buffer_array cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    access_descriptors = []
    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        from pymbolic.primitives import Variable, Subscript
        from loopy.symbolic import LinearSubscript

        for assignee in insn.assignees:
            if isinstance(assignee, Variable):
                assignee_name = assignee.name
                index = ()

            elif isinstance(assignee, Subscript):
                assignee_name = assignee.aggregate.name
                index = assignee.index_tuple

            elif isinstance(assignee, LinearSubscript):
                if assignee.aggregate.name == var_name:
                    raise LoopyError("buffer_array may not be applied in the "
                            "presence of linear write indexing into '%s'" % var_name)
                else:
                    # Linearly indexed writes to other variables are not
                    # relevant to this buffer; skip them.
                    continue

            else:
                raise LoopyError("invalid lvalue '%s'" % assignee)

            if assignee_name == var_name:
                within_inames.update(
                        (get_dependencies(index) & kernel.all_inames())
                        - buffer_inames_set)
                access_descriptors.append(
                        AccessDescriptor(
                            identifier=insn.id,
                            storage_axis_exprs=index))

    # {{{ find fetch/store inames

    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for i in range(len(var_shape)):
        dim_name = str(i)
        if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None:
            dim_name = var_descr.dim_names[i]

        init_iname = var_name_gen(f"{var_name}_init_{dim_name}")
        store_iname = var_name_gen(f"{var_name}_store_{dim_name}")

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("buffer iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
                access_descriptors, len(var_shape))

        for i in range(len(var_shape)):
            if abm.non1_storage_axis_flags[i]:
                non1_init_inames.append(init_inames[i])
                non1_store_inames.append(store_inames[i])
            else:
                del new_iname_to_tag[init_inames[i]]
                del new_iname_to_tag[store_inames[i]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_init_inames,
                    boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_store_inames,
                    boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name+"_buf")

    new_temporary_variables = kernel.temporary_variables.copy()
    temp_var = lp.TemporaryVariable(
            name=buf_var_name,
            dtype=var_descr.dtype,
            base_indices=(0,)*len(abm.non1_storage_shape),
            shape=tuple(abm.non1_storage_shape),
            address_space=temporary_scope)

    new_temporary_variables[buf_var_name] = temp_var

    # }}}

    new_insns = []

    buf_var = var(buf_var_name)

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(
                tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    init_subscript = []
    init_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_init_inames[init_iname_idx])
                init_iname_idx += 1
            init_subscript.append(ax_subscript)

    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        init_expression = SubstitutionMapper(
                make_subst_func({
                    "base": init_base,
                    }))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name)
    from loopy.kernel.data import Assignment
    init_instruction = Assignment(id=init_insn_id,
                assignee=buf_var_init,
                expression=init_expression,
                within_inames=(
                    frozenset(within_inames)
                    | frozenset(non1_init_inames)),
                depends_on=frozenset(),
                depends_on_is_final=True)

    # }}}

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
            within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    did_write = False
    for insn_id in aar.modified_insn_ids:
        insn = kernel.id_to_insn[insn_id]
        if buf_var_name in insn.assignee_var_names():
            did_write = True

    # {{{ add init_insn_id to depends_on

    new_insns = []

    def none_to_empty_set(s):
        if s is None:
            return frozenset()
        else:
            return s

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(
                    insn.copy(
                        depends_on=(
                            none_to_empty_set(insn.depends_on)
                            | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(
                tuple(var(iname) for iname in non1_store_inames))

    store_subscript = []
    store_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_store_inames[store_iname_idx])
                store_iname_idx += 1
            store_subscript.append(ax_subscript)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is None:
        store_expression = buf_var_store
    else:
        store_expression = SubstitutionMapper(
                make_subst_func({
                    "base": store_target,
                    "buffer": buf_var_store,
                    }))(store_expression)

    if store_expression is not False:
        from loopy.kernel.data import Assignment
        store_instruction = Assignment(
                    id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
                    depends_on=frozenset(aar.modified_insn_ids),
                    no_sync_with=frozenset([(init_insn_id, "any")]),
                    assignee=store_target,
                    expression=store_expression,
                    within_inames=(
                        frozenset(within_inames)
                        | frozenset(non1_store_inames)))
    else:
        did_write = False

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)
    else:
        for iname in store_inames:
            del new_iname_to_tag[iname]

    kernel = kernel.copy(
            domains=new_kernel_domains,
            instructions=new_insns,
            temporary_variables=new_temporary_variables)

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.tools import assign_automatic_axes
    kernel = assign_automatic_axes(kernel)

    if CACHING_ENABLED:
        from loopy.preprocess import prepare_for_caching
        buffer_array_cache.store_if_not_present(
                cache_key, prepare_for_caching(kernel))

    return kernel
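
# A hedged usage sketch (hypothetical kernel; assumes loopy is importable as
# lp and that argument shapes can be inferred): accumulate into out[i]
# through a buffer swept over j, initializing the buffer to zero and adding
# it back to the array on store.
import loopy as lp

knl = lp.make_kernel(
        "{[i, j]: 0 <= i < n and 0 <= j < m}",
        "out[i] = out[i] + a[i, j]")
knl = lp.buffer_array(knl, "out", buffer_inames="j",
        init_expression="0", store_expression="base + buffer")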
Exemple #45
0
def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
    """
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    :arg existing_ok: execute even if *new_iname* already exists
    """

    var_name_gen = knl.get_var_name_generator()

    does_exist = var_name_gen.is_name_conflicting(new_iname)

    if does_exist and not existing_ok:
        raise ValueError("iname '%s' conflicts with an existing identifier"
                "--cannot rename" % new_iname)

    if does_exist:
        # {{{ check that the domains match up

        dom = knl.get_inames_domain(frozenset((old_iname, new_iname)))

        var_dict = dom.get_var_dict()
        _, old_idx = var_dict[old_iname]
        _, new_idx = var_dict[new_iname]

        par_idx = dom.dim(dim_type.param)
        dom_old = dom.move_dims(
                dim_type.param, par_idx, dim_type.set, old_idx, 1)
        dom_old = dom_old.move_dims(
                dim_type.set, dom_old.dim(dim_type.set), dim_type.param, par_idx, 1)
        dom_old = dom_old.project_out(
                dim_type.set, new_idx if new_idx < old_idx else new_idx - 1, 1)

        par_idx = dom.dim(dim_type.param)
        dom_new = dom.move_dims(
                dim_type.param, par_idx, dim_type.set, new_idx, 1)
        dom_new = dom_new.move_dims(
                dim_type.set, dom_new.dim(dim_type.set), dim_type.param, par_idx, 1)
        dom_new = dom_new.project_out(
                dim_type.set, old_idx if old_idx < new_idx else old_idx - 1, 1)

        if not (dom_old <= dom_new and dom_new <= dom_old):
            raise LoopyError(
                    "inames {old} and {new} do not iterate over the same domain"
                    .format(old=old_iname, new=new_iname))

        # }}}

        from pymbolic import var
        subst_dict = {old_iname: var(new_iname)}

        from loopy.context_matching import parse_stack_match
        within = parse_stack_match(within)

        from pymbolic.mapper.substitutor import make_subst_func
        rule_mapping_context = SubstitutionRuleMappingContext(
                knl.substitutions, var_name_gen)
        ijoin = RuleAwareSubstitutionMapper(rule_mapping_context,
                        make_subst_func(subst_dict), within)

        knl = rule_mapping_context.finish_kernel(
                ijoin.map_kernel(knl))

        new_instructions = []
        for insn in knl.instructions:
            if (old_iname in insn.forced_iname_deps
                    and within(knl, insn, ())):
                insn = insn.copy(
                        forced_iname_deps=(
                            (insn.forced_iname_deps - frozenset([old_iname]))
                            | frozenset([new_iname])))

            new_instructions.append(insn)

        knl = knl.copy(instructions=new_instructions)

    else:
        knl = duplicate_inames(
                knl, [old_iname], within=within, new_inames=[new_iname])

    knl = remove_unused_inames(knl, [old_iname])

    return knl
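
# A hedged usage sketch (hypothetical kernel; assumes a loopy version that
# exports this transform as lp.rename_iname): with existing_ok=True, iname
# 'j' is folded into the identically bounded, already existing iname 'i'.
import loopy as lp

knl = lp.make_kernel(
        "{[i, j]: 0 <= i < n and 0 <= j < n}",
        """
        a[i] = i
        b[j] = j
        """)
knl = lp.rename_iname(knl, "j", "i", existing_ok=True)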
Exemple #46
0
def _fix_parameter(kernel, name, value):
    def process_set(s):
        var_dict = s.get_var_dict()

        try:
            dt, idx = var_dict[name]
        except KeyError:
            return s

        value_aff = isl.Aff.zero_on_domain(s.space) + value

        from loopy.isl_helpers import iname_rel_aff

        name_equal_value_aff = iname_rel_aff(s.space, name, "==", value_aff)

        s = s.add_constraint(
                isl.Constraint.equality_from_aff(name_equal_value_aff)
                ).project_out(dt, idx, 1)

        return s

    new_domains = [process_set(dom) for dom in kernel.domains]

    from pymbolic.mapper.substitutor import make_subst_func

    subst_func = make_subst_func({name: value})

    from loopy.symbolic import SubstitutionMapper, PartialEvaluationMapper

    subst_map = SubstitutionMapper(subst_func)
    ev_map = PartialEvaluationMapper()

    def map_expr(expr):
        return ev_map(subst_map(expr))

    from loopy.kernel.array import ArrayBase

    new_args = []
    for arg in kernel.args:
        if arg.name == name:
            # remove from argument list
            continue

        if not isinstance(arg, ArrayBase):
            new_args.append(arg)
        else:
            new_args.append(arg.map_exprs(map_expr))

    new_temp_vars = {}
    for tv in six.itervalues(kernel.temporary_variables):
        new_temp_vars[tv.name] = tv.map_exprs(map_expr)

    from loopy.match import parse_stack_match

    within = parse_stack_match(None)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    esubst_map = RuleAwareSubstitutionMapper(
            rule_mapping_context, subst_func, within=within)
    return rule_mapping_context.finish_kernel(
            esubst_map.map_kernel(kernel)).copy(
        domains=new_domains,
        args=new_args,
        temporary_variables=new_temp_vars,
        assumptions=process_set(kernel.assumptions),
    )
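
# A minimal sketch of the substitute-then-simplify step above, using plain
# pymbolic (assumes pymbolic is installed); the loopy entry point that wraps
# this helper is fix_parameters, e.g. lp.fix_parameters(knl, n=128).
from pymbolic import parse
from pymbolic.mapper.substitutor import SubstitutionMapper, make_subst_func

subst_map = SubstitutionMapper(make_subst_func({"n": 128}))
print(subst_map(parse("n*m + n")))  # 128*m + 128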
Exemple #47
0
def _process_footprint_subscripts(kernel, rule_name, sweep_inames,
        footprint_subscripts, arg):
    """Track applied iname rewrites, deal with slice specifiers ':'."""

    name_gen = kernel.get_var_name_generator()

    from pymbolic.primitives import Variable

    if footprint_subscripts is None:
        return kernel, rule_name, sweep_inames, []

    if not isinstance(footprint_subscripts, (list, tuple)):
        footprint_subscripts = [footprint_subscripts]

    inames_to_be_removed = []

    new_footprint_subscripts = []
    for fsub in footprint_subscripts:
        if isinstance(fsub, str):
            from loopy.symbolic import parse
            fsub = parse(fsub)

        if not isinstance(fsub, tuple):
            fsub = (fsub,)

        if len(fsub) != arg.num_user_axes():
            raise ValueError("sweep index '%s' has the wrong number of dimensions"
                    % str(fsub))

        for subst_map in kernel.applied_iname_rewrites:
            from loopy.symbolic import SubstitutionMapper
            from pymbolic.mapper.substitutor import make_subst_func
            fsub = SubstitutionMapper(make_subst_func(subst_map))(fsub)

        from loopy.symbolic import get_dependencies
        fsub_dependencies = get_dependencies(fsub)

        new_fsub = []
        for axis_nr, fsub_axis in enumerate(fsub):
            from pymbolic.primitives import Slice
            if isinstance(fsub_axis, Slice):
                if fsub_axis.children != (None,):
                    raise NotImplementedError("add_prefetch only "
                            "supports full slices")

                axis_name = name_gen(
                        based_on="%s_fetch_axis_%d" % (arg.name, axis_nr))

                kernel = _add_kernel_axis(kernel, axis_name, 0, arg.shape[axis_nr],
                        frozenset(sweep_inames) | fsub_dependencies)
                sweep_inames = sweep_inames + [axis_name]

                inames_to_be_removed.append(axis_name)
                new_fsub.append(Variable(axis_name))

            else:
                new_fsub.append(fsub_axis)

        new_footprint_subscripts.append(tuple(new_fsub))
        del new_fsub

    footprint_subscripts = new_footprint_subscripts
    del new_footprint_subscripts

    subst_use = [Variable(rule_name)(*si) for si in footprint_subscripts]
    return kernel, subst_use, sweep_inames, inames_to_be_removed
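
# A minimal sketch of the full-slice check above (assumes pymbolic is
# installed): the ':' axis specifier is a Slice whose children are (None,),
# which is what triggers the creation of a dedicated fetch axis.
from pymbolic.primitives import Slice, Variable

fsub = (Variable("i"), Slice((None,)))   # footprint subscript akin to a[i, :]
print(isinstance(fsub[1], Slice) and fsub[1].children == (None,))  # True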
Exemple #48
0
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """
    :arg inames: fastest varying last
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    """

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor*iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        from loopy.isl_helpers import (
                static_max_of_pw_aff, static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            raise type(e)("while finding lower bound of '%s': %s"
                    % (iname, e))

        my_val = var(new_iname) // base_divisor
        if i+1 < len(inames):
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(), new_iname, "==", joint_aff)))

    for i, iname in enumerate(inames):
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_forced_iname_deps(fid):
        result = set()
        for iname in fid:
            if iname in inames:
                result.add(new_iname)
            else:
                result.add(iname)

        return frozenset(result)

    new_insns = [
            insn.copy(
                forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
                ))

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
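
# A hedged usage sketch (hypothetical kernel; assumes loopy is importable as
# lp): joining i and j (fastest-varying last) of a 4 x 8 domain into a single
# iname of length 32; the body then sees i = i_and_j // 8 and j = i_and_j % 8
# through the substitution map built above.
import loopy as lp

knl = lp.make_kernel(
        "{[i, j]: 0 <= i < 4 and 0 <= j < 8}",
        "out[i, j] = 2 * a[i, j]")
knl = lp.join_inames(knl, ["i", "j"], "i_and_j")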
Exemple #49
0
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
        store_expression=None, within=None, default_tag="l.auto",
        temporary_scope=None, temporary_is_local=None,
        fetch_bounding_box=False):
    """
    :arg init_expression: Either *None* (indicating the prior value of the buffered
        array should be read) or an expression optionally involving the
        variable 'base' (which references the associated location in the array
        being buffered).
    :arg store_expression: Either *None*, *False*, or an expression involving
        variables 'base' and 'buffer' (without array indices).
        (*None* indicates that a default storage instruction should be used,
        *False* indicates that no storing of the temporary should occur
        at all.)
    """

    # {{{ unify temporary_scope / temporary_is_local

    from loopy.kernel.data import temp_var_scope
    if temporary_is_local is not None:
        from warnings import warn
        warn("temporary_is_local is deprecated. Use temporary_scope instead",
                DeprecationWarning, stacklevel=2)

        if temporary_scope is not None:
            raise LoopyError("may not specify both temporary_is_local and "
                    "temporary_scope")

        if temporary_is_local:
            temporary_scope = temp_var_scope.LOCAL
        else:
            temporary_scope = temp_var_scope.PRIVATE

    del temporary_is_local

    # }}}

    # {{{ process arguments

    if isinstance(init_expression, str):
        from loopy.symbolic import parse
        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse
        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [s.strip()
                for s in buffer_inames.split(",") if s.strip()]

    for iname in buffer_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase
    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        var_shape = ()

    if temporary_scope is None:
        import loopy as lp
        temporary_scope = lp.auto

    # }}}

    # {{{ caching

    from loopy import CACHING_ENABLED

    from loopy.preprocess import prepare_for_caching
    key_kernel = prepare_for_caching(kernel)
    cache_key = (key_kernel, var_name, tuple(buffer_inames),
            PymbolicExpressionHashWrapper(init_expression),
            PymbolicExpressionHashWrapper(store_expression), within,
            default_tag, temporary_scope, fetch_bounding_box)

    if CACHING_ENABLED:
        try:
            result = buffer_array_cache[cache_key]
            logger.info("%s: buffer_array cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    access_descriptors = []
    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        from pymbolic.primitives import Variable, Subscript
        from loopy.symbolic import LinearSubscript

        for assignee in insn.assignees:
            if isinstance(assignee, Variable):
                assignee_name = assignee.name
                index = ()

            elif isinstance(assignee, Subscript):
                assignee_name = assignee.aggregate.name
                index = assignee.index_tuple

            elif isinstance(assignee, LinearSubscript):
                if assignee.aggregate.name == var_name:
                    raise LoopyError("buffer_array may not be applied in the "
                            "presence of linear write indexing into '%s'" % var_name)
                else:
                    # Linearly indexed writes to other variables are not
                    # relevant to this buffer; skip them.
                    continue

            else:
                raise LoopyError("invalid lvalue '%s'" % assignee)

            if assignee_name == var_name:
                within_inames.update(
                        (get_dependencies(index) & kernel.all_inames())
                        - buffer_inames_set)
                access_descriptors.append(
                        AccessDescriptor(
                            identifier=insn.id,
                            storage_axis_exprs=index))

    # {{{ find fetch/store inames

    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for i in range(len(var_shape)):
        dim_name = str(i)
        if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None:
            dim_name = var_descr.dim_names[i]

        init_iname = var_name_gen("%s_init_%s" % (var_name, dim_name))
        store_iname = var_name_gen("%s_store_%s" % (var_name, dim_name))

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("buffer iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
                access_descriptors, len(var_shape))

        for i in range(len(var_shape)):
            if abm.non1_storage_axis_flags[i]:
                non1_init_inames.append(init_inames[i])
                non1_store_inames.append(store_inames[i])
            else:
                del new_iname_to_tag[init_inames[i]]
                del new_iname_to_tag[store_inames[i]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_init_inames,
                    boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_store_inames,
                    boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name+"_buf")

    new_temporary_variables = kernel.temporary_variables.copy()
    temp_var = lp.TemporaryVariable(
            name=buf_var_name,
            dtype=var_descr.dtype,
            base_indices=(0,)*len(abm.non1_storage_shape),
            shape=tuple(abm.non1_storage_shape),
            scope=temporary_scope)

    new_temporary_variables[buf_var_name] = temp_var

    # }}}

    new_insns = []

    buf_var = var(buf_var_name)

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(
                tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    init_subscript = []
    init_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_init_inames[init_iname_idx])
                init_iname_idx += 1
            init_subscript.append(ax_subscript)

    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        init_expression = SubstitutionMapper(
                make_subst_func({
                    "base": init_base,
                    }))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name)
    from loopy.kernel.data import Assignment
    init_instruction = Assignment(id=init_insn_id,
                assignee=buf_var_init,
                expression=init_expression,
                forced_iname_deps=(
                    frozenset(within_inames)
                    | frozenset(non1_init_inames)),
                depends_on=frozenset(),
                depends_on_is_final=True)

    # }}}

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
            within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    did_write = False
    for insn_id in aar.modified_insn_ids:
        insn = kernel.id_to_insn[insn_id]
        if buf_var_name in insn.assignee_var_names():
            did_write = True

    # {{{ add init_insn_id to depends_on

    new_insns = []

    def none_to_empty_set(s):
        if s is None:
            return frozenset()
        else:
            return s

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(
                    insn.copy(
                        depends_on=(
                            none_to_empty_set(insn.depends_on)
                            | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(
                tuple(var(iname) for iname in non1_store_inames))

    store_subscript = []
    store_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_store_inames[store_iname_idx])
                store_iname_idx += 1
            store_subscript.append(ax_subscript)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is None:
        store_expression = buf_var_store
    else:
        store_expression = SubstitutionMapper(
                make_subst_func({
                    "base": store_target,
                    "buffer": buf_var_store,
                    }))(store_expression)

    if store_expression is not False:
        from loopy.kernel.data import Assignment
        store_instruction = Assignment(
                    id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
                    depends_on=frozenset(aar.modified_insn_ids),
                    no_sync_with=frozenset([init_insn_id]),
                    assignee=store_target,
                    expression=store_expression,
                    forced_iname_deps=(
                        frozenset(within_inames)
                        | frozenset(non1_store_inames)))
    else:
        did_write = False

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)
    else:
        for iname in store_inames:
            del new_iname_to_tag[iname]

    kernel = kernel.copy(
            domains=new_kernel_domains,
            instructions=new_insns,
            temporary_variables=new_temporary_variables)

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.tools import assign_automatic_axes
    kernel = assign_automatic_axes(kernel)

    if CACHING_ENABLED:
        from loopy.preprocess import prepare_for_caching
        buffer_array_cache[cache_key] = prepare_for_caching(kernel)

    return kernel
Exemple #50
0
def _fuse_two_kernels(knla, knlb):
    from loopy.kernel import kernel_state
    if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL:
        raise LoopyError("can only fuse kernels in INITIAL state")

    # {{{ fuse domains

    new_domains = knla.domains[:]

    for dom_b in knlb.domains:
        i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains)
        if i_fuse is None:
            new_domains.append(dom_b)
        else:
            dom_a = new_domains[i_fuse]
            dom_a, dom_b = isl.align_two(dom_a, dom_b)

            shared_inames = list(
                    set(dom_a.get_var_dict(dim_type.set))
                    &
                    set(dom_b.get_var_dict(dim_type.set)))

            dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set])
            dom_b_s = dom_b.project_out_except(shared_inames, [dim_type.set])

            if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s):
                raise LoopyError("kernels do not agree on domain of "
                        "inames '%s'" % (",".join(shared_inames)))

            new_domain = dom_a & dom_b

            new_domains[i_fuse] = new_domain

    # }}}

    vng = knla.get_var_name_generator()
    b_var_renames = {}

    # {{{ fuse args

    new_args = knla.args[:]
    for b_arg in knlb.args:
        if b_arg.name not in knla.arg_dict:
            new_arg_name = vng(b_arg.name)

            if new_arg_name != b_arg.name:
                b_var_renames[b_arg.name] = var(new_arg_name)

            new_args.append(b_arg.copy(name=new_arg_name))
        else:
            if b_arg != knla.arg_dict[b_arg.name]:
                raise LoopyError(
                        "argument '%s' has inconsistent definition between "
                        "the two kernels being merged" % b_arg.name)

    # }}}

    # {{{ fuse temporaries

    new_temporaries = knla.temporary_variables.copy()
    for b_name, b_tv in six.iteritems(knlb.temporary_variables):
        assert b_name == b_tv.name

        new_tv_name = vng(b_name)

        if new_tv_name != b_name:
            b_var_renames[b_name] = var(new_tv_name)

        assert new_tv_name not in new_temporaries
        new_temporaries[new_tv_name] = b_tv.copy(name=new_tv_name)

    # }}}

    # {{{ apply renames in kernel b

    from loopy.symbolic import (
            SubstitutionRuleMappingContext,
            RuleAwareSubstitutionMapper)
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.context_matching import parse_stack_match

    srmc = SubstitutionRuleMappingContext(
            knlb.substitutions, knlb.get_var_name_generator())
    subst_map = RuleAwareSubstitutionMapper(
            srmc, make_subst_func(b_var_renames),
            within=parse_stack_match(None))
    knlb = subst_map.map_kernel(knlb)

    # }}}

    # {{{ fuse instructions

    new_instructions = knla.instructions[:]
    from pytools import UniqueNameGenerator
    insn_id_gen = UniqueNameGenerator(
            set([insna.id for insna in new_instructions]))

    knl_b_instructions = []
    old_b_id_to_new_b_id = {}
    for insnb in knlb.instructions:
        old_id = insnb.id
        new_id = insn_id_gen(old_id)
        old_b_id_to_new_b_id[old_id] = new_id

        knl_b_instructions.append(
                insnb.copy(id=new_id))

    for insnb in knl_b_instructions:
        new_instructions.append(
                insnb.copy(
                    insn_deps=frozenset(
                        old_b_id_to_new_b_id[dep_id]
                        for dep_id in insnb.insn_deps)))

    # }}}

    # {{{ fuse assumptions

    assump_a = knla.assumptions
    assump_b = knlb.assumptions
    assump_a, assump_b = isl.align_two(assump_a, assump_b)

    shared_param_names = list(
            set(assump_a.get_var_dict(dim_type.param))
            &
            set(assump_b.get_var_dict(dim_type.param)))

    assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
    assump_b_s = assump_b.project_out_except(shared_param_names, [dim_type.param])

    if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s):
        raise LoopyError("assumptions do not agree on kernels to be merged")

    new_assumptions = (assump_a & assump_b).params()

    # }}}

    from loopy.kernel import LoopKernel
    return LoopKernel(
            domains=new_domains,
            instructions=new_instructions,
            args=new_args,
            name="%s_and_%s" % (knla.name, knlb.name),
            preambles=_ordered_merge_lists(knla.preambles, knlb.preambles),
            preamble_generators=_ordered_merge_lists(
                knla.preamble_generators, knlb.preamble_generators),
            assumptions=new_assumptions,
            local_sizes=_merge_dicts(
                "local size", knla.local_sizes, knlb.local_sizes),
            temporary_variables=new_temporaries,
            iname_to_tag=_merge_dicts(
                "iname-to-tag mapping",
                knla.iname_to_tag,
                knlb.iname_to_tag),
            substitutions=_merge_dicts(
                "substitution",
                knla.substitutions,
                knlb.substitutions),
            function_manglers=_ordered_merge_lists(
                knla.function_manglers,
                knlb.function_manglers),
            symbol_manglers=_ordered_merge_lists(
                knla.symbol_manglers,
                knlb.symbol_manglers),

            iname_slab_increments=_merge_dicts(
                "iname slab increment",
                knla.iname_slab_increments,
                knlb.iname_slab_increments),
            loop_priority=_ordered_merge_lists(
                knla.loop_priority,
                knlb.loop_priority),
            silenced_warnings=_ordered_merge_lists(
                knla.silenced_warnings,
                knlb.silenced_warnings),
            applied_iname_rewrites=_ordered_merge_lists(
                knla.applied_iname_rewrites,
                knlb.applied_iname_rewrites),
            index_dtype=_merge_values(
                "index dtype",
                knla.index_dtype,
                knlb.index_dtype),
            target=_merge_values(
                "target",
                knla.target,
                knlb.target),
            options=knla.options)
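
# A hedged usage sketch (hypothetical kernels; assumes loopy is importable as
# lp): the public fuse_kernels entry point drives this pairwise helper,
# merging domains, arguments, temporaries, and instructions.
import loopy as lp

knl_a = lp.make_kernel("{[i]: 0 <= i < n}", "x[i] = 1")
knl_b = lp.make_kernel("{[i]: 0 <= i < n}", "y[i] = 2")
fused = lp.fuse_kernels([knl_a, knl_b])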
Exemple #51
0
def precompute(
        kernel,
        subst_use,
        sweep_inames=[],
        within=None,
        storage_axes=None,
        temporary_name=None,
        precompute_inames=None,
        precompute_outer_inames=None,
        storage_axis_to_tag={},

        # "None" is a valid value here, distinct from the default.
        default_tag=_not_provided,
        dtype=None,
        fetch_bounding_box=False,
        temporary_address_space=None,
        compute_insn_id=None,
        **kwargs):
    """Precompute the expression described in the substitution rule determined by
    *subst_use* and store it in a temporary array. A precomputation needs two
    things to operate, a list of *sweep_inames* (order irrelevant) and an
    ordered list of *storage_axes* (whose order will describe the axis ordering
    of the temporary array).

    :arg subst_use: Describes what to prefetch.

        The following objects may be given for *subst_use*:

        * The name of the substitution rule.

        * The tagged name ("name$tag") of the substitution rule.

        * A list of invocations of the substitution rule.
          This list of invocations, when swept across *sweep_inames*, then serves
          to define the footprint of the precomputation.

          Invocations may be tagged ("name$tag") to filter out a subset of the
          usage sites of the substitution rule. (Namely those usage sites that
          use the same tagged name.)

          Invocations may be given as a string or as a
          :class:`pymbolic.primitives.Expression` object.

          If only one invocation is to be given, then the only entry of the list
          may be given directly.

    If the list of invocations generating the footprint is not given,
    all (tag-matching, if desired) usage sites of the substitution rule
    are used to determine the footprint.

    The following cases can arise for each sweep axis:

    * The axis is an iname that occurs within arguments specified at
      usage sites of the substitution rule. This case is assumed covered
      by the storage axes provided for the argument.

    * The axis is an iname that occurs within the *value* of the rule, but not
      within its arguments. A new, dedicated storage axis is allocated for
      such an axis.

    :arg sweep_inames: A :class:`list` of inames to be swept.
        May also equivalently be a comma-separated string.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg storage_axes: A :class:`list` of inames and/or rule argument
        names/indices to be used as storage axes.
        May also equivalently be a comma-separated string.
    :arg temporary_name:
        The temporary variable name to use for storing the precomputed data.
        If it does not exist, it will be created. If it does exist, its properties
        (such as size, type) are checked (and updated, if possible) to match
        its use.
    :arg precompute_inames:
        A tuple of inames to be used to carry out the precomputation.
        If the specified inames do not already exist, they will be
        created. If they do already exist, their loop domain is verified
        against the one required for this precomputation. This tuple may
        be shorter than the (provided or automatically found) *storage_axes*
        tuple, in which case names will be automatically created.
        May also equivalently be a comma-separated string.

    :arg precompute_outer_inames: A :class:`frozenset` of inames within which
        the compute instruction is nested. If *None*, make an educated guess.
        May also be specified as a comma-separated string.

    :arg default_tag: The :ref:`iname tag <iname-tags>` to be applied to the
        inames created to perform the precomputation. The current default will
        make them local axes and automatically split them to fit the work
        group size, but this default will disappear in favor of simply leaving them
        untagged in 2019. For 2018, a warning will be issued if no *default_tag* is
        specified.

    :arg compute_insn_id: The ID of the instruction generated to perform the
        precomputation.

    If `storage_axes` is not specified, it defaults to the arrangement
    `<direct sweep axes><arguments>` with the direct sweep axes being the
    slower-varying indices.

    Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are
    eliminated.
    """

    # {{{ unify temporary_address_space / temporary_scope

    temporary_scope = kwargs.pop("temporary_scope", None)

    from loopy.kernel.data import AddressSpace
    if temporary_scope is not None:
        from warnings import warn
        warn(
            "temporary_scope is deprecated. Use temporary_address_space instead",
            DeprecationWarning,
            stacklevel=2)

        if temporary_address_space is not None:
            raise LoopyError(
                "may not specify both temporary_address_space and "
                "temporary_scope")

        temporary_address_space = temporary_scope

    del temporary_scope

    # }}}

    if kwargs:
        raise TypeError("unrecognized keyword arguments: %s" %
                        ", ".join(kwargs.keys()))

    # {{{ check, standardize arguments

    if isinstance(sweep_inames, str):
        sweep_inames = [iname.strip() for iname in sweep_inames.split(",")]

    for iname in sweep_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname" % iname)

    sweep_inames = list(sweep_inames)
    sweep_inames_set = frozenset(sweep_inames)

    if isinstance(storage_axes, str):
        storage_axes = [ax.strip() for ax in storage_axes.split(",")]

    if isinstance(precompute_inames, str):
        precompute_inames = [
            iname.strip() for iname in precompute_inames.split(",")
        ]

    if isinstance(precompute_outer_inames, str):
        precompute_outer_inames = frozenset(
            iname.strip() for iname in precompute_outer_inames.split(","))

    if isinstance(subst_use, str):
        subst_use = [subst_use]

    footprint_generators = None

    subst_name = None
    subst_tag = None

    from pymbolic.primitives import Variable, Call
    from loopy.symbolic import parse, TaggedVariable

    for use in subst_use:
        if isinstance(use, str):
            use = parse(use)

        if isinstance(use, Call):
            if footprint_generators is None:
                footprint_generators = []

            footprint_generators.append(use)
            subst_name_as_expr = use.function
        else:
            subst_name_as_expr = use

        if isinstance(subst_name_as_expr, TaggedVariable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = subst_name_as_expr.tag
        elif isinstance(subst_name_as_expr, Variable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = None
        else:
            raise ValueError("unexpected type of subst_name")

        if (subst_name, subst_tag) == (None, None):
            subst_name, subst_tag = new_subst_name, new_subst_tag
        else:
            if (subst_name, subst_tag) != (new_subst_name, new_subst_tag):
                raise ValueError("not all uses in subst_use agree "
                                 "on rule name and tag")

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    try:
        subst = kernel.substitutions[subst_name]
    except KeyError:
        raise LoopyError("substitution rule '%s' not found" % subst_name)

    c_subst_name = subst_name.replace(".", "_")

    # {{{ handle default_tag

    from loopy.transform.data import _not_provided \
            as transform_data_not_provided

    if default_tag is _not_provided or default_tag is transform_data_not_provided:
        # no need to warn for scalar precomputes
        if sweep_inames:
            from warnings import warn
            warn(
                "Not specifying default_tag is deprecated, and default_tag "
                "will become mandatory in 2019.x. "
                "Pass 'default_tag=\"l.auto\" to match the current default, "
                "or Pass 'default_tag=None to leave the loops untagged, which "
                "is the recommended behavior.",
                DeprecationWarning,
                stacklevel=(

                    # In this case, we came here through add_prefetch. Increase
                    # the stacklevel.
                    3 if default_tag is transform_data_not_provided else 2))

        default_tag = "l.auto"

    from loopy.kernel.data import parse_tag
    default_tag = parse_tag(default_tag)

    # }}}

    # }}}

    # {{{ process invocations in footprint generators, start access_descriptors

    if footprint_generators:
        from pymbolic.primitives import Variable, Call

        access_descriptors = []

        for fpg in footprint_generators:
            if isinstance(fpg, Variable):
                args = ()
            elif isinstance(fpg, Call):
                args = fpg.parameters
            else:
                raise ValueError("footprint generator must "
                                 "be substitution rule invocation")

            access_descriptors.append(
                RuleAccessDescriptor(identifier=access_descriptor_id(
                    args, None),
                                     args=args))

    # }}}

    # {{{ gather up invocations in kernel code, finish access_descriptors

    if not footprint_generators:
        rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
        invg = RuleInvocationGatherer(rule_mapping_context, kernel, subst_name,
                                      subst_tag, within)
        del rule_mapping_context

        import loopy as lp
        for insn in kernel.instructions:
            if isinstance(insn, lp.MultiAssignmentBase):
                for assignee in insn.assignees:
                    invg(assignee, kernel, insn)
                invg(insn.expression, kernel, insn)

        access_descriptors = invg.access_descriptors
        if not access_descriptors:
            raise RuntimeError("no invocations of '%s' found" % subst_name)

    # }}}

    # {{{ find inames used in arguments

    expanding_usage_arg_deps = set()

    for accdesc in access_descriptors:
        for arg in accdesc.args:
            expanding_usage_arg_deps.update(
                get_dependencies(arg) & kernel.all_inames())

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    # {{{ use given / find new storage_axes

    # extra axes made necessary because they don't occur in the arguments
    extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps)

    from loopy.symbolic import SubstitutionRuleExpander
    submap = SubstitutionRuleExpander(kernel.substitutions)

    value_inames = (get_dependencies(submap(subst.expression)) -
                    frozenset(subst.arguments)) & kernel.all_inames()
    if value_inames - expanding_usage_arg_deps < extra_storage_axes:
        raise RuntimeError("unreferenced sweep inames specified: " +
                           ", ".join(extra_storage_axes - value_inames -
                                     expanding_usage_arg_deps))

    new_iname_to_tag = {}

    if storage_axes is None:
        storage_axes = []

        # Add sweep_inames (in given--rather than arbitrary--order) to
        # storage_axes *if* they are part of extra_storage_axes.
        for iname in sweep_inames:
            if iname in extra_storage_axes:
                extra_storage_axes.remove(iname)
                storage_axes.append(iname)

        if extra_storage_axes:
            if (precompute_inames is not None
                    and len(storage_axes) < len(precompute_inames)):
                raise LoopyError(
                    "must specify a sufficient number of "
                    "storage_axes to uniquely determine the meaning "
                    "of the given precompute_inames. (%d storage_axes "
                    "needed)" % len(precompute_inames))
            storage_axes.extend(sorted(extra_storage_axes))

        storage_axes.extend(range(len(subst.arguments)))

    del extra_storage_axes

    prior_storage_axis_name_dict = {}

    storage_axis_names = []
    storage_axis_sources = []  # number for arg#, or iname

    # {{{ check for pre-existing precompute_inames

    if precompute_inames is not None:
        preexisting_precompute_inames = (set(precompute_inames)
                                         & kernel.all_inames())
    else:
        preexisting_precompute_inames = set()

    # }}}

    for i, saxis in enumerate(storage_axes):
        tag_lookup_saxis = saxis

        if saxis in subst.arguments:
            saxis = subst.arguments.index(saxis)

        storage_axis_sources.append(saxis)

        if isinstance(saxis, int):
            # argument index
            name = old_name = subst.arguments[saxis]
        else:
            old_name = saxis
            name = "%s_%s" % (c_subst_name, old_name)

        if (precompute_inames is not None and i < len(precompute_inames)
                and precompute_inames[i]):
            name = precompute_inames[i]
            tag_lookup_saxis = name
            if (name not in preexisting_precompute_inames
                    and var_name_gen.is_name_conflicting(name)):
                raise RuntimeError("new storage axis name '%s' "
                                   "conflicts with existing name" % name)
        else:
            name = var_name_gen(name)

        storage_axis_names.append(name)
        if name not in preexisting_precompute_inames:
            new_iname_to_tag[name] = storage_axis_to_tag.get(
                tag_lookup_saxis, default_tag)

        prior_storage_axis_name_dict[name] = old_name

    del storage_axis_to_tag
    del storage_axes
    del precompute_inames

    # }}}

    # {{{ fill out access_descriptors[...].storage_axis_exprs

    access_descriptors = [
        accdesc.copy(storage_axis_exprs=storage_axis_exprs(
            storage_axis_sources, accdesc.args))
        for accdesc in access_descriptors
    ]

    # }}}

    expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps)
    assert expanding_inames <= kernel.all_inames()

    if storage_axis_names:
        # {{{ find domain to be changed

        change_inames = expanding_inames | preexisting_precompute_inames

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, change_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in sweep_inames_set:
                if kernel.get_home_domain_index(
                        iname) != domch.leaf_domain_index:
                    raise RuntimeError(
                        "sweep iname '%s' is not 'at home' in the "
                        "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
                               access_descriptors, len(storage_axis_names))

        non1_storage_axis_names = []
        for i, saxis in enumerate(storage_axis_names):
            if abm.non1_storage_axis_flags[i]:
                non1_storage_axis_names.append(saxis)
            else:
                del new_iname_to_tag[saxis]

                if saxis in preexisting_precompute_inames:
                    raise LoopyError(
                        "precompute axis %d (1-based) was "
                        "eliminated as "
                        "having length 1 but also mapped to existing "
                        "iname '%s'" % (i + 1, saxis))

        mod_domain = domch.domain

        # {{{ modify the domain, taking into account preexisting inames

        # inames may already exist in mod_domain, add them primed to start
        primed_non1_saxis_names = [
            iname + "'" for iname in non1_storage_axis_names
        ]

        mod_domain = abm.augment_domain_with_sweep(
            domch.domain,
            primed_non1_saxis_names,
            boxify_sweep=fetch_bounding_box)

        check_domain = mod_domain

        for i, saxis in enumerate(non1_storage_axis_names):
            var_dict = mod_domain.get_var_dict(isl.dim_type.set)

            if saxis in preexisting_precompute_inames:
                # add equality constraint between existing and new variable

                dt, dim_idx = var_dict[saxis]
                saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt,
                                                  dim_idx)

                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt,
                                                    dim_idx)

                mod_domain = mod_domain.add_constraint(
                    isl.Constraint.equality_from_aff(new_var_aff - saxis_aff))

                # project out the new one
                mod_domain = mod_domain.project_out(dt, dim_idx, 1)

            else:
                # remove the prime from the new variable
                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)

        def add_assumptions(d):
            assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
            assumptions, domain = isl.align_two(assumption_non_param, d)
            return assumptions & domain

        # {{{ check that we got the desired domain

        check_domain = add_assumptions(
            check_domain.project_out_except(primed_non1_saxis_names,
                                            [isl.dim_type.set]))

        mod_check_domain = add_assumptions(mod_domain)

        # re-add the prime to the new variables
        var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)

        for saxis in non1_storage_axis_names:
            dt, dim_idx = var_dict[saxis]
            mod_check_domain = mod_check_domain.set_dim_name(
                dt, dim_idx, saxis + "'")

        mod_check_domain = mod_check_domain.project_out_except(
            primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain, check_domain = isl.align_two(mod_check_domain,
                                                       check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("domain of preexisting inames does not match "
                             "domain needed for precompute")

        # }}}

        # {{{ check that we didn't shrink the original domain

        # project out the new names from the modified domain
        orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
        mod_check_domain = add_assumptions(
            mod_domain.project_out_except(orig_domain_inames,
                                          [isl.dim_type.set]))

        check_domain = add_assumptions(domch.domain)

        mod_check_domain, check_domain = isl.align_two(mod_check_domain,
                                                       check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError(
                "original domain got shrunk by applying the precompute")

        # }}}

        # }}}

        new_kernel_domains = domch.get_domains_with(mod_domain)

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        non1_storage_axis_names = []
        abm = NoOpArrayToBufferMap()

    kernel = kernel.copy(domains=new_kernel_domains)

    # {{{ set up compute insn

    if temporary_name is None:
        temporary_name = var_name_gen(based_on=c_subst_name)

    assignee = var(temporary_name)

    if non1_storage_axis_names:
        assignee = assignee[tuple(
            var(iname) for iname in non1_storage_axis_names)]

    # {{{ process substitutions on compute instruction

    storage_axis_subst_dict = {}

    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
        if arg_name in non1_storage_axis_names:
            arg = var(arg_name)
        else:
            arg = 0

        storage_axis_subst_dict[prior_storage_axis_name_dict.get(
            arg_name, arg_name)] = arg + bi

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, kernel.get_var_name_generator())

    from loopy.match import parse_stack_match
    expr_subst_map = RuleAwareSubstitutionMapper(
        rule_mapping_context,
        make_subst_func(storage_axis_subst_dict),
        within=parse_stack_match(None))

    compute_expression = expr_subst_map(subst.expression, kernel, None)

    # }}}

    from loopy.kernel.data import Assignment
    if compute_insn_id is None:
        compute_insn_id = kernel.make_unique_instruction_id(
            based_on=c_subst_name)

    compute_insn = Assignment(
        id=compute_insn_id,
        assignee=assignee,
        expression=compute_expression,
        # within_inames determined below
    )
    compute_dep_id = compute_insn_id
    added_compute_insns = [compute_insn]

    if temporary_address_space == AddressSpace.GLOBAL:
        barrier_insn_id = kernel.make_unique_instruction_id(
            based_on=c_subst_name + "_barrier")
        from loopy.kernel.instruction import BarrierInstruction
        barrier_insn = BarrierInstruction(id=barrier_insn_id,
                                          depends_on=frozenset(
                                              [compute_insn_id]),
                                          synchronization_kind="global",
                                          mem_kind="global")
        compute_dep_id = barrier_insn_id

        added_compute_insns.append(barrier_insn)

    # }}}

    # {{{ substitute rule into expressions in kernel (if within footprint)

    from loopy.symbolic import SubstitutionRuleExpander
    expander = SubstitutionRuleExpander(kernel.substitutions)

    invr = RuleInvocationReplacer(rule_mapping_context,
                                  subst_name,
                                  subst_tag,
                                  within,
                                  access_descriptors,
                                  abm,
                                  storage_axis_names,
                                  storage_axis_sources,
                                  non1_storage_axis_names,
                                  temporary_name,
                                  compute_insn_id,
                                  compute_dep_id,
                                  compute_read_variables=get_dependencies(
                                      expander(compute_expression)))

    kernel = invr.map_kernel(kernel)
    kernel = kernel.copy(instructions=added_compute_insns +
                         kernel.instructions)
    kernel = rule_mapping_context.finish_kernel(kernel)

    # }}}

    # {{{ add dependencies to compute insn

    kernel = kernel.copy(instructions=[
        insn.copy(depends_on=frozenset(invr.compute_insn_depends_on))
        if insn.id == compute_insn_id else insn
        for insn in kernel.instructions
    ])

    # }}}

    # {{{ propagate storage iname subst to dependencies of compute instructions

    from loopy.kernel.tools import find_recursive_dependencies
    compute_deps = find_recursive_dependencies(kernel,
                                               frozenset([compute_insn_id]))

    # FIXME: Need to verify that there are no outside dependencies
    # on compute_deps

    prior_storage_axis_names = frozenset(storage_axis_subst_dict)

    new_insns = []
    for insn in kernel.instructions:
        if (insn.id in compute_deps
                and insn.within_inames & prior_storage_axis_names):
            insn = (insn.with_transformed_expressions(
                lambda expr: expr_subst_map(expr, kernel, insn)).copy(
                    within_inames=frozenset(
                        storage_axis_subst_dict.get(iname, var(iname)).name
                        for iname in insn.within_inames)))

            new_insns.append(insn)
        else:
            new_insns.append(insn)

    kernel = kernel.copy(instructions=new_insns)

    # }}}

    # {{{ determine inames for compute insn

    if precompute_outer_inames is None:
        from loopy.kernel.tools import guess_iname_deps_based_on_var_use
        precompute_outer_inames = (
            frozenset(non1_storage_axis_names)
            | frozenset((expanding_usage_arg_deps | value_inames) -
                        sweep_inames_set)
            | guess_iname_deps_based_on_var_use(kernel, compute_insn))
    else:
        if not isinstance(precompute_outer_inames, frozenset):
            raise TypeError("precompute_outer_inames must be a frozenset")

        precompute_outer_inames = precompute_outer_inames \
                | frozenset(non1_storage_axis_names)

    kernel = kernel.copy(instructions=[
        insn.copy(within_inames=precompute_outer_inames)
        if insn.id == compute_insn_id else insn
        for insn in kernel.instructions
    ])

    # }}}

    # {{{ set up temp variable

    import loopy as lp
    if dtype is not None:
        dtype = np.dtype(dtype)

    if temporary_address_space is None:
        temporary_address_space = lp.auto

    new_temp_shape = tuple(abm.non1_storage_shape)

    new_temporary_variables = kernel.temporary_variables.copy()
    if temporary_name not in new_temporary_variables:
        temp_var = lp.TemporaryVariable(
            name=temporary_name,
            dtype=dtype,
            base_indices=(0, ) * len(new_temp_shape),
            shape=tuple(abm.non1_storage_shape),
            address_space=temporary_address_space,
            dim_names=tuple(non1_storage_axis_names))

    else:
        temp_var = new_temporary_variables[temporary_name]

        # {{{ check and adapt existing temporary

        if temp_var.dtype is lp.auto:
            pass
        elif temp_var.dtype is not lp.auto and dtype is lp.auto:
            dtype = temp_var.dtype
        elif temp_var.dtype is not lp.auto and dtype is not lp.auto:
            if temp_var.dtype != dtype:
                raise LoopyError("Existing and new dtype of temporary '%s' "
                                 "do not match (existing: %s, new: %s)" %
                                 (temporary_name, temp_var.dtype, dtype))

        temp_var = temp_var.copy(dtype=dtype)

        if len(temp_var.shape) != len(new_temp_shape):
            raise LoopyError(
                "Existing and new temporary '%s' do not "
                "have matching number of dimensions ('%d' vs. '%d') " %
                (temporary_name, len(temp_var.shape), len(new_temp_shape)))

        if temp_var.base_indices != (0, ) * len(new_temp_shape):
            raise LoopyError(
                "Existing temporary '%s' does not have all-zero base indices "
                "(existing: %s, expected: %s)" %
                (temporary_name, temp_var.base_indices,
                 (0, ) * len(new_temp_shape)))

        new_temp_shape = tuple(
            max(i, ex_i) for i, ex_i in zip(new_temp_shape, temp_var.shape))

        temp_var = temp_var.copy(shape=new_temp_shape)

        if temporary_address_space == temp_var.address_space:
            pass
        elif temporary_address_space is lp.auto:
            temporary_address_space = temp_var.address_space
        elif temp_var.address_space is lp.auto:
            pass
        else:
            raise LoopyError("Existing and new temporary '%s' do not "
                             "have matching scopes (existing: %s, new: %s)" %
                             (temporary_name,
                              AddressSpace.stringify(temp_var.address_space),
                              AddressSpace.stringify(temporary_address_space)))

        temp_var = temp_var.copy(address_space=temporary_address_space)

        # }}}

    new_temporary_variables[temporary_name] = temp_var

    kernel = kernel.copy(temporary_variables=new_temporary_variables)

    # }}}

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type

    if filter_iname_tags_by_type(new_iname_to_tag.values(),
                                 AutoFitLocalIndexTag):
        from loopy.kernel.tools import assign_automatic_axes
        kernel = assign_automatic_axes(kernel)

    return kernel
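
# A hypothetical usage sketch of the transformation above, assuming it is
# exposed as lp.precompute and that loopy is installed. The kernel, the
# substitution rule "f", and the array names are made up for illustration:
# the rule is materialized into a temporary swept over the "i" loop.
import loopy as lp

usage_knl = lp.make_kernel(
    "{[i]: 0<=i<n}",
    """
    f(x) := 2*a[x]
    out[i] = f(i)
    """,
    assumptions="n>=1")

# default_tag=None leaves the new precompute loop untagged, as recommended
# by the deprecation warning in the code above.
usage_knl = lp.precompute(usage_knl, "f", sweep_inames="i", default_tag=None)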
Example #52
def _inline_call_instruction(caller_knl, callee_knl, call_insn):
    """
    Returns a copy of *caller_knl* with the *call_insn* in the *kernel*
    replaced by inlining *callee_knl* into it within it.

    :arg call_insn: An instance of `loopy.CallInstruction` of the call-site.
    """
    import pymbolic.primitives as prim
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.kernel.data import ValueArg

    # {{{ sanity checks

    assert call_insn.expression.function.name == callee_knl.name

    # }}}

    callee_label = callee_knl.name[:4] + "_"
    vng = caller_knl.get_var_name_generator()
    ing = caller_knl.get_instruction_id_generator()

    # {{{ construct callee->caller name mappings

    # name_map: Mapping[str, str]
    # A mapping from variable names in the callee kernel's namespace to
    # the ones they would be referred by in the caller's namespace post inlining.
    name_map = {}

    # Only consider temporary variables and inames; arguments are mapped
    # according to the invocation in call_insn.
    for name in (callee_knl.all_inames()
                 | set(callee_knl.temporary_variables.keys())):
        new_name = vng(callee_label + name)
        name_map[name] = new_name

    # }}}

    # {{{ iname_to_tags

    # new_inames: caller's inames post inlining (copied so the caller
    # kernel's mapping is not mutated in place)
    new_inames = dict(caller_knl.inames)

    for old_name, callee_iname in callee_knl.inames.items():
        new_name = name_map[old_name]
        new_inames[new_name] = callee_iname.copy(name=new_name)

    # }}}

    # {{{ register callee's temps as caller's

    # new_temps: caller's temps post inlining
    new_temps = caller_knl.temporary_variables.copy()

    for name, tv in callee_knl.temporary_variables.items():
        new_temps[name_map[name]] = tv.copy(name=name_map[name])

    # }}}

    # {{{ get callee args -> parameters passed to the call

    arg_map = {}  # callee arg name -> caller symbols (e.g. SubArrayRef)

    assignees = call_insn.assignees  # writes
    parameters = call_insn.expression.parameters  # reads

    from loopy.kernel.function_interface import get_kw_pos_association
    kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl)

    for i, par in enumerate(parameters):
        arg_map[pos_to_kw[i]] = par

    for i, assignee in enumerate(assignees):
        arg_map[pos_to_kw[-i - 1]] = assignee

    # }}}

    # {{{ process domains/assumptions

    # rename inames
    new_domains = callee_knl.domains.copy()
    for old_iname in callee_knl.all_inames():
        new_domains = [
            rename_iname(dom, old_iname, name_map[old_iname])
            for dom in new_domains
        ]

    # realize domains' dim params in terms of caller's variables
    new_assumptions = callee_knl.assumptions
    for callee_arg_name, param_expr in arg_map.items():
        if isinstance(callee_knl.arg_dict[callee_arg_name], ValueArg):
            new_domains = [
                substitute_into_domain(
                    dom, callee_arg_name, param_expr,
                    get_valid_domain_param_names(caller_knl))
                for dom in new_domains
            ]

            new_assumptions = substitute_into_domain(
                new_assumptions, callee_arg_name, param_expr,
                get_valid_domain_param_names(caller_knl))

    # }}}

    # {{{ rename inames/temporaries in the program

    rule_mapping_context = SubstitutionRuleMappingContext(
        callee_knl.substitutions, vng)
    subst_func = make_subst_func({
        old_name: prim.Variable(new_name)
        for old_name, new_name in name_map.items()
    })
    inames_temps_renamer = RuleAwareSubstitutionMapper(
        rule_mapping_context, subst_func, within=lambda *args: True)

    callee_knl = rule_mapping_context.finish_kernel(
        inames_temps_renamer.map_kernel(callee_knl))

    # }}}

    # {{{ map callee's expressions to get expressions after inlining

    rule_mapping_context = SubstitutionRuleMappingContext(
        callee_knl.substitutions, vng)
    smap = KernelArgumentSubstitutor(rule_mapping_context, caller_knl,
                                     callee_knl, arg_map)

    callee_knl = rule_mapping_context.finish_kernel(
        smap.map_kernel(callee_knl))

    # }}}

    # {{{ generate new ids for instructions

    insn_id_map = {}
    for insn in callee_knl.instructions:
        insn_id_map[insn.id] = ing(callee_label + insn.id)

    # }}}

    # {{{ use NoOp to mark the start and end of callee kernel

    from loopy.kernel.instruction import NoOpInstruction

    noop_start = NoOpInstruction(id=ing(callee_label + "_start"),
                                 within_inames=call_insn.within_inames,
                                 depends_on=call_insn.depends_on)
    noop_end = NoOpInstruction(id=call_insn.id,
                               within_inames=call_insn.within_inames,
                               depends_on=frozenset(insn_id_map.values()))

    # }}}

    # {{{ map callee's instruction ids

    inlined_insns = [noop_start]

    for insn in callee_knl.instructions:
        new_within_inames = (frozenset(name_map[iname]
                                       for iname in insn.within_inames)
                             | call_insn.within_inames)
        new_depends_on = (frozenset(insn_id_map[dep]
                                    for dep in insn.depends_on)
                          | {noop_start.id})
        new_no_sync_with = frozenset(
            (insn_id_map[id], scope) for id, scope in insn.no_sync_with)
        new_id = insn_id_map[insn.id]

        if isinstance(insn, Assignment):
            new_atomicity = tuple(
                type(atomicity)(name_map[atomicity.var_name])
                for atomicity in insn.atomicity)
            insn = insn.copy(id=insn_id_map[insn.id],
                             within_inames=new_within_inames,
                             depends_on=new_depends_on,
                             tags=insn.tags | call_insn.tags,
                             atomicity=new_atomicity,
                             no_sync_with=new_no_sync_with)
        else:
            insn = insn.copy(id=new_id,
                             within_inames=new_within_inames,
                             depends_on=new_depends_on,
                             tags=insn.tags | call_insn.tags,
                             no_sync_with=new_no_sync_with)
        inlined_insns.append(insn)

    inlined_insns.append(noop_end)

    # }}}

    # {{{ swap out call_insn with inlined_instructions

    idx = caller_knl.instructions.index(call_insn)
    new_insns = (caller_knl.instructions[:idx] + inlined_insns +
                 caller_knl.instructions[idx + 1:])

    # }}}

    old_assumptions, new_assumptions = isl.align_two(caller_knl.assumptions,
                                                     new_assumptions)

    return caller_knl.copy(instructions=new_insns,
                           temporary_variables=new_temps,
                           domains=caller_knl.domains + new_domains,
                           assumptions=(old_assumptions.params()
                                        & new_assumptions.params()),
                           inames=new_inames)
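
# A minimal sketch, with made-up names, of the callee->caller renaming idiom
# used above: UniqueNameGenerator picks clash-free caller-side names, and
# make_subst_func/SubstitutionMapper rewrite callee expressions accordingly.
from pytools import UniqueNameGenerator
from pymbolic import parse, var
from pymbolic.mapper.substitutor import SubstitutionMapper, make_subst_func

vng = UniqueNameGenerator({"tmp"})       # "tmp" is already taken in the caller
name_map = {"tmp": vng("callee_tmp")}    # callee name -> caller-side name
renamer = SubstitutionMapper(make_subst_func(
    {old: var(new) for old, new in name_map.items()}))
print(renamer(parse("tmp + i")))         # prints: callee_tmp + i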
Example #53
def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
    assert isinstance(kernel, LoopKernel)
    # FIXME: Does not understand subst rules for now
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    if var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    elif var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    else:
        raise NameError("array '%s' was not found" % var_name)

    # {{{ check/normalize vary_by_axes

    if isinstance(vary_by_axes, str):
        vary_by_axes = vary_by_axes.split(",")

    from loopy.kernel.array import ArrayBase
    if isinstance(var_descr, ArrayBase):
        if var_descr.dim_names is not None:
            name_to_index = {
                name: idx
                for idx, name in enumerate(var_descr.dim_names)
            }
        else:
            name_to_index = {}

        def map_ax_name_to_index(ax):
            if isinstance(ax, str):
                try:
                    return name_to_index[ax]
                except KeyError:
                    raise LoopyError("axis name '%s' not understood " % ax)
            else:
                return ax

        vary_by_axes = [map_ax_name_to_index(ax) for ax in vary_by_axes]

        if (vary_by_axes
                and (min(vary_by_axes) < 0
                     or max(vary_by_axes) > var_descr.num_user_axes())):
            raise LoopyError("vary_by_axes refers to out-of-bounds axis index")

    # }}}

    from pymbolic.mapper.substitutor import make_subst_func
    from pymbolic.primitives import (Sum, Product, is_zero, flattened_sum,
                                     flattened_product, Subscript, Variable)
    from loopy.symbolic import (get_dependencies, SubstitutionMapper,
                                UnidirectionalUnifier)

    # {{{ common factor key list maintenance

    # list of (index_key, common factors found)
    common_factors = []

    def find_unifiable_cf_index(index_key):
        for i, (key, _val) in enumerate(common_factors):
            unif = UnidirectionalUnifier(
                lhs_mapping_candidates=get_dependencies(key))

            unif_result = unif(key, index_key)

            if unif_result:
                assert len(unif_result) == 1
                return i, unif_result[0]

        return None, None

    def extract_index_key(access_expr):
        if isinstance(access_expr, Variable):
            return ()

        elif isinstance(access_expr, Subscript):
            index = access_expr.index_tuple
            return tuple(index[ax] for ax in vary_by_axes)
        else:
            raise ValueError("unexpected type of access_expr")

    def is_assignee(insn):
        return var_name in insn.assignee_var_names()

    def iterate_as(cls, expr):
        if isinstance(expr, cls):
            yield from expr.children
        else:
            yield expr

    # }}}

    # {{{ find common factors

    from loopy.kernel.data import Assignment

    for insn in kernel.instructions:
        if not is_assignee(insn):
            continue

        if not isinstance(insn, Assignment):
            raise LoopyError("'%s' modified by non-single-assignment" %
                             var_name)

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # {{{ doesn't exist yet

            assert unif_result is None

            my_common_factors = None

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                         "in RHS of instruction '%s'" %
                                         (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                if my_common_factors is None:
                    my_common_factors = product_parts
                else:
                    my_common_factors = my_common_factors & product_parts

            if my_common_factors is not None:
                common_factors.append((index_key, my_common_factors))

            # }}}
        else:
            # {{{ match, filter existing common factors

            _, my_common_factors = common_factors[cf_index]

            unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                         "in RHS of instruction '%s'" %
                                         (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                my_common_factors = {
                    cf
                    for cf in my_common_factors
                    if unif_subst_map(cf) in product_parts
                }

            common_factors[cf_index] = (index_key, my_common_factors)

            # }}}

    # }}}

    common_factors = [(ik, cf) for ik, cf in common_factors if cf]

    if not common_factors:
        raise LoopyError("no common factors found")

    # {{{ remove common factors

    new_insns = []

    for insn in kernel.instructions:
        if not isinstance(insn, Assignment) or not is_assignee(insn):
            new_insns.append(insn)
            continue

        index_key = extract_index_key(insn.assignee)

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            new_insns.append(insn)
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            new_insns.append(insn)
            continue

        _, my_common_factors = common_factors[cf_index]

        unif_subst_map = SubstitutionMapper(make_subst_func(unif_result.lmap))

        mapped_my_common_factors = {
            unif_subst_map(cf)
            for cf in my_common_factors
        }

        new_sum_terms = []

        for term in iterate_as(Sum, rhs):
            if term == lhs:
                new_sum_terms.append(term)
                continue

            new_sum_terms.append(
                flattened_product([
                    part for part in iterate_as(Product, term)
                    if part not in mapped_my_common_factors
                ]))

        new_insns.append(insn.copy(expression=flattened_sum(new_sum_terms)))

    # }}}

    # {{{ substitute common factors into usage sites

    def find_substitution(expr):
        if isinstance(expr, Subscript):
            v = expr.aggregate.name
        elif isinstance(expr, Variable):
            v = expr.name
        else:
            return expr

        if v != var_name:
            return expr

        index_key = extract_index_key(expr)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        unif_subst_map = SubstitutionMapper(make_subst_func(unif_result.lmap))

        _, my_common_factors = common_factors[cf_index]

        if my_common_factors is not None:
            return flattened_product(
                [unif_subst_map(cf) for cf in my_common_factors] + [expr])
        else:
            return expr

    insns = new_insns
    new_insns = []

    subm = SubstitutionMapper(find_substitution)

    for insn in insns:
        if not isinstance(insn, Assignment) or is_assignee(insn):
            new_insns.append(insn)
            continue

        new_insns.append(insn.with_transformed_expressions(subm))

    # }}}

    return kernel.copy(instructions=new_insns)
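
# A minimal sketch (assuming loopy is importable) of the unify-then-substitute
# idiom used above: UnidirectionalUnifier matches one index key against
# another, and the resulting lmap drives a SubstitutionMapper that rewrites a
# candidate common factor into the matched site's inames. The expressions here
# are made up for illustration.
from pymbolic import parse
from pymbolic.mapper.substitutor import make_subst_func
from loopy.symbolic import (get_dependencies, SubstitutionMapper,
                            UnidirectionalUnifier)

key, other_key = parse("i"), parse("j")
unif = UnidirectionalUnifier(lhs_mapping_candidates=get_dependencies(key))
unif_result = unif(key, other_key)[0]     # records the mapping i -> j
subst = SubstitutionMapper(make_subst_func(unif_result.lmap))
print(subst(parse("dt*a[i]")))            # prints: dt*a[j]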
Example #54
    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
            lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
        from pymbolic.mapper.stringifier import PREC_NONE

        # FIXME: Could detect operations, generate atomic_{add,...} when
        # appropriate.

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64]:
            from cgen import Block, DoWhile, Assign
            from loopy.target.c import POD
            old_val_var = codegen_state.var_name_generator("loopy_old_val")
            new_val_var = codegen_state.var_name_generator("loopy_new_val")

            from loopy.kernel.data import TemporaryVariable, temp_var_scope
            ecm = codegen_state.expression_to_code_mapper.with_assignments(
                    {
                        old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
                        new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
                        })

            lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

            from pymbolic.mapper.substitutor import make_subst_func
            from pymbolic import var
            from loopy.symbolic import SubstitutionMapper

            subst = SubstitutionMapper(
                    make_subst_func({lhs_expr: var(old_val_var)}))
            rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE,
                    type_context=rhs_type_context,
                    needed_dtype=lhs_dtype)

            if lhs_dtype.numpy_dtype.itemsize == 4:
                func_name = "atomic_cmpxchg"
            elif lhs_dtype.numpy_dtype.itemsize == 8:
                func_name = "atom_cmpxchg"
            else:
                raise LoopyError("unexpected atomic size")

            cast_str = ""
            old_val = old_val_var
            new_val = new_val_var

            if lhs_dtype.numpy_dtype.kind == "f":
                if lhs_dtype.numpy_dtype == np.float32:
                    ctype = "int"
                elif lhs_dtype.numpy_dtype == np.float64:
                    ctype = "long"
                else:
                    assert False

                from loopy.kernel.data import TemporaryVariable, GlobalArg
                if isinstance(lhs_var, GlobalArg):
                    var_kind = "__global"
                elif (
                        isinstance(lhs_var, TemporaryVariable)
                        and lhs_var.scope == temp_var_scope.LOCAL):
                    var_kind = "__local"
                elif (
                        isinstance(lhs_var, TemporaryVariable)
                        and lhs_var.scope == temp_var_scope.GLOBAL):
                    var_kind = "__global"
                else:
                    raise LoopyError("unexpected kind of variable '%s' in "
                            "atomic operation: "
                            % (lhs_var.name, type(lhs_var).__name__))

                old_val = "*(%s *) &" % ctype + old_val
                new_val = "*(%s *) &" % ctype + new_val
                cast_str = "(%s %s *) " % (var_kind, ctype)

            return Block([
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    old_val_var),
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    new_val_var),
                DoWhile(
                    "%(func_name)s("
                    "%(cast_str)s&(%(lhs_expr)s), "
                    "%(old_val)s, "
                    "%(new_val)s"
                    ") != %(old_val)s"
                    % {
                        "func_name": func_name,
                        "cast_str": cast_str,
                        "lhs_expr": lhs_expr_code,
                        "old_val": old_val,
                        "new_val": new_val,
                        },
                    Block([
                        Assign(old_val_var, lhs_expr_code),
                        Assign(new_val_var, rhs_expr_code),
                        ])
                    )
                ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)