Example #1
def check_variable_access_ordered(kernel):
    """Checks that between each write to a variable and all other accesses to
    the variable there is either:

    * an (at least indirect) dependency edge, or
    * an explicit statement that no ordering is necessary (expressed
      through a bi-directional :attr:`loopy.Instruction.no_sync_with`)
    """

    if kernel.options.enforce_variable_access_ordered not in [
            "no_check",
            True,
            False]:
        raise LoopyError("invalid value for option "
                "'enforce_variable_access_ordered': %s"
                % kernel.options.enforce_variable_access_ordered)

    if kernel.options.enforce_variable_access_ordered == "no_check":
        return

    if kernel.options.enforce_variable_access_ordered:
        _check_variable_access_ordered_inner(kernel)
    else:
        from loopy.diagnostic import VariableAccessNotOrdered
        try:
            _check_variable_access_ordered_inner(kernel)
        except VariableAccessNotOrdered as e:
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(kernel, "variable_access_ordered", str(e))
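Note: the enforcement mode checked above is selected through kernel options. A minimal sketch, assuming the standard lp.set_options interface (the kernel itself is illustrative):

import loopy as lp

# Illustrative kernel; any kernel is configured the same way.
knl = lp.make_kernel(
        "{[i]: 0 <= i < n}",
        "out[i] = 2*a[i]")

# True: raise VariableAccessNotOrdered for unordered accesses.
# False: downgrade the error to a warning (the except branch above).
# "no_check": skip the (potentially expensive) check entirely.
knl = lp.set_options(knl, enforce_variable_access_ordered="no_check")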
Example #2
def check_for_unused_inames(kernel):
    # Warn if kernel has unused inames
    from loopy.transform.iname import get_used_inames
    unused_inames = kernel.all_inames() - get_used_inames(kernel)
    if unused_inames:
        warn_with_kernel(
            kernel, "unused_inames",
            "Found unused inames in kernel: %s "
            "Unused inames during linearization will be prohibited in "
            "Loopy version 2021.X."
            % unused_inames)
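Note: the fix the warning asks for is to drop the offending inames. A minimal sketch of a kernel that would trigger the warning, and of cleaning it up, assuming loopy's remove_unused_inames transform:

import loopy as lp

# "j" is never referenced by the instruction, so the check above
# would emit the "unused_inames" warning for this kernel.
knl = lp.make_kernel(
        "{[i, j]: 0 <= i, j < n}",
        "out[i] = a[i]")

# Dropping the unused iname silences the warning.
knl = lp.remove_unused_inames(knl)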
Example #3
    def _get_iname_order_for_printing(self):
        try:
            from loopy.kernel.tools import get_visual_iname_order_embedding
            embedding = get_visual_iname_order_embedding(self)
        except ValueError:
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(self,
                "iname-order",
                "get_visual_iname_order_embedding() could not determine a "
                "consistent iname nesting order")
            embedding = dict((iname, iname) for iname in self.all_inames())

        return embedding
Example #4
def add_default_dependencies(kernel):
    logger.debug("%s: default deps" % kernel.name)

    from loopy.transform.subst import expand_subst
    expanded_kernel = expand_subst(kernel)

    writer_map = kernel.writer_map()

    arg_names = set(arg.name for arg in kernel.args)

    var_names = arg_names | set(six.iterkeys(kernel.temporary_variables))

    dep_map = dict(
            (insn.id, insn.read_dependency_names() & var_names)
            for insn in expanded_kernel.instructions)

    new_insns = []
    for insn in kernel.instructions:
        if not insn.depends_on_is_final:
            auto_deps = set()

            # {{{ add automatic dependencies

            all_my_var_writers = set()
            for var in dep_map[insn.id]:
                var_writers = writer_map.get(var, set())
                all_my_var_writers |= var_writers

                if not var_writers and var not in arg_names:
                    tv = kernel.temporary_variables[var]
                    if tv.initializer is None:
                        warn_with_kernel(kernel, "read_no_write(%s)" % var,
                                "temporary variable '%s' is read, but never written."
                                % var)

                if len(var_writers) == 1:
                    auto_deps.update(
                            var_writers
                            - set([insn.id]))

            # }}}

            depends_on = insn.depends_on
            if depends_on is None:
                depends_on = frozenset()

            insn = insn.copy(depends_on=frozenset(auto_deps) | depends_on)

        new_insns.append(insn)

    return kernel.copy(instructions=new_insns)
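Note: the heuristic above only adds an automatic dependency edge when a read variable has exactly one writer; with several writers the ordering is ambiguous and left to the user. A self-contained toy version of that rule (all names are illustrative):

# variable -> ids of the instructions that write it (a toy writer_map)
writer_map = {"a": {"insn_a"}, "b": {"insn_b1", "insn_b2"}}

def auto_deps_for(read_vars, own_id):
    deps = set()
    for var in read_vars:
        writers = writer_map.get(var, set())
        if len(writers) == 1:  # unambiguous: depend on the sole writer
            deps |= writers - {own_id}
    return deps

print(auto_deps_for({"a", "b"}, "insn_c"))  # {'insn_a'} -- "b" is ambiguous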
Example #5
def check_for_write_races(kernel):
    """
    Check if any memory accesses lead to write races.
    """
    from loopy.kernel.data import ConcurrentTag

    for insn in kernel.instructions:
        for assignee_name, assignee_indices in zip(
                insn.assignee_var_names(), insn.assignee_subscript_deps()):
            assignee_inames = assignee_indices & kernel.all_inames()
            if not assignee_inames <= insn.within_inames:
                raise LoopyError(
                    "assignee of instructions '%s' references "
                    "iname that the instruction does not depend on" % insn.id)

            if assignee_name in kernel.arg_dict:
                # Any concurrent tags that are not depended upon by the assignee
                # will cause write races.

                raceable_parallel_insn_inames = {
                    iname
                    for iname in insn.within_inames
                    if kernel.iname_tags_of_type(iname, ConcurrentTag)
                }

            elif assignee_name in kernel.temporary_variables:
                temp_var = kernel.temporary_variables[assignee_name]
                raceable_parallel_insn_inames = {
                    iname
                    for iname in insn.within_inames if any(
                        _is_racing_iname_tag(temp_var, tag)
                        for tag in kernel.iname_tags(iname))
                }

            else:
                raise LoopyError("invalid assignee name in instruction '%s'" %
                                 insn.id)

            race_inames = \
                    raceable_parallel_insn_inames - assignee_inames

            if race_inames:
                warn_with_kernel(
                    kernel, "write_race(%s)" % insn.id,
                    "instruction '%s' contains a write race: "
                    "instruction will be run across parallel iname(s) "
                    "'%s', which is/are not referenced in the lhs index" %
                    (insn.id, ",".join(race_inames)),
                    WriteRaceConditionWarning)
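Note: for concreteness, a minimal kernel that would trip this warning, assuming the usual make_kernel/tag_inames interface: the iname runs in parallel but never appears in the left-hand side index.

import loopy as lp

# "i" is local-parallel, but every work item writes out[0]: a write race.
knl = lp.make_kernel(
        "{[i]: 0 <= i < 16}",
        "out[0] = i")
knl = lp.tag_inames(knl, {"i": "l.0"})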
Example #6
    def _get_iname_order_for_printing(self):
        try:
            from loopy.kernel.tools import get_visual_iname_order_embedding
            embedding = get_visual_iname_order_embedding(self)
        except ValueError:
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(self,
                "iname-order",
                "get_visual_iname_order_embedding() could not determine a "
                "consistent iname nesting order. This is a possible indication "
                "that the kernel may not schedule successfully, but for now "
                "it only impacts printing of the kernel.")
            embedding = dict((iname, iname) for iname in self.all_inames())

        return embedding
Example #7
def check_for_write_races(kernel):
    from loopy.kernel.data import ParallelTag

    iname_to_tag = kernel.iname_to_tag.get
    for insn in kernel.instructions:
        for assignee_name, assignee_indices in zip(
                insn.assignee_var_names(),
                insn.assignee_subscript_deps()):
            assignee_inames = assignee_indices & kernel.all_inames()
            if not assignee_inames <= kernel.insn_inames(insn):
                raise LoopyError(
                        "assignee of instructiosn '%s' references "
                        "iname that the instruction does not depend on"
                        % insn.id)

            if assignee_name in kernel.arg_dict:
                # Any parallel tags that are not depended upon by the assignee
                # will cause write races.

                raceable_parallel_insn_inames = set(
                        iname
                        for iname in kernel.insn_inames(insn)
                        if isinstance(iname_to_tag(iname), ParallelTag))

            elif assignee_name in kernel.temporary_variables:
                temp_var = kernel.temporary_variables[assignee_name]
                raceable_parallel_insn_inames = set(
                            iname
                            for iname in kernel.insn_inames(insn)
                            if _is_racing_iname_tag(temp_var, iname_to_tag(iname)))

            else:
                raise LoopyError("invalid assignee name in instruction '%s'"
                        % insn.id)

            race_inames = \
                    raceable_parallel_insn_inames - assignee_inames

            if race_inames:
                warn_with_kernel(kernel, "write_race(%s)" % insn.id,
                        "instruction '%s' contains a write race: "
                        "instruction will be run across parallel iname(s) "
                        "'%s', which is/are not referenced in the lhs index"
                        % (insn.id, ",".join(race_inames)),
                        WriteRaceConditionWarning)
Example #8
    def map_fortran_division(self, expr, *args):
        # We remove all these before type inference ever sees them.
        from loopy.type_inference import TypeInferenceFailure

        try:
            num_dtype = self.infer_type(expr.numerator).numpy_dtype
            den_dtype = self.infer_type(expr.denominator).numpy_dtype
        except TypeInferenceFailure:
            return super().map_fortran_division(expr, *args)

        from pymbolic.primitives import Quotient, FloorDiv
        if num_dtype.kind in "iub" and den_dtype.kind in "iub":
            warn_with_kernel(
                self.kernel, "fortran_int_div",
                "Integer division in Fortran code. Loopy currently gets this "
                "wrong for negative arguments.")
            return FloorDiv(self.rec(expr.numerator, *args),
                            self.rec(expr.denominator, *args))

        else:
            return Quotient(self.rec(expr.numerator, *args),
                            self.rec(expr.denominator, *args))
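Note: the warning exists because Fortran integer division truncates toward zero, while FloorDiv rounds toward negative infinity; the two disagree exactly when the operands have opposite signs. A quick plain-Python illustration:

# Fortran semantics: truncate toward zero.
def fortran_div(a, b):
    return int(a / b)

print(fortran_div(-7, 2))  # -3 (what Fortran produces)
print(-7 // 2)             # -4 (floor division, as emitted above)
print(7 // 2)              # 3  (both agree for same-sign operands)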
Example #9
    def map_reduction(expr, rec, nresults=1):
        # Only expand one level of reduction at a time, going from outermost to
        # innermost. Otherwise we get the (iname + insn) dependencies wrong.

        try:
            arg_dtype = type_inf_mapper(expr.expr)
        except DependencyTypeInferenceFailure:
            if unknown_types_ok:
                arg_dtype = lp.auto

                reduction_dtypes = (lp.auto,)*nresults

            else:
                raise LoopyError("failed to determine type of accumulator for "
                        "reduction '%s'" % expr)
        else:
            arg_dtype = arg_dtype.with_target(kernel.target)

            reduction_dtypes = expr.operation.result_dtypes(
                        kernel, arg_dtype, expr.inames)
            reduction_dtypes = tuple(
                    dt.with_target(kernel.target) for dt in reduction_dtypes)

        outer_insn_inames = temp_kernel.insn_inames(insn)
        bad_inames = frozenset(expr.inames) & outer_insn_inames
        if bad_inames:
            raise LoopyError("reduction used within loop(s) that it was "
                    "supposed to reduce over: " + ", ".join(bad_inames))

        n_sequential = 0
        n_local_par = 0

        from loopy.kernel.data import (
                LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
                ParallelTag)
        for iname in expr.inames:
            iname_tag = kernel.iname_to_tag.get(iname)

            if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
                # These are nominally parallel, but we can live with
                # them as sequential.
                n_sequential += 1

            elif isinstance(iname_tag, LocalIndexTagBase):
                n_local_par += 1

            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
                raise LoopyError("the only form of parallelism supported "
                        "by reductions is 'local'--found iname '%s' "
                        "tagged '%s'"
                        % (iname, type(iname_tag).__name__))

            else:
                n_sequential += 1

        if n_local_par and n_sequential:
            raise LoopyError("Reduction over '%s' contains both parallel and "
                    "sequential inames. It must be split "
                    "(using split_reduction_{in,out}ward) "
                    "before code generation."
                    % ", ".join(expr.inames))

        if n_local_par > 1:
            raise LoopyError("Reduction over '%s' contains more than"
                    "one parallel iname. It must be split "
                    "(using split_reduction_{in,out}ward) "
                    "before code generation."
                    % ", ".join(expr.inames))

        if n_sequential:
            assert n_local_par == 0
            return map_reduction_seq(expr, rec, nresults, arg_dtype,
                    reduction_dtypes)
        elif n_local_par:
            return map_reduction_local(expr, rec, nresults, arg_dtype,
                    reduction_dtypes)
        else:
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(kernel, "empty_reduction",
                    "Empty reduction found (no inames to reduce over). "
                    "Eliminating.")

            return expr.expr
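Note: the "must be split" errors refer to loopy's reduction-splitting transforms. A hedged sketch of how a caller might respond, assuming lp.split_reduction_inward (kernel and iname names are illustrative):

import loopy as lp

knl = lp.make_kernel(
        "{[i, j, k]: 0 <= i, j, k < 16}",
        "out[i] = sum((j, k), a[i, j, k])")
knl = lp.tag_inames(knl, {"j": "l.0"})

# The reduction now mixes a local-parallel iname (j) with a sequential
# one (k); splitting moves "k" into an inner, purely sequential reduction.
knl = lp.split_reduction_inward(knl, "k")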
Example #10
def check_sizes(kernel, device):
    import loopy as lp

    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        warn_with_kernel(kernel, "no_device_in_pre_codegen_checks",
                "No device parameter was passed to the PyOpenCLTarget. "
                "Perhaps you want to pass a device to benefit from "
                "additional checking.", LoopyAdvisory)
        return

    parameters = {}
    for arg in kernel.args:
        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
            parameters[arg.name] = arg.approximately

    glens, llens = kernel.get_grid_size_upper_bounds_as_exprs()

    if (max(len(glens), len(llens))
            > device.max_work_item_dimensions):
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError
    try:
        glens = evaluate(glens, parameters)
        llens = evaluate(llens, parameters)
    except UnknownVariableError as name:
        from warnings import warn
        warn("could not check axis bounds because no value "
                "for variable '%s' was passed to check_kernels()"
                % name, LoopyAdvisory)
    else:
        for i in range(len(llens)):
            if llens[i] > device.max_work_item_sizes[i]:
                raise LoopyError("group axis %d too big" % i)

        from pytools import product
        if product(llens) > device.max_work_group_size:
            raise LoopyError("work group too big")

    local_mem_use = kernel.local_mem_use()

    from pyopencl.characterize import usable_local_mem_size
    import numbers
    if isinstance(local_mem_use, numbers.Integral):
        if local_mem_use > usable_local_mem_size(device):
            raise LoopyError("using too much local memory")
    else:
        warn_with_kernel(kernel, "non_constant_local_mem",
                "The amount of local memory used by the kernel "
                "is not a constant. This will likely cause problems.")

    from loopy.kernel.data import ConstantArg
    const_arg_count = sum(
            1 for arg in kernel.args
            if isinstance(arg, ConstantArg))

    if const_arg_count > device.max_constant_args:
        raise LoopyError("too many constant arguments")
Example #11
def find_temporary_scope(kernel):
    logger.debug("%s: mark local temporaries" % kernel.name)

    new_temp_vars = {}
    from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag,
            temp_var_scope)
    import loopy as lp

    writers = kernel.writer_map()

    for temp_var in six.itervalues(kernel.temporary_variables):
        # Only fill out for variables that do not yet know if they're
        # local. (I.e. those generated by implicit temporary generation.)

        if temp_var.scope is not lp.auto:
            new_temp_vars[temp_var.name] = temp_var
            continue

        my_writers = writers.get(temp_var.name, [])

        desired_scope_per_insn = []
        for insn_id in my_writers:
            insn = kernel.id_to_insn[insn_id]

            # A write race will emerge if:
            #
            # - the variable is local
            #   and
            # - the instruction is run across more inames (locally) parallel
            #   than are reflected in the assignee indices.

            locparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, LocalIndexTagBase)

            locparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, LocalIndexTagBase, temp_var.name)

            grpparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, GroupIndexTag)

            grpparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, GroupIndexTag, temp_var.name)

            assert locparallel_assignee_inames <= locparallel_compute_inames
            assert grpparallel_assignee_inames <= grpparallel_compute_inames

            desired_scope = temp_var_scope.PRIVATE
            for iname_descr, scope_descr, apin, cpin, scope in [
                    ("local", "local", locparallel_assignee_inames,
                        locparallel_compute_inames, temp_var_scope.LOCAL),
                    ("group", "global", grpparallel_assignee_inames,
                        grpparallel_compute_inames, temp_var_scope.GLOBAL),
                    ]:

                # compare the assignee/compute iname sets of the *current*
                # scope being considered (apin), not the local-parallel one:
                if apin != cpin and bool(apin):
                    warn_with_kernel(kernel, "write_race_local(%s)" % insn_id,
                            "instruction '%s' looks invalid: "
                            "it assigns to indices based on %s IDs, but "
                            "its temporary '%s' cannot be made %s because "
                            "a write race across the iname(s) '%s' would emerge. "
                            "(Do you need to add an extra iname to your prefetch?)"
                            % (insn_id, iname_descr, temp_var.name, scope_descr,
                                ", ".join(cpin - apin)),
                            WriteRaceConditionWarning)

                if (apin == cpin

                        # doesn't want to be in this scope if there aren't any
                        # parallel inames of that kind:
                        and bool(cpin)):
                    desired_scope = max(desired_scope, scope)
                    break

            desired_scope_per_insn.append(desired_scope)

        if not desired_scope_per_insn:
            if temp_var.initializer is None:
                warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name,
                        "temporary variable '%s' never written, eliminating"
                        % temp_var.name, LoopyAdvisory)
            else:
                raise LoopyError("temporary variable '%s': never written, "
                        "cannot automatically determine scope"
                        % temp_var.name)

            continue

        overall_scope = max(desired_scope_per_insn)

        from pytools import all
        if not all(iscope == overall_scope for iscope in desired_scope_per_insn):
            raise LoopyError("not all instructions agree on the "
                    "the desired scope (private/local/global) of  the "
                    "temporary '%s'" % temp_var.name)

        new_temp_vars[temp_var.name] = temp_var.copy(scope=overall_scope)

    return kernel.copy(temporary_variables=new_temp_vars)
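Note: the max() calls rely on scope values being ordered so that a wider scope compares greater. A minimal sketch of that ordering idea with a standard IntEnum (the real temp_var_scope is loopy's own class; this only illustrates the comparison):

from enum import IntEnum

class Scope(IntEnum):
    PRIVATE = 0
    LOCAL = 1
    GLOBAL = 2

# Each writing instruction votes for a scope; max() picks the widest.
# (The code above then insists that all votes agree.)
desired = [Scope.PRIVATE, Scope.LOCAL, Scope.PRIVATE]
print(max(desired))  # Scope.LOCAL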
Example #12
def find_all_insn_inames(kernel):
    logger.debug("%s: find_all_insn_inames: start" % kernel.name)

    writer_map = kernel.writer_map()

    insn_id_to_inames = {}
    insn_assignee_inames = {}

    all_read_deps = {}
    all_write_deps = {}

    from loopy.transform.subst import expand_subst
    kernel = expand_subst(kernel)

    for insn in kernel.instructions:
        all_read_deps[insn.id] = read_deps = insn.read_dependency_names()
        all_write_deps[insn.id] = write_deps = insn.write_dependency_names()
        deps = read_deps | write_deps

        if insn.forced_iname_deps_is_final:
            iname_deps = insn.forced_iname_deps
        else:
            iname_deps = (
                    deps & kernel.all_inames()
                    | insn.forced_iname_deps)

        assert isinstance(read_deps, frozenset), type(insn)
        assert isinstance(write_deps, frozenset), type(insn)
        assert isinstance(iname_deps, frozenset), type(insn)

        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
                "read deps: %s - write deps: %s" % (
                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
                    ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)),
                    ))

        insn_id_to_inames[insn.id] = iname_deps
        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()

    # fixed point iteration until all iname dep sets have converged

    # Why is fixed point iteration necessary here? Consider the following
    # scenario:
    #
    # z = expr(iname)
    # y = expr(z)
    # x = expr(y)
    #
    # x clearly has a dependency on iname, but this is not found until that
    # dependency has propagated all the way up. Doing this recursively is
    # not guaranteed to terminate because of circular dependencies.

    while True:
        did_something = False
        for insn in kernel.instructions:

            if insn.forced_iname_deps_is_final:
                continue

            # {{{ dependency-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = inames_old | guess_iname_deps_based_on_var_use(
                    kernel, insn, insn_id_to_inames)

            insn_id_to_inames[insn.id] = inames_new

            if inames_new != inames_old:
                did_something = True

                warn_with_kernel(kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' in kernel '%s' "
                        "was/were automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding 'for' loops"
                        % (", ".join(inames_new-inames_old), insn.id, kernel.name))

            # }}}

            # {{{ domain-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = set(insn_id_to_inames[insn.id])

            for iname in inames_old:
                home_domain = kernel.domains[kernel.get_home_domain_index(iname)]

                for par in home_domain.get_var_names(dim_type.param):
                    # Add all inames occurring in parameters of domains that my
                    # current inames refer to.

                    if par in kernel.all_inames():
                        inames_new.add(intern(par))

                    # If something writes the bounds of a loop in which I'm
                    # sitting, I had better be in the inames that the writer is
                    # in.

                    if par in kernel.temporary_variables:
                        for writer_id in writer_map.get(par, []):
                            inames_new.update(insn_id_to_inames[writer_id])

            if inames_new != inames_old:
                did_something = True
                insn_id_to_inames[insn.id] = frozenset(inames_new)

                warn_with_kernel(kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' was "
                        "automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding '{inames=...}"
                        % (", ".join(inames_new-inames_old), insn.id))

            # }}}

        if not did_something:
            break

    logger.debug("%s: find_all_insn_inames: done" % kernel.name)

    for v in six.itervalues(insn_id_to_inames):
        assert isinstance(v, frozenset)

    return insn_id_to_inames
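Note: the fixed-point loop is the standard way to propagate dependencies through chains like z = f(i); y = f(z); x = f(y). A self-contained toy of the same convergence pattern:

# Toy setup: each "instruction" writes the variable of the same name
# and reads the variables listed here.
reads = {"z": set(), "y": {"z"}, "x": {"y"}}
inames = {"z": {"i"}, "y": set(), "x": set()}  # directly referenced inames

changed = True
while changed:  # iterate to a fixed point, as find_all_insn_inames does
    changed = False
    for insn, srcs in reads.items():
        for src in srcs:
            new = inames[src] - inames[insn]
            if new:
                inames[insn] |= new
                changed = True

print(inames)  # every instruction ends up depending on iname 'i'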
Example #13
def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
    if iname is not None:
        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            if iname is None:
                ilp_inames = frozenset(iname
                        for iname in kernel.insn_inames(writer_insn)
                        if isinstance(
                            kernel.iname_to_tag.get(iname),
                            (IlpBaseTag, VectorizeTag)))
            else:
                if not isinstance(
                        kernel.iname_to_tag.get(iname),
                        (IlpBaseTag, VectorizeTag)):
                    raise LoopyError(
                            "'%s' is not an ILP iname"
                            % iname)

                ilp_inames = frozenset([iname])

            referenced_ilp_inames = (ilp_inames
                    & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for ILP inames '%s' on var '%s', but previous "
                            "instructions required inames '%s'"
                            % (writer_insn_id, ", ".join(new_ilp_inames),
                                ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for iname in ilp_inames:
            if iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=True)
            ilp_iname_to_length[iname] = int(pw_aff_to_expr(
                        static_max_of_pw_aff(bounds.size, constants_only=True)))

            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(ilp_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = dict(
            (var_name, tuple(var(iname) for iname in inames))
            for var_name, inames in six.iteritems(var_to_new_ilp_inames))

    new_insns = []

    for insn in kernel.instructions:
        eiii = ExtraInameIndexInserter(var_to_extra_iname)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_ilp_inames <= insn.within_inames:

            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(
                    kernel,
                    "implicit_ilp_iname",
                    "Instruction '%s': touched variable that (for ILP) "
                    "required iname(s) '%s', but that the instruction was not "
                    "previously within the iname(s). Previously, this would "
                    "implicitly promote the instruction, but that behavior is "
                    "deprecated and will stop working in 2018.1."
                    % (insn.id, ", ".join(
                        eiii.seen_ilp_inames - insn.within_inames)))

        new_insns.append(new_insn)

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
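Note: to see why the extra axis is needed, recall that ILP interleaves the lanes: conceptually every lane executes one statement before any lane executes the next, so a shared scalar temporary gets clobbered. A plain-Python sketch (purely illustrative):

n_ilp = 4  # length of the ILP iname

# Shared scalar: all lanes write the same "tmp" before any lane reads it.
tmp = None
out_bad = []
for lane in range(n_ilp):   # statement 1, across all lanes
    tmp = lane
for lane in range(n_ilp):   # statement 2, across all lanes
    out_bad.append(tmp * 10)
print(out_bad)  # [30, 30, 30, 30] -- clobbered

# With the extra axis added above, each lane owns one entry.
tmp_axis = [None] * n_ilp
out_good = []
for lane in range(n_ilp):
    tmp_axis[lane] = lane
for lane in range(n_ilp):
    out_good.append(tmp_axis[lane] * 10)
print(out_good)  # [0, 10, 20, 30]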
Example #14
def find_all_insn_inames(kernel):
    logger.debug("%s: find_all_insn_inames: start" % kernel.name)

    writer_map = kernel.writer_map()

    insn_id_to_inames = {}
    insn_assignee_inames = {}

    all_read_deps = {}
    all_write_deps = {}

    from loopy.transform.subst import expand_subst
    kernel = expand_subst(kernel)

    for insn in kernel.instructions:
        all_read_deps[insn.id] = read_deps = insn.read_dependency_names()
        all_write_deps[insn.id] = write_deps = insn.write_dependency_names()
        deps = read_deps | write_deps

        if insn.within_inames_is_final:
            iname_deps = insn.within_inames
        else:
            iname_deps = (deps & kernel.all_inames() | insn.within_inames)

        assert isinstance(read_deps, frozenset), type(insn)
        assert isinstance(write_deps, frozenset), type(insn)
        assert isinstance(iname_deps, frozenset), type(insn)

        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
                     "read deps: %s - write deps: %s" % (
                         kernel.name,
                         insn.id,
                         ", ".join(sorted(iname_deps)),
                         ", ".join(sorted(read_deps)),
                         ", ".join(sorted(write_deps)),
                     ))

        insn_id_to_inames[insn.id] = iname_deps
        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()

    # fixed point iteration until all iname dep sets have converged

    # Why is fixed point iteration necessary here? Consider the following
    # scenario:
    #
    # z = expr(iname)
    # y = expr(z)
    # x = expr(y)
    #
    # x clearly has a dependency on iname, but this is not found until that
    # dependency has propagated all the way up. Doing this recursively is
    # not guaranteed to terminate because of circular dependencies.

    while True:
        did_something = False
        for insn in kernel.instructions:

            if insn.within_inames_is_final:
                continue

            # {{{ dependency-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = inames_old | guess_iname_deps_based_on_var_use(
                kernel, insn, insn_id_to_inames)

            insn_id_to_inames[insn.id] = inames_new

            if inames_new != inames_old:
                did_something = True

                warn_with_kernel(
                    kernel, "inferred_iname",
                    "The iname(s) '%s' on instruction '%s' "
                    "was/were automatically added. "
                    "This is deprecated. Please add the iname "
                    "to the instruction "
                    "explicitly, e.g. by adding 'for' loops" %
                    (", ".join(inames_new - inames_old), insn.id))

            # }}}

            # {{{ domain-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = set(insn_id_to_inames[insn.id])

            for iname in inames_old:
                home_domain = kernel.domains[kernel.get_home_domain_index(
                    iname)]

                for par in home_domain.get_var_names(dim_type.param):
                    # Add all inames occurring in parameters of domains that my
                    # current inames refer to.

                    if par in kernel.all_inames():
                        inames_new.add(intern(par))

                    # If something writes the bounds of a loop in which I'm
                    # sitting, I had better be in the inames that the writer is
                    # in.

                    if par in kernel.temporary_variables:
                        for writer_id in writer_map.get(par, []):
                            inames_new.update(insn_id_to_inames[writer_id])

            if inames_new != inames_old:
                did_something = True
                insn_id_to_inames[insn.id] = frozenset(inames_new)

                warn_with_kernel(
                    kernel, "inferred_iname",
                    "The iname(s) '%s' on instruction '%s' was "
                    "automatically added. "
                    "This is deprecated. Please add the iname "
                    "to the instruction "
                    "explicitly, e.g. by adding 'for' loops" %
                    (", ".join(inames_new - inames_old), insn.id))

            # }}}

        if not did_something:
            break

    logger.debug("%s: find_all_insn_inames: done" % kernel.name)

    for v in six.itervalues(insn_id_to_inames):
        assert isinstance(v, frozenset)

    return insn_id_to_inames