Example no. 1
0
def check_sizes(kernel, device):
    import loopy as lp

    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        from loopy.diagnostic import warn
        warn(kernel, "no_device_in_pre_codegen_checks",
                "No device parameter was passed to the PyOpenCLTarget. "
                "Perhaps you want to pass a device to benefit from "
                "additional checking.", LoopyAdvisory)
        return

    parameters = {}
    for arg in kernel.args:
        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
            parameters[arg.name] = arg.approximately

    glens, llens = kernel.get_grid_sizes_as_exprs()

    if (max(len(glens), len(llens))
            > device.max_work_item_dimensions):
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError
    try:
        glens = evaluate(glens, parameters)
        llens = evaluate(llens, parameters)
    except UnknownVariableError as name:
        from warnings import warn
        warn("could not check axis bounds because no value "
                "for variable '%s' was passed to check_kernels()"
                % name, LoopyAdvisory)
    else:
        for i in range(len(llens)):
            if llens[i] > device.max_work_item_sizes[i]:
                raise LoopyError("group axis %d too big" % i)

        from pytools import product
        if product(llens) > device.max_work_group_size:
            raise LoopyError("work group too big")

    from pyopencl.characterize import usable_local_mem_size
    if kernel.local_mem_use() > usable_local_mem_size(device):
        raise LoopyError("using too much local memory")

    from loopy.kernel.data import ConstantArg
    const_arg_count = sum(
            1 for arg in kernel.args
            if isinstance(arg, ConstantArg))

    if const_arg_count > device.max_constant_args:
        raise LoopyError("too many constant arguments")
Example no. 2
0
def check_sizes(kernel, device):
    import loopy as lp

    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        from loopy.diagnostic import warn
        warn(kernel, "no_device_in_pre_codegen_checks",
                "No device parameter was passed to the PyOpenCLTarget. "
                "Perhaps you want to pass a device to benefit from "
                "additional checking.", LoopyAdvisory)
        return

    parameters = {}
    for arg in kernel.args:
        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
            parameters[arg.name] = arg.approximately

    glens, llens = kernel.get_grid_size_upper_bounds_as_exprs()

    if (max(len(glens), len(llens))
            > device.max_work_item_dimensions):
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError
    try:
        glens = evaluate(glens, parameters)
        llens = evaluate(llens, parameters)
    except UnknownVariableError as name:
        from warnings import warn
        warn("could not check axis bounds because no value "
                "for variable '%s' was passed to check_kernels()"
                % name, LoopyAdvisory)
    else:
        for i in range(len(llens)):
            if llens[i] > device.max_work_item_sizes[i]:
                raise LoopyError("group axis %d too big" % i)

        from pytools import product
        if product(llens) > device.max_work_group_size:
            raise LoopyError("work group too big")

    from pyopencl.characterize import usable_local_mem_size
    if kernel.local_mem_use() > usable_local_mem_size(device):
        raise LoopyError("using too much local memory")

    from loopy.kernel.data import ConstantArg
    const_arg_count = sum(
            1 for arg in kernel.args
            if isinstance(arg, ConstantArg))

    if const_arg_count > device.max_constant_args:
        raise LoopyError("too many constant arguments")
Example no. 3
0
    def map_subscript(self, expr, type_context):
        from loopy.kernel.data import TemporaryVariable

        ary = self.find_array(expr)

        if (isinstance(ary, TemporaryVariable)
                and ary.scope == temp_var_scope.PRIVATE):
            # generate access code for access to private-index temporaries

            gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
            if lsize:
                lsize, = lsize
                from loopy.kernel.array import get_access_info
                from pymbolic import evaluate

                access_info = get_access_info(
                    self.kernel.target, ary, expr.index, lambda expr: evaluate(
                        expr, self.codegen_state.var_subst_map),
                    self.codegen_state.vectorization_info)

                subscript, = access_info.subscripts
                result = var(
                    access_info.array_name)[var("programIndex") +
                                            self.rec(lsize * subscript, 'i')]

                if access_info.vector_index is not None:
                    return self.kernel.target.add_vector_access(
                        result, access_info.vector_index)
                else:
                    return result

        return super(ExprToISPCExprMapper,
                     self).map_subscript(expr, type_context)
Example no. 4
0
    def map_subscript(self, expr, enclosing_prec, type_context):
        from loopy.kernel.data import TemporaryVariable

        ary = self.find_array(expr)

        if isinstance(ary, TemporaryVariable):
            gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
            if lsize:
                lsize, = lsize
                from loopy.kernel.array import get_access_info
                from pymbolic import evaluate

                access_info = get_access_info(
                    self.kernel.target, ary, expr.index, lambda expr: evaluate(
                        expr, self.codegen_state.var_subst_map),
                    self.codegen_state.vectorization_info)

                subscript, = access_info.subscripts
                result = self.parenthesize_if_needed(
                    "%s[programIndex + %s]" %
                    (access_info.array_name,
                     self.rec(lsize * subscript, PREC_SUM, 'i')),
                    enclosing_prec, PREC_CALL)

                if access_info.vector_index is not None:
                    return self.kernel.target.add_vector_access(
                        result, access_info.vector_index)
                else:
                    return result

        return super(ExprToISPCMapper,
                     self).map_subscript(expr, enclosing_prec, type_context)
Example no. 5
0
    def map_subscript(self, expr, type_context):
        from loopy.kernel.data import TemporaryVariable

        ary = self.find_array(expr)

        if (isinstance(ary, TemporaryVariable)
                and ary.address_space == AddressSpace.PRIVATE):
            # generate access code for access to private-index temporaries

            gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
            if lsize:
                lsize, = lsize
                from loopy.kernel.array import get_access_info
                from pymbolic import evaluate

                access_info = get_access_info(self.kernel.target, ary, expr.index,
                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                    self.codegen_state.vectorization_info)

                subscript, = access_info.subscripts
                result = var(access_info.array_name)[
                        var("programIndex") + self.rec(lsize*subscript, 'i')]

                if access_info.vector_index is not None:
                    return self.kernel.target.add_vector_access(
                        result, access_info.vector_index)
                else:
                    return result

        return super(ExprToISPCExprMapper, self).map_subscript(
                expr, type_context)
Example no. 6
0
    def map_subscript(self, expr, enclosing_prec, type_context):
        from loopy.kernel.data import TemporaryVariable

        ary = self.find_array(expr)

        if isinstance(ary, TemporaryVariable):
            gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
            if lsize:
                lsize, = lsize
                from loopy.kernel.array import get_access_info
                from pymbolic import evaluate

                access_info = get_access_info(self.kernel.target, ary, expr.index,
                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                    self.codegen_state.vectorization_info)

                subscript, = access_info.subscripts
                result = self.parenthesize_if_needed(
                        "%s[programIndex + %s]" % (
                            access_info.array_name,
                            self.rec(lsize*subscript, PREC_SUM, 'i')),
                        enclosing_prec, PREC_CALL)

                if access_info.vector_index is not None:
                    return self.kernel.target.add_vector_access(
                        result, access_info.vector_index)
                else:
                    return result

        return super(ExprToISPCMapper, self).map_subscript(
                expr, enclosing_prec, type_context)
Example no. 7
0
def estimate_calibration_params(model_results, timing_results):
    """Given a set of model results and matching timing results, estimate the best
    calibration parameters for the model.
    """

    params = set(_FMM_STAGE_TO_CALIBRATION_PARAMETER.values())

    nresults = len(model_results)

    if nresults != len(timing_results):
        raise ValueError("must have same number of model and timing results")

    uncalibrated_times = {}
    actual_times = {}

    for param in params:
        uncalibrated_times[param] = np.zeros(nresults)
        actual_times[param] = np.zeros(nresults)

    from pymbolic import evaluate
    from pymbolic.mapper.coefficient import CoefficientCollector
    collect_coeffs = CoefficientCollector()

    for i, model_result in enumerate(model_results):
        context = model_result.params.copy()
        for param in params:
            context[param] = var(param)

        # Represents the total modeled cost, but leaves the calibration
        # parameters symbolic.
        total_cost = evaluate(sum(model_result.raw_costs.values()),
                              context=context)

        coeffs = collect_coeffs(total_cost)
        # Assumes the total cost is a sum of terms linear in the parameters.
        assert set(key.name for key in coeffs) <= params

        for param, time in coeffs.items():
            uncalibrated_times[param.name][i] = time

    for i, timing_result in enumerate(timing_results):
        for param, time in timing_result.items():
            calibration_param = (_FMM_STAGE_TO_CALIBRATION_PARAMETER[param])
            actual_times[calibration_param][i] = time["process_elapsed"]

    result = {}

    for param in params:
        uncalibrated = uncalibrated_times[param]
        actual = actual_times[param]

        if np.allclose(uncalibrated, 0):
            result[param] = float("NaN")
            continue

        result[param] = (actual.dot(uncalibrated) /
                         uncalibrated.dot(uncalibrated))

    return result
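The per-parameter fit above is one-variable least squares: when the modeled cost is linear in a calibration parameter, the best scalar multiplier is actual·uncalibrated divided by uncalibrated·uncalibrated. A small numpy illustration with made-up numbers (not from any real timing run):

import numpy as np

# uncalibrated model predictions (e.g. operation counts) and measured times
uncalibrated = np.array([1.0e6, 2.0e6, 4.0e6])
actual = np.array([0.11, 0.19, 0.42])              # seconds

# least-squares fit of actual ~= c * uncalibrated
c = actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated)
print(c)                                           # roughly 1.0e-7 s per operation
print(np.linalg.norm(actual - c * uncalibrated))   # residual of the fit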
Example no. 8
0
    def qbx_cost_factors_for_kernels_from_model(self, queue, nlevels,
                                                xlat_cost, context):
        """Evaluate translation cost factors from symbolic model. The result of this
        function can be used for process_* methods in this class.

        This method overwrite the method in parent
        :class:`boxtree.cost.AbstractFMMCostModel` to support operations specific to
        QBX.

        :arg queue: If not None, the cost factor arrays will be transferred to device
            using this queue.
        :arg nlevels: the number of tree levels.
        :arg xlat_cost: a :class:`QBXTranslationCostModel`.
        :arg context: a :class:`dict` mapping from the symbolic names of parameters
            to their values, serving as context when evaluating symbolic expressions
            in *xlat_cost*.
        :return: a :class:`dict`, mapping from stage names to the translation costs
            of those stages in FMM and QBX.
        """
        cost_factors = self.fmm_cost_factors_for_kernels_from_model(
            queue, nlevels, xlat_cost, context)

        cost_factors.update({
            "p2qbxl_cost":
            evaluate(xlat_cost.p2qbxl(), context=context),
            "m2qbxl_cost":
            np.array([
                evaluate(xlat_cost.m2qbxl(ilevel), context=context)
                for ilevel in range(nlevels)
            ]),
            "l2qbxl_cost":
            np.array([
                evaluate(xlat_cost.l2qbxl(ilevel), context=context)
                for ilevel in range(nlevels)
            ]),
            "qbxl2p_cost":
            evaluate(xlat_cost.qbxl2p(), context=context),
            "p2p_tsqbx_cost":
            evaluate(xlat_cost.p2p_tsqbx(), context=context)
        })

        if queue:
            cost_factors = self.cost_factors_to_dev(cost_factors, queue)

        return cost_factors
Example no. 9
0
    def eval(expr, source, center1, center2, target):
        from pymbolic import parse, evaluate
        context = {
                "s": source,
                "c1": center1,
                "c2": center2,
                "t": target,
                "norm": la.norm}

        return evaluate(parse(expr), context)
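The helper above works because pymbolic's evaluator simply applies Python operators and calls to whatever the context supplies, so numpy arrays and callables such as la.norm plug in directly. A short usage sketch with made-up points:

import numpy as np
import numpy.linalg as la

from pymbolic import parse, evaluate

context = {
        "s": np.array([0.0, 0.0]),
        "c1": np.array([0.0, 1.0]),
        "t": np.array([3.0, 4.0]),
        "norm": la.norm}

# arithmetic happens on numpy arrays; the call is dispatched to la.norm
print(evaluate(parse("norm(t - s)"), context))           # 5.0
print(evaluate(parse("norm(t - c1) / norm(t - s)"), context))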
Example no. 10
0
def from_primitives(expr, var_order):
    from pymbolic import get_dependencies, evaluate

    deps = get_dependencies(expr)
    var_deps = [dep for dep in deps if dep in var_order]
    context = dict((vd, Polynomial(vd, var_order=var_order)) for vd in var_deps)

    # FIXME not fast, but works
    # (and exercises multivariate polynomial code)
    return evaluate(expr, context)
Example no. 11
0
    def eval(expr, source, center1, center2, target):
        from pymbolic import parse, evaluate
        context = {
                "s": source,
                "c1": center1,
                "c2": center2,
                "t": target,
                "norm": la.norm}

        return evaluate(parse(expr), context)
Example no. 12
0
def from_primitives(expr, var_order):
    from pymbolic import get_dependencies, evaluate

    deps = get_dependencies(expr)
    var_deps = [dep for dep in deps if dep in var_order]
    context = dict(
        (vd, Polynomial(vd, var_order=var_order)) for vd in var_deps)

    # FIXME not fast, but works
    # (and exercises multivariate polynomial code)
    return evaluate(expr, context)
Example no. 13
0
def evaluate_shape(shape, context):
    from pymbolic import evaluate

    result = []
    for saxis in shape:
        if saxis is None:
            result.append(saxis)
        else:
            result.append(evaluate(saxis, context))

    return tuple(result)
Example no. 14
0
def evaluate_shape(shape, context):
    from pymbolic import evaluate

    result = []
    for saxis in shape:
        if saxis is None:
            result.append(saxis)
        else:
            result.append(evaluate(saxis, context))

    return tuple(result)
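A short usage sketch for evaluate_shape as defined above, with a hypothetical shape and context: symbolic axis lengths are evaluated against the parameter dict, while None axes (unknown lengths) pass through unchanged.

from pymbolic import parse

# assumes the evaluate_shape() defined above is in scope
shape = (parse("n"), parse("n + 2"), 3, None)
print(evaluate_shape(shape, {"n": 16}))    # (16, 18, 3, None)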
Example no. 15
0
def aff_from_expr(space, expr, vars_to_zero=set()):
    zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))
    context = {}
    for name, (dt, pos) in six.iteritems(space.get_var_dict()):
        if dt == dim_type.set:
            dt = dim_type.in_

        context[name] = zero.set_coefficient_val(dt, pos, 1)

    for name in vars_to_zero:
        context[name] = zero

    from pymbolic import evaluate
    return zero + evaluate(expr, context)
Example no. 16
0
def aff_from_expr(space, expr, vars_to_zero=set()):
    zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))
    context = {}
    for name, (dt, pos) in six.iteritems(space.get_var_dict()):
        if dt == dim_type.set:
            dt = dim_type.in_

        context[name] = zero.set_coefficient_val(dt, pos, 1)

    for name in vars_to_zero:
        context[name] = zero

    from pymbolic import evaluate
    return zero + evaluate(expr, context)
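The trick in aff_from_expr is that every variable evaluates to an isl.Aff with a unit coefficient, so pymbolic's evaluator assembles the affine form through islpy's overloaded arithmetic. A hedged usage sketch, assuming the function above (together with its islpy, dim_type and six imports) is in scope:

import islpy as isl
from pymbolic import parse

bset = isl.BasicSet("[n] -> { [i, j] : 0 <= i < n and 0 <= j < n }")

# each variable becomes a unit-coefficient Aff; + and * build the result
aff = aff_from_expr(bset.get_space(), parse("2*i + 3*j + n"))
print(aff)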
Example no. 17
0
def test_fuzz_code_generator(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    #from expr_fuzz import get_fuzz_examples
    #for expr, var_values in get_fuzz_examples():
    for expr, var_values in generate_random_fuzz_examples(50):
        from pymbolic import evaluate
        try:
            true_value = evaluate(expr, var_values)
        except ZeroDivisionError:
            continue

        def get_dtype(x):
            if isinstance(x, (complex, np.complexfloating)):
                return np.complex128
            else:
                return np.float64

        knl = lp.make_kernel("{ : }",
                [lp.Assignment("value", expr)],
                [lp.GlobalArg("value", np.complex128, shape=())]
                + [
                    lp.ValueArg(name, get_dtype(val))
                    for name, val in six.iteritems(var_values)
                    ])
        ck = lp.CompiledKernel(ctx, knl)
        evt, (lp_value,) = ck(queue, out_host=True, **var_values)
        err = abs(true_value-lp_value)/abs(true_value)
        if abs(err) > 1e-10:
            print(80*"-")
            print("WRONG: rel error=%g" % err)
            print("true=%r" % true_value)
            print("loopy=%r" % lp_value)
            print(80*"-")
            print(ck.get_code())
            print(80*"-")
            print(var_values)
            print(80*"-")
            print(repr(expr))
            print(80*"-")
            print(expr)
            print(80*"-")
            1/0
Example no. 18
0
def test_fuzz_code_generator(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    #from expr_fuzz import get_fuzz_examples
    #for expr, var_values in get_fuzz_examples():
    for expr, var_values in generate_random_fuzz_examples(50):
        from pymbolic import evaluate
        try:
            true_value = evaluate(expr, var_values)
        except ZeroDivisionError:
            continue

        def get_dtype(x):
            if isinstance(x, (complex, np.complexfloating)):
                return np.complex128
            else:
                return np.float64

        knl = lp.make_kernel("{ : }", [lp.Assignment("value", expr)],
                             [lp.GlobalArg("value", np.complex128, shape=())] +
                             [
                                 lp.ValueArg(name, get_dtype(val))
                                 for name, val in six.iteritems(var_values)
                             ])
        ck = lp.CompiledKernel(ctx, knl)
        evt, (lp_value, ) = ck(queue, out_host=True, **var_values)
        err = abs(true_value - lp_value) / abs(true_value)
        if abs(err) > 1e-10:
            print(80 * "-")
            print("WRONG: rel error=%g" % err)
            print("true=%r" % true_value)
            print("loopy=%r" % lp_value)
            print(80 * "-")
            print(ck.get_code())
            print(80 * "-")
            print(var_values)
            print(80 * "-")
            print(repr(expr))
            print(80 * "-")
            print(expr)
            print(80 * "-")
            1 / 0
Example no. 19
0
def map_math_functions_by_name(i, func, pars):
    try:
        f = pymbolic.evaluate(func, {"math": math, "cmath": cmath})
    except pymbolic.mapper.evaluator.UnknownVariableError:
        raise RuntimeError("No derivative of non-constant function "+str(func))

    def make_f(name):
        return primitives.Lookup(primitives.Variable("math"), name)

    if f is math.sin and len(pars) == 1:
        return make_f("cos")(*pars)
    elif f is math.cos and len(pars) == 1:
        return -make_f("sin")(*pars)
    elif f is math.tan and len(pars) == 1:
        return make_f("tan")(*pars)**2+1
    elif f is math.log and len(pars) == 1:
        return primitives.quotient(1, pars[0])
    elif f is math.exp and len(pars) == 1:
        return make_f("exp")(*pars)
    else:
        raise RuntimeError("unrecognized function, cannot differentiate")
Example no. 20
0
    def emit_call(self, expression_to_code_mapper, expression, target):
        from pymbolic.primitives import Subscript

        if len(expression.parameters) != 1:
            raise LoopyError("%s takes exactly one argument" % self.name)
        arg, = expression.parameters
        if not isinstance(arg, Subscript):
            raise LoopyError("argument to %s must be a subscript" % self.name)

        ary = expression_to_code_mapper.find_array(arg)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate
        access_info = get_access_info(
            expression_to_code_mapper.kernel.target, ary, arg.index,
            lambda expr: evaluate(
                expr, expression_to_code_mapper.codegen_state.var_subst_map),
            expression_to_code_mapper.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg
        if isinstance(ary, ImageArg):
            raise LoopyError("%s does not support images" % self.name)

        if self.name == "indexof":
            return access_info.subscripts[0]
        elif self.name == "indexof_vec":
            from loopy.kernel.array import VectorArrayDimTag
            ivec = None
            for iaxis, dim_tag in enumerate(ary.dim_tags):
                if isinstance(dim_tag, VectorArrayDimTag):
                    ivec = iaxis

            if ivec is None:
                return access_info.subscripts[0]
            else:
                return (access_info.subscripts[0] * ary.shape[ivec] +
                        access_info.vector_index)

        else:
            raise RuntimeError("should not get here")
Example no. 21
0
    def map_subscript(self, expr, enclosing_prec, type_context):
        def base_impl(expr, enclosing_prec, type_context):
            return self.parenthesize_if_needed(
                "%s[%s]" % (self.rec(expr.aggregate, PREC_CALL, type_context),
                            self.rec(expr.index, PREC_NONE, 'i')),
                enclosing_prec, PREC_CALL)

        from pymbolic.primitives import Variable
        if not isinstance(expr.aggregate, Variable):
            return base_impl(expr, enclosing_prec, type_context)

        ary = self.find_array(expr)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
            simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

        access_info = get_access_info(
            self.kernel.target, ary, index_tuple,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

        if isinstance(ary, ImageArg):
            base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" %
                           (ary.name, ary.dimensions, ", ".join(
                               self.rec(idx, PREC_NONE, 'i')
                               for idx in expr.index[::-1])))

            if ary.dtype.numpy_dtype == np.float32:
                return base_access + ".x"
            if self.kernel.target.is_vector_dtype(ary.dtype):
                return base_access
            elif ary.dtype.numpy_dtype == np.float64:
                return "as_double(%s.xy)" % base_access
            else:
                raise NotImplementedError(
                    "non-floating-point images not supported for now")

        elif isinstance(ary, (GlobalArg, TemporaryVariable)):
            if len(access_info.subscripts) == 0:
                if isinstance(ary, GlobalArg):
                    # unsubscripted global args are pointers
                    result = "*" + access_info.array_name

                else:
                    # unsubscripted temp vars are scalars
                    result = access_info.array_name

            else:
                subscript, = access_info.subscripts
                result = self.parenthesize_if_needed(
                    "%s[%s]" % (access_info.array_name,
                                self.rec(subscript, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

            if access_info.vector_index is not None:
                return self.codegen_state.ast_builder.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

        else:
            assert False
Example no. 22
0
def test_substitute():
    from pymbolic import parse, substitute, evaluate
    u = parse("5+x.min**2")
    xmin = parse("x.min")
    assert evaluate(substitute(u, {xmin: 25})) == 630
Example no. 23
0
    def evaluate(self, expr):
        from pymbolic import evaluate
        try:
            return evaluate(expr)
        except ValueError:
            return None
Example no. 24
0
    def as_primitives(self):
        deps = pymbolic.get_dependencies(self)
        context = dict((dep, dep) for dep in deps)
        return pymbolic.evaluate(self, context)
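The helper above uses an identity-style evaluation: every dependency evaluates to a plain variable, so the expression is rebuilt out of stock pymbolic primitives (handy for turning subclassed expression nodes back into the base types). A minimal standalone sketch of the same idea, keyed by variable name as pymbolic's evaluator expects:

import pymbolic

expr = pymbolic.parse("a*b + 2*a")
deps = pymbolic.get_dependencies(expr)         # set of Variable instances
context = {dep.name: dep for dep in deps}      # each variable maps to itself

rebuilt = pymbolic.evaluate(expr, context)     # rebuilt from plain primitives
print(rebuilt)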
Example no. 25
0
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            if arg.offset_for_name:
                continue

            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError("array '%s' needs known shape to use automatic "
                        "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                storage_array = ary = cl_array.empty(
                        queue, shape, dtype, order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                from pytools import all
                assert all(s > 0 for s in strides)
                alloc_size = sum(astrd*(alen-1)
                        for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                            "known. Perhaps you want to use "
                            "loopy.add_dtypes "
                            "or loopy.infer_argument_dtypes?"
                            % arg.name)

                itemsize = dtype.itemsize
                numpy_strides = [itemsize*s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                        "automatic testing")

            fill_rand(storage_array)

            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                        queue.context, ary.get())
            else:
                pre_run_storage_array = storage_array.copy()

                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(
                        pre_run_storage_array, shape, numpy_strides)
                ref_args[arg.name] = ary

            ref_arg_data.append(
                    TestArgInfo(
                        name=arg.name,
                        ref_array=ary,
                        ref_storage_array=storage_array,

                        ref_pre_run_array=pre_run_ary,
                        ref_pre_run_storage_array=pre_run_storage_array,

                        ref_shape=shape,
                        ref_strides=strides,
                        ref_alloc_size=alloc_size,
                        ref_numpy_strides=numpy_strides,
                        needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return ref_args, ref_arg_data
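The storage allocation above sizes a flat buffer from the strides: the largest reachable flat index is sum(stride * (axis_len - 1)) over all axes, so that plus one is the allocation size, and the byte strides passed to as_strided are the element strides times the itemsize. A numpy-only illustration with made-up shape and strides (the real code does the same with pyopencl arrays):

import numpy as np
from numpy.lib.stride_tricks import as_strided

shape, strides = (4, 5), (5, 1)       # element strides of a row-major 4x5 array
dtype = np.dtype(np.float64)

alloc_size = sum(astrd*(alen-1) for alen, astrd in zip(shape, strides)) + 1
numpy_strides = [dtype.itemsize*s for s in strides]
print(alloc_size)                     # 5*3 + 1*4 + 1 = 20 elements

storage = np.zeros(alloc_size, dtype)
ary = as_strided(storage, shape, numpy_strides)   # shaped view into flat storage
ary[...] = np.arange(20).reshape(shape)
print(storage[:5])                    # first row sits at the start of storage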
Example no. 26
0
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                        "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                    queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is GlobalArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            dtype = kernel_arg.dtype
            itemsize = dtype.itemsize
            numpy_strides = [itemsize*s for s in strides]

            assert all(s > 0 for s in strides)
            alloc_size = sum(astrd*(alen-1)
                    for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                    arg_desc.ref_shape, arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            common_len = min(
                    len(host_ref_flat_array),
                    len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(
                    host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            host_contig_array = arg_desc.ref_storage_array.get()
            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
Example no. 27
0
def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims, dtype):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    # {{{ Generate sources, targets and target_radii

    from boxtree.tools import make_normal_particle_array as p_normal
    sources = p_normal(queue, nsources, dims, dtype, seed=15)
    targets = p_normal(queue, ntargets, dims, dtype, seed=18)

    from pyopencl.clrandom import PhiloxGenerator
    rng = PhiloxGenerator(queue.context, seed=22)
    target_radii = rng.uniform(
        queue, ntargets, a=0, b=0.05, dtype=dtype
    ).get()

    # }}}

    # {{{ Generate tree and traversal

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)
    tree, _ = tb(
        queue, sources, targets=targets, target_radii=target_radii,
        stick_out_factor=0.15, max_particles_in_box=30, debug=True
    )

    from boxtree.traversal import FMMTraversalBuilder
    tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2)
    trav_dev, _ = tg(queue, tree, debug=True)
    trav = trav_dev.get(queue=queue)

    # }}}

    # {{{ Construct cost models

    cl_cost_model = FMMCostModel(None)
    python_cost_model = _PythonFMMCostModel(None)

    constant_one_params = cl_cost_model.get_unit_calibration_params().copy()
    for ilevel in range(trav.tree.nlevels):
        constant_one_params["p_fmm_lev%d" % ilevel] = 10

    xlat_cost = make_pde_aware_translation_cost_model(dims, trav.tree.nlevels)

    # }}}

    # {{{ Test process_form_multipoles

    nlevels = trav.tree.nlevels
    p2m_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        p2m_cost[ilevel] = evaluate(
            xlat_cost.p2m(ilevel),
            context=constant_one_params
        )
    p2m_cost_dev = cl.array.to_device(queue, p2m_cost)

    queue.finish()
    start_time = time.time()

    cl_form_multipoles = cl_cost_model.process_form_multipoles(
        queue, trav_dev, p2m_cost_dev
    )

    queue.finish()
    logger.info("OpenCL time for process_form_multipoles: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_form_multipoles = python_cost_model.process_form_multipoles(
        queue, trav, p2m_cost
    )

    logger.info("Python time for process_form_multipoles: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_form_multipoles.get(), python_form_multipoles)

    # }}}

    # {{{ Test process_coarsen_multipoles

    m2m_cost = np.zeros(nlevels - 1, dtype=np.float64)
    for target_level in range(nlevels - 1):
        m2m_cost[target_level] = evaluate(
            xlat_cost.m2m(target_level + 1, target_level),
            context=constant_one_params
        )
    m2m_cost_dev = cl.array.to_device(queue, m2m_cost)

    queue.finish()
    start_time = time.time()
    cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles(
        queue, trav_dev, m2m_cost_dev
    )

    queue.finish()
    logger.info("OpenCL time for coarsen_multipoles: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles(
        queue, trav, m2m_cost
    )

    logger.info("Python time for coarsen_multipoles: {0}".format(
        str(time.time() - start_time)
    ))

    assert cl_coarsen_multipoles == python_coarsen_multipoles

    # }}}

    # {{{ Test process_direct

    queue.finish()
    start_time = time.time()

    cl_ndirect_sources_per_target_box = \
        cl_cost_model.get_ndirect_sources_per_target_box(queue, trav_dev)

    cl_direct = cl_cost_model.process_direct(
        queue, trav_dev, cl_ndirect_sources_per_target_box, 5.0
    )

    queue.finish()
    logger.info("OpenCL time for process_direct: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_ndirect_sources_per_target_box = \
        python_cost_model.get_ndirect_sources_per_target_box(queue, trav)

    python_direct = python_cost_model.process_direct(
        queue, trav, python_ndirect_sources_per_target_box, 5.0
    )

    logger.info("Python time for process_direct: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_direct.get(), python_direct)

    # }}}

    # {{{ Test aggregate_over_boxes

    start_time = time.time()

    cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(cl_direct)

    queue.finish()
    logger.info("OpenCL time for aggregate_over_boxes: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_direct_aggregate = python_cost_model.aggregate_over_boxes(python_direct)

    logger.info("Python time for aggregate_over_boxes: {0}".format(
        str(time.time() - start_time)
    ))

    assert cl_direct_aggregate == python_direct_aggregate

    # }}}

    # {{{ Test process_list2

    nlevels = trav.tree.nlevels
    m2l_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        m2l_cost[ilevel] = evaluate(
            xlat_cost.m2l(ilevel, ilevel),
            context=constant_one_params
        )
    m2l_cost_dev = cl.array.to_device(queue, m2l_cost)

    queue.finish()
    start_time = time.time()

    cl_m2l_cost = cl_cost_model.process_list2(queue, trav_dev, m2l_cost_dev)

    queue.finish()
    logger.info("OpenCL time for process_list2: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()
    python_m2l_cost = python_cost_model.process_list2(queue, trav, m2l_cost)
    logger.info("Python time for process_list2: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_m2l_cost.get(), python_m2l_cost)

    # }}}

    # {{{ Test process_list 3

    m2p_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        m2p_cost[ilevel] = evaluate(
            xlat_cost.m2p(ilevel),
            context=constant_one_params
        )
    m2p_cost_dev = cl.array.to_device(queue, m2p_cost)

    queue.finish()
    start_time = time.time()

    cl_m2p_cost = cl_cost_model.process_list3(queue, trav_dev, m2p_cost_dev)

    queue.finish()
    logger.info("OpenCL time for process_list3: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()
    python_m2p_cost = python_cost_model.process_list3(queue, trav, m2p_cost)
    logger.info("Python time for process_list3: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_m2p_cost.get(), python_m2p_cost)

    # }}}

    # {{{ Test process_list4

    p2l_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        p2l_cost[ilevel] = evaluate(
            xlat_cost.p2l(ilevel),
            context=constant_one_params
        )
    p2l_cost_dev = cl.array.to_device(queue, p2l_cost)

    queue.finish()
    start_time = time.time()

    cl_p2l_cost = cl_cost_model.process_list4(queue, trav_dev, p2l_cost_dev)

    queue.finish()
    logger.info("OpenCL time for process_list4: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()
    python_p2l_cost = python_cost_model.process_list4(queue, trav, p2l_cost)
    logger.info("Python time for process_list4: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_p2l_cost.get(), python_p2l_cost)

    # }}}

    # {{{ Test process_refine_locals

    l2l_cost = np.zeros(nlevels - 1, dtype=np.float64)
    for ilevel in range(nlevels - 1):
        l2l_cost[ilevel] = evaluate(
            xlat_cost.l2l(ilevel, ilevel + 1),
            context=constant_one_params
        )
    l2l_cost_dev = cl.array.to_device(queue, l2l_cost)

    queue.finish()
    start_time = time.time()

    cl_refine_locals_cost = cl_cost_model.process_refine_locals(
        queue, trav_dev, l2l_cost_dev
    )

    queue.finish()
    logger.info("OpenCL time for refine_locals: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()
    python_refine_locals_cost = python_cost_model.process_refine_locals(
        queue, trav, l2l_cost
    )
    logger.info("Python time for refine_locals: {0}".format(
        str(time.time() - start_time)
    ))

    assert cl_refine_locals_cost == python_refine_locals_cost

    # }}}

    # {{{ Test process_eval_locals

    l2p_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        l2p_cost[ilevel] = evaluate(
            xlat_cost.l2p(ilevel),
            context=constant_one_params
        )
    l2p_cost_dev = cl.array.to_device(queue, l2p_cost)

    queue.finish()
    start_time = time.time()

    cl_l2p_cost = cl_cost_model.process_eval_locals(queue, trav_dev, l2p_cost_dev)

    queue.finish()
    logger.info("OpenCL time for process_eval_locals: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()
    python_l2p_cost = python_cost_model.process_eval_locals(queue, trav, l2p_cost)
    logger.info("Python time for process_eval_locals: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_l2p_cost.get(), python_l2p_cost)
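Every block above repeats the same step: evaluate the symbolic per-level cost with the chosen parameter context and pack the values into a numpy array before handing them to the matching process_* call. A hedged helper sketch capturing that pattern (the names are illustrative, not part of boxtree):

import numpy as np
from pymbolic import evaluate

def per_level_costs(sym_cost_for_level, nlevels, context):
    """Evaluate a symbolic per-level cost for every tree level."""
    return np.array([
        evaluate(sym_cost_for_level(ilevel), context=context)
        for ilevel in range(nlevels)
    ], dtype=np.float64)

# e.g. p2m_cost = per_level_costs(xlat_cost.p2m, nlevels, constant_one_params)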
Example no. 28
0
    def map_call(self, expr, enclosing_prec, type_context):
        from pymbolic.primitives import Variable, Subscript
        from pymbolic.mapper.stringifier import PREC_NONE

        identifier = expr.function

        # {{{ implement indexof, indexof_vec

        if identifier.name in ["indexof", "indexof_vec"]:
            if len(expr.parameters) != 1:
                raise LoopyError("%s takes exactly one argument" % identifier.name)
            arg, = expr.parameters
            if not isinstance(arg, Subscript):
                raise LoopyError(
                        "argument to %s must be a subscript" % identifier.name)

            ary = self.find_array(arg)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate
            access_info = get_access_info(self.kernel.target, ary, arg.index,
                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                    self.codegen_state.vectorization_info)

            from loopy.kernel.data import ImageArg
            if isinstance(ary, ImageArg):
                raise LoopyError("%s does not support images" % identifier.name)

            if identifier.name == "indexof":
                return access_info.subscripts[0]
            elif identifier.name == "indexof_vec":
                from loopy.kernel.array import VectorArrayDimTag
                ivec = None
                for iaxis, dim_tag in enumerate(ary.dim_tags):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        ivec = iaxis

                if ivec is None:
                    return access_info.subscripts[0]
                else:
                    return (
                        access_info.subscripts[0]*ary.shape[ivec]
                        + access_info.vector_index)

            else:
                raise RuntimeError("should not get here")

        # }}}

        if isinstance(identifier, Variable):
            identifier = identifier.name

        par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

        str_parameters = None

        mangle_result = self.kernel.mangle_function(
                identifier, par_dtypes,
                ast_builder=self.codegen_state.ast_builder)

        if mangle_result is None:
            raise RuntimeError("function '%s' unknown--"
                    "maybe you need to register a function mangler?"
                    % identifier)

        if len(mangle_result.result_dtypes) != 1:
            raise LoopyError("functions with more or fewer than one return value "
                    "may not be used in an expression")

        if mangle_result.arg_dtypes is not None:
            str_parameters = [
                    self.rec(par, PREC_NONE,
                        dtype_to_type_context(self.kernel.target, tgt_dtype),
                        tgt_dtype)
                    for par, par_dtype, tgt_dtype in zip(
                        expr.parameters, par_dtypes, mangle_result.arg_dtypes)]

        else:
            # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
            # propagate the type context here. But for many others, it does
            # not. Using the inferred type as a stopgap for now.
            str_parameters = [
                    self.rec(par, PREC_NONE,
                        type_context=dtype_to_type_context(
                            self.kernel.target, par_dtype))
                    for par, par_dtype in zip(expr.parameters, par_dtypes)]

            from warnings import warn
            warn("Calling function '%s' with unknown C signature--"
                    "return CallMangleInfo.arg_dtypes"
                    % identifier, LoopyWarning)

        from loopy.codegen import SeenFunction
        self.codegen_state.seen_functions.add(
                SeenFunction(identifier,
                    mangle_result.target_name,
                    mangle_result.arg_dtypes or par_dtypes))

        return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
Example no. 29
0
    def map_subscript(self, expr, type_context):
        def base_impl(expr, type_context):
            return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')]

        def make_var(name):
            from loopy import TaggedVariable
            if isinstance(expr.aggregate, TaggedVariable):
                return TaggedVariable(name, expr.aggregate.tag)
            else:
                return var(name)

        from pymbolic.primitives import Variable
        if not isinstance(expr.aggregate, Variable):
            return base_impl(expr, type_context)

        ary = self.find_array(expr)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

        access_info = get_access_info(self.kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import (
                ImageArg, ArrayArg, TemporaryVariable, ConstantArg)

        if isinstance(ary, ImageArg):
            extra_axes = 0

            num_target_axes = ary.num_target_axes()
            if num_target_axes in [1, 2]:
                idx_vec_type = "float2"
                extra_axes = 2-num_target_axes
            elif num_target_axes == 3:
                idx_vec_type = "float4"
                extra_axes = 4-num_target_axes
            else:
                raise LoopyError("unsupported number (%d) of target axes in image"
                        % num_target_axes)

            idx_tuple = expr.index_tuple[::-1] + (0,) * extra_axes

            base_access = var("read_imagef")(
                    var(ary.name),
                    var("loopy_sampler"),
                    var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i')))

            if ary.dtype.numpy_dtype == np.float32:
                return base_access.attr("x")
            if self.kernel.target.is_vector_dtype(ary.dtype):
                return base_access
            elif ary.dtype.numpy_dtype == np.float64:
                return var("as_double")(base_access.attr("xy"))
            else:
                raise NotImplementedError(
                        "non-floating-point images not supported for now")

        elif isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)):
            if len(access_info.subscripts) == 0:
                if (
                        (isinstance(ary, (ConstantArg, ArrayArg)) or
                         (isinstance(ary, TemporaryVariable) and ary.base_storage))):
                    # unsubscripted global args are pointers
                    result = make_var(access_info.array_name)[0]

                else:
                    # unsubscripted temp vars are scalars
                    # (unless they use base_storage)
                    result = make_var(access_info.array_name)

            else:
                subscript, = access_info.subscripts
                result = make_var(access_info.array_name)[simplify_using_aff(
                    self.kernel, self.rec(subscript, 'i'))]

            if access_info.vector_index is not None:
                return self.codegen_state.ast_builder.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

        else:
            assert False
Example no. 30
0
def get_auto_axis_iname_ranking_by_stride(kernel, insn):
    from loopy.kernel.data import ImageArg, ValueArg

    approximate_arg_values = {}
    for arg in kernel.args:
        if isinstance(arg, ValueArg):
            if arg.approximately is not None:
                approximate_arg_values[arg.name] = arg.approximately
            else:
                raise LoopyError(
                    "No approximate arg value specified for '%s'" % arg.name)

    # {{{ find all array accesses in insn

    from loopy.symbolic import ArrayAccessFinder
    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))

    from pymbolic.primitives import Subscript

    for assignee in insn.assignees:
        if isinstance(assignee, Subscript):
            ary_acc_exprs.append(assignee)

    # }}}

    # {{{ filter array accesses to only the global ones

    global_ary_acc_exprs = []

    for aae in ary_acc_exprs:
        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)
        if arg is None:
            continue

        if isinstance(arg, ImageArg):
            continue

        global_ary_acc_exprs.append(aae)

    # }}}

    # {{{ figure out automatic-axis inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    auto_axis_inames = set(
        iname for iname in kernel.insn_inames(insn)
        if isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase))

    # }}}

    # {{{ figure out which iname should get mapped to local axis 0

    # maps inames to "aggregate stride"
    aggregate_strides = {}

    from loopy.symbolic import CoefficientCollector
    from pymbolic.primitives import Variable

    for aae in global_ary_acc_exprs:
        index_expr = aae.index
        if not isinstance(index_expr, tuple):
            index_expr = (index_expr, )

        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)

        if arg.dim_tags is None:
            from warnings import warn
            warn("Strides for '%s' are not known. Local axis assignment "
                 "is likely suboptimal." % arg.name)
            ary_strides = [1] * len(index_expr)
        else:
            ary_strides = []
            from loopy.kernel.array import FixedStrideArrayDimTag
            for dim_tag in arg.dim_tags:
                if isinstance(dim_tag, FixedStrideArrayDimTag):
                    ary_strides.append(dim_tag.stride)

        # {{{ construct iname_to_stride_expr

        iname_to_stride_expr = {}
        for iexpr_i, stride in zip(index_expr, ary_strides):
            if stride is None:
                continue
            coeffs = CoefficientCollector()(iexpr_i)
            for var, coeff in six.iteritems(coeffs):
                if (isinstance(var, Variable)
                        and var.name in auto_axis_inames):
                    # excludes '1', i.e.  the constant
                    new_stride = coeff * stride
                    old_stride = iname_to_stride_expr.get(var.name, None)
                    if old_stride is None or new_stride < old_stride:
                        iname_to_stride_expr[var.name] = new_stride

        # }}}

        from pymbolic import evaluate
        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
            stride = evaluate(stride_expr, approximate_arg_values)
            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride

    if aggregate_strides:
        very_large_stride = int(np.iinfo(np.int32).max)

        return sorted((iname for iname in kernel.insn_inames(insn)),
                      key=lambda iname:
                      (aggregate_strides.get(iname, very_large_stride), iname))
    else:
        return None
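The ranking above boils down to: take each candidate iname's coefficient in the linearized index expression, scale it by the stride of that array axis, evaluate the result with the approximate argument values, and prefer the iname with the smallest aggregate stride for local axis 0. A standalone sketch of that idea using pymbolic's CoefficientCollector on a made-up index expression with a concrete row stride:

import pymbolic
from pymbolic.mapper.coefficient import CoefficientCollector
from pymbolic.primitives import Variable

# index expression for a[i, j] with row stride 1000 (already linearized)
index_expr = pymbolic.parse("j + 1000*i")
coeffs = CoefficientCollector()(index_expr)

approximate_arg_values = {}           # would hold e.g. {"n": 1000} if symbolic
stride_by_iname = {
    v.name: pymbolic.evaluate(coeff, approximate_arg_values)
    for v, coeff in coeffs.items()
    if isinstance(v, Variable)
}
print(stride_by_iname)                # {'j': 1, 'i': 1000}: 'j' gets local axis 0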
Example no. 31
0
    def map_subscript(self, expr, enclosing_prec, type_context):
        def base_impl(expr, enclosing_prec, type_context):
            return self.parenthesize_if_needed(
                    "%s[%s]" % (
                        self.rec(expr.aggregate, PREC_CALL, type_context),
                        self.rec(expr.index, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

        from pymbolic.primitives import Variable
        if not isinstance(expr.aggregate, Variable):
            return base_impl(expr, enclosing_prec, type_context)

        if expr.aggregate.name in self.kernel.arg_dict:
            ary = self.kernel.arg_dict[expr.aggregate.name]
        elif expr.aggregate.name in self.kernel.temporary_variables:
            ary = self.kernel.temporary_variables[expr.aggregate.name]
        else:
            raise RuntimeError("nothing known about subscripted variable '%s'"
                    % expr.aggregate.name)

        from loopy.kernel.array import ArrayBase
        if not isinstance(ary, ArrayBase):
            raise RuntimeError("subscripted variable '%s' is not an array"
                    % expr.aggregate.name)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        access_info = get_access_info(self.kernel.target, ary, expr.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        vec_member = get_opencl_vec_member(access_info.vector_index)

        from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

        if isinstance(ary, ImageArg):
            base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
                    % (ary.name, ary.dimensions,
                        ", ".join(self.rec(idx, PREC_NONE, 'i')
                            for idx in expr.index[::-1])))

            if ary.dtype == np.float32:
                return base_access+".x"
            if self.kernel.target.is_vector_dtype(ary.dtype):
                return base_access
            elif ary.dtype == np.float64:
                return "as_double(%s.xy)" % base_access
            else:
                raise NotImplementedError(
                        "non-floating-point images not supported for now")

        elif isinstance(ary, (GlobalArg, TemporaryVariable)):
            if len(access_info.subscripts) == 0:
                if isinstance(ary, GlobalArg):
                    # unsubscripted global args are pointers
                    if vec_member is not None:
                        return "%s->%s" % (
                                access_info.array_name,
                                vec_member)
                    else:
                        return "*" + access_info.array_name

                else:
                    # unsubscripted temp vars are scalars
                    if vec_member is not None:
                        return "%s.%s" % (
                                access_info.array_name,
                                vec_member)
                    else:
                        return access_info.array_name

            else:
                subscript, = access_info.subscripts
                result = self.parenthesize_if_needed(
                        "%s[%s]" % (
                            access_info.array_name,
                            self.rec(subscript, PREC_NONE, 'i')),
                        enclosing_prec, PREC_CALL)

                if vec_member:
                    result += "."+vec_member

                return result

        else:
            assert False
Example No. 32
0
    def as_primitives(self):
        deps = pymbolic.get_dependencies(self)
        context = dict((dep, dep) for dep in deps)
        return pymbolic.evaluate(self, context)
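As a quick reminder of the basic API used by as_primitives, here is a minimal sketch of pymbolic.evaluate with an ordinary name-to-value context; the expression and values are invented for illustration.

from pymbolic import parse, evaluate

expr = parse("a*x + b")
print(evaluate(expr, {"a": 2, "x": 10, "b": 3}))  # 23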
Example No. 33
0
    def map_call(self, expr, enclosing_prec, type_context):
        from pymbolic.primitives import Variable, Subscript
        from pymbolic.mapper.stringifier import PREC_NONE

        identifier = expr.function

        # {{{ implement indexof, indexof_vec

        if identifier.name in ["indexof", "indexof_vec"]:
            if len(expr.parameters) != 1:
                raise LoopyError("%s takes exactly one argument" %
                                 identifier.name)
            arg, = expr.parameters
            if not isinstance(arg, Subscript):
                raise LoopyError("argument to %s must be a subscript" %
                                 identifier.name)

            ary = self.find_array(arg)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate
            access_info = get_access_info(
                self.kernel.target, ary, arg.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

            from loopy.kernel.data import ImageArg
            if isinstance(ary, ImageArg):
                raise LoopyError("%s does not support images" %
                                 identifier.name)

            if identifier.name == "indexof":
                return access_info.subscripts[0]
            elif identifier.name == "indexof_vec":
                from loopy.kernel.array import VectorArrayDimTag
                ivec = None
                for iaxis, dim_tag in enumerate(ary.dim_tags):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        ivec = iaxis

                if ivec is None:
                    return access_info.subscripts[0]
                else:
                    return (access_info.subscripts[0] * ary.shape[ivec] +
                            access_info.vector_index)

            else:
                raise RuntimeError("should not get here")

        # }}}

        if isinstance(identifier, Variable):
            identifier = identifier.name

        par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

        str_parameters = None

        mangle_result = self.kernel.mangle_function(
            identifier, par_dtypes, ast_builder=self.codegen_state.ast_builder)

        if mangle_result is None:
            raise RuntimeError(
                "function '%s' unknown--"
                "maybe you need to register a function mangler?" % identifier)

        if len(mangle_result.result_dtypes) != 1:
            raise LoopyError(
                "functions with more or fewer than one return value "
                "may not be used in an expression")

        if mangle_result.arg_dtypes is not None:
            str_parameters = [
                self.rec(par, PREC_NONE,
                         dtype_to_type_context(self.kernel.target, tgt_dtype),
                         tgt_dtype) for par, par_dtype, tgt_dtype in
                zip(expr.parameters, par_dtypes, mangle_result.arg_dtypes)
            ]

        else:
            # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
            # propagate the type context here. But for many others, it does
            # not. Using the inferred type as a stopgap for now.
            str_parameters = [
                self.rec(par,
                         PREC_NONE,
                         type_context=dtype_to_type_context(
                             self.kernel.target, par_dtype))
                for par, par_dtype in zip(expr.parameters, par_dtypes)
            ]

            from warnings import warn
            warn(
                "Calling function '%s' with unknown C signature--"
                "return CallMangleInfo.arg_dtypes" % identifier, LoopyWarning)

        from loopy.codegen import SeenFunction
        self.codegen_state.seen_functions.add(
            SeenFunction(identifier, mangle_result.target_name,
                         mangle_result.arg_dtypes or par_dtypes))

        return "%s(%s)" % (mangle_result.target_name,
                           ", ".join(str_parameters))
Example No. 34
0
def get_auto_axis_iname_ranking_by_stride(kernel, insn):
    from loopy.kernel.data import ImageArg, ValueArg

    approximate_arg_values = {}
    for arg in kernel.args:
        if isinstance(arg, ValueArg):
            if arg.approximately is not None:
                approximate_arg_values[arg.name] = arg.approximately
            else:
                raise LoopyError("No approximate arg value specified for '%s'"
                        % arg.name)

    # {{{ find all array accesses in insn

    from loopy.symbolic import ArrayAccessFinder
    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))

    from pymbolic.primitives import Subscript

    if isinstance(insn.assignee, Subscript):
        ary_acc_exprs.append(insn.assignee)

    # }}}

    # {{{ filter array accesses to only the global ones

    global_ary_acc_exprs = []

    for aae in ary_acc_exprs:
        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)
        if arg is None:
            continue

        if isinstance(arg, ImageArg):
            continue

        global_ary_acc_exprs.append(aae)

    # }}}

    # {{{ figure out automatic-axis inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    auto_axis_inames = set(
            iname
            for iname in kernel.insn_inames(insn)
            if isinstance(kernel.iname_to_tag.get(iname),
                AutoLocalIndexTagBase))

    # }}}

    # {{{ figure out which iname should get mapped to local axis 0

    # maps inames to "aggregate stride"
    aggregate_strides = {}

    from loopy.symbolic import CoefficientCollector
    from pymbolic.primitives import Variable

    for aae in global_ary_acc_exprs:
        index_expr = aae.index
        if not isinstance(index_expr, tuple):
            index_expr = (index_expr,)

        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)

        if arg.dim_tags is None:
            from warnings import warn
            warn("Strides for '%s' are not known. Local axis assignment "
                    "is likely suboptimal." % arg.name)
            ary_strides = [1] * len(index_expr)
        else:
            ary_strides = []
            from loopy.kernel.array import FixedStrideArrayDimTag
            for dim_tag in arg.dim_tags:
                if isinstance(dim_tag, FixedStrideArrayDimTag):
                    ary_strides.append(dim_tag.stride)

        # {{{ construct iname_to_stride_expr

        iname_to_stride_expr = {}
        for iexpr_i, stride in zip(index_expr, ary_strides):
            if stride is None:
                continue
            coeffs = CoefficientCollector()(iexpr_i)
            for var, coeff in six.iteritems(coeffs):
                if (isinstance(var, Variable)
                        and var.name in auto_axis_inames):
                    # excludes '1', i.e. the constant
                    new_stride = coeff*stride
                    old_stride = iname_to_stride_expr.get(var.name, None)
                    if old_stride is None or new_stride < old_stride:
                        iname_to_stride_expr[var.name] = new_stride

        # }}}

        from pymbolic import evaluate
        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
            stride = evaluate(stride_expr, approximate_arg_values)
            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride

    if aggregate_strides:
        very_large_stride = np.iinfo(np.int32).max

        return sorted((iname for iname in kernel.insn_inames(insn)),
                key=lambda iname: aggregate_strides.get(iname, very_large_stride))
    else:
        return None
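To make the final ranking step concrete, a tiny illustration (with made-up strides) of how the sort puts small-stride inames first and pushes inames with no recorded stride to the back:

import numpy as np

aggregate_strides = {"i": 1, "j": 1024}  # hypothetical aggregate strides
very_large_stride = np.iinfo(np.int32).max

print(sorted(["j", "k", "i"],
             key=lambda iname: aggregate_strides.get(iname, very_large_stride)))
# ['i', 'j', 'k']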
Example No. 35
0
    def _get_expr_dep_data(self, parsed):
        class Nth:
            def __init__(self, n):
                self.n = n

            def __call__(self, lst):
                return lst[self.n]

        from pymbolic.mapper.dependency import DependencyMapper

        deps = DependencyMapper(include_calls=False)(parsed)

        # gather information on aggregation expressions
        dep_data = []
        from pymbolic.primitives import Variable, Lookup, Subscript
        for dep_idx, dep in enumerate(deps):
            nonlocal_agg = True

            if isinstance(dep, Variable):
                name = dep.name

                if name == "math":
                    continue

                agg_func = self.quantity_data[name].default_aggregator
                if agg_func is None:
                    if self.is_parallel:
                        raise ValueError(
                                "must specify explicit aggregator for '%s'" % name)

                    agg_func = lambda lst: lst[0]
            elif isinstance(dep, Lookup):
                assert isinstance(dep.aggregate, Variable)
                name = dep.aggregate.name
                agg_name = dep.name

                if agg_name == "loc":
                    agg_func = Nth(self.rank)
                    nonlocal_agg = False
                elif agg_name == "min":
                    agg_func = min
                elif agg_name == "max":
                    agg_func = max
                elif agg_name == "avg":
                    from pytools import average
                    agg_func = average
                elif agg_name == "sum":
                    agg_func = sum
                elif agg_name == "norm2":
                    from math import sqrt
                    agg_func = lambda iterable: sqrt(
                            sum(entry**2 for entry in iterable))
                else:
                    raise ValueError("invalid rank aggregator '%s'" % agg_name)
            elif isinstance(dep, Subscript):
                assert isinstance(dep.aggregate, Variable)
                name = dep.aggregate.name

                from pymbolic import evaluate
                agg_func = Nth(evaluate(dep.index))

            qdat = self.quantity_data[name]

            from pytools import Record

            class DependencyData(Record):
                pass

            this_dep_data = DependencyData(name=name, qdat=qdat, agg_func=agg_func,
                    varname="logvar%d" % dep_idx, expr=dep,
                    nonlocal_agg=nonlocal_agg)
            dep_data.append(this_dep_data)

        # substitute in the "logvar" variable names
        from pymbolic import var, substitute
        parsed = substitute(parsed,
                dict((dd.expr, var(dd.varname)) for dd in dep_data))

        return parsed, dep_data
Example No. 36
0
    def map_call(self, expr, enclosing_prec, type_context):
        from pymbolic.primitives import Variable, Subscript
        from pymbolic.mapper.stringifier import PREC_NONE

        identifier = expr.function

        # {{{ implement indexof, indexof_vec

        if identifier.name in ["indexof", "indexof_vec"]:
            if len(expr.parameters) != 1:
                raise LoopyError("%s takes exactly one argument" % identifier.name)
            arg, = expr.parameters
            if not isinstance(arg, Subscript):
                raise LoopyError("argument to %s must be a subscript" % identifier.name)

            ary = self.find_array(arg)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            access_info = get_access_info(
                self.kernel.target,
                ary,
                arg.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info,
            )

            from loopy.kernel.data import ImageArg

            if isinstance(ary, ImageArg):
                raise LoopyError("%s does not support images" % identifier.name)

            if identifier.name == "indexof":
                return access_info.subscripts[0]
            elif identifier.name == "indexof_vec":
                from loopy.kernel.array import VectorArrayDimTag

                ivec = None
                for iaxis, dim_tag in enumerate(ary.dim_tags):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        ivec = iaxis

                if ivec is None:
                    return access_info.subscripts[0]
                else:
                    return access_info.subscripts[0] * ary.shape[ivec] + access_info.vector_index

            else:
                raise RuntimeError("should not get here")

        # }}}

        c_name = None
        if isinstance(identifier, Variable):
            identifier = identifier.name
            c_name = identifier

        par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

        str_parameters = None

        mangle_result = self.kernel.mangle_function(identifier, par_dtypes)
        if mangle_result is not None:
            if len(mangle_result) == 2:
                result_dtype, c_name = mangle_result
            elif len(mangle_result) == 3:
                result_dtype, c_name, arg_tgt_dtypes = mangle_result

                str_parameters = [
                    self.rec(par, PREC_NONE, dtype_to_type_context(self.kernel.target, tgt_dtype), tgt_dtype)
                    for par, par_dtype, tgt_dtype in zip(expr.parameters, par_dtypes, arg_tgt_dtypes)
                ]
            else:
                raise RuntimeError("result of function mangler " "for function '%s' not understood" % identifier)

        from loopy.codegen import SeenFunction

        self.codegen_state.seen_functions.add(SeenFunction(identifier, c_name, par_dtypes))
        if str_parameters is None:
            # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
            # propagate the type context here. But for many others, it does
            # not. Using the inferred type as a stopgap for now.
            str_parameters = [
                self.rec(par, PREC_NONE, type_context=dtype_to_type_context(self.kernel.target, par_dtype))
                for par, par_dtype in zip(expr.parameters, par_dtypes)
            ]

        if c_name is None:
            raise RuntimeError("unable to find C name for function identifier '%s'" % identifier)

        return "%s(%s)" % (c_name, ", ".join(str_parameters))
Example No. 37
0
def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed):
    from pymbolic import evaluate

    def get_numpy_type(x):
        if expr_type in ["real", "complex"]:
            if isinstance(x, (complex, np.complexfloating)):
                return np.complex128
            else:
                return np.float64

        elif expr_type in ["int", "int_nonneg"]:
            return np.int64

        else:
            raise ValueError("unknown expr_type: %s" % expr_type)

    from random import seed

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    seed(random_seed)

    data = []
    instructions = []

    ref_values = {}

    if expr_type in ["real", "complex"]:
        result_type = np.complex128
    elif expr_type in ["int", "int_nonneg"]:
        result_type = np.int64
    else:
        assert False

    var_names = []

    fuzz_iter = iter(generate_random_fuzz_examples(expr_type))
    count = 0

    while True:
        if count == 10:
            break

        i, expr, var_values = next(fuzz_iter)

        var_name = "expr%d" % i

        print(expr)
        #assert_parse_roundtrip(expr)

        if expr_type in ["int", "int_nonneg"]:
            result_type_iinfo = np.iinfo(np.int32)
            bceval_mapper = BoundsCheckingEvaluationMapper(
                var_values,
                lbound=result_type_iinfo.min,
                ubound=result_type_iinfo.max)
            print(expr)
            try:
                ref_values[var_name] = bceval_mapper(expr)
            except BoundsCheckError:
                print(expr)
                print("BOUNDS CHECK FAILED")
                continue
        else:
            try:
                ref_values[var_name] = evaluate(expr, var_values)
            except ZeroDivisionError:
                continue

        count += 1

        data.append(lp.GlobalArg(var_name, result_type, shape=()))
        data.extend([
            lp.TemporaryVariable(name, get_numpy_type(val))
            for name, val in var_values.items()
        ])
        instructions.extend([
            lp.Assignment(name,
                          get_numpy_type(val)(val))
            for name, val in var_values.items()
        ])
        instructions.append(lp.Assignment(var_name, expr))

        if expr_type == "int_nonneg":
            var_names.extend(var_values)

    knl = lp.make_kernel("{ : }", instructions, data, seq_dependencies=True)

    import islpy as isl
    knl = lp.assume(
        knl,
        isl.BasicSet(
            "[%s] -> { : %s}" %
            (", ".join(var_names), " and ".join("%s >= 0" % name
                                                for name in var_names))))

    knl = lp.set_options(knl, return_dict=True)
    print(knl)
    evt, lp_values = knl(queue, out_host=True)

    for name, ref_value in ref_values.items():
        lp_value = lp_values[name]
        if expr_type in ["real", "complex"]:
            err = abs(ref_value - lp_value) / abs(ref_value)
        elif expr_type in ["int", "int_nonneg"]:
            err = abs(ref_value - lp_value)
        else:
            assert False

        if abs(err) > 1e-10:
            print(80 * "-")
            print(knl)
            print(80 * "-")
            print(lp.generate_code_v2(knl).device_code())
            print(80 * "-")
            print(f"WRONG: {name} rel error={err:g}")
            print("reference=%r" % ref_value)
            print("loopy=%r" % lp_value)
            print(80 * "-")
            1 / 0

    print(lp.generate_code_v2(knl).device_code())
Example No. 38
0
def test_substitute():
    from pymbolic import parse, substitute, evaluate
    u = parse("5+x.min**2")
    xmin = parse("x.min")
    assert evaluate(substitute(u, {xmin: 25})) == 630
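A related minimal sketch: when the leaves are plain variables rather than attribute lookups such as x.min, evaluate alone can supply the values, with no prior substitute step.

from pymbolic import parse, evaluate

expr = parse("5 + y**2")
print(evaluate(expr, {"y": 25}))  # 630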
Example No. 39
0
    def emit_assignment(self, codegen_state, insn):
        kernel = codegen_state.kernel
        ecm = codegen_state.expression_to_code_mapper

        assignee_var_name, = insn.assignee_var_names()

        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
        lhs_dtype = lhs_var.dtype

        if insn.atomicity:
            raise NotImplementedError("atomic ops in ISPC")

        from loopy.expression import dtype_to_type_context
        from pymbolic.mapper.stringifier import PREC_NONE

        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
        rhs_code = ecm(insn.expression, prec=PREC_NONE,
                    type_context=rhs_type_context,
                    needed_dtype=lhs_dtype)

        lhs = insn.assignee

        # {{{ handle streaming stores

        if "!streaming_store" in insn.tags:
            ary = ecm.find_array(lhs)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            from loopy.symbolic import simplify_using_aff
            index_tuple = tuple(
                    simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

            access_info = get_access_info(kernel.target, ary, index_tuple,
                    lambda expr: evaluate(expr, codegen_state.var_subst_map),
                    codegen_state.vectorization_info)

            from loopy.kernel.data import ArrayArg, TemporaryVariable

            if not isinstance(ary, (ArrayArg, TemporaryVariable)):
                raise LoopyError("array type not supported in ISPC: %s"
                        % type(ary).__name)

            if len(access_info.subscripts) != 1:
                raise LoopyError("streaming stores must have a subscript")
            subscript, = access_info.subscripts

            from pymbolic.primitives import Sum, flattened_sum, Variable
            if isinstance(subscript, Sum):
                terms = subscript.children
            else:
                terms = (subscript,)

            new_terms = []

            from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type
            from loopy.symbolic import get_dependencies

            saw_l0 = False
            for term in terms:
                if (isinstance(term, Variable)
                            and kernel.iname_tags_of_type(term.name, LocalIndexTag)):
                    tag, = kernel.iname_tags_of_type(
                        term.name, LocalIndexTag, min_num=1, max_num=1)
                    if tag.axis == 0:
                        if saw_l0:
                            raise LoopyError(
                                "streaming store must have stride 1 in "
                                "local index, got: %s" % subscript)
                        saw_l0 = True
                        continue
                else:
                    for dep in get_dependencies(term):
                        if filter_iname_tags_by_type(
                                kernel.iname_to_tags.get(dep, []), LocalIndexTag):
                            tag, = filter_iname_tags_by_type(
                                kernel.iname_to_tags.get(dep, []), LocalIndexTag, 1)
                            if tag.axis == 0:
                                raise LoopyError(
                                    "streaming store must have stride 1 in "
                                    "local index, got: %s" % subscript)

                    new_terms.append(term)

            if not saw_l0:
                raise LoopyError("streaming store must have stride 1 in "
                        "local index, got: %s" % subscript)

            if access_info.vector_index is not None:
                raise LoopyError("streaming store may not use a short-vector "
                        "data type")

            rhs_has_programindex = any(
                isinstance(tag, LocalIndexTag) and tag.axis == 0
                for dep in get_dependencies(insn.expression)
                for tag in kernel.iname_tags(dep))

            if not rhs_has_programindex:
                rhs_code = "broadcast(%s, 0)" % rhs_code

            from cgen import Statement
            return Statement(
                    "streaming_store(%s + %s, %s)"
                    % (
                        access_info.array_name,
                        ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
                        rhs_code))

        # }}}

        from cgen import Assign
        return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
Example No. 40
0
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                                          "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is ArrayArg or\
                arg.arg_class is ConstantArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            dtype = kernel_arg.dtype
            itemsize = dtype.itemsize
            numpy_strides = [itemsize * s for s in strides]

            alloc_size = sum(astrd * (alen - 1) if astrd != 0 else alen - 1
                             for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                                        arg_desc.ref_shape,
                                        arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            common_len = min(len(host_ref_flat_array),
                             len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            host_contig_array = arg_desc.ref_storage_array.get()
            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
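A small aside on the shape/stride handling above: evaluate maps over tuples of expressions element by element, which is how symbolic unvec_shape and unvec_strides become concrete integers. The names and values below are invented for illustration.

from pymbolic import parse, evaluate

unvec_shape = (parse("n"), parse("n*m"))  # hypothetical symbolic shape
print(evaluate(unvec_shape, {"n": 4, "m": 3}))  # evaluates elementwise to 4 and 12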
Example No. 41
0
    def emit_assignment(self, codegen_state, insn):
        kernel = codegen_state.kernel
        ecm = codegen_state.expression_to_code_mapper

        assignee_var_name, = insn.assignee_var_names()

        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
        lhs_dtype = lhs_var.dtype

        if insn.atomicity:
            raise NotImplementedError("atomic ops in ISPC")

        from loopy.expression import dtype_to_type_context
        from pymbolic.mapper.stringifier import PREC_NONE

        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
        rhs_code = ecm(insn.expression,
                       prec=PREC_NONE,
                       type_context=rhs_type_context,
                       needed_dtype=lhs_dtype)

        lhs = insn.assignee

        # {{{ handle streaming stores

        if "!streaming_store" in insn.tags:
            ary = ecm.find_array(lhs)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            from loopy.symbolic import simplify_using_aff
            index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

            access_info = get_access_info(
                kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, codegen_state.var_subst_map),
                codegen_state.vectorization_info)

            from loopy.kernel.data import GlobalArg, TemporaryVariable

            if not isinstance(ary, (GlobalArg, TemporaryVariable)):
                raise LoopyError("array type not supported in ISPC: %s" %
                                 type(ary).__name)

            if len(access_info.subscripts) != 1:
                raise LoopyError("streaming stores must have a subscript")
            subscript, = access_info.subscripts

            from pymbolic.primitives import Sum, flattened_sum, Variable
            if isinstance(subscript, Sum):
                terms = subscript.children
            else:
                terms = (subscript,)

            new_terms = []

            from loopy.kernel.data import LocalIndexTag
            from loopy.symbolic import get_dependencies

            saw_l0 = False
            for term in terms:
                if (isinstance(term, Variable) and isinstance(
                        kernel.iname_to_tag.get(term.name), LocalIndexTag)
                        and kernel.iname_to_tag.get(term.name).axis == 0):
                    if saw_l0:
                        raise LoopyError("streaming store must have stride 1 "
                                         "in local index, got: %s" % subscript)
                    saw_l0 = True
                    continue
                else:
                    for dep in get_dependencies(term):
                        if (isinstance(kernel.iname_to_tag.get(dep),
                                       LocalIndexTag)
                                and kernel.iname_to_tag.get(dep).axis == 0):
                            raise LoopyError(
                                "streaming store must have stride 1 "
                                "in local index, got: %s" % subscript)

                    new_terms.append(term)

            if not saw_l0:
                raise LoopyError("streaming store must have stride 1 in "
                                 "local index, got: %s" % subscript)

            if access_info.vector_index is not None:
                raise LoopyError("streaming store may not use a short-vector "
                                 "data type")

            rhs_has_programindex = any(
                isinstance(kernel.iname_to_tag.get(dep), LocalIndexTag)
                and kernel.iname_to_tag.get(dep).axis == 0
                for dep in get_dependencies(insn.expression))

            if not rhs_has_programindex:
                rhs_code = "broadcast(%s, 0)" % rhs_code

            from cgen import Statement
            return Statement(
                "streaming_store(%s + %s, %s)" %
                (access_info.array_name,
                 ecm(flattened_sum(new_terms), PREC_NONE, 'i'), rhs_code))

        # }}}

        from cgen import Assign
        return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
Example No. 42
0
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, \
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            if arg.offset_for_name:
                continue

            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \
                or arg.arg_class is ConstantArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError(
                    "array '%s' needs known shape to use automatic "
                    "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                storage_array = ary = cl_array.empty(queue,
                                                     shape,
                                                     dtype,
                                                     order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                alloc_size = sum(astrd * (alen - 1) if astrd != 0 else alen - 1
                                 for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                                     "known. Perhaps you want to use "
                                     "loopy.add_dtypes "
                                     "or loopy.infer_argument_dtypes?" %
                                     arg.name)

                itemsize = dtype.itemsize
                numpy_strides = [itemsize * s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                                 "automatic testing")

            fill_rand(storage_array)

            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                    queue.context, ary.get())
            else:
                pre_run_storage_array = storage_array.copy()

                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(pre_run_storage_array, shape,
                                                  numpy_strides)
                ref_args[arg.name] = ary

            ref_arg_data.append(
                TestArgInfo(name=arg.name,
                            ref_array=ary,
                            ref_storage_array=storage_array,
                            ref_pre_run_array=pre_run_ary,
                            ref_pre_run_storage_array=pre_run_storage_array,
                            ref_shape=shape,
                            ref_strides=strides,
                            ref_alloc_size=alloc_size,
                            ref_numpy_strides=numpy_strides,
                            needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type %s not understood" % type(arg))

    return ref_args, ref_arg_data
Example No. 43
0
    def map_subscript(self, expr, enclosing_prec, type_context):
        def base_impl(expr, enclosing_prec, type_context):
            return self.parenthesize_if_needed(
                    "%s[%s]" % (
                        self.rec(expr.aggregate, PREC_CALL, type_context),
                        self.rec(expr.index, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

        from pymbolic.primitives import Variable
        if not isinstance(expr.aggregate, Variable):
            return base_impl(expr, enclosing_prec, type_context)

        ary = self.find_array(expr)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

        access_info = get_access_info(self.kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

        if isinstance(ary, ImageArg):
            base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
                    % (ary.name, ary.dimensions,
                        ", ".join(self.rec(idx, PREC_NONE, 'i')
                            for idx in expr.index[::-1])))

            if ary.dtype.numpy_dtype == np.float32:
                return base_access+".x"
            if self.kernel.target.is_vector_dtype(ary.dtype):
                return base_access
            elif ary.dtype.numpy_dtype == np.float64:
                return "as_double(%s.xy)" % base_access
            else:
                raise NotImplementedError(
                        "non-floating-point images not supported for now")

        elif isinstance(ary, (GlobalArg, TemporaryVariable)):
            if len(access_info.subscripts) == 0:
                if isinstance(ary, GlobalArg):
                    # unsubscripted global args are pointers
                    result = "*" + access_info.array_name

                else:
                    # unsubscripted temp vars are scalars
                    result = access_info.array_name

            else:
                subscript, = access_info.subscripts
                result = self.parenthesize_if_needed(
                        "%s[%s]" % (
                            access_info.array_name,
                            self.rec(subscript, PREC_NONE, 'i')),
                        enclosing_prec, PREC_CALL)

            if access_info.vector_index is not None:
                return self.codegen_state.ast_builder.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

        else:
            assert False