def check_sizes(kernel, device):
    """Check a kernel's launch grid and resource usage against *device* limits.

    Raises :class:`LoopyError` when a hard device limit is exceeded; merely
    warns (as :class:`LoopyAdvisory`) when checking cannot be completed.
    """
    import loopy as lp

    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        # Without a device there is nothing to check against; advise and bail.
        from loopy.diagnostic import warn
        warn(kernel, "no_device_in_pre_codegen_checks",
                "No device parameter was passed to the PyOpenCLTarget. "
                "Perhaps you want to pass a device to benefit from "
                "additional checking.", LoopyAdvisory)
        return

    # Use user-supplied approximate values for value arguments so that
    # symbolic grid sizes can be evaluated numerically below.
    parameters = {}
    for arg in kernel.args:
        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
            parameters[arg.name] = arg.approximately

    glens, llens = kernel.get_grid_sizes_as_exprs()

    if (max(len(glens), len(llens))
            > device.max_work_item_dimensions):
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError
    try:
        glens = evaluate(glens, parameters)
        llens = evaluate(llens, parameters)
    except UnknownVariableError as name:
        # A grid size depends on a parameter we have no value for; the
        # per-axis checks below are skipped in that case.
        from warnings import warn
        warn("could not check axis bounds because no value "
                "for variable '%s' was passed to check_kernels()"
                % name, LoopyAdvisory)
    else:
        for i in range(len(llens)):
            if llens[i] > device.max_work_item_sizes[i]:
                raise LoopyError("group axis %d too big" % i)

        from pytools import product
        if product(llens) > device.max_work_group_size:
            raise LoopyError("work group too big")

    from pyopencl.characterize import usable_local_mem_size
    if kernel.local_mem_use() > usable_local_mem_size(device):
        raise LoopyError("using too much local memory")

    from loopy.kernel.data import ConstantArg
    const_arg_count = sum(
            1 for arg in kernel.args
            if isinstance(arg, ConstantArg))

    if const_arg_count > device.max_constant_args:
        raise LoopyError("too many constant arguments")
def check_sizes(kernel, device):
    """Check a kernel's launch grid and resource usage against *device* limits.

    Variant using :meth:`get_grid_size_upper_bounds_as_exprs` (upper bounds on
    the grid rather than exact sizes). Raises :class:`LoopyError` when a hard
    device limit is exceeded; warns when checking cannot be completed.
    """
    import loopy as lp

    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        # Without a device there is nothing to check against; advise and bail.
        from loopy.diagnostic import warn
        warn(kernel, "no_device_in_pre_codegen_checks",
                "No device parameter was passed to the PyOpenCLTarget. "
                "Perhaps you want to pass a device to benefit from "
                "additional checking.", LoopyAdvisory)
        return

    # Use user-supplied approximate values for value arguments so that
    # symbolic grid sizes can be evaluated numerically below.
    parameters = {}
    for arg in kernel.args:
        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
            parameters[arg.name] = arg.approximately

    glens, llens = kernel.get_grid_size_upper_bounds_as_exprs()

    if (max(len(glens), len(llens))
            > device.max_work_item_dimensions):
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError
    try:
        glens = evaluate(glens, parameters)
        llens = evaluate(llens, parameters)
    except UnknownVariableError as name:
        # A grid size depends on a parameter we have no value for; the
        # per-axis checks below are skipped in that case.
        from warnings import warn
        warn("could not check axis bounds because no value "
                "for variable '%s' was passed to check_kernels()"
                % name, LoopyAdvisory)
    else:
        for i in range(len(llens)):
            if llens[i] > device.max_work_item_sizes[i]:
                raise LoopyError("group axis %d too big" % i)

        from pytools import product
        if product(llens) > device.max_work_group_size:
            raise LoopyError("work group too big")

    from pyopencl.characterize import usable_local_mem_size
    if kernel.local_mem_use() > usable_local_mem_size(device):
        raise LoopyError("using too much local memory")

    from loopy.kernel.data import ConstantArg
    const_arg_count = sum(
            1 for arg in kernel.args
            if isinstance(arg, ConstantArg))

    if const_arg_count > device.max_constant_args:
        raise LoopyError("too many constant arguments")
def map_subscript(self, expr, type_context):
    # Emit ISPC access code for subscripts of private temporaries, which are
    # laid out with a per-program-instance stride; everything else falls
    # through to the superclass.
    from loopy.kernel.data import TemporaryVariable

    ary = self.find_array(expr)

    if (isinstance(ary, TemporaryVariable)
            and ary.scope == temp_var_scope.PRIVATE):
        # generate access code for access to private-index temporaries

        gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
        if lsize:
            lsize, = lsize
            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            access_info = get_access_info(
                    self.kernel.target, ary, expr.index,
                    lambda expr: evaluate(
                        expr, self.codegen_state.var_subst_map),
                    self.codegen_state.vectorization_info)

            # Private temporaries are expected to have a single linearized
            # subscript at this point.
            subscript, = access_info.subscripts
            # Index as array[programIndex + lsize*subscript]: consecutive
            # program instances access consecutive elements.
            result = var(
                    access_info.array_name)[var("programIndex")
                            + self.rec(lsize * subscript, 'i')]

            if access_info.vector_index is not None:
                return self.kernel.target.add_vector_access(
                        result, access_info.vector_index)
            else:
                return result

    return super(ExprToISPCExprMapper, self).map_subscript(expr, type_context)
def map_subscript(self, expr, enclosing_prec, type_context):
    # String-emitting variant: render ISPC source text for subscripts of
    # temporaries; other arrays fall through to the superclass.
    from loopy.kernel.data import TemporaryVariable

    ary = self.find_array(expr)

    if isinstance(ary, TemporaryVariable):
        gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
        if lsize:
            lsize, = lsize
            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            access_info = get_access_info(
                    self.kernel.target, ary, expr.index,
                    lambda expr: evaluate(
                        expr, self.codegen_state.var_subst_map),
                    self.codegen_state.vectorization_info)

            # Temporaries are expected to have a single linearized subscript.
            subscript, = access_info.subscripts
            # Emit array[programIndex + lsize*subscript] so consecutive
            # program instances touch consecutive elements.
            result = self.parenthesize_if_needed(
                    "%s[programIndex + %s]" % (
                        access_info.array_name,
                        self.rec(lsize * subscript, PREC_SUM, 'i')),
                    enclosing_prec, PREC_CALL)

            if access_info.vector_index is not None:
                return self.kernel.target.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

    return super(ExprToISPCMapper, self).map_subscript(expr, enclosing_prec,
            type_context)
def map_subscript(self, expr, type_context):
    # Emit ISPC access code for subscripts of private temporaries (newer
    # AddressSpace-based spelling); everything else falls through.
    from loopy.kernel.data import TemporaryVariable

    ary = self.find_array(expr)

    if (isinstance(ary, TemporaryVariable)
            and ary.address_space == AddressSpace.PRIVATE):
        # generate access code for access to private-index temporaries

        gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
        if lsize:
            lsize, = lsize
            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            access_info = get_access_info(self.kernel.target, ary, expr.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

            # Private temporaries carry a single linearized subscript here.
            subscript, = access_info.subscripts
            # array[programIndex + lsize*subscript]: per-program-instance
            # interleaved layout.
            result = var(access_info.array_name)[
                    var("programIndex") + self.rec(lsize*subscript, 'i')]

            if access_info.vector_index is not None:
                return self.kernel.target.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

    return super(ExprToISPCExprMapper, self).map_subscript(
            expr, type_context)
def map_subscript(self, expr, enclosing_prec, type_context):
    # String-emitting variant (compact-argument spelling): render ISPC source
    # for subscripts of temporaries; other arrays fall through.
    from loopy.kernel.data import TemporaryVariable

    ary = self.find_array(expr)

    if isinstance(ary, TemporaryVariable):
        gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
        if lsize:
            lsize, = lsize
            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            access_info = get_access_info(self.kernel.target, ary, expr.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

            # Temporaries are expected to have a single linearized subscript.
            subscript, = access_info.subscripts
            result = self.parenthesize_if_needed(
                    "%s[programIndex + %s]" % (
                        access_info.array_name,
                        self.rec(lsize*subscript, PREC_SUM, 'i')),
                    enclosing_prec, PREC_CALL)

            if access_info.vector_index is not None:
                return self.kernel.target.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

    return super(ExprToISPCMapper, self).map_subscript(
            expr, enclosing_prec, type_context)
def estimate_calibration_params(model_results, timing_results):
    """Given a set of model results and matching timing results, estimate the
    best calibration parameters for the model.

    For each calibration parameter, the estimate is the least-squares slope of
    measured time versus modeled (uncalibrated) cost over all result pairs.
    Returns a dict mapping parameter name to the fitted coefficient (NaN when
    the parameter never appears with a nonzero cost).
    """
    params = set(_FMM_STAGE_TO_CALIBRATION_PARAMETER.values())

    nresults = len(model_results)

    if nresults != len(timing_results):
        raise ValueError("must have same number of model and timing results")

    # Per-parameter vectors indexed by result number.
    uncalibrated_times = {}
    actual_times = {}

    for param in params:
        uncalibrated_times[param] = np.zeros(nresults)
        actual_times[param] = np.zeros(nresults)

    from pymbolic import evaluate
    from pymbolic.mapper.coefficient import CoefficientCollector
    collect_coeffs = CoefficientCollector()

    for i, model_result in enumerate(model_results):
        context = model_result.params.copy()
        # Leave the calibration parameters symbolic so their coefficients
        # can be read off below.
        for param in params:
            context[param] = var(param)

        # Represents the total modeled cost, but leaves the calibration
        # parameters symbolic.
        total_cost = evaluate(
                sum(model_result.raw_costs.values()),
                context=context)

        coeffs = collect_coeffs(total_cost)
        # Assumes the total cost is a sum of terms linear in the parameters.
        assert set(key.name for key in coeffs) <= params

        for param, time in coeffs.items():
            uncalibrated_times[param.name][i] = time

    for i, timing_result in enumerate(timing_results):
        for param, time in timing_result.items():
            calibration_param = (
                    _FMM_STAGE_TO_CALIBRATION_PARAMETER[param])
            actual_times[calibration_param][i] = time["process_elapsed"]

    result = {}

    for param in params:
        uncalibrated = uncalibrated_times[param]
        actual = actual_times[param]

        if np.allclose(uncalibrated, 0):
            # Parameter never contributed to any modeled cost; no estimate.
            result[param] = float("NaN")
            continue

        # Least-squares slope of actual vs. uncalibrated through the origin.
        result[param] = (
                actual.dot(uncalibrated) / uncalibrated.dot(uncalibrated))

    return result
def qbx_cost_factors_for_kernels_from_model(
        self, queue, nlevels, xlat_cost, context):
    """Evaluate translation cost factors from symbolic model. The result of
    this function can be used for process_* methods in this class.

    This method overwrites the method in parent
    :class:`boxtree.cost.AbstractFMMCostModel` to support operations specific
    to QBX.

    :arg queue: If not None, the cost factor arrays will be transferred to
        device using this queue.
    :arg nlevels: the number of tree levels.
    :arg xlat_cost: a :class:`QBXTranslationCostModel`.
    :arg context: a :class:`dict` mapping from the symbolic names of
        parameters to their values, serving as context when evaluating
        symbolic expressions in *xlat_cost*.
    :return: a :class:`dict`, mapping from stage names to the translation
        costs of those stages in FMM and QBX.
    """
    # Start from the generic FMM factors, then add the QBX-specific stages.
    cost_factors = self.fmm_cost_factors_for_kernels_from_model(
        queue, nlevels, xlat_cost, context)

    cost_factors.update({
        "p2qbxl_cost": evaluate(xlat_cost.p2qbxl(), context=context),
        # Per-level factors for multipole/local-to-QBX-local translations.
        "m2qbxl_cost": np.array([
            evaluate(xlat_cost.m2qbxl(ilevel), context=context)
            for ilevel in range(nlevels)
        ]),
        "l2qbxl_cost": np.array([
            evaluate(xlat_cost.l2qbxl(ilevel), context=context)
            for ilevel in range(nlevels)
        ]),
        "qbxl2p_cost": evaluate(xlat_cost.qbxl2p(), context=context),
        "p2p_tsqbx_cost": evaluate(xlat_cost.p2p_tsqbx(), context=context)
    })

    if queue:
        cost_factors = self.cost_factors_to_dev(cost_factors, queue)

    return cost_factors
def eval(expr, source, center1, center2, target):
    """Parse *expr* and evaluate it with s/c1/c2/t bound to the given
    points and ``norm`` bound to ``la.norm``.
    """
    from pymbolic import parse, evaluate

    bindings = dict(
            s=source,
            c1=center1,
            c2=center2,
            t=target,
            norm=la.norm)

    return evaluate(parse(expr), bindings)
def from_primitives(expr, var_order):
    """Convert *expr* into polynomial form over the variables in *var_order*."""
    from pymbolic import get_dependencies, evaluate

    # Bind every dependency that is one of our polynomial variables to a
    # degree-one Polynomial; evaluating the expression under these bindings
    # then rebuilds it in polynomial arithmetic.
    context = {
            dep: Polynomial(dep, var_order=var_order)
            for dep in get_dependencies(expr)
            if dep in var_order}

    # FIXME not fast, but works
    # (and exercises multivariate polynomial code)
    return evaluate(expr, context)
def from_primitives(expr, var_order):
    """Convert *expr* into polynomial form over the variables in *var_order*."""
    from pymbolic import get_dependencies, evaluate

    # Bind each dependency that is a polynomial variable to a degree-one
    # Polynomial and re-evaluate the expression under those bindings.
    context = {}
    for dep in get_dependencies(expr):
        if dep in var_order:
            context[dep] = Polynomial(dep, var_order=var_order)

    # FIXME not fast, but works
    # (and exercises multivariate polynomial code)
    return evaluate(expr, context)
def evaluate_shape(shape, context):
    """Evaluate the symbolic axis lengths of *shape* under *context*.

    ``None`` axes (unknown lengths) are passed through unchanged; the result
    is always a tuple.
    """
    from pymbolic import evaluate

    return tuple(
            axis if axis is None else evaluate(axis, context)
            for axis in shape)
def aff_from_expr(space, expr, vars_to_zero=frozenset()):
    """Convert the pymbolic expression *expr* into an :class:`islpy.Aff`
    defined on *space*.

    :arg vars_to_zero: names that should evaluate to zero in *expr*.

    Each variable of *space* is mapped to the affine function that is 1 in
    that variable's coefficient; evaluating *expr* under that context then
    assembles the full affine expression.
    """
    # NOTE: the default was previously a mutable ``set()``; using an
    # (immutable) frozenset avoids the shared-mutable-default pitfall while
    # remaining backward compatible for all callers.
    zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))

    context = {}
    for name, (dt, pos) in six.iteritems(space.get_var_dict()):
        if dt == dim_type.set:
            # Aff coefficients live on input dims, not set dims.
            dt = dim_type.in_

        context[name] = zero.set_coefficient_val(dt, pos, 1)

    for name in vars_to_zero:
        context[name] = zero

    from pymbolic import evaluate
    return zero + evaluate(expr, context)
def test_fuzz_code_generator(ctx_factory):
    # Fuzz test: generate random expressions, evaluate them both directly
    # (pymbolic) and through a generated loopy kernel, and compare.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    #from expr_fuzz import get_fuzz_examples
    #for expr, var_values in get_fuzz_examples():
    for expr, var_values in generate_random_fuzz_examples(50):
        from pymbolic import evaluate
        try:
            true_value = evaluate(expr, var_values)
        except ZeroDivisionError:
            # Randomly generated expression divided by zero; skip it.
            continue

        def get_dtype(x):
            # Pick a kernel dtype matching the Python value's domain.
            if isinstance(x, (complex, np.complexfloating)):
                return np.complex128
            else:
                return np.float64

        knl = lp.make_kernel("{ : }",
                [lp.Assignment("value", expr)],
                [lp.GlobalArg("value", np.complex128, shape=())]
                + [
                    lp.ValueArg(name, get_dtype(val))
                    for name, val in six.iteritems(var_values)
                    ])
        ck = lp.CompiledKernel(ctx, knl)
        evt, (lp_value,) = ck(queue, out_host=True, **var_values)
        err = abs(true_value-lp_value)/abs(true_value)
        if abs(err) > 1e-10:
            # Dump full diagnostics, then fail loudly (deliberate 1/0 below
            # raises ZeroDivisionError to abort the test).
            print(80*"-")
            print("WRONG: rel error=%g" % err)
            print("true=%r" % true_value)
            print("loopy=%r" % lp_value)
            print(80*"-")
            print(ck.get_code())
            print(80*"-")
            print(var_values)
            print(80*"-")
            print(repr(expr))
            print(80*"-")
            print(expr)
            print(80*"-")
            1/0
def test_fuzz_code_generator(ctx_factory):
    # Fuzz test: generate random expressions, evaluate them both directly
    # (pymbolic) and through a generated loopy kernel, and compare.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    #from expr_fuzz import get_fuzz_examples
    #for expr, var_values in get_fuzz_examples():
    for expr, var_values in generate_random_fuzz_examples(50):
        from pymbolic import evaluate
        try:
            true_value = evaluate(expr, var_values)
        except ZeroDivisionError:
            # Randomly generated expression divided by zero; skip it.
            continue

        def get_dtype(x):
            # Pick a kernel dtype matching the Python value's domain.
            if isinstance(x, (complex, np.complexfloating)):
                return np.complex128
            else:
                return np.float64

        knl = lp.make_kernel("{ : }",
                [lp.Assignment("value", expr)],
                [lp.GlobalArg("value", np.complex128, shape=())]
                + [
                    lp.ValueArg(name, get_dtype(val))
                    for name, val in six.iteritems(var_values)
                    ])
        ck = lp.CompiledKernel(ctx, knl)
        evt, (lp_value, ) = ck(queue, out_host=True, **var_values)
        err = abs(true_value - lp_value) / abs(true_value)
        if abs(err) > 1e-10:
            # Dump full diagnostics, then fail loudly (deliberate 1/0 below
            # raises ZeroDivisionError to abort the test).
            print(80 * "-")
            print("WRONG: rel error=%g" % err)
            print("true=%r" % true_value)
            print("loopy=%r" % lp_value)
            print(80 * "-")
            print(ck.get_code())
            print(80 * "-")
            print(var_values)
            print(80 * "-")
            print(repr(expr))
            print(80 * "-")
            print(expr)
            print(80 * "-")
            1 / 0
def map_math_functions_by_name(i, func, pars):
    """Return the derivative expression for a recognized single-argument
    ``math`` function call with arguments *pars*.
    """
    try:
        f = pymbolic.evaluate(func, {"math": math, "cmath": cmath})
    except pymbolic.mapper.evaluator.UnknownVariableError:
        raise RuntimeError("No derivative of non-constant function "+str(func))

    def math_func(name):
        # Build a symbolic reference to math.<name>.
        return primitives.Lookup(primitives.Variable("math"), name)

    if len(pars) == 1:
        arg, = pars
        if f is math.sin:
            return math_func("cos")(arg)
        if f is math.cos:
            return -math_func("sin")(arg)
        if f is math.tan:
            # d/dx tan(x) = 1 + tan(x)**2
            return math_func("tan")(arg)**2+1
        if f is math.log:
            return primitives.quotient(1, arg)
        if f is math.exp:
            return math_func("exp")(arg)

    raise RuntimeError("unrecognized function, cannot differentiate")
def emit_call(self, expression_to_code_mapper, expression, target):
    # Emit code for the indexof/indexof_vec pseudo-functions: return the
    # linearized index of a subscripted array access.
    from pymbolic.primitives import Subscript

    if len(expression.parameters) != 1:
        raise LoopyError("%s takes exactly one argument" % self.name)
    arg, = expression.parameters
    if not isinstance(arg, Subscript):
        raise LoopyError(
                "argument to %s must be a subscript" % self.name)

    ary = expression_to_code_mapper.find_array(arg)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate
    access_info = get_access_info(
            expression_to_code_mapper.kernel.target,
            ary, arg.index,
            lambda expr: evaluate(
                expr, expression_to_code_mapper.codegen_state.var_subst_map),
            expression_to_code_mapper.codegen_state.vectorization_info)

    from loopy.kernel.data import ImageArg
    if isinstance(ary, ImageArg):
        raise LoopyError("%s does not support images" % self.name)

    if self.name == "indexof":
        return access_info.subscripts[0]
    elif self.name == "indexof_vec":
        # Find the (last) vector axis, if any, and fold the vector index
        # into the linearized subscript.
        from loopy.kernel.array import VectorArrayDimTag
        ivec = None
        for iaxis, dim_tag in enumerate(ary.dim_tags):
            if isinstance(dim_tag, VectorArrayDimTag):
                ivec = iaxis

        if ivec is None:
            return access_info.subscripts[0]
        else:
            return (
                    access_info.subscripts[0]*ary.shape[ivec]
                    + access_info.vector_index)
    else:
        raise RuntimeError("should not get here")
def map_subscript(self, expr, enclosing_prec, type_context):
    # Emit OpenCL C source for an array subscript, handling images, global
    # args, and temporaries; anything not rooted in a plain Variable falls
    # back to naive aggregate[index] rendering.
    def base_impl(expr, enclosing_prec, type_context):
        return self.parenthesize_if_needed(
                "%s[%s]" % (
                    self.rec(expr.aggregate, PREC_CALL, type_context),
                    self.rec(expr.index, PREC_NONE, 'i')),
                enclosing_prec, PREC_CALL)

    from pymbolic.primitives import Variable
    if not isinstance(expr.aggregate, Variable):
        return base_impl(expr, enclosing_prec, type_context)

    ary = self.find_array(expr)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate

    # Simplify each index with affine arithmetic before linearizing.
    from loopy.symbolic import simplify_using_aff
    index_tuple = tuple(
            simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

    access_info = get_access_info(self.kernel.target, ary, index_tuple,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

    from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

    if isinstance(ary, ImageArg):
        # Images are read via read_imagef with reversed index order.
        base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
                % (ary.name, ary.dimensions,
                    ", ".join(self.rec(idx, PREC_NONE, 'i')
                        for idx in expr.index[::-1])))

        if ary.dtype.numpy_dtype == np.float32:
            # Scalar float lives in the .x component.
            return base_access+".x"
        if self.kernel.target.is_vector_dtype(ary.dtype):
            return base_access
        elif ary.dtype.numpy_dtype == np.float64:
            # Doubles are stored as two floats and reassembled.
            return "as_double(%s.xy)" % base_access
        else:
            raise NotImplementedError(
                    "non-floating-point images not supported for now")

    elif isinstance(ary, (GlobalArg, TemporaryVariable)):
        if len(access_info.subscripts) == 0:
            if isinstance(ary, GlobalArg):
                # unsubscripted global args are pointers
                result = "*" + access_info.array_name
            else:
                # unsubscripted temp vars are scalars
                result = access_info.array_name
        else:
            subscript, = access_info.subscripts
            result = self.parenthesize_if_needed(
                    "%s[%s]" % (
                        access_info.array_name,
                        self.rec(subscript, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

        if access_info.vector_index is not None:
            return self.codegen_state.ast_builder.add_vector_access(
                result, access_info.vector_index)
        else:
            return result

    else:
        # find_array only returns the classes handled above.
        assert False
def test_substitute():
    """substitute() on a parsed attribute lookup: 5 + 25**2 == 630."""
    from pymbolic import parse, substitute, evaluate

    expr = parse("5+x.min**2")
    replacement_map = {parse("x.min"): 25}

    assert evaluate(substitute(expr, replacement_map)) == 630
def evaluate(self, expr):
    """Numerically evaluate *expr*; return None if it is not a constant."""
    from pymbolic import evaluate as evaluate_expr

    try:
        return evaluate_expr(expr)
    except ValueError:
        # Expression could not be reduced to a value (e.g. free variables).
        return None
def as_primitives(self):
    """Rebuild this expression in terms of primitive expressions by
    re-evaluating it with every dependency bound to itself.
    """
    identity_context = {
            dep: dep for dep in pymbolic.get_dependencies(self)}
    return pymbolic.evaluate(self, identity_context)
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    """Allocate and fill reference (known-good) arguments for automatic
    kernel testing.

    Returns ``(ref_args, ref_arg_data)``: a dict of argument values keyed by
    name, and a parallel list of :class:`TestArgInfo` records (``None`` for
    value args) describing each array for later comparison.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            if arg.offset_for_name:
                # Offsets are computed by invocation logic, not supplied here.
                continue

            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                # Coerce plain Python values to the declared numpy dtype.
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError("array '%s' needs known shape to use automatic "
                        "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                # Images are always contiguous; strides do not apply.
                storage_array = ary = cl_array.empty(
                        queue, shape, dtype, order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                from pytools import all
                assert all(s > 0 for s in strides)
                # Smallest allocation covering the strided extent.
                alloc_size = sum(astrd*(alen-1)
                        for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                            "known. Perhaps you want to use "
                            "loopy.add_dtypes "
                            "or loopy.infer_argument_dtypes?"
                            % arg.name)

                itemsize = dtype.itemsize
                numpy_strides = [itemsize*s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                        "automatic testing")

            # Random input data so errors are unlikely to cancel out.
            fill_rand(storage_array)

            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                        queue.context, ary.get())
            else:
                # Keep a pristine pre-run copy so outputs can be compared
                # against the untouched input later.
                pre_run_storage_array = storage_array.copy()

                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(
                        pre_run_storage_array, shape, numpy_strides)
                ref_args[arg.name] = ary

            ref_arg_data.append(
                    TestArgInfo(
                        name=arg.name,
                        ref_array=ary,
                        ref_storage_array=storage_array,
                        ref_pre_run_array=pre_run_ary,
                        ref_pre_run_storage_array=pre_run_storage_array,
                        ref_shape=shape,
                        ref_strides=strides,
                        ref_alloc_size=alloc_size,
                        ref_numpy_strides=numpy_strides,
                        needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return ref_args, ref_arg_data
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    """Allocate test-kernel arguments seeded with the same data as the
    reference arguments produced by ``make_ref_args``.

    Mutates the :class:`TestArgInfo` records in *ref_arg_data* with the
    test-side arrays/shapes/strides, and returns the args dict.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                # Coerce plain Python values to the declared numpy dtype.
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                        "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                    queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is GlobalArg:
            # The test kernel may use a different shape/stride layout than
            # the reference; copy the reference data into the test layout.
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            dtype = kernel_arg.dtype
            itemsize = dtype.itemsize
            numpy_strides = [itemsize*s for s in strides]

            assert all(s > 0 for s in strides)
            # Smallest allocation covering the strided extent.
            alloc_size = sum(astrd*(alen-1)
                    for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                    arg_desc.ref_shape, arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            common_len = min(
                    len(host_ref_flat_array),
                    len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(
                    host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            host_contig_array = arg_desc.ref_storage_array.get()
            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            # Record the test-side layout for the post-run comparison.
            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
def test_compare_cl_and_py_cost_model(ctx_factory, nsources, ntargets, dims,
        dtype):
    """Check that the OpenCL FMM cost model and the pure-Python cost model
    produce identical results for every processing stage, and log the wall
    time each implementation takes.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    # {{{ Generate sources, targets and target_radii

    from boxtree.tools import make_normal_particle_array as p_normal
    sources = p_normal(queue, nsources, dims, dtype, seed=15)
    targets = p_normal(queue, ntargets, dims, dtype, seed=18)

    from pyopencl.clrandom import PhiloxGenerator
    rng = PhiloxGenerator(queue.context, seed=22)
    target_radii = rng.uniform(
        queue, ntargets, a=0, b=0.05, dtype=dtype
    ).get()

    # }}}

    # {{{ Generate tree and traversal

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)
    tree, _ = tb(
        queue, sources, targets=targets, target_radii=target_radii,
        stick_out_factor=0.15, max_particles_in_box=30, debug=True
    )

    from boxtree.traversal import FMMTraversalBuilder
    tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2)
    trav_dev, _ = tg(queue, tree, debug=True)
    trav = trav_dev.get(queue=queue)

    # }}}

    # {{{ Construct cost models

    cl_cost_model = FMMCostModel(None)
    python_cost_model = _PythonFMMCostModel(None)

    # Fixed per-level expansion orders so the two models see identical input.
    constant_one_params = cl_cost_model.get_unit_calibration_params().copy()
    for ilevel in range(trav.tree.nlevels):
        constant_one_params["p_fmm_lev%d" % ilevel] = 10

    xlat_cost = make_pde_aware_translation_cost_model(dims, trav.tree.nlevels)

    # }}}

    # {{{ Test process_form_multipoles

    nlevels = trav.tree.nlevels
    p2m_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        p2m_cost[ilevel] = evaluate(
            xlat_cost.p2m(ilevel), context=constant_one_params
        )
    p2m_cost_dev = cl.array.to_device(queue, p2m_cost)

    queue.finish()
    start_time = time.time()

    cl_form_multipoles = cl_cost_model.process_form_multipoles(
        queue, trav_dev, p2m_cost_dev
    )

    queue.finish()
    logger.info("OpenCL time for process_form_multipoles: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_form_multipoles = python_cost_model.process_form_multipoles(
        queue, trav, p2m_cost
    )

    logger.info("Python time for process_form_multipoles: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_form_multipoles.get(), python_form_multipoles)

    # }}}

    # {{{ Test process_coarsen_multipoles

    m2m_cost = np.zeros(nlevels - 1, dtype=np.float64)
    for target_level in range(nlevels - 1):
        m2m_cost[target_level] = evaluate(
            xlat_cost.m2m(target_level + 1, target_level),
            context=constant_one_params
        )
    m2m_cost_dev = cl.array.to_device(queue, m2m_cost)

    queue.finish()
    start_time = time.time()
    cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles(
        queue, trav_dev, m2m_cost_dev
    )

    queue.finish()
    logger.info("OpenCL time for coarsen_multipoles: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles(
        queue, trav, m2m_cost
    )

    logger.info("Python time for coarsen_multipoles: {0}".format(
        str(time.time() - start_time)
    ))

    assert cl_coarsen_multipoles == python_coarsen_multipoles

    # }}}

    # {{{ Test process_direct

    queue.finish()
    start_time = time.time()

    cl_ndirect_sources_per_target_box = \
        cl_cost_model.get_ndirect_sources_per_target_box(queue, trav_dev)

    cl_direct = cl_cost_model.process_direct(
        queue, trav_dev, cl_ndirect_sources_per_target_box, 5.0
    )

    queue.finish()
    logger.info("OpenCL time for process_direct: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_ndirect_sources_per_target_box = \
        python_cost_model.get_ndirect_sources_per_target_box(queue, trav)

    python_direct = python_cost_model.process_direct(
        queue, trav, python_ndirect_sources_per_target_box, 5.0
    )

    logger.info("Python time for process_direct: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_direct.get(), python_direct)

    # }}}

    # {{{ Test aggregate_over_boxes

    start_time = time.time()

    cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(cl_direct)

    queue.finish()
    logger.info("OpenCL time for aggregate_over_boxes: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_direct_aggregate = python_cost_model.aggregate_over_boxes(
        python_direct)

    logger.info("Python time for aggregate_over_boxes: {0}".format(
        str(time.time() - start_time)
    ))

    assert cl_direct_aggregate == python_direct_aggregate

    # }}}

    # {{{ Test process_list2

    nlevels = trav.tree.nlevels
    m2l_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        m2l_cost[ilevel] = evaluate(
            xlat_cost.m2l(ilevel, ilevel),
            context=constant_one_params
        )
    m2l_cost_dev = cl.array.to_device(queue, m2l_cost)

    queue.finish()
    start_time = time.time()

    cl_m2l_cost = cl_cost_model.process_list2(queue, trav_dev, m2l_cost_dev)

    queue.finish()
    logger.info("OpenCL time for process_list2: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_m2l_cost = python_cost_model.process_list2(queue, trav, m2l_cost)
    logger.info("Python time for process_list2: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_m2l_cost.get(), python_m2l_cost)

    # }}}

    # {{{ Test process_list 3

    m2p_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        m2p_cost[ilevel] = evaluate(
            xlat_cost.m2p(ilevel),
            context=constant_one_params
        )
    m2p_cost_dev = cl.array.to_device(queue, m2p_cost)

    queue.finish()
    start_time = time.time()

    cl_m2p_cost = cl_cost_model.process_list3(queue, trav_dev, m2p_cost_dev)

    queue.finish()
    logger.info("OpenCL time for process_list3: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_m2p_cost = python_cost_model.process_list3(queue, trav, m2p_cost)
    logger.info("Python time for process_list3: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_m2p_cost.get(), python_m2p_cost)

    # }}}

    # {{{ Test process_list4

    p2l_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        p2l_cost[ilevel] = evaluate(
            xlat_cost.p2l(ilevel),
            context=constant_one_params
        )
    p2l_cost_dev = cl.array.to_device(queue, p2l_cost)

    queue.finish()
    start_time = time.time()

    cl_p2l_cost = cl_cost_model.process_list4(queue, trav_dev, p2l_cost_dev)

    queue.finish()
    logger.info("OpenCL time for process_list4: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_p2l_cost = python_cost_model.process_list4(queue, trav, p2l_cost)
    logger.info("Python time for process_list4: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_p2l_cost.get(), python_p2l_cost)

    # }}}

    # {{{ Test process_refine_locals

    l2l_cost = np.zeros(nlevels - 1, dtype=np.float64)
    for ilevel in range(nlevels - 1):
        l2l_cost[ilevel] = evaluate(
            xlat_cost.l2l(ilevel, ilevel + 1),
            context=constant_one_params
        )
    l2l_cost_dev = cl.array.to_device(queue, l2l_cost)

    queue.finish()
    start_time = time.time()

    cl_refine_locals_cost = cl_cost_model.process_refine_locals(
        queue, trav_dev, l2l_cost_dev
    )

    queue.finish()
    logger.info("OpenCL time for refine_locals: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_refine_locals_cost = python_cost_model.process_refine_locals(
        queue, trav, l2l_cost
    )
    logger.info("Python time for refine_locals: {0}".format(
        str(time.time() - start_time)
    ))

    assert cl_refine_locals_cost == python_refine_locals_cost

    # }}}

    # {{{ Test process_eval_locals

    l2p_cost = np.zeros(nlevels, dtype=np.float64)
    for ilevel in range(nlevels):
        l2p_cost[ilevel] = evaluate(
            xlat_cost.l2p(ilevel), context=constant_one_params
        )
    l2p_cost_dev = cl.array.to_device(queue, l2p_cost)

    queue.finish()
    start_time = time.time()

    cl_l2p_cost = cl_cost_model.process_eval_locals(queue, trav_dev,
        l2p_cost_dev)

    queue.finish()
    logger.info("OpenCL time for process_eval_locals: {0}".format(
        str(time.time() - start_time)
    ))

    start_time = time.time()

    python_l2p_cost = python_cost_model.process_eval_locals(queue, trav,
        l2p_cost)
    logger.info("Python time for process_eval_locals: {0}".format(
        str(time.time() - start_time)
    ))

    assert np.array_equal(cl_l2p_cost.get(), python_l2p_cost)

    # }}}
def map_call(self, expr, enclosing_prec, type_context):
    """Generate C-family source text for a function-call expression.

    Handles the special loopy builtins ``indexof``/``indexof_vec`` (which
    return the linearized index of an array access) inline; every other
    call is dispatched through the kernel's function manglers.

    :arg expr: a :class:`pymbolic.primitives.Call`.
    :arg enclosing_prec: precedence of the enclosing expression (unused on
        the success path here; the result is a complete call expression).
    :arg type_context: type-context character passed down from the caller.
    :returns: a string of generated code.
    :raises LoopyError: for misuse of ``indexof``/``indexof_vec`` or a
        mangler result with other than one return value.
    :raises RuntimeError: if no mangler knows the function.
    """
    from pymbolic.primitives import Variable, Subscript
    from pymbolic.mapper.stringifier import PREC_NONE

    identifier = expr.function

    # {{{ implement indexof, indexof_vec

    if identifier.name in ["indexof", "indexof_vec"]:
        if len(expr.parameters) != 1:
            raise LoopyError("%s takes exactly one argument" % identifier.name)
        arg, = expr.parameters
        if not isinstance(arg, Subscript):
            raise LoopyError(
                    "argument to %s must be a subscript" % identifier.name)

        ary = self.find_array(arg)

        # Resolve the subscript to a linearized index expression, using the
        # current variable substitution map for any symbolic sizes.
        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate
        access_info = get_access_info(self.kernel.target, ary, arg.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg
        if isinstance(ary, ImageArg):
            raise LoopyError("%s does not support images" % identifier.name)

        if identifier.name == "indexof":
            return access_info.subscripts[0]
        elif identifier.name == "indexof_vec":
            # Fold the vector component back into the linear index.
            from loopy.kernel.array import VectorArrayDimTag
            ivec = None
            for iaxis, dim_tag in enumerate(ary.dim_tags):
                if isinstance(dim_tag, VectorArrayDimTag):
                    ivec = iaxis

            if ivec is None:
                return access_info.subscripts[0]
            else:
                return (
                        access_info.subscripts[0]*ary.shape[ivec]
                        + access_info.vector_index)

        else:
            raise RuntimeError("should not get here")

    # }}}

    if isinstance(identifier, Variable):
        identifier = identifier.name

    par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

    str_parameters = None

    mangle_result = self.kernel.mangle_function(
            identifier, par_dtypes,
            ast_builder=self.codegen_state.ast_builder)

    if mangle_result is None:
        raise RuntimeError("function '%s' unknown--"
                "maybe you need to register a function mangler?"
                % identifier)

    if len(mangle_result.result_dtypes) != 1:
        raise LoopyError("functions with more or fewer than one return value "
                "may not be used in an expression")

    if mangle_result.arg_dtypes is not None:
        # Mangler declared target argument dtypes: render each argument in
        # the type context of its declared target dtype.
        str_parameters = [
                self.rec(par, PREC_NONE,
                    dtype_to_type_context(self.kernel.target, tgt_dtype),
                    tgt_dtype)
                for par, par_dtype, tgt_dtype in zip(
                    expr.parameters, par_dtypes, mangle_result.arg_dtypes)]
    else:
        # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
        # propagate the type context here. But for many others, it does
        # not. Using the inferred type as a stopgap for now.
        str_parameters = [
                self.rec(par, PREC_NONE,
                    type_context=dtype_to_type_context(
                        self.kernel.target, par_dtype))
                for par, par_dtype in zip(expr.parameters, par_dtypes)]

        from warnings import warn
        warn("Calling function '%s' with unknown C signature--"
                "return CallMangleInfo.arg_dtypes"
                % identifier, LoopyWarning)

    # Record the call so the code-generation driver can emit any needed
    # declarations/preambles for it.
    from loopy.codegen import SeenFunction
    self.codegen_state.seen_functions.add(
            SeenFunction(identifier,
                mangle_result.target_name,
                mangle_result.arg_dtypes or par_dtypes))

    return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
def map_subscript(self, expr, type_context):
    """Map an array subscript to a target-specific access expression.

    Returns pymbolic expression nodes (not strings): image arguments become
    ``read_imagef`` calls, array arguments and temporaries become
    (possibly vector-component) subscripted accesses into the
    implementation array.

    :arg expr: a :class:`pymbolic.primitives.Subscript`.
    :arg type_context: type-context character, forwarded on fallback.
    """
    def base_impl(expr, type_context):
        # Fallback for aggregates that are not plain variables.
        return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')]

    def make_var(name):
        # Preserve a TaggedVariable's tag on the renamed access.
        from loopy import TaggedVariable
        if isinstance(expr.aggregate, TaggedVariable):
            return TaggedVariable(name, expr.aggregate.tag)
        else:
            return var(name)

    from pymbolic.primitives import Variable
    if not isinstance(expr.aggregate, Variable):
        return base_impl(expr, type_context)

    ary = self.find_array(expr)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate

    # Simplify each index with affine arithmetic before computing the
    # access, so equivalent indices linearize identically.
    from loopy.symbolic import simplify_using_aff
    index_tuple = tuple(
            simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

    access_info = get_access_info(self.kernel.target, ary, index_tuple,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

    from loopy.kernel.data import (
            ImageArg, ArrayArg, TemporaryVariable, ConstantArg)

    if isinstance(ary, ImageArg):
        # Images are read through a sampler with a float2/float4 index
        # vector; unused axes are padded with zeros.
        extra_axes = 0

        num_target_axes = ary.num_target_axes()
        if num_target_axes in [1, 2]:
            idx_vec_type = "float2"
            extra_axes = 2-num_target_axes
        elif num_target_axes == 3:
            idx_vec_type = "float4"
            extra_axes = 4-num_target_axes
        else:
            raise LoopyError("unsupported number (%d) of target axes in image"
                    % num_target_axes)

        # Index order is reversed for image accesses.
        idx_tuple = expr.index_tuple[::-1] + (0,) * extra_axes

        base_access = var("read_imagef")(
                var(ary.name),
                var("loopy_sampler"),
                var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i')))

        if ary.dtype.numpy_dtype == np.float32:
            return base_access.attr("x")
        if self.kernel.target.is_vector_dtype(ary.dtype):
            return base_access
        elif ary.dtype.numpy_dtype == np.float64:
            # float64 images are read as two packed float32 components.
            return var("as_double")(base_access.attr("xy"))
        else:
            raise NotImplementedError(
                    "non-floating-point images not supported for now")

    elif isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)):
        if len(access_info.subscripts) == 0:
            if (
                    (isinstance(ary, (ConstantArg, ArrayArg)) or
                     (isinstance(ary, TemporaryVariable) and ary.base_storage))):
                # unsubscripted global args are pointers
                result = make_var(access_info.array_name)[0]
            else:
                # unsubscripted temp vars are scalars
                # (unless they use base_storage)
                result = make_var(access_info.array_name)

        else:
            subscript, = access_info.subscripts
            result = make_var(access_info.array_name)[simplify_using_aff(
                self.kernel, self.rec(subscript, 'i'))]

        if access_info.vector_index is not None:
            # Defer vector-component selection to the target's AST builder.
            return self.codegen_state.ast_builder.add_vector_access(
                    result, access_info.vector_index)
        else:
            return result

    else:
        assert False
def get_auto_axis_iname_ranking_by_stride(kernel, insn):
    """Rank *insn*'s auto-local-axis inames by aggregate access stride.

    For every global array access made by *insn*, the stride contributed by
    each auto-local-axis iname is evaluated (using approximate values for
    value arguments) and summed per iname. The result is used to decide
    which iname should become local axis 0 (smallest stride first).

    :returns: a deterministically sorted list of inames (smallest aggregate
        stride first, ties broken by iname name), or *None* if no stride
        information was gathered.
    :raises LoopyError: if a :class:`ValueArg` has no approximate value.
    """
    from loopy.kernel.data import ImageArg, ValueArg

    approximate_arg_values = {}
    for arg in kernel.args:
        if isinstance(arg, ValueArg):
            if arg.approximately is not None:
                approximate_arg_values[arg.name] = arg.approximately
            else:
                raise LoopyError(
                        "No approximate arg value specified for '%s'"
                        % arg.name)

    # {{{ find all array accesses in insn

    from loopy.symbolic import ArrayAccessFinder
    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))

    from pymbolic.primitives import Subscript
    for assignee in insn.assignees:
        if isinstance(assignee, Subscript):
            ary_acc_exprs.append(assignee)

    # }}}

    # {{{ filter array accesses to only the global ones

    global_ary_acc_exprs = []

    for aae in ary_acc_exprs:
        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)
        if arg is None:
            # not a kernel argument (e.g. a temporary): skip
            continue

        if isinstance(arg, ImageArg):
            # images have no linear stride structure
            continue

        global_ary_acc_exprs.append(aae)

    # }}}

    # {{{ figure out automatic-axis inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    auto_axis_inames = set(
            iname for iname in kernel.insn_inames(insn)
            if isinstance(kernel.iname_to_tag.get(iname),
                AutoLocalIndexTagBase))

    # }}}

    # {{{ figure out which iname should get mapped to local axis 0

    # maps inames to "aggregate stride"
    aggregate_strides = {}

    from loopy.symbolic import CoefficientCollector
    from pymbolic.primitives import Variable

    for aae in global_ary_acc_exprs:
        index_expr = aae.index
        if not isinstance(index_expr, tuple):
            index_expr = (index_expr, )

        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)

        if arg.dim_tags is None:
            from warnings import warn
            warn("Strides for '%s' are not known. Local axis assignment "
                    "is likely suboptimal." % arg.name)
            ary_strides = [1] * len(index_expr)
        else:
            ary_strides = []
            from loopy.kernel.array import FixedStrideArrayDimTag
            for dim_tag in arg.dim_tags:
                if isinstance(dim_tag, FixedStrideArrayDimTag):
                    ary_strides.append(dim_tag.stride)

        # {{{ construct iname_to_stride_expr

        # Per iname, keep the smallest stride contribution seen for this
        # access (coefficient of the iname times the axis stride).
        iname_to_stride_expr = {}
        for iexpr_i, stride in zip(index_expr, ary_strides):
            if stride is None:
                continue
            coeffs = CoefficientCollector()(iexpr_i)
            for var, coeff in six.iteritems(coeffs):
                if (isinstance(var, Variable)
                        and var.name in auto_axis_inames):
                    # excludes '1', i.e. the constant
                    new_stride = coeff * stride
                    old_stride = iname_to_stride_expr.get(var.name, None)
                    if old_stride is None or new_stride < old_stride:
                        iname_to_stride_expr[var.name] = new_stride

        # }}}

        # Evaluate the symbolic strides numerically and accumulate.
        from pymbolic import evaluate
        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
            stride = evaluate(stride_expr, approximate_arg_values)
            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride

    if aggregate_strides:
        very_large_stride = int(np.iinfo(np.int32).max)

        return sorted((iname for iname in kernel.insn_inames(insn)),
                key=lambda iname: (aggregate_strides.get(iname,
                    very_large_stride), iname))
    else:
        return None

    # }}}
def map_subscript(self, expr, enclosing_prec, type_context):
    """Generate OpenCL source text for an array subscript.

    String-emitting variant: produces ``read_imagef(...)`` for image
    arguments and ``name[linear_index]`` (with an optional vector-member
    suffix like ``.x``) for global arguments and temporaries.

    :arg enclosing_prec: precedence of the enclosing expression, used for
        parenthesization of the resulting access.
    :raises RuntimeError: if the subscripted name is unknown or not an
        array.
    """
    def base_impl(expr, enclosing_prec, type_context):
        # Fallback for aggregates that are not plain variables.
        return self.parenthesize_if_needed(
                "%s[%s]" % (
                    self.rec(expr.aggregate, PREC_CALL, type_context),
                    self.rec(expr.index, PREC_NONE, 'i')),
                enclosing_prec, PREC_CALL)

    from pymbolic.primitives import Variable
    if not isinstance(expr.aggregate, Variable):
        return base_impl(expr, enclosing_prec, type_context)

    # Look up the array descriptor among args, then temporaries.
    if expr.aggregate.name in self.kernel.arg_dict:
        ary = self.kernel.arg_dict[expr.aggregate.name]
    elif expr.aggregate.name in self.kernel.temporary_variables:
        ary = self.kernel.temporary_variables[expr.aggregate.name]
    else:
        raise RuntimeError("nothing known about subscripted variable '%s'"
                % expr.aggregate.name)

    from loopy.kernel.array import ArrayBase
    if not isinstance(ary, ArrayBase):
        raise RuntimeError("subscripted variable '%s' is not an array"
                % expr.aggregate.name)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate

    access_info = get_access_info(self.kernel.target, ary, expr.index,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

    # e.g. "x" for component 0 of a vector type, or None
    vec_member = get_opencl_vec_member(access_info.vector_index)

    from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

    if isinstance(ary, ImageArg):
        # index order is reversed for image reads
        base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
                % (ary.name, ary.dimensions,
                    ", ".join(self.rec(idx, PREC_NONE, 'i')
                        for idx in expr.index[::-1])))

        if ary.dtype == np.float32:
            return base_access+".x"
        if self.kernel.target.is_vector_dtype(ary.dtype):
            return base_access
        elif ary.dtype == np.float64:
            # float64 images are read as two packed float32 components
            return "as_double(%s.xy)" % base_access
        else:
            raise NotImplementedError(
                    "non-floating-point images not supported for now")

    elif isinstance(ary, (GlobalArg, TemporaryVariable)):
        if len(access_info.subscripts) == 0:
            if isinstance(ary, GlobalArg):
                # unsubscripted global args are pointers
                if vec_member is not None:
                    return "%s->%s" % (
                            access_info.array_name, vec_member)
                else:
                    return "*" + access_info.array_name

            else:
                # unsubscripted temp vars are scalars
                if vec_member is not None:
                    return "%s.%s" % (
                            access_info.array_name, vec_member)
                else:
                    return access_info.array_name

        else:
            subscript, = access_info.subscripts
            result = self.parenthesize_if_needed(
                    "%s[%s]" % (
                        access_info.array_name,
                        self.rec(subscript, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

            if vec_member:
                result += "."+vec_member

            return result

    else:
        assert False
def map_call(self, expr, enclosing_prec, type_context):
    """Generate source text for a function-call expression.

    Variant of the mangler-based call mapper: handles the loopy builtins
    ``indexof``/``indexof_vec`` inline, then dispatches all other calls
    through :meth:`kernel.mangle_function`.

    :raises LoopyError: for misuse of ``indexof``/``indexof_vec`` or a
        mangler result with other than one return value.
    :raises RuntimeError: if no mangler knows the function.
    """
    from pymbolic.primitives import Variable, Subscript
    from pymbolic.mapper.stringifier import PREC_NONE

    identifier = expr.function

    # {{{ implement indexof, indexof_vec

    if identifier.name in ["indexof", "indexof_vec"]:
        if len(expr.parameters) != 1:
            raise LoopyError("%s takes exactly one argument" % identifier.name)
        arg, = expr.parameters
        if not isinstance(arg, Subscript):
            raise LoopyError("argument to %s must be a subscript"
                    % identifier.name)

        ary = self.find_array(arg)

        # Linearize the subscript under the current substitution map.
        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate
        access_info = get_access_info(
                self.kernel.target, ary, arg.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg
        if isinstance(ary, ImageArg):
            raise LoopyError("%s does not support images" % identifier.name)

        if identifier.name == "indexof":
            return access_info.subscripts[0]
        elif identifier.name == "indexof_vec":
            # Fold the vector component back into the linear index.
            from loopy.kernel.array import VectorArrayDimTag
            ivec = None
            for iaxis, dim_tag in enumerate(ary.dim_tags):
                if isinstance(dim_tag, VectorArrayDimTag):
                    ivec = iaxis

            if ivec is None:
                return access_info.subscripts[0]
            else:
                return (access_info.subscripts[0] * ary.shape[ivec]
                        + access_info.vector_index)

        else:
            raise RuntimeError("should not get here")

    # }}}

    if isinstance(identifier, Variable):
        identifier = identifier.name

    par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

    str_parameters = None

    mangle_result = self.kernel.mangle_function(
            identifier, par_dtypes,
            ast_builder=self.codegen_state.ast_builder)

    if mangle_result is None:
        raise RuntimeError(
                "function '%s' unknown--"
                "maybe you need to register a function mangler?"
                % identifier)

    if len(mangle_result.result_dtypes) != 1:
        raise LoopyError(
                "functions with more or fewer than one return value "
                "may not be used in an expression")

    if mangle_result.arg_dtypes is not None:
        # Mangler declared target argument dtypes: render each argument in
        # the type context of its declared target dtype.
        str_parameters = [
                self.rec(par, PREC_NONE,
                    dtype_to_type_context(self.kernel.target, tgt_dtype),
                    tgt_dtype)
                for par, par_dtype, tgt_dtype in zip(expr.parameters,
                    par_dtypes, mangle_result.arg_dtypes)
                ]
    else:
        # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
        # propagate the type context here. But for many others, it does
        # not. Using the inferred type as a stopgap for now.
        str_parameters = [
                self.rec(par, PREC_NONE,
                    type_context=dtype_to_type_context(
                        self.kernel.target, par_dtype))
                for par, par_dtype in zip(expr.parameters, par_dtypes)
                ]

        from warnings import warn
        warn(
                "Calling function '%s' with unknown C signature--"
                "return CallMangleInfo.arg_dtypes"
                % identifier, LoopyWarning)

    # Record the call so declarations/preambles can be emitted for it.
    from loopy.codegen import SeenFunction
    self.codegen_state.seen_functions.add(
            SeenFunction(identifier,
                mangle_result.target_name,
                mangle_result.arg_dtypes or par_dtypes))

    return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
def get_auto_axis_iname_ranking_by_stride(kernel, insn):
    """Rank *insn*'s auto-local-axis inames by aggregate access stride.

    For every global array access made by *insn*, the stride contributed by
    each auto-local-axis iname is evaluated (using approximate values for
    value arguments) and summed per iname. The caller uses the ranking to
    pick which iname becomes local axis 0 (smallest stride first).

    :returns: a deterministically sorted list of inames (smallest aggregate
        stride first, ties broken by iname name), or *None* if no stride
        information was gathered.
    :raises LoopyError: if a :class:`ValueArg` has no approximate value.

    .. note::

        Fixed vs. the previous revision: the sort key now includes the
        iname name as a tiebreaker (matching the sibling implementation of
        this function elsewhere in this file), so equal aggregate strides
        no longer yield an unstable, nondeterministic ranking; the sentinel
        stride is also converted to a plain :class:`int` for consistency.
    """
    from loopy.kernel.data import ImageArg, ValueArg

    # Approximate values are required to numerically evaluate the symbolic
    # stride expressions below.
    approximate_arg_values = {}
    for arg in kernel.args:
        if isinstance(arg, ValueArg):
            if arg.approximately is not None:
                approximate_arg_values[arg.name] = arg.approximately
            else:
                raise LoopyError("No approximate arg value specified for '%s'"
                        % arg.name)

    # {{{ find all array accesses in insn

    from loopy.symbolic import ArrayAccessFinder
    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))

    from pymbolic.primitives import Subscript
    if isinstance(insn.assignee, Subscript):
        ary_acc_exprs.append(insn.assignee)

    # }}}

    # {{{ filter array accesses to only the global ones

    global_ary_acc_exprs = []

    for aae in ary_acc_exprs:
        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)
        if arg is None:
            # not a kernel argument (e.g. a temporary): skip
            continue

        if isinstance(arg, ImageArg):
            # images have no linear stride structure
            continue

        global_ary_acc_exprs.append(aae)

    # }}}

    # {{{ figure out automatic-axis inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    auto_axis_inames = set(
            iname for iname in kernel.insn_inames(insn)
            if isinstance(kernel.iname_to_tag.get(iname),
                AutoLocalIndexTagBase))

    # }}}

    # {{{ figure out which iname should get mapped to local axis 0

    # maps inames to "aggregate stride"
    aggregate_strides = {}

    from loopy.symbolic import CoefficientCollector
    from pymbolic.primitives import Variable

    for aae in global_ary_acc_exprs:
        index_expr = aae.index
        if not isinstance(index_expr, tuple):
            index_expr = (index_expr,)

        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)

        if arg.dim_tags is None:
            from warnings import warn
            warn("Strides for '%s' are not known. Local axis assignment "
                    "is likely suboptimal." % arg.name)
            ary_strides = [1] * len(index_expr)
        else:
            ary_strides = []
            from loopy.kernel.array import FixedStrideArrayDimTag
            for dim_tag in arg.dim_tags:
                if isinstance(dim_tag, FixedStrideArrayDimTag):
                    ary_strides.append(dim_tag.stride)

        # {{{ construct iname_to_stride_expr

        # Per iname, keep the smallest stride contribution seen for this
        # access (coefficient of the iname times the axis stride).
        iname_to_stride_expr = {}
        for iexpr_i, stride in zip(index_expr, ary_strides):
            if stride is None:
                continue
            coeffs = CoefficientCollector()(iexpr_i)
            for var, coeff in six.iteritems(coeffs):
                if (isinstance(var, Variable)
                        and var.name in auto_axis_inames):
                    # excludes '1', i.e. the constant
                    new_stride = coeff*stride
                    old_stride = iname_to_stride_expr.get(var.name, None)
                    if old_stride is None or new_stride < old_stride:
                        iname_to_stride_expr[var.name] = new_stride

        # }}}

        # Evaluate the symbolic strides numerically and accumulate.
        from pymbolic import evaluate
        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
            stride = evaluate(stride_expr, approximate_arg_values)
            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride

    if aggregate_strides:
        # Inames without stride information sort last.
        very_large_stride = int(np.iinfo(np.int32).max)

        return sorted((iname for iname in kernel.insn_inames(insn)),
                key=lambda iname: (
                    aggregate_strides.get(iname, very_large_stride),
                    iname))
    else:
        return None

    # }}}
def _get_expr_dep_data(self, parsed):
    """Analyze the dependencies of a parsed watch/log expression.

    For each quantity the expression references, determine the aggregation
    function to apply across ranks (``loc``, ``min``, ``max``, ``avg``,
    ``sum``, ``norm2``, an integer subscript, or the quantity's default),
    and rename each dependency to a generated ``logvar%d`` placeholder in
    the returned expression.

    :arg parsed: a pymbolic expression tree.
    :returns: a tuple ``(parsed, dep_data)`` where *parsed* has each
        dependency substituted by its placeholder variable and *dep_data*
        is a list of records describing each dependency.
    :raises ValueError: for an unknown rank aggregator, or a quantity
        without a default aggregator in parallel mode.
    """
    class Nth:
        # Callable picking the n-th element of a per-rank value list.
        def __init__(self, n):
            self.n = n

        def __call__(self, lst):
            return lst[self.n]

    from pymbolic.mapper.dependency import DependencyMapper
    deps = DependencyMapper(include_calls=False)(parsed)

    # gather information on aggregation expressions
    dep_data = []
    from pymbolic.primitives import Variable, Lookup, Subscript
    for dep_idx, dep in enumerate(deps):
        nonlocal_agg = True

        if isinstance(dep, Variable):
            # Bare quantity name: use its default aggregator.
            name = dep.name

            if name == "math":
                # 'math' is available in expressions, not a quantity
                continue

            agg_func = self.quantity_data[name].default_aggregator
            if agg_func is None:
                if self.is_parallel:
                    raise ValueError(
                            "must specify explicit aggregator for '%s'" % name)

                agg_func = lambda lst: lst[0]
        elif isinstance(dep, Lookup):
            # quantity.aggregator syntax, e.g. 't_step.max'
            assert isinstance(dep.aggregate, Variable)
            name = dep.aggregate.name
            agg_name = dep.name

            if agg_name == "loc":
                # value local to this rank: no cross-rank aggregation
                agg_func = Nth(self.rank)
                nonlocal_agg = False
            elif agg_name == "min":
                agg_func = min
            elif agg_name == "max":
                agg_func = max
            elif agg_name == "avg":
                from pytools import average
                agg_func = average
            elif agg_name == "sum":
                agg_func = sum
            elif agg_name == "norm2":
                from math import sqrt
                agg_func = lambda iterable: sqrt(
                        sum(entry**2 for entry in iterable))
            else:
                raise ValueError("invalid rank aggregator '%s'" % agg_name)
        elif isinstance(dep, Subscript):
            # quantity[i] syntax: pick the value from rank i
            assert isinstance(dep.aggregate, Variable)
            name = dep.aggregate.name

            from pymbolic import evaluate
            agg_func = Nth(evaluate(dep.index))

        qdat = self.quantity_data[name]

        from pytools import Record

        class DependencyData(Record):
            pass

        this_dep_data = DependencyData(name=name, qdat=qdat,
                agg_func=agg_func, varname="logvar%d" % dep_idx,
                expr=dep, nonlocal_agg=nonlocal_agg)
        dep_data.append(this_dep_data)

    # substitute in the "logvar" variable names
    from pymbolic import var, substitute
    parsed = substitute(parsed,
            dict((dd.expr, var(dd.varname)) for dd in dep_data))

    return parsed, dep_data
def map_call(self, expr, enclosing_prec, type_context):
    """Generate C source text for a function-call expression.

    Older tuple-based mangler variant: ``mangle_function`` may return a
    2-tuple ``(result_dtype, c_name)`` or a 3-tuple additionally carrying
    the target argument dtypes. Also implements the loopy builtins
    ``indexof``/``indexof_vec`` inline.

    :raises LoopyError: for misuse of ``indexof``/``indexof_vec``.
    :raises RuntimeError: for an un-understood mangler result or an
        unresolvable C name.
    """
    from pymbolic.primitives import Variable, Subscript
    from pymbolic.mapper.stringifier import PREC_NONE

    identifier = expr.function

    # {{{ implement indexof, indexof_vec

    if identifier.name in ["indexof", "indexof_vec"]:
        if len(expr.parameters) != 1:
            raise LoopyError("%s takes exactly one argument" % identifier.name)
        arg, = expr.parameters
        if not isinstance(arg, Subscript):
            raise LoopyError("argument to %s must be a subscript"
                    % identifier.name)

        ary = self.find_array(arg)

        # Linearize the subscript under the current substitution map.
        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate
        access_info = get_access_info(
                self.kernel.target, ary, arg.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info,
                )

        from loopy.kernel.data import ImageArg
        if isinstance(ary, ImageArg):
            raise LoopyError("%s does not support images" % identifier.name)

        if identifier.name == "indexof":
            return access_info.subscripts[0]
        elif identifier.name == "indexof_vec":
            # Fold the vector component back into the linear index.
            from loopy.kernel.array import VectorArrayDimTag
            ivec = None
            for iaxis, dim_tag in enumerate(ary.dim_tags):
                if isinstance(dim_tag, VectorArrayDimTag):
                    ivec = iaxis

            if ivec is None:
                return access_info.subscripts[0]
            else:
                return access_info.subscripts[0] * ary.shape[ivec] + access_info.vector_index

        else:
            raise RuntimeError("should not get here")

    # }}}

    c_name = None
    if isinstance(identifier, Variable):
        identifier = identifier.name
        c_name = identifier

    par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

    str_parameters = None

    mangle_result = self.kernel.mangle_function(identifier, par_dtypes)
    if mangle_result is not None:
        if len(mangle_result) == 2:
            result_dtype, c_name = mangle_result
        elif len(mangle_result) == 3:
            result_dtype, c_name, arg_tgt_dtypes = mangle_result

            # Target dtypes declared: render each argument in the type
            # context of its declared target dtype.
            str_parameters = [
                    self.rec(par, PREC_NONE,
                        dtype_to_type_context(self.kernel.target, tgt_dtype),
                        tgt_dtype)
                    for par, par_dtype, tgt_dtype in zip(expr.parameters,
                        par_dtypes, arg_tgt_dtypes)
                    ]
        else:
            raise RuntimeError("result of function mangler "
                    "for function '%s' not understood"
                    % identifier)

    # Record the call so declarations/preambles can be emitted for it.
    from loopy.codegen import SeenFunction
    self.codegen_state.seen_functions.add(
            SeenFunction(identifier, c_name, par_dtypes))

    if str_parameters is None:
        # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
        # propagate the type context here. But for many others, it does
        # not. Using the inferred type as a stopgap for now.
        str_parameters = [
                self.rec(par, PREC_NONE,
                    type_context=dtype_to_type_context(
                        self.kernel.target, par_dtype))
                for par, par_dtype in zip(expr.parameters, par_dtypes)
                ]

    if c_name is None:
        raise RuntimeError("unable to find C name for function identifier '%s'"
                % identifier)

    return "%s(%s)" % (c_name, ", ".join(str_parameters))
def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed):
    """Fuzz-test loopy's expression code generation.

    Draws random expressions (via the module-level
    ``generate_random_fuzz_examples``), evaluates each with pymbolic as a
    reference, builds one loopy kernel containing all surviving examples,
    runs it, and compares results within a relative tolerance of 1e-10.

    :arg expr_type: one of ``"real"``, ``"complex"``, ``"int"``,
        ``"int_nonneg"``; controls dtypes and bounds checking.
    :arg random_seed: seed for Python's :mod:`random`.
    """
    from pymbolic import evaluate

    def get_numpy_type(x):
        # Map a reference value to the numpy dtype used in the kernel.
        if expr_type in ["real", "complex"]:
            if isinstance(x, (complex, np.complexfloating)):
                return np.complex128
            else:
                return np.float64

        elif expr_type in ["int", "int_nonneg"]:
            return np.int64

        else:
            raise ValueError("unknown expr_type: %s" % expr_type)

    from random import seed

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    seed(random_seed)

    data = []
    instructions = []

    ref_values = {}

    if expr_type in ["real", "complex"]:
        result_type = np.complex128
    elif expr_type in ["int", "int_nonneg"]:
        result_type = np.int64
    else:
        assert False

    var_names = []

    fuzz_iter = iter(generate_random_fuzz_examples(expr_type))
    count = 0

    # Collect exactly 10 usable examples; skip ones that overflow the
    # int32 bounds check or divide by zero during reference evaluation.
    while True:
        if count == 10:
            break

        i, expr, var_values = next(fuzz_iter)

        var_name = "expr%d" % i

        print(expr)
        #assert_parse_roundtrip(expr)

        if expr_type in ["int", "int_nonneg"]:
            result_type_iinfo = np.iinfo(np.int32)
            bceval_mapper = BoundsCheckingEvaluationMapper(
                    var_values,
                    lbound=result_type_iinfo.min,
                    ubound=result_type_iinfo.max)
            print(expr)
            try:
                ref_values[var_name] = bceval_mapper(expr)
            except BoundsCheckError:
                print(expr)
                print("BOUNDS CHECK FAILED")
                continue
        else:
            try:
                ref_values[var_name] = evaluate(expr, var_values)
            except ZeroDivisionError:
                continue

        count += 1

        # One output arg plus fixed-value temporaries per example.
        data.append(lp.GlobalArg(var_name, result_type, shape=()))
        data.extend([
            lp.TemporaryVariable(name, get_numpy_type(val))
            for name, val in var_values.items()
            ])
        instructions.extend([
            lp.Assignment(name, get_numpy_type(val)(val))
            for name, val in var_values.items()
            ])
        instructions.append(lp.Assignment(var_name, expr))

        if expr_type == "int_nonneg":
            var_names.extend(var_values)

    knl = lp.make_kernel("{ : }", instructions, data, seq_dependencies=True)

    import islpy as isl
    # Tell loopy the int_nonneg variables are nonnegative.
    knl = lp.assume(knl, isl.BasicSet(
            "[%s] -> { : %s}"
            % (
                ", ".join(var_names),
                " and ".join("%s >= 0" % name for name in var_names))))

    knl = lp.set_options(knl, return_dict=True)
    print(knl)

    evt, lp_values = knl(queue, out_host=True)

    for name, ref_value in ref_values.items():
        lp_value = lp_values[name]
        if expr_type in ["real", "complex"]:
            err = abs(ref_value - lp_value) / abs(ref_value)
        elif expr_type in ["int", "int_nonneg"]:
            err = abs(ref_value - lp_value)
        else:
            assert False

        if abs(err) > 1e-10:
            print(80 * "-")
            print(knl)
            print(80 * "-")
            print(lp.generate_code_v2(knl).device_code())
            print(80 * "-")
            print(f"WRONG: {name} rel error={err:g}")
            print("reference=%r" % ref_value)
            print("loopy=%r" % lp_value)
            print(80 * "-")
            # deliberate hard failure to stop the test with context printed
            1 / 0

    print(lp.generate_code_v2(knl).device_code())
def emit_assignment(self, codegen_state, insn):
    """Emit an ISPC assignment statement for *insn*.

    Ordinary assignments become a plain cgen ``Assign``. Instructions
    tagged ``!streaming_store`` are emitted as ISPC ``streaming_store``
    calls instead; this requires the store's linearized subscript to have
    exactly one stride-1 local-axis-0 term, which is validated here.

    :returns: a :class:`cgen.Assign` or :class:`cgen.Statement`.
    :raises LoopyError: if a streaming store's layout constraints are
        violated.
    :raises NotImplementedError: for atomic instructions.

    .. note::

        Fixes vs. the previous revision: ``type(ary).__name`` →
        ``type(ary).__name__`` (the old spelling raised ``AttributeError``
        instead of the intended error); the non-``Sum`` subscript case now
        builds ``(subscript,)`` instead of ``(subscript.children,)`` (a
        non-Sum expression need not have ``.children``); and the
        ``rhs_has_programindex`` generator's ``for`` clauses were reordered
        so ``dep`` is bound before ``kernel.iname_tags(dep)`` is evaluated
        (previously it read the stale loop variable).
    """
    kernel = codegen_state.kernel
    ecm = codegen_state.expression_to_code_mapper

    assignee_var_name, = insn.assignee_var_names()

    lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
    lhs_dtype = lhs_var.dtype

    if insn.atomicity:
        raise NotImplementedError("atomic ops in ISPC")

    from loopy.expression import dtype_to_type_context
    from pymbolic.mapper.stringifier import PREC_NONE

    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
    rhs_code = ecm(insn.expression, prec=PREC_NONE,
                type_context=rhs_type_context,
                needed_dtype=lhs_dtype)

    lhs = insn.assignee

    # {{{ handle streaming stores

    if "!streaming_store" in insn.tags:
        ary = ecm.find_array(lhs)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

        access_info = get_access_info(kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, codegen_state.var_subst_map),
                codegen_state.vectorization_info)

        from loopy.kernel.data import ArrayArg, TemporaryVariable

        if not isinstance(ary, (ArrayArg, TemporaryVariable)):
            raise LoopyError("array type not supported in ISPC: %s"
                    % type(ary).__name__)

        if len(access_info.subscripts) != 1:
            raise LoopyError("streaming stores must have a subscript")
        subscript, = access_info.subscripts

        from pymbolic.primitives import Sum, flattened_sum, Variable
        if isinstance(subscript, Sum):
            terms = subscript.children
        else:
            terms = (subscript,)

        new_terms = []

        from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type
        from loopy.symbolic import get_dependencies

        # The local-axis-0 iname must appear exactly once, as a bare
        # stride-1 term; it is dropped from the emitted address (the
        # streaming store implies it). All other terms must not depend on
        # local axis 0.
        saw_l0 = False
        for term in terms:
            if (isinstance(term, Variable)
                    and kernel.iname_tags_of_type(term.name, LocalIndexTag)):
                tag, = kernel.iname_tags_of_type(
                        term.name, LocalIndexTag, min_num=1, max_num=1)
                if tag.axis == 0:
                    if saw_l0:
                        raise LoopyError(
                                "streaming store must have stride 1 in "
                                "local index, got: %s" % subscript)
                    saw_l0 = True
                    continue
            else:
                for dep in get_dependencies(term):
                    if filter_iname_tags_by_type(
                            kernel.iname_to_tags.get(dep, []), LocalIndexTag):
                        tag, = filter_iname_tags_by_type(
                                kernel.iname_to_tags.get(dep, []),
                                LocalIndexTag, 1)
                        if tag.axis == 0:
                            raise LoopyError(
                                    "streaming store must have stride 1 in "
                                    "local index, got: %s" % subscript)

                new_terms.append(term)

        if not saw_l0:
            raise LoopyError("streaming store must have stride 1 in "
                    "local index, got: %s" % subscript)

        if access_info.vector_index is not None:
            raise LoopyError("streaming store may not use a short-vector "
                    "data type")

        # If the RHS does not vary with the program index, broadcast it so
        # every program instance stores the same value.
        rhs_has_programindex = any(
                isinstance(tag, LocalIndexTag) and tag.axis == 0
                for dep in get_dependencies(insn.expression)
                for tag in kernel.iname_tags(dep))

        if not rhs_has_programindex:
            rhs_code = "broadcast(%s, 0)" % rhs_code

        from cgen import Statement
        return Statement(
                "streaming_store(%s + %s, %s)"
                % (
                    access_info.array_name,
                    ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
                    rhs_code))

    # }}}

    from cgen import Assign
    return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    """Build the argument dict for the *test* (non-reference) kernel run.

    Values are coerced to their declared dtypes; array arguments are
    allocated with the test kernel's shape/strides and seeded with the
    reference run's pre-run data so both runs see the same inputs. The
    allocation metadata is recorded back onto each ``arg_desc`` for later
    result comparison.

    :arg ref_arg_data: per-argument records from ``make_ref_args`` (same
        order as *impl_arg_info*).
    :returns: a dict mapping argument names to values/arrays.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            # Coerce scalars to the declared dtype.
            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                        "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                    queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is ArrayArg or\
                arg.arg_class is ConstantArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            dtype = kernel_arg.dtype

            itemsize = dtype.itemsize
            numpy_strides = [itemsize * s for s in strides]

            # Minimal flat allocation covering the strided extent.
            alloc_size = sum(astrd * (alen - 1) if astrd != 0 else alen - 1
                    for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                    arg_desc.ref_shape, arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            common_len = min(
                    len(host_ref_flat_array),
                    len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(
                    host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            # NOTE(review): this reassignment looks like a dead store --
            # host_contig_array is not used again below; possibly leftover
            # from an earlier revision. Confirm before removing.
            host_contig_array = arg_desc.ref_storage_array.get()
            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            # Record test-run allocation metadata for later comparison.
            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
def emit_assignment(self, codegen_state, insn):
    """Emit an ISPC assignment statement for *insn*.

    Ordinary assignments become a plain cgen ``Assign``. Instructions
    tagged ``!streaming_store`` are emitted as ISPC ``streaming_store``
    calls; the store's linearized subscript must contain exactly one bare
    stride-1 local-axis-0 term, which is validated here.

    :returns: a :class:`cgen.Assign` or :class:`cgen.Statement`.
    :raises LoopyError: if a streaming store's layout constraints are
        violated.
    :raises NotImplementedError: for atomic instructions.

    .. note::

        Fixes vs. the previous revision: the substitution-map lambda used
        ``self.codegen_state.var_subst_map`` although this method receives
        (and everywhere else uses) the *codegen_state* parameter;
        ``type(ary).__name`` → ``type(ary).__name__`` (the old spelling
        raised ``AttributeError`` instead of the intended error); and the
        non-``Sum`` subscript case now builds ``(subscript,)`` instead of
        ``(subscript.children,)`` (a non-Sum expression need not have
        ``.children``).
    """
    kernel = codegen_state.kernel
    ecm = codegen_state.expression_to_code_mapper

    assignee_var_name, = insn.assignee_var_names()

    lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
    lhs_dtype = lhs_var.dtype

    if insn.atomicity:
        raise NotImplementedError("atomic ops in ISPC")

    from loopy.expression import dtype_to_type_context
    from pymbolic.mapper.stringifier import PREC_NONE

    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
    rhs_code = ecm(insn.expression, prec=PREC_NONE,
                type_context=rhs_type_context,
                needed_dtype=lhs_dtype)

    lhs = insn.assignee

    # {{{ handle streaming stores

    if "!streaming_store" in insn.tags:
        ary = ecm.find_array(lhs)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

        access_info = get_access_info(
                kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, codegen_state.var_subst_map),
                codegen_state.vectorization_info)

        from loopy.kernel.data import GlobalArg, TemporaryVariable

        if not isinstance(ary, (GlobalArg, TemporaryVariable)):
            raise LoopyError("array type not supported in ISPC: %s"
                    % type(ary).__name__)

        if len(access_info.subscripts) != 1:
            raise LoopyError("streaming stores must have a subscript")
        subscript, = access_info.subscripts

        from pymbolic.primitives import Sum, flattened_sum, Variable
        if isinstance(subscript, Sum):
            terms = subscript.children
        else:
            terms = (subscript,)

        new_terms = []

        from loopy.kernel.data import LocalIndexTag
        from loopy.symbolic import get_dependencies

        # The local-axis-0 iname must appear exactly once, as a bare
        # stride-1 term; it is dropped from the emitted address (the
        # streaming store implies it). All other terms must not depend on
        # local axis 0.
        saw_l0 = False
        for term in terms:
            if (isinstance(term, Variable)
                    and isinstance(
                        kernel.iname_to_tag.get(term.name), LocalIndexTag)
                    and kernel.iname_to_tag.get(term.name).axis == 0):
                if saw_l0:
                    raise LoopyError("streaming store must have stride 1 "
                            "in local index, got: %s" % subscript)
                saw_l0 = True
                continue
            else:
                for dep in get_dependencies(term):
                    if (isinstance(
                            kernel.iname_to_tag.get(dep), LocalIndexTag)
                            and kernel.iname_to_tag.get(dep).axis == 0):
                        raise LoopyError(
                                "streaming store must have stride 1 "
                                "in local index, got: %s" % subscript)

                new_terms.append(term)

        if not saw_l0:
            raise LoopyError("streaming store must have stride 1 in "
                    "local index, got: %s" % subscript)

        if access_info.vector_index is not None:
            raise LoopyError("streaming store may not use a short-vector "
                    "data type")

        # If the RHS does not vary with the program index, broadcast it so
        # every program instance stores the same value.
        rhs_has_programindex = any(
                isinstance(kernel.iname_to_tag.get(dep), LocalIndexTag)
                and kernel.iname_to_tag.get(dep).axis == 0
                for dep in get_dependencies(insn.expression))

        if not rhs_has_programindex:
            rhs_code = "broadcast(%s, 0)" % rhs_code

        from cgen import Statement
        return Statement(
                "streaming_store(%s + %s, %s)"
                % (access_info.array_name,
                    ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
                    rhs_code))

    # }}}

    from cgen import Assign
    return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    """Build randomly-initialized reference arguments for automatic testing.

    For each implementation argument of *kernel*: scalar values are taken
    from *parameters* (coerced to the declared dtype), and arrays/images are
    allocated on *queue*'s device and filled with random data (via the
    module-level ``fill_rand`` helper).

    :arg kernel: the loopy kernel whose arguments are being materialized.
    :arg impl_arg_info: implementation argument descriptors (each carrying
        ``arg_class``, ``name``, shape/stride expressions, ...).
    :arg queue: a :class:`pyopencl.CommandQueue` used for allocation.
    :arg parameters: mapping of value-argument names to concrete values.
    :returns: a tuple ``(ref_args, ref_arg_data)`` where *ref_args* maps
        argument names to the objects to pass to the kernel and
        *ref_arg_data* holds one ``TestArgInfo`` per array argument (and
        ``None`` per value argument) for later result checking.
    :raises LoopyError: for arrays of unknown shape/dtype, write-mode
        images, or unrecognized argument classes.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, \
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            # Offset arguments are derived, not user-supplied.
            if arg.offset_for_name:
                continue

            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            # Coerce plain Python/mismatched values to the declared dtype.
            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \
                or arg.arg_class is ConstantArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError("array '%s' needs known shape to use automatic "
                        "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                # Images are allocated densely; strides do not apply.
                storage_array = ary = cl_array.empty(
                        queue, shape, dtype, order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                # Smallest flat allocation covering the strided footprint:
                # last reachable element index plus one. A zero stride still
                # needs alen-1 elements along that axis.
                alloc_size = sum(astrd*(alen-1) if astrd != 0 else alen-1
                        for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                            "known. Perhaps you want to use "
                            "loopy.add_dtypes "
                            "or loopy.infer_argument_dtypes?"
                            % arg.name)

                itemsize = dtype.itemsize
                # Element strides -> byte strides for as_strided below.
                numpy_strides = [itemsize*s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                        "automatic testing")

            # Randomize BEFORE taking the pre-run snapshot so both copies
            # hold identical data.
            fill_rand(storage_array)

            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                        queue.context, ary.get())
            else:
                pre_run_storage_array = storage_array.copy()

                # Strided views share the flat storage buffers allocated above.
                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(
                        pre_run_storage_array, shape, numpy_strides)
                ref_args[arg.name] = ary

            # Record everything needed to compare against the test run later.
            ref_arg_data.append(
                    TestArgInfo(
                        name=arg.name,
                        ref_array=ary,
                        ref_storage_array=storage_array,
                        ref_pre_run_array=pre_run_ary,
                        ref_pre_run_storage_array=pre_run_storage_array,
                        ref_shape=shape,
                        ref_strides=strides,
                        ref_alloc_size=alloc_size,
                        ref_numpy_strides=numpy_strides,
                        needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type %s not understood" % type(arg))

    return ref_args, ref_arg_data
def map_subscript(self, expr, enclosing_prec, type_context):
    """Render an array subscript as C/OpenCL source text.

    Aggregates that are not plain variables fall back to generic
    ``agg[idx]`` printing. Known arrays are routed through
    :func:`loopy.kernel.array.get_access_info`: images become
    ``read_imagef`` calls, while global args and temporaries become
    (possibly vector-indexed) subscripts of the implementation array.
    """
    def plain_subscript_code(node, prec, ctx):
        # Generic "aggregate[index]" rendering with no array smarts.
        return self.parenthesize_if_needed(
                "%s[%s]" % (
                    self.rec(node.aggregate, PREC_CALL, ctx),
                    self.rec(node.index, PREC_NONE, 'i')),
                prec, PREC_CALL)

    from pymbolic.primitives import Variable
    if not isinstance(expr.aggregate, Variable):
        return plain_subscript_code(expr, enclosing_prec, type_context)

    ary = self.find_array(expr)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate
    from loopy.symbolic import simplify_using_aff

    index_tuple = tuple(
            simplify_using_aff(self.kernel, idx)
            for idx in expr.index_tuple)

    access_info = get_access_info(self.kernel.target, ary, index_tuple,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

    from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

    if isinstance(ary, ImageArg):
        # Image reads use reversed index order (fastest-moving axis first).
        coord_code = ", ".join(
                self.rec(idx, PREC_NONE, 'i')
                for idx in expr.index[::-1])
        base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
                % (ary.name, ary.dimensions, coord_code))

        if ary.dtype.numpy_dtype == np.float32:
            # Scalar float: take the first channel of the float4 read.
            return base_access+".x"
        if self.kernel.target.is_vector_dtype(ary.dtype):
            return base_access
        elif ary.dtype.numpy_dtype == np.float64:
            # Doubles are stored across two float channels.
            return "as_double(%s.xy)" % base_access
        else:
            raise NotImplementedError(
                    "non-floating-point images not supported for now")

    elif isinstance(ary, (GlobalArg, TemporaryVariable)):
        if not access_info.subscripts:
            if isinstance(ary, GlobalArg):
                # unsubscripted global args are pointers
                result = "*" + access_info.array_name
            else:
                # unsubscripted temp vars are scalars
                result = access_info.array_name
        else:
            subscript, = access_info.subscripts
            result = self.parenthesize_if_needed(
                    "%s[%s]" % (
                        access_info.array_name,
                        self.rec(subscript, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

        if access_info.vector_index is None:
            return result
        # Short-vector access: let the target decide how to index into
        # the vector (e.g. ".s0" suffix vs. array-style indexing).
        return self.codegen_state.ast_builder.add_vector_access(
                result, access_info.vector_index)

    else:
        assert False