def knl(): return make_loopy_program( """{[iel, idof, j]: 0<=iel<nelements and 0<=idof<n_to_nodes and 0<=j<n_from_nodes}""", "result[itgt_base + to_element_indices[iel]*n_to_nodes + idof, \ isrc_base + from_element_indices[iel]*n_from_nodes + j] \ = resample_mat[idof, j]", [ lp.GlobalArg("result", None, shape="nnodes_tgt, nnodes_src", offset=lp.auto), lp.ValueArg("itgt_base,isrc_base", np.int32), lp.ValueArg("nnodes_tgt,nnodes_src", np.int32), "...", ], name="oversample_mat")
def make_kernels(self, seq_dependencies):
    """Translate each parsed subprogram in ``self.kernels`` into a loopy
    kernel.

    :param seq_dependencies: forwarded to :func:`lp.make_kernel`; when true,
        instructions are sequenced in source order.
    :return: list of loopy kernels, one per subprogram, in input order.
    """
    result = []
    for sub in self.kernels:
        # {{{ figure out arguments

        kernel_data = []
        for arg_name in sub.arg_names:
            dims = sub.dim_map.get(arg_name)
            if dims is not None:
                # default order is set to "F" in kernel creation below
                kernel_data.append(
                    lp.GlobalArg(
                        arg_name,
                        dtype=sub.get_type(arg_name),
                        shape=sub.get_loopy_shape(arg_name),
                    ))
            else:
                # No dimension info: treat as a scalar (pass-by-value) arg.
                kernel_data.append(
                    lp.ValueArg(arg_name, dtype=sub.get_type(arg_name)))

        # }}}

        # {{{ figure out temporary variables

        # Names that are known but are neither arguments nor loop inames
        # become kernel-private temporaries.
        for var_name in (
                sub.known_names()
                - set(sub.arg_names)
                - sub.all_inames()):
            dtype = sub.get_type(var_name, none_ok=True)
            if sub.implicit_types is None and dtype is None:
                # Implicit typing disabled and no declared type: skip and
                # let loopy complain if the name is actually used.
                continue

            kernel_data.append(
                lp.TemporaryVariable(var_name, dtype=dtype,
                    shape=sub.get_loopy_shape(var_name)))

        # }}}

        from loopy.version import MOST_RECENT_LANGUAGE_VERSION
        knl = lp.make_kernel(sub.index_sets, sub.instructions, kernel_data,
                name=sub.subprogram_name,
                default_order="F",
                index_dtype=self.index_dtype,
                target=self.target,
                seq_dependencies=seq_dependencies,
                lang_version=MOST_RECENT_LANGUAGE_VERSION)

        from loopy.loop import fuse_loop_domains
        knl = fuse_loop_domains(knl)
        knl = lp.fold_constants(knl)

        result.append(knl)

    return result
def get_args(self):
    """Return the kernel-argument list for the Helmholtz parameter k.

    A complex dtype is used when evanescent modes are allowed; otherwise
    k is passed as a real scalar.
    """
    k_dtype = np.complex128 if self.allow_evanescent else np.float64
    helmholtz_arg = lp.ValueArg(self.helmholtz_k_name, k_dtype)
    return [KernelArgument(loopy_arg=helmholtz_arg)]
def write_into_mat_prg(): return lp.make_kernel( ["{[idof]: 0 <= idof < ndofs}", "{[jdof]: 0 <= jdof < mdofs}"], """ result[offset_i + idof, offset_j + jdof] = mat[idof, jdof] """, [ lp.GlobalArg("result", None, shape="n, m", offset=lp.auto), lp.ValueArg("n, m", np.int32), lp.GlobalArg("mat", None, shape="ndofs, mdofs", offset=lp.auto), lp.ValueArg("offset_i", np.int32), lp.ValueArg("offset_j", np.int32), "...", ], options=lp.Options(return_dict=True), default_offset=lp.auto, name="write_into_global_matrix", )
def test_memory_tools_defn():
    """Check the declaration strings emitted by the memory manager for
    global args, value args, and host constants, for both OpenCL and C.

    Fix: the OpenCL branch asserted ``mem.define(True, a5)`` twice in a row;
    the duplicate has been removed.
    """
    wrapper = __test_cases()
    for opts in wrapper:
        # create a dummy callgen
        callgen = CallgenResult(order=opts.order, lang=opts.lang,
                                dev_mem_type=wrapper.state['dev_mem_type'],
                                type_map=type_map(opts.lang))
        # create a memory manager
        mem = get_memory(callgen, host_namer=HostNamer(),
                         device_namer=DeviceNamer())

        a1 = lp.GlobalArg('a1', shape=(arc.problem_size), dtype=np.int32)
        a2 = lp.GlobalArg('a2', shape=(arc.problem_size, 10), dtype=np.int64)
        d3 = lp.GlobalArg('d3', shape=(arc.problem_size, 10, 10),
                          dtype=np.float64)
        a4 = lp.ValueArg('a4', dtype=np.int64)
        a5 = lp.ValueArg('a5', dtype=np.int32)
        # read-only initialized temporary -> may only be defined as a
        # host constant
        a6 = lp.TemporaryVariable('a6', initializer=np.array([0, 1, 2]),
                                  read_only=True)

        if opts.lang == 'opencl':
            assert mem.define(True, a1) == 'cl_mem d_a1;'
            assert mem.define(False, a2) == 'long int* h_a2;'
            assert mem.define(True, d3) == 'cl_mem d_d3;'
            assert mem.define(False, a4) == 'long int h_a4;'
            assert mem.define(True, a5) == 'cl_uint d_a5;'
            # defining a host constant on the device must fail
            with assert_raises(Exception):
                mem.define(True, a6, host_constant=True)
            assert mem.define(False, a6, host_constant=True) == \
                'const long int h_a6[3] = {0, 1, 2};'
        elif opts.lang == 'c':
            assert mem.define(True, a1) == 'int* d_a1;'
            assert mem.define(False, a2) == 'long int* h_a2;'
            assert mem.define(True, d3) == 'double* d_d3;'
            assert mem.define(False, a4) == 'long int h_a4;'
            assert mem.define(True, a5) == 'int d_a5;'
            with assert_raises(Exception):
                mem.define(True, a6, host_constant=True)
            assert mem.define(False, a6, host_constant=True) == \
                'const long int h_a6[3] = {0, 1, 2};'
        else:
            raise NotImplementedError
def expression_argument(expr, parameters):
    """Register *expr* as a loopy kernel argument and return a pymbolic
    variable referring to it.

    Scalars (empty shape) become ValueArgs; shaped expressions become
    GlobalArgs.  The argument replaces the matching slot in
    ``parameters.kernel_data``.
    """
    if expr.shape == ():
        loopy_arg = loopy.ValueArg(expr.name, dtype=expr.dtype)
    else:
        loopy_arg = loopy.GlobalArg(expr.name, dtype=expr.dtype,
                                    shape=expr.shape)

    slot = parameters.wrapper_arguments.index(expr)
    parameters.kernel_data[slot] = loopy_arg
    return pym.Variable(expr.name)
def pick_used_centers(self):
    """Build a kernel that flags every center referenced by some target.

    Sets ``center_is_used[c] = 1`` for each center index c appearing in
    ``target_to_center``; negative entries (targets without a center) are
    skipped via the ``if=`` predicate.  The write race is silenced because
    all racing writers store the same value (1).
    """
    knl = lp.make_kernel(
        """{[i]: 0<=i<ntargets}""",
        """
        <>target_has_center = (target_to_center[i] >= 0)
        center_is_used[target_to_center[i]] = 1 \
            {id=center_is_used_write,if=target_has_center}
        """,
        [
            lp.GlobalArg("target_to_center", shape="ntargets",
                offset=lp.auto),
            lp.GlobalArg("center_is_used", shape="ncenters"),
            lp.ValueArg("ncenters", np.int32),
            lp.ValueArg("ntargets", np.int32),
        ],
        name="pick_used_centers",
        silenced_warnings="write_race(center_is_used_write)",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    # One work-item per target, 128-wide work groups.
    knl = lp.split_iname(knl, "i", 128, inner_tag="l.0", outer_tag="g.0")
    return knl
def create_globals(signature, edges):
    """Build the loopy argument list for a subroutine signature.

    Names listed in ``signature['param']`` become scalar ValueArgs with the
    dtype recorded in ``edges``.  Every other name becomes a GlobalArg whose
    shape is its recorded dimension symbols; each distinct dimension symbol
    is also emitted once as an int32 ValueArg, before the array that first
    uses it.

    Fixes: renamed the local that shadowed the ``globals`` builtin, and
    track already-emitted dimension symbols in a set instead of scanning a
    list (O(1) membership).
    """
    args = []
    seen_dims = set()
    for s in signature['args']:
        if s in signature['param']:
            args.append(lp.ValueArg(s, dtype=edges[s]['type']))
        else:
            dims = edges[s]['dimensions']
            for d in dims:
                if d in seen_dims:
                    continue
                args.append(lp.ValueArg(d, dtype=np.int32))
                seen_dims.add(d)
            args.append(lp.GlobalArg(s, shape=tuple(dims),
                                     dtype=edges[s]['type']))
    return args
def map_size_param(self, expr: SizeParam, state: CodeGenState) -> ImplementedResult:
    """Lower a SizeParam to a scalar loopy kernel argument.

    Results are memoized in ``state.results``; on a cache miss the kernel is
    rebuilt with the new argument appended.
    """
    if expr in state.results:
        return state.results[expr]

    size_arg = lp.ValueArg(expr.name, dtype=expr.dtype)
    state.update_kernel(
        state.kernel.copy(args=[*state.kernel.args, size_arg]))

    stored = StoredResult(expr.name, expr.ndim, frozenset())
    state.results[expr] = stored
    return stored
def left_V(ctx):
    """Build a kernel computing l[alpha,alpha1] as a product of two Gram-type
    sums over the factor matrices u and w.

    NOTE(review): appears to be one step of an alternating-least-squares
    tensor factorization (rank r, mode size n) — confirm against callers.
    """
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0],
        [
            "{[i,k,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=i,k<n}",
        ],
        [
            "l[alpha,alpha1]=sum((i), u[alpha,i]*u[alpha1,i])"
            "*sum((k),w[alpha,k]*w[alpha1,k])",
        ],
        [
            lp.GlobalArg("u", dtype, shape="r, n", order=order),
            lp.GlobalArg("w", dtype, shape="r, n", order=order),
            lp.GlobalArg("l", dtype, shape="r, r", order=order),
            lp.ValueArg("n", np.int64),
            lp.ValueArg("r", np.int64),
        ],
        assumptions="n>=1")
    # Map the rank axes onto the GPU grid; reduction axes are split only.
    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.split_iname(knl, "k", 16)
    return knl
def left_W(ctx):
    """Build a kernel computing l[alpha,alpha1] as a product of Gram-type
    sums over the factor matrices u and v (double precision, transposed
    layout relative to :func:`left_V`).

    NOTE(review): presumably a companion ALS step to left_V — confirm.
    """
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0],
        [
            "{[j,i,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=j,i<n}",
        ],
        [
            "l[alpha,alpha1]=sum((i), u[i,alpha]*u[i,alpha1])"
            "*sum((j),v[j,alpha]*v[j,alpha1])",
        ],
        [
            lp.GlobalArg("v", dtype, shape="n, r", order=order),
            lp.GlobalArg("u", dtype, shape="n, r", order=order),
            lp.GlobalArg("l", dtype, shape="r, r", order=order),
            lp.ValueArg("n", np.int64),
            lp.ValueArg("r", np.int64),
        ],
        assumptions="n>=1")
    # Grid over the rank axes; split-only on the reduction axes.
    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "i", 16)
    return knl
def test_add_inames_for_unused_hw_axes(ctx_factory):
    """Check that lp.add_inames_for_unused_hw_axes extends instructions that
    don't use all hardware axes (init_alpha and the two prefetch rules) so
    they run within all four split inames, then verify against the reference
    kernel.
    """
    ctx = ctx_factory()
    dtype = np.float32
    order = "F"

    n = 16**3

    knl = lp.make_kernel("[n] -> {[i,j]: 0<=i,j<n}",
        [
            """
            <> alpha = 2.0 {id=init_alpha}
            for i
              for j
                c[i, j] = alpha*a[i]*b[j] {id=outerproduct}
              end
            end
            """
        ],
        [
            lp.GlobalArg("a", dtype, shape=("n", ), order=order),
            lp.GlobalArg("b", dtype, shape=("n", ), order=order),
            lp.GlobalArg("c", dtype, shape=("n, n"), order=order),
            lp.ValueArg("n", np.int32, approximately=n),
        ],
        name="rank_one",
        assumptions="n >= 16",
        lang_version=(2018, 2))

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.add_prefetch(knl, "a")
    knl = lp.add_prefetch(knl, "b")

    knl = lp.add_inames_for_unused_hw_axes(knl)

    # All three instructions should now live in the full 2D tiled iname set.
    assert (
        knl["rank_one"].id_to_insn["init_alpha"].within_inames
        == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"]))
    assert (
        knl["rank_one"].id_to_insn["a_fetch_rule"].within_inames
        == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"]))
    assert (
        knl["rank_one"].id_to_insn["b_fetch_rule"].within_inames
        == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"]))

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[np.dtype(dtype).itemsize * n**2 / 1e9],
            op_label=["GBytes"],
            parameters={"n": n})
def get_kernel(self):
    """Assemble the loopy kernel evaluating point-to-point interactions for
    explicitly listed (target, source) index pairs.

    For each pair imat, reads ``tgtindices[imat]``/``srcindices[imat]``,
    forms the target-source displacement d, evaluates every kernel
    expression, and writes the scaled results into ``result_<i>[imat]``.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    kernel_exprs = self.get_kernel_exprs(result_names)
    arguments = (
        self.get_default_src_tgt_arguments()
        + [
            lp.GlobalArg("srcindices", None, shape="nresult"),
            lp.GlobalArg("tgtindices", None, shape="nresult"),
            lp.ValueArg("nresult", None)
        ] + [
            lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
            for i, dtype in enumerate(self.value_dtypes)
        ])

    loopy_knl = lp.make_kernel(
        "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
        self.get_kernel_scaling_assignments()
        # NOTE: itgt, isrc need to always be defined in case a statement
        # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
        # places like get_kernel_exprs)
        + ["""
            for imat
                <> itgt = tgtindices[imat]
                <> isrc = srcindices[imat]

                <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
        """]
        + ["""
                <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""]
        + loopy_insns + kernel_exprs
        + ["""
                result_{i}[imat] = \
                    knl_{i}_scaling * pair_result_{i} \
                        {{id_prefix=write_p2p}}
        """.format(i=iknl)
        for iknl in range(len(self.kernels))]
        + ["end"],
        arguments,
        assumptions="nresult>=1",
        silenced_warnings="write_race(write_p2p*)",
        name=self.name,
        fixed_parameters=dict(dim=self.dim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    # Unroll the (small) spatial-dimension loops.
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.add_dtypes(loopy_knl,
        dict(nsources=np.int32, ntargets=np.int32))

    # Give each kernel a chance to register extra preamble/dtypes etc.
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def knl():
    """Build a loopy kernel scattering per-element resampling matrices into
    a global (nnodes_tgt x nnodes_src) matrix, with a basic GPU transform
    applied (elements on g.0, node axis i split onto l.0).
    """
    import loopy as lp
    knl = lp.make_kernel(
        """{[k,i,j]: 0<=k<nelements and 0<=i<n_to_nodes \
                and 0<=j<n_from_nodes}""",
        "result[itgt_base + to_element_indices[k]*n_to_nodes + i, \
                isrc_base + from_element_indices[k]*n_from_nodes + j] \
                = resample_mat[i, j]",
        [
            # dtype None and "..." let loopy infer dtypes/remaining args.
            lp.GlobalArg("result", None,
                shape="nnodes_tgt, nnodes_src",
                offset=lp.auto),
            lp.ValueArg("itgt_base,isrc_base", np.int32),
            lp.ValueArg("nnodes_tgt,nnodes_src", np.int32),
            "...",
        ],
        name="oversample_mat")

    knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
    return lp.tag_inames(knl, dict(k="g.0"))
def pick_knl():
    """Build a loopy program that resamples by picking DOFs: for each element,
    gather ``ary`` entries at ``pick_list[idof]`` from the source element and
    store them in the target element's row of ``result``.
    """
    knl = make_loopy_program(
        """{[iel, idof]: 0<=iel<nelements and 0<=idof<n_to_nodes}""",
        "result[to_element_indices[iel], idof] \
                = ary[from_element_indices[iel], pick_list[idof]]",
        [
            lp.GlobalArg("result", None,
                shape="nelements_result, n_to_nodes",
                offset=lp.auto),
            lp.GlobalArg("ary", None,
                shape="nelements_vec, n_from_nodes",
                offset=lp.auto),
            lp.ValueArg("nelements_result", np.int32),
            lp.ValueArg("nelements_vec", np.int32),
            lp.ValueArg("n_from_nodes", np.int32),
            "...",
        ],
        name="resample_by_picking")

    return knl
def Prav_U(ctx):
    """Build a kernel contracting the 3-tensor a with factor matrices v and w
    over axes (j, k), producing f[alpha, i].

    NOTE(review): appears to be the U-update right-hand side of an ALS-style
    tensor factorization — confirm against callers.

    Fix: the trailing print used Python-2 statement syntax (``print x``);
    it is now a parenthesized call, valid on both Python 2 and 3.
    """
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0],
        [
            "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
        ],
        [
            "f[alpha,i]=sum((j,k), a[i,j,k]*v[alpha,j]*w[alpha,k])",
        ],
        [
            lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
            lp.GlobalArg("v", dtype, shape="r, n", order=order),
            lp.GlobalArg("w", dtype, shape="r, n", order=order),
            lp.GlobalArg("f", dtype, shape="r, n", order=order),
            lp.ValueArg("n", np.int64),
            lp.ValueArg("r", np.int64),
        ],
        assumptions="n>=1")
    # Grid over (i, alpha); split-only on the reduction axes j, k.
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 1, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "k", 16)
    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
    return knl
def Prav_V(ctx):
    """Build a kernel contracting the 3-tensor a with factor matrices w and u
    over axes (k, i), producing f[alpha, j].

    NOTE(review): presumably the V-update companion to Prav_U — confirm.
    """
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0],
        [
            "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
        ],
        [
            "f[alpha,j]=sum((k,i), a[i,j,k]*w[k,alpha]*u[i,alpha])",
        ],
        [
            lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
            lp.GlobalArg("u", dtype, shape="n, r", order=order),
            lp.GlobalArg("w", dtype, shape="n, r", order=order),
            lp.GlobalArg("f", dtype, shape="r, n", order=order),
            lp.ValueArg("n", np.int64),
            lp.ValueArg("r", np.int64),
        ],
        assumptions="n>=1")
    # Grid over (j, alpha); split-only on the reduction axes i, k.
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.split_iname(knl, "k", 16)
    return knl
def get_tensor(ctx):
    """Build a kernel reconstructing the rank-r 3-tensor
    res[i,j,k] = sum_alpha u[i,alpha]*v[j,alpha]*w[k,alpha].
    """
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0],
        [
            "{[j,i,alpha,k]: 0<=alpha<r and 0<=i,j,k<n}",
        ],
        [
            "res[i,j,k]=sum((alpha), u[i,alpha]*v[j,alpha]*w[k,alpha])",
        ],
        [
            lp.GlobalArg("res", dtype, shape="n, n, n", order=order),
            lp.GlobalArg("v", dtype, shape="n, r", order=order),
            lp.GlobalArg("u", dtype, shape="n, r", order=order),
            lp.GlobalArg("w", dtype, shape="n, r", order=order),
            lp.ValueArg("n", np.int32),
            lp.ValueArg("r", np.int32),
        ],
        assumptions="n>=1")
    # 3D grid over the output axes; the rank reduction is split only.
    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "alpha", 2)
    knl = lp.split_iname(knl, "k", 8, outer_tag="g.2", inner_tag="l.2")
    return knl
def prg():
    """Build a kernel computing one proxy center per source index range.

    Depending on the enclosing ``norm_type``: "l2" averages the sources in
    the range (first-order centroid); "linf" takes the midpoint of the
    per-dimension bounding box.

    :raises ValueError: for an unrecognized norm type.
    """
    if norm_type == "l2":
        # NOTE: computes first-order approximation of the source centroids
        insns = """
        proxy_center[idim, irange] = 1.0 / npoints \
                * simul_reduce(sum, i, sources[idim, srcindices[i + ioffset]])
        """
    elif norm_type == "linf":
        # NOTE: computes the centers of the bounding box
        insns = """
        <> bbox_max = \
                simul_reduce(max, i, sources[idim, srcindices[i + ioffset]])
        <> bbox_min = \
                simul_reduce(min, i, sources[idim, srcindices[i + ioffset]])

        proxy_center[idim, irange] = (bbox_max + bbox_min) / 2.0
        """
    else:
        raise ValueError(f"unknown norm type: '{norm_type}'")

    knl = lp.make_kernel(
        [
            "{[irange]: 0 <= irange < nranges}",
            "{[i]: 0 <= i < npoints}",
            "{[idim]: 0 <= idim < ndim}"
        ],
        """
        for irange
            <> ioffset = srcranges[irange]
            <> npoints = srcranges[irange + 1] - srcranges[irange]
            %(insns)s
        end
        """ % dict(insns=insns),
        [
            # "sep" dim tag: sources stored as per-dimension separate arrays.
            lp.GlobalArg("sources", None,
                shape=(ndim, "nsources"), dim_tags="sep,C"),
            lp.ValueArg("nsources", np.int64),
            ...
        ],
        name="compute_block_centers_knl",
        assumptions="ndim>=1 and nranges>=1",
        fixed_parameters=dict(ndim=ndim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION,
        )

    knl = lp.tag_inames(knl, "idim*:unr")
    knl = lp.split_iname(knl, "irange", 64, outer_tag="g.0")

    return knl
def test_unknown_stride_to_callee(ctx_factory):
    """Check that a callee kernel can be invoked on a strided sub-array
    (x[2*i2, i3, i]) whose stride is not known at merge time, and that the
    merged program matches itself under auto_test_vs_ref.
    """
    ctx = ctx_factory()

    # Callee: elementwise doubling of an n-by-n block.
    twice = lp.make_function(
            "{[i, j]: 0<=i, j < n}",
            """
            b[i, j] = 2*a[i, j]
            """, [lp.ValueArg("n"), lp.GlobalArg("a"), lp.GlobalArg("b")],
            name="twice")

    prog = lp.make_kernel(
            "{[i,i0,i1,i2,i3]: 0<=i0, i1, i2, i3< N and 0<=i<Nvar}",
            """
            [i0, i1]: y[i0, i1, i] = twice(N, [i2, i3]: x[2*i2, i3, i])
            """,
            [
                lp.ValueArg("N", dtype=np.int32),
                lp.ValueArg("Nvar", dtype=np.int32),
                lp.GlobalArg("x", shape=lp.auto, dtype=np.float64),
                ...
            ])

    prog = lp.merge([prog, twice])

    lp.auto_test_vs_ref(prog, ctx, prog, parameters={"N": 4, "Nvar": 5})
def test_nonlinear_index(ctx_factory):
    """Smoke-test code generation for a nonlinear subscript (a[i*i])."""
    ctx = ctx_factory()

    kernel = lp.make_kernel("{[i,j]: 0<=i,j<n }",
            """
            a[i*i] = 17
            """,
            [
                lp.GlobalArg("a", shape="n"),
                lp.ValueArg("n"),
                ],
            assumptions="n>=1")

    # Print the kernel and the generated device code; no assertions — the
    # test passes if code generation does not raise.
    print(kernel)
    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_highlighted_code())
def mat_knl():
    """Build a loopy program that resamples by matrix application: each
    target element's row of ``result`` is ``resample_mat`` applied to the
    corresponding source element's row of ``ary``.
    """
    knl = make_loopy_program(
        """{[iel, idof, j]: 0<=iel<nelements and 0<=idof<n_to_nodes \
                and 0<=j<n_from_nodes}""",
        "result[to_element_indices[iel], idof] \
                = sum(j, resample_mat[idof, j] \
                * ary[from_element_indices[iel], j])",
        [
            lp.GlobalArg("result", None,
                shape="nelements_result, n_to_nodes",
                offset=lp.auto),
            lp.GlobalArg("ary", None,
                shape="nelements_vec, n_from_nodes",
                offset=lp.auto),
            lp.ValueArg("nelements_result", np.int32),
            lp.ValueArg("nelements_vec", np.int32),
            "...",
        ],
        name="resample_by_mat")

    return knl
def test_function_decl_extractor():
    # ensure that we can tell the difference between pointers, constants, etc.
    # in execution
    from loopy.target.c import ExecutableCTarget

    # Three argument kinds: global pointer (a), constant array (b),
    # scalar by value (v).
    knl = lp.make_kernel('{[i]: 0 <= i < 10}',
                         """
                            a[i] = b[i] + v
                         """,
                         [lp.GlobalArg('a', shape=(10,), dtype=np.int32),
                          lp.ConstantArg('b', shape=(10)),
                          lp.ValueArg('v', dtype=np.int32)],
                         target=ExecutableCTarget())

    # Execute the C kernel and check the arithmetic end-to-end.
    assert np.allclose(knl(b=np.arange(10), v=-1)[1], np.arange(10) - 1)
def map_size_param(self, expr: SizeParam, state: CodeGenState) -> ImplementedResult:
    """Lower a SizeParam to a scalar loopy kernel argument, memoized in
    ``state.results``.

    Tags matching ``self.array_tag_t_to_not_propagate`` are filtered off
    before being attached to the argument.
    """
    if expr in state.results:
        return state.results[expr]

    arg = lp.ValueArg(expr.name,
                      dtype=expr.dtype,
                      tags=_filter_tags_not_of_type(
                          expr,
                          self.array_tag_t_to_not_propagate))
    kernel = state.kernel.copy(args=state.kernel.args + [arg])
    state.update_kernel(kernel)

    # SizeParams are always named; guard for the type checker.
    assert expr.name is not None
    result = StoredResult(expr.name, expr.ndim, frozenset())
    state.results[expr] = result
    return result
def prg():
    """Build a kernel computing one proxy-ball radius per source index
    range: the maximum Euclidean distance from the range's proxy center to
    its sources, scaled by ``radius_factor``.
    """
    knl = lp.make_kernel(
        [
            "{[irange]: 0 <= irange < nranges}",
            "{[i]: 0 <= i < npoints}",
            "{[idim]: 0 <= idim < ndim}"
        ],
        """
        for irange
            <> ioffset = srcranges[irange]
            <> npoints = srcranges[irange + 1] - srcranges[irange]

            <> rblk = reduce(max, i, sqrt(simul_reduce(sum, idim, \
                    (proxy_centers[idim, irange]
                     - sources[idim, srcindices[i + ioffset]]) ** 2)
                    ))

            proxy_radius[irange] = radius_factor * rblk
        end
        """,
        [
            # "sep" dim tag: sources stored as per-dimension separate arrays.
            lp.GlobalArg("sources", None,
                shape=(ndim, "nsources"), dim_tags="sep,C"),
            lp.ValueArg("nsources", np.int64),
            lp.ValueArg("radius_factor", np.float64),
            ...
        ],
        name="compute_block_radii_knl",
        assumptions="ndim>=1 and nranges>=1",
        fixed_parameters=dict(ndim=ndim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION,
        )

    knl = lp.tag_inames(knl, "idim*:unr")
    knl = lp.split_iname(knl, "irange", 64, outer_tag="g.0")

    return knl
def init_global_mat_prg(): return lp.make_kernel( ["{[idof]: 0 <= idof < n}", "{[jdof]: 0 <= jdof < m}"], """ result[idof, jdof] = 0 {id=init} """, [ lp.GlobalArg("result", None, shape="n, m", offset=lp.auto), lp.ValueArg("n, m", np.int32), "...", ], options=lp.Options(return_dict=True), default_offset=lp.auto, name="init_a_global_matrix", )
def qbx_center_to_target_box_lookup(self, particle_id_dtype, box_id_dtype):
    """Build a kernel mapping each QBX center (the first ``ncenters`` user
    targets) to the target box containing it.

    Scans every box's target list, translates tree target indices back to
    user indices, and writes ``box_to_target_box[ibox]`` for indices below
    ``ncenters``.
    """
    # FIXME Iterating over all boxes to find which ones have QBX centers
    # is inefficient.

    knl = lp.make_kernel(
        [
            "{[ibox]: 0<=ibox<nboxes}",
            "{[itarget_tree]: b_t_start <= itarget_tree < b_t_start + ntargets}",
        ],
        """
        for ibox
            <> b_t_start = box_target_starts[ibox]
            <> ntargets = box_target_counts_nonchild[ibox]

            for itarget_tree
                <> itarget_user = user_target_from_tree_target[itarget_tree]
                <> in_bounds = itarget_user < ncenters

                # This write is race-free because each center only belongs
                # to one box.
                if in_bounds
                    qbx_center_to_target_box[itarget_user] = \
                            box_to_target_box[ibox] {id=tgt_write}
                end
            end
        end
        """,
        [
            lp.GlobalArg("qbx_center_to_target_box", box_id_dtype,
                shape="ncenters"),
            lp.GlobalArg("box_to_target_box", box_id_dtype),
            lp.GlobalArg("user_target_from_tree_target", None, shape=None),
            lp.ValueArg("ncenters", particle_id_dtype),
            "..."
        ],
        name="qbx_center_to_target_box_lookup",
        silenced_warnings="write_race(tgt_write)",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    # One work-group of 128 threads per chunk of boxes.
    knl = lp.split_iname(knl, "ibox", 128, inner_tag="l.0", outer_tag="g.0")

    return knl
def keval(): return make_loopy_program( [ "{[iel]: 0 <= iel < nelements}", "{[idof]: 0 <= idof < n_to_nodes}" ], """ result[iel, idof] = result[iel, idof] + \ coefficients[iel, ibasis] * basis[idof] """, [ lp.GlobalArg("coefficients", None, shape=("nelements", "n_to_nodes")), lp.ValueArg("ibasis", np.int32), "..." ], name="conn_evaluate_knl")
def test_divide_precedence(ctx_factory):
    """Check that generated code parenthesizes division and modulo correctly
    when combined with multiplication: c*(a/b) and c*(a%b) must not be
    emitted as (c*a)/b or (c*a)%b.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{:}",
        """
        x[0] = c*(a/b)
        y[0] = c*(a%b)
        """,
        [lp.ValueArg("a, b, c", np.int32),
            lp.GlobalArg("x, y", np.int32)])
    print(lp.generate_code_v2(knl).device_code())

    evt, (x_out, y_out) = knl(queue, c=2, b=2, a=5)
    evt.wait()
    # Integer semantics: 2*(5/2) == 4 and 2*(5%2) == 2.
    assert x_out.get() == 4
    assert y_out.get() == 2
def test_fuzz_code_generator(ctx_factory):
    """Fuzz scalar code generation: evaluate 50 random expressions both with
    pymbolic and through a compiled loopy kernel and compare the results.

    On mismatch, dumps diagnostics and fails hard via a deliberate
    ZeroDivisionError.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    #from expr_fuzz import get_fuzz_examples
    #for expr, var_values in get_fuzz_examples():
    for expr, var_values in generate_random_fuzz_examples(50):
        from pymbolic import evaluate
        try:
            true_value = evaluate(expr, var_values)
        except ZeroDivisionError:
            # Random expression divides by zero: skip, not a codegen bug.
            continue

        def get_dtype(x):
            if isinstance(x, (complex, np.complexfloating)):
                return np.complex128
            else:
                return np.float64

        knl = lp.make_kernel("{ : }",
                [lp.Assignment("value", expr)],
                [lp.GlobalArg("value", np.complex128, shape=())]
                + [
                    lp.ValueArg(name, get_dtype(val))
                    for name, val in six.iteritems(var_values)
                    ])

        ck = lp.CompiledKernel(ctx, knl)
        evt, (lp_value, ) = ck(queue, out_host=True, **var_values)
        err = abs(true_value - lp_value) / abs(true_value)
        if abs(err) > 1e-10:
            print(80 * "-")
            print("WRONG: rel error=%g" % err)
            print("true=%r" % true_value)
            print("loopy=%r" % lp_value)
            print(80 * "-")
            print(ck.get_code())
            print(80 * "-")
            print(var_values)
            print(80 * "-")
            print(repr(expr))
            print(80 * "-")
            print(expr)
            print(80 * "-")
            # Deliberate hard failure after the diagnostics above.
            1 / 0