def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array,
        centers_is_obj_array):
    """Return the kernel from :meth:`get_kernel` with axis tags and a
    device-dependent iname split applied.

    Each ``*_is_obj_array`` flag, when true, tags the corresponding kernel
    argument (``"tgt"``, ``"src"``, ``"center"``) with ``"sep,C"``.

    If the first device of ``self.context`` reports a CPU device type,
    ``itgt`` is split by 16 and ``isrc`` by 256; otherwise a warning is
    issued and only ``itgt`` is split (by 128).
    """
    # FIXME specialize/tune for GPU/CPU
    knl = self.get_kernel()

    for is_obj_array, arg_name in (
            (targets_is_obj_array, "tgt"),
            (sources_is_obj_array, "src"),
            (centers_is_obj_array, "center")):
        if is_obj_array:
            knl = lp.tag_array_axes(knl, arg_name, "sep,C")

    import pyopencl as cl
    device = self.context.devices[0]

    if not (device.type & cl.device_type.CPU):
        # No dedicated tuning for non-CPU devices yet: say so and fall back
        # to a plain split of the target loop.
        from warnings import warn
        warn("don't know how to tune layer potential computation for '%s'"
                % device)
        return lp.split_iname(knl, "itgt", 128, outer_tag="g.0")

    knl = lp.split_iname(knl, "itgt", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "isrc", 256)
    return lp.prioritize_loops(knl, ["isrc_outer", "itgt_inner"])
def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
    """Return the kernel from :meth:`get_kernel`, with ``"sources"`` and/or
    ``"targets"`` tagged ``"sep,C"`` when the matching flag is true, and
    the ``imat`` iname split by 1024 (outer part tagged ``g.0``).
    """
    # FIXME
    tagged = self.get_kernel()

    # Sources are handled before targets, as in the established order.
    for is_obj_array, arg_name in (
            (sources_is_obj_array, "sources"),
            (targets_is_obj_array, "targets")):
        if is_obj_array:
            tagged = lp.tag_array_axes(tagged, arg_name, "sep,C")

    return lp.split_iname(tagged, "imat", 1024, outer_tag="g.0")
def get_optimized_kernel(self, is_sources_obj_array, is_centers_obj_array):
    """Return the kernel from :meth:`get_kernel` after tagging and splitting.

    ``"sources"`` / ``"qbx_centers"`` are tagged ``"sep,C"`` when their flag
    is true, ``itgt_center`` is split by 16 (outer part tagged ``g.0``), and
    the result is passed through
    ``self._allow_redundant_execution_of_knl_scaling``.
    """
    # FIXME
    kernel = self.get_kernel()

    axis_requests = (
            (is_sources_obj_array, "sources"),
            (is_centers_obj_array, "qbx_centers"),
            )
    for requested, array_name in axis_requests:
        if requested:
            kernel = lp.tag_array_axes(kernel, array_name, "sep,C")

    kernel = lp.split_iname(kernel, "itgt_center", 16, outer_tag="g.0")
    return self._allow_redundant_execution_of_knl_scaling(kernel)
def get_optimized_kernel(self, is_targets_obj_array, is_centers_obj_array):
    """Return the kernel from :meth:`get_kernel` after tagging.

    ``"targets"`` / ``"qbx_centers"`` are tagged ``"sep,C"`` when their flag
    is true, the ``iglobal_center`` iname is tagged ``g.0``, and the result
    is passed through ``self._allow_redundant_execution_of_knl_scaling``.
    """
    # FIXME
    kernel = self.get_kernel()

    axis_requests = (
            (is_targets_obj_array, "targets"),
            (is_centers_obj_array, "qbx_centers"),
            )
    for requested, array_name in axis_requests:
        if requested:
            kernel = lp.tag_array_axes(kernel, array_name, "sep,C")

    kernel = lp.tag_inames(kernel, {"iglobal_center": "g.0"})
    return self._allow_redundant_execution_of_knl_scaling(kernel)
def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array,
        centers_is_obj_array):
    """Return the kernel from :meth:`get_kernel` with ``"tgt"``, ``"src"``
    and ``"center"`` tagged ``"sep,C"`` when the matching flag is true, and
    the ``imat`` iname split by 1024 (outer part tagged ``g.0``).
    """
    knl = self.get_kernel()

    tag_requests = {
            "tgt": targets_is_obj_array,
            "src": sources_is_obj_array,
            "center": centers_is_obj_array,
            }
    # Dict order is insertion order, so arrays are tagged tgt, src, center.
    for arg_name, is_obj_array in tag_requests.items():
        if is_obj_array:
            knl = lp.tag_array_axes(knl, arg_name, "sep,C")

    return lp.split_iname(knl, "imat", 1024, outer_tag="g.0")
def test_tag_data_axes(ctx_factory):
    """Check that invalid axis-tag strings for ``out`` raise
    :class:`lp.LoopyError` and that the ``"N1,N0,N2"`` layout matches the
    untransformed reference kernel.
    """
    ctx = ctx_factory()

    ref_knl = lp.make_kernel(
            "{ [i,j,k]: 0<=i,j,k<n }",
            "out[i,j,k] = 15")

    # Both of these tag strings are expected to be rejected.
    for rejected_tags in ("N1,N0,N5", "N1,N0,c"):
        with pytest.raises(lp.LoopyError):
            lp.tag_array_axes(ref_knl, "out", rejected_tags)

    knl = lp.tag_array_axes(ref_knl, "out", "N1,N0,N2")
    knl = lp.tag_inames(knl, {"j": "g.0", "i": "g.1"})

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 20})
def set_q_storage_format(kernel, name):
    """Return *kernel* with the storage layout of array *name* rewritten.

    The axes are named ``i,j,k,field,e``, axis 3 is split by 4 (``"F"``
    order, without splitting inames), and the resulting six axes are tagged
    ``"N0,N1,N2,vec,N4,N3"``.
    """
    with_axis_names = lp.set_array_axis_names(kernel, name, "i,j,k,field,e")
    with_split_field = lp.split_array_dim(
            with_axis_names, (name, 3, "F"), 4,
            auto_split_inames=False)
    return lp.tag_array_axes(with_split_field, name, "N0,N1,N2,vec,N4,N3")
def copy_targets_kernel(self):
    """Build the ``copy_targets`` kernel, which assigns
    ``targets[dim, i] = points[dim, i]`` with ``ndims`` fixed to
    ``self.ambient_dim``.
    """
    domain = """{[dim,i]: 0<=dim<ndims and 0<=i<npoints}"""
    instructions = """
            targets[dim, i] = points[dim, i]
            """

    kernel = lp.make_kernel(
            domain, instructions,
            default_offset=lp.auto,
            name="copy_targets",
            lang_version=MOST_RECENT_LANGUAGE_VERSION)
    kernel = lp.fix_parameters(kernel, ndims=self.ambient_dim)
    kernel = lp.split_iname(kernel, "i", 128, inner_tag="l.0", outer_tag="g.0")

    # Input and output use different layouts: "points" is tagged "sep, C",
    # "targets" gets explicit stride tags.
    kernel = lp.tag_array_axes(kernel, "points", "sep, C")
    kernel = lp.tag_array_axes(kernel, "targets", "stride:auto, stride:1")

    return lp.tag_inames(kernel, {"dim": "ilp"})
def copy_targets_kernel(self):
    """Return a loopy kernel named ``copy_targets`` that copies
    ``points[dim, i]`` into ``targets[dim, i]``; ``ndims`` is fixed to
    ``self.ambient_dim``.
    """
    result = lp.fix_parameters(
            lp.make_kernel(
                """{[dim,i]: 0<=dim<ndims and 0<=i<npoints}""",
                """
                targets[dim, i] = points[dim, i]
                """,
                default_offset=lp.auto, name="copy_targets",
                lang_version=MOST_RECENT_LANGUAGE_VERSION),
            ndims=self.ambient_dim)

    result = lp.split_iname(result, "i", 128, inner_tag="l.0", outer_tag="g.0")

    # Apply the per-array axis tags in the established order.
    for array_name, axis_tags in (
            ("points", "sep, C"),
            ("targets", "stride:auto, stride:1")):
        result = lp.tag_array_axes(result, array_name, axis_tags)

    return lp.tag_inames(result, dict(dim="ilp"))
def set_q_storage_format(kernel, name):
    """Apply the storage-format transformations for array *name*.

    Names the axes ``i,j,k,field,e``, splits axis 3 by 4 in ``"F"`` order
    (inames are not split), then tags the axes ``"N0,N1,N2,vec,N4,N3"``.
    Returns the transformed kernel.
    """
    field_axis = (name, 3, "F")

    result = lp.set_array_axis_names(kernel, name, "i,j,k,field,e")
    result = lp.split_array_dim(result, field_axis, 4, auto_split_inames=False)
    result = lp.tag_array_axes(result, name, "N0,N1,N2,vec,N4,N3")
    return result
def prg():
    """Build the ``diff`` program computing
    ``result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])``, with both
    axes of ``mat`` tagged ``stride:auto``.
    """
    domain = """{[iel, idof, j]:
            0<=iel<nelements and
            0<=idof<ndiscr_nodes_out and
            0<=j<ndiscr_nodes_in}"""
    program = make_loopy_program(
            domain,
            "result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])",
            name="diff")
    return lp.tag_array_axes(program, "mat", "stride:auto,stride:auto")
def test_cuda_short_vector():
    """Generate and print CUDA device code for ``out[i] = 2*a[i]`` with the
    ``i`` loop and both arrays split into width-4 vectors.
    """
    kernel = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i] = 2*a[i]",
            target=lp.CudaTarget())
    kernel = lp.set_options(kernel, write_code=True)

    # Split the loop and the arrays by the same factor, then mark the new
    # inner array axis as the vector axis.
    kernel = lp.split_iname(kernel, "i", 4, slabs=(0, 1), inner_tag="vec")
    kernel = lp.split_array_axis(kernel, "a,out", axis_nr=0, count=4)
    kernel = lp.tag_array_axes(kernel, "a,out", "C,vec")

    kernel = lp.set_options(kernel, write_wrapper=True)
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def test_cuda_short_vector():
    """Print the CUDA device code generated for a width-4 vectorized
    version of ``out[i] = 2*a[i]``.
    """
    arrays = "a,out"

    t_unit = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i] = 2*a[i]",
            target=lp.CudaTarget())
    t_unit = lp.set_options(t_unit, write_code=True)
    t_unit = lp.split_iname(t_unit, "i", 4, slabs=(0, 1), inner_tag="vec")
    t_unit = lp.split_array_axis(t_unit, arrays, axis_nr=0, count=4)
    t_unit = lp.tag_array_axes(t_unit, arrays, "C,vec")
    t_unit = lp.set_options(t_unit, write_wrapper=True)
    t_unit = lp.add_and_infer_dtypes(t_unit, dict(a=np.float32))

    generated = lp.generate_code_v2(t_unit)
    print(generated.device_code())
def prg(nmatrices):
    """Build the ``diff`` program that applies *nmatrices* matrices
    ``diff_mat[imatrix]`` to ``vec``.

    ``nmatrices`` is fixed as a parameter, the ``imatrix`` iname is tagged
    ``unr``, and ``result`` is tagged ``"sep,c,c"``.
    """
    domain = """{[imatrix, iel, idof, j]:
            0<=imatrix<nmatrices and
            0<=iel<nelements and
            0<=idof<nunit_nodes_out and
            0<=j<nunit_nodes_in}"""
    instructions = """
            result[imatrix, iel, idof] = sum(
                    j, diff_mat[imatrix, idof, j] * vec[iel, j])
            """

    program = make_loopy_program(domain, instructions, name="diff")
    program = lp.fix_parameters(program, nmatrices=nmatrices)
    program = lp.tag_inames(program, "imatrix: unr")
    return lp.tag_array_axes(program, "result", "sep,c,c")
def test_numba_cuda_target():
    """Generate and print Numba-CUDA code for a pairwise-distance kernel.

    The kernel computes ``D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))`` with
    ``N`` fixed to 3, ``i``/``j`` split for group/local execution, a
    prefetch of ``X[i,:]`` and ``k`` unrolled.
    """
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    # Pass default_tag explicitly: relying on the implicit default is
    # deprecated in loopy, and "l.auto" is the value that default stood for,
    # so the prefetch loops are tagged exactly as before.
    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})
    print(lp.generate_code_v2(knl).all_code())
def test_numba_cuda_target():
    """Print the Numba-CUDA code generated for the pairwise-distance kernel
    ``D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))`` (``N`` fixed to 3).
    """
    kernel = lp.make_kernel(
            "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
            "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
            target=lp.NumbaCudaTarget())
    kernel = lp.assume(kernel, "M>0")

    # Parallelization: i over groups, j over work items.
    kernel = lp.split_iname(kernel, "i", 16, outer_tag='g.0')
    kernel = lp.split_iname(kernel, "j", 128, inner_tag='l.0', slabs=(0, 1))

    kernel = lp.add_prefetch(kernel, "X[i,:]", default_tag="l.auto")
    kernel = lp.fix_parameters(kernel, N=3)
    kernel = lp.prioritize_loops(kernel, "i_inner,j_outer")
    kernel = lp.tag_inames(kernel, "k:unr")
    kernel = lp.tag_array_axes(kernel, "X", "N0,N1")
    kernel = lp.add_and_infer_dtypes(kernel, {"X": np.float32})

    generated = lp.generate_code_v2(kernel)
    print(generated.all_code())
def knl():
    """Build the ``nodes`` kernel computing
    ``result[d, k, i] = sum(j, resampling_mat[i, j] * nodes[d, k, j])``.

    ``i`` is split by 16 (inner part tagged ``l.0``), ``k`` is tagged
    ``g.0`` and all three axes of ``result`` are tagged ``stride:auto``.
    """
    # The local is deliberately not called "knl" to avoid shadowing this
    # function's own name.
    kernel = lp.make_kernel(
            """{[d,k,i,j]:
                0<=d<dims and
                0<=k<nelements and
                0<=i<ndiscr_nodes and
                0<=j<nmesh_nodes}""",
            """
                result[d, k, i] = \
                    sum(j, resampling_mat[i, j] * nodes[d, k, j])
                """,
            name="nodes",
            default_offset=lp.auto)

    kernel = lp.split_iname(kernel, "i", 16, inner_tag="l.0")
    kernel = lp.tag_inames(kernel, {"k": "g.0"})
    return lp.tag_array_axes(
            kernel, "result", "stride:auto,stride:auto,stride:auto")
def test_vectorize(ctx_factory):
    """Compare a vectorized version of ``a[i] = 2*b[i]`` (inner iname tagged
    ``vec``) against a reference whose inner iname is tagged ``unr``.
    """
    ctx = ctx_factory()

    base_knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            <> temp = 2*b[i]
            a[i] = temp
            """)
    base_knl = lp.add_and_infer_dtypes(base_knl, {"b": np.float32})
    base_knl = lp.set_array_axis_names(base_knl, "a,b", "i")
    base_knl = lp.split_array_dim(
            base_knl, [("a", 0), ("b", 0)], 4,
            split_kwargs={"slabs": (0, 1)})
    base_knl = lp.tag_array_axes(base_knl, "a,b", "c,vec")

    # Reference and test kernel differ only in the tag of the inner iname.
    ref_knl = lp.tag_inames(base_knl, {"i_inner": "unr"})
    vec_knl = lp.tag_inames(base_knl, {"i_inner": "vec"})

    vec_knl = lp.preprocess_kernel(vec_knl)
    # Code generation is run for its own sake; the outputs are not inspected.
    _code, _info = lp.generate_code(vec_knl)

    lp.auto_test_vs_ref(ref_knl, ctx, vec_knl, parameters={"n": 30})
def get_kernel(self):
    """Assemble and return the loopy kernel for this object.

    The kernel loops over target boxes (``itgt_box``), the source boxes
    listed for each of them (``isrc_box``), and all target/source pairs
    (``itgt``, ``isrc``) inside. For every entry of ``result_names`` it
    forms ``pair_result_<i>`` as the named value times a ``strength`` entry,
    reduces over ``isrc`` and accumulates into ``result[<i>, itgt]``.

    When ``self.exclude_self`` is true, pairs for which
    ``isrc == target_to_source[itgt]`` contribute 0 and a
    ``target_to_source`` argument is added to the kernel.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()

    from pymbolic import var

    # One expression per result: <name> * strength[strength_usage[i], isrc].
    exprs = [
        var(name) * var("strength").index(
            (self.strength_usage[i], var("isrc")))
        for i, name in enumerate(result_names)
    ]

    if self.exclude_self:
        from pymbolic.primitives import If, Variable
        # "is_self" is assigned in the instruction stream below.
        exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]

    from sumpy.tools import gather_loopy_source_arguments
    loopy_knl = lp.make_kernel(
        [
            "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
            "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end}",
            "{[itgt,isrc,idim]: \
                itgt_start<=itgt<itgt_end and \
                isrc_start<=isrc<isrc_end and \
                0<=idim<dim }",
        ],
        # The instruction list is order-sensitive: each "for" opened here is
        # closed by an "end" in one of the later fragments.
        self.get_kernel_scaling_assignments() + [
            """
            for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]

                    for itgt
                        for isrc
                            <> d[idim] = \
                                targets[idim,itgt] - sources[idim,isrc] \
                                {dup=idim}
            """
        ] + [
            """
                            <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""
        ] + [] + loopy_insns + [
            lp.Assignment(id=None,
                          assignee="pair_result_%d" % i,
                          expression=expr,
                          temp_var_type=lp.auto)
            for i, expr in enumerate(exprs)
        ] + [
            # Closes "for isrc".
            """
                        end
            """
        ] + [
            """
                        result[KNLIDX, itgt] = result[KNLIDX, itgt] + \
                            knl_KNLIDX_scaling \
                            * simul_reduce(sum, isrc, pair_result_KNLIDX)
            """.replace("KNLIDX", str(iknl)) for iknl in range(len(exprs))
        ] + [
            # Closes "for itgt", "for isrc_box" and "for itgt_box".
            """
                    end
                end
            end
            """
        ],
        [
            lp.GlobalArg(
                "box_target_starts,box_target_counts_nonchild,"
                "box_source_starts,box_source_counts_nonchild,",
                None,
                shape=None),
            lp.GlobalArg(
                "source_box_starts, source_box_lists,", None, shape=None),
            lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
            lp.GlobalArg("result",
                         None,
                         shape="nkernels,ntargets",
                         dim_tags="sep,c"),
            lp.GlobalArg(
                "targets", None, shape="dim,ntargets", dim_tags="sep,c"),
            lp.GlobalArg(
                "sources", None, shape="dim,nsources", dim_tags="sep,c"),
            lp.ValueArg("nsources", np.int32),
            lp.ValueArg("ntargets", np.int32),
            "...",
        ] + ([
            lp.GlobalArg(
                "target_to_source", np.int32, shape=("ntargets", ))
        ] if self.exclude_self else []) +
        gather_loopy_source_arguments(self.kernels),
        name=self.name,
        assumptions="ntgt_boxes>=1",
        fixed_parameters=dict(dim=self.dim,
                              nstrengths=self.strength_count,
                              nkernels=len(self.kernels)))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")

    # Each kernel object gets a chance to transform the loopy kernel.
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def set_D_storage_format(kernel):
    """Return *kernel* with both axes of array ``D`` tagged ``"f,f"``."""
    array_name = "D"
    axis_tags = "f,f"
    return lp.tag_array_axes(kernel, array_name, axis_tags)
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse, transform and test the "KernelR"/"KernelS" Fortran kernels.

    The kernels are parsed from ``strongVolumeKernels.f90``, fused, and put
    through a fixed sequence of transformations. ``tap_hsv`` records the
    kernel as it stands at the stage selected by *opt_level*; that snapshot
    is what gets analyzed and compared against the reference at the end.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: when greater than 1, ``k`` is split by 2 with an
        ``ilp``-tagged inner iname.
    :arg Nq: value fixed for the kernel parameter ``Nq``.
    :arg opt_level: selects the transformation stage to snapshot.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    # Exactly two kernels are expected to match (unpacking fails otherwise).
    hsv_r, hsv_s = [
        knl
        for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")
    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Reference for the final comparison: taken before the optimizations
    # below are applied.
    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created for the R and S kernels; each
    # list is aliased after the loop.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one reader of the flux variable is expected per kernel.
            reader, = lp.find_instructions(
                hsv, "tag:{knl_tag} and reads:{flux_var}".format(
                    knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            # The S and R flux stores get opposite orderings of their first
            # two axes.
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give the reader's "n" iname a name derived from the flux
            # variable.
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # From here on, work with the snapshot selected by opt_level.
    # NOTE(review): tap_hsv is unbound for a negative opt_level.
    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array

# Set up an OpenCL context/queue and an input array of n float32 values.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)

# out[i] = 2*a[i], with the loop and both arrays split into width-4 vectors.
knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]")
knl = lp.set_options(knl, write_code=True)
knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
knl = lp.tag_array_axes(knl, "a,out", "C,vec")

# The kernel now sees "a" as a two-axis array with an inner axis of
# length 4, so the input is reshaped to match.
knl(queue, a=a.reshape(-1, 4), n=n)
def get_kernel(self):
    """Assemble and return the loopy kernel for this object.

    The kernel loops over target boxes (``itgt_box``), the source boxes
    listed for each of them (``isrc_box``), and all target/source pairs
    (``itgt``, ``isrc``) inside. For every entry of ``result_names`` it
    forms ``pair_result_<i>`` as the named value times a ``strength`` entry,
    reduces over ``isrc`` and accumulates into ``result[<i>, itgt]``.

    ``dim``, ``nstrengths`` and ``nkernels`` are fixed after the kernel has
    been created.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()

    from pymbolic import var

    # One expression per result: <name> * strength[strength_usage[i], isrc].
    exprs = [
            var(name)
            * var("strength").index((self.strength_usage[i], var("isrc")))
            for i, name in enumerate(result_names)]

    from sumpy.tools import gather_loopy_source_arguments
    loopy_knl = lp.make_kernel(
            [
                "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
                "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end}",
                "{[itgt,isrc,idim]: \
                        itgt_start<=itgt<itgt_end and \
                        isrc_start<=isrc<isrc_end and \
                        0<=idim<dim }",
                ],
            # The instruction list is order-sensitive: each "for" opened
            # here is closed by an "end" in one of the later fragments.
            self.get_kernel_scaling_assignments()
            + [
                """
                for itgt_box
                    <> tgt_ibox = target_boxes[itgt_box]
                    <> itgt_start = box_target_starts[tgt_ibox]
                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                    <> isrc_box_start = source_box_starts[itgt_box]
                    <> isrc_box_end = source_box_starts[itgt_box+1]

                    for isrc_box
                        <> src_ibox = source_box_lists[isrc_box]
                        <> isrc_start = box_source_starts[src_ibox]
                        <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]

                        for itgt
                            for isrc
                                <> d[idim] = \
                                        targets[idim,itgt] - sources[idim,isrc] \
                                        {dup=idim}
                """
                ] + loopy_insns + [
                lp.Assignment(id=None,
                    assignee="pair_result_%d" % i, expression=expr,
                    temp_var_type=lp.auto)
                for i, expr in enumerate(exprs)
                ] + [
                # Closes "for isrc".
                """
                            end
                """] + ["""
                            result[KNLIDX, itgt] = result[KNLIDX, itgt] + \
                                knl_KNLIDX_scaling \
                                * simul_reduce(sum, isrc, pair_result_KNLIDX)
                """.replace("KNLIDX", str(iknl))
                for iknl in range(len(exprs))] + [
                # Closes "for itgt", "for isrc_box" and "for itgt_box".
                """
                        end
                    end
                end
                """],
            [
                lp.GlobalArg("box_target_starts,box_target_counts_nonchild,"
                    "box_source_starts,box_source_counts_nonchild,",
                    None, shape=None),
                lp.GlobalArg("source_box_starts, source_box_lists,",
                    None, shape=None),
                lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
                lp.GlobalArg("result", None,
                    shape="nkernels,ntargets", dim_tags="sep,c"),
                lp.GlobalArg("targets", None,
                    shape="dim,ntargets", dim_tags="sep,c"),
                lp.GlobalArg("sources", None,
                    shape="dim,nsources", dim_tags="sep,c"),
                lp.ValueArg("nsources", np.int32),
                lp.ValueArg("ntargets", np.int32),
                "...",
                ] + gather_loopy_source_arguments(self.kernels),
            name=self.name, assumptions="ntgt_boxes>=1")

    loopy_knl = lp.fix_parameters(
            loopy_knl,
            dim=self.dim,
            nstrengths=self.strength_count,
            nkernels=len(self.kernels))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")

    # Each kernel object gets a chance to transform the loopy kernel.
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def get_kernel(self):
    """Assemble and return the loopy kernel for ``self.expansions``.

    Symbolic expressions are generated for every expansion via ``expand``,
    simplified, and turned into loopy instructions. The kernel loops over
    all ``itgt``/``isrc`` pairs, forms one ``pair_result_<i>`` per
    expansion, and ends with the instructions returned by
    ``self.get_result_store_instructions()``.
    """
    from sumpy.symbolic import make_sympy_vector

    avec = make_sympy_vector("a", self.dim)
    bvec = make_sympy_vector("b", self.dim)

    from sumpy.assignment_collection import SymbolicAssignmentCollection
    sac = SymbolicAssignmentCollection()

    logger.info("compute expansion expressions: start")

    result_names = [expand(i, sac, expn, avec, bvec)
            for i, expn in enumerate(self.expansions)]

    logger.info("compute expansion expressions: done")

    sac.run_global_cse()

    from sumpy.symbolic import kill_trivial_assignments
    # "tau" is substituted by 0 in every assignment before trivial
    # assignments are removed; result_names are always retained.
    assignments = kill_trivial_assignments([
            (name, expr.subs("tau", 0))
            for name, expr in six.iteritems(sac.assignments)],
            retain_names=result_names)

    from sumpy.codegen import to_loopy_insns
    loopy_insns = to_loopy_insns(assignments,
            vector_names=set(["a", "b"]),
            pymbolic_expr_maps=[
                expn.kernel.get_code_transformer() for expn in self.expansions],
            complex_dtype=np.complex128  # FIXME
            )

    isrc_sym = var("isrc")
    exprs = [
            var(name) * self.get_strength_or_not(isrc_sym, i)
            for i, name in enumerate(result_names)]

    from sumpy.tools import gather_loopy_source_arguments
    arguments = (
            self.get_src_tgt_arguments()
            + self.get_input_and_output_arguments()
            + gather_loopy_source_arguments(self.kernels))

    loopy_knl = lp.make_kernel(
            "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources "
            "and 0<=idim<dim}",
            self.get_kernel_scaling_assignments()
            + ["for itgt, isrc"]
            + [self.get_compute_a_and_b_vecs()]
            + loopy_insns
            + [
                # NOTE(review): "dtype" from value_dtypes is unpacked but
                # not used in the assignment.
                lp.Assignment(id=None,
                    assignee="pair_result_%d" % i, expression=expr,
                    temp_var_type=lp.auto)
                for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes))
            ]
            + ["end"]
            + self.get_result_store_instructions(),
            arguments,
            name=self.name,
            assumptions="nsources>=1 and ntargets>=1",
            default_offset=lp.auto,
            silenced_warnings="write_race(write_lpot*)"
            )

    loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim)
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

    # Each expansion gets a chance to transform the loopy kernel.
    for expn in self.expansions:
        loopy_knl = expn.prepare_loopy_kernel(loopy_knl)

    loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C")

    return loopy_knl
def get_kernel(self):
    """Assemble and return the loopy kernel for this object.

    The kernel loops over target boxes (``itgt_box``), the source boxes
    listed for each of them (``isrc_box``), and all target/source pairs
    (``itgt``, ``isrc``) inside. For each entry of ``self.kernels`` it
    reduces ``pair_result_<i>`` over ``isrc`` and accumulates the scaled sum
    into ``result[<i>, itgt]``.

    When ``self.exclude_self`` is true, an ``is_self`` flag
    (``isrc == target_to_source[itgt]``) is computed for each pair.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    # presumably these are the pair_result_<i> assignments read by the
    # reduction below -- confirm against get_kernel_exprs
    kernel_exprs = self.get_kernel_exprs(result_names)
    arguments = (
        self.get_default_src_tgt_arguments()
        + [
            lp.GlobalArg("box_target_starts",
                None, shape=None),
            lp.GlobalArg("box_target_counts_nonchild",
                None, shape=None),
            lp.GlobalArg("box_source_starts",
                None, shape=None),
            lp.GlobalArg("box_source_counts_nonchild",
                None, shape=None),
            lp.GlobalArg("source_box_starts",
                None, shape=None),
            lp.GlobalArg("source_box_lists",
                None, shape=None),
            lp.GlobalArg("strength", None,
                shape="nstrengths, nsources", dim_tags="sep,C"),
            lp.GlobalArg("result", None,
                shape="nkernels, ntargets", dim_tags="sep,C"),
            "..."
        ])

    loopy_knl = lp.make_kernel([
        "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
        "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
        "{[itgt, isrc, idim]: \
            itgt_start <= itgt < itgt_end and \
            isrc_start <= isrc < isrc_end and \
            0 <= idim < dim}",
        ],
        # The instruction list is order-sensitive: each "for" opened here is
        # closed by an "end" in one of the later fragments.
        self.get_kernel_scaling_assignments()
        + ["""
            for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]

                    for itgt
                        for isrc
                            <> d[idim] = \
                                targets[idim, itgt] - sources[idim, isrc] {dup=idim}
        """] + ["""
                            <> is_self = (isrc == target_to_source[itgt])
                """ if self.exclude_self else ""]
        # " end" closes "for isrc".
        + loopy_insns + kernel_exprs
        + [" end"]
        + ["""
                        result[{i}, itgt] = result[{i}, itgt] + \
                            knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
                            {{id_prefix=write_csr}}
            """.format(i=iknl)
            for iknl in range(len(self.kernels))]
        # Closes "for itgt", "for isrc_box" and "for itgt_box".
        + ["""
                    end
                end
            end
        """],
        arguments,
        assumptions="ntgt_boxes>=1",
        name=self.name,
        silenced_warnings="write_race(write_csr*)",
        fixed_parameters=dict(
            dim=self.dim,
            nstrengths=self.strength_count,
            nkernels=len(self.kernels)),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    loopy_knl = lp.add_dtypes(loopy_knl,
        dict(nsources=np.int32, ntargets=np.int32))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
    loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")

    # Each kernel object gets a chance to transform the loopy kernel.
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def get_kernel(self):
    """Assemble and return the loopy kernel for this object.

    Loops run over target boxes (``itgt_box``), their listed source boxes
    (``isrc_box``) and the target/source pairs (``itgt``, ``isrc``) within.
    For each entry of ``self.kernels``, ``pair_result_<i>`` is reduced over
    ``isrc``, scaled by ``knl_<i>_scaling`` and added to
    ``result[<i>, itgt]``.

    With ``self.exclude_self`` set, an ``is_self`` flag
    (``isrc == target_to_source[itgt]``) is additionally computed per pair.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    # presumably these are the pair_result_<i> assignments read by the
    # reduction below -- confirm against get_kernel_exprs
    kernel_exprs = self.get_kernel_exprs(result_names)
    arguments = (self.get_default_src_tgt_arguments() + [
        lp.GlobalArg("box_target_starts", None, shape=None),
        lp.GlobalArg("box_target_counts_nonchild", None, shape=None),
        lp.GlobalArg("box_source_starts", None, shape=None),
        lp.GlobalArg("box_source_counts_nonchild", None, shape=None),
        lp.GlobalArg("source_box_starts", None, shape=None),
        lp.GlobalArg("source_box_lists", None, shape=None),
        lp.GlobalArg("strength",
                     None,
                     shape="nstrengths, nsources",
                     dim_tags="sep,C"),
        lp.GlobalArg(
            "result", None, shape="nkernels, ntargets", dim_tags="sep,C"),
        "..."
    ])

    loopy_knl = lp.make_kernel(
        [
            "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
            "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
            "{[itgt, isrc, idim]: \
                itgt_start <= itgt < itgt_end and \
                isrc_start <= isrc < isrc_end and \
                0 <= idim < dim}",
        ],
        # Order matters in this list: the "for" loops opened in the first
        # fragment are closed by the "end" fragments further down.
        self.get_kernel_scaling_assignments() + [
            """
            for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]

                    for itgt
                        for isrc
                            <> d[idim] = \
                                targets[idim, itgt] - sources[idim, isrc] {dup=idim}
            """
        ] + [
            """
                            <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""
        ] + loopy_insns + kernel_exprs + [" end"] + [
            """
                        result[{i}, itgt] = result[{i}, itgt] + \
                            knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
                            {{id_prefix=write_csr}}
            """.format(i=iknl) for iknl in range(len(self.kernels))
        ] + [
            # Closes "for itgt", "for isrc_box" and "for itgt_box".
            """
                    end
                end
            end
            """
        ],
        arguments,
        assumptions="ntgt_boxes>=1",
        name=self.name,
        silenced_warnings="write_race(write_csr*)",
        fixed_parameters=dict(dim=self.dim,
                              nstrengths=self.strength_count,
                              nkernels=len(self.kernels)),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    loopy_knl = lp.add_dtypes(loopy_knl,
                              dict(nsources=np.int32, ntargets=np.int32))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
    loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")

    # Let every kernel object apply its own preparation step.
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def get_kernel(self):
    """Assemble and return the all-pairs loopy kernel for this object.

    For every target ``itgt`` and every entry of ``result_names``, the
    kernel reduces ``pair_result_<i>`` over all sources ``isrc``, scales the
    sum by ``knl_<i>_scaling`` and stores it in ``result[<i>, itgt]``.

    When ``self.exclude_self`` is true, pairs with ``isrc == itgt``
    contribute 0.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()

    from pymbolic import var

    # One expression per result: <name> * strength[strength_usage[i], isrc].
    exprs = [
            var(name)
            * var("strength").index((self.strength_usage[i], var("isrc")))
            for i, name in enumerate(result_names)]

    if self.exclude_self:
        from pymbolic.primitives import If, ComparisonOperator, Variable
        # Self-interaction is detected by index equality here.
        exprs = [
                If(
                    ComparisonOperator(Variable("isrc"), "!=", Variable("itgt")),
                    expr, 0)
                for expr in exprs]

    from sumpy.tools import gather_loopy_source_arguments
    loopy_knl = lp.make_kernel(
            "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources \
                    and 0<=idim<dim}",
            # The instruction list is order-sensitive: each "for" opened
            # here is closed by an "end" in one of the later fragments.
            self.get_kernel_scaling_assignments()
            + ["""
                for itgt
                    for isrc
                """]
            + loopy_insns
            + ["""
                        <> d[idim] = targets[idim,itgt] - sources[idim,isrc] \
                        """]+[
                lp.Assignment(id=None,
                    assignee="pair_result_%d" % i, expression=expr,
                    temp_var_type=lp.auto)
                for i, expr in enumerate(exprs)
            ]
            # Closes "for isrc".
            + ["""
                    end
                """]
            + ["""
                    result[KNLIDX, itgt] = knl_KNLIDX_scaling \
                        * simul_reduce(sum, isrc, pair_result_KNLIDX)
                """.replace("KNLIDX", str(iknl))
                for iknl in range(len(exprs))]
            + [
            ]
            # Closes "for itgt".
            + ["""
                end
                """],
            [
                lp.GlobalArg("sources", None,
                    shape=(self.dim, "nsources")),
                lp.GlobalArg("targets", None,
                    shape=(self.dim, "ntargets")),
                lp.ValueArg("nsources", None),
                lp.ValueArg("ntargets", None),
                lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
                lp.GlobalArg("result", None,
                    shape="nresults,ntargets", dim_tags="sep,C")
            ] + gather_loopy_source_arguments(self.kernels),
            name=self.name,
            assumptions="nsources>=1 and ntargets>=1")

    loopy_knl = lp.fix_parameters(
            loopy_knl,
            dim=self.dim,
            nstrengths=self.strength_count,
            nresults=len(self.kernels))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

    # Each kernel object gets a chance to transform the loopy kernel.
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")

    return loopy_knl
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Input: the float32 values 0 .. n-1 on the device.
n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)

knl = lp.make_kernel("{ [i]: 0<=i<n }", "out[i] = 2*a[i]")
knl = lp.set_options(knl, write_code=True)

# Split the loop and both arrays by 4 and mark the new inner array axis as
# the vector axis.
knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
knl = lp.tag_array_axes(knl, "a,out", "C,vec")

# Run the kernel; "a" is passed with a trailing axis of length 4 to match
# the split array layout.
knl(queue, a=a.reshape(-1, 4), n=n)
def get_kernel(self):
    """Assemble and return the all-pairs loopy kernel for this object.

    For every target ``itgt`` and every entry of ``result_names``, the
    kernel reduces ``pair_result_<i>`` over all sources ``isrc``, scales the
    sum by ``knl_<i>_scaling`` and stores it in ``result[<i>, itgt]``.

    When ``self.exclude_self`` is true, pairs for which
    ``isrc == target_to_source[itgt]`` contribute 0 and a
    ``target_to_source`` argument is added to the kernel.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()

    from pymbolic import var

    # One expression per result: <name> * strength[strength_usage[i], isrc].
    exprs = [
        var(name) * var("strength").index(
            (self.strength_usage[i], var("isrc")))
        for i, name in enumerate(result_names)
    ]

    if self.exclude_self:
        from pymbolic.primitives import If, Variable
        # "is_self" is assigned in the instruction stream below.
        exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]

    from sumpy.tools import gather_loopy_source_arguments
    loopy_knl = lp.make_kernel(
        "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources \
            and 0<=idim<dim}",
        # The instruction list is order-sensitive: each "for" opened here is
        # closed by an "end" in one of the later fragments.
        self.get_kernel_scaling_assignments() + [
            """
            for itgt
                for isrc
            """
        ] + loopy_insns + [
            """
                    <> d[idim] = targets[idim,itgt] - sources[idim,isrc]
            """
        ] + [
            """
                    <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""
        ] + [
            lp.Assignment(id=None,
                          assignee="pair_result_%d" % i,
                          expression=expr,
                          temp_var_type=lp.auto)
            for i, expr in enumerate(exprs)
        ] + ["""
                end
            """] + [
            """
                result[KNLIDX, itgt] = knl_KNLIDX_scaling \
                    * simul_reduce(sum, isrc, pair_result_KNLIDX)
            """.replace("KNLIDX", str(iknl)) for iknl in range(len(exprs))
        ] + [] + ["""
            end
            """],
        [
            lp.GlobalArg("sources", None, shape=(self.dim, "nsources")),
            lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")),
            lp.ValueArg("nsources", None),
            lp.ValueArg("ntargets", None),
            lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
            lp.GlobalArg("result",
                         None,
                         shape="nresults,ntargets",
                         dim_tags="sep,C")
        ] + ([
            lp.GlobalArg(
                "target_to_source", np.int32, shape=("ntargets", ))
        ] if self.exclude_self else []) +
        gather_loopy_source_arguments(self.kernels),
        name=self.name,
        assumptions="nsources>=1 and ntargets>=1",
        fixed_parameters=dict(dim=self.dim,
                              nstrengths=self.strength_count,
                              nresults=len(self.kernels)))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

    # Each kernel object gets a chance to transform the loopy kernel.
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")

    return loopy_knl
def get_kernel(self):
    """Assemble and return the loopy kernel for ``self.expansions``.

    Symbolic expressions are generated for every expansion via ``expand``
    and turned into loopy instructions. The kernel loops over all
    ``itgt``/``isrc`` pairs, forms one ``pair_result_<i>`` per expansion,
    and ends with the instructions returned by
    ``self.get_result_store_instructions()``. ``dim`` is fixed to
    ``self.dim``.
    """
    from sumpy.symbolic import make_sym_vector

    avec = make_sym_vector("a", self.dim)
    bvec = make_sym_vector("b", self.dim)

    from sumpy.assignment_collection import SymbolicAssignmentCollection
    sac = SymbolicAssignmentCollection()

    logger.info("compute expansion expressions: start")

    result_names = [
        expand(i, sac, expn, avec, bvec)
        for i, expn in enumerate(self.expansions)
    ]

    logger.info("compute expansion expressions: done")

    sac.run_global_cse()

    from sumpy.codegen import to_loopy_insns
    loopy_insns = to_loopy_insns(
        six.iteritems(sac.assignments),
        vector_names=set(["a", "b"]),
        pymbolic_expr_maps=[
            expn.kernel.get_code_transformer() for expn in self.expansions
        ],
        retain_names=result_names,
        complex_dtype=np.complex128  # FIXME
    )

    isrc_sym = var("isrc")
    exprs = [
        var(name) * self.get_strength_or_not(isrc_sym, i)
        for i, name in enumerate(result_names)
    ]

    from sumpy.tools import gather_loopy_source_arguments
    arguments = (self.get_src_tgt_arguments() +
                 self.get_input_and_output_arguments() +
                 gather_loopy_source_arguments(self.kernels))

    loopy_knl = lp.make_kernel(
        "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources "
        "and 0<=idim<dim}",
        self.get_kernel_scaling_assignments() + ["for itgt, isrc"] +
        [self.get_compute_a_and_b_vecs()] + loopy_insns + [
            # NOTE(review): "dtype" from value_dtypes is unpacked but not
            # used in the assignment.
            lp.Assignment(id=None,
                          assignee="pair_result_%d" % i,
                          expression=expr,
                          temp_var_type=lp.auto)
            for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes))
        ] + ["end"] + self.get_result_store_instructions(),
        arguments,
        name=self.name,
        assumptions="nsources>=1 and ntargets>=1",
        default_offset=lp.auto,
        silenced_warnings="write_race(write_lpot*)",
        fixed_parameters=dict(dim=self.dim))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

    # Each expansion gets a chance to transform the loopy kernel.
    for expn in self.expansions:
        loopy_knl = expn.prepare_loopy_kernel(loopy_knl)

    loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C")

    return loopy_knl