def get_kernel(self):
    """Assemble and return the loopy kernel for this object.

    The kernel iterates over the entries of ``global_qbx_centers``, walks the
    targets listed for each center via ``center_to_targets_starts`` and
    ``center_to_targets_lists``, and stores
    ``kernel_scaling * result_<i>_p`` in ``result[<i>, center_itgt]`` for
    each result name returned by ``get_loopy_insns_and_result_names``.

    :returns: the kernel built by :func:`loopy.make_kernel`, after ``idim``
        has been duplicated (tagged ``unr``) for the ``compute_b`` and
        ``fetch_center`` instructions and after
        ``self.expansion.prepare_loopy_kernel`` has been applied.
    """
    ncoeffs = len(self.expansion)
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()

    domains = [
        "{[iglobal_center]: 0<=iglobal_center<nglobal_qbx_centers}",
        "{[icenter_tgt]: \
            icenter_tgt_start<=icenter_tgt<icenter_tgt_end}",
        "{[idim]: 0<=idim<dim}",
    ]

    # Expansion-specific instructions come first, followed by the fixed
    # lookup/write-back code shared by every expansion.
    instructions = loopy_insns + ["""
        <> src_icenter = global_qbx_centers[iglobal_center]
        <> icenter_tgt_start = center_to_targets_starts[src_icenter]
        <> icenter_tgt_end = center_to_targets_starts[src_icenter+1]
        <> center_itgt = center_to_targets_lists[icenter_tgt]
        <> center[idim] = qbx_centers[idim, src_icenter] \
            {id=fetch_center}
        <> b[idim] = targets[idim, center_itgt] - center[idim] \
            {id=compute_b}
        <> coeff${COEFFIDX} = qbx_expansions[src_icenter, ${COEFFIDX}]
        result[${RESULTIDX},center_itgt] = \
            kernel_scaling * result_${RESULTIDX}_p \
            {id_prefix=write_result}
        """]

    kernel_data = [
        lp.GlobalArg("result", None, shape="nresults, ntargets",
                     dim_tags="sep,C"),
        lp.GlobalArg("qbx_centers", None, shape="dim, ncenters",
                     dim_tags="sep,c"),
        lp.GlobalArg("center_to_targets_starts,center_to_targets_lists",
                     None, shape=None),
        lp.GlobalArg("qbx_expansions", None, shape=("ncenters", ncoeffs)),
        lp.GlobalArg("targets", None, shape=(self.dim, "ntargets"),
                     dim_tags="sep,C"),
        lp.ValueArg("ncenters,ntargets", np.int32),
        "...",
    ]
    kernel_data.extend(arg.loopy_arg for arg in self.expansion.get_args())

    loopy_knl = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        name=self.name,
        assumptions="nglobal_qbx_centers>=1",
        defines={
            "dim": self.dim,
            "COEFFIDX": [str(i) for i in range(ncoeffs)],
            "RESULTIDX": [str(i) for i in range(len(result_names))],
            "nresults": len(result_names),
        },
        silenced_warnings="write_race(write_result*)")

    # Give each of these instructions its own unrolled copy of ``idim``.
    for insn_id in ("compute_b", "fetch_center"):
        loopy_knl = lp.duplicate_inames(
            loopy_knl, "idim", insn_id, tags={"idim": "unr"})

    return self.expansion.prepare_loopy_kernel(loopy_knl)
def test_unschedulable_kernel_detection():
    """Check detection of kernels that need iname duplication.

    The first kernel is preprocessed, must be reported as needing iname
    duplication with exactly four duplication options, and applying any one
    option must clear the condition.  The second, five-iname kernel must be
    reported as needing duplication with exactly ten options.
    """
    small_knl = lp.make_kernel(
        ["{[i,j]:0<=i,j<n}"],
        """
        mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
        mat2[j] = mat2[j] + 1 {inames=j, id=i2}
        mat3[i] = mat3[i] + 1 {inames=i, id=i3}
        """)
    small_knl = lp.preprocess_kernel(small_knl)

    # Check that loopy can detect the unschedulability of the kernel
    assert lp.needs_iname_duplication(small_knl)
    small_options = list(lp.get_iname_duplication_options(small_knl))
    assert len(small_options) == 4

    # Every proposed option, applied on its own, has to resolve the problem.
    for dup_inames, dup_insns in lp.get_iname_duplication_options(small_knl):
        repaired = lp.duplicate_inames(small_knl, dup_inames, dup_insns)
        assert not lp.needs_iname_duplication(repaired)

    large_knl = lp.make_kernel(
        ["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
        """
        mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
        mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
        mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
        mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
        """)

    assert lp.needs_iname_duplication(large_knl)
    large_options = list(lp.get_iname_duplication_options(large_knl))
    assert len(large_options) == 10
def make_inames_unique(self, knl):
    """Duplicate, per instruction, the inames found in its expression.

    For every instruction of *knl*, ``self.recurse_expr`` is called on the
    instruction's ``expression``; if it yields any names, those inames are
    duplicated with :func:`loopy.duplicate_inames`, restricted to that one
    instruction through an ``id:<instruction id>`` match.

    :arg knl: the kernel whose instructions are examined.
    :returns: the kernel after all duplications (the input kernel itself if
        no instruction yields any inames).
    """
    # The loop walks the instruction list of the kernel that was passed in;
    # ``knl`` is rebound inside the loop, which does not affect iteration.
    for insn in knl.instructions:
        # presumably the iname names referenced by the expression --
        # TODO confirm against recurse_expr
        inames = self.recurse_expr(insn.expression)
        if not inames:
            continue

        knl = lp.duplicate_inames(
            knl, ",".join(inames), within=f"id:{insn.id}")

    return knl
def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
    """User-facing kernel creation entrypoint.

    :arg domains: :class:`islpy.BasicSet`
    :arg instructions:
    :arg kernel_data:

        A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances.
        The order of these arguments determines the order of the arguments
        to the generated kernel.

        May also contain :class:`TemporaryVariable` instances (which do not
        give rise to kernel-level arguments).

        The string ``"..."`` may be passed as one of the entries
        of the list, in which case loopy will infer names, shapes,
        and types of arguments from the kernel code. It is
        possible to just pass the list ``["..."]``, in which case
        all arguments are inferred.

        In Python 3, the string ``"..."`` may be spelled somewhat more sensibly
        as just ``...`` (the ellipsis), for the same meaning.

        As an additional option, each argument may be specified as just a name
        (a string). This is useful to specify argument ordering. All other
        characteristics of the named arguments are inferred.

        A single comma-separated string is also accepted in place of the
        list, and an entry whose name contains commas is split into one
        entry per name.

    The following keyword arguments are recognized:

    :arg preambles: a list of (tag, code) tuples that identify preamble
        snippets. Each tag's snippet is only included once, at its first
        occurrence. The preambles will be inserted in order of their tags.
    :arg preamble_generators: a list of functions of signature
        (seen_dtypes, seen_functions) where seen_functions is a set of
        (name, c_name, arg_dtypes), generating extra entries for *preambles*.
    :arg defines: a dictionary of replacements to be made in instructions given
        as strings before parsing. A macro instance intended to be replaced
        should look like "MACRO" in the instruction code. The expansion given
        in this parameter is allowed to be a list. In this case, instructions
        are generated for *each* combination of macro values.

        These defines may also be used in the domain and in argument shapes and
        strides. They are expanded only upon kernel creation.
    :arg default_order: "C" (default) or "F"
    :arg default_offset: 0 or :class:`loopy.auto`. The default value of
        *offset* in :attr:`loopy.kernel.data.GlobalArg` for guessed arguments.
        Defaults to 0.
    :arg function_manglers: list of functions of signature (name, arg_dtypes)
        returning a tuple (result_dtype, c_name)
        or a tuple (result_dtype, c_name, arg_dtypes),
        where c_name is the C-level function to be called.
    :arg symbol_manglers: list of functions of signature (name) returning
        a tuple (result_dtype, c_name), where c_name is the C-level symbol to
        be evaluated.
    :arg assumptions: the initial implemented_domain, captures assumptions
        on loop domain parameters. (an isl.Set or a string in
        :ref:`isl-syntax`. If given as a string, only the CONDITIONS part of
        the set notation should be given.)
    :arg local_sizes: A dictionary from integers to integers, mapping
        workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be
        length 16.
    :arg silenced_warnings: a list (or semicolon-separated string) of warnings
        to silence
    :arg options: an instance of :class:`loopy.Options` or an equivalent
        string representation
    :arg flags: deprecated spelling of *options*; emits a
        :class:`DeprecationWarning`.
    :arg temporary_variables: a dictionary of temporary variables by name.
        It is copied, and :class:`TemporaryVariable` entries found in
        *kernel_data* are added to the copy.
    :arg target: an instance of :class:`loopy.target.TargetBase`, or *None*,
        to use an OpenCL target.

    Any remaining keyword arguments are forwarded to
    :class:`loopy.kernel.LoopKernel`.

    :returns: the kernel, after being passed through
        :func:`loopy.preprocess.prepare_for_caching`.
    :raises TypeError: if both *options* and *flags* are given.
    :raises RuntimeError: if instruction parsing yields an object that is
        neither an instruction nor a substitution rule.
    """
    # NOTE: the list default of *kernel_data* is only ever read in this
    # function, never mutated, so sharing it between calls is harmless.

    defines = kwargs.pop("defines", {})
    default_order = kwargs.pop("default_order", "C")
    default_offset = kwargs.pop("default_offset", 0)
    silenced_warnings = kwargs.pop("silenced_warnings", [])
    options = kwargs.pop("options", None)
    flags = kwargs.pop("flags", None)
    target = kwargs.pop("target", None)

    if target is None:
        # Use the PyOpenCL target when pyopencl is importable, plain OpenCL
        # otherwise.
        try:
            import pyopencl  # noqa
        except ImportError:
            from loopy.target.opencl import OpenCLTarget
            target = OpenCLTarget()
        else:
            from loopy.target.pyopencl import PyOpenCLTarget
            target = PyOpenCLTarget()

    if flags is not None:
        if options is not None:
            raise TypeError("may not pass both 'options' and 'flags'")

        from warnings import warn
        warn("'flags' is deprecated. Use 'options' instead",
                DeprecationWarning, stacklevel=2)
        options = flags

    from loopy.options import make_options
    options = make_options(options)

    if isinstance(silenced_warnings, str):
        silenced_warnings = silenced_warnings.split(";")

    # {{{ separate temporary variables and arguments, take care of names with commas

    from loopy.kernel.data import TemporaryVariable, ArrayBase

    if isinstance(kernel_data, str):
        kernel_data = kernel_data.split(",")

    kernel_args = []
    temporary_variables = kwargs.pop("temporary_variables", {}).copy()
    for dat in kernel_data:
        if dat is Ellipsis or isinstance(dat, str):
            kernel_args.append(dat)
            continue

        if isinstance(dat, ArrayBase) and isinstance(dat.shape, tuple):
            # Expand defines in every known shape axis; unknown (None) axes
            # are passed through untouched.
            dat = dat.copy(shape=tuple(
                expand_defines_in_expr(shape_axis, defines)
                if shape_axis is not None else shape_axis
                for shape_axis in dat.shape))

        for arg_name in dat.name.split(","):
            arg_name = arg_name.strip()
            if not arg_name:
                continue

            my_dat = dat.copy(name=arg_name)
            if isinstance(dat, TemporaryVariable):
                # Store the renamed copy: *dat* may still carry the full
                # comma-separated name.
                temporary_variables[my_dat.name] = my_dat
            else:
                kernel_args.append(my_dat)

    del kernel_data

    # }}}

    # {{{ instruction/subst parsing

    parsed_instructions = []
    kwargs["substitutions"] = substitutions = {}
    inames_to_dup = []

    if isinstance(instructions, str):
        instructions = [instructions]

    for insn in instructions:
        for new_insn, insn_inames_to_dup in parse_if_necessary(insn, defines):
            if isinstance(new_insn, InstructionBase):
                parsed_instructions.append(new_insn)

                # Need to maintain 1-to-1 correspondence to instructions
                inames_to_dup.append(insn_inames_to_dup)

            elif isinstance(new_insn, SubstitutionRule):
                substitutions[new_insn.name] = new_insn

                assert not insn_inames_to_dup
            else:
                raise RuntimeError("unexpected type in instruction parsing")

    instructions = parsed_instructions
    del parsed_instructions

    # }}}

    # {{{ find/create isl_context

    for domain in domains:
        if isinstance(domain, isl.BasicSet):
            assert domain.get_ctx() == isl.DEFAULT_CONTEXT

    # }}}

    domains = parse_domains(domains, defines)

    arg_guesser = ArgumentGuesser(domains, instructions,
            temporary_variables, substitutions,
            default_offset)

    kernel_args = arg_guesser.convert_names_to_full_args(kernel_args)
    kernel_args = arg_guesser.guess_kernel_args_if_requested(kernel_args)

    from loopy.kernel import LoopKernel
    knl = LoopKernel(domains, instructions, kernel_args,
            temporary_variables=temporary_variables,
            silenced_warnings=silenced_warnings,
            options=options,
            target=target,
            **kwargs)

    # Apply the iname duplications requested while parsing; inames_to_dup
    # lines up one-to-one with the kernel's instructions.
    from loopy import duplicate_inames
    for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup):
        for old_iname, new_iname in insn_inames_to_dup:
            knl = duplicate_inames(knl, old_iname,
                    within=insn.id, new_inames=new_iname)

    check_for_nonexistent_iname_deps(knl)

    knl = tag_reduction_inames_as_sequential(knl)
    knl = create_temporaries(knl, default_order)
    knl = determine_shapes_of_temporaries(knl)
    knl = expand_cses(knl)
    knl = expand_defines_in_shapes(knl, defines)
    knl = guess_arg_shape_if_requested(knl, default_order)
    knl = apply_default_order_to_args(knl, default_order)
    knl = resolve_wildcard_deps(knl)

    # -------------------------------------------------------------------------
    # Ordering dependency:
    # -------------------------------------------------------------------------
    # Must create temporaries before checking for writes to temporary variables
    # that are domain parameters.
    # -------------------------------------------------------------------------

    check_for_multiple_writes_to_loop_bounds(knl)
    check_for_duplicate_names(knl)
    check_written_variable_names(knl)

    from loopy.preprocess import prepare_for_caching
    knl = prepare_for_caching(knl)

    return knl
def test_tim2d(ctx_factory):
    """Compare a transformed ``semlap2D`` kernel against its untransformed form.

    The kernel is built from substitution rules using ``sum`` reductions,
    gets ``o`` duplicated for ``id:ur`` and ``id:us``, and is then checked
    with :func:`loopy.auto_test_vs_ref` for ``K = 1000``.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")  # noqa

    # K - run-time symbolic
    field_shape = (K_sym, n, n)

    domain = (
        "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n)

    rules_and_insns = [
        "ur(a,b) := sum(o, D[a,o]*u[e,o,b])",
        "us(a,b) := sum(o, D[b,o]*u[e,a,o])",

        #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",

        "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
        "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
        "lap[e,i,j] = "
        " sum(m, D[m,i]*Gux(m,j))"
        "+ sum(m, D[m,j]*Guy(i,m))"
    ]

    kernel_data = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(3,)+field_shape, order=order),
        # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        # lp.ImageArg("D", dtype, shape=(n, n)),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        domain, rules_and_insns, kernel_data,
        name="semlap2D", assumptions="K>=1")

    for match in ("id:ur", "id:us"):
        knl = lp.duplicate_inames(knl, "o", within=match)

    # The untransformed kernel serves as the reference.
    seq_knl = knl

    def variant_orig(kernel):
        kernel = lp.tag_inames(kernel, {"i": "l.0", "j": "l.1", "e": "g.0"})

        kernel = lp.add_prefetch(kernel, "D[:,:]")
        kernel = lp.add_prefetch(kernel, "u[e, :, :]")

        kernel = lp.precompute(kernel, "ur(m,j)", ["m", "j"])
        kernel = lp.precompute(kernel, "us(i,m)", ["i", "m"])

        kernel = lp.precompute(kernel, "Gux(m,j)", ["m", "j"])
        kernel = lp.precompute(kernel, "Guy(i,m)", ["i", "m"])

        kernel = lp.add_prefetch(kernel, "G$x[:,e,:,:]")
        kernel = lp.add_prefetch(kernel, "G$y[:,e,:,:]")

        kernel = lp.tag_inames(kernel, {"o": "unr"})
        kernel = lp.tag_inames(kernel, {"m": "unr"})

        kernel = lp.set_instruction_priority(kernel, "id:D_fetch", 5)
        print(kernel)

        return kernel

    for variant in (variant_orig,):
        K = 1000  # noqa
        transformed = variant(knl)
        gflops = K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9
        lp.auto_test_vs_ref(
            seq_knl, ctx, transformed,
            op_count=[gflops],
            op_label=["GFlops"],
            parameters={"K": K})
def get_kernel(self):
    """Assemble and return the loopy kernel for this object.

    For each entry of ``global_qbx_centers`` the kernel looks up the
    center's box through ``qbx_center_to_target_box``, walks that box's
    source boxes and their sources, and writes the reduction over
    ``(isrc_box, isrc)`` of ``strength*coeff<i>`` into
    ``qbx_expansions[tgt_icenter, <i>]``.

    :returns: the kernel built by :func:`loopy.make_kernel`, after
        ``self.expansion.prepare_loopy_kernel`` has been applied, ``idim``
        has been duplicated (tagged ``unr``) for ``fetch_center``, and the
        remaining ``idim`` has been tagged ``unr``.
    """
    ncoeffs = len(self.expansion)

    from sumpy.tools import gather_loopy_source_arguments

    explicit_args = [
        lp.GlobalArg("sources", None, shape=(self.dim, "nsources"),
                     dim_tags="sep,c"),
        lp.GlobalArg("strengths", None, shape="nsources"),
        lp.GlobalArg("qbx_center_to_target_box", None, shape=None),
        lp.GlobalArg("source_box_starts,source_box_lists",
                     None, shape=None),
        lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
                     None, shape=None),
        lp.GlobalArg("qbx_centers", None, shape="dim, ncenters",
                     dim_tags="sep,c"),
        lp.GlobalArg("qbx_expansions", None, shape=("ncenters", ncoeffs)),
        lp.ValueArg("ncenters", np.int32),
        lp.ValueArg("nsources", np.int32),
        "...",
    ]
    arguments = explicit_args + gather_loopy_source_arguments(
        [self.expansion])

    domains = [
        "{[itgt_center]: 0<=itgt_center<ntgt_centers}",
        "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_stop}",
        "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
    ]

    # Expansion-specific instructions first, then the fixed traversal code.
    instructions = self.get_loopy_instructions() + ["""
        <> tgt_icenter = global_qbx_centers[itgt_center]
        <> itgt_box = qbx_center_to_target_box[tgt_icenter]
        <> isrc_box_start = source_box_starts[itgt_box]
        <> isrc_box_stop = source_box_starts[itgt_box+1]
        <> src_ibox = source_box_lists[isrc_box]
        <> isrc_start = box_source_starts[src_ibox]
        <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]
        <> center[idim] = qbx_centers[idim, tgt_icenter] \
            {id=fetch_center}
        <> a[idim] = center[idim] - sources[idim, isrc] {id=compute_a}
        <> strength = strengths[isrc]
        qbx_expansions[tgt_icenter, ${COEFFIDX}] = \
            sum((isrc_box, isrc), strength*coeff${COEFFIDX}) \
            {id_prefix=write_expn}
        """]

    loopy_knl = lp.make_kernel(
        domains,
        instructions,
        arguments,
        name=self.name,
        assumptions="ntgt_centers>=1",
        defines={
            "dim": self.dim,
            "COEFFIDX": [str(i) for i in range(ncoeffs)],
        },
        silenced_warnings="write_race(write_expn*)")

    loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)

    # fetch_center gets its own unrolled ``idim``; the original ``idim`` is
    # then unrolled as well.
    loopy_knl = lp.duplicate_inames(
        loopy_knl, "idim", "fetch_center", tags={"idim": "unr"})
    return lp.tag_inames(loopy_knl, {"idim": "unr"})
def get_kernel(self):
    """Assemble and return the loopy kernel for this object.

    For each ``icenter`` the kernel looks up a box through
    ``qbx_center_to_target_box`` and ``target_boxes``, reads that box's
    coefficients from ``expansions``, and adds ``coeff<i>`` onto
    ``qbx_expansions[icenter, <i>]``.

    :returns: the kernel built by :func:`loopy.make_kernel`, after both
        expansions' ``prepare_loopy_kernel`` have been applied, ``idim`` has
        been duplicated (tagged ``unr``) for ``fetch_tgt_center``, and the
        remaining ``idim`` has been tagged ``unr``.
    """
    ncoeff_src = len(self.src_expansion)
    ncoeff_tgt = len(self.tgt_expansion)

    from sumpy.tools import gather_loopy_arguments

    domains = [
        "{[icenter]: 0<=icenter<ncenters}",
        "{[idim]: 0<=idim<dim}",
    ]

    # Translation instructions first, then the fixed lookup/accumulate code.
    instructions = self.get_translation_loopy_insns() + ["""
        <> isrc_box = qbx_center_to_target_box[icenter]

        # The box's expansions which we're translating here
        # (our source) is, globally speaking, a target box.

        <> src_ibox = target_boxes[isrc_box] \
            {id=read_src_ibox}
        <> tgt_center[idim] = qbx_centers[idim, icenter] \
            {id=fetch_tgt_center}
        <> src_center[idim] = centers[idim, src_ibox] \
            {id=fetch_src_center}
        <> d[idim] = tgt_center[idim] - src_center[idim]
        <> src_coeff${SRC_COEFFIDX} = \
            expansions[src_ibox, ${SRC_COEFFIDX}] \
            {dep=read_src_ibox}
        qbx_expansions[icenter, ${TGT_COEFFIDX}] = \
            qbx_expansions[icenter, ${TGT_COEFFIDX}] \
            + coeff${TGT_COEFFIDX} \
            {id_prefix=write_expn}
        """]

    explicit_args = [
        lp.GlobalArg("target_boxes", None, shape=None,
                     offset=lp.auto),
        lp.GlobalArg("centers", None, shape="dim, naligned_boxes"),
        lp.GlobalArg("qbx_centers", None, shape="dim, ncenters",
                     dim_tags="sep,c"),
        lp.ValueArg("naligned_boxes,nboxes", np.int32),
        lp.GlobalArg("expansions", None, shape=("nboxes", ncoeff_src)),
        "...",
    ]
    kernel_data = explicit_args + gather_loopy_arguments(
        [self.src_expansion, self.tgt_expansion])

    loopy_knl = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        name=self.name,
        assumptions="ncenters>=1",
        defines={
            "dim": self.dim,
            "nchildren": 2**self.dim,
            "SRC_COEFFIDX": [str(i) for i in range(ncoeff_src)],
            "TGT_COEFFIDX": [str(i) for i in range(ncoeff_tgt)],
        },
        silenced_warnings="write_race(write_expn*)")

    for expn in (self.src_expansion, self.tgt_expansion):
        loopy_knl = expn.prepare_loopy_kernel(loopy_knl)

    # fetch_tgt_center gets its own unrolled ``idim``; the original ``idim``
    # is then unrolled as well.
    loopy_knl = lp.duplicate_inames(
        loopy_knl, "idim", "fetch_tgt_center", tags={"idim": "unr"})
    return lp.tag_inames(loopy_knl, {"idim": "unr"})
def get_kernel(self):
    """Assemble and return the loopy kernel for this object.

    For each ``icenter`` the kernel looks up the containing box through
    ``qbx_center_to_target_box``, walks the boxes listed for it in
    ``src_box_starts`` / ``src_box_lists``, reads their coefficients from
    ``src_expansions``, and writes the reduction over ``isrc_box`` of
    ``coeff<i>`` into ``qbx_expansions[icenter, <i>]``.

    :returns: the kernel built by :func:`loopy.make_kernel`, after both
        expansions' ``prepare_loopy_kernel`` have been applied, ``idim`` has
        been duplicated (tagged ``unr``) for ``fetch_tgt_center``, and the
        remaining ``idim`` has been tagged ``unr``.
    """
    ncoeff_src = len(self.src_expansion)
    ncoeff_tgt = len(self.tgt_expansion)

    from sumpy.tools import gather_loopy_arguments

    domains = [
        "{[icenter]: 0<=icenter<ncenters}",
        "{[isrc_box]: isrc_start<=isrc_box<isrc_stop}",
        "{[idim]: 0<=idim<dim}",
    ]

    # Translation instructions first, then the fixed traversal code.
    instructions = self.get_translation_loopy_insns() + ["""
        <> icontaining_tgt_box = qbx_center_to_target_box[icenter]
        <> tgt_center[idim] = qbx_centers[idim, icenter] \
            {id=fetch_tgt_center}
        <> isrc_start = src_box_starts[icontaining_tgt_box]
        <> isrc_stop = src_box_starts[icontaining_tgt_box+1]
        <> src_ibox = src_box_lists[isrc_box] \
            {id=read_src_ibox}
        <> src_center[idim] = centers[idim, src_ibox] \
            {id=fetch_src_center}
        <> d[idim] = tgt_center[idim] - src_center[idim]
        <> src_coeff${SRC_COEFFIDX} = \
            src_expansions[src_ibox, ${SRC_COEFFIDX}] \
            {dep=read_src_ibox}
        qbx_expansions[icenter, ${TGT_COEFFIDX}] = \
            sum(isrc_box, coeff${TGT_COEFFIDX}) \
            {id_prefix=write_expn}
        """]

    explicit_args = [
        lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
        lp.GlobalArg("src_box_starts, src_box_lists",
                     None, shape=None, strides=(1,)),
        lp.GlobalArg("qbx_centers", None, shape="dim, ncenters",
                     dim_tags="sep,c"),
        lp.ValueArg("aligned_nboxes,nboxes", np.int32),
        lp.GlobalArg("src_expansions", None, shape=("nboxes", ncoeff_src)),
        lp.GlobalArg("qbx_expansions", None, shape=("ncenters", ncoeff_tgt)),
        "...",
    ]
    kernel_data = explicit_args + gather_loopy_arguments(
        [self.src_expansion, self.tgt_expansion])

    loopy_knl = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        name=self.name,
        assumptions="ncenters>=1",
        defines={
            "dim": self.dim,
            "SRC_COEFFIDX": [str(i) for i in range(ncoeff_src)],
            "TGT_COEFFIDX": [str(i) for i in range(ncoeff_tgt)],
        },
        silenced_warnings="write_race(write_expn*)")

    for expn in (self.src_expansion, self.tgt_expansion):
        loopy_knl = expn.prepare_loopy_kernel(loopy_knl)

    # fetch_tgt_center gets its own unrolled ``idim``; the original ``idim``
    # is then unrolled as well.
    loopy_knl = lp.duplicate_inames(
        loopy_knl, "idim", "fetch_tgt_center", tags={"idim": "unr"})
    return lp.tag_inames(loopy_knl, {"idim": "unr"})
def test_tim2d(ctx_factory):
    """Compare a transformed ``semlap2D`` kernel against its untransformed form.

    The kernel is built from substitution rules using ``simul_reduce``
    reductions, gets ``o`` duplicated for ``id:ur`` and ``id:us``, and is
    then checked with :func:`loopy.auto_test_vs_ref` for ``K = 1000``.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")  # noqa

    # K - run-time symbolic
    field_shape = (K_sym, n, n)

    domain = (
        "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n)

    rules_and_insns = [
        "ur(a,b) := simul_reduce(sum, o, D[a,o]*u[e,o,b])",
        "us(a,b) := simul_reduce(sum, o, D[b,o]*u[e,a,o])",

        #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",

        "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
        "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
        "lap[e,i,j] = "
        " simul_reduce(sum, m, D[m,i]*Gux(m,j))"
        "+ simul_reduce(sum, m, D[m,j]*Guy(i,m))"
    ]

    kernel_data = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(3, ) + field_shape, order=order),
        # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        # lp.ImageArg("D", dtype, shape=(n, n)),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        domain, rules_and_insns, kernel_data,
        name="semlap2D", assumptions="K>=1")

    for match in ("id:ur", "id:us"):
        knl = lp.duplicate_inames(knl, "o", within=match)

    # The untransformed kernel serves as the reference.
    seq_knl = knl

    def variant_orig(kernel):
        kernel = lp.tag_inames(kernel, {"i": "l.0", "j": "l.1", "e": "g.0"})

        kernel = lp.add_prefetch(kernel, "D[:,:]")
        kernel = lp.add_prefetch(kernel, "u[e, :, :]")

        kernel = lp.precompute(kernel, "ur(m,j)", ["m", "j"])
        kernel = lp.precompute(kernel, "us(i,m)", ["i", "m"])

        kernel = lp.precompute(kernel, "Gux(m,j)", ["m", "j"])
        kernel = lp.precompute(kernel, "Guy(i,m)", ["i", "m"])

        kernel = lp.add_prefetch(kernel, "G$x[:,e,:,:]")
        kernel = lp.add_prefetch(kernel, "G$y[:,e,:,:]")

        kernel = lp.tag_inames(kernel, {"o": "unr"})
        kernel = lp.tag_inames(kernel, {"m": "unr"})

        kernel = lp.set_instruction_priority(kernel, "id:D_fetch", 5)
        print(kernel)

        return kernel

    for variant in (variant_orig,):
        K = 1000  # noqa
        transformed = variant(knl)
        gflops = K * (n * n * n * 2 * 2 + n * n * 2 * 3 + n**3 * 2 * 2) / 1e9
        lp.auto_test_vs_ref(
            seq_knl, ctx, transformed,
            op_count=[gflops],
            op_label=["GFlops"],
            parameters={"K": K})