def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse the Fortran-sourced GNUMA "strong volume" r/s kernels and apply a
    staged sequence of loopy optimizations, then auto-test the optimized
    kernel against the unoptimized reference.

    :arg ctx_factory: callable returning a PyOpenCL context (pytest fixture).
    :arg ilp_multiple: if > 1, the ``k`` loop is split with an ILP inner loop.
    :arg Nq: quadrature point count, fixed into the kernel as a parameter.
    :arg opt_level: how many optimization stages to apply; ``tap_hsv``
        snapshots the kernel after the stage matching this level, and the
        snapshot is what ultimately gets tested.

    NOTE(review): uses older loopy API spellings (``set_loop_priority``,
    ``get_op_poly``, ``get_gmem_access_poly``) — presumably matched to the
    loopy version this test was written against; confirm before upgrading.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # The Fortran source uses a "datafloat" placeholder type; pin it to
    # single precision before parsing.
    source = source.replace("datafloat", "real*4")

    # Pick out the r- and s-direction kernels from the parsed Fortran.
    # NOTE(review): assumes parse order yields the R kernel first — the
    # two-element unpack relies on exactly two matches; confirm against
    # strongVolumeKernels.f90.
    hsv_r, hsv_s = [
        knl for knl in lp.parse_fortran(source, filename,
            auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    # Tag every instruction by origin so post-fusion matches can target
    # one sub-kernel ("tag:rknl" / "tag:sknl").
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
        set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    # One workgroup per element; (i, j) mapped onto the 2D local grid.
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Keep the untransformed kernel around as the correctness reference.
    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        # Split k with an ILP inner loop of length 2; the precompute below
        # then also sweeps over k_inner (as the new iname "kk").
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Turn each flux assignment into a substitution rule and precompute it
    # into a named temporary, paired r/s so the temporaries can be aliased.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # The single instruction that consumes this flux variable; its
            # id is needed below to scope the iname rename.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                    flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var + "_subst",
                flux_inames + ilp_inames,
                temporary_name=flux_store_name,
                precompute_inames=flux_precomp_inames + flux_ilp_inames,
                default_tag=None)
            # Transpose the storage layout of the s-direction temporaries
            # relative to the r-direction ones.
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give each reader its own "n" iname (e.g. n_rhsu for flux_rhsu)
            # so the reduction loops stay independent after precompute.
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                within="id:" + reader.id, existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    # r- and s-side flux temporaries are live at disjoint times, so let
    # each group share storage.
    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Materialize the remaining local-prep substitution rules (the Jinv*
    # and s-side ones were already folded into the flux precomputes above).
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    # Accumulate rhsQ updates in a private buffer; written back as
    # "base + buffer" at the end.
    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
        fetch_bounding_box=True, default_tag="for",
        init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
        {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
        vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Rewind to the snapshot taken at the requested optimization level.
    hsv = tap_hsv

    if 1:
        # Print op/memory-access statistics for the kernel under test.
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero", "-cl-fast-relaxed-math",
        "-cl-finite-math-only", "-cl-mad-enable", "-cl-no-signed-zeros",
        ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
        parameters=dict(elements=300), quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
def __call__(self, ary):
    """Resample *ary* through this discretization connection and return the
    result as a new :class:`~meshmode.dof_array.DOFArray`.

    For each target group, one loopy kernel per batch is generated
    (matrix-application or index-picking, depending on whether
    ``_resample_point_pick_indices`` found a pure permutation), the kernels'
    namespaces are made disjoint, and they are fused into a single program
    that is executed once per group.

    :arg ary: a :class:`~meshmode.dof_array.DOFArray`, one entry per group
        of ``self.from_discr``.
    :raises TypeError: if *ary* is not a :class:`DOFArray`.
    :raises ValueError: if *ary* does not have one entry per from-group.
    """
    from meshmode.dof_array import DOFArray
    if not isinstance(ary, DOFArray):
        raise TypeError("non-array passed to discretization connection")

    actx = ary.array_context

    # Kernel builders are memoized on the array context, keyed on
    # surjectivity (a surjective connection writes every output entry, so
    # no zero-initialization pass is needed).
    @memoize_in(actx, (
        DirectDiscretizationConnection,
        "resample_by_mat_knl",
        self.is_surjective,
        ))
    def mat_knl():
        # Dense resampling: result[tgt_el] = resample_mat @ ary[src_el].
        if self.is_surjective:
            domains = [
                """
                {[iel, idof, j]:
                    0<=iel<nelements and
                    0<=idof<n_to_nodes and
                    0<=j<n_from_nodes}
                """,
            ]

            instructions = """
            result[to_element_indices[iel], idof] \
                = sum(j, resample_mat[idof, j] \
                * ary[from_element_indices[iel], j])
            """
        else:
            # Non-surjective: zero the full result first, then scatter the
            # resampled values behind a global barrier.
            domains = [
                """
                {[iel_init, idof_init]:
                    0<=iel_init<nelements_result and
                    0<=idof_init<n_to_nodes}
                """,
                """
                {[iel, idof, j]:
                    0<=iel<nelements and
                    0<=idof<n_to_nodes and
                    0<=j<n_from_nodes}
                """,
            ]

            instructions = """
            result[iel_init, idof_init] = 0 {id=init}
            ... gbarrier {id=barrier, dep=init}
            result[to_element_indices[iel], idof] \
                = sum(j, resample_mat[idof, j] \
                * ary[from_element_indices[iel], j]) {dep=barrier}
            """

        knl = make_loopy_program(
            domains, instructions,
            [
                lp.GlobalArg("result", None,
                    shape="nelements_result, n_to_nodes",
                    offset=lp.auto),
                lp.GlobalArg("ary", None,
                    shape="nelements_vec, n_from_nodes",
                    offset=lp.auto),
                lp.ValueArg("nelements_result", np.int32),
                lp.ValueArg("nelements_vec", np.int32),
                lp.ValueArg("n_from_nodes", np.int32),
                "...",
            ],
            name="resample_by_mat")

        return knl

    @memoize_in(actx, (DirectDiscretizationConnection,
        "resample_by_picking_knl",
        self.is_surjective))
    def pick_knl():
        # Permutation resampling: gather via pick_list instead of a matmul.
        if self.is_surjective:
            domains = [
                """{[iel, idof]:
                    0<=iel<nelements and
                    0<=idof<n_to_nodes}"""
            ]
            instructions = """
            result[to_element_indices[iel], idof] \
                = ary[from_element_indices[iel], pick_list[idof]]
            """
        else:
            domains = [
                """
                {[iel_init, idof_init]:
                    0<=iel_init<nelements_result and
                    0<=idof_init<n_to_nodes}
                """,
                """
                {[iel, idof]:
                    0<=iel<nelements and
                    0<=idof<n_to_nodes}
                """
            ]
            instructions = """
            result[iel_init, idof_init] = 0 {id=init}
            ... gbarrier {id=barrier, dep=init}
            result[to_element_indices[iel], idof] \
                = ary[from_element_indices[iel], pick_list[idof]] {dep=barrier}
            """

        knl = make_loopy_program(
            domains, instructions,
            [
                lp.GlobalArg("result", None,
                    shape="nelements_result, n_to_nodes",
                    offset=lp.auto),
                lp.GlobalArg("ary", None,
                    shape="nelements_vec, n_from_nodes",
                    offset=lp.auto),
                lp.ValueArg("nelements_result", np.int32),
                lp.ValueArg("nelements_vec", np.int32),
                lp.ValueArg("n_from_nodes", np.int32),
                "...",
            ],
            name="resample_by_picking")

        return knl

    if ary.shape != (len(self.from_discr.groups),):
        raise ValueError("invalid shape of incoming resampling data")

    group_idx_to_result = []

    for i_tgrp, (tgrp, cgrp) in enumerate(
            zip(self.to_discr.groups, self.groups)):

        kernels = []  # get kernels for each batch; to be fused eventually
        kwargs = {}  # kwargs to the fused kernel
        for i_batch, batch in enumerate(cgrp.batches):
            # Empty batches contribute nothing; skip them.
            if batch.from_element_indices.size == 0:
                continue

            point_pick_indices = self._resample_point_pick_indices(
                actx, i_tgrp, i_batch)

            if point_pick_indices is None:
                knl = mat_knl()
                knl = lp.rename_argument(knl, "resample_mat",
                    f"resample_mat_{i_batch}")
                kwargs[f"resample_mat_{i_batch}"] = (
                    self._resample_matrix(actx, i_tgrp, i_batch))
            else:
                knl = pick_knl()
                knl = lp.rename_argument(knl, "pick_list",
                    f"pick_list_{i_batch}")
                kwargs[f"pick_list_{i_batch}"] = point_pick_indices

            # {{{ enforce different namespaces for the kernels

            for iname in knl.all_inames():
                knl = lp.rename_iname(knl, iname, f"{iname}_{i_batch}")

            knl = lp.rename_argument(knl, "ary", f"ary_{i_batch}")
            knl = lp.rename_argument(knl, "from_element_indices",
                f"from_element_indices_{i_batch}")
            knl = lp.rename_argument(knl, "to_element_indices",
                f"to_element_indices_{i_batch}")
            knl = lp.rename_argument(knl, "nelements", f"nelements_{i_batch}")

            # }}}

            kwargs[f"ary_{i_batch}"] = ary[batch.from_group_index]
            kwargs[f"from_element_indices_{i_batch}"] = (
                batch.from_element_indices)
            kwargs[f"to_element_indices_{i_batch}"] = (
                batch.to_element_indices)

            kernels.append(knl)

        fused_knl = lp.fuse_kernels(kernels)

        # order of operations doesn't matter
        fused_knl = lp.add_nosync(fused_knl, "global",
            "writes:result", "writes:result", bidirectional=True, force=True)

        result_dict = actx.call_loopy(fused_knl,
            nelements_result=tgrp.nelements,
            n_to_nodes=tgrp.nunit_dofs,
            **kwargs)

        group_idx_to_result.append(result_dict["result"])

    from meshmode.dof_array import DOFArray
    return DOFArray.from_list(actx, group_idx_to_result)
def test_write_block_matrix_fusion(ctx_factory):
    """Fuse one matrix-zeroing kernel with two block-writer kernels that all
    target the same global matrix in disjoint regions, and verify that the
    data-flow specification handed to :func:`lp.fuse_kernels` is honored:
    the fused program must produce the expected block-diagonal result.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    def make_zero_fill_prg():
        # Program that zero-initializes the full n x m output matrix.
        return lp.make_kernel(
            ["{[idof]: 0 <= idof < n}", "{[jdof]: 0 <= jdof < m}"],
            """
                result[idof, jdof] = 0 {id=init}
            """,
            [
                lp.GlobalArg("result", None, shape="n, m", offset=lp.auto),
                lp.ValueArg("n, m", np.int32),
                "...",
            ],
            options=lp.Options(return_dict=True),
            default_offset=lp.auto,
            name="init_a_global_matrix",
        )

    def make_block_write_prg():
        # Program that copies an ndofs x mdofs block into `result` at a
        # run-time (offset_i, offset_j) position.
        return lp.make_kernel(
            ["{[idof]: 0 <= idof < ndofs}", "{[jdof]: 0 <= jdof < mdofs}"],
            """
                result[offset_i + idof, offset_j + jdof] = mat[idof, jdof]
            """,
            [
                lp.GlobalArg("result", None, shape="n, m", offset=lp.auto),
                lp.ValueArg("n, m", np.int32),
                lp.GlobalArg("mat", None, shape="ndofs, mdofs",
                             offset=lp.auto),
                lp.ValueArg("offset_i", np.int32),
                lp.ValueArg("offset_j", np.int32),
                "...",
            ],
            options=lp.Options(return_dict=True),
            default_offset=lp.auto,
            name="write_into_global_matrix",
        )

    # Expected answer: a 10x10 matrix whose block diagonal carries two
    # random 5x5 blocks, with zeros everywhere else.
    n = 10
    block_n = 5
    mat1 = np.random.randn(block_n, block_n)
    mat2 = np.random.randn(block_n, block_n)
    answer = np.block([
        [mat1, np.zeros((block_n, block_n))],
        [np.zeros((block_n, block_n)), mat2],
    ])

    kwargs = {"n": n, "m": n}

    # Build the kernel list: the zero-fill first, then one block writer per
    # block, with every argument and iname renamed so the fused program has
    # disjoint namespaces.
    kernels = [make_zero_fill_prg()]
    for idx, (offset, mat) in enumerate([(0, mat1), (block_n, mat2)]):
        prg = lp.rename_argument(make_block_write_prg(), "mat", f"mat_{idx}")
        kwargs[f"mat_{idx}"] = mat

        for iname in prg.default_entrypoint.all_inames():
            prg = lp.rename_iname(prg, iname, f"{iname}_{idx}")

        prg = lp.rename_argument(prg, "ndofs", f"ndofs_{idx}")
        prg = lp.rename_argument(prg, "mdofs", f"mdofs_{idx}")
        kwargs[f"ndofs_{idx}"] = block_n
        kwargs[f"mdofs_{idx}"] = block_n

        prg = lp.rename_argument(prg, "offset_i", f"offset_i_{idx}")
        prg = lp.rename_argument(prg, "offset_j", f"offset_j_{idx}")
        kwargs[f"offset_i_{idx}"] = offset
        kwargs[f"offset_j_{idx}"] = offset

        kernels.append(prg)

    # "result" flows from the init kernel into each writer in turn.
    fused_knl = lp.fuse_kernels(
        kernels,
        data_flow=[("result", 0, 1), ("result", 1, 2)],
    )
    # The writers touch disjoint blocks, so no ordering between their
    # global writes is required.
    fused_knl = lp.add_nosync(
        fused_knl, "global",
        "writes:result", "writes:result",
        bidirectional=True, force=True)

    evt, result = fused_knl(queue, **kwargs)
    np.testing.assert_allclose(result["result"], answer)
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse the Fortran-sourced GNUMA "strong volume" r/s kernels, apply a
    staged sequence of loopy optimizations, and auto-test the optimized
    kernel against the unoptimized reference.

    :arg ctx_factory: callable returning a PyOpenCL context (pytest fixture).
    :arg ilp_multiple: if > 1, the ``k`` loop is split with an ILP inner loop.
    :arg Nq: quadrature point count, fixed into the kernel as a parameter.
    :arg opt_level: how many optimization stages to apply; ``tap_hsv``
        snapshots the kernel after the stage matching this level, and the
        snapshot is what ultimately gets tested.

    NOTE(review): this variant uses ``lp.tag_data_axes`` — presumably the
    older spelling of ``tag_array_axes`` in the loopy version targeted here;
    confirm before updating the loopy dependency.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # The Fortran source uses a "datafloat" placeholder type; pin it to
    # single precision before parsing.
    source = source.replace("datafloat", "real*4")

    # Pick out the r- and s-direction kernels from the parsed Fortran.
    # NOTE(review): assumes the two-element unpack sees exactly two matches,
    # R kernel first; confirm against strongVolumeKernels.f90.
    hsv_r, hsv_s = [
        knl for knl in lp.parse_fortran(source, filename,
            auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    # Tag every instruction by origin so post-fusion matches can target
    # one sub-kernel ("tag:rknl" / "tag:sknl").
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
        fix_euler_parameters,
        set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    # One workgroup per element; (i, j) mapped onto the 2D local grid.
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Keep the untransformed kernel around as the correctness reference.
    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        # Split k with an ILP inner loop of length 2; the precompute below
        # then also sweeps over k_inner (as the new iname "kk").
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Turn each flux assignment into a substitution rule and precompute it
    # into a named temporary, paired r/s so the temporaries can be aliased.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # The single instruction that consumes this flux variable; its
            # id is needed below to scope the iname rename.
            reader, = lp.find_instructions(hsv,
                "tag:{knl_tag} and reads:{flux_var}"
                .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst",
                flux_inames + ilp_inames,
                temporary_name=flux_store_name,
                precompute_inames=flux_precomp_inames + flux_ilp_inames,
                default_tag=None)
            # Transpose the storage layout of the s-direction temporaries
            # relative to the r-direction ones.
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give each reader its own "n" iname (e.g. n_rhsu for flux_rhsu)
            # so the reduction loops stay independent after precompute.
            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                within="id:"+reader.id, existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    # r- and s-side flux temporaries are live at disjoint times, so let
    # each group share storage.
    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Materialize the remaining local-prep substitution rules (the Jinv*
    # and s-side ones were already folded into the flux precomputes above).
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    # Accumulate rhsQ updates in a private buffer; written back as
    # "base + buffer" at the end.
    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
        fetch_bounding_box=True, default_tag="for",
        init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
        {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
        vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Rewind to the snapshot taken at the requested optimization level.
    hsv = tap_hsv

    if 1:
        # Print op/memory-access statistics for the kernel under test.
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero", "-cl-fast-relaxed-math",
        "-cl-finite-math-only", "-cl-mad-enable", "-cl-no-signed-zeros",
        ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
        parameters=dict(elements=300), quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
def test_map_domain_transform_map_validity_and_errors(ctx_factory):
    """Exercise :func:`lp.map_domain` validity rules.

    Checks, in order: a partial (two-of-four-iname) map that splits ``t``
    and renames ``y`` produces the same linearization as the equivalent
    ``split_iname`` + ``rename_iname``; a non-bijective map errors; a map
    not applicable to exactly one domain errors; a map touching inames
    mentioned in loop priorities errors; and a map whose inames are used by
    only *some* statements errors, while a single-iname map succeeds.
    """
    # {{{ Make kernel

    knl = lp.make_kernel(
        [
            "[nx,nt] -> {[x, y, z, t]: 0 <= x,y,z < nx and 0 <= t < nt}",
            "[m] -> {[j]: 0 <= j < m}",
        ],
        """
        a[y,x,t,z] = b[y,x,t,z]  {id=stmta}
        for j
            <>temp = j  {dep=stmta}
        end
        """,
        lang_version=(2018, 2),
        )
    knl = lp.add_and_infer_dtypes(knl, {"b": np.float32})
    ref_knl = knl

    # }}}

    # {{{ Make sure map_domain *succeeds* when map includes 2 of 4 dims in one
    # domain.

    # {{{ Apply domain change mapping that splits t and renames y; (similar to
    # split_iname test above, but doesn't hurt to test this slightly different
    # scenario)

    knl_map_dom = ref_knl

    # Create map_domain mapping that only includes t and y
    # (x and z should be unaffected)
    import islpy as isl
    # Encodes t = 16*t_outer + t_inner (a 16-way split) plus y -> y_new.
    transform_map = isl.BasicMap(
        "[nx,nt] -> {[t, y] -> [t_outer, t_inner, y_new]: "
        "0 <= t_inner < 16 and "
        "16*t_outer + t_inner = t and "
        "0 <= 16*t_outer + t_inner < nt and "
        "y = y_new"
        "}")

    # Call map_domain to transform kernel; this should *not* produce an error
    knl_map_dom = lp.map_domain(knl_map_dom, transform_map)

    # Prioritize loops
    desired_prio = "x, t_outer, t_inner, z, y_new"

    # Use constrain_loop_nesting if it's available
    cln_attr = getattr(lp, "constrain_loop_nesting", None)
    if cln_attr is not None:
        knl_map_dom = lp.constrain_loop_nesting(  # noqa pylint:disable=no-member
            knl_map_dom, desired_prio)
    else:
        knl_map_dom = lp.prioritize_loops(knl_map_dom, desired_prio)

    # Get a linearization
    proc_knl_map_dom = lp.preprocess_kernel(knl_map_dom)
    lin_knl_map_dom = lp.get_one_linearized_kernel(
        proc_knl_map_dom["loopy_kernel"], proc_knl_map_dom.callables_table)

    # }}}

    # {{{ Use split_iname and rename_iname, and make sure we get the same
    # result

    knl_split_iname = ref_knl
    knl_split_iname = lp.split_iname(knl_split_iname, "t", 16)
    knl_split_iname = lp.rename_iname(knl_split_iname, "y", "y_new")
    try:
        # Use constrain_loop_nesting if it's available
        knl_split_iname = lp.constrain_loop_nesting(
            knl_split_iname, desired_prio)
    except AttributeError:
        knl_split_iname = lp.prioritize_loops(knl_split_iname, desired_prio)
    proc_knl_split_iname = lp.preprocess_kernel(knl_split_iname)
    lin_knl_split_iname = lp.get_one_linearized_kernel(
        proc_knl_split_iname["loopy_kernel"],
        proc_knl_split_iname.callables_table)

    # Domains must agree up to dim-name alignment.
    for d_map_domain, d_split_iname in zip(
            knl_map_dom["loopy_kernel"].domains,
            knl_split_iname["loopy_kernel"].domains):
        d_map_domain_aligned = _ensure_dim_names_match_and_align(
            d_map_domain, d_split_iname)
        assert d_map_domain_aligned == d_split_iname

    # Linearizations must match item-for-item.
    for litem_map_domain, litem_split_iname in zip(
            lin_knl_map_dom.linearization, lin_knl_split_iname.linearization):
        assert litem_map_domain == litem_split_iname

    # Can't easily compare instructions because equivalent subscript
    # expressions may have different orders

    lp.auto_test_vs_ref(proc_knl_split_iname, ctx_factory(), proc_knl_map_dom,
        parameters={"nx": 32, "nt": 32, "m": 32})

    # }}}

    # }}}

    # {{{ Make sure we error on a map that is not bijective

    # Not bijective (the "rogue" input dim has no image constraint)
    transform_map = isl.BasicMap(
        "[nx,nt] -> {[t, y, rogue] -> [t_new, y_new]: "
        "y = y_new and t = t_new"
        "}")

    from loopy.diagnostic import LoopyError
    knl = ref_knl
    try:
        knl = lp.map_domain(knl, transform_map)
        # Reaching this point means map_domain did not complain -> fail.
        raise AssertionError()
    except LoopyError as err:
        assert "map must be bijective" in str(err)

    # }}}

    # {{{ Make sure there's an error if transform map does not apply to
    # exactly one domain.

    test_maps = [
        # Map where some inames match exactly one domain but there's also a
        # rogue dim
        isl.BasicMap(
            "[nx,nt] -> {[t, y, rogue] -> [t_new, y_new, rogue_new]: "
            "y = y_new and t = t_new and rogue = rogue_new"
            "}"),
        # Map where all inames match exactly one domain but there's also a
        # rogue dim
        isl.BasicMap(
            "[nx,nt] -> {[t, y, x, z, rogue] -> "
            "[t_new, y_new, x_new, z_new, rogue_new]: "
            "y = y_new and t = t_new and x = x_new and z = z_new "
            "and rogue = rogue_new"
            "}"),
        # Map where no inames match any domain
        isl.BasicMap(
            "[nx,nt] -> {[rogue] -> [rogue_new]: "
            "rogue = rogue_new"
            "}"),
        ]

    for transform_map in test_maps:
        try:
            knl = lp.map_domain(knl, transform_map)
            raise AssertionError()
        except LoopyError as err:
            assert (
                "was not applicable to any domain. "
                "Transform map must be applicable to exactly one domain."
                in str(err))

    # }}}

    # {{{ Make sure there's an error if we try to map inames in priorities

    knl = ref_knl
    knl = lp.prioritize_loops(knl, "y, z")
    knl = lp.prioritize_loops(knl, "x, z")
    try:
        transform_map = isl.BasicMap(
            "[nx,nt] -> {[t, y] -> [t_new, y_new]: "
            "y = y_new and t = t_new }")
        knl = lp.map_domain(knl, transform_map)
        raise AssertionError()
    except ValueError as err:
        assert (
            "Loop priority ('y', 'z') contains iname(s) "
            "transformed by map" in str(err))

    # }}}

    # {{{ Make sure we error when stmt.within_inames contains at least one but
    # not all mapped inames

    # {{{ Make potentially problematic kernel

    # stmt2 runs within {i} but not {j}, so a map over both i and j cannot
    # apply to all statements uniformly.
    knl = lp.make_kernel(
        [
            "[n, m] -> { [i, j]: 0 <= i < n and 0 <= j < m }",
            "[ell] -> { [k]: 0 <= k < ell }",
        ],
        """
        for i
            <>t0 = i  {id=stmt0}
            for j
                <>t1 = j  {id=stmt1, dep=stmt0}
            end
            <>t2 = i + 1  {id=stmt2, dep=stmt1}
        end
        for k
            <>t3 = k  {id=stmt3, dep=stmt2}
        end
        """,
        lang_version=(2018, 2),
        )

    # }}}

    # This should fail:
    try:
        transform_map = isl.BasicMap(
            "[n, m] -> {[i, j] -> [i_new, j_new]: "
            "i_new = i + j and j_new = 2 + i }")
        knl = lp.map_domain(knl, transform_map)
        raise AssertionError()
    except LoopyError as err:
        assert (
            "Statements must be within all or none of the mapped inames"
            in str(err))

    # This should succeed:
    transform_map = isl.BasicMap(
        "[n, m] -> {[i] -> [i_new]: i_new = i + 2 }")
    knl = lp.map_domain(knl, transform_map)

    # }}}