def test_rename_argument_of_domain_params(ctx_factory):
    """Rename the domain parameters ``n``/``m`` and inspect the device code."""
    knl = lp.make_kernel(
            "{[i, j]: 0<=i<n and 0<=j<m}",
            """ y[i, j] = 2.0f """)

    for old_name, new_name in (("n", "N"), ("m", "M")):
        knl = lp.rename_argument(knl, old_name, new_name)

    device_code = lp.generate_code_v2(knl).device_code()

    # the old parameter names must no longer show up in the generated code ...
    assert "int const n" not in device_code
    assert "int const m" not in device_code
    # ... while the new names must be present
    assert "int const N" in device_code
    assert "int const M" in device_code

    lp.auto_test_vs_ref(
            knl, ctx_factory(), knl,
            parameters={"M": 10, "N": 4})
def test_rename_argument(ctx_factory):
    """Check that a kernel still runs correctly after renaming an argument.

    The argument ``a`` is renamed to ``b``; the kernel is then invoked with
    ``b=12`` and ``n=20``, and every entry of ``out`` must equal ``12 + 2``.

    This test used to be defined twice in a row with identical bodies, so the
    second definition silently shadowed the first. Only one copy is kept.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    kernel = lp.make_kernel(
            """{ [i]: 0<=i<n }""",
            """out[i] = a + 2""")

    kernel = lp.rename_argument(kernel, "a", "b")

    # the call must accept the *new* argument name
    _evt, (out,) = kernel(queue, b=np.float32(12), n=20)

    assert (np.abs(out.get() - 14) < 1e-8).all()
def test_rename_argument_with_assumptions():
    """Rename an argument that also appears in the kernel's assumptions."""
    import islpy as isl

    knl = lp.make_kernel(
            "{[i]: 0<=i<n_old}",
            """ y[i] = 2.0f """)
    knl = lp.assume(knl, "n_old=10")
    knl = lp.rename_argument(knl, "n_old", "n_new")

    assumptions = knl["loopy_kernel"].assumptions
    assumption_vars = assumptions.get_var_dict()

    # only the new name may be known to the assumptions
    assert "n_old" not in assumption_vars
    assert "n_new" in assumption_vars

    # intersecting with n_new=10 must leave the assumptions unchanged
    expected = isl.BasicSet("[n_new]->{: n_new=10}")
    assert (assumptions & expected) == assumptions
def test_rename_argument_with_auto_stride(ctx_factory):
    """Rename an array argument whose stride dim tag is left as ``lp.auto``."""
    from loopy.kernel.array import FixedStrideArrayDimTag

    queue = cl.CommandQueue(ctx_factory())

    x_arg = lp.GlobalArg(
            "x", dtype=float, shape=lp.auto,
            dim_tags=[FixedStrideArrayDimTag(lp.auto)])

    knl = lp.make_kernel(
            "{[i]: 0<=i<10}",
            """ y[i] = x[i] """,
            [x_arg, ...])
    knl = lp.rename_argument(knl, "x", "x_new")

    device_code = lp.generate_code_v2(knl).device_code()

    # only the new name may appear in the kernel signature
    assert "double const *__restrict__ x_new," in device_code
    assert "double const *__restrict__ x," not in device_code

    # the renamed kernel must still be callable with the new argument name
    _evt, (_out,) = knl(queue, x_new=np.random.rand(10))
def __call__(self, ary):
    """Apply this connection to *ary* and return the resampled data.

    For every target group, one loopy kernel is obtained per non-empty batch:
    a matrix-based kernel when ``self._resample_point_pick_indices`` returns
    *None*, a picking kernel otherwise. Inames and per-batch arguments of
    each kernel are suffixed with the batch index, the kernels are fused, and
    the fused kernel is run through the array context of *ary*.

    :arg ary: a ``DOFArray`` whose shape is
        ``(len(self.from_discr.groups),)``.
    :returns: a ``DOFArray`` built (via ``DOFArray.from_list``) from the
        ``"result"`` output of each target group's fused kernel.
    :raises TypeError: if *ary* is not a ``DOFArray``.
    :raises ValueError: if *ary* does not have the shape given above.
    """
    from meshmode.dof_array import DOFArray
    if not isinstance(ary, DOFArray):
        raise TypeError("non-array passed to discretization connection")

    actx = ary.array_context

    # The kernel builders are memoized in *actx*; the key includes
    # self.is_surjective because the generated kernel differs between the
    # two cases.
    @memoize_in(actx, (
        DirectDiscretizationConnection, "resample_by_mat_knl",
        self.is_surjective,
        ))
    def mat_knl():
        if self.is_surjective:
            domains = [
                    """
                    {[iel, idof, j]:
                    0<=iel<nelements and
                    0<=idof<n_to_nodes and
                    0<=j<n_from_nodes}
                    """,
                    ]

            instructions = """
            result[to_element_indices[iel], idof] \
                = sum(j, resample_mat[idof, j] \
                * ary[from_element_indices[iel], j])
            """
        else:
            # Non-surjective case: a separate loop nest first writes 0 to
            # every entry of result; the resampling write is ordered after
            # it through the barrier instruction (init -> barrier -> write).
            domains = [
                    """
                    {[iel_init, idof_init]:
                    0<=iel_init<nelements_result and
                    0<=idof_init<n_to_nodes}
                    """,
                    """
                    {[iel, idof, j]:
                    0<=iel<nelements and
                    0<=idof<n_to_nodes and
                    0<=j<n_from_nodes}
                    """,
                    ]

            instructions = """
            result[iel_init, idof_init] = 0 {id=init}
            ... gbarrier {id=barrier, dep=init}
            result[to_element_indices[iel], idof] \
                = sum(j, resample_mat[idof, j] \
                * ary[from_element_indices[iel], j]) {dep=barrier}
            """

        knl = make_loopy_program(
            domains,
            instructions,
            [
                lp.GlobalArg("result", None,
                    shape="nelements_result, n_to_nodes",
                    offset=lp.auto),
                lp.GlobalArg("ary", None,
                    shape="nelements_vec, n_from_nodes",
                    offset=lp.auto),
                lp.ValueArg("nelements_result", np.int32),
                lp.ValueArg("nelements_vec", np.int32),
                lp.ValueArg("n_from_nodes", np.int32),
                # presumably asks loopy to infer the remaining arguments
                # -- TODO confirm
                "...",
            ],
            name="resample_by_mat")

        return knl

    @memoize_in(actx, (DirectDiscretizationConnection,
        "resample_by_picking_knl",
        self.is_surjective))
    def pick_knl():
        if self.is_surjective:
            domains = [
                    """{[iel, idof]:
                    0<=iel<nelements and
                    0<=idof<n_to_nodes}"""
                    ]

            instructions = """
            result[to_element_indices[iel], idof] \
                = ary[from_element_indices[iel], pick_list[idof]]
            """
        else:
            # Same init -> barrier -> write structure as in mat_knl, but the
            # write copies a single source entry selected via pick_list
            # instead of summing over j.
            domains = [
                    """
                    {[iel_init, idof_init]:
                    0<=iel_init<nelements_result and
                    0<=idof_init<n_to_nodes}
                    """,
                    """
                    {[iel, idof]:
                    0<=iel<nelements and
                    0<=idof<n_to_nodes}
                    """
                    ]

            instructions = """
            result[iel_init, idof_init] = 0 {id=init}
            ... gbarrier {id=barrier, dep=init}
            result[to_element_indices[iel], idof] \
                = ary[from_element_indices[iel], pick_list[idof]] {dep=barrier}
            """

        knl = make_loopy_program(
            domains,
            instructions,
            [
                lp.GlobalArg("result", None,
                    shape="nelements_result, n_to_nodes",
                    offset=lp.auto),
                lp.GlobalArg("ary", None,
                    shape="nelements_vec, n_from_nodes",
                    offset=lp.auto),
                lp.ValueArg("nelements_result", np.int32),
                lp.ValueArg("nelements_vec", np.int32),
                lp.ValueArg("n_from_nodes", np.int32),
                "...",
            ],
            name="resample_by_picking")

        return knl

    if ary.shape != (len(self.from_discr.groups), ):
        raise ValueError("invalid shape of incoming resampling data")

    # one result array per target group, in group order
    group_idx_to_result = []

    for i_tgrp, (tgrp, cgrp) in enumerate(
            zip(self.to_discr.groups, self.groups)):

        kernels = []  # get kernels for each batch; to be fused eventually
        kwargs = {}  # kwargs to the fused kernel
        for i_batch, batch in enumerate(cgrp.batches):
            # batches without any source elements contribute no kernel
            if batch.from_element_indices.size == 0:
                continue

            point_pick_indices = self._resample_point_pick_indices(
                    actx, i_tgrp, i_batch)

            # None selects the matrix-based kernel; anything else is passed
            # to the picking kernel as its pick_list.
            if point_pick_indices is None:
                knl = mat_knl()
                knl = lp.rename_argument(knl, "resample_mat",
                        f"resample_mat_{i_batch}")
                kwargs[f"resample_mat_{i_batch}"] = (self._resample_matrix(
                        actx, i_tgrp, i_batch))
            else:
                knl = pick_knl()
                knl = lp.rename_argument(knl, "pick_list",
                        f"pick_list_{i_batch}")
                kwargs[f"pick_list_{i_batch}"] = point_pick_indices

            # {{{ enforce different namespaces for the kernels

            # Every iname and every per-batch argument gets an _{i_batch}
            # suffix. "result", "nelements_result" and "n_to_nodes" keep
            # their names and are passed once to the fused kernel below.
            for iname in knl.all_inames():
                knl = lp.rename_iname(knl, iname, f"{iname}_{i_batch}")

            knl = lp.rename_argument(knl, "ary", f"ary_{i_batch}")
            knl = lp.rename_argument(knl, "from_element_indices",
                    f"from_element_indices_{i_batch}")
            knl = lp.rename_argument(knl, "to_element_indices",
                    f"to_element_indices_{i_batch}")
            # NOTE(review): no nelements_{i_batch} entry is added to kwargs;
            # presumably loopy derives it from the index-array shapes --
            # confirm.
            knl = lp.rename_argument(knl, "nelements",
                    f"nelements_{i_batch}")

            # }}}

            kwargs[f"ary_{i_batch}"] = ary[batch.from_group_index]
            kwargs[f"from_element_indices_{i_batch}"] = (
                    batch.from_element_indices)
            kwargs[f"to_element_indices_{i_batch}"] = (
                    batch.to_element_indices)

            kernels.append(knl)

        # NOTE(review): if every batch of this group was skipped above,
        # `kernels` is empty here -- confirm lp.fuse_kernels handles that or
        # that the case cannot occur.
        fused_knl = lp.fuse_kernels(kernels)
        # order of operations doesn't matter
        fused_knl = lp.add_nosync(fused_knl, "global", "writes:result",
                "writes:result", bidirectional=True, force=True)

        result_dict = actx.call_loopy(fused_knl,
                nelements_result=tgrp.nelements,
                n_to_nodes=tgrp.nunit_dofs,
                **kwargs)

        group_idx_to_result.append(result_dict["result"])

    # NOTE(review): DOFArray is already imported at the top of this
    # function; this second import is redundant.
    from meshmode.dof_array import DOFArray
    return DOFArray.from_list(actx, group_idx_to_result)
def test_write_block_matrix_fusion(ctx_factory):
    """
    Fuse several kernels that all write into one global matrix, each into
    its own well-defined block.

    This test makes sure that the data flow specification is preserved
    during fusion for matrix-assembly-like programs.
    """
    queue = cl.CommandQueue(ctx_factory())

    def make_init_prg():
        # writes 0 to every entry of the n x m result matrix
        return lp.make_kernel(
            ["{[idof]: 0 <= idof < n}", "{[jdof]: 0 <= jdof < m}"],
            """ result[idof, jdof] = 0 {id=init} """,
            [
                lp.GlobalArg("result", None, shape="n, m", offset=lp.auto),
                lp.ValueArg("n, m", np.int32),
                "...",
            ],
            options=lp.Options(return_dict=True),
            default_offset=lp.auto,
            name="init_a_global_matrix",
        )

    def make_write_prg():
        # copies `mat` into the block of `result` that starts at
        # (offset_i, offset_j)
        return lp.make_kernel(
            ["{[idof]: 0 <= idof < ndofs}", "{[jdof]: 0 <= jdof < mdofs}"],
            """ result[offset_i + idof, offset_j + jdof] = mat[idof, jdof] """,
            [
                lp.GlobalArg("result", None, shape="n, m", offset=lp.auto),
                lp.ValueArg("n, m", np.int32),
                lp.GlobalArg("mat", None, shape="ndofs, mdofs",
                             offset=lp.auto),
                lp.ValueArg("offset_i", np.int32),
                lp.ValueArg("offset_j", np.int32),
                "...",
            ],
            options=lp.Options(return_dict=True),
            default_offset=lp.auto,
            name="write_into_global_matrix",
        )

    # Reference: a 2x2 block matrix with random 5x5 blocks on the
    # block diagonal and zeros everywhere else.
    n = 10
    block_n = 5
    mat1 = np.random.randn(block_n, block_n)
    mat2 = np.random.randn(block_n, block_n)
    zero_block = np.zeros((block_n, block_n))
    answer = np.block([[mat1, zero_block],
                       [zero_block, mat2]])

    call_args = {"n": n, "m": n}

    # Give each writer kernel its own argument and iname names before fusion.
    kernels = [make_init_prg()]
    diagonal_blocks = [(0, mat1), (block_n, mat2)]
    for idx, (offset, mat) in enumerate(diagonal_blocks):
        knl = lp.rename_argument(make_write_prg(), "mat", f"mat_{idx}")

        for iname in knl.default_entrypoint.all_inames():
            knl = lp.rename_iname(knl, iname, f"{iname}_{idx}")

        for arg_name in ("ndofs", "mdofs", "offset_i", "offset_j"):
            knl = lp.rename_argument(knl, arg_name, f"{arg_name}_{idx}")

        call_args.update({
            f"mat_{idx}": mat,
            f"ndofs_{idx}": block_n,
            f"mdofs_{idx}": block_n,
            f"offset_i_{idx}": offset,
            f"offset_j_{idx}": offset,
        })

        kernels.append(knl)

    fused_knl = lp.fuse_kernels(
        kernels,
        data_flow=[("result", 0, 1), ("result", 1, 2)],
    )
    fused_knl = lp.add_nosync(
        fused_knl, "global", "writes:result", "writes:result",
        bidirectional=True, force=True)

    _evt, outputs = fused_knl(queue, **call_args)

    np.testing.assert_allclose(outputs["result"], answer)