def test_variable_size_matrix_mul(ctx_factory):
    """Tile a runtime-sized matmul and auto-test it against the untransformed
    reference kernel."""
    ctx = ctx_factory()
    size = get_suitable_size(ctx)

    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, {"a": np.float32, "b": np.float32})

    ref_knl = knl

    # 16x16 work groups with slabs to cover ragged edges; k blocked by 8.
    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*size**3/1e9], op_label=["GFlops"],
            parameters={"n": size})
def network_time_step(
        model: model.BaseKernel,
        coupling: coupling.BaseCoupling,
        scheme: scheme.TimeStepScheme,
        target: lp.target.TargetBase = None,
        ):
    """Fuse the model, network-coupling and integration-scheme kernels into
    one kernel and wrap it in a sequential time-stepping loop."""
    if target is None:
        target = utils.default_target()

    # fuse kernels
    stages = [
        model.kernel(target),
        network.Network(model, coupling).kernel(target),
        lp.fix_parameters(scheme.kernel(target), nsvar=len(model.state_sym)),
    ]
    flow = [
        ("input", 1, 0),
        ("diffs", 0, 2),
        ("drift", 0, 2),
        ("state", 2, 0),
    ]
    knl = lp.fuse_kernels(stages, data_flow=flow)

    # time step: i_time walks a ring buffer of length ntime
    knl = lp.to_batched(knl, "nstep", [], "i_step", sequential=True)
    knl = lp.fix_parameters(knl, i_time=pm.parse("(i_step + i_step_0) % ntime"))
    knl.args.append(lp.ValueArg("i_step_0", np.uintc))
    knl = lp.add_dtypes(knl, {"i_step_0": np.uintc})
    return knl
def test_variable_size_matrix_mul(ctx_factory):
    """Tiled runtime-sized matmul, auto-tested against the reference kernel.

    Skipped on pocl, where this test is known to crash.
    """
    ctx = ctx_factory()
    dev = ctx.devices[0]
    if (not dev.image_support
            or dev.platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    size = get_suitable_size(ctx)

    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, {"a": np.float32, "b": np.float32})

    ref_knl = knl

    # 16x16 work groups with slabs for ragged edges; k blocked by 8.
    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*size**3/1e9], op_label=["GFlops"],
            parameters={"n": size})
def make_knl():
    """Assemble the fused Kuramoto time-stepping kernel.

    Returns (kernel, oscillator_model).
    """
    target = NumbaTarget()

    # build individual kernels
    osc = model.Kuramoto()
    osc.dt = 1.0
    osc.const["omega"] = 10.0 * 2.0 * np.pi / 1e3
    osc_knl = osc.kernel(target)

    cfun = coupling.Kuramoto(osc)
    cfun.param["a"] = pm.parse("a")
    net = network.Network(osc, cfun)
    net_knl = net.kernel(target)

    scm = scheme.EulerStep(osc.dt)
    scm_knl = lp.fix_parameters(scm.kernel(target), nsvar=len(osc.state_sym))

    # fuse kernels
    flow = [("input", 1, 0), ("diffs", 0, 2), ("drift", 0, 2), ("state", 2, 0)]
    knl = lp.fuse_kernels((osc_knl, net_knl, scm_knl), data_flow=flow)

    # and time step: i_time indexes a ring buffer of length ntime
    knl = lp.to_batched(knl, "nstep", [], "i_step", sequential=True)
    knl = lp.fix_parameters(knl, i_time=pm.parse("(i_step + i_step_0) % ntime"))
    knl.args.append(lp.ValueArg("i_step_0", np.uintc))
    knl = lp.add_dtypes(knl, {"i_step_0": np.uintc})
    return knl, osc
def test_variable_size_matrix_mul(ctx_factory):
    """Tiled runtime-sized matmul with explicitly tagged prefetches,
    auto-tested against the reference kernel. Skipped on pocl."""
    ctx = ctx_factory()
    dev = ctx.devices[0]
    if (not dev.image_support
            or dev.platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    size = get_suitable_size(ctx)

    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, {"a": np.float32, "b": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"],
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"],
            default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*size**3/1e9], op_label=["GFlops"],
            parameters={"n": size})
def test_simple(self):
    """A doubling kernel on the numba target matches numpy's result."""
    knl = lp.make_kernel("{ [i]: 0<=i<n }", "out[i] = 2*a[i]",
            target=NumbaTarget())
    knl = lp.add_dtypes(knl, {"a": np.float32})

    a, out = np.zeros((2, 10), np.float32)
    a[:] = np.r_[:a.size]
    knl(a, 10, out)

    np.testing.assert_allclose(out, a * 2)
def test_simple_kernel(self):
    """Generate C code for a doubling kernel, compile it, and check output."""
    knl = lp.make_kernel("{ [i]: 0<=i<n }", "out[i] = 2*a[i]",
            target=CTarget())
    knl = lp.add_dtypes(knl, {"a": np.float32})

    code, _ = lp.generate_code(knl)
    compiled = CompiledKernel(knl)  # noqa

    a, out = np.zeros((2, 10), np.float32)
    a[:] = np.r_[:a.size]
    compiled(a, 10, out)

    np.testing.assert_allclose(out, a * 2)
def test_force_outer_iname_for_scan():
    """Realizing a scan over a non-triangular domain must fail unless the
    outer scan iname is forced explicitly."""
    knl = lp.make_kernel(
        "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
        "out[i] = product(j, a[j]) {inames=i:k}")
    knl = lp.add_dtypes(knl, {"a": np.float32})

    # TODO: Maybe this deserves to work?
    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(knl, force_scan=True)

    knl = lp.realize_reduction(knl, force_scan=True,
            force_outer_iname_for_scan="i")
def test_rob_stroud_bernstein(ctx_factory): ctx = ctx_factory() # NOTE: tmp would have to be zero-filled beforehand knl = lp.make_kernel( "{[el, i2, alpha1,alpha2]: \ 0 <= el < nels and \ 0 <= i2 < nqp1d and \ 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }", """ for el,i2 <> xi = qpts[1, i2] <> s = 1-xi <> r = xi/s <> aind = 0 {id=aind_init} for alpha1 <> w = s**(deg-alpha1) {id=init_w} for alpha2 tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \ {id=write_tmp,dep=init_w:aind_init} w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ {id=update_w,dep=init_w:write_tmp} aind = aind + 1 \ {id=aind_incr,dep=aind_init:write_tmp:update_w} end end end """, [ # Must declare coeffs to have "no" shape, to keep loopy # from trying to figure it out the shape automatically. lp.GlobalArg("coeffs", None, shape=None), "..." ], assumptions="deg>=0 and nels>=1", target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, )) print(lp.generate_code_v2(knl))
def get_kernel(self):
    """Build the loopy kernel that evaluates pairwise interactions at the
    (target, source) index pairs given by ``tgtindices``/``srcindices``,
    writing one result array per potential kernel."""
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    kernel_exprs = self.get_kernel_exprs(result_names)
    arguments = (self.get_default_src_tgt_arguments()
            + [
                lp.GlobalArg("srcindices", None, shape="nresult"),
                lp.GlobalArg("tgtindices", None, shape="nresult"),
                lp.ValueArg("nresult", None)
            ]
            + [
                lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
                for i, dtype in enumerate(self.value_dtypes)
            ])

    loopy_knl = lp.make_kernel(
        "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
        self.get_kernel_scaling_assignments()
        # NOTE: itgt, isrc need to always be defined in case a statement
        # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
        # places like get_kernel_exprs)
        + [
            """
            for imat
                <> itgt = tgtindices[imat]
                <> isrc = srcindices[imat]

                <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
            """
        ] + [
            """
                <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""
        ] + loopy_insns + kernel_exprs
        + [
            """
                result_{i}[imat] = \
                    knl_{i}_scaling * pair_result_{i} \
                        {{id_prefix=write_p2p}}
            """.format(i=iknl)
            for iknl in range(len(self.kernels))
        ] + ["end"],
        arguments,
        assumptions="nresult>=1",
        silenced_warnings="write_race(write_p2p*)",
        name=self.name,
        fixed_parameters=dict(dim=self.dim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    # unroll the (small, fixed-size) dimension loops
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.add_dtypes(loopy_knl,
            dict(nsources=np.int32, ntargets=np.int32))

    # give each potential kernel a chance to adapt the loopy kernel
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def get_kernel(self):
    """Build the loopy kernel that evaluates pairwise interactions at the
    (target, source) index pairs given by ``tgtindices``/``srcindices``,
    writing one result array per potential kernel."""
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    kernel_exprs = self.get_kernel_exprs(result_names)
    arguments = (
        self.get_default_src_tgt_arguments()
        + [
            lp.GlobalArg("srcindices", None, shape="nresult"),
            lp.GlobalArg("tgtindices", None, shape="nresult"),
            lp.ValueArg("nresult", None)
        ]
        + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
           for i, dtype in enumerate(self.value_dtypes)])

    loopy_knl = lp.make_kernel(
        "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
        self.get_kernel_scaling_assignments()
        # NOTE: itgt, isrc need to always be defined in case a statement
        # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
        # places like get_kernel_exprs)
        + ["""
            for imat
                <> itgt = tgtindices[imat]
                <> isrc = srcindices[imat]

                <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
            """]
        + ["""
                <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""]
        + loopy_insns + kernel_exprs
        + ["""
                result_{i}[imat] = \
                    knl_{i}_scaling * pair_result_{i} \
                        {{id_prefix=write_p2p}}
            """.format(i=iknl)
           for iknl in range(len(self.kernels))]
        + ["end"],
        arguments,
        assumptions="nresult>=1",
        silenced_warnings="write_race(write_p2p*)",
        name=self.name,
        fixed_parameters=dict(dim=self.dim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    # unroll the (small, fixed-size) dimension loops
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.add_dtypes(loopy_knl,
            dict(nsources=np.int32, ntargets=np.int32))

    # give each potential kernel a chance to adapt the loopy kernel
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def test_scan_data_types(ctx_factory, dtype):
    """Prefix-sum scan produces np.cumsum for each parametrized dtype."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i }",
        "res[i] = reduce(sum, j, a[j])",
        assumptions="n>=1")
    knl = lp.add_dtypes(knl, {"a": dtype})
    knl = lp.realize_reduction(knl, force_scan=True)

    data = np.random.randn(20).astype(dtype)
    _evt, (res,) = knl(queue, a=data)

    assert np.allclose(res, np.cumsum(data))
def test_scan_data_types(ctx_factory, dtype):
    """Prefix-sum scan produces np.cumsum for each parametrized dtype."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i }",
        "res[i] = reduce(sum, j, a[j])",
        assumptions="n>=1")
    knl = lp.add_dtypes(knl, {"a": dtype})
    knl = lp.realize_reduction(knl, force_scan=True)

    data = np.random.randn(20).astype(dtype)
    _evt, (res,) = knl(queue, a=data)

    assert np.allclose(res, np.cumsum(data))
def test_scan_library(ctx_factory, op_name, np_op):
    """For each library reduction op, a scan over growing prefixes must match
    the numpy equivalent applied to each prefix.

    :arg op_name: name of the loopy reduction operation (e.g. "sum", "max").
    :arg np_op: numpy function computing the same reduction on an array.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i }",
        "res[i] = reduce(%s, j, a[j])" % op_name,
        assumptions="n>=1")

    a = np.random.randn(20)
    # np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # np.float64 is the type randn actually produces.
    knl = lp.add_dtypes(knl, dict(a=np.float64))
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (res,) = knl(queue, a=a)

    assert np.allclose(res, np.array(
        [np_op(a[:i + 1]) for i in range(len(a))]))
def test_funny_shape_matrix_mul(ctx_factory):
    """Non-square matmul (n x m times m x ell), tiled via extract_subst +
    precompute with explicit outer inames, checked against the reference."""
    ctx = ctx_factory()
    n = get_suitable_size(ctx)
    m, ell = n + 12, n + 24

    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        name="matmul",
        assumptions="n,m,ell >= 1")
    knl = lp.add_dtypes(knl, {"a": np.float32, "b": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n, "m": m, "ell": ell})
def test_scan_library(ctx_factory, op_name, np_op):
    """For each library reduction op, a scan over growing prefixes must match
    the numpy equivalent applied to each prefix.

    :arg op_name: name of the loopy reduction operation (e.g. "sum", "max").
    :arg np_op: numpy function computing the same reduction on an array.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i }",
        "res[i] = reduce(%s, j, a[j])" % op_name,
        assumptions="n>=1")

    a = np.random.randn(20)
    # np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # np.float64 is the type randn actually produces.
    knl = lp.add_dtypes(knl, dict(a=np.float64))
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (res,) = knl(queue, a=a)

    assert np.allclose(res, np.array(
        [np_op(a[:i+1]) for i in range(len(a))]))
def test_parallel_multi_output_reduction(ctx_factory):
    """argmax over a work-group-parallel iname returns both the maximum
    value and its index."""
    knl = lp.make_kernel(
        "{[i]: 0<=i<128}",
        """
        max_val, max_indices = argmax(i, abs(a[i]), i)
        """)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.add_dtypes(knl, {"a": np.float64})

    ctx = ctx_factory()
    with cl.CommandQueue(ctx) as queue:
        data = np.random.rand(128)
        _evt, (max_index, max_val) = knl(queue, a=data)

        assert max_val == np.max(data)
        assert max_index == np.argmax(np.abs(data))
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    """A work-group-parallel scan whose domain starts at i=1 still yields
    the cumulative sum of squares."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
        """
        out[i-1] = sum(j, a[j]**2)
        """,
        "...")
    knl = lp.fix_parameters(knl, n=16)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dtypes(knl, {"a": int})

    data = np.arange(1, 17)
    _evt, (out,) = knl(queue, a=data)

    assert (out == np.cumsum(data**2)).all()
def test_parallel_multi_output_reduction(ctx_factory):
    """argmax over a work-group-parallel iname (with the reduction realized
    explicitly) returns both the maximum value and its index."""
    knl = lp.make_kernel(
        "{[i]: 0<=i<128}",
        """
        max_val, max_indices = argmax(i, abs(a[i]), i)
        """)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.add_dtypes(knl, {"a": np.float64})
    knl = lp.realize_reduction(knl)

    ctx = ctx_factory()
    with cl.CommandQueue(ctx) as queue:
        data = np.random.rand(128)
        _evt, (max_index, max_val) = knl(queue, a=data)

        assert max_val == np.max(data)
        assert max_index == np.argmax(np.abs(data))
def test_local_parallel_scan(ctx_factory, n):
    """Work-group-parallel scan computes the cumulative sum of squares."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
        """
        out[i] = sum(j, a[j]**2)
        """,
        "...")
    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dtypes(knl, {"a": int})

    print(knl)

    data = np.arange(n)
    _evt, (out,) = knl(queue, a=data)

    assert (out == np.cumsum(data**2)).all()
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    """A work-group-parallel scan whose domain starts at i=1 still yields
    the cumulative sum of squares."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
        """
        out[i-1] = sum(j, a[j]**2)
        """,
        "...")
    knl = lp.fix_parameters(knl, n=16)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dtypes(knl, {"a": int})

    data = np.arange(1, 17)
    _evt, (out,) = knl(queue, a=data)

    assert (out == np.cumsum(data**2)).all()
def test_funny_shape_matrix_mul(ctx_factory):
    """Non-square matmul (n x m times m x ell), tiled via extract_subst +
    precompute, checked against the reference kernel."""
    ctx = ctx_factory()
    n = get_suitable_size(ctx)
    m, ell = n + 12, n + 24

    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        name="matmul",
        assumptions="n,m,ell >= 1")
    knl = lp.add_dtypes(knl, {"a": np.float32, "b": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n, "m": m, "ell": ell})
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Sum Philox-generated random values with a two-level reduction whose
    intermediate accumulator lives in global memory; checked against the
    untransformed reference kernel."""
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        for i
            <> key = make_uint2(i, 324830944) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
        end
        z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
        """)

    ref_knl = knl
    ref_knl = lp.add_dtypes(ref_knl, {"n": np.int32})

    # split the reduction: groups of gsize*20, gsize-wide work groups
    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")

    # stage the per-group partial sums through a global temporary
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_address_space=lp.AddressSpace.GLOBAL,
            default_tag="l.auto")
    knl = lp.preprocess_kernel(knl)
    # final accumulation must wait for the global-temp fill barrier
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": size})
def test_local_parallel_scan(ctx_factory, n):
    """Work-group-parallel scan computes the cumulative sum of squares."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
        """
        out[i] = sum(j, a[j]**2)
        """,
        "...")
    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dtypes(knl, {"a": int})

    print(knl)

    data = np.arange(n)
    _evt, (out,) = knl(queue, a=data)

    assert (out == np.cumsum(data**2)).all()
def _dtype_and_code(self, knl, **extra_dtypes):
    """Attach float32 dtypes for 'in'/'out' (plus any overrides) and
    return the generated code for *knl*."""
    dtype_map = {"in": np.float32, "out": np.float32, **extra_dtypes}
    typed_knl = lp.add_dtypes(knl, dtype_map)
    code, _ = lp.generate_code(typed_knl)
    return code
def call_loopy(translation_unit: "lp.TranslationUnit",
               bindings: Dict[str, ArrayOrScalar],
               entrypoint: Optional[str] = None) -> LoopyCall:
    """ Invokes an entry point of a :class:`loopy.TranslationUnit` on the array inputs as specified by *bindings*.

    Restrictions on the structure of ``translation_unit[entrypoint]``:

    * array arguments of ``translation_unit[entrypoint]`` must either be
      either input-only or output-only.
    * all input-only arguments of ``translation_unit[entrypoint]`` must
      appear in *bindings*.
    * all output-only arguments of ``translation_unit[entrypoint]`` must
      appear in *bindings*.
    * if *translation_unit* has been declared with multiple entrypoints,
      *entrypoint* can not be *None*.

    :arg translation_unit: the translation unit to call.
    :arg bindings: mapping from argument names of
        ``translation_unit[entrypoint]`` to :class:`pytato.array.Array`.
    :arg entrypoint: the entrypoint of the ``translation_unit`` parameter.
    """
    if entrypoint is None:
        if len(translation_unit.entrypoints) != 1:
            raise ValueError("cannot infer entrypoint")

        entrypoint, = translation_unit.entrypoints

    translation_unit = translation_unit.with_entrypoints(entrypoint)

    # {{{ sanity checks

    # generator (not a list) — any() can short-circuit on the first hit
    if any(arg.is_input and arg.is_output
           for arg in translation_unit[entrypoint].args):
        # Pytato DAG cannot have stateful nodes.
        raise ValueError("Cannot call a kernel with side-effects.")

    for name in bindings:
        if name not in translation_unit[entrypoint].arg_dict:
            raise ValueError(f"Kernel '{entrypoint}' got an unexpected input: "
                             f"'{name}'.")
        if translation_unit[entrypoint].arg_dict[name].is_output:
            raise ValueError(f"Kernel '{entrypoint}' got an output arg '{name}' "
                             f"as input.")

    # {{{ perform shape inference here

    bindings = extend_bindings_with_shape_inference(
        translation_unit[entrypoint], pmap(bindings))

    # }}}

    for arg in translation_unit[entrypoint].args:
        if arg.is_input:
            if arg.name not in bindings:
                raise ValueError(f"Kernel '{entrypoint}' expects an input"
                                 f" '{arg.name}'")

            arg_binding = bindings[arg.name]

            if isinstance(arg, (lp.ArrayArg, lp.ConstantArg)):
                if not isinstance(arg_binding, Array):
                    raise ValueError(f"Argument '{arg.name}' expected to be a "
                                     f"pytato.Array, got {type(arg_binding)}.")
            else:
                assert isinstance(arg, lp.ValueArg)
                if not (isinstance(arg_binding, Number)
                        or (isinstance(arg_binding, Array)
                            and arg_binding.shape == ())):
                    raise ValueError(f"Argument '{arg.name}' expected to be a "
                                     " number or a scalar expression, got "
                                     f"{type(arg_binding)}.")

    # }}}

    # {{{ infer types of the translation_unit

    for name, ary in bindings.items():
        # only fill in dtypes the kernel does not already know
        if translation_unit[entrypoint].arg_dict[name].dtype not in (
                lp.auto, None):
            continue

        if isinstance(ary, Array):
            translation_unit = lp.add_dtypes(translation_unit,
                                             {name: ary.dtype})
        else:
            assert isinstance(ary, Number)
            translation_unit = lp.add_dtypes(translation_unit,
                                             {name: type(ary)})

    translation_unit = lp.infer_unknown_types(translation_unit)

    # }}}

    # {{{ infer shapes of the translation_unit

    translation_unit = lp.infer_arg_descr(translation_unit)

    # }}}

    translation_unit = translation_unit.with_entrypoints(frozenset())

    return LoopyCall(translation_unit, bindings, entrypoint)
def get_kernel(self):
    """Build the loopy kernel evaluating box-wise P2P interactions from a
    CSR-style (source_box_starts/source_box_lists) interaction list,
    accumulating one result row per potential kernel."""
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    kernel_exprs = self.get_kernel_exprs(result_names)
    arguments = (
        self.get_default_src_tgt_arguments()
        + [
            lp.GlobalArg("box_target_starts", None, shape=None),
            lp.GlobalArg("box_target_counts_nonchild", None, shape=None),
            lp.GlobalArg("box_source_starts", None, shape=None),
            lp.GlobalArg("box_source_counts_nonchild", None, shape=None),
            lp.GlobalArg("source_box_starts", None, shape=None),
            lp.GlobalArg("source_box_lists", None, shape=None),
            lp.GlobalArg("strength", None,
                shape="nstrengths, nsources", dim_tags="sep,C"),
            lp.GlobalArg("result", None,
                shape="nkernels, ntargets", dim_tags="sep,C"),
            "..."
        ])

    loopy_knl = lp.make_kernel(
        [
            "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
            "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
            "{[itgt, isrc, idim]: \
                itgt_start <= itgt < itgt_end and \
                isrc_start <= isrc < isrc_end and \
                0 <= idim < dim}",
        ],
        self.get_kernel_scaling_assignments()
        + ["""
            for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]

                    for itgt
                        for isrc
                            <> d[idim] = \
                                targets[idim, itgt] - sources[idim, isrc] {dup=idim}
            """]
        + ["""
                            <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""]
        + loopy_insns + kernel_exprs
        + ["        end"]
        + ["""
                        result[{i}, itgt] = result[{i}, itgt] + \
                            knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
                                {{id_prefix=write_csr}}
            """.format(i=iknl)
            for iknl in range(len(self.kernels))]
        + ["""
                    end
                end
            end
            """],
        arguments,
        assumptions="ntgt_boxes>=1",
        name=self.name,
        silenced_warnings="write_race(write_csr*)",
        fixed_parameters=dict(
            dim=self.dim,
            nstrengths=self.strength_count,
            nkernels=len(self.kernels)),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    loopy_knl = lp.add_dtypes(loopy_knl,
            dict(nsources=np.int32, ntargets=np.int32))

    # unroll the (small, fixed-size) dimension loops; targets/sources are
    # stored as separate per-axis arrays in C order
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
    loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")

    # give each potential kernel a chance to adapt the loopy kernel
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def test_rob_stroud_bernstein_full(ctx_factory):
    """Code-generation test for the full Bernstein-basis evaluation
    (Rob Stroud scheme), including a pickle round-trip of the kernel.
    Only checks that loopy can generate code; never executed here."""
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand
    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }",
        """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}
                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}
                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg {id=w2_init}
                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2] {id=res2,dep=w2_init}
                    end
                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) \
                            {id=w2_update, dep=res2}
                end
            end
        end
        """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure it out the shape automatically.
            lp.GlobalArg("coeffs", None, shape=None),
            "..."
        ],
        assumptions="deg>=0 and nels>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0])
        )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    if 0:
        # optional parallelization, disabled here
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0",
                inner_tag="ilp", slabs=(0, 1))
        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    # round-trip through pickle to make sure the kernel survives it
    from pickle import dumps, loads
    knl = loads(dumps(knl))

    knl = lp.add_dtypes(knl, dict(
        qpts=np.float32,
        tmp=np.float32,
        coeffs=np.float32,
        result=np.float32,
        ))
    print(lp.generate_code_v2(knl))
# Kuramoto phase update: Euler step with noise, wrapped to [0, 2*pi);
# tavg accumulates sin(theta), and the new phase is written back to state.
model_raw = """
theta_i = theta_i + dt * (omega + coupling_value * rec_n * sum)
theta_i = theta_i + (sig * rand)
theta_i = wrap_2_pi(theta_i)
tavg[i_node] = tavg[i_node] + sin(theta_i)
state[i_node] = theta_i {nosync=*}
"""

node = 10

#Raw coupling
# NOTE(review): coupling_raw and target are defined earlier in the file
# (outside this chunk) — verify before reusing this snippet standalone.
couplingknl = lp.make_kernel(
    "{ [i_node, j_node]: 0<=i_node, j_node<n_node}",
    coupling_raw,
    target=target)
couplingknl = lp.add_dtypes(
    couplingknl,
    {
        "lengths": np.float32,
        "state": np.float32,
        "weights": np.float32,
        "theta_i": np.float32,
        "rec_speed_dt": np.float32
    })
couplingknl = lp.split_iname(couplingknl, "j_node", 1, outer_tag='l.0')

#Raw model
modelknl = lp.make_kernel(
    "{ [i_node]: 0<=i_node<n_node}",
    model_raw,
    target=target)
modelknl = lp.add_dtypes(modelknl, {
    "state": np.float32,
    "theta_i": np.float32,
    "tavg": np.float32
})

# Fuse
knls = couplingknl, modelknl
for j_node
                <float32> wij = weights[j_node] {id = coupling3, dep=coupling1:coupling2}
                if wij != 0.0
                    <int> dij = lengths[j_node] * rec_speed_dt {id = coupling4, dep=coupling3}
                    <float32> theta_j = state[j_node]
                    sum = sum + wij * sin(theta_j - theta_i)
                end
            end
            theta_i = theta_i + dt * (omega + coupling_value * rec_n * sum) {id = out1, dep=coupling4}
            theta_i = theta_i + (sig * rand) {id = out2, dep=out1}
            theta_i = wrap_2_pi(theta_i) {id = out3, dep=out2}
            tavg[i_node] = tavg[i_node] + sin(theta_i) {id = out4, dep=out3}
            state[i_node] = theta_i {dep=*coupling1}
        end
        """,
        assumptions="n_node>=0")

# NOTE(review): the lines above continue a lp.make_kernel(...) call whose
# opening (domain string and start of the instruction string) lies before
# this fragment — confirm against the surrounding file.
kernel = lp.add_dtypes(
    kernel,
    dict(tavg=np.float32, state=np.float32, weights=np.float32,
         lengths=np.float32))

# switch the kernel to the CUDA target and emit host + device code
kernel = kernel.copy(target=lp.CudaTarget())
code = lp.generate_code_v2(kernel)
print(kernel)
print(code.host_code())
print(code.device_code())
def get_kernel(self):
    """Build the loopy kernel evaluating box-wise P2P interactions from a
    CSR-style (source_box_starts/source_box_lists) interaction list,
    accumulating one result row per potential kernel."""
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    kernel_exprs = self.get_kernel_exprs(result_names)
    arguments = (self.get_default_src_tgt_arguments()
            + [
                lp.GlobalArg("box_target_starts", None, shape=None),
                lp.GlobalArg("box_target_counts_nonchild", None, shape=None),
                lp.GlobalArg("box_source_starts", None, shape=None),
                lp.GlobalArg("box_source_counts_nonchild", None, shape=None),
                lp.GlobalArg("source_box_starts", None, shape=None),
                lp.GlobalArg("source_box_lists", None, shape=None),
                lp.GlobalArg("strength", None,
                    shape="nstrengths, nsources", dim_tags="sep,C"),
                lp.GlobalArg(
                    "result", None,
                    shape="nkernels, ntargets", dim_tags="sep,C"),
                "..."
            ])

    loopy_knl = lp.make_kernel(
        [
            "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
            "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
            "{[itgt, isrc, idim]: \
                itgt_start <= itgt < itgt_end and \
                isrc_start <= isrc < isrc_end and \
                0 <= idim < dim}",
        ],
        self.get_kernel_scaling_assignments()
        + [
            """
            for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]

                    for itgt
                        for isrc
                            <> d[idim] = \
                                targets[idim, itgt] - sources[idim, isrc] {dup=idim}
            """
        ] + [
            """
                            <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""
        ] + loopy_insns + kernel_exprs
        + ["        end"]
        + [
            """
                        result[{i}, itgt] = result[{i}, itgt] + \
                            knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
                                {{id_prefix=write_csr}}
            """.format(i=iknl)
            for iknl in range(len(self.kernels))
        ] + [
            """
                    end
                end
            end
            """
        ],
        arguments,
        assumptions="ntgt_boxes>=1",
        name=self.name,
        silenced_warnings="write_race(write_csr*)",
        fixed_parameters=dict(dim=self.dim,
            nstrengths=self.strength_count,
            nkernels=len(self.kernels)),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    loopy_knl = lp.add_dtypes(loopy_knl,
            dict(nsources=np.int32, ntargets=np.int32))

    # unroll the (small, fixed-size) dimension loops; targets/sources are
    # stored as separate per-axis arrays in C order
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
    loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")

    # give each potential kernel a chance to adapt the loopy kernel
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl