def get_kernel(self, **kwargs):
    """Build a loopy kernel that, for each box, sums the (filter-weighted)
    function values over that box's nodes:

        result[bid] = sum_nid func[box_node_beg + nid] * filter_multiplier[nid]

    Returns the kernel with ``return_dict`` enabled so outputs come back
    keyed by name.
    """
    loopy_knl = lp.make_kernel(  # NOQA
        [
            "{ [ bid ] : 0 <= bid < n_boxes }",
            "{ [ nid ] : 0 <= nid < n_box_nodes }"
        ],
        [
            """
            for bid
                <> box_id = boxes[bid]
                <> box_node_beg = box_node_starts[box_id]
                result[bid] = sum(nid, func[box_node_beg + nid] * filter_multiplier[nid])
            end
            """
        ],
        [
            lp.ValueArg("n_box_nodes, n_boxes", np.int32),
            lp.GlobalArg("filter_multiplier", np.float64, "n_box_nodes"),
            # func holds per-node values for all boxes, flattened
            lp.GlobalArg("func", np.float64, "n_box_nodes * n_boxes"),
            "...",
        ],
        name="box_filtered_sum",
        lang_version=(2018, 2),
    )

    # suppress CL source dump; return outputs as a dict keyed by name
    loopy_knl = lp.set_options(loopy_knl, write_cl=False)
    loopy_knl = lp.set_options(loopy_knl, return_dict=True)

    return loopy_knl
def get_kernel(self, **kwargs):
    """Build a loopy kernel performing a (filtered) discrete Legendre
    transform per box: for each mode ``mid`` of each box, a weighted
    Vandermonde projection of the function values is accumulated and
    normalized by ``normalizer[mid]``.
    """
    loopy_knl = lp.make_kernel(  # NOQA
        [
            "{ [ bid ] : 0 <= bid < n_boxes }",
            "{ [ mid ] : 0 <= mid < n_box_nodes }",
            "{ [ nid ] : 0 <= nid < n_box_nodes }"
        ],
        [
            """
            for bid
                <> box_id = boxes[bid]
                <> box_node_beg = box_node_starts[box_id]

                # Rescale weights based on template interval sizes.
                # Not needed since the rscl in both the numerator and
                # the denominator and is canceled.
                #
                # <> box_level = box_levels[box_id]
                # <> box_extent = root_extent * (1.0 / (2**box_level))
                # <> weight_rscl = (box_extent / 2.0)**dim

                for mid
                    <> mode_id = box_node_beg + mid
                    for nid
                        <> user_node_id = user_node_ids[box_node_beg + nid]
                    end
                    result[mode_id] = sum(nid, ( func[user_node_id] * weight[nid] * vandermonde[nid, mid] ) * filter_multiplier[nid] ) / normalizer[mid]
                end
            end
            """
        ],
        [
            lp.ValueArg("n_box_nodes, n_boxes", np.int32),
            # lp.ValueArg("root_extent", np.float64),
            lp.GlobalArg("weight, normalizer, filter_multiplier",
                         np.float64, "n_box_nodes"),
            lp.GlobalArg("vandermonde", np.float64, "n_box_nodes, n_box_nodes"),
            lp.GlobalArg("func", np.float64, "n_box_nodes * n_boxes"),
            "...",
        ],
        name="discrete_legendre_transform",
        lang_version=(2018, 2),
    )

    # suppress CL source dump; return outputs as a dict keyed by name
    loopy_knl = lp.set_options(loopy_knl, write_cl=False)
    loopy_knl = lp.set_options(loopy_knl, return_dict=True)

    return loopy_knl
def get_kernel(self, **kwargs):
    """Build a loopy kernel evaluating the normalised expression at each
    target point and storing it in ``result``.

    :arg kwargs: may contain ``extra_kernel_kwarg_types``, a sequence of
        additional loopy argument declarations appended to the arg list.
    """
    extra_kernel_kwarg_types = ()
    if "extra_kernel_kwarg_types" in kwargs:
        extra_kernel_kwarg_types = kwargs["extra_kernel_kwarg_types"]

    eval_inames = frozenset(["itgt"])
    scalar_assignment = lp.Assignment(
        id=None,
        assignee="expr_val",
        expression=self.get_normalised_expr(),
        temp_var_type=None,
    )
    # force the expression-evaluation instruction(s) inside the itgt loop
    eval_insns = [
        insn.copy(within_inames=insn.within_inames | eval_inames)
        for insn in [scalar_assignment]
    ]

    loopy_knl = lp.make_kernel(  # NOQA
        "{ [itgt]: 0<=itgt<n_targets }",
        [
            # per-target variable setup, spliced in as source text
            """
            for itgt
                VAR_ASSIGNMENT
            end
            """.replace("VAR_ASSIGNMENT", self.get_variable_assignment_code())
        ] + eval_insns + [
            """
            for itgt
                result[itgt] = expr_val
            end
            """
        ],
        [
            lp.ValueArg("dim, n_targets", np.int32),
            lp.GlobalArg("target_points", np.float64, "dim, n_targets"),
            lp.TemporaryVariable("expr_val", None, ()),
        ] + list(extra_kernel_kwarg_types) + [
            "...",
        ],
        name="eval_expr",
        lang_version=(2018, 2),
    )

    loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim)
    loopy_knl = lp.set_options(loopy_knl, write_cl=False)
    loopy_knl = lp.set_options(loopy_knl, return_dict=True)

    # optional user-supplied hooks for custom functions / preambles
    if self.function_manglers is not None:
        loopy_knl = lp.register_function_manglers(loopy_knl, self.function_manglers)
    if self.preamble_generators is not None:
        loopy_knl = lp.register_preamble_generators(
            loopy_knl, self.preamble_generators)

    return loopy_knl
def test_complex_support(ctx_factory, target):
    """Exercise complex-number arithmetic support (literals, real/imag/abs/
    conj, mixed real-complex ops, reductions) on the given loopy target."""
    knl = lp.make_kernel(
        "{[i, i1, i2]: 0<=i,i1,i2<10}",
        """
        euler1[i] = exp(3.14159265359j)
        euler2[i] = (2.7182818284 ** (3.14159265359j))
        euler1_real[i] = real(euler1[i])
        euler1_imag[i] = imag(euler1[i])
        real_times_complex[i] = in1[i]*(in2[i]*1j)
        real_plus_complex[i] = in1[i] + (in2[i]*1j)
        abs_complex[i] = abs(real_plus_complex[i])
        complex_div_complex[i] = (2jf + 7*in1[i])/(32jf + 37*in1[i])
        complex_div_real[i] = (2jf + 7*in1[i])/in1[i]
        real_div_complex[i] = in1[i]/(2jf + 7*in1[i])
        out_sum = sum(i1, 1.0*i1 + i1*1jf)*sum(i2, 1.0*i2 + i2*1jf)
        conj_out_sum = conj(out_sum)
        """,
        [
            lp.GlobalArg("out_sum, euler1, real_plus_complex",
                         is_input=False, shape=lp.auto),
            ...
        ],
        target=target(),
        seq_dependencies=True)

    knl = lp.set_options(knl, "return_dict")

    n = 10
    in1 = np.random.rand(n)
    in2 = np.random.rand(n)

    kwargs = {"in1": in1, "in2": in2}

    if target == lp.PyOpenCLTarget:
        knl = lp.set_options(knl, "write_cl")
        cl_ctx = ctx_factory()
        evt, out = knl(cl.CommandQueue(cl_ctx), **kwargs)
    elif target == lp.ExecutableCTarget:
        evt, out = knl(**kwargs)
    else:
        raise NotImplementedError("unsupported target")

    np.testing.assert_allclose(out["euler1"], -1)
    np.testing.assert_allclose(out["euler2"], -1)
    np.testing.assert_allclose(out["euler1_real"], -1)
    np.testing.assert_allclose(out["euler1_imag"], 0, atol=1e-10)
    np.testing.assert_allclose(out["real_times_complex"], in1 * (in2 * 1j))
    np.testing.assert_allclose(out["real_plus_complex"], in1 + (in2 * 1j))
    np.testing.assert_allclose(out["abs_complex"],
                               np.abs(out["real_plus_complex"]))
    np.testing.assert_allclose(out["complex_div_complex"],
                               (2j + 7 * in1) / (32j + 37 * in1))
    np.testing.assert_allclose(out["complex_div_real"], (2j + 7 * in1) / in1)
    np.testing.assert_allclose(out["real_div_complex"], in1 / (2j + 7 * in1))
    # sum(i, i) = n(n-1)/2 in both real and imaginary parts, squared
    np.testing.assert_allclose(out["out_sum"],
                               (0.5 * n * (n - 1) + 0.5 * n * (n - 1) * 1j)**2)
    np.testing.assert_allclose(out["conj_out_sum"],
                               (0.5 * n * (n - 1) - 0.5 * n * (n - 1) * 1j)**2)
def test_shape_translation_through_sub_array_ref(ctx_factory, inline):
    """Check that sub-array-ref call arguments are correctly reshaped between
    caller and callee views (2D<->1D, strided diagonal), with and without
    inlining."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)
    x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64)

    callee1 = lp.make_function(
        "{[i]: 0<=i<6}",
        """
        b[i] = 2*abs(a[i])
        """, name="callee_fn1")

    callee2 = lp.make_function(
        "{[i, j]: 0<=i<3 and 0 <= j < 2}",
        """
        b[i, j] = 3*a[i, j]
        """, name="callee_fn2")

    callee3 = lp.make_function(
        "{[i]: 0<=i<6}",
        """
        b[i] = 5*a[i]
        """, name="callee_fn3")

    knl = lp.make_kernel(
        "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}",
        """
        [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2])
        [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k])
        [l]: y3[l, l] = callee_fn3([l]: x3[l, l])
        """)

    knl = lp.merge([knl, callee1])
    knl = lp.merge([knl, callee2])
    knl = lp.merge([knl, callee3])

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_fn1")
        knl = lp.inline_callable_kernel(knl, "callee_fn2")
        knl = lp.inline_callable_kernel(knl, "callee_fn3")

    knl = lp.set_options(knl, "write_cl")
    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3)

    y1 = out_dict["y1"].get()
    y2 = out_dict["y2"].get()
    y3 = out_dict["y3"].get()

    # clrandom.rand output is nonnegative, so abs() in callee_fn1 is a no-op
    assert (np.linalg.norm(y1 - 2 * x1.get())) < 1e-15
    assert (np.linalg.norm(y2 - 3 * x2.get())) < 1e-15
    # only the diagonal of y3 was written
    assert (np.linalg.norm(np.diag(y3 - 5 * x3.get()))) < 1e-15
def test_cuda_short_vector():
    """Emit CUDA device code for a 4-wide vectorized doubling kernel."""
    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        target=lp.CudaTarget())
    kernel = lp.set_options(kernel, write_code=True)

    # split i into chunks of 4 and map the inner part onto vector lanes
    kernel = lp.split_iname(kernel, "i", 4, slabs=(0, 1), inner_tag="vec")
    kernel = lp.split_array_axis(kernel, "a,out", axis_nr=0, count=4)
    kernel = lp.tag_array_axes(kernel, "a,out", "C,vec")

    kernel = lp.set_options(kernel, write_wrapper=True)
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    print(lp.generate_code_v2(kernel).device_code())
def test_finite_difference_expr_subst(ctx_factory):
    """Fuse a flux kernel into a centered-difference kernel, turn the flux
    into a substitution rule, and precompute it on a GPU-split iname."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        [
            lp.GlobalArg("f", shape="n+2"),
            lp.GlobalArg("u", shape="n+2"),
        ])

    # f produced by flux_knl (kernel 1) feeds fin_diff_knl (kernel 0)
    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
            data_flow=[
                ("f", 1, 0)
                ])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")

    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(
            fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(
            gpu_knl, "f_subst", "inew_inner",
            fetch_bounding_box=True)

    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
    evt, _ = precomp_knl(queue, u=u, h=h)
def test_cuda_short_vector():
    """Generate and print CUDA device code for a vectorized elementwise
    doubling kernel."""
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        target=lp.CudaTarget())

    knl = lp.set_options(knl, write_code=True)
    # vectorize in chunks of 4; slabs handle the ragged tail
    knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
    knl = lp.tag_array_axes(knl, "a,out", "C,vec")

    knl = lp.set_options(knl, write_wrapper=True)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    print(lp.generate_code_v2(knl).device_code())
def test_finite_difference_expr_subst(ctx_factory):
    """Variant of the finite-difference/flux-fusion test: fuse, substitute
    the flux assignment, then split and precompute for the GPU."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2 * np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel("{[i]: 1<=i<=n}",
            "out[i] = -(f[i+1] - f[i-1])/h",
            [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel("{[j]: 1<=j<=n}",
            "f[j] = u[j]**2/2",
            [
                lp.GlobalArg("f", shape="n+2"),
                lp.GlobalArg("u", shape="n+2"),
            ])

    # kernel 1's f output feeds kernel 0's f input
    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
            data_flow=[("f", 1, 0)])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")

    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(fused0_knl, "inew", 128,
            outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(gpu_knl, "f_subst", "inew_inner",
            fetch_bounding_box=True)

    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
    evt, _ = precomp_knl(queue, u=u, h=h)
def test_fd_demo(): knl = lp.make_kernel( "{[i,j]: 0<=i,j<n}", "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") #assumptions="n mod 16=0") knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.add_prefetch(knl, "u", ["i_inner", "j_inner"], fetch_bounding_box=True, default_tag="l.auto") #n = 1000 #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) knl = lp.set_options(knl, write_cl=True) knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) code, inf = lp.generate_code(knl) print(code) assert "double" not in code
def time_kernel(
        self,
        knl,
        param_dict,
        ):
    """Measure the average wall-clock run time of *knl*.

    :arg knl: the loopy kernel to time.
    :arg param_dict: dictionary of kernel parameters used to create random
        arguments; must not be ``None``.
    :returns: average wall time in seconds over ``self.n_time_trials``
        timed runs, after discarding ``self.n_warmup_time_trials`` warmups.
    :raises ValueError: if *param_dict* is ``None``.
    """
    import time

    import numpy as np

    if param_dict is None:
        raise ValueError(
            "Wall time requires dictionary of kernel parameters.")

    ctx = self.get_cl_context(knl)
    queue = cl.CommandQueue(ctx)

    arg_arrays = create_rand_args(ctx, knl, param_dict)
    knl = lp.set_options(knl, no_numpy=True)
    compiled = lp.CompiledKernel(ctx, knl)

    wtimes = []
    for _ in range(self.n_time_trials + self.n_warmup_time_trials):
        # drain the queue so the timed interval covers only this run
        queue.finish()
        # perf_counter is monotonic and high-resolution; time.time can
        # jump if the system clock is adjusted mid-measurement
        tstart = time.perf_counter()
        evt, out = compiled(queue, **arg_arrays)
        queue.finish()
        wtimes.append(time.perf_counter() - tstart)

    # drop the warmup runs before averaging
    return np.average(wtimes[self.n_warmup_time_trials:])
def test_random123(ctx_factory, tp):
    """Drive the Random123 philox/threefry generators (type suffix *tp*)
    and check that all outputs fall in [0, 1)."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    n = 150000

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        """
        <> key2 = make_uint2(i, 324830944)  {inames=i}
        <> key4 = make_uint4(i, 324830944, 234181, 2233)  {inames=i}
        <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
        <> real, ctr = philox4x32_TYPE(ctr, key2)  {id=realpart,dep=init_ctr}
        <> imag, ctr = threefry4x32_TYPE(ctr, key4)  {dep=init_ctr:realpart}

        out[i, 0] = real.s0 + 1j * imag.s0
        out[i, 1] = real.s1 + 1j * imag.s1
        out[i, 2] = real.s2 + 1j * imag.s2
        out[i, 3] = real.s3 + 1j * imag.s3
        """.replace("TYPE", tp))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
    knl = lp.set_options(knl, write_cl=True)

    evt, (out,) = knl(queue, n=n)

    out = out.get()
    # uniform variates must lie in [0, 1)
    assert (out < 1).all()
    assert (0 <= out).all()
def get_wkb_knl(self):
    """Build a loopy kernel drawing WKB-initialized field modes.

    For each mode (i, j, k), two Rayleigh amplitudes and two random
    phases are combined into left/right-moving modes, producing the
    field ``fk`` and its conjugate-momentum-like companion ``dfk``.
    # assumes rands[0..3, i, j, k] are uniform variates in (0, 1] —
    # TODO confirm against the caller that fills `rands`
    """
    knl = lp.make_kernel(
        "[Nx, Ny, Nz] -> { [i,j,k]: 0<=i<Nx and 0<=j<Ny and 0<=k<Nz }",
        """
        <> amp_1 = sqrt(- log(rands[0, i, j, k]))
        <> amp_2 = sqrt(- log(rands[2, i, j, k]))
        <> phs_1 = exp(1j * 2. * pi * rands[1, i, j, k])
        <> phs_2 = exp(1j * 2. * pi * rands[3, i, j, k])
        <> power = f_power[i, j, k]
        <> Lmode = phs_1 * amp_1 * sqrt(power)
        <> Rmode = phs_2 * amp_2 * sqrt(power)
        <> fk_ = (Lmode + Rmode) / sqrt2
        fk[i, j, k] = fk_
        dfk[i, j, k] = 1j * wk[i, j, k] * (Lmode - Rmode) / sqrt2 - hubble * fk_
        """,
        [
            lp.ValueArg("hubble", self.rdtype),
            lp.GlobalArg("fk, dfk", shape=lp.auto, dtype=self.cdtype),
            ...
        ],
        seq_dependencies=True,
        silenced_warnings=["inferred_iname"],
        lang_version=(2018, 2),
    )
    knl = lp.set_options(knl, return_dict=True)
    return knl
def time_knl(knl, ctx, param_dict):
    """Average wall time of *knl* over 3 timed trials after 2 warmups,
    using freshly generated random arguments."""
    def create_rand_args(ctx, knl, param_dict):
        # build reference args only to derive sizes/strides, then discard
        # them and create random device arguments of the same description
        queue = cl.CommandQueue(ctx)
        info = lp.generate_code_v2(knl).implemented_data_info
        args, arg_data = lp.auto_test.make_ref_args(knl, info, queue, param_dict)
        args.clear()
        del args
        rand_args = lp.auto_test.make_args(knl, info, queue, arg_data, param_dict)
        del arg_data[:]
        del arg_data
        return rand_args

    queue = cl.CommandQueue(ctx)
    trial_wtimes = []
    arg_arrays = create_rand_args(ctx, knl, param_dict)
    knl = lp.set_options(knl, no_numpy=True)
    # 2 warmup trials + 3 timed trials
    for t in range(2 + 3):
        queue.finish()
        tstart = time.time()
        evt, out = knl(queue, **arg_arrays)
        queue.finish()
        tend = time.time()
        trial_wtimes.append(tend - tstart)

    # discard the first 2 (warmup) timings
    return np.average(trial_wtimes[2:])
def test_slab_decomposition_does_not_double_execute(ctx_factory):
    """In-place update a[i] = 2*a[i]: if slab decomposition ran any
    iteration twice, the slabbed kernel would diverge from the reference."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "a[i] = 2*a[i]",
        assumptions="n>=1")
    ref_knl = knl

    for outer_tag in ["for", "g.0"]:
        # restart from the untransformed kernel each round
        knl = ref_knl
        knl = lp.split_iname(knl, "i", 4, slabs=(0, 1),
                inner_tag="unr", outer_tag=outer_tag)
        knl = lp.set_loop_priority(knl, "i_outer")

        a = cl.array.empty(queue, 20, np.float32)
        a.fill(17)
        a_ref = a.copy()
        a_knl = a.copy()

        knl = lp.set_options(knl, write_cl=True)
        print("TEST-----------------------------------------")
        knl(queue, a=a_knl)
        print("REF-----------------------------------------")
        ref_knl(queue, a=a_ref)
        print("DONE-----------------------------------------")

        print("REF", a_ref)
        print("KNL", a_knl)
        assert (a_ref == a_knl).get().all()
        print("_________________________________")
def test_inf_support(ctx_factory, target, dtype):
    """Check that +inf/-inf literals survive code generation on the given
    target and dtype."""
    from loopy.symbolic import parse
    import math

    # See: https://github.com/inducer/loopy/issues/443 for some laughs
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{:}",
        [
            lp.Assignment(parse("out_inf"), math.inf),
            lp.Assignment(parse("out_neginf"), -math.inf)
        ],
        [
            lp.GlobalArg("out_inf", shape=lp.auto, dtype=dtype),
            lp.GlobalArg("out_neginf", shape=lp.auto, dtype=dtype)
        ],
        target=target())

    knl = lp.set_options(knl, "return_dict")

    if target == lp.PyOpenCLTarget:
        _, out_dict = knl(queue)
        # pull device arrays back to the host for the asserts below
        out_dict = {k: v.get() for k, v in out_dict.items()}
    elif target == lp.ExecutableCTarget:
        _, out_dict = knl()
    else:
        raise NotImplementedError("unsupported target")

    assert np.isinf(out_dict["out_inf"])
    assert np.isneginf(out_dict["out_neginf"])
def test_random123(ctx_factory, tp):
    """Variant of the Random123 test with both RNG calls depending only on
    the counter initialization; checks outputs lie in [0, 1)."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    n = 150000

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        """
        <> key2 = make_uint2(i, 324830944)  {inames=i}
        <> key4 = make_uint4(i, 324830944, 234181, 2233)  {inames=i}
        <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
        <> real, ctr = philox4x32_TYPE(ctr, key2)  {dep=init_ctr}
        <> imag, ctr = threefry4x32_TYPE(ctr, key4)  {dep=init_ctr}

        out[i, 0] = real.s0 + 1j * imag.s0
        out[i, 1] = real.s1 + 1j * imag.s1
        out[i, 2] = real.s2 + 1j * imag.s2
        out[i, 3] = real.s3 + 1j * imag.s3
        """.replace("TYPE", tp))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
    knl = lp.set_options(knl, write_cl=True)

    evt, (out, ) = knl(queue, n=n)

    out = out.get()
    # uniform variates must lie in [0, 1)
    assert (out < 1).all()
    assert (0 <= out).all()
def test_opencl_emits_ternary_operators_correctly(ctx_factory, target):
    """Conditional expressions with float conditions must use the floats'
    truthiness (nonzero -> true), not e.g. a truncated int."""
    # See: https://github.com/inducer/loopy/issues/390
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{:}",
        """
        <> tmp1 = 3.1416
        <> tmp2 = 0.000
        y1 = 1729 if tmp1 else 1.414
        y2 = 42 if 2.7183 else 13
        y3 = 127 if tmp2 else 128
        """,
        seq_dependencies=True,
        target=target())

    knl = lp.set_options(knl, "return_dict")

    if target == lp.PyOpenCLTarget:
        evt, out_dict = knl(queue)
    elif target == lp.ExecutableCTarget:
        evt, out_dict = knl()
    else:
        raise NotImplementedError("unsupported target")

    assert out_dict["y1"] == 1729  # tmp1 != 0 -> true branch
    assert out_dict["y2"] == 42    # nonzero literal condition
    assert out_dict["y3"] == 128   # tmp2 == 0 -> false branch
def test_global_parallel_reduction_simpler(ctx_factory, size):
    """Two-stage global parallel reduction over RNG-generated values,
    auto-tested against the untransformed reference kernel."""
    ctx = ctx_factory()

    # known-broken: scheduling is order-sensitive here
    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    knl = lp.make_kernel(
        "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
        """
        <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
        <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
        <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}

        <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)

        result = sum(j, tmp[j])
        """)

    ng = 50
    knl = lp.fix_parameters(knl, ng=ng)

    knl = lp.set_options(knl, write_cl=True)

    ref_knl = knl

    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "l_inner")
    knl = lp.tag_inames(knl, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})
def __set_editor(knl, script):
    """Point $EDITOR at *script* and return *knl* with code editing enabled."""
    os.environ['EDITOR'] = script
    return lp.set_options(knl, edit_code=True)
def test_packing_unpacking(ctx_factory, inline):
    """Pack/unpack sub-array-ref call arguments whose caller and callee
    shapes differ, with and without inlining, and verify the results."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)

    callee1 = lp.make_function(
        "{[i]: 0<=i<6}",
        """
        b[i] = 2*a[i]
        """, name="callee_fn1")

    callee2 = lp.make_function(
        "{[i, j]: 0<=i<2 and 0 <= j < 3}",
        """
        b[i, j] = 3*a[i, j]
        """, name="callee_fn2")

    knl = lp.make_kernel(
        "{[i, j, k]: 0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}",
        """
        [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j])
        [k]: y2[k] = callee_fn2([k]: x2[k])
        """)

    knl = lp.merge([knl, callee1])
    knl = lp.merge([knl, callee2])

    # insert pack/unpack code around the mismatched-shape calls
    knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn1")
    knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn2")

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_fn1")
        knl = lp.inline_callable_kernel(knl, "callee_fn2")

    knl = lp.set_options(knl, "write_cl")
    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(queue, x1=x1, x2=x2)

    y1 = out_dict["y1"].get()
    y2 = out_dict["y2"].get()

    assert np.linalg.norm(2 * x1.get() - y1) / np.linalg.norm(
            2 * x1.get()) < 1e-15
    assert np.linalg.norm(3 * x2.get() - y2) / np.linalg.norm(
            3 * x2.get()) < 1e-15
def test_sci_notation_literal(ctx_factory):
    """Check a scientific-notation literal (1e-12) survives codegen."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    set_kernel = lp.make_kernel(
        """ { [i]: 0<=i<12 } """,
        """ out[i] = 1e-12""")

    set_kernel = lp.set_options(set_kernel, write_cl=True)

    evt, (out, ) = set_kernel(queue)

    assert (np.abs(out.get() - 1e-12) < 1e-20).all()
def test_diff(ctx_factory):
    """Differentiate z w.r.t. x via loopy's diff transform and check the
    Jacobian against directional finite differences at two step sizes."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        """{ [i,j]: 0<=i,j<n }""",
        """
        <> a = 1/(1+sinh(x[i] + y[j])**2)
        z[i] = sum(j, exp(a * x[j]))
        """, name="diff")

    knl = lp.fix_parameters(knl, n=50)

    from loopy.transform.diff import diff_kernel
    #FIXME Is this the correct interface. Does it make sense to take the entire
    #translation unit?
    dknl, diff_map = diff_kernel(knl["diff"], "z", "x")
    dknl = knl.with_kernel(dknl)
    dknl = lp.remove_unused_arguments(dknl)

    dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")

    print(dknl)

    n = 50
    x = np.random.randn(n)
    y = np.random.randn(n)

    dx = np.random.randn(n)

    fac = 1e-1
    h1 = 1e-4
    h2 = h1 * fac

    evt, (z0,) = knl(queue, x=x, y=y)
    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)

    dknl = lp.set_options(dknl, write_cl=True)
    evt, (df,) = dknl(queue, x=x, y=y)

    diff1 = (z1-z0)
    diff2 = (z2-z0)

    diff1_predicted = df.dot(h1*dx)
    diff2_predicted = df.dot(h2*dx)

    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
    print(err1, err2)

    # the relative error should shrink roughly proportionally to h
    assert (err2 < err1 * fac * 1.1).all()
def test_sci_notation_literal(ctx_factory):
    """Verify that a 1e-12 literal makes it through code generation intact."""
    queue = cl.CommandQueue(ctx_factory())

    knl = lp.make_kernel(
        ''' { [i]: 0<=i<12 } ''',
        ''' out[i] = 1e-12''')
    knl = lp.set_options(knl, write_cl=True)

    _evt, (out,) = knl(queue)

    assert (np.abs(out.get() - 1e-12) < 1e-20).all()
def test_indexof(ctx_factory):
    """indexof() must produce the linearized (row-major) index of each
    element."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        """ { [i,j]: 0<=i,j<5 } """,
        """ out[i,j] = indexof(out[i,j])""")

    knl = lp.set_options(knl, write_cl=True)

    (evt, (out, )) = knl(queue)
    out = out.get()

    assert np.array_equal(out.ravel(order="C"), np.arange(25))
def test_clamp(ctx_factory):
    """Run clamp() over a large random float32 array on the device."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    num = 15 * 10**6
    values = cl.clrandom.rand(queue, num, dtype=np.float32)

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = clamp(x[i], a, b)")
    kernel = lp.split_iname(kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")
    kernel = lp.set_options(kernel, write_cl=True)

    _evt, (_out,) = kernel(queue, x=values, a=np.float32(12), b=np.float32(15))
def test_indexof(ctx_factory):
    """indexof() must produce the linearized (row-major) index of each
    element."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<5 } ''',
        ''' out[i,j] = indexof(out[i,j])''')

    knl = lp.set_options(knl, write_cl=True)

    (evt, (out,)) = knl(queue)
    out = out.get()

    assert np.array_equal(out.ravel(order="C"), np.arange(25))
def test_diff(ctx_factory):
    """Differentiate z w.r.t. x via diff_kernel and compare the predicted
    directional derivative against finite differences at two step sizes."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        """{ [i,j]: 0<=i,j<n }""",
        """
        <> a = 1/(1+sinh(x[i] + y[j])**2)
        z[i] = sum(j, exp(a * x[j]))
        """)

    knl = lp.fix_parameters(knl, n=50)

    from loopy.transform.diff import diff_kernel
    dknl, diff_map = diff_kernel(knl, "z", "x")
    dknl = lp.remove_unused_arguments(dknl)

    dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")

    print(dknl)

    n = 50
    x = np.random.randn(n)
    y = np.random.randn(n)

    dx = np.random.randn(n)

    fac = 1e-1
    h1 = 1e-4
    h2 = h1 * fac

    evt, (z0,) = knl(queue, x=x, y=y)
    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)

    dknl = lp.set_options(dknl, write_cl=True)
    evt, (df,) = dknl(queue, x=x, y=y)

    diff1 = (z1-z0)
    diff2 = (z2-z0)

    diff1_predicted = df.dot(h1*dx)
    diff2_predicted = df.dot(h2*dx)

    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
    print(err1, err2)

    # relative error should shrink roughly proportionally to h
    assert (err2 < err1 * fac * 1.1).all()
def test_clamp(ctx_factory):
    """Smoke-test the clamp() builtin over a large device array."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 15 * 10**6
    x = cl.clrandom.rand(queue, n, dtype=np.float32)

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = clamp(x[i], a, b)")

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
    knl = lp.set_options(knl, write_cl=True)

    evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
def test_any_all(ctx_factory):
    """Check any/all reductions: any(i==4) over 0..9 is true, all(j==5)
    is false."""
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i, j]: 0<=i,j<10}",
        """
        out1 = reduce(any, [i], i == 4)
        out2 = reduce(all, [j], j == 5)
        """)
    knl = lp.set_options(knl, return_dict=True)
    _, out_dict = knl(cq)

    assert out_dict["out1"].get()
    assert not out_dict["out2"].get()
def test_empty_reduction(ctx_factory):
    """A sum over an empty domain (0<=j<0) must yield the neutral element 0."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ["{[i]: 0<=i<20}", "[i] -> {[j]: 0<=j<0}"],
        "a[i] = sum(j, j)",
        )
    knl = lp.preprocess_kernel(knl)
    print(knl)

    knl = lp.set_options(knl, write_cl=True)
    evt, (a, ) = knl(queue)

    assert (a.get() == 0).all()
def test_indexof_vec(ctx_factory):
    """Smoke-test indexof_vec() with one axis tagged as a vector axis."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.name.startswith("Portable"):
        # Accurate as of 2015-10-08
        pytest.skip("POCL miscompiles vector code")

    knl = lp.make_kernel(
        ''' { [i,j,k]: 0<=i,j,k<4 } ''',
        ''' out[i,j,k] = indexof_vec(out[i,j,k])''')

    knl = lp.tag_inames(knl, {"i": "vec"})
    knl = lp.tag_data_axes(knl, "out", "vec,c,c")
    knl = lp.set_options(knl, write_cl=True)

    (evt, (out, )) = knl(queue)
def test_nan_support(ctx_factory):
    """NaN literal assignment plus isnan() on a NaN and a finite value."""
    from loopy.symbolic import parse

    ctx = ctx_factory()
    knl = lp.make_kernel(
        "{:}",
        [
            lp.Assignment(parse("a"), np.nan),
            lp.Assignment(parse("b"), parse("isnan(a)")),
            lp.Assignment(parse("c"), parse("isnan(3.14)"))
        ], seq_dependencies=True)

    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(cl.CommandQueue(ctx))

    assert np.isnan(out_dict["a"].get())
    assert out_dict["b"] == 1
    assert out_dict["c"] == 0
def test_diff(ctx_factory):
    """Differentiate z w.r.t. x via diff_kernel and compare the predicted
    directional derivative against finite differences at two step sizes."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        """{ [i,j]: 0<=i,j<n }""",
        """
        <> a = 1/(1+sinh(x[i] + y[j])**2)
        z[i] = sum(j, exp(a * x[j]))
        """)

    knl = lp.fix_parameters(knl, n=50)

    from loopy.transform.diff import diff_kernel
    dknl, diff_map = diff_kernel(knl, "z", "x")
    dknl = lp.remove_unused_arguments(dknl)

    print(dknl)

    n = 50
    x = np.random.randn(n)
    y = np.random.randn(n)

    dx = np.random.randn(n)

    fac = 1e-1
    h1 = 1e-4
    h2 = h1 * fac

    evt, (z0,) = knl(queue, x=x, y=y)
    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)

    dknl = lp.set_options(dknl, write_cl=True)
    evt, (df,) = dknl(queue, x=x, y=y)

    diff1 = (z1-z0)
    diff2 = (z2-z0)

    diff1_predicted = df.dot(h1*dx)
    diff2_predicted = df.dot(h2*dx)

    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
    print(err1, err2)

    # relative error should shrink roughly proportionally to h
    assert (err2 < err1 * fac * 1.1).all()
def test_register_knl_with_hw_axes(ctx_factory, inline):
    """Merge a callee that carries its own hardware-axis tags into a caller
    with different tags; verify z = 2x + 3y elementwise."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 4

    x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
    y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)

    callee_knl = lp.make_function(
        "{[i, j]:0<=i, j < 4}",
        """
        g[i, j] = 2*e[i, j] + 3*f[i, j]
        """, name="linear_combo")

    # callee uses l.0/g.0 on its own i
    callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0",
            outer_tag="g.0")

    caller_knl = lp.make_kernel(
        "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}",
        """
        [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m])
        """, name="caller")

    # caller uses l.1/g.1 on its own i, orthogonal to the callee's axes
    caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1",
            outer_tag="g.1")

    knl = lp.merge([caller_knl, callee_knl])

    knl = lp.set_options(knl, "return_dict")

    if inline:
        knl = lp.inline_callable_kernel(knl, "linear_combo")

    evt, out = knl(queue, x=x_dev, y=y_dev)

    x_host = x_dev.get()
    y_host = y_dev.get()

    assert np.linalg.norm(
            2 * x_host + 3 * y_host - out["z"].get()) / np.linalg.norm(
            2 * x_host + 3 * y_host) < 1e-15
def test_indexof_vec(ctx_factory):
    """Smoke-test indexof_vec() with one axis tagged as a vector axis."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.name.startswith("Portable"):
        # Accurate as of 2015-10-08
        pytest.skip("POCL miscompiles vector code")

    knl = lp.make_kernel(
        ''' { [i,j,k]: 0<=i,j,k<4 } ''',
        ''' out[i,j,k] = indexof_vec(out[i,j,k])''')

    knl = lp.tag_inames(knl, {"i": "vec"})
    knl = lp.tag_data_axes(knl, "out", "vec,c,c")
    knl = lp.set_options(knl, write_cl=True)

    (evt, (out,)) = knl(queue)
def test_pyopencl_target_with_global_temps_with_base_storage(ctx_factory):
    """Two global temporaries sharing base_storage must allocate their
    backing store only once; verified by counting allocated bytes."""
    from pyopencl.tools import ImmediateAllocator

    class RecordingAllocator(ImmediateAllocator):
        # records total bytes handed out so aliasing can be asserted below
        def __init__(self, queue):
            super().__init__(queue)
            self.allocated_nbytes = 0

        def __call__(self, size):
            self.allocated_nbytes += size
            return super().__call__(size)

    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i, j]: 0<=i, j<10}",
        """
        tmp1[i] = 2*i  {id=w_tmp1}
        y[i] = tmp1[i]  {nosync=w_tmp1}
        ... gbarrier
        tmp2[j] = 3*j  {id=w_tmp2}
        z[j] = tmp2[j]  {nosync=w_tmp2}
        """,
        [
            lp.TemporaryVariable("tmp1", base_storage="base",
                                 address_space=lp.AddressSpace.GLOBAL),
            lp.TemporaryVariable("tmp2", base_storage="base",
                                 address_space=lp.AddressSpace.GLOBAL),
            ...
        ],
        seq_dependencies=True)
    knl = lp.tag_inames(knl, {"i": "g.0", "j": "g.0"})
    knl = lp.set_options(knl, "return_dict")

    my_allocator = RecordingAllocator(cq)
    _, out = knl(cq, allocator=my_allocator)

    np.testing.assert_allclose(out["y"].get(), 2 * np.arange(10))
    np.testing.assert_allclose(out["z"].get(), 3 * np.arange(10))
    # tmp1/tmp2 alias "base": one 40-byte allocation, not two
    assert my_allocator.allocated_nbytes == (
        40  # base
        + 40  # y
        + 40  # z
        )
def test_sized_integer_c_codegen(ctx_factory):
    """A TypeCast to int64 must keep 1 << i from truncating to a 32-bit
    int in the generated code (i goes up to 39)."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    from pymbolic import var
    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        [lp.Assignment("a[i]", lp.TypeCast(np.int64, 1) << var("i"))])

    knl = lp.set_options(knl, write_code=True)
    n = 40

    evt, (a, ) = knl(queue, n=n)

    a_ref = 1 << np.arange(n, dtype=np.int64)

    assert np.array_equal(a_ref, a.get())
def test_indexof_vec(ctx_factory):
    """Smoke-test indexof_vec() with a vector axis; skipped on Intel ICDs."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (
            # Accurate as of 2019-11-04
            ctx.devices[0].platform.name.startswith("Intel")):
        pytest.skip("target ICD miscompiles vector code")

    knl = lp.make_kernel(
        """ { [i,j,k]: 0<=i,j,k<4 } """,
        """ out[i,j,k] = indexof_vec(out[i,j,k])""",
        [lp.GlobalArg("out", shape=lp.auto, is_input=False)])

    knl = lp.tag_inames(knl, {"i": "vec"})
    knl = lp.tag_data_axes(knl, "out", "vec,c,c")
    knl = lp.set_options(knl, write_cl=True)

    (evt, (out, )) = knl(queue)
def test_sized_integer_c_codegen(ctx_factory):
    """An int64 TypeCast must keep 1 << i from truncating to 32 bits."""
    from pymbolic import var

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    shift_knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        [lp.Assignment("a[i]", lp.TypeCast(np.int64, 1) << var("i"))])
    shift_knl = lp.set_options(shift_knl, write_code=True)

    n = 40
    expected = 1 << np.arange(n, dtype=np.int64)

    _evt, (a,) = shift_knl(queue, n=n)
    assert np.array_equal(expected, a.get())
def test_empty_reduction(ctx_factory):
    """A sum over an empty domain (0<=j<0) must yield the neutral element 0;
    this variant realizes the reduction explicitly before running."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "{[i]: 0<=i<20}",
            "[i] -> {[j]: 0<=j<0}"
            ],
        "a[i] = sum(j, j)",
        )
    knl = lp.realize_reduction(knl)
    print(knl)

    knl = lp.set_options(knl, write_cl=True)
    evt, (a,) = knl(queue)

    assert (a.get() == 0).all()
def test_make_copy_kernel(ctx_factory):
    """Round-trip an array through an 'f,f,sep' intermediate layout and
    back to 'c,c,c'; the result must equal the input."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    intermediate_format = "f,f,sep"

    a1 = np.random.randn(1024, 4, 3)

    cknl1 = lp.make_copy_kernel(intermediate_format)
    # 'sep' axis length must be fixed at compile time
    cknl1 = lp.fix_parameters(cknl1, n2=3)

    cknl1 = lp.set_options(cknl1, write_cl=True)
    evt, a2 = cknl1(queue, input=a1)

    cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format)
    cknl2 = lp.fix_parameters(cknl2, n2=3)
    evt, a3 = cknl2(queue, input=a2)

    assert (a1 == a3).all()
def test_argmax(ctx_factory):
    """The argmax reduction must return both max |a[i]| and its index."""
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 10000

    knl = lp.make_kernel(
        "{[i]: 0<=i<%d}" % n,
        """
        max_val, max_idx = argmax(i, fabs(a[i]))
        """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    print(lp.preprocess_kernel(knl))
    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

    a = np.random.randn(10000).astype(dtype)
    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
    assert max_val == np.max(np.abs(a))
    # np.where returns a tuple of index arrays; [-1] picks the index array
    assert max_idx == np.where(np.abs(a) == max_val)[-1]
def dense_bprop_input():
    """Build the loopy kernel for backpropagating through a dense layer.

    Given dCost/dOut, computes dCost/dIn = dCost/dOut * dOut/dIn via the
    chain rule; dOut/dIn is just the weight matrix, dOut/dIn[i, j] = w[i,j].
    """
    kernel = lp.make_kernel(
        domains="""
            { [i, j]: 0<=i<input_len and 0<=j<output_len }
        """,
        instructions=[
            """
            d_cost_d_in[i] = sum(j, d_cost_d_out[j] * weights[i, j])
            """,
        ],
        assumptions="output_len,input_len>0"
    )
    return lp.set_options(kernel, 'write_cl')
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse the r/s GNUMA strong-volume Fortran kernels and tune them.

    Parses the "KernelR"/"KernelS" kernels out of strongVolumeKernels.f90,
    fuses them, then applies an escalating sequence of transformations
    gated by ``opt_level`` (0..8): D prefetch, flux precomputation,
    optional ILP via ``ilp_multiple``, Q prefetch, rhsQ buffering, and
    vectorization.  Finally the transformed kernel is auto-tested against
    the untransformed reference.

    NOTE(review): requires strongVolumeKernels.f90 and the
    gnuma_loopy_transforms module to be reachable from the CWD — confirm
    in the test harness.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # The Fortran source uses a "datafloat" placeholder type; pin it to
    # single precision before parsing.
    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
        knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
        ]

    # Tag each kernel's instructions so they stay distinguishable after fusion.
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")

    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
        fix_euler_parameters,
        set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    # e: elements over workgroups, i/j: intra-element nodes over work items.
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Keep the untransformed kernel as the reference for auto_test_vs_ref.
    ref_hsv = hsv

    # tap_hsv snapshots the kernel at the requested opt_level; the final
    # `hsv = tap_hsv` below rolls back to that snapshot.
    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        # Split k so the inner chunk runs as ILP lanes.
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Precompute each r/s flux pair into an explicitly laid-out temporary.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                    "tag:{knl_tag} and reads:{flux_var}"
                    .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst", flux_inames + ilp_inames,
                    temporary_name=flux_store_name,
                    precompute_inames=flux_precomp_inames + flux_ilp_inames,
                    default_tag=None)
            # r- and s-variant temporaries get transposed axis orders.
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give the reader its own per-flux copy of the "n" iname.
            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                    existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    # r- and s-side temporaries are live at disjoint times; share storage.
    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
            fetch_bounding_box=True, default_tag="for",
            init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
            vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Roll back to the snapshot taken at the requested opt_level.
    hsv = tap_hsv

    if 1:
        # Print op/memory statistics for the kernel under test.
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero", "-cl-fast-relaxed-math",
        "-cl-finite-math-only", "-cl-mad-enable", "-cl-no-signed-zeros",
        ])
    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
            quiet=True)

    elapsed = results["elapsed_wall"]
    print("elapsed", elapsed)
# Demo script: run a loopy kernel that adds a tiny double-precision
# constant (1e-12) to a float32 array.
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array  # noqa

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

n = 200
# Device array [0, 1, ..., 199] in single precision.
a = cl.array.arange(queue, n, dtype=np.float32)

# NOTE(review): the iname "k" is declared in the domain but never used by
# the instruction -- presumably intentional for this example; confirm.
knl = lp.make_kernel(
    "{ [i,k]: 0<=i,k<n }",
    "out[i] = a[i] + 1e-12")

# Print the generated OpenCL source when the kernel is built.
knl = lp.set_options(knl, write_cl=True)
evt, (out,) = knl(queue, a=a)
# Demo script: outer product c[i, j] = a[i]*b[j], run once untransformed
# and once with both loops split into 16-wide workgroup tiles.
# (The SETUPEND/SPLITBEGIN/SPLITEND markers delimit documentation excerpts
# and are kept as-is.)
import numpy as np
import pyopencl as cl
import loopy as lp

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

knl = lp.make_kernel(
    "{[i, j]: 0<=i<n and 0<=j<n}",
    "c[i, j] = a[i]*b[j]",
    assumptions="n >= 16")

vec_a = np.arange(200, dtype=np.float32)
vec_b = np.arange(200, dtype=np.float32)

# First run: untransformed kernel, printing the generated code.
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=vec_a, b=vec_b)
# SETUPEND

orig_knl = knl

# SPLITBEGIN
knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
# SPLITEND

# Second run: same result, different generated code after tiling.
knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=vec_a, b=vec_b)
    # NOTE(review): fragment -- the opening of the enclosing
    # `exp_kernel = lp.make_kernel(` call lies above this excerpt.
    # NOTE(review): this snippet is Python 2 (`print` statement below).
    ''' exp[i] = E ** z[i]''',
    assumptions="n>0")

# Reduce the exponentials to a scalar total.
sum_kernel = lp.make_kernel(
    '{ [j]: 0<=j<n }',
    'total = sum(j, exp[j])',
    assumptions='n>0')

# Normalize: out3[k] = exp[k] / total; "total" is declared as a
# zero-dimensional (scalar) global argument.
softmax_kernel = lp.make_kernel(
    '{ [k]: 0<=k<n }',
    'out3[k] = exp[k] / total',
    [
        lp.GlobalArg("total", None, shape=()),
        "..."
    ],
    assumptions='n>0')

# Fuse all three stages into one kernel.
big_honkin_knl = lp.fuse_kernels([exp_kernel, sum_kernel, softmax_kernel])
#big_honkin_knl = lp.fuse_kernels([exp_kernel, sum_kernel])

print softmax_kernel.arg_dict["total"].shape

#big_honkin_knl = lp.tag_inames(big_honkin_knl, dict(i="l.0"))
big_honkin_knl = lp.set_options(big_honkin_knl, write_cl=True)

a = np.random.randn(20)
big_honkin_knl = big_honkin_knl(queue, z=a, E=np.float64(5))
# Kernel # knl = lp.make_kernel( "{ [i]: 0<=i<nx }", """ double(s) := 2*s b[i] = if(i<nx/2, double(a[i]), a[i]) """ ) # transform # --------- wgs = 512 # Work Group Size knl = lp.fix_parameters(knl, nx=nx) knl = lp.set_options(knl, write_cl=True, write_wrapper=False) print(knl) #typed_knl = lp.add_dtypes(knl, dict(a=np.float32)) #code, _ = lp.generate_code(typed_knl) #print(code) # execute # ------- evt, (out,) = knl(queue, a=a, b=b) #a_assert(2*a.get(), b.get()) #print( np.linalg.norm(2*a.get()-out.get())==0 ) print(a.get()) print(b.get())
# Softmax demo: build the result in three steps over fused inames --
# exponentiate, sum, normalize -- and compare against a numpy reference.
#
# Fixes over the previous version:
# - the numpy reference printed x / ex.sum(), which is not softmax;
#   softmax(x) = e**x / sum(e**x), i.e. ex / ex.sum();
# - Python 2 `print` statements converted to the py3 print() calls used
#   elsewhere in this file;
# - "lp sofxmax" label typo corrected.
import loopy as lp
import numpy as np
import pyopencl as cl
import math

knl = lp.make_kernel(
    """{ [i,j,k]: 0<=i<n and 0<=j<n and 0<=k<n } """,
    instructions=[
        '<float32>exp[i] = E ** in_batch[i]',
        '<float32> total = sum(j, exp[j])',
        'out[k] = exp[k] / total'
    ],
    assumptions="n>0")

# Print the generated OpenCL source when the kernel is built.
knl = lp.set_options(knl, 'write_cl')

if __name__ == "__main__":
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    x = np.array([1, 2, 3, 4, 5]).astype(np.float32)

    # Reference: softmax(x) = e**x / sum(e**x).
    ex = math.e ** x
    print('np softmax:', ex / ex.sum())
    print(knl)

    evt, out = knl(queue, in_batch=x, E=np.float32(math.e))
    print('lp softmax:', out)