def test_variable_size_temporary():
    knl = lp.make_kernel(
        '''{ [i,j]: 0<=i,j<n }''',
        '''out[i] = sum(j, a[i,j])''')

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    knl = lp.add_prefetch(knl, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        lp.generate_code(k)

def test_vectorize(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=30))

def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
        #assumptions="n mod 16=0")
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "u",
            ["i_inner", "j_inner"],
            fetch_bounding_box=True,
            default_tag="l.auto")

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)
    assert "double" not in code

def __test(loop_size, vec_width):
    knl = lp.make_kernel(
        '{{[i]: 0 <= i < {}}}'.format(loop_size),
        """
        <> x = 1.0
        a1[0] = a1[0] + x {id=set}
        ... lbarrier {id=wait, dep=set}
        for i
            a1[0] = a1[0] + 1 {id=a1, dep=set:wait, nosync=set}
        end
        """,
        [lp.GlobalArg('a1', shape=(loop_size,), order='C', dtype=np.float32)],
        target=lp.OpenCLTarget(),
        silenced_warnings=['no_device_in_pre_codegen_checks'])
    loopy_opts = type('', (object,), {
        'depth': vec_width,
        'order': 'C',
        'use_atomic_doubles': True})
    knl = lp.split_iname(knl, 'i', vec_width, inner_tag='l.0')

    # feed through deep specializer
    _, ds = get_deep_specializer(loopy_opts, atomic_ids=['a1'],
                                 split_ids=['set'], use_atomics=True,
                                 is_write_race=True, split_size=loop_size)
    knl = ds(knl)

    val = np.minimum(loop_size, vec_width)
    assert 'x / {:.1f}f'.format(val) in lp.generate_code(knl)[0]

def test_divisibility_assumption(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "[n] -> {[i]: 0<=i<n}",
        [
            "b[i] = 2*a[i]"
        ],
        [
            lp.GlobalArg("a", np.float32, shape=("n",)),
            lp.GlobalArg("b", np.float32, shape=("n",)),
            lp.ValueArg("n", np.int32),
        ],
        assumptions="n>=1 and (exists zz: n = 16*zz)")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16)

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "if" not in code

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3})

def test_simple_kernel(self):
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        target=CTarget())

    typed = lp.add_dtypes(knl, {'a': np.float32})
    code, _ = lp.generate_code(typed)

    fn = CompiledKernel(typed)  # noqa

    a, out = np.zeros((2, 10), np.float32)
    a[:] = np.r_[:a.size]
    fn(a, 10, out)
    np.testing.assert_allclose(out, a * 2)

def get_kernel_executor(self, knl, *args, **kwargs):
    code, _ = lp.generate_code(knl)
    if self.no_jit:
        code = '\n'.join([line for line in code.split('\n')
                          if line != '@_lpy_numba.jit'])
    for i, line in enumerate(code.split('\n')):
        print(i + 1, line)
    LOG.debug(code)
    ns = {}
    exec(code, ns)
    return ns[knl.name]

def test_cuda_target():
    from loopy.target.cuda import CudaTarget

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."],
        target=CudaTarget())

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    print(
        lp.generate_code(
            lp.get_one_scheduled_kernel(
                lp.preprocess_kernel(knl)))[0])

def test_multiple_writes_to_local_temporary():
    # Loopy would previously only handle barrier insertion correctly if exactly
    # one instruction wrote to each local temporary. This tests that multiple
    # writes are OK.
    knl = lp.make_kernel(
        "{[i,e]: 0<=i<5 and 0<=e<nelements}",
        """
        <> temp[i, 0] = 17
        temp[i, 1] = 15
        """)
    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        code, _ = lp.generate_code(k)
        print(code)

def get_code(knl, opts=None):
    """
    Returns the device code for a :class:`loopy.LoopKernel`, or fixes already
    generated code.

    Parameters
    ----------
    knl : :class:`loopy.LoopKernel` or str
        The kernel to generate code for.  If knl is a string, it is assumed
        to be pregenerated code, and only the editor script must be called
    opts : :class:`loopy_options`
        The options used in creating the kernel -- used to detect platform
        specific fixes.  Ignored if not supplied

    Returns
    -------
    code : str
        Generated device code

    Notes
    -----
    The kernel's target and name should be set for proper functioning
    """

    if isinstance(knl, str):
        code = knl
    else:
        code, _ = lp.generate_code(knl)

    extra_subs = {}
    if opts is None:
        # ignore
        pass
    elif opts.lang == 'opencl' and ('intel' in opts.platform.name.lower()
            and ((opts.order == 'C' and opts.width)
                 or (opts.order == 'F' and opts.depth)
                 or (opts.order == 'F' and opts.width))):
        # If we end up here, this is a finite-difference Jacobian on an Intel
        # OpenCL platform, hence we have to tell the codefixer about the Intel
        # bug: https://software.intel.com/en-us/forums/opencl/topic/748841
        extra_subs[
            r'__kernel void __attribute__ \(\(reqd_work_group_size\(\d+, 1, 1'
            r'\)\)\) species_rates_kernel'] = r'void species_rates_kernel'

    return codefix('stdin', text_in=code, extra_subs=extra_subs)

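# Hedged usage sketch (not part of the original module): get_code accepts either
# a LoopKernel or an already generated code string; `opts` is only consulted for
# the Intel OpenCL workaround above. The kernel below is purely illustrative and
# relies on numpy being imported as np; a real caller would also set the kernel's
# target and name as the docstring notes.
_example_knl = lp.make_kernel("{ [i]: 0<=i<n }", "out[i] = 2*a[i]")
_example_knl = lp.add_and_infer_dtypes(_example_knl, {"a": np.float32})
_device_code = get_code(_example_knl)   # generate device code, then run codefix
_refixed_code = get_code(_device_code)  # or only run codefix on existing code
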
def test_eq_constraint(ctx_factory):
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<= i,j < 32}",
        [
            "a[i] = b[i]"
        ],
        [
            lp.GlobalArg("a", np.float32, shape=(1000,)),
            lp.GlobalArg("b", np.float32, shape=(1000,))
        ])

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for knl in kernel_gen:
        print(lp.generate_code(knl))

def test_type_inference_no_artificial_doubles(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> bb = a[i] - b[i]
        c[i] = bb
        """,
        [
            lp.GlobalArg("a", np.float32, shape=("n",)),
            lp.GlobalArg("b", np.float32, shape=("n",)),
            lp.GlobalArg("c", np.float32, shape=("n",)),
            lp.ValueArg("n", np.int32),
        ],
        assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "double" not in code

def __init__(self, knl: lp.LoopKernel, comp: Compiler = None):
    assert isinstance(knl.target, CTarget)
    self.knl = knl
    self.code, _ = lp.generate_code(knl)
    self.comp = comp or Compiler()
    self.dll = self.comp.build(self.code)

    self.func_decl, = generate_header(knl)
    self._arg_info = []
    # TODO knl.args[:].dtype is sufficient
    self._visit_func_decl(self.func_decl)
    self.name = self.knl.name

    restype = self.func_decl.subdecl.typename
    if restype == 'void':
        self.restype = None
    else:
        raise ValueError('Unhandled restype %r' % (restype,))

    self._fn = getattr(self.dll, self.name)
    self._fn.restype = self.restype
    self._fn.argtypes = [ctype for name, ctype in self._arg_info]
    self._prepared_call_cache = weakref.WeakKeyDictionary()

def _dtype_and_code(self, knl, **extra_dtypes):
    dtypes = {'in': np.float32, 'out': np.float32}
    dtypes.update(extra_dtypes)
    knl = lp.add_dtypes(knl, dtypes)
    code, _ = lp.generate_code(knl)
    return code

def code(self, *args, **kwargs):
    knl = self.kernel(*args, **kwargs)
    code, _ = generate_code(knl)
    return code

def test_laplacian_lmem_ilp(ctx_factory):
    # This does not lead to practical/runnable code (out of lmem), but it's an
    # excellent stress test for the code generator. :)

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)  # K - run-time symbolic

    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n,
            [
                "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
                "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
                "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",

                "lap[e,i,j,k] = "
                " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
                lp.GlobalArg("G", dtype, shape=(6,) + field_shape, order=order),
                lp.GlobalArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap", assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])

    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    #print seq_knl
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    for knl in kernel_gen:
        print(lp.generate_code(knl))

import loopy as lp
import numpy as np

k = lp.make_kernel([
    "{ [i] : 0 <= i < m }",
    "{ [j] : 0 <= j < length }"],
    """
    for i
        <> rowstart = rowstarts[i]
        <> rowend = rowstarts[i+1]
        <> length = rowend - rowstart
        y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]])
    end
    """)

k = lp.add_and_infer_dtypes(k, {
    "values,x": np.float64,
    "rowstarts,colindices": k.index_dtype
    })
print(lp.generate_code(k)[0])

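# Hedged follow-up sketch (an assumption, not part of the original script): the
# row loop "i" of the CSR matrix-vector product above can be transformed with
# the same machinery the tests in this file use, e.g. split and mapped to a
# work-group axis before regenerating code. The inner "j" loop keeps its
# data-dependent bound, so this is untested here and only illustrative.
k_split = lp.split_iname(k, "i", 16, outer_tag="g.0")
print(lp.generate_code(k_split)[0])
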
def get_kernel_executor(self, knl, *args, **kwargs):
    code, _ = lp.generate_code(knl)
    LOG.debug(code)
    ns = {}
    exec(code, ns)
    return ns[knl.name]