def test_to_batched_temp(ctx_factory):
    """Check that lp.to_batched adds a batch axis to out/x but leaves the
    private scalar temporary ``cnst`` unbatched (temp_var_scope API era)."""
    ctx = ctx_factory()
    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' cnst = 2.0
        out[i] = sum(j, cnst*a[i,j]*x[j])''',
        [lp.TemporaryVariable(
            "cnst", dtype=np.float32,
            shape=(),
            scope=lp.temp_var_scope.PRIVATE), '...'])
    knl = lp.add_and_infer_dtypes(knl,
            dict(out=np.float32, x=np.float32, a=np.float32))
    # reference computes the same thing with the constant inlined
    ref_knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl,
            dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl.temporary_variables['cnst'].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(
            bref_knl, ctx, bknl,
            parameters=dict(a=a, x=x, n=5, nbatches=7))
def test_to_batched_temp(ctx_factory):
    """Check that lp.to_batched leaves a private scalar temporary unbatched
    (AddressSpace / translation-unit API era: kernel looked up by name)."""
    ctx = ctx_factory()
    knl = lp.make_kernel(
        """ { [i,j]: 0<=i,j<n } """,
        """ cnst = 2.0
        out[i] = sum(j, cnst*a[i,j]*x[j])""",
        [
            lp.TemporaryVariable("cnst",
                dtype=np.float32,
                shape=(),
                address_space=lp.AddressSpace.PRIVATE),
            "..."
        ])
    knl = lp.add_and_infer_dtypes(
        knl, dict(out=np.float32, x=np.float32, a=np.float32))
    # reference computes the same thing with the constant inlined
    ref_knl = lp.make_kernel(
        """ { [i,j]: 0<=i,j<n } """,
        """out[i] = sum(j, 2.0*a[i,j]*x[j])""")
    ref_knl = lp.add_and_infer_dtypes(
        ref_knl, dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl["loopy_kernel"].temporary_variables["cnst"].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(bref_knl, ctx, bknl,
        parameters=dict(a=a, x=x, n=5, nbatches=7))
def test_to_batched(ctx_factory):
    """Compare a to_batched-transformed mat-vec kernel against a
    hand-written batched reference kernel, executing both."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' out[i] = sum(j, a[i,j]*x[j])''')
    knl = lp.add_and_infer_dtypes(knl,
            dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")

    # hand-written reference: batch index k added to out and x only
    ref_knl = lp.make_kernel(
        ''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''',
        '''out[k, i] = sum(j, a[i,j]*x[k, j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl,
            dict(out=np.float32, x=np.float32, a=np.float32))

    a = np.random.randn(5, 5).astype(np.float32)
    x = np.random.randn(7, 5).astype(np.float32)

    # Running both the kernels
    evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
    evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)

    # checking that the outputs are same
    assert np.linalg.norm(out1-out2) < 1e-15
def test_to_batched_temp(ctx_factory):
    """Check that lp.to_batched leaves a private scalar temporary unbatched
    (temp_var_scope API era, list-style TemporaryVariable argument)."""
    ctx = ctx_factory()
    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' cnst = 2.0
        out[i] = sum(j, cnst*a[i,j]*x[j])''',
        [
            lp.TemporaryVariable("cnst",
                dtype=np.float32,
                shape=(),
                scope=lp.temp_var_scope.PRIVATE),
            '...'
        ])
    knl = lp.add_and_infer_dtypes(
        knl, dict(out=np.float32, x=np.float32, a=np.float32))
    # reference computes the same thing with the constant inlined
    ref_knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
    ref_knl = lp.add_and_infer_dtypes(
        ref_knl, dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl.temporary_variables['cnst'].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(bref_knl, ctx, bknl,
        parameters=dict(a=a, x=x, n=5, nbatches=7))
def test_to_batched(ctx_factory):
    """Compare a to_batched-transformed mat-vec kernel against a
    hand-written batched reference kernel, executing both."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' out[i] = sum(j, a[i,j]*x[j])''')
    knl = lp.add_and_infer_dtypes(
        knl, dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")

    # hand-written reference: batch index k added to out and x only
    ref_knl = lp.make_kernel(
        ''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''',
        '''out[k, i] = sum(j, a[i,j]*x[k, j])''')
    ref_knl = lp.add_and_infer_dtypes(
        ref_knl, dict(out=np.float32, x=np.float32, a=np.float32))

    a = np.random.randn(5, 5).astype(np.float32)
    x = np.random.randn(7, 5).astype(np.float32)

    # Running both the kernels
    evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
    evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)

    # checking that the outputs are same
    assert np.linalg.norm(out1 - out2) < 1e-15
def test_diamond_tiling(ctx_factory, interactive=False):
    """Diamond-tile a 1D wave-equation stencil via lp.map_domain.

    In non-interactive mode the tiled kernel is auto-tested against the
    untiled reference; in interactive mode it is run on a Gaussian pulse
    and the result plotted.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    # second-order-in-time wave update; u carries two time levels
    ref_knl = lp.make_kernel(
        "[nx,nt] -> {[ix, it]: 1<=ix<nx-1 and 0<=it<nt}",
        """
        u[ix, it+2] = (
            2*u[ix, it+1]
            + dt**2/dx**2 * (u[ix+1, it+1] - 2*u[ix, it+1] + u[ix-1, it+1])
            - u[ix, it])
        """)

    knl_for_transform = ref_knl

    ref_knl = lp.prioritize_loops(ref_knl, "it, ix")

    import islpy as isl
    # diamond tiling with 16-wide tiles; tparity (0 or 1) distinguishes
    # the two interleaved families of tiles
    m = isl.BasicMap(
        "[nx,nt] -> {[ix, it] -> [tx, tt, tparity, itt, itx]: "
        "16*(tx - tt) + itx - itt = ix - it and "
        "16*(tx + tt + tparity) + itt + itx = ix + it and "
        "0<=tparity<2 and 0 <= itx - itt < 16 and 0 <= itt+itx < 16}")
    knl = lp.map_domain(knl_for_transform, m)
    knl = lp.prioritize_loops(knl, "tt,tparity,tx,itt,itx")

    if interactive:
        nx = 43
        u = np.zeros((nx, 200))
        x = np.linspace(-1, 1, nx)
        dx = x[1] - x[0]
        # Gaussian initial condition in both starting time levels
        u[:, 0] = u[:, 1] = np.exp(-100 * x**2)

        u_dev = cl.array.to_device(queue, u)
        knl(queue, u=u_dev, dx=dx, dt=dx)

        u = u_dev.get()
        import matplotlib.pyplot as plt
        plt.imshow(u.T)
        plt.show()
    else:
        types = {"dt,dx,u": np.float64}
        knl = lp.add_and_infer_dtypes(knl, types)
        ref_knl = lp.add_and_infer_dtypes(ref_knl, types)

        lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={
            "nx": 200, "nt": 300, "dx": 1, "dt": 1
            })
def test_precompute_with_preexisting_inames(ctx_factory):
    """precompute into caller-specified inames: ii,jj are created by the
    first precompute and reused by the second (set_loop_priority API era)."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    # second precompute reuses the now-existing ii,jj inames
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(E=200))
def test_precompute_nested_subst(ctx_factory):
    """precompute a substitution rule (D) whose body references another
    rule (E); E must be rewritten onto the new precompute inames."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    assert len([
        rule_name
        for rule_name in knl.substitutions
        if rule_name.startswith("E")]) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl,
        parameters=dict(n=12345))
def test_vectorize(ctx_factory):
    """Vectorize an elementwise kernel: split the array dim by 4, tag the
    inner axis 'vec'; reference uses 'unr' (unroll) instead."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    # slabs=(0, 1) peels the ragged tail so the main loop is full vectors
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
def test_alias_temporaries(ctx_factory):
    """alias_temporaries makes the three precompute buffers share storage;
    the result must still match the unaliased reference."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        times2(i) := 2*a[i]
        times3(i) := 3*a[i]
        times4(i) := 4*a[i]
        x[i] = times2(i)
        y[i] = times3(i)
        z[i] = times4(i)
        """)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")

    knl = lp.precompute(knl, "times2", "i_inner")
    knl = lp.precompute(knl, "times3", "i_inner")
    knl = lp.precompute(knl, "times4", "i_inner")

    # names times{2,3,4}_0 are the temporaries created by precompute
    knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"])

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
def test_op_counter_bitwise():
    """Bitwise-op counting with the old get_op_poly API; also checks that
    an absent dtype (float64) evaluates to zero."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int64, h=np.int64))

    poly = get_op_poly(knl)
    n = 10
    m = 10
    l = 10
    param_values = {'n': n, 'm': m, 'l': l}
    i32 = poly.dict[np.dtype(np.int32)].eval_with_dict(param_values)
    i64 = poly.dict[np.dtype(np.int64)].eval_with_dict(param_values)
    # query for a dtype that never occurs — must count as zero
    not_there = poly[np.dtype(np.float64)].eval_with_dict(param_values)
    print(poly.dict)
    assert i32 == n*m + n*m*l
    assert i64 == 2*n*m
    assert not_there == 0
def build_loopy_kernel_A_text():
    """Generate C source/header text for the element-tensor kernel
    A[i,j] = c * sum_k B[k,i]*B[k,j] with n=3, m=2 fixed.

    Returns (kernel name, example call string, C source, C header).
    """
    knl_name = "kernel_tensor_A"

    knl = lp.make_kernel(
        "{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
        """
        A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """,
        name=knl_name,
        assumptions="n >= 1 and m >= 1",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    knl = lp.add_and_infer_dtypes(knl, {
        "A": np.dtype(np.double),
        "B": np.dtype(np.double),
        "c": np.dtype(np.double)
        })
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")

    #print(knl)

    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    # loopy emits GNU-style __restrict__; plain C99 spells it restrict
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
def test_sparse_matmul(self):
    "Tests how to do sparse indexing w/ loop."
    target = NumbaTarget()
    knl = lp.make_kernel(
        [
            '{[i]: 0 <= i < n}',
            # note loop bounded by jlo jhi
            '{[j]: jlo <= j < jhi}'
        ],
        # which are set as instructions
        """
        <> jlo = row[i]
        <> jhi = row[i + 1]
        out[i] = sum(j, dat[j] * vec[col[j]])
        """,
        'n nnz row col dat vec out'.split(),
        target=target)
    knl = lp.add_and_infer_dtypes(knl, {
        'out,dat,vec': np.float32,
        'col,row,n,nnz': np.uintc,
    })
    # col and dat have uninferrable shape
    knl.args[3].shape = pm.var('nnz'),
    knl.args[4].shape = pm.var('nnz'),
    from scipy.sparse import csr_matrix
    n = 64
    # random ~10%-dense CSR matrix as test fixture
    mat = csr_matrix(np.ones((64, 64)) * (np.random.rand(64, 64) < 0.1))
    row = mat.indptr.astype(np.uintc)
    col = mat.indices.astype(np.uintc)
    dat = mat.data.astype(np.float32)
    out, vec = np.random.rand(2, n).astype(np.float32)
    nnz = mat.nnz
    knl(n, nnz, row, col, dat, vec, out)
    # compare against scipy's sparse mat-vec product
    np.testing.assert_allclose(out, mat * vec, 1e-5, 1e-6)
def test_barrier_counter_barriers():
    """Count local barriers in a k-split kernel with a shared temporary
    (old get_synchronization_poly API)."""
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
            [
                """
                c[i,j,k] = 2*a[i,j,k] {id=first}
                e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first}
                """
            ],
            [
                lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
                "..."
            ],
            name="weird2",
            )
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
    knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")
    poly = lp.get_synchronization_poly(knl)
    print(poly)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    barrier_count = poly["barrier_local"].eval_with_dict(params)
    # two barriers per (i, j) iteration: 50 * 10 * 2
    assert barrier_count == 50*10*2
def test_precompute_confusing_subst_arguments(ctx_factory):
    """precompute where the rule argument name shadows an iname: D(i) is
    invoked as D(j), with j also appearing in precompute_outer_inames."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        D(i):=a[i+1]-a[i]
        b[i,j] = D(j)
        """)

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", sweep_inames="j",
            precompute_outer_inames="j, i_inner, i_outer")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=12345))
def test_precompute_with_preexisting_inames(ctx_factory):
    """precompute into caller-specified inames: ii,jj are created by the
    first precompute and reused by the second (prioritize_loops API era)."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    # second precompute reuses the now-existing ii,jj inames
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.prioritize_loops(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(E=200))
def test_fd_demo(): knl = lp.make_kernel( "{[i,j]: 0<=i,j<n}", "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") #assumptions="n mod 16=0") knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.add_prefetch(knl, "u", ["i_inner", "j_inner"], fetch_bounding_box=True, default_tag="l.auto") #n = 1000 #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) knl = lp.set_options(knl, write_cl=True) knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) code, inf = lp.generate_code(knl) print(code) assert "double" not in code
def test_op_counter_triangular_domain():
    """f64 mul count over the triangular domain {i<j}: exact count (78)
    when isl can count (BasicSet.card present), otherwise the larger
    fallback estimate (144). Subgroup-granularity variant."""
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    op_map = lp.get_op_map(knl, subgroup_size=SGS,
            count_redundant_work=True)[lp.Op(np.float64, 'mul', CG.SUBGROUP)]
    value_dict = dict(m=13, n=200)
    flops = op_map.eval_with_dict(value_dict)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    if expect_fallback:
        assert flops == 144 * n_subgroups
    else:
        assert flops == 78 * n_subgroups
def test_op_counter_basic():
    """Per-dtype/per-op counts for a mixed f32/f64 kernel (old
    get_op_poly API)."""
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = lp.get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
    # int32 adds presumably come from the two k+1 index computations
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32add == f32mul == f32div == n * m * l
    assert f64mul == n * m
    assert i32add == n * m * 2
def test_gmem_access_counter_consec():
    """Consecutive (coalesced) global-memory access counts when k is
    tagged l.0 (old get_gmem_access_poly API)."""
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*(2+h[i,k])
                """
            ],
            name="consec", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f64consec = poly[
                    (np.dtype(np.float64), 'consecutive', 'load')
                    ].eval_with_dict(params)
    f32consec = poly[
                    (np.dtype(np.float32), 'consecutive', 'load')
                    ].eval_with_dict(params)
    # loads: g, h (f64); a twice + b (f32, counted per access)
    assert f64consec == 2 * n * m
    assert f32consec == 3 * n * m * l

    f64consec = poly[
                    (np.dtype(np.float64), 'consecutive', 'store')
                    ].eval_with_dict(params)
    f32consec = poly[
                    (np.dtype(np.float32), 'consecutive', 'store')
                    ].eval_with_dict(params)
    assert f64consec == n * m
    assert f32consec == n * m * l
def test_gmem_access_counter_logic():
    """Uniform global access counts for a kernel with a ternary if()
    (old lp.get_gmem_access_poly API)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[
            (np.dtype(np.float32), 'uniform', 'load')
            ].eval_with_dict(params)
    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'load')
            ].eval_with_dict(params)
    # g is read in both branches; h only in the else branch
    assert f32 == 2 * n * m
    assert f64 == n * m

    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'store')
            ].eval_with_dict(params)
    # e stores as f64 (result type of the mixed-precision if)
    assert f64 == n * m
def test_gmem_access_counter_specialops():
    """Uniform global access counts for a kernel using % and **
    (old lp.get_gmem_access_poly API)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[
            (np.dtype(np.float32), 'uniform', 'load')
            ].eval_with_dict(params)
    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'load')
            ].eval_with_dict(params)
    assert f32 == 2 * n * m * l
    assert f64 == 2 * n * m

    f32 = poly[
            (np.dtype(np.float32), 'uniform', 'store')
            ].eval_with_dict(params)
    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'store')
            ].eval_with_dict(params)
    assert f32 == n * m * l
    assert f64 == n * m
def test_gmem_access_counter_basic():
    """Uniform global load/store counts per dtype
    (old lp.get_gmem_access_poly API)."""
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[
            (np.dtype(np.float32), 'uniform', 'load')
            ].eval_with_dict(params)
    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'load')
            ].eval_with_dict(params)
    # a counted twice (two uses) + b once; g and h once each
    assert f32 == 3 * n * m * l
    assert f64 == 2 * n * m

    f32 = poly[
            (np.dtype(np.float32), 'uniform', 'store')
            ].eval_with_dict(params)
    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'store')
            ].eval_with_dict(params)
    assert f32 == n * m * l
    assert f64 == n * m
def test_op_counter_bitwise():
    """Bitwise/shift op counts per dtype (old lp.get_op_poly API)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.int32, b=np.int32, g=np.int64, h=np.int64))
    poly = lp.get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
    # first statement: | and & are int32 'bw'; + is int32 'add'
    assert i32add == n * m + n * m * l
    assert i32bw == 2 * n * m * l
    # second statement: ^ and ~ are int64 'bw'; << and >> are 'shift'
    assert i64bw == 2 * n * m
    assert i64add == i64mul == n * m
    assert i64shift == 2 * n * m
def test_op_counter_specialops():
    """Counts for %, **, and builtin functions (rsqrt, sin) reported as
    'func:...' entries (old lp.get_op_poly API)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = lp.get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    f64rsqrt = poly[(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
    f64sin = poly[(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
    # % counts as a division alongside the explicit /
    assert f32div == 2 * n * m * l
    assert f32mul == f32add == n * m * l
    assert f64add == 3 * n * m
    assert f64pow == i32add == f64rsqrt == f64sin == n * m
def test_alias_temporaries(ctx_factory):
    """alias_temporaries makes the three precompute buffers share storage;
    the result must still match the unaliased reference."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        times2(i) := 2*a[i]
        times3(i) := 3*a[i]
        times4(i) := 4*a[i]
        x[i] = times2(i)
        y[i] = times3(i)
        z[i] = times4(i)
        """)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")

    knl = lp.precompute(knl, "times2", "i_inner")
    knl = lp.precompute(knl, "times3", "i_inner")
    knl = lp.precompute(knl, "times4", "i_inner")

    # names times{2,3,4}_0 are the temporaries created by precompute
    knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
        parameters=dict(n=30))
def test_global_temporary(ctx_factory):
    """Forcing temporary c into global scope must split code generation
    into two device programs (a flush point between writers and readers)."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        """
        <> c[i] = a[i + 1]
        out[i] = c[i]
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32
        })
    knl = lp.set_temporary_scope(knl, "c", "global")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)
    # global temporary forces a kernel split -> two device programs
    assert len(cgr.device_programs) == 2

    #print(cgr.device_code())
    #print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_gmem_access_counter_bitwise():
    """Uniform global access counts for the bitwise kernel, all int32
    (old get_gmem_access_poly API)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int32, h=np.int32))

    poly = get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32 = poly[
            (np.dtype(np.int32), 'uniform', 'load')
            ].eval_with_dict(params)
    # g twice + h twice (per i,k) plus a + b (per i,j,k)
    assert i32 == 4*n*m+2*n*m*l

    i32 = poly[
            (np.dtype(np.int32), 'uniform', 'store')
            ].eval_with_dict(params)
    assert i32 == n*m+n*m*l
def test_gmem_access_counter_logic():
    """Uniform global access counts for a kernel with a ternary if()
    (module-level get_gmem_access_poly variant)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    poly = get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[
            (np.dtype(np.float32), 'uniform', 'load')
            ].eval_with_dict(params)
    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'load')
            ].eval_with_dict(params)
    # g is read in both branches; h only in the else branch
    assert f32 == 2*n*m
    assert f64 == n*m

    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'store')
            ].eval_with_dict(params)
    assert f64 == n*m
def test_ispc_streaming_stores():
    """Smoke-test ISPC code generation for a STREAM-triad kernel with
    streaming stores (code is generated, not executed)."""
    stream_dtype = np.float32
    index_dtype = np.int32

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "a[i] = b[i] + scalar * c[i]",
            target=lp.ISPCTarget(), index_dtype=index_dtype,
            name="stream_triad")

    # renamed from "vars" so the builtin vars() is not shadowed
    stream_args = ["a", "b", "c", "scalar"]

    knl = lp.assume(knl, "n>0")
    # slabs=(0, 1) peels the ragged tail of the 2**18-wide split
    knl = lp.split_iname(
        knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")

    # mark every instruction as a streaming store
    knl = lp.tag_instructions(knl, "!streaming_store")

    knl = lp.add_and_infer_dtypes(knl, {
        var: stream_dtype for var in stream_args
        })

    knl = lp.set_argument_order(knl, stream_args + ["n"])

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    lp.generate_code_v2(knl).all_code()
def test_op_counter_basic():
    """Per-dtype/per-op counts with subgroup granularity
    (get_op_map + lp.Op API)."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                                  dict(a=np.float32, b=np.float32,
                                       g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == f32div == n*m*ell*n_subgroups
    assert f64mul == n*m*n_subgroups
    assert i32add == n*m*2*n_subgroups
def test_op_counter_triangular_domain():
    """f64 mul count over the triangular domain {i<j}: exact count (78)
    when isl can count (BasicSet.card present), otherwise the larger
    fallback estimate (144). Old get_op_poly variant."""
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
    value_dict = dict(m=13, n=200)
    flops = poly.eval_with_dict(value_dict)

    if expect_fallback:
        assert flops == 144
    else:
        assert flops == 78
def test_op_counter_basic():
    """Per-dtype/per-op counts with work-item granularity
    (get_op_map + lp.Op API)."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                                  dict(a=np.float32, b=np.float32,
                                       g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, count_redundant_work=True)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
                    ].eval_with_dict(params)
    assert f32add == f32mul == f32div == n*m*ell
    assert f64mul == n*m
    assert i32add == n*m*2
def add_types(knl):
    """Attach float32 dtypes to the kernel's w, J, DPsi and DFinv args."""
    dtype_map = {name: np.float32 for name in ("w", "J", "DPsi", "DFinv")}
    return lp.add_and_infer_dtypes(knl, dtype_map)
def test_vectorize(ctx_factory):
    """Vectorize an elementwise kernel: split the array dim by 4, tag the
    inner axis 'vec'; reference uses 'unr' (unroll) instead."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    # slabs=(0, 1) peels the ragged tail so the main loop is full vectors
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
        parameters=dict(n=30))
def test_precompute_nested_subst(ctx_factory):
    """precompute a substitution rule (D) whose body references another
    rule (E); E must be rewritten onto the new precompute inames."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    assert len([
        rule_name
        for rule_name in knl.substitutions
        if rule_name.startswith("E")
        ]) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
        parameters=dict(n=12345))
def test_kernel_splitting_with_loop(ctx_factory):
    """A sequential k loop around split/tagged i must still yield two
    device programs after scheduling and code generation."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        c[k,i] = a[k, i + 1]
        out[k,i] = c[k,i]
        """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32,
                "n": np.int32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_global_temporary(ctx_factory):
    """Forcing temporary c into global scope must split code generation
    into two device programs (a flush point between writers and readers)."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        """
        <> c[i] = a[i + 1]
        out[i] = c[i]
        """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32,
                "n": np.int32})
    knl = lp.set_temporary_scope(knl, "c", "global")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)
    # global temporary forces a kernel split -> two device programs
    assert len(cgr.device_programs) == 2

    #print(cgr.device_code())
    #print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_precompute_with_preexisting_inames_fail():
    """Reusing precompute_inames ii,jj for a sweep of incompatible extent
    (k ranges over 2*n while ii,jj were sized for n) must raise LoopyError."""
    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    with pytest.raises(lp.LoopyError):
        lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
                precompute_inames="ii,jj")
def test_barrier_counter_barriers():
    """Count local barriers in a k-split kernel with a shared temporary
    (get_synchronization_map API)."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
            [
                """
                c[i,j,k] = 2*a[i,j,k] {id=first}
                e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first}
                """
            ],
            [
                lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
                "..."
            ],
            name="weird2",
            )
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
    knl = lp.split_iname(knl, "k", 128, inner_tag="l.0")
    sync_map = lp.get_synchronization_map(knl)
    print(sync_map)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    barrier_count = sync_map["barrier_local"].eval_with_dict(params)
    # two barriers per (i, j) iteration: 50 * 10 * 2
    assert barrier_count == 50*10*2
def test_op_counter_logic():
    """Op counts inside a ternary if(): both branches are counted
    (old module-level get_op_poly variant)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32mul == n*m
    assert f64div == 2*n*m  # TODO why?
    assert f64add == n*m
    assert i32add == n*m
def test_op_counter_reduction():
    """Op counts for a serial matmul reduction, plus group_by('dtype')
    aggregation across op kinds."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul_serial", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == n*m*ell*n_subgroups

    # group_by('dtype') merges adds and muls into a single f32 entry
    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    assert f32 == f32add + f32mul
def test_op_counter_triangular_domain():
    """Count float64 multiplies over a triangular domain.

    When islpy exposes barvinok's ``BasicSet.card``, loopy counts the
    triangular domain exactly (78 for m=13, n=200); otherwise it falls back
    to a rectangular over-approximation (144).
    """
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    import islpy as isl
    # The original assigned expect_fallback = False, then re-assigned it on
    # both the except and else paths of a try block — the pre-assignment was
    # dead code. A hasattr check expresses the capability probe directly.
    expect_fallback = not hasattr(isl.BasicSet, "card")

    op_map = lp.get_op_map(
            knl,
            count_redundant_work=True
            )[lp.Op(np.float64, 'mul', CG.WORKITEM)]
    value_dict = dict(m=13, n=200)
    flops = op_map.eval_with_dict(value_dict)

    if expect_fallback:
        # rectangular over-approximation of the triangle
        assert flops == 144
    else:
        # exact triangular count via barvinok
        assert flops == 78
def test_op_counter_specialops():
    """Op counts for modulo, division and power (legacy get_op_poly API)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl,
            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    poly = get_op_poly(knl)
    dims = {"n": 512, "m": 256, "l": 128}

    def count(dtype, op):
        # look up one (dtype, op-name) entry and evaluate it at dims
        return poly[(np.dtype(dtype), op)].eval_with_dict(dims)

    nml = dims["n"]*dims["m"]*dims["l"]
    nm = dims["n"]*dims["m"]

    assert count(np.float32, "div") == 2*nml
    assert count(np.float32, "mul") == count(np.float32, "add") == nml
    assert count(np.float64, "add") == 2*nm
    assert count(np.float64, "pow") == count(np.int32, "add") == nm
def test_op_counter_logic():
    """Count ops inside an if()/logic expression (work-item granularity)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if( not(k<ell-2) and k>6 or k/2==ell, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    op_map = lp.get_op_map(knl, count_redundant_work=True)
    dims = {"n": 512, "m": 256, "ell": 128}

    def count(dtype, op):
        # per-work-item count of one (dtype, op) pair, evaluated at dims
        return op_map[lp.Op(np.dtype(dtype), op, CG.WORKITEM)].eval_with_dict(dims)

    nm = dims["n"]*dims["m"]
    assert count(np.float32, "mul") == nm
    assert count(np.float64, "div") == 2*nm  # TODO why?
    assert count(np.float64, "add") == nm
    assert count(np.int32, "add") == nm
def test_op_counter_bitwise():
    """Count integer add/mul and bitwise/shift ops (legacy get_op_poly API)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.int32, b=np.int32, g=np.int64, h=np.int64))

    poly = get_op_poly(knl)
    dims = {"n": 512, "m": 256, "l": 128}

    def count(dtype, op):
        return poly[(np.dtype(dtype), op)].eval_with_dict(dims)

    nml = dims["n"]*dims["m"]*dims["l"]
    nm = dims["n"]*dims["m"]

    assert count(np.int32, "add") == nm + nml
    assert count(np.int32, "bw") == 2*nml
    assert count(np.int64, "bw") == 2*nm
    assert count(np.int64, "add") == count(np.int64, "mul") == nm
    assert count(np.int64, "shift") == 2*nm
def test_op_counter_logic():
    """Count ops inside an if()/logic expression at sub-group granularity."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if( not(k<ell-2) and k>6 or k/2==ell, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    # one work-group of one work-item -> a single (partial) subgroup
    n_subgroups = 1*div_ceil(1, SGS)

    dims = {"n": 512, "m": 256, "ell": 128}

    def count(dtype, op):
        return op_map[lp.Op(np.dtype(dtype), op, CG.SUBGROUP)].eval_with_dict(dims)

    nm = dims["n"]*dims["m"]
    # (count-per-sub-group)*n_subgroups
    assert count(np.float32, "mul") == nm*n_subgroups
    assert count(np.float64, "div") == 2*nm*n_subgroups  # TODO why?
    assert count(np.float64, "add") == nm*n_subgroups
    assert count(np.int32, "add") == nm*n_subgroups
def test_op_counter_basic():
    """Basic float/int op counting (legacy get_op_poly API)."""
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl,
            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    poly = get_op_poly(knl)
    dims = {"n": 512, "m": 256, "l": 128}

    def count(dtype, op):
        return poly[(np.dtype(dtype), op)].eval_with_dict(dims)

    nml = dims["n"]*dims["m"]*dims["l"]
    nm = dims["n"]*dims["m"]

    assert count(np.float32, "add") == count(np.float32, "mul") \
            == count(np.float32, "div") == nml
    assert count(np.float64, "mul") == nm
    assert count(np.int32, "add") == nm*2
def test_numba_target():
    """Smoke-test device-code generation for the plain Numba target."""
    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
            "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
            target=lp.NumbaTarget())

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    generated = lp.generate_code_v2(knl).device_code()
    print(generated)
def test_all_counters_parallel_matmul():
    """Exercise sync, op, and global-memory-access counters on a tiled
    matmul (legacy *_poly counter APIs)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")

    num_n, num_m, num_l = 512, 256, 128
    dims = {"n": num_n, "m": num_m, "l": num_l}

    # exactly one kind of synchronization: the single kernel launch
    sync_poly = lp.get_synchronization_poly(knl)
    assert len(sync_poly) == 1
    assert sync_poly["kernel_launch"].eval_with_dict(dims) == 1

    op_map = lp.get_op_poly(knl)
    f32mul = op_map[(np.dtype(np.float32), "mul")].eval_with_dict(dims)
    f32add = op_map[(np.dtype(np.float32), "add")].eval_with_dict(dims)
    i32ops = op_map[(np.dtype(np.int32), "add")].eval_with_dict(dims)
    i32ops += op_map[(np.dtype(np.int32), "mul")].eval_with_dict(dims)

    assert f32mul + f32add == num_n*num_m*num_l*2
    assert i32ops == num_n*num_m*num_l*4 + num_l*num_n*4

    access_map = lp.get_gmem_access_poly(knl)
    f32uncoal = access_map[
            (np.dtype(np.float32), "nonconsecutive", "load")
            ].eval_with_dict(dims)
    f32coal = access_map[
            (np.dtype(np.float32), "consecutive", "load")
            ].eval_with_dict(dims)
    # a is read non-consecutively, b consecutively; both n*m*l times
    assert f32uncoal == num_n*num_m*num_l
    assert f32coal == num_n*num_m*num_l

    f32coal = access_map[
            (np.dtype(np.float32), "consecutive", "store")
            ].eval_with_dict(dims)
    assert f32coal == num_n*num_l
def test_reg_counter_reduction():
    """Estimated register count for a serial matmul reduction kernel."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul_serial", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    assert estimate_regs_per_thread(knl) == 6
def test_auto_test_can_detect_problems(ctx_factory):
    """auto_test_vs_ref must reject a kernel that only writes the diagonal
    while the reference kernel fills the whole 2D array."""
    ctx = ctx_factory()

    ref_knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<n}",
            """
            a[i,j] = 25
            """)

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            a[i,i] = 25
            """)

    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(a=np.float32))
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    from loopy.diagnostic import AutomaticTestFailure
    with pytest.raises(AutomaticTestFailure):
        lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=123))
def test_gather_access_footprint():
    """Print the counted access footprint for each (variable, direction)
    pair of a matmul-plus-bias kernel (smoke test: must not raise)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i,j,k<n}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j]) + a[i,j]"],
            name="matmul", assumptions="n >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    from loopy.statistics import gather_access_footprints, count
    fp = gather_access_footprints(knl)

    # dict.items() behaves identically here on py2 and py3; the
    # six.iteritems shim was unnecessary.
    for key, footprint in fp.items():
        print(key, count(knl, footprint))
def test_gather_access_footprint_2():
    """Footprint of a strided store: for n=200, every access footprint of
    ``c[2*i] = a[i]`` covers exactly 200 entries."""
    # NOTE(review): kernel name "matmul" looks copy-pasted from another test
    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "c[2*i] = a[i]",
            name="matmul", assumptions="n >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    from loopy.statistics import gather_access_footprints, count
    fp = gather_access_footprints(knl)

    params = {"n": 200}
    for key, footprint in fp.items():
        # hoist the count: the original computed count(knl, footprint)
        # twice per iteration (once for the assert, once for the print)
        fp_count = count(knl, footprint)
        assert fp_count.eval_with_dict(params) == 200
        print(key, fp_count)
def test_reg_counter_logic():
    """Estimated register count for a kernel with an if()/logic expression."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    assert estimate_regs_per_thread(knl) == 6
def transform(knl, vars, stream_dtype):
    """Apply the standard streaming transform to *knl*.

    ``vars`` is a comma-separated string of argument names; each is given
    ``stream_dtype``. The i loop is split for group/local parallelism and
    the argument order is fixed to the named variables followed by "n".

    (The parameter name ``vars`` shadows the builtin but is kept for
    backward compatibility with keyword callers.)
    """
    var_names = [v.strip() for v in vars.split(",")]

    knl = lp.assume(knl, "n>0")
    knl = lp.split_iname(knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
    knl = lp.add_and_infer_dtypes(
            knl, {name: stream_dtype for name in var_names})
    knl = lp.set_argument_order(knl, var_names + ["n"])

    return knl
def test_cuda_short_vector():
    """Generate CUDA code using a vectorized inner iname and a vec axis."""
    kernel = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i] = 2*a[i]",
            target=lp.CudaTarget())

    kernel = lp.set_options(kernel, write_code=True)
    # split i by 4 and vectorize the inner part; reshape a/out to match
    kernel = lp.split_iname(kernel, "i", 4, slabs=(0, 1), inner_tag="vec")
    kernel = lp.split_array_axis(kernel, "a,out", axis_nr=0, count=4)
    kernel = lp.tag_array_axes(kernel, "a,out", "C,vec")

    kernel = lp.set_options(kernel, write_wrapper=True)
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    print(lp.generate_code_v2(kernel).device_code())
def test_variable_size_temporary():
    """Code generation must succeed in the presence of a prefetched,
    variable-length temporary array."""
    kernel = lp.make_kernel(
            ''' { [i,j]: 0<=i,j<n } ''',
            ''' out[i] = sum(j, a[i,j])''')
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    kernel = lp.add_prefetch(kernel, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    kernel = lp.preprocess_kernel(kernel)
    for scheduled in lp.generate_loop_schedules(kernel):
        lp.generate_code(scheduled)