def test_vectorize(ctx_factory):
    """Check a ``vec``-tagged kernel against the same kernel tagged ``unr``."""
    ctx = ctx_factory()

    base_knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    base_knl = lp.add_and_infer_dtypes(base_knl, dict(b=np.float32))
    base_knl = lp.set_array_dim_names(base_knl, "a,b", "i")
    base_knl = lp.split_array_dim(
        base_knl, [("a", 0), ("b", 0)], 4,
        split_kwargs=dict(slabs=(0, 1)))
    base_knl = lp.tag_data_axes(base_knl, "a,b", "c,vec")

    # The reference unrolls the inner iname, the kernel under test
    # vectorizes it; everything else is shared.
    ref_knl = lp.tag_inames(base_knl, {"i_inner": "unr"})
    vec_knl = lp.tag_inames(base_knl, {"i_inner": "vec"})

    vec_knl = lp.preprocess_kernel(vec_knl)
    vec_knl = lp.get_one_scheduled_kernel(vec_knl)
    code, inf = lp.generate_code(vec_knl)

    lp.auto_test_vs_ref(ref_knl, ctx, vec_knl, parameters=dict(n=30))
def test_ispc_streaming_stores():
    """Generate ISPC code for a stream-triad kernel tagged ``!streaming_store``."""
    stream_dtype = np.float32
    index_dtype = np.int32
    arg_names = ["a", "b", "c", "scalar"]

    triad = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = b[i] + scalar * c[i]",
        target=lp.ISPCTarget(),
        index_dtype=index_dtype,
        name="stream_triad")
    triad = lp.assume(triad, "n>0")

    # Two-level split: 2**18-sized groups, then 8-wide local lanes.
    triad = lp.split_iname(
        triad, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    triad = lp.split_iname(triad, "i_inner", 8, inner_tag="l.0")
    triad = lp.tag_instructions(triad, "!streaming_store")

    dtype_map = {}
    for arg_name in arg_names:
        dtype_map[arg_name] = stream_dtype
    triad = lp.add_and_infer_dtypes(triad, dtype_map)

    triad = lp.set_argument_order(triad, arg_names + ["n"])
    triad = lp.preprocess_kernel(triad)
    triad = lp.get_one_scheduled_kernel(triad)

    lp.generate_code_v2(triad).all_code()
def test_forced_iname_deps_and_reduction():
    """Regression test for https://github.com/inducer/loopy/issues/24."""
    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
    # under which the above bug was found. If assignees were phi[i], then the
    # iname propagation heuristic would not assume that dependent instructions
    # need to run inside of 'i', and hence the forced_iname_* bits below would
    # not be needed.
    phi_insn = lp.CInstruction(
        "i", "doSomethingToGetPhi();", assignees="phi")

    from pymbolic.primitives import Subscript, Variable
    phi_of_j = Subscript(Variable("phi"), Variable("j"))
    sum_insn = lp.Assignment(
        "a",
        lp.Reduction("sum", "j", phi_of_j),
        forced_iname_deps=frozenset(),
        forced_iname_deps_is_final=True)

    kernel_data = [
        lp.GlobalArg("a", dtype=np.float32, shape=()),
        lp.ValueArg("n", dtype=np.int32),
        lp.TemporaryVariable("phi", dtype=np.float32, shape=("n", )),
    ]
    k = lp.make_kernel(
        "{[i,j] : 0<=i,j<n}",
        [phi_insn, sum_insn],
        kernel_data,
        target=lp.CTarget(),
    )
    k = lp.preprocess_kernel(k)

    # The reduction update must not have been pulled inside the 'i' loop.
    update_inames = k.insn_inames("insn_0_j_update")
    assert 'i' not in update_inames
    print(k.stringify(with_dependencies=True))
def test_unschedulable_kernel_detection():
    """Exercise ``needs_iname_duplication`` and the duplication options."""
    small_knl = lp.make_kernel(
        ["{[i,j]:0<=i,j<n}"],
        """
        mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
        mat2[j] = mat2[j] + 1 {inames=j, id=i2}
        mat3[i] = mat3[i] + 1 {inames=i, id=i3}
        """)
    small_knl = lp.preprocess_kernel(small_knl)

    # Check that loopy can detect the unschedulability of the kernel
    assert lp.needs_iname_duplication(small_knl)
    option_count = len(list(lp.get_iname_duplication_options(small_knl)))
    assert option_count == 4

    # Applying any single offered option must make the kernel schedulable.
    for dup_inames, dup_insns in lp.get_iname_duplication_options(small_knl):
        repaired = lp.duplicate_inames(small_knl, dup_inames, dup_insns)
        assert not lp.needs_iname_duplication(repaired)

    big_knl = lp.make_kernel(
        ["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
        """
        mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
        mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
        mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
        mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
        """)

    assert lp.needs_iname_duplication(big_knl)
    big_option_count = len(list(lp.get_iname_duplication_options(big_knl)))
    assert big_option_count == 10
def test_forced_iname_deps_and_reduction():
    """Regression test for https://github.com/inducer/loopy/issues/24."""
    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
    # under which the above bug was found. If assignees were phi[i], then the
    # iname propagation heuristic would not assume that dependent instructions
    # need to run inside of 'i', and hence the forced_iname_* bits below would
    # not be needed.
    make_phi = lp.CInstruction(
        "i", "doSomethingToGetPhi();", assignees="phi")

    from pymbolic.primitives import Subscript, Variable
    reduction = lp.Reduction(
        "sum", "j", Subscript(Variable("phi"), Variable("j")))
    accumulate = lp.Assignment(
        "a", reduction,
        forced_iname_deps=frozenset(),
        forced_iname_deps_is_final=True)

    k = lp.make_kernel(
        "{[i,j] : 0<=i,j<n}",
        [make_phi, accumulate],
        [
            lp.GlobalArg("a", dtype=np.float32, shape=()),
            lp.ValueArg("n", dtype=np.int32),
            lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
        ],
        target=lp.CTarget(),
    )
    k = lp.preprocess_kernel(k)

    # The reduction update instruction has to stay outside of 'i'.
    assert 'i' not in k.insn_inames("insn_0_j_update")
    print(k.stringify(with_dependencies=True))
def test_divisibility_assumption(ctx_factory):
    """Check that no ``if`` is generated when ``n`` is assumed divisible by 16.

    The kernel is split by 16 under the assumption
    ``n>=1 and (exists zz: n = 16*zz)``; every generated schedule is checked
    for the absence of ``"if"`` in its code, and the result is compared
    against the unsplit kernel via :func:`lp.auto_test_vs_ref`.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "[n] -> {[i]: 0<=i<n}",
        [
            "b[i] = 2*a[i]"
        ],
        [
            lp.GlobalArg("a", np.float32, shape=("n",)),
            lp.GlobalArg("b", np.float32, shape=("n",)),
            lp.ValueArg("n", np.int32),
        ],
        assumptions="n>=1 and (exists zz: n = 16*zz)")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    for k in lp.generate_loop_schedules(knl):
        generated = lp.generate_code(k)
        # Other tests in this file unpack generate_code()'s result as a
        # (code, info) pair. On a tuple, `"if" not in ...` would test element
        # membership and pass vacuously, so look at the source text itself.
        code = generated[0] if isinstance(generated, tuple) else generated
        assert "if" not in code

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3})
def test_ilp_race_matmul(ctx_factory):
    """Expect a ``WriteRaceConditionWarning`` for an ILP-tagged matmul."""
    dtype = np.float32
    order = "C"
    n = 9

    kernel_args = [
        lp.ImageArg("a", dtype, shape=(n, n)),
        lp.ImageArg("b", dtype, shape=(n, n)),
        lp.GlobalArg("c", dtype, shape=(n, n), order=order),
    ]
    matmul = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<%d}" % n,
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        kernel_args,
        name="matmul")

    matmul = lp.split_iname(matmul, "j", 2, outer_tag="ilp", inner_tag="l.0")
    matmul = lp.split_iname(matmul, "k", 2)
    matmul = lp.add_prefetch(matmul, 'b', ["k_inner"])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings

        with catch_warnings(record=True) as warn_list:
            matmul = lp.preprocess_kernel(matmul)
            # Exhaust the generator so that all schedules get produced.
            list(lp.generate_loop_schedules(matmul))

            race_warnings = [
                w for w in warn_list
                if isinstance(w.message, WriteRaceConditionWarning)]
            assert race_warnings
def test_ilp_write_race_detection_global(ctx_factory):
    """Expect a ``WriteRaceConditionWarning`` when ``a[i]`` is written under ILP ``j``."""
    ctx = ctx_factory()

    insns = ["a[i] = 5+i+j"]
    kernel_args = [
        lp.GlobalArg("a", np.float32),
        lp.ValueArg("n", np.int32, approximately=1000),
    ]
    racy_knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i,j<n }",
        insns,
        kernel_args,
        assumptions="n>=1")

    racy_knl = lp.tag_inames(racy_knl, dict(j="ilp"))
    racy_knl = lp.preprocess_kernel(racy_knl, ctx.devices[0])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings

        with catch_warnings(record=True) as warn_list:
            # Exhaust the generator so that all schedules get produced.
            list(lp.generate_loop_schedules(racy_knl))

            race_warnings = [
                w for w in warn_list
                if isinstance(w.message, WriteRaceConditionWarning)]
            assert race_warnings
def test_vectorize(ctx_factory):
    """Check a ``vec``-tagged kernel against the same kernel tagged ``unr``."""
    ctx = ctx_factory()

    shared = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    shared = lp.add_and_infer_dtypes(shared, dict(b=np.float32))
    shared = lp.set_array_dim_names(shared, "a,b", "i")
    split_spec = [("a", 0), ("b", 0)]
    shared = lp.split_array_dim(
        shared, split_spec, 4, split_kwargs=dict(slabs=(0, 1)))
    shared = lp.tag_data_axes(shared, "a,b", "c,vec")

    # Only the tag on i_inner differs between reference and test kernel.
    ref_knl = lp.tag_inames(shared, {"i_inner": "unr"})
    knl = lp.tag_inames(shared, {"i_inner": "vec"})

    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=30))
def test_child_invalid_type_cast():
    """Expect ``lp.LoopyError`` when a ``make_uint2`` temporary is cast to int64."""
    from pymbolic import var

    # int64 cast of the uint2-valued temporary, shifted left by i
    shifted_cast = lp.TypeCast(np.int64, var("ctr")) << var("i")
    insns = [
        "<> ctr = make_uint2(0, 0)",
        lp.Assignment("a[i]", shifted_cast),
    ]
    knl = lp.make_kernel("{[i]: 0<=i<n}", insns)

    with pytest.raises(lp.LoopyError):
        knl = lp.preprocess_kernel(knl)
def test_child_invalid_type_cast():
    """Expect ``lp.LoopyError`` when a ``make_uint2`` temporary is cast to int64."""
    from pymbolic import var

    ctr = var("ctr")
    cast_and_shift = lp.TypeCast(np.int64, ctr) << var("i")
    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        [
            "<> ctr = make_uint2(0, 0)",
            lp.Assignment("a[i]", cast_and_shift),
        ])

    # Preprocessing is where the invalid cast is expected to be rejected.
    with pytest.raises(lp.LoopyError):
        knl = lp.preprocess_kernel(knl)
def test_nonsense_reduction(ctx_factory):
    """Expect ``RuntimeError`` when the reduction iname ``i`` also indexes the assignee."""
    import pytest

    ctx = ctx_factory()

    kernel_args = [lp.GlobalArg("a", np.float32, shape=(100, ))]
    bad_knl = lp.make_kernel(
        "{[i]: 0<=i<100}",
        """ a[i] = sum(i, 2) """,
        kernel_args)

    with pytest.raises(RuntimeError):
        bad_knl = lp.preprocess_kernel(bad_knl, ctx.devices[0])
def test_ilp_write_race_avoidance_local(ctx_factory):
    """Check that temporary ``a`` has shape ``(16, 17)`` in every schedule."""
    ctx = ctx_factory()

    insns = ["<> a[i] = 5+i+j"]
    ilp_knl = lp.make_kernel("{[i,j]: 0<=i<16 and 0<=j<17 }", insns, [])
    ilp_knl = lp.tag_inames(ilp_knl, dict(i="l.0", j="ilp"))
    ilp_knl = lp.preprocess_kernel(ilp_knl, ctx.devices[0])

    expected_shape = (16, 17)
    for sched_knl in lp.generate_loop_schedules(ilp_knl):
        assert sched_knl.temporary_variables["a"].shape == expected_shape
def test_ilp_write_race_avoidance_private(ctx_factory):
    """Check that scalar temporary ``a`` has shape ``(16,)`` in every schedule."""
    ctx = ctx_factory()

    insns = ["<> a = 5+j"]
    ilp_knl = lp.make_kernel("{[j]: 0<=j<16 }", insns, [])
    ilp_knl = lp.tag_inames(ilp_knl, dict(j="ilp"))
    ilp_knl = lp.preprocess_kernel(ilp_knl, ctx.devices[0])

    expected_shape = (16, )
    for sched_knl in lp.generate_loop_schedules(ilp_knl):
        assert sched_knl.temporary_variables["a"].shape == expected_shape
def test_variable_size_temporary():
    """Generate code for a kernel that prefetches all of ``a[:,:]`` with symbolic ``n``."""
    domain = ''' { [i,j]: 0<=i,j<n } '''
    insn = ''' out[i] = sum(j, a[i,j])'''

    row_sum = lp.make_kernel(domain, insn)
    row_sum = lp.add_and_infer_dtypes(row_sum, {"a": np.float32})
    row_sum = lp.add_prefetch(row_sum, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    row_sum = lp.preprocess_kernel(row_sum)
    for sched_knl in lp.generate_loop_schedules(row_sum):
        lp.generate_code(sched_knl)
def test_owed_barriers(ctx_factory):
    """Print the code of every schedule of a copy into an ``l.0``-indexed temporary."""
    ctx = ctx_factory()

    insns = ["<float32> z[i] = a[i]"]
    kernel_args = [lp.GlobalArg("a", np.float32, shape=(100, ))]
    copy_knl = lp.make_kernel("{[i]: 0<=i<100}", insns, kernel_args)

    copy_knl = lp.tag_inames(copy_knl, dict(i="l.0"))
    copy_knl = lp.preprocess_kernel(copy_knl, ctx.devices[0])

    for sched_knl in lp.generate_loop_schedules(copy_knl):
        compiled = lp.CompiledKernel(ctx, sched_knl)
        print(compiled.get_code())
def test_nonsense_reduction(ctx_factory):
    """Expect ``RuntimeError`` when the reduction iname ``i`` also indexes the assignee."""
    import pytest

    ctx = ctx_factory()

    nonsense = lp.make_kernel(
        "{[i]: 0<=i<100}",
        """ a[i] = sum(i, 2) """,
        [lp.GlobalArg("a", np.float32, shape=(100,))])

    # Preprocessing is the step expected to reject the kernel.
    with pytest.raises(RuntimeError):
        nonsense = lp.preprocess_kernel(nonsense, ctx.devices[0])
def test_variable_size_temporary():
    """Generate code for a kernel that prefetches all of ``a[:,:]`` with symbolic ``n``."""
    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' out[i] = sum(j, a[i,j])''')
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    knl = lp.add_prefetch(knl, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    preprocessed = lp.preprocess_kernel(knl)
    for scheduled in lp.generate_loop_schedules(preprocessed):
        lp.generate_code(scheduled)
def cached_data(params):
    """Build setup, preprocessed and scheduled kernels for each entry of *params*.

    Seeds NumPy's global RNG with 17 and configures logging at INFO level
    before doing any work.

    :returns: a dict mapping each parameter to a dict with the keys
        ``"setup"``, ``"instantiated"`` and ``"scheduled"``.
    """
    np.random.seed(17)
    logging.basicConfig(level=logging.INFO)

    data = {}
    for param in params:
        entry = data[param] = {}

        expn = _sumpy_kernel_init(param)
        entry["setup"] = expn

        instantiated = lp.preprocess_kernel(_sumpy_kernel_make(expn, param))
        entry["instantiated"] = instantiated

        # Schedule the entry-point kernel and put it back into the program.
        one_scheduled = lp.get_one_scheduled_kernel(
            instantiated["loopy_kernel"], instantiated.callables_table)
        entry["scheduled"] = instantiated.with_kernel(one_scheduled)

    return data
def test_simple_side_effect(ctx_factory):
    """Print every schedule and its code for an in-place ``a[i] = a[i] + 1``."""
    ctx = ctx_factory()

    kernel_args = [lp.GlobalArg("a", np.float32, shape=(100, ))]
    incr_knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<100}",
        """ a[i] = a[i] + 1 """,
        kernel_args)
    incr_knl = lp.preprocess_kernel(incr_knl, ctx.devices[0])

    for sched_knl in lp.generate_loop_schedules(incr_knl):
        print(sched_knl)
        compiled = lp.CompiledKernel(ctx, sched_knl)
        print(compiled.get_code())
def test_ilp_write_race_avoidance_private(ctx_factory):
    """Check that scalar temporary ``a`` has shape ``(16,)`` in every schedule."""
    ctx = ctx_factory()

    private_knl = lp.make_kernel(
        "{[j]: 0<=j<16 }",
        ["<> a = 5+j"],
        [])
    private_knl = lp.tag_inames(private_knl, dict(j="ilp"))
    private_knl = lp.preprocess_kernel(private_knl, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(private_knl):
        a_shape = scheduled.temporary_variables["a"].shape
        assert a_shape == (16,)
def test_ilp_write_race_avoidance_local(ctx_factory):
    """Check that temporary ``a`` has shape ``(16, 17)`` in every schedule."""
    ctx = ctx_factory()

    local_knl = lp.make_kernel(
        "{[i,j]: 0<=i<16 and 0<=j<17 }",
        ["<> a[i] = 5+i+j"],
        [])
    local_knl = lp.tag_inames(local_knl, dict(i="l.0", j="ilp"))
    local_knl = lp.preprocess_kernel(local_knl, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(local_knl):
        a_shape = scheduled.temporary_variables["a"].shape
        assert a_shape == (16, 17)
def test_cuda_target():
    """Print the code generated for a small kernel with ``CudaTarget``."""
    from loopy.target.cuda import CudaTarget

    kernel_args = [
        lp.GlobalArg("out,a", np.float32, shape=lp.auto),
        "...",
    ]
    cuda_knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        kernel_args,
        target=CudaTarget())

    cuda_knl = lp.split_iname(cuda_knl, "i", 8, inner_tag="l.0")
    cuda_knl = lp.split_iname(
        cuda_knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    cuda_knl = lp.add_prefetch(cuda_knl, "a", ["i_inner", "i_outer_inner"])

    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(cuda_knl))
    # generate_code's first result entry is what gets printed.
    print(lp.generate_code(scheduled)[0])
def test_multiple_writes_to_local_temporary():
    """Generate and print code for two instructions writing the same temporary."""
    # Loopy would previously only handle barrier insertion correctly if exactly
    # one instruction wrote to each local temporary. This tests that multiple
    # writes are OK.
    two_writes = lp.make_kernel(
        "{[i,e]: 0<=i<5 and 0<=e<nelements}",
        """
        <> temp[i, 0] = 17
        temp[i, 1] = 15
        """)
    two_writes = lp.tag_inames(two_writes, dict(i="l.0"))
    two_writes = lp.preprocess_kernel(two_writes)

    for scheduled in lp.generate_loop_schedules(two_writes):
        generated = lp.generate_code(scheduled)
        code, _ = generated
        print(code)
def test_empty_reduction(ctx_factory):
    """Run a sum over the empty domain ``0<=j<0`` and expect all zeros in ``a``."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    domains = ["{[i]: 0<=i<20}", "[i] -> {[j]: 0<=j<0}"]
    empty_sum = lp.make_kernel(
        domains,
        "a[i] = sum(j, j)",
    )

    empty_sum = lp.preprocess_kernel(empty_sum)
    print(empty_sum)

    empty_sum = lp.set_options(empty_sum, write_cl=True)
    evt, outputs = empty_sum(queue)
    (a, ) = outputs

    assert (a.get() == 0).all()
def test_multi_cse(ctx_factory):
    """Print the code of every schedule of a kernel that reads ``a[i]`` twice."""
    ctx = ctx_factory()

    insns = ["<float32> z[i] = a[i] + a[i]**2"]
    kernel_args = [lp.GlobalArg("a", np.float32, shape=(100, ))]
    cse_knl = lp.make_kernel(
        "{[i]: 0<=i<100}", insns, kernel_args,
        local_sizes={0: 16})

    cse_knl = lp.split_iname(cse_knl, "i", 16, inner_tag="l.0")
    cse_knl = lp.add_prefetch(cse_knl, "a", [])
    cse_knl = lp.preprocess_kernel(cse_knl, ctx.devices[0])

    for sched_knl in lp.generate_loop_schedules(cse_knl):
        compiled = lp.CompiledKernel(ctx, sched_knl)
        print(compiled.get_code())
def test_wg_too_small(ctx_factory):
    """Expect ``RuntimeError`` from ``get_code`` with ``local_sizes={0: 16}``.

    The ``l.0``-tagged iname ``i`` ranges over 100 entries while the kernel
    declares a local size of 16 along axis 0.
    """
    import pytest

    ctx = ctx_factory()

    insns = ["<float32> z[i] = a[i] {id=copy}"]
    kernel_args = [lp.GlobalArg("a", np.float32, shape=(100, ))]
    small_wg = lp.make_kernel(
        "{[i]: 0<=i<100}", insns, kernel_args,
        local_sizes={0: 16})

    small_wg = lp.tag_inames(small_wg, dict(i="l.0"))
    small_wg = lp.preprocess_kernel(small_wg, ctx.devices[0])

    for sched_knl in lp.generate_loop_schedules(small_wg):
        with pytest.raises(RuntimeError):
            lp.CompiledKernel(ctx, sched_knl).get_code()
def test_ispc_target(occa_mode=False):
    """Print device and host code generated with ``ISPCTarget``.

    :arg occa_mode: forwarded to ``ISPCTarget(occa_mode=...)``.
    """
    from loopy.target.ispc import ISPCTarget

    kernel_args = [
        lp.GlobalArg("out,a", np.float32, shape=lp.auto),
        "...",
    ]
    ispc_knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        kernel_args,
        target=ISPCTarget(occa_mode=occa_mode))

    ispc_knl = lp.split_iname(ispc_knl, "i", 8, inner_tag="l.0")
    ispc_knl = lp.split_iname(
        ispc_knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    ispc_knl = lp.add_prefetch(ispc_knl, "a", ["i_inner", "i_outer_inner"])

    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(ispc_knl))
    codegen_result = lp.generate_code_v2(scheduled)

    print(codegen_result.device_code())
    print(codegen_result.host_code())
def test_assume(ctx_factory):
    """Check that no ``if`` shows up in the code once ``n mod 16 = 0`` is assumed."""
    ctx = ctx_factory()

    kernel_args = [lp.GlobalArg("a", np.float32, shape="n"), "..."]
    incr_knl = lp.make_kernel(
        "{[i]: 0<=i<n}", "a[i] = a[i] + 1", kernel_args)

    incr_knl = lp.split_iname(incr_knl, "i", 16)
    incr_knl = lp.set_loop_priority(incr_knl, "i_outer,i_inner")

    # Assumptions are added one at a time, in this order.
    for assumption in ("n mod 16 = 0", "n > 10"):
        incr_knl = lp.assume(incr_knl, assumption)

    incr_knl = lp.preprocess_kernel(incr_knl, ctx.devices[0])

    for sched_knl in lp.generate_loop_schedules(incr_knl):
        print(sched_knl)
        compiled = lp.CompiledKernel(ctx, sched_knl)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
def test_eq_constraint(ctx_factory):
    """Print the generated code for a copy kernel whose iname is split twice by 16."""
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=(1000, )),
        lp.GlobalArg("b", np.float32, shape=(1000, )),
    ]
    copy_knl = lp.make_kernel(
        "{[i,j]: 0<= i,j < 32}", ["a[i] = b[i]"], kernel_args)

    copy_knl = lp.split_iname(copy_knl, "i", 16, outer_tag="g.0")
    copy_knl = lp.split_iname(
        copy_knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")

    copy_knl = lp.preprocess_kernel(copy_knl, ctx.devices[0])

    for sched_knl in lp.generate_loop_schedules(copy_knl):
        print(lp.generate_code(sched_knl))
def test_simple_side_effect(ctx_factory):
    """Print every schedule and its code for an in-place ``a[i] = a[i] + 1``."""
    ctx = ctx_factory()

    side_effect_knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<100}",
        """ a[i] = a[i] + 1 """,
        [lp.GlobalArg("a", np.float32, shape=(100,))])
    side_effect_knl = lp.preprocess_kernel(side_effect_knl, ctx.devices[0])

    schedules = lp.generate_loop_schedules(side_effect_knl)
    for scheduled in schedules:
        print(scheduled)
        print(lp.CompiledKernel(ctx, scheduled).get_code())
def test_owed_barriers(ctx_factory):
    """Print the code of every schedule of a copy into an ``l.0``-indexed temporary."""
    ctx = ctx_factory()

    barrier_knl = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i]"],
        [lp.GlobalArg("a", np.float32, shape=(100,))])

    barrier_knl = lp.tag_inames(barrier_knl, dict(i="l.0"))
    barrier_knl = lp.preprocess_kernel(barrier_knl, ctx.devices[0])

    schedules = lp.generate_loop_schedules(barrier_knl)
    for scheduled in schedules:
        print(lp.CompiledKernel(ctx, scheduled).get_code())
def test_type_inference_no_artificial_doubles(ctx_factory):
    """Check that ``double`` does not appear in code for an all-float32 kernel.

    The temporary ``bb`` has no declared type and is computed from two
    ``float32`` arrays; every generated schedule's code is searched for the
    substring ``"double"``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> bb = a[i] - b[i]
        c[i] = bb
        """,
        [
            lp.GlobalArg("a", np.float32, shape=("n", )),
            lp.GlobalArg("b", np.float32, shape=("n", )),
            lp.GlobalArg("c", np.float32, shape=("n", )),
            lp.ValueArg("n", np.int32),
        ],
        assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        generated = lp.generate_code(k)
        # Other tests in this file unpack generate_code()'s result as a
        # (code, info) pair. On a tuple, `"double" not in ...` would test
        # element membership and pass vacuously, so look at the source text.
        code = generated[0] if isinstance(generated, tuple) else generated
        assert "double" not in code
def test_generate_c_snippet():
    """Print the kernel body generated with ``CTarget`` for two sum reductions."""
    from functools import partial
    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    # Both instructions reduce over the same iname k.
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    insns = [
        lp.Assignment(f[I], l_sum(k, q_v[k, I]*u)),
        lp.Assignment(df[I], l_sum(k, q_v[k, I])),
    ]
    kernel_args = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]
    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        insns,
        kernel_args,
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:
        # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    print(lp.generate_body(knl))
def test_generate_c_snippet():
    """Print the kernel body generated with ``CTarget`` for two sum reductions."""
    from functools import partial
    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    # Both instructions reduce over the same iname k.
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    weighted = lp.Assignment(f[I], l_sum(k, q_v[k, I]*u))
    unweighted = lp.Assignment(df[I], l_sum(k, q_v[k, I]))

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        [weighted, unweighted],
        [
            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
            lp.ValueArg("u", np.float64),
            "...",
        ],
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:
        # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    print(lp.generate_body(knl))
def test_dependent_loop_bounds_3(ctx_factory):
    """Expect ``RuntimeError`` when scheduling with ``jj`` split onto g.1/l.1."""
    # The point of this test is that it shows a dependency between
    # domains that is exclusively mediated by the row_len temporary.
    # It also makes sure that row_len gets read before any
    # conditionals use it.
    import pytest

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    insns = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_args = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
        lp.ValueArg("n", np.int32),
    ]
    knl = lp.make_kernel(domains, insns, kernel_args)

    # The jj domain (index 1) must be nested in the i domain (index 0).
    assert knl.parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cknl = lp.CompiledKernel(ctx, knl)
    separator = "---------------------------------------------------"
    print(separator)
    print(cknl.get_highlighted_code())
    print(separator)

    knl_bad = lp.split_iname(
        knl, "jj", 128, outer_tag="g.1", inner_tag="l.1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(knl_bad))
def test_assume(ctx_factory):
    """Check that no ``if`` shows up in the code once ``n mod 16 = 0`` is assumed."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.set_loop_priority(knl, "i_outer,i_inner")
    knl = lp.assume(lp.assume(knl, "n mod 16 = 0"), "n > 10")
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    schedules = lp.generate_loop_schedules(knl)
    for scheduled in schedules:
        print(scheduled)
        compiled = lp.CompiledKernel(ctx, scheduled)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
def test_cuda_target():
    """Print the code generated for a small kernel with ``CudaTarget``."""
    from loopy.target.cuda import CudaTarget

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."],
        target=CudaTarget())

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    preprocessed = lp.preprocess_kernel(knl)
    scheduled = lp.get_one_scheduled_kernel(preprocessed)
    generated = lp.generate_code(scheduled)
    # Only the first entry of generate_code's result is printed.
    print(generated[0])
def test_wg_too_small(ctx_factory):
    """Expect ``RuntimeError`` from ``get_code`` with ``local_sizes={0: 16}``.

    The ``l.0``-tagged iname ``i`` ranges over 100 entries while the kernel
    declares a local size of 16 along axis 0.
    """
    import pytest

    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i] {id=copy}"],
        [lp.GlobalArg("a", np.float32, shape=(100,))],
        local_sizes={0: 16})

    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    schedules = lp.generate_loop_schedules(knl)
    for scheduled in schedules:
        with pytest.raises(RuntimeError):
            lp.CompiledKernel(ctx, scheduled).get_code()
def test_multi_cse(ctx_factory):
    """Print the code of every schedule of a kernel that reads ``a[i]`` twice."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i] + a[i]**2"],
        [lp.GlobalArg("a", np.float32, shape=(100,))],
        local_sizes={0: 16})

    knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", [])
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    schedules = lp.generate_loop_schedules(knl)
    for scheduled in schedules:
        print(lp.CompiledKernel(ctx, scheduled).get_code())
def test_dependent_loop_bounds_3(ctx_factory):
    """Expect ``RuntimeError`` when scheduling with ``jj`` split onto g.1/l.1."""
    # The point of this test is that it shows a dependency between
    # domains that is exclusively mediated by the row_len temporary.
    # It also makes sure that row_len gets read before any
    # conditionals use it.
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    insns = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_args = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
        lp.ValueArg("n", np.int32),
    ]
    knl = lp.make_kernel(domains, insns, kernel_args)

    # The jj domain (index 1) must be nested in the i domain (index 0).
    assert knl.parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cknl = lp.CompiledKernel(ctx, knl)
    separator = "---------------------------------------------------"
    print(separator)
    print(cknl.get_highlighted_code())
    print(separator)

    knl_bad = lp.split_iname(
        knl, "jj", 128, outer_tag="g.1", inner_tag="l.1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(knl_bad))
def test_type_inference_no_artificial_doubles(ctx_factory):
    """Check that ``double`` does not appear in code for an all-float32 kernel.

    The temporary ``bb`` has no declared type and is computed from two
    ``float32`` arrays; every generated schedule's code is searched for the
    substring ``"double"``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> bb = a[i] - b[i]
        c[i] = bb
        """,
        [
            lp.GlobalArg("a", np.float32, shape=("n",)),
            lp.GlobalArg("b", np.float32, shape=("n",)),
            lp.GlobalArg("c", np.float32, shape=("n",)),
            lp.ValueArg("n", np.int32),
        ],
        assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        generated = lp.generate_code(k)
        # Other tests in this file unpack generate_code()'s result as a
        # (code, info) pair. On a tuple, `"double" not in ...` would test
        # element membership and pass vacuously, so look at the source text.
        code = generated[0] if isinstance(generated, tuple) else generated
        assert "double" not in code
def test_ispc_target(occa_mode=False):
    """Print device and host code generated with ``ISPCTarget``.

    :arg occa_mode: forwarded to ``ISPCTarget(occa_mode=...)``.
    """
    from loopy.target.ispc import ISPCTarget

    target = ISPCTarget(occa_mode=occa_mode)
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."],
        target=target)

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    preprocessed = lp.preprocess_kernel(knl)
    scheduled = lp.get_one_scheduled_kernel(preprocessed)
    codegen_result = lp.generate_code_v2(scheduled)

    for emit in (codegen_result.device_code, codegen_result.host_code):
        print(emit())
def test_argmax(ctx_factory):
    """Compare the kernel's ``argmax`` of ``abs(a[i])`` with NumPy's result."""
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 10000

    argmax_knl = lp.make_kernel(
        "{[i]: 0<=i<%d}" % n,
        """ max_val, max_idx = argmax(i, abs(a[i]), i) """)

    argmax_knl = lp.add_and_infer_dtypes(argmax_knl, {"a": np.float32})
    print(lp.preprocess_kernel(argmax_knl))
    argmax_knl = lp.set_options(argmax_knl, write_cl=True, highlight_cl=True)

    a = np.random.randn(10000).astype(dtype)
    evt, outputs = argmax_knl(queue, a=a, out_host=True)
    (max_idx, max_val) = outputs

    abs_a = np.abs(a)
    assert max_val == np.max(abs_a)
    assert max_idx == np.where(abs_a == max_val)[-1]
def test_eq_constraint(ctx_factory):
    """Print the generated code for a copy kernel whose iname is split twice by 16."""
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<= i,j < 32}",
        ["a[i] = b[i]"],
        [
            lp.GlobalArg("a", np.float32, shape=(1000,)),
            lp.GlobalArg("b", np.float32, shape=(1000,)),
        ])

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    schedules = lp.generate_loop_schedules(knl)
    for scheduled in schedules:
        print(lp.generate_code(scheduled))
def test_argmax(ctx_factory):
    """Compare the kernel's ``argmax`` of ``fabs(a[i])`` with NumPy's result."""
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 10000

    argmax_knl = lp.make_kernel(
        "{[i]: 0<=i<%d}" % n,
        """ max_val, max_idx = argmax(i, fabs(a[i])) """)

    argmax_knl = lp.add_and_infer_dtypes(argmax_knl, {"a": np.float32})
    print(lp.preprocess_kernel(argmax_knl))
    argmax_knl = lp.set_options(argmax_knl, write_cl=True, highlight_cl=True)

    a = np.random.randn(10000).astype(dtype)
    evt, outputs = argmax_knl(queue, a=a, out_host=True)
    (max_idx, max_val) = outputs

    abs_a = np.abs(a)
    assert max_val == np.max(abs_a)
    assert max_idx == np.where(abs_a == max_val)[-1]
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Check a split, precomputed Philox-based sum against the untransformed kernel.

    Skipped when the PyOpenCL version is older than 2016.2.

    :arg size: value passed as parameter ``n`` to the comparison run.
    """
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    mc_knl = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        for i
            <> key = make_uint2(i, 324830944) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
        end
        z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
        """)

    # The reference is the untransformed kernel with only n's dtype fixed.
    ref_knl = lp.add_dtypes(mc_knl, {"n": np.int32})

    gsize = 128
    mc_knl = lp.split_iname(mc_knl, "i", gsize * 20)
    mc_knl = lp.split_iname(mc_knl, "i_inner", gsize, outer_tag="l.0")
    for red_iname in ("i_inner_inner", "i_inner_outer"):
        mc_knl = lp.split_reduction_inward(mc_knl, red_iname)

    from loopy.transform.data import reduction_arg_to_subst_rule
    mc_knl = reduction_arg_to_subst_rule(mc_knl, "i_outer")
    mc_knl = lp.precompute(
        mc_knl, "red_i_outer_arg", "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto")

    mc_knl = lp.preprocess_kernel(mc_knl)
    mc_knl = lp.add_dependency(
        mc_knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl, ctx, mc_knl, parameters={"n": size})
def test_ilp_write_race_detection_global(ctx_factory):
    """Expect a ``WriteRaceConditionWarning`` when ``a[i]`` is written under ILP ``j``."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i,j<n }",
        ["a[i] = 5+i+j"],
        [
            lp.GlobalArg("a", np.float32),
            lp.ValueArg("n", np.int32, approximately=1000),
        ],
        assumptions="n>=1")

    knl = lp.tag_inames(knl, dict(j="ilp"))
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings

        with catch_warnings(record=True) as warn_list:
            # Exhaust the generator so that all schedules get produced.
            list(lp.generate_loop_schedules(knl))

            saw_race_warning = False
            for w in warn_list:
                if isinstance(w.message, WriteRaceConditionWarning):
                    saw_race_warning = True
                    break
            assert saw_race_warning
def auto_test_vs_ref(ref_knl, ctx, test_knl=None, op_count=[], op_label=[],
        parameters={}, print_ref_code=False, print_code=True,
        warmup_rounds=2, dump_binary=False, fills_entire_output=None,
        do_check=True, check_result=None, max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg test_knl: the kernel under test. If *None*, *ref_knl* is used and
        result checking is turned off.
    :arg op_count: list of operation counts, paired with *op_label* to
        print rates. A single number is accepted with a warning.
    :arg op_label: list of labels for *op_count*. A single string is
        accepted with a warning.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple
        (:class:`bool`, message) indicating correctness/acceptability
        of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :returns: a dict with the keys ``elapsed_event``,
        ``elapsed_event_marker``, ``elapsed_wall`` and ``timing_rounds``,
        plus ``ref_elapsed_event`` and ``ref_elapsed_wall`` if checking
        was enabled.
    :raises LoopyError: if the argument lists of the two kernels differ in
        length, name or dtype, or if no device could run the reference.
    :raises AutomaticTestFailure: if *check_result* rejects an output.
    """
    # NOTE: the list/dict defaults above are never mutated in this function
    # (op_count/op_label are only rebound), so they are left as they are to
    # keep the signature unchanged.

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning,
                stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    from loopy.kernel.data import ImageArg
    need_ref_image_support = any(
            isinstance(arg, ImageArg) for arg in ref_knl.args)

    for dev in _enumerate_cl_devices_for_ref_test(
            blacklist_ref_vendors, need_ref_image_support):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
                ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first generated schedule is used for the reference.
        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("{} (ref): trying {} for the reference calculation".format(
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75 * "-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # Remember the failure and try the next candidate device.
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev,
                    75 * "-",
                    traceback.format_exc(),
                    75 * "-"
                    ]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("{} (ref): using {} for the reference calculation".format(
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END - ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n" + "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
            ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED,
            KernelState.LINEARIZED
            ]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel,
                kernel_info.implemented_data_info,
                queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % i)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75 * "-")

        # Log the name of the kernel under test; `knl` is merely the
        # variable left over from the reference scheduling loop above.
        logger.info("%s: run warmup" % (kernel.name))

        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    # Compare only the part both flattened arrays share.
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    # Results are checked during the first warmup round only.
                    need_check = False

        queue.finish()

        logger.info("%s: warmup done" % (kernel.name))

        logger.info("%s: timing run" % (kernel.name))

        timing_rounds = max(warmup_rounds, 1)

        while True:
            # Start every timing attempt with a fresh event list. Otherwise,
            # after timing_rounds has been quadrupled, the span from
            # events[0] to events[-1] would also cover the previous attempts
            # while being divided by the current round count only.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START
                            - 1e-9 * evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            # Repeat with four times as many rounds until one attempt takes
            # at least 0.3 s of wall time in total.
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (kernel.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " {:g} {}/s".format(cnt / elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall),
                        timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " {:g} {}/s".format(cnt / ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: {:g} s event, {:g} s wall{}".format(
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def gen_code(knl):
    """Return the generated device code and host code for *knl* as one string.

    The kernel is preprocessed and scheduled (one schedule is picked via
    ``lp.get_one_scheduled_kernel``) before code generation. The result is
    the device code, a newline, and then the host code.
    """
    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    result = lp.generate_code_v2(scheduled)

    device_part = result.device_code()
    host_part = result.host_code()
    return device_part + "\n" + host_part
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=None, op_label=None,
        parameters=None,
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=None):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg ref_knl: the kernel used for the reference computation.
    :arg ctx: the OpenCL context in which the test kernels are compiled
        and run.
    :arg test_knl: the kernel to test. If *None*, *ref_knl* is used and
        result checking is disabled.
    :arg op_count: a list of operation counts; each is divided by the
        elapsed time to print a rate. A bare number is accepted with a
        warning.
    :arg op_label: a list of labels matching *op_count*. A bare string is
        accepted with a warning.
    :arg parameters: a dict forwarded to ``make_ref_args`` and
        ``make_args``.
    :arg print_ref_code: print the reference code (unless *quiet*).
    :arg print_code: print the code of each tested kernel (unless *quiet*).
    :arg warmup_rounds: number of untimed runs before checking and timing.
        The first timing pass uses ``max(warmup_rounds, 1)`` rounds.
    :arg dump_binary: print the program binary (unless *quiet*).
    :arg fills_entire_output: deprecated; only triggers a warning.
    :arg do_check: whether to run the reference kernel and compare results.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :arg quiet: suppress printed output.
    :arg blacklist_ref_vendors: forwarded to
        ``_enumerate_cl_devices_for_ref_test``.

    :returns: a dict with the keys ``elapsed_event``,
        ``elapsed_event_marker``, ``elapsed_wall`` and ``timing_rounds``
        and, if checking was enabled, ``ref_elapsed_event`` and
        ``ref_elapsed_wall``.
    :raises LoopyError: if the argument lists of the two kernels disagree
        or no device is usable for the reference computation.
    :raises AutomaticTestFailure: if *check_result* rejects a result.
    """

    import pyopencl as cl
    from time import time

    # Create fresh containers per call rather than sharing mutable defaults.
    if op_count is None:
        op_count = []
    if op_label is None:
        op_label = []
    if parameters is None:
        parameters = {}
    if blacklist_ref_vendors is None:
        blacklist_ref_vendors = []

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

    from loopy.compiled import CompiledKernel, get_highlighted_cl_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2)

    def format_float_or_none(v):
        if v is None:
            return "<unavailable>"
        else:
            return "%g" % v

    # {{{ compile and run reference code

    from loopy.preprocess import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first generated schedule is used for the reference.
        for ref_sched_kernel in lp.generate_loop_schedules(pp_ref_knl):
            break

        logger.info("%s (ref): trying %s for the reference calculation",
                ref_knl.name, dev)

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_cl_code(ref_compiled.code))
            print(75*"-")

        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_cl_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # Remember the failure and try the next device.
                import traceback
                ref_errors.append("\n".join([
                    75*"-",
                    "On %s:" % dev,
                    75*"-",
                    traceback.format_exc(),
                    75*"-"]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation",
                ref_knl.name, dev)
        logger.info("%s (ref): run", ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done", ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    args = None
    from loopy.kernel import kernel_state
    if test_knl.state not in [
            kernel_state.PREPROCESSED,
            kernel_state.SCHEDULED]:
        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    for i, kernel in enumerate(test_kernels):
        if i >= max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        if args is None:
            # Arguments are built once and reused for every tested kernel.
            cl_kernel_info = compiled.cl_kernel_info(frozenset())

            args = make_args(kernel,
                    cl_kernel_info.implemented_data_info,
                    queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % i)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                print(type(compiled.cl_program))
                print(compiled.cl_program.binaries[0])
                print(75*"-")

        # Log under the name of the kernel being tested, not a leftover
        # variable from the reference section.
        logger.info("%s: run warmup", kernel.name)

        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

        if need_check and not AUTO_TEST_SKIP_RUN:
            from pyopencl.compyte.array import as_strided

            for arg_desc in ref_arg_data:
                if arg_desc is None:
                    continue
                if not arg_desc.needs_checking:
                    continue

                ref_ary = as_strided(
                        arg_desc.ref_storage_array.get(),
                        shape=arg_desc.ref_shape,
                        strides=arg_desc.ref_numpy_strides).flatten()
                test_ary = as_strided(
                        arg_desc.test_storage_array.get(),
                        shape=arg_desc.test_shape,
                        strides=arg_desc.test_numpy_strides).flatten()

                # Compare only the part both flattened arrays have in common.
                common_len = min(len(ref_ary), len(test_ary))
                ref_ary = ref_ary[:common_len]
                test_ary = test_ary[:common_len]

                error_is_small, error = check_result(test_ary, ref_ary)
                if not error_is_small:
                    raise AutomaticTestFailure(error)

                need_check = False

        queue.finish()

        logger.info("%s: warmup done", kernel.name)

        logger.info("%s: timing run", kernel.name)

        # At least one round is needed: the event-based timing below
        # indexes into the recorded events and divides by the round count.
        timing_rounds = max(warmup_rounds, 1)

        while True:
            # Start each pass with a fresh event list so that the measured
            # event span matches the round count it is divided by.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            # Repeat with more rounds until a pass takes at least 0.3 s of
            # wall time in total.
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done", kernel.name)

        rates = "".join(
                " %g %s/s" % (cnt/elapsed_wall, lbl)
                for cnt, lbl in zip(op_count, op_label))

        if not quiet:
            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall),
                        timing_rounds, rates))

        if do_check:
            ref_rates = "".join(
                    " %g %s/s" % (cnt/ref_elapsed_event, lbl)
                    for cnt, lbl in zip(op_count, op_label))
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def main():
    """Run the stand-alone loopy command-line frontend.

    Parses the command line, reads the input (a file, or standard input
    when the input file is ``-``), builds one or more kernels from loopy
    or Fortran source, generates code for each kernel and writes the
    joined code to the output file (or standard output when it is ``-``).

    :raises ValueError: if the target name is not recognized.
    :raises RuntimeError: if the input language cannot be deduced or is
        unknown, if loopy-language input does not define ``lp_knl``, or if
        no kernels remain after filtering by name.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Stand-alone loopy frontend")
    parser.add_argument("infile", metavar="INPUT_FILE")
    parser.add_argument("outfile", default="-", metavar="OUTPUT_FILE",
            help="Defaults to stdout ('-').", nargs="?")
    parser.add_argument("--lang", metavar="LANGUAGE", help="loopy|fortran")
    parser.add_argument("--target", choices=("opencl", "ispc", "ispc-occa", "c",
        "c-fortran", "cuda"), default="opencl")
    parser.add_argument("--name")
    parser.add_argument("--transform")
    parser.add_argument("--edit-code", action="store_true")
    parser.add_argument("--occa-defines")
    parser.add_argument("--occa-add-dummy-arg", action="store_true")
    parser.add_argument("--print-ir", action="store_true")
    args = parser.parse_args()

    if args.target == "opencl":
        from loopy.target.opencl import OpenCLTarget
        target = OpenCLTarget()
    elif args.target == "ispc":
        from loopy.target.ispc import ISPCTarget
        target = ISPCTarget()
    elif args.target == "ispc-occa":
        from loopy.target.ispc import ISPCTarget
        target = ISPCTarget(occa_mode=True)
    elif args.target == "c":
        from loopy.target.c import CTarget
        target = CTarget()
    elif args.target == "c-fortran":
        from loopy.target.c import CTarget
        target = CTarget(fortran_abi=True)
    elif args.target == "cuda":
        from loopy.target.cuda import CudaTarget
        target = CudaTarget()
    else:
        # 'target' is not bound in this branch; report the requested name.
        raise ValueError("unknown target: %s" % args.target)

    lp.set_default_target(target)

    lang = None
    if args.infile == "-":
        infile_content = sys.stdin.read()
    else:
        from os.path import splitext
        _, ext = splitext(args.infile)

        lang = {
                ".py": "loopy",
                ".loopy": "loopy",
                ".floopy": "fortran",
                ".f90": "fortran",
                ".fpp": "fortran",
                ".f": "fortran",
                ".f77": "fortran",
                }.get(ext)
        with open(args.infile, "r") as infile_fd:
            infile_content = infile_fd.read()

    # An explicit --lang overrides the language deduced from the extension.
    if args.lang is not None:
        lang = args.lang

    if lang is None:
        raise RuntimeError("unable to deduce input language "
                "(wrong input file extension? --lang flag?)")

    if lang == "loopy":
        # {{{ path wrangling

        from os.path import dirname, abspath
        from os import getcwd

        infile_dirname = dirname(args.infile)
        if infile_dirname:
            infile_dirname = abspath(infile_dirname)
        else:
            infile_dirname = getcwd()

        sys.path.append(infile_dirname)

        # }}}

        data_dic = {}
        data_dic["lp"] = lp
        data_dic["np"] = np

        # NOTE: the defines, input and transform files are executed as
        # Python code; only use this frontend on trusted input.
        if args.occa_defines:
            with open(args.occa_defines, "r") as defines_fd:
                occa_define_code = defines_to_python_code(defines_fd.read())
            exec(compile(occa_define_code, args.occa_defines, "exec"), data_dic)

        # The input was already read above; re-opening it here would fail
        # when reading from stdin ("-").
        exec(compile(infile_content, args.infile, "exec"), data_dic)

        if args.transform:
            with open(args.transform, "r") as xform_fd:
                exec(compile(xform_fd.read(), args.transform, "exec"), data_dic)

        try:
            kernel = data_dic["lp_knl"]
        except KeyError:
            raise RuntimeError("loopy-lang requires 'lp_knl' "
                    "to be defined on exit")

        if args.name is not None:
            kernel = kernel.copy(name=args.name)

        kernels = [kernel]

    elif lang in ["fortran", "floopy", "fpp"]:
        pre_transform_code = None
        if args.transform:
            with open(args.transform, "r") as xform_fd:
                pre_transform_code = xform_fd.read()

        if args.occa_defines:
            if pre_transform_code is None:
                pre_transform_code = ""

            # The defines are placed ahead of the transform code.
            with open(args.occa_defines, "r") as defines_fd:
                pre_transform_code = (
                        defines_to_python_code(defines_fd.read())
                        + pre_transform_code)

        kernels = lp.parse_transformed_fortran(
                infile_content, pre_transform_code=pre_transform_code,
                filename=args.infile
                )

        if args.name is not None:
            kernels = [kernel for kernel in kernels
                    if kernel.name == args.name]

        if not kernels:
            raise RuntimeError("no kernels found (name specified: %s)"
                    % args.name)

    else:
        raise RuntimeError("unknown language: '%s'" % lang)

    if args.print_ir:
        for kernel in kernels:
            print(kernel, file=sys.stderr)

    if args.occa_add_dummy_arg:
        new_kernels = []
        for kernel in kernels:
            new_args = [
                    lp.GlobalArg("occa_info", np.int32, shape=None)
                    ] + kernel.args
            new_kernels.append(kernel.copy(args=new_args))

        kernels = new_kernels
        del new_kernels

    codes = []
    from loopy.codegen import generate_code
    for kernel in kernels:
        kernel = lp.preprocess_kernel(kernel)
        # The second element of the result is not needed here.
        code, _ = generate_code(kernel)
        codes.append(code)

    if args.outfile is not None:
        outfile = args.outfile
    else:
        outfile = "-"

    code = "\n\n".join(codes)

    # {{{ edit code if requested

    import os
    edit_kernel_env = os.environ.get("LOOPY_EDIT_KERNEL")
    need_edit = args.edit_code
    if not need_edit and edit_kernel_env is not None:
        # Do not replace with "any()"--Py2.6/2.7 bug doesn't like
        # comprehensions in functions with exec().

        for k in kernels:
            if edit_kernel_env.lower() in k.name.lower():
                need_edit = True

    if need_edit:
        from pytools import invoke_editor
        code = invoke_editor(code, filename="edit.cl")

    # }}}

    if outfile == "-":
        sys.stdout.write(code)
    else:
        with open(outfile, "w") as outfile_fd:
            outfile_fd.write(code)
def time_instantiate(self, data, param):
    """Build the kernel for *param* and preprocess it.

    The kernel is created by ``_sumpy_kernel_make`` from the ``"setup"``
    entry stored under ``data[param]``. The preprocessed kernel is
    discarded and nothing is returned.
    """
    setup = data[param]["setup"]
    lp.preprocess_kernel(_sumpy_kernel_make(setup, param))
def main():
    """Run the stand-alone loopy command-line frontend.

    Parses the command line, reads the input (a file, or standard input
    when the input file is ``-``), builds one or more kernels from loopy
    or Fortran source, generates code for each kernel and writes the
    joined code to the output file (or standard output when it is ``-``).

    :raises ValueError: if the target name is not recognized.
    :raises RuntimeError: if the input language cannot be deduced or is
        unknown, if loopy-language input does not define ``lp_knl``, or if
        no kernels remain after filtering by name.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Stand-alone loopy frontend")
    parser.add_argument("infile", metavar="INPUT_FILE")
    parser.add_argument("outfile", default="-", metavar="OUTPUT_FILE",
            help="Defaults to stdout ('-').", nargs="?")
    parser.add_argument("--lang", metavar="LANGUAGE", help="loopy|fortran")
    parser.add_argument("--target", choices=("opencl", "ispc", "ispc-occa", "c",
        "c-fortran", "cuda"), default="opencl")
    parser.add_argument("--name")
    parser.add_argument("--transform")
    parser.add_argument("--edit-code", action="store_true")
    parser.add_argument("--occa-defines")
    parser.add_argument("--occa-add-dummy-arg", action="store_true")
    parser.add_argument("--print-ir", action="store_true")
    args = parser.parse_args()

    if args.target == "opencl":
        from loopy.target.opencl import OpenCLTarget
        target = OpenCLTarget()
    elif args.target == "ispc":
        from loopy.target.ispc import ISPCTarget
        target = ISPCTarget()
    elif args.target == "ispc-occa":
        from loopy.target.ispc import ISPCTarget
        target = ISPCTarget(occa_mode=True)
    elif args.target == "c":
        from loopy.target.c import CTarget
        target = CTarget()
    elif args.target == "c-fortran":
        from loopy.target.c import CTarget
        target = CTarget(fortran_abi=True)
    elif args.target == "cuda":
        from loopy.target.cuda import CudaTarget
        target = CudaTarget()
    else:
        # 'target' is not bound in this branch; report the requested name.
        raise ValueError("unknown target: %s" % args.target)

    lp.set_default_target(target)

    lang = None
    if args.infile == "-":
        infile_content = sys.stdin.read()
    else:
        from os.path import splitext
        _, ext = splitext(args.infile)

        lang = {
                ".py": "loopy",
                ".loopy": "loopy",
                ".floopy": "fortran",
                ".f90": "fortran",
                ".fpp": "fortran",
                ".f": "fortran",
                ".f77": "fortran",
                }.get(ext)
        with open(args.infile) as infile_fd:
            infile_content = infile_fd.read()

    # An explicit --lang overrides the language deduced from the extension.
    if args.lang is not None:
        lang = args.lang

    if lang is None:
        raise RuntimeError("unable to deduce input language "
                "(wrong input file extension? --lang flag?)")

    if lang == "loopy":
        # {{{ path wrangling

        from os.path import dirname, abspath
        from os import getcwd

        infile_dirname = dirname(args.infile)
        if infile_dirname:
            infile_dirname = abspath(infile_dirname)
        else:
            infile_dirname = getcwd()

        sys.path.append(infile_dirname)

        # }}}

        data_dic = {}
        data_dic["lp"] = lp
        data_dic["np"] = np

        # NOTE: the defines, input and transform files are executed as
        # Python code; only use this frontend on trusted input.
        if args.occa_defines:
            with open(args.occa_defines) as defines_fd:
                occa_define_code = defines_to_python_code(defines_fd.read())
            exec(compile(occa_define_code, args.occa_defines, "exec"), data_dic)

        # The input was already read above; re-opening it here would fail
        # when reading from stdin ("-").
        exec(compile(infile_content, args.infile, "exec"), data_dic)

        if args.transform:
            with open(args.transform) as xform_fd:
                exec(compile(xform_fd.read(), args.transform, "exec"), data_dic)

        try:
            kernel = data_dic["lp_knl"]
        except KeyError:
            raise RuntimeError("loopy-lang requires 'lp_knl' "
                    "to be defined on exit")

        if args.name is not None:
            kernel = kernel.copy(name=args.name)

        kernels = [kernel]

    elif lang in ["fortran", "floopy", "fpp"]:
        pre_transform_code = None
        if args.transform:
            with open(args.transform) as xform_fd:
                pre_transform_code = xform_fd.read()

        if args.occa_defines:
            if pre_transform_code is None:
                pre_transform_code = ""

            # The defines are placed ahead of the transform code.
            with open(args.occa_defines) as defines_fd:
                pre_transform_code = (
                        defines_to_python_code(defines_fd.read())
                        + pre_transform_code)

        kernels = lp.parse_transformed_fortran(
                infile_content, pre_transform_code=pre_transform_code,
                filename=args.infile)

        if args.name is not None:
            kernels = [
                    kernel for kernel in kernels
                    if kernel.name == args.name
                    ]

        if not kernels:
            raise RuntimeError("no kernels found (name specified: %s)"
                    % args.name)

    else:
        raise RuntimeError("unknown language: '%s'" % lang)

    if args.print_ir:
        for kernel in kernels:
            print(kernel, file=sys.stderr)

    if args.occa_add_dummy_arg:
        new_kernels = []
        for kernel in kernels:
            new_args = [
                    lp.ArrayArg("occa_info", np.int32, shape=None)
                    ] + kernel.args
            new_kernels.append(kernel.copy(args=new_args))

        kernels = new_kernels
        del new_kernels

    codes = []
    from loopy.codegen import generate_code
    for kernel in kernels:
        kernel = lp.preprocess_kernel(kernel)
        # The second element of the result is not needed here.
        code, _ = generate_code(kernel)
        codes.append(code)

    if args.outfile is not None:
        outfile = args.outfile
    else:
        outfile = "-"

    code = "\n\n".join(codes)

    # {{{ edit code if requested

    import os
    edit_kernel_env = os.environ.get("LOOPY_EDIT_KERNEL")
    need_edit = args.edit_code
    if not need_edit and edit_kernel_env is not None:
        # Do not replace with "any()"--Py2.6/2.7 bug doesn't like
        # comprehensions in functions with exec().

        for k in kernels:
            if edit_kernel_env.lower() in k.name.lower():
                need_edit = True

    if need_edit:
        from pytools import invoke_editor
        code = invoke_editor(code, filename="edit.cl")

    # }}}

    if outfile == "-":
        sys.stdout.write(code)
    else:
        with open(outfile, "w") as outfile_fd:
            outfile_fd.write(code)