def test_vectorize(ctx_factory):
    """Compare a ``vec``-tagged kernel against an unrolled reference.

    A two-instruction kernel (``temp = 2*b[i]``, ``a[i] = temp``) has the
    single axis of ``a`` and ``b`` split by 4.  The reference kernel tags
    ``i_inner`` as ``unr``; the kernel under test tags it as ``vec``.  The
    kernel under test is preprocessed, scheduled and run through code
    generation, then both are handed to ``lp.auto_test_vs_ref`` with ``n=30``.

    :arg ctx_factory: zero-argument callable returning the context that is
        passed to ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    base = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    base = lp.add_and_infer_dtypes(base, dict(b=np.float32))
    base = lp.set_array_dim_names(base, "a,b", "i")
    base = lp.split_array_dim(
        base, [("a", 0), ("b", 0)], 4,
        split_kwargs=dict(slabs=(0, 1)))
    base = lp.tag_data_axes(base, "a,b", "c,vec")

    # Both variants share everything up to here and differ only in how the
    # inner iname is tagged.
    ref_knl = lp.tag_inames(base, {"i_inner": "unr"})
    knl = lp.tag_inames(base, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    # The generated code is not inspected; the call only has to succeed.
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=30))
def test_vectorize(ctx_factory):
    """Check that vectorizing ``i_inner`` matches the unrolled reference.

    Builds one kernel, splits the data axis of ``a`` and ``b`` by 4, and then
    forks it: the reference tags ``i_inner`` as ``unr``, the tested kernel
    tags it as ``vec``.  The tested kernel goes through preprocessing,
    scheduling and code generation before ``lp.auto_test_vs_ref`` compares
    the two for ``n=30``.

    :arg ctx_factory: callable (no arguments) producing the context used by
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    shared_knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    shared_knl = lp.add_and_infer_dtypes(shared_knl, {"b": np.float32})
    shared_knl = lp.set_array_dim_names(shared_knl, "a,b", "i")

    axes_to_split = [("a", 0), ("b", 0)]
    shared_knl = lp.split_array_dim(
        shared_knl, axes_to_split, 4,
        split_kwargs={"slabs": (0, 1)})
    shared_knl = lp.tag_data_axes(shared_knl, "a,b", "c,vec")

    # Reference: inner loop unrolled.  Under test: inner loop vectorized.
    ref_knl = lp.tag_inames(shared_knl, {"i_inner": "unr"})
    knl = lp.tag_inames(shared_knl, {"i_inner": "vec"})

    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    # Code generation is exercised for its own sake; the result is unused.
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 30})
def test_ispc_streaming_stores():
    """Generate ISPC code for a stream-triad kernel tagged for streaming stores.

    The kernel ``a[i] = b[i] + scalar * c[i]`` is split twice (group axis of
    size ``2**18``, then a local axis of size 8), every instruction is tagged
    ``!streaming_store``, and the result is preprocessed, scheduled and run
    through ``lp.generate_code_v2``.  The test only checks that this pipeline
    completes; the generated code is not inspected.
    """
    stream_dtype = np.float32
    index_dtype = np.int32

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = b[i] + scalar * c[i]",
        target=lp.ISPCTarget(),
        index_dtype=index_dtype,
        name="stream_triad")

    arg_names = ["a", "b", "c", "scalar"]

    knl = lp.assume(knl, "n>0")
    knl = lp.split_iname(knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
    knl = lp.tag_instructions(knl, "!streaming_store")

    # Every array and the scalar share the same stream dtype.
    knl = lp.add_and_infer_dtypes(
        knl, dict.fromkeys(arg_names, stream_dtype))
    knl = lp.set_argument_order(knl, arg_names + ["n"])

    scheduled_knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    lp.generate_code_v2(scheduled_knl).all_code()
def cached_data(params):
    """Build the setup, instantiated and scheduled kernel for each parameter.

    Seeds NumPy's global random generator with 17 and configures logging at
    ``INFO`` level before doing any work.

    :arg params: iterable of hashable parameter values; each one is passed to
        ``_sumpy_kernel_init`` and ``_sumpy_kernel_make``.
    :return: dict mapping each parameter to a dict with the keys ``"setup"``
        (result of ``_sumpy_kernel_init``), ``"instantiated"`` (the
        preprocessed kernel) and ``"scheduled"`` (the same kernel with its
        ``"loopy_kernel"`` entry replaced by a scheduled version).
    """
    np.random.seed(17)
    logging.basicConfig(level=logging.INFO)

    data = {}
    for param in params:
        entry = data[param] = {}

        expn = _sumpy_kernel_init(param)
        entry["setup"] = expn

        instantiated = lp.preprocess_kernel(_sumpy_kernel_make(expn, param))
        entry["instantiated"] = instantiated

        scheduled_inner = lp.get_one_scheduled_kernel(
            instantiated["loopy_kernel"], instantiated.callables_table)
        entry["scheduled"] = instantiated.with_kernel(scheduled_inner)

    return data
def get_barrier_poly(knl):
    """Count the number of barriers each thread encounters in a loopy kernel.

    The kernel is type-inferred, preprocessed and scheduled.  The schedule is
    then walked while tracking the stack of enclosing loop inames; every
    barrier contributes the point count of the domain of the loops that
    enclose it, or 1 if it sits outside all loops.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be
        counted.

    :return: An :class:`islpy.PwQPolynomial` holding the number of barrier
        calls made.

    Example usage::

        # (first create loopy kernel and specify array data types)

        barrier_poly = get_barrier_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = barrier_poly.eval_with_dict(params)

        # (now use this count to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    # Stack of the inames of the loops currently open in the schedule walk.
    iname_list = []
    barrier_poly = isl.PwQPolynomial('{ 0 }')

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()
        elif isinstance(sched_item, Barrier):
            if iname_list:  # (if iname_list is not empty)
                # Restrict the domain to the enclosing inames and count its
                # points.  This used to be wrapped in a one-element tuple and
                # folded with reduce(mul, ...), which just returns the single
                # element.
                enclosing_domain = knl.get_inames_domain(
                    iname_list).project_out_except(
                        iname_list, [dim_type.set])
                barrier_poly += count(knl, enclosing_domain)
            else:
                # Barrier outside every loop: encountered exactly once.
                barrier_poly += isl.PwQPolynomial('{ 1 }')

    return barrier_poly
def test_cuda_target():
    """Print the code generated for a small prefetching kernel on the CUDA target.

    The kernel ``out[i] = 2*a[i]`` is split into a local axis of size 8 and a
    group/ILP pair of size 4, ``a`` is prefetched over ``i_inner`` and
    ``i_outer_inner``, and element 0 of ``lp.generate_code``'s result is
    printed.
    """
    from loopy.target.cuda import CudaTarget

    kernel_args = [
        lp.GlobalArg("out,a", np.float32, shape=lp.auto),
        "...",
    ]
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        kernel_args,
        target=CudaTarget())

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    scheduled_knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    generated = lp.generate_code(scheduled_knl)
    print(generated[0])
def estimate_regs_per_thread(knl):
    """Estimate registers per thread usage by a loopy kernel.

    The kernel is type-inferred, preprocessed and scheduled.  The schedule is
    walked with one running total and one fresh ``RegisterUsageEstimator``
    per open loop nesting level.  Each instruction adds the estimator's value
    for its assignee and its expression to the innermost total.  The result
    is the largest total seen for any block.

    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be
        estimated.

    :return: An :class:`integer` holding an estimate for the number of
        registers used per thread. This number will most likely be too low,
        but will hopefully be consistently too low by the same constant
        factor.
    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction  # noqa

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    peak = 0
    # One running total per open nesting level; index 0 is the outermost block.
    totals = [0]
    # Parallel stack of estimators tracking previously used iname+index
    # combinations for each nesting level.
    estimators = [RegisterUsageEstimator(knl)]

    for item in knl.schedule:
        if isinstance(item, EnterLoop):
            # New block: fresh total and fresh estimator.
            totals.append(0)
            estimators.append(RegisterUsageEstimator(knl))
        elif isinstance(item, LeaveLoop):
            # Record the finished block's total, then resume the enclosing one.
            peak = max(peak, totals.pop())
            estimators.pop()
        elif isinstance(item, RunInstruction):
            insn = knl.id_to_insn[item.insn_id]
            estimator = estimators[-1]
            totals[-1] += estimator(insn.assignee) + estimator(insn.expression)

    # The outermost block is never closed by a LeaveLoop; account for it here.
    return max(peak, totals[-1])
def test_ispc_target(occa_mode=False):
    """Print device and host code generated for a small kernel on the ISPC target.

    The kernel ``out[i] = 2*a[i]`` is split into a local axis of size 8 and a
    group/ILP pair of size 4, and ``a`` is prefetched over ``i_inner`` and
    ``i_outer_inner`` before code generation.

    :arg occa_mode: forwarded unchanged to ``ISPCTarget(occa_mode=...)``.
    """
    from loopy.target.ispc import ISPCTarget

    kernel_args = [
        lp.GlobalArg("out,a", np.float32, shape=lp.auto),
        "...",
    ]
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        kernel_args,
        target=ISPCTarget(occa_mode=occa_mode))

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    preprocessed = lp.preprocess_kernel(knl)
    scheduled = lp.get_one_scheduled_kernel(preprocessed)
    codegen_result = lp.generate_code_v2(scheduled)

    print(codegen_result.device_code())
    print(codegen_result.host_code())
def test_generate_c_snippet():
    """Print the C body generated for a two-reduction kernel.

    The kernel computes ``f[I] = sum(k, q_v[k, I]*u)`` and
    ``df[I] = sum(k, q_v[k, I])`` on the plain C target, splits ``k`` by 4
    with the inner part unrolled, prioritizes the loops as
    ``I, k_outer, k_inner``, and prints ``lp.generate_body`` of the scheduled
    kernel.
    """
    from functools import partial

    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)
    Instr = lp.Assignment  # noqa

    instructions = [
        Instr(f[I], l_sum(k, q_v[k, I]*u)),
        Instr(df[I], l_sum(k, q_v[k, I])),
    ]
    kernel_args = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        instructions,
        kernel_args,
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:
        # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    scheduled_knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    print(lp.generate_body(scheduled_knl))
def test_generate_c_snippet():
    """Print the C body generated for a two-reduction kernel.

    Two assignments, ``f[I] = sum(k, q_v[k, I]*u)`` and
    ``df[I] = sum(k, q_v[k, I])``, are built for the plain C target.  ``k``
    is split by 4 with the inner part unrolled, the loop priority is set to
    ``I, k_outer, k_inner``, and ``lp.generate_body`` of the scheduled kernel
    is printed.
    """
    from functools import partial

    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)
    Instr = lp.Assignment  # noqa

    reduction_insns = [
        Instr(f[I], l_sum(k, q_v[k, I]*u)),
        Instr(df[I], l_sum(k, q_v[k, I])),
    ]
    arg_decls = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        reduction_insns,
        arg_decls,
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:
        # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    # NOTE(review): newer loopy releases appear to spell this
    # lp.prioritize_loops -- confirm against the loopy version in use.
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
def test_cuda_target():
    """Run the CUDA target end to end on a tiny kernel and print the code.

    Builds ``out[i] = 2*a[i]``, applies two iname splits (local axis of 8,
    then group/ILP split of 4), adds a prefetch of ``a`` over ``i_inner`` and
    ``i_outer_inner``, and prints the first element returned by
    ``lp.generate_code`` for the preprocessed, scheduled kernel.
    """
    from loopy.target.cuda import CudaTarget

    arg_decls = [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."]
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        arg_decls,
        target=CudaTarget())

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    # Only the first element of generate_code's result is printed.
    print(lp.generate_code(knl)[0])
def test_ispc_target(occa_mode=False):
    """Run the ISPC target end to end on a tiny kernel and print its code.

    Builds ``out[i] = 2*a[i]``, applies two iname splits (local axis of 8,
    then group/ILP split of 4), adds a prefetch of ``a`` over ``i_inner`` and
    ``i_outer_inner``, then prints the device code followed by the host code.

    :arg occa_mode: passed through as ``ISPCTarget(occa_mode=occa_mode)``.
    """
    from loopy.target.ispc import ISPCTarget

    arg_decls = [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."]
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        arg_decls,
        target=ISPCTarget(occa_mode=occa_mode))

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    codegen_result = lp.generate_code_v2(knl)

    device_src = codegen_result.device_code()
    print(device_src)
    host_src = codegen_result.host_code()
    print(host_src)
def gen_code(knl):
    """Return the device code and host code generated for *knl*.

    The kernel is preprocessed and scheduled, then passed to
    ``lp.generate_code_v2``.

    :arg knl: kernel accepted by ``lp.preprocess_kernel``.
    :return: the device code, a newline, and the host code, concatenated in
        that order.
    """
    scheduled_knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    result = lp.generate_code_v2(scheduled_knl)

    device_src = result.device_code()
    host_src = result.host_code()
    return device_src + "\n" + host_src
def time_schedule(self, data, param):
    """Schedule the instantiated kernel stored for *param*.

    :arg data: mapping from parameter to a dict with an ``"instantiated"``
        entry that supports ``["loopy_kernel"]``, ``.callables_table`` and
        ``.with_kernel``.
    :arg param: key into *data*.

    The result is discarded.
    NOTE(review): presumably this is a timing benchmark where only the cost
    of the call matters -- confirm.
    """
    instantiated = data[param]["instantiated"]
    scheduled_inner = lp.get_one_scheduled_kernel(
        instantiated["loopy_kernel"], instantiated.callables_table)
    instantiated.with_kernel(scheduled_inner)
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    The kernel is type-inferred, preprocessed and scheduled.  The schedule is
    then walked while tracking the stack of enclosing loop inames; every
    barrier or kernel launch contributes the point count of the domain of
    the loops that enclose it, or 1 if it sits outside all loops.

    :parameter knl: A :class:`loopy.LoopKernel` whose synchronization events
        (barriers and kernel launches) are to be counted.

    :return: A dictionary mapping each type of synchronization event to a
        :class:`islpy.PwQPolynomial` holding the number of such events per
        thread. Possible keys include ``barrier_local``, ``barrier_global``
        (if supported by the target) and ``kernel_launch``.

    :raises LoopyError: if the schedule contains an item whose type is not
        handled here.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_polys = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = sync_polys['barrier_local'].eval_with_dict(params)

        # (now use this count to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
            CallKernel, ReturnFromKernel, RunInstruction)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    # Stack of the inames of the loops currently open in the schedule walk.
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        """Return how often a schedule item nested in *iname_list* executes."""
        if not iname_list:
            # Outside every loop: executed exactly once.
            return one
        # Restrict the domain to the enclosing inames and count its points.
        # This used to be wrapped in a one-element tuple and folded with
        # reduce(mul, ...), which just returns the single element.
        enclosing_domain = knl.get_inames_domain(
            iname_list).project_out_except(iname_list, [dim_type.set])
        return count(knl, enclosing_domain)

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            # Not synchronization events; nothing to count.
            pass

        else:
            raise LoopyError("unexpected schedule item: %s"
                    % type(sched_item).__name__)

    return result.dict
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    After type inference, preprocessing and scheduling, the schedule is
    traversed with a stack of the currently open loop inames.  Each barrier
    and each kernel launch is weighted by the point count of the domain of
    its enclosing loops (or by 1 when it is not inside any loop).

    :parameter knl: A :class:`loopy.LoopKernel` whose synchronization events
        (barriers and kernel launches) are to be counted.

    :return: A dictionary mapping each type of synchronization event to a
        :class:`islpy.PwQPolynomial` holding the number of such events per
        thread. Possible keys include ``barrier_local``, ``barrier_global``
        (if supported by the target) and ``kernel_launch``.

    :raises LoopyError: if an unexpected kind of schedule item is found.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_polys = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        launch_count = sync_polys['kernel_launch'].eval_with_dict(params)

        # (now use this count to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
            CallKernel, ReturnFromKernel, RunInstruction)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    # Inames of the loops that are open at the current schedule position.
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        """Return the execution count of an item nested inside *iname_list*."""
        if iname_list:  # (if iname_list is not empty)
            # The previous reduce(mul, (x,)) over a one-element tuple was a
            # no-op product; return the count directly.
            return count(
                knl,
                knl.get_inames_domain(iname_list).project_out_except(
                    iname_list, [dim_type.set]))
        # No enclosing loop: the item runs exactly once.
        return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            event_key = "barrier_%s" % sched_item.kind
            result = result + ToCountMap(
                {event_key: get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            # These items are not counted.
            pass

        else:
            raise LoopyError("unexpected schedule item: %s"
                    % type(sched_item).__name__)

    return result.dict