def test_vectorize(ctx_factory):
    """Check that a vec-tagged array axis + iname generates working code.

    Builds a tiny copy-through kernel, splits the array axes of ``a`` and
    ``b`` by 4, tags the inner axis/iname as vectorized, and validates the
    result against an unrolled reference kernel.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            <> temp = 2*b[i]
            a[i] = temp
            """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    # Use the current loopy API spellings (consistent with
    # set_q_storage_format in this file); set_array_dim_names and
    # tag_data_axes are the deprecated names of these transforms.
    knl = lp.set_array_axis_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))
    knl = lp.tag_array_axes(knl, "a,b", "c,vec")

    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    # Exercise code generation explicitly before the auto-test.
    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=30))
def test_small_batched_matvec(ctx_factory):
    """Batched matrix-vector product with prefetching and array padding.

    Prefetches the (small) matrix ``d``, pads the leading stride of ``f``
    up to a 64-byte multiple, and checks the transformed kernel against
    the untransformed reference.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    K = 9997  # noqa
    Np = 36  # noqa

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
            [
                "result[k, i] = sum(j, d[i, j]*f[k, j])"
                ],
            [
                lp.GlobalArg("d", dtype, shape=(Np, Np), order=order),
                lp.GlobalArg("f", dtype, shape=("K", Np), order=order),
                lp.GlobalArg("result", dtype, shape=("K", Np), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="batched_matvec", assumptions="K>=1")

    seq_knl = knl

    align_bytes = 64
    # Pass default_tag explicitly (matching the other variant of this test
    # in the file): newer loopy requires the prefetch iname tag to be
    # stated rather than implicitly defaulting to l.auto.
    knl = lp.add_prefetch(knl, 'd[:,:]', default_tag="l.auto")
    pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes)
    knl = lp.split_array_dim(knl, ("f", 0), pad_mult)
    knl = lp.add_padding(knl, "f", 0, align_bytes)

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[K*2*Np**2/1e9], op_label=["GFlops"],
            parameters=dict(K=K))
def test_small_batched_matvec(ctx_factory):
    """Test a batched matvec: prefetch the matrix, pad the vector array.

    The reference (untransformed) kernel is compared against a version
    with a prefetched ``d`` and a 64-byte-aligned padding of ``f``.
    """
    ctx = ctx_factory()
    order = "C"
    dtype = np.float32

    K = 9997  # noqa
    Np = 36  # noqa

    args = [
        lp.GlobalArg("d", dtype, shape=(Np, Np), order=order),
        lp.GlobalArg("f", dtype, shape=("K", Np), order=order),
        lp.GlobalArg("result", dtype, shape=("K", Np), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
        ]
    knl = lp.make_kernel(
            "{[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
            ["result[k, i] = sum(j, d[i, j]*f[k, j])"],
            args,
            name="batched_matvec",
            assumptions="K>=1")

    ref_knl = knl

    # Transform: prefetch the whole of d, then pad f's leading axis so
    # its stride is a multiple of 64 bytes.
    alignment = 64
    knl = lp.add_prefetch(knl, 'd[:,:]', default_tag="l.auto")
    multiple = lp.find_padding_multiple(knl, "f", 0, alignment)
    knl = lp.split_array_dim(knl, ("f", 0), multiple)
    knl = lp.add_padding(knl, "f", 0, alignment)

    gflop_count = K * 2 * Np**2 / 1e9
    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            op_count=[gflop_count],
            op_label=["GFlops"],
            parameters=dict(K=K))
def test_vectorize(ctx_factory):
    """Exercise vectorization: split array axes by 4 and tag them "vec".

    The vectorized kernel is checked against a reference kernel whose
    inner iname is unrolled instead.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            <> temp = 2*b[i]
            a[i] = temp
            """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(
            knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))
    knl = lp.tag_data_axes(knl, "a,b", "c,vec")

    # Reference: same kernel, but with the inner iname unrolled.
    ref_knl = lp.tag_inames(knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    # Run code generation once up front to catch failures early.
    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
def set_q_storage_format(kernel, name):
    """Apply the vectorized q-array storage layout to array *name*.

    Names the five axes (i, j, k, field, e), splits the "field" axis by 4
    in Fortran order without splitting inames, and tags the split-off
    axis as vectorized.
    """
    # Use the current loopy API names, consistent with the other copy of
    # this helper in the file; set_array_dim_names and tag_data_axes are
    # the deprecated spellings of these transforms.
    kernel = lp.set_array_axis_names(kernel, name, "i,j,k,field,e")
    kernel = lp.split_array_dim(kernel, (name, 3, "F"), 4,
            auto_split_inames=False)
    kernel = lp.tag_array_axes(kernel, name, "N0,N1,N2,vec,N4,N3")
    return kernel
def set_q_storage_format(kernel, name):
    """Give array *name* the vectorized q storage layout.

    The five axes are named (i, j, k, field, e); the "field" axis is
    split by 4 in Fortran order (inames untouched) and the resulting
    length-4 axis is tagged as vectorized.
    """
    result = lp.set_array_axis_names(kernel, name, "i,j,k,field,e")
    result = lp.split_array_dim(
            result, (name, 3, "F"), 4, auto_split_inames=False)
    return lp.tag_array_axes(result, name, "N0,N1,N2,vec,N4,N3")
def variant_fancy_padding(knl):
    """Split the leading axis of every field array by a padding multiple.

    The multiple is derived from a 32-byte alignment requirement on axis 1
    of ``u`` and then applied to u/v/w/p and their rhs counterparts.
    """
    knl = lp.tag_inames(knl, dict(n="l.0"))

    multiple = lp.find_padding_multiple(knl, "u", 1, 32)

    # Same ordering as before: u, rhsu, v, rhsv, w, rhsw, p, rhsp.
    field_names = []
    for base in ["u", "v", "w", "p"]:
        for pfx in ["", "rhs"]:
            field_names.append(pfx + base)

    knl = lp.split_array_dim(
            knl, [(fname, 0) for fname in field_names], multiple)
    return knl