# Shared setup assumed by the excerpts below (not part of the original
# snippets; it mirrors the imports used in loopy's test_statistics.py):
import os

import numpy as np
import pytest

import loopy as lp
from loopy.statistics import CountGranularity as CG
from loopy.types import to_loopy_type
from pymbolic.primitives import Variable
from pytools import div_ceil

SGS = 32  # subgroup size used throughout


def test_op_counter_triangular_domain():
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    # The exact triangular count needs islpy built with barvinok support;
    # otherwise loopy falls back to a bounding-box estimate.
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    op_map = lp.get_op_map(
            knl,
            count_redundant_work=True
            )[lp.Op(np.float64, 'mul', CG.WORKITEM)]
    value_dict = dict(m=13, n=200)
    flops = op_map.eval_with_dict(value_dict)

    if expect_fallback:
        assert flops == 144
    else:
        assert flops == 78
def test_op_counter_specialops():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
    f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params)
    f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')
                    ].eval_with_dict(params)
    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')
                    ].eval_with_dict(params)
    assert f32div == 2 * n * m * l
    assert f32mul == f32add == n * m * l
    assert f64add == 3 * n * m
    assert f64pow == i32add == f64rsq == f64sin == n * m
def test_op_counter_logic():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(not(k<ell-2) and k>6 or k/2==ell,
                            g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert f32mul == n*m*n_subgroups
    assert f64div == 2*n*m*n_subgroups  # TODO why?
    assert f64add == n*m*n_subgroups
    assert i32add == n*m*n_subgroups
def test_op_counter_logic():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(not(k<ell-2) and k>6 or k/2==ell,
                            g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    op_map = lp.get_op_map(knl, count_redundant_work=True)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
    f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params)
    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.WORKITEM)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
                    ].eval_with_dict(params)
    assert f32mul == n*m
    assert f64div == 2*n*m  # TODO why?
    assert f64add == n*m
    assert i32add == n*m
def test_op_counter_triangular_domain():
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    # The exact triangular count needs islpy built with barvinok support;
    # otherwise loopy falls back to a bounding-box estimate.
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    op_map = lp.get_op_map(
            knl,
            subgroup_size=SGS,
            count_redundant_work=True
            )[lp.Op(np.float64, 'mul', CG.SUBGROUP)]
    value_dict = dict(m=13, n=200)
    flops = op_map.eval_with_dict(value_dict)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    if expect_fallback:
        assert flops == 144 * n_subgroups
    else:
        assert flops == 78 * n_subgroups
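# Note: both triangular-domain variants above repeat the same availability
# probe for barvinok-enabled islpy. A minimal helper sketch under that
# assumption (isl.BasicSet.card exists only when islpy is built with barvinok
# support); the helper name is hypothetical:
def _has_barvinok_card():
    import islpy as isl
    return hasattr(isl.BasicSet, "card")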
def test_op_counter_basic():
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == f32div == n*m*ell*n_subgroups
    assert f64mul == n*m*n_subgroups
    assert i32add == n*m*2*n_subgroups
def test_op_counter_reduction():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul_serial", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == n*m*ell*n_subgroups

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    assert f32 == f32add + f32mul
def test_op_counter_bitwise():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.int32, b=np.int32, g=np.int64, h=np.int64))
    op_map = lp.get_op_map(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
    i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params)
    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
    i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
    assert i32add == n * m + n * m * l
    assert i32bw == 2 * n * m * l
    assert i64bw == 2 * n * m
    assert i64add == i64mul == n * m
    assert i64shift == 2 * n * m
def test_op_counter_basic():
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, count_redundant_work=True)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
                    ].eval_with_dict(params)
    assert f32add == f32mul == f32div == n*m*ell
    assert f64mul == n*m
    assert i32add == n*m*2
def test_all_counters_parallel_matmul():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2 * m / 16

    op_map = lp.get_op_map(knl)
    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
    i32ops = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
    i32ops += op_map[lp.Op(np.dtype(np.int32), 'mul')].eval_with_dict(params)

    assert f32mul + f32add == n * m * l * 2

    op_map = lp.get_mem_access_map(knl)

    f32coal = op_map[lp.MemAccess('global', np.float32, stride=1,
                     direction='load', variable='b')].eval_with_dict(params)
    f32coal += op_map[lp.MemAccess('global', np.float32, stride=1,
                      direction='load', variable='a')].eval_with_dict(params)

    assert f32coal == n * m + m * l

    f32coal = op_map[lp.MemAccess('global', np.float32, stride=1,
                     direction='store', variable='c')].eval_with_dict(params)

    assert f32coal == n * l

    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
    local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                direction='load')].eval_with_dict(params)
    assert local_mem_l == n * m * l * 2
def test_op_counter_specialops():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                """
            ],
            name="specialops", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
                           count_within_subscripts=True)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params)
    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP)
                    ].eval_with_dict(params)
    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP)
                    ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert f32div == 2 * n * m * ell * n_subgroups
    assert f32mul == f32add == n * m * ell * n_subgroups
    assert f64add == 3 * n * m * n_subgroups
    assert f64pow == i32add == f64rsq == f64sin == n * m * n_subgroups
def test_op_counter_bitwise():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.int32, b=np.int32, g=np.int64, h=np.int64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params)
    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP)
                   ].eval_with_dict(params)
    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
                      ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert i32add == n*m+n*m*ell*n_subgroups
    assert i32bw == 2*n*m*ell*n_subgroups
    assert i64bw == 2*n*m*n_subgroups
    assert i64add == i64mul == n*m*n_subgroups
    assert i64shift == 2*n*m*n_subgroups
def test_op_counter_reduction():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul_serial", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    op_map = lp.get_op_map(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    assert f32add == f32mul == n * m * l

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    assert f32 == f32add + f32mul
# Assumed imports for this snippet (sumpy module paths, labeled as an
# assumption since the original excerpt omits them):
import pyopencl as cl
from sumpy.kernel import LaplaceKernel, HelmholtzKernel
from sumpy.expansion.multipole import (
        LaplaceConformingVolumeTaylorMultipoleExpansion,
        HelmholtzConformingVolumeTaylorMultipoleExpansion)
from sumpy.expansion.local import (
        LaplaceConformingVolumeTaylorLocalExpansion,
        HelmholtzConformingVolumeTaylorLocalExpansion)
from sumpy.e2e import E2EFromCSR


def find_flops():
    ctx = cl.create_some_context()

    if 0:
        knl = LaplaceKernel(2)
        m_expn_cls = LaplaceConformingVolumeTaylorMultipoleExpansion
        l_expn_cls = LaplaceConformingVolumeTaylorLocalExpansion
        flop_type = np.float64
    else:
        knl = HelmholtzKernel(2)
        m_expn_cls = HelmholtzConformingVolumeTaylorMultipoleExpansion
        l_expn_cls = HelmholtzConformingVolumeTaylorLocalExpansion
        flop_type = np.complex128

    orders = list(range(1, 11, 1))
    flop_counts = []
    for order in orders:
        print(order)
        m_expn = m_expn_cls(knl, order)
        l_expn = l_expn_cls(knl, order)

        m2l = E2EFromCSR(ctx, m_expn, l_expn)
        loopy_knl = m2l.get_kernel()
        loopy_knl = lp.add_and_infer_dtypes(
                loopy_knl,
                {
                    "target_boxes,src_box_lists,src_box_starts": np.int32,
                    "centers,src_expansions": np.float64,
                    })

        flops = lp.get_op_map(loopy_knl).filter_by(dtype=[flop_type]).sum()
        flop_counts.append(
                flops.eval_with_dict(dict(isrc_start=0, isrc_stop=1, ntgt_boxes=1)))

    print(orders)
    print(flop_counts)
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    pytest.importorskip("fparser")
    ctx = ctx_factory()

    filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90")
    with open(filename) as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    program = lp.parse_fortran(source, filename, seq_dependencies=False)
    hsv_r, hsv_s = program["strongVolumeKernelR"], program["strongVolumeKernelS"]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)

    from gnuma_loopy_transforms import (
            fix_euler_parameters, set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    from loopy.frontend.fortran.translator import specialize_fortran_division
    hsv = specialize_fortran_division(hsv)

    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]", fetch_outer_inames="e",
            default_tag="l.auto")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                    "tag:{knl_tag} and reads:{flux_var}".format(
                        knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var + "_subst",
                    flux_inames + ilp_inames,
                    temporary_name=flux_store_name,
                    precompute_inames=flux_precomp_inames + flux_ilp_inames,
                    default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:" + reader.id,
                    existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"),
                default_tag="l.auto")

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames,
            default_tag="l.auto")

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
            fetch_bounding_box=True, default_tag="for",
            init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
            vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero", "-cl-fast-relaxed-math",
        "-cl-finite-math-only", "-cl-mad-enable", "-cl-no-signed-zeros"])

    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv, subgroup_size=32)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    # FIXME: renaming's a bit tricky in this program model.
    # add a simple transformation for it
    # hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
            parameters=dict(elements=300), quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
def _cache_kernel_stats(self, t_unit: lp.TranslationUnit, kwargs: dict) \
        -> tuple:
    """Generate the kernel stats for a program with its args."""
    args_tuple = tuple(
        (key, value.shape) if hasattr(value, "shape") else (key, value)
        for key, value in kwargs.items())

    # Are kernel stats already in the cache?
    try:
        self.kernel_stats[t_unit][args_tuple]
        return args_tuple
    except KeyError:
        # If not, calculate and cache the stats
        ep_name = t_unit.default_entrypoint.name
        executor = t_unit.target.get_kernel_executor(t_unit, self.queue,
                                                     entrypoint=ep_name)
        info = executor.translation_unit_info(
            ep_name, executor.arg_to_dtype_set(kwargs))

        typed_t_unit = executor.get_typed_and_scheduled_translation_unit(
            ep_name, executor.arg_to_dtype_set(kwargs))
        kernel = typed_t_unit[ep_name]

        idi = info.implemented_data_info

        param_dict = kwargs.copy()
        param_dict.update({k: None for k in kernel.arg_dict.keys()
                           if k not in param_dict})

        param_dict.update(
            {d.name: None for d in idi if d.name not in param_dict})

        # Generate the wrapper code
        wrapper = executor.get_wrapper_generator()

        gen = PythonFunctionGenerator("_mcom_gen_args_profile",
                                      list(param_dict))

        wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

        param_names = kernel.all_params()
        gen("return {%s}" % ", ".join(
            f"{repr(name)}: {name}" for name in param_names))

        # Run the wrapper code, save argument values in domain_params
        domain_params = gen.get_picklable_function()(**param_dict)

        # Get flops/memory statistics
        op_map = lp.get_op_map(typed_t_unit, subgroup_size="guess")
        bytes_accessed = lp.get_mem_access_map(
            typed_t_unit, subgroup_size="guess") \
            .to_bytes().eval_and_sum(domain_params)

        flops = op_map.filter_by(
            dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

        # Footprint gathering is not yet available in loopy with
        # kernel callables:
        # https://github.com/inducer/loopy/issues/399
        if 0:
            try:
                footprint = lp.gather_access_footprint_bytes(typed_t_unit)
                footprint_bytes = sum(
                    footprint[k].eval_with_dict(domain_params)
                    for k in footprint)
            except lp.symbolic.UnableToDetermineAccessRange:
                footprint_bytes = None
        else:
            footprint_bytes = None

        res = SingleCallKernelProfile(time=0, flops=flops,
                                      bytes_accessed=bytes_accessed,
                                      footprint_bytes=footprint_bytes)

        self.kernel_stats.setdefault(t_unit, {})[args_tuple] = res

        if self.logmgr:
            if f"{ep_name}_time" not in self.logmgr.quantity_data:
                self.logmgr.add_quantity(KernelProfile(self, ep_name))

        return args_tuple
def test_all_counters_parallel_matmul():
    bsize = 16
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    group_size = bsize*bsize
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    f32mul = op_map[
            lp.Op(np.float32, 'mul', CG.SUBGROUP)
            ].eval_with_dict(params)
    f32add = op_map[
            lp.Op(np.float32, 'add', CG.SUBGROUP)
            ].eval_with_dict(params)
    i32ops = op_map[
            lp.Op(np.int32, 'add', CG.SUBGROUP)
            ].eval_with_dict(params)
    i32ops += op_map[
            lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP)
            ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert f32mul+f32add == m*2*n_subgroups

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('ell')},
                             gid_strides={1: bsize},
                             direction='load', variable='b',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('m')},
                             gid_strides={0: Variable('m')*bsize},
                             direction='load', variable='a',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)

    assert f32s1lb == n*m*ell/bsize
    assert f32s1la == n*m*ell/bsize

    f32coal = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('ell')},
                             gid_strides={0: Variable('ell')*bsize, 1: bsize},
                             direction='store', variable='c',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)

    assert f32coal == n*ell

    local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                          subgroup_size=SGS
                                          ).filter_by(mtype=['local'])

    local_mem_l = local_mem_map.filter_by(direction=['load']
                                          ).eval_and_sum(params)
    # (count-per-sub-group)*n_subgroups
    assert local_mem_l == m*2*n_subgroups

    local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                               direction='load',
                                               lid_strides={1: 16},
                                               gid_strides={},
                                               variable='a_fetch',
                                               count_granularity=CG.SUBGROUP)
                                  ].eval_with_dict(params)
    local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                               direction='load',
                                               lid_strides={0: 1},
                                               gid_strides={},
                                               variable='b_fetch',
                                               count_granularity=CG.SUBGROUP)
                                  ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_l_a == local_mem_l_b == m*n_subgroups

    local_mem_s = local_mem_map.filter_by(direction=['store']
                                          ).eval_and_sum(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_s == m*2/bsize*n_subgroups
def test_summations_and_filters():
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    mem_map = lp.get_mem_access_map(knl)

    loads_a = mem_map.filter_by(direction=['load'],
                                variable=['a']).eval_and_sum(params)
    assert loads_a == 2 * n * m * l

    global_stores = mem_map.filter_by(mtype=['global'],
                                      direction=['store']).eval_and_sum(params)
    assert global_stores == n * m * l + n * m

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                                 ).to_bytes().eval_and_sum(params)
    assert ld_bytes == 4 * n * m * l * 3 + 8 * n * m * 2
    assert st_bytes == 4 * n * m * l + 8 * n * m

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
                          ].eval_with_dict(params)
    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                          ].eval_with_dict(params)
    assert f32lall == 3 * n * m * l
    assert f64lall == 2 * n * m

    op_map = lp.get_op_map(knl)
    #for k, v in op_map.items():
    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n * m * l * 3
    assert f64 == n * m
    assert i32 == n * m * 2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n * m * l + n * m * 2
    assert f32ops_all == n * m * l * 3

    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')
    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n * m * l + n * m
    assert f64ops_all == n * m

    def func_filter(key):
        return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \
               key.direction == 'load'
    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
    assert s1f64l == 2 * n * m
if rank == 0:
    if mesh.layers:
        cells = cells * (mesh.layers - 1)
    print("CELLS= {0}".format(cells))
    print("DOFS= {0}".format(dofs))

    from loopy.program import make_program
    knl = compile_form(y_form, coffee=False)[0].ast
    warnings = list(knl.silenced_warnings)
    warnings.extend(["insn_count_subgroups_upper_bound", "no_lid_found"])
    knl = knl.copy(silenced_warnings=warnings)
    knl.options.ignore_boostable_into = True
    program = make_program(knl)
    op_map = lp.get_op_map(program, subgroup_size=1)
    mem_map = lp.get_mem_access_map(program, subgroup_size=1)
    for op in ['add', 'sub', 'mul', 'div']:
        print("{0}S= {1}".format(
            op.upper(),
            op_map.filter_by(name=[op], dtype=[np.float64]).eval_and_sum({})))
    print("MEMS= {0}".format(
        mem_map.filter_by(mtype=['global'], dtype=[np.float64]).eval_and_sum({})))
    print("INSTRUCTIONS= {0:d}".format(len(knl.instructions)))
    print("LOOPS= {0:d}".format(len(knl.all_inames())))
    for domain in knl.domains:
        if domain.get_dim_name(3, 0)[0] == "j":
            print("DOF_LOOP_EXTENT= {0:d}".format(
                int(domain.dim_max_val(0).to_str()) + 1))
def _cache_kernel_stats(self, program: lp.kernel.LoopKernel, kwargs: dict) \
        -> tuple:
    """Generate the kernel stats for a program with its args."""
    args_tuple = tuple(
        (key, value.shape) if hasattr(value, "shape") else (key, value)
        for key, value in kwargs.items())

    # Are kernel stats already in the cache?
    try:
        x = self.kernel_stats[program][args_tuple]  # noqa
        return args_tuple
    except KeyError:
        # If not, calculate and cache the stats
        executor = program.target.get_kernel_executor(program, self.queue)
        info = executor.kernel_info(executor.arg_to_dtype_set(kwargs))

        kernel = executor.get_typed_and_scheduled_kernel(
            executor.arg_to_dtype_set(kwargs))

        idi = info.implemented_data_info

        types = {k: v for k, v in kwargs.items()
                 if hasattr(v, "dtype") and not v.dtype == object}

        param_dict = kwargs.copy()
        param_dict.update({k: None for k in kernel.arg_dict.keys()
                           if k not in param_dict})

        param_dict.update(
            {d.name: None for d in idi if d.name not in param_dict})

        # Generate the wrapper code
        wrapper = executor.get_wrapper_generator()

        gen = PythonFunctionGenerator("_mcom_gen_args_profile",
                                      list(param_dict))

        wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

        param_names = program.all_params()
        gen("return {%s}" % ", ".join(
            f"{repr(name)}: {name}" for name in param_names))

        # Run the wrapper code, save argument values in domain_params
        domain_params = gen.get_picklable_function()(**param_dict)

        # Get flops/memory statistics
        kernel = lp.add_and_infer_dtypes(kernel, types)
        op_map = lp.get_op_map(kernel, subgroup_size="guess")
        bytes_accessed = lp.get_mem_access_map(kernel, subgroup_size="guess") \
            .to_bytes().eval_and_sum(domain_params)

        flops = op_map.filter_by(
            dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

        try:
            footprint = lp.gather_access_footprint_bytes(kernel)
            footprint_bytes = sum(
                footprint[k].eval_with_dict(domain_params) for k in footprint)
        except lp.symbolic.UnableToDetermineAccessRange:
            footprint_bytes = None

        res = ProfileResult(time=0, flops=flops,
                            bytes_accessed=bytes_accessed,
                            footprint_bytes=footprint_bytes)

        self.kernel_stats.setdefault(program, {})[args_tuple] = res
        return args_tuple
# peek at generated code
evt, (out, ) = knl(queue, a=x_vec_host)

knl = lp.make_kernel("{ [i]: 0<=i<n }", "a[i] = 0", assumptions="n>=1")
knl = lp.split_iname(knl, "i", 16)  # split loop variable
knl = lp.prioritize_loops(knl, "i_outer,i_inner")
knl = lp.set_options(knl, "write_cl")
evt, (out, ) = knl(queue, a=x_vec_dev)

knl = lp.make_kernel("{ [i]: 0<=i<n }", "a[i] = a[i] * b[i] + c[i]",
                     assumptions="n>=0 and n mod 4 = 0")
orig_knl = knl  # copy kernel, test assumptions, and unrolling
knl = lp.split_iname(knl, "i", 4)
knl = lp.tag_inames(knl, dict(i_inner="unr"))
knl = lp.prioritize_loops(knl, "i_outer,i_inner")
knl = lp.set_options(knl, "write_cl")
evt, (out, ) = knl(queue, a=x_vec_dev, b=y_vec_dev, c=z_vec_dev)

from warnings import resetwarnings, filterwarnings
resetwarnings()  # suppress some warnings during stats
filterwarnings('ignore', category=Warning)

knl = lp.add_and_infer_dtypes(knl,
                              dict(a=np.float32, b=np.float32, c=np.float32))
op_map = lp.get_op_map(knl)  # count operations
print(lp.stringify_stats_mapping(op_map))

mem_map = lp.get_mem_access_map(knl)  # count memory accesses (loads, stores)
print(lp.stringify_stats_mapping(mem_map))
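# The maps printed above hold symbolic (parametric) counts. A minimal sketch,
# assuming the kernel above with its parameter "n" (the value 2**20 is
# arbitrary and satisfies the "n mod 4 = 0" assumption): evaluate the counts
# for a concrete problem size.
param_values = {"n": 2**20}
f32ops = op_map.filter_by(dtype=[np.float32]).eval_and_sum(param_values)
bytes_moved = mem_map.to_bytes().eval_and_sum(param_values)
print("f32 ops:", f32ops, " bytes accessed:", bytes_moved)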
def test_summations_and_filters():
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
                                count_granularity=[CG.SUBGROUP]
                                ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert loads_a == (2*n*m*ell)*n_subgroups

    global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
                                      count_granularity=[CG.SUBGROUP]
                                      ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert global_stores == (n*m*ell + n*m)*n_subgroups

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
                          ].eval_with_dict(params)
    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                          ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32lall == (3*n*m*ell)*n_subgroups
    assert f64lall == (2*n*m)*n_subgroups

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    #for k, v in op_map.items():
    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n*m*ell*3
    assert f64 == n*m
    assert i32 == n*m*2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n*m*ell + n*m*2
    assert f32ops_all == n*m*ell*3

    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')
    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n*m*ell + n*m
    assert f64ops_all == n*m

    def func_filter(key):
        return key.lid_strides == {} and \
               key.dtype == to_loopy_type(np.float64) and \
               key.direction == 'load'
    f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups
queue.finish()
t2 = time.time()

evt, vals = knl_g(queue, target_points=pts)
queue.finish()
t3 = time.time()

print("Tests run with %d threads." % ncpus)
print("Wall time w/t tag l.0:", t1 - t0)
print("Wall time w/t tag g.0:", t3 - t2)

# }}} End wall time

# {{{ operation counts

# count the total work
op_map = lp.get_op_map(knl, subgroup_size=ncpus,
                       count_redundant_work=True,
                       count_within_subscripts=True)
params = dict(n_targets=pts.shape[1])

print('Operation counts:')
total_ops = 0
for op in op_map.keys():
    sub_count = op_map[op].eval_with_dict(params)
    total_ops += sub_count
    print('\t', op.name, op_map[op], sub_count)
print("Total:", total_ops)

# TODO: weight each operation by running micro-benchmarks
print("OP throughput w/t tag l.0 = %.2f GFLOPS/S"
      % (total_ops / (t1 - t0) * 1e-9))
print("OP throughput w/t tag g.0 = %.2f GFLOPS/S"
      % (total_ops / (t3 - t2) * 1e-9))
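# Sketch for the TODO above: weight each operation class by an assumed
# relative cost instead of counting every op as one flop. The cost table is
# illustrative only (made up, not measured); real weights would come from the
# micro-benchmarks the TODO mentions.
assumed_cost = {"add": 1, "sub": 1, "mul": 1, "div": 4, "pow": 8}
weighted_ops = sum(
        assumed_cost.get(op.name, 1)*op_map[op].eval_with_dict(params)
        for op in op_map.keys())
print("Cost-weighted op count:", weighted_ops)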