def test_mem_access_counter_logic(): knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) """ ], name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') f32_g_l = reduced_map[lp.MemAccess( 'global', to_loopy_type(np.float32), direction='load')].eval_with_dict(params) f64_g_l = reduced_map[lp.MemAccess( 'global', to_loopy_type(np.float64), direction='load')].eval_with_dict(params) f64_g_s = reduced_map[lp.MemAccess( 'global', to_loopy_type(np.float64), direction='store')].eval_with_dict(params) assert f32_g_l == 2 * n * m assert f64_g_l == n * m assert f64_g_s == n * m
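# Note (informal): group_by('mtype', 'dtype', 'direction'), as used above,
# collapses the access map so that entries differing only in the omitted
# fields (strides, variable name, granularity) are summed together; the
# reduced map is keyed on the named fields alone, which is why the lookups
# pass no variable= argument.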
def test_all_counters_parallel_matmul():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2 * m / 16

    op_map = lp.get_op_map(knl)
    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
    i32ops = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
    i32ops += op_map[lp.Op(np.dtype(np.int32), 'mul')].eval_with_dict(params)

    assert f32mul + f32add == n * m * l * 2

    mem_map = lp.get_mem_access_map(knl)

    f32coal = mem_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
    f32coal += mem_map[lp.MemAccess('global', np.float32,
                       stride=1, direction='load', variable='a')
                       ].eval_with_dict(params)

    assert f32coal == n * m + m * l

    f32coal = mem_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='store', variable='c')
                      ].eval_with_dict(params)

    assert f32coal == n * l

    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
    local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                             direction='load')
                                ].eval_with_dict(params)
    assert local_mem_l == n * m * l * 2
def test_mem_access_counter_reduction():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    subgroup_size = 32

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)

    f32l = mem_map[lp.MemAccess('global', np.float32,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='a',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                                 lid_strides={}, gid_strides={},
                                 direction='load', variable='b',
                                 count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                lid_strides={}, gid_strides={},
                                direction='store', variable='c',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32s == (n*ell)*n_workgroups*subgroups_per_group

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                                 ).to_bytes().eval_and_sum(params)
    assert ld_bytes == 4*f32l
    assert st_bytes == 4*f32s
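# Worked instance of the subgroup-granularity convention used above and
# throughout the tests below (a sanity check in comment form, not an extra
# assertion): the kernel is left unparallelized, so n_workgroups = 1 and
# group_size = 1, giving subgroups_per_group = div_ceil(1, 32) = 1. The
# asserted load count is then just the per-subgroup count,
# 2*n*m*ell = 2*512*256*128 = 33554432.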
def test_mem_access_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            """
            e[i,k] = if(not(k<ell-2) and k>6 or k/2==ell,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
            """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    subgroup_size = 32

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)

    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
                                       direction='load')
                          ].eval_with_dict(params)
    f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                                       direction='load')
                          ].eval_with_dict(params)
    f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                                       direction='store')
                          ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group
    assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group
    assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group
def test_mem_access_counter_reduction(): knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", ["c[i, j] = sum(k, a[i, k]*b[k, j])"], name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a')].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b')].eval_with_dict(params) assert f32l == 2 * n * m * l f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c')].eval_with_dict(params) assert f32s == n * l ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load' ]).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store' ]).to_bytes().eval_and_sum(params) assert ld_bytes == 4 * f32l assert st_bytes == 4 * f32s
def test_mem_access_counter_specialops(): knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0) e[i, k] = (1+g[i,k])**(1+h[i,k+1]) """ ], name="specialops", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes( knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a')].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b')].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g')].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h')].eval_with_dict(params) assert f32 == 2 * n * m * l assert f64 == 2 * n * m f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c')].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e')].eval_with_dict(params) assert f32 == n * m * l assert f64 == n * m filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g']) #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) assert tot == n * m * l + n * m
def test_count_granularity_val_checks():

    try:
        lp.MemAccess(count_granularity=CG.WORKITEM)
        lp.MemAccess(count_granularity=CG.SUBGROUP)
        lp.MemAccess(count_granularity=CG.WORKGROUP)
        lp.MemAccess(count_granularity=None)
        assert True
        lp.MemAccess(count_granularity='bushel')
        assert False
    except ValueError:
        assert True

    try:
        lp.Op(count_granularity=CG.WORKITEM)
        lp.Op(count_granularity=CG.SUBGROUP)
        lp.Op(count_granularity=CG.WORKGROUP)
        lp.Op(count_granularity=None)
        assert True
        lp.Op(count_granularity='bushel')
        assert False
    except ValueError:
        assert True
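# A sketch of the same validity checks in pytest.raises form. This is an
# illustrative alternative, not part of the original suite; the function
# name is made up, and it relies only on behavior the test above already
# asserts (valid granularities construct fine, 'bushel' raises ValueError).
def test_count_granularity_val_checks_pytest_style():
    import pytest

    for cg in [CG.WORKITEM, CG.SUBGROUP, CG.WORKGROUP, None]:
        # valid values: construction must not raise
        lp.MemAccess(count_granularity=cg)
        lp.Op(count_granularity=cg)

    # an invalid value must raise ValueError
    with pytest.raises(ValueError):
        lp.MemAccess(count_granularity='bushel')
    with pytest.raises(ValueError):
        lp.Op(count_granularity='bushel')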
def test_mem_access_counter_nonconsec():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
            ],
            name="nonconsec", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(
            a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                           stride=Variable('m'),
                           direction='load', variable='g')
                           ].eval_with_dict(params)
    f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
                            stride=Variable('m'),
                            direction='load', variable='h')
                            ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                           stride=Variable('m') * Variable('ell'),
                           direction='load', variable='a')
                           ].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                            stride=Variable('m') * Variable('ell'),
                            direction='load', variable='b')
                            ].eval_with_dict(params)
    assert f64nonconsec == 2 * n * m
    assert f32nonconsec == 3 * n * m * ell

    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                           stride=Variable('m'),
                           direction='store', variable='e')
                           ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                           stride=Variable('m') * Variable('ell'),
                           direction='store', variable='c')
                           ].eval_with_dict(params)
    assert f64nonconsec == n * m
    assert f32nonconsec == n * m * ell
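# Why the asserted strides look the way they do (informal derivation, no
# new assertions): i is split and i_inner is tagged l.0, so neighboring
# workitems differ in i. g[i,k] has shape (n, m), so stepping i by one
# advances the address by m elements (stride m); a[i,j,k] has shape
# (n, ell, m), so stepping i advances by ell*m elements (stride m*ell).
# Neither access is unit-stride across workitems, hence "nonconsec".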
def test_mem_access_counter_consec(): knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] e[i, k] = g[i,k]*(2+h[i,k]) """ ], name="consec", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes( knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g')].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h')].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a')].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b')].eval_with_dict(params) assert f64consec == 2 * n * m assert f32consec == 3 * n * m * l f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e')].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c')].eval_with_dict(params) assert f64consec == n * m assert f32consec == n * m * l
def test_mem_access_counter_basic(): knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] e[i, k] = g[i,k]*h[i,k+1] """ ], name="basic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes( knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a')].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b')].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g')].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h')].eval_with_dict(params) assert f32l == 3 * n * m * l assert f64l == 2 * n * m f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c')].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e')].eval_with_dict(params) assert f32s == n * m * l assert f64s == n * m
def test_mem_access_counter_bitwise(): knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1) e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k)) """ ], name="bitwise", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes( knl, dict(a=np.int32, b=np.int32, g=np.int32, h=np.int32)) mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a')].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b')].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g')].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h')].eval_with_dict(params) assert i32 == 4 * n * m + 2 * n * m * l i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c')].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e')].eval_with_dict(params) assert i32 == n * m + n * m * l
def test_mem_access_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            """
            c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
            e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
            """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int32, h=np.int32))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    i32 = mem_map[lp.MemAccess('global', np.int32,
                               lid_strides={}, gid_strides={},
                               direction='load', variable='a',
                               count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='b',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='g',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                                lid_strides={}, gid_strides={},
                                direction='load', variable='h',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (4*n*m+2*n*m*ell)*n_subgroups

    i32 = mem_map[lp.MemAccess('global', np.int32,
                               lid_strides={}, gid_strides={},
                               direction='store', variable='c',
                               count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                                lid_strides={}, gid_strides={},
                                direction='store', variable='e',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (n*m+n*m*ell)*n_subgroups
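# Counting check for the bitwise kernel (informal, mirrors the asserts
# above): the c-instruction runs n*m*ell times and loads a and b once each,
# giving 2*n*m*ell i32 loads; the e-instruction runs n*m times and loads g
# twice and h twice, giving 4*n*m i32 loads. With one workgroup of size 1
# there is a single subgroup, so the SUBGROUP-granularity total is
# (4*n*m + 2*n*m*ell)*1.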
def test_mem_access_counter_specialops():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            """
            c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
            e[i, k] = (1+g[i,k])**(1+h[i,k+1])
            """
            ],
            name="specialops", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    f32 = mem_map[lp.MemAccess('global', np.float32,
                               lid_strides={}, gid_strides={},
                               direction='load', variable='a',
                               count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f32 += mem_map[lp.MemAccess('global', np.float32,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='b',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                               lid_strides={}, gid_strides={},
                               direction='load', variable='g',
                               count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                                lid_strides={}, gid_strides={},
                                direction='load', variable='h',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32 == (2*n*m*ell)*n_subgroups
    assert f64 == (2*n*m)*n_subgroups

    f32 = mem_map[lp.MemAccess('global', np.float32,
                               lid_strides={}, gid_strides={},
                               direction='store', variable='c',
                               count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f64 = mem_map[lp.MemAccess('global', np.float64,
                               lid_strides={}, gid_strides={},
                               direction='store', variable='e',
                               count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32 == (n*m*ell)*n_subgroups
    assert f64 == (n*m)*n_subgroups

    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
                                     count_granularity=CG.SUBGROUP)
    tot = filtered_map.eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert tot == (n*m*ell + n*m)*n_subgroups
def test_mem_access_counter_basic():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*h[i,k+1]
            """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    f32l = mem_map[lp.MemAccess('global', np.float32,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='a',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                                 lid_strides={}, gid_strides={},
                                 direction='load', variable='b',
                                 count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
    f64l = mem_map[lp.MemAccess('global', np.float64,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='g',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64l += mem_map[lp.MemAccess('global', np.float64,
                                 lid_strides={}, gid_strides={},
                                 direction='load', variable='h',
                                 count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32l == (3*n*m*ell)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                lid_strides={}, gid_strides={},
                                direction='store', variable='c',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                                lid_strides={}, gid_strides={},
                                direction='store', variable='e',
                                count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s == (n*m*ell)*n_subgroups
    assert f64s == (n*m)*n_subgroups
def test_summations_and_filters():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k+1] = -g[i,k]*h[i,k+1]
            """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
                                count_granularity=[CG.SUBGROUP]
                                ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert loads_a == (2*n*m*ell)*n_subgroups

    global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
                                      count_granularity=[CG.SUBGROUP]
                                      ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert global_stores == (n*m*ell + n*m)*n_subgroups

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
    f32lall = reduced_map[lp.MemAccess('global', np.float32,
                                       direction='load')
                          ].eval_with_dict(params)
    f64lall = reduced_map[lp.MemAccess('global', np.float64,
                                       direction='load')
                          ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32lall == (3*n*m*ell)*n_subgroups
    assert f64lall == (2*n*m)*n_subgroups

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    #for k, v in op_map.items():
    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n*m*ell*3
    assert f64 == n*m
    assert i32 == n*m*2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n*m*ell + n*m*2
    assert f32ops_all == n*m*ell*3

    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')

    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n*m*ell + n*m
    assert f64ops_all == n*m

    def func_filter(key):
        return key.lid_strides == {} and key.dtype == to_loopy_type(
                np.float64) and key.direction == 'load'

    f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups
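# How the byte totals above come about (informal): to_bytes() weights each
# access count by its dtype size. The three f32 loads per (i,j,k) iteration
# (a twice, b once) contribute 4*3*n*m*ell bytes and the two f64 loads per
# (i,k) iteration (g and h once each) contribute 8*2*n*m bytes; the single
# f32 store to c and f64 store to e likewise yield 4*n*m*ell + 8*n*m bytes.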
def test_mem_access_counter_mixed(): knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k] e[i, k] = g[i,k]*(2+h[i,k]) """ ], name="mixed", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes( knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64, x=np.float32)) threads = 16 knl = lp.split_iname(knl, "j", threads) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) mem_map = lp.get_mem_access_map(knl) # noqa n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g')].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h')].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x')].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a')].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b')].eval_with_dict(params) assert f64uniform == 2 * n * m assert f32uniform == n * m * l / threads assert f32nonconsec == 3 * n * m * l f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e')].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c')].eval_with_dict(params) assert f64uniform == n * m assert f32nonconsec == n * m * l
def test_mem_access_counter_mixed():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
            ],
            name="mixed", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(
            a=np.float32, b=np.float32, g=np.float64, h=np.float64,
            x=np.float32))

    group_size_0 = 65

    knl = lp.split_iname(knl, "j", group_size_0)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = div_ceil(ell, group_size_0)
    group_size = group_size_0
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                      lid_strides={}, gid_strides={},
                                      direction='load', variable='g',
                                      count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f64uniform += mem_map[lp.MemAccess('global', np.float64,
                                       lid_strides={}, gid_strides={},
                                       direction='load', variable='h',
                                       count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
    f32uniform = mem_map[lp.MemAccess('global', np.float32,
                                      lid_strides={}, gid_strides={},
                                      direction='load', variable='x',
                                      count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess(
            'global', np.dtype(np.float32),
            lid_strides={0: Variable('m')},
            gid_strides={0: Variable('m')*group_size_0},
            direction='load', variable='a',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess(
            'global', np.dtype(np.float32),
            lid_strides={0: Variable('m')},
            gid_strides={0: Variable('m')*group_size_0},
            direction='load', variable='b',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == (2*n*m)*n_subgroups
    assert f32uniform == (m*n)*n_subgroups

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    if expect_fallback:
        if ell < group_size_0:
            assert f32nonconsec == 3*n*m*ell*n_workgroups
        else:
            assert f32nonconsec == 3*n*m*n_workgroups*group_size_0
    else:
        assert f32nonconsec == 3*n*m*ell

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                      lid_strides={}, gid_strides={},
                                      direction='store', variable='e',
                                      count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={0: Variable('m')},
            gid_strides={0: Variable('m')*group_size_0},
            direction='store', variable='c',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == m*n*n_subgroups

    if expect_fallback:
        if ell < group_size_0:
            assert f32nonconsec == n*m*ell*n_workgroups
        else:
            assert f32nonconsec == n*m*n_workgroups*group_size_0
    else:
        assert f32nonconsec == n*m*ell
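# Concrete values behind the bookkeeping above (informal): with ell = 128
# and group_size_0 = 65, n_workgroups = div_ceil(128, 65) = 2 and
# subgroups_per_group = div_ceil(65, 32) = 3, so n_subgroups = 6. The
# deliberately non-power-of-two group size exercises the div_ceil rounding
# and, via the partially filled second workgroup, the redundant-work
# accounting enabled by count_redundant_work=True.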
def test_mem_access_counter_consec():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
            ],
            name="consec", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(
            a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size='guess')
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    f64consec = mem_map[lp.MemAccess(
            'global', np.float64,
            lid_strides={0: 1}, gid_strides={0: Variable('m')},
            direction='load', variable='g',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)
    f64consec += mem_map[lp.MemAccess(
            'global', np.float64,
            lid_strides={0: 1}, gid_strides={0: Variable('m')},
            direction='load', variable='h',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)
    f32consec = mem_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={0: 1},
            gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
            direction='load', variable='a',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)
    f32consec += mem_map[lp.MemAccess(
            'global', np.dtype(np.float32),
            lid_strides={0: 1},
            gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
            direction='load', variable='b',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)
    assert f64consec == 2*n*m*ell
    assert f32consec == 3*n*m*ell

    f64consec = mem_map[lp.MemAccess(
            'global', np.float64,
            lid_strides={0: 1}, gid_strides={0: Variable('m')},
            direction='store', variable='e',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)
    f32consec = mem_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={0: 1},
            gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
            direction='store', variable='c',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)
    assert f64consec == n*m*ell
    assert f32consec == n*m*ell
def test_summations_and_filters(): knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] e[i, k+1] = -g[i,k]*h[i,k+1] """ ], name="basic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes( knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} mem_map = lp.get_mem_access_map(knl) loads_a = mem_map.filter_by(direction=['load'], variable=['a']).eval_and_sum(params) assert loads_a == 2 * n * m * l global_stores = mem_map.filter_by(mtype=['global'], direction=['store']).eval_and_sum(params) assert global_stores == n * m * l + n * m ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load' ]).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store' ]).to_bytes().eval_and_sum(params) assert ld_bytes == 4 * n * m * l * 3 + 8 * n * m * 2 assert st_bytes == 4 * n * m * l + 8 * n * m # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') f32lall = reduced_map[lp.MemAccess( 'global', np.float32, direction='load')].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess( 'global', np.float64, direction='load')].eval_with_dict(params) assert f32lall == 3 * n * m * l assert f64lall == 2 * n * m op_map = lp.get_op_map(knl) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) op_map_dtype = op_map.group_by('dtype') f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params) i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params) assert f32 == n * m * l * 3 assert f64 == n * m assert i32 == n * m * 2 addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params) f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params) assert addsub_all == n * m * l + n * m * 2 assert f32ops_all == n * m * l * 3 non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params) assert non_field == 0 ops_nodtype = op_map.group_by('name') ops_noname = op_map.group_by('dtype') mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params) f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params) assert mul_all == n * m * l + n * m assert f64ops_all == n * m def func_filter(key): return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \ key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) assert s1f64l == 2 * n * m
def test_all_counters_parallel_matmul():

    bsize = 16
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
            "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"],
                          default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"],
                          default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    group_size = bsize*bsize
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    f32mul = op_map[
                    lp.Op(np.float32, 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    f32add = op_map[
                    lp.Op(np.float32, 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32ops = op_map[
                    lp.Op(np.int32, 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32ops += op_map[
                    lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert f32mul+f32add == m*2*n_subgroups

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={0: 1, 1: Variable('ell')},
            gid_strides={1: bsize},
            direction='load', variable='b',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={0: 1, 1: Variable('m')},
            gid_strides={0: Variable('m')*bsize},
            direction='load', variable='a',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)

    assert f32s1lb == n*m*ell/bsize
    assert f32s1la == n*m*ell/bsize

    f32coal = mem_access_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={0: 1, 1: Variable('ell')},
            gid_strides={0: Variable('ell')*bsize, 1: bsize},
            direction='store', variable='c',
            count_granularity=CG.WORKITEM)
            ].eval_with_dict(params)

    assert f32coal == n*ell

    local_mem_map = lp.get_mem_access_map(knl,
                                          count_redundant_work=True,
                                          subgroup_size=SGS
                                          ).filter_by(mtype=['local'])
    local_mem_l = local_mem_map.filter_by(direction=['load']
                                          ).eval_and_sum(params)
    # (count-per-sub-group)*n_subgroups
    assert local_mem_l == m*2*n_subgroups

    local_mem_l_a = local_mem_map[lp.MemAccess(
            'local', np.dtype(np.float32),
            direction='load',
            lid_strides={1: 16}, gid_strides={},
            variable='a_fetch',
            count_granularity=CG.SUBGROUP)
            ].eval_with_dict(params)
    local_mem_l_b = local_mem_map[lp.MemAccess(
            'local', np.dtype(np.float32),
            direction='load',
            lid_strides={0: 1}, gid_strides={},
            variable='b_fetch',
            count_granularity=CG.SUBGROUP)
            ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_l_a == local_mem_l_b == m*n_subgroups

    local_mem_s = local_mem_map.filter_by(direction=['store']
                                          ).eval_and_sum(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_s == m*2/bsize*n_subgroups
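# Sync-count check (informal; the exact barrier placement is up to loopy's
# scheduler): k is split by bsize, so the prefetch/compute loop runs
# m/bsize = 256/16 = 16 times per workitem, and each iteration requires a
# local barrier for each of the two prefetches (a_fetch and b_fetch),
# giving the asserted 2*m/bsize = 32 "barrier_local" events.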
def test_mem_access_tagged_variables():

    bsize = 16
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            ["c$mmresult[i, j] = sum(k, a$mmaload[i, k]*b$mmbload[k, j])"],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    # knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"],
    #                       default_tag="l.auto")
    # knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"],
    #                       default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    group_size = bsize * bsize
    n_workgroups = div_ceil(n, bsize) * div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={0: 1},
            gid_strides={1: bsize},
            direction='load', variable='b',
            variable_tag='mmbload',
            count_granularity=CG.WORKITEM)].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={1: Variable('m')},
            gid_strides={0: Variable('m') * bsize},
            direction='load', variable='a',
            variable_tag='mmaload',
            count_granularity=CG.SUBGROUP)].eval_with_dict(params)

    assert f32s1lb == n * m * ell

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s1la == m * n_subgroups

    f32coal = mem_access_map[lp.MemAccess(
            'global', np.float32,
            lid_strides={0: 1, 1: Variable('ell')},
            gid_strides={0: Variable('ell') * bsize, 1: bsize},
            direction='store', variable='c',
            variable_tag='mmresult',
            count_granularity=CG.WORKITEM)].eval_with_dict(params)

    assert f32coal == n * ell
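# Note on the tagged instruction above (informal): the name$tag syntax
# (c$mmresult, a$mmaload, b$mmbload) attaches a tag to each array
# reference, which MemAccess records as variable_tag, so distinct uses of
# the same array can be queried separately. The a-load carries no
# lid_strides entry for local axis 0 (it is uniform across that axis),
# which is why it is counted per SUBGROUP, while b and c vary across
# workitems and are counted per WORKITEM.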