def test_reg_counter_reduction():
    """Check the register estimate for a serial matmul-style reduction."""
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
        "c[i, j] = sum(k, a[i, k]*b[k, j])"
    ]
    kernel = lp.make_kernel(
        domain,
        instructions,
        name="matmul_serial",
        assumptions="n,m,l >= 1")

    # Both operands are declared single precision; the rest is inferred.
    operand_dtypes = {"a": np.float32, "b": np.float32}
    kernel = lp.add_and_infer_dtypes(kernel, operand_dtypes)

    reg_estimate = estimate_regs_per_thread(kernel)
    assert reg_estimate == 6
def test_reg_counter_logic():
    """Check the register estimate for a kernel built around ``if(...)``."""
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
        """
        e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
        """
    ]
    kernel = lp.make_kernel(
        domain,
        instructions,
        name="logic",
        assumptions="n,m,l >= 1")

    # Mixed precision inputs: g is float32, h is float64.
    operand_dtypes = {"g": np.float32, "h": np.float64}
    kernel = lp.add_and_infer_dtypes(kernel, operand_dtypes)

    reg_estimate = estimate_regs_per_thread(kernel)
    assert reg_estimate == 6
def test_reg_counter_specialops():
    """Check the register estimate for modulo and power expressions."""
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
        """
        c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
        e[i, k] = (1+g[i,k])**(1+h[i,k+1])
        """
    ]
    kernel = lp.make_kernel(
        domain,
        instructions,
        name="specialops",
        assumptions="n,m,l >= 1")

    # a/b feed the modulo statement (float32); g/h feed the power (float64).
    operand_dtypes = {
        "a": np.float32,
        "b": np.float32,
        "g": np.float64,
        "h": np.float64,
    }
    kernel = lp.add_and_infer_dtypes(kernel, operand_dtypes)

    reg_estimate = estimate_regs_per_thread(kernel)
    assert reg_estimate == 6
def test_reg_counter_basic():
    """Check the register estimate for two plain arithmetic statements."""
    domain = "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
        """
        c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
        e[i, k+1] = g[i,k]*h[i,k+1]
        """
    ]
    kernel = lp.make_kernel(
        domain,
        instructions,
        name="basic",
        assumptions="n,m,l >= 1")

    # First statement works on float32 data, the second on float64 data.
    operand_dtypes = {
        "a": np.float32,
        "b": np.float32,
        "g": np.float64,
        "h": np.float64,
    }
    kernel = lp.add_and_infer_dtypes(kernel, operand_dtypes)

    reg_estimate = estimate_regs_per_thread(kernel)
    assert reg_estimate == 6
def test_reg_counter_bitwise():
    """Check the register estimate for bitwise and shift expressions."""
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
        """
        c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
        e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
        """
    ]
    kernel = lp.make_kernel(
        domain,
        instructions,
        name="bitwise",
        assumptions="n,m,l >= 1")

    # Integer inputs only: a/b are 32-bit, g/h are 64-bit.
    operand_dtypes = {
        "a": np.int32,
        "b": np.int32,
        "g": np.int64,
        "h": np.int64,
    }
    kernel = lp.add_and_infer_dtypes(kernel, operand_dtypes)

    reg_estimate = estimate_regs_per_thread(kernel)
    assert reg_estimate == 6
def run_fd_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config):
    """Time a 2-D finite-difference kernel and collect model data for it.

    For every problem size in *nvals* and every block shape in *configs_t*
    the kernel is tiled and prefetched, its barrier/op/DRAM-access counts
    are evaluated, it is run ``averaging_trials + warmup_trials`` times, and
    the mean run time (warm-up runs excluded) is recorded next to the
    ``PerfModel`` prediction.  The collected rows are finally handed to
    ``update_lstsq_mats``.

    Args:
        ctx: Not used by this function; kept so all ``run_*_trials``
            functions share one signature.
        queue: Command queue used to create the input array and to launch
            the kernel.  ``evt.profile`` is read, so presumably the queue
            must have profiling enabled -- TODO confirm against caller.
        nvals: Iterable of problem sizes ``n``.
        configs_t: Iterable of ``(BSIZEx, BSIZEy)`` block-size pairs.
        Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: Forwarded unchanged to
            ``update_lstsq_mats``.

    Returns:
        None.  Results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    for n in nvals:
        # The stencil reads u[i+2, j+1] etc., so the input is (n+2) x (n+2).
        u_mat_dev = cl.clrandom.rand(queue, (n+2, n+2), dtype=dtype)
        knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<n}",
            "result[i,j] = u[i, j]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]",
            name="finite_diff")
        knl = lp.add_and_infer_dtypes(knl, {"u": dtype})
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            # Start every configuration from the untransformed kernel.
            knl = lp.split_iname(ref_knl, "i", BSIZEx,
                                 outer_tag="g.1", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", BSIZEy,
                                 outer_tag="g.0", inner_tag="l.0")
            knl = lp.add_prefetch(knl, "u", ["i_inner", "j_inner"],
                                  fetch_bounding_box=True)

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            amd_total = sum(amd_op32)
            other_op32 = get_32b_ops_all(op_map2, params) - amd_total
            # Sanity check that both op-counting paths agree.
            if flops + iops != amd_total + other_op32:  # TODO remove after debug
                print("<debug> PROBLEM!, ops don't add up: ",
                      flops, iops, amd_total, other_op32)

            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute and time the kernel.
            print("running kernel...")
            trial_times = []
            for _trial in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, u=u_mat_dev)
                evt.wait()
                # Profiling difference scaled by 1e-9 (presumably ns -> s).
                trial_times.append(
                    (evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # Formerly hard-coded (14 if both block sizes divide n, else 15;
            # a note said 16 for k20) and then overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            shared_mem_per_block = 4*(BSIZEx+2)*(BSIZEy+2)
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n),
                                 f32coal/(n*n), barrier_ct,
                                 reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # Presumably sm_clock_freq is in GHz, giving seconds here.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so that appending does not modify amd_op32 itself.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s,
                              f32uncoal_l, f32uncoal_s,
                              barrier_ct, total_blocks, n*n,
                              np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_empt_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Time an empty kernel and collect model data for it.

    For every problem size in *nvals* and every block shape in *configs_t*
    the empty kernel is run ``averaging_trials + warmup_trials`` times and
    the mean run time (warm-up runs excluded) is recorded next to the
    ``PerfModel`` prediction.  Op and DRAM-access maps are empty dicts
    here because the real queries are disabled (see TODOs below).  The
    collected rows are finally handed to ``update_lstsq_mats``.

    Args:
        ctx: Not used by this function; kept so all ``run_*_trials``
            functions share one signature.
        queue: Command queue used to launch the kernel.  ``evt.profile`` is
            read, so presumably the queue must have profiling enabled --
            TODO confirm against caller.
        nvals: Iterable of problem sizes ``n``.
        configs_t: Iterable of ``(BSIZEx, BSIZEy)`` block-size pairs.  They
            only enter the thread/block counts given to the model; the
            kernel itself is not split by them.
        Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: Forwarded unchanged to
            ``update_lstsq_mats``.

    Returns:
        None.  Results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    for n in nvals:
        knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            [
                ""
            ],
            name="empty")

        for BSIZEx, BSIZEy in configs_t:
            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)

            # TODO figure out error: get_op_poly(knl) is disabled, so empty
            # op maps are used instead.
            op_map = {}
            op_map2 = {}
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            amd_total = sum(amd_op32)
            other_op32 = get_32b_ops_all(op_map2, params) - amd_total
            # Sanity check that both op-counting paths agree.
            if flops + iops != amd_total + other_op32:  # TODO remove after debug
                print("<debug> PROBLEM!, ops don't add up: ",
                      flops, iops, amd_total, other_op32)

            # TODO figure out error: get_DRAM_access_poly(knl) is disabled,
            # so an empty access map is used instead.
            sub_map = {}
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)

            # Execute and time the kernel.
            print("running kernel...")
            trial_times = []
            for _trial in range(averaging_trials+warmup_trials):
                evt, _out = knl(queue)
                evt.wait()
                # Profiling difference scaled by 1e-9 (presumably ns -> s).
                trial_times.append(
                    (evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # Formerly hard-coded to 2 and then overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            # TODO actually increase threads/blocks but expect 0 result
            kstats = KernelStats(0, 0, 0, barrier_ct,
                                 reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # Presumably sm_clock_freq is in GHz, giving seconds here.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so that appending does not modify amd_op32 itself.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s,
                              f32uncoal_l, f32uncoal_s,
                              barrier_ct, total_blocks, n*n,
                              np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_conv_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Time a multi-feature image convolution kernel and collect model data.

    For every image size in *nvals* and every block shape in *configs_t*
    the kernel is tiled and prefetched, its barrier/op/DRAM-access counts
    are evaluated, it is run ``averaging_trials + warmup_trials`` times, and
    the mean run time (warm-up runs excluded) is recorded next to the
    ``PerfModel`` prediction.  The collected rows are finally handed to
    ``update_lstsq_mats``.

    Args:
        ctx: Not used by this function; kept so all ``run_*_trials``
            functions share one signature.
        queue: Command queue used to create the input arrays and to launch
            the kernel.  ``evt.profile`` is read, so presumably the queue
            must have profiling enabled -- TODO confirm against caller.
        nvals: Iterable of image edge lengths ``n`` (used for both
            ``im_w`` and ``im_h``).
        configs_t: Iterable of ``(BSIZEx, BSIZEy)`` block-size pairs.
        Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: Forwarded unchanged to
            ``update_lstsq_mats``.

    Returns:
        None.  Results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    # Fixed problem constants; only the image size and block shape vary.
    ncolors = 3
    f_w = 3
    nfeats = 3
    nimgs = 3

    for n in nvals:
        knl = lp.make_kernel(
            "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and 0 <= im_x < im_w and 0 <= im_y < im_h \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
            """
            out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
                img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
            """,
            [
                lp.GlobalArg("f", dtype, shape=lp.auto),
                lp.GlobalArg("img", dtype, shape=lp.auto),
                lp.GlobalArg("out", dtype, shape=lp.auto),
                "..."
            ],
            assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 "
                        "and nimgs>=0",
            flags="annotate_inames",
            defines=dict(ncolors=ncolors),
            name="conv")
        knl = lp.fix_parameters(knl, f_w=f_w)
        ref_knl = knl

        im_w = n
        im_h = n
        # Inputs do not depend on the block shape, so create them once per
        # n (as the other run_*_trials functions do).  The domain uses
        # 0<=iimg<=nimgs, hence nimgs+1 images.
        f_dev = cl.clrandom.rand(
            queue, (nfeats, 2*f_w+1, 2*f_w+1, ncolors), dtype=dtype)
        img_dev = cl.clrandom.rand(
            queue, (nimgs+1, n+2*f_w, n+2*f_w, ncolors), dtype=dtype)
        params = dict(im_w=im_w, im_h=im_h, f_w=f_w,
                      nfeats=nfeats, nimgs=nimgs)

        for BSIZEx, BSIZEy in configs_t:
            # Start every configuration from the untransformed kernel.
            knl = lp.split_iname(ref_knl, "im_x", BSIZEx,
                                 outer_tag="g.0", inner_tag="l.0")
            knl = lp.split_iname(knl, "im_y", BSIZEy,
                                 outer_tag="g.1", inner_tag="l.1")
            knl = lp.tag_inames(knl, dict(ifeat="g.2"))
            knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
            knl = lp.add_prefetch(knl, "img",
                                  "im_x_inner, im_y_inner, f_x, f_y")

            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            amd_total = sum(amd_op32)
            other_op32 = get_32b_ops_all(op_map2, params) - amd_total
            # Sanity check that both op-counting paths agree.
            if flops + iops != amd_total + other_op32:  # TODO remove after debug
                print("<debug> PROBLEM!, ops don't add up: ",
                      flops, iops, amd_total, other_op32)
            # TODO why do blk sizes that don't fit perfectly increase total
            # flops/iops

            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute and time the kernel.
            print("running kernel...")
            trial_times = []
            for _trial in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, f=f_dev, img=img_dev,
                                   im_w=im_w, im_h=im_h,
                                   nfeats=nfeats, nimgs=nimgs)
                evt.wait()
                # Profiling difference scaled by 1e-9 (presumably ns -> s).
                trial_times.append(
                    (evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # Formerly hard-coded to 20 (note said: 20 for c2070, 33 for
            # k20) and then overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            filter_elems = ncolors * (f_w*2+1) * (f_w*2+1)
            tile_elems = (BSIZEx+f_w*2) * (BSIZEy+f_w*2)
            shared_mem_per_block = \
                (filter_elems + tile_elems) * np.dtype(dtype).itemsize
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n),
                                 f32coal/(n*n), barrier_ct,
                                 reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # Presumably sm_clock_freq is in GHz, giving seconds here.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so that appending does not modify amd_op32 itself.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            # TODO try the total thread count
            # (total_blocks*BSIZEx*BSIZEy) in place of n*n
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s,
                              f32uncoal_l, f32uncoal_s,
                              barrier_ct, total_blocks, n*n,
                              np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_tp_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config,
                  prefetch=True):
    """Time a matrix-transpose kernel and collect model data for it.

    For every matrix size in *nvals* and every block shape in *configs_t*
    the kernel is tiled (and optionally prefetched), its barrier/op/DRAM-
    access counts are evaluated, it is run
    ``averaging_trials + warmup_trials`` times, and the mean run time
    (warm-up runs excluded) is recorded next to the ``PerfModel``
    prediction.  The collected rows are finally handed to
    ``update_lstsq_mats``.

    Args:
        ctx: Not used by this function; kept so all ``run_*_trials``
            functions share one signature.
        queue: Command queue used to create the matrices and to launch the
            kernel.  ``evt.profile`` is read, so presumably the queue must
            have profiling enabled -- TODO confirm against caller.
        nvals: Iterable of matrix edge lengths ``n``.
        configs_t: Iterable of ``(BSIZEx, BSIZEy)`` block-size pairs.
        Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: Forwarded unchanged to
            ``update_lstsq_mats``.
        prefetch: When true, ``a`` is prefetched over the inner inames and
            shared memory is accounted for in the model; otherwise the
            shared-memory figure is 0.

    Returns:
        None.  Results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    order = "C"

    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            [
                "b[i, j] = a[j, i]"
            ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
            ],
            name="transpose")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            # Start every configuration from the untransformed kernel.
            knl = lp.split_iname(ref_knl, "i", BSIZEy,
                                 outer_tag="g.0", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", BSIZEx,
                                 outer_tag="g.1", inner_tag="l.0")
            if prefetch:
                knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            amd_total = sum(amd_op32)
            other_op32 = get_32b_ops_all(op_map2, params) - amd_total
            # Sanity check that both op-counting paths agree.
            if flops + iops != amd_total + other_op32:  # TODO remove after debug
                print("<debug> PROBLEM!, ops don't add up: ",
                      flops, iops, amd_total, other_op32)

            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute and time the kernel.
            print("running kernel...")
            trial_times = []
            for _trial in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, a=a_mat_dev, b=b_mat_dev)
                evt.wait()
                # Profiling difference scaled by 1e-9 (presumably ns -> s).
                trial_times.append(
                    (evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # Formerly hard-coded to 8 (for c2070; a disabled k20 table
            # listed 8-10 depending on divisibility and prefetch) and then
            # overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            shared_mem_per_block = 4*BSIZEx*BSIZEy if prefetch else 0

            # TODO why is HK way off on the non-prefetch version?
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n),
                                 f32coal/(n*n), barrier_ct,
                                 reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # Presumably sm_clock_freq is in GHz, giving seconds here.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so that appending does not modify amd_op32 itself.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s,
                              f32uncoal_l, f32uncoal_s,
                              barrier_ct, total_blocks, n*n,
                              np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_axpy_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Time an axpy-style vector kernel and collect model data for it.

    For every vector length in *nvals* and every block shape in
    *configs_t* the kernel ``z[i] = 5.0*x[i]+7.0*y[i]`` is split with a
    4-way unroll, its barrier/op/DRAM-access counts are evaluated, it is
    run ``averaging_trials + warmup_trials`` times, and the mean run time
    (warm-up runs excluded) is recorded next to the ``PerfModel``
    prediction.  The collected rows are finally handed to
    ``update_lstsq_mats``.

    Args:
        ctx: Not used by this function; kept so all ``run_*_trials``
            functions share one signature.
        queue: Command queue used to create the vectors and to launch the
            kernel.  ``evt.profile`` is read, so presumably the queue must
            have profiling enabled -- TODO confirm against caller.
        nvals: Iterable of vector lengths ``n``.
        configs_t: Iterable of ``(BSIZEx, BSIZEy)`` block-size pairs.  Only
            ``BSIZEx`` is used to split the kernel; ``BSIZEy`` enters only
            the thread count given to ``ThreadConfig``.
        Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: Forwarded unchanged to
            ``update_lstsq_mats``.

    Returns:
        None.  Results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    unroll = 4

    # TODO figure out smem usage issue
    for n in nvals:
        x_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        y_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        z_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<%d}" % n,
            [
                "z[i] = 5.0*x[i]+7.0*y[i]"
            ],
            [
                lp.GlobalArg("x", dtype, shape=n),
                lp.GlobalArg("y", dtype, shape=n),
                lp.GlobalArg("z", dtype, shape=n),
            ],
            name="axpy")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            # Start every configuration from the untransformed kernel.
            knl = lp.split_iname(ref_knl, "i", unroll*BSIZEx,
                                 outer_tag="g.0", slabs=(0, 1))
            knl = lp.split_iname(knl, "i_inner", BSIZEx,
                                 outer_tag="unr", inner_tag="l.0")

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            amd_total = sum(amd_op32)
            other_op32 = get_32b_ops_all(op_map2, params) - amd_total
            # Sanity check that both op-counting paths agree.
            if flops + iops != amd_total + other_op32:  # TODO remove after debug
                print("<debug> PROBLEM!, ops don't add up: ",
                      flops, iops, amd_total, other_op32)

            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute and time the kernel.
            print("running kernel...")
            trial_times = []
            for _trial in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, x=x_vec_dev, y=y_vec_dev,
                                   z=z_vec_dev)
                evt.wait()
                # Profiling difference scaled by 1e-9 (presumably ns -> s).
                trial_times.append(
                    (evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # Formerly hard-coded to 18 (note said: 18 for c2070, 20 for
            # k20) and then overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/(BSIZEx*unroll))
            kstats = KernelStats(flops*unroll/n, f32uncoal*unroll/n,
                                 f32coal*unroll/n, barrier_ct,
                                 reg32_per_thread, shared_mem_per_block)
            # NOTE(review): the kernel is only split by BSIZEx, yet the
            # thread count passed here is BSIZEx*BSIZEy -- confirm the
            # configs used for axpy make this the intended value.
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # Presumably sm_clock_freq is in GHz, giving seconds here.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so that appending does not modify amd_op32 itself.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s,
                              f32uncoal_l, f32uncoal_s,
                              barrier_ct, total_blocks, n/unroll,
                              np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_mm_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config,
                  version):
    """Time a tiled matrix-multiply kernel and collect model data for it.

    For every matrix size in *nvals* and every block shape in *configs_t*
    the kernel is tiled according to *version*, prefetched, its
    barrier/op/DRAM-access counts are evaluated, it is run
    ``averaging_trials + warmup_trials`` times, and the mean run time
    (warm-up runs excluded) is recorded next to the ``PerfModel``
    prediction.  The collected rows are finally handed to
    ``update_lstsq_mats``.

    Args:
        ctx: Not used by this function; kept so all ``run_*_trials``
            functions share one signature.
        queue: Command queue used to create the matrices and to launch the
            kernel.  ``evt.profile`` is read, so presumably the queue must
            have profiling enabled -- TODO confirm against caller.
        nvals: Iterable of matrix edge lengths ``n``.
        configs_t: Iterable of ``(BSIZEx, BSIZEy)`` block-size pairs.
        Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: Forwarded unchanged to
            ``update_lstsq_mats``.
        version: ``"allcoal"`` or ``"partcoal"``; selects which local axis
            (``l.0``/``l.1``) the inner ``i`` and ``j`` inames are mapped to.

    Returns:
        None.  Results are delivered through ``update_lstsq_mats``.

    Raises:
        ValueError: If *version* is neither ``"allcoal"`` nor
            ``"partcoal"`` (raised when the first configuration is
            processed).
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    order = "C"

    # TODO figure out smem usage issue
    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        c_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
            ],
            name="matmul")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            # Start every configuration from the untransformed kernel.
            knl = ref_knl
            if version == "allcoal":
                knl = lp.split_iname(knl, "i", BSIZEy,
                                     outer_tag="g.0", inner_tag="l.1")
                knl = lp.split_iname(knl, "j", BSIZEx,
                                     outer_tag="g.1", inner_tag="l.0")
            elif version == "partcoal":
                knl = lp.split_iname(knl, "i", BSIZEy,
                                     outer_tag="g.0", inner_tag="l.0")
                knl = lp.split_iname(knl, "j", BSIZEx,
                                     outer_tag="g.1", inner_tag="l.1")
            else:
                # Replaces the former "1/0" placeholder with a real error.
                raise ValueError(
                    "unknown matmul version %r; expected 'allcoal' or "
                    "'partcoal'" % (version,))

            ksplit = BSIZEy
            knl = lp.split_iname(knl, "k", ksplit)
            knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
            knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            amd_total = sum(amd_op32)
            other_op32 = get_32b_ops_all(op_map2, params) - amd_total
            # Sanity check that both op-counting paths agree.
            if flops + iops != amd_total + other_op32:  # TODO remove after debug
                print("<debug> PROBLEM!, ops don't add up: ",
                      flops, iops, amd_total, other_op32)

            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute and time the kernel.
            print("running kernel...")
            trial_times = []
            for _trial in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, a=a_mat_dev, b=b_mat_dev,
                                   c=c_mat_dev)
                evt.wait()
                # Profiling difference scaled by 1e-9 (presumably ns -> s).
                trial_times.append(
                    (evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # Formerly hard-coded per BSIZEx for the C2070 (8 or 16 -> 20,
            # 32 -> 19, 24 -> 12) and then overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            shared_mem_per_block = 4*ksplit*(BSIZEx+BSIZEy)
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n),
                                 f32coal/(n*n), barrier_ct,
                                 reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # Presumably sm_clock_freq is in GHz, giving seconds here.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so that appending does not modify amd_op32 itself.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s,
                              f32uncoal_l, f32uncoal_s,
                              barrier_ct, total_blocks, n*n,
                              np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)