コード例 #1
0
def test_reg_counter_reduction():
    """Check the register estimate for a serial matmul with a ``sum`` over k."""
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = ["c[i, j] = sum(k, a[i, k]*b[k, j])"]

    matmul = lp.make_kernel(
        domain,
        instructions,
        name="matmul_serial",
        assumptions="n,m,l >= 1",
    )
    # Both inputs are single precision; the remaining dtypes are inferred.
    matmul = lp.add_and_infer_dtypes(
        matmul, {"a": np.float32, "b": np.float32})

    assert estimate_regs_per_thread(matmul) == 6
コード例 #2
0
def test_reg_counter_logic():
    """Check the register estimate for a kernel built around an ``if`` with
    a compound boolean condition."""
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ]

    logic_knl = lp.make_kernel(
        domain,
        instructions,
        name="logic",
        assumptions="n,m,l >= 1",
    )
    # Mixed precision on purpose: g is float32, h is float64.
    logic_knl = lp.add_and_infer_dtypes(
        logic_knl, {"g": np.float32, "h": np.float64})

    assert estimate_regs_per_thread(logic_knl) == 6
コード例 #3
0
def test_reg_counter_specialops():
    """Check the register estimate for a kernel using ``%`` and ``**``."""
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ]
    input_dtypes = {
        "a": np.float32,
        "b": np.float32,
        "g": np.float64,
        "h": np.float64,
    }

    special_knl = lp.make_kernel(
        domain,
        instructions,
        name="specialops",
        assumptions="n,m,l >= 1",
    )
    special_knl = lp.add_and_infer_dtypes(special_knl, input_dtypes)

    assert estimate_regs_per_thread(special_knl) == 6
コード例 #4
0
def test_reg_counter_basic():
    """Check the register estimate for a kernel of plain ``*``, ``/``, ``+``."""
    domain = "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = g[i,k]*h[i,k+1]
                """
            ]
    input_dtypes = {
        "a": np.float32,
        "b": np.float32,
        "g": np.float64,
        "h": np.float64,
    }

    basic_knl = lp.make_kernel(
        domain,
        instructions,
        name="basic",
        assumptions="n,m,l >= 1",
    )
    basic_knl = lp.add_and_infer_dtypes(basic_knl, input_dtypes)

    assert estimate_regs_per_thread(basic_knl) == 6
コード例 #5
0
def test_reg_counter_bitwise():
    """Check the register estimate for a kernel of integer bitwise/shift ops."""
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ]
    # a/b are 32-bit integers, g/h are 64-bit integers.
    input_dtypes = {
        "a": np.int32,
        "b": np.int32,
        "g": np.int64,
        "h": np.int64,
    }

    bitwise_knl = lp.make_kernel(
        domain,
        instructions,
        name="bitwise",
        assumptions="n,m,l >= 1",
    )
    bitwise_knl = lp.add_and_infer_dtypes(bitwise_knl, input_dtypes)

    assert estimate_regs_per_thread(bitwise_knl) == 6
コード例 #6
0
def run_fd_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config):
    """Benchmark the "finite_diff" stencil kernel and collect model data.

    For each problem size in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t*, the kernel is split into work-groups, ``u`` is prefetched,
    barrier/operation/DRAM-access counts are evaluated at ``n``, and the
    kernel is run ``averaging_trials + warmup_trials`` times (both names are
    defined outside this function).  The post-warm-up mean of the profiled
    times, the ``PerfModel`` prediction and one least-squares row are
    recorded.  The accumulated data is finally passed to
    ``update_lstsq_mats``.

    :arg ctx: not used by this function.
    :arg queue: queue used to create the random input and to launch the
        kernel; ``evt.profile`` is read, so it presumably must have profiling
        enabled -- TODO confirm against the caller.
    :arg nvals: sequence of problem sizes ``n``.
    :arg configs_t: sequence of ``(BSIZEx, BSIZEy)`` block sizes; iterated
        once per entry of *nvals*.
    :arg Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: forwarded unchanged to
        ``update_lstsq_mats``.
    :returns: *None*; results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    for n in nvals:
        # The stencil reads u at indices up to i+2 / j+2, hence (n+2) x (n+2).
        u_mat_dev = cl.clrandom.rand(queue, (n+2, n+2), dtype=dtype)
        knl = lp.make_kernel(
              "{[i,j]: 0<=i,j<n}",
              "result[i,j] = u[i, j]**2 + -1 + (-4)*u[i + 1, j + 1] \
                    + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                    + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]",
              name="finite_diff")
        knl = lp.add_and_infer_dtypes(knl, {"u": dtype})
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            # Every configuration starts from the untransformed kernel.
            knl = ref_knl
            knl = lp.split_iname(knl,
                    "i", BSIZEx, outer_tag="g.1", inner_tag="l.1")
            knl = lp.split_iname(knl,
                    "j", BSIZEy, outer_tag="g.0", inner_tag="l.0")
            knl = lp.add_prefetch(knl, "u",
                    ["i_inner", "j_inner"],
                    fetch_bounding_box=True)

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            other_op32 = get_32b_ops_all(op_map2, params) - sum(amd_op32)
            # TODO remove after debug: cross-check the two op counters.
            if flops + iops != sum(amd_op32) + other_op32:
                print("<debug> PROBLEM!, ops don't add up: ",
                        flops, iops, sum(amd_op32), other_op32)
            sub_map = get_DRAM_access_poly(knl)
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute; the first warmup_trials timings are discarded below.
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, u=u_mat_dev)
                evt.wait()
                # Scaled by 1e-9 (presumably ns -> s; TODO confirm).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # Earlier revisions hard-coded 14 registers when both block sizes
            # divide n and 15 otherwise (16 noted for k20); those values were
            # always overwritten by this estimate, so only the estimate is kept.
            reg32_per_thread = estimate_regs_per_thread(knl)

            # 4 bytes per float32 over a (BSIZEx+2) x (BSIZEy+2) tile --
            # presumably the prefetch footprint of u; TODO confirm.
            shared_mem_per_block = 4*(BSIZEx+2)*(BSIZEy+2)
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            # Counts are normalised by the n*n points of the iteration domain.
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # cycles / (clock * 1e9): presumably sm_clock_freq is in GHz.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so the appended entry cannot alias amd_op32.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
コード例 #7
0
def run_empt_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Benchmark an empty kernel and collect model data.

    For each problem size in *nvals* a kernel named "empty" with a single
    empty instruction is built.  For each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t* the kernel is run ``averaging_trials + warmup_trials`` times
    (both names are defined outside this function).  The post-warm-up mean
    of the profiled times, the ``PerfModel`` prediction and one least-squares
    row are recorded.  Operation and DRAM-access maps are empty dicts here
    because the real counters are disabled (see the TODOs below).  The
    accumulated data is finally passed to ``update_lstsq_mats``.

    Note that the kernel itself is not transformed by the block sizes; they
    only enter ``total_blocks`` and ``ThreadConfig``.

    :arg ctx: not used by this function.
    :arg queue: queue used to launch the kernel; ``evt.profile`` is read, so
        it presumably must have profiling enabled -- TODO confirm.
    :arg nvals: sequence of problem sizes ``n``.
    :arg configs_t: sequence of ``(BSIZEx, BSIZEy)`` block sizes; iterated
        once per entry of *nvals*.
    :arg Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: forwarded unchanged to
        ``update_lstsq_mats``.
    :returns: *None*; results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    for n in nvals:
        knl = lp.make_kernel(
                "{[i,j]: 0<=i,j<%d}" % n,
                [
                    ""
                ],
                name="empty")

        for BSIZEx, BSIZEy in configs_t:
            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            # TODO figure out error: get_op_poly(knl) is disabled for this
            # kernel, so both op maps are left empty.
            op_map = {}
            op_map2 = {}
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            other_op32 = get_32b_ops_all(op_map2, params) - sum(amd_op32)
            # TODO remove after debug: cross-check the two op counters.
            if flops + iops != sum(amd_op32) + other_op32:
                print("<debug> PROBLEM!, ops don't add up: ",
                        flops, iops, sum(amd_op32), other_op32)
            # TODO figure out error: get_DRAM_access_poly(knl) is disabled
            # for this kernel as well.
            sub_map = {}
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)

            # Execute; the first warmup_trials timings are discarded below.
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, _out = knl(queue)
                evt.wait()
                # Scaled by 1e-9 (presumably ns -> s; TODO confirm).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # A hard-coded value of 2 used to be assigned first; it was always
            # overwritten by this estimate, so only the estimate is kept.
            reg32_per_thread = estimate_regs_per_thread(knl)
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            # TODO actually increase threads/blocks but expect 0 result
            kstats = KernelStats(0, 0, 0, barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # cycles / (clock * 1e9): presumably sm_clock_freq is in GHz.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so the appended entry cannot alias amd_op32.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
コード例 #8
0
def run_conv_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Benchmark the "conv" image-convolution kernel and collect model data.

    For each image size in *nvals* (used as both ``im_w`` and ``im_h``) and
    each ``(BSIZEx, BSIZEy)`` pair in *configs_t*, the kernel is tiled over
    ``im_x``/``im_y``, ``ifeat`` is mapped to the third group axis and
    ``f``/``img`` are prefetched.  Barrier/operation/DRAM-access counts are
    then evaluated and the kernel is run
    ``averaging_trials + warmup_trials`` times (both names are defined
    outside this function).  The post-warm-up mean of the profiled times,
    the ``PerfModel`` prediction and one least-squares row are recorded.
    The accumulated data is finally passed to ``update_lstsq_mats``.

    :arg ctx: not used by this function.
    :arg queue: queue used to create the random inputs and to launch the
        kernel; ``evt.profile`` is read, so it presumably must have profiling
        enabled -- TODO confirm.
    :arg nvals: sequence of image edge lengths ``n``.
    :arg configs_t: sequence of ``(BSIZEx, BSIZEy)`` block sizes; iterated
        once per entry of *nvals*.
    :arg Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: forwarded unchanged to
        ``update_lstsq_mats``.
    :returns: *None*; results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    ncolors = 3
    # Fixed problem parameters (previously re-assigned for every config).
    f_w = 3
    nfeats = 3
    nimgs = 3
    for n in nvals:
        knl = lp.make_kernel(
            "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and 0 <= im_x < im_w and 0 <= im_y < im_h \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
             }",
            """
            out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
                img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
            """,
            [
                lp.GlobalArg("f", dtype, shape=lp.auto),
                lp.GlobalArg("img", dtype, shape=lp.auto),
                lp.GlobalArg("out", dtype, shape=lp.auto),
                "..."
            ],
            assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
            flags="annotate_inames",
            defines=dict(ncolors=ncolors),
            name="conv")

        knl = lp.fix_parameters(knl, f_w=f_w)
        ref_knl = knl
        im_w = n
        im_h = n

        for BSIZEx, BSIZEy in configs_t:
            # Every configuration starts from the untransformed kernel.
            knl = ref_knl
            f_dev = cl.clrandom.rand(queue, (nfeats, 2*f_w+1, 2*f_w+1, ncolors),
                                     dtype=dtype)
            # The domain uses 0<=iimg<=nimgs (inclusive), hence nimgs+1 images.
            img_dev = cl.clrandom.rand(queue, (nimgs+1, n+2*f_w, n+2*f_w, ncolors),
                                       dtype=dtype)

            knl = lp.split_iname(knl, "im_x", BSIZEx,
                                 outer_tag="g.0", inner_tag="l.0")
            knl = lp.split_iname(knl, "im_y", BSIZEy,
                                 outer_tag="g.1", inner_tag="l.1")
            knl = lp.tag_inames(knl, dict(ifeat="g.2"))
            knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
            knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")

            params = dict(im_w=im_w, im_h=im_h, f_w=f_w, nfeats=nfeats, nimgs=nimgs)

            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            other_op32 = get_32b_ops_all(op_map2, params) - sum(amd_op32)
            # TODO remove after debug: cross-check the two op counters.
            if flops + iops != sum(amd_op32) + other_op32:
                print("<debug> PROBLEM!, ops don't add up: ",
                        flops, iops, sum(amd_op32), other_op32)
            # TODO why do blk sizes that don't fit perfectly increase total
            # flops/iops
            sub_map = get_DRAM_access_poly(knl)
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute; the first warmup_trials timings are discarded below.
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, f=f_dev, img=img_dev, im_w=im_w, im_h=im_h,
                                   nfeats=nfeats, nimgs=nimgs)
                evt.wait()
                # Scaled by 1e-9 (presumably ns -> s; TODO confirm).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # A hard-coded value of 20 (c2070; 33 noted for k20) used to be
            # assigned first; it was always overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            # Filter slice plus padded image tile, in bytes -- presumably the
            # footprint of the two prefetches; TODO confirm.
            shared_mem_per_block = (ncolors * (f_w*2+1) * (f_w*2+1) +
                                    (BSIZEx+f_w*2) * (BSIZEy+f_w*2)
                                    ) * np.dtype(dtype).itemsize
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            # Counts are normalised by the n*n image points.
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # cycles / (clock * 1e9): presumably sm_clock_freq is in GHz.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so the appended entry cannot alias amd_op32.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            # TODO try total_threads (total_blocks*BSIZEx*BSIZEy) for n*n
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
コード例 #9
0
def run_tp_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
                  HK_predict_all, train_test_config, prefetch=True):
    """Benchmark the "transpose" kernel and collect model data.

    For each matrix size in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t*, the kernel ``b[i, j] = a[j, i]`` is tiled (optionally with a
    prefetch of ``a``) and its barrier/operation/DRAM-access counts are
    evaluated.  It is then run ``averaging_trials + warmup_trials`` times
    (both names are defined outside this function).  The post-warm-up mean
    of the profiled times, the ``PerfModel`` prediction and one least-squares
    row are recorded.  The accumulated data is finally passed to
    ``update_lstsq_mats``.

    :arg ctx: not used by this function.
    :arg queue: queue used to create the random inputs and to launch the
        kernel; ``evt.profile`` is read, so it presumably must have profiling
        enabled -- TODO confirm.
    :arg nvals: sequence of matrix edge lengths ``n``.
    :arg configs_t: sequence of ``(BSIZEx, BSIZEy)`` block sizes; iterated
        once per entry of *nvals*.
    :arg Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: forwarded unchanged to
        ``update_lstsq_mats``.
    :arg prefetch: when true, ``a`` is prefetched over the inner inames and
        ``4*BSIZEx*BSIZEy`` bytes of shared memory are reported to the
        model; otherwise no prefetch is added and 0 bytes are reported.
    :returns: *None*; results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        order = "C"
        knl = lp.make_kernel(
                "{[i,j]: 0<=i,j<%d}" % n,
                [
                    "b[i, j] = a[j, i]"
                ], [
                    lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                ],
                name="transpose")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            # Every configuration starts from the untransformed kernel.
            knl = ref_knl
            knl = lp.split_iname(knl, "i", BSIZEy, outer_tag="g.0", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", BSIZEx, outer_tag="g.1", inner_tag="l.0")
            if prefetch:
                knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            other_op32 = get_32b_ops_all(op_map2, params) - sum(amd_op32)
            # TODO remove after debug: cross-check the two op counters.
            if flops + iops != sum(amd_op32) + other_op32:
                print("<debug> PROBLEM!, ops don't add up: ",
                        flops, iops, sum(amd_op32), other_op32)
            sub_map = get_DRAM_access_poly(knl)
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute; the first warmup_trials timings are discarded below.
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, a=a_mat_dev, b=b_mat_dev)
                evt.wait()
                # Scaled by 1e-9 (presumably ns -> s; TODO confirm).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # A hard-coded value of 8 (c2070; 8-10 noted for k20 depending on
            # prefetch and divisibility) used to be assigned first; it was
            # always overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            if prefetch:
                # 4 bytes per float32 over a BSIZEx x BSIZEy tile.
                shared_mem_per_block = 4*BSIZEx*BSIZEy
            else:
                shared_mem_per_block = 0
            # TODO why is HK way off on the non-prefetch version?
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            # Counts are normalised by the n*n matrix entries.
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # cycles / (clock * 1e9): presumably sm_clock_freq is in GHz.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so the appended entry cannot alias amd_op32.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
コード例 #10
0
def run_axpy_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Benchmark the "axpy" kernel and collect model data.

    For each vector length in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t*, the kernel ``z[i] = 5.0*x[i]+7.0*y[i]`` is split into groups
    of ``unroll*BSIZEx`` elements with the inner part unrolled ``unroll``
    times.  Barrier/operation/DRAM-access counts are evaluated and the
    kernel is run ``averaging_trials + warmup_trials`` times (both names are
    defined outside this function).  The post-warm-up mean of the profiled
    times, the ``PerfModel`` prediction and one least-squares row are
    recorded.  The accumulated data is finally passed to
    ``update_lstsq_mats``.

    :arg ctx: not used by this function.
    :arg queue: queue used to create the random inputs and to launch the
        kernel; ``evt.profile`` is read, so it presumably must have profiling
        enabled -- TODO confirm.
    :arg nvals: sequence of vector lengths ``n``.
    :arg configs_t: sequence of ``(BSIZEx, BSIZEy)`` block sizes; iterated
        once per entry of *nvals*.  ``BSIZEy`` does not affect the kernel
        transformation, only the thread count given to ``ThreadConfig``.
    :arg Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: forwarded unchanged to
        ``update_lstsq_mats``.
    :returns: *None*; results are delivered through ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    # Elements handled per work-item (previously re-assigned for every config).
    unroll = 4

    #TODO figure out smem usage issue
    for n in nvals:
        x_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        y_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        z_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<%d}" % n,
            [
                "z[i] = 5.0*x[i]+7.0*y[i]"
            ], [
                lp.GlobalArg("x", dtype, shape=n),
                lp.GlobalArg("y", dtype, shape=n),
                lp.GlobalArg("z", dtype, shape=n),
            ], name="axpy")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            # Every configuration starts from the untransformed kernel.
            knl = ref_knl
            knl = lp.split_iname(knl, "i", unroll*BSIZEx,
                 outer_tag="g.0", slabs=(0, 1))
            knl = lp.split_iname(knl, "i_inner", BSIZEx,
                 outer_tag="unr", inner_tag="l.0")

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            other_op32 = get_32b_ops_all(op_map2, params) - sum(amd_op32)
            # TODO remove after debug: cross-check the two op counters.
            if flops + iops != sum(amd_op32) + other_op32:
                print("<debug> PROBLEM!, ops don't add up: ",
                        flops, iops, sum(amd_op32), other_op32)
            sub_map = get_DRAM_access_poly(knl)
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute; the first warmup_trials timings are discarded below.
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, x=x_vec_dev, y=y_vec_dev, z=z_vec_dev)
                evt.wait()
                # Scaled by 1e-9 (presumably ns -> s; TODO confirm).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # A hard-coded value of 18 (c2070; 20 noted for k20) used to be
            # assigned first; it was always overwritten by this estimate.
            reg32_per_thread = estimate_regs_per_thread(knl)
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/(BSIZEx*unroll))
            # Counts are scaled by unroll/n, i.e. per group of `unroll`
            # elements (presumably per work-item; TODO confirm).
            kstats = KernelStats(flops*unroll/n, f32uncoal*unroll/n,
                                 f32coal*unroll/n, barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # cycles / (clock * 1e9): presumably sm_clock_freq is in GHz.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so the appended entry cannot alias amd_op32.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n/unroll,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
コード例 #11
0
def run_mm_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config, version):
    """Benchmark the "matmul" kernel and collect model data.

    For each matrix size in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t*, the kernel ``c[i, j] = sum(k, a[i, k]*b[k, j])`` is tiled
    according to *version*, ``k`` is split by ``BSIZEy`` and ``a``/``b`` are
    prefetched.  Barrier/operation/DRAM-access counts are evaluated and the
    kernel is run ``averaging_trials + warmup_trials`` times (both names are
    defined outside this function).  The post-warm-up mean of the profiled
    times, the ``PerfModel`` prediction and one least-squares row are
    recorded.  The accumulated data is finally passed to
    ``update_lstsq_mats``.

    :arg ctx: not used by this function.
    :arg queue: queue used to create the random inputs and to launch the
        kernel; ``evt.profile`` is read, so it presumably must have profiling
        enabled -- TODO confirm.
    :arg nvals: sequence of matrix edge lengths ``n``.
    :arg configs_t: sequence of ``(BSIZEx, BSIZEy)`` block sizes; iterated
        once per entry of *nvals*.
    :arg Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
        HK_predict_all, train_test_config: forwarded unchanged to
        ``update_lstsq_mats``.
    :arg version: ``"allcoal"`` tags ``i_inner`` as ``l.1`` and ``j_inner``
        as ``l.0``; ``"partcoal"`` swaps the two local axes.
    :returns: *None*; results are delivered through ``update_lstsq_mats``.
    :raises ValueError: if *version* is neither ``"allcoal"`` nor
        ``"partcoal"`` (raised when the first configuration is processed).
        This used to surface as a ``ZeroDivisionError`` from ``1/0``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    #TODO figure out smem usage issue
    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        c_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        order = "C"
        knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ], [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
            ], name="matmul")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            # Every configuration starts from the untransformed kernel.
            knl = ref_knl
            if version == "allcoal":
                knl = lp.split_iname(knl, "i", BSIZEy,
                                     outer_tag="g.0", inner_tag="l.1")
                knl = lp.split_iname(knl, "j", BSIZEx,
                                     outer_tag="g.1", inner_tag="l.0")
            elif version == "partcoal":
                knl = lp.split_iname(knl, "i", BSIZEy,
                                     outer_tag="g.0", inner_tag="l.0")
                knl = lp.split_iname(knl, "j", BSIZEx,
                                     outer_tag="g.1", inner_tag="l.1")
            else:
                raise ValueError(
                    "unknown version %r; expected 'allcoal' or 'partcoal'"
                    % (version,))
            ksplit = BSIZEy
            knl = lp.split_iname(knl, "k", ksplit)
            knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
            knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            other_op32 = get_32b_ops_all(op_map2, params) - sum(amd_op32)
            # TODO remove after debug: cross-check the two op counters.
            if flops + iops != sum(amd_op32) + other_op32:
                print("<debug> PROBLEM!, ops don't add up: ",
                        flops, iops, sum(amd_op32), other_op32)
            sub_map = get_DRAM_access_poly(knl)
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # Execute; the first warmup_trials timings are discarded below.
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_out,) = knl(queue, a=a_mat_dev, b=b_mat_dev, c=c_mat_dev)
                evt.wait()
                # Scaled by 1e-9 (presumably ns -> s; TODO confirm).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # A hard-coded per-BSIZEx table (C2070: 20 for 8/16, 19 for 32,
            # 12 for 24) used to be assigned first; it was always overwritten
            # by this estimate, so only the estimate is kept.
            reg32_per_thread = estimate_regs_per_thread(knl)

            # 4 bytes per float32 over ksplit x (BSIZEx+BSIZEy) elements --
            # presumably the two prefetch tiles; TODO confirm.
            shared_mem_per_block = 4*ksplit*(BSIZEx+BSIZEy)
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            # Counts are normalised by the n*n output entries.
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            # cycles / (clock * 1e9): presumably sm_clock_freq is in GHz.
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # Copy so the appended entry cannot alias amd_op32.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)