Example #1
def test_all_counters_parallel_matmul():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

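    # no prefetching into local memory here, so no barriers are expected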
    barrier_count = get_barrier_poly(knl).eval_with_dict(params)
    assert barrier_count == 0

    op_map = get_op_poly(knl)
    f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    i32ops = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    i32ops += op_map[(np.dtype(np.int32), 'mul')].eval_with_dict(params)

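    # one f32 multiply and one f32 add per (i, k, j) iteration; the i32 ops
    # come from index arithmetic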
    assert f32mul+f32add == n*m*l*2
    assert i32ops == n*m*l*4 + l*n*4

    subscript_map = get_gmem_access_poly(knl)
    f32uncoal = subscript_map[
        (np.dtype(np.float32), 'nonconsecutive', 'load')
    ].eval_with_dict(params)
    f32coal = subscript_map[
        (np.dtype(np.float32), 'consecutive', 'load')
    ].eval_with_dict(params)

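    # along l.0 (j_inner), b[k, j] is unit-stride while a[i, k] is not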
    assert f32uncoal == n*m*l
    assert f32coal == n*m*l

    f32coal = subscript_map[
        (np.dtype(np.float32), 'consecutive', 'store')
    ].eval_with_dict(params)

    assert f32coal == n*l
Example #2
def test_barrier_counter_nobarriers():

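    # element-wise statements with no shared local data, so no barriers are
    # expected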
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_barrier_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    barrier_count = poly.eval_with_dict(params)
    assert barrier_count == 0
Example #3
def test_barrier_counter_barriers():

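    # the second statement reads c at k+1 and k-1 and depends on the first,
    # so synchronization is required between them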
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
            [
                """
                c[i,j,k] = 2*a[i,j,k] {id=first}
                e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first}
                """
            ], [
                lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
                "..."
            ],
            name="weird2",
            )
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
    knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")
    poly = get_barrier_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    barrier_count = poly.eval_with_dict(params)
    assert barrier_count == 50*10*2
Example #4
        knl = ref_knl
        knl = lp.split_iname(knl, "i", BSIZEy, outer_tag="g.0", inner_tag="l.1")
        knl = lp.split_iname(knl, "j", BSIZEx, outer_tag="g.1", inner_tag="l.0")
        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])

        # check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=True)
        # print "Correctness check: \n", check

        # use ptx src to determine resource usage
        cknl = lp.compiled.CompiledKernel(ctx, knl)
        ptx_src = cknl.cl_kernel_info().cl_kernel.program.binaries[0]
        with open(knl.name + ".ptx", "w") as ptx_src_file:
            ptx_src_file.write(ptx_src)

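        # gather the model inputs: barrier count, 32-bit op counts, and
        # coalesced/uncoalesced DRAM access counts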
        barrier_poly = get_barrier_poly(knl)
        barrier_count = barrier_poly.eval_with_dict({"n": n})
        op_map = get_op_poly(knl)
        flops = op_map.get(np.dtype(np.float32), isl.PwQPolynomial("{ 0 }")).eval_with_dict({"n": n})
        iops = op_map.get(np.dtype(np.int32), isl.PwQPolynomial("{ 0 }")).eval_with_dict({"n": n})
        sub_map = get_DRAM_access_poly(knl)  # noqa

        f32coal_l = sub_map.get(
            (np.dtype(np.float32), "consecutive", "load"), isl.PwQPolynomial("{ 0 }")
        ).eval_with_dict({"n": n})
        f32coal_s = sub_map.get(
            (np.dtype(np.float32), "consecutive", "store"), isl.PwQPolynomial("{ 0 }")
        ).eval_with_dict({"n": n})
        f32coal = f32coal_l + f32coal_s
        # print "coalesced: %i, (stores: %i, loads: %i)" % (f32coal, f32coal_s, f32coal_l)
        f32uncoal_l = sub_map.get(
            (np.dtype(np.float32), "nonconsecutive", "load"), isl.PwQPolynomial("{ 0 }")
        ).eval_with_dict({"n": n})
Example #5
def run_empt_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    for n in nvals:
        knl = lp.make_kernel(
                "{[i,j]: 0<=i,j<%d}" % n,
                [
                    ""
                ],
                name="empty")

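        # the kernel body is empty, so every counter below should be zero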
        for BSIZEx, BSIZEy in configs_t:

            #check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=True)
            #print "Correctness check: \n", check

            # use ptx src to determine resource usage
            #ptx_dump(ctx, knl, n, BSIZEx, BSIZEy)

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = get_DRAM_f32_accesses(
                                                                    sub_map, params)

            # execute
            #print "="*40+"TIMING RESULTS"
            print("running kernel...")
            #knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

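            # time the kernel, discarding the first warmup_trials runs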
            trial_times = []
            for i in range(averaging_trials+warmup_trials):
                evt, out = knl(queue)
                evt.wait()
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

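            # feed the hand-counted stats into the analytical model behind
            # HK_predict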
            gstats = GPUStats('TeslaK20')
            reg32_per_thread = 2
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            total_threads = total_blocks*BSIZEx*BSIZEy  # TODO unused
            # TODO actually increase threads/blocks but expect 0 result
            kstats = KernelStats(0, 0, 0, barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
Example #6
def run_fd_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config):
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    for n in nvals:
        u_mat_dev = cl.clrandom.rand(queue, (n+2, n+2), dtype=dtype)
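        # result[i,j] combines u[i,j]**2 with a 5-point stencil of the padded
        # (n+2) x (n+2) input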
        knl = lp.make_kernel(
              "{[i,j]: 0<=i,j<n}",
              "result[i,j] = u[i, j]**2 + -1 + (-4)*u[i + 1, j + 1] \
                    + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                    + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]",
              name="finite_diff")
        knl = lp.add_and_infer_dtypes(knl, {"u": dtype})
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            knl = ref_knl
            knl = lp.split_iname(knl,
                    "i", BSIZEx, outer_tag="g.1", inner_tag="l.1")
            knl = lp.split_iname(knl,
                    "j", BSIZEy, outer_tag="g.0", inner_tag="l.0")
            knl = lp.add_prefetch(knl, "u",
                    ["i_inner", "j_inner"],
                    fetch_bounding_box=True)
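            # the bounding-box prefetch above pulls a (BSIZEx+2) x (BSIZEy+2)
            # halo tile into local memory (cf. shared_mem_per_block below)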

            #check = lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=n),
            #                            print_code=True)
            #print "Correctness check: \n", check

            # use ptx src to determine resource usage
            #ptx_dump(ctx, knl, n, BSIZEx, BSIZEy)

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = get_DRAM_f32_accesses(
                                                                    sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # execute
            #print "="*40+"TIMING RESULTS"
            print("running kernel...")
            #knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

            trial_times = []
            for i in range(averaging_trials+warmup_trials):
                evt, (out,) = knl(queue, u=u_mat_dev)
                evt.wait()
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')
            if n % BSIZEx == 0 and n % BSIZEy == 0:
                reg32_per_thread = 14
            else:
                reg32_per_thread = 16

            shared_mem_per_block = 4*(BSIZEx+2)*(BSIZEy+2)
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            total_threads = total_blocks*BSIZEx*BSIZEy  # TODO unused
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
Example #7
def run_conv_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    ncolors = 3
    for n in nvals:
        knl = lp.make_kernel(
            "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and 0 <= im_x < im_w and 0 <= im_y < im_h \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
             }",
            """
            out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
                img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
            """,
            [
                lp.GlobalArg("f", dtype, shape=lp.auto),
                lp.GlobalArg("img", dtype, shape=lp.auto),
                lp.GlobalArg("out", dtype, shape=lp.auto),
                "..."
            ],
            assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
            flags="annotate_inames",
            defines=dict(ncolors=ncolors),
            name="conv")

        f_w = 3
        knl = lp.fix_parameters(knl, f_w=f_w)
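        # fixing f_w makes the filter footprint a compile-time constant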
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            knl = ref_knl
            im_w = n
            im_h = n
            nfeats = 3
            nimgs = 3
            f_dev = cl.clrandom.rand(queue, (nfeats, 2*f_w+1, 2*f_w+1, ncolors),
                                     dtype=dtype)
            img_dev = cl.clrandom.rand(queue, (nimgs+1, n+2*f_w, n+2*f_w, ncolors),
                                       dtype=dtype)

            knl = lp.split_iname(knl, "im_x", BSIZEx,
                                 outer_tag="g.0", inner_tag="l.0")
            knl = lp.split_iname(knl, "im_y", BSIZEy,
                                 outer_tag="g.1", inner_tag="l.1")
            knl = lp.tag_inames(knl, dict(ifeat="g.2"))
            knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
            knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
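            # the two prefetches above stage the filter and the haloed image
            # tile in local memory (cf. shared_mem_per_block below)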

            params = dict(im_w=im_w, im_h=im_h, f_w=f_w, nfeats=nfeats, nimgs=nimgs)

            #check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=True,
            #                            parameters=params)
            #print "Correctness check: \n", check
            # use ptx src to determine resource usage
            #ptx_dump(ctx, knl, n, BSIZEx, BSIZEy)

            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            #TODO why do blk sizes that don't fit perfectly increase total flops/iops
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = get_DRAM_f32_accesses(
                                                                    sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # execute
            print("running kernel...")
            #knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

            trial_times = []
            for i in range(averaging_trials+warmup_trials):
                evt, (out,) = knl(queue, f=f_dev, img=img_dev, im_w=im_w, im_h=im_h,
                                  nfeats=nfeats, nimgs=nimgs)
                evt.wait()
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')
            reg32_per_thread = 33
            shared_mem_per_block = (ncolors * (f_w*2+1) * (f_w*2+1) +
                                    (BSIZEx+f_w*2) * (BSIZEy+f_w*2)
                                    ) * np.dtype(dtype).itemsize
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            total_threads = total_blocks*BSIZEx*BSIZEy  # TODO unused
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))
            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)
            #TODO try total_threads for n*n

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
Example #8
def run_tp_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all, actual_times_all,
                  HK_predict_all, train_test_config, prefetch=True):
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        order = "C"
        knl = lp.make_kernel(
                "{[i,j]: 0<=i,j<%d}" % n,
                [
                    "b[i, j] = a[j, i]"
                ], [
                    lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                ],
                name="transpose")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            knl = ref_knl
            knl = lp.split_iname(knl, "i", BSIZEy, outer_tag="g.0", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", BSIZEx, outer_tag="g.1", inner_tag="l.0")
            if prefetch:
                knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])
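                # staging a through a local tile lets both the load and the
                # transposed store stay coalesced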

            #check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=True)
            #print "Correctness check: \n", check

            # use ptx src to determine resource usage
            #ptx_dump(ctx, knl, n, BSIZEx, BSIZEy)

            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict({'n': n})
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, {'n': n})
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = get_DRAM_f32_accesses(
                                                                  sub_map, {'n': n})
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s
            # execute
            #print "="*40+"TIMING RESULTS"
            print("running kernel...")
            #knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
            #if not prefetch:
            #    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

            trial_times = []
            for i in range(averaging_trials+warmup_trials):
                evt, (out,) = knl(queue, a=a_mat_dev, b=b_mat_dev)
                evt.wait()
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])
            #if not prefetch:
            #    1/0
            gstats = GPUStats('TeslaK20')
            if n % BSIZEx == 0 and n % BSIZEy == 0:
                if prefetch:
                    reg32_per_thread = 10
                else:
                    reg32_per_thread = 8
            else:
                if prefetch:
                    reg32_per_thread = 8
                else:
                    reg32_per_thread = 9

            if prefetch:
                shared_mem_per_block = 4*BSIZEx*BSIZEy
            else:
                shared_mem_per_block = 0
            # TODO why is HK way off on the non-prefetch version?
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            total_threads = total_blocks*BSIZEx*BSIZEy  # TODO unused
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            #update_LS_matrix(A, flops, f32coal_l, f32coal_s, f32uncoal_l,
            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
Example #9
def run_axpy_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    #TODO figure out smem usage issue
    for n in nvals:
        x_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        y_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        z_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<%d}" % n,
            [
                "z[i] = 5.0*x[i]+7.0*y[i]"
            ], [
                lp.GlobalArg("x", dtype, shape=n),
                lp.GlobalArg("y", dtype, shape=n),
                lp.GlobalArg("z", dtype, shape=n),
            ], name="axpy")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            knl = ref_knl
            unroll = 4
            knl = lp.split_iname(knl, "i", unroll*BSIZEx,
                 outer_tag="g.0", slabs=(0, 1))
            knl = lp.split_iname(knl, "i_inner", BSIZEx,
                 outer_tag="unr", inner_tag="l.0")
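            # each work-item now handles `unroll` elements via the unr-tagged
            # loop, strided by BSIZEx along l.0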

            #check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=False)
            #print "Correctness check: \n", check

            # use ptx src to determine resource usage
            #ptx_dump(ctx, knl, n, BSIZEx, BSIZEy)

            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict({'n': n})
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, {'n': n})
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = get_DRAM_f32_accesses(
                                                                  sub_map, {'n': n})
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            '''
            print_ptx_src_msg(knl.name)

            print "="*40+"KERNEL STATS"
            print "barrier count: ", barrier_ct
            print "flops: ", flops
            print(sub_map)
            print "="*40
            '''

            # execute
            print("running kernel...")
            #knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

            trial_times = []
            for i in range(averaging_trials+warmup_trials):
                evt, (out,) = knl(queue, x=x_vec_dev, y=y_vec_dev, z=z_vec_dev)
                evt.wait()
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')
            reg32_per_thread = 20
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/(BSIZEx*unroll))
            kstats = KernelStats(flops*unroll/n, f32uncoal*unroll/n,
                                 f32coal*unroll/n, barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n/unroll,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
Example #10
def run_varyflops_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config):
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    #TODO figure out smem usage issue
    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n, n), dtype=dtype)
        g_mat_dev = cl.clrandom.rand(queue, (n, n, n), dtype=dtype)
        h_mat_dev = cl.clrandom.rand(queue, (n, n, n+1), dtype=dtype)

        knl = lp.make_kernel(
                "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                [
                    """
                    c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                    """
                ],
                name="basic", assumptions="n,m,l >= 1")
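        # the commented-out variants below vary the per-element flop count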
        '''
        knl = lp.make_kernel(
                "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                [
                    """
                    c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                    e[i, j, k+1] = g[i,j,k]*h[i,j,k+1]
                    """
                ],
                name="basic", assumptions="n,m,l >= 1")
        knl = lp.make_kernel(
                "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                [
                    """
                    c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0
                    e[i, j, k+1] = g[i,j,k]*h[i,j,k+1]
                    """
                ],
                name="basic", assumptions="n,m,l >= 1")
        '''
        #knl = lp.add_and_infer_dtypes(knl,
        #                    dict(a=dtype, b=dtype, g=dtype, h=dtype))
        knl = lp.add_and_infer_dtypes(knl,
                            dict(a=dtype, b=dtype))
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            knl = ref_knl
            knl = lp.split_iname(knl, "i", BSIZEy,
                                 outer_tag="g.0", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", BSIZEx,
                                 outer_tag="g.1", inner_tag="l.0")

            params = dict(n=n, m=n, l=n)
            check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=True,
                                        parameters=params)
            #print "Correctness check: \n", check
            # use ptx src to determine resource usage
            #ptx_dump(ctx, knl, n, BSIZEx, BSIZEy)
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = get_DRAM_f32_accesses(
                                                                  sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s
            #print(sub_map)
            #print(f32coal/(n*n), f32uncoal/(n*n))
            print(knl)
            print(f32coal/(n*n), f32uncoal/(n*n))
            1/0  # deliberate crash: stop after printing the access counts
            '''
            print_ptx_src_msg(knl.name)
            print "="*40+"KERNEL STATS"
            print "barrier count: ", barrier_ct
            print "flops: ", flops
            print(sub_map)
            print "="*40
            '''

            # execute
            #print "="*40+"TIMING RESULTS"
            print("running kernel...")
            #knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

            trial_times = []
            for i in range(averaging_trials+warmup_trials):
                #evt, out = knl(queue, a=a_mat_dev, b=b_mat_dev,
                #                  g=g_mat_dev, h=h_mat_dev)
                evt, out = knl(queue, a=a_mat_dev, b=b_mat_dev)
                evt.wait()
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')
            '''
            if BSIZEx == 8 or BSIZEx == 32:  # TODO fix hack
                reg32_per_thread = 25
            elif BSIZEx == 24:
                reg32_per_thread = 18
            elif BSIZEx == 16:
                reg32_per_thread = 22
            '''
            reg32_per_thread = 18

            shared_mem_per_block = 0
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            total_threads = total_blocks*BSIZEx*BSIZEy  # TODO never used
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()
            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            '''
            print "actual runtime: ", actual[-1]
            print "total predicted time: ", predicted[-1]
            print "total predicted execution cycles: ", cycles
            print "="*40
            '''
            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
Example #11
def run_mm_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config, version):
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    #TODO figure out smem usage issue
    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        c_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        order = "C"
        knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ], [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
            ], name="matmul")
        ref_knl = knl

        for BSIZEx, BSIZEy in configs_t:
            knl = ref_knl
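            # "allcoal" tags i/j so global accesses stay coalesced;
            # "partcoal" swaps the local axes, uncoalescing some accesses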
            if version == "allcoal":
                knl = lp.split_iname(knl, "i", BSIZEy,
                                     outer_tag="g.0", inner_tag="l.1")
                knl = lp.split_iname(knl, "j", BSIZEx,
                                     outer_tag="g.1", inner_tag="l.0")
            elif version == "partcoal":
                knl = lp.split_iname(knl, "i", BSIZEy,
                                     outer_tag="g.0", inner_tag="l.0")
                knl = lp.split_iname(knl, "j", BSIZEx,
                                     outer_tag="g.1", inner_tag="l.1")
            else:
                raise ValueError("unknown version: %s" % version)
            ksplit = BSIZEy
            knl = lp.split_iname(knl, "k", ksplit)
            knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
            knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])
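            # the a/b tile prefetches above introduce barriers and use
            # 4*ksplit*(BSIZEx+BSIZEy) bytes of local memory per block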

            #check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=True)
            #print "Correctness check: \n", check
            # use ptx src to determine resource usage

            #ptx_dump(ctx, knl, n, BSIZEx, BSIZEy)

            params = {'n': n}
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            other_op32 = get_32b_ops_all(op_map2, params) - sum(amd_op32)
            if flops + iops != sum(amd_op32) + other_op32: #TODO remove after debug
                print("<debug> PROBLEM! ops don't add up: ",
                        flops, iops, sum(amd_op32), other_op32)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = get_DRAM_f32_accesses(
                                                                  sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            '''
            print_ptx_src_msg(knl.name)
            print "="*40+"KERNEL STATS"
            print "barrier count: ", barrier_ct
            print "flops: ", flops
            print(sub_map)
            print "="*40
            '''

            # execute
            #print "="*40+"TIMING RESULTS"
            print("running kernel...")
            #knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

            trial_times = []
            for i in range(averaging_trials+warmup_trials):
                evt, (out,) = knl(queue, a=a_mat_dev, b=b_mat_dev, c=c_mat_dev)
                evt.wait()
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            ''' for k20:
            if BSIZEx == 8 or BSIZEx == 32:  # TODO fix hack
                reg32_per_thread = 25
            elif BSIZEx == 24:
                reg32_per_thread = 18
            elif BSIZEx == 16:
                reg32_per_thread = 22
            '''
            # for C2070
            if BSIZEx == 8 or BSIZEx == 16:  # TODO fix hack
                reg32_per_thread = 20
            elif BSIZEx == 32:
                reg32_per_thread = 19
            elif BSIZEx == 24:
                reg32_per_thread = 12
            #reg32_per_thread = 1 #estimate_regs_per_thread(knl)
            #print(reg32_per_thread, estimate_regs_per_thread(knl))
            reg32_per_thread = estimate_regs_per_thread(knl)

            shared_mem_per_block = 4*ksplit*(BSIZEx+BSIZEy)
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            total_threads = total_blocks*BSIZEx*BSIZEy  # TODO never used
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig,
                            np.dtype(dtype))
            cycles = model.compute_total_cycles()
            actual.append(avg_time)
            #for time in trial_times: #!!!!!
            #    actual.append(time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            '''
            print "actual runtime: ", actual[-1]
            print "total predicted time: ", predicted[-1]
            print "total predicted execution cycles: ", cycles
            print "="*40
            '''
            #''' #!!!!!
            '''
            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)
            '''
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s, f32uncoal_l,
                             f32uncoal_s, barrier_ct, total_blocks, n*n,
                             np.dtype(dtype).itemsize, model)

            '''
            for time in trial_times:
                update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s, f32uncoal_l,
                                 f32uncoal_s, barrier_ct, total_blocks, n*n,
                                 np.dtype(dtype).itemsize, model)
            '''

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)