Ejemplo n.º 1
0
def test_op_counter_logic():
    """Check sub-group-granularity op counts for a kernel using ``if()``.

    The single instruction picks between ``g*2`` and ``g + h/2`` based on a
    condition combining ``not``/``and``/``or`` with comparisons.
    """
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}"
    instructions = [
                """
                e[i,k] = if(
                        not(k<ell-2) and k>6 or k/2==ell,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="logic", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    # The expected values below are computed for one work-group of size one.
    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, name):
        # Evaluate one sub-group-granularity entry of the op map.
        return op_map[lp.Op(dtype, name, CG.SUBGROUP)].eval_with_dict(params)

    f32mul = count(np.float32, 'mul')
    f64add = count(np.float64, 'add')
    f64div = count(np.dtype(np.float64), 'div')
    i32add = count(np.dtype(np.int32), 'add')

    # (count-per-sub-group)*n_subgroups
    once_per_point = n*m*n_subgroups
    assert f32mul == once_per_point
    assert f64div == 2*n*m*n_subgroups  # TODO why?
    assert f64add == once_per_point
    assert i32add == once_per_point
Ejemplo n.º 2
0
def test_op_counter_basic():
    """Check sub-group-granularity op counts for two arithmetic instructions.

    One instruction runs over ``i, j, k`` on float32 data, the other over
    ``i, k`` on float64 data with ``k+1`` subscripts.
    """
    domain = "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}"
    instructions = [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="basic", assumptions="n,m,ell >= 1")

    dtypes = dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    # The expected values below are computed for one work-group of size one.
    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, name):
        # Evaluate one sub-group-granularity entry of the op map.
        return op_map[lp.Op(dtype, name, CG.SUBGROUP)].eval_with_dict(params)

    f32add = count(np.float32, 'add')
    f32mul = count(np.float32, 'mul')
    f32div = count(np.float32, 'div')
    f64mul = count(np.dtype(np.float64), 'mul')
    i32add = count(np.dtype(np.int32), 'add')

    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == f32div == n*m*ell*n_subgroups
    assert f64mul == n*m*n_subgroups
    assert i32add == n*m*2*n_subgroups
Ejemplo n.º 3
0
def test_op_counter_reduction():
    """Check sub-group-granularity op counts for a serial matmul reduction."""
    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        name="matmul_serial",
        assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    # The expected values below are computed for one work-group of size one.
    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    add_key = lp.Op(np.float32, 'add', CG.SUBGROUP)
    f32add = op_map[add_key].eval_with_dict(params)
    mul_key = lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
    f32mul = op_map[mul_key].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    expected = n*m*ell*n_subgroups
    assert f32add == f32mul == expected

    # Grouping by dtype alone must yield the sum of the add and mul counts.
    by_dtype = op_map.group_by('dtype')
    f32_total = by_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    assert f32_total == f32add + f32mul
Ejemplo n.º 4
0
def test_op_counter_logic():
    """Check work-item-granularity op counts for a kernel using ``if()``.

    The single instruction picks between ``g*2`` and ``g + h/2`` based on a
    condition combining ``not``/``and``/``or`` with comparisons.
    """
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}"
    instructions = [
                """
                e[i,k] = if(
                        not(k<ell-2) and k>6 or k/2==ell,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="logic", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    op_map = lp.get_op_map(knl, count_redundant_work=True)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, name):
        # Evaluate one work-item-granularity entry of the op map.
        return op_map[lp.Op(dtype, name, CG.WORKITEM)].eval_with_dict(params)

    f32mul = count(np.float32, 'mul')
    f64add = count(np.float64, 'add')
    f64div = count(np.dtype(np.float64), 'div')
    i32add = count(np.dtype(np.int32), 'add')

    once_per_point = n*m
    assert f32mul == once_per_point
    assert f64div == 2*n*m  # TODO why?
    assert f64add == once_per_point
    assert i32add == once_per_point
Ejemplo n.º 5
0
def test_op_counter_basic():
    """Check work-item-granularity op counts for two arithmetic instructions.

    One instruction runs over ``i, j, k`` on float32 data, the other over
    ``i, k`` on float64 data with ``k+1`` subscripts.
    """
    domain = "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}"
    instructions = [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="basic", assumptions="n,m,ell >= 1")

    dtypes = dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    op_map = lp.get_op_map(knl, count_redundant_work=True)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, name):
        # Evaluate one work-item-granularity entry of the op map.
        return op_map[lp.Op(dtype, name, CG.WORKITEM)].eval_with_dict(params)

    f32add = count(np.float32, 'add')
    f32mul = count(np.float32, 'mul')
    f32div = count(np.float32, 'div')
    f64mul = count(np.dtype(np.float64), 'mul')
    i32add = count(np.dtype(np.int32), 'add')

    assert f32add == f32mul == f32div == n*m*ell
    assert f64mul == n*m
    assert i32add == n*m*2
Ejemplo n.º 6
0
def test_all_counters_parallel_matmul():
    """Check sync, op and memory-access counts for a tiled, prefetched matmul.

    The kernel splits ``i``, ``j`` and ``k`` by 16, tags the ``i``/``j`` pieces
    as group/local axes and prefetches ``a`` and ``b``.
    """
    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    # The kernel parameter is spelled 'l'; only the local Python name differs.
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'l': ell}

    # --- synchronization ---
    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    launches = sync_map["kernel_launch"].eval_with_dict(params)
    assert launches == 1
    local_barriers = sync_map["barrier_local"].eval_with_dict(params)
    assert local_barriers == 2 * m / 16

    # --- arithmetic ---
    op_map = lp.get_op_map(knl)
    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
    # The int32 entries are looked up and evaluated but never asserted on.
    i32ops = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
    i32ops += op_map[lp.Op(np.dtype(np.int32), 'mul')].eval_with_dict(params)

    assert f32mul + f32add == n * m * ell * 2

    # --- global memory ---
    mem_access_map = lp.get_mem_access_map(knl)

    def global_f32_stride1(direction, variable):
        # Evaluate the stride-1 global float32 entry for one variable.
        key = lp.MemAccess('global', np.float32, stride=1,
                           direction=direction, variable=variable)
        return mem_access_map[key].eval_with_dict(params)

    f32coal = global_f32_stride1('load', 'b')
    f32coal += global_f32_stride1('load', 'a')
    assert f32coal == n * m + m * ell

    f32coal = global_f32_stride1('store', 'c')
    assert f32coal == n * ell

    # --- local memory ---
    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
    local_load_key = lp.MemAccess(
        'local', np.dtype(np.float32), direction='load')
    local_mem_l = local_mem_map[local_load_key].eval_with_dict(params)
    assert local_mem_l == n * m * ell * 2
Ejemplo n.º 7
0
def test_op_counter_specialops():
    """Check op counts for modulo, power and function-call operations.

    The kernel uses ``%``, ``**``, ``rsqrt`` and ``sin``; function calls are
    looked up under ``'func:<name>'`` op names.
    """
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
        """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="specialops", assumptions="n,m,l >= 1")

    dtypes = dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    op_map = lp.get_op_map(knl)

    # The kernel parameter is spelled 'l'; only the local Python name differs.
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'l': ell}

    def count(dtype, name):
        # Evaluate one entry of the op map.
        return op_map[lp.Op(dtype, name)].eval_with_dict(params)

    f32mul = count(np.float32, 'mul')
    f32div = count(np.float32, 'div')
    f32add = count(np.float32, 'add')
    f64pow = count(np.float64, 'pow')
    f64add = count(np.dtype(np.float64), 'add')
    i32add = count(np.dtype(np.int32), 'add')
    f64rsq = count(np.dtype(np.float64), 'func:rsqrt')
    f64sin = count(np.dtype(np.float64), 'func:sin')

    assert f32div == 2 * n * m * ell
    assert f32mul == f32add == n * m * ell
    assert f64add == 3 * n * m
    assert f64pow == i32add == f64rsq == f64sin == n * m
Ejemplo n.º 8
0
def test_op_counter_triangular_domain():
    """Check the sub-group-granularity multiply count on a triangular domain.

    The domain adds ``i<j`` to the rectangular bounds. Which expected value
    applies depends on whether ``isl.BasicSet`` exposes a ``card`` attribute.
    """
    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
                         """
            a[i, j] = b[i,j] * 2
            """,
        name="bitwise", assumptions="n,m >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    import islpy as isl
    # Without BasicSet.card the larger (fallback) count is expected.
    expect_fallback = not hasattr(isl.BasicSet, "card")

    full_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    mul_count = full_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)]
    flops = mul_count.eval_with_dict(dict(m=13, n=200))

    # The expected values below are computed for one work-group of size one.
    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups * div_ceil(group_size, SGS)

    per_subgroup = 144 if expect_fallback else 78
    assert flops == per_subgroup * n_subgroups
Ejemplo n.º 9
0
def test_op_counter_triangular_domain():
    """Check the work-item-granularity multiply count on a triangular domain.

    The domain adds ``i<j`` to the rectangular bounds. Which expected value
    applies depends on whether ``isl.BasicSet`` exposes a ``card`` attribute.
    """
    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
        name="bitwise",
        assumptions="n,m >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    import islpy as isl
    # Without BasicSet.card the larger (fallback) count is expected.
    expect_fallback = not hasattr(isl.BasicSet, "card")

    full_map = lp.get_op_map(knl, count_redundant_work=True)
    mul_count = full_map[lp.Op(np.float64, 'mul', CG.WORKITEM)]
    flops = mul_count.eval_with_dict(dict(m=13, n=200))

    expected = 144 if expect_fallback else 78
    assert flops == expected
Ejemplo n.º 10
0
def test_op_counter_reduction():
    """Check op counts for a serial matmul written as a ``sum`` reduction."""
    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        name="matmul_serial", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    op_map = lp.get_op_map(knl)

    # The kernel parameter is spelled 'l'; only the local Python name differs.
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'l': ell}

    add_key = lp.Op(np.float32, 'add')
    f32add = op_map[add_key].eval_with_dict(params)
    mul_key = lp.Op(np.dtype(np.float32), 'mul')
    f32mul = op_map[mul_key].eval_with_dict(params)
    assert f32add == f32mul == n * m * ell

    # Grouping by dtype alone must yield the sum of the add and mul counts.
    by_dtype = op_map.group_by('dtype')
    f32_total = by_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    assert f32_total == f32add + f32mul
Ejemplo n.º 11
0
def test_op_counter_bitwise():
    """Check op counts for bitwise and shift operations on integer data.

    The kernel uses ``|``, ``&``, ``^``, ``~``, ``<<`` and ``>>``; the counts
    are looked up under the ``'bw'`` and ``'shift'`` op names.
    """
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
        """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="bitwise", assumptions="n,m,l >= 1")

    dtypes = dict(a=np.int32, b=np.int32, g=np.int64, h=np.int64)
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    op_map = lp.get_op_map(knl)

    # The kernel parameter is spelled 'l'; only the local Python name differs.
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'l': ell}

    def count(dtype, name):
        # Evaluate one entry of the op map.
        return op_map[lp.Op(dtype, name)].eval_with_dict(params)

    i32add = count(np.int32, 'add')
    i32bw = count(np.int32, 'bw')
    i64bw = count(np.dtype(np.int64), 'bw')
    i64mul = count(np.dtype(np.int64), 'mul')
    i64add = count(np.dtype(np.int64), 'add')
    i64shift = count(np.dtype(np.int64), 'shift')

    assert i32add == n * m + n * m * ell
    assert i32bw == 2 * n * m * ell
    assert i64bw == 2 * n * m
    assert i64add == i64mul == n * m
    assert i64shift == 2 * n * m
Ejemplo n.º 12
0
def test_count_granularity_val_checks():
    """Check that ``lp.MemAccess`` and ``lp.Op`` validate ``count_granularity``.

    Each constructor must accept the three ``CG`` granularities and ``None``,
    and must raise ``ValueError`` for an unknown value.

    The valid constructions are kept outside the ``try`` block so that a
    ``ValueError`` raised for a *valid* granularity fails the test instead of
    being mistaken for the expected rejection of the invalid one.
    """
    valid_granularities = (CG.WORKITEM, CG.SUBGROUP, CG.WORKGROUP, None)

    for counter_cls in (lp.MemAccess, lp.Op):
        # Any exception here propagates and fails the test.
        for granularity in valid_granularities:
            counter_cls(count_granularity=granularity)

        try:
            counter_cls(count_granularity='bushel')
        except ValueError:
            pass
        else:
            # Raised explicitly (not via `assert`) so it survives `python -O`.
            raise AssertionError(
                "%s accepted an invalid count_granularity"
                % counter_cls.__name__)
Ejemplo n.º 13
0
def test_op_counter_specialops():
    """Check sub-group-granularity counts for modulo, power and function calls.

    The kernel uses ``%``, ``**``, ``rsqrt`` and ``sin``; the op map is
    requested with ``count_within_subscripts=True``.
    """
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}"
    instructions = [
        """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="specialops", assumptions="n,m,ell >= 1")

    dtypes = dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    op_map = lp.get_op_map(
        knl, subgroup_size=SGS, count_redundant_work=True,
        count_within_subscripts=True)

    # The expected values below are computed for one work-group of size one.
    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups * div_ceil(group_size, SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, name):
        # Evaluate one sub-group-granularity entry of the op map.
        return op_map[lp.Op(dtype, name, CG.SUBGROUP)].eval_with_dict(params)

    f32mul = count(np.float32, 'mul')
    f32div = count(np.float32, 'div')
    f32add = count(np.float32, 'add')
    f64pow = count(np.float64, 'pow')
    f64add = count(np.dtype(np.float64), 'add')
    i32add = count(np.dtype(np.int32), 'add')
    f64rsq = count(np.dtype(np.float64), 'func:rsqrt')
    f64sin = count(np.dtype(np.float64), 'func:sin')

    # (count-per-sub-group)*n_subgroups
    assert f32div == 2 * n * m * ell * n_subgroups
    assert f32mul == f32add == n * m * ell * n_subgroups
    assert f64add == 3 * n * m * n_subgroups
    assert f64pow == i32add == f64rsq == f64sin == n * m * n_subgroups
Ejemplo n.º 14
0
def test_op_counter_bitwise():
    """Check sub-group-granularity counts for bitwise and shift operations.

    The kernel uses ``|``, ``&``, ``^``, ``~``, ``<<`` and ``>>`` on int32 and
    int64 data; the counts are looked up under the ``'bw'`` and ``'shift'``
    op names. Every expected value has the form
    (count-per-sub-group)*n_subgroups.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int64, h=np.int64))

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    # The expected values below are computed for one work-group of size one.
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, name):
        # Evaluate one sub-group-granularity entry of the op map.
        return op_map[lp.Op(dtype, name, CG.SUBGROUP)].eval_with_dict(params)

    i32add = count(np.int32, 'add')
    i32bw = count(np.int32, 'bw')
    i64bw = count(np.dtype(np.int64), 'bw')
    i64mul = count(np.dtype(np.int64), 'mul')
    i64add = count(np.dtype(np.int64), 'add')
    i64shift = count(np.dtype(np.int64), 'shift')

    # (count-per-sub-group)*n_subgroups
    # The whole per-sub-group count is parenthesized so that both terms are
    # scaled by n_subgroups, matching the other assertions.
    assert i32add == (n*m + n*m*ell)*n_subgroups
    assert i32bw == 2*n*m*ell*n_subgroups
    assert i64bw == 2*n*m*n_subgroups
    assert i64add == i64mul == n*m*n_subgroups
    assert i64shift == 2*n*m*n_subgroups
Ejemplo n.º 15
0
def test_all_counters_parallel_matmul():
    """Check sync, op and memory-access counts for a tiled, prefetched matmul.

    The kernel splits ``i``, ``j`` and ``k`` by ``bsize``, tags the ``i``/``j``
    pieces as group/local axes and prefetches ``a`` and ``b``. Op counts and
    local-memory counts are checked at sub-group granularity, global-memory
    counts at work-item granularity.
    """
    bsize = 16
    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        name="matmul",
        assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    knl = lp.add_prefetch(
        knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(
        knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    # Launch geometry used to scale the per-sub-group expectations.
    group_size = bsize*bsize
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    # --- synchronization ---
    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    launches = sync_map["kernel_launch"].eval_with_dict(params)
    assert launches == 1
    local_barriers = sync_map["barrier_local"].eval_with_dict(params)
    assert local_barriers == 2*m/bsize

    # --- arithmetic ---
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    def op_count(dtype, name):
        # Evaluate one sub-group-granularity entry of the op map.
        return op_map[lp.Op(dtype, name, CG.SUBGROUP)].eval_with_dict(params)

    f32mul = op_count(np.float32, 'mul')
    f32add = op_count(np.float32, 'add')
    # The int32 entries are looked up and evaluated but never asserted on.
    i32ops = op_count(np.int32, 'add')
    i32ops += op_count(np.dtype(np.int32), 'mul')

    # (count-per-sub-group)*n_subgroups
    assert f32mul+f32add == m*2*n_subgroups

    # --- global memory ---
    mem_access_map = lp.get_mem_access_map(
        knl, count_redundant_work=True, subgroup_size=SGS)

    load_b_key = lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1, 1: Variable('ell')},
        gid_strides={1: bsize},
        direction='load', variable='b',
        count_granularity=CG.WORKITEM)
    f32s1lb = mem_access_map[load_b_key].eval_with_dict(params)

    load_a_key = lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1, 1: Variable('m')},
        gid_strides={0: Variable('m')*bsize},
        direction='load', variable='a',
        count_granularity=CG.WORKITEM)
    f32s1la = mem_access_map[load_a_key].eval_with_dict(params)

    assert f32s1lb == n*m*ell/bsize
    assert f32s1la == n*m*ell/bsize

    store_c_key = lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1, 1: Variable('ell')},
        gid_strides={0: Variable('ell')*bsize, 1: bsize},
        direction='store', variable='c',
        count_granularity=CG.WORKITEM)
    f32coal = mem_access_map[store_c_key].eval_with_dict(params)

    assert f32coal == n*ell

    # --- local memory ---
    all_accesses = lp.get_mem_access_map(
        knl, count_redundant_work=True, subgroup_size=SGS)
    local_mem_map = all_accesses.filter_by(mtype=['local'])

    local_loads = local_mem_map.filter_by(direction=['load'])
    local_mem_l = local_loads.eval_and_sum(params)
    # (count-per-sub-group)*n_subgroups
    assert local_mem_l == m*2*n_subgroups

    load_a_fetch_key = lp.MemAccess(
        'local', np.dtype(np.float32),
        direction='load',
        lid_strides={1: 16},
        gid_strides={},
        variable='a_fetch',
        count_granularity=CG.SUBGROUP)
    local_mem_l_a = local_mem_map[load_a_fetch_key].eval_with_dict(params)

    load_b_fetch_key = lp.MemAccess(
        'local', np.dtype(np.float32),
        direction='load',
        lid_strides={0: 1},
        gid_strides={},
        variable='b_fetch',
        count_granularity=CG.SUBGROUP)
    local_mem_l_b = local_mem_map[load_b_fetch_key].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_l_a == local_mem_l_b == m*n_subgroups

    local_stores = local_mem_map.filter_by(direction=['store'])
    local_mem_s = local_stores.eval_and_sum(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_s == m*2/bsize*n_subgroups
Ejemplo n.º 16
0
def test_summations_and_filters():
    """Exercise filtering, grouping and summing of op and mem-access maps.

    Covers ``filter_by``, ``filter_by_func``, ``group_by``, ``to_bytes`` and
    ``eval_and_sum`` on the maps of a two-instruction kernel.
    """
    domain = "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = [
                             """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="basic", assumptions="n,m,l >= 1")

    dtypes = dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    # The kernel parameter is spelled 'l'; only the local Python name differs.
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'l': ell}

    # --- memory-access map ---
    mem_map = lp.get_mem_access_map(knl)

    a_loads = mem_map.filter_by(direction=['load'], variable=['a'])
    loads_a = a_loads.eval_and_sum(params)
    assert loads_a == 2 * n * m * ell

    stores = mem_map.filter_by(mtype=['global'], direction=['store'])
    global_stores = stores.eval_and_sum(params)
    assert global_stores == n * m * ell + n * m

    global_loads = mem_map.filter_by(mtype=['global'], direction=['load'])
    ld_bytes = global_loads.to_bytes().eval_and_sum(params)
    global_stores_again = mem_map.filter_by(
        mtype=['global'], direction=['store'])
    st_bytes = global_stores_again.to_bytes().eval_and_sum(params)
    assert ld_bytes == 4 * n * m * ell * 3 + 8 * n * m * 2
    assert st_bytes == 4 * n * m * ell + 8 * n * m

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
    f32_load_key = lp.MemAccess('global', np.float32, direction='load')
    f32lall = reduced_map[f32_load_key].eval_with_dict(params)
    f64_load_key = lp.MemAccess('global', np.float64, direction='load')
    f64lall = reduced_map[f64_load_key].eval_with_dict(params)
    assert f32lall == 3 * n * m * ell
    assert f64lall == 2 * n * m

    # --- op map ---
    op_map = lp.get_op_map(knl)

    by_dtype = op_map.group_by('dtype')
    f32 = by_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = by_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = by_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n * m * ell * 3
    assert f64 == n * m
    assert i32 == n * m * 2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n * m * ell + n * m * 2
    assert f32ops_all == n * m * ell * 3

    # Filtering on a field name that does not exist must sum to zero.
    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')
    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n * m * ell + n * m
    assert f64ops_all == n * m

    def func_filter(key):
        # Keep float64 loads whose stride is below one.
        return (key.stride < 1
                and key.dtype == to_loopy_type(np.float64)
                and key.direction == 'load')

    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
    assert s1f64l == 2 * n * m
Ejemplo n.º 17
0
def test_summations_and_filters():
    """Exercise filtering, grouping and summing of op and mem-access maps.

    Covers ``filter_by``, ``filter_by_func``, ``group_by``, ``to_bytes`` and
    ``eval_and_sum`` on the maps of a two-instruction kernel. The maps are
    built with ``subgroup_size=SGS``.
    """
    domain = "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}"
    instructions = [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="basic", assumptions="n,m,ell >= 1")

    dtypes = dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    # The expected values below are computed for one work-group of size one.
    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    # --- memory-access map ---
    mem_map = lp.get_mem_access_map(
        knl, count_redundant_work=True, subgroup_size=SGS)

    a_loads = mem_map.filter_by(
        direction=['load'], variable=['a'],
        count_granularity=[CG.SUBGROUP])
    loads_a = a_loads.eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert loads_a == (2*n*m*ell)*n_subgroups

    stores = mem_map.filter_by(
        mtype=['global'], direction=['store'],
        count_granularity=[CG.SUBGROUP])
    global_stores = stores.eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert global_stores == (n*m*ell + n*m)*n_subgroups

    global_loads = mem_map.filter_by(
        mtype=['global'], direction=['load'],
        count_granularity=[CG.SUBGROUP])
    ld_bytes = global_loads.to_bytes().eval_and_sum(params)
    global_stores_again = mem_map.filter_by(
        mtype=['global'], direction=['store'],
        count_granularity=[CG.SUBGROUP])
    st_bytes = global_stores_again.to_bytes().eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
    f32_load_key = lp.MemAccess('global', np.float32, direction='load')
    f32lall = reduced_map[f32_load_key].eval_with_dict(params)
    f64_load_key = lp.MemAccess('global', np.float64, direction='load')
    f64lall = reduced_map[f64_load_key].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32lall == (3*n*m*ell)*n_subgroups
    assert f64lall == (2*n*m)*n_subgroups

    # --- op map ---
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    by_dtype = op_map.group_by('dtype')
    f32 = by_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = by_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = by_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n*m*ell*3
    assert f64 == n*m
    assert i32 == n*m*2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n*m*ell + n*m*2
    assert f32ops_all == n*m*ell*3

    # Filtering on a field name that does not exist must sum to zero.
    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')
    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n*m*ell + n*m
    assert f64ops_all == n*m

    def func_filter(key):
        # Keep float64 loads that carry no local-id strides.
        return (key.lid_strides == {}
                and key.dtype == to_loopy_type(np.float64)
                and key.direction == 'load')

    f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups