Example #1
def check_performance(ctx_and_double, shape1, shape2, bwo):

    ctx, double = ctx_and_double
    dtype = numpy.float64 if double else numpy.float32

    a = get_test_array(shape1, dtype)
    b = get_test_array(shape2, dtype)

    a_dev = ctx.to_device(a)
    b_dev = ctx.to_device(b)
    res_ref = ref_dot(a, b)
    res_dev = ctx.allocate(res_ref.shape, dtype=dtype)

    try:
        dot = MatrixMul(ctx).prepare_for(res_dev, a_dev, b_dev, block_width_override=bwo)
    except ValueError:
        pytest.skip()

    attempts = 10
    t1 = time.time()
    for i in range(attempts):
        dot(res_dev, a_dev, b_dev)
    ctx.synchronize()
    t2 = time.time()

    assert diff_is_negligible(ctx.from_device(res_dev), res_ref)

    return (t2 - t1) / attempts, product(res_ref.shape) * shape1[-1] * 2
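The pair returned above is the average wall-clock time per call together with an estimated operation count (2 * M * N * K for a matrix product: the output shape times the shared dimension shape1[-1]). A minimal sketch, with made-up numbers, of how a benchmark driver might turn that pair into GFLOPS:

# Hypothetical post-processing of check_performance's return value:
# (average seconds per call, estimated floating-point operation count).
def gflops(seconds, operations):
    return operations / seconds / 1e9

# A 1024x1024 by 1024x1024 product is about 2 * 1024**3 operations;
# at 5 ms per call that is roughly 430 GFLOPS.
print(gflops(0.005, 2 * 1024**3))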
Example #2
def check_performance(ctx_and_double, shape1, shape2, bwo):

    ctx, double = ctx_and_double
    dtype = numpy.float64 if double else numpy.float32

    a = get_test_array(shape1, dtype)
    b = get_test_array(shape2, dtype)

    a_dev = ctx.to_device(a)
    b_dev = ctx.to_device(b)
    res_ref = ref_dot(a, b)
    res_dev = ctx.allocate(res_ref.shape, dtype=dtype)

    try:
        dot = MatrixMul(ctx).prepare_for(res_dev,
                                         a_dev,
                                         b_dev,
                                         block_width_override=bwo)
    except ValueError:
        pytest.skip()

    attempts = 10
    t1 = time.time()
    for i in range(attempts):
        dot(res_dev, a_dev, b_dev)
    ctx.synchronize()
    t2 = time.time()

    assert diff_is_negligible(ctx.from_device(res_dev), res_ref)

    return (t2 - t1) / attempts, product(res_ref.shape) * shape1[-1] * 2
Example #3
def test_sizes(ctx_with_gs_limits, gl_size, gs_is_multiple):
    """
    Test that virtual sizes are correct.
    """

    ctx = ctx_with_gs_limits
    grid_size, local_size = gl_size

    ref = ReferenceIds(grid_size, local_size, gs_is_multiple)

    get_sizes = ctx.compile_static("""
    KERNEL void get_sizes(GLOBAL_MEM int *sizes)
    {
        if (virtual_global_flat_id() > 0) return;

        for (int i = 0; i < 3; i++)
        {
            sizes[i] = virtual_local_size(i);
            sizes[i + 3] = virtual_num_groups(i);
            sizes[i + 6] = virtual_global_size(i);
        }
        sizes[9] = virtual_global_flat_size();
    }
    """, 'get_sizes', ref.global_size, local_size=ref.local_size)

    sizes = ctx.allocate(10, numpy.int32)
    get_sizes(sizes)

    gls = list(ref.global_size) + [1] * (3 - len(ref.global_size))
    ls = list(ref.local_size) + [1] * (3 - len(ref.local_size))
    gs = [min_blocks(g, l) for g, l in zip(gls, ls)]

    ref_sizes = numpy.array(ls + gs + gls + [product(gls)]).astype(numpy.int32)
    assert diff_is_negligible(sizes.get(), ref_sizes)
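min_blocks is not shown in this snippet; judging by its use here it is a ceiling division giving the number of groups of size l needed to cover g global items. A minimal stand-in under that assumption:

def min_blocks(length, block):
    # Hypothetical helper: smallest number of blocks of size `block`
    # covering `length` items (ceiling division).
    return (length + block - 1) // block

assert min_blocks(10, 4) == 3
assert min_blocks(8, 4) == 2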
Example #4
def check_performance(ctx_and_double, shape_and_axes):
    ctx, double = ctx_and_double

    shape, axes = shape_and_axes
    dtype = numpy.complex128 if double else numpy.complex64

    data = get_test_array(shape, dtype)
    data_dev = ctx.to_device(data)
    res_dev = ctx.empty_like(data_dev)

    fft = FFT(ctx).prepare_for(res_dev, data_dev, None, axes=axes)

    attempts = 10
    t1 = time.time()
    for i in range(attempts):
        fft(res_dev, data_dev, -1)
    ctx.synchronize()
    t2 = time.time()
    dev_time = (t2 - t1) / attempts

    fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype)
    assert diff_is_negligible(res_dev.get(), fwd_ref)

    return dev_time, product(shape) * sum([numpy.log2(shape[a])
                                           for a in axes]) * 5
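The operation estimate returned above follows the usual 5 * N * log2(N) convention for a complex FFT, applied over every transformed axis and scaled by the whole array size. An arithmetic illustration with made-up shape and axes (not values from the test suite):

import numpy

shape, axes = (16, 256, 256), (1, 2)
ops = numpy.prod(shape) * sum(numpy.log2(shape[a]) for a in axes) * 5
print(ops)  # 5 * (16 * 256 * 256) * (8 + 8) = 83886080.0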
Example #5
def test_ids(ctx_with_gs_limits, gl_size, gs_is_multiple):
    """
    Test that virtual IDs are correct for each thread.
    """

    ctx = ctx_with_gs_limits
    grid_size, local_size = gl_size

    ref = ReferenceIds(grid_size, local_size, gs_is_multiple)

    get_ids = ctx.compile_static("""
    KERNEL void get_ids(GLOBAL_MEM int *fid,
        GLOBAL_MEM int *lx, GLOBAL_MEM int *ly, GLOBAL_MEM int *lz,
        GLOBAL_MEM int *gx, GLOBAL_MEM int *gy, GLOBAL_MEM int *gz,
        GLOBAL_MEM int *glx, GLOBAL_MEM int *gly, GLOBAL_MEM int *glz)
    {
        VIRTUAL_SKIP_THREADS;
        const int i = virtual_global_flat_id();
        fid[i] = i;
        lx[i] = virtual_local_id(0);
        ly[i] = virtual_local_id(1);
        lz[i] = virtual_local_id(2);
        gx[i] = virtual_group_id(0);
        gy[i] = virtual_group_id(1);
        gz[i] = virtual_group_id(2);
        glx[i] = virtual_global_id(0);
        gly[i] = virtual_global_id(1);
        glz[i] = virtual_global_id(2);
    }
    """, 'get_ids', ref.global_size, local_size=ref.local_size)

    fid = ctx.allocate(product(ref.np_global_size), numpy.int32)
    lx = ctx.allocate(ref.np_global_size, numpy.int32)
    ly = ctx.allocate(ref.np_global_size, numpy.int32)
    lz = ctx.allocate(ref.np_global_size, numpy.int32)
    gx = ctx.allocate(ref.np_global_size, numpy.int32)
    gy = ctx.allocate(ref.np_global_size, numpy.int32)
    gz = ctx.allocate(ref.np_global_size, numpy.int32)
    glx = ctx.allocate(ref.np_global_size, numpy.int32)
    gly = ctx.allocate(ref.np_global_size, numpy.int32)
    glz = ctx.allocate(ref.np_global_size, numpy.int32)

    get_ids(fid, lx, ly, lz, gx, gy, gz, glx, gly, glz)

    assert diff_is_negligible(fid.get(), ref.predict_global_flat_ids())
    assert diff_is_negligible(lx.get(), ref.predict_local_ids(0))
    assert diff_is_negligible(ly.get(), ref.predict_local_ids(1))
    assert diff_is_negligible(lz.get(), ref.predict_local_ids(2))
    assert diff_is_negligible(gx.get(), ref.predict_group_ids(0))
    assert diff_is_negligible(gy.get(), ref.predict_group_ids(1))
    assert diff_is_negligible(gz.get(), ref.predict_group_ids(2))
    assert diff_is_negligible(glx.get(), ref.predict_global_ids(0))
    assert diff_is_negligible(gly.get(), ref.predict_global_ids(1))
    assert diff_is_negligible(glz.get(), ref.predict_global_ids(2))
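ReferenceIds is defined elsewhere in the test suite, but the id functions exercised above are tied together by the standard identity global_id(d) = group_id(d) * local_size(d) + local_id(d). A small numpy illustration of that relation with made-up sizes (not the ReferenceIds predictions themselves):

import numpy

local_size, num_groups = 4, 3
global_ids = numpy.arange(local_size * num_groups)
local_ids = global_ids % local_size    # virtual_local_id
group_ids = global_ids // local_size   # virtual_group_id
assert ((group_ids * local_size + local_ids) == global_ids).all()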
Example #6
def ref_dot(a, b):
    a_batch = product(a.shape[:-2])
    b_batch = product(b.shape[:-2])

    assert a_batch == b_batch or a_batch == 1 or b_batch == 1

    a = a.reshape(a_batch, a.shape[-2], a.shape[-1])
    b = b.reshape(b_batch, b.shape[-2], b.shape[-1])

    out_batch = max(a_batch, b_batch)
    out_shape = (out_batch, a.shape[-2], b.shape[-1])
    out_dtype = numpy.result_type(a.dtype, b.dtype)
    out = numpy.empty(out_shape, out_dtype)

    for i in range(out_batch):
        ai = 0 if a_batch == 1 else i
        bi = 0 if b_batch == 1 else i
        out[i] = numpy.dot(a[ai], b[bi])

    if a_batch == b_batch == 1:
        out = out.reshape(out.shape[-2], out_shape[-1])

    return out
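ref_dot broadcasts a missing batch dimension by reusing index 0, which for equal batch sizes is the same batched product that numpy.matmul computes. A quick sanity check against it, assuming the ref_dot above and using numpy.prod as a stand-in for the product() helper it calls:

import numpy

product = numpy.prod  # stand-in for the product() helper used by ref_dot
a = numpy.random.rand(6, 4, 5)
b = numpy.random.rand(6, 5, 3)
assert numpy.allclose(ref_dot(a, b), numpy.matmul(a, b))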
Example #7
def ref_dot(a, b):
    a_batch = product(a.shape[:-2])
    b_batch = product(b.shape[:-2])

    assert a_batch == b_batch or a_batch == 1 or b_batch == 1

    a = a.reshape(a_batch, a.shape[-2], a.shape[-1])
    b = b.reshape(b_batch, b.shape[-2], b.shape[-1])

    out_batch = max(a_batch, b_batch)
    out_shape = (out_batch, a.shape[-2], b.shape[-1])
    out_dtype = numpy.result_type(a.dtype, b.dtype)
    out = numpy.empty(out_shape, out_dtype)

    for i in range(out_batch):
        ai = 0 if a_batch == 1 else i
        bi = 0 if b_batch == 1 else i
        out[i] = numpy.dot(a[ai], b[bi])

    if a_batch == b_batch == 1:
        out = out.reshape(out.shape[-2], out_shape[-1])

    return out
Example #8
def test_find_local_size(ctx_and_global_size):
    """
    Check that if None is passed as local_size, the kernel can find some local_size to run with
    (not necessarily the optimal one).
    """
    ctx, global_size = ctx_and_global_size

    module = ctx.compile("""
    KERNEL void test(GLOBAL_MEM int *dest)
    {
      const int i = get_global_id(0) +
        get_global_id(1) * get_global_size(0) +
        get_global_id(2) * get_global_size(1) * get_global_size(0);
      dest[i] = i;
    }
    """)
    test = module.test
    dest_dev = ctx.allocate(global_size, numpy.int32)
    test(dest_dev, global_size=global_size)

    assert diff_is_negligible(
        dest_dev.get().ravel(),
        numpy.arange(product(global_size)).astype(numpy.int32))
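The flat index computed in the kernel, i = gx + gy * gsx + gz * gsy * gsx, visits every output cell exactly once, which is why comparing against numpy.arange is enough. A small host-side check of that covering property with made-up sizes:

gsx, gsy, gsz = 4, 3, 2  # made-up global sizes, not the fixture values
flat_ids = sorted(gx + gy * gsx + gz * gsy * gsx
                  for gz in range(gsz) for gy in range(gsy) for gx in range(gsx))
assert flat_ids == list(range(gsx * gsy * gsz))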
Example #9
def test_find_local_size(ctx_and_global_size):
    """
    Check that if None is passed as local_size, the kernel can find some local_size to run with
    (not necessarily the optimal one).
    """
    ctx, global_size = ctx_and_global_size

    module = ctx.compile(
    """
    KERNEL void test(GLOBAL_MEM int *dest)
    {
      const int i = get_global_id(0) +
        get_global_id(1) * get_global_size(0) +
        get_global_id(2) * get_global_size(1) * get_global_size(0);
      dest[i] = i;
    }
    """)
    test = module.test
    dest_dev = ctx.allocate(global_size, numpy.int32)
    test(dest_dev, global_size=global_size)

    assert diff_is_negligible(dest_dev.get().ravel(),
        numpy.arange(product(global_size)).astype(numpy.int32))
Example #10
def check_performance(ctx_and_double, shape_and_axes):
    ctx, double = ctx_and_double

    shape, axes = shape_and_axes
    dtype = numpy.complex128 if double else numpy.complex64

    data = get_test_array(shape, dtype)
    data_dev = ctx.to_device(data)
    res_dev = ctx.empty_like(data_dev)

    fft = FFT(ctx).prepare_for(res_dev, data_dev, None, axes=axes)

    attempts = 10
    t1 = time.time()
    for i in range(attempts):
        fft(res_dev, data_dev, -1)
    ctx.synchronize()
    t2 = time.time()
    dev_time = (t2 - t1) / attempts

    fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype)
    assert diff_is_negligible(res_dev.get(), fwd_ref)

    return dev_time, product(shape) * sum([numpy.log2(shape[a]) for a in axes]) * 5
Example #11
def predict_global_flat_ids(self):
    return numpy.arange(product(self.np_global_size)).astype(numpy.int32)
Example #12
def pytest_generate_tests(metafunc):

    perf_log_shapes = [
        (4, ),
        (10, ),
        (13, ),  # 1D
        (4, 4),
        (7, 7),
        (10, 10),  # 2D
        (4, 4, 4),
        (5, 5, 7),
        (7, 7, 7),  # 3D
    ]
    perf_mem_limit = 4 * 2**20

    if 'shape_and_axes' in metafunc.funcargnames:
        shapes = []

        for x in [3, 8, 9, 10, 11, 12, 13, 20]:
            shapes.append((2**x, ))

        for x, y in itertools.product([4, 7, 8, 10], [4, 7, 8, 10]):
            shapes.append((2**x, 2**y))

        for x, y, z in itertools.product([4, 7, 10], [4, 7, 10], [4, 7, 10]):
            shapes.append((2**x, 2**y, 2**z))

        batch_sizes = [1, 16, 128, 1024, 4096]

        mem_limit = 2**20

        vals = []
        ids = []
        for shape, batch in itertools.product(shapes, batch_sizes):
            if product(shape) * batch <= mem_limit:
                if batch == 1:
                    vals.append((shape, None))
                else:
                    vals.append(((batch,) + shape, tuple(range(1, len(shape) + 1))))
                ids.append(str(batch) + "x" + str(shape))

        metafunc.parametrize('shape_and_axes', vals, ids=ids)

    elif 'non2batch_shape_and_axes' in metafunc.funcargnames:

        def idgen(shape_and_axes):
            shape, axes = shape_and_axes
            assert len(axes) == 1
            outer_batch = shape[:axes[0]]
            inner_batch = shape[axes[0] + 1:]
            return ((str(outer_batch) + "x") if len(outer_batch) > 0 else "") + \
                str(shape[axes[0]]) + "x" + str(inner_batch)

        vals = [((17, 16), (1, )), ((177, 256), (1, )), ((39, 16, 7), (1, )),
                ((17, 16, 131), (1, )), ((7, 1024, 11), (1, )),
                ((5, 1024, 57), (1, ))]

        metafunc.parametrize('non2batch_shape_and_axes',
                             vals,
                             ids=list(map(idgen, vals)))

    elif 'non2problem_shape_and_axes' in metafunc.funcargnames:

        def idgen(non2problem_shape_and_axes):
            shape, axes = non2problem_shape_and_axes
            return str(shape) + 'over' + str(axes)

        vals = [((17, 15), (1, )), ((17, 17), (1, )), ((19, 4095), (1, )),
                ((19, 4097), (1, )), ((39, 31, 7), (1, )),
                ((39, 33, 7), (1, )), ((3, 255, 7), (1, )),
                ((3, 257, 7), (1, )), ((17, 200, 131), (0, 1)),
                ((7, 1000, 11), (1, 2)), ((15, 900, 57), (0, 1, 2))]

        metafunc.parametrize('non2problem_shape_and_axes',
                             vals,
                             ids=list(map(idgen, vals)))

    elif 'perf_shape_and_axes' in metafunc.funcargnames:

        vals = []
        ids = []
        for log_shape in perf_log_shapes:
            shape = tuple(2**x for x in log_shape)
            batch = perf_mem_limit // (2**sum(log_shape))
            vals.append(((batch, ) + shape, tuple(range(1, len(shape) + 1))))
            ids.append(str(batch) + "x" + str(shape))

        metafunc.parametrize('perf_shape_and_axes', vals, ids=ids)

    elif 'non2problem_perf_shape_and_axes' in metafunc.funcargnames:

        vals = []
        ids = []
        for log_shape in perf_log_shapes:
            for modifier in (1, -1):
                shape = tuple(2**(x - 1) + modifier for x in log_shape)
                batch = perf_mem_limit // (2**sum(log_shape))
                vals.append(((batch,) + shape, tuple(range(1, len(shape) + 1))))
                ids.append(str(batch) + "x" + str(shape))

        metafunc.parametrize('non2problem_perf_shape_and_axes', vals, ids=ids)
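pytest_generate_tests parametrizes any test function that requests one of these argument names. A hypothetical consumer of perf_shape_and_axes (the context fixture name here is only a placeholder) would simply unpack the generated value:

def test_fft_performance(ctx, perf_shape_and_axes):
    shape, axes = perf_shape_and_axes
    # the generated shape is (batch,) + fft_shape, with every non-batch axis transformed
    assert len(shape) == len(axes) + 1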
Example #13
def pytest_generate_tests(metafunc):

    perf_log_shapes = [
        (4,), (10,), (13,), # 1D
        (4, 4), (7, 7), (10, 10), # 2D
        (4, 4, 4), (5, 5, 7), (7, 7, 7)] # 3D
    perf_mem_limit = 4 * 2**20

    if 'shape_and_axes' in metafunc.funcargnames:
        shapes = []

        for x in [3, 8, 9, 10, 11, 12, 13, 20]:
            shapes.append((2 ** x,))

        for x, y in itertools.product([4, 7, 8, 10], [4, 7, 8, 10]):
            shapes.append((2 ** x, 2 ** y))

        for x, y, z in itertools.product([4, 7, 10], [4, 7, 10], [4, 7, 10]):
            shapes.append((2 ** x, 2 ** y, 2 ** z))

        batch_sizes = [1, 16, 128, 1024, 4096]

        mem_limit = 2 ** 20

        vals = []
        ids = []
        for shape, batch in itertools.product(shapes, batch_sizes):
            if product(shape) * batch <= mem_limit:
                if batch == 1:
                    vals.append((shape, None))
                else:
                    vals.append(((batch,) + shape, tuple(range(1, len(shape) + 1))))
                ids.append(str(batch) + "x" + str(shape))

        metafunc.parametrize('shape_and_axes', vals, ids=ids)

    elif 'non2batch_shape_and_axes' in metafunc.funcargnames:
        def idgen(shape_and_axes):
            shape, axes = shape_and_axes
            assert len(axes) == 1
            outer_batch = shape[:axes[0]]
            inner_batch = shape[axes[0]+1:]
            return ((str(outer_batch) + "x") if len(outer_batch) > 0 else "") + \
                str(shape[axes[0]]) + "x" + str(inner_batch)

        vals = [
            ((17, 16), (1,)),
            ((177, 256), (1,)),
            ((39, 16, 7), (1,)),
            ((17, 16, 131), (1,)),
            ((7, 1024, 11), (1,)),
            ((5, 1024, 57), (1,))]

        metafunc.parametrize('non2batch_shape_and_axes', vals, ids=list(map(idgen, vals)))

    elif 'non2problem_shape_and_axes' in metafunc.funcargnames:

        def idgen(non2problem_shape_and_axes):
            shape, axes = non2problem_shape_and_axes
            return str(shape) + 'over' + str(axes)

        vals = [
            ((17, 15), (1,)),
            ((17, 17), (1,)),
            ((19, 4095), (1,)),
            ((19, 4097), (1,)),
            ((39, 31, 7), (1,)),
            ((39, 33, 7), (1,)),
            ((3, 255, 7), (1,)),
            ((3, 257, 7), (1,)),
            ((17, 200, 131), (0, 1)),
            ((7, 1000, 11), (1, 2)),
            ((15, 900, 57), (0, 1, 2))]

        metafunc.parametrize('non2problem_shape_and_axes', vals, ids=list(map(idgen, vals)))

    elif 'perf_shape_and_axes' in metafunc.funcargnames:

        vals = []
        ids = []
        for log_shape in perf_log_shapes:
            shape = tuple(2 ** x for x in log_shape)
            batch = perf_mem_limit // (2 ** sum(log_shape))
            vals.append(((batch,) + shape, tuple(range(1, len(shape) + 1))))
            ids.append(str(batch) + "x" + str(shape))

        metafunc.parametrize('perf_shape_and_axes', vals, ids=ids)

    elif 'non2problem_perf_shape_and_axes' in metafunc.funcargnames:

        vals = []
        ids = []
        for log_shape in perf_log_shapes:
            for modifier in (1, -1):
                shape = tuple(2 ** (x - 1) + modifier for x in log_shape)
                batch = perf_mem_limit // (2 ** sum(log_shape))
                vals.append(((batch,) + shape, tuple(range(1, len(shape) + 1))))
                ids.append(str(batch) + "x" + str(shape))

        metafunc.parametrize('non2problem_perf_shape_and_axes', vals, ids=ids)
Example #14
def set_shape(self, shape):
    self._shape = shape
    if shape is None:
        self._size = None
    else:
        self._size = product(shape)
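product() itself is used throughout these examples but never shown; from its uses (total element count of a shape, with an empty shape giving 1) it is presumably a plain multiplicative reduction. A minimal stand-in under that assumption:

from functools import reduce
import operator

def product(seq):
    # Multiply all elements together; an empty sequence gives 1,
    # matching product(shape[:-2]) == 1 for 2D arrays in ref_dot above.
    return reduce(operator.mul, seq, 1)

assert product((4, 5)) == 20
assert product(()) == 1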