Example #1
def test_memoize_kernel():
    # thread = Thread(profile=True)
    ary_a = np.ones(int(1e3))
    ary_b = np.zeros(ary_a.shape)
    ary_a_buffer = to_device(ary_a)
    ary_b_buffer = to_device(ary_b)
    n_recompilations = 100
    for i in range(n_recompilations + 1):
        kernels = []
        for j in range(10):
            some_knl = Kernel(
                f'some_knl_{j}', {
                    'ary_a': Global(ary_a_buffer),
                    'ary_b': Global(ary_b_buffer)
                }, """
                     ary_b[get_global_id(0)] = ary_a[get_global_id(0)];
                     """)
            kernels.append(some_knl)
        Program(kernels=kernels).compile()
        some_knl(global_size=ary_a.shape)
        if i == 1:
            t = time.time()
    time_per_recompile = (time.time() - t) / n_recompilations
    # thread.queue.get_profiler().show_histogram_cumulative_kernel_times()
    print(time_per_recompile)
    assert time_per_recompile < 0.001  # less than 1 ms overhead per recompilation achieved through caching
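# Illustration (assumption, not the library's actual implementation): the
# sub-millisecond figure asserted above relies on memoizing compiled kernels.
# A minimal sketch of such a cache, keyed by a hash of the kernel source,
# where build_fn stands for whatever routine actually compiles the source:
import hashlib

_program_cache = {}

def compile_cached(source: str, build_fn):
    key = hashlib.sha256(source.encode()).hexdigest()
    if key not in _program_cache:
        _program_cache[key] = build_fn(source)  # compile only on a cache miss
    return _program_cache[key]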
def test_copy_array_region_on_device_between_buffers():
    ary_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.complex64)
    in_cl = to_device(ary_np)
    out_np = np.zeros(shape=(4, 4), dtype=in_cl.dtype)
    out_cl = to_device(out_np)
    copy_region = CopyArrayRegion(in_buffer=in_cl,
                                  region_in=Slice[:1, 1:3],
                                  out_buffer=out_cl,
                                  region_out=Slice[1:2, 2:4])
    out = copy_region()
    out_np[1:2, 2:4] = in_cl.get()[0:1, 1:3]
    assert np.all(out.get() == out_np)
def test_two_input_integer_functions(name, dtype):
    a_cl = to_device(np.ones((10, ), dtype))
    a_emulation = to_device(np.ones((10, ), dtype))
    knl = Kernel(f'knl_{name}', {
        'a': Global(a_cl),
        'num': Scalar(dtype(0))
    },
                 f'a[get_global_id(0)]={name}(a[get_global_id(0)], num);',
                 global_size=a_cl.shape)
    knl.compile()(a=a_cl)
    knl.compile(emulate=True)(a=a_emulation)
    assert np.all(a_cl.get() == a_emulation.get())
Example #4
def test_get_refreshed_argument_of_memoized_kernel():
    for i in range(10):
        ary_a = np.ones(100) + i
        ary_b = np.zeros(100)
        some_knl = Kernel(
            'some_knl', {
                'ary_a': Global(to_device(ary_a)),
                'ary_b': Global(to_device(ary_b))
            }, """
                 ary_b[get_global_id(0)] = ary_a[get_global_id(0)];
                 """).compile()
        some_knl(global_size=ary_a.shape)
    assert np.all(some_knl.ary_b.get() == ary_a)
def test_debug_kernel_with_barriers():
    buff = np.zeros(shape=(2, 4)).astype(Types.int)
    mem_buffer = to_device(buff)
    knl = Kernel('knl', {'mem_glob': Global(mem_buffer)},
                 """
        __local int mem[2];
        mem[0]=0;
        mem[1]=0;
        mem[get_local_id(1)] = get_local_id(1);
        barrier(CLK_LOCAL_MEM_FENCE);
        mem[get_local_id(1)] = mem[1];
        //barrier(CLK_GLOBAL_MEM_FENCE);
        mem_glob[get_global_id(0)*get_global_size(1)+get_global_id(1)] = mem[get_local_id(1)];
    """,
                 global_size=(2, 4),
                 local_size=(1, 2))
    compiled_cl = knl.compile(
        emulate=False,
        file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_cl()
    mem_buffer_py = zeros_like(mem_buffer)
    compiled_py = knl.compile(
        emulate=True, file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_py(mem_glob=mem_buffer_py)
    assert np.all(mem_buffer.get() == mem_buffer_py.get())
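# Illustration (assumption, not part of the test): a pure-NumPy walk-through of
# the barrier kernel above for global_size=(2, 4), local_size=(1, 2). After the
# barrier each work item reads mem[1] == 1, so the whole output becomes ones.
import numpy as np

def emulate_barrier_kernel(global_size=(2, 4), local_size=(1, 2)):
    mem_glob = np.zeros(global_size, dtype=np.int32)
    groups = (global_size[0] // local_size[0], global_size[1] // local_size[1])
    for g0 in range(groups[0]):
        for g1 in range(groups[1]):
            mem = np.zeros(2, dtype=np.int32)  # __local int mem[2];
            for lid1 in range(local_size[1]):
                mem[lid1] = lid1               # before the barrier: mem == [0, 1]
            # barrier(CLK_LOCAL_MEM_FENCE): all writes above are now visible
            for lid1 in range(local_size[1]):
                mem[lid1] = mem[1]             # every slot becomes 1
            for lid1 in range(local_size[1]):
                gid0, gid1 = g0 * local_size[0], g1 * local_size[1] + lid1
                mem_glob[gid0, gid1] = mem[lid1]
    return mem_glob  # expected: np.ones((2, 4))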
def test_transpose_complex():
    ary_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=Types.cfloat)
    ary_np = np.hstack([ary_np, ary_np])  # array([[1, 2, 3, 1, 2, 3], [4, 5, 6, 4, 5, 6]])
    in_cl = to_device(ary_np)
    transpose_in_buffer = Transpose(in_cl, (1, 0))
    out_cl = transpose_in_buffer()
    assert np.all(out_cl.get() == ary_np.T)
def test_copy_array_region_on_device_given_axis_index():
    ary_np = np.ones(shape=(5, 10), dtype=np.int32)
    in_cl = to_device(ary_np)
    copy_region = CopyArrayRegion(in_buffer=in_cl, region_in=Slice[2:-1, :])
    out = copy_region()
    out_np = in_cl.get()[2:-1, :]
    assert np.all(out.get() == out_np)
def test_copy_array_region_on_device_negative_indexing():
    ary_np = np.ones(shape=(5, 10), dtype=np.int32)
    in_cl = to_device(ary_np)
    copy_region = CopyArrayRegion(in_buffer=in_cl, region_in=Slice[:, 2:-1])
    out = copy_region()
    out_np = in_cl.get()[:, 2:-1]
    assert np.all(out.get() == out_np)
Example #9
def test_multiple_command_queues():
    queue1 = create_queue(device_id=0)
    queue2 = create_queue(context=queue1.context)
    ary_a = to_device(np.ones(100000) + 1, queue1)
    ary_b = to_device(np.zeros(100000), queue1)
    some_knl = Kernel('some_knl', {
        'ary_a': Global(ary_a),
        'ary_b': Global(ary_b)
    },
                      """
             ary_b[get_global_id(0)] += ary_a[get_global_id(0)];
             """,
                      global_size=ary_a.shape).compile(queue2.context)
    some_knl(queue=queue2)
    # queue2.finish()
    some_knl(queue=queue1)
    test = 0
Example #10
def test_sum_along_axis_1d():
    ary = np.array([1, 2, 3])
    ary_buffer = to_device(ary)

    sum_along_axis = SumAlongAxis(ary_buffer, axis=0)
    # res_py = sum_along_axis(emulate=True).get()
    res_cl = sum_along_axis().get()
    ref = ary.sum(axis=0)
    assert np.all(res_cl == ref)
def test_copy_array_region_on_device():
    ary_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.complex64)
    in_cl = to_device(ary_np)
    copy_region = CopyArrayRegion(in_cl, region_in=Slice[0:1, 1:3])

    out = copy_region()

    out_np = in_cl.get()[0:1, 1:3]

    assert np.all(out.get() == out_np)
Example #12
def test_sum_along_axis():
    ary = np.array([[1, 2, 3], [1, 2, 3]])
    ary_buffer = to_device(ary)
    """"
    sum_along_axis = SumAlongAxis(ary_buffer, axis=0)
    res = sum_along_axis().get()
    ref = ary.sum(axis=0)
    assert np.all(res == ref)
    """

    sum_along_axis = SumAlongAxis(ary_buffer, axis=1)
    res = sum_along_axis().get()
    ref = ary.sum(axis=1)
    assert np.all(res == ref)
Example #13
def test_ifft(in_data_np):
    in_ = to_device(in_data_np)
    fft_cl = Fft(in_buffer=in_)
    fft_in_ = fft_cl()

    ref_ifft_fft_in_ = np.fft.ifft(fft_in_.get(), axis=-1)
    ifft_cl = IFft(fft_in_)
    ifft_fft_in_ = ifft_cl().get()
    if in_data_np.size < 1024:
        ifft_cl_py = IFft(fft_in_, emulate=True)
        py_np_ifft_fft_in_ = ifft_cl_py().get()
        a = py_np_ifft_fft_in_
        b = ifft_fft_in_
        assert np.allclose(a, b)
    assert np.allclose(ifft_fft_in_, ref_ifft_fft_in_)
 def eval_code(emulate=False):
     data = to_device(np.array([0]).astype(Types.char))
     knl = Kernel('knl_test_for_loop', {
         'data': Global(data)
     },
                  """
         ${header}{
             data[0]+=i;
         }
     """,
                  replacements={
                      'header': header
                  },
                  global_size=data.shape).compile(emulate=emulate)
     knl()
     get_current_queue().finish()
     res = knl.data.get()
     return res
def test_access_complex_variable():
    buff = np.array([0.5]).astype(Types.cfloat)
    buff_in = to_device(buff)
    buff_out = zeros_like(buff_in)
    knl = Kernel('knl', {
        'inp': Global(buff_in),
        'out': Global(buff_out)
    },
                 """
        out[get_global_id(0)].real = inp[get_global_id(0)].real; 
    """,
                 global_size=(1, ))
    compiled_cl = knl.compile(
        emulate=False,
        file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_cl()
    buff_out_py = zeros_like(buff_in)
    compiled_py = knl.compile(
        emulate=True, file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    # out[0] = complex64(inp[0].real+out[0].imag*1j) instead of out[0].real=inp[0].real
    compiled_py(out=buff_out_py)
    assert np.all(buff_out.get() == buff_out_py.get())
Example #16
def test_add_functions_inside_function_or_kernel_definition():
    ary_a = to_device(np.ones(100))
    fnc_add3 = Function('add_three', {'a': Scalar(Types.int)},
                        'return a + 3;',
                        returns=Types.int)
    fnc_add5 = Function('add_five', {'a': Scalar(Types.int)},
                        """
             return add_three(a)+2;
             """,
                        functions=[fnc_add3],
                        returns=Types.int)
    some_knl = Kernel('some_knl', {'ary_a': Global(ary_a)},
                      """
             ary_a[get_global_id(0)] = add_five(ary_a[get_global_id(0)]);
             """,
                      global_size=ary_a.shape,
                      functions=[fnc_add5])
    functions = []  # functions defined here have higher priority in case of name conflicts
    Program(functions=functions, kernels=[some_knl]).compile()
    some_knl()
    assert ary_a.get()[0] == 6
 def run(emulate=False):
     ary = to_device(np.ones(10).astype(data_t))
     local_mem = LocalArray(
         dtype=data_t,
         shape=5)  # 5 is to test that the local array argument is changed
     knl = Kernel('knl_local_arg', {
         'ary': Global(ary),
         'local_mem': local_mem
     },
                  """
            int offset = get_group_id(0)*get_local_size(0);
            for(int i=0; i<5; i++) local_mem[i] = ary[offset + i];
            barrier(CLK_LOCAL_MEM_FENCE);
            data_t sum = (data_t)(0);
            for(int i=0; i<5; i++) sum+=local_mem[i];
            ary[get_global_id(0)] = sum;
                  """,
                  type_defs={'data_t': data_t},
                  global_size=ary.shape,
                  local_size=(5, ))
     local_mem = LocalArray(dtype=data_t, shape=5)
     knl.compile(emulate=emulate)(local_mem=local_mem)
     return ary.get()
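# For reference (an assumption about the kernel above, not part of the test):
# each work group of 5 items loads its elements into local memory and writes
# their sum back to every element, so for a floating-point data_t ones(10)
# turns into an array of fives.
import numpy as np

ary_np = np.ones(10)
group = 5
expected = np.repeat(ary_np.reshape(-1, group).sum(axis=1), group)
# expected == [5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]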
Example #18
def test_fft(in_data_np):
    atol = 1e-4 if in_data_np.dtype == Types.cfloat else 1e-8
    import numpy as np

    in_data_cl = to_device(in_data_np)

    fft_cl = Fft(in_data_cl, emulate=False)

    # zero-pad the data to the next power of two for the numpy reference
    axis = 1
    N = in_data_np.shape[axis]
    if not np.log2(N).is_integer():  # if not power of 2, pad accordingly
        N = 2**int(np.log2(N) + 1)
    in_data_np_power_of_two = np.zeros((in_data_np.shape[0], N),
                                       in_data_np.dtype)
    in_data_np_power_of_two[:, :in_data_np.shape[axis]] = in_data_np

    def measure(call):
        attempts = 3
        ts = []
        for i in range(attempts):
            t1 = time.time()
            call()
            t2 = time.time()
            ts.append(t2 - t1)
        return min(ts)

    # import pyfftw
    # t_fftw = measure(lambda: pyfftw.interfaces.numpy_fft.fft(in_data_np_power_of_two, axis=-1))
    t_np = measure(lambda: np.fft.fft(in_data_np_power_of_two, axis=-1))
    fft_in_data_np = np.fft.fft(in_data_np_power_of_two, axis=-1)

    def fft_call():
        fft_in_data_cl = fft_cl()
        fft_in_data_cl.queue.finish()

    t_cl = measure(fft_call)
    fft_in_data_cl = fft_cl()

    if in_data_np.size < 1024:
        # Test against emulation (only for small inputs, since emulation is slower)
        use_existing_file_for_emulation(False)
        fft_cl_py = Fft(in_data_cl, emulate=True)

        fft_in_data_cl_py = fft_cl_py()
        a = fft_in_data_cl_py.get().view(Types.cdouble)
        b = fft_in_data_cl.get().view(Types.cdouble)
        c = fft_in_data_np.view(Types.cdouble)
        assert np.allclose(a, b)
        assert np.allclose(c, b)
        assert np.allclose(c, a)

    # import matplotlib.pyplot as plt
    # plt.plot(fft_in_data_np.flatten())
    # plt.plot(fft_in_data_cl_emulation.get().flatten())
    # plt.show()
    assert np.allclose(fft_in_data_np, fft_in_data_cl.get(), atol=atol)
    # benchmark using reikna
    if False:  # change to True to run against reikna's fft. Note: Reikna takes considerable optimization time before running
        from reikna.cluda import any_api
        from reikna.fft import FFT
        import numpy
        api = any_api()
        thr = api.Thread.create()
        data = in_data_np
        dtype = data.dtype
        axes = (1, )
        fft = FFT(data, axes=axes)
        fftc = fft.compile(thr)
        data_dev = thr.to_device(data)
        res_dev = thr.empty_like(data_dev)
        attempts = 3  # `attempts` above is local to measure(), so define it here
        ts = []
        for i in range(attempts):
            t1 = time.time()
            fftc(res_dev, data_dev)
            thr.synchronize()
            t2 = time.time()
            ts.append(t2 - t1)
        fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype)
        tnp = time.time()
        fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype)
        tnp = time.time() - tnp
        # numpy.fft.fftn(data[:, :, 0], axes=(1,))
        treikna_min = min(ts)
        assert np.allclose(fft_in_data_np, res_dev.get())
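# Worked example of the zero-padding step in test_fft above (illustration): for
# a signal of length 6 the next power of two is 8, so two zero columns are
# appended before comparing against numpy's reference FFT.
import numpy as np

N = 6
if not np.log2(N).is_integer():
    N = 2**int(np.log2(N) + 1)  # N becomes 8
data = np.arange(12, dtype=np.complex64).reshape(2, 6)
padded = np.zeros((data.shape[0], N), data.dtype)
padded[:, :data.shape[1]] = data  # trailing two columns stay zero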
def test_transpose_single_dimension():
    ary_np = np.array([[1, 2, 3]], dtype=Types.int).T
    in_cl = to_device(ary_np)
    transpose_on_input_buffer = Transpose(in_cl, (1, 0))
    out_cl = transpose_on_input_buffer()
    assert np.all(out_cl.get() == ary_np.T)
def cl_set(array: Array, region: TypeSliceFormatCopyArrayRegion, value):
    """
    example usage:
    set slice of array with scalar value
    val = 1
    cl_set(ary, Slice[:,2:3], val)

    set slice of array with equally shaped numpy array like the slice
    some_np_array = np.array([[3, 4]])
    cl_set(ary, Slice[1:2,2:3], some_np_array)


    :param array:
    :param region:
    :param value:
    :return:
    """
    # todo: check that array is C-contiguous
    region_arg = region
    # todo: if the requested region is a contiguous block of memory, it could be
    # set directly via a staged numpy buffer instead of launching a kernel
    region = CopyArrayRegion._deal_with_incomplete_regions(region_arg, array)
    region = CopyArrayRegion._deal_with_none_in_stop(region, array)
    region = CopyArrayRegion._deal_with_negative_region(region, array)

    # test that the requested region lies within the array bounds
    for axis, _slice in enumerate(region):
        step_width = _slice[2]
        if abs(_slice[0] * step_width) > array.shape[axis] or abs(_slice[1] * step_width) > array.shape[axis]:
            raise ValueError('Slicing out of array bounds')
    if any([(part[0] - part[1]) == 0 for part in region]):  # return early if any slice is empty
        return
    global_size = np.prod([part[1] - part[0] for part in region])
    target_shape = to_device(np.array(array.shape).astype(Types.int))
    offset_target = to_device(np.array([part[0] for part in region]).astype(Types.int))
    source_shape = to_device(np.array([part[1] - part[0] for part in region]).astype(Types.int))
    source_n_dims = len(source_shape)

    if isinstance(value, np.ndarray):
        source = to_device(value.astype(array.dtype))
        arg_source = Global(source)
        code_source = 'source[get_global_id(0)]'
    else:
        arg_source = Scalar(array.dtype)
        source = value
        code_source = 'source'

    knl = Kernel('set_cl_array',
                 {'target': Global(array),
                  'target_shape': Global(target_shape),
                  'offset_target': Global(offset_target),
                  'source': arg_source,
                  'source_shape': Global(source_shape),
                  'source_n_dims': Scalar(Types.int)},
                 """
   // id_source = get_global_id(0)
   // id_source points to the element of array source that replaces the element with id_target in array target.
   // We need to compute id_target from id_source.
   // Assuming C-contiguous addressing:
   // id_source = id0*s1*s2*s3 + id1*s2*s3 + id2*s3 + id3 (si: shape of the source array along dim i)
   // First, recover the individual ids of the source array from id_source:
   // id3 = int(gid % s3),  temp = (gid - id3)/s3
   // id2 = int(temp % s2), temp = (temp - id2)/s2
   // id1 = int(temp % s1), temp = (temp - id1)/s1
   // id0 = int(temp % s0)
   // Finally, determine the id of the target array and copy the element to the corresponding position:
   // id_target = (id0+offset0t)*s1t*s2t*s3t + ... (sxt: shape of the target array along dim x)
   int id_target = 0; // to be iteratively computed from global id, slice dimensions and ary dimensions

   int temp = get_global_id(0);
   int prod_source_id_multiplier = 1;
   int prod_target_id_multiplier = 1;

   for(int i=source_n_dims-1; i>=0; i--){ // i=i_axis_source
    int id_source = temp % source_shape[i];
    temp = (int)((temp-id_source)/source_shape[i]);
    prod_source_id_multiplier *= source_shape[i];
    id_target += (offset_target[i]+id_source)*prod_target_id_multiplier;
    prod_target_id_multiplier *= target_shape[i];
   }
   target[id_target] = ${source};
                   """,
                 replacements={'addr': Helpers.command_compute_address(array.ndim),
                               'source': code_source},
                 global_size=(global_size,)
                 ).compile(array.context, emulate=False)
    knl(source=source, source_n_dims=source_n_dims)
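# Usage sketch for cl_set, mirroring the docstring above (illustrative values;
# to_device, Slice and Types are assumed to come from the surrounding library):
ary = to_device(np.zeros((4, 4), dtype=Types.int))
cl_set(ary, Slice[:, 2:3], 1)                     # set a slice to a scalar
cl_set(ary, Slice[1:2, 2:4], np.array([[3, 4]]))  # or to an equally shaped numpy array
ref = np.zeros((4, 4), dtype=Types.int)
ref[:, 2:3] = 1
ref[1:2, 2:4] = np.array([[3, 4]])
assert np.all(ary.get() == ref)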
Example #21
 def deal_with_np_arrays(v):
     if isinstance(v, Global) and isinstance(v.default, np.ndarray):
         v.default = to_device(ary=v.default, queue=queue)
     return v