def main(dtype):
    # Benchmark the 2-term linear-combination kernel (z = a*x + b*y) over
    # problem sizes 2**10 .. 2**25, timing 20 back-to-back launches each.
    from pycuda.elementwise import get_linear_combination_kernel
    from pycuda.curandom import rand

    kernel, texrefs = get_linear_combination_kernel(
            ((True, dtype, dtype), (True, dtype, dtype)), dtype)

    for exponent in range(10, 26):
        n = 1 << exponent

        # Scalar coefficients live on the GPU; bound via texture references below.
        coeff_a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        vec_x = rand(n, dtype=dtype)
        coeff_b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        vec_y = rand(n, dtype=dtype)
        out = gpuarray.empty_like(vec_x)

        start_evt = drv.Event()
        stop_evt = drv.Event()
        start_evt.record()

        for _ in range(20):
            # allow_double_hack: texrefs cannot hold doubles natively,
            # so the binding splits them — presumably why the flag is set.
            coeff_a.bind_to_texref_ext(texrefs[0], allow_double_hack=True)
            coeff_b.bind_to_texref_ext(texrefs[1], allow_double_hack=True)
            kernel.prepared_call(
                    vec_x._grid, vec_x._block,
                    vec_x.gpudata, vec_y.gpudata, out.gpudata,
                    vec_x.mem_size)

        stop_evt.record()
        stop_evt.synchronize()

        # size, log2(size), elapsed GPU time in milliseconds
        print(n, exponent, stop_evt.time_since(start_evt))
def main(dtype):
    """Benchmark the 2-term linear-combination kernel (z = a*x + b*y).

    Sweeps problem sizes 2**10 .. 2**25 and prints, for each size,
    the size, its exponent, and the GPU time for 20 launches (ms).
    """
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel((
            (True, dtype, dtype),
            (True, dtype, dtype)
            ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        # Scalar coefficients live on the GPU and are bound to texture
        # references before each launch.
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)
        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block,
                x.gpudata, y.gpudata, z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        # BUG FIX: was a Python-2 `print size, ...` statement, which is a
        # SyntaxError on Python 3; now a print() call, consistent with the
        # sibling main() variant in this file.
        print(size, size_exp, stop.time_since(start))
def __init__(self, result_dtype, scalar_dtype, sample_vec, arg_count, pool=None):
    """Prepare an arg_count-term linear-combination operation.

    Launch geometry (block/grid), shape, and element count are cached
    from *sample_vec*; the kernel takes arg_count scalar/vector pairs,
    with scalars passed directly (not via textures).

    :param result_dtype: dtype of the combination's result.
    :param scalar_dtype: dtype of each scalar coefficient.
    :param sample_vec: representative GPUArray supplying shape/launch info.
    :param arg_count: number of (scalar, vector) terms to combine.
    :param pool: optional memory pool; its allocate() is used when given.
    """
    from pycuda.elementwise import get_linear_combination_kernel

    # Cache everything needed to launch against same-shaped vectors later.
    self.vector_dtype = sample_vec.dtype
    self.result_dtype = result_dtype
    self.shape = sample_vec.shape
    self.block = sample_vec._block
    self.grid = sample_vec._grid
    self.mem_size = sample_vec.mem_size

    # One (on_gpu=False, scalar_dtype, vector_dtype) descriptor per term.
    term_descriptors = ((False, scalar_dtype, self.vector_dtype),) * arg_count
    self.kernel, _ = get_linear_combination_kernel(
            term_descriptors, result_dtype)

    self.allocator = pool.allocate if pool else None
def make_lc2_kernel(self, dtype, a_is_gpu, b_is_gpu):
    """Return a (kernel, texrefs) pair computing a 2-term linear combination.

    Each flag selects whether the corresponding scalar coefficient is
    fetched from GPU memory (True) or passed as a plain argument (False).
    """
    from pycuda.elementwise import get_linear_combination_kernel

    term_specs = (
            (a_is_gpu, dtype, dtype),
            (b_is_gpu, dtype, dtype),
            )
    return get_linear_combination_kernel(term_specs, dtype)
def make_lc2_kernel(self, dtype, a_is_gpu, b_is_gpu):
    """Build the kernel for a two-argument linear combination in *dtype*.

    a_is_gpu / b_is_gpu indicate, per coefficient, whether it resides on
    the GPU; the result of get_linear_combination_kernel is returned
    unchanged (kernel plus its texture references).
    """
    from pycuda.elementwise import get_linear_combination_kernel
    descr_a = (a_is_gpu, dtype, dtype)
    descr_b = (b_is_gpu, dtype, dtype)
    return get_linear_combination_kernel((descr_a, descr_b), dtype)