def __init__(self, shape, dtype, prealloc):
    self.device = cuda.get_current_device()
    self.freelist = deque()
    self.events = {}
    # Preallocate device buffers up front; pair each buffer with a
    # non-timing event used to track when it is safe to reuse.
    for i in range(prealloc):
        gpumem = cuda.device_array(shape=shape, dtype=dtype)
        self.freelist.append(gpumem)
        self.events[gpumem] = cuda.event(timing=False)
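# A hypothetical sketch of how such a pool could hand buffers out and take
# them back (get/put are assumed names, not shown above): the deque acts as a
# freelist, and each buffer's recorded event marks when earlier work on that
# buffer has finished, so a new borrower can safely wait on it.
def get(self):
    gpumem = self.freelist.popleft()
    # Wait until any outstanding work recorded on this buffer has completed.
    self.events[gpumem].synchronize()
    return gpumem

def put(self, gpumem, stream=0):
    # Record the point in the stream after which the buffer may be reused.
    self.events[gpumem].record(stream=stream)
    self.freelist.append(gpumem)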
generator._compute_cuda_dims(data)
gridDim = generator._gridDim
blockDim = generator._blockDim
print "grid: ", gridDim
print "block: ", blockDim

# Transfer the input data and current centroids to the device; allocate
# device buffers for the label and distance outputs.
dData = cuda.to_device(data)
dCentroids = cuda.to_device(generator.centroids)
dLabels = cuda.device_array_like(generator.labels)
dDists = cuda.device_array_like(generator._dists)

# Time the labeling kernel with CUDA events.
startE = cuda.event()
endE = cuda.event()
startE.record()
_cu_label_kernel_dists[gridDim, blockDim](dData, dCentroids, dLabels, dDists)
endE.record()
endE.synchronize()
print cuda.event_elapsed_time(startE, endE)

# Time the device-to-host copies of the results separately.
startE.record()
dDists.copy_to_host(ary=generator._dists)
labels = dLabels.copy_to_host(ary=generator.labels)
endE.record()
endE.synchronize()
print cuda.event_elapsed_time(startE, endE)
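# The kernel being timed above, _cu_label_kernel_dists, is not shown in this
# snippet. A hypothetical sketch of a labeling kernel with that argument
# shape (one thread per point, nearest centroid by squared distance); the
# name, signature, and dtypes here are assumptions, not the actual kernel:
from numba import cuda


@cuda.jit('void(float32[:, :], float32[:, :], int32[:], float32[:])')
def label_kernel_sketch(data, centroids, labels, dists):
    i = cuda.grid(1)
    if i >= data.shape[0]:
        return
    best = 0
    best_dist = 1e38
    for c in range(centroids.shape[0]):
        d = 0.0
        for j in range(data.shape[1]):
            diff = data[i, j] - centroids[c, j]
            d += diff * diff
        if d < best_dist:
            best_dist = d
            best = c
    labels[i] = best
    dists[i] = best_dist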
import numpy as np
from numba import cuda
from timeit import default_timer as timer


@cuda.jit('void(float32[:], float32[:])')
def cu_copy_array(dst, src):
    # One thread copies one element.
    i = cuda.grid(1)
    dst[i] = src[i]

BLOCKCOUNT = 25000
BLOCKSIZE = 256

aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32)
print 'data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / float(2 ** 20))

# CUDA events measure device-side time; the host timer measures wall-clock time.
evt_total_begin = cuda.event()
evt_total_end = cuda.event()
evt_kernel_begin = cuda.event()
evt_kernel_end = cuda.event()

t_total_begin = timer()
evt_total_begin.record()

# Explicitly transfer the input to the device and allocate the output there.
d_aryA = cuda.to_device(aryA)
d_aryB = cuda.device_array_like(aryA)

evt_kernel_begin.record()
t_kernel_begin = timer()
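# The snippet above breaks off right before the kernel launch. A sketch of
# how this timing pattern could conclude, assuming the same variable names
# (this continuation is not part of the original code):
cu_copy_array[BLOCKCOUNT, BLOCKSIZE](d_aryB, d_aryA)

# The launch is asynchronous, so the host timer here mostly captures launch
# overhead; the CUDA events report the actual device-side durations.
t_kernel_end = timer()
evt_kernel_end.record()

# Copying the result back to the host forces completion of the kernel.
d_aryB.copy_to_host()

evt_total_end.record()
evt_total_end.synchronize()
t_total_end = timer()

print 'kernel time (events): %.3f ms' % cuda.event_elapsed_time(evt_kernel_begin, evt_kernel_end)
print 'total time  (events): %.3f ms' % cuda.event_elapsed_time(evt_total_begin, evt_total_end)
print 'kernel time (host)  : %.3f ms' % ((t_kernel_end - t_kernel_begin) * 1000)
print 'total time  (host)  : %.3f ms' % ((t_total_end - t_total_begin) * 1000)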