Code example #1
from collections import deque
from numba import cuda

# The enclosing class name is illustrative; the original snippet shows only __init__.
class MemoryPool:
    def __init__(self, shape, dtype, prealloc):
        self.device = cuda.get_current_device()
        self.freelist = deque()
        self.events = {}
        # Preallocate device buffers and pair each one with a non-timing
        # CUDA event, used later to tell when a buffer is safe to reuse.
        for i in range(prealloc):
            gpumem = cuda.device_array(shape=shape, dtype=dtype)
            self.freelist.append(gpumem)
            self.events[gpumem] = cuda.event(timing=False)
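A rough sketch of how such a free-list pool is usually driven is shown below. The get/free method names, the stream argument, and the synchronize-before-reuse policy are assumptions for illustration; the original snippet shows only the constructor.

    def get(self):
        # Take a preallocated buffer, waiting for any GPU work previously
        # recorded against it to finish before handing it out.
        gpumem = self.freelist.popleft()
        self.events[gpumem].synchronize()
        return gpumem

    def free(self, gpumem, stream=0):
        # Record an event on the stream so a later get() can tell when the
        # buffer's pending work has completed, then return it to the pool.
        self.events[gpumem].record(stream)
        self.freelist.append(gpumem)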
Code example #2
from numba import cuda

# `generator`, `data` and the `_cu_label_kernel_dists` kernel are defined
# elsewhere in the original project; only the timing logic is shown here.
generator._compute_cuda_dims(data)
gridDim = generator._gridDim
blockDim = generator._blockDim

print("grid: ", gridDim)
print("block: ", blockDim)

# Copy the inputs to the device and allocate device-side outputs.
dData = cuda.to_device(data)
dCentroids = cuda.to_device(generator.centroids)
dLabels = cuda.device_array_like(generator.labels)
dDists = cuda.device_array_like(generator._dists)

startE = cuda.event()
endE = cuda.event()

# Time the kernel launch with a pair of CUDA events.
startE.record()
_cu_label_kernel_dists[gridDim, blockDim](dData, dCentroids, dLabels, dDists)
endE.record()
endE.synchronize()
print(cuda.event_elapsed_time(startE, endE))

# Reuse the same events to time the device-to-host copies.
startE.record()
dDists.copy_to_host(ary=generator._dists)
labels = dLabels.copy_to_host(ary=generator.labels)
endE.record()
endE.synchronize()
print(cuda.event_elapsed_time(startE, endE))
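The record/synchronize/elapsed-time pattern above recurs in the remaining examples. Purely as an illustration (the cuda_timer helper and its label argument are not part of the original code), the same pattern can be packaged as a small context manager:

from contextlib import contextmanager
from numba import cuda

@contextmanager
def cuda_timer(label):
    # Record an event on either side of the block and report the
    # GPU-side elapsed time in milliseconds.
    start, end = cuda.event(), cuda.event()
    start.record()
    yield
    end.record()
    end.synchronize()
    print(label, cuda.event_elapsed_time(start, end), 'ms')

With such a helper, the kernel launch and the copy-back above could each be wrapped in a `with cuda_timer(...)` block instead of recording the events by hand.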
Code example #3

import numpy as np
from numba import cuda
from timeit import default_timer as timer  # assumed source of `timer`


@cuda.jit('void(float32[:], float32[:])')
def cu_copy_array(dst, src):
    # Each thread copies a single element.
    i = cuda.grid(1)
    dst[i] = src[i]


BLOCKCOUNT = 25000
BLOCKSIZE = 256

aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32)

print('data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / (2**20)))

# One event pair for the total time, one for the kernel alone.
evt_total_begin = cuda.event()
evt_total_end = cuda.event()

evt_kernel_begin = cuda.event()
evt_kernel_end = cuda.event()

t_total_begin = timer()
evt_total_begin.record()

# Explicitly transfer memory to the device.
d_aryA = cuda.to_device(aryA)
d_aryB = cuda.device_array_like(aryA)

evt_kernel_begin.record()

t_kernel_begin = timer()
Code example #4
import numpy as np
from numba import cuda
from timeit import default_timer as timer  # assumed source of `timer`


@cuda.jit('void(float32[:], float32[:])')
def cu_copy_array(dst, src):
    # Each thread copies a single element.
    i = cuda.grid(1)
    dst[i] = src[i]


BLOCKCOUNT = 25000
BLOCKSIZE = 256

aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32)

print('data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / (2**20)))

# One event pair for the total time, one for the kernel alone.
evt_total_begin = cuda.event()
evt_total_end = cuda.event()

evt_kernel_begin = cuda.event()
evt_kernel_end = cuda.event()

t_total_begin = timer()
evt_total_begin.record()

# Explicitly transfer memory to the device.
d_aryA = cuda.to_device(aryA)
d_aryB = cuda.device_array_like(aryA)

evt_kernel_begin.record()

t_kernel_begin = timer()
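Both of the last two examples break off right after `t_kernel_begin = timer()`, before the kernel is launched. The lines below are a reconstructed continuation of the same timing pattern, not part of the original: the kernel launch, the end-of-timing events and the elapsed-time reporting are assumptions about how the snippet most likely continued.

cu_copy_array[BLOCKCOUNT, BLOCKSIZE](d_aryB, d_aryA)

# The launch is asynchronous, so the CPU timer around it only measures
# launch overhead; the CUDA events measure actual GPU time.
t_kernel_end = timer()
evt_kernel_end.record()

# Copy the result back to the host, then close out the total measurement.
aryB = d_aryB.copy_to_host()

evt_total_end.record()
evt_total_end.synchronize()
t_total_end = timer()

print('kernel time (CPU timer):   %f s' % (t_kernel_end - t_kernel_begin))
print('kernel time (CUDA events): %f ms' % cuda.event_elapsed_time(evt_kernel_begin, evt_kernel_end))
print('total time (CPU timer):    %f s' % (t_total_end - t_total_begin))
print('total time (CUDA events):  %f ms' % cuda.event_elapsed_time(evt_total_begin, evt_total_end))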