Example #1
def bp(grad,W,gW,gb,h,gradInput=None):
	# scratch buffer with the same shape and dtype as the weight gradient
	z = cuda.device_array_like(gW)
	print('grad',grad.shape,'h',h.shape)
	# accumulate the weight gradient: gW += h^T . grad
	mmprod(h,grad,z,transa='T')
	mmadd(gW,z,gW)
	# accumulate the bias gradient: gb += grad
	mmadd(gb,grad,gb)
	# back-propagate to the layer input: gradInput = grad . W^T
	if gradInput is not None:
		mmprod(grad,W,gradInput,transb='T')
Example #2
def mcopy(a):
    blockDim = (min(32,a.shape[0]),min(32,a.shape[1]))
    gridDim = ((a.shape[0] + blockDim[0] - 1) // blockDim[0], (a.shape[1] + blockDim[1] - 1) // blockDim[1])

    b = cuda.device_array_like(a)

    d_mcopy[gridDim,blockDim](a,b)

    return b
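The d_mcopy kernel launched above is defined elsewhere in the source project. A minimal 2-D element-wise copy kernel consistent with that launch would look like the sketch below; this is an assumption based on the call, written against numba.cuda, whose cuda.jit and cuda.grid match the older NumbaPro calls used in these examples.

from numba import cuda

@cuda.jit
def d_mcopy(a, b):
    # one thread per element; cuda.grid(2) gives this thread's global (row, col) index
    i, j = cuda.grid(2)
    if i < a.shape[0] and j < a.shape[1]:
        b[i, j] = a[i, j]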
Example #3
def msum(a):
	blockDim = (min(32,a.shape[0]),min(32,a.shape[1]))
	gridDim = ((a.shape[0] + blockDim[0] - 1) // blockDim[0], (a.shape[1] + blockDim[1] - 1) // blockDim[1])
	db = cuda.device_array_like(a)
	print(blockDim,gridDim)
	d_msum[gridDim,blockDim](a,db)
	while gridDim[1] > 1:
		blockDim = (min(32,a.shape[0]),min(32,gridDim[1]))
		gridDim = ((a.shape[0] + blockDim[0] - 1) // blockDim[0], (gridDim[1] + blockDim[1] - 1) // blockDim[1])
		print(blockDim,gridDim)
		d_msum[gridDim,blockDim](db,db)
	return db
Example #4
def block_increment(start, n):

    cuda.select_device(0)
    stream = cuda.stream()
    blockdim = 256
    griddim = n // 256 + 1
    c_host = np.zeros((n, n), dtype=np.float32)
    m_dev = curand.normal(0, 1, n, dtype=np.float32, device=True)
    n_dev = curand.normal(0, 1, n, dtype=np.float32, device=True)
    a_host = np.zeros(n, dtype=np.float32)
    a_dev = cuda.device_array_like(a_host)
    cuda_div[griddim, blockdim, stream](m_dev, n_dev, a_dev, n)
    # a_dev stays on the device for the kernel ==> no host access to that device
    # memory at this point. I can't tell what happens to m_dev and n_dev; my best
    # guess is that Python's GC is translated into deallocation on the device.
    b_dev = curand.uniform((n * n), dtype=np.float32, device=True)
    c_dev = cuda.device_array_like(c_host, stream)
    block_kernel[griddim, blockdim, stream](start, n, a_dev, b_dev, c_dev)
    c_dev.copy_to_host(c_host, stream)
    stream.synchronize()

    return c_host
Example #5
    def generate_omegas(self):

        # Generating device array
        device_output = cuda.device_array_like(self.host_omegas)

        # Calling kernel to calculate omegas
        self.omega[self.grid_dim,
                   self.threads_per_block](self.device_values, device_output,
                                           self.size)

        # Copying back omegas to host
        device_output.copy_to_host(self.host_omegas)

        return device_output
Example #6
    def compute_block(self):

        device_uniforms = curand.uniform(size=N * N, device=True)
        host_results = zeros((self.size, self.size))

        stream = cuda.stream()
        device_proposals = cuda.to_device(self.host_proposals, stream=stream)
        device_omegas = cuda.to_device(self.host_omegas, stream=stream)
        device_results = cuda.device_array_like(host_results, stream=stream)
        cu_one_block[self.grid_dim, self.threads_per_block,
                     stream](self.start, device_proposals, device_omegas,
                             device_uniforms, device_results, self.size,
                             self.size)
        device_results.copy_to_host(host_results, stream=stream)

        stream.synchronize()

        return host_results
Example #7
    def generate_proposals(self):

        # Generating two device arrays of standard normals variables
        device_normals_1 = curand.normal(0, 1, self.size, device=True)
        device_normals_2 = curand.normal(0, 1, self.size, device=True)

        # Generating device array
        device_output = cuda.device_array_like(self.host_values)

        # Calling external kernel to calculate ratios of standard normals (in device)
        self.division[self.grid_dim,
                      self.threads_per_block](device_normals_1,
                                              device_normals_2, device_output,
                                              self.size)

        # Copying back values to host
        device_output.copy_to_host(self.host_values)

        return device_output
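The self.division kernel used above is configured elsewhere in this class. Based on the comment (ratios of standard normals) and the arguments passed, an element-wise ratio kernel along the following lines would be consistent with the call; the body is an assumption, not the project's actual kernel.

from numba import cuda

@cuda.jit
def division(normals_1, normals_2, output, size):
    # output[i] = normals_1[i] / normals_2[i]; the ratio of two independent
    # standard normals is a standard Cauchy draw (assumed intent of the kernel)
    i = cuda.grid(1)
    if i < size:
        output[i] = normals_1[i] / normals_2[i]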
Example #8
def mc_cuda(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # instantiate a CUDA stream for queueing async CUDA cmds
    stream = cuda.stream()
    # instantiate a cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # configure the kernel
    # similar to CUDA-C: step_cuda<<<gridsz, blksz, 0, stream>>>
    step_cfg = step_cuda[gridsz, blksz, stream]
    
    # transfer the initial prices
    d_last = cuda.to_device(paths[:, 0], stream=stream)
    for j in range(1, paths.shape[1]):
        # call cuRAND to populate d_normdist with gaussian noises
        prng.normal(d_normdist, mean=0, sigma=1)
        # setup memory for new prices
        # device_array_like is like empty_like for GPU
        d_paths = cuda.device_array_like(paths[:, j], stream=stream)
        # invoke step kernel asynchronously
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        # transfer memory back to the host
        d_paths.copy_to_host(paths[:, j], stream=stream)
        d_last = d_paths
    # wait for all GPU work to complete
    stream.synchronize()
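The step_cuda kernel configured above is not part of this snippet. Given the arguments (previous prices, output prices, dt, the drift term c0, the diffusion term c1, and a vector of normal draws), a kernel consistent with the call would apply one geometric-Brownian-motion step per path; the following is a sketch under that assumption.

import math
from numba import cuda

@cuda.jit
def step_cuda(last, paths, dt, c0, c1, normdist):
    # one thread per simulated path
    i = cuda.grid(1)
    if i < paths.shape[0]:
        # S_t = S_{t-1} * exp((r - 0.5*sigma^2)*dt + sigma*sqrt(dt)*Z)
        paths[i] = last[i] * math.exp(c0 * dt + c1 * normdist[i])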
Example #9
def task2():
    a = numpy.float32(2.)  # Force value to be float32
    x = numpy.arange(NELEM, dtype='float32')
    y = numpy.arange(NELEM, dtype='float32')

    ### Task2 ###
    # a) Complete the memory transfer for x -> dx, y -> dy
    # b) Allocate device memory for dout
    # c) Transfer for out <- dout
    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dout = cuda.device_array_like(x)

    griddim = NUM_BLOCKS
    blockdim = NUM_THREADS
    saxpy[griddim, blockdim](a, dx, dy, dout)

    out = dout.copy_to_host()
    print "out =", out

    if numpy.allclose(a * x + y, out):
        print "Correct result"
    else:
        print "Incorrect result"
Example #11
from numbapro import cuda, float32, void
import numpy
import time

@cuda.jit(void(float32[:], float32[:], float32[:]))
def sumarrays(a, b, c):
	
	i = cuda.grid(1) # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
	
	if i < c.size:
		c[i] = a[i] + b[i]

n = 16*1024*1024
a = numpy.arange(n, dtype='float32')
b = a*2

start_time = time.time()

da = cuda.to_device(a)
db = cuda.to_device(b)
dc = cuda.device_array_like(a)

size_block = 1024
size_grid = int((n-1)/size_block + 1)
sumarrays[size_grid, size_block](da, db, dc)

c = dc.copy_to_host()

print "Time elapsed: ", time.time() - start_time, "s"
Example #12
generator.centroids = generator._init_centroids(data)
generator.labels = cuda.pinned_array(shape=generator.N, dtype = np.int32)
generator._dists = cuda.pinned_array(shape=generator.N, dtype = np.float32)

generator._compute_cuda_dims(data)
gridDim = generator._gridDim
blockDim = generator._blockDim

print "grid: ", gridDim
print "block: ", blockDim

dData = cuda.to_device(data)

dCentroids = cuda.to_device(generator.centroids)

dLabels = cuda.device_array_like(generator.labels)
dDists = cuda.device_array_like(generator._dists)

startE = cuda.event()
endE = cuda.event()

startE.record()
_cu_label_kernel_dists[gridDim,blockDim](dData,dCentroids,dLabels,dDists)
endE.record()
endE.synchronize()
print cuda.event_elapsed_time(startE,endE)

startE.record()
dDists.copy_to_host(ary = generator._dists)
labels = dLabels.copy_to_host(ary = generator.labels)
endE.record()
Example #13
from timeit import default_timer as timer
import numpy as np
from numbapro import vectorize, float32, cuda

src = np.arange(10 ** 7, dtype=np.float32)
dst = np.empty_like(src)

@vectorize([float32(float32)], target='gpu')
def copy_kernel(src):
    return src

# Regular memory transfer

ts = timer()
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()

print 'regular', te - ts

del d_src, d_dst

assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer

with cuda.pinned(src, dst):
    # (the snippet is cut off here; a pinned-memory version of the same timing
    #  loop would plausibly mirror the regular transfer above, using an explicit
    #  stream so the page-locked transfers can run asynchronously)
    ts = timer()
    stream = cuda.stream()
    d_src = cuda.to_device(src, stream=stream)
    d_dst = cuda.device_array_like(dst, stream=stream)

    copy_kernel(d_src, out=d_dst)

    d_dst.copy_to_host(dst, stream=stream)
    stream.synchronize()
    te = timer()

    print 'pinned', te - ts
Example #14
nThreads = (16,16)
nBlocks = (ceil(n_theta/nThreads[0]), ceil(n_phi/nThreads[1]))
print(nBlocks)

date = "2015_6_22_15_33_43"
xml_path = "D:\\image_software\\results\\GMEMtracking3D_"+date+"\\XML_finalResult_lht_bckgRm\\GMEMfinalResult_frame????.xml"

# CUDA call of the image
s = timer()
d_theta = cuda.to_device(all_theta)
d_phi = cuda.to_device(all_phi)
d_image = cuda.to_device(image)
d_r = cuda.to_device(r)
d_center = cuda.to_device(center)
d_max_val = cuda.device_array_like(all_max_val)
d_r_max = cuda.device_array_like(r_of_maxval)
calcRay_CUDA[nBlocks, nThreads](d_theta, d_phi, d_image, d_r, d_center, d_max_val, d_r_max)
d_max_val.copy_to_host(all_max_val)
d_r_max.copy_to_host(r_of_maxval)
e = timer()
print(e-s)

n_time = 10
pos = readXML(xml_path, n_time)

# Calculate the points coordinates
pos_arr = np.asarray(pos[0][0:3])
n_cells = pos_arr.shape[1]
angles = np.zeros((3,n_cells))
for cell in range(n_cells):
Example #15
def main():
    device = cuda.get_current_device()
    maxtpb = device.MAX_THREADS_PER_BLOCK
    warpsize = device.WARP_SIZE

    # benchmark loop
    vary_warpsize = []

    baseline = []
    ilpx2 = []
    ilpx4 = []
    ilpx8 = []

    # For OSX 10.8 where the GPU is used for graphic as well,
    # increasing the following to 10 * 2 ** 20 seems to be necessary to
    # produce consistent result.
    approx_data_size = 1.5 * 2**20

    for multiplier in range(1, maxtpb // warpsize + 1):
        blksz = warpsize * multiplier
        gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8)
        print 'kernel config [%d, %d]' % (gridsz, blksz)

        N = blksz * gridsz
        A = np.arange(N, dtype=np.float32)
        B = np.arange(N, dtype=np.float32)

        print 'data size %dMB' % (N / 2.**20 * A.dtype.itemsize)

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)

        assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz)
        vary_warpsize.append(blksz)

        dC = cuda.device_array_like(A)
        basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC))
        expected_result = dC.copy_to_host()
        if basetime > 0:
            baseline.append(N / basetime)

        dC = cuda.device_array_like(A)
        x2time = time_this(vec_add_ilp_x2, gridsz // 2, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x2time > 0:
            ilpx2.append(N / x2time)

        dC = cuda.device_array_like(A)
        x4time = time_this(vec_add_ilp_x4, gridsz // 4, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x4time > 0:
            ilpx4.append(N / x4time)

        dC = cuda.device_array_like(A)
        x8time = time_this(vec_add_ilp_x8, gridsz // 8, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x8time > 0:
            ilpx8.append(N / x8time)

    pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline')
    pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2')
    pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4')
    pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8')
    pylab.legend(loc=4)
    pylab.title(cuda.get_current_device().name)
    pylab.xlabel('block size')
    pylab.ylabel('float per second')
    pylab.show()
Example #16
aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32)

print 'data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / (2**20))

evt_total_begin = cuda.event()
evt_total_end = cuda.event()

evt_kernel_begin = cuda.event()
evt_kernel_end = cuda.event()

t_total_begin = timer()
evt_total_begin.record()

# explicitly transfer memory
d_aryA = cuda.to_device(aryA)
d_aryB = cuda.device_array_like(aryA)

evt_kernel_begin.record()

t_kernel_begin = timer()
cu_copy_array[BLOCKCOUNT, BLOCKSIZE](d_aryB, d_aryA)
t_kernel_end = timer()

evt_kernel_end.record()

aryB = d_aryB.copy_to_host()

evt_total_end.record()

evt_total_end.synchronize()
t_total_end = timer()
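The snippet records begin/end events and wall-clock timers but stops before reporting them; a short continuation consistent with the measurements above might print them like this (a sketch, not the original code; cuda.event_elapsed_time returns milliseconds, as also used in Example #12).

evt_kernel_end.synchronize()
print 'kernel time (events): %f ms' % cuda.event_elapsed_time(evt_kernel_begin, evt_kernel_end)
print 'total time  (events): %f ms' % cuda.event_elapsed_time(evt_total_begin, evt_total_end)
print 'kernel time (timer):  %f s' % (t_kernel_end - t_kernel_begin)
print 'total time  (timer):  %f s' % (t_total_end - t_total_begin)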
Example #17
def accum_bp(grad,gW,gb,h):
	z = cuda.device_array_like(gW)
	mmprod(h,grad,z,transa='T')
	mmadd(gW,z,gW)
	mmadd(gb,grad,gb)