Example #1
    def test_cublasSaxpy(self):
        # single-precision AXPY: check the GPU result against alpha*x + y on the host
        alpha = np.float32(np.random.rand())
        x = np.random.rand(5).astype(np.float32)
        x_gpu = gpuarray.to_gpu(x)
        y = np.random.rand(5).astype(np.float32)
        y_gpu = gpuarray.to_gpu(y)
        cublas.cublasSaxpy(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1,
                           y_gpu.gpudata, 1)
        assert np.allclose(y_gpu.get(), alpha*x + y)
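
# The method above refers to self.cublas_handle, so it assumes a test fixture that
# creates (and later destroys) a cuBLAS context; a minimal sketch of such a harness
# (the class name and unittest scaffolding here are my own assumptions):
import unittest
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

class TestCublasLevel1(unittest.TestCase):
    def setUp(self):
        # every test gets a fresh cuBLAS context handle
        self.cublas_handle = cublas.cublasCreate()

    def tearDown(self):
        # release the context once the test has finished
        cublas.cublasDestroy(self.cublas_handle)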
Example #2
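# The walkthrough below assumes the host data and device arrays have already been set
# up; a minimal sketch of that setup (the vector length of 5 is my own choice here):
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

a = np.float32(np.random.rand())            # scalar, already typecast to float32
x = np.random.rand(5).astype(np.float32)    # 32-bit floating-point vectors
y = np.random.rand(5).astype(np.float32)
x_gpu = gpuarray.to_gpu(x)                  # copy both vectors onto the GPU
y_gpu = gpuarray.to_gpu(y)
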
# create a cuBLAS context. This is similar in nature to CUDA contexts
cublas_context_h = cublas.cublasCreate()
"""Level-1 AXPY (vector-vector)"""

# This is a direct wrapper around a low-level C function, so the call may look more like a C function than a true Python function.
# In short, it performs an "AXPY" operation (y = a*x + y), ultimately putting the output data into the y_gpu array.
# The first input is always the cuBLAS context handle. We then have to specify the size of the vectors, since this function will ultimately be
# operating on C pointers; we can do this by using the size attribute of a gpuarray. Having already typecast our scalar to a NumPy float32 variable,
# we can pass the a variable right over as the scalar parameter. We then hand the underlying C pointer of the x_gpu array to this function using its
# gpudata attribute. Then we specify the stride of the first array as 1: the stride specifies how many steps to take between each input value.
# (In contrast, if you were using a vector taken from a column of a row-major matrix, you would set the stride to the width of the matrix;
# a sketch of this follows the prints below.)
# We then put in the pointer to the y_gpu array, and set its stride to 1 as well.

# We can now use the cublasSaxpy function. The S stands for single precision, which is what we need since we are working with 32-bit floating-point arrays:
cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1,
                   y_gpu.gpudata, 1)

print(y_gpu.get())
print('This is close to the NumPy approximation: %s' % np.allclose(
    a * x + y, y_gpu.get()))
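
# As noted above, a non-unit stride lets the same routine walk down a column of a
# row-major matrix. A minimal sketch of that idea (the 4x3 matrix A, the vector b and
# the chosen column are hypothetical, purely for illustration):
A = np.random.rand(4, 3).astype(np.float32)      # row-major 4x3 matrix
b = np.random.rand(4).astype(np.float32)
A_gpu = gpuarray.to_gpu(A)
b_gpu = gpuarray.to_gpu(b)

col = 1                                          # operate on column 1 of A
# pointer to A[0, col]; consecutive column elements lie one full row apart in memory
col_ptr = int(A_gpu.gpudata) + col * A.dtype.itemsize

# a stride equal to the matrix width (3) steps down the column: b = a*A[:, col] + b
cublas.cublasSaxpy(cublas_context_h, A.shape[0], a, col_ptr, A.shape[1],
                   b_gpu.gpudata, 1)
print(np.allclose(b_gpu.get(), a * A[:, col] + b))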

# make fresh GPU copies of x and y (y_gpu was overwritten by the AXPY above)
w_gpu = gpuarray.to_gpu(x)
v_gpu = gpuarray.to_gpu(y)

# now perform a dot product between the two vectors with cublasSdot
dot_output = cublas.cublasSdot(cublas_context_h, v_gpu.size, v_gpu.gpudata, 1,
                               w_gpu.gpudata, 1)

print(dot_output)

# compute the L2 norm of v_gpu
l2_output = cublas.cublasSnrm2(cublas_context_h, v_gpu.size, v_gpu.gpudata, 1)
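
# A quick sanity check of both Level-1 results against NumPy, followed by cleanup;
# this assumes the x and y host arrays from the setup above are still in scope.
print('Dot product close to NumPy: %s' % np.allclose(dot_output, np.dot(y, x)))
print('L2 norm close to NumPy: %s' % np.allclose(l2_output, np.linalg.norm(y)))

# destroy the cuBLAS context once we are finished with it
cublas.cublasDestroy(cublas_context_h)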
Example #3
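# The fragment below is the body of a multi-stream AXPY; it assumes the streams, the
# timing events and the pinned chunks of x were created earlier. A minimal sketch of
# that setup (the values of N and nStreams are my own choices):
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from skcuda import cublas

nStreams = 4
N = 4 * 1024 * 1024                          # total vector length, divisible by nStreams
a = np.float32(np.random.rand())
x = np.random.rand(N).astype(np.float32)
y = np.random.rand(N).astype(np.float32)

streams = [cuda.Stream() for _ in range(nStreams)]
s, e = cuda.Event(), cuda.Event()            # events used to time the whole run
s.record()

# pin each chunk of x so it can be copied to the GPU asynchronously
x_pin = [
    cuda.register_host_memory(x[i * N // nStreams:(i + 1) * N // nStreams])
    for i in range(nStreams)
]
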
# likewise pin each chunk of y for asynchronous transfer
y_pin = [
    cuda.register_host_memory(y[i * N // nStreams:(i + 1) * N // nStreams])
    for i in range(nStreams)
]

h = cublas.cublasCreate()    # one cuBLAS context is shared across all the streams

# per-stream device arrays and per-stream host results
x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)

for i in range(nStreams):
    # direct all subsequent cuBLAS calls onto this stream
    cublas.cublasSetStream(h, streams[i].handle)

    # asynchronously copy the pinned chunks onto the GPU
    x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i])
    y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i])

    # launch the AXPY on this chunk and copy the result back asynchronously
    cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1,
                       y_gpu[i].gpudata, 1)
    ans[i] = y_gpu[i].get_async(stream=streams[i])

cublas.cublasDestroy(h)

# Uncomment to check for errors in the calculation
#y_gpu = np.array([yg.get() for yg in y_gpu])
#y_gpu = np.array(y_gpu).reshape(y.shape)
#print(np.allclose(y_gpu, a*x + y))

# record the end event, wait for it, and report the elapsed time
e.record()
e.synchronize()
print('%f ms' % s.time_till(e))