def test_cublasSaxpy(self):
    alpha = np.float32(np.random.rand())
    x = np.random.rand(5).astype(np.float32)
    x_gpu = gpuarray.to_gpu(x)
    y = np.random.rand(5).astype(np.float32)
    y_gpu = gpuarray.to_gpu(y)
    cublas.cublasSaxpy(self.cublas_handle, x_gpu.size, alpha,
                       x_gpu.gpudata, 1, y_gpu.gpudata, 1)
    assert np.allclose(y_gpu.get(), alpha * x + y)
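The test above relies on a cuBLAS handle created by its test fixture (self.cublas_handle). The walkthrough below uses free-standing variables instead; a minimal sketch of the setup it assumes, with illustrative imports from PyCUDA and scikit-cuda and the same names (a, x, y, x_gpu, y_gpu) that the following snippets refer to, might look like this:

# Sketch of the setup assumed by the walkthrough below; the names mirror the
# variables used in the following snippets.
import pycuda.autoinit            # initializes a CUDA context on import
from pycuda import gpuarray
import numpy as np
from skcuda import cublas         # scikit-cuda's cuBLAS wrapper

a = np.float32(np.random.rand())           # scalar for the AXPY operation
x = np.random.rand(5).astype(np.float32)   # host vectors, 32-bit floats
y = np.random.rand(5).astype(np.float32)
x_gpu = gpuarray.to_gpu(x)                 # copy both vectors to the GPU
y_gpu = gpuarray.to_gpu(y)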
# Create a cuBLAS context. This is similar in nature to a CUDA context.
cublas_context_h = cublas.cublasCreate()

# Level-1 AXPY (vector-vector)
# We can now use the cublasSaxpy function. The S stands for single precision,
# which is what we need since we are working with 32-bit floating-point arrays.
# This is a direct wrapper to a low-level C function, so the input may look
# more like a C function call than a true Python one. In short, it performs an
# "AXPY" operation, ultimately putting the output into the y_gpu array. The
# first input is always the cuBLAS context handle. We then have to specify the
# size of the vectors, since this function will ultimately be operating on C
# pointers; we can do this with the size parameter of a gpuarray. Having
# already typecast our scalar to a NumPy float32 variable, we can pass the a
# variable right over as the scalar parameter. We then hand the underlying C
# pointer of the x_gpu array to this function via its gpudata parameter, and
# set the stride of the first array to 1: the stride specifies how many steps
# to take between each input value. (In contrast, if you were using a vector
# taken from a column of a row-wise matrix, you would set the stride to the
# width of the matrix.) Finally, we put in the pointer to the y_gpu array and
# set its stride to 1 as well.
cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1, y_gpu.gpudata, 1)

print(y_gpu.get())
print('This is close to the NumPy approximation: %s' % np.allclose(a * x + y, y_gpu.get()))

# Level-1 dot product and L2 norm
w_gpu = gpuarray.to_gpu(x)
v_gpu = gpuarray.to_gpu(y)

# Perform a dot product.
dot_output = cublas.cublasSdot(cublas_context_h, v_gpu.size, v_gpu.gpudata, 1, w_gpu.gpudata, 1)
print(dot_output)

# Compute the L2 norm of v_gpu.
l2_output = cublas.cublasSnrm2(cublas_context_h, v_gpu.size, v_gpu.gpudata, 1)
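The stride parameter described above can be made concrete with a small sketch (illustrative, not from the original text): in a row-major matrix, consecutive elements of a column sit one row-width apart in memory, so passing that width as the stride makes a Level-1 routine walk down a column.

# Illustrative sketch: the L2 norm of the first column of a row-major matrix,
# using the row width (4) as the stride. M, M_gpu, and col_norm are
# hypothetical names introduced only for this example.
M = np.random.rand(3, 4).astype(np.float32)
M_gpu = gpuarray.to_gpu(M)
col_norm = cublas.cublasSnrm2(cublas_context_h, 3, M_gpu.gpudata, 4)
print(np.allclose(col_norm, np.linalg.norm(M[:, 0])))

The dot product and the norm can likewise be checked against NumPy on the host, in the same spirit as the AXPY check above, and the cuBLAS context should be destroyed once we are finished with it. A sketch:

# Verify the Level-1 results against NumPy on the host.
print('dot product matches NumPy: %s' % np.allclose(dot_output, np.dot(y, x)))
print('L2 norm matches NumPy: %s' % np.allclose(l2_output, np.linalg.norm(y)))

# Free the cuBLAS context when we are done with it.
cublas.cublasDestroy(cublas_context_h)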
y_pin = [cuda.register_host_memory(y[i * N // nStreams:(i + 1) * N // nStreams])
         for i in range(nStreams)]

h = cublas.cublasCreate()

x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)

# For each stream: point cuBLAS at the stream, copy the pinned host chunks to
# the GPU asynchronously, launch the SAXPY on that chunk, and copy the result
# back asynchronously.
for i in range(nStreams):
    cublas.cublasSetStream(h, streams[i].handle)
    x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i])
    y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i])
    cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1, y_gpu[i].gpudata, 1)
    ans[i] = y_gpu[i].get_async(stream=streams[i])

cublas.cublasDestroy(h)

# Uncomment to check for errors in the calculation
#y_gpu = np.array([yg.get() for yg in y_gpu])
#y_gpu = np.array(y_gpu).reshape(y.shape)
#print(np.allclose(y_gpu, a*x + y))

e.record()
e.synchronize()
print(s.time_till(e), " ms")
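The streamed version above leans on several variables defined earlier in the program: N, nStreams, the scalar a, the host arrays x and y, the pinned chunks x_pin, the list of streams, and the timing events s and e. A minimal sketch of that preamble, with illustrative values for N and nStreams, might look like the following:

# Sketch of the setup assumed by the streamed AXPY above; the values of N and
# nStreams are illustrative placeholders.
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray
import numpy as np
from skcuda import cublas

N = 500000       # total vector length (assumed divisible by nStreams)
nStreams = 2

a = np.float32(np.random.rand())
x = np.random.rand(N).astype(np.float32)
y = np.random.rand(N).astype(np.float32)

streams = [cuda.Stream() for _ in range(nStreams)]
s = cuda.Event()   # start event for timing
e = cuda.Event()   # end event for timing
s.record()

# Pin (page-lock) the host chunks of x so they can be transferred
# asynchronously; y is pinned the same way at the top of the snippet above.
x_pin = [cuda.register_host_memory(x[i * N // nStreams:(i + 1) * N // nStreams])
         for i in range(nStreams)]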