def test_cublasSdot(self):
    x = np.random.rand(5).astype(np.float32)
    x_gpu = gpuarray.to_gpu(x)
    y = np.random.rand(5).astype(np.float32)
    y_gpu = gpuarray.to_gpu(y)
    result = cublas.cublasSdot(self.cublas_handle, x_gpu.size,
                               x_gpu.gpudata, 1, y_gpu.gpudata, 1)
    assert np.allclose(result, np.dot(x, y))
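A double-precision counterpart can be tested in exactly the same way. The following is a minimal sketch (not part of the original suite), assuming the same test fixture provides self.cublas_handle and that scikit-cuda's cublasDdot wrapper is available:

def test_cublasDdot(self):
    # Same pattern as above, but with float64 data and cublasDdot.
    x = np.random.rand(5).astype(np.float64)
    x_gpu = gpuarray.to_gpu(x)
    y = np.random.rand(5).astype(np.float64)
    y_gpu = gpuarray.to_gpu(y)
    result = cublas.cublasDdot(self.cublas_handle, x_gpu.size,
                               x_gpu.gpudata, 1, y_gpu.gpudata, 1)
    assert np.allclose(result, np.dot(x, y))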
def forward(self, vector1, vector2):
    with torch.cuda.device_of(vector1):
        output = vector1.new(1)
        handle = torch.cuda.current_blas_handle()
        # cublasSetStream expects the raw CUDA stream handle (an integer),
        # not the torch.cuda.Stream wrapper object itself.
        stream = torch.cuda.current_stream().cuda_stream
        cublas.cublasSetStream(handle, stream)
        # Only single- and double-precision CUDA tensors are handled here.
        if isinstance(vector1, torch.cuda.FloatTensor):
            result = cublas.cublasSdot(handle, vector1.numel(),
                                       vector1.data_ptr(), 1,
                                       vector2.data_ptr(), 1)
        elif isinstance(vector1, torch.cuda.DoubleTensor):
            result = cublas.cublasDdot(handle, vector1.numel(),
                                       vector1.data_ptr(), 1,
                                       vector2.data_ptr(), 1)
        output = output.fill_(float(result))
        self.save_for_backward(vector1, vector2)
        return output
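Since forward saves both input vectors, a matching backward only needs to scale each saved vector by the incoming scalar gradient. Here is a minimal sketch of such a backward (not from the original source), assuming the same old-style autograd Function API used above:

def backward(self, grad_output):
    # d(v1 . v2)/dv1 = v2 and d(v1 . v2)/dv2 = v1,
    # each scaled by the incoming (scalar) gradient.
    vector1, vector2 = self.saved_tensors
    grad_vector1 = grad_output * vector2
    grad_vector2 = grad_output * vector1
    return grad_vector1, grad_vector2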
# (In contrast, if you were using a vector from a column in a row-wise matrix,
# you would set the stride to the width of the matrix.)
# We then put in the pointer to the y_gpu array, and set its stride to 1 as well.

# We can now use the cublasSaxpy function. The S stands for single precision,
# which is what we need since we are working with 32-bit floating-point arrays:
cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1, y_gpu.gpudata, 1)
print(y_gpu.get())
print('This is close to the NumPy approximation: %s' % np.allclose(a * x + y, y_gpu.get()))

w_gpu = gpuarray.to_gpu(x)
v_gpu = gpuarray.to_gpu(y)

# Perform a dot product.
dot_output = cublas.cublasSdot(cublas_context_h, v_gpu.size, v_gpu.gpudata, 1, w_gpu.gpudata, 1)
print(dot_output)

# Compute the L2 norm of v_gpu.
l2_output = cublas.cublasSnrm2(cublas_context_h, v_gpu.size, v_gpu.gpudata, 1)
print(l2_output)

cublas.cublasDestroy(cublas_context_h)

# (If we want to operate on arrays of 64-bit real floating-point values
# (float64 in NumPy and PyCUDA), then we would use cublasDaxpy.)

"""Level-2 GEMV (general matrix-vector)"""
# m and n are the number of rows and columns.
m = 10
n = 100
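To make the Level-2 call concrete, here is a minimal sketch of how cublasSgemv could be used with these dimensions. It assumes the same imports as above (numpy as np, pycuda.gpuarray as gpuarray, scikit-cuda's cublas), reuses the m and n just defined, and the alpha/beta values, the fresh handle, and the column-major copy of A are illustrative choices rather than part of the text above:

alpha = 1.0
beta = 0.0

A = np.random.rand(m, n).astype(np.float32)
x = np.random.rand(n).astype(np.float32)
y = np.zeros(m, dtype=np.float32)

# cuBLAS expects column-major storage, so upload the transpose of the
# row-major NumPy array; the leading dimension (lda) is then m.
A_gpu = gpuarray.to_gpu(A.T.copy())
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

handle = cublas.cublasCreate()
# Computes y = alpha * A x + beta * y; 'N' means no extra transpose inside
# cuBLAS (scikit-cuda accepts the character form of the trans argument).
cublas.cublasSgemv(handle, 'N', m, n, alpha, A_gpu.gpudata, m,
                   x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1)
cublas.cublasDestroy(handle)

print('Matches np.dot(A, x): %s' % np.allclose(np.dot(A, x), y_gpu.get()))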