def impl_test_binaryop_matvec(self, dtype):
    x = np.random.normal(scale=5.0, size=(3, 5)).astype(dtype)
    a = np.random.normal(scale=5.0, size=(1, 5)).astype(dtype)
    b = np.random.normal(scale=5.0, size=(3, 1)).astype(dtype)
    # the following two test correct broadcasting of 1D vectors
    c = np.random.normal(scale=5.0, size=(5, )).astype(dtype)
    d = np.random.normal(scale=5.0, size=(3, )).astype(dtype)
    x_gpu = gpuarray.to_gpu(x)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)
    d_gpu = gpuarray.to_gpu(d)
    out = gpuarray.empty(x.shape, dtype=dtype)

    # addition
    res = misc.add_matvec(x_gpu, a_gpu, out=out).get()
    assert np.allclose(res, x + a)
    assert np.allclose(misc.add_matvec(x_gpu, b_gpu).get(), x + b)
    assert np.allclose(misc.add_matvec(x_gpu, c_gpu).get(), x + c)
    assert_raises(ValueError, misc.add_matvec, x_gpu, d_gpu)

    # multiplication
    res = misc.mult_matvec(x_gpu, a_gpu, out=out).get()
    assert np.allclose(res, x * a)
    assert np.allclose(misc.mult_matvec(x_gpu, b_gpu).get(), x * b)
    assert np.allclose(misc.mult_matvec(x_gpu, c_gpu).get(), x * c)
    assert_raises(ValueError, misc.mult_matvec, x_gpu, d_gpu)

    # division
    res = misc.div_matvec(x_gpu, a_gpu, out=out).get()
    assert np.allclose(res, x / a)
    assert np.allclose(misc.div_matvec(x_gpu, b_gpu).get(), x / b)
    assert np.allclose(misc.div_matvec(x_gpu, c_gpu).get(), x / c)
    assert_raises(ValueError, misc.div_matvec, x_gpu, d_gpu)
def _rbf_kernel_vectorized_cublas(data1, data2, sigma=10):  # pragma: no cover
    """Kernel for edge similarity computed with the vectorized method.

    Args:
        data1 (np.ndarray): pssm data 1
        data2 (np.ndarray): pssm data 2
        sigma (int, optional): width parameter of the exponential

    Returns:
        np.array: value of the RBF kernel for all the pairs
    """
    beta = 2 * sigma**2
    d1_ = gpuarray.to_gpu(data1.astype(np.float32))
    d2_ = gpuarray.to_gpu(data2.astype(np.float32))
    mgpu = -2 * culinalg.dot(d1_, d2_, transa='N', transb='T')
    vgpu = cumisc.sum(d1_**2, axis=1)[:, None]
    cumisc.add_matvec(mgpu, vgpu, out=mgpu)
    vgpu = cumisc.sum(d2_**2, axis=1)
    cumisc.add_matvec(mgpu, vgpu, out=mgpu)
    mcpu = mgpu.get()
    return np.exp(-mcpu / beta).reshape(-1)
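# Minimal CPU reference for the function above -- a sketch for illustration, not part
# of the original project. It shows the identity the two add_matvec calls exploit:
#   ||d1_i - d2_j||^2 = ||d1_i||^2 + ||d2_j||^2 - 2 * d1_i . d2_j
# so the row norms are broadcast onto the -2 * D1 @ D2.T term, one axis at a time.
import numpy as np

def _rbf_kernel_vectorized_numpy(data1, data2, sigma=10):
    beta = 2 * sigma**2
    dist2 = (np.einsum('ij,ij->i', data1, data1)[:, None]   # ||d1_i||^2 as a column
             + np.einsum('ij,ij->i', data2, data2)          # ||d2_j||^2 as a row
             - 2.0 * data1 @ data2.T)                       # cross terms
    return np.exp(-dist2 / beta).reshape(-1)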
def _forward_pass(self, activations):
    """Perform a forward pass on the network by computing the values
    of the neurons in the hidden layers and the output layer.

    Parameters
    ----------
    activations : list, length = n_layers - 1
        The ith element of the list holds the values of the ith layer.
    """
    hidden_activation = ACTIVATIONS[self.activation]
    # Iterate over the hidden layers
    for i in range(self.n_layers_ - 1):
        activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i])
        activations[i + 1] = cumisc.add_matvec(activations[i + 1],
                                               self.intercepts_[i], axis=1)
        # For the hidden layers
        if (i + 1) != (self.n_layers_ - 1):
            activations[i + 1] = hidden_activation(activations[i + 1])
    # For the last layer
    output_activation = ACTIVATIONS[self.out_activation_]
    activations[i + 1] = output_activation(activations[i + 1])
    return activations
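# Hedged NumPy sketch (hypothetical shapes, not the class above) of what the
# cumisc.add_matvec(..., axis=1) call in the forward pass does: it broadcasts the
# intercept vector across the rows of the (n_samples, n_units) activation matrix.
import numpy as np

X = np.random.rand(4, 3)   # n_samples x n_features
W = np.random.rand(3, 5)   # n_features x n_units
b = np.random.rand(5)      # one intercept per unit
Z = X @ W + b              # CPU equivalent of cumisc.add_matvec(dot(X, W), b, axis=1)
H = np.tanh(Z)             # a hidden activation, e.g. 'tanh'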
def _dev_lin(self, devX, devW, devB):
    """Linear function on GPU.

    Returns:
        devH (gpuarray): GPU matrix with the result.
    """
    devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
    return devH
def _dev_tanh(self, devX, devW, devB):
    """Hyperbolic tangent function on GPU.

    Returns:
        devH (gpuarray): GPU matrix with the result.
    """
    devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
    cumath.tanh(devH, out=devH)
    return devH
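# Hedged usage sketch for the two helpers above. It assumes `misc` and `linalg` are
# skcuda.misc and skcuda.linalg (as the method bodies suggest) and that a CUDA context
# is available; it only illustrates that the GPU linear layer matches plain NumPy.
import numpy as np
import pycuda.autoinit              # create a CUDA context
import pycuda.gpuarray as gpuarray
from skcuda import linalg, misc

linalg.init()                       # initialize the CUBLAS handle used by linalg.dot
X = np.random.rand(8, 4)
W = np.random.rand(4, 6)
B = np.random.rand(6)
devX, devW, devB = (gpuarray.to_gpu(v) for v in (X, W, B))
H = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1).get()
np.testing.assert_allclose(H, X @ W + B)   # linear layer agrees with NumPy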
def get_distances_to_centers(self, data):
    # make sure the array is c order
    data = np.asarray(data, dtype=np.float32, order='C')

    # ship to gpu
    data_gpu = gpuarray.to_gpu(data)

    # alloc space on gpu for distances
    dists_shape = (data.shape[0], self.centers.shape[0])
    dists_gpu = gpuarray.zeros(dists_shape, np.float32)

    # calc data norms on gpu
    data_norms = cumisc.sum(data_gpu**2, axis=1)

    # calc distance on gpu
    cumisc.add_matvec(dists_gpu, self.center_norms, 1, dists_gpu)
    cumisc.add_matvec(dists_gpu, data_norms, 0, dists_gpu)
    culinalg.add_dot(data_gpu, self.centers_gpu, dists_gpu,
                     transb='T', alpha=-2.0)
    return dists_gpu
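# Hedged NumPy reference for the method above -- a sketch, assuming `centers` is an
# (n_centers, d) array and that self.center_norms holds the precomputed squared norms
# of the centers. The two add_matvec calls plus add_dot accumulate
#   dists[i, j] = ||data_i||^2 + ||centers_j||^2 - 2 * data_i . centers_j,
# i.e. squared Euclidean distances from every sample to every center.
import numpy as np

def distances_to_centers_numpy(data, centers):
    return ((data**2).sum(axis=1)[:, None]      # data norms, one per row
            + (centers**2).sum(axis=1)          # center norms, one per column
            - 2.0 * data @ centers.T)           # cross terms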
def _dev_sigm(self, devX, devW, devB):
    """Compute Sigmoid on GPU for a given array and return array."""
    # def sigm(a):
    #     block = a._block
    #     grid = (int(np.ceil(1.0 * np.prod(a.shape) / block[0])), 1)
    #     dev_sigm.prepared_call(grid, block, a.gpudata)
    #     return a
    devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
    block = devH._block
    grid = (int(np.ceil(1.0 * np.prod(devH.shape) / block[0])), 1)
    self.dev_sigm.prepared_call(grid, block, devH.gpudata)
    return devH
def demosaick_gpu(img):
    img = gp.to_gpu(img)
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    wA = p1x.shape[0]
    wB = p2x.shape[0]
    hA = p1x.shape[1]
    hB = p2x.shape[1]

    # Path 1
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    cm.exp(p1y, out=p1y)
    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2
    shI = [wB - 1, hB - 1, _numsel]
    shM = [(wB - 1) * (hB - 1), _numsel]
    p2x = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2x = lg.dot(pTT, _wts.c200)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c201, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c210, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c211, p2x)

    msc.add_matvec(p2x, _wts.c2b, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(shI)

    # conv3
    shI = [wB - 2, hB - 2, _numsel]
    shM = [(wB - 2) * (hB - 2), _numsel]
    p2y = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2y = lg.dot(pTT, _wts.c300)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c301, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c310, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c311, p2y)

    msc.add_matvec(p2y, _wts.c3b, out=p2y)
    gp.maximum(p2y, 0., p2y)

    p2x = lg.dot(p2y, _wts.sout)
    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(p1x.shape)

    # Combine
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])
    im = p2im(p1.get())
    return im
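# A minimal CPU sketch of the pattern used in the conv2/conv3 blocks above: a 2x2
# convolution evaluated as four matrix products of spatially shifted feature maps,
# accumulated, bias-added (the role of add_matvec), and passed through ReLU. The
# names w00..w11 and b are hypothetical stand-ins for _wts.c200.._wts.c211 and
# _wts.c2b; this is an illustration under those assumptions, not the project's code.
import numpy as np

def conv2x2_as_matmuls(p, w00, w01, w10, w11, b):
    # p: (H, W, C_in) feature map; w??: (C_in, C_out) weights; b: (C_out,) bias
    H, W, C = p.shape
    out = (p[:-1, :-1].reshape(-1, C) @ w00
           + p[:-1, 1:].reshape(-1, C) @ w01
           + p[1:, :-1].reshape(-1, C) @ w10
           + p[1:, 1:].reshape(-1, C) @ w11)
    out += b                            # same role as msc.add_matvec(..., out=...)
    np.maximum(out, 0.0, out=out)       # same role as gp.maximum(..., 0., ...)
    return out.reshape(H - 1, W - 1, -1)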
def _impl_test_binaryop_matvec(self, dtype):
    if issubclass(dtype, numbers.Integral):
        x = np.random.randint(1, 10, 15).reshape((3, 5)).astype(dtype)
        a = np.random.randint(1, 10, 5).reshape((1, 5)).astype(dtype)
        b = np.random.randint(1, 10, 3).reshape((3, 1)).astype(dtype)
        # the following two test correct broadcasting of 1D vectors
        c = np.random.randint(1, 10, 5).reshape((5, )).astype(dtype)
        d = np.random.randint(1, 10, 3).reshape((3, )).astype(dtype)
    else:
        x = np.random.normal(scale=5.0, size=(3, 5)).astype(dtype)
        a = np.random.normal(scale=5.0, size=(1, 5)).astype(dtype)
        b = np.random.normal(scale=5.0, size=(3, 1)).astype(dtype)
        # the following two test correct broadcasting of 1D vectors
        c = np.random.normal(scale=5.0, size=(5, )).astype(dtype)
        d = np.random.normal(scale=5.0, size=(3, )).astype(dtype)
    x_gpu = gpuarray.to_gpu(x)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)
    d_gpu = gpuarray.to_gpu(d)
    out = gpuarray.empty(x.shape, dtype=dtype)

    # addition
    res = misc.add_matvec(x_gpu, a_gpu, out=out).get()
    assert_allclose(res, x + a,
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_allclose(misc.add_matvec(x_gpu, b_gpu).get(), x + b,
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_allclose(misc.add_matvec(x_gpu, c_gpu).get(), x + c,
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_raises(ValueError, misc.add_matvec, x_gpu, d_gpu)

    # multiplication
    res = misc.mult_matvec(x_gpu, a_gpu, out=out).get()
    assert_allclose(res, x * a,
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_allclose(misc.mult_matvec(x_gpu, b_gpu).get(), x * b,
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_allclose(misc.mult_matvec(x_gpu, c_gpu).get(), x * c,
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_raises(ValueError, misc.mult_matvec, x_gpu, d_gpu)

    # division
    res = misc.div_matvec(x_gpu, a_gpu, out=out).get()
    if issubclass(dtype, numbers.Integral):
        assert_allclose(res, x // a,
                        rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
        assert_allclose(misc.div_matvec(x_gpu, b_gpu).get(), x // b,
                        rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
        assert_allclose(misc.div_matvec(x_gpu, c_gpu).get(), x // c,
                        rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    else:
        assert_allclose(res, x / a,
                        rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
        assert_allclose(misc.div_matvec(x_gpu, b_gpu).get(), x / b,
                        rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
        assert_allclose(misc.div_matvec(x_gpu, c_gpu).get(), x / c,
                        rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_raises(ValueError, misc.div_matvec, x_gpu, d_gpu)
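# Small NumPy illustration (not library code) of why the integer branch in the test
# above checks div_matvec against floor division: for integral dtypes the NumPy
# reference is `//`, while for floating dtypes it is true division `/`.
import numpy as np

x = np.array([[7, 8, 9]], dtype=np.int32)
a = np.array([2, 3, 4], dtype=np.int32)
print(x // a)   # [[3 2 2]]                 -- expected result for integer dtypes
print(x / a)    # [[3.5  2.667  2.25]]      -- expected result for float dtypes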
def add_mv(self, m, v, out):
    cumisc.add_matvec(m, v, out=out)
def squared_sum(a, b, method):
    """
    Compute the sum of squares of each row of two arrays, then their
    pairwise sums.

    Parameters
    ----------
    a : ndarray
    b : ndarray
    method : str
        Chooses the workflow for the computation. It can be 'add_togpu',
        'togpu_misc_add' or 'togpu_cuda_add'.

    Returns
    -------
    out : GPUArray
        Sum the squares of each row of the input arrays, giving two 1D
        arrays, then compute their pairwise sums to produce a 2D array.
        There are three workflows, selected by `method`:

        'add_togpu' : Compute the squared row sums of the inputs and
        perform the broadcasted element-wise summation, all on the CPU.
        Then transfer the result to the GPU as the output.

        'togpu_misc_add' : Compute the squared row sums of the inputs,
        giving two 1D arrays, and transfer them to the GPU. Create a
        `zeros` array directly on the GPU and, in two steps, add in the
        two summed arrays in a broadcasted manner using
        `skcuda.misc.add_matvec` along the rows and columns, giving the
        pairwise sums.

        'togpu_cuda_add' : Same as the previous workflow, but instead of
        `skcuda.misc.add_matvec` we roll out our own CUDA kernel, with the
        idea of having more control, specifically over threads and blocks,
        to attain the best possible performance.
    """
    c_gpu = None  # Initialize output
    if method == "add_togpu":
        c = np.einsum('ij,ij->i', a, a)[:, None] + np.einsum('ij,ij->i', b, b)
        c_gpu = gpuarray.to_gpu(c)
    elif method == "togpu_misc_add":
        a1_gpu = gpuarray.to_gpu(np.einsum('ij,ij->i', a, a)[:, None])
        b1_gpu = gpuarray.to_gpu(np.einsum('ij,ij->i', b, b))
        M, N = a.shape[0], b.shape[0]
        c_gpu = gpuarray.zeros((M, N), dtype=np.float32)
        misc.add_matvec(c_gpu, a1_gpu, out=c_gpu)
        misc.add_matvec(c_gpu, b1_gpu, out=c_gpu)
    elif method == "togpu_cuda_add":
        c_gpu = addvecs(np.einsum('ij,ij->i', a, a),
                        np.einsum('ij,ij->i', b, b))
    else:
        raise Exception("Invalid method.")
    return c_gpu
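# Hypothetical usage sketch for squared_sum above. It assumes float32 inputs, an
# initialized CUDA context, and the module's own imports (gpuarray, misc) plus the
# custom `addvecs` kernel required by 'togpu_cuda_add'. Shown only to illustrate
# that all three workflows should agree with a plain NumPy reference.
import numpy as np

a = np.random.rand(128, 64).astype(np.float32)
b = np.random.rand(256, 64).astype(np.float32)
ref = (a**2).sum(axis=1)[:, None] + (b**2).sum(axis=1)   # (128, 256) pairwise sums
for method in ("add_togpu", "togpu_misc_add", "togpu_cuda_add"):
    np.testing.assert_allclose(squared_sum(a, b, method).get(), ref, rtol=1e-5)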