def predict(self, X):
    """Gaussian-process posterior prediction, with the heavy linear
    algebra (inverse and matrix products) executed on the GPU.

    Parameters
    ----------
    X : array-like, shape (k, d)
        Query points.

    Returns
    -------
    mu : ndarray
        Posterior mean at X.
    cov : ndarray, shape (k, k)
        Posterior covariance at X.

    Prints a warning and returns None if the model has not been fit.
    """
    if not self.is_fit:
        print("GPR Model not fit yet.")
        return
    st = time.time()
    X = np.asarray(X)
    Kff = self.kernel2(self.train_X, self.train_X)  # (N, N)
    Kyy = self.kernel2(X, X)                        # (k, k)
    Kfy = self.kernel2(self.train_X, X)             # (N, k)
    et = time.time()
    print('kernel time cost', et - st, 's')
    st = time.time()
    # Jitter the diagonal for numerical stability before inverting.
    Kff_gpu = gpuarray.to_gpu(
        (Kff + 1e-7 * np.eye(len(self.train_X))).astype(np.float32))
    # Keep the inverse on the device -- the original downloaded it with
    # .get() and immediately re-uploaded it, a pointless round trip.
    Kff_inv_gpu = sklin.inv(Kff_gpu)
    et = time.time()
    print('2inv time cost', et - st, 's')
    st = time.time()
    # Upload a contiguous copy of Kfy.T; transposing a numpy array only
    # creates a strided view, which cannot be uploaded directly.  (The
    # original allocated a random array and overwrote it -- wasted work.)
    KfyT_gpu = gpuarray.to_gpu(
        np.ascontiguousarray(Kfy.T).astype(np.float32))
    Kfy_gpu = gpuarray.to_gpu(Kfy.astype(np.float32))
    trainy_gpu = gpuarray.to_gpu(self.train_y.astype(np.float32))
    # mu = Kfy.T Kff^-1 y ;  cov = Kyy - Kfy.T Kff^-1 Kfy
    gpu_t = sklin.dot(KfyT_gpu, Kff_inv_gpu)
    mu = sklin.dot(gpu_t, trainy_gpu).get()
    cov = Kyy - sklin.dot(gpu_t, Kfy_gpu).get()
    et = time.time()
    print('dot time cost', et - st, 's')
    return mu, cov
def fast_matmul(x, y, x_type, y_type):
    ''' use pycuda to compute c = a * b

    Experimental/benchmark routine: besides the main product it exercises
    several skcuda primitives (row iteration, chained dots, norm, abs,
    column sums, broadcast division) and prints timings and results.

    Returns (x.dot(y), x.dot(x.T), sum of the rows of x) as host values.
    '''
    linalg.init()
    a_gpu = gpuarray.to_gpu(x.astype(x_type))
    # x.T is a strided view; .copy() makes it contiguous before upload.
    a_t_gpu = gpuarray.to_gpu(x.T.copy().astype(x_type))
    b_gpu = gpuarray.to_gpu(y.astype(y_type))
    # row_sum = gpuarray.zeros(shape = x[0].shape, dtype = x_type)
    row_sum = 0
    # a = np.asarray(x, x_type)
    # b = np.asarray(y, y_type)
    # a_gpu = gpuarray.to_gpu(a)
    # b_gpu = gpuarray.to_gpu(b)
    t1_inside = time.time()
    c_gpu = linalg.dot(a_gpu, b_gpu)
    # Iterating a 2-D gpuarray yields its rows; accumulate them.
    for a_i in a_gpu:
        # row_sum = misc.add(row_sum, a_i)
        row_sum += a_i
    # Exercise several dot variants; the `gg` results are intentionally
    # discarded (this is benchmarking code).
    gg = linalg.dot(a_gpu, b_gpu)
    gg = linalg.dot(a_i, a_i)
    gg = reduce(linalg.dot, (a_gpu, b_gpu, b_gpu, b_gpu))
    # tmp1, tmp2 = linalg.dot(a_gpu, b_gpu), linalg.dot(b_gpu, b_gpu)
    z_gpu = a_gpu.copy()
    tmp = a_t_gpu
    # print('x.T\n', x.T)
    # print('tmp\n', tmp)
    # print('x = a_gpu: ', np.allclose(x, a_gpu.get()))
    # print('x.T = tmp: ', np.allclose(x.T, tmp.get()))
    a_prod = linalg.dot(a_gpu, tmp)
    t2_inside = time.time()
    print('inside cost {:.4f}s'.format(t2_inside - t1_inside))
    # Fresh small random matrix to sanity-check norm / abs / column sums.
    a = np.random.randint(-5, 5, (3, 4)).astype(np.float32)
    a_gpu = gpuarray.to_gpu(a)
    norm_gpu = linalg.norm(a_gpu)
    print('is norm right?', np.linalg.norm(a) == norm_gpu)
    a_gpu = abs(a_gpu)
    # Column-normalise |a|: divide each column by its sum (plus epsilon).
    column_sum = misc.sum(a_gpu, axis=0)
    column_sum = column_sum.reshape((1, -1))
    all_one_gpu = gpuarray.to_gpu(np.ones((3, 1), np.float32))
    div_mat_gpu = linalg.dot(all_one_gpu, column_sum)
    norm_1 = a_gpu / (div_mat_gpu + 1e-3)
    print(a_gpu)
    print(column_sum)
    print(column_sum.shape)
    print(norm_1)
    # abs_a = a_gpu.__abs__()
    # print(a)
    # print(abs_a)
    # c = abs_a + a_gpu
    # print(repr(c))
    # print(type(c))
    # c = 1/2 * c
    # print(a_gpu, c)
    return c_gpu.get(), a_prod.get(), row_sum.get()
def dot(self, X, Y, out=None, transa='N', transb='N'):
    """Matrix product Z = op(X) . op(Y) via skcuda, followed by a sync.

    Parameters mirror ``skcuda.linalg.dot``; if ``out`` is given the
    result is written into it in place and it is also returned.
    """
    # `if out:` would invoke truthiness on a gpuarray, which is
    # ambiguous/unsupported -- test identity against None instead.
    if out is not None:
        Z = linalg.dot(X, Y, out=out, transa=transa, transb=transb)
    else:
        Z = linalg.dot(X, Y, transa=transa, transb=transb)
    sync_only()
    return Z
def conv2d_forward_batch(self, inputs, params, bias, outputs, padding,
                         stride):
    """Forward 2-D convolution for a batch via im2col + GEMM.

    inputs: (num_images, rows, cols, channels) device array (NHWC).
    params: filter bank; axis 0 is the filter index, the remaining axes
        are flattened into one kernel of `num_kernel_params` weights.
    bias: per-filter bias, broadcast over all output pixels at the end.
    outputs: pre-allocated (num_images, out_rows, out_cols, num_filters)
        device array, written in place.
    padding: symmetric zero padding (same value used for both axes).
    stride: (row_stride, col_stride).
    """
    num_filters = params.shape[0]
    num_images, input_rows, input_cols, num_input_maps = inputs.shape
    kernel_shape = params.shape[1:]
    num_output_pixels = outputs.shape[1] * outputs.shape[2]
    num_kernel_params = np.prod(kernel_shape)
    out_shape = (num_output_pixels, num_filters)
    num_cuda_kernels = num_output_pixels * num_input_maps
    for i in range(num_images):
        # im2col buffer: one row per output pixel, one column per weight.
        col = self.zeros((num_output_pixels, num_kernel_params))
        _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                          np.int32(input_rows), np.int32(input_cols),
                          np.int32(kernel_shape[0]),
                          np.int32(kernel_shape[1]),
                          np.int32(padding), np.int32(padding),
                          np.int32(stride[0]), np.int32(stride[1]),
                          np.int32(outputs.shape[2]),
                          np.int32(num_input_maps), col.gpudata,
                          block=(NUM_CUDA_THREADS, 1, 1),
                          grid=(get_blocks(num_cuda_kernels), 1))
        reshaped_params = params.reshape(num_filters, num_kernel_params)
        # GEMM: (pixels, patch) x (patch, filters) -> per-image output.
        culinalg.dot(col, reshaped_params, transb='T',
                     out=outputs[i].reshape(out_shape))
    flat_outputs = flatten_all_but_last(outputs)
    self.add_mv(flat_outputs, bias, flat_outputs)
def cuda_dot3(A, b):
    """Compute b.T * A * b on the GPU, freeing intermediates eagerly to
    keep the peak device-memory footprint low."""
    print("cuda_dot3", A.shape, b.shape)
    b_dev = gpuarray.to_gpu(b)
    bt_dev = linalg.transpose(b_dev)
    # b is re-uploaded later; release it now to cap peak GPU memory.
    b_dev.gpudata.free()
    del b_dev
    A_dev = gpuarray.to_gpu(A)
    left_dev = linalg.dot(bt_dev, A_dev)
    bt_dev.gpudata.free()
    del bt_dev
    A_dev.gpudata.free()
    del A_dev
    b_dev = gpuarray.to_gpu(b)
    result_dev = linalg.dot(left_dev, b_dev)
    left_dev.gpudata.free()
    del left_dev
    b_dev.gpudata.free()
    del b_dev
    # Could stage through host RAM to force full GPU cleanup before
    # returning, but that has not proven necessary.
    return result_dev.get()
def computeEnergy(D_v, S, T, _Lambda, _gamma_c, Alpha, Beta):
    """Total energy E = 0.5*EL + ES + ET of the decomposition
    D_v ~ S + T + Beta.Alpha, evaluated on the GPU.

    EL: squared residual (data-fidelity) term.
    ES: _Lambda-weighted L1 sparsity term on S.
    ET: _gamma_c-weighted gradient-magnitude (TV-like) term on T.

    Returns (EL, ES, ET, E) as host scalars.

    NOTE(review): `1 / 2 * EL.get()` is 0.5*EL only under Python-3
    division; under Python 2 it evaluates to 0 -- confirm intent.
    """
    l, m, n = S.shape
    # Low-rank component Beta.Alpha, shaped like the vectorised data D_v.
    sum_alpha_beta = gpuarray.zeros_like(D_v)
    sk_linalg.dot(Beta, Alpha, out=sum_alpha_beta)
    # |grad T|: sqrt of the sum of the three squared partial derivatives.
    GR = grad(T)
    square_matrix(GR, GR)
    G_norm = gpuarray.zeros_like(T)
    sum_three_matrix(GR[0, :, :, :], GR[1, :, :, :], GR[2, :, :, :],
                     G_norm, 1.0, 1.0, 1.0)
    sqrt_matrix(G_norm, G_norm)
    # multiply_matrix(G_norm, _Gamma, G_norm)
    ET = _gamma_c * gpuarray.sum(G_norm)
    # Sparsity term: sum of _Lambda-weighted |S|.
    SP = gpuarray.zeros_like(S)
    absolute_matrix(S, SP)
    multiply_matrix(SP, _Lambda, SP)
    ES = gpuarray.sum(SP)
    # Fit residual, with S and T flattened to match D_v's column shape.
    sparse = D_v - S.reshape(l * m * n, 1) - T.reshape(l * m * n, 1) \
        - sum_alpha_beta
    square_matrix(sparse, sparse)
    EL = gpuarray.sum(sparse)
    E = 1 / 2 * EL.get() + ES.get() + ET.get()
    return EL.get(), ES.get(), ET.get(), E
def conv2d_forward_batch(self, inputs, params, bias, outputs, padding,
                         stride):
    """Forward 2-D convolution for a batch via im2col + GEMM.

    inputs: (num_images, rows, cols, channels) device array (NHWC).
    params: filter bank; axis 0 is the filter index, the remaining axes
        are flattened into one kernel of `num_kernel_params` weights.
    bias: per-filter bias, broadcast over all output pixels at the end.
    outputs: pre-allocated (num_images, out_rows, out_cols, num_filters)
        device array, written in place.
    padding: symmetric zero padding (same value used for both axes).
    stride: (row_stride, col_stride).
    """
    num_filters = params.shape[0]
    num_images, input_rows, input_cols, num_input_maps = inputs.shape
    kernel_shape = params.shape[1:]
    num_output_pixels = outputs.shape[1] * outputs.shape[2]
    num_kernel_params = np.prod(kernel_shape)
    out_shape = (num_output_pixels, num_filters)
    num_cuda_kernels = num_output_pixels * num_input_maps
    for i in range(num_images):
        col = self.zeros((num_output_pixels, num_kernel_params))
        # FIX: block/grid were swapped -- `block` must be the thread
        # count per block (NUM_CUDA_THREADS) and `grid` the block count;
        # the old order made the block size equal the number of blocks,
        # exceeding the per-block thread limit for large inputs.  This
        # now matches the other conv2d_forward_batch implementation.
        _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                          np.int32(input_rows), np.int32(input_cols),
                          np.int32(kernel_shape[0]),
                          np.int32(kernel_shape[1]),
                          np.int32(padding), np.int32(padding),
                          np.int32(stride[0]), np.int32(stride[1]),
                          np.int32(outputs.shape[2]),
                          np.int32(num_input_maps), col.gpudata,
                          block=(NUM_CUDA_THREADS, 1, 1),
                          grid=(get_blocks(num_cuda_kernels), 1))
        reshaped_params = params.reshape(num_filters, num_kernel_params)
        # GEMM: (pixels, patch) x (patch, filters) -> per-image output.
        culinalg.dot(col, reshaped_params, transb='T',
                     out=outputs[i].reshape(out_shape))
    flat_outputs = flatten_all_but_last(outputs)
    self.add_mv(flat_outputs, bias, flat_outputs)
def dot3(A, b):
    '''
    Calculates matrix multiplication "b.T*A*b" on GPU. A has to be nxn.
    Falls back to numpy when the operands exceed the GPU memory budget.
    '''
    #print("dot3 "+str(A.shape)+" "+str(b.shape))
    # Element-count budget (empirical) guarding against GPU OOM.
    if (A.size + 2 * b.size) > 629088256:
        print("Too big for GPU, using CPU.")
        return np.dot(np.dot(b.T, A), b)
    A_dev = gpuarray.to_gpu(A)
    b_dev = gpuarray.to_gpu(b)
    Ab_dev = linalg.dot(A_dev, b_dev)
    # A is not needed once A.b has been formed.
    A_dev.gpudata.free()
    del A_dev
    bt_dev = linalg.transpose(b_dev)
    b_dev.gpudata.free()
    del b_dev
    return linalg.dot(bt_dev, Ab_dev).get()
def dot3(A, b):
    '''
    Calculates matrix multiplication "b.T*A*b" on GPU.
    '''
    #print("dot3 "+str(A.shape)+" "+str(b.shape))
    A_dev = gpuarray.to_gpu(A)
    b_dev = gpuarray.to_gpu(b)
    Ab_dev = linalg.dot(A_dev, b_dev)
    # A is not needed once A.b has been formed.
    A_dev.gpudata.free()
    del A_dev
    bt_dev = linalg.transpose(b_dev)
    b_dev.gpudata.free()
    del b_dev
    return linalg.dot(bt_dev, Ab_dev).get()
def matrix_inverse(A, cuda = True):
    """Moore-Penrose pseudo-inverse via the right inverse
    A_P = A.T (A A.T)^-1.

    If A has more rows than columns (rank-n, left-inverse case) it is
    transposed first and the result transposed back.  With cuda=True the
    products and the inversion run via skcuda, otherwise plain numpy.
    """
    m, n = A.shape
    transpose = m > n
    if transpose:
        A = np.ascontiguousarray(A.T)
    At = np.ascontiguousarray(A.T)
    if cuda:
        A_gpu = gpua.to_gpu(A)
        At_gpu = gpua.to_gpu(At)
        gram_inv = linalg.inv(linalg.dot(A_gpu, At_gpu))
        pinv = linalg.dot(At_gpu, gram_inv).get()
    else:
        pinv = At.dot(np.linalg.inv(A.dot(At)))
    return pinv.T if transpose else pinv
def CalcNoise(self, cut=False, mode=0):
    """Estimate the noise level of the FFT'd data and normalise by it.

    mode 0: estimate noise per 100-sample block from real/imag sums of
        squares, divide the data by the per-block noise, and (with
        cut=True) replace >6-sigma outliers with unit-normal draws.
    mode 1: single noise estimate from the upper half of the spectrum.

    Side effects: rescales self.Real / self.Imag / self.gpu_fft_data in
    place and stores inverse-noise weights in self.Noise (device array).
    """
    step = 100
    blocks = np.ceil(1.0 * self.FSamps / step).astype(np.int)
    noisevec = np.zeros(self.FSamps)
    if (mode == 0):
        for i in range(blocks):
            start = i * step
            stop = (i + 1) * step
            # Last block may be short; clamp to the sample count.
            if (stop > self.FSamps):
                print "stop!", i, stop
                stop = self.FSamps
            # Sums of squares of the block via GPU dot products.
            r2 = cula.dot(self.Real[start:stop], self.Real[start:stop:])
            i2 = cula.dot(self.Imag[start:stop], self.Imag[start:stop])
            noisesamps = len(self.Imag[start:stop]) * 2
            noise = np.sqrt((r2 + i2) / noisesamps)
            noisevec[start:stop] = 1.0 / noise
            self.Real[start:stop] /= noise
            self.Imag[start:stop] /= noise
            self.gpu_fft_data[start:stop] /= noise
            #print "noise", i, noise
        if (cut == True):
            # Replace anything beyond 6 sigma with fresh N(0,1) draws
            # (data was just normalised to unit noise above).
            fftdataR = (self.gpu_fft_data.get()).real
            fftdataI = (self.gpu_fft_data.get()).imag
            Rbad = np.where(np.abs(fftdataR) > 6.0)[0]
            Ibad = np.where(np.abs(fftdataI) > 6.0)[0]
            print fftdataR[Rbad], fftdataI[Ibad]
            NRbad = len(Rbad)
            NIbad = len(Ibad)
            fftdata = self.gpu_fft_data.get()
            fftdata.real[Rbad] = np.random.normal(0, 1, NRbad)
            fftdata.imag[Ibad] = np.random.normal(0, 1, NIbad)
            print "bad", NRbad, NIbad
            self.gpu_fft_data = gpuarray.to_gpu(np.complex128(fftdata))
    if (mode == 1):
        #r2 = cula.dot(self.Real[self.FSamps/2:], self.Real[self.FSamps/2:])
        #i2 = cula.dot(self.Imag[self.FSamps/2:], self.Imag[self.FSamps/2:])
        fftD = self.gpu_fft_data.get()
        # Use only the upper half of the spectrum for the estimate.
        r2 = np.dot(fftD.real[self.FSamps / 2:],
                    fftD.real[self.FSamps / 2:])
        i2 = np.dot(fftD.imag[self.FSamps / 2:],
                    fftD.imag[self.FSamps / 2:])
        noisesamps = len(fftD.imag[self.FSamps / 2:]) * 2
        del fftD
        noise = np.sqrt((r2 + i2) / noisesamps)
        print "Noise mode 1: ", noise
        #self.Real = self.Real / noise
        #self.Imag = self.Imag / noise
        self.gpu_fft_data = self.gpu_fft_data / noise
        noisevec[:] = 1.0 / noise
    #self.Noise = gpuarray.empty(self.FSamps, np.float64)
    self.Noise = gpuarray.to_gpu(np.float64(noisevec))
def getTranformada_Inversa(test_image, diagonal):
    """Apply `diagonal` to both axes of `test_image` on the GPU:
    returns dot(dot(test_image.T, diagonal).T, diagonal) as a host array.
    """
    img_dev = gpuarray.to_gpu(test_image.astype(np.float32))
    diag_dev = gpuarray.to_gpu(diagonal.astype(np.float32))
    # First pass over the columns...
    half = linalg.dot(linalg.transpose(img_dev), diag_dev)
    # ...then over the rows of the intermediate result.
    full = linalg.dot(linalg.transpose(half), diag_dev)
    return full.get()
def getTranformada(test_image, diagonal):
    """Apply `diagonal` to both axes of `test_image` on the GPU.

    Multiplies each row by the diagonal, then each column (via a
    transpose), returning the transposed result as a host array.
    """
    # Cast BOTH operands to float32: skcuda's dot needs matching dtypes.
    # The sibling getTranformada_Inversa already casts test_image; this
    # function previously only cast `diagonal`, so a float64 image would
    # fail.  Casting is backward compatible for float32 callers.
    test_image = test_image.astype(np.float32)
    diagonal = diagonal.astype(np.float32)
    test_image = gpuarray.to_gpu(test_image)
    diagonal = gpuarray.to_gpu(diagonal)
    # Multiply each row by the diagonal.
    testimage_gpu = linalg.dot(test_image, diagonal)
    testimageT_gpu = linalg.transpose(testimage_gpu)
    # Then each column (of the original orientation).
    testimage_gpu = linalg.dot(testimageT_gpu, diagonal)
    testimageT_gpu = linalg.transpose(testimage_gpu)
    return testimageT_gpu.get()
def computeLambda(self):
    """Compute the regularisation bound lambda = max_i ||C[i, :]||, where
    C = data^T . data (or data^T . (mean - data) in the affine case).

    Uses the GPU path when self.GPU is set, numpy otherwise.
    """
    print('\t\tComputing lambda...')
    T = np.zeros(self.num_columns)
    if (self.GPU == True):
        if not self.affine:
            gpu_data = gpuarray.to_gpu(self.data)
            C_gpu = linalg.dot(gpu_data, gpu_data, transa='T')
            for i in xrange(self.num_columns):
                T[i] = linalg.norm(C_gpu[i, :])
        else:
            gpu_data = gpuarray.to_gpu(self.data)
            # affine transformation
            y_mean_gpu = misc.mean(gpu_data, axis=1)
            # Download the mean ONCE; the original called .get() inside
            # the column loop, paying a device->host transfer per column.
            y_mean = y_mean_gpu.get()
            # creating affine matrix to subtract to the data (may
            # encounter problem with strides)
            aff_mat = np.zeros([self.num_rows, self.num_columns]).astype('f')
            for i in xrange(0, self.num_columns):
                aff_mat[:, i] = y_mean
            aff_mat_gpu = gpuarray.to_gpu(aff_mat)
            gpu_data_aff = misc.subtract(aff_mat_gpu, gpu_data)
            C_gpu = linalg.dot(gpu_data, gpu_data_aff, transa='T')
            # computing euclidean norm (rows)
            for i in xrange(self.num_columns):
                T[i] = linalg.norm(C_gpu[i, :])
    else:
        if not self.affine:
            T = np.linalg.norm(np.dot(self.data.T, self.data), axis=1)
        else:
            # affine transformation
            y_mean = np.mean(self.data, axis=1)
            tmp_mat = np.outer(y_mean,
                               np.ones(self.num_columns)) - self.data
            T = np.linalg.norm(np.dot(self.data.T, tmp_mat), axis=1)
    _lambda = np.amax(T)
    return _lambda
def test_dot_matrix_h_complex128(self):
    """linalg.dot with the 'c' (conjugate-transpose) flag, complex128,
    checked for both C- and F-ordered operands."""
    a = np.asarray(np.random.rand(2, 4) + 1j * np.random.rand(2, 4),
                   np.complex128)
    b = np.asarray(np.random.rand(2, 2) + 1j * np.random.rand(2, 2),
                   np.complex128)
    expected = np.dot(a.conj().T, b)
    for order in ("C", "F"):
        dev_a = gpuarray.to_gpu(a.astype(np.complex128, order=order,
                                         copy=True))
        dev_b = gpuarray.to_gpu(b.astype(np.complex128, order=order,
                                         copy=True))
        dev_c = linalg.dot(dev_a, dev_b, 'c')
        assert np.allclose(expected, dev_c.get())
def test_dot_vector_complex128(self):
    """Vector-vector linalg.dot for complex128, C- and F-ordered."""
    a = np.asarray(np.random.rand(5), np.complex128)
    b = np.asarray(np.random.rand(5), np.complex128)
    expected = np.dot(a, b)
    for order in ("C", "F"):
        dev_a = gpuarray.to_gpu(a.astype(np.complex128, order=order,
                                         copy=True))
        dev_b = gpuarray.to_gpu(b.astype(np.complex128, order=order,
                                         copy=True))
        inner = linalg.dot(dev_a, dev_b)
        assert np.allclose(expected, inner)
def thunk():
    """Gradient evaluation for the two-branch mixture model on the GPU.

    NOTE(review): reads `inputs`, `A`, `b`, `x`, `depth`, `self`,
    `node`, `outputs` and `compute_map` from the enclosing scope --
    presumably a Theano make_thunk-style factory (theano.config.floatX
    is used below); confirm against the surrounding code.
    Writes (dalpha, dLq_t, dLq_f) into `outputs` as host arrays.
    """
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
    # Row-normalised exponential responses for the two branches
    # (softmax of x.A + b along axis 1).
    Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
    Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
    Xtn = misc.sum(Xt, axis=1, keepdims=True)
    Xfn = misc.sum(Xf, axis=1, keepdims=True)
    Xt = misc.divide(Xt, Xtn)
    Xf = misc.divide(Xf, Xfn)
    # Mixture weights: alpha-blend of the two branch responses.
    w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
    dq = Xt - Xf
    qdw = dq / w
    # Gradient w.r.t. alpha, assembled from three sum terms.
    t1 = misc.sum(x * qdw, axis=1)
    f = 2 * depth + self.base.n
    t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
    t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
    dalpha = t1 - t2 + t3
    del dq, t1, f, t2, t3
    iw = 1 / w
    S1 = misc.multiply(
        depth[:, None] * (self.base.n - 1) / self.base.n, iw)
    S2 = (self.base.n + depth[:, None]) / cumath.log(
        misc.sum(w, axis=1, keepdims=True))
    F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
    del w, iw, S1, S2
    # Per-sample chain rule through the normalisation for both branches.
    cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]),
                          dtype=theano.config.floatX)
    dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
    dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)
    for i in range(Xt.shape[0]):
        S1 = misc.multiply(Xt[None, i, :], A)
        S2 = misc.sum(S1, axis=1, keepdims=True)
        S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
        dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                               axis=1)
        S1 = misc.multiply(Xf[None, i, :], A)
        S2 = misc.sum(S1, axis=1, keepdims=True)
        S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
        dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                               axis=1)
    outputs[0][0] = dalpha.get()
    outputs[1][0] = dLq_t.get()
    outputs[2][0] = dLq_f.get()
    # Mark all outputs as computed for the runtime.
    for v in node.outputs:
        compute_map[v][0] = True
def _get_XTX_cuda(self, X, x_1, x_2, y_1, y_2):
    """Dot product between two normalised row-slices of X on the GPU.

    Parameters
    ----------
    X: array
        Array to normalize
    x_1, x_2: int
        Lower/upper bounds of the slice on the x-axis
    y_1, y_2: int
        Lower/upper bounds of the slice on the y-axis

    Returns
    -------
    XX.T: array
        X X.T array
    """
    front_dev = gpuarray.to_gpu(X[x_1:x_2, :])
    back_dev = gpuarray.to_gpu(X[y_1:y_2, :])
    front_dev = self._cuda_norm(front_dev)
    back_dev = self._cuda_norm(back_dev)
    return linalg.dot(front_dev, back_dev, transb="T").get()
def __calc_A_shift_gpu(self, shift_x, shift_y): psis_gpu = self.converter.get_prolates_as_images() # TODO: need to assert that returns indeed a gpuarray n_psis = len(psis_gpu) if shift_x == 0 and shift_y == 0: return np.eye(n_psis) A_shift = gpuarray.zeros((n_psis, n_psis),'complex64') non_neg_freqs = self.converter.get_non_neg_freq_inds() psis_gpu_non_neg_freqs = psis_gpu[non_neg_freqs] psis_non_neg_shifted = circ_shift_kernel.circ_shift(psis_gpu_non_neg_freqs, shift_x, shift_y) psis_non_neg_shifted = self.converter.mask_points_inside_the_circle(psis_non_neg_shifted) psis_non_neg_shifted = psis_non_neg_shifted.reshape(len(psis_non_neg_shifted), -1) psis_gpu = psis_gpu.reshape(n_psis, -1) A_shift[non_neg_freqs] = linalg.dot(psis_non_neg_shifted, psis_gpu, transb='C') zero_freq_inds = self.converter.get_zero_freq_inds() pos_freq_inds = self.converter.get_pos_freq_inds() neg_freq_inds = self.converter.get_neg_freq_inds() A_shift[neg_freq_inds, zero_freq_inds] = A_shift[pos_freq_inds, zero_freq_inds] A_shift[neg_freq_inds, pos_freq_inds] = A_shift[pos_freq_inds, neg_freq_inds] A_shift[neg_freq_inds, neg_freq_inds] = A_shift[pos_freq_inds, pos_freq_inds] A_shift[neg_freq_inds] = linalg.conj(A_shift[neg_freq_inds]) # TODO: get rid of the transpose # return np.transpose(A_shift).copy() return np.transpose(A_shift).get().copy()
def forward(self, bottom, top):
    """CRF loss forward pass.

    For each image i:
        diff_i = bottom[1][i] . bottom[0][i] + bottom[2][i] - bottom[3][i]
    top[0] = sum(diff^2) / batch / 2.  Also caches the transposed,
    batch-normalised residual on the GPU in self.transDiff for backward.

    Shapes (from the original debug notes): bottom[0] (batch, 11),
    bottom[1] (batch, 65536, 11), bottom[2]/bottom[3] (batch, 65536).
    """
    with pu.caffe_cuda_context():
        linalg.init()
        for i in range(self.diff.shape[0]):
            #a = bottom[1].data_as_pycuda_gpuarray()
            #b = bottom[0].data_as_pycuda_gpuarray()
            a = bottom[1].data[i, :, :].astype(np.float32)
            b = bottom[0].data[i, :].astype(np.float32)
            ##a = np.asarray(np.random.rand(4, 4), dtype=np.float32)
            ##b = np.asarray(np.random.rand(4), dtype=np.float32)
            #a_gpu = gpuarray.GPUArray(a, dtype=np.float32)
            #b_gpu = gpuarray.GPUArray(b, dtype=np.float32)
            a_gpu = gpuarray.to_gpu(a)
            b_gpu = gpuarray.to_gpu(b)
            # NOTE(review): c_gpu is computed but never used below --
            # the residual is recomputed with np.dot on the CPU instead.
            c_gpu = linalg.dot(a_gpu, b_gpu)
            #self.diff[i,:] = c_gpu + bottom[2].data[i,:] - bottom[3].data[i,:];
            self.diff[i, :] = np.dot(
                bottom[1].data[i, :, :], bottom[0].data[
                    i, :]) + bottom[2].data[i, :] - bottom[3].data[i, :]
        top[0].data[...] = np.sum(self.diff**2) / bottom[3].num / 2.
        #self.transDiff = np.transpose(self.diff / bottom[3].num); # (65536, 50)
        a_gpu = gpuarray.to_gpu(self.diff / bottom[3].num)
        at_gpu = linalg.transpose(a_gpu)
        # Kept as a gpuarray for the backward pass.
        self.transDiff = at_gpu
def initParallelAlgorithms():
    """Compile the CUDA kernels used by this module and warm them up.

    Loads bitonicSort / finishCSM / getSumSquares from their .cu sources
    into module-level globals, then runs each primitive once on dummy
    data so the first real call does not pay compile/launch setup cost.
    """
    global bitonicSort_
    fin = open("ParallelAlgorithms/bitonicSort.cu")
    mod = SourceModule(fin.read())
    fin.close()
    bitonicSort_ = mod.get_function("bitonicSort")
    global finishCSM_
    global getSumSquares_
    fin = open("ParallelAlgorithms/CSMHelper.cu")
    mod = SourceModule(fin.read())
    fin.close()
    finishCSM_ = mod.get_function("finishCSM")
    getSumSquares_ = mod.get_function("getSumSquares")
    #Run each of the algorithms on dummy data so that they're pre-compiled
    #1) Bitonic Sort
    X = np.random.randn(16, 16)
    N = np.int32(16)
    NPow2 = N
    # NOTE(review): N/2 relies on integer division (Python 2 / np.int32);
    # under true division this would be a float in the block tuple.
    NThreads = N/2
    XG = gpuarray.to_gpu(X)
    bitonicSort_(XG, N, NPow2, block=(NThreads, 1, 1),
                 grid=(X.shape[0], 1), shared=4*NPow2)
    linalg.init()
    #2) Other primitive operations (results discarded -- warm-up only)
    NegXDotX = linalg.dot(XG, XG)
    XPlusX = skcuda.misc.add(XG, XG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    XPlusCol = skcuda.misc.add_matvec(XG, XSqr, 0)
def _correlate_matmul_cublas(self, frames_flat, mask):
    """Correlation via cuBLAS matmul over the masked pixels.

    Builds the frames x frames outer-product matrix and the matching
    mean-product denominator on the GPU, sums their diagonals into the
    pre-allocated buffers, and returns the normalised diagonal sums.
    """
    arr = np.ascontiguousarray(frames_flat[:, mask], dtype=np.float32)
    n_pix = arr.shape[1]
    # Pre-allocating memory for all bins might save a bit of time,
    # but would take more memory
    d_frames = garray.to_gpu(arr)
    d_numerator = cublas.dot(d_frames, d_frames, transb="T",
                             handle=self.cublas_handle)
    d_row_means = skmisc.mean(d_frames, axis=1, keepdims=True)
    d_denominator = cublas.dot(d_row_means, d_row_means, transb="T",
                               handle=self.cublas_handle)
    self.sum_diagonals(d_numerator, self.d_sumdiags1)
    self.sum_diagonals(d_denominator, self.d_sumdiags2)
    self.d_sumdiags1 /= self.d_sumdiags2
    self.d_sumdiags1 /= n_pix
    return self.d_sumdiags1.get()
def dot_accum(a, b):
    """Pairwise squared Euclidean distances between rows of two 2-D
    arrays of n-dimensional points.

    The cross term -2*a.b^T is computed on the GPU; the squared row
    norms of `a` and `b` are then accumulated on the host via einsum
    broadcasting, giving ||a_i||^2 - 2 a_i.b_j + ||b_j||^2.

    Parameters
    ----------
    a, b : ndarray
        2-D float arrays, one point per row.

    Returns
    -------
    out : ndarray
        Matrix of squared Euclidean distances.
    """
    dev_a = gpuarray.to_gpu(-2 * a)
    dev_b = gpuarray.to_gpu(b)
    dists = culinalg.dot(dev_a, dev_b, transb='T').get()
    dists += np.einsum('ij,ij->i', a, a)[:, None]
    dists += np.einsum('ij,ij->i', b, b)
    return dists
def _rbf_kernel_vectorized_cublas(data1, data2, sigma=10):  # pragma: no cover
    """RBF kernel for all row pairs, squared distances done on the GPU.

    Args:
        data1 (TYPE): pssm data 1
        data2 (TYPE): pssm data 2
        sigma (int, optional): exponent of the exponential

    Returns:
        np.array: flattened rbf kernel values for all the pairs
    """
    denom = 2 * sigma**2
    dev1 = gpuarray.to_gpu(data1.astype(np.float32))
    dev2 = gpuarray.to_gpu(data2.astype(np.float32))
    # Squared distances: -2*x.y^T + ||x||^2 + ||y||^2, built in place.
    dist = -2 * culinalg.dot(dev1, dev2, transa='N', transb='T')
    cumisc.add_matvec(dist, cumisc.sum(dev1**2, axis=1)[:, None],
                      out=dist)
    cumisc.add_matvec(dist, cumisc.sum(dev2**2, axis=1), out=dist)
    return np.exp(-dist.get() / denom).reshape(-1)
def _dev_lin(self, devX, devW, devB):
    """Linear (identity) activation on the GPU.

    Returns:
        devH (gpuarray): H = X.W + B, with B broadcast across rows.
    """
    product = linalg.dot(devX, devW)
    return misc.add_matvec(product, devB, axis=1)
def compute_analysis_cuda2(self, xb, y, R, P, H, HT=None, hph=None,
                           calcP=True):
    """Kalman-filter analysis (update) step on the GPU.

    xhat = xb + W (y - H xb) with gain W = (H P)^T (R + H P H^T)^-1.
    If calcP, also computes Phat = (I - W H) P; otherwise returns a
    1-element zero placeholder with P's dtype.

    HT / hph may be supplied to reuse a precomputed H^T or H P H^T.
    """
    if HT is None:
        HT = culinalg.transpose(H)
    HP = culinalg.dot(H, P)
    if hph is None:
        hph = culinalg.dot(HP, HT)
    # Innovation covariance: R + H P H^T.
    Rhph = misc.add(R, hph)
    inv = culinalg.inv(Rhph)
    # transa='T' gives (HP)^T . inv, i.e. P^T H^T inv -- equal to the
    # textbook P H^T inv when P is symmetric (assumed for a covariance).
    W = culinalg.dot(HP, inv, transa='T')
    Hxb = culinalg.dot(H, xb)
    yHxb = misc.subtract(y, Hxb)
    WyHxb = culinalg.dot(W, yHxb)
    xhat = misc.add(xb, WyHxb)
    #xhat = xb + culinalg.dot(W, (y - culinalg.dot(H, xb)))
    if calcP:
        I = culinalg.eye(P.shape[0])
        WH = culinalg.dot(W, H)
        IWH = I - WH
        Phat = culinalg.dot(IWH, P)
    else:
        Phat = misc.zeros((1, ), dtype=P.dtype)
    return xhat, Phat
def NNMF_gpu(X,r,tol,V=v0,W=w0,verbose=1):
    """Rank-r non-negative matrix factorisation X ~ V.W on the GPU using
    multiplicative updates.

    Stops when the squared Frobenius loss changes by <= tol*(L+1).
    Returns (V_gpu, W_gpu, iteration); the factors remain on the device.

    Note: the default factors v0/w0 are sliced with .copy() before use,
    so the (mutable) defaults themselves are never modified.
    """
    Vr = V[:,0:r].copy()
    Wr = W[0:r,:].copy()
    X_gpu = gpuarray.to_gpu(X)
    V_gpu = gpuarray.to_gpu(Vr)
    W_gpu = gpuarray.to_gpu(Wr)
    #Frobinius norm at previous step
    B_gpu = linalg.dot(V_gpu, W_gpu)
    L = linalg.norm(X_gpu-B_gpu)**2
    iteration = 0
    while 1:
        #update V: V <- V * (X W^T) / (B W^T), elementwise
        V_gpu *= linalg.dot(X_gpu,linalg.transpose(W_gpu))
        V_gpu /= linalg.dot(B_gpu,linalg.transpose(W_gpu))
        B_gpu = linalg.dot(V_gpu, W_gpu)
        #update W: W <- W * (V^T X) / (V^T B), elementwise
        W_gpu *= linalg.dot(linalg.transpose(V_gpu),X_gpu)
        W_gpu /= linalg.dot(linalg.transpose(V_gpu),B_gpu)
        B_gpu = linalg.dot(V_gpu, W_gpu)
        Lnew = linalg.norm(X_gpu-B_gpu)**2
        # Relative-change stopping criterion.
        if abs(Lnew-L) <= tol*(L+1):
            break
        else:
            L = Lnew
            iteration += 1
            if(verbose and iteration%50==0):
                print "At iteration %i, the loss is %.2f" %(iteration, L)
    return V_gpu,W_gpu,iteration
def _dot_matrix_tests(self, dtype, transa, transb):
    """Check linalg.dot against np.dot for the given transpose flags,
    with both C- and F-ordered operands."""
    a = np.asarray(np.random.rand(4, 2), dtype)
    b_shape = (2, 2) if transa == 'n' else (4, 4)
    b = np.asarray(np.random.rand(*b_shape), dtype)
    aa = a if transa == 'n' else a.T
    bb = b if transb == 'n' else b.T
    expected = np.dot(aa, bb)
    for order in ("C", "F"):
        dev_a = gpuarray.to_gpu(a.astype(dtype, order=order, copy=True))
        dev_b = gpuarray.to_gpu(b.astype(dtype, order=order, copy=True))
        dev_c = linalg.dot(dev_a, dev_b, transa, transb)
        assert np.allclose(expected, dev_c.get())
def _dev_tanh(self, devX, devW, devB):
    """Hyperbolic-tangent activation on the GPU.

    Returns:
        gpuarray: H = tanh(X.W + B); tanh is applied in place on the
        pre-activation buffer.
    """
    pre_act = linalg.dot(devX, devW)
    pre_act = misc.add_matvec(pre_act, devB, axis=1)
    cumath.tanh(pre_act, out=pre_act)
    return pre_act
def dot(a, b):
    '''
    Calculates matrix multiplication "a*b" on GPU.
    '''
    #print("dot "+str(a.shape)+" "+str(b.shape))
    dev_a = gpuarray.to_gpu(a)
    dev_b = gpuarray.to_gpu(b)
    return linalg.dot(dev_a, dev_b).get()
def bigdot(self, X, Y, out=None, mask=None):
    """Matrix product Z = X.Y, dispatching to the sparse path when X is
    a cusparse.CSR matrix, then syncing the GPU.

    `out` is not supported (raises); `mask` is accepted but unused.
    """
    # `if out:` would invoke ambiguous truthiness on a gpuarray
    # argument -- test identity against None instead.
    if out is not None:
        raise Exception("Not implemented in-place sparse multiplication")
    if type(X) == cusparse.CSR:
        Z = X.mm(Y)
    else:
        Z = linalg.dot(X, Y)
    sync_only()
    return Z
def thunk():
    """Scalar objective evaluation for the two-branch mixture model.

    NOTE(review): reads `inputs`, `A`, `b`, `x`, `depth`, `self`,
    `node`, `outputs` and `compute_map` from the enclosing scope --
    presumably a make_thunk-style factory; confirm against the
    surrounding code.  Stores the summed objective in outputs[0][0].
    """
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
    # Row-normalised exponential responses (softmax of x.A + b).
    Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
    Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
    Xtn = misc.sum(Xt, axis=1, keepdims=True)
    Xfn = misc.sum(Xf, axis=1, keepdims=True)
    Xt = misc.divide(Xt, Xtn)
    Xf = misc.divide(Xf, Xfn)
    # Mixture weights: alpha-blend of the two branch responses.
    w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
    wp = cumath.log(w)
    # Centre the log-weights by their row mean (divided by self.n).
    wpn = misc.sum(wp, axis=1, keepdims=True) / self.n
    wp = misc.subtract(wp, wpn)
    t1 = misc.sum(x * wp, axis=1)
    t2 = (self.n + depth) * cumath.log(misc.sum(w, axis=1))
    t3 = depth * wpn
    outputs[0][0] = misc.sum(t1 - t2 + t3).get()
    # Mark all outputs as computed for the runtime.
    for v in node.outputs:
        compute_map[v][0] = True
def mldivide(A, B):
    '''
    Solve A.X = B on the GPU via explicit inversion: X = inv(A).B.
    (CULA would be necessary for a proper factorisation-based solve. :-/)
    '''
    A_gpu = gpuarray.to_gpu(A)
    # FIX: invert the DEVICE copy -- the original passed the host array
    # `A` to skcuda's inv, which expects a GPUArray.
    A_inv_gpu = linalg.inv(A_gpu)
    A_gpu.gpudata.free()
    del(A_gpu)
    B_gpu = gpuarray.to_gpu(B)
    out_gpu = linalg.dot(A_inv_gpu, B_gpu)
    return out_gpu.get()
def _dot_matrix_vector_tests(self, dtype):
    """Matrix@vector, vector@matrix and matrix@(n,1) cases of
    linalg.dot, each checked against np.dot."""
    shape_pairs = [
        ((4, 4), (4,)),
        ((4,), (4, 4)),
        ((4, 4), (4, 1)),
    ]
    for shape_a, shape_b in shape_pairs:
        a = np.asarray(np.random.rand(*shape_a), dtype)
        b = np.asarray(np.random.rand(*shape_b), dtype)
        dev_c = linalg.dot(gpuarray.to_gpu(a), gpuarray.to_gpu(b))
        assert np.allclose(np.dot(a, b), dev_c.get())
def skcuda_linalg(a, b):
    """Cosine-similarity-style score on the GPU: (a.T.b) / (|a|*|b|).

    The result is printed (Python-2 print statement), not returned.
    """
    linalg.init()
    a = np.asarray(a, np.float32)
    b = np.asarray(b, np.float32)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    # 'T' flag: product is a.T.dot(b).
    c_gpu = linalg.dot(a_gpu, b_gpu, 'T')
    a_nrm = linalg.norm(a_gpu)
    b_nrm = linalg.norm(b_gpu)
    type(a_nrm)  # no-op; presumably leftover debugging
    ans = misc.divide(c_gpu, a_nrm * b_nrm)
    print ans
def logis(y,x):
    """Logistic-regression fit via iteratively reweighted least squares
    (Fisher scoring) with the linear algebra on the GPU.

    y: response vector; x: design matrix.  Iterates
        beta <- (X^T V X)^-1 X^T V z,  z = X beta + Ve^-1 (y - mu)
    until the coefficient change is < 1e-5 or 10 iterations.

    Returns {"iteraciones": j, "Betas": beta (host), "time": seconds}.
    """
    end = 0
    start = 0
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    start=time.time()
    # Move the data to the GPU
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    linalg.init()
    # Transpose of X
    x_gpu_T = linalg.transpose(x_gpu)
    # OLS starting point: beta0 = (X^T X)^-1 X^T y
    beta_gpu = linalg.dot(linalg.dot(linalg.inv(linalg.dot(x_gpu_T, x_gpu)), x_gpu_T), y_gpu)
    j = 1
    while(True):
        # mu = fitted probabilities, evaluated on the host by sapply
        mu = sapply(x, beta_gpu.get())
        mu = mu.astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu)
        V_gpu = linalg.diag(mu_gpu)
        # Variance function mu*(1-mu) and its inverse as a diagonal.
        f2_gpu = linalg.multiply(mu_gpu, 1 - mu_gpu)
        f3_gpu = linalg.diag(1 / f2_gpu)
        f4_gpu = (y_gpu - mu_gpu)
        f5_gpu = linalg.dot(f3_gpu, f4_gpu)
        # Patch NaNs (saturated probabilities) on the host.
        if(np.isnan(f5_gpu.get()).any()):
            f5_cpu = f5_gpu.get()
            f5_cpu = nanValue(f5_cpu)
            f5_gpu = gpuarray.to_gpu(f5_cpu.astype(np.float32))
        # Working response z = X beta + Ve^-1 (y - mu).
        y_1_gpu = linalg.dot(x_gpu, beta_gpu) + f5_gpu
        # Weighted least-squares step.
        beta_1_gpu = linalg.dot(linalg.dot(linalg.dot(linalg.inv(linalg.dot(linalg.dot(x_gpu_T, V_gpu), x_gpu)), x_gpu_T), V_gpu), y_1_gpu)
        check_value = np.absolute(linalg.norm(beta_1_gpu - beta_gpu))
        #if(check_value<0.00001):
        #break
        if(j == 10 or check_value < 0.00001):
            break
        beta_gpu = beta_1_gpu
        j = j + 1
    end = time.time()
    tiempo = (end - start)
    return {"iteraciones": j, "Betas": beta_gpu.get(), "time": tiempo}
def getCSMGPU(XG, YG):
    """Cross squared-distance matrix on the GPU:
    C[i, j] = |X_i|^2 - 2 X_i.Y_j + |Y_j|^2 (result stays on device)."""
    tbegin = time.time()
    neg_two = gpuarray.to_gpu(np.array([-2.0], dtype=np.float32))
    # Row squared norms of both point sets.
    x_sqr = skcuda.misc.sum(skcuda.misc.multiply(XG, XG), 1)
    y_sqr = skcuda.misc.sum(skcuda.misc.multiply(YG, YG), 1)
    # Cross term, then broadcast-add the norms along each axis.
    cross = linalg.dot(XG, linalg.transpose(YG))
    cross = skcuda.misc.multiply(neg_two, cross)
    skcuda.misc.add_matvec(cross, x_sqr, 0, cross)
    skcuda.misc.add_matvec(cross, y_sqr, 1, cross)
    return cross
def safe_sparse_dot(a, b, transa='N', transb='N', dense_output=True):
    '''Dot product wrapper around ``culinalg.dot``.

    Parameters
    ----------
    a : array
    b : array
    transa, transb : str
        Transpose flags forwarded to ``culinalg.dot``.
    dense_output : bool
        Accepted for API compatibility; currently unused.

    Returns
    -------
    dot_product : array

    NOTE(review): despite the name, no sparse-matrix handling is
    implemented yet -- this always delegates to the dense GPU dot.
    (The original had a dead, unterminated string literal AFTER the
    return statement; it has been folded into this docstring.)
    '''
    return culinalg.dot(a, b, transa, transb)
def _dev_sigm(self, devX, devW, devB):
    """Sigmoid activation on the GPU.

    Computes the pre-activation X.W + B, then applies the prepared
    elementwise sigmoid kernel to the buffer in place and returns it.
    """
    devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
    threads = devH._block
    grid_x = int(np.ceil(1.0 * np.prod(devH.shape) / threads[0]))
    self.dev_sigm.prepared_call((grid_x, 1), threads, devH.gpudata)
    return devH
def dot(a, b):
    '''
    Calculates matrix multiplication "a*b" on GPU, falling back to
    numpy when the operands plus result would exceed the memory budget.
    '''
    #print("dot "+str(a.shape)+" "+str(b.shape))
    # Element-count budget (empirical) guarding against GPU OOM.
    needed = a.size + b.size + a.shape[0] * b.shape[1]
    if needed > 629088256:
        return np.dot(a, b)
    dev_a = gpuarray.to_gpu(a)
    dev_b = gpuarray.to_gpu(b)
    return linalg.dot(dev_a, dev_b).get()
def _predict(self, X, dev=False):
    """Predict a batch of data.

    Auxiliary function that implements a particular prediction; for
    prediction use `ELM.predict()` instead.

    Args:
        X (matrix): input data size (N * `inputs`)
        dev (bool, optional): whether to leave the result in GPU memory

    Returns:
        Y (matrix): predicted outputs size (N * `outputs`), always in
        float/double format.
    """
    assert self.B is not None, "Solve the task before predicting"
    devH = self._project(X, dev=True)
    devY = linalg.dot(devH, self.B)
    if dev:
        return devY
    return devY.get()
def inner(a, b):
    '''
    Calculates inner product of "a" and "b" on the GPU (operands are
    coerced to matrices, so the product is a . b^T).
    '''
    #print("inner "+str(a.shape)+" "+str(b.shape))
    dev_a = gpuarray.to_gpu(np.matrix(a))
    dev_b = gpuarray.to_gpu(np.matrix(b))
    dev_out = linalg.dot(dev_a, dev_b, transb='T')
    # Release the input buffers before downloading the result.
    for buf in (dev_a, dev_b):
        buf.gpudata.free()
    del dev_a, dev_b
    return dev_out.get()
def cuda_dot2(b, A):
    """Compute b.T * A on the GPU, freeing all device buffers before
    returning the host result."""
    print("cuda_dot2", b.shape, A.shape)
    dev_b = gpuarray.to_gpu(b)
    dev_bt = linalg.transpose(dev_b)
    dev_A = gpuarray.to_gpu(A)
    dev_out = linalg.dot(dev_bt, dev_A)
    # Release everything except the result buffer.
    for buf in (dev_b, dev_bt, dev_A):
        buf.gpudata.free()
    del dev_b, dev_bt, dev_A
    return dev_out.get()
def getCSMGPU2(XG, YG):
    """Cross squared-distance matrix using the custom CUDA helpers
    (getSumSquares_ / finishCSM_) compiled by initParallelAlgorithms.

    Returns (CSM, XSqr, YSqr) where XSqr/YSqr are the row squared norms
    of XG/YG kept on the device.
    """
    #Step 1: Sum of squares across rows
    dim = np.int32(XG.shape[1])
    dimpow2 = roundUpPow2(dim)
    # One block per row; `shared` reserves 4 bytes per padded element.
    NThreads = np.int32(min(dimpow2, 512))
    XSqr = gpuarray.empty(XG.shape[0], np.float32)
    YSqr = gpuarray.empty(YG.shape[0], np.float32)
    getSumSquares_(XG, XSqr, dim, dimpow2, block=(NThreads, 1, 1),
                   grid=(XG.shape[0], 1), shared=4*dimpow2)
    getSumSquares_(YG, YSqr, dim, dimpow2, block=(NThreads, 1, 1),
                   grid=(YG.shape[0], 1), shared=4*dimpow2)
    #Step 2: Do multiplication part
    YGT = linalg.transpose(YG)
    CSM = linalg.dot(XG, YGT)
    #Step 3: Add everything together
    Mp = np.array(XG.shape[0], dtype=np.int32)
    Np = np.array(YG.shape[0], dtype=np.int32)
    MPow2 = roundUpPow2(XG.shape[0])
    NThreads = min(MPow2, 512)
    #CSM is N x M
    finishCSM_(CSM, XSqr, YSqr, Np, Mp, MPow2, block=(NThreads, 1, 1),
               grid=(YG.shape[0], 1))
    return (CSM, XSqr, YSqr)
def dot2(b, A):
    '''
    Calculates matrix multiplication "b.T*A" on GPU.
    '''
    #print("dot2 "+str(b.shape)+" "+str(A.shape))
    dev_b = gpuarray.to_gpu(b)
    dev_bt = linalg.transpose(dev_b)
    # b itself is no longer needed once its transpose exists on device.
    dev_b.gpudata.free()
    del dev_b
    dev_A = gpuarray.to_gpu(A)
    dev_out = linalg.dot(dev_bt, dev_A)
    # NB: bt/A buffers are intentionally left for the allocator to
    # reclaim (mirrors the original's commented-out frees).
    return dev_out.get()
def dot2(b, A):
    '''
    Calculates matrix multiplication "b.T*A" on GPU, falling back to
    numpy when the operands would not fit on the GPU or the GPU path
    fails at runtime.
    '''
    #print("dot2 "+str(b.shape)+" "+str(A.shape))
    # Make sure we dont run out of memory on the GPU
    if ((A.size + b.size + A.shape[0]*b.shape[1]) <= 629088256):
        b_gpu = None
        try:
            # send b to GPU
            b_gpu = gpuarray.to_gpu(b)
            # transpose b on GPU
            bt_gpu = linalg.transpose(b_gpu)
            # clear b -- only the transpose is needed from here on
            b_gpu.gpudata.free()
            b_gpu = None
            # send A to GPU
            A_gpu = gpuarray.to_gpu(A)
            out_gpu = linalg.dot(bt_gpu, A_gpu)
            return out_gpu.get()
        except Exception:
            # FIX: the original bare `except:` unconditionally freed
            # b_gpu, which is unbound if the upload itself failed and
            # already freed if a later step failed (double free).  Only
            # free it when it is still allocated, and do not swallow
            # KeyboardInterrupt/SystemExit.
            if b_gpu is not None:
                b_gpu.gpudata.free()
            print("Too big for GPU, using CPU.")
            return np.dot(b.T, A)
    else:
        print("Too big for GPU, using CPU.")
        return np.dot(b.T, A)
def elmvis(Xraw, A, slowdown=10, report=5, maxtime=24*60*60, tol=0, batch=None,
           maxiter=None, maxupdate=None, maxstall=None, cossim=None, silent=False):
    """ELMVIS+ function running in GPU memory.

    Stochastically swaps pairs of rows of the (row-normalized) data X,
    accepting a swap when it increases the cosine-similarity objective
    trace(X.T * A * X) / N by more than `tol`.  A small CUDA kernel
    evaluates the objective change of a candidate swap without
    recomputing the full product.

    Parameters:
      Xraw      - (N, d) data matrix; permuted IN PLACE at the end
                  (Xraw[:] = Xraw[I]).
      A         - (N, N) matrix used in the trace objective.
      slowdown  - stop when the update rate drops below peak_rate/slowdown.
      report    - seconds between progress reports.
      maxtime   - wall-clock budget in seconds.
      tol       - minimum objective gain required to accept a swap.
      batch     - accepted but unused in this implementation.
      maxiter   - iteration cap (default N^3).
      maxupdate - accepted-swap cap (default N^2).
      maxstall  - stop after this many consecutive rejected swaps
                  (default N^2).
      cossim    - initial similarity; computed from X if None.
      silent    - suppress progress printing.

    Returns (I, info): the row permutation as an index array, and a dict
    with keys 'cossim', 'iters', 'updates', 'ips', 'ups'.
    """
    X = Xraw / np.linalg.norm(Xraw, axis=1)[:, None]  # unit-length version of X
    Xh = np.dot(A, X)  # X_hat, predicted value of X
    N, d = X.shape
    I = np.arange(N)  # index of samples

    # set default values
    if cossim is None: cossim = np.trace(X.T.dot(A).dot(X)) / N
    if maxiter is None: maxiter = N*N*N
    if maxupdate is None: maxupdate = N*N
    if maxstall is None: maxstall = N*N
    if not silent: print "original similarity: ", cossim

    # init GPU
    dt = X.dtype.type
    try:
        linalg.init()
    except ImportError as e:
        print e
    devA = gpuarray.to_gpu(A.astype(dt))
    devX = gpuarray.to_gpu(X.astype(dt))
    devXi1 = gpuarray.empty((d,), dtype=dt)   # scratch row for the swap
    devXh = linalg.dot(devA, devX)            # device copy of A*X, updated incrementally
    devAi = gpuarray.empty((N, 2), dtype=dt)  # the two A columns involved in a swap
    devDelta = gpuarray.empty((2, d), dtype=dt)  # row differences for the rank-2 update
    result = gpuarray.empty((d,), dtype=dt)   # per-feature objective change

    # swap kernel: one thread per feature j; result[j] is the change in
    # trace(X.T*A*X) contributed by feature j if rows i1 and i2 swap.
    kernel = """
        __global__ void diff(%s *A, %s *Y, %s *AY, %s *result, long d, long N, long i1, long i2) {
            long j = blockDim.x * blockIdx.x + threadIdx.x;
            %s yi1 = Y[i1*d + j];
            %s yi2 = Y[i2*d + j];
            result[j] = (A[i1*N + i1] * (yi2 - yi1) + 2*AY[i1*d + j]) * (yi2 - yi1)
                      + (A[i2*N + i2] * (yi1 - yi2) + 2*(AY[i2*d + j] + A[i2*N + i1]*(yi2 - yi1))) * (yi1 - yi2);
        }
        """
    if dt is np.float64:
        kernel = kernel % ("double", "double", "double", "double", "double", "double")
    else:
        kernel = kernel % ("float", "float", "float", "float", "float", "float")
    mod_diff = SourceModule(kernel)
    dev_diff = mod_diff.get_function("diff")
    dev_diff.prepare("PPPPllll")  # 4 pointers + 4 longs, matching the signature
    block = result._block
    grid = (int(np.ceil(1.0 * result.shape[0] / block[0])), 1)

    t0 = tlast = time()
    stall = 0
    iters = 0
    updates = 0
    updates_last = 0
    iters_last = 0
    ups_max = 0

    while (iters < maxiter) and (stall < maxstall):
        iters += 1
        stall += 1

        # get two different random numbers
        i1, i2 = np.random.randint(0, N, size=2)
        while i1 == i2:
            i1, i2 = np.random.randint(0, N, size=2)

        # evaluate the objective change of swapping rows i1 and i2
        dev_diff.prepared_call(grid, block, devA.gpudata, devX.gpudata, devXh.gpudata, result.gpudata, d, N, i1, i2)
        diff = np.sum(result.get())

        if diff > tol:
            # accept the swap: rank-2 update of devXh = A*X, so the full
            # product never needs recomputing
            stall = 0
            devAi[:, 0] = devA[:, i1]
            devAi[:, 1] = devA[:, i2]
            devDelta[0, :] = devX[i1, :] - devX[i2, :]
            devDelta[1, :] = devX[i2, :] - devX[i1, :]
            linalg.add_dot(devAi, devDelta, devXh, alpha=-1)
            # record the permutation on the host
            tI = I[i1]
            I[i1] = I[i2]
            I[i2] = tI
            # swap the rows on the device (via the scratch row)
            devXi1[:] = devX[i1, :]
            devX[i1] = devX[i2]
            devX[i2] = devXi1
            cossim += diff / N
            updates += 1
            if updates > maxupdate:
                break

        t = time()
        if t - tlast > report:
            # periodic progress report; also implements the slowdown stop
            ups = (updates-updates_last)*1.0/(t-tlast)
            ips = (iters-iters_last)*1.0/(t-tlast)
            if not silent: print "%d iters | %d updates | %.0f iters/s | %.0f updates/s | cos similarity = %.4f" % (iters, updates, ips, ups, cossim)
            updates_last = updates
            iters_last = iters
            tlast = t
            ups_max = max(ups, ups_max)
            if ups < ups_max/slowdown:
                break
        if t - t0 > maxtime:
            break

    ips = iters*1.0/(time()-t0)
    ups = updates*1.0/(time()-t0)
    # apply the accumulated permutation to the caller's array in place
    Xraw[:] = Xraw[I]
    # NOTE(review): this recomputes the similarity from the host-side X,
    # which was never permuted (only devX and Xraw were) -- it looks like
    # it reports the ORIGINAL similarity, not the final one; confirm.
    cossim = np.trace(X.T.dot(A).dot(X)) / N
    if not silent: print "final similarity: ", cossim
    info = {'cossim': cossim, 'iters': iters, 'updates': updates, 'ips': ips, 'ups': ups}
    return I, info
def getnextz(\
        z_last, za, c_obs_long,\
        Nx, Ny, nx, ny, nXobs,\
        Se_inv, Sa_inv,\
        Arule, Brule, Crule, Drule,\
        KAxrule, KAyrule, \
        KBxrule, KByrule, \
        KCxrule, KCyrule, \
        KDxrule, KDyrule):
    """One optimal-estimation (Rodgers-style) update step for the state z.

    Linearizes the forward model around z_last via getbigK, then computes
        term2 = K^T Se_inv K + Sa_inv
        term3 = K^T Se_inv (delta_c + K (z_last - za))
        z_next = za + solve(term2, term3)
    where K = vbigK * [Nx; Ny], delta_c = c_obs_long - bigc_last_long,
    za is the a-priori state, Se_inv / Sa_inv the inverse observation and
    a-priori covariances.  The heavy matrix products run on the GPU, with
    device buffers freed as soon as each intermediate is consumed.

    Returns (z_next, di2) where di2 = dz^T term2 dz (dz = z_next - z_last)
    is the scalar convergence measure.
    """
    # Evaluate the current refractive-index fields from the state
    # (semantics of Nx/Ny and cds are defined elsewhere -- presumably a
    # CUDA helper module; verify against its source).
    nx_last_long = cds.dot(Nx, z_last)
    ny_last_long = cds.dot(Ny, z_last)
    nx_last = matrix(reshape(nx_last_long, (ny-1, nx-1)));
    ny_last = matrix(reshape(ny_last_long, (ny-1, nx-1)));
    # Jacobian of the observations w.r.t. the fields, and the modeled
    # observations at the current state.
    vbigK, bigc_last = getbigK(\
        nx, ny, nXobs, nx_last, ny_last, Arule, Brule, Crule, Drule,
        KAxrule, KAyrule, KBxrule, KByrule, KCxrule, KCyrule, KDxrule, KDyrule)
    NxNy = vstack((Nx, Ny))
    KN = cds.dot(vbigK, NxNy)  # chain rule: Jacobian w.r.t. the state z
    nobs = nXobs*4  # int*int
    bigc_last_long = matrix(reshape(cds.T(bigc_last), (nobs, 1)))
    delta_c = cds.substract(c_obs_long, bigc_last_long)  # observation residual
    #
    # This is the Gauss method
    # dz = linalg.inv(KN.T*KN)*KN.T*delta_c
    # z_next = z_last + dz
    # di2 = squeeze(dot(dz.T,dz))

    # This is the optimal estimation method
    # (the following two lines could be merged)
    # term3
    zz_gpu = gpuarray.to_gpu(z_last-za)
    delta_c_gpu = gpuarray.to_gpu(delta_c)
    KN_gpu = gpuarray.to_gpu(KN)  # (important for term2)
    # NOTE(review): skcuda's add_dot accumulates KN*zz into its third
    # argument in place, so temp0_gpu presumably aliases delta_c_gpu --
    # which would explain why its free below stays commented out; confirm.
    temp0_gpu = linalg.add_dot(KN_gpu, zz_gpu, delta_c_gpu)
    zz_gpu.gpudata.free()
    del(zz_gpu)
    Se_inv_gpu = gpuarray.to_gpu(Se_inv)  # (important for term2)
    temp1_gpu = linalg.dot(Se_inv_gpu, temp0_gpu)  # Se_inv (delta_c + KN zz)
    #temp0_gpu.gpudata.free()
    #del(temp0_gpu)
    term3_gpu = linalg.dot(KN_gpu, temp1_gpu, transa="T")  # KN^T Se_inv (...)
    term3 = term3_gpu.get()
    temp1_gpu.gpudata.free()
    del(temp1_gpu)
    term3_gpu.gpudata.free()
    del(term3_gpu)
    # term2 = KN^T Se_inv KN + Sa_inv
    temp2_gpu = linalg.dot(Se_inv_gpu, KN_gpu)
    Se_inv_gpu.gpudata.free()
    del(Se_inv_gpu)
    Sa_inv_gpu = gpuarray.to_gpu(Sa_inv)
    term2_gpu = linalg.add_dot(KN_gpu, temp2_gpu, Sa_inv_gpu, transa="T")
    temp2_gpu.gpudata.free()
    del(temp2_gpu)
    KN_gpu.gpudata.free()
    del(KN_gpu)
    term2 = term2_gpu.get()
    #term0 = cds.dot3(Se_inv, KN)
    #term2 = Sa_inv+term0
    #term3 = cds.dot2(KN, cds.dot(Se_inv, (delta_c+cds.dot(KN,(z_last-za)))))
    # Solve on the CPU: equivalent to MATLAB's term2\term3.
    z_next = za + np.linalg.solve(term2, term3)  # same as term2\term3
    # Convergence measure di2 = dz^T term2 dz, computed on the GPU
    # (term2_gpu is still resident at this point).
    dz_gpu = gpuarray.to_gpu(z_next-z_last)
    temp3_gpu = \
        linalg.dot(term2_gpu, dz_gpu)
    term2_gpu.gpudata.free()
    del(term2_gpu)
    di2_gpu = linalg.dot(dz_gpu, temp3_gpu, transa='T')
    #dz = z_next - z_last
    #di2 = cds.dot3(term2, dz)

    # Get out
    return z_next, di2_gpu.get()
def dot_mm(self, a, b, out, transa=False, transb=False):
    """Matrix-multiply GPU arrays `a` and `b` into the preallocated `out`.

    transa / transb select whether the corresponding operand is used
    transposed; the flags are mapped to cuBLAS-style 'T'/'N' codes.
    """
    op_a = 'T' if transa else 'N'
    op_b = 'T' if transb else 'N'
    culinalg.dot(a, b, transa=op_a, transb=op_b, out=out)
# Principal Component Analysis demo: run GPU PCA on random data in both
# single and double precision, check orthogonality of the first two
# principal components, and report the variance they explain.
demo_types = [np.float32, np.float64]  # we can use single or double precision
precisions = ['single', 'double']

print("Principal Component Analysis Demo!")
print("Compute all 100 principal components of a 1000x100 data matrix")
print("Let's test if the first two resulting eigenvectors (principal components) are orthogonal, by dotting them and seeing if it is about zero, then we can see the amount of the original variance explained by just two of the original 100 dimensions.\n\n\n")

# Idiom fix: iterate the (dtype, label) pairs directly instead of
# indexing both lists with range(len(...)).
for demo_type, precision in zip(demo_types, precisions):
    X = np.random.rand(1000, 100).astype(demo_type)  # 1000 samples of 100-dimensional data vectors
    # note that order="F" or a transpose is necessary: fit_transform
    # requires column-major (Fortran-ordered) matrices.
    X_gpu = gpuarray.GPUArray((1000, 100), demo_type, order="F")
    X_gpu.set(X)  # copy data to gpu
    T_gpu = pca.fit_transform(X_gpu)  # calculate the principal components
    # show that the resulting eigenvectors are orthogonal
    dot_product = linalg.dot(T_gpu[:, 0], T_gpu[:, 1])
    print("The dot product of the two " + precision + " precision eigenvectors is: " + str(dot_product))
    # now get the variance of each eigenvector so we can see the percent
    # explained by the first two
    std_vec = np.std(T_gpu.get(), axis=0)
    print("We explained " + str(100 * np.sum(std_vec[:2]) / np.sum(std_vec)) + "% of the variance with 2 principal components in " + precision + " precision\n\n")
for t in demo_types: print "Testing matrix multiplication for type " + str(np.dtype(t)) if np.iscomplexobj(t()): a = np.asarray(np.random.rand(10, 5) + 1j * np.random.rand(10, 5), t) b = np.asarray(np.random.rand(5, 5) + 1j * np.random.rand(5, 5), t) c = np.asarray(np.random.rand(5, 5) + 1j * np.random.rand(5, 5), t) else: a = np.asarray(np.random.rand(10, 5), t) b = np.asarray(np.random.rand(5, 5), t) c = np.asarray(np.random.rand(5, 5), t) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) temp_gpu = culinalg.dot(a_gpu, b_gpu) d_gpu = culinalg.dot(temp_gpu, c_gpu) temp_gpu.gpudata.free() del (temp_gpu) print "Success status: ", np.allclose(np.dot(np.dot(a, b), c), d_gpu.get()) print "Testing vector multiplication for type " + str(np.dtype(t)) if np.iscomplexobj(t()): d = np.asarray(np.random.rand(5) + 1j * np.random.rand(5), t) e = np.asarray(np.random.rand(5) + 1j * np.random.rand(5), t) else: d = np.asarray(np.random.rand(5), t) e = np.asarray(np.random.rand(5), t) d_gpu = gpuarray.to_gpu(d) e_gpu = gpuarray.to_gpu(e)