def correlations(X, Y, useGPU):
    """Return the (uncentred) cross-product blocks of two data matrices.

    Parameters
    ----------
    X, Y : numpy.ndarray
        2-D arrays with the same number of rows.
    useGPU : bool
        When true the products are evaluated with PyCUDA / scikit-cuda,
        otherwise with ``np.dot``.

    Returns
    -------
    tuple
        ``(cxx, cxy, cyx, cyy)`` where ``cxx = X.T X``, ``cxy = X.T Y``,
        ``cyx = cxy.T`` and ``cyy = Y.T Y``; all are host arrays.
    """
    if useGPU:
        # Imported lazily so the CPU path works on machines without CUDA.
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.linalg as linalg
        linalg.init()

        X_gpu = gpuarray.to_gpu(X)
        # One transpose is enough: it is reused for both X.T X and X.T Y.
        XT_gpu = linalg.transpose(X_gpu)
        cxx = linalg.mdot(XT_gpu, X_gpu).get()

        # X itself is not needed any more; release it before uploading Y.
        X_gpu.gpudata.free()
        del X_gpu

        Y_gpu = gpuarray.to_gpu(Y)
        cxy = linalg.mdot(XT_gpu, Y_gpu).get()
        cyx = cxy.T

        YT_gpu = linalg.transpose(Y_gpu)
        cyy = linalg.mdot(YT_gpu, Y_gpu).get()
    else:
        cxx = np.dot(X.T, X)
        cxy = np.dot(X.T, Y)
        cyx = cxy.T
        cyy = np.dot(Y.T, Y)
    return cxx, cxy, cyx, cyy
def NNMF_gpu(X,r,tol,V=v0,W=w0,verbose=1):
    """Factorise ``X ~= V W`` with multiplicative updates on the GPU.

    Parameters
    ----------
    X : numpy.ndarray
        Host matrix to factorise.
    r : int
        Number of columns of ``V`` / rows of ``W`` that are used.
    tol : float
        Relative tolerance; iteration stops once
        ``abs(L_new - L) <= tol * (L + 1)``.
    V, W : numpy.ndarray
        Initial factors (defaults are the module-level ``v0`` / ``w0``);
        only ``V[:, :r]`` and ``W[:r, :]`` are copied and used.
    verbose : int or bool
        When truthy, the loss is printed every 50 iterations.

    Returns
    -------
    tuple
        ``(V_gpu, W_gpu, iteration)`` -- the two factors as GPU arrays and
        the number of completed (non-terminating) iterations.
    """
    Vr = V[:,0:r].copy()
    Wr = W[0:r,:].copy()
    X_gpu = gpuarray.to_gpu(X)
    V_gpu = gpuarray.to_gpu(Vr)
    W_gpu = gpuarray.to_gpu(Wr)

    # Squared Frobenius norm of the residual at the previous step
    B_gpu = linalg.dot(V_gpu, W_gpu)
    L = linalg.norm(X_gpu-B_gpu)**2
    iteration = 0
    while 1:
        # update V:  V <- V * (X W^T) / (B W^T)
        V_gpu *= linalg.dot(X_gpu,linalg.transpose(W_gpu))
        V_gpu /= linalg.dot(B_gpu,linalg.transpose(W_gpu))
        B_gpu = linalg.dot(V_gpu, W_gpu)

        # update W:  W <- W * (V^T X) / (V^T B)
        W_gpu *= linalg.dot(linalg.transpose(V_gpu),X_gpu)
        W_gpu /= linalg.dot(linalg.transpose(V_gpu),B_gpu)
        B_gpu = linalg.dot(V_gpu, W_gpu)

        Lnew = linalg.norm(X_gpu-B_gpu)**2
        if abs(Lnew-L) <= tol*(L+1):
            break
        else:
            L = Lnew
            iteration += 1
            if(verbose and iteration%50==0):
                # Parenthesised single-argument form prints the same text
                # on Python 2 and Python 3.
                print("At iteration %i, the loss is %.2f" %(iteration, L))
    return V_gpu,W_gpu,iteration
def getTranformada(test_image, diagonal):
    """Return ``diagonal.T . test_image . diagonal`` computed on the GPU.

    Both operands are cast to float32 before upload so that the two
    ``linalg.dot`` calls receive matching dtypes (the sibling
    ``getTranformada_Inversa`` casts both inputs as well).

    Parameters
    ----------
    test_image : numpy.ndarray
        Host matrix to transform.
    diagonal : numpy.ndarray
        Host matrix applied on both sides.
        NOTE(review): the name suggests a diagonal matrix -- confirm.

    Returns
    -------
    numpy.ndarray
        The transformed matrix copied back to the host.
    """
    # multiply each row by the diagonal
    test_image = test_image.astype(np.float32)
    diagonal = diagonal.astype(np.float32)
    test_image = gpuarray.to_gpu(test_image)
    diagonal = gpuarray.to_gpu(diagonal)

    # (X D)
    testimage_gpu = linalg.dot(test_image, diagonal)
    # ((X D)^T D)^T == D^T X D
    testimageT_gpu = linalg.transpose(testimage_gpu)
    testimage_gpu = linalg.dot(testimageT_gpu, diagonal)
    testimageT_gpu = linalg.transpose(testimage_gpu)
    return testimageT_gpu.get()
def getTranformada_Inversa(test_image, diagonal):
    """Return ``diagonal.T . test_image . diagonal`` computed on the GPU.

    Both inputs are cast to float32, uploaded, and the product is built as
    ``(X^T D)^T D``; the result is copied back to the host.
    """
    image_dev = gpuarray.to_gpu(test_image.astype(np.float32))
    diag_dev = gpuarray.to_gpu(diagonal.astype(np.float32))

    # X^T D
    half_dev = linalg.dot(linalg.transpose(image_dev), diag_dev)
    # (X^T D)^T D
    full_dev = linalg.dot(linalg.transpose(half_dev), diag_dev)
    return full_dev.get()
def test_transpose_float64(self):
    """``linalg.transpose`` matches ``ndarray.T`` for float64 matrices."""
    wide = np.array([[1, 2, 3, 4, 5, 6],
                     [7, 8, 9, 10, 11, 12]], np.float64)  # M < N
    tall = wide.T.copy()                                  # M > N
    for host in (wide, tall):
        transposed_dev = linalg.transpose(gpuarray.to_gpu(host))
        assert np.all(host.T == transposed_dev.get())
def test_transpose_complex128(self):
    """``linalg.transpose`` matches ``ndarray.T`` for complex128 matrices."""
    wide = np.array([[1j, 2j, 3j, 4j, 5j, 6j],
                     [7j, 8j, 9j, 10j, 11j, 12j]], np.complex128)  # M < N
    tall = wide.T.copy()                                           # M > N
    for host in (wide, tall):
        transposed_dev = linalg.transpose(gpuarray.to_gpu(host))
        assert np.all(host.T == transposed_dev.get())
def compute_analysis_cuda2(self, xb, y, R, P, H, HT=None, hph=None, calcP=True):
    """Compute the analysis state (and optionally covariance) on the GPU.

    With ``W = (H P)^T (R + H P H^T)^-1`` the method returns

    * ``xhat = xb + W (y - H xb)``
    * ``Phat = (I - W H) P`` when ``calcP`` is true, otherwise a
      one-element zero array of ``P``'s dtype.

    ``HT`` (transpose of ``H``) and ``hph`` (``H P H^T``) may be supplied
    pre-computed; each is derived here when left as ``None``.
    """
    if HT is None:
        HT = culinalg.transpose(H)
    hp = culinalg.dot(H, P)
    if hph is None:
        hph = culinalg.dot(hp, HT)

    # gain: W = (H P)^T (R + H P H^T)^-1
    innovation_cov_inv = culinalg.inv(misc.add(R, hph))
    gain = culinalg.dot(hp, innovation_cov_inv, transa='T')

    # state update: xhat = xb + W (y - H xb)
    innovation = misc.subtract(y, culinalg.dot(H, xb))
    xhat = misc.add(xb, culinalg.dot(gain, innovation))

    if not calcP:
        return xhat, misc.zeros((1, ), dtype=P.dtype)

    identity = culinalg.eye(P.shape[0])
    gain_h = culinalg.dot(gain, H)
    Phat = culinalg.dot(identity - gain_h, P)
    return xhat, Phat
def dot3(A, b):
    '''
    Evaluate the product "b.T*A*b" on the GPU and return it as a host array.
    '''
    A_dev = gpuarray.to_gpu(A)
    b_dev = gpuarray.to_gpu(b)

    # A*b first; A can be released as soon as that product exists
    Ab_dev = linalg.dot(A_dev, b_dev)
    A_dev.gpudata.free()
    del A_dev

    # b is only needed in transposed form from here on
    bT_dev = linalg.transpose(b_dev)
    b_dev.gpudata.free()
    del b_dev

    return linalg.dot(bT_dev, Ab_dev).get()
def get_probabilities(self, batch):
    """Return, per batch element, the arg-max indices of its table scores.

    For every ``batch[i]`` the product ``lookup_table . batch[i]^T`` is
    formed on the GPU, transposed back, and reduced with ``np.argmax`` over
    the last axis.  The stacked indices get a trailing singleton axis.
    """
    table_dev = gpuarray.to_gpu(self.lookup_table)

    def best_indices(sample):
        sample_t_dev = linalg.transpose(gpuarray.to_gpu(sample))
        scores_dev = linalg.transpose(linalg.dot(table_dev, sample_t_dev))
        return np.argmax(scores_dev.get(), axis=-1)

    winners = [best_indices(batch[i]) for i in range(batch.shape[0])]
    return np.expand_dims(np.asarray(winners), axis=-1)
def forward(self, bottom, top):
    """Forward pass: fill ``self.diff``, the scalar loss and ``self.transDiff``.

    For each sample ``i``::

        diff[i] = bottom[1].data[i] . bottom[0].data[i]
                  + bottom[2].data[i] - bottom[3].data[i]

    The loss written to ``top[0]`` is ``sum(diff**2) / bottom[3].num / 2``.
    ``self.transDiff`` is the GPU-side transpose of ``diff / bottom[3].num``.

    Shapes, per the original author's debug notes (not checked here):
    ``diff`` (batch, 65536), ``bottom[0]`` (batch, 11),
    ``bottom[1]`` (batch, 65536, 11), ``bottom[2]`` (batch, 65536).

    The previous version also uploaded every sample to the GPU and ran
    ``linalg.dot`` there, but never read the result (the value was
    recomputed with ``np.dot``); that dead work has been removed.
    """
    with pu.caffe_cuda_context():
        linalg.init()
        for i in range(self.diff.shape[0]):
            projected = np.dot(bottom[1].data[i, :, :], bottom[0].data[i, :])
            self.diff[i, :] = (projected + bottom[2].data[i, :]
                               - bottom[3].data[i, :])

        top[0].data[...] = np.sum(self.diff**2) / bottom[3].num / 2.

        # Keep the scaled, transposed residual on the GPU for later use.
        a_gpu = gpuarray.to_gpu(self.diff / bottom[3].num)
        at_gpu = linalg.transpose(a_gpu)
        self.transDiff = at_gpu
class GPUArrayBox(Box):
    """Box wrapper that routes operators on a wrapped GPU array.

    Array metadata (``shape``, ``ndim``, ``size``, ``dtype``, ``flags``,
    ``get``) is forwarded to the wrapped ``_value``; arithmetic operators
    dispatch to ``cumisc`` / ``culinalg`` functions.
    """
    __slots__ = []
    __array_priority__ = 100.0

    @primitive
    def __getitem__(A, idx):
        return A[idx]

    # Read-only views onto the wrapped array.
    shape = property(lambda self: self._value.shape)
    ndim = property(lambda self: self._value.ndim)
    size = property(lambda self: self._value.size)
    dtype = property(lambda self: self._value.dtype)
    T = property(lambda self: culinalg.transpose(self))
    flags = property(lambda self: self._value.flags)
    get = property(lambda self: self._value.get)

    def __len__(self):
        return len(self._value)

    def astype(self, *args, **kwargs):
        # Operates directly on the wrapped value, not on the box.
        return self._value.astype(*args, **kwargs)

    def __neg__(self):
        return anp.negative(self)

    def __add__(self, other):
        return cumisc.add(self, other)

    def __sub__(self, other):
        return cumisc.subtract(self, other)

    def __mul__(self, other):
        return cumisc.multiply(self, other)

    def __div__(self, other):
        return cumisc.divide(self, other)

    # Python 3 maps the ``/`` operator to __truediv__; without this alias
    # division on a box would never reach cumisc.divide.
    __truediv__ = __div__

    def __matmul__(self, other):
        return culinalg.dot(self, other)

    def __radd__(self, other):
        return cumisc.add(other, self)

    def __rsub__(self, other):
        return cumisc.subtract(other, self)

    def __rmul__(self, other):
        return cumisc.multiply(other, self)

    def __rdiv__(self, other):
        return cumisc.divide(other, self)

    # Reflected counterpart of the Python 3 ``/`` operator.
    __rtruediv__ = __rdiv__

    def __rmatmul__(self, other):
        return culinalg.dot(other, self)

    def __hash__(self):
        # Identity hash: boxes are distinguished by object, not by value.
        return id(self)
def dot3(A, b, max_gpu_elements=629088256):
    '''
    Calculates matrix multiplication "b.T*A*b" on GPU.
    A has to be nxn.

    Parameters
    ----------
    A, b : numpy.ndarray
        Host operands.
    max_gpu_elements : int, optional
        Largest value of ``A.size + 2*b.size`` that is still sent to the
        GPU; anything bigger is evaluated with NumPy on the CPU.  The
        default is the limit that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        The product as a host array.
    '''
    # Make sure we dont run out of memory on the GPU
    if (A.size + 2*b.size) > max_gpu_elements:
        print("Too big for GPU, using CPU.")
        return np.dot(np.dot(b.T, A), b)

    # send A and b to the GPU
    A_gpu = gpuarray.to_gpu(A)
    b_gpu = gpuarray.to_gpu(b)

    temp_gpu = linalg.dot(A_gpu, b_gpu)
    # A is not needed once A*b exists
    A_gpu.gpudata.free()
    del(A_gpu)

    # transpose b on GPU, then drop the untransposed copy
    bt_gpu = linalg.transpose(b_gpu)
    b_gpu.gpudata.free()
    del(b_gpu)

    out_gpu = linalg.dot(bt_gpu, temp_gpu)
    return out_gpu.get()
def cuda_dot3(A, b):
    """Compute ``b.T . A . b`` on the GPU, freeing each buffer eagerly.

    ``b`` is uploaded twice so that at most two operands plus one result
    are resident at any moment.  The product is returned as a host array.
    """
    print("cuda_dot3", A.shape, b.shape)

    # Stage 1: b^T (the untransposed upload is dropped straight away)
    b_dev = gpuarray.to_gpu(b)
    bT_dev = linalg.transpose(b_dev)
    b_dev.gpudata.free()
    del b_dev

    # Stage 2: b^T A
    A_dev = gpuarray.to_gpu(A)
    bTA_dev = linalg.dot(bT_dev, A_dev)
    bT_dev.gpudata.free()
    del bT_dev
    A_dev.gpudata.free()
    del A_dev

    # Stage 3: (b^T A) b, with b uploaded again
    b_dev = gpuarray.to_gpu(b)
    result_dev = linalg.dot(bTA_dev, b_dev)
    bTA_dev.gpudata.free()
    del bTA_dev
    b_dev.gpudata.free()
    del b_dev

    # The result buffer is left for the garbage collector after the copy.
    return result_dev.get()
def kernel_lin(A, B, C, transa='N'):
    """Launch the linear-kernel routine matching ``A``'s dtype.

    ``Dkern_lin`` is used for float64 input, ``kern_lin`` otherwise.  With
    ``transa == 'T'`` the routine receives the transpose of ``A``.
    NOTE(review): presumably the result is written into ``C`` -- confirm
    against the kernel definitions.
    """
    launch = Dkern_lin if A.dtype == np.float64 else kern_lin
    lhs = linalg.transpose(A) if transa == 'T' else A
    launch(lhs, B, C)
def cuda_T(a):
    """Transpose host matrix ``a`` on the GPU and return a host array."""
    src_dev = gpuarray.to_gpu(a)
    dst_dev = linalg.transpose(src_dev)
    # The source buffer is released before the result is copied back.
    src_dev.gpudata.free()
    del src_dev
    return dst_dev.get()
def T(a):
    '''
    Transposes matrix "a" on the GPU.

    If anything goes wrong on the GPU path the transpose is taken on the
    CPU instead and returned as an ``np.matrix`` view of ``a.T``.
    '''
    try:
        a_gpu = gpuarray.to_gpu(a)
        at_gpu = linalg.transpose(a_gpu)
        return at_gpu.get()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        print("Using CPU for Transpose.")
        return np.matrix(a.T, copy=False)
def T(a):
    '''
    Upload "a", transpose it on the GPU and return the result on the host.
    '''
    device_copy = gpuarray.to_gpu(a)
    return linalg.transpose(device_copy).get()
def getCSMGPU(XG, YG):
    """Build ``-2 * XG . YG^T`` plus the row square-sums of ``XG`` and ``YG``.

    Parameters
    ----------
    XG, YG : GPU arrays
        Matrices with one point per row.

    Returns
    -------
    GPU array
        ``C`` after ``XSqr`` has been added along axis 0 and ``YSqr`` along
        axis 1.
        NOTE(review): this looks like the pairwise squared Euclidean
        distance matrix -- confirm the ``add_matvec`` axis convention.
    """
    GPUNeg2 = gpuarray.to_gpu(np.array([-2.0], dtype=np.float32))
    YGT = linalg.transpose(YG)

    # Row-wise sums of squares
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    YSqr = skcuda.misc.multiply(YG, YG)
    YSqr = skcuda.misc.sum(YSqr, 1)

    # Cross term, scaled by -2
    C = linalg.dot(XG, YGT)
    C = skcuda.misc.multiply(GPUNeg2, C)

    # Accumulate the square-sums into C in place
    skcuda.misc.add_matvec(C, XSqr, 0, C)
    skcuda.misc.add_matvec(C, YSqr, 1, C)
    return C
def transpose(self):
    """Return a new ``Tensor`` holding the transpose of ``self.data``.

    CUDA tensors are transposed with ``linalg.transpose``; all others use
    the data's own ``transpose()``.  When ``self.autograd`` is set, the
    result records ``self`` as its creator with op ``"transpose"``.
    """
    on_gpu = self.device == 'cuda'
    data = linalg.transpose(self.data) if on_gpu else self.data.transpose()

    if not self.autograd:
        return Tensor(data=data, device=self.device)

    return Tensor(
        data=data,
        autograd=True,
        creators=[self],
        creation_op="transpose",
        device=self.device,
    )
def cuda_dot2(b, A):
    """Compute ``b.T . A`` on the GPU and return it as a host array.

    All three input buffers are released explicitly before the result is
    copied back.
    """
    print("cuda_dot2", b.shape, A.shape)

    b_dev = gpuarray.to_gpu(b)
    bT_dev = linalg.transpose(b_dev)
    A_dev = gpuarray.to_gpu(A)
    product_dev = linalg.dot(bT_dev, A_dev)

    # Inputs are no longer needed once the product exists
    b_dev.gpudata.free()
    del b_dev
    bT_dev.gpudata.free()
    del bT_dev
    A_dev.gpudata.free()
    del A_dev

    return product_dev.get()
def getCSMGPU2(XG, YG):
    """Compute a cross-similarity matrix of ``XG`` and ``YG`` with custom kernels.

    Returns ``(CSM, XSqr, YSqr)``: the matrix produced by ``finishCSM_``
    from ``XG . YG^T`` and the two per-row sum-of-squares vectors.
    """
    n_x = XG.shape[0]
    n_y = YG.shape[0]

    # Step 1: sum of squares across rows
    dim = np.int32(XG.shape[1])
    dimpow2 = roundUpPow2(dim)
    NThreads = np.int32(min(dimpow2, 512))
    XSqr = gpuarray.empty(n_x, np.float32)
    YSqr = gpuarray.empty(n_y, np.float32)
    for mat, sq_sums, n_rows in ((XG, XSqr, n_x), (YG, YSqr, n_y)):
        getSumSquares_(mat, sq_sums, dim, dimpow2,
                       block=(NThreads, 1, 1), grid=(n_rows, 1),
                       shared=4 * dimpow2)

    # Step 2: multiplication part
    CSM = linalg.dot(XG, linalg.transpose(YG))

    # Step 3: combine everything in place
    Mp = np.array(n_x, dtype=np.int32)
    Np = np.array(n_y, dtype=np.int32)
    MPow2 = roundUpPow2(n_x)
    # Original note: "CSM is N x M" -- TODO confirm the layout the kernel expects
    finishCSM_(CSM, XSqr, YSqr, Np, Mp, MPow2,
               block=(min(MPow2, 512), 1, 1), grid=(n_y, 1))
    return (CSM, XSqr, YSqr)
def sorted_eig(X, ascending=True, mode='cpu'):
    """Eigen-decompose ``X`` and return the eigenpairs sorted by eigenvalue.

    Parameters
    ----------
    X : numpy.ndarray (``mode='cpu'``) or GPU array (``mode='gpu'``)
        Square matrix.
    ascending : bool
        Sort eigenvalues in ascending (default) or descending order.  Both
        modes honour this flag.
    mode : {'cpu', 'gpu'}
        ``'cpu'`` uses ``np.linalg.eig``; ``'gpu'`` uses scikit-cuda's
        ``eig`` with the cuSOLVER backend.

    Returns
    -------
    tuple
        ``(e_vals, e_vecs)``.  ``e_vals`` is a host array in sorted order.
        ``e_vecs`` holds the matching eigenvectors as columns -- a host
        array in CPU mode, a float32 GPU array in GPU mode.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'cpu'`` nor ``'gpu'``.
    """
    if mode == 'cpu':
        e_vals, e_vecs = np.linalg.eig(X)
        idx = np.argsort(e_vals)
        if not ascending:
            idx = idx[::-1]
        return e_vals[idx], e_vecs[:, idx]

    if mode == 'gpu':
        import skcuda.linalg as LA
        import pycuda.gpuarray as gpuarray

        e_vecs_gpu, e_vals_gpu = LA.eig(X, 'N', 'V', lib='cusolver')
        e_vals = e_vals_gpu.get()
        idx = np.argsort(e_vals)
        if not ascending:
            idx = idx[::-1]

        # Gather the rows of the eig output in sorted order, then transpose
        # so that the eigenvectors end up as columns.
        V_gpu = gpuarray.empty((X.shape[0], X.shape[1]), np.float32)
        for row, source in enumerate(idx):
            V_gpu[row] = e_vecs_gpu[int(source)]
        V_gpu = LA.transpose(V_gpu)
        # Eigenvalues are returned in the same order as the eigenvectors.
        return e_vals[idx], V_gpu

    raise ValueError("mode must be 'cpu' or 'gpu', got %r" % (mode,))
def logis(y,x):
    """Iteratively re-fit regression coefficients of ``y`` on ``x`` on the GPU.

    Starts from the least-squares solution ``(X^T X)^-1 X^T y`` and then
    repeats a weighted update (weights ``diag(mu)``, with ``mu`` obtained
    from ``sapply``) until the change in the coefficients drops below 1e-5
    or ten iterations have run.

    Returns a dict with the iteration count (``"iteraciones"``), the
    coefficients held before the final update (``"Betas"``) and the elapsed
    wall-clock seconds (``"time"``).
    """
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    start = time.time()

    # Move the variables to the GPU
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    linalg.init()

    # Transpose of X
    x_gpu_T = linalg.transpose(x_gpu)

    # Initial estimate: (X^T X)^-1 X^T y
    gram_inv = linalg.inv(linalg.dot(x_gpu_T, x_gpu))
    beta_gpu = linalg.dot(linalg.dot(gram_inv, x_gpu_T), y_gpu)

    j = 1
    while True:
        mu = sapply(x, beta_gpu.get()).astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu)
        V_gpu = linalg.diag(mu_gpu)

        # Working response: X beta + diag(1 / (mu (1 - mu))) (y - mu)
        variance_gpu = linalg.multiply(mu_gpu, 1-mu_gpu)
        inv_variance_gpu = linalg.diag(1/variance_gpu)
        residual_gpu = (y_gpu-mu_gpu)
        correction_gpu = linalg.dot(inv_variance_gpu, residual_gpu)
        if np.isnan(correction_gpu.get()).any():
            # Clean NaNs on the host, then upload the repaired vector
            cleaned = nanValue(correction_gpu.get())
            correction_gpu = gpuarray.to_gpu(cleaned.astype(np.float32))
        y_1_gpu = linalg.dot(x_gpu, beta_gpu) + correction_gpu

        # Weighted solve: (X^T V X)^-1 X^T V y_1
        weighted_gram = linalg.dot(linalg.dot(x_gpu_T, V_gpu), x_gpu)
        projector = linalg.dot(linalg.dot(linalg.inv(weighted_gram), x_gpu_T), V_gpu)
        beta_1_gpu = linalg.dot(projector, y_1_gpu)

        check_value = np.absolute(linalg.norm(beta_1_gpu-beta_gpu))
        if j == 10 or check_value < 0.00001:
            break
        beta_gpu = beta_1_gpu
        j = j + 1

    tiempo = time.time() - start
    return {"iteraciones":j,"Betas":beta_gpu.get(),"time":tiempo}
def getCSMGPU2(XG, YG):
    """Cross-similarity matrix of two GPU point sets via custom kernels.

    Returns the tuple ``(CSM, XSqr, YSqr)``: ``CSM`` starts as
    ``XG . YG^T`` and is finished in place by ``finishCSM_``; ``XSqr`` and
    ``YSqr`` are the per-row sums of squares filled by ``getSumSquares_``.
    """
    # Step 1: sum of squares across rows
    dim = np.int32(XG.shape[1])
    dimpow2 = roundUpPow2(dim)
    NThreads = np.int32(min(dimpow2, 512))

    def fill_row_square_sums(mat, out):
        # one thread block per row of ``mat``
        getSumSquares_(mat, out, dim, dimpow2, block=(NThreads, 1, 1),
                       grid=(mat.shape[0], 1), shared=4*dimpow2)

    XSqr = gpuarray.empty(XG.shape[0], np.float32)
    YSqr = gpuarray.empty(YG.shape[0], np.float32)
    fill_row_square_sums(XG, XSqr)
    fill_row_square_sums(YG, YSqr)

    # Step 2: multiplication part
    YGT = linalg.transpose(YG)
    CSM = linalg.dot(XG, YGT)

    # Step 3: add everything together
    Mp = np.array(XG.shape[0], dtype=np.int32)
    Np = np.array(YG.shape[0], dtype=np.int32)
    MPow2 = roundUpPow2(XG.shape[0])
    finish_threads = min(MPow2, 512)
    # Original note: "CSM is N x M" -- TODO confirm against the kernel
    finishCSM_(CSM, XSqr, YSqr, Np, Mp, MPow2,
               block=(finish_threads, 1, 1), grid=(YG.shape[0], 1))
    return (CSM, XSqr, YSqr)
def dot2(b, A):
    '''
    Evaluate the product "b.T*A" on the GPU and return it as a host array.
    '''
    b_dev = gpuarray.to_gpu(b)
    bT_dev = linalg.transpose(b_dev)

    # the untransposed copy of b is released before A is uploaded
    b_dev.gpudata.free()
    del b_dev

    A_dev = gpuarray.to_gpu(A)
    return linalg.dot(bT_dev, A_dev).get()
def dot2(b, A, max_gpu_elements=629088256):
    '''
    Calculates matrix multiplication "b.T*A" on GPU.

    Falls back to NumPy on the CPU when the operands exceed the size limit
    or when any step of the GPU path fails.

    Parameters
    ----------
    b, A : numpy.ndarray
        Host operands.
    max_gpu_elements : int, optional
        Largest value of ``A.size + b.size + A.shape[0]*b.shape[1]`` still
        attempted on the GPU.  The default is the limit that used to be
        hard-coded here.

    Returns
    -------
    numpy.ndarray
        The product as a host array.
    '''
    # Make sure we dont run out of memory on the GPU
    if (A.size + b.size + A.shape[0]*b.shape[1]) > max_gpu_elements:
        print("Too big for GPU, using CPU.")
        return np.dot(b.T, A)

    # Tracks the upload of b so the error path frees it at most once and
    # never touches a name that was not bound (or was already released).
    b_gpu = None
    try:
        # send b to GPU
        b_gpu = gpuarray.to_gpu(b)
        # transpose b on GPU
        bt_gpu = linalg.transpose(b_gpu)
        # clear b
        b_gpu.gpudata.free()
        b_gpu = None
        # send A to GPU
        A_gpu = gpuarray.to_gpu(A)
        out_gpu = linalg.dot(bt_gpu, A_gpu)
    except Exception:
        if b_gpu is not None:
            b_gpu.gpudata.free()
        print("Too big for GPU, using CPU.")
        return np.dot(b.T, A)
    return out_gpu.get()
def transpose(A):
    """Return ``linalg.transpose(A)``; thin convenience wrapper."""
    transposed = linalg.transpose(A)
    return transposed
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as culinalg
import skcuda.misc as cumisc
culinalg.init()

# Double precision is only supported by devices with compute
# capability >= 1.3:
import string
demo_types = [np.float32, np.complex64]
if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
    demo_types.extend([np.float64, np.complex128])

# Demonstrate culinalg.transpose for every supported dtype.
for t in demo_types:
    print('Testing transpose for type ' + str(np.dtype(t)))
    # The complex case used to be stored under a different name, so the
    # demo silently re-tested the previous real-valued matrix instead.
    if np.iscomplexobj(t()):
        a = np.array([[1j, 2j, 3j, 4j, 5j, 6j],
                      [7j, 8j, 9j, 10j, 11j, 12j]], t)
    else:
        a = np.array([[1, 2, 3, 4, 5, 6],
                      [7, 8, 9, 10, 11, 12]], t)
    a_gpu = gpuarray.to_gpu(a)
    at_gpu = culinalg.transpose(a_gpu)
    # transpose() is a plain (non-conjugating) transpose, so the reference
    # is a.T for real and complex input alike.
    print('Success status: ', np.all(a.T == at_gpu.get()))
def update_W_hat_skcuda(W_hat, X_hat, A_t, B_t, x_sum, alpha_sum, eps, t):
    """Update ``W_hat`` column by column with projected gradient steps on the GPU.

    For every column ``j`` the gradient
    ``X^T X W a_j - X^T b_j`` is formed (``a_j`` / ``b_j`` being row ``j``
    of ``A_t^T`` / ``B_t^T``), a step of size
    ``1 / (||X^T X|| * ||a_j|| + 1e-8)`` is taken, and the result is passed
    through ``geo_projection_to_cvx_cmb`` on the host before being written
    back.  Sweeps repeat until the change in ``W_hat`` is below ``eps`` or
    10000 sweeps have run.

    Parameters
    ----------
    W_hat, X_hat, A_t, B_t : numpy.ndarray or GPUArray
        Host arrays are converted to float64 and uploaded; GPU arrays are
        used as they are.
    x_sum, alpha_sum, t :
        Unused here; kept for interface compatibility.
    eps : float
        Convergence threshold on ``||W_new - W_old||``.

    Returns
    -------
    GPUArray
        The updated ``W_hat``.
    """
    _, k_cluster = W_hat.shape
    W_hat_new = W_hat.copy()
    linalg.init()

    if not isinstance(W_hat_new, gpuarray.GPUArray):
        W_hat_new_gpu = gpuarray.to_gpu(W_hat_new.astype(np.float64))
    else:
        W_hat_new_gpu = W_hat_new

    if not isinstance(X_hat, gpuarray.GPUArray):
        tmp_x = np.ascontiguousarray(X_hat)
        X_hat_gpu = gpuarray.to_gpu(tmp_x.astype(np.float64))
    else:
        X_hat_gpu = X_hat
    X_hat_T_gpu = linalg.transpose(X_hat_gpu)

    if not isinstance(A_t, gpuarray.GPUArray):
        A_t_gpu = gpuarray.to_gpu(A_t.astype(np.float64))
    else:
        A_t_gpu = A_t
    A_t_gpu_trans = linalg.transpose(A_t_gpu)

    if not isinstance(B_t, gpuarray.GPUArray):
        B_t_gpu = gpuarray.to_gpu(B_t.astype(np.float64))
    else:
        B_t_gpu = B_t
    B_t_gpu_trans = linalg.transpose(B_t_gpu)

    # X^T X and its norm never change inside the loops, so compute them once
    # instead of once per column per sweep.
    X_product_gpu = linalg.dot(X_hat_T_gpu, X_hat_gpu)
    X_product_norm = linalg.norm(X_product_gpu)

    k = 0
    while True:
        k += 1
        W_hat_old_gpu = W_hat_new_gpu.copy()
        for j in range(k_cluster):
            a_row = A_t_gpu_trans[j, :]
            T1 = linalg.dot(X_hat_T_gpu,
                            B_t_gpu_trans[j, :].reshape((-1, 1)))
            # (X^T X W) a_j, evaluated left to right
            T2 = linalg.dot(linalg.dot(X_product_gpu, W_hat_new_gpu),
                            a_row.reshape(-1, 1))
            grad_gpu = -T1 + T2
            step_size = 1 / (X_product_norm * linalg.norm(a_row) + 1e-8)
            tmp = (-step_size * grad_gpu.reshape((-1))
                   + W_hat_new_gpu[:, j].copy())

            # The projection runs on the host; upload the projected column.
            u_j = geo_projection_to_cvx_cmb(tmp.get())
            normalized_u_j_gpu = gpuarray.to_gpu(u_j.astype(np.float64))
            W_hat_new_gpu[:, j] = normalized_u_j_gpu

        if (linalg.norm(W_hat_new_gpu - W_hat_old_gpu) < eps) or k >= 10000:
            break
    return W_hat_new_gpu
def _sub_kmeans_gpu_custom(X, k):
    """Cluster the rows of ``X`` into ``k`` groups on the GPU.

    Each iteration projects the data with the current transform ``V``,
    assigns every point to a centre via ``CC.argmin_mu_diff``, recomputes
    the centres and per-cluster scatter matrices, and refreshes ``V`` and
    the subspace size ``m`` from the eigen-decomposition of
    ``sum_i S_i - S_D``.  Assignment changes are checked on even
    iterations; the loop stops after two unchanged checks or at
    ``MAX_ITER``.

    Parameters
    ----------
    X : numpy.ndarray
        Data with one point per row; converted to float32.
    k : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(C, V, m)`` -- host assignment array, host transform matrix and
        the final subspace size.
    """
    import skcuda
    import skcuda.linalg as LA
    import pycuda.driver as cuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import custom_kernels as CC
    LA.init()
    CC.init()

    n, d = X.shape
    X = X.astype(np.float32)
    V_gpu = random_V(d, mode='gpu')
    # Floor division keeps m an integer on Python 3 as well.
    m = d // 2
    X_gpu = gpuarray.to_gpu(X)

    # Scatter matrix of the whole data set
    mu_D_gpu = CC.column_mean(X_gpu)
    sub_gpu = skcuda.misc.subtract(X_gpu, mu_D_gpu)
    sub_gpu_T = LA.transpose(sub_gpu)
    S_D_gpu = CC.matmul(sub_gpu_T, sub_gpu)

    # Initial centres: k rows of X picked at random
    mu_is_gpu = gpuarray.to_gpu(X[np.random.choice(n, k)])
    itr = 1
    assignment_unchanged = 0
    C_gpu = None
    MAX_ITER = 100
    while itr < MAX_ITER:
        Pc_gpu = projection_matrix(d, m, mode='gpu')
        PcV_gpu = LA.dot(Pc_gpu, V_gpu, transa='T', transb='T')
        PcVmu_is_gpu = gpuarray.empty((k, m), dtype=np.float32)
        for i in range(k):
            PcVmu_is_gpu[i] = LA.dot(PcV_gpu,
                                     mu_is_gpu[i][:, None]).ravel()
        global_temp = LA.dot(X_gpu, PcV_gpu, transb='T')

        if itr % 2 == 0:
            C_old = C_gpu.get()
        C_gpu = CC.argmin_mu_diff(global_temp, PcVmu_is_gpu)
        if itr % 2 == 0:
            Cnew = C_gpu.get()
            points_changed = np.sum(
                1 - np.equal(C_old, Cnew).astype(np.uint8))
            if points_changed == 0:
                assignment_unchanged += 1
            if assignment_unchanged >= 2:
                break
            print('[i] Itr %d: %d points changed' % (itr, points_changed))

        # Group the points by cluster on the host
        C = C_gpu.get()
        counts = {i: 0 for i in range(k)}
        for i in range(n):
            C_id = int(C[i])
            counts[C_id] += 1
        maxv = max(counts.values())
        storage = np.zeros((k, int(maxv), d)).astype(np.float32)
        counter = np.zeros(k, dtype=np.uint32)  # k
        for i in range(n):
            C_id = int(C[i])
            storage[C_id, int(counter[C_id]), :] = X[i].ravel()
            counter[C_id] += 1

        # New centres
        storage_gpu = gpuarray.to_gpu(storage)
        mu_is_gpu = CC.sum_axis2(storage_gpu)
        counter_gpu = gpuarray.to_gpu(counter)[:, None]
        mu_is_gpu = skcuda.misc.divide(
            mu_is_gpu, counter_gpu.astype(np.float32))

        # Per-cluster scatter matrices
        S_is_gpu = gpuarray.zeros((k, d, d), dtype=np.float32)  # k,d,d
        for i in range(k):
            storage_gpu[i] = skcuda.misc.subtract(storage_gpu[i],
                                                  mu_is_gpu[i])
            curr_cluster_points = storage_gpu[i, :int(counter[i]), :]  # |k|,d
            S_is_gpu[i] = LA.dot(curr_cluster_points, curr_cluster_points,
                                 transa='T')

        S_is_sum_gpu = S_is_gpu.reshape((k, d * d))
        S_is_sum_gpu = skcuda.misc.sum(S_is_sum_gpu, axis=0, keepdims=True)
        S_is_sum_gpu = S_is_sum_gpu.reshape((d, d))
        S_is_diff_gpu = skcuda.misc.subtract(S_is_sum_gpu, S_D_gpu)

        # Refresh the transform and the subspace size
        w, V_gpu = sorted_eig(S_is_diff_gpu, mode='gpu')
        maxVal = min(w)
        m = np.sum([1 for i in w if i / maxVal > 1e-3])
        m = max(1, m)
        itr += 1
    return C_gpu.get(), V_gpu.get(), m
def gpu_transpose(a):
    """Upload ``a``, transpose it on the GPU and return the host copy."""
    uploaded = gpuarray.to_gpu(a)
    return linalg.transpose(uploaded).get()
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import skcuda.linalg as linalg

# Time a transpose with CUDA events.  The start event is recorded before
# linalg.init(), the host-side matrix build and the upload, so the reported
# interval covers those steps too.
s = cuda.Event()
e = cuda.Event()
s.record()

N = 32 * 1024
linalg.init()
a = np.tril(np.ones(N, dtype=np.float32))
a_gpu = gpuarray.to_gpu(a)
at_gpu = linalg.transpose(a_gpu)
# Function-call form of print works on both Python 2 and Python 3.
print("done")

e.record()
e.synchronize()
print(s.time_till(e))
ff = transf(ff) ff[ff<0] = 0 ff[ff>2**15] = 0 # sometimes there is a problem with saving signed/unsigned ff values while ff.max() > 7: # rescale ff ff /= 10 # print(ff.max()) else: ff = np.zeros(outShape) if useGPU: signorms = linalg.norm(signals, axis=1, keepdims=True) signormsRep = np.repeat(signorms, signals.shape[1], axis=1) signormsGPU = pycuda.gpuarray.to_gpu(signormsRep.astype(np.float32)) signalsGPU = pycuda.gpuarray.to_gpu(signals.astype(np.float32)) signalsGPU = sklinalg.transpose(skmisc.divide(signalsGPU, signormsGPU)) del signormsGPU ROWSTEP = 14 if fitType == 0: signorms = linalg.norm(signals, axis=1, keepdims=True) signormsRep = np.repeat(signorms, signals.shape[1], axis=1) signalsCPU = np.transpose( signals / signormsRep) ROWSTEP = 14 for slc in range(*sliceRange): print(slc) if fatT2 <= 0: print("Searching fat...") fatT2 = fitSlc(int((sliceRange[1]-sliceRange[0])/2+sliceRange[0]), True, t2, b1, ff) ffl = FatFractionLookup(t2Lim, b1Lim, fatT2, etl, echoSpacing, refocusingFactor)
def process(self, **kwargs):
    """Calculate the likelihood, returning ln(likelihood).

    Reads ``fractions``, ``model_observations``, ``upperlimits``,
    ``kdiagonal``, ``kresiduals`` and either ``kmat`` (full covariance) or
    ``obandvs`` (diagonal shortcut) from ``kwargs``.

    Returns a dict whose ``'value'`` is the score, never below
    ``LIKELIHOOD_FLOOR``.  The floor-valued dict is returned early for
    missing or out-of-range inputs, non-finite observations, an
    ill-conditioned ``kmat`` or a non-finite score.  When the ``kmat``
    branch completes, ``'kdiagonal'`` and ``'kresiduals'`` are included.

    Raises ``RuntimeError`` if ``kfmat`` is passed without ``kmat``.
    """
    ret = {'value': LIKELIHOOD_FLOOR}

    self._fractions = kwargs.get('fractions', [])
    if not len(self._fractions):
        return ret

    self._model_observations = kwargs['model_observations']
    self._score_modifier = kwargs.get(self.key('score_modifier'), 0.0)
    self._upper_limits = np.array(kwargs.get('upperlimits', []), dtype=bool)

    value = ret['value']

    # Fractions outside [0, 1] are rejected.
    if min(self._fractions) < 0.0 or max(self._fractions) > 1.0:
        return ret
    # Non-finite model observations are only tolerated for upper limits.
    for oi, obs in enumerate(self._model_observations):
        if not self._upper_limits[oi] and (isnan(obs) or not np.isfinite(obs)):
            return ret
    diag = kwargs.get('kdiagonal', None)
    residuals = kwargs.get('kresiduals', None)

    if diag is None or residuals is None:
        return ret

    if kwargs.get('kmat', None) is not None:
        kmat = kwargs['kmat']

        # Add observed errors to diagonal
        # NOTE(review): this modifies the caller's kmat array in place.
        kmat[np.diag_indices_from(kmat)] += diag

        # full_size = np.count_nonzero(kmat)

        # Remove small covariance terms
        # min_cov = self.MIN_COV_TERM * np.max(kmat)
        # kmat[kmat <= min_cov] = 0.0

        # print("Sparse frac: {:.2%}".format(
        #     float(full_size - np.count_nonzero(kmat)) / full_size))

        # Give up on badly conditioned matrices.
        condn = np.linalg.cond(kmat)
        if condn > 1.0e10:
            return ret

        # GPU path: tried unless the CPU has already been selected.
        if self._use_cpu is not True and self._model._fitter._cuda:
            try:
                import pycuda.gpuarray as gpuarray
                import skcuda.linalg as skla
            except ImportError:
                # CUDA libraries unavailable: switch to the CPU path below.
                self._use_cpu = True
                if not self._cuda_reported:
                    self._printer.message('cuda_not_enabled', master_only=True, warning=True)
            else:
                self._use_cpu = False
                if not self._cuda_reported:
                    self._printer.message('cuda_enabled', master_only=True)
                    self._cuda_reported = True

                kmat_gpu = gpuarray.to_gpu(kmat)
                # kmat will now contain the cholesky decomp.
                skla.cholesky(kmat_gpu, lib='cusolver')
                value = -np.log(skla.det(kmat_gpu, lib='cusolver'))
                res_gpu = gpuarray.to_gpu(
                    residuals.reshape(len(residuals), 1))
                # cho_solve is given a copy, so res_gpu stays available for
                # the product below.
                cho_mat_gpu = res_gpu.copy()
                skla.cho_solve(kmat_gpu, cho_mat_gpu, lib='cusolver')
                value -= (0.5 * (skla.mdot(skla.transpose(res_gpu), cho_mat_gpu)).get())[0][0]

        # CPU path: Cholesky first, explicit inverse/determinant as fallback.
        if self._use_cpu:
            try:
                chol_kmat = scipy.linalg.cholesky(kmat, check_finite=False)

                value = -np.linalg.slogdet(chol_kmat)[-1]
                value -= 0.5 * (np.matmul(
                    residuals.T, scipy.linalg.cho_solve(
                        (chol_kmat, False), residuals, check_finite=False)))
            except Exception:
                try:
                    value = -0.5 * (np.matmul(
                        np.matmul(residuals.T, scipy.linalg.inv(kmat)), residuals) + np.log(scipy.linalg.det(kmat)))
                except scipy.linalg.LinAlgError:
                    return ret

        ret['kdiagonal'] = diag
        ret['kresiduals'] = residuals
    elif 'kfmat' in kwargs:
        raise RuntimeError('Should not have kfmat in likelihood!')
    else:
        # Shortcut when matrix is diagonal.
        self._o_band_vs = kwargs['obandvs']
        # print('likelihood')
        # print(np.sqrt(diag))
        # print(self._o_band_vs)
        # print(residuals)
        value = -0.5 * np.sum(residuals**2 / (self._o_band_vs**2 + diag) + np.log(self._o_band_vs**2 + diag))

    score = self._score_modifier + value
    if isnan(score) or not np.isfinite(score):
        return ret
    ret['value'] = max(LIKELIHOOD_FLOOR, score)
    return ret
time_cula = [] for i in N: t = np.float32 n = i * 32 a = np.asarray(np.random.rand(n,n), t) start = time.time() c = np.transpose(a) time_cpu.append(time.time() - start) a_gpu = gpuarray.to_gpu(a) start = time.time() c_gpu = culinalg.transpose(a_gpu) time_linalg.append(time.time() - start) a_gpu2 = gpuarray.to_gpu(a) cula_result = gpuarray.empty((n, n), np.float32) #culaGetVersion ''' culaInitialize start = time.time() culaDeviceSgeTranspose(n, n, a_gpu2.gpudata, n, cula_result.gpudata, n) time_cula.append(time.time() - start)
x0[:,slice(0,laz),0] = read_rec_as_arr(fp_img,nc,laz,0) for j in range(laz): x0[:,j,0]=np.roll(x0[:,j,0],int(r_shift[j]),axis=0) for i in range(nproc): x0[:,slice(laz,nl),0] = read_rec_as_arr(fp_img,nc,nread,laz+i*nread) for k in range(laz,nl): x0[:,k,0]=np.roll(x0[:,k,0],int(r_shift[k+i*nread]),axis=0) x_gpu = gpuarray.to_gpu(x0) xt_gpu = gpuarray.to_gpu(np.empty((nl, nc, 1), np.complex64)) cu_fft.fft(x_gpu, x_gpu, az_plan) x_gpu[:,:,0] = linalg.misc.multiply(x_gpu[:,:,0],pf1_gpu) xt_gpu[:,:,0] = linalg.transpose(x_gpu[:,:,0]) cu_fft.fft(xt_gpu, xt_gpu, rg_plan) x_gpu[:,:,0] = linalg.transpose(xt_gpu[:,:,0]) x_gpu[:,:,0] = linalg.misc.multiply(x_gpu[:,:,0],pf2_gpu) xt_gpu[:,:,0] = linalg.transpose(x_gpu[:,:,0]) cu_fft.ifft(xt_gpu, xt_gpu, rg_plan, True) x_gpu[:,:,0] = linalg.transpose(xt_gpu[:,:,0]) x_gpu[:,:,0] = linalg.misc.multiply(x_gpu[:,:,0],pf3_gpu) cu_fft.ifft(x_gpu, x_gpu, az_plan, True) <<<<<<< HEAD slc_gpu[:,slice(i*nread,(i+1)*nread)] = x_gpu[:,slice(0,nl-laz),0].get() x0 = np.roll(x0,-nread,axis=1) print(i) elapsed_time = time.time() - start print("elapsed_time:{0}".format(elapsed_time) + "[sec]")
def kernel(A, B, C, transa='N'):
    """Invoke ``kern`` on ``(A, B, C)``, transposing ``A`` first if ``transa == 'T'``."""
    lhs = linalg.transpose(A) if transa == 'T' else A
    kern(lhs, B, C)