import time

import numpy as np
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg


def logis(y, x):
    end = 0
    start = 0
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    start = time.time()
    # Transfer the variables to the GPU
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    linalg.init()
    # Transpose of X
    x_gpu_T = linalg.transpose(x_gpu)
    # Initial estimate via ordinary least squares: beta = (X'X)^-1 X'y
    beta_gpu = linalg.dot(linalg.dot(linalg.inv(linalg.dot(x_gpu_T, x_gpu)), x_gpu_T), y_gpu)
    j = 1
    while True:
        # Fitted mean, computed on the CPU from the current beta
        mu = sapply(x, beta_gpu.get())
        mu = mu.astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu)
        V_gpu = linalg.diag(mu_gpu)
        f2_gpu = linalg.multiply(mu_gpu, 1 - mu_gpu)
        f3_gpu = linalg.diag(1 / f2_gpu)
        f4_gpu = (y_gpu - mu_gpu)
        f5_gpu = linalg.dot(f3_gpu, f4_gpu)
        if np.isnan(f5_gpu.get()).any():
            f5_cpu = f5_gpu.get()
            f5_cpu = nanValue(f5_cpu)
            f5_gpu = gpuarray.to_gpu(f5_cpu.astype(np.float32))
        # Working response and weighted least-squares update (IRLS step)
        y_1_gpu = linalg.dot(x_gpu, beta_gpu) + f5_gpu
        beta_1_gpu = linalg.dot(
            linalg.dot(
                linalg.dot(linalg.inv(linalg.dot(linalg.dot(x_gpu_T, V_gpu), x_gpu)), x_gpu_T),
                V_gpu),
            y_1_gpu)
        check_value = np.absolute(linalg.norm(beta_1_gpu - beta_gpu))
        # Stop after 10 iterations or once the update is small enough
        if j == 10 or check_value < 0.00001:
            break
        beta_gpu = beta_1_gpu
        j = j + 1
    end = time.time()
    tiempo = (end - start)
    return {"iteraciones": j, "Betas": beta_gpu.get(), "time": tiempo}
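# The routine above relies on two project helpers, sapply() and nanValue(), that are
# defined elsewhere. The definitions below are only a minimal sketch of plausible CPU
# implementations, assuming sapply() evaluates the logistic mean mu = sigmoid(X @ beta)
# and nanValue() replaces non-finite entries; they are assumptions, not the originals.
import numpy as np


def sapply(x, beta):
    # Assumed behaviour: elementwise logistic (sigmoid) of the linear predictor.
    return 1.0 / (1.0 + np.exp(-x.dot(beta)))


def nanValue(a, fill=0.0):
    # Assumed behaviour: replace NaN/Inf entries so the GPU update can proceed.
    a[~np.isfinite(a)] = fill
    return a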
def test_diag_1d_complex128(self):
    v = np.array([1j, 2j, 3j, 4j, 5j, 6j], np.complex128)
    v_gpu = gpuarray.to_gpu(v)
    d_gpu = linalg.diag(v_gpu)
    assert np.all(np.diag(v) == d_gpu.get())

def test_diag_2d_wide_complex64(self):
    v = np.array(np.random.rand(32, 64)*1j, np.complex64)
    v_gpu = gpuarray.to_gpu(v)
    d_gpu = linalg.diag(v_gpu)
    assert np.all(np.diag(v) == d_gpu.get())

def test_diag_2d_tall_float64(self):
    v = np.array(np.random.rand(64, 32), np.float64)
    v_gpu = gpuarray.to_gpu(v)
    d_gpu = linalg.diag(v_gpu)
    assert np.all(np.diag(v) == d_gpu.get())

def test_diag_1d_float64(self):
    v = np.array([1, 2, 3, 4, 5, 6], np.float64)
    v_gpu = gpuarray.to_gpu(v)
    d_gpu = linalg.diag(v_gpu)
    assert np.all(np.diag(v) == d_gpu.get())

def test_diag_2d_wide_float32(self):
    v = np.array(np.random.rand(32, 64), np.float32)
    v_gpu = gpuarray.to_gpu(v)
    d_gpu = linalg.diag(v_gpu)
    assert np.all(np.diag(v) == d_gpu.get())
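# A short sketch of the behaviour these tests exercise (assuming a CUDA device and
# scikit-cuda are available): like numpy.diag, skcuda.linalg.diag builds a diagonal
# matrix from a 1-D input and extracts the main diagonal from a 2-D input.
import numpy as np
import pycuda.autoinit  # noqa: F401  (creates the CUDA context)
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg

linalg.init()
v_gpu = gpuarray.to_gpu(np.arange(1, 4, dtype=np.float32))
print(linalg.diag(v_gpu).get())   # 3x3 matrix with 1, 2, 3 on the diagonal
m_gpu = gpuarray.to_gpu(np.random.rand(3, 5).astype(np.float32))
print(linalg.diag(m_gpu).get())   # length-3 main diagonal of the 3x5 matrix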
#!/usr/bin/env python

"""
Demonstrate diagonal matrix creation on the GPU.
"""

from __future__ import print_function

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import numpy as np

import skcuda.linalg as culinalg
import skcuda.misc as cumisc
culinalg.init()

# Double precision is only supported by devices with compute
# capability >= 1.3:
demo_types = [np.float32, np.complex64]
if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
    demo_types.extend([np.float64, np.complex128])

for t in demo_types:
    print('Testing diagonal matrix creation for type ' + str(np.dtype(t)))
    v = np.array([1, 2, 3, 4, 5, 6], t)
    v_gpu = gpuarray.to_gpu(v)
    d_gpu = culinalg.diag(v_gpu)
    print('Success status: ', np.all(d_gpu.get() == np.diag(v)))
def FastICASymmApro(X, whitening, dewhitening, maxIterations, threshold):
    Threads = 32
    ThreadBlock = (Threads, Threads, 1)
    Dim, NumOfSampl = X.shape
    Dim = np.int32(Dim)
    B = linalg.orth(np.random.random((Dim, Dim))).astype(np.float32)
    # linalg.orth makes the array non-contiguous
    # B.flags['C_CONTIGUOUS']
    print(B)
    B_gpu = gpuarray.to_gpu(np.ascontiguousarray(B, np.float32))
    # Bold
    Bold_gpu = gpuarray.zeros((Dim, Dim), np.float32)
    Bold = np.zeros((Dim, Dim))
    # W
    A = np.zeros((Dim, Dim))  # maybe dtype
    # CTC
    CTC_gpu = gpuarray.zeros((Dim, Dim), np.float32)
    # hypTan
    hypTan_gpu = gpuarray.zeros((NumOfSampl, Dim), np.float32)
    # rowSum
    row = int(np.ceil(NumOfSampl / Threads))
    Sum_gpu = gpuarray.zeros((row, Dim), np.float32)
    rowSum_gpu = gpuarray.zeros(Dim, np.float32)
    # minAbsCos
    minAbsCos_gpu = gpuarray.zeros((Dim, Dim), np.float32)
    # left, right
    left_gpu = gpuarray.zeros((Dim, Dim), np.float32)
    right_gpu = gpuarray.zeros((Dim, Dim), np.float32)
    # Identity
    I_gpu = gpuarray.to_gpu(np.eye(Dim).astype(np.float32))
    Check_gpu = gpuarray.zeros((Dim, Dim), np.float32)
    # diag
    diag_gpu = gpuarray.zeros(Dim, np.float32)
    # start = cuda.Event()
    # end = cuda.Event()
    # start.record()
    # X
    X_gpu = gpuarray.to_gpu(X.astype(np.float32))

    for i in range(0, maxIterations + 1):
        # print(i, maxIterations)
        if i == maxIterations:
            print('Component {} did not converge after {} iterations'.format(i, maxIterations))
            B = B_gpu.get()
            if B.size != 0:  # not empty
                B = B @ np.real(inv(sqrt(B.T @ B)))
                W = B.T @ whitening
                A = dewhitening @ B
                print('A:\n', A)
                print('W:\n', W)
                return A, W
            return None, None  # TODO

        f = True
        j = 0
        gpuMatMul(B_gpu, B_gpu, CTC_gpu, transb='T')
        gpuSumCol(CTC_gpu, Sum_gpu, ThreadBlock, rowSum_gpu)
        # print(np.allclose(rowSum_gpu.get(), gpuSum(CTC_gpu, axis=0).get()))
        norm = findMax(rowSum_gpu, (31, 1, 1))
        # norm = gpuMax(rowSum_gpu)
        Div(B_gpu, norm, Dim, block=ThreadBlock, grid=(1, 1, 1))  # division by scalar

        # Symmetric orthogonalization: B = 3/2 * B - 1/2 * B B^T B
        # (maybe check convergence only every few iterations)
        while f:
            Mul(left_gpu, B_gpu, np.float32(3 / 2), Dim, block=ThreadBlock, grid=(1, 1, 1))  # scale by 3/2
            gpuMatMul(B_gpu, B_gpu, right_gpu, transb='T')
            gpuMatMul(right_gpu, B_gpu, right_gpu)
            Mul(right_gpu, right_gpu, np.float32(1 / 2), Dim, block=ThreadBlock, grid=(1, 1, 1))  # scale by 1/2
            Sub(B_gpu, left_gpu, right_gpu, Dim, block=ThreadBlock, grid=(1, 1, 1))  # C = left - right
            gpuMatMul(B_gpu, B_gpu, Check_gpu, transb='T')
            if j >= 20:
                f = compareGpuC(Check_gpu, I_gpu, ThreadBlock).get()
                f = not (f <= threshold)
                j = 0
            j += 1

        gpuMatMul(B_gpu, Bold_gpu, minAbsCos_gpu, transa='T')
        # minAbsCos2 = findMin(abs(findDiag(minAbsCos_gpu, diag_gpu)), (128, 1, 1)).get()
        minAbsCos2 = gpuMin(abs(diag(minAbsCos_gpu))).get()
        # print(abs(diag(minAbsCos_gpu)))
        minAbsCos = minAbsCos2[0]
        if 1 - minAbsCos < threshold:
            print('Converged!')  # TODO
            # end.record()
            # end.synchronize()
            # secs = start.time_till(end)*1e-3
            # return secs
            # print('Seconds: ', secs)
            # C = B_gpu.get()
            # A = dewhitening @ C
            # W = C.T @ whitening
            # print('A:\n', A)
            # print('W:\n', W)
            # return A, W

        Copy(Bold_gpu, B_gpu, Dim, block=ThreadBlock, grid=(1, 1, 1))  # Bold = B
        gpuMatMul(X_gpu, B_gpu, hypTan_gpu, transa='T')
        n = int(np.ceil(hypTan_gpu.shape[0] / Threads))
        if n > 65535:  # CUDA grid y-dimension limit
            n = 65535
        gpuTanh(hypTan_gpu, np.int32(hypTan_gpu.shape[1]), np.int32(hypTan_gpu.shape[0]),
                block=ThreadBlock, grid=(1, n, 1))
        gpuMatMul(X_gpu, hypTan_gpu, CTC_gpu)
        n = Dim * NumOfSampl
        m = int(np.ceil(hypTan_gpu.shape[0] / (Threads * Threads)))
        if m > 65535:  # CUDA grid x-dimension limit
            m = 65535
        elementWise(hypTan_gpu, np.int32(n), block=(Threads * Threads, 1, 1),
                    grid=(m, 1, 1))  # 1 - hypTan*hypTan
        gpuSumCol(hypTan_gpu, Sum_gpu, ThreadBlock, rowSum_gpu)
        MatVecMul(B_gpu, rowSum_gpu, Dim, block=ThreadBlock, grid=(1, 1, 1))
        Sub(B_gpu, CTC_gpu, B_gpu, Dim, block=ThreadBlock, grid=(1, 1, 1))  # C = left - right
        Div(B_gpu, np.int32(NumOfSampl), Dim, block=ThreadBlock, grid=(1, 1, 1))  # division by scalar
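# A hedged sketch of the preprocessing the routine above expects: X of shape
# (Dim, NumOfSampl) together with whitening/dewhitening matrices. The helper below is
# an assumption based on standard FastICA preprocessing (PCA whitening of the centered
# mixtures), not part of the original code.
import numpy as np


def whiten(X):
    # Center each mixture (row), then whiten via the eigendecomposition of the covariance.
    Xc = X - X.mean(axis=1, keepdims=True)
    eigval, eigvec = np.linalg.eigh(np.cov(Xc))
    whitening = np.diag(1.0 / np.sqrt(eigval)) @ eigvec.T
    dewhitening = eigvec @ np.diag(np.sqrt(eigval))
    return whitening @ Xc, whitening, dewhitening

# Hypothetical usage (the GPU kernels gpuMatMul, gpuSumCol, etc. must be defined elsewhere):
# Xw, whitening, dewhitening = whiten(X)
# A, W = FastICASymmApro(Xw, whitening, dewhitening, maxIterations=1000, threshold=1e-4)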
def diag(A):
    A_gpu = gpuarray.to_gpu(A)
    out_gpu = linalg.diag(A_gpu)
    return out_gpu.get()
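# Hypothetical usage of the wrapper above: it round-trips host -> GPU -> host, so the
# result is an ordinary NumPy array. Assumes gpuarray/linalg are the pycuda.gpuarray and
# skcuda.linalg modules imported earlier and that linalg.init() has already been called.
v = np.array([1.0, 2.0, 3.0], dtype=np.float32)
assert np.allclose(diag(v), np.diag(v))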
def compute_P_cuda(self, C, D):
    dD = culinalg.diag(D)
    CD = culinalg.dot_diag(dD, C, 'T')
    P = culinalg.dot_diag(dD, CD)
    return P.copy()
def test_diag_2d_tall_complex128(self):
    v = np.array(np.random.rand(64, 32)*1j, np.complex128)
    v_gpu = gpuarray.to_gpu(v)
    d_gpu = linalg.diag(v_gpu)
    assert np.all(np.diag(v) == d_gpu.get())