def gemm_v2():
    """
    Multiply C-ordered inputs with cuBLAS by asking GEMM to transpose them.

    The inputs are left in NumPy's default C order; the two 'T' flags passed
    to gemm tell it to apply a transpose to each input matrix.  The output
    buffer is still allocated in Fortran order.

    See the argument description in:
    http://docs.continuum.io/numbapro/cudalib.html#blas-level-2
    """
    banner = "Version 2".center(80, '=')
    print(banner)

    # Operands in default (C) order, output buffer in Fortran order.
    lhs = np.array(np.arange(N**2, dtype=np.float32).reshape(N, N))
    scale = np.array(np.arange(N) + 10, dtype=lhs.dtype)
    gpu_out = np.zeros_like(lhs, order='F')

    # Host reference: lhs times the diagonal matrix built from `scale`.
    t0 = timer()
    expected = np.dot(lhs, np.diag(scale))
    numpy_time = timer() - t0
    print("Numpy took %f seconds" % numpy_time)

    # Same product through cuBLAS; the result is written into gpu_out.
    handle = cublas.Blas()
    t0 = timer()
    handle.gemm('T', 'T', N, N, N, 1.0, lhs, np.diag(scale), 1.0, gpu_out)
    cuda_time = timer() - t0
    print("CUBLAS took %f seconds" % cuda_time)

    # Largest element-wise deviation between the two results.
    print("Maximum error %f" % np.max(np.abs(gpu_out - expected)))
def gemm():
    """
    Compare NumPy and cuBLAS on the product of two random square matrices.

    Two ``dim`` x ``dim`` matrices are drawn with ``np.random.rand``, multiplied
    once with ``np.dot`` and once with ``cublas.Blas().gemm``, and the inputs,
    the cuBLAS result, both timings and the maximum absolute difference are
    printed.  Nothing is returned.

    The inputs stay in C order; the 'T' flags ask gemm to transpose them, and
    the output buffer is allocated in Fortran order.
    """
    print("Version 2".center(80, '='))

    A = np.random.rand(dim, dim)
    B = np.random.rand(dim, dim)
    D = np.zeros_like(A, order='F')

    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid on Python 3.
    print("MATRIX A :")
    print(A)
    # NOTE(review): the label says "VECTOR" but B is a dim x dim matrix.
    print("VECTOR B :")
    print(B)

    # NumPy
    start = timer()
    E = np.dot(A, B)
    numpy_time = timer() - start
    print("Numpy took %f seconds" % numpy_time)

    # cuBLAS
    blas = cublas.Blas()
    start = timer()
    blas.gemm('T', 'T', dim, dim, dim, 1.0, A, B, 1.0, D)
    cuda_time = timer() - start

    print("RESULT MATRIX EVALUATED WITH CUBLAS")
    print(D)
    print("CUBLAS took %f seconds" % cuda_time)

    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))
def gemm_v1():
    """
    Multiply with cuBLAS using arrays that are all in Fortran order.

    Both inputs and the output buffer are created with order='F', so gemm is
    called with the 'N' (no transpose) flag for each operand.
    """
    banner = "Version 1".center(80, '=')
    print(banner)

    # Operands and output buffer, all Fortran-ordered.
    lhs = np.array(np.arange(N**2, dtype=np.float32).reshape(N, N), order='F')
    scale = np.array(np.arange(N) + 10, dtype=lhs.dtype, order='F')
    gpu_out = np.zeros_like(lhs, order='F')

    # Host reference: lhs times the diagonal matrix built from `scale`.
    t0 = timer()
    expected = np.dot(lhs, np.diag(scale))
    numpy_time = timer() - t0
    print("Numpy took %f seconds" % numpy_time)

    # Same product through cuBLAS; the result is written into gpu_out.
    handle = cublas.Blas()
    t0 = timer()
    handle.gemm('N', 'N', N, N, N, 1.0, lhs, np.diag(scale), 1.0, gpu_out)
    cuda_time = timer() - t0
    print("CUBLAS took %f seconds" % cuda_time)

    # Largest element-wise deviation between the two results.
    print("Maximum error %f" % np.max(np.abs(gpu_out - expected)))
def infer(learner, stimuli, coeffs=None):
    """
    Run the iterative inference loop on the GPU for a batch of stimuli.

    Args:
        learner: Object providing ``Q`` (the dictionary; first axis is the
            dictionary element), ``niter``, ``infrate``, ``min_thresh``,
            ``adapt`` and ``softthresh``.
        stimuli: 2-D batch; first axis is the stimulus, second the data.
        coeffs: Optional initial values for ``u``.  Broadcast into a
            (numStim, numDict) array; zeros are used when omitted.

    Returns:
        Tuple ``(s.T, u.T, thresh)`` where ``s`` and ``u`` are the
        (numStim, numDict) arrays copied back from the device and ``thresh``
        is the per-stimulus mean of ``|b|`` computed before the iterations.
    """
    # Get Blas routines
    blas = cublas.Blas()

    # Initialize arrays
    numDict = learner.Q.shape[0]
    numStim = stimuli.shape[0]
    dataLength = stimuli.shape[1]

    u = np.zeros((numStim, numDict), dtype=np.float32, order='F')
    if coeffs is not None:
        u[:] = np.atleast_2d(coeffs)
    d_u = cuda.to_device(u)
    d_s = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_b = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_ci = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_c = cuda.to_device(
        np.zeros((numDict, numDict), dtype=np.float32, order='F'))

    # Move inputs to GPU
    d_dictionary = cuda.to_device(
        np.array(learner.Q, dtype=np.float32, order='F'))
    d_stimuli = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F'))

    blockdim2 = (32, 32)  # TODO: experiment, was all 32s
    blockdim1 = 32
    # Integer ceiling division.  int(ceil(a / b)) only rounds up when `/` is
    # true division; with integer division it silently floors and the grid
    # no longer covers the whole array.
    # NOTE(review): rounding up assumes the kernels bounds-check their
    # indices, which the original use of ceil() implies - confirm.
    griddimcsub = (numDict + blockdim1 - 1) // blockdim1
    griddimi = ((numStim + blockdim2[0] - 1) // blockdim2[0],
                (numDict + blockdim2[1] - 1) // blockdim2[1])

    # Calculate c: overlap of basis functions with each other minus identity
    blas.gemm('N', 'T', numDict, numDict, dataLength, 1., d_dictionary,
              d_dictionary, 0., d_c)
    LCALearner.csub[griddimcsub, blockdim1](d_c)

    # b: stimuli times dictionary^T
    blas.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stimuli,
              d_dictionary, 0., d_b)
    thresh = np.mean(np.absolute(d_b.copy_to_host()), axis=1)
    d_thresh = cuda.to_device(thresh)

    # Update u[i] and s[i] for niter time steps
    for kk in range(learner.niter):
        # Calculate ci: amount other neurons are stimulated times overlap
        # with rest of basis
        blas.gemm('N', 'N', numStim, numDict, numDict, 1., d_s, d_c, 0., d_ci)
        LCALearner.iterate[griddimi, blockdim2](
            d_c, d_b, d_ci, d_u, d_s, learner.infrate, d_thresh,
            learner.min_thresh, learner.adapt, learner.softthresh)

    u = d_u.copy_to_host()
    s = d_s.copy_to_host()
    return s.T, u.T, thresh
def infer(dictionary, coeffs, stimuli, eta, lamb, nIter, softThresh, adapt):
    """
    Run the iterative inference loop on the GPU for a batch of stimuli.

    Args:
        dictionary: 2-D array; first axis is the dictionary element.
        coeffs: Accepted for interface compatibility; this implementation
            does not read it (``u`` always starts from zeros).
        stimuli: 2-D batch; first axis is the stimulus, second the data.
        eta, lamb, adapt, softThresh: Forwarded unchanged to the ``iter``
            kernel on every step.
        nIter: Number of update steps to run.

    Returns:
        Tuple ``(s, u, thresh)`` where ``s`` and ``u`` are the
        (numStim, numDict) arrays copied back from the device and ``thresh``
        is the per-stimulus mean of ``|b|`` computed before the iterations.
    """
    # Get Blas routines
    bs = cublas.Blas()

    # Initialize arrays
    numDict = dictionary.shape[0]
    numStim = stimuli.shape[0]
    dataLength = stimuli.shape[1]

    d_u = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_s = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_b = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_ci = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_c = cuda.to_device(
        np.zeros((numDict, numDict), dtype=np.float32, order='F'))

    # Move inputs to GPU.  `coeffs` used to be uploaded here as well, but the
    # device copy was never read, so that transfer has been dropped.
    d_dictionary = cuda.to_device(
        np.array(dictionary, dtype=np.float32, order='F'))
    d_stimuli = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F'))

    blockdim2 = (32, 32)
    blockdim1 = 32
    # NOTE(review): these grid sizes round down, so trailing rows/columns are
    # presumably left unprocessed when numStim or numDict is not a multiple
    # of 32 - confirm callers pad their inputs.
    griddimcsub = int(numDict / blockdim1)
    griddimi = (int(numStim / blockdim2[0]), int(numDict / blockdim2[1]))

    # Calculate c: overlap of basis functions with each other minus identity
    bs.gemm('N', 'T', numDict, numDict, dataLength, 1., d_dictionary,
            d_dictionary, 0., d_c)
    csub[griddimcsub, blockdim1](d_c)

    # b: stimuli times dictionary^T
    bs.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stimuli,
            d_dictionary, 0., d_b)
    thresh = np.mean(np.absolute(d_b.copy_to_host()), axis=1)
    d_thresh = cuda.to_device(thresh)

    # Update u[i] and s[i] for nIter time steps
    for kk in range(nIter):
        # Calculate ci: amount other neurons are stimulated times overlap
        # with rest of basis
        bs.gemm('N', 'N', numStim, numDict, numDict, 1., d_s, d_c, 0., d_ci)
        iter[griddimi, blockdim2](d_c, d_b, d_ci, d_u, d_s, eta, d_thresh,
                                  lamb, adapt, softThresh)

    u = d_u.copy_to_host()
    s = d_s.copy_to_host()
    return (s, u, thresh)
def mp(dictionary, stimuli, k=None, minabs=None):
    """
    Does matching pursuit on a batch of stimuli.

    Args:
        dictionary: Dictionary for matching pursuit. First axis should be
            dictionary element number.
        stimuli: Stimulus batch for matching pursuit. First axis should be
            stimulus number.
        k: Sparseness constraint. At most k iterations are run, so at most k
            dictionary elements are used per stimulus. Defaults to the number
            of dictionary elements.
        minabs: Iteration stops once the mean absolute value of the remaining
            signal is no larger than this. Defaults to zero.

    Returns:
        coeffs: (numStim, numDict) float32 array of dictionary element
            coefficients, copied back from the device.

    Raises:
        AssertionError: If k exceeds the number of dictionary elements, or if
            numStim, dataLength or numDict is not a multiple of 32.
    """
    if k is None:
        k = dictionary.shape[0]
    if minabs is None:
        minabs = 0.

    bs = cublas.Blas()
    numDict = dictionary.shape[0]
    numStim = stimuli.shape[0]
    dataLength = stimuli.shape[1]
    assert k <= numDict

    # Setup variables on GPU.  Each buffer is allocated exactly once; the
    # earlier duplicate allocation of d_coefsd and the never-read d_stimt
    # buffer have been removed.
    d_coefs = cuda.to_device(
        np.zeros(shape=(numStim, numDict), dtype=np.float32, order='F'))
    d_curCoef = cuda.to_device(
        np.zeros(shape=(numStim, numDict), dtype=np.float32, order='F'))
    d_coefsd = cuda.to_device(
        np.zeros(shape=(numStim, numDict), dtype=np.float32, order='F'))
    d_winners = cuda.to_device(
        np.zeros(shape=(k, numStim), dtype=np.int64, order='F'))
    d_delta = cuda.to_device(
        np.zeros_like(stimuli, dtype=np.float32, order='F'))

    # Move args to GPU
    d_stim = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F'))
    d_dict = cuda.to_device(np.array(dictionary, dtype=np.float32, order='F'))

    # Launch configuration: 32 blocks of numStim / 32 threads each.
    griddim1 = 32
    assert numStim % 32 == 0 and dataLength % 32 == 0 and numDict % 32 == 0
    blockdimstim = numStim // griddim1

    for ii in range(k):
        # Stop once the residual is small enough.
        if minabs >= np.mean(np.absolute(d_stim.copy_to_host())):
            break

        # curCoef = residual * dictionary^T
        bs.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stim, d_dict,
                0., d_curCoef)
        if ii > 0:
            removeWinners[griddim1, blockdimstim](d_curCoef, d_winners, ii)
        maxCoefsABS[griddim1, blockdimstim](d_curCoef, d_coefs, d_coefsd,
                                            d_winners, ii, 0)

        # delta = coefsd * dictionary
        bs.gemm('N', 'N', numStim, dataLength, numDict, 1., d_coefsd, d_dict,
                0., d_delta)
        # Reset coefsd to zero in place (0*coefsd + 0*coefsd).
        bs.geam('N', 'N', numStim, numDict, 0., d_coefsd, 0., d_coefsd,
                d_coefsd)
        # residual -= delta
        bs.geam('N', 'N', numStim, dataLength, 1., d_stim, -1., d_delta,
                d_stim)

    return d_coefs.copy_to_host()
def fista(I, Phi, lambdav, L=None, tol=10e-6, max_iterations=200,
          display=True, verbose=False):
    """
    Run FISTA iterations on the GPU and return the final coefficients.

    The objective printed at the end is
    ``lambdav * sum(|x|) + sum((I - Phi.dot(x))**2)``.

    Args:
        I: (m, batch) array of inputs.
        Phi: (m, n) sparse matrix (it is passed to ``scipy.sparse.linalg.svds``
            and ``cusparse.csr_matrix``).
        lambdav: Weight of the L1 term.
        L: Largest singular value of Phi.  Computed with ``svds`` when None.
            It is squared and multiplied by 4 to obtain the step constant.
        tol: Relative change of the objective below which iteration stops.
        max_iterations: Upper bound on the number of iterations.
        display: Print a summary after the loop.
        verbose: Print the objective after every iteration.

    Returns:
        (n, batch) host array copied from the device buffer holding the last
        proximal step.
    """
    b = cublas.Blas()
    c = cusparse.Sparse()
    descr = c.matdescr()

    (m, n) = Phi.shape
    (m, batch) = I.shape

    # `is None` rather than `== None`: the latter is evaluated element-wise
    # when L is a NumPy array.
    if L is None:
        L = scipy.sparse.linalg.svds(Phi, 1, which='LM',
                                     return_singular_vectors=False)
        print("Max eigenvalue: ." + str(L))

    L = (L**2)*4  # L = svd(Phi) -> eig(2*(Phi.T*Phi))
    # 1.0 / L so that an integer L is not truncated to 0 by integer division.
    invL = 1.0 / L
    t = 1.

    d_I = cuda.to_device(np.array(I, dtype=np.float32, order='F'))
    d_Phi = cusparse.csr_matrix(Phi, dtype=np.float32)
    # hack because csrgemm issues with 'T'
    d_PhiT = cusparse.csr_matrix(Phi.T, dtype=np.float32)

    d_c = cuda.device_array((n, batch), dtype=np.float32, order='F')
    d_x = cuda.to_device(np.zeros((n, batch), dtype=np.float32, order='F'))
    d_y = cuda.to_device(np.zeros((n, batch), dtype=np.float32, order='F'))
    d_x2 = cuda.to_device(np.zeros((n, batch), dtype=np.float32, order='F'))

    # Temporary array variables
    d_t = cuda.device_array((m, batch), dtype=np.float32, order='F')
    d_t2 = cuda.device_array(n*batch, dtype=np.float32, order='F')

    # Q = Phi^T * Phi
    d_Q = c.csrgemm_ez(d_PhiT, d_Phi, transA='N', transB='N')
    # c = -2 * Phi^T * I
    c.csrmm('T', m, batch, n, d_Phi.nnz, -2, descr, d_Phi.data, d_Phi.indptr,
            d_Phi.indices, d_I, m, 0, d_c, n)

    # Integer ceiling division.  math.ceil(a / b) only rounds up when `/` is
    # true division; with integer division it floors and the grid no longer
    # covers the whole array.
    # NOTE(review): rounding up assumes the kernels bounds-check their
    # indices, which the original use of ceil() implies - confirm.
    blockdim = 32, 32
    griddim = ((n + blockdim[0] - 1) // blockdim[0],
               (batch + blockdim[1] - 1) // blockdim[1])
    blockdim_1d = 256
    griddim_1d = (n*batch + blockdim_1d - 1) // blockdim_1d

    start = l2l1obj(b, c, descr, d_I, d_Phi, d_x, d_t, d_t2, lambdav,
                    blockdim_1d, griddim_1d)
    obj2 = start

    for i in range(max_iterations):
        # x2 = 2*Q*y + c
        c.csrmm('N', n, batch, n, d_Q.nnz, 2, descr, d_Q.data, d_Q.indptr,
                d_Q.indices, d_y, n, 0, d_x2, n)
        b.geam('N', 'N', n, batch, 1, d_c, 1, d_x2, d_x2)

        # x2 = y - invL * x2
        b.geam('N', 'N', n, batch, 1, d_y, -invL, d_x2, d_x2)

        # proxOp()
        l1prox[griddim, blockdim](d_x2, invL*lambdav, d_x2)
        t2 = (1+math.sqrt(1+4*(t**2)))/2.0

        # y = x2 + ((t-1)/t2)*(x2-x)
        b.geam('N', 'N', n, batch, 1+(t-1)/t2, d_x2, (1-t)/t2, d_x, d_y)

        # x = x2
        b.geam('N', 'N', n, batch, 1, d_x2, 0, d_x, d_x)
        t = t2

        # update objective
        obj = obj2
        obj2 = l2l1obj(b, c, descr, d_I, d_Phi, d_x2, d_t, d_t2, lambdav,
                       blockdim_1d, griddim_1d)

        if verbose:
            # The per-iteration copy of d_x2 to the host was removed: its
            # result was never read before being overwritten after the loop.
            print("L1 Objective: " + str(obj2))

        if np.abs(obj-obj2)/float(obj) < tol:
            break

    x2 = d_x2.copy_to_host()

    if display:
        print("FISTA Iterations: " + str(i))
        print("L1 Objective: " + str(lambdav*np.sum(np.abs(x2)) + np.sum((I-Phi.dot(x2))**2)))
        print("Objective delta: " + str(obj2-start))

    return x2