def add_batch(self, X, T, wc=None):
    """Add a batch of training data to an iterative solution, weighted if needed.

    The batch is processed as a whole; the training data is split in the
    `ELM.add_data()` method.

    Args:
        X (matrix): input data matrix, size (N * `inputs`)
        T (matrix): output data matrix, size (N * `outputs`)
        wc (vector): weights for data samples, one weight per sample, size (N * 1)
    """
    devH = self._project(X, dev=True)
    T = np.array(T, order="C", dtype=self.precision)
    devT = gpuarray.to_gpu(T)
    if wc is not None:  # apply weights if given
        w = np.array(wc**0.5, dtype=self.precision)[:, None]  # re-shape to column matrix
        devWC = gpuarray.to_gpu(w)
        misc.mult_matvec(devH, devWC, axis=0, out=devH)
        misc.mult_matvec(devT, devWC, axis=0, out=devT)
    if self.HH is None:  # initialize space for self.HH, self.HT
        self.HT = misc.zeros((self.L, self.outputs), dtype=self.precision)
        self.HH = linalg.eye(self.L, dtype=self.precision)
        self.HH *= self.norm
    linalg.add_dot(devH, devT, self.HT, transa='T')
    if self.precision is np.float64:
        linalg.add_dot(devH, devH, self.HH, transa='T')
    else:
        cublas.cublasSsyrk(self.handle, 'L', 'N', self.L, X.shape[0],
                           1, devH.ptr, self.L, 1, self.HH.ptr, self.L)
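# A minimal, hypothetical sketch (not part of the class above) of what add_batch
# accumulates: HT += H^T T and HH += H^T H on top of a ridge term, checked against
# NumPy. Assumes a CUDA device plus pycuda/skcuda; the names H, T, devHH, devHT are
# made up for this illustration.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg

linalg.init()
H = np.random.rand(100, 20)                      # hidden-layer outputs for one batch
T = np.random.rand(100, 3)                       # targets for the same batch
devH, devT = gpuarray.to_gpu(H), gpuarray.to_gpu(T)
devHT = gpuarray.zeros((20, 3), np.float64)
devHH = gpuarray.to_gpu(np.eye(20) * 1e-9)       # ridge term, like self.HH = eye * norm

linalg.add_dot(devH, devT, devHT, transa='T')    # HT += H^T T
linalg.add_dot(devH, devH, devHH, transa='T')    # HH += H^T H
assert np.allclose(devHT.get(), H.T.dot(T))
assert np.allclose(devHH.get(), np.eye(20) * 1e-9 + H.T.dot(H))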
def sqsum_adddot2(a, b):
    """
    Compute squared Euclidean distances between two 2D arrays representing
    n-dimensional points, using the GPU.

    This uses the GPUArray versions of the input arrays to compute the
    row-wise sums of squares and accumulates them into the
    matrix-multiplication result residing on the GPU. The final result
    resides on the GPU.

    Parameters
    ----------
    a : ndarray
        2D NumPy array of float dtype representing n-dimensional points,
        with each row being one point.
    b : ndarray
        2D NumPy array of float dtype representing n-dimensional points,
        with each row being one point.

    Returns
    -------
    out : GPUArray
        This holds the squared Euclidean distances.
    """
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = sq_sums(a_gpu, b_gpu)
    return culinalg.add_dot(a_gpu, b_gpu, c_gpu, transb='T', alpha=-2.0)
def sqsum_adddot(a, b, method):
    """
    Compute squared Euclidean distances between two 2D arrays representing
    n-dimensional points, using the GPU.

    This uses the input arrays themselves to compute the row-wise sums of
    squares and accumulates them into the matrix-multiplication result
    residing on the GPU. The final result resides on the GPU.

    Parameters
    ----------
    a : ndarray
        2D NumPy array of float dtype representing n-dimensional points,
        with each row being one point.
    b : ndarray
        2D NumPy array of float dtype representing n-dimensional points,
        with each row being one point.
    method : str
        It can be 'add_togpu', 'togpu_misc_add' or 'togpu_cuda_add'.
        Refer to function "squared_sum" for more information.

    Returns
    -------
    out : GPUArray
        This holds the squared Euclidean distances, residing on the GPU.
    """
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = squared_sum(a, b, method=method)
    return culinalg.add_dot(a_gpu, b_gpu, c_gpu, transb='T', alpha=-2.0)
def get_distances_to_centers(self, data):
    # make sure the array is C order
    data = np.asarray(data, dtype=np.float32, order='C')

    # ship to gpu
    data_gpu = gpuarray.to_gpu(data)

    # alloc space on gpu for distances
    dists_shape = (data.shape[0], self.centers.shape[0])
    dists_gpu = gpuarray.zeros(dists_shape, np.float32)

    # calc data norms on gpu
    data_norms = cumisc.sum(data_gpu**2, axis=1)

    # calc distance on gpu
    cumisc.add_matvec(dists_gpu, self.center_norms, 1, dists_gpu)
    cumisc.add_matvec(dists_gpu, data_norms, 0, dists_gpu)
    culinalg.add_dot(data_gpu, self.centers_gpu, dists_gpu, transb='T', alpha=-2.0)
    return dists_gpu
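# The three snippets above all assemble squared distances from the expansion
# ||x_i - c_j||^2 = ||x_i||^2 + ||c_j||^2 - 2 * x_i . c_j, with the -2 * X C^T term
# supplied by add_dot(..., transb='T', alpha=-2.0). A NumPy-only check of that
# identity (hypothetical names, no GPU required):
import numpy as np

X = np.random.rand(5, 3)
C = np.random.rand(4, 3)
expanded = (X**2).sum(axis=1)[:, None] + (C**2).sum(axis=1)[None, :] - 2.0 * X.dot(C.T)
direct = ((X[:, None, :] - C[None, :, :])**2).sum(axis=2)
assert np.allclose(expanded, direct)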
def multiplyConv2DHGradKLGPU(W, H, V, VLam, doDivision=True):
    """
    Compute the 2D convolutional multiplicative update for H under the
    Kullback-Leibler divergence, using skcuda

    :param W: A TxNxK matrix of K sources over spatiotemporal spans NxT
    :param H: A FxKxM matrix of source activations for each submatrix of W
        over F transpositions over M time
    :param V: Target array
    :param VLam: Convolutional WH multiplication
    :param doDivision: If true, return the factor Numerator/Denominator;
        otherwise, return (Numerator, Denominator)
    :returns Ratio: A FxKxM matrix of multiplicative updates for H,
        or (RatioNum, RatioDenom) if doDivision = False
    """
    HNums = gpuarray.zeros(H.shape, np.float32)
    HDenoms = gpuarray.zeros((H.shape[0], H.shape[1]), np.float32)
    thisVLam = VLam.copy()
    ZerosToOnes(thisVLam)
    VLamQuot = skcuda.misc.divide(V, thisVLam)
    thisVLamQuot = VLamQuot.copy()
    thisW = W.copy()
    for t in range(W.shape[0]):
        if t > 0:
            z = gpuarray.zeros((V.shape[0], t), np.float32)
            thisVLamQuot[:, 0:-t] = VLamQuot[:, t::]
            thisVLamQuot[:, -t::] = z
        for f in range(H.shape[0]):
            if f > 0:
                thisW[t, f::, :] = W[t, 0:-f, :]
                thisW[t, 0:f, :] = gpuarray.zeros((f, W.shape[2]), np.float32)
            linalg.add_dot(thisW[t, :, :], thisVLamQuot, HNums[f, :, :], transa='T')
            HDenoms[f, :] = skcuda.misc.add(HDenoms[f, :], skcuda.misc.sum(thisW[t, :, :], 0))
    HDenoms = TileHDenom(HDenoms[:, :, None], H.shape[2])
    if doDivision:
        return skcuda.misc.divide(HNums, HDenoms)
    else:
        return (HNums, HDenoms)
def _impl_add_dot_matrix_tests(self, dtype, transa, transb):
    a = np.asarray(np.random.rand(4, 2), dtype)
    if transa == 'n':
        b = np.asarray(np.random.rand(2, 2), dtype)
    else:
        b = np.asarray(np.random.rand(4, 4), dtype)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    aa = a if transa == 'n' else a.T
    bb = b if transb == 'n' else b.T
    c = np.asarray(np.random.rand(aa.shape[0], bb.shape[1]), dtype)
    c_gpu = gpuarray.to_gpu(c)
    c_gpu = linalg.add_dot(a_gpu, b_gpu, c_gpu, transa, transb)
    assert np.allclose(c + np.dot(aa, bb), c_gpu.get())

    a = a.astype(dtype, order="F", copy=True)
    b = b.astype(dtype, order="F", copy=True)
    c = c.astype(dtype, order="F", copy=True)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)
    c_gpu = linalg.add_dot(a_gpu, b_gpu, c_gpu, transa, transb)
    assert np.allclose(c + np.dot(aa, bb), c_gpu.get())
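# For reference, the call pattern the test above exercises, as a standalone sketch
# (assumes a CUDA device plus pycuda/skcuda; add_dot accumulates into its third argument):
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg

linalg.init()
a = np.random.rand(4, 2).astype(np.float32)
b = np.random.rand(2, 3).astype(np.float32)
c = np.random.rand(4, 3).astype(np.float32)
a_gpu, b_gpu, c_gpu = gpuarray.to_gpu(a), gpuarray.to_gpu(b), gpuarray.to_gpu(c)
linalg.add_dot(a_gpu, b_gpu, c_gpu)              # c_gpu <- c_gpu + a_gpu.dot(b_gpu)
assert np.allclose(c_gpu.get(), c + a.dot(b), atol=1e-5)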
def multiplyConv2DHGradGPU(W, H, V, VLam, doDivision=True):
    """
    Compute the 2D convolutional multiplicative update for H using skcuda

    :param W: A TxNxK GPU array of K sources over spatiotemporal spans NxT
    :param H: A FxKxM GPU array of source activations for each submatrix of W
        over F transpositions over M time
    :param V: An MxN GPU target array
    :param VLam: An MxN GPU estimate array
    :param doDivision: If true, return the factor Numerator/Denominator;
        otherwise, return (Numerator, Denominator)
    """
    thisV = V.copy()
    thisVLam = VLam.copy()
    thisW = W.copy()
    HNums = gpuarray.zeros(H.shape, np.float32)
    HDenoms = gpuarray.zeros(H.shape, np.float32)
    for t in range(W.shape[0]):
        if t > 0:
            # thisV = shiftMatLRUD(V, dj=-t)
            z = gpuarray.zeros((V.shape[0], t), np.float32)
            thisV[:, 0:-t] = V[:, t::]
            thisV[:, -t::] = z
            thisVLam[:, 0:-t] = VLam[:, t::]
            thisVLam[:, -t::] = z
        for f in range(H.shape[0]):
            if f > 0:
                # thisW = shiftMatLRUD(W[t, :, :], di=f)
                thisW[t, f::, :] = W[t, 0:-f, :]
                thisW[t, 0:f, :] = gpuarray.zeros((f, W.shape[2]), np.float32)
            linalg.add_dot(thisW[t, :, :], thisV, HNums[f, :, :], transa='T')
            linalg.add_dot(thisW[t, :, :], thisVLam, HDenoms[f, :, :], transa='T')
    if doDivision:
        return skcuda.misc.divide(HNums, HDenoms)
    else:
        return (HNums, HDenoms)
def multiplyConv2DWGradGPU(W, H, V, VLam, doDivision=True):
    """
    Compute the 2D convolutional multiplicative update for W using skcuda

    :param W: A TxNxK GPU array of K sources over spatiotemporal spans NxT
    :param H: A FxKxM GPU array of source activations for each submatrix of W
        over F transpositions over M time
    :param V: An MxN GPU target array
    :param VLam: An MxN GPU estimate array
    :param doDivision: If true, return the factor Numerator/Denominator;
        otherwise, return (Numerator, Denominator)
    """
    thisV = V.copy()
    thisVLam = VLam.copy()
    thisH = H.copy()
    WNums = gpuarray.zeros(W.shape, np.float32)
    WDenoms = gpuarray.zeros(W.shape, np.float32)
    for f in range(H.shape[0]):
        if f > 0:
            z = gpuarray.zeros((f, V.shape[1]), np.float32)
            thisV[0:-f, :] = V[f::, :]
            thisV[-f::, :] = z
            thisVLam[0:-f, :] = VLam[f::, :]
            thisVLam[-f::, :] = z
        for t in range(W.shape[0]):
            if t > 0:
                thisH[f, :, t::] = H[f, :, 0:-t]
                thisH[f, :, 0:t] = gpuarray.zeros((H.shape[1], t), np.float32)
            linalg.add_dot(thisV, thisH[f, :, :], WNums[t, :, :], transb='T')
            linalg.add_dot(thisVLam, thisH[f, :, :], WDenoms[t, :, :], transb='T')
    if doDivision:
        return skcuda.misc.divide(WNums, WDenoms)
    else:
        return (WNums, WDenoms)
def _impl_test_dot_strided(self, dtype):
    # n/n
    a = np.asarray(np.random.rand(4, 10), dtype)
    b = np.asarray(np.random.rand(2, 20), dtype)
    c = np.zeros((4, 30), dtype)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)
    linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:7], 'n', 'n')
    res = c_gpu.get()
    assert np.allclose(np.dot(a[:, 4:6], b[:, 2:8]), res[:, 1:7])

    # t/n
    a = np.asarray(np.random.rand(4, 10), dtype)
    b = np.asarray(np.random.rand(4, 20), dtype)
    c = np.zeros((2, 30), dtype)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)
    linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:7], 't', 'n')
    res = c_gpu.get()
    assert np.allclose(np.dot(a[:, 4:6].T, b[:, 2:8]), res[:, 1:7])

    # n/t
    a = np.asarray(np.random.rand(4, 10), dtype)
    b = np.asarray(np.random.rand(6, 20), dtype)
    c = np.zeros((4, 30), dtype)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)
    linalg.add_dot(a_gpu[:, 4:10], b_gpu[:, 2:8], c_gpu[:, 1:7], 'n', 't')
    res = c_gpu.get()
    assert np.allclose(np.dot(a[:, 4:10], b[:, 2:8].T), res[:, 1:7])

    # t/t
    a = np.asarray(np.random.rand(6, 10), dtype)
    b = np.asarray(np.random.rand(8, 20), dtype)
    c = np.zeros((2, 30), dtype)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)
    linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:9], 't', 't')
    res = c_gpu.get()
    assert np.allclose(np.dot(a[:, 4:6].T, b[:, 2:8].T), res[:, 1:9])
def dot_add_mm(self, a, b, out, transa=False, transb=False):
    transa = 'T' if transa else 'N'
    transb = 'T' if transb else 'N'
    culinalg.add_dot(a, b, out, transa, transb)
def run(self, iterations):
    for i in range(0, iterations):
        # F = XG(G.T G)^-1
        linalg.add_dot(self.G_gpu, self.G_gpu, self.GTG_gpu, transa="T", beta=0.)
        try:
            self.GTGinv_gpu.set(np.linalg.inv(self.GTG_gpu.get()))  # linalg.pinv only worked with CULA
        except LinAlgError:
            self.GTGinv_gpu.set(np.linalg.pinv(self.GTG_gpu.get()))
        linalg.add_dot(self.X_gpu, self.G_gpu, self.XG_gpu, beta=0.)
        linalg.add_dot(self.XG_gpu, self.GTGinv_gpu, self.F_gpu, beta=0.)

        # preparation and calculation of the matrix separations
        linalg.add_dot(self.X_gpu, self.F_gpu, self.XTF_gpu, transa="T", beta=0.)
        linalg.add_dot(self.F_gpu, self.F_gpu, self.FTF_gpu, transa="T", beta=0.)
        self.matrix_separationXTF(self.XTF_gpu, self.XTFpos_gpu, self.XTFneg_gpu,
                                  block=(self.block_G, 1, 1), grid=(self.grid_G, 1))
        self.matrix_separationFTF(self.FTF_gpu, self.FTFpos_gpu, self.FTFneg_gpu,
                                  block=(self.block_FTF, 1, 1), grid=(self.grid_FTF, 1))

        # compute the G update
        linalg.add_dot(self.G_gpu, self.FTFpos_gpu, self.GFTFpos_gpu, beta=0.)
        linalg.add_dot(self.G_gpu, self.FTFneg_gpu, self.GFTFneg_gpu, beta=0.)
        self.G_ew_update(self.G_gpu, self.XTFpos_gpu, self.GFTFneg_gpu,
                         self.XTFneg_gpu, self.GFTFpos_gpu,
                         block=(self.block_G, 1, 1), grid=(self.grid_G, 1))

        # test for convergence
        if (i % self.niter_test_conv == 0) and self.checkConvergence():
            print("NMF converged after %i iterations" % i)
            break
def run(self, iterations):
    for i in range(0, iterations):
        # update H
        # add_dot is faster than dot; dot calls add_dot
        linalg.add_dot(self.W_gpu, self.X_gpu, self.WTX_gpu, transa="T", beta=0.)
        linalg.add_dot(self.W_gpu, self.W_gpu, self.WTW_gpu, transa="T", beta=0.)
        linalg.add_dot(self.WTW_gpu, self.H_gpu, self.WTWH_gpu, beta=0.)
        self.update_H(self.H_gpu, self.WTX_gpu, self.WTWH_gpu,
                      block=(self.block_H, 1, 1), grid=(self.grid_H, 1))

        # update W
        linalg.add_dot(self.X_gpu, self.H_gpu, self.XHT_gpu, transb="T", beta=0.)
        linalg.add_dot(self.W_gpu, self.H_gpu, self.WH_gpu, beta=0.)
        linalg.add_dot(self.WH_gpu, self.H_gpu, self.WHHT_gpu, transb="T", beta=0.)
        self.update_W(self.W_gpu, self.XHT_gpu, self.WHHT_gpu,
                      block=(self.block_W, 1, 1), grid=(self.grid_W, 1))

        # test for convergence
        if (i % self.niter_test_conv == 0) and self.checkConvergence():
            print("NMF converged after %i iterations" % i)
            break
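# Both run() loops above call add_dot with beta=0., which turns the accumulating GEMM
# (C <- alpha*A*B + beta*C) into a plain overwrite of a preallocated output buffer, so
# no new GPU array is allocated per iteration. A small sketch of that pattern
# (hypothetical shapes, assumes a CUDA device plus pycuda/skcuda):
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg

linalg.init()
W = gpuarray.to_gpu(np.random.rand(50, 10).astype(np.float32))
X = gpuarray.to_gpu(np.random.rand(50, 30).astype(np.float32))
WTX = gpuarray.empty((10, 30), np.float32)            # reused across iterations
for _ in range(3):
    linalg.add_dot(W, X, WTX, transa="T", beta=0.)    # WTX <- W^T X, no accumulation
assert np.allclose(WTX.get(), W.get().T.dot(X.get()), atol=1e-4)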
def demosaick_gpu(img):
    img = gp.to_gpu(img)
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    wA = p1x.shape[0]
    wB = p2x.shape[0]
    hA = p1x.shape[1]
    hB = p2x.shape[1]

    # Path 1
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    cm.exp(p1y, out=p1y)
    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2
    shI = [wB - 1, hB - 1, _numsel]
    shM = [(wB - 1) * (hB - 1), _numsel]
    p2x = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2x = lg.dot(pTT, _wts.c200)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c201, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c210, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c211, p2x)

    msc.add_matvec(p2x, _wts.c2b, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(shI)

    # conv 3
    shI = [wB - 2, hB - 2, _numsel]
    shM = [(wB - 2) * (hB - 2), _numsel]
    p2y = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2y = lg.dot(pTT, _wts.c300)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c301, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c310, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c311, p2y)

    msc.add_matvec(p2y, _wts.c3b, out=p2y)
    gp.maximum(p2y, 0., p2y)

    p2x = lg.dot(p2y, _wts.sout)
    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(p1x.shape)

    # Combine
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])
    im = p2im(p1.get())

    return im
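# In the conv2/conv 3 stages above, a 2x2 convolution over the feature maps is written
# as four matrix products: one lg.dot for the first tap plus three lg.add_dot calls that
# accumulate the remaining shifted taps into the same output. A NumPy-only sketch of
# that decomposition (hypothetical sizes, no GPU required):
import numpy as np

H, W, Cin, Cout = 5, 6, 3, 4
x = np.random.rand(H, W, Cin)
w = np.random.rand(2, 2, Cin, Cout)              # one (Cin x Cout) weight matrix per tap
out = x[:-1, :-1].reshape(-1, Cin).dot(w[0, 0])  # first tap: plain dot
out += x[:-1, 1:].reshape(-1, Cin).dot(w[0, 1])  # remaining taps: accumulate (add_dot)
out += x[1:, :-1].reshape(-1, Cin).dot(w[1, 0])
out += x[1:, 1:].reshape(-1, Cin).dot(w[1, 1])

# compare against a direct "valid" 2x2 convolution
ref = np.zeros((H - 1, W - 1, Cout))
for i in range(2):
    for j in range(2):
        ref += x[i:i + H - 1, j:j + W - 1].dot(w[i, j])
assert np.allclose(out.reshape(H - 1, W - 1, Cout), ref)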
def getnextz(z_last, za, c_obs_long,
             Nx, Ny, nx, ny, nXobs,
             Se_inv, Sa_inv,
             Arule, Brule, Crule, Drule,
             KAxrule, KAyrule,
             KBxrule, KByrule,
             KCxrule, KCyrule,
             KDxrule, KDyrule):
    nx_last_long = cds.dot(Nx, z_last)
    ny_last_long = cds.dot(Ny, z_last)
    nx_last = matrix(reshape(nx_last_long, (ny-1, nx-1)))
    ny_last = matrix(reshape(ny_last_long, (ny-1, nx-1)))

    vbigK, bigc_last = getbigK(nx, ny, nXobs, nx_last, ny_last,
                               Arule, Brule, Crule, Drule,
                               KAxrule, KAyrule, KBxrule, KByrule,
                               KCxrule, KCyrule, KDxrule, KDyrule)

    NxNy = vstack((Nx, Ny))
    KN = cds.dot(vbigK, NxNy)
    nobs = nXobs * 4  # int*int
    bigc_last_long = matrix(reshape(cds.T(bigc_last), (nobs, 1)))
    delta_c = cds.substract(c_obs_long, bigc_last_long)

    # This is the Gauss method
    # dz = linalg.inv(KN.T*KN)*KN.T*delta_c
    # z_next = z_last + dz
    # di2 = squeeze(dot(dz.T,dz))

    # This is the optimal estimation method
    # (the following two lines could be merged)

    # term3
    zz_gpu = gpuarray.to_gpu(z_last - za)
    delta_c_gpu = gpuarray.to_gpu(delta_c)
    KN_gpu = gpuarray.to_gpu(KN)  # (important for term2)
    temp0_gpu = linalg.add_dot(KN_gpu, zz_gpu, delta_c_gpu)
    zz_gpu.gpudata.free()
    del(zz_gpu)
    Se_inv_gpu = gpuarray.to_gpu(Se_inv)  # (important for term2)
    temp1_gpu = linalg.dot(Se_inv_gpu, temp0_gpu)
    # temp0_gpu.gpudata.free()
    # del(temp0_gpu)
    term3_gpu = linalg.dot(KN_gpu, temp1_gpu, transa="T")
    term3 = term3_gpu.get()
    temp1_gpu.gpudata.free()
    del(temp1_gpu)
    term3_gpu.gpudata.free()
    del(term3_gpu)

    # term2
    temp2_gpu = linalg.dot(Se_inv_gpu, KN_gpu)
    Se_inv_gpu.gpudata.free()
    del(Se_inv_gpu)
    Sa_inv_gpu = gpuarray.to_gpu(Sa_inv)
    term2_gpu = linalg.add_dot(KN_gpu, temp2_gpu, Sa_inv_gpu, transa="T")
    temp2_gpu.gpudata.free()
    del(temp2_gpu)
    KN_gpu.gpudata.free()
    del(KN_gpu)
    term2 = term2_gpu.get()
    # term0 = cds.dot3(Se_inv, KN)
    # term2 = Sa_inv+term0
    # term3 = cds.dot2(KN, cds.dot(Se_inv, (delta_c+cds.dot(KN,(z_last-za)))))

    z_next = za + np.linalg.solve(term2, term3)  # same as term2\term3

    dz_gpu = gpuarray.to_gpu(z_next - z_last)
    temp3_gpu = linalg.dot(term2_gpu, dz_gpu)
    term2_gpu.gpudata.free()
    del(term2_gpu)
    di2_gpu = linalg.dot(dz_gpu, temp3_gpu, transa='T')
    # dz = z_next - z_last
    # di2 = cds.dot3(term2, dz)

    # Get out
    return z_next, di2_gpu.get()
def elmvis(Xraw, A, slowdown=10, report=5, maxtime=24*60*60,
           tol=0, batch=None, maxiter=None, maxupdate=None, maxstall=None,
           cossim=None, silent=False):
    """ELMVIS+ function running in GPU memory.
    """
    X = Xraw / np.linalg.norm(Xraw, axis=1)[:, None]  # unit-length version of X
    Xh = np.dot(A, X)  # X_hat, predicted value of X
    N, d = X.shape
    I = np.arange(N)  # index of samples

    # set default values
    if cossim is None:
        cossim = np.trace(X.T.dot(A).dot(X)) / N
    if maxiter is None:
        maxiter = N*N*N
    if maxupdate is None:
        maxupdate = N*N
    if maxstall is None:
        maxstall = N*N
    if not silent:
        print("original similarity: ", cossim)

    # init GPU
    dt = X.dtype.type
    try:
        linalg.init()
    except ImportError as e:
        print(e)
    devA = gpuarray.to_gpu(A.astype(dt))
    devX = gpuarray.to_gpu(X.astype(dt))
    devXi1 = gpuarray.empty((d,), dtype=dt)
    devXh = linalg.dot(devA, devX)
    devAi = gpuarray.empty((N, 2), dtype=dt)
    devDelta = gpuarray.empty((2, d), dtype=dt)
    result = gpuarray.empty((d,), dtype=dt)

    # swap kernel
    kernel = """
        __global__ void diff(%s *A, %s *Y, %s *AY, %s *result, long d, long N, long i1, long i2) {
            long j = blockDim.x * blockIdx.x + threadIdx.x;
            %s yi1 = Y[i1*d + j];
            %s yi2 = Y[i2*d + j];
            result[j] = (A[i1*N + i1] * (yi2 - yi1) + 2*AY[i1*d + j]) * (yi2 - yi1) +
                        (A[i2*N + i2] * (yi1 - yi2) + 2*(AY[i2*d + j] + A[i2*N + i1]*(yi2 - yi1))) * (yi1 - yi2);
        }
        """
    if dt is np.float64:
        kernel = kernel % ("double", "double", "double", "double", "double", "double")
    else:
        kernel = kernel % ("float", "float", "float", "float", "float", "float")
    mod_diff = SourceModule(kernel)
    dev_diff = mod_diff.get_function("diff")
    dev_diff.prepare("PPPPllll")
    block = result._block
    grid = (int(np.ceil(1.0 * result.shape[0] / block[0])), 1)

    t0 = tlast = time()
    stall = 0
    iters = 0
    updates = 0
    updates_last = 0
    iters_last = 0
    ups_max = 0

    while (iters < maxiter) and (stall < maxstall):
        iters += 1
        stall += 1

        # get two different random numbers
        i1, i2 = np.random.randint(0, N, size=2)
        while i1 == i2:
            i1, i2 = np.random.randint(0, N, size=2)

        dev_diff.prepared_call(grid, block, devA.gpudata, devX.gpudata, devXh.gpudata,
                               result.gpudata, d, N, i1, i2)
        diff = np.sum(result.get())

        if diff > tol:
            stall = 0
            devAi[:, 0] = devA[:, i1]
            devAi[:, 1] = devA[:, i2]
            devDelta[0, :] = devX[i1, :] - devX[i2, :]
            devDelta[1, :] = devX[i2, :] - devX[i1, :]
            linalg.add_dot(devAi, devDelta, devXh, alpha=-1)

            tI = I[i1]
            I[i1] = I[i2]
            I[i2] = tI

            devXi1[:] = devX[i1, :]
            devX[i1] = devX[i2]
            devX[i2] = devXi1

            cossim += diff / N
            updates += 1
            if updates > maxupdate:
                break

        t = time()
        if t - tlast > report:
            ups = (updates - updates_last) * 1.0 / (t - tlast)
            ips = (iters - iters_last) * 1.0 / (t - tlast)
            if not silent:
                print("%d iters | %d updates | %.0f iters/s | %.0f updates/s | cos similarity = %.4f" %
                      (iters, updates, ips, ups, cossim))
            updates_last = updates
            iters_last = iters
            tlast = t

            ups_max = max(ups, ups_max)
            if ups < ups_max / slowdown:
                break

        if t - t0 > maxtime:
            break

    ips = iters * 1.0 / (time() - t0)
    ups = updates * 1.0 / (time() - t0)

    Xraw[:] = Xraw[I]
    cossim = np.trace(X.T.dot(A).dot(X)) / N
    if not silent:
        print("final similarity: ", cossim)

    info = {'cossim': cossim, 'iters': iters, 'updates': updates, 'ips': ips, 'ups': ups}
    return I, info
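# The accepted-swap branch above keeps Xh = A.dot(X) consistent after swapping two rows
# of X with a single rank-2 update, Xh <- Xh - Ai.dot(Delta), done on the GPU via
# add_dot(devAi, devDelta, devXh, alpha=-1) instead of recomputing the full product.
# A NumPy-only check of that identity (hypothetical sizes, no GPU required):
import numpy as np

N, d = 6, 3
A = np.random.rand(N, N)
X = np.random.rand(N, d)
Xh = A.dot(X)
i1, i2 = 1, 4
Ai = A[:, [i1, i2]]                                  # columns of A touched by the swap
Delta = np.vstack([X[i1] - X[i2], X[i2] - X[i1]])    # row changes caused by the swap
Xswap = X.copy()
Xswap[[i1, i2]] = X[[i2, i1]]
assert np.allclose(A.dot(Xswap), Xh - Ai.dot(Delta))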