def sum_t(self, a, axis, out):
    if len(a.shape) < 3 and (axis == 0 or axis == 1):
        cumisc.sum(a, axis=axis, out=out)
    elif axis is None:
        cumisc.sum(a.reshape((a.size, 1)), axis=0, out=out)
    else:
        raise NotImplementedError

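# A minimal CPU sketch (NumPy only, hypothetical shapes) of the reshape trick
# used above: skcuda's misc.sum reduces along one axis of a 2-D array, so a
# full reduction can be emulated by flattening into an (N, 1) column and
# summing over axis 0.
import numpy as np

a = np.arange(12, dtype=np.float32).reshape(3, 4)
full = a.reshape((a.size, 1)).sum(axis=0)   # shape (1,), equals a.sum()
assert np.allclose(full, a.sum())
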
def _correlate_fft(self, frames_flat, cufft_plan):
    npix = frames_flat.shape[1]

    d_in = cufft_plan.data_in
    d_in.fill(0)
    f_out1 = cufft_plan.data_out
    f_out2 = garray.zeros_like(cufft_plan.data_out)

    # fft(pad(frames_flat), axis=1)
    d_in[:, :self.nframes] = frames_flat.T.astype("f")
    f_out1 = cufft_plan.fft(d_in, output=f_out1)

    # frames_flat.sum(axis=1)
    # skmisc.sum() only works on base data, not gpuarray views,
    # so we sum on the whole array and then extract the right subset.
    skmisc.sum(d_in, axis=0, out=self.d_sums_denom_tmp)

    # fft(pad(frames_flat[::-1]), axis=1)
    d_in.fill(0)
    d_in[:, :self.nframes] = frames_flat.T[:, ::-1].astype("f")
    f_out2 = cufft_plan.fft(d_in, output=f_out2)

    # product, ifft
    f_out1 *= f_out2
    num = cufft_plan.ifft(f_out1, output=d_in)

    # numerator of g_2
    skmisc.sum(num, axis=0, out=self.d_sums)

    # denominator of g_2: correlate(d_sums_denom)
    self._correlate_denom(npix)

    self.d_numerator /= self.d_denom
    res = self.d_numerator.get()
    return res

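# A small NumPy sketch (hypothetical 1-D data) of the identity the method
# above relies on: correlating a signal with itself equals the inverse FFT
# of FFT(padded x) * FFT(padded, reversed x). Zero-padding to >= 2n - 1
# samples avoids circular wrap-around.
import numpy as np

x = np.random.rand(8).astype(np.float32)
n = x.size
m = 2 * n  # padded length, >= 2n - 1
fx = np.fft.fft(x, m)
fxr = np.fft.fft(x[::-1], m)
corr_fft = np.fft.ifft(fx * fxr).real[:2 * n - 1]
corr_ref = np.correlate(x, x, mode="full")
assert np.allclose(corr_fft, corr_ref, atol=1e-4)
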
def _rbf_kernel_vectorized_cublas(data1, data2, sigma=10):  # pragma: no cover
    """Kernel for edge similarity computed with the vectorized method.

    Args:
        data1 (TYPE): pssm data 1
        data2 (TYPE): pssm data 2
        sigma (int, optional): width of the exponential

    Returns:
        np.array: value of the rbf kernel for all the pairs
    """
    beta = 2 * sigma**2
    d1_ = gpuarray.to_gpu(data1.astype(np.float32))
    d2_ = gpuarray.to_gpu(data2.astype(np.float32))
    mgpu = -2 * culinalg.dot(d1_, d2_, transa='N', transb='T')
    vgpu = cumisc.sum(d1_**2, axis=1)[:, None]
    cumisc.add_matvec(mgpu, vgpu, out=mgpu)
    vgpu = cumisc.sum(d2_**2, axis=1)
    cumisc.add_matvec(mgpu, vgpu, out=mgpu)
    mcpu = mgpu.get()
    return np.exp(-mcpu / beta).reshape(-1)

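# A NumPy sketch (hypothetical small inputs) of the expansion used above:
# ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y, so the full pairwise squared
# distance matrix is built from one matmul plus two row-norm broadcasts.
import numpy as np

d1 = np.random.rand(4, 6).astype(np.float32)
d2 = np.random.rand(5, 6).astype(np.float32)
m = -2 * d1 @ d2.T
m += (d1**2).sum(axis=1)[:, None]   # analogue of the first add_matvec
m += (d2**2).sum(axis=1)[None, :]   # analogue of the second add_matvec
ref = ((d1[:, None, :] - d2[None, :, :])**2).sum(-1)
assert np.allclose(m, ref, atol=1e-4)
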
def sum_t(self, a, axis, out):
    if len(a.shape) < 3 and (axis == 0 or axis == 1):
        cumisc.sum(a, axis, out)
    elif axis is None:
        self.copy_to(cumisc.sum(a), out)
    else:
        raise NotImplementedError

def meanUnderMask(volume, mask=None, p=1, gpu=False):
    """
    meanUnderMask: Determines the mean value under a mask
    @param volume: The volume
    @type volume: L{pytom_volume.vol}
    @param mask: The mask
    @type mask: L{pytom_volume.vol}
    @param p: precomputed number of voxels in mask
    @type p: float
    @return: A value (scalar)
    @rtype: single
    @change: support None as mask, FF 08.07.2014
    """
    return sum(volume * mask) / sum(mask)

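# A NumPy sketch (hypothetical arrays) of the masked mean computed above:
# the sum of masked voxels divided by the mask weight. With a binary mask
# this is exactly the mean of the voxels inside the mask.
import numpy as np

volume = np.random.rand(4, 4, 4)
mask = (np.random.rand(4, 4, 4) > 0.5).astype(volume.dtype)
mean_under_mask = (volume * mask).sum() / mask.sum()
assert np.isclose(mean_under_mask, volume[mask > 0].mean())
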
def __init__(self, volume, template, mask, wedge, stdV, gpu=True):
    self.volume = gu.to_gpu(volume)
    self.template = Volume(template)
    self.templatePadded = gu.zeros_like(self.volume, dtype=np.float32)
    self.mask = Volume(mask)
    self.maskPadded = gu.zeros_like(self.volume, dtype=np.float32)
    self.sOrg = mask.shape
    self.sPad = volume.shape
    print(self.sPad, self.sOrg)
    rotate(self.mask, [0, 0, 0], self.maskPadded, self.sPad, self.sOrg)
    #paste_in_center_gpu(self.template.d_data, self.templatePadded, np.int32(self.sPad), np.int32(self.maskSize), block=(10, 10, 10), grid=(8,1,1))
    #rotate(self.template, [0, 0, 0], self.templatePadded, self.sPad, self.maskSize)
    print(volume.shape, stdV.shape, wedge.shape)
    self.wedge = gu.to_gpu(wedge)
    self.stdV = gu.to_gpu(stdV)
    self.fwd_plan = Plan(volume.shape, volume.dtype, np.complex64)
    self.inv_plan = Plan(volume.shape, np.complex64, volume.dtype)
    self.volume_fft = gu.zeros_like(self.volume, dtype=np.complex64)
    self.template_fft = gu.zeros_like(self.volume, dtype=np.complex64)
    self.ccc_map = gu.zeros_like(self.volume, dtype=np.float32)
    self.norm_volume = np.prod(volume.shape)
    self.scores = gu.ones_like(self.volume, dtype=np.float32) * -1000
    self.angles = gu.ones_like(self.volume, dtype=np.float32) * -1000
    self.p = sum(self.mask.d_data)

def fast_matmul(x, y, x_type, y_type):
    '''use pycuda to compute c = a * b'''
    linalg.init()
    a_gpu = gpuarray.to_gpu(x.astype(x_type))
    a_t_gpu = gpuarray.to_gpu(x.T.copy().astype(x_type))
    b_gpu = gpuarray.to_gpu(y.astype(y_type))
    # row_sum = gpuarray.zeros(shape = x[0].shape, dtype = x_type)
    row_sum = 0
    # a = np.asarray(x, x_type)
    # b = np.asarray(y, y_type)
    # a_gpu = gpuarray.to_gpu(a)
    # b_gpu = gpuarray.to_gpu(b)
    t1_inside = time.time()
    c_gpu = linalg.dot(a_gpu, b_gpu)
    for a_i in a_gpu:
        # row_sum = misc.add(row_sum, a_i)
        row_sum += a_i
    gg = linalg.dot(a_gpu, b_gpu)
    gg = linalg.dot(a_i, a_i)
    gg = reduce(linalg.dot, (a_gpu, b_gpu, b_gpu, b_gpu))
    # tmp1, tmp2 = linalg.dot(a_gpu, b_gpu), linalg.dot(b_gpu, b_gpu)
    z_gpu = a_gpu.copy()
    tmp = a_t_gpu
    # print('x.T\n', x.T)
    # print('tmp\n', tmp)
    # print('x = a_gpu: ', np.allclose(x, a_gpu.get()))
    # print('x.T = tmp: ', np.allclose(x.T, tmp.get()))
    a_prod = linalg.dot(a_gpu, tmp)
    t2_inside = time.time()
    print('inside cost {:.4f}s'.format(t2_inside - t1_inside))

    a = np.random.randint(-5, 5, (3, 4)).astype(np.float32)
    a_gpu = gpuarray.to_gpu(a)
    norm_gpu = linalg.norm(a_gpu)
    print('is norm right?', np.linalg.norm(a) == norm_gpu)

    a_gpu = abs(a_gpu)
    column_sum = misc.sum(a_gpu, axis=0)
    column_sum = column_sum.reshape((1, -1))
    all_one_gpu = gpuarray.to_gpu(np.ones((3, 1), np.float32))
    div_mat_gpu = linalg.dot(all_one_gpu, column_sum)
    norm_1 = a_gpu / (div_mat_gpu + 1e-3)
    print(a_gpu)
    print(column_sum)
    print(column_sum.shape)
    print(norm_1)
    # abs_a = a_gpu.__abs__()
    # print(a)
    # print(abs_a)
    # c = abs_a + a_gpu
    # print(repr(c))
    # print(type(c))
    # c = 1/2 * c
    # print(a_gpu, c)
    return c_gpu.get(), a_prod.get(), row_sum.get()

def log_loss(y_true, y_prob):
    """Compute Logistic loss for classification.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.
    y_prob : array-like of float, shape = (n_samples, n_classes)
        Predicted probabilities, as returned by a classifier's
        predict_proba method.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    if y_prob.dtype == np.float64:
        cuClip(y_prob.gpudata, np.float64(1e-10), np.float64(1 - 1e-10),
               np.int32(y_prob.size), block=(blockSize, 1, 1),
               grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    else:
        cuClipf(y_prob.gpudata, np.float32(1e-10), np.float32(1 - 1e-10),
                np.int32(y_prob.size), block=(blockSize, 1, 1),
                grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))

    if y_prob.shape[1] == 1:
        y_prob = gpuarray.to_gpu(
            np.append(1 - y_prob.get(), y_prob.get(), axis=1))

    if y_true.shape[1] == 1:
        y_true = gpuarray.to_gpu(
            np.append(1 - y_true.get(), y_true.get(), axis=1))

    tmp_gpu = gpuarray.GPUArray(y_prob.shape, y_prob.dtype)
    if y_prob.dtype == np.float64:
        cuLogLoss(y_true.gpudata, y_prob.gpudata, tmp_gpu.gpudata,
                  np.int32(y_prob.size), block=(blockSize, 1, 1),
                  grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    else:
        cuLogLossf(y_true.gpudata, y_prob.gpudata, tmp_gpu.gpudata,
                   np.int32(y_prob.size), block=(blockSize, 1, 1),
                   grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))

    #total = float(misc.sum(y_true * tmp_gpu).get())
    total = float(cumisc.sum(tmp_gpu).get())
    return (-total) / y_prob.shape[0]

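# A NumPy reference (hypothetical data) for the reduction performed above:
# clip probabilities away from {0, 1}, accumulate y * log(p) elementwise,
# then average the negated sum over the samples.
import numpy as np

y_true = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64)
y_prob = np.array([[0.2, 0.8], [0.7, 0.3], [0.4, 0.6]], dtype=np.float64)
p = np.clip(y_prob, 1e-10, 1 - 1e-10)
loss = -(y_true * np.log(p)).sum() / p.shape[0]
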
def softmax_gpu2d(x: gpuarray.GPUArray, dim):
    assert len(x.shape) == 2, 'expected 2-dimensional array'
    assert 0 <= dim <= 1, 'expected 0 <= dim <= 1'
    exp_ker = exp_float_ker if x.dtype == np.float32 else exp_double_ker
    x_exp = gpuarray.empty_like(x)
    exp_ker(x, x_exp)
    x_exp_sum = misc.sum(x_gpu=x_exp, axis=dim)
    x_exp = misc.div_matvec(x_gpu=x_exp, a_gpu=x_exp_sum, axis=1 - dim)
    return x_exp

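# A NumPy sketch (hypothetical input) of the two-step softmax above:
# exponentiate, then divide each slice along `dim` by its sum of exponentials.
import numpy as np

x = np.random.rand(3, 5).astype(np.float32)
dim = 1
x_exp = np.exp(x)
softmax = x_exp / x_exp.sum(axis=dim, keepdims=True)
assert np.allclose(softmax.sum(axis=dim), 1.0, atol=1e-6)
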
def binary_log_loss(y_true, y_prob):
    """Compute binary logistic loss for classification.

    This is identical to log_loss in binary classification case,
    but is kept for its use in multilabel case.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.
    y_prob : array-like of float, shape = (n_samples, n_classes)
        Predicted probabilities, as returned by a classifier's
        predict_proba method.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    if y_prob.dtype == np.float64:
        cuClip(y_prob.gpudata, np.float64(1e-10), np.float64(1 - 1e-10),
               np.int32(y_prob.size), block=(blockSize, 1, 1),
               grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    else:
        cuClipf(y_prob.gpudata, np.float32(1e-10), np.float32(1 - 1e-10),
                np.int32(y_prob.size), block=(blockSize, 1, 1),
                grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))

    tmp_gpu = gpuarray.GPUArray(y_prob.shape, y_prob.dtype)
    if y_prob.dtype == np.float64:
        cuBinaryLogLoss(y_true.gpudata, y_prob.gpudata, tmp_gpu.gpudata,
                        np.int32(y_prob.size), block=(blockSize, 1, 1),
                        grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    else:
        cuBinaryLogLossf(y_true.gpudata, y_prob.gpudata, tmp_gpu.gpudata,
                         np.int32(y_prob.size), block=(blockSize, 1, 1),
                         grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))

    total = float(cumisc.sum(tmp_gpu).get())
    return (-total) / y_prob.shape[0]

def _impl_test_sum(self, dtype):
    x = np.random.normal(scale=5.0, size=(3, 5))
    x = x.astype(dtype=dtype, order='C')
    x_gpu = gpuarray.to_gpu(x)
    assert_allclose(misc.sum(x_gpu).get(), x.sum(),
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0),
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1),
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])

    x = x.astype(dtype=dtype, order='F')
    x_gpu = gpuarray.to_gpu(x)
    assert_allclose(misc.sum(x_gpu).get(), x.sum(),
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0),
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])
    assert_allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1),
                    rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])

def marginilize_rots_scales(self, posteriors, phases, shift_x, shift_y):
    shift_ind = self.ravel_shift_index(shift_x, shift_y)
    W = np.zeros((self.n_images, self.converter.get_num_prolates()), np.complex64)
    if config.is_use_gpu:
        W_gpu = gpuarray.zeros(W.shape, dtype='complex64')
        for i in np.arange(self.n_images):
            Wi = misc.sum(linalg.dot(posteriors[i, shift_ind], phases),
                          axis=0).reshape((1, -1))
            slice_assign_kernel.slice_assign_1d(W_gpu, Wi, i)
        W = W_gpu.get()
    else:
        for i in np.arange(self.n_images):
            W[i] = np.sum(np.dot(posteriors[i, shift_ind], phases), axis=0)
    return W

def _cuda_norm(self, X):
    """Calculate the L2 norm on the GPU.

    Parameters
    ----------
    X: array
        Array to normalize

    Returns
    -------
    normX: array
        Normalized array
    """
    return misc.divide(X, misc.sum(X**2, axis=1, keepdims=True)**0.5)

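# A NumPy sketch (hypothetical input) of the row-wise L2 normalization above:
# divide each row by the square root of its sum of squares.
import numpy as np

X = np.random.rand(4, 3)
Xn = X / np.sqrt((X**2).sum(axis=1, keepdims=True))
assert np.allclose(np.linalg.norm(Xn, axis=1), 1.0)
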
def impl_test_sum(self, dtype):
    x = np.random.normal(scale=5.0, size=(3, 5))
    x = x.astype(dtype=dtype, order='C')
    x_gpu = gpuarray.to_gpu(x)
    assert np.allclose(misc.sum(x_gpu).get(), x.sum())
    assert np.allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0))
    assert np.allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1))

    x = x.astype(dtype=dtype, order='F')
    x_gpu = gpuarray.to_gpu(x)
    assert np.allclose(misc.sum(x_gpu).get(), x.sum())
    assert np.allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0))
    assert np.allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1))

def get_distances_to_centers(self, data):
    # make sure the array is c order
    data = np.asarray(data, dtype=np.float32, order='C')

    # ship to gpu
    data_gpu = gpuarray.to_gpu(data)

    # alloc space on gpu for distances
    dists_shape = (data.shape[0], self.centers.shape[0])
    dists_gpu = gpuarray.zeros(dists_shape, np.float32)

    # calc data norms on gpu
    data_norms = cumisc.sum(data_gpu**2, axis=1)

    # calc distance on gpu
    cumisc.add_matvec(dists_gpu, self.center_norms, 1, dists_gpu)
    cumisc.add_matvec(dists_gpu, data_norms, 0, dists_gpu)
    culinalg.add_dot(data_gpu, self.centers_gpu, dists_gpu,
                     transb='T', alpha=-2.0)
    return dists_gpu

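# A NumPy sketch (hypothetical data) of how the squared distances produced
# above are typically consumed, e.g. in a k-means-style assignment: pick the
# nearest center per sample via argmin over the distance matrix.
import numpy as np

data = np.random.rand(6, 3).astype(np.float32)
centers = np.random.rand(2, 3).astype(np.float32)
dists = ((data[:, None, :] - centers[None, :, :])**2).sum(-1)
labels = dists.argmin(axis=1)   # index of the closest center per row
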
def run_gpu(self):
    """
    Solves the MFTIE on GPU.
    The result is stored in the attribute "self.phase" containing a GPU array.
    """
    # Extract pre-allocated GPU arrays
    # extract inputs
    iIo = self.iIo
    nm2 = self.inverse_laplacian
    dzI = self.dzI
    ky, kx = self.k
    Nz, Ny, Nx = self.shape

    # create outputs
    ft_dzI = gpuarray.empty((Nz, Ny, Nx), np.complex64)
    gradx = gpuarray.empty((Nz, Ny, Nx), np.complex64)
    grady = gpuarray.empty((Nz, Ny, Nx), np.complex64)

    # extract plans
    ft3dcc = self.pft3dcc
    ft2dcc = self.pft2dcc

    # Do the math!
    # FT(dzI)
    cu_fft.fft(dzI, ft_dzI, ft3dcc)
    # IFT(k*nm2*...)
    cu_fft.ifft((ft_dzI * nm2) * kx, gradx, ft3dcc, True)
    cu_fft.ifft((ft_dzI * nm2) * ky, grady, ft3dcc, True)
    # FT(... / Io)
    cu_fft.fft(gradx * iIo, gradx, ft3dcc)
    cu_fft.fft(grady * iIo, grady, ft3dcc)
    # Sum_z(nm2*(k*...))
    Slapl = misc.sum(
        (nm2 * (kx * gradx + ky * grady)).reshape(Nz, Ny * Nx),
        0).reshape(Ny, Nx)
    # IFT(...)
    cu_fft.ifft(Slapl, self.phdata, ft2dcc, True)

def thunk():
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
    Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
    Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
    Xtn = misc.sum(Xt, axis=1, keepdims=True)
    Xfn = misc.sum(Xf, axis=1, keepdims=True)
    Xt = misc.divide(Xt, Xtn)
    Xf = misc.divide(Xf, Xfn)
    w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
    wp = cumath.log(w)
    wpn = misc.sum(wp, axis=1, keepdims=True) / self.n
    wp = misc.subtract(wp, wpn)
    t1 = misc.sum(x * wp, axis=1)
    t2 = (self.n + depth) * cumath.log(misc.sum(w, axis=1))
    t3 = depth * wpn
    outputs[0][0] = misc.sum(t1 - t2 + t3).get()
    for v in node.outputs:
        compute_map[v][0] = True

def thunk():
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
    Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
    Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
    Xtn = misc.sum(Xt, axis=1, keepdims=True)
    Xfn = misc.sum(Xf, axis=1, keepdims=True)
    Xt = misc.divide(Xt, Xtn)
    Xf = misc.divide(Xf, Xfn)
    w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
    dq = Xt - Xf
    qdw = dq / w
    t1 = misc.sum(x * qdw, axis=1)
    f = 2 * depth + self.base.n
    t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
    t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
    dalpha = t1 - t2 + t3
    del dq, t1, f, t2, t3
    iw = 1 / w
    S1 = misc.multiply(depth[:, None] * (self.base.n - 1) / self.base.n, iw)
    S2 = (self.base.n + depth[:, None]) / cumath.log(
        misc.sum(w, axis=1, keepdims=True))
    F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
    del w, iw, S1, S2
    cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]),
                          dtype=theano.config.floatX)
    dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
    dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)
    for i in range(Xt.shape[0]):
        S1 = misc.multiply(Xt[None, i, :], A)
        S2 = misc.sum(S1, axis=1, keepdims=True)
        S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
        dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2), axis=1)
        S1 = misc.multiply(Xf[None, i, :], A)
        S2 = misc.sum(S1, axis=1, keepdims=True)
        S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
        dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2), axis=1)
    outputs[0][0] = dalpha.get()
    outputs[1][0] = dLq_t.get()
    outputs[2][0] = dLq_f.get()
    for v in node.outputs:
        compute_map[v][0] = True

def almLasso_mat_fun(self):
    '''
    This function implements the Augmented Lagrangian Multipliers method
    for the Lasso problem. The Lagrangian form of the Lasso can be
    expressed as:

        MIN { 1/2 ||Y - X BHETA||_2^2 + lambda ||THETA||_1 }  s.t.  BHETA - THETA = 0

    When applied to this problem, the ADMM updates take the form:

        BHETA^(t+1) = (XtX + rho I)^-1 (Xty + rho THETA^t - mu^t)
        THETA^(t+1) = Shrinkage_{lambda/rho}(BHETA^(t+1) + mu^t / rho)
        mu^(t+1)    = mu^t + rho (BHETA^(t+1) - THETA^(t+1))

    The algorithm involves a 'ridge regression' update for BHETA, a
    soft-thresholding (shrinkage) step for THETA and then a simple linear
    update for mu.

    NB: this ADMM version contains several variations, such as the use of
    two penalty parameters (mu1, mu2) instead of just one.
    '''
    print('\tADMM processing...')

    alpha1 = alpha2 = 0
    if len(self.reg_params) == 1:
        alpha1 = self.reg_params[0]
        alpha2 = self.reg_params[0]
    elif len(self.reg_params) == 2:
        alpha1 = self.reg_params[0]
        alpha2 = self.reg_params[1]

    # threshold parameters for stopping criteria
    if len(self.thr) == 1:
        thr1 = self.thr[0]
        thr2 = self.thr[0]
    elif len(self.thr) == 2:
        thr1 = self.thr[0]
        thr2 = self.thr[1]

    # entry condition
    err1 = 10 * thr1
    err2 = 10 * thr2

    start_time = time.time()

    # setting penalty parameters for the ALM
    mu1p = alpha1 * 1 / self.computeLambda()
    print("\t\t-Compute Lambda- Time = %s seconds" % (time.time() - start_time))
    mu2p = alpha2 * 1

    mu1 = mu1p
    mu2 = mu2p

    i = 1
    start_time = time.time()
    if self.GPU == True:
        # defining penalty parameters and constraint to minimize,
        # lambda and C matrix respectively
        THETA = misc.zeros((self.num_columns, self.num_columns), dtype='float64')
        lambda2 = misc.zeros((self.num_columns, self.num_columns), dtype='float64')

        gpu_data = gpuarray.to_gpu(self.data)
        P_GPU = linalg.dot(gpu_data, gpu_data, transa='T')

        OP1 = P_GPU
        linalg.scale(np.float32(mu1), OP1)

        OP2 = linalg.eye(self.num_columns)
        linalg.scale(mu2, OP2)

        if self.affine == True:
            print('\t\tGPU affine...')
            OP3 = misc.ones((self.num_columns, self.num_columns), dtype='float64')
            linalg.scale(mu2, OP3)
            lambda3 = misc.zeros((1, self.num_columns), dtype='float64')

            # TODO: because of some problem with the linalg.inv version of
            # scikit-cuda, we work around it using numpy's np.linalg.inv
            A = np.linalg.inv(misc.add(misc.add(OP1.get(), OP2.get()), OP3.get()))
            A_GPU = gpuarray.to_gpu(A)

            while (err1 > thr1 or err2 > thr1) and i < self.max_iter:
                _lambda2 = gpuarray.to_gpu(lambda2)
                _lambda3 = gpuarray.to_gpu(lambda3)

                linalg.scale(1 / mu2, _lambda2)
                term_OP2 = gpuarray.to_gpu(_lambda2.get())

                OP2 = gpuarray.to_gpu(misc.subtract(THETA, term_OP2))
                linalg.scale(mu2, OP2)

                OP4 = gpuarray.to_gpu(
                    np.matlib.repmat(_lambda3.get(), self.num_columns, 1))

                # updating Z
                BHETA = linalg.dot(
                    A_GPU, misc.add(misc.add(misc.add(OP1, OP2), OP3), OP4))

                # deallocating unnecessary GPU variables
                OP2.gpudata.free()
                OP4.gpudata.free()
                _lambda2.gpudata.free()
                _lambda3.gpudata.free()

                # updating C
                THETA = misc.add(BHETA, term_OP2)
                THETA = self.shrinkL1Lq(THETA.get(), 1 / mu2)
                THETA = THETA.astype('float64')

                # updating Lagrange multipliers
                term_lambda2 = misc.subtract(BHETA, gpuarray.to_gpu(THETA))
                linalg.scale(mu2, term_lambda2)
                term_lambda2 = gpuarray.to_gpu(term_lambda2.get())
                lambda2 = misc.add(lambda2, term_lambda2)  # on GPU

                term_lambda3 = misc.subtract(
                    misc.ones((1, self.num_columns), dtype='float64'),
                    misc.sum(BHETA, axis=0))
                linalg.scale(mu2, term_lambda3)
                term_lambda3 = gpuarray.to_gpu(term_lambda3.get())
                lambda3 = misc.add(lambda3, term_lambda3)  # on GPU

                # deallocating unnecessary GPU variables
                term_OP2.gpudata.free()
                term_lambda2.gpudata.free()
                term_lambda3.gpudata.free()

                err1 = self.errorCoef(BHETA.get(), THETA)
                err2 = self.errorCoef(np.sum(BHETA.get(), axis=0),
                                      np.ones([1, self.num_columns]))

                # deallocating unnecessary GPU variables
                BHETA.gpudata.free()

                THETA = gpuarray.to_gpu(THETA)

                # reporting errors
                if self.verbose and (i % self.step == 0):
                    print('\t\tIteration = %d, ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e'
                          % (i, err1, err2))
                i += 1

            THETA = THETA.get()
            Err = [err1, err2]
            if self.verbose:
                print('\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e. \n'
                      % (i, err1, err2))
        else:
            print('\t\tGPU not affine')

            # TODO: because of some problem with the linalg.inv version of
            # scikit-cuda, we work around it using numpy's np.linalg.inv
            A = np.linalg.inv(misc.add(OP1.get(), OP2.get()))
            A_GPU = gpuarray.to_gpu(A)

            while err1 > thr1 and i < self.max_iter:
                _lambda2 = gpuarray.to_gpu(lambda2)

                term_OP2 = THETA
                linalg.scale(mu2, term_OP2)
                term_OP2 = misc.subtract(term_OP2, _lambda2)
                OP2 = gpuarray.to_gpu(term_OP2.get())

                BHETA = linalg.dot(A_GPU, misc.add(OP1, OP2))

                linalg.scale(1 / mu2, _lambda2)
                term_THETA = gpuarray.to_gpu(_lambda2.get())

                THETA = misc.add(BHETA, term_THETA)
                THETA = self.shrinkL1Lq(THETA.get(), 1 / mu2)
                THETA = THETA.astype('float32')

                # updating Lagrange multipliers
                term_lambda2 = misc.subtract(BHETA, gpuarray.to_gpu(THETA))
                linalg.scale(mu2, term_lambda2)
                term_lambda2 = gpuarray.to_gpu(term_lambda2.get())
                lambda2 = misc.add(lambda2, term_lambda2)  # on GPU

                err1 = self.errorCoef(BHETA.get(), THETA)

                THETA = gpuarray.to_gpu(THETA)

                # reporting errors
                if self.verbose and (i % self.step == 0):
                    print('\t\tIteration %5.0f, ||Z - C|| = %2.5e' % (i, err1))
                i += 1

            THETA = THETA.get()
            Err = [err1, err2]
            if self.verbose:
                print('\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e'
                      % (i, err1))
    else:  # CPU version
        # defining penalty parameters and constraint to minimize,
        # lambda and C matrix respectively
        THETA = np.zeros([self.num_columns, self.num_columns])
        lambda2 = np.zeros([self.num_columns, self.num_columns])

        P = self.data.T.dot(self.data)
        OP1 = np.multiply(P, mu1)

        if self.affine == True:
            # INITIALIZATION
            lambda3 = np.zeros(self.num_columns).T

            A = np.linalg.inv(
                np.multiply(mu1, P)
                + np.multiply(mu2, np.eye(self.num_columns, dtype=int))
                + np.multiply(mu2, np.ones([self.num_columns, self.num_columns])))

            OP3 = np.multiply(mu2, np.ones([self.num_columns, self.num_columns]))

            while (err1 > thr1 or err2 > thr1) and i < self.max_iter:
                # updating Bheta
                OP2 = np.multiply(THETA - np.divide(lambda2, mu2), mu2)
                OP4 = np.matlib.repmat(lambda3, self.num_columns, 1)
                BHETA = A.dot(OP1 + OP2 + OP3 + OP4)

                # updating C
                THETA = BHETA + np.divide(lambda2, mu2)
                THETA = self.shrinkL1Lq(THETA, 1 / mu2)

                # updating Lagrange multipliers
                lambda2 = lambda2 + np.multiply(mu2, BHETA - THETA)
                lambda3 = lambda3 + np.multiply(
                    mu2, np.ones([1, self.num_columns]) - np.sum(BHETA, axis=0))

                err1 = self.errorCoef(BHETA, THETA)
                err2 = self.errorCoef(np.sum(BHETA, axis=0),
                                      np.ones([1, self.num_columns]))

                # mu1 = min(mu1 * (1 + 10 ^ -5), 10 ^ 2 * mu1p);
                # mu2 = min(mu2 * (1 + 10 ^ -5), 10 ^ 2 * mu2p);

                # reporting errors
                if self.verbose and (i % self.step == 0):
                    print('\t\tIteration = %d, ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e'
                          % (i, err1, err2))
                i += 1

            Err = [err1, err2]
            if self.verbose:
                print('\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e. \n'
                      % (i, err1, err2))
        else:
            print('\t\tCPU not affine')
            A = np.linalg.inv(
                OP1 + np.multiply(mu2, np.eye(self.num_columns, dtype=int)))

            while err1 > thr1 and i < self.max_iter:
                # updating Z
                OP2 = np.multiply(mu2, THETA) - lambda2
                BHETA = A.dot(OP1 + OP2)

                # updating C
                THETA = BHETA + np.divide(lambda2, mu2)
                THETA = self.shrinkL1Lq(THETA, 1 / mu2)

                # updating Lagrange multipliers
                lambda2 = lambda2 + np.multiply(mu2, BHETA - THETA)

                # computing errors
                err1 = self.errorCoef(BHETA, THETA)

                # reporting errors
                if self.verbose and (i % self.step == 0):
                    print('\t\tIteration %5.0f, ||Z - C|| = %2.5e' % (i, err1))
                i += 1

            Err = [err1, err2]
            if self.verbose:
                print('\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e'
                      % (i, err1))

    print("\t\t-ADMM- Time = %s seconds" % (time.time() - start_time))

    return THETA, Err

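# A NumPy sketch (hypothetical values) of the soft-thresholding step that
# shrinkL1Lq performs in the l1 case, as described in the docstring above:
# shrink each entry toward zero by the threshold and clip at zero.
import numpy as np

def soft_threshold(v, tau):
    # elementwise: sign(v) * max(|v| - tau, 0)
    return np.sign(v) * np.maximum(np.abs(v) - tau, 0.0)

v = np.array([-1.5, -0.2, 0.0, 0.3, 2.0])
print(soft_threshold(v, 0.5))   # [-1.0, 0.0, 0.0, 0.0, 1.5]
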
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy as np
from skcuda import misc


def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)


a = np.arange(100000, dtype=np.float32)
b = np.array(rolling_window(a, 5))

misc.init()
dest_gpu = gpuarray.to_gpu(b)
c = misc.sum(dest_gpu, axis=1)

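# A CPU-only check (NumPy >= 1.20, hypothetical short input) that the
# row-wise sum of the rolling windows equals a length-5 moving sum, which
# np.convolve with a ones kernel also produces.
import numpy as np

a = np.arange(20, dtype=np.float32)
windows = np.lib.stride_tricks.sliding_window_view(a, 5)
moving_sum = windows.sum(axis=1)
assert np.allclose(moving_sum,
                   np.convolve(a, np.ones(5, np.float32), mode='valid'))
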
def sum_diagonals(self, d_arr, d_out):
    self.d_diags.fill(0)
    self._kern_args[0] = d_arr.gpudata
    self.extract_diags_kernel(*self._kern_args, grid=self._grid, block=self._blocks)
    skmisc.sum(self.d_diags, axis=1, out=d_out)

def __init__(self, centers):
    culinalg.init()
    self.centers = centers.astype(np.float32)
    self.centers_gpu = gpuarray.to_gpu(self.centers)
    self.center_norms = cumisc.sum(self.centers_gpu**2, axis=1)

def demosaick_gpu(img):
    img = gp.to_gpu(img)
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    wA = p1x.shape[0]
    wB = p2x.shape[0]
    hA = p1x.shape[1]
    hB = p2x.shape[1]

    # Path 1
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    cm.exp(p1y, out=p1y)
    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2
    shI = [wB - 1, hB - 1, _numsel]
    shM = [(wB - 1) * (hB - 1), _numsel]
    p2x = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2x = lg.dot(pTT, _wts.c200)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c201, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c210, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c211, p2x)

    msc.add_matvec(p2x, _wts.c2b, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(shI)

    # conv 3
    shI = [wB - 2, hB - 2, _numsel]
    shM = [(wB - 2) * (hB - 2), _numsel]
    p2y = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2y = lg.dot(pTT, _wts.c300)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c301, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c310, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c311, p2y)

    msc.add_matvec(p2y, _wts.c3b, out=p2y)
    gp.maximum(p2y, 0., p2y)

    p2x = lg.dot(p2y, _wts.sout)
    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(p1x.shape)

    # Combine
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])

    im = p2im(p1.get())
    return im