def calculate_H_gpu(X, W, P):
    """Compute the rectified, normalised hidden activations for ``X``.

    The work is delegated to the ``la``/``gpu`` helpers in this order:

    1. ``la.dot(W, W, "t", "n")`` with ``P`` added on its diagonal
       (presumably ``W^T W + diag(P)``; the "t"/"n" flags look like
       transpose markers -- confirm against the ``la`` module).
    2. ``W`` multiplied by the inverse of that matrix.
    3. ``X`` multiplied by the result of step 2 (second operand flagged "t").
    4. Clamp at zero, then ``to_unit_variance``.

    Returns
    -------
    tuple
        ``(H, tmp)``: the normalised activations and the intermediate
        product from step 2, which the caller reuses.
    """
    # The inverse is taken with overwrite=True, so this temporary is consumed.
    gram_plus_p = la.add_diag(P, la.dot(W, W, "t", "n"))
    projection = la.dot(W, la.inv(gram_plus_p, overwrite=True))

    activations = gpu.maximum(la.dot(X, projection, "n", "t"), 0)
    return to_unit_variance(activations), projection
def maximum_cuda(a, b=None):
    """Maximum of two GPUArrays, or the maximum of a single GPUArray.

    Parameters
    ----------
    a : gpuarray
        First GPUArray.
    b : gpuarray, optional
        Second GPUArray. When omitted (``None``), only ``a`` is reduced.

    Returns
    -------
    gpuarray
        ``cuda_array.maximum(a, b)`` when ``b`` is given, otherwise the
        result of ``cuda_array.max(a)``.

    Examples
    --------
    >>> a = maximum_cuda(give_cuda([1, 2, 3]), give_cuda([3, 2, 1]))  # holds [3, 2, 3]
    >>> type(a)
    <class 'pycuda.gpuarray.GPUArray'>
    """
    if b is None:
        # Single-argument form: reduce ``a`` on its own.
        return cuda_array.max(a)
    return cuda_array.maximum(a, b)
def test_minimum_maximum_scalar(self):
    """GPU ``maximum``/``minimum`` against the scalar 0 must match NumPy exactly."""
    import pycuda.gpuarray as gpuarray
    from pycuda.curandom import rand as curand

    n_elems = 20
    dev_vals = curand((n_elems, ))
    host_vals = dev_vals.get()

    # The scalar is passed on either side to exercise both argument orders.
    dev_max = gpuarray.maximum(dev_vals, 0)
    dev_min = gpuarray.minimum(0, dev_vals)

    assert la.norm(dev_max.get() - np.maximum(host_vals, 0)) == 0
    assert la.norm(dev_min.get() - np.minimum(0, host_vals)) == 0
def test_minimum_maximum_scalar(self):
    """Compare scalar-operand ``gpuarray.maximum``/``minimum`` with NumPy."""
    from pycuda.curandom import rand as curand
    import pycuda.gpuarray as gpuarray

    length = 20
    random_dev = curand((length,))
    random_host = random_dev.get()

    clipped_low = gpuarray.maximum(random_dev, 0)
    clipped_high = gpuarray.minimum(0, random_dev)

    # A zero norm of the difference means every element agrees exactly.
    max_error = la.norm(clipped_low.get() - np.maximum(random_host, 0))
    assert max_error == 0
    min_error = la.norm(clipped_high.get() - np.minimum(0, random_host))
    assert min_error == 0
def train_rfn_gpu(X, n_hidden, n_iter, learnrateW, learnratePsi, dropout_rate,
                  input_droput_rate, minPsi=0.1, seed=32):
    """Iteratively fit the weight matrix ``W`` and the vector ``P`` on the GPU.

    Parameters
    ----------
    X : array, shape (n, m)
        Host-side input data; it is uploaded to the GPU.
    n_hidden : int
        Number of rows ``k`` of ``W`` (``W`` has shape ``(k, m)``).
    n_iter : int
        Number of update iterations.
    learnrateW, learnratePsi : float
        Step sizes applied to the ``W`` and ``P`` updates respectively.
    dropout_rate : float
        When greater than zero, ``dropout(H, dropout_rate)`` is applied to
        the hidden activations in every iteration.
    input_droput_rate : float
        When greater than zero, ``saltpepper_noise`` is applied to a copy of
        ``X`` in every iteration. The name is misspelled; it is kept
        unchanged so existing keyword callers continue to work.
    minPsi : float
        Lower bound enforced on every entry of ``P`` after each update.
    seed : int
        Passed to ``init_rng``. NOTE(review): ``W`` is drawn from
        ``np.random`` before ``init_rng`` runs, so whether the seed also
        fixes the initial ``W`` depends on ``init_rng`` -- confirm.

    Returns
    -------
    tuple
        ``(W, P)`` copied back to the host via ``.get()``.
    """
    # Fix: the body used ``input_dropout_rate``, which is not the parameter
    # name and raised NameError. Alias it locally instead of renaming the
    # public parameter.
    input_dropout_rate = input_droput_rate

    k = n_hidden
    n, m = X.shape
    W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32)
    P = np.array([0.1] * m, dtype=np.float32)
    XXdiag = np.diag(np.dot(X.T, X) / n).copy()  # explicit copy to avoid numpy 1.8 warning

    W = gpu.to_gpu(W, allocator=_mempool.allocate)
    P = gpu.to_gpu(P, allocator=_mempool.allocate)
    X = gpu.to_gpu(X, allocator=_mempool.allocate)
    XXdiag = gpu.to_gpu(XXdiag, allocator=_mempool.allocate)
    I = la.eye(k, dtype=np.float32)

    init_rng(seed)
    t0 = time.time()
    for cur_iter in range(n_iter):
        H, tmp = calculate_H_gpu(X, W, P)
        if dropout_rate > 0:
            dropout(H, dropout_rate)

        # Only copy the input when it is actually going to be corrupted.
        Xtmp = X
        if input_dropout_rate > 0:
            Xtmp = X.copy()
            saltpepper_noise(Xtmp, input_dropout_rate)

        U = la.dot(Xtmp, H, "t", "n") / n
        S = la.dot(H, H, "t", "n") / n
        S += I
        S -= la.dot(tmp, W, "n", "t")
        # Cii must be computed before S is inverted in place below.
        Cii = la.dot(la.dot(W, S, "t") - 2 * U, W)
        Sinv = la.inv(S, overwrite=True)
        dW = la.dot(Sinv, U, "n", "t") - W
        dP = XXdiag + la.diag(Cii) - P

        W += learnrateW * dW
        P += learnratePsi * dP
        P = gpu.maximum(P, minPsi)
        if cur_iter % 25 == 0:
            # Call form with one pre-formatted string prints identically on
            # Python 2 and Python 3 (the statement form is Python 2 only).
            print("iter %3d (elapsed time: %5.2fs)" % (cur_iter, time.time() - t0))
    return W.get(), P.get()
def ProxFSs(s, t, _Lambda, _gamma_c):
    """Rescale ``s`` and the three components of ``t`` by clamped magnitudes.

    ``t`` is divided component-wise by ``max(1, t_norm / _gamma_c)`` and
    ``s`` by ``max(1, s_abs)`` where ``s_abs`` has been divided by
    ``_Lambda``. The element-wise work is done by the ``*_matrix`` helpers;
    their (input, output) argument convention is assumed from how they are
    called here -- confirm against their definitions.

    Parameters
    ----------
    s : GPU array with a 3-D shape.
    t : GPU array indexed as ``t[component, :, :, :]`` for components 0..2.
    _Lambda, _gamma_c : divisors applied to the magnitudes before clamping.

    Returns
    -------
    tuple
        ``(ps, pt)``: the rescaled ``s`` and the rescaled ``t``.
    """
    d0, d1, d2 = s.shape
    vol_shape = (d0, d1, d2)

    # Magnitude of t across its three components.
    t_squared = gpuarray.empty_like(t)
    square_matrix(t, t_squared)
    t_magnitude = gpuarray.empty(vol_shape, dtype=np.float32)
    sum_three_matrix(t_squared[0, :, :, :], t_squared[1, :, :, :],
                     t_squared[2, :, :], t_magnitude, 1.0, 1.0, 1.0)
    sqrt_matrix(t_magnitude, t_magnitude)

    t_divisor = gpuarray.maximum(1, t_magnitude / _gamma_c)
    pt = gpuarray.zeros((3,) + vol_shape, t.dtype)
    for component in range(3):
        divide_matrix(t[component, :, :, :], t_divisor, pt[component, :, :, :])

    # Same treatment for the scalar field s.
    s_magnitude = gpuarray.empty(vol_shape, dtype=np.float32)
    absolute_matrix(s, s_magnitude)
    divide_matrix(s_magnitude, _Lambda, s_magnitude)
    s_divisor = gpuarray.maximum(1, s_magnitude)
    ps = gpuarray.zeros(vol_shape, s.dtype)
    divide_matrix(s, s_divisor, ps)

    return ps, pt
def ProxFSs(t, _gamma):
    """Rescale the three components of ``t`` by ``max(1, t_norm / _gamma)``.

    ``t_norm`` is built from the squares of the three components via
    ``square_matrix``, ``sum_three_matrix`` and ``sqrt_matrix``; the
    (input, output) argument convention of those helpers is assumed from
    how they are called here -- confirm against their definitions.

    Parameters
    ----------
    t : GPU array with a 4-D shape, indexed as ``t[component, :, :, :]``.
    _gamma : divisor applied to the magnitude before clamping at 1.

    Returns
    -------
    GPU array of shape ``(3, l, m, n)`` and dtype ``t.dtype``.
    """
    _, d0, d1, d2 = t.shape
    vol_shape = (d0, d1, d2)

    squares = gpuarray.empty_like(t)
    square_matrix(t, squares)

    magnitude = gpuarray.empty(vol_shape, dtype=np.float32)
    sum_three_matrix(squares[0, :, :, :], squares[1, :, :, :], squares[2, :, :, :],
                     magnitude, 1.0, 1.0, 1.0)
    sqrt_matrix(magnitude, magnitude)

    divisor = gpuarray.maximum(1, magnitude / _gamma)
    pt = gpuarray.zeros((3,) + vol_shape, t.dtype)
    for component in range(3):
        divide_matrix(t[component, :, :, :], divisor, pt[component, :, :, :])
    return pt
def test_if_positive(self):
    """Element-wise GPU ``maximum``/``minimum`` of two arrays must match NumPy."""
    import pycuda.gpuarray as gpuarray
    from pycuda.curandom import rand as curand

    n_elems = 20
    lhs_dev = curand((n_elems, ))
    rhs_dev = curand((n_elems, ))
    lhs_host = lhs_dev.get()
    rhs_host = rhs_dev.get()

    dev_max = gpuarray.maximum(lhs_dev, rhs_dev)
    dev_min = gpuarray.minimum(lhs_dev, rhs_dev)

    # Show both results so a mismatch is easy to inspect in the test log.
    print(dev_max)
    print(np.maximum(lhs_host, rhs_host))

    assert la.norm(dev_max.get() - np.maximum(lhs_host, rhs_host)) == 0
    assert la.norm(dev_min.get() - np.minimum(lhs_host, rhs_host)) == 0
def test_if_positive(self):
    """Check two-array ``gpuarray.maximum``/``minimum`` against NumPy."""
    from pycuda.curandom import rand as curand
    import pycuda.gpuarray as gpuarray

    length = 20
    first_dev, second_dev = curand((length,)), curand((length,))
    first, second = first_dev.get(), second_dev.get()

    larger_dev = gpuarray.maximum(first_dev, second_dev)
    smaller_dev = gpuarray.minimum(first_dev, second_dev)

    # Print GPU and reference results for inspection.
    print(larger_dev)
    print(np.maximum(first, second))

    max_error = la.norm(larger_dev.get() - np.maximum(first, second))
    assert max_error == 0
    min_error = la.norm(smaller_dev.get() - np.minimum(first, second))
    assert min_error == 0
def train_rfn_gpu(X, n_hidden, n_iter, learnrateW, learnratePsi, dropout_rate,
                  input_droput_rate, minPsi=0.1, seed=32):
    """Iteratively fit the weight matrix ``W`` and the vector ``P`` on the GPU.

    Parameters
    ----------
    X : array, shape (n, m)
        Host-side input data; it is uploaded to the GPU.
    n_hidden : int
        Number of rows ``k`` of ``W`` (``W`` has shape ``(k, m)``).
    n_iter : int
        Number of update iterations.
    learnrateW, learnratePsi : float
        Step sizes applied to the ``W`` and ``P`` updates respectively.
    dropout_rate : float
        When greater than zero, ``dropout(H, dropout_rate)`` is applied to
        the hidden activations in every iteration.
    input_droput_rate : float
        When greater than zero, ``saltpepper_noise`` is applied to a copy of
        ``X`` in every iteration. The name is misspelled; it is kept
        unchanged so existing keyword callers continue to work.
    minPsi : float
        Lower bound enforced on every entry of ``P`` after each update.
    seed : int
        Passed to ``init_rng``. NOTE(review): ``W`` is drawn from
        ``np.random`` before ``init_rng`` runs, so whether the seed also
        fixes the initial ``W`` depends on ``init_rng`` -- confirm.

    Returns
    -------
    tuple
        ``(W, P)`` copied back to the host via ``.get()``.
    """
    # Fix: the body referenced ``input_dropout_rate`` although the parameter
    # is spelled ``input_droput_rate`` (NameError at run time). A local alias
    # keeps the public signature intact.
    input_dropout_rate = input_droput_rate

    k = n_hidden
    n, m = X.shape
    W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32)
    P = np.array([0.1] * m, dtype=np.float32)
    XXdiag = np.diag(np.dot(X.T, X) / n).copy()  # explicit copy to avoid numpy 1.8 warning

    W = gpu.to_gpu(W, allocator=_mempool.allocate)
    P = gpu.to_gpu(P, allocator=_mempool.allocate)
    X = gpu.to_gpu(X, allocator=_mempool.allocate)
    XXdiag = gpu.to_gpu(XXdiag, allocator=_mempool.allocate)
    I = la.eye(k, dtype=np.float32)

    init_rng(seed)
    t0 = time.time()
    for cur_iter in range(n_iter):
        H, tmp = calculate_H_gpu(X, W, P)
        if dropout_rate > 0:
            dropout(H, dropout_rate)

        # Only copy the input when it is actually going to be corrupted.
        Xtmp = X
        if input_dropout_rate > 0:
            Xtmp = X.copy()
            saltpepper_noise(Xtmp, input_dropout_rate)

        U = la.dot(Xtmp, H, "t", "n") / n
        S = la.dot(H, H, "t", "n") / n
        S += I
        S -= la.dot(tmp, W, "n", "t")
        # Cii must be computed before S is inverted in place below.
        Cii = la.dot(la.dot(W, S, "t") - 2*U, W)
        Sinv = la.inv(S, overwrite=True)
        dW = la.dot(Sinv, U, "n", "t") - W
        dP = XXdiag + la.diag(Cii) - P

        W += learnrateW * dW
        P += learnratePsi * dP
        P = gpu.maximum(P, minPsi)
        if cur_iter % 25 == 0:
            # Fix: the Python 2 print statement is a SyntaxError on Python 3;
            # this form prints the same text on both.
            print("iter %3d (elapsed time: %5.2fs)" % (cur_iter, time.time() - t0))
    return W.get(), P.get()
def demosaick_gpu(img):
    """Run the two-path demosaicking network on the GPU and return the image.

    ``img`` is uploaded with ``gp.to_gpu``. Two patch sets are extracted with
    ``im2col``: one from the raw image (path 2) and one from
    ``log(img + _eps)`` (path 1). Path 1 is two matrix products with an
    ``exp`` in between; path 2 is a stack of layers, each followed by a clamp
    at zero. The two results are multiplied element-wise, summed over axis 1,
    clamped to [0, 1], copied back to the host and passed through ``p2im``.

    The semantics of ``im2col``, ``msc.add_matvec``, ``lg.add_dot`` and
    ``p2im`` are assumed from their names and usage -- confirm against their
    definitions. Weights come from the module-level ``_wts`` object.
    """
    img = gp.to_gpu(img)
    # Order matters: path-2 patches are taken from the raw image *before* the
    # image is replaced in place by its logarithm for path 1.
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    # Leading two dimensions of each patch tensor.
    wA = p1x.shape[0]
    wB = p2x.shape[0]
    hA = p1x.shape[1]
    hB = p2x.shape[1]

    # Path 1
    # One row of 576 values per patch location.
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    # Undo the log taken above, in place.
    cm.exp(p1y, out=p1y)
    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    # Clamp at zero in place (third positional argument is the output).
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2
    # A 2x2 neighbourhood is handled as four shifted slices of p2y, each
    # copied into the scratch buffer pTT, flattened, and multiplied by its
    # own weight matrix; lg.add_dot presumably accumulates into p2x.
    shI = [wB - 1, hB - 1, _numsel]
    shM = [(wB - 1) * (hB - 1), _numsel]
    # NOTE(review): this allocation is never read; p2x is rebound by lg.dot below.
    p2x = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    # NOTE(review): pTT already has shape shI here, so this reshape looks redundant.
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2x = lg.dot(pTT, _wts.c200)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c201, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c210, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c211, p2x)

    msc.add_matvec(p2x, _wts.c2b, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(shI)

    # conv 3
    # Same four-slice scheme as conv2, one pixel smaller in each direction,
    # reading from p2x and accumulating into p2y.
    shI = [wB - 2, hB - 2, _numsel]
    shM = [(wB - 2) * (hB - 2), _numsel]
    # NOTE(review): this allocation is never read; p2y is rebound by lg.dot below.
    p2y = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2y = lg.dot(pTT, _wts.c300)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c301, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c310, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c311, p2y)

    msc.add_matvec(p2y, _wts.c3b, out=p2y)
    gp.maximum(p2y, 0., p2y)

    # Output layer of path 2, reshaped to line up with path 1 element-wise.
    p2x = lg.dot(p2y, _wts.sout)
    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(p1x.shape)

    # Combine
    # Element-wise product of the two paths, summed along axis 1.
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    # Clamp the result to [0, 1] in place.
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])
    # Copy back to the host and convert the patch layout into an image.
    im = p2im(p1.get())
    return im
def relu(x, deriv=False):
    """Return ``max(x, 0)``, or ``1.0 - exp(-x)`` when ``deriv`` is true.

    NOTE(review): the ``deriv`` branch is not the usual step-function
    derivative of a rectifier -- confirm that ``1.0 - exp(-x)`` is intended.
    """
    if not deriv:
        return gpu.maximum(x, 0)
    return 1.0 - cm.exp(-x)
def roi_pool(feature_maps, input_rois):
    """Max-pool every region of interest into a 6x6 grid per channel.

    Parameters
    ----------
    feature_maps : array, shape (batch, height, width, channels)
        Host-side feature maps. Only batch index 0 is read.
    input_rois : array, shape (num_rois, >=5)
        One ROI per row. Columns 1..4 are used as start-w, start-h, end-w
        and end-h in input-image coordinates; column 0 is not used.

    Returns
    -------
    GPU array of shape ``(num_rois, 9216)`` and dtype float32, one flattened
    ``256 x 36`` pooling result per ROI.

    Notes
    -----
    The buffer sizes are hard-coded for 256 channels and 6x6 bins
    (256 * 36 = 9216). Coordinates are scaled by 0.0625 and floored; the
    original author marked those spots "should be round()".
    """
    feature_maps_gpu = gpuarray.to_gpu(feature_maps)
    input_rois_gpu = gpuarray.to_gpu(input_rois)
    pooled_height = 6
    pooled_width = 6
    spatial_scale = 0.0625
    batch_size, height, width, channels = feature_maps_gpu.shape
    num_rois = input_rois_gpu.shape[0]
    roi_pool_conv5s = gpuarray.to_gpu(
        np.zeros([num_rois, 9216], dtype=np.float32))

    for i in range(num_rois):
        # Fix: the running-max buffer used to be allocated once before this
        # loop and never cleared, so maxima from earlier ROIs leaked into
        # every later ROI. Start each ROI from a fresh, zeroed buffer.
        roi_pool_conv5 = gpuarray.to_gpu(np.zeros([256, 36], dtype=np.float32))

        # roi_batch_ind = input_rois_gpu[i, 0]
        roi_start_w = cumath.floor(input_rois_gpu[i, 1] * spatial_scale)  # should be round()
        roi_start_h = cumath.floor(input_rois_gpu[i, 2] * spatial_scale)  # should be round()
        roi_end_w = cumath.floor(input_rois_gpu[i, 3] * spatial_scale)  # should be round()
        roi_end_h = cumath.floor(input_rois_gpu[i, 4] * spatial_scale)  # should be round()

        # Force degenerate ROIs to be at least 1x1.
        roi_height = gpuarray.maximum(roi_end_h - roi_start_h + 1, 1)
        roi_width = gpuarray.maximum(roi_end_w - roi_start_w + 1, 1)
        bin_size_h = roi_height / float(pooled_height)
        bin_size_w = roi_width / float(pooled_width)

        for c in range(channels):
            for ph in range(pooled_height):
                for pw in range(pooled_width):
                    # Bin extent relative to the ROI origin ...
                    hstart = cumath.floor(ph * bin_size_h)
                    wstart = cumath.floor(pw * bin_size_w)
                    hend = cumath.ceil((ph + 1) * bin_size_h)
                    wend = cumath.ceil((pw + 1) * bin_size_w)

                    # ... shifted to feature-map coordinates and clipped to
                    # the feature-map bounds.
                    hstart = gpuarray.minimum(
                        gpuarray.maximum(hstart + roi_start_h, 0), height)
                    hend = gpuarray.minimum(
                        gpuarray.maximum(hend + roi_start_h, 0), height)
                    wstart = gpuarray.minimum(
                        gpuarray.maximum(wstart + roi_start_w, 0), width)
                    wend = gpuarray.minimum(
                        gpuarray.maximum(wend + roi_start_w, 0), width)

                    # "+" acts as a logical OR on the two comparison results.
                    is_empty = (hend <= hstart) + (wend <= wstart)
                    pool_index = ph * pooled_width + pw
                    if (is_empty.get()):
                        roi_pool_conv5[c, pool_index] = 0

                    # Running maximum over the bin; the ranges are empty when
                    # the bin is empty, leaving the value at zero.
                    for h in range(int(hstart.get()), int(hend.get())):
                        for w in range(int(wstart.get()), int(wend.get())):
                            # index = h * width + w
                            if ((feature_maps_gpu[0, h, w, c] >
                                 roi_pool_conv5[c, pool_index]).get()):
                                roi_pool_conv5[c, pool_index] = \
                                    feature_maps_gpu[0, h, w, c]

        roi_pool_conv5s[i] = roi_pool_conv5.reshape([9216])
    return roi_pool_conv5s
# Demo of several gpuarray constructors and element-wise helpers.
# ``dtype`` and ``a`` are defined before this excerpt.

# 100x100 array from gpuarray.empty (contents are not set here).
c = gpuarray.empty((100, 100), dtype=dtype)
print('c:\n{0}\nshape={1}\n'.format(c, c.shape))

# 100x100 array of zeros.
d = gpuarray.zeros((100, 100), dtype=dtype)
print('d:\n{0}\nshape={1}\n'.format(d, d.shape))

# 0.0, 1.0, ..., 99.0
e = gpuarray.arange(0.0, 100.0, 1.0, dtype=dtype)
print('e:\n{0}\nshape={1}\n'.format(e, e.shape))

# Element-wise select driven by the mask ``e < 50``: second argument where
# the mask holds, third argument elsewhere.
f = gpuarray.if_positive(e < 50, e - 100, e + 100)
print('f:\n{0}\nshape={1}\n'.format(f, f.shape))

# Same select, choosing between ones and zeros.
g = gpuarray.if_positive(e < 50, gpuarray.ones_like(e), gpuarray.zeros_like(e))
print('g:\n{0}\nshape={1}\n'.format(g, g.shape))

# Element-wise maximum / minimum of two arrays.
h = gpuarray.maximum(e, f)
print('h:\n{0}\nshape={1}\n'.format(h, h.shape))
i = gpuarray.minimum(e, f)
print('i:\n{0}\nshape={1}\n'.format(i, i.shape))

# Reductions over ``a``; the value and its type are printed.
# NOTE(review): this rebinds ``g`` (the if_positive result above); the
# h, i, ..., k, l naming suggests ``j`` was intended -- confirm.
g = gpuarray.sum(a)
print(g, type(g))
k = gpuarray.max(a)
print(k, type(k))
l = gpuarray.min(a)
print(l, type(l))
def tgv(op, out_dir, alpha=4e-5, tau_p=0.625, tau_d=0.125, reduction=2**-8,
        fac=2, iters=3000, relative_tolerance=1e-20, absolute_tolerance=1e-19,
        cg=False, inner_iters=20, norm_est=None, norm_est_iters=10,
        time_iters=False, no_progress=False, save_images=False,
        save_mat=False, image_format='png', verbose=False):
    """TGV regularized reconstruction using the Encoding Matrix E.

    Args:
        op (:class:`artbox.operators.Operator`):
            :class:`artbox.operators.Operator` object.
        out_dir (str): Output directory.
        alpha (float): Regularization parameter.
        tau_p (float): tau_p. Recomputed from the operator norm when `cg`
            is set.
        tau_d (float): tau_d. Recomputed from the operator norm when `cg`
            is set.
        reduction (float): Regularization parameter reduction per iteration.
        fac (float): fac.
        iters (int): Number of iterations.
        relative_tolerance (float): Relative tolerance for early stopping
            rule (passed to the inner CG).
        absolute_tolerance (float): Absolute tolerance for early stopping
            rule (passed to the inner CG).
        cg (bool): Indicate whether inner CG method should be used (TGV-CG).
        inner_iters (int): Number of iterations for inner CG.
        norm_est (float): Estimated norm of operator. If `None`, it will be
            calculated.
        norm_est_iters (int): Number of iterations for norm estimation.
        time_iters (bool): If `True`, all iterations are timed (the timing
            is only printed when `no_progress` is also set).
        no_progress (bool): If `True`, no progress bar is shown.
        save_images (bool): If `True`, all intermediate images are saved to
            disk.
        save_mat (bool): If `True`, all intermediate images are saved as
            MATLAB data files.
        image_format (str): Format images are saved in.
        verbose (bool): If true, print progress information. The progress
            bar is only shown when this is set.

    Returns:
        int: Total iteration count: one per outer iteration plus, when `cg`
        is set, the `iteration` count reported by each inner CG run.

    Note:
        A `double` option (double-precision computation) is not implemented
        and is not a parameter of this function (TODO). The final image and
        MATLAB data are always written with the label "result", even after
        an error or CTRL-C.
    """
    data = op.data

    # Start and end values of the two regularization weights; the loop
    # interpolates between them on a log scale.
    alpha = alpha/reduction
    alpha00 = alpha*fac
    alpha10 = alpha
    alpha01 = alpha00*reduction
    alpha11 = alpha10*reduction

    maxiter = iters

    # set up primal variables
    ut = gpuarray.zeros((data.nX1, data.nX2, 1), np.complex64, order='F')
    # 'reference zero'
    tt = gpuarray.zeros((data.nX1, data.nX2, 1), np.float32, order='F')
    op.adjoint(op.dgpu['recondata'], ut)

    # norm estimation
    if norm_est is None:
        # perform norm estimation
        norm_est = op.norm_est(ut, norm_est_iters)
        if verbose:
            print("Norm estimation: " + str(norm_est))
    else:
        # use user-provided norm
        if verbose:
            print("Norm estimation (provided by user): " + str(norm_est))

    ut /= norm_est
    # Initial image: real part of the scaled adjoint, clamped at zero.
    u = gpuarray.maximum(tt, ut.real).astype(np.complex64)
    u_ = gpuarray_copy(u)
    w = gpuarray.zeros((u.shape[0], u.shape[1], 2*u.shape[2]),
                       np.complex64, order='F')
    w_ = gpuarray.zeros((u.shape[0], u.shape[1], 2*u.shape[2]),
                        np.complex64, order='F')

    # set up dual variables
    p = gpuarray.zeros((u.shape[0], u.shape[1], 2*u.shape[2]),
                       np.complex64, order='F')
    q = gpuarray.zeros((u.shape[0], u.shape[1], 3*u.shape[2]),
                       np.complex64, order='F')
    v = gpuarray.zeros_like(op.dgpu['recondata'])

    # set up variables associated with linear transform
    Ku = gpuarray.zeros(op.dgpu['recondata'].shape, np.complex64, order='F')
    Kadjv = gpuarray.zeros((data.nX1, data.nX2, 1), np.complex64, order='F')

    # if args.L2 is None:
    #     # L2 is *not* provided by the user
    #     M = 1
    #     L2 = 0.5*(M*M + 17 + np.sqrt(pow(M, 4.0) - 2*M*M + 33))
    # else:
    #     # L2 is provided by the user
    #     L2 = args.L2
    # this one works for TGV
    # tau_p = 1.0/np.sqrt(L2)
    # tau_d = 1.0/tau_p/L2

    uold = gpuarray.empty_like(u)
    wold = gpuarray.empty_like(w)

    # NOTE(review): the nesting of this block was reconstructed from a
    # collapsed source line; the step-size overrides are assumed to apply
    # only in CG mode -- confirm.
    if cg:
        from artbox.cg import InnerCG
        tmp_EHs = None
        tau_p = 1.0/norm_est
        tau_d = 1.0/tau_p/(0.5*(17+np.sqrt(33)))

    if time_iters:
        from time import time

    if not no_progress and verbose:
        # set up progress bar
        from progressbar import ProgressBar
        progress = ProgressBar()
        iter_range = progress(range(maxiter))
    else:
        iter_range = range(maxiter)

    total_iterations = 0
    try:
        for k in iter_range:
            if verbose and no_progress:
                print("Iteration " + repr(k))
            if no_progress and time_iters:
                start = time()

            total_iterations += 1

            # Log-linear interpolation from the start to the end weights.
            alpha0 = np.exp(float(k)/maxiter*np.log(alpha01) +
                            float(maxiter-k)/maxiter*np.log(alpha00))
            alpha1 = np.exp(float(k)/maxiter*np.log(alpha11) +
                            float(maxiter-k)/maxiter*np.log(alpha10))

            # primal update
            # Keep the previous u and w for the extragradient step below.
            cuda.memcpy_dtod(uold.gpudata, u.gpudata, u.nbytes)
            cuda.memcpy_dtod(wold.gpudata, w.gpudata, w.nbytes)

            op.apply(u_, Ku)
            Ku /= norm_est
            tgvk.tgv_update_v(v, Ku, op.dgpu['recondata'], tau_d,
                              lin_constr=(alpha1 < 0))

            # dual update
            tgvk.tgv_update_p(u_, w_, p, tau_d, abs(alpha1))
            tgvk.tgv_update_q(w_, q, tau_d, abs(alpha0))

            op.adjoint(v, Kadjv)
            Kadjv /= norm_est

            # Inner conjugate gradient method
            if cg:
                # Fix: construct the solver outside the try block. It used
                # to be built inside, so a failing constructor made the
                # ``finally`` clause read an unbound (or stale) ``icg`` and
                # mask the real error. The former bare ``except: raise`` was
                # a no-op and has been removed.
                icg = InnerCG(op, data, u, p, tau_p, inner_iters,
                              relative_tolerance, absolute_tolerance,
                              verbose, EHs=tmp_EHs)
                try:
                    icg.run()
                finally:
                    # Count inner iterations even if run() was interrupted.
                    total_iterations += icg.iteration
                    tmp_EHs = icg.EHs
            else:
                tgvk.tgv_update_u(u, p, Kadjv, tau_p)

            tgvk.tgv_update_w(w, p, q, tau_p)

            # extragradient update
            tgvk.tgv_update_u_2(u_, u, uold)
            tgvk.tgv_update_w_2(w_, w, wold)

            # Print time per iteration
            if no_progress and time_iters:
                print("Elapsed time for iteration " + str(k) + ": " +
                      str(time() - start) + " seconds")

            # Save images
            if save_images:
                save_image(np.abs(u.get().reshape(data.nX1, data.nX2)),
                           out_dir, k, image_format)
            # Save matlab files
            if save_mat:
                save_matlab(u.get().reshape(data.nX1, data.nX2), out_dir, k)
    except KeyboardInterrupt:
        print("Reconstruction aborted (CTRL-C) at iteration " +
              str(total_iterations))
    finally:
        # always save final image and Matlab data
        save_image(np.abs(u.get().reshape(data.nX1, data.nX2)),
                   out_dir, "result", image_format)
        save_matlab(u.get().reshape(data.nX1, data.nX2), out_dir, "result")

    return total_iterations
def MAX(inp, inout):
    """Combine ``inp`` into ``inout`` with ``gpu.maximum``, in place.

    ``inout`` is passed both as the second operand and as the ``out=``
    target, so it is overwritten with the result. Nothing is returned.
    """
    accumulator = inout
    gpu.maximum(inp, accumulator, out=accumulator)