def test_log(self):
    """Check ``cumath.log`` element-wise against ``math.log``.

    Builds an array of ``test_sample`` elements via
    ``simplearray.array(test_sample).fill_arange() + 1``, applies
    ``cumath.log`` to it and asserts that every element is within ``1e-3``
    of ``math.log`` of the corresponding input element.
    """
    a = simplearray.array(test_sample).fill_arange() + 1
    b = cumath.log(a)
    for i in range(test_sample):
        # ``assert_`` is a deprecated unittest alias that was removed in
        # Python 3.12; ``assertLess`` performs the same ``<`` check and
        # reports both operands on failure.
        self.assertLess(abs(math.log(a[i]) - b[i]), 1e-3)
def multinomial_log_likelihood(softmax_vals, Y, one_n_trans, one_c):
    """Return ``one_n_trans . (Y * log(softmax_vals + 1e-9)) . one_c`` on the host.

    The element-wise product of ``Y`` with the log of the (offset) softmax
    values is reduced with two ``linalg.dot`` calls: first multiplied on the
    left by ``one_n_trans``, then on the right by ``one_c``.  The result is
    copied back with ``.get()``.

    NOTE(review): the names suggest ``one_n_trans`` / ``one_c`` are vectors
    of ones used to sum over rows and columns -- confirm against the caller.
    """
    # Small offset that protects against log(0).
    offset = 1e-9
    log_probs = cumath.log(softmax_vals + offset)
    weighted = Y * log_probs
    row_reduced = linalg.dot(one_n_trans, weighted)
    total = linalg.dot(row_reduced, one_c)
    return total.get()
def kl_error(self, input_data, targets, average=True, cache=None, prediction=True):
    """Return the KL divergence error between ``targets`` and the activations.

    Parameters
    ----------
    input_data :
        Passed to ``self.feed_forward`` when no ``cache`` is supplied.
    targets :
        Target values; ``nan_to_zeros`` writes a cleaned copy of them into a
        freshly allocated array of the same shape before they are used.
    average : bool
        When true, the summed error is divided by ``targets.shape[0]``.
    cache :
        Activations to use instead of running ``self.feed_forward``.
    prediction : bool
        Forwarded to ``self.feed_forward``.

    Returns
    -------
    float
        ``sum(t * (log(t + eps) - log(a + eps)))`` copied to the host, where
        ``t`` is the cleaned targets and ``a`` the activations.
    """
    if cache is None:
        activations = self.feed_forward(input_data, prediction=prediction)
    else:
        activations = cache

    clean_targets = gpuarray.empty_like(targets)
    nan_to_zeros(targets, clean_targets)

    log_ratio = cumath.log(clean_targets + eps) - cumath.log(activations + eps)
    divergence = gpuarray.sum(clean_targets * log_ratio)

    if average:
        # In-place division, as in the original accumulation.
        divergence /= targets.shape[0]

    return float(divergence.get())
def logsumexp(mat):
    """Compute a log-sum-exp of ``mat`` using the max-shift trick.

    The per-axis maxima (``max_by_axis(mat, 1)``) are subtracted from ``mat``
    before exponentiating, the exponentials are summed with
    ``matrix_sum_out_axis(..., 1)``, and the log of those sums is added back
    onto the maxima in place.  The updated maxima array is returned.
    """
    shift = max_by_axis(mat, 1)
    shifted = add_vec_to_mat(mat, shift, 0, substract=True)
    exp_sums = matrix_sum_out_axis(cumath.exp(shifted), 1)
    # In-place accumulate into the array returned by max_by_axis.
    shift += cumath.log(exp_sums)
    return shift
def thunk():
    """Evaluate the node on the GPU and store the result in ``outputs[0][0]``.

    Reads ``inputs[0..2]`` plus the enclosing-scope names ``A``, ``b``, ``x``,
    ``depth`` and ``self.n``.  Both feature inputs are mapped through
    ``exp(dot(., A) + b)`` and divided by their ``axis=1`` sums, blended with
    the weight from ``inputs[0]``, and combined into three terms whose total
    sum is copied to the host.  Every entry of ``node.outputs`` is then marked
    as computed in ``compute_map``.
    """
    mix = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    feats_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    feats_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])

    def _normalized_exp(feats):
        # exp(feats . A + b), divided by its sum along axis 1.
        unnormalized = cumath.exp(misc.add(linalg.dot(feats, A), b))
        return misc.divide(
            unnormalized, misc.sum(unnormalized, axis=1, keepdims=True))

    probs_t = _normalized_exp(feats_t)
    probs_f = _normalized_exp(feats_f)

    w = misc.multiply(probs_t, mix) + misc.multiply(probs_f, 1 - mix)

    log_w = cumath.log(w)
    log_w_scaled_sum = misc.sum(log_w, axis=1, keepdims=True) / self.n
    log_w = misc.subtract(log_w, log_w_scaled_sum)

    term1 = misc.sum(x * log_w, axis=1)
    term2 = (self.n + depth) * cumath.log(misc.sum(w, axis=1))
    term3 = depth * log_w_scaled_sum

    outputs[0][0] = misc.sum(term1 - term2 + term3).get()

    for out_var in node.outputs:
        compute_map[out_var][0] = True
def fprop(self, Y, Y_true, meta):
    """Forward pass of the loss: ``-sum(Y_true * log(Y)) / batch_extent``.

    Parameters
    ----------
    Y :
        Predictions; must have the same ``shape`` as ``Y_true``.
    Y_true :
        Labels.
    meta : dict
        Must contain ``'space_below'``, whose ``get_extent('b')`` is used as
        the divisor.

    Returns
    -------
    tuple
        ``(out, meta, fprop_state)`` where ``fprop_state['input_space']`` is
        ``meta['space_below']``.

    Raises
    ------
    ValueError
        If ``Y.shape`` and ``Y_true.shape`` differ.
    """
    # really bad things happen if Y_true is a bool array
    if Y.shape != Y_true.shape:
        message = "Shape of predictions and labels do not match. (Y={}, Y_true={})"
        raise ValueError(message.format(Y.shape, Y_true.shape))

    negated_sum = -pycuda.gpuarray.sum(Y_true * cumath.log(Y))
    out = negated_sum / meta['space_below'].get_extent('b')

    fprop_state = {'input_space': meta['space_below']}
    return out, meta, fprop_state
def logsumexp(mat, tmp=None):
    """Compute a log-sum-exp of ``mat`` using an optional scratch buffer.

    Parameters
    ----------
    mat :
        Input matrix.
    tmp : optional
        Scratch array that receives the shifted values; when ``None`` a new
        array is allocated with ``gpuarray.empty_like(mat)``.

    Returns
    -------
    The array returned by ``max_by_axis(mat, 1)``, updated in place with the
    log of the summed exponentials.
    """
    shift = max_by_axis(mat, 1)
    scratch = gpuarray.empty_like(mat) if tmp is None else tmp

    add_vec_to_mat(mat, shift, 0, target=scratch, substract=True)

    # exp_func is launched with the scratch buffer as both pointer arguments;
    # presumably an in-place exponential standing in for cumath.exp(scratch).
    exp_func.prepared_async_call(
        scratch._grid, scratch._block, None,
        scratch.gpudata, scratch.gpudata, scratch.mem_size)

    exp_sums = matrix_sum_out_axis(scratch, 1)
    shift += cumath.log(exp_sums)
    return shift
def fprop(self, Y, Y_true, meta):
    """Forward pass of the loss: ``-sum(Y_true * log(Y)) / batch_extent``.

    Parameters
    ----------
    Y :
        Predictions; must have the same ``shape`` as ``Y_true``.
    Y_true :
        Labels.
    meta : dict
        Must contain ``'space_below'``, whose ``get_extent('b')`` is used as
        the divisor.

    Returns
    -------
    tuple
        ``(out, meta, fprop_state)`` where ``fprop_state['input_space']`` is
        ``meta['space_below']``.

    Raises
    ------
    ValueError
        If ``Y.shape`` and ``Y_true.shape`` differ.
    """
    # really bad things happen if Y_true is a bool array
    if Y.shape != Y_true.shape:
        template = "Shape of predictions and labels do not match. (Y={}, Y_true={})"
        raise ValueError(template.format(Y.shape, Y_true.shape))

    negated_sum = -pycuda.gpuarray.sum(Y_true * cumath.log(Y))
    out = negated_sum / meta['space_below'].get_extent('b')

    fprop_state = {'input_space': meta['space_below']}
    return out, meta, fprop_state
def thunk():
    """Compute three outputs on the GPU and copy them back to the host.

    Reads ``inputs[0..2]`` plus the enclosing-scope names ``A``, ``b``, ``x``,
    ``depth`` and ``self.base.n``.  Writes ``dalpha`` to ``outputs[0][0]``,
    ``dLq_t`` to ``outputs[1][0]`` and ``dLq_f`` to ``outputs[2][0]``, then
    marks every entry of ``node.outputs`` as computed in ``compute_map``.

    NOTE(review): the output names suggest these are gradients with respect
    to the three inputs -- confirm against the enclosing op.
    """
    # Upload the inputs; inputs[0] is squeezed and given a trailing axis,
    # inputs[1] and inputs[2] are indexed at [0, :, :].
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
    # exp(x . A + b), divided by its sum along axis 1.
    Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
    Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
    Xtn = misc.sum(Xt, axis=1, keepdims=True)
    Xfn = misc.sum(Xf, axis=1, keepdims=True)
    Xt = misc.divide(Xt, Xtn)
    Xf = misc.divide(Xf, Xfn)
    # Blend of the two normalized arrays, weighted by alpha and (1 - alpha).
    w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
    # First output: assembled from the difference Xt - Xf.
    dq = Xt - Xf
    qdw = dq / w
    t1 = misc.sum(x * qdw, axis=1)
    f = 2 * depth + self.base.n
    t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
    t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
    dalpha = t1 - t2 + t3
    # Drop references to intermediates that are no longer needed.
    del dq, t1, f, t2, t3
    iw = 1 / w
    S1 = misc.multiply(
        depth[:, None] * (self.base.n - 1) / self.base.n, iw)
    # NOTE(review): this divides by log(sum(w)), not by sum(w) -- confirm
    # that this is the intended expression.
    S2 = (self.base.n + depth[:, None]) / cumath.log(
        misc.sum(w, axis=1, keepdims=True))
    F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
    del w, iw, S1, S2
    # Zero array of shape (x_t.shape[1], Xt.shape[1]); it is added to a single
    # row below, presumably to broadcast that row to this shape -- confirm.
    cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]), dtype=theano.config.floatX)
    dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
    dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)
    # Fill the second and third outputs one row (index i) at a time.
    for i in range(Xt.shape[0]):
        S1 = misc.multiply(Xt[None, i, :], A)
        S2 = misc.sum(S1, axis=1, keepdims=True)
        S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
        dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2), axis=1)
        S1 = misc.multiply(Xf[None, i, :], A)
        S2 = misc.sum(S1, axis=1, keepdims=True)
        S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
        dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2), axis=1)
    # Copy the results back to host memory.
    outputs[0][0] = dalpha.get()
    outputs[1][0] = dLq_t.get()
    outputs[2][0] = dLq_f.get()
    for v in node.outputs:
        compute_map[v][0] = True
def log_t(self, a, out):
    """Call ``cumath.log`` on ``a``, passing ``out`` as its ``out`` argument.

    Nothing is returned; the result is delivered through ``out``.
    """
    cumath.log(a, out=out)
def cross_entropy(x, y):
    """Return ``-sum(y * log(x + eps))`` as a Python float.

    The element-wise product is passed through ``nan_to_zeros_kernel`` with
    the same array as both arguments before it is summed.
    """
    weighted_log = y * cumath.log(x + eps)
    # Same array passed as both arguments to the kernel.
    nan_to_zeros_kernel(weighted_log, weighted_log)
    negated_total = -gpuarray.sum(weighted_log)
    return float(negated_total.get())
def logsumexp(mat):
    """Compute a log-sum-exp of ``mat`` using the max-shift trick.

    The negated result of ``max_by_axis(mat, 1)`` is added to ``mat``, the
    shifted values are exponentiated and summed with
    ``matrix_sum_out_axis(..., 1)``, and the log of those sums is added to
    the maxima.  A new array holding that sum is returned.
    """
    shift = max_by_axis(mat, 1)
    shifted = add_vec_to_mat(mat, -shift, 0)
    exp_sums = matrix_sum_out_axis(cumath.exp(shifted), 1)
    return shift + cumath.log(exp_sums)
GPU_find_max_in_shrmem(all_l_rhots_gpu, grid=grd, block=blk, shared=int(max_tpb*8)) # Indexes are not contiguous griddimx = int(nclmns / max_tpb) griddimy = int(nsamps) # One thread per sample-time grd = (griddimx, griddimy, 1) blk = (max_tpb, 1, 1) maxes = np.array(all_l_rhots_gpu[:,0][1::nmodes].get()).astype(np.float64) maxes_gpu = gpuarray.to_gpu(maxes) GPU_bcast_vec_to_matrix(all_l_rhots_gpu, -maxes_gpu, grid=grd, block=blk, shared=8) # ***** THIS IS CORRECT AND WORKING UP THROUGH HERE AS OF AUGUST 10TH 2016 ***** ''' Marginalize over Time ''' all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu) # exponentiate GPU_nv_reduc(all_l_rhots_gpu) # sum over time lnL_gpu = maxes_gpu + cumath.log(all_l_rhots_gpu) # TIMES DELTA T FIXME
# Time the host (NumPy) version of the element-wise log.
s = time()
hC = numpy.log(hA)
e = time()
# print(...) with a single parenthesized argument prints the same text on
# Python 2 and Python 3; the bare print statement is a syntax error on 3.
print('serial elapsed time: %f \n' % (e-s))

## device execution
# allocate device arrays
dA = gpuarray.to_gpu(hA)
dB = gpuarray.to_gpu(hB)
dC = gpuarray.to_gpu(hC)

# Time the device version.
# NOTE(review): if cumath.log launches its kernel asynchronously, this
# measures little more than the launch overhead -- confirm, and synchronize
# before reading the clock if a real execution time is wanted.
s = time()
dC = cumath.log(dA)
e = time()
print('gpu elapsed time: %f \n' % (e-s))

###################
# 3) elementwise kernel
# performs array operations much faster than gpu_array
print('\n elementwise kernel\n')
print('---------------------\n')

from pycuda.curandom import rand as curand

# Two device arrays of 1000 random values each.
a_gpu = curand((1000,))
b_gpu = curand((1000,))
def cross_entropy(x, y):
    """Return ``-sum(y * log(x + eps))`` without copying it to the host.

    The element-wise product is passed through ``nan_to_zeros`` with the same
    array as both arguments before it is summed.  The negated result of
    ``gpuarray.sum`` is returned as-is.
    """
    weighted_log = y * cumath.log(x + eps)
    # Same array passed as both arguments to nan_to_zeros.
    nan_to_zeros(weighted_log, weighted_log)
    return -gpuarray.sum(weighted_log)
def get_log_like_val(self, Y):
    """Return ``sum(log(self.outputs + self.eps_tol) * Y)``, capped at ``10**20``.

    The sum is computed with ``gpuarray.sum`` and copied to the host; the
    smaller of that value and ``10**20`` is returned via ``np.min``.
    """
    log_outputs = cumath.log(self.outputs + self.eps_tol)
    total = gpuarray.sum(log_outputs * Y).get()
    return np.min((total, 10**20))
def rdmd(a_gpu, k=None, p=5, q=1, modes='exact', method_rsvd='standard', return_amplitudes=False, return_vandermonde=False, handle=None):
    """
    Randomized Dynamic Mode Decomposition.

    Dynamic Mode Decomposition (DMD) is a data processing algorithm which
    allows to decompose a matrix `a` in space and time. The matrix `a` is
    decomposed as `a = FBV`, where the columns of `F` contain the dynamic
    modes. The modes are ordered corresponding to the amplitudes stored in
    the diagonal matrix `B`. `V` is a Vandermonde matrix describing the
    temporal evolution.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Real/complex input matrix `a` with dimensions `(m, n)`.
    k : int, optional
        If `k < (n-1)` low-rank Dynamic Mode Decomposition is computed.
        Defaults to `n-1`; must satisfy `1 <= k <= n-1`.
    p : int
        `p` sets the oversampling parameter for rSVD (default=5).
    q : int
        `q` sets the number of power iterations for rSVD (default=1).
    modes : `{'standard', 'exact'}`
        'standard' : uses the standard definition to compute the dynamic
        modes, `F = U * W`.
        'exact' : (default) computes the exact dynamic modes,
        `F = Y * V * (S**-1) * W`.
        The value is lower-cased before it is compared.
    method_rsvd : `{'standard', 'fast'}`
        'standard' : (default) Standard algorithm as described in [1, 2]
        'fast' : Version II algorithm as described in [2]
    return_amplitudes : bool `{True, False}`
        True: return amplitudes in addition to dynamic modes.
    return_vandermonde : bool `{True, False}`
        True: return Vandermonde matrix in addition to dynamic modes and
        amplitudes.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `skcuda.misc._global_cublas_handle` is used.

    Returns
    -------
    f_gpu : pycuda.gpuarray.GPUArray
        Matrix containing the dynamic modes of shape `(m, n-1)` or `(m, k)`.
    b_gpu : pycuda.gpuarray.GPUArray
        1-D array containing the amplitudes of length `min(n-1, k)`.
        Only returned if `return_amplitudes` is true.
    v_gpu : pycuda.gpuarray.GPUArray
        Vandermonde matrix of shape `(n-1, n-1)` or `(k, n-1)`.
        Only returned if `return_vandermonde` is true.
    omega_gpu : pycuda.gpuarray.GPUArray
        `cumath.log` of `w_gpu`, the second output of `linalg.eig`.
        Always returned, as the last element of the tuple.

    Raises
    ------
    NotImplementedError
        If CULA is not available (`_has_cula` is false).
    ValueError
        If the dtype of `a_gpu` is unsupported, if `k` is outside
        `[1, n-1]`, or if `modes` is neither 'exact' nor 'standard'.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    Arrays are assumed to be stored in column-major order, i.e., order='F'.

    References
    ----------
    N. B. Erichson and C. Donovan.
    "Randomized Low-Rank Dynamic Mode Decomposition for Motion Detection"
    Under Review.

    N. Halko, P. Martinsson, and J. Tropp.
    "Finding structure with randomness: probabilistic
    algorithms for constructing approximate matrix
    decompositions" (2009).
    (available at `arXiv <http://arxiv.org/abs/0909.4061>`_).

    J. H. Tu, et al.
    "On dynamic mode decomposition: theory and applications."
    arXiv preprint arXiv:1312.0041 (2013).

    Examples
    --------
    >>> #Numpy
    >>> import numpy as np
    >>> #Plot libs
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.mplot3d import Axes3D
    >>> from matplotlib import cm
    >>> #GPU DMD libs
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> from skcuda import linalg, rlinalg
    >>> linalg.init()
    >>> rlinalg.init()

    >>> # Define time and space discretizations
    >>> x=np.linspace( -15, 15, 200)
    >>> t=np.linspace(0, 8*np.pi , 80)
    >>> dt=t[2]-t[1]
    >>> X, T = np.meshgrid(x,t)
    >>> # Create two spatio-temporal patterns
    >>> F1 = 0.5* np.cos(X)*(1.+0.* T)
    >>> F2 = ( (1./np.cosh(X)) * np.tanh(X)) *(2.*np.exp(1j*2.8*T))
    >>> # Add both signals
    >>> F = (F1+F2)

    >>> #Plot dataset
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(231, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=True)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F')
    >>> ax = fig.add_subplot(232, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F1, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1')
    >>> ax = fig.add_subplot(233, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F2, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2')

    >>> #Dynamic Mode Decomposition
    >>> F_gpu = np.array(F.T, np.complex64, order='F')
    >>> F_gpu = gpuarray.to_gpu(F_gpu)
    >>> Fmodes_gpu, b_gpu, V_gpu, omega_gpu = rlinalg.rdmd(F_gpu, k=2, p=0, q=1, modes='exact', return_amplitudes=True, return_vandermonde=True)
    >>> omega = omega_gpu.get()
    >>> plt.scatter(omega.real, omega.imag, marker='o', c='r')

    >>> #Recover original signal
    >>> F1tilde = np.dot(Fmodes_gpu[:,0:1].get() , np.dot(b_gpu[0].get(), V_gpu[0:1,:].get() ) )
    >>> F2tilde = np.dot(Fmodes_gpu[:,1:2].get() , np.dot(b_gpu[1].get(), V_gpu[1:2,:].get() ) )

    >>> #Plot DMD modes
    >>> #Mode 0
    >>> ax = fig.add_subplot(235, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F1tilde.shape[1],:], T[0:F1tilde.shape[1],:], F1tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1_tilde')
    >>> #Mode 1
    >>> ax = fig.add_subplot(236, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F2tilde.shape[1],:], T[0:F2tilde.shape[1],:], F2tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2_tilde')
    >>> plt.show()
    """

    #*************************************************************************
    #***        Author: N. Benjamin Erichson <*****@*****.**>              ***
    #***                              <2015>                               ***
    #***                       License: BSD 3 clause                       ***
    #*************************************************************************

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    if handle is None:
        handle = misc._global_cublas_handle

    alloc = misc._global_cublas_allocator

    # The free version of CULA only supports single precision floating
    data_type = a_gpu.dtype.type
    real_type = np.float32

    # Select the cuBLAS/CULA routines matching the input dtype.
    # NOTE(review): cula_func_gesvd, isreal and real_type are assigned here
    # but never read again in this function.
    if data_type == np.complex64:
        cula_func_gesvd = cula.culaDeviceCgesvd
        cublas_func_gemm = cublas.cublasCgemm
        cublas_func_dgmm = cublas.cublasCdgmm
        cula_func_gels = cula.culaDeviceCgels
        copy_func = cublas.cublasCcopy
        transpose_func = cublas.cublasCgeam
        alpha = np.complex64(1.0)
        beta = np.complex64(0.0)
        TRANS_type = 'C'
        isreal = False
    elif data_type == np.float32:
        cula_func_gesvd = cula.culaDeviceSgesvd
        cublas_func_gemm = cublas.cublasSgemm
        cublas_func_dgmm = cublas.cublasSdgmm
        cula_func_gels = cula.culaDeviceSgels
        copy_func = cublas.cublasScopy
        transpose_func = cublas.cublasSgeam
        alpha = np.float32(1.0)
        beta = np.float32(0.0)
        TRANS_type = 'T'
        isreal = True
    else:
        # Any dtype other than complex64/float32 lands here; without the
        # 'standard' toolkit it is reported as a double-precision error.
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func_gesvd = cula.culaDeviceZgesvd
                cublas_func_gemm = cublas.cublasZgemm
                cublas_func_dgmm = cublas.cublasZdgmm
                cula_func_gels = cula.culaDeviceZgels
                copy_func = cublas.cublasZcopy
                transpose_func = cublas.cublasZgeam
                alpha = np.complex128(1.0)
                beta = np.complex128(0.0)
                TRANS_type = 'C'
                isreal = False
            elif data_type == np.float64:
                cula_func_gesvd = cula.culaDeviceDgesvd
                cublas_func_gemm = cublas.cublasDgemm
                cublas_func_dgmm = cublas.cublasDdgmm
                cula_func_gels = cula.culaDeviceDgels
                copy_func = cublas.cublasDcopy
                transpose_func = cublas.cublasDgeam
                alpha = np.float64(1.0)
                beta = np.float64(0.0)
                TRANS_type = 'T'
                isreal = True
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    #CUDA assumes that arrays are stored in column-major order
    m, n = np.array(a_gpu.shape, int)
    nx = n-1
    #Set k (defaults to the full number of snapshot pairs, n-1)
    if k == None : k = nx
    if k > nx or k < 1:
        raise ValueError('k is not valid')

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Split data into left and right snapshot sequence
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Note: we need a copy of X_gpu, because SVD destroys X_gpu
    #While Y_gpu is just a pointer
    X_gpu = gpuarray.empty((m, n), data_type, order="F", allocator=alloc)
    copy_func(handle, X_gpu.size, int(a_gpu.gpudata), 1, int(X_gpu.gpudata), 1)
    X_gpu = X_gpu[:, :nx]
    Y_gpu = a_gpu[:, 1:]

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Randomized Singular Value Decomposition
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    U_gpu, s_gpu, Vh_gpu = rsvd(X_gpu, k=k, p=p, q=q, method=method_rsvd, handle=handle)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Solve the LS problem to find estimate for M using the pseudo-inverse
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #real: M = U.T * Y * Vt.T * S**-1
    #complex: M = U.H * Y * Vt.H * S**-1
    #Let G = Y * Vt.H * S**-1, hence M = U.H * G
    #Allocate G and M
    G_gpu = gpuarray.empty((m,k), data_type, order="F", allocator=alloc)
    M_gpu = gpuarray.empty((k,k), data_type, order="F", allocator=alloc)

    #i) s = s **-1 (inverse)
    if data_type == np.complex64 or data_type == np.complex128:
        s_gpu = 1/s_gpu
        # Adding a zero imaginary part promotes s_gpu to a complex array,
        # presumably so it matches the complex dgmm call below.
        s_gpu = s_gpu + 1j * gpuarray.zeros_like(s_gpu)
    else:
        s_gpu = 1.0/s_gpu

    #ii) real/complex: scale Vs = Vt* x diag(s**-1)
    Vs_gpu = gpuarray.empty((nx,k), data_type, order="F", allocator=alloc)
    # Leading dimensions derived from the column stride in elements.
    lda = max(1, Vh_gpu.strides[1] // Vh_gpu.dtype.itemsize)
    ldb = max(1, Vs_gpu.strides[1] // Vs_gpu.dtype.itemsize)
    # geam with alpha=1, beta=0: writes the (conjugate) transpose of Vh into Vs.
    transpose_func(handle, TRANS_type, TRANS_type, nx, k, alpha, int(Vh_gpu.gpudata), lda, beta, int(Vh_gpu.gpudata), lda, int(Vs_gpu.gpudata), ldb)
    # dgmm in 'r' mode: Vs is both input and output, scaled by s_gpu.
    cublas_func_dgmm(handle, 'r', nx, k, int(Vs_gpu.gpudata), nx, int(s_gpu.gpudata), 1 , int(Vs_gpu.gpudata), nx)

    #iii) real: G = Y * Vs , complex: G = Y x Vs
    cublas_func_gemm(handle, 'n', 'n', m, k, nx, alpha, int(Y_gpu.gpudata), m, int(Vs_gpu.gpudata), nx, beta, int(G_gpu.gpudata), m )

    #iv) real/complex: M = U* x G
    cublas_func_gemm(handle, TRANS_type, 'n', k, k, m, alpha, int(U_gpu.gpudata), m, int(G_gpu.gpudata), m, beta, int(M_gpu.gpudata), k )

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Eigen Decomposition
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Note: If a_gpu is real the imag part is omitted
    Vr_gpu, w_gpu = linalg.eig(M_gpu, 'N', 'V', 'F')
    omega = cumath.log(w_gpu)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute DMD Modes
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    F_gpu = gpuarray.empty((m,k), data_type, order="F", allocator=alloc)
    modes = modes.lower()
    # NOTE(review): in both branches the gemm output buffer is the same as
    # its first input (G_gpu / U_gpu), and gpudata is passed without int()
    # unlike the calls above -- confirm both are intended.
    if modes == 'exact':
        #Compute (exact) DMD modes: F = Y * V * S**-1 * W = G * W
        cublas_func_gemm(handle, 'n', 'n', m, k, k, alpha, G_gpu.gpudata, m, Vr_gpu.gpudata, k, beta, G_gpu.gpudata, m )
        F_gpu_temp = G_gpu
    elif modes == 'standard':
        #Compute (standard) DMD modes: F = U * W
        cublas_func_gemm(handle, 'n', 'n', m, k, k, alpha, U_gpu.gpudata, m, Vr_gpu.gpudata, k, beta, U_gpu.gpudata, m )
        F_gpu_temp = U_gpu
    else:
        raise ValueError('Type of modes is not supported, choose "exact" or "standard".')

    #Copy is required, because gels destroys input
    copy_func(handle, F_gpu_temp.size, int(F_gpu_temp.gpudata), 1, int(F_gpu.gpudata), 1)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute amplitudes b using least-squares: Fb=x1
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_amplitudes==True:
        #x1_gpu = a_gpu[:,0].copy()
        x1_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc)
        copy_func(handle, x1_gpu.size, int(a_gpu[:,0].gpudata), 1, int(x1_gpu.gpudata), 1)
        # gels is handed x1_gpu as its right-hand side; that same array is
        # then used as the amplitudes (sliced to its first k entries on return).
        cula_func_gels( 'N', m, k, int(1) , F_gpu_temp.gpudata, m, x1_gpu.gpudata, m)
        b_gpu = x1_gpu

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute Vandermonde matrix
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_vandermonde==True:
        V_gpu = linalg.vander(w_gpu, n=nx)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Return (omega is always the last element)
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_amplitudes==True and return_vandermonde==True:
        return F_gpu, b_gpu[:k], V_gpu, omega
    elif return_amplitudes==True and return_vandermonde==False:
        return F_gpu, b_gpu[:k], omega
    elif return_amplitudes==False and return_vandermonde==True:
        return F_gpu, V_gpu, omega
    else:
        return F_gpu, omega
def marginalize_all_lnL(mod, all_l_rhots_gpu, nmodes, nsamps, ntimes, nclmns, delta_t):
    """Marginalize the likelihood time series over time on the GPU.

    Parameters
    ----------
    mod :
        Compiled module exposing the globals ``nmodes``, ``nsamps``,
        ``ntimes``, ``nclmns`` and the kernels ``find_max_in_shrmem``,
        ``nv_reduc`` and ``bcast_vec_to_matrix``.
    all_l_rhots_gpu :
        Device matrix the kernels operate on.
    nmodes, nsamps, ntimes, nclmns :
        Problem dimensions; uploaded to the module globals as int32.
    delta_t :
        Factor the summed exponentials are multiplied by before the log.

    Returns
    -------
    Host copy of ``maxes + log(sums * delta_t)``.
    """
    threads_per_block = 1024

    # Recopy constants into device constant memory: look up every global
    # first, then upload the values in the same order.
    constants = (
        ("nmodes", nmodes),
        ("nsamps", nsamps),
        ("ntimes", ntimes),
        ("nclmns", nclmns),
    )
    device_ptrs = [mod.get_global(label)[0] for label, _ in constants]
    for ptr, (_, value) in zip(device_ptrs, constants):
        cuda.memcpy_htod(ptr, np.array(value, ndmin=1).astype(np.int32))

    # Get GPU functions
    find_max = mod.get_function("find_max_in_shrmem")
    reduce_sum = mod.get_function("nv_reduc")
    bcast_vec = mod.get_function("bcast_vec_to_matrix")

    def _pow2_at_least(x):
        return 2**(x - 1).bit_length()

    # First-pass geometry: one thread per sample-time.
    blocks_x = int(nclmns / threads_per_block)
    blocks_y = int(nsamps)
    wide_grid = (blocks_x, blocks_y, 1)
    wide_block = (threads_per_block, 1, 1)

    # Second-pass geometry: only need as many threads as we had blocks in
    # the x dimension.
    pass2_threads = _pow2_at_least(blocks_x)
    narrow_grid = (1, blocks_y, 1)
    narrow_block = (pass2_threads, 1, 1)

    print("Finding Maximum...\n")

    # Get the maxes
    find_max(all_l_rhots_gpu, grid=wide_grid, block=wide_block,
             shared=int(threads_per_block * 8))
    # Second reduction - this works as long as we don't have rhoTS longer
    # than 1024^2
    find_max(all_l_rhots_gpu, grid=narrow_grid, block=narrow_block,
             shared=int(pass2_threads * 8))

    # Collect the maxes through the host
    maxes = np.array(
        all_l_rhots_gpu[:, 0][nmodes - 2::nmodes].get()).astype(np.float64)
    maxes_gpu = gpuarray.to_gpu(maxes)

    bcast_vec(all_l_rhots_gpu, -maxes_gpu, grid=wide_grid, block=wide_block,
              shared=8)

    # Exponentiating a bunch of zeros creates a bunch of extra ones that we
    # don't want in our sum, so this is the number we need to subtract out
    # to offset it
    padwidth = nclmns - ntimes

    all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu)  # exponentiate

    print("Reducing final answer...\n")

    # Sum over time, in two passes with the same geometries as above.
    reduce_sum(all_l_rhots_gpu, grid=wide_grid, block=wide_block,
               shared=threads_per_block * 8)
    reduce_sum(all_l_rhots_gpu, grid=narrow_grid, block=narrow_block,
               shared=pass2_threads * 8)

    sums = (all_l_rhots_gpu[:, 0][nmodes - 1::nmodes].get()
            - padwidth).astype(np.float64)
    lnL_gpu = gpuarray.to_gpu(sums)
    lnL_gpu = maxes_gpu + cumath.log(lnL_gpu * delta_t)

    return lnL_gpu.get()
grid=grd, block=blk, shared=int(max_tpb * 8)) # Indexes are not contiguous griddimx = int(nclmns / max_tpb) griddimy = int(nsamps) # One thread per sample-time grd = (griddimx, griddimy, 1) blk = (max_tpb, 1, 1) maxes = np.array(all_l_rhots_gpu[:, 0][1::nmodes].get()).astype(np.float64) maxes_gpu = gpuarray.to_gpu(maxes) GPU_bcast_vec_to_matrix(all_l_rhots_gpu, -maxes_gpu, grid=grd, block=blk, shared=8) # ***** THIS IS CORRECT AND WORKING UP THROUGH HERE AS OF AUGUST 10TH 2016 ***** ''' Marginalize over Time ''' all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu) # exponentiate GPU_nv_reduc(all_l_rhots_gpu) # sum over time lnL_gpu = maxes_gpu + cumath.log(all_l_rhots_gpu) # TIMES DELTA T FIXME
def cross_entropy_logistic(x, y):
    """Return ``-sum(y * log(x + eps) + (1 - y) * log(1 - x + eps))``.

    The negated result of ``gpuarray.sum`` is returned as-is; it is not
    copied to the host.
    """
    positive_term = y * cumath.log(x + eps)
    negative_term = (1. - y) * cumath.log(1. - x + eps)
    return -gpuarray.sum(positive_term + negative_term)
def marginalize_all_lnL(mod, all_l_rhots_gpu, nmodes, nsamps, ntimes, nclmns, delta_t):
    """Marginalize the likelihood time series over time on the GPU.

    Parameters
    ----------
    mod :
        Compiled module exposing the globals ``nmodes``, ``nsamps``,
        ``ntimes``, ``nclmns`` and the kernels ``find_max_in_shrmem``,
        ``nv_reduc`` and ``bcast_vec_to_matrix``.
    all_l_rhots_gpu :
        Device matrix the kernels operate on.
    nmodes, nsamps, ntimes, nclmns :
        Problem dimensions; uploaded to the module globals as int32.
    delta_t :
        Factor the summed exponentials are multiplied by before the log.

    Returns
    -------
    Host copy of ``maxes + log(sums * delta_t)``.
    """
    threads_per_block = 1024

    # Recopy constants into device constant memory: look up every global
    # first, then upload the values in the same order.
    dimension_values = (
        ("nmodes", nmodes),
        ("nsamps", nsamps),
        ("ntimes", ntimes),
        ("nclmns", nclmns),
    )
    global_ptrs = [mod.get_global(label)[0] for label, _ in dimension_values]
    for ptr, (_, value) in zip(global_ptrs, dimension_values):
        cuda.memcpy_htod(ptr, np.array(value, ndmin=1).astype(np.int32))

    # Get GPU functions
    max_kernel = mod.get_function("find_max_in_shrmem")
    sum_kernel = mod.get_function("nv_reduc")
    bcast_kernel = mod.get_function("bcast_vec_to_matrix")

    def _pow2_at_least(x):
        return 2**(x - 1).bit_length()

    # First-pass geometry: one thread per sample-time.
    blocks_x = int(nclmns / threads_per_block)
    blocks_y = int(nsamps)
    full_grid = (blocks_x, blocks_y, 1)
    full_block = (threads_per_block, 1, 1)

    # Second-pass geometry: only need as many threads as we had blocks in
    # the x dimension.
    second_pass_threads = _pow2_at_least(blocks_x)
    column_grid = (1, blocks_y, 1)
    column_block = (second_pass_threads, 1, 1)

    print("Finding Maximum...\n")

    # Get the maxes
    max_kernel(all_l_rhots_gpu, grid=full_grid, block=full_block,
               shared=int(threads_per_block * 8))
    # Second reduction - this works as long as we don't have rhoTS longer
    # than 1024^2
    max_kernel(all_l_rhots_gpu, grid=column_grid, block=column_block,
               shared=int(second_pass_threads * 8))

    # Collect the maxes through the host
    maxes = np.array(
        all_l_rhots_gpu[:, 0][nmodes - 2::nmodes].get()).astype(np.float64)
    maxes_gpu = gpuarray.to_gpu(maxes)

    bcast_kernel(all_l_rhots_gpu, -maxes_gpu, grid=full_grid,
                 block=full_block, shared=8)

    # Exponentiating a bunch of zeros creates a bunch of extra ones that we
    # don't want in our sum, so this is the number we need to subtract out
    # to offset it
    padwidth = nclmns - ntimes

    all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu)  # exponentiate

    print("Reducing final answer...\n")

    # Sum over time, in two passes with the same geometries as above.
    sum_kernel(all_l_rhots_gpu, grid=full_grid, block=full_block,
               shared=threads_per_block * 8)
    sum_kernel(all_l_rhots_gpu, grid=column_grid, block=column_block,
               shared=second_pass_threads * 8)

    time_sums = (all_l_rhots_gpu[:, 0][nmodes - 1::nmodes].get()
                 - padwidth).astype(np.float64)
    lnL_gpu = gpuarray.to_gpu(time_sums)
    lnL_gpu = maxes_gpu + cumath.log(lnL_gpu * delta_t)

    return lnL_gpu.get()
def cdmd(a_gpu, k=None, c=None, modes="exact", return_amplitudes=False, return_vandermonde=False, handle=None): """ Compressed Dynamic Mode Decomposition. Dynamic Mode Decomposition (DMD) is a data processing algorithm which allows to decompose a matrix `a` in space and time. The matrix `a` is decomposed as `a = FBV`, where the columns of `F` contain the dynamic modes. The modes are ordered corresponding to the amplitudes stored in the diagonal matrix `B`. `V` is a Vandermonde matrix describing the temporal evolution. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Real/complex input matrix `a` with dimensions `(m, n)`. k : int, optional If `k < (n-1)` low-rank Dynamic Mode Decomposition is computed. c : int `p` sets the number of measurements sensors. modes : `{'exact'}` 'exact' : computes the exact dynamic modes, `F = Y * V * (S**-1) * W`. return_amplitudes : bool `{True, False}` True: return amplitudes in addition to dynamic modes. return_vandermonde : bool `{True, False}` True: return Vandermonde matrix in addition to dynamic modes and amplitudes. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- f_gpu : pycuda.gpuarray.GPUArray Matrix containing the dynamic modes of shape `(m, n-1)` or `(m, k)`. b_gpu : pycuda.gpuarray.GPUArray 1-D array containing the amplitudes of length `min(n-1, k)`. v_gpu : pycuda.gpuarray.GPUArray Vandermonde matrix of shape `(n-1, n-1)` or `(k, n-1)`. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. Arrays are assumed to be stored in column-major order, i.e., order='F'. References ---------- S. L. Brunton, et al. "Compressed sampling and dynamic mode decomposition." arXiv preprint arXiv:1312.5186 (2013). J. H. Tu, et al. "On dynamic mode decomposition: theory and applications." arXiv preprint arXiv:1312.0041 (2013). 
Examples -------- >>> #Numpy >>> import numpy as np >>> #Plot libs >>> import matplotlib.pyplot as plt >>> from mpl_toolkits.mplot3d import Axes3D >>> from matplotlib import cm >>> #GPU DMD libs >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> from skcuda import linalg, rlinalg >>> linalg.init() >>> rlinalg.init() >>> # Define time and space discretizations >>> x=np.linspace( -15, 15, 200) >>> t=np.linspace(0, 8*np.pi , 80) >>> dt=t[2]-t[1] >>> X, T = np.meshgrid(x,t) >>> # Create two patio-temporal patterns >>> F1 = 0.5* np.cos(X)*(1.+0.* T) >>> F2 = ( (1./np.cosh(X)) * np.tanh(X)) *(2.*np.exp(1j*2.8*T)) >>> # Add both signals >>> F = (F1+F2) >>> #Plot dataset >>> fig = plt.figure() >>> ax = fig.add_subplot(231, projection='3d') >>> ax = fig.gca(projection='3d') >>> surf = ax.plot_surface(X, T, F, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=True) >>> ax.set_zlim(-1, 1) >>> plt.title('F') >>> ax = fig.add_subplot(232, projection='3d') >>> ax = fig.gca(projection='3d') >>> surf = ax.plot_surface(X, T, F1, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False) >>> ax.set_zlim(-1, 1) >>> plt.title('F1') >>> ax = fig.add_subplot(233, projection='3d') >>> ax = fig.gca(projection='3d') >>> surf = ax.plot_surface(X, T, F2, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False) >>> ax.set_zlim(-1, 1) >>> plt.title('F2') >>> #Dynamic Mode Decomposition >>> F_gpu = np.array(F.T, np.complex64, order='F') >>> F_gpu = gpuarray.to_gpu(F_gpu) >>> Fmodes_gpu, b_gpu, V_gpu, omega_gpu = rlinalg.cdmd(F_gpu, k=2, c=20, modes='exact', return_amplitudes=True, return_vandermonde=True) >>> omega = omega_gpu.get() >>> plt.scatter(omega.real, omega.imag, marker='o', c='r') >>> #Recover original signal >>> F1tilde = np.dot(Fmodes_gpu[:,0:1].get() , np.dot(b_gpu[0].get(), V_gpu[0:1,:].get() ) ) >>> F2tilde = np.dot(Fmodes_gpu[:,1:2].get() , np.dot(b_gpu[1].get(), V_gpu[1:2,:].get() ) ) >>> # Plot DMD modes >>> #Mode 0 
>>> ax = fig.add_subplot(235, projection='3d') >>> ax = fig.gca(projection='3d') >>> surf = ax.plot_surface(X[0:F1tilde.shape[1],:], T[0:F1tilde.shape[1],:], F1tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False) >>> ax.set_zlim(-1, 1) >>> plt.title('F1_tilde') >>> #Mode 1 >>> ax = fig.add_subplot(236, projection='3d') >>> ax = fig.gca(projection='3d') >>> surf = ax.plot_surface(X[0:F2tilde.shape[1],:], T[0:F2tilde.shape[1],:], F2tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False) >>> ax.set_zlim(-1, 1) >>> plt.title('F2_tilde') >>> plt.show() """ # ************************************************************************* # *** Author: N. Benjamin Erichson <*****@*****.**> *** # *** <2015> *** # *** License: BSD 3 clause *** # ************************************************************************* if not _has_cula: raise NotImplementedError("CULA not installed") if handle is None: handle = misc._global_cublas_handle alloc = misc._global_cublas_allocator # The free version of CULA only supports single precision floating data_type = a_gpu.dtype.type real_type = np.float32 if data_type == np.complex64: cula_func_gesvd = cula.culaDeviceCgesvd cublas_func_gemm = cublas.cublasCgemm cublas_func_dgmm = cublas.cublasCdgmm cula_func_gels = cula.culaDeviceCgels copy_func = cublas.cublasCcopy transpose_func = cublas.cublasCgeam alpha = np.complex64(1.0) beta = np.complex64(0.0) TRANS_type = "C" isreal = False elif data_type == np.float32: cula_func_gesvd = cula.culaDeviceSgesvd cublas_func_gemm = cublas.cublasSgemm cublas_func_dgmm = cublas.cublasSdgmm cula_func_gels = cula.culaDeviceSgels copy_func = cublas.cublasScopy transpose_func = cublas.cublasSgeam alpha = np.float32(1.0) beta = np.float32(0.0) TRANS_type = "T" isreal = True else: if cula._libcula_toolkit == "standard": if data_type == np.complex128: cula_func_gesvd = cula.culaDeviceZgesvd cublas_func_gemm = cublas.cublasZgemm cublas_func_dgmm = 
cublas.cublasZdgmm cula_func_gels = cula.culaDeviceZgels copy_func = cublas.cublasZcopy transpose_func = cublas.cublasZgeam alpha = np.complex128(1.0) beta = np.complex128(0.0) TRANS_type = "C" isreal = False elif data_type == np.float64: cula_func_gesvd = cula.culaDeviceDgesvd cublas_func_gemm = cublas.cublasDgemm cublas_func_dgmm = cublas.cublasDdgmm cula_func_gels = cula.culaDeviceDgels copy_func = cublas.cublasDcopy transpose_func = cublas.cublasDgeam alpha = np.float64(1.0) beta = np.float64(0.0) TRANS_type = "T" isreal = True else: raise ValueError("unsupported type") real_type = np.float64 else: raise ValueError("double precision not supported") # CUDA assumes that arrays are stored in column-major order m, n = np.array(a_gpu.shape, int) nx = n - 1 # Set k if k == None: k = nx if k > nx or k < 1: raise ValueError("k is not valid") # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Compress # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if c == None: Ac_gpu = A c = m else: # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Generate a random sensing matrix S # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if isreal == False: Simag_gpu = gpuarray.empty((m, c), real_type, order="F", allocator=alloc) Sreal_gpu = gpuarray.empty((m, c), real_type, order="F", allocator=alloc) S_gpu = gpuarray.empty((c, m), data_type, order="F", allocator=alloc) rand.fill_uniform(Simag_gpu) rand.fill_uniform(Sreal_gpu) S_gpu = Sreal_gpu + 1j * Simag_gpu S_gpu = S_gpu.T * 2 - 1 # Scale to [-1,1] else: S_gpu = gpuarray.empty((c, m), real_type, order="F", allocator=alloc) rand.fill_uniform(S_gpu) # Draw random samples from a ~ Uniform(-1,1) distribution S_gpu = S_gpu * 2 - 1 # Scale to [-1,1] # Allocate Ac Ac_gpu = gpuarray.empty((c, n), data_type, order="F", allocator=alloc) # Compress input matrix cublas_func_gemm( handle, "n", "n", c, n, m, alpha, int(S_gpu.gpudata), c, 
int(a_gpu.gpudata), m, beta, int(Ac_gpu.gpudata), c ) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Split data into lef and right snapshot sequence # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Note: we need a copy of X_gpu, because SVD destroys X_gpu # While Y_gpu is just a pointer X_gpu = gpuarray.empty((c, n), data_type, order="F", allocator=alloc) copy_func(handle, X_gpu.size, int(Ac_gpu.gpudata), 1, int(X_gpu.gpudata), 1) X_gpu = X_gpu[:, :nx] Y_gpu = Ac_gpu[:, 1:] Yorig_gpu = a_gpu[:, 1:] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Singular Value Decomposition # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Allocate s, U, Vt for economic SVD # Note: singular values are always real min_s = min(nx, c) s_gpu = gpuarray.zeros(min_s, real_type, order="F", allocator=alloc) U_gpu = gpuarray.zeros((c, min_s), data_type, order="F", allocator=alloc) Vh_gpu = gpuarray.zeros((min_s, nx), data_type, order="F", allocator=alloc) # Economic SVD cula_func_gesvd( "S", "S", c, nx, int(X_gpu.gpudata), c, int(s_gpu.gpudata), int(U_gpu.gpudata), c, int(Vh_gpu.gpudata), min_s ) # Low-rank DMD: trancate SVD if k < nx if k != nx: s_gpu = s_gpu[:k] U_gpu = U_gpu[:, :k] Vh_gpu = Vh_gpu[:k, :] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Solve the LS problem to find estimate for M using the pseudo-inverse # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # real: M = U.T * Y * Vt.T * S**-1 # complex: M = U.H * Y * Vt.H * S**-1 # Let G = Y * Vt.H * S**-1, hence M = M * G # Allocate G and M G_gpu = gpuarray.zeros((c, k), data_type, order="F", allocator=alloc) M_gpu = gpuarray.zeros((k, k), data_type, order="F", allocator=alloc) # i) s = s **-1 (inverse) if data_type == np.complex64 or data_type == np.complex128: s_gpu = 1 / s_gpu s_gpu = s_gpu + 1j * gpuarray.zeros_like(s_gpu) else: s_gpu = 1 / s_gpu 
# ii) real/complex: scale Vs = Vt* x diag(s**-1) Vs_gpu = gpuarray.zeros((nx, k), data_type, order="F", allocator=alloc) lda = max(1, Vh_gpu.strides[1] // Vh_gpu.dtype.itemsize) ldb = max(1, Vs_gpu.strides[1] // Vs_gpu.dtype.itemsize) transpose_func( handle, TRANS_type, TRANS_type, nx, k, 1.0, int(Vh_gpu.gpudata), lda, 0.0, int(Vh_gpu.gpudata), lda, int(Vs_gpu.gpudata), ldb, ) # End Transpose cublas_func_dgmm(handle, "r", nx, k, int(Vs_gpu.gpudata), nx, int(s_gpu.gpudata), 1, int(Vs_gpu.gpudata), nx) # iii) real: G = Y * Vs , complex: G = Y x Vs cublas_func_gemm( handle, "n", "n", c, k, nx, alpha, int(Y_gpu.gpudata), c, int(Vs_gpu.gpudata), nx, beta, int(G_gpu.gpudata), c ) # iv) real/complex: M = U* x G cublas_func_gemm( handle, TRANS_type, "n", k, k, c, alpha, int(U_gpu.gpudata), c, int(G_gpu.gpudata), c, beta, int(M_gpu.gpudata), k, ) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Eigen Decomposition # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Note: If a_gpu is real the imag part is omitted Vr_gpu, w_gpu = linalg.eig(M_gpu, "N", "V", "F") omega = cumath.log(w_gpu) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Compute DMD Modes # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ F_gpu = gpuarray.empty((m, k), data_type, order="F", allocator=alloc) modes = modes.lower() if modes == "exact": # Compute (exact) DMD modes: F = Y * V * S**-1 * W = G * W cublas_func_gemm( handle, "n", "n", nx, k, k, alpha, int(Vs_gpu.gpudata), nx, int(Vr_gpu.gpudata), k, beta, int(Vs_gpu.gpudata), nx, ) cublas_func_gemm( handle, "n", "n", m, k, nx, alpha, Yorig_gpu.gpudata, m, Vs_gpu.gpudata, nx, beta, F_gpu.gpudata, m ) else: raise ValueError('Type of modes is not supported, choose "exact" or "standard".') # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Compute amplitueds b using least-squares: Fb=x1 # 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if return_amplitudes == True: F_gpu_temp = gpuarray.empty((m, k), data_type, order="F", allocator=alloc) # Copy is required, because gels destroys input copy_func(handle, F_gpu.size, int(F_gpu.gpudata), 1, int(F_gpu_temp.gpudata), 1) # x1_gpu = a_gpu[:,0].copy() x1_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc) copy_func(handle, x1_gpu.size, int(a_gpu[:, 0].gpudata), 1, int(x1_gpu.gpudata), 1) cula_func_gels("N", m, k, int(1), F_gpu_temp.gpudata, m, x1_gpu.gpudata, m) b_gpu = x1_gpu # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Compute Vandermonde matrix (CPU) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if return_vandermonde == True: V_gpu = linalg.vander(w_gpu, n=nx) # Free internal CULA memory: cula.culaFreeBuffers() # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Return # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if return_amplitudes == True and return_vandermonde == True: return F_gpu, b_gpu[:k], V_gpu, omega elif return_amplitudes == True and return_vandermonde == False: return F_gpu, b_gpu[:k], omega elif return_amplitudes == False and return_vandermonde == True: return F_gpu, V_gpu, omega else: return F_gpu, omega
def _conv2x2_relu(src, rows, cols, weights, bias):
    """Sum four shifted windows of `src`, add a bias and clamp at zero.

    The four windows are ``src[0:-1, 0:-1]``, ``src[0:-1, 1:]``,
    ``src[1:, 0:-1]`` and ``src[1:, 1:]`` (all channels).  Each window is
    copied into a contiguous scratch buffer, flattened to
    ``(rows * cols, _numsel)`` and multiplied by its own weight matrix; the
    four products are accumulated into a single result.

    Parameters
    ----------
    src : GPU array indexed as ``src[i, j, :]``
        Input with ``rows + 1`` by ``cols + 1`` entries on its first two axes.
    rows, cols : int
        Extent of each shifted window on the first two axes.
    weights : sequence of four weight matrices
        Given in the window order listed above.
    bias : vector passed to ``msc.add_matvec``.

    Returns
    -------
    The accumulated 2-D result after bias and clamping; its first dimension
    is ``rows * cols``.
    """
    shape3d = [rows, cols, _numsel]
    shape2d = [rows * cols, _numsel]

    head, tail = slice(0, -1), slice(1, None)
    windows = ((head, head), (head, tail), (tail, head), (tail, tail))

    # One scratch buffer is reused for all four windows.
    patch = gp.empty(shape3d, dtype=np.float32)
    acc = None
    for (row_sel, col_sel), w in zip(windows, weights):
        # Copy the strided window into the scratch buffer (3-D view), then
        # flatten it so it can be fed to the matrix product.
        patch = patch.reshape(shape3d)
        patch[...] = src[row_sel, col_sel, :]
        patch = patch.reshape(shape2d)
        if acc is None:
            # The first product creates the accumulator ...
            acc = lg.dot(patch, w)
        else:
            # ... the remaining three are added into it.
            lg.add_dot(patch, w, acc)

    msc.add_matvec(acc, bias, out=acc)
    # Third argument is the output buffer, i.e. the clamp is done in place
    # (assuming `gp` is pycuda.gpuarray -- TODO confirm).
    gp.maximum(acc, 0., acc)
    return acc


def demosaick_gpu(img):
    """Run the two-path demosaicking network on the GPU.

    The image is uploaded with ``gp.to_gpu``.  Two sets of patches are
    extracted with ``im2col``: one from the image as uploaded (path 2) and
    one from its element-wise ``log(img + _eps)`` (path 1).  Path 1 is a
    matrix product, ``exp``, and a second matrix product plus bias.  Path 2
    is a matrix product + bias + clamp followed by two stages that each
    combine four shifted windows (see ``_conv2x2_relu``) and a final matrix
    product + bias + clamp.  The two paths are multiplied element-wise,
    summed along axis 1, clipped to ``[0, 1]`` and handed to ``p2im``.

    Parameters
    ----------
    img : array accepted by ``gp.to_gpu``
        Input image on the host.  Layout and dtype are dictated by
        ``im2col`` and the weights in ``_wts``, which are defined elsewhere
        in this file.

    Returns
    -------
    Whatever ``p2im`` returns for the host copy of the combined result.
    """
    img = gp.to_gpu(img)

    # Order matters: the path-2 patches are taken before `img` is
    # overwritten in place with its logarithm for path 1.
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    wA, hA = p1x.shape[0], p1x.shape[1]
    wB, hB = p2x.shape[0], p2x.shape[1]

    # Path 1
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    cm.exp(p1y, out=p1y)
    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2: (wB, hB) -> (wB - 1, hB - 1)
    p2x = _conv2x2_relu(
        p2y, wB - 1, hB - 1,
        (_wts.c200, _wts.c201, _wts.c210, _wts.c211), _wts.c2b)
    p2x = p2x.reshape([wB - 1, hB - 1, _numsel])

    # conv3: (wB - 1, hB - 1) -> (wB - 2, hB - 2); the result stays 2-D
    # because it feeds the output matrix product directly.
    p2y = _conv2x2_relu(
        p2x, wB - 2, hB - 2,
        (_wts.c300, _wts.c301, _wts.c310, _wts.c311), _wts.c3b)

    # Output layer of path 2, reshaped to line up with path 1.
    p2x = lg.dot(p2y, _wts.sout)
    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(p1x.shape)

    # Combine: element-wise product of both paths, summed along axis 1 and
    # clipped to [0, 1].
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])

    im = p2im(p1.get())
    return im
def cdmd(a_gpu, k=None, c=None, modes='exact', return_amplitudes=False, return_vandermonde=False, handle=None):
    """
    Compressed Dynamic Mode Decomposition.

    Dynamic Mode Decomposition (DMD) is a data processing algorithm which
    allows to decompose a matrix `a` in space and time. The matrix `a` is
    decomposed as `a = FBV`, where the columns of `F` contain the dynamic
    modes. The modes are ordered corresponding to the amplitudes stored in
    the diagonal matrix `B`. `V` is a Vandermonde matrix describing the
    temporal evolution.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Real/complex input matrix `a` with dimensions `(m, n)`.
    k : int, optional
        If `k < (n-1)` low-rank Dynamic Mode Decomposition is computed.
        Defaults to `n-1`.
    c : int, optional
        `c` sets the number of measurement sensors, i.e. the number of rows
        of the compressed matrix. If `c` is None the input is used
        uncompressed (`c = m`).
    modes : `{'exact'}`
        'exact' : computes the exact dynamic modes, `F = Y * V * (S**-1) * W`.
    return_amplitudes : bool `{True, False}`
        True: return amplitudes in addition to dynamic modes.
    return_vandermonde : bool `{True, False}`
        True: return Vandermonde matrix in addition to dynamic modes and
        amplitudes.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `skcuda.misc._global_cublas_handle` is used.

    Returns
    -------
    f_gpu : pycuda.gpuarray.GPUArray
        Matrix containing the dynamic modes of shape `(m, n-1)` or `(m, k)`.
    b_gpu : pycuda.gpuarray.GPUArray
        1-D array containing the amplitudes of length `min(n-1, k)`.
        Only returned if `return_amplitudes` is True.
    v_gpu : pycuda.gpuarray.GPUArray
        Vandermonde matrix of shape `(n-1, n-1)` or `(k, n-1)`.
        Only returned if `return_vandermonde` is True.
    omega_gpu : pycuda.gpuarray.GPUArray
        Element-wise logarithm of the eigenvalues of the reduced operator
        `M`. Always returned, as the last element of the result tuple.

    Raises
    ------
    NotImplementedError
        If CULA is not installed.
    ValueError
        If the dtype of `a_gpu` is unsupported (double precision requires
        the standard CULA toolkit), if `k` is outside `[1, n-1]`, or if
        `modes` is not supported.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    Arrays are assumed to be stored in column-major order, i.e., order='F'.

    References
    ----------
    S. L. Brunton, et al.
    "Compressed sampling and dynamic mode decomposition."
    arXiv preprint arXiv:1312.5186 (2013).

    J. H. Tu, et al.
    "On dynamic mode decomposition: theory and applications."
    arXiv preprint arXiv:1312.0041 (2013).

    Examples
    --------
    >>> #Numpy
    >>> import numpy as np
    >>> #Plot libs
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.mplot3d import Axes3D
    >>> from matplotlib import cm
    >>> #GPU DMD libs
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> from skcuda import linalg, rlinalg
    >>> linalg.init()
    >>> rlinalg.init()

    >>> # Define time and space discretizations
    >>> x=np.linspace( -15, 15, 200)
    >>> t=np.linspace(0, 8*np.pi , 80)
    >>> dt=t[2]-t[1]
    >>> X, T = np.meshgrid(x,t)
    >>> # Create two spatio-temporal patterns
    >>> F1 = 0.5* np.cos(X)*(1.+0.* T)
    >>> F2 = ( (1./np.cosh(X)) * np.tanh(X)) *(2.*np.exp(1j*2.8*T))
    >>> # Add both signals
    >>> F = (F1+F2)

    >>> #Plot dataset
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(231, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=True)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F')
    >>> ax = fig.add_subplot(232, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F1, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1')
    >>> ax = fig.add_subplot(233, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F2, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2')

    >>> #Dynamic Mode Decomposition
    >>> F_gpu = np.array(F.T, np.complex64, order='F')
    >>> F_gpu = gpuarray.to_gpu(F_gpu)
    >>> Fmodes_gpu, b_gpu, V_gpu, omega_gpu = rlinalg.cdmd(F_gpu, k=2, c=20, modes='exact', return_amplitudes=True, return_vandermonde=True)
    >>> omega = omega_gpu.get()
    >>> plt.scatter(omega.real, omega.imag, marker='o', c='r')

    >>> #Recover original signal
    >>> F1tilde = np.dot(Fmodes_gpu[:,0:1].get() , np.dot(b_gpu[0].get(), V_gpu[0:1,:].get() ) )
    >>> F2tilde = np.dot(Fmodes_gpu[:,1:2].get() , np.dot(b_gpu[1].get(), V_gpu[1:2,:].get() ) )

    >>> # Plot DMD modes
    >>> #Mode 0
    >>> ax = fig.add_subplot(235, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F1tilde.shape[1],:], T[0:F1tilde.shape[1],:], F1tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1_tilde')
    >>> #Mode 1
    >>> ax = fig.add_subplot(236, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F2tilde.shape[1],:], T[0:F2tilde.shape[1],:], F2tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2_tilde')
    >>> plt.show()
    """

    #*************************************************************************
    #***        Author: N. Benjamin Erichson <*****@*****.**>              ***
    #***                              <2015>                               ***
    #***                       License: BSD 3 clause                       ***
    #*************************************************************************

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    if handle is None:
        handle = misc._global_cublas_handle

    alloc = misc._global_cublas_allocator

    # The free version of CULA only supports single precision floating
    data_type = a_gpu.dtype.type
    real_type = np.float32

    # Select the CULA/CUBLAS routines matching the input dtype.
    if data_type == np.complex64:
        cula_func_gesvd = cula.culaDeviceCgesvd
        cublas_func_gemm = cublas.cublasCgemm
        cublas_func_dgmm = cublas.cublasCdgmm
        cula_func_gels = cula.culaDeviceCgels
        copy_func = cublas.cublasCcopy
        transpose_func = cublas.cublasCgeam
        alpha = np.complex64(1.0)
        beta = np.complex64(0.0)
        TRANS_type = 'C'
        isreal = False
    elif data_type == np.float32:
        cula_func_gesvd = cula.culaDeviceSgesvd
        cublas_func_gemm = cublas.cublasSgemm
        cublas_func_dgmm = cublas.cublasSdgmm
        cula_func_gels = cula.culaDeviceSgels
        copy_func = cublas.cublasScopy
        transpose_func = cublas.cublasSgeam
        alpha = np.float32(1.0)
        beta = np.float32(0.0)
        TRANS_type = 'T'
        isreal = True
    else:
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func_gesvd = cula.culaDeviceZgesvd
                cublas_func_gemm = cublas.cublasZgemm
                cublas_func_dgmm = cublas.cublasZdgmm
                cula_func_gels = cula.culaDeviceZgels
                copy_func = cublas.cublasZcopy
                transpose_func = cublas.cublasZgeam
                alpha = np.complex128(1.0)
                beta = np.complex128(0.0)
                TRANS_type = 'C'
                isreal = False
            elif data_type == np.float64:
                cula_func_gesvd = cula.culaDeviceDgesvd
                cublas_func_gemm = cublas.cublasDgemm
                cublas_func_dgmm = cublas.cublasDdgmm
                cula_func_gels = cula.culaDeviceDgels
                copy_func = cublas.cublasDcopy
                transpose_func = cublas.cublasDgeam
                alpha = np.float64(1.0)
                beta = np.float64(0.0)
                TRANS_type = 'T'
                isreal = True
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    #CUDA assumes that arrays are stored in column-major order
    m, n = np.array(a_gpu.shape, int)
    nx = n - 1

    #Set k
    if k is None:
        k = nx
    if k > nx or k < 1:
        raise ValueError('k is not valid')

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compress
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if c is None:
        # No compression requested: operate on the input itself.
        # (The original referenced an undefined name `A` here, so the
        # default c=None always raised NameError.)
        Ac_gpu = a_gpu
        c = m
    else:
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #Generate a random sensing matrix S
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        if not isreal:
            Simag_gpu = gpuarray.empty((m, c), real_type, order="F", allocator=alloc)
            Sreal_gpu = gpuarray.empty((m, c), real_type, order="F", allocator=alloc)
            rand.fill_uniform(Simag_gpu)
            rand.fill_uniform(Sreal_gpu)
            S_gpu = Sreal_gpu + 1j * Simag_gpu
            # NOTE(review): `* 2 - 1` shifts only the real part; the
            # imaginary part is scaled by 2 without the shift -- confirm
            # this is intended.
            S_gpu = S_gpu.T * 2 - 1  #Scale to [-1,1]
        else:
            S_gpu = gpuarray.empty((c, m), real_type, order="F", allocator=alloc)
            rand.fill_uniform(S_gpu)  #Draw uniform random samples
            S_gpu = S_gpu * 2 - 1  #Scale to [-1,1]

        #Allocate Ac
        Ac_gpu = gpuarray.empty((c, n), data_type, order="F", allocator=alloc)

        #Compress input matrix: Ac = S * a
        cublas_func_gemm(handle, 'n', 'n', c, n, m, alpha,
                         int(S_gpu.gpudata), c, int(a_gpu.gpudata), m,
                         beta, int(Ac_gpu.gpudata), c)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Split data into left and right snapshot sequence
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Note: we need a copy of X_gpu, because SVD destroys X_gpu
    #While Y_gpu is just a pointer
    X_gpu = gpuarray.empty((c, n), data_type, order="F", allocator=alloc)
    copy_func(handle, X_gpu.size, int(Ac_gpu.gpudata), 1, int(X_gpu.gpudata), 1)
    X_gpu = X_gpu[:, :nx]
    Y_gpu = Ac_gpu[:, 1:]
    Yorig_gpu = a_gpu[:, 1:]

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Singular Value Decomposition
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Allocate s, U, Vt for economic SVD
    #Note: singular values are always real
    min_s = min(nx, c)
    s_gpu = gpuarray.zeros(min_s, real_type, order="F", allocator=alloc)
    U_gpu = gpuarray.zeros((c, min_s), data_type, order="F", allocator=alloc)
    Vh_gpu = gpuarray.zeros((min_s, nx), data_type, order="F", allocator=alloc)

    #Economic SVD
    cula_func_gesvd('S', 'S', c, nx, int(X_gpu.gpudata), c, int(s_gpu.gpudata),
                    int(U_gpu.gpudata), c, int(Vh_gpu.gpudata), min_s)

    #Low-rank DMD: truncate SVD if k < nx
    # NOTE(review): the buffers above hold only min(nx, c) singular
    # triplets; nothing here checks that c >= k -- confirm callers
    # guarantee it.
    if k != nx:
        s_gpu = s_gpu[:k]
        U_gpu = U_gpu[:, :k]
        Vh_gpu = Vh_gpu[:k, :]

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Solve the LS problem to find estimate for M using the pseudo-inverse
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #real: M = U.T * Y * Vt.T * S**-1
    #complex: M = U.H * Y * Vt.H * S**-1
    #Let G = Y * Vt.H * S**-1, hence M = U.H * G
    #Allocate G and M
    G_gpu = gpuarray.zeros((c, k), data_type, order="F", allocator=alloc)
    M_gpu = gpuarray.zeros((k, k), data_type, order="F", allocator=alloc)

    #i) s = s **-1 (inverse)
    s_gpu = 1 / s_gpu
    if not isreal:
        # Promote the real singular values to the complex data type so
        # they can be passed to the complex dgmm routine.
        s_gpu = s_gpu + 1j * gpuarray.zeros_like(s_gpu)

    #ii) real/complex: scale Vs = Vt* x diag(s**-1)
    Vs_gpu = gpuarray.zeros((nx, k), data_type, order="F", allocator=alloc)
    lda = max(1, Vh_gpu.strides[1] // Vh_gpu.dtype.itemsize)
    ldb = max(1, Vs_gpu.strides[1] // Vs_gpu.dtype.itemsize)
    # (Conjugate) transpose Vh into Vs via geam with beta = 0.
    transpose_func(handle, TRANS_type, TRANS_type, nx, k,
                   1.0, int(Vh_gpu.gpudata), lda, 0.0, int(Vh_gpu.gpudata), lda,
                   int(Vs_gpu.gpudata), ldb)
    #End Transpose

    cublas_func_dgmm(handle, 'r', nx, k, int(Vs_gpu.gpudata), nx,
                     int(s_gpu.gpudata), 1, int(Vs_gpu.gpudata), nx)

    #iii) real: G = Y * Vs , complex: G = Y x Vs
    cublas_func_gemm(handle, 'n', 'n', c, k, nx, alpha,
                     int(Y_gpu.gpudata), c, int(Vs_gpu.gpudata), nx,
                     beta, int(G_gpu.gpudata), c)

    #iv) real/complex: M = U* x G
    cublas_func_gemm(handle, TRANS_type, 'n', k, k, c, alpha,
                     int(U_gpu.gpudata), c, int(G_gpu.gpudata), c,
                     beta, int(M_gpu.gpudata), k)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Eigen Decomposition
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Note: If a_gpu is real the imag part is omitted
    Vr_gpu, w_gpu = linalg.eig(M_gpu, 'N', 'V', 'F', lib='cula')
    omega = cumath.log(w_gpu)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute DMD Modes
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    F_gpu = gpuarray.empty((m, k), data_type, order="F", allocator=alloc)
    modes = modes.lower()
    if modes == 'exact':
        #Compute (exact) DMD modes: F = Y * V * S**-1 * W = G * W
        # NOTE(review): this gemm writes its result into Vs_gpu, which is
        # also its left operand -- confirm in-place use is safe for the
        # CUBLAS gemm routines.
        cublas_func_gemm(handle, 'n', 'n', nx, k, k, alpha,
                         int(Vs_gpu.gpudata), nx, int(Vr_gpu.gpudata), k,
                         beta, int(Vs_gpu.gpudata), nx)
        cublas_func_gemm(handle, 'n', 'n', m, k, nx, alpha,
                         Yorig_gpu.gpudata, m, Vs_gpu.gpudata, nx,
                         beta, F_gpu.gpudata, m)
    else:
        # NOTE(review): only 'exact' is handled above although the message
        # also offers "standard"; text kept unchanged for compatibility.
        raise ValueError('Type of modes is not supported, choose "exact" or "standard".')

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute amplitudes b using least-squares: Fb=x1
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_amplitudes:
        F_gpu_temp = gpuarray.empty((m, k), data_type, order="F", allocator=alloc)

        #Copy is required, because gels destroys input
        copy_func(handle, F_gpu.size, int(F_gpu.gpudata), 1, int(F_gpu_temp.gpudata), 1)

        #x1_gpu = a_gpu[:,0].copy()
        x1_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc)
        copy_func(handle, x1_gpu.size, int(a_gpu[:, 0].gpudata), 1, int(x1_gpu.gpudata), 1)
        cula_func_gels('N', m, k, int(1), F_gpu_temp.gpudata, m, x1_gpu.gpudata, m)
        # The solution is written into x1_gpu; its first k entries are
        # returned as the amplitudes below.
        b_gpu = x1_gpu

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute Vandermonde matrix
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_vandermonde:
        V_gpu = linalg.vander(w_gpu, n=nx)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Return
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_amplitudes and return_vandermonde:
        return F_gpu, b_gpu[:k], V_gpu, omega
    elif return_amplitudes:
        return F_gpu, b_gpu[:k], omega
    elif return_vandermonde:
        return F_gpu, V_gpu, omega
    else:
        return F_gpu, omega
def random_normal(loc=0.0, scale=1.0, size=None):
    """Draw normal samples on the GPU with the Box-Muller transform.

    Two arrays of uniform float64 samples are generated with
    ``curandom.rand`` and combined as ``sqrt(-2 * log(u1)) * cos(2*pi*u2)``;
    the result is scaled by `scale`, shifted by `loc` and wrapped in a
    ``CUDAArray``.

    Parameters
    ----------
    loc : shift added to the standard-normal samples.
    scale : factor applied to the standard-normal samples.
    size : shape forwarded unchanged to ``curandom.rand``.
    """
    uniform_a = curandom.rand(size, dtype=numpy.float64)
    uniform_b = curandom.rand(size, dtype=numpy.float64)

    radius = cumath.sqrt(-2. * cumath.log(uniform_a))
    direction = cumath.cos(2. * numpy.pi * uniform_b)
    standard = radius * direction

    return CUDAArray(scale * standard + loc)
def log(self):
    """Return a new ``CUDAArray`` wrapping ``cumath.log(self.arr)``."""
    logged = cumath.log(self.arr)
    return CUDAArray(logged)
def cauchy_prior_log_den(beta):
    """Return the sum of ``-log(1 + beta**2)`` over all elements of `beta`.

    The element-wise terms are computed with ``cumath.log`` and reduced
    with ``gpuarray.sum``; the result of calling ``.get()`` on that sum is
    returned.  No additive normalising constant is included.
    """
    squared = beta * beta
    neg_log_terms = -cumath.log(1 + squared)
    total = gpuarray.sum(neg_log_terms)
    return total.get()