Ejemplo n.º 1
0
 def test_log(self):
     """Check that ``cumath.log`` matches ``math.log`` element by element.

     An array of ``test_sample`` elements is filled with ``fill_arange()``
     and shifted by one before the logarithm is taken (presumably so that
     no zero reaches ``log`` -- confirm against ``fill_arange``).  Every
     element of the device result must agree with the host ``math.log``
     value to within ``1e-3``.
     """
     a = simplearray.array(test_sample).fill_arange() + 1
     b = cumath.log(a)

     for i in range(test_sample):
         # ``assertTrue`` replaces the deprecated ``assert_`` alias, which
         # was removed from unittest in Python 3.12.
         self.assertTrue(abs(math.log(a[i]) - b[i]) < 1e-3)
Ejemplo n.º 2
0
def multinomial_log_likelihood(softmax_vals, Y, one_n_trans, one_c):
    """Return ``one_n_trans . (Y * log(softmax_vals + 1e-9)) . one_c``.

    The element-wise product of ``Y`` with the logarithm of the offset
    softmax values is reduced by two ``linalg.dot`` calls, and the result
    is fetched from the device with ``.get()``.
    """
    # A tiny offset keeps the logarithm away from log(0).
    guard = 1e-9
    weighted_log = Y * cumath.log(softmax_vals + guard)
    reduced = linalg.dot(linalg.dot(one_n_trans, weighted_log), one_c)
    return reduced.get()
Ejemplo n.º 3
0
    def kl_error(self, input_data, targets, average=True,
                 cache=None, prediction=True):
        """Return the KL divergence between targets and activations.

        The activations are taken from ``cache`` when it is given;
        otherwise they are computed with ``self.feed_forward``.  The
        targets are first passed through ``nan_to_zeros`` into a fresh
        array.  With ``average`` set, the summed divergence is divided by
        ``targets.shape[0]``.  The result is returned as a Python float.
        """
        activations = (
            self.feed_forward(input_data, prediction=prediction)
            if cache is None else cache)

        # Work on a cleaned copy so the caller's targets stay untouched.
        clean_targets = gpuarray.empty_like(targets)
        nan_to_zeros(targets, clean_targets)

        # ``eps`` keeps both logarithms away from log(0).
        log_ratio = (cumath.log(clean_targets + eps) -
                     cumath.log(activations + eps))
        divergence = gpuarray.sum(clean_targets * log_ratio)
        if average:
            divergence /= targets.shape[0]
        return float(divergence.get())
Ejemplo n.º 4
0
def logsumexp(mat):
    """Compute a log-sum-exp of ``mat`` with the max-shift trick.

    The maxima returned by ``max_by_axis(mat, 1)`` are subtracted from
    ``mat`` before exponentiating and added back after the logarithm, so
    the exponentials cannot overflow.  The maxima array is updated in
    place and returned as the result.
    """
    offsets = max_by_axis(mat, 1)
    shifted = add_vec_to_mat(mat, offsets, 0, substract=True)
    exp_sum = matrix_sum_out_axis(cumath.exp(shifted), 1)
    # Accumulate into the maxima, which double as the output buffer.
    offsets += cumath.log(exp_sum)
    return offsets
Ejemplo n.º 5
0
def logsumexp(mat):
    """Return ``max + log(sum(exp(mat - max)))`` for ``mat``.

    ``max_by_axis(mat, 1)`` supplies the shift that is removed before the
    exponential and restored after the logarithm.  The array holding the
    maxima is modified in place and is the value returned.
    """
    peak = max_by_axis(mat, 1)
    centred = add_vec_to_mat(mat, peak, 0, substract=True)
    summed = matrix_sum_out_axis(cumath.exp(centred), 1)
    # In-place add: ``peak`` becomes the final result.
    peak += cumath.log(summed)
    return peak
Ejemplo n.º 6
0
 def thunk():
     """Evaluate the node on the GPU and store the result in ``outputs[0][0]``.

     Reads the three entries of ``inputs``, uploads them to the device,
     combines the two row-normalised exponentials with the mixing weights
     and writes the summed scalar expression back to the host.  Every
     variable in ``node.outputs`` is then flagged as computed.
     """
     def _row_normalised_exp(features):
         # exp(features . A + b), each row divided by its own sum.
         scores = cumath.exp(misc.add(linalg.dot(features, A), b))
         return misc.divide(scores, misc.sum(scores, axis=1, keepdims=True))

     # Mixing weights as a column so they broadcast against the rows.
     mix = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
     feats_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
     feats_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])

     probs_t = _row_normalised_exp(feats_t)
     probs_f = _row_normalised_exp(feats_f)
     blend = misc.multiply(probs_t, mix) + misc.multiply(probs_f, 1 - mix)

     log_blend = cumath.log(blend)
     scaled_log_sum = misc.sum(log_blend, axis=1, keepdims=True) / self.n
     centred_log = misc.subtract(log_blend, scaled_log_sum)

     data_term = misc.sum(x * centred_log, axis=1)
     norm_term = (self.n + depth) * cumath.log(misc.sum(blend, axis=1))
     depth_term = depth * scaled_log_sum
     outputs[0][0] = misc.sum(data_term - norm_term + depth_term).get()

     for var in node.outputs:
         compute_map[var][0] = True
Ejemplo n.º 7
0
    def fprop(self, Y, Y_true, meta):
        """Return the cross-entropy style cost of ``Y`` against ``Y_true``.

        The cost is the negated sum of ``Y_true * log(Y)`` divided by the
        ``'b'`` extent of ``meta['space_below']``.

        Returns a tuple ``(out, meta, fprop_state)`` where ``fprop_state``
        records ``meta['space_below']`` under the key ``'input_space'``.

        Raises ValueError when ``Y`` and ``Y_true`` differ in shape.
        """
        # really bad things happen if Y_true is a bool array
        if Y.shape != Y_true.shape:
            raise ValueError("Shape of predictions and labels do not match. (Y={}, Y_true={})".format(Y.shape, Y_true.shape))

        total = pycuda.gpuarray.sum(Y_true * cumath.log(Y))
        space_below = meta['space_below']
        out = -total / space_below.get_extent('b')

        fprop_state = {'input_space': space_below}
        return out, meta, fprop_state
Ejemplo n.º 8
0
def logsumexp(mat, tmp=None):
    """Compute a max-shifted log-sum-exp of ``mat``.

    ``tmp`` is an optional scratch array that receives the shifted
    values; when omitted, one is allocated with the layout of ``mat``.
    The scratch array is exponentiated in place by the prepared
    ``exp_func`` kernel.  The maxima from ``max_by_axis(mat, 1)`` are
    updated in place and returned.
    """
    offsets = max_by_axis(mat, 1)
    scratch = gpuarray.empty_like(mat) if tmp is None else tmp
    add_vec_to_mat(mat, offsets, 0, target=scratch, substract=True)

    # The same buffer is passed as both kernel arguments, so the
    # exponential overwrites the shifted values (used instead of
    # cumath.exp).
    exp_func.prepared_async_call(scratch._grid, scratch._block, None,
                                 scratch.gpudata, scratch.gpudata,
                                 scratch.mem_size)

    offsets += cumath.log(matrix_sum_out_axis(scratch, 1))
    return offsets
Ejemplo n.º 9
0
    def fprop(self, Y, Y_true, meta):
        """Compute the cost of predictions ``Y`` against labels ``Y_true``.

        The value is ``-sum(Y_true * log(Y))`` divided by the ``'b'``
        extent reported by ``meta['space_below']``.

        Returns ``(out, meta, fprop_state)``; ``fprop_state`` maps
        ``'input_space'`` to ``meta['space_below']``.

        Raises ValueError if the two inputs do not share a shape.
        """
        # really bad things happen if Y_true is a bool array
        shapes_match = Y.shape == Y_true.shape
        if not shapes_match:
            message = "Shape of predictions and labels do not match. (Y={}, Y_true={})"
            raise ValueError(message.format(Y.shape, Y_true.shape))

        weighted_log_sum = pycuda.gpuarray.sum(Y_true * cumath.log(Y))
        below = meta['space_below']
        out = -weighted_log_sum / below.get_extent('b')

        return out, meta, {'input_space': below}
Ejemplo n.º 10
0
        def thunk():
            """Compute three gradient arrays on the GPU and store them in ``outputs``.

            Uploads the three ``inputs`` entries, rebuilds the two
            row-normalised exponentials and their mixture ``w``, then
            writes ``dalpha``, ``dLq_t`` and ``dLq_f`` (fetched to the
            host) into ``outputs[0..2][0]`` and marks every variable in
            ``node.outputs`` as computed.
            """
            # Mixing weights as a column so they broadcast against rows.
            alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
            x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
            x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
            # exp(x . A + b), then each row divided by its own sum.
            Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
            Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
            Xtn = misc.sum(Xt, axis=1, keepdims=True)
            Xfn = misc.sum(Xf, axis=1, keepdims=True)
            Xt = misc.divide(Xt, Xtn)
            Xf = misc.divide(Xf, Xfn)
            # Mixture of the two normalised arrays.
            w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
            # --- gradient with respect to alpha ---
            dq = Xt - Xf
            qdw = dq / w
            t1 = misc.sum(x * qdw, axis=1)
            f = 2 * depth + self.base.n
            t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
            t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
            dalpha = t1 - t2 + t3
            # Drop references to temporaries that are no longer needed.
            del dq, t1, f, t2, t3

            # --- common factor F used by both remaining gradients ---
            iw = 1 / w
            S1 = misc.multiply(
                depth[:, None] * (self.base.n - 1) / self.base.n, iw)
            # NOTE(review): this divides by log(sum(w)); if the term is meant
            # to be the derivative of a log(sum(w)) expression, division by
            # sum(w) would be expected instead -- confirm against the forward
            # computation.
            S2 = (self.base.n + depth[:, None]) / cumath.log(
                misc.sum(w, axis=1, keepdims=True))
            F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
            del w, iw, S1, S2

            # Zero matrix; adding a single row to it below presumably serves
            # to broadcast that row to the shape of ``A`` -- TODO confirm.
            cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]),
                                  dtype=theano.config.floatX)
            dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
            dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)
            # One pass per row: fill row i of each gradient array.
            for i in range(Xt.shape[0]):
                S1 = misc.multiply(Xt[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
                dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
                S1 = misc.multiply(Xf[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
                dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
            # Copy the results back to the host output storage.
            outputs[0][0] = dalpha.get()
            outputs[1][0] = dLq_t.get()
            outputs[2][0] = dLq_f.get()
            for v in node.outputs:
                compute_map[v][0] = True
Ejemplo n.º 11
0
 def log_t(self, a, out):
     """Compute the logarithm of ``a`` with ``cumath.log``, passing ``out`` as the output array.

     Nothing is returned; the result is delivered through ``out``.
     """
     cumath.log(a, out=out)
Ejemplo n.º 12
0
def cross_entropy(x, y):
    """Return ``-sum(y * log(x + eps))`` as a Python float.

    The element-wise product is passed through ``nan_to_zeros_kernel``
    in place before it is summed.
    """
    weighted_log = y * cumath.log(x + eps)
    # Same array as source and destination: the clean-up happens in place.
    nan_to_zeros_kernel(weighted_log, weighted_log)
    negated_total = -gpuarray.sum(weighted_log)
    return float(negated_total.get())
Ejemplo n.º 13
0
def logsumexp(mat):
    """Compute a log-sum-exp of ``mat`` shifted by its maxima.

    The negated result of ``max_by_axis(mat, 1)`` is added to ``mat``
    before the exponential; the maxima are added back to the logarithm
    of the summed exponentials to form the returned array.
    """
    offsets = max_by_axis(mat, 1)
    shifted = add_vec_to_mat(mat, -offsets, 0)
    exp_sum = matrix_sum_out_axis(cumath.exp(shifted), 1)
    return offsets + cumath.log(exp_sum)
Ejemplo n.º 14
0
# Script fragment: shift the log values by their maxima, exponentiate,
# reduce and take the logarithm again (a log-sum-exp style marginalisation).
# It relies on names defined earlier in the script (all_l_rhots_gpu, grd, blk,
# max_tpb, nclmns, nsamps, nmodes and the GPU_* kernel handles).

# Run the max-finding kernel on the current launch configuration.
GPU_find_max_in_shrmem(all_l_rhots_gpu, grid=grd, block=blk, shared=int(max_tpb*8))

# Indexes are not contiguous

griddimx = int(nclmns / max_tpb) 
griddimy = int(nsamps) 
# One thread per sample-time
grd = (griddimx, griddimy, 1)
blk = (max_tpb,  1,        1)

# Gather the maxima through the host: column 0, every nmodes-th row
# starting at row 1.
maxes = np.array(all_l_rhots_gpu[:,0][1::nmodes].get()).astype(np.float64)
maxes_gpu = gpuarray.to_gpu(maxes)

# Pass the negated maxima to the broadcast kernel so they are removed
# before exponentiation.
GPU_bcast_vec_to_matrix(all_l_rhots_gpu, -maxes_gpu, grid=grd, block=blk, shared=8)
# ***** THIS IS CORRECT AND WORKING UP THROUGH HERE AS OF AUGUST 10TH 2016 ***** 

''' 
 Marginalize over Time
'''

all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu) # exponentiate 

# NOTE(review): this launch passes no grid/block/shared arguments, unlike
# the other kernel calls in this fragment -- confirm it is intentional.
GPU_nv_reduc(all_l_rhots_gpu) # sum over time 

# NOTE(review): the logarithm is applied to the whole array here, not to a
# gathered per-sample sum, and the delta-t factor is still missing (FIXME).
lnL_gpu = maxes_gpu + cumath.log(all_l_rhots_gpu) # TIMES DELTA T FIXME




Ejemplo n.º 15
0
# Script fragment: time numpy.log on the host against cumath.log on the
# device, then set up random device arrays for the elementwise-kernel demo.
# ``hA`` and ``hB`` are host arrays created earlier in the script.
#
# The print statements use the single-argument call form so the script
# parses on both Python 2 and Python 3.

# time operation
s = time()
hC = numpy.log(hA)
e = time()

print('serial elapsed time: %f \n' % (e-s))

## device execution
# allocate device arrays
dA = gpuarray.to_gpu(hA)
dB = gpuarray.to_gpu(hB)
dC = gpuarray.to_gpu(hC)

# time operation
# NOTE(review): no explicit device synchronisation happens before the
# second time() call; if the kernel launch is asynchronous this measures
# launch overhead only -- confirm before trusting the number.
s = time()
dC = cumath.log(dA)
e = time()

print('gpu elapsed time: %f \n' % (e-s))

###################
# 3) elementwise kernel
# performs array operations much faster than gpu_array

print('\n elementwise kernel\n')
print('---------------------\n')

from pycuda.curandom import rand as curand

# Two device arrays of 1000 random values each.
a_gpu = curand((1000,))
b_gpu = curand((1000,))
Ejemplo n.º 16
0
def cross_entropy(x, y):
    """Return ``-sum(y * log(x + eps))`` as computed by ``gpuarray.sum``.

    The element-wise product is passed through ``nan_to_zeros`` in place
    before the reduction.  The result is not fetched to the host.
    """
    weighted_log = y * cumath.log(x + eps)
    # Source and destination are the same array: in-place clean-up.
    nan_to_zeros(weighted_log, weighted_log)
    return -gpuarray.sum(weighted_log)
Ejemplo n.º 17
0
 def get_log_like_val(self, Y):
     """Return ``sum(log(self.outputs + self.eps_tol) * Y)``, capped at ``10**20``.

     The sum is evaluated with ``gpuarray.sum`` and fetched with
     ``.get()``; ``np.min`` then picks the smaller of that value and the
     cap.
     """
     log_outputs = cumath.log(self.outputs + self.eps_tol)
     total = gpuarray.sum(log_outputs * Y).get()
     return np.min((total, 10**20))
Ejemplo n.º 18
0
def rdmd(a_gpu, k=None, p=5, q=1, modes='exact', method_rsvd='standard', return_amplitudes=False, return_vandermonde=False, handle=None):
    """
    Randomized Dynamic Mode Decomposition.

    Dynamic Mode Decomposition (DMD) is a data processing algorithm which
    allows to decompose a matrix `a` in space and time.
    The matrix `a` is decomposed as `a = FBV`, where the columns of `F`
    contain the dynamic modes. The modes are ordered corresponding
    to the amplitudes stored in the diagonal matrix `B`. `V` is a Vandermonde
    matrix describing the temporal evolution.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Real/complex input matrix  `a` with dimensions `(m, n)`.
    k : int, optional
        If `k < (n-1)` low-rank Dynamic Mode Decomposition is computed.
        Defaults to `n-1`.
    p : int
        `p` sets the oversampling parameter for rSVD (default=5).
    q : int
        `q` sets the number of power iterations for rSVD (default=1).
    modes : `{'standard', 'exact'}`
        'standard' : uses the standard definition to compute the dynamic modes,
                    `F = U * W`.
        'exact' : computes the exact dynamic modes, `F = Y * V * (S**-1) * W`.
    method_rsvd : `{'standard', 'fast'}`
        'standard' : (default) Standard algorithm as described in [1, 2]
        'fast' : Version II algorithm as described in [2]
    return_amplitudes : bool `{True, False}`
        True: return amplitudes in addition to dynamic modes.
    return_vandermonde : bool `{True, False}`
        True: return Vandermonde matrix in addition to dynamic modes and amplitudes.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `skcuda.misc._global_cublas_handle` is used.

    Returns
    -------
    f_gpu : pycuda.gpuarray.GPUArray
        Matrix containing the dynamic modes of shape `(m, n-1)`  or `(m, k)`.
    b_gpu : pycuda.gpuarray.GPUArray
        1-D array containing the amplitudes of length `min(n-1, k)`.
        Only returned if `return_amplitudes` is true.
    v_gpu : pycuda.gpuarray.GPUArray
        Vandermonde matrix of shape `(n-1, n-1)`  or `(k, n-1)`.
        Only returned if `return_vandermonde` is true.
    omega_gpu : pycuda.gpuarray.GPUArray
        Element-wise logarithm of the eigenvalues of the reduced operator.
        Always returned, as the last element of the result tuple.

    Raises
    ------
    NotImplementedError
        If CULA is not installed.
    ValueError
        If the dtype of `a_gpu` is not supported, if `k` is outside
        `[1, n-1]`, or if `modes` is neither 'exact' nor 'standard'.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    Arrays are assumed to be stored in column-major order, i.e., order='F'.

    References
    ----------
    N. B. Erichson and C. Donovan.
    "Randomized Low-Rank Dynamic Mode Decomposition for Motion Detection"
    Under Review.

    N. Halko, P. Martinsson, and J. Tropp.
    "Finding structure with randomness: probabilistic
    algorithms for constructing approximate matrix
    decompositions" (2009).
    (available at `arXiv <http://arxiv.org/abs/0909.4061>`_).

    J. H. Tu, et al.
    "On dynamic mode decomposition: theory and applications."
    arXiv preprint arXiv:1312.0041 (2013).

    Examples
    --------
    >>> #Numpy
    >>> import numpy as np
    >>> #Plot libs
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.mplot3d import Axes3D
    >>> from matplotlib import cm
    >>> #GPU DMD libs
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> from skcuda import linalg, rlinalg
    >>> linalg.init()
    >>> rlinalg.init()

    >>> # Define time and space discretizations
    >>> x=np.linspace( -15, 15, 200)
    >>> t=np.linspace(0, 8*np.pi , 80)
    >>> dt=t[2]-t[1]
    >>> X, T = np.meshgrid(x,t)
    >>> # Create two spatio-temporal patterns
    >>> F1 = 0.5* np.cos(X)*(1.+0.* T)
    >>> F2 = ( (1./np.cosh(X)) * np.tanh(X)) *(2.*np.exp(1j*2.8*T))
    >>> # Add both signals
    >>> F = (F1+F2)

    >>> #Plot dataset
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(231, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=True)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F')
    >>> ax = fig.add_subplot(232, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F1, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1')
    >>> ax = fig.add_subplot(233, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F2, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2')

    >>> #Dynamic Mode Decomposition
    >>> F_gpu = np.array(F.T, np.complex64, order='F')
    >>> F_gpu = gpuarray.to_gpu(F_gpu)
    >>> Fmodes_gpu, b_gpu, V_gpu, omega_gpu = rlinalg.rdmd(F_gpu, k=2, p=0, q=1, modes='exact', return_amplitudes=True, return_vandermonde=True)
    >>> omega = omega_gpu.get()
    >>> plt.scatter(omega.real, omega.imag, marker='o', c='r')

    >>> #Recover original signal
    >>> F1tilde = np.dot(Fmodes_gpu[:,0:1].get() , np.dot(b_gpu[0].get(), V_gpu[0:1,:].get() ) )
    >>> F2tilde = np.dot(Fmodes_gpu[:,1:2].get() , np.dot(b_gpu[1].get(), V_gpu[1:2,:].get() ) )

    >>> #Plot DMD modes
    >>> #Mode 0
    >>> ax = fig.add_subplot(235, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F1tilde.shape[1],:], T[0:F1tilde.shape[1],:], F1tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1_tilde')
    >>> #Mode 1
    >>> ax = fig.add_subplot(236, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F2tilde.shape[1],:], T[0:F2tilde.shape[1],:], F2tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2_tilde')
    >>> plt.show()
    """

    #*************************************************************************
    #***        Author: N. Benjamin Erichson <*****@*****.**>        ***
    #***                              <2015>                               ***
    #***                       License: BSD 3 clause                       ***
    #*************************************************************************

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    if handle is None:
        handle = misc._global_cublas_handle

    alloc = misc._global_cublas_allocator

    # The free version of CULA only supports single precision floating
    data_type = a_gpu.dtype.type
    real_type = np.float32

    # Select the CULA/CUBLAS routines and scalar constants for the dtype.
    if data_type == np.complex64:
        cula_func_gesvd = cula.culaDeviceCgesvd
        cublas_func_gemm = cublas.cublasCgemm
        cublas_func_dgmm = cublas.cublasCdgmm
        cula_func_gels = cula.culaDeviceCgels
        copy_func = cublas.cublasCcopy
        transpose_func = cublas.cublasCgeam
        alpha = np.complex64(1.0)
        beta = np.complex64(0.0)
        TRANS_type = 'C'
        isreal = False
    elif data_type == np.float32:
        cula_func_gesvd = cula.culaDeviceSgesvd
        cublas_func_gemm = cublas.cublasSgemm
        cublas_func_dgmm = cublas.cublasSdgmm
        cula_func_gels = cula.culaDeviceSgels
        copy_func = cublas.cublasScopy
        transpose_func = cublas.cublasSgeam
        alpha = np.float32(1.0)
        beta = np.float32(0.0)
        TRANS_type = 'T'
        isreal = True
    else:
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func_gesvd = cula.culaDeviceZgesvd
                cublas_func_gemm = cublas.cublasZgemm
                cublas_func_dgmm = cublas.cublasZdgmm
                cula_func_gels = cula.culaDeviceZgels
                copy_func = cublas.cublasZcopy
                transpose_func = cublas.cublasZgeam
                alpha = np.complex128(1.0)
                beta = np.complex128(0.0)
                TRANS_type = 'C'
                isreal = False
            elif data_type == np.float64:
                cula_func_gesvd = cula.culaDeviceDgesvd
                cublas_func_gemm = cublas.cublasDgemm
                cublas_func_dgmm = cublas.cublasDdgmm
                cula_func_gels = cula.culaDeviceDgels
                copy_func = cublas.cublasDcopy
                transpose_func = cublas.cublasDgeam
                alpha = np.float64(1.0)
                beta = np.float64(0.0)
                TRANS_type = 'T'
                isreal = True
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    #CUDA assumes that arrays are stored in column-major order
    m, n = np.array(a_gpu.shape, int)
    nx = n-1
    #Set k
    if k is None:
        k = nx
    if k > nx or k < 1:
        raise ValueError('k is not valid')

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Split data into left and right snapshot sequence
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Note: we need a copy of X_gpu, because SVD destroys X_gpu
    #While Y_gpu is just a pointer
    X_gpu = gpuarray.empty((m, n), data_type, order="F", allocator=alloc)
    copy_func(handle, X_gpu.size, int(a_gpu.gpudata), 1, int(X_gpu.gpudata), 1)
    X_gpu = X_gpu[:, :nx]
    Y_gpu = a_gpu[:, 1:]

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Randomized Singular Value Decomposition
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    U_gpu, s_gpu, Vh_gpu = rsvd(X_gpu, k=k, p=p, q=q,
                                method=method_rsvd, handle=handle)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Solve the LS problem to find estimate for M using the pseudo-inverse
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #real: M = U.T * Y * Vt.T * S**-1
    #complex: M = U.H * Y * Vt.H * S**-1
    #Let G = Y * Vt.H * S**-1, hence M = U.H * G

    #Allocate G and M
    G_gpu = gpuarray.empty((m,k), data_type, order="F", allocator=alloc)
    M_gpu = gpuarray.empty((k,k), data_type, order="F", allocator=alloc)

    #i) s = s **-1 (inverse)
    if data_type == np.complex64 or data_type == np.complex128:
        s_gpu = 1/s_gpu
        # Promote the real singular values to the complex dtype.
        s_gpu = s_gpu + 1j * gpuarray.zeros_like(s_gpu)
    else:
        s_gpu = 1.0/s_gpu

    #ii) real/complex: scale Vs =  Vt* x diag(s**-1)
    Vs_gpu = gpuarray.empty((nx,k), data_type, order="F", allocator=alloc)
    lda = max(1, Vh_gpu.strides[1] // Vh_gpu.dtype.itemsize)
    ldb = max(1, Vs_gpu.strides[1] // Vs_gpu.dtype.itemsize)
    # geam with beta = 0 is used as an out-of-place (conjugate) transpose.
    transpose_func(handle, TRANS_type, TRANS_type, nx, k,
                   alpha, int(Vh_gpu.gpudata), lda, beta, int(Vh_gpu.gpudata), lda,
                   int(Vs_gpu.gpudata), ldb)

    cublas_func_dgmm(handle, 'r', nx, k, int(Vs_gpu.gpudata), nx,
                     int(s_gpu.gpudata), 1 , int(Vs_gpu.gpudata), nx)

    #iii) real: G = Y * Vs , complex: G = Y x Vs
    cublas_func_gemm(handle, 'n', 'n', m, k, nx, alpha,
                     int(Y_gpu.gpudata), m, int(Vs_gpu.gpudata), nx,
                     beta, int(G_gpu.gpudata), m )

    #iv) real/complex: M = U* x G
    cublas_func_gemm(handle, TRANS_type, 'n', k, k, m, alpha,
                     int(U_gpu.gpudata), m, int(G_gpu.gpudata), m,
                     beta, int(M_gpu.gpudata), k )

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Eigen Decomposition
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Note: If a_gpu is real the imag part is omitted
    Vr_gpu, w_gpu = linalg.eig(M_gpu, 'N', 'V', 'F')
    omega = cumath.log(w_gpu)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute DMD Modes
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    F_gpu = gpuarray.empty((m,k), data_type, order="F", allocator=alloc)
    modes = modes.lower()
    if modes == 'exact': #Compute (exact) DMD modes: F = Y * V * S**-1 * W = G * W
        basis_gpu = G_gpu
    elif modes == 'standard': #Compute (standard) DMD modes: F = U * W
        basis_gpu = U_gpu
    else:
        raise ValueError('Type of modes is not supported, choose "exact" or "standard".')

    # Write the product straight into F_gpu.  The output must not alias an
    # input operand: a gemm whose C buffer is also its A buffer can read
    # entries that were already overwritten.
    cublas_func_gemm(handle, 'n', 'n', m, k, k, alpha,
                     basis_gpu.gpudata, m, Vr_gpu.gpudata, k,
                     beta, F_gpu.gpudata, m  )

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute amplitudes b using least-squares: Fb=x1
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_amplitudes:
        #Copy is required, because gels destroys input.  basis_gpu is no
        #longer needed, so it serves as the scratch copy of the modes.
        copy_func(handle, F_gpu.size, int(F_gpu.gpudata),
                  1, int(basis_gpu.gpudata), 1)
        #x1_gpu = a_gpu[:,0].copy()
        x1_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc)
        copy_func(handle, x1_gpu.size, int(a_gpu[:,0].gpudata), 1, int(x1_gpu.gpudata), 1)
        cula_func_gels( 'N', m, k, int(1) , basis_gpu.gpudata, m, x1_gpu.gpudata, m)
        b_gpu = x1_gpu

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute Vandermonde matrix
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_vandermonde:
        V_gpu = linalg.vander(w_gpu, n=nx)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Return
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Result order: modes, [amplitudes], [Vandermonde], omega.
    result = [F_gpu]
    if return_amplitudes:
        result.append(b_gpu[:k])
    if return_vandermonde:
        result.append(V_gpu)
    result.append(omega)
    return tuple(result)
Ejemplo n.º 19
0
def cross_entropy(x, y):
    """Compute ``-sum(y * log(x + eps))`` with ``gpuarray.sum``.

    ``nan_to_zeros`` is applied in place to the element-wise product
    before it is reduced.  The reduction result is returned without
    being copied to the host.
    """
    product = y * cumath.log(x + eps)
    # In-place clean-up: the product is both input and output.
    nan_to_zeros(product, product)
    negated_sum = -gpuarray.sum(product)
    return negated_sum
Ejemplo n.º 20
0
def marginalize_all_lnL(mod, all_l_rhots_gpu, nmodes, nsamps, ntimes, nclmns,
                        delta_t):
    """Marginalise the log values in ``all_l_rhots_gpu`` and return the result on the host.

    The routine copies ``nmodes``, ``nsamps``, ``ntimes`` and ``nclmns``
    into the like-named globals of the compiled module ``mod``, finds the
    maxima with two launches of ``find_max_in_shrmem``, passes the negated
    maxima to ``bcast_vec_to_matrix``, exponentiates, reduces with two
    launches of ``nv_reduc`` and finally returns
    ``maxes + log((sum - padwidth) * delta_t)`` fetched with ``.get()``.

    ``all_l_rhots_gpu`` is overwritten by the kernels and is rebound to the
    exponentiated array inside the function.
    """
    # Recopy constants into device constant memory

    # **-- constants --**
    # Threads per block used for the wide launches.
    max_tpb = 1024

    nmodes_gpu = mod.get_global("nmodes")[0]
    nsamps_gpu = mod.get_global("nsamps")[0]
    ntimes_gpu = mod.get_global("ntimes")[0]
    nclmns_gpu = mod.get_global("nclmns")[0]

    cuda.memcpy_htod(nmodes_gpu, np.array(nmodes, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(nsamps_gpu, np.array(nsamps, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(ntimes_gpu, np.array(ntimes, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(nclmns_gpu, np.array(nclmns, ndmin=1).astype(np.int32))

    # Get GPU functions

    GPU_find_max_in_shrmem = mod.get_function("find_max_in_shrmem")
    GPU_nv_reduc = mod.get_function("nv_reduc")
    GPU_bcast_vec_to_matrix = mod.get_function("bcast_vec_to_matrix")

    def next_greater_power_of_2(x):
        # Smallest power of two that is >= x (for positive integers).
        return 2**(x - 1).bit_length()

    griddimx = int(nclmns / max_tpb)
    griddimy = int(nsamps)
    # One thread per sample-time
    grd = (griddimx, griddimy, 1)
    blk = (max_tpb, 1, 1)

    print("Finding Maximum...\n")
    # Get the maxes
    # Shared memory is sized at 8 bytes per thread (presumably one float64
    # each -- TODO confirm against the kernel).
    GPU_find_max_in_shrmem(all_l_rhots_gpu,
                           grid=grd,
                           block=blk,
                           shared=int(max_tpb * 8))

    griddimy = int(nsamps)
    blokdimx = next_greater_power_of_2(
        griddimx)  # Only need as many threads as we had blocks in x dimension
    grd = (1, griddimy, 1)
    blk = (blokdimx, 1, 1)
    # Second reduction - this works as long as we don't have rhoTS longer then 1024^2
    GPU_find_max_in_shrmem(all_l_rhots_gpu,
                           grid=grd,
                           block=blk,
                           shared=int(blokdimx * 8))

    # Collect the maxes through the host
    # Column 0, every nmodes-th row starting at row nmodes - 2.
    maxes = np.array(all_l_rhots_gpu[:, 0][nmodes - 2::nmodes].get()).astype(
        np.float64)
    maxes_gpu = gpuarray.to_gpu(maxes)

    # Back to the wide launch configuration used for the first pass.
    griddimx = int(nclmns / max_tpb)
    griddimy = int(nsamps)
    # One thread per sample-time
    grd = (griddimx, griddimy, 1)
    blk = (max_tpb, 1, 1)

    # Hand the negated maxima to the broadcast kernel so they are removed
    # before exponentiation.
    GPU_bcast_vec_to_matrix(all_l_rhots_gpu,
                            -maxes_gpu,
                            grid=grd,
                            block=blk,
                            shared=8)

    # Exponentiating a bunch of zeros creates a bunch of extra ones that we don't want in our
    # sum, so this is the number we need to subtract out to offset it
    padwidth = nclmns - ntimes

    all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu)  # exponentiate

    print("Reducing final answer...\n")
    GPU_nv_reduc(all_l_rhots_gpu, grid=grd, block=blk,
                 shared=max_tpb * 8)  # sum over time

    griddimy = int(nsamps)
    blokdimx = next_greater_power_of_2(
        griddimx)  # Only need as many threads as we had blocks in x dimension
    grd = (1, griddimy, 1)
    blk = (blokdimx, 1, 1)

    # Second pass combines the per-block partial sums.
    GPU_nv_reduc(all_l_rhots_gpu, grid=grd, block=blk,
                 shared=blokdimx * 8)  # sum over time

    # Gather the sums through the host (column 0, every nmodes-th row
    # starting at row nmodes - 1) and remove the padding contribution.
    lnL = (all_l_rhots_gpu[:, 0][nmodes - 1::nmodes].get() - padwidth).astype(
        np.float64)
    lnL_gpu = gpuarray.to_gpu(lnL)
    # Undo the max shift: log(sum * delta_t) plus the maxima.
    lnL_gpu = maxes_gpu + cumath.log(lnL_gpu * delta_t)

    return lnL_gpu.get()
Ejemplo n.º 21
0
def logsumexp(mat):
    """Return a max-shifted log-sum-exp of ``mat``.

    The maxima from ``max_by_axis(mat, 1)`` are negated and added to
    ``mat`` ahead of the exponential, then added to the logarithm of the
    summed exponentials.
    """
    peak = max_by_axis(mat, 1)
    centred = add_vec_to_mat(mat, -peak, 0)
    summed = matrix_sum_out_axis(cumath.exp(centred), 1)
    log_summed = cumath.log(summed)
    return peak + log_summed
Ejemplo n.º 22
0
                       grid=grd,
                       block=blk,
                       shared=int(max_tpb * 8))

# Script fragment: shift by the maxima, exponentiate, reduce and take the
# logarithm.  It depends on names defined earlier in the script
# (all_l_rhots_gpu, max_tpb, nclmns, nsamps, nmodes, GPU_* kernel handles).

# Indexes are not contiguous

griddimx = int(nclmns / max_tpb)
griddimy = int(nsamps)
# One thread per sample-time
grd = (griddimx, griddimy, 1)
blk = (max_tpb, 1, 1)

# Gather the maxima through the host: column 0, every nmodes-th row
# starting at row 1.
maxes = np.array(all_l_rhots_gpu[:, 0][1::nmodes].get()).astype(np.float64)
maxes_gpu = gpuarray.to_gpu(maxes)

# Pass the negated maxima to the broadcast kernel so they are removed
# before exponentiation.
GPU_bcast_vec_to_matrix(all_l_rhots_gpu,
                        -maxes_gpu,
                        grid=grd,
                        block=blk,
                        shared=8)
# ***** THIS IS CORRECT AND WORKING UP THROUGH HERE AS OF AUGUST 10TH 2016 *****
''' 
 Marginalize over Time
'''

all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu)  # exponentiate

# NOTE(review): this launch passes no grid/block/shared arguments, unlike
# the broadcast launch above -- confirm it is intentional.
GPU_nv_reduc(all_l_rhots_gpu)  # sum over time

# NOTE(review): the logarithm is applied to the whole array here, not to a
# gathered per-sample sum, and the delta-t factor is still missing (FIXME).
lnL_gpu = maxes_gpu + cumath.log(all_l_rhots_gpu)  # TIMES DELTA T FIXME
Ejemplo n.º 23
0
def cross_entropy_logistic(x, y):
    """Return ``-sum(y * log(x + eps) + (1 - y) * log(1 - x + eps))``.

    The sum is evaluated with ``gpuarray.sum`` and returned without being
    copied to the host.  ``eps`` keeps both logarithms away from log(0).
    """
    positive_term = y * cumath.log(x + eps)
    negative_term = (1. - y) * cumath.log(1. - x + eps)
    return -gpuarray.sum(positive_term + negative_term)
Ejemplo n.º 24
0
def marginalize_all_lnL(mod, all_l_rhots_gpu, nmodes, nsamps, ntimes, nclmns, delta_t):
    """Marginalise the log values in ``all_l_rhots_gpu`` and return the result on the host.

    The routine copies ``nmodes``, ``nsamps``, ``ntimes`` and ``nclmns``
    into the like-named globals of the compiled module ``mod``, finds the
    maxima with two launches of ``find_max_in_shrmem``, passes the negated
    maxima to ``bcast_vec_to_matrix``, exponentiates, reduces with two
    launches of ``nv_reduc`` and finally returns
    ``maxes + log((sum - padwidth) * delta_t)`` fetched with ``.get()``.

    ``all_l_rhots_gpu`` is overwritten by the kernels and is rebound to the
    exponentiated array inside the function.

    The body is indented with spaces only; the earlier mix of tabs and
    spaces is rejected by Python 3 with a TabError.
    """
    # Recopy constants into device constant memory

    # **-- constants --**
    # Threads per block used for the wide launches.
    max_tpb = 1024

    nmodes_gpu = mod.get_global("nmodes")[0]
    nsamps_gpu = mod.get_global("nsamps")[0]
    ntimes_gpu = mod.get_global("ntimes")[0]
    nclmns_gpu = mod.get_global("nclmns")[0]

    cuda.memcpy_htod(nmodes_gpu, np.array(nmodes, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(nsamps_gpu, np.array(nsamps, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(ntimes_gpu, np.array(ntimes, ndmin=1).astype(np.int32))
    cuda.memcpy_htod(nclmns_gpu, np.array(nclmns, ndmin=1).astype(np.int32))

    # Get GPU functions

    GPU_find_max_in_shrmem = mod.get_function("find_max_in_shrmem")
    GPU_nv_reduc = mod.get_function("nv_reduc")
    GPU_bcast_vec_to_matrix = mod.get_function("bcast_vec_to_matrix")

    def next_greater_power_of_2(x):
        # Smallest power of two that is >= x (for positive integers).
        return 2**(x-1).bit_length()

    griddimx = int(nclmns / max_tpb)
    griddimy = int(nsamps)
    # One thread per sample-time
    grd = (griddimx, griddimy, 1)
    blk = (max_tpb, 1, 1)

    print("Finding Maximum...\n")
    # Get the maxes
    # Shared memory is sized at 8 bytes per thread (presumably one float64
    # each -- TODO confirm against the kernel).
    GPU_find_max_in_shrmem(all_l_rhots_gpu, grid=grd, block=blk, shared=int(max_tpb*8))

    griddimy = int(nsamps)
    blokdimx = next_greater_power_of_2(griddimx) # Only need as many threads as we had blocks in x dimension
    grd = (1, griddimy, 1)
    blk = (blokdimx, 1, 1)
    # Second reduction - this works as long as we don't have rhoTS longer then 1024^2
    GPU_find_max_in_shrmem(all_l_rhots_gpu, grid=grd, block=blk, shared=int(blokdimx*8))

    # Collect the maxes through the host
    # Column 0, every nmodes-th row starting at row nmodes - 2.
    maxes = np.array(all_l_rhots_gpu[:,0][nmodes-2::nmodes].get()).astype(np.float64)
    maxes_gpu = gpuarray.to_gpu(maxes)

    # Back to the wide launch configuration used for the first pass.
    griddimx = int(nclmns / max_tpb)
    griddimy = int(nsamps)
    # One thread per sample-time
    grd = (griddimx, griddimy, 1)
    blk = (max_tpb, 1, 1)

    # Hand the negated maxima to the broadcast kernel so they are removed
    # before exponentiation.
    GPU_bcast_vec_to_matrix(all_l_rhots_gpu, -maxes_gpu, grid=grd, block=blk, shared=8)

    # Exponentiating a bunch of zeros creates a bunch of extra ones that we don't want in our
    # sum, so this is the number we need to subtract out to offset it
    padwidth = nclmns - ntimes

    all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu) # exponentiate

    print("Reducing final answer...\n")
    GPU_nv_reduc(all_l_rhots_gpu, grid=grd, block=blk, shared=max_tpb*8) # sum over time

    griddimy = int(nsamps)
    blokdimx = next_greater_power_of_2(griddimx) # Only need as many threads as we had blocks in x dimension
    grd = (1, griddimy, 1)
    blk = (blokdimx, 1, 1)

    # Second pass combines the per-block partial sums.
    GPU_nv_reduc(all_l_rhots_gpu, grid=grd, block=blk, shared=blokdimx*8) # sum over time

    # Gather the sums through the host (column 0, every nmodes-th row
    # starting at row nmodes - 1) and remove the padding contribution.
    lnL = (all_l_rhots_gpu[:,0][nmodes-1::nmodes].get() - padwidth).astype(np.float64)
    lnL_gpu = gpuarray.to_gpu(lnL)
    # Undo the max shift: log(sum * delta_t) plus the maxima.
    lnL_gpu = maxes_gpu + cumath.log(lnL_gpu*delta_t)

    return lnL_gpu.get()
Ejemplo n.º 25
0
def cdmd(a_gpu, k=None, c=None, modes="exact", return_amplitudes=False, return_vandermonde=False, handle=None):
    """
    Compressed Dynamic Mode Decomposition.

    Dynamic Mode Decomposition (DMD) is a data processing algorithm which
    allows to decompose a matrix `a` in space and time.
    The matrix `a` is decomposed as `a = FBV`, where the columns of `F`
    contain the dynamic modes. The modes are ordered corresponding
    to the amplitudes stored in the diagonal matrix `B`. `V` is a Vandermonde
    matrix describing the temporal evolution.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Real/complex input matrix  `a` with dimensions `(m, n)`.
    k : int, optional
        If `k < (n-1)` low-rank Dynamic Mode Decomposition is computed.
        Defaults to `n-1`.
    c : int, optional
        Number of measurement sensors, i.e. the number of rows of the
        compressed matrix obtained by multiplying `a` with a random
        `(c, m)` sensing matrix. If None, no compression is applied and
        the input matrix itself is used (`c = m`).
    modes : `{'exact'}`
        'exact' : computes the exact dynamic modes, `F = Y * V * (S**-1) * W`.
    return_amplitudes : bool `{True, False}`
        True: return amplitudes in addition to dynamic modes.
    return_vandermonde : bool `{True, False}`
        True: return Vandermonde matrix in addition to dynamic modes and amplitudes.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `skcuda.misc._global_cublas_handle` is used.

    Returns
    -------
    f_gpu : pycuda.gpuarray.GPUArray
        Matrix containing the dynamic modes of shape `(m, n-1)`  or `(m, k)`.
    b_gpu : pycuda.gpuarray.GPUArray
        1-D array containing the amplitudes of length `min(n-1, k)`.
        Only returned if `return_amplitudes` is True.
    v_gpu : pycuda.gpuarray.GPUArray
        Vandermonde matrix of shape `(n-1, n-1)`  or `(k, n-1)`.
        Only returned if `return_vandermonde` is True.
    omega_gpu : pycuda.gpuarray.GPUArray
        Element-wise logarithm of the eigenvalues of the reduced operator.
        Always returned as the last element of the result tuple.

    Raises
    ------
    NotImplementedError
        If CULA is not installed.
    ValueError
        If the dtype of `a_gpu` is unsupported, if `k` is outside
        `[1, n-1]`, or if `modes` is not a supported mode.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    Arrays are assumed to be stored in column-major order, i.e., order='F'.

    References
    ----------
    S. L. Brunton, et al.
    "Compressed sampling and dynamic mode decomposition."
    arXiv preprint arXiv:1312.5186 (2013).

    J. H. Tu, et al.
    "On dynamic mode decomposition: theory and applications."
    arXiv preprint arXiv:1312.0041 (2013).

    Examples
    --------
    >>> #Numpy
    >>> import numpy as np
    >>> #Plot libs
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.mplot3d import Axes3D
    >>> from matplotlib import cm
    >>> #GPU DMD libs
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> from skcuda import linalg, rlinalg
    >>> linalg.init()
    >>> rlinalg.init()

    >>> # Define time and space discretizations
    >>> x=np.linspace( -15, 15, 200)
    >>> t=np.linspace(0, 8*np.pi , 80)
    >>> dt=t[2]-t[1]
    >>> X, T = np.meshgrid(x,t)
    >>> # Create two spatio-temporal patterns
    >>> F1 = 0.5* np.cos(X)*(1.+0.* T)
    >>> F2 = ( (1./np.cosh(X)) * np.tanh(X)) *(2.*np.exp(1j*2.8*T))
    >>> # Add both signals
    >>> F = (F1+F2)

    >>> #Plot dataset
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(231, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=True)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F')
    >>> ax = fig.add_subplot(232, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F1, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1')
    >>> ax = fig.add_subplot(233, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F2, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2')


    >>> #Dynamic Mode Decomposition
    >>> F_gpu = np.array(F.T, np.complex64, order='F')
    >>> F_gpu = gpuarray.to_gpu(F_gpu)
    >>> Fmodes_gpu, b_gpu, V_gpu, omega_gpu = rlinalg.cdmd(F_gpu, k=2, c=20, modes='exact', return_amplitudes=True, return_vandermonde=True)
    >>> omega = omega_gpu.get()
    >>> plt.scatter(omega.real, omega.imag, marker='o', c='r')

    >>> #Recover original signal
    >>> F1tilde = np.dot(Fmodes_gpu[:,0:1].get() , np.dot(b_gpu[0].get(), V_gpu[0:1,:].get() ) )
    >>> F2tilde = np.dot(Fmodes_gpu[:,1:2].get() , np.dot(b_gpu[1].get(), V_gpu[1:2,:].get() ) )

    >>> # Plot DMD modes
    >>> #Mode 0
    >>> ax = fig.add_subplot(235, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F1tilde.shape[1],:], T[0:F1tilde.shape[1],:], F1tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1_tilde')
    >>> #Mode 1
    >>> ax = fig.add_subplot(236, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F2tilde.shape[1],:], T[0:F2tilde.shape[1],:], F2tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2_tilde')
    >>> plt.show()
    """

    # *************************************************************************
    # ***        Author: N. Benjamin Erichson <*****@*****.**>        ***
    # ***                              <2015>                               ***
    # ***                       License: BSD 3 clause                       ***
    # *************************************************************************

    if not _has_cula:
        raise NotImplementedError("CULA not installed")

    if handle is None:
        handle = misc._global_cublas_handle

    alloc = misc._global_cublas_allocator

    # The free version of CULA only supports single precision floating point
    data_type = a_gpu.dtype.type
    real_type = np.float32

    # Select the CULA/CUBLAS routines matching the input dtype.
    if data_type == np.complex64:
        cula_func_gesvd = cula.culaDeviceCgesvd
        cublas_func_gemm = cublas.cublasCgemm
        cublas_func_dgmm = cublas.cublasCdgmm
        cula_func_gels = cula.culaDeviceCgels
        copy_func = cublas.cublasCcopy
        transpose_func = cublas.cublasCgeam
        alpha = np.complex64(1.0)
        beta = np.complex64(0.0)
        TRANS_type = "C"
        isreal = False
    elif data_type == np.float32:
        cula_func_gesvd = cula.culaDeviceSgesvd
        cublas_func_gemm = cublas.cublasSgemm
        cublas_func_dgmm = cublas.cublasSdgmm
        cula_func_gels = cula.culaDeviceSgels
        copy_func = cublas.cublasScopy
        transpose_func = cublas.cublasSgeam
        alpha = np.float32(1.0)
        beta = np.float32(0.0)
        TRANS_type = "T"
        isreal = True
    else:
        if cula._libcula_toolkit == "standard":
            if data_type == np.complex128:
                cula_func_gesvd = cula.culaDeviceZgesvd
                cublas_func_gemm = cublas.cublasZgemm
                cublas_func_dgmm = cublas.cublasZdgmm
                cula_func_gels = cula.culaDeviceZgels
                copy_func = cublas.cublasZcopy
                transpose_func = cublas.cublasZgeam
                alpha = np.complex128(1.0)
                beta = np.complex128(0.0)
                TRANS_type = "C"
                isreal = False
            elif data_type == np.float64:
                cula_func_gesvd = cula.culaDeviceDgesvd
                cublas_func_gemm = cublas.cublasDgemm
                cublas_func_dgmm = cublas.cublasDdgmm
                cula_func_gels = cula.culaDeviceDgels
                copy_func = cublas.cublasDcopy
                transpose_func = cublas.cublasDgeam
                alpha = np.float64(1.0)
                beta = np.float64(0.0)
                TRANS_type = "T"
                isreal = True
            else:
                raise ValueError("unsupported type")
            real_type = np.float64
        else:
            raise ValueError("double precision not supported")

    # CUDA assumes that arrays are stored in column-major order
    m, n = np.array(a_gpu.shape, int)
    nx = n - 1
    # Set k
    if k is None:
        k = nx
    if k > nx or k < 1:
        raise ValueError("k is not valid")

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Compress
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if c is None:
        # No compression requested: operate directly on the input matrix.
        Ac_gpu = a_gpu
        c = m
    else:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Generate a random sensing matrix S
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        if not isreal:
            Simag_gpu = gpuarray.empty((m, c), real_type, order="F", allocator=alloc)
            Sreal_gpu = gpuarray.empty((m, c), real_type, order="F", allocator=alloc)
            rand.fill_uniform(Simag_gpu)
            rand.fill_uniform(Sreal_gpu)
            # Combine the two real draws into one complex matrix, then
            # transpose to (c, m) and rescale.
            S_gpu = Sreal_gpu + 1j * Simag_gpu
            S_gpu = S_gpu.T * 2 - 1  # Scale to [-1,1]
        else:
            S_gpu = gpuarray.empty((c, m), real_type, order="F", allocator=alloc)
            rand.fill_uniform(S_gpu)  # Draw uniform random samples (rescaled below)
            S_gpu = S_gpu * 2 - 1  # Scale to [-1,1]

        # Allocate Ac
        Ac_gpu = gpuarray.empty((c, n), data_type, order="F", allocator=alloc)

        # Compress input matrix: Ac = S * a
        cublas_func_gemm(
            handle, "n", "n", c, n, m, alpha, int(S_gpu.gpudata), c, int(a_gpu.gpudata), m, beta, int(Ac_gpu.gpudata), c
        )

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Split data into left and right snapshot sequence
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Note: we need a copy of X_gpu, because SVD destroys X_gpu
    # While Y_gpu is just a pointer
    X_gpu = gpuarray.empty((c, n), data_type, order="F", allocator=alloc)
    copy_func(handle, X_gpu.size, int(Ac_gpu.gpudata), 1, int(X_gpu.gpudata), 1)
    X_gpu = X_gpu[:, :nx]
    Y_gpu = Ac_gpu[:, 1:]
    Yorig_gpu = a_gpu[:, 1:]

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Singular Value Decomposition
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Allocate s, U, Vt for economic SVD
    # Note: singular values are always real
    min_s = min(nx, c)
    s_gpu = gpuarray.zeros(min_s, real_type, order="F", allocator=alloc)
    U_gpu = gpuarray.zeros((c, min_s), data_type, order="F", allocator=alloc)
    Vh_gpu = gpuarray.zeros((min_s, nx), data_type, order="F", allocator=alloc)

    # Economic SVD
    cula_func_gesvd(
        "S", "S", c, nx, int(X_gpu.gpudata), c, int(s_gpu.gpudata), int(U_gpu.gpudata), c, int(Vh_gpu.gpudata), min_s
    )

    # Low-rank DMD: truncate SVD if k < nx
    if k != nx:
        s_gpu = s_gpu[:k]
        U_gpu = U_gpu[:, :k]
        Vh_gpu = Vh_gpu[:k, :]

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Solve the LS problem to find estimate for M using the pseudo-inverse
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # real: M = U.T * Y * Vt.T * S**-1
    # complex: M = U.H * Y * Vt.H * S**-1
    # Let G = Y * Vt.H * S**-1, hence M = U.H * G

    # Allocate G and M
    G_gpu = gpuarray.zeros((c, k), data_type, order="F", allocator=alloc)
    M_gpu = gpuarray.zeros((k, k), data_type, order="F", allocator=alloc)

    # i) s = s **-1 (inverse)
    s_gpu = 1 / s_gpu
    if not isreal:
        # dgmm needs the scaling vector in the same (complex) dtype as Vs
        s_gpu = s_gpu + 1j * gpuarray.zeros_like(s_gpu)

    # ii) real/complex: scale Vs =  Vt* x diag(s**-1)
    Vs_gpu = gpuarray.zeros((nx, k), data_type, order="F", allocator=alloc)
    # Leading dimensions come from the strides because Vh_gpu may be a
    # truncated view of a larger buffer.
    lda = max(1, Vh_gpu.strides[1] // Vh_gpu.dtype.itemsize)
    ldb = max(1, Vs_gpu.strides[1] // Vs_gpu.dtype.itemsize)
    transpose_func(
        handle,
        TRANS_type,
        TRANS_type,
        nx,
        k,
        1.0,
        int(Vh_gpu.gpudata),
        lda,
        0.0,
        int(Vh_gpu.gpudata),
        lda,
        int(Vs_gpu.gpudata),
        ldb,
    )
    # End Transpose

    cublas_func_dgmm(handle, "r", nx, k, int(Vs_gpu.gpudata), nx, int(s_gpu.gpudata), 1, int(Vs_gpu.gpudata), nx)

    # iii) real: G = Y * Vs , complex: G = Y x Vs
    cublas_func_gemm(
        handle, "n", "n", c, k, nx, alpha, int(Y_gpu.gpudata), c, int(Vs_gpu.gpudata), nx, beta, int(G_gpu.gpudata), c
    )

    # iv) real/complex: M = U* x G
    cublas_func_gemm(
        handle,
        TRANS_type,
        "n",
        k,
        k,
        c,
        alpha,
        int(U_gpu.gpudata),
        c,
        int(G_gpu.gpudata),
        c,
        beta,
        int(M_gpu.gpudata),
        k,
    )

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Eigen Decomposition
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Note: If a_gpu is real the imag part is omitted
    Vr_gpu, w_gpu = linalg.eig(M_gpu, "N", "V", "F")

    omega = cumath.log(w_gpu)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Compute DMD Modes
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    F_gpu = gpuarray.empty((m, k), data_type, order="F", allocator=alloc)
    modes = modes.lower()
    if modes == "exact":  # Compute (exact) DMD modes: F = Y * V * S**-1 * W = G * W

        # Vs <- Vs * W
        # NOTE(review): the output buffer aliases the first input operand
        # here; confirm that the gemm routine tolerates this overlap.
        cublas_func_gemm(
            handle,
            "n",
            "n",
            nx,
            k,
            k,
            alpha,
            int(Vs_gpu.gpudata),
            nx,
            int(Vr_gpu.gpudata),
            k,
            beta,
            int(Vs_gpu.gpudata),
            nx,
        )

        # F = Yorig * Vs, using the uncompressed right snapshot sequence
        cublas_func_gemm(
            handle,
            "n",
            "n",
            m,
            k,
            nx,
            alpha,
            int(Yorig_gpu.gpudata),
            m,
            int(Vs_gpu.gpudata),
            nx,
            beta,
            int(F_gpu.gpudata),
            m,
        )

    else:
        raise ValueError('Type of modes is not supported, choose "exact" or "standard".')

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Compute amplitudes b using least-squares: Fb=x1
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_amplitudes:
        F_gpu_temp = gpuarray.empty((m, k), data_type, order="F", allocator=alloc)

        # Copy is required, because gels destroys input
        copy_func(handle, F_gpu.size, int(F_gpu.gpudata), 1, int(F_gpu_temp.gpudata), 1)

        # x1_gpu = a_gpu[:,0].copy()
        x1_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc)
        copy_func(handle, x1_gpu.size, int(a_gpu[:, 0].gpudata), 1, int(x1_gpu.gpudata), 1)
        # gels overwrites x1_gpu with the solution; its first k entries are b
        cula_func_gels("N", m, k, int(1), int(F_gpu_temp.gpudata), m, int(x1_gpu.gpudata), m)
        b_gpu = x1_gpu

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Compute Vandermonde matrix
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_vandermonde:
        V_gpu = linalg.vander(w_gpu, n=nx)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Return: modes first, optional amplitudes / Vandermonde, omega last
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    result = [F_gpu]
    if return_amplitudes:
        result.append(b_gpu[:k])
    if return_vandermonde:
        result.append(V_gpu)
    result.append(omega)
    return tuple(result)
Ejemplo n.º 26
0
def demosaick_gpu(img):
    """Demosaick `img` on the GPU with a two-path network and return the result.

    The image is uploaded to the device and two patch matrices are taken
    with `im2col`: one from the raw image (index set `_i2c2`) and one from
    the image after an in-place `log(img + _eps)` (index set `_i2c1`).

    * Path 1 multiplies the log-domain patches by `_wts.int1`,
      exponentiates, then applies `_wts.int2` plus bias `_wts.int2b`.
    * Path 2 applies a dense layer (`_wts.c1`), two 2x2 stages built from
      shifted matrix products (`_wts.c2xx`, `_wts.c3xx`) and an output
      layer (`_wts.sout`); each is followed by a bias add and a clamp at
      zero.

    The two paths are multiplied element-wise, summed along axis 1,
    clamped to [0, 1], reshaped to `(wA, hA, 192)` and converted on the
    host by `p2im`.

    Parameters
    ----------
    img : array-like accepted by `gp.to_gpu`
        Input mosaicked image (presumably float32 -- TODO confirm with caller).

    Returns
    -------
    The value produced by `p2im` for the downloaded network output.
    """

    def _shifted_matmul_sum(src, rows, cols, weights):
        """Sum four matmuls of the 2x2-shifted windows of `src`.

        `weights` holds the matrices for the (0,0), (0,1), (1,0), (1,1)
        offsets, in that order. Returns a `(rows * cols, ...)` matrix.
        """
        shape3d = [rows, cols, _numsel]
        shape2d = [rows * cols, _numsel]
        # Scratch allocation kept from the original flow; it is replaced
        # by the result of the first product below.
        acc = gp.empty(shape2d, dtype=np.float32)
        patch = gp.empty(shape3d, dtype=np.float32)

        lower, upper = slice(0, -1), slice(1, None)
        windows = ((lower, lower), (lower, upper), (upper, lower), (upper, upper))

        for index, ((row_sel, col_sel), weight) in enumerate(zip(windows, weights)):
            # Copy the shifted window into the contiguous scratch buffer,
            # then view it as a matrix for the product.
            patch = patch.reshape(shape3d)
            patch[...] = src[row_sel, col_sel, :]
            patch = patch.reshape(shape2d)
            if index == 0:
                acc = lg.dot(patch, weight)
            else:
                lg.add_dot(patch, weight, acc)
        return acc

    img = gp.to_gpu(img)
    sel_patches = im2col(img, _i2c2)  # taken before the log transform
    cm.log(img + _eps, out=img)
    interp_patches = im2col(img, _i2c1)  # taken from the log-domain image

    wA, hA = interp_patches.shape[0], interp_patches.shape[1]
    wB, hB = sel_patches.shape[0], sel_patches.shape[1]

    # Path 1
    hidden = interp_patches.reshape([wA * hA, 576])
    hidden = lg.dot(hidden, _wts.int1)
    cm.exp(hidden, out=hidden)

    hidden = hidden.reshape([wA * hA * 64, 3 * _ofac])
    candidates = lg.dot(hidden, _wts.int2)
    msc.add_matvec(candidates, _wts.int2b, out=candidates)
    candidates = candidates.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    feat = sel_patches.reshape([wB * hB, 64])
    feat = lg.dot(feat, _wts.c1)
    msc.add_matvec(feat, _wts.c1b, out=feat)
    gp.maximum(feat, 0., feat)
    feat = feat.reshape([wB, hB, _numsel])

    # conv2
    feat = _shifted_matmul_sum(
        feat, wB - 1, hB - 1, (_wts.c200, _wts.c201, _wts.c210, _wts.c211)
    )
    msc.add_matvec(feat, _wts.c2b, out=feat)
    gp.maximum(feat, 0., feat)
    feat = feat.reshape([wB - 1, hB - 1, _numsel])

    # conv 3
    feat = _shifted_matmul_sum(
        feat, wB - 2, hB - 2, (_wts.c300, _wts.c301, _wts.c310, _wts.c311)
    )
    msc.add_matvec(feat, _wts.c3b, out=feat)
    gp.maximum(feat, 0., feat)

    gates = lg.dot(feat, _wts.sout)

    msc.add_matvec(gates, _wts.soutb, out=gates)
    gp.maximum(gates, 0., gates)
    gates = gates.reshape(candidates.shape)

    # Combine
    candidates *= gates
    combined = msc.sum(candidates, axis=1)
    gp.maximum(combined, 0., combined)
    gp.minimum(combined, 1., combined)
    combined = combined.reshape([wA, hA, 64 * 3])

    return p2im(combined.get())
Ejemplo n.º 27
0
 def log_t(self, a, out):
     """Compute `cumath.log(a)` and store the result in `out`.

     The result is written through the `out` argument; nothing is returned.
     """
     cumath.log(a, out=out)
Ejemplo n.º 28
0
def cdmd(a_gpu,
         k=None,
         c=None,
         modes='exact',
         return_amplitudes=False,
         return_vandermonde=False,
         handle=None):
    """
    Compressed Dynamic Mode Decomposition.

    Dynamic Mode Decomposition (DMD) is a data processing algorithm which
    allows to decompose a matrix `a` in space and time.
    The matrix `a` is decomposed as `a = FBV`, where the columns of `F`
    contain the dynamic modes. The modes are ordered corresponding
    to the amplitudes stored in the diagonal matrix `B`. `V` is a Vandermonde
    matrix describing the temporal evolution.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Real/complex input matrix  `a` with dimensions `(m, n)`.
    k : int, optional
        If `k < (n-1)` low-rank Dynamic Mode Decomposition is computed.
        Defaults to `n-1`.
    c : int, optional
        Number of measurement sensors, i.e. the number of rows of the
        compressed matrix obtained by multiplying `a` with a random
        `(c, m)` sensing matrix. If None, no compression is applied and
        the input matrix itself is used (`c = m`).
    modes : `{'exact'}`
        'exact' : computes the exact dynamic modes, `F = Y * V * (S**-1) * W`.
    return_amplitudes : bool `{True, False}`
        True: return amplitudes in addition to dynamic modes.
    return_vandermonde : bool `{True, False}`
        True: return Vandermonde matrix in addition to dynamic modes and amplitudes.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `skcuda.misc._global_cublas_handle` is used.

    Returns
    -------
    f_gpu : pycuda.gpuarray.GPUArray
        Matrix containing the dynamic modes of shape `(m, n-1)`  or `(m, k)`.
    b_gpu : pycuda.gpuarray.GPUArray
        1-D array containing the amplitudes of length `min(n-1, k)`.
        Only returned if `return_amplitudes` is True.
    v_gpu : pycuda.gpuarray.GPUArray
        Vandermonde matrix of shape `(n-1, n-1)`  or `(k, n-1)`.
        Only returned if `return_vandermonde` is True.
    omega_gpu : pycuda.gpuarray.GPUArray
        Element-wise logarithm of the eigenvalues of the reduced operator.
        Always returned as the last element of the result tuple.

    Raises
    ------
    NotImplementedError
        If CULA is not installed.
    ValueError
        If the dtype of `a_gpu` is unsupported, if `k` is outside
        `[1, n-1]`, or if `modes` is not a supported mode.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    Arrays are assumed to be stored in column-major order, i.e., order='F'.

    References
    ----------
    S. L. Brunton, et al.
    "Compressed sampling and dynamic mode decomposition."
    arXiv preprint arXiv:1312.5186 (2013).

    J. H. Tu, et al.
    "On dynamic mode decomposition: theory and applications."
    arXiv preprint arXiv:1312.0041 (2013).

    Examples
    --------
    >>> #Numpy
    >>> import numpy as np
    >>> #Plot libs
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.mplot3d import Axes3D
    >>> from matplotlib import cm
    >>> #GPU DMD libs
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> from skcuda import linalg, rlinalg
    >>> linalg.init()
    >>> rlinalg.init()

    >>> # Define time and space discretizations
    >>> x=np.linspace( -15, 15, 200)
    >>> t=np.linspace(0, 8*np.pi , 80)
    >>> dt=t[2]-t[1]
    >>> X, T = np.meshgrid(x,t)
    >>> # Create two spatio-temporal patterns
    >>> F1 = 0.5* np.cos(X)*(1.+0.* T)
    >>> F2 = ( (1./np.cosh(X)) * np.tanh(X)) *(2.*np.exp(1j*2.8*T))
    >>> # Add both signals
    >>> F = (F1+F2)

    >>> #Plot dataset
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(231, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=True)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F')
    >>> ax = fig.add_subplot(232, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F1, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1')
    >>> ax = fig.add_subplot(233, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X, T, F2, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2')


    >>> #Dynamic Mode Decomposition
    >>> F_gpu = np.array(F.T, np.complex64, order='F')
    >>> F_gpu = gpuarray.to_gpu(F_gpu)
    >>> Fmodes_gpu, b_gpu, V_gpu, omega_gpu = rlinalg.cdmd(F_gpu, k=2, c=20, modes='exact', return_amplitudes=True, return_vandermonde=True)
    >>> omega = omega_gpu.get()
    >>> plt.scatter(omega.real, omega.imag, marker='o', c='r')

    >>> #Recover original signal
    >>> F1tilde = np.dot(Fmodes_gpu[:,0:1].get() , np.dot(b_gpu[0].get(), V_gpu[0:1,:].get() ) )
    >>> F2tilde = np.dot(Fmodes_gpu[:,1:2].get() , np.dot(b_gpu[1].get(), V_gpu[1:2,:].get() ) )

    >>> # Plot DMD modes
    >>> #Mode 0
    >>> ax = fig.add_subplot(235, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F1tilde.shape[1],:], T[0:F1tilde.shape[1],:], F1tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F1_tilde')
    >>> #Mode 1
    >>> ax = fig.add_subplot(236, projection='3d')
    >>> ax = fig.gca(projection='3d')
    >>> surf = ax.plot_surface(X[0:F2tilde.shape[1],:], T[0:F2tilde.shape[1],:], F2tilde.T, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
    >>> ax.set_zlim(-1, 1)
    >>> plt.title('F2_tilde')
    >>> plt.show()
    """

    #*************************************************************************
    #***        Author: N. Benjamin Erichson <*****@*****.**>        ***
    #***                              <2015>                               ***
    #***                       License: BSD 3 clause                       ***
    #*************************************************************************

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    if handle is None:
        handle = misc._global_cublas_handle

    alloc = misc._global_cublas_allocator

    # The free version of CULA only supports single precision floating point
    data_type = a_gpu.dtype.type
    real_type = np.float32

    # Select the CULA/CUBLAS routines matching the input dtype.
    if data_type == np.complex64:
        cula_func_gesvd = cula.culaDeviceCgesvd
        cublas_func_gemm = cublas.cublasCgemm
        cublas_func_dgmm = cublas.cublasCdgmm
        cula_func_gels = cula.culaDeviceCgels
        copy_func = cublas.cublasCcopy
        transpose_func = cublas.cublasCgeam
        alpha = np.complex64(1.0)
        beta = np.complex64(0.0)
        TRANS_type = 'C'
        isreal = False
    elif data_type == np.float32:
        cula_func_gesvd = cula.culaDeviceSgesvd
        cublas_func_gemm = cublas.cublasSgemm
        cublas_func_dgmm = cublas.cublasSdgmm
        cula_func_gels = cula.culaDeviceSgels
        copy_func = cublas.cublasScopy
        transpose_func = cublas.cublasSgeam
        alpha = np.float32(1.0)
        beta = np.float32(0.0)
        TRANS_type = 'T'
        isreal = True
    else:
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func_gesvd = cula.culaDeviceZgesvd
                cublas_func_gemm = cublas.cublasZgemm
                cublas_func_dgmm = cublas.cublasZdgmm
                cula_func_gels = cula.culaDeviceZgels
                copy_func = cublas.cublasZcopy
                transpose_func = cublas.cublasZgeam
                alpha = np.complex128(1.0)
                beta = np.complex128(0.0)
                TRANS_type = 'C'
                isreal = False
            elif data_type == np.float64:
                cula_func_gesvd = cula.culaDeviceDgesvd
                cublas_func_gemm = cublas.cublasDgemm
                cublas_func_dgmm = cublas.cublasDdgmm
                cula_func_gels = cula.culaDeviceDgels
                copy_func = cublas.cublasDcopy
                transpose_func = cublas.cublasDgeam
                alpha = np.float64(1.0)
                beta = np.float64(0.0)
                TRANS_type = 'T'
                isreal = True
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    #CUDA assumes that arrays are stored in column-major order
    m, n = np.array(a_gpu.shape, int)
    nx = n - 1
    #Set k
    if k is None:
        k = nx
    if k > nx or k < 1:
        raise ValueError('k is not valid')

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compress
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if c is None:
        #No compression requested: operate directly on the input matrix.
        Ac_gpu = a_gpu
        c = m
    else:
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #Generate a random sensing matrix S
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        if not isreal:
            Simag_gpu = gpuarray.empty((m, c),
                                       real_type,
                                       order="F",
                                       allocator=alloc)
            Sreal_gpu = gpuarray.empty((m, c),
                                       real_type,
                                       order="F",
                                       allocator=alloc)
            rand.fill_uniform(Simag_gpu)
            rand.fill_uniform(Sreal_gpu)
            #Combine the two real draws into one complex matrix, then
            #transpose to (c, m) and rescale.
            S_gpu = Sreal_gpu + 1j * Simag_gpu
            S_gpu = S_gpu.T * 2 - 1  #Scale to [-1,1]
        else:
            S_gpu = gpuarray.empty((c, m),
                                   real_type,
                                   order="F",
                                   allocator=alloc)
            #Draw uniform random samples (rescaled below)
            rand.fill_uniform(S_gpu)
            S_gpu = S_gpu * 2 - 1  #Scale to [-1,1]

        #Allocate Ac
        Ac_gpu = gpuarray.empty((c, n), data_type, order="F", allocator=alloc)

        #Compress input matrix: Ac = S * a
        cublas_func_gemm(handle, 'n', 'n', c, n, m, alpha,
                         int(S_gpu.gpudata), c, int(a_gpu.gpudata), m, beta,
                         int(Ac_gpu.gpudata), c)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Split data into left and right snapshot sequence
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Note: we need a copy of X_gpu, because SVD destroys X_gpu
    #While Y_gpu is just a pointer
    X_gpu = gpuarray.empty((c, n), data_type, order="F", allocator=alloc)
    copy_func(handle, X_gpu.size, int(Ac_gpu.gpudata), 1, int(X_gpu.gpudata),
              1)
    X_gpu = X_gpu[:, :nx]
    Y_gpu = Ac_gpu[:, 1:]
    Yorig_gpu = a_gpu[:, 1:]

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Singular Value Decomposition
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Allocate s, U, Vt for economic SVD
    #Note: singular values are always real
    min_s = min(nx, c)
    s_gpu = gpuarray.zeros(min_s, real_type, order="F", allocator=alloc)
    U_gpu = gpuarray.zeros((c, min_s), data_type, order="F", allocator=alloc)
    Vh_gpu = gpuarray.zeros((min_s, nx), data_type, order="F", allocator=alloc)

    #Economic SVD
    cula_func_gesvd('S', 'S', c, nx, int(X_gpu.gpudata), c, int(s_gpu.gpudata),
                    int(U_gpu.gpudata), c, int(Vh_gpu.gpudata), min_s)

    #Low-rank DMD: truncate SVD if k < nx
    if k != nx:
        s_gpu = s_gpu[:k]
        U_gpu = U_gpu[:, :k]
        Vh_gpu = Vh_gpu[:k, :]

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Solve the LS problem to find estimate for M using the pseudo-inverse
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #real: M = U.T * Y * Vt.T * S**-1
    #complex: M = U.H * Y * Vt.H * S**-1
    #Let G = Y * Vt.H * S**-1, hence M = U.H * G

    #Allocate G and M
    G_gpu = gpuarray.zeros((c, k), data_type, order="F", allocator=alloc)
    M_gpu = gpuarray.zeros((k, k), data_type, order="F", allocator=alloc)

    #i) s = s **-1 (inverse)
    s_gpu = 1 / s_gpu
    if not isreal:
        #dgmm needs the scaling vector in the same (complex) dtype as Vs
        s_gpu = s_gpu + 1j * gpuarray.zeros_like(s_gpu)

    #ii) real/complex: scale Vs =  Vt* x diag(s**-1)
    Vs_gpu = gpuarray.zeros((nx, k), data_type, order="F", allocator=alloc)
    #Leading dimensions come from the strides because Vh_gpu may be a
    #truncated view of a larger buffer.
    lda = max(1, Vh_gpu.strides[1] // Vh_gpu.dtype.itemsize)
    ldb = max(1, Vs_gpu.strides[1] // Vs_gpu.dtype.itemsize)
    transpose_func(handle, TRANS_type, TRANS_type, nx, k, 1.0,
                   int(Vh_gpu.gpudata), lda, 0.0, int(Vh_gpu.gpudata), lda,
                   int(Vs_gpu.gpudata), ldb)
    #End Transpose

    cublas_func_dgmm(handle, 'r', nx, k, int(Vs_gpu.gpudata), nx,
                     int(s_gpu.gpudata), 1, int(Vs_gpu.gpudata), nx)

    #iii) real: G = Y * Vs , complex: G = Y x Vs
    cublas_func_gemm(handle, 'n', 'n', c, k, nx, alpha, int(Y_gpu.gpudata), c,
                     int(Vs_gpu.gpudata), nx, beta, int(G_gpu.gpudata), c)

    #iv) real/complex: M = U* x G
    cublas_func_gemm(handle, TRANS_type, 'n', k, k, c, alpha,
                     int(U_gpu.gpudata), c, int(G_gpu.gpudata), c, beta,
                     int(M_gpu.gpudata), k)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Eigen Decomposition
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Note: If a_gpu is real the imag part is omitted
    Vr_gpu, w_gpu = linalg.eig(M_gpu, 'N', 'V', 'F', lib='cula')

    omega = cumath.log(w_gpu)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute DMD Modes
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    F_gpu = gpuarray.empty((m, k), data_type, order="F", allocator=alloc)
    modes = modes.lower()
    if modes == 'exact':  #Compute (exact) DMD modes: F = Y * V * S**-1 * W = G * W

        #Vs <- Vs * W
        #NOTE(review): the output buffer aliases the first input operand
        #here; confirm that the gemm routine tolerates this overlap.
        cublas_func_gemm(handle, 'n', 'n', nx, k, k, alpha,
                         int(Vs_gpu.gpudata), nx, int(Vr_gpu.gpudata), k, beta,
                         int(Vs_gpu.gpudata), nx)

        #F = Yorig * Vs, using the uncompressed right snapshot sequence
        cublas_func_gemm(handle, 'n', 'n', m, k, nx, alpha,
                         int(Yorig_gpu.gpudata), m, int(Vs_gpu.gpudata), nx,
                         beta, int(F_gpu.gpudata), m)

    else:
        raise ValueError(
            'Type of modes is not supported, choose "exact" or "standard".')

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute amplitudes b using least-squares: Fb=x1
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_amplitudes:
        F_gpu_temp = gpuarray.empty((m, k),
                                    data_type,
                                    order="F",
                                    allocator=alloc)

        #Copy is required, because gels destroys input
        copy_func(handle, F_gpu.size, int(F_gpu.gpudata), 1,
                  int(F_gpu_temp.gpudata), 1)

        #x1_gpu = a_gpu[:,0].copy()
        x1_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc)
        copy_func(handle, x1_gpu.size, int(a_gpu[:, 0].gpudata), 1,
                  int(x1_gpu.gpudata), 1)
        #gels overwrites x1_gpu with the solution; its first k entries are b
        cula_func_gels('N', m, k, int(1), int(F_gpu_temp.gpudata), m,
                       int(x1_gpu.gpudata), m)
        b_gpu = x1_gpu

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Compute Vandermonde matrix
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if return_vandermonde:
        V_gpu = linalg.vander(w_gpu, n=nx)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #Return: modes first, optional amplitudes / Vandermonde, omega last
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    result = [F_gpu]
    if return_amplitudes:
        result.append(b_gpu[:k])
    if return_vandermonde:
        result.append(V_gpu)
    result.append(omega)
    return tuple(result)
Ejemplo n.º 29
0
def cross_entropy_logistic(x, y):
    """Return the summed binary cross-entropy between `x` and `y`.

    Computes ``-sum(y * log(x + eps) + (1 - y) * log(1 - x + eps))``; the
    module-level `eps` is added inside both logarithms. The value returned
    is whatever `gpuarray.sum` produces, negated (it is not downloaded to
    the host here).
    """
    positive_term = y * cumath.log(x + eps)
    negative_term = (1. - y) * cumath.log(1. - x + eps)
    return -gpuarray.sum(positive_term + negative_term)
Ejemplo n.º 30
0
def random_normal(loc=0.0, scale=1.0, size=None):
    """Draw normally distributed samples via the Box-Muller transform.

    Two float64 uniform arrays of shape `size` are drawn with
    `curandom.rand` and combined as ``sqrt(-2 log u1) * cos(2 pi u2)``;
    the result is scaled by `scale`, shifted by `loc` and wrapped in a
    `CUDAArray`.
    """
    radial_uniform = curandom.rand(size, dtype=numpy.float64)
    angular_uniform = curandom.rand(size, dtype=numpy.float64)

    radius = cumath.sqrt(-2. * cumath.log(radial_uniform))
    angle = 2. * numpy.pi * angular_uniform
    standard_normal = radius * cumath.cos(angle)

    return CUDAArray(scale * standard_normal + loc)
Ejemplo n.º 31
0
 def log(self):
     """Return a new `CUDAArray` wrapping `cumath.log` applied to `self.arr`."""
     return CUDAArray(cumath.log(self.arr))
Ejemplo n.º 32
0
def cauchy_prior_log_den(beta):
    """Return ``sum(-log(1 + beta**2))`` over all elements of `beta`.

    The element-wise terms are computed with `cumath.log`, reduced with
    `gpuarray.sum`, and the result is fetched with `.get()`.
    """
    squared = beta * beta
    log_density_terms = -cumath.log(1 + squared)
    total = gpuarray.sum(log_density_terms)
    return total.get()