Example #1
def correlations(X, Y, useGPU):
    if useGPU:
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.linalg as linalg
        linalg.init()

        X_gpu = gpuarray.to_gpu(X)
        XT_gpu = linalg.transpose(X_gpu)
        cxx = linalg.mdot(XT_gpu, X_gpu).get()

        # XT_gpu computed above is reused here; transposing X again would be redundant
        X_gpu.gpudata.free()
        del X_gpu
        Y_gpu = gpuarray.to_gpu(Y)
        cxy = linalg.mdot(XT_gpu, Y_gpu).get()

        cyx = cxy.T

        YT_gpu = linalg.transpose(Y_gpu)
        cyy = linalg.mdot(YT_gpu, Y_gpu).get()
    else:
        cxx = np.dot(X.T, X)
        cxy = np.dot(X.T, Y)
        cyx = cxy.T
        cyy = np.dot(Y.T, Y)

    return cxx, cxy, cyx, cyy
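A minimal usage sketch (not part of the original source), assuming numpy is imported as np and that X and Y are float32 arrays with the same number of rows; pass useGPU=True only when PyCUDA and scikit-cuda are installed and a CUDA device is available:

import numpy as np

X = np.random.rand(1000, 8).astype(np.float32)
Y = np.random.rand(1000, 3).astype(np.float32)

# CPU path shown; switch to useGPU=True to exercise the skcuda branch
cxx, cxy, cyx, cyy = correlations(X, Y, useGPU=False)
print(cxx.shape, cxy.shape, cyx.shape, cyy.shape)   # (8, 8) (8, 3) (3, 8) (3, 3)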
Example #2
def NNMF_gpu(X,r,tol,V=v0,W=w0,verbose=1):
    Vr = V[:,0:r].copy()
    Wr = W[0:r,:].copy()
    X_gpu = gpuarray.to_gpu(X)
    V_gpu = gpuarray.to_gpu(Vr)
    W_gpu = gpuarray.to_gpu(Wr)
    # Frobenius norm at the previous step
    B_gpu = linalg.dot(V_gpu, W_gpu)
    L = linalg.norm(X_gpu-B_gpu)**2
    iteration = 0
    while 1: #update V
        V_gpu *= linalg.dot(X_gpu,linalg.transpose(W_gpu))
        V_gpu /= linalg.dot(B_gpu,linalg.transpose(W_gpu))
        B_gpu = linalg.dot(V_gpu, W_gpu)
        #update W
        W_gpu *= linalg.dot(linalg.transpose(V_gpu),X_gpu)
        W_gpu /= linalg.dot(linalg.transpose(V_gpu),B_gpu)
        B_gpu = linalg.dot(V_gpu, W_gpu)
        Lnew = linalg.norm(X_gpu-B_gpu)**2
        if abs(Lnew-L) <= tol*(L+1):
            break
        else:
            L = Lnew
            iteration += 1
            if(verbose and iteration%50==0):
                print "At iteration %i, the loss is %.2f" %(iteration, L)
    return V_gpu,W_gpu,iteration
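A hypothetical driver for NNMF_gpu (the module-level v0/w0 defaults are not shown in this snippet, so explicit initial factors are passed); it assumes a CUDA device and nonnegative float32 inputs:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg
linalg.init()

rank = 10
X = np.random.rand(256, 64).astype(np.float32)
V_init = np.random.rand(256, rank).astype(np.float32)
W_init = np.random.rand(rank, 64).astype(np.float32)

V_gpu, W_gpu, n_iter = NNMF_gpu(X, rank, 1e-4, V=V_init, W=W_init, verbose=1)
print("stopped after %d iterations" % n_iter)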
Example #3
def getTranformada(test_image, diagonal):
    # multiply each row by the diagonal matrix
    test_image = test_image.astype(np.float32)
    diagonal = diagonal.astype(np.float32)
    test_image = gpuarray.to_gpu(test_image)
    diagonal = gpuarray.to_gpu(diagonal)
    testimage_gpu = linalg.dot(test_image, diagonal)
    testimageT_gpu = linalg.transpose(testimage_gpu)
    testimage_gpu = linalg.dot(testimageT_gpu, diagonal)
    testimageT_gpu = linalg.transpose(testimage_gpu)
    return testimageT_gpu.get()
Example #4
def getTranformada_Inversa(test_image, diagonal):
    test_image = test_image.astype(np.float32)
    diagonal = diagonal.astype(np.float32)
    test_image = gpuarray.to_gpu(test_image)
    diagonal = gpuarray.to_gpu(diagonal)
    test_image_gpuT = linalg.transpose(test_image)
    testimage_gpu = linalg.dot(test_image_gpuT, diagonal)
    test_image_gpuT = linalg.transpose(testimage_gpu)
    testimage_gpu = linalg.dot(test_image_gpuT, diagonal)
    return testimage_gpu.get()
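Tracing the dot/transpose chain in the two routines above shows that both evaluate the same two-sided product diagonal.T @ test_image @ diagonal; the intermediate transposes only reorder the operands. A quick equivalence check against NumPy (hypothetical; requires an initialized CUDA context, i.e. pycuda.autoinit imported and linalg.init() called, and a square test_image):

import numpy as np

M = np.random.rand(4, 4).astype(np.float32)
D = np.random.rand(4, 4).astype(np.float32)
print(np.allclose(getTranformada(M, D), D.T @ M @ D, atol=1e-4))
print(np.allclose(getTranformada_Inversa(M, D), D.T @ M @ D, atol=1e-4))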
Example #7
 def test_transpose_float64(self):
     # M < N
     a = np.array([[1, 2, 3, 4, 5, 6],
                   [7, 8, 9, 10, 11, 12]],
                  np.float64)
     a_gpu = gpuarray.to_gpu(a)
     at_gpu = linalg.transpose(a_gpu)
     assert np.all(a.T == at_gpu.get())
     # M > N
     b = a.T.copy()
     b_gpu = gpuarray.to_gpu(b)
     bt_gpu = linalg.transpose(b_gpu)
     assert np.all(b.T == bt_gpu.get())
Example #8
 def test_transpose_complex128(self):
     # M < N
     a = np.array([[1j, 2j, 3j, 4j, 5j, 6j],
                   [7j, 8j, 9j, 10j, 11j, 12j]],
                  np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     at_gpu = linalg.transpose(a_gpu)
     assert np.all(a.T == at_gpu.get())
     # M > N
     b = a.T.copy()
     b_gpu = gpuarray.to_gpu(b)
     bt_gpu = linalg.transpose(b_gpu)
     assert np.all(b.T == bt_gpu.get())
Example #9
 def compute_analysis_cuda2(self,
                            xb,
                            y,
                            R,
                            P,
                            H,
                            HT=None,
                            hph=None,
                            calcP=True):
     if HT is None:
         HT = culinalg.transpose(H)
     HP = culinalg.dot(H, P)
     if hph is None:
         hph = culinalg.dot(HP, HT)
     Rhph = misc.add(R, hph)
     inv = culinalg.inv(Rhph)
     W = culinalg.dot(HP, inv, transa='T')
     Hxb = culinalg.dot(H, xb)
     yHxb = misc.subtract(y, Hxb)
     WyHxb = culinalg.dot(W, yHxb)
     xhat = misc.add(xb, WyHxb)
     #xhat = xb + culinalg.dot(W, (y - culinalg.dot(H, xb)))
     if calcP:
         I = culinalg.eye(P.shape[0])
         WH = culinalg.dot(W, H)
         IWH = I - WH
         Phat = culinalg.dot(IWH, P)
     else:
         Phat = misc.zeros((1, ), dtype=P.dtype)
     return xhat, Phat
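For reference, the chain of culinalg calls above is the standard Kalman analysis (update) step: the gain is W = P H^T (H P H^T + R)^-1 (the transa='T' in the dot turns H P into P H^T, assuming P is symmetric), the analysed state is xhat = xb + W (y - H xb), and the optional covariance update is Phat = (I - W H) P.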
Example #10
def dot3(A, b):
    ''' Calculates matrix multiplication "b.T*A*b" on GPU. '''
    #print("dot3 "+str(A.shape)+" "+str(b.shape))
    
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    
    temp_gpu = linalg.dot(A_gpu, b_gpu)
    
    A_gpu.gpudata.free()
    del(A_gpu)
    
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
        
    #remove b
    b_gpu.gpudata.free()
    del(b_gpu)
    
    out_gpu = linalg.dot(bt_gpu, temp_gpu)
    
    return out_gpu.get()
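A small sanity check for dot3 (hypothetical; assumes pycuda.autoinit has been imported and skcuda.linalg.init() has been called so that gpuarray and linalg are usable in the module that defines dot3):

import numpy as np

A = np.random.rand(5, 5).astype(np.float32)
b = np.random.rand(5, 2).astype(np.float32)

gpu_result = dot3(A, b)          # b.T * A * b computed on the GPU
cpu_result = b.T @ A @ b         # CPU reference
print(np.allclose(gpu_result, cpu_result, atol=1e-4))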
Example #11
    def get_probabilities(self, batch):

        lookup_table_gpu = gpuarray.to_gpu(self.lookup_table)
        probs = []

        for i in range(batch.shape[0]):
            batch_gpu = gpuarray.to_gpu(batch[i])
            batch_T_gpu = linalg.transpose(batch_gpu)
            res_gpu = linalg.transpose(
                linalg.dot(lookup_table_gpu, batch_T_gpu))
            res = np.argmax(res_gpu.get(), axis=-1)
            probs.append(res)

        probs = np.expand_dims(np.asarray(probs), axis=-1)

        return probs
Example #12
    def forward(self, bottom, top):
        #        print 'hanli crf forward -- '
        #        print 'self.diff.shape: ' + str(self.diff.shape);  # self.diff.shape: (batchsize, 65536)
        #        print 'crf bottom[0].data.shape: ' + str(bottom[0].data.shape); #crf bottom[0].data.shape: (batchsize, 11)
        #        print 'raw degree bottom[1].data.shape: ' + str(bottom[1].data.shape);  #(batchsize, 65536, 11)
        #        print 'png bottom[2].data.shape: ' + str(bottom[2].data.shape);  # (batchsize, 65536)
        #        print 'np.dot(bottom[1].data[i,:,:], bottom[0].data[i,:]).shape: ' + str(np.dot(bottom[1].data[0,:,:], bottom[0].data[0,:]).shape); #(65536,)
        #        print 'bottom[2].data[i,:].shape: ' + str(bottom[2].data[0,:].shape);  # (65536,)
        with pu.caffe_cuda_context():
            linalg.init()
            for i in range(self.diff.shape[0]):
                #a =  bottom[1].data_as_pycuda_gpuarray()
                #b =  bottom[0].data_as_pycuda_gpuarray()
                a = bottom[1].data[i, :, :].astype(np.float32)
                b = bottom[0].data[i, :].astype(np.float32)
                ##a = np.asarray(np.random.rand(4, 4), dtype=np.float32)
                ##b = np.asarray(np.random.rand(4), dtype=np.float32)

                #a_gpu = gpuarray.GPUArray(a, dtype=np.float32)
                #b_gpu = gpuarray.GPUArray(b, dtype=np.float32)
                a_gpu = gpuarray.to_gpu(a)
                b_gpu = gpuarray.to_gpu(b)
                c_gpu = linalg.dot(a_gpu, b_gpu)
                #self.diff[i,:] = c_gpu + bottom[2].data[i,:] - bottom[3].data[i,:];
                self.diff[i, :] = np.dot(
                    bottom[1].data[i, :, :], bottom[0].data[
                        i, :]) + bottom[2].data[i, :] - bottom[3].data[i, :]
            top[0].data[...] = np.sum(self.diff**2) / bottom[3].num / 2.
            #self.transDiff = np.transpose(self.diff / bottom[3].num); # (65536, 50)
            a_gpu = gpuarray.to_gpu(self.diff / bottom[3].num)
            at_gpu = linalg.transpose(a_gpu)
            self.transDiff = at_gpu
Example #13
class GPUArrayBox(Box):
    __slots__ = []
    __array_priority__ = 100.0

    @primitive
    def __getitem__(A, idx): return A[idx]

    shape = property(lambda self: self._value.shape)
    ndim  = property(lambda self: self._value.ndim)
    size  = property(lambda self: self._value.size)
    dtype = property(lambda self: self._value.dtype)
    T = property(lambda self: culinalg.transpose(self))
    flags = property(lambda self: self._value.flags)
    get = property(lambda self: self._value.get)
    def __len__(self): return len(self._value)
    def astype(self, *args, **kwargs): return self._value.astype(*args, **kwargs)

    def __neg__(self): return anp.negative(self)
    def __add__(self, other): return cumisc.add(self, other)
    def __sub__(self, other): return cumisc.subtract(self, other)
    def __mul__(self, other): return cumisc.multiply(self, other)
    def __div__(self, other): return cumisc.divide(  self, other)
    def __matmul__(self, other): return culinalg.dot(self, other)
    def __radd__(self, other): return cumisc.add(other, self)
    def __rsub__(self, other): return cumisc.subtract(other, self)
    def __rmul__(self, other): return cumisc.multiply(other, self)
    def __rdiv__(self, other): return cumisc.divide(other, self)
    def __rmatmul__(self, other): return culinalg.dot(other, self)
    def __hash__(self): return id(self)
Example #14
def dot3(A, b):
    ''' Calculates matrix multiplication "b.T*A*b" on GPU. 
        A has to be nxn. '''
    #print("dot3 "+str(A.shape)+" "+str(b.shape))
    
    # Make sure we don't run out of memory on the GPU
    if ((A.size + 2*b.size) <= 629088256):
        
        # send A to GPU    
        A_gpu = gpuarray.to_gpu(A)
        
        # send b to GPU
        b_gpu = gpuarray.to_gpu(b)
        
        temp_gpu = linalg.dot(A_gpu, b_gpu)
        
        A_gpu.gpudata.free()
        del(A_gpu)
        
        # transpose b on GPU
        bt_gpu = linalg.transpose(b_gpu)
            
        #remove b
        b_gpu.gpudata.free()
        del(b_gpu)
        
        out_gpu = linalg.dot(bt_gpu, temp_gpu)
        
        return out_gpu.get()
    
    else:
        print("Too big for GPU, using CPU.")
        return np.dot(np.dot(b.T, A), b)
Example #15
def cuda_dot3(A, b):
    print("cuda_dot3", A.shape, b.shape)
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
    #remove b for now
    b_gpu.gpudata.free()
    del(b_gpu)
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    temp_gpu = linalg.dot(bt_gpu, A_gpu)
    
    bt_gpu.gpudata.free()
    del(bt_gpu)
    A_gpu.gpudata.free()
    del(A_gpu)
    
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    
    c_gpu = linalg.dot(temp_gpu, b_gpu)
    
    temp_gpu.gpudata.free()
    del(temp_gpu)
    b_gpu.gpudata.free()
    del(b_gpu)
        
    #theoretically possible to move into RAM, force cleanup on GPU and then return from RAM
    #but most likely not necessary
    return c_gpu.get()
Example #16
def kernel_lin(A, B, C, transa='N'):
    func = kern_lin
    if A.dtype == np.float64:
        func = Dkern_lin
    if transa == 'T':
        func(linalg.transpose(A), B, C)
    else:
        func(A, B, C)
Example #17
def cuda_T(a):
    a_gpu = gpuarray.to_gpu(a)
    at_gpu = linalg.transpose(a_gpu)
    
    a_gpu.gpudata.free()
    del(a_gpu)
    
    return at_gpu.get()
Example #18
def T(a):
    ''' Transposes matrix "a" on the GPU, falling back to the CPU on any error. '''
    try:
        a_gpu = gpuarray.to_gpu(a)
        at_gpu = linalg.transpose(a_gpu)
        return at_gpu.get()
    except:
        print("Using CPU for Transpose.")
        return np.matrix(a.T, copy=False)
Example #19
def T(a):
    ''' Transposes matrix "a" on the GPU. '''
    a_gpu = gpuarray.to_gpu(a)
    at_gpu = linalg.transpose(a_gpu)
    
    #a_gpu.gpudata.free()
    #del(a_gpu)
    
    return at_gpu.get()
def getCSMGPU(XG, YG):
    tbegin = time.time()
    GPUNeg2 = gpuarray.to_gpu(np.array([-2.0], dtype=np.float32))
    YGT = linalg.transpose(YG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    YSqr = skcuda.misc.multiply(YG, YG)
    YSqr = skcuda.misc.sum(YSqr, 1)
    C = linalg.dot(XG, YGT)
    C = skcuda.misc.multiply(GPUNeg2, C)
    skcuda.misc.add_matvec(C, XSqr, 0, C)
    skcuda.misc.add_matvec(C, YSqr, 1, C)
    return C
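getCSMGPU builds an all-pairs squared-Euclidean (cross-similarity) matrix using the expansion ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2. A CPU reference for comparison (a sketch, assuming X is M x d and Y is N x d, with the result laid out M x N):

import numpy as np

def getCSM_cpu(X, Y):
    XSqr = np.sum(X ** 2, axis=1)[:, None]   # (M, 1) squared row norms
    YSqr = np.sum(Y ** 2, axis=1)[None, :]   # (1, N) squared row norms
    return XSqr - 2.0 * X.dot(Y.T) + YSqr    # (M, N) squared distances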
Example #22
 def transpose(self):
     if self.device == 'cuda':
         data = linalg.transpose(self.data)
     else:
         data = self.data.transpose()
     if self.autograd:
         return Tensor(
             data=data,
             autograd=True,
             creators=[self],
             creation_op="transpose",
             device=self.device,
         )
     return Tensor(
         data=data,
         device=self.device,
     )
Example #23
def cuda_dot2(b, A):
    print("cuda_dot2", b.shape, A.shape)
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    out_gpu = linalg.dot(bt_gpu, A_gpu)
    
    b_gpu.gpudata.free()
    del(b_gpu)
    bt_gpu.gpudata.free()
    del(bt_gpu)
    A_gpu.gpudata.free()
    del(A_gpu)
    
    return out_gpu.get()
def getCSMGPU2(XG, YG):
    #Step 1: Sum of squares across rows
    dim = np.int32(XG.shape[1])
    dimpow2 = roundUpPow2(dim)
    NThreads = np.int32(min(dimpow2, 512))
    XSqr = gpuarray.empty(XG.shape[0], np.float32)
    YSqr = gpuarray.empty(YG.shape[0], np.float32)
    getSumSquares_(XG,
                   XSqr,
                   dim,
                   dimpow2,
                   block=(NThreads, 1, 1),
                   grid=(XG.shape[0], 1),
                   shared=4 * dimpow2)
    getSumSquares_(YG,
                   YSqr,
                   dim,
                   dimpow2,
                   block=(NThreads, 1, 1),
                   grid=(YG.shape[0], 1),
                   shared=4 * dimpow2)

    #Step 2: Do multiplication part
    YGT = linalg.transpose(YG)
    CSM = linalg.dot(XG, YGT)

    #Step 3: Add everything together
    Mp = np.array(XG.shape[0], dtype=np.int32)
    Np = np.array(YG.shape[0], dtype=np.int32)
    MPow2 = roundUpPow2(XG.shape[0])
    NThreads = min(MPow2, 512)
    #CSM is N x M
    finishCSM_(CSM,
               XSqr,
               YSqr,
               Np,
               Mp,
               MPow2,
               block=(NThreads, 1, 1),
               grid=(YG.shape[0], 1))
    return (CSM, XSqr, YSqr)
Example #25
def sorted_eig(X, ascending=True, mode='cpu'):
    if mode == 'cpu':
        e_vals, e_vecs = np.linalg.eig(X)
        idx = np.argsort(e_vals)
        if not ascending:
            idx = idx[::-1]
        e_vecs = e_vecs[:, idx]
        e_vals = e_vals[idx]
        return e_vals, e_vecs
    elif mode == 'gpu':
        import skcuda.linalg as LA
        import pycuda.gpuarray as gpuarray
        e_vecs_gpu, e_vals_gpu = LA.eig(X, 'N', 'V', lib='cusolver')
        e_vals = e_vals_gpu.get()
        idx = np.argsort(e_vals)
        V_gpu = gpuarray.empty((X.shape[0], X.shape[1]), np.float32)
        d = X.shape[0]
        for i in range(d):
            V_gpu[i] = e_vecs_gpu[idx[i]]
        V_gpu = LA.transpose(V_gpu)
        return e_vals, V_gpu
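A quick CPU-mode check of sorted_eig (the 'gpu' branch additionally requires a CUDA context and a scikit-cuda build with cuSOLVER support):

import numpy as np

M = np.diag([1.0, 3.0, 2.0]).astype(np.float32)
vals, vecs = sorted_eig(M, ascending=False, mode='cpu')
print(vals)     # expected: [3., 2., 1.]
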
def logis(y,x):
    end = 0
    start = 0
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    start=time.time()
    # Move the variables to the GPU
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    linalg.init()
    # Transpose of X
    x_gpu_T = linalg.transpose(x_gpu)
    beta_gpu = linalg.dot(linalg.dot(linalg.inv(linalg.dot(x_gpu_T,x_gpu)),x_gpu_T),y_gpu)
    j = 1
    while(True):
        mu = sapply(x,beta_gpu.get())
        mu = mu.astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu)
        V_gpu= linalg.diag(mu_gpu)
        f2_gpu = linalg.multiply(mu_gpu,1-mu_gpu)
        f3_gpu = linalg.diag(1/f2_gpu)
        f4_gpu = (y_gpu-mu_gpu)
        f5_gpu = linalg.dot(f3_gpu,f4_gpu)
        if(np.isnan(f5_gpu.get()).any()):
            f5_cpu = f5_gpu.get()
            f5_cpu = nanValue(f5_cpu)
            f5_gpu = gpuarray.to_gpu(f5_cpu.astype(np.float32))
        y_1_gpu = linalg.dot(x_gpu,beta_gpu) + f5_gpu
        beta_1_gpu = linalg.dot(linalg.dot(linalg.dot(linalg.inv(linalg.dot(linalg.dot(x_gpu_T,V_gpu),x_gpu)),x_gpu_T),V_gpu),y_1_gpu)
        check_value = np.absolute(linalg.norm(beta_1_gpu-beta_gpu))
        #if(check_value<0.00001):
            #break
        if(j == 10 or check_value<0.00001):
            break
        beta_gpu = beta_1_gpu
        j = j + 1
    end = time.time()
    tiempo = (end-start)
    return {"iteraciones":j,"Betas":beta_gpu.get(),"time":tiempo}
Example #28
def dot2(b, A):
    ''' Calculates matrix multiplication "b.T*A" on GPU. '''
    #print("dot2 "+str(b.shape)+" "+str(A.shape))
    
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
    # clear b
    b_gpu.gpudata.free()
    del(b_gpu)
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    out_gpu = linalg.dot(bt_gpu, A_gpu)
    
    #clear
    #bt_gpu.gpudata.free()
    #del(bt_gpu)
    #A_gpu.gpudata.free()
    #del(A_gpu)
    
    return out_gpu.get()
Example #29
def dot2(b, A):
    ''' Calculates matrix multiplication "b.T*A" on GPU. '''
    #print("dot2 "+str(b.shape)+" "+str(A.shape))
    
    
    # Make sure we don't run out of memory on the GPU
    if ((A.size + b.size + A.shape[0]*b.shape[1]) <= 629088256):
        try:
            # send b to GPU
            b_gpu = gpuarray.to_gpu(b)
            # transpose b on GPU
            bt_gpu = linalg.transpose(b_gpu)
            # clear b
            b_gpu.gpudata.free()
            del(b_gpu)
            
            # send A to GPU    
            A_gpu = gpuarray.to_gpu(A)
            
            out_gpu = linalg.dot(bt_gpu, A_gpu)
        except:
            # clear b
            b_gpu.gpudata.free()
            del(b_gpu)
            
            print("Too big for GPU, using CPU.")
            return np.dot(b.T, A)
    else:
        print("Too big for GPU, using CPU.")
        return np.dot(b.T, A)
    #clear
    #bt_gpu.gpudata.free()
    #del(bt_gpu)
    #A_gpu.gpudata.free()
    #del(A_gpu)
    
    return out_gpu.get()
Example #30
def transpose(A):
    return linalg.transpose(A)
Example #31
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import numpy as np

import skcuda.linalg as culinalg
import skcuda.misc as cumisc
culinalg.init()

# Double precision is only supported by devices with compute
# capability >= 1.3:
demo_types = [np.float32, np.complex64]
if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
    demo_types.extend([np.float64, np.complex128])

for t in demo_types:
    print('Testing transpose for type ' + str(np.dtype(t)))
    if np.iscomplexobj(t()):
        a = np.array([[1j, 2j, 3j, 4j, 5j, 6j],
                      [7j, 8j, 9j, 10j, 11j, 12j]], t)
    else:
        a = np.array([[1, 2, 3, 4, 5, 6],
                      [7, 8, 9, 10, 11, 12]], t)
    a_gpu = gpuarray.to_gpu(a)
    at_gpu = culinalg.transpose(a_gpu)
    if np.iscomplexobj(t()):
        print('Success status: ', np.all(np.conj(a.T) == at_gpu.get()))
    else:
        print('Success status: ', np.all(a.T == at_gpu.get()))
Example #32
def update_W_hat_skcuda(W_hat, X_hat, A_t, B_t, x_sum, alpha_sum, eps, t):
    n_hat, k_cluster = W_hat.shape
    # m_dim, _ = X_hat.shape
    W_hat_new = W_hat.copy()
    linalg.init()

    if not isinstance(W_hat_new, gpuarray.GPUArray):
        W_hat_new_gpu = gpuarray.to_gpu(W_hat_new.astype(np.float64))
    else:
        W_hat_new_gpu = W_hat_new

    if not isinstance(X_hat, gpuarray.GPUArray):
        tmp_x = np.ascontiguousarray(X_hat)
        X_hat_gpu = gpuarray.to_gpu(tmp_x.astype(np.float64))
    else:
        X_hat_gpu = X_hat
    # X_hat_T_gpu = gpuarray.to_gpu(X_hat.T.copy().astype(np.float64))
    X_hat_T_gpu = linalg.transpose(X_hat_gpu)

    if not isinstance(A_t, gpuarray.GPUArray):
        A_t_gpu = gpuarray.to_gpu(A_t.astype(np.float64))
    else:
        A_t_gpu = A_t
    A_t_gpu_trans = linalg.transpose(A_t_gpu)

    if not isinstance(B_t, gpuarray.GPUArray):
        B_t_gpu = gpuarray.to_gpu(B_t.astype(np.float64))
    else:
        B_t_gpu = B_t
    B_t_gpu_trans = linalg.transpose(B_t_gpu)

    all_ones_gpu = gpuarray.to_gpu(np.ones((n_hat, 1), dtype=np.float64))

    k = 0
    while True:
        k += 1
        # ipdb.set_trace()
        W_hat_old_gpu = W_hat_new_gpu.copy()
        for j in range(k_cluster):
            T1 = linalg.dot(X_hat_T_gpu, B_t_gpu_trans[j, :].reshape((-1, 1)))
            X_product_gpu = linalg.dot(X_hat_T_gpu, X_hat_gpu)
            T2 = reduce(linalg.dot, (X_product_gpu, W_hat_new_gpu,
                                     A_t_gpu_trans[j, :].reshape(-1, 1)))
            grad_gpu = -T1 + T2
            step_size = 1 / (linalg.norm(X_product_gpu) *
                             linalg.norm(A_t_gpu_trans[j, :]) + 1e-8)
            tmp = -step_size * grad_gpu.reshape(
                (-1)) + W_hat_new_gpu[:, j].copy()

            # u_j_gpu = 1/2 * (tmp + abs(tmp))
            # normalized_u_j_gpu = 1/max(linalg.norm(u_j_gpu), 1) * u_j_gpu

            # u_j_gpu = 1/max(linalg.norm(tmp), 1) * tmp
            # normalized_u_j_gpu = 1/2 * (u_j_gpu + abs(u_j_gpu))
            u_j = geo_projection_to_cvx_cmb(tmp.get())
            normalized_u_j_gpu = gpuarray.to_gpu(u_j.astype(np.float64))

            W_hat_new_gpu[:, j] = normalized_u_j_gpu

        # T1 = linalg.dot(X_hat_T_gpu, B_t_gpu)
        # X_product_gpu = linalg.dot(X_hat_T_gpu, X_hat_gpu)
        # T2 = reduce(linalg.dot, (X_product_gpu, W_hat_new_gpu, A_t_gpu))
        # grad_gpu =  T2 - T1
        # step_size = 1/(linalg.norm(X_product_gpu) * linalg.norm(A_t_gpu) + 1e-8)
        # tmp =  W_hat_new_gpu - step_size * grad_gpu
        # u_gpu = 1/2 * (tmp + abs(tmp))

        # column_sum_gpu = misc.sum(u_gpu, axis = 0).astype(np.float64)
        # # ipdb.set_trace()
        # div_mat_gpu = linalg.dot(all_ones_gpu, column_sum_gpu.reshape((1, -1))) + 1e-8
        # W_hat_new_gpu = u_gpu / div_mat_gpu.astype(np.float64)

        # if k % 50 == 0:
        #     g_val = get_g_hat_value(t, W_hat_new_gpu.get(), X_hat,
        #             A_t, B_t, x_sum, alpha_sum)
        #     print('iteration {}, function value: {:.4f}'.format(k, g_val))

        if (linalg.norm(W_hat_new_gpu - W_hat_old_gpu) < eps) or k >= 10000:
            break

    return W_hat_new_gpu
Example #33
def _sub_kmeans_gpu_custom(X, k):
    import skcuda
    import skcuda.linalg as LA
    import pycuda.driver as cuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import custom_kernels as CC
    LA.init()
    CC.init()

    n, d = X.shape
    X = X.astype(np.float32)
    V_gpu = random_V(d, mode='gpu')

    m = d // 2  # integer subspace dimension

    X_gpu = gpuarray.to_gpu(X)
    mu_D_gpu = CC.column_mean(X_gpu)
    sub_gpu = skcuda.misc.subtract(X_gpu, mu_D_gpu)
    sub_gpu_T = LA.transpose(sub_gpu)
    S_D_gpu = CC.matmul(sub_gpu_T, sub_gpu)
    mu_is_gpu = gpuarray.to_gpu(X[np.random.choice(n, k)])
    itr = 1
    assignment_unchanged = 0
    C_gpu = None
    MAX_ITER = 100

    while itr < MAX_ITER:
        Pc_gpu = projection_matrix(d, m, mode='gpu')
        PcV_gpu = LA.dot(Pc_gpu, V_gpu, transa='T', transb='T')
        PcVmu_is_gpu = gpuarray.empty((k, m), dtype=np.float32)

        for i in range(k):
            PcVmu_is_gpu[i] = LA.dot(PcV_gpu, mu_is_gpu[i][:, None]).ravel()

        global_temp = LA.dot(X_gpu, PcV_gpu, transb='T')
        if itr % 2 == 0:
            C_old = C_gpu.get()
        C_gpu = CC.argmin_mu_diff(global_temp, PcVmu_is_gpu)
        if itr % 2 == 0:
            Cnew = C_gpu.get()
            points_changed = np.sum(1 - np.equal(C_old, Cnew).astype(np.uint8))
            if points_changed == 0:
                assignment_unchanged += 1
            if assignment_unchanged >= 2:
                break
            print('[i] Itr %d: %d points changed' % (itr, points_changed))

        C = C_gpu.get()
        counts = {i: 0 for i in range(k)}

        for i in range(n):
            C_id = int(C[i])
            counts[C_id] += 1
        maxv = np.max(list(counts.values()))
        storage = np.zeros((k, int(maxv), d)).astype(np.float32)

        counter = np.zeros(k, dtype=np.uint32)  # k
        for i in range(n):
            C_id = np.int(C[i])
            storage[C_id, np.int(counter[C_id]), :] = X[i].ravel()
            counter[C_id] += 1

        storage_gpu = gpuarray.to_gpu(storage)

        mu_is_gpu = CC.sum_axis2(storage_gpu)
        counter_gpu = gpuarray.to_gpu(counter)[:, None]

        mu_is_gpu = skcuda.misc.divide(
            mu_is_gpu, counter_gpu.astype(np.float32))
        S_is_gpu = gpuarray.zeros((k, d, d), dtype=np.float32)  # k,d,d

        for i in range(k):
            storage_gpu[i] = skcuda.misc.subtract(storage_gpu[i], mu_is_gpu[i])
            curr_cluster_points = storage_gpu[i,
                                              :np.int(counter[i]), :]  # |k|,d
            S_is_gpu[i] = LA.dot(curr_cluster_points,
                                 curr_cluster_points, transa='T')

        S_is_sum_gpu = S_is_gpu.reshape((k, d * d))
        S_is_sum_gpu = skcuda.misc.sum(S_is_sum_gpu, axis=0, keepdims=True)
        S_is_sum_gpu = S_is_sum_gpu.reshape((d, d))

        S_is_diff_gpu = skcuda.misc.subtract(S_is_sum_gpu, S_D_gpu)

        w, V_gpu = sorted_eig(S_is_diff_gpu, mode='gpu')

        maxVal = min(w)
        m = np.sum([1 for i in w if i / maxVal > 1e-3])
        m = max(1, m)

        itr += 1
    return C_gpu.get(), V_gpu.get(), m
Example #34
 def gpu_transpose(a):
     a_gpu = gpuarray.to_gpu(a)
     at_gpu = linalg.transpose(a_gpu)
     return at_gpu.get()
Example #35
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np 
import skcuda.linalg as linalg

s = cuda.Event()
e = cuda.Event()

s.record()

N = 32 * 1024

linalg.init()
a = np.tril(np.ones(N, dtype=np.float32))
a_gpu = gpuarray.to_gpu(a)
at_gpu = linalg.transpose(a_gpu)
print "done"
e.record()
e.synchronize()
print(s.time_till(e))
Example #36
            ff = transf(ff)

        ff[ff<0] = 0
        ff[ff>2**15] = 0 # sometimes there is a problem with saving signed/unsigned ff values
        while ff.max() > 7: # rescale ff
            ff /= 10
        # print(ff.max())
    else:
        ff = np.zeros(outShape)
    
    if useGPU:
        signorms = linalg.norm(signals, axis=1, keepdims=True)
        signormsRep = np.repeat(signorms, signals.shape[1], axis=1)
        signormsGPU = pycuda.gpuarray.to_gpu(signormsRep.astype(np.float32))
        signalsGPU = pycuda.gpuarray.to_gpu(signals.astype(np.float32))
        signalsGPU = sklinalg.transpose(skmisc.divide(signalsGPU, signormsGPU))
        del signormsGPU
        ROWSTEP = 14
                
    if fitType == 0:
        signorms = linalg.norm(signals, axis=1, keepdims=True)
        signormsRep = np.repeat(signorms, signals.shape[1], axis=1)
        signalsCPU = np.transpose( signals / signormsRep)
        ROWSTEP = 14
        
    for slc in range(*sliceRange):
        print(slc)
        if fatT2 <= 0:            
            print("Searching fat...")
            fatT2 = fitSlc(int((sliceRange[1]-sliceRange[0])/2+sliceRange[0]), True, t2, b1, ff)
            ffl = FatFractionLookup(t2Lim, b1Lim, fatT2, etl, echoSpacing, refocusingFactor)
Example #37
    def process(self, **kwargs):
        """Calculate the likelihood, returning ln(likelihood)."""
        ret = {'value': LIKELIHOOD_FLOOR}

        self._fractions = kwargs.get('fractions', [])
        if not len(self._fractions):
            return ret

        self._model_observations = kwargs['model_observations']
        self._score_modifier = kwargs.get(self.key('score_modifier'), 0.0)
        self._upper_limits = np.array(kwargs.get('upperlimits', []),
                                      dtype=bool)

        value = ret['value']

        if min(self._fractions) < 0.0 or max(self._fractions) > 1.0:
            return ret
        for oi, obs in enumerate(self._model_observations):
            if not self._upper_limits[oi] and (isnan(obs)
                                               or not np.isfinite(obs)):
                return ret

        diag = kwargs.get('kdiagonal', None)
        residuals = kwargs.get('kresiduals', None)

        if diag is None or residuals is None:
            return ret

        if kwargs.get('kmat', None) is not None:
            kmat = kwargs['kmat']

            # Add observed errors to diagonal
            kmat[np.diag_indices_from(kmat)] += diag

            # full_size = np.count_nonzero(kmat)

            # Remove small covariance terms
            # min_cov = self.MIN_COV_TERM * np.max(kmat)
            # kmat[kmat <= min_cov] = 0.0

            # print("Sparse frac: {:.2%}".format(
            #     float(full_size - np.count_nonzero(kmat)) / full_size))

            condn = np.linalg.cond(kmat)
            if condn > 1.0e10:
                return ret

            if self._use_cpu is not True and self._model._fitter._cuda:
                try:
                    import pycuda.gpuarray as gpuarray
                    import skcuda.linalg as skla
                except ImportError:
                    self._use_cpu = True
                    if not self._cuda_reported:
                        self._printer.message('cuda_not_enabled',
                                              master_only=True,
                                              warning=True)
                else:
                    self._use_cpu = False
                    if not self._cuda_reported:
                        self._printer.message('cuda_enabled', master_only=True)
                        self._cuda_reported = True

                    kmat_gpu = gpuarray.to_gpu(kmat)
                    # kmat will now contain the cholesky decomp.
                    skla.cholesky(kmat_gpu, lib='cusolver')
                    value = -np.log(skla.det(kmat_gpu, lib='cusolver'))
                    res_gpu = gpuarray.to_gpu(
                        residuals.reshape(len(residuals), 1))
                    cho_mat_gpu = res_gpu.copy()
                    skla.cho_solve(kmat_gpu, cho_mat_gpu, lib='cusolver')
                    value -= (0.5 * (skla.mdot(skla.transpose(res_gpu),
                                               cho_mat_gpu)).get())[0][0]

            if self._use_cpu:
                try:
                    chol_kmat = scipy.linalg.cholesky(kmat, check_finite=False)

                    value = -np.linalg.slogdet(chol_kmat)[-1]
                    value -= 0.5 * (np.matmul(
                        residuals.T,
                        scipy.linalg.cho_solve(
                            (chol_kmat, False), residuals,
                            check_finite=False)))
                except Exception:
                    try:
                        value = -0.5 * (np.matmul(
                            np.matmul(residuals.T, scipy.linalg.inv(kmat)),
                            residuals) + np.log(scipy.linalg.det(kmat)))
                    except scipy.linalg.LinAlgError:
                        return ret

            ret['kdiagonal'] = diag
            ret['kresiduals'] = residuals
        elif 'kfmat' in kwargs:
            raise RuntimeError('Should not have kfmat in likelihood!')
        else:
            # Shortcut when matrix is diagonal.
            self._o_band_vs = kwargs['obandvs']
            # print('likelihood')
            # print(np.sqrt(diag))
            # print(self._o_band_vs)
            # print(residuals)
            value = -0.5 * np.sum(residuals**2 / (self._o_band_vs**2 + diag) +
                                  np.log(self._o_band_vs**2 + diag))

        score = self._score_modifier + value
        if isnan(score) or not np.isfinite(score):
            return ret
        ret['value'] = max(LIKELIHOOD_FLOOR, score)
        return ret
Example #38
time_cula = []
for i in N:
	t = np.float32
	n = i * 32
    
	a = np.asarray(np.random.rand(n,n), t)


	start = time.time()
	c = np.transpose(a)
	time_cpu.append(time.time() - start)

	a_gpu = gpuarray.to_gpu(a)

	start = time.time()
	c_gpu = culinalg.transpose(a_gpu)
	time_linalg.append(time.time() - start)

	
	
	a_gpu2 = gpuarray.to_gpu(a)
	cula_result = gpuarray.empty((n, n), np.float32)

	#culaGetVersion
	'''
	culaInitialize
	
	start = time.time()
	culaDeviceSgeTranspose(n, n, a_gpu2.gpudata, n, cula_result.gpudata, n)
	time_cula.append(time.time() - start)
	'''
Example #39
        x0[:,slice(0,laz),0] = read_rec_as_arr(fp_img,nc,laz,0)

        for j in range(laz):
            x0[:,j,0]=np.roll(x0[:,j,0],int(r_shift[j]),axis=0)

        for i in range(nproc):
            x0[:,slice(laz,nl),0] = read_rec_as_arr(fp_img,nc,nread,laz+i*nread)

            for k in range(laz,nl):
                x0[:,k,0]=np.roll(x0[:,k,0],int(r_shift[k+i*nread]),axis=0)

            x_gpu = gpuarray.to_gpu(x0)
            xt_gpu = gpuarray.to_gpu(np.empty((nl, nc, 1), np.complex64))
            cu_fft.fft(x_gpu, x_gpu, az_plan)
            x_gpu[:,:,0] = linalg.misc.multiply(x_gpu[:,:,0],pf1_gpu)
            xt_gpu[:,:,0] = linalg.transpose(x_gpu[:,:,0])
            cu_fft.fft(xt_gpu, xt_gpu, rg_plan)
            x_gpu[:,:,0] = linalg.transpose(xt_gpu[:,:,0])
            x_gpu[:,:,0] = linalg.misc.multiply(x_gpu[:,:,0],pf2_gpu)
            xt_gpu[:,:,0] = linalg.transpose(x_gpu[:,:,0])
            cu_fft.ifft(xt_gpu, xt_gpu, rg_plan, True)
            x_gpu[:,:,0] = linalg.transpose(xt_gpu[:,:,0])
            x_gpu[:,:,0] = linalg.misc.multiply(x_gpu[:,:,0],pf3_gpu)
            cu_fft.ifft(x_gpu, x_gpu, az_plan, True)
            slc_gpu[:,slice(i*nread,(i+1)*nread)] = x_gpu[:,slice(0,nl-laz),0].get()
            x0 = np.roll(x0,-nread,axis=1)
            print(i)

        elapsed_time = time.time() - start
        print("elapsed_time:{0}".format(elapsed_time) + "[sec]")
Example #40
def kernel(A, B, C, transa='N'):
    if transa == 'T':
        kern(linalg.transpose(A), B, C)
    else:
        kern(A, B, C)