Example #1
    def predict(self, X):
        if not self.is_fit:
            print("GPR Model not fit yet.")
            return

        st = time.time()
        X = np.asarray(X)
        Kff = self.kernel2(self.train_X, self.train_X)  # (N, N)
        Kyy = self.kernel2(X, X)  # (k, k)
        Kfy = self.kernel2(self.train_X, X)  # (N, k)
        et = time.time()
        print('kernel time cost', et - st, 's')

        st = time.time()
        Kff_gpu = gpuarray.to_gpu(
            (Kff + 1e-7 * np.eye(len(self.train_X))).astype(np.float32))
        Kff_inv_gpu = sklin.inv(Kff_gpu)
        Kff_inv = Kff_inv_gpu.get()
        et = time.time()
        print('2inv time cost', et - st, 's')

        st = time.time()
        # make a C-contiguous copy of Kfy.T so it can be uploaded to the GPU
        aT = np.ascontiguousarray(Kfy.T)
        KfyT_gpu = gpuarray.to_gpu(aT.astype(np.float32))
        Kfy_gpu = gpuarray.to_gpu(Kfy.astype(np.float32))
        trainy_gpu = gpuarray.to_gpu(self.train_y.astype(np.float32))
        Kff_inv_gpu = gpuarray.to_gpu(Kff_inv.astype(np.float32))
        gpu_t = sklin.dot(KfyT_gpu, Kff_inv_gpu)
        mu = sklin.dot(gpu_t, trainy_gpu).get()
        cov = Kyy - sklin.dot(gpu_t, Kfy_gpu).get()
        et = time.time()
        print('dot time cost', et - st, 's')
        return mu, cov
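For reference, the same posterior can be computed on the CPU with plain NumPy. This is a minimal sketch of the mean/covariance algebra used above, with an illustrative RBF kernel standing in for the class's own kernel2 (rbf_kernel and gp_predict_cpu are hypothetical names, not part of the example):

import numpy as np

def rbf_kernel(X1, X2, length_scale=1.0):
    # illustrative squared-exponential kernel; the class above uses its own kernel2
    d2 = ((X1[:, None, :] - X2[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2 / length_scale ** 2)

def gp_predict_cpu(train_X, train_y, X, noise=1e-7):
    Kff = rbf_kernel(train_X, train_X)                       # (N, N)
    Kyy = rbf_kernel(X, X)                                   # (k, k)
    Kfy = rbf_kernel(train_X, X)                             # (N, k)
    Kff_inv = np.linalg.inv(Kff + noise * np.eye(len(train_X)))
    mu = Kfy.T @ Kff_inv @ train_y                           # posterior mean
    cov = Kyy - Kfy.T @ Kff_inv @ Kfy                        # posterior covariance
    return mu, cov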
Example #2
def fast_matmul(x, y, x_type, y_type):
    '''
    use pycuda to compute c = a * b
    '''
    linalg.init()
    a_gpu = gpuarray.to_gpu(x.astype(x_type))
    a_t_gpu = gpuarray.to_gpu(x.T.copy().astype(x_type))
    b_gpu = gpuarray.to_gpu(y.astype(y_type))
    # row_sum = gpuarray.zeros(shape = x[0].shape, dtype = x_type)
    row_sum = 0
    # a = np.asarray(x, x_type)
    # b = np.asarray(y, y_type)
    # a_gpu = gpuarray.to_gpu(a)
    # b_gpu = gpuarray.to_gpu(b)

    t1_inside = time.time()
    c_gpu = linalg.dot(a_gpu, b_gpu)
    for a_i in a_gpu:
        # row_sum = misc.add(row_sum, a_i)
        row_sum += a_i
        gg = linalg.dot(a_gpu, b_gpu)
        gg = linalg.dot(a_i, a_i)
        gg = reduce(linalg.dot, (a_gpu, b_gpu, b_gpu, b_gpu))
        # tmp1, tmp2 = linalg.dot(a_gpu, b_gpu), linalg.dot(b_gpu, b_gpu)
        z_gpu = a_gpu.copy()
    tmp = a_t_gpu
    # print('x.T\n', x.T)
    # print('tmp\n', tmp)
    # print('x = a_gpu: ', np.allclose(x, a_gpu.get()))
    # print('x.T = tmp: ', np.allclose(x.T, tmp.get()))

    a_prod = linalg.dot(a_gpu, tmp)
    t2_inside = time.time()
    print('inside cost {:.4f}s'.format(t2_inside - t1_inside))

    a = np.random.randint(-5, 5, (3, 4)).astype(np.float32)
    a_gpu = gpuarray.to_gpu(a)
    norm_gpu = linalg.norm(a_gpu)
    print('is norm right?', np.allclose(np.linalg.norm(a), norm_gpu))
    a_gpu = abs(a_gpu)
    column_sum = misc.sum(a_gpu, axis=0)
    column_sum = column_sum.reshape((1, -1))
    all_one_gpu = gpuarray.to_gpu(np.ones((3, 1), np.float32))
    div_mat_gpu = linalg.dot(all_one_gpu, column_sum)

    norm_1 = a_gpu / (div_mat_gpu + 1e-3)

    print(a_gpu)
    print(column_sum)
    print(column_sum.shape)
    print(norm_1)
    # abs_a = a_gpu.__abs__()
    # print(a)
    # print(abs_a)
    # c = abs_a + a_gpu
    # print(repr(c))
    # print(type(c))
    # c = 1/2 * c
    # print(a_gpu, c)
    return c_gpu.get(), a_prod.get(), row_sum.get()
Example #3
 def dot(self, X, Y, out=None, transa='N', transb='N'):
     if out is not None:
         Z = linalg.dot(X, Y, out=out, transa=transa, transb=transb)
     else:
         Z = linalg.dot(X, Y, transa=transa, transb=transb)
     sync_only()
     return Z
Example #4
    def conv2d_forward_batch(self, inputs, params, bias, outputs,
                             padding, stride):
        num_filters = params.shape[0]
        num_images, input_rows, input_cols, num_input_maps = inputs.shape
        kernel_shape = params.shape[1:]
        num_output_pixels = outputs.shape[1] * outputs.shape[2]
        num_kernel_params = np.prod(kernel_shape)
        out_shape = (num_output_pixels, num_filters)
        num_cuda_kernels = num_output_pixels * num_input_maps

        for i in range(num_images):
            col = self.zeros((num_output_pixels, num_kernel_params))
            _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                              np.int32(input_rows), np.int32(input_cols),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(outputs.shape[2]),
                              np.int32(num_input_maps),
                              col.gpudata,
                              block=(NUM_CUDA_THREADS, 1, 1),
                              grid=(get_blocks(num_cuda_kernels), 1))

            reshaped_params = params.reshape(num_filters, num_kernel_params)
            culinalg.dot(col, reshaped_params, transb='T',
                         out=outputs[i].reshape(out_shape))

        flat_outputs = flatten_all_but_last(outputs)
        self.add_mv(flat_outputs, bias, flat_outputs)
Example #5
def cuda_dot3(A, b):
    print("cuda_dot3", A.shape, b.shape)
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
    #remove b for now
    b_gpu.gpudata.free()
    del(b_gpu)
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    temp_gpu = linalg.dot(bt_gpu, A_gpu)
    
    bt_gpu.gpudata.free()
    del(bt_gpu)
    A_gpu.gpudata.free()
    del(A_gpu)
    
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    
    c_gpu = linalg.dot(temp_gpu, b_gpu)
    
    temp_gpu.gpudata.free()
    del(temp_gpu)
    b_gpu.gpudata.free()
    del(b_gpu)
        
    #theoretically possible to move into RAM, force cleanup on GPU and then return from RAM
    #but most likely not necessary
    return c_gpu.get()
Example #6
def computeEnergy(D_v, S, T, _Lambda, _gamma_c, Alpha, Beta):
    l, m, n = S.shape

    sum_alpha_beta = gpuarray.zeros_like(D_v)
    sk_linalg.dot(Beta, Alpha, out=sum_alpha_beta)

    GR = grad(T)
    square_matrix(GR, GR)
    G_norm = gpuarray.zeros_like(T)
    sum_three_matrix(GR[0, :, :, :], GR[1, :, :, :], GR[2, :, :, :], G_norm,
                     1.0, 1.0, 1.0)
    sqrt_matrix(G_norm, G_norm)
    #    multiply_matrix(G_norm, _Gamma, G_norm)
    ET = _gamma_c * gpuarray.sum(G_norm)

    SP = gpuarray.zeros_like(S)
    absolute_matrix(S, SP)
    multiply_matrix(SP, _Lambda, SP)
    ES = gpuarray.sum(SP)

    sparse = D_v - S.reshape(l * m * n, 1) - T.reshape(l * m * n,
                                                       1) - sum_alpha_beta
    square_matrix(sparse, sparse)
    EL = gpuarray.sum(sparse)

    E = 0.5 * EL.get() + ES.get() + ET.get()

    return EL.get(), ES.get(), ET.get(), E
Example #7
    def conv2d_forward_batch(self, inputs, params, bias, outputs,
                             padding, stride):
        num_filters = params.shape[0]
        num_images, input_rows, input_cols, num_input_maps = inputs.shape
        kernel_shape = params.shape[1:]
        num_output_pixels = outputs.shape[1] * outputs.shape[2]
        num_kernel_params = np.prod(kernel_shape)
        out_shape = (num_output_pixels, num_filters)
        num_cuda_kernels = num_output_pixels * num_input_maps

        for i in range(num_images):
            col = self.zeros((num_output_pixels, num_kernel_params))
            _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                              np.int32(input_rows), np.int32(input_cols),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(outputs.shape[2]),
                              np.int32(num_input_maps),
                              col.gpudata,
                              block=(NUM_CUDA_THREADS, 1, 1),
                              grid=(get_blocks(num_cuda_kernels), 1))

            reshaped_params = params.reshape(num_filters, num_kernel_params)
            culinalg.dot(col, reshaped_params, transb='T',
                         out=outputs[i].reshape(out_shape))

        flat_outputs = flatten_all_but_last(outputs)
        self.add_mv(flat_outputs, bias, flat_outputs)
Example #8
def dot3(A, b):
    ''' Calculates matrix multiplication "b.T*A*b" on GPU. 
        A has to be nxn. '''
    #print("dot3 "+str(A.shape)+" "+str(b.shape))
    
    # Make sure we don't run out of memory on the GPU
    if ((A.size + 2*b.size) <= 629088256):
        
        # send A to GPU    
        A_gpu = gpuarray.to_gpu(A)
        
        # send b to GPU
        b_gpu = gpuarray.to_gpu(b)
        
        temp_gpu = linalg.dot(A_gpu, b_gpu)
        
        A_gpu.gpudata.free()
        del(A_gpu)
        
        # transpose b on GPU
        bt_gpu = linalg.transpose(b_gpu)
            
        #remove b
        b_gpu.gpudata.free()
        del(b_gpu)
        
        out_gpu = linalg.dot(bt_gpu, temp_gpu)
        
        return out_gpu.get()
    
    else:
        print("Too big for GPU, using CPU.")
        return np.dot(np.dot(b.T, A), b)
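Assuming a CUDA context and skcuda.linalg.init() are already set up (e.g. via pycuda.autoinit), a quick sanity check of dot3 against NumPy might look like this (the sizes are illustrative):

import numpy as np

n = 512
A = np.random.rand(n, n).astype(np.float32)
b = np.random.rand(n, 3).astype(np.float32)

gpu_result = dot3(A, b)                   # b.T * A * b on the GPU
cpu_result = np.dot(np.dot(b.T, A), b)    # same product on the CPU
print(np.allclose(gpu_result, cpu_result, rtol=1e-3))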
Example #9
def dot3(A, b):
    ''' Calculates matrix multiplication "b.T*A*b" on GPU. '''
    #print("dot3 "+str(A.shape)+" "+str(b.shape))
    
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    
    temp_gpu = linalg.dot(A_gpu, b_gpu)
    
    A_gpu.gpudata.free()
    del(A_gpu)
    
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
        
    #remove b
    b_gpu.gpudata.free()
    del(b_gpu)
    
    out_gpu = linalg.dot(bt_gpu, temp_gpu)
    
    return out_gpu.get()
Example #10
def matrix_inverse(A, cuda = True):
    m, n = A.shape
    transpose = False
    pinv = None
    
#     If A is m-by-n and rank(A) = m (m ≤ n), then A has a right inverse A_P = At * inv(A * At)
#     If A is m-by-n and rank(A) = n (n ≤ m), then A has a left inverse A_P = inv(At * A) * At

    if m > n:
        A = np.ascontiguousarray(A.T)
        transpose = True
    # By default this calculates the right inverse
    At = np.ascontiguousarray(A.T)
    if cuda:
        A_gpu = gpua.to_gpu(A)
        At_gpu = gpua.to_gpu(At)

        inv = linalg.inv(linalg.dot(A_gpu, At_gpu))
        pinv = linalg.dot(At_gpu, inv).get()
    else:
        inv = np.linalg.inv(A.dot(At))
        pinv = At.dot(inv)
        
    if transpose:
        pinv = pinv.T
        
    return pinv
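A small CPU-only check of matrix_inverse (cuda=False) against NumPy's pseudo-inverse; the shape below is illustrative and assumes full column rank:

import numpy as np

A = np.random.rand(6, 3)                       # tall matrix, full column rank
pinv = matrix_inverse(A, cuda=False)           # left inverse, shape (3, 6)
print(np.allclose(pinv, np.linalg.pinv(A)))    # matches the Moore-Penrose pseudo-inverse
print(np.allclose(pinv @ A, np.eye(3)))        # left-inverse property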
Example #11
    def CalcNoise(self, cut=False, mode=0):

        step = 100
        blocks = np.ceil(1.0 * self.FSamps / step).astype(np.int)
        noisevec = np.zeros(self.FSamps)
        if (mode == 0):
            for i in range(blocks):
                start = i * step
                stop = (i + 1) * step
                if (stop > self.FSamps):
                    print "stop!", i, stop
                    stop = self.FSamps

                r2 = cula.dot(self.Real[start:stop], self.Real[start:stop:])
                i2 = cula.dot(self.Imag[start:stop], self.Imag[start:stop])
                noisesamps = len(self.Imag[start:stop]) * 2
                noise = np.sqrt((r2 + i2) / noisesamps)
                noisevec[start:stop] = 1.0 / noise
                self.Real[start:stop] /= noise
                self.Imag[start:stop] /= noise
                self.gpu_fft_data[start:stop] /= noise
                #print "noise", i, noise

            if (cut == True):
                fftdataR = (self.gpu_fft_data.get()).real
                fftdataI = (self.gpu_fft_data.get()).imag

                Rbad = np.where(np.abs(fftdataR) > 6.0)[0]
                Ibad = np.where(np.abs(fftdataI) > 6.0)[0]
                print fftdataR[Rbad], fftdataI[Ibad]
                NRbad = len(Rbad)
                NIbad = len(Ibad)
                fftdata = self.gpu_fft_data.get()
                fftdata.real[Rbad] = np.random.normal(0, 1, NRbad)
                fftdata.imag[Ibad] = np.random.normal(0, 1, NIbad)
                print "bad", NRbad, NIbad
                self.gpu_fft_data = gpuarray.to_gpu(np.complex128(fftdata))
        if (mode == 1):
            #r2 = cula.dot(self.Real[self.FSamps/2:], self.Real[self.FSamps/2:])
            #i2   = cula.dot(self.Imag[self.FSamps/2:], self.Imag[self.FSamps/2:])
            fftD = self.gpu_fft_data.get()
            r2 = np.dot(fftD.real[self.FSamps / 2:],
                        fftD.real[self.FSamps / 2:])
            i2 = np.dot(fftD.imag[self.FSamps / 2:],
                        fftD.imag[self.FSamps / 2:])
            noisesamps = len(fftD.imag[self.FSamps / 2:]) * 2

            del fftD

            noise = np.sqrt((r2 + i2) / noisesamps)
            print "Noise mode 1: ", noise
            #self.Real = self.Real / noise
            #self.Imag = self.Imag / noise
            self.gpu_fft_data = self.gpu_fft_data / noise
            noisevec[:] = 1.0 / noise

        #self.Noise = gpuarray.empty(self.FSamps, np.float64)
        self.Noise = gpuarray.to_gpu(np.float64(noisevec))
Example #12
def getTranformada_Inversa(test_image, diagonal):
    test_image = test_image.astype(np.float32)
    diagonal = diagonal.astype(np.float32)
    test_image = gpuarray.to_gpu(test_image)
    diagonal = gpuarray.to_gpu(diagonal)
    test_image_gpuT = linalg.transpose(test_image)
    testimage_gpu = linalg.dot(test_image_gpuT, diagonal)
    test_image_gpuT = linalg.transpose(testimage_gpu)
    testimage_gpu = linalg.dot(test_image_gpuT, diagonal)
    return testimage_gpu.get()
Example #13
def getTranformada(test_image, diagonal):
    # multiply each row by the diagonal
    diagonal = diagonal.astype(np.float32)
    test_image = gpuarray.to_gpu(test_image)
    diagonal = gpuarray.to_gpu(diagonal)
    testimage_gpu = linalg.dot(test_image, diagonal)
    testimageT_gpu = linalg.transpose(testimage_gpu)
    testimage_gpu = linalg.dot(testimageT_gpu, diagonal)
    testimageT_gpu = linalg.transpose(testimage_gpu)
    return testimageT_gpu.get()
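The chained dot/transpose calls above reduce to diagonal.T @ test_image @ diagonal, so a CPU sketch for checking the result (assuming an initialized CUDA context for the GPU call) could be:

import numpy as np

X = np.random.rand(4, 4).astype(np.float32)
D = np.diag(np.random.rand(4)).astype(np.float32)   # symmetric, so D.T == D

cpu = D.T @ X @ D                  # what the two dot/transpose passes compute
gpu = getTranformada(X, D)         # requires pycuda.autoinit and linalg.init()
print(np.allclose(cpu, gpu, atol=1e-4))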
Example #14
    def computeLambda(self):
        print('\t\tComputing lambda...')

        T = np.zeros(self.num_columns)

        if (self.GPU == True):

            if not self.affine:

                gpu_data = gpuarray.to_gpu(self.data)
                C_gpu = linalg.dot(gpu_data, gpu_data, transa='T')

                for i in xrange(self.num_columns):
                    T[i] = linalg.norm(C_gpu[i, :])

            else:

                gpu_data = gpuarray.to_gpu(self.data)

                # affine transformation
                y_mean_gpu = misc.mean(gpu_data, axis=1)

                # creating affine matrix to subtract to the data (may encounter problem with strides)
                aff_mat = np.zeros([self.num_rows,
                                    self.num_columns]).astype('f')
                for i in xrange(0, self.num_columns):
                    aff_mat[:, i] = y_mean_gpu.get()

                aff_mat_gpu = gpuarray.to_gpu(aff_mat)
                gpu_data_aff = misc.subtract(aff_mat_gpu, gpu_data)

                C_gpu = linalg.dot(gpu_data, gpu_data_aff, transa='T')

                #computing euclidean norm (rows)
                for i in xrange(self.num_columns):
                    T[i] = linalg.norm(C_gpu[i, :])
        else:

            if not self.affine:

                T = np.linalg.norm(np.dot(self.data.T, self.data), axis=1)

            else:
                #affine transformation
                y_mean = np.mean(self.data, axis=1)

                tmp_mat = np.outer(y_mean, np.ones(
                    self.num_columns)) - self.data

                T = np.linalg.norm(np.dot(self.data.T, tmp_mat), axis=1)

        _lambda = np.amax(T)

        return _lambda
Example #15
 def test_dot_matrix_h_complex128(self):
     a = np.asarray(np.random.rand(2, 4)+1j*np.random.rand(2, 4), np.complex128)
     b = np.asarray(np.random.rand(2, 2)+1j*np.random.rand(2, 2), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
     assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
     a = a.astype(np.complex128, order="F", copy=True)
     b = b.astype(np.complex128, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
     assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
Example #16
 def test_dot_vector_complex128(self):
     a = np.asarray(np.random.rand(5), np.complex128)
     b = np.asarray(np.random.rand(5), np.complex128)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
     a = a.astype(np.complex128, order="F", copy=True)
     b = b.astype(np.complex128, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c = linalg.dot(a_gpu, b_gpu)
     assert np.allclose(np.dot(a, b), c)
Example #19
        def thunk():
            alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
            x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
            x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
            Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
            Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
            Xtn = misc.sum(Xt, axis=1, keepdims=True)
            Xfn = misc.sum(Xf, axis=1, keepdims=True)
            Xt = misc.divide(Xt, Xtn)
            Xf = misc.divide(Xf, Xfn)
            w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
            dq = Xt - Xf
            qdw = dq / w
            t1 = misc.sum(x * qdw, axis=1)
            f = 2 * depth + self.base.n
            t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
            t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
            dalpha = t1 - t2 + t3
            del dq, t1, f, t2, t3

            iw = 1 / w
            S1 = misc.multiply(
                depth[:, None] * (self.base.n - 1) / self.base.n, iw)
            S2 = (self.base.n + depth[:, None]) / cumath.log(
                misc.sum(w, axis=1, keepdims=True))
            F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
            del w, iw, S1, S2

            cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]),
                                  dtype=theano.config.floatX)
            dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
            dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)
            for i in range(Xt.shape[0]):
                S1 = misc.multiply(Xt[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
                dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
                S1 = misc.multiply(Xf[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
                dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
            outputs[0][0] = dalpha.get()
            outputs[1][0] = dLq_t.get()
            outputs[2][0] = dLq_f.get()
            for v in node.outputs:
                compute_map[v][0] = True
Example #20
    def _get_XTX_cuda(self, X, x_1, x_2, y_1, y_2):
        """Calculate the dot product between two arrays on the GPU.

        Parameters
        ----------
        X: array
            Array to normalize
        x_1: int
            Lower bound on slice on x-axis
        x_2: int
            Upper bound on slice x-axis
        y_1: int
            Lower bound on slice y-axis
        y_2: int
            Upper bound on slice y-axis
        Returns
        -------
        XX.T: array
            X X.T array
            
        """
        X_f, X_b = gpuarray.to_gpu(X[x_1:x_2, :]), gpuarray.to_gpu(
            X[y_1:y_2, :])
        X_f_norm, X_b_norm = self._cuda_norm(X_f), self._cuda_norm(X_b)
        return linalg.dot(X_f_norm, X_b_norm, transb="T").get()
Example #21
    def __calc_A_shift_gpu(self, shift_x, shift_y):

        psis_gpu = self.converter.get_prolates_as_images()  # TODO: need to assert that returns indeed a gpuarray
        n_psis = len(psis_gpu)

        if shift_x == 0 and shift_y == 0:
            return np.eye(n_psis)

        A_shift = gpuarray.zeros((n_psis, n_psis),'complex64')
        non_neg_freqs = self.converter.get_non_neg_freq_inds()

        psis_gpu_non_neg_freqs = psis_gpu[non_neg_freqs]
        psis_non_neg_shifted = circ_shift_kernel.circ_shift(psis_gpu_non_neg_freqs, shift_x, shift_y)

        psis_non_neg_shifted = self.converter.mask_points_inside_the_circle(psis_non_neg_shifted)

        psis_non_neg_shifted = psis_non_neg_shifted.reshape(len(psis_non_neg_shifted), -1)
        psis_gpu = psis_gpu.reshape(n_psis, -1)
        A_shift[non_neg_freqs] = linalg.dot(psis_non_neg_shifted, psis_gpu, transb='C')

        zero_freq_inds = self.converter.get_zero_freq_inds()
        pos_freq_inds  = self.converter.get_pos_freq_inds()
        neg_freq_inds  = self.converter.get_neg_freq_inds()

        A_shift[neg_freq_inds, zero_freq_inds] = A_shift[pos_freq_inds, zero_freq_inds]
        A_shift[neg_freq_inds, pos_freq_inds] = A_shift[pos_freq_inds, neg_freq_inds]
        A_shift[neg_freq_inds, neg_freq_inds] = A_shift[pos_freq_inds, pos_freq_inds]

        A_shift[neg_freq_inds] = linalg.conj(A_shift[neg_freq_inds])
        # TODO: get rid of the transpose
        # return np.transpose(A_shift).copy()
        return np.transpose(A_shift).get().copy()
Example #22
    def forward(self, bottom, top):
        #        print 'hanli crf forward -- '
        #        print 'self.diff.shape: ' + str(self.diff.shape);  # self.diff.shape: (batchsize, 65536)
        #        print 'crf bottom[0].data.shape: ' + str(bottom[0].data.shape); #crf bottom[0].data.shape: (batchsize, 11)
        #        print 'raw degree bottom[1].data.shape: ' + str(bottom[1].data.shape);  #(batchsize, 65536, 11)
        #        print 'png bottom[2].data.shape: ' + str(bottom[2].data.shape);  # (batchsize, 65536)
        #        print 'np.dot(bottom[1].data[i,:,:], bottom[0].data[i,:]).shape: ' + str(np.dot(bottom[1].data[0,:,:], bottom[0].data[0,:]).shape); #(65536,)
        #        print 'bottom[2].data[i,:].shape: ' + str(bottom[2].data[0,:].shape);  # (65536,)
        with pu.caffe_cuda_context():
            linalg.init()
            for i in range(self.diff.shape[0]):
                #a =  bottom[1].data_as_pycuda_gpuarray()
                #b =  bottom[0].data_as_pycuda_gpuarray()
                a = bottom[1].data[i, :, :].astype(np.float32)
                b = bottom[0].data[i, :].astype(np.float32)
                ##a = np.asarray(np.random.rand(4, 4), dtype=np.float32)
                ##b = np.asarray(np.random.rand(4), dtype=np.float32)

                #a_gpu = gpuarray.GPUArray(a, dtype=np.float32)
                #b_gpu = gpuarray.GPUArray(b, dtype=np.float32)
                a_gpu = gpuarray.to_gpu(a)
                b_gpu = gpuarray.to_gpu(b)
                c_gpu = linalg.dot(a_gpu, b_gpu)
                #self.diff[i,:] = c_gpu + bottom[2].data[i,:] - bottom[3].data[i,:];
                self.diff[i, :] = np.dot(
                    bottom[1].data[i, :, :], bottom[0].data[
                        i, :]) + bottom[2].data[i, :] - bottom[3].data[i, :]
            top[0].data[...] = np.sum(self.diff**2) / bottom[3].num / 2.
            #self.transDiff = np.transpose(self.diff / bottom[3].num); # (65536, 50)
            a_gpu = gpuarray.to_gpu(self.diff / bottom[3].num)
            at_gpu = linalg.transpose(a_gpu)
            self.transDiff = at_gpu
Example #23
def initParallelAlgorithms():
    global bitonicSort_
    fin = open("ParallelAlgorithms/bitonicSort.cu")
    mod = SourceModule(fin.read())
    fin.close()
    bitonicSort_ = mod.get_function("bitonicSort")

    global finishCSM_
    global getSumSquares_
    fin = open("ParallelAlgorithms/CSMHelper.cu")
    mod = SourceModule(fin.read())
    fin.close()
    finishCSM_ = mod.get_function("finishCSM")
    getSumSquares_ = mod.get_function("getSumSquares")

    #Run each of the algorithms on dummy data so that they're pre-compiled

    #1) Bitonic Sort
    X = np.random.randn(16, 16)
    N = np.int32(16)
    NPow2 = N
    NThreads = N/2
    XG = gpuarray.to_gpu(X)
    bitonicSort_(XG, N, NPow2, block=(NThreads, 1, 1), grid=(X.shape[0], 1), shared=4*NPow2)

    linalg.init()
    #2) Other primitive operations
    NegXDotX = linalg.dot(XG, XG)
    XPlusX = skcuda.misc.add(XG, XG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    XPlusCol = skcuda.misc.add_matvec(XG, XSqr, 0)
Example #24
    def _correlate_matmul_cublas(self, frames_flat, mask):
        arr = np.ascontiguousarray(frames_flat[:, mask], dtype=np.float32)
        npix = arr.shape[1]
        # Pre-allocating memory for all bins might save a bit of time,
        # but would take more memory
        d_arr = garray.to_gpu(arr)
        d_outer = cublas.dot(d_arr, d_arr, transb="T", handle=self.cublas_handle)
        d_means = skmisc.mean(d_arr, axis=1, keepdims=True)
        d_denom_mat = cublas.dot(d_means, d_means, transb="T", handle=self.cublas_handle)

        self.sum_diagonals(d_outer, self.d_sumdiags1)
        self.sum_diagonals(d_denom_mat, self.d_sumdiags2)
        self.d_sumdiags1 /= self.d_sumdiags2
        self.d_sumdiags1 /= npix

        return self.d_sumdiags1.get()
Example #25
def dot_accum(a, b):
    """
    Compute squared euclidean distance between two 2D arrays representing
    n-dimensional points using GPU. This computes the matrix-multiplication of
    the GPU versions of the inputs, gets it back to host CPU and then
    accumulates the squared sum of rows into it along the rows and columns
    respectively.

    Parameters
    ----------
    a : ndarray
        2D NumPy array of float dtype representing n-dimensional points, with
        each row being one point.
    b : ndarray
        2D NumPy array of float dtype representing n-dimensional points, with
        each row being one point.

    Returns
    -------
    out : ndarray
        This holds the euclidean distances.

    """

    a_gpu = gpuarray.to_gpu(-2 * a)
    b_gpu = gpuarray.to_gpu(b)
    out = culinalg.dot(a_gpu, b_gpu, transb='T').get()
    out += np.einsum('ij,ij->i', a, a)[:, None]
    out += np.einsum('ij,ij->i', b, b)
    return out
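A CPU-only reference for what dot_accum returns (pairwise squared Euclidean distances), useful as a correctness check when a GPU is available; the array sizes are illustrative:

import numpy as np

a = np.random.rand(100, 8).astype(np.float32)
b = np.random.rand(50, 8).astype(np.float32)

# same expansion ||a_i - b_j||^2 = ||a_i||^2 + ||b_j||^2 - 2 * a_i . b_j, done on the CPU
ref = (-2 * a) @ b.T
ref += np.einsum('ij,ij->i', a, a)[:, None]
ref += np.einsum('ij,ij->i', b, b)

# gpu = dot_accum(a, b)            # needs pycuda.autoinit and culinalg.init()
# print(np.allclose(ref, gpu, atol=1e-4))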
Example #26
    def _rbf_kernel_vectorized_cublas(data1,
                                      data2,
                                      sigma=10):  # pragma: no cover
        """kernel for edge similarity computed with the vectorized method

        Args:
            data1 (TYPE): pssm data 1
            data2 (TYPE): pssm data 2
            sigma (int, optional): exponent of the exponential

        Returns:
            np.array: value of the rbf kernel for all the pairs
        """
        beta = 2 * sigma**2
        d1_ = gpuarray.to_gpu(data1.astype(np.float32))
        d2_ = gpuarray.to_gpu(data2.astype(np.float32))
        mgpu = -2 * culinalg.dot(d1_, d2_, transa='N', transb='T')
        vgpu = cumisc.sum(d1_**2, axis=1)[:, None]
        cumisc.add_matvec(mgpu, vgpu, out=mgpu)

        vgpu = cumisc.sum(d2_**2, axis=1)
        cumisc.add_matvec(mgpu, vgpu, out=mgpu)

        mcpu = mgpu.get()
        return np.exp(-mcpu / beta).reshape(-1)
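An equivalent CPU-only version of the same vectorized RBF kernel, handy for checking the CUBLAS path (the function name is illustrative):

import numpy as np

def rbf_kernel_vectorized_cpu(data1, data2, sigma=10):
    beta = 2 * sigma ** 2
    # squared distances via the same expansion used on the GPU above
    d2 = (-2 * data1 @ data2.T
          + (data1 ** 2).sum(axis=1)[:, None]
          + (data2 ** 2).sum(axis=1)[None, :])
    return np.exp(-d2 / beta).reshape(-1)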
Example #27
    def _dev_lin(self, devX, devW, devB):
        """Linear function on GPU.

        Returns:
            devH (gpuarray): GPU matrix with the result.
        """
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        return devH
Example #29
 def compute_analysis_cuda2(self,
                            xb,
                            y,
                            R,
                            P,
                            H,
                            HT=None,
                            hph=None,
                            calcP=True):
     if HT is None:
         HT = culinalg.transpose(H)
     HP = culinalg.dot(H, P)
     if hph is None:
         hph = culinalg.dot(HP, HT)
     Rhph = misc.add(R, hph)
     inv = culinalg.inv(Rhph)
     W = culinalg.dot(HP, inv, transa='T')
     Hxb = culinalg.dot(H, xb)
     yHxb = misc.subtract(y, Hxb)
     WyHxb = culinalg.dot(W, yHxb)
     xhat = misc.add(xb, WyHxb)
     #xhat = xb + culinalg.dot(W, (y - culinalg.dot(H, xb)))
     if calcP:
         I = culinalg.eye(P.shape[0])
         WH = culinalg.dot(W, H)
         IWH = I - WH
         Phat = culinalg.dot(IWH, P)
     else:
         Phat = misc.zeros((1, ), dtype=P.dtype)
     return xhat, Phat
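The GPU calls above implement the standard Kalman analysis (update) step. A NumPy sketch of the same algebra, assuming P is symmetric as the transa='T' trick requires (analysis_cpu is a hypothetical name):

import numpy as np

def analysis_cpu(xb, y, R, P, H):
    K = P @ H.T @ np.linalg.inv(H @ P @ H.T + R)     # Kalman gain
    xhat = xb + K @ (y - H @ xb)                     # analysis state
    Phat = (np.eye(P.shape[0]) - K @ H) @ P          # analysis covariance
    return xhat, Phat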
Example #30
def NNMF_gpu(X,r,tol,V=v0,W=w0,verbose=1):
    Vr = V[:,0:r].copy()
    Wr = W[0:r,:].copy()
    X_gpu = gpuarray.to_gpu(X)
    V_gpu = gpuarray.to_gpu(Vr)
    W_gpu = gpuarray.to_gpu(Wr)
    #Frobinius norm at previous step
    B_gpu = linalg.dot(V_gpu, W_gpu)
    L = linalg.norm(X_gpu-B_gpu)**2
    iteration = 0
    while 1: #update V
        V_gpu *= linalg.dot(X_gpu,linalg.transpose(W_gpu))
        V_gpu /= linalg.dot(B_gpu,linalg.transpose(W_gpu))
        B_gpu = linalg.dot(V_gpu, W_gpu)
        #update W
        W_gpu *= linalg.dot(linalg.transpose(V_gpu),X_gpu)
        W_gpu /= linalg.dot(linalg.transpose(V_gpu),B_gpu)
        B_gpu = linalg.dot(V_gpu, W_gpu)
        Lnew = linalg.norm(X_gpu-B_gpu)**2
        if abs(Lnew-L) <= tol*(L+1):
            break
        else:
            L = Lnew
            iteration += 1
            if(verbose and iteration%50==0):
                print "At iteration %i, the loss is %.2f" %(iteration, L)
    return V_gpu,W_gpu,iteration
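For reference, one iteration of the same multiplicative NMF updates in plain NumPy, a sketch of the rule the GPU loop applies (the small eps guards against division by zero and is an addition, not part of the code above):

import numpy as np

def nmf_step_cpu(X, V, W, eps=1e-12):
    B = V @ W                              # current reconstruction
    V = V * (X @ W.T) / (B @ W.T + eps)    # update V
    B = V @ W
    W = W * (V.T @ X) / (V.T @ B + eps)    # update W
    return V, W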
Example #31
 def _dot_matrix_tests(self, dtype, transa, transb):
     a = np.asarray(np.random.rand(4, 2), dtype)
     if transa == 'n':
         b = np.asarray(np.random.rand(2, 2), dtype)
     else:
         b = np.asarray(np.random.rand(4, 4), dtype)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb)
     aa = a if transa == 'n' else a.T
     bb = b if transb == 'n' else b.T
     assert np.allclose(np.dot(aa, bb), c_gpu.get())
     a = a.astype(dtype, order="F", copy=True)
     b = b.astype(dtype, order="F", copy=True)
     a_gpu = gpuarray.to_gpu(a)
     b_gpu = gpuarray.to_gpu(b)
     c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb)
     assert np.allclose(np.dot(aa, bb), c_gpu.get())
Example #33
    def _dev_tanh(self, devX, devW, devB):
        """Hyperbolic tangent function on GPU.

        Returns:
            devH (gpuarray): GPU matrix with the result.
        """
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        cumath.tanh(devH, out=devH)
        return devH
Example #35
def dot(a, b):
    ''' Calculates matrix multiplication "a*b" on GPU. '''
    #print("dot "+str(a.shape)+" "+str(b.shape))
    
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    
    d_gpu = linalg.dot(a_gpu, b_gpu)
    
    return d_gpu.get()
Example #36
 def bigdot(self, X, Y, out=None, mask=None):
     if out is not None:
         raise Exception("Not implemented in-place sparse multiplication")
     else:
         if type(X) == cusparse.CSR:
             Z = X.mm(Y)
         else:
             Z = linalg.dot(X, Y)
         sync_only()
         return Z
Example #37
 def thunk():
     alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
     x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
     x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
     Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
     Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
     Xtn = misc.sum(Xt, axis=1, keepdims=True)
     Xfn = misc.sum(Xf, axis=1, keepdims=True)
     Xt = misc.divide(Xt, Xtn)
     Xf = misc.divide(Xf, Xfn)
     w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
     wp = cumath.log(w)
     wpn = misc.sum(wp, axis=1, keepdims=True) / self.n
     wp = misc.subtract(wp, wpn)
     t1 = misc.sum(x * wp, axis=1)
     t2 = (self.n + depth) * cumath.log(misc.sum(w, axis=1))
     t3 = depth * wpn
     outputs[0][0] = misc.sum(t1 - t2 + t3).get()
     for v in node.outputs:
         compute_map[v][0] = True
Example #38
def mldivide(A, B):
    ''' CULA would be necessary for this function to work. :-/ '''
    A_gpu = gpuarray.to_gpu(A)
    A_inv_gpu = linalg.inv(A_gpu)  # invert the device copy, not the host array
        
    A_gpu.gpudata.free()
    del(A_gpu)
    
    B_gpu = gpuarray.to_gpu(B)
    
    out_gpu = linalg.dot(A_inv_gpu, B_gpu)
    return out_gpu.get()
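With the inverse taken on the device copy, mldivide(A, B) should agree with np.linalg.solve(A, B) for an invertible A. An illustrative check, assuming a CUDA context, linalg.init(), and an skcuda build whose linalg.inv supports this dtype:

import numpy as np

A = (np.random.rand(4, 4) + 4 * np.eye(4)).astype(np.float32)   # well conditioned
B = np.random.rand(4, 2).astype(np.float32)
print(np.allclose(mldivide(A, B), np.linalg.solve(A, B), atol=1e-4))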
Example #39
    def _dot_matrix_vector_tests(self, dtype):
        a = np.asarray(np.random.rand(4, 4), dtype)
        b = np.asarray(np.random.rand(4), dtype)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = linalg.dot(a_gpu, b_gpu)
        assert np.allclose(np.dot(a, b), c_gpu.get())

        a = np.asarray(np.random.rand(4), dtype)
        b = np.asarray(np.random.rand(4, 4), dtype)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = linalg.dot(a_gpu, b_gpu)
        assert np.allclose(np.dot(a, b), c_gpu.get())

        a = np.asarray(np.random.rand(4, 4), dtype)
        b = np.asarray(np.random.rand(4, 1), dtype)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = linalg.dot(a_gpu, b_gpu)
        assert np.allclose(np.dot(a, b), c_gpu.get())
Example #40
def skcuda_linalg(a, b):
    linalg.init()
    a = np.asarray(a, np.float32)
    b = np.asarray(b, np.float32)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, 'T')
    a_nrm = linalg.norm(a_gpu)
    b_nrm = linalg.norm(b_gpu)
    type(a_nrm)
    ans = misc.divide(c_gpu, a_nrm * b_nrm)
    print ans
Example #41
def logis(y,x):
    end = 0
    start = 0
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    start=time.time()
    # Transfer variables to the GPU
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    linalg.init()
    # Transpose of X
    x_gpu_T = linalg.transpose(x_gpu)
    beta_gpu = linalg.dot(linalg.dot(linalg.inv(linalg.dot(x_gpu_T,x_gpu)),x_gpu_T),y_gpu)
    j = 1
    while(True):
        mu = sapply(x,beta_gpu.get())
        mu = mu.astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu)
        V_gpu= linalg.diag(mu_gpu)
        f2_gpu = linalg.multiply(mu_gpu,1-mu_gpu)
        f3_gpu = linalg.diag(1/f2_gpu)
        f4_gpu = (y_gpu-mu_gpu)
        f5_gpu = linalg.dot(f3_gpu,f4_gpu)
        if(np.isnan(f5_gpu.get()).any()):
            f5_cpu = f5_gpu.get()
            f5_cpu = nanValue(f5_cpu)
            f5_gpu = gpuarray.to_gpu(f5_cpu.astype(np.float32))
        y_1_gpu = linalg.dot(x_gpu,beta_gpu) + f5_gpu
        beta_1_gpu = linalg.dot(linalg.dot(linalg.dot(linalg.inv(linalg.dot(linalg.dot(x_gpu_T,V_gpu),x_gpu)),x_gpu_T),V_gpu),y_1_gpu)
        check_value = np.absolute(linalg.norm(beta_1_gpu-beta_gpu))
        #if(check_value<0.00001):
            #break
        if(j == 10 or check_value<0.00001):
            break
        beta_gpu = beta_1_gpu
        j = j + 1
    end = time.time()
    tiempo = (end-start)
    return {"iteraciones":j,"Betas":beta_gpu.get(),"time":tiempo}
Example #42
def getCSMGPU(XG, YG):
    tbegin = time.time()
    GPUNeg2 = gpuarray.to_gpu(np.array([-2.0], dtype=np.float32))
    YGT = linalg.transpose(YG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    YSqr = skcuda.misc.multiply(YG, YG)
    YSqr = skcuda.misc.sum(YSqr, 1)
    C = linalg.dot(XG, YGT)
    C = skcuda.misc.multiply(GPUNeg2, C)
    skcuda.misc.add_matvec(C, XSqr, 0, C)
    skcuda.misc.add_matvec(C, YSqr, 1, C)
    return C
Example #43
def safe_sparse_dot(a, b, transa='N', transb='N', dense_output=True):
    '''Dot product that handles the sparse matrix case correctly.

    Parameters
    ----------
    a : array
    b : array

    Returns
    -------
    dot_product : array
    '''
    return culinalg.dot(a, b, transa, transb)
    """Dot product that handle the sparse matrix case correctly
def getCSMGPU(XG, YG):
    tbegin = time.time()
    GPUNeg2 = gpuarray.to_gpu(np.array([-2.0], dtype=np.float32))
    YGT = linalg.transpose(YG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    YSqr = skcuda.misc.multiply(YG, YG)
    YSqr = skcuda.misc.sum(YSqr, 1)
    C = linalg.dot(XG, YGT)
    C = skcuda.misc.multiply(GPUNeg2, C)
    skcuda.misc.add_matvec(C, XSqr, 0, C)
    skcuda.misc.add_matvec(C, YSqr, 1, C)
    return C
Example #45
    def _dev_sigm(self, devX, devW, devB):
        """Compute Sigmoid on GPU for a given array and return array."""

#        def sigm(a):
#            block = a._block
#            grid = (int(np.ceil(1.0 * np.prod(a.shape) / block[0])), 1)
#            dev_sigm.prepared_call(grid, block, a.gpudata)
#            return a

        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        block = devH._block
        grid = (int(np.ceil(1.0 * np.prod(devH.shape) / block[0])), 1)
        self.dev_sigm.prepared_call(grid, block, devH.gpudata)
        return devH
Example #46
def dot(a, b):
    ''' Calculates matrix multiplication "a*b" on GPU. '''
    #print("dot "+str(a.shape)+" "+str(b.shape))
    
    # Make sure we don't run out of memory on the GPU
    if ((a.size + b.size + a.shape[0]*b.shape[1]) <= 629088256):
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        
        d_gpu = linalg.dot(a_gpu, b_gpu)
        
        return d_gpu.get()
    
    else:
        return np.dot(a, b)
Example #47
    def _predict(self, X, dev=False):
        """Predict a batch of data. Auxiliary function that implements a particular prediction.

        For prediction, use `ELM.predict()` instead.

        Args:
            X (matrix): input data size (N * `inputs`)
            dev (bool, optional): whether leave result in the GPU memory

        Returns:
            Y (matrix): predicted outputs size (N * `outputs`), always in float/double format.
        """
        assert self.B is not None, "Solve the task before predicting"
        devH = self._project(X, dev=True)
        devY = linalg.dot(devH, self.B)
        Y = devY if dev else devY.get()
        return Y
Example #48
def inner(a, b):
    ''' Calculates inner product of "a" and "b". '''
    #print("inner "+str(a.shape)+" "+str(b.shape))
    
    # send b to GPU
    a_gpu = gpuarray.to_gpu(np.matrix(a))
    b_gpu = gpuarray.to_gpu(np.matrix(b))
    
    
    out_gpu = linalg.dot(a_gpu, b_gpu, transb='T')
    
    #clear
    a_gpu.gpudata.free()
    del(a_gpu)
    b_gpu.gpudata.free()
    del(b_gpu)
        
    return out_gpu.get()
Example #49
def cuda_dot2(b, A):
    print("cuda_dot2", b.shape, A.shape)
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    out_gpu = linalg.dot(bt_gpu, A_gpu)
    
    b_gpu.gpudata.free()
    del(b_gpu)
    bt_gpu.gpudata.free()
    del(bt_gpu)
    A_gpu.gpudata.free()
    del(A_gpu)
    
    return out_gpu.get()
Example #50
def getCSMGPU2(XG, YG):
    #Step 1: Sum of squares across rows
    dim = np.int32(XG.shape[1])
    dimpow2 = roundUpPow2(dim)
    NThreads = np.int32(min(dimpow2, 512))
    XSqr = gpuarray.empty(XG.shape[0], np.float32)
    YSqr = gpuarray.empty(YG.shape[0], np.float32)
    getSumSquares_(XG, XSqr, dim, dimpow2, block=(NThreads, 1, 1), grid=(XG.shape[0], 1), shared=4*dimpow2)
    getSumSquares_(YG, YSqr, dim, dimpow2, block=(NThreads, 1, 1), grid=(YG.shape[0], 1), shared=4*dimpow2)

    #Step 2: Do multiplication part
    YGT = linalg.transpose(YG)
    CSM = linalg.dot(XG, YGT)

    #Step 3: Add everything together
    Mp = np.array(XG.shape[0], dtype=np.int32)
    Np = np.array(YG.shape[0], dtype=np.int32)
    MPow2 = roundUpPow2(XG.shape[0])
    NThreads = min(MPow2, 512)
    #CSM is N x M
    finishCSM_(CSM, XSqr, YSqr, Np, Mp, MPow2, block=(NThreads, 1, 1), grid=(YG.shape[0], 1))
    return (CSM, XSqr, YSqr)
Example #51
def dot2(b, A):
    ''' Calculates matrix multiplication "b.T*A" on GPU. '''
    #print("dot2 "+str(b.shape)+" "+str(A.shape))
    
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
    # clear b
    b_gpu.gpudata.free()
    del(b_gpu)
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    out_gpu = linalg.dot(bt_gpu, A_gpu)
    
    #clear
    #bt_gpu.gpudata.free()
    #del(bt_gpu)
    #A_gpu.gpudata.free()
    #del(A_gpu)
    
    return out_gpu.get()
Example #52
def dot2(b, A):
    ''' Calculates matrix multiplication "b.T*A" on GPU. '''
    #print("dot2 "+str(b.shape)+" "+str(A.shape))
    
    
    # Make sure we don't run out of memory on the GPU
    if ((A.size + b.size + A.shape[0]*b.shape[1]) <= 629088256):
        try:
            # send b to GPU
            b_gpu = gpuarray.to_gpu(b)
            # transpose b on GPU
            bt_gpu = linalg.transpose(b_gpu)
            # clear b
            b_gpu.gpudata.free()
            del(b_gpu)
            
            # send A to GPU    
            A_gpu = gpuarray.to_gpu(A)
            
            out_gpu = linalg.dot(bt_gpu, A_gpu)
        except:
            # clear b
            b_gpu.gpudata.free()
            del(b_gpu)
            
            print("Too big for GPU, using CPU.")
            return np.dot(b.T, A)
    else:
        print("Too big for GPU, using CPU.")
        return np.dot(b.T, A)
    #clear
    #bt_gpu.gpudata.free()
    #del(bt_gpu)
    #A_gpu.gpudata.free()
    #del(A_gpu)
    
    return out_gpu.get()
Example #53
def elmvis(Xraw,
           A,
           slowdown=10,
           report=5,
           maxtime=24*60*60,
           tol=0,
           batch=None,
           maxiter=None,
           maxupdate=None,
           maxstall=None,
           cossim=None,
           silent=False):
    """ELMVIS+ function running in GPU memory.
    """
    X = Xraw / np.linalg.norm(Xraw, axis=1)[:, None]  # unit-length version of X
    Xh = np.dot(A, X)  # X_hat, predicted value of X
    N, d = X.shape
    I = np.arange(N)  # index of samples

    # set default values
    if cossim is None: cossim = np.trace(X.T.dot(A).dot(X)) / N
    if maxiter is None: maxiter = N*N*N
    if maxupdate is None: maxupdate = N*N
    if maxstall is None: maxstall = N*N

    if not silent:
        print "original similarity: ", cossim

    # init GPU
    dt = X.dtype.type
    try:
        linalg.init()
    except ImportError as e:
        print e
    devA = gpuarray.to_gpu(A.astype(dt))
    devX = gpuarray.to_gpu(X.astype(dt))
    devXi1 = gpuarray.empty((d,), dtype=dt)
    devXh = linalg.dot(devA, devX)
    devAi = gpuarray.empty((N, 2), dtype=dt)
    devDelta = gpuarray.empty((2, d), dtype=dt)
    result = gpuarray.empty((d,), dtype=dt)

    # swap kernel
    kernel = """
        __global__ void diff(%s *A, %s *Y, %s *AY, %s *result, long d, long N, long i1, long i2) {
            long j = blockDim.x * blockIdx.x + threadIdx.x;
            %s yi1 = Y[i1*d + j];
            %s yi2 = Y[i2*d + j];
            result[j] = (A[i1*N + i1] * (yi2 - yi1) + 2*AY[i1*d + j]) * (yi2 - yi1) +
                        (A[i2*N + i2] * (yi1 - yi2) + 2*(AY[i2*d + j] + A[i2*N + i1]*(yi2 - yi1))) * (yi1 - yi2);
        }
        """
    if dt is np.float64:
        kernel = kernel % ("double", "double", "double", "double", "double", "double")
    else:
        kernel = kernel % ("float", "float", "float", "float", "float", "float")
    mod_diff = SourceModule(kernel)
    dev_diff = mod_diff.get_function("diff")
    dev_diff.prepare("PPPPllll")
    block = result._block
    grid = (int(np.ceil(1.0 * result.shape[0] / block[0])), 1)

    t0 = tlast = time()
    stall = 0
    iters = 0
    updates = 0
    updates_last = 0
    iters_last = 0
    ups_max = 0

    while (iters < maxiter) and (stall < maxstall):
        iters += 1
        stall += 1

        # get two different random numbers
        i1, i2 = np.random.randint(0, N, size=2)
        while i1 == i2:
            i1, i2 = np.random.randint(0, N, size=2)

        dev_diff.prepared_call(grid, block, devA.gpudata, devX.gpudata, devXh.gpudata, result.gpudata, d, N, i1, i2)
        diff = np.sum(result.get())

        if diff > tol:
            stall = 0
            devAi[:, 0] = devA[:, i1]
            devAi[:, 1] = devA[:, i2]
            devDelta[0, :] = devX[i1, :] - devX[i2, :]
            devDelta[1, :] = devX[i2, :] - devX[i1, :]
            linalg.add_dot(devAi, devDelta, devXh, alpha=-1)

            tI = I[i1]
            I[i1] = I[i2]
            I[i2] = tI

            devXi1[:] = devX[i1, :]
            devX[i1] = devX[i2]
            devX[i2] = devXi1

            cossim += diff / N
            updates += 1
            if updates > maxupdate:
                break

        t = time()
        if t - tlast > report:
            ups = (updates-updates_last)*1.0/(t-tlast)
            ips = (iters-iters_last)*1.0/(t-tlast)
            if not silent:
                print "%d iters | %d updates | %.0f iters/s | %.0f updates/s | cos similarity = %.4f" % (iters, updates, ips, ups, cossim)

            updates_last = updates
            iters_last = iters
            tlast = t
            ups_max = max(ups, ups_max)
            if ups < ups_max/slowdown:
                break

        if t - t0 > maxtime:
            break

    ips = iters*1.0/(time()-t0)
    ups = updates*1.0/(time()-t0)
    Xraw[:] = Xraw[I]

    cossim = np.trace(X.T.dot(A).dot(X)) / N
    if not silent:
        print "final similarity: ", cossim

    info = {'cossim': cossim, 'iters': iters, 'updates': updates, 'ips': ips, 'ups': ups}
    return I, info
Example #54
def getnextz(\
            z_last,za, c_obs_long,\
            Nx,Ny,nx,ny,nXobs,\
            Se_inv,Sa_inv,\
            Arule, Brule, Crule, Drule,\
            KAxrule, KAyrule, \
            KBxrule, KByrule, \
            KCxrule, KCyrule, \
            KDxrule, KDyrule):          
    
    
    nx_last_long = cds.dot(Nx,z_last)
    ny_last_long = cds.dot(Ny,z_last)
    nx_last = matrix(reshape(nx_last_long,(ny-1,nx-1))); 
    ny_last = matrix(reshape(ny_last_long,(ny-1,nx-1))); 
           
    vbigK, bigc_last = getbigK(\
            nx,ny,nXobs, 
            nx_last, ny_last,
            Arule, Brule, Crule, Drule,
            KAxrule, KAyrule, 
            KBxrule, KByrule, 
            KCxrule, KCyrule, 
            KDxrule, KDyrule)    
            
    NxNy = vstack((Nx,Ny))
    
    KN = cds.dot(vbigK, NxNy)
    
    
    nobs = nXobs*4   # int*int
    bigc_last_long = matrix(reshape(cds.T(bigc_last),(nobs,1)))
    delta_c = cds.substract(c_obs_long, bigc_last_long)

#     # This is the Gauss method
#     dz = linalg.inv(KN.T*KN)*KN.T*delta_c
#     z_next = z_last + dz
#     di2 = squeeze(dot(dz.T,dz))
    
    # This is the optimal estimation method
    
    # the following two lines could be merged
    
    
    #term3
    zz_gpu = gpuarray.to_gpu(z_last-za)
    delta_c_gpu = gpuarray.to_gpu(delta_c)
    KN_gpu = gpuarray.to_gpu(KN)  # (important for term2)
    
    temp0_gpu = linalg.add_dot(KN_gpu, zz_gpu, delta_c_gpu)
    
    zz_gpu.gpudata.free()
    del(zz_gpu)
    
    
    Se_inv_gpu = gpuarray.to_gpu(Se_inv)  # (important for term2)
    
    temp1_gpu = linalg.dot(Se_inv_gpu, temp0_gpu)
    
    #temp0_gpu.gpudata.free()
    #del(temp0_gpu)
    
    term3_gpu = linalg.dot(KN_gpu, temp1_gpu, transa="T")
    term3 = term3_gpu.get()
    
    temp1_gpu.gpudata.free()
    del(temp1_gpu)
    term3_gpu.gpudata.free()
    del(term3_gpu)
        
    # term2
    temp2_gpu = linalg.dot(Se_inv_gpu, KN_gpu)
    
    Se_inv_gpu.gpudata.free()
    del(Se_inv_gpu)
    
    Sa_inv_gpu = gpuarray.to_gpu(Sa_inv)
    
    term2_gpu = linalg.add_dot(KN_gpu, temp2_gpu, Sa_inv_gpu, transa="T")
    
    temp2_gpu.gpudata.free()
    del(temp2_gpu)
    
    KN_gpu.gpudata.free()
    del(KN_gpu)
    
    term2 = term2_gpu.get()
    #term0 = cds.dot3(Se_inv, KN)
    #term2 = Sa_inv+term0
    #term3 = cds.dot2(KN, cds.dot(Se_inv, (delta_c+cds.dot(KN,(z_last-za)))))
    
    z_next = za + np.linalg.solve(term2,term3) # same as term2\term3
        
    dz_gpu = gpuarray.to_gpu(z_next-z_last)
    
    temp3_gpu = linalg.dot(term2_gpu, dz_gpu)
    
    term2_gpu.gpudata.free()
    del(term2_gpu)
    
    di2_gpu = linalg.dot(dz_gpu, temp3_gpu, transa='T')
    
    #dz = z_next - z_last
    #di2 = cds.dot3(term2, dz)

    # Get out
    return z_next, di2_gpu.get()
Example #55
 def dot_mm(self, a, b, out, transa=False, transb=False):
     transa = 'T' if transa else 'N'
     transb = 'T' if transb else 'N'
     culinalg.dot(a, b, transa=transa, transb=transb, out=out)
Example #56
demo_types = [np.float32, np.float64] # we can use single or double precision
precisions = ['single', 'double']

print("Principal Component Analysis Demo!")
print("Compute all 100 principal components of a 1000x100 data matrix")
print("Lets test if the first two resulting eigenvectors (principal components) are orthogonal, by dotting them and seeing if it is about zero, then we can see the amount of the origial variance explained by just two of the original 100 dimensions.\n\n\n")

for i in range(len(demo_types)):

	demo_type = demo_types[i]
	X = np.random.rand(1000,100).astype(demo_type) # 1000 samples of 100-dimensional data vectors
	X_gpu = gpuarray.GPUArray((1000,100), demo_type, order="F") # note that order="F" or a transpose is necessary. fit_transform requires row-major matrices, and column-major is the default
	X_gpu.set(X) # copy data to gpu
	T_gpu = pca.fit_transform(X_gpu) # calculate the principal components
	dot_product = linalg.dot(T_gpu[:,0], T_gpu[:,1]) # show that the resulting eigenvectors are orthogonal 
	print("The dot product of the two " + str(precisions[i]) + " precision eigenvectors is: " + str(dot_product))
	# now get the variance of each eigenvector so we can see the percent explained by the first two
	std_vec = np.std(T_gpu.get(), axis=0)
	print("We explained " + str(100 * np.sum(std_vec[:2]) / np.sum(std_vec)) + "% of the variance with 2 principal components in " +  str(precisions[i]) +  " precision\n\n")
Example #57
for t in demo_types:
    print "Testing matrix multiplication for type " + str(np.dtype(t))
    if np.iscomplexobj(t()):
        a = np.asarray(np.random.rand(10, 5) + 1j * np.random.rand(10, 5), t)
        b = np.asarray(np.random.rand(5, 5) + 1j * np.random.rand(5, 5), t)
        c = np.asarray(np.random.rand(5, 5) + 1j * np.random.rand(5, 5), t)
    else:
        a = np.asarray(np.random.rand(10, 5), t)
        b = np.asarray(np.random.rand(5, 5), t)
        c = np.asarray(np.random.rand(5, 5), t)

    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)

    temp_gpu = culinalg.dot(a_gpu, b_gpu)
    d_gpu = culinalg.dot(temp_gpu, c_gpu)
    temp_gpu.gpudata.free()
    del (temp_gpu)
    print "Success status: ", np.allclose(np.dot(np.dot(a, b), c), d_gpu.get())

    print "Testing vector multiplication for type " + str(np.dtype(t))
    if np.iscomplexobj(t()):
        d = np.asarray(np.random.rand(5) + 1j * np.random.rand(5), t)
        e = np.asarray(np.random.rand(5) + 1j * np.random.rand(5), t)
    else:
        d = np.asarray(np.random.rand(5), t)
        e = np.asarray(np.random.rand(5), t)

    d_gpu = gpuarray.to_gpu(d)
    e_gpu = gpuarray.to_gpu(e)