Example 1
    def add_batch(self, X, T, wc=None):
        """Add a batch of training data to an iterative solution, weighted if neeed.

        The batch is processed as a whole, the training data is splitted in `ELM.add_data()` method.
        With parameters HH_out, HT_out, the output will be put into these matrices instead of model.

        Args:
            X (matrix): input data matrix size (N * `inputs`)
            T (matrix): output data matrix size (N * `outputs`)
            wc (vector): vector of weights for data samples, one weight per sample, size (N * 1)
            HH_out, HT_out (matrix, optional): output matrices to add batch result into, always given together
        """
        devH = self._project(X, dev=True)
        T = np.array(T, order="C", dtype=self.precision)
        devT = gpuarray.to_gpu(T)
        if wc is not None:  # apply weights if given
            w = np.array(wc**0.5, dtype=self.precision)[:, None]  # re-shape to column matrix
            devWC = gpuarray.to_gpu(w)
            misc.mult_matvec(devH, devWC, axis=0, out=devH)
            misc.mult_matvec(devT, devWC, axis=0, out=devT)

        if self.HH is None:  # initialize space for self.HH, self.HT
            self.HT = misc.zeros((self.L, self.outputs), dtype=self.precision)
            self.HH = linalg.eye(self.L, dtype=self.precision)
            self.HH *= self.norm

        linalg.add_dot(devH, devT, self.HT, transa='T')
        if self.precision is np.float64:
            linalg.add_dot(devH, devH, self.HH, transa='T')
        else:
            cublas.cublasSsyrk(self.handle, 'L', 'N', self.L, X.shape[0], 1, devH.ptr, self.L, 1, self.HH.ptr, self.L)
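A note on what this method accumulates: with H the (N x L) hidden-layer output and T the (N x outputs) targets, the two add_dot calls (and the cublasSsyrk fast path for float32) perform HH += H^T H and HT += H^T T on top of an initial HH = norm * I ridge term. Below is a minimal NumPy restatement of the same bookkeeping, for illustration only (add_batch_cpu is a hypothetical helper, not part of the library):

import numpy as np

def add_batch_cpu(HH, HT, H, T, wc=None, norm=1e-9):
    """CPU sketch of the accumulation the GPU code above performs."""
    if HH is None:                          # first batch: ridge-regularized identity
        HH = norm * np.eye(H.shape[1])
        HT = np.zeros((H.shape[1], T.shape[1]))
    if wc is not None:                      # per-sample weights enter as sqrt(w) on both H and T
        w = np.sqrt(np.asarray(wc, dtype=H.dtype))[:, None]
        H, T = H * w, T * w
    HH += H.T @ H                           # add_dot(devH, devH, self.HH, transa='T') / cublasSsyrk
    HT += H.T @ T                           # add_dot(devH, devT, self.HT, transa='T')
    return HH, HT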
Example 2
def sqsum_adddot2(a, b):
    """
    Compute the squared Euclidean distance between two 2D arrays representing
    n-dimensional points using GPU. This uses the GPUArray versions of the
    input arrays to compute element-wise summations of squared sum of rows and
    accumulates into the matrix-multiplication result residing on GPU.
    The final result resides on GPU.

    Parameters
    ----------
    a : ndarray
        2D NumPy array of float dtype representing n-dimensional points, with
        each row being one point.
    b : ndarray
        2D NumPy array of float dtype representing n-dimensional points, with
        each row being one point.

    Returns
    -------
    out : GPUArray
        This holds the squared Euclidean distances.

    """

    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = sq_sums(a_gpu, b_gpu)
    return culinalg.add_dot(a_gpu, b_gpu, c_gpu, transb='T', alpha=-2.0)
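The sq_sums helper (defined elsewhere in that project) precomputes the row-norm sums, and the add_dot call with alpha=-2.0 folds in the cross term, so the result is the squared-distance expansion ||a_i - b_j||^2 = ||a_i||^2 + ||b_j||^2 - 2 a_i.b_j. A small NumPy check of that identity, for illustration only:

import numpy as np

a = np.random.rand(5, 3)
b = np.random.rand(4, 3)
sums = (a**2).sum(axis=1)[:, None] + (b**2).sum(axis=1)[None, :]   # what sq_sums precomputes
dists = sums - 2.0 * a.dot(b.T)                                    # what add_dot(..., transb='T', alpha=-2.0) adds
ref = ((a[:, None, :] - b[None, :, :])**2).sum(axis=-1)
assert np.allclose(dists, ref)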
Example 3
def sqsum_adddot(a, b, method):
    """
    Compute the squared Euclidean distance between two 2D arrays representing
    n-dimensional points using GPU. This uses the input arrays themselves to
    compute element-wise summations of squared sum of rows and accumulates into
    the matrix-multiplication result residing on GPU.
    The final result resides on GPU.

    Parameters
    ----------
    a : ndarray
        2D NumPy array of float dtype representing n-dimensional points, with
        each row being one point.
    b : ndarray
        2D NumPy array of float dtype representing n-dimensional points, with
        each row being one point.
    method : str
        It can be 'add_togpu' or 'togpu_misc_add' or 'togpu_cuda_add'.
        Refer to function "squared_sum" for more information.

    Returns
    -------
    out : GPUArray
        This holds the squared Euclidean distances, residing on the GPU.
    """

    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = squared_sum(a, b, method=method)
    return culinalg.add_dot(a_gpu, b_gpu, c_gpu, transb='T', alpha=-2.0)
Example 4
    def get_distances_to_centers(self, data):

        # make sure the array is c order
        data = np.asarray(data, dtype=np.float32, order='C')

        # ship to gpu
        data_gpu = gpuarray.to_gpu(data)

        # alloc space on gpu for distances
        dists_shape = (data.shape[0], self.centers.shape[0])
        dists_gpu = gpuarray.zeros(dists_shape, np.float32)

        # calc data norms on gpu
        data_norms = cumisc.sum(data_gpu**2, axis=1)

        # calc distance on gpu
        cumisc.add_matvec(dists_gpu, self.center_norms, 1, dists_gpu)
        cumisc.add_matvec(dists_gpu, data_norms, 0, dists_gpu)
        culinalg.add_dot(data_gpu, self.centers_gpu,
            dists_gpu, transb='T', alpha=-2.0)
        return dists_gpu
Example 5
def multiplyConv2DHGradKLGPU(W, H, V, VLam, doDivision=True):
    """
    Compute the 2D convolutional multiplicative update for H
    under the Kullback-Leibler divergence, using skcuda
    :param W: A TxNxK matrix of K sources over spatiotemporal spans NxT\
    :param H: A FxKxM matrix of source activations for each submatrix of W\
            over F transpositions over M time
    :param V: GPU target array being approximated
    :param VLam: Convolutional WH multiplication
    :param doDivision: If true, return the factor Numerator/Denominator\
        otherwise, return (Numerator, Denominator)
    :returns Ratio: A FxKxM matrix of multiplicative updates for H\
        or (RatioNum, RatioDenom) if doDivision = False
    """
    HNums = gpuarray.zeros(H.shape, np.float32)
    HDenoms = gpuarray.zeros((H.shape[0], H.shape[1]), np.float32)
    thisVLam = VLam.copy()
    ZerosToOnes(thisVLam)
    VLamQuot = skcuda.misc.divide(V, thisVLam)
    thisVLamQuot = VLamQuot.copy()
    thisW = W.copy()
    for t in range(W.shape[0]):
        if t > 0:
            z = gpuarray.zeros((V.shape[0], t), np.float32)
            thisVLamQuot[:, 0:-t] = VLamQuot[:, t::]
            thisVLamQuot[:, -t::] = z
        for f in range(H.shape[0]):
            if f > 0:
                thisW[t, f::, :] = W[t, 0:-f, :]
                thisW[t, 0:f, :] = gpuarray.zeros((f, W.shape[2]), np.float32)
            linalg.add_dot(thisW[t, :, :],
                           thisVLamQuot,
                           HNums[f, :, :],
                           transa='T')
            HDenoms[f, :] = skcuda.misc.add(HDenoms[f, :],
                                            skcuda.misc.sum(thisW[t, :, :], 0))
    HDenoms = TileHDenom(HDenoms[:, :, None], H.shape[2])
    if doDivision:
        return skcuda.misc.divide(HNums, HDenoms)
    else:
        return (HNums, HDenoms)
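The nested loops above shift W and V/Lambda to build the 2-D convolutional version of the Kullback-Leibler update; dropping the shifts recovers the standard Lee-Seung rule H <- H * (W^T (V/WH)) / (W^T 1). A minimal NumPy sketch of that non-convolutional rule, for orientation only (kl_update_H is a hypothetical helper, not the GPU code above):

import numpy as np

def kl_update_H(W, H, V, eps=1e-12):
    WH = W @ H
    num = W.T @ (V / np.maximum(WH, eps))   # counterpart of add_dot(thisW, thisVLamQuot, HNums, transa='T')
    den = W.sum(axis=0)[:, None]            # counterpart of accumulating column sums of W into HDenoms
    return H * num / np.maximum(den, eps)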
Example 6
    def _impl_add_dot_matrix_tests(self, dtype, transa, transb):
        a = np.asarray(np.random.rand(4, 2), dtype)
        if transa == 'n':
            b = np.asarray(np.random.rand(2, 2), dtype)
        else:
            b = np.asarray(np.random.rand(4, 4), dtype)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        aa = a if transa == 'n' else a.T
        bb = b if transb == 'n' else b.T
        c = np.asarray(np.random.rand(aa.shape[0], bb.shape[1]), dtype)
        c_gpu = gpuarray.to_gpu(c)
        c_gpu = linalg.add_dot(a_gpu, b_gpu, c_gpu, transa, transb)
        assert np.allclose(c + np.dot(aa, bb), c_gpu.get())
        a = a.astype(dtype, order="F", copy=True)
        b = b.astype(dtype, order="F", copy=True)
        c = c.astype(dtype, order="F", copy=True)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = gpuarray.to_gpu(c)
        c_gpu = linalg.add_dot(a_gpu, b_gpu, c_gpu, transa, transb)
        assert np.allclose(c + np.dot(aa, bb), c_gpu.get())
Example 7
def multiplyConv2DHGradGPU(W, H, V, VLam, doDivision=True):
    """
    Compute the 2D convolutional multiplicative update for H using skcuda
    :param W: A TxNxK GPU array of K sources over spatiotemporal spans NxT\
    :param H: A FxKxM GPU array of source activations for each submatrix of W\
            over F transpositions over M time
    :param V: An MxN GPU target array
    :param VLam: An MxN GPU estimate array
    :param doDivision: If true, return the factor Numerator/Denominator\
        otherwise, return (Numerator, Denominator)
    """
    thisV = V.copy()
    thisVLam = VLam.copy()
    thisW = W.copy()
    HNums = gpuarray.zeros(H.shape, np.float32)
    HDenoms = gpuarray.zeros(H.shape, np.float32)
    for t in range(W.shape[0]):
        if t > 0:
            #thisV = shiftMatLRUD(V, dj=-t)
            z = gpuarray.zeros((V.shape[0], t), np.float32)
            thisV[:, 0:-t] = V[:, t::]
            thisV[:, -t::] = z
            thisVLam[:, 0:-t] = VLam[:, t::]
            thisVLam[:, -t::] = z
        for f in range(H.shape[0]):
            if f > 0:
                #thisW = shiftMatLRUD(W[t, :, :], di=f)
                thisW[t, f::, :] = W[t, 0:-f, :]
                thisW[t, 0:f, :] = gpuarray.zeros((f, W.shape[2]), np.float32)
            linalg.add_dot(thisW[t, :, :], thisV, HNums[f, :, :], transa='T')
            linalg.add_dot(thisW[t, :, :],
                           thisVLam,
                           HDenoms[f, :, :],
                           transa='T')
    if doDivision:
        return skcuda.misc.divide(HNums, HDenoms)
    else:
        return (HNums, HDenoms)
Example 8
def multiplyConv2DWGradGPU(W, H, V, VLam, doDivision=True):
    """
    Compute the 2D convolutional multiplicative update for W using skcuda
    :param W: A TxNxK GPU array of K sources over spatiotemporal spans NxT\
    :param H: A FxKxM GPU array of source activations for each submatrix of W\
            over F transpositions over M time
    :param V: An MxN GPU target array
    :param VLam: An MxN GPU estimate array
    :param doDivision: If true, return the factor Numerator/Denominator\
        otherwise, return (Numerator, Denominator)
    """
    thisV = V.copy()
    thisVLam = VLam.copy()
    thisH = H.copy()
    WNums = gpuarray.zeros(W.shape, np.float32)
    WDenoms = gpuarray.zeros(W.shape, np.float32)
    for f in range(H.shape[0]):
        if f > 0:
            z = gpuarray.zeros((f, V.shape[1]), np.float32)
            thisV[0:-f, :] = V[f::, :]
            thisV[-f::, :] = z
            thisVLam[0:-f, :] = VLam[f::, :]
            thisVLam[-f::, :] = z
        for t in range(W.shape[0]):
            if t > 0:
                thisH[f, :, t::] = H[f, :, 0:-t]
                thisH[f, :, 0:t] = gpuarray.zeros((H.shape[1], t), np.float32)
            linalg.add_dot(thisV, thisH[f, :, :], WNums[t, :, :], transb='T')
            linalg.add_dot(thisVLam,
                           thisH[f, :, :],
                           WDenoms[t, :, :],
                           transb='T')
    if doDivision:
        return skcuda.misc.divide(WNums, WDenoms)
    else:
        return (WNums, WDenoms)
Example 9
    def _impl_test_dot_strided(self, dtype):
        # n/n
        a = np.asarray(np.random.rand(4, 10), dtype)
        b = np.asarray(np.random.rand(2, 20), dtype)
        c = np.zeros((4, 30), dtype)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = gpuarray.to_gpu(c)
        linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:7], 'n', 'n')
        res = c_gpu.get()
        assert np.allclose(np.dot(a[:, 4:6], b[:, 2:8]), res[:, 1:7])

        # t/n
        a = np.asarray(np.random.rand(4, 10), dtype)
        b = np.asarray(np.random.rand(4, 20), dtype)
        c = np.zeros((2, 30), dtype)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = gpuarray.to_gpu(c)
        linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:7], 't', 'n')
        res = c_gpu.get()
        assert np.allclose(np.dot(a[:, 4:6].T, b[:, 2:8]), res[:, 1:7])

        # n/t
        a = np.asarray(np.random.rand(4, 10), dtype)
        b = np.asarray(np.random.rand(6, 20), dtype)
        c = np.zeros((4, 30), dtype)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = gpuarray.to_gpu(c)
        linalg.add_dot(a_gpu[:, 4:10], b_gpu[:, 2:8], c_gpu[:, 1:7], 'n', 't')
        res = c_gpu.get()
        assert np.allclose(np.dot(a[:, 4:10], b[:, 2:8].T), res[:, 1:7])

        # t/t
        a = np.asarray(np.random.rand(6, 10), dtype)
        b = np.asarray(np.random.rand(8, 20), dtype)
        c = np.zeros((2, 30), dtype)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = gpuarray.to_gpu(c)
        linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:9], 't', 't')
        res = c_gpu.get()
        assert np.allclose(np.dot(a[:, 4:6].T, b[:, 2:8].T), res[:, 1:9])
Example 10
    def dot_add_mm(self, a, b, out, transa=False, transb=False):
        transa = 'T' if transa else 'N'
        transb = 'T' if transb else 'N'
        culinalg.add_dot(a, b, out, transa, transb)
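This thin wrapper spells out the semantics every example here relies on: skcuda's linalg.add_dot is a GEMM-style accumulation, c = alpha * op(a) . op(b) + beta * c, with alpha and beta defaulting to 1, so the product is added into the output array in place. A minimal standalone usage sketch (assumes a CUDA device is available via pycuda.autoinit):

import numpy as np
import pycuda.autoinit                      # creates a CUDA context
import pycuda.gpuarray as gpuarray
from skcuda import linalg

linalg.init()
a = gpuarray.to_gpu(np.random.rand(4, 3).astype(np.float32))
b = gpuarray.to_gpu(np.random.rand(3, 5).astype(np.float32))
c = gpuarray.to_gpu(np.ones((4, 5), dtype=np.float32))
linalg.add_dot(a, b, c)                     # c <- a @ b + c, accumulated in place on the GPU
assert np.allclose(c.get(), a.get() @ b.get() + 1.0)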
Example 11
    def run(self, iterations):

        for i in range(0,iterations):
            # F = XG(G.T G)^-1
            linalg.add_dot(self.G_gpu, self.G_gpu, self.GTG_gpu, transa="T",
                           beta=0.)
            try:
                self.GTGinv_gpu.set(np.linalg.inv(self.GTG_gpu.get()))
                # linalg.pinv only worked with CULA
            except LinAlgError:
                self.GTGinv_gpu.set(np.linalg.pinv(self.GTG_gpu.get()))
            linalg.add_dot(self.X_gpu, self.G_gpu, self.XG_gpu, beta=0.)
            linalg.add_dot(self.XG_gpu, self.GTGinv_gpu, self.F_gpu, beta=0.)

            # preparation and calculation of the matrix separations
            linalg.add_dot(self.X_gpu, self.F_gpu, self.XTF_gpu, transa="T",
                           beta=0.)
            linalg.add_dot(self.F_gpu, self.F_gpu, self.FTF_gpu, transa="T",
                           beta=0.)
            self.matrix_separationXTF(self.XTF_gpu, self.XTFpos_gpu,
                                      self.XTFneg_gpu,
                                      block=(self.block_G, 1, 1),
                                      grid=(self.grid_G, 1))
            self.matrix_separationFTF(self.FTF_gpu, self.FTFpos_gpu,
                                      self.FTFneg_gpu,
                                      block=(self.block_FTF, 1, 1),
                                      grid=(self.grid_FTF, 1))

            # compute the G update
            linalg.add_dot(self.G_gpu, self.FTFpos_gpu, self.GFTFpos_gpu,
                           beta=0.)
            linalg.add_dot(self.G_gpu, self.FTFneg_gpu, self.GFTFneg_gpu,
                           beta=0.)
            self.G_ew_update(self.G_gpu, self.XTFpos_gpu, self.GFTFneg_gpu,
                             self.XTFneg_gpu, self.GFTFpos_gpu,
                             block=(self.block_G, 1, 1),
                             grid=(self.grid_G, 1))

            # test for convergence
            if (i % self.niter_test_conv == 0) and self.checkConvergence():
                print "NMF converged after %i iterations" % i
                break
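For reference, the GEMMs at the top of this loop implement the least-squares factor update F = X G (G^T G)^-1 noted in the comment; a CPU restatement of what GTG_gpu, XG_gpu and F_gpu hold (illustration only, f_update_cpu is a hypothetical helper, not a replacement for the GPU path):

import numpy as np

def f_update_cpu(X, G):
    GTG = G.T @ G                        # add_dot(G, G, GTG, transa='T', beta=0.)
    XG = X @ G                           # add_dot(X, G, XG, beta=0.)
    try:
        GTGinv = np.linalg.inv(GTG)
    except np.linalg.LinAlgError:        # singular GTG: fall back to the pseudo-inverse
        GTGinv = np.linalg.pinv(GTG)
    return XG @ GTGinv                   # add_dot(XG, GTGinv, F, beta=0.)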
Example 12
    def run(self, iterations):

        for i in range(0,iterations):
            # update H
            linalg.add_dot(self.W_gpu, self.X_gpu, self.WTX_gpu, transa="T",
                           beta=0.)  # add_dot is faster than dot; dot calls add_dot internally
            linalg.add_dot(self.W_gpu, self.W_gpu, self.WTW_gpu, transa="T",
                           beta=0.)
            linalg.add_dot(self.WTW_gpu, self.H_gpu, self.WTWH_gpu, beta=0.)
            self.update_H(self.H_gpu, self.WTX_gpu, self.WTWH_gpu,
                          block=(self.block_H, 1, 1), grid=(self.grid_H, 1))

            # update W
            linalg.add_dot(self.X_gpu, self.H_gpu, self.XHT_gpu, transb="T",
                           beta=0.)
            linalg.add_dot(self.W_gpu, self.H_gpu, self.WH_gpu, beta=0.)
            linalg.add_dot(self.WH_gpu, self.H_gpu, self.WHHT_gpu, transb="T", beta=0.)
            self.update_W(self.W_gpu, self.XHT_gpu, self.WHHT_gpu,
                          block=(self.block_W, 1, 1), grid=(self.grid_W, 1))

            # test for convergence
            if (i % self.niter_test_conv == 0) and self.checkConvergence():
                print "NMF converged after %i iterations" % i
                break
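The GEMMs in this loop assemble the numerators and denominators of the standard Euclidean multiplicative updates, which the element-wise kernels update_H and update_W then presumably apply. A minimal NumPy sketch of those rules (illustration only; nmf_euclidean_step is a hypothetical helper, and the real kernels are custom CUDA):

import numpy as np

def nmf_euclidean_step(W, H, X, eps=1e-12):
    H = H * (W.T @ X) / np.maximum(W.T @ W @ H, eps)   # WTX_gpu / WTWH_gpu
    W = W * (X @ H.T) / np.maximum(W @ H @ H.T, eps)   # XHT_gpu / WHHT_gpu
    return W, H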
Example 13
def demosaick_gpu(img):
    img = gp.to_gpu(img)
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    wA = p1x.shape[0]
    wB = p2x.shape[0]
    hA = p1x.shape[1]
    hB = p2x.shape[1]

    # Path 1
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    cm.exp(p1y, out=p1y)

    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2
    shI = [wB - 1, hB - 1, _numsel]
    shM = [(wB - 1) * (hB - 1), _numsel]
    p2x = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2x = lg.dot(pTT, _wts.c200)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c201, p2x)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c210, p2x)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c211, p2x)
    msc.add_matvec(p2x, _wts.c2b, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(shI)

    # conv 3
    shI = [wB - 2, hB - 2, _numsel]
    shM = [(wB - 2) * (hB - 2), _numsel]
    p2y = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2y = lg.dot(pTT, _wts.c300)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c301, p2y)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c310, p2y)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c311, p2y)
    msc.add_matvec(p2y, _wts.c3b, out=p2y)
    gp.maximum(p2y, 0., p2y)

    p2x = lg.dot(p2y, _wts.sout)

    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(p1x.shape)

    # Combine
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])

    im = p2im(p1.get())

    return im
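In the conv2 and conv3 stages above, a 2x2 convolution over the feature maps is evaluated as a sum of four shifted patch-times-weight matrix products: lg.dot produces the first tap and three lg.add_dot calls accumulate the rest. A small NumPy illustration of that pattern (w00..w11 are hypothetical per-tap weight matrices, not the _wts used above):

import numpy as np

x = np.random.rand(6, 7, 4)                          # height x width x channels
w00, w01, w10, w11 = (np.random.rand(4, 8) for _ in range(4))
out = (x[:-1, :-1].reshape(-1, 4) @ w00              # lg.dot(pTT, c200)
       + x[:-1, 1:].reshape(-1, 4) @ w01             # lg.add_dot(pTT, c201, out)
       + x[1:, :-1].reshape(-1, 4) @ w10              # lg.add_dot(pTT, c210, out)
       + x[1:, 1:].reshape(-1, 4) @ w11)              # lg.add_dot(pTT, c211, out)
out = out.reshape(5, 6, 8)                           # one "valid" 2x2 convolution, one matmul per tap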
Example 14
def getnextz(\
            z_last,za, c_obs_long,\
            Nx,Ny,nx,ny,nXobs,\
            Se_inv,Sa_inv,\
            Arule, Brule, Crule, Drule,\
            KAxrule, KAyrule, \
            KBxrule, KByrule, \
            KCxrule, KCyrule, \
            KDxrule, KDyrule):          
    
    
    nx_last_long = cds.dot(Nx, z_last)
    ny_last_long = cds.dot(Ny, z_last)
    nx_last = matrix(reshape(nx_last_long, (ny-1, nx-1)))
    ny_last = matrix(reshape(ny_last_long, (ny-1, nx-1)))
           
    vbigK, bigc_last = getbigK(\
            nx,ny,nXobs, 
            nx_last, ny_last,
            Arule, Brule, Crule, Drule,
            KAxrule, KAyrule, 
            KBxrule, KByrule, 
            KCxrule, KCyrule, 
            KDxrule, KDyrule)    
            
    NxNy = vstack((Nx,Ny))
    
    KN = cds.dot(vbigK, NxNy)
    
    
    nobs = nXobs*4   # int*int
    bigc_last_long = matrix(reshape(cds.T(bigc_last),(nobs,1)))
    delta_c = cds.substract(c_obs_long, bigc_last_long)

#     # This is the Gauss method
#     dz = linalg.inv(KN.T*KN)*KN.T*delta_c
#     z_next = z_last + dz
#     di2 = squeeze(dot(dz.T,dz))
    
    # This is the optimal estimation method
    
    # the following two lines could be merged
    
    
    #term3
    zz_gpu = gpuarray.to_gpu(z_last-za)
    delta_c_gpu = gpuarray.to_gpu(delta_c)
    KN_gpu = gpuarray.to_gpu(KN)  # (important for term2)
    
    temp0_gpu = linalg.add_dot(KN_gpu, zz_gpu, delta_c_gpu)
    
    zz_gpu.gpudata.free()
    del(zz_gpu)
    
    
    Se_inv_gpu = gpuarray.to_gpu(Se_inv)  # (important for term2)
    
    temp1_gpu = linalg.dot(Se_inv_gpu, temp0_gpu)
    
    #temp0_gpu.gpudata.free()
    #del(temp0_gpu)
    
    term3_gpu = linalg.dot(KN_gpu, temp1_gpu, transa="T")
    term3 = term3_gpu.get()
    
    temp1_gpu.gpudata.free()
    del(temp1_gpu)
    term3_gpu.gpudata.free()
    del(term3_gpu)
        
    # term2
    temp2_gpu = linalg.dot(Se_inv_gpu, KN_gpu)
    
    Se_inv_gpu.gpudata.free()
    del(Se_inv_gpu)
    
    Sa_inv_gpu = gpuarray.to_gpu(Sa_inv)
    
    term2_gpu = linalg.add_dot(KN_gpu, temp2_gpu, Sa_inv_gpu, transa="T")
    
    temp2_gpu.gpudata.free()
    del(temp2_gpu)
    
    KN_gpu.gpudata.free()
    del(KN_gpu)
    
    term2 = term2_gpu.get()
    #term0 = cds.dot3(Se_inv, KN)
    #term2 = Sa_inv+term0
    #term3 = cds.dot2(KN, cds.dot(Se_inv, (delta_c+cds.dot(KN,(z_last-za)))))
    
    z_next = za + np.linalg.solve(term2,term3) # same as term2\term3
        
    dz_gpu = gpuarray.to_gpu(z_next-z_last)
    
    temp3_gpu = linalg.dot(term2_gpu, dz_gpu)
    
    term2_gpu.gpudata.free()
    del(term2_gpu)
    
    di2_gpu = linalg.dot(dz_gpu, temp3_gpu, transa='T')
    
    #dz = z_next - z_last
    #di2 = cds.dot3(term2, dz)

    # Get out
    return z_next, di2_gpu.get()
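The GPU terms assembled above are exactly the optimal-estimation step spelled out in the commented lines; written out on the CPU with the same symbols (illustration only, oe_step is a hypothetical helper):

import numpy as np

def oe_step(KN, Se_inv, Sa_inv, delta_c, z_last, za):
    term3 = KN.T @ Se_inv @ (delta_c + KN @ (z_last - za))
    term2 = Sa_inv + KN.T @ Se_inv @ KN
    z_next = za + np.linalg.solve(term2, term3)       # same as term2 \ term3
    dz = z_next - z_last
    di2 = float(dz.T @ term2 @ dz)                    # convergence measure returned with z_next
    return z_next, di2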
Example 15
def elmvis(Xraw,
           A,
           slowdown=10,
           report=5,
           maxtime=24*60*60,
           tol=0,
           batch=None,
           maxiter=None,
           maxupdate=None,
           maxstall=None,
           cossim=None,
           silent=False):
    """ELMVIS+ function running in GPU memory.
    """
    X = Xraw / np.linalg.norm(Xraw, axis=1)[:, None]  # unit-length version of X
    Xh = np.dot(A, X)  # X_hat, predicted value of X
    N, d = X.shape
    I = np.arange(N)  # index of samples

    # set default values
    if cossim is None: cossim = np.trace(X.T.dot(A).dot(X)) / N
    if maxiter is None: maxiter = N*N*N
    if maxupdate is None: maxupdate = N*N
    if maxstall is None: maxstall = N*N

    if not silent:
        print "original similarity: ", cossim

    # init GPU
    dt = X.dtype.type
    try:
        linalg.init()
    except ImportError as e:
        print e
    devA = gpuarray.to_gpu(A.astype(dt))
    devX = gpuarray.to_gpu(X.astype(dt))
    devXi1 = gpuarray.empty((d,), dtype=dt)
    devXh = linalg.dot(devA, devX)
    devAi = gpuarray.empty((N, 2), dtype=dt)
    devDelta = gpuarray.empty((2, d), dtype=dt)
    result = gpuarray.empty((d,), dtype=dt)

    # swap kernel
    kernel = """
        __global__ void diff(%s *A, %s *Y, %s *AY, %s *result, long d, long N, long i1, long i2) {
            long j = blockDim.x * blockIdx.x + threadIdx.x;
            %s yi1 = Y[i1*d + j];
            %s yi2 = Y[i2*d + j];
            result[j] = (A[i1*N + i1] * (yi2 - yi1) + 2*AY[i1*d + j]) * (yi2 - yi1) +
                        (A[i2*N + i2] * (yi1 - yi2) + 2*(AY[i2*d + j] + A[i2*N + i1]*(yi2 - yi1))) * (yi1 - yi2);
        }
        """
    if dt is np.float64:
        kernel = kernel % ("double", "double", "double", "double", "double", "double")
    else:
        kernel = kernel % ("float", "float", "float", "float", "float", "float")
    mod_diff = SourceModule(kernel)
    dev_diff = mod_diff.get_function("diff")
    dev_diff.prepare("PPPPllll")
    block = result._block
    grid = (int(np.ceil(1.0 * result.shape[0] / block[0])), 1)

    t0 = tlast = time()
    stall = 0
    iters = 0
    updates = 0
    updates_last = 0
    iters_last = 0
    ups_max = 0

    while (iters < maxiter) and (stall < maxstall):
        iters += 1
        stall += 1

        # get two different random numbers
        i1, i2 = np.random.randint(0, N, size=2)
        while i1 == i2:
            i1, i2 = np.random.randint(0, N, size=2)

        dev_diff.prepared_call(grid, block, devA.gpudata, devX.gpudata, devXh.gpudata, result.gpudata, d, N, i1, i2)
        diff = np.sum(result.get())

        if diff > tol:
            stall = 0
            devAi[:, 0] = devA[:, i1]
            devAi[:, 1] = devA[:, i2]
            devDelta[0, :] = devX[i1, :] - devX[i2, :]
            devDelta[1, :] = devX[i2, :] - devX[i1, :]
            linalg.add_dot(devAi, devDelta, devXh, alpha=-1)

            tI = I[i1]
            I[i1] = I[i2]
            I[i2] = tI

            devXi1[:] = devX[i1, :]
            devX[i1] = devX[i2]
            devX[i2] = devXi1

            cossim += diff / N
            updates += 1
            if updates > maxupdate:
                break

        t = time()
        if t - tlast > report:
            ups = (updates-updates_last)*1.0/(t-tlast)
            ips = (iters-iters_last)*1.0/(t-tlast)
            if not silent:
                print "%d iters | %d updates | %.0f iters/s | %.0f updates/s | cos similarity = %.4f" % (iters, updates, ips, ups, cossim)

            updates_last = updates
            iters_last = iters
            tlast = t
            ups_max = max(ups, ups_max)
            if ups < ups_max/slowdown:
                break

        if t - t0 > maxtime:
            break

    ips = iters*1.0/(time()-t0)
    ups = updates*1.0/(time()-t0)
    Xraw[:] = Xraw[I]

    cossim = np.trace(X.T.dot(A).dot(X)) / N
    if not silent:
        print "final similarity: ", cossim

    info = {'cossim': cossim, 'iters': iters, 'updates': updates, 'ips': ips, 'ups': ups}
    return I, info
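The add_dot(devAi, devDelta, devXh, alpha=-1) call above is a rank-2 correction that keeps Xh = A.X consistent after rows i1 and i2 of X are swapped, avoiding a full recomputation of the product. A small NumPy check of that identity (illustration only):

import numpy as np

N, d = 6, 3
A = np.random.rand(N, N)
X = np.random.rand(N, d)
i1, i2 = 1, 4
Ai = A[:, [i1, i2]]                                  # devAi
Delta = np.vstack([X[i1] - X[i2], X[i2] - X[i1]])    # devDelta
Xh = A @ X - Ai @ Delta                              # add_dot(devAi, devDelta, devXh, alpha=-1)
Xswap = X.copy()
Xswap[[i1, i2]] = Xswap[[i2, i1]]
assert np.allclose(Xh, A @ Xswap)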