Example #1
 def sum_t(self, a, axis, out):
     if len(a.shape) < 3 and (axis == 0 or axis == 1):
         cumisc.sum(a, axis=axis, out=out)
     elif axis is None:
         cumisc.sum(a.reshape((a.size, 1)), axis=0, out=out)
     else:
         raise NotImplementedError
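For context, a minimal sketch of the scikit-cuda calls this helper wraps (assumes pycuda, scikit-cuda and a CUDA device are available; shapes are illustrative only):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.misc as cumisc

cumisc.init()
a = gpuarray.to_gpu(np.arange(12, dtype=np.float32).reshape(3, 4))
out0 = gpuarray.empty((4,), dtype=np.float32)
cumisc.sum(a, axis=0, out=out0)                      # column sums written into a preallocated array
total = cumisc.sum(a.reshape((a.size, 1)), axis=0)   # full reduction via the reshape trick above
print(out0.get(), total.get())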
Example #2
    def _correlate_fft(self, frames_flat, cufft_plan):
        npix = frames_flat.shape[1]

        d_in = cufft_plan.data_in
        d_in.fill(0)
        f_out1 = cufft_plan.data_out
        f_out2 = garray.zeros_like(cufft_plan.data_out)

        # fft(pad(frames_flat), axis=1)
        d_in[:, :self.nframes] = frames_flat.T.astype("f")
        f_out1 = cufft_plan.fft(d_in, output=f_out1)

        # frames_flat.sum(axis=1)
        # skmisc.sum() only works on base data, not gpuarray views,
        # so we sum on the whole array and then extract the right subset.
        skmisc.sum(d_in, axis=0, out=self.d_sums_denom_tmp)

        # fft(pad(frames_flat[::-1]), axis=1)
        d_in.fill(0)
        d_in[:, :self.nframes] = frames_flat.T[:, ::-1].astype("f")
        f_out2 = cufft_plan.fft(d_in, output=f_out2)

        # product, ifft
        f_out1 *= f_out2
        num = cufft_plan.ifft(f_out1, output=d_in)

        # numerator of g_2
        skmisc.sum(num, axis=0, out=self.d_sums)

        # denominator of g_2: correlate(d_sums_denom)
        self._correlate_denom(npix)

        self.d_numerator /= self.d_denom
        res = self.d_numerator.get()
        return res
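As the comments above note, skcuda's sum reductions expect a contiguous base array rather than a strided gpuarray view. A hedged sketch of the same workaround (sum the zero-padded base array along the desired axis; the padding only contributes zeros):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as garray
import skcuda.misc as skmisc

skmisc.init()
nframes, npad, npix = 10, 16, 8
h_in = np.zeros((npix, npad), dtype=np.float32)      # padded buffer, built contiguous on the host
h_in[:, :nframes] = 1.0                              # the real frames occupy the first nframes columns
d_in = garray.to_gpu(h_in)
col_sums = skmisc.sum(d_in, axis=0)                  # reduce the whole base array, not a view
print(col_sums.get()[:nframes])                      # only the first nframes entries are meaningful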
Example #3
    def _rbf_kernel_vectorized_cublas(data1,
                                      data2,
                                      sigma=10):  # pragma: no cover
        """kernel for edge similarity computed with the vectorized method

        Args:
            data1 (TYPE): pssm data 1
            data2 (TYPE): pssm data 2
            sigma (int, optional): exponent of the exponential

        Returns:
            np.array: value of the rbf kernel for all the pairs
        """
        beta = 2 * sigma**2
        d1_ = gpuarray.to_gpu(data1.astype(np.float32))
        d2_ = gpuarray.to_gpu(data2.astype(np.float32))
        mgpu = -2 * culinalg.dot(d1_, d2_, transa='N', transb='T')
        vgpu = cumisc.sum(d1_**2, axis=1)[:, None]
        cumisc.add_matvec(mgpu, vgpu, out=mgpu)

        vgpu = cumisc.sum(d2_**2, axis=1)
        cumisc.add_matvec(mgpu, vgpu, out=mgpu)

        mcpu = mgpu.get()
        return np.exp(-mcpu / beta).reshape(-1)
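The GPU code above relies on the usual vectorized expansion ||a_i - b_j||^2 = ||a_i||^2 + ||b_j||^2 - 2*a_i.b_j. A hedged NumPy reference of the same computation, handy for checking the CUDA path on small inputs:

import numpy as np

def rbf_kernel_cpu(data1, data2, sigma=10):
    # same expansion as the GPU version: -2*D1@D2.T + ||D1||^2 + ||D2||^2, then exp(-d2/beta)
    beta = 2 * sigma**2
    d2 = (-2.0 * data1 @ data2.T
          + (data1**2).sum(axis=1)[:, None]
          + (data2**2).sum(axis=1)[None, :])
    return np.exp(-d2 / beta).reshape(-1)

a = np.random.rand(4, 3).astype(np.float32)
b = np.random.rand(5, 3).astype(np.float32)
print(rbf_kernel_cpu(a, b).shape)   # (20,)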
Example #4
 def sum_t(self, a, axis, out):
     if len(a.shape) < 3 and (axis == 0 or axis == 1):
         cumisc.sum(a, axis=axis, out=out)
     elif axis is None:
         cumisc.sum(a.reshape((a.size, 1)), axis=0, out=out)
     else:
         raise NotImplementedError
Example #5
 def sum_t(self, a, axis, out):
     if len(a.shape) < 3 and (axis == 0 or axis == 1):
         cumisc.sum(a, axis, out)
     elif axis is None:
         self.copy_to(cumisc.sum(a), out)
     else:
         raise NotImplementedError
Example #6
def meanUnderMask(volume, mask=None, p=1, gpu=False):
    """
    meanUnderMask: Determines the mean value under a mask
    @param volume: The volume
    @type volume:  L{pytom_volume.vol}
    @param mask:  The mask
    @type mask:  L{pytom_volume.vol}
    @param p: precomputed number of voxels in mask
    @type p: float
    @return: A value (scalar)
    @rtype: single
    @change: support None as mask, FF 08.07.2014
    """

    if mask is None:
        # per the @change note above: with no mask, average over the entire volume
        return sum(volume) / volume.size
    return sum(volume * mask) / sum(mask)
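A hedged NumPy illustration of the same quantity for a binary mask (the GPU variants simply run both sums on the device):

import numpy as np

vol = np.arange(27, dtype=np.float32).reshape(3, 3, 3)
mask = (vol > 10).astype(np.float32)
print((vol * mask).sum() / mask.sum())   # mean of the voxels where mask == 1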
Example #7
    def __init__(self, volume, template, mask, wedge, stdV, gpu=True):
        self.volume = gu.to_gpu(volume)

        self.template = Volume(template)
        self.templatePadded = gu.zeros_like(self.volume, dtype=np.float32)

        self.mask = Volume(mask)
        self.maskPadded = gu.zeros_like(self.volume, dtype=np.float32)
        self.sOrg = mask.shape
        self.sPad = volume.shape
        print(self.sPad, self.sOrg)
        rotate(self.mask, [0, 0, 0], self.maskPadded, self.sPad, self.sOrg)
        #paste_in_center_gpu(self.template.d_data, self.templatePadded, np.int32(self.sPad), np.int32(self.maskSize), block=(10, 10, 10), grid=(8,1,1))
        #rotate(self.template, [0, 0, 0], self.templatePadded, self.sPad, self.maskSize)
        print(volume.shape, stdV.shape, wedge.shape)
        self.wedge = gu.to_gpu(wedge)
        self.stdV = gu.to_gpu(stdV)

        self.fwd_plan = Plan(volume.shape, volume.dtype, np.complex64)
        self.inv_plan = Plan(volume.shape, np.complex64, volume.dtype)

        self.volume_fft = gu.zeros_like(self.volume, dtype=np.complex64)
        self.template_fft = gu.zeros_like(self.volume, dtype=np.complex64)

        self.ccc_map = gu.zeros_like(self.volume, dtype=np.float32)
        self.norm_volume = np.prod(volume.shape)

        self.scores = gu.ones_like(self.volume, dtype=np.float32) * -1000
        self.angles = gu.ones_like(self.volume, dtype=np.float32) * -1000
        self.p = sum(self.mask.d_data)
Example #8
def fast_matmul(x, y, x_type, y_type):
    '''
    use pycuda / scikit-cuda to compute c = x @ y
    '''
    linalg.init()
    a_gpu = gpuarray.to_gpu(x.astype(x_type))
    a_t_gpu = gpuarray.to_gpu(x.T.copy().astype(x_type))
    b_gpu = gpuarray.to_gpu(y.astype(y_type))
    # row_sum = gpuarray.zeros(shape = x[0].shape, dtype = x_type)
    row_sum = 0
    # a = np.asarray(x, x_type)
    # b = np.asarray(y, y_type)
    # a_gpu = gpuarray.to_gpu(a)
    # b_gpu = gpuarray.to_gpu(b)

    t1_inside = time.time()
    c_gpu = linalg.dot(a_gpu, b_gpu)
    for a_i in a_gpu:
        # row_sum = misc.add(row_sum, a_i)
        row_sum += a_i
        gg = linalg.dot(a_gpu, b_gpu)
        gg = linalg.dot(a_i, a_i)
        gg = reduce(linalg.dot, (a_gpu, b_gpu, b_gpu, b_gpu))
        # tmp1, tmp2 = linalg.dot(a_gpu, b_gpu), linalg.dot(b_gpu, b_gpu)
        z_gpu = a_gpu.copy()
    tmp = a_t_gpu
    # print('x.T\n', x.T)
    # print('tmp\n', tmp)
    # print('x = a_gpu: ', np.allclose(x, a_gpu.get()))
    # print('x.T = tmp: ', np.allclose(x.T, tmp.get()))

    a_prod = linalg.dot(a_gpu, tmp)
    t2_inside = time.time()
    print('inside cost {:.4f}s'.format(t2_inside - t1_inside))

    a = np.random.randint(-5, 5, (3, 4)).astype(np.float32)
    a_gpu = gpuarray.to_gpu(a)
    norm_gpu = linalg.norm(a_gpu)
    print('is norm right?', np.linalg.norm(a) == norm_gpu)
    a_gpu = abs(a_gpu)
    column_sum = misc.sum(a_gpu, axis=0)
    column_sum = column_sum.reshape((1, -1))
    all_one_gpu = gpuarray.to_gpu(np.ones((3, 1), np.float32))
    div_mat_gpu = linalg.dot(all_one_gpu, column_sum)

    norm_1 = a_gpu / (div_mat_gpu + 1e-3)

    print(a_gpu)
    print(column_sum)
    print(column_sum.shape)
    print(norm_1)
    # abs_a = a_gpu.__abs__()
    # print(a)
    # print(abs_a)
    # c = abs_a + a_gpu
    # print(repr(c))
    # print(type(c))
    # c = 1/2 * c
    # print(a_gpu, c)
    return c_gpu.get(), a_prod.get(), row_sum.get()
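A hedged, minimal version of the core operation the function above demonstrates, a GPU matrix product via scikit-cuda's linalg.dot:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg

linalg.init()
x = np.random.rand(4, 3).astype(np.float32)
y = np.random.rand(3, 5).astype(np.float32)
c_gpu = linalg.dot(gpuarray.to_gpu(x), gpuarray.to_gpu(y))   # c = x @ y on the device
print(np.allclose(c_gpu.get(), x @ y, atol=1e-5))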
Example #9
def log_loss(y_true, y_prob):
    """Compute Logistic loss for classification.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.

    y_prob : array-like of float, shape = (n_samples, n_classes)
        Predicted probabilities, as returned by a classifier's
        predict_proba method.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    if y_prob.dtype == np.float64:
        cuClip(y_prob.gpudata,
               np.float64(1e-10),
               np.float64(1 - 1e-10),
               np.int32(y_prob.size),
               block=(blockSize, 1, 1),
               grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    else:
        cuClipf(y_prob.gpudata,
                np.float32(1e-10),
                np.float32(1 - 1e-10),
                np.int32(y_prob.size),
                block=(blockSize, 1, 1),
                grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))

    if y_prob.shape[1] == 1:
        y_prob = gpuarray.to_gpu(
            np.append(1 - y_prob.get(), y_prob.get(), axis=1))

    if y_true.shape[1] == 1:
        y_true = gpuarray.to_gpu(
            np.append(1 - y_true.get(), y_true.get(), axis=1))

    tmp_gpu = gpuarray.GPUArray(y_prob.shape, y_prob.dtype)
    if y_prob.dtype == np.float64:
        cuLogLoss(y_true.gpudata,
                  y_prob.gpudata,
                  tmp_gpu.gpudata,
                  np.int32(y_prob.size),
                  block=(blockSize, 1, 1),
                  grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    else:
        cuLogLossf(y_true.gpudata,
                   y_prob.gpudata,
                   tmp_gpu.gpudata,
                   np.int32(y_prob.size),
                   block=(blockSize, 1, 1),
                   grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    #total = float(misc.sum(y_true * tmp_gpu).get())
    total = float(cumisc.sum(tmp_gpu).get())
    return (-total) / y_prob.shape[0]
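For reference, a hedged NumPy version of the steps the cuClip/cuLogLoss kernels presumably perform on the GPU (clip, expand single-column binary inputs, average cross-entropy):

import numpy as np

def log_loss_cpu(y_true, y_prob, eps=1e-10):
    y_prob = np.clip(y_prob, eps, 1 - eps)
    if y_prob.shape[1] == 1:
        y_prob = np.append(1 - y_prob, y_prob, axis=1)
    if y_true.shape[1] == 1:
        y_true = np.append(1 - y_true, y_true, axis=1)
    return -np.sum(y_true * np.log(y_prob)) / y_prob.shape[0]

y = np.array([[0, 1], [1, 0]], dtype=np.float64)
p = np.array([[0.2, 0.8], [0.7, 0.3]], dtype=np.float64)
print(log_loss_cpu(y, p))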
Example #10
def softmax_gpu2d(x: gpuarray.GPUArray, dim):
    assert len(x.shape) == 2, 'expected 2-dimension array'
    assert 0 <= dim <= 1, "expected 0 <= dim <=1"
    exp_ker = exp_float_ker if x.dtype == np.float32 else exp_double_ker
    x_exp = gpuarray.empty_like(x)
    exp_ker(x, x_exp)
    x_exp_sum = misc.sum(x_gpu=x_exp, axis=dim)
    x_exp = misc.div_matvec(x_gpu=x_exp, a_gpu=x_exp_sum, axis=1 - dim)
    return x_exp
Example #11
def binary_log_loss(y_true, y_prob):
    """Compute binary logistic loss for classification.

    This is identical to log_loss in binary classification case,
    but is kept for its use in multilabel case.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.

    y_prob : array-like of float, shape = (n_samples, n_classes)
        Predicted probabilities, as returned by a classifier's
        predict_proba method.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    if y_prob.dtype == np.float64:
        cuClip(y_prob.gpudata,
               np.float64(1e-10),
               np.float64(1 - 1e-10),
               np.int32(y_prob.size),
               block=(blockSize, 1, 1),
               grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    else:
        cuClipf(y_prob.gpudata,
                np.float32(1e-10),
                np.float32(1 - 1e-10),
                np.int32(y_prob.size),
                block=(blockSize, 1, 1),
                grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))

    tmp_gpu = gpuarray.GPUArray(y_prob.shape, y_prob.dtype)
    if y_prob.dtype == np.float64:
        cuBinaryLogLoss(y_true.gpudata,
                        y_prob.gpudata,
                        tmp_gpu.gpudata,
                        np.int32(y_prob.size),
                        block=(blockSize, 1, 1),
                        grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))
    else:
        cuBinaryLogLossf(y_true.gpudata,
                         y_prob.gpudata,
                         tmp_gpu.gpudata,
                         np.int32(y_prob.size),
                         block=(blockSize, 1, 1),
                         grid=(int((y_prob.size - 1) / blockSize + 1), 1, 1))

    total = float(cumisc.sum(tmp_gpu).get())
    return (-total) / y_prob.shape[0]
Example #12
 def _impl_test_sum(self, dtype):
     x = np.random.normal(scale=5.0, size=(3, 5))
     x = x.astype(dtype=dtype, order='C')
     x_gpu = gpuarray.to_gpu(x)
     assert_allclose(misc.sum(x_gpu).get(),
                     x.sum(),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     assert_allclose(misc.sum(x_gpu, axis=0).get(),
                     x.sum(axis=0),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     assert_allclose(misc.sum(x_gpu, axis=1).get(),
                     x.sum(axis=1),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     x = x.astype(dtype=dtype, order='F')
     x_gpu = gpuarray.to_gpu(x)
     assert_allclose(misc.sum(x_gpu).get(),
                     x.sum(),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     assert_allclose(misc.sum(x_gpu, axis=0).get(),
                     x.sum(axis=0),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     assert_allclose(misc.sum(x_gpu, axis=1).get(),
                     x.sum(axis=1),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
Example #13
 def marginilize_rots_scales(self, posteriors, phases, shift_x, shift_y):
     shift_ind = self.ravel_shift_index(shift_x, shift_y)
     W = np.zeros((self.n_images, self.converter.get_num_prolates()), np.complex64)
     if config.is_use_gpu:
         W_gpu = gpuarray.zeros(W.shape, dtype='complex64')
         for i in np.arange(self.n_images):
             Wi = misc.sum(linalg.dot(posteriors[i, shift_ind], phases), axis=0).reshape((1,-1))
             slice_assign_kernel.slice_assign_1d(W_gpu, Wi, i)
         W = W_gpu.get()
     else:
         for i in np.arange(self.n_images):
             W[i] = np.sum(np.dot(posteriors[i, shift_ind], phases), axis=0)
     return W
Example #14
    def _cuda_norm(self, X):
        """Caluclate L2-norm on gpu.

        Parameters
        ----------
        X: array
            Array to normalize
        Returns
        -------
        normX: array
            Normalized array

        """
        return misc.divide(X, misc.sum(X**2, axis=1, keepdims=True)**0.5)
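A hedged NumPy equivalent of the row-wise L2 normalization above (sum of squares with keepdims, square root, elementwise divide):

import numpy as np

X = np.random.rand(4, 3).astype(np.float32)
normX = X / np.sqrt((X**2).sum(axis=1, keepdims=True))   # divide each row by its L2 norm
print(np.linalg.norm(normX, axis=1))                     # every row norm is ~1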
Example #15
 def impl_test_sum(self, dtype):
     x = np.random.normal(scale=5.0, size=(3, 5))
     x = x.astype(dtype=dtype, order='C')
     x_gpu = gpuarray.to_gpu(x)
     assert np.allclose(misc.sum(x_gpu).get(), x.sum())
     assert np.allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0))
     assert np.allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1))
     x = x.astype(dtype=dtype, order='F')
     x_gpu = gpuarray.to_gpu(x)
     assert np.allclose(misc.sum(x_gpu).get(), x.sum())
     assert np.allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0))
     assert np.allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1))
Example #16
 def impl_test_sum(self, dtype):
     x = np.random.normal(scale=5.0, size=(3, 5))
     x = x.astype(dtype=dtype, order='C')
     x_gpu = gpuarray.to_gpu(x)
     assert np.allclose(misc.sum(x_gpu).get(), x.sum())
     assert np.allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0))
     assert np.allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1))
     x = x.astype(dtype=dtype, order='F')
     x_gpu = gpuarray.to_gpu(x)
     assert np.allclose(misc.sum(x_gpu).get(), x.sum())
     assert np.allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0))
     assert np.allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1))
Example #17
    def get_distances_to_centers(self, data):

        # make sure the array is c order
        data = np.asarray(data, dtype=np.float32, order='C')

        # ship to gpu
        data_gpu = gpuarray.to_gpu(data)

        # alloc space on gpu for distances
        dists_shape = (data.shape[0], self.centers.shape[0])
        dists_gpu = gpuarray.zeros(dists_shape, np.float32)

        # calc data norms on gpu
        data_norms = cumisc.sum(data_gpu**2, axis=1)

        # calc distance on gpu
        cumisc.add_matvec(dists_gpu, self.center_norms, 1, dists_gpu)
        cumisc.add_matvec(dists_gpu, data_norms, 0, dists_gpu)
        culinalg.add_dot(data_gpu, self.centers_gpu,
            dists_gpu, transb='T', alpha=-2.0)
        return dists_gpu
Example #18
    def run_gpu(self):
        """
        Solves the MFTIE on GPU. The result is stored on an attribute
        "self.phase" containing a GPU array.
        """
        # Extract pre-allocated GPU arrays
        # extract inputs
        iIo = self.iIo
        nm2 = self.inverse_laplacian
        dzI = self.dzI
        ky, kx = self.k
        Nz, Ny, Nx = self.shape

        # create outputs
        ft_dzI = gpuarray.empty((Nz, Ny, Nx), np.complex64)
        gradx = gpuarray.empty((Nz, Ny, Nx), np.complex64)
        grady = gpuarray.empty((Nz, Ny, Nx), np.complex64)

        # extract plans
        ft3dcc = self.pft3dcc
        ft2dcc = self.pft2dcc

        # Do the math!
        # FT(dzI)
        cu_fft.fft(dzI, ft_dzI, ft3dcc)
        # IFT(k*nm2*...)
        cu_fft.ifft((ft_dzI * nm2) * kx, gradx, ft3dcc, True)
        cu_fft.ifft((ft_dzI * nm2) * ky, grady, ft3dcc, True)
        # FT(... / Io)
        cu_fft.fft(gradx * iIo, gradx, ft3dcc)
        cu_fft.fft(grady * iIo, grady, ft3dcc)
        # Sum_z(nm2*(k*...))
        Slapl = misc.sum(
            (nm2 * (kx * gradx + ky * grady)).reshape(Nz, Ny * Nx),
            0).reshape(Ny, Nx)
        # IFT(...)
        cu_fft.ifft(Slapl, self.phdata, ft2dcc, True)
Example #19
 def thunk():
     alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
     x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
     x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
     Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
     Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
     Xtn = misc.sum(Xt, axis=1, keepdims=True)
     Xfn = misc.sum(Xf, axis=1, keepdims=True)
     Xt = misc.divide(Xt, Xtn)
     Xf = misc.divide(Xf, Xfn)
     w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
     wp = cumath.log(w)
     wpn = misc.sum(wp, axis=1, keepdims=True) / self.n
     wp = misc.subtract(wp, wpn)
     t1 = misc.sum(x * wp, axis=1)
     t2 = (self.n + depth) * cumath.log(misc.sum(w, axis=1))
     t3 = depth * wpn
     outputs[0][0] = misc.sum(t1 - t2 + t3).get()
     for v in node.outputs:
         compute_map[v][0] = True
Example #20
 def _impl_test_sum(self, dtype):
     x = np.random.normal(scale=5.0, size=(3, 5))
     x = x.astype(dtype=dtype, order='C')
     x_gpu = gpuarray.to_gpu(x)
     assert_allclose(misc.sum(x_gpu).get(), x.sum(),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     assert_allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     assert_allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     x = x.astype(dtype=dtype, order='F')
     x_gpu = gpuarray.to_gpu(x)
     assert_allclose(misc.sum(x_gpu).get(), x.sum(),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     assert_allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
     assert_allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1),
                     rtol=dtype_to_rtol[dtype],
                     atol=dtype_to_atol[dtype])
Example #21
        def thunk():
            alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
            x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
            x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
            Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
            Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
            Xtn = misc.sum(Xt, axis=1, keepdims=True)
            Xfn = misc.sum(Xf, axis=1, keepdims=True)
            Xt = misc.divide(Xt, Xtn)
            Xf = misc.divide(Xf, Xfn)
            w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
            dq = Xt - Xf
            qdw = dq / w
            t1 = misc.sum(x * qdw, axis=1)
            f = 2 * depth + self.base.n
            t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
            t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
            dalpha = t1 - t2 + t3
            del dq, t1, f, t2, t3

            iw = 1 / w
            S1 = misc.multiply(
                depth[:, None] * (self.base.n - 1) / self.base.n, iw)
            S2 = (self.base.n + depth[:, None]) / cumath.log(
                misc.sum(w, axis=1, keepdims=True))
            F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
            del w, iw, S1, S2

            cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]),
                                  dtype=theano.config.floatX)
            dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
            dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)
            for i in range(Xt.shape[0]):
                S1 = misc.multiply(Xt[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
                dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
                S1 = misc.multiply(Xf[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
                dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
            outputs[0][0] = dalpha.get()
            outputs[1][0] = dLq_t.get()
            outputs[2][0] = dLq_f.get()
            for v in node.outputs:
                compute_map[v][0] = True
Example #22
    def almLasso_mat_fun(self):
        '''
        This function implements the Augmented Lagrangian Multipliers (ADMM) method for the Lasso problem.
        The Lagrangian form of the Lasso can be written as follows:

        MIN{ 1/2 ||Y - X*BHETA||_2^2 + lambda*||THETA||_1 }   s.t.  BHETA - THETA = 0

        When applied to this problem, the ADMM updates take the form

        BHETA^(t+1) = (X^T X + rho*I)^-1 (X^T Y + rho*THETA^t - mu^t)
        THETA^(t+1) = Shrinkage_{lambda/rho}(BHETA^(t+1) + mu^t / rho)
        mu^(t+1)    = mu^t + rho*(BHETA^(t+1) - THETA^(t+1))

        The algorithm involves a 'ridge regression' update for BHETA, a soft-thresholding (shrinkage) step
        for THETA, and a simple linear update for mu.

        NB: this ADMM variant actually uses two penalty parameters (mu1, mu2) instead of a single one.
        '''

        print('\tADMM processing...')

        alpha1 = alpha2 = 0
        if (len(self.reg_params) == 1):
            alpha1 = self.reg_params[0]
            alpha2 = self.reg_params[0]
        elif (len(self.reg_params) == 2):
            alpha1 = self.reg_params[0]
            alpha2 = self.reg_params[1]

        #thresholds parameters for stopping criteria
        if (len(self.thr) == 1):
            thr1 = self.thr[0]
            thr2 = self.thr[0]
        elif (len(self.thr) == 2):
            thr1 = self.thr[0]
            thr2 = self.thr[1]

        # entry condition
        err1 = 10 * thr1
        err2 = 10 * thr2

        start_time = time.time()

        # setting penalty parameters for the ALM
        mu1p = alpha1 * 1 / self.computeLambda()
        print("\t\t-Compute Lambda- Time = %s seconds" %
              (time.time() - start_time))
        mu2p = alpha2 * 1

        mu1 = mu1p
        mu2 = mu2p

        i = 1
        start_time = time.time()
        if self.GPU == True:

            # defining the penalty parameters and the constraint to minimize: the lambda and C matrices, respectively
            THETA = misc.zeros((self.num_columns, self.num_columns),
                               dtype='float64')
            lambda2 = misc.zeros((self.num_columns, self.num_columns),
                                 dtype='float64')

            gpu_data = gpuarray.to_gpu(self.data)
            P_GPU = linalg.dot(gpu_data, gpu_data, transa='T')

            OP1 = P_GPU
            linalg.scale(np.float32(mu1), OP1)

            OP2 = linalg.eye(self.num_columns)
            linalg.scale(mu2, OP2)

            if self.affine == True:

                print('\t\tGPU affine...')

                OP3 = misc.ones((self.num_columns, self.num_columns),
                                dtype='float64')
                linalg.scale(mu2, OP3)
                lambda3 = misc.zeros((1, self.num_columns), dtype='float64')

                # TODO: scikit-cuda's linalg.inv is problematic here, so we fall back to numpy's np.linalg.inv
                A = np.linalg.inv(
                    misc.add(misc.add(OP1.get(), OP2.get()), OP3.get()))

                A_GPU = gpuarray.to_gpu(A)

                while ((err1 > thr1 or err2 > thr1) and i < self.max_iter):

                    _lambda2 = gpuarray.to_gpu(lambda2)
                    _lambda3 = gpuarray.to_gpu(lambda3)

                    linalg.scale(1 / mu2, _lambda2)
                    term_OP2 = gpuarray.to_gpu(_lambda2.get())

                    OP2 = gpuarray.to_gpu(misc.subtract(THETA, term_OP2))
                    linalg.scale(mu2, OP2)

                    OP4 = gpuarray.to_gpu(
                        np.matlib.repmat(_lambda3.get(), self.num_columns, 1))

                    # updating Z
                    BHETA = linalg.dot(
                        A_GPU, misc.add(misc.add(misc.add(OP1, OP2), OP3),
                                        OP4))

                    # deallocating unnecessary GPU variables
                    OP2.gpudata.free()
                    OP4.gpudata.free()
                    _lambda2.gpudata.free()
                    _lambda3.gpudata.free()

                    # updating C
                    THETA = misc.add(BHETA, term_OP2)
                    THETA = self.shrinkL1Lq(THETA.get(), 1 / mu2)
                    THETA = THETA.astype('float64')

                    # updating Lagrange multipliers
                    term_lambda2 = misc.subtract(BHETA, gpuarray.to_gpu(THETA))

                    linalg.scale(mu2, term_lambda2)
                    term_lambda2 = gpuarray.to_gpu(term_lambda2.get())
                    lambda2 = misc.add(lambda2, term_lambda2)  # on GPU

                    term_lambda3 = misc.subtract(
                        misc.ones((1, self.num_columns), dtype='float64'),
                        misc.sum(BHETA, axis=0))
                    linalg.scale(mu2, term_lambda3)
                    term_lambda3 = gpuarray.to_gpu(term_lambda3.get())
                    lambda3 = misc.add(lambda3, term_lambda3)  # on GPU

                    # deallocating unnecessary GPU variables
                    term_OP2.gpudata.free()
                    term_lambda2.gpudata.free()
                    term_lambda3.gpudata.free()

                    err1 = self.errorCoef(BHETA.get(), THETA)
                    err2 = self.errorCoef(np.sum(BHETA.get(), axis=0),
                                          np.ones([1, self.num_columns]))

                    # deallocating unnecessary GPU variables
                    BHETA.gpudata.free()

                    THETA = gpuarray.to_gpu((THETA))

                    # reporting errors
                    if (self.verbose and (i % self.step == 0)):
                        print(
                            '\t\tIteration = %d, ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e'
                            % (i, err1, err2))
                    i += 1

                THETA = THETA.get()

                Err = [err1, err2]
                if (self.verbose):
                    print(
                        '\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e. \n'
                        % (i, err1, err2))

            else:
                print('\t\tGPU not affine')

                # TODO: scikit-cuda's linalg.inv is problematic here, so we fall back to numpy's np.linalg.inv
                A = np.linalg.inv(misc.add(OP1.get(), OP2.get()))
                A_GPU = gpuarray.to_gpu(A)

                while (err1 > thr1 and i < self.max_iter):

                    _lambda2 = gpuarray.to_gpu(lambda2)

                    term_OP2 = THETA
                    linalg.scale(mu2, term_OP2)

                    term_OP2 = misc.subtract(term_OP2, _lambda2)

                    OP2 = gpuarray.to_gpu(term_OP2.get())

                    BHETA = linalg.dot(A_GPU, misc.add(OP1, OP2))

                    linalg.scale(1 / mu2, _lambda2)
                    term_THETA = gpuarray.to_gpu(_lambda2.get())

                    THETA = misc.add(BHETA, term_THETA)
                    THETA = self.shrinkL1Lq(THETA.get(), 1 / mu2)

                    THETA = THETA.astype('float32')

                    # updating Lagrange multipliers
                    term_lambda2 = misc.subtract(BHETA, gpuarray.to_gpu(THETA))
                    linalg.scale(mu2, term_lambda2)
                    term_lambda2 = gpuarray.to_gpu(term_lambda2.get())
                    lambda2 = misc.add(lambda2, term_lambda2)  # on GPU

                    err1 = self.errorCoef(BHETA.get(), THETA)

                    THETA = gpuarray.to_gpu((THETA))

                    # reporting errors
                    if (self.verbose and (i % self.step == 0)):
                        print('\t\tIteration %5.0f, ||Z - C|| = %2.5e' %
                              (i, err1))
                    i += 1

                THETA = THETA.get()
                Err = [err1, err2]
                if (self.verbose):
                    print(
                        '\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e'
                        % (i, err1))

        else:  #CPU version

            # defining the penalty parameters and the constraint to minimize: the lambda and C matrices, respectively
            THETA = np.zeros([self.num_columns, self.num_columns])
            lambda2 = np.zeros([self.num_columns, self.num_columns])

            P = self.data.T.dot(self.data)
            OP1 = np.multiply(P, mu1)

            if self.affine == True:

                # INITIALIZATION
                lambda3 = np.zeros(self.num_columns).T

                A = np.linalg.inv(
                    np.multiply(mu1, P) +
                    np.multiply(mu2, np.eye(self.num_columns, dtype=int)) +
                    np.multiply(mu2,
                                np.ones([self.num_columns, self.num_columns])))

                OP3 = np.multiply(
                    mu2, np.ones([self.num_columns, self.num_columns]))

                while ((err1 > thr1 or err2 > thr1) and i < self.max_iter):

                    # updating Bheta
                    OP2 = np.multiply(THETA - np.divide(lambda2, mu2), mu2)
                    OP4 = np.matlib.repmat(lambda3, self.num_columns, 1)
                    BHETA = A.dot(OP1 + OP2 + OP3 + OP4)

                    # updating C
                    THETA = BHETA + np.divide(lambda2, mu2)
                    THETA = self.shrinkL1Lq(THETA, 1 / mu2)

                    # updating Lagrange multipliers
                    lambda2 = lambda2 + np.multiply(mu2, BHETA - THETA)
                    lambda3 = lambda3 + np.multiply(
                        mu2,
                        np.ones([1, self.num_columns]) - np.sum(BHETA, axis=0))

                    err1 = self.errorCoef(BHETA, THETA)
                    err2 = self.errorCoef(np.sum(BHETA, axis=0),
                                          np.ones([1, self.num_columns]))

                    # mu1 = min(mu1 * (1 + 10 ^ -5), 10 ^ 2 * mu1p);
                    # mu2 = min(mu2 * (1 + 10 ^ -5), 10 ^ 2 * mu2p);

                    # reporting errors
                    if (self.verbose and (i % self.step == 0)):
                        print(
                            '\t\tIteration = %d, ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e'
                            % (i, err1, err2))
                    i += 1

                Err = [err1, err2]

                if (self.verbose):
                    print(
                        '\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e. \n'
                        % (i, err1, err2))
            else:
                print('\t\tCPU not affine')

                A = np.linalg.inv(
                    OP1 +
                    np.multiply(mu2, np.eye(self.num_columns, dtype=int)))

                while (err1 > thr1 and i < self.max_iter):

                    # updating Z
                    OP2 = np.multiply(mu2, THETA) - lambda2
                    BHETA = A.dot(OP1 + OP2)

                    # updating C
                    THETA = BHETA + np.divide(lambda2, mu2)
                    THETA = self.shrinkL1Lq(THETA, 1 / mu2)

                    # updating Lagrange multipliers
                    lambda2 = lambda2 + np.multiply(mu2, BHETA - THETA)

                    # computing errors
                    err1 = self.errorCoef(BHETA, THETA)

                    # reporting errors
                    if (self.verbose and (i % self.step == 0)):
                        print('\t\tIteration %5.0f, ||Z - C|| = %2.5e' %
                              (i, err1))
                    i += 1

                Err = [err1, err2]
                if (self.verbose):
                    print(
                        '\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e'
                        % (i, err1))

        print("\t\t-ADMM- Time = %s seconds" % (time.time() - start_time))

        return THETA, Err
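The shrinkage step referenced in the docstring (shrinkL1Lq in the code) reduces, in the plain elementwise L1 case, to soft-thresholding; a hedged NumPy sketch:

import numpy as np

def soft_threshold(X, tau):
    # elementwise L1 shrinkage: S_tau(x) = sign(x) * max(|x| - tau, 0)
    return np.sign(X) * np.maximum(np.abs(X) - tau, 0.0)

print(soft_threshold(np.array([-2.0, -0.3, 0.1, 1.5]), 0.5))   # -> [-1.5 -0.  0.  1. ]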
Example #23
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy as np
from skcuda import misc

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

a = np.arange(100000, dtype=np.float32)
b = np.array(rolling_window(a, 5))
misc.init()
dest_gpu = gpuarray.to_gpu(b)
c = misc.sum(dest_gpu, axis=1)
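A quick host-side check of the result above against NumPy (hedged follow-up; b and c are the arrays defined just above):

c_cpu = b.sum(axis=1)
print(np.allclose(c.get(), c_cpu))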


Example #24
 def sum_diagonals(self, d_arr, d_out):
     self.d_diags.fill(0)
     self._kern_args[0] = d_arr.gpudata
     self.extract_diags_kernel(*self._kern_args, grid=self._grid, block=self._blocks)
     skmisc.sum(self.d_diags, axis=1, out=d_out)
Example #25
    def __init__(self, centers):
        culinalg.init()

        self.centers = centers.astype(np.float32)
        self.centers_gpu = gpuarray.to_gpu(self.centers)
        self.center_norms = cumisc.sum(self.centers_gpu**2, axis=1)
Example #26
def demosaick_gpu(img):
    img = gp.to_gpu(img)
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    wA = p1x.shape[0]
    wB = p2x.shape[0]
    hA = p1x.shape[1]
    hB = p2x.shape[1]

    # Path 1
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    cm.exp(p1y, out=p1y)

    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2
    shI = [wB - 1, hB - 1, _numsel]
    shM = [(wB - 1) * (hB - 1), _numsel]
    p2x = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2x = lg.dot(pTT, _wts.c200)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c201, p2x)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c210, p2x)
    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c211, p2x)
    msc.add_matvec(p2x, _wts.c2b, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(shI)

    # conv 3
    shI = [wB - 2, hB - 2, _numsel]
    shM = [(wB - 2) * (hB - 2), _numsel]
    p2y = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2y = lg.dot(pTT, _wts.c300)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c301, p2y)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c310, p2y)
    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c311, p2y)
    msc.add_matvec(p2y, _wts.c3b, out=p2y)
    gp.maximum(p2y, 0., p2y)

    p2x = lg.dot(p2y, _wts.sout)

    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(p1x.shape)

    # Combine
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])

    im = p2im(p1.get())

    return im