Ejemplo n.º 1
0
    def fit(self, data):
        """Estimate principal components.

        Validates the input, converts it to a RowMatrix when necessary,
        applies ``center(0)`` and runs an SVD with ``self.k`` components using
        the method named by ``self.svdmethod``.

        Parameters
        ----------
        data : Series or a subclass (e.g. RowMatrix)
            Data to estimate principal components from, must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        -------
        self : returns an instance of self, with the SVD factors stored as
            ``scores`` (svd.u), ``latent`` (svd.s) and ``comps`` (svd.v).

        Raises
        ------
        Exception
            If ``data`` is not a Series or a subclass of it.
        """

        if not isinstance(data, Series):
            raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

        # isinstance (rather than an exact type comparison) lets RowMatrix
        # subclasses through without a redundant conversion
        if not isinstance(data, RowMatrix):
            data = data.toRowMatrix()

        # NOTE(review): center(0) presumably removes the mean along axis 0
        # before the decomposition -- confirm against RowMatrix.center
        mat = data.center(0)

        svd = SVD(k=self.k, method=self.svdmethod)
        svd.calc(mat)

        self.scores = svd.u
        self.latent = svd.s
        self.comps = svd.v

        return self
Ejemplo n.º 2
0
    def fit(self, data):
        """Estimate principal components.

        Validates the input, converts it to a RowMatrix when necessary,
        applies ``center(0)`` and runs an SVD with ``self.k`` components using
        the method named by ``self.svdMethod``.

        Parameters
        ----------
        data : Series or a subclass (e.g. RowMatrix)
            Data to estimate principal components from, must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        -------
        self : returns an instance of self, with the SVD factors stored as
            ``scores`` (svd.u), ``latent`` (svd.s) and ``comps`` (svd.v).

        Raises
        ------
        Exception
            If ``data`` is not a Series or a subclass of it.
        """

        if not isinstance(data, Series):
            raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

        # isinstance (rather than an exact type comparison) lets RowMatrix
        # subclasses through without a redundant conversion
        if not isinstance(data, RowMatrix):
            data = data.toRowMatrix()

        # NOTE(review): center(0) presumably removes the mean along axis 0
        # before the decomposition -- confirm against RowMatrix.center
        mat = data.center(0)

        svd = SVD(k=self.k, method=self.svdMethod)
        svd.calc(mat)

        self.scores = svd.u
        self.latent = svd.s
        self.comps = svd.v

        return self
Ejemplo n.º 3
0
    def test_SvdDirect(self):
        """Check the "direct" SVD method against a locally computed SVD.

        Only the first singular value and the first pair of singular vectors
        are compared (k=1); each vector is accepted with either sign.
        """
        rows = [
            array([1.0, 2.0, 6.0]),
            array([1.0, 3.0, 0.0]),
            array([1.0, 4.0, 6.0]),
            array([5.0, 1.0, 4.0]),
        ]
        keyed = self.sc.parallelize(zip(range(1, 5), rows))
        mat = RowMatrix(keyed)

        svd = SVD(k=1, method="direct")
        svd.calc(mat)

        # reference decomposition of the same rows, computed locally
        uRef, sRef, vRef = LinAlg.svd(array(rows))

        uCollected = array(svd.u.rows().collect())
        uFirst = transpose(uCollected)[0]
        vFirst = svd.v[0]

        assert allclose(svd.s[0], sRef[0])

        # a singular vector and its negation are both valid, so accept either
        vSame = allclose(vFirst, vRef[0, :])
        vFlipped = allclose(-vFirst, vRef[0, :])
        assert vSame or vFlipped

        uSame = allclose(uFirst, uRef[:, 0])
        uFlipped = allclose(-uFirst, uRef[:, 0])
        assert uSame or uFlipped
Ejemplo n.º 4
0
    def test_SvdDirect(self):
        """Compare SVD(method="direct") with a reference SVD of the same rows.

        With k=1 only the leading singular value and vectors are checked, and
        the vectors are compared up to sign.
        """

        def sameUpToSign(candidate, reference):
            # both orientations are always evaluated, then combined
            direct = allclose(candidate, reference)
            negated = allclose(-candidate, reference)
            return direct | negated

        localRows = [
            array([1.0, 2.0, 6.0]),
            array([1.0, 3.0, 0.0]),
            array([1.0, 4.0, 6.0]),
            array([5.0, 1.0, 4.0]),
        ]
        rdd = self.sc.parallelize(zip(range(1, 5), localRows))
        mat = RowMatrix(rdd)

        svd = SVD(k=1, method="direct")
        svd.calc(mat)

        # locally computed factors used as ground truth
        uExpected, sExpected, vExpected = LinAlg.svd(array(localRows))

        uActual = transpose(array(svd.u.rows().collect()))[0]
        vActual = svd.v[0]

        assert allclose(svd.s[0], sExpected[0])
        assert sameUpToSign(vActual, vExpected[0, :])
        assert sameUpToSign(uActual, uExpected[:, 0])
Ejemplo n.º 5
0
 def test_conversion(self):
     """Check that SVD.calc accepts a plain Series as input.

     Nothing is asserted about the result; the test passes as long as the
     call completes without raising.
     """
     from thunder.rdds.series import Series
     rows = [
         array([1.0, 2.0, 6.0]),
         array([1.0, 3.0, 0.0]),
         array([1.0, 4.0, 6.0]),
         array([5.0, 1.0, 4.0]),
     ]
     keyed = self.sc.parallelize(zip(range(1, 5), rows))
     series = Series(keyed)
     svd = SVD(k=1, method='direct')
     svd.calc(series)
Ejemplo n.º 6
0
    def test_SvdEM(self):
        """Check the "em" SVD method against a locally computed SVD.

        Only the first singular value and the first pair of singular vectors
        are compared (k=1), using an absolute tolerance and accepting each
        vector with either sign.
        """
        rows = [
            array([1.0, 2.0, 6.0]),
            array([1.0, 3.0, 0.0]),
            array([1.0, 4.0, 6.0]),
            array([5.0, 1.0, 4.0]),
        ]
        keyed = self.sc.parallelize(zip(range(1, 5), rows))
        mat = RowMatrix(keyed)

        svd = SVD(k=1, method="em")
        svd.calc(mat)

        # reference decomposition of the same rows, computed locally
        uRef, sRef, vRef = LinAlg.svd(array(rows))

        uCollected = array(svd.u.rows().collect())
        uFirst = transpose(uCollected)[0]
        vFirst = svd.v[0]

        # the iterative method is only expected to agree approximately
        tol = 1e-3

        assert allclose(svd.s[0], sRef[0], atol=tol)

        # a singular vector and its negation are both valid, so accept either
        vSame = allclose(vFirst, vRef[0, :], atol=tol)
        vFlipped = allclose(-vFirst, vRef[0, :], atol=tol)
        assert vSame or vFlipped

        uSame = allclose(uFirst, uRef[:, 0], atol=tol)
        uFlipped = allclose(-uFirst, uRef[:, 0], atol=tol)
        assert uSame or uFlipped
    def test_SvdEM(self):
        """Compare SVD(method="em") with a reference SVD of the same rows.

        With k=1 only the leading singular value and vectors are checked, with
        an absolute tolerance; the vectors are compared up to sign.
        """
        # small absolute error is allowed for the iterative method
        tolerance = 1e-3

        def sameUpToSign(candidate, reference):
            # both orientations are always evaluated, then combined
            direct = allclose(candidate, reference, atol=tolerance)
            negated = allclose(-candidate, reference, atol=tolerance)
            return direct | negated

        localRows = [
            array([1.0, 2.0, 6.0]),
            array([1.0, 3.0, 0.0]),
            array([1.0, 4.0, 6.0]),
            array([5.0, 1.0, 4.0]),
        ]
        rdd = self.sc.parallelize(zip(range(1, 5), localRows))
        mat = RowMatrix(rdd)

        svd = SVD(k=1, method="em")
        svd.calc(mat)

        # locally computed factors used as ground truth
        uExpected, sExpected, vExpected = LinAlg.svd(array(localRows))

        uActual = transpose(array(svd.u.rows().collect()))[0]
        vActual = svd.v[0]

        assert allclose(svd.s[0], sExpected[0], atol=tolerance)
        assert sameUpToSign(vActual, vExpected[0, :])
        assert sameUpToSign(uActual, uExpected[:, 0])
Ejemplo n.º 8
0
    def fit(self, data):
        """
        Fit independent components using an iterative fixed-point algorithm

        The data are reduced to ``self.k`` dimensions with an SVD and whitened.
        ``self.c`` components are then estimated by repeating a cubic (pow3)
        update followed by re-orthogonalization, until ``self.maxIter``
        iterations have run or ``1 - min(abs(diag(b' * bOld)))`` is no longer
        greater than ``self.tol``.

        Parameters
        ----------
        data : Series or a subclass (e.g. RowMatrix)
            Data to estimate independent components from, must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        ----------
        self : returns an instance of self, with the un-mixing matrix stored
            as ``w``, the mixing matrix as ``a`` and the extracted components
            as ``sigs``.

        Raises
        ----------
        Exception
            If ``data`` is not a Series (or subclass), if ``self.c`` exceeds
            ``self.k``, or if ``self.k`` exceeds the number of data columns.
        """

        from numpy import random, sqrt, zeros, real, dot, outer, diag, transpose
        from scipy.linalg import sqrtm, inv, orth

        if not isinstance(data, Series):
            raise Exception(
                'Input must be Series or a subclass (e.g. RowMatrix)')

        if not isinstance(data, RowMatrix):
            data = data.toRowMatrix()

        nDims = data.ncols

        # default to keeping every dimension in the reduction step
        if self.k is None:
            self.k = nDims

        if self.c > self.k:
            message = ("number of independent comps " + str(self.c) +
                       " must be less than the number of principal comps " +
                       str(self.k))
            raise Exception(message)

        if self.k > nDims:
            message = ("number of principal comps " + str(self.k) +
                       " must be less than the data dimensionality " +
                       str(nDims))
            raise Exception(message)

        # reduce dimensionality
        svd = SVD(k=self.k, method=self.svdMethod).calc(data)

        # whitening / un-whitening transforms share the same diagonal scaling
        scaling = diag(svd.s / sqrt(data.nrows))
        whitenMat = real(dot(inv(scaling), svd.v))
        unWhitenMat = real(dot(transpose(svd.v), scaling))
        whitened = data.times(whitenMat.T)

        # a seed of 0 means "do not seed"
        if self.seed != 0:
            random.seed(self.seed)

        # random orthonormal starting point for the component estimates
        basis = orth(random.randn(self.k, self.c))
        prevBasis = zeros((self.k, self.c))
        iteration = 0
        minAbsCos = 0
        errors = zeros(self.maxIter)

        while iteration < self.maxIter and (1 - minAbsCos) > self.tol:
            iteration += 1

            # update rule for pow3 non-linearity (TODO: add others)
            cubicSum = whitened.rows().map(
                lambda x: outer(x, dot(x, basis) ** 3)).sum()
            basis = cubicSum / whitened.nrows - 3 * basis

            # make orthogonal: basis * (basis' * basis)^(-1/2)
            gram = dot(transpose(basis), basis)
            basis = dot(basis, real(sqrtm(inv(gram))))

            # smallest absolute agreement between matching columns of the
            # current and previous estimates
            minAbsCos = min(abs(diag(dot(transpose(basis), prevBasis))))

            # remember this estimate and record the error for this iteration
            prevBasis = basis
            errors[iteration - 1] = (1 - minAbsCos)

        # un-mixing matrix, mixing matrix, and the extracted components
        unmixing = dot(basis.T, whitenMat)
        mixing = dot(unWhitenMat, basis)
        components = data.times(unmixing.T)

        self.w = unmixing
        self.a = mixing
        self.sigs = components

        return self