def test_times_rdd(self):
     """Check times() between two RowMatrix objects against a known result."""
     left_rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     right_rows = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
     # Both operands are parallelized with 2 as the second argument
     mat1 = RowMatrix(self.sc.parallelize(left_rows, 2))
     mat2 = RowMatrix(self.sc.parallelize(right_rows, 2))
     product = mat1.times(mat2)
     # These values equal transpose(left) . right for the rows above
     expected = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])
     assert array_equal(product, expected)
 def test_elementwise_rdd(self):
     """Check elementwise() with `add` between two RowMatrix objects."""
     left_rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     right_rows = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
     mat1 = RowMatrix(self.sc.parallelize(left_rows, 2))
     mat2 = RowMatrix(self.sc.parallelize(right_rows, 2))
     summed = mat1.elementwise(mat2, add)
     collected = summed.rows().collect()
     # Entry-by-entry sum of the two matrices defined above
     expected = array([[8, 10, 12], [14, 16, 18]])
     assert array_equal(collected, expected)
Example #3
0
    def test_als(self):
        """ Check the alternating least-squares NMF result against
        reference factors computed with MATLAB
        """
        # input matrix, keyed by single-element arrays holding 1..4
        data_local = array([[1.0, 2.0, 6.0], [1.0, 3.0, 0.0], [1.0, 4.0, 6.0],
                            [5.0, 1.0, 4.0]])
        keys = [array([idx]) for idx in range(1, 5)]
        mat = RowMatrix(self.sc.parallelize(zip(keys, data_local)))

        # fixed matrix handed to NMF through its h0 argument
        h0 = array([[0.09082617, 0.85490047, 0.57234593],
                    [0.82766740, 0.21301186, 0.90913979]])

        # reference factors when the rows of h are not normalized on each
        # iteration
        h_ref = array([[0., 0.6010, 0.9163], [0.8970, 0.1556, 0.7423]])
        w_ref = array([[4.5885, 1.5348], [1.3651, 0.2184], [5.9349, 1.0030],
                       [0., 5.5147]])

        # expected factors with each row of h divided by its norm
        # (axis=1 gives one norm per row) and the same scale moved into w
        row_scale = diag(norm(h_ref, axis=1))
        h_true = dot(LinAlg.inv(row_scale), h_ref)
        w_true = dot(w_ref, row_scale)

        # run the Thunder implementation
        # (maxiter=9 corresponds with the Matlab algorithm)
        model = NMF(k=2, method="als", h0=h0, maxiter=9)
        model.fit(mat)
        h_thunder = model.h
        w_thunder = array(model.w.values().collect())

        # the reference values are only given to four decimals
        tol = 1e-03
        assert allclose(w_thunder, w_true, atol=tol)
        assert allclose(h_thunder, h_true, atol=tol)
Example #4
0
    def transform(self, data):
        """Project data into principal component space

        Parameters
        ----------
        data : Series or a subclass (e.g. RowMatrix)
            Data to project, must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        -------
        scores : RowMatrix, nrows, each of shape (k,)
            The scores (i.e. the representation of the data in PC space)

        Raises
        ------
        TypeError
            If data is not a Series or a subclass of it
        """

        # TypeError is the conventional signal for a wrong argument type and
        # is still caught by callers that handle Exception
        if not isinstance(data, Series):
            raise TypeError('Input must be Series or a subclass (e.g. RowMatrix)')

        # Exact type check: anything that is not precisely a RowMatrix
        # (including subclasses of it) is wrapped in a new RowMatrix
        if type(data) is not RowMatrix:
            data = RowMatrix(data)

        # NOTE(review): center(0) is computed on the data passed in here, not
        # on anything stored on self -- confirm that is the intended behavior
        mat = data.center(0)
        # Multiply by the transposed components divided by self.latent
        scores = mat.times(self.comps.T / self.latent)
        return scores
 def test_times_array(self):
     """Check times() between a RowMatrix and a local array."""
     rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     mat1 = RowMatrix(self.sc.parallelize(rows))
     mat2 = array([[7, 8], [9, 10], [11, 12]])
     product = mat1.times(mat2)
     collected = product.rows().collect()
     # Each input row multiplied by the 3 x 2 array
     expected_rows = [array([58, 64]), array([139, 154])]
     assert array_equal(collected, expected_rows)
     # The index of the result is expected to be 0, 1
     assert array_equal(product.index, range(0, 2))
Example #6
0
 def generate(self, k=3, npartitions=10, nrows=100, ncols=10, seed=None):
     """
     Generate the product of two random factors plus random noise

     Parameters
     ----------
     k : int, optional, default = 3
       Inner dimension shared by the two random factors

     npartitions : int, optional, default = 10
       Passed as the second argument to self.sc.parallelize

     nrows : int, optional, default = 100
       Number of rows of the generated matrix

     ncols : int, optional, default = 10
       Number of columns of the generated matrix

     seed : optional, default = None
       Passed to random.seed before any values are drawn

     Returns
     -------
     The data as a RowMatrix, or the tuple (data, u, v) with the two
     random factors when self.returnParams is True
     """
     random.seed(seed)
     # Draw order (left factor, right factor, noise) fixes the output
     # for a given seed
     left = random.randn(nrows, k)
     right = random.randn(k, ncols)
     noisy = dot(left, right)
     noisy += random.randn(*shape(noisy))
     data = RowMatrix(self.sc.parallelize(appendKeys(noisy), npartitions))
     if self.returnParams is not True:
         return data
     return data, left, right
 def test_outer(self):
     """Check gramian() with the default, "accum" and "aggregate" methods."""
     rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     mat1 = RowMatrix(self.sc.parallelize(rows))
     default_result = mat1.gramian()
     accum_result = mat1.gramian("accum")
     aggregate_result = mat1.gramian("aggregate")
     # All three calls are expected to produce the same 3 x 3 matrix
     expected = array([[17, 22, 27], [22, 29, 36], [27, 36, 45]])
     for computed in (default_result, accum_result, aggregate_result):
         assert array_equal(computed, expected)
Example #8
0
 def generate(self, npartitions=10, nrows=100):
     """
     Generate two noisy signals and return a linear mixture of them

     Parameters
     ----------
     npartitions : int, optional, default = 10
       Passed as the second argument to self.sc.parallelize

     nrows : int, optional, default = 100
       Number of samples taken between 0 and 10

     Returns
     -------
     The mixed data as a RowMatrix, or the tuple (data, s, a) with the
     source signals and the mixing matrix when self.returnParams is True
     """
     # The seed is fixed here, so repeated calls draw the same noise
     random.seed(42)
     t = linspace(0, 10, nrows)
     # One column per source: a sinusoid and the sign of a sinusoid
     sources = c_[sin(2 * t), sign(sin(3 * t))]
     sources += 0.2 * random.randn(*sources.shape)  # Add noise
     # Divide each column by its standard deviation
     sources /= sources.std(axis=0)
     mixing = array([[1, 1], [0.5, 2]])
     mixed = dot(sources, mixing.T)
     data = RowMatrix(self.sc.parallelize(appendKeys(mixed), npartitions))
     if self.returnParams is not True:
         return data
     return data, sources, mixing
Example #9
0
    def test_init(self):
        """
        Check the whole NMF fit, including random initialization,
        by bounding the final reconstruction error
        """
        data_local = array([[1.0, 2.0, 6.0], [1.0, 3.0, 0.0], [1.0, 4.0, 6.0],
                            [5.0, 1.0, 4.0]])
        # keys are single-element arrays holding the row position
        nrows = data_local.shape[0]
        keys = [array([pos]) for pos in range(nrows)]
        mat = RowMatrix(self.sc.parallelize(zip(keys, data_local)))

        model = NMF(k=2, recon_hist='final')
        model.fit(mat)

        # Thunder's solution should reach a close-to-optimal reconstruction error:
        # scikit-learn's solution achieves 2.993952
        # matlab's non-deterministic implementation usually achieves < 2.9950 (when it converges)
        assert model.recon_err < 2.9950
Example #10
0
    def generate(self, q=1, p=3, nrows=50, npartitions=10,
                 sigmas=None, seed=None):
        """
        Generate data from a factor analysis model

        Parameters
        ----------
        q : int, optional, default = 1
          The number of factors generating this data

        p : int, optional, default = 3
          The number of observed variables (expected p >= q; not
          checked here)

        nrows : int, optional, default = 50
          Number of observations we have

        npartitions : int, optional, default = 10
          Passed as the second argument to self.sc.parallelize

        sigmas : 1 x p ndarray, optional, default = None
          Scale of the noise to add, randomly generated
          from standard normal distribution if not given

        seed : optional, default = None
          Passed to random.seed before any values are drawn

        Returns
        -------
        The data as a RowMatrix, or the tuple (data, F, w, epsilon)
        when self.returnParams is True
        """
        random.seed(seed)
        # Draw order (factors, weights, sigmas, noise) fixes the output
        # for a given seed
        # Latent factor values, one row per observation (nrows x q)
        factors = matrix(random.randn(nrows, q))
        # Weights mapping the q factors onto the p variables (q x p)
        weights = matrix(random.randn(q, p))
        # Noise scales (1 x p), drawn only when none were supplied
        if sigmas is None:
            sigmas = random.randn(1, p)
        # Noise terms (nrows x p); sigmas is broadcast across the rows
        noise = random.randn(nrows, p) * sigmas
        # Matrix product of the two `matrix` objects plus the noise
        x = (factors * weights) + noise
        # Put the data in an RDD
        keyed = self.appendKeys(x)
        data = RowMatrix(self.sc.parallelize(keyed, npartitions))

        if self.returnParams is not True:
            return data
        return data, factors, weights, noise
Example #11
0
    def test_SvdDirect(self):
        """Check SVD(k=1, method="direct") against LinAlg.svd, up to sign."""
        local_rows = [
            array([1.0, 2.0, 6.0]),
            array([1.0, 3.0, 0.0]),
            array([1.0, 4.0, 6.0]),
            array([5.0, 1.0, 4.0]),
        ]
        keyed = zip(range(1, 5), local_rows)
        mat = RowMatrix(self.sc.parallelize(keyed))

        svd = SVD(k=1, method="direct")
        svd.calc(mat)

        # Reference decomposition of the same data computed locally
        u_ref, s_ref, v_ref = LinAlg.svd(array(local_rows))
        u_collected = array(svd.u.rows().collect())
        u_first = transpose(u_collected)[0]
        v_first = svd.v[0]

        assert allclose(svd.s[0], s_ref[0])
        # Singular vectors are compared with either sign
        assert allclose(v_first, v_ref[0, :]) or allclose(-v_first, v_ref[0, :])
        assert allclose(u_first, u_ref[:, 0]) or allclose(-u_first, u_ref[:, 0])
    def test_SvdEM(self):
        """Check SVD(k=1, method="em") against LinAlg.svd, up to sign."""
        local_rows = [
            array([1.0, 2.0, 6.0]),
            array([1.0, 3.0, 0.0]),
            array([1.0, 4.0, 6.0]),
            array([5.0, 1.0, 4.0]),
        ]
        keyed = zip(range(1, 5), local_rows)
        mat = RowMatrix(self.sc.parallelize(keyed))

        svd = SVD(k=1, method="em")
        svd.calc(mat)

        # Reference decomposition of the same data computed locally
        u_ref, s_ref, v_ref = LinAlg.svd(array(local_rows))
        u_collected = array(svd.u.rows().collect())
        u_first = transpose(u_collected)[0]
        v_first = svd.v[0]

        tol = 1e-03  # allow small error for iterative method
        assert allclose(svd.s[0], s_ref[0], atol=tol)
        # Singular vectors are compared with either sign
        assert (allclose(v_first, v_ref[0, :], atol=tol)
                or allclose(-v_first, v_ref[0, :], atol=tol))
        assert (allclose(u_first, u_ref[:, 0], atol=tol)
                or allclose(-u_first, u_ref[:, 0], atol=tol))
Example #13
0
    def generate(self, nrows=50, ncols=50, npartitions=10, seed=None):
        """
        Generate a matrix where every element is i.i.d. and drawn from a
        standard normal distribution

        Parameters
        ----------
        nrows : int, optional, default = 50
          Number of rows in the generated matrix

        ncols : int, optional, default = 50
          Number of columns in the generated matrix

        npartitions : int, optional, default = 10
          Passed as the second argument to self.sc.parallelize

        seed : optional, default = None
          Passed to random.seed before any values are drawn

        Returns
        -------
        The generated data as a RowMatrix
        """
        random.seed(seed)
        # Draw the values and wrap them in a `matrix`
        values = matrix(random.randn(nrows, ncols))
        # Key the rows and put the data into an RDD
        keyed = self.appendKeys(values)
        return RowMatrix(self.sc.parallelize(keyed, npartitions))
    def test_pca(self):
        """Check PCA(k=1, svdMethod='direct') against scikit-learn, up to sign,
        and check that transform() reproduces the fitted scores.
        """
        local_rows = [
            array([1.0, 1.0, 1.0, 5.0]),
            array([2.0, 3.0, 4.0, 1.0]),
            array([6.0, 0.0, 6.0, 6.0]),
        ]
        keyed = zip(range(1, 4), local_rows)
        mat = RowMatrix(self.sc.parallelize(keyed))

        thunder_pca = PCA(k=1, svdMethod='direct')
        thunder_pca.fit(mat)
        thunder_comps = thunder_pca.comps
        # Scores are multiplied by `latent` before comparing with scikit-learn
        fitted = thunder_pca.scores.collectValuesAsArray()
        thunder_scores = fitted * thunder_pca.latent
        projected = thunder_pca.transform(mat).collectValuesAsArray()
        thunder_transformed = projected * thunder_pca.latent

        from sklearn.decomposition import PCA as skPCA
        sk_pca = skPCA(n_components=1)
        sk_pca.fit(array(local_rows))
        sk_comps = sk_pca.components_
        sk_scores = sk_pca.transform(array(local_rows))

        # Components and scores are compared with either sign
        assert (allclose(thunder_comps, sk_comps)
                or allclose(thunder_comps, -sk_comps))
        assert (allclose(thunder_scores, sk_scores)
                or allclose(thunder_scores, -sk_scores))
        assert allclose(thunder_scores, thunder_transformed)
Example #15
0
 def toRowMatrix(self):
     """
     Return a RowMatrix built from this object's rdd, finalized
     against this object
     """
     # Imported inside the method rather than at module level
     from thunder.rdds.matrices import RowMatrix
     converted = RowMatrix(self.rdd)
     return converted.__finalize__(self)
 def test_elementwise_array(self):
     """Check elementwise() with `add` between a RowMatrix and the scalar 2."""
     mat = RowMatrix(self.sc.parallelize([(1, array([1, 2, 3]))]))
     shifted = mat.elementwise(2, add)
     first_row = shifted.rows().collect()[0]
     assert array_equal(first_row, array([3, 4, 5]))