def test_times_rdd(self):
    """Check RowMatrix.times when the other operand is also a RowMatrix."""
    left_rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    right_rows = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
    left = RowMatrix(self.sc.parallelize(left_rows, 2))
    right = RowMatrix(self.sc.parallelize(right_rows, 2))

    # these values equal transpose(left) . right for the rows above
    expected = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])

    product = left.times(right)
    assert array_equal(product, expected)
def test_elementwise_rdd(self):
    """Check RowMatrix.elementwise with `add` when the other operand is a RowMatrix."""
    left_rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    right_rows = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
    left = RowMatrix(self.sc.parallelize(left_rows, 2))
    right = RowMatrix(self.sc.parallelize(right_rows, 2))

    summed = left.elementwise(right, add)
    collected = summed.rows().collect()

    # row-by-row sums of the two inputs
    expected = array([[8, 10, 12], [14, 16, 18]])
    assert array_equal(collected, expected)
def test_als(self):
    """
    Test accuracy of alternating least-squares NMF algorithm
    against the MATLAB-computed version
    """
    # input matrix with keys 1..4, plus a fixed starting point for h
    data_local = array([[1.0, 2.0, 6.0],
                        [1.0, 3.0, 0.0],
                        [1.0, 4.0, 6.0],
                        [5.0, 1.0, 4.0]])
    keys = [array([i + 1]) for i in range(4)]
    mat = RowMatrix(self.sc.parallelize(zip(keys, data_local)))
    h0 = array([[0.09082617, 0.85490047, 0.57234593],
                [0.82766740, 0.21301186, 0.90913979]])

    # reference factors for the case where the rows of h are NOT
    # normalized on each iteration
    h_ref = array([[0., 0.6010, 0.9163],
                   [0.8970, 0.1556, 0.7423]])
    w_ref = array([[4.5885, 1.5348],
                   [1.3651, 0.2184],
                   [5.9349, 1.0030],
                   [0., 5.5147]])

    # the original note says the current implementation normalizes h, so
    # rescale the reference: divide each row of h by its norm and multiply
    # the matching column of w by it (the product w . h is unchanged)
    row_norms = diag(norm(h_ref, axis=1))
    h_true = dot(LinAlg.inv(row_norms), h_ref)
    w_true = dot(w_ref, row_norms)

    # calculate NMF using the Thunder implementation
    # (maxiter=9 corresponds with Matlab algorithm)
    model = NMF(k=2, method="als", h0=h0, maxiter=9)
    model.fit(mat)
    h_est = model.h
    w_est = array(model.w.values().collect())

    tol = 1e-03  # allow small error
    assert allclose(w_est, w_true, atol=tol)
    assert allclose(h_est, h_true, atol=tol)
def transform(self, data):
    """Project data into principal component space

    Parameters
    ----------
    data : Series or a subclass (e.g. RowMatrix)
        Data to project onto the components, must be a collection of
        key-value pairs where the keys are identifiers and the values are
        one-dimensional arrays

    Returns
    -------
    scores : RowMatrix, nrows, each of shape (k,)
        The scores (i.e. the representation of the data in PC space)

    Raises
    ------
    TypeError
        If data is not a Series or a subclass of it
    """
    # TypeError is a subclass of Exception, so existing `except Exception`
    # handlers still catch this
    if not isinstance(data, Series):
        raise TypeError('Input must be Series or a subclass (e.g. RowMatrix)')

    # exact-type check kept on purpose: anything that is not precisely a
    # RowMatrix (including other Series subclasses) is re-wrapped
    if type(data) is not RowMatrix:
        data = RowMatrix(data)

    # center the data (called with argument 0), then multiply by the
    # transposed components divided by `latent`
    mat = data.center(0)
    scores = mat.times(self.comps.T / self.latent)
    return scores
def test_times_array(self):
    """Check RowMatrix.times when the other operand is a local array."""
    rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    mat = RowMatrix(self.sc.parallelize(rows))
    other = array([[7, 8], [9, 10], [11, 12]])

    # each input row multiplied by `other`
    expected = [array([58, 64]), array([139, 154])]

    product = mat.times(other)
    collected = product.rows().collect()
    assert array_equal(collected, expected)
    # the result's index is expected to be 0, 1
    assert array_equal(product.index, range(0, 2))
def generate(self, k=3, npartitions=10, nrows=100, ncols=10, seed=None):
    """
    Generate a noisy product of two random factor matrices as a RowMatrix

    Parameters
    ----------
    k : int, optional, default = 3
        Inner dimension of the two random factors

    npartitions : int, optional, default = 10
        Passed to parallelize as the number of partitions

    nrows : int, optional, default = 100
        Number of rows in the generated matrix

    ncols : int, optional, default = 10
        Number of columns in the generated matrix

    seed : optional, default = None
        Passed to random.seed

    Returns
    -------
    data, or the tuple (data, u, v) when self.returnParams is True,
    where u is (nrows x k) and v is (k x ncols)
    """
    random.seed(seed)

    # draw the two factors, then the additive standard-normal noise
    u = random.randn(nrows, k)
    v = random.randn(k, ncols)
    a = dot(u, v)
    n_r, n_c = shape(a)
    a += random.randn(n_r, n_c)

    keyed = appendKeys(a)
    data = RowMatrix(self.sc.parallelize(keyed, npartitions))

    if self.returnParams is not True:
        return data
    return data, u, v
def test_outer(self):
    """Check RowMatrix.gramian with its default call and with "accum" and "aggregate"."""
    rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    mat = RowMatrix(self.sc.parallelize(rows))

    # compute every variant up front, then compare each to the same target
    results = [
        mat.gramian(),
        mat.gramian("accum"),
        mat.gramian("aggregate"),
    ]

    # these values equal transpose(mat) . mat for the rows above
    expected = array([[17, 22, 27], [22, 29, 36], [27, 36, 45]])
    for outcome in results:
        assert array_equal(outcome, expected)
def generate(self, npartitions=10, nrows=100):
    """
    Generate a linear mixture of two noisy signals as a RowMatrix

    Parameters
    ----------
    npartitions : int, optional, default = 10
        Passed to parallelize as the number of partitions

    nrows : int, optional, default = 100
        Number of time points (rows) to generate

    Returns
    -------
    data, or the tuple (data, s, a) when self.returnParams is True,
    where s holds the two signals and a is the 2 x 2 mixing array
    """
    random.seed(42)  # fixed seed, so the output is the same on every call

    # two signals over [0, 10]: a sinusoid and a square wave
    t = linspace(0, 10, nrows)
    sources = c_[sin(2 * t), sign(sin(3 * t))]
    sources += 0.2 * random.randn(*sources.shape)  # Add noise
    sources /= sources.std(axis=0)  # unit standard deviation per column

    # mix the signals
    mixing = array([[1, 1], [0.5, 2]])
    observed = dot(sources, mixing.T)

    keyed = appendKeys(observed)
    data = RowMatrix(self.sc.parallelize(keyed, npartitions))

    if self.returnParams is not True:
        return data
    return data, sources, mixing
def test_init(self):
    """
    test performance of whole function, including random initialization
    """
    data_local = array([[1.0, 2.0, 6.0],
                        [1.0, 3.0, 0.0],
                        [1.0, 4.0, 6.0],
                        [5.0, 1.0, 4.0]])
    # one single-element array key per row, counting from 0
    keys = [array([i]) for i in range(data_local.shape[0])]
    mat = RowMatrix(self.sc.parallelize(zip(keys, data_local)))

    model = NMF(k=2, recon_hist='final')
    model.fit(mat)

    # check to see if Thunder's solution achieves close-to-optimal reconstruction error
    # scikit-learn's solution achieves 2.993952
    # matlab's non-deterministic implementation usually achieves < 2.9950 (when it converges)
    assert model.recon_err < 2.9950
def generate(self, q=1, p=3, nrows=50, npartitions=10, sigmas=None, seed=None):
    """
    Generate data from a factor analysis model

    Parameters
    ----------
    q : int, optional, default = 1
        The number of factors generating this data

    p : int, optional, default = 3
        The number of observed factors (p >= q)

    nrows : int, optional, default = 50
        Number of observations we have

    npartitions : int, optional, default = 10
        Passed to parallelize as the number of partitions

    sigmas : 1 x p ndarray, optional, default = None
        Scale of the noise to add, randomly generated
        from standard normal distribution if not given

    seed : optional, default = None
        Passed to random.seed

    Returns
    -------
    data, or the tuple (data, F, w, epsilon) when self.returnParams is True,
    where F is (nrows x q), w is (q x p) and epsilon is (nrows x p)
    """
    random.seed(seed)

    # random (nrows x q) and (q x p) matrices; their matrix product is the
    # noise-free part of the data
    factors = matrix(random.randn(nrows, q))
    weights = matrix(random.randn(q, p))

    # noise scales (1 x p), drawn only when the caller did not supply them
    noise_scale = random.randn(1, p) if sigmas is None else sigmas

    # noise terms (nrows x p), each column scaled by its entry in noise_scale
    noise = random.randn(nrows, p) * noise_scale

    # combine into the observed data (nrows x p)
    observed = (factors * weights) + noise

    # put the data in an RDD
    keyed = self.appendKeys(observed)
    data = RowMatrix(self.sc.parallelize(keyed, npartitions))

    if self.returnParams is not True:
        return data
    return data, factors, weights, noise
def test_SvdDirect(self):
    """Compare the leading component of SVD(method="direct") against LinAlg.svd."""
    dataLocal = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0])
    ]
    mat = RowMatrix(self.sc.parallelize(zip(range(1, 5), dataLocal)))

    svd = SVD(k=1, method="direct")
    svd.calc(mat)

    # reference decomposition computed locally
    uTrue, sTrue, vTrue = LinAlg.svd(array(dataLocal))
    u_ref = uTrue[:, 0]
    v_ref = vTrue[0, :]

    uTest = transpose(array(svd.u.rows().collect()))[0]
    vTest = svd.v[0]

    assert allclose(svd.s[0], sTrue[0])
    # the vectors are accepted with either sign
    assert allclose(vTest, v_ref) or allclose(-vTest, v_ref)
    assert allclose(uTest, u_ref) or allclose(-uTest, u_ref)
def test_SvdEM(self):
    """Compare the leading component of SVD(method="em") against LinAlg.svd, within a tolerance."""
    dataLocal = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0])
    ]
    mat = RowMatrix(self.sc.parallelize(zip(range(1, 5), dataLocal)))

    svd = SVD(k=1, method="em")
    svd.calc(mat)

    # reference decomposition computed locally
    uTrue, sTrue, vTrue = LinAlg.svd(array(dataLocal))
    u_ref = uTrue[:, 0]
    v_ref = vTrue[0, :]

    uTest = transpose(array(svd.u.rows().collect()))[0]
    vTest = svd.v[0]

    tol = 1e-03  # allow small error for iterative method
    assert allclose(svd.s[0], sTrue[0], atol=tol)
    # the vectors are accepted with either sign
    assert allclose(vTest, v_ref, atol=tol) or allclose(-vTest, v_ref, atol=tol)
    assert allclose(uTest, u_ref, atol=tol) or allclose(-uTest, u_ref, atol=tol)
def generate(self, nrows=50, ncols=50, npartitions=10, seed=None):
    """
    Generate a matrix where every element is i.i.d. and drawn from a
    standard normal distribution

    Parameters
    ----------
    nrows : int, optional, default = 50
        Number of rows in the generated matrix

    ncols : int, optional, default = 50
        Number of columns in the generated matrix

    npartitions : int, optional, default = 10
        Passed to parallelize as the number of partitions

    seed : optional, default = None
        Passed to random.seed

    Returns
    -------
    data : RowMatrix built from the generated values
    """
    random.seed(seed)

    # draw the values
    values = matrix(random.randn(nrows, ncols))

    # put the data into an RDD
    keyed = self.appendKeys(values)
    return RowMatrix(self.sc.parallelize(keyed, npartitions))
def test_pca(self):
    """Compare PCA(k=1, svdMethod='direct') with scikit-learn's PCA, and check transform against the fitted scores."""
    dataLocal = [
        array([1.0, 1.0, 1.0, 5.0]),
        array([2.0, 3.0, 4.0, 1.0]),
        array([6.0, 0.0, 6.0, 6.0])
    ]
    mat = RowMatrix(self.sc.parallelize(zip(range(1, 4), dataLocal)))

    thunder_pca = PCA(k=1, svdMethod='direct')
    thunder_pca.fit(mat)
    comps = thunder_pca.comps
    # both score sets are multiplied by `latent` before being compared
    scores = thunder_pca.scores.collectValuesAsArray() * thunder_pca.latent
    transformed = thunder_pca.transform(mat).collectValuesAsArray() * thunder_pca.latent

    # reference solution from scikit-learn
    from sklearn.decomposition import PCA as skPCA
    reference = skPCA(n_components=1)
    local = array(dataLocal)
    reference.fit(local)
    ref_comps = reference.components_
    ref_scores = reference.transform(array(dataLocal))

    # components and scores are accepted with either sign
    assert allclose(comps, ref_comps) or allclose(comps, -ref_comps)
    assert allclose(scores, ref_scores) or allclose(scores, -ref_scores)
    # transform on the fitted data must reproduce the stored scores
    assert allclose(scores, transformed)
def toRowMatrix(self):
    """
    Convert Series to RowMatrix

    Wraps this object's rdd in a RowMatrix and returns the result of
    calling __finalize__(self) on it.
    """
    # imported inside the method; presumably to avoid a circular import
    # between the modules -- TODO confirm
    from thunder.rdds.matrices import RowMatrix

    converted = RowMatrix(self.rdd)
    return converted.__finalize__(self)
def test_elementwise_array(self):
    """Check RowMatrix.elementwise with `add` when the other operand is the scalar 2."""
    mat = RowMatrix(self.sc.parallelize([(1, array([1, 2, 3]))]))

    shifted = mat.elementwise(2, add)
    first_row = shifted.rows().collect()[0]

    assert array_equal(first_row, array([3, 4, 5]))