def fit(self, data):
    """
    Estimate principal components

    The input is converted to a RowMatrix when needed, centered with
    ``center(0)``, and decomposed with an ``SVD`` built from ``self.k``
    and ``self.svdmethod``. The decomposition's ``u``, ``s`` and ``v``
    are stored on the model as ``scores``, ``latent`` and ``comps``.

    Parameters
    ----------
    data : Series or a subclass (e.g. RowMatrix)
        Data to estimate principal components from, must be a collection of
        key-value pairs where the keys are identifiers and the values are
        one-dimensional arrays

    Returns
    ----------
    self : returns an instance of self.
    """
    if not isinstance(data, Series):
        raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

    # exact type comparison: anything that is not precisely a RowMatrix
    # (including a subclass of it) goes through toRowMatrix()
    matrix = data if type(data) is RowMatrix else data.toRowMatrix()
    centered = matrix.center(0)

    # NOTE(review): the attribute is spelled `svdmethod` here while sibling
    # code reads `svdMethod` -- confirm against this class's constructor
    decomposition = SVD(k=self.k, method=self.svdmethod)
    decomposition.calc(centered)

    self.scores = decomposition.u
    self.latent = decomposition.s
    self.comps = decomposition.v

    return self
def fit(self, data):
    """
    Estimate principal components

    Runs an ``SVD`` (configured by ``self.k`` and ``self.svdMethod``) on the
    input after it has been converted to a RowMatrix, if necessary, and
    centered with ``center(0)``. The resulting ``u``, ``s`` and ``v`` are
    kept on the model as ``scores``, ``latent`` and ``comps``.

    Parameters
    ----------
    data : Series or a subclass (e.g. RowMatrix)
        Data to estimate principal components from, must be a collection of
        key-value pairs where the keys are identifiers and the values are
        one-dimensional arrays

    Returns
    ----------
    self : returns an instance of self.
    """
    isSeries = isinstance(data, Series)
    if not isSeries:
        raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

    # the comparison is on the exact type, so subclasses of RowMatrix are
    # also passed through toRowMatrix()
    if type(data) is RowMatrix:
        rowMat = data
    else:
        rowMat = data.toRowMatrix()

    centered = rowMat.center(0)

    factorization = SVD(k=self.k, method=self.svdMethod)
    factorization.calc(centered)

    self.scores = factorization.u
    self.latent = factorization.s
    self.comps = factorization.v

    return self
def test_SvdDirect(self):
    """
    Compare the leading component of SVD(method="direct") on a 4x3
    RowMatrix with the reference factorization from LinAlg.svd

    Singular vectors are only compared up to a sign flip.
    """
    rows = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0]),
    ]

    def sameUpToSign(estimate, reference):
        # a singular vector and its negation are treated as equivalent
        return allclose(estimate, reference) or allclose(-estimate, reference)

    # records are keyed 1..4
    keyed = self.sc.parallelize(zip(range(1, 5), rows))
    mat = RowMatrix(keyed)

    svd = SVD(k=1, method="direct")
    svd.calc(mat)

    uTrue, sTrue, vTrue = LinAlg.svd(array(rows))

    # collect the distributed left singular vectors and take the first one
    collectedU = array(svd.u.rows().collect())
    uTest = transpose(collectedU)[0]
    vTest = svd.v[0]

    assert allclose(svd.s[0], sTrue[0])
    assert sameUpToSign(vTest, vTrue[0, :])
    assert sameUpToSign(uTest, uTrue[:, 0])
def test_SvdDirect(self):
    """
    Direct-method SVD with k=1 should reproduce the first singular value
    and first singular vectors returned by LinAlg.svd for the same data

    The vectors may differ from the reference by an overall sign.
    """
    localRows = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0]),
    ]
    keys = range(1, 5)

    rdd = self.sc.parallelize(zip(keys, localRows))
    svd = SVD(k=1, method="direct")
    svd.calc(RowMatrix(rdd))

    uTrue, sTrue, vTrue = LinAlg.svd(array(localRows))

    # first left singular vector, gathered from the distributed rows
    uTest = transpose(array(svd.u.rows().collect()))[0]
    # first right singular vector
    vTest = svd.v[0]

    vRef = vTrue[0, :]
    uRef = uTrue[:, 0]

    assert allclose(svd.s[0], sTrue[0])
    # accept either sign for each singular vector
    assert allclose(vTest, vRef) or allclose(-vTest, vRef)
    assert allclose(uTest, uRef) or allclose(-uTest, uRef)
def test_conversion(self):
    """
    Smoke test: SVD.calc is called with a plain Series instead of a RowMatrix

    There are no assertions; the test passes as long as the call does not
    raise (presumably calc converts the Series internally -- not visible
    from this test).
    """
    from thunder.rdds.series import Series

    rows = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0]),
    ]
    # records are keyed 1..4
    rdd = self.sc.parallelize(zip(range(1, 5), rows))
    series = Series(rdd)

    svd = SVD(k=1, method='direct')
    svd.calc(series)
def test_SvdEM(self):
    """
    Compare the leading component of SVD(method="em") on a 4x3 RowMatrix
    with the reference factorization from LinAlg.svd

    Comparisons use an absolute tolerance because the method is iterative,
    and singular vectors are only compared up to a sign flip.
    """
    rows = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0]),
    ]

    # records are keyed 1..4
    keyed = self.sc.parallelize(zip(range(1, 5), rows))
    mat = RowMatrix(keyed)

    svd = SVD(k=1, method="em")
    svd.calc(mat)

    uTrue, sTrue, vTrue = LinAlg.svd(array(rows))

    collectedU = array(svd.u.rows().collect())
    uTest = transpose(collectedU)[0]
    vTest = svd.v[0]

    # allow small error for iterative method (10e-04 is 1e-3)
    tol = 10e-04

    def close(estimate, reference):
        return allclose(estimate, reference, atol=tol)

    assert close(svd.s[0], sTrue[0])
    assert close(vTest, vTrue[0, :]) or close(-vTest, vTrue[0, :])
    assert close(uTest, uTrue[:, 0]) or close(-uTest, uTrue[:, 0])
def fit(self, data):
    """
    Fit independent components using an iterative fixed-point algorithm

    The data are first reduced to ``self.k`` dimensions with an SVD and
    whitened, then ``self.c`` components are extracted with a fixed-point
    update (pow3 non-linearity) followed by a symmetric orthogonalization.
    Iteration stops after ``self.maxIter`` updates or once
    ``1 - min(|diag(b' * bOld)|)`` is no larger than ``self.tol``.

    Side effects: if ``self.k`` is None it is set to the number of columns
    of the data, and if ``self.seed`` is non-zero numpy's global random
    generator is seeded with it.

    Parameters
    ----------
    data : Series or a subclass (e.g. RowMatrix)
        Data to estimate independent components from, must be a collection of
        key-value pairs where the keys are identifiers and the values are
        one-dimensional arrays

    Returns
    ----------
    self : returns an instance of self.
        The un-mixing matrix is stored in ``self.w``, the mixing matrix in
        ``self.a`` and the data multiplied by the transposed un-mixing
        matrix in ``self.sigs``.

    Raises
    ----------
    Exception
        If data is not a Series, if ``self.c`` exceeds ``self.k``, or if
        ``self.k`` exceeds the number of columns of the data.
    """
    from numpy import random, sqrt, zeros, real, dot, outer, diag, transpose
    from scipy.linalg import sqrtm, inv, orth

    if not isinstance(data, Series):
        raise Exception(
            'Input must be Series or a subclass (e.g. RowMatrix)')

    if not isinstance(data, RowMatrix):
        data = data.toRowMatrix()

    d = data.ncols

    if self.k is None:
        self.k = d

    if self.c > self.k:
        raise Exception(
            "number of independent comps " + str(self.c) +
            " must be less than the number of principal comps " + str(self.k))

    if self.k > d:
        raise Exception(
            "number of principal comps " + str(self.k) +
            " must be less than the data dimensionality " + str(d))

    # reduce dimensionality
    svd = SVD(k=self.k, method=self.svdMethod).calc(data)

    # whiten data; the diagonal scaling is shared by the whitening matrix
    # and its inverse, so build it once
    scale = diag(svd.s / sqrt(data.nrows))
    whtMat = real(dot(inv(scale), svd.v))
    unWhtMat = real(dot(transpose(svd.v), scale))
    wht = data.times(whtMat.T)

    # do multiple independent component extraction
    if self.seed != 0:
        random.seed(self.seed)
    b = orth(random.randn(self.k, self.c))
    bOld = zeros((self.k, self.c))
    niter = 0
    minAbsCos = 0

    # logical `and` (not bitwise `&`): stop at the iteration cap or once the
    # update direction has converged
    while niter < self.maxIter and (1 - minAbsCos) > self.tol:
        niter += 1
        # update rule for pow3 non-linearity (TODO: add others)
        b = wht.rows().map(
            lambda x: outer(x, dot(x, b)**3)).sum() / wht.nrows - 3 * b
        # make orthogonal
        b = dot(b, real(sqrtm(inv(dot(transpose(b), b)))))
        # evaluate error: smallest absolute diagonal entry of b' * bOld
        minAbsCos = min(abs(diag(dot(transpose(b), bOld))))
        # keep the current estimate for the next convergence check
        bOld = b

    # get un-mixing matrix
    w = dot(b.T, whtMat)

    # get mixing matrix
    a = dot(unWhtMat, b)

    # get components
    sigs = data.times(w.T)

    self.w = w
    self.a = a
    self.sigs = sigs

    return self