def test_times_rdd(self):
    """
    Multiply one distributed matrix by another and check the product

    The expected 3 x 3 result is the transpose of the first matrix times
    the second. The product is computed twice, once without a method
    argument and once with "accum", and both must match.
    """
    left_rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    right_rows = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
    mat1 = RowMatrix(self.sc.parallelize(left_rows, 2))
    mat2 = RowMatrix(self.sc.parallelize(right_rows, 2))
    expected = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])

    # both products are computed before anything is asserted
    products = (mat1.times(mat2), mat1.times(mat2, "accum"))
    for product in products:
        assert array_equal(product, expected)
def test_times_rdd(self):
    """
    Check RowMatrix-by-RowMatrix multiplication against a known answer

    Both the call without a method argument and the "accum" call must
    return the same 3 x 3 array.
    """
    n_partitions = 2
    first_rdd = self.sc.parallelize(
        [(1, array([1, 2, 3])), (2, array([4, 5, 6]))], n_partitions)
    second_rdd = self.sc.parallelize(
        [(1, array([7, 8, 9])), (2, array([10, 11, 12]))], n_partitions)
    mat1 = RowMatrix(first_rdd)
    mat2 = RowMatrix(second_rdd)

    truth = array([[47, 52, 57],
                   [64, 71, 78],
                   [81, 90, 99]])

    result_default = mat1.times(mat2)
    result_accum = mat1.times(mat2, "accum")

    assert array_equal(result_default, truth)
    assert array_equal(result_accum, truth)
def test_times_array(self):
    """
    Multiply a distributed matrix by a local array and check the rows

    Each row of the 2 x 3 matrix is multiplied by the 3 x 2 array, so the
    collected result is two rows of length 2.
    """
    rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    mat1 = RowMatrix(self.sc.parallelize(rows))
    mat2 = array([[7, 8], [9, 10], [11, 12]])
    expected = [array([58, 64]), array([139, 154])]

    product = mat1.times(mat2)
    assert array_equal(product.collect(), expected)
def test_times_array(self):
    """
    Check RowMatrix-by-array multiplication against a known answer
    """
    distributed = self.sc.parallelize(
        [(1, array([1, 2, 3])), (2, array([4, 5, 6]))])
    mat1 = RowMatrix(distributed)
    mat2 = array([[7, 8],
                  [9, 10],
                  [11, 12]])

    # one expected output row per input row
    truth = [array([58, 64]), array([139, 154])]

    collected = mat1.times(mat2).collect()
    assert array_equal(collected, truth)
def calc(self, mat):
    """
    Calculate singular vectors

    Computes the top self.k singular values and vectors using the
    algorithm named by self.method and stores them on the instance:
    u (the RDD backing the data projected onto v and scaled by 1 / s),
    s (singular values) and v (right singular vectors, one per row).
    If self.method is neither "direct" nor "em", nothing is computed
    or stored.

    Parameters
    ----------
    mat : RDD of (tuple, array) pairs, or RowMatrix
        Matrix to compute singular vectors from

    Returns
    -------
    self : returns an instance of self.
    """
    if not isinstance(mat, RowMatrix):
        mat = RowMatrix(mat)

    if self.method == "direct":

        # get the normalized gramian matrix
        cov = mat.gramian() / mat.nrows

        # do a local eigendecomposition, keeping the k largest eigenvalues
        eigw, eigv = eigh(cov)
        top = argsort(eigw)[::-1][:self.k]
        s = sqrt(eigw[top]) * sqrt(mat.nrows)
        v = eigv[:, top].T

        # project back into data, normalize by singular values
        u = mat.times(v.T / s)

        self.u = u.rdd
        self.s = s
        self.v = v

    elif self.method == "em":

        # initialize random matrix; c is stored as (k x ncols), i.e. the
        # transpose of the c that appears in the update formulas below
        c = random.rand(self.k, mat.ncols)
        iteration = 0
        error = 100

        # iterative update subspace using expectation maximization
        # e-step: x = (c'c)^-1 c' y
        # m-step: c = y x' (xx')^-1
        while iteration < self.maxiter and error > self.tol:
            c_old = c

            # pre compute c' (cc')^-1 for the stored (k x ncols) c
            c_inv = dot(c.T, inv(dot(c, c.T)))

            # compute (xx')^-1 through a map reduce
            xx = mat.times(c_inv).gramian()
            xx_inv = inv(xx)

            # pre compute c' (cc')^-1 (xx')^-1 and ship it to the workers
            premult2 = mat.rdd.context.broadcast(dot(c_inv, xx_inv))

            # compute the new c through a map reduce
            c = mat.rows().map(
                lambda x: outer(x, dot(x, premult2.value))).sum()
            c = c.T

            # total squared change in c since the previous iteration
            error = ((c - c_old) ** 2).sum()
            iteration += 1

        # project data into subspace spanned by columns of c
        # use standard eigendecomposition to recover an orthonormal basis
        c = orth(c.T)
        cov = mat.times(c).gramian() / mat.nrows
        eigw, eigv = eigh(cov)
        top = argsort(eigw)[::-1][:self.k]
        s = sqrt(eigw[top]) * sqrt(mat.nrows)

        # map the eigenvectors from the subspace back to the data space
        v = dot(eigv[:, top].T, c.T)
        u = mat.times(v.T / s)

        self.u = u.rdd
        self.s = s
        self.v = v

    return self
def fit(self, data):
    """
    Fit independent components using an iterative fixed-point algorithm

    The data are first reduced to self.k dimensions with an SVD and
    whitened; self.c components are then extracted by a fixed-point
    iteration that stops after self.maxiter iterations or once the
    change between successive estimates falls within self.tol.
    On return the instance holds w (un-mixing matrix), a (mixing
    matrix) and sigs (RDD of the extracted components). If self.k is
    None it is set to the data dimensionality.

    Parameters
    ----------
    data : RDD of (tuple, array) pairs, or RowMatrix
        Data to estimate independent components from

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If self.c is greater than self.k, or self.k is greater than
        the data dimensionality.
    """
    if not isinstance(data, RowMatrix):
        data = RowMatrix(data)

    # read the dimensionality from the matrix itself so the check does not
    # depend on the record layout returned by first() for either input type
    d = data.ncols

    if self.k is None:
        self.k = d

    if self.c > self.k:
        raise ValueError("number of independent comps " + str(self.c) +
                         " must be less than the number of principal comps " + str(self.k))

    if self.k > d:
        raise ValueError("number of principal comps " + str(self.k) +
                         " must be less than the data dimensionality " + str(d))

    # reduce dimensionality
    svd = SVD(k=self.k, method=self.svdmethod).calc(data)

    # whiten data
    scale = diag(svd.s / sqrt(data.nrows))
    whtmat = real(dot(inv(scale), svd.v))
    unwhtmat = real(dot(transpose(svd.v), scale))
    wht = data.times(whtmat.T)

    # do multiple independent component extraction
    # (a seed of 0 leaves the random state untouched)
    if self.seed != 0:
        random.seed(self.seed)
    b = orth(random.randn(self.k, self.c))
    b_old = zeros((self.k, self.c))
    iteration = 0
    minabscos = 0

    while iteration < self.maxiter and (1 - minabscos) > self.tol:
        iteration += 1

        # update rule for pow3 non-linearity (TODO: add others)
        # the lambda reads the current b; sum() runs the job immediately,
        # before b is rebound to the result
        b = wht.rows().map(
            lambda x: outer(x, dot(x, b) ** 3)).sum() / wht.nrows - 3 * b

        # make orthogonal
        b = dot(b, real(sqrtm(inv(dot(transpose(b), b)))))

        # evaluate error: smallest |cosine| between matching columns of the
        # new and previous estimates
        minabscos = min(abs(diag(dot(transpose(b), b_old))))

        # store results
        b_old = b

    # get un-mixing matrix
    w = dot(b.T, whtmat)

    # get mixing matrix
    a = dot(unwhtmat, b)

    # get components
    sigs = data.times(w.T).rdd

    self.w = w
    self.a = a
    self.sigs = sigs

    return self
def calc(self, mat):
    """
    Calculate singular vectors

    Computes the top self.k singular values and vectors using the
    algorithm named by self.method and stores them on the instance:
    u (the RDD backing the data projected onto v and scaled by 1 / s),
    s (singular values) and v (right singular vectors, one per row).
    If self.method is neither "direct" nor "em", nothing is computed
    or stored.

    Parameters
    ----------
    mat : RDD of (tuple, array) pairs, or RowMatrix
        Matrix to compute singular vectors from

    Returns
    -------
    self : returns an instance of self.
    """
    if not isinstance(mat, RowMatrix):
        mat = RowMatrix(mat)

    if self.method == "direct":

        # get the normalized gramian matrix
        cov = mat.gramian() / mat.nrows

        # do a local eigendecomposition, keeping the k largest eigenvalues
        eigw, eigv = eigh(cov)
        top = argsort(eigw)[::-1][:self.k]
        s = sqrt(eigw[top]) * sqrt(mat.nrows)
        v = eigv[:, top].T

        # project back into data, normalize by singular values
        u = mat.times(v.T / s)

        self.u = u.rdd
        self.s = s
        self.v = v

    elif self.method == "em":

        # initialize random matrix; c is stored as (k x ncols), i.e. the
        # transpose of the c that appears in the update formulas below
        c = random.rand(self.k, mat.ncols)
        iteration = 0
        error = 100

        # iterative update subspace using expectation maximization
        # e-step: x = (c'c)^-1 c' y
        # m-step: c = y x' (xx')^-1
        while iteration < self.maxiter and error > self.tol:
            c_old = c

            # pre compute c' (cc')^-1 for the stored (k x ncols) c
            c_inv = dot(c.T, inv(dot(c, c.T)))

            # compute (xx')^-1 through a map reduce
            xx = mat.times(c_inv).gramian()
            xx_inv = inv(xx)

            # pre compute c' (cc')^-1 (xx')^-1 and ship it to the workers
            premult2 = mat.rdd.context.broadcast(dot(c_inv, xx_inv))

            # compute the new c through a map reduce
            c = mat.rows().map(
                lambda x: outer(x, dot(x, premult2.value))).sum()
            c = c.T

            # total squared change in c since the previous iteration
            error = ((c - c_old) ** 2).sum()
            iteration += 1

        # project data into subspace spanned by columns of c
        # use standard eigendecomposition to recover an orthonormal basis
        c = orth(c.T)
        cov = mat.times(c).gramian() / mat.nrows
        eigw, eigv = eigh(cov)
        top = argsort(eigw)[::-1][:self.k]
        s = sqrt(eigw[top]) * sqrt(mat.nrows)

        # map the eigenvectors from the subspace back to the data space
        v = dot(eigv[:, top].T, c.T)
        u = mat.times(v.T / s)

        self.u = u.rdd
        self.s = s
        self.v = v

    return self