def regresswithpca(data, modelfile, regressmode): """Perform univariate regression, followed by principal components analysis to reduce dimensionality :param data: RDD of data points as key value pairs :param modelfile: model parameters (string with file location, array, or tuple) :param regressmode: form of regression ("linear" or "bilinear") :return stats: statistics of the fit :return comps: compoents from PCA :return scores: scores from PCA :return latent: latent variances from PCA """ # create model model = RegressionModel.load(modelfile, regressmode) # do regression betas, stats, resid = model.fit(data) # do principal components analysis scores, latent, comps = svd(betas, 2) # compute trajectories from raw data traj = model.fit(data, comps) return stats, comps, latent, scores, traj
def crosscorr(data, sigfile, lag): """cross-correlate data points (typically time series data) against a signal over the specified lags arguments: data - RDD of data points sigfile - signal to correlate with (string with file location or array) lag - maximum lag (result will be 2*lag + 1) returns: betas - cross-correlations at different time lags scores, latent, comps - result of applying pca if lag > 0 """ # compute cross correlations method = SigProcessingMethod.load("crosscorr", sigfile=sigfile, lag=lag) betas = method.calc(data) if lag is not 0: # do PCA scores, latent, comps = svd(betas, 2) return betas, scores, latent, comps else: return betas
def regresswithpca(data, modelfile, regressmode, k=2): """Perform univariate regression, followed by principal components analysis to reduce dimensionality :param data: RDD of data points as key value pairs :param modelfile: model parameters (string with file location, array, or tuple) :param regressmode: form of regression ("linear" or "bilinear") :param k: number of principal components to compute :return stats: statistics of the fit :return comps: compoents from PCA :return scores: scores from PCA :return latent: latent variances from PCA """ # create model model = RegressionModel.load(modelfile, regressmode) # do regression betas, stats, resid = model.fit(data) # do principal components analysis scores, latent, comps = svd(betas, k) # compute trajectories from raw data traj = model.fit(data, comps) return stats, comps, latent, scores, traj
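# Usage sketch for regresswithpca (illustration only, not from the original
# source): assumes a live SparkContext `sc` and a hypothetical text file of
# whitespace-delimited time series, one record per line; records are keyed by
# line index so the function sees (key, array) pairs. The model file path is
# also hypothetical.
def demo_regresswithpca(sc):
    from numpy import array
    raw = sc.textFile("data/series.txt")  # hypothetical path
    data = raw.map(lambda line: array([float(x) for x in line.split()])) \
        .zipWithIndex().map(lambda (v, k): (k, v))
    stats, comps, latent, scores, traj = regresswithpca(
        data, "data/regression_model.txt", "linear", k=2)
    return stats, comps, latent, scores, traj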
def regress(data, modelfile, regressmode): """perform mass univariate regression, followed by principal components analysis to reduce dimensionality arguments: data - RDD of data points modelfile - model parameters (string with file location, array, or tuple) regressmode - form of regression ("linear" or "bilinear") returns: stats - statistics of the fit comps, latent, scores, traj - results of principal components analysis """ # create model model = RegressionModel.load(modelfile, regressmode) # do regression betas, stats, resid = model.fit(data) # do principal components analysis scores, latent, comps = svd(betas, 2) # compute trajectories from raw data traj = model.fit(data, comps) return stats, comps, latent, scores, traj
def crosscorr(data, sigfile, lag): """Cross-correlate data points (typically time series data) against a signal over the specified lags :param data: RDD of data points as key value pairs :param sigfile: signal to correlate with (string with file location or array) :param lag: maximum lag (result will be length 2*lag + 1) :return betas: cross-correlations at different time lags :return scores: scores from PCA (if lag > 0) :return latent: scores from PCA (if lag > 0) :return comps: components from PCA (if lag > 0) """ # compute cross correlations method = SigProcessingMethod.load("crosscorr", sigfile=sigfile, lag=lag) betas = method.calc(data) if lag is not 0: # do PCA scores, latent, comps = svd(betas, 2) return betas, scores, latent, comps else: return betas
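# Usage sketch for crosscorr (illustration only): assumes keyed (key, array)
# data as above and a hypothetical signal file; with lag=2 each betas vector
# has length 2*2 + 1 = 5, with index 2 corresponding to zero lag.
def demo_crosscorr(data):
    betas, scores, latent, comps = crosscorr(data, "data/signal.txt", 2)
    # lag (relative to zero) at which each record correlates most strongly
    peaklags = betas.mapValues(lambda b: b.argmax() - 2)
    return peaklags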
def ica(data, k, c, svdmethod="direct", maxiter=100, tol=0.000001, seed=0): """Perform independent components analysis :param: data: RDD of data points :param k: number of principal components to use :param c: number of independent components to find :param maxiter: maximum number of iterations (default = 100) :param: tol: tolerance for change in estimate (default = 0.000001) :return w: the mixing matrix :return: sigs: the independent components TODO: also return unmixing matrix """ # get count n = data.count() # reduce dimensionality scores, latent, comps = svd(data, k, meansubtract=0, method=svdmethod) # whiten data whtmat = real(dot(inv(diag(latent / sqrt(n))), comps)) unwhtmat = real(dot(transpose(comps), diag(latent / sqrt(n)))) wht = data.mapValues(lambda x: dot(whtmat, x)) # do multiple independent component extraction if seed != 0: random.seed(seed) b = orth(random.randn(k, c)) b_old = zeros((k, c)) iter = 0 minabscos = 0 errvec = zeros(maxiter) while (iter < maxiter) & ((1 - minabscos) > tol): iter += 1 # update rule for pow3 non-linearity (TODO: add others) b = wht.map(lambda (_, v): v).map( lambda x: outer(x, dot(x, b)**3)).sum() / n - 3 * b # make orthogonal b = dot(b, real(sqrtm(inv(dot(transpose(b), b))))) # evaluate error minabscos = min(abs(diag(dot(transpose(b), b_old)))) # store results b_old = b errvec[iter - 1] = (1 - minabscos) # get un-mixing matrix w = dot(transpose(b), whtmat) # get components sigs = data.mapValues(lambda x: dot(w, x)) return w, sigs
def test_svd_direct(self):
    data_local = [array([1.0, 2.0, 6.0]),
                  array([1.0, 3.0, 0.0]),
                  array([1.0, 4.0, 6.0]),
                  array([5.0, 1.0, 4.0])]
    data = self.sc.parallelize(zip(range(1, 5), data_local))

    u, s, v = svd(data, 1, meansubtract=0, method="direct")
    u_true, s_true, v_true = LinAlg.svd(array(data_local))
    u_test = transpose(array(u.map(lambda (_, v): v).collect()))[0]
    v_test = v[0]
    assert allclose(s[0], s_true[0])
    assert allclose(v_test, v_true[0, :]) | allclose(-v_test, v_true[0, :])
    assert allclose(u_test, u_true[:, 0]) | allclose(-u_test, u_true[:, 0])
def ica(data, k, c, svdmethod="direct", maxiter=100, tol=0.000001, seed=0): """perform independent components analysis arguments: data - RDD of data points k - number of principal components to use c - number of independent components to find maxiter - maximum number of iterations (default = 100) tol - tolerance for change in estimate (default = 0.000001) returns: w - the mixing matrix sigs - the independent components """ # get count n = data.count() # reduce dimensionality scores, latent, comps = svd(data, k, meansubtract=0, method=svdmethod) # whiten data whtmat = real(dot(inv(diag(latent/sqrt(n))), comps)) unwhtmat = real(dot(transpose(comps), diag(latent/sqrt(n)))) wht = data.map(lambda x: dot(whtmat, x)) # do multiple independent component extraction if seed != 0: random.seed(seed) b = orth(random.randn(k, c)) b_old = zeros((k, c)) iter = 0 minabscos = 0 errvec = zeros(maxiter) while (iter < maxiter) & ((1 - minabscos) > tol): iter += 1 # update rule for pow3 non-linearity (TODO: add others) b = wht.map(lambda x: outer(x, dot(x, b) ** 3)).sum() / n - 3 * b # make orthogonal b = dot(b, real(sqrtm(inv(dot(transpose(b), b))))) # evaluate error minabscos = min(abs(diag(dot(transpose(b), b_old)))) # store results b_old = b errvec[iter-1] = (1 - minabscos) # get un-mixing matrix w = dot(transpose(b), whtmat) # get components sigs = data.map(lambda x: dot(w, x)) return w, sigs
def test_svd_em(self):
    data_local = [array([1.0, 2.0, 6.0]),
                  array([1.0, 3.0, 0.0]),
                  array([1.0, 4.0, 6.0]),
                  array([5.0, 1.0, 4.0])]
    data = self.sc.parallelize(zip(range(1, 5), data_local))

    u, s, v = svd(data, 1, meansubtract=0, method="em")
    u_true, s_true, v_true = LinAlg.svd(array(data_local))
    u_test = transpose(array(u.map(lambda (_, v): v).collect()))[0]
    v_test = v[0]
    tol = 10e-04  # allow small error for iterative method
    assert allclose(s[0], s_true[0], atol=tol)
    assert allclose(v_test, v_true[0, :], atol=tol) | allclose(-v_test, v_true[0, :], atol=tol)
    assert allclose(u_test, u_true[:, 0], atol=tol) | allclose(-u_test, u_true[:, 0], atol=tol)
def pca(data, k, svdmethod="direct"): """Perform principal components analysis using the singular value decomposition :param data: RDD of data points as key value pairs :param k: number of principal components to recover :param svdmethod: which svd algorithm to use (default = "direct") :return comps: the k principal components (as array) :return latent: the latent values :return scores: the k scores (as RDD) """ scores, latent, comps = svd(data, k, meansubtract=0, method=svdmethod) return scores, latent, comps
def test_svd_direct(self):
    data_local = array([
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0])
    ])
    data = self.sc.parallelize(data_local)

    u, s, v = svd(data, 1, meansubtract=0, method="direct")
    u_true, s_true, v_true = LinAlg.svd(data_local)
    u_test = transpose(array(u.collect()))[0]
    v_test = v[0]
    assert(allclose(s[0], s_true[0]))
    assert(allclose(v_test, v_true[0, :]) | allclose(-v_test, v_true[0, :]))
    assert(allclose(u_test, u_true[:, 0]) | allclose(-u_test, u_true[:, 0]))
def test_svd_direct(self):
    data_local = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0])
    ]
    data = self.sc.parallelize(zip(range(1, 5), data_local))

    u, s, v = svd(data, 1, meansubtract=0, method="direct")
    u_true, s_true, v_true = LinAlg.svd(array(data_local))
    u_test = transpose(array(u.map(lambda (_, v): v).collect()))[0]
    v_test = v[0]
    assert (allclose(s[0], s_true[0]))
    assert (allclose(v_test, v_true[0, :]) | allclose(-v_test, v_true[0, :]))
    assert (allclose(u_test, u_true[:, 0]) | allclose(-u_test, u_true[:, 0]))
def test_svd_em(self):
    data_local = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0])
    ]
    data = self.sc.parallelize(zip(range(1, 5), data_local))

    u, s, v = svd(data, 1, meansubtract=0, method="em")
    u_true, s_true, v_true = LinAlg.svd(array(data_local))
    u_test = transpose(array(u.map(lambda (_, v): v).collect()))[0]
    v_test = v[0]
    tol = 10e-04  # allow small error for iterative method
    assert (allclose(s[0], s_true[0], atol=tol))
    assert (allclose(v_test, v_true[0, :], atol=tol) | allclose(-v_test, v_true[0, :], atol=tol))
    assert (allclose(u_test, u_true[:, 0], atol=tol) | allclose(-u_test, u_true[:, 0], atol=tol))