Example #1
0
def regresswithpca(data, modelfile, regressmode, k=2):
    """Perform univariate regression,
    followed by principal components analysis
    to reduce dimensionality

    :param data: RDD of data points as key value pairs
    :param modelfile: model parameters (string with file location, array, or tuple)
    :param regressmode: form of regression ("linear" or "bilinear")
    :param k: number of principal components to compute (default = 2)

    :return stats: statistics of the fit
    :return comps: components from PCA
    :return latent: latent variances from PCA
    :return scores: scores from PCA
    :return traj: trajectories from refitting the raw data against comps
    """
    # create model
    model = RegressionModel.load(modelfile, regressmode)

    # do regression
    betas, stats, resid = model.fit(data)

    # do principal components analysis on the regression coefficients
    # (k was previously hard-coded to 2; default preserves old behavior)
    scores, latent, comps = svd(betas, k)

    # compute trajectories from raw data
    traj = model.fit(data, comps)

    return stats, comps, latent, scores, traj
Example #2
0
def crosscorr(data, sigfile, lag):
    """cross-correlate data points
    (typically time series data)
    against a signal over the specified lags

    arguments:
    data - RDD of data points
    sigfile - signal to correlate with (string with file location or array)
    lag - maximum lag (result will be 2*lag + 1)

    returns:
    betas - cross-correlations at different time lags
    scores, latent, comps - result of applying pca if lag > 0
    """

    # compute cross correlations
    method = SigProcessingMethod.load("crosscorr", sigfile=sigfile, lag=lag)
    betas = method.calc(data)

    # BUG FIX: was "lag is not 0" — identity comparison with an int literal
    # only works by accident of CPython's small-int caching; use != for value
    # comparison
    if lag != 0:
        # do PCA on the lagged correlations
        scores, latent, comps = svd(betas, 2)
        return betas, scores, latent, comps
    else:
        return betas
Example #3
0
def regresswithpca(data, modelfile, regressmode, k=2):
    """Run univariate regression, then reduce the dimensionality of the
    resulting coefficients with principal components analysis.

    :param data: RDD of data points as key value pairs
    :param modelfile: model parameters (string with file location, array, or tuple)
    :param regressmode: form of regression ("linear" or "bilinear")
    :param k: number of principal components to compute

    :return stats: statistics of the fit
    :return comps: components from PCA
    :return latent: latent variances from PCA
    :return scores: scores from PCA
    :return traj: trajectories from refitting the raw data against comps
    """
    # build the regression model from the supplied parameters
    regmodel = RegressionModel.load(modelfile, regressmode)

    # fit the model to every data point
    coeffs, fitstats, residuals = regmodel.fit(data)

    # PCA on the regression coefficients
    pcscores, pclatent, pccomps = svd(coeffs, k)

    # recover trajectories by refitting the raw data against the components
    trajectories = regmodel.fit(data, pccomps)

    return fitstats, pccomps, pclatent, pcscores, trajectories
Example #4
0
def regress(data, modelfile, regressmode, k=2):
    """perform mass univariate regression,
    followed by principal components analysis
    to reduce dimensionality

    arguments:
    data - RDD of data points
    modelfile - model parameters (string with file location, array, or tuple)
    regressmode - form of regression ("linear" or "bilinear")
    k - number of principal components to compute (default = 2)

    returns:
    stats - statistics of the fit
    comps, latent, scores, traj - results of principal components analysis
    """
    # create model
    model = RegressionModel.load(modelfile, regressmode)

    # do regression
    betas, stats, resid = model.fit(data)

    # do principal components analysis on the coefficients
    # (k was previously hard-coded to 2; default preserves old behavior)
    scores, latent, comps = svd(betas, k)

    # compute trajectories from raw data
    traj = model.fit(data, comps)

    return stats, comps, latent, scores, traj
Example #5
0
def crosscorr(data, sigfile, lag):
    """Cross-correlate data points
    (typically time series data)
    against a signal over the specified lags

    :param data: RDD of data points as key value pairs
    :param sigfile: signal to correlate with (string with file location or array)
    :param lag: maximum lag (result will be length 2*lag + 1)

    :return betas: cross-correlations at different time lags
    :return scores: scores from PCA (if lag > 0)
    :return latent: latent variances from PCA (if lag > 0)
    :return comps: components from PCA (if lag > 0)
    """

    # compute cross correlations
    method = SigProcessingMethod.load("crosscorr", sigfile=sigfile, lag=lag)
    betas = method.calc(data)

    # BUG FIX: was "lag is not 0" — identity comparison with an int literal
    # only works by accident of CPython's small-int caching; use != for value
    # comparison
    if lag != 0:
        # do PCA on the lagged correlations
        scores, latent, comps = svd(betas, 2)
        return betas, scores, latent, comps
    else:
        return betas
Example #6
0
def crosscorr(data, sigfile, lag):
    """Cross-correlate data points
    (typically time series data)
    against a signal over the specified lags

    :param data: RDD of data points as key value pairs
    :param sigfile: signal to correlate with (string with file location or array)
    :param lag: maximum lag (result will be length 2*lag + 1)

    :return betas: cross-correlations at different time lags
    :return scores: scores from PCA (if lag > 0)
    :return latent: latent variances from PCA (if lag > 0)
    :return comps: components from PCA (if lag > 0)
    """

    # compute cross correlations
    method = SigProcessingMethod.load("crosscorr", sigfile=sigfile, lag=lag)
    betas = method.calc(data)

    # BUG FIX: was "lag is not 0" — identity comparison with an int literal
    # only works by accident of CPython's small-int caching; use != for value
    # comparison
    if lag != 0:
        # do PCA on the lagged correlations
        scores, latent, comps = svd(betas, 2)
        return betas, scores, latent, comps
    else:
        return betas
Example #7
0
def ica(data, k, c, svdmethod="direct", maxiter=100, tol=0.000001, seed=0):
    """Perform independent components analysis

    :param data: RDD of data points as key value pairs
    :param k: number of principal components to use
    :param c: number of independent components to find
    :param svdmethod: which svd algorithm to use (default = "direct")
    :param maxiter: maximum number of iterations (default = 100)
    :param tol: tolerance for change in estimate (default = 0.000001)
    :param seed: random seed; 0 leaves the global random state untouched

    :return w: the mixing matrix
    :return sigs: the independent components

    TODO: also return unmixing matrix
    """
    # get count
    n = data.count()

    # reduce dimensionality
    scores, latent, comps = svd(data, k, meansubtract=0, method=svdmethod)

    # whiten data
    whtmat = real(dot(inv(diag(latent / sqrt(n))), comps))
    unwhtmat = real(dot(transpose(comps), diag(latent / sqrt(n))))
    wht = data.mapValues(lambda x: dot(whtmat, x))

    # do multiple independent component extraction
    if seed != 0:
        random.seed(seed)
    b = orth(random.randn(k, c))
    b_old = zeros((k, c))
    niter = 0  # renamed from "iter" to avoid shadowing the builtin
    minabscos = 0
    errvec = zeros(maxiter)

    # use logical "and" rather than bitwise "&" for the loop condition
    while (niter < maxiter) and ((1 - minabscos) > tol):
        niter += 1
        # update rule for pow3 non-linearity (TODO: add others)
        # index into the pair instead of a tuple-parameter lambda, which is
        # Python-2-only syntax
        b = wht.map(lambda kv: kv[1]).map(
            lambda x: outer(x,
                            dot(x, b)**3)).sum() / n - 3 * b
        # make orthogonal
        b = dot(b, real(sqrtm(inv(dot(transpose(b), b)))))
        # evaluate error: worst-case cosine between new and old components
        minabscos = min(abs(diag(dot(transpose(b), b_old))))
        # store results
        b_old = b
        errvec[niter - 1] = (1 - minabscos)

    # get un-mixing matrix
    w = dot(transpose(b), whtmat)

    # get components
    sigs = data.mapValues(lambda x: dot(w, x))

    return w, sigs
    def test_svd_direct(self):
        """Direct SVD on a keyed RDD should match numpy's exact SVD up to sign."""
        rows = [array([1.0, 2.0, 6.0]), array([1.0, 3.0, 0.0]),
                array([1.0, 4.0, 6.0]), array([5.0, 1.0, 4.0])]
        rdd = self.sc.parallelize(zip(range(1, 5), rows))

        u, s, v = svd(rdd, 1, meansubtract=0, method="direct")
        u_ref, s_ref, v_ref = LinAlg.svd(array(rows))
        u_got = transpose(array(u.map(lambda kv: kv[1]).collect()))[0]
        v_got = v[0]
        assert allclose(s[0], s_ref[0])
        # singular vectors are defined only up to a sign flip
        assert allclose(v_got, v_ref[0, :]) or allclose(-v_got, v_ref[0, :])
        assert allclose(u_got, u_ref[:, 0]) or allclose(-u_got, u_ref[:, 0])
Example #9
0
def ica(data, k, c, svdmethod="direct", maxiter=100, tol=0.000001, seed=0):
    """perform independent components analysis

    arguments:
    data - RDD of data points
    k - number of principal components to use
    c - number of independent components to find
    svdmethod - which svd algorithm to use (default = "direct")
    maxiter - maximum number of iterations (default = 100)
    tol - tolerance for change in estimate (default = 0.000001)
    seed - random seed; 0 leaves the global random state untouched

    returns:
    w - the mixing matrix
    sigs - the independent components
    """
    # get count
    n = data.count()

    # reduce dimensionality
    scores, latent, comps = svd(data, k, meansubtract=0, method=svdmethod)

    # whiten data
    whtmat = real(dot(inv(diag(latent/sqrt(n))), comps))
    unwhtmat = real(dot(transpose(comps), diag(latent/sqrt(n))))
    wht = data.map(lambda x: dot(whtmat, x))

    # do multiple independent component extraction
    if seed != 0:
        random.seed(seed)
    b = orth(random.randn(k, c))
    b_old = zeros((k, c))
    niter = 0  # renamed from "iter" to avoid shadowing the builtin
    minabscos = 0
    errvec = zeros(maxiter)

    # use logical "and" rather than bitwise "&" for the loop condition
    while (niter < maxiter) and ((1 - minabscos) > tol):
        niter += 1
        # update rule for pow3 non-linearity (TODO: add others)
        b = wht.map(lambda x: outer(x, dot(x, b) ** 3)).sum() / n - 3 * b
        # make orthogonal
        b = dot(b, real(sqrtm(inv(dot(transpose(b), b)))))
        # evaluate error: worst-case cosine between new and old components
        minabscos = min(abs(diag(dot(transpose(b), b_old))))
        # store results
        b_old = b
        errvec[niter-1] = (1 - minabscos)

    # get un-mixing matrix
    w = dot(transpose(b), whtmat)

    # get components
    sigs = data.map(lambda x: dot(w, x))

    return w, sigs
    def test_svd_em(self):
        """EM-based SVD should approximate numpy's exact SVD up to sign."""
        rows = [array([1.0, 2.0, 6.0]), array([1.0, 3.0, 0.0]),
                array([1.0, 4.0, 6.0]), array([5.0, 1.0, 4.0])]
        rdd = self.sc.parallelize(zip(range(1, 5), rows))

        u, s, v = svd(rdd, 1, meansubtract=0, method="em")
        u_ref, s_ref, v_ref = LinAlg.svd(array(rows))
        u_got = transpose(array(u.map(lambda kv: kv[1]).collect()))[0]
        v_got = v[0]
        tol = 10e-04  # allow small error for iterative method
        assert allclose(s[0], s_ref[0], atol=tol)
        # singular vectors are defined only up to a sign flip
        assert allclose(v_got, v_ref[0, :], atol=tol) or allclose(-v_got, v_ref[0, :], atol=tol)
        assert allclose(u_got, u_ref[:, 0], atol=tol) or allclose(-u_got, u_ref[:, 0], atol=tol)
Example #11
0
def pca(data, k, svdmethod="direct"):
    """Principal components analysis via the singular value decomposition.

    :param data: RDD of data points as key value pairs
    :param k: number of principal components to recover
    :param svdmethod: which svd algorithm to use (default = "direct")

    :return scores: the k scores (as RDD)
    :return latent: the latent values
    :return comps: the k principal components (as array)
    """
    # PCA is a thin wrapper around the SVD with mean subtraction disabled;
    # the svd result tuple (scores, latent, comps) is passed through as-is
    return svd(data, k, meansubtract=0, method=svdmethod)
Example #12
0
def pca(data, k, svdmethod="direct"):
    """Principal components analysis, computed with the singular value
    decomposition.

    :param data: RDD of data points as key value pairs
    :param k: number of principal components to recover
    :param svdmethod: which svd algorithm to use (default = "direct")

    :return scores: the k scores (as RDD)
    :return latent: the latent values
    :return comps: the k principal components (as array)
    """
    # delegate to svd with mean subtraction disabled
    pcscores, pclatent, pccomps = svd(data, k, meansubtract=0, method=svdmethod)

    return pcscores, pclatent, pccomps
Example #13
0
    def test_svd_direct(self):
        """Direct SVD on an unkeyed RDD should match numpy's SVD up to sign."""
        mat = array([
            array([1.0, 2.0, 6.0]),
            array([1.0, 3.0, 0.0]),
            array([1.0, 4.0, 6.0]),
            array([5.0, 1.0, 4.0])
        ])
        rdd = self.sc.parallelize(mat)

        u, s, v = svd(rdd, 1, meansubtract=0, method="direct")
        u_ref, s_ref, v_ref = LinAlg.svd(mat)
        u_got = transpose(array(u.collect()))[0]
        v_got = v[0]
        assert allclose(s[0], s_ref[0])
        # singular vectors are defined only up to a sign flip
        assert allclose(v_got, v_ref[0, :]) or allclose(-v_got, v_ref[0, :])
        assert allclose(u_got, u_ref[:, 0]) or allclose(-u_got, u_ref[:, 0])
Example #14
0
    def test_svd_direct(self):
        """Direct SVD on a keyed RDD should agree with numpy's SVD up to sign."""
        rows = [array([1.0, 2.0, 6.0]),
                array([1.0, 3.0, 0.0]),
                array([1.0, 4.0, 6.0]),
                array([5.0, 1.0, 4.0])]
        rdd = self.sc.parallelize(zip(range(1, 5), rows))

        u, s, v = svd(rdd, 1, meansubtract=0, method="direct")
        u_ref, s_ref, v_ref = LinAlg.svd(array(rows))
        u_got = transpose(array(u.map(lambda kv: kv[1]).collect()))[0]
        v_got = v[0]
        assert allclose(s[0], s_ref[0])
        # singular vectors are defined only up to a sign flip
        assert (allclose(v_got, v_ref[0, :])
                or allclose(-v_got, v_ref[0, :]))
        assert (allclose(u_got, u_ref[:, 0])
                or allclose(-u_got, u_ref[:, 0]))
Example #15
0
    def test_svd_em(self):
        """EM-based SVD should approximate numpy's exact SVD up to sign."""
        rows = [array([1.0, 2.0, 6.0]),
                array([1.0, 3.0, 0.0]),
                array([1.0, 4.0, 6.0]),
                array([5.0, 1.0, 4.0])]
        rdd = self.sc.parallelize(zip(range(1, 5), rows))

        u, s, v = svd(rdd, 1, meansubtract=0, method="em")
        u_ref, s_ref, v_ref = LinAlg.svd(array(rows))
        u_got = transpose(array(u.map(lambda kv: kv[1]).collect()))[0]
        v_got = v[0]
        tol = 10e-04  # allow small error for iterative method
        assert allclose(s[0], s_ref[0], atol=tol)
        # singular vectors are defined only up to a sign flip
        assert (allclose(v_got, v_ref[0, :], atol=tol)
                or allclose(-v_got, v_ref[0, :], atol=tol))
        assert (allclose(u_got, u_ref[:, 0], atol=tol)
                or allclose(-u_got, u_ref[:, 0], atol=tol))