Example #1
 def test_pandas(self, close_figures):
     pc = PCA(pd.DataFrame(self.x))
     pc1 = PCA(self.x)
     assert_allclose(pc.factors.values, pc1.factors)
     fig = pc.plot_scree()
     fig = pc.plot_scree(ncomp=10)
     fig = pc.plot_scree(log_scale=False)
     fig = pc.plot_rsquare()
     fig = pc.plot_rsquare(ncomp=5)
     proj = pc.project(2)
     PCA(pd.DataFrame(self.x), ncomp=4, gls=True)
     PCA(pd.DataFrame(self.x), ncomp=4, standardize=False)
Example #2
    def test_gls_and_weights(self):
        assert_raises(ValueError, PCA, self.x, gls=True)
        assert_raises(ValueError, PCA, self.x, weights=np.array([1.0, 1.0]))

        # Pre-standardize to make comparison simple
        x = (self.x - self.x.mean(0))
        x = x / (x**2.0).mean(0)
        pc_gls = PCA(x, ncomp=1, standardize=False, demean=False, gls=True)
        pc = PCA(x, ncomp=1, standardize=False, demean=False)
        errors = x - pc.projection
        var = (errors**2.0).mean(0)
        weights = 1.0 / var
        weights = weights / np.sqrt((weights**2.0).mean())

        assert_allclose(weights, pc_gls.weights)
        assert_equal(x, pc_gls.data)
        assert_equal(x, pc.data)

        pc_weights = PCA(x,
                         ncomp=1,
                         standardize=False,
                         demean=False,
                         weights=weights)

        assert_allclose(weights, pc_weights.weights)
        assert_allclose(np.abs(pc_weights.factors), np.abs(pc_gls.factors))
Example #3
    def test_eig_svd_equiv(self):
        """
        Test leading components since the tail end can differ
        """
        pc_eig = PCA(self.x)
        pc_svd = PCA(self.x, method='svd')

        assert_allclose(pc_eig.projection, pc_svd.projection)
        assert_allclose(np.abs(pc_eig.factors[:, :2]),
                        np.abs(pc_svd.factors[:, :2]))
        assert_allclose(np.abs(pc_eig.coeff[:2, :]),
                        np.abs(pc_svd.coeff[:2, :]))
        assert_allclose(pc_eig.eigenvals, pc_svd.eigenvals)
        assert_allclose(np.abs(pc_eig.eigenvecs[:, :2]),
                        np.abs(pc_svd.eigenvecs[:, :2]))

        pc_svd = PCA(self.x, method='svd', ncomp=2)
        pc_nipals = PCA(self.x, method='nipals', ncomp=2)
        assert_allclose(np.abs(pc_nipals.factors),
                        np.abs(pc_svd.factors),
                        atol=DECIMAL_5)
        assert_allclose(np.abs(pc_nipals.coeff),
                        np.abs(pc_svd.coeff),
                        atol=DECIMAL_5)
        assert_allclose(pc_nipals.eigenvals, pc_svd.eigenvals, atol=DECIMAL_5)
        assert_allclose(np.abs(pc_nipals.eigenvecs),
                        np.abs(pc_svd.eigenvecs),
                        atol=DECIMAL_5)
        # Check data for no changes
        assert_equal(self.x, pc_svd.data)
        # Check data for no changes
        assert_equal(self.x, pc_eig.data)
        # Check data for no changes
        assert_equal(self.x, pc_nipals.data)
Example #4
    def test_wide(self):
        pc = PCA(self.x_wide)
        assert_equal(pc.factors.shape[1], self.x_wide.shape[0])
        assert_equal(pc.eigenvecs.shape[1], min(np.array(self.x_wide.shape)))

        pc = PCA(pd.DataFrame(self.x_wide))
        assert_equal(pc.factors.shape[1], self.x_wide.shape[0])
        assert_equal(pc.eigenvecs.shape[1], min(np.array(self.x_wide.shape)))
Example #5
    def test_rsquare(self):
        x = self.x + 0.0
        mu = x.mean(0)
        x_demean = x - mu
        std = np.std(x, 0)
        x_std = x_demean / std

        pc = PCA(self.x)
        nvar = x.shape[1]
        rsquare = np.zeros(nvar + 1)
        tss = np.sum(x_std**2)
        for i in range(nvar + 1):
            errors = x_std - pc.project(i, transform=False, unweight=False)
            rsquare[i] = 1.0 - np.sum(errors**2) / tss
        assert_allclose(rsquare, pc.rsquare)

        pc = PCA(self.x, standardize=False)
        tss = np.sum(x_demean**2)
        for i in range(nvar + 1):
            errors = x_demean - pc.project(i, transform=False, unweight=False)
            rsquare[i] = 1.0 - np.sum(errors**2) / tss
        assert_allclose(rsquare, pc.rsquare)

        pc = PCA(self.x, standardize=False, demean=False)
        tss = np.sum(x**2)
        for i in range(nvar + 1):
            errors = x - pc.project(i, transform=False, unweight=False)
            rsquare[i] = 1.0 - np.sum(errors**2) / tss
        assert_allclose(rsquare, pc.rsquare)
Example #6
    def test_missing_dataframe(self):
        x = self.x.copy()
        x[::5, ::7] = np.nan
        pc = PCA(x, ncomp=3, missing='fill-em')

        x = pd.DataFrame(x)
        pc_df = PCA(x, ncomp=3, missing='fill-em')
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)

        pc_df_nomissing = PCA(pd.DataFrame(self.x.copy()), ncomp=3)
        assert_true(isinstance(pc_df.coeff, type(pc_df_nomissing.coeff)))
        assert_true(isinstance(pc_df.data, type(pc_df_nomissing.data)))
        assert_true(
            isinstance(pc_df.eigenvals, type(pc_df_nomissing.eigenvals)))
        assert_true(
            isinstance(pc_df.eigenvecs, type(pc_df_nomissing.eigenvecs)))

        x = self.x.copy()
        x[::5, ::7] = np.nan
        x_df = pd.DataFrame(x)
        pc = PCA(x, missing='drop-row')
        pc_df = PCA(x_df, missing='drop-row')
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)

        pc = PCA(x, missing='drop-col')
        pc_df = PCA(x_df, missing='drop-col')
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)

        pc = PCA(x, missing='drop-min')
        pc_df = PCA(x_df, missing='drop-min')
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)
Example #7
    def test_warnings_and_errors(self):
        with warnings.catch_warnings(record=True) as w:
            pc = PCA(self.x, ncomp=300)
            assert_equal(len(w), 1)

        with warnings.catch_warnings(record=True) as w:
            rs = self.rs
            x = rs.standard_normal((200, 1)) * np.ones(200)
            pc = PCA(x, method='eig')
            assert_equal(len(w), 1)

        assert_raises(ValueError, PCA, self.x, method='unknown')
        assert_raises(ValueError, PCA, self.x, missing='unknown')
        assert_raises(ValueError, PCA, self.x, tol=2.0)
        assert_raises(ValueError, PCA, np.nan * np.ones((200, 100)), tol=2.0)
Example #8
def test_gls_warning(reset_randomstate):
    data = np.random.standard_normal((400, 200))
    data[:, 1:] = data[:, :1] + .01 * data[:, 1:]
    with pytest.warns(EstimationWarning,
                      match="Many series are being down weighted"):
        factors = PCA(data, ncomp=2, gls=True).factors
    assert factors.shape == (data.shape[0], 2)
Example #9
def impute_accuracy(missingCube, missingGlyCube, comps, PCAcompare=True):
    """ Calculate the imputation R2X """
    cube, glyCube, _ = form_tensor()
    CMTFR2X = np.zeros(comps.shape)
    PCAR2X = np.zeros(comps.shape)

    # compare artificially introduced missingness only
    imputeCube = np.copy(cube)
    imputeCube[np.isfinite(missingCube)] = np.nan
    imputeGlyCube = np.copy(glyCube)
    imputeGlyCube[np.isfinite(missingGlyCube)] = np.nan

    if PCAcompare:
        missingMat = flatten_to_mat(missingCube, missingGlyCube)
        imputeMat = np.copy(flatten_to_mat(cube, glyCube))
        imputeMat[np.isfinite(missingMat)] = np.nan

    for ii, nComp in enumerate(comps):
        # reconstruct with some values missing
        recon_cmtf = perform_CMTF(missingCube, missingGlyCube, nComp)
        CMTFR2X[ii] = calcR2X(recon_cmtf, tIn=imputeCube, mIn=imputeGlyCube)

        if PCAcompare:
            outt = PCA(missingMat, ncomp=nComp, missing="fill-em", standardize=False, demean=False, normalize=False)
            recon_pca = outt.scores @ outt.loadings.T
            PCAR2X[ii] = calcR2X(recon_pca, mIn=imputeMat)

    return CMTFR2X, PCAR2X
Example #10
def main():
    beg_date = '2004-01-01'
    funds = ['002001_Nav']
    period = 25
    df_filtered = fund_Analysis(beg_date, funds)
    train_sets, cv_sets, test_sets = fund_data_proprocessing(
        beg_date,
        funds,
        df_filtered,
        degroup='Roll',
        split_portion=0.15,
        period=period)
    test_features_data, features_name, test_labels = getTFDataSets(
        test_sets, period)
    train_features_data, _, train_labels = getTFDataSets(train_sets, period)
    cv_features_data, _, cv_labels = getTFDataSets(cv_sets, period)

    X = np.append(np.append(train_features_data, cv_features_data, axis=0),
                  test_features_data,
                  axis=0)
    X_2 = np.append(train_features_data, cv_features_data, axis=0)
    y = np.append(np.append(train_labels, cv_labels, axis=0),
                  test_labels,
                  axis=0)
    y_2 = np.append(train_labels, cv_labels, axis=0)

    print "Sample Size: {}".format(X_2.shape)
    print "Labels size: {}".format(y_2.shape)

    pca = PCA(X, ncomp=200)
    print(pca.factors.shape)
    print(pca.ic)
    print(pca.eigenvals)
Example #11
    def test_against_reference(self):
        # Test against MATLAB, which by default demeans but does not standardize
        x = data.xo / 1000.0
        pc = PCA(x, normalize=False, standardize=False)

        ref = princomp1
        assert_allclose(np.abs(pc.factors), np.abs(ref.factors))
        assert_allclose(pc.factors.dot(pc.coeff) + x.mean(0), x)
        assert_allclose(np.abs(pc.coeff), np.abs(ref.coef.T))
        assert_allclose(pc.factors.dot(pc.coeff), ref.factors.dot(ref.coef.T))

        pc = PCA(x[:20], normalize=False, standardize=False)
        mu = x[:20].mean(0)
        ref = princomp2
        assert_allclose(np.abs(pc.factors), np.abs(ref.factors))
        assert_allclose(pc.factors.dot(pc.coeff) + mu, x[:20])
        assert_allclose(np.abs(pc.coeff), np.abs(ref.coef.T))
        assert_allclose(pc.factors.dot(pc.coeff), ref.factors.dot(ref.coef.T))
Example #12
    def test_options(self):
        pc = PCA(self.x)
        pc_no_norm = PCA(self.x, normalize=False)
        assert_allclose(pc.factors.dot(pc.coeff),
                        pc_no_norm.factors.dot(pc_no_norm.coeff))
        princomp = pc.factors
        assert_allclose(princomp.T.dot(princomp), np.eye(100), atol=1e-5)
        weights = pc_no_norm.coeff
        assert_allclose(weights.T.dot(weights), np.eye(100), atol=1e-5)

        pc_10 = PCA(self.x, ncomp=10)
        assert_allclose(pc.factors[:, :10], pc_10.factors)
        assert_allclose(pc.coeff[:10, :], pc_10.coeff)
        assert_allclose(pc.rsquare[:(10 + 1)], pc_10.rsquare)
        assert_allclose(pc.eigenvals[:10], pc_10.eigenvals)
        assert_allclose(pc.eigenvecs[:, :10], pc_10.eigenvecs)

        pc = PCA(self.x, standardize=False, normalize=False)
        mu = self.x.mean(0)
        xdm = self.x - mu
        xpx = xdm.T.dot(xdm)
        val, vec = np.linalg.eigh(xpx)
        ind = np.argsort(val)
        ind = ind[::-1]
        val = val[ind]
        vec = vec[:, ind]
        assert_allclose(xdm, pc.transformed_data)
        assert_allclose(val, pc.eigenvals)
        assert_allclose(np.abs(vec), np.abs(pc.eigenvecs))
        assert_allclose(np.abs(pc.factors), np.abs(xdm.dot(vec)))
        assert_allclose(pc.projection, xdm + mu)

        pc = PCA(self.x, standardize=False, demean=False, normalize=False)
        x = self.x
        xpx = x.T.dot(x)
        val, vec = np.linalg.eigh(xpx)
        ind = np.argsort(val)
        ind = ind[::-1]
        val = val[ind]
        vec = vec[:, ind]
        assert_allclose(x, pc.transformed_data)
        assert_allclose(val, pc.eigenvals)
        assert_allclose(np.abs(vec), np.abs(pc.eigenvecs))
        assert_allclose(np.abs(pc.factors), np.abs(x.dot(vec)))
Example #13
    def principle_component_analysis(self, v, clean_data="greedy"):

        s = self.map_column_to_sheet(v[0])

        # prepare data
        dfClean = s.cleanData(v, clean_data)
        data = dfClean[v]

        pca = PCA(data)

        return pca
Example #15
def makeFigure():
    """Get a list of the axis objects and create a figure"""
    # Get list of axis objects
    ax, f = getSetup((6, 3), (1, 2))

    comps = np.arange(1, 13)
    TMTFR2X = np.zeros(comps.shape)
    PCAR2X = np.zeros(comps.shape)

    tOrig, mOrig = createCube()

    tMat = np.reshape(tOrig, (181, -1))
    tMat = tMat[:, ~np.all(np.isnan(tMat), axis=0)]
    tMat = np.hstack((tMat, mOrig))

    sizePCA = comps * np.sum(tMat.shape)
    sizeTfac = comps * (np.sum(tOrig.shape) + mOrig.shape[1])

    for i, cc in enumerate(comps):
        outt = PCA(tMat,
                   ncomp=cc,
                   missing="fill-em",
                   standardize=False,
                   demean=False,
                   normalize=False)
        recon = outt.scores @ outt.loadings.T
        PCAR2X[i] = np.nanvar(tMat - recon) / np.nanvar(tMat)

        _, _, TMTFR2X[i] = perform_CMTF(tOrig, mOrig, r=cc)

    ax[0].scatter(comps, TMTFR2X, color="k", s=10)
    ax[0].set_ylabel("TMTF R2X")
    ax[0].set_xlabel("Number of Components")
    ax[0].set_xticks([x for x in comps])
    ax[0].set_xticklabels([x for x in comps])
    ax[0].set_ylim(0, 1)
    ax[0].set_xlim(0.0, np.amax(comps) + 0.5)

    ax[1].set_xscale("log", base=2)
    ax[1].plot(sizePCA, PCAR2X, "r.", label="PCA")
    ax[1].plot(sizeTfac, 1.0 - TMTFR2X, "k.", label="TMTF")
    ax[1].set_ylabel("Normalized Unexplained Variance")
    ax[1].set_xlabel("Size of Factorization")
    ax[1].set_ylim(bottom=0.0)
    ax[1].set_xlim(2**8, 2**12)
    ax[1].legend()

    # Add subplot labels
    subplotLabel(ax)

    return f
Example #16
    def test_pca(self):
        p = 20
        x = np.random.randn(100)[:, None]
        x = x + np.random.randn(100, p)
        pc = PCA(x, ncomp=p, missing=None)

        t = np.arange(100)

        mslist = []
        for i in range(p):
            mslist.append(pyleo.Series(time=t, value=x[:, i]))
        ms = pyleo.MultipleSeries(mslist)

        #res = ms.pca(nMC=20, missing='fill-em', standardize=False)
        res = ms.pca(nMC=20)

        # assert what?
        assert_array_equal(pc.eigenvals, res['eigvals'])
Example #17
def initialize_cp(tensor: np.ndarray, matrix: np.ndarray, rank: int):
    r"""Initialize factors used in `parafac`.
    Parameters
    ----------
    tensor : ndarray
    rank : int
    Returns
    -------
    factors : CPTensor
        An initial cp tensor.
    """
    factors = []
    for mode in range(tl.ndim(tensor)):
        unfold = tl.unfold(tensor, mode)

        if mode == 0 and (matrix is not None):
            unfold = np.hstack((unfold, matrix))

        # Remove completely missing columns
        unfold = unfold[:, np.sum(np.isfinite(unfold), axis=0) > 2]

        # Impute by PCA
        outt = PCA(unfold,
                   ncomp=1,
                   method="nipals",
                   missing="fill-em",
                   standardize=False,
                   demean=False,
                   normalize=False,
                   max_em_iter=1000)
        recon_pca = outt.scores @ outt.loadings.T
        unfold[np.isnan(unfold)] = recon_pca[np.isnan(unfold)]

        U = np.linalg.svd(unfold)[0]

        if U.shape[1] < rank:
            # This is a hack but it seems to do the job for now
            pad_part = np.random.rand(U.shape[0], rank - U.shape[1])
            U = tl.concatenate([U, pad_part], axis=1)

        factors.append(U[:, :rank])

    return tl.cp_tensor.CPTensor((None, factors))
Example #18
def sm_pca(u, v):
    """
    Compute principal directions of variation
    """
    # Form input into dataframe
    data = pd.DataFrame({'u': u, 'v': v})

    # Clean data
    data = data.query('~u.isnull() & ~v.isnull()')

    # Perform PCA
    pca_model = PCA(data, demean=True, standardize=False)

    # Component vectors
    u_1, v_1 = pca_model.eigenvecs.iloc[:, 0]
    u_2, v_2 = pca_model.eigenvecs.iloc[:, 1]
    l_1, l_2 = pca_model.eigenvals

    # Compute angle of eigenvector 1
    theta = 180 * np.arctan2(v_1, u_1) / np.pi

    return u_1, v_1, u_2, v_2, l_1, l_2, theta
Example #19
    def test_projection(self):
        pc = PCA(self.x, ncomp=5)
        mu = self.x.mean(0)
        demean_x = self.x - mu
        coef = np.linalg.pinv(pc.factors).dot(demean_x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct + mu)

        pc = PCA(self.x, standardize=False, ncomp=5)
        coef = np.linalg.pinv(pc.factors).dot(demean_x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct + mu)

        pc = PCA(self.x, standardize=False, demean=False, ncomp=5)
        coef = np.linalg.pinv(pc.factors).dot(self.x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct)

        pc = PCA(self.x, ncomp=5, gls=True)
        mu = self.x.mean(0)
        demean_x = self.x - mu
        coef = np.linalg.pinv(pc.factors).dot(demean_x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct + mu)

        pc = PCA(self.x, standardize=False, ncomp=5)
        coef = np.linalg.pinv(pc.factors).dot(demean_x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct + mu)

        pc = PCA(self.x, standardize=False, demean=False, ncomp=5, gls=True)
        coef = np.linalg.pinv(pc.factors).dot(self.x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct)

        # Test error for too many factors
        project = pc.project
        assert_raises(ValueError, project, 6)
Example #20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 16 13:32:05 2020

@author: kellenbullock
"""
import pandas as pd
from statsmodels.multivariate.pca import PCA

df = pd.read_excel('Final.xlsx')
df = df.drop(columns=['Unnamed: 0'])

c = PCA(df, standardize=False)
Example #21
    def test_replace_missing(self):
        x = self.x.copy()
        x[::5, ::7] = np.nan

        pc = PCA(x, missing='drop-row')
        x_dropped_row = x[np.logical_not(np.any(np.isnan(x), 1))]
        pc_dropped = PCA(x_dropped_row)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-col')
        x_dropped_col = x[:, np.logical_not(np.any(np.isnan(x), 0))]
        pc_dropped = PCA(x_dropped_col)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-min')
        if x_dropped_row.size > x_dropped_col.size:
            x_dropped_min = x_dropped_row
        else:
            x_dropped_min = x_dropped_col
        pc_dropped = PCA(x_dropped_min)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, ncomp=3, missing='fill-em')
        missing = np.isnan(x)
        mu = np.nanmean(x, axis=0)
        errors = x - mu
        sigma = np.sqrt(np.nanmean(errors**2, axis=0))
        x_std = errors / sigma
        x_std[missing] = 0.0
        last = x_std[missing]
        delta = 1.0
        count = 0
        while delta > 5e-8:
            pc_temp = PCA(x_std, ncomp=3, standardize=False, demean=False)
            x_std[missing] = pc_temp.projection[missing]
            current = x_std[missing]
            diff = current - last
            delta = np.sqrt(np.sum(diff**2)) / np.sqrt(np.sum(current**2))
            last = current
            count += 1
        x = self.x + 0.0
        projection = pc_temp.projection * sigma + mu
        x[missing] = projection[missing]
        assert_allclose(pc._adjusted_data, x)
        # Check data for no changes
        assert_equal(self.x, self.x_copy)

        x = self.x
        pc = PCA(x)
        pc_dropped = PCA(x, missing='drop-row')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-col')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-min')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc = PCA(x, ncomp=3)
        pc_dropped = PCA(x, ncomp=3, missing='fill-em')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        # Test too many missing for missing='fill-em'
        x = self.x.copy()
        x[:, :] = np.nan
        assert_raises(ValueError, PCA, x, missing='drop-row')
        assert_raises(ValueError, PCA, x, missing='drop-col')
        assert_raises(ValueError, PCA, x, missing='drop-min')
        assert_raises(ValueError, PCA, x, missing='fill-em')
Example #22
from statsmodels.multivariate.pca import PCA

model = PCA(X)

transformed = model.transformed_data

components = model.eigenvecs

components
Example #23
 def test_pandas(self):
     pc = PCA(pd.DataFrame(self.x))
     pc1 = PCA(self.x)
     assert_equal(pc.factors.values, pc1.factors)
     fig = pc.plot_scree()
     fig = pc.plot_scree(ncomp=10)
     fig = pc.plot_scree(log_scale=False)
     fig = pc.plot_rsquare()
     fig = pc.plot_rsquare(ncomp=5)
     proj = pc.project(2)
     PCA(pd.DataFrame(self.x), ncomp=4, gls=True)
     PCA(pd.DataFrame(self.x), ncomp=4, standardize=False)
Example #24
# There are two ways to use PCA to analyze a rectangular matrix: we can treat
# the rows as "objects" and the columns as "variables", or vice versa. Here we
# treat the fertility measures as the "variables" and the countries as the
# "objects". The goal is thus to reduce the yearly fertility values to a small
# number of fertility "profiles" or "basis functions" that capture most of the
# variation over time across the different countries.

# The mean trend, which PCA removes, is worth a look. It shows that fertility
# declined steadily over the period covered by this dataset. Note that the
# mean is computed using the countries as the units of analysis, ignoring
# population size; the same holds for the PC analysis carried out below. A
# more sophisticated analysis might weight the countries, say by their 1980
# population (a minimal sketch of this idea follows the plot below).

ax = dta.mean().plot(grid=False)
ax.set_xlabel("Year", size=17)
ax.set_ylabel("Fertility rate", size=17)
ax.set_xlim(0, 51)
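
# A minimal sketch of that weighting idea (an aside, not part of the original
# notebook): PCA's `weights` argument applies one weight per column, i.e. per
# country once we pass dta.T. Here `w` is a hypothetical placeholder standing
# in for a real per-country series such as 1980 population, aligned with the
# columns of dta.T.
import numpy as np

w = np.ones(dta.T.shape[1])  # placeholder weights; substitute real populations
pca_model_weighted = PCA(dta.T, standardize=False, demean=True, weights=w)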

# Next, we run the PCA:

pca_model = PCA(dta.T, standardize=False, demean=True)

# Based on the eigenvalues, we see that the first principal component (PC) dominates, with perhaps a small amount of meaningful variation captured by the second and third PCs.

fig = pca_model.plot_scree(log_scale=False)

# Next we plot the PC factors. The dominant factor is monotonically increasing. Countries with a positive score on the first factor will increase faster (or decrease slower) compared to the mean shown above. Countries with a negative score on the first factor will decrease faster than the mean. The second factor is U-shaped with a positive peak at around 1985. Countries with a large positive score on the second factor
# will have lower than average fertility at the beginning and end of the data range, but higher than average fertility in the middle of the range.

fig, ax = plt.subplots(figsize=(8, 4))
lines = ax.plot(pca_model.factors.iloc[:, :3], lw=4, alpha=.6)
ax.set_xticklabels(dta.columns.values[::10])
ax.set_xlim(0, 51)
ax.set_xlabel("Year", size=17)
fig.subplots_adjust(.1, .1, .85, .9)
Example #25
 def test_smoke_plot_and_repr(self, close_figures):
     pc = PCA(self.x)
     fig = pc.plot_scree()
     fig = pc.plot_scree(ncomp=10)
     fig = pc.plot_scree(log_scale=False)
     fig = pc.plot_scree(cumulative=True)
     fig = pc.plot_rsquare()
     fig = pc.plot_rsquare(ncomp=5)
     # Additional smoke test
     pc.__repr__()
     pc = PCA(self.x, standardize=False)
     pc.__repr__()
     pc = PCA(self.x, standardize=False, demean=False)
     pc.__repr__()
     # Check data for no changes
     assert_equal(self.x, pc.data)
Example #26
 def test_equivalence(self):
     x = self.x.copy()
     assert_allclose(PCA(x).factors, pca(x)[0])
Example #27
def hdrboxplot(data,
               ncomp=2,
               alpha=None,
               threshold=0.95,
               bw=None,
               xdata=None,
               labels=None,
               ax=None,
               use_brute=False,
               seed=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, uses as many components as
        the smaller of the number of rows or columns in `data`.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None
    threshold : float between 0 and 1, optional
        Percentile threshold value for outlier detection. A high value means
        lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    use_brute : bool
        Use the brute force optimizer instead of the default differential
        evolution to find the curves. Default is False.
    seed : {None, int, np.random.RandomState}
        Seed value to pass to scipy.optimize.differential_evolution. Can be an
        integer or RandomState instance. If None, then the default RandomState
        provided by np.random is used.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
            curves.
         - 'extra_quantiles', list of array. Extra quantile band.
            [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.

    Notes
    -----
    The median curve is the curve with the highest probability in the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scenes, the dataset is represented as a matrix, each row
    corresponding to a 1D curve. This matrix is then decomposed using
    Principal Components Analysis (PCA), which makes it possible to represent
    the data using a finite number of modes, or components. This compression
    turns the functional representation into a scalar representation of the
    matrix; in other words, each curve can be visualized from its components,
    and each curve is thus a point in this reduced space. With 2 components,
    this is called a bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), the
    corresponding curves in the original space are also similar. Finding the
    median curve then amounts to finding the highest density region (HDR) in
    the reduced space; the farther a point lies from this HDR, the less
    likely its curve is to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the probability density associated with the cluster of points
    and plot its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute the median curve along with the quantile and outlier curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots
        for Functional Data", Journal of Computational and Graphical
        Statistics, vol. 19, no. 1, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r,
                                  bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        alpha.extend([threshold, 0.9, 0.5])
        alpha = list(set(alpha))
    alpha.sort(reverse=True)

    n_quantiles = len(alpha)
    pdf_r = ks_gaussian.pdf(data_r).flatten()
    pvalues = [
        np.percentile(pdf_r, (1 - alpha[i]) * 100, interpolation='linear')
        for i in range(n_quantiles)
    ]

    # Find the median curve and the outliers
    if have_de_optim and not use_brute:
        median = differential_evolution(lambda x: -ks_gaussian.pdf(x),
                                        bounds=bounds,
                                        maxiter=5,
                                        seed=seed).x
    else:
        median = brute(lambda x: -ks_gaussian.pdf(x),
                       ranges=bounds,
                       finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default differential
            evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution. Can
            be an integer or RandomState instance. If None, then the default
            RandomState provided by np.random is used.


        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)

        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            max_pdf = 1E6
        band = [min_pdf, max_pdf]

        pool = Pool()
        data = zip(
            range(dim),
            itertools.repeat(
                (band, pca, bounds, ks_gaussian, seed, use_brute)))
        band_quantiles = pool.map(_min_max_band, data)
        pool.terminate()
        pool.close()

        band_quantiles = list(zip(*band_quantiles))

        return band_quantiles

    extra_alpha = [
        i for i in alpha if 0.5 != i and 0.9 != i and threshold != i
    ]
    if len(extra_alpha) > 0:
        extra_quantiles = []
        for x in extra_alpha:
            for y in _band_quantiles([x], use_brute=use_brute, seed=seed):
                extra_quantiles.append(y)
    else:
        extra_quantiles = []

    # Inverse transform from the n-variate plot back to the dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
    hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T,
            data.T,
            c='c',
            alpha=.1,
            label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(
        ax.fill_between(xdata,
                        *hdr_50,
                        color='gray',
                        alpha=.4,
                        label='50% HDR'))
    fill_betweens.append(
        ax.fill_between(xdata,
                        *hdr_90,
                        color='gray',
                        alpha=.3,
                        label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y',
                ls='-.',
                alpha=.4,
                label='Extra quantiles')

    if len(outliers) != 0:
        for ii, outlier in enumerate(outliers):
            if labels_outlier is None:
                label = 'Outliers'
            else:
                label = str(labels_outlier[ii])
            ax.plot(xdata, outlier, ls='--', alpha=0.7, label=label)

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See https://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1, fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
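
A condensed sketch of the reduction-and-density idea described in the Notes above (PCA scores, a kernel density estimate on the scores, then a percentile threshold to flag outlier candidates). This is an illustration on synthetic data, not part of the library function:

import numpy as np
from statsmodels.multivariate.pca import PCA
from statsmodels.nonparametric.kernel_density import KDEMultivariate

curves = np.random.standard_normal((60, 12))  # 60 synthetic curves, 12 points each
scores = PCA(curves, ncomp=2).factors         # reduced bivariate representation
kde = KDEMultivariate(scores, var_type='cc', bw='normal_reference')
pdf = kde.pdf(scores)                         # density at each curve's point
outlier_idx = np.where(pdf < np.percentile(pdf, 5))[0]  # roughly threshold=0.95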
Example #28
 def test_equivalence_full_matrices(self):
     x = self.x.copy()
     svd_full_matrices_true = PCA(x, svd_full_matrices=True).factors
     svd_full_matrices_false = PCA(x).factors
     assert_allclose(svd_full_matrices_true, svd_full_matrices_false)
Example #29
def test_missing():
    data = np.empty((200, 50))
    data[0, 0] = np.nan
    with pytest.raises(ValueError, match="data contains non-finite values"):
        PCA(data)
Example #30
# pca in statsmodels
import numpy as np
from statsmodels.multivariate.pca import PCA
X = np.random.randn(100)[:, None]
X = X + np.random.randn(100, 100)
pc = PCA(X)

print(pc.factors.shape)
pc.plot_scree(ncomp=5).show()
Example #31
 def test_smoke_plot_and_repr(self):
     pc = PCA(self.x)
     fig = pc.plot_scree()
     fig = pc.plot_scree(ncomp=10)
     fig = pc.plot_scree(log_scale=False)
     fig = pc.plot_scree(cumulative=True)
     fig = pc.plot_rsquare()
     fig = pc.plot_rsquare(ncomp=5)
     # Additional smoke test
     pc.__repr__()
     pc = PCA(self.x, standardize=False)
     pc.__repr__()
     pc = PCA(self.x, standardize=False, demean=False)
     pc.__repr__()
     # Check data for no changes
     assert_equal(self.x, pc.data)
Example #32
def makeFigure():
    ax, f = getSetup((11, 14), (4, 3))
    comps = np.arange(1, 7)

    tensor, _ = Tensor3D()

    CMTFfacs = [
        parafac(tensor,
                cc,
                tol=1e-12,
                n_iter_max=4000,
                linesearch=True,
                orthogonalise=2) for cc in comps
    ]

    # Normalize factors
    CMTFfacs = [cp_normalize(f) for f in CMTFfacs]
    CMTFfacs = [reorient_factors(f) for f in CMTFfacs]
    CMTFfacs = [
        sort_factors(f) if i > 0 else f for i, f in enumerate(CMTFfacs)
    ]

    # Calculate R2X
    CMTFR2X = np.array([calcR2X(f, tensor) for f in CMTFfacs])
    print(CMTFR2X)

    ax[0].axis("off")
    ax[1].scatter(comps, CMTFR2X, color="b")
    ax[1].set_ylabel("R2X")
    ax[1].set_xlabel("Number of Components")
    ax[1].set_xticks([x for x in comps])
    ax[1].set_xticklabels([x for x in comps])
    ax[1].set_ylim(0, 1)
    ax[1].set_xlim(0.0, np.amax(comps) + 0.5)

    PCAR2X = np.zeros(comps.shape)
    sizeTfac = np.zeros(comps.shape)

    tMat = flatten_to_mat(tensor)
    sizePCA = comps * np.sum(tMat.shape)

    for i, cc in enumerate(comps):
        outt = PCA(tMat,
                   ncomp=cc,
                   missing="fill-em",
                   standardize=False,
                   demean=False,
                   normalize=False)
        recon = outt.scores @ outt.loadings.T
        PCAR2X[i] = calcR2X(recon, mIn=tMat)
        sizeTfac[i] = tensor_degFreedom(CMTFfacs[i])

    ax[2].set_xscale("log", base=2)
    ax[2].plot(sizeTfac, 1.0 - CMTFR2X, ".", label="CMTF")
    ax[2].plot(sizePCA, 1.0 - PCAR2X, ".", label="PCA")
    ax[2].set_ylabel("Normalized Unexplained Variance")
    ax[2].set_xlabel("Size of Reduced Data")
    ax[2].set_ylim(bottom=0.0)
    ax[2].set_xlim(2**8, 2**12)
    ax[2].xaxis.set_major_formatter(ScalarFormatter())
    ax[2].legend()

    # Colormap

    Rlabels, agLabels = dimensionLabel3D()
    tfac = CMTFfacs[2]

    # Flip comp. 2
    tfac.factors[0][:, 1] *= -1
    tfac.factors[2][:, 1] *= -1

    components = [str(ii + 1) for ii in range(tfac.rank)]
    comp_plot(tfac.factors[0], components, False, "Samples", ax[3])
    comp_plot(tfac.factors[1], components, agLabels, "Antigens", ax[4])
    comp_plot(tfac.factors[2], components, Rlabels, "Receptors", ax[5])

    time_plot(tfac, ax[6])
    time_plot(tfac, ax[7], condition="Negative")
    time_plot(tfac, ax[8], condition="Moderate")
    time_plot(tfac, ax[9], condition="Severe")
    time_plot(tfac, ax[10], condition="Deceased")

    df = time_components_df(tfac)
    sns.boxplot(data=df.loc[df["week"] == 1, :],
                x="Factors",
                y="value",
                hue="group",
                ax=ax[11])

    #sns.boxplot(data=df.loc[df["week"] == 3, :], x="variable", y="value", hue="group")

    subplotLabel(ax)
    return f
Example #33
# Source data processed and unified into a DataFrame
df = pca_dataf(data_base, list_plan_ref)  # This step takes quite a while
df = df.loc['1996-01-01':, :]
#adf_res = eu.adf_test(df)  # ADF test results for each series
df_t = df.copy()  # copy of the df for the transformed data

for i, series in enumerate(df):
    if any(df[series] <= 0):
        df_t[series] = df[series].diff()
    else:
        df_t[series] = df[series].pct_change(fill_method=None)

df_t = df_t.dropna(axis=0, how='all')

pc = PCA(df, ncomp=1, standardize=True, missing='fill-em')
pc_t = PCA(df_t, ncomp=1, standardize=True, missing='fill-em')

wb = xw.Book(r'F:\DADOS\ASSET\MACROECONOMIA\DADOS\Atividade\PCA\PCA_ativ.xlsm')
sht = wb.sheets['pca']
sht_d = wb.sheets['pca_d']
sht.range('A1').value = pc.scores
sht_d.range('A1').value = pc_t.scores

# ANALYSIS OF DATA ALREADY RELEASED
last = pd.DataFrame(df.iloc[-1, :]).T
last = last.dropna(axis=1)

df_last = df.filter(items=last.columns)
df_tlast = df_last.copy()
for i, series in enumerate(df_last):
Example #34
# The mean trend is removed in PCA, but it's worthwhile taking a look at
# it.  It shows that fertility has dropped steadily over the time period
# covered in this dataset.  Note that the mean is calculated using a country
# as the unit of analysis, ignoring population size.  This is also true for
# the PC analysis conducted below.  A more sophisticated analysis might
# weight the countries, say by population in 1980.

ax = dta.mean().plot(grid=False)
ax.set_xlabel("Year", size=17)
ax.set_ylabel("Fertility rate", size=17)
ax.set_xlim(0, 51)

# Next we perform the PCA:

pca_model = PCA(dta.T, standardize=False, demean=True)

# Based on the eigenvalues, we see that the first PC dominates, with
# perhaps a small amount of meaningful variation captured in the second and
# third PC's.

fig = pca_model.plot_scree(log_scale=False)
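
# A small numeric companion to the scree plot (a sketch, not part of the
# original notebook): the fraction of total variance carried by the leading
# eigenvalues, which makes the dominance of the first PC explicit.
import numpy as np

ev = np.asarray(pca_model.eigenvals)
print(ev[:3] / ev.sum())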

# Next we will plot the PC factors.  The dominant factor is monotonically
# increasing.  Countries with a positive score on the first factor will
# increase faster (or decrease slower) compared to the mean shown above.
# Countries with a negative score on the first factor will decrease faster
# than the mean.  The second factor is U-shaped with a positive peak at
# around 1985.  Countries with a large positive score on the second factor
# will have lower than average fertilities at the beginning and end of the
# data range, but higher than average fertility in the middle of the range.