Example #1
0
def SPCA(X, reg, reg2):
    """Fit a 9-component SparsePCA on standardized data.

    Returns the fitted components rescaled so each row has unit L2 norm.
    ``reg`` is the lasso alpha, ``reg2`` the ridge alpha used at transform.
    """
    scaled = StandardScaler().fit_transform(X)
    model = SparsePCA(n_components=9, alpha=reg, ridge_alpha=reg2)
    model.fit(scaled)
    unit_rows = [row / np.linalg.norm(row) for row in model.components_]
    return np.array(unit_rows)
def test_fit_transform():
    """Serial and 2-job parallel fits agree; CD components match LARS."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    lars_model = SparsePCA(n_components=3, method='lars', alpha=alpha,
                           random_state=0)
    lars_model.fit(Y)
    U1 = lars_model.transform(Y)
    # Multiple CPUs; on win32 parallelism is faked by disabling
    # multiprocessing inside joblib for the duration of the fit.
    if sys.platform != 'win32':  # we can efficiently use parallelism
        parallel_model = SparsePCA(n_components=3, n_jobs=2, method='lars',
                                   alpha=alpha, random_state=0).fit(Y)
        U2 = parallel_model.transform(Y)
    else:
        import sklearn.externals.joblib.parallel as joblib_par
        saved_mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            parallel_model = SparsePCA(n_components=3, n_jobs=2,
                                       random_state=0, alpha=alpha).fit(Y)
            U2 = parallel_model.transform(Y)
        finally:
            joblib_par.multiprocessing = saved_mp
    assert_true(not np.all(lars_model.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Coordinate descent should produce similar components to LARS.
    cd_model = SparsePCA(n_components=3, method='cd', random_state=0,
                         alpha=alpha)
    cd_model.fit(Y)
    assert_array_almost_equal(cd_model.components_, lars_model.components_)
def test_initialization():
    """With max_iter=0 the supplied V_init must be returned unchanged."""
    rng = np.random.RandomState(0)
    code_init = rng.randn(5, 3)
    dict_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=code_init, V_init=dict_init,
                      max_iter=0, random_state=rng)
    model.fit(rng.randn(5, 4))
    assert_array_equal(model.components_, dict_init)
Example #4
0
def test_fit_transform():
    """CD matches LARS; deprecated ridge_alpha on transform() warns."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    lars_est = SparsePCA(n_components=3, method='lars', alpha=alpha,
                         random_state=0)
    lars_est.fit(Y)

    # Coordinate descent should learn similar components.
    cd_est = SparsePCA(n_components=3, method='cd', random_state=0,
                       alpha=alpha)
    cd_est.fit(Y)
    assert_array_almost_equal(cd_est.components_, lars_est.components_)

    # Passing ridge_alpha to transform() is deprecated and must warn,
    # whether the value is explicit or None.
    warning_msg = "The ridge_alpha parameter on transform()"
    for deprecated_value in (0.01, None):
        assert_warns_message(DeprecationWarning, warning_msg,
                             lars_est.transform, Y,
                             ridge_alpha=deprecated_value)
Example #5
0
def test_fit_transform():
    """Serial vs parallel LARS fits agree; CD components match LARS."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    serial = SparsePCA(n_components=3, method='lars', alpha=alpha,
                       random_state=0)
    serial.fit(Y)
    U1 = serial.transform(Y)
    # The same fit distributed over two jobs must give the same projection.
    parallel = SparsePCA(n_components=3, n_jobs=2, method='lars',
                         alpha=alpha, random_state=0).fit(Y)
    U2 = parallel.transform(Y)
    assert_true(not np.all(serial.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Coordinate descent should give similar components.
    cd = SparsePCA(n_components=3, method='cd', random_state=0, alpha=alpha)
    cd.fit(Y)
    assert_array_almost_equal(cd.components_, serial.components_)
Example #6
0
def test_fit_transform():
    # Checks that a serial LARS fit, a 2-job parallel fit, and a
    # coordinate-descent fit all agree on a small toy problem.
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        # Temporarily disable multiprocessing inside joblib so n_jobs=2
        # degrades to serial execution; restored in the finally block.
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            spca = SparsePCA(n_components=3, n_jobs=2, random_state=0,
                             alpha=alpha).fit(Y)
            U2 = spca.transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                         random_state=0).fit(Y)
        U2 = spca.transform(Y)
    # The fit must be non-trivial and independent of n_jobs.
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
Example #7
0
class SparsePCA():
    """Column-subset feature transformer around sklearn's SparsePCA.

    Fits sparse PCA on ``data[cols]`` and replaces those columns with
    ``spca_1 .. spca_<n_components>`` columns in the returned DataFrame.
    """

    def __init__(self, cols, n_components):
        # BUG FIX: this class shadows ``sklearn.decomposition.SparsePCA``,
        # so the original ``SparsePCA(n_components=...)`` call resolved to
        # this class itself (missing the required ``cols`` argument ->
        # TypeError). Import the sklearn estimator under an alias instead.
        from sklearn.decomposition import SparsePCA as _SkSparsePCA
        self.n_components = n_components
        self.model = _SkSparsePCA(n_components=n_components)
        self.columns = cols

    def fit(self, data):
        """Fit the underlying estimator on the selected columns only."""
        self.model.fit(data[self.columns])

    def _attach(self, data, transformed):
        # Append the projected spca_* columns and drop the raw inputs.
        frame = pd.DataFrame(
            transformed,
            columns=["spca_" + str(i + 1) for i in range(self.n_components)])
        out = pd.concat([data, frame], axis=1)
        return out.drop(self.columns, axis=1)

    def fit_transform(self, data):
        """Fit, then return ``data`` with the SPCA columns substituted in."""
        transformed = self.model.fit_transform(data[self.columns])
        return self._attach(data, transformed)

    def transform(self, data):
        """Return ``data`` with the SPCA columns substituted in."""
        transformed = self.model.transform(data[self.columns])
        return self._attach(data, transformed)
Example #8
0
def test_initialization():
    """A zero-iteration fit must leave the initial dictionary untouched."""
    rng = np.random.RandomState(0)
    U_init, V_init = rng.randn(5, 3), rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init,
                      max_iter=0, random_state=rng)
    model.fit(rng.randn(5, 4))
    assert_array_equal(model.components_, V_init)
Example #9
0
def sccodedirect():
    """Fit a 500-component SparsePCA on the no-glasses model and pickle it.

    (Original docstring: "得到不带眼镜的RPCA结果" — obtain the RPCA result
    for the no-glasses data.)
    """
    nglassmodel = np.load('nglassline.npy').astype('f')
    from sklearn.decomposition import SparsePCA
    learning = SparsePCA(500, verbose=True)
    learning.fit(nglassmodel)
    # PY3 FIX: ``cPickle`` and the ``file()`` builtin no longer exist;
    # use ``pickle`` with a context manager so the handle is always closed.
    import pickle
    with open('sparsepcadirect', 'wb') as fh:
        pickle.dump(learning, fh, -1)
 def sparse_pca(self):
     """
     Fit a SparsePCA on ``self.view`` and return the projected view and
     the principle components.

     NOTE(review): the original docstring also promised the explained
     variance, but it is neither computed nor returned here — confirm
     against callers. Hyperparameters come from a module-level ``param``
     mapping (keys 'components' and 'sparse_pca_alpha').
     """
     model = SparsePCA(n_components=param['components'], alpha=param['sparse_pca_alpha'])
     model.fit(self.view)
     return model.transform(self.view), model.components_
Example #11
0
def sparse_pca(K, alpha, ridge_alpha):
    """Return the leading sparse principal component of K, unit-normalized.

    Also prints the number of entries with magnitude above 1e-10.
    """
    estimator = SparsePCA(n_components=1, alpha=alpha,
                          ridge_alpha=ridge_alpha,
                          normalize_components=False, random_state=0)
    estimator.fit(K)
    leading = estimator.components_[0]
    print('#nnz: ', np.sum(np.abs(leading) > 1.0e-10))
    return leading / np.linalg.norm(leading)
Example #12
0
def test_initialization():
    """max_iter=0 returns V_init rescaled to unit row norms."""
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init,
                      max_iter=0, random_state=rng)
    model.fit(rng.randn(5, 4))
    row_norms = np.linalg.norm(V_init, axis=1)[:, None]
    assert_allclose(model.components_, V_init / row_norms)
Example #13
0
def test_fit_transform():
    """LARS and coordinate-descent solvers learn similar components."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    by_lars = SparsePCA(n_components=3, method="lars", alpha=alpha,
                        random_state=0)
    by_lars.fit(Y)

    # Coordinate descent should reproduce the LARS components.
    by_cd = SparsePCA(n_components=3, method="cd", random_state=0,
                      alpha=alpha)
    by_cd.fit(Y)
    assert_array_almost_equal(by_cd.components_, by_lars.components_)
Example #14
0
def do_sparse_pca(sparse_matrix):
    """Fit a 6-component SparsePCA on a densified sparse matrix.

    Returns the learned components. See
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.SparsePCA.html
    """
    # SparsePCA needs a dense array; go through BSR then densify.
    dense_matrix = sparse_matrix.tobsr().toarray()

    # Instantiate the estimator with explicit (default-like) parameters.
    estimator = SparsePCA(n_components=6, alpha=0.01, ridge_alpha=0.01,
                          max_iter=1000, tol=1e-08, method='lars', n_jobs=1,
                          U_init=None, V_init=None, verbose=False,
                          random_state=None)

    # Train on our matrix and hand back the components.
    estimator.fit(dense_matrix)
    return estimator.components_
def test_initialization(norm_comp):
    """max_iter=0 keeps V_init (row-normalized iff ``norm_comp``)."""
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init,
                      max_iter=0, random_state=rng,
                      normalize_components=norm_comp)
    model.fit(rng.randn(5, 4))
    expected = (V_init / np.linalg.norm(V_init, axis=1)[:, None]
                if norm_comp else V_init)
    assert_allclose(model.components_, expected)
Example #16
0
def spca(data, num_components=None, alpha=1):
    """Sparse PCA of the NaN-free samples in ``data``.

    Returns (components, evars): unit-norm components sorted by the
    adjusted explained variance of Zou, Hastie & Tibshirani,
    "Sparse Principal Component Analysis".
    """
    # creates a matrix with sparse principal component analysis
    # build matrix with all data (samples containing NaNs are dropped)
    data = [d.flatten() for d in data if not any(isnan(d))]
    datamatrix = row_stack(data)

    # center data
    cdata = datamatrix - mean(datamatrix, axis=0)

    if num_components is None:
        num_components = cdata.shape[0]

    # do spca on matrix
    spca = SparsePCA(n_components=num_components, alpha=alpha)
    spca.fit(cdata)

    # normalize components (all-zero components are left untouched)
    components = spca.components_.T
    for r in range(0, components.shape[1]):
        compnorm = numpy.apply_along_axis(numpy.linalg.norm, 0, components[:,
                                                                           r])
        if not compnorm == 0:
            components[:, r] /= compnorm
    components = components.T

    # calc adjusted explained variance from "Sparse Principal Component Analysis" by Zou, Hastie, Tibshirani
    spca.components_ = components
    #nuz = spca.transform(cdata).T
    # NOTE(review): solver='dense_cholesky' is a legacy sklearn spelling
    # (later releases use 'cholesky') — verify against the pinned version.
    nuz = ridge_regression(spca.components_.T,
                           cdata.T,
                           0.01,
                           solver='dense_cholesky').T

    #nuz = dot(components, cdata.T)
    # QR of the projections; leading principal submatrices of R (elementwise
    # squared via r * r) give cumulative adjusted variances.
    q, r = qr(nuz.T)
    cumulative_var = []
    for i in range(1, num_components + 1):
        cumulative_var.append(trace(r[0:i, ] * r[0:i, ]))
    explained_var = [math.sqrt(cumulative_var[0])]
    for i in range(1, num_components):
        explained_var.append(
            math.sqrt(cumulative_var[i]) - math.sqrt(cumulative_var[i - 1]))

    # sort components by decreasing adjusted explained variance
    order = numpy.argsort(explained_var)[::-1]
    components = numpy.take(components, order, axis=0)
    evars = numpy.take(explained_var, order).tolist()
    #evars = numpy.take(explained_var,order)
    #order2 = [0,1,2,4,5,7,12,19]
    #components = numpy.take(components,order2,axis=0)
    #evars = numpy.take(evars,order2).tolist()

    return components, evars
Example #17
0
def test_fit_transform():
    """The 'cd' solver reproduces the 'lars' solver's components."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    reference = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0).fit(Y)
    candidate = SparsePCA(n_components=3, method='cd', random_state=0,
                          alpha=alpha).fit(Y)
    assert_array_almost_equal(candidate.components_, reference.components_)
Example #18
0
class SPCAEstimator():
    """Wrapper around sklearn's SparsePCA exposing components/stdev/variance.

    After ``fit``, components are re-ordered by the standard deviation of
    the projected data, and a warning is printed if they are not mutually
    orthogonal.
    """

    def __init__(self, n_components, alpha=10.0):
        self.n_components = n_components
        self.whiten = False
        self.alpha = alpha  # higher alpha => sparser components
        #self.transformer = MiniBatchSparsePCA(n_components, alpha=alpha, n_iter=100,
        #    batch_size=max(20, n_components//5), random_state=0, normalize_components=True)
        self.transformer = SparsePCA(
            n_components,
            alpha=alpha,
            ridge_alpha=0.01,
            max_iter=100,
            random_state=0,
            n_jobs=-1,
            normalize_components=True)  # TODO: warm start using PCA result?
        self.batch_support = False  # maybe through memmap and HDD-stored tensor
        # Filled in by fit(): per-component stdev of projected data and the
        # total variance of the training data.
        self.stdev = np.zeros((n_components, ))
        self.total_var = 0.0

    def get_param_str(self):
        # Short identifier encoding the hyperparameters (e.g. for filenames).
        return "spca_c{}_a{}{}".format(self.n_components, self.alpha,
                                       '_w' if self.whiten else '')

    def fit(self, X):
        """Fit on X (n_samples, n_features), then sort components by stdev."""
        self.transformer.fit(X)

        # Save variance for later
        self.total_var = X.var(axis=0).sum()

        # Compute projected standard deviations
        # NB: cannot simply project with dot product!
        self.stdev = self.transformer.transform(X).std(
            axis=0)  # X = (n_samples, n_features)

        # Sort components based on explained variance
        idx = np.argsort(self.stdev)[::-1]
        self.stdev = self.stdev[idx]
        # In-place permutation keeps external references to components_ valid.
        self.transformer.components_[:] = self.transformer.components_[idx]

        # Check orthogonality: pairwise dot products should all be ~0.
        dotps = [
            np.dot(*self.transformer.components_[[i, j]])
            for (i, j) in itertools.combinations(range(self.n_components), 2)
        ]
        if not np.allclose(dotps, 0, atol=1e-4):
            print('SPCA components not orghogonal, max dot',
                  np.abs(dotps).max())

    def get_components(self):
        """Return (components, stdev, variance_ratio) from the last fit."""
        var_ratio = self.stdev**2 / self.total_var
        return self.transformer.components_, self.stdev, var_ratio  # SPCA outputs are normalized
Example #19
0
class DimensionalityReducer(object):
    """Standardize features, then project them with SparsePCA."""

    def __init__(self):
        # Both estimators are created lazily by fitPCA().
        self.sc = None
        self.pca = None

    def fitPCA(self, X, nfeats=3):
        """Fit the scaler and an ``nfeats``-component SparsePCA on X."""
        self.sc = StandardScaler()
        self.pca = SparsePCA(n_components=nfeats)
        scaled = self.sc.fit_transform(X)
        self.pca.fit(scaled)

    def transformPCA(self, X):
        """Scale X with the fitted scaler and return its SPCA projection."""
        return self.pca.transform(self.sc.transform(X))
Example #20
0
def test_fit_transform_parallel():
    """A 2-job fit must reproduce the single-job projection."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    single_job = SparsePCA(n_components=3, method='lars', alpha=alpha,
                           random_state=0)
    single_job.fit(Y)
    U1 = single_job.transform(Y)
    # Test multiple CPUs
    multi_job = SparsePCA(n_components=3, n_jobs=2, method='lars',
                          alpha=alpha, random_state=0).fit(Y)
    U2 = multi_job.transform(Y)
    assert_true(not np.all(single_job.components_ == 0))
    assert_array_almost_equal(U1, U2)
Example #21
0
class SparsePCAImpl:
    """Thin adapter delegating fit/transform to a wrapped ``Op`` estimator."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, forwarding ``y`` only when provided."""
        args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*args)
        return self

    def transform(self, X):
        """Delegate transformation to the wrapped model."""
        return self._wrapped_model.transform(X)
Example #22
0
def test_fit_transform_parallel():
    """Parallel (n_jobs=2) and serial fits give identical transforms."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    baseline = SparsePCA(n_components=3, method="lars", alpha=alpha,
                         random_state=0)
    baseline.fit(Y)
    U1 = baseline.transform(Y)
    # Test multiple CPUs
    threaded = SparsePCA(n_components=3, n_jobs=2, method="lars",
                         alpha=alpha, random_state=0).fit(Y)
    U2 = threaded.transform(Y)
    assert not np.all(baseline.components_ == 0)
    assert_array_almost_equal(U1, U2)
Example #23
0
def test_pca_vs_spca():
    """With zero regularization SparsePCA spans the same subspace as PCA."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    proj_pca = pca.transform(Z)
    proj_spca = spca.transform(Z)
    # Bases should agree up to sign: the overlap matrix is the identity.
    overlap = np.abs(spca.components_.dot(pca.components_.T))
    assert_allclose(overlap, np.eye(2), atol=1e-5)
    # Fix each component's arbitrary sign before elementwise comparison.
    proj_pca *= np.sign(proj_pca[0, :])
    proj_spca *= np.sign(proj_spca[0, :])
    assert_allclose(proj_pca, proj_spca)
def spca(components, train_matrix, test_matrix):
    """Sparse principal component analysis routine.

    Parameters
    ----------
    components : int
        The number of components to be returned.
    train_matrix : array
        The training features.
    test_matrix : array
        The test features.

    Returns
    -------
    new_train : array
        Extracted training features.
    new_test : array
        Extracted test features.

    Raises
    ------
    ValueError
        If ``components`` is not greater than 0.
    """
    # Use a real exception instead of ``assert``: asserts are stripped
    # under ``python -O``, silently disabling the validation.
    if components <= 0:
        raise ValueError(
            'The number of components must be a positive int greater than 0.')

    pca = SparsePCA(n_components=components)
    model = pca.fit(X=train_matrix)
    new_train = model.transform(train_matrix)
    new_test = model.transform(test_matrix)

    return new_train, new_test
def test_pca_vs_spca():
    """Unregularized, normalized SparsePCA agrees with plain PCA."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    sparse_est = SparsePCA(alpha=0, ridge_alpha=0, n_components=2,
                           normalize_components=True)
    dense_est = PCA(n_components=2)
    dense_est.fit(Y)
    sparse_est.fit(Y)
    out_dense = dense_est.transform(Z)
    out_sparse = sparse_est.transform(Z)
    assert_allclose(
        np.abs(sparse_est.components_.dot(dense_est.components_.T)),
        np.eye(2), atol=1e-5)
    # Normalize the arbitrary component signs before comparing.
    out_dense *= np.sign(out_dense[0, :])
    out_sparse *= np.sign(out_sparse[0, :])
    assert_allclose(out_dense, out_sparse)
Example #26
0
File: amm.py Project: ljeagle/bolt
def _fitted_sparse_pca(X, d, unscaled_alpha, **kwargs):
    """Return a SparsePCA(d) fitted on X, warm-started from randomized SVD.

    Seeding U_init/V_init from a randomized SVD seems to work better than
    initializing with MiniBatchSparsePCA, an SVD of the covariance matrix,
    or basically anything else tried.
    """
    U, _, Vt = randomized_svd(X, n_components=d, random_state=123)
    code_init = U[:, :d]
    dict_init = Vt.T[:d]

    # SparsePCA (and all the sklearn dictionary learning stuff) internally
    # uses the sum of squared errors per sample plus the L1 norm of the
    # parameter matrix; to make alpha meaningful across datasets, scale by
    # the number of examples (effectively MSE), divide by the parameter
    # size D, and scale by the variance of the data for similar reasons.
    N, D = X.shape
    alpha = unscaled_alpha * np.var(X - X.mean(axis=0)) * N / D
    verbose = 1
    estimator = SparsePCA(
        n_components=d,
        alpha=alpha,
        normalize_components=True,
        method='lars',
        U_init=code_init,
        V_init=dict_init,
        max_iter=10,
        # ridge_alpha=1e8,
        ridge_alpha=max(1, len(X) * X.std() * 10),
        verbose=verbose,
        random_state=123)
    if verbose > 0:
        print("fitting sparse pca...")
    return estimator.fit(X)
Example #27
0
def spca(data, num_components=None, alpha=1):
    """Sparse PCA with unit-norm components and adjusted explained variance.

    Builds a matrix from the NaN-free flattened samples in ``data``, fits
    SparsePCA on the centered data, normalizes each component to unit norm,
    computes the adjusted explained variance via QR as in "Sparse Principal
    Component Analysis" (Zou, Hastie, Tibshirani), and returns
    (components, evars) sorted by decreasing variance.
    """
    # Keep only samples without NaNs; flatten each into one row.
    data = [d.flatten() for d in data if not any(isnan(d))]
    datamatrix = row_stack(data)

    # Center the data.
    cdata = datamatrix - mean(datamatrix, axis=0)

    if num_components is None:
        num_components = cdata.shape[0]

    # Fit sparse PCA on the centered matrix.
    spca = SparsePCA(n_components=num_components, alpha=alpha)
    spca.fit(cdata)

    # Normalize each component to unit L2 norm (skip all-zero components).
    # PY3 FIX: the original used ``xrange``, which no longer exists.
    components = spca.components_.T
    for r in range(0, components.shape[1]):
        compnorm = numpy.apply_along_axis(numpy.linalg.norm, 0,
                                          components[:, r])
        if compnorm != 0:
            components[:, r] /= compnorm
    components = components.T

    # Adjusted explained variance from "Sparse Principal Component
    # Analysis" by Zou, Hastie, Tibshirani.
    spca.components_ = components
    # NOTE(review): solver='dense_cholesky' is a legacy sklearn spelling
    # (later releases use 'cholesky') — verify against the pinned version.
    nuz = ridge_regression(spca.components_.T, cdata.T, 0.01,
                           solver='dense_cholesky').T

    # Leading principal submatrices of R (elementwise squared) give the
    # cumulative adjusted variances.
    q, r = qr(nuz.T)
    cumulative_var = []
    for i in range(1, num_components + 1):
        cumulative_var.append(trace(r[0:i, ] * r[0:i, ]))
    explained_var = [math.sqrt(cumulative_var[0])]
    for i in range(1, num_components):
        explained_var.append(
            math.sqrt(cumulative_var[i]) - math.sqrt(cumulative_var[i - 1]))

    # Sort components by decreasing adjusted explained variance.
    order = numpy.argsort(explained_var)[::-1]
    components = numpy.take(components, order, axis=0)
    evars = numpy.take(explained_var, order).tolist()

    return components, evars
Example #28
0
def testSparse(n_components, alpha):
    """Fit SparsePCA on the module-level ``data`` and scatter-plot it."""
    estimator = SparsePCA(n_components=n_components, alpha=alpha)
    projected = estimator.fit(data).transform(data)
    plt.scatter(projected[:, 0], projected[:, 1], c=labels,
                cmap='nipy_spectral')
    plt.show()
def test_fit_transform_tall():
    """fit_transform equals fit().transform() on a tall matrix."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    one_shot = SparsePCA(n_components=3, method="lars", random_state=rng)
    U1 = one_shot.fit_transform(Y)
    two_step = SparsePCA(n_components=3, method="cd", random_state=rng)
    U2 = two_step.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
Example #30
0
def test_fit_transform_tall():
    """LARS fit_transform matches CD fit-then-transform on tall data."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    lars_model = SparsePCA(n_components=3, method='lars', random_state=rng)
    U1 = lars_model.fit_transform(Y)
    cd_model = SparsePCA(n_components=3, method='cd', random_state=rng)
    U2 = cd_model.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
Example #31
0
    def _OnClick2(self, event):
        """Toggle-button handler: run Sparse PCA when switching On -> Off.

        Reads the data/label CSV paths from widget state, standardizes the
        data, projects onto two sparse principal components, plots the
        clusters, saves the figure and exports the components to Excel.
        """
        if self.var2.get() == "Off":
            self.var2.set("On")
        elif self.var2.get() == "On":
            self.var2.set("Off")
            print("Sparse PCA is running...")
            label = pd.read_csv(self.labelVar, header=None)[0].tolist()
            df = pd.read_csv(self.dfLabel, header=None)
            data, label = df, label
            # Standardize the data before PCA.
            data = StandardScaler().fit_transform(data)

            # apply PCA
            sparsepca = SparsePCA(n_components=2)

            # get 1st and 2nd components
            sparsepca.fit(data)
            SparseprincipalComponents = sparsepca.fit_transform(data)
            SparseprincipalDf = pd.DataFrame(
                data=SparseprincipalComponents,
                columns=['Component 1', 'Component 2'])
            print("Our principal components are: ")
            print(SparseprincipalComponents)
            X_r1 = SparseprincipalComponents[:, 0]
            X_r2 = SparseprincipalComponents[:, 1]
            unique = np.unique(label)
            # BUG FIX: the original concatenated an int with a str
            # (``len(np.unique(label)) + "***"``) and raised TypeError
            # before anything was plotted.
            print(str(len(unique)) + "*************************")
            try:
                plt.scatter(X_r1, X_r2, c=label)
            except:
                print(
                    "Data matrix does not match label matrix (Select input file and label, remove headers)"
                )

            name = 'Sparse_PCA'  #CHANGE FILENAME HERE *************************************************************************
            #plt.legend(unique, loc=8, ncol=5,fontsize='x-small')
            plt.title(name + " Clusters: " + str(len(unique)))
            # BUG FIX: save BEFORE show() — show() clears the current
            # figure, so the original saved a blank PNG.
            plt.savefig(name + ".png")
            plt.show()
            plt.clf()

            # save  1st and 2nd components to csv
            SparseprincipalDf.to_excel(
                "Sparse_PCA_Components.xlsx"
            )  #Names of 1st and 2nd components to EXCEL here *************************************************************************
Example #32
0
def test_fit_transform_variance():
    """With PCA's components installed, SPCA reports PCA's variances."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    sparse_est = SparsePCA(n_components=3, method='lars', alpha=alpha,
                           random_state=0, variance=True)
    dense_est = PCA(n_components=3, random_state=0)

    dense_est.fit(Y)
    # no need to fit spca for this
    sparse_est.fit(Y)

    # Install the exact PCA components, then compare the reported
    # explained variances.
    sparse_est.components_ = dense_est.components_
    assert_array_almost_equal(dense_est.explained_variance_,
                              sparse_est.explained_variance_)
Example #33
0
class SPCA:
    """SparsePCA wrapper that drops NaN/inf rows before fit/transform.

    ``predict`` returns an array aligned with the input rows: rows dropped
    because of NaN/inf come back as all-NaN rows.
    """

    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        # RFE_CV is accepted for interface parity but not supported.
        self.rfe_cv = rfe_cv
        self.model = SparsePCA(*args, **kwargs)

    def fit(self, X, y):
        """Fit on the rows of X/y that contain no NaN or +/-inf values."""
        # Stack X and y so a bad value in either drops the whole row;
        # map +/-inf to NaN so a single isna mask covers both cases.
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        keep = ~pandas.isna(Z).any(axis=1)
        X_, y_ = X[keep, :], y[keep]
        # BUG FIX: the original compared Z.shape[0] with X.shape[0], which
        # are always equal, so the diagnostic below could never fire.
        if X_.shape[0] != X.shape[0]:
            print(
                'FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            self.model.fit(X_)

    def predict(self, X):
        """Transform X; rows containing NaN/inf come back as NaN rows."""
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        # BUG FIX: same always-false row-count comparison as in fit().
        if X_.shape[0] != X.shape[0]:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            # Re-inflate to the original row count, NaN-filling dropped rows.
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan,
                           dtype=numpy.float64)
            Z[nan_mask, :] = predicted
        return Z
Example #34
0
def test_fit_transform():
    """CD matches LARS; ridge_alpha on transform() is deprecated."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    fitted_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                            random_state=0)
    fitted_lars.fit(Y)

    # Test that CD gives similar results
    fitted_cd = SparsePCA(n_components=3, method='cd', random_state=0,
                          alpha=alpha)
    fitted_cd.fit(Y)
    assert_array_almost_equal(fitted_cd.components_, fitted_lars.components_)

    # Both an explicit value and None must trigger the deprecation warning.
    warning_msg = "The ridge_alpha parameter on transform()"
    for ridge in (0.01, None):
        assert_warns_message(DeprecationWarning, warning_msg,
                             fitted_lars.transform, Y, ridge_alpha=ridge)
Example #35
0
def compute_robust_low_rank(data, total_k, range_kprime, d):
    # PCA + sparse PCA
    """For each k in range_kprime: project ``data`` onto its top-k
    eigenspace, complete the rank to ``total_k`` with sparse PCA on the
    (scaled) residual, and score the combined projector via solveGroth.

    Returns [projection_error, sdp_val, minPi, min_sdp_val].
    NOTE(review): ``Count`` is incremented once and never used, and
    ``min_sdp_val`` keeps its initial value — verify the intent.
    """
    Count = 0
    Count += 1
    projection_error = []
    sdp_val = []
    minPi = []
    min_sdp_val = 1000000

    for k in range_kprime:
        print("processing k=", k)
        # Top-k eigenpairs of data; Pi is the rank-k projector.
        eigs, eigvecs = lasp.eigsh(data, k=k, which='LA', tol=0.00001)
        Pi = np.matmul(eigvecs, eigvecs.T)
        projected_data = np.matmul(Pi, np.matmul(data, Pi))

        if k < total_k:
            spca = SparsePCA(n_components=total_k - k,
                             random_state=0,
                             alpha=1e-5,
                             normalize_components=True)

            # Sparse PCA on the residual, scaled by 100 before fitting.
            spca.fit(100 * (data - projected_data))

            u = spca.components_
            # Project the sparse directions onto the complement of Pi.
            A = np.matmul(np.eye(d) - Pi, np.matmul(u.T, u))
            B = np.matmul(A, np.eye(d) - Pi)
            eigval, U = lasp.eigsh(B, k=total_k - k, which='LA', tol=0.00001)

            # Keep only directions with eigenvalue strictly above threshold.
            D = 1.0 * np.diag(eigval > 0.00001)
            U = np.matmul(U, D)

            # Augmented projector: original top-k plus sparse completions.
            sPi = Pi + np.matmul(U, U.T)
        else:
            sPi = Pi

        projected_data = np.matmul(sPi, np.matmul(data, sPi))
        projection_error.append(np.trace(data) - np.trace(projected_data))
        [curr_y, min_val, curr_alpha, avg_y_val] = solveGroth(sPi, d)
        sdp_val.append(min_val)
        minPi.append(sPi)
    return [projection_error, sdp_val, minPi, min_sdp_val]
Example #36
0
def spca_run(alpha=1):
    """Project the module-level ``data`` onto two sparse PCs and plot."""
    estimator = SparsePCA(n_components=2, alpha=alpha)
    projected = estimator.fit(data).transform(data)

    fig, axs = plt.subplots(1, 1)
    axs.scatter(projected[:, 0], projected[:, 1], c=labels, cmap='rainbow')
    axs.set_xlabel('PC1')
    axs.set_ylabel('PC2')

    plt.show()
Example #37
0
    def tu_spca(self, dataname="kong", components_n=1, data=None):
        """Run SparsePCA on ``data`` (or synthetic blobs when None), save
        the projected values and render bar charts of the components.

        Returns a list of status messages.
        """
        # Synthetic test data used when the caller supplies none.
        X, y = make_blobs(n_samples=10000,
                          n_features=3,
                          centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                          cluster_std=[0.2, 0.1, 0.2, 0.2],
                          random_state=9)
        # BUG FIX: ``data == None`` compares elementwise on arrays and
        # raises "truth value is ambiguous"; use ``is None``.
        if data is None:
            data = X

        message = []
        # Fit the estimator.
        spca = SparsePCA(n_components=components_n,
                         normalize_components=True,
                         random_state=0)
        # BUG FIX: the original always fit/transformed ``X``, silently
        # ignoring a caller-supplied ``data``.
        spca.fit(data)
        # Save the projected data.
        value = spca.transform(data)
        save_helper.save_txt_helper(value, dataname)

        components = spca.components_
        error = spca.error_
        page2 = Page()
        # One bar chart per sparse component.
        for j in range(0, components.shape[0]):
            bar1 = Bar("稀疏组建" + str(j))
            bar1.add("", [
                "components_" + str(i) for i in range(0, components.shape[1])
            ], components[j])
            page2.add(bar1)
        message.append("我们仅提供稀疏组建和数据误差供给分析")

        print(error)
        bar2 = Bar("数据误差分析")
        bar2.add("", ["error" + str(i) for i in range(0, len(error))], error)
        page2.add(bar2)
        save_helper.save_tu_helper(page2, dataname)

        return message
Example #38
0
def spca_fn(X):
    """Return 1-component PCA and SparsePCA projections of X.

    X is transposed first unless it already has shape (7501, 6).
    """
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn.decomposition import SparsePCA
    if X.shape != (7501, 6):
        X = np.transpose(X)

    dense_proj = PCA(n_components=1).fit(X).transform(X)
    sparse_proj = SparsePCA(n_components=1).fit(X).transform(X)

    return dense_proj, sparse_proj
def spca_alpha(Y_matrix, max_features, eps = 1e-4):
    """Bisection search on [0, 1] for the alpha whose first sparse PC has
    exactly ``max_features`` non-zero entries.

    Y_matrix = input matrix
    max_features = maximum number of non-zero elements in the first sparse principal component
    eps = convergence parameter

    Returns the alpha found, or None if the interval shrinks below eps.
    """
    lo, hi = 0, 1
    while True:
        mid = 0.5 * (hi + lo)

        candidate = SparsePCA(n_components=1, alpha=mid)
        candidate.fit(Y_matrix)
        nnz = sum(entry != 0 for entry in candidate.components_[0])

        if nnz == max_features:
            return mid
        if hi - lo < eps:
            return None
        # Larger alpha means sparser components, so shrink towards it.
        if nnz < max_features:
            hi = mid
        else:
            lo = mid
Example #40
0
def WeightsEstimatedFromSparsePCAWithWeightedCovariance(ret_p, n_com=30):
    """Estimate portfolio weights via SparsePCA fitted on a weighted covariance matrix.

    Parameters:
        ret_p: DataFrame of portfolio returns (columns = portfolios); columns
            that are entirely NaN are dropped before fitting.
        n_com: number of sparse components to extract (default 30).

    Returns:
        DataFrame of sign-adjusted, L1-normalized weights, with columns
        ordered by descending |SR| of the transformed portfolios.
    """
    ret_port = ret_p.dropna(how='all', axis=1)
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    cov_matrix = WeightedCovariance(ret_port)
    tf.fit(cov_matrix)  # fitted on the covariance matrix -- mind the magnitude/scale
    # NOTE(review): the result of this transform is discarded; the call looks
    # redundant (transform does not mutate the fitted estimator) -- confirm.
    tf.transform(
        ret_port.fillna(0.0)
    )  # .apply(lambda x:x.where(~x.isnull(),x.mean()),axis=0))#,index=date_investing[date_investing<'2019-12'])
    # Adjust each component's sign by the mean return of its transformed portfolio.
    weights = pd.DataFrame(tf.components_, columns=cov_matrix.columns).T
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Flip sign toward positive mean return and L1-normalize each component.
    for c in ret_transformed_port.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Rank components -- could select by t-value or by Sharpe ratio; SR is used here.
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port)).T.sort_values(
            by='SR', ascending=False).index
    # NOTE(review): signs were already aligned above, so this second flip
    # should be a no-op unless normalization changed a mean's sign -- confirm.
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
Example #41
0
def WeightsEstimatedFromSparsePCA(ret_port, n_com=25):
    """Estimate portfolio weights via SparsePCA fitted on demeaned returns.

    Parameters:
        ret_port: DataFrame of portfolio returns (columns = portfolios).
        n_com: number of sparse components to extract (default 25).

    Returns:
        DataFrame of sign-adjusted, L1-normalized weights restricted to the
        top ~2/3 of components ranked by |SR|.

    NOTE(review): this function reads the module-level globals
    ``signal_names`` and ``cov_chara_ret`` instead of deriving them from
    ``ret_port`` -- possibly a copy/paste leftover; confirm intent.
    """
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    tf.fit(ret_port.agg(lambda x: x - x.mean()).fillna(0.0))  # fitted on demeaned returns -- mind the magnitude/scale
    # NOTE(review): the result of this transform is discarded; the call looks
    # redundant (transform does not mutate the fitted estimator) -- confirm.
    tf.transform(
        ret_port.fillna(0.0)
    )  # .apply(lambda x:x.where(~x.isnull(),x.mean()),axis=0))#,index=date_investing[date_investing<'2019-12'])
    # Adjust each component's sign by the mean return of its transformed portfolio.
    weights = pd.DataFrame(tf.components_, columns=signal_names.split(',')).T
    ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Flip sign toward positive mean return and L1-normalize each component.
    for c in weights.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Rank components -- could select by t-value or by Sharpe ratio; SR is used here.
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port.dropna(
            how='all',
            axis=1))).T.sort_values(by='SR',
                                    ascending=False).index[:int(n_com * 0.67)]
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
def main():
    """Load activity logs, group visits into sessions, cluster them, and plot.

    Reads accounts.csv / nodevisits.csv / submissions.csv, merges each
    account's node visits and submissions into a time-ordered stream, splits
    that stream into sessions separated by more than 15 minutes of
    inactivity, projects session features with SparsePCA, clusters with
    k-means, and plots both the scatter and per-outcome histograms.
    """
    accounts = csv_to_dict('accounts.csv', 0, cast_evals=[str, read_time, readOutcome], type="account")
    account_nodes = csv_to_dict('nodevisits.csv', 1, cast_evals=[str, str, read_time, str], type="node")
    account_submissions = csv_to_dict('submissions.csv', 1, cast_evals=[str, str, read_time, str, str], type="submission")

    # Merge node visits and submissions per account, ordered by timestamp.
    account_visits = account_nodes
    for acc in account_visits:
        # .get() guards accounts that have node visits but no submissions
        # (the original indexed directly and would raise KeyError).
        account_visits[acc].extend(account_submissions.get(acc, []))
        account_visits[acc] = sorted(account_visits[acc], key=lambda k: k['time'])
    session_length(account_visits)

    # Build sessions based on time scale determined from previous code as 15 minutes.
    sessions = []
    for acc in account_visits:
        visits = account_visits[acc]
        if not visits:
            # No activity -> no session. The original appended an empty list
            # placeholder here, which crashed the start/end-time pass below.
            continue
        actions = None
        for idx, visit in enumerate(visits):
            if idx == 0:
                actions = {"node": [], "submission": [], "learning_outcome": accounts[acc][0]["learning_outcome"]}
            else:
                # Time between consecutive visits, in minutes.
                delta_time = delta_minutes(visit["time"], visits[idx - 1]["time"])
                # A gap above 15 minutes closes the current session.
                if delta_time > 15:
                    sessions.append(actions)
                    actions = {"node": [], "submission": [], "learning_outcome": accounts[acc][0]["learning_outcome"]}
            actions[visit["type"]].append(visit)
        sessions.append(actions)

    # Derive each session's start/end from whichever event lists are non-empty.
    for session in sessions:
        if session["node"] and session["submission"]:
            session["start_time"] = min(session["node"][0]["time"], session["submission"][0]["time"])
            session["end_time"] = max(session["node"][-1]["time"], session["submission"][-1]["time"])
        elif session["node"]:
            session["start_time"] = session["node"][0]["time"]
            # BUG FIX: the original indexed the node list with
            # len(session["submission"]) - 1, which only gave the last node
            # by accident (submissions are empty here, so the index was -1).
            session["end_time"] = session["node"][-1]["time"]
        else:
            session["start_time"] = session["submission"][0]["time"]
            session["end_time"] = session["submission"][-1]["time"]

    # Remove sessions without any time difference.
    sessions = [session for session in sessions if delta_minutes(session["end_time"], session["start_time"]) != 0]

    X = session_properties(sessions)
    X = standardize(X)
    pca = SparsePCA(n_components=2)
    # Negative one just makes the plot easier to look at; PCA is sign
    # insensitive so there is no real effect.
    X_r = -1 * pca.fit(X).transform(X)

    kmeans = cluster.KMeans(n_clusters=4)
    group = kmeans.fit_predict(X_r)

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    plt.rc('font', family='serif', size=20)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.scatter(X_r[:, 0], X_r[:, 1], s=20, marker='o', c=group)
    plt.show()

    # Histogram of cluster membership, split by learning outcome (0..3).
    outcomes = np.asarray([session["learning_outcome"] for session in sessions])
    session_by_outcome = []
    tags = []
    labels = get_labels(X_r, group, 4)
    for result in range(0, 4):
        session_by_outcome.append(group[outcomes == result])
        if result == 0:
            tags.append("No certificate achieved")
        else:
            tags.append("Mastery Level = " + str(result))

    plot_hist(session_by_outcome, x_min = 0, x_max = 4, y_min = 0, y_max = 1, bins = 4, tags = tags, y_label = "Fraction of sessions", labels=labels)
Example #43
0
File: spca.py Project: mikss/sdp-ex
# Synthetic spiked-covariance experiment: N samples in P dimensions with one
# sparse spike direction V of sparsity K and spike strength T.
N = 500
P = 10
MU = [0] * P
T = 1  # spike level
K = 2  # sparsity level
# First K entries are 1..K, rest zero; normalized to a unit vector below.
V = list(range(1,K+1)) + [0]*(P-K)
V = V / np.linalg.norm(V)
# Covariance = identity + rank-one spike T * V V^T.
# NOTE(review): np.matrix is deprecated in NumPy; np.identity + T*np.outer(V, V)
# would be the modern equivalent -- confirm before changing, since X's type
# (np.matrix) flows into the fits below.
SIG = np.identity(P) + T * np.matrix(V).transpose() * np.matrix(V)
X = np.matrix(np.random.multivariate_normal(MU,SIG,N))

#####

# using scikit-learn method for Sparse PCA (like an l1-regularized dictionary learning problem)
from sklearn.decomposition import SparsePCA
spca = SparsePCA(n_components=1, alpha=5)
spca.fit(X)

# Classical PCA baseline on the same sample, for comparison.
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(X)

print('Classical 1st principal component:', pca.components_)
print('Sparse 1st principal component:', spca.components_)

#####

# TODO: SDP implementation a la El Ghaoui, Bach, D'Aspremont
import cvxopt
# TWO CONSTRAINTS
# trace = 1 (multiply with identity)
# l1 norm <= k (multiply with all 1s matrix)
Example #44
0
class SPCA(object):
    r"""
    Wrapper for sklearn package.  Performs sparse PCA

    SPCA has 5 methods:
       - fit(waveforms)
       update class instance with SPCA fit

       - fit_transform()
       do what fit() does, but additionally return the projection onto SPCA space

       - inverse_transform(A)
       inverses the decomposition, returns waveforms for an input A, using Z

       - get_basis()
       returns the basis vectors Z^\dagger

       - get_params()
       returns metadata used for fits.
    """
    def __init__(self, num_components=10,
                 catalog_name='unknown',
                 alpha = 0.1,
                 ridge_alpha = 0.01,
                 max_iter = 2000,
                 tol = 1e-9,
                 n_jobs = 1,
                 random_state = None):

        # Stash the configuration so get_params() can report it alongside
        # the underlying estimator's parameters.
        self._decomposition  = 'Sparse PCA'
        self._num_components = num_components
        self._catalog_name   = catalog_name
        self._alpha          = alpha
        self._ridge_alpha    = ridge_alpha
        self._n_jobs         = n_jobs
        self._max_iter       = max_iter
        self._tol            = tol
        self._random_state   = random_state

        # The wrapped sklearn estimator, configured from the args above.
        self._SPCA = SparsePCA(n_components=self._num_components,
                              alpha        = self._alpha,
                              ridge_alpha  = self._ridge_alpha,
                              n_jobs       = self._n_jobs,
                              max_iter     = self._max_iter,
                              tol          = self._tol,
                              random_state = self._random_state)

    def fit(self,waveforms):
        """Fit the sparse PCA decomposition to *waveforms* (rows = samples)."""
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._SPCA.fit(self._waveforms)

    def fit_transform(self,waveforms):
        """Fit the decomposition and return the projection of *waveforms*."""
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._A = self._SPCA.fit_transform(self._waveforms)
        return self._A

    def inverse_transform(self,A):
        """Map projected data *A* back to waveform space using the fit."""
        # convert basis back to waveforms using fit
        new_waveforms = self._SPCA.inverse_transform(A)
        return new_waveforms

    def get_params(self):
        """Return the estimator's parameters plus wrapper metadata."""
        # TODO know what catalog was used! (include waveform metadata)
        params = self._SPCA.get_params()
        params['num_components'] = params.pop('n_components')
        # NOTE(review): 'Decompositon' is misspelled but is part of the
        # returned dict's keys -- kept for backward compatibility.
        params['Decompositon'] = self._decomposition
        return params

    def get_basis(self):
        r""" Return the SPCA basis vectors (Z^\dagger)"""
        Zt = self._SPCA.components_
        return Zt
Example #45
0
 def fit(self, dif_df):
     """Fit a SparsePCA factorization to the differenced price frame.

     Stores the fitted estimator on ``self.factorization`` and records the
     column labels on ``self.ticker_symbols_used``.
     """
     model = SparsePCA(n_components=self.n_components, alpha=0.03)
     # Drop the first row before fitting -- presumably an initialization /
     # seed row in the differenced data; confirm against the caller.
     samples = dif_df.values[1:]
     self.ticker_symbols_used = dif_df.columns.values
     model.fit(samples)
     self.factorization = model