Ejemplo n.º 1
0
  def export(self, query, n_topics, n_words, title="PCA Export", fname="PCAExport"):
    """Project the topic space to 2D with PCA and save a labeled scatter plot.

    Args:
      query: collection of words; the first top word of each topic that appears
        in it becomes that topic's point label.
      n_topics: number of topics to project and plot.
      n_words: number of top words per topic used to build the vector space.
      title: figure title.
      fname: file name passed to pyplot.savefig.
    """
    vec = DictVectorizer()

    rows = topics_to_vectorspace(self.model, n_topics, n_words)
    X = vec.fit_transform(rows)
    pca = skPCA(n_components=2)
    X_pca = pca.fit(X.toarray()).transform(X.toarray())

    # Build exactly one label per topic, falling back to '' when no topic word
    # occurs in the query.  (The original appended nothing in that case, so
    # `match` could be shorter than X_pca and match[i] raised IndexError.)
    match = []
    for i in range(n_topics):
      topic = [t[1] for t in self.model.show_topic(i, len(self.dictionary.keys()))]
      label = next((word for word in topic if word in query), '')
      match.append(label)

    pyplot.figure()
    for i in range(X_pca.shape[0]):
      pyplot.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5)
      pyplot.text(X_pca[i, 0], X_pca[i, 1], s=' '.join([str(i), match[i]]))

    pyplot.title(title)
    pyplot.savefig(fname)

    pyplot.close()
Ejemplo n.º 2
0
def test_pca_fit(datatype, input_type, name, use_handle):
    """Fit cuML and sklearn PCA on the same data and compare fitted attributes."""

    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)
    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    cupca.handle.sync()

    attrs = ('singular_values_', 'components_', 'explained_variance_',
             'explained_variance_ratio_')
    for attr in attrs:
        # component vectors are only defined up to sign
        with_sign = attr != 'components_'
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
Ejemplo n.º 3
0
def pca_comparison_kropt():
    """Visualize the Kropt dataset reduced to 2D by three PCA variants."""
    X, y = datasets.load_kropt()

    # Custom PCA, sklearn PCA and sklearn IncrementalPCA, plotted side by side
    custom_pca = PCA(n_components=2, verbose=True)
    sk_pca = skPCA(n_components=2)
    inc_pca = IncrementalPCA(n_components=2, batch_size=5000)

    projections = [
        (custom_pca.fit_transform(X), 'Custom PCA Kropt'),
        (sk_pca.fit_transform(X), 'Sklearn PCA Kropt'),
        (inc_pca.fit_transform(X), 'Sklearn IPCA Kropt'),
    ]

    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
    for axis, (points, label) in zip(ax, projections):
        axis.scatter(points[:, 0], points[:, 1])
        axis.title.set_text(label)
        axis.set_xlabel('PC1')
        axis.set_ylabel('PC2')

    plt.show()
Ejemplo n.º 4
0
def test_pca_defaults(n_samples, n_features, sparse):
    """Default-parameter cuML PCA should agree with sklearn on dense/sparse X."""
    if sparse:
        X = cupyx.scipy.sparse.random(n_samples,
                                      n_features,
                                      density=0.03,
                                      dtype=cp.float32,
                                      random_state=10)
    else:
        X, Y = make_multilabel_classification(n_samples=n_samples,
                                              n_features=n_features,
                                              n_classes=2,
                                              n_labels=1,
                                              random_state=1)

    cupca = cuPCA()
    cupca.fit(X)
    curesult = cupca.transform(X)
    cupca.handle.sync()

    # densify for the sklearn reference fit
    if sparse:
        X = X.toarray().get()
    skpca = skPCA()
    skpca.fit(X)
    skresult = skpca.transform(X)

    assert cupca.svd_solver == skpca.svd_solver
    assert skpca.components_.shape[0] == cupca.components_.shape[0]
    assert skresult.shape == curesult.shape
    assert array_equal(curesult, skresult, 1e-3, with_sign=False)
Ejemplo n.º 5
0
def pca_comparison_satimage():
    """Plot 2D projections of SatImage produced by three PCA implementations."""
    X, y = datasets.load_satimage()

    # custom PCA, sklearn PCA, sklearn IncrementalPCA — all to 2 components
    X_trans1 = PCA(2, verbose=True).fit_transform(X)
    X_trans2 = skPCA(2).fit_transform(X)
    X_trans3 = IncrementalPCA(2).fit_transform(X)

    fig = plt.figure(figsize=(15, 5))
    titles = ('SatImage PCA', 'SatImage sklearn PCA',
              'SatImage sklearn IncrementalPCA')
    for pos, (points, label) in enumerate(
            zip((X_trans1, X_trans2, X_trans3), titles), start=1):
        ax = fig.add_subplot(1, 3, pos)
        ax.set_title(label)
        ax.plot(points[:, 0], points[:, 1], 'o')
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
    plt.show()
Ejemplo n.º 6
0
def pca_comparison_credita():
    """Compare 2-component projections of Credit-A from three PCA variants."""
    X, y = datasets.load_credita()

    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
    plt.subplots_adjust(bottom=.10, left=.05, top=.90, right=.95)

    # our implementation
    X_trans1 = PCA(2, verbose=True).fit_transform(X)
    ax[0].scatter(X_trans1[:, 0], X_trans1[:, 1])
    ax[0].title.set_text('2-component PCA on Credit-A')

    # sklearn's batch PCA
    X_trans2 = skPCA(2).fit_transform(X)
    ax[1].scatter(X_trans2[:, 0], X_trans2[:, 1])
    ax[1].title.set_text('2-component PCA (sklearn) on Credit-A')

    # sklearn's incremental PCA
    X_trans3 = IncrementalPCA(2).fit_transform(X)
    ax[2].scatter(X_trans3[:, 0], X_trans3[:, 1])
    ax[2].title.set_text('2-component IncrementalPCA (sklearn) on Credit-A')

    plt.show()
Ejemplo n.º 7
0
def test_pca_fit(datatype, input_type):
    """cuML PCA attributes should match sklearn for ndarray and cudf inputs."""
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)
    skpca = skPCA(n_components=2)
    skpca.fit(X)

    cupca = cuPCA(n_components=2)
    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        cupca.fit(gdf)
    else:
        cupca.fit(X)

    for attr in ('singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_', 'noise_variance_'):
        # components_ are only defined up to sign
        with_sign = attr != 'components_'
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
Ejemplo n.º 8
0
def test_pca_fit_transform(datatype, input_type,
                           name, use_handle):
    """fit_transform parity check between cuML and sklearn PCA."""
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    # the huge blobs dataset is only exercised on the GPU side
    run_sklearn = name != 'blobs'
    if run_sklearn:
        skpca = skPCA(n_components=2)
        Xskpca = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    X_cupca = cupca.fit_transform(X)
    cupca.handle.sync()

    if run_sklearn:
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
Ejemplo n.º 9
0
def run_pca(X, n_components, svd_solver, whiten, random_state, model):
    """Fit and transform X with the selected PCA backend, timing both steps.

    Args:
        X: 2D feature array.
        n_components: number of principal components to keep.
        svd_solver: solver name (forwarded to the sklearn/cuml backends).
        whiten: whether to whiten the components.
        random_state: seed for reproducibility (ignored by h2o4gpu).
        model: backend name — 'sklearn', 'h2o4gpu' or 'cuml'.

    Returns:
        The fitted PCA object, with the transformed data attached as its
        ``transformed_result`` attribute.

    Raises:
        NotImplementedError: for an unrecognized ``model`` name.
    """
    if model == 'sklearn':
        pca = skPCA(n_components=n_components,
                    svd_solver=svd_solver,
                    whiten=whiten,
                    random_state=random_state)
    elif model == 'h2o4gpu':
        from h2o4gpu.solvers.pca import PCAH2O as h2oPCA
        # h2o4gpu's PCA does not accept a random_state argument
        pca = h2oPCA(n_components=n_components,
                     whiten=whiten)
    elif model == 'cuml':
        from cuSKL import PCA as cumlPCA
        pca = cumlPCA(n_components=n_components,
                      svd_solver=svd_solver,
                      whiten=whiten,
                      random_state=random_state)
    else:
        raise NotImplementedError

    @timer
    def fit_(pca, X, model):
        # timed wrapper around fit
        pca.fit(X)
        return pca

    @timer
    def transform_(pca, X, model):
        # timed wrapper around transform
        return pca.transform(X)

    pca = fit_(pca, X, model=model)
    Xpca = transform_(pca, X, model=model)
    # attach the transformed data directly (the original first bound a
    # throwaway `lambda: None` that setattr immediately overwrote — dead store)
    pca.transformed_result = Xpca
    return pca
Ejemplo n.º 10
0
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    """Compare cuML fit_transform output with sklearn on several datasets."""
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    # only compare against sklearn when the dataset is CPU-sized
    compare = name != 'blobs'
    if compare:
        skpca = skPCA(n_components=2)
        Xskpca = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        # round-trip through pandas so cudf receives named feature columns
        X = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        X_cupca = cupca.fit_transform(X_cudf)
    else:
        X_cupca = cupca.fit_transform(X)
    cupca.handle.sync()

    if compare:
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
Ejemplo n.º 11
0
def test_pca_defaults(n_samples, n_features, sparse):
    """Default-parameter parity test between cuML and sklearn PCA."""
    # FIXME: Disable the case True-300-200 due to flaky test
    if sparse and n_features == 300 and n_samples == 200:
        pytest.xfail('Skipping the case True-300-200 due to flaky test')

    if sparse:
        X = cupyx.scipy.sparse.random(n_samples,
                                      n_features,
                                      density=0.03,
                                      dtype=cp.float32,
                                      random_state=10)
    else:
        X, Y = make_multilabel_classification(n_samples=n_samples,
                                              n_features=n_features,
                                              n_classes=2,
                                              n_labels=1,
                                              random_state=1)

    cupca = cuPCA()
    cupca.fit(X)
    curesult = cupca.transform(X)
    cupca.handle.sync()

    # sklearn gets the same data densified to a NumPy array
    if sparse:
        X = X.toarray().get()
    skpca = skPCA()
    skpca.fit(X)
    skresult = skpca.transform(X)

    assert cupca.svd_solver == skpca.svd_solver
    assert skpca.components_.shape[0] == cupca.components_.shape[0]
    assert skresult.shape == curesult.shape
    assert array_equal(curesult, skresult, 1e-3, with_sign=False)
Ejemplo n.º 12
0
def runPCA(log_name):
    """Cluster the PCA-reduced global ``df`` with KMeans and log the metrics.

    Writes evaluation metrics, dataset statistics and timing information to
    ``<log_name>_PCA.txt``.  Relies on the module-level ``df``, ``NUM_RUN``,
    ``removeHalfUpperCosts`` and ``runEvalMetrics``.
    """
    log_name = log_name + '_PCA.txt'
    # 'with' replaces the bare open()/close() pair so the log file is closed
    # even if an evaluation step raises
    with open(log_name, 'w') as f:
        X = df.iloc[:, 0:-1]
        y = df.iloc[:, -1]
        X_scaled = skPreprocessing.scale(X)
        # params
        numClass = len(np.unique(y))
        numDataPnt = X_scaled.shape[0]
        numDimen = X_scaled.shape[1]
        # run PCA, then KMeans NUM_RUN times with random restarts
        start_time = time.time()
        pca = skPCA(n_components=numClass)
        X_pca = pca.fit(X_scaled).transform(X_scaled)
        costs = np.zeros(NUM_RUN)
        labels_pred = np.zeros((NUM_RUN, numDataPnt))
        for i in range(NUM_RUN):
            kmeans_model = skCluster.KMeans(n_clusters=numClass,
                                            init='random').fit(X_pca)
            costs[i] = kmeans_model.inertia_
            labels_pred[i] = kmeans_model.labels_
        end_time = time.time()
        # keep the cheaper half of the runs (the original pre-allocated
        # labels_half with np.zeros only to overwrite it here — dead store)
        labels_half = removeHalfUpperCosts(costs, labels_pred, numDataPnt)
        # run Evaluations
        runEvalMetrics(X_pca, labels_true=y, labels_pred=labels_half, f=f)
        f.write("\n")
        f.write("# of Class : %d, # of Data Points : %d, # of Dimensions : %d \n" %
                (numClass, numDataPnt, numDimen))
        f.write("Shape of X [%d %d]" % (X_pca.shape[0], X_pca.shape[1]))
        f.write('\n')
        f.write("Clustering took %.2f s\n" % (end_time - start_time))
Ejemplo n.º 13
0
def WJldaTest(images, labels, j, R, P, k, I, pcaAcc, name=''):
    """
    Calculates a transformation matrix based on PCA such that pcaAcc of the
    signal is retained and after that applies LDA to transform the data.
    Using PCA and LDA/SVM from Scikit-learn.

    Returns (fitted_pca, fitted_lda), or (None, None) when PCA reduces the
    data to a single dimension or the LDA fit fails.
    """

    fj = FJ(images, j, R, P, k, I, name)

    ipca = skPCA(n_components=pcaAcc)
    ipca.fit(fj)
    fj2 = ipca.transform(fj)

    # Python-2 `print x` statements converted to the function form used by
    # the rest of the file (and required by Python 3).
    print(fj2.shape)
    # LDA needs at least two input dimensions to be useful here
    if fj2.shape[1] == 1:
        return (None, None)

    ilda = skLDA()
    try:
        ilda.fit(fj2, labels)
    except Exception:
        # report which descriptor index failed and bail out
        print(j)
        return (None, None)
    fj3 = ilda.transform(fj2)

    print(fj3.shape)

    return (ipca, ilda)
Ejemplo n.º 14
0
def WJldaTest(images, labels, j, R, P, k, I, pcaAcc, name=''):
    """
    Calculates a transformation matrix based on PCA such that pcaAcc of the
    signal is retained and after that applies LDA to transform the data.
    Using PCA and LDA/SVM from Scikit-learn.

    Returns (fitted_pca, fitted_lda), or (None, None) when PCA reduces the
    data to a single dimension or the LDA fit fails.
    """

    fj = FJ(images, j, R, P, k, I, name)

    ipca = skPCA(n_components=pcaAcc)
    ipca.fit(fj)
    fj2 = ipca.transform(fj)

    # Python-2 `print x` statements converted to the function form used by
    # the rest of the file (and required by Python 3).
    print(fj2.shape)
    # LDA needs at least two input dimensions to be useful here
    if fj2.shape[1] == 1:
        return (None, None)

    ilda = skLDA()
    try:
        ilda.fit(fj2, labels)
    except Exception:
        # report which descriptor index failed and bail out
        print(j)
        return (None, None)
    fj3 = ilda.transform(fj2)

    print(fj3.shape)

    return (ipca, ilda)
Ejemplo n.º 15
0
 def test(self):
     """Validate this PCA implementation against sklearn.decomposition.PCA."""
     from sklearn.decomposition import PCA as skPCA
     X = np.random.normal(3.2, 5.1, size=(20, 8))
     ours = PCA(3).fit(X)
     reference = skPCA(3).fit(X)
     expected = reference.transform(X)
     # component signs are arbitrary, so compare absolute projections
     self.assertTrue(np.allclose(np.abs(ours.transform(X)), np.abs(expected)),
                     "Should be equal")
Ejemplo n.º 16
0
def main():
    """Small demo comparing a custom PCA with sklearn's on random data."""
    set_printoptions(precision=3, suppress=True)
    X = randn(5, 3)
    pca = PCA(n_components=2)
    # Python-2 print statements converted to the py2/py3-compatible call form
    print(pca.fit_transform(X))
    print(pca.fit(X))
    skpca = skPCA(n_components=2)
    print(skpca.fit_transform(X))
    print(skpca.components_)
Ejemplo n.º 17
0
    def phase3(self, data, trueLabels):
        """Reduce data with PCA and IncrementalPCA and save 3D scatter plots.

        Returns the two reduced datasets (batch PCA first, incremental second).
        """
        resultsDir = Path(self.config["resultsDir"]) / "phase3"
        resultsDir.mkdir(exist_ok=True, parents=True)

        reducedData = skPCA(N_COMPONENTS).fit_transform(data)
        iReducedData = IncrementalPCA(N_COMPONENTS).fit_transform(data)

        Visualizer.labeledScatter3D(
            reducedData, trueLabels,
            path=resultsDir / f"{N_COMPONENTS}_dims_pcaScatter.png")
        Visualizer.labeledScatter3D(
            iReducedData, trueLabels,
            path=resultsDir / f"{N_COMPONENTS}_dims_ipcaScatter.png")
        return reducedData, iReducedData
Ejemplo n.º 18
0
def PCA(feature_array,
        n_components=None,
        whiten=True,
        svd_solver='auto',
        tol=0.0,
        random_state=None):
    """ Performs Principal Component Analysis on a feature set to deduce axis that have the highest variance.
        Each component is guaranteed orthogonal in the original feature space (boon and bane).

    Args:
        feature_array: 2D array (#Sample x #Feature), should be float64
        n_components : How many components are desired, None = all (# input features)
        whiten       : Only set to False if you have already whitened the data
        tol          : Tolerance on update at each iteration
        svd_solver   : Which solver to use, 'auto' intelligently selects
        random_state : Random seed to fix output
    Output:
        components   : PCA components (#features, #components)
        weights      : Sample weights (#samples , #components)

    >>> import numpy as N
    >>> N.random.seed(0)
    >>> feature_array = N.random.random((1000,10))

    >>> components, weights = PCA(feature_array, n_components = 3)

    >>> components.shape
    (10, 3)

    >>> weights.shape
    (1000, 3)


    """

    from sklearn.decomposition import PCA as skPCA

    decomposer = skPCA(n_components=n_components,
                       whiten=whiten,
                       svd_solver=svd_solver,
                       tol=tol,
                       copy=True,
                       random_state=random_state)

    # fit, then project the samples onto the principal axes
    decomposer.fit(feature_array)
    weights = decomposer.transform(feature_array)

    # transpose so components come out as (#features, #components)
    components = decomposer.components_.T

    return components, weights
Ejemplo n.º 19
0
def select_components_above_background(expression_values: np.ndarray,
                                       n_permutations: int,
                                       path_name: str = 'pathway'):
    """Count principal components explaining more variance than permuted data.

    Fits PCA on ``expression_values.T``, then builds a null distribution of
    explained-variance ratios by permuting the expression matrix
    ``n_permutations`` times.  Components are significant while fewer than 5%
    of permutations reach their explained-variance ratio.

    Args:
        expression_values: 2D expression matrix (rows transposed before the fit).
        n_permutations: number of random permutations for the null model.
        path_name: column label for the returned summary DataFrame.

    Returns:
        Tuple ``(n_significant_components, var_df)`` where ``var_df`` holds
        the significant-PC count and their explained variance in percent.
    """
    pca = skPCA().fit(expression_values.T)
    expr_flat = expression_values.flatten()
    explained_var_df = pd.DataFrame(index=list(range(n_permutations)),
                                    columns=list(
                                        range(expression_values.shape[1])))
    for i in range(n_permutations):
        # shuffle in place, then restore the original shape for the null fit
        np.random.shuffle(expr_flat)
        expr_permuted = expr_flat.reshape(expression_values.shape[0],
                                          expression_values.shape[1])
        pca_permuted = skPCA().fit(expr_permuted.T)
        explained_var_df.loc[i] = pca_permuted.explained_variance_ratio_

    pval = list()

    for j in range(expression_values.shape[1]):
        # fraction of permutations at least as good as the real component.
        # Fixed: the original read the misspelled `explaiend_variance_ratio_`,
        # which raises AttributeError on a fitted sklearn PCA.
        pval.append(
            np.sum(
                explained_var_df.iloc[:,
                                      j] >= pca.explained_variance_ratio_[j]) /
            n_permutations)

    n_significant_components = np.where(np.array(pval) >= 0.05)[0][0]
    explained_var_sign_comp = pca.explained_variance_ratio_[
        0:n_significant_components] * 100
    var_df = pd.DataFrame.from_dict(
        {
            'PCs': int(n_significant_components),
            'explained_var': explained_var_sign_comp
        },
        orient='index',
        columns=[path_name])
    return n_significant_components, var_df
Ejemplo n.º 20
0
    def _pca(mat, dim):
        """ Wrapper to PCA method, use sklearn
        See: 
        http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html


        >>> mat = [[1., 1., 0.], [0., 1., 0.], [1., 0., 0.]]
        >>> ReducePCA._pca(mat, 2)
        array([[ 0.        ,  0.19526215],
        [-0.70710678, -0.09763107],
        [ 0.70710678, -0.09763107]])
        """
        from sklearn.decomposition import KernelPCA as skPCA
        # cosine-kernel PCA projected down to `dim` dimensions
        reducer = skPCA(n_components=dim, kernel="cosine")
        return reducer.fit_transform(mat)
Ejemplo n.º 21
0
def test_pca_defaults(n_samples, n_features):
    """cuML PCA defaults should match sklearn's solver choice and shape."""
    X, Y = make_multilabel_classification(n_samples=n_samples,
                                          n_features=n_features,
                                          n_classes=2,
                                          n_labels=1,
                                          random_state=1)
    skpca = skPCA()
    skpca.fit(X)

    cupca = cuPCA()
    cupca.fit(X)
    cupca.handle.sync()

    assert cupca.svd_solver == skpca.svd_solver
    assert skpca.components_.shape[0] == cupca.components_.shape[0]
Ejemplo n.º 22
0
def test_pca_fit_transform(datatype):
    """fit_transform on a pygdf frame should match sklearn on the ndarray."""
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit_transform")
    Xcupca = cuPCA(n_components=2).fit_transform(gdf)
    Xskpca = skPCA(n_components=2).fit_transform(X)

    assert array_equal(Xcupca, Xskpca, 1e-3, with_sign=False)
Ejemplo n.º 23
0
def test_pca(eng):
    """Distributed PCA (direct and EM solvers) should agree with sklearn."""
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    from sklearn.decomposition import PCA as skPCA
    reference = skPCA(n_components=2)
    t1 = reference.fit_transform(x.toarray())
    w1_T = reference.components_

    # the exact SVD solver should agree tightly
    t2, w2_T = PCA(k=2, svd_method='direct').fit(x)
    assert allclose_sign(w1_T.T, w2_T.T)
    assert allclose_sign(t1, t2)

    # EM is iterative, so allow a loose tolerance
    t2, w2_T = PCA(k=2, svd_method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(w1_T.T, w2_T.T, atol=tol)
    assert allclose_sign(t1, t2, atol=tol)
Ejemplo n.º 24
0
def test_pca_fit(datatype):
    """cuML PCA attributes fitted on a pygdf frame should match sklearn."""
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit")
    cupca = cuPCA(n_components=2)
    cupca.fit(gdf)
    skpca = skPCA(n_components=2)
    skpca.fit(X)

    attrs = ('singular_values_', 'components_', 'explained_variance_',
             'explained_variance_ratio_', 'noise_variance_')
    for attr in attrs:
        # component sign is arbitrary, so compare those up to sign
        with_sign = attr != 'components_'
        assert array_equal(getattr(cupca, attr), getattr(skpca, attr),
                           1e-3, with_sign=with_sign)
def test_pca(eng):
    """Distributed PCA (direct and EM solvers) should agree with sklearn."""
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    from sklearn.decomposition import PCA as skPCA
    reference = skPCA(n_components=2)
    t1 = reference.fit_transform(x.toarray())
    w1_T = reference.components_

    # the exact SVD solver should agree tightly
    t2, w2_T = PCA(k=2, svd_method='direct').fit(x)
    assert allclose_sign(w1_T.T, w2_T.T)
    assert allclose_sign(t1, t2)

    # EM is iterative, so allow a loose tolerance
    t2, w2_T = PCA(k=2, svd_method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(w1_T.T, w2_T.T, atol=tol)
    assert allclose_sign(t1, t2, atol=tol)
Ejemplo n.º 26
0
def test_pca_fit(datatype, input_type, name, use_handle):
    """Attribute-level comparison of cuML PCA vs sklearn on small datasets."""

    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        # round-trip through pandas so cudf receives named feature columns
        X = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cupca.fit(X_cudf)
    else:
        cupca.fit(X)

    cupca.handle.sync()

    for attr in ('singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_', 'noise_variance_'):
        # components_ are only defined up to sign
        with_sign = attr != 'components_'
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
Ejemplo n.º 27
0
def test_pca_fit_transform(datatype, input_type):
    """fit_transform parity for ndarray and cudf dataframe inputs."""
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)
    Xskpca = skPCA(n_components=2).fit_transform(X)

    cupca = cuPCA(n_components=2)
    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        Xcupca = cupca.fit_transform(gdf)
    else:
        Xcupca = cupca.fit_transform(X)

    assert array_equal(Xcupca, Xskpca, 1e-3, with_sign=True)
Ejemplo n.º 28
0
def test_pca_fit_then_transform(datatype, input_type, name, use_handle):
    """Separate fit + transform should match sklearn on CPU-sized datasets."""
    blobs_n_samples = 500000
    if name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            # shrink the dataset proportionally to the available GPU memory
            blobs_n_samples = int(blobs_n_samples * pytest.max_gpu_memory / 32)
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if name == 'blobs':
        X, y = make_blobs(n_samples=blobs_n_samples,
                          n_features=1000,
                          random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    # only compare against sklearn when the dataset is CPU-sized
    compare = name != 'blobs'
    if compare:
        skpca = skPCA(n_components=2)
        skpca.fit(X)
        Xskpca = skpca.transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    X_cupca = cupca.transform(X)
    cupca.handle.sync()

    if compare:
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
Ejemplo n.º 29
0
def return_top_pca_gene(by_cell_matrix, range_genes=None):
    """Rank genes by their loadings on the first three principal components.

    Args:
        by_cell_matrix: genes x cells DataFrame (genes as the index).
        range_genes: optional (start, end) slice into the ranked gene list;
            defaults to the top 100 genes (or fewer if less are available).

    Returns:
        Tuple of (cells x selected-genes DataFrame, selected gene names).
    """
    gene_number = 100
    gene_pca = skPCA(n_components=3)
    np_by_gene = np.asarray(by_cell_matrix.transpose())
    gene_index = by_cell_matrix.index.tolist()

    if range_genes is not None:
        start_num = range_genes[0]
        end_num_genes = range_genes[1]
    else:
        start_num = 0
        end_num_genes = min(gene_number, len(gene_index))
    # only the fitted components are used, so fit() replaces the original
    # fit_transform() whose return value was discarded
    gene_pca.fit(np_by_gene)
    # rank genes by summed absolute loading across PC1-PC3
    Pc_df = pd.DataFrame(gene_pca.components_.T,
                         columns=['PC-1', 'PC-2', 'PC-3'],
                         index=gene_index)
    pca_rank_df = Pc_df.abs().sum(axis=1)
    Pc_sort_df = pca_rank_df.nlargest(len(gene_index))
    top_pca_list = Pc_sort_df.index.tolist()
    # .loc replaces DataFrame.ix, which was removed from pandas
    new_cell_matrix = by_cell_matrix.loc[
        top_pca_list[start_num:end_num_genes], :]
    return new_cell_matrix.transpose(), top_pca_list[start_num:end_num_genes]
Ejemplo n.º 30
0
    def test_pca(self):
        """Distributed PCA on a RowMatrix should agree with sklearn up to sign."""
        dataLocal = [
            array([1.0, 1.0, 1.0, 5.0]),
            array([2.0, 3.0, 4.0, 1.0]),
            array([6.0, 0.0, 6.0, 6.0])
        ]
        mat = RowMatrix(self.sc.parallelize(zip(range(1, 4), dataLocal)))

        dist_pca = PCA(k=1, svdMethod='direct')
        dist_pca.fit(mat)
        dist_comps = dist_pca.comps
        # scale scores by the latent factor so they are comparable to sklearn's
        dist_scores = dist_pca.scores.collectValuesAsArray() * dist_pca.latent
        dist_transform_scores = dist_pca.transform(mat).collectValuesAsArray() * dist_pca.latent

        from sklearn.decomposition import PCA as skPCA
        local_pca = skPCA(n_components=1)
        local_pca.fit(array(dataLocal))
        local_comps = local_pca.components_
        local_scores = local_pca.transform(array(dataLocal))

        # component/score signs are arbitrary, so accept either orientation
        assert(allclose(dist_comps, local_comps) | allclose(dist_comps, -local_comps))
        assert(allclose(dist_scores, local_scores) | allclose(dist_scores, -local_scores))
        assert(allclose(dist_scores, dist_transform_scores))
Ejemplo n.º 31
0
 def _fit_local(self, X):
     """Fit an sklearn PCA with k components; return (scores, components)."""
     from sklearn.decomposition import PCA as skPCA
     model = skPCA(n_components=self.k)
     scores = model.fit_transform(X)
     return scores, model.components_
Ejemplo n.º 32
0
def plot_PCA(df_by_gene, num_genes=100, gene_list_filter=False, title='', plot=False, label_map=False, gene_map = False, annotate=False):
    """Run 3-component PCA on the gene matrix, rank genes by loading, and plot
    2D projections of the top genes in cell space and gene space.

    Args:
        df_by_gene: DataFrame with genes as columns.
        num_genes: how many top-ranked genes to keep for the 2D projections.
        gene_list_filter: optional subset of gene columns to restrict ranking to.
        title: plot-title prefix; also determines the saved file name.
        plot: if True, show the figure interactively before saving.
        label_map: optional mapping cell -> (color, marker, label) for cell points.
        gene_map: optional mapping gene -> color for gene points.
        annotate: if True, annotate each cell point with its name.

    Returns:
        The ranked gene list, or [] when the cell projection contains NaNs.
    """
    gene_list = df_by_gene.columns.tolist()
    sns.set_palette("RdBu_r", 10, 1)
    if gene_list_filter:
        sig_by_gene = df_by_gene[gene_list_filter]
        sig_by_cell = sig_by_gene.transpose()
    else:
        sig_by_gene = df_by_gene
        sig_by_cell = sig_by_gene.transpose()
    gene_pca = skPCA(n_components=3)
    np_by_gene = np.asarray(sig_by_gene)

    by_gene_trans = gene_pca.fit_transform(np_by_gene)
    # rank genes by summed absolute loading across PC-1..PC-3
    Pc_df = pd.DataFrame(gene_pca.components_.T, columns=['PC-1', 'PC-2', 'PC-3'], index=sig_by_gene.columns.tolist())
    pca_rank_df = Pc_df.abs().sum(axis=1)
    Pc_sort_df = pca_rank_df.nlargest(len(sig_by_gene.columns.tolist()))
    top_pca_list = Pc_sort_df.index.tolist()
    print(top_pca_list[0:num_genes], 'top_pca_list')
    top_by_gene = df_by_gene[top_pca_list[0:num_genes]]
    # fresh 2-component PCAs for the gene-space and cell-space scatter plots
    gene_top = skPCA(n_components=2)
    cell_pca = skPCA(n_components=2)
    top_by_cell = top_by_gene.transpose()
    np_top_gene = np.asarray(top_by_cell)
    np_top_cell = np.asarray(top_by_gene)
    top_cell_trans = cell_pca.fit_transform(np_top_cell)
    top_gene_trans = gene_top.fit_transform(np_top_gene)
    # only plot when the cell projection is free of NaNs
    if not np.isnan(top_cell_trans).any():
        fig, (ax_cell, ax_gene) = plt.subplots(2, 1, figsize=(15, 30), sharex=False)
        rect_cell = ax_cell.patch
        rect_gene = ax_gene.patch
        rect_cell.set_facecolor('white')
        rect_gene.set_facecolor('white')
        ax_cell.grid(b=True, which='major', color='grey', linestyle='--', linewidth=0.3)
        ax_gene.grid(b=True, which='major', color='grey', linestyle='--', linewidth=0.3)
        if label_map:
            X = [x for x in top_cell_trans[:, 0]]
            Y = [y for y in top_cell_trans[:, 1]]
            labels = [label_map[cell][2] for cell in top_by_cell.columns.tolist()]
            markers = [label_map[cell][1] for cell in top_by_cell.columns.tolist()]
            colors = [label_map[cell][0] for cell in top_by_cell.columns.tolist()]
            # show each legend label only once
            label_done = []
            for X_pos, Y_pos, m, color, l in zip(X, Y, markers, colors, labels):
                if l in label_done:
                    lab = ''
                else:
                    lab= l
                    label_done.append(l)
                ax_cell.scatter(X_pos, Y_pos, marker=m, c=color, label=lab, s=30)

        else:
            ax_cell.scatter(top_cell_trans[:, 0], top_cell_trans[:, 1], alpha=0.75)
        ax_cell.set_xlim([min(top_cell_trans[:, 0])-1, max(top_cell_trans[:, 0]+1)])
        ax_cell.set_ylim([min(top_cell_trans[:, 1])-1, max(top_cell_trans[:, 1]+2)])
        ax_cell.set_title(title+'_cell')
        ax_cell.legend(loc='best', ncol=1, prop={'size':12}, markerscale=1.5, frameon=True)
        ax_cell.set_xlabel('PC1')
        ax_cell.set_ylabel('PC2')
        if annotate:
            for label, x, y in zip(top_by_cell.columns, top_cell_trans[:, 0], top_cell_trans[:, 1]):
                ax_cell.annotate(label, (x+0.1, y+0.1))

        if gene_map:
            X = [x for x in top_gene_trans[:, 0]]
            Y = [y for y in top_gene_trans[:, 1]]
            labels = top_by_gene.columns.tolist()
            colors = [gene_map[gene] for gene in top_by_gene.columns.tolist()]
            for X_pos, Y_pos, color, l in zip(X, Y, colors, labels):
                ax_gene.scatter(X_pos, Y_pos, marker='o', c=color, label = l, s=30)
        else:
            ax_gene.scatter(top_gene_trans[:, 0], top_gene_trans[:, 1], alpha=0.75)
        ax_gene.set_xlim([min(top_gene_trans[:, 0])-1, max(top_gene_trans[:, 0])+1])
        ax_gene.set_ylim([min(top_gene_trans[:, 1])-1, max(top_gene_trans[:, 1])+2])
        ax_gene.set_title(title+'_gene')
        ax_gene.set_xlabel('PC1')
        ax_gene.set_ylabel('PC2')
        for label, x, y in zip(top_by_gene.columns, top_gene_trans[:, 0], top_gene_trans[:, 1]):
            ax_gene.annotate(label, (x+.5, y+.5))
        if plot:
            plt.show()
        # NOTE(review): `filename` is presumably a module-level output
        # directory — confirm; it is not defined in this function.
        if title != '':
            save_name = '_'.join(title.split(' ')[0:2])
            plt.savefig(os.path.join(filename,save_name+'_skpca.pdf'), bbox_inches='tight')
        else:
            plt.savefig(os.path.join(filename,'non_group_skpca.pdf'), bbox_inches='tight')
        plt.close()
        return top_pca_list
    else:
        return []
Ejemplo n.º 33
0
    # 4.3 Restoring original dataset and computing mean relative error
    X_tilde = pca.inv_transform(W)

    MRE = lambda Xreal, Xpred: np.sum([
        la.norm(Xreal[:, j] - Xpred[:, j], 2) / la.norm(Xreal[:, j], 2)
        for j in range(Xreal.shape[1])
    ]) / Xreal.shape[0]
    mean_relative_error = MRE(X, X_tilde)
    print(
        f"\n   Dataset approximated with the first {pca.n_components} principal components. Mean Relative Error = {mean_relative_error}"
    )

    # 4.3.1 Comparison with sklearn PCA implementation
    from sklearn.decomposition import PCA as skPCA
    skpca = skPCA(n_components=0.9)
    skW = skpca.fit_transform(X)
    skX_tilde = skpca.inverse_transform(skW)

    # error = np.sum([la.norm(X[i] - skX_tilde[i])/la.norm(X[i]) for i in range(X.shape[0])])/X.shape[0]
    error = MRE(X, skX_tilde)
    print(
        f"   Dataset approximated with sklearn implementation ({skpca.n_components} var explained/{len(skpca.singular_values_)} components). Mean Relative Error = {error}"
    )

    pca_std = PCA(n_components=0.9, use_std=True)
    W_std = pca_std.fit_transform(X)
    X_tilde_std = pca_std.inv_transform(W_std)
    error = MRE(X, X_tilde_std)
    print(
        f"   Dataset approximated with use of std ({pca_std.var_explained} var explained/{pca_std.n_components} components). Mean Relative Error = {error}"
Ejemplo n.º 34
0
def PCA(mixed, state: dict, options: dict):
    """Unmix the signal matrix with full-rank PCA; returns (unmixed, state)."""
    samples = mixed.T
    # keep every component: n_components equals the channel count
    unmixed = skPCA(n_components=samples.shape[1]).fit_transform(samples)
    return unmixed.T, state
Ejemplo n.º 35
0
 def __init__(self, config):
     """PCA constructor.

     Wires an sklearn PCA into the Transform interface: fit_transform is
     used for training data and transform for test data.
     """
     Transform.__init__(self, config)
     # NOTE(review): self.dimension is presumably set by Transform.__init__
     # from `config` — confirm against the base class.
     self.transformer = skPCA(self.dimension)
     self.process_func_train = self.transformer.fit_transform
     self.process_func_test = self.transformer.transform