Example #1
def test_correct_shapes():
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    spca = SparsePCA(n_components=8, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    spca = SparsePCA(n_components=13, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))
Example #2
 def directional_context_embedding(self, context: List[str]) -> ndarray:
     transformer = SparsePCA(n_components=1, max_iter=10000, n_jobs=4)
     if len(context) > 2:
         w1 = self.word_vector(context[0])
         n = w1.size
         return transformer.fit_transform(
             numpy.kron(
                 w1,
                 self.directional_context_embedding(
                     context[1:]).transpose()).reshape(n, n))
     elif len(context) == 2:
         w1 = self.word_vector(context[0])
         w2 = self.word_vector(context[1])
         n = w1.size
         return transformer.fit_transform(
             numpy.kron(w1, w2.transpose()).reshape(n, n))
Example #3
    def calc_principal_components(self, df, n_comp=20, method='PCA'):
        '''
        Run PCA or Sparse PCA on a feature table
        :param df: feature table containing the 'Gene_Name' column and the target column
        :return: the fitted decomposition object and a DataFrame of principal components
        '''
        print(">> Running " + method + "...")
        if df.shape[1] <= n_comp:
            n_comp = df.shape[1] - 1

        tmp_drop_cols = ['Gene_Name', self.cfg.Y]
        X = df.drop(tmp_drop_cols, axis=1)
        pca_data = X.copy()

        pca = None
        if method == 'SparsePCA':
            pca = SparsePCA(n_components=n_comp)
        else:
            pca = PCA(n_components=n_comp)
        principal_components = pca.fit_transform(pca_data)

        columns = []
        for i in range(1, n_comp + 1):
            columns.append('PC' + str(i))

        pca_df = pd.DataFrame(data=principal_components, columns=columns)
        pca_df = pd.concat([pca_df, df[tmp_drop_cols]], axis=1)

        filepath = str(self.cfg.unsuperv_out / (method + ".table.tsv"))
        pca_df.to_csv(filepath, sep='\t', index=None)

        return pca, pca_df
Example #4
def test_correct_shapes(norm_comp):
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    spca = SparsePCA(n_components=8,
                     random_state=rng,
                     normalize_components=norm_comp)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    spca = SparsePCA(n_components=13,
                     random_state=rng,
                     normalize_components=norm_comp)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))
Example #5
class SparsePCA():
    # This wrapper shadows sklearn's SparsePCA, so the underlying estimator is
    # assumed to be imported under an alias, e.g.
    # `from sklearn.decomposition import SparsePCA as SKSparsePCA`; otherwise
    # the line below would recurse into this wrapper.
    def __init__(self, cols, n_components):
        self.n_components = n_components
        self.model = SKSparsePCA(n_components=n_components)
        self.columns = cols

    def fit(self, data):
        self.model.fit(data[self.columns])

    def fit_transform(self, data):
        transformed = self.model.fit_transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["spca_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data

    def transform(self, data):
        transformed = self.model.transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["spca_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data
Example #6
def test_scaling_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha, random_state=rng)
    results_train = spca_lars.fit_transform(Y)
    results_test = spca_lars.transform(Y[:10])
    assert_allclose(results_train[0], results_test[0])
Example #7
def test_transform_nan():
    # Test that SparsePCA does not return NaN when a feature is zero in all
    # samples.
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    Y[:, 0] = 0
    estimator = SparsePCA(n_components=8)
    assert_false(np.any(np.isnan(estimator.fit_transform(Y))))
Example #8
 def transform(self, X):
     n_components = 15
     spca = SparsePCA(n_components=n_components)
     X_pca = spca.fit_transform(X.toarray())
     if self.drop_original_feature:
         return X_pca
     else:
         return X
Example #9
def apply_SparsePCA(X, num_components, alpha=1, ridge_alpha=0.1):

    sparse_pca = SparsePCA(n_components=num_components,
                           alpha=alpha,
                           ridge_alpha=ridge_alpha)
    X_transform = sparse_pca.fit_transform(X)

    return X_transform
Example #10
def test_transform_nan():
    # Test that SparsePCA does not return NaN when a feature is zero in all
    # samples.
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    Y[:, 0] = 0
    estimator = SparsePCA(n_components=8)
    assert not np.any(np.isnan(estimator.fit_transform(Y)))
Example #11
def test_fit_transform_tall():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    spca_lars = SparsePCA(n_components=3, method='lars', random_state=rng)
    U1 = spca_lars.fit_transform(Y)
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=rng)
    U2 = spca_lasso.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
Example #12
def test_fit_transform_tall():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    spca_lars = SparsePCA(n_components=3, method="lars", random_state=rng)
    U1 = spca_lars.fit_transform(Y)
    spca_lasso = SparsePCA(n_components=3, method="cd", random_state=rng)
    U2 = spca_lasso.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
Example #13
 def _estimate_linear_combination(self, imgs_vec, params):
     estimator = SparsePCA(n_components=params.get('nb_labels'),
                           max_iter=params.get('max_iter'),
                           tol=params.get('tol'),
                           n_jobs=1)
     fit_result = estimator.fit_transform(imgs_vec)
     components = estimator.components_
     return estimator, components, fit_result
Example #14
def test_scaling_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=rng, normalize_components=True)
    results_train = spca_lars.fit_transform(Y)
    results_test = spca_lars.transform(Y[:10])
    assert_allclose(results_train[0], results_test[0])
Example #15
def sparsepca_mat(data_mat_filename, pca_data_mat_filename, pca_dims=3):

    data_mat = pickle.load(open(data_mat_filename, "rb"))

    pca = SparsePCA(n_components=pca_dims, normalize_components=True)

    pca_res = pca.fit_transform(data_mat)

    # Save the data
    pickle.dump(pca_res, open(pca_data_mat_filename, "wb"))
Example #16
def sparse_pca_on_pictures(whole, dimensions=None):
    whole_as_vector = transform_all_to_vectors(whole)
    pca = SparsePCA(dimensions,
                    alpha=100,
                    n_jobs=-1,
                    normalize_components=True,
                    max_iter=10)
    transformed = pca.fit_transform(whole_as_vector)
    plt.clf()
    plt.imshow(transformed, cmap='hot', interpolation='nearest')
    plt.colorbar()
    plt.savefig("faces/sparsePCA/D_" + str(dimensions) + ".png")
Example #17
def DimensionalityReduction(Data):

    cData = Data
    pca = PCA(n_components=2)
    kpca = KernelPCA(kernel='rbf', n_components=2)
    spca = SparsePCA(n_components=2)

    pData = pca.fit_transform(cData)
    kData = kpca.fit_transform(cData)
    sData = spca.fit_transform(cData)

    return pData, kData, sData
Example #18
class Spca(Preprocess):
    """
	Principal component analysis class.

	"""
    def __init__(self):
        super().__init__()

    def make_parser(self):
        parser = super().make_parser()
        parser.add_argument("--n_components",
                            dest="n_components",
                            default=2,
                            type=int)
        #parser.add_argument("-t","--target_colname",dest="target_colname",default=None,type=str)
        return parser

    def set_parsed_args_unique(self, parsed):
        self.n_components = parsed.n_components

    def parse_args(self, args):
        parser = self.make_parser()
        return parser.parse_args(args)

    def spca(self, data):
        self.model = SparsePCA(cols=self.columns,
                               n_components=self.n_components)
        transformed = self.model.fit_transform(data)
        return transformed

    def main(self, args):
        parsed = self.parse_args(args)
        self.set_parsed_args_common(parsed)
        self.set_parsed_args_unique(parsed)

        data = self.read_data()
        self.columns = self.get_col_list()
        # principal component analysis
        data.data = self.spca(data.data)
        # add this preprocessing step to the flow
        data.add_preprocess(self.model)
        """
		# write the fitted transformation to a file
		with open(self.temp_files_path+"pca.pickle","wb") as f:
			pickle.dump(self.model,f)

		# save the order of the preprocessing steps
		self.write_order()
		"""

        # output the principal component dataset
        self.write_data(data)
Example #19
def vectorize_data(texts, condition):
    """This function vectorizes text to matrices."""
    # vectorizer = TfidfVectorizer()
    #vectorizer = TfidfVectorizer(max_features=111)
    vectorizer = TfidfVectorizer()
    transformation = vectorizer.fit_transform(texts)
    if condition == 'pca':
        pca = SparsePCA(random_state=0)
        transformation = pca.fit_transform(transformation.toarray())
        # r = robjects.r
        # pca = r.princomp(transformation)
        # transformation =  pca.rotation
    dimensionality_notion = 1 # len(transformation.toarray()[0])
    return transformation, dimensionality_notion
Example #20
def build_corrcoef(vec_mat):
    corr_mat = np.ones([len(vec_mat), len(vec_mat)])
    for i in range(len(vec_mat)):
        for j in range(i + 1, len(vec_mat)):
            linear_model = LinearRegression()
            linear_model.fit(vec_mat[i], vec_mat[j])
            pred = linear_model.predict(vec_mat[i])
            tmp = np.concatenate([pred, vec_mat[j]], 0)
            model = SparsePCA(n_components=1)
            tmp = model.fit_transform(tmp).reshape([2, -1])
            src, tar = tmp[0], tmp[1]
            corr_val = abs(
                np.corrcoef(src.reshape([-1]), tar.reshape([-1]))[0][1])
            corr_mat[i, j] = corr_val
            corr_mat[j, i] = corr_val
    return np.around(corr_mat, decimals=2, out=None)
Example #21
def kmeans_pca(x):
    vect = CountVectorizer()
    for i in range(len(x)):
        x[i] = str(x[i])

    x_vect = vect.fit_transform(x)

    kmeans = KMeans(
        n_clusters=3, init="random",
        random_state=0)  # the default init, k-means++, is said to work poorly for document clustering
    kmeans.fit(x_vect)
    idx = list(kmeans.fit_predict(x))
    names = w2v.wv.index2word
    print(len(kmeans.labels_))

    DF = pd.DataFrame(data=x_vect)
    DF["target"] = y
    DF["cluster"] = kmeans.labels_
    print(kmeans.labels_)
    print(DF.groupby(['target',
                      'cluster'])[0].count())  # the grouping shows clusters skewed toward particular targets

    pca = SparsePCA(n_components=2, random_state=0, n_jobs=-1,
                    verbose=1)  # the text is represented as a sparse matrix, so use the sparse method
    #pca = PCA(n_components=2, random_state=0)
    pca_transformed = pca.fit_transform(x_vect.toarray())
    DF["pca_x"] = pca_transformed[:, 0]
    DF["pca_y"] = pca_transformed[:, 1]

    marker0_ind = DF[DF['cluster'] == 0].index
    marker1_ind = DF[DF['cluster'] == 1].index
    marker2_ind = DF[DF['cluster'] == 2].index

    plt.scatter(x=DF.loc[marker0_ind, 'pca_x'],
                y=DF.loc[marker0_ind, 'pca_y'],
                marker='o')
    plt.scatter(x=DF.loc[marker1_ind, 'pca_x'],
                y=DF.loc[marker1_ind, 'pca_y'],
                marker='v')
    plt.scatter(x=DF.loc[marker2_ind, 'pca_x'],
                y=DF.loc[marker2_ind, 'pca_y'],
                marker='^')

    plt.xlabel("PCA1")
    plt.xlabel("PCA2")
    plt.show()
Example #22
def cluster_sk_pca_sparse(content):
    """ x """
    _config = SparsePCA(n_components=content['n_components'],
                        alpha=content['alpha'],
                        ridge_alpha=content['ridge_alpha'],
                        max_iter=content['max_iter'],
                        tol=content['tol'],
                        method=content['sk_method'],
                        n_jobs=-1)
    _result = _config.fit_transform(content['data'])
    return httpWrapper(
        json.dumps({
            'result': _result.tolist(),
            'components': _config.components_.tolist(),
            'error': _config.error_,
            'iter': _config.n_iter_
        }))
Example #23
    def _OnClick2(self, event):
        if self.var2.get() == "Off":
            self.var2.set("On")
        elif self.var2.get() == "On":
            self.var2.set("Off")
            print("Sparse PCA is running...")
            label = pd.read_csv(self.labelVar, header=None)[0].tolist()
            df = pd.read_csv(self.dfLabel, header=None)
            data, label = df, label
            # Standardize the data
            data = StandardScaler().fit_transform(data)

            # apply PCA
            sparsepca = SparsePCA(n_components=2)

            # get 1st and 2nd components
            sparsepca.fit(data)
            SparseprincipalComponents = sparsepca.fit_transform(data)
            SparseprincipalDf = pd.DataFrame(
                data=SparseprincipalComponents,
                columns=['Component 1', 'Component 2'])
            print("Our principal components are: ")
            print(SparseprincipalComponents)
            X_r1 = SparseprincipalComponents[:, 0]
            X_r2 = SparseprincipalComponents[:, 1]
            unique = np.unique(label)
            print(str(len(np.unique(label))) + "*************************")
            try:
                plt.scatter(X_r1, X_r2, c=label)
            except:
                print(
                    "Data matrix does not match label matrix (Select input file and label, remove headers)"
                )

            name = 'Sparse_PCA'  #CHANGE FILENAME HERE *************************************************************************
            #plt.legend(unique, loc=8, ncol=5,fontsize='x-small')
            plt.title(name + " Clusters: " + str(len(unique)))
            plt.savefig(name + ".png")
            plt.show()
            plt.clf()

            # save  1st and 2nd components to csv
            SparseprincipalDf.to_excel(
                "Sparse_PCA_Components.xlsx"
            )  #Names of 1st and 2nd components to EXCEL here *************************************************************************
Example #24
    def run_and_fit(self,model_string,nr_components,nr_timepoints,nr_neurons,lambd=0):
        np.random.seed(7)
        #X=self.simulate_data(nr_components,nr_timepoints,nr_neurons)
        X=self.simulate_data_w_noise(nr_components,nr_timepoints,nr_neurons,noise_ampl_mult=5)
        if model_string=='EnsemblePursuit':
            options_dict={'seed_neuron_av_nr':10,'min_assembly_size':1}


            ep_pt=EnsemblePursuitPyTorch(n_ensembles=nr_components,lambd=lambd,options_dict=options_dict)
            U,V=ep_pt.fit_transform(X)
            self.U=U.numpy()
            self.V=V.numpy().T
        if model_string=='EnsemblePursuitNumpy':
            options_dict={'seed_neuron_av_nr':10,'min_assembly_size':1}
            ep_np=EnsemblePursuitNumpy(n_ensembles=nr_components,lambd=lambd,options_dict=options_dict)
            U,V,self.corrs=ep_np.fit_transform(X)
            self.U=U
            self.V=V.T
        if model_string=='ICA':
           ica=FastICA(n_components=nr_components,random_state=7)
           self.V=ica.fit_transform(X.T).T
           self.U=ica.mixing_
        if model_string=='PCA':
           pca=PCA(n_components=nr_components,random_state=7)
           self.V=pca.fit_transform(X.T).T
           self.U=pca.components_.T
        if model_string=='sparsePCA':
           spca=SparsePCA(n_components=nr_components,random_state=7)
           self.V=spca.fit_transform(X.T).T
           self.U=spca.components_.T
        if model_string=='NMF':
           X-=X.min(axis=0)
           nmf=NMF(n_components=nr_components, init='nndsvd', random_state=7,alpha=lambd,l1_ratio=0.5)
           self.V=nmf.fit_transform(X.T).T
           self.U=nmf.components_.T
        if model_string=='LDA':
           X-=X.min(axis=0)
           nmf=LatentDirichletAllocation(n_components=nr_components,random_state=7)
           self.V=nmf.fit_transform(X.T).T
           self.U=nmf.components_.T
        print('SHPS', self.U.shape, self.V.shape)
        self.orig=X
        [email protected]
        print('orig',self.orig.shape)
        print('approx',self.approx.shape)
Example #25
def prep(q_features, comp):
    out = []
    pca = SparsePCA(n_components=comp)
    x = pca.fit_transform(q_features)
    z = hamming_z(x)
    t = pd.read_csv('thresholdvalues.csv', delimiter=None)
    t = t.drop(t.columns[0], axis=1)
    t = t.T
    t = np.array(t)
    y = np.zeros((len(q_features), comp))

    for i in range(len(q_features)):
        for v in range(comp):
            if z[i][v] <= t[0][v]:
                y[i][v] = 1
            else:
                y[i][v] = -1

    return y
Example #26
def get_latent_value(values,
                     method='kernelpca',
                     normalization=True,
                     widget_key=None):
    st.write("Dimensionality reduction for dim: <{}*{}>".format(
        len(values), len(values[0])))

    if normalization:
        values = normalize(values, norm='l2')

    if method == 'pca':
        pca = PCA(n_components=2, whiten=True)
        pca.fit(np.transpose(values))
        return pca.components_

    elif method == 'sparsepca':
        sparse_pca = SparsePCA(n_components=2)
        return np.transpose(sparse_pca.fit_transform(values))

    elif method == 'kernelpca':
        kernel_pca = KernelPCA(n_components=2, kernel='rbf')
        return np.transpose(kernel_pca.fit_transform(values))

    elif method == 'tsne':
        n_it = st.slider("Max iteration",
                         min_value=5000,
                         max_value=50000,
                         key='tsne_it_{}'.format(widget_key))
        perp = st.slider("Perplexity",
                         min_value=30,
                         max_value=300,
                         key='tsne_prep{}'.format(widget_key))
        lr = st.slider("Learning rate",
                       min_value=10,
                       max_value=1000,
                       key='tsne_lr{}'.format(widget_key))
        tsne = TSNE(n_components=2,
                    n_iter=n_it,
                    perplexity=perp,
                    learning_rate=lr,
                    n_jobs=4)
        return np.transpose(tsne.fit_transform(values))
Example #27
def init_atlas_sparse_pca(imgs, nb_patterns, nb_iter=5, bg_threshold=0.1):
    """ estimating initial atlas using SoA method based on linear combinations

    :param list(ndarray) imgs: list of input images
    :param int nb_patterns: number of pattern in the atlas to be set
    :param int nb_iter: max number of iterations
    :param float bg_threshold:
    :return ndarray: estimated atlas

    >>> np.random.seed(0)
    >>> atlas = np.zeros((8, 12), dtype=int)
    >>> atlas[:3, 1:5] = 1
    >>> atlas[3:7, 6:12] = 2
    >>> luts = np.array([[0, 1, 0]] * 99 + [[0, 0, 1]] * 99 + [[0, 1, 1]] * 99)
    >>> imgs = [lut[atlas] for lut in luts]
    >>> init_atlas_sparse_pca(imgs, 2, bg_threshold=0.05)
    array([[0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0],
           [0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0],
           [0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    """
    imgs_vec = np.array([np.ravel(im) for im in imgs])

    try:
        estimator = SparsePCA(n_components=nb_patterns + 1, max_iter=nb_iter)
        fit_result = estimator.fit_transform(imgs_vec)
        components = estimator.components_

        ptn_used = np.sum(np.abs(fit_result), axis=0) > 0
        atlas_ptns = components.reshape((-1, ) + imgs[0].shape)

        atlas = convert_lin_comb_patterns_2_atlas(atlas_ptns, ptn_used,
                                                  bg_threshold)
    except Exception:
        logging.exception('CRASH: %s' % init_atlas_sparse_pca.__name__)
        atlas = np.zeros(imgs[0].shape, dtype=int)
    return atlas
Example #28
n = 1
for FrameRange_ind in range(len(offset_list)):
    for sparsePCA_alpha_ind in sparsePCA_alpha:
        # for sparsePCA_ridge_alpha_ind in sparsePCA_ridge_alpha:
        # compute PCA
        ncomp = 5
        offset = offset_list[FrameRange_ind]
        upto = upto_list[FrameRange_ind]
        # if ~upto:
        #     upto = O.Shapes().shape[0]
        PCA_start = time.time()
        p = SparsePCA(n_components=ncomp, alpha=sparsePCA_alpha_ind, ridge_alpha=0.01)
        PCA_end = time.time()
        print("The " + str(n) + " PCA time: " + str(PCA_end-PCA_start))
        Projection_start = time.time()
        scorePCA = p.fit_transform(O.Shapes()[offset:upto, :].T).T
        Projection_end = time.time()
        print("The " + str(n) + " Projection time: " + str(Projection_end-Projection_start))
        # explained_variance_ratio = p.explained_variance_ratio_
        plt.figure(1)
        plt.plot(p.components_.T)
        plt.legend(range(5))
        plt.savefig("princomp/" + str(offset) + "to" + str(upto) + "_alpha" + str(sparsePCA_alpha_ind) + ".png", bbox_inches='tight')
        plt.clf()

        plt.figure(2)
        plt.scatter(scorePCA[0, :10000], scorePCA[1, :10000], s=4)
        plt.savefig("scatter/" + str(offset) + "to" + str(upto) + "_alpha" + str(sparsePCA_alpha_ind) + ".png", bbox_inches='tight')
        plt.clf()

        m = 1
Example #29
cnt=0
feature=[[0 for i in range(0,n_feat)] for j in range(0,120542)] #80362
for line in fin:
    a=line.split(" ")
    for i in range(2,n_feat):
        feature[cnt][i-2]=float(a[i].split(":")[1])
    cnt+=1
print(cnt)
#print feature[cnt-1]

X=np.array(feature)
'''
pca=PCA(n_components=n_feat)
pca_result=pca.fit_transform(X)
'''
pca=SparsePCA(n_components=n_feat,alpha=0.6,n_jobs=2,max_iter=15)
pca_result=pca.fit_transform(X)

#print pca_result[0]
cnt=0
fin = open("data/feature/train_gh_97a",'r')

for line in fin:
    a=line.split(" ")
    PCA_d=50
    for i in range(0,PCA_d):
        a[i+2]=str(i)+":"+str(feature[cnt][i])
    ll=" ".join(a[0:PCA_d+2])
    fo.write(ll+"\n")
    cnt+=1
fo.close()
Example #30
def _cluster_analysis(feats_data, save_name, is_color_time=False):
    #%% ##### Clustering analysis
    df_n = feats_data.dropna()
    
    df = df_n[set_feats].copy()
    index_data = df_n[index_cols].reset_index()
    
    X = df.values.copy()
    #[x_min==x_max]
    x_min, x_max = df.min(), df.max()
    df = (df - x_min)/(x_max - x_min)
    
    X = df.values
    #%% #### labels and indexes vectors
    nz = int(np.ceil(np.log10(index_data['time_group']+ 0.001).max()))
    time_g_str = [('%1.1f' % x).zfill(nz+2) for x in index_data['time_group'].values]
    cohort_str = [str(int(x)) for x in index_data['cohort_n']]
    
    labels = ['C{}_T{}'.format(*x) for x in zip(cohort_str, time_g_str)]
    label_order = sorted(list(set(labels)))
    
    uC = sorted(list(set(cohort_str)))
    uT = sorted(list(set(time_g_str)))
    filled_markers = ('o', 's', 'v', '^', '<', '>', '8', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X')
    
    if is_color_time:
        cols = sns.color_palette("RdYlGn", len(uT))
        
        col_dict_u = {k : v for k,v in zip(time_g_str, cols)}
        col_dict = {ll : col_dict_u[tt] for ll, tt in zip(labels, time_g_str)}
        
        mks_dict = {x : filled_markers[ii] for ii, x in enumerate(uC)}
        mks = [mks_dict[x[1]] for x in label_order]
    else:
        
        cols = sns.color_palette("colorblind", len(uC))
        col_dict_u = {k : v for k,v in zip(uC, cols)}
        col_dict = {ll : col_dict_u[tt] for ll, tt in zip(labels, cohort_str)}
        
        mks_dict = {x : filled_markers[ii] for ii, x in enumerate(uT)}
        mks = [mks_dict[x.partition('_T')[-1]] for x in label_order]
    
    
    #%%
    tsne = TSNE(n_components=2, 
                    #perplexity = 21,
                    init='pca',
                    verbose=1, 
                    n_iter=10000
                    )
    X_tsne = tsne.fit_transform(X)
    
    #%%
    pca_s = SparsePCA()
    X_pca_s = pca_s.fit_transform(X)
    
    pca = PCA()
    X_pca = pca.fit_transform(X)
    #%%
    
            
    dat = {'t-SNE':X_tsne, 'PCA':X_pca, 'PCA_Sparse':X_pca_s}
    
    with PdfPages(save_name) as pdf_pages:
        
        for k, Xp in dat.items():
            _plot_clusters(Xp, labels, label_order, col_dict, mks)
            plt.title(k)
            pdf_pages.savefig()
            plt.close()
            #%%
    return dat
Example #31
	def sparse_pca(self, n_components, alpha):
		pca = SparsePCA(n_components = n_components, alpha = alpha)
		self.X = pca.fit_transform(self.X)
		self.df_c = pd.DataFrame(pca.components_.T, index = self.crimes, columns = list(range(1, n_components + 1)))
		return self.df_c
Example #32
def Var_Select(orgdata, k, alphaMax=10, alphastep=0.2):
    """
    orgdata-需要信息压缩的数据框
    k-预期最大需要保留的最大变量个数,实际保留数量不能多于这个数值
    alphaMax-SparsePCA算法惩罚项的最大值,一般要到5才会取得比较理想的结果
    alphastep-SparsePCA算法惩罚项递增的步长
    """
    #step1: when the data set is very large, subsample it to cut unnecessary run time
    if orgdata.iloc[:, 1].count() > 5000:
        data = orgdata.sample(5000)
    else:
        data = orgdata
#step2: import the required packages and standardize the data
    from sklearn import preprocessing
    import pandas as pd
    import numpy as np
    from sklearn.decomposition import SparsePCA
    #from functools import reduce
    data = preprocessing.scale(data)
    n_components = k
    #pca_n = list()
    #step3: run SparsePCA and pick a suitable penalty alpha; stop the loop once each original variable has weight on exactly one component (illustrated in the sketch after this function)
    for i in np.arange(0.1, alphaMax, alphastep):
        pca_model = SparsePCA(n_components=n_components, alpha=i)
        pca_model.fit(data)
        pca = pd.DataFrame(pca_model.components_).T
        n = data.shape[1] - sum(sum(np.array(pca != 0)))  #### count the loadings that are not 0
        if n == 0:
            global best_alpha
            best_alpha = i
            break
    #step4: re-estimate SparsePCA with the penalty value found above and get the sparse component scores
    pca_model = SparsePCA(n_components=n_components, alpha=best_alpha)
    pca_model.fit(data)
    pca = pd.DataFrame(pca_model.components_).T
    data = pd.DataFrame(data)
    score = pd.DataFrame(pca_model.fit_transform(data))
    #step6: compute the 1 - R-squared values between the original variables and the components
    r = []
    R_square = []
    for xk in range(data.shape[1]):  # xk indexes the input variables
        for paj in range(n_components):  # paj indexes the components
            r.append(
                abs(np.corrcoef(data.iloc[:, xk], score.iloc[:, paj])[0, 1]))
            r_max1 = max(r)
            r.remove(r_max1)
            r.append(-2)
            r_max2 = max(r)
            R_square.append((1 - r_max1**2) / (1 - r_max2**2))

    R_square = abs(
        pd.DataFrame(
            np.array(R_square).reshape((data.shape[1], n_components))))
    var_list = []
    #print(R_square)
    #step7: within each component, pick the original variable with the smallest 1 - R-squared value.
    for i in range(n_components):
        vmin = R_square[i].min()
        #print(R_square[i])
        #print(vmin)
        #print(R_square[R_square[i] == min][i])
        var_list.append(R_square[R_square[i] == vmin][i].index)

    news_ids = []
    for id in var_list:
        if id not in news_ids:
            news_ids.append(id)
    print(news_ids)
    data_vc = orgdata.iloc[:, np.array(news_ids).reshape(len(news_ids))]
    return data_vc
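To make the stopping rule in step 3 above concrete, here is a minimal self-contained sketch (synthetic placeholder data, not from the original project) of the quantity the alpha search checks:

import numpy as np
from sklearn.decomposition import SparsePCA

X = np.random.RandomState(0).randn(200, 6)          # placeholder for the scaled input data
model = SparsePCA(n_components=3, alpha=2.0, random_state=0)
model.fit(X)
nonzero_loadings = np.sum(model.components_ != 0)    # total nonzero loadings over all components
# Var_Select keeps raising alpha until n_features - nonzero_loadings reaches 0,
# i.e. until there is on average exactly one nonzero loading per original variable.
print(X.shape[1] - nonzero_loadings)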
Example #33
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
# pca = PCA(n_components=n_comp, random_state=420)
# pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
# pca2_results_test = pca.transform(test)

#sparse PCA
spca = SparsePCA(n_components=n_comp, random_state=420)
spca2_results_train = spca.fit_transform(train.drop(["y"], axis=1))
spca2_results_test = spca.transform(test)

#Kernel PCA
kpca = KernelPCA(n_components=n_comp, random_state=420)
kpca2_results_train = kpca.fit_transform(train.drop(["y"], axis=1))
kpca2_results_test = kpca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
Example #34
			count += 1
			if count > n:
				break
			try:
				cat = io.imread("sparse-cats/"+f,as_grey=True).flatten()
				cat.shape = (40000,1)
				images = np.append(images, cat, axis=1)
			except:
				count -= 1
				continue
		print("loaded cats...")

		tic = time.clock()
		print("starting learning...")
		pca = SparsePCA(n_components=n,max_iter=1000)
		x = pca.fit_transform(images,subject)
		print("learning done...")
		toc = time.clock()
		print(x)

		out = np.zeros(40000)
		print("starting transform...")
		for i in range(40000):
			for j in range(n):
				#out[i] += (x[i,j])
				out[i] += (images[i,j] * x[i,j])

		out.shape = (200,200)
		print(out)
		name = re.match("people/([a-z]*)_small.jpg",filename).group(1)
		io.imsave("pca/pca_cat_{0}_{1}.jpg".format(n,name),out)
Example #35
def transform(xTrain, yTrain, xTest):
    pca = SparsePCA(n_components=2)
    newXTrain = pca.fit_transform(xTrain, yTrain)
    newXTest = pca.transform(xTest)
    return newXTrain, newXTest
Example #36
#convert RData to pd DataFrame
readin = pyreadr.read_r('C:/Users/TW/Downloads/west.RData')
westdf = readin["west"]
chapter = westdf[['chapter']]

# preprocessing the data
westdf = westdf.drop(['chapter'], axis=1)  #delete 'chapter' column   408*302
x = westdf.loc[:, :].values
x = StandardScaler(with_std=False).fit_transform(x)  # center the data

#SparsePCA transform
transformer = SparsePCA(n_components=3,\
                        alpha=0.1,\
                        normalize_components=True,\
                        random_state=0)
x_transformed = transformer.fit_transform(x)

# for data analysis
x_transformed.shape
transformer.alpha
eigenvectors = transformer.components_
transformer.error_
transformer.get_params(deep=True)
np.mean(transformer.components_ == 0)
westspca = pd.DataFrame(data=eigenvectors, columns=westdf.columns)
Spca1 = westspca.sort_values(by=[0], axis=1)
Spca2 = westspca.sort_values(by=[1], axis=1)
Spca3 = westspca.sort_values(by=[2], axis=1)
# only 3 components were fit above, so rows 3 and 4 do not exist to sort by
# Spca4 = westspca.sort_values(by=[3], axis=1)
# Spca5 = westspca.sort_values(by=[4], axis=1)
Example #37
class SPCA(object):
    def __init__(self,
                 n_components=None,
                 alpha=1,
                 ridge_alpha=0.01,
                 max_iter=1000,
                 tol=1e-8,
                 method='lars',
                 n_jobs=None,
                 U_init=None,
                 V_init=None,
                 verbose=False,
                 random_state=None,
                 normalize_components='deprecated'):
        """
        :param n_components:
        :param alpha:
        :param ridge_alpha:
        :param max_iter:
        :param tol:
        :param method:
        :param n_jobs:
        :param U_init:
        :param V_init:
        :param verbose:
        :param random_state:
        :param normalize_components:
        """
        self.model = SparsePCA(n_components=n_components,
                               alpha=alpha,
                               ridge_alpha=ridge_alpha,
                               max_iter=max_iter,
                               tol=tol,
                               method=method,
                               n_jobs=n_jobs,
                               U_init=U_init,
                               V_init=V_init,
                               verbose=verbose,
                               random_state=random_state,
                               normalize_components=normalize_components)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self):
        return self.model.get_params(deep=True)

    def set_params(self, **params):
        return self.model.set_params(**params)

    def get_attributes(self):
        components = self.model.components_
        error = self.model.error_
        n_iter = self.model.n_iter_
        mean = self.model.mean_
        return components, error, n_iter, mean
Example #38
#Sparse Principal Components Analysis (SparsePCA)
#SparsePCA
"""
Finds the set of sparse components that can optimally reconstruct the data. 
The amount of sparseness is controllable by the coefficient of the L1 penalty, 
given by the parameter alpha.
"""
from sklearn.decomposition import SparsePCA
#SparsePCA(n_components=None, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-08,
#method='lars', n_jobs=1, U_init=None, V_init=None,
#verbose=False, random_state=None)
#method : {‘lars’, ‘cd’}
#alpha: higher value--sparser components
spca = SparsePCA(method='lars')
SPCA_OUTPUT = spca.fit(X_all_his_center)
X_spca = spca.fit_transform(X_all_his_center)
np.savetxt("D:/lly/2017MM/PHASE2/final_totoal/SPCA_MM_PCs.csv",
           SPCA_OUTPUT.components_,
           delimiter=",")

#2d-visualization-SPCA-data projection in higher dimensional space
fig = plt.figure()
plt.plot(X_spca[reds, 0], X_spca[reds, 1], "ro", markersize=10)
plt.plot(X_spca[blues, 0], X_spca[blues, 1], "b^", alpha=0.5)
plt.plot(X_spca[greens, 0], X_spca[greens, 1], "g+")
plt.legend('LWN')
plt.title("newData under two PCs--SPCA")
plt.xlabel("$1^{st}$ PC")
plt.ylabel("$2^{nd}$ PC")

plt.grid('on')
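As the note above says, the L1 coefficient alpha controls how sparse the components are. A minimal sketch (random placeholder data, not the X_all_his_center matrix used above) that shows the effect:

import numpy as np
from sklearn.decomposition import SparsePCA

X_demo = np.random.RandomState(0).randn(50, 20)      # placeholder data
for alpha in (0.1, 5.0):
    spca_demo = SparsePCA(n_components=2, alpha=alpha, random_state=0)
    spca_demo.fit(X_demo)
    zero_share = np.mean(spca_demo.components_ == 0)  # fraction of exactly-zero loadings
    print("alpha=%.1f -> %.0f%% zero loadings" % (alpha, 100 * zero_share))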
Example #39
class SPCA(object):
    """
    Wrapper for sklearn package.  Performs sparse PCA

    SPCA has 5 methods:
       - fit(waveforms)
       update class instance with the SPCA fit

       - fit_transform()
       do what fit() does, but additionally return the projection onto SPCA space

       - inverse_transform(A)
       inverses the decomposition, returns waveforms for an input A, using Z

       - get_basis()
       returns the basis vectors Z^\dagger

       - get_params()
       returns metadata used for fits.
    """
    def __init__(self, num_components=10,
                 catalog_name='unknown',
                 alpha = 0.1,
                 ridge_alpha = 0.01,
                 max_iter = 2000,
                 tol = 1e-9,
                 n_jobs = 1,
                 random_state = None):

        self._decomposition  = 'Sparse PCA'
        self._num_components = num_components
        self._catalog_name   = catalog_name
        self._alpha          = alpha
        self._ridge_alpha    = ridge_alpha
        self._n_jobs         = n_jobs
        self._max_iter       = max_iter
        self._tol            = tol
        self._random_state   = random_state

        self._SPCA = SparsePCA(n_components=self._num_components,
                              alpha        = self._alpha,
                              ridge_alpha  = self._ridge_alpha,
                              n_jobs       = self._n_jobs,
                              max_iter     = self._max_iter,
                              tol          = self._tol,
                              random_state = self._random_state)

    def fit(self,waveforms):
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._SPCA.fit(self._waveforms)

    def fit_transform(self,waveforms):
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._A = self._SPCA.fit_transform(self._waveforms)
        return self._A

    def inverse_transform(self,A):
        # convert basis back to waveforms using fit
        new_waveforms = self._SPCA.inverse_transform(A)
        return new_waveforms

    def get_params(self):
        # TODO know what catalog was used! (include waveform metadata)
        params = self._SPCA.get_params()
        params['num_components'] = params.pop('n_components')
        params['Decompositon'] = self._decomposition
        return params

    def get_basis(self):
        """ Return the SPCA basis vectors (Z^\dagger)"""
        Zt = self._SPCA.components_
        return Zt
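A brief usage sketch for the wrapper above; the waveform matrix is a made-up placeholder, and the inverse_transform call assumes a scikit-learn version whose SparsePCA provides inverse_transform:

import numpy as np

waveforms = np.random.RandomState(1).randn(30, 200)   # hypothetical matrix: 30 waveforms, 200 samples each
spca = SPCA(num_components=5, catalog_name='demo')
A = spca.fit_transform(waveforms)                      # projection onto the sparse components, shape (30, 5)
Zt = spca.get_basis()                                  # basis vectors Z^\dagger, shape (5, 200)
recon = spca.inverse_transform(A)                      # back to waveform space; needs SparsePCA.inverse_transform
print(A.shape, Zt.shape, recon.shape)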
Example #40
    #csv = "c:/iris44.csv"  # wikipedia Iris_flower_data_set
        # 5.1,3.5,1.4,0.2  # ,Iris-setosa ...
    N = 40
    K = 450000
    
    seed = 1
    exec("\n".join(sys.argv[1:]))  # N= ...
    np.random.seed(seed)
    np.set_printoptions( 1, threshold=100, suppress=True )  # .1f
    try:
        A = np.genfromtxt( csv, delimiter="," )
        N, K = A.shape
    except IOError:
        print('error')
        A = np.random.normal( size=(N, K) )  # gen correlated ?

    print(len(A[1]), N, K)
    
    print "A:", A
    #pca = PCA(n_components=4)
    pca = SparsePCA(n_components=None, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-08, method='lars', n_jobs=1, U_init=None, V_init=None, verbose=False, random_state=None)
    scores=pca.fit_transform(A)
    pca_variance = pca.explained_variance_ratio_  # note: SparsePCA has no explained_variance_ratio_; this line only works with the commented-out PCA above
    coeff = pca.components_
    #A1=pca.inverse_transform(coeff)
    print(pca_variance)
    print("coeff",coeff)
    #score = pca.transform(A)
    print("score",scores)
    #print A1
    
Example #41
res_poly = compare_KernelPCA(kernel='poly')
res_rbf = compare_KernelPCA(kernel='rbf')
res_sigmoid = compare_KernelPCA(kernel='sigmoid')
res_cosine = compare_KernelPCA(kernel='cosine')


kernel_pca_precomputed = KernelPCA(n_components=kernel_pca_n_comp, kernel='precomputed')
kernel_pca_precomputed_data = kernel_pca_precomputed.fit_transform(data.dot(data.T))
kernel_pca_precomputed.lambdas_.round(3)

# ---
# ## Modifications of the principal component method
# ### SparsePCA

sparse_pca_lars = SparsePCA(2, method='lars')
sparse_pca_lars_data = sparse_pca_lars.fit_transform(data)

print("Sparse PCA with lars method components")
print(sparse_pca_lars.components_)


sparse_pca_cd = SparsePCA(2, method='cd')
sparse_pca_cd_data = sparse_pca_cd.fit_transform(data)

print("Sparse PCA with cd method components")
print(sparse_pca_cd.components_)


fig, axs = plt.subplots(1,2)
fig.set_figwidth(11)
fig.set_figheight(5)
Example #42
def textSimilarity():
    NeighborDirectory = GEOTEXT_HOME
    # matplotlib.use('Agg')
    DATA_FOLDER = userTextDirectory
    # DATA_FOLDER = "/GEOTEXT_HOME/af/Downloads/review_polarity/txt_sentoken"
    K_FOLD = 10
    data_target = load_files(DATA_FOLDER, encoding=encoding)
    filenames = data_target.filenames
    DO_PCA = True
    DO_SPARSEPCA = False
    Reduction_D = 100
    DO_SVD = False
    categories = data_target.target_names
    DO_NMF = False
    
    def size_mb(docs):
        return sum(len(s.encode(encoding)) for s in docs) / 1e6
    
    data_size_mb = size_mb(data_target.data)
    
    
    print("%d documents - %0.3fMB (all data set)" % (
        len(data_target.data), data_size_mb))
    
    print("%d categories" % len(categories))
    print()
    
    # split a training set and a test set
    target = data_target.target
    
    
    
    print("Extracting features from all the dataset using a sparse vectorizer")
    t0 = 0
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=0.2, ngram_range=(1, 1), stop_words='english')
    
    # vectorizer = CountVectorizer(min_df=2, max_df=1.0, ngram_range=(1, 4))
    # the output of the fit_transform (x_train) is a sparse csc matrix.
    data = vectorizer.fit_transform(data_target.data)
    print(data.dtype)
    data = csr_matrix(data, dtype=float32)
    print(data.dtype)
    duration = 1
    print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
    print("n_samples: %d, n_features: %d" % data.shape)
    print()
    
    
    
    if DO_PCA:
        print("dimension reduction pca with d=%d" % Reduction_D)
        pca = PCA(n_components=Reduction_D, copy=True, whiten=False)
        print(type(data))
        data = pca.fit_transform(data.todense())
    if DO_SPARSEPCA:
        print("dimension reduction sparsepca with d=%d" % Reduction_D)
        spca = SparsePCA(Reduction_D)
        data = spca.fit_transform(data.toarray())
    if DO_SVD:
        print("dimension reduction svd with d=%d" % Reduction_D)
        svd = TruncatedSVD(n_components=Reduction_D, algorithm="randomized", n_iterations=5, random_state=None, tol=0)
        data = svd.fit_transform(data)
    if DO_NMF:
        print("dimension reduction nmf with d=%d" % Reduction_D)
        nmf = NMF(n_components=Reduction_D)
        data = nmf.fit_transform(data)
    
    DO_CHI = False
    if DO_CHI:
        print("Extracting best features by a chi-squared test")
        ch2NumFeatures = 1000 
        ch2 = SelectKBest(chi2, k=ch2NumFeatures)
        # print vectorizer.get_stop_words()
        data = ch2.fit_transform(data, target)
        # print data
    
    
    KNN = 10
    nn = NearestNeighbors(n_neighbors=KNN + 1, algorithm='ball_tree').fit(data)
    # query and data are the same so every node is counted as its most similar here
    distances, indices = nn.kneighbors(data)
    with codecs.open(path.join(NeighborDirectory, 'neighbors.txt'), 'w', encoding) as outf:
        nodeIndex = -1
        nodeNeighbors = []
        for neighbors in indices:
            nodeIndex += 1
            outf.write(path.basename(filenames[nodeIndex]) + ' ')
            for neighbor in neighbors:
                if neighbor == nodeIndex:
                    continue
                else:
                    outf.write(path.basename(filenames[neighbor]) + ' ')
            outf.write('\n')