def test_mini_batch_fit_transform():
    raise SkipTest  # everything below is unreachable while this skip is in place
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
                                   alpha=alpha).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            U2 = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                    random_state=0).fit(Y).transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        U2 = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                random_state=0).fit(Y).transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd',
                                    alpha=alpha, random_state=0).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_mini_batch_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
                                   alpha=alpha).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import joblib
        _mp = joblib.parallel.multiprocessing
        joblib.parallel.multiprocessing = None
        try:
            spca = MiniBatchSparsePCA(n_components=3, n_jobs=2,
                                      alpha=alpha, random_state=0)
            U2 = spca.fit(Y).transform(Y)
        finally:
            joblib.parallel.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = MiniBatchSparsePCA(n_components=3, n_jobs=2,
                                  alpha=alpha, random_state=0)
        U2 = spca.fit(Y).transform(Y)
    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd',
                                    alpha=alpha, random_state=0).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_mini_batch_correct_shapes():
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    pca = MiniBatchSparsePCA(n_components=8, random_state=rng)
    U = pca.fit_transform(X)
    assert_equal(pca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    pca = MiniBatchSparsePCA(n_components=13, random_state=rng)
    U = pca.fit_transform(X)
    assert_equal(pca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))
class MiniBatchSparsePCAImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
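# A minimal usage sketch for the wrapper class above. It assumes `Op` is
# bound to scikit-learn's MiniBatchSparsePCA; the snippet itself never shows
# where Op comes from, so this alias is an assumption.
import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA as Op

X = np.random.RandomState(0).randn(20, 10)
impl = MiniBatchSparsePCAImpl(n_components=3, random_state=0)
X_low = impl.fit(X).transform(X)  # y is None, so fit(X) is used internally
print(X_low.shape)  # (20, 3)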
def decompose_minibatch_sparse_pca(X, n_components, alpha=0.8, n_iter=100,
                                   batch_size=3, random_state=42):
    # Note: the original default was `np.random.RandomState(42)`, which is
    # evaluated once at definition time and then shared (and mutated) across
    # calls; an int seed avoids that shared mutable state.
    minibatch_sparse_pca = MiniBatchSparsePCA(
        n_components=n_components,
        alpha=alpha,
        n_iter=n_iter,
        batch_size=batch_size,
        random_state=random_state,
    )
    X_minibatch_sparse_pca = minibatch_sparse_pca.fit_transform(X)
    return X_minibatch_sparse_pca
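# Illustrative call of the helper above on random data (shapes are made up).
# Note the helper passes `n_iter`, which newer scikit-learn releases replaced
# with `max_iter`, so this sketch assumes a version that still accepts n_iter.
import numpy as np

X = np.random.RandomState(0).randn(30, 12)
X_low = decompose_minibatch_sparse_pca(X, n_components=3)
print(X_low.shape)  # (30, 3)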
def createMiniBatchSparsePCADecomposition(params):
    # params['method'] = {'lars', 'cd'}
    # params['alpha'] = {1}
    # params['ridge_alpha'] = {1}
    # was `MiniBatchSparsePCA()`, which silently ignored params
    cls = MiniBatchSparsePCA(**params)
    return cls
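# Hypothetical call matching the parameter grid sketched in the comments
# above (the concrete values here are illustrative, not from the original).
spca = createMiniBatchSparsePCADecomposition(
    {'method': 'cd', 'alpha': 1, 'ridge_alpha': 1})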
def pcaImgWrdMat(highDim, lowDim):
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset
    # method = options.method
    # acquire the category list
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    # the number of categories in category list
    # nCategory = len(catList)
    for catName in catList:
        print '%s : %d : %d\n' % (catName, highDim, lowDim)
        catPosFileName = rootDir + dataset + iwmDir + catName + str(highDim) + iwmext
        catPosData = np.loadtxt(catPosFileName, dtype=np.int, delimiter=' ')
        nPosImages = catPosData.shape[0]
        catNegFileName = rootDir + dataset + iwmDir + 'NEG' + catName + str(highDim) + iwmext
        catNegData = np.loadtxt(catNegFileName, dtype=np.int, delimiter=' ')
        nNegImages = catNegData.shape[0]
        catData = np.vstack((catPosData, catNegData))
        labels = np.vstack((np.ones((nPosImages, 1), np.int),
                            np.zeros((nNegImages, 1), np.int)))
        print 'pca...'
        pcaData = PCA(n_components=lowDim).fit(catData).transform(catData)
        pcaData = np.hstack((pcaData, labels))
        pcaDataFileName = rootDir + dataset + outputDir + catName + str(highDim) + str(lowDim) + '.pca'
        np.savetxt(pcaDataFileName, pcaData, fmt='%f', delimiter=' ')
        print 'ppca...'
        ppcaData = ProbabilisticPCA(n_components=lowDim).fit(catData).transform(catData)
        ppcaData = np.hstack((ppcaData, labels))
        ppcaDataFileName = rootDir + dataset + outputDir + catName + str(highDim) + str(lowDim) + '.ppca'
        np.savetxt(ppcaDataFileName, ppcaData, fmt='%f', delimiter=' ')
        print 'rpca...'
        rpcaData = RandomizedPCA(n_components=lowDim).fit(catData).transform(catData)
        rpcaData = np.hstack((rpcaData, labels))
        rpcaDataFileName = rootDir + dataset + outputDir + catName + str(highDim) + str(lowDim) + '.rpca'
        np.savetxt(rpcaDataFileName, rpcaData, fmt='%f', delimiter=' ')
        print 'kpca...'
        kpcaData = KernelPCA(n_components=lowDim).fit(catData).transform(catData)
        kpcaData = np.hstack((kpcaData, labels))
        kpcaDataFileName = rootDir + dataset + outputDir + catName + str(highDim) + str(lowDim) + '.kpca'
        np.savetxt(kpcaDataFileName, kpcaData, fmt='%f', delimiter=' ')
        print 'spca...'
        spcaData = MiniBatchSparsePCA(n_components=lowDim, n_iter=100).fit(catData).transform(catData)
        spcaData = np.hstack((spcaData, labels))
        spcaDataFileName = rootDir + dataset + outputDir + catName + str(highDim) + str(lowDim) + '.spca'
        np.savetxt(spcaDataFileName, spcaData, fmt='%f', delimiter=' ')
def test_spca_n_iter_deprecation():
    """Check that we raise a warning for the deprecation of `n_iter` and it is
    ignored when `max_iter` is specified.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 12, 10
    X = rng.randn(n_samples, n_features)

    warn_msg = "'n_iter' is deprecated in version 1.1 and will be removed"
    with pytest.warns(FutureWarning, match=warn_msg):
        MiniBatchSparsePCA(n_iter=2).fit(X)

    n_iter, max_iter = 1, 100
    with pytest.warns(FutureWarning, match=warn_msg):
        model = MiniBatchSparsePCA(
            n_iter=n_iter, max_iter=max_iter, random_state=0
        ).fit(X)
    assert model.n_iter_ > 1
    assert model.n_iter_ <= max_iter
class MBSPCA:
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = MiniBatchSparsePCA(*args, **kwargs)

    def fit(self, X, y):
        # Mark +/-inf as NaN, then drop rows that contain any NaN.
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_, y_ = X[nan_mask, :], y[nan_mask]
        # was `Z.shape[0] != X.shape[0]`, which can never differ since Z is
        # built from X column-wise; compare against the filtered X_ instead
        if Z.shape[0] != X_.shape[0]:
            print('FIT: the sample contains NaNs, they were dropped\t'
                  'N of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            self.model.fit(X_)

    def predict(self, X):
        # `numpy.concatenate([X], axis=1)` in the original was a no-op copy
        Z = numpy.array(X, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if Z.shape[0] != X_.shape[0]:
            print('PREDICT: the sample contains NaNs, they were dropped\t'
                  'N of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            # Re-insert NaN rows so the output aligns with the input rows.
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan, dtype=numpy.float64)
            Z[nan_mask, :] = predicted
            return Z
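# A short usage sketch for the wrapper above (data and shapes are
# illustrative): rows containing NaN are dropped before fitting, and
# predict() returns NaN rows in their original positions.
import numpy
import pandas
from sklearn.decomposition import MiniBatchSparsePCA

X = numpy.random.RandomState(0).randn(10, 6)
X[3, 2] = numpy.nan                  # one corrupted row
y = numpy.arange(10, dtype=float)

mbspca = MBSPCA(rfe_cv=False, n_components=2, random_state=0)
mbspca.fit(X, y)                     # fits on the 9 clean rows
Z = mbspca.predict(X)                # shape (10, 2); row 3 is all NaN
print(numpy.isnan(Z[3]).all())       # True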
def cluster_sk_mini_batch_sparse_pca(content):
    """Run MiniBatchSparsePCA with the request parameters and return a JSON
    response with the embedding, the components, and the iteration count."""
    _config = MiniBatchSparsePCA(n_components=content['n_components'],
                                 alpha=content['alpha'],
                                 ridge_alpha=content['ridge_alpha'],
                                 n_iter=content['n_iter'],
                                 callback=None,
                                 batch_size=content['batch_size'],
                                 verbose=0,
                                 shuffle=content['shuffle'],
                                 n_jobs=-1,
                                 method=content['sk_method'],
                                 random_state=None)
    _result = _config.fit_transform(content['data'])
    return httpWrapper(json.dumps({
        'result': _result.tolist(),
        'components': _config.components_.tolist(),
        'iter': _config.n_iter_
    }))
def test_spca_early_stopping(global_random_seed):
    """Check that `tol` and `max_no_improvement` act as early stopping."""
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 50, 10
    X = rng.randn(n_samples, n_features)

    # vary the tolerance to force the early stopping of one of the models
    model_early_stopped = MiniBatchSparsePCA(
        max_iter=100, tol=0.5, random_state=global_random_seed
    ).fit(X)
    model_not_early_stopped = MiniBatchSparsePCA(
        max_iter=100, tol=1e-3, random_state=global_random_seed
    ).fit(X)
    assert model_early_stopped.n_iter_ < model_not_early_stopped.n_iter_

    # force the max number of no improvement to a large value to check that
    # it does help to early stop
    model_early_stopped = MiniBatchSparsePCA(
        max_iter=100, tol=1e-6, max_no_improvement=2,
        random_state=global_random_seed
    ).fit(X)
    model_not_early_stopped = MiniBatchSparsePCA(
        max_iter=100, tol=1e-6, max_no_improvement=100,
        random_state=global_random_seed
    ).fit(X)
    assert model_early_stopped.n_iter_ < model_not_early_stopped.n_iter_
def MiniBatchSparsePCA(self, alpha: float = 1, batch_size: int = None, **kwargs):
    """
    MiniBatchSparsePCA dimensionality reduction.

    Note: it relies mainly on L1 regularization, which drives the loadings of
    non-principal components to zero. MiniBatchSparsePCA performs the PCA
    reduction on a subset of the samples at a time for a given number of
    iterations, which works around the slow eigendecomposition on large
    sample sets; the trade-off is that the accuracy of the reduction may drop.

    :param alpha:
    :param batch_size:
    :param kwargs: additional keyword arguments, chosen as needed
    :return:
    """
    assert self.n_components is None or isinstance(
        self.n_components, int), 'n_components must be None or an int'
    kwargs['n_components'] = self.n_components
    kwargs['batch_size'] = batch_size
    kwargs['alpha'] = alpha
    # was `MiniBatchSparsePCA(kwargs)`, which passed the whole dict
    # positionally as n_components instead of unpacking it
    self.compressor = MiniBatchSparsePCA(**kwargs)
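# To illustrate the L1-induced sparsity the docstring above describes, a
# minimal sketch on toy data (values are illustrative; assumes the standard
# scikit-learn import).
import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA

rng = np.random.RandomState(0)
X = rng.randn(100, 30)
spca = MiniBatchSparsePCA(n_components=5, alpha=1, batch_size=10,
                          random_state=0).fit(X)
# With a non-zero alpha, many loadings are driven exactly to zero.
print("fraction of zero loadings: %.2f" % np.mean(spca.components_ == 0))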
def pcaImgWrdMat(highDim, lowDim):
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset
    # method = options.method
    # acquire the category list
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    # the number of categories in category list
    # nCategory = len(catList)
    for catName in catList:
        print '%s : %d : %d\n' % (catName, highDim, lowDim)
        catPosFileName = rootDir + dataset + iwmDir + catName + str(highDim) + iwmext
        catPosData = np.loadtxt(catPosFileName, dtype=np.int, delimiter=' ')
        nPosImages = catPosData.shape[0]
        catNegFileName = rootDir + dataset + iwmDir + 'NEG' + catName + str(highDim) + iwmext
        catNegData = np.loadtxt(catNegFileName, dtype=np.int, delimiter=' ')
        nNegImages = catNegData.shape[0]
        catData = np.vstack((catPosData, catNegData))
        labels = np.vstack((np.ones((nPosImages, 1), np.int),
                            np.zeros((nNegImages, 1), np.int)))
        print 'spca...'
        try:
            spcaDataFileName = rootDir + dataset + outputDir + catName + str(highDim) + str(lowDim) + '.spca'
            if os.path.exists(spcaDataFileName):
                continue
            spcaData = MiniBatchSparsePCA(n_components=lowDim, n_iter=100).fit(catData).transform(catData)
            spcaData = np.hstack((spcaData, labels))
            np.savetxt(spcaDataFileName, spcaData, fmt='%f', delimiter=' ')
        except Exception:  # was a bare except, which would also swallow KeyboardInterrupt
            print 'error: SPCA : %s : %d : %d' % (catName, highDim, lowDim)
def batch_minibatch_sparse_pca(scaled_split_dfs, n_components, batch=50):
    '''
    Performs minibatch sparse PCA for each subset in a dictionary of x and y
    train and x and y test splits. The number of resulting components is set
    by n_components; for best results, n_components should be smaller than
    the number of samples. Batch determines how many samples are analyzed at
    a time. Returns two dictionaries: one with the sparse PCA features and
    one with information about the sparse PCA performed.
    '''
    sparse_pca_dfs = copy.deepcopy(scaled_split_dfs)
    sparse_mb_pca = MiniBatchSparsePCA(n_components=n_components,
                                       batch_size=batch, random_state=0)
    sparse_pca_ncomponents = {}
    sparse_pca_stats = {}
    for key in sparse_pca_dfs:
        sparse_mb_pca.fit(sparse_pca_dfs[key]['x_train'])
        sparse_pca_x_train = sparse_mb_pca.transform(
            sparse_pca_dfs[key]['x_train'])
        sparse_pca_dfs[key]['x_train'] = sparse_pca_x_train
        sparse_pca_x_test = sparse_mb_pca.transform(
            scaled_split_dfs[key]['x_test'])
        sparse_pca_dfs[key]['x_test'] = sparse_pca_x_test
        sparse_pca_ncomponents[key] = sparse_pca_x_train.shape[1]
    sparse_pca_stats['ncomponents'] = sparse_pca_ncomponents
    return sparse_pca_dfs, sparse_pca_stats
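# A toy call of the helper above. The dictionary layout is inferred from how
# the function indexes it; the key name 'fold0' is made up for illustration.
import numpy as np

rng = np.random.RandomState(0)
scaled_split_dfs = {
    'fold0': {                       # hypothetical split label
        'x_train': rng.randn(40, 8),
        'x_test': rng.randn(10, 8),
    }
}
dfs, stats = batch_minibatch_sparse_pca(scaled_split_dfs, n_components=3,
                                        batch=10)
print(dfs['fold0']['x_train'].shape)  # (40, 3)
print(stats['ncomponents'])           # {'fold0': 3}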
alpha = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]  # sparsity parameter. Higher = more sparse
wavelims = (4000, 5700)

# do PCA
X, V, Z, Xs_hat, X_hat, wavelengths, ev, n = do_PCA(dir, wavelims, n_pcs)

# get residuals
X_residual = X - X_hat
means, stds = np.mean(X_residual, axis=0), np.std(X_residual, axis=0)
#Xs_residual = (X_residual - means)/stds
Xs_residual = X_residual - means

# sparse PCA
print "starting sparse PCA"
for a in alpha:
    spca = MiniBatchSparsePCA(n_components=n_spcs, alpha=a)
    spca.fit(Xs_residual)

    f, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, sharex=True)
    for i in range(len(V)):
        ax1.plot(wavelengths, V[i, :], ',', alpha=1 - 0.1 * i,
                 label='pc %d' % i)
    for i in range(5):
        ax2.plot(wavelengths, spca.components_[i, :], ',', alpha=1 - 0.1 * i,
                 label='sparse pc %d' % i)
    for i in range(5):
        ax3.plot(wavelengths, spca.components_[i + 5, :], ',',
                 alpha=1 - 0.1 * i, label='sparse pc %d' % i)
    for i in range(5):
        ax4.plot(wavelengths, spca.components_[i + 10, :], ',',
                 alpha=1 - 0.1 * i, label='sparse pc %d' % i)
    ax1.legend(fontsize=10)
    ax2.legend(fontsize=10)
    ax4.set_xlabel('wavelength')
    ax1.set_ylabel('eigenvector values')
for idx in range(len(fea)):
    if idx in select_word_idx_list:
        # print idx, len(fea)
        li.append(fea[idx])
ALL_FEA[i] = li

NeedPCA = False
if NeedPCA:
    print 'Len of ALL_FEA: ', len(ALL_FEA)
    print 'Start PCA ... '
    # pca = KernelPCA(n_components = num_af_pca,)
    pca = MiniBatchSparsePCA(n_components=num_af_pca, n_jobs=4, verbose=1,
                             batch_size=len(ALL_FEA) / 10)
    new_all_fea = pca.fit_transform(np.array(ALL_FEA))
    # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
    # pca = LDA(n_components = num_af_pca)
    # new_all_fea = pca.fit_transform(np.array(ALL_FEA))
    ALL_FEA = new_all_fea
    print '\nFinish PCA ... '

allSongs = []
head = 0
tail = 0
glob.glob(f"{TOP_DIR}/var/user_vectors/*")[:SAMPLE_SIZE]), desc="load example users..."): args.append((idx, filename)) with ProcessPoolExecutor(max_workers=psutil.cpu_count()) as exe: for ret in tqdm(exe.map(load, args), total=len(args), desc="load example users..."): if ret is None: continue idx, vec = ret for term_idx, weight in vec.items(): mtx[idx, term_idx] = weight print(f"[{FILE}] start to train TruncatedSVD...") transformer = MiniBatchSparsePCA(n_components=500, batch_size=100, random_state=0) transformer.fit(mtx.todense()) elapsed_time = time.time() - start_time print(f"[{FILE}] elapsed_time = {elapsed_time}") print(f"[{FILE}] start to transform matrix...") X_transformed = transformer.transform(mtx[:5000]) print(X_transformed) print(X_transformed.shape) print(type(X_transformed)) joblib.dump(transformer, f"{TOP_DIR}/var/transformer.joblib") if "--transform" in sys.argv: transformer = joblib.load(f"{TOP_DIR}/var/transformer.joblib") """ 1000個づつ分割 """ filenames = glob.glob(f"{HOME}/var/user_vectors/*")
plt.title(title)
plt.legend(loc='best')

# Visualization before the transform; only the first two dimensions are shown
plt.figure(1)
plot_func('origin data')

# KernelPCA is a non-linear reduction; LDA can only be used for supervised,
# class-aware reduction. ICA is usually used to separate superimposed
# signals rather than to reduce dimensionality.
models_list = [('LDA', LinearDiscriminantAnalysis(n_components=2)),
               ('PCA', PCA(n_components=2, random_state=0)),
               ('PCARand', PCA(n_components=2, random_state=0,
                               svd_solver='randomized')),
               ('IncrementalPCA', IncrementalPCA(n_components=2,
                                                 batch_size=10, whiten=True)),
               ('FactorAnalysis', FactorAnalysis(n_components=2,
                                                 max_iter=500)),
               ('FastICA', FastICA(n_components=2, random_state=0)),
               ('KernelPCA', KernelPCA(n_components=2, random_state=0,
                                       kernel='rbf')),
               ('SparsePCA', SparsePCA(n_components=2, random_state=0,
                                       verbose=True)),
               ('MiniBatchSparsePCA', MiniBatchSparsePCA(n_components=2,
                                                         verbose=True,
                                                         batch_size=10,
                                                         random_state=0)),
               ('DictionaryLearning', DictionaryLearning(n_components=2,
                                                         verbose=True,
                                                         random_state=0)),
               ('MiniBatchDictionaryLearning',
                MiniBatchDictionaryLearning(n_components=2, batch_size=5,
                                            random_state=0, alpha=0.1))]

model = namedtuple('models', ['mod_name', 'mod_ins'])
for i in range(len(models_list)):
    mod = model(*models_list[i])
    if mod.mod_name == 'LDA':
        mod.mod_ins.fit(X, y)
        X_new = mod.mod_ins.transform(X)
    else:
        X_new = mod.mod_ins.fit_transform(X)
    plt.figure(i + 2)
    plot_func(mod.mod_name + ' transformed data')
    print(mod.mod_name + ' finished!')
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
#%%
from sklearn.pipeline import Pipeline
from sklearn.decomposition import MiniBatchSparsePCA
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.decomposition import FastICA
#%%
mbsp = MiniBatchSparsePCA()
mbdl = MiniBatchDictionaryLearning()
fica = FastICA()
logreg = LogisticRegression(class_weight='balanced', solver='sag',
                            max_iter=5000)
rf = BalancedRandomForestClassifier(class_weight='balanced')

p_mbsp = {'spca__n_components': [5, 15, 25]}
p_mbdl = {'dl__n_components': [5, 15, 25]}
p_fica = {'ica__n_components': [5, 15, 25]}

# narrowed grids that override the fuller grids above
p_mbsp = {'spca__n_components': [25]}
p_mbdl = {'dl__n_components': [25]}  # was "{ 'dl__n_components' :25] }", a syntax error
p_fica = {'ica__n_components': [25]}
from sklearn.ensemble import IsolationForest

clf = IsolationForest(random_state=0, n_jobs=-1, contamination=0.25).fit(X)
A = clf.predict(X)
print((A == -1).mean(), (labels != 0).mean(),
      ((A == -1) == (labels != 0)).mean())

#%%
from sklearn.decomposition import MiniBatchSparsePCA

X = data_pts_1
mbsp = MiniBatchSparsePCA(n_components=20, alpha=1, ridge_alpha=0.01,
                          batch_size=4, n_jobs=-1)
mbsp.fit(X)
#X_transformed = transformer.transform(X)
# X_transformed.shape
# plt.plot(mbsp.components_[0,:]); plt.show()

#%%
X = data_pts_1
from sklearn import decomposition

mbdl = decomposition.MiniBatchDictionaryLearning(n_jobs=-1, n_components=20,
    'pmf': lambda args: NimfaWrapper(nimfa.Pmf, args.dims),
    'psmf': lambda args: NimfaWrapper(nimfa.Psmf, args.dims),
    'saucie': lambda args: SaucieWrapper(args.dims)
        if SAUCIE_AVAILABLE else _embedding_error(),
    'scscope': lambda args: ScScope(args.dims)
        if SCSCOPE_AVAILABLE else _embedding_error(),  # was missing the call parentheses
    'sepnmf': lambda args: NimfaWrapper(nimfa.SepNMF, args.dims),
    'spca': lambda args: SparsePCA(
        n_components=args.dims, n_jobs=args.njobs, normalize_components=True),
    'spca-batch': lambda args: MiniBatchSparsePCA(
        n_components=args.dims, n_jobs=args.njobs, normalize_components=True),
    'spectral': lambda args: SpectralEmbedding(n_components=args.dims,
                                               n_jobs=args.njobs),
    'snmf': lambda args: NimfaWrapper(nimfa.Snmf, args.dims),
    'srp': lambda args: SparseRandomProjection(n_components=args.dims),
    'tga': lambda args: TGA(n_components=args.dims)
        if TGA_AVAILABLE else _embedding_error(),
    'tsvd': lambda args: TruncatedSVD(n_components=args.dims),
    'tsne': lambda args: TSNE(n_components=args.dims),
    'umap': lambda args: umap.UMAP(n_components=args.dims)