import numpy
import umap
from sklearn import cluster, decomposition
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding, MDS
from sklearn.random_projection import GaussianRandomProjection


def dim_reduction(X, n_components=2, mode="MDS"):

    """Reduces the number of dimensions in which a dataset is defined.

    Arguments

    X               -   NumPy array with shape (N,M), where N is the number
                        of observations, and M the number of features.

    Keyword Arguments

    n_components    -   Intended number of features after dimensionality
                        reduction. Default = 2

    mode            -   String that defines the type of dim reduction:
                        - None
                        - "PCA" principal component analysis
                        - "ICA" independent component analysis
                        - "FA" factor analysis
                        - "TSNE" t-distributed stochastic neighbour embedding
                        - "UMAP" uniform manifold approximation and projection
                        - "RANDOMPROJECTION" Gaussian random projection
                        - "FEATUREAGGLOMERATION" feature agglomeration
                        - "ISOMAP" isometric mapping
                        - "LLE" locally linear embedding
                        - "HESSIAN" Hessian eigenmaps
                        - "MLLE" modified locally linear embedding
                        - "LTSA" local tangent space alignment
                        - "MDS" multi-dimensional scaling
                        - "DICTIONARY" dictionary learning
                        - "TSVD" truncated SVD (also accepted as "LSE", i.e.
                          latent semantic analysis, LSA)
                        Default = "MDS"

    Returns

    X               -   NumPy array with shape (N, n_components), where N is
                        the number of observations: the input data projected
                        onto the reduced number of dimensions.
    """

    # Make sure the mode is in all caps.
    if type(mode) == str:
        mode = mode.upper()

    # Copy X into a new matrix.
    X_ = numpy.copy(X)

    # None
    if mode is None or mode == "NONE":
        # Literally nothing happens here for now.
        print("Fart noise!")

    # Principal component analysis.
    elif mode == 'PCA':
        # Initialise a new PCA.
        pca = decomposition.PCA(n_components=n_components)
        # Fit the PCA with the data.
        pca.fit(X_)
        # Transform the data.
        X_ = pca.transform(X_)

    # Independent component analysis.
    elif mode == 'ICA':
        # Initialise a new ICA.
        ica = decomposition.FastICA(n_components=n_components)
        # Fit the ICA with the data.
        ica.fit(X_)
        # Transform the data.
        X_ = ica.transform(X_)

    # Factor analysis.
    elif mode == 'FA':
        # Initialise a new factor analysis.
        fa = decomposition.FactorAnalysis(n_components=n_components)
        # Perform the factor analysis on the data.
        fa.fit(X_)
        # Transform the data.
        X_ = fa.transform(X_)

    # T-distributed stochastic neighbour embedding.
    elif mode == 'TSNE':
        # Run several t-SNEs to find a good one.
        n_runs = 10
        Xs_ = []
        dkl = numpy.ones(n_runs, dtype=float) * numpy.inf
        print("Running %d t-SNEs to find lowest Kullback-Leibler divergence." \
            % (n_runs))
        for i in range(n_runs):
            # Initialise a new t-distributed stochastic neighbour embedding
            # (t-SNE) analysis.
            tsne = TSNE(n_components=n_components)
            # Copy the data into a new variable.
            Xs_.append(numpy.copy(X_))
            # Fit to and transform the data.
            Xs_[i] = tsne.fit_transform(Xs_[i])
            # Get the KL-divergence.
            dkl[i] = tsne.kl_divergence_
            print("\tCurrent KL-divergence = %.5f" % (dkl[i]))
        # Choose the solution with the lowest KL-divergence.
        X_ = numpy.copy(Xs_[numpy.argmin(dkl)])
        # Get rid of all the excess X copies.
        del Xs_

    # Uniform manifold approximation and projection.
    elif mode == 'UMAP':
        # Create a new UMAP instance.
        um = umap.UMAP(n_components=n_components, min_dist=0.01)
        # Fit and transform X.
        X_ = um.fit_transform(X_)

    # Gaussian random projection.
    elif mode == 'RANDOMPROJECTION':
        # Create a new GaussianRandomProjection instance.
        rp = GaussianRandomProjection(n_components=n_components)
        # Fit and transform X.
        X_ = rp.fit_transform(X_)

    # Feature agglomeration.
    elif mode == 'FEATUREAGGLOMERATION':
        # Create a new FeatureAgglomeration instance.
        fa = cluster.FeatureAgglomeration(n_clusters=n_components)
        # Fit and transform X.
        X_ = fa.fit_transform(X_)

    # Isomap.
    elif mode == 'ISOMAP':
        # Create a new Isomap instance.
        im = Isomap(n_components=n_components)
        # Fit and transform X.
        X_ = im.fit_transform(X_)

    # Locally linear embedding.
    elif mode == 'LLE':
        # Create a new LocallyLinearEmbedding instance.
        lle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components,
            method='standard', eigen_solver='dense')
        # Fit and transform X.
        X_ = lle.fit_transform(X_)

    # Hessian eigenmaps.
    elif mode == 'HESSIAN':
        # Create a new LocallyLinearEmbedding instance.
        hlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components,
            method='hessian', eigen_solver='dense')
        # Fit and transform X.
        X_ = hlle.fit_transform(X_)

    # Modified locally linear embedding.
    elif mode == 'MLLE':
        # Create a new LocallyLinearEmbedding instance.
        mlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components,
            method='modified', eigen_solver='dense')
        # Fit and transform X.
        X_ = mlle.fit_transform(X_)

    # Local tangent space alignment.
    elif mode == 'LTSA':
        # Create a new LocallyLinearEmbedding instance.
        ltsa = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components,
            method='ltsa', eigen_solver='dense')
        # Fit and transform X.
        X_ = ltsa.fit_transform(X_)

    # Multi-dimensional scaling.
    elif mode == 'MDS':
        # Create a new MDS instance.
        mds = MDS(n_components=n_components)
        # Fit and transform X.
        X_ = mds.fit_transform(X_)

    # Dictionary learning.
    elif mode == "DICTIONARY":
        # Create a DictionaryLearning instance.
        # The 'omp' transform algorithm orthogonalises the whole thing, whereas
        # a lasso solution with a low alpha leaves a slightly more scattered
        # solution.
        dictlearn = decomposition.DictionaryLearning(
            n_components=n_components,
            fit_algorithm='cd',
            transform_algorithm='lasso_cd',
            transform_alpha=0.1,
            )
        # Fit and transform X.
        X_ = dictlearn.fit_transform(X)

    # Truncated SVD (also known as latent semantic analysis, LSA; accepted
    # here under the legacy 'LSE' label).
    elif mode in ['TSVD', 'LSE']:
        tsvd = decomposition.TruncatedSVD(n_components=n_components)
        # Fit and transform X.
        X_ = tsvd.fit_transform(X)

    else:
        raise Exception("Unrecognised dimensionality reduction mode '%s'" % (mode))

    return X_
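# A minimal usage sketch for dim_reduction() above, assuming the imports
# listed with the function are installed; the data and mode choices here are
# purely illustrative.
if __name__ == "__main__":
    rng = numpy.random.default_rng(0)
    X_demo = rng.normal(size=(200, 16))      # 200 observations, 16 features
    for demo_mode in ("PCA", "MDS"):
        X_red = dim_reduction(X_demo, n_components=2, mode=demo_mode)
        print(demo_mode, X_red.shape)        # expected: (200, 2)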
cov_mat = np.cov(X_std.T) eig_vals, eig_vecs = np.linalg.eig(cov_mat) # feed the feature matrix to PCA from sklearn.preprocessing import StandardScaler feature_matrix = StandardScaler().fit_transform(feature_matrix) df_feature_matrix = pd.DataFrame(feature_matrix) df_feature_matrix.fillna(0, inplace=True) from sklearn import preprocessing data_scaled = pd.DataFrame(preprocessing.scale(df_feature_matrix), columns=df_feature_matrix.columns) pca = decomposition.PCA(n_components=5) pca1 = decomposition.PCA(n_components=30) X_std_pca = pca.fit_transform(data_scaled) X_std_pca1 = pca1.fit_transform(data_scaled) # X_std_pca.shape # calculate the explained variance of pca pcaExpVariance = pca.explained_variance_ # print("PCA variance= ", pcaExpVariance) pcaTransformed = pca.transform(feature_matrix) # pcaTransformed.shape # calculate explained variance ratio for analysis of no. of features # using 5 components variance = pca.explained_variance_ratio_ var = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3) * 100)
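# Sketch: choosing the smallest number of components whose cumulative
# explained-variance ratio crosses a threshold -- the usual purpose of the
# cumulative `var` curve above. Assumes `data_scaled` from the code above;
# the 0.95 threshold is illustrative.
pca_full = decomposition.PCA().fit(data_scaled)
cumvar = np.cumsum(pca_full.explained_variance_ratio_)
n_keep = int(np.searchsorted(cumvar, 0.95)) + 1   # first count reaching ~95%
print("components for ~95%% variance: %d" % n_keep)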
def estim_class_model(features, nb_classes, estim_model='GMM', pca_coef=None, use_scaler=True, max_iter=99): """ create pipeline (scaler, PCA, model) over several options how to cluster samples and fit it on data :param ndarray features: :param int nb_classes: number of expected classes :param float pca_coef: range (0, 1) or None :param bool use_scaler: whether use a scaler :param str estim_model: used model :param int max_iter: :return: >>> np.random.seed(0) >>> fts = np.row_stack([np.random.random((50, 3)) - 1, ... np.random.random((50, 3)) + 1]) >>> mm = estim_class_model(fts, 2) >>> mm.predict_proba(fts).shape (100, 2) >>> mm = estim_class_model(fts, 2, estim_model='GMM_kmeans', ... pca_coef=0.95, max_iter=3) >>> mm.predict_proba(fts).shape (100, 2) >>> mm = estim_class_model(fts, 2, estim_model='GMM_Otsu', max_iter=3) >>> mm.predict_proba(fts).shape (100, 2) >>> mm = estim_class_model(fts, 2, estim_model='kmeans_quantiles', ... use_scaler=False, max_iter=3) >>> mm.predict_proba(fts).shape (100, 2) >>> mm = estim_class_model(fts, 2, estim_model='BGM', max_iter=3) >>> mm.predict_proba(fts).shape (100, 2) >>> mm = estim_class_model(fts, 2, estim_model='Otsu', max_iter=3) >>> mm.predict_proba(fts).shape (100, 2) """ components = [] if use_scaler: components += [('std_scaler', preprocessing.StandardScaler())] if pca_coef is not None: components += [('reduce_dim', decomposition.PCA(pca_coef))] nb_inits = max(1, int(np.sqrt(max_iter))) # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html mm = mixture.GaussianMixture(n_components=nb_classes, covariance_type='full', n_init=nb_inits, max_iter=max_iter) # split the model and used initilaisation if '_' in estim_model: init_type = estim_model.split('_')[-1] estim_model = estim_model.split('_')[0] else: init_type = '' y = None if estim_model == 'GMM': # model = estim_class_model_gmm(features, nb_classes) if init_type == 'kmeans': mm.set_params(n_init=1) # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html kmeans = cluster.KMeans(n_clusters=nb_classes, init='k-means++', n_jobs=-1) y = kmeans.fit_predict(features) elif init_type == 'Otsu': mm.set_params(n_init=1) y = compute_multivarian_otsu(features) elif estim_model == 'kmeans': # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html mm.set_params(max_iter=1) init_type = 'quantiles' if init_type == 'quantiles' else 'k-means++' _, y = estim_class_model_kmeans(features, nb_classes, init_type=init_type, max_iter=max_iter) logging.info('compute probability of each feature to all component') elif estim_model == 'BGM': mm = mixture.BayesianGaussianMixture(n_components=nb_classes, covariance_type='full', n_init=nb_inits, max_iter=max_iter) elif estim_model == 'Otsu' and nb_classes == 2: mm.set_params(max_iter=1, n_init=1) y = compute_multivarian_otsu(features) components += [('model', mm)] # compose the pipeline model = pipeline.Pipeline(components) if y is not None: # fit with examples model.fit(features, y) else: # fit from scrach model.fit(features) return model
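# A stripped-down sketch of what estim_class_model() composes in its default
# 'GMM' branch: an optional scaler, an optional PCA, and a Gaussian mixture
# wrapped in a Pipeline. The parameter values below are illustrative, not the
# function's defaults.
import numpy as np
from sklearn import decomposition, mixture, pipeline, preprocessing

fts = np.row_stack([np.random.random((50, 3)) - 1,
                    np.random.random((50, 3)) + 1])
model = pipeline.Pipeline([
    ('std_scaler', preprocessing.StandardScaler()),
    ('reduce_dim', decomposition.PCA(0.95)),   # keep ~95% of the variance
    ('model', mixture.GaussianMixture(n_components=2, covariance_type='full')),
])
model.fit(fts)
print(model.predict_proba(fts).shape)          # (100, 2)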
parser = argparse.ArgumentParser() parser.add_argument('-model', type=str, default='linear_model') parser.add_argument('-featuredim', type=int, default=20) parser.add_argument('-inputfeatures', type=str, default='../data/features_ALL.txt') parser.add_argument('-labels', type=str, default='../data/ratings.txt') args = parser.parse_args() features = np.loadtxt(args.inputfeatures, delimiter=',') #features = preprocessing.scale(features) features_train = features[0:-50] features_test = features[-50:] pca = decomposition.PCA(n_components=args.featuredim) pca.fit(features_train) features_train = pca.transform(features_train) features_test = pca.transform(features_test) ratings = np.loadtxt(args.labels, delimiter=',') #ratings = preprocessing.scale(ratings) ratings_train = ratings[0:-50] ratings_test = ratings[-50:] if args.model == 'linear_model': regr = linear_model.LinearRegression() elif args.model == 'svm': regr = svm.SVR() elif args.model == 'rf': regr = RandomForestRegressor(n_estimators=50,
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])  # set up the pipeline

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

pca.fit(X_digits)

plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
# Plot the explained variance of each component; the larger the variance,
# the more that component deserves to be kept.
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)

# Parameters of pipelines can be set using '__' separated parameter names:
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))  # automatic hyperparameter tuning
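# Sketch of one way to finish the grid search above: fit it on the digits data
# already loaded and inspect the selected parameters.
estimator.fit(X_digits, y_digits)
print(estimator.best_params_)
print(estimator.best_estimator_.named_steps['pca'].n_components)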
all_projections['LAMP'] = (vp.LAMP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'sample_type': ['random', 'clustering_centroid']}) all_projections['LE'] = (manifold.SpectralEmbedding(), {'n_components': [2], 'affinity': ['nearest_neighbors'], 'random_state': [42]}) all_projections['LISO'] = (vp.LandmarkIsomap(), {'verbose': [False], 'n_neighbors': [4, 8, 16], 'dissimilarity_type': ['euclidean']}) all_projections['LLC'] = (drtoolbox.LLC(), {'k': [8, 12], 'n_analyzers': [10, 20], 'max_iter': [200, 400], 'verbose': [False]}) all_projections['LLE'] = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['standard'], 'eigen_solver': ['dense'], 'random_state': [42]}) all_projections['LLTSA'] = (tapkee.LinearLocalTangentSpaceAlignment(), {'n_neighbors': [4, 7, 11], 'verbose': [False]}) # subject to "eigendecomposition failed" errors (Eigen's NoConvergence) all_projections['LMDS'] = (tapkee.LandmarkMDS(), {'n_neighbors': [4, 7, 11], 'verbose': [False]}) all_projections['LMNN'] = (drtoolbox.LMNN(), {'k': [3, 5, 7], 'verbose': [False]}) all_projections['LMVU'] = (drtoolbox.LandmarkMVU(), {'k1': [3, 5, 7], 'k2': [8, 12, 15], 'verbose': [False]}) all_projections['LPP'] = (tapkee.LocalityPreservingProjections(), {'n_neighbors': [4, 7, 11], 'verbose': [False]}) # subject to "eigendecomposition failed" errors (Eigen's NoConvergence) all_projections['LSP'] = (vp.LSP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'n_neighbors': [4, 8, 16], 'control_point_type': ['random', 'kmeans'], 'dissimilarity_type': ['euclidean']}) all_projections['LTSA'] = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['ltsa'], 'eigen_solver': ['dense'], 'random_state': [42]}) all_projections['MC'] = (drtoolbox.ManifoldChart(), {'n_analyzers': [10, 20], 'max_iter': [200, 400], 'verbose': [False]}) all_projections['MCML'] = (drtoolbox.MCML(), {'verbose': [False]}) all_projections['MDS'] = (manifold.MDS(), {'n_components': [2], 'n_init': [2, 4], 'metric': [True], 'max_iter': [300, 500], 'random_state': [42]}) all_projections['MLLE'] = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['modified'], 'eigen_solver': ['dense'], 'random_state': [42]}) all_projections['MVU'] = (drtoolbox.MVU(), {'k': [8, 12, 15], 'verbose': [False]}) all_projections['NMDS'] = (manifold.MDS(), {'n_components': [2], 'n_init': [2, 4], 'metric': [False], 'max_iter': [300, 500], 'random_state': [42]}) all_projections['NMF'] = (decomposition.NMF(), {'n_components': [2], 'init': ['random', 'nndsvdar'], 'beta_loss': ['frobenius'], 'max_iter': [200, 400], 'alpha': [0, 0.5], 'l1_ratio': [0.0, 0.5], 'random_state': [42]}) all_projections['PBC'] = (vp.ProjectionByClustering(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'init_type': ['fastmap', 'random'], 'dissimilarity_type': ['euclidean'], 'cluster_factor': [1.5, 4.5, 9.0]}) all_projections['PCA'] = (decomposition.PCA(), {'n_components': [2], 'random_state': [42]}) all_projections['PLSP'] = (vp.PLSP(), {'dissimilarity_type': ['euclidean'], 'verbose': [False], 'sample_type': ['clustering']}) all_projections['PPCA'] = (drtoolbox.ProbPCA(), {'max_iter': [200, 400], 'verbose': [False]}) all_projections['RSAM'] = 
(vp.RapidSammon(), {'verbose': [False], 'dissimilarity_type': ['euclidean']}) all_projections['SPCA'] = (decomposition.SparsePCA(), {'n_components': [2], 'alpha': [0.01, 0.1, 0.5], 'ridge_alpha': [0.05, 0.05, 0.5], 'max_iter': [1000, 2000], 'tol': [1e-08], 'method': ['lars'], 'random_state': [42], 'normalize_components': [True]}) all_projections['SPE'] = (tapkee.StochasticProximityEmbedding(), {'n_neighbors': [6, 12, 18], 'n_updates': [20, 70], 'max_iter': [0], 'verbose': [False]}) all_projections['SRP'] = (random_projection.SparseRandomProjection(), {'n_components': [2], 'density': ['auto'], 'random_state': [42]}) all_projections['TSNE'] = (mtsne.MTSNE(), {'n_components': [2], 'perplexity': [5.0, 15.0, 30.0, 50.0], 'early_exaggeration': [6.0, 12.0, 18.0], 'learning_rate': [200.0], 'n_iter': [1000, 3000], 'n_iter_without_progress': [300], 'min_grad_norm': [1e-07], 'metric': ['euclidean'], 'init': ['random'], 'random_state': [42], 'method': ['barnes_hut'], 'angle': [0.5], 'n_jobs': [4]}) all_projections['TSVD'] = (decomposition.TruncatedSVD(), {'n_components': [2], 'algorithm': ['randomized'], 'n_iter': [5, 10], 'random_state': [42]}) all_projections['UMAP'] = (umap.UMAP(), {'n_components': [2], 'random_state': [42], 'n_neighbors': [5, 10, 15], 'metric': ['euclidean'], 'init': ['spectral', 'random'], 'min_dist': [0.001, 0.01, 0.1, 0.5], 'spread': [1.0], 'angular_rp_forest': [False]})
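# Sketch (assumption, not part of the original grid definition): one way such
# a table is typically consumed is to iterate every (projection, parameter
# combination) pair and project a dataset X. This presumes each entry follows
# the sklearn set_params/fit_transform convention, which holds for the sklearn
# estimators; the vp/tapkee/drtoolbox/mtsne wrappers are assumed to mimic it.
from sklearn.model_selection import ParameterGrid

def run_all_projections(all_projections, X):
    results = {}
    for name, (proj, grid) in all_projections.items():
        for params in ParameterGrid(grid):
            embedding = proj.set_params(**params).fit_transform(X)
            results[(name, tuple(sorted(params.items())))] = embedding
    return results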
cmap=cmap, interpolation='nearest', vmin=-vmax, vmax=vmax) plt.xticks(()) plt.yticks(()) plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.) # ############################################################################# # List of the different estimators, whether to center and transpose the # problem, and whether the transformer uses the clustering API. estimators = [ ('Eigenfaces - PCA using randomized SVD', decomposition.PCA(n_components=n_components, svd_solver='randomized', whiten=True), True), ('Non-negative components - NMF', decomposition.NMF(n_components=n_components, init='nndsvda', tol=5e-3), False), ('Independent components - FastICA', decomposition.FastICA(n_components=n_components, whiten=True), True), ('Sparse comp. - MiniBatchSparsePCA', decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8, n_iter=100, batch_size=3, random_state=rng), True), ('MiniBatchDictionaryLearning', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
from sklearn import datasets,decomposition,svm,metrics import numpy as np from sklearn.pipeline import Pipeline from matplotlib import pyplot as plt from sklearn.model_selection import train_test_split faces = datasets.fetch_olivetti_faces() print(faces.data.shape) fig = plt.figure(figsize=(8, 6)) for i in range(15): ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[]) ax.imshow(faces.images[i], cmap=plt.cm.bone) X_train, X_test, y_train, y_test = train_test_split(faces.data,faces.target, random_state=0) print(X_train.shape, X_test.shape) pca = decomposition.PCA(n_components=150, whiten=True) pca.fit(X_train) plt.imshow(pca.mean_.reshape(faces.images[0].shape),cmap=plt.cm.bone) plt.show() print(pca.components_.shape) fig = plt.figure(figsize=(16, 6)) for i in range(30): ax = fig.add_subplot(3, 10, i + 1, xticks=[], yticks=[]) ax.imshow(pca.components_[i].reshape(faces.images[0].shape),cmap=plt.cm.bone) X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print(X_train_pca.shape) print(X_test_pca.shape) clf = svm.SVC(C=5., gamma=0.001) clf.fit(X_train_pca, y_train) fig = plt.figure(figsize=(8, 6)) for i in range(15): ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
def plot_iris_mds(): iris = datasets.load_iris() X = iris.data y = iris.target # MDS fig = pylab.figure(figsize=(10, 4)) ax = fig.add_subplot(121, projection='3d') ax.set_axis_bgcolor('white') mds = manifold.MDS(n_components=3) Xtrans = mds.fit_transform(X) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') pylab.title("MDS on Iris data set in 3 dimensions") ax.view_init(10, -15) mds = manifold.MDS(n_components=2) Xtrans = mds.fit_transform(X) ax = fig.add_subplot(122) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black') pylab.title("MDS on Iris data set in 2 dimensions") filename = "mds_demo_iris.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") # PCA fig = pylab.figure(figsize=(10, 4)) ax = fig.add_subplot(121, projection='3d') ax.set_axis_bgcolor('white') pca = decomposition.PCA(n_components=3) Xtrans = pca.fit(X).transform(X) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') pylab.title("PCA on Iris data set in 3 dimensions") ax.view_init(50, -35) pca = decomposition.PCA(n_components=2) Xtrans = pca.fit_transform(X) ax = fig.add_subplot(122) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black') pylab.title("PCA on Iris data set in 2 dimensions") filename = "pca_demo_iris.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
y = iris_df[column_names[-1]] standard_scaler = sk_preprocessing.StandardScaler() X = standard_scaler.fit_transform(X) label_encoder = sk_preprocessing.LabelEncoder() y = label_encoder.fit_transform(y) cv_iris_log_model = sk_linear.LogisticRegression() cv_iris_model_quality = sk_model_selection.cross_val_score(cv_iris_log_model, X, y, cv=4, scoring="accuracy") print("Original model quality:") # print(cv_iris_model_quality) print(np.mean(cv_iris_model_quality)) pca_2_components = sk_decomposition.PCA(n_components=2) principal_components = pca_2_components.fit_transform(X) plt.scatter(principal_components[:, 0], principal_components[:, 1], c=y, cmap="prism") pca_all_components = sk_decomposition.PCA() pca_all_components.fit(X) print("Explained variance ratio:") print(pca_all_components.explained_variance_ratio_) cv_iris_2_log_model = sk_linear.LogisticRegression() cv_iris_2_model_quality = sk_model_selection.cross_val_score(cv_iris_2_log_model, principal_components, y, cv=4, scoring="accuracy") print("2 pc model quality:")
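# Sketch: the same comparison swept over several numbers of principal
# components, reusing the aliases and the scaled X / encoded y from above.
# The component counts are illustrative.
for n_pc in (1, 2, 3, 4):
    pcs = sk_decomposition.PCA(n_components=n_pc).fit_transform(X)
    quality = sk_model_selection.cross_val_score(
        sk_linear.LogisticRegression(), pcs, y, cv=4, scoring="accuracy")
    print(n_pc, np.mean(quality))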
import random
import multiprocessing
from collections import Counter

import numpy as np
import pandas as pd
import scipy.special as special
from pandas import DataFrame, Series
from sklearn import preprocessing
from sklearn import decomposition as sk_decomposition

np.random.seed(2019)
random.seed(2019)

## SVD/PCA reduction of the embedding files
for file, dim in [('data/video_w2v.pkl', 32), ('data/audio_w2v.pkl', 64)]:
    if file[-3:] == "pkl":
        df = pd.read_pickle(file)
    else:
        df = pd.read_csv(file)
    pca = sk_decomposition.PCA(n_components=dim, whiten=False, svd_solver='auto')
    pca.fit(df[df.columns[1:]])
    df1 = pd.DataFrame(pca.transform(df[df.columns[1:]]))
    df1.columns = df.columns[1:dim + 1]
    df1[df.columns[0]] = df[df.columns[0]].values
    df1.to_pickle(file[:-4] + '_svd_' + str(dim) + '.pkl')


def norm(train_df, test_df, features):
    df = pd.concat([train_df, test_df])[features]
    scaler = preprocessing.QuantileTransformer(random_state=0)
    scaler.fit(df[features])
def _eval_search_params(params_builder): search_params = {} for p in params_builder['param_set']: search_list = p['sp_list'].strip() if search_list == '': continue param_name = p['sp_name'] if param_name.lower().endswith(NON_SEARCHABLE): print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name) continue if not search_list.startswith(':'): safe_eval = SafeEval(load_scipy=True, load_numpy=True) ev = safe_eval(search_list) search_params[param_name] = ev else: # Have `:` before search list, asks for estimator evaluatio safe_eval_es = SafeEval(load_estimators=True) search_list = search_list[1:].strip() # TODO maybe add regular express check ev = safe_eval_es(search_list) preprocessors = ( preprocessing.StandardScaler(), preprocessing.Binarizer(), preprocessing.Imputer(), preprocessing.MaxAbsScaler(), preprocessing.Normalizer(), preprocessing.MinMaxScaler(), preprocessing.PolynomialFeatures(), preprocessing.RobustScaler(), feature_selection.SelectKBest(), feature_selection.GenericUnivariateSelect(), feature_selection.SelectPercentile(), feature_selection.SelectFpr(), feature_selection.SelectFdr(), feature_selection.SelectFwe(), feature_selection.VarianceThreshold(), decomposition.FactorAnalysis(random_state=0), decomposition.FastICA(random_state=0), decomposition.IncrementalPCA(), decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS), decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.NMF(random_state=0), decomposition.PCA(random_state=0), decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.TruncatedSVD(random_state=0), kernel_approximation.Nystroem(random_state=0), kernel_approximation.RBFSampler(random_state=0), kernel_approximation.AdditiveChi2Sampler(), kernel_approximation.SkewedChi2Sampler(random_state=0), cluster.FeatureAgglomeration(), skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS), skrebate.SURFstar(n_jobs=N_JOBS), skrebate.MultiSURF(n_jobs=N_JOBS), skrebate.MultiSURFstar(n_jobs=N_JOBS), imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.CondensedNearestNeighbour( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RepeatedEditedNearestNeighbours( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.InstanceHardnessThreshold( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NeighbourhoodCleaningRule( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RandomUnderSampler(random_state=0), imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.RandomOverSampler(random_state=0), imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS), imblearn.combine.SMOTEENN(random_state=0), imblearn.combine.SMOTETomek(random_state=0)) newlist = [] for obj in ev: if obj is None: newlist.append(None) elif obj == 'all_0': 
newlist.extend(preprocessors[0:36]) elif obj == 'sk_prep_all': # no KernalCenter() newlist.extend(preprocessors[0:8]) elif obj == 'fs_all': newlist.extend(preprocessors[8:15]) elif obj == 'decomp_all': newlist.extend(preprocessors[15:26]) elif obj == 'k_appr_all': newlist.extend(preprocessors[26:30]) elif obj == 'reb_all': newlist.extend(preprocessors[31:36]) elif obj == 'imb_all': newlist.extend(preprocessors[36:55]) elif type(obj) is int and -1 < obj < len(preprocessors): newlist.append(preprocessors[obj]) elif hasattr(obj, 'get_params'): # user uploaded object if 'n_jobs' in obj.get_params(): newlist.append(obj.set_params(n_jobs=N_JOBS)) else: newlist.append(obj) else: sys.exit("Unsupported estimator type: %r" % (obj)) search_params[param_name] = newlist return search_params
y = df.index Xs.append(df.values) # for a in y: # print(a) # sys.stdout.flush() try: columns = ['id'] df_out = pd.DataFrame(index=y) for t, X in enumerate(Xs): print(t, end=' ') sys.stdout.flush() pca = decomposition.PCA(n_components=2, svd_solver='full', random_state=0) pca.fit(X) Y = pca.transform(X) df_out['t{}d0'.format(t)] = Y[:, 0] df_out['t{}d1'.format(t)] = Y[:, 1] if (len(df_out) - df_out.count()).sum() == 0: df_out.to_csv('./Output/{}-pca_s1.csv'.format( dataset_dir.split('/')[-1]), index_label='id') print(' '.join(sys.argv), 'OK') else: print(' '.join(sys.argv), 'crashed') except Exception as e: print(e)
from sklearn.metrics import mean_squared_log_error from sklearn.ensemble import RandomForestRegressor from lightgbm import LGBMRegressor from sklearn import decomposition features = [ 'Market', 'Day', 'Stock', 'x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6' ] df = pd.read_csv('train.csv', index_col=0) df = df.fillna(0) # replace NaN entries df_test = pd.read_csv('test.csv', index_col=0) df_test = df_test.fillna(0) # replace NaN entries X = df[features] Y = df['y'] model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0) #model = LGBMRegressor(n_estimators=1000, learning_rate=0.01) pca = decomposition.PCA(n_components=len(features)) pca.fit(X) model.fit(pca.transform(X), Y) yp = pd.Series(model.predict(pca.transform(df_test[features]))).rename('y') yp.index.name = 'Index' print(yp.head()) yp.to_csv('RandomForestPCA.csv', header=True)
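# Note / sketch: with n_components=len(features) the PCA above is only a
# rotation, so no information is dropped. A common follow-up is to inspect the
# explained-variance ratios and retrain on a truncated projection; the 0.99
# threshold below is illustrative.
import numpy as np
cum = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.searchsorted(cum, 0.99)) + 1
pca_small = decomposition.PCA(n_components=n_keep).fit(X)
model.fit(pca_small.transform(X), Y)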
knn = {'color': '#33a02c', 'model': neighbors.KNeighborsClassifier()} c_svm = {'color': '#fb9a99', 'model': svm.SVC()} perceptron = {'color': '#e31a1c', 'model': linear_model.Perceptron()} n_bayes = {'color': '#fdbf6f', 'model': naive_bayes.GaussianNB()} pca_logit = { 'color': '#ff7f00', 'model': Pipeline(steps=[( 'pca', decomposition.PCA()), ('logit', linear_model.LogisticRegression())]) } neural = { 'color': '#cab2d6', 'model': Pipeline( steps=[('rbm', BernoulliRBM()), ('logit', linear_model.LogisticRegression())]) } passive_aggressive = { 'color': '#ffff99', 'model': linear_model.PassiveAggressiveClassifier() }
#%% ############################### # PCA for dimension reduction # (Possibility to do it with R for visualization) ############################### from sklearn import decomposition from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt import seaborn as sns sns.set(style='white') import numpy as np X_scaled = StandardScaler().fit_transform(X_train) pca = decomposition.PCA().fit(X_scaled) # Graph to choose the number of PC to keep 95% of the explained variability plt.figure(figsize=(10, 7)) plt.plot(np.cumsum(pca.explained_variance_ratio_), color='k', lw=2) plt.xlabel('Number of components') plt.ylabel('Total explained variance') plt.xlim(0, 12) plt.yticks(np.arange(0, 1.1, 0.1)) plt.axvline(10, c='b') plt.axhline(0.95, c='r') plt.show() # Fit your dataset to the optimal pca pca = decomposition.PCA(n_components=9) X_pca = pca.fit_transform(X_scaled)
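# Sketch: scikit-learn can also pick the component count directly -- passing a
# float in (0, 1) as n_components keeps just enough components to reach that
# fraction of explained variance, which replaces reading the figure by eye.
pca_95 = decomposition.PCA(n_components=0.95).fit(X_scaled)
print(pca_95.n_components_)          # number of components actually kept
X_pca_95 = pca_95.transform(X_scaled)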
import numpy as np from sklearn import decomposition import pandas as pd df1 = pd.DataFrame({ 'F1': [10, 2, 8, 9, 12], 'F2': [20, 5, 17, 20, 22], 'F3': [10, 2, 7, 10, 11] }) print(df1.corr()) pca1 = decomposition.PCA() #learn the new principal axis pca1.fit(df1) #transform original data to new axis df2 = pca1.transform(df1) #variance of data along original dimensions = variance of data along new axis tot_var_original = np.trunc(np.var(df1.F1) + np.var(df1.F2) + np.var(df1.F3)) tot_var_transformed = np.trunc( np.var(df2[:, 0]) + np.var(df2[:, 1]) + np.var(df2[:, 2])) #principal components captures variance in decreasing order print(pca1.explained_variance_ratio_) pca2 = decomposition.PCA(n_components=1) pca2.fit(df1) df3 = pca2.transform(df1) print(np.var(df3[:, 0]))
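# Sketch: the flip side of the variance argument above -- reconstruct the data
# from the single retained component and measure what was lost.
df3_back = pca2.inverse_transform(df3)             # back to the 3 original columns
recon_error = np.mean((df1.values - df3_back) ** 2)
print(recon_error)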
plt.imshow(comp.reshape(image_shape), cmap=plt.cm.bwr, interpolation='nearest', vmin=-vmax, vmax=vmax) plt.xticks(()) plt.yticks(()) plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.) ############################################################################### # List of the different estimators, whether to center and transpose the # problem, and whether the transformer uses the clustering API. estimators = [ ('Linear Sieve', linearsieve.Sieve(n_hidden=n_components), False), ('Eigenfaces - PCA', decomposition.PCA(n_components=n_components), True), ('Non-negative components - NMF', decomposition.NMF(n_components=n_components, init='nndsvda', beta=5.0, tol=5e-3, sparseness='components'), False), ('Independent components - FastICA', decomposition.FastICA(n_components=n_components, whiten=True), True), ('Sparse comp. - MiniBatchSparsePCA', decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8, n_iter=100, batch_size=3, random_state=rng), True), ('MiniBatchDictionaryLearning', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng), True), ('Cluster centers - MiniBatchKMeans', MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20, max_iter=50, random_state=rng), True), ('Factor Analysis components - FA', decomposition.FactorAnalysis(n_components=n_components, max_iter=2), True),
import numpy as np from sklearn import decomposition import pandas as pd df1= pd.DataFrame({ 'F1':[10,2,8,9,12], 'F2':[20,5,17,20,22], 'F3':[10,2,7,10,11]}) from sklearn.preprocessing import StandardScaler sc = StandardScaler() df2 = sc.fit_transform(df1) df2[:,[2]] pca2 = decomposition.PCA() #build pca model for given data #principal components are just original dimensions rotated by some angle #relation between given features to principal components #PC1 = w11*F1 + w12*F2 + w13*F3 #PC2 = w21*F1 + w22*F2 + w23*F3 #PC3 = w31*F1 + w32*F2 + w33*F3 pca2.fit(df2) print(pca2.components_) print(pca2.explained_variance_.cumsum()) pca2.explained_variance_ratio_ #variance of data along original dimensions tot_var_original = np.trunc(np.var(df2[:,[0]]) + np.var(df2[:,[1]]) + np.var(df2[:,[2]])) #variance of data along principal component axes tot_var_pca = np.trunc(np.sum(pca2.explained_variance_))
def plot_pca_constraints(dataset, encoded_data, i_cl_row, i_cl_col, idx_tsne, epoch): """ plots pca of data points with connections between cannot-link constraints :param dataset: :param encoded_data: :param i_cl_row: :param i_cl_col: :param idx_tsne: :param epoch: :return: """ if isinstance(dataset.train_labels, np.ndarray): y = dataset.train_labels else: y = dataset.train_labels.numpy() num_labels = int(max(y) + 1) cmap = plt.get_cmap('jet') mymap = colors.LinearSegmentedColormap.from_list( 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=0, b=1), cmap(np.linspace(0, 1, num_labels))) Z = [[0, 0], [0, 0]] levels = range(0, num_labels + 1, 1) CB = plt.contourf(Z, levels, cmap=mymap) idx_tsne = np.concatenate((idx_tsne, i_cl_row, i_cl_col)) idx_tsne = np.unique(idx_tsne) idx_constraints_row_tsne = idx_tsne.searchsorted(i_cl_row) idx_constraints_col_tsne = idx_tsne.searchsorted(i_cl_col) encoded_data = [encoded_data[i] for i in idx_tsne] pca = decomposition.PCA(n_components=2) pca.fit(encoded_data) u_pca = pca.transform(encoded_data) y_pca = [y[i] for i in idx_tsne] # save output: with open('./pca_epoch_{}.png'.format(epoch), 'w') as f: # Python 3: open(..., 'wb') pickle.dump([ u_pca, idx_constraints_row_tsne, idx_constraints_col_tsne, y, idx_tsne ], f) # Plot figure with subplots of different sizes - large 10*10 for tsne + two columns of 10 images for the pairs plt.figure(1, clear=True) # set up subplot grid gridspec.GridSpec(10, 12) # large subplot - tsne: plt.subplot2grid((10, 12), (0, 0), colspan=10, rowspan=10) plt.scatter(u_pca[:, 0], u_pca[:, 1], marker='o', c=y_pca, cmap=mymap, s=1) plt.colorbar(CB, ticks=range(num_labels)) plt.clim(-0.5, float(num_labels) - 0.5) norm = colors.Normalize(0, num_labels) for i in range(len(idx_constraints_col_tsne)): color = cmap(norm(y[idx_constraints_row_tsne[i]])) plt.plot([ u_pca[idx_constraints_row_tsne[i], 0], u_pca[idx_constraints_col_tsne[i], 0] ], [ u_pca[idx_constraints_row_tsne[i], 1], u_pca[idx_constraints_col_tsne[i], 1] ], color='k', marker='o', markersize=0.3, markerfacecolor=color, markeredgewidth=0.1) plt.savefig('./pca_epoch_{}.png'.format(epoch))
sns.set(style='white')

digits = datasets.load_digits()
X = digits.data
y = digits.target

# The images are 8x8 matrices of per-pixel intensities. Each matrix is
# flattened into a 64-dimensional feature vector.
# f, axes = plt.subplot(5, 2, sharey=True, figsize=(16, 6))
plt.figure(figsize=(16, 6))
for i in range(10):
    plt.subplot(2, 5, i + 1)
    plt.imshow(X[i, :].reshape([8, 8]))
plt.show()

print('Projecting %d-dimensional data to 2D' % X.shape[1])
pca = decomposition.PCA(n_components=2)
X_reduced = pca.fit_transform(X)

plt.figure(figsize=(12, 10))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y,
            edgecolors='none', alpha=0.7, s=40,
            cmap=plt.cm.get_cmap('nipy_spectral', 10))
plt.colorbar()
plt.title('MNIST. PCA projection')
plt.show()

# Let us show that it makes sense to narrow the number of components so that
# at least 90% of the original variance is retained.
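# Sketch of the 90%-variance check announced in the comment above (assuming
# numpy is imported as np, as elsewhere in this collection): fit a full PCA
# and count how many components the cumulative explained-variance ratio needs
# to pass 0.9.
pca_full = decomposition.PCA().fit(X)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
n_90 = int(np.searchsorted(cum_var, 0.9)) + 1
print('%d components keep %.1f%% of the variance' % (n_90, 100 * cum_var[n_90 - 1]))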
    random_state=0)
### Only include one dummy category to avoid multicollinearity; either one could be chosen.

#datacorr = data2.corr()  # correlation matrix, showing correlation between each variable and all the others
#data.corr().head()
#sb.heatmap(datacorr, cmap='bwr')  # heatmap of correlation matrix
### Darker colors represent higher correlation; several pairs of variables are highly correlated.
# Two highly correlated variables should not both be used in the model. PCA will later be
# performed to explain the same variance while avoiding multicollinearity.

## Drop response variable and standardize predictor variables
#X = data_stnd  # store predictor variables
#y = data['diagnosis_dummies']  # store response variable

pca = skdc.PCA()  # empty model space
pcafit = pca.fit_transform(x, y)  ## apply dimensionality reduction to X
var_explained = pca.explained_variance_ratio_  # ratio of variance each PC explains
print(pd.Series(var_explained))
### Since 29 components aren't necessary, the last 20 PCs will be disregarded,
### since they explain less than 0.01 of the variance.
## Indeed, the first 10 PCs explain 95% of the variance.

pca = skdc.PCA(n_components=10)  # only include the first 10 components
logreg = sklm.LogisticRegression()  # empty model space
pipeline = skpl.Pipeline([('pca', pca), ('logistic', logreg)])  # pipeline from PCA to logistic regression

predRight = 0  # create count variables
predWrong = 0
fit = pipeline.fit(x_train, y_train)  # fit model
def get_channels(self, avi_rgb_file, avi_grey_file): self.__avi_rgb = avi_rgb_file self.__avi_grey = avi_grey_file # Read first frames self.__cap_rgb = cv2.VideoCapture(avi_rgb_file) self.__cap_grey = cv2.VideoCapture(avi_grey_file) ret_rgb, frame_rgb1 = self.__cap_rgb.read() ret_grey, frame_grey1 = self.__cap_grey.read() # Mirror grey frame mirror_grey1 = cv2.flip(frame_grey1, 1) # TODO Non-contant arguments self.calculatePerspectiveTransform("out1.png", "out2.png") frame_rgb1 = self.transform(frame_rgb1, mirror_grey1) # Here we select ROI self.select_roi(frame_rgb1) cropped_grey1 = self.crop(frame_rgb1) i = 0 while(1): ret_rgb, frame_rgb2 = self.__cap_rgb.read() ret_grey, frame_grey2 = self.__cap_grey.read() if(ret_rgb & ret_grey == True): mirror_grey2 = cv2.flip(frame_grey2, 1) frame_rgb2 = self.transform(frame_rgb2, mirror_grey2) # Crop current ROI cropped_rgb2 = self.crop(frame_rgb2) cropped_grey2 = self.crop(mirror_grey2) self.__C1.append(np.mean(cropped_grey2[: ,:, 1])) self.__R.append(np.mean(cropped_rgb2[:, :, 0])) self.__G.append(np.mean(cropped_rgb2[:, :, 1])) self.__B.append(np.mean(cropped_rgb2[:, :, 2])) i += 1 else: break # Smoothing r = 3 self.__C1 = smooth(self.__C1, r) self.__R = smooth(self.__R, r) self.__G = smooth(self.__G, r) self.__B = smooth(self.__B, r) plt.figure(1) plt.plot(self.__C1, label="pasmo szarości", color="black") plt.plot(self.__R, label="pasmo czerwieni", color="red") plt.plot(self.__G, label="pasmo zielone", color="green") plt.plot(self.__B, label="pasmo niebieskie", color="blue") plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.) plt.title("Srednie wartosci z kanalow barwnych") plt.show(block=True) # dataset matrix = np.zeros([len(self.__C1), 4]) for i in range(len(matrix)): matrix[i] = [self.__C1[i], self.__R[i], self.__G[i], self.__B[i]] data = np.mat(matrix) # PCA pca_components = 4 pca = decomposition.PCA(n_components=pca_components).fit(data) X_out_pca = pca.transform(data) plt.figure(2) for i in range(4): plt.subplot(2,2,i+1) plt.plot(X_out_pca[:,i]) plt.title("PCA kanał " + str(i+1)) plt.show(block=True) # ICA ica = decomposition.FastICA(n_components=4) ICA_out = ica.fit(X_out_pca).transform(X_out_pca) # Estimate the sources plt.figure(3) for i in range(4): plt.subplot(2,2,i+1) plt.plot(ICA_out[:,i]) plt.title("ICA kanał " + str(i+1)) plt.show(block=True) #FFT of PCA for i in range(4): xf, yf = self.count_fft(X_out_pca[:, i]) N = len(X_out_pca[:,i]) for index, freq in enumerate(xf): if (freq < 0.04 or freq > 4): yf[index] = 0 plt.figure(4) plt.subplot(2,2,i+1) plt.plot(xf, np.abs(yf[0:N // 2])) plt.title("FFT pf PCA") #IFFT new_yf = ifft(yf) plt.figure(5) plt.subplot(2,2,i+1) plt.plot(new_yf) plt.title("IFFT pf PCA") plt.show(block=True) # FFT of ICA for i in range(4): xf, yf = self.count_fft(ICA_out[:, i]) N = len(ICA_out[:, i]) for index, freq in enumerate(xf): if (freq < 0.04 or freq > 4): yf[index] = 0 plt.figure(6) plt.subplot(2, 2, i + 1) plt.plot(xf, np.abs(yf[0:N // 2])) plt.title("FFT pf ICA") # IFFT new_yf = ifft(yf) plt.figure(7) plt.subplot(2, 2, i + 1) plt.plot(new_yf) plt.title("IFFT pf ICA") plt.show(block=True)
def perform_PCA(X,pca_components): pca = decomposition.PCA(n_components=pca_components) pca.fit(X) X_PCA = pca.transform(X) return X_PCA
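# Usage sketch for the helper above, on illustrative random data.
import numpy as np
X_demo = np.random.RandomState(0).rand(50, 10)
X_demo_pca = perform_PCA(X_demo, pca_components=3)
print(X_demo_pca.shape)   # (50, 3)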
def train(self, features, new_dim): self.model = decomposition.PCA(n_components=new_dim) self.model.fit(features)
def pca(self): pca = decomposition.PCA(n_components=2) pos = pca.fit(self.input_matrix).transform(self.input_matrix) return pos
""" import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from sklearn import decomposition from sklearn import datasets np.random.seed(5) iris = datasets.load_iris() X = iris.data y = iris.target centers = [[1, 1], [-1, -1], [1, -1]] pca = decomposition.PCA(n_components=3) pca.fit(X) X = pca.transform(X) fig = plt.figure(1, figsize=(4, 3)) plt.clf() ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) #3D圖的長寬高 plt.cla() for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]: ax.text3D(X[y == label, 0].mean(), X[y == label, 1].mean() + 1.5, X[y == label, 2].mean(), name, horizontalalignment='center', bbox=dict(alpha=.5, edgecolor='w', facecolor='w')) #字典資料組
def execute(self, callback=None): """This does the actual class-averaging, and returns the result""" options = self.options if options["verbose"] > 0: print("Start averaging class {} with {} particles ".format( options["classnum"], len(options["particles"]))) files = self.data["particles1"][1], self.data["particles2"][1] ptcls = options["particles"] # just a shortcut #if len(options["particles"])<5 : #z=EMData(str(files[ptcls[0][0]]),ptcls[0][1]).to_zero() #return {"avg":z,"basis":[z,z,z,z,z]} # print files,ptcls[0] # read in all particles and append each to element to ptcls avgr = Averagers.get("mean") for p in ptcls: p.append( EMData(str(files[p[0]]), p[1]).process("xform", {"transform": p[2]})) p.append(p[-1].process("filter.highpass.gauss", { "cutoff_freq": 0.01 }).process("filter.lowpass.gauss", { "cutoff_freq": 0.05 }).process("normalize.circlemean", { "radius": -6 }).process("mask.soft", { "outer_radius": -8, "width": 4 })) avgr.add_image(p[4]) # Copy each particle minus it's mean value avg = avgr.finish() # PCA on the mean-subtracted particles # At this point p[3] will be the particle in the correct orientation # p[4] will be the filtered/masked particle # p[5] will be the filtered/masked/bg subtr particle for p in ptcls: p.append(p[4].copy()) p[5].sub(avg) if options["mask"] == None: mask = ptcls[0][-1].copy() mask.to_one() # mask.process("mask.soft",{"outer_radius":-8,"width":4}) mask.process("mask.sharp", {"outer_radius": max(-10, -mask["ny"] // 25)}) else: mask = options["mask"] nmask = int( mask["square_sum"] ) # ok, square part is silly, but gives a count of "1" pixels # print "basis start" #if options["verbose"]>1: print("PCA {}, {}".format(options["classnum"],len(ptcls))) #pca=Analyzers.get("pca_large",{"nvec":options["nbasis"],"mask":mask,"tmpfile":"tmp{}".format(options["classnum"])}) #for p in ptcls: #pca.insert_image(p[5]) # filter to focus on lower resolution differences #basis=pca.analyze() # use numpy/sklearn instead of the old EMAN2 PCA code (more flexible) datamx = EMData(nmask, len(ptcls)) for i, p in enumerate(ptcls): p5 = p[5].process("misc.mask.pack", {"mask": mask}) datamx.insert_clip(p5, (0, i, 0)) npdata = to_numpy(datamx) # need to be careful about deleting datamx # actual PCA calculation msa = skdc.PCA(n_components=options["nbasis"]) msa.fit(npdata) # we just need one basis vector basis = msa.components_[self.options["usebasis"]] basis = from_numpy(basis).process("misc.mask.pack", { "mask": mask, "unpack": 1 }) ## Varimax rotation... good idea? #if not options["novarimax"]: #if options["verbose"]>1: print("Varimax {}".format(options["classnum"])) #pca2=Analyzers.get("varimax",{"mask":mask}) #for im in basis: #pca2.insert_image(im) #basis=pca2.analyze() # if you turn this on multithreaded it will crash sometimes #avg.mult(0.05) #arbitrary for debugging #avg.write_image("pca.hdf",-1) #basis[0].write_image("pca.hdf",-1) #basis[1].write_image("pca.hdf",-1) #basis[2].write_image("pca.hdf",-1) #basis[3].write_image("pca.hdf",-1) # print "basis" # at the moment we are just splitting into 2 classes, so we'll use the first eigenvector. A bit worried about defocus coming through, but hopefully ok... 
dots = [ p[5].cmp("ccc", basis) for p in ptcls ] # NOTE: basis number is passed in as an option, may not be #1 or #3 (default) if len(dots) == 0: return {"failed": True} dota = old_div(sum(dots), len(dots)) # print "average" if options["verbose"] > 1: print("Split by dot {}".format(options["classnum"])) # we will just use the sign of the dot product to split avgr = [Averagers.get("mean"), Averagers.get("mean")] incl = [[], []] for i, d in enumerate(dots): if d < dota: avgr[0].add_image(ptcls[i][3]) incl[0].append(ptcls[i][0]) incl[0].append(ptcls[i][1]) else: avgr[1].add_image(ptcls[i][3]) incl[1].append(ptcls[i][0]) incl[1].append(ptcls[i][1]) #for p in ptcls: #avgr.add_image(p[3].process("xform",{"transform":p[2]})) if len(incl[0]) == 0 or len(incl[1]) == 0: if options["verbose"] > 0: print("No separation on class {}".format(options["classnum"])) return {"failed": True} if options["verbose"] > 0: print("Finish averaging class {}".format(options["classnum"])) # if callback!=None : callback(100) # return {"avg":avg,"basis":basis} try: avg1 = avgr[0].finish() avg1["xform.projection"] = options["euler"] avg1["class_eoidxs"] = incl[ 0] # contains alternating file # and particle # # print avg1 avg2 = avgr[1].finish() avg2["xform.projection"] = options["euler"] avg2["class_eoidxs"] = incl[1] # print avg2 except: return {"failed": True} # print basis return { "avg1": avg1, "avg2": avg2, "basis": basis } # basis was originally the first 3 vectors, now we are only returning the selected one. Will see if it's a problem
def x_pca(x, n_components): pca_m = decomposition.PCA(n_components) pc = pca_m.fit_transform(x) return pc, pca_m
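# Usage sketch for x_pca() above: the second return value keeps the fitted
# model so the same projection can be applied to new data. Data is illustrative.
import numpy as np
x_demo = np.random.RandomState(0).rand(40, 8)
pc, pca_m = x_pca(x_demo, n_components=2)
print(pc.shape)                       # (40, 2)
print(pca_m.transform(x_demo).shape)  # same projection reused on new data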
    return args.input_dir, args.output_dir, args.n


input_dir, output_dir, n_comp = ParseArguments()
n_comp = int(n_comp)

# Load the data.
x_train, y_train, x_test, y_test, classes_names = read_data(input_dir)

### PCA
print("PCA reduction ", x_train.shape[1], " -> ", n_comp, " ...", end=" ")
pca = decomposition.PCA(n_components=n_comp, svd_solver='randomized')

## Learn the transformation matrix and apply it to x_train.
start_time = time.time()
x_train_reduced = pca.fit_transform(x_train)
print(" took %s seconds " % round((time.time() - start_time), 5))

# Apply the same matrix to x_test.
x_test_reduced = pca.transform(x_test)

# Save the data.
save_data(x_train_reduced, y_train, x_test_reduced, y_test, classes_names,
          output_dir)
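# Optional check (sketch): report how much of the training variance the
# randomized PCA above retained.
print("explained variance kept: %.3f" % pca.explained_variance_ratio_.sum())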