Example #1
def dim_reduction(X, n_components=2, mode="MDS"):
    
    """Reduces the number of dimensions in which a dataset is defined.
    
    Arguments

    X       -   NumPy array with shape (N,M), where N is the number of
                observations, and M the number of features.
    
    Keyword Arguments
    
    n_components    -   Intended number of features after dimensionality
                        reduction. Default = 2
    
    mode            -   String that defines the type of dim reduction:
                        - None
                        - "PCA" principal component analysis
                        - "ICA" independent component analysis
                        - "FA" factor analysis
                        - "TSNE" t-stochastic neighbour embedding
                        - "UMAP" uniform manifold approximation and embedding
                        - "RANDOMPROJECTION"
                        - "FEATUREAGGLOMERATION"
                        - "ISOMAP"
                        - "LLE" local linear embedding
                        - "HESSIAN" Hessian eigenmaps
                        - "MLLE" modified local linear embedding
                        - "LTSA" local tangent space alignment
                        - "MDS" multi-dimensional scaling
                        - "DICTIONARY" dictionary learning
                        - "TSVD" truncated SVD (also known as "LSE")
                        Default = "MDS"
    
    Returns
    
    X_      -   NumPy array with shape (N, n_components), where N is the number
                of observations and n_components is the requested number of
                dimensions after reduction.
    """
    
    # Make sure the mode is in all caps.
    if type(mode) == str:
        mode = mode.upper()
    
    # Copy X into a new matrix.
    X_ = numpy.copy(X)

    # None
    if mode is None or mode == "NONE":
        # No dimensionality reduction: the copy of X is returned unchanged.
        pass
        
    # Principal component analysis.
    elif mode == 'PCA':
        # Initialise a new PCA.
        pca = decomposition.PCA(n_components=n_components)
        # Fit the PCA with the data.
        pca.fit(X_)
        # Transform the data.
        X_ = pca.transform(X_)
    
    # Independent component analysis.
    elif mode == 'ICA':
        # Initialise a new ICA.
        ica = decomposition.FastICA(n_components=n_components)
        # Fit the ICA with the data.
        ica.fit(X_)
        # Transform the data.
        X_ = ica.transform(X_)
    
    # Factor analysis.
    elif mode == 'FA':
        # Initialise a new factor analysis.
        fa = decomposition.FactorAnalysis(n_components=n_components)
        # Perform the factor analysis on the data.
        fa.fit(X_)
        # Transform the data.
        X_ = fa.transform(X_)
    
    # T-Distributed stochastic neighbour embedding.
    elif mode == 'TSNE':
        # Run several t-SNEs to find a good one.
        n_runs = 10
        Xs_ = []
        dkl = numpy.ones(n_runs, dtype=float) * numpy.inf
        print("Running %d t-SNEs to find lowest Kullback-Leibler divergence." \
            % (n_runs))
        for i in range(n_runs):
            # Initialise a new t-distributed stochastic neighbouring embedding
            #  (t-SNE) analysis.
            tsne = TSNE(n_components=n_components)
            # Copy the data into a new variable.
            Xs_.append(numpy.copy(X_))
            # Fit to and transform the data.
            Xs_[i] = tsne.fit_transform(Xs_[i])
            # Get the KL-divergence.
            dkl[i] = tsne.kl_divergence_
            print("\tCurrent KL-divergence = %.5f" % (dkl[i]))
        # Choose the solution with the lowest KL-divergence.
        X_ = numpy.copy(Xs_[numpy.argmin(dkl)])
        # Get rid of all the excess X copies.
        del Xs_
    
    # Uniform manifold approximation and projection.
    elif mode == 'UMAP':
        # Create a new UMAP instance.
        um = umap.UMAP(n_components=n_components, min_dist=0.01)
        # Fit and transform X.
        X_ = um.fit_transform(X_)
    
    # Gaussian Random Projection.
    elif mode == 'RANDOMPROJECTION':
        # Create a new GaussianRandomProjection instance.
        rp = GaussianRandomProjection(n_components=n_components)
        # Fit and transform X.
        X_ = rp.fit_transform(X_)
    
    # Feature Agglomeration.
    elif mode == 'FEATUREAGGLOMERATION':
        # Create a new FeatureAgglomeration instance.
        fa = cluster.FeatureAgglomeration(n_clusters=n_components)
        # Fit and transform X.
        X_ = fa.fit_transform(X_)
    
    # Isomap.
    elif mode == 'ISOMAP':
        # Create a new Isomap instance.
        im = Isomap(n_components=n_components)
        # Fit and transform X.
        X_ = im.fit_transform(X_)
    
    # Locally Linear Embedding.
    elif mode == 'LLE':
        # Create a new LocallyLinearEmbedding instance.
        lle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='standard', eigen_solver='dense')
        # Fit and transform X.
        X_ = lle.fit_transform(X_)
    
    # Hessian eigenmaps.
    elif mode == 'HESSIAN':
        # Create a new LocallyLinearEmbedding instance.
        hlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='hessian', eigen_solver='dense')
        # Fit and transform X.
        X_ = hlle.fit_transform(X_)
    
    # MLLE.
    elif mode == 'MLLE':
        # Create a new LocallyLinearEmbedding instance.
        mlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='modified', eigen_solver='dense')
        # Fit and transform X.
        X_ = mlle.fit_transform(X_)
    
    # LTSA.
    elif mode == 'LTSA':
        # Create a new LocallyLinearEmbedding instance.
        ltsa = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='ltsa', eigen_solver='dense')
        # Fit and transform X.
        X_ = ltsa.fit_transform(X_)
    
    # Multi-dimensional scaling.
    elif mode == 'MDS':
        # Create a new MDS instance.
        mds = MDS(n_components=n_components)
        # Fit and transform X.
        X_ = mds.fit_transform(X_)
    
    # Dictionary Learning
    elif mode == "DICTIONARY":
        # Create a DictionaryLearning instance.
        dictlearn = decomposition.DictionaryLearning( \
            n_components=n_components, \
            fit_algorithm='cd', \
            # The 'omp' algorithm orthogonalises the whole thing, whereas
            # a lasso solution with a low alpha leaves a slightly more
            # scattered solution.
            transform_algorithm='lasso_cd', \
            transform_alpha=0.1, \
            )
        # Fit and transform X.
        X_ = dictlearn.fit_transform(X_)
    
    # Truncated SVD (also known as latent semantic analysis; the 'LSE' alias is accepted).
    elif mode in ['TSVD', 'LSE']:
        tsvd = decomposition.TruncatedSVD(n_components=n_components)
        # Fit and transform X.
        X_ = tsvd.fit_transform(X_)
    
    else:
        raise ValueError("Unrecognised dimensionality reduction mode '%s'" % (mode))
    
    return X_
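# Hedged usage sketch (editorial addition, not part of the original source). It assumes the
# module-level imports this function relies on (numpy, sklearn.decomposition, sklearn.manifold,
# umap) are already in place.
if __name__ == "__main__":
    import numpy
    X_demo = numpy.random.rand(100, 10)                    # 100 observations, 10 features
    X_low = dim_reduction(X_demo, n_components=2, mode="PCA")
    print(X_low.shape)                                     # -> (100, 2)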
Example #2
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# feed the feature matrix to PCA

from sklearn.preprocessing import StandardScaler

feature_matrix = StandardScaler().fit_transform(feature_matrix)
df_feature_matrix = pd.DataFrame(feature_matrix)
df_feature_matrix.fillna(0, inplace=True)

from sklearn import preprocessing

data_scaled = pd.DataFrame(preprocessing.scale(df_feature_matrix),
                           columns=df_feature_matrix.columns)
pca = decomposition.PCA(n_components=5)
pca1 = decomposition.PCA(n_components=30)
X_std_pca = pca.fit_transform(data_scaled)
X_std_pca1 = pca1.fit_transform(data_scaled)
# X_std_pca.shape

# calculate the explained variance of pca
pcaExpVariance = pca.explained_variance_
# print("PCA variance= ", pcaExpVariance)
pcaTransformed = pca.transform(feature_matrix)
# pcaTransformed.shape

# calculate explained variance ratio for analysis of no. of features
# using 5 components
variance = pca.explained_variance_ratio_
var = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3) * 100)
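# Hedged follow-up sketch (editorial addition): one way to turn the cumulative percentages in
# `var` into a component count, assuming at least one entry reaches the 90% threshold.
n_components_90 = int(np.argmax(var >= 90) + 1)
print("Components needed for >=90% cumulative variance:", n_components_90)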
Example #3
def estim_class_model(features,
                      nb_classes,
                      estim_model='GMM',
                      pca_coef=None,
                      use_scaler=True,
                      max_iter=99):
    """ create pipeline (scaler, PCA, model) over several options how
    to cluster samples and fit it on data

    :param ndarray features:
    :param int nb_classes: number of expected classes
    :param float pca_coef: range (0, 1) or None
    :param bool use_scaler: whether use a scaler
    :param str estim_model: used model
    :param int max_iter:
    :return:

    >>> np.random.seed(0)
    >>> fts = np.row_stack([np.random.random((50, 3)) - 1,
    ...                     np.random.random((50, 3)) + 1])
    >>> mm = estim_class_model(fts, 2)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='GMM_kmeans',
    ...                         pca_coef=0.95, max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='GMM_Otsu', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='kmeans_quantiles',
    ...                         use_scaler=False, max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='BGM', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='Otsu', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    """
    components = []
    if use_scaler:
        components += [('std_scaler', preprocessing.StandardScaler())]
    if pca_coef is not None:
        components += [('reduce_dim', decomposition.PCA(pca_coef))]

    nb_inits = max(1, int(np.sqrt(max_iter)))
    # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html
    mm = mixture.GaussianMixture(n_components=nb_classes,
                                 covariance_type='full',
                                 n_init=nb_inits,
                                 max_iter=max_iter)

    # split the model name and the initialisation type
    if '_' in estim_model:
        init_type = estim_model.split('_')[-1]
        estim_model = estim_model.split('_')[0]
    else:
        init_type = ''

    y = None
    if estim_model == 'GMM':
        # model = estim_class_model_gmm(features, nb_classes)
        if init_type == 'kmeans':
            mm.set_params(n_init=1)
            # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
            kmeans = cluster.KMeans(n_clusters=nb_classes,
                                    init='k-means++',
                                    n_jobs=-1)
            y = kmeans.fit_predict(features)
        elif init_type == 'Otsu':
            mm.set_params(n_init=1)
            y = compute_multivarian_otsu(features)

    elif estim_model == 'kmeans':
        # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html
        mm.set_params(max_iter=1)
        init_type = 'quantiles' if init_type == 'quantiles' else 'k-means++'
        _, y = estim_class_model_kmeans(features,
                                        nb_classes,
                                        init_type=init_type,
                                        max_iter=max_iter)

        logging.info('compute probability of each feature to all component')

    elif estim_model == 'BGM':
        mm = mixture.BayesianGaussianMixture(n_components=nb_classes,
                                             covariance_type='full',
                                             n_init=nb_inits,
                                             max_iter=max_iter)

    elif estim_model == 'Otsu' and nb_classes == 2:
        mm.set_params(max_iter=1, n_init=1)
        y = compute_multivarian_otsu(features)

    components += [('model', mm)]
    # compose the pipeline
    model = pipeline.Pipeline(components)

    if y is not None:
        # fit with the initialisation labels
        model.fit(features, y)
    else:
        # fit from scratch
        model.fit(features)
    return model
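# Hedged usage sketch (editorial addition): besides predict_proba (see the doctests above),
# the returned sklearn Pipeline also yields hard labels via predict().
if __name__ == "__main__":
    fts_demo = np.row_stack([np.random.random((50, 3)) - 1,
                             np.random.random((50, 3)) + 1])
    model_demo = estim_class_model(fts_demo, 2)
    print(model_demo.predict(fts_demo).shape)   # -> (100,)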
Example #4
parser = argparse.ArgumentParser()
parser.add_argument('-model', type=str, default='linear_model')
parser.add_argument('-featuredim', type=int, default=20)
parser.add_argument('-inputfeatures',
                    type=str,
                    default='../data/features_ALL.txt')
parser.add_argument('-labels', type=str, default='../data/ratings.txt')
args = parser.parse_args()

features = np.loadtxt(args.inputfeatures, delimiter=',')
#features = preprocessing.scale(features)
features_train = features[0:-50]
features_test = features[-50:]

pca = decomposition.PCA(n_components=args.featuredim)
pca.fit(features_train)
features_train = pca.transform(features_train)
features_test = pca.transform(features_test)

ratings = np.loadtxt(args.labels, delimiter=',')
#ratings = preprocessing.scale(ratings)
ratings_train = ratings[0:-50]
ratings_test = ratings[-50:]

if args.model == 'linear_model':
    regr = linear_model.LinearRegression()
elif args.model == 'svm':
    regr = svm.SVR()
elif args.model == 'rf':
    regr = RandomForestRegressor(n_estimators=50)
Example #5

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])  # set up the pipeline

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
pca.fit(X_digits)

plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)  # plot the explained variance of each component; the larger the variance, the more that component is worth keeping
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)

#Parameters of pipelines can be set using ‘__’ separated parameter names:

estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))  # automatic hyper-parameter tuning
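# Hedged continuation sketch (editorial addition): fit the grid search defined above and
# inspect the tuned settings, addressed through the same 'pca__' / 'logistic__' prefixes.
estimator.fit(X_digits, y_digits)
print(estimator.best_params_)   # e.g. {'logistic__C': ..., 'pca__n_components': ...}
print(estimator.best_score_)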
Example #6
all_projections['LAMP']    = (vp.LAMP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'sample_type': ['random', 'clustering_centroid']})
all_projections['LE']      = (manifold.SpectralEmbedding(), {'n_components': [2], 'affinity': ['nearest_neighbors'], 'random_state': [42]})
all_projections['LISO']    = (vp.LandmarkIsomap(), {'verbose': [False], 'n_neighbors': [4, 8, 16], 'dissimilarity_type': ['euclidean']})
all_projections['LLC']     = (drtoolbox.LLC(), {'k': [8, 12], 'n_analyzers': [10, 20], 'max_iter': [200, 400], 'verbose': [False]})
all_projections['LLE']     = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['standard'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['LLTSA']   = (tapkee.LinearLocalTangentSpaceAlignment(), {'n_neighbors': [4, 7, 11], 'verbose': [False]}) # subject to "eigendecomposition failed" errors (Eigen's NoConvergence)
all_projections['LMDS']    = (tapkee.LandmarkMDS(), {'n_neighbors': [4, 7, 11], 'verbose': [False]})
all_projections['LMNN']    = (drtoolbox.LMNN(), {'k': [3, 5, 7], 'verbose': [False]})
all_projections['LMVU']    = (drtoolbox.LandmarkMVU(), {'k1': [3, 5, 7], 'k2': [8, 12, 15], 'verbose': [False]})
all_projections['LPP']     = (tapkee.LocalityPreservingProjections(), {'n_neighbors': [4, 7, 11], 'verbose': [False]}) # subject to "eigendecomposition failed" errors (Eigen's NoConvergence)
all_projections['LSP']     = (vp.LSP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'n_neighbors': [4, 8, 16], 'control_point_type': ['random', 'kmeans'], 'dissimilarity_type': ['euclidean']})
all_projections['LTSA']    = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['ltsa'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['MC']      = (drtoolbox.ManifoldChart(), {'n_analyzers': [10, 20], 'max_iter': [200, 400], 'verbose': [False]})
all_projections['MCML']    = (drtoolbox.MCML(), {'verbose': [False]})
all_projections['MDS']     = (manifold.MDS(), {'n_components': [2], 'n_init': [2, 4], 'metric': [True], 'max_iter': [300, 500], 'random_state': [42]})
all_projections['MLLE']    = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['modified'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['MVU']     = (drtoolbox.MVU(), {'k': [8, 12, 15], 'verbose': [False]})
all_projections['NMDS']    = (manifold.MDS(), {'n_components': [2], 'n_init': [2, 4], 'metric': [False], 'max_iter': [300, 500], 'random_state': [42]})
all_projections['NMF']     = (decomposition.NMF(), {'n_components': [2], 'init': ['random', 'nndsvdar'], 'beta_loss': ['frobenius'], 'max_iter': [200, 400], 'alpha': [0, 0.5], 'l1_ratio': [0.0, 0.5], 'random_state': [42]})
all_projections['PBC']     = (vp.ProjectionByClustering(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'init_type': ['fastmap', 'random'], 'dissimilarity_type': ['euclidean'], 'cluster_factor': [1.5, 4.5, 9.0]})
all_projections['PCA']     = (decomposition.PCA(), {'n_components': [2], 'random_state': [42]})
all_projections['PLSP']    = (vp.PLSP(), {'dissimilarity_type': ['euclidean'], 'verbose': [False], 'sample_type': ['clustering']})
all_projections['PPCA']    = (drtoolbox.ProbPCA(), {'max_iter': [200, 400], 'verbose': [False]})
all_projections['RSAM']    = (vp.RapidSammon(), {'verbose': [False], 'dissimilarity_type': ['euclidean']})
all_projections['SPCA']    = (decomposition.SparsePCA(), {'n_components': [2], 'alpha': [0.01, 0.1, 0.5], 'ridge_alpha': [0.05, 0.05, 0.5], 'max_iter': [1000, 2000], 'tol': [1e-08], 'method': ['lars'], 'random_state': [42], 'normalize_components': [True]})
all_projections['SPE']     = (tapkee.StochasticProximityEmbedding(), {'n_neighbors': [6, 12, 18], 'n_updates': [20, 70], 'max_iter': [0], 'verbose': [False]})
all_projections['SRP']     = (random_projection.SparseRandomProjection(), {'n_components': [2], 'density': ['auto'], 'random_state': [42]})
all_projections['TSNE']    = (mtsne.MTSNE(), {'n_components': [2], 'perplexity': [5.0, 15.0, 30.0, 50.0], 'early_exaggeration': [6.0, 12.0, 18.0], 'learning_rate': [200.0], 'n_iter': [1000, 3000], 'n_iter_without_progress': [300], 'min_grad_norm': [1e-07], 'metric': ['euclidean'], 'init': ['random'], 'random_state': [42], 'method': ['barnes_hut'], 'angle': [0.5], 'n_jobs': [4]})
all_projections['TSVD']    = (decomposition.TruncatedSVD(), {'n_components': [2], 'algorithm': ['randomized'], 'n_iter': [5, 10], 'random_state': [42]})
all_projections['UMAP']    = (umap.UMAP(), {'n_components': [2], 'random_state': [42], 'n_neighbors': [5, 10, 15], 'metric': ['euclidean'], 'init': ['spectral', 'random'], 'min_dist': [0.001, 0.01, 0.1, 0.5], 'spread': [1.0], 'angular_rp_forest': [False]})
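# Hedged consumption sketch (editorial addition): one way such a registry of
# (estimator, param_grid) pairs can be swept with scikit-learn's ParameterGrid.
# Shown only for the plain sklearn entries; the vp/tapkee/drtoolbox/mtsne wrappers
# above are project-specific and assumed to follow the same set_params/fit_transform API.
from sklearn.model_selection import ParameterGrid

def sweep_projection(name, X):
    proj, grid = all_projections[name]
    for params in ParameterGrid(grid):
        yield params, proj.set_params(**params).fit_transform(X)

# e.g.: for params, embedding in sweep_projection('PCA', X): ...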
Example #7

                   cmap=cmap,
                   interpolation='nearest',
                   vmin=-vmax,
                   vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)


# #############################################################################
# List of the different estimators, whether to center and transpose the
# problem, and whether the transformer uses the clustering API.
estimators = [
    ('Eigenfaces - PCA using randomized SVD',
     decomposition.PCA(n_components=n_components,
                       svd_solver='randomized',
                       whiten=True), True),
    ('Non-negative components - NMF',
     decomposition.NMF(n_components=n_components, init='nndsvda',
                       tol=5e-3), False),
    ('Independent components - FastICA',
     decomposition.FastICA(n_components=n_components, whiten=True), True),
    ('Sparse comp. - MiniBatchSparsePCA',
     decomposition.MiniBatchSparsePCA(n_components=n_components,
                                      alpha=0.8,
                                      n_iter=100,
                                      batch_size=3,
                                      random_state=rng), True),
    ('MiniBatchDictionaryLearning',
     decomposition.MiniBatchDictionaryLearning(n_components=15,
                                               alpha=0.1,
Example #8
from sklearn import datasets,decomposition,svm,metrics
import numpy as np
from sklearn.pipeline import Pipeline
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
faces = datasets.fetch_olivetti_faces()
print(faces.data.shape)
fig = plt.figure(figsize=(8, 6))
for i in range(15):
    ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(faces.images[i], cmap=plt.cm.bone)
X_train, X_test, y_train, y_test = train_test_split(faces.data,faces.target, random_state=0)
print(X_train.shape, X_test.shape)
pca = decomposition.PCA(n_components=150, whiten=True)
pca.fit(X_train)
plt.imshow(pca.mean_.reshape(faces.images[0].shape),cmap=plt.cm.bone)
plt.show()
print(pca.components_.shape)
fig = plt.figure(figsize=(16, 6))
for i in range(30):
    ax = fig.add_subplot(3, 10, i + 1, xticks=[], yticks=[])
    ax.imshow(pca.components_[i].reshape(faces.images[0].shape),cmap=plt.cm.bone)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)
print(X_test_pca.shape)
clf = svm.SVC(C=5., gamma=0.001)
clf.fit(X_train_pca, y_train)
fig = plt.figure(figsize=(8, 6))
for i in range(15):
    ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
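# Hedged follow-up sketch (editorial addition): the plotting loop above is cut off in this
# excerpt; a natural next step is to score the PCA + SVM classifier on the held-out faces.
y_pred = clf.predict(X_test_pca)
print("test accuracy:", metrics.accuracy_score(y_test, y_pred))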
Example #9
def plot_iris_mds():

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    # MDS

    fig = pylab.figure(figsize=(10, 4))

    ax = fig.add_subplot(121, projection='3d')
    ax.set_facecolor('white')

    mds = manifold.MDS(n_components=3)
    Xtrans = mds.fit_transform(X)

    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0],
                   Xtrans[y == cl][:, 1],
                   Xtrans[y == cl][:, 2],
                   c=color,
                   marker=marker,
                   edgecolor='black')
    pylab.title("MDS on Iris data set in 3 dimensions")
    ax.view_init(10, -15)

    mds = manifold.MDS(n_components=2)
    Xtrans = mds.fit_transform(X)

    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0],
                   Xtrans[y == cl][:, 1],
                   c=color,
                   marker=marker,
                   edgecolor='black')
    pylab.title("MDS on Iris data set in 2 dimensions")

    filename = "mds_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")

    # PCA

    fig = pylab.figure(figsize=(10, 4))

    ax = fig.add_subplot(121, projection='3d')
    ax.set_facecolor('white')

    pca = decomposition.PCA(n_components=3)
    Xtrans = pca.fit(X).transform(X)

    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0],
                   Xtrans[y == cl][:, 1],
                   Xtrans[y == cl][:, 2],
                   c=color,
                   marker=marker,
                   edgecolor='black')
    pylab.title("PCA on Iris data set in 3 dimensions")
    ax.view_init(50, -35)

    pca = decomposition.PCA(n_components=2)
    Xtrans = pca.fit_transform(X)

    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0],
                   Xtrans[y == cl][:, 1],
                   c=color,
                   marker=marker,
                   edgecolor='black')
    pylab.title("PCA on Iris data set in 2 dimensions")

    filename = "pca_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
Example #10
y = iris_df[column_names[-1]]

standard_scaler = sk_preprocessing.StandardScaler()
X = standard_scaler.fit_transform(X)

label_encoder = sk_preprocessing.LabelEncoder()
y = label_encoder.fit_transform(y)

cv_iris_log_model = sk_linear.LogisticRegression()
cv_iris_model_quality = sk_model_selection.cross_val_score(cv_iris_log_model, X, y, cv=4, scoring="accuracy")

print("Original model quality:")
# print(cv_iris_model_quality)
print(np.mean(cv_iris_model_quality))

pca_2_components = sk_decomposition.PCA(n_components=2)
principal_components = pca_2_components.fit_transform(X)

plt.scatter(principal_components[:, 0], principal_components[:, 1], c=y, cmap="prism")

pca_all_components = sk_decomposition.PCA()
pca_all_components.fit(X)

print("Explained variance ratio:")
print(pca_all_components.explained_variance_ratio_)

cv_iris_2_log_model = sk_linear.LogisticRegression()
cv_iris_2_model_quality = sk_model_selection.cross_val_score(cv_iris_2_log_model, principal_components, y, cv=4,
                                                             scoring="accuracy")

print("2 pc model quality:")
Example #11
import multiprocessing
from collections import Counter
from sklearn import preprocessing
import scipy.special as special
from pandas import DataFrame, Series
from collections import Counter
np.random.seed(2019)
random.seed(2019)

## SVD/PCA to reduce the w2v features to a lower dimension
for file,dim in [('data/video_w2v.pkl',32),('data/audio_w2v.pkl',64)]:
    if file[-3:]=="pkl":
        df=pd.read_pickle(file)
    else:
        df=pd.read_csv(file)
    pca = sk_decomposition.PCA(n_components=dim,whiten=False,svd_solver='auto')
    pca.fit(df[df.columns[1:]])
    df1=pd.DataFrame(pca.transform(df[df.columns[1:]]))
    df1.columns=df.columns[1:dim+1]
    df1[df.columns[0]]=df[df.columns[0]].values
    df1.to_pickle(file[:-4]+'_svd_'+str(dim)+'.pkl')



    


def norm(train_df,test_df,features):   
    df=pd.concat([train_df,test_df])[features]
    scaler = preprocessing.QuantileTransformer(random_state=0)
    scaler.fit(df[features]) 
Example #12

def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO maybe add regular expression check
            ev = safe_eval_es(search_list)
            preprocessors = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.Imputer(), preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(), preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
Example #13
                y = df.index
            Xs.append(df.values)

    # for a in y:
    # 	print(a)
    # 	sys.stdout.flush()

    try:
        columns = ['id']
        df_out = pd.DataFrame(index=y)
        for t, X in enumerate(Xs):
            print(t, end=' ')
            sys.stdout.flush()

            pca = decomposition.PCA(n_components=2,
                                    svd_solver='full',
                                    random_state=0)
            pca.fit(X)
            Y = pca.transform(X)

            df_out['t{}d0'.format(t)] = Y[:, 0]
            df_out['t{}d1'.format(t)] = Y[:, 1]
        if (len(df_out) - df_out.count()).sum() == 0:
            df_out.to_csv('./Output/{}-pca_s1.csv'.format(
                dataset_dir.split('/')[-1]),
                          index_label='id')
            print(' '.join(sys.argv), 'OK')
        else:
            print(' '.join(sys.argv), 'crashed')
    except Exception as e:
        print(e)
Example #14

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn import decomposition

features = [
    'Market', 'Day', 'Stock', 'x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D',
    'x3E', 'x4', 'x5', 'x6'
]

df = pd.read_csv('train.csv', index_col=0)
df = df.fillna(0)  # replace NaN entries
df_test = pd.read_csv('test.csv', index_col=0)
df_test = df_test.fillna(0)  # replace NaN entries

X = df[features]
Y = df['y']

model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
#model = LGBMRegressor(n_estimators=1000, learning_rate=0.01)

pca = decomposition.PCA(n_components=len(features))
pca.fit(X)

model.fit(pca.transform(X), Y)
yp = pd.Series(model.predict(pca.transform(df_test[features]))).rename('y')
yp.index.name = 'Index'
print(yp.head())

yp.to_csv('RandomForestPCA.csv', header=True)
Example #15
knn = {'color': '#33a02c', 'model': neighbors.KNeighborsClassifier()}

c_svm = {'color': '#fb9a99', 'model': svm.SVC()}

perceptron = {'color': '#e31a1c', 'model': linear_model.Perceptron()}

n_bayes = {'color': '#fdbf6f', 'model': naive_bayes.GaussianNB()}

pca_logit = {
    'color':
    '#ff7f00',
    'model':
    Pipeline(steps=[(
        'pca',
        decomposition.PCA()), ('logit', linear_model.LogisticRegression())])
}

neural = {
    'color':
    '#cab2d6',
    'model':
    Pipeline(
        steps=[('rbm',
                BernoulliRBM()), ('logit', linear_model.LogisticRegression())])
}

passive_aggressive = {
    'color': '#ffff99',
    'model': linear_model.PassiveAggressiveClassifier()
}
Example #16

#%%

###############################
# PCA for dimension reduction # (Possibility to do it with R for visualization)
###############################

from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white')
import numpy as np

X_scaled = StandardScaler().fit_transform(X_train)
pca = decomposition.PCA().fit(X_scaled)

# Graph to choose the number of PCs needed to keep 95% of the explained variance
plt.figure(figsize=(10, 7))
plt.plot(np.cumsum(pca.explained_variance_ratio_), color='k', lw=2)
plt.xlabel('Number of components')
plt.ylabel('Total explained variance')
plt.xlim(0, 12)
plt.yticks(np.arange(0, 1.1, 0.1))
plt.axvline(10, c='b')
plt.axhline(0.95, c='r')
plt.show()
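# Hedged alternative sketch (editorial addition): the 95% cut-off can also be computed from
# the fitted PCA directly instead of being read off the graph.
n_components_95 = int(np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1)
print("Components for >=95% explained variance:", n_components_95)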

# Fit your dataset to the optimal pca
pca = decomposition.PCA(n_components=9)
X_pca = pca.fit_transform(X_scaled)
Example #17

import numpy as np
from sklearn import decomposition
import pandas as pd

df1 = pd.DataFrame({
    'F1': [10, 2, 8, 9, 12],
    'F2': [20, 5, 17, 20, 22],
    'F3': [10, 2, 7, 10, 11]
})
print(df1.corr())

pca1 = decomposition.PCA()

#learn the new principal axis
pca1.fit(df1)
#transform original data to new axis
df2 = pca1.transform(df1)

#variance of data along original dimensions = variance of data along new axis
tot_var_original = np.trunc(np.var(df1.F1) + np.var(df1.F2) + np.var(df1.F3))
tot_var_transformed = np.trunc(
    np.var(df2[:, 0]) + np.var(df2[:, 1]) + np.var(df2[:, 2]))

#principal components captures variance in decreasing order
print(pca1.explained_variance_ratio_)

pca2 = decomposition.PCA(n_components=1)
pca2.fit(df1)
df3 = pca2.transform(df1)
print(np.var(df3[:, 0]))
Example #18
        plt.imshow(comp.reshape(image_shape), cmap=plt.cm.bwr,
                   interpolation='nearest',
                   vmin=-vmax, vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)


###############################################################################
# List of the different estimators, whether to center and transpose the
# problem, and whether the transformer uses the clustering API.
estimators = [
    ('Linear Sieve',
     linearsieve.Sieve(n_hidden=n_components), False),
    ('Eigenfaces - PCA',
        decomposition.PCA(n_components=n_components), True),
    ('Non-negative components - NMF',
        decomposition.NMF(n_components=n_components, init='nndsvda', beta=5.0,
                       tol=5e-3, sparseness='components'), False),
    ('Independent components - FastICA',
        decomposition.FastICA(n_components=n_components, whiten=True), True),
    ('Sparse comp. - MiniBatchSparsePCA',
        decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8,
                                      n_iter=100, batch_size=3, random_state=rng), True),
    ('MiniBatchDictionaryLearning',
        decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                                  n_iter=50, batch_size=3, random_state=rng), True),
    ('Cluster centers - MiniBatchKMeans',
        MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20, max_iter=50, random_state=rng), True),
    ('Factor Analysis components - FA',
       decomposition.FactorAnalysis(n_components=n_components, max_iter=2), True),
Example #19
import numpy as np
from sklearn import decomposition
import pandas as pd

df1= pd.DataFrame({
        'F1':[10,2,8,9,12],
        'F2':[20,5,17,20,22],
        'F3':[10,2,7,10,11]})

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df2 = sc.fit_transform(df1)
df2[:,[2]]

pca2 = decomposition.PCA()
#build pca model for given data
#principal components are just original dimensions rotated by some angle
#relation between given features to principal components
#PC1 = w11*F1 + w12*F2 + w13*F3
#PC2 = w21*F1 + w22*F2 + w23*F3
#PC3 = w31*F1 + w32*F2 + w33*F3
pca2.fit(df2)
print(pca2.components_)
print(pca2.explained_variance_.cumsum())
pca2.explained_variance_ratio_


#variance of data along original dimensions
tot_var_original = np.trunc(np.var(df2[:,[0]]) + np.var(df2[:,[1]]) + np.var(df2[:,[2]]))
#variance of data along principal component axes
tot_var_pca = np.trunc(np.sum(pca2.explained_variance_))
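# Hedged verification sketch (editorial addition): with standardised data the projection is
# exactly the linear combination spelled out in the comments above (PC = F @ W.T), and the
# rotation preserves the total variance when both sides are computed with np.var.
print(np.allclose(pca2.transform(df2), df2 @ pca2.components_.T))        # -> True
print(np.isclose(np.var(pca2.transform(df2), axis=0).sum(),
                 np.var(df2, axis=0).sum()))                             # -> True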
Example #20
def plot_pca_constraints(dataset, encoded_data, i_cl_row, i_cl_col, idx_tsne,
                         epoch):
    """
    plots pca of data points with connections between cannot-link constraints
    :param dataset:
    :param encoded_data:
    :param i_cl_row:
    :param i_cl_col:
    :param idx_tsne:
    :param epoch:
    :return:
    """
    if isinstance(dataset.train_labels, np.ndarray):
        y = dataset.train_labels
    else:
        y = dataset.train_labels.numpy()
    num_labels = int(max(y) + 1)
    cmap = plt.get_cmap('jet')
    mymap = colors.LinearSegmentedColormap.from_list(
        'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=0, b=1),
        cmap(np.linspace(0, 1, num_labels)))
    Z = [[0, 0], [0, 0]]
    levels = range(0, num_labels + 1, 1)
    CB = plt.contourf(Z, levels, cmap=mymap)
    idx_tsne = np.concatenate((idx_tsne, i_cl_row, i_cl_col))
    idx_tsne = np.unique(idx_tsne)
    idx_constraints_row_tsne = idx_tsne.searchsorted(i_cl_row)
    idx_constraints_col_tsne = idx_tsne.searchsorted(i_cl_col)
    encoded_data = [encoded_data[i] for i in idx_tsne]
    pca = decomposition.PCA(n_components=2)
    pca.fit(encoded_data)
    u_pca = pca.transform(encoded_data)
    y_pca = [y[i] for i in idx_tsne]
    # save output:
    with open('./pca_epoch_{}.pkl'.format(epoch), 'wb') as f:
        # pickle needs a binary-mode file; a .pkl name also avoids clashing with
        # the .png written by plt.savefig() below
        pickle.dump([
            u_pca, idx_constraints_row_tsne, idx_constraints_col_tsne, y,
            idx_tsne
        ], f)

    # Plot figure with subplots of different sizes - large 10*10 for tsne + two columns of 10 images for the pairs
    plt.figure(1, clear=True)
    # set up subplot grid
    gridspec.GridSpec(10, 12)

    # large subplot - tsne:
    plt.subplot2grid((10, 12), (0, 0), colspan=10, rowspan=10)
    plt.scatter(u_pca[:, 0], u_pca[:, 1], marker='o', c=y_pca, cmap=mymap, s=1)
    plt.colorbar(CB, ticks=range(num_labels))
    plt.clim(-0.5, float(num_labels) - 0.5)

    norm = colors.Normalize(0, num_labels)
    for i in range(len(idx_constraints_col_tsne)):
        color = cmap(norm(y[idx_constraints_row_tsne[i]]))
        plt.plot([
            u_pca[idx_constraints_row_tsne[i], 0],
            u_pca[idx_constraints_col_tsne[i], 0]
        ], [
            u_pca[idx_constraints_row_tsne[i], 1],
            u_pca[idx_constraints_col_tsne[i], 1]
        ],
                 color='k',
                 marker='o',
                 markersize=0.3,
                 markerfacecolor=color,
                 markeredgewidth=0.1)
    plt.savefig('./pca_epoch_{}.png'.format(epoch))
Example #21
sns.set(style='white')

digits = datasets.load_digits()
X = digits.data
y = digits.target

# The images are 8x8 matrices of pixel intensities. Each matrix is flattened into a 64-dimensional feature vector.
# f, axes = plt.subplot(5, 2, sharey=True, figsize=(16, 6))
plt.figure(figsize=(16, 6))
for i in range(10):
    plt.subplot(2, 5, i + 1)
    plt.imshow(X[i, :].reshape([8, 8]))
plt.show()

print('Projecting %d-dimensional data to 2D' % X.shape[1])
pca = decomposition.PCA(n_components=2)
X_reduced = pca.fit_transform(X)

plt.figure(figsize=(12, 10))
plt.scatter(X_reduced[:, 0],
            X_reduced[:, 1],
            c=y,
            edgecolors='none',
            alpha=0.7,
            s=40,
            cmap=plt.cm.get_cmap('nipy_spectral', 10))
plt.colorbar()
plt.title('MNIST. PCA projection')
plt.show()

# Let's show that it makes sense to reduce the number of components so that at least 90% of the original variance is retained.
Example #22

                                                    random_state=0)

###only include one dummy category to avoid multicollinearity, either one could be chosen
#datacorr = data2.corr() #correlation matrix, showing correlation between each variable and all the others
#data.corr().head()

#sb.heatmap(datacorr, cmap = 'bwr') #heatmap of correlation matrix
###darker colors represent higher correlation, several pairs of variables are highly correlated.

#Two highly correlated variables should not be both used in model. PCA will later be performed to explain the same variance while avoiding multicollinearity

##drop response variable and standardize predictor variables

#X = data_stnd #store predictor variables
#y = data['diagnosis_dummies'] #store response variable
pca = skdc.PCA()  #empty model space
pcafit = pca.fit_transform(x, y)  ##apply dimensionality reduction to X
var_explained = pca.explained_variance_ratio_  #ratio of variance each PC explains
print(pd.Series(var_explained))
###Since 29 components aren't necessary, the last 20 PCs will be disregarded
###since they explain less than .01 of the variance
##indeed, the first 10 PCs explain 95% of the variance

pca = skdc.PCA(n_components=10)  #only include first 10 components
logreg = sklm.LogisticRegression()  #empty model space
pipeline = skpl.Pipeline([('pca', pca), ('logistic', logreg)
                          ])  #create pipeline from pca to logregression space
predRight = 0  #create count variables
predWrong = 0

fit = pipeline.fit(x_train, y_train)  #fit model
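# Hedged follow-up sketch (editorial addition): assuming x_test / y_test from the (truncated)
# train_test_split above exist, the fitted PCA + logistic-regression pipeline can also be
# scored directly rather than counting predRight / predWrong by hand.
print(pipeline.score(x_test, y_test))   # mean accuracy on the held-out set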
Example #23

    def get_channels(self, avi_rgb_file, avi_grey_file):

        self.__avi_rgb = avi_rgb_file
        self.__avi_grey = avi_grey_file

        # Read first frames
        self.__cap_rgb = cv2.VideoCapture(avi_rgb_file)
        self.__cap_grey = cv2.VideoCapture(avi_grey_file)
        ret_rgb, frame_rgb1 = self.__cap_rgb.read()
        ret_grey, frame_grey1 = self.__cap_grey.read()

        # Mirror grey frame
        mirror_grey1 = cv2.flip(frame_grey1, 1)

        # TODO Non-constant arguments
        self.calculatePerspectiveTransform("out1.png", "out2.png")
        frame_rgb1 = self.transform(frame_rgb1, mirror_grey1)

        # Here we select ROI
        self.select_roi(frame_rgb1)
        cropped_grey1 = self.crop(frame_rgb1)
        i = 0

        while(1):
            ret_rgb, frame_rgb2 = self.__cap_rgb.read()
            ret_grey, frame_grey2 = self.__cap_grey.read()

            if ret_rgb and ret_grey:

                mirror_grey2 = cv2.flip(frame_grey2, 1)
                frame_rgb2 = self.transform(frame_rgb2, mirror_grey2)

                # Crop current ROI
                cropped_rgb2 = self.crop(frame_rgb2)
                cropped_grey2 = self.crop(mirror_grey2)

                self.__C1.append(np.mean(cropped_grey2[: ,:, 1]))
                self.__R.append(np.mean(cropped_rgb2[:, :, 0]))
                self.__G.append(np.mean(cropped_rgb2[:, :, 1]))
                self.__B.append(np.mean(cropped_rgb2[:, :, 2]))

                i += 1
            else:
                break

        # Smoothing
        r = 3
        self.__C1 = smooth(self.__C1, r)
        self.__R = smooth(self.__R, r)
        self.__G = smooth(self.__G, r)
        self.__B = smooth(self.__B, r)

        plt.figure(1)
        plt.plot(self.__C1, label="pasmo szarości", color="black")
        plt.plot(self.__R, label="pasmo czerwieni", color="red")
        plt.plot(self.__G, label="pasmo zielone", color="green")
        plt.plot(self.__B, label="pasmo niebieskie", color="blue")
        plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
                   ncol=2, mode="expand", borderaxespad=0.)
        plt.title("Srednie wartosci z kanalow barwnych")
        plt.show(block=True)

        # dataset
        matrix = np.zeros([len(self.__C1), 4])
        for i in range(len(matrix)):
            matrix[i] = [self.__C1[i], self.__R[i], self.__G[i], self.__B[i]]
        data = np.mat(matrix)

        # PCA
        pca_components = 4
        pca = decomposition.PCA(n_components=pca_components).fit(data)
        X_out_pca = pca.transform(data)

        plt.figure(2)
        for i in range(4):
            plt.subplot(2,2,i+1)
            plt.plot(X_out_pca[:,i])
            plt.title("PCA kanał " + str(i+1))
        plt.show(block=True)


        # ICA
        ica = decomposition.FastICA(n_components=4)
        ICA_out = ica.fit(X_out_pca).transform(X_out_pca)  # Estimate the sources

        plt.figure(3)
        for i in range(4):
            plt.subplot(2,2,i+1)
            plt.plot(ICA_out[:,i])
            plt.title("ICA kanał " + str(i+1))
        plt.show(block=True)

        #FFT of PCA
        for i in range(4):
            xf, yf = self.count_fft(X_out_pca[:, i])
            N = len(X_out_pca[:,i])

            for index, freq in enumerate(xf):
                if (freq < 0.04 or freq > 4):
                    yf[index] = 0
            plt.figure(4)
            plt.subplot(2,2,i+1)
            plt.plot(xf, np.abs(yf[0:N // 2]))
            plt.title("FFT pf PCA")

            #IFFT
            new_yf = ifft(yf)

            plt.figure(5)
            plt.subplot(2,2,i+1)
            plt.plot(new_yf)
            plt.title("IFFT pf PCA")

        plt.show(block=True)

        # FFT of ICA
        for i in range(4):
            xf, yf = self.count_fft(ICA_out[:, i])
            N = len(ICA_out[:, i])

            for index, freq in enumerate(xf):
                if (freq < 0.04 or freq > 4):
                    yf[index] = 0
            plt.figure(6)
            plt.subplot(2, 2, i + 1)
            plt.plot(xf, np.abs(yf[0:N // 2]))
            plt.title("FFT pf ICA")

            # IFFT
            new_yf = ifft(yf)

            plt.figure(7)
            plt.subplot(2, 2, i + 1)
            plt.plot(new_yf)
            plt.title("IFFT pf ICA")

        plt.show(block=True)
Example #24
def perform_PCA(X,pca_components):
    pca = decomposition.PCA(n_components=pca_components)
    pca.fit(X)
    X_PCA = pca.transform(X)
    return X_PCA
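# Hedged usage sketch (editorial addition), assuming numpy (np) is imported alongside
# sklearn.decomposition in this module.
X_demo = np.random.rand(100, 10)
print(perform_PCA(X_demo, 2).shape)   # -> (100, 2)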
Example #25
 def train(self, features, new_dim):
     self.model = decomposition.PCA(n_components=new_dim)
     self.model.fit(features)
Example #26

 def pca(self):
     pca = decomposition.PCA(n_components=2)
     pos = pca.fit(self.input_matrix).transform(self.input_matrix)
     return pos
Example #27
"""

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn import decomposition
from sklearn import datasets

np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target
centers = [[1, 1], [-1, -1], [1, -1]]
pca = decomposition.PCA(n_components=3)
pca.fit(X)
X = pca.transform(X)

fig = plt.figure(1, figsize=(4, 3))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)  # size/position and viewing angles of the 3D axes
plt.cla()

for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
    ax.text3D(X[y == label, 0].mean(),
              X[y == label, 1].mean() + 1.5,
              X[y == label, 2].mean(),
              name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))  # dict of text-box style settings
Example #28
    def execute(self, callback=None):
        """This does the actual class-averaging, and returns the result"""
        options = self.options

        if options["verbose"] > 0:
            print("Start averaging class {} with {} particles ".format(
                options["classnum"], len(options["particles"])))

        files = self.data["particles1"][1], self.data["particles2"][1]

        ptcls = options["particles"]  # just a shortcut

        #if len(options["particles"])<5 :
        #z=EMData(str(files[ptcls[0][0]]),ptcls[0][1]).to_zero()
        #return {"avg":z,"basis":[z,z,z,z,z]}

        #		print files,ptcls[0]
        # read in all particles and append each to element to ptcls
        avgr = Averagers.get("mean")
        for p in ptcls:
            p.append(
                EMData(str(files[p[0]]), p[1]).process("xform",
                                                       {"transform": p[2]}))
            p.append(p[-1].process("filter.highpass.gauss", {
                "cutoff_freq": 0.01
            }).process("filter.lowpass.gauss", {
                "cutoff_freq": 0.05
            }).process("normalize.circlemean", {
                "radius": -6
            }).process("mask.soft", {
                "outer_radius": -8,
                "width": 4
            }))
            avgr.add_image(p[4])

        # Copy each particle minus its mean value
        avg = avgr.finish()

        # PCA on the mean-subtracted particles
        # At this point p[3] will be the particle in the correct orientation
        # p[4] will be the filtered/masked particle
        # p[5] will be the filtered/masked/bg subtr particle
        for p in ptcls:
            p.append(p[4].copy())
            p[5].sub(avg)

        if options["mask"] == None:
            mask = ptcls[0][-1].copy()
            mask.to_one()
            #			mask.process("mask.soft",{"outer_radius":-8,"width":4})
            mask.process("mask.sharp",
                         {"outer_radius": max(-10, -mask["ny"] // 25)})
        else:
            mask = options["mask"]
        nmask = int(
            mask["square_sum"]
        )  # ok, square part is silly, but gives a count of "1" pixels

        #		print "basis start"
        #if options["verbose"]>1: print("PCA {}, {}".format(options["classnum"],len(ptcls)))
        #pca=Analyzers.get("pca_large",{"nvec":options["nbasis"],"mask":mask,"tmpfile":"tmp{}".format(options["classnum"])})
        #for p in ptcls:
        #pca.insert_image(p[5])		# filter to focus on lower resolution differences
        #basis=pca.analyze()

        # use numpy/sklearn instead of the old EMAN2 PCA code (more flexible)
        datamx = EMData(nmask, len(ptcls))
        for i, p in enumerate(ptcls):
            p5 = p[5].process("misc.mask.pack", {"mask": mask})
            datamx.insert_clip(p5, (0, i, 0))
        npdata = to_numpy(datamx)  # need to be careful about deleting datamx

        # actual PCA calculation
        msa = skdc.PCA(n_components=options["nbasis"])
        msa.fit(npdata)

        # we just need one basis vector
        basis = msa.components_[self.options["usebasis"]]
        basis = from_numpy(basis).process("misc.mask.pack", {
            "mask": mask,
            "unpack": 1
        })

        ## Varimax rotation... good idea?
        #if not options["novarimax"]:
        #if options["verbose"]>1: print("Varimax {}".format(options["classnum"]))
        #pca2=Analyzers.get("varimax",{"mask":mask})
        #for im in basis:
        #pca2.insert_image(im)

        #basis=pca2.analyze()

        # if you turn this on multithreaded it will crash sometimes
        #avg.mult(0.05)	#arbitrary for debugging
        #avg.write_image("pca.hdf",-1)
        #basis[0].write_image("pca.hdf",-1)
        #basis[1].write_image("pca.hdf",-1)
        #basis[2].write_image("pca.hdf",-1)
        #basis[3].write_image("pca.hdf",-1)
        #		print "basis"

        # at the moment we are just splitting into 2 classes, so we'll use the first eigenvector. A bit worried about defocus coming through, but hopefully ok...
        dots = [
            p[5].cmp("ccc", basis) for p in ptcls
        ]  # NOTE: basis number is passed in as an option, may not be #1 or #3 (default)
        if len(dots) == 0:
            return {"failed": True}
        dota = old_div(sum(dots), len(dots))

        #		print "average"
        if options["verbose"] > 1:
            print("Split by dot {}".format(options["classnum"]))

        # we will just use the sign of the dot product to split
        avgr = [Averagers.get("mean"), Averagers.get("mean")]
        incl = [[], []]
        for i, d in enumerate(dots):
            if d < dota:
                avgr[0].add_image(ptcls[i][3])
                incl[0].append(ptcls[i][0])
                incl[0].append(ptcls[i][1])
            else:
                avgr[1].add_image(ptcls[i][3])
                incl[1].append(ptcls[i][0])
                incl[1].append(ptcls[i][1])

        #for p in ptcls:
        #avgr.add_image(p[3].process("xform",{"transform":p[2]}))

        if len(incl[0]) == 0 or len(incl[1]) == 0:
            if options["verbose"] > 0:
                print("No separation on class {}".format(options["classnum"]))
            return {"failed": True}

        if options["verbose"] > 0:
            print("Finish averaging class {}".format(options["classnum"]))
        #		if callback!=None : callback(100)
        #		return {"avg":avg,"basis":basis}
        try:
            avg1 = avgr[0].finish()
            avg1["xform.projection"] = options["euler"]
            avg1["class_eoidxs"] = incl[
                0]  # contains alternating file # and particle #
            #			print avg1
            avg2 = avgr[1].finish()
            avg2["xform.projection"] = options["euler"]
            avg2["class_eoidxs"] = incl[1]
#			print avg2
        except:
            return {"failed": True}

#		print basis

        return {
            "avg1": avg1,
            "avg2": avg2,
            "basis": basis
        }  # basis was originally the first 3 vectors, now we are only returning the selected one. Will see if it's a problem
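# Hedged standalone illustration (editorial addition, not the EMAN2 code path above): the core
# splitting idea -- project mean-subtracted particles onto one principal component and divide
# the set by comparing each projection against the average -- in plain numpy/scikit-learn.
import numpy as np
from sklearn import decomposition

def split_by_first_pc(data):
    """data: (n_samples, n_pixels) array; returns two arrays of sample indices."""
    centred = data - data.mean(axis=0)
    pc1 = decomposition.PCA(n_components=1).fit(centred).components_[0]
    dots = centred @ pc1
    thresh = dots.mean()
    return np.where(dots < thresh)[0], np.where(dots >= thresh)[0]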
Example #29
 def x_pca(x, n_components):
     pca_m = decomposition.PCA(n_components)
     pc = pca_m.fit_transform(x)
     return pc, pca_m
Example #30
    return args.input_dir, args.output_dir, args.n


input_dir, output_dir, n_comp = ParseArguments()

n_comp = int(n_comp)

# load the data
x_train, y_train, x_test, y_test, classes_names = read_data(input_dir)

### PCA

print("PCA reduction ", x_train.shape[1], " -> ", n_comp, " ...", end=" ")

pca = decomposition.PCA(n_components=n_comp, svd_solver='randomized')

## learn the transformation matrix and apply it to x_train
start_time = time.time()
x_train_reduced = pca.fit_transform(x_train)
print("  took %s seconds " % round((time.time() - start_time), 5))

# apply the same transformation matrix to x_test

x_test_reduced = pca.transform(x_test)

# save the data

save_data(x_train_reduced, y_train, x_test_reduced, y_test, classes_names,
          output_dir)
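# Hedged follow-up sketch (editorial addition): a quick check of how much signal the reduction
# keeps, via PCA's inverse transform (assumes x_train is a plain numpy array).
x_train_restored = pca.inverse_transform(x_train_reduced)
print("reconstruction MSE:", ((x_train - x_train_restored) ** 2).mean())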