Example #1
def preprocess(Xtr, Xvl, use_pca, max_pca_components=None):
    """
    Preprocess the feature matrices.
    Xtr - the training data features
    Xvl - the test data features
    use_pca - whether to use PCA for feature space reduction
    max_pca_components - the maximum number of PCA components to extract
    return the preprocessed features
    """
    if use_pca:
        if max_pca_components is None:
            raise ValueError("Please specify the maximum number of PCA components to extract")
        #scaler = decomposition.RandomizedPCA(n_components=max_features)
        scaler = decomposition.SparsePCA(n_components=max_pca_components)
        print('PCA max features to keep: %d' % max_pca_components)
        Xtr = scaler.fit_transform(
            Xtr
        )  # fit only for train data (http://cs231n.github.io/neural-networks-2/#datapre)
        Xvl = scaler.transform(Xvl)
    else:
        scaler = StandardScaler(copy=False)
        # scale only first column 'SUBJID'
        xtr_subj = Xtr[:, :1]
        xvl_subj = Xvl[:, :1]
        xtr_subj = scaler.fit_transform(
            xtr_subj
        )  # fit only for train data (http://cs231n.github.io/neural-networks-2/#datapre)
        xvl_subj = scaler.transform(xvl_subj)
        # write the scaled column back explicitly (fit_transform may return a copy)
        Xtr[:, :1] = xtr_subj
        Xvl[:, :1] = xvl_subj

    print('Train data mean: %f, std: %f' % (Xtr.mean(), Xtr.std()))
    print('Test data mean: %f, std: %f' % (Xvl.mean(), Xvl.std()))

    return Xtr, Xvl
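The excerpt above assumes `decomposition` and `StandardScaler` are already imported from scikit-learn. A minimal usage sketch with made-up array shapes (the imports, toy data, and component count below are assumptions, not part of the original example):

import numpy as np
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
Xtr = rng.rand(100, 20)  # hypothetical training features
Xvl = rng.rand(30, 20)   # hypothetical validation features

Xtr_p, Xvl_p = preprocess(Xtr, Xvl, use_pca=True, max_pca_components=5)
print(Xtr_p.shape, Xvl_p.shape)  # (100, 5) (30, 5)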
Example #2
    def PCA(self, X, Y=None, ncomp=2, method='PCA'):
        """ decompose a multivariate dataset in an orthogonal
            set that explain a maximum amount of the variance

        @param X: Input dataset

        Keyword Arguments:
        ncomp  -- number of components to be kept (Default: 2)
        method -- method to be used:
                  PCA (default) / Randomized / Sparse / rbf / linear / sigmoid / SVD

        """
        from sklearn import decomposition
        if method == 'Randomized':
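            # Note: RandomizedPCA was removed in later scikit-learn releases;
            # decomposition.PCA(svd_solver='randomized') is the modern equivalent.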
            pca = decomposition.RandomizedPCA(n_components=ncomp)
        elif method == 'Sparse':
            pca = decomposition.SparsePCA(n_components=ncomp)
        elif method == 'rbf':
            pca = decomposition.KernelPCA(n_components=ncomp,
                                          fit_inverse_transform=True,
                                          gamma=10,
                                          kernel="rbf")
        elif method == 'linear':
            pca = decomposition.KernelPCA(n_components=ncomp, kernel="linear")
        elif method == 'sigmoid':
            pca = decomposition.KernelPCA(n_components=ncomp, kernel="sigmoid")
        elif method == 'SVD':
            pca = decomposition.TruncatedSVD(n_components=ncomp)
        else:
            pca = decomposition.PCA(n_components=ncomp)
            method = 'PCA'
        print('[ML] Using %s method' % method)
        pca.fit(X)
        return pca.transform(X)
Example #3
    def _train(self, train_data, params, verbose):
        import sklearn.decomposition as sk_dec
        if verbose:
            print("Training {} ...".format(self.name))
        start_time = time.time()
        try:
            covs = []
            for x in train_data:
                est = sk_dec.SparsePCA(n_components=params['n_components'],
                                       alpha=params['alpha'],
                                       ridge_alpha=params['ridge_alpha'],
                                       max_iter=params['max_iter'],
                                       tol=params['tol'])
                est.fit(x)

                # get covariance: \Psi + \Lambda.T * \Sigma_{zz} * \Lambda
                z = est.transform(x)
                cov_z = np.cov(z.T)
                var_x = np.var(x, axis=0)
                cov = np.dot(est.components_.T, np.dot(cov_z, est.components_))
                np.fill_diagonal(cov, var_x)

                covs.append(cov)
        except Exception as e:
            covs = None
            if verbose:
                print(f"\t{self.name} failed with message: {e}")
        finish_time = time.time()
        if verbose:
            print("\tElapsed time {:.1f}s".format(finish_time - start_time))
        return covs, None
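The covariance reconstruction inside the loop follows the factor-model form Psi + Lambda^T * Sigma_zz * Lambda from the comment, with the per-feature variances written onto the diagonal. A standalone sketch of the same computation on synthetic data (the toy array, component count, and random_state are assumptions):

import numpy as np
from sklearn import decomposition

rng = np.random.RandomState(0)
x = rng.rand(200, 10)                          # toy data standing in for one entry of train_data

est = decomposition.SparsePCA(n_components=3, random_state=0)
z = est.fit_transform(x)                       # latent scores, shape (200, 3)

cov_z = np.cov(z.T)                            # Sigma_zz, shape (3, 3)
cov = est.components_.T @ cov_z @ est.components_   # Lambda^T * Sigma_zz * Lambda
np.fill_diagonal(cov, np.var(x, axis=0))       # put the per-feature variances on the diagonal
print(cov.shape)                               # (10, 10)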
Example #4
    def PreprocessingSparsePCA(self, PCA_coefficients, MNE_coefficients, N_neighbors):
        """
        :type MNE_coefficients: int
        :type PCA_coefficients: int
        :param MNE_coefficients: number of components for the spectral embedding
        :param PCA_coefficients: number of components for the SparsePCA transform
        :param N_neighbors: number of neighbors for the embedding
        """
        self.MNE_coefficients = MNE_coefficients
        self.PCA_coefficients = PCA_coefficients
        self.N_neighbors = N_neighbors

        self.pca = decomposition.SparsePCA(n_components=self.PCA_coefficients,
                                           alpha=0.5, ridge_alpha=0.01, max_iter=1000,
                                           tol=1e-06, method='lars',
                                           n_jobs=-1, U_init=None,
                                           V_init=None, verbose=False,
                                           random_state=0)

        self.Embedding = manifold.SpectralEmbedding(n_components=self.MNE_coefficients,
                                                    affinity='nearest_neighbors',
                                                    gamma=None, random_state=0,
                                                    n_neighbors=self.N_neighbors)
        self.X_pca = self.pca.fit_transform(self.Waves_Coefficients)
        self.X_red = self.Embedding.fit_transform(self.X_pca)
        return self.X_red
 def create_components_feat(self, components=[1, 2, 3]):
     spca = decomposition.SparsePCA(alpha=0.02)
     length = self.shape[1]
     print(self)
     data = spca.fit_transform(self)
     for comp in components:
         self['PCA_comp_' + str(comp)] = data[:, comp]
 def SparsePCA(self, source):
     min_max_scaler = preprocessing.MinMaxScaler()
     data_source = min_max_scaler.fit_transform(source)
     pca = decomposition.SparsePCA(n_components=2)
     result = {}
     result['data'] = pca.fit_transform(data_source)
     print(pca.error_)  # error at each iteration; "not necessarily correct" per the original note
     result['params'] = 0
     return result
def SparsePCA(array, percent_samples):
    print("Sparse PCA", percent_samples * 100, "% of training data.")
    print("Features\tTime")

    array = array[:int(percent_samples * len(array))]
    for pct in pct_features_list:
        num_features = int(pct * len(array[0]))
        start = time()
        Y = decomposition.SparsePCA(
            n_components=num_features).fit_transform(array)
        end = time()
        print(num_features, "\t", (end - start))
def DimensionReduction(filename, vector_list, reducedDim):
    print "PCA starts"
    print "reduced dimension :", reducedDim
    start = time.time()

    pca = decomposition.SparsePCA(reducedDim)
    result = pca.fit_transform(vector_list)

    np.savetxt("data/reducedVec_" + filename + "_dim" + str(reducedDim) +
               ".csv",
               result,
               delimiter=",")
    elapsed_time = time.time() - start
    print "PCA finish : %s [min]" % (elapsed_time / 60)
    return result
Example #9
def reduce_dimensionality(df, distances, dim_type):
    print('reducing dimensionality')
    if dim_type == 'mds':
        # use two components as we're plotting points in a two-dimensional plane
        # "precomputed" because we provide a distance matrix
        # we will also specify 'random_state' so the plot is reproducible.
        mds = manifold.MDS(n_components=2,
                           dissimilarity="precomputed",
                           random_state=1)
        pos = mds.fit_transform(distances)  # shape (n_samples, n_components)
    if dim_type == 'isomap':
        pos = manifold.Isomap(n_neighbors=10,
                              n_components=2).fit_transform(distances)
    if dim_type == 'pca':
        pca = decomposition.PCA(n_components=2, random_state=1)
        pos = pca.fit_transform(distances)
    if dim_type == 'sparsepca':
        pca = decomposition.SparsePCA(n_components=2, random_state=1)
        pos = pca.fit_transform(distances)
    if dim_type == 'tsne':
        tsne = manifold.TSNE(n_components=2, init='pca', random_state=1)
        pos = tsne.fit_transform(distances)
    if dim_type == 'spectral':
        se = manifold.SpectralEmbedding(n_components=2,
                                        n_neighbors=10,
                                        random_state=1)
        pos = se.fit_transform(distances)
    if dim_type == 'lle':
        methods = ['standard', 'ltsa', 'hessian', 'modified']
        lle = manifold.LocallyLinearEmbedding(n_neighbors=10,
                                              n_components=2,
                                              method=methods[1],
                                              random_state=1)
        pos = lle.fit_transform(distances)
    if dim_type == 'trunc':
        tsvd = decomposition.TruncatedSVD(n_components=2, random_state=1)
        pos = tsvd.fit_transform(distances)

    xs, ys = pos[:, 0], pos[:, 1]
    names = df.index.values

    for x, y, name in zip(xs, ys, names):
        plt.scatter(x, y, s=5)


#        plt.text(x, y, name)  # can be overwhelming with lots of data points
    plt.show()
    return xs, ys, names
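A hedged usage sketch for the function above: it expects a DataFrame (only its index is used for labels) plus a precomputed pairwise distance matrix, and it assumes the module's own imports (manifold, decomposition, plt) are in scope. The toy data and metric below are made up:

import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(1)
df = pd.DataFrame(rng.rand(20, 6), index=['item%d' % i for i in range(20)])
distances = pairwise_distances(df.values)      # Euclidean pairwise distance matrix

xs, ys, names = reduce_dimensionality(df, distances, dim_type='mds')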
Example #10
def choose_decomposition_method(method, n_components):
    """Return the decomposition corresponding to `method`."""
    if method == 'PCA':
        return decomposition.PCA(n_components)
    elif method == 'Randomized PCA':
        return decomposition.RandomizedPCA(n_components)
    elif method == 'Kernel PCA':
        return decomposition.KernelPCA(n_components, kernel='rbf')
    elif method == 'Sparse PCA':
        return decomposition.SparsePCA(n_components, n_jobs=1)
    elif method == 'SVD':
        return decomposition.TruncatedSVD(n_components)
    elif method == 'Factor Analysis':
        return decomposition.FactorAnalysis(n_components)
    elif method == 'ICA':
        return decomposition.FastICA(n_components)
    raise ValueError('{} is not a known method'.format(method))
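An illustrative call of the dispatcher above on toy data (the array and the chosen method string are made up; note that the 'Randomized PCA' branch relies on the older RandomizedPCA class, which recent scikit-learn releases no longer ship):

import numpy as np

X = np.random.RandomState(0).rand(50, 8)
reducer = choose_decomposition_method('Sparse PCA', n_components=2)
X_2d = reducer.fit_transform(X)
print(X_2d.shape)   # (50, 2)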
 def SparsePCA(self, n_comps=None):
     if n_comps is None: n_comps = self.datatrain.shape[1]
     self.sparsecomp = decomp.SparsePCA(n_components=n_comps)
     self.sparsecomp.fit(self.datatrain)
     #        if n_comps==self.numdims:
     #            scree = np.vstack((np.array(range(len(self.princomp.explained_variance_))),self.princomp.explained_variance_))
     #            X2=scree[:,0]
     #            X1=scree[:,-1]
     #            distance = np.dot((X2*np.ones([len(self.princomp.explained_variance_),2]))-scree.transpose(),(X1*np.ones([len(self.princomp.explained_variance_),2])-scree.transpose()).transpose())
     #            distance=distance/np.dot(X2-X1,(X2-X1).transpose())
     #            distance=np.diag(distance)
     #            self.princomp.n_components=min(enumerate(distance),key=itemgetter(1))[0]+1
     self.sparsecompscores = [
         self.sparsecomp.transform(self.datatrain),
         self.sparsecomp.transform(self.dataval),
         self.sparsecomp.transform(self.datatest)
     ]
Example #12
# We drop the columns that are not interesting for us, and the row with no label
data.select_good_columns(columns_to_delete + sub_diagnosis_id)
data.preprocessing(label_id)
if NORMALIZE:
    data.normalize_age(label_age, label_gender)

if MISSING_VALUE_STRATEGY == 'Binary':
    # We create the binary columns
    data.create_missing_data_col()

if MISSING_VALUE_STRATEGY in ['Replacement', 'Binary']:
    # We replace the missing-value code (8) in the ADOS answers with 3
    data.replace(8, 3, inplace=True)

pca = decomposition.PCA()
spca = decomposition.SparsePCA(alpha=0.02)
kpca = decomposition.KernelPCA(kernel='cosine')
kpca2 = decomposition.KernelPCA(kernel='sigmoid')

pca_plot(pca, 1)

plt.figure(4)
plt.plot(pca.explained_variance_ratio_)

pca_plot(spca, 5)

pca_plot(kpca, 8, show_coef=False)

pca_plot(kpca2, 10, show_coef=False)

plt.show()
Example #13
import heapq
import sys, string
import numpy as np
import os
import os.path
global test_list
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn.decomposition import SparsePCA
lis = []


def strip_first_col(fname, delimiter=None):
    with open(fname, 'r') as fin:
        for line in fin:
            try:
                yield line.split(delimiter, 1)[1]
            except IndexError:
                continue


X = np.loadtxt(strip_first_col(
    "/Volumes/MyPassport/Leucegene/Leucegene_Results/AlternativeOutput/ExpressionInput/exp.LeucegeneSplicing.txt"
),
               skiprows=1)
X = X.T  # transpose the loaded matrix
sparsepc = decomposition.SparsePCA()
sparsepc.fit(X)
print("Optimal number of features : %d" % sparsepc.components_)
# load the nifti data
masker = NiftiMasker(mask_img='../data/playground/overlap_mask_3mm.nii.gz')
subs_2d = masker.fit_transform('../data/playground/subs_3mm.nii.gz')

for csv in glob('../data/raw/subs-405*.csv'):
    behav_df = pd.read_csv(csv, converters={'id': lambda x: str(x).zfill(4)})
    behav_df.set_index('id', inplace=True)
    behav_df

    X_train, X_test, y_train, y_test = train_test_split(
        subs_2d, behav_df['score'].values, test_size=0.10)
    # preprocessing strategy
    # PCA
    pca = dcm.PCA(svd_solver='full')
    spca = dcm.SparsePCA()
    sparse_alpha_opts = [0.1, 0.5, 1, 2, 5, 10]
    kpca = dcm.KernelPCA()
    kernel_opts = ["linear", "rbf", "sigmoid"]
    n_component_opts = [0.7, 0.8, 0.9, 0.95, 0.99]
    lasso = linear_model.LassoCV(max_iter=100000,
                                 n_jobs=28,
                                 alphas=np.arange(0.1, 50, 0.1))
    lasso_lars = linear_model.LassoLarsCV(n_jobs=28,
                                          max_iter=100000,
                                          max_n_alphas=10000)
    eps_opts = [10.0, 5.0, 2.0, 1.5, 0.9, 0.1, 0.01, 0.001, 0.0001]
    elastic = linear_model.ElasticNetCV(alphas=np.arange(0.1, 50, 0.1),
                                        max_iter=100000)
    l1_ratio_opts = [0.1, 0.5, 0.9, 0.95, 0.99]
    lasso_lars_bay = linear_model.LassoLarsIC(max_iter=100000)
Example #15
])

#normalize them
patch_scaler = preprocessing.StandardScaler()
stimuli_patches = patch_scaler.fit_transform(stimuli_patches)

#%%
nfeat = 15
rpca = decomposition.RandomizedPCA(n_components=nfeat, whiten=True)
rpca.fit(unlagged_stimuli)

unlagged_stimuli = rpca.transform(unlagged_stimuli)

#%%
#sparse pca
spca = decomposition.SparsePCA(n_jobs=-1)
spca.fit(unlagged_stimuli)

unlagged_stimuli = spca.transform(unlagged_stimuli)

#%%
#dictionary minibatch
mbdic = decomposition.MiniBatchDictionaryLearning(n_components=50,
                                                  verbose=True)
mbdic.fit(stimuli_patches)

#%%
#visualize

V = mbdic.components_
plt.figure()
def generate_transformers(x,
                          dataset,
                          global_dir,
                          min_variance=10,
                          additional_scale_tsvd=1):
    """
  This function returns a dictionary with callables for a given dataset.
  """

    transform_functions = {
        'vae': (lambda x: transform_vae(x, VAE_net)),
        'pca': (lambda x: transform_pca(x, pca, var_pca)),
        'tsvd': (lambda x: transform_tsvd(x, tsvd)),
        'kpca': (lambda x: transform_kpca(x, kpca)),
        'spca': (lambda x: transform_spca(x, spca)),
        'iso': (lambda x: transform_iso(x, iso)),
        'lle': (lambda x: transform_lle(x, lle)),
    }
    """
  Note that below, we could have dynamically generated most transformer
  functions. However, doing so would make the code harder to follow, and we
  do not need to optimize for efficiency here, whereas we do want to
  preserve readability.
  """

    ################ Regular PCA ################

    pca = decomposition.PCA(n_components=2)
    var_pca = np.var(pca.fit_transform(
        x))  # We do this in one call, since we don't need latent_X for now

    # print(np.sum(pca.explained_variance_ratio_)) # Could be interesting to explain results with

    def transform_pca(x, pca, var_pca):
        return np.matmul(x, np.transpose(
            pca.components_)) / math.sqrt(var_pca) * math.sqrt(min_variance)

    ################ Truncated SVD ################
    tsvd = decomposition.TruncatedSVD(n_components=2,
                                      n_iter=7,
                                      random_state=42)
    var_tsvd = np.var(tsvd.fit_transform(x))

    def transform_tsvd(x, tsvd):
        return np.matmul(x, np.transpose(tsvd.components_)) / math.sqrt(
            var_tsvd) * math.sqrt(min_variance) * additional_scale_tsvd

    ################ Kernel PCA ################
    kpca = decomposition.KernelPCA(n_components=2,
                                   kernel="sigmoid",
                                   fit_inverse_transform=True,
                                   gamma=None,
                                   random_state=42)
    var_kpca = np.var(kpca.fit_transform(x))

    if 0. in kpca.lambdas_:  # KPCA with Sigmoid kernel does not work for this set
        del transform_functions['kpca']

    def transform_kpca(x, kpca):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return kpca.transform(x) / math.sqrt(var_kpca) * math.sqrt(
            min_variance)

    ################ Sparse PCA ################
    spca = decomposition.SparsePCA(n_components=2,
                                   alpha=0.0001,
                                   random_state=42,
                                   n_jobs=-1)
    var_spca = np.var(spca.fit_transform(x))

    def transform_spca(x, spca):
        return np.matmul(x, np.transpose(
            spca.components_)) / math.sqrt(var_spca) * math.sqrt(min_variance)

    ################ ISO ################
    iso = manifold.Isomap(n_neighbors=8, n_components=2, eigen_solver='dense')
    var_iso = np.var(iso.fit_transform(x))

    def transform_iso(x, iso):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return iso.transform(x) / math.sqrt(var_iso) * math.sqrt(min_variance)

    ################ LLE ################
    lle = manifold.LocallyLinearEmbedding(n_neighbors=8,
                                          n_components=2,
                                          eigen_solver='dense')
    var_lle = np.var(lle.fit_transform(x))

    def transform_lle(x, lle):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return lle.transform(x) / math.sqrt(var_lle) * math.sqrt(min_variance)

    ################ SCVIS VAE ################
    VAE_save_file = global_dir + "/results/vae_models/" + dataset + ".pt"

    if not os.path.isfile(VAE_save_file):
        # The auto-encoder needs to be trained on the data first
        print('Training new VAE model on %s dataset' % dataset)
        trainVAE(
            x, global_dir, dataset
        )  # normalizing using np.max(np.abs(x)) not necessary as it equals 1

    # Once trained, load the existing model, also for reproducibility
    VAE_model = torch.load(VAE_save_file)['model_state_dict']

    print('Loaded VAE model for %s dataset' % dataset)

    VAE_net = VAE(input_dim=x.shape[1], latent_dim=2)
    VAE_net.load_state_dict(VAE_model)
    VAE_net.eval()

    def transform_vae(x, VAE_net):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)

        with torch.no_grad():
            x_batch = torch.from_numpy(x).float()
            encoder_mu, encoder_log_var = VAE_net.encoder(x_batch, p=1.0)
            batch_z = VAE_net.sampling(encoder_mu,
                                       encoder_log_var,
                                       batch_size=len(x),
                                       eval=True).numpy()

        return np.array(batch_z, dtype=float)

    return transform_functions
Example #17
    train_df = pd.read_csv(os.path.join(train_dir, 'train.csv'),
                           index_col='ID')

    # Transforming target variable to normal distribution
    train_df['target'] = stats.boxcox(train_df['target'])[0]

    # Scaling train data
    print('Scaling Train Data')
    std_scale = preprocessing.StandardScaler().fit(train_df.iloc[:, 1:])
    train_df_scaled = std_scale.transform(train_df.iloc[:, 1:])

    # Fitting and transforming train data with sparse PCA
    print('Fitting and transforming Train Data with Sparse PCA algorithm')
    n_components = args.n_components

    sparse_sm = decomposition.SparsePCA(n_components=n_components)
    sparse_train = sparse_sm.fit_transform(train_df_scaled)

    train_PCA_output_path = os.path.join('/opt/ml/processing/train',
                                         'train_sparse_pca.csv')
    test_PCA_output_path = os.path.join('/opt/ml/processing/test',
                                        'test_sparse_pca.csv')

    # Saving transformed train data
    pd.concat([
        train_df['target'],
        pd.DataFrame(
            sparse_train,
            columns=['c{}'.format(num + 1) for num in range(n_components)],
            index=train_df.index)
    ],
Example #18
# Projection on to the first 2 principal components

print("Computing Kernel PCA projection")
t0 = time()
kpca = decomposition.KernelPCA(n_components=n_com)
X_kpca = kpca.fit_transform(X)
plot_embedding(
    X_kpca,
    "Kernel Principal Components projection of the digits (time %.2fs)" %
    (time() - t0))
# ----------------------------------------------------------------------
# Projection on to the first 2 principal components

print("Computing Sparce PCA projection")
t0 = time()
spca = decomposition.SparsePCA(n_components=n_com)
X_spca = spca.fit_transform(X)
plot_embedding(
    X_spca,
    "Sparse Principal Components projection of the digits (time %.2fs)" %
    (time() - t0))

# ----------------------------------------------------------------------
# Isomap projection of the digits dataset
print("Computing Isomap embedding")
t0 = time()
iso = manifold.Isomap(n_neighbors, n_components=n_com)
X_iso = iso.fit_transform(X)
print("Done.")
plot_embedding(X_iso,
               "Isomap projection of the digits (time %.2fs)" % (time() - t0))
Example #19
 def __init__(self, source):
     min_max_scaler = preprocessing.MinMaxScaler()
     data_source = min_max_scaler.fit_transform(source)
     pca = decomposition.SparsePCA(n_components=2)
     self.return_data = pca.fit_transform(data_source)
Example #20
Y_TRAIN = traindata[:, -1]

X_TEST = testdata[:, 2 : -1]
Y_TEST = testdata[:, -1]

max_score = 0
num_components = 0
'''
Initialize sequence of estimators:
    Data pre-processing
    Feature selection
    Classifier
'''
estimators = []
estimators.append(('scaler', preprocessing.StandardScaler()))
estimators.append(('FeatureSelection', decomposition.SparsePCA()))
estimators.append(('clf', SGDClassifier(loss='log', penalty='elasticnet', n_iter=100)))
'''
Form a pipeline from the estimators
'''
model = Pipeline(estimators)
'''
Set hyper parameters for grid search
'''
parameters = {
    'FeatureSelection__n_components': (4, 8, 12, 16),
    'FeatureSelection__alpha': (0.1, 1, 10, 100),
    'clf__alpha': (0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001),
    'clf__l1_ratio': (0, 0.15, 0.5, 0.85, 1),
    'clf__n_iter': (10, 50, 80, 250)
}
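The excerpt stops before the search itself. A hedged sketch of how `model` and `parameters` would typically be combined (GridSearchCV is imported from sklearn.model_selection in recent releases, and X_TRAIN is assumed to be defined alongside Y_TRAIN earlier in the original script):

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(model, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_TRAIN, Y_TRAIN)   # X_TRAIN / Y_TRAIN come from the original script

print(grid_search.best_score_)
print(grid_search.best_params_)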
Example #21
all_projections['LAMP']    = (vp.LAMP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'sample_type': ['random', 'clustering_centroid']})
all_projections['LE']      = (manifold.SpectralEmbedding(), {'n_components': [2], 'affinity': ['nearest_neighbors'], 'random_state': [42]})
all_projections['LISO']    = (vp.LandmarkIsomap(), {'verbose': [False], 'n_neighbors': [4, 8, 16], 'dissimilarity_type': ['euclidean']})
all_projections['LLC']     = (drtoolbox.LLC(), {'k': [8, 12], 'n_analyzers': [10, 20], 'max_iter': [200, 400], 'verbose': [False]})
all_projections['LLE']     = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['standard'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['LLTSA']   = (tapkee.LinearLocalTangentSpaceAlignment(), {'n_neighbors': [4, 7, 11], 'verbose': [False]}) # subject to "eigendecomposition failed" errors (Eigen's NoConvergence)
all_projections['LMDS']    = (tapkee.LandmarkMDS(), {'n_neighbors': [4, 7, 11], 'verbose': [False]})
all_projections['LMNN']    = (drtoolbox.LMNN(), {'k': [3, 5, 7], 'verbose': [False]})
all_projections['LMVU']    = (drtoolbox.LandmarkMVU(), {'k1': [3, 5, 7], 'k2': [8, 12, 15], 'verbose': [False]})
all_projections['LPP']     = (tapkee.LocalityPreservingProjections(), {'n_neighbors': [4, 7, 11], 'verbose': [False]}) # subject to "eigendecomposition failed" errors (Eigen's NoConvergence)
all_projections['LSP']     = (vp.LSP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'n_neighbors': [4, 8, 16], 'control_point_type': ['random', 'kmeans'], 'dissimilarity_type': ['euclidean']})
all_projections['LTSA']    = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['ltsa'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['MC']      = (drtoolbox.ManifoldChart(), {'n_analyzers': [10, 20], 'max_iter': [200, 400], 'verbose': [False]})
all_projections['MCML']    = (drtoolbox.MCML(), {'verbose': [False]})
all_projections['MDS']     = (manifold.MDS(), {'n_components': [2], 'n_init': [2, 4], 'metric': [True], 'max_iter': [300, 500], 'random_state': [42]})
all_projections['MLLE']    = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['modified'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['MVU']     = (drtoolbox.MVU(), {'k': [8, 12, 15], 'verbose': [False]})
all_projections['NMDS']    = (manifold.MDS(), {'n_components': [2], 'n_init': [2, 4], 'metric': [False], 'max_iter': [300, 500], 'random_state': [42]})
all_projections['NMF']     = (decomposition.NMF(), {'n_components': [2], 'init': ['random', 'nndsvdar'], 'beta_loss': ['frobenius'], 'max_iter': [200, 400], 'alpha': [0, 0.5], 'l1_ratio': [0.0, 0.5], 'random_state': [42]})
all_projections['PBC']     = (vp.ProjectionByClustering(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'init_type': ['fastmap', 'random'], 'dissimilarity_type': ['euclidean'], 'cluster_factor': [1.5, 4.5, 9.0]})
all_projections['PCA']     = (decomposition.PCA(), {'n_components': [2], 'random_state': [42]})
all_projections['PLSP']    = (vp.PLSP(), {'dissimilarity_type': ['euclidean'], 'verbose': [False], 'sample_type': ['clustering']})
all_projections['PPCA']    = (drtoolbox.ProbPCA(), {'max_iter': [200, 400], 'verbose': [False]})
all_projections['RSAM']    = (vp.RapidSammon(), {'verbose': [False], 'dissimilarity_type': ['euclidean']})
all_projections['SPCA']    = (decomposition.SparsePCA(), {'n_components': [2], 'alpha': [0.01, 0.1, 0.5], 'ridge_alpha': [0.05, 0.05, 0.5], 'max_iter': [1000, 2000], 'tol': [1e-08], 'method': ['lars'], 'random_state': [42], 'normalize_components': [True]})
all_projections['SPE']     = (tapkee.StochasticProximityEmbedding(), {'n_neighbors': [6, 12, 18], 'n_updates': [20, 70], 'max_iter': [0], 'verbose': [False]})
all_projections['SRP']     = (random_projection.SparseRandomProjection(), {'n_components': [2], 'density': ['auto'], 'random_state': [42]})
all_projections['TSNE']    = (mtsne.MTSNE(), {'n_components': [2], 'perplexity': [5.0, 15.0, 30.0, 50.0], 'early_exaggeration': [6.0, 12.0, 18.0], 'learning_rate': [200.0], 'n_iter': [1000, 3000], 'n_iter_without_progress': [300], 'min_grad_norm': [1e-07], 'metric': ['euclidean'], 'init': ['random'], 'random_state': [42], 'method': ['barnes_hut'], 'angle': [0.5], 'n_jobs': [4]})
all_projections['TSVD']    = (decomposition.TruncatedSVD(), {'n_components': [2], 'algorithm': ['randomized'], 'n_iter': [5, 10], 'random_state': [42]})
all_projections['UMAP']    = (umap.UMAP(), {'n_components': [2], 'random_state': [42], 'n_neighbors': [5, 10, 15], 'metric': ['euclidean'], 'init': ['spectral', 'random'], 'min_dist': [0.001, 0.01, 0.1, 0.5], 'spread': [1.0], 'angular_rp_forest': [False]})
Example #22
    return tmp


from sklearn import decomposition

l = AllStateDataLoader()
print("Extraction data_2...")
data_2 = l.get_data_2_train()
print("Extraction data_3...")
data_3 = l.get_data_3_train()
print("Extraction data_all...")
data_all = l.get_data_all_train()

data_all_reindexed = duplicate_data(data_all)

pca = decomposition.SparsePCA(n_components=3, verbose=True)

X = get_X_without_scaler(data_all_reindexed)
pca.fit(X)
X_pca = pca.transform(X)

x1 = -600
x2 = 400
y1 = -80
y2 = 80

plt.figure()
plt.subplot(311)
plt.plot(X_pca[data_all["real_A"] == 1,2], X_pca[data_all["real_A"] == 1,1], "b+")
#plt.axis((x1,x2,y1,y2))
plt.subplot(312)
Example #23
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, decomposition, manifold
from itertools import cycle


def load_data():
    iris = datasets.load_iris()
    return iris.data, iris.target


PCA_Set = [
    decomposition.PCA(n_components=None),
    decomposition.PCA(svd_solver='randomized'),
    decomposition.SparsePCA(n_components=None),
    decomposition.IncrementalPCA(n_components=None),
    decomposition.KernelPCA(n_components=None, kernel='linear'),
    decomposition.KernelPCA(n_components=None, kernel='rbf'),
    decomposition.KernelPCA(n_components=None, kernel='poly'),
    decomposition.KernelPCA(n_components=None, kernel='sigmoid'),
    decomposition.FastICA(n_components=None)
]
PCA_Set_Name = [
    'Default', 'Randomized', 'Sparse', 'Incremental', 'Kernel(linear)',
    'Kernel(rbf)', 'Kernel(poly)', 'Kernel(sigmoid)', 'ICA'
]


def plot_PCA(*data):
    X, Y = data
    fig = plt.figure("PCA", figsize=(20, 8))
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import decomposition

df = pd.read_csv('data_1k_num.csv', index_col=None)

jb = df.JobRole.unique()
#print jb
for i in range(len(jb)):
    df.JobRole.replace(jb[i], i + 1, inplace=True)

dept = df.Department.unique()
for i in range(len(dept)):
    df.Department.replace(dept[i], i + 1, inplace=True)

est = decomposition.SparsePCA(20, alpha=0.5, max_iter=100)

c = est.fit(df)
df1 = pd.DataFrame(est.components_)
df1 = (df1)
df1.to_csv("Reduced_Components.csv")
print(est.components_)
Example #25
def main():
    progname = os.path.basename(sys.argv[0])
    usage = """prog [options] <input stack> <output basis> [reprojections]
This tool provides a variety of dimensionality reduction methods. This new version
uses scikit-learn, which provides a greater variety of algorithms, but must load
all data into memory. If working with a large file, you may want to consider using
--step to operate on a limited subset of the data.

If specified, [reprojections] will contain projections of the full input stack
(ignoring --step) into the basis subspace represented as a single image. This 
obviates the need for e2basis.py, and permits use of nonlinear decompositions.

---
Performs multivariate statistical analysis on a stack of images. Writes
a set of Eigenimages which can be used as a basis set for reducing
the dimensionality of a data set (noise reduction). Typically this
basis set is then used to reproject the data (e2basis.py) and
classify the data based on the projected vectors. If the
output file supports arbitrary metadata (like HDF), Eigenvalues
are stored in the 'eigval' parameter in each image.

Note: The mean value is subtracted from each image prior to MSA
calculation. The mean image is stored as the first image in the output
file, though it is not part of the orthonormal basis when
handled this way."""

    parser = EMArgumentParser(usage=usage, version=EMANVERSION)

    parser.add_argument(
        "--mode",
        type=str,
        help="Mode should be one of: pca, sparsepca, fastica, factan, lda, nmf",
        default="pca")
    parser.add_argument(
        "--nomean",
        action="store_true",
        help="Suppress writing the average image as the first output image",
        default=False)
    parser.add_argument(
        "--nomeansub",
        action="store_true",
        help=
        "Suppress subtracting the mean from each input image, also implies --nomean",
        default=False)
    parser.add_argument("--nbasis",
                        "-n",
                        type=int,
                        help="Number of basis images to generate.",
                        default=20)
    parser.add_argument(
        "--maskfile",
        "-M",
        type=str,
        help=
        "File containing a mask defining the pixels to include in the Eigenimages"
    )
    parser.add_argument(
        "--projin",
        type=str,
        default=None,
        help=
        "When generating subspace projections, use this file instead of the input used for the MSA"
    )
    parser.add_argument(
        "--normproj",
        action="store_true",
        help=
        "When generating subspace projections, normalize each projection vector to unit length",
        default=False)
    parser.add_argument(
        "--mask",
        type=int,
        help=
        "Mask radius, negative values imply ny/2+1+mask, --mask=0 disables, --maskfile overrides",
        default=0)
    parser.add_argument(
        "--simmx",
        type=str,
        help=
        "Will use transformations from simmx on each particle prior to analysis"
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help=
        "Perform a careful normalization of input images before MSA. Otherwise normalization is not modified until after mean subtraction.",
        default=False)
    parser.add_argument(
        "--step",
        type=str,
        default="0,1",
        help=
        "Specify <init>,<step>[,last]. Processes only a subset of the input data. For example, 0,2 would process only the even numbered particles"
    )
    parser.add_argument(
        "--ppid",
        type=int,
        help="Set the PID of the parent process, used for cross platform PPID",
        default=-1)
    parser.add_argument(
        "--verbose",
        "-v",
        dest="verbose",
        action="store",
        metavar="n",
        type=int,
        default=0,
        help=
        "verbose level [0-9], higher number means higher level of verboseness")

    #parser.add_argument("--gui",action="store_true",help="Start the GUI for interactive boxing",default=False)
    #parser.add_argument("--boxsize","-B",type=int,help="Box size in pixels",default=-1)
    #parser.add_argument("--dbin","-D",type=str,help="Filename to read an existing box database from",default=None)

    (options, args) = parser.parse_args()
    if len(args) < 2: parser.error("Input and output filenames required")

    logid = E2init(sys.argv, options.ppid)

    if options.verbose > 0: print("Beginning MSA")

    # Number of images in the input file
    nfile = EMUtil.get_image_count(args[0])

    try:
        step = [int(i) for i in options.step.split(",")]
        if len(step) == 1: step = (0, step[0], nfile)
        elif len(step) == 2: step.append(nfile)
        elif len(step) == 3:
            if step[2] <= 0:
                step[2] += nfile  # undocumented negative final value permitted
        else:
            raise Exception
    except:
        print("Invalid --step specification")
        sys.exit(1)

    # setup mask image
    if options.maskfile:
        mask = EMData(options.maskfile, 0)
        if mask["mean_nonzero"] != 1.0:
            print("ERROR: maskfile must be a binary mask (1/0 only)")
            sys.exit(1)
    else:
        # default is no masking
        mask = EMData(args[0], 0)
        mask.to_one()
        # negative values handled by mask.sharp
        if options.mask != 0:
            mask.process_inplace("mask.sharp", {"outer_radius": options.mask})

    # Memory usage warning >2G raw data
    n = (step[2] - step[0]) // step[1]
    nval = int(mask["square_sum"])
    #	print(args[0],n,nval)
    if options.verbose or n * nval > 500000000:
        print("Estimated memory usage (mb): ", n * nval * 4 / 2**20)

    # Read all image data into numpy array
    if options.simmx: data = simmx_get(args[0], options.simmx, mask, step)
    else: data = normal_get(args[0], mask, step)

    if options.normalize:
        for i in range(len(data)):
            data[i] /= np.linalg.norm(data[i])

    # first output image is the mean of the input vectors, which has been subtracted from each vector
    try:
        os.unlink(args[1])
    except:
        pass
    mean = np.mean(data, 0)
    if not options.nomeansub:
        for i in range(len(data)):
            data[i] -= mean
    #from_numpy(mean).process("misc.mask.pack",{"mask":mask,"unpack":1}).write_image(args[1],0)

    shift = 0
    # This is where the actual action takes place!
    if options.mode == "pca":
        msa = skdc.PCA(n_components=options.nbasis)
        #		print(data.shape)
        msa.fit(data)
    elif options.mode == "factan":
        msa = skdc.FactorAnalysis(n_components=options.nbasis)
        msa.fit(data)
    elif options.mode == "sparsepca":
        msa = skdc.SparsePCA(n_components=options.nbasis)
        #		print(data.shape)
        msa.fit(data)
    elif options.mode == "fastica":
        msa = skdc.FastICA(n_components=options.nbasis,
                           algorithm="parallel",
                           max_iter=500,
                           tol=0.001)
        msa.fit(data)
    elif options.mode == "lda":
        shift = max(-data.min() + data.std() * 0.5,
                    data.std() * 4.0 - data.mean())  # we need positivity
        # if we are processing projections later, we need to try to ensure that they will be positive as well
        if options.projin:
            nfile2 = EMUtil.get_image_count(options.projin)
            pmin = 0
            pstd = 0
            pmean = 0
            pn = 0
            for i in range(0, nfile2,
                           nfile2 // 256):  # read a scattering of images
                tmp = EMData(options.projin, i)
                pmin = min(pmin, tmp["minimum"])
                pstd = max(pstd, tmp["sigma_nonzero"])
                pmean += tmp["mean"]
                pn += 1
            pmean /= pn
            shiftp = max(pmin + pstd * 0.5, pstd * 4.0 - pmean)
            shift = max(shift, shiftp)

        data += shift
        msa = skdc.LatentDirichletAllocation(n_components=options.nbasis,
                                             learning_method="online",
                                             verbose=1)
        msa.fit(data)
    elif options.mode == "nmf":
        shift = max(-data.min() + data.std() * 1.5,
                    data.std() * 4.0 - data.mean())  # we need positivity
        # if we are processing projections later, we need to try to ensure that they will be positive as well
        if options.projin:
            nfile2 = EMUtil.get_image_count(options.projin)
            pmin = 0
            pstd = 0
            pmean = 0
            pn = 0
            for i in range(0, nfile2,
                           nfile2 // 256):  # read a scattering of images
                tmp = EMData(options.projin, i)
                pmin = min(pmin, tmp["minimum"])
                pstd = max(pstd, tmp["sigma_nonzero"])
                pmean += tmp["mean"]
                pn += 1
            pmean /= pn
            shiftp = max(pmin + pstd * 0.5, pstd * 4.0 - pmean)
            shift = max(shift, shiftp)

        data += shift
        msa = skdc.NMF(n_components=options.nbasis, init="nndsvd")
        msa.fit(data)

    # write mean
    if not options.nomean and not options.nomeansub:
        mn = from_numpy(mean).process("misc.mask.pack", {
            "mask": mask,
            "unpack": 1
        })
        mn["eigval"] = 0  # we add this artifically to the mean image, both to mark it, and to make some other code requiring it work. It isn't meaningful as a value, obviously
        mn.write_image(args[1], 0)


#	print(msa.components_.shape)
#	c=from_numpy(msa.components_.copy()).write_image("z.hdf",0)

    if options.verbose > 0: print("MSA complete")

    # write other basis vectors
    if options.nomean or options.nomeansub: offset = 0
    else: offset = 1
    for i, v in enumerate(msa.components_):
        im = from_numpy(v.copy()).process("misc.mask.pack", {
            "mask": mask,
            "unpack": 1
        })
        if options.mode == "pca":
            im["eigval"] = float(msa.singular_values_[i])
            im["explvarfrac"] = float(msa.explained_variance_ratio_[i])
            if options.verbose:
                print("Explained variance: ", im["explvarfrac"],
                      "\tSingular Value: ", im["eigval"])
        elif options.mode == "fastica":
            if im["sigma"] > 0:
                im.mult(1.0 / im["sigma"]
                        )  # fastica seems to produce very small vector lengths
        im.write_image(args[1], i + offset)

    # if requested we use the model to generate reprojections of the full set of input images
    # into the new subspace. This permits use of nonlinear algorithms (the components_ output
    # is not directly usable)
    if len(args) > 2:
        try:
            os.unlink(args[2])
        except:
            pass

        if options.projin != None:
            images = options.projin
            nfile2 = EMUtil.get_image_count(images)
            step2 = [0, 1, nfile2]
        else:
            nfile2 = nfile
            step2 = step
            images = args[0]

        if options.verbose: print("Reprojecting input data into subspace")
        chunksize = min(max(2, 250000000 // nval),
                        step2[2])  # limiting memory usage for this step to ~2G
        out = EMData(
            options.nbasis, step2[2]
        )  # we hold the full set of reprojections in memory, though
        start = 0
        while (start < step2[2]):
            stept = [start, 1, min(step2[2], start + chunksize)]
            if options.verbose: print(stept)

            # read a chunk of data
            if options.simmx:
                chunk = simmx_get(images, options.simmx, mask, stept)
            else:
                chunk = normal_get(images, mask, stept)
            if shift != 0:
                chunk += shift  # for methods requiring positivity
                if chunk.min() <= 0:
                    print(
                        "ERROR: Results invalid, negative values. Shifting to prevent crash. Chunk ",
                        stept, " has mean=", chunk.mean(), "std=", chunk.std(),
                        "min=", chunk.min())
                    chunk += -chunk.min()

            proj = msa.transform(chunk)  # into subspace
            if options.normproj:
                for i in range(len(proj)):
                    proj[i] /= np.linalg.norm(proj[i])
            im = from_numpy(proj.copy())
            out.insert_clip(im, (0, start, 0))
            start += chunksize

        # write results
        out.write_image(args[2], 0)

    E2end(logid)
    if options.mode not in ("pca", "sparsepca", "fastica"):
        print(
            "WARNING: While projection vectors are reliable, use of modes other than PCA or ICA may involve nonlinarities, meaning the 'Eigenimages' may not be interpretable in the usual way."
        )
def dim_reduction(x, alg='pca', n_comp=2048):
    if alg == 'pca':
        return decomposition.PCA(n_components=n_comp).fit_transform(x)
    else:
        return decomposition.SparsePCA(n_components=n_comp).fit_transform(x)
Example #27
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # Has a `:` before the search list, which asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (
            len(lst) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name] = ev
            else:
                # only for estimator eval, add `-` to the end of param
                #TODO maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            #TODO regular expression check ?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
def main():
    results = []
    args = parse_args()

    # there's a bug in the _sparseness method in sklearn's nmf module that is
    # hit in some edge cases.  The value it computes isn't actually needed in
    # this case, so we can just ignore this divide by 0 error
    np.seterr(invalid="ignore")
    print("Processing %s" % args.data_file)
    full_mtx = np.loadtxt(args.data_file, delimiter=',')
    # normalize / clean-up our matrix: remove genes that are always 0, and
    # convert all non-zero values to 1
    min_col_mutation_pct = 1.5  # columns with < this percentage of mutations will be deleted
    full_mtx = Matrix.trim_cols(full_mtx, 100 - min_col_mutation_pct)
    full_mtx = Matrix.to_binary(full_mtx)

    partitions = ({
        'start_row': 0,
        'end_row': 269,
        'name': "COAD"
    }, {
        'start_row': 270,
        'end_row': 689,
        'name': "KIRC"
    }, {
        'start_row': 690,
        'end_row': 941,
        'name': "PRAD"
    }, {
        'start_row': 942,
        'end_row': 1284,
        'name': "SKCM"
    }, {
        'start_row': 1285,
        'end_row': 1341,
        'name': "UCS"
    })

    # create an array with the classes of each sample
    classes = list()
    for partition_idx, partition in enumerate(partitions):
        for row_idx in range(partition['start_row'], partition['end_row'] + 1):
            classes.append(partition_idx)

    l_reg = LogisticRegression()
    lreg_mtx = l_reg.fit_transform(full_mtx, classes)
    accuracy = l_reg.score(full_mtx, classes)
    print("accuracy = %f" % accuracy)
    print("regression dimensions : %d, %d" % (len(lreg_mtx), len(lreg_mtx[0])))

    print("Computing Sparse PCA projection")
    pca_mtx = decomposition.SparsePCA(n_components=100).fit_transform(lreg_mtx)
    pca_mtx = Matrix.to_binary(pca_mtx)
    #rank_range = range(2,3)
    rank_range = range(2, 11)

    for partition in partitions:
        print("Processing %s cancer" % partition['name'])
        mtx = pca_mtx[partition['start_row']:partition['end_row'] + 1]
        mtx = Matrix.trim_rows(mtx)  # remove empty rows
        print("Matrix is %d by %d and %f sparse" %
              (len(mtx), len(mtx[0]), Matrix.get_sparsity(mtx)))
        mtx = np.matrix.transpose(
            mtx)  # transpose to put samples into columns, genes into rows

        print(
            "=====> Finding the optimum # clusters (range = %d to %d) <=====" %
            (rank_range[0], rank_range[-1]))

        results = list()

        for num_clusters in rank_range:
            print("Trying cluster size %d " % num_clusters)
            nmf = Nmf(mtx, clusters=num_clusters)
            c, w, h = nmf.get_consensus_matrix()
            coph = nmf.coph_cor(c)
            results.append({
                'rank': num_clusters,
                'consensus': Matrix.reorder(c),
                'w': w,
                'h': h,
                'coph': coph
            })

        best = max(results, key=lambda x: x['coph'])
        worst = min(results, key=lambda x: x['coph'])

        print("worst: rank %d with %f" % (worst['rank'], worst['coph']))
        print("best: rank %d with %f" % (best['rank'], best['coph']))
        print("all:", [x['coph'] for x in results])

        base_name = join(args.out_dir, partition['name'])

        Matrix.to_cophenetic_plot(rank_range, [x['coph'] for x in results],
                                  base_name + "_cophenetic.png")

        Matrix.to_heat_map(mtx, base_name + "_a.png")
        Matrix.to_heat_map(best['w'], base_name + "_w.png")
        Matrix.to_heat_map(best['h'], base_name + "_h.png")
        #sort h matrix
        ordered_h = np.sort(best['h'], axis=0)
        Matrix.to_line_plot(ordered_h, base_name + "_h_line.png")
        Matrix.to_consensus_plot(best['consensus'], best['rank'], base_name)
        Matrix.to_consensus_plot(results[0]['consensus'], results[0]['rank'],
                                 base_name)
        Matrix.to_consensus_plot(results[1]['consensus'], results[1]['rank'],
                                 base_name)
        Matrix.to_consensus_plot(results[2]['consensus'], results[2]['rank'],
                                 base_name)
        Matrix.to_consensus_plot(results[3]['consensus'], results[3]['rank'],
                                 base_name)