import pandas as pd
import skrebate
import numpy as np

############## CGEMS prostate cancer (MultiSURF only) ###################
# Loads the CGEMS CSV, fits MultiSURF on every column except 'class' and
# writes the fitted estimator's `top_features_` (cast to int32) to
# `msurf_out`.  The ReliefF / SURF / SURFstar variants are kept as
# commented-out lines and are currently disabled.
input_file = '/home/ansohn/data/CGEMS-data/CGEMS-prostate-cancer-data-only-genes-predict-aggressive.csv'

# Output files; only the MultiSURF one is active.
#rel_out = '/home/ansohn/cgems/cgems_relieff.txt'
#surf_out = '/home/ansohn/cgems/cgems_surf.txt'
#surfstar_out = '/home/ansohn/cgems/cgems_surfstar.txt'
msurf_out = '/home/ansohn/cgems/cgems_msurf.txt'

# The 'class' column is the target, every remaining column is a feature.
data = pd.read_csv(input_file)
labels = data['class'].values
features = data.drop(columns='class')

#rel = skrebate.ReliefF(n_features_to_select=2, n_jobs=-1)
#surf = skrebate.SURF(n_features_to_select=2, n_jobs=-1)
#surfstar = skrebate.SURFstar(n_features_to_select=2, n_jobs=-1)
msurf = skrebate.MultiSURF(n_jobs=-1, n_features_to_select=2)

#rel1 = np.savetxt(rel_out, rel.fit(features.values, labels).top_features_.astype('int32'))
#surf1 = np.savetxt(surf_out, surf.fit(features.values, labels).top_features_.astype('int32'))
#surfstar1 = np.savetxt(surfstar_out, surfstar.fit(features.values, labels).top_features_.astype('int32'))
# Fit first, then dump the estimator's `top_features_` attribute.
msurf.fit(features.values, labels)
msurf1 = np.savetxt(msurf_out, msurf.top_features_.astype('int32'))
# Example #2
# 0
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # Have `:` before search list, asks for estimator evaluatio
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO maybe add regular express check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernalCenter()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
############## Her 0.1 ###################
# a5000 / Her01 input: fit ReliefF, SURF, SURFstar and MultiSURF in turn
# and write each estimator's `top_features_` (cast to int32) to its own
# text file.
input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a5000/Her01/a_5000s_2000her_0.1__maf_0.2_EDM-1_01.txt'
rel_out = '/home/ansohn/Python/data/gametes-data/a5000_h01_relieff.txt'
surf_out = '/home/ansohn/Python/data/gametes-data/a5000_h01_surf.txt'
surfstar_out = '/home/ansohn/Python/data/gametes-data/a5000_h01_surfstar.txt'
msurf_out = '/home/ansohn/Python/data/gametes-data/a5000_h01_msurf.txt'

# Tab-separated input; 'Class' is the target, the rest are features.
data = pd.read_csv(input_file, sep='\t')
labels = data['Class'].values
features = data.drop(columns='Class')

rel = skrebate.ReliefF(n_jobs=18)
surf = skrebate.SURF(n_jobs=18)
surfstar = skrebate.SURFstar(n_jobs=18)
msurf = skrebate.MultiSURF(n_jobs=18)

# Fit and save one algorithm at a time, in the same order as before.
rel.fit(features.values, labels)
rel1 = np.savetxt(rel_out, rel.top_features_.astype('int32'))

surf.fit(features.values, labels)
surf1 = np.savetxt(surf_out, surf.top_features_.astype('int32'))

surfstar.fit(features.values, labels)
surfstar1 = np.savetxt(surfstar_out, surfstar.top_features_.astype('int32'))

msurf.fit(features.values, labels)
msurf1 = np.savetxt(msurf_out, msurf.top_features_.astype('int32'))



############## Her 0.2 ###################
# Only the paths are set for this configuration; nothing is run with them
# in this section.
input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a5000/Her02/a_5000s_2000her_0.2__maf_0.2_EDM-1_01.txt'
rel_out = '/home/ansohn/Python/data/gametes-data/a5000_h02_relieff.txt'
surf_out = '/home/ansohn/Python/data/gametes-data/a5000_h02_surf.txt'
surfstar_out = '/home/ansohn/Python/data/gametes-data/a5000_h02_surfstar.txt'
msurf_out = '/home/ansohn/Python/data/gametes-data/a5000_h02_msurf.txt'
def get_search_params(params_builder):
    """Translate ``params_builder['param_set']`` into a search-parameter dict.

    Each entry carries ``search_param_selector['search_p']``, a string of
    the form ``<name>: <expression>`` containing exactly one colon, and
    ``search_param_selector['selected_param_type']``.  Entries whose
    ``search_p`` is blank are skipped.

    * ``<name>`` not ending in ``-``: the expression is evaluated with
      ``SafeEval(load_scipy=True, load_numpy=True)``.
    * ``<name>`` ending in ``-``: the expression is evaluated with
      ``SafeEval(load_estimators=True)``; every resulting object that has
      an ``n_jobs`` parameter gets it set to ``N_JOBS``, and the trailing
      ``-`` is dropped from the key.
    * empty ``<name>`` (not allowed for the final estimator): the
      expression is evaluated as a list of preprocessors, where ``None``,
      keywords such as ``'all_0'``, integer indices into the predefined
      list, and objects exposing ``get_params`` are accepted.

    Keys are ``'estimator__<name>'`` when the param type is
    ``'final_estimator_p'``; otherwise ``'preprocessing_<c>__<name>'`` (or
    ``'preprocessing_<c>'`` when the name is empty), where ``<c>`` is the
    character at index 5 of the param type (presumably the step number --
    TODO confirm against the caller).

    Returns:
        dict mapping search keys to evaluated values.

    Raises:
        AssertionError: if ``search_p`` does not contain exactly one colon.
        SystemExit: via ``sys.exit`` when ``n_jobs`` is requested, when a
            preprocessor item is of an unsupported kind, or when the
            name is omitted for the final estimator.
    """
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        # Raised explicitly instead of using `assert`: assert statements
        # are removed under `python -O`, which would turn a missing colon
        # into an IndexError and silently truncate input with extra colons.
        if len(lst) != 2:
            raise AssertionError(
                "Error, make sure there is one and only one colon in search parameter input."
            )
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name] = ev
            else:
                # only for estimator eval, add `-` to the end of param
                # TODO maybe add regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO regular expression check ?
            ev = safe_eval_es(literal)
            # Order matters: entries are addressed by index and by the
            # fixed slices used in the keyword branches below.
            preprocessors = [
                # 0-7: sklearn.preprocessing
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                # 8-14: sklearn.feature_selection
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                # 15-25: sklearn.decomposition
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                # 26-29: sklearn.kernel_approximation
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                # 30: sklearn.cluster
                cluster.FeatureAgglomeration(),
                # 31-35: skrebate
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                # 36-54: imblearn
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
############## Her 0.1 ###################
# a10 / Her01 input: fit ReliefF, SURF, SURFstar and MultiSURF in turn and
# write each estimator's `top_features_` (cast to int32) to its own text
# file.
input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a10/Her01/a_10s_2000her_0.1__maf_0.2_EDM-1_01.txt'
rel_out = '/home/ansohn/Python/data/gametes-data/a10_h01_relieff.txt'
surf_out = '/home/ansohn/Python/data/gametes-data/a10_h01_surf.txt'
surfstar_out = '/home/ansohn/Python/data/gametes-data/a10_h01_surfstar.txt'
msurf_out = '/home/ansohn/Python/data/gametes-data/a10_h01_msurf.txt'

# Tab-separated input; 'Class' is the target, the rest are features.
data = pd.read_csv(input_file, sep='\t')
labels = data['Class'].values
features = data.drop(columns='Class')

rel = skrebate.ReliefF(n_jobs=5)
surf = skrebate.SURF(n_jobs=5)
surfstar = skrebate.SURFstar(n_jobs=5)
msurf = skrebate.MultiSURF(n_jobs=5)

# Fit and save one algorithm at a time, in the same order as before.
rel.fit(features.values, labels)
rel1 = np.savetxt(rel_out, rel.top_features_.astype('int32'))

surf.fit(features.values, labels)
surf1 = np.savetxt(surf_out, surf.top_features_.astype('int32'))

surfstar.fit(features.values, labels)
surfstar1 = np.savetxt(surfstar_out, surfstar.top_features_.astype('int32'))

msurf.fit(features.values, labels)
msurf1 = np.savetxt(msurf_out, msurf.top_features_.astype('int32'))

############## Her 0.2 ###################
# Example #6
# 0
############## Her 0.1 ###################
# a100 / Her01 input: the data is loaded and the four estimators are
# created with their default arguments, but the fit-and-save lines are
# commented out, so nothing is written in this section.
input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a100/Her01/a_100s_2000her_0.1__maf_0.2_EDM-1_01.txt'
rel_out = '/home/ansohn/Python/data/gametes-data/a100_h01_relieff.txt'
surf_out = '/home/ansohn/Python/data/gametes-data/a100_h01_surf.txt'
surfstar_out = '/home/ansohn/Python/data/gametes-data/a100_h01_surfstar.txt'
msurf_out = '/home/ansohn/Python/data/gametes-data/a100_h01_msurf.txt'

# Tab-separated input; 'Class' is the target, the rest are features.
data = pd.read_csv(input_file, sep='\t')
labels = data['Class'].values
features = data.drop(columns='Class')

rel = skrebate.ReliefF()
surf = skrebate.SURF()
surfstar = skrebate.SURFstar()
msurf = skrebate.MultiSURF()

# Disabled runs:
#rel1 = np.savetxt(rel_out, rel.fit(features.values, labels).top_features_.astype('int32'))
#surf1 = np.savetxt(surf_out, surf.fit(features.values, labels).top_features_.astype('int32'))
#surfstar1 = np.savetxt(surfstar_out, surfstar.fit(features.values, labels).top_features_.astype('int32'))
#msurf1 = np.savetxt(msurf_out, msurf.fit(features.values, labels).top_features_.astype('int32'))

############## Her 0.2 ###################
#input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a100/Her02/a_100s_2000her_0.2__maf_0.2_EDM-1_01.txt'
#rel_out = '/home/ansohn/Python/data/gametes-data/a100_h02_relieff.txt'
#surf_out = '/home/ansohn/Python/data/gametes-data/a100_h02_surf.txt'
#surfstar_out = '/home/ansohn/Python/data/gametes-data/a100_h02_surfstar.txt'
#msurf_out = '/home/ansohn/Python/data/gametes-data/a100_h02_msurf.txt'
#
#data = pd.read_csv(input_file, sep='\t')
#labels = data['Class'].values