import numpy as np
import skrebate


def initialize(self, jobs_server='', method='random', features=None, labels=None):
    """Initialize the colony.

    Note: rebinding ``self`` inside a method never reaches the caller, so the
    initialized individuals are written back in place (assuming the colony
    object is list-like).
    """
    # -----------------------------------
    # SINGLE PROCESSING
    # -----------------------------------
    if jobs_server == '':
        if method == 'random':
            self[:] = [individual.initialize_random() for individual in self]
        elif method == 'relief':
            # Rank features with ReliefF, then turn the shifted importance
            # scores into a cumulative, normalized selection probability pF.
            RF = skrebate.ReliefF(n_features_to_select=2, n_neighbors=100).fit(
                features[0], labels[0])
            y = np.cumsum(RF.feature_importances_ - min(RF.feature_importances_))
            pF = y / max(y)
            self[:] = [individual.initialize_relief(pF=pF / sum(pF))
                       for individual in self]
    # -----------------------------------
    # MULTIPROCESSING
    # -----------------------------------
    else:
        # Assumes `jobs_server` exposes a pool-style map().
        self[:] = jobs_server.map(
            initialize, [(individual, method) for individual in self])
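# A minimal standalone sketch (not from the original module) of the `relief`
# branch above: ReliefF importances become a monotone probability vector over
# features. The data, shapes, and n_neighbors=10 are illustrative assumptions.
import numpy as np
import skrebate

rng = np.random.RandomState(0)
X = rng.randint(0, 3, size=(100, 10)).astype(float)  # 100 samples, 10 SNP-like features
y_labels = rng.randint(0, 2, size=100)               # binary class labels

RF = skrebate.ReliefF(n_features_to_select=2, n_neighbors=10).fit(X, y_labels)

# Shift importances to be non-negative, accumulate, scale to [0, 1], then
# divide by the sum so the result is a probability vector over features.
y = np.cumsum(RF.feature_importances_ - RF.feature_importances_.min())
pF = y / y.max()
print(pF / pF.sum())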
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` before the search list asks for estimator
            # evaluation.
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))

            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user-uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
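# For context, a hypothetical `params_builder` in the shape _eval_search_params
# consumes. The key names ('param_set', 'sp_name', 'sp_list') come from the
# code above; the concrete parameter names and values are illustrative
# assumptions, and SafeEval, N_JOBS, and NON_SEARCHABLE must already be
# defined in the module.
params_builder = {
    'param_set': [
        # Plain search list, evaluated with the scipy/numpy-enabled SafeEval.
        {'sp_name': 'estimator__n_estimators', 'sp_list': '[100, 200, 500]'},
        # A leading `:` asks for estimator evaluation: 0 picks
        # preprocessings[0] (StandardScaler) and 'fs_all' expands to the
        # feature-selection block preprocessings[7:14].
        {'sp_name': 'preprocessing__component', 'sp_list': ": [None, 0, 'fs_all']"},
    ],
}
search_params = _eval_search_params(params_builder)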
import time

import numpy as np
import pandas as pd
import skrebate

start_time = time.time()

############## Her 0.1 ###################
input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a5000/Her01/a_5000s_2000her_0.1__maf_0.2_EDM-1_01.txt'
rel_out = '/home/ansohn/Python/data/gametes-data/a5000_h01_relieff.txt'
surf_out = '/home/ansohn/Python/data/gametes-data/a5000_h01_surf.txt'
surfstar_out = '/home/ansohn/Python/data/gametes-data/a5000_h01_surfstar.txt'
msurf_out = '/home/ansohn/Python/data/gametes-data/a5000_h01_msurf.txt'

data = pd.read_csv(input_file, sep='\t')
labels = data['Class'].values
features = data.drop('Class', axis=1)

rel = skrebate.ReliefF(n_jobs=18)
surf = skrebate.SURF(n_jobs=18)
surfstar = skrebate.SURFstar(n_jobs=18)
msurf = skrebate.MultiSURF(n_jobs=18)

# Fit each Relief-based selector and save its ranked feature indices.
np.savetxt(rel_out,
           rel.fit(features.values, labels).top_features_.astype('int32'))
np.savetxt(surf_out,
           surf.fit(features.values, labels).top_features_.astype('int32'))
np.savetxt(surfstar_out,
           surfstar.fit(features.values, labels).top_features_.astype('int32'))
np.savetxt(msurf_out,
           msurf.fit(features.values, labels).top_features_.astype('int32'))

############## Her 0.2 ###################
input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a5000/Her02/a_5000s_2000her_0.2__maf_0.2_EDM-1_01.txt'
rel_out = '/home/ansohn/Python/data/gametes-data/a5000_h02_relieff.txt'
surf_out = '/home/ansohn/Python/data/gametes-data/a5000_h02_surf.txt'
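# Sanity-check sketch (not part of the original script): read one saved
# ranking back. np.savetxt writes the indices as floats, so cast them back
# to int on load and inspect the ten top-ranked features.
import numpy as np

top_features = np.loadtxt(
    '/home/ansohn/Python/data/gametes-data/a5000_h01_relieff.txt').astype('int32')
print(top_features[:10])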
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert len(lst) == 2, ("Error, make sure there is one and only one "
                               "colon in search parameter input.")
        literal = lst[1].strip()
        param_name = lst[0].strip()

        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name] = ev
            else:
                # Only for estimator eval: a trailing `-` on the param name
                # marks it. TODO: maybe add a regular expression check.
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO: regular expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),  # removed in scikit-learn >= 0.22
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)]

            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user-uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))

            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
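# As above, a hypothetical input in the older `search_param_selector` shape.
# The key names come from the code; the param_type string 'prep_1_p' (whose
# sixth character selects the preprocessing slot) and all values are
# illustrative assumptions, not from the original.
params_builder = {
    'param_set': [
        # Named parameter on the final estimator: one colon separates the
        # parameter name from the literal search list.
        {'search_param_selector': {
            'search_p': 'n_estimators: [100, 200]',
            'selected_param_type': 'final_estimator_p'}},
        # Empty name plus a preprocessing slot: the literal selects whole
        # preprocessors, e.g. index 0 (StandardScaler) or the 'fs_all'
        # feature-selection block.
        {'search_param_selector': {
            'search_p': ": [0, 'fs_all']",
            'selected_param_type': 'prep_1_p'}},
    ],
}
search_params = get_search_params(params_builder)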
import pandas as pd
import skrebate
import numpy as np

############## Her 0.1 ###################
input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a10/Her01/a_10s_2000her_0.1__maf_0.2_EDM-1_01.txt'
rel_out = '/home/ansohn/Python/data/gametes-data/a10_h01_relieff.txt'
surf_out = '/home/ansohn/Python/data/gametes-data/a10_h01_surf.txt'
surfstar_out = '/home/ansohn/Python/data/gametes-data/a10_h01_surfstar.txt'
msurf_out = '/home/ansohn/Python/data/gametes-data/a10_h01_msurf.txt'

data = pd.read_csv(input_file, sep='\t')
labels = data['Class'].values
features = data.drop('Class', axis=1)

rel = skrebate.ReliefF(n_jobs=5)
surf = skrebate.SURF(n_jobs=5)
surfstar = skrebate.SURFstar(n_jobs=5)
msurf = skrebate.MultiSURF(n_jobs=5)

# Fit each selector and save its ranked feature indices.
np.savetxt(rel_out,
           rel.fit(features.values, labels).top_features_.astype('int32'))
np.savetxt(surf_out,
           surf.fit(features.values, labels).top_features_.astype('int32'))
np.savetxt(surfstar_out,
           surfstar.fit(features.values, labels).top_features_.astype('int32'))
np.savetxt(msurf_out,
           msurf.fit(features.values, labels).top_features_.astype('int32'))
import pandas as pd
import skrebate
import numpy as np

############## Her 0.1 ###################
input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a100/Her01/a_100s_2000her_0.1__maf_0.2_EDM-1_01.txt'
rel_out = '/home/ansohn/Python/data/gametes-data/a100_h01_relieff.txt'
surf_out = '/home/ansohn/Python/data/gametes-data/a100_h01_surf.txt'
surfstar_out = '/home/ansohn/Python/data/gametes-data/a100_h01_surfstar.txt'
msurf_out = '/home/ansohn/Python/data/gametes-data/a100_h01_msurf.txt'

data = pd.read_csv(input_file, sep='\t')
labels = data['Class'].values
features = data.drop('Class', axis=1)

rel = skrebate.ReliefF()
surf = skrebate.SURF()
surfstar = skrebate.SURFstar()
msurf = skrebate.MultiSURF()

#rel1 = np.savetxt(rel_out, rel.fit(features.values, labels).top_features_.astype('int32'))
#surf1 = np.savetxt(surf_out, surf.fit(features.values, labels).top_features_.astype('int32'))
#surfstar1 = np.savetxt(surfstar_out, surfstar.fit(features.values, labels).top_features_.astype('int32'))
#msurf1 = np.savetxt(msurf_out, msurf.fit(features.values, labels).top_features_.astype('int32'))

############## Her 0.2 ###################
#input_file = '/home/ansohn/Python/data/gametes-data/loc2_filtered/a100/Her02/a_100s_2000her_0.2__maf_0.2_EDM-1_01.txt'
#rel_out = '/home/ansohn/Python/data/gametes-data/a100_h02_relieff.txt'
#surf_out = '/home/ansohn/Python/data/gametes-data/a100_h02_surf.txt'
#surfstar_out = '/home/ansohn/Python/data/gametes-data/a100_h02_surfstar.txt'
#msurf_out = '/home/ansohn/Python/data/gametes-data/a100_h02_msurf.txt'