import sys

from sklearn import feature_selection as fs_scikit
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


def get_fs_model(model, method, train, target=None, cv=None):
    """Connect the given model with the specified feature selection method
    and train the resulting structure.
    """
    if method == "RFE":
        model = fs_scikit.RFE(model, n_features_to_select=2, step=5)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "RFECV":
        model = fs_scikit.RFECV(model, step=3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])
    # elif method == "Anova":
    #     # ANOVA SVM-C
    #     anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    #     model = Pipeline([
    #         ('feature_selection', anova_filter),
    #         ('data_mining', model)
    #     ])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":
        # chi-squared univariate selection
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
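# Minimal usage sketch for get_fs_model (not part of the original module).
# The dataset and the LogisticRegression estimator below are illustrative
# assumptions; any scikit-learn estimator compatible with the chosen method
# should work. Only the "RFE"/"RFECV" branches return a fitted object; the
# Pipeline branches still need an explicit fit().
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
pipe = get_fs_model(LogisticRegression(max_iter=1000), "SelectPercentile", X, target=y)
pipe.fit(X, y)
print(pipe.score(X, y))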
# Module aliases assumed from the original script (legacy scikit-learn API):
# np=numpy, ss=scipy.stats, ssig=scipy.signal, sklcv=sklearn.cross_validation,
# sklfs=sklearn.feature_selection, sklsvm=sklearn.svm.
def univCV(data, labels, cut_level):
    '''calculate cross-validated univariate cut'''
    # loop over the candidate cut levels
    correlations = np.zeros(len(cut_level))
    for inx, i in enumerate(cut_level):
        cross = sklcv.KFold(n=len(labels), n_folds=len(labels))
        prediction = np.zeros_like(labels)
        for train, test in cross:
            univ = sklfs.SelectFpr(sklfs.f_regression, alpha=i)
            prep_data = data[train]
            prep_test = data[test]
            #if use_modules.find('a') != -1:
            #    univ_agglo = sklcl.WardAgglomeration(connectivity=connect, n_clusters=ward_level)
            #    prep_data = univ_agglo.fit_transform(prep_data)
            #    prep_test = univ_agglo.transform(prep_test)
            #if use_modules.find('b') != -1:
            #    bool_pos, bool_neg = direction_cutoff(prep_data)
            #    prep_data = prep_data[:, bool_pos]
            #    prep_test = prep_test[:, bool_pos]
            #if use_modules.find('c') != -1:
            #    scaler = sklpre.StandardScaler()
            #    prep_data = scaler.fit_transform(prep_data)
            #    prep_test = scaler.transform(prep_test)
            prep_data = univ.fit_transform(prep_data, labels[train])
            mod = sklsvm.NuSVR(kernel='linear', nu=1, C=100)  # change model here if needed
            mod.fit(prep_data, labels[train])
            prep_test = univ.transform(prep_test)
            pred = mod.predict(prep_test)
            prediction[test] = pred
        # calculate prediction quality for this cut level
        correlations[inx], _ = ss.spearmanr(prediction, labels)
    # TODO - smooth this?
    correlations = ssig.medfilt(correlations)
    best_cut = cut_level[correlations.argmax()]
    return best_cut
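# Hedged usage sketch for univCV (not in the original script): synthetic
# regression data, assuming the legacy module aliases noted above are in scope.
from sklearn.datasets import make_regression

X_syn, y_syn = make_regression(n_samples=30, n_features=50, noise=0.5, random_state=0)
alphas = np.array([0.5, 0.1, 0.05, 0.01])
best_alpha = univCV(X_syn, y_syn, alphas)
print("selected SelectFpr alpha:", best_alpha)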
def select_features(x, y):
    """
    :param x: dataframe of features
    :param y: dataframe of target property
    :return: Outputs of feature selection process
    """
    x = pd.DataFrame(x)

    # Removing features with low variance
    var_threshold = f_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))

    # KBest- and Percentile-based feature selection using regression scores.
    # f_regression must be passed as a callable score function, not called here.
    def f_regress(X_mat, y_vec):
        return f_selection.f_regression(X_mat, y_vec, center=False)

    kbest = f_selection.SelectKBest(score_func=f_regress, k=2)
    percent = f_selection.SelectPercentile(score_func=f_regress, percentile=10)

    # Tree-based feature selection using a number of randomized decision trees
    # (the forest is fitted inside the FeatureUnion, so prefit must not be used)
    trees = f_selection.SelectFromModel(ExtraTreesRegressor(n_estimators=100))

    # "False positive rate"-based feature selection using regression
    fpr = f_selection.SelectFpr(score_func=f_regress, alpha=0.05)

    # PCA-component evaluation
    pca = PCA(n_components=2)

    # Recursive feature elimination with cross-validated feature selection
    estimator = SVR(kernel="linear")
    selector = f_selection.RFECV(estimator, step=1, cv=5)

    # Build a combined transformer from PCA and univariate selection:
    combined_features = FeatureUnion([("pca_based", pca),
                                      ("univ_kbest", kbest),
                                      ("false_positive_rate", fpr),
                                      ("percentile_based", percent),
                                      ("RFECV_selector", selector),
                                      ("variance_threshold", var_threshold),
                                      ("trees_based", trees)])
    x_union_features = combined_features.fit_transform(x, y)

    svm = SVC(kernel="linear")

    # Do grid search over all parameters:
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])
    grid = dict(features__pca_based__n_components=range(1, 101),
                features__univ_kbest__k=range(1, 101),
                features__false_positive_rate__alpha=[i / 100.0 for i in range(100)],
                features__percentile_based__percentile=range(1, 20, 1),
                features__RFECV_selector__cv=range(2, 5),
                features__variance_threshold__threshold=[i / 100.0 for i in range(100)],
                svm__C=[0.01, 0.1, 1.0, 10.0])
    grid_search = GridSearchCV(pipeline, param_grid=grid, verbose=0)
    grid_search.fit(x, y)
    # keep the reduced feature matrix produced by the best feature-selection stage
    x_features = grid_search.best_estimator_.named_steps["features"].transform(x)

    # Pickling feature reduction outputs
    # (rf_sorted_score and FS_PICKLE are expected to be defined at module level)
    with open(FS_PICKLE, 'wb') as result:
        pickle.dump(rf_sorted_score, result, pickle.HIGHEST_PROTOCOL)
        pickle.dump(grid_search.best_estimator_, result, pickle.HIGHEST_PROTOCOL)
    print(grid_search.best_estimator_)
    return x_features
def select_best():
    df = pd.merge(
        acw.gen_long_data(tpt)
           .normalize(columns="metric")
           .add_net_meta(tpt.net_hierarchy(HierarchyName.RESTRICTED_PERIPHERY_CORE))
           .groupby(["task", "subject", "region", "net_meta"]).mean().reset_index()
           .rename(columns={"metric": "acw"}),
        acz.gen_long_data(tpt)
           .normalize(columns="metric")
           .add_net_meta(tpt.net_hierarchy(HierarchyName.RESTRICTED_PERIPHERY_CORE))
           .groupby(["task", "subject", "region", "net_meta"]).mean().reset_index()
           .rename(columns={"metric": "acz"}),
        on=["task", "subject", "region", "net_meta"], sort=False
    ).and_filter(NOTnet_meta="M")

    X = df.iloc[:, -2:].values
    y = df.net_meta.map({"C": 0, "P": 1}).values

    functions = [fs.mutual_info_classif, fs.f_classif, fs.chi2]
    for func in functions:
        for method in [fs.SelectKBest(func, k=1), fs.SelectPercentile(func),
                       fs.SelectFdr(func), fs.SelectFpr(func), fs.SelectFwe(func)]:
            method.fit(X, y)
            print(f'{str(method).split("(")[0]} {func.__name__}: '
                  f'{np.argmax(method.scores_) + 1}')
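# Illustrative sketch of the same scoring idea on synthetic data (assumption:
# two candidate features and a binary target, independent of the acw/acz data).
# argmax over `scores_` reports which of the two features the selector prefers.
import numpy as np
from sklearn import feature_selection as fs

rng = np.random.default_rng(0)
y_demo = rng.integers(0, 2, 200)
X_demo = np.column_stack([y_demo + rng.normal(0, 0.5, 200),   # informative feature
                          rng.normal(0, 1, 200)])             # noise feature
sel = fs.SelectKBest(fs.f_classif, k=1).fit(X_demo, y_demo)
print("preferred feature index:", np.argmax(sel.scores_))     # expected: 0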
from classifiers.estimators_all import CLASSIFIERS

#
# MAIN
#
synonyms_filepath = io_utils.get_synonyms_filepath()

UNIVARIATE = {
    "uv_kbest_def": feature_selection.SelectKBest(f_classif, k=10),
    "uv_kbest_chi2_def": feature_selection.SelectKBest(chi2, k=10),
    "uv_percentile_def": feature_selection.SelectPercentile(f_classif, percentile=10),
    "uv_fpr_def": feature_selection.SelectFpr(f_classif),
    "uv_fwe_def": feature_selection.SelectFwe(f_classif)
}

print("Preparing Train Collection")
X_train, y_train = create_train_data(io_utils.get_train_vectors_list())

print("Preparing Test Collection")
X_test, test_collections = create_test_data(io_utils.get_train_vectors_list())

# Univariate
for univariate_model_name in UNIVARIATE:
    model = UNIVARIATE[univariate_model_name]
    model.fit(X_train, y_train)
    X_train_new = model.transform(X_train)
    X_test_new = model.transform(X_test)
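    # Hedged continuation sketch (not in the original script): feed the reduced
    # feature matrices into a classifier. LogisticRegression is an illustrative
    # stand-in for the entries of the imported CLASSIFIERS collection.
    from sklearn.linear_model import LogisticRegression

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_new, y_train)
    predictions = clf.predict(X_test_new)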
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(), preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))

            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user-uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
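# Hedged usage sketch (not part of the original tool): wire the evaluated
# search space into a scikit-learn grid search. `params_builder`, `pipeline`,
# `X` and `y` are assumed to come from the surrounding tool code and are
# placeholders here.
from sklearn.model_selection import GridSearchCV

search_params = _eval_search_params(params_builder)
searcher = GridSearchCV(pipeline, param_grid=search_params, cv=5, n_jobs=N_JOBS)
searcher.fit(X, y)
print(searcher.best_params_)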
def run_pipe(input_files, input_labels, use_modules, no_proc):
    '''run svr workflow on data'''
    # --------------Organise inputs
    # calculate matrix
    #feature_matrix = prepare_modality(input_files, input_mask)

    # --------------Execute analysis
    # prepare feature agglomeration
    #mask_handle = nb.load(input_mask)
    connect = sklim.grid_to_graph(*input_files[0].shape,
                                  mask=np.invert(np.isnan(np.sum(input_files, 0))))
    inshape = input_files.shape
    feature_matrix = input_files.reshape((inshape[0], -1))
    # remove nans
    sum_features = np.sum(feature_matrix, 0)
    feature_matrix = feature_matrix[:, np.invert(np.isnan(sum_features))]

    # cross validation
    loo = sklcv.KFold(len(input_labels), n_folds=len(input_labels))
    print('Starting svr')
    cv_pred = jl.Parallel(n_jobs=no_proc, verbose=1, pre_dispatch=no_proc * 2)(
        jl.delayed(do_model)(feature_matrix[train], input_labels[train],
                             feature_matrix[test], connect, use_modules)
        for train, test in loo)
    cv_pred = np.array(cv_pred)
    corr, p = ss.pearsonr(cv_pred[:, 0], input_labels)

    # creating final model
    print('creating final model')
    if use_modules.find('a') != -1:
        final_agglo = sklcl.WardAgglomeration(connectivity=connect,
                                              n_clusters=int(np.median(cv_pred[:, 1])))
        feature_matrix = final_agglo.fit_transform(feature_matrix)
    else:
        final_agglo = 0
    if use_modules.find('b') != -1:
        bool_pos, bool_neg = direction_cutoff(feature_matrix)
        feature_matrix = feature_matrix[:, bool_pos]
    else:
        bool_pos = 0
    if use_modules.find('c') != -1:
        final_scaler = sklpre.StandardScaler()
        feature_matrix = final_scaler.fit_transform(feature_matrix)
    else:
        final_scaler = 0
    if use_modules.find('d') != -1:
        final_univ = sklfs.SelectFpr(alpha=np.median(cv_pred[:, 2]))
        feature_matrix = final_univ.fit_transform(feature_matrix, input_labels)
    else:
        final_univ = 0

    final_model = sklsvm.NuSVR(kernel='linear', C=100, degree=1,
                               nu=np.median(cv_pred[:, 3]))
    final_model.fit(feature_matrix, input_labels)

    return (cv_pred, corr, p, final_agglo, final_univ, final_scaler,
            bool_pos, final_model)
def do_model(train_d, train_l, test_d, connect, use_modules):
    # ward clustering (a)
    if use_modules.find('a') != -1:
        no_feat = len(train_d[0, :])
        ward_sizes = np.array([
            int(no_feat),
            int(no_feat * 0.8),
            int(no_feat * 0.5),
            int(no_feat * 0.1),
            int(no_feat * 0.01)
        ])  # set to about 100, 50 and 10%; add 1/10000 for dbm
        use_wardsize = wardCV(train_d, train_l, ward_sizes, connect)
        agglo = sklcl.WardAgglomeration(connectivity=connect, n_clusters=use_wardsize)
        train_d = agglo.fit_transform(train_d)
        test_d = agglo.transform(test_d)
    else:
        use_wardsize = '0'

    # include positive values only (b)
    if use_modules.find('b') != -1:
        bool_pos, bool_neg = direction_cutoff(train_d)
        train_d = train_d[:, bool_pos]
        test_d = test_d[:, bool_pos]

    # scale features to z scores (c)
    if use_modules.find('c') != -1:
        scaler = sklpre.StandardScaler()
        train_d = scaler.fit_transform(train_d)
        test_d = scaler.transform(test_d)

    # univariate selection (d)
    if use_modules.find('d') != -1:
        univ_levels = np.array([1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001])
        #use_cut = univCV(train_d, train_l, univ_levels, use_wardsize, connect, use_modules)
        use_cut = univCV(train_d, train_l, univ_levels)
        univ_select = sklfs.SelectFpr(alpha=use_cut)
        train_d = univ_select.fit_transform(train_d, train_l)
        test_d = univ_select.transform(test_d)
    else:
        use_cut = '0'

    # train model
    nus = np.array([1])  # set nu threshold
    params = dict(nu=nus)
    model = GridSearchCV(estimator=sklsvm.NuSVR(kernel='linear', C=100, degree=1),
                         # changed from 1000 to 10 for dbm
                         param_grid=params, cv=10, n_jobs=1,
                         scoring='r2')  # TODO changed from mse
    model.fit(train_d, train_l)
    pred = model.predict(test_d)
    use_nu = model.best_params_['nu']

    results = [pred, use_wardsize, use_cut, use_nu]
    return results
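# Hedged usage sketch for do_model (not in the original script): one synthetic
# train/test split with all optional modules disabled, so `connect` is unused
# and can be None. Shapes and values are illustrative only.
import numpy as np

rng = np.random.RandomState(0)
train_d = rng.rand(40, 100)
train_l = rng.rand(40)
test_d = rng.rand(5, 100)
pred, wardsize, cut, nu = do_model(train_d, train_l, test_d, connect=None, use_modules='')
print(pred, nu)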
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (len(lst) == 2), (
            "Error, make sure there is one and only one colon in search parameter input."
        )
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name] = ev
            else:
                # only for estimator eval, add `-` to the end of param
                # TODO maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO regular expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.Imputer(), preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(), preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(), preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params