def banckmark_feature_selection():
    """Benchmark scikit-learn's RFECV on the micro data set.

    Loads the data with ``micro_data()``, takes columns 1-9 as features and
    column 10 (cast to int) as the class label, then runs cross-validated
    recursive feature elimination around a scaler + logistic-regression
    pipeline once per scoring metric. For every metric the mean of the
    per-step cross-validation scores is printed.
    """
    data = np.array(micro_data())
    X = data[:, 1:10]
    Y = data[:, 10].astype('int')

    class PipelineRFE(Pipeline):
        """Pipeline that copies the last step's ``coef_`` onto itself.

        RFE ranks features through ``coef_`` of the estimator it is given,
        which a plain Pipeline does not expose.
        """

        def fit(self, X, y=None, **fit_params):
            super(PipelineRFE, self).fit(X, y, **fit_params)
            self.coef_ = self.steps[-1][-1].coef_
            return self

    pipe = PipelineRFE(
        [
            ('std_scaler', preprocessing.StandardScaler()),
            ("LR", linear_model.LogisticRegression(random_state=42)),
        ]
    )

    print("Scores for validation banchmark sklearn RFE")
    # (scorer name, label used in the printed line). The log-loss scorer is
    # registered as "neg_log_loss"; plain "log_loss" is rejected as an
    # unknown scoring value. The reported number is therefore negated loss.
    metrics_to_run = (
        ("accuracy", "accuracy"),
        ("neg_log_loss", "log_loss"),
        ("roc_auc", "auc"),
    )
    for scoring, label in metrics_to_run:
        feature_selector_cv = feature_selection.RFECV(
            pipe, cv=5, step=2, scoring=scoring).fit(X, Y)
        print(pipe.__class__.__name__ + " {} is {}".format(
            label, feature_selector_cv.grid_scores_.mean()))
def selector_logistic_rfe(self,
                          features_indep_df: PandasDataFrame,
                          feature_target: List,
                          kernel: str = "linear",
                          n_jobs: int = -1,
                          **kwargs: Any) -> object:
    """Rank features with cross-validated recursive feature elimination.

    An ``SVR`` with the requested kernel is wrapped in ``RFECV`` and fitted
    on the given data; the value returned by ``fit`` is handed back.

    Attributes of interest on the returned selector:
        n_features_, support_, ranking_, grid_scores_, estimator_

    :param features_indep_df: the independent features fed to the model.
    :param feature_target: the target feature being estimated.
    :param kernel: kernel type passed to ``SVR``. Defaults to ``"linear"``.
        NOTE(review): RFECV needs ``coef_`` or ``feature_importances_`` on
        the estimator -- confirm non-linear kernels are actually usable here.
    :param n_jobs: number of CPUs passed to ``RFECV``; ``-1`` means all.
    :param kwargs: forwarded to ``RFECV`` (e.g. step, cv, scoring, verbose).
    :return: the fitted feature selection model.
    """
    self.__logger.debug(
        "Run Feature Ranking with Recursive Feature Elimination.")
    rfecv = feature_selection.RFECV(
        estimator=SVR(kernel=kernel),
        n_jobs=n_jobs,
        **kwargs
    )
    fitted = rfecv.fit(features_indep_df, feature_target)
    return fitted
def quick_fitted_tree(X, Y, model_type=('GridSearch', 'FeatureSelection'),
                      test_split=None, random_state=None):
    """Fit a decision tree, optionally with RFECV selection and grid search.

    :param X: 2-D feature array; columns are selected with ``x[:, mask]``,
        so positional (NumPy-style) column indexing must be supported.
    :param Y: target values.
    :param model_type: collection of steps to run. ``'FeatureSelection'``
        enables RFECV, ``'GridSearch'`` enables hyper-parameter search.
        ``None`` or an empty collection fits a plain tree.
    :param test_split: when a float, that fraction is held out with
        ``train_test_split``; any other value means no hold-out.
    :param random_state: seed for the split, the CV splitter and the trees.
    :return: ``(model, sel_cols, splitted_data)`` where ``model`` is a fitted
        tree, ``sel_cols`` is the RFECV support mask (or None) and
        ``splitted_data`` is ``(x_train, x_test, y_train, y_test)`` with a
        hold-out, else ``(None, x, None, y)``.
    """
    from sklearn import tree, model_selection, feature_selection

    # A tuple default avoids the shared-mutable-default pitfall; None is
    # treated as "no optional steps" instead of failing on the `in` test.
    steps = () if model_type is None else model_type
    has_holdout = isinstance(test_split, float)

    sel_cols = None
    x = X.copy()
    y = Y.copy()
    x_test = y_test = None
    if has_holdout:
        x, x_test, y, y_test = model_selection.train_test_split(
            x, y, test_size=test_split, random_state=random_state)

    cv_split = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=.3,
                                            train_size=.6,
                                            random_state=random_state)

    if 'FeatureSelection' in steps:
        dtree_rfe = feature_selection.RFECV(
            tree.DecisionTreeClassifier(random_state=random_state),
            step=1,
            scoring='accuracy',
            cv=cv_split)
        dtree_rfe.fit(x, y)
        sel_cols = dtree_rfe.get_support()
        x = x[:, sel_cols]
        if has_holdout:
            x_test = x_test[:, sel_cols]

    if 'GridSearch' in steps:
        # NOTE: 'random_state': [0] in the grid overrides the seed given to
        # the base estimator below (kept from the original behaviour).
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 4, 6, 8, 10, None],
            'random_state': [0],
        }
        search = model_selection.GridSearchCV(
            tree.DecisionTreeClassifier(random_state=random_state),
            param_grid=param_grid,
            scoring='accuracy',
            cv=cv_split)
        search.fit(x, y)
        model = search.best_estimator_
    else:
        # Without a grid search there is no best_estimator_ to read; fit the
        # plain tree so the function still returns a fitted model.
        model = tree.DecisionTreeClassifier(random_state=random_state)
        model.fit(x, y)

    if has_holdout:
        splitted_data = (x, x_test, y, y_test)
    else:
        splitted_data = (None, x, None, y)
    return model, sel_cols, splitted_data
def get_fs_model(model, method, train, target=None, cv=None):
    """Combine ``model`` with the feature selection ``method``.

    For ``"RFE"`` and ``"RFECV"`` the wrapped selector is fitted on
    ``train`` (and ``target`` when given) and the result of ``fit`` is
    returned. For ``"fromModel"`` the selector is fitted first and then
    placed in a pipeline. Every other known method returns an *unfitted*
    two-step pipeline ``feature_selection -> data_mining``.

    :param model: estimator to wrap.
    :param method: name of the feature selection method.
    :param train: training features.
    :param target: optional training target.
    :param cv: cross-validation setting, used by ``"RFECV"`` only.
    :return: fitted selector (RFE/RFECV) or a Pipeline.

    An unknown ``method`` prints a message and exits the process with
    status 1.
    """
    def _fit(estimator):
        # Fit with or without a target, mirroring the caller's input.
        if target is not None:
            return estimator.fit(train, target)
        return estimator.fit(train)

    # Keyword arguments: newer scikit-learn releases accept only the
    # estimator positionally.
    if method == "RFE":
        return _fit(fs_scikit.RFE(model, n_features_to_select=2, step=5))
    if method == "RFECV":
        return _fit(fs_scikit.RFECV(model, step=3, cv=cv))

    if method == "fromModel":
        selector = fs_scikit.SelectFromModel(model)
        _fit(selector)
    else:
        # Factories are lazy so only the requested selector is constructed.
        selector_factories = {
            "linearSVC": lambda: SelectFromModel(
                LinearSVC(penalty='l1', dual=False)),
            "VarianceThreshold": lambda: fs_scikit.VarianceThreshold(
                threshold=(.8 * (1 - .8))),
            "SelectPercentile": lambda: fs_scikit.SelectPercentile(
                fs_scikit.f_classif, percentile=30),
            "SelectFpr": lambda: fs_scikit.SelectFpr(alpha=0.2),
            "SelectFdr": lambda: fs_scikit.SelectFdr(alpha=0.2),
            "SelectFwe": lambda: fs_scikit.SelectFwe(alpha=0.2),
            "ch2": lambda: fs_scikit.SelectKBest(fs_scikit.chi2, k=2),
        }
        factory = selector_factories.get(method)
        if factory is None:
            print("Feature selection method was not found: " + method)
            sys.exit(1)
        selector = factory()

    return Pipeline([('feature_selection', selector),
                     ('data_mining', model)])
def select_feature(base_result, tune_model, trainX, trainY):
    """Print decision-tree scores before/after RFECV and after re-tuning.

    Relies on the module-level ``dtree``, ``cv_split`` and ``param_grid``.
    ``tune_model`` is accepted but not used. Nothing is returned; all
    results are printed.

    :param base_result: ``cross_validate`` output for the baseline model,
        with ``'train_score'`` and ``'test_score'`` entries.
    :param tune_model: unused.
    :param trainX: training features (DataFrame).
    :param trainY: training target.
    """
    rule = '-' * 100

    # --- baseline -------------------------------------------------------
    print('Before DT RFE Training Shape Old:', trainX.shape)
    print('Before DT RFE Training Columns Old:', trainX.columns.values)
    base_train_pct = base_result['train_score'].mean() * 100
    print('Before DT RFE Training w/bin score mean: {:.2f}'.format(
        base_train_pct))
    base_test_pct = base_result['test_score'].mean() * 100
    print('Before DT RFE Test w/bin score mean: {:.2f}'.format(
        base_test_pct))
    base_spread = base_result['test_score'].std() * 100 * 3
    print('Before DT RFE Test w/bin score 3*std: +- {:.2f}'.format(
        base_spread))
    print(rule)

    # --- recursive feature elimination -----------------------------------
    eliminator = feature_selection.RFECV(dtree,
                                         step=1,
                                         scoring='accuracy',
                                         cv=cv_split)
    eliminator.fit(trainX, trainY)
    keep_mask = eliminator.get_support()
    X_rfe = trainX.columns.values[keep_mask]
    rfe_result = model_selection.cross_validate(dtree,
                                                trainX[X_rfe],
                                                trainY,
                                                cv=cv_split,
                                                return_train_score=True)

    print('After DT RFE Training Shape New:', trainX[X_rfe].shape)
    print('After DT RFE Training Columns New:', X_rfe)
    rfe_train_pct = rfe_result['train_score'].mean() * 100
    print('After DT RFE Training w/bin score mean: {:.2f}'.format(
        rfe_train_pct))
    rfe_test_pct = rfe_result['test_score'].mean() * 100
    print('After DT RFE Test w/bin score mean: {:.2f}'.format(
        rfe_test_pct))
    rfe_spread = rfe_result['test_score'].std() * 100 * 3
    print('After DT RFE Test w/bin score 3*std: +- {:.2f}'.format(
        rfe_spread))
    print(rule)

    # --- grid search on the reduced feature set --------------------------
    rfe_tune_model = model_selection.GridSearchCV(
        tree.DecisionTreeClassifier(),
        param_grid=param_grid,
        scoring='roc_auc',
        cv=cv_split,
        return_train_score=True)
    rfe_tune_model.fit(trainX[X_rfe], trainY)

    print('After DT RFE Tuned Parameters:', rfe_tune_model.best_params_)
    tuned_train = rfe_tune_model.cv_results_['mean_train_score']
    print('After DT RFE Tuned Training w/bin score mean: {:.2f}'.format(
        tuned_train[rfe_tune_model.best_index_] * 100))
    tuned_test = rfe_tune_model.cv_results_['mean_test_score']
    print('After DT RFE Tuned Test w/bin score mean: {:.2f}'.format(
        tuned_test[rfe_tune_model.best_index_] * 100))
    tuned_std = rfe_tune_model.cv_results_['std_test_score']
    print('After DT RFE Tuned Test w/bin score 3*std: +- {:.2f}'.format(
        tuned_std[rfe_tune_model.best_index_] * 100 * 3))
    print(rule)
def do_rfe(model, X, y, cv_split=None, scoring='accuracy', random_state=42):
    """Return the column names RFECV keeps for a clone of ``model``.

    :param model: estimator; it is cloned, so the argument is not fitted.
    :param X: feature DataFrame (``X.columns`` is indexed with the mask).
    :param y: target values.
    :param cv_split: CV splitter; when None a 10-split ShuffleSplit
        (30% test, 60% train) seeded with ``random_state`` is used.
    :param scoring: scoring name passed to RFECV.
    :param random_state: seed for the default splitter.
    :return: array of selected column names.
    """
    splitter = cv_split
    if splitter is None:
        splitter = model_selection.ShuffleSplit(n_splits=10,
                                                test_size=.3,
                                                train_size=.6,
                                                random_state=random_state)

    eliminator = feature_selection.RFECV(clone(model),
                                         step=1,
                                         scoring=scoring,
                                         cv=splitter)
    eliminator.fit(X, y)
    print('do_feat_rfe: Done')

    keep_mask = eliminator.get_support()
    return X.columns.values[keep_mask]
def select_feature(alg, x, y, cv_split=None, is_print=True):
    """Select features for ``alg`` with RFECV and return their names.

    :param alg: estimator used both for elimination and for the reported
        cross-validation scores.
    :param x: feature DataFrame.
    :param y: target values.
    :param cv_split: CV splitter; when None a 10-split ShuffleSplit
        (30% test, 70% train, seed 0) is used.
    :param is_print: when True, print before/after cross-validation scores.
        The two extra cross-validations are only run in that case because
        their results are used for printing alone.
    :return: array of the selected column names.
    """
    if cv_split is None:
        cv_split = model_selection.ShuffleSplit(n_splits=10,
                                                test_size=.3,
                                                train_size=.7,
                                                random_state=0)

    if is_print:
        # return_train_score must be requested explicitly; otherwise the
        # 'train_score' key read below is missing on current scikit-learn.
        base_results = model_selection.cross_validate(
            alg, x, y, cv=cv_split, return_train_score=True)
        print('BEFORE DT RFE Training Shape Old: ', x.shape)
        print('BEFORE DT RFE Training Columns Old: ', x.columns.values)
        print("BEFORE DT RFE Training w/bin score mean: {:.2f}".format(
            base_results['train_score'].mean() * 100))
        print("BEFORE DT RFE Test w/bin score mean: {:.2f}".format(
            base_results['test_score'].mean() * 100))
        print("BEFORE DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
            base_results['test_score'].std() * 100 * 3))
        print('-' * 10)

    # feature selection
    dtree_rfe = feature_selection.RFECV(alg,
                                        step=1,
                                        scoring='accuracy',
                                        cv=cv_split)
    dtree_rfe.fit(x, y)

    # names of the columns kept by the elimination
    X_rfe = x.columns.values[dtree_rfe.get_support()]

    if is_print:
        rfe_results = model_selection.cross_validate(
            alg, x[X_rfe], y, cv=cv_split, return_train_score=True)
        print('AFTER DT RFE Training Shape New: ', x[X_rfe].shape)
        print('AFTER DT RFE Training Columns New: ', X_rfe)
        print("AFTER DT RFE Training w/bin score mean: {:.2f}".format(
            rfe_results['train_score'].mean() * 100))
        print("AFTER DT RFE Test w/bin score mean: {:.2f}".format(
            rfe_results['test_score'].mean() * 100))
        print("AFTER DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
            rfe_results['test_score'].std() * 100 * 3))
        print('-' * 10)

    return X_rfe
def feature_select(data1, data1_x_bin, dtree, base_results, cv_split, Target):
    """Run RFECV for ``dtree`` and report scores before and after.

    :param data1: DataFrame holding features and target.
    :param data1_x_bin: list of feature column names.
    :param dtree: estimator used for elimination and scoring.
    :param base_results: ``cross_validate`` output of the baseline model
        (``'train_score'`` / ``'test_score'``).
    :param cv_split: cross-validation splitter.
    :param Target: target column selector for ``data1``.
    :return: ``(fitted RFECV, list of selected column names,
        cross_validate results on the selected columns)``.
    """
    print('feature_select')

    features = data1[data1_x_bin]
    print('BEFORE DT RFE Training Shape Old: ', features.shape)
    print('BEFORE DT RFE Training Columns Old: ',
          data1[data1_x_bin].columns.values.tolist())
    base_train_pct = base_results['train_score'].mean() * 100
    print("BEFORE DT RFE Training w/bin score mean: {:.2f}".format(
        base_train_pct))
    base_test_pct = base_results['test_score'].mean() * 100
    print("BEFORE DT RFE Test w/bin score mean: {:.2f}".format(
        base_test_pct))
    base_spread = base_results['test_score'].std() * 100 * 3
    print("BEFORE DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
        base_spread))
    print('-' * 10)

    # Recursive elimination, parallelised over all cores.
    dtree_rfe = feature_selection.RFECV(dtree,
                                        step=1,
                                        scoring='accuracy',
                                        cv=cv_split,
                                        n_jobs=-1)
    dtree_rfe.fit(data1[data1_x_bin], data1[Target])

    # Boolean support mask -> names of the retained columns.
    keep_mask = dtree_rfe.get_support()
    X_rfe = data1[data1_x_bin].columns.values[keep_mask].tolist()
    rfe_results = model_selection.cross_validate(dtree,
                                                 data1[X_rfe],
                                                 data1[Target],
                                                 cv=cv_split,
                                                 return_train_score=True)

    print('AFTER DT RFE Training Shape New: ', data1[X_rfe].shape)
    print('AFTER DT RFE Training Columns New: ', X_rfe)
    rfe_train_pct = rfe_results['train_score'].mean() * 100
    print("AFTER DT RFE Training w/bin score mean: {:.2f}".format(
        rfe_train_pct))
    rfe_test_pct = rfe_results['test_score'].mean() * 100
    print("AFTER DT RFE Test w/bin score mean: {:.2f}".format(
        rfe_test_pct))
    rfe_spread = rfe_results['test_score'].std() * 100 * 3
    print("AFTER DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
        rfe_spread))
    print('-' * 10)

    return dtree_rfe, X_rfe, rfe_results
def key_features(X_train, y_train, sub, varience_test=True):
    """Reduce the training and submission features to the key ones.

    Optionally drops low-variance features first, then runs RFECV with an
    SGD classifier (5-fold CV, one feature removed per step).

    :param X_train: training features; ``X_train[0]`` must be a row.
    :param y_train: training labels.
    :param sub: submission features, transformed with the same selectors.
    :param varience_test: when True apply ``VarianceThreshold`` first.
    :return: ``(reduced training features, reduced submission features)``.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Features before reduction: ' + str(len(X_train[0])))

    if varience_test:
        # remove features with low variance
        sel = feature_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))
        X_train = sel.fit_transform(X_train)
        sub = sel.transform(sub)
        print('Features after variance reduction: ' + str(len(X_train[0])))

    # NOTE(review): class_weight='auto' was replaced by 'balanced' in later
    # scikit-learn releases -- confirm against the installed version.
    estimator = linear_model.SGDClassifier(n_jobs=-1, class_weight='auto')
    selector = feature_selection.RFECV(estimator, step=1, cv=5)
    features = selector.fit_transform(X_train, y_train)
    submission = selector.transform(sub)
    print('Features after recursive elimination: ' + str(len(features[0])))

    return (features, submission)
def rfe(X2, y2):
    """Run RFECV with a linear SVC and plot score vs. number of features.

    Uses stratified 4-fold cross-validation scored with ROC AUC and shows
    the resulting curve with matplotlib. Nothing is returned.
    """
    linear_svc = svm.SVC(kernel="linear")
    folds = model_selection.StratifiedKFold(4)
    rfecv = feature_selection.RFECV(estimator=linear_svc,
                                    step=1,
                                    cv=folds,
                                    scoring='roc_auc')
    rfecv.fit(X2, y2)

    # One point per candidate feature count, starting at 1.
    scores = rfecv.grid_scores_
    feature_counts = range(1, len(scores) + 1)

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(feature_counts, rfecv.grid_scores_)
    plt.show()
def fit(self, x, y):
    """Select features with RFECV, then fit on the selected columns only.

    A clone of the enclosing-scope ``estimator``, configured with this
    object's own parameters, drives the elimination. The names of the kept
    columns are stored in ``self.best_variables`` and a new row is appended
    to ``variables_df`` with 1 for each kept column and 0 elsewhere.
    """
    ranking_estimator = clone(estimator)
    ranking_estimator.set_params(**self.get_params())

    weighted_auc = metrics.make_scorer(metrics.roc_auc_score,
                                       average='weighted')
    selector = feature_selection.RFECV(
        estimator=ranking_estimator,
        cv=folds,
        scoring=weighted_auc,
        n_jobs=-1,
    )
    selector.fit(x, y)

    # Iterating a DataFrame yields its column labels.
    kept_positions = selector.get_support(indices=True)
    self.best_variables = list(x.iloc[:, kept_positions])

    reduced = x[self.best_variables]
    super().fit(reduced, y)

    # Record the selection as a fresh 0/1 row.
    row = len(variables_df)
    variables_df.loc[row] = 0
    variables_df.loc[row, self.best_variables] = 1
def __init__(self, random_state=None, n_jobs=1, cv=5):
    """Create an RFECV selector around a LightGBM random-forest classifier.

    :param random_state: forwarded to the parent initialiser; afterwards
        ``self.random_state.randint`` is used to draw the LightGBM seed.
    :param n_jobs: forwarded to the parent initialiser and to LightGBM.
    :param cv: cross-validation setting for RFECV.
    """
    super().__init__(random_state=random_state, n_jobs=n_jobs)

    lgbm_params = dict(
        num_leaves=32,
        max_depth=5,
        boosting_type="rf",
        bagging_fraction=0.632,
        bagging_freq=1,
        # Nulled parameters below are set to suppress warning messages.
        subsample=None,
        subsample_freq=None,
        colsample_bytree=None,
        verbose=-1,
    )
    seed = self.random_state.randint(10000)
    self.treemodel = lightgbm.LGBMClassifier(random_state=seed,
                                             n_jobs=self.n_jobs,
                                             **lgbm_params)
    self.model = fs.RFECV(self.treemodel, cv=cv)
def selectFeatures(waveTrainSet, model, enableFS):
    """Return the feature names to use for training.

    The candidate features are all columns except the last two; the last
    column is used as the label. With ``enableFS`` set, RFECV (2-fold
    stratified CV, weighted F1) keeps only the features ranked 1.

    :param waveTrainSet: training DataFrame.
    :param model: estimator handed to RFECV.
    :param enableFS: whether to run recursive feature selection.
    :return: list of selected feature names.
    """
    setLen = len(waveTrainSet.columns)
    featuresName = list(waveTrainSet.columns.values)[:setLen - 2]

    if enableFS:
        # Positional access to the label column; `.ix` no longer exists in
        # current pandas.
        labels = waveTrainSet.iloc[:, setLen - 1]

        # Recursive feature selection
        recursiveFS = feature_selection.RFECV(
            estimator=model,
            step=1,
            cv=cross_validation.StratifiedKFold(labels, 2),
            scoring='f1_weighted')
        recursiveFS.fit(waveTrainSet[featuresName], labels)

        # Keep the feature names ranked as 1
        featuresName = [
            name
            for rank, name in zip(recursiveFS.ranking_, featuresName)
            if rank == 1
        ]

    print("Selected features:")
    print(featuresName)
    return featuresName
def tune_feature(data, features, target, model):
    """Select features for ``model`` with RFECV and print the resulting scores.

    Uses the module-level ``cv_split`` splitter. Nothing is returned.

    :param data: DataFrame holding features and target.
    :param features: candidate feature column names.
    :param target: target column selector for ``data``.
    :param model: estimator used for elimination and for scoring.
    """
    # Printed text: "automatic feature selection"
    print('------自动选择特征')
    alg_rfe = feature_selection.RFECV(model,
                                      step=1,
                                      scoring='accuracy',
                                      cv=cv_split)
    alg_rfe.fit(data[features], data[target])

    x_rfe = data[features].columns[alg_rfe.get_support()]
    # Score the model itself on the selected columns. Cross-validating the
    # RFECV wrapper would re-run elimination inside every fold and would not
    # measure "the model with these features".
    rfe_results = model_selection.cross_validate(model,
                                                 data[x_rfe],
                                                 data[target],
                                                 cv=cv_split,
                                                 return_train_score=True)

    # Printed text: "best features and model performance with them"
    print('------最好的特征以及该特征下模型效果')
    print('AFTER DT RFE Training Shape New: ', data[x_rfe].shape)
    print('AFTER DT RFE Training Columns New: ', x_rfe)
    print("AFTER DT RFE Training w/bin score mean: {:.2f}".format(
        rfe_results['train_score'].mean() * 100))
    print("AFTER DT RFE Test w/bin score mean: {:.2f}".format(
        rfe_results['test_score'].mean() * 100))
    print("AFTER DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
        rfe_results['test_score'].std() * 100 * 3))
def regression_analysis(self):
    """Collect feature rankings and univariate statistics for regression.

    Runs RFECV (5-fold, ``self.n_jobs`` workers) on ``self.x`` / ``self.y``
    with several regressors and stores each ``ranking_``; then adds the
    F-test scores, their p-values and mutual information.

    :return: dict mapping a method name to its per-feature values.
    """
    # (result key, regressor class) in the order the results are stored.
    regressors = (
        # linear
        ('logic', lm.LinearRegression),
        ('ridge', lm.Ridge),
        ('SGD', lm.SGDRegressor),
        ('lm_svm', svm.LinearSVR),
        # non-linear
        ('ADABoost', ensemble.AdaBoostRegressor),
        ('RandomForest', ensemble.RandomForestRegressor),
    )

    results = dict()
    for key, make_regressor in regressors:
        eliminator = feature_selection.RFECV(make_regressor(),
                                             cv=5,
                                             n_jobs=self.n_jobs)
        results[key] = eliminator.fit(self.x, self.y).ranking_

    # stats
    f_values, p_values = feature_selection.f_regression(self.x, self.y)[:2]
    results['f_score'] = f_values
    results['f_pval'] = p_values
    results['MIC'] = feature_selection.mutual_info_regression(self.x, self.y)
    return results
def classification_analysis(self):
    """Collect feature rankings and univariate statistics for classification.

    Runs RFECV (5-fold, ``self.n_jobs`` workers) on ``self.x`` / ``self.y``
    with several classifiers and stores each ``ranking_``; then adds the
    chi-squared and F-test scores with their p-values and mutual
    information.

    :return: dict mapping a method name to its per-feature values.
    """
    # (result key, classifier class) in the order the results are stored.
    classifiers = (
        # linear
        ('logic', lm.LogisticRegression),
        ('ridge', lm.RidgeClassifier),
        ('SGD', lm.SGDClassifier),
        ('lm_svm', svm.LinearSVC),
        # non-linear
        ('ADABoost', ensemble.AdaBoostClassifier),
        ('RandomForest', ensemble.RandomForestClassifier),
    )

    results = dict()
    for key, make_classifier in classifiers:
        eliminator = feature_selection.RFECV(make_classifier(),
                                             cv=5,
                                             n_jobs=self.n_jobs)
        results[key] = eliminator.fit(self.x, self.y).ranking_

    # stats
    chi_values, chi_pvalues = feature_selection.chi2(self.x, self.y)[:2]
    results['chi2'] = chi_values
    results['chi2_pval'] = chi_pvalues
    f_values, f_pvalues = feature_selection.f_classif(self.x, self.y)[:2]
    results['f_score'] = f_values
    results['f_pval'] = f_pvalues
    results['MIC'] = feature_selection.mutual_info_classif(self.x, self.y)
    return results
# Apply the previously defined `feature_select` transformer to the test set
# (its definition lies outside this excerpt).
x_test_transformed = feature_select.transform(x_test)

# --- Univariate selection: keep the 10 best features by chi-squared score ---
from sklearn.feature_selection import chi2, SelectKBest
x, y = load_breast_cancer().data, load_breast_cancer().target
# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)
feature_select_chi2 = SelectKBest(chi2, k =10)
# Fit on the training split only, then apply the same selection to both splits.
feature_select_chi2.fit(x_train, y_train)
x_train_transformed = feature_select_chi2.transform(x_train)
x_test_transformed = feature_select_chi2.transform(x_test)

# --- Model-based importance with a random forest ---
from sklearn.feature_extraction import *
model = RandomForestClassifier(n_estimators = 500)
import sklearn.feature_selection as fs
# RFECV around the forest, keeping at least 3 features.
# NOTE(review): this selector is constructed but not fitted within this excerpt.
feature_select_rfe = fs.RFECV(model, cv=5, min_features_to_select = 3)
model.fit(x_train, y_train)
# Bare expressions below have no effect when run as a script; presumably
# they are meant for inspection in an interactive/notebook session.
model.feature_importances_
np.sort(model.feature_importances_)
np.argsort(model.feature_importances_)
def select_features(x, y):
    """
    Tune a union of feature-selection steps with a linear SVM by grid
    search, pickle the outcome and return the reduced feature matrix.

    :param x: dataframe of features
    :param y: dataframe of target property
    :return: ``x`` transformed by the "features" step of the best pipeline
        found by the grid search

    NOTE(review): the parameter grid below is the full Cartesian product of
    all listed ranges and is therefore extremely large -- confirm it is
    intended before running.
    """
    from functools import partial

    x = pd.DataFrame(x)

    # Removing features with low variance
    var_threshold = f_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))

    # Kbest-based and Percentile-based feature selection using regression.
    # score_func must be a callable, not the result of calling f_regression;
    # functools.partial keeps center=False and stays picklable.
    f_regress = partial(f_selection.f_regression, center=False)
    kbest = f_selection.SelectKBest(score_func=f_regress, k=2)
    percent = f_selection.SelectPercentile(score_func=f_regress,
                                           percentile=10)

    # Tree-based feature selection using a number of randomized decision
    # trees. An unfitted estimator *instance* is required (prefit=True would
    # demand an already fitted model).
    trees = f_selection.SelectFromModel(ExtraTreesRegressor())

    # "False positive rate"-based feature selection using regression
    fpr = f_selection.SelectFpr(score_func=f_regress, alpha=0.05)

    # PCA-component evaluation
    pca = PCA(n_components=2)

    # Recursive feature elimination and cross-validated feature selection
    estimator = SVR(kernel="linear")
    selector = f_selection.RFECV(estimator, step=1, cv=5)

    # Build estimator from PCA and Univariate selection:
    combined_features = FeatureUnion([("pca_based", pca),
                                      ("univ_kbest", kbest),
                                      ("false_positive_rate", fpr),
                                      ("percentile_based", percent),
                                      ("RFECV_selector", selector),
                                      ("variance_threshold", var_threshold),
                                      ("trees_based", trees)])

    # NOTE(review): SVC is a classifier while the selectors above are
    # regression-based -- confirm the target type / intended final model.
    svm = SVC(kernel="linear")

    # The pipeline step must be the transformer itself, not transformed data.
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    # Nested parameters are addressed as step__substep__param (double
    # underscores). range() accepts integers only, so the fractional grids
    # are built explicitly; RFECV needs at least 2 folds.
    hundredths = [i / 100.0 for i in range(100)]
    grid = {
        "features__pca_based__n_components": list(range(1, 101)),
        "features__univ_kbest__k": list(range(1, 101)),
        "features__false_positive_rate__alpha": hundredths,
        "features__percentile_based__percentile": list(range(1, 20, 1)),
        "features__RFECV_selector__cv": list(range(2, 5)),
        "features__variance_threshold__threshold": hundredths,
        "svm__C": [0.01, 0.1, 1.0, 10.0],
    }

    # Do grid search over all parameters:
    grid_search = GridSearchCV(pipeline, param_grid=grid, verbose=0)
    grid_search.fit(x, y)
    # GridSearchCV has no fit_transform; take the reduced features from the
    # refitted best pipeline's feature step.
    best_features_step = grid_search.best_estimator_.named_steps["features"]
    x_features = best_features_step.transform(x)

    # Pickling feature reduction outputs
    with open(FS_PICKLE, 'wb') as result:
        # NOTE(review): rf_sorted_score is not defined in this function;
        # presumably a module-level name -- verify.
        pickle.dump(rf_sorted_score, result, pickle.HIGHEST_PROTOCOL)
        pickle.dump(grid_search.best_estimator_, result,
                    pickle.HIGHEST_PROTOCOL)

    print(grid_search.best_estimator_)
    return x_features
print(X_train.shape, X_test.shape) #%% #RFE CV to train with less feats from sklearn import feature_selection as fs from sklearn import linear_model as lin import matplotlib.pyplot as plt lr = lin.LogisticRegression(class_weight="balanced", solver="lbfgs", max_iter=100000) rfe = fs.RFECV(estimator=lr, step=1, n_jobs=-1, cv=5, scoring='f1_weighted', verbose=2) rfe.fit(X=X_train, y=y_train.ravel()) print("Optimal number of features : %d" % rfe.n_features_) # Plot number of features VS. cross-validation scores plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score") plt.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_) plt.show() #%% X_train_RFE = pd.DataFrame(rfe.transform(X_train),
# ## Tune the Decision Tree Model with Feature Selection
# We will use Recursive Feature Elimination (RFE) method which selects features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.

# In[ ]:

print('BEFORE RFE Training Shape Old: ', x.shape)
print('BEFORE RFE Training Columns Old: ', x.columns.values)
print("BEFORE RFE Training w/bin score mean: {:.2f}". format(base_results_dtree['train_score'].mean()*100))
print("BEFORE RFE Test w/bin score mean: {:.2f}". format(base_results_dtree['test_score'].mean()*100))
print('-'*10)

#feature selection
dtree_rfe = feature_selection.RFECV(tree.DecisionTreeClassifier(), step = 1, scoring = 'accuracy', cv = cv_split)
dtree_rfe.fit(x, y)

#transform x&y to reduced features and fit new model
X_rfe = x.columns.values[dtree_rfe.get_support()]
# return_train_score=True is required because 'train_score' is read below;
# it is not part of the results by default on current scikit-learn.
rfe_results = cross_validate(dtree, x[X_rfe], y, cv = cv_split, return_train_score = True)
print('AFTER RFE Training Shape New: ', x[X_rfe].shape)
print('AFTER RFE Training Columns New: ', X_rfe)
print("AFTER RFE Training w/bin score mean: {:.2f}". format(rfe_results['train_score'].mean()*100))
print("AFTER RFE Test w/bin score mean: {:.2f}". format(rfe_results['test_score'].mean()*100))
print('-'*10)

param_grid = {'criterion': ['gini','entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [2,4,6,8,10,None], #max depth tree can grow; default is none
              #'min_samples_split': [2,5,10,.03,.05], #minimum subset size BEFORE new split (fraction is % of total); default is 2
              #'min_samples_leaf': [1,5,10,.03,.05], #minimum subset size AFTER new split split (fraction is % of total); default is 1
              #'max_features': [None, 'auto'], #max features to consider when performing split; default none or all
              'random_state': [0] #seed or control random number generator: https://www.quora.com/What-is-seed-in-random-number-generation
             }

#print(list(model_selection.ParameterGrid(param_grid)))

#choose best model with grid_search:
#http://scikit-learn.org/stable/modules/grid_search.html#grid-search
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tune_model.fit(data1[data1_x_bin], data1[Target])

#feature selection
dtree_rfe = feature_selection.RFECV(dtree, step = 1, scoring = 'accuracy', cv = cv_split)
dtree_rfe.fit(data1[data1_x_bin], data1[Target])

#transform x&y to reduced features and fit new model
#alternative: can use pipeline to reduce fit and transform steps: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
X_rfe = data1[data1_x_bin].columns.values[dtree_rfe.get_support()]
rfe_results = model_selection.cross_validate(dtree, data1[X_rfe], data1[Target], cv = cv_split)

#tune rfe model
rfe_tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
rfe_tune_model.fit(data1[X_rfe], data1[Target])
def test_svm(Xa, ya, Xl, yl):
    """Compare L1/L2 linear SVMs and RFECV on the ARCENE and LEUKEMIA data.

    Prints the average number of non-zero elements reported by
    ``get_nonzeros`` for both data sets and, when the module-level
    ``complete`` flag is set, the 5-fold cross-validation scores of four
    pipelines (L1 SVM, L1 followed by L2, L2 SVM, RFECV around the L2 SVM).

    :param Xa: ARCENE features.
    :param ya: ARCENE labels.
    :param Xl: LEUKEMIA features.
    :param yl: LEUKEMIA labels.
    :return: None
    """
    folds = 5
    step_delta = 0.05

    # '%s' formatting applies str() to each value, exactly like the Python 2
    # print statement did, and works unchanged on Python 3.
    print('\nTesting L1 SVM...')
    iters = 10
    arc_nnz = get_nonzeros(Xa, ya, iterations=iters)
    print('ARCENE - Average number of non-zero elements in %s iterations: %s'
          % (iters, arc_nnz))
    leu_nnz = get_nonzeros(Xl, yl, iterations=iters)
    print('LEUKEMIA - Average number of non-zero elements in %s iterations: %s'
          % (iters, leu_nnz))

    svm_l1 = svm.LinearSVC(penalty='l1', dual=False)
    svm_l2 = svm.LinearSVC(penalty='l2', dual=False)
    rfe = fs.RFECV(svm_l2, step=step_delta, cv=folds)

    # setup pipes and estimators
    pipes = [
        pipeline.Pipeline([('svm1', svm_l1)]),
        pipeline.Pipeline([('svm1', svm_l1), ('svm2', svm_l2)]),
        pipeline.Pipeline([('svm2', svm_l2)]),
        pipeline.Pipeline([('rfe', rfe)]),
    ]

    # NOTE(review): the original indentation was lost; it is assumed that
    # `complete` gates both result sections -- confirm.
    if complete:
        datasets_to_score = (
            ('\nARCENE RESULTS................', Xa, ya),
            ('\nLEUKEMIA RESULTS................', Xl, yl),
        )
        for header, features, labels in datasets_to_score:
            print(header)
            for pipe in pipes:
                # CHECK SCORES
                cv_scores = cross_validation.cross_val_score(
                    pipe, features, labels, cv=folds)
                print('Cross val scores for %s folds:' % (folds,))
                print('%s %s' % (np.mean(cv_scores), cv_scores))

    print('Cross validation for SVMs complete!')
    return
# Rank features by the importances of the already fitted `etrmodel`
# (fitted outside this excerpt), highest first.
fi=etrmodel.feature_importances_
val=x.columns.values
zzz=pd.DataFrame({"value":val,"fi":fi})
petr=zzz.sort_values(by="fi",ascending=False)
# Bare expression: no effect as a script; presumably shown in a notebook.
petr

# Feature ranking using a random forest classifier (depth limited to 10)
rf=ensemble.RandomForestClassifier(max_depth=10)
rf.fit(xtrain,ytrain)
fi=rf.feature_importances_
val=x.columns.values
zzz=pd.DataFrame({"value":val,"fi":fi})
prf=zzz.sort_values(by="fi",ascending=False)
# Bare expression: no effect as a script; presumably shown in a notebook.
prf

# Recursive feature elimination with extra trees, 5-fold CV scored by recall
rfecv=feature_selection.RFECV(estimator=ensemble.ExtraTreesClassifier(),min_features_to_select=1,cv=5,scoring="recall")
rfecv.fit(xtrain,ytrain)
# Names of the columns kept by RFECV (reuses the name `zzz`).
zzz=xtrain.columns[rfecv.get_support()]
print(zzz)

def modelstats2(Xtrain,Xtest,ytrain,ytest):
    stats=[]
    modelnames=["LogisticReg","DecisionTree","KNN","NB"]
    models=list()
    models.append(linear_model.LogisticRegression())
    models.append(tree.DecisionTreeClassifier())
    models.append(neighbors.KNeighborsClassifier())
    models.append(naive_bayes.GaussianNB())
# coding=utf-8
"""Recursive Feature Elimination with Support Vector Machines Classifier."""
import matplotlib.pyplot as plt
from sklearn import datasets, svm, feature_selection, model_selection


def main():
    """Generate a synthetic 3-class problem, run RFECV and plot the scores."""
    print("Generating data...")
    X, y = datasets.make_classification(
        n_samples=3000,
        n_features=20,
        n_informative=5,
        n_redundant=3,
        n_repeated=2,
        n_classes=3,
        n_clusters_per_class=1,
        random_state=37,
    )

    print("Performing recursive feature selection with crossvalidation.")
    linear_svc = svm.SVC(kernel="linear")
    two_fold = model_selection.StratifiedKFold(2)
    cvrfs = feature_selection.RFECV(linear_svc, cv=two_fold,
                                    scoring="accuracy")
    cvrfs.fit(X, y)
    print("Optimal selected features: {0}".format(cvrfs.n_features_))

    print("Plotting results...")
    # One score per candidate number of features, counted from 1.
    feature_counts = range(1, len(cvrfs.grid_scores_) + 1)
    plt.plot(feature_counts, cvrfs.grid_scores_)
    plt.show()


if __name__ == "__main__":
    main()
######################### normailized features / use the whole data to train ################### # feature_train, feature_test, label_train, label_test = train_test_split(normalized_feature, label, test_size=0.25, random_state=0) feature_train = normalized_feature label_train = label ######################### feature selection ###################### # univariate feature selection with F test for feature scoring selector = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=30) selector = selector.fit(feature_train, label_train) fselected_feature_train = selector.transform(feature_train) # rfecv + random forest rfecv = feature_selection.RFECV(ensemble.RandomForestClassifier(), step=1, cv=5, scoring='accuracy') rfecv = rfecv.fit(fselected_feature_train, label_train) fselected_feature_train = rfecv.transform(fselected_feature_train) print("Optimal number of features : %d" % rfecv.n_features_) # Plot number of features VS. cross-validation scores # plt.figure() # plt.xlabel("Number of features selected") # plt.ylabel("Cross validation score") # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) # plt.show() ######################### start training using SVM ######################
def get_relev_class_RFE(X, y, random_state=None, params=None):
    """Return a 0/1 vector marking the features RFECV (5-fold) selects.

    The base estimator is built by the module-level ``model`` callable from
    ``random_state`` and ``params``.
    """
    base_estimator = model(random_state=random_state, params=params)
    selector = fs.RFECV(base_estimator, cv=5)
    selector.fit(X, y)
    # Boolean support mask converted to integers.
    mask = selector.support_
    return mask.astype(int)
# feature selection using cross validation class PipelineRFE(Pipeline): def fit(self, X, y=None, **fit_params): super(PipelineRFE, self).fit(X, y, **fit_params) self.feature_importances_ = self.steps[-1][-1].feature_importances_ return self pipe = PipelineRFE([('std_scaler', preprocessing.StandardScaler()), ("ET", ExtraTreesRegressor(random_state=42, n_estimators=25))]) # In[39]: feature_selector_cv = feature_selection.RFECV(pipe, cv=5, step=1, scoring="neg_mean_squared_error") feature_selector_cv.fit(X, y) # In[40]: print(feature_selector_cv.n_features_) cv_grid_rmse = np.sqrt(-feature_selector_cv.grid_scores_) print(cv_grid_rmse) # * feature selection has given out 12 features. Lets print them # In[41]: feature_names = (X.columns) selected_features = feature_names[feature_selector_cv.support_].tolist()
def correlation_supervisor(path, rootdir, simple=False, lig_only=False, max_descriptors=False):
    """Regress y-values on descriptors, rank features with RFE/RFECV, write CSV reports.

    Entries are loaded with ``accquire_file(path)``; each entry supplies a
    ``yvalue`` and a descriptor vector. Descriptors are standardized, a
    baseline ``LinearRegression`` is fitted on all of them, plain RFE is run
    for every target size from all features down to one, and RFECV with
    leave-one-out CV (negative MSE scoring) picks the feature count.

    Files written (each name is appended directly to ``rootdir``):
    ``RFECV_rankings.csv``, ``y_data.csv``, ``y_pred_r.csv``,
    ``optimal_decriptor_space.csv``, ``full_descriptor_space.csv``,
    ``scaling.csv``, ``coeficients.csv`` and ``rfe_mse.csv``.

    :param path: input location handed to ``accquire_file``.
    :param rootdir: string prefix for every output file; it is concatenated
        as-is, so it must already end with a path separator if a directory
        is intended.
    :param simple: forwarded to each entry's ``get_descriptor_vector``.
    :param lig_only: forwarded to each entry's ``get_descriptor_vector``.
    :param max_descriptors: if truthy, cast to ``int`` and compared with the
        RFECV-selected feature count. The comparison is only reported; the
        outputs below always use the RFECV selection.
    :return: None; results are printed and written to disk.
    """
    # load the files from the given input file
    file_dict, fail_dict = accquire_file(path)

    # Loop over successful imports; each row is [y, descriptor_1, ...].
    big_mat = list()
    col_names = list()
    for i, keyv in enumerate(file_dict.keys()):
        entry = file_dict[keyv]
        entry.get_descriptor_vector(lig_only, simple, name=False, loud=False)
        if i == 0:
            # Column names are taken from the first entry only.
            col_names = entry.descriptor_names
        big_mat.append([float(entry.yvalue)] + list(entry.descriptors))
    big_mat = np.array(big_mat)

    ##### let's do some regression
    ### standardize model:
    col_array = np.array(col_names)
    print('length of col array is ' + str(len(col_array)))
    n_tot = len(col_array)
    X = big_mat[:, 1:]
    print('dimension of data matrix is ' + str(big_mat.shape))
    # Row count; taken from the shape so a single descriptor column also works.
    n_obs = X.shape[0]
    Scaler = preprocessing.StandardScaler().fit(X)
    Xs = Scaler.transform(X)
    Y = big_mat[:, 0]

    ## find baseline model (all descriptors)
    Reg = linear_model.LinearRegression()
    Reg.fit(Xs, Y)
    Ypred_all_all = Reg.predict(Xs)
    rs_all_all = metrics.r2_score(Y, Ypred_all_all)
    loo = model_selection.LeaveOneOut()

    ### stepwise reduce the feature set until only one is left
    mse_reduce = list()
    for n in range(0, n_tot):
        # n_features_to_select is keyword-only in newer scikit-learn releases.
        reductor = feature_selection.RFE(Reg, n_features_to_select=n_tot - n, step=1, verbose=0)
        reductor.fit(Xs, Y)
        mse_reduce.append(metrics.mean_squared_error(Y, reductor.predict(Xs)))

    ### the last reductor kept a single feature; its ranking orders all of them
    reductor_features = [[name, rank] for name, rank in zip(col_array, reductor.ranking_)]
    reductor_features = sorted(reductor_features, key=lambda x: x[1])
    print('****************************************')

    ### select best number using cv
    selector = feature_selection.RFECV(Reg, step=1, cv=loo, verbose=0,
                                       scoring='neg_mean_squared_error')
    selector.fit(Xs, Y)
    # ``grid_scores_`` was removed in scikit-learn 1.2; prefer ``cv_results_``
    # and fall back to ``grid_scores_`` on releases that predate it.
    if hasattr(selector, 'cv_results_'):
        select_mse = np.asarray(selector.cv_results_['mean_test_score'])
    else:
        select_mse = np.asarray(selector.grid_scores_)
    Ypred = selector.predict(Xs)
    rs = metrics.r2_score(Y, Ypred)
    n_opt = selector.n_features_
    opt_features = col_array[selector.support_]
    ranked_features = [[name, rank] for name, rank in zip(col_array, selector.ranking_)]
    ranked_features = sorted(ranked_features, key=lambda x: x[1])
    print(ranked_features)

    if max_descriptors:
        ## check if we need to reduce further
        print('a max of ' + str(max_descriptors) + ' were requested')
        n_max = int(max_descriptors)
        if n_opt > n_max:
            print('the RFE process selected ' + str(n_opt) + ' varibles as optimal')
            print('discarding an additional ' + str(n_opt - n_max) + ' variables')
            # NOTE(review): the cap is only reported. Everything below still
            # uses the RFECV selection (n_opt features); confirm whether the
            # outputs are meant to be truncated to n_max.

    ## report results to user
    print('analzyed ' + str(n_obs) + ' molecules')
    print('the full-space R2 is ' + str("%0.2f" % rs_all_all) + ' with ' + str(n_tot) + ' features')
    print('optimal number of features is ' + str(n_opt) + ' of total ' + str(n_tot))
    print('the opt R2 is ' + str("%0.2f" % rs))

    # Refit a plain linear model on the RFECV-selected columns.
    X_r = selector.transform(Xs)
    reg_red = linear_model.LinearRegression()
    reg_red.fit(X_r, Y)
    Ypred_r = reg_red.predict(X_r)
    coefs = reg_red.coef_
    intercept = reg_red.intercept_
    mse_all = metrics.mean_squared_error(Y, Ypred_all_all)
    mse_r = metrics.mean_squared_error(Y, Ypred_r)
    if n_opt < 30:
        print('the optimal variables are: ' + str(opt_features))
        print('the coefficients are' + str(coefs))
    else:
        print('the (first 30) optimal variables are: ' + str(opt_features[0:30]))
        print('the (first 30) coefficients are' + str(coefs[0:30]))
    print('the intercept is ' + str("%0.2f" % intercept))
    print('the training MSE with the best feature set is ' + str("%0.2f" % mse_r))
    print('the MSE with all features is ' + str("%0.2f" % mse_all))
    # With step=1 and the default minimum of one feature, select_mse[i] is the
    # mean CV score for i + 1 features: the last entry is the full feature set
    # and entry n_opt - 1 is the selected optimum.
    cv_mse_full = abs(select_mse[-1])
    cv_mse_opt = abs(select_mse[n_opt - 1])
    print('by eliminating ' + str(n_tot - n_opt) + ' features,' + ' CV-prediction MSE decreased from '
          + str("%0.0f" % cv_mse_full) + ' to ' + str("%00f" % cv_mse_opt))

    # NOTE(review): the header names rank before column, but each row is
    # written as name,rank,name,rank. Left unchanged for downstream readers.
    with open(rootdir + 'RFECV_rankings.csv', 'w') as f:
        f.write('RFE_rank,RFE_col,RFECV_rank,RFECV_col, \n')
        for rfe_item, rfecv_item in zip(reductor_features, ranked_features):
            f.write(str(rfe_item[0]) + ',' + str(rfe_item[1]) + ','
                    + str(rfecv_item[0]) + ',' + str(rfecv_item[1]) + '\n')
    with open(rootdir + 'y_data.csv', 'w') as f:
        for items in Y:
            f.write(str(items) + '\n')
    with open(rootdir + 'y_pred_r.csv', 'w') as f:
        for items in Ypred_r:
            f.write(str(items) + '\n')
    with open(rootdir + 'optimal_decriptor_space.csv', 'w') as f:
        for i in range(0, n_obs):
            f.write(','.join(str(X_r[i][j]) for j in range(0, n_opt)) + '\n')
    with open(rootdir + 'full_descriptor_space.csv', 'w') as f:
        # The header keeps a trailing comma after the last name, as before.
        for names in col_names:
            f.write(names + ',')
        f.write('\n')
        for i in range(0, n_obs):
            f.write(','.join(str(Xs[i][j]) for j in range(0, n_tot)) + '\n')
    # NOTE(review): the header lists three columns but each row has a fourth
    # value (the RFECV ranking). Left unchanged for downstream readers.
    with open(rootdir + 'scaling.csv', 'w') as f:
        means = Scaler.mean_
        var = Scaler.var_
        f.write('name, mean,variance \n')
        for i in range(0, n_tot):
            f.write(str(col_names[i]) + ',' + str(means[i]) + ',' + str(var[i]) + ','
                    + str(selector.ranking_[i]) + '\n')
    with open(rootdir + 'coeficients.csv', 'w') as f:
        f.write('intercept,' + str(intercept) + '\n')
        for i in range(0, n_opt):
            f.write(str(opt_features[i]) + ',' + str(coefs[i]) + '\n')
    # NOTE(review): values are training-set MSE from the RFE loop although the
    # header says "mean CV error", and the header also carries the intercept.
    with open(rootdir + 'rfe_mse.csv', 'w') as f:
        f.write('features removed,mean CV error,' + str(intercept) + '\n')
        for count, items in enumerate(mse_reduce):
            f.write(str(count) + ',' + str(items) + '\n')
## -->>Select k best features
# RFECV function

## Reshape the Label array to one dimension
Labels = Labels.reshape(Labels.shape[0],)

## Set folds for nested cross validation
# KFold is created without random_state, so the seed set here is what makes
# the shuffle repeatable.
nr.seed(988)
feature_folds = ms.KFold(n_splits=10, shuffle = True)

## Define the model
logistic_mod = linear_model.LogisticRegression(C = 10, class_weight = {0:0.45, 1:0.55})

## Perform feature selection by CV with high variance features only
nr.seed(6677)
selector = fs.RFECV(estimator = logistic_mod, cv = feature_folds,
                    scoring = 'roc_auc')
selector = selector.fit(Features_reduced, Labels)
selector.support_

# relative ranking of the features
selector.ranking_

# transform method applies the selector to the feature array
Features_reduced = selector.transform(Features_reduced)
Features_reduced.shape

# plot of AUC (the metric) vs. the number of features
# ``grid_scores_`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# ``cv_results_["mean_test_score"]`` holds the per-subset mean CV score there.
# Fall back to ``grid_scores_`` on releases that predate ``cv_results_``.
if hasattr(selector, 'cv_results_'):
    mean_auc_by_n_features = selector.cv_results_['mean_test_score']
else:
    mean_auc_by_n_features = selector.grid_scores_
plt.plot(range(1, len(mean_auc_by_n_features) + 1), mean_auc_by_n_features)
plt.title('Mean AUC by number of features')
plt.ylabel('AUC')
plt.xlabel('Number of features')
# NOTE(review): this fragment starts inside a list literal whose opening is
# not present here; presumably it is the `cols` list used below - confirm
# against the original script.
"BusinessTravel", "Department", "EducationField", "Gender", "JobRole", "MaritalStatus", "OverTime" ]
# One prefix per listed column, used for the generated dummy column names.
pref = ["btravel", "dept", "edufield", "g", "jr", "m", "ot"]
# One-hot encode the columns named in `cols`.
Xohe = pd.get_dummies(X, columns=cols, prefix=pref)
# Bare expressions below only display output in an interactive session.
X.head()
Xohe.columns.values.shape
Xohe.columns.values
# 80/20 split, stratified on y, with a fixed seed.
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    Xohe, y, test_size=.2, random_state=42, stratify=y)
# Cross-validated recursive feature elimination: logistic regression (C=.1),
# 5 folds, 3 features removed per step, scored on ROC AUC.
selector = feature_selection.RFECV(
    estimator=linear_model.LogisticRegression(C=.1), cv=5, step=3, scoring="roc_auc")
selector.fit(Xtrain, ytrain)
Xohe.columns.values[selector.get_support()].shape
# Keep only the columns selected by RFECV in both splits.
Xtrain1 = Xtrain[Xohe.columns.values[selector.get_support()]]
Xtest1 = Xtest[Xohe.columns.values[selector.get_support()]]
# `modelstats` is defined elsewhere; presumably it fits and evaluates a model
# on the given split - confirm. Called here on the selected columns.
print(modelstats(Xtrain1, Xtest1, ytrain, ytest))
# Correlation among the target and three continuous columns of `eadf`
# (`eadf` is defined elsewhere).
Xcont = eadf[["Attrition", "DailyRate", "MonthlyIncome", "MonthlyRate"]]
sns.heatmap(Xcont.corr(), annot=True)
Xcont.corr()
# Same call on the full one-hot feature set, for comparison.
print(modelstats(Xtrain, Xtest, ytrain, ytest))