Beispiel #1
0
def banckmark_feature_selection():
    """Print mean RFECV cross-validation scores for a scaled logistic regression.

    Loads the data with ``micro_data()``, takes columns 1-9 as features and
    column 10 (cast to int) as the target, then runs ``RFECV`` (5 folds, two
    features dropped per step) once per metric and prints the mean score.
    """
    d = micro_data()

    X = np.array(d)[:, 1:10]
    Y = np.array(d)[:, 10]

    class PipelineRFE(Pipeline):
        """Pipeline exposing the last step's ``coef_`` so RFE can rank features."""

        def fit(self, X, y=None, **fit_params):
            super(PipelineRFE, self).fit(X, y, **fit_params)
            self.coef_ = self.steps[-1][-1].coef_
            return self

    pipe = PipelineRFE(
        [
            ('std_scaler', preprocessing.StandardScaler()),
            ("LR", linear_model.LogisticRegression(random_state=42))
        ]
    )
    print("Scores for validation banchmark sklearn RFE")

    # (label used in the printed line, scikit-learn scorer name).
    # "log_loss" was renamed to "neg_log_loss"; the old name is no longer
    # accepted by scikit-learn, the reported (negated) values are the same.
    metrics = (
        ("accuracy", "accuracy"),
        ("log_loss", "neg_log_loss"),
        ("auc", "roc_auc"),
    )
    target = Y.astype('int')
    for label, scoring in metrics:
        feature_selector_cv = feature_selection.RFECV(
            pipe, cv=5, step=2, scoring=scoring).fit(X, target)

        # ``grid_scores_`` was removed from recent scikit-learn releases;
        # ``cv_results_`` carries the per-step mean scores there.
        if hasattr(feature_selector_cv, "cv_results_"):
            scores = feature_selector_cv.cv_results_["mean_test_score"]
        else:
            scores = feature_selector_cv.grid_scores_

        print(pipe.__class__.__name__
              + " {} is {}".format(label, np.mean(scores)))
Beispiel #2
0
    def selector_logistic_rfe(self,
                              features_indep_df: PandasDataFrame,
                              feature_target: List,
                              kernel: str = "linear",
                              n_jobs: int = -1,
                              **kwargs: Any) -> object:
        """Fit a cross-validated recursive feature elimination (RFECV) selector.

        An ``SVR`` with the requested kernel is wrapped in ``RFECV`` and fitted
        on the given data. Attributes listed for the fitted model:
        model.n_features_
        model.support_
        model.ranking_
        model.grid_scores_
        model.estimator_

        :param features_indep_df: the independent features fed to the model.
        :param feature_target: the target feature being estimated.
        :param kernel: kernel type handed to ``SVR``; defaults to "linear".
        NOTE(review): RFECV ranks features through the estimator's
        coefficients/importances - confirm non-linear kernels are usable here.
        :param n_jobs: number of CPUs handed to ``RFECV``; -1 is the default.
        :param kwargs: forwarded to ``RFECV`` (e.g. step, cv, scoring, verbose).
        :return: whatever ``RFECV.fit`` returns (the fitted selector).
        """
        self.__logger.debug(
            "Run Feature Ranking with Recursive Feature Elimination.")
        svr = SVR(kernel=kernel)
        rfecv = feature_selection.RFECV(estimator=svr,
                                        n_jobs=n_jobs,
                                        **kwargs)
        fitted = rfecv.fit(features_indep_df, feature_target)
        return fitted
Beispiel #3
0
def quick_fitted_tree(X,
                      Y,
                      model_type=['GridSearch', 'FeatureSelection'],
                      test_split=None,
                      random_state=None):
    """Fit a decision tree, optionally with RFECV selection and a grid search.

    :param X: feature matrix; it is copied, and when 'FeatureSelection' is
        requested it is indexed as ``x[:, mask]`` (NumPy-style indexing).
    :param Y: target values; copied.
    :param model_type: container of step names. 'FeatureSelection' runs
        RFECV first and keeps only the selected columns; 'GridSearch' tunes
        ``criterion``/``max_depth`` with ``GridSearchCV``. ``None`` or an
        empty container fits a plain tree. The default list is only read,
        never mutated.
    :param test_split: when a ``float``, that fraction is held out with
        ``train_test_split``; any other value disables the hold-out split.
    :param random_state: seed for the split, the CV splitter and the trees
        (the grid search itself pins the tree's ``random_state`` to 0).
    :return: ``(model, sel_cols, splitted_data)`` where ``model`` is the
        fitted tree (the best estimator when a grid search ran), ``sel_cols``
        is the RFECV support mask or ``None``, and ``splitted_data`` is
        ``(x, x_test, y, y_test)`` with a hold-out split, else
        ``(None, x, None, y)``.
    """
    from sklearn import tree, model_selection, feature_selection

    # Treat ``None`` as "no optional steps" instead of failing on ``in``.
    steps = () if model_type is None else model_type
    run_selection = 'FeatureSelection' in steps
    run_grid_search = 'GridSearch' in steps
    has_holdout = isinstance(test_split, float)

    sel_cols = None
    x = X.copy()
    y = Y.copy()
    if has_holdout:
        x, x_test, y, y_test = model_selection.train_test_split(
            x, y, test_size=test_split, random_state=random_state)
    cv_split = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=.3,
                                            train_size=.6,
                                            random_state=random_state)
    model = tree.DecisionTreeClassifier(random_state=random_state)

    if run_selection:
        dtree_rfe = feature_selection.RFECV(
            tree.DecisionTreeClassifier(random_state=random_state),
            step=1,
            scoring='accuracy',
            cv=cv_split)
        dtree_rfe.fit(x, y)
        # Compute the mask once and apply it to train and hold-out alike.
        sel_cols = dtree_rfe.get_support()
        x = x[:, sel_cols]
        if has_holdout:
            x_test = x_test[:, sel_cols]

    if run_grid_search:
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 4, 6, 8, 10, None],
            'random_state': [0],
            #'splitter': ['best', 'random'],
            #'min_samples_split': [2,5,10,.03,.05],
            #'min_samples_leaf': [1,5,10,.03,.05],
            #'max_features': [None, 'auto'],
        }
        model = model_selection.GridSearchCV(
            tree.DecisionTreeClassifier(random_state=random_state),
            param_grid=param_grid,
            scoring='accuracy',
            cv=cv_split)

    model.fit(x, y)
    # Only a grid search has ``best_estimator_``; a plain tree is returned
    # as fitted.
    if run_grid_search:
        model = model.best_estimator_

    if has_holdout:
        splitted_data = (x, x_test, y, y_test)
    else:
        splitted_data = (None, x, None, y)
    return model, sel_cols, splitted_data
Beispiel #4
0
def get_fs_model(model, method, train, target=None, cv=None):
    """Connect ``model`` with the named feature selection method.

    For "RFE" and "RFECV" the wrapped selector is fitted on ``train`` (and
    ``target`` when given) and the fit result is returned. For every other
    known method an unfitted two-step ``Pipeline`` (``feature_selection`` ->
    ``data_mining``) is returned; with "fromModel" the selector inside that
    pipeline has already been fitted.

    :param model: estimator to wrap.
    :param method: one of "RFE", "RFECV", "linearSVC", "fromModel",
        "VarianceThreshold", "SelectPercentile", "SelectFpr", "SelectFdr",
        "SelectFwe", "ch2".
    :param train: training samples.
    :param target: optional training targets.
    :param cv: cross-validation setting, used by "RFECV" only.
    :return: fitted selector ("RFE"/"RFECV") or a ``Pipeline``.
    :raises SystemExit: with code 1, after printing a message, when
        ``method`` is not recognised.
    """
    def _fit(estimator):
        # Fit with the target only when one was supplied.
        if target is not None:
            return estimator.fit(train, target)
        return estimator.fit(train)

    # Keyword arguments: recent scikit-learn releases reject these values
    # when passed positionally.
    if method == "RFE":
        return _fit(fs_scikit.RFE(model, n_features_to_select=2, step=5))
    if method == "RFECV":
        return _fit(fs_scikit.RFECV(model, step=3, cv=cv))

    if method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
    elif method == "fromModel":
        sel = fs_scikit.SelectFromModel(model)
        _fit(sel)

    # elif method == "Anova":
    # ANOVA SVM-C
    # anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    # model = Pipeline([
    #     ('feature_selection', anova_filter),
    #     ('data_mining', model)
    # ])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return Pipeline([('feature_selection', sel), ('data_mining', model)])
Beispiel #5
0
def select_feature(base_result, tune_model, trainX, trainY):
    """Print decision-tree scores before RFECV, after RFECV and after re-tuning.

    Relies on the module-level ``dtree``, ``cv_split`` and ``param_grid``.

    :param base_result: mapping with 'train_score' and 'test_score' entries
        (values must support ``.mean()`` / ``.std()``) for the baseline model.
    :param tune_model: not used by this function.
    :param trainX: training features; ``.shape``, ``.columns`` and column
        selection by label are used.
    :param trainY: training target.
    :return: None; all results are printed.
    """
    divider = '-' * 100

    # --- baseline report -------------------------------------------------
    print('Before DT RFE Training Shape Old:', trainX.shape)
    print('Before DT RFE Training Columns Old:', trainX.columns.values)
    base_train_pct = base_result['train_score'].mean() * 100
    print('Before DT RFE Training w/bin score mean: {:.2f}'.format(
        base_train_pct))
    base_test_pct = base_result['test_score'].mean() * 100
    print('Before DT RFE Test w/bin score mean: {:.2f}'.format(base_test_pct))
    base_test_spread = base_result['test_score'].std() * 100 * 3
    print('Before DT RFE Test w/bin score 3*std: +- {:.2f}'.format(
        base_test_spread))
    print(divider)

    # --- recursive feature elimination ------------------------------------
    dtree_rfe = feature_selection.RFECV(dtree,
                                        cv=cv_split,
                                        scoring='accuracy',
                                        step=1)
    dtree_rfe.fit(trainX, trainY)

    # Names of the columns RFECV decided to keep.
    keep_mask = dtree_rfe.get_support()
    X_rfe = trainX.columns.values[keep_mask]
    rfe_result = model_selection.cross_validate(dtree,
                                                trainX[X_rfe],
                                                trainY,
                                                cv=cv_split,
                                                return_train_score=True)

    print('After DT RFE Training Shape New:', trainX[X_rfe].shape)
    print('After DT RFE Training Columns New:', X_rfe)
    rfe_train_pct = rfe_result['train_score'].mean() * 100
    print('After DT RFE Training w/bin score mean: {:.2f}'.format(
        rfe_train_pct))
    rfe_test_pct = rfe_result['test_score'].mean() * 100
    print('After DT RFE Test w/bin score mean: {:.2f}'.format(rfe_test_pct))
    rfe_test_spread = rfe_result['test_score'].std() * 100 * 3
    print('After DT RFE Test w/bin score 3*std: +- {:.2f}'.format(
        rfe_test_spread))
    print(divider)

    # --- grid search on the reduced feature set ---------------------------
    rfe_tune_model = model_selection.GridSearchCV(
        tree.DecisionTreeClassifier(),
        param_grid=param_grid,
        scoring='roc_auc',
        cv=cv_split,
        return_train_score=True)
    rfe_tune_model.fit(trainX[X_rfe], trainY)
    print('After DT RFE Tuned Parameters:', rfe_tune_model.best_params_)
    tuned_train_pct = rfe_tune_model.cv_results_['mean_train_score'][
        rfe_tune_model.best_index_] * 100
    print('After DT RFE Tuned Training w/bin score mean: {:.2f}'.format(
        tuned_train_pct))
    tuned_test_pct = rfe_tune_model.cv_results_['mean_test_score'][
        rfe_tune_model.best_index_] * 100
    print('After DT RFE Tuned Test w/bin score mean: {:.2f}'.format(
        tuned_test_pct))
    tuned_test_spread = rfe_tune_model.cv_results_['std_test_score'][
        rfe_tune_model.best_index_] * 100 * 3
    print('After DT RFE Tuned Test w/bin score 3*std: +- {:.2f}'.format(
        tuned_test_spread))
    print(divider)
Beispiel #6
0
def do_rfe(model, X, y, cv_split=None, scoring='accuracy', random_state=42):
    """Return the column labels of ``X`` kept by cross-validated RFE.

    :param model: estimator; a clone of it is handed to ``RFECV``.
    :param X: features; ``X.columns`` is used to name the selected features.
    :param y: target.
    :param cv_split: cross-validation splitter; when ``None`` a 10-split
        ``ShuffleSplit`` (30% test, 60% train) seeded with ``random_state``
        is built.
    :param scoring: scoring passed to ``RFECV``.
    :param random_state: seed used only for the default splitter.
    :return: array of the selected column labels.
    """
    splitter = cv_split
    if splitter is None:
        splitter = model_selection.ShuffleSplit(n_splits=10,
                                                test_size=.3,
                                                train_size=.6,
                                                random_state=random_state)

    rfecv = feature_selection.RFECV(clone(model),
                                    cv=splitter,
                                    scoring=scoring,
                                    step=1)
    rfecv.fit(X, y)
    print('do_feat_rfe: Done')
    keep_mask = rfecv.get_support()
    return X.columns.values[keep_mask]
Beispiel #7
0
def select_feature(alg, x, y, cv_split=None, is_print=True):
    """Select features with RFECV and optionally report before/after scores.

    :param alg: estimator used both for RFECV and for the reported
        cross-validation scores.
    :param x: features; ``x.columns`` is used to name the selected features.
    :param y: target.
    :param cv_split: cross-validation splitter; when ``None`` a 10-split
        ``ShuffleSplit`` (30% test, 70% train, seed 0) is built.
    :param is_print: when true, cross-validate ``alg`` on the full and on the
        reduced feature set and print the scores.
    :return: array of the selected column labels.
    """
    if cv_split is None:
        cv_split = model_selection.ShuffleSplit(n_splits=10,
                                                test_size=.3,
                                                train_size=.7,
                                                random_state=0)

    if is_print:
        # The baseline scores are only needed for the report.
        # ``return_train_score=True`` is required: train scores are not
        # part of the result by default.
        base_results = model_selection.cross_validate(
            alg, x, y, cv=cv_split, return_train_score=True)

        print('BEFORE DT RFE Training Shape Old: ', x.shape)
        print('BEFORE DT RFE Training Columns Old: ', x.columns.values)

        print("BEFORE DT RFE Training w/bin score mean: {:.2f}".format(
            base_results['train_score'].mean() * 100))
        print("BEFORE DT RFE Test w/bin score mean: {:.2f}".format(
            base_results['test_score'].mean() * 100))
        print("BEFORE DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
            base_results['test_score'].std() * 100 * 3))
        print('-' * 10)

    # feature selection
    dtree_rfe = feature_selection.RFECV(alg,
                                        step=1,
                                        scoring='accuracy',
                                        cv=cv_split)
    dtree_rfe.fit(x, y)

    # the optimal columns
    X_rfe = x.columns.values[dtree_rfe.get_support()]

    if is_print:
        rfe_results = model_selection.cross_validate(
            alg, x[X_rfe], y, cv=cv_split, return_train_score=True)

        print('AFTER DT RFE Training Shape New: ', x[X_rfe].shape)
        print('AFTER DT RFE Training Columns New: ', X_rfe)

        print("AFTER DT RFE Training w/bin score mean: {:.2f}".format(
            rfe_results['train_score'].mean() * 100))
        print("AFTER DT RFE Test w/bin score mean: {:.2f}".format(
            rfe_results['test_score'].mean() * 100))
        print("AFTER DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
            rfe_results['test_score'].std() * 100 * 3))
        print('-' * 10)

    return X_rfe
def feature_select(data1, data1_x_bin, dtree, base_results, cv_split, Target):
    """Run RFECV on ``data1[data1_x_bin]`` and report scores before and after.

    :param data1: data frame holding both the features and the target.
    :param data1_x_bin: labels of the candidate feature columns.
    :param dtree: estimator used for RFECV and for the follow-up
        cross-validation.
    :param base_results: mapping with 'train_score' and 'test_score' entries
        for the baseline model (values must support ``.mean()``/``.std()``).
    :param cv_split: cross-validation splitter.
    :param Target: label(s) of the target column(s) in ``data1``.
    :return: ``(fitted RFECV, list of selected column labels,
        cross_validate results on the selected columns)``.
    """
    separator = '-' * 10

    # More predictors do not make a better model, the right predictors do.
    print('feature_select')
    print('BEFORE DT RFE Training Shape Old: ', data1[data1_x_bin].shape)
    print('BEFORE DT RFE Training Columns Old: ',
          data1[data1_x_bin].columns.values.tolist())

    before_train = base_results['train_score'].mean() * 100
    print("BEFORE DT RFE Training w/bin score mean: {:.2f}".format(
        before_train))
    before_test = base_results['test_score'].mean() * 100
    print("BEFORE DT RFE Test w/bin score mean: {:.2f}".format(before_test))
    before_spread = base_results['test_score'].std() * 100 * 3
    print("BEFORE DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
        before_spread))
    print(separator)

    # Recursive elimination, parallelised over all cores.
    dtree_rfe = feature_selection.RFECV(dtree,
                                        cv=cv_split,
                                        n_jobs=-1,
                                        scoring='accuracy',
                                        step=1)
    dtree_rfe.fit(data1[data1_x_bin], data1[Target])

    # Boolean mask of kept features -> list of their column labels.
    keep_mask = dtree_rfe.get_support()
    X_rfe = data1[data1_x_bin].columns.values[keep_mask].tolist()
    rfe_results = model_selection.cross_validate(dtree,
                                                 data1[X_rfe],
                                                 data1[Target],
                                                 cv=cv_split,
                                                 return_train_score=True)

    print('AFTER DT RFE Training Shape New: ', data1[X_rfe].shape)
    print('AFTER DT RFE Training Columns New: ', X_rfe)

    after_train = rfe_results['train_score'].mean() * 100
    print("AFTER DT RFE Training w/bin score mean: {:.2f}".format(
        after_train))
    after_test = rfe_results['test_score'].mean() * 100
    print("AFTER DT RFE Test w/bin score mean: {:.2f}".format(after_test))
    after_spread = rfe_results['test_score'].std() * 100 * 3
    print("AFTER DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
        after_spread))
    print(separator)
    return dtree_rfe, X_rfe, rfe_results
Beispiel #9
0
def key_features(X_train, y_train, sub, varience_test=True):
    """Reduce the training and submission features with optional variance
    filtering followed by cross-validated recursive feature elimination.

    Written for Python 2 (``print`` statements).

    :param X_train: training samples; must be indexable as ``X_train[0]``
        so the feature count can be printed.
    :param y_train: training targets.
    :param sub: submission samples, transformed with the selectors fitted
        on ``X_train``.
    :param varience_test: when true, first drop low-variance features with
        ``VarianceThreshold``.
    :return: tuple ``(features, submission)`` of the reduced training and
        submission data.
    """
    print 'Features before reduction: ' + str(len(X_train[0]))
    if varience_test:
        #remove features with low variance
        # threshold is .8 * (1 - .8) = 0.16
        sel = feature_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))
        X_train = sel.fit_transform(X_train)
        # apply the same column filter to the submission data
        sub = sel.transform(sub)
        print 'Features after variance reduction: ' + str(len(X_train[0]))

    # NOTE(review): class_weight='auto' is only accepted by old scikit-learn
    # releases (later ones use 'balanced') - confirm the targeted version.
    estimator = linear_model.SGDClassifier(n_jobs=-1, class_weight='auto')
    # eliminate one feature per step, scored with 5-fold cross-validation
    selector = feature_selection.RFECV(estimator, step=1, cv=5)
    features = selector.fit_transform(X_train, y_train)
    submission = selector.transform(sub)

    print 'Features after recursive elimination: ' + str(len(features[0]))

    return (features, submission)
def rfe(X2, y2):
    """Run RFECV with a linear SVC and plot the CV score per feature count.

    :param X2: feature matrix.
    :param y2: class labels.
    :return: None; a matplotlib figure is shown.
    """
    # Create the RFE object and compute a cross-validated score.
    svc = svm.SVC(kernel="linear")

    rfecv = feature_selection.RFECV(
        estimator=svc, step=1,
        cv=model_selection.StratifiedKFold(4),
        scoring='roc_auc')
    rfecv.fit(X2, y2)

    # ``grid_scores_`` was removed from recent scikit-learn releases; the
    # mean score per step is available as ``cv_results_["mean_test_score"]``.
    if hasattr(rfecv, "cv_results_"):
        scores = rfecv.cv_results_["mean_test_score"]
    else:
        scores = rfecv.grid_scores_

    # Plot number of features VS. cross-validation scores.
    # With step=1 the i-th score belongs to i selected features.
    plt.figure()
    plt.xlabel("Number of features selected")
    # The scores are ROC AUC values, not counts of correct classifications.
    plt.ylabel("Cross validation score (ROC AUC)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()
Beispiel #11
0
        def fit(self, x, y):
            """Select variables with RFECV, then fit on the selected columns.

            A clone of the enclosing scope's ``estimator``, configured with
            this instance's parameters, drives the elimination. The selected
            column labels are stored in ``self.best_variables`` and recorded
            as a new 0/1 row in the enclosing scope's ``variables_df``.

            :param x: feature data frame.
            :param y: target.
            :return: self, following the scikit-learn ``fit`` convention.
            """
            est = clone(estimator)
            est.set_params(**self.get_params())
            fs = feature_selection.RFECV(
                estimator=est,
                cv=folds,
                scoring=metrics.make_scorer(metrics.roc_auc_score,
                                            average='weighted'),
                n_jobs=-1,
            )
            fs.fit(x, y)
            # Labels of the kept columns, looked up without copying the data.
            self.best_variables = list(
                x.columns[fs.get_support(indices=True)])
            x = x[self.best_variables]
            super().fit(x, y)

            # Append a row flagging the selected variables.
            # NOTE(review): assumes ``variables_df`` has a default 0..n-1
            # index so ``len`` yields a fresh label - confirm.
            i = len(variables_df)
            variables_df.loc[i] = 0
            variables_df.loc[i, self.best_variables] = 1
            return self
Beispiel #12
0
 def __init__(self, random_state=None, n_jobs=1, cv=5):
     """Build a LightGBM random-forest classifier wrapped in ``RFECV``.

     ``random_state`` and ``n_jobs`` are forwarded to the parent
     initializer; ``self.random_state`` must afterwards offer ``randint``
     (presumably a NumPy ``RandomState`` set up by the parent - confirm).
     ``cv`` is handed to ``RFECV``.
     """
     super().__init__(random_state=random_state, n_jobs=n_jobs)
     lgbm_kwargs = dict(
         num_leaves=32,
         max_depth=5,
         boosting_type="rf",
         bagging_fraction=0.632,
         bagging_freq=1,
         # feature_fraction=0.8,
         # Nulled parameters are passed to suppress warning messages.
         subsample=None,
         subsample_freq=None,
         colsample_bytree=None,
         verbose=-1,
     )
     seed = self.random_state.randint(10000)
     self.treemodel = lightgbm.LGBMClassifier(random_state=seed,
                                              n_jobs=self.n_jobs,
                                              **lgbm_kwargs)
     self.model = fs.RFECV(self.treemodel, cv=cv)
def selectFeatures(waveTrainSet, model, enableFS):
    """Return the feature column names, optionally reduced with RFECV.

    Candidate features are all columns except the last two; the last
    column is used as the target.

    :param waveTrainSet: data frame with features followed by the target.
    :param model: estimator handed to ``RFECV``; only used when
        ``enableFS`` is true.
    :param enableFS: when true, keep only features ranked 1 by ``RFECV``
        (2 stratified folds, weighted F1) and print them.
    :return: list of feature column names.
    """
    setLen = len(waveTrainSet.columns)

    featuresName = list(waveTrainSet.columns.values)[:setLen - 2]

    if enableFS:
        # Positional access to the last column; ``.ix`` no longer exists
        # in current pandas.
        target = waveTrainSet.iloc[:, setLen - 1]

        # Recursive feature selection
        recursiveFS = feature_selection.RFECV(
            estimator=model,
            step=1,
            cv=cross_validation.StratifiedKFold(target, 2),
            scoring='f1_weighted')
        recursiveFS.fit(waveTrainSet[featuresName], target)

        # Get all features names ranked as 1
        featuresName = [
            name
            for rank, name in zip(recursiveFS.ranking_, featuresName)
            if rank == 1
        ]

        print("Selected features:")
        print(featuresName)

    return featuresName
Beispiel #14
0
def tune_feature(data, features, target, model):
    """Select features with RFECV and report ``model``'s scores on them.

    Relies on the module-level ``cv_split`` splitter.

    :param data: data frame holding features and target.
    :param features: labels of the candidate feature columns.
    :param target: label of the target column.
    :param model: estimator used for the elimination and for the reported
        cross-validation.
    :return: None; results are printed.
    """
    print('------自动选择特征')
    alg_rfe = feature_selection.RFECV(model,
                                      step=1,
                                      scoring='accuracy',
                                      cv=cv_split)
    alg_rfe.fit(data[features], data[target])

    x_rfe = data[features].columns[alg_rfe.get_support()]
    # Score the plain model on the selected columns. Passing the RFECV
    # wrapper here would re-run the whole elimination inside every fold.
    rfe_results = model_selection.cross_validate(model,
                                                 data[x_rfe],
                                                 data[target],
                                                 cv=cv_split,
                                                 return_train_score=True)
    print('------最好的特征以及该特征下模型效果')
    print('AFTER DT RFE Training Shape New: ', data[x_rfe].shape)
    print('AFTER DT RFE Training Columns New: ', x_rfe)
    print("AFTER DT RFE Training w/bin score mean: {:.2f}".format(
        rfe_results['train_score'].mean() * 100))
    print("AFTER DT RFE Test w/bin score mean: {:.2f}".format(
        rfe_results['test_score'].mean() * 100))
    print("AFTER DT RFE Test w/bin score 3*std: +/- {:.2f}".format(
        rfe_results['test_score'].std() * 100 * 3))
Beispiel #15
0
    def regression_analysis(self):
        """Collect feature rankings and univariate statistics for regression.

        For each listed regressor an ``RFECV`` (5 folds, ``self.n_jobs``
        workers) is fitted on ``self.x`` / ``self.y`` and its ``ranking_``
        stored; F-test scores, p-values and mutual information are added.

        :return: dict keyed by 'logic', 'ridge', 'SGD', 'lm_svm',
            'ADABoost', 'RandomForest', 'f_score', 'f_pval' and 'MIC'.
        """
        # (result key, estimator class): linear models first, then the
        # non-linear ensembles.
        rfe_estimators = (
            ('logic', lm.LinearRegression),
            ('ridge', lm.Ridge),
            ('SGD', lm.SGDRegressor),
            ('lm_svm', svm.LinearSVR),
            ('ADABoost', ensemble.AdaBoostRegressor),
            ('RandomForest', ensemble.RandomForestRegressor),
        )

        report = dict()
        for key, make_estimator in rfe_estimators:
            selector = feature_selection.RFECV(make_estimator(),
                                               cv=5,
                                               n_jobs=self.n_jobs)
            report[key] = selector.fit(self.x, self.y).ranking_

        # univariate statistics
        f_result = feature_selection.f_regression(self.x, self.y)
        report['f_score'] = f_result[0]
        report['f_pval'] = f_result[1]
        report['MIC'] = feature_selection.mutual_info_regression(self.x,
                                                                 self.y)

        return report
Beispiel #16
0
    def classification_analysis(self):
        """Collect feature rankings and univariate statistics for classification.

        For each listed classifier an ``RFECV`` (5 folds, ``self.n_jobs``
        workers) is fitted on ``self.x`` / ``self.y`` and its ``ranking_``
        stored; chi-squared and F-test scores with p-values and mutual
        information are added.

        :return: dict keyed by 'logic', 'ridge', 'SGD', 'lm_svm',
            'ADABoost', 'RandomForest', 'chi2', 'chi2_pval', 'f_score',
            'f_pval' and 'MIC'.
        """
        # (result key, estimator class): linear models first, then the
        # non-linear ensembles.
        rfe_estimators = (
            ('logic', lm.LogisticRegression),
            ('ridge', lm.RidgeClassifier),
            ('SGD', lm.SGDClassifier),
            ('lm_svm', svm.LinearSVC),
            ('ADABoost', ensemble.AdaBoostClassifier),
            ('RandomForest', ensemble.RandomForestClassifier),
        )

        report = dict()
        for key, make_estimator in rfe_estimators:
            selector = feature_selection.RFECV(make_estimator(),
                                               cv=5,
                                               n_jobs=self.n_jobs)
            report[key] = selector.fit(self.x, self.y).ranking_

        # univariate statistics
        chi_result = feature_selection.chi2(self.x, self.y)
        report['chi2'] = chi_result[0]
        report['chi2_pval'] = chi_result[1]
        f_result = feature_selection.f_classif(self.x, self.y)
        report['f_score'] = f_result[0]
        report['f_pval'] = f_result[1]
        report['MIC'] = feature_selection.mutual_info_classif(self.x, self.y)

        return report
Beispiel #17
0
# Tail of an earlier notebook cell: ``feature_select`` and ``x_test`` are
# defined outside this fragment.
x_test_transformed = feature_select.transform(x_test)






from sklearn.feature_selection import chi2, SelectKBest

# Breast-cancer data set: feature matrix and target.
x, y = load_breast_cancer().data, load_breast_cancer().target

# Hold out 20% of the samples for testing (no fixed random seed).
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)

# Keep the 10 features with the highest chi-squared score; the selector is
# fitted on the training split only.
feature_select_chi2 =  SelectKBest(chi2, k =10)
feature_select_chi2.fit(x_train, y_train)

x_train_transformed = feature_select_chi2.transform(x_train)

x_test_transformed = feature_select_chi2.transform(x_test)

# NOTE(review): RandomForestClassifier lives in sklearn.ensemble, so this
# star import does not provide it - presumably it was imported in an
# earlier cell; confirm.
from sklearn.feature_extraction import *
model = RandomForestClassifier(n_estimators = 500)

import sklearn.feature_selection as fs
# The RFECV selector is only constructed here; it is never fitted in this
# fragment.
feature_select_rfe = fs.RFECV(model, cv=5, min_features_to_select = 3)
# Fit the forest directly; the bare expressions below only display values
# when run interactively (importances, sorted importances, sort order).
model.fit(x_train, y_train)
model.feature_importances_
np.sort(model.feature_importances_)
np.argsort(model.feature_importances_)
Beispiel #18
0
def select_features(x, y):
    """
    Combine several feature selectors in a FeatureUnion, grid-search an SVM
    pipeline over them and pickle the outcome to ``FS_PICKLE``.

    Written for Python 2 (``print`` statement). NOTE(review): several
    statements below look unable to run as written; see the inline notes.

    :param x: dataframe of features
    :param y: dataframe of target property
    :return: Outputs of feature selection process
    """
    x = pd.DataFrame(x)

    # Removing features with low variance
    var_threshold = f_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))

    # Kbest-based and Percentile-based feature selection using regression
    # NOTE(review): ``f_regress`` holds the *result* of calling f_regression,
    # yet it is passed below as ``score_func`` - confirm whether the
    # function itself was meant.
    f_regress = f_selection.f_regression(x, y, center=False)
    kbest = f_selection.SelectKBest(score_func=f_regress, k=2)
    percent = f_selection.SelectPercentile(score_func=f_regress, percentile=10)

    # Tree-based feature selection using a number of randomized decision trees
    # NOTE(review): the ExtraTreesRegressor class (not an instance) is passed
    # together with prefit=True - confirm.
    trees = f_selection.SelectFromModel(ExtraTreesRegressor, prefit=True)

    # "False positive rate"-based feature selection using regression
    fpr = f_selection.SelectFpr(score_func=f_regress, alpha=0.05)

    # PCA-component evaluation
    pca = PCA(n_components=2)

    # Recursive feature elimination and cross-validated feature selection
    estimator = SVR(kernel="linear")
    selector = f_selection.RFECV(estimator, step=1, cv=5)

    # Build estimator from PCA and Univariate selection:
    combined_features = FeatureUnion([("pca_based", pca),
                                      ("univ_kbest", kbest),
                                      ("false_positive_rate", fpr),
                                      ("percentile_based", percent),
                                      ("RFECV_selector", selector),
                                      ("variance_threshold", var_threshold),
                                      ("trees_based", trees)])
    x_union_features = combined_features.fit_transform(x, y)

    svm = SVC(kernel="linear")

    # Do grid search over all parameters:
    # NOTE(review): the "features" step is the transformed data
    # (``x_union_features``), not the ``combined_features`` transformer -
    # confirm.
    pipeline = Pipeline([("features", x_union_features), ("svm", svm)])

    # NOTE(review): ``range`` does not accept a float step (0.01), and some
    # keys use single underscores where the first two use the double
    # underscore step/parameter separator - confirm the intended grid.
    grid = dict(features__pca_based__n_components=range(1, 101),
                features__univ_kbest__k=range(1, 101),
                features_false_positive_rate_alpha=range(0, 1, 0.01),
                features_percentile_based_percentile=range(1, 20, 1),
                features_RFECV_selector_cv=range(1, 5),
                features_variance_threshold_threshold=range(0, 1, 0.01),
                svm__C=[0.01, 0.1, 1.0, 10.0])

    grid_search = GridSearchCV(pipeline, param_grid=grid, verbose=0)
    # NOTE(review): confirm GridSearchCV offers ``fit_transform``.
    x_features = grid_search.fit_transform(x, y)

    # Pickling feature reduction outputs
    # NOTE(review): ``rf_sorted_score`` is not defined in this function -
    # presumably a module-level name; confirm.
    with open(FS_PICKLE, 'wb') as result:
        pickle.dump(rf_sorted_score, result, pickle.HIGHEST_PROTOCOL)
        pickle.dump(grid_search.best_estimator_, result,
                    pickle.HIGHEST_PROTOCOL)

    print grid_search.best_estimator_

    return x_features
print(X_train.shape, X_test.shape)

#%%
#RFE CV to train with less feats
from sklearn import feature_selection as fs
from sklearn import linear_model as lin
import matplotlib.pyplot as plt

# Class-weighted logistic regression; the high iteration cap gives lbfgs
# room to converge.
lr = lin.LogisticRegression(class_weight="balanced",
                            solver="lbfgs",
                            max_iter=100000)

# Cross-validated recursive feature elimination: one feature per step,
# 5 folds, weighted F1, all cores.
rfe = fs.RFECV(estimator=lr,
               step=1,
               n_jobs=-1,
               cv=5,
               scoring='f1_weighted',
               verbose=2)
rfe.fit(X=X_train, y=y_train.ravel())

print("Optimal number of features : %d" % rfe.n_features_)

# ``grid_scores_`` was removed from recent scikit-learn releases; the mean
# score per step is available as ``cv_results_["mean_test_score"]``.
if hasattr(rfe, "cv_results_"):
    rfe_cv_scores = rfe.cv_results_["mean_test_score"]
else:
    rfe_cv_scores = rfe.grid_scores_

# Plot number of features VS. cross-validation scores
# (with step=1 the i-th score belongs to i selected features).
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfe_cv_scores) + 1), rfe_cv_scores)
plt.show()

#%%
X_train_RFE = pd.DataFrame(rfe.transform(X_train),
Beispiel #20
0
# ## Tune the Decision Tree Model with Feature Selection
# We will use Recursive Feature Elimination (RFE) method which selects features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. 

# In[ ]:


# Baseline report for the untuned decision tree.
print('BEFORE RFE Training Shape Old: ', x.shape)
print('BEFORE RFE Training Columns Old: ', x.columns.values)

print("BEFORE RFE Training w/bin score mean: {:.2f}". format(base_results_dtree['train_score'].mean()*100))
print("BEFORE RFE Test w/bin score mean: {:.2f}". format(base_results_dtree['test_score'].mean()*100))
print('-'*10)

#feature selection
dtree_rfe = feature_selection.RFECV(tree.DecisionTreeClassifier(), step = 1, scoring = 'accuracy', cv = cv_split)
dtree_rfe.fit(x, y)

#transform x&y to reduced features and fit new model
X_rfe = x.columns.values[dtree_rfe.get_support()]
# return_train_score=True is required: the report below reads
# 'train_score', which is not part of the result by default.
rfe_results = cross_validate(dtree, x[X_rfe], y, cv = cv_split, return_train_score = True)

print('AFTER RFE Training Shape New: ', x[X_rfe].shape)
print('AFTER RFE Training Columns New: ', X_rfe)

print("AFTER RFE Training w/bin score mean: {:.2f}". format(rfe_results['train_score'].mean()*100))
print("AFTER RFE Test w/bin score mean: {:.2f}". format(rfe_results['test_score'].mean()*100))
print('-'*10)

param_grid = {'criterion': ['gini','entropy'], 
              'splitter': ['best', 'random'], 
Beispiel #21
0
              'max_depth': [2,4,6,8,10,None], #max depth tree can grow; default is none
              #'min_samples_split': [2,5,10,.03,.05], #minimum subset size BEFORE new split (fraction is % of total); default is 2
              #'min_samples_leaf': [1,5,10,.03,.05], #minimum subset size AFTER new split split (fraction is % of total); default is 1
              #'max_features': [None, 'auto'], #max features to consider when performing split; default none or all
              'random_state': [0] #seed or control random number generator: https://www.quora.com/What-is-seed-in-random-number-generation
             }

#print(list(model_selection.ParameterGrid(param_grid)))

#choose best model with grid_search: #http://scikit-learn.org/stable/modules/grid_search.html#grid-search
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
# Grid search over ``param_grid`` on the full feature set, scored by ROC AUC.
tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tune_model.fit(data1[data1_x_bin], data1[Target])

#feature selection
# RFECV drops one feature per step and is scored by accuracy; ``dtree`` is
# defined outside this fragment.
dtree_rfe = feature_selection.RFECV(dtree, step = 1, scoring = 'accuracy', cv = cv_split)
dtree_rfe.fit(data1[data1_x_bin], data1[Target])

#transform x&y to reduced features and fit new model
#alternative: can use pipeline to reduce fit and transform steps: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# Labels of the columns kept by RFECV, then cross-validation of ``dtree``
# on just those columns.
X_rfe = data1[data1_x_bin].columns.values[dtree_rfe.get_support()]
rfe_results = model_selection.cross_validate(dtree, data1[X_rfe], data1[Target], cv  = cv_split)


#tune rfe model
# Same grid search as above, restricted to the selected columns.
rfe_tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
rfe_tune_model.fit(data1[X_rfe], data1[Target])



Beispiel #22
0
def test_svm(Xa, ya, Xl, yl):
    """Compare L1/L2 linear SVM pipelines and RFECV on two data sets.

    Written for Python 2 (``print`` statements). Prints the average number
    of non-zero elements reported by ``get_nonzeros`` for both data sets,
    and, when the outer-scope flag ``complete`` is true, the
    cross-validation scores of each pipeline on both data sets.

    :param Xa: samples of the data set reported as ARCENE.
    :param ya: labels of the data set reported as ARCENE.
    :param Xl: samples of the data set reported as LEUKEMIA.
    :param yl: labels of the data set reported as LEUKEMIA.
    :return: None.
    """

    folds = 5
    # passed to RFECV as ``step``
    step_delta = 0.05

    print '\nTesting L1 SVM...'
    iters = 10

    arc_nnz = get_nonzeros(Xa, ya, iterations=iters)
    print 'ARCENE   - Average number of non-zero elements in', iters, 'iterations:', arc_nnz

    leu_nnz = get_nonzeros(Xl, yl, iterations=iters)
    print 'LEUKEMIA - Average number of non-zero elements in', iters, 'iterations:', leu_nnz

    # setup cross validation folds
    # NOTE(review): ``cross_val`` is never used below; the scoring calls
    # pass ``cv=folds`` instead - confirm which was intended.
    cross_val = cross_validation.StratifiedKFold(ya, folds, shuffle=True)

    svm_l1 = svm.LinearSVC(penalty='l1', dual=False)
    svm_l2 = svm.LinearSVC(penalty='l2', dual=False)

    rfe = fs.RFECV(svm_l2, step=step_delta, cv=folds)

    # setup pipes and estimators
    # four variants: L1 only, L1 followed by L2, L2 only, RFECV around L2
    estimators0 = [('svm1', svm_l1)]
    estimators1 = [('svm1', svm_l1), ('svm2', svm_l2)]
    estimators2 = [('svm2', svm_l2)]
    estimators3 = [('rfe', rfe)]

    pipes = [
        pipeline.Pipeline(estimators0),
        pipeline.Pipeline(estimators1),
        pipeline.Pipeline(estimators2),
        pipeline.Pipeline(estimators3)
    ]

    # ``complete`` is not defined in this function; it is read from the
    # enclosing scope.
    if complete == True:
        print '\nARCENE RESULTS................'
        for pipe in pipes:

            # CHECK SCORES
            cv_scores = cross_validation.cross_val_score(pipe,
                                                         Xa,
                                                         ya,
                                                         cv=folds)

            print 'Cross val scores for', folds, 'folds:'
            print np.mean(cv_scores), cv_scores

        print '\nLEUKEMIA RESULTS................'
        for pipe in pipes:

            # CHECK SCORES
            cv_scores = cross_validation.cross_val_score(pipe,
                                                         Xl,
                                                         yl,
                                                         cv=folds)

            print 'Cross val scores for', folds, 'folds:'
            print np.mean(cv_scores), cv_scores

        print 'Cross validation for SVMs complete!'

    return
# Rank features by the importances of ``etrmodel`` (fitted outside this
# fragment), most important first.
fi=etrmodel.feature_importances_
val=x.columns.values
zzz=pd.DataFrame({"value":val,"fi":fi})
petr=zzz.sort_values(by="fi",ascending=False)
# bare expression: displays the table when run interactively
petr

#feature selection using random forest classifier
rf=ensemble.RandomForestClassifier(max_depth=10)
rf.fit(xtrain,ytrain)
fi=rf.feature_importances_
val=x.columns.values
zzz=pd.DataFrame({"value":val,"fi":fi})
prf=zzz.sort_values(by="fi",ascending=False)
# bare expression: displays the table when run interactively
prf

# Cross-validated recursive feature elimination with extra trees, scored
# by recall over 5 folds; ``zzz`` is reused for the selected column labels.
rfecv=feature_selection.RFECV(estimator=ensemble.ExtraTreesClassifier(),min_features_to_select=1,cv=5,scoring="recall")
rfecv.fit(xtrain,ytrain)
zzz=xtrain.columns[rfecv.get_support()]
print(zzz)




def modelstats2(Xtrain,Xtest,ytrain,ytest):
    """Prepare the candidate classifiers and their display names.

    NOTE(review): the body looks truncated. As written it only builds the
    local lists below, never touches its arguments, and returns ``None``.

    :param Xtrain: training features (unused in the visible body).
    :param Xtest: test features (unused in the visible body).
    :param ytrain: training labels (unused in the visible body).
    :param ytest: test labels (unused in the visible body).
    """
    stats = list()
    # Display names, in the same order as the estimators in ``models``.
    modelnames = ["LogisticReg", "DecisionTree", "KNN", "NB"]
    models = [
        linear_model.LogisticRegression(),
        tree.DecisionTreeClassifier(),
        neighbors.KNeighborsClassifier(),
        naive_bayes.GaussianNB(),
    ]
Beispiel #24
0
# coding=utf-8
"""Recursive Feature Elimination with Support Vector Machines Classifier."""

import matplotlib.pyplot as plt
from sklearn import datasets, svm, feature_selection, model_selection


if __name__ == "__main__":
    # Synthetic 3-class problem: 20 features, of which 5 are informative,
    # 3 redundant and 2 repeated; seeded for reproducibility.
    print("Generating data...")
    X, y = datasets.make_classification(3000, 20, n_informative=5, n_redundant=3, n_repeated=2, n_classes=3,
                                        n_clusters_per_class=1, random_state=37)

    # Recursive feature elimination driven by a linear SVC, scored by accuracy
    # on a 2-fold stratified split.
    print("Performing recursive feature selection with crossvalidation.")
    svc = svm.SVC(kernel="linear")
    cvrfs = feature_selection.RFECV(svc, cv=model_selection.StratifiedKFold(2), scoring="accuracy")
    cvrfs.fit(X, y)
    print("Optimal selected features: {0}".format(cvrfs.n_features_))

    # `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer the mean test score from `cv_results_` when it exists and fall
    # back to `grid_scores_` on older releases.
    if hasattr(cvrfs, "cv_results_"):
        mean_scores = cvrfs.cv_results_["mean_test_score"]
    else:
        mean_scores = cvrfs.grid_scores_

    # One point per candidate feature count (1 .. number of candidates).
    print("Plotting results...")
    plt.plot(range(1, len(mean_scores) + 1), mean_scores)
    plt.show()
Beispiel #25
0
######################### normalized features / use the whole data to train ###################
# feature_train, feature_test, label_train, label_test = train_test_split(normalized_feature, label, test_size=0.25, random_state=0)
# No hold-out split is made here: all of `normalized_feature` / `label`
# (defined outside this snippet) is used for training.
feature_train = normalized_feature
label_train = label

######################### feature selection ######################
# univariate feature selection with F test for feature scoring
# (SelectPercentile with percentile=30 and f_classif as the score function)
selector = feature_selection.SelectPercentile(feature_selection.f_classif,
                                              percentile=30)
selector = selector.fit(feature_train, label_train)
fselected_feature_train = selector.transform(feature_train)

# rfecv + random forest
# Second stage: recursive feature elimination with 5-fold CV (accuracy),
# dropping one feature per step, applied to the output of the first stage.
rfecv = feature_selection.RFECV(ensemble.RandomForestClassifier(),
                                step=1,
                                cv=5,
                                scoring='accuracy')
rfecv = rfecv.fit(fselected_feature_train, label_train)
# Overwrite the stage-one matrix with only the columns RFECV kept.
fselected_feature_train = rfecv.transform(fselected_feature_train)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
# plt.figure()
# plt.xlabel("Number of features selected")
# plt.ylabel("Cross validation score")
# plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

# plt.show()

######################### start training using SVM ######################
Beispiel #26
0
def get_relev_class_RFE(X, y, random_state=None, params=None):
    """Return a 0/1 relevance mask from cross-validated RFE.

    An ``RFECV`` selector (5-fold CV) is fitted around the estimator returned
    by ``model(random_state=..., params=...)``; ``model`` is defined elsewhere
    in the file.

    :param X: feature matrix passed to ``RFECV.fit``.
    :param y: target values passed to ``RFECV.fit``.
    :param random_state: forwarded to ``model``.
    :param params: forwarded to ``model``.
    :return: the selector's ``support_`` mask cast to ``int``.
    """
    estimator = model(random_state=random_state, params=params)
    selector = fs.RFECV(estimator, cv=5)
    selector.fit(X, y)
    support_mask = selector.support_
    return support_mask.astype(int)
Beispiel #27
0
# feature selection using cross validation
class PipelineRFE(Pipeline):
    """Pipeline that mirrors its last step's ``feature_importances_``.

    After fitting, the attribute is copied from the final step onto the
    pipeline itself, so code that looks for ``feature_importances_`` on the
    estimator it was given can use the pipeline directly.
    """

    def fit(self, X, y=None, **fit_params):
        """Fit every step, then expose the final step's importances on self."""
        super().fit(X, y, **fit_params)
        last_step = self.steps[-1]
        final_estimator = last_step[-1]
        self.feature_importances_ = final_estimator.feature_importances_
        return self


# Standardize the features, then fit an extra-trees regressor (25 trees,
# fixed seed) as the estimator used for the recursive elimination.
pipe = PipelineRFE([('std_scaler', preprocessing.StandardScaler()),
                    ("ET", ExtraTreesRegressor(random_state=42,
                                               n_estimators=25))])

# In[39]:

# 5-fold cross-validated RFE, removing one feature per step, scored by
# negative mean squared error. `X` and `y` are defined outside this snippet.
feature_selector_cv = feature_selection.RFECV(pipe,
                                              cv=5,
                                              step=1,
                                              scoring="neg_mean_squared_error")
feature_selector_cv.fit(X, y)

# In[40]:

print(feature_selector_cv.n_features_)
# The scores are negative MSE (see `scoring` above), so negate them before
# taking the square root to obtain RMSE.
cv_grid_rmse = np.sqrt(-feature_selector_cv.grid_scores_)
print(cv_grid_rmse)

# * Feature selection kept 12 features in the original run. Let's list them.

# In[41]:

# Map the boolean support mask back to column names.
# NOTE(review): assumes `X` is a DataFrame (it is accessed via `.columns`).
feature_names = (X.columns)
selected_features = feature_names[feature_selector_cv.support_].tolist()
def correlation_supervisor(path,
                           rootdir,
                           simple=False,
                           lig_only=False,
                           max_descriptors=False):
    """Fit linear models on descriptor vectors and report RFE / RFECV results.

    The entries loaded by ``accquire_file(path)`` supply a target value
    (``yvalue``) and a descriptor vector each. The descriptors are
    standardized, a full linear regression is fitted as a baseline, plain RFE
    is run down to a single feature (recording the training R2 / MSE at every
    size), and RFECV with leave-one-out CV picks the best feature count.
    A summary is printed and several CSV files are written.

    :param path: input location handed to ``accquire_file``.
    :param rootdir: prefix for every output file; it is concatenated directly
        with the file name, so it must end with a path separator.
    :param simple: forwarded to ``get_descriptor_vector``.
    :param lig_only: forwarded to ``get_descriptor_vector``.
    :param max_descriptors: optional cap on the number of descriptors. When
        the RFECV optimum exceeds it, this is reported and the top-ranked
        entries are collected, but the CSV outputs still use the RFECV optimum.
    :return: None. Results are printed and written under ``rootdir``.
    """
    # load the files from the given input file
    file_dict, fail_dict = accquire_file(path)
    # loop over successful imports to get descriptors
    big_mat = list()
    col_names = list()
    for i, entry in enumerate(file_dict.values()):
        entry.get_descriptor_vector(lig_only,
                                    simple,
                                    name=False,
                                    loud=False)
        if i == 0:
            # column names are taken from the first entry only
            col_names = entry.descriptor_names
        # one row per entry: target value first, then the descriptors
        this_row = [float(entry.yvalue)]
        this_row.extend(entry.descriptors)
        big_mat.append(this_row)
    big_mat = np.array(big_mat)

    ### standardize the descriptors
    col_array = np.array(col_names)
    print('length of col array is  ' + str(len(col_array)))
    n_tot = len(col_array)
    X = big_mat[:, 1:]
    print('dimension of data matrix is ' + str(big_mat.shape))
    # row count; unlike len(X[:, 1]) this also works with a single descriptor
    n_obs = X.shape[0]
    Scaler = preprocessing.StandardScaler().fit(X)
    Xs = Scaler.transform(X)
    Y = big_mat[:, 0]

    ## find baseline model (all descriptors)
    Reg = linear_model.LinearRegression()
    Reg.fit(Xs, Y)
    Ypred_all_all = Reg.predict(Xs)
    rs_all_all = metrics.r2_score(Y, Ypred_all_all)
    loo = model_selection.LeaveOneOut()

    ### stepwise reduce the feature set until only one is left, recording the
    ### training R2 / MSE after each reduction (entry n <-> n features removed)
    r_reduce = list()
    mse_reduce = list()
    for n in range(0, n_tot):
        # n_features_to_select is passed by keyword: newer scikit-learn
        # releases no longer accept it positionally
        reductor = feature_selection.RFE(Reg,
                                         n_features_to_select=n_tot - n,
                                         step=1,
                                         verbose=0)
        reductor.fit(Xs, Y)
        Ypred_all = reductor.predict(Xs)
        r_reduce.append(metrics.r2_score(Y, Ypred_all))
        mse_reduce.append(metrics.mean_squared_error(Y, Ypred_all))

    ### the last reductor is the one reduced to a single feature; its ranking
    ### orders all features by elimination round
    reductor_features = [[col_array[i], ranks]
                         for i, ranks in enumerate(reductor.ranking_)]
    reductor_features = sorted(reductor_features, key=lambda x: x[1])
    print('****************************************')

    ### select best number using cv
    selector = feature_selection.RFECV(Reg,
                                       step=1,
                                       cv=loo,
                                       verbose=0,
                                       scoring='neg_mean_squared_error')
    selector.fit(Xs, Y)
    # grid_scores_ was removed in scikit-learn 1.2; use the mean test score
    # from cv_results_ when available and fall back on older releases
    if hasattr(selector, 'cv_results_'):
        select_mse = selector.cv_results_['mean_test_score']
    else:
        select_mse = selector.grid_scores_
    Ypred = selector.predict(Xs)
    rs = metrics.r2_score(Y, Ypred)
    n_opt = selector.n_features_
    opt_features = col_array[selector.support_]
    ranked_features = [[col_array[i], ranks]
                       for i, ranks in enumerate(selector.ranking_)]
    ranked_features = sorted(ranked_features, key=lambda x: x[1])
    print(ranked_features)
    if max_descriptors:  ## check if we need to reduce further
        print('a max of ' + str(max_descriptors) + ' were requested')
        n_max = int(max_descriptors)
        if n_opt > n_max:
            print('the RFE process selected ' + str(n_opt) +
                  ' varibles as optimal')
            # number of surplus variables (positive inside this branch)
            print('discarding an additional ' + str(n_opt - n_max) +
                  ' variables')
            # NOTE(review): collected but not applied to the outputs below
            new_variables = ranked_features[:n_max]

    ## report results to user
    print('analzyed ' + str(n_obs) + ' molecules')
    print('the full-space R2 is  ' + str("%0.2f" % rs_all_all) + ' with ' +
          str(n_tot) + ' features')
    print('optimal number of features is ' + str(n_opt) + ' of total ' +
          str(n_tot))
    print('the opt R2 is  ' + str("%0.2f" % rs))

    ## refit a plain regression on the selected descriptors only
    X_r = selector.transform(Xs)
    reg_red = linear_model.LinearRegression()
    reg_red.fit(X_r, Y)
    Ypred_r = reg_red.predict(X_r)
    coefs = reg_red.coef_
    intercept = reg_red.intercept_
    mse_all = metrics.mean_squared_error(Y, Ypred_all_all)
    mse_r = metrics.mean_squared_error(Y, Ypred_r)
    if n_opt < 30:
        print('the optimal variables are: ' + str(opt_features))
        print('the coefficients are' + str(coefs))
    else:
        # slice [0:30] so that thirty items are really shown
        print('the (first 30) optimal variables are: ' +
              str(opt_features[0:30]))
        print('the (first 30) coefficients are' + str(coefs[0:30]))
    print('the intercept is ' + str("%0.2f" % intercept))
    print('the  training MSE with the best feature set is ' +
          str("%0.2f" % mse_r))
    print('the MSE  with all features  is ' + str("%0.2f" % mse_all))
    # RFECV scores are ordered by increasing feature count (step=1, minimum of
    # one feature): index k holds the score for k + 1 features. The full set
    # is therefore the last entry and the optimum sits at n_opt - 1.
    cv_mse_full = abs(select_mse[-1])
    cv_mse_opt = abs(select_mse[n_opt - 1])
    print('by eliminating ' + str(n_tot - n_opt) + ' features,' +
          ' CV-prediction MSE decreased from ' +
          str("%0.0f" % cv_mse_full) + ' to ' +
          str("%00f" % cv_mse_opt))

    ## write the result tables
    with open(rootdir + 'RFECV_rankings.csv', 'w') as f:
        f.write('RFE_rank,RFE_col,RFECV_rank,RFECV_col, \n')
        for rfe_item, rfecv_item in zip(reductor_features, ranked_features):
            f.write(
                str(rfe_item[0]) + ',' + str(rfe_item[1]) + ',' +
                str(rfecv_item[0]) + ',' + str(rfecv_item[1]) + '\n')
    with open(rootdir + 'y_data.csv', 'w') as f:
        for items in Y:
            f.write(str(items) + '\n')
    with open(rootdir + 'y_pred_r.csv', 'w') as f:
        for items in Ypred_r:
            f.write(str(items) + '\n')
    with open(rootdir + 'optimal_decriptor_space.csv', 'w') as f:
        # one comma-separated line per molecule, selected descriptors only
        for row in X_r:
            f.write(','.join(str(value) for value in row) + '\n')
    with open(rootdir + 'full_descriptor_space.csv', 'w') as f:
        # header keeps a trailing comma after the last name
        for names in col_names:
            f.write(names + ',')
        f.write('\n')
        for row in Xs:
            f.write(','.join(str(value) for value in row) + '\n')
    with open(rootdir + 'scaling.csv', 'w') as f:
        means = Scaler.mean_
        var = Scaler.var_
        f.write('name, mean,variance \n')
        # each row also carries the RFECV rank as a fourth field
        for i in range(0, n_tot):
            f.write(
                str(col_names[i]) + ',' + str(means[i]) + ',' + str(var[i]) +
                ',' + str(selector.ranking_[i]) + '\n')
    with open(rootdir + 'coeficients.csv', 'w') as f:
        f.write('intercept,' + str(intercept) + '\n')
        for feature_name, coef in zip(opt_features, coefs):
            f.write(str(feature_name) + ',' + str(coef) + '\n')
    with open(rootdir + 'rfe_mse.csv', 'w') as f:
        f.write('features removed,mean CV error,' + str(intercept) + '\n')
        # one row per RFE step: number of features removed, training MSE
        for count, items in enumerate(mse_reduce):
            f.write(str(count) + ',' + str(items) + '\n')
Beispiel #29
0
## -->> Select the best features with cross-validated recursive elimination
# RFECV function
## Flatten the Label array to one dimension (length = number of rows)
Labels = Labels.reshape(Labels.shape[0],)

## Set folds for nested cross validation
# NOTE(review): `nr` is presumably numpy.random; seeding it makes the shuffled
# folds reproducible only if KFold draws from that global state - confirm.
nr.seed(988)
feature_folds = ms.KFold(n_splits=10, shuffle = True)

## Define the model
# Logistic regression with C=10 and explicit per-class weights (0: 0.45, 1: 0.55)
logistic_mod = linear_model.LogisticRegression(C = 10, class_weight = {0:0.45, 1:0.55}) 

## Perform feature selection by CV with high variance features only
# (`Features_reduced` is defined outside this snippet)
nr.seed(6677)
selector = fs.RFECV(estimator = logistic_mod, cv = feature_folds,
                      scoring = 'roc_auc')
selector = selector.fit(Features_reduced, Labels)
# boolean mask of the selected features (bare expression, notebook display)
selector.support_ 

# relative ranking of the features
selector.ranking_

# transform method applies the selector to the feature array
# (overwrites Features_reduced with only the selected columns)
Features_reduced = selector.transform(Features_reduced)
Features_reduced.shape

# plot of AUC (the metric) vs. the number of features
plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)
plt.title('Mean AUC by number of features')
plt.ylabel('AUC')
plt.xlabel('Number of features')
Beispiel #30
0
    "BusinessTravel", "Department", "EducationField", "Gender", "JobRole",
    "MaritalStatus", "OverTime"
]
# Prefixes for the one-hot columns, one per entry of `cols` (same order).
pref = ["btravel", "dept", "edufield", "g", "jr", "m", "ot"]

# One-hot encode the categorical columns listed in `cols`.
Xohe = pd.get_dummies(X, columns=cols, prefix=pref)
# Bare expressions below only display something in a notebook session.
X.head()
Xohe.columns.values.shape
Xohe.columns.values

# Stratified 80/20 train/test split with a fixed seed.
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    Xohe, y, test_size=.2, random_state=42, stratify=y)

# Cross-validated recursive feature elimination: logistic regression (C=0.1),
# 5-fold CV, three features removed per step, scored by ROC AUC.
selector = feature_selection.RFECV(
    estimator=linear_model.LogisticRegression(C=.1),
    cv=5,
    step=3,
    scoring="roc_auc")
selector.fit(Xtrain, ytrain)
Xohe.columns.values[selector.get_support()].shape
# Keep only the selected columns in both splits.
Xtrain1 = Xtrain[Xohe.columns.values[selector.get_support()]]
Xtest1 = Xtest[Xohe.columns.values[selector.get_support()]]

# Model statistics on the reduced feature set (`modelstats` is defined elsewhere).
print(modelstats(Xtrain1, Xtest1, ytrain, ytest))

# Correlation between the target and three continuous columns of `eadf`.
Xcont = eadf[["Attrition", "DailyRate", "MonthlyIncome", "MonthlyRate"]]
sns.heatmap(Xcont.corr(), annot=True)
Xcont.corr()

# Same statistics on the full one-hot feature set, for comparison.
print(modelstats(Xtrain, Xtest, ytrain, ytest))