Example No. 1
    def fit(self, scenario: ASlibScenario, fold: int, num_instances: int):
        self._num_algorithms = len(scenario.algorithms)
        self._algorithm_cutoff_time = scenario.algorithm_cutoff_time

        # resample `num_instances` training instances and preprocess them accordingly
        features, performances = self._resample_instances(
            scenario.feature_data.values, scenario.performance_data.values, num_instances, random_state=fold)
        features, performances = self._preprocess_scenario(
            scenario, features, performances)

        base_model = Ridge(alpha=1.0, random_state=fold)
        scorer = make_scorer(mean_squared_error, greater_is_better=False)
        sfs_params = {'estimator': base_model, 'k_features': 'best',
                      'forward': True, 'scoring': scorer, 'cv': 2}

        for num in range(self._num_algorithms):
            feature_selector = SequentialFeatureSelector(**sfs_params)
            feature_selector = feature_selector.fit(
                features, performances[:, num])
            self._features[num] = feature_selector.k_feature_idx_

            features_tmp = PolynomialFeatures(2).fit_transform(
                features[:, self._features[num]])

            feature_selector = SequentialFeatureSelector(**sfs_params)
            feature_selector = feature_selector.fit(
                features_tmp, performances[:, num])
            self._quad_features[num] = feature_selector.k_feature_idx_
            features_tmp = features_tmp[:, self._quad_features[num]]

            censored = performances[:, num] >= self._algorithm_cutoff_time
            self._models[num] = impute_censored(
                features_tmp, performances[:, num], censored, base_model, distr_func, self._algorithm_cutoff_time)
Example No. 2
    def _fit_linear_approximation(self, run_fffs):
        selected_features_x = list(self.x.columns)
        if run_fffs:
            feature_selector = SequentialFeatureSelector(
                LinearRegression(normalize=True),  # note: normalize was removed in scikit-learn 1.2; standardize beforehand on newer versions
                k_features=max(int(np.sqrt(self.x.shape[1])),
                               self.zeds_df.shape[1]),
                forward=True,
                verbose=2,
                cv=5,
                n_jobs=-1,
                scoring='r2')

            features = feature_selector.fit(self.x, self.y)
            selected_columns = list(features.k_feature_names_)
            selected_columns.extend([
                list(self.x.columns)[i]
                for i in list(self.zeds_df.columns.astype(int))
            ])
            # de-duplicate while preserving column order (indexing with a set is unsupported)
            selected_features_x = self.x[list(dict.fromkeys(selected_columns))]
            m = self.get_best_linear_model(selected_features_x, self.y)
            m.fit(selected_features_x, self.y)
        else:
            m = self.get_best_linear_model(self.x, self.y)
            m.fit(self.x, self.y)

        return m, selected_features_x
Example No. 3
def forward_feature_selection_decision_tree(X_train, y_train_binned):
    """
    Selects features using Feedforward Feature Selection using a Decision Tree Classifier.
    -- RATIONALE  I had aimed to write my own function to let the number of features to select be variable, however
    due to time constraints I did not implement such a version. For now I selected the number of features (7), based on
    visual inspection of the Forward Feature Selection plots. --
    Parameters
    -----------
    X_train: training split of feature variables with continuous values
    y_train_binned: training split of feature variables with 3 class values
    Returns
    -----------
    """
    clf = tree.DecisionTreeClassifier()
    # Build step forward feature selection
    sfs = SequentialFeatureSelector(clf,
                                    k_features=7,
                                    forward=True,
                                    floating=False,
                                    verbose=2,
                                    scoring='accuracy',  # classification target, so accuracy rather than r2
                                    cv=5)

    # Perform Sequential Feature Selection
    sfs = sfs.fit(X_train, y_train_binned)
    selected_feature_names = sfs.k_feature_names_
    return selected_feature_names
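
Both this example and the next hard-code the subset size. A minimal sketch of the variable-size variant the rationale mentions, on synthetic data: mlxtend accepts k_features='best' (or a (min, max) range) and keeps whichever subset cross-validates best.

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)

sfs_auto = SequentialFeatureSelector(DecisionTreeClassifier(random_state=0),
                                     k_features='best',  # or a range such as (3, 8)
                                     forward=True,
                                     scoring='accuracy',
                                     cv=5)
sfs_auto = sfs_auto.fit(X_demo, y_demo)
print(len(sfs_auto.k_feature_idx_), sfs_auto.k_score_)  # chosen size and its CV score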
Example No. 4
def forward_feature_selection_linear_regression(X_train, y_train):
    """
    Selects features using Feedforward Feature Selection using a Linear Regression.
    -- RATIONALE  I had aimed to write my own function to let the number of features to select be variable, however
    due to time constraints I did not implement such a version. For now I selected the number of features (9), based on
    visual inspection of the Forward Feature Selection plots. --
    Parameters
    -----------
    Returns
    -----------
    """
    regr = LinearRegression()
    # Build step forward feature selection
    sfs = SequentialFeatureSelector(regr,
                                    k_features=9,
                                    forward=True,
                                    floating=False,
                                    verbose=2,
                                    scoring='r2',
                                    cv=5)

    # Perform sequential forward selection
    sfs = sfs.fit(X_train, y_train)
    selected_feature_names = sfs.k_feature_names_
    return selected_feature_names
Example No. 5
def apply_SFS(classifiers, X_in, Y_in, sel_feat='best'):
    models_result = {}
    models_result['Forward'] = []
    models_result['Backward'] = []

    for forward in [True, False]:
        for model in classifiers:
            model_name = type(model).__name__
            if model_name not in models_result:
                models_result[model_name] = {}
                models_result[model_name]['Forward Features'] = None
                models_result[model_name]['Forward Index'] = None
                models_result[model_name]['Backward Features'] = None
                models_result[model_name]['Backward Index'] = None

            sfs_obj = SequentialFeatureSelector(model, k_features=sel_feat, forward=forward)
            sfs = sfs_obj.fit(X_in, Y_in)

            if forward:
                models_result['Forward'].append(sfs.k_score_)
                models_result[model_name]['Forward Features'] = sfs.k_feature_names_
                models_result[model_name]['Forward Index'] = sfs.k_feature_idx_
            else:
                models_result['Backward'].append(sfs.k_score_)
                models_result[model_name]['Backward Features'] = sfs.k_feature_names_
                models_result[model_name]['Backward Index'] = sfs.k_feature_idx_

    return models_result
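
A quick usage sketch for apply_SFS, assuming two scikit-learn classifiers and a small synthetic dataset (both illustrative):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=150, n_features=6, random_state=0)
X_demo = pd.DataFrame(X_demo, columns=[f'f{i}' for i in range(6)])  # column names feed k_feature_names_

results = apply_SFS([LogisticRegression(max_iter=1000), DecisionTreeClassifier()], X_demo, y_demo)
print(results['LogisticRegression']['Forward Features'])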
Example No. 6
def feature_selection(train_X, valid_X, test_X, train_Y, i):
    c1 = SVC(C=0.01, kernel="linear")
    c2 = RandomForestClassifier(n_estimators=50, max_depth=10)
    c3 = KNeighborsClassifier(n_neighbors=150)
    c4 = SGDClassifier(loss="huber", penalty="l1")
    c5 = DecisionTreeClassifier(criterion="gini", min_samples_split=250)
    c6 = LinearDiscriminantAnalysis(solver="lsqr")
    c7 = naive_bayes.BernoulliNB()
    c8 = MLPClassifier(hidden_layer_sizes=(5, 3))
    c9 = GradientBoostingClassifier(random_state=0,
                                    n_estimators=100,
                                    learning_rate=0.1)
    c10 = VotingClassifier(estimators=[('a', c1), ('b', c2), ('c', c3),
                                       ('d', c4), ('e', c5), ('f', c6),
                                       ('g', c7), ('h', c8), ('i', c9)])
    features = set(train_X.columns)
    fs = SequentialFeatureSelector(c10,
                                   k_features=i,
                                   forward=False,
                                   verbose=0,
                                   scoring='accuracy',
                                   cv=4)
    fs.fit(train_X, train_Y)

    selected_features = set(fs.k_feature_names_)
    print(fs.subsets_)
    features_to_drop = list(features - selected_features)

    return train_X.drop(features_to_drop, axis=1), valid_X.drop(features_to_drop, axis=1), \
           test_X.drop(features_to_drop, axis=1)
Example No. 7
def run_experiment(X, y, clf, protected_groups, unfairness_metric, unfairness_weight):
    metric = unfairness_metrics.UnfairnessMetric(protected_groups, unfairness_metric)
    unfairness_scorer = metrics.make_scorer(metric)
    unfairness_means = []
    auc_means = []
    selected_feature_props = np.zeros([ITERATIONS, X.shape[1]])
    for i in tqdm(range(ITERATIONS), desc=' Training ' + clf.__class__.__name__):
        xval = model_selection.KFold(4, shuffle=True, random_state=i)
        # Make a metric combining accuracy and subtracting unfairness w.r.t. the protected groups
        metric = unfairness_metrics.CombinedMetric(ACCURACY_METRIC, protected_groups,
                                                   unfairness_metric, unfairness_weight)
        combined_scorer = metrics.make_scorer(metric)
        sfs = SequentialFeatureSelector(clf, k_features='best', verbose=0, cv=xval,
                                        scoring=combined_scorer, n_jobs=2)
        pipe = pipeline.Pipeline([
            ('standardize', preprocessing.StandardScaler()),
            ('feature_selection', sfs),
            ('model', clf),
        ])
        result = model_selection.cross_validate(pipe, X, y, verbose=0, cv=xval, scoring={
            'unfairness': unfairness_scorer,
            'auc': metrics.make_scorer(ACCURACY_METRIC),
        }, return_estimator=True)
        unfairness_means.append(result['test_unfairness'].mean())
        auc_means.append(result['test_auc'].mean())
        for estimator in result['estimator']:
            for feature_i in estimator.named_steps['feature_selection'].k_feature_idx_:
                selected_feature_props[i][feature_i] += 1 / len(result['estimator'])
    return unfairness_means, auc_means, selected_feature_props
Example No. 8
def FeatureSelection(pipeline_name, data_dev_mode, tag, train_filepath,
                     test_filepath):
    logger.info('FEATURE SELECTION...')

    if bool(config.params.clean_experiment_directory_before_training) \
            and os.path.isdir(config.params.experiment_dir):
        logger.info('Cleaning experiment directory...')
        shutil.rmtree(config.params.experiment_dir)

    data = _read_data(data_dev_mode, train_filepath, test_filepath)

    train_set = data['train']

    y = train_set[config.TARGET_COL].values.reshape(-1, )
    train_set = train_set.drop(columns=config.TARGET_COL)

    pipeline = PIPELINES[pipeline_name](so_config=config.SOLUTION_CONFIG,
                                        suffix=tag)

    sfs = SequentialFeatureSelector(estimator=pipeline,
                                    k_features=(10, len(train_set.columns)),
                                    forward=False,
                                    verbose=2,
                                    cv=5,
                                    scoring='roc_auc')
    sfs.fit(train_set.to_numpy(), y)

    fig = plot_sequential_feature_selection(sfs.get_metric_dict())
    plt.ylim([0.6, 1])
    plt.title('Sequential Feature Selection')
    plt.grid()
    plt.show()
Example No. 9
def doSFS(runDict, save=True):  # note: x and y are module-level globals here
    for runName, subDict in runDict.items():
        for forward in [True, False]:
            print(runName, forward)
            featureSelector = SequentialFeatureSelector(subDict['clf'],
                                                        k_features=(1, 50),
                                                        forward=forward,
                                                        verbose=2,
                                                        scoring="accuracy",
                                                        cv=5,
                                                        n_jobs=-1)
            if forward:
                subDict['Ffeatures'] = featureSelector.fit(x, y)
                subDict['FfilteredFeatures'] = x.columns[list(subDict['Ffeatures'].k_feature_idx_)]
            else:
                subDict['Bfeatures'] = featureSelector.fit(x, y)
                subDict['BfilteredFeatures'] = x.columns[list(subDict['Bfeatures'].k_feature_idx_)]
            if save:
                forwardsOrBackwards = 'Bfeatures'
                if forward:
                    forwardsOrBackwards = 'Ffeatures'
                saveName = runName + '_' + forwardsOrBackwards
                pickling.save_dill(subDict[forwardsOrBackwards].subsets_,
                                   saveName)
    return runDict
Example No. 10
def step_forward_selection_by_random_forest(features_to_select=27,
                                            df=df_train,
                                            to_print=True):

    if to_print:
        print(
            '\nStarting step forward feature selection test using RandomForest classifier.'
        )

    df_features = drop_label_column(df)
    df_label = get_label_column(df)

    feature_selector = SequentialFeatureSelector(
        RandomForestClassifier(n_jobs=-1, n_estimators=100),
        k_features=features_to_select,
        forward=True,
        verbose=2,
        cv=4)
    features = feature_selector.fit(df_features, df_label)
    filtered_features = df_features.columns[list(features.k_feature_idx_)]

    if to_print:
        print('Selected {} features. Features are: \n{}'.format(
            len(filtered_features), filtered_features))

    return filtered_features
Example No. 11
def backward(X_train, Y_train):
    rf_sfs = RandomForestRegressor(n_estimators=100, max_depth=50, oob_score=False, n_jobs=-1)
    SFS_b = SequentialFeatureSelector(rf_sfs, forward=False, k_features=6,
                                      scoring='neg_mean_squared_error', n_jobs=-1)
    SFS_b = SFS_b.fit(X_train.values, Y_train.values)
    # fitting on .values makes k_feature_names_ hold string column positions,
    # so map them back to the original column names
    indxs = list(SFS_b.k_feature_names_)
    str_cols = X_train.columns
    features = {str_cols[int(i)] for i in indxs}
    print(features)
Example No. 12
def smoteenn_sffs_reduction_classify_full():

    (X, Y), feature_names = read_dataset(
        screening='')  # no screening results, only risk factors

    # dataset resampling for imbalanced data compensation
    smoteenn = SMOTEENN()

    Xres, Yres = smoteenn.fit_resample(X, Y)  # resampled dataset
    print('Resampling')
    print('Original dataset size:', Counter(Y))
    print('Resampled dataset size:', Counter(Yres))

    # feature selection using sequential forward floating selection and tuned SVM
    scoring = [
        'accuracy', 'precision', 'recall', 'balanced_accuracy',
        'average_precision', 'brier_score_loss', 'neg_log_loss'
    ]

    param_grid = {'C': np.logspace(-3, 3, 7), 'kernel': ['rbf']}

    grid = GridSearchCV(estimator=SVC(probability=True, gamma='scale'),
                        param_grid=param_grid,
                        n_jobs=-1,
                        verbose=10,
                        cv=5,
                        scoring=scoring,
                        refit='balanced_accuracy',
                        iid=False,  # note: removed in scikit-learn 0.24; drop on newer versions
                        error_score=0)

    grid.fit(Xres, Yres)
    print(grid.best_params_)

    selector = SequentialFeatureSelector(
        forward=False,
        floating=True,
        k_features='best',
        verbose=2,
        n_jobs=-1,
        scoring='balanced_accuracy',
        cv=5,
        estimator=SVC(probability=True,
                      gamma='scale',
                      kernel=grid.best_params_['kernel'],
                      C=grid.best_params_['C']))

    selector.fit(Xres, Yres, custom_feature_names=feature_names)

    with open('smoteenn_sbfs.pkl', 'wb') as f:
        pickle.dump(selector, f, -1)

    df = pd.DataFrame(selector.subsets_)
    df.to_csv('smoteenn_sbfs.csv')
Example No. 13
def classification(df, y):
    feature_selector = SequentialFeatureSelector(
        RandomForestClassifier(n_jobs=-1),
        k_features=len(df.keys()),
        forward=True,
        verbose=2,
        scoring='roc_auc',
        cv=4)

    features = feature_selector.fit(np.array(df), y)
    filtered_features = df.columns[list(features.k_feature_idx_)]
    return filtered_features
Example No. 14
def selector(df):
    x_data, y_data = get_data(df)
    x_data_scaled = StandardScaler().fit_transform(x_data)

    selector = SequentialFeatureSelector(LogisticRegression(),
                                         scoring='neg_log_loss',
                                         verbose=2,
                                         k_features=3,
                                         forward=False,
                                         n_jobs=-1)
    selector.fit(x_data_scaled, y_data)

    # the fitted selector carries its results as attributes (see the sketch below)
    return selector
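
As noted above, the results live on the fitted selector. A self-contained sketch (synthetic data stands in for get_data):

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X_demo, y_demo = make_classification(n_samples=200, n_features=8, random_state=0)
X_demo = StandardScaler().fit_transform(X_demo)

sel = SequentialFeatureSelector(LogisticRegression(), k_features=3,
                                forward=False, scoring='neg_log_loss', n_jobs=-1)
sel.fit(X_demo, y_demo)
print(sel.k_feature_idx_)  # indices of the three surviving features
print(sel.k_score_)        # their cross-validated score
print(sel.subsets_)        # score and membership at every elimination step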
Example No. 15
def test8():
    # Example 3 - Majority voting with classifiers trained on different feature subsets
    from sklearn import datasets

    iris = datasets.load_iris()
    X, y = iris.data[:, :], iris.target

    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier
    from sklearn.pipeline import Pipeline
    from mlxtend.feature_selection import SequentialFeatureSelector

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()

    # Creating a feature-selection-classifier pipeline

    sfs1 = SequentialFeatureSelector(clf1,
                                     k_features=4,
                                     forward=True,
                                     floating=False,
                                     scoring='accuracy',
                                     verbose=0,
                                     cv=0)

    clf1_pipe = Pipeline([('sfs', sfs1), ('logreg', clf1)])

    eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3], voting='soft')

    params = {
        'pipeline__sfs__k_features': [1, 2, 3],
        'pipeline__logreg__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200]
    }

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid.fit(iris.data, iris.target)

    cv_keys = ('mean_test_score', 'std_test_score', 'params')

    print("test8")
    for r, _ in enumerate(grid.cv_results_['mean_test_score']):
        print(
            "%0.3f +/- %0.2f %r" %
            (grid.cv_results_[cv_keys[0]][r], grid.cv_results_[cv_keys[1]][r] /
             2.0, grid.cv_results_[cv_keys[2]][r]))
Example No. 16
def plot_feed_forward_models():
    """
    Plots the performance for each iteration of the feedforward model.
    The number of features chosen are 15 and 20, since these showed the best result

    """
    # create Linear Regression model
    regr = LinearRegression()

    sfs_model = SequentialFeatureSelector(regr,
                                          k_features=15,
                                          forward=True,
                                          floating=False,
                                          scoring='neg_mean_squared_error',
                                          cv=10)

    sfs_model = sfs_model.fit(X_train, y_train)
    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Linear Regression (w. StdErr)')
    plt.grid()
    plt.show()

    # Same for the Decision Tree, with some different settings
    clf = tree.DecisionTreeClassifier()

    sfs_model = SequentialFeatureSelector(clf,
                                          k_features=20,
                                          forward=True,
                                          floating=False,
                                          scoring='accuracy',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train_binned)
    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Decision Tree (w. StdErr)')
    plt.grid()
    plt.show()
Example No. 17
def filter_with_sfs(train_X, valid_X, test_X, train_Y, i):
    features = set(train_X.columns)
    fs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=30,
                                                          random_state=0),
                                   k_features=i,
                                   forward=True,
                                   verbose=0,
                                   scoring='accuracy',
                                   cv=4)
    fs.fit(train_X, train_Y)

    selected_features = set(fs.k_feature_names_)
    features_to_drop = list(features - selected_features)

    return train_X.drop(features_to_drop, axis=1), valid_X.drop(features_to_drop, axis=1), \
           test_X.drop(features_to_drop, axis=1)
Example No. 18
def SFS_test(input, how_many_attrs, cv_scores):
    y = np.array(input[:, -1])
    x = np.array(input[:, :-1])
    sfs = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=5,
                                                         metric="euclidean"),
                                    k_features=how_many_attrs,
                                    forward=True,
                                    floating=False,
                                    verbose=0,
                                    scoring='accuracy',
                                    n_jobs=-1,
                                    cv=4)
    sfs = sfs.fit(x, y)
    # print(sfs.k_feature_idx_)
    target = np.array(input[:, -1]).reshape(-1, 1)  # infer the row count rather than hard-coding it
    return np.hstack((input[:, sfs.k_feature_idx_], target))
Example No. 19
def select_features_wrapper(X, y, forward=True, k_features=20):
    # svc = SVC(gamma='auto')
    # linearSVC = LinearSVC(random_state=0, tol=1e-5, class_weight='balanced')
    random_forest_classifier = RandomForestClassifier(max_depth=7,
                                                      random_state=0)

    sgd = SGDClassifier(max_iter=1000, tol=1e-3)
    #     knn = KNeighborsClassifier(n_neighbors=3)
    sfs = SequentialFeatureSelector(sgd,
                                    k_features=k_features,
                                    forward=forward,
                                    floating=False,
                                    verbose=5,
                                    cv=0,
                                    n_jobs=-1)
    sfs.fit(X, y.values.ravel())
    print(sfs.k_feature_names_)
    return sfs
Example No. 20
def feature_selection(X, y, method=1, k_features=5, save_params=False, seed=127):
    logit = LogisticRegression(C=1, random_state=seed, solver='liblinear')

    if method == 1:
        rfe = RFE(logit, n_features_to_select=k_features, verbose=2)
        rfe.fit(X, y)
        if save_params:
            with open('rfe.pkl', 'wb') as file:
                pickle.dump(rfe, file, pickle.HIGHEST_PROTOCOL)
        return rfe
    elif method == 2:
        sfs = SequentialFeatureSelector(logit, cv=0, k_features=k_features,
                                        forward=False, scoring='roc_auc',
                                        verbose=2, n_jobs=-1)
        sfs.fit(X, y)
        if save_params:
            with open('sfs.pkl', 'wb') as file:
                pickle.dump(sfs, file, pickle.HIGHEST_PROTOCOL)
        return sfs
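
A short usage sketch for the dispatcher above, on synthetic binary data (the k_features value is illustrative):

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, n_features=12, random_state=127)

rfe = feature_selection(X_demo, y_demo, method=1, k_features=5)  # recursive feature elimination
sfs = feature_selection(X_demo, y_demo, method=2, k_features=5)  # backward mlxtend selection
print(rfe.support_)        # boolean mask over the original columns
print(sfs.k_feature_idx_)  # indices of the surviving columns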
Example No. 21
def start_data_pretreatment(train_path, test_path, path, flag, index):
    train = grouping(train_path)
    test = grouping(test_path)
    key_skills_processing(data_train=train, data_test=test, top=100, flag=flag)
    remove_columns = list(range(15))  # indices of the first 15 columns to drop
    train = train.drop(train.columns[remove_columns], axis=1)
    test = test.drop(test.columns[remove_columns], axis=1)
    if index == 3:
        selector = VarianceThreshold(0.009)
        tmp1 = train.group
        train = train.drop(['group'], axis=1)
        tmp2 = test.group
        test = test.drop(['group'], axis=1)
        selector.fit(train)
        col = selector.get_support(True)
        train, test = update_data(train, test, col, train.columns, tmp1, tmp2)
    if index == 4:
        model_rfc = RandomForestClassifier(n_estimators=70)
        selector = SequentialFeatureSelector(model_rfc,
                                             k_features=50,
                                             forward=True,
                                             floating=False,
                                             verbose=2,
                                             scoring='accuracy',
                                             cv=0,
                                             n_jobs=-1)
        tmp1 = train.group
        train = train.drop(['group'], axis=1)
        tmp2 = test.group
        test = test.drop(['group'], axis=1)
        selector = selector.fit(train, tmp1)
        col = selector.k_feature_idx_
        train, test = update_data(train, test, col, train.columns, tmp1, tmp2)
    train.to_csv("train_" + str(index) + "_" + path,
                 sep=';',
                 encoding="utf-8-sig",
                 index=False)
    test.to_csv("test_" + str(index) + "_" + path,
                sep=';',
                encoding="utf-8-sig",
                index=False)
    return train, test
Example No. 22
def analyze_model(model: sk.base.BaseEstimator, x: pd.DataFrame,
                  y: pd.DataFrame, n_jobs: int = 1) \
        -> SequentialFeatureSelector:
    start_time = time.time()
    logger.info("Starting feature selection")

    sfs = SequentialFeatureSelector(
        estimator=model,
        k_features="parsimonious",
        cv=None,
        verbose=1,
        forward=True,
        n_jobs=n_jobs,
        # scoring is chosen as a default based on the type of model
    )
    sfs.fit(x, y)

    end_time = time.time()
    logger.info("Feature selection done in %.3f seconds", end_time - start_time)
    return sfs
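
Two details about the call above, per the mlxtend docs: with scoring=None the selector falls back to 'accuracy' for classifiers and 'r2' for regressors, and k_features="parsimonious" keeps the smallest subset whose score sits within one standard error of the best. A sketch that passes the scoring explicitly instead:

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X_demo, y_demo = make_regression(n_samples=200, n_features=10, noise=10.0, random_state=0)

sfs_demo = SequentialFeatureSelector(LinearRegression(),
                                     k_features="parsimonious",
                                     forward=True,
                                     scoring="neg_mean_squared_error",  # explicit, not the r2 default
                                     cv=5)
sfs_demo.fit(X_demo, y_demo)
print(len(sfs_demo.k_feature_idx_), sfs_demo.k_score_)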
Example No. 23
    def hyper_parameter_tuning(self):
        # tune the feature combination
        self.__lr = LogisticRegression()
        self.__ps = PredefinedSplit(self.__train_us_validation_index)
        self.__sfs = SequentialFeatureSelector(
            estimator=self.__lr,
            k_features=(1, 11),
            forward=True,
            floating=True,
            scoring="roc_auc",
            cv=self.__ps
        )
        self.__sfs.fit(self.__train_us_validation_feature_woe, self.__train_us_validation_label)
        # features used by the final model
        self.__feature_columns = self.__feature_columns[list(self.__sfs.k_feature_idx_)]
        # NumPy indexes differently here; the OOT set no longer contains the features dropped by SFFS
        self.__train_feature_woe = self.__train_feature_woe[:, self.__sfs.k_feature_idx_]
        self.__train_us_feature_woe = self.__train_us_feature_woe[:, self.__sfs.k_feature_idx_]
        self.__validation_feature_woe = self.__validation_feature_woe[:, self.__sfs.k_feature_idx_]

        self.__train_us_validation_feature_woe = self.__train_us_validation_feature_woe[:, self.__sfs.k_feature_idx_]

        # with the feature combination fixed, tune the LR hyperparameter C
        def __lr_cv(C):
            clf = LogisticRegression(
                C=C,
                random_state=7
            )
            val = cross_val_score(
                clf,
                self.__train_us_validation_feature_woe,
                self.__train_us_validation_label,
                scoring="roc_auc",
                cv=self.__ps
            ).mean()

            return val

        self.__param = {"C": (0.1, 100)}
        self.__lr_bo = BayesianOptimization(__lr_cv, self.__param, random_state=7)
        self.__lr_bo.maximize(**{"alpha": 1e-5})
Example No. 24
def get_features(train_set,
                 target,
                 method=None,
                 model="rf",
                 n_features="auto",
                 verbose=1):
    if model == "rf":
        model = RandomForestClassifier(n_jobs=-1, random_state=1)
    elif model == "gb":
        model = GradientBoostingClassifier(random_state=1)

    if method is None:
        selected_features = train_set.columns.values

    if method == "boruta":
        print("Fitting Boruta...")
        boruta = BorutaPy(model, n_estimators=n_features, verbose=verbose)
        boruta.fit(train_set.values, target.values)
        selected_features = train_set.columns[boruta.support_].values

    if method == "rfe":
        print("Fitting Recursive Feature Elimination...")
        rfe = RFECV(estimator=model, cv=4, scoring='accuracy', verbose=verbose)
        rfe = rfe.fit(train_set, target)
        selected_features = train_set.columns[rfe.support_].values

    if method == "sfs":
        print("Fitting Sequential Feature Selection...")
        if n_features == "auto":
            n_features = "best"
        sfs = SequentialFeatureSelector(model,
                                        k_features=n_features,
                                        verbose=verbose,
                                        n_jobs=-1,
                                        scoring='accuracy',
                                        cv=4)
        sfs.fit(train_set, target)
        selected_features = list(sfs.k_feature_names_)

    return selected_features
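
A brief usage sketch for get_features, assuming a pandas feature frame and label series (synthetic here; n_features=10 is illustrative):

import pandas as pd
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, n_features=15, random_state=1)
train_demo = pd.DataFrame(X_demo, columns=[f'f{i}' for i in range(15)])
target_demo = pd.Series(y_demo)

cols = get_features(train_demo, target_demo, method="sfs", model="rf", n_features=10)
print(cols)  # the ten column names kept by forward selection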
Example No. 25
def main():
    loadVariables("Questioned")
    key = pd.read_csv("../Text/key.csv")

    for questioned_iterator in q_transitions.keys():
        data = []
        data.append(np.mean(np.array(q_centroids[questioned_iterator])))
        data.append(np.mean(np.array(q_transitions[questioned_iterator])))
        data.append(np.mean(np.array(q_ratios[questioned_iterator])))
        data.append(np.mean(np.array(q_black_Pixels[questioned_iterator])))
        data.append(np.mean(np.array(q_normalized[questioned_iterator])))
        data.append(np.mean(np.array(q_angles[questioned_iterator])))
        data.append(np.mean(np.array(
            q_normalized_blacks[questioned_iterator])))
        x.append(data)

    for temp, file in enumerate(q_ratios.keys()):
        # pull the numeric id out of the filename and strip leading zeros
        number = re.findall(r'\d+', file)[0].lstrip('0')
        if key['Decision'].values[int(number) - 1] == 'F':
            y.append(0)
        elif key['Decision'].values[int(number) - 1] == 'D':
            y.append(1)
        elif key['Decision'].values[int(number) - 1] == 'G':
            y.append(2)

    knnClassifier = KNeighborsClassifier(n_neighbors=4)
    sfs = SequentialFeatureSelector(knnClassifier,
                                    k_features=7,
                                    forward=True,
                                    floating=True,
                                    verbose=2,
                                    scoring='accuracy',
                                    cv=0,
                                    n_jobs=-1)

    sfs = sfs.fit(np.array(x), np.array(y), custom_feature_names=tuple(titles))
    print()
    pprint(sfs.subsets_)
Example No. 26
def feature_selection(X, y, model):

    correlated_features = set()
    correlation_matrix = X.corr()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > 0.8:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)

    X.drop(labels=correlated_features, axis=1, inplace=True)
    feature_selector = SequentialFeatureSelector(model,
                                                 k_features=11,
                                                 forward=False,
                                                 verbose=2,
                                                 scoring='balanced_accuracy',
                                                 cv=5)
    #feature_selector = ExhaustiveFeatureSelector(model, min_features=5, max_features=10, scoring='balanced_accuracy', print_progress=True, cv=3)

    features = feature_selector.fit(X, y)
    filtered_features = X.columns[list(features.k_feature_idx_)]

    return filtered_features
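
The commented-out line above points to mlxtend's exhaustive alternative, which scores every subset in a size range rather than adding or removing features greedily. A minimal sketch of that variant on synthetic data (small bounds, since the search is combinatorial):

from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=8, random_state=0)

efs = ExhaustiveFeatureSelector(RandomForestClassifier(random_state=0),
                                min_features=2,
                                max_features=4,  # keep the combinatorial search small
                                scoring='balanced_accuracy',
                                cv=3)
efs = efs.fit(X_demo, y_demo)
print(efs.best_idx_, efs.best_score_)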
Example No. 27
def forward_floating(data, scoring=None, model=None, k=3, cv=10):
    """A wrapper of mlxtend Sequential Forward Floating Selection algorithm.

    """
    X_train, X_test, y_train, y_test = data

    # Z-scores.
    X_train_std, X_test_std = utils.train_test_z_scores(X_train, X_test)

    # NOTE: Nested calls not supported by multiprocessing => joblib converts
    # into sequential code (thus, default n_jobs=1).
    #n_jobs = cpu_count() - 1 if cpu_count() > 1 else cpu_count()
    n_jobs = 1

    selector = SequentialFeatureSelector(
        model, k_features=k, forward=True, floating=True,
        scoring=scoring or 'roc_auc',  # fall back to roc_auc when no scoring argument is given
        cv=cv, n_jobs=n_jobs
    )
    selector.fit(X_train_std, y_train)

    support = _check_support(selector.k_feature_idx_, X_train_std)

    return _check_feature_subset(X_train_std, X_test_std, support)
Example No. 28
# Removing Constant features
constant_filter = VarianceThreshold()
constant_filter.fit(X_train)
constant_columns = [col for col in X_train.columns  
                    if col not in X_train.columns[constant_filter.get_support()]]
X_train.drop(labels=constant_columns, axis=1, inplace=True) 
X_test.drop(labels=constant_columns, axis=1, inplace=True)

from sklearn.neighbors import KNeighborsClassifier

start = time.time()
classifier_ = DecisionTreeClassifier(random_state=100)
knn = KNeighborsClassifier(n_neighbors=2)  # defined here but unused below
feature_selector = SequentialFeatureSelector(classifier_,
                                             k_features=15,
                                             forward=True,
                                             scoring='accuracy',
                                             cv=0)
feature_selector = feature_selector.fit(X_train, y_train)
end = time.time()
print("Execution time: %0.4f seconds" % (end - start))
selected_features = X_train.columns[list(feature_selector.k_feature_idx_)]

X_train = X_train[selected_features]
X_test = X_test[selected_features]


start = time.time()
clf3 = DecisionTreeClassifier(random_state=100)
clf3.fit(X_train, y_train)
end = time.time()
Example No. 29
def forward(X_train, Y_train):
    rf_sfs = RandomForestRegressor(n_estimators=100, max_depth=50, oob_score=False, n_jobs=-1)
    SFS = SequentialFeatureSelector(rf_sfs, k_features=6, scoring='neg_mean_squared_error', n_jobs=-1)
    SFS = SFS.fit(X_train, Y_train)
    print(SFS.k_feature_names_)
Example No. 30
X = transformer.fit_transform(X).toarray()

from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector

from sklearn.ensemble import RandomForestClassifier

#######

# we want 10 features
sfs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=10,
                                                       n_jobs=-1),
                                k_features=10,
                                forward=True,
                                floating=False,
                                verbose=2,
                                scoring='accuracy',
                                cv=3)

X = sfs.fit_transform(X, y)
"""
sfs=SequentialFeatureSelector(DecisionTreeClassifier(),k_features=10,
                                                     forward=True,floating=False,verbose=2,scoring='accuracy',cv=3)

sfs=sfs.fit(X,y)

X=sfs.fit_transform(X,y)
"""

#####################################################################