Python SequentialFeatureSelector.transformの例、sklearn.feature_selection.SequentialFeatureSelector.transform Pythonの例

コード例 #1

0

ファイルを表示

def test_sparse_support():
    """Check that the selector can be fitted on, and transform, sparse input.

    A dense regression problem with 10 features is generated, converted to
    a SciPy CSR matrix, and pushed through ``fit`` followed by ``transform``.
    The test passes as long as neither call raises.
    """
    dense_X, y = make_regression(n_features=10)
    X = scipy.sparse.csr_matrix(dense_X)

    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    selector.fit(X, y)
    selector.transform(X)

コード例 #2

0

ファイルを表示

def test_nan_support():
    """Check NaN handling with a NaN-tolerant and a NaN-intolerant estimator.

    Entries of X are replaced by NaN according to a random boolean mask.
    Fitting and transforming with ``HistGradientBoostingRegressor`` must not
    raise, while fitting with ``LinearRegression`` must raise a
    ``ValueError`` whose message matches 'Input contains NaN'.
    """
    n_samples, n_features = 100, 10
    rng = np.random.RandomState(0)

    X, y = make_regression(n_samples, n_features, random_state=0)
    # Overwrite the entries picked by a random boolean mask with NaN.
    nan_mask = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool)
    X[nan_mask] = np.nan

    nan_tolerant = SequentialFeatureSelector(
        HistGradientBoostingRegressor(),
        cv=2,
    )
    nan_tolerant.fit(X, y)
    nan_tolerant.transform(X)

    # LinearRegression cannot deal with NaNs, so the fit has to fail.
    with pytest.raises(ValueError, match='Input contains NaN'):
        SequentialFeatureSelector(LinearRegression(), cv=2).fit(X, y)

コード例 #3

0

ファイルを表示

ファイル: unsupervised_learner.py プロジェクト: molusingh/unsupervised-learning

def run_sfs(x, y, output=None, caption=''):
    """Select features with a k-nearest-neighbours SFS and return the reduced frame.

    Args:
        x: Feature table; must expose ``.columns`` (e.g. a pandas DataFrame)
            so the selected column labels can be recovered.
        y: Target values passed to ``SequentialFeatureSelector.fit``.
        output: Directory in which ``{caption}-sfs.csv`` is written. When
            ``None`` (the default) no file is written; previously the
            literal path ``None/-sfs.csv`` was used in that case.
        caption: Prefix of the CSV file name.

    Returns:
        A new DataFrame holding only the selected columns of ``x``, labelled
        with the original column names. The frame is built from the array
        returned by ``transform``, so it does not carry over ``x``'s index.
    """
    sfs = SequentialFeatureSelector(estimator=KNeighborsClassifier())
    sfs.fit(x, y)

    # support_ is the boolean mask of selected features; use it to keep the
    # original labels on the transformed array.
    x_reduced = pd.DataFrame(sfs.transform(x), columns=x.columns[sfs.support_])
    print(f'reduced columns: {x_reduced.columns}')

    # Only persist the result when the caller actually supplied a directory.
    if output is not None:
        x_reduced.to_csv(f'{output}/{caption}-sfs.csv', index=False)
    return x_reduced

コード例 #4

0

ファイルを表示

ファイル: prepare.py プロジェクト: czosel/aml

    def select_greedy(data):
        """Run sequential feature selection around an RBF SVR.

        ``data`` is unpacked as ``(X, X_test, y)``. The selector is fitted on
        ``(X, y)``; its support mask is dumped to ``joblib/greedy_support``
        and printed together with the elapsed fitting time. ``direction`` and
        ``n_features`` are not defined in this function and are taken from
        the surrounding scope.

        Returns:
            ``(X_selected, X_test_selected, y)`` where both feature matrices
            have been reduced with the fitted selector.
        """
        X, X_test, y = data

        svr = svm.SVR(kernel="rbf", C=100, tol=1).fit(X, y)

        # Time only the selector fit, not the SVR fit above.
        started = time()
        selector = SequentialFeatureSelector(
            svr,
            direction=direction,
            n_features_to_select=n_features,
            n_jobs=-1,
        )
        selector.fit(X, y)
        elapsed = time() - started

        support_mask = selector.get_support()
        joblib.dump(support_mask, "joblib/greedy_support")

        print(f"features selected: {support_mask}")
        print(f"done in: {elapsed:.2f}s")

        X_selected = selector.transform(X)
        X_test_selected = selector.transform(X_test)
        return X_selected, X_test_selected, y

コード例 #5

0

ファイルを表示

def test_pipeline_support():
    """Check that SFS composes with pipelines in both directions.

    First a ``StandardScaler`` + ``LinearRegression`` pipeline is used as the
    estimator inside the selector; then a selector is used as the last step
    of a pipeline. In both cases ``fit`` and ``transform`` must not raise.
    """
    n_samples, n_features = 50, 3
    X, y = make_regression(n_samples, n_features, random_state=0)

    # Case 1: a pipeline wrapped by the selector.
    inner_pipeline = make_pipeline(StandardScaler(), LinearRegression())
    selector = SequentialFeatureSelector(inner_pipeline, cv=2)
    selector.fit(X, y)
    selector.transform(X)

    # Case 2: the selector as a step of a pipeline.
    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    outer_pipeline = make_pipeline(StandardScaler(), selector)
    outer_pipeline.fit(X, y)
    outer_pipeline.transform(X)

コード例 #6

0

ファイルを表示

ファイル: test_sequential.py プロジェクト: Aathi410/Pro123

def test_unsupervised_model_fit(n_features_to_select):
    """Check that the selector can be fitted without a target.

    A ``KMeans`` estimator is wrapped and ``fit`` is called with ``X`` only;
    the transformed data must have exactly ``n_features_to_select`` columns.
    """
    # The labels returned by make_blobs are deliberately not used.
    X, _ = make_blobs(n_features=6)

    selector = SequentialFeatureSelector(
        KMeans(),
        n_features_to_select=n_features_to_select,
    )
    selector.fit(X)

    n_kept_columns = selector.transform(X).shape[1]
    assert n_kept_columns == n_features_to_select

コード例 #7

0

ファイルを表示

def test_n_features_to_select(direction, n_features_to_select):
    """Check that the requested number of features is what gets selected.

    When ``n_features_to_select`` is ``None`` the expected count is 5, i.e.
    half of the 10 generated features. The count is verified through the
    support indices, the ``n_features_to_select_`` attribute and the width
    of the transformed data.
    """
    X, y = make_regression(n_features=10)

    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    selector.fit(X, y)

    # None means "half of the features": 10 // 2 == 5.
    expected = 5 if n_features_to_select is None else n_features_to_select

    assert selector.get_support(indices=True).shape[0] == expected
    assert selector.n_features_to_select_ == expected
    assert selector.transform(X).shape[1] == expected

コード例 #8

0

ファイルを表示

def test_n_features_to_select_auto(direction):
    """Check ``n_features_to_select="auto"`` combined with a ``tol`` value.

    With 10 input features, the number of selected features must be at most
    9, and the support indices, the ``n_features_to_select_`` attribute and
    the transformed width must all agree with that bound. The number of
    support indices must also equal ``n_features_to_select_``.
    """
    n_features = 10
    tol = 1e-3
    upper_bound = n_features - 1

    X, y = make_regression(n_features=n_features, random_state=0)
    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    selector.fit(X, y)

    assert selector.get_support(indices=True).shape[0] <= upper_bound
    assert selector.n_features_to_select_ <= upper_bound
    assert selector.transform(X).shape[1] <= upper_bound
    assert (
        selector.get_support(indices=True).shape[0]
        == selector.n_features_to_select_
    )

コード例 #9

0

ファイルを表示

    'AY_Dias_passive', 'AZ_Dias_passive', 'AT_Dias_passive'
]

# Forward sequential feature selection: keep 5 features, scored by negative
# root-mean-squared error. `reg_model`, `feature_set`, `X_train_test`,
# `y_train_test` and `X_val` are defined outside this excerpt.
sfs1 = SequentialFeatureSelector(reg_model,
                                 n_features_to_select=5,
                                 direction='forward',
                                 scoring='neg_root_mean_squared_error')
sfs1.fit(X_train_test, y_train_test)
select_features = sfs1.get_feature_names_out(input_features=feature_set)
# Print the names of the selected features.
print('Selected features: ')
print(select_features)
print('\n')

# Transform the data so that only the selected features are used for model
# development.
X_select_train_test = sfs1.transform(X_train_test)
X_select_val = sfs1.transform(X_val)

# From here on the original names refer to the reduced matrices.
X_train_test = X_select_train_test
X_val = X_select_val

# Get feature importance from the fitted regression model.
# NOTE(review): `coef_` is read below; if `reg_model` is an SVR this
# attribute is presumably only available with a linear kernel - confirm
# against the definition of `reg_model`.
fit = reg_model.fit(X_train_test, y_train_test)
weights = fit.coef_
print('Feature Weights: ')
print(weights)
print('\n')

# Perform grid search on the training-testing set.
estimator = SVR()  # regression model for this analysis
param_dist = {

コード例 #10

0

ファイルを表示

def main():
    """Evaluate pre-trained regression models on differently prepared test data.

    Steps, in order:

    1. Read the NEWS training/test CSV files and load the pickled models.
    2. Report a trivial (mean of ``y_train``) and a baseline (linear model)
       prediction on the raw test data, then evaluate all models on it.
    3. Preprocess train and test features per column group (copy, z-score,
       L2-normalise, min-max) and evaluate again.
    4. Evaluate after univariate selection (``f_regression`` and
       ``mutual_info_regression``), recursive feature elimination and
       backward sequential feature selection, each keeping 50 features and
       each fitted on the preprocessed training data.

    Raises:
        FileNotFoundError: if any of the pickled models cannot be loaded.
            The underlying error is chained as ``__cause__``.
    """
    train_data_file_name = "NEWS_Training_data.csv"
    train_label_file_name = "NEWS_Training_label.csv"
    test_data_file_name = "NEWS_Test_data.csv"
    test_label_file_name = "NEWS_Test_label.csv"

    tr_data_frame_X = read_csv(train_data_file_name)
    tr_data_frame_y = read_csv(train_label_file_name)
    te_data_frame_X = read_csv(test_data_file_name)
    te_data_frame_y = read_csv(test_label_file_name)
    alias = test_data_file_name.split('/')[-1].split('_')[0]
    model_filename = "model_{}.pkl".format(alias)

    def _load_model(path):
        # Close the file handle deterministically instead of leaking it.
        # NOTE: pickle can execute arbitrary code on load; only use model
        # files from a trusted source.
        with open(path, 'rb') as model_file:
            return pickle.load(model_file)

    mode = "rfe"  # prefix to open the models
    try:
        linear_reg_model = _load_model(model_filename)
        lasso_reg_model = _load_model(mode + "lasso_reg_model.pkl")
        logistic_reg_model = _load_model(mode + "logistic_reg_model.pkl")
        extra_forest_reg_model = _load_model(
            mode + "extra_forest_reg_model.pkl")
        svm_reg_model = _load_model(mode + "svm_reg_model.pkl")
        knn_reg_model = _load_model(mode + "knn_reg_model.pkl")
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit are no longer converted into FileNotFoundError.
        print(
            "ERROR: You must provide model.pkl file in the current directory.")
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                model_filename) from err

    X_train, y_train = toNumpy(tr_data_frame_X, tr_data_frame_y)
    X_test, y_test = toNumpy(te_data_frame_X, te_data_frame_y)
    print(X_test.shape[0])

    # Reference points: constant mean prediction and the linear baseline.
    trivial_y_pred = np.repeat(np.mean(y_train), y_test.shape[0])
    baseline_y_pred = model_predict(linear_reg_model, X_test)
    measure_performance(y_pred=trivial_y_pred,
                        y_test=y_test,
                        mode="trivial testing")
    measure_performance(y_pred=baseline_y_pred,
                        y_test=y_test,
                        mode="baseline testing")
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=X_test,
                  y_test=y_test)

    # Column groups; together they cover the column indices 0..57 once each.
    one_hot_cols = [11, 12, 13, 14, 15, 16, 29, 30, 31, 32, 33, 34, 35, 36]
    z_score_cols = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 37, 42, 44, 45, 46, 47, 48, 49,
        50, 54, 56, 57
    ]
    normal_cols = [17, 19, 38, 39, 40, 41, 43, 51, 52, 53, 55]
    min_max_cols = [20, 21, 22, 23, 24, 25, 26, 27, 28]
    prepro_tr_X_data = np.zeros((X_train.shape[0], X_train.shape[1]),
                                dtype='float')
    prepro_te_X_data = np.zeros((X_test.shape[0], X_test.shape[1]),
                                dtype='float')

    # The lists hold COLUMN indices, so index with [:, cols]. Plain
    # `array[cols]` selects rows and left every other row at zero.
    # One-hot columns are copied unchanged.
    prepro_tr_X_data[:, one_hot_cols] = X_train[:, one_hot_cols]
    prepro_te_X_data[:, one_hot_cols] = X_test[:, one_hot_cols]

    # Standardize: statistics are fitted on the training data only.
    z_score_scaler = StandardScaler()
    z_score_scaler.fit(X_train[:, z_score_cols])
    prepro_tr_X_data[:, z_score_cols] = z_score_scaler.transform(
        X_train[:, z_score_cols])
    prepro_te_X_data[:, z_score_cols] = z_score_scaler.transform(
        X_test[:, z_score_cols])

    # L2-normalize each selected column (axis=0), train and test separately.
    prepro_tr_X_data[:, normal_cols] = normalize(X_train[:, normal_cols],
                                                 norm='l2',
                                                 axis=0)
    prepro_te_X_data[:, normal_cols] = normalize(X_test[:, normal_cols],
                                                 norm='l2',
                                                 axis=0)

    # Min-max scale: ranges are fitted on the training data only.
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(X_train[:, min_max_cols])
    prepro_tr_X_data[:, min_max_cols] = minmax_scaler.transform(
        X_train[:, min_max_cols])
    prepro_te_X_data[:, min_max_cols] = minmax_scaler.transform(
        X_test[:, min_max_cols])

    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=prepro_te_X_data,
                  y_test=y_test)

    # Univariate feature selection (UFS) with the F-test score.
    usf_f = SelectKBest(f_regression, k=50)
    usf_f.fit(prepro_tr_X_data, y_train)
    ufs_f_te_X_data = usf_f.transform(prepro_te_X_data)
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=ufs_f_te_X_data,
                  y_test=y_test)

    # Univariate feature selection with mutual information.
    usf_mu = SelectKBest(mutual_info_regression, k=50)
    usf_mu.fit(prepro_tr_X_data, y_train)
    ufs_mu_te_X_data = usf_mu.transform(prepro_te_X_data)
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=ufs_mu_te_X_data,
                  y_test=y_test)

    estimator = linear_model.LinearRegression()

    # Recursive feature elimination (RFE).
    rfe_selector = RFE(estimator, n_features_to_select=50, step=1)
    rfe_selector.fit(prepro_tr_X_data, y_train)
    rfe_X_data = rfe_selector.transform(prepro_te_X_data)
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=rfe_X_data,
                  y_test=y_test)

    # Sequential feature selection (SFS), backward direction.
    sfs = SequentialFeatureSelector(estimator,
                                    n_features_to_select=50,
                                    direction='backward')
    sfs.fit(prepro_tr_X_data, y_train)
    sfs_X_data = sfs.transform(prepro_te_X_data)
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=sfs_X_data,
                  y_test=y_test)

コード例 #11

0

ファイルを表示

def test_n_features_to_select_stopping_criterion(direction):
    """Check the stopping criterion used with ``n_features_to_select="auto"``.

    After fitting the selector with a tolerance ``tol``, two variants of the
    selected data are built:

    * "added": the selected columns plus one randomly chosen column that was
      not selected;
    * "removed": the selected columns minus one randomly chosen column.

    A ``LinearRegression`` is cross-validated (``cv=2``) on the full data,
    the selected data and both variants. The selected data must score at
    least as well as the full data. For ``direction == "forward"`` the
    selection must not be worse than the "added" variant by more than
    ``tol`` and must beat the "removed" variant by at least ``tol``. For any
    other direction neither variant may beat the selection by more than
    ``tol``.
    """
    tol = 1e-3
    X, y = make_regression(n_features=50, n_informative=10, random_state=0)

    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)
    selected_X = sfs.transform(X)

    rng = np.random.RandomState(0)

    # First random draw: one column that the selector left out.
    added_candidates = list(
        set(range(X.shape[1])) - set(sfs.get_support(indices=True)))
    extra_column = X[:, rng.choice(added_candidates)]
    added_X = np.hstack([selected_X, extra_column[:, np.newaxis]])

    # Second random draw: position of one selected column to drop.
    removed_candidate = rng.choice(list(range(sfs.n_features_to_select_)))
    removed_X = np.delete(selected_X, removed_candidate, axis=1)

    def mean_cv_score(features):
        # Mean 2-fold cross-validation score of a fresh LinearRegression.
        return cross_val_score(LinearRegression(), features, y, cv=2).mean()

    plain_cv_score = mean_cv_score(X)
    sfs_cv_score = mean_cv_score(selected_X)
    added_cv_score = mean_cv_score(added_X)
    removed_cv_score = mean_cv_score(removed_X)

    assert sfs_cv_score >= plain_cv_score

    if direction == "forward":
        assert (sfs_cv_score - added_cv_score) <= tol
        assert (sfs_cv_score - removed_cv_score) >= tol
        return

    assert (added_cv_score - sfs_cv_score) <= tol
    assert (removed_cv_score - sfs_cv_score) <= tol