Example #1
# A new iterative transformer to select features is available:
# :class:`~sklearn.feature_selection.SequentialFeatureSelector`.
# selection) or remove features from the set of available features
# (backward selection), greedily maximizing a cross-validated score.
# See the :ref:`User Guide <sequential_feature_selection>`.

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True, as_frame=True)
feature_names = X.columns
knn = KNeighborsClassifier(n_neighbors=3)
sfs = SequentialFeatureSelector(knn, n_features_to_select=2)
sfs.fit(X, y)
print("Features selected by forward sequential selection: "
      f"{feature_names[sfs.get_support()].tolist()}")

##############################################################################
# New PolynomialCountSketch kernel approximation function
# -------------------------------------------------------
# The new :class:`~sklearn.kernel_approximation.PolynomialCountSketch`
# approximates a polynomial expansion of a feature space when used with linear
# models, but uses much less memory than
# :class:`~sklearn.preprocessing.PolynomialFeatures`.

from sklearn.datasets import fetch_covtype
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
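from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.linear_model import LogisticRegression

# The original snippet is truncated after the imports; the lines below are a
# hedged sketch of how the pipeline might continue (the classifier choice and
# the train/test sizes are assumptions, not from the original).
X, y = fetch_covtype(return_X_y=True)
pipe = make_pipeline(MinMaxScaler(),
                     PolynomialCountSketch(degree=2, n_components=300),
                     LogisticRegression(max_iter=1000))
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000,
                                                    test_size=10000,
                                                    random_state=42)
print(f"Test accuracy: {pipe.fit(X_train, y_train).score(X_test, y_test):.3f}")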
Example #2
import pytest
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector


# Parametrized in the original test suite; the values below are an assumption
# consistent with the expected error message.
@pytest.mark.parametrize("n_features_to_select", (0, 5, 0.0, -1, 1.1))
def test_bad_n_features_to_select(n_features_to_select):
    X, y = make_regression(n_features=5)
    sfs = SequentialFeatureSelector(LinearRegression(),
                                    n_features_to_select=n_features_to_select)
    with pytest.raises(ValueError, match="must be either None"):
        sfs.fit(X, y)
Example #3
import pytest
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector


def test_bad_direction():
    X, y = make_regression(n_features=5)
    sfs = SequentialFeatureSelector(LinearRegression(), direction="bad")
    with pytest.raises(ValueError, match="must be either 'forward' or"):
        sfs.fit(X, y)
Example #4
# Imports added for self-containment (X_train_test, y_train_test and X_val
# come from earlier, elided data preparation).
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import LinearSVR

reg_model = LinearSVR()

# Define feature names from the input DTW array
feature_set = [
    'AX_500ms', 'AY_500ms', 'AZ_500ms', 'AT_500ms', 'AX_IVC', 'AY_IVC',
    'AZ_IVC', 'AT_IVC', 'AX_EJ', 'AY_EJ', 'AZ_EJ', 'AT_EJ', 'AX_Dias_500ms',
    'AY_Dias_500ms', 'AZ_Dias_500ms', 'AT_Dias_500ms', 'AX_Dias_active',
    'AY_Dias_active', 'AZ_Dias_active', 'AT_Dias_active', 'AX_Dias_passive',
    'AY_Dias_passive', 'AZ_Dias_passive', 'AT_Dias_passive'
]

sfs1 = SequentialFeatureSelector(reg_model,
                                 n_features_to_select=5,
                                 direction='forward',
                                 scoring='neg_root_mean_squared_error')
sfs1.fit(X_train_test, y_train_test)
select_features = sfs1.get_feature_names_out(input_features=feature_set)
# Print the selected features
print('Selected features:')
print(select_features)
print('\n')

# Keep only the selected features for model development
X_select_train_test = sfs1.transform(X_train_test)
X_select_val = sfs1.transform(X_val)

X_train_test = X_select_train_test
X_val = X_select_val

# Get feature importance from the fitted LinearSVR
fit = reg_model.fit(X_train_test, y_train_test)
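
# The original snippet ends here. A hedged continuation (an assumption, not
# from the original): for a fitted LinearSVR, the magnitude of `coef_` gives
# a rough per-feature importance on the (scaled) inputs.
importances = np.abs(fit.coef_)
for name, importance in sorted(zip(select_features, importances),
                               key=lambda pair: pair[1], reverse=True):
    print(f"{name}: {importance:.4f}")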
Example #5
    df_results = pd.DataFrame(list(map(np.ravel, results)))
    df_results['mean'] = df_results.mean(axis=1)
    df_results['n_features'] = range(min_n, max_n + 1)
    df_results.to_csv(
        'C:/Users/angel/git/Observ_models/data/ML/Regression/feature_selection_SVR_3-15.csv',
        index=False)
    # Select n_features:
    # sfs = SequentialFeatureSelector(estimator=model, n_features_to_select=58, cv=5, direction='forward', n_jobs=6)
    # sfs = SequentialFeatureSelector(estimator=model, n_features_to_select=10, cv=5, direction='forward', n_jobs=6)  # 10 features also give a reasonable score
    sfs = SequentialFeatureSelector(estimator=model,
                                    n_features_to_select=7,
                                    cv=5,
                                    direction='forward',
                                    n_jobs=6)
    sfs.fit(predictors_train, labels_train)
    data_reduced_train = train_prepared[np.append(
        np.array(predictors_train.columns[sfs.support_]), ['log_visit_rate'])]
    data_reduced_test = test_prepared[np.append(
        np.array(predictors_test.columns[sfs.support_]), ['log_visit_rate'])]
    data_reduced_train.to_csv(
        'C:/Users/angel/git/Observ_models/data/ML/Regression/train/data_reduced_7.csv',
        index=False)
    data_reduced_test.to_csv(
        'C:/Users/angel/git/Observ_models/data/ML/Regression/test/data_reduced_7.csv',
        index=False)

    # EVALUATE THE MODEL WITH REDUCED PREDICTORS:
    model = SVR(C=1.7, coef0=-0.33, epsilon=0.09, gamma=0.14, kernel='rbf')
    predictors_train = data_reduced_train.iloc[:, :-1]
    labels_train = np.array(data_reduced_train.iloc[:, -1:]).flatten()
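
    # The fragment is cut off here. A minimal sketch of the evaluation step
    # announced by the comment above (the scoring metric and cv=5 are
    # assumptions).
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(model, predictors_train, labels_train,
                             scoring='neg_root_mean_squared_error', cv=5)
    print(f"RMSE: {-scores.mean():.3f} (+/- {scores.std():.3f})")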
Example #6
# Imports added for self-containment; toNumpy, model_predict,
# measure_performance and measure_model are helpers defined elsewhere in the
# original script.
import errno
import os
import pickle

import numpy as np
from pandas import read_csv
from sklearn import linear_model
from sklearn.feature_selection import (RFE, SelectKBest,
                                       SequentialFeatureSelector,
                                       f_regression, mutual_info_regression)
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize


def main():
    # train_file_name = sys.argv[1]
    # output_file_name = sys.argv[2]
    train_data_file_name = "NEWS_Training_data.csv"
    train_label_file_name = "NEWS_Training_label.csv"
    test_data_file_name = "NEWS_Test_data.csv"
    test_label_file_name = "NEWS_Test_label.csv"

    tr_data_frame_X = read_csv(train_data_file_name)
    tr_data_frame_y = read_csv(train_label_file_name)
    te_data_frame_X = read_csv(test_data_file_name)
    te_data_frame_y = read_csv(test_label_file_name)
    alias = test_data_file_name.split('/')[-1].split('_')[0]
    model_filename = "model_{}.pkl".format(alias)
    try:
        linear_reg_model = pickle.load(open(model_filename, 'rb'))
        mode = "rfe"  # prefix to open the models
        lasso_reg_model = pickle.load(open(mode + "lasso_reg_model.pkl", 'rb'))
        logistic_reg_model = pickle.load(
            open(mode + "logistic_reg_model.pkl", 'rb'))
        # logistic_reg_model = pickle.load(open("model_NEWS.pkl", 'rb'))
        extra_forest_reg_model = pickle.load(
            open(mode + "extra_forest_reg_model.pkl", 'rb'))
        svm_reg_model = pickle.load(open(mode + "svm_reg_model.pkl", 'rb'))
        knn_reg_model = pickle.load(open(mode + "knn_reg_model.pkl", 'rb'))

    except OSError:
        print("ERROR: the required model .pkl files must be present in the "
              "current directory.")
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                model_filename)

    X_train, y_train = toNumpy(tr_data_frame_X, tr_data_frame_y)
    X_test, y_test = toNumpy(te_data_frame_X, te_data_frame_y)
    print(X_test.shape[0])
    trivial_y_pred = np.repeat(np.mean(y_train), y_test.shape[0])
    baseline_y_pred = model_predict(linear_reg_model, X_test)
    measure_performance(y_pred=trivial_y_pred,
                        y_test=y_test,
                        mode="trivial testing")
    measure_performance(y_pred=baseline_y_pred,
                        y_test=y_test,
                        mode="baseline testing")
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=X_test,
                  y_test=y_test)

    one_hot_cols = [11, 12, 13, 14, 15, 16, 29, 30, 31, 32, 33, 34, 35, 36]
    z_score_cols = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 37, 42, 44, 45, 46, 47, 48, 49,
        50, 54, 56, 57
    ]
    normal_cols = [17, 19, 38, 39, 40, 41, 43, 51, 52, 53, 55]
    min_max_cols = [20, 21, 22, 23, 24, 25, 26, 27, 28]
    prepro_tr_X_data = np.zeros((X_train.shape[0], X_train.shape[1]),
                                dtype='float')
    prepro_te_X_data = np.zeros((X_test.shape[0], X_test.shape[1]),
                                dtype='float')
    prepro_tr_X_data[:, one_hot_cols] = X_train[:, one_hot_cols]
    prepro_te_X_data[:, one_hot_cols] = X_test[:, one_hot_cols]
    # Standardize the z-score columns (scaler fit on train data only)
    z_score_scaler = StandardScaler()
    z_score_scaler.fit(X_train[:, z_score_cols])
    prepro_tr_X_data[:, z_score_cols] = z_score_scaler.transform(
        X_train[:, z_score_cols])
    prepro_te_X_data[:, z_score_cols] = z_score_scaler.transform(
        X_test[:, z_score_cols])
    # L2-normalize the selected columns (column-wise)
    prepro_tr_X_data[:, normal_cols] = normalize(X_train[:, normal_cols],
                                                 norm='l2',
                                                 axis=0)
    prepro_te_X_data[:, normal_cols] = normalize(X_test[:, normal_cols],
                                                 norm='l2',
                                                 axis=0)
    # Min-max scale the remaining columns (scaler fit on train data only)
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(X_train[:, min_max_cols])
    prepro_tr_X_data[:, min_max_cols] = minmax_scaler.transform(
        X_train[:, min_max_cols])
    prepro_te_X_data[:, min_max_cols] = minmax_scaler.transform(
        X_test[:, min_max_cols])

    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=prepro_te_X_data,
                  y_test=y_test)

    # Univariate feature selection (UFS) with the F-statistic
    ufs_f = SelectKBest(f_regression, k=50)
    ufs_f.fit(prepro_tr_X_data, y_train)
    ufs_f_te_X_data = ufs_f.transform(prepro_te_X_data)
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=ufs_f_te_X_data,
                  y_test=y_test)

    ufs_mu = SelectKBest(mutual_info_regression, k=50)
    ufs_mu.fit(prepro_tr_X_data, y_train)
    ufs_mu_te_X_data = ufs_mu.transform(prepro_te_X_data)
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=ufs_mu_te_X_data,
                  y_test=y_test)

    # Recursive feature elimination (RFE)
    estimator = linear_model.LinearRegression()
    rfe_selector = RFE(estimator, n_features_to_select=50, step=1)
    rfe_selector.fit(prepro_tr_X_data, y_train)
    rfe_X_data = rfe_selector.transform(prepro_te_X_data)
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=rfe_X_data,
                  y_test=y_test)

    # Sequential Feature Selection (SFS)
    sfs = SequentialFeatureSelector(estimator,
                                    n_features_to_select=50,
                                    direction='backward')
    sfs.fit(prepro_tr_X_data, y_train)
    sfs_X_data = sfs.transform(prepro_te_X_data)
    measure_model(lasso_reg_model,
                  logistic_reg_model,
                  extra_forest_reg_model,
                  svm_reg_model,
                  knn_reg_model,
                  X_test=sfs_X_data,
                  y_test=y_test)
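
# Standard entry point, not shown in the original fragment.
if __name__ == "__main__":
    main()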
Example #7
# Imports added for self-containment; clean_Dirt_Data, prepareData and the
# train_df/valid_df frames come from earlier, elided code.
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# The loop header is elided in the original; a plausible reconstruction that
# cleans every column of both frames:
for i in range(len(train_df.columns)):
    valid_df[valid_df.columns[i]] = clean_Dirt_Data(
        valid_df[valid_df.columns[i]])
    train_df[train_df.columns[i]] = clean_Dirt_Data(
        train_df[train_df.columns[i]])

x_train, y_train = prepareData(train_df)
x_valid, y_valid = prepareData(valid_df)

print('Done reading train and validation data!')

# Evaluate the Naive Bayes classifier
classificador_nb = GaussianNB(priors=None, var_smoothing=1e-9)
# Sequential forward selection (SFS); n_features_to_select must be an integer
# (or a float in (0, 1)), so the 15% fraction is converted explicitly
sfs = SequentialFeatureSelector(classificador_nb,
                                n_features_to_select=int(x_train.shape[1] * 0.15))
sfs.fit(x_train, y_train)
result_nb = sfs.get_support()

result_nb_out = pd.DataFrame(result_nb)
result_nb_out.to_csv('nb_interpretavel.csv')

# Evaluate the linear SVM classifier
classificador_svm = LinearSVC()
# Sequential forward selection (SFS), again keeping 15% of the features
sfs = SequentialFeatureSelector(classificador_svm,
                                n_features_to_select=int(x_train.shape[1] * 0.15))
sfs.fit(x_train, y_train)
result_svc = sfs.get_support()

result_svc_out = pd.DataFrame(result_svc)
result_svc_out.to_csv('svc_interpretavel.csv')
Example #8
# Imports and the parametrize decorator are not in the original fragment and
# are reconstructed here.
import numpy as np
import pytest
from sklearn.datasets import make_regression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


@pytest.mark.parametrize("direction", ("forward", "backward"))
def test_n_features_to_select_stopping_criterion(direction):
    """Check the behaviour stopping criterion for feature selection
    depending on the values of `n_features_to_select` and `tol`.

    When `direction` is `'forward'`, select a new features at random
    among those not currently selected in selector.support_,
    build a new version of the data that includes all the features
    in selector.support_ + this newly selected feature.
    And check that the cross-validation score of the model trained on
    this new dataset variant is lower than the model with
    the selected forward selected features or at least does not improve
    by more than the tol margin.

    When `direction` is `'backward'`, instead of adding a new feature
    to selector.support_, try to remove one of those selected features at random
    And check that the cross-validation score is either decreasing or
    not improving by more than the tol margin.
    """

    X, y = make_regression(n_features=50, n_informative=10, random_state=0)

    tol = 1e-3

    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)
    selected_X = sfs.transform(X)

    rng = np.random.RandomState(0)

    added_candidates = list(
        set(range(X.shape[1])) - set(sfs.get_support(indices=True)))
    added_X = np.hstack([
        selected_X,
        (X[:, rng.choice(added_candidates)])[:, np.newaxis],
    ])

    removed_candidate = rng.choice(list(range(sfs.n_features_to_select_)))
    removed_X = np.delete(selected_X, removed_candidate, axis=1)

    plain_cv_score = cross_val_score(LinearRegression(), X, y, cv=2).mean()
    sfs_cv_score = cross_val_score(LinearRegression(), selected_X, y,
                                   cv=2).mean()
    added_cv_score = cross_val_score(LinearRegression(), added_X, y,
                                     cv=2).mean()
    removed_cv_score = cross_val_score(LinearRegression(), removed_X, y,
                                       cv=2).mean()

    assert sfs_cv_score >= plain_cv_score

    if direction == "forward":
        assert (sfs_cv_score - added_cv_score) <= tol
        assert (sfs_cv_score - removed_cv_score) >= tol
    else:
        assert (added_cv_score - sfs_cv_score) <= tol
        assert (removed_cv_score - sfs_cv_score) <= tol