def test_boundary_case_ch2():
    """Check chi2-based selectors on a boundary case.

    The chi2 scores and p-values of a tiny 3x2 dataset are compared with
    fixed reference values, then every selector below is fitted on the same
    data and is expected to keep the first feature only.
    """
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    chi2_scores, chi2_pvalues = chi2(X, y)
    assert_array_almost_equal(chi2_scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(chi2_pvalues,
                              np.array([0.04550026, 0.39802472]))

    # Each selector is configured so that it aims to select one feature.
    selector_specs = (
        (SelectFdr, {"alpha": 0.1}),
        (SelectKBest, {"k": 1}),
        (SelectPercentile, {"percentile": 50}),
        (SelectFpr, {"alpha": 0.1}),
        (SelectFwe, {"alpha": 0.1}),
    )
    for selector_cls, selector_kwargs in selector_specs:
        selector = selector_cls(chi2, **selector_kwargs)
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))
# Example 2
# 0
def test_select_fwe_4():
    """Check TPOT's select fwe against sklearn's SelectFwe for 0.001 < alpha < 0.05.

    SelectFwe is fitted directly on the training rows with alpha=0.042 and
    the resulting column subset is compared with what ``_select_fwe`` returns
    for the same alpha.
    """
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    is_training = training_testing_data['group'] == 'training'
    training_features = training_testing_data.loc[is_training].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[is_training, 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        reference_selector = SelectFwe(f_classif, alpha=0.042)
        reference_selector.fit(training_features, training_class_vals)
        kept_indices = reference_selector.get_support(True)

    expected_columns = list(training_features.columns[kept_indices]) + non_feature_columns

    tpot_result = tpot_obj._select_fwe(training_testing_data, 0.042)
    assert np.array_equal(tpot_result, training_testing_data[expected_columns])
# Example 3
# 0
def test_select_heuristics_classif():
    """Compare SelectFwe with GenericUnivariateSelect in fdr, fpr and fwe modes.

    On a simple classification problem, the columns kept by
    ``SelectFwe(f_classif, alpha=0.01)`` must equal those kept by
    ``GenericUnivariateSelect`` in each of the three modes, and the support
    mask must match a ground truth that flags the first five of the twenty
    features.
    """
    dataset_options = dict(n_samples=200,
                           n_features=20,
                           n_informative=3,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=8,
                           n_clusters_per_class=1,
                           flip_y=0.0,
                           class_sep=10,
                           shuffle=False,
                           random_state=0)
    X, y = make_classification(**dataset_options)

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = fwe_filter.fit(X, y).transform(X)

    # Ground truth: ones for the first five features, zeros for the rest.
    expected_support = np.concatenate([np.ones(5), np.zeros(15)])

    for mode in ('fdr', 'fpr', 'fwe'):
        generic_filter = GenericUnivariateSelect(f_classif, mode=mode,
                                                 param=0.01)
        X_r2 = generic_filter.fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        assert_array_almost_equal(fwe_filter.get_support(), expected_support)
def test_select_heuristics_classif():
    # The fdr, fpr and fwe heuristics of GenericUnivariateSelect must all
    # keep the same columns as SelectFwe on a simple classification problem,
    # and the SelectFwe support must match the ground-truth mask.
    X, y = make_classification(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0,
    )

    reference = SelectFwe(f_classif, alpha=0.01)
    X_r = reference.fit(X, y).transform(X)

    # Ground truth marks the first five of the twenty features.
    gtruth = (np.arange(20) < 5).astype(float)

    for heuristic in ["fdr", "fpr", "fwe"]:
        candidate = GenericUnivariateSelect(f_classif, mode=heuristic, param=0.01)
        candidate.fit(X, y)
        assert_array_equal(X_r, candidate.transform(X))
        assert_array_almost_equal(reference.get_support(), gtruth)
def test_select_fwe_classif():
    """
    Check the fwe heuristic on a simple classification problem.

    SelectFwe and GenericUnivariateSelect(mode="fwe") must keep the same
    columns, and the SelectFwe support may differ from the ground truth
    (first five features set) by a total of less than 2.
    """
    dataset_options = {
        "n_samples": 200,
        "n_features": 20,
        "n_informative": 3,
        "n_redundant": 2,
        "n_repeated": 0,
        "n_classes": 8,
        "n_clusters_per_class": 1,
        "flip_y": 0.0,
        "class_sep": 10,
        "shuffle": False,
        "random_state": 0,
    }
    X, Y = make_classification(**dataset_options)

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = fwe_filter.fit(X, Y).transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode="fwe", param=0.01)
    X_r2 = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    expected_support = np.zeros(20)
    expected_support[:5] = 1
    deviation = np.abs(fwe_filter.get_support() - expected_support)
    assert np.sum(deviation) < 2
# Example 6
# 0
def test_select_fwe_classif():
    """
    Verify the fwe heuristic of univariate feature selection on a
    simple classification problem: SelectFwe and the generic selector
    in 'fwe' mode must agree, and the support must be within a total
    deviation of less than 2 from the expected mask.
    """
    X, y = make_classification(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0)

    reference = SelectFwe(f_classif, alpha=0.01)
    X_r = reference.fit(X, y).transform(X)

    generic = GenericUnivariateSelect(f_classif, mode='fwe', param=0.01)
    generic.fit(X, y)
    assert_array_equal(X_r, generic.transform(X))

    # Expected mask: the first five of the twenty features.
    gtruth = np.concatenate([np.ones(5), np.zeros(15)])
    total_deviation = np.abs(reference.get_support() - gtruth).sum()
    assert total_deviation < 2
 def test_select_fwe_float(self):
     """Convert a default SelectFwe fitted on breast-cancer data to ONNX.

     The converted model must not be None; the float32 data, the fitted
     selector and the ONNX model are then handed to ``dump_data_and_model``.
     """
     X, y = load_breast_cancer(return_X_y=True)
     selector = SelectFwe()
     selector.fit(X, y)

     input_types = [("input", FloatTensorType([1, X.shape[1]]))]
     onnx_model = convert_sklearn(selector, "select fwe", input_types)
     self.assertTrue(onnx_model is not None)

     # Version condition string forwarded unchanged as ``allow_failure``.
     failure_condition = (
         "StrictVersion(onnx.__version__)"
         " < StrictVersion('1.2') or "
         "StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.2.1')"
     )
     dump_data_and_model(
         X.astype(np.float32),
         selector,
         onnx_model,
         basename="SklearnSelectFwe",
         allow_failure=failure_condition,
     )
# Example 8
# 0
def test_select_fwe_4():
    """Ensure TPOT's select fwe matches sklearn's SelectFwe when 0.001 < alpha < 0.05."""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    # Only the rows flagged as 'training' are used to fit the reference selector.
    training_rows = training_testing_data.loc[
        training_testing_data['group'] == 'training']
    training_features = training_rows.drop(non_feature_columns, axis=1)
    training_class_vals = training_rows['class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        sk_selector = SelectFwe(f_classif, alpha=0.042)
        sk_selector.fit(training_features, training_class_vals)
        selected = sk_selector.get_support(True)

    mask_cols = training_features.columns[selected].tolist()
    mask_cols = mask_cols + non_feature_columns

    actual = tpot_obj._select_fwe(training_testing_data, 0.042)
    expected = training_testing_data[mask_cols]
    assert np.array_equal(actual, expected)
# Example 9
# 0
# File: tpot.py  Project: vsolano/tpot
    def _select_fwe(self, input_df, alpha):
        """Filter features with Scikit-learn's SelectFwe (family-wise error rate)

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to perform feature selection on; only the rows
            whose 'group' is 'training' are used to fit the selector
        alpha: float
            Value handed to SelectFwe after being clamped to [0.001, 0.05]

        Returns
        -------
        subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'class', 'group']}
            Copy of input_df restricted to the selected feature columns
            followed by 'guess', 'class' and 'group'. When input_df has no
            feature columns, a plain copy of input_df is returned instead

        """
        is_training = input_df['group'] == 'training'
        training_features = input_df.loc[is_training].drop(['class', 'group', 'guess'], axis=1)
        training_class_vals = input_df.loc[is_training, 'class'].values

        # forcing  0.001 <= alpha <= 0.05
        alpha = min(max(alpha, 0.001), 0.05)

        # Nothing to select from: hand back the data untouched.
        if training_features.shape[1] == 0:
            return input_df.copy()

        with warnings.catch_warnings():
            # Ignore warnings about constant features
            warnings.simplefilter('ignore', category=UserWarning)

            fwe_selector = SelectFwe(f_classif, alpha=alpha)
            fwe_selector.fit(training_features, training_class_vals)
            selected_indices = fwe_selector.get_support(True)

        selected_columns = list(training_features.columns[selected_indices])
        return input_df[selected_columns + ['guess', 'class', 'group']].copy()
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fwe", param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    # The first five features must all be selected. The builtin ``bool`` is
    # used as dtype because the ``np.bool`` alias was removed from NumPy.
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    # Fewer than two of the remaining features may be selected.
    assert np.sum(support[5:] == 1) < 2
# Example 11
# 0
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='fwe', param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    # ``np.bool`` no longer exists in current NumPy releases; the builtin
    # ``bool`` yields the same boolean dtype.
    assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
    # Fewer than two features outside the first five may be selected.
    assert np.sum(support[5:] == 1) < 2
# Example 12
# 0
def test_select_fwe_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fwe heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='fwe',
                                   param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    # All of the first five features must be selected ...
    assert (support[:5] == 1).all()
    # ... and fewer than two of the remaining ones.
    assert np.sum(support[5:] == 1) < 2
# Example 13
# 0
# In[46]:

# Print every feature whose univariate score is below 2.
for idx, score in enumerate(kbest.scores_):
    if score < 2:
        # A single pre-formatted argument prints the same text under
        # Python 2 and Python 3 (the old print statement is Python 2 only).
        print("%s %s" % (columns[idx], score))
# Names of the low-scoring features; the last entry of `columns` is skipped.
kbest_result = [
    columns[idx] for idx in range(len(columns) - 1) if kbest.scores_[idx] < 2
]

# perform regression without those

# In[48]:

# Fit a family-wise-error selector: every column but the last is a feature,
# the last column is the target.
fwe = SelectFwe(f_regression, alpha=0.7)
fwe.fit(converted_train_array[:, :-1], converted_train_array[:, -1])
# Compute the kept indices once instead of on every loop iteration.
fwe_kept = set(fwe.get_support(indices=True))
# Print the names of the features the selector rejected.
for idx in range(len(columns) - 1):
    if idx not in fwe_kept:
        print(columns[idx])

# In[49]:

# Fit a variance filter (threshold 1) on every column but the last.
variance = VarianceThreshold(threshold=1)
variance.fit(converted_train_array[:, :-1])
# Compute the kept indices once instead of on every loop iteration.
variance_kept = set(variance.get_support(indices=True))
print(len(variance_kept))
# Print the names of the features the filter rejected.
for idx in range(len(columns) - 1):
    if idx not in variance_kept:
        print(columns[idx])
variance_result = [
    columns[idx] for idx in range(len(columns) - 1)
    if not idx in variance.get_support(indices=True)
# Example 14
# 0
def train_predict_and_test(model,
                           target_name,
                           train_features,
                           train_labels,
                           test_features,
                           test_labels,
                           feature_selection=None):
    """Train ``model`` for ``target_name``, predict on the test data and print metrics.

    Steps, in order: standardisation (MLP and SVM only), removal of
    zero-variance features, optional feature selection, random oversampling
    (classification only, skipped for SVM, CART and ELASTIC), model-specific
    training and prediction, de-standardisation of regression outputs, and
    metric printing.

    Parameters
    ----------
    model : Models
        Model to run; compared against the ``Models`` members.
    target_name : Phenotypes
        Target to predict. ``DIAGNOSED_ASTHMA`` and
        ``BODY_MASS_INDEX_CATEGORICAL`` are handled as classification tasks,
        every other target as regression.
    train_features, test_features : pandas.DataFrame
        Feature tables (``.columns``, ``.mean()``, ``.std()`` are used).
    train_labels, test_labels : presumably pandas.Series
        Targets; ``test_labels.nunique()`` is called for classification.
    feature_selection : {"fwe", "kbest", "tree", "corr"} or None
        Optional feature-selection strategy; any other value selects nothing.

    Returns
    -------
    tuple
        ``(model, predictions)``. ``model`` is whatever the model-specific
        helper returned, or the unchanged ``model`` argument for
        ``Models.NAIVE``.

    Raises
    ------
    SystemExit
        If the naive model is requested for a classification task, or if
        ``model`` is not one of the handled ``Models`` members.
    """
    classification = target_name in (Phenotypes.DIAGNOSED_ASTHMA,
                                     Phenotypes.BODY_MASS_INDEX_CATEGORICAL)

    # Standardize data (statistics come from the training set only)
    standardized = False
    if model == Models.MLP or model == Models.SVM:
        print("Standardizing data..")
        standardized = True
        features_mean = train_features.mean()
        features_std = train_features.std()
        train_features = (train_features - features_mean) / features_std
        test_features = (test_features - features_mean) / features_std

        if not classification:
            labels_mean = train_labels.mean()
            labels_std = train_labels.std()
            train_labels = (train_labels - labels_mean) / labels_std
            test_labels = (test_labels - labels_mean) / labels_std

    # Load optimized params
    params = load_optimized_params(model, target_name)

    # Features selection
    # Removing features with 0 variance. The surviving column names are taken
    # from the selector's support mask: re-using the full original column
    # list would not match the transformed width once a column is dropped.
    feature_selector = VarianceThreshold(threshold=0).fit(train_features)
    kept_mask = feature_selector.get_support()
    train_col = train_features.columns[kept_mask]
    test_col = test_features.columns[kept_mask]
    train_features = pd.DataFrame(feature_selector.transform(train_features),
                                  columns=train_col)
    test_features = pd.DataFrame(feature_selector.transform(test_features),
                                 columns=test_col)
    if feature_selection == "fwe":
        print("Selecting features according to Familly Wise Error")
        # alpha = 5e-2
        alpha = 0.3
        if params is not None:
            try:
                alpha = params['transformer_alpha']
            except KeyError:
                print(
                    "Cannot find parameter alpha for FWE feature selector. Using default value"
                )

        # NOTE(review): f_regression is used here even for classification
        # targets, unlike the k-best branch below - confirm this is intended.
        features_selector = SelectFwe(f_regression,
                                      alpha=alpha).fit(train_features,
                                                       train_labels)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "kbest":
        k = 150
        if params is not None:
            try:
                k = params['k']
            except KeyError:
                print(
                    "Cannot find parameter k for k-best feature selector. Using default value: k=",
                    k)
        print("Selecting k-best features:", k)
        score_func = f_classif if classification else f_regression
        features_selector = SelectKBest(score_func=score_func, k=k)
        features_selector = features_selector.fit(train_features, train_labels)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "tree":
        print("Selecting features from RF feature importance")
        # Fit only the forest that is actually used for the selection.
        if classification:
            forest_cls = RandomForestClassifier
        else:
            forest_cls = RandomForestRegressor
        clf = forest_cls(n_estimators=100).fit(train_features, train_labels)
        features_selector = SelectFromModel(clf, prefit=True)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "corr":
        threshold = 0.9  # Recommended default value
        corr_matrix = train_features.corr()
        # Keep the strictly lower triangle only, so that each pair is looked
        # at once and, of two correlated columns, the later one is dropped.
        lower_abs_corr = np.tril(np.abs(corr_matrix.values), k=-1)
        too_correlated = (lower_abs_corr > threshold).any(axis=1)
        col_corr = list(corr_matrix.columns[too_correlated])
        train_features = train_features.drop(col_corr, axis=1)
        test_features = test_features.drop(col_corr, axis=1)

    # Oversampling
    if classification and model not in (Models.SVM, Models.CART,
                                        Models.ELASTIC):
        print("Oversampling features..")
        if target_name == Phenotypes.DIAGNOSED_ASTHMA:
            sampling_strat = 0.5
        else:
            # Target counts per class, relative to the largest class.
            majority_count = np.max(np.bincount(train_labels))
            sampling_strat = {
                0: majority_count // 4,
                1: majority_count,
                2: majority_count,
                3: majority_count // 2
            }
        oversampler = imblearn.over_sampling.RandomOverSampler(
            sampling_strategy=sampling_strat, random_state=42)
        train_features, train_labels = oversampler.fit_resample(
            train_features, train_labels)

    if model == Models.RF:
        if target_name == Phenotypes.BODY_MASS_INDEX_CATEGORICAL:
            # Create validation set for threshold optimization
            # NOTE(review): the helper receives the validation half while the
            # metrics below use the remaining test half, and ``params`` is not
            # forwarded here - confirm against _predict_rf.
            val_features, test_features, val_labels, test_labels = train_test_split(
                test_features, test_labels, test_size=0.5, random_state=42)
            model, predictions = _predict_rf(target_name, train_features,
                                             train_labels, val_features,
                                             val_labels)
        else:
            model, predictions = _predict_rf(target_name,
                                             train_features,
                                             train_labels,
                                             test_features,
                                             test_labels,
                                             params=params)
    elif model == Models.ELASTIC:
        model, predictions = predict_elastic_net(target_name, train_features,
                                                 train_labels, test_features,
                                                 test_labels)
    elif model == Models.XGB:
        model, predictions = _predict_xgb(target_name,
                                          train_features,
                                          train_labels,
                                          test_features,
                                          test_labels,
                                          params=params)
    elif model == Models.MLP:
        model, predictions = _predict_mlp(target_name,
                                          train_features,
                                          train_labels,
                                          test_features,
                                          test_labels,
                                          params=params)
    elif model == Models.SVM:
        model, predictions = _predict_svm(target_name, train_features,
                                          train_labels, test_features,
                                          test_labels)
    elif model == Models.CART:
        model, predictions = _predict_cart(target_name, train_features,
                                           train_labels, test_features,
                                           test_labels)
    elif model == Models.NAIVE:
        if classification:
            raise SystemExit("Cannot use naive model on classification task")
        # No fitted model is produced here; ``model`` is returned unchanged.
        predictions = predict_naive(train_features, train_labels,
                                    test_features, test_labels)
    else:
        # One formatted message, so the exit text is readable rather than
        # the repr of an argument tuple.
        raise SystemExit("Unknown model: {}".format(model))

    # Destandardize results (labels are only standardized for regression)
    if standardized and not classification:
        print("destandardize data..")
        predictions = (predictions * labels_std) + labels_mean
        test_labels = (test_labels * labels_std) + labels_mean

    # Print results
    if classification:
        print_classification_metrics(ground_truth=test_labels,
                                     predictions=predictions,
                                     num_classes=test_labels.nunique())
    else:
        print_regression_metrics(ground_truth=test_labels,
                                 predictions=predictions)

    return model, predictions
import pandas as pd

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectFwe
from sklearn.feature_selection import f_classif
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
# Stratified 75/25 split of the row labels into training and testing sets.
training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)


result1 = tpot_data.copy()

# Select features with SelectFwe, fitted on the training rows only.
# 'group' and 'guess' are dropped only when present, so a data file that
# contains just the features and 'class' does not raise a KeyError.
training_features = result1.loc[training_indices].drop(['class', 'group', 'guess'], axis=1, errors='ignore')
training_class_vals = result1.loc[training_indices, 'class'].values
if len(training_features.columns) > 0:
    selector = SelectFwe(f_classif, alpha=0.05)
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result1 = result1[mask_cols]

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(8, len(training_indices)))
training_rows = result1.loc[training_indices]
knnc2.fit(training_rows.drop('class', axis=1).values, training_rows['class'].values)
result2 = result1.copy()
# Predictions are stored for every row, training and testing alike.
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)