def test_boundary_case_ch2():
    """Boundary case: every chi2-based selector should keep only feature 0."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    # Sanity-check the raw chi2 statistics before exercising the selectors.
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Each selector is configured so that it aims to select one feature.
    selector_specs = [
        (SelectFdr, {"alpha": 0.1}),
        (SelectKBest, {"k": 1}),
        (SelectPercentile, {"percentile": 50}),
        (SelectFpr, {"alpha": 0.1}),
        (SelectFwe, {"alpha": 0.1}),
    ]
    for selector_cls, options in selector_specs:
        selector = selector_cls(chi2, **options)
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))
def test_select_fwe_4():
    """Ensure that the TPOT select fwe outputs the same result as sklearn fwe when 0.001 < alpha < 0.05"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    # The reference selector is fitted on the rows flagged as training data only.
    is_training = training_testing_data['group'] == 'training'
    training_features = training_testing_data.loc[is_training].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[is_training, 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectFwe(f_classif, alpha=0.042)
        selector.fit(training_features, training_class_vals)

    selected_idx = selector.get_support(True)
    expected_cols = list(training_features.iloc[:, selected_idx].columns) + non_feature_columns

    actual = tpot_obj._select_fwe(training_testing_data, 0.042)
    expected = training_testing_data[expected_cols]
    assert np.array_equal(actual, expected)
def test_select_heuristics_classif():
    # Check that the fdr, fpr and fwe heuristics of the generic univariate
    # selector all reproduce the SelectFwe result on a simple classification
    # problem, and that SelectFwe keeps the first five features.
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    reference = SelectFwe(f_classif, alpha=0.01)
    X_reference = reference.fit(X, y).transform(X)

    # Expected support mask: ones for the first five columns, zeros elsewhere.
    expected_support = np.zeros(20)
    expected_support[:5] = 1

    for heuristic in ['fdr', 'fpr', 'fwe']:
        generic = GenericUnivariateSelect(f_classif, mode=heuristic, param=0.01)
        X_generic = generic.fit(X, y).transform(X)
        assert_array_equal(X_reference, X_generic)
        assert_array_almost_equal(reference.get_support(), expected_support)
def test_select_heuristics_classif():
    # The fdr, fpr and fwe modes of GenericUnivariateSelect must each give the
    # same transformed matrix as SelectFwe on a simple classification problem,
    # and the SelectFwe support must match the expected mask.
    dataset_options = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    X_fwe = fwe_filter.fit(X, y).transform(X)

    gtruth = np.zeros(20)
    gtruth[:5] = 1

    for mode in ("fdr", "fpr", "fwe"):
        generic_filter = GenericUnivariateSelect(f_classif, mode=mode, param=0.01)
        generic_filter.fit(X, y)
        assert_array_equal(X_fwe, generic_filter.transform(X))
        assert_array_almost_equal(fwe_filter.get_support(), gtruth)
def test_select_fwe_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fwe heuristic
    """
    dataset_options = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X, Y = make_classification(**dataset_options)

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    X_direct = fwe_filter.fit(X, Y).transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode="fwe", param=0.01)
    X_generic = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(X_direct, X_generic)

    # Expected mask keeps the first five columns; at most one mismatch is tolerated.
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    mismatches = np.sum(np.abs(fwe_filter.get_support() - gtruth))
    assert mismatches < 2
def test_select_fwe_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fwe heuristic
    """
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    selector = SelectFwe(f_classif, alpha=0.01)
    selector.fit(X, y)
    X_selected = selector.transform(X)

    generic = GenericUnivariateSelect(f_classif, mode='fwe', param=0.01)
    generic.fit(X, y)
    assert_array_equal(X_selected, generic.transform(X))

    # The first five columns are the expected selection.
    expected = np.zeros(20)
    expected[:5] = 1
    support = selector.get_support()
    # Fewer than two positions may disagree with the expected mask.
    assert np.abs(support - expected).sum() < 2
def test_select_fwe_float(self):
    """Fit a default SelectFwe on the breast-cancer data, convert it to ONNX
    and hand data, model and converted model to dump_data_and_model."""
    X, y = load_breast_cancer(return_X_y=True)
    model = SelectFwe()
    model.fit(X, y)

    n_features = X.shape[1]
    initial_types = [("input", FloatTensorType([1, n_features]))]
    model_onnx = convert_sklearn(model, "select fwe", initial_types)
    self.assertTrue(model_onnx is not None)

    # Condition string under which dump_data_and_model is told a failure is acceptable.
    failure_condition = (
        "StrictVersion(onnx.__version__) < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnSelectFwe",
        allow_failure=failure_condition,
    )
def test_select_fwe_4():
    """Ensure that the TPOT select fwe outputs the same result as sklearn fwe when 0.001 < alpha < 0.05"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    # Restrict to the training rows once, then split features from class labels.
    training_rows = training_testing_data['group'] == 'training'
    training_frame = training_testing_data.loc[training_rows]
    training_features = training_frame.drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_rows, 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        reference = SelectFwe(f_classif, alpha=0.042)
        reference.fit(training_features, training_class_vals)

    kept = reference.get_support(True)
    kept_names = list(training_features.iloc[:, kept].columns)
    mask_cols = kept_names + non_feature_columns

    tpot_result = tpot_obj._select_fwe(training_testing_data, 0.042)
    assert np.array_equal(tpot_result, training_testing_data[mask_cols])
def _select_fwe(self, input_df, alpha):
    """Filter features with scikit-learn's SelectFwe (family-wise error rate).

    The selector is fitted with ``f_classif`` on the rows of ``input_df``
    whose 'group' column equals 'training'.

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to perform feature selection on
    alpha: float
        The highest uncorrected p-value for features to keep; values outside
        [0.001, 0.05] are clamped to the nearest bound

    Returns
    -------
    subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'class', 'group']}
        Copy of the input restricted to the selected features plus the three
        bookkeeping columns. When there are no feature columns at all, a copy
        of the unmodified input is returned.

    """
    bookkeeping_cols = ['guess', 'class', 'group']

    in_training = input_df['group'] == 'training'
    training_features = input_df.loc[in_training].drop(bookkeeping_cols, axis=1)
    training_class_vals = input_df.loc[in_training, 'class'].values

    # forcing 0.001 <= alpha <= 0.05
    if alpha <= 0.001:
        alpha = 0.001
    elif alpha > 0.05:
        alpha = 0.05

    # Nothing to select from: hand back an untouched copy.
    if len(training_features.columns.values) == 0:
        return input_df.copy()

    with warnings.catch_warnings():
        # Ignore warnings about constant features
        warnings.simplefilter('ignore', category=UserWarning)
        fwe_selector = SelectFwe(f_classif, alpha=alpha)
        fwe_selector.fit(training_features, training_class_vals)
        kept_idx = fwe_selector.get_support(True)

    kept_names = list(training_features.iloc[:, kept_idx].columns)
    return input_df[kept_names + bookkeeping_cols].copy()
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode="fwe", param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    # All five informative features (the first five columns) must be kept.
    # The builtin ``bool`` replaces the ``np.bool`` alias, which NumPy has removed.
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    # At most one of the remaining features may be selected.
    assert np.sum(support[5:] == 1) < 2
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='fwe', param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    informative, remaining = support[:5], support[5:]
    # The first five columns are the informative ones and must all be selected.
    # ``dtype=bool`` is used because the ``np.bool`` alias no longer exists in NumPy.
    assert_array_equal(informative, np.ones((5, ), dtype=bool))
    # Fewer than two of the other columns may slip through.
    assert np.sum(remaining == 1) < 2
def test_select_fwe_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fwe heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    fwe_filter = SelectFwe(f_regression, alpha=0.01)
    X_direct = fwe_filter.fit(X, Y).transform(X)

    generic_filter = GenericUnivariateSelect(f_regression, mode='fwe', param=0.01)
    X_generic = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(X_direct, X_generic)

    support = fwe_filter.get_support()
    head, tail = support[:5], support[5:]
    # Every one of the first five features has to be selected ...
    assert np.all(head == 1)
    # ... and fewer than two of the remaining ones.
    assert np.sum(tail == 1) < 2
# In[46]: for idx in range(len(kbest.scores_)): if kbest.scores_[idx] < 2: print columns[idx], kbest.scores_[idx] kbest_result = [ columns[idx] for idx in range(len(columns) - 1) if kbest.scores_[idx] < 2 ] # perform regression without those # In[48]: fwe = SelectFwe(f_regression, alpha=0.7) fwe.fit(converted_train_array[:, :-1], converted_train_array[:, -1]) for idx in range(len(columns) - 1): if not idx in fwe.get_support(indices=True): print columns[idx] # In[49]: variance = VarianceThreshold(threshold=1) variance.fit(converted_train_array[:, :-1]) print len(variance.get_support(indices=True)) for idx in range(len(columns) - 1): if not idx in variance.get_support(indices=True): print columns[idx] variance_result = [ columns[idx] for idx in range(len(columns) - 1) if not idx in variance.get_support(indices=True)
def train_predict_and_test(model,
                           target_name,
                           train_features,
                           train_labels,
                           test_features,
                           test_labels,
                           feature_selection=None):
    """Fit the requested model, predict on the test data and print metrics.

    The pipeline runs these steps in order:

    1. standardize features (and regression labels) for the MLP and SVM models;
    2. load previously optimized hyper-parameters via ``load_optimized_params``;
    3. drop zero-variance features;
    4. apply the optional feature selection strategy;
    5. randomly oversample the training set for some classification models;
    6. dispatch to the model-specific predict helper;
    7. undo label standardization for regression outputs and print metrics.

    Parameters
    ----------
    model : member of ``Models``
        Which learner to run.
    target_name : member of ``Phenotypes``
        Target being predicted. ``DIAGNOSED_ASTHMA`` and
        ``BODY_MASS_INDEX_CATEGORICAL`` are handled as classification, every
        other target as regression.
    train_features, test_features : pandas.DataFrame
        Feature matrices (``.columns``, ``.mean()`` and ``.std()`` are used).
    train_labels, test_labels
        Target values; presumably pandas Series (``test_labels.nunique()`` is
        called for classification) -- TODO confirm against callers.
    feature_selection : {"fwe", "kbest", "tree", "corr"} or None
        Optional feature selection strategy. Any other value skips selection.

    Returns
    -------
    tuple
        ``(model, predictions)``. ``model`` is whatever the model-specific
        helper returned; for ``Models.NAIVE`` it is the ``model`` argument
        itself because that branch does not rebind it.

    Raises
    ------
    SystemExit
        If ``model`` is not a known ``Models`` member, or if the naive model
        is requested for a classification target.
    """
    classification = (target_name == Phenotypes.DIAGNOSED_ASTHMA or
                      target_name == Phenotypes.BODY_MASS_INDEX_CATEGORICAL)

    # Standardize data
    standardized = False
    if model == Models.MLP or model == Models.SVM:
        print("Standardizing data..")
        standardized = True
        # Statistics come from the training set only and are reused for the test set.
        features_mean = train_features.mean()
        features_std = train_features.std()
        train_features = (train_features - features_mean) / features_std
        test_features = (test_features - features_mean) / features_std
        if not classification:
            labels_mean = train_labels.mean()
            labels_std = train_labels.std()
            train_labels = (train_labels - labels_mean) / labels_std
            test_labels = (test_labels - labels_mean) / labels_std

    # Load optimized params
    params = load_optimized_params(model, target_name)

    # Features selection: removing features with 0 variance.
    feature_selector = VarianceThreshold(threshold=0).fit(train_features)
    # Label the transformed arrays with the surviving columns only; passing the
    # full column list fails as soon as one constant feature has been dropped.
    kept = feature_selector.get_support()
    train_col = train_features.columns[kept]
    test_col = test_features.columns[kept]
    train_features = pd.DataFrame(feature_selector.transform(train_features),
                                  columns=train_col)
    test_features = pd.DataFrame(feature_selector.transform(test_features),
                                 columns=test_col)

    if feature_selection == "fwe":
        print("Selecting features according to Familly Wise Error")
        # alpha = 5e-2
        alpha = 0.3
        if params is not None:
            try:
                alpha = params['transformer_alpha']
            except KeyError:
                print(
                    "Cannot find parameter alpha for FWE feature selector. Using default value"
                )
        # NOTE(review): f_regression is used even for classification targets,
        # unlike the "kbest" branch which switches to f_classif -- confirm
        # whether this is intended.
        features_selector = SelectFwe(f_regression,
                                      alpha=alpha).fit(train_features,
                                                       train_labels)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "kbest":
        k = 150
        if params is not None:
            try:
                k = params['k']
            except KeyError:
                print(
                    "Cannot find parameter k for k-best feature selector. Using default value: k=",
                    k)
        print("Selecting k-best features:", k)
        score_func = f_classif if classification else f_regression
        features_selector = SelectKBest(score_func=score_func, k=k)
        features_selector = features_selector.fit(train_features, train_labels)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "tree":
        print("Selecting features from RF feature importance")
        # Fit only the forest that is actually used for the selection.
        forest_cls = (RandomForestClassifier
                      if classification else RandomForestRegressor)
        clf = forest_cls(n_estimators=100).fit(train_features, train_labels)
        features_selector = SelectFromModel(clf, prefit=True)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "corr":
        threshold = 0.9  # Recommended default value
        corr_matrix = train_features.corr()
        # A column is dropped when its absolute correlation with any earlier
        # column exceeds the threshold, i.e. when its row in the strict lower
        # triangle of the correlation matrix has such an entry.
        lower_triangle = np.tril(np.abs(corr_matrix.values), k=-1)
        too_correlated = (lower_triangle > threshold).any(axis=1)
        col_corr = list(set(corr_matrix.columns[too_correlated]))
        train_features = train_features.drop(col_corr, axis=1)
        test_features = test_features.drop(col_corr, axis=1)

    # Oversampling
    if classification and model != Models.SVM and model != Models.CART and model != Models.ELASTIC:
        print("Oversampling features..")
        if target_name == Phenotypes.DIAGNOSED_ASTHMA:
            sampling_strat = 0.5
        else:
            # Per-class target counts expressed relative to the largest class.
            majority_count = np.max(np.bincount(train_labels))
            sampling_strat = {
                0: majority_count // 4,
                1: majority_count,
                2: majority_count,
                3: majority_count // 2
            }
        oversampler = imblearn.over_sampling.RandomOverSampler(
            sampling_strategy=sampling_strat, random_state=42)
        # oversampler = imblearn.over_sampling.SMOTE(sampling_strategy=1.0,
        #                                            k_neighbors=5,
        #                                            random_state=42)
        train_features, train_labels = oversampler.fit_resample(
            train_features, train_labels)

    if model == Models.RF:
        if target_name == Phenotypes.BODY_MASS_INDEX_CATEGORICAL:
            # Create validation set for threshold optimization
            val_features, test_features, val_labels, test_labels = train_test_split(
                test_features, test_labels, test_size=0.5, random_state=42)
            # NOTE(review): the helper is given the validation half, while the
            # metrics below are computed against the labels of the other half,
            # and ``params`` is not forwarded here -- confirm against
            # _predict_rf that this is intended.
            model, predictions = _predict_rf(target_name, train_features,
                                             train_labels, val_features,
                                             val_labels)
        else:
            model, predictions = _predict_rf(target_name,
                                             train_features,
                                             train_labels,
                                             test_features,
                                             test_labels,
                                             params=params)
    elif model == Models.ELASTIC:
        model, predictions = predict_elastic_net(target_name, train_features,
                                                 train_labels, test_features,
                                                 test_labels)
    elif model == Models.XGB:
        model, predictions = _predict_xgb(target_name,
                                          train_features,
                                          train_labels,
                                          test_features,
                                          test_labels,
                                          params=params)
    elif model == Models.MLP:
        model, predictions = _predict_mlp(target_name,
                                          train_features,
                                          train_labels,
                                          test_features,
                                          test_labels,
                                          params=params)
    elif model == Models.SVM:
        model, predictions = _predict_svm(target_name, train_features,
                                          train_labels, test_features,
                                          test_labels)
    elif model == Models.CART:
        model, predictions = _predict_cart(target_name, train_features,
                                           train_labels, test_features,
                                           test_labels)
    elif model == Models.NAIVE:
        if classification:
            raise SystemExit("Cannot use naive model on classification task")
        predictions = predict_naive(train_features, train_labels,
                                    test_features, test_labels)
    else:
        # A single formatted string keeps the exit message readable; passing
        # two arguments made SystemExit report a tuple.
        raise SystemExit("Unknown model: {}".format(model))

    # Destandardize results
    if standardized and not classification:
        print("destandardize data..")
        predictions = (predictions * labels_std) + labels_mean
        test_labels = (test_labels * labels_std) + labels_mean

    # Print results
    if classification:
        print_classification_metrics(ground_truth=test_labels,
                                     predictions=predictions,
                                     num_classes=test_labels.nunique())
    else:
        print_regression_metrics(ground_truth=test_labels,
                                 predictions=predictions)

    return model, predictions
import pandas as pd
from sklearn.feature_selection import SelectFwe
from sklearn.feature_selection import f_classif
from sklearn.neighbors import KNeighborsClassifier

# ``sklearn.cross_validation`` no longer exists in current scikit-learn; prefer
# its replacement and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')

# Stratified 75/25 split of the row index into training and testing rows.
training_indices, testing_indices = train_test_split(tpot_data.index,
                                                     stratify=tpot_data['class'].values,
                                                     train_size=0.75,
                                                     test_size=0.25)

# Feature selection with SelectFwe, fitted on the training rows only.
result1 = tpot_data.copy()

# 'group' and 'guess' are dropped when present; a data file that only carries
# the 'class' column no longer raises KeyError here.
training_features = result1.loc[training_indices].drop(['class', 'group', 'guess'],
                                                       axis=1,
                                                       errors='ignore')
training_class_vals = result1.loc[training_indices, 'class'].values

if len(training_features.columns.values) == 0:
    # No feature columns to select from: keep the data as it is.
    result1 = result1.copy()
else:
    selector = SelectFwe(f_classif, alpha=0.05)
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result1 = result1[mask_cols]

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(8, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values,
          result1.loc[training_indices, 'class'].values)

# Predictions are stored for every row, training and testing alike.
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)