class f_regressionFPRPrim(primitive):
    """Feature-selection primitive built on ``SelectFpr(f_regression)``.

    ``fit`` trains the selector on ``data['X']`` / ``data['Y']`` and
    ``produce`` returns the reduced feature frame under key ``0``.
    """

    def __init__(self, random_state=0):
        # ``random_state`` is accepted but not used anywhere in this class.
        super(f_regressionFPRPrim, self).__init__(name='f_regressionFPR')
        self.id = 29
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = (
            "Filter: Select the pvalues below alpha based on a FPR test "
            "with F-value between label/feature for regression tasks. "
            "FPR test stands for False Positive Rate test. "
            "It controls the total amount of false detections."
        )
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_c`` for regression."""
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        """Return True only when ``data['X']`` has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Create a fresh selector and fit it on the prepared data."""
        prepared = handle_data(data)
        self.selector = SelectFpr(f_regression)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Apply the fitted selector and return ``{0: output}``.

        ``output['X']`` is replaced by a new DataFrame holding only the
        selected columns, labelled with their original names.
        """
        output = handle_data(data)
        all_names = list(output['X'].columns)
        keep = self.selector.get_support(indices=False)
        kept_names = [name for name, flag in zip(all_names, keep) if flag]
        reduced = self.selector.transform(output['X'])
        output['X'] = pd.DataFrame(reduced, columns=kept_names)
        return {0: output}
def test_boundary_case_ch2():
    """Boundary case: every selector should keep exactly the first feature."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Same order as the original checks: FDR, k-best, percentile, FPR, FWE.
    selectors = [
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    ]
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))
def selectionFwe(X, y, paramlist):
    """Run a chi2-based ``SelectFpr`` and return its outputs as a list.

    :param X: feature matrix passed to ``fit_transform``
    :param y: target values passed to ``fit_transform``
    :param paramlist: mapping read at key ``'number _of_features'``
    :return: ``[transformed X, selected column indices, scores_]``
    """
    # NOTE(review): scikit-learn documents SelectFpr(score_func, alpha=...);
    # the ``k`` keyword is presumably rejected at construction. The function
    # name suggests SelectFwe and the parameter suggests SelectKBest - confirm
    # the intended selector before changing this call.
    n_features = paramlist['number _of_features']
    selector = SelectFpr(chi2, k=n_features)
    reduced = selector.fit_transform(X, y)
    selected_idx = selector.get_support(indices=True)
    return [reduced, selected_idx, selector.scores_]
def test_select_fpr_classif():
    """SelectFpr and GenericUnivariateSelect(mode="fpr") must agree.

    The data set is unshuffled, and the test expects exactly the first five
    of the twenty features to be selected.
    """
    dataset_options = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X, Y = make_classification(**dataset_options)

    univariate_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, Y).transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode="fpr", param=0.0001)
    X_r2 = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    # Expected mask: ones for the first five columns, zeros elsewhere.
    expected_support = np.concatenate([np.ones(5), np.zeros(15)])
    assert_array_equal(univariate_filter.get_support(), expected_support)
def evaluate_model(classifier, data_records, class_labels, labels):
    """Cross-validate ``classifier`` on growing sets of top-scored attributes.

    Attributes are scored with ``SelectFpr(f_classif, alpha=0.9)``. For every
    ``idx`` in ``range(2, n_attributes)`` the ``idx`` highest-scoring columns
    of ``data_records`` are evaluated with 5-fold cross-validation.

    :param classifier: estimator handed to ``cross_val_score``
    :param data_records: 2-D array indexed as ``data_records[:, columns]``
    :param class_labels: target labels
    :param labels: attribute names, indexed by column position
    :return: ``(attribute_values, accuracy_values)``; the accuracy stored for
        each attribute count is the best fold score times 100
    """
    attribute_values = []
    accuracy_values = []

    # Score the attributes using the F-test behind a false-positive-rate filter.
    clf = SelectFpr(f_classif, alpha=0.9)
    clf.fit(data_records, class_labels)
    print(clf.scores_)
    print('\n')

    # Column indices ordered from highest to lowest score. The previous code
    # stored each attribute's rank at its own position (the inverse
    # permutation) and then used those ranks as column indices, so the
    # selected columns did not follow the score ordering.
    scores = clf.scores_
    ranked_attr_indices = sorted(
        range(len(scores)), key=lambda col: scores[col], reverse=True)

    # 5-fold cross-validation (cv=5) against a varying number of attributes.
    for idx in range(2, len(ranked_attr_indices)):
        chosen = ranked_attr_indices[:idx]
        filtered_records = data_records[:, chosen]
        for col in chosen:
            print(labels[col])
        validation_score = cross_validation.cross_val_score(
            classifier, filtered_records, class_labels, cv=5)
        accuracy = max(validation_score) * 100
        attribute_values.append(idx)
        accuracy_values.append(accuracy)
        print('Cross validation score - ' + str(idx) + ' attributes :'
              + str(validation_score) + '\n')
    return (attribute_values, accuracy_values)
def test_select_fpr_classif():
    """Check the fpr heuristic on a simple classification problem.

    ``SelectFpr`` and ``GenericUnivariateSelect(mode='fpr')`` must return the
    same columns, and the support must be exactly the first five features.
    """
    n_features = 20
    alpha = 0.0001
    X, y = make_classification(
        n_samples=200, n_features=n_features, n_informative=3,
        n_redundant=2, n_repeated=0, n_classes=8,
        n_clusters_per_class=1, flip_y=0.0, class_sep=10,
        shuffle=False, random_state=0)

    univariate_filter = SelectFpr(f_classif, alpha=alpha)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (GenericUnivariateSelect(f_classif, mode='fpr', param=alpha)
            .fit(X, y)
            .transform(X))
    assert_array_equal(X_r, X_r2)

    gtruth = np.zeros(n_features)
    gtruth[:5] = 1
    assert_array_equal(univariate_filter.get_support(), gtruth)
def fit(self, X, y, sample_weight=None):
    """Fit the parent model, optionally after univariate feature selection.

    When ``self.univariate_feature_selection`` is set, columns not flagged in
    ``self.allow_missing_ids`` are filtered with ``SelectFpr(alpha=0.05)``;
    flagged columns are always kept. The resulting boolean mask is stored in
    ``self.support``. After fitting, the mean of the parent's prediction is
    stored per class in ``self.z_means``.

    :param X: 2-D array of features
    :param y: labels
    :param sample_weight: optional weights forwarded to the parent ``fit``
    :return: ``self``
    """
    n_features = X.shape[1]
    if self.allow_missing_ids is None:
        self.allow_missing_ids = np.zeros(n_features).astype(bool)

    if self.univariate_feature_selection:
        # Only the columns that are not allowed to be missing are tested.
        testable = ~self.allow_missing_ids
        selector = SelectFpr(alpha=0.05).fit(X[:, testable], y)
        mask = np.ones(n_features).astype(bool)
        mask[testable] = selector.get_support()
        self.support = mask
        X = X[:, mask]
    else:
        self.support = np.ones(n_features).astype(bool)

    # Fit the underlying model; the third argument is a one-element list
    # holding the number of rows.
    super().fit(X, y, [len(X)], sample_weight=sample_weight)

    # Mean of z for each level of y.
    encoder = LabelEncoder().fit(y)
    self.label_encoder = encoder
    self.classes_ = encoder.classes_
    z = super().predict(X).astype(float)
    per_class_means = [z[y == cl].mean() for cl in encoder.classes_]
    self.z_means = np.array(per_class_means)
    return self
def test_boundary_case_ch2():
    """Boundary case in which each selector is expected to keep one feature."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    def assert_first_feature_only(selector):
        # Fit the selector and require the mask [True, False].
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))

    assert_first_feature_only(SelectFdr(chi2, alpha=0.1))
    assert_first_feature_only(SelectKBest(chi2, k=1))
    assert_first_feature_only(SelectPercentile(chi2, percentile=50))
    assert_first_feature_only(SelectFpr(chi2, alpha=0.1))
    assert_first_feature_only(SelectFwe(chi2, alpha=0.1))
def select_with_fpr(train, test):
    """Select features with ``SelectFpr(alpha=0.001)`` and write CSV files.

    The selector is fitted on ``train`` without its ``ID`` and ``TARGET``
    columns. The surviving columns are then written, together with ``ID``
    (and ``TARGET`` for the training frame), to ``train_after_fpr.csv`` and
    ``test_after_fpr.csv``.

    :param train: DataFrame containing ``ID``, ``TARGET`` and feature columns
    :param test: DataFrame containing ``ID`` and the same feature columns
    :return: None
    """
    train_data = train.drop('ID', axis=1)
    train_y = train_data['TARGET']
    # ``axis`` is passed by keyword: pandas 2 no longer accepts it positionally.
    train_X = train_data.drop('TARGET', axis=1)

    fpr = SelectFpr(alpha=0.001)
    selected = fpr.fit_transform(train_X, train_y)
    print('Fpr выбрал {} признаков.'.format(selected.shape[1]))

    # Map the boolean support mask onto the columns that were actually fitted,
    # instead of assuming TARGET is the last column of the frame.
    features = train_X.columns[fpr.get_support()].tolist()

    new_train = train[['ID'] + features + ['TARGET']]
    new_train.to_csv('train_after_fpr.csv')
    new_test = test[['ID'] + features]
    new_test.to_csv('test_after_fpr.csv')
class UnivariateSelectChiFPRPrim(primitive):
    """Feature-selection primitive built on ``SelectFpr(chi2, alpha=0.05)``.

    ``fit`` trains the selector on ``data['X']`` / ``data['Y']``;
    ``produce`` returns the (possibly reduced) data under key ``0``.
    """

    def __init__(self, random_state=0):
        # ``random_state`` is accepted but not used anywhere in this class.
        super(UnivariateSelectChiFPRPrim, self).__init__(name='UnivariateSelectChiFPR')
        self.id = 27
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = (
            "Filter: Select the pvalues below alpha based on a FPR test "
            "with Chi-square. "
            "FPR test stands for False Positive Rate test. "
            "It controls the total amount of false detections."
        )
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_d`` for classification."""
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        """Return True only when ``data['X']`` has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Create a fresh chi2 selector and fit it on the prepared data."""
        prepared = handle_data(data)
        self.selector = SelectFpr(chi2, alpha=0.05)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Apply the fitted selector and return ``{0: output}``.

        Any exception raised while selecting is printed and swallowed, in
        which case ``output['X']`` is returned unchanged.
        """
        output = handle_data(data)
        all_names = list(output['X'].columns)
        try:
            keep = self.selector.get_support(indices=False)
            kept_names = [name for name, flag in zip(all_names, keep) if flag]
            reduced = self.selector.transform(output['X'])
            output['X'] = pd.DataFrame(reduced, columns=kept_names)
        except Exception as e:
            print(e)
        return {0: output}
def fit(self, X, y, sample_weight=None):
    """Fit an L1-penalised logistic model with ``scipy.optimize.minimize``.

    Labels are encoded with a ``LabelEncoder``; encoded class 0 is mapped to
    -1 before optimisation. When ``self.univariate_feature_selection`` is
    set, columns not flagged in ``self.allow_missing_ids`` are filtered with
    ``SelectFpr(alpha=0.05)`` and ``self.bounds`` (if given) is reduced to
    the surviving columns.

    :param X: 2-D array of features
    :param y: labels
    :param sample_weight: optional per-sample weights; when omitted they are
        derived from ``self.class_weight`` or default to ones. The caller's
        array is not modified.
    :return: ``self`` with ``coef_`` (shape ``(1, n_features)``),
        ``intercept_`` and ``opt_res`` set
    """
    self.label_encoder = LabelEncoder().fit(y)
    self.classes_ = self.label_encoder.classes_
    y = self.label_encoder.transform(y)

    if self.allow_missing_ids is None:
        self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

    if self.univariate_feature_selection:
        # Univariate feature selection on the columns that may not be missing.
        feature_selector = SelectFpr(alpha=0.05).fit(
            X[:, ~self.allow_missing_ids], y)
        self.support = np.ones(X.shape[1]).astype(bool)
        self.support[~self.allow_missing_ids] = feature_selector.get_support()
        X = X[:, self.support]
        if self.bounds is not None:
            # NOTE(review): this overwrites self.bounds, so a second fit()
            # would filter an already-filtered list - confirm refitting is
            # not expected.
            self.bounds = [
                self.bounds[ii] for ii in range(len(self.bounds))
                if self.support[ii]
            ]
    else:
        self.support = np.ones(X.shape[1]).astype(bool)

    def func(w, X, y, alpha, sw):
        # Unpenalised logistic loss/gradient plus an L1 term on every
        # weight except the last entry (the intercept).
        out, grad = _logistic_loss_and_grad(w, X, y, 0, sw)
        out_penalty = alpha * np.sum(np.abs(w[:-1]))
        grad_penalty = np.r_[alpha * np.sign(w[:-1]), 0]
        return out + out_penalty, grad + grad_penalty

    y2 = np.array(y)
    y2[y2 == 0] = -1
    w0 = np.r_[np.random.randn(X.shape[1]) / 10, 0.]

    if self.bounds is None:
        method = 'BFGS'
        # Previously ``self.bounds + [(None, None)]`` was evaluated even
        # here and raised TypeError on ``None + list``.
        bounds = None
    else:
        method = 'L-BFGS-B'
        # The intercept is left unbounded.
        bounds = self.bounds + [(None, None)]

    if sample_weight is None:
        if self.class_weight is not None:
            sample_weight = get_sample_weights(
                y, class_weight=self.class_weight)
        else:
            sample_weight = np.ones(len(X))
    # Normalise into a new float array: in-place division mutated the
    # caller's array and failed for integer weights.
    sample_weight = np.asarray(sample_weight, dtype=float)
    sample_weight = sample_weight / (np.mean(sample_weight) * len(X))

    self.opt_res = minimize(func,
                            w0,
                            method=method,
                            jac=True,
                            args=(X, y2, 1. / self.C, sample_weight),
                            bounds=bounds,
                            options={
                                "gtol": self.tol,
                                "maxiter": self.max_iter
                            })

    # Scatter the fitted weights back to the full feature space; dropped
    # features keep a zero coefficient.
    coef = np.zeros(len(self.support))
    coef[self.support] = self.opt_res.x[:-1]
    self.coef_ = coef.reshape(1, -1)
    self.intercept_ = self.opt_res.x[-1].reshape(1, )
    return self
def SelectFpr_selector(data, target, sf):
    """Reduce ``data`` to the columns kept by ``SelectFpr(score_func=sf)``.

    :param data: DataFrame of features
    :param target: object whose ``.values.ravel()`` yields the labels
    :param sf: score function handed to ``SelectFpr``
    :return: new DataFrame with the selected columns and their original names
    """
    selector = SelectFpr(score_func=sf)
    reduced = selector.fit_transform(data.values, target.values.ravel())
    kept_positions = selector.get_support(True)
    all_names = data.columns.values
    kept_names = [all_names[pos] for pos in kept_positions]
    return pd.DataFrame(reduced, columns=kept_names)
def feature_SelectFpr(x_data, y_data):
    """Return the 20 features with the largest ``f_classif`` score.

    :param x_data: DataFrame of features
    :param y_data: labels
    :return: DataFrame with columns ``Specs`` (feature name) and ``Score``,
        limited to the 20 largest scores
    """
    fitted = SelectFpr(f_classif, alpha=0.01).fit(x_data, y_data)
    name_frame = pd.DataFrame(x_data.columns)
    score_frame = pd.DataFrame(fitted.scores_)
    score_table = pd.concat([name_frame, score_frame], axis=1)
    score_table.columns = ['Specs', 'Score']
    return score_table.nlargest(20, 'Score')
def fit(self, X, y, sample_weight=None):
    """Fit the wrapped estimator on pairs generated from ``X`` and ``y``.

    Optionally filters columns with ``SelectFpr(alpha=0.05)`` first (columns
    flagged in ``self.allow_missing_ids`` are always kept), normalises the
    sample weights in place, builds pairs via ``self._generate_pairs`` and
    fits ``self.estimator`` on them. ``self.fitted_`` is False while fitting
    and True on success.

    :param X: 2-D array of features
    :param y: labels
    :param sample_weight: optional per-sample weights (normalised in place)
    :return: ``self``
    """
    self.fitted_ = False
    n_features = X.shape[1]
    if self.allow_missing_ids is None:
        self.allow_missing_ids = np.zeros(n_features).astype(bool)
    # Copy of the unfiltered input, used later by predict_z.
    Xold = np.array(X)

    if self.univariate_feature_selection:
        testable = ~self.allow_missing_ids
        selector = SelectFpr(alpha=0.05).fit(X[:, testable], y)
        self.support = np.ones(n_features).astype(bool)
        self.support[testable] = selector.get_support()
        X = X[:, self.support]
        self.allow_missing_ids = self.allow_missing_ids[self.support]
    else:
        self.support = np.ones(n_features).astype(bool)

    if sample_weight is None and self.class_weight is not None:
        sample_weight = get_sample_weights(y, class_weight=self.class_weight)
    elif sample_weight is None:
        sample_weight = np.ones(len(X))
    sample_weight /= (np.mean(sample_weight) * len(X))

    # Generate pairs and rescale their weights to mean one.
    X2, y2, sw2 = self._generate_pairs(X, y, sample_weight)
    sw2 = sw2 / sw2.mean()
    if self.verbose:
        print('Generated %d pairs from %d samples' % (len(X2), len(X)))

    # Restrict the estimator's bounds to the selected features, then fit.
    estimator_bounds = self.estimator.bounds
    if estimator_bounds is not None:
        self.estimator.bounds = [
            estimator_bounds[ii] for ii in range(len(estimator_bounds))
            if self.support[ii]
        ]
    self.estimator.fit(X2, y2, sample_weight=sw2)

    # Mean of z for each level of y, computed on the unfiltered copy.
    self.label_encoder = LabelEncoder().fit(y)
    self.classes_ = self.label_encoder.classes_
    z = self.predict_z(Xold)
    self.z_means = np.array(
        [z[y == cl].mean() for cl in self.label_encoder.classes_])

    # Scatter the fitted weights back to the full feature space.
    full_coef = np.zeros(len(self.support))
    full_coef[self.support] = self.estimator.coef_.flatten()
    self.coef_ = full_coef.reshape(1, -1)
    self.intercept_ = self.estimator.intercept_
    self.fitted_ = True
    return self
def test_clone_2():
    """Check that ``clone`` does not copy attributes added after construction.

    An extra attribute is attached to a fresh estimator; the clone must not
    carry it.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    original.own_attribute = "test"
    duplicate = clone(original)
    carries_attribute = hasattr(duplicate, "own_attribute")
    assert_false(carries_attribute)
def test_clone_2():
    """``clone`` must drop attributes that were set manually on an estimator."""
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    # Attribute that is not a constructor parameter.
    setattr(original, "own_attribute", "test")
    duplicate = clone(original)
    assert not hasattr(duplicate, "own_attribute")
def test_clone():
    """Tests that clone creates a correct deep copy.

    We create an estimator, make a copy of its original state (which, in
    this case, is the current state of the estimator), and check that the
    obtained copy is a correct deep copy.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
    # Compare through the public ``get_params()`` accessor rather than the
    # private ``_get_params()``.
    assert_equal(selector.get_params(), new_selector.get_params())
def test_clone():
    """Check that ``clone`` returns a distinct estimator with equal parameters.

    The estimator is cloned straight after construction, so its original
    state is its current state.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    duplicate = clone(original)
    is_distinct = original is not duplicate
    assert_true(is_distinct)
    original_params = original.get_params()
    duplicate_params = duplicate.get_params()
    assert_equal(original_params, duplicate_params)
def test_select_fpr_int(self):
    """Convert a fitted default ``SelectFpr`` on int64 data and dump it."""
    X = np.array(
        [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
        dtype=np.int64)
    y = np.array([0, 1, 0, 1])
    model = SelectFpr()
    model.fit(X, y)

    # Input declared with an unspecified first dimension.
    initial_types = [("input", Int64TensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model, "select fpr", initial_types, target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(X, model, model_onnx, basename="SklearnSelectFpr")
def test_select_fpr_int(self):
    """Convert a fitted default ``SelectFpr`` on integer data and dump it."""
    X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
    y = np.array([0, 1, 0, 1])
    model = SelectFpr()
    model.fit(X, y)

    # Input declared with a first dimension of 1.
    initial_types = [('input', Int64TensorType([1, X.shape[1]]))]
    model_onnx = convert_sklearn(model, 'select fpr', initial_types)
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnSelectFpr",
        allow_failure=failure_condition)
def train_decisiontree_FPR(configurationname, train_data, score_function,
                           undersam=False, oversam=False, export=False):
    """Train a decision tree on features selected by ``SelectFpr``.

    :param configurationname: label used in log output and the .dot file name
    :param train_data: triple ``(X_train, y_train, id_to_a_train)``
    :param score_function: score function handed to ``SelectFpr``
    :param undersam: apply under-sampling (combined with ``oversam`` this
        selects SMOTEENN)
    :param oversam: apply over-sampling
    :param export: currently not consulted (see note below)
    :return: ``(selector, dtc)`` - the fitted selector and classifier
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(result.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif oversam:
        resampler = SMOTE(random_state=42)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    # NOTE(review): the ``if export:`` guard is commented out in the original,
    # so the tree is exported on every call regardless of ``export``.
    print("Exporting decision tree image...")
    dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
    export_graphviz(dtc, out_file=dot_path, filled=True)
    transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def build_model(clf="log_reg",
                train_reader=sick_train_reader,
                feature_vectorizer=DictVectorizer(sparse=True),
                features=None,
                feature_selector=SelectFpr(chi2, alpha=0.05),
                file_name=None,
                load_vec=None,
                compression=None):
    '''Build the pipeline of choice and load its training vectors.

    The pipeline is: vectorizer -> feature selector -> optional compression
    step (looked up in ``_models``) -> classifier (looked up in ``_models``).

    Returns ``(clf_pipe, feat_vec, labels)`` where the last two come from
    ``obtain_vectors``.
    '''
    # NOTE(review): the default vectorizer and selector are created once at
    # definition time, so calls relying on the defaults share the same
    # instances - confirm that is intended.
    #
    # An RFE selector was tried in place of ``feature_selector``:
    #   RFE(LogisticRegression(solver='lbfgs'),
    #       n_features_to_select=5000, step=0.05)
    steps = [
        ('dict_vector', feature_vectorizer),
        ('feature_selector', feature_selector),
    ]
    if compression:
        steps.append(('compression', _models[compression]))
    steps.append(('clf', _models[clf]))
    clf_pipe = Pipeline(steps)

    feat_vec, labels = obtain_vectors(file_name, load_vec, train_reader,
                                      features)
    return clf_pipe, feat_vec, labels
def get_ensemble_model(w2v=None):
    """Build the stacked ensemble pipeline.

    Steps: feature extraction -> ``SelectFpr(f_classif)`` -> probabilities
    from four base classifiers -> degree-2 polynomial features -> grid
    searched logistic regression (grid taken from the module-level
    ``params``).

    :param w2v: word-vector mapping; when falsy it is loaded via ``Glove``
    :return: the unfitted ``Pipeline``
    """
    if not w2v:
        w2v = Glove.load().get_dict()
    n_jobs = -1

    feature_extraction = get_features(w2v)
    # False-positive-rate test for feature selection.
    feature_selection = SelectFpr(f_classif)
    base_classifiers = [
        RandomForestClassifier(n_estimators=300, max_depth=10,
                               min_samples_split=5, n_jobs=n_jobs),
        XGBClassifier(n_estimators=300, max_depth=10, n_jobs=8),
        LogisticRegression(C=0.1, solver='lbfgs', penalty='l2',
                           n_jobs=n_jobs),
        BernoulliNB(alpha=5.0),
    ]
    final_search = GridSearchCV(
        LogisticRegression(penalty='l2', random_state=42),
        param_grid=params)

    return Pipeline([
        ('feature_extraction', feature_extraction),
        ('feature_selection', feature_selection),
        ('proba', ProbExtractor(base_classifiers)),
        ('polynomial', PolynomialFeatures(degree=2)),
        ('logistic_regression', final_search),
    ])
def test_select_heuristics_regression():
    """Check the fpr, fdr and fwe heuristics on a simple regression problem.

    Each ``GenericUnivariateSelect`` mode must return the same columns as
    ``SelectFpr``. The five informative features (the first five, since the
    data is unshuffled) must all be selected, with fewer than three of the
    remaining features kept.
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # The ``np.bool`` alias no longer exists in NumPy; use builtin bool.
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert np.sum(support[5:] == 1) < 3
def test_clone():
    """Check that ``clone`` yields a distinct estimator with equal parameters.

    The second case uses an array-valued ``alpha`` and only checks that the
    clone is a different object.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    scalar_selector = SelectFpr(f_classif, alpha=0.1)
    scalar_clone = clone(scalar_selector)
    assert scalar_selector is not scalar_clone
    assert scalar_selector.get_params() == scalar_clone.get_params()

    array_selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    array_clone = clone(array_selector)
    assert array_selector is not array_clone
def test_clone():
    """Check that ``clone`` yields a distinct estimator with equal parameters.

    A second estimator with an array-valued ``alpha`` is cloned as well; for
    that one only object identity is checked.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    duplicate = clone(original)
    assert original is not duplicate
    assert_equal(original.get_params(), duplicate.get_params())

    array_alpha = np.zeros((10, 2))
    original = SelectFpr(f_classif, alpha=array_alpha)
    duplicate = clone(original)
    assert original is not duplicate
def get_feature_extractor(w2v=None):
    """Build the feature pipeline: extraction followed by ``SelectFpr``.

    :param w2v: word-vector mapping; when falsy it is loaded via ``Glove``
    :return: the unfitted two-step ``Pipeline``
    """
    if not w2v:
        w2v = Glove.load().get_dict()
    steps = [
        ("feature_extraction", get_features(w2v)),
        ('feature_selection', SelectFpr(f_classif)),
    ]
    return Pipeline(steps)
def test_select_fpr_regression():
    """Check the fpr heuristic on a simple regression problem.

    ``SelectFpr`` and ``GenericUnivariateSelect(mode="fpr")`` must return the
    same columns. The first five features must all be selected and fewer
    than three of the remaining ones may be kept.
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    generic_filter = GenericUnivariateSelect(f_regression, mode="fpr",
                                             param=0.01)
    X_r2 = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    leading, trailing = support[:5], support[5:]
    assert (leading == 1).all()
    assert np.sum(trailing == 1) < 3
def feature_Univarselection(data, y, Alpha):
    """Filter features with ``SelectFpr(f_classif)`` and keep the frame shape.

    Both inputs are sorted by ``'pid'``; labels are read from the column
    named by the module-level ``sep``. The reduced matrix is passed back
    through ``inverse_transform``, so the result has the same index and
    columns as the sorted input.

    :param data: DataFrame of features, sortable by ``'pid'``
    :param y: DataFrame of labels, sortable by ``'pid'``
    :param Alpha: alpha handed to ``SelectFpr``
    :return: DataFrame with the sorted input's index and columns
    """
    # Alternatives tried here previously: SelectFdr, SelectFwe,
    # SelectKBest(chi2) and SelectFromModel(Lasso()).
    ordered = data.sort_values('pid')
    xx = ordered.values
    xx_label = y.sort_values('pid')[sep].values

    select = SelectFpr(f_classif, alpha=Alpha).fit(xx, xx_label)
    reduced_xx = select.transform(xx)
    # NOTE(review): presumably fills the dropped columns with zeros - confirm
    # against the scikit-learn documentation.
    restored = select.inverse_transform(reduced_xx)
    return pd.DataFrame(restored, index=ordered.index,
                        columns=ordered.columns)
def test_select_heuristics_regression():
    """Check the fpr, fdr and fwe heuristics on a simple regression problem.

    Every ``GenericUnivariateSelect`` mode must agree with ``SelectFpr``.
    The first five (informative) features must be selected and fewer than
    three of the others may be kept.
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    for mode in ['fdr', 'fpr', 'fwe']:
        generic_filter = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01)
        X_r2 = generic_filter.fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # Builtin ``bool`` replaces the removed ``np.bool`` alias.
        assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
        assert np.sum(support[5:] == 1) < 3
def test_verbose_output_for_select_select_fpr():
    """The verbose output of ``SelectFpr(chi2, alpha=0.5)`` must match exactly."""
    selector = SelectFpr(chi2, alpha=0.5)
    captured = _capture_verbose_output_for_model(
        selector, use_supervised_df=True)
    assert captured == (
        "The p-value of column 'B' (1.0000) is above the "
        "specified alpha of 0.5000"
    )
def test_select_fpr_regression():
    """Check the fpr heuristic on a simple regression problem.

    ``SelectFpr`` must agree with ``GenericUnivariateSelect(mode='fpr')``,
    select all of the first five features and keep fewer than three of the
    rest.
    """
    alpha = 0.01
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectFpr(f_regression, alpha=alpha)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (GenericUnivariateSelect(f_regression, mode='fpr', param=alpha)
            .fit(X, y)
            .transform(X))
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    informative_kept = (support[:5] == 1).all()
    noise_kept = np.sum(support[5:] == 1)
    assert informative_kept
    assert noise_kept < 3
def selectFpr(args):
    """Build a scikit-learn ``SelectFpr`` selector from positional arguments.

    Parameters
    ----------
    args : sequence
        ``args[1]`` is the alpha threshold (anything ``float()`` accepts).
        ``args[2]`` names the score function: ``"chi2"`` or ``"f_classif"``.
        ``args[0]`` is not used.

    Returns
    -------
    SelectFpr
        The unfitted selector.

    Raises
    ------
    ValueError
        If ``args[2]`` is not a supported score-function name. Previously
        this case failed with an UnboundLocalError.
    """
    score_name = args[2]
    if score_name == "chi2":
        score_func = chi2
    elif score_name == "f_classif":
        score_func = f_classif
    else:
        raise ValueError(
            "Unsupported score function %r; expected 'chi2' or 'f_classif'"
            % (score_name,))
    return SelectFpr(score_func, alpha=float(args[1]))
def test_select_fpr_float(self):
    """Convert a fitted default ``SelectFpr`` on float32 data and dump it."""
    X = np.array(
        [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
        dtype=np.float32,
    )
    y = np.array([0, 1, 0, 1])
    model = SelectFpr()
    model.fit(X, y)

    # Input declared with a first dimension of 1.
    initial_types = [("input", FloatTensorType([1, X.shape[1]]))]
    model_onnx = convert_sklearn(model, "select fpr", initial_types)
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnSelectFpr",
        allow_failure=failure_condition,
    )
def feature_method_selection(data, label, fsname):
    """
    Select features using the method named by ``fsname``.

    :param data: feature matrix
    :param label: labels (some methods convert them to numbers first)
    :param fsname: one of 'variance_threshold', 'select_kbest', 'rfe',
        'rfecv', 'RandLasso', 'linear_svc', 'tree', 'fclassif', 'RandForReg'
    :return: new_data, the selected data
    :raises NotImplementedError: for 'pearsonr', which yields no model
    :raises ValueError: for an unknown ``fsname``
    """
    if fsname == 'variance_threshold':
        # Drop features that barely vary; meant for discrete values.
        model = VarianceThreshold()  # th=1
        return model.fit_transform(data)
    elif fsname == 'select_kbest':
        # Feature values must be non-negative; chi2 is for classification.
        model = SelectKBest(chi2, k=10)
    elif fsname == 'rfe':
        # Recursive elimination; very time-consuming.
        svc = SVC(kernel='linear', C=1)
        model = RFE(estimator=svc, n_features_to_select=10, step=1)
    elif fsname == 'rfecv':
        # RFE with cross-validation; labels must be numeric.
        # The result was previously bound to ``rfecv``, leaving ``model``
        # undefined for the fit below.
        # NOTE(review): StratifiedKFold(label, 1) looks like a legacy call
        # signature - confirm against the installed scikit-learn.
        svc = SVC(kernel="linear")
        model = RFECV(estimator=svc, step=1,
                      cv=StratifiedKFold(label, 1), scoring='accuracy')
    elif fsname == 'RandLasso':
        # Randomised re-selection;
        # "cannot perform reduce with flexible type".
        model = RandomizedLogisticRegression()
    elif fsname == 'linear_svc':
        model = LinearSVC()  # has no importance attribute
    elif fsname == 'tree':
        model = ExtraTreesClassifier()
    elif fsname == 'fclassif':
        # Default score is f_classif; larger values mean more useful features.
        model = SelectFpr()
    elif fsname == 'pearsonr':
        # This branch only ever computed a correlation and then failed on an
        # undefined ``model``; fail explicitly instead.
        raise NotImplementedError(
            "fsname 'pearsonr' does not provide a fit/transform model")
    elif fsname == 'RandForReg':
        # Labels must be numeric.
        label = turn_label_2num(label)
        model = RandomForestRegressor()
    else:
        logging.error('ERROR: feature selection option is wrong')
        raise ValueError(
            'unknown feature selection option: %r' % (fsname,))

    model.fit(data, label)
    new_data = model.transform(data)  # the selected, important data
    return new_data
def fval(df, y, alpha, k):
    """Keep the features selected by both the chi2 and f_classif FPR filters.

    The chi2 filter is fitted on ``df`` after ``scale`` and ``MinMaxScaler``;
    the f_classif filter is fitted on ``df`` directly.

    :param df: dataframe
    :param y: label
    :param alpha: hyper-parameter [alpha] for both ``SelectFpr`` filters
    :param k: number of select features (not used by this function)
    :return: dataframe restricted to the features both filters selected
    """
    column_names = df.columns

    def names_for(mask):
        # Column names whose entry in ``mask`` is truthy.
        return [name for name, flag in zip(column_names, mask) if flag]

    x_bin = MinMaxScaler().fit_transform(scale(df))
    select_chi2 = SelectFpr(chi2, alpha=alpha).fit(x_bin, y)
    select_f_classif = SelectFpr(f_classif, alpha=alpha).fit(df, y)

    chi2_selected = select_chi2.get_support()
    f_classif_selected = select_f_classif.get_support()

    logging.info('Chi2 selected {} features {}.'.format(
        chi2_selected.sum(), names_for(chi2_selected)))
    logging.info('F_classif selected {} features {}.'.format(
        f_classif_selected.sum(), names_for(f_classif_selected)))

    selected = chi2_selected & f_classif_selected
    logging.info('Chi2 & F_classif selected {} features'.format(
        selected.sum()))
    features = names_for(selected)
    logging.info(features)
    return df[features]
def multisplit(skf, X, y, stepsize=1000):
    """Cross-validate a two-stage classifier built on column blocks of ``X``.

    For each fold, one ``plib.classif`` model is trained per block of
    ``stepsize`` columns. Their decision-function outputs are concatenated,
    filtered with a default ``SelectFpr`` and fed to an
    ``AdaBoostClassifier``.

    :param skf: iterable of ``(train_index, test_index)`` pairs that also
        exposes ``n_folds``
    :param X: 2-D feature array
    :param y: label array
    :param stepsize: number of columns per block
    :return: mean accuracy over the folds
    """
    total_score = 0
    block_starts = range(0, len(X[0]), stepsize)
    for train_index, test_index in skf:
        wl = []
        pred1 = np.matrix([])

        # Training: one weak learner per column block.
        for x in block_starts:
            clf1 = plib.classif(X[train_index, x:x + stepsize],
                                y[train_index])
            tmp_p = np.matrix(
                clf1.decision_function(X[train_index, x:x + stepsize]))
            if pred1.size == 0:
                pred1 = tmp_p
            else:
                pred1 = np.concatenate((pred1, tmp_p), axis=1)
            wl.append(clf1)

        selectf = SelectFpr().fit(pred1, y[train_index])
        clf3 = AdaBoostClassifier(n_estimators=100)
        clf3.fit(selectf.transform(pred1), y[train_index])

        # Testing: reuse each weak learner on its own column block.
        predtest = np.matrix([])
        for learner, x in zip(wl, block_starts):
            tmp_p = np.matrix(
                learner.decision_function(X[test_index, x:x + stepsize]))
            if predtest.size == 0:
                predtest = tmp_p
            else:
                predtest = np.concatenate((predtest, tmp_p), axis=1)

        # Final prediction.
        predfinal = clf3.predict(selectf.transform(predtest))
        # Single-argument print() gives the same text on Python 2 and 3; the
        # double space reproduces the old ``print "Target : ", value`` output.
        print("Target :  %s" % (y[test_index],))
        print("Prediction :  %s" % (predfinal,))
        matchs = np.equal(predfinal, y[test_index])
        score = np.divide(np.sum(matchs), np.float64(matchs.size))
        total_score = score + total_score
    return np.divide(total_score, skf.n_folds)
def correlation(df, y, threshold, alpha, corr_k_pass, mode):
    """Feature selection based on correlation between features.

    Columns are visited in ranking order. A column is kept unless an earlier
    visited column is correlated with it beyond ``threshold`` in absolute
    value.

    :param df: dataframe
    :param y: label
    :param threshold: absolute correlation above which columns are dropped
    :param alpha: hyper-parameter [alpha]; when truthy, columns are ranked by
        the ``SelectFpr`` score (highest first), otherwise in frame order
    :param corr_k_pass: when truthy, keep at most this many columns
    :param mode: score function, ``"chi2"`` or ``"f"`` (f_classif)
    :return: dataframe of feature selected
    :raises ValueError: if ``mode`` is not supported
    """
    df_out = df.corr()

    if mode == "chi2":
        filter_slect = chi2
    elif mode == "f":
        filter_slect = f_classif
    else:
        # The message had no placeholder, so the formatting itself raised
        # TypeError instead of reporting the bad mode.
        raise ValueError("No mode: %s" % mode)

    if alpha:
        x_bin = MinMaxScaler().fit_transform(scale(df))
        fpval = SelectFpr(filter_slect, alpha=alpha).fit(x_bin, y).scores_
        df_sort_fval = pd.DataFrame({
            "col": list(df.columns),
            "fval": list(fpval)
        })
        df_sort_fval = df_sort_fval.sort_values(by=['fval'], ascending=False)
        ranking_col = list(df_sort_fval['col'])
    else:
        ranking_col = list(df.columns)

    col_pass = []
    del_col = set()  # a set gives O(1) membership tests
    for col in ranking_col:
        if col not in del_col:
            col_pass.append(col)
        # Both original branches recorded the correlated columns, so this
        # runs whether or not ``col`` was kept.
        col_corr = df_out[col]
        too_close = (col_corr > threshold) | (col_corr < -threshold)
        del_col.update(col_corr[too_close].index)
    del df_out

    logging.info("Del col : %d" % len(del_col))
    logging.info("Passed col : %d" % len(col_pass))

    if corr_k_pass:
        col_pass = col_pass[:corr_k_pass]
    return df[col_pass]
def train_DT(
        feats=None,
        labels=[],
        feature_selector=SelectFpr(chi2, alpha=0.05),
        cv=5):
    """Train a decision tree with RFE feature selection and a grid search.

    :param feats: count dictionaries accepted by ``DictVectorizer``
    :param labels: target labels (the default list is never mutated here)
    :param feature_selector: currently ignored - it is unconditionally
        replaced by an ``RFE(MultinomialNB())`` selector below.
        NOTE(review): the original comment said "Use None to stop feature
        selection"; confirm whether the argument should be honoured.
    :param cv: number of folds used in cross-validation
    :return: ``(mod, vectorizer, feature_selector)`` - the trained model and
        the objects needed to featurize test data consistently
    """
    # Map the count dictionaries to a dense feature matrix.
    vectorizer = DictVectorizer(sparse=False)
    feats = vectorizer.fit_transform(feats)

    # FEATURE SELECTION
    feature_selector = RFE(estimator=MultinomialNB(),
                           n_features_to_select=None,
                           step=1,
                           verbose=0)
    feat_matrix = feature_selector.fit_transform(feats, labels)

    # HYPER-PARAMETER SEARCH
    searchmod = DecisionTreeClassifier()
    parameters = {
        'splitter': ['best', 'random'],
        'max_features': ['sqrt', 0.25, 'log2'],
        'min_samples_split': [2, 5, 10]
    }
    # Cross-validation grid search to find the best hyper-parameters.
    clf = GridSearchCV(searchmod, parameters, cv=cv, n_jobs=-1)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # The model we want, using the parameters obtained from the search.
    mod = DecisionTreeClassifier(
        splitter=params['splitter'],
        max_features=params['max_features'],
        min_samples_split=params['min_samples_split'])

    # ASSESSMENT
    scores = cross_val_score(mod, feat_matrix, labels, cv=cv,
                             scoring="f1_macro")
    # Single-argument print() gives the same text on Python 2 and 3.
    print('Best model %s' % (mod,))
    print('%s features selected out of %s total'
          % (feat_matrix.shape[1], feats.shape[1]))
    print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

    # TRAIN OUR MODEL
    mod.fit(feat_matrix, labels)
    return (mod, vectorizer, feature_selector)
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    """Train a decision tree on SelectFpr-filtered, optionally resampled data.

    :param configurationname: label used in log output and in the exported
        ``.dot`` file name
    :param train_data: 3-tuple ``(X_train, y_train, id_to_a_train)``; the
        third element is not used here
    :param score_function: scoring callable handed to ``SelectFpr``
    :param undersam: apply under-sampling (combined with ``oversam`` this
        selects ``SMOTEENN``)
    :param oversam: apply over-sampling (combined with ``undersam`` this
        selects ``SMOTEENN``)
    :param export: when true, write the fitted tree as a Graphviz file
    :return: ``(selector, dtc)`` -- fitted selector and fitted classifier
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, _ = train_data

    print("Feature Selection")
    selector = SelectFpr(score_function)
    fitted_selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # Pick at most one resampler, then apply it in a single place.
    resampler = None
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        resampler = SMOTE(random_state=42)
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = DecisionTreeClassifier(random_state=0)
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
        export_graphviz(dtc, out_file=dot_path, filled=True)
        # NOTE(review): `transform` is defined elsewhere in the project; it
        # is assumed to belong to the export step -- confirm the nesting.
        transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def multisplit(skf, X, y, stepsize=1000):
    """Cross-validate a two-stage classifier built on column slices of X.

    For every ``(train_index, test_index)`` pair yielded by ``skf``:

    1. one ``plib.classif`` model is fitted per slice of ``stepsize``
       consecutive columns and its ``decision_function`` output collected;
    2. the collected outputs are concatenated column-wise, filtered with
       ``SelectFpr()`` and used to fit an ``AdaBoostClassifier``;
    3. the same pipeline is applied to the test rows and the fraction of
       matching predictions is accumulated.

    :param skf: iterable of ``(train_index, test_index)`` pairs that also
        exposes ``n_folds``
    :param X: 2-D feature array indexable as ``X[rows, start:stop]``
    :param y: label array indexable by the fold index arrays
    :param stepsize: number of columns handled by each first-stage model
    :return: accumulated per-fold accuracy divided by ``skf.n_folds``
    """
    total_score = 0
    for train_index, test_index in skf:
        wl = []
        pred1 = np.matrix([])

        # Training: one first-stage model per column slice.
        for x in range(0, len(X[0]), stepsize):
            clf1 = plib.classif(X[train_index, x:x + stepsize],
                                y[train_index])
            tmp_p = np.matrix(
                clf1.decision_function(X[train_index, x:x + stepsize]))
            if pred1.size == 0:
                pred1 = tmp_p
            else:
                pred1 = np.concatenate((pred1, tmp_p), axis=1)
            wl.append(clf1)

        # Second stage: keep only the informative first-stage outputs.
        selectf = SelectFpr().fit(pred1, y[train_index])
        clf3 = AdaBoostClassifier(n_estimators=100)
        clf3.fit(selectf.transform(pred1), y[train_index])

        # Testing: reuse each first-stage model on its own column slice.
        predtest = np.matrix([])
        for clf1, x in zip(wl, range(0, len(X[0]), stepsize)):
            tmp_p = np.matrix(
                clf1.decision_function(X[test_index, x:x + stepsize]))
            if predtest.size == 0:
                predtest = tmp_p
            else:
                predtest = np.concatenate((predtest, tmp_p), axis=1)

        # Final prediction
        predfinal = clf3.predict(selectf.transform(predtest))
        # Single-argument print calls keep the output identical on Python 2
        # and 3; the double space reproduces the separator that the
        # Python 2 print statement inserted.
        print("Target :  %s" % (y[test_index],))
        print("Prediction :  %s" % (predfinal,))

        matchs = np.equal(predfinal, y[test_index])
        score = np.divide(np.sum(matchs), np.float64(matchs.size))
        total_score = score + total_score
    return np.divide(total_score, skf.n_folds)
from sklearn.feature_selection import VarianceThreshold, SelectFpr, f_regression

# Import data of all Count and Position features (training and test sets
# together) plus the raw training file.
dfCountfeatures = pd.read_csv('data/CountingAndPositionFeatures_TrainAndTestData.csv')
dfTrainRaw = pd.read_csv('data/train.csv')

# Keep only the rows that belong to the training set.
# NOTE(review): `relevance` follows the row order of train.csv while the
# feature rows follow dfCountfeatures -- confirm both files share one order.
TrainQueryIDs = dfTrainRaw["id"]
relevance = dfTrainRaw["relevance"]
dfCountfeatures_TrainSet = dfCountfeatures[dfCountfeatures["id"].isin(TrainQueryIDs)]

# Select the features which have non-zero variance.
selector = VarianceThreshold()
selector.fit_transform(dfCountfeatures_TrainSet).shape
# only one feature with zero variance - shape (74067L, 262L)

# Select features based on p-values from univariate regression against the
# target (relevance).  The "id" column is not a feature, so it is dropped.
trainFeatures = dfCountfeatures_TrainSet.drop("id", axis=1)
selector2 = SelectFpr(f_regression, alpha=0.01)
selector2.fit(trainFeatures, relevance)
selector2.get_support(indices=True).size
# left 226 features out of 262 with p-value <=1%

# Get the titles of the selected features.  The support indices refer to the
# columns the selector was fitted on (without "id"), so they must index that
# same frame; indexing dfCountfeatures.columns would shift the names
# whenever "id" is not the last column.
selectedCountfeatures = trainFeatures.columns[selector2.get_support(indices=True)]

# Check correlation amongst the selected features; only the strict lower
# triangle is kept so each pair is counted once.
corrReduced = dfCountfeatures_TrainSet[selectedCountfeatures].corr()
corrReduced.iloc[:, :] = np.tril(corrReduced.values, k=-1)
corrReduced = corrReduced.stack()

# Pairs of features which are highly correlated.
highCorr = corrReduced[corrReduced.abs() > 0.8]
highCorr.size
# 578 pairs correlated more than 80% out of 25.425

# Row-side feature of every highly correlated pair.  Looking the names up
# through get_level_values avoids MultiIndex.labels, which newer pandas
# releases no longer provide.
correlatedNames = set(highCorr.index.get_level_values(0))
len(correlatedNames)
# 172 features to be removed due to high correlation with other features

# Positions (within selectedCountfeatures) of the features to remove, then
# the feature titles used for training once those are removed.
indices = set(j for j, name in enumerate(selectedCountfeatures.tolist())
              if name in correlatedNames)
selectedCountfeatures2 = [i for j, i in enumerate(selectedCountfeatures.tolist())
                          if j not in indices]
y = iris.target ################################################################################ pl.figure(1) pl.clf() x_indices = np.arange(x.shape[-1]) ################################################################################ # Univariate feature selection from sklearn.feature_selection import SelectFpr, f_classif # As a scoring function, we use a F test for classification # We use the default selection function: the 10% most significant # features selector = SelectFpr(f_classif, alpha=0.1) selector.fit(x, y) scores = -np.log10(selector._pvalues) scores /= scores.max() pl.bar(x_indices-.45, scores, width=.3, label=r'Univariate score ($-Log(p_{value})$)', color='g') ################################################################################ # Compare to the weights of an SVM clf = svm.SVC(kernel='linear') clf.fit(x, y) svm_weights = (clf.coef_**2).sum(axis=0) svm_weights /= svm_weights.max() pl.bar(x_indices-.15, svm_weights, width=.3, label='SVM weight',
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif


def _print_selection_report(title, fitted):
    """Print a title, then the scores, p-values and support mask of *fitted*."""
    # Single-argument print calls produce identical output on Python 2
    # and Python 3.
    print(title)
    print(fitted.scores_)
    print(fitted.pvalues_)
    print(fitted.get_support())


# SelectPercentile -- chi2: keep the best-scoring 50% of the features.
X_fitted_4 = SelectPercentile(chi2, percentile=50).fit(X, y)
_print_selection_report("SelectPercentile -- chi2", X_fitted_4)
X_transformed_4 = X_fitted_4.transform(X)
print(X_transformed_4.shape)

# SelectFpr --- chi2
# NOTE(review): the alpha value looks like a p-value copied from an earlier
# run -- confirm where it comes from.
X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X, y)
_print_selection_report("SelectFpr --- chi2", X_fitted_5)
X_transformed_5 = X_fitted_5.transform(X)
print(X_transformed_5.shape)

# SelectFpr --- f_classif (same caveat about the hard-coded alpha).
X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31).fit(X, y)
_print_selection_report("SelectFpr --- f_classif", X_fitted_6)
from sklearn import svm
from sklearn.feature_selection import SelectFpr, f_classif

# Feature rows of the two groups selected by tr1_mask / tr2_mask, restricted
# to the columns in featureIds; group 1 is labelled 0 and group 2 is
# labelled 1.
data1 = pdc.objFeatures[tr1_mask][:, featureIds]
data2 = pdc.objFeatures[tr2_mask][:, featureIds]
data = np.vstack([data1, data2])
labels1 = np.zeros((data1.shape[0],))
labels2 = np.ones((data2.shape[0],))
labels = np.hstack([labels1, labels2])

# Training subset: at most the first 1000 rows of group 1 and the last 1000
# rows of group 2, with labels cut to the same lengths.
X1 = data1[:1000]
X2 = data2[-1000:]
X = np.vstack([X1, X2])
Y1 = labels1[:X1.shape[0]]
Y2 = labels2[:X2.shape[0]]
Y = np.hstack([Y1, Y2])

# Univariate scores: -log10 of the F-test p-values, rescaled so the largest
# is 1.
# NOTE(review): `_pvalues` is a private attribute -- confirm it exists in the
# scikit-learn version in use.
selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(X, Y)
scores = -np.log10(selector._pvalues)
scores /= scores.max()

# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# Single-argument print calls produce identical output on Python 2 and 3.
# NOTE(review): the label says "error" but the value printed is clf.score().
print('SVM error: %s' % (clf.score(data, labels),))
pred = clf.predict(data)
# The rest of the snippet uses the `np` alias, so use it here as well
# rather than depending on a separate `numpy` name being in scope.
match = np.sum(pred == labels)
print('%s %s' % (match, labels.shape[0]))
print(match / float(labels.shape[0]))

svm_weights = (clf.coef_**2).sum(axis=0)