def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="percentile", param=25)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())

def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="k_best", param=5).fit(X, y).transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

def get_feature_selection_model(cls, model_name, estimator_name=None):
    feature_selection_model = None
    model_param = copy.deepcopy(feature_selection_config_dict['model_param'])
    if model_name == 'Embedded':
        # SelectFromModel needs an estimator instance, not just its name
        feature_selection_model = SelectFromModel(
            get_estimator(estimator_name), **model_param['Embedded'])
    elif model_name == 'Wrapper':
        feature_selection_model = RFECV(
            estimator=get_estimator(estimator_name), **model_param['Wrapper'])
    elif model_name == 'Filter':
        model_param['Filter']['score_func'] = get_score_func(
            model_param['Filter']['score_func'])
        feature_selection_model = GenericUnivariateSelect(**model_param['Filter'])
    elif model_name == 'KeepAll':
        feature_selection_model = 'KeepAll'
    else:
        raise ValueError(
            "model_name must be in ('Embedded', 'Wrapper', 'Filter', 'KeepAll') "
            "but is %s" % model_name)
    return feature_selection_model

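A minimal usage sketch for this factory. The config layout below is an assumption: the real feature_selection_config_dict, get_estimator, and get_score_func live elsewhere in the project, so the keys and defaults here are illustrative only.

# Hypothetical config layout assumed by get_feature_selection_model;
# the actual keys and defaults are defined elsewhere in the original project.
feature_selection_config_dict = {
    'model_param': {
        'Embedded': {'threshold': 'mean'},
        'Wrapper': {'step': 1, 'cv': 5},
        'Filter': {'score_func': 'f_classif', 'mode': 'percentile', 'param': 50},
    }
}

# Hypothetical call site (the owning class is not shown in the snippet):
# selector = SomeRegistry.get_feature_selection_model('Filter')
# X_reduced = selector.fit_transform(X, y)
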
def test_select_fdr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fdr heuristic
    """
    X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0, class_sep=10,
                               shuffle=False, random_state=0)

    univariate_filter = SelectFdr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode='fdr', param=0.0001).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = (
            GenericUnivariateSelect(f_classif, mode=mode, param=0.01)
            .fit(X, y)
            .transform(X)
        )
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_allclose(support, gtruth)

def test_generic_univariate_select_float(self):
    model = GenericUnivariateSelect()
    X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
    y = np.array([0, 1, 0, 1])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, 'generic univariate select',
        [('input', FloatTensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnGenericUnivariateSelect",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.1.4')")

def single_fdr(alpha, n_informative, random_state):
    X, y = make_regression(n_samples=150, n_features=20,
                           n_informative=n_informative, shuffle=False,
                           random_state=random_state, noise=10)

    with warnings.catch_warnings(record=True):
        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data)
        univariate_filter = SelectFdr(f_regression, alpha=alpha)
        X_r = univariate_filter.fit(X, y).transform(X)
        X_r2 = GenericUnivariateSelect(
            f_regression, mode='fdr', param=alpha).fit(X, y).transform(X)

    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    num_false_positives = np.sum(support[n_informative:] == 1)
    num_true_positives = np.sum(support[:n_informative] == 1)

    if num_false_positives == 0:
        return 0.
    false_discovery_rate = (num_false_positives /
                            (num_true_positives + num_false_positives))
    return false_discovery_rate

class MutualInfoRazor:
    def __init__(self, percentile=50):
        self.percentile = percentile
        self.transformer = GenericUnivariateSelect(
            score_func=mutual_info_regression,
            mode='percentile',
            param=self.percentile)

    @property
    def support_(self):
        return self.transformer.get_support()

    def fit(self, X, Y):
        self.transformer.fit(X, Y)

    def predict(self, X):
        return self.transformer.transform(X=X)

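A small usage sketch for MutualInfoRazor on synthetic regression data; the dataset below is made up purely for illustration.

from sklearn.datasets import make_regression

X, Y = make_regression(n_samples=100, n_features=10, n_informative=3, random_state=0)
razor = MutualInfoRazor(percentile=30)
razor.fit(X, Y)
X_reduced = razor.predict(X)   # predict() just applies the fitted transform
print(razor.support_)          # boolean mask of the retained columns
print(X_reduced.shape)         # (100, 3): 30th percentile of 10 features
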
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))

def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="percentile", param=25)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert sparse.issparse(X_r2inv)
    support_mask = safe_mask(X_r2inv, support)
    assert X_r2inv.shape == X.shape
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert X_r2inv.getnnz() == X_r.getnnz()

def feature_selection(X, y, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42)

    selector = GenericUnivariateSelect(score_func=chi2, mode='percentile', param=70)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    print('Before selection shape:', X_train.shape)
    print('After selection shape:', X_train_selected.shape)

    return X_train_selected, X_test_selected

def EvaluatePerformance(classifier, extractionMethod: str, reviews: [Review],
                        label: str, featuresSelector, selectorParam):
    skf = StratifiedKFold(n_splits=10)

    global_features_index = GetGlobalFeaturesIndex(
        reviews, list(range(0, len(reviews))), extractionMethod)

    x, y = [], []
    for review in reviews:
        featuresVector = ExtractFeatureFromCorpus(
            global_features_index, review.review_content, extractionMethod)
        x.append(featuresVector)
        y.append(review.tag.tagId)

    transformer = None
    if extractionMethod in ef.USE_TFIDF:
        transformer = TfidfTransformer(smooth_idf=False)
        x = transformer.fit_transform(x, y).toarray().tolist()

    if extractionMethod in ef.USE_SMOTEENN:
        x, y = sme.fit_sample(x, y)
    elif extractionMethod in ef.USE_SMOTETOMEK:
        x, y = smt.fit_sample(x, y)

    selector = GenericUnivariateSelect(chi2, featuresSelector, param=selectorParam)
    x = selector.fit_transform(x, y)
    print("here1")
    grid_search = GridSearchCV(classifier, lg_param_grid, scoring=scorers,
                               refit='accuracy_score', cv=skf,
                               return_train_score=True, n_jobs=-1)
    print("here2")
    grid_search.fit(x, y)
    print("here3")

    # make the predictions
    y_pred = grid_search.predict(x)

    print('Best params for {}'.format('accuracy_score'))
    print(grid_search.best_params_)

def operate(self, input_datanode, target_fields=None):
    from sklearn.feature_selection import GenericUnivariateSelect

    feature_types = input_datanode.feature_types
    X, y = input_datanode.data
    if target_fields is None:
        target_fields = collect_fields(feature_types, self.input_type)
    X_new = X[:, target_fields]

    n_fields = len(feature_types)
    irrelevant_fields = list(range(n_fields))
    for field_id in target_fields:
        irrelevant_fields.remove(field_id)

    # chi2 requires non-negative features, so clip any values below zero to zero
    if self.score_func == 'chi2':
        X_new[X_new < 0] = 0.0

    if self.model is None:
        self.model = GenericUnivariateSelect(
            score_func=self.call_func, param=self.alpha, mode=self.mode)
        self.model.fit(X_new, y)

    _X = self.model.transform(X_new)
    # get_support() is indexed by position within target_fields,
    # not by the original field id
    is_selected = self.model.get_support()

    irrelevant_types = [feature_types[idx] for idx in irrelevant_fields]
    selected_types = [
        feature_types[idx]
        for pos, idx in enumerate(target_fields) if is_selected[pos]
    ]
    selected_types.extend(irrelevant_types)

    new_X = np.hstack((_X, X[:, irrelevant_fields]))
    new_feature_types = selected_types
    output_datanode = DataNode((new_X, y), new_feature_types, input_datanode.task_type)
    output_datanode.trans_hist = input_datanode.trans_hist.copy()
    output_datanode.trans_hist.append(self.type)
    output_datanode.enable_balance = input_datanode.enable_balance
    output_datanode.data_balance = input_datanode.data_balance
    self.target_fields = target_fields.copy()

    return output_datanode

def test_generic_univariate_select_int(self):
    model = GenericUnivariateSelect()
    X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
                 dtype=np.int64)
    y = np.array([0, 1, 0, 1])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, 'generic univariate select',
        [('input', Int64TensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnGenericUnivariateSelect",
        # Operator cast-1 is not implemented in onnxruntime
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2')")

def feature_scores(X, Y):
    fselector = GenericUnivariateSelect(f_classif)
    fselector.fit(X, Y)
    p2scores = -np.log10(fselector.pvalues_)
    p2scores /= p2scores.max()

    mutSelector = GenericUnivariateSelect(mutual_info_classif)
    mutSelector.fit(X, Y)
    mutscores = mutSelector.scores_

    return fselector.pvalues_, p2scores, mutscores

def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(
                RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15),
                threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(
                RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15),
                threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]

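A brief usage sketch for this lookup helper; the training arrays referenced below are placeholders assumed to exist in the caller.

# Look up an unfitted selector by name and apply it;
# X_train / y_train stand in for whatever data the caller already has.
selector = get_feature_selection_model_from_name('classifier', 'GenericUnivariateSelect')
X_train_reduced = selector.fit_transform(X_train, y_train)
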
def preprocess(X_train, y_train, X_test):
    print("== Preprocessing Data ==")
    X_merge = np.concatenate((X_train, X_test), axis=0)
    sep = X_train.shape[0]

    # Scale features to range [0, 1]
    scale = MinMaxScaler()
    X_merge = scale.fit_transform(X_merge)
    X_train = X_merge[:sep]
    X_test = X_merge[sep:]

    # Choose top features as ranked by chi squared test
    gus = GenericUnivariateSelect(score_func=chi2, mode="k_best", param=306)
    gus.fit(X_train, y_train)
    X_train = gus.transform(X_train)
    X_test = gus.transform(X_test)

    return X_train, X_test

def dtc03(self):
    # Flatten y into 1-D form: self.y_train, self.y_test
    self.y01_train = list()
    self.y01_test = list()
    for a in range(len(self.y_train)):
        self.y01_train.append(self.y_train[a][0])
    for b in range(len(self.y_test)):
        self.y01_test.append(self.y_test[b][0])

    # Collect the distinct labels
    self.labels = list()
    for c in range(len(self.y_test)):
        if self.labels.count(self.y_test[c][0]) == 0:
            self.labels.append(self.y_test[c][0])
    print(self.labels)

    # Univariate feature selection
    # Read the parameters from the UI widgets
    if not self.kedit.text().strip():
        self.k = 10
    else:
        self.k = int(self.kedit.text())
    if not self.pedit.text().strip():
        self.param = 1e-05
    else:
        self.param = float(self.pedit.text())
    self.mode = self.mo_box.itemText(self.mo_box.currentIndex())

    # Build the selected model
    if self.sp_box.itemText(self.sp_box.currentIndex()) == 'SelectKBest':
        self.clf = SelectKBest(score_func=f_classif, k=self.k)
        self.clf.fit_transform(self.x_train, self.y01_train)
        self.f_c = self.clf.get_support()
    elif self.sp_box.itemText(self.sp_box.currentIndex()) == 'SelectPercentile':
        self.clf = SelectPercentile(score_func=f_classif, percentile=self.k)
        self.clf.fit_transform(self.x_train, self.y01_train)
        self.f_c = self.clf.get_support()
    else:
        self.clf = GenericUnivariateSelect(score_func=f_classif, mode=self.mode,
                                           param=self.param)
        self.clf.fit_transform(self.x_train, self.y01_train)
        self.f_c = self.clf.get_support()

    # Configure the result table (ufs_dtable) to show which features
    # of the training set were kept by the univariate selection
    self.ufs_dtable.setRowCount(2)
    self.ufs_dtable.setColumnCount(len(self.x_train[0]))
    mlan = "是否保留该特征(T/F)"  # "Keep this feature? (T/F)"
    self.ufs_dtable.setSpan(0, 0, 1, len(self.x_train[0]))
    self.ufs_dtable.setItem(0, 0, QtGui.QTableWidgetItem(mlan.decode('utf-8')))
    for j in range(len(self.f_c)):
        self.ufs_dtable.setItem(1, j, QtGui.QTableWidgetItem(str(self.f_c[j])))

def predictGenericUnivariateSelect(X, y, clf):
    # Apply the chi2-based univariate selection before the train/test split
    selector = GenericUnivariateSelect(chi2)
    X_selected = selector.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35)
    fit_chi2 = clf.fit(X_train, y_train)
    y_pred = fit_chi2.predict(X_test)

    f1_scores = []
    precision_scores = []
    recall_scores = []

    f1 = metrics.f1_score(y_test, y_pred, average=None)
    precision = metrics.precision_score(y_test, y_pred, average=None)
    recall = metrics.recall_score(y_test, y_pred, average=None)
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

    return np.mean(f1_scores), np.mean(precision_scores), np.mean(recall_scores)

def select_features_univariate(X, y, method='decision_tree'):
    """With high-dimensional datasets it aids classifier performance to select
    features of interest. This function rejects features below a certain
    (univariate) threshold.

    Parameters
    ----------
    X : ndarray
        repetitions by features
    y : ndarray
        vector of labels of each repetition
    method : string
        function used for data reduction
        {'decision_tree', 'decision_tree_RFECV', 'mutual_information',
         'univariate_select'}

    Returns
    -------
    X_transformed : ndarray
        repetitions by features (reduced)
    weights : ndarray or Boolean
        relative importance of the features, or a binary mask (important or not)
    """
    # based on the method we choose the clf to fit and transform the data
    if method == 'decision_tree_RFECV':
        clf = DecisionTreeClassifier()
        trans = RFECV(clf)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()
    elif method == 'decision_tree':
        clf = DecisionTreeClassifier()
        clf.fit(X, y)
        # choose features with an importance that is more than avg.
        selected_features = np.where(
            clf.feature_importances_ > clf.feature_importances_.mean(0), 1, 0)
        X_transformed = X[:, selected_features == 1]
        weights = clf.feature_importances_
    elif method == 'mutual_information':
        mutual_info = mutual_info_classif(X, y)
        # choose features above the avg mutual information threshold.
        selected_features = np.where(mutual_info > mutual_info.mean(0), 1, 0)
        X_transformed = X[:, selected_features == 1]
        weights = mutual_info  # continuous
    elif method == 'univariate_select':
        # select features with more univariate activity than avg.
        trans = GenericUnivariateSelect(score_func=lambda X, y: X.mean(axis=0),
                                        mode='percentile', param=50)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()  # binary
    return X_transformed, weights

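A short illustrative call using the 'univariate_select' strategy; the synthetic dataset below is an assumption made only for the example.

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=80, n_features=30, n_informative=5, random_state=0)
X_reduced, mask = select_features_univariate(X, y, method='univariate_select')
print(X_reduced.shape)   # roughly half of the 30 columns survive the 50th-percentile cut
print(mask.sum())        # number of columns kept (mask is boolean for this method)
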
def test_select_percentile_classif_sparse():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the percentile heuristic
    """
    X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0, class_sep=10,
                               shuffle=False, random_state=0)
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

def get_feature_selection_model_from_name(type_of_estimator, model_name):
    # TODO(PRESTON): eventually let threshold be user-configurable (or grid_searchable)
    # TODO(PRESTON): optimize the params used here
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]

def univariate_feature_selection_with_GUS(dataframe):
    """
    Uses univariate statistics such as mutual information regression
    to select the k best features for a regression problem.
    """
    from sklearn.feature_selection import GenericUnivariateSelect, mutual_info_regression

    X, y = xy_split(dataframe)
    start = default_timer()
    # keep roughly a third of the feature columns
    selector = GenericUnivariateSelect(
        score_func=mutual_info_regression,
        mode="k_best",
        param=round((dataframe.shape[1] - 1) / 3))
    select_features_gus = selector.fit_transform(X, y)
    end = default_timer()
    print("Elapsed Time for feature selection: {}s".format(end - start))
    print(selector.scores_)
    return select_features_gus

def test_skdatasets_classif_MIFilter_vs_f_classif(X, y, p, k):
    '''
    Test several scikit-learn datasets to compare feature selection
    based on MI and ANOVA for classification problems
    '''
    # scores, r = univariate_f_MI(X, y, type='cd', njobs=4)
    # scores, r = univariate_f_MI(X, y, k=30, type='cd')
    njobs = 4
    # print("MI scores:", scores)
    # support_MI = get_support(scores, 4)

    kbest_univ_f_MI = _MI_Filter(univariate_f_MI, mode='k_best', param=k,
                                 type='cd', njobs=4)
    kbest_univ_f_MI.fit(X, y)
    support_MI = kbest_univ_f_MI._get_support_mask()
    print("MI Univariate scores:", kbest_univ_f_MI.scores_)
    print("MI Univariate r:", kbest_univ_f_MI.ranking_)
    print("MI Univariate support:", support_MI)

    kfirst_uforward_MI = _MI_Filter(univariate_forward_f_MI, mode='k_first', param=k,
                                    type='cd', njobs=4)
    kfirst_uforward_MI.fit(X, y)
    support_uforward_MI = kfirst_uforward_MI._get_support_mask()
    print("MI Univariate forward scores:", kfirst_uforward_MI.scores_)
    print("MI Univariate forward r:", kfirst_uforward_MI.ranking_)
    print("MI Univariate forward support:", support_uforward_MI)

    kfirst_mforward_MI = _MI_Filter(multivariate_forward_f_MI, mode='k_first', param=k,
                                    type='cd', njobs=4)
    kfirst_mforward_MI.fit(X, y)
    support_mforward_MI = kfirst_mforward_MI._get_support_mask()
    print("MI multivariate forward scores:", kfirst_mforward_MI.scores_)
    print("MI multivariate forward r:", kfirst_mforward_MI.ranking_)
    print("MI multivariate forward support:", support_mforward_MI)

    kfirst_mbackward_MI = _MI_Filter(multivariate_backward_f_MI, mode='k_first', param=-k,
                                     type='cd', njobs=4)
    kfirst_mbackward_MI.fit(X, y)
    support_mbackward_MI = kfirst_mbackward_MI._get_support_mask()
    print("MI multivariate backward scores:", kfirst_mbackward_MI.scores_)
    print("MI multivariate backward r:", kfirst_mbackward_MI.ranking_)
    print("MI multivariate backward support:", support_mbackward_MI)

    filter_F = GenericUnivariateSelect(f_classif, mode='k_best', param=k)
    filter_F.fit(X, y)
    print("F scores:", filter_F.scores_)
    support_F = filter_F._get_support_mask()
    print("F support :", support_F)
    '''

def feature_Selection(self, target=""): """Automated feature selection using sklearn GenericUnivariateSelect Keyword arguments: target -- target column for feature selection (default "") """ data_set = self.DF y = data_set[target] X = data_set.drop(columns=[target]) print(f"We are starting with the following columns:\n{X.columns}\n") transformer = GenericUnivariateSelect( f_classif if self.type == "classification" else f_regression, mode="percentile") self.data = transformer.fit_transform(X, y) columns_retained = self.DF.iloc[:, 1:].columns[ transformer.get_support()].values self.DF = self.DF[columns_retained] self.DF[target] = y print( f"The following columns are left:\n{self.DF.drop(columns=[target]).columns}" )
def test_mutual_info_classif():
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_classif, mode="k_best", param=2)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_classif, mode="percentile", param=40)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

def featureSelection(x_train_features, y_train):
    # write feature selection code in here
    # write required feature extraction code in here
    # selected_features: boolean support mask of the selected features
    selected_features = GenericUnivariateSelect(
        f_regression, mode='k_best', param=256).fit(
            x_train_features, y_train).get_support()
    return selected_features

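An illustrative call with random data; the array sizes below are arbitrary (the feature matrix just needs at least 256 columns for param=256 to make sense).

import numpy as np

rng = np.random.RandomState(0)
x_train_features = rng.rand(50, 300)   # 50 samples, 300 candidate features
y_train = rng.rand(50)
mask = featureSelection(x_train_features, y_train)
print(mask.shape, mask.sum())          # (300,) 256
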
def test_generic_univariate_select_float(self):
    model = GenericUnivariateSelect()
    X = np.array(
        [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
        dtype=np.float32,
    )
    y = np.array([0, 1, 0, 1])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model,
        "generic univariate select",
        [("input", FloatTensorType([None, X.shape[1]]))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnGenericUnivariateSelect",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')",
    )

def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # selects all features when '100%' is asked.
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=100).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)

def decode(cls, obj):
    from sklearn.feature_selection import f_classif, f_regression, GenericUnivariateSelect

    new_obj = GenericUnivariateSelect.__new__(GenericUnivariateSelect)
    new_obj.__dict__ = obj['dict']
    if new_obj.score_func == 'f_classif':
        new_obj.score_func = f_classif
    elif new_obj.score_func == 'f_regression':
        new_obj.score_func = f_regression
    else:
        raise ValueError(
            'Unsupported GenericUnivariateSelect.score_func "%s"'
            % new_obj.score_func)
    return new_obj

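The matching encode step is not shown in the snippet; a plausible counterpart, assuming the same {'dict': ...} layout with the score function stored by name, could look like the hypothetical sketch below (not the project's actual encoder).

def encode(selector):
    # Hypothetical inverse of decode(): snapshot the estimator state and
    # replace the score function with its name so decode() can map it back.
    state = dict(selector.__dict__)
    state['score_func'] = selector.score_func.__name__  # 'f_classif' or 'f_regression'
    return {'dict': state}
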
def test_select_kbest_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the k best heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='k_best', param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='fwe', param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    assert_less(np.sum(support[5:] == 1), 2)

train.drop(["id", "fault_severity", "location"], axis=1, inplace=True) test.drop(["id", "fault_severity", "location"], axis=1, inplace=True) train = train.fillna(0) train = train.astype(float) test = test.fillna(0) test = test.astype(float) print() print(train.shape) print(test.shape) # ch2 = SelectKBest(chi2, k=550) ch2 = GenericUnivariateSelect(score_func=chi2, mode="percentile", param=80) train = ch2.fit_transform(train, labels) test = ch2.transform(test) print(train.shape) print(test.shape) # print(train.shape) # print(test.shape) # pca = PCA(n_components=400) # print('transforming data') # train = pca.fit_transform(train) # test = pca.transform(test) # print('data transformed') # print(train.shape)
    ngram_range=(1, 3), max_df=1.0, min_df=2, max_features=None,
    binary=False, norm=u'l2', use_idf=True, smooth_idf=True,
    sublinear_tf=True, tokenizer=rpg.SnowballEnglishStemmer())

tweet_data = rpg.tweet_corpus_maker(raw_tweet_data)  # form usable by sklearn
binary_region = tweet_data[2]  # selecting the East/West binary splits

initial_time = time()
X_tfidf = tfidf_vectz.fit_transform(tweet_data[0])  # tfidf vectorization
tfidf_time = time() - initial_time
print("TFIDF Vectorization Time: %0.3f" % tfidf_time)

feature_selector = GenericUnivariateSelect(chi2, mode='percentile', param=25)
X_selected = feature_selector.fit_transform(X_tfidf, binary_region)

# This splits data into training/test splits etc.
clf_tests = rpg.make_full_data_test_samples(targets=binary_region,
                                            vectz_dict={'tfidfV': X_selected})

# Fits classifiers and prints evaluation metrics.
clf_bench = rpg.test_classifiers(data=clf_tests, clfs=CLASSIFIER_LIST,
                                 vectz='tfidfV', ssize=len(binary_region),
                                 scoring=['roc_auc'])

# Prints summary of ROC AUC estimate from 10-fold cross validation, and
# a number of best features of each classifier.
top_feats = rpg.print_classifier(clfs=clf_bench, vectz=tfidf_vectz,
                                 feat_sel=feature_selector, num_feats=20)

    ('filter', VarianceThreshold()),
])

object_pipe = Pipeline([
    ('separator', DTypeSelector(key='object')),
    ('encoder', FeatureHasher(input_type='string')),
    ('filter', VarianceThreshold()),
])

number_pipe = Pipeline([
    ('separator', DTypeSelector(key='number')),
    ('filter', VarianceThreshold()),
])

feature_encoder = FeatureUnion(transformer_list=[
    ('number', number_pipe),
    ('datetime', datetime_pipe),
    ('object', object_pipe),
])

feature_selector = GenericUnivariateSelect(mode='fwe', param=0.01)

train_X_df = get_train_X_df(n_rows_with_caption=train_row_count_limit)
y = get_train_y_values(n_rows_with_caption=train_row_count_limit)

print('encoding features')
X = feature_encoder.fit_transform(train_X_df, y)
print('{} encoded'.format(X.shape), flush=True)

print('selecting features')
X = feature_selector.fit_transform(X, y)
print('{} selected'.format(X.shape), flush=True)

print_memory_usage()
print('gc collecting')
train_X_df_ref = [train_X_df]