def transform(self, X):
    """Reduce ``X`` to the features chosen by the configured selection method.

    The branch taken depends on ``self.method``:

    * ``'sfm'`` -- wrap ``self.clf`` in a prefit ``SelectFromModel``; when
      ``self.n_vars`` is set, at most that many features are kept and the
      importance threshold is disabled with ``-np.inf``.
    * ``'rfo'`` / ``'rfs'`` -- index ``X`` with ``self.X_columns``.
    * ``'PCA'`` / ``'skb'`` / ``'eli5_rfe'`` -- delegate to
      ``self.clf.transform``.
    * ``'eq'`` -- return ``X`` unchanged.
    * ``'biofes'`` -- order the index of ``self.Tcan.Disc`` by its ``'0-1'``
      column (descending) and select the first ``self.n_vars`` labels from
      ``X``. ``n_vars=None`` keeps every ranked label, mirroring the
      ``'sfm'`` branch.

    Returns:
        The reduced data, or ``None`` when ``self.method`` matches none of
        the names above.
    """
    method = self.method

    if method == 'sfm':
        if self.n_vars is None:
            selector = sk_fs.SelectFromModel(self.clf, prefit=True)
        else:
            selector = sk_fs.SelectFromModel(
                self.clf, max_features=self.n_vars, threshold=-np.inf,
                prefit=True)
        return selector.transform(X)

    if method in ('rfo', 'rfs'):
        return X[self.X_columns]

    if method in ('PCA', 'skb', 'eli5_rfe'):
        return self.clf.transform(X)

    if method == 'eq':
        return X

    if method == 'biofes':
        ranked = list(
            self.Tcan.Disc.sort_values(by='0-1', ascending=False).index)
        # min(None, int) raises TypeError on Python 3, so treat a missing
        # n_vars as "no cap" instead of crashing.
        if self.n_vars is None:
            limit = len(ranked)
        else:
            limit = min(self.n_vars, len(ranked))
        return X[ranked[:limit]]

    return None
def _get_support_mask(self): if self._cache_support_mask is not None: return self._cache_support_mask if self.prefit: estimator = self.estimator elif hasattr(self, 'estimator_'): estimator = self.estimator_ else: raise ValueError( 'Either fit the model before transform or set "prefit=True"' ' while passing the fitted estimator to the constructor.') try: importances = getattr(estimator, "feature_importances_", None) if importances is not None and np.isnan(importances).all(): mask = np.ones(importances.shape, bool) else: mask = super(_SelectFromModel, self)._get_support_mask() except ValueError: sfm = sk_fsel.SelectFromModel( estimator.estimator_, self.threshold, True ) mask = sfm._get_support_mask() for i in self._out_mask: mask[i] = False for i in self._in_mask: mask[i] = True self._cache_support_mask = mask return mask
def get_thresh(model, train, test, label_test, label_train):
    """Find the importance threshold that gives the best test ROC AUC.

    Every value in ``model.feature_importances_`` is tried, in ascending
    order, as a ``SelectFromModel`` threshold. For each one a fresh
    ``XGBClassifier`` is fitted on the reduced training data and scored with
    ``roc_auc_score`` on its predictions for the reduced test data.

    Args:
        model: fitted ``XGBClassifier`` providing ``feature_importances_``.
        train: training features.
        test: test features; must not be longer than ``train``.
        label_test: single-column test labels.
        label_train: single-column training labels.

    Returns:
        The first threshold reaching the highest score, or ``0`` when no
        candidate scores above zero.

    Raises:
        TypeError: the test set is larger than the training set, ``model`` is
            not an ``XGBClassifier``, or a label container has more than one
            column.
    """
    if (len(test) > len(train)) or (len(label_test) > len(label_train)):
        raise TypeError('Invalid train and test size')
    # isinstance avoids building a throw-away classifier just to read its type
    if not isinstance(model, XGBClassifier):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1] > 1) or (pd.DataFrame(label_test).shape[1] > 1):
        raise TypeError('Multiple columns in label, Invalid shape.')

    best_score = 0
    best_threshold = 0
    for thresh in np.sort(model.feature_importances_):
        selection = feature_selection.SelectFromModel(model, threshold=thresh, prefit=True)
        selection_model = XGBClassifier()
        selection_model.fit(selection.transform(train), label_train)
        y_pred = selection_model.predict(selection.transform(test))
        score = metrics.roc_auc_score(label_test, y_pred)
        # strict comparison keeps the smallest threshold among ties
        if score > best_score:
            best_score = score
            best_threshold = thresh
    return best_threshold
def get_important_features(estimator, X, threshold='median'):
    """Return the column labels of ``X`` that a fitted estimator keeps.

    For a ``Lasso`` estimator the kept columns are those with a non-zero
    coefficient and ``threshold`` is not used. Any other estimator is wrapped
    in a prefit ``SelectFromModel`` with the given ``threshold`` and the
    columns flagged by ``get_support()`` are returned.
    """
    if not isinstance(estimator, sklearn.linear_model.Lasso):
        selector = feature_selection.SelectFromModel(
            estimator, prefit=True, threshold=threshold)
        return X.columns[selector.get_support()]

    nonzero = estimator.coef_ != 0
    return X.columns[nonzero]
def select_optimal_set(self, num_jobs):
    '''
    Choose the L1-penalised LinearSVC that cross-validates best and keep it
    as the feature selector.

    A grid search over the cost parameter ``C`` is scored with
    macro-averaged precision on a shuffled 5-fold stratified split of
    ``self.features`` / ``self.labels``. The best estimator is stored on
    ``self.current_model``; a prefit ``SelectFromModel`` built around it is
    stored on ``self.current_transformer``.

    ``num_jobs`` is forwarded to ``GridSearchCV`` as ``n_jobs``.
    '''
    cost_values = [
        0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50,
        100, 500, 1000
    ]
    base_svc = LinearSVC(penalty='l1',
                         dual=False,
                         max_iter=5000,
                         tol=0.005,
                         class_weight=self.penalty_weights)
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    macro_precision = make_scorer(precision_score, pos_label=None,
                                  average='macro')

    # pre_dispatch="n_jobs": only dispatch the jobs that are actually
    # running, so memory usage does not blow up.
    search = GridSearchCV(estimator=base_svc,
                          param_grid=[{'C': cost_values}],
                          n_jobs=num_jobs,
                          pre_dispatch="n_jobs",
                          cv=folds,
                          scoring=macro_precision)
    search.fit(self.features, self.labels)

    best = search.best_estimator_
    self.current_model = best
    self.current_transformer = feature_selection.SelectFromModel(
        best, prefit=True)
def FeatSelect(df, a, thresh, model_type):
    """Drop weak predictors of the ``"tna"`` column using a linear model.

    A Ridge or Lasso regression is fitted on every column except ``"tna"``;
    ``SelectFromModel`` then decides from the fitted model and ``thresh``
    which columns to keep.

    Args:
        df: DataFrame containing the target column ``"tna"``. Not modified.
        a: regularisation constant passed to the model as ``alpha``.
        thresh: ``SelectFromModel`` threshold below which a feature is
            dropped.
        model_type: ``"ridge"`` or ``"lasso"``.

    Returns:
        A new DataFrame with the selected feature columns plus ``"tna"``.

    Raises:
        AssertionError: ``model_type`` is neither ``"ridge"`` nor ``"lasso"``.
    """
    assert model_type == "ridge" or model_type == "lasso"

    # ``axis`` must be passed by keyword: pandas >= 2.0 rejects it positionally
    X = df.drop(["tna"], axis=1)
    t = df["tna"]

    if model_type == "ridge":
        reg = linear_model.Ridge(alpha=a)
    elif model_type == "lasso":
        reg = linear_model.Lasso(alpha=a)
    else:
        # only reachable when asserts are stripped (python -O)
        raise ValueError('model_type must be "ridge" or "lasso"')
    reg.fit(X, t)

    # keep only the columns the fitted model rates at or above the threshold
    model = feature_selection.SelectFromModel(reg, threshold=thresh, prefit=True)
    selected_features = np.array(X.columns)[model.get_support()]

    reduced = pd.DataFrame(data=X, columns=selected_features)
    reduced["tna"] = t
    return reduced
def get_fs_model(model, method, train, target=None, cv=None):
    """Combine ``model`` with the feature selection technique named ``method``.

    ``"RFE"`` and ``"RFECV"`` wrap ``model`` in the recursive eliminator and
    return it fitted on ``train`` (and ``target`` when given). Every other
    method returns an unfitted two-step ``Pipeline`` made of a
    ``'feature_selection'`` step followed by ``model`` as ``'data_mining'``;
    for ``"fromModel"`` the selector step is fitted before it is placed in
    the pipeline.

    Args:
        model: estimator to combine with feature selection.
        method: one of ``"RFE"``, ``"RFECV"``, ``"linearSVC"``,
            ``"fromModel"``, ``"VarianceThreshold"``, ``"SelectPercentile"``,
            ``"SelectFpr"``, ``"SelectFdr"``, ``"SelectFwe"``, ``"ch2"``.
        train: training data.
        target: optional training labels.
        cv: cross-validation setting, used by ``"RFECV"`` only.

    Returns:
        The fitted RFE/RFECV object or the assembled ``Pipeline``.

    An unknown ``method`` prints a message and terminates the process with
    exit status 1.
    """

    def fit(estimator):
        # fit with labels only when the caller supplied them
        if target is not None:
            return estimator.fit(train, target)
        return estimator.fit(train)

    # n_features_to_select / step are keyword-only in current scikit-learn
    if method == "RFE":
        return fit(fs_scikit.RFE(model, n_features_to_select=2, step=5))
    if method == "RFECV":
        return fit(fs_scikit.RFECV(model, step=3, cv=cv))

    if method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
    elif method == "fromModel":
        sel = fs_scikit.SelectFromModel(model)
        fit(sel)
    # An "Anova" option (SelectKBest with f_regression, k=5) is currently
    # disabled.
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)

    return Pipeline([('feature_selection', sel), ('data_mining', model)])
def feature_selection_from_model(estimator, feature_data, target):
    """Fit ``estimator``, chart its feature importances and reduce the data.

    The estimator is fitted in place on ``feature_data`` / ``target``. Its
    ``feature_importances_`` are tabulated per column, sorted ascending,
    indexed by feature name and drawn as a horizontal bar chart. A prefit
    ``SelectFromModel`` with default settings then reduces ``feature_data``.

    Returns:
        ``(importance_table, reduced_feature_data)``.
    """
    estimator.fit(feature_data, target)

    ranking = (
        pd.DataFrame({'feature': feature_data.columns,
                      'importance': estimator.feature_importances_})
        .sort_values(by=['importance'], ascending=True)
        .set_index('feature')
    )
    ranking.plot(kind='barh', figsize=(20, 20))

    selector = feature_selection.SelectFromModel(estimator, prefit=True)
    reduced = selector.transform(feature_data)
    return ranking, reduced
def feature_select(X, y):
    """Select features of ``X`` with an extra-trees classifier.

    An ``ExtraTreesClassifier`` (``random_state=0``) is fitted on ``X``/``y``
    and wrapped in a prefit ``SelectFromModel``. The shape of the reduced
    data is printed.

    Returns:
        ``(reduced_X, selector)``.
    """
    from sklearn import ensemble
    from sklearn import feature_selection as fselect

    forest = ensemble.ExtraTreesClassifier(random_state=0)
    forest.fit(X, y)

    selector = fselect.SelectFromModel(forest, prefit=True)
    reduced = selector.transform(X)
    print("selected feature number:", reduced.shape)
    return reduced, selector
def select_from_model_gbdt(arr0, target):
    """Select features with a gradient-boosting classifier.

    ``arr0`` and ``target`` are converted to NumPy arrays and a
    ``SelectFromModel`` around a default ``GradientBoostingClassifier`` is
    fitted on them.

    Returns:
        ``(scores, indx, result)`` as plain lists: the output of
        ``get_importance`` for the fitted estimator, the selector's support
        mask, and the reduced data.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    features = np.array(arr0)
    labels = np.array(target)

    selector = feature_selection.SelectFromModel(GradientBoostingClassifier())
    selector.fit(features, labels)

    support = selector._get_support_mask().tolist()
    importances = get_importance(selector.estimator_).tolist()
    reduced = selector.transform(features).tolist()
    return importances, support, reduced
def select_from_model_lr(arr0, target):
    """Select features with an L1-penalised logistic regression.

    ``arr0`` and ``target`` are converted to NumPy arrays and a
    ``SelectFromModel`` around ``LogisticRegression(penalty="l1", C=0.1)`` is
    fitted on them.

    Returns:
        ``(scores, indx, result)`` as plain lists: the output of
        ``get_importance`` for the fitted estimator, the selector's support
        mask, and the reduced data.
    """
    from sklearn.linear_model import LogisticRegression

    matrix = np.array(arr0)
    target = np.array(target)

    # The L1 penalty needs a solver that supports it. scikit-learn's current
    # default (lbfgs) does not; liblinear does and was the earlier default.
    estimator = LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
    temp = feature_selection.SelectFromModel(estimator).fit(matrix, target)

    indx = temp._get_support_mask().tolist()
    scores = get_importance(temp.estimator_).tolist()
    result = temp.transform(matrix).tolist()
    return scores, indx, result
def select_features(self, alpha=-1, threshold_q='mean'):
    """Select training features with a ridge regression.

    A ``RidgeCV`` restricted to the single candidate ``alpha`` is fitted on
    ``self.train_X`` against the flattened ``self.train_Y`` and wrapped in a
    prefit ``SelectFromModel``.

    Args:
        alpha: the only regularisation strength offered to ``RidgeCV``.
            NOTE(review): the default of -1 is negative -- confirm the
            installed scikit-learn accepts it.
        threshold_q: ``SelectFromModel`` threshold.

    Returns:
        ``(train_X_selected, sfm)``: a DataFrame holding the kept columns of
        ``self.train_X`` (original index and column labels) and the selector.
    """
    fore_model = linear_model.RidgeCV(alphas=[alpha])
    # np.ravel's second parameter is ``order`` and must be a string; the
    # integer 1 used here previously is rejected by current NumPy. 'F' is
    # what legacy NumPy made of a truthy non-string order.
    targets = np.ravel(np.array(self.train_Y), order='F')
    fore_model.fit(self.train_X, targets)

    sfm = feature_selection.SelectFromModel(fore_model, threshold=threshold_q, prefit=True)
    train_X_selected = pd.DataFrame(
        sfm.transform(self.train_X),
        index=self.train_X.index,
        columns=self.train_X.columns[sfm.get_support()])
    return train_X_selected, sfm
def select_feature_by_L1(data_train, data_test):
    """Reduce train and test features with an L1-penalised linear SVC.

    The classifier is fitted on eight fixed feature columns of ``data_train``
    against its ``'rings'`` column. A prefit ``SelectFromModel`` with
    threshold 0.5 then reduces both data sets.

    Returns:
        ``(reduced_train, reduced_test)``.
    """
    feature_cols = [
        'sex', 'length', 'diameter', 'height', 'whole weight',
        'shucked weight', 'viscera weight', 'shell weight'
    ]
    train_y = data_train['rings']
    train_X = data_train[feature_cols]
    test_X = data_test[feature_cols]

    classifier = svm.LinearSVC(penalty='l1', dual=False)
    classifier.fit(train_X, train_y)
    selector = fs.SelectFromModel(classifier, threshold=0.5, prefit=True)

    reduced_train = selector.transform(train_X)
    reduced_test = selector.transform(test_X)
    return (reduced_train, reduced_test)
def lars():
    """Fit a LarsCV model on selected features and print its R^2 score.

    Data come from ``pu.load_data_full_subjects()``. The features combine
    ``conn_data``, the ``'age'`` column and the dummy-coded categorical
    columns; the target is the ``'distress_TQ'`` column. Columns whose name
    contains ``'categorical'`` pass through a zero-variance filter, all others
    are standardised, and an extra-trees ``SelectFromModel`` (threshold
    ``"2*mean"``) reduces the stacked matrix before ``LarsCV`` is fitted.

    The model is fitted and scored on the same data. Nothing is returned;
    only the R^2 value is printed.
    """
    behavior_data, conn_data = pu.load_data_full_subjects()
    # NOTE(review): astype returns a new object and the result is discarded
    # here, so this line does not change conn_data -- confirm the intent.
    conn_data.astype(float)

    categorical_variables = ['smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex']
    categorical_data = behavior_data[categorical_variables]
    dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
    covariate_data = pd.concat([behavior_data['age'], dummy_coded_categorical], axis=1)

    ml_data = pd.concat([conn_data, covariate_data], axis=1)
    target = behavior_data['distress_TQ'].values.astype(float)

    # Split the columns by name: anything containing 'categorical' is treated
    # as categorical, everything else as continuous.
    feature_names = list(ml_data)
    continuous_features = [f for f in feature_names if 'categorical' not in f]
    continuous_indices = [ml_data.columns.get_loc(cont) for cont in continuous_features]
    categorical_features = [f for f in feature_names if 'categorical' in f]
    categorical_indices = [ml_data.columns.get_loc(cat) for cat in categorical_features]

    ml_continuous = ml_data.values[:, continuous_indices]
    ml_categorical = ml_data.values[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(ml_continuous)
    ml_z = preproc.transform(ml_continuous)

    # Variance threshold for categorical data
    varthresh = feature_selection.VarianceThreshold(threshold=0).fit(ml_categorical)
    ml_v = varthresh.transform(ml_categorical)

    # Column order is now: continuous columns, then surviving categorical ones
    ml_preprocessed = np.hstack((ml_z, ml_v))

    # Feature selection with extra trees
    clf = ensemble.ExtraTreesRegressor()
    model = feature_selection.SelectFromModel(clf, threshold="2*mean")

    # Fit the selector on the preprocessed data and keep the selected columns
    ml_cleaned = model.fit_transform(ml_preprocessed, target)
    feature_indices = model.get_support(indices=True)
    # NOTE(review): feature_indices index ml_preprocessed, while feature_names
    # follows ml_data's column order; the two only agree if the categorical
    # columns come last and the variance filter removed none -- verify.
    cleaned_features = [feature_names[i] for i in feature_indices]

    lars_classifier = linear_model.LarsCV(cv=3, normalize=False, fit_intercept=False)
    lars_classifier.fit(ml_cleaned, target)

    predicted = lars_classifier.predict(ml_cleaned)
    r2 = lars_classifier.score(ml_cleaned, target)

    # These metrics are computed but not used afterwards.
    exp_var = metrics.explained_variance_score(target, predicted)
    max_err = metrics.max_error(target, predicted)
    mae = metrics.mean_absolute_error(target, predicted)
    mse = metrics.mean_squared_error(target, predicted)
    print(r2)
def process(df):
    """Build an unfitted standardise -> select -> LassoCV pipeline.

    The ``LassoCV`` fold count is set to the number of rows in ``df``; ``df``
    is not used for anything else.

    Returns:
        A ``Pipeline`` with the steps ``"standardize"``, ``"feat_select"``
        and ``"regressor"``.
    """
    n_rows = len(df.index)

    scaler = StandardScaler()
    selector = ftselect.SelectFromModel(RandomForestRegressor())
    # Alternative regressor kept for reference:
    # ElasticNetCV(cv=len(df.index), max_iter=1000, normalize=False)
    regressor = LassoCV(cv=n_rows, max_iter=3000, normalize=False)

    steps = [
        ("standardize", scaler),
        ("feat_select", selector),
        ("regressor", regressor),
    ]
    return Pipeline(memory=None, steps=steps)
def feature_selection_trees(trainX, trainY, testX, testY):
    """
    Keep the features an extra-trees classifier rates as important.

    A 250-tree ``ExtraTreesClassifier`` (``random_state=1729``) is fitted on
    the training data and wrapped in a prefit ``SelectFromModel``.
    ``testY`` is accepted but not used.

    Returns the filtered training and testing feature sets.
    """
    forest = ensemble.ExtraTreesClassifier(random_state=1729,
                                           n_estimators=250,
                                           n_jobs=-1)
    forest.fit(trainX, trainY)

    selector = feature_selection.SelectFromModel(forest, prefit=True)
    return selector.transform(trainX), selector.transform(testX)
def fit(self, on_engine, velocities, accelerations, *args):
    """Fit the full and the base engine-on classifiers.

    When every entry of ``on_engine`` is truthy, ``self.base`` and
    ``self.model`` are both set to one shared ``DefaultStartStopModel``.
    Otherwise the inputs are column-stacked and two pipelines are fitted
    against ``on_engine``:

    * ``self.model`` -- ``SelectFromModel`` around a depth-4 decision tree,
      followed by that same tree as classifier;
    * ``self.base`` -- a depth-3 decision tree on the first two stacked
      columns only.

    Returns:
        ``self``.
    """
    if on_engine.all():
        self.base = self.model = DefaultStartStopModel()
        return self

    features = np.column_stack((velocities, accelerations) + args)

    full_tree = sk_tree.DecisionTreeClassifier(random_state=0, max_depth=4)
    self.model = sk_pip.Pipeline([
        ('feature_selection', sk_fsel.SelectFromModel(full_tree)),
        ('classification', full_tree)
    ])
    self.model.fit(features, on_engine)

    base_tree = sk_tree.DecisionTreeClassifier(random_state=0, max_depth=3)
    first_two_columns = sk_prep.FunctionTransformer(lambda X: X[:, :2])
    self.base = sk_pip.Pipeline([
        ('feature_selection', first_two_columns),
        ('classification', base_tree)
    ])
    self.base.fit(features, on_engine)

    return self
def FeatSelect(df, k, m, tna_predict, thresh):
    """Drop weak predictors of the ``"class"`` column using a decision tree.

    A ``DecisionTreeClassifier`` is fitted on every column except
    ``"class"``; ``SelectFromModel`` then decides from the fitted tree and
    ``thresh`` which columns to keep.

    Args:
        df: DataFrame containing the target column ``"class"``. Not modified.
        k: not used by this function.
        m: not used by this function.
        tna_predict: not used by this function.
        thresh: ``SelectFromModel`` threshold below which a feature is
            dropped.

    Returns:
        ``[old_df, reduced_df]``: a copy of the input frame, and a new frame
        with the selected feature columns plus ``"class"``.
    """
    old_df = df.copy()  # returned so callers can read the untouched values

    # ``axis`` must be passed by keyword: pandas >= 2.0 rejects it positionally
    X = old_df.drop(["class"], axis=1)
    t = old_df["class"]

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X, t)

    model = feature_selection.SelectFromModel(clf, threshold=thresh, prefit=True)
    selected_features = np.array(X.columns)[model.get_support()]

    reduced = pd.DataFrame(data=X, columns=selected_features)
    reduced["class"] = t
    return [old_df, reduced]
def TNAregression(train_df, test_df):
    """Predict ``"tna"`` for ``test_df`` with a feature-selected ridge model.

    Both frames go through ``uncorrupt``, ``FE_without_tna(..., 2)`` and
    ``standardisation_without_tna`` and receive a constant ``"ones"`` column.
    A ridge regression (``alpha=0.0016``) fitted on the training features
    drives a ``SelectFromModel`` with threshold 8.5; a second ridge regression
    with the same alpha is fitted on the surviving columns and used for the
    prediction.

    Returns:
        A single-column DataFrame holding the predictions.
    """
    # Prepare the training data
    train_features = standardisation_without_tna(
        FE_without_tna(uncorrupt(train_df), 2))
    train_features["ones"] = 1

    # ``axis`` must be passed by keyword: pandas >= 2.0 rejects it positionally
    X = train_features.drop(["tna"], axis=1)
    t = train_features["tna"]

    # First pass on all features, used only to rank them
    reg = linear_model.Ridge(alpha=0.0016)
    reg.fit(X, t)
    model = feature_selection.SelectFromModel(reg, threshold=8.5, prefit=True)
    selected_features = np.array(X.columns)[model.get_support()]

    # Second pass on the surviving features
    X_new = pd.DataFrame(data=X, columns=selected_features)
    reg_new = linear_model.Ridge(alpha=0.0016)
    reg_new.fit(X_new, t)

    # Prepare the test data the same way and keep the same columns
    test_features = standardisation_without_tna(
        FE_without_tna(uncorrupt(test_df), 2))
    test_features["ones"] = 1
    X_test = test_features[X_new.columns]

    tna_predict = reg_new.predict(X_test)
    return pd.DataFrame(data=tna_predict)
def SplitTrainTest_GetFeat_RF(df, method_ImpFeature_select):
    '''
    Select the important features of a feature table with a random forest.

    The first column of ``df`` is the label; every other column is a feature.
    A 500-tree ``RandomForestClassifier`` is fitted through
    ``SelectFromModel`` and features whose importance reaches the mean
    importance are kept.

    :param df: feature table data frame, label in the first column
    :param method_ImpFeature_select: accepted but not used by this function
    :return: list ``[df_forSVM, n_features, elapsed, feat_imp]``: the label
        column joined with the selected features, the number of selected
        features, the time spent fitting and transforming (``timer()``
        difference), and the selected feature names in column order
    '''
    y = df.iloc[:, 0]
    x = df.iloc[:, 1:]

    start = timer()
    clf_RF = RandomForestClassifier(bootstrap=True,
                                    n_estimators=500,
                                    n_jobs=2,
                                    oob_score=True,
                                    class_weight='balanced',
                                    criterion='gini',
                                    max_depth=60,
                                    # 'sqrt' is what the removed 'auto'
                                    # option meant for classifiers
                                    max_features='sqrt',
                                    max_leaf_nodes=None,
                                    min_samples_leaf=2,
                                    min_samples_split=2,
                                    min_weight_fraction_leaf=0.0,
                                    random_state=0,
                                    verbose=1,
                                    warm_start=False)
    # threshold='mean' keeps features at or above the mean importance
    sfm = feature_selection.SelectFromModel(clf_RF, threshold='mean')
    sfm.fit(x, y)
    x_sfm = sfm.transform(x)
    end = timer()

    support = sfm.get_support()
    # Reuse the original index: concat aligns on it, so a fresh RangeIndex
    # would misalign rows for any frame without a default index.
    df_x_sfm = pd.DataFrame(x_sfm, index=x.index, columns=x.columns[support])
    n_features = x_sfm.shape[1]
    df_forSVM = pd.concat([y, df_x_sfm], axis=1)

    feat_imp = list(x.columns[support])
    return [df_forSVM, n_features, end - start, feat_imp]
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    """Train a dimensionality-reduction plus feature-selection pipeline.

    The number of feature clusters is the Johnson-Lindenstrauss minimum
    dimension for the sample count and ``eps``, capped at the number of
    features. ``train_x`` is quantile-transformed, agglomerated to that many
    clusters and robust-scaled; a 100-tree extra-trees classifier fitted on
    the result drives a prefit ``SelectFromModel`` with ``threshold``. The
    largest and smallest feature importances are printed.

    Returns:
        A ``Pipeline`` chaining the fitted reduction pipeline (``'dr_pipe'``)
        and the selector (``'feat_sel'``).
    """
    n_samples, n_features, _ = get_counts_tt(train_x, train_y)

    # number of components: JL bound, never more than the available features
    jl_dim = random_projection.johnson_lindenstrauss_min_dim(
        n_samples=n_samples, eps=eps)
    n_clusters = min(jl_dim, n_features)

    # scale, agglomerate to n_clusters, scale again
    reducer = pipeline.Pipeline([
        ('scaler', preprocessing.QuantileTransformer()),
        ('feat_agg', cluster.FeatureAgglomeration(n_clusters=n_clusters)),
        ('scaler2', preprocessing.RobustScaler()),
    ])
    reducer.fit(train_x)
    reduced_x = reducer.transform(train_x)

    # rank the reduced features with extra trees
    forest = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    forest.fit(reduced_x, train_y)

    print("Feature importances:")
    print("\tMax:", max(forest.feature_importances_))
    print("\tMin:", min(forest.feature_importances_))

    selector = feature_selection.SelectFromModel(
        forest, prefit=True, threshold=threshold)

    return pipeline.Pipeline([('dr_pipe', reducer), ('feat_sel', selector)])
def feature_selection_without_covariates(x_train, x_test, y_train, feature_names):
    """Standardise the data and keep features selected by extra trees.

    The scaler and the selector are fitted on the training data only and then
    applied to the test data. The selector is an ``ExtraTreesClassifier``
    seeded with the module-level ``seed`` inside a ``SelectFromModel`` with
    threshold ``"2*mean"``.

    Returns:
        ``(x_train_selected, x_test_selected, cleaned_features)`` where
        ``cleaned_features`` are the entries of ``feature_names`` at the
        selected column positions.
    """
    # Standardise with statistics of the training set
    scaler = preprocessing.StandardScaler()
    train_z = scaler.fit_transform(x_train)
    test_z = scaler.transform(x_test)

    # Select features with extra trees
    forest = ensemble.ExtraTreesClassifier(random_state=seed)
    selector = feature_selection.SelectFromModel(forest, threshold="2*mean")
    selector.fit(train_z, y_train)

    train_selected = selector.transform(train_z)
    test_selected = selector.transform(test_z)

    kept_names = [feature_names[position]
                  for position in selector.get_support(indices=True)]
    return train_selected, test_selected, kept_names
def train():
    """Train several classifiers on selected features and save the best one.

    The CSV at ``args.datadir`` is read with the columns ``hash``,
    ``entropy`` and ``y``. Features (everything except ``hash`` and ``y``)
    are filtered with an extra-trees ``SelectFromModel``, split 80/20, and
    used to fit five classifiers. Each test accuracy is printed and the
    classifier with the highest accuracy is dumped to ``classifier.pkl`` in
    ``args.output``.
    """
    data = pd.read_csv(args.datadir, names=['hash', 'entropy', 'y'])
    feature_frame = data.drop(['hash', 'y'], axis=1)
    X = feature_frame.values
    y = data['y'].values

    # Feature selection using Trees Classifier
    fsel = ske.ExtraTreesClassifier().fit(X, y)
    model = feature_selection.SelectFromModel(fsel, prefit=True)
    X_new = model.transform(X)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X_new, y, test_size=0.2)

    # Names of the kept columns, in the column order of X_new. They are taken
    # from the feature frame itself; a fixed offset into data.columns pointed
    # at the label column for this CSV layout.
    # NOTE(review): this list is computed but never persisted -- only the
    # classifier is dumped below; confirm whether it should be saved too.
    features = list(feature_frame.columns[model.get_support()])

    # Algorithm comparison
    algorithms = {
        "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
        "RandomForest": ske.RandomForestClassifier(n_estimators=10),
        "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=10),
        "AdaBoost": ske.AdaBoostClassifier(n_estimators=10),
        "GNB": GaussianNB()
    }

    results = {}
    for algo, clf in algorithms.items():
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print("%s : %f %%" % (algo, score * 100))
        results[algo] = score

    # choose best algorithm
    winner = max(results, key=results.get)

    # save model
    joblib.dump(algorithms[winner], os.path.join(args.output, 'classifier.pkl'))
def get_features(X, y, fsm):
    """Reduce ``X`` with the feature selection method coded by ``fsm``.

    Args:
        X: feature matrix.
        y: labels; not used by the variance threshold.
        fsm: ``'1'`` for chi-squared ``SelectKBest`` keeping half of the
            columns (rounded up), ``'2'`` for ``VarianceThreshold`` with the
            module-level ``vThreshold``, ``'3'`` for ``SelectFromModel``
            around an ``ExtraTreesClassifier``.

    Returns:
        The reduced feature matrix.

    Raises:
        ValueError: ``fsm`` is not one of the codes above.
    """
    if fsm == '1':  # SelectKBest
        k_best = math.ceil(len(X[0]) / 2)
        return f_selection.SelectKBest(chi2, k=k_best).fit_transform(X, y)

    if fsm == '2':  # VarianceThreshold
        return f_selection.VarianceThreshold(
            threshold=vThreshold).fit_transform(X)

    if fsm == '3':  # SelectFromModel
        # SelectFromModel.fit trains its own copy of the estimator, so the
        # forest is not fitted separately beforehand.
        clf = ExtraTreesClassifier(random_state=200)
        model = f_selection.SelectFromModel(clf).fit(X, y)
        return model.transform(X)

    raise ValueError("invalid fsm")
def modelSelector(self, model_name="rf", inplace=True):
    """
    Select features of ``self.x`` with a model-based selector.

    :model_name {"rf", "lasso"}: a 10-tree random forest classifier or a
        5-fold ``LassoCV``
    :inplace: when true, ``self.x``, ``self.n`` and ``self.featureNames``
        are replaced by their reduced versions

    The support mask is always stored on ``self.indexs``. Returns the reduced
    feature matrix together with ``self.y``.
    """
    print("Feature selecting method: ", model_name)

    candidates = {
        "rf": ensemble.RandomForestClassifier(n_estimators=10),
        "lasso": linear_model.LassoCV(cv=5, max_iter=5000)
    }
    chooser = feature_selection.SelectFromModel(candidates[model_name])
    chooser.fit(self.x, self.y)
    self.indexs = chooser.get_support()

    if not inplace:
        return self.x[:, self.indexs], self.y

    self.x = self.x[:, self.indexs]
    self.n = self.x.shape[1]
    self.featureNames = self.featureNames[self.indexs]
    return self.x, self.y
def train_secondary(X_train, Y_train):
    """Grid-search a feature-selecting decision tree and save the best one.

    The pipeline consists of a ``SelectFromModel`` around a decision tree
    (threshold 0.05, registered under the step name ``'pca'``) followed by a
    ``DecisionTreeClassifier`` (``'dt'``). ``dt__max_depth`` is searched over
    5, 25, 45, 65 and 85 with 3-fold cross-validation scored by ROC AUC. The
    best pipeline is dumped to ``./best_secondary_model.pkl`` and the best
    score and parameters are printed.

    Returns:
        The best fitted pipeline.
    """
    clf = Pipeline(steps=[
        ('pca', feature_selection.SelectFromModel(DecisionTreeClassifier(), threshold=0.05)),
        ('dt', DecisionTreeClassifier()),
    ])
    params = dict(
        dt__max_depth=np.arange(5, 105, 20),
    )
    best_clf = GridSearchCV(clf, params, n_jobs=16, scoring='roc_auc', verbose=0, cv=3)
    best_clf.fit(X_train, Y_train)

    joblib.dump(best_clf.best_estimator_, './best_secondary_model.pkl')
    # print() calls instead of Python 2 print statements
    print(best_clf.best_score_)
    print(best_clf.best_params_)
    return best_clf.best_estimator_
def feature_selection_with_covariates(x_train, x_test, y_train, continuous_indices, categorical_indices, feature_names):
    """Preprocess mixed data and keep features selected by extra trees.

    Continuous columns are standardised and categorical columns pass through
    a zero-variance filter, both fitted on the training data only. The two
    parts are stacked (continuous first) and reduced by a ``SelectFromModel``
    around an ``ExtraTreesClassifier`` seeded with the module-level ``seed``,
    using the threshold ``"2*mean"``.

    Args:
        x_train: training matrix, indexable as ``x_train[:, indices]``.
        x_test: test matrix with the same column layout.
        y_train: training labels.
        continuous_indices: column positions of the continuous features.
        categorical_indices: column positions of the categorical features.
        feature_names: names indexed by original column position of
            ``x_train`` -- assumed from how the indices are used; confirm
            against callers.

    Returns:
        ``(x_train_selected, x_test_selected, cleaned_features)``.
    """
    # Split data for continuous, categorical preprocessing
    x_train_cont, x_test_cont = x_train[:, continuous_indices], x_test[:, continuous_indices]
    x_train_cat, x_test_cat = x_train[:, categorical_indices], x_test[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(x_train_cont)
    x_train_z = preproc.transform(x_train_cont)
    x_test_z = preproc.transform(x_test_cont)

    # Variance threshold for categorical data
    varthresh = feature_selection.VarianceThreshold(threshold=0).fit(x_train_cat)
    x_train_v = varthresh.transform(x_train_cat)
    x_test_v = varthresh.transform(x_test_cat)

    x_train_data = np.hstack((x_train_z, x_train_v))
    x_test_data = np.hstack((x_test_z, x_test_v))

    # Original column position of every column of the stacked matrix. The
    # stacking reorders columns and the variance filter may drop some, so
    # selector indices cannot be used on feature_names directly.
    positions = np.arange(x_train.shape[1])
    stacked_positions = np.concatenate((
        positions[continuous_indices],
        positions[categorical_indices][varthresh.get_support()],
    ))

    # Feature selection with extra trees
    extra_tree_fs = ensemble.ExtraTreesClassifier(random_state=seed)
    feature_model = feature_selection.SelectFromModel(extra_tree_fs, threshold="2*mean")

    # Fit on the training data, then apply the same selection to the test data
    x_train_feature_selected = feature_model.fit_transform(x_train_data, y_train)
    x_test_feature_selected = feature_model.transform(x_test_data)

    feature_indices = feature_model.get_support(indices=True)
    cleaned_features = [feature_names[i] for i in stacked_positions[feature_indices]]
    return x_train_feature_selected, x_test_feature_selected, cleaned_features
def FeatSelectDT(df, thresh):
    """Drop weak predictors of the ``"class"`` column using a decision tree.

    A ``DecisionTreeClassifier`` is fitted on every column except
    ``"class"``; ``SelectFromModel`` then decides from the fitted tree and
    ``thresh`` which columns to keep.

    Args:
        df: DataFrame containing the target column ``"class"``. Not modified.
        thresh: ``SelectFromModel`` threshold below which a feature is
            dropped.

    Returns:
        A new DataFrame with the selected feature columns plus ``"class"``.
    """
    # ``axis`` must be passed by keyword: pandas >= 2.0 rejects it positionally
    X = df.drop(["class"], axis=1)
    t = df["class"]

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X, t)

    model = feature_selection.SelectFromModel(clf, threshold=thresh, prefit=True)
    selected_features = np.array(X.columns)[model.get_support()]

    reduced = pd.DataFrame(data=X, columns=selected_features)
    reduced["class"] = t
    return reduced
def _get_support_mask(self):
    """Return the feature-support mask with forced exclusions and inclusions.

    The parent class computes the mask. When that raises ``ValueError`` the
    mask is recomputed from the wrapped ``estimator.estimator_`` through a
    plain ``SelectFromModel``, taking the estimator from ``self.estimator``
    (when ``self.prefit`` is truthy) or ``self.estimator_``.

    Afterwards the positions in ``self._out_mask`` are forced to ``False``
    and those in ``self._in_mask`` to ``True`` (inclusions are applied last).

    Raises:
        ValueError: the fallback is needed but no fitted estimator exists.
    """
    try:
        support = super(_SelectFromModel, self)._get_support_mask()
    except ValueError:
        # SelectFromModel can directly call on transform.
        if not self.prefit and not hasattr(self, 'estimator_'):
            raise ValueError(
                'Either fit the model before transform or set "prefit=True"'
                ' while passing the fitted estimator to the constructor.')
        fitted = self.estimator if self.prefit else self.estimator_

        # NOTE(review): the third positional argument is presumably
        # ``prefit`` -- confirm against the installed scikit-learn signature.
        fallback = sk_fsel.SelectFromModel(fitted.estimator_, self.threshold,
                                           True)
        support = fallback._get_support_mask()

    for position in self._out_mask:
        support[position] = False
    for position in self._in_mask:
        support[position] = True

    return support
def select_features(
    X_train: pd.DataFrame,
    X_valid: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    parameters: dict,
    model=None,
) -> dict:
    """Extracts the most relevant features from the sample.

    The selector is fitted on the training data only and then applied to all
    three data sets. ``parameters["features"]["selection"]`` controls it:
    ``"skip"`` bypasses selection entirely, ``"type"`` chooses between
    ``"model"`` (a ``LinearSVC`` built from the ``"model"`` sub-dict, wrapped
    in a prefit ``SelectFromModel``) and ``"k_best"`` (``SelectKBest`` with
    mutual information, ``k`` taken from ``["k_best"]["k"]``).

    Args:
        X_train: training data.
        X_valid: validation data.
        X_test: test data.
        y_train: training labels.
        parameters: parameters defined in parameters.yml.
        model: accepted but not used by this function.

    Returns:
        A dictionary containing:
            The reduced training, validation and test data
            (``X_train_reduced``, ``X_valid_reduced``, ``X_test_reduced``).
            The included and excluded variables (``included``, ``excluded``),
            both in the column order of ``X_train``.

    Raises:
        ValueError: the configured selection type is not supported.
    """
    log = logging.getLogger(__name__)
    paras = parameters["features"]["selection"]

    if paras["skip"]:
        log.warning(red("Skipping feature selection."))
        pause()
        return dict(
            X_train_reduced=X_train,
            X_valid_reduced=X_valid,
            X_test_reduced=X_test,
            included=X_train.columns,
            excluded=[],
        )

    # Get feature names
    features = list(X_train.columns)

    selection_type = paras["type"]
    if selection_type == "model":
        # Instantiate the selector and fit it to the training data
        lsvc = LinearSVC(**paras["model"]).fit(X_train, y_train)
        selector = fs.SelectFromModel(lsvc, prefit=True)
    elif selection_type == "k_best":
        selector = fs.SelectKBest(fs.mutual_info_classif, k=paras["k_best"]["k"])
        selector.fit(X_train, y_train)
    else:
        # fail with a clear message instead of an unbound `selector` below
        raise ValueError(
            "Unknown feature selection type: {!r}".format(selection_type))

    # Get feature mask
    mask = selector.get_support()
    included = list(compress(features, mask))
    # built from the mask so the order is deterministic
    excluded = [name for name, keep in zip(features, mask) if not keep]
    log.info(blue("Included {} variables: {}".format(len(included), included)))
    log.info(blue("Excluded {} variables: {}".format(len(excluded), excluded)))

    # Transform the datasets using the fitted selector
    return dict(
        X_train_reduced=selector.transform(X_train),
        X_valid_reduced=selector.transform(X_valid),
        X_test_reduced=selector.transform(X_test),
        included=included,
        excluded=excluded,
    )