def test_lofo_importance():
    """LOFO importance on a small synthetic dataset: one row per feature, B ranked first."""
    df = generate_test_data(1000)
    features = ["A", "B", "C", "D"]
    lgbm = LGBMRegressor(random_state=0, n_jobs=4)

    # FIX: pass df first and the model as a keyword, matching the LOFOImportance
    # call signature used by every other test in this file; the original
    # (model, df, features, target) positional order is the legacy API.
    lofo = LOFOImportance(df, features, 'binary_target', model=lgbm, cv=4, scoring='roc_auc')
    importance_df = lofo.get_importance()

    assert len(features) == importance_df.shape[0], "Missing importance value for some features!"
    assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!"
def test_multithreading():
    """Parallel LOFO (n_jobs=3) on a large dataset still yields B as the top feature."""
    data = generate_test_data(100000)
    feature_list = ["A", "B", "C", "D"]
    model = LogisticRegression(solver='liblinear')
    splitter = KFold(n_splits=4, shuffle=True, random_state=0)

    importance_df = LOFOImportance(
        data, feature_list, 'binary_target',
        model=model, cv=splitter, scoring='roc_auc', n_jobs=3,
    ).get_importance()

    assert importance_df.shape[0] == len(feature_list), "Missing importance value for some features!"
    assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!"
def test_default_model():
    """Default-model LOFO runs for both targets, never mutates the input df, ranks E first."""
    data = generate_unstructured_test_data(1000)
    snapshot = data.copy()
    feature_list = ["A", "B", "C", "D", "E"]

    # Regression target with the library's default model.
    regression_imp = LOFOImportance(
        data, feature_list, 'target', cv=4, scoring='neg_mean_absolute_error',
    ).get_importance()
    assert regression_imp.shape[0] == len(feature_list), "Missing importance value for some features!"

    # Binary target with the library's default model.
    binary_imp = LOFOImportance(
        data, feature_list, 'binary_target', cv=4, scoring='roc_auc',
    ).get_importance()
    assert data.equals(snapshot), "LOFOImportance mutated the dataframe!"
    assert binary_imp["feature"].values[0] == "E", "Most important feature is different than E!"
from sklearn.model_selection import KFold import xgboost as xgb xgbm = xgb.XGBClassifier() sample_df = trainset.sample(frac=0.001, random_state=0) # Due to huge execution time, a relatively subset from original train set is used for obtaining feature importance cv = KFold(n_splits=5, shuffle=False, random_state=0) print(sample_df.shape) from lofo.lofo_importance import LOFOImportance, plot_importance # feature importance library LOFO target = "HasDetections" features = [col for col in sample_df.drop(['MachineIdentifier', 'ProductName'], axis='columns').columns if col != target] lofo = LOFOImportance(sample_df, features, target, cv=cv, scoring="roc_auc", model=xgbm) importance_df = lofo.get_importance() print(importance_df.head()) plot_importance(importance_df, figsize=(12, 20)) # plot feature importance values oredered in descending to_keep_cols = list(importance_df['feature'].values[:30]) # get the first 30 important features from importance_df (this value is set 30 from plot, just a decision) to_keep_cols += ["HasDetections", 'MachineIdentifier'] print(to_keep_cols) """ ['SmartScreen', 'Wdft_IsGamer', 'AVProductStatesIdentifier', 'Census_HasOpticalDiskDrive', 'EngineVersion',