def test_lofo_importance():
    """LOFO importance on synthetic data should yield one row per feature,
    with feature B ranked as the most important."""
    df = generate_test_data(1000)

    features = ["A", "B", "C", "D"]

    lgbm = LGBMRegressor(random_state=0, n_jobs=4)

    # Consistency fix: every other test in this file passes the dataframe
    # first and the model as a keyword argument; the original passed the
    # model as the first positional argument, which does not match the
    # LOFOImportance(df, features, target, model=...) signature used below.
    lofo = LOFOImportance(df, features, 'binary_target', model=lgbm, cv=4, scoring='roc_auc')

    importance_df = lofo.get_importance()

    assert len(features) == importance_df.shape[0], "Missing importance value for some features!"
    assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!"
# Esempio n. 2 (scraped example separator; commented out so the file parses)
def test_multithreading():
    """Running LOFO with n_jobs > 1 must still emit one importance row per
    feature and rank feature B first."""
    data = generate_test_data(100000)

    feature_names = ["A", "B", "C", "D"]

    model = LogisticRegression(solver='liblinear')
    splitter = KFold(n_splits=4, shuffle=True, random_state=0)

    importance = LOFOImportance(data, feature_names, 'binary_target',
                                model=model, cv=splitter,
                                scoring='roc_auc', n_jobs=3).get_importance()

    assert importance.shape[0] == len(feature_names), "Missing importance value for some features!"
    assert importance["feature"].values[0] == "B", "Most important feature is different than B!"
def test_default_model():
    """With no model supplied, LOFO must pick a sensible default for both a
    regression and a binary-classification target, and must not mutate the
    caller's dataframe."""
    frame = generate_unstructured_test_data(1000)
    snapshot = frame.copy()  # taken up front to detect any in-place mutation

    cols = ["A", "B", "C", "D", "E"]

    # Regression target: every feature should receive an importance row.
    reg_importance = LOFOImportance(frame, cols, 'target', cv=4,
                                    scoring='neg_mean_absolute_error').get_importance()
    assert reg_importance.shape[0] == len(cols), "Missing importance value for some features!"

    # Binary-classification target: feature E is expected to dominate.
    clf_importance = LOFOImportance(frame, cols, 'binary_target', cv=4,
                                    scoring='roc_auc').get_importance()

    assert frame.equals(snapshot), "LOFOImportance mutated the dataframe!"
    assert clf_importance["feature"].values[0] == "E", "Most important feature is different than E!"
# Esempio n. 4 (scraped example separator; commented out so the file parses)
from sklearn.model_selection import KFold
import xgboost as xgb

from lofo.lofo_importance import LOFOImportance, plot_importance  # feature importance library LOFO

xgbm = xgb.XGBClassifier()

# Due to the huge execution time, a relatively small subset of the original
# train set is used for obtaining feature importance.
sample_df = trainset.sample(frac=0.001, random_state=0)
# Bug fix: scikit-learn (>= 0.24) raises a ValueError when random_state is
# passed together with shuffle=False, because it has no effect in that case.
cv = KFold(n_splits=5, shuffle=False)
print(sample_df.shape)

target = "HasDetections"
# All columns except the identifier/name columns and the target itself.
features = [col for col in sample_df.drop(['MachineIdentifier', 'ProductName'], axis='columns').columns
            if col != target]

lofo = LOFOImportance(sample_df, features, target, cv=cv, scoring="roc_auc", model=xgbm)

importance_df = lofo.get_importance()
print(importance_df.head())

plot_importance(importance_df, figsize=(12, 20))  # plot feature importance values ordered descending

# Keep the 30 most important features (cut-off chosen by eye from the plot),
# plus the target and identifier columns needed downstream.
to_keep_cols = list(importance_df['feature'].values[:30])
to_keep_cols += ["HasDetections", 'MachineIdentifier']
print(to_keep_cols)
"""
['SmartScreen',
 'Wdft_IsGamer',
 'AVProductStatesIdentifier',
 'Census_HasOpticalDiskDrive',
 'EngineVersion',