def test_dataset():
    """Check that ``Dataset`` raises for three malformed ``feature_groups`` inputs.

    The cases exercised, in order:

    1. a feature group whose row count differs from the dataframe's row count;
    2. a feature group whose name ("A") is already used by a regular feature;
    3. a feature group that is a ``DataFrame`` rather than a ``numpy.ndarray``
       or a scipy sparse matrix.
    """
    df = generate_test_data(1000, text=True)
    features = ["A", "B", "C", "D"]

    def expect_rejected(feature_groups):
        # The constructor is called directly rather than inside ``assert``:
        # ``pytest.raises`` already fails the test when nothing is raised, and
        # an ``assert`` statement (together with the call it wraps) is removed
        # entirely when Python runs with ``-O``.
        # NOTE(review): ``Exception`` is kept because the concrete exception
        # type raised by ``Dataset`` is not visible here; narrow it if known.
        with pytest.raises(Exception):
            Dataset(df=df,
                    target="binary_target",
                    features=features,
                    feature_groups=feature_groups)

    # Exception: feature group row count is not equal to the features' row count
    expect_rejected({
        "interactions": df[["A", "B"]].values[:10] * df[["C", "D"]].values[:10]
    })

    # Exception: Feature group name A is in use by other features
    expect_rejected({"A": df[["A", "B"]].values * df[["C", "D"]].values})

    # Exception: Feature group type is not numpy.ndarray or scipy.csr.csr_matrix
    expect_rejected({"E": df[["A", "B"]]})
def test_feature_groups():
    """Run LOFO importance on a dataset that mixes plain features and feature groups.

    Two groups are supplied next to the four plain features: "names"
    (character 3-gram counts of column "T") and "interactions" (element-wise
    product of columns A, B with columns C, D). The test expects one
    importance row per feature and per group, with "names" ranked first.
    """
    frame = generate_test_data(1000, text=True)
    plain_features = ["A", "B", "C", "D"]

    vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer="char")
    groups = {
        "names": vectorizer.fit_transform(frame["T"]),
        "interactions": frame[["A", "B"]].values * frame[["C", "D"]].values,
    }

    data = Dataset(
        df=frame,
        target="binary_target",
        features=plain_features,
        feature_groups=groups,
    )

    classifier = LGBMClassifier(random_state=0, n_jobs=4)
    importance = LOFOImportance(
        data, model=classifier, cv=4, scoring='roc_auc'
    ).get_importance()

    # One row is expected for every plain feature plus one per feature group.
    expected_rows = len(plain_features) + len(groups)
    assert expected_rows == importance.shape[0], \
        "Missing importance value for some features!"

    top_feature = importance["feature"].values[0]
    assert top_feature == "names", \
        "Most important feature is different than 'names'!"
def test_flofo_importance():
    """Check FLOFOImportance on a pre-fitted classifier, with and without ``n_jobs``.

    A LightGBM classifier is fitted on the training split, then two
    ``FLOFOImportance`` objects (default and ``n_jobs=3``) are built on the
    full dataframe. The test verifies that both return the features in the
    same order, that the dataframe handed to them is left unchanged, that
    every feature gets an importance row, and that "B" is ranked first.
    """
    df = generate_test_data(100000)
    # Values of "A" below the column median are replaced with missing values.
    df.loc[df["A"] < df["A"].median(), "A"] = None

    # Only the training part of the split is used (to fit the model); the
    # importance objects below are given the full ``df``.
    train_df, _ = train_test_split(df, test_size=0.2, random_state=0)

    # Snapshot the frame that is actually passed to FLOFOImportance, so the
    # mutation check below compares the object under test. Checkpointing the
    # unused validation split, as before, could never detect a mutation.
    df_checkpoint = df.copy()

    features = ["A", "B", "C", "D"]

    lgbm = LGBMClassifier(random_state=0, n_jobs=1)
    lgbm.fit(train_df[features], train_df["binary_target"])

    flofo = FLOFOImportance(lgbm, df, features, 'binary_target', scoring='roc_auc')
    flofo_parallel = FLOFOImportance(lgbm, df, features, 'binary_target', scoring='roc_auc', n_jobs=3)

    importance_df = flofo.get_importance()
    importance_df_parallel = flofo_parallel.get_importance()
    # Element-wise comparison of the two feature rankings.
    is_feature_order_same = importance_df["feature"].values == importance_df_parallel["feature"].values

    # Called only to make sure plotting runs on the result.
    plot_importance(importance_df)

    assert is_feature_order_same.sum() == len(features), "Parallel FLOFO returned different result!"
    assert df.equals(df_checkpoint), "FLOFOImportance mutated the dataframe!"
    assert len(features) == importance_df.shape[0], "Missing importance value for some features!"
    assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!"
def test_lofo_importance():
    """Run LOFOImportance with an LGBMRegressor passed as the first positional argument.

    Expects one importance row per feature, with "B" ranked first.

    NOTE(review): a second ``test_lofo_importance`` is defined later in this
    file; if both live in the same module, the later definition replaces
    this one.
    """
    frame = generate_test_data(1000)
    feature_names = ["A", "B", "C", "D"]

    regressor = LGBMRegressor(random_state=0, n_jobs=4)
    importance = LOFOImportance(
        regressor, frame, feature_names, 'binary_target',
        cv=4, scoring='roc_auc',
    ).get_importance()

    row_count = importance.shape[0]
    assert len(feature_names) == row_count, "Missing importance value for some features!"

    top_feature = importance["feature"].values[0]
    assert top_feature == "B", "Most important feature is different than B!"
# Example #5
# 0
def test_multithreading():
    """Run LOFOImportance with ``n_jobs=3`` on a logistic regression model.

    Uses a shuffled 4-fold ``KFold`` splitter and ROC AUC scoring. Expects
    one importance row per feature, with "B" ranked first.
    """
    frame = generate_test_data(100000)
    feature_names = ["A", "B", "C", "D"]

    estimator = LogisticRegression(solver='liblinear')
    folds = KFold(n_splits=4, shuffle=True, random_state=0)

    importance = LOFOImportance(
        frame, feature_names, 'binary_target',
        model=estimator, cv=folds, scoring='roc_auc', n_jobs=3,
    ).get_importance()

    row_count = importance.shape[0]
    assert len(feature_names) == row_count, "Missing importance value for some features!"

    top_feature = importance["feature"].values[0]
    assert top_feature == "B", "Most important feature is different than B!"
def test_lofo_importance():
    """Run LOFOImportance on a ``Dataset`` with an LGBMClassifier and 4-fold CV.

    Also passes the result to ``plot_importance``. Expects one importance row
    per feature, with "B" ranked first.
    """
    frame = generate_test_data(1000)
    feature_names = ["A", "B", "C", "D"]
    data = Dataset(df=frame, target="binary_target", features=feature_names)

    classifier = LGBMClassifier(random_state=0, n_jobs=4)
    importance = LOFOImportance(
        data, model=classifier, cv=4, scoring='roc_auc'
    ).get_importance()

    # Called only to make sure plotting runs on the result.
    plot_importance(importance)

    row_count = importance.shape[0]
    assert len(feature_names) == row_count, \
        "Missing importance value for some features!"

    top_feature = importance["feature"].values[0]
    assert top_feature == "B", \
        "Most important feature is different than B!"