Esempio n. 1
0
def test_model_n_trees_non_positive():
    """A model built with `n_trees=0` must make `_set_n_trees` raise ValueError."""
    params = copy.deepcopy(toy_kwargs)
    params["n_trees"] = 0
    clf = CascadeForestClassifier(**params)

    with pytest.raises(ValueError) as err:
        clf._set_n_trees(0)

    reported = str(err.value)
    assert "should be strictly positive." in reported
Esempio n. 2
0
def test_model_n_trees_invalid():
    """A list-valued `n_trees` must make `_set_n_trees` raise ValueError."""
    params = copy.deepcopy(toy_kwargs)
    params["n_trees"] = [42]
    clf = CascadeForestClassifier(**params)

    with pytest.raises(ValueError) as err:
        clf._set_n_trees(0)

    reported = str(err.value)
    assert "Invalid value for n_trees." in reported
Esempio n. 3
0
def test_model_n_trees_auto():
    """With `n_trees="auto"`, check what `_set_n_trees` returns for several arguments."""
    params = copy.deepcopy(toy_kwargs)
    params["n_trees"] = "auto"
    clf = CascadeForestClassifier(**params)

    # Each pair is (argument passed to `_set_n_trees`, expected return value).
    expectations = ((0, 100), (2, 300), (10, 500))
    for argument, expected in expectations:
        assert clf._set_n_trees(argument) == expected
Esempio n. 4
0
def test_model_invalid_training_params(param):
    """Fitting with an invalid training parameter must raise ValueError.

    `param` is indexable: `param[1]` holds the keyword overrides applied on
    top of `toy_kwargs`, and `param[0]` is the case id that selects which
    text is expected in the error message.
    """
    params = copy.deepcopy(toy_kwargs)
    params.update(param[1])
    clf = CascadeForestClassifier(**params)

    with pytest.raises(ValueError) as err:
        clf.fit(X_train, y_train)

    # Case id -> fragment expected in the error message; unknown ids are
    # not checked any further.
    expected_by_case = (
        (0, "max_layers"),
        (1, "n_tolerant_rounds"),
        (2, "delta "),
    )
    case_id = param[0]
    for candidate, fragment in expected_by_case:
        if case_id == candidate:
            assert fragment in str(err.value)
            break
Esempio n. 5
0
def test_custom_estimator_missing_fit():
    """Estimators and predictors without a `fit` method must be rejected."""

    class tmp_estimator:
        """Stand-in object that deliberately defines no `fit` method."""

    clf = CascadeForestClassifier()

    # Each pair is (call that registers the stand-in, expected message fragment).
    registrations = (
        (lambda: clf.set_estimator([tmp_estimator()]),
         "The `fit` method of estimator"),
        (lambda: clf.set_predictor(tmp_estimator()),
         "The `fit` method of the predictor"),
    )
    for register, fragment in registrations:
        with pytest.raises(AttributeError) as err:
            register()
        assert fragment in str(err.value)
Esempio n. 6
0
def test_model_workflow_in_memory():
    """Fit with `partial_mode=False`, then check predictions survive save/load."""
    params = copy.deepcopy(kwargs)
    params["partial_mode"] = False

    clf = CascadeForestClassifier(**params)
    clf.fit(X_train, y_train)
    preds_before = clf.predict(X_test)

    # Round-trip the fitted model through disk into a fresh instance.
    clf.save(save_dir)
    clf = CascadeForestClassifier(**params)
    clf.load(save_dir)
    preds_after = clf.predict(X_test)

    # Serialization must not change what the model predicts.
    assert_array_equal(preds_before, preds_after)

    shutil.rmtree(save_dir)
Esempio n. 7
0
def test_model_sample_weight():
    """Check that `sample_weight` passed to `fit` is taken into account.

    Uniform weights must reproduce the predictions of an unweighted fit,
    while skewed weights are expected to change them.
    """

    case_kwargs = copy.deepcopy(kwargs)

    # Training without sample_weight
    model = CascadeForestClassifier(**case_kwargs)
    model.fit(X_train, y_train)
    y_pred_no_sample_weight = model.predict(X_test)

    # Training with equal sample_weight
    model = CascadeForestClassifier(**case_kwargs)
    sample_weight = np.ones(y_train.size)
    model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred_equal_sample_weight = model.predict(X_test)

    # Make sure the same predictions with None and equal sample_weight
    assert_array_equal(y_pred_no_sample_weight, y_pred_equal_sample_weight)

    # Training with skewed sample_weight: each sample is weighted by its
    # label value, except label 0 which gets 0.1 so it is not zeroed out.
    # The weights built here are the ones handed to `fit`; previously the
    # raw labels were passed and this array was never used.
    model = CascadeForestClassifier(**case_kwargs)
    sample_weight = np.where(y_train == 0, 0.1, y_train)
    model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred_skewed_sample_weight = model.predict(X_test)

    # Make sure the predictions differ between skewed and equal sample_weight
    assert_raises(AssertionError, assert_array_equal,
                  y_pred_skewed_sample_weight, y_pred_equal_sample_weight)

    model.clean()  # clear the buffer
Esempio n. 8
0
def test_model_workflow_partial_mode():
    """Fit with `partial_mode=True`, then check predictions survive save/load."""
    params = copy.deepcopy(kwargs)
    params["partial_mode"] = True

    clf = CascadeForestClassifier(**params)
    clf.fit(X_train, y_train)
    preds_before = clf.predict(X_test)

    # Round-trip the fitted model through disk into a fresh instance.
    clf.save(save_dir)
    clf = CascadeForestClassifier(**params)
    clf.load(save_dir)
    preds_after = clf.predict(X_test)

    # Serialization must not change what the model predicts.
    assert_array_equal(preds_before, preds_after)

    # Tidy up: clear the model's buffer, then delete the saved model.
    clf.clean()
    shutil.rmtree(save_dir)
Esempio n. 9
0
def test_model_properties_after_fitting():
    """Exercise the container-style accessors of a fitted model and their errors."""
    clf = CascadeForestClassifier(**toy_kwargs)
    clf.fit(X_train, y_train)

    # `len()` and indexing must agree with the layer accessors.
    assert len(clf) == clf.n_layers_
    assert clf[0] is clf._get_layer(0)

    # Each entry: (expected exception, offending call, message fragment).
    failing_calls = (
        (ValueError,
         lambda: clf._get_layer(clf.n_layers_),
         "The layer index should be in the range"),
        (RuntimeError,
         lambda: clf._set_layer(0, None),
         "already exists in the internal container"),
        (ValueError,
         lambda: clf._get_binner(clf.n_layers_ + 1),
         "The binner index should be in the range"),
        (RuntimeError,
         lambda: clf._set_binner(0, None),
         "already exists in the internal container"),
    )
    for exc_type, offending_call, fragment in failing_calls:
        with pytest.raises(exc_type) as err:
            offending_call()
        assert fragment in str(err.value)
Esempio n. 10
0
def test_model_input_label_encoder():
    """Integer labels and their string-prefixed twins must give matching predictions."""

    def as_str_labels(labels):
        # Turn each label into the string "label_<value>".
        return np.char.add("label_", labels.astype(str))

    # Digits data with the original integer targets and a string-typed copy.
    samples, int_targets = load_digits(return_X_y=True)
    str_targets = as_str_labels(int_targets)

    # Fit and predict on the integer targets.
    clf = CascadeForestClassifier(**toy_kwargs)
    clf.fit(samples, int_targets)
    int_preds = clf.predict(samples)

    # Fit and predict on the string targets.
    clf = CascadeForestClassifier(**toy_kwargs)
    clf.fit(samples, str_targets)
    str_preds = clf.predict(samples)

    # After prefixing, the integer predictions must equal the string ones.
    assert_array_equal(str_preds, as_str_labels(int_preds))

    # Clear the buffer of the last model.
    clf.clean()
Esempio n. 11
0
def test_model_properties_after_fitting():
    """Exercise layer, binner and forest accessors of a fitted model and their errors."""
    clf = CascadeForestClassifier(**toy_kwargs)
    clf.fit(X_train, y_train)

    # `len()` and indexing must agree with the layer accessors.
    assert len(clf) == clf.n_layers_
    assert clf[0] is clf._get_layer(0)

    def expect_failures(cases):
        # Each case: (expected exception, offending call, message fragment).
        for exc_type, offending_call, fragment in cases:
            with pytest.raises(exc_type) as err:
                offending_call()
            assert fragment in str(err.value)

    expect_failures((
        (ValueError,
         lambda: clf._get_layer(clf.n_layers_),
         "The layer index should be in the range"),
        (RuntimeError,
         lambda: clf._set_layer(0, None),
         "already exists in the internal container"),
        (ValueError,
         lambda: clf._get_binner(clf.n_layers_ + 1),
         "The binner index should be in the range"),
        (RuntimeError,
         lambda: clf._set_binner(0, None),
         "already exists in the internal container"),
    ))

    # `get_forest` must hand back the very object stored in the layer.
    stored_forest = clf._get_layer(0).estimators_["0-0-rf"].estimator_
    assert clf.get_forest(0, 0, "rf") is stored_forest

    expect_failures((
        (ValueError,
         lambda: clf.get_forest(clf.n_layers_, 0, "rf"),
         "`layer_idx` should be in the range"),
        (ValueError,
         lambda: clf.get_forest(0, clf.n_estimators, "rf"),
         "`est_idx` should be in the range"),
        (ValueError,
         lambda: clf.get_forest(0, 0, "Unknown"),
         "`forest_type` should be one of"),
    ))
Esempio n. 12
0
    x, y, random_state=1, train_size=0.99, test_size=0.01)
# Split the validation set into data and labels
valx, valy = np.split(valdata, indices_or_sections=(10, ),
                      axis=1)  # valx is the data, valy the labels
valx = valx[:, 0:10]  # keep the first n bands as features

# Cast the train/test arrays to unsigned 8-bit integers.
train_data = train_data.astype(np.uint8)
test_data = test_data.astype(np.uint8)
train_label = train_label.astype(np.uint8)
test_label = test_label.astype(np.uint8)
# Validation set
valx = valx.astype(np.uint8)
valy = valy.astype(np.uint8)

# NOTE(review): `use_predictor` is given the string "forest" here; the
# deep-forest docs appear to describe `use_predictor` as a boolean and
# `predictor` as the string option -- confirm which one was intended.
classification = CascadeForestClassifier(max_depth=5,
                                         n_trees=100,
                                         n_jobs=6,
                                         use_predictor="forest")

# Fit on the training split; labels are flattened with ravel() first.
classification.fit(train_data, train_label.ravel())
prediction = classification.predict_proba(valx)
# Second entry of every probability row (presumably the positive class --
# TODO confirm the class ordering).
probability = [prob[1] for prob in prediction]
# Threshold those probabilities at 0.5 to get 0/1 class labels.
pre_class = []
for i in probability:
    if i > 0.5:
        pre_class.append(1)
    else:
        pre_class.append(0)
# From here on `prediction` holds only column 1 of the probability matrix.
prediction = prediction[:, 1]

# Class labels as predicted directly by the model.
pred = classification.predict(valx)
Esempio n. 13
0
def test_custom_base_estimator_wrong_estimator_type():
    """Passing a non-list (here an int) to `set_estimator` must raise ValueError."""
    clf = CascadeForestClassifier()

    with pytest.raises(ValueError) as err:
        clf.set_estimator(42)

    reported = str(err.value)
    assert "estimators should be a list" in reported
Esempio n. 14
0
def test_custom_cascade_layer_workflow_partial_mode():
    """Check save/load with user-supplied estimators under `partial_mode=True`."""
    clf = CascadeForestClassifier(partial_mode=True)

    # Register four decision trees as the custom base estimators.
    clf.set_estimator([DecisionTreeClassifier() for _ in range(4)])

    # Register a decision tree as the custom predictor.
    clf.set_predictor(DecisionTreeClassifier())

    clf.fit(X_train, y_train)
    preds_before = clf.predict(X_test)

    # Round-trip the fitted model through disk into a default-constructed one.
    clf.save(save_dir)
    clf = CascadeForestClassifier()
    clf.load(save_dir)
    preds_after = clf.predict(X_test)

    # Serialization must not change what the model predicts.
    assert_array_equal(preds_before, preds_after)

    # Tidy up: clear the model's buffer, then delete the saved model.
    clf.clean()
    shutil.rmtree(save_dir)
    # Tabulate `sum_y` as Class/Sum columns and print it.
    df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
    print('\n', df_sum_y)

    # Apply SMOTE to generate synthetic ("fake") samples
    sm = SMOTE(k_neighbors=args.kneighbors,
               random_state=args.randomseed,
               n_jobs=-1)
    # NOTE(review): newer imbalanced-learn releases expose `fit_resample`
    # in place of `fit_sample` -- confirm against the installed version.
    x_resampled, y_resampled = sm.fit_sample(X, y)
    # After oversampling: collect the class labels and their counts
    np_resampled_y = np.asarray(np.unique(y_resampled, return_counts=True))
    df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
    print("\nNumber of samples after over sampleing:\n{0}\n".format(
        df_resampled_y))

    # Initialise the classifier
    clf = CascadeForestClassifier(random_state=args.randomseed)
    print("\nClassifier parameters:")
    print(clf.get_params())
    print("\nSMOTE parameters:")
    print(sm.get_params())
    print("\n")

    # Train on the SMOTE-resampled data
    clf.fit(x_resampled, y_resampled)
    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Report evaluation statistics for the test set
    if (num_categories > 2):
        model_evaluation(num_categories, y_test, y_pred)
    else:
Esempio n. 16
0
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from deepforest import CascadeForestClassifier
from sklearn.model_selection import train_test_split

# Load iris and make sure both features and targets are NumPy arrays.
X, y = (np.array(part) for part in load_iris(return_X_y=True))

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a cascade forest with its default settings.
model = CascadeForestClassifier()
model.fit(X_train, y_train)

# Score the held-out split and report the accuracy as a percentage.
y_pred = model.predict(X_test)
acc = 100 * accuracy_score(y_test, y_pred)
print("Accuracy: {:.3f} %".format(acc))
Esempio n. 17
0
    # Tabulate `sum_y` as Class/Sum columns and print it.
    df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
    print('\n', df_sum_y)

    # Apply SMOTE to generate synthetic ("fake") samples
    sm = SMOTE(k_neighbors=args.kneighbors,
               random_state=args.randomseed,
               n_jobs=-1)
    # NOTE(review): newer imbalanced-learn releases expose `fit_resample`
    # in place of `fit_sample` -- confirm against the installed version.
    x_resampled, y_resampled = sm.fit_sample(X, y)
    # After oversampling: collect the class labels and their counts
    np_resampled_y = np.asarray(np.unique(y_resampled, return_counts=True))
    df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
    print("\nNumber of samples after over sampleing:\n{0}\n".format(
        df_resampled_y))

    # Initialise the classifier
    clf = CascadeForestClassifier(random_state=args.randomseed)
    print("\nClassifier parameters:")
    print(clf.get_params())
    print("\nSMOTE parameters:")
    print(sm.get_params())
    print("\n")
    # Cross-validation on the resampled data
    y_pred_resampled = cross_val_predict(clf,
                                         x_resampled,
                                         y_resampled,
                                         cv=args.kfolds,
                                         n_jobs=-1,
                                         verbose=0)
    # Drop the synthetic samples by index: keep only the first X.shape[0]
    # predictions (presumably the resampler keeps the original rows first --
    # TODO confirm).
    y_pred = y_pred_resampled[0:X.shape[0]]
        records = []

        # One trial per entry of `random_states`; `idx` is the trial number.
        for idx, random_state in enumerate(random_states):

            msg = "Currently processing {} with trial {}..."
            print(msg.format(dataset, idx))

            # Build a fresh model for this trial; only `random_state`
            # comes from the loop, the other settings are taken from the
            # enclosing scope.
            model = CascadeForestClassifier(
                n_bins=n_bins,
                bin_subsample=bin_subsample,
                max_layers=max_layers,
                criterion=criterion,
                n_estimators=n_estimators,
                n_trees=n_trees,
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                use_predictor=use_predictor,
                predictor=predictor,
                n_tolerant_rounds=n_tolerant_rounds,
                partial_mode=partial_mode,
                delta=delta,
                n_jobs=n_jobs,
                random_state=random_state,
                verbose=verbose,
            )

            # Wall-clock time spent in `fit`.
            tic = time.time()
            model.fit(X_train, y_train)
            toc = time.time()
            training_time = toc - tic

            # Restart the timer for the next measured step.
            tic = time.time()
Esempio n. 19
0
# feature_names


# Load the iris data bundled with shap.
X, y = shap.datasets.iris()

# print("X", X.head())
# print("y", y)

# Remember the column names before X is converted to a plain array.
feature_names = X.columns
X = np.array(X)
y = np.array(y)

# Hold out 20% of the samples for the SHAP computation below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit with the sklearn backend, then take one forest out of the cascade.
model = CascadeForestClassifier(backend="sklearn")
model.fit(X_train, y_train)
# Arguments are the layer index, the estimator index and the forest type.
forest = model.get_forest(0, 0, "rf")
# Explain that single forest (not the whole cascade) on the held-out split.
explainer = shap.TreeExplainer(forest)
shap_values = explainer.shap_values(X_test)

# shap_values = np.array(explainer.shap_values(X_train))
# print(shap_values.shape)
# print(shap_values[2].shape)

# shap.summary_plot(shap_values[2], X_train)
#
#
# clf = RandomForestClassifier(n_estimators=100)
# clf.fit(X_train, y_train)
# explainer = shap.TreeExplainer(clf)
def test_classifier_custom_cascade_layer_workflow_in_memory():
    """Check save/load and `get_estimator` with user-supplied estimators."""
    clf = CascadeForestClassifier()

    # Register four decision trees as the custom base estimators.
    clf.set_estimator([DecisionTreeClassifier() for _ in range(4)])

    # Register a decision tree as the custom predictor.
    clf.set_predictor(DecisionTreeClassifier())

    clf.fit(X_train_clf, y_train_clf)
    preds_before = clf.predict(X_test_clf)

    # Round-trip the fitted model through disk into a default-constructed one.
    clf.save(save_dir)
    clf = CascadeForestClassifier()
    clf.load(save_dir)
    preds_after = clf.predict(X_test_clf)

    # Serialization must not change what the model predicts.
    assert_array_equal(preds_before, preds_after)

    # `get_estimator` must hand back the very object stored in the layer.
    stored_estimator = clf._get_layer(0).estimators_["0-0-custom"].estimator_
    assert clf.get_estimator(0, 0, "custom") is stored_estimator

    # Tidy up: clear the model's buffer, then delete the saved model.
    clf.clean()
    shutil.rmtree(save_dir)
Esempio n. 21
0
def test_model_workflow_in_memory(backend):
    """Fit in memory with the given `backend`, then check feature importances and save/load."""
    params = copy.deepcopy(kwargs)
    params["partial_mode"] = False
    params["backend"] = backend

    clf = CascadeForestClassifier(**params)
    clf.fit(X_train, y_train)

    # Layer feature importances are only available with the sklearn backend;
    # any other backend must raise a RuntimeError.
    if backend != "sklearn":
        with pytest.raises(RuntimeError) as err:
            clf.get_layer_feature_importances(0)
        assert "Please use the sklearn backend" in str(err.value)
    else:
        clf.get_layer_feature_importances(0)

    preds_before = clf.predict(X_test)

    # Round-trip the fitted model through disk into a fresh instance.
    clf.save(save_dir)
    clf = CascadeForestClassifier(**params)
    clf.load(save_dir)
    preds_after = clf.predict(X_test)

    # Serialization must not change what the model predicts.
    assert_array_equal(preds_before, preds_after)

    shutil.rmtree(save_dir)
Esempio n. 22
0
import shap
import numpy as np
from deepforest import CascadeForestClassifier
from sklearn.model_selection import train_test_split

# load JS visualization code to notebook
shap.initjs()

# Load the iris data bundled with shap and convert it to plain arrays.
X, y = shap.datasets.iris()

X = np.array(X)
y = np.array(y)

# Hold out 20% of the samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a cascade forest with its default settings.
model = CascadeForestClassifier()

model.fit(X_train, y_train)

# NOTE(review): the whole cascade model is handed to TreeExplainer here.
# TreeExplainer may not accept this model type -- confirm, or explain a
# single forest obtained via `model.get_forest(...)` instead.
explainer = shap.TreeExplainer(model)