def test_model_n_trees_non_positive():
    """Setting ``n_trees`` to zero must be rejected by ``_set_n_trees``."""
    params = copy.deepcopy(toy_kwargs)
    params["n_trees"] = 0
    model = CascadeForestClassifier(**params)

    with pytest.raises(ValueError) as excinfo:
        model._set_n_trees(0)
    assert "should be strictly positive." in str(excinfo.value)
def test_model_n_trees_invalid():
    """A value that is neither a positive int nor ``"auto"`` is rejected."""
    params = copy.deepcopy(toy_kwargs)
    params["n_trees"] = [42]
    model = CascadeForestClassifier(**params)

    with pytest.raises(ValueError) as excinfo:
        model._set_n_trees(0)
    assert "Invalid value for n_trees." in str(excinfo.value)
def test_model_n_trees_auto():
    """With ``n_trees="auto"`` the tree count depends on the layer index.

    Expected schedule: layer 0 -> 100 trees, layer 2 -> 300, layer 10 -> 500.
    """
    params = copy.deepcopy(toy_kwargs)
    params["n_trees"] = "auto"
    model = CascadeForestClassifier(**params)

    for layer_idx, expected in ((0, 100), (2, 300), (10, 500)):
        assert model._set_n_trees(layer_idx) == expected
def test_model_invalid_training_params(param):
    """Fitting with an invalid training parameter raises a ValueError.

    ``param`` is a pair ``(case_id, kwargs_patch)``; the case id selects
    which parameter name must appear in the error message.
    """
    params = copy.deepcopy(toy_kwargs)
    params.update(param[1])
    model = CascadeForestClassifier(**params)

    with pytest.raises(ValueError) as excinfo:
        model.fit(X_train, y_train)

    # Map each case id onto the substring expected in the message; unknown
    # ids are a no-op, matching the original if/elif chain.
    expected = {
        0: "max_layers",
        1: "n_tolerant_rounds",
        2: "delta ",
    }.get(param[0])
    if expected is not None:
        assert expected in str(excinfo.value)
def test_custom_estimator_missing_fit():
    """Custom estimators and predictors without a ``fit`` method are rejected."""

    class _NoFit:
        def __init__(self):
            pass

    model = CascadeForestClassifier()

    # Rejected as a base estimator...
    with pytest.raises(AttributeError) as excinfo:
        model.set_estimator([_NoFit()])
    assert "The `fit` method of estimator" in str(excinfo.value)

    # ...and as a predictor.
    with pytest.raises(AttributeError) as excinfo:
        model.set_predictor(_NoFit())
    assert "The `fit` method of the predictor" in str(excinfo.value)
def test_model_workflow_in_memory():
    """Run the workflow of deep forest with in-memory mode."""
    params = copy.deepcopy(kwargs)
    params["partial_mode"] = False
    model = CascadeForestClassifier(**params)
    model.fit(X_train, y_train)

    # Capture predictions prior to serialization.
    expected = model.predict(X_test)

    # Round-trip the model through save/load.
    model.save(save_dir)
    model = CascadeForestClassifier(**params)
    model.load(save_dir)

    # The reloaded model must predict identically.
    assert_array_equal(expected, model.predict(X_test))

    shutil.rmtree(save_dir)
def test_model_sample_weight():
    """Run the workflow of deep forest with and without sample weights.

    Equal weights must reproduce the unweighted predictions; skewed weights
    must change them.
    """
    case_kwargs = copy.deepcopy(kwargs)

    # Training without sample_weight
    model = CascadeForestClassifier(**case_kwargs)
    model.fit(X_train, y_train)
    y_pred_no_sample_weight = model.predict(X_test)

    # Training with equal sample_weight
    model = CascadeForestClassifier(**case_kwargs)
    sample_weight = np.ones(y_train.size)
    model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred_equal_sample_weight = model.predict(X_test)

    # Make sure the same predictions with None and equal sample_weight
    assert_array_equal(y_pred_no_sample_weight, y_pred_equal_sample_weight)

    # Training with skewed sample_weight
    model = CascadeForestClassifier(**case_kwargs)
    sample_weight = np.where(y_train == 0, 0.1, y_train)
    # BUG FIX: the original passed ``sample_weight=y_train``, so the skewed
    # weights computed above were never used.
    model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred_skewed_sample_weight = model.predict(X_test)

    # Make sure the predictions differ between skewed and equal weights
    assert_raises(
        AssertionError,
        assert_array_equal,
        y_pred_skewed_sample_weight,
        y_pred_equal_sample_weight,
    )

    model.clean()  # clear the buffer
def test_model_workflow_partial_mode():
    """Run the workflow of deep forest with a local buffer."""
    params = copy.deepcopy(kwargs)
    params["partial_mode"] = True
    model = CascadeForestClassifier(**params)
    model.fit(X_train, y_train)

    # Capture predictions prior to serialization.
    expected = model.predict(X_test)

    # Round-trip the model through save/load.
    model.save(save_dir)
    model = CascadeForestClassifier(**params)
    model.load(save_dir)

    # The reloaded model must predict identically.
    assert_array_equal(expected, model.predict(X_test))

    model.clean()  # clear the buffer
    shutil.rmtree(save_dir)
# NOTE(review): a second, longer definition of this test appears later in the
# file and shadows this one at import time.
def test_model_properties_after_fitting():
    """Check the model properties after fitting a deep forest model."""
    model = CascadeForestClassifier(**toy_kwargs)
    model.fit(X_train, y_train)

    # Container protocol mirrors the internal layer bookkeeping.
    assert len(model) == model.n_layers_
    assert model[0] is model._get_layer(0)

    # Out-of-range layer access is rejected.
    with pytest.raises(ValueError) as excinfo:
        model._get_layer(model.n_layers_)
    assert "The layer index should be in the range" in str(excinfo.value)

    # Overwriting an existing layer slot is rejected.
    with pytest.raises(RuntimeError) as excinfo:
        model._set_layer(0, None)
    assert "already exists in the internal container" in str(excinfo.value)

    # Out-of-range binner access is rejected.
    with pytest.raises(ValueError) as excinfo:
        model._get_binner(model.n_layers_ + 1)
    assert "The binner index should be in the range" in str(excinfo.value)

    # Overwriting an existing binner slot is rejected.
    with pytest.raises(RuntimeError) as excinfo:
        model._set_binner(0, None)
    assert "already exists in the internal container" in str(excinfo.value)
def test_model_input_label_encoder():
    """Test if the model behaves the same with and without label encoding."""
    X, y = load_digits(return_X_y=True)
    y_str = np.char.add("label_", y.astype(str))

    # Fit on integer labels: 0, 1, 2, ...
    model = CascadeForestClassifier(**toy_kwargs)
    model.fit(X, y)
    pred_int = model.predict(X)

    # Fit on string labels: "label_0", "label_1", ...
    model = CascadeForestClassifier(**toy_kwargs)
    model.fit(X, y_str)
    pred_str = model.predict(X)

    # Both runs must agree once integer predictions are re-encoded as strings.
    pred_int_as_str = np.char.add("label_", pred_int.astype(str))
    assert_array_equal(pred_str, pred_int_as_str)

    # Clean up buffer
    model.clean()
def test_model_properties_after_fitting():
    """Check the model properties after fitting a deep forest model."""
    model = CascadeForestClassifier(**toy_kwargs)
    model.fit(X_train, y_train)

    # Container protocol mirrors the internal layer bookkeeping.
    assert len(model) == model.n_layers_
    assert model[0] is model._get_layer(0)

    # Out-of-range layer access is rejected.
    with pytest.raises(ValueError) as excinfo:
        model._get_layer(model.n_layers_)
    assert "The layer index should be in the range" in str(excinfo.value)

    # Overwriting an existing layer slot is rejected.
    with pytest.raises(RuntimeError) as excinfo:
        model._set_layer(0, None)
    assert "already exists in the internal container" in str(excinfo.value)

    # Out-of-range binner access is rejected.
    with pytest.raises(ValueError) as excinfo:
        model._get_binner(model.n_layers_ + 1)
    assert "The binner index should be in the range" in str(excinfo.value)

    # Overwriting an existing binner slot is rejected.
    with pytest.raises(RuntimeError) as excinfo:
        model._set_binner(0, None)
    assert "already exists in the internal container" in str(excinfo.value)

    # Test the hook on forest estimator: it returns the underlying forest.
    fetched = model.get_forest(0, 0, "rf")
    assert fetched is model._get_layer(0).estimators_["0-0-rf"].estimator_

    # Each index argument of get_forest is validated.
    with pytest.raises(ValueError) as excinfo:
        model.get_forest(model.n_layers_, 0, "rf")
    assert "`layer_idx` should be in the range" in str(excinfo.value)

    with pytest.raises(ValueError) as excinfo:
        model.get_forest(0, model.n_estimators, "rf")
    assert "`est_idx` should be in the range" in str(excinfo.value)

    with pytest.raises(ValueError) as excinfo:
        model.get_forest(0, 0, "Unknown")
    assert "`forest_type` should be one of" in str(excinfo.value)
x, y, random_state=1, train_size=0.99, test_size=0.01)

# Split the validation set into data and labels.
valx, valy = np.split(valdata, indices_or_sections=(10, ), axis=1)  # valx: data, valy: labels
valx = valx[:, 0:10]  # keep the first n (here 10) bands as features

# Cast everything to uint8 before training.
# NOTE(review): assumes all band values and labels fit in 0-255 — confirm.
train_data = train_data.astype(np.uint8)
test_data = test_data.astype(np.uint8)
train_label = train_label.astype(np.uint8)
test_label = test_label.astype(np.uint8)
# Validation set
valx = valx.astype(np.uint8)
valy = valy.astype(np.uint8)

# Fit a deep forest classifier on the training split.
classification = CascadeForestClassifier(max_depth=5, n_trees=100, n_jobs=6, use_predictor="forest")
classification.fit(train_data, train_label.ravel())

# Class probabilities on the validation set; column 1 is the positive class.
prediction = classification.predict_proba(valx)
probability = [prob[1] for prob in prediction]
# Threshold the positive-class probability at 0.5 to get hard labels.
pre_class = []
for i in probability:
    if i > 0.5:
        pre_class.append(1)
    else:
        pre_class.append(0)
prediction = prediction[:, 1]
pred = classification.predict(valx)
def test_custom_base_estimator_wrong_estimator_type():
    """Passing a non-list to ``set_estimator`` raises a ValueError."""
    model = CascadeForestClassifier()

    with pytest.raises(ValueError) as excinfo:
        model.set_estimator(42)
    assert "estimators should be a list" in str(excinfo.value)
def test_custom_cascade_layer_workflow_partial_mode():
    """Custom estimators/predictor survive save/load in partial mode."""
    model = CascadeForestClassifier(partial_mode=True)
    # set custom base estimators and a custom predictor
    model.set_estimator([DecisionTreeClassifier() for _ in range(4)])
    model.set_predictor(DecisionTreeClassifier())

    model.fit(X_train, y_train)
    expected = model.predict(X_test)

    # Round-trip the model through save/load.
    model.save(save_dir)
    model = CascadeForestClassifier()
    model.load(save_dir)

    # The reloaded model must predict identically.
    assert_array_equal(expected, model.predict(X_test))

    model.clean()  # clear the buffer
    shutil.rmtree(save_dir)
# Per-class sample counts before oversampling.
df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
print('\n', df_sum_y)

# Apply SMOTE to generate synthetic (fake) minority-class samples.
sm = SMOTE(k_neighbors=args.kneighbors, random_state=args.randomseed, n_jobs=-1)
# NOTE(review): `fit_sample` was renamed `fit_resample` in imbalanced-learn
# 0.4 and removed in 0.8 — confirm the pinned imblearn version supports it.
x_resampled, y_resampled = sm.fit_sample(X, y)

# Read the per-class sample counts after oversampling and report them.
np_resampled_y = np.asarray(np.unique(y_resampled, return_counts=True))
df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
print("\nNumber of samples after over sampleing:\n{0}\n".format(
    df_resampled_y))

# Initialize the classifier.
clf = CascadeForestClassifier(random_state=args.randomseed)
print("\nClassifier parameters:")
print(clf.get_params())
print("\nSMOTE parameters:")
print(sm.get_params())
print("\n")

# Train on the oversampled data.
clf.fit(x_resampled, y_resampled)

# Predict the held-out test set.
y_pred = clf.predict(X_test)

# Report test-set statistics (multi-class branch shown; the else-branch
# continues past this excerpt).
if (num_categories > 2):
    model_evaluation(num_categories, y_test, y_pred)
else:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from deepforest import CascadeForestClassifier
from sklearn.model_selection import train_test_split

# Load the iris dataset and hold out 20% of the samples for evaluation.
features, labels = load_iris(return_X_y=True)
features = np.array(features)
labels = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2
)

# Fit a deep forest with default settings and report test accuracy (%).
model = CascadeForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy: {:.3f} %".format(acc))
# Per-class sample counts before oversampling.
df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
print('\n', df_sum_y)

# Apply SMOTE to generate synthetic (fake) minority-class samples.
sm = SMOTE(k_neighbors=args.kneighbors, random_state=args.randomseed, n_jobs=-1)
# NOTE(review): `fit_sample` was renamed `fit_resample` in imbalanced-learn
# 0.4 and removed in 0.8 — confirm the pinned imblearn version supports it.
x_resampled, y_resampled = sm.fit_sample(X, y)

# Read the per-class sample counts after oversampling and report them.
np_resampled_y = np.asarray(np.unique(y_resampled, return_counts=True))
df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
print("\nNumber of samples after over sampleing:\n{0}\n".format(
    df_resampled_y))

# Initialize the classifier.
clf = CascadeForestClassifier(random_state=args.randomseed)
print("\nClassifier parameters:")
print(clf.get_params())
print("\nSMOTE parameters:")
print(sm.get_params())
print("\n")

# Cross-validated predictions over the resampled data.
y_pred_resampled = cross_val_predict(clf,
                                     x_resampled,
                                     y_resampled,
                                     cv=args.kfolds,
                                     n_jobs=-1,
                                     verbose=0)
# Drop the synthetic samples by index: the first X.shape[0] rows are the
# original data (SMOTE appends generated samples after them).
y_pred = y_pred_resampled[0:X.shape[0]]
records = []  # per-trial results (appended outside this excerpt)

# Run one trial per random seed on the current dataset.
for idx, random_state in enumerate(random_states):
    msg = "Currently processing {} with trial {}..."
    print(msg.format(dataset, idx))

    # Build a fresh cascade forest for every trial; all hyper-parameters
    # come from the surrounding configuration, only the seed varies.
    model = CascadeForestClassifier(
        n_bins=n_bins,
        bin_subsample=bin_subsample,
        max_layers=max_layers,
        criterion=criterion,
        n_estimators=n_estimators,
        n_trees=n_trees,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        use_predictor=use_predictor,
        predictor=predictor,
        n_tolerant_rounds=n_tolerant_rounds,
        partial_mode=partial_mode,
        delta=delta,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
    )

    # Time the fitting stage.
    tic = time.time()
    model.fit(X_train, y_train)
    toc = time.time()
    training_time = toc - tic

    # Start timing the next stage (evaluation continues past this excerpt).
    tic = time.time()
# feature_names X, y = shap.datasets.iris() # print("X", X.head()) # print("y", y) feature_names = X.columns X = np.array(X) y = np.array(y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) model = CascadeForestClassifier(backend="sklearn") model.fit(X_train, y_train) forest = model.get_forest(0, 0, "rf") explainer = shap.TreeExplainer(forest) shap_values = explainer.shap_values(X_test) # shap_values = np.array(explainer.shap_values(X_train)) # print(shap_values.shape) # print(shap_values[2].shape) # shap.summary_plot(shap_values[2], X_train) # # # clf = RandomForestClassifier(n_estimators=100) # clf.fit(X_train, y_train) # explainer = shap.TreeExplainer(clf)
def test_classifier_custom_cascade_layer_workflow_in_memory():
    """Custom estimators/predictor survive save/load in in-memory mode."""
    model = CascadeForestClassifier()
    # set custom base estimators and a custom predictor
    model.set_estimator([DecisionTreeClassifier() for _ in range(4)])
    model.set_predictor(DecisionTreeClassifier())

    model.fit(X_train_clf, y_train_clf)
    expected = model.predict(X_test_clf)

    # Round-trip the model through save/load.
    model.save(save_dir)
    model = CascadeForestClassifier()
    model.load(save_dir)

    # The reloaded model must predict identically.
    assert_array_equal(expected, model.predict(X_test_clf))

    # The custom-estimator hook returns the underlying estimator object.
    assert (
        model.get_estimator(0, 0, "custom")
        is model._get_layer(0).estimators_["0-0-custom"].estimator_
    )

    model.clean()  # clear the buffer
    shutil.rmtree(save_dir)
def test_model_workflow_in_memory(backend):
    """Run the workflow of deep forest with in-memory mode."""
    params = copy.deepcopy(kwargs)
    params.update({"partial_mode": False, "backend": backend})
    model = CascadeForestClassifier(**params)
    model.fit(X_train, y_train)

    # feature importances are only supported by the sklearn backend.
    if backend == "sklearn":
        model.get_layer_feature_importances(0)
    else:
        with pytest.raises(RuntimeError) as excinfo:
            model.get_layer_feature_importances(0)
        assert "Please use the sklearn backend" in str(excinfo.value)

    # Capture predictions prior to serialization.
    expected = model.predict(X_test)

    # Round-trip the model through save/load.
    model.save(save_dir)
    model = CascadeForestClassifier(**params)
    model.load(save_dir)

    # The reloaded model must predict identically.
    assert_array_equal(expected, model.predict(X_test))

    shutil.rmtree(save_dir)
import shap
import numpy as np
from deepforest import CascadeForestClassifier
from sklearn.model_selection import train_test_split

# load JS visualization code to notebook
shap.initjs()

# Load the iris data and hold out 20% of the samples.
X, y = shap.datasets.iris()
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a deep forest with default settings.
model = CascadeForestClassifier()
model.fit(X_train, y_train)

# NOTE(review): TreeExplainer expects a supported tree model; passing the
# whole cascade classifier may be rejected — consider explaining a single
# forest via model.get_forest(...) with backend="sklearn" instead. TODO confirm.
explainer = shap.TreeExplainer(model)