def test_model_input_label_encoder():
    """Test if the model behaves the same with and without label encoding."""
    # Load data
    X, y = load_digits(return_X_y=True)
    y_as_str = np.char.add("label_", y.astype(str))

    # Train model on integer labels. Labels look like: 0, 1, 2, ...
    model = CascadeForestClassifier(random_state=1)
    model.fit(X, y)
    y_pred_int_labels = model.predict(X)

    # Train model on string labels. Labels look like: "label_0", "label_1", ...
    model = CascadeForestClassifier(random_state=1)
    model.fit(X, y_as_str)
    y_pred_str_labels = model.predict(X)

    # Check that the underlying predictions are the same
    y_pred_int_labels_as_str = np.char.add(
        "label_", y_pred_int_labels.astype(str)
    )
    assert_array_equal(y_pred_str_labels, y_pred_int_labels_as_str)

    # Clean up the buffer
    model.clean()
def test_model_workflow_partial_mode():
    """Run the workflow of deep forest with a local buffer."""
    case_kwargs = copy.deepcopy(kwargs)
    case_kwargs.update({"partial_mode": True})

    model = CascadeForestClassifier(**case_kwargs)
    model.fit(X_train, y_train)

    # Predictions before saving
    y_pred_before = model.predict(X_test)

    # Save and Reload
    model.save(save_dir)

    model = CascadeForestClassifier(**case_kwargs)
    model.load(save_dir)

    # Predictions after loading
    y_pred_after = model.predict(X_test)

    # Make sure the same predictions before and after model serialization
    assert_array_equal(y_pred_before, y_pred_after)

    model.clean()  # clear the buffer
    shutil.rmtree(save_dir)
def test_model_workflow_in_memory(backend):
    """Run the workflow of deep forest with in-memory mode."""
    case_kwargs = copy.deepcopy(kwargs)
    case_kwargs.update({"partial_mode": False})
    case_kwargs.update({"backend": backend})

    model = CascadeForestClassifier(**case_kwargs)
    model.fit(X_train, y_train)

    # Test feature_importances_
    if backend == "sklearn":
        model.get_layer_feature_importances(0)
    else:
        with pytest.raises(RuntimeError) as excinfo:
            model.get_layer_feature_importances(0)
        assert "Please use the sklearn backend" in str(excinfo.value)

    # Predictions before saving
    y_pred_before = model.predict(X_test)

    # Save and Reload
    model.save(save_dir)

    model = CascadeForestClassifier(**case_kwargs)
    model.load(save_dir)

    # Make sure the same predictions before and after model serialization
    y_pred_after = model.predict(X_test)
    assert_array_equal(y_pred_before, y_pred_after)

    shutil.rmtree(save_dir)
def test_custom_cascade_layer_workflow_partial_mode():
    model = CascadeForestClassifier(partial_mode=True)

    n_estimators = 4
    estimators = [DecisionTreeClassifier() for _ in range(n_estimators)]
    model.set_estimator(estimators)  # set custom base estimators

    predictor = DecisionTreeClassifier()
    model.set_predictor(predictor)

    model.fit(X_train, y_train)
    y_pred_before = model.predict(X_test)

    # Save and Reload
    model.save(save_dir)

    model = CascadeForestClassifier()
    model.load(save_dir)

    # Predictions after loading
    y_pred_after = model.predict(X_test)

    # Make sure the same predictions before and after model serialization
    assert_array_equal(y_pred_before, y_pred_after)

    model.clean()  # clear the buffer
    shutil.rmtree(save_dir)
def test_classifier_custom_cascade_layer_workflow_in_memory():
    model = CascadeForestClassifier()

    n_estimators = 4
    estimators = [DecisionTreeClassifier() for _ in range(n_estimators)]
    model.set_estimator(estimators)  # set custom base estimators

    predictor = DecisionTreeClassifier()
    model.set_predictor(predictor)

    model.fit(X_train_clf, y_train_clf)
    y_pred_before = model.predict(X_test_clf)

    # Save and Reload
    model.save(save_dir)

    model = CascadeForestClassifier()
    model.load(save_dir)

    # Predictions after loading
    y_pred_after = model.predict(X_test_clf)

    # Make sure the same predictions before and after model serialization
    assert_array_equal(y_pred_before, y_pred_after)

    assert (
        model.get_estimator(0, 0, "custom")
        is model._get_layer(0).estimators_["0-0-custom"].estimator_
    )

    model.clean()  # clear the buffer
    shutil.rmtree(save_dir)
def test_model_invalid_training_params(param):
    case_kwargs = copy.deepcopy(toy_kwargs)
    case_kwargs.update(param[1])

    model = CascadeForestClassifier(**case_kwargs)

    with pytest.raises(ValueError) as excinfo:
        model.fit(X_train, y_train)

    if param[0] == 0:
        assert "max_layers" in str(excinfo.value)
    elif param[0] == 1:
        assert "n_tolerant_rounds" in str(excinfo.value)
    elif param[0] == 2:
        assert "delta " in str(excinfo.value)
def test_model_properties_after_fitting():
    """Check the model properties after fitting a deep forest model."""
    model = CascadeForestClassifier(**toy_kwargs)
    model.fit(X_train, y_train)

    assert len(model) == model.n_layers_

    assert model[0] is model._get_layer(0)

    with pytest.raises(ValueError) as excinfo:
        model._get_layer(model.n_layers_)
    assert "The layer index should be in the range" in str(excinfo.value)

    with pytest.raises(RuntimeError) as excinfo:
        model._set_layer(0, None)
    assert "already exists in the internal container" in str(excinfo.value)

    with pytest.raises(ValueError) as excinfo:
        model._get_binner(model.n_layers_ + 1)
    assert "The binner index should be in the range" in str(excinfo.value)

    with pytest.raises(RuntimeError) as excinfo:
        model._set_binner(0, None)
    assert "already exists in the internal container" in str(excinfo.value)

    # Test the hook on forest estimator
    assert (
        model.get_forest(0, 0, "rf")
        is model._get_layer(0).estimators_["0-0-rf"].estimator_
    )

    with pytest.raises(ValueError) as excinfo:
        model.get_forest(model.n_layers_, 0, "rf")
    assert "`layer_idx` should be in the range" in str(excinfo.value)

    with pytest.raises(ValueError) as excinfo:
        model.get_forest(0, model.n_estimators, "rf")
    assert "`est_idx` should be in the range" in str(excinfo.value)

    with pytest.raises(ValueError) as excinfo:
        model.get_forest(0, 0, "Unknown")
    assert "`forest_type` should be one of" in str(excinfo.value)
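# Hedged usage sketch (not part of the original tests; the helper name is
# hypothetical): the __len__ / __getitem__ hooks exercised above make it
# possible to iterate over the fitted cascade layers directly.
def _example_iterate_layers(model):
    for layer_idx in range(len(model)):
        layer = model[layer_idx]  # same object returned by model._get_layer(layer_idx)
        print(layer_idx, type(layer).__name__)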
def test_model_sample_weight():
    """Run the workflow of deep forest with different sample weights."""
    case_kwargs = copy.deepcopy(kwargs)

    # Training without sample_weight
    model = CascadeForestClassifier(**case_kwargs)
    model.fit(X_train, y_train)
    y_pred_no_sample_weight = model.predict(X_test)

    # Training with equal sample_weight
    model = CascadeForestClassifier(**case_kwargs)
    sample_weight = np.ones(y_train.size)
    model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred_equal_sample_weight = model.predict(X_test)

    # Make sure the predictions are the same with None and equal sample_weight
    assert_array_equal(y_pred_no_sample_weight, y_pred_equal_sample_weight)

    # Training with skewed sample_weight
    model = CascadeForestClassifier(**case_kwargs)
    sample_weight = np.where(y_train == 0, 0.1, y_train)
    model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred_skewed_sample_weight = model.predict(X_test)

    # Make sure the predictions differ between skewed and equal sample_weight
    assert_raises(
        AssertionError,
        assert_array_equal,
        y_pred_skewed_sample_weight,
        y_pred_equal_sample_weight,
    )

    model.clean()  # clear the buffer
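# Hedged sketch (not part of the original tests; the helper name is hypothetical):
# class-balanced weights could also be built with sklearn's compute_sample_weight,
# reusing the X_train / y_train / kwargs fixtures assumed above.
def _example_balanced_sample_weight():
    from sklearn.utils.class_weight import compute_sample_weight

    # Weight each sample inversely proportional to its class frequency
    sample_weight = compute_sample_weight(class_weight="balanced", y=y_train)

    model = CascadeForestClassifier(**copy.deepcopy(kwargs))
    model.fit(X_train, y_train, sample_weight=sample_weight)
    return model.predict(X_test)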
import numpy as np
import shap
from sklearn.model_selection import train_test_split
from deepforest import CascadeForestClassifier

# feature_names
X, y = shap.datasets.iris()
# print("X", X.head())
# print("y", y)
feature_names = X.columns
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = CascadeForestClassifier(backend="sklearn")
model.fit(X_train, y_train)

# Pull the first random forest from the first cascade layer and explain it
forest = model.get_forest(0, 0, "rf")
explainer = shap.TreeExplainer(forest)
shap_values = explainer.shap_values(X_test)

# shap_values = np.array(explainer.shap_values(X_train))
# print(shap_values.shape)
# print(shap_values[2].shape)
# shap.summary_plot(shap_values[2], X_train)

# Comparison against a plain random forest (left commented out):
# clf = RandomForestClassifier(n_estimators=100)
# clf.fit(X_train, y_train)
# explainer = shap.TreeExplainer(clf)
# shap_values = np.array(explainer.shap_values(X_test))
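# Hedged sketch (assumption, not part of the original script): the SHAP values
# computed above could be visualized with shap.summary_plot, reusing the
# feature_names extracted from the iris DataFrame. For a multiclass model,
# TreeExplainer.shap_values typically returns one array per class.
if isinstance(shap_values, list):
    # Plot the summary for the first class only, as an example
    shap.summary_plot(shap_values[0], X_test, feature_names=feature_names)
else:
    shap.summary_plot(shap_values, X_test, feature_names=feature_names)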
    random_state=args.randomseed, n_jobs=-1)
x_resampled, y_resampled = sm.fit_sample(X, y)

# After oversampling: count the number of samples per class
np_resampled_y = np.asarray(np.unique(y_resampled, return_counts=True))
df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
print("\nNumber of samples after oversampling:\n{0}\n".format(df_resampled_y))

# Initialize the classifier
clf = CascadeForestClassifier(random_state=args.randomseed)
print("\nClassifier parameters:")
print(clf.get_params())
print("\nSMOTE parameters:")
print(sm.get_params())
print("\n")

# Train on the SMOTE-resampled data
clf.fit(x_resampled, y_resampled)

# Predict on the test set
y_pred = clf.predict(X_test)

# Report evaluation metrics on the test set
if num_categories > 2:
    model_evaluation(num_categories, y_test, y_pred)
else:
    bi_model_evaluation(y_test, y_pred)

end_time = time.time()  # program end time
print("\n[Finished in: {0:.6f} mins = {1:.6f} seconds]".format(
    (end_time - start_time) / 60, end_time - start_time))
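# Hedged compatibility note (assumption about the installed imbalanced-learn
# version): recent releases rename `fit_sample` to `fit_resample`. A
# version-tolerant variant of the resampling call above could look like:
#
#     if hasattr(sm, "fit_resample"):
#         x_resampled, y_resampled = sm.fit_resample(X, y)
#     else:
#         x_resampled, y_resampled = sm.fit_sample(X, y)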
valx = valx[:, 0:10]  # use the first n bands as features

train_data = train_data.astype(np.uint8)
test_data = test_data.astype(np.uint8)
train_label = train_label.astype(np.uint8)
test_label = test_label.astype(np.uint8)
# validation set
valx = valx.astype(np.uint8)
valy = valy.astype(np.uint8)

# `use_predictor` is a boolean switch in deep-forest; the predictor type is
# selected separately via `predictor`
classification = CascadeForestClassifier(max_depth=5, n_trees=100, n_jobs=6,
                                         use_predictor=True, predictor="forest")
classification.fit(train_data, train_label.ravel())

prediction = classification.predict_proba(valx)
probability = [prob[1] for prob in prediction]

# Threshold the positive-class probability at 0.5
pre_class = []
for i in probability:
    if i > 0.5:
        pre_class.append(1)
    else:
        pre_class.append(0)

prediction = prediction[:, 1]
pred = classification.predict(valx)
print(pred.shape)
print("Training set score:", classification.score(train_data, train_label))
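# Hedged sketch (not in the original script): the thresholding loop above can be
# written with NumPy directly, assuming `probability` holds the positive-class
# probabilities returned by predict_proba.
pre_class_vectorized = (np.asarray(probability) > 0.5).astype(int).tolist()
assert pre_class_vectorized == pre_class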