Ejemplo n.º 1
0
        def is_parallel_split_faster(n_estimators, aggregation):
            """Assert that a multi-job fit is strictly faster than a single-job fit.

            Two identically configured forests are fitted on the ``X`` and ``y``
            of the enclosing scope: first with ``n_jobs=1``, then with
            ``n_jobs=effective_n_jobs``. Only the wall-clock duration of
            ``fit`` is measured.

            Parameters
            ----------
            n_estimators : number of trees passed to ``ForestClassifier``.
            aggregation : value forwarded as ``ForestClassifier(aggregation=...)``.
            """

            def timed_fit(n_jobs):
                # The forest is built outside the timed section, as only the
                # call to fit() is of interest here.
                forest = ForestClassifier(
                    random_state=random_state,
                    n_estimators=n_estimators,
                    n_jobs=n_jobs,
                    aggregation=aggregation,
                )
                started = time()
                forest.fit(X, y)
                return time() - started

            time_no_parallel = timed_fit(1)
            time_parallel = timed_fit(effective_n_jobs)

            # Only a plain "parallel is faster" check is enforced; a bound
            # scaled by effective_n_jobs is deliberately not asserted here.
            assert time_no_parallel > time_parallel
Ejemplo n.º 2
0
def test_nodes_on_classification_datasets(
    data_loader,
    n_estimators,
    aggregation,
    class_weight,
    n_jobs,
    max_features,
    random_state,
    dirichlet,
    step,
    multiclass,
    cat_split_strategy,
    criterion,
):
    """Fit a forest on a raw dataset and validate the nodes of every tree.

    All arguments other than ``data_loader`` are forwarded unchanged to
    ``ForestClassifier``. ``data_loader`` is called with ``raw=True`` and must
    return the ``(X, y)`` pair used for fitting. For each fitted tree, the
    nodes actually in use and the bin partitions are handed to
    ``check_nodes``.
    """
    X, y = data_loader(raw=True)
    forest_params = dict(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        criterion=criterion,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    clf = ForestClassifier(**forest_params)
    clf.fit(X, y)

    for tree in clf.trees:
        tree_clf = tree._tree_classifier
        node_count = tree_clf.node_count
        # Only the first node_count entries of the node array are inspected
        used_nodes = tree_clf.nodes[:node_count]
        partitions = tree_clf.bin_partitions
        # The node array must be at least as large as the number of nodes
        assert tree_clf.nodes.size >= node_count
        check_nodes(used_nodes, partitions, aggregation)
Ejemplo n.º 3
0
    def test_dirichlet_switch(self):
        """Check that ``dirichlet`` can be changed after fitting.

        Two forests are fitted on the same breast cancer split, one with the
        default ``dirichlet`` and one with ``dirichlet=2.0``. Their predicted
        probabilities must differ, and must coincide (up to ``1e-5``) once both
        forests are given the same ``dirichlet`` value without re-fitting.
        """
        data = self.breast_cancer
        X_train, X_test, y_train, y_test = train_test_split(
            data["data"],
            data["target"],
            shuffle=True,
            stratify=data["target"],
            random_state=42,
            test_size=0.3,
        )

        shared_params = dict(class_weight="balanced", random_state=42)
        clf1 = ForestClassifier(**shared_params)
        clf2 = ForestClassifier(dirichlet=2.0, **shared_params)
        clf1.fit(X_train, y_train)
        clf2.fit(X_train, y_train)

        y_score1 = clf1.predict_proba(X_test)
        y_score2 = clf2.predict_proba(X_test)
        # Different dirichlet values must lead to visibly different scores
        largest_gap = np.max(np.abs(y_score1 - y_score2))
        assert largest_gap >= 0.01

        # Switching clf2 to 0.5 (presumably the default used by clf1) must
        # make both forests agree again
        clf2.dirichlet = 0.5
        y_score2 = clf2.predict_proba(X_test)
        assert y_score1 == pytest.approx(y_score2, abs=1e-5)

        # Same check after moving both forests to a common new value
        for forest in (clf1, clf2):
            forest.dirichlet = 1.1
        y_score1 = clf1.predict_proba(X_test)
        y_score2 = clf2.predict_proba(X_test)
        assert y_score1 == pytest.approx(y_score2, abs=1e-5)
Ejemplo n.º 4
0
        def is_parallel_split_faster(n_estimators, aggregation):
            """Assert that fitting with several jobs gives a sufficient speed-up.

            Two identically configured forests are fitted on the ``X`` and ``y``
            of the enclosing scope: first with ``n_jobs=1``, then with
            ``n_jobs=effective_n_jobs``. The wall-clock duration of each
            ``fit`` call is measured and printed, then compared.

            Parameters
            ----------
            n_estimators : number of trees passed to ``ForestClassifier``.
            aggregation : value forwarded as ``ForestClassifier(aggregation=...)``.

            Raises
            ------
            AssertionError
                If the sequential fit is not at least ``effective_n_jobs / 3``
                times slower than the parallel fit.
            """
            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=1,
                aggregation=aggregation,
            )
            tic = time()
            clf.fit(X, y)
            toc = time()
            time_no_parallel = toc - tic

            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=effective_n_jobs,
                aggregation=aggregation,
            )

            tic = time()
            clf.fit(X, y)
            toc = time()
            time_parallel = toc - tic

            # Report the timings before asserting, so that they are part of the
            # captured output when the assertion below fails
            print("time_no_parallel:", time_no_parallel)
            print("time_parallel:", time_parallel)

            # We want parallel code to be effective_n_jobs / 3 faster when using
            # effectively effective_n_jobs threads
            assert time_no_parallel >= effective_n_jobs * time_parallel / 3
Ejemplo n.º 5
0
def test_several_max_bins_for_classification(loader, is_categorical_,
                                             required_log_loss, max_bins,
                                             aggregation):
    """Check categorical detection and test log-loss for a given ``max_bins``.

    ``loader`` is called with ``raw=True`` to get ``(X, y)``. After fitting on
    the training part of a seeded split, the ``is_categorical_`` attribute of
    the forest must equal the expected ``is_categorical_`` argument, and the
    log-loss on the test part must be strictly below ``required_log_loss``.
    """
    seed = 42
    X, y = loader(raw=True)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=seed)

    clf = ForestClassifier(
        n_estimators=10,
        n_jobs=-1,
        aggregation=aggregation,
        max_bins=max_bins,
        dirichlet=1e-2,
        class_weight="balanced",
        random_state=seed,
    )
    clf.fit(X_train, y_train)

    np.testing.assert_equal(clf.is_categorical_, is_categorical_)
    test_log_loss = log_loss(y_test, clf.predict_proba(X_test))
    assert test_log_loss < required_log_loss
Ejemplo n.º 6
0
 def test_aggregation_dirichlet(self):
     """Fitting with ``aggregation=True`` and ``dirichlet=0.0`` must raise."""
     data = self.iris
     features, labels = data["data"], data["target"]
     expected_message = "dirichlet must be > 0 when aggregation=True"
     # The error is expected from fit(), not from the constructor
     invalid_clf = ForestClassifier(dirichlet=0.0, aggregation=True)
     with pytest.raises(ValueError, match=expected_message):
         invalid_clf.fit(features, labels)
Ejemplo n.º 7
0
    def test_cat_split_strategy_on_car(self):
        """Compare ``cat_split_strategy="all"`` against ``"binary"`` on car.

        A single-tree forest is fitted once per strategy on the same split of
        the car dataset (no one-hot encoding, 1/5 of the data kept for
        testing). The ``"all"`` strategy must reach a strictly smaller
        log-loss than ``"binary"`` on both the train and the test parts.
        """
        dataset = load_car()
        dataset.one_hot_encode = False
        dataset.test_size = 1.0 / 5
        random_state = 42
        X_train, X_test, y_train, y_test = dataset.extract(
            random_state=random_state)
        categorical_features = dataset.categorical_features_

        def train_and_test_log_loss(cat_split_strategy):
            # Fit one forest with the given strategy and return its
            # (train log-loss, test log-loss) pair
            clf = ForestClassifier(
                n_estimators=1,
                n_jobs=1,
                multiclass="multinomial",
                aggregation=False,
                max_features=None,
                class_weight="balanced",
                categorical_features=categorical_features,
                cat_split_strategy=cat_split_strategy,
                random_state=random_state,
                dirichlet=0.0,
            )
            clf.fit(X_train, y_train)
            train_scores = clf.predict_proba(X_train)
            test_scores = clf.predict_proba(X_test)
            train_loss = log_loss(y_train, train_scores)
            test_loss = log_loss(y_test, test_scores)
            return train_loss, test_loss

        lloss_train_binary, lloss_test_binary = train_and_test_log_loss(
            "binary")
        lloss_train_all, lloss_test_all = train_and_test_log_loss("all")

        assert lloss_train_all < lloss_train_binary
        assert lloss_test_all < lloss_test_binary
Ejemplo n.º 8
0
    def test_n_classes_classes_n_features_n_samples(self):
        """Check the metadata attributes exposed by a fitted forest.

        With three distinct string labels and a ``(6, 2)`` feature matrix, the
        forest must report the sorted labels in ``classes_`` together with the
        matching ``n_classes_``, ``n_features_in_`` and ``n_samples_in_``.
        """
        labels = np.array(["one", "two", "three", "one", "one", "two"])
        # Rows are [0, 1], [1, 2], ..., [5, 6]
        features = np.array([[row, row + 1] for row in range(6)])

        clf = ForestClassifier()
        clf.fit(features, labels)

        expected_classes = ("one", "three", "two")
        assert tuple(clf.classes_) == expected_classes
        assert clf.n_classes_ == 3
        assert clf.n_features_in_ == 2
        assert clf.n_samples_in_ == 6
Ejemplo n.º 9
0
        def do_test_bootstrap(n_estimators, n_jobs, random_state):
            """Check how bootstrap samples and trees depend on ``random_state``.

            Every forest is fitted on the ``X`` and ``y`` of the enclosing
            scope with the given ``n_estimators``, ``n_jobs`` and
            ``random_state``.
            """

            def fitted_forest():
                # Build and fit a fresh forest with the arguments under test
                forest = ForestClassifier(n_estimators=n_estimators,
                                          n_jobs=n_jobs,
                                          random_state=random_state)
                forest.fit(X, y)
                return forest

            seeded = random_state is not None

            # 1. Within a single forest, any two distinct trees must have
            #    different bootstrap samples
            clf = fitted_forest()
            for first in range(n_estimators):
                for second in range(first + 1, n_estimators):
                    assert clf.trees[first]._train_indices != approx(
                        clf.trees[second]._train_indices)
                    assert clf.trees[first]._valid_indices != approx(
                        clf.trees[second]._valid_indices)

            # 2. Two forests share their bootstrap samples tree by tree when a
            #    seed is given, and have different ones when it is None
            clf1 = fitted_forest()
            clf2 = fitted_forest()
            for idx in range(n_estimators):
                tree1, tree2 = clf1.trees[idx], clf2.trees[idx]
                if seeded:
                    assert tree1._train_indices == approx(
                        tree2._train_indices)
                    assert tree1._valid_indices == approx(
                        tree2._valid_indices)
                else:
                    assert tree1._train_indices != approx(
                        tree2._train_indices)
                    assert tree1._valid_indices != approx(
                        tree2._valid_indices)

            # 3. The leaves returned by apply() and the predicted
            #    probabilities are identical when a seed is given (which also
            #    covers the seeding of columns subsampling), and differ when
            #    it is None
            clf1 = fitted_forest()
            clf2 = fitted_forest()
            if seeded:
                assert clf1.apply(X) == approx(clf2.apply(X))
                assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))
            else:
                assert clf1.apply(X) != approx(clf2.apply(X))
                assert clf1.predict_proba(X) != approx(clf2.predict_proba(X))
Ejemplo n.º 10
0
 def test_n_features_(self):
     """``n_features_`` must raise before ``fit`` and match ``X`` after it."""
     clf = ForestClassifier(n_estimators=2)
     unfitted_message = "You must call fit before asking for n_features_"
     with pytest.raises(ValueError, match=unfitted_message):
         _ = clf.n_features_

     n_samples, n_features = 10, 3
     np.random.seed(42)
     X = np.random.randn(n_samples, n_features)
     # Five samples of class 0 followed by five samples of class 1
     y = np.array([0] * 5 + [1] * 5)
     clf.fit(X, y)
     assert clf.n_features_ == n_features
Ejemplo n.º 11
0
 def test_performance_breast_cancer(self):
     """Require a test ROC AUC of at least 0.98 on breast cancer.

     The check is repeated for four configurations: with and without
     ``class_weight="balanced"``, each with the default criterion and with
     ``criterion="entropy"``.
     """
     dataset = self.breast_cancer
     X, y = dataset["data"], dataset["target"]
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3)

     configurations = (
         dict(class_weight="balanced", random_state=42),
         dict(random_state=42),
         dict(class_weight="balanced", random_state=42, criterion="entropy"),
         dict(random_state=42, criterion="entropy"),
     )
     for params in configurations:
         clf = ForestClassifier(**params)
         clf.fit(X_train, y_train)
         y_score = clf.predict_proba(X_test)
         # Binary problem: the AUC is computed from the second column
         assert roc_auc_score(y_test, y_score[:, 1]) >= 0.98
Ejemplo n.º 12
0
    def test_performance_iris(self):
        """Require a one-vs-one test ROC AUC of at least 0.985 on iris.

        The check is repeated for four configurations: with and without
        ``class_weight="balanced"``, each with the default criterion and with
        ``criterion="entropy"``.
        """
        dataset = self.iris
        X, y = dataset["data"], dataset["target"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3)

        configurations = (
            dict(class_weight="balanced", random_state=42),
            dict(random_state=42),
            dict(class_weight="balanced",
                 random_state=42,
                 criterion="entropy"),
            dict(random_state=42, criterion="entropy"),
        )
        for params in configurations:
            clf = ForestClassifier(**params)
            clf.fit(X_train, y_train)
            y_score = clf.predict_proba(X_test)
            assert roc_auc_score(y_test, y_score, multi_class="ovo") >= 0.985
Ejemplo n.º 13
0
    def test_parallel_fit(self):
        """Check that fitting with several jobs gives a sufficient speed-up.

        A forest with four trees per job is fitted on 100,000 ``make_moons``
        samples, once with ``n_jobs=1`` and once with
        ``n_jobs=self.effective_n_jobs``, both with and without aggregation.
        The sequential fit must be at least ``effective_n_jobs / 3`` times
        slower than the parallel one.

        The comparison relies on wall-clock time, so it is sensitive to the
        load of the machine running the test.
        """
        n_samples = 100_000
        X, y = make_moons(n_samples=n_samples, noise=0.2, random_state=42)

        # Precompile: tiny warm-up fits for both aggregation modes, so that
        # one-off start-up costs are (presumably) not part of the timings
        clf = ForestClassifier(n_estimators=1, n_jobs=1, aggregation=True)
        clf.fit(X[:10], y[:10])
        clf = ForestClassifier(n_estimators=1, n_jobs=1, aggregation=False)
        clf.fit(X[:10], y[:10])

        random_state = 42

        effective_n_jobs = self.effective_n_jobs
        print("effective_n_jobs: ", effective_n_jobs)

        def is_parallel_split_faster(n_estimators, aggregation):
            """Time a sequential and a parallel fit and compare them."""
            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=1,
                aggregation=aggregation,
            )
            tic = time()
            clf.fit(X, y)
            toc = time()
            time_no_parallel = toc - tic

            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=effective_n_jobs,
                aggregation=aggregation,
            )

            tic = time()
            clf.fit(X, y)
            toc = time()
            time_parallel = toc - tic

            # Report the timings before asserting, so that they are part of the
            # captured output when the assertion below fails
            print("time_no_parallel:", time_no_parallel)
            print("time_parallel:", time_parallel)

            # We want parallel code to be effective_n_jobs / 3 faster when using
            # effectively effective_n_jobs threads
            assert time_no_parallel >= effective_n_jobs * time_parallel / 3

        # We want each thread to handle 4 trees
        n_estimators = 4 * effective_n_jobs
        is_parallel_split_faster(n_estimators=n_estimators, aggregation=True)
        is_parallel_split_faster(n_estimators=n_estimators, aggregation=False)
Ejemplo n.º 14
0
    def test_ovr_with_two_classes(self):
        """Check that ``multiclass="ovr"`` and ``multiclass="multinomial"``
        give the same predicted probabilities on the adult dataset.

        Both forests share every other hyper-parameter and the same seeded
        train/test extraction (without one-hot encoding).
        """
        dataset = self.adult
        dataset.one_hot_encode = False
        random_state = 42
        X_train, X_test, y_train, y_test = dataset.extract(
            random_state=random_state)

        # Everything except the multiclass strategy is shared
        common_params = dict(
            n_estimators=2,
            n_jobs=-1,
            aggregation=False,
            max_features=None,
            class_weight="balanced",
            categorical_features=dataset.categorical_features_,
            random_state=random_state,
            dirichlet=0.0,
        )

        def predict_test_scores(multiclass):
            # Fit a forest with the given strategy and score the test set
            clf = ForestClassifier(multiclass=multiclass, **common_params)
            clf.fit(X_train, y_train)
            return clf.predict_proba(X_test)

        y_scores_test1 = predict_test_scores("multinomial")
        y_scores_test2 = predict_test_scores("ovr")

        assert y_scores_test1 == approx(y_scores_test2)
Ejemplo n.º 15
0
 def test_performance_cat_split_strategy_iris(self):
     """Require a one-vs-one test ROC AUC of at least 0.985 on iris for the
     ``"all"`` and ``"random"`` values of ``cat_split_strategy``."""
     dataset = self.iris
     X, y = dataset["data"], dataset["target"]
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3)

     for strategy in ("all", "random"):
         clf = ForestClassifier(cat_split_strategy=strategy, random_state=42)
         clf.fit(X_train, y_train)
         y_score = clf.predict_proba(X_test)
         assert roc_auc_score(y_test, y_score, multi_class="ovo") >= 0.985
Ejemplo n.º 16
0
    def test_performance_cat_split_strategy_breast_cancer(self):
        """Require a test ROC AUC of at least 0.98 on breast cancer for the
        ``"all"`` and ``"random"`` values of ``cat_split_strategy``."""
        dataset = self.breast_cancer
        X, y = dataset["data"], dataset["target"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, shuffle=True, stratify=y, random_state=42, test_size=0.3)

        for strategy in ("all", "random"):
            clf = ForestClassifier(cat_split_strategy=strategy,
                                   random_state=42)
            clf.fit(X_train, y_train)
            y_score = clf.predict_proba(X_test)
            # Binary problem: the AUC is computed from the second column
            assert roc_auc_score(y_test, y_score[:, 1]) >= 0.98
Ejemplo n.º 17
0
def test_min_samples_split_min_samples_leaf_on_adult(
    aggregation,
    max_features,
    random_state,
    min_samples_split,
    min_samples_leaf,
    criterion,
):
    """Check the minimum sample counts of all nodes on the adult dataset.

    A three-tree forest is fitted on the raw adult data. Every node in use
    must hold at least ``min(min_samples_split, min_samples_leaf)`` training
    samples and, when ``aggregation`` is truthy, at least as many validation
    samples.
    """
    X, y = load_adult(raw=True)
    clf = ForestClassifier(
        n_estimators=3,
        n_jobs=-1,
        multiclass="multinomial",
        aggregation=aggregation,
        max_features=max_features,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight="balanced",
        random_state=random_state,
        step=1.0,
    )
    clf.fit(X, y)

    min_samples = min(min_samples_split, min_samples_leaf)
    # Validation counts are only checked when aggregation is used
    checked_fields = ["n_samples_train"]
    if aggregation:
        checked_fields.append("n_samples_valid")

    for tree in clf.trees:
        tree_clf = tree._tree_classifier
        used_nodes = tree_clf.nodes[:tree_clf.node_count]
        for node in used_nodes:
            # Check that nodes respect the min_samples_split and
            # min_samples_leaf constraints
            for field in checked_fields:
                assert node[field] >= min_samples
Ejemplo n.º 18
0
        def do_test_bootstrap_again(n_estimators, n_jobs):
            """Check how the trees depend on bootstrap and column subsampling seeds.

            In each of the three scenarios below, the ``_generate_random_states``
            method of a fresh forest is replaced by a function that sets
            ``_random_states_bootstrap`` and ``_random_states_trees`` to chosen
            values. The forest is then fitted on the ``X`` and ``y`` of the
            enclosing scope and every pair of distinct trees is compared.

            Each replacement function reads ``clf.n_estimators`` from the
            enclosing scope (not from ``self``) when ``n_states`` is falsy, so it
            must be used before ``clf`` is rebound to the next forest.
            """
            # 4. When bootstrap seeds and column subsampling seeds are the same,
            #    the trees are all the same
            clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

            def _my_generate_random_states(self, n_states=None):
                # Both kinds of seeds are all equal to 1
                self._random_states_bootstrap = np.ones(
                    (n_states or clf.n_estimators), dtype=np.int32)
                self._random_states_trees = np.ones(
                    (n_states or clf.n_estimators), dtype=np.int32)

            # Monkey patch the classifier
            clf._generate_random_states = types.MethodType(
                _my_generate_random_states, clf)
            clf.fit(X, y)
            leaves = clf.apply(X)
            for n_estimator1, n_estimator2 in product(range(n_estimators),
                                                      range(n_estimators)):
                # Only look at each unordered pair of distinct trees once
                if n_estimator1 < n_estimator2:
                    assert clf.trees[n_estimator1]._train_indices == approx(
                        clf.trees[n_estimator2]._train_indices)
                    assert clf.trees[n_estimator1]._valid_indices == approx(
                        clf.trees[n_estimator2]._valid_indices)
                    assert leaves[n_estimator1] == approx(leaves[n_estimator2])

            # 5. When bootstrap seeds are the same but column subsampling seeds are
            #    different, all the trees are different
            clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

            def _my_generate_random_states(self, n_states=None):
                # All bootstrap seeds are the same
                self._random_states_bootstrap = np.ones(
                    (n_states or clf.n_estimators), dtype=np.int32)
                # But column subsampling seeds are different
                self._random_states_trees = np.arange(n_states
                                                      or clf.n_estimators,
                                                      dtype=np.int32)

            # Monkey patch the classifier
            clf._generate_random_states = types.MethodType(
                _my_generate_random_states, clf)
            clf.fit(X, y)
            leaves = clf.apply(X)
            for n_estimator1, n_estimator2 in product(range(n_estimators),
                                                      range(n_estimators)):
                if n_estimator1 < n_estimator2:
                    # Same bootstrap samples, but different leaves
                    assert clf.trees[n_estimator1]._train_indices == approx(
                        clf.trees[n_estimator2]._train_indices)
                    assert clf.trees[n_estimator1]._valid_indices == approx(
                        clf.trees[n_estimator2]._valid_indices)
                    assert leaves[n_estimator1] != approx(leaves[n_estimator2])

            # 6. When bootstrap seeds are different but column subsampling seeds are
            #    identical, all the trees are different
            clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

            def _my_generate_random_states(self, n_states=None):
                # All bootstrap seeds are different
                self._random_states_bootstrap = np.arange(n_states
                                                          or clf.n_estimators,
                                                          dtype=np.int32)
                # But column subsampling seeds are all the same
                self._random_states_trees = np.ones(
                    (n_states or clf.n_estimators, ), dtype=np.int32)

            # Monkey patch the classifier
            clf._generate_random_states = types.MethodType(
                _my_generate_random_states, clf)
            clf.fit(X, y)
            leaves = clf.apply(X)
            for n_estimator1, n_estimator2 in product(range(n_estimators),
                                                      range(n_estimators)):
                if n_estimator1 < n_estimator2:
                    # Different bootstrap samples and different leaves
                    assert clf.trees[n_estimator1]._train_indices != approx(
                        clf.trees[n_estimator2]._train_indices)
                    assert clf.trees[n_estimator1]._valid_indices != approx(
                        clf.trees[n_estimator2]._valid_indices)
                    assert leaves[n_estimator1] != approx(leaves[n_estimator2])
Ejemplo n.º 19
0

# Build the forest from hyper-parameters that are expected to be defined
# before this point (their definitions are not visible in this excerpt).
clf1 = ForestClassifier(
    n_estimators=n_estimators,
    n_jobs=n_jobs,
    multiclass=multiclass,
    cat_split_strategy=cat_split_strategy,
    aggregation=aggregation,
    max_features=max_features,
    class_weight=class_weight,
    random_state=random_state,
    dirichlet=dirichlet,
    step=step,
)

clf1.fit(X_train, y_train)


# Round-trip the fitted forest through a pickle file written to the current
# working directory: dump clf1, then load it back as clf2.
filename = "forest_classifier_on_iris.pkl"
with open(filename, "wb") as f:
    pkl.dump(clf1, f)

# NOTE: unpickling can execute arbitrary code; this is only safe because the
# file was written by this very script a few lines above.
with open(filename, "rb") as f:
    clf2 = pkl.load(f)


# os.remove(filename)
#
# assert_forests_equal(clf1, clf2, is_classifier=True)
#
# y_pred1 = clf1.predict_proba(X_test)
Ejemplo n.º 20
0
    def test_class_weight_sample_weights(self):
        """Check the interplay between ``class_weight`` and ``sample_weight``.

        Three things are verified:

        1. On iris, fitting without ``sample_weight`` matches fitting with all
           weights equal to one, and ``class_weight="balanced"`` matches
           passing ``compute_sample_weight("balanced", y)`` explicitly.
        2. On an unbalanced subset of iris, ``class_weight="balanced"`` does
           not decrease precision, recall or f1-score for any class.
        3. On an unbalanced subset of breast cancer, ``class_weight="balanced"``
           strictly increases the weighted average precision.
        """

        def split_in_half(X_all, y_all):
            # Seeded and stratified 50/50 split
            return train_test_split(X_all,
                                    y_all,
                                    shuffle=True,
                                    stratify=y_all,
                                    random_state=42,
                                    test_size=0.5)

        def predict_labels(class_weight, X_fit, y_fit, X_eval):
            # Fit an aggregated forest and return its predicted labels
            forest = ForestClassifier(class_weight=class_weight,
                                      random_state=42,
                                      aggregation=True)
            forest.fit(X_fit, y_fit)
            return forest.predict(X_eval)

        iris = self.iris
        X, y = iris["data"], iris["target"]

        # 1. Each class_weight setting must be equivalent to class_weight=None
        #    combined with the matching explicit sample weights
        equivalent_setups = (
            (None, np.ones(y.shape[0])),
            ("balanced", compute_sample_weight("balanced", y)),
        )
        for class_weight, sample_weight in equivalent_setups:
            clf1 = ForestClassifier(class_weight=class_weight, random_state=42)
            clf1.fit(X, y)
            clf2 = ForestClassifier(class_weight=None, random_state=42)
            clf2.fit(X, y, sample_weight=sample_weight)
            assert clf1.apply(X) == approx(clf2.apply(X))
            assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))

        # 2. Simulate unbalanced data from the iris dataset: rows 0 to 55
        #    followed by rows 100 to 105
        X_unb = np.concatenate((X[:56], X[100:106]), axis=0)
        y_unb = np.concatenate((y[:56], y[100:106]), axis=0)
        X_train, X_test, y_train, y_test = split_in_half(X_unb, y_unb)

        report1 = classification_report(y_test,
                                        predict_labels(None, X_train, y_train,
                                                       X_test),
                                        output_dict=True)
        report2 = classification_report(y_test,
                                        predict_labels("balanced", X_train,
                                                       y_train, X_test),
                                        output_dict=True)

        # In the considered case, class_weight should improve all metrics
        for label in ["0", "1", "2"]:
            for metric in ("precision", "recall", "f1-score"):
                assert report2[label][metric] >= report1[label][metric]

        # 3. Unbalanced breast cancer: all samples of class 0 and only the
        #    first ten samples of class 1
        breast_cancer = self.breast_cancer
        X, y = breast_cancer["data"], breast_cancer["target"]
        is_class_0 = y == 0
        is_class_1 = y == 1
        X_unb = np.concatenate((X[is_class_0], X[is_class_1][:10]), axis=0)
        y_unb = np.concatenate((y[is_class_0], y[is_class_1][:10]), axis=0)
        X_train, X_test, y_train, y_test = split_in_half(X_unb, y_unb)

        y_scores = predict_labels(None, X_train, y_train, X_test)
        y_test_binary = LabelBinarizer().fit_transform(y_test)
        avg_prec1 = average_precision_score(y_test_binary,
                                            y_scores,
                                            average="weighted")

        y_scores = predict_labels("balanced", X_train, y_train, X_test)
        avg_prec2 = average_precision_score(y_test_binary,
                                            y_scores,
                                            average="weighted")

        assert avg_prec2 > avg_prec1
Ejemplo n.º 21
0
def test_forest_classifier_serialization(
    dataset_name,
    n_estimators,
    aggregation,
    class_weight,
    dirichlet,
    n_jobs,
    max_features,
    random_state,
    step,
    multiclass,
    cat_split_strategy,
):
    """Check that a fitted forest survives a pickle round-trip through a file.

    A forest is fitted on the training part of the requested dataset, dumped
    to a file and loaded back. The loaded forest must be equal to the original
    one according to ``assert_forests_equal`` and must give exactly the same
    ``predict_proba``, ``predict`` and ``apply`` outputs on the test part.

    Parameters
    ----------
    dataset_name : either ``"adult"`` or ``"iris"``.

    All other arguments are forwarded unchanged to ``ForestClassifier``.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the supported names.
    """
    import tempfile

    if dataset_name == "adult":
        X, y = load_adult(raw=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=random_state)
    elif dataset_name == "iris":
        iris = datasets.load_iris()
        X = iris.data
        y = iris.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 / 5, random_state=random_state)
    else:
        # Fail with a clear message rather than a NameError on X_train below
        raise ValueError(f"Unknown dataset_name: {dataset_name!r}")

    clf1 = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        max_bins=37,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    clf1.fit(X_train, y_train)

    # A private temporary directory keeps the working directory clean, is
    # removed even when dumping or loading raises, and avoids collisions
    # between test cases running at the same time
    with tempfile.TemporaryDirectory() as tmp_dir:
        filename = os.path.join(tmp_dir, "forest_classifier.pkl")
        with open(filename, "wb") as f:
            pkl.dump(clf1, f)

        with open(filename, "rb") as f:
            clf2 = pkl.load(f)

    assert_forests_equal(clf1, clf2, is_classifier=True)

    y_pred1 = clf1.predict_proba(X_test)
    y_pred2 = clf2.predict_proba(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    apply1 = clf1.apply(X_test)
    apply2 = clf2.apply(X_test)
    np.testing.assert_equal(apply1, apply2)
Ejemplo n.º 22
0
        y0="y",
        x1="x0",
        y1="y0",
        line_color="#151515",
        line_alpha=0.4,
        source=source_tree,
    )

    tooltips = [(attribute, "@" + attribute) for attribute in attributes]

    tree_hover = HoverTool(renderers=[circles], tooltips=tooltips)
    fig.add_tools(tree_hover)
    fig.text(x="x", y="y", text="node_id", source=source_tree)
    return fig


if __name__ == "__main__":
    # Demo: one categorical feature taking the values 0 to 4, each repeated
    # 20 times, with a label that depends only on the feature value.
    n_repeats = 20
    X = np.arange(5).repeat(n_repeats).reshape((-1, 1))
    y = np.repeat([1, 0, 0, 1, 0], n_repeats)

    forest_params = dict(
        n_estimators=1,
        random_state=42,
        categorical_features=[True],
        dirichlet=0.0,
    )
    clf = ForestClassifier(**forest_params)
    clf.fit(X, y)

    # Print the nodes of the only tree, then display its plot
    df = clf.get_nodes(0)
    print(df)
    fig = plot_tree(clf, 0)
    show(fig)