Example 1
def test_nodes_on_classification_datasets(
    data_loader,
    n_estimators,
    aggregation,
    class_weight,
    n_jobs,
    max_features,
    random_state,
    dirichlet,
    step,
    multiclass,
    cat_split_strategy,
    criterion,
):
    X, y = data_loader(raw=True)
    clf = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        criterion=criterion,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    clf.fit(X, y)
    for tree in clf.trees:
        node_count = tree._tree_classifier.node_count
        nodes = tree._tree_classifier.nodes[:node_count]
        bin_partitions = tree._tree_classifier.bin_partitions
        assert tree._tree_classifier.nodes.size >= node_count
        check_nodes(nodes, bin_partitions, aggregation)
Example 2
def test_several_max_bins_for_classification(loader, is_categorical_,
                                             required_log_loss, max_bins,
                                             aggregation):
    X, y = loader(raw=True)
    random_state = 42
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    n_estimators = 10
    class_weight = "balanced"
    n_jobs = -1
    dirichlet = 1e-2

    clf = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        aggregation=aggregation,
        max_bins=max_bins,
        dirichlet=dirichlet,
        class_weight=class_weight,
        random_state=random_state,
    )
    clf.fit(X_train, y_train)
    np.testing.assert_equal(clf.is_categorical_, is_categorical_)
    y_scores_test = clf.predict_proba(X_test)
    assert log_loss(y_test, y_scores_test) < required_log_loss
Example 3
 def test_aggregation_dirichlet(self):
     iris = self.iris
     X, y = iris["data"], iris["target"]
     clf = ForestClassifier(dirichlet=0.0, aggregation=True)
     with pytest.raises(
             ValueError,
             match="dirichlet must be > 0 when aggregation=True"):
         clf.fit(X, y)
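A minimal usage sketch (not part of the test suite), assuming ForestClassifier is importable as in the tests above: with aggregation enabled, dirichlet must be strictly positive, otherwise fit() raises the ValueError checked in Example 3.

from sklearn.datasets import load_iris

# ForestClassifier is assumed to come from the library under test
# (its import path is not shown in these snippets).
X, y = load_iris(return_X_y=True)

# aggregation=True requires a strictly positive dirichlet
clf = ForestClassifier(aggregation=True, dirichlet=0.5)
clf.fit(X, y)
probas = clf.predict_proba(X)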
Example 4
 def test_criterion(self):
     clf = ForestClassifier()
     assert clf.criterion == "gini"
     with pytest.raises(ValueError, match="criterion must be a string"):
         clf.criterion = 1
     clf.criterion = "entropy"
     assert clf.criterion == "entropy"
     with pytest.raises(ValueError, match="Unknown criterion: other"):
         clf.criterion = "other"
Example 5
    def test_n_classes_classes_n_features_n_samples(self):
        y = np.array(["one", "two", "three", "one", "one", "two"])
        X = np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
        clf = ForestClassifier()
        clf.fit(X, y)

        assert tuple(clf.classes_) == ("one", "three", "two")
        assert clf.n_classes_ == 3
        assert clf.n_features_in_ == 2
        assert clf.n_samples_in_ == 6
Example 6
 def test_n_features_(self):
     clf = ForestClassifier(n_estimators=2)
     with pytest.raises(
             ValueError,
             match="You must call fit before asking for n_features_"):
         _ = clf.n_features_
     np.random.seed(42)
     X = np.random.randn(10, 3)
     y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
     clf.fit(X, y)
     assert clf.n_features_ == 3
Example 7
    def test_loss(self):
        clf = ForestClassifier()
        assert clf.loss == "log"
        with pytest.raises(ValueError, match="loss must be a string"):
            clf.loss = 3.14
        with pytest.raises(ValueError,
                           match="Only loss='log' is supported for now"):
            clf.loss = "other"

        with pytest.raises(ValueError,
                           match="Only loss='log' is supported for now"):
            _ = ForestClassifier(loss="other")
Example 8
        def is_parallel_split_faster(n_estimators, aggregation):
            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=1,
                aggregation=aggregation,
            )
            tic = time()
            clf.fit(X, y)
            toc = time()
            time_no_parallel = toc - tic

            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=effective_n_jobs,
                aggregation=aggregation,
            )

            tic = time()
            clf.fit(X, y)
            toc = time()
            time_parallel = toc - tic

            # We expect the parallel fit to be at least effective_n_jobs / 3
            # times faster when effective_n_jobs threads are used
            assert time_no_parallel >= effective_n_jobs * time_parallel / 3
            print("time_no_parallel:", time_no_parallel)
            print("time_parallel:", time_parallel)
Example 9
 def test_class_weight(self):
     # Test that the default is None
     clf = ForestClassifier()
     assert clf.class_weight is None
     clf.class_weight = "balanced"
     assert clf.class_weight == "balanced"
     with pytest.raises(
             ValueError,
             match='class_weight can only be None or "balanced"'):
         clf.class_weight = "truc"
     with pytest.raises(
             ValueError,
             match='class_weight can only be None or "balanced"'):
         clf.class_weight = 1
Example 10
        def is_parallel_split_faster(n_estimators, aggregation):
            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=1,
                aggregation=aggregation,
            )
            tic = time()
            clf.fit(X, y)
            toc = time()
            time_no_parallel = toc - tic

            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=effective_n_jobs,
                aggregation=aggregation,
            )

            tic = time()
            clf.fit(X, y)
            toc = time()
            time_parallel = toc - tic

            # We only require the parallel fit to be faster than the sequential one
            # assert time_no_parallel >= effective_n_jobs * time_parallel / 4
            assert time_no_parallel > time_parallel
Example 11
    def test_ovr_with_two_classes(self):
        """Test on a binary classification problem that 'ovr' and 'multiclass' are
        exactly identical"""
        dataset = self.adult
        dataset.one_hot_encode = False
        random_state = 42
        X_train, X_test, y_train, y_test = dataset.extract(
            random_state=random_state)

        n_estimators = 2
        aggregation = False
        class_weight = "balanced"
        n_jobs = -1
        max_features = None
        dirichlet = 0.0
        categorical_features = dataset.categorical_features_

        multiclass = "multinomial"
        clf = ForestClassifier(
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            multiclass=multiclass,
            aggregation=aggregation,
            max_features=max_features,
            class_weight=class_weight,
            categorical_features=categorical_features,
            random_state=random_state,
            dirichlet=dirichlet,
        )
        clf.fit(X_train, y_train)
        y_scores_test1 = clf.predict_proba(X_test)

        multiclass = "ovr"
        clf = ForestClassifier(
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            multiclass=multiclass,
            aggregation=aggregation,
            max_features=max_features,
            class_weight=class_weight,
            categorical_features=categorical_features,
            random_state=random_state,
            dirichlet=dirichlet,
        )
        clf.fit(X_train, y_train)
        y_scores_test2 = clf.predict_proba(X_test)

        assert y_scores_test1 == approx(y_scores_test2)
Example 12
 def test_performance_breast_cancer(self):
     breast_cancer = self.breast_cancer
     X, y = breast_cancer["data"], breast_cancer["target"]
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         shuffle=True,
                                                         stratify=y,
                                                         random_state=42,
                                                         test_size=0.3)
     clf = ForestClassifier(class_weight="balanced", random_state=42)
     clf.fit(X_train, y_train)
     y_score = clf.predict_proba(X_test)
     assert roc_auc_score(y_test, y_score[:, 1]) >= 0.98
     clf = ForestClassifier(random_state=42)
     clf.fit(X_train, y_train)
     y_score = clf.predict_proba(X_test)
     assert roc_auc_score(y_test, y_score[:, 1]) >= 0.98
Example 13
 def test_performance_cat_split_strategy_iris(self):
     iris = self.iris
     X, y = iris["data"], iris["target"]
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         shuffle=True,
                                                         stratify=y,
                                                         random_state=42,
                                                         test_size=0.3)
     clf = ForestClassifier(cat_split_strategy="all", random_state=42)
     clf.fit(X_train, y_train)
     y_score = clf.predict_proba(X_test)
     assert roc_auc_score(y_test, y_score, multi_class="ovo") >= 0.985
     clf = ForestClassifier(cat_split_strategy="random", random_state=42)
     clf.fit(X_train, y_train)
     y_score = clf.predict_proba(X_test)
     assert roc_auc_score(y_test, y_score, multi_class="ovo") >= 0.985
Example 14
 def test_step(self):
     clf = ForestClassifier()
     assert clf.step == 1.0
     clf = ForestClassifier(step=0.17)
     assert clf.step == 0.17
     clf.step = 0.42
     assert clf.step == 0.42
     # "step must be a real number"
     # "step must be positive"
     with pytest.raises(ValueError, match="step must be a float"):
         clf.step = "1"
     with pytest.raises(ValueError, match="step must be a float"):
         clf.step = 1
     with pytest.raises(ValueError, match="step must be a float"):
         clf.step = None
     with pytest.raises(ValueError, match="step must be positive"):
         clf.step = -1.0
     with pytest.raises(ValueError, match="step must be positive"):
         clf.step = -0.42
Example 15
 def test_min_samples_leaf(self):
     clf = ForestClassifier()
     assert clf.min_samples_leaf == 1
     clf = ForestClassifier(min_samples_leaf=17)
     assert clf.min_samples_leaf == 17
     clf.min_samples_leaf = 5
     assert clf.min_samples_leaf == 5
     with pytest.raises(ValueError,
                        match="min_samples_leaf must be an integer number"):
         clf.min_samples_leaf = 0.42
     with pytest.raises(ValueError,
                        match="min_samples_leaf must be an integer number"):
         clf.min_samples_leaf = None
     with pytest.raises(ValueError,
                        match="min_samples_leaf must be an integer number"):
         clf.min_samples_leaf = "4"
     with pytest.raises(ValueError, match="min_samples_leaf must be >= 1"):
         clf.min_samples_leaf = 0
     with pytest.raises(ValueError, match="min_samples_leaf must be >= 1"):
         clf.min_samples_leaf = -3
Example 16
    def test_parallel_fit(self):
        n_samples = 100_000
        X, y = make_moons(n_samples=n_samples, noise=0.2, random_state=42)

        # Precompile
        clf = ForestClassifier(n_estimators=1, n_jobs=1, aggregation=True)
        clf.fit(X[:10], y[:10])
        clf = ForestClassifier(n_estimators=1, n_jobs=1, aggregation=False)
        clf.fit(X[:10], y[:10])

        random_state = 42

        effective_n_jobs = self.effective_n_jobs
        print("effective_n_jobs: ", effective_n_jobs)

        def is_parallel_split_faster(n_estimators, aggregation):
            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=1,
                aggregation=aggregation,
            )
            tic = time()
            clf.fit(X, y)
            toc = time()
            time_no_parallel = toc - tic

            clf = ForestClassifier(
                random_state=random_state,
                n_estimators=n_estimators,
                n_jobs=effective_n_jobs,
                aggregation=aggregation,
            )

            tic = time()
            clf.fit(X, y)
            toc = time()
            time_parallel = toc - tic

            # We expect the parallel fit to be at least effective_n_jobs / 3
            # times faster when effective_n_jobs threads are used
            assert time_no_parallel >= effective_n_jobs * time_parallel / 3
            print("time_no_parallel:", time_no_parallel)
            print("time_parallel:", time_parallel)

        # We want each thread to handle 4 trees
        n_estimators = 4 * effective_n_jobs
        is_parallel_split_faster(n_estimators=n_estimators, aggregation=True)
        is_parallel_split_faster(n_estimators=n_estimators, aggregation=False)
Example 17
 def test_aggregation(self):
     clf = ForestClassifier()
     assert clf.aggregation
     clf = ForestClassifier(aggregation=False)
     assert not clf.aggregation
     clf.aggregation = True
     assert clf.aggregation
     with pytest.raises(ValueError, match="aggregation must be boolean"):
         clf.aggregation = "true"
     with pytest.raises(ValueError, match="aggregation must be boolean"):
         clf.aggregation = 1
Example 18
 def test_verbose(self):
     clf = ForestClassifier()
     assert not clf.verbose
     clf = ForestClassifier(verbose=True)
     assert clf.verbose
     clf.verbose = False
     assert not clf.verbose
     with pytest.raises(ValueError, match="verbose must be boolean"):
         clf.verbose = "true"
     with pytest.raises(ValueError, match="verbose must be boolean"):
         clf.verbose = 1
Example 19
def test_min_samples_split_min_samples_leaf_on_adult(
    aggregation,
    max_features,
    random_state,
    min_samples_split,
    min_samples_leaf,
    criterion,
):
    X, y = load_adult(raw=True)
    n_estimators = 3
    n_jobs = -1
    class_weight = "balanced"
    multiclass = "multinomial"
    step = 1.0
    clf = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        aggregation=aggregation,
        max_features=max_features,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight=class_weight,
        random_state=random_state,
        step=step,
    )
    clf.fit(X, y)
    min_samples = min(min_samples_split, min_samples_leaf)
    for tree in clf.trees:
        node_count = tree._tree_classifier.node_count
        nodes = tree._tree_classifier.nodes[:node_count]
        for node_id, node in enumerate(nodes):
            # Check that nodes respect the min_samples_split and
            # min_samples_leaf constraints
            assert node["n_samples_train"] >= min_samples
            if aggregation:
                assert node["n_samples_valid"] >= min_samples
Example 20
    def test_min_samples_split(self):
        clf = ForestClassifier()
        assert clf.min_samples_split == 2
        clf = ForestClassifier(min_samples_split=17)
        assert clf.min_samples_split == 17
        clf.min_samples_split = 5
        assert clf.min_samples_split == 5
        with pytest.raises(
                ValueError,
                match="min_samples_split must be an integer number"):
            clf.min_samples_split = 0.42
        with pytest.raises(
                ValueError,
                match="min_samples_split must be an integer number"):
            clf.min_samples_split = None
        with pytest.raises(
                ValueError,
                match="min_samples_split must be an integer number"):
            clf.min_samples_split = "4"

        with pytest.raises(ValueError, match="min_samples_split must be >= 2"):
            clf.min_samples_split = 1
        with pytest.raises(ValueError, match="min_samples_split must be >= 2"):
            clf.min_samples_split = -3
Example 21
    def test_n_samples_in_(self):
        clf = ForestClassifier()
        with pytest.raises(
                ValueError,
                match="You must call fit before asking for n_samples_in_"):
            _ = clf.n_samples_in_

        clf.n_samples_in_ = 42
        with pytest.raises(
                ValueError,
                match="You must call fit before asking for n_samples_in_"):
            _ = clf.n_samples_in_

        clf._fitted = True
        assert clf.n_samples_in_ == 42
        assert clf.n_samples_in_ == 42

        np.random.seed(42)
        X = np.random.randn(10, 3)
        y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        clf = ForestClassifier().fit(X, y)
        assert clf.n_samples_in_ == 10
Example 22
    def test_class_weight_sample_weights(self):
        iris = self.iris
        X, y = iris["data"], iris["target"]
        # Check that passing no sample_weight is equivalent to passing sample
        # weights all equal to 1
        clf1 = ForestClassifier(class_weight=None, random_state=42)
        clf1.fit(X, y)
        clf2 = ForestClassifier(class_weight=None, random_state=42)
        clf2.fit(X, y, sample_weight=np.ones(y.shape[0]))
        assert clf1.apply(X) == approx(clf2.apply(X))
        assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))

        clf1 = ForestClassifier(class_weight="balanced", random_state=42)
        clf1.fit(X, y)
        clf2 = ForestClassifier(class_weight=None, random_state=42)
        sample_weight = compute_sample_weight("balanced", y)
        clf2.fit(X, y, sample_weight=sample_weight)
        assert clf1.apply(X) == approx(clf2.apply(X))
        assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))

        # Simulate unbalanced data from the iris dataset
        X_unb = np.concatenate((X[0:50], X[50:56], X[100:106]), axis=0)
        y_unb = np.concatenate((y[0:50], y[50:56], y[100:106]), axis=0)

        X_train, X_test, y_train, y_test = train_test_split(X_unb,
                                                            y_unb,
                                                            shuffle=True,
                                                            stratify=y_unb,
                                                            random_state=42,
                                                            test_size=0.5)

        clf = ForestClassifier(class_weight=None,
                               random_state=42,
                               aggregation=True)
        clf.fit(X_train, y_train)
        y_scores = clf.predict(X_test)
        report1 = classification_report(y_test, y_scores, output_dict=True)

        clf = ForestClassifier(class_weight="balanced",
                               random_state=42,
                               aggregation=True)
        clf.fit(X_train, y_train)
        y_scores = clf.predict(X_test)
        report2 = classification_report(y_test, y_scores, output_dict=True)

        # In the considered case, class_weight should improve all metrics
        for label in ["0", "1", "2"]:
            label_report1 = report1[label]
            label_report2 = report2[label]
            assert label_report2["precision"] >= label_report1["precision"]
            assert label_report2["recall"] >= label_report1["recall"]
            assert label_report2["f1-score"] >= label_report1["f1-score"]

        breast_cancer = self.breast_cancer
        X, y = breast_cancer["data"], breast_cancer["target"]
        idx_0 = y == 0
        idx_1 = y == 1

        X_unb = np.concatenate((X[idx_0], X[idx_1][:10]), axis=0)
        y_unb = np.concatenate((y[idx_0], y[idx_1][:10]), axis=0)

        X_train, X_test, y_train, y_test = train_test_split(X_unb,
                                                            y_unb,
                                                            shuffle=True,
                                                            stratify=y_unb,
                                                            random_state=42,
                                                            test_size=0.5)

        clf = ForestClassifier(class_weight=None,
                               random_state=42,
                               aggregation=True)
        clf.fit(X_train, y_train)
        y_scores = clf.predict(X_test)

        y_test_binary = LabelBinarizer().fit_transform(y_test)

        avg_prec1 = average_precision_score(y_test_binary,
                                            y_scores,
                                            average="weighted")

        clf = ForestClassifier(class_weight="balanced",
                               random_state=42,
                               aggregation=True)
        clf.fit(X_train, y_train)
        y_scores = clf.predict(X_test)
        avg_prec2 = average_precision_score(y_test_binary,
                                            y_scores,
                                            average="weighted")

        assert avg_prec2 > avg_prec1
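A minimal sketch of the equivalence the first part of this test relies on, assuming ForestClassifier is importable as in these snippets: class_weight="balanced" should give the same forest as passing the corresponding per-sample weights computed with scikit-learn's compute_sample_weight.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.utils.class_weight import compute_sample_weight

# ForestClassifier is assumed to come from the library under test.
X, y = load_iris(return_X_y=True)

# Forest fitted with class_weight="balanced"
clf_balanced = ForestClassifier(class_weight="balanced", random_state=0)
clf_balanced.fit(X, y)

# Forest fitted with the equivalent explicit per-sample weights
sample_weight = compute_sample_weight("balanced", y)
clf_weighted = ForestClassifier(class_weight=None, random_state=0)
clf_weighted.fit(X, y, sample_weight=sample_weight)

# Same random_state and equivalent weights should yield the same probabilities
assert np.allclose(clf_balanced.predict_proba(X), clf_weighted.predict_proba(X))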
Example 23
    def test_cat_split_strategy_on_adult(self):
        dataset = load_adult()
        dataset.one_hot_encode = False
        dataset.test_size = 1.0 / 5
        random_state = 42
        X_train, X_test, y_train, y_test = dataset.extract(
            random_state=random_state)
        n_estimators = 10
        aggregation = False
        class_weight = "balanced"
        n_jobs = -1
        max_features = None
        random_state = 42
        dirichlet = 0.0
        categorical_features = dataset.categorical_features_

        multiclass = "multinomial"
        cat_split_strategy = "binary"
        clf = ForestClassifier(
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            multiclass=multiclass,
            aggregation=aggregation,
            max_features=max_features,
            class_weight=class_weight,
            categorical_features=categorical_features,
            cat_split_strategy=cat_split_strategy,
            random_state=random_state,
            dirichlet=dirichlet,
        )
        clf.fit(X_train, y_train)
        y_scores_train = clf.predict_proba(X_train)
        y_scores_test = clf.predict_proba(X_test)
        lloss_train_binary = log_loss(y_train, y_scores_train)

        multiclass = "multinomial"
        cat_split_strategy = "all"
        clf = ForestClassifier(
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            multiclass=multiclass,
            aggregation=aggregation,
            max_features=max_features,
            class_weight=class_weight,
            categorical_features=categorical_features,
            cat_split_strategy=cat_split_strategy,
            random_state=random_state,
            dirichlet=dirichlet,
        )
        clf.fit(X_train, y_train)
        y_scores_train = clf.predict_proba(X_train)
        lloss_train_all = log_loss(y_train, y_scores_train)
        assert lloss_train_all < lloss_train_binary
Example 24
    def test_multiclass_and_ovr_on_car(self):
        dataset = load_car()
        dataset.one_hot_encode = False
        dataset.test_size = 1.0 / 5
        random_state = 42
        X_train, X_test, y_train, y_test = dataset.extract(
            random_state=random_state)
        n_estimators = 1
        aggregation = False
        class_weight = "balanced"
        n_jobs = 1
        max_features = None
        random_state = 42
        dirichlet = 0.0
        categorical_features = dataset.categorical_features_

        multiclass = "multinomial"
        clf = ForestClassifier(
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            multiclass=multiclass,
            aggregation=aggregation,
            max_features=max_features,
            class_weight=class_weight,
            categorical_features=categorical_features,
            random_state=random_state,
            dirichlet=dirichlet,
        )
        clf.fit(X_train, y_train)
        y_scores_train = clf.predict_proba(X_train)
        y_scores_test = clf.predict_proba(X_test)
        lloss_train_multinomial = log_loss(y_train, y_scores_train)
        lloss_test_multinomial = log_loss(y_test, y_scores_test)

        multiclass = "ovr"
        clf = ForestClassifier(
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            multiclass=multiclass,
            aggregation=aggregation,
            max_features=max_features,
            class_weight=class_weight,
            categorical_features=categorical_features,
            random_state=random_state,
            dirichlet=dirichlet,
        )
        clf.fit(X_train, y_train)
        y_scores_train = clf.predict_proba(X_train)
        y_scores_test = clf.predict_proba(X_test)
        lloss_train_ovr = log_loss(y_train, y_scores_train)
        lloss_test_ovr = log_loss(y_test, y_scores_test)

        assert lloss_train_ovr < lloss_train_multinomial
        assert lloss_test_ovr < lloss_test_multinomial
Example 25
 def test_categorical_features(self):
     clf = ForestClassifier()
     assert clf.categorical_features is None
     clf.categorical_features = [1, 3]
     assert clf.categorical_features == [1, 3]
Example 26
        def do_test_bootstrap(n_estimators, n_jobs, random_state):
            # 1. Test that all bootstrap samples are different
            clf = ForestClassifier(n_estimators=n_estimators,
                                   n_jobs=n_jobs,
                                   random_state=random_state)
            clf.fit(X, y)

            for n_estimator1, n_estimator2 in product(range(n_estimators),
                                                      range(n_estimators)):
                if n_estimator1 < n_estimator2:
                    assert clf.trees[n_estimator1]._train_indices != approx(
                        clf.trees[n_estimator2]._train_indices)
                    assert clf.trees[n_estimator1]._valid_indices != approx(
                        clf.trees[n_estimator2]._valid_indices)

            # 2. Test that a fixed random_state makes bootstrap samples identical
            #    and that bootstrap samples differ when random_state is None
            clf1 = ForestClassifier(n_estimators=n_estimators,
                                    n_jobs=n_jobs,
                                    random_state=random_state)
            clf1.fit(X, y)
            clf2 = ForestClassifier(n_estimators=n_estimators,
                                    n_jobs=n_jobs,
                                    random_state=random_state)
            clf2.fit(X, y)
            for n_estimator in range(n_estimators):
                if random_state is None:
                    assert clf1.trees[n_estimator]._train_indices != approx(
                        clf2.trees[n_estimator]._train_indices)
                    assert clf1.trees[n_estimator]._valid_indices != approx(
                        clf2.trees[n_estimator]._valid_indices)
                else:
                    assert clf1.trees[n_estimator]._train_indices == approx(
                        clf2.trees[n_estimator]._train_indices)
                    assert clf1.trees[n_estimator]._valid_indices == approx(
                        clf2.trees[n_estimator]._valid_indices)

            # 3. Test that apply() gives exactly the same leaves (this checks that
            #    the trees are identical, i.e. that random column subsampling is
            #    correctly seeded) and that predictions match (or differ when
            #    random_state is None)
            clf1 = ForestClassifier(n_estimators=n_estimators,
                                    n_jobs=n_jobs,
                                    random_state=random_state)
            clf1.fit(X, y)
            clf2 = ForestClassifier(n_estimators=n_estimators,
                                    n_jobs=n_jobs,
                                    random_state=random_state)
            clf2.fit(X, y)
            if random_state is None:
                assert clf1.apply(X) != approx(clf2.apply(X))
                assert clf1.predict_proba(X) != approx(clf2.predict_proba(X))
            else:
                assert clf1.apply(X) == approx(clf2.apply(X))
                assert clf1.predict_proba(X) == approx(clf2.predict_proba(X))
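A short sketch of the reproducibility property these bootstrap tests check, assuming ForestClassifier is importable as in these snippets: a fixed random_state makes bootstrap samples and column subsampling identical across fits, so leaves and predictions match exactly.

import numpy as np
from sklearn.datasets import make_moons

# ForestClassifier is assumed to come from the library under test.
X, y = make_moons(n_samples=500, noise=0.2, random_state=0)

clf1 = ForestClassifier(n_estimators=5, random_state=123).fit(X, y)
clf2 = ForestClassifier(n_estimators=5, random_state=123).fit(X, y)

# Same random_state: same bootstrap and column subsampling seeds,
# hence identical leaves and predicted probabilities
assert np.allclose(clf1.apply(X), clf2.apply(X))
assert np.allclose(clf1.predict_proba(X), clf2.predict_proba(X))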
Example 27
        def do_test_bootstrap_again(n_estimators, n_jobs):
            # 4. When bootstrap seeds and column subsampling seeds are the same,
            #    the trees are all the same
            clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

            def _my_generate_random_states(self, n_states=None):
                self._random_states_bootstrap = np.ones(
                    (n_states or clf.n_estimators), dtype=np.int32)
                self._random_states_trees = np.ones(
                    (n_states or clf.n_estimators), dtype=np.int32)

            # Monkey patch the classifier
            clf._generate_random_states = types.MethodType(
                _my_generate_random_states, clf)
            clf.fit(X, y)
            leaves = clf.apply(X)
            for n_estimator1, n_estimator2 in product(range(n_estimators),
                                                      range(n_estimators)):
                if n_estimator1 < n_estimator2:
                    assert clf.trees[n_estimator1]._train_indices == approx(
                        clf.trees[n_estimator2]._train_indices)
                    assert clf.trees[n_estimator1]._valid_indices == approx(
                        clf.trees[n_estimator2]._valid_indices)
                    assert leaves[n_estimator1] == approx(leaves[n_estimator2])

            # 5. When bootstrap seeds are the same but column subsampling seeds are
            #    different, all the trees are different
            clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

            def _my_generate_random_states(self, n_states=None):
                # All bootstrap seeds are the same
                self._random_states_bootstrap = np.ones(
                    (n_states or clf.n_estimators), dtype=np.int32)
                # But column subsampling seeds are different
                self._random_states_trees = np.arange(n_states
                                                      or clf.n_estimators,
                                                      dtype=np.int32)

            # Monkey patch the classifier
            clf._generate_random_states = types.MethodType(
                _my_generate_random_states, clf)
            clf.fit(X, y)
            leaves = clf.apply(X)
            for n_estimator1, n_estimator2 in product(range(n_estimators),
                                                      range(n_estimators)):
                if n_estimator1 < n_estimator2:
                    assert clf.trees[n_estimator1]._train_indices == approx(
                        clf.trees[n_estimator2]._train_indices)
                    assert clf.trees[n_estimator1]._valid_indices == approx(
                        clf.trees[n_estimator2]._valid_indices)
                    assert leaves[n_estimator1] != approx(leaves[n_estimator2])

            # 6. When bootstrap seeds are different but column subsampling seeds are
            #    identical, all the trees are different
            clf = ForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs)

            def _my_generate_random_states(self, n_states=None):
                # Bootstrap seeds are all different
                self._random_states_bootstrap = np.arange(n_states
                                                          or clf.n_estimators,
                                                          dtype=np.int32)
                # But column subsampling seeds are identical
                self._random_states_trees = np.ones(
                    (n_states or clf.n_estimators, ), dtype=np.int32)

            # Monkey patch the classifier
            clf._generate_random_states = types.MethodType(
                _my_generate_random_states, clf)
            clf.fit(X, y)
            leaves = clf.apply(X)
            for n_estimator1, n_estimator2 in product(range(n_estimators),
                                                      range(n_estimators)):
                if n_estimator1 < n_estimator2:
                    assert clf.trees[n_estimator1]._train_indices != approx(
                        clf.trees[n_estimator2]._train_indices)
                    assert clf.trees[n_estimator1]._valid_indices != approx(
                        clf.trees[n_estimator2]._valid_indices)
                    assert leaves[n_estimator1] != approx(leaves[n_estimator2])
Example 28
    def test_dirichlet_switch(self):
        breast_cancer = self.breast_cancer
        X, y = breast_cancer["data"], breast_cancer["target"]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            shuffle=True,
                                                            stratify=y,
                                                            random_state=42,
                                                            test_size=0.3)

        clf1 = ForestClassifier(class_weight="balanced", random_state=42)
        clf2 = ForestClassifier(class_weight="balanced",
                                random_state=42,
                                dirichlet=2.0)

        clf1.fit(X_train, y_train)
        clf2.fit(X_train, y_train)
        y_score1 = clf1.predict_proba(X_test)
        y_score2 = clf2.predict_proba(X_test)

        assert np.max(np.abs(y_score1 - y_score2)) >= 0.01
        clf2.dirichlet = 0.5
        y_score2 = clf2.predict_proba(X_test)
        assert y_score1 == pytest.approx(y_score2, abs=1e-5)

        clf1.dirichlet = 1.1
        clf2.dirichlet = 1.1
        y_score1 = clf1.predict_proba(X_test)
        y_score2 = clf2.predict_proba(X_test)
        assert y_score1 == pytest.approx(y_score2, abs=1e-5)
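A minimal sketch of the behaviour this test relies on, assuming ForestClassifier is importable as in these snippets: dirichlet can be changed on an already fitted forest, and predict_proba then re-evaluates the stored trees with the new smoothing, without refitting.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# ForestClassifier is assumed to come from the library under test.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = ForestClassifier(class_weight="balanced", random_state=42, dirichlet=2.0)
clf.fit(X_train, y_train)
scores_strong_smoothing = clf.predict_proba(X_test)

# Lowering dirichlet only changes the leaf probability smoothing;
# no new call to fit() is needed before predicting again
clf.dirichlet = 0.5
scores_weak_smoothing = clf.predict_proba(X_test)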
Example 29
def test_forest_classifier_serialization(
    dataset_name,
    n_estimators,
    aggregation,
    class_weight,
    dirichlet,
    n_jobs,
    max_features,
    random_state,
    step,
    multiclass,
    cat_split_strategy,
):
    if dataset_name == "adult":
        X, y = load_adult(raw=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=random_state)
    elif dataset_name == "iris":
        iris = datasets.load_iris()
        X = iris.data
        y = iris.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 / 5, random_state=random_state)

    clf1 = ForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        multiclass=multiclass,
        max_bins=37,
        cat_split_strategy=cat_split_strategy,
        aggregation=aggregation,
        max_features=max_features,
        class_weight=class_weight,
        random_state=random_state,
        dirichlet=dirichlet,
        step=step,
    )
    clf1.fit(X_train, y_train)

    filename = "forest_classifier_on_iris.pkl"
    with open(filename, "wb") as f:
        pkl.dump(clf1, f)

    with open(filename, "rb") as f:
        clf2 = pkl.load(f)

    os.remove(filename)

    assert_forests_equal(clf1, clf2, is_classifier=True)

    y_pred1 = clf1.predict_proba(X_test)
    y_pred2 = clf2.predict_proba(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    np.testing.assert_equal(y_pred1, y_pred2)

    apply1 = clf1.apply(X_test)
    apply2 = clf2.apply(X_test)
    np.testing.assert_equal(apply1, apply2)
Example 30
    def test_performance_iris(self):
        iris = self.iris
        X, y = iris["data"], iris["target"]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            shuffle=True,
                                                            stratify=y,
                                                            random_state=42,
                                                            test_size=0.3)
        clf = ForestClassifier(class_weight="balanced", random_state=42)
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        assert roc_auc_score(y_test, y_score, multi_class="ovo") >= 0.985
        clf = ForestClassifier(random_state=42)
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        assert roc_auc_score(y_test, y_score, multi_class="ovo") >= 0.985

        clf = ForestClassifier(class_weight="balanced",
                               random_state=42,
                               criterion="entropy")
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        assert roc_auc_score(y_test, y_score, multi_class="ovo") >= 0.985
        clf = ForestClassifier(random_state=42, criterion="entropy")
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        assert roc_auc_score(y_test, y_score, multi_class="ovo") >= 0.985