Example #1
    def test_n_features(self):
        clf = AMFClassifier(n_classes=2)
        X = np.random.randn(2, 2)
        y = np.array([0.0, 1.0])
        clf.partial_fit(X, y)
        assert clf.n_features == 2
        with pytest.raises(ValueError,
                           match="`n_features` is a readonly attribute"):
            clf.n_features = 3
Example #2
def get_amf_decision(use_aggregation, n_estimators, split_pure, dirichlet,
                     step):
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
    )
    amf.partial_fit(X, y)
    zz = amf.predict_proba(X_mesh)[:, 1].reshape(xx.shape)
    return zz
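
Note that this helper, like the other decision-function helpers on this page, closes over module-level names (`n_classes`, `random_state`, `X`, `y`, `X_mesh`, `xx`) defined elsewhere in the original script, so the snippet is not self-contained.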
Example #3
    def test_performance_on_moons(self):
        n_samples = 300
        random_state = 42
        X, y = make_moons(n_samples=n_samples,
                          noise=0.25,
                          random_state=random_state)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=random_state)
        clf = AMFClassifier(n_classes=2, random_state=random_state)
        clf.partial_fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)
        score = roc_auc_score(y_test, y_pred[:, 1])
        # With this random_state, the score should be exactly 0.9709821428571429
        assert score > 0.97
Example #4
def get_amf_decision_batch(use_aggregation, n_estimators, split_pure,
                           dirichlet, step):
    # TODO: add a progress bar
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
    )
    amf.partial_fit(X_train, y_train)
    zz = amf.predict_proba(xy)[:, 1].reshape(grid_size, grid_size)
    return zz
Example #5
    def test_repr(self):
        amf = AMFClassifier(n_classes=3)
        assert (repr(amf) == "AMFClassifier(n_classes=3, n_estimators=10, "
                "step=1.0, loss='log', use_aggregation=True, "
                "dirichlet=0.01, split_pure=False, n_jobs=1, "
                "random_state=None, verbose=False)")

        amf.n_estimators = 42
        assert (repr(amf) == "AMFClassifier(n_classes=3, n_estimators=42, "
                "step=1.0, loss='log', use_aggregation=True, "
                "dirichlet=0.01, split_pure=False, n_jobs=1, "
                "random_state=None, verbose=False)")

        amf.verbose = False
        assert (repr(amf) == "AMFClassifier(n_classes=3, n_estimators=42, "
                "step=1.0, loss='log', use_aggregation=True, "
                "dirichlet=0.01, split_pure=False, n_jobs=1, "
                "random_state=None, verbose=False)")
Example #6
    def test_random_state_is_consistant(self):
        n_samples = 300
        random_state = 42
        X, y = make_moons(n_samples=n_samples,
                          noise=0.25,
                          random_state=random_state)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=random_state)

        clf = AMFClassifier(n_classes=2, random_state=random_state)
        clf.partial_fit(X_train, y_train)
        y_pred_1 = clf.predict_proba(X_test)

        clf = AMFClassifier(n_classes=2, random_state=random_state)
        clf.partial_fit(X_train, y_train)
        y_pred_2 = clf.predict_proba(X_test)

        assert y_pred_1 == approx(y_pred_2)
Example #7
    def test_random_state(self):
        parameter_test_with_min(
            AMFClassifier,
            parameter="random_state",
            valid_val=4,
            invalid_type_val=2.0,
            invalid_val=-1,
            min_value=0,
            min_value_str="0",
            mandatory=False,
            fixed_type=int,
            required_args={"n_classes": 2},
        )
        amf = AMFClassifier(n_classes=2)
        assert amf.random_state is None
        assert amf._random_state == -1
        amf.random_state = 1
        amf.random_state = None
        assert amf._random_state == -1
Example #8
def get_amf_decisions(use_aggregation, n_estimators, split_pure, dirichlet,
                      step):
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
    )
    zzs = []
    progress_bar = st.sidebar.progress(0)
    for it in range(1, n_samples_train + 1):
        amf.partial_fit(X[it - 1].reshape(1, 2), np.array([y[it - 1]]))
        zz = amf.predict_proba(X_mesh)[:, 1].reshape(xx.shape)
        zzs.append(zz)
        progress = int(100 * it / n_samples_train)
        progress_bar.progress(progress)
    return zzs
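
Stripped of the Streamlit progress bar and the module-level state, the per-sample loop above reduces to the following minimal sketch (assuming the usual top-level `from onelearn import AMFClassifier`; the dataset and seed are illustrative):

import numpy as np
from sklearn.datasets import make_moons
from onelearn import AMFClassifier  # assumed import path

X, y = make_moons(n_samples=200, noise=0.25, random_state=42)
amf = AMFClassifier(n_classes=2, random_state=42)
for i in range(X.shape[0]):
    # One sample at a time: partial_fit expects 2D features and a 1D label array.
    amf.partial_fit(X[i].reshape(1, -1), np.array([y[i]]))
proba = amf.predict_proba(X)  # shape (n_samples, n_classes)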
Example #9
def get_classifiers():
    return [
        (
            "AMF",
            AMFClassifier(
                n_classes=2,
                n_estimators=n_estimators,
                random_state=random_state,
                use_aggregation=True,
                split_pure=True,
            ),
        ),
        (
            "AMF(no agg)",
            AMFClassifier(
                n_classes=2,
                n_estimators=n_estimators,
                random_state=random_state,
                use_aggregation=False,
                split_pure=True,
            ),
        ),
        (
            "MF",
            MondrianForestClassifier(n_estimators=n_estimators,
                                     random_state=random_state),
        ),
        (
            "RF",
            RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state),
        ),
        (
            "ET",
            ExtraTreesClassifier(n_estimators=n_estimators,
                                 random_state=random_state),
        ),
    ]
Example #10
def get_classifiers_online(n_classes, random_state=42):
    use_aggregations = [True]
    n_estimatorss = [10]
    split_pures = [False]
    dirichlets = [None]
    learning_rates = [0.1]

    for (n_estimators, use_aggregation, split_pure,
         dirichlet) in product(n_estimatorss, use_aggregations, split_pures,
                               dirichlets):
        yield (
            # "AMF(nt=%s, ag=%s, sp=%s, di=%s)"
            # % (
            #     str(n_estimators),
            #     str(use_aggregation),
            #     str(split_pure),
            #     str(dirichlet),
            # ),
            "AMF",
            AMFClassifier(
                n_classes=n_classes,
                random_state=random_state,
                use_aggregation=use_aggregation,
                n_estimators=n_estimators,
                split_pure=split_pure,
                dirichlet=dirichlet,
                verbose=False,
            ),
        )

    yield "Dummy", OnlineDummyClassifier(n_classes=n_classes)

    for n_estimators in n_estimatorss:
        yield (
            "MF",
            MondrianForestClassifier(n_estimators=n_estimators,
                                     random_state=random_state),
        )

    for learning_rate in learning_rates:
        yield (
            # "SGD(%s)" % str(learning_rate),
            "SGD",
            SGDClassifier(
                loss="log",
                learning_rate="constant",
                eta0=learning_rate,
                random_state=random_state,
            ),
        )
Example #11
def get_classifiers_n_trees_comparison(n_classes, random_state=42):
    use_aggregations = [True]
    n_estimatorss = [1, 2, 5, 10, 20, 50]
    split_pures = [False]
    dirichlets = [None]
    for (n_estimators, use_aggregation, split_pure,
         dirichlet) in product(n_estimatorss, use_aggregations, split_pures,
                               dirichlets):
        yield (
            "AMF(nt=%s)" % str(n_estimators),
            AMFClassifier(
                n_classes=n_classes,
                random_state=random_state,
                use_aggregation=use_aggregation,
                n_estimators=n_estimators,
                split_pure=split_pure,
                dirichlet=dirichlet,
                verbose=False,
            ),
        )

    for n_estimators in n_estimatorss:
        yield (
            "MF(nt=%s)" % str(n_estimators),
            MondrianForestClassifier(n_estimators=n_estimators,
                                     random_state=random_state),
        )

    for n_estimators in n_estimatorss:
        yield (
            "RF(nt=%s)" % str(n_estimators),
            RandomForestClassifier(
                n_estimators=n_estimators,
                class_weight=None,
                random_state=random_state,
                n_jobs=1,
            ),
        )

    for n_estimators in n_estimatorss:
        yield (
            "ET(nt=%s)" % str(n_estimators),
            ExtraTreesClassifier(
                n_estimators=n_estimators,
                class_weight=None,
                random_state=random_state,
                n_jobs=1,
            ),
        )
Example #12
def precompile_amf():
    X, y = make_blobs(n_samples=5)
    n_classes = int(y.max() + 1)
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=0,
        use_aggregation=True,
        n_estimators=1,
        split_pure=False,
        dirichlet=0.5,
        step=1.0,
    )
    amf.partial_fit(X, y)
    amf.predict_proba(X)
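
This warm-up helper fits and predicts once on a tiny throwaway dataset. Given the `no_python` attribute visible in the tests on this page, `AMFClassifier` is presumably compiled just in time on first use, so a single `partial_fit`/`predict_proba` call keeps later benchmarks from absorbing the compilation cost.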
Example #13
    def test_predict_proba(self):
        clf = AMFClassifier(n_classes=2)
        with pytest.raises(
                RuntimeError,
                match="You must call `partial_fit` before asking for predictions",
        ):
            X_test = np.random.randn(2, 3)
            clf.predict_proba(X_test)

        with pytest.raises(ValueError) as exc_info:
            X = np.random.randn(2, 2)
            y = np.array([0.0, 1.0])
            clf.partial_fit(X, y)
            X_test = np.random.randn(2, 3)
            clf.predict_proba(X_test)
        assert exc_info.type is ValueError
        assert exc_info.value.args[0] == (
            "`partial_fit` was called with n_features=%d while predictions "
            "are asked with n_features=%d" % (clf.n_features, 3)
        )
Example #14
    def test_predict_proba_tree_match_predict_proba(self):
        n_samples = 300
        n_classes = 2
        n_estimators = 10
        random_state = 42
        X, y = make_moons(n_samples=n_samples,
                          noise=0.25,
                          random_state=random_state)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=random_state)
        clf = AMFClassifier(n_classes=2,
                            n_estimators=n_estimators,
                            random_state=random_state)
        clf.partial_fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)
        y_pred_tree = np.empty((y_pred.shape[0], n_classes, n_estimators))
        for idx_tree in range(n_estimators):
            y_pred_tree[:, :, idx_tree] = clf.predict_proba_tree(X_test, idx_tree)

        assert y_pred == approx(y_pred_tree.mean(axis=2), 1e-6)
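
In other words, the forest's `predict_proba` output matches, up to a small tolerance, the average over the `n_estimators` trees of the per-tree `predict_proba_tree` outputs.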
Example #15
def get_classifiers_batch(n_classes, random_state=42):
    use_aggregations = [True]
    n_estimatorss = [10]
    split_pures = [False]
    dirichlets = [None]
    learning_rates = [1e-1]

    for (n_estimators, use_aggregation, split_pure,
         dirichlet) in product(n_estimatorss, use_aggregations, split_pures,
                               dirichlets):
        yield (
            # "AMF(nt=%s, ag=%s, sp=%s, di=%s)"
            #           % (
            #           str(n_estimators),
            #       str(use_aggregation),
            #       str(split_pure),
            #       str(dirichlet),
            # ),
            "AMF",
            AMFClassifier(
                n_classes=n_classes,
                random_state=random_state,
                use_aggregation=use_aggregation,
                n_estimators=n_estimators,
                split_pure=split_pure,
                dirichlet=dirichlet,
                verbose=False,
            ),
        )

    for n_estimators in n_estimatorss:
        yield (
            # "MF(nt=%s)" % str(n_estimators),
            "MF",
            MondrianForestClassifier(n_estimators=n_estimators,
                                     random_state=random_state),
        )

    for n_estimators in n_estimatorss:
        yield (
            # "RF(nt=%s)" % str(n_estimators),
            "RF",
            RandomForestClassifier(
                n_estimators=n_estimators,
                class_weight=None,
                random_state=random_state,
                n_jobs=1,
            ),
        )

    for n_estimators in n_estimatorss:
        yield (
            # "ET(nt=%s)" % str(n_estimators),
            "ET",
            ExtraTreesClassifier(
                n_estimators=n_estimators,
                class_weight=None,
                random_state=random_state,
                n_jobs=1,
            ),
        )

    for learning_rate in learning_rates:
        yield (
            # "SGD(%s)" % str(learning_rate),
            "SGD",
            SGDClassifier(
                loss="log",
                learning_rate="constant",
                eta0=learning_rate,
                random_state=random_state,
            ),
        )
Example #16
def test_amf_classifier_serialization():
    """Trains an AMFClassifier on iris, saves it and loads it again. Checks
    that everything is the same between the original and the loaded forest.
    """
    random_state = 42
    n_estimators = 1
    n_classes = 3

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.2, random_state=random_state)
    clf1 = AMFClassifier(n_estimators=n_estimators,
                         n_classes=n_classes,
                         random_state=random_state)
    clf1.partial_fit(X_train_1, y_train_1)

    filename = "amf_on_iris.pkl"
    clf1.save(filename)
    clf2 = AMFClassifier.load(filename)
    os.remove(filename)

    def test_forests_are_equal(clf1, clf2):
        # Test samples
        samples1 = clf1.no_python.samples
        samples2 = clf2.no_python.samples
        assert samples1.n_samples_increment == samples2.n_samples_increment
        n_samples1 = samples1.n_samples
        n_samples2 = samples2.n_samples
        assert n_samples1 == n_samples2
        assert samples1.n_samples_capacity == samples2.n_samples_capacity
        assert np.all(
            samples1.labels[:n_samples1] == samples2.labels[:n_samples2])
        assert np.all(
            samples1.features[:n_samples1] == samples2.features[:n_samples2])

        # Test nopython.trees
        for n_estimator in range(n_estimators):
            tree1 = clf1.no_python.trees[n_estimator]
            tree2 = clf2.no_python.trees[n_estimator]
            # Test tree attributes
            assert tree1.n_features == tree2.n_features
            assert tree1.step == tree2.step
            assert tree1.loss == tree2.loss
            assert tree1.use_aggregation == tree2.use_aggregation
            assert tree1.iteration == tree2.iteration
            assert tree1.n_classes == tree2.n_classes
            assert tree1.dirichlet == tree2.dirichlet
            assert np.all(tree1.intensities == tree2.intensities)
            # Test tree.nodes
            nodes1 = tree1.nodes
            nodes2 = tree2.nodes
            assert np.all(nodes1.index == nodes2.index)
            assert np.all(nodes1.is_leaf == nodes2.is_leaf)
            assert np.all(nodes1.depth == nodes2.depth)
            assert np.all(nodes1.n_samples == nodes2.n_samples)
            assert np.all(nodes1.parent == nodes2.parent)
            assert np.all(nodes1.left == nodes2.left)
            assert np.all(nodes1.right == nodes2.right)
            assert np.all(nodes1.feature == nodes2.feature)
            assert np.all(nodes1.weight == nodes2.weight)
            assert np.all(nodes1.log_weight_tree == nodes2.log_weight_tree)
            assert np.all(nodes1.threshold == nodes2.threshold)
            assert np.all(nodes1.time == nodes2.time)
            assert np.all(nodes1.memory_range_min == nodes2.memory_range_min)
            assert np.all(nodes1.memory_range_max == nodes2.memory_range_max)
            assert np.all(nodes1.n_features == nodes2.n_features)
            assert nodes1.n_nodes == nodes2.n_nodes
            assert nodes1.n_samples_increment == nodes2.n_samples_increment
            assert nodes1.n_nodes_capacity == nodes2.n_nodes_capacity
            assert np.all(nodes1.counts == nodes2.counts)
            assert nodes1.n_classes == nodes2.n_classes

    test_forests_are_equal(clf1, clf2)

    # Test predict proba
    y_pred = clf1.predict_proba(X_test)
    y_pred_pkl = clf2.predict_proba(X_test)
    assert np.all(y_pred == y_pred_pkl)

    clf1.partial_fit(X_train_2, y_train_2)
    clf2.partial_fit(X_train_2, y_train_2)
    test_forests_are_equal(clf1, clf2)

    y_pred = clf1.predict_proba(X_test)
    y_pred_pkl = clf2.predict_proba(X_test)
    assert np.all(y_pred == y_pred_pkl)
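
For reference, the save/load round-trip exercised above boils down to a few lines. A minimal sketch, assuming the usual top-level `from onelearn import AMFClassifier` (the file name and dataset are illustrative):

import os
import numpy as np
from sklearn.datasets import load_iris
from onelearn import AMFClassifier  # assumed import path

X, y = load_iris(return_X_y=True)
clf = AMFClassifier(n_classes=3, n_estimators=1, random_state=42)
clf.partial_fit(X, y)

path = "amf_checkpoint.pkl"  # illustrative file name
clf.save(path)
clf2 = AMFClassifier.load(path)
os.remove(path)

# The reloaded forest predicts identically to the original.
assert np.all(clf.predict_proba(X) == clf2.predict_proba(X))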
Example #17
    def test_partial_fit(self):
        clf = AMFClassifier(n_classes=2)
        n_features = 4
        X = np.random.randn(2, n_features)
        y = np.array([0.0, 1.0])
        clf.partial_fit(X, y)
        assert clf.n_features == n_features
        assert clf.no_python.iteration == 2
        assert clf.no_python.samples.n_samples == 2
        assert clf.no_python.n_features == n_features

        with pytest.raises(ValueError) as exc_info:
            X = np.random.randn(2, 3)
            y = np.array([0.0, 1.0])
            clf.partial_fit(X, y)
        assert exc_info.type is ValueError
        assert (
            exc_info.value.args[0] == "`partial_fit` was first called with "
            "n_features=4 while n_features=3 in this call")

        with pytest.raises(
                ValueError,
                match="All the values in `y` must be non-negative",
        ):
            clf = AMFClassifier(n_classes=2)
            X = np.random.randn(2, n_features)
            y = np.array([0.0, -1.0])
            clf.partial_fit(X, y)

        with pytest.raises(ValueError) as exc_info:
            clf = AMFClassifier(n_classes=2)
            X = np.random.randn(2, 3)
            y = np.array([0.0, 2.0])
            clf.partial_fit(X, y)
        assert exc_info.type is ValueError
        assert exc_info.value.args[0] == "n_classes=2 while y.max()=2"
Example #18
    logging.info("Dataset %s." % dataset_name)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify=y,
                                                        test_size=0.3,
                                                        random_state=42)

    n_classes = int(y.max() + 1)

    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        # n_samples_increment=,
        step=step,
        verbose=False,
    )
    ofc = OnlineForestClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
        verbose=False,
    )
Example #19
    def test_loss(self):
        amf = AMFClassifier(n_classes=2)
        assert amf.loss == "log"
        amf.loss = "other loss"
        assert amf.loss == "log"
Example #20
logging.info("Simulation of the data")
X, y = make_moons(n_samples=n_samples, noise=0.2, random_state=random_state)

logging.info("Train/Test splitting")
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=random_state)

logging.info("Computation of the meshgrid")
xx, yy, X_mesh = get_mesh(X)

clf = AMFClassifier(
    n_classes=n_classes,
    n_estimators=100,
    random_state=random_state,
    split_pure=True,
    use_aggregation=True,
)

n_plots = len(save_iterations)
n_fig = 0
save_iterations = [0, *save_iterations]

fig, axes = plt.subplots(nrows=2, ncols=n_plots, figsize=(3 * n_plots, 6))

logging.info("Launching iterations")
bar = trange(n_plots, desc="Plotting iterations", leave=True)

for start, end in zip(save_iterations[:-1], save_iterations[1:]):
    X_iter = X_train[start:end]
    # ... (the remainder of this example was lost in extraction)

Example #21
# The opening of this example was also lost in extraction. The fragment below
# appears to come from inside the `plot_forest_effect` helper that the script
# calls at the end: it plots each tree's decision function, then the forest's.
        plot_contour_binary_classif(
            ax, xx, yy, Z, title="Tree #%d" % (idx_tree + 1), norm=norm,
            levels=levels
        )

    ax = plt.subplot(2, n_estimators // 2 + 1, n_estimators + 2)
    Z = forest.predict_proba(X_mesh)[:, 1].reshape(xx.shape)
    plot_contour_binary_classif(ax, xx, yy, Z, title="Forest", norm=norm,
                                levels=levels)
    plt.tight_layout()

n_samples = 100
n_features = 2
n_classes = 2
random_state = 42
dataset = make_moons(n_samples=n_samples, noise=0.15, random_state=random_state)

n_estimators = 10
amf = AMFClassifier(
    n_classes=n_classes,
    n_estimators=n_estimators,
    random_state=random_state,
    use_aggregation=True,
    split_pure=True,
)

logging.info("Building the graph...")
plot_forest_effect(amf, dataset)

plt.savefig("forest_effect.pdf")
logging.info("Saved the forest effect plot in forest_effect.pdf")
Example #22
def get_amf_trees_and_decisions(use_aggregation, n_estimators, split_pure,
                                dirichlet, step):
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
    )
    zzs = []
    df_trees = []
    df_datas = []
    progress_bar = st.sidebar.progress(0)
    for it in range(1, n_samples_train + 1):
        # Append the current data
        df_datas.append(df_data[:it])

        # Partial fit AMFClassifier
        amf.partial_fit(X_train[it - 1].reshape(1, 2),
                        np.array([y_train[it - 1]]))

        # Get the tree
        df_tree = amf.get_nodes_df(0)
        df_tree["min_x"] = df_tree["memory_range_min"].apply(lambda t: t[0])
        df_tree["min_y"] = df_tree["memory_range_min"].apply(lambda t: t[1])
        df_tree["max_x"] = df_tree["memory_range_max"].apply(lambda t: t[0])
        df_tree["max_y"] = df_tree["memory_range_max"].apply(lambda t: t[1])
        df_tree["count_0"] = df_tree["counts"].apply(lambda t: t[0])
        df_tree["count_1"] = df_tree["counts"].apply(lambda t: t[1])
        df_tree.sort_values(by=["depth", "parent", "id"], inplace=True)
        # max_depth = df.depth.max()
        max_depth = 10
        n_nodes = df_tree.shape[0]
        x = np.zeros(n_nodes)
        x[0] = 0.5
        indexes = df_tree["id"].values
        df_tree["x"] = x
        df_tree["y"] = max_depth - df_tree["depth"]
        df_tree["x0"] = df_tree["x"]
        df_tree["y0"] = df_tree["y"]
        for node in range(1, n_nodes):
            index = indexes[node]
            parent = df_tree.at[index, "parent"]
            depth = df_tree.at[index, "depth"]
            left_parent = df_tree.at[parent, "left"]
            x_parent = df_tree.at[parent, "x"]
            if left_parent == index:
                # It's a left node
                df_tree.at[index, "x"] = x_parent - 0.5**(depth + 1)
            else:
                df_tree.at[index, "x"] = x_parent + 0.5**(depth + 1)
            df_tree.at[index, "x0"] = x_parent
            df_tree.at[index, "y0"] = df_tree.at[parent, "y"]

        df_tree["color"] = df_tree["is_leaf"].astype("str")
        df_tree.replace({"color": {
            "False": "blue",
            "True": "green"
        }},
                        inplace=True)
        df_trees.append(df_tree)

        # Compute the decision function
        zz = amf.predict_proba(xy)[:, 1].reshape(grid_size, grid_size)
        zzs.append(zz)
        progress = int(100 * it / n_samples_train)
        progress_bar.progress(progress)

    return zzs, df_datas, df_trees