def test_n_features(self):
    clf = AMFClassifier(n_classes=2)
    X = np.random.randn(2, 2)
    y = np.array([0.0, 1.0])
    clf.partial_fit(X, y)
    assert clf.n_features == 2
    with pytest.raises(ValueError,
                       match="`n_features` is a readonly attribute"):
        clf.n_features = 3
def get_amf_decision(use_aggregation, n_estimators, split_pure, dirichlet,
                     step):
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
    )
    amf.partial_fit(X, y)
    zz = amf.predict_proba(X_mesh)[:, 1].reshape(xx.shape)
    return zz
def test_performance_on_moons(self):
    n_samples = 300
    random_state = 42
    X, y = make_moons(n_samples=n_samples, noise=0.25,
                      random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=random_state)
    clf = AMFClassifier(n_classes=2, random_state=random_state)
    clf.partial_fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)
    score = roc_auc_score(y_test, y_pred[:, 1])
    # With this random_state, the score should be exactly 0.9709821428571429
    assert score > 0.97
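
# AMFClassifier is an online model: `partial_fit` can be fed one sample at a
# time instead of a full batch (the decision-plot helpers below do exactly
# that). A minimal streaming sketch, assuming the same moons split as above;
# it should build the same forest as the single batched call.
def sketch_online_learning_on_moons(X_train, y_train, X_test):
    clf = AMFClassifier(n_classes=2, random_state=42)
    for i in range(X_train.shape[0]):
        clf.partial_fit(X_train[i].reshape(1, -1), np.array([y_train[i]]))
    return clf.predict_proba(X_test)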
def get_amf_decision_batch(use_aggregation, n_estimators, split_pure,
                           dirichlet, step):
    # TODO: add a progress bar
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
    )
    amf.partial_fit(X_train, y_train)
    zz = amf.predict_proba(xy)[:, 1].reshape(grid_size, grid_size)
    return zz
def test_repr(self):
    amf = AMFClassifier(n_classes=3)
    assert (repr(amf) == "AMFClassifier(n_classes=3, n_estimators=10, "
            "step=1.0, loss='log', use_aggregation=True, "
            "dirichlet=0.01, split_pure=False, n_jobs=1, "
            "random_state=None, verbose=False)")

    amf.n_estimators = 42
    assert (repr(amf) == "AMFClassifier(n_classes=3, n_estimators=42, "
            "step=1.0, loss='log', use_aggregation=True, "
            "dirichlet=0.01, split_pure=False, n_jobs=1, "
            "random_state=None, verbose=False)")

    amf.verbose = False
    assert (repr(amf) == "AMFClassifier(n_classes=3, n_estimators=42, "
            "step=1.0, loss='log', use_aggregation=True, "
            "dirichlet=0.01, split_pure=False, n_jobs=1, "
            "random_state=None, verbose=False)")
def test_random_state_is_consistent(self):
    n_samples = 300
    random_state = 42
    X, y = make_moons(n_samples=n_samples, noise=0.25,
                      random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=random_state)

    clf = AMFClassifier(n_classes=2, random_state=random_state)
    clf.partial_fit(X_train, y_train)
    y_pred_1 = clf.predict_proba(X_test)

    clf = AMFClassifier(n_classes=2, random_state=random_state)
    clf.partial_fit(X_train, y_train)
    y_pred_2 = clf.predict_proba(X_test)

    assert y_pred_1 == approx(y_pred_2)
def test_random_state(self):
    parameter_test_with_min(
        AMFClassifier,
        parameter="random_state",
        valid_val=4,
        invalid_type_val=2.0,
        invalid_val=-1,
        min_value=0,
        min_value_str="0",
        mandatory=False,
        fixed_type=int,
        required_args={"n_classes": 2},
    )
    amf = AMFClassifier(n_classes=2)
    assert amf.random_state is None
    assert amf._random_state == -1
    amf.random_state = 1
    amf.random_state = None
    assert amf._random_state == -1
def get_amf_decisions(use_aggregation, n_estimators, split_pure, dirichlet,
                      step):
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
    )
    zzs = []
    progress_bar = st.sidebar.progress(0)
    for it in range(1, n_samples_train + 1):
        amf.partial_fit(X[it - 1].reshape(1, 2), np.array([y[it - 1]]))
        zz = amf.predict_proba(X_mesh)[:, 1].reshape(xx.shape)
        zzs.append(zz)
        progress = int(100 * it / n_samples_train)
        progress_bar.progress(progress)
    return zzs
def get_classifiers():
    return [
        (
            "AMF",
            AMFClassifier(
                n_classes=2,
                n_estimators=n_estimators,
                random_state=random_state,
                use_aggregation=True,
                split_pure=True,
            ),
        ),
        (
            "AMF(no agg)",
            AMFClassifier(
                n_classes=2,
                n_estimators=n_estimators,
                random_state=random_state,
                use_aggregation=False,
                split_pure=True,
            ),
        ),
        (
            "MF",
            MondrianForestClassifier(n_estimators=n_estimators,
                                     random_state=random_state),
        ),
        (
            "RF",
            RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state),
        ),
        (
            "ET",
            ExtraTreesClassifier(n_estimators=n_estimators,
                                 random_state=random_state),
        ),
    ]
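
# A sketch of how the (name, classifier) pairs above might be consumed.
# Hedged: the data and the scoring below are placeholders, and we assume the
# Mondrian forest accepts `partial_fit(X, y)` without an explicit `classes`
# argument; the scikit-learn batch models are trained with `fit` instead.
def sketch_compare_classifiers():
    from sklearn.datasets import make_moons
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    X, y = make_moons(n_samples=500, noise=0.25, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    for name, clf in get_classifiers():
        if hasattr(clf, "partial_fit"):
            clf.partial_fit(X_train, y_train)  # online models (AMF, MF)
        else:
            clf.fit(X_train, y_train)  # batch models (RF, ET)
        auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        print("%-12s test AUC: %.4f" % (name, auc))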
def get_classifiers_online(n_classes, random_state=42):
    use_aggregations = [True]
    n_estimatorss = [10]
    split_pures = [False]
    dirichlets = [None]
    learning_rates = [0.1]
    for (n_estimators, use_aggregation, split_pure,
         dirichlet) in product(n_estimatorss, use_aggregations, split_pures,
                               dirichlets):
        yield (
            # Verbose variant: "AMF(nt=%s, ag=%s, sp=%s, di=%s)"
            # % (n_estimators, use_aggregation, split_pure, dirichlet),
            "AMF",
            AMFClassifier(
                n_classes=n_classes,
                random_state=random_state,
                use_aggregation=use_aggregation,
                n_estimators=n_estimators,
                split_pure=split_pure,
                dirichlet=dirichlet,
                verbose=False,
            ),
        )

    yield "Dummy", OnlineDummyClassifier(n_classes=n_classes)

    for n_estimators in n_estimatorss:
        yield (
            "MF",
            MondrianForestClassifier(n_estimators=n_estimators,
                                     random_state=random_state),
        )

    for learning_rate in learning_rates:
        yield (
            # Verbose variant: "SGD(%s)" % str(learning_rate),
            "SGD",
            SGDClassifier(
                loss="log",
                learning_rate="constant",
                eta0=learning_rate,
                random_state=random_state,
            ),
        )
def get_classifiers_n_trees_comparison(n_classes, random_state=42):
    use_aggregations = [True]
    n_estimatorss = [1, 2, 5, 10, 20, 50]
    split_pures = [False]
    dirichlets = [None]
    for (n_estimators, use_aggregation, split_pure,
         dirichlet) in product(n_estimatorss, use_aggregations, split_pures,
                               dirichlets):
        yield (
            "AMF(nt=%s)" % str(n_estimators),
            AMFClassifier(
                n_classes=n_classes,
                random_state=random_state,
                use_aggregation=use_aggregation,
                n_estimators=n_estimators,
                split_pure=split_pure,
                dirichlet=dirichlet,
                verbose=False,
            ),
        )

    for n_estimators in n_estimatorss:
        yield (
            "MF(nt=%s)" % str(n_estimators),
            MondrianForestClassifier(n_estimators=n_estimators,
                                     random_state=random_state),
        )

    for n_estimators in n_estimatorss:
        yield (
            "RF(nt=%s)" % str(n_estimators),
            RandomForestClassifier(
                n_estimators=n_estimators,
                class_weight=None,
                random_state=random_state,
                n_jobs=1,
            ),
        )

    for n_estimators in n_estimatorss:
        yield (
            "ET(nt=%s)" % str(n_estimators),
            ExtraTreesClassifier(
                n_estimators=n_estimators,
                class_weight=None,
                random_state=random_state,
                n_jobs=1,
            ),
        )
def precompile_amf():
    X, y = make_blobs(n_samples=5)
    n_classes = int(y.max() + 1)
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=0,
        use_aggregation=True,
        n_estimators=1,
        split_pure=False,
        dirichlet=0.5,
        step=1.0,
    )
    amf.partial_fit(X, y)
    amf.predict_proba(X)
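
# Why a precompile step (an assumption, based on the `no_python` attribute
# used elsewhere: the AMF core appears to be numba-compiled): the first
# `partial_fit`/`predict_proba` call pays a one-off JIT compilation cost, so
# warming up on toy blobs keeps that cost out of any timed section. A
# hypothetical timing harness:
def sketch_timed_fit(X_train, y_train):
    from time import time

    precompile_amf()  # trigger compilation on toy data first
    amf = AMFClassifier(n_classes=2, random_state=42)
    t0 = time()
    amf.partial_fit(X_train, y_train)
    print("partial_fit: %.3f s" % (time() - t0))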
def test_predict_proba(self):
    clf = AMFClassifier(n_classes=2)
    with pytest.raises(
            RuntimeError,
            match="You must call `partial_fit` before asking for predictions",
    ):
        X_test = np.random.randn(2, 3)
        clf.predict_proba(X_test)

    with pytest.raises(ValueError) as exc_info:
        X = np.random.randn(2, 2)
        y = np.array([0.0, 1.0])
        clf.partial_fit(X, y)
        X_test = np.random.randn(2, 3)
        clf.predict_proba(X_test)
    assert exc_info.type is ValueError
    assert exc_info.value.args[0] == (
        "`partial_fit` was called with n_features=%d while predictions "
        "are asked with n_features=%d" % (clf.n_features, 3))
def test_predict_proba_tree_match_predict_proba(self):
    n_samples = 300
    n_classes = 2
    n_estimators = 10
    random_state = 42
    X, y = make_moons(n_samples=n_samples, noise=0.25,
                      random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=random_state)
    clf = AMFClassifier(n_classes=n_classes,
                        n_estimators=n_estimators,
                        random_state=random_state)
    clf.partial_fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)
    y_pred_tree = np.empty((y_pred.shape[0], n_classes, n_estimators))
    for idx_tree in range(n_estimators):
        y_pred_tree[:, :, idx_tree] = clf.predict_proba_tree(X_test, idx_tree)
    # The forest prediction is the average of the per-tree predictions
    assert y_pred == approx(y_pred_tree.mean(axis=2), 1e-6)
def get_classifiers_batch(n_classes, random_state=42):
    use_aggregations = [True]
    n_estimatorss = [10]
    split_pures = [False]
    dirichlets = [None]
    learning_rates = [1e-1]
    for (n_estimators, use_aggregation, split_pure,
         dirichlet) in product(n_estimatorss, use_aggregations, split_pures,
                               dirichlets):
        yield (
            # Verbose variant: "AMF(nt=%s, ag=%s, sp=%s, di=%s)"
            # % (n_estimators, use_aggregation, split_pure, dirichlet),
            "AMF",
            AMFClassifier(
                n_classes=n_classes,
                random_state=random_state,
                use_aggregation=use_aggregation,
                n_estimators=n_estimators,
                split_pure=split_pure,
                dirichlet=dirichlet,
                verbose=False,
            ),
        )

    for n_estimators in n_estimatorss:
        yield (
            # Verbose variant: "MF(nt=%s)" % str(n_estimators),
            "MF",
            MondrianForestClassifier(n_estimators=n_estimators,
                                     random_state=random_state),
        )

    for n_estimators in n_estimatorss:
        yield (
            # Verbose variant: "RF(nt=%s)" % str(n_estimators),
            "RF",
            RandomForestClassifier(
                n_estimators=n_estimators,
                class_weight=None,
                random_state=random_state,
                n_jobs=1,
            ),
        )

    for n_estimators in n_estimatorss:
        yield (
            # Verbose variant: "ET(nt=%s)" % str(n_estimators),
            "ET",
            ExtraTreesClassifier(
                n_estimators=n_estimators,
                class_weight=None,
                random_state=random_state,
                n_jobs=1,
            ),
        )

    for learning_rate in learning_rates:
        yield (
            # Verbose variant: "SGD(%s)" % str(learning_rate),
            "SGD",
            SGDClassifier(
                loss="log",
                learning_rate="constant",
                eta0=learning_rate,
                random_state=random_state,
            ),
        )
def test_amf_classifier_serialization():
    """Trains an AMFClassifier on iris, saves it and loads it again.

    Checks that everything is the same between the original and the loaded
    forest.
    """
    random_state = 42
    n_estimators = 1
    n_classes = 3
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.2, random_state=random_state)

    clf1 = AMFClassifier(n_estimators=n_estimators,
                         n_classes=n_classes,
                         random_state=random_state)
    clf1.partial_fit(X_train_1, y_train_1)

    filename = "amf_on_iris.pkl"
    clf1.save(filename)
    clf2 = AMFClassifier.load(filename)
    os.remove(filename)

    def test_forests_are_equal(clf1, clf2):
        # Test samples
        samples1 = clf1.no_python.samples
        samples2 = clf2.no_python.samples
        assert samples1.n_samples_increment == samples2.n_samples_increment
        n_samples1 = samples1.n_samples
        n_samples2 = samples2.n_samples
        assert n_samples1 == n_samples2
        assert samples1.n_samples_capacity == samples2.n_samples_capacity
        assert np.all(
            samples1.labels[:n_samples1] == samples2.labels[:n_samples2])
        assert np.all(
            samples1.features[:n_samples1] == samples2.features[:n_samples2])

        # Test no_python.trees
        for n_estimator in range(n_estimators):
            tree1 = clf1.no_python.trees[n_estimator]
            tree2 = clf2.no_python.trees[n_estimator]

            # Test tree attributes
            assert tree1.n_features == tree2.n_features
            assert tree1.step == tree2.step
            assert tree1.loss == tree2.loss
            assert tree1.use_aggregation == tree2.use_aggregation
            assert tree1.iteration == tree2.iteration
            assert tree1.n_classes == tree2.n_classes
            assert tree1.dirichlet == tree2.dirichlet
            assert np.all(tree1.intensities == tree2.intensities)

            # Test tree.nodes
            nodes1 = tree1.nodes
            nodes2 = tree2.nodes
            assert np.all(nodes1.index == nodes2.index)
            assert np.all(nodes1.is_leaf == nodes2.is_leaf)
            assert np.all(nodes1.depth == nodes2.depth)
            assert np.all(nodes1.n_samples == nodes2.n_samples)
            assert np.all(nodes1.parent == nodes2.parent)
            assert np.all(nodes1.left == nodes2.left)
            assert np.all(nodes1.right == nodes2.right)
            assert np.all(nodes1.feature == nodes2.feature)
            assert np.all(nodes1.weight == nodes2.weight)
            assert np.all(nodes1.log_weight_tree == nodes2.log_weight_tree)
            assert np.all(nodes1.threshold == nodes2.threshold)
            assert np.all(nodes1.time == nodes2.time)
            assert np.all(nodes1.memory_range_min == nodes2.memory_range_min)
            assert np.all(nodes1.memory_range_max == nodes2.memory_range_max)
            assert np.all(nodes1.n_features == nodes2.n_features)
            assert nodes1.n_nodes == nodes2.n_nodes
            assert nodes1.n_samples_increment == nodes2.n_samples_increment
            assert nodes1.n_nodes_capacity == nodes2.n_nodes_capacity
            assert np.all(nodes1.counts == nodes2.counts)
            assert nodes1.n_classes == nodes2.n_classes

    test_forests_are_equal(clf1, clf2)

    # Test predict_proba
    y_pred = clf1.predict_proba(X_test)
    y_pred_pkl = clf2.predict_proba(X_test)
    assert np.all(y_pred == y_pred_pkl)

    # Continue training both forests and check that they stay identical
    clf1.partial_fit(X_train_2, y_train_2)
    clf2.partial_fit(X_train_2, y_train_2)
    test_forests_are_equal(clf1, clf2)
    y_pred = clf1.predict_proba(X_test)
    y_pred_pkl = clf2.predict_proba(X_test)
    assert np.all(y_pred == y_pred_pkl)
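
# For reference, the save/load round trip exercised by the test boils down to
# a few lines (a minimal sketch reusing the `save`/`load` API shown above;
# the file name is arbitrary).
def sketch_save_load_round_trip(X_train, y_train, X_test):
    clf = AMFClassifier(n_classes=3, n_estimators=1, random_state=42)
    clf.partial_fit(X_train, y_train)
    clf.save("amf.pkl")
    clf_loaded = AMFClassifier.load("amf.pkl")
    os.remove("amf.pkl")
    # Predictions of the original and the loaded forest should match exactly
    assert np.all(
        clf.predict_proba(X_test) == clf_loaded.predict_proba(X_test))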
def test_partial_fit(self):
    clf = AMFClassifier(n_classes=2)
    n_features = 4
    X = np.random.randn(2, n_features)
    y = np.array([0.0, 1.0])
    clf.partial_fit(X, y)
    assert clf.n_features == n_features
    assert clf.no_python.iteration == 2
    assert clf.no_python.samples.n_samples == 2
    assert clf.no_python.n_features == n_features

    with pytest.raises(ValueError) as exc_info:
        X = np.random.randn(2, 3)
        y = np.array([0.0, 1.0])
        clf.partial_fit(X, y)
    assert exc_info.type is ValueError
    assert (exc_info.value.args[0] == "`partial_fit` was first called with "
            "n_features=4 while n_features=3 in this call")

    with pytest.raises(
            ValueError,
            match="All the values in `y` must be non-negative",
    ):
        clf = AMFClassifier(n_classes=2)
        X = np.random.randn(2, n_features)
        y = np.array([0.0, -1.0])
        clf.partial_fit(X, y)

    with pytest.raises(ValueError) as exc_info:
        clf = AMFClassifier(n_classes=2)
        X = np.random.randn(2, 3)
        y = np.array([0.0, 2.0])
        clf.partial_fit(X, y)
    assert exc_info.type is ValueError
    assert exc_info.value.args[0] == "n_classes=2 while y.max()=2"
logging.info("Dataset %s." % dataset_name) X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42) n_classes = int(y.max() + 1) amf = AMFClassifier( n_classes=n_classes, random_state=random_state, use_aggregation=use_aggregation, n_estimators=n_estimators, split_pure=split_pure, dirichlet=dirichlet, # n_samples_increment=, step=step, verbose=False, ) ofc = OnlineForestClassifier( n_classes=n_classes, random_state=random_state, use_aggregation=use_aggregation, n_estimators=n_estimators, split_pure=split_pure, dirichlet=dirichlet, step=step, verbose=False, )
def test_loss(self):
    amf = AMFClassifier(n_classes=2)
    assert amf.loss == "log"
    # Setting an unsupported loss is silently ignored: "log" is the only loss
    amf.loss = "other loss"
    assert amf.loss == "log"
logging.info("Simulation of the data") X, y = make_moons(n_samples=n_samples, noise=0.2, random_state=random_state) logging.info("Train/Test splitting") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=random_state) logging.info("Computation of the meshgrid") xx, yy, X_mesh = get_mesh(X) clf = AMFClassifier( n_classes=n_classes, n_estimators=100, random_state=random_state, split_pure=True, use_aggregation=True, ) n_plots = len(save_iterations) n_fig = 0 save_iterations = [0, *save_iterations] fig, axes = plt.subplots(nrows=2, ncols=n_plots, figsize=(3 * n_plots, 6)) logging.info("Launching iterations") bar = trange(n_plots, desc="Plotting iterations", leave=True) for start, end in zip(save_iterations[:-1], save_iterations[1:]): X_iter = X_train[start:end]
    plot_contour_binary_classif(ax, xx, yy, Z,
                                title="Tree #%d" % (idx_tree + 1),
                                norm=norm, levels=levels)

# Last panel: the decision function of the whole forest.
# Note the integer division: `plt.subplot` requires integer positions.
ax = plt.subplot(2, n_estimators // 2 + 1, n_estimators + 2)
Z = forest.predict_proba(X_mesh)[:, 1].reshape(xx.shape)
plot_contour_binary_classif(ax, xx, yy, Z, title="Forest", norm=norm,
                            levels=levels)
plt.tight_layout()

n_samples = 100
n_features = 2
n_classes = 2
random_state = 42
dataset = make_moons(n_samples=n_samples, noise=0.15,
                     random_state=random_state)
n_estimators = 10

amf = AMFClassifier(
    n_classes=n_classes,
    n_estimators=n_estimators,
    random_state=random_state,
    use_aggregation=True,
    split_pure=True,
)

logging.info("Building the graph...")
plot_forest_effect(amf, dataset)
plt.savefig("forest_effect.pdf")
logging.info("Saved the forest effect plot in forest_effect.pdf")
def get_amf_trees_and_decisions(use_aggregation, n_estimators, split_pure,
                                dirichlet, step):
    amf = AMFClassifier(
        n_classes=n_classes,
        random_state=random_state,
        use_aggregation=use_aggregation,
        n_estimators=n_estimators,
        split_pure=split_pure,
        dirichlet=dirichlet,
        step=step,
    )
    zzs = []
    df_trees = []
    df_datas = []
    progress_bar = st.sidebar.progress(0)
    for it in range(1, n_samples_train + 1):
        # Append the current data
        df_datas.append(df_data[:it])
        # Partial fit AMFClassifier with the next sample
        amf.partial_fit(X_train[it - 1].reshape(1, 2),
                        np.array([y_train[it - 1]]))
        # Get the first tree as a dataframe of nodes
        df_tree = amf.get_nodes_df(0)
        df_tree["min_x"] = df_tree["memory_range_min"].apply(lambda t: t[0])
        df_tree["min_y"] = df_tree["memory_range_min"].apply(lambda t: t[1])
        df_tree["max_x"] = df_tree["memory_range_max"].apply(lambda t: t[0])
        df_tree["max_y"] = df_tree["memory_range_max"].apply(lambda t: t[1])
        df_tree["count_0"] = df_tree["counts"].apply(lambda t: t[0])
        df_tree["count_1"] = df_tree["counts"].apply(lambda t: t[1])
        df_tree.sort_values(by=["depth", "parent", "id"], inplace=True)
        # max_depth = df_tree.depth.max()
        max_depth = 10
        n_nodes = df_tree.shape[0]
        x = np.zeros(n_nodes)
        x[0] = 0.5
        indexes = df_tree["id"].values
        df_tree["x"] = x
        df_tree["y"] = max_depth - df_tree["depth"]
        df_tree["x0"] = df_tree["x"]
        df_tree["y0"] = df_tree["y"]
        # Lay out the nodes: each child sits at its parent's x shifted by
        # +/- 0.5 ** (depth + 1), left children left, right children right
        for node in range(1, n_nodes):
            index = indexes[node]
            parent = df_tree.at[index, "parent"]
            depth = df_tree.at[index, "depth"]
            left_parent = df_tree.at[parent, "left"]
            x_parent = df_tree.at[parent, "x"]
            if left_parent == index:
                # It's a left child
                df_tree.at[index, "x"] = x_parent - 0.5 ** (depth + 1)
            else:
                # It's a right child
                df_tree.at[index, "x"] = x_parent + 0.5 ** (depth + 1)
            df_tree.at[index, "x0"] = x_parent
            df_tree.at[index, "y0"] = df_tree.at[parent, "y"]
        df_tree["color"] = df_tree["is_leaf"].astype("str")
        df_tree.replace({"color": {"False": "blue", "True": "green"}},
                        inplace=True)
        df_trees.append(df_tree)
        # Compute the decision function on the mesh
        zz = amf.predict_proba(xy)[:, 1].reshape(grid_size, grid_size)
        zzs.append(zz)
        progress = int(100 * it / n_samples_train)
        progress_bar.progress(progress)
    return zzs, df_datas, df_trees
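
# The layout rule used above places the root at x = 0.5 and each child at its
# parent's x shifted by +/- 0.5 ** (depth + 1), so nodes at a given depth of
# a complete tree spread evenly over (0, 1). A standalone check of that rule
# on a hypothetical three-node tree (not tied to the dataframe above):
def sketch_layout_rule():
    root_x = 0.5
    depth = 1  # depth of the root's two children
    left_x = root_x - 0.5 ** (depth + 1)
    right_x = root_x + 0.5 ** (depth + 1)
    assert (left_x, right_x) == (0.25, 0.75)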