def test_iterator_and_str(self):
    """Compare the node-by-node text of a fitted tree with fixed strings.

    Iterating the classifier must yield nodes whose ``str`` matches the
    expected list, and ``str(clf)`` must equal those same strings, each
    followed by a newline.
    """
    clf = Stree(
        kernel="liblinear",
        multiclass_strategy="ovr",
        random_state=self._random_state,
    )
    clf.fit(*load_dataset(self._random_state))
    expected = [
        "root feaures=(0, 1, 2) impurity=1.0000 counts=(array([0, 1]), "
        "array([750, 750]))",
        "root - Down(2), <cgaf> - Leaf class=0 belief= 0.928297 impurity="
        "0.3722 counts=(array([0, 1]), array([725, 56]))",
        "root - Up(2) feaures=(0, 1, 2) impurity=0.2178 counts=(array([0, "
        "1]), array([ 25, 694]))",
        "root - Up(2) - Down(3) feaures=(0, 1, 2) impurity=0.8454 counts="
        "(array([0, 1]), array([8, 3]))",
        "root - Up(2) - Down(3) - Down(4), <pure> - Leaf class=0 belief= "
        "1.000000 impurity=0.0000 counts=(array([0]), array([7]))",
        "root - Up(2) - Down(3) - Up(4), <cgaf> - Leaf class=1 belief= "
        "0.750000 impurity=0.8113 counts=(array([0, 1]), array([1, 3]))",
        "root - Up(2) - Up(3), <cgaf> - Leaf class=1 belief= 0.975989 "
        "impurity=0.1634 counts=(array([0, 1]), array([ 17, 691]))",
    ]
    # Text of every node in the order the classifier's iterator yields them
    computed = [str(node) for node in clf]
    expected_string = "".join(line + "\n" for line in computed)
    self.assertListEqual(expected, computed)
    self.assertEqual(expected_string, str(clf))
def test_min_samples_split(self):
    """Check that ``min_samples_split`` decides whether the root is split.

    On a three-sample dataset the root must have both children when the
    threshold is 3 and neither child when the threshold is 4.
    """
    X, y = [[1], [2], [3]], [1, 1, 0]
    split_root = Stree(min_samples_split=3).fit(X, y).tree_
    self.assertIsNotNone(split_root.get_down())
    self.assertIsNotNone(split_root.get_up())
    whole_root = Stree(min_samples_split=4).fit(X, y).tree_
    self.assertIsNone(whole_root.get_down())
    self.assertIsNone(whole_root.get_up())
def __init__(self, *args, **kwargs):
    """Store the seed and fit a liblinear/ovr ``Stree`` before base init.

    All positional and keyword arguments are forwarded untouched to the
    parent initialiser.
    """
    seed = 1
    self._random_state = seed
    self._clf = Stree(
        random_state=seed,
        kernel="liblinear",
        multiclass_strategy="ovr",
    )
    X, y = load_dataset(seed)
    self._clf.fit(X, y)
    super().__init__(*args, **kwargs)
def test_single_prediction(self):
    """Predict only the first sample with every kernel and check its label."""
    X, y = load_dataset(self._random_state)
    # Keep the first row two-dimensional: one sample, all columns
    first_sample = X[0, :].reshape(-1, X.shape[1])
    for kernel in self._kernels:
        strategy = "ovr" if kernel == "liblinear" else "ovo"
        clf = Stree(
            kernel=kernel,
            multiclass_strategy=strategy,
            random_state=self._random_state,
        )
        prediction = clf.fit(X, y).predict(first_sample)
        self.assertEqual(prediction[0], y[0])
def test_score_max_features(self):
    """Check the training accuracy of a liblinear tree with max_features=2."""
    X, y = load_dataset(self._random_state)
    settings = dict(
        kernel="liblinear",
        multiclass_strategy="ovr",
        random_state=self._random_state,
        max_features=2,
    )
    clf = Stree(**settings)
    clf.fit(X, y)
    accuracy = clf.score(X, y)
    self.assertAlmostEqual(0.9453333333333334, accuracy)
def test_muticlass_dataset(self):
    """Check training accuracy on two multiclass datasets for every kernel.

    A classifier is fitted per dataset, criteria label and kernel, and its
    score on the training data is compared (5 decimal places) with the
    pinned outcome for that combination.
    """
    # Scope the filters to this test: a bare warnings.filterwarnings() call
    # edits the process-wide filter list and would leak into later tests.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        datasets = {
            "Synt": load_dataset(
                random_state=self._random_state, n_classes=3
            ),
            # NOTE(review): the key says "Iris" but the data comes from
            # load_wine(); kept as is because it appears in the messages.
            "Iris": load_wine(return_X_y=True),
        }
        outcomes = {
            "Synt": {
                "max_samples liblinear": 0.9493333333333334,
                "max_samples linear": 0.9426666666666667,
                "max_samples rbf": 0.9606666666666667,
                "max_samples poly": 0.9373333333333334,
                "max_samples sigmoid": 0.824,
                "impurity liblinear": 0.9493333333333334,
                "impurity linear": 0.9426666666666667,
                "impurity rbf": 0.9606666666666667,
                "impurity poly": 0.9373333333333334,
                "impurity sigmoid": 0.824,
            },
            "Iris": {
                "max_samples liblinear": 0.9550561797752809,
                "max_samples linear": 1.0,
                "max_samples rbf": 0.6685393258426966,
                "max_samples poly": 0.6853932584269663,
                "max_samples sigmoid": 0.6404494382022472,
                "impurity liblinear": 0.9550561797752809,
                "impurity linear": 1.0,
                "impurity rbf": 0.6685393258426966,
                "impurity poly": 0.6853932584269663,
                "impurity sigmoid": 0.6404494382022472,
            },
        }
        for name, (px, py) in datasets.items():
            # NOTE(review): criteria only selects the expected value and
            # the failure message; it is not forwarded to Stree.
            for criteria in ["max_samples", "impurity"]:
                for kernel in self._kernels:
                    clf = Stree(
                        max_iter=1e4,
                        multiclass_strategy="ovr"
                        if kernel == "liblinear"
                        else "ovo",
                        kernel=kernel,
                        random_state=self._random_state,
                    )
                    clf.fit(px, py)
                    outcome = outcomes[name][f"{criteria} {kernel}"]
                    self.assertAlmostEqual(
                        outcome,
                        clf.score(px, py),
                        5,
                        f"{name} - {criteria} - {kernel}",
                    )
def test_build_tree(self):
    """Check if the tree is built the same way as predictions of models

    One tree is fitted per kernel in ``self._kernels`` and handed to
    ``self._check_tree`` for verification.
    """
    # Scope the blanket filter to this test so it does not stay installed
    # in the process-wide warning filters for the tests that run later.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        for kernel in self._kernels:
            clf = Stree(
                # Use the loop variable: a hard-coded kernel here would
                # rebuild the same kind of tree on every iteration.
                kernel=kernel,
                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
                random_state=self._random_state,
            )
            clf.fit(*load_dataset(self._random_state))
            self._check_tree(clf.tree_)
def test_check_max_depth(self):
    """Check that the fitted depth equals the requested ``max_depth``."""
    for requested_depth in (3, 4):
        settings = dict(
            kernel="liblinear",
            multiclass_strategy="ovr",
            random_state=self._random_state,
            max_depth=requested_depth,
        )
        tree = Stree(**settings)
        X, y = load_dataset(self._random_state)
        tree.fit(X, y)
        self.assertEqual(requested_depth, tree.depth_)
def test_multiple_prediction(self):
    """Predict the leading samples with several kernels and compare labels."""
    # First 27 elements the predictions are the same as the truth
    num = 27
    X, y = load_dataset(self._random_state)
    truth = y[:num].tolist()
    for kernel in ["liblinear", "linear", "rbf", "poly"]:
        strategy = "ovr" if kernel == "liblinear" else "ovo"
        clf = Stree(
            kernel=kernel,
            multiclass_strategy=strategy,
            random_state=self._random_state,
        )
        predicted = clf.fit(X, y).predict(X[:num, :])
        self.assertListEqual(truth, predicted.tolist())
def test_score_multiclass_linear(self):
    """Check liblinear training accuracy on synthetic and wine data.

    Two classifiers are scored on each dataset: one with default settings
    and one with ``normalize=True``.
    """
    # Scope the filters to this test: a bare warnings.filterwarnings() call
    # edits the process-wide filter list and would leak into later tests.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        X, y = load_dataset(
            random_state=self._random_state,
            n_classes=3,
            n_features=5,
            n_samples=1500,
        )
        clf = Stree(
            kernel="liblinear",
            multiclass_strategy="ovr",
            random_state=self._random_state,
        )
        self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
        # Check with context based standardization
        clf2 = Stree(
            kernel="liblinear",
            multiclass_strategy="ovr",
            random_state=self._random_state,
            normalize=True,
        )
        self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
        # Same two classifiers, refitted on the wine dataset
        X, y = load_wine(return_X_y=True)
        self.assertEqual(0.9831460674157303, clf.fit(X, y).score(X, y))
        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
def test_multiclass_classifier_integrity(self):
    """Checks if the multiclass operation is done right

    Fits a liblinear/ovr tree and a plain ``LinearSVC`` on iris, then
    checks the overall scores, the gini value of each binarised decision
    column, and the class counts on both sides of the first two splits.
    """
    X, y = load_iris(return_X_y=True)
    clf = Stree(kernel="liblinear", multiclass_strategy="ovr", random_state=0)
    clf.fit(X, y)
    score = clf.score(X, y)
    # Check accuracy of the whole model
    # (assertAlmostEqual: the assertAlmostEquals alias is gone in Python 3.12)
    self.assertAlmostEqual(0.98, score, 5)
    svm = LinearSVC(random_state=0)
    svm.fit(X, y)
    self.assertAlmostEqual(0.9666666666666667, svm.score(X, y), 5)
    data = svm.decision_function(X)
    expected = [
        0.4444444444444444,
        0.35777777777777775,
        0.4569777777777778,
    ]
    # Binarise every decision column: 1 where the value is positive, else 0
    ty = data.copy()
    ty[data <= 0] = 0
    ty[data > 0] = 1
    ty = ty.astype(int)
    for column, gini in enumerate(expected):
        self.assertAlmostEqual(gini, clf.splitter_._gini(ty[:, column]))
    # 1st Branch
    # up should have 53 samples of classes [1, 2] [3, 50]
    # down should have 97 samples of classes [0, 1] [50, 47]
    up = data[:, 2] > 0
    resup = np.unique(y[up], return_counts=True)
    resdn = np.unique(y[~up], return_counts=True)
    self.assertListEqual([1, 2], resup[0].tolist())
    self.assertListEqual([3, 50], resup[1].tolist())
    self.assertListEqual([0, 1], resdn[0].tolist())
    self.assertListEqual([50, 47], resdn[1].tolist())
    # 2nd Branch
    # up should have 53 samples of classes [1, 2] [3, 50]
    # down should have 47 samples of class 1
    node_up = clf.tree_.get_down().get_up()
    node_dn = clf.tree_.get_down().get_down()
    resup = np.unique(node_up._y, return_counts=True)
    resdn = np.unique(node_dn._y, return_counts=True)
    self.assertListEqual([1, 2], resup[0].tolist())
    self.assertListEqual([3, 50], resup[1].tolist())
    self.assertListEqual([1], resdn[0].tolist())
    self.assertListEqual([47], resdn[1].tolist())
def test_incompatible_hyperparameters(self):
    """Check that fitting with conflicting settings raises ``ValueError``."""
    X, y = load_wine(return_X_y=True)
    conflicting_settings = (
        dict(kernel="liblinear", multiclass_strategy="ovo"),
        dict(multiclass_strategy="ovo", split_criteria="max_samples"),
    )
    for settings in conflicting_settings:
        clf = Stree(**settings)
        # Only the fit call is expected to raise
        with self.assertRaises(ValueError):
            clf.fit(X, y)
def test_check_max_depth_is_positive_or_None(self):
    """Check the default, a valid and a negative ``max_depth`` value."""
    default_tree = Stree()
    self.assertIsNone(default_tree.max_depth)
    shallow_tree = Stree(max_depth=1)
    self.assertGreaterEqual(1, shallow_tree.max_depth)
    # Construction and fit both stay inside the context: either may raise
    with self.assertRaises(ValueError):
        negative_tree = Stree(max_depth=-1)
        negative_tree.fit(*load_dataset(self._random_state))
def test_nodes_coefs(self):
    """Check if the nodes of the tree have the right attributes filled"""

    def run_tree(node: Snode):
        """Recursively check the classifier stored in ``node`` and below."""
        if node._belief < 1:
            # only exclude pure leaves
            self.assertIsNotNone(node._clf)
            self.assertIsNotNone(node._clf.coef_)
        if node.is_leaf():
            return
        run_tree(node.get_up())
        run_tree(node.get_down())

    # Pass the seed by keyword: given positionally it binds to the first
    # constructor parameter of Stree instead of random_state.
    model = Stree(random_state=self._random_state)
    model.fit(*load_dataset(self._random_state, 3, 4))
    run_tree(model.tree_)
def test_nodes_leaves_artificial(self):
    """Count nodes and leaves on a hand-wired six-node tree."""
    titles = ("test1", "test2", "test3", "test4", "test5", "test6")
    n1, n2, n3, n4, n5, n6 = [
        Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, title)
        for title in titles
    ]
    # Wiring: n1 -up-> n2; n2 -up-> n3 and -down-> n4;
    #         n3 -up-> n5; n4 -down-> n6
    n1.set_up(n2)
    n2.set_up(n3)
    n2.set_down(n4)
    n3.set_up(n5)
    n4.set_down(n6)
    clf = Stree(random_state=self._random_state)
    clf.tree_ = n1
    node_count, leaf_count = clf.nodes_leaves()
    self.assertEqual(6, node_count)
    self.assertEqual(2, leaf_count)
def test_nodes_leaves(self):
    """Check node and leaf counts after fitting synthetic and wine data."""
    synthetic = load_dataset(
        random_state=self._random_state,
        n_classes=3,
        n_features=5,
        n_samples=1500,
    )
    synthetic_tree = Stree(random_state=self._random_state)
    synthetic_tree.fit(*synthetic)
    node_count, leaf_count = synthetic_tree.nodes_leaves()
    self.assertEqual(31, node_count)
    self.assertEqual(16, leaf_count)
    wine = load_wine(return_X_y=True)
    wine_tree = Stree(random_state=self._random_state)
    wine_tree.fit(*wine)
    node_count, leaf_count = wine_tree.nodes_leaves()
    self.assertEqual(11, node_count)
    self.assertEqual(6, leaf_count)
def test_copy_node(self):
    """Check that ``Snode.copy`` carries over the source node's attributes."""
    features = [1, 2, 3, 4]
    labels = [1]
    source = Snode(Stree(), features, labels, [], 0.0, "test")
    duplicate = Snode.copy(source)
    self.assertListEqual(duplicate._X, features)
    self.assertListEqual(duplicate._y, labels)
    self.assertEqual("test", duplicate._title)
    self.assertIsInstance(duplicate._clf, Stree)
    # The remaining attributes must compare equal between source and copy
    for attribute in ("_partition_column", "_sample_weight", "_scaler"):
        self.assertEqual(
            getattr(source, attribute), getattr(duplicate, attribute)
        )
def test_depth(self):
    """Check the fitted tree depth on synthetic and wine data."""
    synthetic = load_dataset(
        random_state=self._random_state,
        n_classes=3,
        n_features=5,
        n_samples=1500,
    )
    synthetic_tree = Stree(random_state=self._random_state)
    synthetic_tree.fit(*synthetic)
    self.assertEqual(6, synthetic_tree.depth_)
    wine = load_wine(return_X_y=True)
    wine_tree = Stree(random_state=self._random_state)
    wine_tree.fit(*wine)
    self.assertEqual(4, wine_tree.depth_)
def test_predict_feature_dimensions(self):
    """Check that predicting with fewer columns than fitted raises."""
    samples = np.random.rand(10, 5)
    labels = np.random.randint(0, 2, 10)
    model = Stree()
    model.fit(samples, labels)
    # Fitted on five columns; only the first three are offered here
    with self.assertRaises(ValueError):
        model.predict(samples[:, :3])
def test_multiclass_strategy(self):
    """Compare wine training accuracy for the "ovo" and "ovr" strategies."""
    X, y = load_wine(return_X_y=True)
    one_vs_one = Stree(multiclass_strategy="ovo")
    one_vs_rest = Stree(multiclass_strategy="ovr")
    accuracy_ovo = one_vs_one.fit(X, y).score(X, y)
    accuracy_ovr = one_vs_rest.fit(X, y).score(X, y)
    self.assertEqual(1.0, accuracy_ovo)
    self.assertEqual(0.9269662921348315, accuracy_ovr)
def test_score_multiclass_rbf(self):
    """Check rbf training accuracy, with and without ``normalize``."""
    seed = self._random_state
    X, y = load_dataset(
        random_state=seed,
        n_classes=3,
        n_features=5,
        n_samples=500,
    )
    plain = Stree(kernel="rbf", random_state=seed)
    normalized = Stree(kernel="rbf", random_state=seed, normalize=True)
    # Synthetic dataset
    self.assertEqual(0.966, plain.fit(X, y).score(X, y))
    self.assertEqual(0.964, normalized.fit(X, y).score(X, y))
    # Wine dataset, reusing both classifiers
    X, y = load_wine(return_X_y=True)
    self.assertEqual(0.6685393258426966, plain.fit(X, y).score(X, y))
    self.assertEqual(1.0, normalized.fit(X, y).score(X, y))
def test_max_features(self):
    """Check how each ``max_features`` setting resolves for 16 features.

    Valid settings must resolve to the expected number of features; bogus
    ones must make ``_initialize_max_features`` raise ``ValueError``.
    """
    clf = Stree()
    clf.n_features_ = 16
    valid_cases = [
        ("auto", 4),
        ("log2", 4),
        ("sqrt", 4),
        (0.5, 8),
        (3, 3),
        (None, 16),
    ]
    for setting, expected in valid_cases:
        clf.set_params(max_features=setting)
        self.assertEqual(expected, clf._initialize_max_features())
    # Check bogus max_features
    for bogus in ("duck", -0.1, 0.0):
        clf.set_params(max_features=bogus)
        with self.assertRaises(ValueError):
            clf._initialize_max_features()
def test_score_multiclass_sigmoid(self):
    """Check sigmoid (C=10) training accuracy, with and without normalize."""
    seed = self._random_state
    X, y = load_dataset(
        random_state=seed,
        n_classes=3,
        n_features=5,
        n_samples=500,
    )
    plain = Stree(kernel="sigmoid", random_state=seed, C=10)
    normalized = Stree(
        kernel="sigmoid", random_state=seed, normalize=True, C=10
    )
    # Synthetic dataset
    self.assertEqual(0.796, plain.fit(X, y).score(X, y))
    self.assertEqual(0.952, normalized.fit(X, y).score(X, y))
    # Wine dataset, reusing both classifiers
    X, y = load_wine(return_X_y=True)
    self.assertEqual(0.6910112359550562, plain.fit(X, y).score(X, y))
    self.assertEqual(0.9662921348314607, normalized.fit(X, y).score(X, y))
def test_simple_muticlass_dataset(self):
    """Fit three one-sample classes with every kernel and check recall."""
    samples = [[1, 2], [5, 6], [9, 10]]
    labels = [0, 1, 2]
    for kernel in self._kernels:
        strategy = "ovr" if kernel == "liblinear" else "ovo"
        clf = Stree(
            kernel=kernel,
            multiclass_strategy=strategy,
            random_state=self._random_state,
        )
        clf.fit(samples, labels)
        self.assertEqual(1.0, clf.score(samples, labels))
        self.assertListEqual(labels, clf.predict(samples).tolist())
        self.assertListEqual(labels, clf.classes_.tolist())
def test_mask_samples_weighted_zero(self):
    """Compare a model fitted with zero-weighted samples to a plain one.

    The samples labelled 2 get weight 0; the weighted model is expected to
    predict label 1 for them instead.
    """
    X = np.array(
        [[1, 1], [1, 1], [1, 1], [2, 2], [2, 2], [2, 2], [3, 3], [3, 3], [3, 3]]
    )
    y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
    expected_weighted = np.array([1, 1, 1, 1, 1, 1, 5, 5, 5])
    weights = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    unweighted_model = Stree().fit(X, y)
    weighted_model = Stree().fit(X, y, weights)
    unweighted_prediction = unweighted_model.predict(X)
    weighted_prediction = weighted_model.predict(X)
    self.assertListEqual(y.tolist(), unweighted_prediction.tolist())
    self.assertListEqual(
        expected_weighted.tolist(), weighted_prediction.tolist()
    )
    self.assertEqual(unweighted_model.score(X, y), 1)
    self.assertAlmostEqual(weighted_model.score(X, y), 0.66666667)
    self.assertEqual(weighted_model.score(X, y, weights), 1)
def test_score_multiclass_poly(self):
    """Check poly training accuracy for a tuned and a normalized model."""
    seed = self._random_state
    X, y = load_dataset(
        random_state=seed,
        n_classes=3,
        n_features=5,
        n_samples=500,
    )
    tuned = Stree(kernel="poly", random_state=seed, C=10, degree=5)
    normalized = Stree(kernel="poly", random_state=seed, normalize=True)
    # Synthetic dataset
    self.assertEqual(0.946, tuned.fit(X, y).score(X, y))
    self.assertEqual(0.972, normalized.fit(X, y).score(X, y))
    # Wine dataset, reusing both classifiers
    X, y = load_wine(return_X_y=True)
    self.assertEqual(0.7808988764044944, tuned.fit(X, y).score(X, y))
    self.assertEqual(1.0, normalized.fit(X, y).score(X, y))
def test_score_multiclass_liblinear(self):
    """Check liblinear/ovr accuracy for a C=10 and a normalized model."""
    seed = self._random_state
    X, y = load_dataset(
        random_state=seed,
        n_classes=3,
        n_features=5,
        n_samples=500,
    )
    shared = dict(
        kernel="liblinear", multiclass_strategy="ovr", random_state=seed
    )
    tuned = Stree(C=10, **shared)
    normalized = Stree(normalize=True, **shared)
    # Synthetic dataset
    self.assertEqual(0.968, tuned.fit(X, y).score(X, y))
    self.assertEqual(0.97, normalized.fit(X, y).score(X, y))
    # Wine dataset, reusing both classifiers
    X, y = load_wine(return_X_y=True)
    self.assertEqual(1.0, tuned.fit(X, y).score(X, y))
    self.assertEqual(1.0, normalized.fit(X, y).score(X, y))
def test_get_subspaces(self):
    """Check the subspace returned by the splitter for each max_features.

    The returned data must be the dataset restricted to the returned
    column indices, and the number of indices must match the expectation.
    """
    dataset = np.random.random((10, 16))
    y = np.random.randint(0, 2, 10)
    cases = [
        ("auto", 4),
        ("log2", 4),
        ("sqrt", 4),
        (0.5, 8),
        (3, 3),
        (None, 16),
    ]
    clf = Stree()
    for setting, expected_count in cases:
        clf.set_params(max_features=setting)
        clf.fit(dataset, y)
        subspace, columns = clf.splitter_.get_subspace(
            dataset, y, clf.max_features_
        )
        self.assertListEqual(dataset[:, columns].tolist(), subspace.tolist())
        self.assertEqual(expected_count, len(columns))
def test_single_vs_multiple_prediction(self):
    """Check that per-sample and whole-batch predictions agree.

    For every kernel, the labels obtained by predicting one row at a time
    must equal the labels obtained by predicting the full dataset.
    """
    X, y = load_dataset(self._random_state)
    n_columns = X.shape[1]
    for kernel in self._kernels:
        strategy = "ovr" if kernel == "liblinear" else "ovo"
        clf = Stree(
            kernel=kernel,
            multiclass_strategy=strategy,
            random_state=self._random_state,
        )
        clf.fit(X, y)
        # Row-by-row predictions, accumulated in order
        one_by_one = np.array([], dtype=int)
        for row in X:
            single = clf.predict(row.reshape(-1, n_columns))
            one_by_one = np.append(one_by_one, single)
        # Whole-batch prediction
        all_at_once = clf.predict(X)
        self.assertListEqual(one_by_one.tolist(), all_at_once.tolist())
def test_score_binary(self):
    """Check ``score`` against a manual accuracy and a pinned value.

    Kernels from ``self._kernels`` are paired in order with the expected
    accuracies listed below.
    """
    X, y = load_dataset(self._random_state)
    pinned_accuracies = [
        0.9506666666666667,
        0.9493333333333334,
        0.9606666666666667,
        0.9433333333333334,
        0.9153333333333333,
    ]
    for kernel, pinned in zip(self._kernels, pinned_accuracies):
        strategy = "ovr" if kernel == "liblinear" else "ovo"
        clf = Stree(
            random_state=self._random_state,
            multiclass_strategy=strategy,
            kernel=kernel,
        )
        clf.fit(X, y)
        reported = clf.score(X, y)
        # Accuracy recomputed by hand from the predictions
        predictions = clf.predict(X)
        recomputed = np.mean(predictions == y)
        self.assertEqual(reported, recomputed)
        self.assertAlmostEqual(pinned, reported)