def test_enact_best_split_basic(self):
    """Split a two-observation node and check costs and child sizes.

    The node is built from two predictor lists and two responses. After a
    single split with ``min_data_per_node=1`` the parent and both children
    must report zero cost, and each child must hold exactly one
    observation (``N == 1``).
    """
    predictors = [[1.0, 2.0], [-2.0, 2.0]]
    responses = [7.0, 4.0]
    node = TreeNode(predictors, responses)
    node.enact_best_split(min_data_per_node=1)

    # Costs are checked parent first, then left child, then right child.
    self.assertEqual(0.0, node.get_cost())
    left = node.left_child
    self.assertEqual(0.0, left.get_cost())
    right = node.right_child
    self.assertEqual(0.0, right.get_cost())

    # Each child should have received exactly one of the two observations.
    self.assertEqual(1, left.N)
    self.assertEqual(1, right.N)
def test_predict_checkerboard(self):
    """Check predictions in all four quadrants of a checkerboard response.

    A 20x20 grid over [0, 1) x [0, 1) is labelled with one constant per
    quadrant (0.0, 0.1, 3.1, 2.9). After three calls to
    ``enact_best_split`` the tree must predict the quadrant constant for a
    probe point taken from each quadrant.
    """
    axis_values = np.arange(0, 1.0, 0.05).tolist()

    # Every (first, second) coordinate pair, first coordinate varying slowest.
    grid = [(a, b) for a in axis_values for b in axis_values]

    def quadrant_value(a, b):
        # The second coordinate picks the low/high band; the first
        # coordinate picks the value inside that band.
        if b < 0.5:
            return 0.0 if a < 0.5 else 0.1
        return 3.1 if a < 0.5 else 2.9

    # Predictors are passed as one list per feature.
    predictors = [[a for a, _ in grid], [b for _, b in grid]]
    responses = [quadrant_value(a, b) for a, b in grid]

    tree = TreeNode(predictors, responses)
    for _ in range(3):
        tree.enact_best_split()

    self.assertEqual(0.0, tree.predict([0.05, 0.05]))
    self.assertAlmostEqual(3.1, tree.predict([0.05, 0.95]))
    self.assertAlmostEqual(0.1, tree.predict([0.95, 0.05]))
    self.assertAlmostEqual(2.9, tree.predict([0.95, 0.95]))
def test_checkerboard_split(self):
    """Check the sequence of splits made on a checkerboard response.

    A 20x20 grid over [0, 1) x [0, 1) is labelled with one constant per
    quadrant (0.0, 0.1, 3.1, 2.9). The test expects:

    * the first split to succeed and leave children predicting 0.05 and
      3.00 (the averages of 0.0/0.1 and of 3.1/2.9 -- presumably
      ``unsplit_prediction`` is the mean response of a node);
    * two further splits to succeed, leaving four grandchildren that each
      predict one quadrant constant;
    * a fourth split attempt to return ``False``.
    """
    x1_vals = np.arange(0, 1.0, 0.05).tolist()
    x2_vals = np.arange(0, 1.0, 0.05).tolist()

    # Predictors are passed as one list per feature, x varying slowest.
    x0predictors = [x for x in x1_vals for y in x2_vals]
    x1predictors = [y for x in x1_vals for y in x2_vals]
    predictors = [x0predictors, x1predictors]
    responses = [
        (0.0 if x < 0.5 else 0.1) if y < 0.5 else (3.1 if x < 0.5 else 2.9)
        for x in x1_vals
        for y in x2_vals
    ]

    tn = TreeNode(predictors, responses)

    self.assertEqual(True, tn.enact_best_split())
    # 0.05 and 3.00 are computed from many floating-point responses, so
    # compare approximately: exact equality would depend on summation
    # order and rounding inside TreeNode.
    self.assertAlmostEqual(0.05, tn.left_child.unsplit_prediction)
    self.assertAlmostEqual(3.00, tn.right_child.unsplit_prediction)

    self.assertEqual(True, tn.enact_best_split())
    self.assertEqual(True, tn.enact_best_split())
    self.assertEqual(0.0, tn.left_child.left_child.unsplit_prediction)
    self.assertAlmostEqual(0.1, tn.left_child.right_child.unsplit_prediction)
    self.assertAlmostEqual(3.1, tn.right_child.left_child.unsplit_prediction)
    self.assertAlmostEqual(2.9, tn.right_child.right_child.unsplit_prediction)

    # The test expects no further split to be enacted at this point.
    self.assertEqual(False, tn.enact_best_split())
# K-fold cross-validation: for fold k, entries whose selector value i has
# i % K == k are held out for validation and the rest are used for training.
print_costs_during_training = False
training_accuracies = []
validation_accuracies = []
for k in range(K):
    # NOTE(review): .select(...) looks like the long-deprecated pandas
    # DataFrame.select API -- confirm against the pandas version in use.
    training_fold = training_dataframe.select(lambda i: i % K != k)
    validation_fold = training_dataframe.select(lambda i: i % K == k)
    training_predictors = makePredictors(training_fold)
    training_responses = makeResponses(training_fold)

    tn = TreeNode(training_predictors, training_responses)
    if print_costs_during_training:
        print("Cost: {}".format(tn.get_cost()))
    # Keep splitting until enact_best_split() reports nothing was done.
    while tn.enact_best_split():
        if print_costs_during_training:
            print("Cost: {}".format(tn.get_cost()))

    print("Fold {}:".format(k))
    # Predictors are stored one list per feature; zip turns them into
    # per-observation tuples. Exactly three features are used here.
    training_predictions = [
        tn.predict(observation)
        for observation in zip(
            training_predictors[0],
            training_predictors[1],
            training_predictors[2],
        )
    ]
    # A prediction counts as correct when it is within 0.5 of the response.
    training_errors = [
        float(not (abs(predicted - actual) < 0.5))
        for predicted, actual in zip(training_predictions, training_responses)
    ]
    training_accuracy = 1.0 - np.mean(training_errors)
    print("In-sample accuracy: {}".format(training_accuracy))
    training_accuracies.append(training_accuracy)

    validation_predictors = makePredictors(validation_fold)
    validation_responses = makeResponses(validation_fold)
    validation_predictions = [
        tn.predict(observation)
        for observation in zip(
            validation_predictors[0],
            validation_predictors[1],
            validation_predictors[2],
        )
    ]