class TestBinaryDecisionTrees(unittest.TestCase):
    """Tests for the BinaryDecisionTrees class.

    Uses lending club data to test binary decision trees.

    Attributes:
        binary_decision_trees (BinaryDecisionTrees): Binary decision tree class under test.
        predict_output (PredictOutput): Predict output class used to classify rows with a tree.
        accuracy (Accuracy): Accuracy measurement class (instantiated in setUp, unused by these tests).
        error (Error): Error measurement class used to score each tree against the test data.
        loans_data (pandas.DataFrame): Lending Club data, one hot encoded in setUp.
        features (list of str): List of features to build decision tree on.
        target (str): The target that we are predicting.
        train_data (pandas.DataFrame): Lending Club training data.
        test_data (pandas.DataFrame): Lending Club testing data.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestBinaryDecisionTrees.

        We will clean up the loans_data by doing one hot encoding our features list, however, in the end we will
        use some pre-built data for training and testing, but it uses the same method for one hot encoding.

        """
        self.binary_decision_trees = BinaryDecisionTrees()
        self.predict_output = PredictOutput()
        self.accuracy = Accuracy()
        self.error = Error()

        # Pandas type set
        dtype_dict = {'grade': str, 'term': str, 'emp_length': str, 'bad_loans': int}

        # Load the lending club data
        self.loans_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_data.csv',
                                      dtype=dtype_dict)

        # List features and targets that we are interested
        self.features = ['grade', 'term', 'home_ownership', 'emp_length']
        self.target = 'safe_loans'

        # Replace every categorical feature with its one hot encoded columns ("<feature>.<value>"),
        # appended at the end of the frame in the same order as the features list
        for feature in self.features:
            one_hot_encoded = pd.get_dummies(self.loans_data[feature], prefix=feature, prefix_sep='.')
            self.loans_data = pd.concat([self.loans_data.drop(feature, axis=1), one_hot_encoded], axis=1)

        # Every remaining column except the target is a feature
        self.features = list(self.loans_data.columns.values)
        self.features.remove(self.target)

        # Load our training and testing data
        self.train_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_train.csv')
        self.test_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_test.csv')

    def _assert_tree_on_test_data(self, decision_tree, expected_error):
        """Checks a fitted tree against the known good values for the test data.

        Asserts that the first row of the test data is classified as -1, and that the value returned by
        Error.binary_tree on the test data (presumably the classification error, confirm against the Error
        class) matches the expected value to five decimal places.

        Args:
            decision_tree (obj): Tree returned by one of the BinaryDecisionTrees fitting methods.
            expected_error (float): Known good value of Error.binary_tree for this tree on the test data.

        """
        # The first row of the test data has to be classified as -1
        classification = self.predict_output.binary_tree(decision_tree, self.test_data.iloc[0])
        self.assertEqual(classification, -1)

        # Both sides are rounded so that known values recorded with only five decimals still compare equal
        error = self.error.binary_tree(decision_tree, self.test_data, self.target)
        self.assertEqual(round(error, 5), round(expected_error, 5))

    def test_01_greedy_recursive(self):
        """Tests greedy recursive function for BinaryDecisionTrees class

        We will use the training data to build a decision tree, and measure the error with some known good
        values.

        """
        # Create a decision tree
        decision_tree = self.binary_decision_trees.greedy_recursive(self.train_data, self.features, self.target,
                                                                    {"current_depth": 0, "max_depth": 6})

        # First row has to be -1, and the error has to be 0.3837785437311504
        self._assert_tree_on_test_data(decision_tree, 0.3837785437311504)

    def test_02_greedy_recursive_high_depth_low_feature(self):
        """Tests greedy recursive function for BinaryDecisionTrees class

        We will use the training data to build a decision tree with only two features, and use high depth.

        """
        # Create a decision tree
        decision_tree = self.binary_decision_trees.greedy_recursive(self.train_data, ['grade.A', 'grade.B'],
                                                                    self.target,
                                                                    {"current_depth": 0, "max_depth": 1000})

        # First row has to be -1, and the error has to be 0.38432
        self._assert_tree_on_test_data(decision_tree, 0.38432)

    def test_04_greedy_recursive_high_depth(self):
        """Tests greedy recursive function for BinaryDecisionTrees class

        We will use the training data to build a decision tree with all features, and use high depth.

        """
        # Create a decision tree
        decision_tree = self.binary_decision_trees.greedy_recursive(self.train_data, self.features, self.target,
                                                                    {"current_depth": 0, "max_depth": 10000})

        # First row has to be -1, and the error has to be 0.37732
        self._assert_tree_on_test_data(decision_tree, 0.37732)

    def test_03_greedy_recursive_early_stop(self):
        """Tests for greedy recursive with early stopping for BinaryDecisionTrees class

        We will use early stopping for greedy recursive, and measure performance.

        """
        # Create a model with max_depth=6, min_node_size=100, min_error_reduction=0
        model_1 = self.binary_decision_trees.greedy_recursive_early_stop(self.train_data, self.features,
                                                                         self.target,
                                                                         {"current_depth": 0, "max_depth": 6,
                                                                          "min_node_size": 100,
                                                                          "min_error_reduction": 0.0})

        # First row has to be -1, and the error has to be 0.38367083153813014
        self._assert_tree_on_test_data(model_1, 0.38367083153813014)

        # Create a model with max_depth=6, min_node_size=0, min_error_reduction=-1
        model_2 = self.binary_decision_trees.greedy_recursive_early_stop(self.train_data, self.features,
                                                                         self.target,
                                                                         {"current_depth": 0, "max_depth": 6,
                                                                          "min_node_size": 0,
                                                                          "min_error_reduction": -1})

        # First row has to be -1, and the error has to be 0.3837785437311504
        self._assert_tree_on_test_data(model_2, 0.3837785437311504)

    def test_04_greedy_recursive_early_stop_high_depth(self):
        """Tests for greedy recursive with early stopping for BinaryDecisionTrees class

        We will use early stopping for greedy recursive with only two features and high depth, and measure
        performance.

        """
        # Create a model with max_depth=5000, min_node_size=0, min_error_reduction=0
        model_1 = self.binary_decision_trees.greedy_recursive_early_stop(self.train_data, ['grade.A', 'grade.B'],
                                                                         self.target,
                                                                         {"current_depth": 0, "max_depth": 5000,
                                                                          "min_node_size": 0,
                                                                          "min_error_reduction": 0.0})

        # First row has to be -1, and the error has to be 0.38432
        self._assert_tree_on_test_data(model_1, 0.38432)

    def test_05_greedy_recursive_early_stop_high_depth(self):
        """Tests for greedy recursive with early stopping for BinaryDecisionTrees class

        We will use early stopping for greedy recursive with the grade and term features and high depth, and
        measure performance.

        """
        # Create a model with max_depth=5000, min_node_size=0, min_error_reduction=-50000
        model_1 = self.binary_decision_trees.greedy_recursive_early_stop(self.train_data,
                                                                         ['grade.A', 'grade.B', 'grade.C',
                                                                          'grade.D', 'grade.E', 'grade.F',
                                                                          'grade.G', 'term. 36 months'],
                                                                         self.target,
                                                                         {"current_depth": 0, "max_depth": 5000,
                                                                          "min_node_size": 0,
                                                                          "min_error_reduction": -50000})

        # First row has to be -1, and the error has to be 0.38162
        self._assert_tree_on_test_data(model_1, 0.38162)
class TestWeightedBinaryDecisionTrees(unittest.TestCase):
    """Tests for the WeightedBinaryDecisionTrees class and the AdaBoost class.

    Uses lending club data to test weighted binary decision trees, and AdaBoost built on top of them.

    Attributes:
        weighted_binary_decision_trees (WeightedBinaryDecisionTrees): Weighted binary decision tree class.
        adaboost (AdaBoost): AdaBoost class that is run with the weighted binary decision trees.
        predict (PredictOutput): Predict output class to predict output for decision tree.
        accuracy (Accuracy): Accuracy measurement class used to score the AdaBoost predictions.
        error (Error): Error measurement class used to score each tree against the training data.
        loans_data (pandas.DataFrame): Lending Club data, one hot encoded in setUp.
        features (list of str): List of features to build decision tree on.
        target (str): The target that we are predicting.
        train_data (pandas.DataFrame): Lending Club training data.
        test_data (pandas.DataFrame): Lending Club testing data.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for WeightedBinaryDecisionTrees.

        We will clean up the loans_data by doing one hot encoding our features list, however, in the end we will
        use some pre-built data for training and testing, but it uses the same method for one hot encoding.

        """
        self.weighted_binary_decision_trees = WeightedBinaryDecisionTrees()
        self.adaboost = AdaBoost()
        self.predict = PredictOutput()
        self.accuracy = Accuracy()
        self.error = Error()

        # Pandas type set
        dtype_dict = {'grade': str, 'term': str, 'emp_length': str, 'bad_loans': int}

        # Load the lending club data
        self.loans_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_data.csv',
                                      dtype=dtype_dict)

        # List features and targets that we are interested
        self.features = ['grade', 'term', 'home_ownership', 'emp_length']
        self.target = 'safe_loans'

        # Replace every categorical feature with its one hot encoded columns ("<feature>.<value>"),
        # appended at the end of the frame in the same order as the features list
        for feature in self.features:
            one_hot_encoded = pd.get_dummies(self.loans_data[feature], prefix=feature, prefix_sep='.')
            self.loans_data = pd.concat([self.loans_data.drop(feature, axis=1), one_hot_encoded], axis=1)

        # Every remaining column except the target is a feature
        self.features = list(self.loans_data.columns.values)
        self.features.remove(self.target)

        # Load our training and testing data
        self.train_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_train.csv')
        self.test_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_test.csv')

    def _data_weights(self, first_ten, middle, last_ten):
        """Creates one weight per row of the training data.

        Args:
            first_ten (float): Weight for the first 10 rows.
            middle (float): Weight for every row between the first 10 and the last 10.
            last_ten (float): Weight for the last 10 rows.

        Returns:
            pandas.Series: Weights, with the same length as the training data.

        """
        return pd.Series([first_ten] * 10 + [middle] * (len(self.train_data) - 20) + [last_ten] * 10)

    def _assert_training_error(self, decision_tree, expected_error):
        """Checks a fitted tree against the known good value for the training data.

        Asserts that the value returned by Error.binary_tree on the training data (presumably the
        classification error, confirm against the Error class) matches the expected value to five decimal
        places.

        Args:
            decision_tree (obj): Tree returned by WeightedBinaryDecisionTrees.greedy_recursive.
            expected_error (float): Known good value of Error.binary_tree for this tree on the training data.

        """
        # Both sides are rounded so that known values recorded with only five decimals still compare equal
        error = self.error.binary_tree(decision_tree, self.train_data, self.target)
        self.assertEqual(round(error, 5), round(expected_error, 5))

    def _adaboost_model_dict(self):
        """Creates the model dictionary that both AdaBoost tests pass to AdaBoost.decision_tree.

        Returns:
            dict: Predict method, model, model method name, and model parameters for depth 1 trees.

        """
        return {"predict_method": self.predict.binary_tree,
                "model": self.weighted_binary_decision_trees,
                "model_method": "greedy_recursive",
                "model_parameters": {"max_depth": 1, "minimum_error": 1e-15, "current_depth": 0}}

    def test_01_greedy_recursive(self):
        """Tests greedy recursive function for WeightedBinaryDecisionTrees class

        We will use the training data to build a decision tree where only the first 10 and the last 10 rows
        have a non-zero weight, and measure the error with some known good values.

        """
        # Create data weights
        data_weights = self._data_weights(1., 0., 1.)

        # Create a decision tree
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data, self.features,
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 2,
                                                                              "minimum_error": 1e-15})

        # Assert that the error should be 0.48124865678057166
        self._assert_training_error(decision_tree, 0.48124865678057166)

    def test_02_greedy_recursive_high_depth_low_features(self):
        """Tests greedy recursive function for WeightedBinaryDecisionTrees class

        We will use the training data to build a decision tree with only two features and high depth, and
        measure the error with some known good values.

        """
        # Create data weights
        data_weights = self._data_weights(1., 0., 1.)

        # Create a decision tree
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data,
                                                                             ['grade.A', 'grade.B'],
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 2000,
                                                                              "minimum_error": 1e-15})

        # Assert that the error should be 0.54148
        self._assert_training_error(decision_tree, 0.54148)

    def test_03_greedy_recursive_high_depth_low_features(self):
        """Tests greedy recursive function for WeightedBinaryDecisionTrees class

        We will use the training data to build a decision tree with the grade and term features and high depth,
        and measure the error with some known good values.

        """
        # Create data weights, the last 10 rows carry a negative weight
        # NOTE(review): the negative weight is kept exactly as it was, confirm that it is intended
        data_weights = self._data_weights(1., 2., -1.)

        # Create a decision tree
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data,
                                                                             ['grade.A', 'grade.B', 'grade.C',
                                                                              'grade.D', 'grade.E', 'grade.F',
                                                                              'grade.G', 'term. 36 months'],
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 20000,
                                                                              "minimum_error": 1e-15})

        # Assert that the error should be 0.38491
        self._assert_training_error(decision_tree, 0.38491)

    def test_04_adaboost(self):
        """Tests the adaboost algorithm.

        Tests the adaboost algorithm with low number of iterations.

        """
        # Create two weighted binary decision trees
        weights_list, _ = self.adaboost.decision_tree(self.train_data, self.features, self.target,
                                                      iterations=2,
                                                      model_dict=self._adaboost_model_dict())

        # The weights have to equal to [0.15802933659263743, 0.1768236329364191]
        self.assertEqual([round(i, 5) for i in weights_list],
                         [round(0.15802933659263743, 5), round(0.1768236329364191, 5)])

    def test_05_adaboost_high_iterations(self):
        """Tests the adaboost algorithm.

        Tests the adaboost algorithm with high number of iterations.

        """
        # Create ten weighted binary decision trees
        weights_list, models_list = self.adaboost.decision_tree(self.train_data, self.features, self.target,
                                                                iterations=10,
                                                                model_dict=self._adaboost_model_dict())

        # Get the predictions of each dataset in the test data
        predictions = self.predict.adaboost_binary_decision_tree(self.predict.binary_tree, models_list,
                                                                 weights_list, self.test_data)

        # Assert the first 20 predictions
        self.assertEqual(list(predictions)[0:20],
                         [-1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1])

        # Accuracy has to match 0.620314519604
        self.assertEqual(round(self.accuracy.decision_tree(self.test_data, predictions, self.target), 5),
                         round(0.620314519604, 5))