class TestLogisticRegressionL2Norm(unittest.TestCase):
    """Tests for the LogisticRegressionL2Norm class.

    Uses Amazon data to test logistic regression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """
    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestLogisticRegressionL2Norm.

        Loads Amazon data, and creates training and validation data.

        """
        # Create an instance of the ConvertNumpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of the LogLikelihood class
        self.log_likelihood = LogLikelihood()

        # Create an instance of the Accuracy class
        self.accuracy = Accuracy()

        # Load the important words
        self.important_words = json.load(open('./unit_tests/test_data/classification/amazon/important_words.json',
                                              'r'))

        # Create an instance of the LogisticRegressionL2Norm class
        self.logistic_regression_l2_norm = LogisticRegressionL2Norm()

        # Load the amazon baby train subset
        self.training_data = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset_train.csv')

        # Load the amazon baby validation subset
        self.validation_data = pd.read_csv('./unit_tests/test_data/'
                                           'classification/amazon/amazon_baby_subset_validation.csv')

    def test_01_gradient_ascent_no_penalty(self):
        """Tests the gradient ascent algorithm.

        Tests the gradient ascent algorithm with no l2 penalty.

        """
        # We will use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frames to numpy
        feature_matrix_train, label_train = self.convert_numpy.convert_to_numpy(self.training_data, features,
                                                                                output, 1)
        feature_matrix_valid, label_valid = self.convert_numpy.convert_to_numpy(self.validation_data, features,
                                                                                output, 1)

        # Compute the coefficients
        coefficients = self.logistic_regression_l2_norm.gradient_ascent(feature_matrix_train, label_train,
                                                                        {"initial_coefficients": np.zeros(194),
                                                                         "step_size": 5e-6, "l2_penalty": 0,
                                                                         "max_iter": 501})

        # Get the accuracy on training and validation data
        train_accuracy = self.accuracy.logistic_regression(feature_matrix_train, label_train, coefficients)
        validation_accuracy = self.accuracy.logistic_regression(feature_matrix_valid, label_valid, coefficients)

        # Make sure the accuracies are correct
        self.assertEqual(round(0.785156157787, 5), round(train_accuracy, 5))
        self.assertEqual(round(0.78143964149, 5), round(validation_accuracy, 5))

    def test_02_gradient_ascent_10_penalty(self):
        """Tests the gradient ascent algorithm.

        Tests the gradient ascent algorithm with an l2 penalty of 10.

        """
        # We will use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frames to numpy
        feature_matrix_train, label_train = self.convert_numpy.convert_to_numpy(self.training_data, features,
                                                                                output, 1)
        feature_matrix_valid, label_valid = self.convert_numpy.convert_to_numpy(self.validation_data, features,
                                                                                output, 1)

        # Compute the coefficients
        coefficients = self.logistic_regression_l2_norm.gradient_ascent(feature_matrix_train, label_train,
                                                                        {"initial_coefficients": np.zeros(194),
                                                                         "step_size": 5e-6, "l2_penalty": 10,
                                                                         "max_iter": 501})

        # Get the accuracy on training and validation data
        train_accuracy = self.accuracy.logistic_regression(feature_matrix_train, label_train, coefficients)
        validation_accuracy = self.accuracy.logistic_regression(feature_matrix_valid, label_valid, coefficients)

        # Make sure the accuracies are correct
        self.assertEqual(round(0.784990911452, 5), round(train_accuracy, 5))
        self.assertEqual(round(0.781719727383, 5), round(validation_accuracy, 5))

    def test_03_log_likelihood(self):
        """Tests log likelihood with l2 norm.

        Tests the log likelihood with l2 norm and compares it with known values.

        """
        # Generate a test feature matrix, coefficients, and labels
        feature_matrix = np.array([[1., 2., 3.], [1., -1., -1.]])
        coefficients = np.array([1., 3., -1.])
        label = np.array([-1, 1])

        # Compute the log likelihood
        lg = self.log_likelihood.log_likelihood_l2_norm(feature_matrix, label, coefficients, 10)

        # Assert the value
        self.assertEqual(round(lg, 5), round(-105.33141000000001, 5))
class TestWeightedLogisticRegression(unittest.TestCase):
    """Tests for the WeightedLogisticRegression class.

    Uses Amazon data to test the WeightedLogisticRegression class.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """
    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestWeightedLogisticRegression.

        Loads Amazon data, and creates training and testing data.

        """
        self.convert_numpy = ConvertNumpy()
        self.predict = PredictOutput()
        self.ada = AdaBoost()
        self.accuracy = Accuracy()
        self.weighted_logistic_regression = WeightedLogisticRegression()

        # Load the important words
        self.important_words = json.load(open('./unit_tests/test_data/classification/amazon/important_words.json',
                                              'r'))

        # Load the amazon baby subset
        self.review_frame = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset.csv')

        # Review needs to be text
        self.review_frame['review'] = self.review_frame['review'].astype(str)

        # Clean up the punctuation
        self.review_frame['review_clean'] = self.review_frame.apply(
            axis=1,
            func=lambda row: str(row["review"]).translate(
                str.maketrans({key: None for key in string.punctuation})))

        # Remove any nan text
        self.review_frame['review_clean'] = self.review_frame.apply(
            axis=1,
            func=lambda row: '' if row["review_clean"] == "nan" else row["review_clean"])

        # Count the number of times each important word appears in each review, and make an
        # independent column for it
        for word in self.important_words:
            self.review_frame[word] = self.review_frame['review_clean'].apply(lambda s, w=word: s.split().count(w))

    def test_01_gradient_ascent(self):
        """Tests the gradient ascent algorithm.

        Tests the gradient ascent algorithm and compares it with known values.

        """
        # We will use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.review_frame, features, output, 1)

        # Create a uniform weight list for the training data
        weights_list = np.array([1] * len(self.review_frame))

        # Compute the coefficients
        coefficients = self.weighted_logistic_regression.gradient_ascent(feature_matrix, sentiment,
                                                                         {"initial_coefficients": np.zeros(194),
                                                                          "weights_list": weights_list,
                                                                          "step_size": 1e-7, "max_iter": 30})

        # Assert the first twenty coefficients
        self.assertEqual([round(i, 5) for i in coefficients[0:20]],
                         [round(i, 5) for i in [0.0002, 0.00143, -0.00131, 0.00689, 0.00685, 0.00034, -0.00624,
                                                -0.00059, 0.00671, 0.00466, 0.00043, 0.00203, 0.00303, -0.00332,
                                                0.0015, -0.00011, 0.00115, -0.00217, -0.00139, -0.00466]])

        # Compute predictions
        predictions = self.predict.logistic_regression(feature_matrix, coefficients)

        # Accuracy has to match 0.74357
        self.assertEqual(round(self.accuracy.general(predictions, sentiment), 5), round(0.74357, 5))

    def test_02_adaboost(self):
        """Tests the adaboost algorithm.

        Tests the adaboost algorithm with weighted logistic regression.

        """
        # We will use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.review_frame, features, output, 1)

        # Create 15 weighted logistic regression models
        weights, models = self.ada.logistic_regression(feature_matrix, sentiment, iterations=15,
                                                       model_dict={"predict_method": self.predict.logistic_regression,
                                                                   "model": self.weighted_logistic_regression,
                                                                   "model_method": "gradient_ascent",
                                                                   "model_parameters": {"step_size": 1e-7,
                                                                                        "max_iter": 30,
                                                                                        "initial_coefficients":
                                                                                            np.zeros(194)}})

        # Get the combined predictions for the feature matrix
        predictions = self.predict.adaboost_logistic_regression(self.predict.logistic_regression, models, weights,
                                                                feature_matrix)

        # Assert the first twenty predictions
        self.assertEqual(list(predictions)[0:20],
                         [1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1])

        # Accuracy has to match 0.77613
        self.assertEqual(round(self.accuracy.general(predictions, sentiment), 5), round(0.77613, 5))
class TestWeightedBinaryDecisionTrees(unittest.TestCase):
    """Tests for the WeightedBinaryDecisionTrees class.

    Uses lending club data to test weighted binary decision trees.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """
    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestWeightedBinaryDecisionTrees.

        We clean up the loans_data by one hot encoding our features list. In the end we use some
        pre-built data for training and testing, but that data uses the same method for one hot
        encoding.

        """
        self.weighted_binary_decision_trees = WeightedBinaryDecisionTrees()
        self.adaboost = AdaBoost()
        self.predict = PredictOutput()
        self.accuracy = Accuracy()
        self.error = Error()

        # Pandas dtypes for the columns we care about
        dtype_dict = {'grade': str, 'term': str, 'emp_length': str, 'bad_loans': int}

        # Load the lending club data
        self.loans_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_data.csv',
                                      dtype=dtype_dict)

        # List the features and the target that we are interested in
        self.features = ['grade', 'term', 'home_ownership', 'emp_length']
        self.target = 'safe_loans'

        # Do a one hot encoding of the features
        for feature in self.features:
            # One hot encode the feature
            loans_data_one_hot_encoded = pd.get_dummies(self.loans_data[feature], prefix=feature, prefix_sep='.')

            # Drop the original feature column
            self.loans_data.drop(feature, axis=1, inplace=True)

            # Join the one hot encoded columns back onto the frame
            self.loans_data = pd.concat([self.loans_data, loans_data_one_hot_encoded], axis=1)

        # Update our features to the one hot encoded column names
        self.features = list(self.loans_data.columns.values)
        self.features.remove('safe_loans')

        # Load our training and testing data
        self.train_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_train.csv')
        self.test_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_test.csv')

    def test_01_greedy_recursive(self):
        """Tests the greedy recursive function for the WeightedBinaryDecisionTrees class.

        We use the training data to build a decision tree, and measure the accuracy against some
        known good values.

        """
        # Create data weights that only count the first and last ten points
        data_weights = pd.Series([1.] * 10 + [0.] * (len(self.train_data) - 20) + [1.] * 10)

        # Create a decision tree
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data, self.features,
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 2,
                                                                              "minimum_error": 1e-15})

        # Compute the accuracy of the decision tree
        accuracy = self.error.binary_tree(decision_tree, self.train_data, self.target)

        # Assert that the classification should be 0.48124865678057166
        self.assertEqual(round(accuracy, 5), round(0.48124865678057166, 5))

    def test_02_greedy_recursive_high_depth_low_features(self):
        """Tests the greedy recursive function for the WeightedBinaryDecisionTrees class.

        We use the training data to build a deep decision tree on only two features, and measure
        the accuracy against some known good values.

        """
        # Create data weights that only count the first and last ten points
        data_weights = pd.Series([1.] * 10 + [0.] * (len(self.train_data) - 20) + [1.] * 10)

        # Create a decision tree
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data,
                                                                             ['grade.A', 'grade.B'],
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 2000,
                                                                              "minimum_error": 1e-15})

        # Compute the accuracy of the decision tree
        accuracy = self.error.binary_tree(decision_tree, self.train_data, self.target)

        # Assert that the classification should be 0.54148
        self.assertEqual(round(accuracy, 5), round(0.54148, 5))

    def test_03_greedy_recursive_high_depth_negative_weights(self):
        """Tests the greedy recursive function for the WeightedBinaryDecisionTrees class.

        We use the training data (with some negative data weights) to build a deep decision tree,
        and measure the accuracy against some known good values.

        """
        # Create data weights, with negative weights on the last ten points
        data_weights = pd.Series([1.] * 10 + [2.] * (len(self.train_data) - 20) + [-1.] * 10)

        # Create a decision tree
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data,
                                                                             ['grade.A', 'grade.B', 'grade.C',
                                                                              'grade.D', 'grade.E', 'grade.F',
                                                                              'grade.G', 'term. 36 months'],
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 20000,
                                                                              "minimum_error": 1e-15})

        # Compute the accuracy of the decision tree
        accuracy = self.error.binary_tree(decision_tree, self.train_data, self.target)

        # Assert that the classification should be 0.38491
        self.assertEqual(round(accuracy, 5), round(0.38491, 5))

    def test_04_adaboost(self):
        """Tests the adaboost algorithm.

        Tests the adaboost algorithm with a low number of iterations.

        """
        # Create two weighted binary decision trees
        weights_list, _ = self.adaboost.decision_tree(self.train_data, self.features, self.target, iterations=2,
                                                      model_dict={"predict_method": self.predict.binary_tree,
                                                                  "model": self.weighted_binary_decision_trees,
                                                                  "model_method": "greedy_recursive",
                                                                  "model_parameters": {"max_depth": 1,
                                                                                       "minimum_error": 1e-15,
                                                                                       "current_depth": 0}})

        # The weights have to equal [0.15802933659263743, 0.1768236329364191]
        self.assertEqual([round(i, 5) for i in weights_list],
                         [round(0.15802933659263743, 5), round(0.1768236329364191, 5)])

    def test_05_adaboost_high_iterations(self):
        """Tests the adaboost algorithm.

        Tests the adaboost algorithm with a high number of iterations.

        """
        # Create ten weighted binary decision trees
        weights_list, models_list = self.adaboost.decision_tree(self.train_data, self.features, self.target,
                                                                iterations=10,
                                                                model_dict={"predict_method":
                                                                                self.predict.binary_tree,
                                                                            "model":
                                                                                self.weighted_binary_decision_trees,
                                                                            "model_method": "greedy_recursive",
                                                                            "model_parameters": {"max_depth": 1,
                                                                                                 "minimum_error": 1e-15,
                                                                                                 "current_depth": 0}})

        # Get the combined predictions for each point in the test data
        predictions = self.predict.adaboost_binary_decision_tree(self.predict.binary_tree, models_list, weights_list,
                                                                 self.test_data)

        # Assert the first twenty predictions
        self.assertEqual(list(predictions)[0:20],
                         [-1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1])

        # Accuracy has to match 0.620314519604
        self.assertEqual(round(self.accuracy.decision_tree(self.test_data, predictions, self.target), 5),
                         round(0.620314519604, 5))
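
# The model weights asserted in test_04 and test_05 come from boosting. In the classic
# AdaBoost scheme, each weak model receives a weight derived from its weighted
# classification error, growing as the error falls below 0.5. The sketch below states that
# formula as an assumption about the AdaBoost class internals; it is not its actual API.
def _adaboost_model_weight_sketch(weighted_error):
    """Classic AdaBoost model weight: 0.5 * ln((1 - err) / err) (assumed formula)."""
    # Errors below 0.5 yield positive weights; errors above 0.5 yield negative ones
    return 0.5 * np.log((1. - weighted_error) / weighted_error)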