class TestLogisticRegressionL2Norm(unittest.TestCase):

    """Tests for LogisticRegressionL2Norm class.

    Uses Amazon data to test logistic regression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestLogisticRegression.

        Loads Amazon data, and creates training and testing data.

        """
        # Create an instance of the Convert Numpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of log likelihood
        self.log_likelihood = LogLikelihood()

        # Create an instance of the accuracy class
        self.accuracy = Accuracy()

        # Load the important words
        with open('./unit_tests/test_data/classification/amazon/important_words.json', 'r') as f:
            self.important_words = json.load(f)

        # Create an instance of the Logistic Regression with L2 Norm class
        self.logistic_regression_l2_norm = LogisticRegressionL2Norm()

        # Load the amazon baby train subset
        self.training_data = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset_train.csv')

        # Load the amazon baby validation subset
        self.validation_data = pd.read_csv('./unit_tests/test_data/'
                                           'classification/amazon/amazon_baby_subset_validation.csv')

    def test_01_gradient_ascent_no_penalty(self):
        """Tests gradient ascent algorithm.

        Tests the gradient ascent algorithm with no L2 penalty.

        """
        # We will use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix_train, label_train = self.convert_numpy.convert_to_numpy(self.training_data,
                                                                                features,
                                                                                output, 1)
        feature_matrix_valid, label_valid = self.convert_numpy.convert_to_numpy(self.validation_data,
                                                                                features,
                                                                                output, 1)
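        # The trailing 1 presumably tells convert_to_numpy to prepend a constant intercept
        # column of ones; 193 important words plus the intercept account for the 194
        # coefficients initialized below.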

        # Compute the coefficients
        coefficients = self.logistic_regression_l2_norm.gradient_ascent(feature_matrix_train, label_train,
                                                                        {"initial_coefficients": np.zeros(194),
                                                                         "step_size": 5e-6, "l2_penalty": 0,
                                                                         "max_iter": 501})

        # Get the accuracy
        train_accuracy = self.accuracy.logistic_regression(feature_matrix_train, label_train, coefficients)
        validation_accuracy = self.accuracy.logistic_regression(feature_matrix_valid, label_valid, coefficients)

        # Make sure the accuracies are correct
        self.assertEqual(round(0.785156157787, 5), round(train_accuracy, 5))
        self.assertEqual(round(0.78143964149, 5), round(validation_accuracy, 5))

    def test_02_gradient_ascent_10_penalty(self):
        """Test gradient ascent algorithm.

        Tests the gradient ascent algorithm with penalty.

        """
        # We will use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix_train, label_train = self.convert_numpy.convert_to_numpy(self.training_data,
                                                                                features,
                                                                                output, 1)
        feature_matrix_valid, label_valid = self.convert_numpy.convert_to_numpy(self.validation_data,
                                                                                features,
                                                                                output, 1)

        # Compute the coefficients
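        # With l2_penalty set to 10, each ascent step presumably also subtracts
        # 2 * l2_penalty * coefficient from the gradient of every non-intercept weight,
        # shrinking the coefficients. Compared with test_01, training accuracy dips slightly
        # (0.78516 -> 0.78499) while validation accuracy rises slightly (0.78144 -> 0.78172).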
        coefficients = self.logistic_regression_l2_norm.gradient_ascent(feature_matrix_train, label_train,
                                                                        {"initial_coefficients": np.zeros(194),
                                                                         "step_size": 5e-6, "l2_penalty": 10,
                                                                         "max_iter": 501})

        # Get the accuracy
        train_accuracy = self.accuracy.logistic_regression(feature_matrix_train, label_train, coefficients)
        validation_accuracy = self.accuracy.logistic_regression(feature_matrix_valid, label_valid, coefficients)

        # Make sure the accuracies are correct
        self.assertEqual(round(0.784990911452, 5), round(train_accuracy, 5))
        self.assertEqual(round(0.781719727383, 5), round(validation_accuracy, 5))

    def test_03_log_likelihood(self):
        """Tests log likelihood with l2 norm.

        Tests the log likelihood with l2 norm and compare it with known values.

        """
        # Generate a test feature matrix, coefficients, and labels
        feature_matrix = np.array([[1., 2., 3.], [1., -1., -1]])
        coefficients = np.array([1., 3., -1.])
        label = np.array([-1, 1])

        # Compute the log likelihood
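        # Sanity check of the expected value, assuming the usual L2-penalized log likelihood
        # where the penalty excludes the intercept w0:
        #   scores:    w.x1 = 1 + 6 - 3 = 4,   w.x2 = 1 - 3 + 1 = -1
        #   data term: (-4 - log(1 + e^-4)) + (0 - log(1 + e^1)) ~ -5.33141
        #   penalty:   10 * (3^2 + (-1)^2) = 100
        #   total:     -5.33141 - 100 = -105.33141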
        lg = self.log_likelihood.log_likelihood_l2_norm(feature_matrix, label, coefficients, 10)

        # Assert the value
        self.assertEqual(round(lg, 5), round(-105.33141, 5))


class TestWeightedLogisticRegression(unittest.TestCase):

    """Tests WeightedLogisticRegression class.

    Uses Amazon data to test WeightedLogisticRegression class.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestWeightedLogisticRegression.

        Loads Amazon data, and creates training and testing data.

        """
        self.convert_numpy = ConvertNumpy()
        self.predict = PredictOutput()
        self.ada = AdaBoost()
        self.accuracy = Accuracy()
        self.weighted_logistic_regression = WeightedLogisticRegression()

        # Load the important words
        with open('./unit_tests/test_data/classification/amazon/important_words.json', 'r') as f:
            self.important_words = json.load(f)

        # Load the amazon baby subset
        self.review_frame = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset.csv')

        # Review needs to be text
        self.review_frame['review'] = self.review_frame['review'].astype(str)

        # Clean up the punctuations
        self.review_frame['review_clean'] = self.review_frame.apply(
            axis=1,
            func=lambda row: str(row["review"]).translate(str.maketrans({key: None for key in string.punctuation})))

        # Replace any 'nan' text with an empty string
        self.review_frame['review_clean'] = self.review_frame.apply(
            axis=1,
            func=lambda row: '' if row["review_clean"] == "nan" else row["review_clean"])

        # Count the number of times each important word appears in each review, and store
        # the count in an independent column
        for word in self.important_words:
            self.review_frame[word] = self.review_frame['review_clean'].apply(lambda s, w=word: s.split().count(w))

    def test_01_gradient_ascent(self):
        """Tests gradient ascent algorithm.

        Tests the gradient ascent algorithm and compares it with known values.

        """
        # We will use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.review_frame, features, output, 1)

        # Create weight list for training data
        weights_list = np.array([1]*len(self.review_frame))
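        # With uniform unit weights, weighted logistic regression should reduce to
        # ordinary (unweighted) logistic regression on the same data.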

        # Compute the coefficients
        coefficients = self.weighted_logistic_regression.gradient_ascent(feature_matrix, sentiment,
                                                                         {"initial_coefficients": np.zeros(194),
                                                                          "weights_list": weights_list,
                                                                          "step_size": 1e-7, "max_iter": 30})

        # Assert the coefficients
        self.assertEqual([round(i, 5) for i in coefficients[0:20]],
                         [round(i, 5) for i in [0.0002, 0.00143, -0.00131, 0.00689, 0.00685,
                                                0.00034, -0.00624, -0.00059, 0.00671, 0.00466,
                                                0.00043, 0.00203, 0.00303, -0.00332, 0.0015,
                                                -0.00011, 0.00115, -0.00217, -0.00139, -0.00466]])

        # Compute predictions
        predictions = self.predict.logistic_regression(feature_matrix, coefficients)

        # Accuracy has to match 0.74357
        self.assertEqual(round(self.accuracy.general(predictions, sentiment), 5),
                         round(0.74357, 5))

    def test_02_adaboost(self):
        """Tests adaboost algorithm.

        Tests the adaboost algorithm with weighted logistic regression.

        """
        # We will use important words for the output
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.review_frame, features, output, 1)

        # Create an ensemble of 15 weighted logistic regression models
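        # Assuming the standard AdaBoost scheme, misclassified examples are up-weighted after
        # each round, so every subsequent weighted logistic regression focuses on the points
        # its predecessors got wrong.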
        weights, models = self.ada.logistic_regression(feature_matrix, sentiment,
                                                       iterations=15,
                                                       model_dict={"predict_method": self.predict.logistic_regression,
                                                                   "model": self.weighted_logistic_regression,
                                                                   "model_method": "gradient_ascent",
                                                                   "model_parameters": {"step_size": 1e-7,
                                                                                        "max_iter": 30,
                                                                                        "initial_coefficients":
                                                                                            np.zeros(194)}})

        # Get the combined predictions of the ensemble on the feature matrix
        predictions = self.predict.adaboost_logistic_regression(self.predict.logistic_regression,
                                                                models, weights, feature_matrix)

        # Assert the predictions
        self.assertEqual(list(predictions)[0:20],
                         [1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1])

        # Accuracy has to match 0.77613
        self.assertEqual(round(self.accuracy.general(predictions, sentiment), 5),
                         round(0.77613, 5))


class TestWeightedBinaryDecisionTrees(unittest.TestCase):

    """Tests for the BinaryDecisionTrees class.

    Uses lending club data to test binary decision trees.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for WeightedBinaryDecisionTrees.

        We clean up loans_data by one hot encoding the features list. In the end we use
        pre-built data for training and testing, but that data was produced with the same
        one hot encoding method.

        """
        self.weighted_binary_decision_trees = WeightedBinaryDecisionTrees()
        self.adaboost = AdaBoost()
        self.predict = PredictOutput()
        self.accuracy = Accuracy()
        self.error = Error()

        # Pandas type set
        dtype_dict = {'grade': str, 'term': str, 'emp_length': str, 'bad_loans': int}

        # Load the lending club data
        self.loans_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_data.csv',
                                      dtype=dtype_dict)

        # List the features and the target that we are interested in
        self.features = ['grade', 'term', 'home_ownership', 'emp_length']
        self.target = 'safe_loans'

        # Do a one hot encoding of features
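        # e.g. a 'grade' column with values 'A' and 'B' becomes indicator columns 'grade.A'
        # and 'grade.B' (note prefix_sep='.'), which is why later tests can reference
        # features such as 'grade.A'.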
        for feature in self.features:
            # One hot encode
            loans_data_one_hot_encoded = pd.get_dummies(self.loans_data[feature],
                                                        prefix=feature, prefix_sep='.')

            # Drop the feature
            self.loans_data.drop(feature, axis=1, inplace=True)

            # Join the frame with the new one hot encoded features
            self.loans_data = pd.concat([self.loans_data, loans_data_one_hot_encoded], axis=1)

        # Update our features to the one hot encoded column names
        self.features = list(self.loans_data.columns.values)
        self.features.remove('safe_loans')

        # Load our training and testing data
        self.train_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_train.csv')
        self.test_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_test.csv')

    def test_01_greedy_recursive(self):
        """Tests greedy recursive function for BinaryDecisionTrees class

        We will use the training data to build a decision tree, and measure the accuracy with some known good values.

        """
        # Create data weights
        data_weights = pd.Series([1.] * 10 + [0.] * (len(self.train_data) - 20) + [1.] * 10)
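        # Points with zero weight contribute nothing to the weighted error, so the tree is
        # effectively fit to only the first and last 10 rows of the training data.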

        # Create a decision tree
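        # greedy_recursive presumably picks, at each node, the feature split that minimizes
        # the weighted classification error, recursing until max_depth is reached or the
        # error drops below minimum_error.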
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data, self.features,
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 2,
                                                                              "minimum_error": 1e-15})

        # Compute the classification error of the decision tree
        error = self.error.binary_tree(decision_tree, self.train_data, self.target)

        # Assert that the classification error should be 0.48124865678057166
        self.assertEqual(round(error, 5), round(0.48124865678057166, 5))

    def test_02_greedy_recursive_high_depth_low_features(self):
        """Tests greedy recursive function for BinaryDecisionTrees class

        We will use the training data to build a decision tree, and measure the accuracy with some known good values.

        """
        # Create data weights
        data_weights = pd.Series([1.] * 10 + [0.] * (len(self.train_data) - 20) + [1.] * 10)

        # Create a decision tree
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data, ['grade.A', 'grade.B'],
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 2000,
                                                                              "minimum_error": 1e-15})

        # Compute the classification error of the decision tree
        error = self.error.binary_tree(decision_tree, self.train_data, self.target)

        # Assert that the classification error should be 0.54148
        self.assertEqual(round(error, 5), round(0.54148, 5))

    def test_03_greedy_recursive_mixed_weights(self):
        """Tests the greedy recursive function of the WeightedBinaryDecisionTrees class.

        We will use the training data and mixed positive and negative data weights to build a
        decision tree, and compare the classification error against known good values.

        """
        # Create data weights
        data_weights = pd.Series([1.] * 10 + [2.] * (len(self.train_data) - 20) + [-1.] * 10)

        # Create a decision tree
        decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data, ['grade.A', 'grade.B',
                                                                                               'grade.C', 'grade.D',
                                                                                               'grade.E', 'grade.F',
                                                                                               'grade.G',
                                                                                               'term. 36 months'],
                                                                             self.target,
                                                                             {"data_weights": data_weights,
                                                                              "current_depth": 0,
                                                                              "max_depth": 20000,
                                                                              "minimum_error": 1e-15})

        # Compute the classification error of the decision tree
        error = self.error.binary_tree(decision_tree, self.train_data, self.target)

        # Assert that the classification error should be 0.38491
        self.assertEqual(round(error, 5), round(0.38491, 5))

    def test_04_adaboost(self):
        """Tests the adaboost algorithm.

        Tests the adaboost algorithm with a low number of iterations.

        """
        # Create two weighted binary decision trees
        weights_list, _ = self.adaboost.decision_tree(self.train_data, self.features, self.target,
                                                      iterations=2,
                                                      model_dict={"predict_method": self.predict.binary_tree,
                                                                  "model": self.weighted_binary_decision_trees,
                                                                  "model_method": "greedy_recursive",
                                                                  "model_parameters": {"max_depth": 1,
                                                                                       "minimum_error": 1e-15,
                                                                                       "current_depth": 0}})

        # The weights have to equal [0.15802933659263743, 0.1768236329364191]
        self.assertEqual([round(i, 5) for i in weights_list],
                         [round(0.15802933659263743, 5), round(0.1768236329364191, 5)])

    def test_05_adaboost_high_iterations(self):
        """Tests the adaboost algorithm.

        Tests the adaboost algorithm with a high number of iterations.

        """
        # Create ten weighted binary decision trees
        weights_list, models_list = self.adaboost.decision_tree(self.train_data, self.features, self.target,
                                                                iterations=10,
                                                                model_dict={"predict_method": self.predict.binary_tree,
                                                                            "model": self.weighted_binary_decision_trees,
                                                                            "model_method": "greedy_recursive",
                                                                            "model_parameters": {"max_depth": 1,
                                                                                                 "minimum_error": 1e-15,
                                                                                                 "current_depth": 0
                                                                                                 }})

        # Get the combined predictions of the ensemble on the test data
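        # Assuming the usual AdaBoost combination, each prediction is
        # sign(sum_t weight_t * tree_t(x)) over the ten weighted stumps.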
        predictions = self.predict.adaboost_binary_decision_tree(self.predict.binary_tree, models_list, weights_list,
                                                                 self.test_data)

        # Assert the predictions
        self.assertEqual(list(predictions)[0:20],
                         [-1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1])

        # Accuracy has to match 0.620314519604
        self.assertEqual(round(self.accuracy.decision_tree(self.test_data,
                                                           predictions,
                                                           self.target),
                               5),
                         round(0.620314519604, 5))