Example no. 1
class TestNaiveBayes(unittest.TestCase):
    """ This class contains unit tests for the naive bayes algorithm"""
    def setUp(self) -> None:
        self.x, self.y = load_iris(return_X_y=True)

        self.x = preprocessing.scale(self.x).T
        self.y = self.y.reshape(1, -1)

        self.x1, self.y1 = load_wine(return_X_y=True)
        self.x1 = self.x1.T
        self.y1 = self.y1.reshape(1, -1)
        self.k_obj = KFoldCrossValidation()
        self.naive_bayes = GaussianNaiveBayes()

        return super().setUp()

    def test1(self):
        # achieves a k-fold validated score of 96.6% -> very good.
        k_score = self.k_obj.get_k_score(self.x, self.y, accuracy,
                                         self.naive_bayes)
        self.assertGreaterEqual(k_score, 0.90)

    def test2(self):
        # achieves a k-fold validated score of 100% -> very good
        k_score = self.k_obj.get_k_score(self.x1, self.y1, accuracy,
                                         self.naive_bayes)
        self.assertGreaterEqual(k_score, 0.90)
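Neither KFoldCrossValidation nor GaussianNaiveBayes is shown in these snippets. For orientation, here is a minimal sketch of what get_k_score could look like given how the tests call it: features as rows, labels as a row vector, a metric callable, an estimator with fit/predict, and optional fit keyword arguments. The body is an assumption, not the repository's actual implementation.

import numpy as np

class KFoldCrossValidation:
    """ Minimal k-fold cross-validation sketch. Assumes x has shape
    (n_features, n_examples) and y has shape (num_outputs, n_examples),
    matching the layout conventions used in the tests. """
    def get_k_score(self, x, y, metric, model, k=5, **fit_kwargs):
        n_examples = x.shape[1]
        folds = np.array_split(np.arange(n_examples), k)
        scores = []
        for fold in folds:
            train_idx = np.setdiff1d(np.arange(n_examples), fold)
            # train on the other k-1 folds, score on the held-out fold
            model.fit(x[:, train_idx], y[:, train_idx], **fit_kwargs)
            scores.append(metric(y[:, fold], model.predict(x[:, fold])))
        return np.mean(scores)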
Example no. 2
    def setUp(self):
        self.x1, self.y1 = load_breast_cancer(return_X_y=True)
        self.x1 = self.x1.T
        self.y1 = self.y1.T.reshape(1, -1)

        self.x2, self.y2 = load_iris(return_X_y=True)
        self.x2 = self.x2.T
        self.y2 = self.y2.T.reshape(1, -1)
        self.k_cv = KFoldCrossValidation()
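Every fixture in these examples uses the same data layout: features as rows, examples as columns, and labels as a single row vector. A quick shape check against the sklearn loaders (which return data as (n_examples, n_features)) makes the two transforms above concrete; note that .T is a no-op on the 1-D label array, so the reshape is what produces the row vector.

from sklearn.datasets import load_breast_cancer

x, y = load_breast_cancer(return_X_y=True)
print(x.shape)                   # (569, 30)  -> sklearn layout
print(x.T.shape)                 # (30, 569)  -> features as rows
print(y.T.reshape(1, -1).shape)  # (1, 569)   -> labels as a row vector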
Example no. 3
    def setUp(self) -> None:
        self.x, self.y = load_iris(return_X_y=True)

        self.x = preprocessing.scale(self.x).T
        self.y = self.y.reshape(1, -1)

        self.x1, self.y1 = load_wine(return_X_y=True)
        self.x1 = self.x1.T
        self.y1 = self.y1.reshape(1, -1)
        self.k_obj = KFoldCrossValidation()
        self.naive_bayes = GaussianNaiveBayes()

        return super().setUp()
    def test_1(self):
        X, Y = load_breast_cancer(return_X_y=True)
        X = preprocessing.scale(X).T
        Y = Y.T.reshape(1, -1)

        LR1 = LogisticRegression(X.shape[0], classificationThreshold=0.5)

        output = KFoldCrossValidation().get_k_score(X, Y, accuracy, LR1)

        self.assertGreaterEqual(output, 0.95)
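The accuracy metric these tests pass into get_k_score is defined elsewhere; given the (1, N) label layout, a minimal assumed version is simply the fraction of matching entries:

import numpy as np

def accuracy(y_true, y_pred):
    """ Fraction of predictions matching the labels. Both arrays are
    assumed to have shape (1, n_examples). """
    return np.mean(y_true == y_pred)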
Example no. 5
    def setUp(self):
        self.x, self.y = load_iris(return_X_y=True)
        self.x = preprocessing.scale(self.x).T
        self.y_encoded = one_hot_encode(self.y)
        self.softmax_model_no_regularization = SoftmaxRegression(
            self.x.shape[0], len(self.y_encoded))

        self.softmax_model_l1_regularization = SoftmaxRegression(
            self.x.shape[0],
            len(self.y_encoded),
            regularization="L1",
            reg_parameter=0.01)

        self.softmax_model_l2_regularization = SoftmaxRegression(
            self.x.shape[0],
            len(self.y_encoded),
            regularization="L2",
            reg_parameter=0.01)

        self.k_fold_obj = KFoldCrossValidation()
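Note that the tests use len(self.y_encoded) as the number of classes, which implies one_hot_encode returns a matrix of shape (num_classes, n_examples). A sketch consistent with that usage (an assumption, since the helper is not shown):

import numpy as np

def one_hot_encode(y):
    """ Encode a 1-D label vector as a (num_classes, n_examples) matrix,
    so len(result) gives the number of classes. Assumed implementation. """
    classes = np.unique(y)
    encoded = np.zeros((len(classes), len(y)))
    for row, cls in enumerate(classes):
        encoded[row, y == cls] = 1
    return encoded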
Example no. 6
    def test1(self):
        """ In order to train the weights for every logistic regression
        model, you have to train for a tonne of epochs.

        Training for 10 epochs gets cross val score of 0.78
        Training for 350 epochs gets ~0.95. 450 epochs ~0.96
        """
        num_classes = len(np.unique(self.y))

        one_vs_all_logistic_regression = OneVsAllLogisticRegression(
            num_classes, self.x.shape[0], num_epochs=450, learn_rate=0.3)

        cross_val = KFoldCrossValidation()
        k_score = cross_val.get_k_score(self.x, self.y, accuracy,
                                        one_vs_all_logistic_regression)

        sklearn_log = LR(penalty="none", multi_class="ovr")

        sklearn_score = np.average(
            cross_val_score(sklearn_log, self.x.T, self.y.ravel()))

        difference = abs(sklearn_score - k_score)
        self.assertLessEqual(difference, 0.1)
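For context on what the test above is validating: one-vs-all reduces a k-class problem to k binary logistic regressions and predicts the class whose classifier assigns the highest probability. A schematic of the predict step, with a hypothetical predict_proba interface on the binary models:

import numpy as np

def one_vs_all_predict(binary_models, x):
    """ x has shape (n_features, n_examples); each model's predict_proba
    is assumed to return a (1, n_examples) vector of P(class == i | x).
    Returns the index of the most confident classifier per example. """
    probs = np.vstack([m.predict_proba(x) for m in binary_models])
    return np.argmax(probs, axis=0).reshape(1, -1)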
class RegressionTreeTests(unittest.TestCase):
    """ This class contains test for the regression decision tree
    algorithm """
    def setUp(self):
        self.x1, self.y1 = load_boston(return_X_y=True)
        self.x1 = self.x1.T
        self.y1 = self.y1.T.reshape(1, -1)
        self.k_cv = KFoldCrossValidation()

    def tearDown(self):
        self.x1 = None
        self.y1 = None
        self.k_cv = None

    def testFit(self):
        # The tree is unconstrained while fitting, so we should be able
        # to achieve zero error, i.e. all residuals are 0
        regression_obj = RegressionTree(minSamplesSplit=1)
        regression_obj.fit(self.x1, self.y1)
        predictions = regression_obj.predict(self.x1)
        rmse = root_mean_squared_error(self.y1, predictions)
        mae = mean_absolute_error(self.y1, predictions)
        mse = mean_squared_error(self.y1, predictions)
        self.assertEqual(mae, 0)
        self.assertEqual(mse, 0)
        self.assertEqual(rmse, 0)

        # test generalization of regression tree
        regression_obj2 = RegressionTree(minSamplesSplit=1)
        kScoreRMSE = self.k_cv.get_k_score(self.x1, self.y1,
                                           root_mean_squared_error,
                                           regression_obj2)
        kScoreMSE = self.k_cv.get_k_score(self.x1, self.y1, mean_squared_error,
                                          regression_obj2)
        kScoreMAE = self.k_cv.get_k_score(self.x1, self.y1,
                                          mean_absolute_error, regression_obj2)
        # Dataset is easy so we should expect 0 error
        self.assertEqual(kScoreRMSE, 0)
        self.assertEqual(kScoreMSE, 0)
        self.assertEqual(kScoreMAE, 0)

    def test_sanityChecks(self):
        # If we regularize the tree, we should get a higher
        # error than if we don't
        regression_obj2 = RegressionTree(minSamplesSplit=5,
                                         maxDepth=3,
                                         min_impurity_decrease=0.15)
        regression_obj2.fit(self.x1, self.y1)
        predictions2 = regression_obj2.predict(self.x1)
        error2 = root_mean_squared_error(self.y1, predictions2)

        # Sanity checks - the regularization is so high that the tree is a
        # single leaf, so every prediction equals the mean of the labels,
        # and hence the RSS of the predictions should equal the TSS of
        # the labels
        regression_obj3 = RegressionTree(minSamplesSplit=10,
                                         maxDepth=0,
                                         min_impurity_decrease=0.15)
        regression_obj3.fit(self.x1, self.y1)
        predictions3 = regression_obj3.predict(self.x1)

        regression_obj4 = RegressionTree(minSamplesSplit=1000,
                                         maxDepth=10,
                                         min_impurity_decrease=0.15)
        regression_obj4.fit(self.x1, self.y1)
        predictions4 = regression_obj4.predict(self.x1)

        regression_obj5 = RegressionTree(minSamplesSplit=10,
                                         maxDepth=10,
                                         min_impurity_decrease=1)
        regression_obj5.fit(self.x1, self.y1)
        predictions5 = regression_obj5.predict(self.x1)

        self.assertGreaterEqual(error2, 0)
        self.assertAlmostEqual(residual_sum_of_squares(self.y1, predictions3),
                               total_sum_of_squares(self.y1))
        self.assertAlmostEqual(residual_sum_of_squares(self.y1, predictions4),
                               total_sum_of_squares(self.y1))
        self.assertAlmostEqual(residual_sum_of_squares(self.y1, predictions5),
                               total_sum_of_squares(self.y1))
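The last three asserts rest on the identity that a single-leaf tree predicts the label mean, which makes the residual sum of squares equal the total sum of squares. Minimal assumed versions of the two helpers:

import numpy as np

def residual_sum_of_squares(y_true, y_pred):
    # RSS = sum((y - y_hat)^2)
    return np.sum((y_true - y_pred) ** 2)

def total_sum_of_squares(y_true):
    # TSS = sum((y - mean(y))^2); equals the RSS whenever every
    # prediction is the label mean, i.e. a single-leaf tree
    return np.sum((y_true - np.mean(y_true)) ** 2)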
Example no. 9
class SoftmaxTests(unittest.TestCase):
    """ This class contains unit tests for the softmax regression
    algorithm.

    A few notes:
        With L1 and L2 regularization, the classifier performs as
        expected -> performance is very sensitive to reg_parameter. If the
        regularization parameter is even slightly high (>0.3), the
        performance of the L1- and L2-regularized softmax regression
        models falters heavily.
    """
    def setUp(self):
        self.x, self.y = load_iris(return_X_y=True)
        self.x = preprocessing.scale(self.x).T
        self.y_encoded = one_hot_encode(self.y)
        self.softmax_model_no_regularization = SoftmaxRegression(
            self.x.shape[0], len(self.y_encoded))

        self.softmax_model_l1_regularization = SoftmaxRegression(
            self.x.shape[0],
            len(self.y_encoded),
            regularization="L1",
            reg_parameter=0.01)

        self.softmax_model_l2_regularization = SoftmaxRegression(
            self.x.shape[0],
            len(self.y_encoded),
            regularization="L2",
            reg_parameter=0.01)

        self.k_fold_obj = KFoldCrossValidation()

    def test_softmax_no_reg(self):

        # Strength of RMSProp shown here - roughly a 6% accuracy increase
        # with it: 99.3% with RMSProp vs 93.7% with plain gradient descent
        cross_val_score_gradient_descent = self.k_fold_obj.get_k_score(
            self.x,
            self.y_encoded,
            accuracy,
            self.softmax_model_no_regularization,
            numEpochs=100,
            learn_rate=0.2,
            k=8)

        cross_val_score_rms_prop = self.k_fold_obj.get_k_score(
            self.x,
            self.y_encoded,
            accuracy,
            self.softmax_model_no_regularization,
            numEpochs=100,
            learn_rate=0.2,
            k=8,
            optim=RMSProp())

        # Adam is the most sensitive out of the three tested and requires the
        # most hyperparameter tuning
        _, train_acc = self.softmax_model_no_regularization.fit(
            self.x,
            self.y_encoded,
            num_epochs=1000,
            learn_rate=0.01,
            optim=Adam(),
            ret_train_loss=True)

        cross_val_score_adam = self.k_fold_obj.get_k_score(
            self.x,
            self.y_encoded,
            accuracy,
            self.softmax_model_no_regularization,
            numEpochs=1000,
            learn_rate=0.01,
            k=8,
            optim=Adam())

        self.assertGreaterEqual(np.average(train_acc), 0.90)
        self.assertGreaterEqual(cross_val_score_gradient_descent, 0.90)
        self.assertGreaterEqual(cross_val_score_rms_prop, 0.96)
        self.assertGreaterEqual(cross_val_score_adam, 0.85)

    def test_softmax_reg(self):

        cross_val_score_l1_reg = self.k_fold_obj.get_k_score(
            self.x,
            self.y_encoded,
            accuracy,
            self.softmax_model_l1_regularization,
            numEpochs=150,
            learn_rate=0.01,
            k=8)

        cross_val_score_l2_reg = self.k_fold_obj.get_k_score(
            self.x,
            self.y_encoded,
            accuracy,
            self.softmax_model_l2_regularization,
            numEpochs=150,
            learn_rate=0.01,
            k=8)

        self.assertGreaterEqual(cross_val_score_l1_reg, 0.80)
        self.assertGreaterEqual(cross_val_score_l2_reg, 0.80)
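The optimizers passed via optim= are likewise external to these snippets. RMSProp, which the comment above credits with the accuracy jump over plain gradient descent, divides each gradient by a running RMS of past gradients. A minimal sketch with an assumed update_params interface and the usual default hyperparameters:

import numpy as np

class RMSProp:
    def __init__(self, beta=0.9, eps=1e-8):
        self.beta = beta    # decay rate of the squared-gradient average
        self.eps = eps      # numerical stability term
        self.cache = None

    def update_params(self, params, grads, learn_rate):
        if self.cache is None:
            self.cache = [np.zeros_like(g) for g in grads]
        updated = []
        for p, g, c in zip(params, grads, self.cache):
            # exponential moving average of squared gradients (in place)
            c *= self.beta
            c += (1 - self.beta) * g ** 2
            updated.append(p - learn_rate * g / (np.sqrt(c) + self.eps))
        return updated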
Example no. 10
class TestsClassificationTree(unittest.TestCase):
    """ This class contains unit tests for the classification tree algorithm """
    def setUp(self):
        self.x1, self.y1 = load_breast_cancer(return_X_y=True)
        self.x1 = self.x1.T
        self.y1 = self.y1.T.reshape(1, -1)

        self.x2, self.y2 = load_iris(return_X_y=True)
        self.x2 = self.x2.T
        self.y2 = self.y2.T.reshape(1, -1)
        self.k_cv = KFoldCrossValidation()

    def tearDown(self):
        self.x1 = None
        self.y1 = None
        self.x2 = None
        self.y2 = None
        self.k_cv = None

    def test_multi_class(self):
        # Should be able to overfit multiclass data easily.
        # Notice the lack of preprocessing - no need to normalize features
        # and no need to one-hot encode labels.

        # out-of-the-box model :D
        classification_obj = ClassificationTree(entropy=False,
                                                minSamplesSplit=1)
        classification_obj.fit(self.x2, self.y2)
        predictions = classification_obj.predict(self.x2)
        acc = accuracy(self.y2, predictions)
        self.assertEqual(acc, 1)

        classification_obj2 = ClassificationTree(entropy=False,
                                                 minSamplesSplit=1)
        kScore = self.k_cv.get_k_score(self.x2, self.y2, accuracy,
                                       classification_obj2)
        self.assertEqual(kScore, 1)

    def test_binary(self):
        # The tree we are growing is unconstrained, so it should fit the
        # training set perfectly - 100% accuracy (AKA overfitting :) )
        classification_obj = ClassificationTree(entropy=False,
                                                minSamplesSplit=1)
        classification_obj.fit(self.x1, self.y1)
        predictions = classification_obj.predict(self.x1)
        acc = accuracy(self.y1, predictions)
        self.assertEqual(acc, 1)
        classification_obj2 = ClassificationTree(entropy=False,
                                                 minSamplesSplit=1)
        kScore = self.k_cv.get_k_score(self.x1, self.y1, accuracy,
                                       classification_obj2)
        self.assertEqual(kScore, 1)

    def test_sanity_checks(self):
        classification_obj2 = ClassificationTree(entropy=False,
                                                 minSamplesSplit=5,
                                                 maxDepth=3,
                                                 min_impurity_decrease=0.09)
        classification_obj2.fit(self.x1, self.y1)
        predictions2 = classification_obj2.predict(self.x1)
        acc2 = accuracy(self.y1, predictions2)

        classification_obj3 = ClassificationTree(entropy=False,
                                                 minSamplesSplit=10,
                                                 maxDepth=0,
                                                 min_impurity_decrease=0.15)
        classification_obj3.fit(self.x1, self.y1)
        predictions3 = classification_obj3.predict(self.x1)
        acc3 = accuracy(self.y1, predictions3)

        classification_obj4 = ClassificationTree(entropy=False,
                                                 minSamplesSplit=1000,
                                                 maxDepth=10,
                                                 min_impurity_decrease=0.15)
        classification_obj4.fit(self.x1, self.y1)
        predictions4 = classification_obj4.predict(self.x1)
        acc4 = accuracy(self.y1, predictions4)

        classification_obj5 = ClassificationTree(entropy=False,
                                                 minSamplesSplit=10,
                                                 maxDepth=10,
                                                 min_impurity_decrease=1)
        classification_obj5.fit(self.x1, self.y1)
        predictions5 = classification_obj5.predict(self.x1)
        acc5 = accuracy(self.y1, predictions5)

        self.assertLessEqual(acc2, 1)
        self.assertAlmostEqual(acc3, 0.6274165202108963)
        self.assertAlmostEqual(acc4, 0.6274165202108963)
        self.assertAlmostEqual(acc5, 0.6274165202108963)
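Finally, the entropy=False flag used throughout selects the split criterion; with it off, the tree presumably falls back to Gini impurity. Both node-impurity measures, as assumed implementations over a node's label vector:

import numpy as np

def gini_impurity(labels):
    # 1 - sum(p_i^2) over the class proportions p_i; 0 for a pure node
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def entropy_impurity(labels):
    # -sum(p_i * log2(p_i)); also 0 for a pure node
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))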