import json
import string
import unittest

import numpy as np
import pandas as pd

# Project-specific helpers used below (ConvertNumpy, LogLikelihood,
# PredictOutput, LogisticRegression, ConfusionMatrix) must also be imported;
# their module paths are not shown in this snippet.


class TestLogisticRegression(unittest.TestCase):

    """Tests for LogisticRegression class.

    Uses Amazon data to test logistic regression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestLogisticRegression.

        Loads Amazon data, and creates training and testing data.

        """
        self.convert_numpy = ConvertNumpy()
        self.log_likelihood = LogLikelihood()
        self.predict_output = PredictOutput()
        self.logistic_regression = LogisticRegression()
        self.confusion_matrix = ConfusionMatrix()

        # Load the important words
        with open('./unit_tests/test_data/classification/amazon/important_words.json', 'r') as words_file:
            self.important_words = json.load(words_file)

        # Load the amazon baby subset
        self.review_frame = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset.csv')

        # Review needs to be text; astype returns a new Series, so assign it back
        self.review_frame['review'] = self.review_frame['review'].astype(str)

        # Strip punctuation from each review
        self.review_frame['review_clean'] = self.review_frame.apply(
            axis=1,
            func=lambda row: str(row["review"]).translate(str.maketrans({key: None for key in string.punctuation})))

        # Replace 'nan' text (from missing reviews) with an empty string
        self.review_frame['review_clean'] = self.review_frame.apply(
            axis=1,
            func=lambda row: '' if row["review_clean"] == "nan" else row["review_clean"])

        # Count how many times each important word appears in each review, and store it in its own column
        for word in self.important_words:
            self.review_frame[word] = self.review_frame['review_clean'].apply(lambda s, w=word: s.split().count(w))
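        # For example, a review_clean value of 'great great value' yields
        # 'great great value'.split().count('great') == 2 in the 'great' column.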

        # Load training data
        self.train_frame = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset_train_mod2.csv')

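    # The tests below assume the standard logistic model
    # P(y = +1 | x, w) = 1 / (1 + exp(-w . x)). As a rough, hypothetical sketch
    # (not the library's verified code), each gradient ascent step presumably
    # looks like:
    #
    #     predictions = 1.0 / (1.0 + np.exp(-feature_matrix.dot(coefficients)))
    #     indicator = (sentiment == +1)
    #     coefficients += step_size * feature_matrix.T.dot(indicator - predictions)
    #
    # repeated max_iter times starting from initial_coefficients.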
    def test_01_gradient_ascent(self):
        """Test gradient ascent algorithm.

        Tests the gradient ascent algorithm and compares its coefficients with known values.

        """
        # Use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.review_frame, features, output, 1)

        # Compute the coefficients
        coefficients = self.logistic_regression.gradient_ascent(feature_matrix, sentiment,
                                                                {"initial_coefficients": np.zeros(194),
                                                                 "step_size": 1e-7, "max_iter": 301})

        # Reference coefficients to compare with the computed coefficients
        real_coef = [5.16220157e-03,   1.55656966e-02,  -8.50204675e-03,
                     6.65460842e-02,   6.58907629e-02,   5.01743882e-03,
                     -5.38601484e-02,  -3.50488413e-03,   6.47945868e-02,
                     4.54356263e-02,   3.98353364e-03,   2.00775410e-02,
                     3.01350011e-02,  -2.87115530e-02,   1.52161964e-02,
                     2.72592062e-04,   1.19448177e-02,  -1.82461935e-02,
                     -1.21706420e-02,  -4.15110334e-02,   2.76820391e-03,
                     1.77031999e-02,  -4.39700067e-03,   4.49764014e-02,
                     9.90916464e-03,   8.99239081e-04,  -1.36219516e-03,
                     1.26859357e-02,   8.26466695e-03,  -2.77426972e-02,
                     6.10128809e-04,   1.54084501e-02,  -1.32134753e-02,
                     -3.00512492e-02,   2.97399371e-02,   1.84087080e-02,
                     2.86178752e-03,  -1.05768015e-02,  -6.57350362e-04,
                     -1.01476555e-02,  -4.79579528e-03,   7.50891810e-03,
                     4.27938289e-03,   3.06785501e-03,  -2.20317661e-03,
                     9.57273354e-03,   9.91666827e-05,  -1.98462567e-02,
                     1.75702722e-02,   1.55478612e-03,  -1.77375440e-02,
                     9.78324102e-03,   1.17031606e-02,  -7.35345937e-03,
                     -6.08714030e-03,   6.43766808e-03,   1.07159665e-02,
                     -3.05345476e-03,   7.17190727e-03,   5.73320003e-03,
                     4.60661876e-03,  -5.20588421e-03,   6.71012331e-03,
                     9.03281814e-03,   1.74563147e-03,   6.00279979e-03,
                     1.20181744e-02,  -1.83594607e-02,  -6.91010811e-03,
                     -1.38687273e-02,  -1.50406590e-02,   5.92353611e-03,
                     5.67478991e-03,  -5.28786220e-03,   3.08147864e-03,
                     5.53751236e-03,   1.49917916e-02,  -3.35666000e-04,
                     -3.30695153e-02,  -4.78990943e-03,  -6.41368859e-03,
                     7.99938935e-03,  -8.61390444e-04,   1.68052959e-02,
                     1.32539901e-02,   1.72307051e-03,   2.98030675e-03,
                     8.58284300e-03,   1.17082481e-02,   2.80825907e-03,
                     2.18724016e-03,   1.68824711e-02,  -4.65973741e-03,
                     1.51368285e-03,  -1.09509122e-02,   9.17842898e-03,
                     -1.88572281e-04,  -3.89820373e-02,  -2.44821005e-02,
                     -1.87023714e-02,  -2.13943485e-02,  -1.29690465e-02,
                     -1.71378670e-02,  -1.37566767e-02,  -1.49770449e-02,
                     -5.10287978e-03,  -2.89789761e-02,  -1.48663194e-02,
                     -1.28088380e-02,  -1.07709355e-02,  -6.95286915e-03,
                     -5.04082164e-03,  -9.25914404e-03,  -2.40427481e-02,
                     -2.65927785e-02,  -1.97320937e-03,  -5.04127508e-03,
                     -7.00791912e-03,  -3.48088523e-03,  -6.40958916e-03,
                     -4.07497010e-03,  -6.30054296e-03,  -1.09187932e-02,
                     -1.26051900e-02,  -1.66895314e-03,  -7.76418781e-03,
                     -5.15960485e-04,  -1.94199551e-03,  -1.24761586e-03,
                     -5.01291731e-03,  -9.12049191e-03,  -7.22098801e-03,
                     -8.31782981e-03,  -5.60573348e-03,  -1.47098335e-02,
                     -9.31520819e-03,  -2.22034402e-03,  -7.07573098e-03,
                     -5.10115608e-03,  -8.93572862e-03,  -1.27545713e-02,
                     -7.04171991e-03,  -9.76219676e-04,   4.12091713e-04,
                     8.29251160e-04,   2.64661064e-03,  -7.73228782e-03,
                     1.53471164e-03,  -7.37263060e-03,  -3.73694386e-03,
                     -3.81416409e-03,  -1.64575145e-03,  -3.31887732e-03,
                     1.22257832e-03,   1.36699286e-05,  -3.01866601e-03,
                     -1.02826343e-02,  -1.06691327e-02,   2.23639046e-03,
                     -9.87424798e-03,  -1.02192048e-02,  -3.41330929e-03,
                     3.34489960e-03,  -3.50984516e-03,  -6.26283150e-03,
                     -7.22419943e-03,  -5.47016154e-03,  -1.25063947e-02,
                     -2.47805699e-03,  -1.60017985e-02,  -6.40098934e-03,
                     -4.26644386e-03,  -1.55376990e-02,   2.31349237e-03,
                     -9.06653337e-03,  -6.30012672e-03,  -1.21010303e-02,
                     -3.02578875e-03,  -6.76289718e-03,  -5.65498722e-03,
                     -6.87050239e-03,  -1.18950595e-02,  -1.86489236e-04,
                     -1.15230476e-02,   2.81533219e-03,  -8.10150295e-03,
                     -1.00062131e-02,   4.02037651e-03,  -5.44300346e-03,
                     2.85818985e-03,   1.19885003e-04,  -6.47587687e-03,
                     -1.14493516e-03,  -7.09205934e-03]

        # Each computed coefficient must match the reference value
        for pred_coef, coef in zip(coefficients, real_coef):
            # Assert that both values agree to 5 decimal places
            self.assertEqual(round(pred_coef, 5), round(coef, 5))

        # Get the output of the logistic regression with threshold 0
        output = self.predict_output.logistic_regression(feature_matrix, coefficients, 0)
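        # (Presumably np.where(feature_matrix.dot(coefficients) > 0, +1, -1)
        # for a threshold of 0; the predictor itself is not shown here.)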

        # Generate a confusion matrix
        confusion_matrix = self.confusion_matrix.confusion_matrix(sentiment, output)

        # Assert the confusion matrix matches the expected counts
        self.assertEqual(confusion_matrix, {'false_negatives': 7311, 'true_negatives': 20635,
                                            'true_positives': 19268, 'false_positives': 5858})

        # Assert that the precision is correct
        self.assertEqual(round(self.confusion_matrix.precision(sentiment, output), 5),
                         round(0.7249332179540239, 5))

        # Assert that the recall is correct
        self.assertEqual(round(self.confusion_matrix.recall(sentiment, output), 5),
                         round(0.7668550505452519, 5))

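    # test_02 exercises a minibatch variant with batch_size=1. As a hypothetical
    # sketch (not the library's verified code), each step presumably slices one
    # batch and scales the gradient by the batch size:
    #
    #     batch = slice(i, i + batch_size)
    #     errors = indicator[batch] - predictions[batch]
    #     coefficients += step_size * feature_matrix[batch].T.dot(errors) / batch_size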
    def test_02_stochastic_gradient_ascent(self):
        """Test stochastic gradient descent for logistic regression.

        Tests stochastic gradient descent and test it with some known values.

        """
        # Use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.train_frame, features, output, 1)

        # Compute the coefficients
        coefficients = self.logistic_regression.stochastic_gradient_ascent(feature_matrix, sentiment,
                                                                           {"initial_coefficients": np.zeros(194),
                                                                            "step_size": 5e-1,
                                                                            "batch_size": 1,
                                                                            "max_iter": 10})

        # Reference coefficients to compare with the computed coefficients
        real_coef = [0.26845909, 0.05510662, -0.78232359, 0.24929641, 0.1213813,
                     -0.13194118, -0.42110769, 0.23944013, 0.52334226, 0.30746343,
                     1.46697311, 0.15734639, 0.24112255, -0.22849175, -0.48095714,
                     0., 0.05984944, -0.41942527, -0.48095714, 0.10654088,
                     0., 0.06153186, -0.41942527, 0.43843464,  0.,
                     0.21719583, 0., 0.84326475, 0.28108825, 0.28108825,
                     0., 0., 0.24611428, -0.19986888, 0.15734639,
                     0., 0., -0.48095714, 0.12623269,  0.,
                     0.28108825, 0.07542718, 0., -0.42110769, 0.15734639,
                     -0.48095714, 0.24611428, -0.48095714, 0.,  0.,
                     0.06153186, 0.28108825, 0., 0., 0.,
                     0.05984944, 0.5932902, 0.5621765, -0.48095714, 0.,
                     0.05984944, 0.05984944, 0.31220195, 0.11805882, 0.,
                     0.15085436, 0.24611428, 0., 0., 0.,
                     0.06153186, 0.12623269, 0., 0., 0.,
                     0., 0., 0., -0.35472444, 0.12623269,
                     0.,  0., 0.68023532, 0.28108825, 0.06153186,
                     0.0311137, 0.35651543, 0., 0.28108825, 0.,
                     0.05984944, 0., 0.35651543, 0.28108825, 0.,
                     0., 0., -0.90206483, 0.07542718, -0.48095714,
                     0., 0., -0.48095714, 0., 0.,
                     0., -0.25, 0.0311137, 0., 0.28108825,
                     0., 0., 0., 0., 0.,
                     0., 0.34262011, -0.48095714, 0.28108825, 0.,
                     0., 0., 0., 0., 0.06153186,
                     0.12623269, 0.05984944, 0., 0., 0.,
                     0., 0.12623269, 0., 0., 0.12623269,
                     0.07542718, 0.15085436, 0.07542718, -0.68082602, 0.,
                     0., 0., 0.05984944, 0., 0.,
                     0.28108825, 0., -0.25, 0., 0.,
                     0.07542718, 0., 0., 0.28108825, 0.,
                     0., 0., 0., 0., 0.,
                     0.06153186, 0.0311137, 0., -0.48095714, 0.,
                     0., 0., 0., 0., 0.,
                     0., 0.40732094, 0., 0., 0.05984944,
                     0., 0., 0., 0., 0.,
                     0., 0., 0.06153186, 0., 0.06153186,
                     0., -0.25, 0.05984944, 0., 0.,
                     0., 0., -0.96191427, 0.]

        # Each computed coefficient must match the reference value
        for pred_coef, coef in zip(coefficients, real_coef):
            # Assert that both values agree to 5 decimal places
            self.assertEqual(round(pred_coef, 5), round(coef, 5))

        # Get the output of the logistic regression with threshold 0
        output = self.predict_output.logistic_regression(feature_matrix, coefficients, 0)

        # Generate a confusion matrix
        confusion_matrix = self.confusion_matrix.confusion_matrix(sentiment, output)

        # Assert the confusion matrix matches the expected counts
        self.assertEqual(confusion_matrix, {'false_negatives': 6517, 'true_negatives': 11707,
                                            'true_positives': 17331, 'false_positives': 12225})

        # Assert that the precision is correct
        self.assertEqual(round(self.confusion_matrix.precision(sentiment, output), 5),
                         round(0.72673, 5))

        # Assert that the recall is correct
        self.assertEqual(round(self.confusion_matrix.recall(sentiment, output), 5),
                         round(0.58638, 5))

    def test_03_stochastic_gradient_ascent_high_iteration(self):
        """Test stochastic gradient ascent with a larger batch size and more iterations.

        Tests stochastic gradient ascent and compares its coefficients with known values.

        """
        # Use the important words as features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.train_frame, features, output, 1)

        # Compute the coefficients
        coefficients = self.logistic_regression.stochastic_gradient_ascent(feature_matrix, sentiment,
                                                                           {"initial_coefficients": np.zeros(194),
                                                                            "step_size": 5e-1,
                                                                            "batch_size": 1000,
                                                                            "max_iter": 1000})

        # Reference coefficients to compare with the computed coefficients
        real_coef = [-0.06659918,  0.07516305,  0.02337901,  0.91476437,  1.25935729, -0.01093744,
                     -0.29808423,  0.00724611,  1.14319635,  0.58421811, -0.10388794,  0.25341405,
                     0.51935047, -0.16643157,  0.1581433,  -0.01678466,  0.11023426, -0.07801531,
                     -0.11943521, -0.23901842,  0.19961916,  0.26962603,  0.00726172,  1.58116946,
                     -0.04749877, -0.01222728, -0.12452547,  0.2408741,   0.23996495, -0.27318487,
                     0.16391931,  0.46141695, -0.00520781, -0.41720674,  1.3914436,   0.59286041,
                     -0.01877455, -0.1177062,   0.04522629, -0.05050944, -0.1872891,   0.1119123,
                     0.05552736,  0.018883,   -0.28821684,  0.35454167,  0.09146771, -0.15185966,
                     0.45980111,  0.13696004, -0.27719711,  0.37826182,  0.51482099, -0.12707594,
                     -0.08043197,  0.27088589,  0.20836676, -0.22217221,  0.34308818,  0.05011724,
                     0.01336183, -0.00422257,  0.25914879,  0.18971367,  0.11804381,  0.06478439,
                     0.13413068, -0.35940054, -0.04225724, -0.23574987, -0.26178573,  0.37077618,
                     0.266064,    0.0552738,   0.25274691,  0.15248314,  0.9721445,   0.03951392,
                     -0.59577998, -0.09680726, -0.13168621,  0.42806047,  0.03576358,  1.03088019,
                     0.52916025, -0.09516351,  0.23544152,  0.31386904,  0.50647271,  0.25383116,
                     0.1369185,  0.93673001, -0.06280486,  0.1670564,  -0.20573152,  0.2201837,
                     0.12892914, -0.9711816,  -0.24387714, -0.3566874,  -0.65956699, -0.28473646,
                     -0.34083222, -0.44708957, -0.29828401, -0.52797307, -1.92693359, -0.33116364,
                     -0.43025271, -0.21284617, 0.16375567, -0.0299845,  -0.30294927, -1.25019619,
                     -1.55092776, -0.09266983, -0.08014312, -0.07565967, -0.00950432,  0.00327247,
                     0.03190358, -0.04247063, -0.28205865, -0.45678176,  0.06141561, -0.2690871,
                     -0.05979329, -0.0019354,  -0.01279985,  0.05323391, -0.35513613, -0.26639425,
                     -0.41094467, -0.14117863, -0.90001241, -0.33279773,  0.01621988, -0.08709595,
                     -0.10450457, -0.12567406, -0.61727551, -0.18663497,  0.17636203,  0.09316913,
                     -0.06829369,  0.1880183,  -0.5078543,   0.03964466, -0.26089197, -0.07480237,
                     -0.05556211, -0.1450303,  -0.04780934,  0.08911386, -0.15163772,  0.06213261,
                     -0.34512242, -0.33522342,  0.06580618, -0.44499204, -0.68623426, -0.12564489,
                     0.2609755,   0.09998045, -0.25098629, -0.29549973, -0.15944276, -0.47408765,
                     -0.03058168, -1.42253269, -0.49855378,  0.05835175, -1.17789127, -0.08226967,
                     -0.56793665, -0.35814271, -0.98559717, -0.16918106, -0.12477773, -0.23457722,
                     -0.13170106, -0.64351485, -0.01773532, -0.2686544,  0.047442,   -0.34218929,
                     -0.48340895,  0.37866335, -0.25162177,  0.05277577,  0.01545386, -0.26267815,
                     -0.09903819, -0.54500151]

        # Each computed coefficient must match the reference value
        for pred_coef, coef in zip(coefficients, real_coef):
            # Assert that both values agree to 5 decimal places
            self.assertEqual(round(pred_coef, 5), round(coef, 5))

        # Get the output of the logistic regression with threshold 0
        output = self.predict_output.logistic_regression(feature_matrix, coefficients, 0)

        # Generate a confusion matrix
        confusion_matrix = self.confusion_matrix.confusion_matrix(sentiment, output)

        # Assert the confusion matrix matches the expected counts
        self.assertEqual(confusion_matrix, {'false_negatives': 5018, 'true_negatives': 18995,
                                            'true_positives': 18830, 'false_positives': 4937})

        # Assert that the precision is correct
        self.assertEqual(round(self.confusion_matrix.precision(sentiment, output), 5),
                         round(0.78958, 5))

        # Assert that the recall is correct
        self.assertEqual(round(self.confusion_matrix.recall(sentiment, output), 5),
                         round(0.79228, 5))

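    # test_04 and test_05 check the log likelihood. With the indicator
    # I = (label == +1).astype(int), the standard formula
    #
    #     scores = feature_matrix.dot(coefficients)
    #     ll = np.sum((I - 1) * scores - np.log(1.0 + np.exp(-scores)))
    #
    # reproduces the asserted value below (about -5.33141 for these fixtures),
    # and the average version divides ll by len(label).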
    def test_04_log_likelihood(self):
        """Test log likelihood.

        Tests the log likelihood computation and compares it with a known value.

        """
        # Generate test feature, coefficients, and label
        feature_matrix = np.array([[1., 2., 3.], [1., -1., -1.]])
        coefficients = np.array([1., 3., -1.])
        label = np.array([-1, 1])

        # Compute the log likelihood
        lg = self.log_likelihood.log_likelihood(feature_matrix, label, coefficients)

        # Assert the value
        self.assertEqual(round(lg, 5), round(-5.33141161544, 5))

    def test_05_average_log_likelihood(self):
        """Test average log likelihood.

        Tests the average log likelihood computation and compares it with a known value.

        """
        # Generate test feature, coefficients, and label
        feature_matrix = np.array([[1., 2., 3.], [1., -1., -1.]])
        coefficients = np.array([1., 3., -1.])
        label = np.array([-1, 1])

        # Compute the average log likelihood
        lg = self.log_likelihood.average_log_likelihood(feature_matrix, label, coefficients)

        # Assert the value
        self.assertEqual(round(lg, 5), round(-2.6657099999999998, 5))
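

# A minimal sketch of the confusion-matrix counts the assertions above rely on
# (hypothetical reference; the real ConfusionMatrix class is not shown here).
# Note that, given the counts asserted above, this project's precision() works
# out to TP / (TP + FN) and its recall() to TP / (TP + FP).
def _confusion_matrix_sketch(labels, predictions):
    """Hypothetical reference for the counts asserted in the tests above."""
    tp = int(np.sum((labels == 1) & (predictions == 1)))
    fp = int(np.sum((labels == -1) & (predictions == 1)))
    tn = int(np.sum((labels == -1) & (predictions == -1)))
    fn = int(np.sum((labels == 1) & (predictions == -1)))
    return {'true_positives': tp, 'false_positives': fp,
            'true_negatives': tn, 'false_negatives': fn}


if __name__ == '__main__':
    unittest.main()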