def setUp(self):
        """Prepare helpers and housing frames for the k-nearest-neighbor regression tests.

        Instantiates the helper objects used by the tests and reads the small
        kc_house CSV files (all, train, test and validation) into pandas frames.

        """
        # Helper objects exercised by the tests
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.knn = KNearestNeighborRegression()
        self.euclidean_distance = EuclideanDistance()
        self.determine_k_knn = DetermineKKnn()

        # Column types handed to pandas, grouped by the type they are read as.
        # 'floors' is read as text here and turned into a number further down.
        dtype_dict = {}
        dtype_dict.update(dict.fromkeys(['bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long',
                                         'sqft_lot15', 'sqft_living', 'lat'], float))
        dtype_dict.update(dict.fromkeys(['waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition',
                                         'sqft_basement', 'yr_built', 'sqft_lot', 'view'], int))
        dtype_dict.update(dict.fromkeys(['zipcode', 'floors', 'date', 'id'], str))

        # Directory holding every CSV fixture used by this test case
        data_dir = './unit_tests/test_data/regression/kc_house_knn/'

        # Frame with both the train and the test rows
        self.kc_house = pd.read_csv(data_dir + 'kc_house_data_small.csv', dtype=dtype_dict)

        # Frame with the train rows only
        self.kc_house_train = pd.read_csv(data_dir + 'kc_house_data_small_train.csv', dtype=dtype_dict)

        # Frame with the test rows only
        self.kc_house_test = pd.read_csv(data_dir + 'kc_house_data_small_test.csv', dtype=dtype_dict)

        # Frame with the validation rows only
        self.kc_house_valid = pd.read_csv(data_dir + 'kc_house_data_validation.csv', dtype=dtype_dict)

        # Each frame converts its own 'floors' column: text -> float -> int
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test, self.kc_house_valid):
            frame['floors'] = frame['floors'].astype(float).astype(int)
Ejemplo n.º 2
0
    def setUp(self):
        """Constructor for TestLassoRegression.

        Creates the helper objects used by the tests and loads the kc_house
        housing data (full, train and test splits) from CSV into pandas frames.

        """
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.lasso = LassoRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # Create a kc_house that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv', dtype=dtype_dict)

        # Create a kc_house_train that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_test that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                         dtype=dtype_dict)

        # 'floors' is read as str (see dtype_dict), so convert it to float and then to int.
        # Every frame must convert its OWN column: assigning self.kc_house['floors'] to the
        # train/test frames would be aligned on the row index by pandas and silently give
        # those frames the floors of unrelated rows from the full data set.
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test):
            frame['floors'] = frame['floors'].astype(float).astype(int)
Ejemplo n.º 3
0
class TestLassoRegression(unittest.TestCase):

    """Tests for LassoRegression.

    Uses housing data to test LassoRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestLassoRegression.

        Creates the helper objects used by the tests and loads the kc_house
        housing data (full, train and test splits) from CSV into pandas frames.

        """
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.lasso = LassoRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # Create a kc_house that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv', dtype=dtype_dict)

        # Create a kc_house_train that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_test that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                         dtype=dtype_dict)

        # 'floors' is read as str (see dtype_dict), so convert it to float and then to int.
        # Every frame must convert its OWN column: assigning self.kc_house['floors'] to the
        # train/test frames would be aligned on the row index by pandas and silently give
        # those frames the floors of unrelated rows from the full data set.
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test):
            frame['floors'] = frame['floors'].astype(float).astype(int)

    def test_01_normalize_features(self):
        """Tests normalizing features.

        Test normalization features, and compare it with known values.

        """
        # Normalize the features, and also return the norms
        features, norms = self.normalize_features.l2_norm(np.array([[3., 6., 9.], [4., 8., 12.]]))

        # Assert that the normalized features match the known values
        self.assertTrue(np.array_equal(np.array([[0.6, 0.6, 0.6], [0.8, 0.8, 0.8]]), features))

        # Assert that the norms match the known values
        self.assertTrue(np.array_equal(np.array([5., 10., 15.]), norms))

    def test_02_compute_ro(self):
        """Test compute ro

        Test compute one round of ro.

        """
        # We will use sqft_living and bedrooms
        features = ['sqft_living', 'bedrooms']

        # Output will use price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output, 1)

        # Normalize the feature matrix (the norms are not needed here)
        normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix)

        # Set initial weights
        weights = np.array([1., 4., 1.])

        # Compute ro_j
        ro_j = self.lasso.compute_ro_j(normalized_feature_matrix, output, weights)

        # Assert the output of ro_j
        self.assertTrue(np.allclose(ro_j, np.array([79400300.03492916, 87939470.77299108, 80966698.67596565])))

    def test_03_compute_coordinate_descent_step(self):
        """Test one coordinate descent step.

        Test one coordinate descent step and compare it with known values.

        """
        # Small hand-built feature matrix whose columns are already scaled
        feature_matrix = np.array([[3./math.sqrt(13), 1./math.sqrt(10)],
                                   [2./math.sqrt(13), 3./math.sqrt(10)]])

        # Run a single coordinate descent step for coordinate i = 1
        new_weight = self.lasso.lasso_coordinate_descent_step({"i": 1, "weights": np.array([1., 4.])},
                                                              feature_matrix,
                                                              np.array([1., 1.]),
                                                              {"l1_penalty": 0.1})

        # Assert that both are equal (assertEquals is a deprecated alias, removed in Python 3.12)
        self.assertEqual(round(new_weight, 8), round(0.425558846691, 8))

    def test_04_coordinate_descent(self):
        """Test coordinate descent.

        Test coordinate descent and compare with known values.

        """
        # We will use sqft_living and bedrooms
        features = ['sqft_living', 'bedrooms']

        # Output will use price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output, 1)

        # Normalize the feature matrix (the norms are not needed here)
        normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix)

        # Set initial weights
        initial_weights = np.zeros(3)

        # Set l1 penalty
        l1_penalty = 1e7

        # Set tolerance
        tolerance = 1.0

        # Compute the weights using coordinate descent
        weights = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output,
                                                               {"initial_weights": initial_weights,
                                                                "l1_penalty": l1_penalty,
                                                                "tolerance": tolerance})

        # Assert that these two numpy arrays are the same. np.allclose must not be given a third
        # positional argument here: it is rtol, and rtol=True (== 1.0) accepts almost any value.
        self.assertTrue(np.allclose(weights, np.array([21624998.3663629, 63157246.78545423, 0.])))

        # Predict the output
        predicted_output = self.predict_output.regression(normalized_feature_matrix, weights)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(output,
                                                                                         predicted_output), -10),
                         round(1.63049248148e+15, -10))

    def test_05_coordinate_descent_with_normalization(self):
        """Test coordinate descent with normalization.

        Test coordinate descent and then normalize the result, so that we can use the weights on a test set.

        """
        # We will use multiple features, for both the train and the test set
        features = ['bedrooms',
                    'bathrooms',
                    'sqft_living',
                    'sqft_lot',
                    'floors',
                    'waterfront',
                    'view',
                    'condition',
                    'grade',
                    'sqft_above',
                    'sqft_basement',
                    'yr_built',
                    'yr_renovated']

        # Output will use price
        output_name = ['price']

        # Convert our train pandas frame to numpy
        feature_matrix, train_output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features,
                                                                           output_name, 1)

        # Normalize the train features and keep the norms to rescale the weights later
        normalized_feature_matrix, norms = self.normalize_features.l2_norm(feature_matrix)

        # Compute Multiple Weights
        weights1e7 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e7,
                                                                   "tolerance": 1})
        weights1e8 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e8,
                                                                   "tolerance": 1})
        weights1e4 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e4,
                                                                   "tolerance": 5e5})

        # Divide by the norms so the weights can be applied to the un-normalized test features
        normalized_weights1e4 = weights1e4 / norms
        normalized_weights1e7 = weights1e7 / norms
        normalized_weights1e8 = weights1e8 / norms

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, features,
                                                                               output_name, 1)

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e4)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(2.2778100476e+14, -12))

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e7)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(2.75962079909e+14, -12))

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e8)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(5.37049248148e+14, -12))
class TestKNearestNeighborRegression(unittest.TestCase):

    """Tests for KNearestNeighborRegression.

    Uses housing data to test KNearestNeighborRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.
        _FEATURES (tuple): Feature columns used by every test in this class.

    """

    _multiprocess_can_split_ = True

    _FEATURES = ('bedrooms',
                 'bathrooms',
                 'sqft_living',
                 'sqft_lot',
                 'floors',
                 'waterfront',
                 'view',
                 'condition',
                 'grade',
                 'sqft_above',
                 'sqft_basement',
                 'yr_built',
                 'yr_renovated',
                 'lat',
                 'long',
                 'sqft_living15',
                 'sqft_lot15')

    def setUp(self):
        """Constructor for TestKNearestNeighborRegression.

        Creates the helper objects used by the tests and loads the small kc_house
        housing data (full, train, test and validation splits) from CSV into pandas frames.

        """
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.knn = KNearestNeighborRegression()
        self.euclidean_distance = EuclideanDistance()
        self.determine_k_knn = DetermineKKnn()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # Create a kc_house that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small.csv',
                                    dtype=dtype_dict)

        # Create a kc_house_train that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/'
                                          'kc_house_data_small_train.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_test that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small_test.csv',
                                         dtype=dtype_dict)

        # Create a kc_house_valid that encompasses only validation data
        self.kc_house_valid = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_validation.csv',
                                          dtype=dtype_dict)

        # 'floors' is read as str (see dtype_dict); each frame converts its own column
        # to float first and then to int
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test, self.kc_house_valid):
            frame['floors'] = frame['floors'].astype(float).astype(int)

    def _train_and_normalized(self, other_frame):
        """Convert the train frame and another frame to numpy and normalize both.

        The train features are normalized with l2_norm, and the other frame's features
        are divided by the norms obtained from the train features.

        Args:
            other_frame (pandas.DataFrame): Test or validation frame to convert alongside the train frame.

        Returns:
            tuple: (train features, train output, other features, other output).

        """
        # Fresh list copies are handed over on every call so the shared constant can never be altered
        features_train, output_train = self.convert_numpy.convert_to_numpy(self.kc_house_train,
                                                                           list(self._FEATURES), ['price'], 1)
        features_other, output_other = self.convert_numpy.convert_to_numpy(other_frame,
                                                                           list(self._FEATURES), ['price'], 1)

        # Normalize our training features, and reuse the training norms for the other set
        features_train, norms = self.normalize_features.l2_norm(features_train)
        return features_train, output_train, features_other / norms, output_other

    @staticmethod
    def _first_minimum(values):
        """Return (index, value) of the smallest entry of values; the first one wins on ties."""
        return min(enumerate(values), key=lambda pair: pair[1])

    def test_01_compute_euclidean_distance(self):
        """Tests Euclidean distance.

        Tests Euclidean distance and compare it with known values.

        """
        # Extract and normalize the features of the train and test set
        features_train, _, features_test, _ = self._train_and_normalized(self.kc_house_test)

        # Compute the euclidean distance
        distance = self.euclidean_distance.euclidean_distance(features_test[0], features_train[9])

        # Assert that both are equal
        self.assertEqual(round(distance, 3), round(0.059723593716661257, 3))

    def test_02_compute_euclidean_distance_query_point(self):
        """Tests Euclidean distance with a set of query points.

        Test to compute euclidean distance from a query point to multiple points in the training set

        """
        # Extract and normalize the features of the train and test set
        features_train, output_train, features_test, _ = self._train_and_normalized(self.kc_house_test)

        # Determine the smallest euclidean distance between the query point and the training set
        distances = self.euclidean_distance.euclidean_distance_cmp_one_value(features_train, features_test[2])
        smallest_index, smallest = self._first_minimum(distances)

        # Assert that we are getting the right prediction (for 1-NN neighbor)
        self.assertEqual(round(smallest, 8), round(0.00286049526751, 8))
        self.assertEqual(output_train[smallest_index], 249000)
        self.assertEqual(smallest_index, 382)

    def test_03_compute_knn(self):
        """Tests knn regression algorithm.

        Tests the knn algorithm and compare it with known values.

        """
        # Extract and normalize the features of the train and test set
        features_train, output_train, features_test, _ = self._train_and_normalized(self.kc_house_test)

        # Assert that the array is the closest with the 3rd house in features_test
        self.assertTrue(np.array_equal(self.knn.k_nearest_neighbor_regression(4, features_train, features_test[2]),
                                       np.array([382, 1149, 4087, 3142])))

        # Assert that the 413987.5 is the correct prediction
        self.assertEqual(self.knn.predict_k_nearest_neighbor_regression(4, features_train,
                                                                        output_train, features_test[2]),
                         413987.5)

        # Compute the lowest predicted value among the first 10 test houses
        predictions = self.knn.predict_k_nearest_neighbor_all_regression(10, features_train,
                                                                         output_train,
                                                                         features_test[0:10])
        lowest_predicted_index, lowest_predicted = self._first_minimum(predictions)

        # Assert that the few values such as lowest predicted values and index are the one we expect
        self.assertEqual(lowest_predicted, 350032.0)
        self.assertEqual(lowest_predicted_index, 6)

    def test_03_compute_best_k(self):
        """Compute best K for KNN Regression.

        Compute best K using the validation set.

        """
        # Extract and normalize the features of the train and validation set
        features_train, output_train, features_valid, output_valid = self._train_and_normalized(self.kc_house_valid)

        # Compute the lowest K and lowest K's RSS
        # NOTE(review): 1 and 16 are presumably the bounds of the k values to try - confirm against DetermineKKnn
        low_rss, low_idx = self.determine_k_knn.determine_k_knn(self.knn.predict_k_nearest_neighbor_all_regression,
                                                                1, 16, {"features_train": features_train,
                                                                        "features_valid": features_valid,
                                                                        "output_train": output_train,
                                                                        "output_valid": output_valid})

        # Assert that the lowest k and rss is correct
        self.assertEqual(round(low_rss, -13), round(6.73616787355e+13, -13))
        self.assertEqual(low_idx, 8)