def setUp(self): """Constructor for TestLassoRegression. Loads housing data, and creates training and testing data. """ self.convert_numpy = ConvertNumpy() self.normalize_features = NormalizeFeatures() self.lasso = LassoRegression() self.predict_output = PredictOutput() self.residual_sum_squares = ResidualSumSquares() self.k_fold_cross_validation = KFoldCrossValidation() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float, 'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int, 'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int} # Create a kc_house that encompasses all test and train data self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv', dtype=dtype_dict) # Convert all the frames with the floors to float type self.kc_house['floors'] = self.kc_house['floors'].astype(float) self.kc_house_train['floors'] = self.kc_house['floors'].astype(float) self.kc_house_test['floors'] = self.kc_house['floors'].astype(float) # Then back to int type self.kc_house['floors'] = self.kc_house['floors'].astype(int) self.kc_house_train['floors'] = self.kc_house['floors'].astype(int) self.kc_house_test['floors'] = self.kc_house['floors'].astype(int)
class TestLassoRegression(unittest.TestCase): """Tests for TestLassoRegression. Uses housing data to test LassoRegression. Statics: _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel. """ _multiprocess_can_split_ = True def setUp(self): """Constructor for TestLassoRegression. Loads housing data, and creates training and testing data. """ self.convert_numpy = ConvertNumpy() self.normalize_features = NormalizeFeatures() self.lasso = LassoRegression() self.predict_output = PredictOutput() self.residual_sum_squares = ResidualSumSquares() self.k_fold_cross_validation = KFoldCrossValidation() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float, 'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int, 'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int} # Create a kc_house that encompasses all test and train data self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv', dtype=dtype_dict) # Convert all the frames with the floors to float type self.kc_house['floors'] = self.kc_house['floors'].astype(float) self.kc_house_train['floors'] = self.kc_house['floors'].astype(float) self.kc_house_test['floors'] = self.kc_house['floors'].astype(float) # Then back to int type self.kc_house['floors'] = self.kc_house['floors'].astype(int) self.kc_house_train['floors'] = self.kc_house['floors'].astype(int) self.kc_house_test['floors'] = self.kc_house['floors'].astype(int) def test_01_normalize_features(self): """Tests normalizing features. Test normalization features, and compare it with known values. """ # Normalize the features, and also return the norms features, norms = self.normalize_features.l2_norm(np.array([[3., 6., 9.], [4., 8., 12.]])) # Assert that the np array is equal to features self.assertTrue(np.array_equal(np.array([[0.6, 0.6, 0.6], [0.8, 0.8, 0.8]]), features), True) # Assert that the np array is equal to norms self.assertTrue(np.array_equal(np.array([5., 10., 15.]), norms), True) def test_02_compute_ro(self): """Test compute ro Test compute one round of ro. """ # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'bedrooms'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output, 1) # Create our initial weights normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix) # Set initial weights weights = np.array([1., 4., 1.]) # Compute ro_j ro_j = self.lasso.compute_ro_j(normalized_feature_matrix, output, weights) # Assert the output of ro_j self.assertTrue(np.allclose(ro_j, np.array([79400300.03492916, 87939470.77299108, 80966698.67596565]))) def test_03_compute_coordinate_descent_step(self): """Test one coordinate descent step. Test one coordinate descent step and compare it with known values. """ # Assert that both are equal self.assertEquals(round(self.lasso.lasso_coordinate_descent_step({"i": 1, "weights": np.array([1., 4.])}, np.array([[3./math.sqrt(13), 1./math.sqrt(10)], [2./math.sqrt(13), 3./math.sqrt(10)]]), np.array([1., 1.]), {"l1_penalty": 0.1}), 8), round(0.425558846691, 8)) def test_04_coordinate_descent(self): """Test coordinate descent. Test coordinate descent and compare with known values. """ # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'bedrooms'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output, 1) # Create our initial weights normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix) # Set initial weights initial_weights = np.zeros(3) # Set l1 penalty l1_penalty = 1e7 # Set tolerance tolerance = 1.0 # Compute the weights using coordinate descent weights = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, {"initial_weights": initial_weights, "l1_penalty": l1_penalty, "tolerance": tolerance}) # Assert that these two numpy arrays are the same self.assertTrue(np.allclose(weights, np.array([21624998.3663629, 63157246.78545423, 0.]), True)) # Predict the output predicted_output = self.predict_output.regression(normalized_feature_matrix, weights) # Assert that the RSS is what we wanted self.assertEquals(round(self.residual_sum_squares.residual_sum_squares_regression(output, predicted_output), -10), round(1.63049248148e+15, -10)) def test_05_coordinate_descent_with_normalization(self): """Test coordinate descent with normalization. Test coordinate descent and then normalize the result, so that we can use the weights on a test set. """ # We will use multiple features features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1) # Create our initial weights normalized_feature_matrix, norms = self.normalize_features.l2_norm(feature_matrix) # Compute Multiple Weights weights1e7 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, {"initial_weights": np.zeros(len(features)+1), "l1_penalty": 1e7, "tolerance": 1}) weights1e8 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, {"initial_weights": np.zeros(len(features)+1), "l1_penalty": 1e8, "tolerance": 1}) weights1e4 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, {"initial_weights": np.zeros(len(features)+1), "l1_penalty": 1e4, "tolerance": 5e5}) # Compute multiple normalized normalized_weights1e4 = weights1e4 / norms normalized_weights1e7 = weights1e7 / norms normalized_weights1e8 = weights1e8 / norms # We will use multiple features features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated'] # Output will use price output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, features, output, 1) # Predict the output predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e4) # Assert that the RSS is what we wanted self.assertEquals(round(self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output), -12), round(2.2778100476e+14, -12)) # Predict the output predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e7) # Assert that the RSS is what we wanted self.assertEquals(round(self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output), -12), round(2.75962079909e+14, -12)) # Predict the output predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e8) # Assert that the RSS is what we wanted self.assertEquals(round(self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output), -12), round(5.37049248148e+14, -12))