def setUp(self):
    # Usage:
    #       Fixture for TestLinearRegression. Creates the helper objects used by the tests
    #       and loads the kc_house CSV files (all, train and test data) into pandas frames.
    # Arguments:
    #       None

    # Helper objects used by the tests
    self.convert_numpy = ConvertNumpy()
    self.linear_regression = LinearRegression()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()

    # Column name -> type mapping handed to pandas, grouped by the target type
    column_types = {}
    for column_type, column_names in (
            (float, ('bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long',
                     'sqft_lot15', 'sqft_living', 'lat')),
            (int, ('waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition',
                   'sqft_basement', 'yr_built', 'sqft_lot', 'view')),
            (str, ('zipcode', 'floors', 'date', 'id'))):
        column_types.update(dict.fromkeys(column_names, column_type))

    def read_frame(csv_path):
        # Every file is parsed with the same column types
        return pd.read_csv(csv_path, dtype=column_types)

    # Frame that encompasses all test and train data
    self.kc_house_frame = read_frame('./unit_tests/test_data/kc_house/kc_house_data.csv')

    # Frame that encompasses only train data
    self.kc_house_train_frame = read_frame('./unit_tests/test_data/kc_house/kc_house_train_data.csv')

    # Frame that encompasses only test data
    self.kc_test_frames = read_frame('./unit_tests/test_data/kc_house/kc_house_test_data.csv')
def __init__(self):
    """Constructor for KFoldCrossValidation.

    Creates the helper objects stored on the instance: a ConvertNumpy (to convert
    pandas data to numpy), a PredictOutput and a ResidualSumSquares.

    """
    self.convert_numpy = ConvertNumpy()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()
def __init__(self):
    """Constructor for Error.

    Stores a PredictOutput instance on ``self.predict_output``.

    """
    self.predict_output = PredictOutput()
def __init__(self):
    """Constructor for LassoRegression.

    Stores a PredictOutput instance on ``self.predict_output``.

    """
    self.predict_output = PredictOutput()
def __init__(self):
    """Constructor for Accuracy.

    Stores a PredictOutput instance on ``self.predict_output``.

    """
    self.predict_output = PredictOutput()
def setUp(self):
    """Constructor for TestWeightedLogisticRegression.

    Creates the helper objects, loads the important words and the Amazon baby
    subset, cleans the review text, and adds one word-count column per
    important word to ``self.review_frame``.

    """
    self.convert_numpy = ConvertNumpy()
    self.predict = PredictOutput()
    self.ada = AdaBoost()
    self.accuracy = Accuracy()
    self.weighted_logistic_regression = WeightedLogisticRegression()

    # Load the important words; the context manager closes the file handle
    with open('./unit_tests/test_data/classification/amazon/important_words.json', 'r') as words_file:
        self.important_words = json.load(words_file)

    # Load the amazon baby subset
    self.review_frame = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset.csv')

    # Review needs to be text; astype returns a new Series, so assign it back
    self.review_frame['review'] = self.review_frame['review'].astype(str)

    # Clean up the punctuations; the translation table is built once, not per row
    strip_punctuation = str.maketrans({key: None for key in string.punctuation})
    self.review_frame['review_clean'] = self.review_frame['review'].apply(
        lambda review: review.translate(strip_punctuation))

    # Remove any nan text (missing reviews were turned into the string "nan" above)
    self.review_frame['review_clean'] = self.review_frame['review_clean'].apply(
        lambda text: '' if text == "nan" else text)

    # Count the number of times each important word appears in each review,
    # and store the counts in an independent column named after the word
    for word in self.important_words:
        self.review_frame[word] = self.review_frame['review_clean'].apply(lambda s, w=word: s.split().count(w))
def setUp(self):
    # Usage:
    #       Fixture for TestLinearRegression. Creates the helper objects used by the tests
    #       and loads the kc_house CSV files (all, train and test data) into pandas frames.
    # Arguments:
    #       None

    # Helper objects used by the tests
    self.convert_numpy = ConvertNumpy()
    self.linear_regression = LinearRegression()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()

    # Column name -> type mapping handed to pandas, grouped by the target type
    float_columns = ('bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long',
                     'sqft_lot15', 'sqft_living', 'lat')
    int_columns = ('waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition',
                   'sqft_basement', 'yr_built', 'sqft_lot', 'view')
    str_columns = ('zipcode', 'floors', 'date', 'id')
    column_types = dict.fromkeys(float_columns, float)
    column_types.update(dict.fromkeys(int_columns, int))
    column_types.update(dict.fromkeys(str_columns, str))

    def read_frame(csv_path):
        # Every file is parsed with the same column types
        return pd.read_csv(csv_path, dtype=column_types)

    # Frame that encompasses all test and train data
    self.kc_house_frame = read_frame('./unit_tests/test_data/kc_house/kc_house_data.csv')

    # Frame that encompasses only train data
    self.kc_house_train_frame = read_frame('./unit_tests/test_data/kc_house/kc_house_train_data.csv')

    # Frame that encompasses only test data
    self.kc_test_frames = read_frame('./unit_tests/test_data/kc_house/kc_house_test_data.csv')
def __init__(self):
    """Constructor for KFoldCrossValidation.

    Creates the three helper objects stored on the instance: ``convert_numpy``
    (ConvertNumpy), ``predict_output`` (PredictOutput) and
    ``residual_sum_squares`` (ResidualSumSquares).

    """
    self.convert_numpy = ConvertNumpy()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()
class Error:
    """Computes error for classification.

    Computes error for classification algorithms, such as binary tree.

    Attributes:
        predict_output (PredictOutput): Class for predicting output.

    """

    def __init__(self):
        """Constructor for Error.

        Constructor for Error, sets up output prediction class.

        """
        self.predict_output = PredictOutput()

    def binary_tree(self, tree, data, target):
        """Computes classification error for binary tree.

        Computes classification error for binary tree classification.

        Classification error =    # Mistakes
                                ----------------
                                # Total examples

        The input frame is left untouched: predictions are kept in a local Series
        instead of being written into ``data`` as an extra column.

        Args:
            tree (dict): The top node of a binary tree, with the following dict format:
                {
                    'is_leaf' (bool): False,
                    'prediction' (NoneType): None,
                    'splitting_feature' (str): splitting_feature,
                    'left' (dict): left_tree,
                    'right' (dict): right_tree
                }
            data (pandas.DataFrame): A pandas frame that has the same features binary tree.
            target (str): The target we want to predict.

        Returns:
            float: Classification error.

        """
        # Classify every row with the tree
        prediction = data.apply(lambda row: self.predict_output.binary_tree(tree, row), axis=1)

        # A mistake is a row whose prediction differs from its target value
        mistakes = (data[target] != prediction).sum()

        # Return mistakes/total examples
        return float(mistakes)/float(len(data))
def setUp(self):
    """Constructor for WeightedBinaryDecisionTrees.

    We will clean up the loans_data by doing one hot encoding our features list, however, in the end we will
    use some pre-built data for training and testing, but it uses the same method for one hot encoding.

    """
    self.weighted_binary_decision_trees = WeightedBinaryDecisionTrees()
    self.adaboost = AdaBoost()
    self.predict = PredictOutput()
    self.accuracy = Accuracy()
    self.error = Error()

    # Pandas type set
    dtype_dict = {'grade': str, 'term': str, 'emp_length': str, 'bad_loans': int}

    # Load the lending club data
    self.loans_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_data.csv',
                                  dtype=dtype_dict)

    # List features and targets that we are interested
    self.features = ['grade', 'term', 'home_ownership', 'emp_length']
    self.target = 'safe_loans'

    # Do a one hot encoding of features
    for feature in self.features:
        # One hot encode the column directly (no identity apply needed)
        one_hot_encoded = pd.get_dummies(self.loans_data[feature], prefix=feature, prefix_sep='.')

        # Replace the original feature column with its one hot encoded columns
        self.loans_data = pd.concat([self.loans_data.drop(columns=feature), one_hot_encoded], axis=1)

    # Update our features: every column of the encoded frame except the target
    self.features = list(self.loans_data.columns.values)
    self.features.remove(self.target)

    # Load our training and testing data
    self.train_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_train.csv')
    self.test_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_test.csv')
def setUp(self):
    """Constructor for TestLassoRegression.

    Loads housing data, and creates training and testing data.

    """
    self.convert_numpy = ConvertNumpy()
    self.normalize_features = NormalizeFeatures()
    self.lasso = LassoRegression()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()
    self.k_fold_cross_validation = KFoldCrossValidation()

    # Create a dictionary type to store relevant data types so that our pandas
    # will read the correct information
    dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                  'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                  'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                  'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                  'view': int}

    # Create a kc_house that encompasses all test and train data
    self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv',
                                dtype=dtype_dict)

    # Create a kc_house_train that encompasses only train data
    self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                      dtype=dtype_dict)

    # Create a kc_house_test that encompasses only test data
    self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                     dtype=dtype_dict)

    # 'floors' is read as text: convert it to float first, then to int.
    # Each frame converts its OWN column. Assigning the column of another frame aligns on
    # the row index and would replace the train/test values with rows of the full data set.
    for frame in (self.kc_house, self.kc_house_train, self.kc_house_test):
        frame['floors'] = frame['floors'].astype(float).astype(int)
def setUp(self):
    """Constructor for TestRidgeRegression.

    Creates the helper objects and loads the housing CSV files (all, train,
    test, and the shuffled train+validation data) into pandas frames.

    """
    self.convert_numpy = ConvertNumpy()
    self.ridge_regression = RidgeRegression()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()
    self.k_fold_cross_validation = KFoldCrossValidation()

    # Column name -> type mapping handed to pandas, grouped by the target type
    column_types = {}
    for column_type, column_names in (
            (float, ('bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long',
                     'sqft_lot15', 'sqft_living', 'lat')),
            (int, ('waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition',
                   'sqft_basement', 'yr_built', 'sqft_lot', 'view')),
            (str, ('zipcode', 'floors', 'date', 'id'))):
        column_types.update(dict.fromkeys(column_names, column_type))

    def read_frame(csv_path):
        # Every file is parsed with the same column types
        return pd.read_csv(csv_path, dtype=column_types)

    # All test and train data
    self.kc_house = read_frame('./unit_tests/test_data/regression/kc_house/kc_house_data.csv')

    # Only train data
    self.kc_house_train = read_frame('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv')

    # Only test data
    self.kc_house_test = read_frame('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv')

    # Both train and valid data, shuffled
    self.kc_house_train_valid_shuffled = read_frame(
        './unit_tests/test_data/regression/'
        'kc_house_with_validation_k_fold/'
        'wk3_kc_house_train_valid_shuffled.csv')
class TestWeightedLogisticRegression(unittest.TestCase):
    """Tests WeightedLogisticRegression class.

    Uses Amazon data to test WeightedLogisticRegression class.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestWeightedLogisticRegression.

        Creates the helper objects, loads the important words and the Amazon baby
        subset, cleans the review text, and adds one word-count column per
        important word to ``self.review_frame``.

        """
        self.convert_numpy = ConvertNumpy()
        self.predict = PredictOutput()
        self.ada = AdaBoost()
        self.accuracy = Accuracy()
        self.weighted_logistic_regression = WeightedLogisticRegression()

        # Load the important words; the context manager closes the file handle
        with open('./unit_tests/test_data/classification/amazon/important_words.json', 'r') as words_file:
            self.important_words = json.load(words_file)

        # Load the amazon baby subset
        self.review_frame = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset.csv')

        # Review needs to be text; astype returns a new Series, so assign it back
        self.review_frame['review'] = self.review_frame['review'].astype(str)

        # Clean up the punctuations; the translation table is built once, not per row
        strip_punctuation = str.maketrans({key: None for key in string.punctuation})
        self.review_frame['review_clean'] = self.review_frame['review'].apply(
            lambda review: review.translate(strip_punctuation))

        # Remove any nan text (missing reviews were turned into the string "nan" above)
        self.review_frame['review_clean'] = self.review_frame['review_clean'].apply(
            lambda text: '' if text == "nan" else text)

        # Count the number of times each important word appears in each review,
        # and store the counts in an independent column named after the word
        for word in self.important_words:
            self.review_frame[word] = self.review_frame['review_clean'].apply(lambda s, w=word: s.split().count(w))

    def test_01_gradient_ascent(self):
        """Tests gradient ascent algorithm.

        Tests the gradient ascent algorithm and compare it with known values.

        """
        # We will use important words for the output
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.review_frame, features, output, 1)

        # Create weight list for training data
        weights_list = np.array([1]*len(self.review_frame))

        # Compute the coefficients
        coefficients = self.weighted_logistic_regression.gradient_ascent(feature_matrix, sentiment,
                                                                         {"initial_coefficients": np.zeros(194),
                                                                          "weights_list": weights_list,
                                                                          "step_size": 1e-7, "max_iter": 30})

        # Assert the coefficients
        self.assertEqual([round(i, 5) for i in coefficients[0:20]],
                         [round(i, 5) for i in [0.00020000000000000001, 0.0014300000000000001, -0.00131,
                                                0.0068900000000000003, 0.0068500000000000002,
                                                0.00034000000000000002, -0.0062399999999999999,
                                                -0.00059000000000000003, 0.0067099999999999998,
                                                0.0046600000000000001, 0.00042999999999999999,
                                                0.0020300000000000001, 0.0030300000000000001, -0.00332,
                                                0.0015, -0.00011, 0.00115, -0.0021700000000000001, -0.00139,
                                                -0.0046600000000000001]])

        # Compute predictions
        predictions = self.predict.logistic_regression(feature_matrix, coefficients)

        # Accuracy has to match 0.74356999999999995
        self.assertEqual(round(self.accuracy.general(predictions, sentiment), 5),
                         round(0.74356999999999995, 5))

    def test_02_adaboost(self):
        """Tests adaboost algorithm.

        Tests the adaboost algorithm with weighted logistic regression.

        """
        # We will use important words for the output
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.review_frame, features, output, 1)

        # Create 15 weighted logistic regression
        weights, models = self.ada.logistic_regression(feature_matrix, sentiment,
                                                       iterations=15,
                                                       model_dict={"predict_method": self.predict.logistic_regression,
                                                                   "model": self.weighted_logistic_regression,
                                                                   "model_method": "gradient_ascent",
                                                                   "model_parameters": {"step_size": 1e-7,
                                                                                        "max_iter": 30,
                                                                                        "initial_coefficients":
                                                                                            np.zeros(194)}})

        # Get the predictions of each dataset in the test data
        predictions = self.predict.adaboost_logistic_regression(self.predict.logistic_regression,
                                                                models, weights, feature_matrix)

        # Assert the predictions
        self.assertEqual(list(predictions)[0:20],
                         [1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1])

        # Accuracy has to match 0.77612999999999999
        self.assertEqual(round(self.accuracy.general(predictions, sentiment), 5),
                         round(0.77612999999999999, 5))
class TestLinearRegression(unittest.TestCase):
    # Usage:
    #       Tests for the Linear Regression Class.

    def setUp(self):
        # Usage:
        #       Constructor for TestLinearRegression
        # Arguments:
        #       None

        # Create an instance of the Convert Numpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of the Linear Regression class
        self.linear_regression = LinearRegression()

        # Create an instance of the Predict Output Class
        self.predict_output = PredictOutput()

        # Create an instance of the Residual Sum Squares Class
        self.residual_sum_squares = ResidualSumSquares()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {
            'bathrooms': float, 'waterfront': int, 'sqft_above': int,
            'sqft_living15': float, 'grade': int, 'yr_renovated': int,
            'price': float, 'bedrooms': float, 'zipcode': str, 'long': float,
            'sqft_lot15': float, 'sqft_living': float, 'floors': str,
            'condition': int, 'lat': float, 'date': str, 'sqft_basement': int,
            'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int
        }

        # Create a kc_house_frame that encompasses all test and train data
        self.kc_house_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict)

        # Create a kc_house_train_frame that encompasses only train data
        self.kc_house_train_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_train_data.csv', dtype=dtype_dict)

        # Create a kc_test_frames that encompasses only test data
        self.kc_test_frames = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_test_data.csv', dtype=dtype_dict)

    def test_01_gradient_descent(self):
        # Usage:
        #       Tests the result on gradient descent
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will use price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(
            self.kc_house_train_frame, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(
            feature_matrix, output, initial_weights, step_size, tolerance)

        # Assert that the weights is correct
        # (assertEqual: the assertEquals alias is deprecated and removed in Python 3.12)
        self.assertEqual(round(-46999.887165546708, 3), round(final_weights[0], 3))
        self.assertEqual(round(281.91211917520917, 3), round(final_weights[1], 3))

    def test_02_gradient_descent_multiple(self):
        # Usage:
        #       Computes gradient descent on multiple input, and computes predicted model and RSS
        # Arguments:
        #       None

        # We will use sqft_living, and sqft_living15
        features = ['sqft_living', 'sqft_living15']

        # Output will be price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(
            self.kc_house_train_frame, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-100000., 1., 1.])

        # Step size
        step_size = 4e-12

        # Tolerance
        tolerance = 1e9

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(
            feature_matrix, output, initial_weights, step_size, tolerance)

        # We will use sqft_living, and sqft_living15
        test_features = ['sqft_living', 'sqft_living15']

        # Output will be price
        test_output = ['price']

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(
            self.kc_test_frames, test_features, test_output, 1)

        # Predict the output of test features
        predicted_output = self.predict_output.predict_output_linear_regression(
            test_feature_matrix, final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_linear_regression(
            test_output, predicted_output)

        # Assert that rss is correct
        self.assertEqual(round(270263443629803.41, -3), round(rss, -3))

    def test_03_hill_climbing(self):
        # Usage:
        #       Tests the result on hill climbing
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will be price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(
            self.kc_house_train_frame, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our hill climbing value
        final_weights = self.linear_regression.hill_climbing(
            feature_matrix, output, initial_weights, step_size, tolerance)

        # Assert that the weights is correct
        self.assertEqual(round(-47000.142201335177, 3), round(final_weights[0], 3))
        self.assertEqual(round(-352.86068692252599, 3), round(final_weights[1], 3))
class KFoldCrossValidation:
    """Computes K Fold Cross Validation.

    Attributes:
        convert_numpy (ConvertNumpy): Used to convert pandas data to numpy.
        predict_output (PredictOutput): Used to predict the output of the validation set.
        residual_sum_squares (ResidualSumSquares): Used to compute the validation error.

    """

    def __init__(self):
        """Constructor for KFoldCrossValidation.

        Sets up the ConvertNumpy, PredictOutput and ResidualSumSquares helpers.

        """
        self.convert_numpy = ConvertNumpy()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()

    def k_fold_cross_validation(self, k, data, model, model_parameters, output, features):
        """Computes the average validation error over k folds.

        Takes in our data, and splits the data to smaller subsets, and these smaller subsets
        are used as validation sets, and everything else not included in the validation set is used
        as training sets. The model is trained using the training set, and the residual sum of
        squares of its predictions on the validation set is accumulated.

        Args:
            k (int): Number of folds.
            data (pandas.DataFrame): Data used for k folds cross validation.
            model (callable): Called as ``model(**model_parameters, feature_matrix=..., output=...)``
                with the training data; its return value is used as the weights for prediction.
            model_parameters (dict): Extra keyword arguments passed to ``model``.
            output: Output name(s), passed unchanged to ``convert_numpy.convert_to_numpy``.
            features (list of str): A list of feature names.

        Returns:
            The sum of the validation errors of all folds divided by ``k``.

        """
        # Local import so this block does not rely on a module-level pandas import
        import pandas as pd

        # Get the length of the data
        length_data = len(data)

        # Sum of the validation error, will divide by k (fold) later
        validation_error_sum = 0

        # Loop through each fold
        for i in range(k):
            # Compute the start section of the current fold
            start = int((length_data*i)/k)

            # Compute the end section of the current fold (inclusive)
            end = int((length_data*(i+1))/k-1)

            # Validation set goes from start to end+1 (+1 since we need to include the end)
            validation_set = data[start:end+1]

            # The training set is everything left and right of the validation set
            # < 0       : Start   >   Train Set 1
            # < Start   : End + 1 >   Validation Set
            # < End + 1 : n       >   Train Set 2
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
            training_set = pd.concat([data[0:start], data[end+1:length_data]])

            # Convert our pandas frames to numpy
            validation_feature_matrix, validation_output = self.convert_numpy.convert_to_numpy(
                validation_set, features, output, 1)
            training_feature_matrix, training_output = self.convert_numpy.convert_to_numpy(
                training_set, features, output, 1)

            # Create a model with Train Set 1 + Train Set 2
            final_weights = model(**model_parameters, feature_matrix=training_feature_matrix,
                                  output=training_output)

            # Predict the output of the validation features
            predicted_output = self.predict_output.predict_output_linear_regression(
                validation_feature_matrix, final_weights)

            # Compute squared error (in other words, rss)
            validation_error_sum += self.residual_sum_squares.residual_sum_squares_linear_regression(
                validation_output, predicted_output)

        # Return the validation_error_sum divided by fold
        return validation_error_sum/k
class TestLinearRegression(unittest.TestCase):
    # Usage:
    #       Tests for the Linear Regression Class.

    def setUp(self):
        # Usage:
        #       Constructor for TestLinearRegression
        # Arguments:
        #       None

        # Create an instance of the Convert Numpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of the Linear Regression class
        self.linear_regression = LinearRegression()

        # Create an instance of the Predict Output Class
        self.predict_output = PredictOutput()

        # Create an instance of the Residual Sum Squares Class
        self.residual_sum_squares = ResidualSumSquares()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str,
                      'condition': int, 'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int,
                      'id': str, 'sqft_lot': int, 'view': int}

        # Create a kc_house_frame that encompasses all test and train data
        self.kc_house_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_data.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_train_frame that encompasses only train data
        self.kc_house_train_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_train_data.csv',
                                                dtype=dtype_dict)

        # Create a kc_test_frames that encompasses only test data
        self.kc_test_frames = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_test_data.csv',
                                          dtype=dtype_dict)

    def test_01_gradient_descent(self):
        # Usage:
        #       Tests the result on gradient descent
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will use price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame,
                                                                     features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(feature_matrix, output,
                                                                initial_weights, step_size,
                                                                tolerance)

        # Assert that the weights is correct
        # (assertEqual: the assertEquals alias is deprecated and removed in Python 3.12)
        self.assertEqual(round(-46999.887165546708, 3), round(final_weights[0], 3))
        self.assertEqual(round(281.91211917520917, 3), round(final_weights[1], 3))

    def test_02_gradient_descent_multiple(self):
        # Usage:
        #       Computes gradient descent on multiple input, and computes predicted model and RSS
        # Arguments:
        #       None

        # We will use sqft_living, and sqft_living15
        features = ['sqft_living', 'sqft_living15']

        # Output will be price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame,
                                                                     features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-100000., 1., 1.])

        # Step size
        step_size = 4e-12

        # Tolerance
        tolerance = 1e9

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(feature_matrix, output,
                                                                initial_weights, step_size,
                                                                tolerance)

        # We will use sqft_living, and sqft_living15
        test_features = ['sqft_living', 'sqft_living15']

        # Output will be price
        test_output = ['price']

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames,
                                                                               test_features,
                                                                               test_output, 1)

        # Predict the output of test features
        predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix,
                                                                                final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output,
                                                                               predicted_output)

        # Assert that rss is correct
        self.assertEqual(round(270263443629803.41, -3), round(rss, -3))

    def test_03_hill_climbing(self):
        # Usage:
        #       Tests the result on hill climbing
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will be price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame,
                                                                     features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our hill climbing value
        final_weights = self.linear_regression.hill_climbing(feature_matrix, output,
                                                             initial_weights, step_size,
                                                             tolerance)

        # Assert that the weights is correct
        self.assertEqual(round(-47000.142201335177, 3), round(final_weights[0], 3))
        self.assertEqual(round(-352.86068692252599, 3), round(final_weights[1], 3))
class TestRidgeRegression(unittest.TestCase): """Test for RidgeRegression. Uses housing data to test RidgeRegression. Statics: _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel. """ _multiprocess_can_split_ = True def setUp(self): """Constructor for TestRidgeRegression. Loads housing data, and creates training and testing data. """ self.convert_numpy = ConvertNumpy() self.ridge_regression = RidgeRegression() self.predict_output = PredictOutput() self.residual_sum_squares = ResidualSumSquares() self.k_fold_cross_validation = KFoldCrossValidation() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float, 'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int, 'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int} # Create a kc_house that encompasses all test and train data self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv', dtype=dtype_dict) # Create a kc_house_train_valid_shuffled that encompasses both train and valid data and shuffled self.kc_house_train_valid_shuffled = pd.read_csv('./unit_tests/test_data/regression/' 'kc_house_with_validation_k_fold/' 'wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict) def test_01_gradient_descent_no_penalty(self): """Tests gradient descent algorithm. 
Tests the result on gradient descent with low penalty. """ # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, {"initial_weights": initial_weights, "step_size": step_size, "tolerance": tolerance, "l2_penalty": l2_penalty, "max_iteration": max_iterations}) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(-0.16311351478746433, 5), round(final_weights[0], 5)) self.assertEquals(round(263.02436896538489, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(275723632153607.72, -5), round(rss, -5)) def test_02_gradient_descent_high_penalty(self): """Tests gradient descent. Tests the result on gradient descent with high penalty. 
""" # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, {"initial_weights": initial_weights, "step_size": step_size, "tolerance": tolerance, "l2_penalty": l2_penalty, "max_iteration": max_iterations}) # We will use sqft_iving test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(9.7673000000000005, 5), round(final_weights[0], 5)) self.assertEquals(round(124.572, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(694642101500000.0, -5), round(rss, -5)) def test_03_gradient_descent_multiple_high_penalty(self): """Tests gradient descent. Tests gradient descent with multiple features, and high penalty. 
""" # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'sqft_living15'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1) # Create our initial weights initial_weights = np.array([0.0, 0.0, 0.0]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, {"initial_weights": initial_weights, "step_size": step_size, "tolerance": tolerance, "l2_penalty": l2_penalty, "max_iteration": max_iterations}) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living', 'sqft_living15'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(6.7429699999999997, 5), round(final_weights[0], 5)) self.assertEquals(round(91.489000000000004, 3), round(final_weights[1], 3)) self.assertEquals(round(78.437490333967176, 3), round(final_weights[2], 3)) # Assert that rss is correct self.assertEquals(round(500404800500842.0, -5), round(rss, -5)) # Look at the first predicted output self.assertEquals(round(270453.53000000003, 3), round(predicted_output[0], 3)) # The first output should be 310000 in the test set self.assertEquals(310000.0, test_output[0]) def test_04_gradient_descent_k_fold(self): """Tests gradient descent with K fold cross validation. 
Tests best l2_penalty for ridge regression using gradient descent. """ # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Tolerance tolerance = None # Max Iterations to Run max_iterations = 1000 # Number of Folds folds = 10 # Store Cross Validation results cross_validation_results = [] # We want to test l2 penalty values in [10^1, 10^2, 10^3, 10^4, ..., 10^11] for l2_penalty in np.logspace(1, 11, num=11): # Create a dictionary of model_parameters model_parameters = {'step_size': step_size, 'max_iteration': max_iterations, 'initial_weights': initial_weights, 'tolerance': tolerance, 'l2_penalty': l2_penalty} # Compute the cross validation results cv = self.k_fold_cross_validation.k_fold_cross_validation(folds, self.ridge_regression.gradient_descent, model_parameters, {"data": self.kc_house_train, "output": output, "features": features}) # Append it into the results cross_validation_results.append((l2_penalty, cv)) # Lowest Result lowest = sorted(cross_validation_results, key=lambda x: x[1])[0] # Assert True that 10000000 is the l2_penalty that gives the lowest cross validation error self.assertEquals(10000000.0, lowest[0]) # Assert True that is the lowest l2_penalty self.assertEquals(round(120916225809145.0, 0), round(lowest[1], 0)) def test_05_gradient_ascent(self): """Tests gradient ascent. Tests gradient ascent and compare it with known values. 
""" # We will use sqft_living for our features features = ['sqft_living'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our hill climbing value final_weights = self.ridge_regression.gradient_ascent(feature_matrix, output, {"initial_weights": initial_weights, "step_size": step_size, "tolerance": tolerance, "l2_penalty": l2_penalty, "max_iteration": max_iterations}) # Assert that the weights is correct self.assertEquals(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68)) self.assertEquals(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70)) def test_07_gradient_ascent_high_tolerance(self): """Tests gradient ascent. Tests gradient ascent and compare it with known values. """ # We will use sqft_living for our features features = ['sqft_living'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = 1 # L2 Penalty l2_penalty = 0.0 # Compute our hill climbing value final_weights = self.ridge_regression.gradient_ascent(feature_matrix, output, {"initial_weights": initial_weights, "step_size": step_size, "tolerance": tolerance, "l2_penalty": l2_penalty, "max_iteration": max_iterations}) # Assert that the weights is correct self.assertEquals(0, round(final_weights[0], -68)) self.assertEquals(0, round(final_weights[1], -70)) def test_08_gradient_descent_no_penalty_high_tolerance(self): """Tests gradient descent algorithm. 
Tests the result on gradient descent with low penalty. """ # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 100000 # Tolerance tolerance = 10000000000 # L2 Penalty l2_penalty = 0.0 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, {"initial_weights": initial_weights, "step_size": step_size, "tolerance": tolerance, "l2_penalty": l2_penalty, "max_iteration": max_iterations}) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.093859999999999999, 5), round(final_weights[0], 5)) self.assertEquals(round(262.98200000000003, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(275724298300000.0, -5), round(rss, -5))
class TestWeightedBinaryDecisionTrees(unittest.TestCase): """Tests for the BinaryDecisionTrees class. Uses lending club data to test binary decision trees. Statics: _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel. """ _multiprocess_can_split_ = True def setUp(self): """Constructor for WeightedBinaryDecisionTrees. We will clean up the loans_data by doing one hot encoding our features list, however, in the end we will use some pre-built data for training and testing, but it uses the same method for one hot encoding. """ self.weighted_binary_decision_trees = WeightedBinaryDecisionTrees() self.adaboost = AdaBoost() self.predict = PredictOutput() self.accuracy = Accuracy() self.error = Error() # Pandas type set dtype_dict = {'grade': str, 'term': str, 'emp_length': str, 'bad_loans': int} # Load the lending club data self.loans_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_data.csv', dtype=dtype_dict) # List features and targets that we are interested self.features = ['grade', 'term', 'home_ownership', 'emp_length'] self.target = 'safe_loans' # Do a one hot encoding of features for feature in self.features: # One hot encode loans_data_one_hot_encoded = pd.get_dummies(self.loans_data[feature].apply(lambda x: x), prefix=feature, prefix_sep='.') # Drop the feature self.loans_data.drop(feature, axis=1, inplace=True) # Join the feature with the new one encoded features self.loans_data = pd.concat([self.loans_data, loans_data_one_hot_encoded], axis=1) # Update our features self.features = list(self.loans_data.columns.values) self.features.remove('safe_loans') # Load our training and testing data self.train_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_train.csv') self.test_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_test.csv') def test_01_greedy_recursive(self): """Tests greedy recursive function for BinaryDecisionTrees class We will 
use the training data to build a decision tree, and measure the accuracy with some known good values. """ # Create data weights data_weights = pd.Series([1.] * 10 + [0.] * (len(self.train_data) - 20) + [1.] * 10) # Create a decision tree decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data, self.features, self.target, {"data_weights": data_weights, "current_depth": 0, "max_depth": 2, "minimum_error": 1e-15}) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(decision_tree, self.train_data, self.target) # Assert that the classification should be 0.48124865678057166 self.assertEqual(round(accuracy, 5), round(0.48124865678057166, 5)) def test_02_greedy_recursive_high_depth_low_features(self): """Tests greedy recursive function for BinaryDecisionTrees class We will use the training data to build a decision tree, and measure the accuracy with some known good values. """ # Create data weights data_weights = pd.Series([1.] * 10 + [0.] * (len(self.train_data) - 20) + [1.] * 10) # Create a decision tree decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data, ['grade.A', 'grade.B'], self.target, {"data_weights": data_weights, "current_depth": 0, "max_depth": 2000, "minimum_error": 1e-15}) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(decision_tree, self.train_data, self.target) # Assert that the classification should be 0.54148 self.assertEqual(round(accuracy, 5), round(0.54148, 5)) def test_03_greedy_recursive_high_depth_low_features(self): """Tests greedy recursive function for BinaryDecisionTrees class We will use the training data to build a decision tree, and measure the accuracy with some known good values. """ # Create data weights data_weights = pd.Series([1.] * 10 + [2.] * (len(self.train_data) - 20) + [-1.] 
* 10) # Create a decision tree decision_tree = self.weighted_binary_decision_trees.greedy_recursive(self.train_data, ['grade.A', 'grade.B', 'grade.C', 'grade.D', 'grade.E', 'grade.F', 'grade.G', 'term. 36 months'], self.target, {"data_weights": data_weights, "current_depth": 0, "max_depth": 20000, "minimum_error": 1e-15}) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(decision_tree, self.train_data, self.target) # Assert that the classification should be 0.38491 self.assertEqual(round(accuracy, 5), round(0.38491, 5)) def test_04_adaboost(self): """Tests the adaboost algorithm. Tests the adaboost algorithm with low number of iterations. """ # Create two weighted binary decision trees weights_list, _ = self.adaboost.decision_tree(self.train_data, self.features, self.target, iterations=2, model_dict={"predict_method": self.predict.binary_tree, "model": self.weighted_binary_decision_trees, "model_method": "greedy_recursive", "model_parameters": {"max_depth": 1, "minimum_error": 1e-15, "current_depth": 0}}) # The weights have to equal to [0.15802933659263743, 0.1768236329364191] self.assertEqual([round(i, 5) for i in weights_list], [round(0.15802933659263743, 5), round(0.1768236329364191, 5)]) def test_05_adaboost_high_iterations(self): """Tests the adaboost algorithm. Tests the adaboost algorithm with high number of iterations. 
""" # Create ten weighted binary decision trees weights_list, models_list = self.adaboost.decision_tree(self.train_data, self.features, self.target, iterations=10, model_dict={"predict_method": self.predict.binary_tree, "model": self.weighted_binary_decision_trees, "model_method": "greedy_recursive", "model_parameters": {"max_depth": 1, "minimum_error": 1e-15, "current_depth": 0 }}) # Get the predictions of each dataset in the test data predictions = self.predict.adaboost_binary_decision_tree(self.predict.binary_tree, models_list, weights_list, self.test_data) # Assert the predictions self.assertEqual(list(predictions)[0:20], [-1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1]) # Accuracy has to match 0.620314519604 self.assertEqual(round(self.accuracy.decision_tree(self.test_data, predictions, self.target), 5), round(0.620314519604, 5))
class KFoldCrossValidation:
    """Class for K Fold Cross Validation.

    Class for K Fold Cross Validation for selecting best parameters.

    Attributes:
        convert_numpy (ConvertNumpy): Pandas to Numpy conversion class.
        predict_output (PredictOutput): Output prediction.
        residual_sum_squares (ResidualSumSquares): Computes residual sum of squares.

    """

    def __init__(self):
        """Constructor for KFoldCrossValidation.

        Constructor for KFoldCrossValidation, used to setup numpy conversion, output prediction, and
        residual sum of squares.

        """
        self.convert_numpy = ConvertNumpy()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()

    def k_fold_cross_validation(self, k, model, model_parameters, data_parameters):
        """Performs K Fold Cross Validation.

        Splits the data into k consecutive folds. Each fold is used once as the validation set while
        the remaining rows form the training set. The model is trained on the training set, its
        residual sum of squares is computed on the validation set, and the k results are averaged.

        Args:
            k (int): Number of folds.
            model (callable): Callable invoked with the keyword arguments model_parameters,
                feature_matrix, and output; its return value is used as the weights.
            model_parameters (dict): Model parameters to train the specified model.
            data_parameters (dict): A dictionary of data information:
                {
                    data (pandas.DataFrame): Data used for k folds cross validation,
                    output: Output column name(s), forwarded to convert_to_numpy,
                    features (list of str): A list of feature names.
                }

        Returns:
            float: Average validation error.

        """
        # Sum of the validation error, will divide by k (fold) later
        validation_error_sum = 0

        # Loop through each fold
        for i in range(k):
            # Computes validation, and training set
            validation_set, training_set = self.create_validation_training_set(data_parameters["data"], k, i)

            # Convert our pandas frame to numpy to create validation set
            validation_set_matrix, validation_output = self.convert_numpy.convert_to_numpy(
                validation_set, data_parameters["features"], data_parameters["output"], 1)

            # Create a model with Train Set 1 + Train Set 2
            final_weights = self.create_weights(model, model_parameters, training_set, data_parameters)

            # Predict the output of the validation features
            predicted_output = self.predict_output.regression(validation_set_matrix, final_weights)

            # Compute squared error (in other words, rss)
            validation_error_sum += self.residual_sum_squares.residual_sum_squares_regression(validation_output,
                                                                                              predicted_output)

        # Return the validation_error_sum divided by fold
        return validation_error_sum/k

    @staticmethod
    def create_validation_training_set(data, k, iteration):
        """Slice data according to k, iteration, and size of data.

        Computes the validation, and training set according to the k number of folds, and the current
        iteration. Rows keep their original index labels in both returned frames.

        Args:
            data (pandas.DataFrame): Data used for k folds cross validation.
            k (int): Number of folds.
            iteration (int): Current K fold validation iteration.

        Returns:
            A tuple that contains validation set, and training set (in that order):
                (
                    validation_set (pandas.DataFrame): Validation set.
                    training_set (pandas.DataFrame): Training set.
                )

        """
        # Imported here so the method does not depend on a module level pandas alias
        import pandas as pd

        length_data = len(data)

        # Compute the start section of the current fold
        start = int((length_data * iteration) / k)

        # Compute the end section of the current fold (inclusive)
        end = int((length_data * (iteration + 1)) / k - 1)

        # Get our validation set from the start to the end+1 (+1 since we need to include the end)
        # <Start : end + 1> Validation Set
        validation_set = data[start:end + 1]

        # The training set is the left and the right parts of the validation set
        # < 0 : Start >         Train Set 1
        # < Start : End + 1 >   Validation Set
        # < End + 1 : n >       Train Set 2
        # Train Set 1 + Train Set 2 = All data excluding validation set
        # pd.concat is used because DataFrame.append is no longer available in current pandas
        training_set = pd.concat([data[0:start], data[end + 1:length_data]])

        return validation_set, training_set

    def create_weights(self, model, model_parameters, training_set, data_parameters):
        """Use model to create weights.

        Converts the training set to numpy and calls model on it to create a set of coefficients.

        Args:
            model (callable): Callable invoked with the keyword arguments model_parameters,
                feature_matrix, and output.
            model_parameters (dict): A dictionary of model parameters.
            training_set (pandas.DataFrame): Train set used for k folds cross validation.
            data_parameters (dict): A dictionary of data information:
                {
                    data (pandas.DataFrame): Data used for k folds cross validation,
                    output: Output column name(s), forwarded to convert_to_numpy,
                    features (list of str): A list of feature names.
                }

        Returns:
            Whatever model returns; the cross validation loop uses it as the weights.

        """
        # Convert our pandas frame to numpy to create training set
        training_feature_matrix, training_output = self.convert_numpy.convert_to_numpy(
            training_set, data_parameters["features"], data_parameters["output"], 1)

        # Create a model with Train Set 1 + Train Set 2
        return model(model_parameters=model_parameters,
                     feature_matrix=training_feature_matrix,
                     output=training_output)
class TestLassoRegression(unittest.TestCase): """Tests for TestLassoRegression. Uses housing data to test LassoRegression. Statics: _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel. """ _multiprocess_can_split_ = True def setUp(self): """Constructor for TestLassoRegression. Loads housing data, and creates training and testing data. """ self.convert_numpy = ConvertNumpy() self.normalize_features = NormalizeFeatures() self.lasso = LassoRegression() self.predict_output = PredictOutput() self.residual_sum_squares = ResidualSumSquares() self.k_fold_cross_validation = KFoldCrossValidation() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float, 'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int, 'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int} # Create a kc_house that encompasses all test and train data self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv', dtype=dtype_dict) # Convert all the frames with the floors to float type self.kc_house['floors'] = self.kc_house['floors'].astype(float) self.kc_house_train['floors'] = self.kc_house['floors'].astype(float) self.kc_house_test['floors'] = self.kc_house['floors'].astype(float) # Then back to int type self.kc_house['floors'] = self.kc_house['floors'].astype(int) 
self.kc_house_train['floors'] = self.kc_house['floors'].astype(int) self.kc_house_test['floors'] = self.kc_house['floors'].astype(int) def test_01_normalize_features(self): """Tests normalizing features. Test normalization features, and compare it with known values. """ # Normalize the features, and also return the norms features, norms = self.normalize_features.l2_norm(np.array([[3., 6., 9.], [4., 8., 12.]])) # Assert that the np array is equal to features self.assertTrue(np.array_equal(np.array([[0.6, 0.6, 0.6], [0.8, 0.8, 0.8]]), features), True) # Assert that the np array is equal to norms self.assertTrue(np.array_equal(np.array([5., 10., 15.]), norms), True) def test_02_compute_ro(self): """Test compute ro Test compute one round of ro. """ # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'bedrooms'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output, 1) # Create our initial weights normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix) # Set initial weights weights = np.array([1., 4., 1.]) # Compute ro_j ro_j = self.lasso.compute_ro_j(normalized_feature_matrix, output, weights) # Assert the output of ro_j self.assertTrue(np.allclose(ro_j, np.array([79400300.03492916, 87939470.77299108, 80966698.67596565]))) def test_03_compute_coordinate_descent_step(self): """Test one coordinate descent step. Test one coordinate descent step and compare it with known values. """ # Assert that both are equal self.assertEquals(round(self.lasso.lasso_coordinate_descent_step({"i": 1, "weights": np.array([1., 4.])}, np.array([[3./math.sqrt(13), 1./math.sqrt(10)], [2./math.sqrt(13), 3./math.sqrt(10)]]), np.array([1., 1.]), {"l1_penalty": 0.1}), 8), round(0.425558846691, 8)) def test_04_coordinate_descent(self): """Test coordinate descent. Test coordinate descent and compare with known values. 
""" # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'bedrooms'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output, 1) # Create our initial weights normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix) # Set initial weights initial_weights = np.zeros(3) # Set l1 penalty l1_penalty = 1e7 # Set tolerance tolerance = 1.0 # Compute the weights using coordinate descent weights = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, {"initial_weights": initial_weights, "l1_penalty": l1_penalty, "tolerance": tolerance}) # Assert that these two numpy arrays are the same self.assertTrue(np.allclose(weights, np.array([21624998.3663629, 63157246.78545423, 0.]), True)) # Predict the output predicted_output = self.predict_output.regression(normalized_feature_matrix, weights) # Assert that the RSS is what we wanted self.assertEquals(round(self.residual_sum_squares.residual_sum_squares_regression(output, predicted_output), -10), round(1.63049248148e+15, -10)) def test_05_coordinate_descent_with_normalization(self): """Test coordinate descent with normalization. Test coordinate descent and then normalize the result, so that we can use the weights on a test set. 
""" # We will use multiple features features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1) # Create our initial weights normalized_feature_matrix, norms = self.normalize_features.l2_norm(feature_matrix) # Compute Multiple Weights weights1e7 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, {"initial_weights": np.zeros(len(features)+1), "l1_penalty": 1e7, "tolerance": 1}) weights1e8 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, {"initial_weights": np.zeros(len(features)+1), "l1_penalty": 1e8, "tolerance": 1}) weights1e4 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, {"initial_weights": np.zeros(len(features)+1), "l1_penalty": 1e4, "tolerance": 5e5}) # Compute multiple normalized normalized_weights1e4 = weights1e4 / norms normalized_weights1e7 = weights1e7 / norms normalized_weights1e8 = weights1e8 / norms # We will use multiple features features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated'] # Output will use price output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, features, output, 1) # Predict the output predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e4) # Assert that the RSS is what we wanted self.assertEquals(round(self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output), -12), round(2.2778100476e+14, -12)) # Predict the output predicted_output = 
self.predict_output.regression(test_feature_matrix, normalized_weights1e7) # Assert that the RSS is what we wanted self.assertEquals(round(self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output), -12), round(2.75962079909e+14, -12)) # Predict the output predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e8) # Assert that the RSS is what we wanted self.assertEquals(round(self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output), -12), round(5.37049248148e+14, -12))
class Accuracy:
    """Class for computing accuracy.

    Computes accuracy for general classification, decision trees, and logistic regression. Every method
    evaluates

        accuracy = # Correctly classified data points
                   ----------------------------------
                   # Total data points

    Attributes:
        predict_output (PredictOutput): Class used to predict output.

    """

    def __init__(self):
        """Constructor for Accuracy class to setup predict output.

        Constructor for Accuracy class to setup predict output class.

        """
        self.predict_output = PredictOutput()

    @staticmethod
    def general(predictions, label):
        """Computes general form of accuracy for classification.

        Needs to have predictions and labels before using this function. Both arguments are wrapped in a
        pandas.Series and compared element by element.

        Args:
            predictions (array-like): Predicted class for each data point.
            label (array-like): True class for each data point.

        Returns:
            float: Accuracy.

        """
        # Count the positions where the prediction matches the label
        num_correct = (pd.Series(predictions) == pd.Series(label)).sum()

        # Divide by a float so the ratio is never floored by integer division
        return num_correct / float(len(label))

    def logistic_regression(self, feature_matrix, label, coefficients):
        """Computes accuracy for logistic regression.

        Takes a feature matrix and coefficients from logistic regression, asks predict_output for the
        predictions, and compares them against label.

        Args:
            feature_matrix (numpy.matrix): A numpy matrix containing features.
            label (numpy.array): A numpy array containing labels.
            coefficients (numpy.array): A numpy array containing coefficients.

        Returns:
            float: Accuracy.

        """
        # Get the predictions
        predictions = self.predict_output.logistic_regression(feature_matrix, coefficients)

        # Count the positions where the prediction matches the label
        num_correct = (predictions == label).sum()

        # Divide by a float so the ratio is never floored by integer division
        return num_correct / float(len(feature_matrix))

    @staticmethod
    def decision_tree(data, predictions, target):
        """Computes accuracy for a decision tree.

        Takes data and predictions along with the target column name, and returns one minus the fraction
        of rows whose target differs from the prediction.

        Note:
            The predictions are stored on data as a "prediction" column, so the frame passed in is
            modified (an existing "prediction" column is overwritten).

        Args:
            data (pandas.DataFrame): Train/testing data.
            predictions (pandas.Series): A pandas series containing output prediction for data.
            target (str): The target string.

        Returns:
            float: Accuracy.

        """
        # Add the predictions to the data
        data["prediction"] = predictions

        # Count the mistakes with a single column comparison instead of a Python call per row
        mistakes = (data[target] != data["prediction"]).sum()

        # One minus the mistakes divided by the length of the data
        return 1 - (float(mistakes) / float(len(data)))
class TestRidgeRegression(unittest.TestCase): # Usage: # Tests for the Linear Regression Class. def setUp(self): # Usage: # Constructor for TestRidgeRegression # Arguments: # None # Create an instance of the Convert Numpy class self.convert_numpy = ConvertNumpy() # Create an instance of the Linear Regression class self.ridge_regression = RidgeRegression() # Create an instance of the Predict Output Class self.predict_output = PredictOutput() # Create an instance of the Residual Sum Squares Class self.residual_sum_squares = ResidualSumSquares() # Create an instance of the K Fold Cross Validation Class self.k_fold_cross_validation = KFoldCrossValidation() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int} # Create a kc_house_frame that encompasses all test and train data self.kc_house_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_test_frames = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_test_data.csv', dtype=dtype_dict) # Create a kc_house_train_valid_shuffled that encompasses both train and valid data and shuffled self.kc_house_train_valid_shuffled = pd.read_csv('./unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict) def test_01_gradient_descent_no_penalty(self): # Usage: # Tests the result on gradient 
descent with low penalty # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(-0.16311351478746433, 5), round(final_weights[0], 5)) self.assertEquals(round(263.02436896538489, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(275723632153607.72, -5), round(rss, -5)) def test_02_gradient_descent_high_penalty(self): # Usage: # Tests the result on gradient descent with high penalty # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size 
step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.048718475774044, 5), round(final_weights[0], 5)) self.assertEquals(round(124.57402057376679, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(694654309578537.25, -5), round(rss, -5)) def test_03_gradient_descent_multiple_high_penalty(self): # Usage: # Tests the result on gradient descent with high penalty # Arguments: # None # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'sqft_living15'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0.0, 0.0, 0.0]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving, and sqft_living15 test_features = 
['sqft_living', 'sqft_living15'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.033601165521060711, 5), round(final_weights[0], 5)) self.assertEquals(round(91.490167574878328, 3), round(final_weights[1], 3)) self.assertEquals(round(78.437490333967176, 3), round(final_weights[2], 3)) # Assert that rss is correct self.assertEquals(round(500408530236718.31, 0), round(rss, 0)) # Look at the first predicted output self.assertEquals(round(270449.70602770313, 3), round(predicted_output[0], 3)) # The first output should be 310000 in the test set self.assertEquals(310000.0, test_output[0]) def test_04_gradient_descent_k_fold(self): # Usage: # Tests best l2_penalty for ridge regression using gradient descent # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Tolerance tolerance = None # Max Iterations to Run max_iterations = 1000 # Number of Folds folds = 10 # Store Cross Validation results cross_validation_results = [] # We want to test l2 penalty values in [10^1, 10^2, 10^3, 10^4, ..., 10^11] for l2_penalty in np.logspace(1, 11, num=11): # Create a dictionary of model_parameters model_parameters = {'step_size': step_size, 'max_iteration': max_iterations, 'initial_weights': initial_weights, 'tolerance': tolerance, 'l2_penalty': l2_penalty} # Compute the cross validation results cross_validation = 
self.k_fold_cross_validation.k_fold_cross_validation(folds, self.kc_house_train_frame, self.ridge_regression.gradient_descent, model_parameters, output, features) # Append it into the results cross_validation_results.append((l2_penalty, cross_validation)) # Lowest Result lowest = sorted(cross_validation_results, key=lambda x: x[1])[0] # Assert True that 10000000 is the l2_penalty that gives the lowest cross validation error self.assertEquals(10000000.0, lowest[0]) # Assert True that is the lowest l2_penalty self.assertEquals(round(120916225812152.84, 0), round(lowest[1], 0)) def test_05_hill_climbing(self): # Usage: # Tests the result on hill climbing # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our hill climbing value final_weights = self.ridge_regression.hill_climbing(feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # Assert that the weights is correct self.assertEquals(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68)) self.assertEquals(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))
class LassoRegression:
    """Class to compute Lasso Regression.

    Lasso Regression is essentially L1 Norm with Linear Regression. We cannot use gradient descent since the
    absolute value for L1 Norm is not differentiable. Hence we use coordinate descent.

    Attributes:
        predict_output (PredictOutput): A PredictOutput class that can predict output given features and weights.

    """

    def __init__(self):
        """Constructor for LassoRegression to setup PredictOutput.

        Constructor for the LassoRegression class, mainly used to setup PredictOutput.

        """
        self.predict_output = PredictOutput()

    def lasso_cyclical_coordinate_descent(self, feature_matrix, output, model_parameters):
        """Coordinate descent algorithm for Lasso regression.

        Performs a Lasso Cyclical Coordinate Descent, which loops over each feature and performs a coordinate
        descent step on it. A sweep over all features is repeated until every weight change in a sweep is at
        most the tolerance, or until max_iteration sweeps have been done (when max_iteration is given).

        Lasso Regression is based on:
            w_j = ro_j + delta/2    if ro_j < -delta/2
                  0                 if ro_j between [-delta/2,delta/2]
                  ro_j - delta/2    if ro_j > delta/2
        Where,
            ro_j = Sigma(N, i=1, h_j(x_i)(y_i-y^_i(w_-j).
            h_j(x_i): Normalized features of x_i (input features, but without j feature).
            y_i: Real output.
            y^_i(w_-j): Predicted output without feature j.

        Args:
            feature_matrix (numpy.ndarray): Feature matrix.
            output (numpy.array): Real output for the feature matrix.
            model_parameters (dict): A dictionary of model parameters,
                {
                    initial_weights (numpy.array): The starting initial weights (never modified),
                    tolerance (float or None): Tolerance (or epsilon), None disables the tolerance check,
                    l1_penalty (float): L1 penalty value,
                    max_iteration (int, optional): Maximum number of sweeps over all features.
                }

        Returns:
            numpy.array: final weights after coordinate descent has been completed

        Raises:
            ValueError: If neither a tolerance nor a max_iteration is supplied, since the loop could
                never terminate.

        """
        tolerance = model_parameters.get("tolerance")
        max_iteration = model_parameters.get("max_iteration")

        # Without any stopping rule the sweep below would run forever
        if tolerance is None and max_iteration is None:
            raise ValueError("either 'tolerance' or 'max_iteration' is required to stop coordinate descent")

        # Work on a float copy so the caller's initial_weights array is never changed in place
        weights = np.array(model_parameters["initial_weights"], dtype=float)

        sweeps = 0
        while max_iteration is None or sweeps < max_iteration:
            # Largest absolute weight change seen during this sweep
            largest_change = 0.0

            # Each step sees the weights already updated earlier in the same sweep
            for i in range(len(weights)):
                # Remember the old weight
                old_weight_i = weights[i]

                # Compute the current weight
                weights[i] = self.lasso_coordinate_descent_step({"i": i, "weights": weights},
                                                                feature_matrix, output, model_parameters)

                largest_change = max(largest_change, abs(old_weight_i - weights[i]))

            sweeps += 1

            # Stop once no weight moved by more than the tolerance
            if tolerance is not None and largest_change <= tolerance:
                break

        return weights

    def lasso_coordinate_descent_step(self, step_parameters, feature_matrix, output, model_parameters):
        """Computes the Lasso coordinate descent step.

        Computes the Lasso coordinate descent step, which is essentially computing a new ro_i, and based on
        the index and ro_i, compute new w_i weight.

        Args:
            step_parameters (dict): A dictionary for step data,
                {
                    i (int): Feature i,
                    weights (numpy.array): Current weights.
                }
            feature_matrix (numpy.ndarray): Feature matrix.
            output (numpy.array): Real output for feature_matrix.
            model_parameters (dict): A dictionary of model parameters; only l1_penalty (float) is read here,
                and only when i is not 0.

        Returns:
            new_weight_i (float): New weight for the feature i.

        """
        i = step_parameters["i"]

        # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
        # Only feature i is needed, so the other features' ro values are not computed
        ro_i = self._compute_ro_single(feature_matrix, output, step_parameters["weights"], i)

        # when i == 0, then it's a intercept -- do not regularize
        if i == 0:
            return ro_i

        # delta/2, always as a float so an integer penalty is never floored
        half_penalty = model_parameters["l1_penalty"] / 2.0

        # w_i = ro_i + delta/2    if ro_i < -delta/2
        #       0                 if ro_i between [-delta/2,delta/2]
        #       ro_i - delta/2    if ro_i > delta/2
        if ro_i < -half_penalty:
            return ro_i + half_penalty
        if ro_i > half_penalty:
            return ro_i - half_penalty
        return 0.

    def compute_ro_j(self, feature_matrix, real_output, weights):
        """Computes ro_j.

        Computes ro_j using ro_j = Sigma(N, i=1, h_j(x_i)(y_i-y^_i(w_-j) for every feature j.

        Args:
            feature_matrix (numpy.ndarray): Feature matrix.
            real_output (numpy.array): Real output (not predicted) for feature_matrix.
            weights (numpy.array): The current weights.

        Returns:
            ro (numpy.array): ro (or new weights for each feature).

        """
        # Number of features (columns)
        feature_num = feature_matrix.shape[1]

        # Set ro to be an array that is feature_num size
        ro = np.zeros(feature_num)

        # Loop through feature
        for j in range(feature_num):
            ro[j] = self._compute_ro_single(feature_matrix, real_output, weights, j)

        return ro

    def _compute_ro_single(self, feature_matrix, real_output, weights, j):
        """Computes ro_j for the single feature j.

        Args:
            feature_matrix (numpy.ndarray): Feature matrix.
            real_output (numpy.array): Real output (not predicted) for feature_matrix.
            weights (numpy.array): The current weights.
            j (int): Index of the feature (column) to compute ro for.

        Returns:
            float: ro_j.

        """
        # prediction = y_i(w_-j), prediction without feature j
        prediction = self.predict_output.regression(np.delete(feature_matrix, j, axis=1),
                                                    np.delete(weights, j))

        # residual = output - prediction
        residual = real_output - prediction

        # ro[j] = Sigma(N, i=1, feature_i) * residual
        return np.sum([feature_matrix[:, j] * residual])
class TestRidgeRegression(unittest.TestCase): # Usage: # Tests for the Linear Regression Class. def setUp(self): # Usage: # Constructor for TestRidgeRegression # Arguments: # None # Create an instance of the Convert Numpy class self.convert_numpy = ConvertNumpy() # Create an instance of the Linear Regression class self.ridge_regression = RidgeRegression() # Create an instance of the Predict Output Class self.predict_output = PredictOutput() # Create an instance of the Residual Sum Squares Class self.residual_sum_squares = ResidualSumSquares() # Create an instance of the K Fold Cross Validation Class self.k_fold_cross_validation = KFoldCrossValidation() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = { 'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float, 'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int, 'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int } # Create a kc_house_frame that encompasses all test and train data self.kc_house_frame = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train_frame = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_test_frames = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_test_data.csv', dtype=dtype_dict) # Create a kc_house_train_valid_shuffled that encompasses both train and valid data and shuffled self.kc_house_train_valid_shuffled = pd.read_csv( './unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict) def test_01_gradient_descent_no_penalty(self): # Usage: # 
Tests the result on gradient descent with low penalty # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent( feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy( self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression( test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression( test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(-0.16311351478746433, 5), round(final_weights[0], 5)) self.assertEquals(round(263.02436896538489, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(275723632153607.72, -5), round(rss, -5)) def test_02_gradient_descent_high_penalty(self): # Usage: # Tests the result on gradient descent with high penalty # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = 
np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent( feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy( self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression( test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression( test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.048718475774044, 5), round(final_weights[0], 5)) self.assertEquals(round(124.57402057376679, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(694654309578537.25, -5), round(rss, -5)) def test_03_gradient_descent_multiple_high_penalty(self): # Usage: # Tests the result on gradient descent with high penalty # Arguments: # None # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'sqft_living15'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0.0, 0.0, 0.0]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent( feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving, and 
sqft_living15 test_features = ['sqft_living', 'sqft_living15'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy( self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression( test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression( test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.033601165521060711, 5), round(final_weights[0], 5)) self.assertEquals(round(91.490167574878328, 3), round(final_weights[1], 3)) self.assertEquals(round(78.437490333967176, 3), round(final_weights[2], 3)) # Assert that rss is correct self.assertEquals(round(500408530236718.31, 0), round(rss, 0)) # Look at the first predicted output self.assertEquals(round(270449.70602770313, 3), round(predicted_output[0], 3)) # The first output should be 310000 in the test set self.assertEquals(310000.0, test_output[0]) def test_04_gradient_descent_k_fold(self): # Usage: # Tests best l2_penalty for ridge regression using gradient descent # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Tolerance tolerance = None # Max Iterations to Run max_iterations = 1000 # Number of Folds folds = 10 # Store Cross Validation results cross_validation_results = [] # We want to test l2 penalty values in [10^1, 10^2, 10^3, 10^4, ..., 10^11] for l2_penalty in np.logspace(1, 11, num=11): # Create a dictionary of model_parameters model_parameters = { 'step_size': step_size, 'max_iteration': max_iterations, 'initial_weights': initial_weights, 'tolerance': tolerance, 'l2_penalty': l2_penalty } # Compute the cross validation results 
cross_validation = self.k_fold_cross_validation.k_fold_cross_validation( folds, self.kc_house_train_frame, self.ridge_regression.gradient_descent, model_parameters, output, features) # Append it into the results cross_validation_results.append((l2_penalty, cross_validation)) # Lowest Result lowest = sorted(cross_validation_results, key=lambda x: x[1])[0] # Assert True that 10000000 is the l2_penalty that gives the lowest cross validation error self.assertEquals(10000000.0, lowest[0]) # Assert True that is the lowest l2_penalty self.assertEquals(round(120916225812152.84, 0), round(lowest[1], 0)) def test_05_hill_climbing(self): # Usage: # Tests the result on hill climbing # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our hill climbing value final_weights = self.ridge_regression.hill_climbing( feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # Assert that the weights is correct self.assertEquals(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68)) self.assertEquals(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))
class TestBinaryDecisionTrees(unittest.TestCase): """Tests for the BinaryDecisionTrees class. Uses lending club data to test binary decision trees. Attributes: binary_decision_trees (BinaryDecisionTrees): Binary decision tree class. predict_output (PredictOutput): Predict output class to predict output for decision tree. accuracy (Accuracy): Measures the accuracy of algorithms. error (Error): Measure the accuracy of algorithms. loans_data (pandas.DataFrame): Lending Club Data. features (list of str): List of features to build decision tree on. target (str): The target that we are predicting. train_data (pandas.DataFrame): Lending Club training Data. test_data (pandas.DataFrame): Lending Club testing data. Statics: _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel. """ _multiprocess_can_split_ = True def setUp(self): """Constructor for TestBinaryDecisionTrees. We will clean up the loans_data by doing one hot encoding our features list, however, in the end we will use some pre-built data for training and testing, but it uses the same method for one hot encoding. 
""" self.binary_decision_trees = BinaryDecisionTrees() self.predict_output = PredictOutput() self.accuracy = Accuracy() self.error = Error() # Pandas type set dtype_dict = {'grade': str, 'term': str, 'emp_length': str, 'bad_loans': int} # Load the lending club data self.loans_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_data.csv', dtype=dtype_dict) # List features and targets that we are interested self.features = ['grade', 'term', 'home_ownership', 'emp_length'] self.target = 'safe_loans' # Do a one hot encoding of features for feature in self.features: # One hot encode loans_data_one_hot_encoded = pd.get_dummies(self.loans_data[feature].apply(lambda x: x), prefix=feature, prefix_sep='.') # Drop the feature self.loans_data.drop(feature, axis=1, inplace=True) # Join the feature with the new one encoded features self.loans_data = pd.concat([self.loans_data, loans_data_one_hot_encoded], axis=1) # Update our features self.features = list(self.loans_data.columns.values) self.features.remove('safe_loans') # Load our training and testing data self.train_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_train.csv') self.test_data = pd.read_csv('./unit_tests/test_data/classification/lending_club/lending_club_test.csv') def test_01_greedy_recursive(self): """Tests greedy recursive function for BinaryDecisionTrees class We will use the training data to build a decision tree, and measure the accuracy with some known good values. 
""" # Create a decision tree decision_tree = self.binary_decision_trees.greedy_recursive(self.train_data, self.features, self.target, {"current_depth": 0, "max_depth": 6}) # Get the classification result of the first row classification = self.predict_output.binary_tree(decision_tree, self.test_data.iloc[0]) # Assert that the classification should be -1 self.assertEqual(classification, -1) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(decision_tree, self.test_data, self.target) # Assert that the classification should be 0.3837785437311504 self.assertEqual(round(accuracy, 5), round(0.3837785437311504, 5)) def test_02_greedy_recursive_high_depth_low_feature(self): """Tests greedy recursive function for BinaryDecisionTrees class We will use the training data to build a decision tree, and use high depth. """ # Create a decision tree decision_tree = self.binary_decision_trees.greedy_recursive(self.train_data, ['grade.A', 'grade.B'], self.target, {"current_depth": 0, "max_depth": 1000}) # Get the classification result of the first row classification = self.predict_output.binary_tree(decision_tree, self.test_data.iloc[0]) # Assert that the classification should be -1 self.assertEqual(classification, -1) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(decision_tree, self.test_data, self.target) # Assert that the classification should be 0.38432 self.assertEqual(round(accuracy, 5), round(0.38432, 5)) def test_04_greedy_recursive_high_depth(self): """Tests greedy recursive function for BinaryDecisionTrees class We will use the training data to build a decision tree, and use high depth. 
""" # Create a decision tree decision_tree = self.binary_decision_trees.greedy_recursive(self.train_data, self.features, self.target, {"current_depth": 0, "max_depth": 10000}) # Get the classification result of the first row classification = self.predict_output.binary_tree(decision_tree, self.test_data.iloc[0]) # Assert that the classification should be -1 self.assertEqual(classification, -1) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(decision_tree, self.test_data, self.target) # Assert that the classification should be 0.37732 self.assertEqual(round(accuracy, 5), round(0.37732, 5)) def test_03_greedy_recursive_early_stop(self): """Tests for greedy recursive with early stopping for BinaryDecisionTrees class We will use early stopping for greedy recursive, and measure performance. """ # Create a model with max_depth=6, min_node_size=100, min_error_reduction=0 model_1 = self.binary_decision_trees.greedy_recursive_early_stop(self.train_data, self.features, self.target, {"current_depth": 0, "max_depth": 6, "min_node_size": 100, "min_error_reduction": 0.0}) # Get the classification result of the first row classification = self.predict_output.binary_tree(model_1, self.test_data.iloc[0]) # Assert that the classification should be -1 self.assertEqual(classification, -1) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(model_1, self.test_data, self.target) # Assert that the classification should be 0.38367083153813014 self.assertEqual(round(accuracy, 5), round(0.38367083153813014, 5)) # Create a model with max_depth=6, min_node_size=0, min_error_reduction=-1 model_2 = self.binary_decision_trees.greedy_recursive_early_stop(self.train_data, self.features, self.target, {"current_depth": 0, "max_depth": 6, "min_node_size": 0, "min_error_reduction": -1}) # Get the classification result of the first row classification = self.predict_output.binary_tree(model_2, self.test_data.iloc[0]) # Assert that the classification 
should be -1 self.assertEqual(classification, -1) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(model_2, self.test_data, self.target) # Assert that the classification should be 0.3837785437311504 self.assertEqual(round(accuracy, 5), round(0.3837785437311504, 5)) def test_04_greedy_recursive_early_stop_high_depth(self): """Tests for greedy recursive with early stopping for BinaryDecisionTrees class We will use early stopping for greedy recursive, and measure performance. """ # Create a model with max_depth=5000, min_node_size=0, min_error_reduction=0 model_1 = self.binary_decision_trees.greedy_recursive_early_stop(self.train_data, ['grade.A', 'grade.B'], self.target, {"current_depth": 0, "max_depth": 5000, "min_node_size": 0, "min_error_reduction": 0.0}) # Get the classification result of the first row classification = self.predict_output.binary_tree(model_1, self.test_data.iloc[0]) # Assert that the classification should be -1 self.assertEqual(classification, -1) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(model_1, self.test_data, self.target) # Assert that the classification should be 0.38432 self.assertEqual(round(accuracy, 5), round(0.38432, 5)) def test_05_greedy_recursive_early_stop_high_depth(self): """Tests for greedy recursive with early stopping for BinaryDecisionTrees class We will use early stopping for greedy recursive, and measure performance. """ # Create a model with max_depth=5000, min_node_size=0, min_error_reduction=0 model_1 = self.binary_decision_trees.greedy_recursive_early_stop(self.train_data, ['grade.A', 'grade.B', 'grade.C', 'grade.D', 'grade.E', 'grade.F', 'grade.G', 'term. 
36 months'], self.target, {"current_depth": 0, "max_depth": 5000, "min_node_size": 0, "min_error_reduction": -50000}) # Get the classification result of the first row classification = self.predict_output.binary_tree(model_1, self.test_data.iloc[0]) # Assert that the classification should be -1 self.assertEqual(classification, -1) # Compute the accuracy of the decision tree accuracy = self.error.binary_tree(model_1, self.test_data, self.target) # Assert that the classification should be 0.38162 self.assertEqual(round(accuracy, 5), round(0.38162, 5))
class TestLinearRegression(unittest.TestCase):
    """Test for LinearRegression.

    Uses housing data to test LinearRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestLinearRegression.

        Creates the helper objects used by the tests and loads the full, training, and testing
        housing data into pandas frames.

        """
        self.convert_numpy = ConvertNumpy()
        self.linear_regression = LinearRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()

        # Explicit column types so that pandas does not guess them while parsing the CSV files
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float,
                      'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float,
                      'floors': str, 'condition': int, 'lat': float, 'date': str,
                      'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int}

        # Frame that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv',
                                    dtype=dtype_dict)

        # Frame that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                          dtype=dtype_dict)

        # Frame that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                         dtype=dtype_dict)

    def test_01_gradient_descent(self):
        """Test gradient descent.

        Tests gradient descent and compare it to known values.

        """
        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will use price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(feature_matrix, output,
                                                                {"initial_weights": initial_weights,
                                                                 "step_size": step_size,
                                                                 "tolerance": tolerance})

        # Assert that the weights are correct.
        # assertEqual is used because the assertEquals alias no longer exists in Python 3.12+.
        self.assertEqual(round(-46999.887165546708, 3), round(final_weights[0], 3))
        self.assertEqual(round(281.91211917520917, 3), round(final_weights[1], 3))

    def test_02_gradient_descent_multiple(self):
        """Tests gradient descent on multiple features.

        Computes gradient descent on multiple input, and computes predicted model and RSS.

        """
        # We will use sqft_living, and sqft_living15
        features = ['sqft_living', 'sqft_living15']

        # Output will be price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-100000., 1., 1.])

        # Step size
        step_size = 4e-12

        # Tolerance
        tolerance = 1e9

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(feature_matrix, output,
                                                                {"initial_weights": initial_weights,
                                                                 "step_size": step_size,
                                                                 "tolerance": tolerance})

        # We will use sqft_living, and sqft_living15
        test_features = ['sqft_living', 'sqft_living15']

        # Output will be price
        test_output = ['price']

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, test_features,
                                                                               test_output, 1)

        # Predict the output of test features
        predicted_output = self.predict_output.regression(test_feature_matrix, final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output)

        # Assert that rss is correct (compared after rounding to the nearest thousand)
        self.assertEqual(round(270263443629803.41, -3), round(rss, -3))

    def test_03_gradient_ascent(self):
        """Test gradient ascent.

        Test gradient ascent and compare it to known values.

        """
        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will be price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our hill climbing value
        final_weights = self.linear_regression.gradient_ascent(feature_matrix, output,
                                                               {"initial_weights": initial_weights,
                                                                "step_size": step_size,
                                                                "tolerance": tolerance})

        # Assert that the weights are correct
        self.assertEqual(round(-47000.142201335177, 3), round(final_weights[0], 3))
        self.assertEqual(round(-352.86068692252599, 3), round(final_weights[1], 3))
class TestLogisticRegression(unittest.TestCase):
    """Tests for LogisticRegression class.

    Uses Amazon data to test logistic regression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestLogisticRegression.

        Loads Amazon data, cleans the review text, adds one word-count column per important word,
        and loads the training data.

        """
        self.convert_numpy = ConvertNumpy()
        self.log_likelhood = LogLikelihood()
        self.predict_output = PredictOutput()
        self.logistic_regression = LogisticRegression()
        self.confusion_matrix = ConfusionMatrix()

        # Load the important words; the context manager makes sure the file handle is closed
        with open('./unit_tests/test_data/classification/amazon/important_words.json', 'r') as words_file:
            self.important_words = json.load(words_file)

        # Load the amazon baby subset
        self.review_frame = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset.csv')

        # Review needs to be text. astype returns a new Series, so the result must be assigned back
        # for the conversion to take effect.
        self.review_frame['review'] = self.review_frame['review'].astype(str)

        # Clean up the punctuations. The translation table does not depend on the row, so it is
        # built once instead of once per review.
        punctuation_table = str.maketrans({key: None for key in string.punctuation})
        self.review_frame['review_clean'] = self.review_frame['review'].apply(
            lambda text: str(text).translate(punctuation_table))

        # Remove any nan text (missing reviews become the literal string "nan" once stringified)
        self.review_frame['review_clean'] = self.review_frame['review_clean'].apply(
            lambda text: '' if text == "nan" else text)

        # Count the number of times each important word appears in each review, and make an
        # independent column per word. w=word binds the current word to the lambda.
        for word in self.important_words:
            self.review_frame[word] = self.review_frame['review_clean'].apply(lambda s, w=word: s.split().count(w))

        # Load training data
        self.train_frame = pd.read_csv('./unit_tests/test_data/classification/amazon/amazon_baby_subset_train_mod2.csv')

    def _assert_coefficients_match(self, computed, expected, places=5):
        """Assert that two coefficient sequences agree element-wise after rounding.

        Args:
            computed: Coefficients produced by the code under test.
            expected: Known reference coefficients.
            places (int): Number of decimal places both values are rounded to before comparing.

        Raises:
            AssertionError: If the lengths differ or any pair of rounded values differs.

        """
        # zip stops at the shorter input, so a truncated result would otherwise pass unnoticed
        self.assertEqual(len(computed), len(expected))

        # Loop through each value, the coefficients must be the same
        for pred_coef, coef in zip(computed, expected):
            self.assertEqual(round(pred_coef, places), round(coef, places))

    def test_01_gradient_ascent(self):
        """Test gradient ascent algorithm.

        Tests the gradient ascent algorithm and compare it with known values.

        """
        # We will use important words for the features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.review_frame, features, output, 1)

        # Compute the coefficients
        coefficients = self.logistic_regression.gradient_ascent(feature_matrix, sentiment,
                                                                {"initial_coefficients": np.zeros(194),
                                                                 "step_size": 1e-7, "max_iter": 301})

        # Real coefficients that we need to compare with the computed coefficients
        real_coef = [5.16220157e-03, 1.55656966e-02, -8.50204675e-03, 6.65460842e-02,
                     6.58907629e-02, 5.01743882e-03, -5.38601484e-02, -3.50488413e-03,
                     6.47945868e-02, 4.54356263e-02, 3.98353364e-03, 2.00775410e-02,
                     3.01350011e-02, -2.87115530e-02, 1.52161964e-02, 2.72592062e-04,
                     1.19448177e-02, -1.82461935e-02, -1.21706420e-02, -4.15110334e-02,
                     2.76820391e-03, 1.77031999e-02, -4.39700067e-03, 4.49764014e-02,
                     9.90916464e-03, 8.99239081e-04, -1.36219516e-03, 1.26859357e-02,
                     8.26466695e-03, -2.77426972e-02, 6.10128809e-04, 1.54084501e-02,
                     -1.32134753e-02, -3.00512492e-02, 2.97399371e-02, 1.84087080e-02,
                     2.86178752e-03, -1.05768015e-02, -6.57350362e-04, -1.01476555e-02,
                     -4.79579528e-03, 7.50891810e-03, 4.27938289e-03, 3.06785501e-03,
                     -2.20317661e-03, 9.57273354e-03, 9.91666827e-05, -1.98462567e-02,
                     1.75702722e-02, 1.55478612e-03, -1.77375440e-02, 9.78324102e-03,
                     1.17031606e-02, -7.35345937e-03, -6.08714030e-03, 6.43766808e-03,
                     1.07159665e-02, -3.05345476e-03, 7.17190727e-03, 5.73320003e-03,
                     4.60661876e-03, -5.20588421e-03, 6.71012331e-03, 9.03281814e-03,
                     1.74563147e-03, 6.00279979e-03, 1.20181744e-02, -1.83594607e-02,
                     -6.91010811e-03, -1.38687273e-02, -1.50406590e-02, 5.92353611e-03,
                     5.67478991e-03, -5.28786220e-03, 3.08147864e-03, 5.53751236e-03,
                     1.49917916e-02, -3.35666000e-04, -3.30695153e-02, -4.78990943e-03,
                     -6.41368859e-03, 7.99938935e-03, -8.61390444e-04, 1.68052959e-02,
                     1.32539901e-02, 1.72307051e-03, 2.98030675e-03, 8.58284300e-03,
                     1.17082481e-02, 2.80825907e-03, 2.18724016e-03, 1.68824711e-02,
                     -4.65973741e-03, 1.51368285e-03, -1.09509122e-02, 9.17842898e-03,
                     -1.88572281e-04, -3.89820373e-02, -2.44821005e-02, -1.87023714e-02,
                     -2.13943485e-02, -1.29690465e-02, -1.71378670e-02, -1.37566767e-02,
                     -1.49770449e-02, -5.10287978e-03, -2.89789761e-02, -1.48663194e-02,
                     -1.28088380e-02, -1.07709355e-02, -6.95286915e-03, -5.04082164e-03,
                     -9.25914404e-03, -2.40427481e-02, -2.65927785e-02, -1.97320937e-03,
                     -5.04127508e-03, -7.00791912e-03, -3.48088523e-03, -6.40958916e-03,
                     -4.07497010e-03, -6.30054296e-03, -1.09187932e-02, -1.26051900e-02,
                     -1.66895314e-03, -7.76418781e-03, -5.15960485e-04, -1.94199551e-03,
                     -1.24761586e-03, -5.01291731e-03, -9.12049191e-03, -7.22098801e-03,
                     -8.31782981e-03, -5.60573348e-03, -1.47098335e-02, -9.31520819e-03,
                     -2.22034402e-03, -7.07573098e-03, -5.10115608e-03, -8.93572862e-03,
                     -1.27545713e-02, -7.04171991e-03, -9.76219676e-04, 4.12091713e-04,
                     8.29251160e-04, 2.64661064e-03, -7.73228782e-03, 1.53471164e-03,
                     -7.37263060e-03, -3.73694386e-03, -3.81416409e-03, -1.64575145e-03,
                     -3.31887732e-03, 1.22257832e-03, 1.36699286e-05, -3.01866601e-03,
                     -1.02826343e-02, -1.06691327e-02, 2.23639046e-03, -9.87424798e-03,
                     -1.02192048e-02, -3.41330929e-03, 3.34489960e-03, -3.50984516e-03,
                     -6.26283150e-03, -7.22419943e-03, -5.47016154e-03, -1.25063947e-02,
                     -2.47805699e-03, -1.60017985e-02, -6.40098934e-03, -4.26644386e-03,
                     -1.55376990e-02, 2.31349237e-03, -9.06653337e-03, -6.30012672e-03,
                     -1.21010303e-02, -3.02578875e-03, -6.76289718e-03, -5.65498722e-03,
                     -6.87050239e-03, -1.18950595e-02, -1.86489236e-04, -1.15230476e-02,
                     2.81533219e-03, -8.10150295e-03, -1.00062131e-02, 4.02037651e-03,
                     -5.44300346e-03, 2.85818985e-03, 1.19885003e-04, -6.47587687e-03,
                     -1.14493516e-03, -7.09205934e-03]

        # The computed coefficients must match the reference values
        self._assert_coefficients_match(coefficients, real_coef)

        # Get the output of the logistic regression with threshold 0
        output = self.predict_output.logistic_regression(feature_matrix, coefficients, 0)

        # Generate a confusion matrix
        confusion_matrix = self.confusion_matrix.confusion_matrix(sentiment, output)

        # Assert the values are to be expected
        self.assertEqual(confusion_matrix, {'false_negatives': 7311, 'true_negatives': 20635,
                                            'true_positives': 19268, 'false_positives': 5858})

        # Assert that the precision is correct
        self.assertEqual(round(self.confusion_matrix.precision(sentiment, output), 5),
                         round(0.7249332179540239, 5))

        # Assert that the recall is correct
        self.assertEqual(round(self.confusion_matrix.recall(sentiment, output), 5),
                         round(0.7668550505452519, 5))

    def test_02_stochastic_gradient_ascent(self):
        """Test stochastic gradient ascent for logistic regression.

        Tests stochastic gradient ascent with a batch size of 1 and 10 iterations, and compares
        the result with some known values.

        """
        # We will use important words for the features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.train_frame, features, output, 1)

        # Compute the coefficients
        coefficients = self.logistic_regression.stochastic_gradient_ascent(feature_matrix, sentiment,
                                                                           {"initial_coefficients": np.zeros(194),
                                                                            "step_size": 5e-1, "batch_size": 1,
                                                                            "max_iter": 10})

        # Real coefficients that we need to compare with the computed coefficients
        real_coef = [0.26845909, 0.05510662, -0.78232359, 0.24929641, 0.1213813, -0.13194118,
                     -0.42110769, 0.23944013, 0.52334226, 0.30746343, 1.46697311, 0.15734639,
                     0.24112255, -0.22849175, -0.48095714, 0., 0.05984944, -0.41942527,
                     -0.48095714, 0.10654088, 0., 0.06153186, -0.41942527, 0.43843464,
                     0., 0.21719583, 0., 0.84326475, 0.28108825, 0.28108825,
                     0., 0., 0.24611428, -0.19986888, 0.15734639, 0.,
                     0., -0.48095714, 0.12623269, 0., 0.28108825, 0.07542718,
                     0., -0.42110769, 0.15734639, -0.48095714, 0.24611428, -0.48095714,
                     0., 0., 0.06153186, 0.28108825, 0.,
                     0., 0., 0.05984944, 0.5932902, 0.5621765, -0.48095714,
                     0., 0.05984944, 0.05984944, 0.31220195, 0.11805882, 0.,
                     0.15085436, 0.24611428, 0., 0., 0., 0.06153186,
                     0.12623269, 0., 0., 0., 0., 0.,
                     0., -0.35472444, 0.12623269, 0., 0., 0.68023532,
                     0.28108825, 0.06153186, 0.0311137, 0.35651543, 0., 0.28108825,
                     0., 0.05984944, 0., 0.35651543, 0.28108825, 0.,
                     0., 0., -0.90206483, 0.07542718, -0.48095714, 0.,
                     0., -0.48095714, 0., 0., 0., -0.25,
                     0.0311137, 0., 0.28108825, 0., 0., 0.,
                     0., 0., 0., 0.34262011, -0.48095714, 0.28108825,
                     0., 0., 0., 0., 0., 0.06153186,
                     0.12623269, 0.05984944, 0., 0., 0., 0.,
                     0.12623269, 0., 0., 0.12623269, 0.07542718, 0.15085436,
                     0.07542718, -0.68082602, 0., 0., 0., 0.05984944,
                     0., 0., 0.28108825, 0., -0.25, 0.,
                     0., 0.07542718, 0., 0., 0.28108825, 0.,
                     0., 0., 0., 0., 0., 0.06153186,
                     0.0311137, 0., -0.48095714, 0., 0., 0.,
                     0., 0., 0., 0., 0.40732094, 0.,
                     0., 0.05984944, 0., 0., 0., 0.,
                     0., 0., 0., 0.06153186, 0., 0.06153186,
                     0., -0.25, 0.05984944, 0., 0., 0.,
                     0., -0.96191427, 0.]

        # The computed coefficients must match the reference values
        self._assert_coefficients_match(coefficients, real_coef)

        # Get the output of the logistic regression with threshold 0
        output = self.predict_output.logistic_regression(feature_matrix, coefficients, 0)

        # Generate a confusion matrix
        confusion_matrix = self.confusion_matrix.confusion_matrix(sentiment, output)

        # Assert the values are to be expected
        self.assertEqual(confusion_matrix, {'false_negatives': 6517, 'true_negatives': 11707,
                                            'true_positives': 17331, 'false_positives': 12225})

        # Assert that the precision is correct
        self.assertEqual(round(self.confusion_matrix.precision(sentiment, output), 5),
                         round(0.72673, 5))

        # Assert that the recall is correct
        self.assertEqual(round(self.confusion_matrix.recall(sentiment, output), 5),
                         round(0.58638, 5))

    # NOTE(review): the "02" prefix duplicates the previous test's number and "03" is unused;
    # presumably this was meant to be test_03 -- the name is kept so existing selectors still work.
    def test_02_stochastic_gradient_ascent_high_iteration(self):
        """Test stochastic gradient ascent for logistic regression with many iterations.

        Tests stochastic gradient ascent with a batch size of 1000 and 1000 iterations, and
        compares the result with some known values.

        """
        # We will use important words for the features
        features = self.important_words

        # Output will use sentiment
        output = ['sentiment']

        # Convert our pandas frame to numpy
        feature_matrix, sentiment = self.convert_numpy.convert_to_numpy(self.train_frame, features, output, 1)

        # Compute the coefficients
        coefficients = self.logistic_regression.stochastic_gradient_ascent(feature_matrix, sentiment,
                                                                           {"initial_coefficients": np.zeros(194),
                                                                            "step_size": 5e-1, "batch_size": 1000,
                                                                            "max_iter": 1000})

        # Real coefficients that we need to compare with the computed coefficients
        real_coef = [-0.06659918, 0.07516305, 0.02337901, 0.91476437, 1.25935729, -0.01093744,
                     -0.29808423, 0.00724611, 1.14319635, 0.58421811, -0.10388794, 0.25341405,
                     0.51935047, -0.16643157, 0.1581433, -0.01678466, 0.11023426, -0.07801531,
                     -0.11943521, -0.23901842, 0.19961916, 0.26962603, 0.00726172, 1.58116946,
                     -0.04749877, -0.01222728, -0.12452547, 0.2408741, 0.23996495, -0.27318487,
                     0.16391931, 0.46141695, -0.00520781, -0.41720674, 1.3914436, 0.59286041,
                     -0.01877455, -0.1177062, 0.04522629, -0.05050944, -0.1872891, 0.1119123,
                     0.05552736, 0.018883, -0.28821684, 0.35454167, 0.09146771, -0.15185966,
                     0.45980111, 0.13696004, -0.27719711, 0.37826182, 0.51482099, -0.12707594,
                     -0.08043197, 0.27088589, 0.20836676, -0.22217221, 0.34308818, 0.05011724,
                     0.01336183, -0.00422257, 0.25914879, 0.18971367, 0.11804381, 0.06478439,
                     0.13413068, -0.35940054, -0.04225724, -0.23574987, -0.26178573, 0.37077618,
                     0.266064, 0.0552738, 0.25274691, 0.15248314, 0.9721445, 0.03951392,
                     -0.59577998, -0.09680726, -0.13168621, 0.42806047, 0.03576358, 1.03088019,
                     0.52916025, -0.09516351, 0.23544152, 0.31386904, 0.50647271, 0.25383116,
                     0.1369185, 0.93673001, -0.06280486, 0.1670564, -0.20573152, 0.2201837,
                     0.12892914, -0.9711816, -0.24387714, -0.3566874, -0.65956699, -0.28473646,
                     -0.34083222, -0.44708957, -0.29828401, -0.52797307, -1.92693359, -0.33116364,
                     -0.43025271, -0.21284617, 0.16375567, -0.0299845, -0.30294927, -1.25019619,
                     -1.55092776,
                     -0.09266983, -0.08014312, -0.07565967, -0.00950432, 0.00327247, 0.03190358,
                     -0.04247063, -0.28205865, -0.45678176, 0.06141561, -0.2690871, -0.05979329,
                     -0.0019354, -0.01279985, 0.05323391, -0.35513613, -0.26639425, -0.41094467,
                     -0.14117863, -0.90001241, -0.33279773, 0.01621988, -0.08709595, -0.10450457,
                     -0.12567406, -0.61727551, -0.18663497, 0.17636203, 0.09316913, -0.06829369,
                     0.1880183, -0.5078543, 0.03964466, -0.26089197, -0.07480237, -0.05556211,
                     -0.1450303, -0.04780934, 0.08911386, -0.15163772, 0.06213261, -0.34512242,
                     -0.33522342, 0.06580618, -0.44499204, -0.68623426, -0.12564489, 0.2609755,
                     0.09998045, -0.25098629, -0.29549973, -0.15944276, -0.47408765, -0.03058168,
                     -1.42253269, -0.49855378, 0.05835175, -1.17789127, -0.08226967, -0.56793665,
                     -0.35814271, -0.98559717, -0.16918106, -0.12477773, -0.23457722, -0.13170106,
                     -0.64351485, -0.01773532, -0.2686544, 0.047442, -0.34218929, -0.48340895,
                     0.37866335, -0.25162177, 0.05277577, 0.01545386, -0.26267815, -0.09903819,
                     -0.54500151]

        # The computed coefficients must match the reference values
        self._assert_coefficients_match(coefficients, real_coef)

        # Get the output of the logistic regression with threshold 0
        output = self.predict_output.logistic_regression(feature_matrix, coefficients, 0)

        # Generate a confusion matrix
        confusion_matrix = self.confusion_matrix.confusion_matrix(sentiment, output)

        # Assert the values are to be expected
        self.assertEqual(confusion_matrix, {'false_negatives': 5018, 'true_negatives': 18995,
                                            'true_positives': 18830, 'false_positives': 4937})

        # Assert that the precision is correct
        self.assertEqual(round(self.confusion_matrix.precision(sentiment, output), 5),
                         round(0.78958, 5))

        # Assert that the recall is correct
        self.assertEqual(round(self.confusion_matrix.recall(sentiment, output), 5),
                         round(0.79228, 5))

    def test_04_log_likelihood(self):
        """Test log likelihood.

        Test the log likelihood algorithm, and compare it with some known values.

        """
        # Generate test feature, coefficients, and label
        feature_matrix = np.array([[1., 2., 3.], [1., -1., -1]])
        coefficients = np.array([1., 3., -1.])
        label = np.array([-1, 1])

        # Compute the log likelihood
        lg = self.log_likelhood.log_likelihood(feature_matrix, label, coefficients)

        # Assert the value
        self.assertEqual(round(lg, 5), round(-5.33141161544, 5))

    def test_05_average_log_likelihood(self):
        """Test average log likelihood.

        Test the average log likelihood algorithm, and compare it with some known values.

        """
        # Generate test feature, coefficients, and label
        feature_matrix = np.array([[1., 2., 3.], [1., -1., -1]])
        coefficients = np.array([1., 3., -1.])
        label = np.array([-1, 1])

        # Compute the average log likelihood
        lg = self.log_likelhood.average_log_likelihood(feature_matrix, label, coefficients)

        # Assert the value
        self.assertEqual(round(lg, 5), round(-2.6657099999999998, 5))