def forecasting_models_example():
    """
    Build 3 models and evaluate the performance of model trees, regression
    trees and standard linear regression.

    Every model is fitted on 'data/bikeSpeedVsIq_train.txt' and scored on
    'data/bikeSpeedVsIq_test.txt'. The value logged as "accuracy" is the
    correlation coefficient between the forecast and column 1 of the test
    matrix; column 0 of the test matrix is used as the model input.
    """
    train_file = 'data/bikeSpeedVsIq_train.txt'
    test_file = 'data/bikeSpeedVsIq_test.txt'
    training_matrix = np.mat(utils.load_tsv_into_array(train_file))
    test_matrix = np.mat(utils.load_tsv_into_array(test_file))
    test_x = test_matrix[:, 0]
    test_y = test_matrix[:, 1]

    # regression tree
    # NOTE(review): the message below says "training accuracy" although the
    # score is computed against the test matrix; text kept as-is for anyone
    # parsing the log output.
    tree = regression_trees.create_tree(training_matrix, ops=(1, 20))
    y_hat = regression_trees.create_forecast(tree, test_x)
    accuracy = np.corrcoef(y_hat, test_y, rowvar=0)[0, 1]
    logging.info("training accuracy = {0}".format(accuracy))

    # model tree
    tree = regression_trees.create_tree(training_matrix,
                                        regression_trees.model_leaf,
                                        regression_trees.model_error,
                                        (1, 20))
    y_hat = regression_trees.create_forecast(
        tree, test_x, regression_trees.model_tree_evaluation)
    accuracy = np.corrcoef(y_hat, test_y, rowvar=0)[0, 1]
    logging.info("model tree accuracy = {0}".format(accuracy))

    # standard linear regression: weights[0, 0] is used as the intercept and
    # weights[1, 0] as the slope. The forecast is built as a fresh array in
    # one vectorized step rather than overwriting the model tree's y_hat
    # element by element.
    weights, _, _ = regression_trees.linearly_solve(training_matrix)
    y_hat = test_x * weights[1, 0] + weights[0, 0]
    accuracy = np.corrcoef(y_hat, test_y, rowvar=0)[0, 1]
    logging.info("regression accuracy = {0}".format(accuracy))
def pruning_example():
    """Log the tree from more_complex_tree(), then prune it and log it again.

    The matrix handed to regression_trees.prune is loaded from
    'data/ex2test.txt'.
    """
    unpruned = more_complex_tree()
    log_formatted_tree(unpruned)

    pruning_matrix = np.mat(utils.load_tsv_into_array('data/ex2test.txt'))
    pruned_tree = regression_trees.prune(unpruned, pruning_matrix)
    log_formatted_tree(pruned_tree, 'pruned tree')
def main():
    """Run the k-means walkthrough on 'data/k_means_test_set.txt'.

    Logs, in order: a random centroid result for k=2, the Euclidean distance
    between the first two rows of the data, and the centroids returned by
    k_means for k=4. Finishes by calling bisecting_k_means().
    """
    points = np.mat(utils.load_tsv_into_array('data/k_means_test_set.txt'))

    random_cent = k_means.random_centroid(points, 2)
    logging.info("random centroid = {rand_cent}".format(rand_cent=random_cent))

    distance = k_means.euclidean_distance(points[0], points[1])
    logging.info("Euclidean distance = {euc}".format(euc=distance))

    # Only the centroids are reported; the per-point assignment is discarded.
    found_centroids, _ = k_means.k_means(points, 4)
    logging.info("centroids = {cent}".format(cent=found_centroids))

    bisecting_k_means()
def test_k_means(self):
    """k_means - k means should build clusters

    Runs k_means with k=4 on 'data/test_set_3.txt' and checks the structure
    of the returned cluster assignment against a reference matrix.
    """
    data_matrix = np.mat(utils.load_tsv_into_array('data/test_set_3.txt'))
    _centroids, cluster_assignment = k_means.k_means(data_matrix, 4)
    # Reference assignment: column 0 holds cluster labels (0..3), column 1 a
    # non-negative value per point.
    expected = np.mat(np.array([
        [1., 0.45675494], [0., 0.3032197], [3., 1.74481454],
        [1., 0.80407696], [0., 1.02508049], [3., 2.59648559],
        [1., 0.42859499], [0., 0.0305198], [3., 2.37924609],
        [2., 0.], [0., 5.38984416], [3., 0.04519236],
        [1., 1.23757291], [0., 0.01298907], [3., 3.28350116],
        [1., 2.33205513], [0., 3.72839989], [3., 0.1398885],
        [1., 0.03288099], [0., 0.4038706], [3., 1.00363352],
        [1., 1.16346981], [0., 0.93928783], [3., 0.02261741],
        [1., 3.42458409], [0., 5.92927609], [3., 0.98873759],
        [1., 1.83018987], [0., 0.91125974], [3., 1.28677032]]))

    # Comparing `.any()` of the two matrices only compares two booleans and
    # passes for almost any output, so the result's structure is checked
    # explicitly instead.
    # NOTE(review): exact values are deliberately not compared because
    # k_means may start from random centroids - confirm; if the run is
    # deterministic, tighten this to np.testing.assert_allclose.
    self.assertTrue(cluster_assignment.any())
    self.assertEqual(np.shape(cluster_assignment), np.shape(expected))

    allowed_labels = set(np.asarray(expected)[:, 0].tolist())
    produced_labels = set(np.asarray(cluster_assignment)[:, 0].tolist())
    self.assertTrue(
        produced_labels.issubset(allowed_labels),
        "unexpected cluster labels: {0}".format(
            sorted(produced_labels - allowed_labels)))
def test_k_means(self):
    """k_means - k means should build clusters

    Runs k_means with k=4 on 'data/test_set_3.txt' and checks the structure
    of the returned cluster assignment against a reference matrix.
    """
    data_matrix = np.mat(utils.load_tsv_into_array('data/test_set_3.txt'))
    _centroids, cluster_assignment = k_means.k_means(data_matrix, 4)
    # Reference assignment: column 0 holds cluster labels (0..3), column 1 a
    # non-negative value per point.
    expected = np.mat(np.array([
        [1., 0.45675494], [0., 0.3032197], [3., 1.74481454],
        [1., 0.80407696], [0., 1.02508049], [3., 2.59648559],
        [1., 0.42859499], [0., 0.0305198], [3., 2.37924609],
        [2., 0.], [0., 5.38984416], [3., 0.04519236],
        [1., 1.23757291], [0., 0.01298907], [3., 3.28350116],
        [1., 2.33205513], [0., 3.72839989], [3., 0.1398885],
        [1., 0.03288099], [0., 0.4038706], [3., 1.00363352],
        [1., 1.16346981], [0., 0.93928783], [3., 0.02261741],
        [1., 3.42458409], [0., 5.92927609], [3., 0.98873759],
        [1., 1.83018987], [0., 0.91125974], [3., 1.28677032]]))

    # Comparing `.any()` of the two matrices only compares two booleans and
    # passes for almost any output, so the result's structure is checked
    # explicitly instead.
    # NOTE(review): exact values are deliberately not compared because
    # k_means may start from random centroids - confirm; if the run is
    # deterministic, tighten this to np.testing.assert_allclose.
    self.assertTrue(cluster_assignment.any())
    self.assertEqual(np.shape(cluster_assignment), np.shape(expected))

    allowed_labels = set(np.asarray(expected)[:, 0].tolist())
    produced_labels = set(np.asarray(cluster_assignment)[:, 0].tolist())
    self.assertTrue(
        produced_labels.issubset(allowed_labels),
        "unexpected cluster labels: {0}".format(
            sorted(produced_labels - allowed_labels)))
def piecewise_linear_solve_example():
    """Build a tree from 'data/exp2.txt' and log it as 'model tree'.

    The tree is created with regression_trees.model_leaf and
    regression_trees.model_error as the leaf and error functions.
    """
    samples = np.mat(utils.load_tsv_into_array('data/exp2.txt'))
    fitted_tree = regression_trees.create_tree(
        samples,
        regression_trees.model_leaf,
        regression_trees.model_error,
    )
    log_formatted_tree(fitted_tree, 'model tree')
def more_complex_tree():
    """Return the tree created from 'data/ex0.txt' with ops=(0, 1)."""
    samples = np.mat(utils.load_tsv_into_array('data/ex0.txt'))
    return regression_trees.create_tree(samples, ops=(0, 1))
def very_simple_tree():
    """Create a tree from 'data/ex00.txt' with default options and log it."""
    samples = np.mat(utils.load_tsv_into_array('data/ex00.txt'))
    built_tree = regression_trees.create_tree(samples)
    log_formatted_tree(built_tree, "the tree")
def bisecting_k_means():
    """Run bisect_k_means with k=3 on 'data/test_set_2.txt'.

    Returns:
        The two values produced by k_means.bisect_k_means, as a tuple
        (centroid list, assessments).
    """
    raw_rows = utils.load_tsv_into_array('data/test_set_2.txt')
    points = np.mat(raw_rows)
    centers, point_assessments = k_means.bisect_k_means(points, 3)
    return centers, point_assessments