def trainAndTest():
    """Cross-validate a k-means predictor corrected by a boosted regressor.

    Loads 'data/train_updated.csv', featurizes every row (all columns but
    the last are inputs, the last column is the target), shuffles the
    examples and, for every second split index in range(0, NUM_SPLITS, 2),
    holds that split out, fits the models on the remaining examples and
    prints the held-out error reported by util.evaluatePredictor.

    A point's prediction is the value stored for its nearest centroid plus
    the output of a boosted regressor trained on the ``loss_list`` values
    returned by kmeans.
    NOTE(review): loss_list presumably holds each training example's
    residual against its centroid value -- confirm against kmeans().
    """
    train_array = util.csvAsArray('data/train_updated.csv')
    feature_names = util.getCsvHeaders('data/train_updated.csv')

    # Build (feature vector, target) tuples for the training data.
    train_examples = []
    for row in train_array:
        feature_values = [row[j] for j in range(len(row) - 1)]
        feature_vector = featurize(feature_values, feature_names)
        output = row[len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)
    num_examples = len(train_examples)
    for i in range(0, NUM_SPLITS, 2):
        # Floor division keeps the slice bounds integral.
        startTest = i * num_examples // NUM_SPLITS
        endTest = (i + 1) * num_examples // NUM_SPLITS
        currentTrainExamples = (train_examples[:startTest] +
                                train_examples[endTest:])

        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans(currentTrainExamples, NUM_CLUSTERS, 500)

        # Pair every training feature vector with its k-means loss entry so
        # the boosted regressor learns a correction on top of the centroids.
        currentBoostedExamples = [
            (example[0], loss_list[ind])
            for ind, example in enumerate(currentTrainExamples)
        ]
        boostedRegPredictor = learnBoostedRegression(
            currentBoostedExamples, 500, 0.00000000001,
            num_trees=NUM_B_TREES)

        # ||c||^2 is the same for every query point, so compute it once.
        pre_computed_centroid_dots = [
            util.dotProduct(centroids[ind], centroids[ind])
            for ind in range(NUM_CLUSTERS)
        ]

        def kmeanspredictor(x):
            """Return the stored value of the centroid nearest to x."""
            # ||x - c||^2 = x.x - 2 c.x + c.c; x.x does not depend on c.
            x_dot_x = util.dotProduct(x, x)
            assignment = 0
            # Start from infinity: a finite sentinel would leave points
            # that are far from every centroid stuck on cluster 0.
            min_dist = float('inf')
            for j in range(NUM_CLUSTERS):
                cur_dist = (x_dot_x - 2 * util.dotProduct(centroids[j], x) +
                            pre_computed_centroid_dots[j])
                if cur_dist < min_dist:
                    assignment = j
                    min_dist = cur_dist
            return centroid_vals[assignment]

        def boostedKPredictor(x):
            """Centroid value plus the boosted correction for x."""
            return kmeanspredictor(x) + boostedRegPredictor(x)

        # Both closures are only called here, within the iteration that
        # defined them, so they see this fold's centroids and regressor.
        validation_error = util.evaluatePredictor(
            boostedKPredictor, train_examples[startTest:endTest])
        print("leaving out the %s th segment of the data, the validation "
              "error for the regression is: %s" % (i + 1, validation_error))
def trainAndTest():
    """Trains neighborhood clustering and prints its results.

    Reads 'data/neighborhood_data_final_w_loc.csv', uses column 1 of every
    row as the neighborhood name and columns 2 onward as its features, then
    runs k-means for K = STEP_SIZE, 2 * STEP_SIZE, ... up to
    MAX_NUM_CLUSTERS. The result of every run is pickled to
    'neighborhood_centroids/neighborhood_centroids<K>.p' and the names are
    printed next to their assignments.

    The 'neighborhood_centroids' directory must already exist.
    """
    train_array = util.csvAsArray('data/neighborhood_data_final_w_loc.csv')
    feature_names = util.getCsvHeaders(
        'data/neighborhood_data_final_w_loc.csv')

    # Unlike the supervised scripts there is no target column here: the
    # examples are bare feature vectors.
    train_examples = []
    names = []
    for row in train_array:
        feature_values = [row[j] for j in range(2, len(row))]
        # The header list is shifted by one relative to the row columns.
        # NOTE(review): presumably the headers omit the leading column --
        # confirm against util.getCsvHeaders.
        train_examples.append(
            util.featurize(feature_values, feature_names[1:]))
        names.append(row[1])

    for i in range(STEP_SIZE, MAX_NUM_CLUSTERS + 1, STEP_SIZE):
        (centroids, assign, loss, loss_list, centroid_vals) = kmeans(
            full_examples=train_examples, K=i, maxIters=500)

        filename = "neighborhood_centroids" + str(i) + ".p"
        # Use a context manager so the pickle file is flushed and closed
        # even if dumping fails part-way.
        with open(os.path.join("neighborhood_centroids", filename),
                  "wb") as out_file:
            pickle.dump(
                (centroids, assign, loss, loss_list, centroid_vals, names),
                out_file)

        print("%s %s" % (names, assign))
def trainAndEvaluate():
    """Trains a baseline predictor and prints its mean squared error.

    Reads 'data/train.csv', turns each row into an (input, output) tuple
    where the input is every column but the last and the output is column
    80 divided by 1000, then fits the baseline and oracle predictors and
    prints the error of both on that same training set.
    """
    train_array = csvAsArray('data/train.csv')

    # Format the training data as a list of (input, output) tuples.
    train_examples = []
    for row in train_array:
        # Materialise the inputs: the examples are handed to learnOracle
        # and then to evaluatePredictor twice, and a generator would be
        # empty after its first consumer. A tuple stays hashable too.
        input_data = tuple(row[j] for j in range(len(row) - 1))
        # NOTE(review): column 80 is hard-coded as the target -- confirm it
        # is the last column of train.csv.
        output = row[80] / 1000.0
        train_examples.append((input_data, output))

    # Define predictor functions for baseline and oracle.
    baseline = learnBaseline(train_array)
    oracle_train = learnOracle(train_examples)

    # Evaluate mean squared error of both predictors on the training data.
    baseline_error = evaluatePredictor(baseline, train_examples)
    oracle_error = evaluatePredictor(oracle_train, train_examples)

    # Print the results.
    print("")
    print("-------------------")
    print("BASELINE AND ORACLE")
    print("-------------------")
    print("Number of examples:  %s" % (len(train_examples),))
    print("Baseline (median) MSE:  %s" % (baseline_error,))
    print("Oracle MSE:  %s" % (oracle_error,))
    print("")
def crossValidate(predictor, num_folds):
    """Performs k-fold cross validation on a specified predictor function
    and prints the results.

    The examples are read from 'data/train_updated.csv', featurized and
    shuffled. Each fold holds out one contiguous slice of
    len(examples) // num_folds examples, trains on everything else and
    prints the held-out error. Examples left over when the data does not
    divide evenly are never used for validation.

    Args:
        predictor (func): A learner called as
            predictor(training_set, 1, 0.01, tuning_parameter); it must
            return the predictor function that is passed to
            evaluatePredictor.
        num_folds (int): Number of data folds for cross-validation.
    """
    train_array = csvAsArray('data/train_updated.csv')
    feature_names = getCsvHeaders('data/train_updated.csv')

    # Convert the training array into ([features], value) example tuples.
    train_examples = []
    for row in train_array:
        feature_values = [row[j] for j in range(len(row) - 1)]
        feature_vector = featurize(feature_values, feature_names)
        output = row[len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    # Randomize the order of the example tuples to aid validation.
    random.shuffle(train_examples)

    # Floor division keeps the fold size usable as a slice bound.
    validation_set_size = len(train_examples) // num_folds
    for fold in range(num_folds):
        # Create training and validation sets.
        validation_start = fold * validation_set_size
        validation_end = validation_start + validation_set_size
        validation_set = train_examples[validation_start:validation_end]
        training_set = (train_examples[:validation_start] +
                        train_examples[validation_end:])

        # Train on this fold's training set only, so the held-out slice
        # stays unseen, then score the model on the held-out slice.
        tuning_parameter = 1
        predictor_fn = predictor(training_set, 1, 0.01, tuning_parameter)
        regression_error = evaluatePredictor(predictor_fn, validation_set)

        # Print the results.
        print("")
        print("----------")
        print("REGRESSION")
        print("----------")
        print("Lambda:  %s" % (tuning_parameter,))
        print("Number of examples:  %s" % (len(train_examples),))
        print("Regression MSE:  %s" % (regression_error,))
        print("")
def trainAndEvaluate():
    """Trains a linear regression predictor and prints its mean squared
    error.

    Reads 'data/train_updated.csv', featurizes every row (all columns but
    the last are inputs, the last column is the target), shuffles the
    examples and holds out the first tenth as a test set. A regression
    model is then trained for each tuning parameter in 0.5, 1.0, 1.5 and
    2.0, and its error on the held-out tenth is printed.
    """
    train_array = csvAsArray('data/train_updated.csv')
    feature_names = getCsvHeaders('data/train_updated.csv')

    # Build (feature vector, value) tuples for the training data.
    train_examples = []
    for row in train_array:
        feature_values = [row[j] for j in range(len(row) - 1)]
        feature_vector = featurize(feature_values, feature_names)
        output = row[len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)
    # Floor division keeps the split point usable as a slice bound.
    split = len(train_examples) // 10
    test = train_examples[:split]
    train = train_examples[split:]

    # Sweep the tuning parameter in steps of 0.5; iterate over integer
    # tenths so the loop variable itself is never rebound.
    for tenths in range(5, 21, 5):
        tuning_parameter = tenths / 10.0
        regressionPredictor = learnRegression(train, 500, 0.00000000001,
                                              tuning_parameter)
        regression_error = evaluatePredictor(regressionPredictor, test)

        # Print the results.
        print("")
        print("----------")
        print("REGRESSION")
        print("----------")
        print("Lambda (lasso):  %s" % (tuning_parameter,))
        # Reports the full data set size, test tenth included.
        print("Number of examples:  %s" % (len(train_examples),))
        print("Regression MSE:  %s" % (regression_error,))
        print("")
def trainAndEvaluate():
    """Fit a gradient-boosted linear regressor and report its held-out MSE.

    The rows of 'data/train_updated.csv' are featurized (every column but
    the last is an input, the last column is the target) and shuffled. The
    first tenth becomes the test set; the model is trained on the rest and
    its error on the test set is printed.
    """
    csv_path = 'data/train_updated.csv'
    rows = csvAsArray(csv_path)
    headers = getCsvHeaders(csv_path)

    # One (feature vector, target) tuple per CSV row.
    examples = []
    for row in rows:
        inputs = [row[col] for col in range(len(row) - 1)]
        features = featurize(inputs, headers)
        target = row[len(rows[0]) - 1]
        examples.append((features, target))

    random.shuffle(examples)
    holdout = len(examples) // 10
    test = examples[:holdout]
    train_examples = examples[holdout:]

    # Train on the larger part, score on the held-out tenth.
    boosted_predictor = learnBoostedRegression(
        train_examples, 500, 1e-9, num_trees=5)
    regression_error = evaluatePredictor(boosted_predictor, test)

    # The example count printed below is the training part only.
    report = [
        "",
        "------------------",
        "BOOSTED REGRESSION",
        "------------------",
        "Number of examples: " + str(len(train_examples)),
        "Regression MSE: " + str(regression_error),
        "",
    ]
    for line in report:
        print(line)
def trainAndTest():
    """Defines K-means clustering and perform clustered regression.

    Reads 'data/train_updated.csv', featurizes every row (all columns but
    the last are inputs, the last column is the target) and shuffles the
    examples. For every second split index in range(1, NUM_SPLITS, 2) the
    split is held out, the remaining examples are clustered with k-means,
    one boosted regression predictor is trained per cluster, and the
    held-out split is scored by routing each point to the predictor of its
    nearest centroid.

    Returns:
        tuple: (predictor_list, centroids, regression_error) of the last
        evaluated split; ([], [], 0) when no split is evaluated.
    """
    train_array = util.csvAsArray('data/train_updated.csv')
    feature_names = util.getCsvHeaders('data/train_updated.csv')

    # Build (feature vector, value) tuples for the training data.
    train_examples = []
    for row in train_array:
        feature_values = [row[j] for j in range(len(row) - 1)]
        feature_vector = util.featurize(feature_values, feature_names)
        output = row[len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)
    num_examples = len(train_examples)

    # Defaults so the return below is defined even if no split is run.
    predictor_list = []
    centroids = []
    regression_error = 0

    for i in range(1, NUM_SPLITS, 2):
        # Floor division keeps the slice bounds integral.
        startTest = i * num_examples // NUM_SPLITS
        endTest = (i + 1) * num_examples // NUM_SPLITS
        currentTrain = train_examples[:startTest] + train_examples[endTest:]
        currentTest = train_examples[startTest:endTest]

        # Cluster the data using k-means.
        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans.kmeans(currentTrain, NUM_CENTROIDS, K_ITERS)

        # Group the training examples by their assigned centroid. The
        # assignment is looked up per example (index j), not per split.
        # NOTE(review): assumes assign[j] is the centroid index of
        # currentTrain[j] -- confirm against kmeans.kmeans.
        cluster_list = [[] for _ in range(len(centroids))]
        for j, example in enumerate(currentTrain):
            cluster_list[assign[j]].append(example)

        # ||c||^2 is the same for every query point, so compute it once.
        pre_computed_centroid_dots = [
            util.dotProduct(centroid, centroid) for centroid in centroids
        ]

        # Train one boosted regression model per cluster; predictor_list[k]
        # belongs to centroids[k].
        predictor_list = [
            boostedtree.learnBoostedRegression(cluster_points, SGD_ITERS,
                                               ETA, 5, 0)
            for cluster_points in cluster_list
        ]

        def predictor(x):
            """Predict x with the model of its nearest centroid."""
            # ||x - c||^2 = x.x - 2 c.x + c.c; x.x does not depend on c.
            x_dot_x = util.dotProduct(x, x)
            assignment = 0
            # Initialised once, outside the loop, so the running minimum
            # really tracks the closest centroid seen so far.
            min_dist = float('inf')
            for k in range(len(centroids)):
                cur_dist = (x_dot_x - 2 * util.dotProduct(centroids[k], x) +
                            pre_computed_centroid_dots[k])
                if cur_dist < min_dist:
                    assignment = k
                    min_dist = cur_dist
            return predictor_list[assignment](x)

        # The closure is only called here, within the iteration that
        # defined it, so it sees this split's centroids and predictors.
        regression_error = boostedtree.evaluatePredictor(
            predictor, currentTest)

        # Print the results.
        print("")
        print("------------------")
        print("CLUSTERED REGRESSION WITH BOOSTING")
        print("------------------")
        print("Leaving out segment: " + str(i))
        print("Number of centroids: " + str(len(centroids)))
        print("Number of examples: " + str(len(train_examples)))
        print("Regression MSE: " + str(regression_error))
        print("")

    return predictor_list, centroids, regression_error