def learnPredictor(trainExamples, testExamples, featureExtractor,
                   numIters=3, eta=0.1):
    """Learn linear classifier weights by stochastic subgradient descent
    on the hinge loss.

    Args:
        trainExamples: re-iterable collection of (input, label) pairs; it is
            traversed once per pass. Pairs whose input is None are skipped.
            Labels are assumed to be +1 / -1 (hinge loss) -- TODO confirm
            against the data loader.
        testExamples: accepted for interface compatibility; not used here.
        featureExtractor: callable mapping an input to a sparse feature
            vector (dict-like of feature -> value).
        numIters: number of passes over trainExamples. Defaults to 3, the
            value that used to be hard-coded.
        eta: step size. Defaults to 0.1, the value that used to be
            hard-coded.

    Returns:
        collections.Counter mapping feature -> learned weight.
    """
    weights = collections.Counter()

    def loss(w, phi, y):
        # Hinge loss: max(0, 1 - margin).
        return max(1 - util.dotProduct(w, phi) * y, 0)

    def sgradLoss(w, phi, y):
        # The subgradient is zero once the hinge loss is zero.
        if loss(w, phi, y) == 0:
            return collections.Counter()
        # Build a fresh vector instead of negating phi in place, so the
        # feature vector handed back by featureExtractor is never mutated.
        return {key: -1 * value * y for key, value in phi.items()}

    def predictor(x):
        # Only used for the DEBUG report below.
        if x is None:
            return -1
        score = util.dotProduct(featureExtractor(x), weights)
        # The negative class is -1 (not 0) so predictions are comparable
        # with the labels and with the None branch above.
        return 1 if score > 0 else -1

    for _ in range(numIters):
        for x, y in trainExamples:
            if x is None:
                continue
            gradient = sgradLoss(weights, featureExtractor(x), y)
            util.increment(weights, -1 * eta, gradient)
        if DEBUG:
            # Training error after this pass.
            print(util.evaluatePredictor(trainExamples, predictor))
    return weights
def trainAndEvaluate():
    """Trains baseline and oracle predictors and prints their mean squared
    errors, both measured on the training examples.
    """
    # Column of data/train.csv that holds the target value.
    output_column = 80

    # Import the training data as a numpy matrix
    train_array = csvAsArray('data/train.csv')

    # Format the training data as a list of (input, output) tuples.
    train_examples = []
    for row in train_array:
        # Materialise the inputs as a tuple. The previous generator
        # expression was lazy: it could be consumed only once (so it was
        # already exhausted by the time the predictors were evaluated) and it
        # looked up the loop index when consumed rather than when created.
        # A tuple is re-iterable and, like the generator it replaces, hashable.
        input_data = tuple(row[:-1])
        # Scale the target down by 1000.
        output = row[output_column] / 1000.0
        train_examples.append((input_data, output))

    # Define predictor functions for baseline and oracle
    baseline = learnBaseline(train_array)
    oracle_train = learnOracle(train_examples)

    # Evaluate mean squared error of predictors
    baseline_error = evaluatePredictor(baseline, train_examples)
    oracle_error = evaluatePredictor(oracle_train, train_examples)

    # Print the results. "%s %s" reproduces the single-space separator of a
    # two-item print statement while staying valid on Python 2 and 3.
    print("")
    print("-------------------")
    print("BASELINE AND ORACLE")
    print("-------------------")
    print("%s %s" % ("Number of examples: ", len(train_examples)))
    print("%s %s" % ("Baseline (median) MSE: ", baseline_error))
    print("%s %s" % ("Oracle MSE: ", oracle_error))
    print("")
def trainAndTest():
    """Fits a k-means model plus a boosted regression on held-out splits of
    the training data and prints the validation error for each split.

    For every other split index (0, 2, 4, ... below NUM_SPLITS) the
    corresponding segment of the shuffled examples is held out, the models
    are fitted on the rest, and the sum of the two models' predictions is
    scored on the held-out segment with util.evaluatePredictor.
    """
    # Import the training and test data as numpy arrays
    train_array = util.csvAsArray('data/train_updated.csv')
    # NOTE(review): the test data is loaded but never used below; kept so
    # the function still reads (and fails on) the same files as before.
    test_array = util.csvAsArray('data/test.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/train_updated.csv')
    train_examples = []
    for row in train_array:
        # Every column except the last is a feature; the target is read from
        # the last column index of the first row.
        feature_vector = featurize(list(row[:-1]), feature_names)
        output = row[len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)
    num_examples = len(train_examples)
    for i in range(0, NUM_SPLITS, 2):
        # Floor division keeps the slice bounds integral.
        startTest = i * num_examples // NUM_SPLITS
        endTest = (i + 1) * num_examples // NUM_SPLITS
        currentTrainExamples = (train_examples[:startTest] +
                                train_examples[endTest:])

        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans(currentTrainExamples, NUM_CLUSTERS, 500)

        # The boosted model is trained on loss_list as its target --
        # presumably the per-example k-means residuals; confirm in kmeans.
        currentBoostedExamples = [
            (example[0], loss_list[ind])
            for ind, example in enumerate(currentTrainExamples)
        ]
        boostedRegPredictor = learnBoostedRegression(
            currentBoostedExamples, 500, 0.00000000001,
            num_trees=NUM_B_TREES)

        # |c|^2 for each centroid, computed once per split.
        pre_computed_centroid_dots = [
            util.dotProduct(centroids[ind], centroids[ind])
            for ind in range(NUM_CLUSTERS)
        ]

        def kmeanspredictor(x):
            # Nearest centroid by squared distance |x|^2 - 2 c.x + |c|^2.
            x_dot_x = util.dotProduct(x, x)
            assignment = 0
            # Start from infinity: the old fixed cap of 1000000 silently
            # sent any point farther than that from every centroid to
            # cluster 0.
            min_dist = float('inf')
            for j in range(NUM_CLUSTERS):
                cur_dist = (x_dot_x - 2 * util.dotProduct(centroids[j], x) +
                            pre_computed_centroid_dots[j])
                if cur_dist < min_dist:
                    assignment = j
                    min_dist = cur_dist
            return centroid_vals[assignment]

        def boostedKPredictor(x):
            return kmeanspredictor(x) + boostedRegPredictor(x)

        validation_error = util.evaluatePredictor(
            boostedKPredictor, train_examples[startTest:endTest])
        # Space-joined to match the output of the original four-item print.
        print("%s %s %s %s" % (
            "leaving out the", i + 1,
            "th segment of the data, the validation error for the regression is:",
            validation_error))
def crossValidate(predictor, num_folds):
    """Performs k-fold cross validation with a specified learner and prints
    the results for every fold.

    Args:
        predictor (func): A learner. It is called as
            predictor(training_set, 1, 0.01, tuning_parameter) and must
            return a prediction function that evaluatePredictor can score.
            (The 1 and 0.01 look like an iteration count and a step size --
            confirm against the learner's signature.)
        num_folds (int): Number of data folds for cross-validation.
    """
    # Import the training data as a numpy array
    train_array = csvAsArray('data/train_updated.csv')

    # Column names, passed to featurize together with each row's values
    feature_names = getCsvHeaders('data/train_updated.csv')

    # Convert the training array into ([features], value) example tuples.
    # Every column except the last is a feature; the target is read from the
    # last column index of the first row.
    train_examples = []
    for row in train_array:
        feature_vector = featurize(list(row[:-1]), feature_names)
        output = row[len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    # Randomize the order of the example tuples to aid validation
    random.shuffle(train_examples)

    # Floor division keeps the slice bounds integral. Any remainder examples
    # at the end of the list are never validated on, only trained on.
    validation_set_size = len(train_examples) // num_folds
    for fold in range(num_folds):
        # Create training and validation sets
        validation_start = fold * validation_set_size
        validation_end = validation_start + validation_set_size
        validation_set = train_examples[validation_start:validation_end]
        training_set = (train_examples[:validation_start] +
                        train_examples[validation_end:])

        # Train on this fold's training set (previously an undefined name
        # was passed here) and score on the held-out validation set.
        tuning_parameter = 1
        predictor_fn = predictor(training_set, 1, 0.01, tuning_parameter)
        regression_error = evaluatePredictor(predictor_fn, validation_set)

        # Print the results. "%s %s" reproduces the single-space separator
        # of a two-item print statement on both Python 2 and 3.
        print("")
        print("----------")
        print("REGRESSION")
        print("----------")
        print("%s %s" % ("Lambda: ", tuning_parameter))
        print("%s %s" % ("Number of examples: ", len(train_examples)))
        print("%s %s" % ("Regression MSE: ", regression_error))
        print("")
def r_squared(examples, predictor):
    """Prints the ratio of prediction error to output variation and returns
    one minus that ratio.

    Args:
        examples: sequence of pairs whose second element is the output value.
        predictor: prediction function handed to util.evaluatePredictor.

    Returns:
        1 - (scaled prediction error / summed squared deviation of outputs).
    """
    # util.evaluatePredictor's result scaled by the number of examples
    # (presumably turning a mean error into a total -- confirm in util).
    scaled_error = util.evaluatePredictor(predictor, examples) * len(examples)

    targets = [examples[idx][1] for idx in range(len(examples))]
    average = 1.0 * sum(targets) / len(targets)

    # Summed (not averaged) squared deviation from the mean output.
    total_deviation = 1.0 * sum(
        math.pow(target - average, 2) for target in targets)

    ratio = scaled_error / total_deviation
    print(ratio)
    return 1 - ratio
def trainAndEvaluate():
    """Trains a linear regression predictor for several tuning-parameter
    values and prints the mean squared error of each on a held-out tenth of
    the data.
    """
    data_path = 'data/train_updated.csv'

    # Load the raw rows and the column names
    rows = csvAsArray(data_path)
    column_names = getCsvHeaders(data_path)

    # Build (feature vector, value) tuples: every column but the last is a
    # feature, and the value sits at the last column index of the first row.
    train_examples = []
    for row in rows:
        raw_features = [row[col] for col in range(len(row) - 1)]
        features = featurize(raw_features, column_names)
        target = row[len(rows[0]) - 1]
        train_examples.append((features, target))

    # Shuffle, then hold out the first tenth for testing
    random.shuffle(train_examples)
    holdout_size = len(train_examples) // 10
    test = train_examples[:holdout_size]
    train = train_examples[holdout_size:]

    # Sweep the tuning parameter over 0.5, 1.0, 1.5 and 2.0
    for tenths in range(5, 21, 5):
        tuning_parameter = 1.0 * tenths / 10
        regressionPredictor = learnRegression(train, 500, 0.00000000001,
                                              tuning_parameter)
        regression_error = evaluatePredictor(regressionPredictor, test)

        # Report this setting; "%s %s" keeps the single-space separator of
        # a two-item print statement.
        print("")
        print("----------")
        print("REGRESSION")
        print("----------")
        print("%s %s" % ("Lambda (lasso): ", tuning_parameter))
        print("%s %s" % ("Number of examples: ", len(train_examples)))
        print("%s %s" % ("Regression MSE: ", regression_error))
        print("")
def trainAndEvaluate():
    """Trains a gradient-boosted linear regression predictor and prints its
    mean squared error on a held-out tenth of the data.
    """
    source_csv = 'data/train_updated.csv'
    table = csvAsArray(source_csv)
    headers = getCsvHeaders(source_csv)

    # One (feature vector, value) tuple per row: all columns except the last
    # are featurized; the value comes from the last column index of row 0.
    examples = []
    for record in table:
        values = [record[k] for k in range(len(record) - 1)]
        vector = featurize(values, headers)
        examples.append((vector, record[len(table[0]) - 1]))

    # Shuffle and split: first tenth for testing, remainder for training
    random.shuffle(examples)
    cut = len(examples) // 10
    test = examples[:cut]
    train_examples = examples[cut:]

    # Fit the boosted model on the training part and score it on the test part
    boostedRegressionPredictor = learnBoostedRegression(
        train_examples, 500, 0.000000001, num_trees=5)
    regression_error = evaluatePredictor(boostedRegressionPredictor, test)

    # Print the results (the example count is the training part only)
    print("")
    print("------------------")
    print("BOOSTED REGRESSION")
    print("------------------")
    print("Number of examples: " + str(len(train_examples)))
    print("Regression MSE: " + str(regression_error))
    print("")
def featureExtractor(x):
    """Maps a whitespace-separated phrase to a sparse indicator feature
    vector.

    The first token is treated as the left context, the last token as the
    right context and everything in between as the entity,
    e.g. x = "took Mauritius into".

    Returns:
        defaultdict(float) with value 1 for: the full entity string, the left
        and right context words, and, for each entity word, the word itself,
        its first four characters and its last four characters.
    """
    words = x.split()
    left_word = words[0]
    entity_words = words[1:-1]
    right_word = words[-1]

    features = defaultdict(float)
    features['entity is ' + ' '.join(entity_words)] = 1
    features['left is ' + left_word] = 1
    features['right is ' + right_word] = 1
    for token in entity_words:
        features['entity contains ' + token] = 1
        features['entity contains prefix ' + token[:4]] = 1
        features['entity contains suffix ' + token[-4:]] = 1
    return features


# Learn a predictor
weights = submission.learnPredictor(trainExamples, devExamples,
                                    featureExtractor, 30, 0.05)
util.outputWeights(weights, 'weights')
util.outputErrorAnalysis(devExamples, featureExtractor, weights,
                         'error-analysis')

# Evaluate on the held-out test set
testExamples = util.readExamples('names.test')


def predictor(x):
    # Label 1 when the linear score is positive, otherwise -1.
    if util.dotProduct(featureExtractor(x), weights) > 0:
        return 1
    return -1


print('%s %s' % ('test error =',
                 util.evaluatePredictor(testExamples, predictor)))