def main():
    """Load a pickled model, predict on the testing data, and write a CSV.

    All settings come from ``parseArgs()``: the model path (``args.model``),
    the testing input (``args.testingInput``), an optional PCA file
    (``args.pca``), the output path (``args.outFile``) and the file the
    header is taken from (``args.headersFrom``).

    Each output row is the testing label followed by the model's predictions
    for that example. The header is written without a comment prefix, so the
    file needs no manual clean-up afterwards.
    """
    args = parseArgs()

    # Pickles are binary data: open in binary mode so the stream is never
    # altered by newline translation or text decoding.
    # NOTE(review): unpickling can execute arbitrary code -- only load model
    # files that come from a trusted source.
    with open(args.model, "rb") as modelIn:
        model = pickle.load(modelIn)

    # NOTE(review): args.pca is also handed to loadTestingData; confirm the
    # loader does not already apply the transform that is applied below.
    testingLabels, testingFeatures = filehandlers.loadTestingData(
        args.testingInput, pca=args.pca)
    if args.pca:
        pca = filehandlers.loadPca(args.pca)
        testingFeatures = pca.transform(testingFeatures)

    predictions = model.predict(testingFeatures)

    # Turn the labels into a single column so they can be stacked beside the
    # predictions. reshape raises ValueError if the number of labels does not
    # match the number of prediction rows.
    labelColumn = testingLabels.reshape(predictions.shape[0], 1)

    numpy.savetxt(
        args.outFile,
        numpy.hstack([labelColumn, predictions]),
        delimiter=",",
        # One integer label column followed by one float column per output.
        fmt=["%d"] + ["%f"] * predictions.shape[1],
        header=loadHeader(args.headersFrom),
        # savetxt prefixes the header with "# " by default; an empty prefix
        # writes the header line as plain CSV.
        comments="")
def runTests(filename, pcaFile=None):
    """Print baseline scores and run the experiments for one training file.

    Loads the training/validation split from ``filename``, optionally
    transforms both input sets with the PCA loaded from ``pcaFile``, prints
    several constant-prediction baselines scored with ``rms`` against the
    validation outputs, and then runs the linear-regression and random-forest
    experiments.

    Args:
        filename: Passed to ``filehandlers.loadTrainingSets``.
        pcaFile: Optional; when truthy it is passed to
            ``filehandlers.loadPca`` and the resulting transform is applied
            to the training and validation inputs.
    """
    print("TRAINING/TESTING {}".format(filename))

    trainingInputs, trainingOutputs, validationInputs, validationOutputs = \
        filehandlers.loadTrainingSets(filename)

    if pcaFile:
        pca = filehandlers.loadPca(pcaFile)
        trainingInputs = pca.transform(trainingInputs)
        validationInputs = pca.transform(validationInputs)

    numFeatures = trainingInputs.shape[1]

    print("Loaded training data with shape {} and {}".format(
        trainingInputs.shape, trainingOutputs.shape))
    print("Loaded validation data with shape {} and {}".format(
        validationInputs.shape, validationOutputs.shape))

    # Per-output mean of the training targets, used as a constant prediction.
    avgPredictions = trainingOutputs.mean(axis=0)
    validationTargets = validationOutputs.ravel()

    # One copy of the average per validation row. Flattening this row-major
    # matches the order of validationOutputs.ravel(); ndarray.repeat would
    # instead group all copies of each output together and misalign the
    # predictions with the targets whenever there is more than one output.
    tiledAverage = numpy.tile(avgPredictions, (validationOutputs.shape[0], 1))

    print("Baselines")
    print("\tPredict zero on all outputs (val): {}".format(
        rms(0, validationTargets)))
    print("\tPredict one on all outputs (val): {}".format(
        rms(1, validationTargets)))
    print("\tPredict average on all outputs (val): {}".format(
        rms(tiledAverage.ravel(), validationTargets)))
    print("\tPredict one on all outputs then norm (val): {}".format(
        rms(normalizePredictions(numpy.ones(validationOutputs.shape)).ravel(),
            validationTargets)))
    print("\tPredict average on all outputs then norm (val): {}".format(
        rms(normalizePredictions(tiledAverage).ravel(), validationTargets)))

    experimentLinearRegression(trainingInputs, trainingOutputs, validationInputs, validationOutputs)

    # Earlier random-forest configurations, kept as a record of what was tried:
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, 5)
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, 5, maxFeatures="auto")
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, None)
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, None, maxFeatures=int(numFeatures * 0.75))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 40, None, maxFeatures=int(numFeatures * 0.5))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 80, None, maxFeatures=int(numFeatures * 0.5))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 160, None, maxFeatures=int(numFeatures * 0.5))

    #for numTrees in [ 40, 80, 160 ]:
    #    experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, numTrees, None, maxFeatures=int(numFeatures * 0.5), minSplit=5)

    # Current configuration; the fitted model is saved to the given path.
    experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs,
                           40, None,
                           maxFeatures=int(numFeatures * 0.5),
                           minSplit=20,
                           saveAsPath="randomForest-40t-0.5f-20mss.pickle")