Example #1
0
def main():
    """Fit a PCA on the training inputs and save the fitted model.

    Reads the command-line options via parseArgs(), loads the training
    inputs from ``args.input`` (the other three values returned by
    filehandlers.loadTrainingSets are ignored), fits a PCA constructed
    with ``args.variance``, prints how many components were kept
    relative to the number of input features, and writes the fitted PCA
    to ``args.output`` via filehandlers.savePca.
    """
    options = parseArgs()

    # Only the first returned value (the training inputs) is needed here.
    trainingInputs = filehandlers.loadTrainingSets(options.input)[0]

    # NOTE(review): the report below treats args.variance as a fraction
    # (it is multiplied by 100) -- confirm parseArgs supplies a float in (0, 1).
    reducer = sklearn.decomposition.PCA(options.variance)
    reducer.fit(trainingInputs)

    # components_ has one row per retained component, one column per feature.
    keptComponents, totalFeatures = reducer.components_.shape[:2]
    percentKept = 100. * keptComponents / totalFeatures
    report = "{:.1f}% ({} / {}) features to retain {}% of the variance".format(
        percentKept, keptComponents, totalFeatures, 100 * options.variance)
    print(report)

    filehandlers.savePca(options.output, reducer)
Example #2
0
def runTests(filename, pcaFile=None):
    """Load a data set, print baseline scores and run the experiments.

    Args:
        filename: path handed to filehandlers.loadTrainingSets, which
            returns training inputs/outputs and validation inputs/outputs.
        pcaFile: optional path handed to filehandlers.loadPca; when
            truthy, both input sets are projected with the loaded PCA
            before any experiment runs.

    The baselines score constant predictions (zero, one, the per-output
    training mean, and the normalised variants of the latter two) against
    the validation outputs using rms(). Afterwards a linear regression
    and one random-forest experiment are run.
    """
    print("TRAINING/TESTING {}".format(filename))

    trainingInputs, trainingOutputs, validationInputs, validationOutputs = filehandlers.loadTrainingSets(filename)
    if pcaFile:
        pca = filehandlers.loadPca(pcaFile)
        trainingInputs = pca.transform(trainingInputs)
        validationInputs = pca.transform(validationInputs)

    numFeatures = trainingInputs.shape[1]
    print("Loaded training data with shape {} and {}".format(trainingInputs.shape, trainingOutputs.shape))
    print("Loaded validation data with shape {} and {}".format(validationInputs.shape, validationOutputs.shape))

    avgPredictions = trainingOutputs.mean(axis=0)
    flatValidation = validationOutputs.ravel()

    # One copy of the average prediction per validation row. Tiling (not
    # ndarray.repeat) keeps the flattened layout aligned with the row-major
    # validationOutputs.ravel(): repeat() would emit a0,a0,...,a1,a1,...
    # and compare each average against the wrong output column.
    tiledAverage = numpy.tile(avgPredictions, (validationOutputs.shape[0], 1))

    print("Baselines")
    print("\tPredict zero on all outputs (val): {}".format(rms(0, flatValidation)))
    print("\tPredict one on all outputs (val): {}".format(rms(1, flatValidation)))
    print("\tPredict average on all outputs (val): {}".format(rms(tiledAverage.ravel(), flatValidation)))

    print("\tPredict one on all outputs then norm (val): {}".format(rms(normalizePredictions(numpy.ones(validationOutputs.shape)).ravel(), flatValidation)))

    print("\tPredict average on all outputs then norm (val): {}".format(rms(normalizePredictions(tiledAverage).ravel(), flatValidation)))

    experimentLinearRegression(trainingInputs, trainingOutputs, validationInputs, validationOutputs)

    # Previously tried random-forest configurations, left disabled.
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, 5)
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, 5, maxFeatures="auto")
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, None)
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, None, maxFeatures=int(numFeatures * 0.75))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 40, None, maxFeatures=int(numFeatures * 0.5))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 80, None, maxFeatures=int(numFeatures * 0.5))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 160, None, maxFeatures=int(numFeatures * 0.5))

    #for numTrees in [ 40, 80, 160 ]:
    #    experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, numTrees, None, maxFeatures=int(numFeatures * 0.5), minSplit=5)

    experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 40, None, maxFeatures=int(numFeatures * 0.5), minSplit=20, saveAsPath="randomForest-40t-0.5f-20mss.pickle")