def readCommand(argv):
    """Process the command used to run from the command line.

    Parses ``argv`` with optparse, echoes the chosen configuration, validates
    the numeric options and instantiates the requested classifier.

    Args:
        argv: list of option strings to parse. Any positional argument left
            over after option parsing is rejected.

    Returns:
        A tuple ``(args, options)``: ``args`` is a dict with the keys
        'classifier', 'featureFunction' and 'printImage'; ``options`` is the
        optparse values object.

    Raises:
        Exception: if positional arguments remain after parsing.
        SystemExit: with status 2 when an option fails validation.
    """
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option('-c', '--classifier', help=default('The type of classifier'),
                      choices=['mostFrequent', 'nb', 'naiveBayes', 'perceptron', 'mira'],
                      default='perceptron')
    parser.add_option('-d', '--data', help=default('Dataset to use'),
                      choices=['digits', 'faces'], default='digits')
    parser.add_option('-t', '--training', help=default('The size of the training set'),
                      default=100, type="int")
    parser.add_option('-f', '--features', help=default('Whether to use enhanced features'),
                      default=False, action="store_true")
    parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'),
                      default=False, action="store_true")
    parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"),
                      default=0, type="int")
    parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"),
                      default=1, type="int")
    parser.add_option('-w', '--weights', help=default('Whether to print weights'),
                      default=False, action="store_true")
    parser.add_option('-k', '--smoothing',
                      help=default("Smoothing parameter (ignored when using --autotune)"),
                      type="float", default=2.0)
    parser.add_option('-a', '--autotune',
                      help=default("Whether to automatically tune hyperparameters"),
                      default=False, action="store_true")
    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"),
                      default=3, type="int")
    parser.add_option('-s', '--test', help=default("Amount of test data to use"),
                      default=TEST_SET_SIZE, type="int")
    parser.add_option('-n', '--analysis', help=default("Shows which data is wrongly predicted"),
                      default=True, action="store_true")
    parser.add_option('-r', '--random',
                      help=default("Trains the data set using random data and calculates averages for percent accuracy and standard deviation"),
                      default=True, action="store_true")
    # -n and -r are store_true flags that already default to True, so on their
    # own they can never be switched off. The negated forms below write to the
    # same destinations and keep the existing defaults untouched.
    parser.add_option('--no-analysis', dest='analysis', action="store_false",
                      help="Do not show which data is wrongly predicted")
    parser.add_option('--no-random', dest='random', action="store_false",
                      help="Train on a fixed-size training set instead of random data")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    print("Doing classification")
    print("--------------------")
    print("Data:\t\t" + options.data)
    print("Classifier:\t\t" + options.classifier)
    print("Using enhanced features?:\t" + str(options.features))
    if not options.random:
        print("Training set size:\t" + str(options.training))

    if options.data == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit
    elif options.data == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    else:
        print("Unknown dataset", options.data)
        print(USAGE_STRING)
        sys.exit(2)

    # Digits have ten classes; faces are a binary problem.
    if options.data == "digits":
        legalLabels = range(10)
    else:
        legalLabels = range(2)

    if options.training <= 0:
        print("Training set size should be a positive integer (you provided: %d)" % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.smoothing <= 0:
        print("Please provide a positive number for smoothing (you provided: %f)" % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)" % (options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    # NOTE(review): 'mostFrequent' is an accepted --classifier choice but has
    # no branch here, so it ends in the "Unknown classifier" exit below.
    if options.classifier == "naiveBayes" or options.classifier == "nb":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if options.autotune:
            print("Using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("Using smoothing parameter k=%f for naivebayes" % options.smoothing)
    elif options.classifier == "perceptron":
        classifier = perceptron.PerceptronClassifier(legalLabels, options.iterations)
    elif options.classifier == "mira":
        classifier = mira.MiraClassifier(legalLabels, options.iterations)
        if options.autotune:
            print("Using automatic tuning for MIRA")
            classifier.automaticTuning = True
        else:
            print("Using default C=0.001 for MIRA")
    else:
        print("Unknown classifier:", options.classifier)
        print(USAGE_STRING)
        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
def readCommand(argv):
    """Process the command used to run from the command line.

    Parses ``argv`` with optparse, echoes the chosen configuration, validates
    the numeric options and instantiates the requested classifier for the
    digits, faces or pacman data set.

    Args:
        argv: list of option strings to parse. Any positional argument left
            over after option parsing is rejected.

    Returns:
        A tuple ``(args, options)``: ``args`` is a dict with the keys
        'agentToClone', 'classifier', 'featureFunction' and 'printImage';
        ``options`` is the optparse values object.

    Raises:
        Exception: if positional arguments remain after parsing.
        SystemExit: with status 2 when an option fails validation, including
            the unsupported combination of the MIRA classifier with pacman
            data.
    """
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option('-c', '--classifier', help=default('The type of classifier'),
                      choices=['mostFrequent', 'nb', 'naiveBayes', 'perceptron',
                               'mira', 'minicontest'],
                      default='mostFrequent')
    parser.add_option('-d', '--data', help=default('Dataset to use'),
                      choices=['digits', 'faces', 'pacman'], default='digits')
    parser.add_option('-t', '--training', help=default('The size of the training set'),
                      default=100, type="int")
    parser.add_option('-f', '--features', help=default('Whether to use enhanced features'),
                      default=False, action="store_true")
    parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'),
                      default=False, action="store_true")
    parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"),
                      default=0, type="int")
    parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"),
                      default=1, type="int")
    parser.add_option('-w', '--weights', help=default('Whether to print weights'),
                      default=False, action="store_true")
    parser.add_option('-k', '--smoothing',
                      help=default("Smoothing parameter (ignored when using --autotune)"),
                      type="float", default=2.0)
    parser.add_option('-a', '--autotune',
                      help=default("Whether to automatically tune hyperparameters"),
                      default=False, action="store_true")
    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"),
                      default=3, type="int")
    parser.add_option('-s', '--test', help=default("Amount of test data to use"),
                      default=TEST_SET_SIZE, type="int")
    parser.add_option('-g', '--agentToClone', help=default("Pacman agent to copy"),
                      default=None, type="str")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    # Every print below takes a single string argument, so the parenthesised
    # form emits exactly what the equivalent print statement would.
    print("Doing classification")
    print("--------------------")
    print("data:\t\t" + options.data)
    print("classifier:\t\t" + options.classifier)
    if not options.classifier == 'minicontest':
        print("using enhanced features?:\t" + str(options.features))
    else:
        print("using minicontest feature extractor")
    print("training set size:\t" + str(options.training))

    if options.data == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit
        # The minicontest classifier always uses its own digit extractor.
        if options.classifier == 'minicontest':
            featureFunction = contestFeatureExtractorDigit
    elif options.data == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    elif options.data == "pacman":
        printImage = None
        if options.features:
            featureFunction = enhancedFeatureExtractorPacman
        else:
            featureFunction = basicFeatureExtractorPacman
    else:
        print("Unknown dataset " + options.data)
        print(USAGE_STRING)
        sys.exit(2)

    # Fix: faces used to fall through to the pacman move labels; face
    # detection is a binary problem and needs its own label set.
    if options.data == "digits":
        legalLabels = range(10)
    elif options.data == "faces":
        legalLabels = range(2)
    else:
        legalLabels = ['Stop', 'West', 'East', 'North', 'South']

    if options.training <= 0:
        print("Training set size should be a positive integer (you provided: %d)" % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.smoothing <= 0:
        print("Please provide a positive number for smoothing (you provided: %f)" % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)" % (
                options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    if options.classifier == "mostFrequent":
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif options.classifier == "naiveBayes" or options.classifier == "nb":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if options.autotune:
            print("using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("using smoothing parameter k=%f for naivebayes" % options.smoothing)
    elif options.classifier == "perceptron":
        if options.data != 'pacman':
            classifier = perceptron.PerceptronClassifier(
                legalLabels, options.iterations)
        else:
            classifier = perceptron_pacman.PerceptronClassifierPacman(
                legalLabels, options.iterations)
    elif options.classifier == "mira":
        # Fix: MIRA is only built for non-pacman data. Previously the pacman
        # case left `classifier` unassigned and crashed later; report it as a
        # usage error instead.
        if options.data == 'pacman':
            print("The mira classifier is not available for the pacman dataset")
            print(USAGE_STRING)
            sys.exit(2)
        classifier = mira.MiraClassifier(legalLabels, options.iterations)
        if options.autotune:
            print("using automatic tuning for MIRA")
            classifier.automaticTuning = True
        else:
            print("using default C=0.001 for MIRA")
    elif options.classifier == 'minicontest':
        import minicontest
        classifier = minicontest.contestClassifier(legalLabels)
    else:
        print("Unknown classifier: " + options.classifier)
        print(USAGE_STRING)
        sys.exit(2)

    args['agentToClone'] = options.agentToClone
    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
def runClassifier():
    """
    Harness code for running different classifiers on the face or digit data.

    This is the main function for classification, and is designed
    to be invoked from the command line (outside the Python interpreter).

    Usage:
      > python dataClassifier.py
      OR
      > python dataClassifier.py <data> <classifierName>
      OR
      > python dataClassifier.py <data> <classifierName> <featureFunction>
      OR
      > python dataClassifier.py <data> <classifierName> <featureFunction> <numTrainingExamples>
      OR
      > python dataClassifier.py <data> <classifierName> <featureFunction> <numTrainingExamples> <odds class1 class2>

    For example:
      > python dataClassifier.py digits naivebayes basic 1000

    would run the naive Bayes classifier on 1000 training examples using the
    basicFeatureExtractor function, and then test the classifier on the test
    data.

    Missing positional arguments are filled in by appending defaults to
    ``sys.argv`` itself. The function returns None; it returns early (after
    printing a message) when the data/feature-function combination or the
    classifier name is not recognised.
    """
    print("Doing classification")
    print("--------------------")

    # Assign default values for arguments if they are not provided.
    # Each check runs in turn, so one missing argument cascades into all the
    # later defaults as sys.argv grows.
    if len(sys.argv) == 1:
        print("No data specified; using digits.")
        sys.argv.append("digits")
    if len(sys.argv) == 2:
        print("No classifier specified; using default.")
        sys.argv.append("mostfrequent")
    if len(sys.argv) == 3:
        print("No feature extraction function specified; using default.")
        sys.argv.append("basic")
    if len(sys.argv) == 4:
        print("No training set size specified; using default.")
        sys.argv.append("100")
    if len(sys.argv) == 5:
        print("Not doing odds ratio computation.")
        sys.argv.append("noodds")

    # Set up variables according to the command line input.
    dataName = sys.argv[1]
    classifierName = sys.argv[2]
    featureName = sys.argv[3]
    print("data:\t\t" + dataName)
    print("classifier:\t\t" + classifierName)
    print("feature extractor:\t" + featureName)
    print("training set size:\t" + sys.argv[4])

    if dataName == "digits" and featureName == "basic":
        featureFunction = basicFeatureExtractorDigit
    elif dataName == "faces" and featureName == "basic":
        featureFunction = basicFeatureExtractorFace
    elif dataName == "digits" and featureName == "enhanced":
        featureFunction = enhancedFeatureExtractorDigit
    elif dataName == "faces" and featureName == "enhanced":
        featureFunction = enhancedFeatureExtractorFace
    else:
        # Fix: report the feature-function argument, not the classifier name.
        print("Unknown feature function: " + featureName)
        return

    if dataName == "digits":  # if digits detect
        legalLabels = range(10)
    else:  # if face detect
        legalLabels = range(2)

    if classifierName == "mostfrequent":
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif classifierName == "naivebayes":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    elif classifierName == "perceptron":
        classifier = perceptron.PerceptronClassifier(legalLabels)
    else:
        print("Unknown classifier: " + classifierName)
        return

    # Load data
    numTraining = int(sys.argv[4])

    if dataName == "faces":
        rawTrainingData = samples.loadDataFile(
            "facedata/facedatatrain", numTraining,
            FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTraining)
        # NOTE(review): face validation data is read from the training files,
        # so it overlaps the training set -- confirm this is intended.
        rawValidationData = samples.loadDataFile(
            "facedata/facedatatrain", TEST_SET_SIZE,
            FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile(
            "facedata/facedatatest", TEST_SET_SIZE,
            FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile(
            "facedata/facedatatestlabels", TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile(
            "digitdata/trainingimages", numTraining,
            DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile(
            "digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile(
            "digitdata/validationimages", TEST_SET_SIZE,
            DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "digitdata/validationlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile(
            "digitdata/testimages", TEST_SET_SIZE,
            DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile(
            "digitdata/testlabels", TEST_SET_SIZE)

    # Extract features
    print("Extracting features...")
    trainingData = [featureFunction(datum) for datum in rawTrainingData]
    validationData = [featureFunction(datum) for datum in rawValidationData]
    testData = [featureFunction(datum) for datum in rawTestData]

    # Conduct training and testing
    print("Training...")
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)

    print("Validating...")
    guesses = classifier.classify(validationData)
    correct = sum(1 for guess, label in zip(guesses, validationLabels) if guess == label)
    print("%s correct out of %s (%.1f%%)." % (
        correct, len(validationLabels), 100.0 * correct / len(validationLabels)))

    print("Testing...")
    guesses = classifier.classify(testData)
    correct = sum(1 for guess, label in zip(guesses, testLabels) if guess == label)
    print("%s correct out of %s (%.1f%%)." % (
        correct, len(testLabels), 100.0 * correct / len(testLabels)))

    util.pause()
    analysis(classifier, guesses, testLabels, rawTestData)

    # do odds ratio computation if specified at command line
    if sys.argv[5] == "odds" and len(sys.argv) == 8:
        features_class1, features_class2, features_odds = classifier.findHighOddsFeatures(
            int(sys.argv[6]), int(sys.argv[7]))
        if dataName == "faces":
            width, height = FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT
        else:
            width, height = DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT
        printImage(features_class1, width, height)
        printImage(features_class2, width, height)
        printImage(features_odds, width, height)
def readCommand(argv):
    """Processes the command used to run from the command line.

    Parses ``argv`` with optparse, echoes the chosen configuration, validates
    the numeric options and instantiates the requested classifier.

    Args:
        argv: list of option strings to parse. Any positional argument left
            over after option parsing is rejected.

    Returns:
        A tuple ``(args, options)``: ``args`` is a dict with the keys
        'classifier', 'featureFunction' and 'printImage'; ``options`` is the
        optparse values object.

    Raises:
        Exception: if positional arguments remain after parsing.
        SystemExit: with status 2 when an option fails validation.
    """
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option('-r', '--run',
                      help=default('automatically runs training and test cycle for 5 times'),
                      default=False, action='store_true')
    parser.add_option('-c', '--classifier', help=default('The type of classifier'),
                      choices=['perceptron', 'naiveBayes', 'mira'],
                      default='naiveBayes')
    parser.add_option('-d', '--data', help=default('Dataset to use'),
                      choices=['digits', 'faces'], default='digits')
    parser.add_option('-t', '--training',
                      help=default('The ratio of the training set to use'),
                      default=1.0, type="float")
    parser.add_option('-f', '--features', help=default('Whether to use enhanced features'),
                      default=False, action="store_true")
    parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'),
                      default=False, action="store_true")
    parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"),
                      default=0, type="int")
    parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"),
                      default=1, type="int")
    parser.add_option('-k', '--smoothing',
                      help=default("Smoothing parameter (ignored when using --autotune)"),
                      type="float", default=2.0)
    parser.add_option('-a', '--autotune',
                      help=default("Whether to automatically tune hyperparameters"),
                      default=False, action="store_true")
    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"),
                      default=3, type="int")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    print("Doing classification")
    print("--------------------")
    print("data:\t\t" + options.data)
    print("classifier:\t\t" + options.classifier)
    print("using enhanced features?:\t" + str(options.features))

    if options.data == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit
    elif options.data == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    else:
        print("Unknown dataset", options.data)
        print(USAGE_STRING)
        sys.exit(2)

    if options.data == "digits":
        legalLabels = range(10)
    else:
        legalLabels = range(2)

    if options.training <= 0:
        # Fix: --training is a float ratio in this variant, so report it with
        # %f rather than truncating it through %d.
        print("Training set ratio should be a positive number (you provided: %f)"
              % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.smoothing <= 0:
        print("Please provide a positive number for smoothing (you provided: %f)"
              % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)" %
                  (options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    if options.classifier == "mira":
        classifier = mira.MiraClassifier(legalLabels, options.iterations)
    elif options.classifier == "naiveBayes":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if options.autotune:
            # Fix: this was a Python 2 print statement, a SyntaxError next to
            # the print() calls used everywhere else in this function.
            print("using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("using smoothing parameter k=%f for naivebayes" %
                  options.smoothing)
    elif options.classifier == "perceptron":
        classifier = perceptron.PerceptronClassifier(legalLabels,
                                                     options.iterations)
    elif options.classifier == "knn":
        # NOTE(review): 'knn' is not among the --classifier choices above, so
        # this branch cannot be reached from the command line.
        classifier = knn.KNN(legalLabels)
    else:
        print("Unknown classifier:", options.classifier)
        print(USAGE_STRING)
        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
def readCommand( argv ):
    """Turn command-line options into a ready-to-run classification setup.

    Parses ``argv`` with optparse, echoes the selection, checks that the
    training-set size and neighbour count are positive, and builds the chosen
    classifier (naive Bayes, perceptron or k-nearest-neighbours).

    Returns a tuple ``(args, options)`` where ``args`` maps 'classifier',
    'featureFunction' and 'printImage' to the configured objects. Raises
    Exception on leftover positional arguments; exits with status 2 after
    printing the usage string when validation fails.
    """
    from optparse import OptionParser

    def usageExit(message):
        # Shared failure path: explain, show the usage text, stop with 2.
        # Messages are single strings, so print(...) emits them verbatim.
        print(message)
        print(USAGE_STRING)
        sys.exit(2)

    optionTable = [
        (('-c', '--classifier'),
         dict(help=default('The type of classifier'),
              choices=['naiveBayes', 'perceptron', 'kNN'],
              default='naiveBayes')),
        (('-d', '--data'),
         dict(help=default('Dataset to use'),
              choices=['digits', 'faces'], default='digits')),
        (('-t', '--training'),
         dict(help=default('The size of the training set'),
              default=100, type="int")),
        (('-w', '--weights'),
         dict(help=default('Whether to print weights'),
              default=False, action="store_true")),
        (('-k', '--neighbors'),
         dict(help=default("Numbers of neighbors in k-Nearest Neighbors"),
              type="int", default=3)),
        (('-i', '--iterations'),
         dict(help=default("Maximum iterations to run training"),
              default=3, type="int")),
        (('-s', '--test'),
         dict(help=default("Amount of test data to use"),
              default=TEST_SET_SIZE, type="int")),
    ]
    cli = OptionParser(USAGE_STRING)
    for flags, settings in optionTable:
        cli.add_option(*flags, **settings)

    options, otherjunk = cli.parse_args(argv)
    if otherjunk:
        raise Exception('Command line input not understood: ' + str(otherjunk))

    # Report the configuration that will be used.
    for banner in ("Doing classification",
                   "--------------------",
                   "data:\t\t" + options.data,
                   "classifier:\t\t" + options.classifier,
                   "training set size:\t" + str(options.training)):
        print(banner)

    dataset = options.data
    if dataset == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
        featureFunction = basicFeatureExtractorDigit
        legalLabels = range(10)
    elif dataset == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
        featureFunction = basicFeatureExtractorFace
        legalLabels = range(2)
    else:
        usageExit("Unknown dataset " + dataset)

    if not options.training > 0:
        usageExit("Training set size should be a positive integer (you provided: %d)" % options.training)

    if not options.neighbors > 0:
        usageExit("Neighbors for kNN should be a positive integer (you provided: %d)" % options.neighbors)

    chosen = options.classifier
    if chosen == "naiveBayes":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    elif chosen == "perceptron":
        classifier = perceptron.PerceptronClassifier(legalLabels, options.iterations)
    elif chosen == "kNN":
        classifier = kNN.kNNClassifier(legalLabels, options.neighbors)
    else:
        usageExit("Unknown classifier: " + chosen)

    args = {
        'classifier': classifier,
        'featureFunction': featureFunction,
        'printImage': printImage,
    }
    return args, options
def readCommand(argv):
    """Build the classification setup described by the command line.

    Parses ``argv`` with optparse, prints the selection, verifies that the
    training-set size and smoothing value are positive, and constructs the
    requested classifier (most-frequent, naive Bayes or GDA).

    Returns a tuple ``(args, options)`` where ``args`` maps 'classifier',
    'featureFunction' and 'printImage' to the configured objects. Raises
    Exception on leftover positional arguments; exits with status 2 after
    printing the usage string when validation fails.
    """
    from optparse import OptionParser

    def reject(reason):
        # Common exit path for invalid input: message, usage text, status 2.
        # Every message is one string, so print(...) writes it unchanged.
        print(reason)
        print(USAGE_STRING)
        sys.exit(2)

    optParser = OptionParser(USAGE_STRING)
    register = optParser.add_option
    register('-c', '--classifier',
             help=default('The type of classifier'),
             choices=['mostFrequent', 'nb', 'naiveBayes', 'GDA'],
             default='mostFrequent')
    register('-d', '--data',
             help=default('Dataset to use'),
             choices=['digits', 'faces'],
             default='digits')
    register('-t', '--training',
             help=default('The size of the training set'),
             default=450, type="int")
    register('-k', '--smoothing',
             help=default("Smoothing parameter (ignored when using --autotune)"),
             type="float", default=2.0)
    register('-a', '--autotune',
             help=default("Whether to automatically tune hyperparameters"),
             default=False, action="store_true")
    register('-i', '--iterations',
             help=default("Maximum iterations to run training"),
             default=3, type="int")

    options, otherjunk = optParser.parse_args(argv)
    if otherjunk:
        raise Exception('Command line input not understood: ' + str(otherjunk))

    # Echo what is about to run.
    print("Doing classification")
    print("--------------------")
    print("data:\t\t" + options.data)
    print("classifier:\t\t" + options.classifier)
    print("training set size:\t" + str(options.training))

    useDigits = options.data == "digits"
    if useDigits:
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
        featureFunction = basicFeatureExtractorDigit
    elif options.data == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
        featureFunction = basicFeatureExtractorFace
    else:
        reject("Unknown dataset " + options.data)

    # Ten digit classes, otherwise a two-class problem.
    legalLabels = range(10) if useDigits else range(2)

    if not options.training > 0:
        reject("Training set size should be a positive integer (you provided: %d)" % options.training)

    if not options.smoothing > 0:
        reject("Please provide a positive number for smoothing (you provided: %f)" % options.smoothing)

    kind = options.classifier
    if kind == "mostFrequent":
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif kind in ("naiveBayes", "nb"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if not options.autotune:
            print("using smoothing parameter k=%f for naivebayes" % options.smoothing)
        else:
            print("using automatic tuning for naivebayes")
            classifier.automaticTuning = True
    elif kind == "GDA":
        classifier = gaussianDiscriminantAnalysis.GaussianDiscriminantAnalysisClassifier(
            legalLabels, "GDA")
    else:
        reject("Unknown classifier: " + kind)

    args = {
        'classifier': classifier,
        'featureFunction': featureFunction,
        'printImage': printImage,
    }
    return args, options
def readCommand(argv):
    """Process the command used to run from the command line.

    Parses ``argv`` with optparse, echoes the chosen configuration, validates
    the numeric options and instantiates the requested classifier.

    Args:
        argv: list of option strings to parse. Any positional argument left
            over after option parsing is rejected.

    Returns:
        A tuple ``(args, options)``: ``args`` is a dict with the keys
        'classifier', 'featureFunction' and 'printImage'; ``options`` is the
        optparse values object.

    Raises:
        Exception: if positional arguments remain after parsing.
        SystemExit: with status 2 when an option fails validation.
    """
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option('-c', '--classifier', help=default('The type of classifier'),
                      choices=['mostFrequent', 'nb', 'naiveBayes', 'perceptron', 'knn'],
                      default='mostFrequent')
    parser.add_option('-d', '--data', help=default('Dataset to use'),
                      choices=['digits', 'faces'], default='digits')
    parser.add_option('-t', '--training', help=default('The size of the training set'),
                      default=100, type="int")
    parser.add_option('-f', '--features', help=default('Whether to use enhanced features'),
                      default=False, action="store_true")
    parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'),
                      default=False, action="store_true")
    parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"),
                      default=0, type="int")
    parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"),
                      default=1, type="int")
    parser.add_option('-w', '--weights', help=default('Whether to print weights'),
                      default=False, action="store_true")
    parser.add_option('-k', '--smoothing',
                      help=default("Smoothing parameter (ignored when using --autotune)"),
                      type="float", default=K_VALUE)
    parser.add_option('-a', '--autotune',
                      help=default("Whether to automatically tune hyperparameters"),
                      default=False, action="store_true")
    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"),
                      default=MAX_ITERATIONS, type="int")
    parser.add_option('-s', '--test', help=default("Amount of test data to use"),
                      default=TEST_SET_SIZE, type="int")
    parser.add_option('-q', '--index',
                      help=default("index of data whose predicted label and actual label you want to display"),
                      default=-1, type="int")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    print("Doing classification")
    print("--------------------")
    print("data:\t\t", options.data)
    print("classifier:\t\t", options.classifier)
    print("training set size:\t" + str(options.training))

    if options.data == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorDigit
        else:
            print('using basicFeatureExtractorDigit for digits')
            featureFunction = basicFeatureExtractorDigit
    elif options.data == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorFace
        else:
            # Fix: the message named the digit extractor although the face
            # extractor is the one being selected.
            print('using basicFeatureExtractorFace for faces')
            featureFunction = basicFeatureExtractorFace
    else:
        print("Unknown dataset", options.data)
        print(USAGE_STRING)
        sys.exit(2)

    if options.data == "digits":
        legalLabels = range(10)
    else:
        legalLabels = range(2)

    # Fix (here and below): the value was passed to print() as a second
    # argument, so the raw "%d"/"%f" placeholder was printed. Use
    # %-formatting so the number is substituted into the message.
    if options.training <= 0:
        print("Training set size should be a positive integer (you provided: %d)"
              % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.smoothing <= 0:
        print("Please provide a positive number for smoothing (you provided: %f)"
              % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)" %
                  (options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    if options.classifier == "mostFrequent":
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif options.classifier == "naiveBayes" or options.classifier == "nb":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if options.autotune:
            print("using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("using smoothing parameter k=%f for naivebayes" % options.smoothing)
    elif options.classifier == "perceptron":
        classifier = perceptron.PerceptronClassifier(legalLabels, options.iterations)
    elif options.classifier == 'knn':
        # NOTE(review): the --smoothing value is passed as the second
        # constructor argument of both kNN classifiers; presumably it doubles
        # as k -- confirm against the knn modules.
        if options.data == "digits":
            classifier = knn.KNNClassifier(legalLabels, options.smoothing)
        else:
            classifier = knn_faces.KNNClassifierFaces(legalLabels, options.smoothing)
    else:
        print("Unknown classifier:", options.classifier)
        print(USAGE_STRING)
        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
def __init__(self, legalLabels):
    """Create the minicontest wrapper around a naive Bayes classifier.

    The wrapped classifier is built for ``legalLabels`` and has its
    ``automaticTuning`` flag switched on.
    """
    self.guess, self.type = None, "minicontest"
    # A MiraClassifier(legalLabels, 5) was previously tried here as an
    # alternative backend; naive Bayes is the one in use.
    backend = naiveBayes.NaiveBayesClassifier(legalLabels)
    self.classifier = backend
    backend.automaticTuning = True
def _countCorrect(guesses, labels):
    """Return how many guesses equal the true label at the same position."""
    return sum(1 for guess, label in zip(guesses, labels) if guess == label)


def _printAccuracy(correct, total):
    """Print the '<correct> correct out of <total> (<percent>%).' line."""
    print(str(correct), ("correct out of " + str(total) + " (%.1f%%).") %
          (100.0 * correct / total))


def _trialStdDev(values):
    """Return the population standard deviation of the per-trial values."""
    mean = sum(values) / float(len(values))
    return math.sqrt(
        sum((value - mean) ** 2 for value in values) / len(values))


def _buildExperimentClassifier(classifierName, legalLabels):
    """Return a fresh, untrained classifier for a single trial."""
    if classifierName == "nb":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(2.0)
        return classifier
    return perceptron.PerceptronClassifier(legalLabels, 3)


def _runExperimentTrial(classifierName, data, numTraining):
    """Train on a random subset and score it on the validation and test sets.

    Returns a (correct test guesses, CPU seconds) pair.  The timer covers the
    whole trial: sampling, training-feature extraction, training, validation
    and testing.
    """
    startTime = time.process_time()
    classifier = _buildExperimentClassifier(classifierName,
                                            data["legalLabels"])

    # Draw distinct indices so that "100%" really uses every training item.
    chosen = random.sample(range(data["poolSize"]), numTraining)
    extract = data["featureFunction"]
    trainingLabels = [data["trainingLabels"][j] for j in chosen]
    trainingData = [extract(data["rawTraining"][j]) for j in chosen]

    print("Training...")
    classifier.train(trainingData, trainingLabels, data["validationData"],
                     data["validationLabels"])

    print("Validating...")
    guesses = classifier.classify(data["validationData"])
    _printAccuracy(_countCorrect(guesses, data["validationLabels"]),
                   len(data["validationLabels"]))

    print("Testing...")
    guesses = classifier.classify(data["testData"])
    correct = _countCorrect(guesses, data["testLabels"])
    _printAccuracy(correct, len(data["testLabels"]))

    return correct, time.process_time() - startTime


def _printExperimentReport(title, stepResults, numTest):
    """Print average accuracy, spread and CPU time for one experiment.

    stepResults holds one (correctCounts, cpuSeconds) pair of per-trial lists
    for every training-set fraction, ordered from the smallest fraction to the
    largest.  numTest is the size of the test set used as the denominator.
    """
    numSteps = len(stepResults)
    percents = ["%d%%" % (100 * (step + 1) // numSteps)
                for step in range(numSteps)]

    print("Average Correct Guesses for " + title +
          " Based on Percentage of Training Data Used")
    print(", ".join(
        "%s %.1f/%d" % (percent, sum(counts) / float(len(counts)), numTest)
        for percent, (counts, _) in zip(percents, stepResults)))

    print("Standard Deviation for " + title +
          " Based on Percentage of Training Data Used")
    print(", ".join(
        "%s %.2f" % (percent, _trialStdDev(counts))
        for percent, (counts, _) in zip(percents, stepResults)))

    print("Average Time to Complete Each Iteration Based on Percentage of "
          "Training Data Used In Seconds")
    print(", ".join(
        "%s %.2f seconds" % (percent, sum(times) / float(len(times)))
        for percent, (_, times) in zip(percents, stepResults)))


def runClassifier():
    """Benchmark naive Bayes and perceptron on the digit and face data.

    For each classifier/dataset pair the training set is grown from 10% to
    100% of the loaded pool in ten steps.  Every step runs three independent
    trials; each trial builds a new classifier, trains it on a random subset
    drawn without replacement, and is timed on its own with
    time.process_time().  After all runs, the per-step average number of
    correct test guesses, the population standard deviation of that number
    across the trials, and the average CPU time per trial are printed.

    Only the basic feature extractors are used.  Takes no arguments and
    returns None; all results go to standard output.
    """
    numTrials = 3
    numSteps = 10

    # Face data
    rawFaceTrainingData = samples.loadDataFile(
        "facedata/facedatatrain", 450, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
    faceTrainingLabels = samples.loadLabelsFile(
        "facedata/facedatatrainlabels", 450)
    rawFaceValidationData = samples.loadDataFile(
        "facedata/facedatatrain", 300, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
    faceValidationLabels = samples.loadLabelsFile(
        "facedata/facedatatrainlabels", 300)
    rawFaceTestData = samples.loadDataFile(
        "facedata/facedatatest", 149, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
    testFaceLabels = samples.loadLabelsFile(
        "facedata/facedatatestlabels", 149)

    # Digit data
    rawDigitTrainingData = samples.loadDataFile(
        "digitdata/trainingimages", 5000, DIGIT_DATUM_WIDTH,
        DIGIT_DATUM_HEIGHT)
    digitTrainingLabels = samples.loadLabelsFile(
        "digitdata/traininglabels", 5000)
    rawDigitValidationData = samples.loadDataFile(
        "digitdata/validationimages", 1000, DIGIT_DATUM_WIDTH,
        DIGIT_DATUM_HEIGHT)
    digitValidationLabels = samples.loadLabelsFile(
        "digitdata/validationlabels", 1000)
    rawDigitTestData = samples.loadDataFile(
        "digitdata/testimages", 1000, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    testDigitLabels = samples.loadLabelsFile("digitdata/testlabels", 1000)

    datasets = {
        "digits": {
            "legalLabels": range(10),
            "featureFunction": basicFeatureExtractorDigit,
            "rawTraining": rawDigitTrainingData,
            "trainingLabels": digitTrainingLabels,
            "rawValidation": rawDigitValidationData,
            "validationLabels": digitValidationLabels,
            "rawTest": rawDigitTestData,
            "testLabels": testDigitLabels,
        },
        "faces": {
            "legalLabels": range(2),
            "featureFunction": basicFeatureExtractorFace,
            "rawTraining": rawFaceTrainingData,
            "trainingLabels": faceTrainingLabels,
            "rawValidation": rawFaceValidationData,
            "validationLabels": faceValidationLabels,
            "rawTest": rawFaceTestData,
            "testLabels": testFaceLabels,
        },
    }

    # Validation and test features do not depend on the trial, so they are
    # extracted once per dataset instead of once per trial.
    for data in datasets.values():
        extract = data["featureFunction"]
        data["validationData"] = [extract(d) for d in data["rawValidation"]]
        data["testData"] = [extract(d) for d in data["rawTest"]]
        data["poolSize"] = min(len(data["rawTraining"]),
                               len(data["trainingLabels"]))

    # (classifierName, dataName) -> one (correctCounts, cpuSeconds) per step
    results = {}
    for dataName in ("digits", "faces"):
        data = datasets[dataName]
        for classifierName in ("nb", "perceptron"):
            stepResults = []
            for step in range(1, numSteps + 1):
                numTraining = data["poolSize"] * step // numSteps

                print("Doing classification")
                print("--------------------")
                print("data:\t\t" + dataName)
                print("classifier:\t\t " + classifierName)
                print("using basic features")
                print("training set size:\t" + str(numTraining))
                print("Extracting features...")

                correctCounts = []
                cpuSeconds = []
                for trial in range(numTrials):
                    print("Iteration %d" % trial)
                    correct, elapsed = _runExperimentTrial(
                        classifierName, data, numTraining)
                    correctCounts.append(correct)
                    cpuSeconds.append(elapsed)
                stepResults.append((correctCounts, cpuSeconds))
            results[(classifierName, dataName)] = stepResults

    for classifierName, classifierTitle in (("nb", "Naive Bayes"),
                                            ("perceptron", "Perceptron")):
        for dataName, dataTitle in (("digits", "Digits"),
                                    ("faces", "Faces")):
            _printExperimentReport(
                classifierTitle + " " + dataTitle,
                results[(classifierName, dataName)],
                len(datasets[dataName]["testLabels"]))
def readCommand(argv):
    """Parse the command line and build the objects needed for a run.

    argv is the list of command-line arguments without the program name.

    Returns an (args, options) pair: args is a dict with the keys
    'classifier', 'featureFunction' and 'printImage'; options is the
    optparse values object.

    Raises Exception when positional arguments are left over.  Invalid
    option values print the usage string and exit with status 2.
    """
    from optparse import OptionParser

    parser = OptionParser(USAGE_STRING)

    parser.add_option(
        '-c', '--classifier',
        help=default('The type of classifier'),
        choices=['mostFrequent', 'nb', 'naiveBayes', 'perceptron'],
        default='naiveBayes')
    parser.add_option(
        '-d', '--data',
        help=default('Dataset to use'),
        choices=['digits', 'faces'],
        default='digits')
    parser.add_option(
        '-t', '--training',
        help=default('The size of the training set'),
        default=100, type="int")
    parser.add_option(
        '-f', '--features',
        help=default('Whether to use enhanced features'),
        default=False, action="store_true")
    parser.add_option(
        '-o', '--odds',
        help=default('Whether to compute odds ratios'),
        default=False, action="store_true")
    parser.add_option(
        '-1', '--label1',
        help=default("First label in an odds ratio comparison"),
        default=0, type="int")
    parser.add_option(
        '-2', '--label2',
        help=default("Second label in an odds ratio comparison"),
        default=1, type="int")
    parser.add_option(
        '-w', '--weights',
        help=default('Whether to print weights'),
        default=False, action="store_true")
    parser.add_option(
        '-k', '--smoothing',
        help=default("Smoothing parameter (ignored when using --autotune)"),
        type="float", default=2.0)
    parser.add_option(
        '-a', '--autotune',
        help=default("Whether to automatically tune hyperparameters"),
        default=False, action="store_true")
    parser.add_option(
        '-i', '--iterations',
        help=default("Maximum iterations to run training"),
        default=15, type="int")
    parser.add_option(
        '-s', '--test',
        help=default("Amount of test data to use"),
        default=TEST_SET_SIZE, type="int")
    parser.add_option(
        '-n', '--analysis',
        help=default("Shows which data is wrongly predicted"),
        default=False, action="store_true")
    # Adjacent literals are joined by the parser; a backslash continuation
    # inside the string would leak into the help text.
    parser.add_option(
        '-r', '--random',
        help=default("Trains the data set using random data and "
                     "calculates averages for percent accuracy and "
                     "standard deviation"),
        default=False, action="store_true")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' +
                        str(otherjunk))
    args = {}

    # Echo the chosen configuration.
    print("Doing classification")
    print("--------------------")
    print("Data:\t\t" + options.data)
    print("Classifier:\t\t" + options.classifier)
    print("Using enhanced features?:\t" + str(options.features))
    if not options.random:
        print("Training set size:\t" + str(options.training))

    # Pick the image printer and feature extractor for the dataset.
    if options.data == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH,
                                  DIGIT_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit
    elif options.data == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH,
                                  FACE_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    else:
        # Defensive: the parser's choices already restrict --data.
        print("Unknown dataset", options.data)
        print(USAGE_STRING)
        sys.exit(2)

    if options.data == "digits":
        legalLabels = range(10)  # the ten digit labels 0..9
    else:
        legalLabels = range(2)  # two labels for the face data

    if options.training <= 0:
        print(
            "Training set size should be a positive integer (you provided: %d)"
            % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.smoothing <= 0:
        print(
            "Please provide a positive number for smoothing (you provided: %f)"
            % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)

    if options.odds:
        if (options.label1 not in legalLabels
                or options.label2 not in legalLabels):
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)"
                  % (options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    # Build the requested classifier.
    if options.classifier == "mostFrequent":
        # Accepted by --classifier, so it needs a branch here; otherwise a
        # valid choice would be reported as an unknown classifier.
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif options.classifier in ("naiveBayes", "nb"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if options.autotune:
            print("Using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("Using smoothing parameter k=%f for naivebayes" %
                  options.smoothing)
    elif options.classifier == "perceptron":
        # options.iterations (default 15) is passed as the second
        # constructor argument.
        classifier = perceptron.PerceptronClassifier(legalLabels,
                                                     options.iterations)
    else:
        print("Unknown classifier:", options.classifier)
        print(USAGE_STRING)
        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
def readCommand( argv ):
    """
    Processes the command used to run from the command line.

    argv is the list of command-line arguments without the program name.

    Returns an (args, options) pair: args is a dict with the keys
    'classifier', 'featureFunction' and 'printImage'; options is a dict of
    the parsed settings (defaults overridden by the command line).

    Unparseable options or invalid values print the usage string and exit
    with status 2; --help prints it and exits with status 0.
    """
    import getopt

    # Set default options
    options = {'classifier': 'mostfrequent',
               'data': 'digits',
               'enhancedFeatures': False,
               'train': 100,
               'odds': False,
               'class1': 1,
               'class2': 0,
               'smoothing': 1,
               'automaticTuning': False,
               'maxIterations': 3}

    args = {}  # This dictionary will hold the objects used by the main method

    # Long option names.  Each entry must be its own list element: a missing
    # comma silently concatenates two adjacent string literals into one
    # option name, which makes both real options unusable.
    commands = ['help',
                'classifier=',
                'data=',
                'train=',
                'enhancedFeatures',
                'odds',
                'class1=',
                'class2=',
                'smoothing=',
                'automaticTuning',
                'maxIterations=']
    try:
        opts = getopt.getopt(argv, "hc:d:t:fo1:2:k:ai:", commands)
    except getopt.GetoptError:
        print(USAGE_STRING)
        sys.exit(2)

    for option, value in opts[0]:
        if option in ['--help', '-h']:
            print(USAGE_STRING)
            sys.exit(0)
        elif option in ['--classifier', '-c']:
            options['classifier'] = value
        elif option in ['--data', '-d']:
            options['data'] = value
        elif option in ['--train', '-t']:
            options['train'] = int(value)
        elif option in ['--enhancedFeatures', '-f']:
            options['enhancedFeatures'] = True
        elif option in ['--odds', '-o']:
            options['odds'] = True
        elif option in ['--class1', '-1']:
            options['class1'] = int(value)
        elif option in ['--class2', '-2']:
            options['class2'] = int(value)
        elif option in ['--smoothing', '-k']:
            options['smoothing'] = float(value)
        elif option in ['--automaticTuning', '-a']:
            options['automaticTuning'] = True
        elif option in ['--maxIterations', '-i']:
            options['maxIterations'] = int(value)

    # Set up variables according to the command line input.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Doing classification")
    print("--------------------")
    print("data:\t\t" + options['data'])
    print("classifier:\t\t" + options['classifier'])
    print("using enhanced features?:\t" + str(options['enhancedFeatures']))
    print("training set size:\t" + str(options['train']))

    if options['data'] == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH,
                                  DIGIT_DATUM_HEIGHT).printImage
        if options['enhancedFeatures']:
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit
    elif options['data'] == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH,
                                  FACE_DATUM_HEIGHT).printImage
        if options['enhancedFeatures']:
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    else:
        print("Unknown dataset " + str(options['data']))
        print(USAGE_STRING)
        sys.exit(2)

    if options['data'] == "digits":
        legalLabels = range(10)
    else:
        legalLabels = range(2)

    if options['train'] <= 0:
        print("Training set size should be a positive integer "
              "(you provided: %d)" % options['train'])
        print(USAGE_STRING)
        sys.exit(2)

    if options['smoothing'] <= 0:
        print("Please provide a positive number for smoothing "
              "(you provided: %f)" % options['smoothing'])
        print(USAGE_STRING)
        sys.exit(2)

    if options['odds']:
        for className in ['class1', 'class2']:
            if options[className] not in legalLabels:
                print("Didn't provide a legal labels for the odds ratio "
                      "for %s" % className)
                print(USAGE_STRING)
                sys.exit(2)

    if options['classifier'] == "mostfrequent":
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif options['classifier'] == "naivebayes":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options['smoothing'])
        if options['automaticTuning']:
            print("using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("using smoothing parameter k=%f for naivebayes" %
                  options['smoothing'])
    elif options['classifier'] == "perceptron":
        classifier = perceptron.PerceptronClassifier(
            legalLabels, options['maxIterations'])
    elif options['classifier'] == "mira":
        classifier = mira.MiraClassifier(legalLabels,
                                         options['maxIterations'])
        if options['automaticTuning']:
            print("using automatic tuning for MIRA")
            classifier.automaticTuning = True
        else:
            print("using default C=0.001 for MIRA")
    else:
        print("Unknown classifier: " + str(options['classifier']))
        print(USAGE_STRING)
        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
if options.smoothing <= 0: print "Please provide a positive number for smoothing (you provided: %f)" % options.smoothing print USAGE_STRING sys.exit(2) if options.odds: if options.label1 not in legalLabels or options.label2 not in legalLabels: print "Didn't provide a legal labels for the odds ratio: (%d,%d)" % (options.label1, options.label2) print USAGE_STRING sys.exit(2) if(options.classifier == "mostFrequent"): classifier = mostFrequent.MostFrequentClassifier(legalLabels) elif(options.classifier == "naiveBayes" or options.classifier == "nb"): classifier = naiveBayes.NaiveBayesClassifier(legalLabels) classifier.setSmoothing(options.smoothing) if (options.autotune): print "using automatic tuning for naivebayes" classifier.automaticTuning = True else: print "using smoothing parameter k=%f for naivebayes" % options.smoothing elif(options.classifier == "perceptron"): if options.data != 'pacman': classifier = perceptron.PerceptronClassifier(legalLabels,options.iterations) else: classifier = perceptron_pacman.PerceptronClassifierPacman(legalLabels,options.iterations) elif(options.classifier == "mira"): if options.data != 'pacman': classifier = mira.MiraClassifier(legalLabels, options.iterations) if (options.autotune):
def runClassifier(args, options, legalLabels):
    """Train on 10%..100% of the training set and record the results.

    args is the dict built by readCommand ('featureFunction' and
    'classifier' are read here); options is the parsed option object
    (training, test, data, classifier, iterations, smoothing and autotune
    are read); legalLabels is the label set handed to newly built
    classifiers.

    For every tenth of options.training a classifier is trained, then scored
    on the validation and test sets.  One line per step is written to
    ./results/<classifier>_<data>.txt in the form
    "<numTraining> <trainingSeconds> <numTest> <validationCorrect>
    <testCorrect>".  Returns None.
    """
    import os

    featureFunction = args['featureFunction']
    fallbackClassifier = args['classifier']

    # Load data
    total = options.training
    numTest = options.test
    useFaces = options.data == "faces"

    if useFaces:
        rawValidationData = samples.loadDataFile(
            "facedata/facedatatrain", numTest, FACE_DATUM_WIDTH,
            FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile(
            "facedata/facedatatest", numTest, FACE_DATUM_WIDTH,
            FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            numTest)
    else:
        rawValidationData = samples.loadDataFile(
            "digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH,
            DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile(
            "digitdata/testimages", numTest, DIGIT_DATUM_WIDTH,
            DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print("Extracting features...")
    # Materialise the features: they are reused in every pass of the loop
    # below, and a lazy map object would be exhausted after the first pass.
    validationData = list(map(featureFunction, rawValidationData))
    testData = list(map(featureFunction, rawTestData))

    resultsDir = './results/'
    if not os.path.isdir(resultsDir):
        os.makedirs(resultsDir)
    resultsPath = (resultsDir + options.classifier + "_" + options.data +
                   '.txt')

    # The with-block closes the results file even if training fails midway.
    with open(resultsPath, 'w') as f_out:
        # train and classify for portions of the training data, compare
        # performance
        for step in range(1, 11):
            print("\n\nUsing %d %% of training data\n" % (step * 10))
            # Integer arithmetic avoids float rounding dropping an item.
            numTraining = (total * step) // 10

            if options.classifier in ("naiveBayes", "nb"):
                # Rebuild with the settings readCommand announced, so -k and
                # -a are honoured for every portion.
                classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
                classifier.setSmoothing(options.smoothing)
                if options.autotune:
                    classifier.automaticTuning = True
            elif options.classifier == "perceptron":
                classifier = perceptron.PerceptronClassifier(
                    legalLabels, options.iterations)
            else:
                # No rebuild rule for this classifier type: reuse the
                # instance that was passed in.
                classifier = fallbackClassifier

            if useFaces:
                rawTrainingData = samples.loadDataFile(
                    "facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH,
                    FACE_DATUM_HEIGHT)
                trainingLabels = samples.loadLabelsFile(
                    "facedata/facedatatrainlabels", numTraining)
            else:
                rawTrainingData = samples.loadDataFile(
                    "digitdata/trainingimages", numTraining,
                    DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
                trainingLabels = samples.loadLabelsFile(
                    "digitdata/traininglabels", numTraining)

            trainingData = list(map(featureFunction, rawTrainingData))

            # Conduct training and testing; only train() is timed.
            start_time = time.time()
            print("Training...")
            classifier.train(trainingData, trainingLabels, validationData,
                             validationLabels)
            exec_time = time.time() - start_time

            print("\n\nUsing " + str(numTraining) + " training images")
            print("Training took " + str(exec_time) + " seconds\n\n")

            print("Validating...")
            guesses = classifier.classify(validationData)
            val_correct = sum(
                1 for guess, label in zip(guesses, validationLabels)
                if guess == label)
            print("%s correct out of %s (%.1f%%)." %
                  (val_correct, len(validationLabels),
                   100.0 * val_correct / len(validationLabels)))

            print("Testing...")
            guesses = classifier.classify(testData)
            test_correct = sum(
                1 for guess, label in zip(guesses, testLabels)
                if guess == label)
            print("%s correct out of %s (%.1f%%)." %
                  (test_correct, len(testLabels),
                   100.0 * test_correct / len(testLabels)))

            f_out.write(
                str(numTraining) + " " + str(exec_time) + " " +
                str(numTest) + " " + str(val_correct) + " " +
                str(test_correct) + '\n')