def readCommand( argv ):
  "Processes the command used to run from the command line."
  from optparse import OptionParser
  parser = OptionParser(USAGE_STRING)

  parser.add_option('-c', '--classifier', help=default('The type of classifier'), choices=['mostFrequent', 'nb', 'naiveBayes', 'perceptron','mira'], default='perceptron')
  parser.add_option('-d', '--data', help=default('Dataset to use'), choices=['digits', 'faces'], default='digits')
  parser.add_option('-t', '--training', help=default('The size of the training set'), default=100, type="int")
  parser.add_option('-f', '--features', help=default('Whether to use enhanced features'), default=False, action="store_true")
  parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'), default=False, action="store_true")
  parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"), default=0, type="int")
  parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"), default=1, type="int")
  parser.add_option('-w', '--weights', help=default('Whether to print weights'), default=False, action="store_true")
  parser.add_option('-k', '--smoothing', help=default("Smoothing parameter (ignored when using --autotune)"), type="float", default=2.0)
  parser.add_option('-a', '--autotune', help=default("Whether to automatically tune hyperparameters"), default=False, action="store_true")
  parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"), default=3, type="int")
  parser.add_option('-s', '--test', help=default("Amount of test data to use"), default=TEST_SET_SIZE, type="int")
  parser.add_option('-n', '--analysis', help=default("Shows which data is wrongly predicted"), default=False, action="store_true")
  parser.add_option('-r', '--random', help=default("Trains the data set using random data and calculates averages for percent accuracy and standard deviation"), default=False, action="store_true")

  options, otherjunk = parser.parse_args(argv)
  if len(otherjunk) != 0: raise Exception('Command line input not understood: ' + str(otherjunk))
  args = {}

  # Set up variables according to the command line input.
  print ("Doing classification")
  print ("--------------------")
  print ("Data:\t\t" + options.data)
  print ("Classifier:\t\t" + options.classifier)
  print ("Using enhanced features?:\t" + str(options.features))
  if not options.random:
      print ("Training set size:\t" + str(options.training))
  if(options.data=="digits"):
    printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
    if (options.features):
      featureFunction = enhancedFeatureExtractorDigit
    else:
      featureFunction = basicFeatureExtractorDigit
  elif(options.data=="faces"):
    printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
    if (options.features):
      featureFunction = enhancedFeatureExtractorFace
    else:
      featureFunction = basicFeatureExtractorFace
  else:
    print ("Unknown dataset", options.data)
    print (USAGE_STRING)
    sys.exit(2)

  if(options.data=="digits"):
    legalLabels = range(10)
  else:
    legalLabels = range(2)

  if options.training <= 0:
    print ("Training set size should be a positive integer (you provided: %d)" % options.training)
    print (USAGE_STRING)
    sys.exit(2)

  if options.smoothing <= 0:
    print ("Please provide a positive number for smoothing (you provided: %f)" % options.smoothing)
    print (USAGE_STRING)
    sys.exit(2)

  if options.odds:
    if options.label1 not in legalLabels or options.label2 not in legalLabels:
      print ("Didn't provide a legal labels for the odds ratio: (%d,%d)" % (options.label1, options.label2))
      print (USAGE_STRING)
      sys.exit(2)
      
  if(options.classifier == "naiveBayes" or options.classifier == "nb"):
    classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    classifier.setSmoothing(options.smoothing)
    if (options.autotune):
        print ("Using automatic tuning for naivebayes")
        classifier.automaticTuning = True
    else:
        print ("Using smoothing parameter k=%f for naivebayes" %  options.smoothing)
  elif(options.classifier == "perceptron"):
    classifier = perceptron.PerceptronClassifier(legalLabels,options.iterations)
  elif(options.classifier == "mira"):
    classifier = mira.MiraClassifier(legalLabels, options.iterations)
    if (options.autotune):
        print ("Using automatic tuning for MIRA")
        classifier.automaticTuning = True
    else:
        print ("Using default C=0.001 for MIRA")
  else:
    print ("Unknown classifier:", options.classifier)
    print (USAGE_STRING)

    sys.exit(2)

  args['classifier'] = classifier
  args['featureFunction'] = featureFunction
  args['printImage'] = printImage

  return args, options
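
# A minimal driver sketch for how this readCommand variant is typically consumed
# (assumptions: `sys` is imported at module level and a runClassifier(args, options)
# harness exists, as in the usual dataClassifier layout):
if __name__ == '__main__':
    # Parse the command line (minus the script name) and hand the results to the harness.
    args, options = readCommand(sys.argv[1:])
    runClassifier(args, options)
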
def readCommand(argv):
    "Processes the command used to run from the command line."
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option('-c',
                      '--classifier',
                      help=default('The type of classifier'),
                      choices=[
                          'mostFrequent', 'nb', 'naiveBayes', 'perceptron',
                          'mira', 'minicontest'
                      ],
                      default='mostFrequent')
    parser.add_option('-d',
                      '--data',
                      help=default('Dataset to use'),
                      choices=['digits', 'faces', 'pacman'],
                      default='digits')
    parser.add_option('-t',
                      '--training',
                      help=default('The size of the training set'),
                      default=100,
                      type="int")
    parser.add_option('-f',
                      '--features',
                      help=default('Whether to use enhanced features'),
                      default=False,
                      action="store_true")
    parser.add_option('-o',
                      '--odds',
                      help=default('Whether to compute odds ratios'),
                      default=False,
                      action="store_true")
    parser.add_option('-1',
                      '--label1',
                      help=default("First label in an odds ratio comparison"),
                      default=0,
                      type="int")
    parser.add_option('-2',
                      '--label2',
                      help=default("Second label in an odds ratio comparison"),
                      default=1,
                      type="int")
    parser.add_option('-w',
                      '--weights',
                      help=default('Whether to print weights'),
                      default=False,
                      action="store_true")
    parser.add_option(
        '-k',
        '--smoothing',
        help=default("Smoothing parameter (ignored when using --autotune)"),
        type="float",
        default=2.0)
    parser.add_option(
        '-a',
        '--autotune',
        help=default("Whether to automatically tune hyperparameters"),
        default=False,
        action="store_true")
    parser.add_option('-i',
                      '--iterations',
                      help=default("Maximum iterations to run training"),
                      default=3,
                      type="int")
    parser.add_option('-s',
                      '--test',
                      help=default("Amount of test data to use"),
                      default=TEST_SET_SIZE,
                      type="int")
    parser.add_option('-g',
                      '--agentToClone',
                      help=default("Pacman agent to copy"),
                      default=None,
                      type="str")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    print "Doing classification"
    print "--------------------"
    print "data:\t\t" + options.data
    print "classifier:\t\t" + options.classifier
    if not options.classifier == 'minicontest':
        print "using enhanced features?:\t" + str(options.features)
    else:
        print "using minicontest feature extractor"
    print "training set size:\t" + str(options.training)
    if (options.data == "digits"):
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH,
                                  DIGIT_DATUM_HEIGHT).printImage
        if (options.features):
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit
        if (options.classifier == 'minicontest'):
            featureFunction = contestFeatureExtractorDigit
    elif (options.data == "faces"):
        printImage = ImagePrinter(FACE_DATUM_WIDTH,
                                  FACE_DATUM_HEIGHT).printImage
        if (options.features):
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    elif (options.data == "pacman"):
        printImage = None
        if (options.features):
            featureFunction = enhancedFeatureExtractorPacman
        else:
            featureFunction = basicFeatureExtractorPacman
    else:
        print "Unknown dataset", options.data
        print USAGE_STRING
        sys.exit(2)

    if (options.data == "digits"):
        legalLabels = range(10)
    else:
        legalLabels = ['Stop', 'West', 'East', 'North', 'South']

    if options.training <= 0:
        print "Training set size should be a positive integer (you provided: %d)" % options.training
        print USAGE_STRING
        sys.exit(2)

    if options.smoothing <= 0:
        print "Please provide a positive number for smoothing (you provided: %f)" % options.smoothing
        print USAGE_STRING
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print "Didn't provide a legal labels for the odds ratio: (%d,%d)" % (
                options.label1, options.label2)
            print USAGE_STRING
            sys.exit(2)

    if (options.classifier == "mostFrequent"):
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif (options.classifier == "naiveBayes" or options.classifier == "nb"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if (options.autotune):
            print "using automatic tuning for naivebayes"
            classifier.automaticTuning = True
        else:
            print "using smoothing parameter k=%f for naivebayes" % options.smoothing
    elif (options.classifier == "perceptron"):
        if options.data != 'pacman':
            classifier = perceptron.PerceptronClassifier(
                legalLabels, options.iterations)
        else:
            classifier = perceptron_pacman.PerceptronClassifierPacman(
                legalLabels, options.iterations)
    elif (options.classifier == "mira"):
        if options.data != 'pacman':
            classifier = mira.MiraClassifier(legalLabels, options.iterations)
        if (options.autotune):
            print "using automatic tuning for MIRA"
            classifier.automaticTuning = True
        else:
            print "using default C=0.001 for MIRA"
    elif (options.classifier == 'minicontest'):
        import minicontest
        classifier = minicontest.contestClassifier(legalLabels)
    else:
        print "Unknown classifier:", options.classifier
        print USAGE_STRING

        sys.exit(2)

    args['agentToClone'] = options.agentToClone

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
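
# Each readCommand variant above assumes a module-level default() helper and a
# USAGE_STRING. A minimal sketch of what they might look like (the exact usage text
# in the real file will differ):
def default(s):
    return s + ' [Default: %default]'

USAGE_STRING = """
  USAGE:      python dataClassifier.py <options>
  EXAMPLES:   (1) python dataClassifier.py
                  - trains the default classifier on the digit dataset using the
                    default 100 training examples, then tests it on the test data
              (2) python dataClassifier.py -c naiveBayes -d digits -t 1000 -f -o -1 3 -2 6 -k 2.5
                  - runs naive Bayes on 1000 training examples with enhanced digit
                    features, smoothing k=2.5, and an odds-ratio analysis of
                    label 3 vs. label 6
"""
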
Example #3
def runClassifier():
    """
  Harness code for running different classifiers on the face or digit data.
  
  This is the main function for classification, and is designed
  to be invoked from the command line (outside the Python interpreter).
  
  Usage:
    > python dataClassifier.py 
    OR
    > python dataClassifier.py <data> <classifierName>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction> <numTrainingExamples>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction> <numTrainingExamples> <odds class1 class2>
    
  For example:
    > python dataClassifier.py digits naivebayes basic 1000
    
  would run the naive Bayes classifier on 1000 training examples using the
  basicFeatureExtractor function, and then test the classifier on the test data.
  """
    print "Doing classification"
    print "--------------------"
    # Assign default values for arguments if they are not provided.
    if (len(sys.argv) == 1):
        print "No data specified; using digits."
        sys.argv.append("digits")
    if (len(sys.argv) == 2):
        print "No classifier specified; using default."
        sys.argv.append("mostfrequent")
    if (len(sys.argv) == 3):
        print "No feature extraction function specified; using default."
        sys.argv.append("basic")
    if (len(sys.argv) == 4):
        print "No training set size specified; using default."
        sys.argv.append("100")
    if (len(sys.argv) == 5):
        print "Not doing odds ratio computation."
        sys.argv.append("noodds")

    # Set up variables according to the command line input.
    print "data:\t\t" + sys.argv[1]
    print "classifier:\t\t" + sys.argv[2]
    print "feature extractor:\t" + sys.argv[3]
    print "training set size:\t" + sys.argv[4]
    if ((sys.argv[1] == "digits") and (sys.argv[3] == "basic")):
        featureFunction = basicFeatureExtractorDigit
    elif ((sys.argv[1] == "faces") and (sys.argv[3] == "basic")):
        featureFunction = basicFeatureExtractorFace
    elif ((sys.argv[1] == "digits") and (sys.argv[3] == "enhanced")):
        featureFunction = enhancedFeatureExtractorDigit
    elif ((sys.argv[1] == "faces") and (sys.argv[3] == "enhanced")):
        featureFunction = enhancedFeatureExtractorFace
    else:
        print "Unknown feature function:", sys.argv[3]
        return

    if (sys.argv[1] == "digits"):  # if digits detect
        legalLabels = range(10)
    else:  # if face detect
        legalLabels = range(2)

    if (sys.argv[2] == "mostfrequent"):
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif (sys.argv[2] == "naivebayes"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    elif (sys.argv[2] == "perceptron"):
        classifier = perceptron.PerceptronClassifier(legalLabels)
    else:
        print "Unknown classifier:", sys.argv[2]
        return

    # Load data
    numTraining = int(sys.argv[4])

    if (sys.argv[1] == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 TEST_SET_SIZE,
                                                 FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest",
                                           TEST_SET_SIZE, FACE_DATUM_WIDTH,
                                           FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 TEST_SET_SIZE,
                                                 DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages",
                                           TEST_SET_SIZE, DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels",
                                            TEST_SET_SIZE)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    util.pause()
    analysis(classifier, guesses, testLabels, rawTestData)

    # do odds ratio computation if specified at command line
    if ((sys.argv[5] == "odds") and (len(sys.argv) == 8)):
        features_class1, features_class2, features_odds = classifier.findHighOddsFeatures(
            int(sys.argv[6]), int(sys.argv[7]))
        if (sys.argv[1] == "faces"):
            printImage(features_class1, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            printImage(features_class2, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            printImage(features_odds, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        else:
            printImage(features_class1, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            printImage(features_class2, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            printImage(features_odds, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
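
# The odds-ratio branch above calls a module-level printImage(features, width, height)
# that is not shown here. A minimal, hypothetical sketch of such a helper (the real one
# presumably renders the features via the project's datum/ImagePrinter utilities):
def printImage(features, width, height):
    # Mark the (x, y) feature coordinates that are "on" and print a crude grid.
    on = set(features)
    for y in range(height):
        row = ''.join('#' if (x, y) in on else ' ' for x in range(width))
        print(row)
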
Example #4
def readCommand(argv):
    """Processes the command used to run from the command line."""
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option(
        '-r',
        '--run',
        help=default('automatically runs training and test cycle for 5 times'),
        default=False,
        action='store_true')

    parser.add_option('-c',
                      '--classifier',
                      help=default('The type of classifier'),
                      choices=['perceptron', 'naiveBayes', 'mira', 'knn'],
                      default='naiveBayes')
    parser.add_option('-d',
                      '--data',
                      help=default('Dataset to use'),
                      choices=['digits', 'faces'],
                      default='digits')
    parser.add_option('-t',
                      '--training',
                      help=default('The ratio of the training set to use'),
                      default=1.0,
                      type="float")
    parser.add_option('-f',
                      '--features',
                      help=default('Whether to use enhanced features'),
                      default=False,
                      action="store_true")
    parser.add_option('-o',
                      '--odds',
                      help=default('Whether to compute odds ratios'),
                      default=False,
                      action="store_true")
    parser.add_option('-1',
                      '--label1',
                      help=default("First label in an odds ratio comparison"),
                      default=0,
                      type="int")
    parser.add_option('-2',
                      '--label2',
                      help=default("Second label in an odds ratio comparison"),
                      default=1,
                      type="int")
    parser.add_option(
        '-k',
        '--smoothing',
        help=default("Smoothing parameter (ignored when using --autotune)"),
        type="float",
        default=2.0)
    parser.add_option(
        '-a',
        '--autotune',
        help=default("Whether to automatically tune hyperparameters"),
        default=False,
        action="store_true")
    parser.add_option('-i',
                      '--iterations',
                      help=default("Maximum iterations to run training"),
                      default=3,
                      type="int")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    print("Doing classification")
    print("--------------------")
    print("data:\t\t" + options.data)
    print("classifier:\t\t" + options.classifier)
    print("using enhanced features?:\t" + str(options.features))

    if options.data == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH,
                                  DIGIT_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit
    elif options.data == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH,
                                  FACE_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    else:
        print("Unknown dataset", options.data)
        print(USAGE_STRING)
        sys.exit(2)

    if options.data == "digits":
        legalLabels = range(10)
    else:
        legalLabels = range(2)

    if options.training <= 0:
        print(
            "Training set size should be a positive integer (you provided: %d)"
            % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.smoothing <= 0:
        print(
            "Please provide a positive number for smoothing (you provided: %f)"
            % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)" %
                  (options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    if options.classifier == "mira":
        classifier = mira.MiraClassifier(legalLabels, options.iterations)
    elif options.classifier == "naiveBayes":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if options.autotune:
            print("using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("using smoothing parameter k=%f for naivebayes" %
                  options.smoothing)
    elif options.classifier == "perceptron":
        classifier = perceptron.PerceptronClassifier(legalLabels,
                                                     options.iterations)

    elif options.classifier == "knn":
        classifier = knn.KNN(legalLabels)

    else:
        print("Unknown classifier:", options.classifier)
        print(USAGE_STRING)

        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
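
# Unlike the other variants, -t/--training is a ratio here (default 1.0). A sketch of
# how a caller might turn that ratio into an example count (hypothetical helper; the
# real harness may do this differently):
def trainingCount(ratio, datasetSize):
    # Clamp to at least one example and at most the full dataset.
    return max(1, min(datasetSize, int(round(ratio * datasetSize))))

# e.g. trainingCount(0.5, 5000) == 2500 digit training examples
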
def readCommand( argv ):
    "Processes the command used to run from the command line."
    from optparse import OptionParser    
    parser = OptionParser(USAGE_STRING)
    
    parser.add_option('-c', '--classifier', help=default('The type of classifier'), choices=['naiveBayes', 'perceptron', 'kNN'], default='naiveBayes')
    parser.add_option('-d', '--data', help=default('Dataset to use'), choices=['digits', 'faces'], default='digits')
    parser.add_option('-t', '--training', help=default('The size of the training set'), default=100, type="int")
    parser.add_option('-w', '--weights', help=default('Whether to print weights'), default=False, action="store_true")
    parser.add_option('-k', '--neighbors', help=default("Numbers of neighbors in k-Nearest Neighbors"), type="int", default=3)
    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"), default=3, type="int")
    parser.add_option('-s', '--test', help=default("Amount of test data to use"), default=TEST_SET_SIZE, type="int")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0: raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}
    
    # Set up variables according to the command line input.
    print "Doing classification"
    print "--------------------"
    print "data:\t\t" + options.data
    print "classifier:\t\t" + options.classifier
    print "training set size:\t" + str(options.training)
    if(options.data=="digits"):
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
        featureFunction = basicFeatureExtractorDigit
    elif(options.data=="faces"):
        printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
        featureFunction = basicFeatureExtractorFace            
    else:
        print "Unknown dataset", options.data
        print USAGE_STRING
        sys.exit(2)
        
    if(options.data=="digits"):
        legalLabels = range(10)
    else:
        legalLabels = range(2)
        
    if options.training <= 0:
        print "Training set size should be a positive integer (you provided: %d)" % options.training
        print USAGE_STRING
        sys.exit(2)

    if options.neighbors <= 0:
        print "Neighbors for kNN should be a positive integer (you provided: %d)" % options.neighbors
        print USAGE_STRING
        sys.exit(2)
        
    if(options.classifier == "naiveBayes"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    elif(options.classifier == "perceptron"):
        classifier = perceptron.PerceptronClassifier(legalLabels,options.iterations)
    elif(options.classifier == "kNN"):
        classifier = kNN.kNNClassifier(legalLabels,options.neighbors)
    else:
        print "Unknown classifier:", options.classifier
        print USAGE_STRING
        
        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage
    
    return args, options
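
# A minimal sketch of the train/classify interface that kNN.kNNClassifier (and the
# other classifiers built above) are expected to expose. The similarity measure and
# data layout here are assumptions, not the real module:
class KNNSketch:
    def __init__(self, legalLabels, k):
        self.legalLabels = legalLabels
        self.k = k
        self.examples = []  # list of (featureDict, label) pairs

    def train(self, trainingData, trainingLabels, validationData, validationLabels):
        self.examples = list(zip(trainingData, trainingLabels))

    def classify(self, data):
        from collections import Counter
        guesses = []
        for datum in data:
            # rank stored examples by overlap of "on" features (a crude similarity)
            ranked = sorted(self.examples,
                            key=lambda ex: -sum(datum[f] * ex[0].get(f, 0) for f in datum))
            votes = Counter(label for _, label in ranked[:self.k])
            guesses.append(votes.most_common(1)[0][0])
        return guesses
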
def readCommand(argv):
    "Processes the command used to run from the command line."
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option('-c',
                      '--classifier',
                      help=default('The type of classifier'),
                      choices=['mostFrequent', 'nb', 'naiveBayes', 'GDA'],
                      default='mostFrequent')
    parser.add_option('-d',
                      '--data',
                      help=default('Dataset to use'),
                      choices=['digits', 'faces'],
                      default='digits')
    parser.add_option('-t',
                      '--training',
                      help=default('The size of the training set'),
                      default=450,
                      type="int")
    parser.add_option(
        '-k',
        '--smoothing',
        help=default("Smoothing parameter (ignored when using --autotune)"),
        type="float",
        default=2.0)
    parser.add_option(
        '-a',
        '--autotune',
        help=default("Whether to automatically tune hyperparameters"),
        default=False,
        action="store_true")
    parser.add_option('-i',
                      '--iterations',
                      help=default("Maximum iterations to run training"),
                      default=3,
                      type="int")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    print "Doing classification"
    print "--------------------"
    print "data:\t\t" + options.data
    print "classifier:\t\t" + options.classifier
    print "training set size:\t" + str(options.training)
    if (options.data == "digits"):
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH,
                                  DIGIT_DATUM_HEIGHT).printImage
        featureFunction = basicFeatureExtractorDigit
    elif (options.data == "faces"):
        printImage = ImagePrinter(FACE_DATUM_WIDTH,
                                  FACE_DATUM_HEIGHT).printImage
        featureFunction = basicFeatureExtractorFace
    else:
        print "Unknown dataset", options.data
        print USAGE_STRING
        sys.exit(2)

    if (options.data == "digits"):
        legalLabels = range(10)
    else:
        legalLabels = range(2)

    if options.training <= 0:
        print "Training set size should be a positive integer (you provided: %d)" % options.training
        print USAGE_STRING
        sys.exit(2)

    if options.smoothing <= 0:
        print "Please provide a positive number for smoothing (you provided: %f)" % options.smoothing
        print USAGE_STRING
        sys.exit(2)

    if (options.classifier == "mostFrequent"):
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif (options.classifier == "naiveBayes" or options.classifier == "nb"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if (options.autotune):
            print "using automatic tuning for naivebayes"
            classifier.automaticTuning = True
        else:
            print "using smoothing parameter k=%f for naivebayes" % options.smoothing
    elif (options.classifier == "GDA"):
        classifier = gaussianDiscriminantAnalysis.GaussianDiscriminantAnalysisClassifier(
            legalLabels, "GDA")
    else:
        print "Unknown classifier:", options.classifier
        print USAGE_STRING

        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
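
# Example invocations accepted by this GDA-capable variant (assuming the usual
# dataClassifier.py script name):
#   python dataClassifier.py -c GDA -d digits -t 450
#   python dataClassifier.py -c nb -d faces -k 0.5 -i 5
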
Example #7
def readCommand(argv):
    "Processes the command used to run from the command line."
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option(
        '-c',
        '--classifier',
        help=default('The type of classifier'),
        choices=['mostFrequent', 'nb', 'naiveBayes', 'perceptron', 'knn'],
        default='mostFrequent')
    parser.add_option('-d',
                      '--data',
                      help=default('Dataset to use'),
                      choices=['digits', 'faces'],
                      default='digits')
    parser.add_option('-t',
                      '--training',
                      help=default('The size of the training set'),
                      default=100,
                      type="int")
    parser.add_option('-f',
                      '--features',
                      help=default('Whether to use enhanced features'),
                      default=False,
                      action="store_true")
    parser.add_option('-o',
                      '--odds',
                      help=default('Whether to compute odds ratios'),
                      default=False,
                      action="store_true")
    parser.add_option('-1',
                      '--label1',
                      help=default("First label in an odds ratio comparison"),
                      default=0,
                      type="int")
    parser.add_option('-2',
                      '--label2',
                      help=default("Second label in an odds ratio comparison"),
                      default=1,
                      type="int")
    parser.add_option('-w',
                      '--weights',
                      help=default('Whether to print weights'),
                      default=False,
                      action="store_true")
    parser.add_option(
        '-k',
        '--smoothing',
        help=default("Smoothing parameter (ignored when using --autotune)"),
        type="float",
        default=K_VALUE)
    parser.add_option(
        '-a',
        '--autotune',
        help=default("Whether to automatically tune hyperparameters"),
        default=False,
        action="store_true")
    parser.add_option('-i',
                      '--iterations',
                      help=default("Maximum iterations to run training"),
                      default=MAX_ITERATIONS,
                      type="int")
    parser.add_option('-s',
                      '--test',
                      help=default("Amount of test data to use"),
                      default=TEST_SET_SIZE,
                      type="int")
    parser.add_option(
        '-q',
        '--index',
        help=default(
            "index of data whose predicted label and actual label you want to display"
        ),
        default=-1,
        type="int")
    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    print("Doing classification")
    print("--------------------")
    print("data:\t\t", options.data)
    print("classifier:\t\t", options.classifier)
    print("training set size:\t" + str(options.training))
    if (options.data == "digits"):
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH,
                                  DIGIT_DATUM_HEIGHT).printImage
        if (options.features):
            featureFunction = enhancedFeatureExtractorDigit
        else:
            print('using basicFeatureExtractorDigit for digits')
            featureFunction = basicFeatureExtractorDigit
    elif (options.data == "faces"):
        printImage = ImagePrinter(FACE_DATUM_WIDTH,
                                  FACE_DATUM_HEIGHT).printImage
        if (options.features):
            featureFunction = enhancedFeatureExtractorFace
        else:
            print('using basicFeatureExtractorFace for faces')
            featureFunction = basicFeatureExtractorFace
    else:
        print("Unknown dataset", options.data)
        print(USAGE_STRING)
        sys.exit(2)

    if (options.data == "digits"):
        legalLabels = range(10)
    else:
        legalLabels = range(2)

    if options.training <= 0:
        print(
            "Training set size should be a positive integer (you provided: %d)"
            % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.smoothing <= 0:
        print(
            "Please provide a positive number for smoothing (you provided: %f)"
            % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)" %
                  (options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    if (options.classifier == "mostFrequent"):
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif (options.classifier == "naiveBayes" or options.classifier == "nb"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if (options.autotune):
            print("using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("using smoothing parameter k=%f for naivebayes",
                  options.smoothing)
    elif (options.classifier == "perceptron"):
        classifier = perceptron.PerceptronClassifier(legalLabels,
                                                     options.iterations)
    elif (options.classifier == 'knn'):
        if (options.data == "digits"):
            classifier = knn.KNNClassifier(legalLabels, options.smoothing)
        else:
            classifier = knn_faces.KNNClassifierFaces(legalLabels,
                                                      options.smoothing)
    else:
        print("Unknown classifier:", options.classifier)
        print(USAGE_STRING)

        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
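
# A sketch of how a harness might act on the -q/--index option added in this variant
# (hypothetical helper; guesses/testLabels/rawTestData follow the usual harness names):
def showPrediction(index, guesses, testLabels, rawTestData):
    # Print the predicted and actual label for a single test datum, plus the datum
    # itself (the Berkeley Datum class typically renders as an ASCII image).
    if 0 <= index < len(guesses):
        print("predicted label:\t" + str(guesses[index]))
        print("actual label:\t\t" + str(testLabels[index]))
        print(rawTestData[index])
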
Example #8
    def __init__(self, legalLabels):
        self.guess = None
        self.type = "minicontest"
        self.classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        # self.classifier = mira.MiraClassifier(legalLabels, 5)
        self.classifier.automaticTuning = True
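
# For context, the __init__ above belongs to a wrapper classifier. A minimal sketch of
# the enclosing class (hypothetical; only the delegation to naive Bayes is implied by
# the fragment itself):
import naiveBayes  # assumed available, as in the fragment above

class ContestClassifierSketch:
    def __init__(self, legalLabels):
        self.guess = None
        self.type = "minicontest"
        self.classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        self.classifier.automaticTuning = True

    def train(self, trainingData, trainingLabels, validationData, validationLabels):
        # Delegate training to the wrapped naive Bayes classifier.
        self.classifier.train(trainingData, trainingLabels, validationData, validationLabels)

    def classify(self, testData):
        # Delegate classification and remember the most recent guesses.
        self.guess = self.classifier.classify(testData)
        return self.guess
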
def runClassifier():
    ########################################################################################################################################
    #Edited Code
    #Store info for each iteration
    # Accumulators for each of the 10 training-set sizes:
    # indices 0-9 hold total correct guesses, indices 10-19 hold total elapsed seconds.
    nbDigits = [0] * 20
    nbFaces = [0] * 20
    perceptronDigits = [0] * 20
    perceptronFaces = [0] * 20
    # Training-set size per run: entries 0-19 are digit runs (naive Bayes, then
    # perceptron), entries 20-39 are face runs (naive Bayes, then perceptron).
    trainingCounts = {
        0: 500,
        1: 1000,
        2: 1500,
        3: 2000,
        4: 2500,
        5: 3000,
        6: 3500,
        7: 4000,
        8: 4500,
        9: 5000,
        10: 500,
        11: 1000,
        12: 1500,
        13: 2000,
        14: 2500,
        15: 3000,
        16: 3500,
        17: 4000,
        18: 4500,
        19: 5000,
        20: 45,
        21: 90,
        22: 135,
        23: 180,
        24: 225,
        25: 270,
        26: 315,
        27: 360,
        28: 405,
        29: 450,
        30: 45,
        31: 90,
        32: 135,
        33: 180,
        34: 225,
        35: 270,
        36: 315,
        37: 360,
        38: 405,
        39: 450
    }
    #FaceData
    rawFaceTrainingData = samples.loadDataFile("facedata/facedatatrain", 450,
                                               FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
    faceTrainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                450)

    rawFaceValidationData = samples.loadDataFile("facedata/facedatatrain", 300,
                                                 FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
    faceValidationLabels = samples.loadLabelsFile(
        "facedata/facedatatrainlabels", 300)

    rawFaceTestData = samples.loadDataFile("facedata/facedatatest", 149,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
    testFaceLabels = samples.loadLabelsFile("facedata/facedatatestlabels", 149)

    #DigitData
    rawDigitTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                                5000, DIGIT_DATUM_WIDTH,
                                                DIGIT_DATUM_HEIGHT)
    digitTrainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                 5000)

    rawDigitValidationData = samples.loadDataFile("digitdata/validationimages",
                                                  1000, DIGIT_DATUM_WIDTH,
                                                  DIGIT_DATUM_HEIGHT)
    digitValidationLabels = samples.loadLabelsFile(
        "digitdata/validationlabels", 1000)

    rawDigitTestData = samples.loadDataFile("digitdata/testimages", 1000,
                                            DIGIT_DATUM_WIDTH,
                                            DIGIT_DATUM_HEIGHT)
    testDigitLabels = samples.loadLabelsFile("digitdata/testlabels", 1000)

    #Automation of test for each classifier and data type
    for x in range(40):

        if x < 10:
            classifierName = "nb"
        elif x < 20:
            classifierName = "perceptron"
        elif x < 30:
            classifierName = "nb"
        else:
            classifierName = "perceptron"

        if x < 20:
            Data = "digits"
        else:
            Data = "faces"

        if (Data == "digits"):
            legalLabels = range(10)
            #featureFunction = enhancedFeatureExtractorDigit
            featureFunction = basicFeatureExtractorDigit
        else:
            legalLabels = range(2)
            #featureFunction = enhancedFeatureExtractorFace
            featureFunction = basicFeatureExtractorFace

        if (classifierName == "nb"):
            classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
            classifier.setSmoothing(2.0)

        elif (classifierName == "perceptron"):
            classifier = perceptron.PerceptronClassifier(legalLabels, 3)

        print("Doing classification")
        print("--------------------")
        print("data:\t\t" + Data)
        print("classifier:\t\t " + classifierName)
        print("using enhanced features")
        print("training set size:\t" + str(trainingCounts[x]))

        # Extract features
        print("Extracting features...")
        # Load data

        if Data == "digits":
            startTime = time.process_time()
            h = 0
            while h < 3:
                print("Iteration %d" % h)
                numTraining = trainingCounts[x]
                rawTrainingData = []
                rawTrainingLabels = []
                i = 0
                while i < numTraining:
                    # pick a random training index (sampled with replacement)
                    k = list(range(0, 5000))
                    random.shuffle(k)
                    j = k.pop()
                    rawTrainingLabels.append(digitTrainingLabels[j])
                    rawTrainingData.append(rawDigitTrainingData[j])
                    i += 1

                trainingData = list(map(featureFunction, rawTrainingData))
                validationData = list(
                    map(featureFunction, rawDigitValidationData))
                testData = list(map(featureFunction, rawDigitTestData))

                print("Training...")
                classifier.train(trainingData, rawTrainingLabels,
                                 validationData, digitValidationLabels)
                print("Validating...")
                guesses = classifier.classify(validationData)
                correct = [
                    guesses[i] == digitValidationLabels[i]
                    for i in range(len(digitValidationLabels))
                ].count(True)
                print(str(correct),
                      ("correct out of " + str(len(digitValidationLabels)) +
                       " (%.1f%%).") %
                      (100.0 * correct / len(digitValidationLabels)))
                print("Testing...")
                guesses = classifier.classify(testData)
                correct = [
                    guesses[i] == testDigitLabels[i]
                    for i in range(len(testDigitLabels))
                ].count(True)
                print(str(correct),
                      ("correct out of " + str(len(testDigitLabels)) +
                       " (%.1f%%).") %
                      (100.0 * correct / len(testDigitLabels)))
                h += 1
                #Gather correct count for each iteration and use to compute standard deviation
                if classifierName == "nb":
                    if Data == "digits":
                        nbDigits[x % 10] += correct
                        nbDigits[(x % 10) +
                                 10] += time.process_time() - startTime
                    else:
                        nbFaces[x % 10] += correct
                        nbFaces[(x % 10) +
                                10] += time.process_time() - startTime
                else:
                    if Data == "digits":
                        perceptronDigits[x % 10] += correct
                        perceptronDigits[(x % 10) +
                                         10] += time.process_time() - startTime
                    else:
                        perceptronFaces[x % 10] += correct
                        perceptronFaces[(x % 10) +
                                        10] += time.process_time() - startTime
        else:
            startTime = time.process_time()  # start timing this faces configuration
            h = 0
            while h < 3:
                print("Iteration %d" % h)
                numTraining = trainingCounts[x]
                rawTrainingData = []
                rawTrainingLabels = []
                i = 0
                while i < numTraining:
                    # pick a random training index (sampled with replacement)
                    k = list(range(0, 450))
                    random.shuffle(k)
                    j = k.pop()
                    rawTrainingLabels.append(faceTrainingLabels[j])
                    rawTrainingData.append(rawFaceTrainingData[j])
                    i += 1
                trainingData = list(map(featureFunction, rawTrainingData))
                validationData = list(
                    map(featureFunction, rawFaceValidationData))
                testData = list(map(featureFunction, rawFaceTestData))

                print("Training...")
                classifier.train(trainingData, rawTrainingLabels,
                                 validationData, faceValidationLabels)
                print("Validating...")
                guesses = classifier.classify(validationData)
                correct = [
                    guesses[i] == faceValidationLabels[i]
                    for i in range(len(faceValidationLabels))
                ].count(True)
                print(str(correct),
                      ("correct out of " + str(len(faceValidationLabels)) +
                       " (%.1f%%).") %
                      (100.0 * correct / len(faceValidationLabels)))
                print("Testing...")
                guesses = classifier.classify(testData)
                correct = [
                    guesses[i] == testFaceLabels[i]
                    for i in range(len(testFaceLabels))
                ].count(True)
                print(str(correct),
                      ("correct out of " + str(len(testFaceLabels)) +
                       " (%.1f%%).") % (100.0 * correct / len(testFaceLabels)))
                h += 1
                #Gather correct count for each iteration and use to compute standard deviation
                if classifierName == "nb":
                    if Data == "digits":
                        nbDigits[x % 10] += correct
                        nbDigits[(x % 10) +
                                 10] += time.process_time() - startTime
                    else:
                        nbFaces[x % 10] += correct
                        nbFaces[(x % 10) +
                                10] += time.process_time() - startTime
                else:
                    if Data == "digits":
                        perceptronDigits[x % 10] += correct
                        perceptronDigits[(x % 10) +
                                         10] += time.process_time() - startTime
                    else:
                        perceptronFaces[x % 10] += correct
                        perceptronFaces[(x % 10) +
                                        10] += time.process_time() - startTime

    #NAIVE BAYES DIGITS
    print(
        "Average Correct Guesses for Naive Bayes Digits Based on Percentage of TrainingData Used"
    )
    print(
        "10%% %d/1000, 20%% %d/1000, 30%% %d/1000, 40%% %d/1000, 50%% %d/1000, 60%% %d/1000, 70%% %d/1000, 80%% %d/1000, 90%% %d/1000, 100%% %d/1000"
        % (nbDigits[0] / 3, nbDigits[1] / 3, nbDigits[2] / 3, nbDigits[3] / 3,
           nbDigits[4] / 3, nbDigits[5] / 3, nbDigits[6] / 3, nbDigits[7] / 3,
           nbDigits[8] / 3, nbDigits[9] / 3))
    print(
        "Standard Deviation for Naive Bayes Digits Based on Percentage of Training Data Used"
    )
    stndDev = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    i = 0
    while i < 10:
        stndDev[i] = nbDigits[i] / 3
        stndDev[i] = nbDigits[i] - stndDev[i]
        stndDev[i] = math.pow(stndDev[i], 2)
        stndDev[i] = stndDev[i] / 1000
        stndDev[i] = math.sqrt(stndDev[i])
        i += 1
    print(
        "10%% %d, 20%% %d, 30%% %d, 40%% %d, 50%% %d, 60%% %d, 70%% %d, 80%% %d, 90%% %d, 100%% %d"
        % (stndDev[0], stndDev[1], stndDev[2], stndDev[3], stndDev[4],
           stndDev[5], stndDev[6], stndDev[7], stndDev[8], stndDev[9]))
    print(
        "Average Time to Complete Each Iteration Based on Percentage of Training Data Used In Seconds"
    )
    print(
        "10%% %d seconds, 20%% %d seconds, 30%% %d seconds, 40%% %d seconds, 50%% %d seconds, 60%% %d seconds, 70%% %d seconds, 80%% %d seconds, 90%% %d seconds, 100%% %d seconds"
        % (nbDigits[10] / 3, nbDigits[11] / 3, nbDigits[12] / 3, nbDigits[13] /
           3, nbDigits[14] / 3, nbDigits[15] / 3, nbDigits[16] / 3,
           nbDigits[17] / 3, nbDigits[18] / 3, nbDigits[19] / 3))

    #NAIVE BAYES FACES
    print(
        "Average Correct Guesses for Naive Bayes Faces Based on Percentage of TrainingData Used"
    )
    print(
        "10%% %d/149, 20%% %d/149, 30%% %d/149, 40%% %d/149, 50%% %d/149, 60%% %d/149, 70%% %d/149, 80%% %d/149, 90%% %d/149, 100%% %d/149"
        % (nbFaces[0] / 3, nbFaces[1] / 3, nbFaces[2] / 3, nbFaces[3] / 3,
           nbFaces[4] / 3, nbFaces[5] / 3, nbFaces[6] / 3, nbFaces[7] / 3,
           nbFaces[8] / 3, nbFaces[9] / 3))
    stndDev = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    i = 0
    while i < 10:
        stndDev[i] = nbFaces[i] / 3
        stndDev[i] = nbFaces[i] - stndDev[i]
        stndDev[i] = math.pow(stndDev[i], 2)
        stndDev[i] = stndDev[i] / 149
        stndDev[i] = math.sqrt(stndDev[i])
        i += 1
    print(
        "Standard Deviation for Naive Bayes Faces Based on Percentage of Training Data Used"
    )
    print(
        "10%% %d, 20%% %d, 30%% %d, 40%% %d, 50%% %d, 60%% %d, 70%% %d, 80%% %d, 90%% %d, 100%% %d"
        % (stndDev[0], stndDev[1], stndDev[2], stndDev[3], stndDev[4],
           stndDev[5], stndDev[6], stndDev[7], stndDev[8], stndDev[9]))
    print(
        "Time to Complete Each Iteration Based on Percentage of Training Data Used In Seconds"
    )
    print(
        "10%% %d, 20%% %d, 30%% %d, 40%% %d, 50%% %d, 60%% %d, 70%% %d, 80%% %d, 90%% %d, 100%% %d"
        % (nbFaces[10] / 3, nbFaces[11] / 3, nbFaces[12] / 3, nbFaces[13] / 3,
           nbFaces[14] / 3, nbFaces[15] / 3, nbFaces[16] / 3, nbFaces[17] / 3,
           nbFaces[18] / 3, nbFaces[19] / 3))

    #PERCEPTRON DIGITS
    print(
        "Average Correct Guesses for Perceptron Digits Based on Percentage of Training Data Used"
    )
    print(
        "10%% %d/1000, 20%% %d/1000, 30%% %d/1000, 40%% %d/1000, 50%% %d/1000, 60%% %d/1000, 70%% %d/1000, 80%% %d/1000, 90%% %d/1000, 100%% %d/1000"
        % (perceptronDigits[0] / 3, perceptronDigits[1] / 3,
           perceptronDigits[2] / 3, perceptronDigits[3] / 3,
           perceptronDigits[4] / 3, perceptronDigits[5] / 3,
           perceptronDigits[6] / 3, perceptronDigits[7] / 3,
           perceptronDigits[8] / 3, perceptronDigits[9] / 3))
    print(
        "Standard Deviation for Perceptron Digits Based on Percentage of Training Data Used"
    )
    stndDev = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    i = 0
    while i < 10:
        stndDev[i] = perceptronDigits[i] / 3
        stndDev[i] = perceptronDigits[i] - stndDev[i]
        stndDev[i] = math.pow(stndDev[i], 2)
        stndDev[i] = stndDev[i] / 1000
        stndDev[i] = math.sqrt(stndDev[i])
        i += 1
    print(
        "10%% %d, 20%% %d, 30%% %d, 40%% %d, 50%% %d, 60%% %d, 70%% %d, 80%% %d, 90%% %d, 100%% %d"
        % (stndDev[0], stndDev[1], stndDev[2], stndDev[3], stndDev[4],
           stndDev[5], stndDev[6], stndDev[7], stndDev[8], stndDev[9]))
    print(
        "Time to Complete Each Iteration Based on Percentage of Training Data Used In Seconds"
    )
    print(
        "10%% %d, 20%% %d, 30%% %d, 40%% %d, 50%% %d, 60%% %d, 70%% %d, 80%% %d, 90%% %d, 100%% %d"
        % (perceptronDigits[10] / 3, perceptronDigits[11] / 3,
           perceptronDigits[12] / 3, perceptronDigits[13] / 3,
           perceptronDigits[14] / 3, perceptronDigits[15] / 3,
           perceptronDigits[16] / 3, perceptronDigits[17] / 3,
           perceptronDigits[18] / 3, perceptronDigits[19] / 3))

    #PERCEPTRON FACES
    print(
        "Average Correct Guesses for Perceptron Faces Based on Percentage of Training Data Used"
    )
    print(
        "10%% %d/149, 20%% %d/149, 30%% %d/149, 40%% %d/149, 50%% %d/149, 60%% %d/149, 70%% %d/149, 80%% %d/149, 90%% %d/149, 100%% %d/149"
        % (perceptronFaces[0] / 3, perceptronFaces[1] / 3, perceptronFaces[2] /
           3, perceptronFaces[3] / 3, perceptronFaces[4] / 3,
           perceptronFaces[5] / 3, perceptronFaces[6] / 3, perceptronFaces[7] /
           3, perceptronFaces[8] / 3, perceptronFaces[9] / 3))
    print(
        "Standard Deviation for Perceptron Faces Based on Percentage of Training Data Used"
    )
    stndDev = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    i = 0
    while i < 10:
        stndDev[i] = perceptronFaces[i] / 3
        stndDev[i] = perceptronFaces[i] - stndDev[i]
        stndDev[i] = math.pow(stndDev[i], 2)
        stndDev[i] = stndDev[i] / 149
        stndDev[i] = math.sqrt(stndDev[i])
        i += 1
    print(
        "10%% %d, 20%% %d, 30%% %d, 40%% %d, 50%% %d, 60%% %d, 70%% %d, 80%% %d, 90%% %d, 100%% %d"
        % (stndDev[0], stndDev[1], stndDev[2], stndDev[3], stndDev[4],
           stndDev[5], stndDev[6], stndDev[7], stndDev[8], stndDev[9]))
    print(
        "Time to Complete Each Iteration Based on Percentage of Training Data Used In Seconds"
    )
    print(
        "10%% %d, 20%% %d, 30%% %d, 40%% %d, 50%% %d, 60%% %d, 70%% %d, 80%% %d, 90%% %d, 100%% %d"
        % (perceptronFaces[10] / 3, perceptronFaces[11] / 3,
           perceptronFaces[12] / 3, perceptronFaces[13] / 3,
           perceptronFaces[14] / 3, perceptronFaces[15] / 3,
           perceptronFaces[16] / 3, perceptronFaces[17] / 3,
           perceptronFaces[18] / 3, perceptronFaces[19] / 3))
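
# The "standard deviation" figures above are derived from accumulated sums. A more
# conventional way to report mean accuracy and its standard deviation across the three
# runs of one configuration is to keep the individual correct counts, e.g. (a sketch):
import statistics

def summarize(correctCounts, testSize):
    # correctCounts: correct guesses from each repeated run of one configuration
    accuracies = [100.0 * c / testSize for c in correctCounts]
    return statistics.mean(accuracies), statistics.pstdev(accuracies)

# Example: three digit runs with 870, 880 and 875 correct out of 1000 test images
meanAcc, stdAcc = summarize([870, 880, 875], 1000)
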
Example #10
def readCommand(argv):
    "Processes the command used to run from the command line."
    from optparse import OptionParser  # OptionParser is a powerful tool for parsing command-line options.
    parser = OptionParser(USAGE_STRING)
    # parser.add_option('-f', '--features', help=default('Whether to use enhanced features'), default=False, action="store_true")
    # -f and --features name the same option; either form can be used on the command line.
    # <script> -h prints the help text registered for each option.
    # default=False makes options.features False when the flag is absent; when the flag is
    # present, action="store_true" sets it to True.

    parser.add_option(
        '-c',
        '--classifier',
        help=default('The type of classifier'),
        choices=['mostFrequent', 'nb', 'naiveBayes', 'perceptron'],
        default='naiveBayes')
    parser.add_option('-d',
                      '--data',
                      help=default('Dataset to use'),
                      choices=['digits', 'faces'],
                      default='digits')
    parser.add_option('-t',
                      '--training',
                      help=default('The size of the training set'),
                      default=100,
                      type="int")
    parser.add_option('-f',
                      '--features',
                      help=default('Whether to use enhanced features'),
                      default=False,
                      action="store_true")
    parser.add_option('-o',
                      '--odds',
                      help=default('Whether to compute odds ratios'),
                      default=False,
                      action="store_true")
    parser.add_option('-1',
                      '--label1',
                      help=default("First label in an odds ratio comparison"),
                      default=0,
                      type="int")
    parser.add_option('-2',
                      '--label2',
                      help=default("Second label in an odds ratio comparison"),
                      default=1,
                      type="int")
    parser.add_option('-w',
                      '--weights',
                      help=default('Whether to print weights'),
                      default=False,
                      action="store_true")
    parser.add_option(
        '-k',
        '--smoothing',
        help=default("Smoothing parameter (ignored when using --autotune)"),
        type="float",
        default=2.0)
    parser.add_option(
        '-a',
        '--autotune',
        help=default("Whether to automatically tune hyperparameters"),
        default=False,
        action="store_true")
    parser.add_option('-i',
                      '--iterations',
                      help=default("Maximum iterations to run training"),
                      default=15,
                      type="int")
    parser.add_option('-s',
                      '--test',
                      help=default("Amount of test data to use"),
                      default=TEST_SET_SIZE,
                      type="int")
    parser.add_option('-n',
                      '--analysis',
                      help=default("Shows which data is wrongly predicted"),
                      default=False,
                      action="store_true")
    parser.add_option('-r',
                      '--random',
                      help=default("Trains the data set using random data and "
                                   "calculates averages for percent accuracy "
                                   "and standard deviation"),
                      default=False,
                      action="store_true")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}  # dictionary that will collect the objects built from the command-line input

    # Set up variables according to the command-line input.
    print("Doing classification")
    print("--------------------")
    print("Data:\t\t" + options.data)
    print("Classifier:\t\t" + options.classifier)
    print("Using enhanced features?:\t" + str(options.features))
    if not options.random:
        print("Training set size:\t" + str(options.training))

    if (options.data == "digits"):
        printImage = ImagePrinter(
            DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT
        ).printImage  #DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT are global variables

        if (options.features):
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit

    elif (options.data == "faces"):
        printImage = ImagePrinter(
            FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT
        ).printImage  # FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT are global variables;
        # an ImagePrinter object is created and its printImage method kept for later use.
        # Choose between enhancedFeatureExtractorFace and basicFeatureExtractorFace.
        if (options.features):
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    else:  # neither 'digits' nor 'faces' was given on the command line
        print("Unknown dataset", options.data)
        print(USAGE_STRING)
        sys.exit(2)

    if (options.data == "digits"):
        legalLabels = range(10)  #0,1,2,3,4,5,6,7,8,9
    else:
        legalLabels = range(2)  #face or not face

    # Sanity check; not hit in our runs, since the training size is kept positive.
    if options.training <= 0:
        print(
            "Training set size should be a positive integer (you provided: %d)"
            % options.training)
        print(USAGE_STRING)
        sys.exit(2)
    # Sanity check; not hit in our runs, since smoothing is not adjusted here.
    if options.smoothing <= 0:
        print(
            "Please provide a positive number for smoothing (you provided: %f)"
            % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)
    # Sanity check; not hit in our runs, since odds ratios are not used here.
    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)" %
                  (options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    # Select the classifier requested on the command line.
    if (options.classifier == "naiveBayes" or options.classifier == "nb"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if (options.autotune):
            print("Using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("Using smoothing parameter k=%f for naivebayes" %
                  options.smoothing)

    elif (options.classifier == "perceptron"):
        classifier = perceptron.PerceptronClassifier(legalLabels,
                                                     options.iterations)
        # Create a PerceptronClassifier, passing legalLabels and options.iterations
        # (which defaults to 15 above) as the maximum number of training iterations.

    else:
        print("Unknown classifier:", options.classifier)
        print(USAGE_STRING)

        sys.exit(2)

    args['classifier'] = classifier  # store the chosen classifier under the key 'classifier'
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
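
The (args, options) pair returned above is what the rest of the pipeline consumes. A minimal,
hypothetical driver sketch (the __main__ block and the example command line are assumptions,
not part of the example itself):

if __name__ == '__main__':
    # e.g.  python <script>.py -c naiveBayes -d digits -t 100 -a
    args, options = readCommand(sys.argv[1:])
    classifier = args['classifier']            # classifier object selected above
    featureFunction = args['featureFunction']  # basic or enhanced feature extractor
    printImage = args['printImage']            # helper for visualising datum images
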
def readCommand( argv ):
  """
  Processes the command used to run from the command line.
  """
  import getopt

  # Set default options
  options = {'classifier': 'mostfrequent', 
             'data': 'digits', 
             'enhancedFeatures': False,
             'train': 100,
             'odds': False,
             'class1': 1,
             'class2': 0,
             'smoothing': 1,
             'automaticTuning' : False,
             'maxIterations': 3}
             
  args = {} # This dictionary will hold the objects used by the main method
  
  # Read input from the command line
  commands = ['help',
              'classifier=',
              'data=',
              'train=',
              'enhancedFeatures',
              'odds',
              'class1=',
              'class2=',
              'smoothing=',
              'automaticTuning',
              'maxIterations=']
  try:
    opts = getopt.getopt( argv, "hc:d:t:fo1:2:k:ai:", commands )
  except getopt.GetoptError:
    print USAGE_STRING
    sys.exit( 2 )
    
  for option, value in opts[0]:
    if option in ['--help', '-h']:
      print USAGE_STRING
      sys.exit( 0 )
    if option in ['--classifier', '-c']:
      options['classifier'] = value
    if option in ['--data', '-d']:
      options['data'] = value
    if option in ['--train', '-t']:
      options['train'] = int(value)
    if option in ['--enhancedFeatures', '-f']:
      options['enhancedFeatures'] = True
    if option in ['--odds', '-o']:
      options['odds'] = True
    if option in ['--class1', '-1']:
      options['class1'] = int(value)
    if option in ['--class2', '-2']:
      options['class2'] = int(value)
    if option in ['--smoothing', '-k']:
      options['smoothing'] = float( value )
    if option in ['--automaticTuning', '-a']:
      options['automaticTuning'] = True
    if option in ['--maxIterations', '-i']:
      options['maxIterations'] = int(value)
    
  # Set up variables according to the command line input.
  print "Doing classification"
  print "--------------------"
  print "data:\t\t" + options['data']
  print "classifier:\t\t" + options['classifier']
  print "using enhanced features?:\t" + str(options['enhancedFeatures'])
  print "training set size:\t" + str(options['train'])
  if(options['data']=="digits"):
    printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
    if (options['enhancedFeatures']):
      featureFunction = enhancedFeatureExtractorDigit
    else:
      featureFunction = basicFeatureExtractorDigit
  elif(options['data']=="faces"):
    printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
    if (options['enhancedFeatures']):
      featureFunction = enhancedFeatureExtractorFace
    else:
      featureFunction = basicFeatureExtractorFace      
  else:
    print "Unknown dataset", options['data']
    print USAGE_STRING
    sys.exit(2)
    
  if(options['data']=="digits"):
    legalLabels = range(10)
  else:
    legalLabels = range(2)
    
  if options['train'] <= 0:
    print "Training set size should be a positive integer (you provided: %d)" % options['train']
    print USAGE_STRING
    sys.exit(2)
    
  if options['smoothing'] <= 0:
    print "Please provide a positive number for smoothing (you provided: %f)" % options['smoothing']
    print USAGE_STRING
    sys.exit(2)
    
  if options['odds']:
    for className in ['class1','class2']:
      if options[className] not in legalLabels:
        print "Didn't provide a legal labels for the odds ratio for %s" % className
        print USAGE_STRING
        sys.exit(2)

  if(options['classifier'] == "mostfrequent"):
    classifier = mostFrequent.MostFrequentClassifier(legalLabels)
  elif(options['classifier'] == "naivebayes"):
    classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    classifier.setSmoothing(options['smoothing'])
    if (options['automaticTuning']):
        print "using automatic tuning for naivebayes"
        classifier.automaticTuning = True
    else:
        print "using smoothing parameter k=%f for naivebayes" %  options['smoothing']
  elif(options['classifier'] == "perceptron"):
    classifier = perceptron.PerceptronClassifier(legalLabels,options['maxIterations'])
  elif(options['classifier'] == "mira"):
    classifier = mira.MiraClassifier(legalLabels, options['maxIterations'])
    if (options['automaticTuning']):
        print "using automatic tuning for MIRA"
        classifier.automaticTuning = True
    else:
        print "using default C=0.001 for MIRA"
  else:
    print "Unknown classifier:", options['classifier']
    print USAGE_STRING
    sys.exit(2)

  args['classifier'] = classifier
  args['featureFunction'] = featureFunction
  args['printImage'] = printImage
  
  return args, options
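
In the getopt variant above, the short-option string "hc:d:t:fo1:2:k:ai:" encodes which flags
take a value: a trailing colon means the option expects an argument. A small, self-contained
sketch of the same parsing call (an added illustration, not part of the example):

import getopt

opts, leftover = getopt.getopt(['-c', 'naivebayes', '-t', '50', '-f'],
                               "hc:d:t:fo1:2:k:ai:",
                               ['classifier=', 'train=', 'enhancedFeatures'])
print(opts)      # [('-c', 'naivebayes'), ('-t', '50'), ('-f', '')]
print(leftover)  # []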
Example #12
    
    if options.smoothing <= 0:
        print "Please provide a positive number for smoothing (you provided: %f)" % options.smoothing
        print USAGE_STRING
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print "Didn't provide legal labels for the odds ratio: (%d,%d)" % (options.label1, options.label2)
            print USAGE_STRING
            sys.exit(2)

    if(options.classifier == "mostFrequent"):
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif(options.classifier == "naiveBayes" or options.classifier == "nb"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if (options.autotune):
            print "using automatic tuning for naivebayes"
            classifier.automaticTuning = True
        else:
            print "using smoothing parameter k=%f for naivebayes" % options.smoothing
    elif(options.classifier == "perceptron"):
        if options.data != 'pacman':
            classifier = perceptron.PerceptronClassifier(legalLabels, options.iterations)
        else:
            classifier = perceptron_pacman.PerceptronClassifierPacman(legalLabels, options.iterations)
    elif(options.classifier == "mira"):
        if options.data != 'pacman':
            classifier = mira.MiraClassifier(legalLabels, options.iterations)
        if (options.autotune):
def runClassifier(args, options, legalLabels):

    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    if (options.data == "faces"):

        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 numTest, FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            numTest)
    else:

        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print "Extracting features..."

    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    total = numTraining

    f_out = open(
        './results/' + options.classifier + "_" + options.data + '.txt', 'w')
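    # Note (added): open() does not create the './results' directory; it must already exist,
    # otherwise this call raises IOError.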

    # train and classify for portions of the training data, compare performance
    for i in range(1, 11):

        print "\n\nUsing", i * 10, "% of training data\n"

        multiplier = i / 10.0
        numTraining = int(total * multiplier)

        if (options.classifier == "naiveBayes"):
            classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        elif (options.classifier == "perceptron"):
            classifier = perceptron.PerceptronClassifier(
                legalLabels, options.iterations)

        if options.data == "faces":
            rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                                   numTraining,
                                                   FACE_DATUM_WIDTH,
                                                   FACE_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile(
                "facedata/facedatatrainlabels", numTraining)
        else:
            rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                                   numTraining,
                                                   DIGIT_DATUM_WIDTH,
                                                   DIGIT_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                    numTraining)

        trainingData = map(featureFunction, rawTrainingData)
        # Conduct training and testing

        start_time = time.time()

        print "Training..."
        classifier.train(trainingData, trainingLabels, validationData,
                         validationLabels)

        end_time = time.time()
        exec_time = end_time - start_time

        print "\n\nUsing " + str(numTraining) + " training images"
        print "Training took " + str(exec_time) + " seconds\n\n"

        print "Validating..."
        guesses = classifier.classify(validationData)
        correct = [
            guesses[i] == validationLabels[i]
            for i in range(len(validationLabels))
        ].count(True)
        print str(correct), ("correct out of " + str(len(validationLabels)) +
                             " (%.1f%%).") % (100.0 * correct /
                                              len(validationLabels))
        val_correct = correct
        print "Testing..."
        guesses = classifier.classify(testData)
        correct = [
            guesses[i] == testLabels[i] for i in range(len(testLabels))
        ].count(True)
        print str(correct), ("correct out of " + str(len(testLabels)) +
                             " (%.1f%%).") % (100.0 * correct /
                                              len(testLabels))
        test_correct = correct
        # analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

        f_out.write(
            str(numTraining) + " " + str(exec_time) + " " + str(numTest) +
            " " + str(val_correct) + " " + str(test_correct) + '\n')

    f_out.close()
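
Each line written to the results file has the form "numTraining exec_time numTest val_correct
test_correct". A small sketch for reading such a file back and printing test accuracy per
training-set size (an added illustration; the file name is only an example):

with open('./results/perceptron_digits.txt') as f:
    for line in f:
        numTraining, execTime, numTest, valCorrect, testCorrect = line.split()
        accuracy = 100.0 * int(testCorrect) / int(numTest)
        print("%s training images: %.1f%% test accuracy (%.1fs to train)"
              % (numTraining, accuracy, float(execTime)))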