def monitorCondor(args, suffix, user='sbarrett', pollInterval=20):
    """Submit one Condor job per student and resubmit until all save files exist.

    For every student index the expected save file is obtained from
    ``parseArgs(addArgs(args, suffix, studentInd))``.  The function then polls
    ``condor_q`` and, for each student whose job id no longer appears in the
    queue listing, either marks the student finished (save file exists) or
    submits the job again.

    Args:
        args: list of argument strings forwarded to ``addArgs`` and ``submit``.
        suffix: forwarded unchanged to ``addArgs`` and ``submit``.
        user: user name passed to ``condor_q``.
        pollInterval: seconds to sleep between two polls of the queue.
    """
    numStudents = len(getUniqueStudents())
    # -1 marks a student for which nothing has been submitted yet
    jobs = [-1] * numStudents
    # must be a real list: entries are removed as students finish
    unfinished = list(range(numStudents))
    saveFiles = []
    for studentInd in range(numStudents):
        options, _ = parseArgs(addArgs(args, suffix, studentInd))
        saveFiles.append(options.saveFile)

    while unfinished:
        # text mode so the listing can be searched with str on any Python
        p = subprocess.Popen(['condor_q', user], stdout=subprocess.PIPE,
                             universal_newlines=True)
        out, _ = p.communicate()
        needToRun = []
        for studentInd in list(unfinished):
            job = jobs[studentInd]
            # the -1 placeholder is never looked up in the listing, where
            # the text "-1" could match unrelated output
            if job != -1 and str(job) in out:
                continue  # job id still listed by condor_q
            if os.path.exists(saveFiles[studentInd]):
                unfinished.remove(studentInd)
            else:
                needToRun.append(studentInd)
        # submit new jobs as needed
        for studentInd in needToRun:
            jobs[studentInd] = submit([str(studentInd)] + args, suffix)
        # no point in sleeping once everything is done
        if unfinished:
            time.sleep(pollInterval)
Exemple #2
0
def main(targetDir, sourceDir, inPrefix, outPrefix, studentInd, origNumSource):
    """Run ``boostWeights`` for the selected students and extract their trees.

    For each student the description file
    ``<targetDir>/desc/<inPrefix>-<student>.desc`` is parsed with
    ``parseDesc``; every parsed value is multiplied by
    ``origNumSource / (4 * 50000.0)`` and passed, together with its key, on
    the ``boostWeights`` command line.  The command's stdout is written to
    ``<targetDir>/desc/<outPrefix>-<student>.desc`` and then handed to
    ``extractTree``.

    Args:
        targetDir: directory holding the ``desc`` and ``weighted`` subdirs.
        sourceDir: passed through to ``boostWeights``.
        inPrefix: file-name prefix of the input description files.
        outPrefix: file-name prefix of the generated files.
        studentInd: index of the only student to process, or None for all.
        origNumSource: numerator of the scale factor applied to the values.
    """
    inFilename = inPrefix + '-%s.desc'
    outDesc = outPrefix + '-%s.desc'
    outTree = outPrefix + '-%s.weka'
    studentFile = 'data/newStudents29.txt'

    factor = origNumSource / (4 * 50000.0)
    for i, student in enumerate(getUniqueStudents(studentFile)):
        if (studentInd is not None) and (i != studentInd):
            continue
        res = parseDesc(os.path.join(targetDir, 'desc', inFilename % student))
        if res is None:
            sys.stderr.write('SKIPPING %s\n' % (student,))
            continue

        cmd = [
            'bin/%s/boostWeights' % getArch(), student, targetDir, sourceDir
        ]
        for k, v in res.items():
            cmd += [k, str(v * factor)]
        outDescPath = os.path.join(targetDir, 'desc', outDesc % student)
        outTreePath = os.path.join(targetDir, 'weighted', outTree % student)
        # close (and thereby flush) the output before extractTree reads it
        with open(outDescPath, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(outDescPath, outTreePath)
Exemple #3
0
def monitorCondor(args, suffix, user='sbarrett', pollInterval=20):
    """Keep Condor jobs running until every student's save file exists.

    The expected save file of each student index comes from
    ``parseArgs(addArgs(args, suffix, studentInd))``.  ``condor_q`` is polled
    repeatedly; a student whose job id is not in the listing is either done
    (its save file exists) or gets a new job via ``submit``.

    Args:
        args: list of argument strings forwarded to ``addArgs`` and ``submit``.
        suffix: forwarded unchanged to ``addArgs`` and ``submit``.
        user: user name passed to ``condor_q``.
        pollInterval: seconds to sleep between two polls of the queue.
    """
    numStudents = len(getUniqueStudents())
    # -1 means "no job submitted yet" for that student
    jobs = [-1] * numStudents
    # a list, not a lazy range, because finished students are removed from it
    unfinished = list(range(numStudents))
    saveFiles = []
    for studentInd in range(numStudents):
        options, _ = parseArgs(addArgs(args, suffix, studentInd))
        saveFiles.append(options.saveFile)

    while unfinished:
        # check what's running and still needs to run (text-mode output)
        p = subprocess.Popen(['condor_q', user], stdout=subprocess.PIPE,
                             universal_newlines=True)
        out, _ = p.communicate()
        needToRun = []
        for studentInd in list(unfinished):
            job = jobs[studentInd]
            # skip the lookup for the -1 placeholder: "-1" may occur in the
            # listing for unrelated reasons
            if job != -1 and str(job) in out:
                continue
            if os.path.exists(saveFiles[studentInd]):
                unfinished.remove(studentInd)
            else:
                needToRun.append(studentInd)
        # submit new jobs as needed
        for studentInd in needToRun:
            jobs[studentInd] = submit([str(studentInd)] + args, suffix)
        # only wait when there is something left to wait for
        if unfinished:
            time.sleep(pollInterval)
def main(dataBasename, stayWeight, outliers, treeOptions, useWeka):
    """Build one tree from the pooled non-outlier students and one per outlier.

    Student data is read from ``data/dt-train/<dataBasename>/<student>.arff``.
    The data lines of all non-outlier files are concatenated behind a single
    header and handed to ``process`` under the name ``'common'``; each outlier
    file is processed on its own as ``'outlier-<student>'``.

    Args:
        dataBasename: sub-directory name under ``data/dt-train``.
        stayWeight: forwarded to ``process``.
        outliers: collection of student names to treat separately.
        treeOptions: forwarded to ``process``.
        useWeka: forwarded to ``process``.

    Exits with status 1 if any student file is missing.
    """
    students = getUniqueStudents()

    dataDir = os.path.join('data', 'dt-train', dataBasename)
    commonFiles = []
    outlierFiles = []
    for student in students:
        filename = os.path.join(dataDir, '%s.arff' % student)
        if student in outliers:
            outlierFiles.append(filename)
        else:
            commonFiles.append(filename)

    assert len(outlierFiles) == len(outliers), \
        'outliers must be distinct, known students'

    # make sure all of the student files exist before starting
    for path in outlierFiles + commonFiles:
        if not os.path.exists(path):
            sys.stderr.write('Student file missing: %s\n' % (path,))
            sys.exit(1)

    # Pool the non-outlier data.  Skipped when every student is an outlier,
    # which previously failed with a NameError on the unbound header.
    if commonFiles:
        commonData = []
        for filename in commonFiles:
            # the header of the last file read is the one that is kept
            header, lines = readFile(filename)
            commonData.extend(lines)
        process(dataBasename, 'common', header + commonData, stayWeight,
                treeOptions, useWeka)

    for filename in outlierFiles:
        student = os.path.splitext(os.path.basename(filename))[0]
        header, lines = readFile(filename)
        process(dataBasename, 'outlier-%s' % student, header + lines,
                stayWeight, treeOptions, useWeka)
Exemple #5
0
def main(targetDir, sourceDir, inPrefix, outPrefix, studentInd, origNumSource):
    """Re-run boosting with scaled weights for the selected students.

    Reads ``<targetDir>/desc/<inPrefix>-<student>.desc`` via ``parseDesc``,
    scales each parsed value by ``origNumSource / (4 * 50000.0)``, passes the
    key/value pairs to the ``boostWeights`` binary, stores its stdout in
    ``<targetDir>/desc/<outPrefix>-<student>.desc`` and calls ``extractTree``
    to produce ``<targetDir>/weighted/<outPrefix>-<student>.weka``.

    Args:
        targetDir: directory holding the ``desc`` and ``weighted`` subdirs.
        sourceDir: passed through to ``boostWeights``.
        inPrefix: file-name prefix of the input description files.
        outPrefix: file-name prefix of the generated files.
        studentInd: index of the only student to process, or None for all.
        origNumSource: numerator of the scale factor applied to the values.
    """
    inFilename = inPrefix + '-%s.desc'
    outDesc = outPrefix + '-%s.desc'
    outTree = outPrefix + '-%s.weka'
    studentFile = 'data/newStudents29.txt'

    factor = origNumSource / (4 * 50000.0)
    students = getUniqueStudents(studentFile)
    for i, student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        filename = os.path.join(targetDir, 'desc', inFilename % student)
        res = parseDesc(filename)
        if res is None:
            sys.stderr.write('SKIPPING %s\n' % (student,))
            continue

        cmd = ['bin/%s/boostWeights' % getArch(), student, targetDir, sourceDir]
        for k, v in res.items():
            cmd += [k, str(v * factor)]
        outDescPath = os.path.join(targetDir, 'desc', outDesc % student)
        outTreePath = os.path.join(targetDir, 'weighted', outTree % student)
        # the handle must be closed before extractTree reads the file
        with open(outDescPath, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(outDescPath, outTreePath)
def main(basename, dataDir, stayWeight, treeOptions, options):
    """Create a decision tree named ``only-<student>`` for each selected student.

    When ``options.studentInd`` is None every student is handled, otherwise
    only the student at that index.  The remaining ``options`` fields are
    forwarded to ``createDT``.
    """
    for ind, name in enumerate(getUniqueStudents()):
        if options.studentInd is None or ind == options.studentInd:
            print('-------------------')
            print(name)
            print('-------------------')
            createDT(
                getFilename(dataDir, name, TRAIN),
                basename,
                'only-%s' % name,
                stayWeight,
                treeOptions,
                options.useWeka,
                options.numInstances,
                options.randomTree,
                options.numRandomTrees,
                options.featureFrac,
                options.resampleFrac,
                options.randomTreeInd,
            )
def main(targetDir, sourceDir, destDir, prefix, maxNumBoosts, numTargetInstances, studentInd):
    """Run ``boostGivenOrder`` for the selected students and extract the trees.

    The binary is given ``<destDir>/desc/combined-<student>.desc`` as order
    file; its stdout goes to ``<destDir>/desc/<prefix>-<student>.desc``, from
    which ``extractTree`` writes ``<destDir>/weighted/<prefix>-<student>.weka``.
    ``studentFile`` is a module-level name defined outside this function.

    Args:
        targetDir: passed through to the binary.
        sourceDir: passed through to the binary.
        destDir: directory holding the ``desc`` and ``weighted`` subdirs.
        prefix: file-name prefix of the generated files.
        maxNumBoosts: passed to the binary as a string.
        numTargetInstances: passed to the binary as a string.
        studentInd: index of the only student to process, or None for all.
    """
    students = getUniqueStudents(studentFile)
    for i, student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        print('-------------------')
        print(student)
        print('-------------------')
        orderFile = os.path.join(destDir, 'desc', 'combined-%s.desc' % student)
        cmd = ['bin/%s/boostGivenOrder' % getArch(), student, orderFile,
               targetDir, sourceDir, str(maxNumBoosts), str(numTargetInstances)]
        descFile = os.path.join(destDir, 'desc', prefix + '-' + student + '.desc')
        resultFile = os.path.join(destDir, 'weighted', prefix + '-' + student + '.weka')
        # close the output file before extractTree reads it back
        with open(descFile, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(descFile, resultFile)
def main(targetDir, sourceDir, destDir, prefix, studentInd):
    """Run ``boostTest`` for the selected students and extract the trees.

    ``studentFile`` and ``sourceData`` are module-level names defined outside
    this function.  The file prefix is extended with ``-using<N>Source``,
    where ``<N>`` is ``sourceData`` if it is positive and ``All`` otherwise.
    The binary's stdout is stored in ``<destDir>/desc/<prefix>-<student>.desc``
    and ``extractTree`` turns it into
    ``<destDir>/weighted/<prefix>-<student>.weka``.

    Args:
        targetDir: passed through to the binary.
        sourceDir: passed through to the binary.
        destDir: directory holding the ``desc`` and ``weighted`` subdirs.
        prefix: base file-name prefix of the generated files.
        studentInd: index of the only student to process, or None for all.
    """
    students = getUniqueStudents(studentFile)
    prefix = prefix + '-using%sSource' % (sourceData if sourceData > 0 else 'All')
    for i, student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        print('-------------------')
        print(student)
        print('-------------------')
        cmd = ['bin/%s/boostTest' % getArch(), student, studentFile,
               targetDir, sourceDir, str(sourceData)]
        descFile = os.path.join(destDir, 'desc', prefix + '-' + student + '.desc')
        resultFile = os.path.join(destDir, 'weighted', prefix + '-' + student + '.weka')
        # close the output file before extractTree reads it back
        with open(descFile, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(descFile, resultFile)
Exemple #9
0
def main(targetDir, sourceDir, destDir, numTargetInstances, jobInd):
    """Run ``boostIndependent`` for ordered (target, source) student pairs.

    Pairs of distinct students are numbered from 0 in nested-loop order
    (target outer, source inner).  With ``jobInd`` set only the pair with that
    number is run; with None every pair is run.  The binary's stdout is saved
    to ``<destDir>/boostIndependent/<target>-<source>.desc``.
    ``studentFile`` is a module-level name defined outside this function.

    Args:
        targetDir: passed through to the binary.
        sourceDir: passed through to the binary.
        destDir: directory containing the ``boostIndependent`` subdir.
        numTargetInstances: passed to the binary as a string.
        jobInd: number of the only pair to run, or None for all pairs.
    """
    students = getUniqueStudents(studentFile)
    i = -1
    for targetStudent in students:
        for sourceStudent in students:
            if targetStudent == sourceStudent:
                continue
            i += 1
            if (jobInd is not None) and (i != jobInd):
                continue
            print('%s %s' % (targetStudent, sourceStudent))
            cmd = ['bin/%s/boostIndependent' % getArch(), targetStudent,
                   sourceStudent, targetDir, sourceDir, str(numTargetInstances)]
            descFile = os.path.join(destDir, 'boostIndependent',
                                    targetStudent + '-' + sourceStudent + '.desc')
            # do not leak one open file handle per pair
            with open(descFile, 'w') as descOut:
                subprocess.check_call(cmd, stdout=descOut)
def main(basename, dataDir, stayWeight, treeOptions, options):
    """Train the ``only-<student>`` tree for every selected student.

    A student is selected when ``options.studentInd`` is None or equals the
    student's position in ``getUniqueStudents()``.  The student's name is
    printed between two rule lines before ``createDT`` is called with the
    student's TRAIN file and the tree settings taken from ``options``.
    """
    allStudents = getUniqueStudents()

    for position, studentName in enumerate(allStudents):
        skip = (options.studentInd is not None
                and position != options.studentInd)
        if skip:
            continue
        for text in ('-------------------', studentName, '-------------------'):
            print(text)
        trainFile = getFilename(dataDir, studentName, TRAIN)
        treeName = 'only-%s' % studentName
        createDT(trainFile, basename, treeName, stayWeight, treeOptions,
                 options.useWeka, options.numInstances, options.randomTree,
                 options.numRandomTrees, options.featureFrac,
                 options.resampleFrac, options.randomTreeInd)
def main(targetDir, prefix, jobInd):
    """Rank the source students of each target and write the order file.

    For every selected target student the last line of
    ``<targetDir>/boostIndependent/<target>-<source>.desc`` is read for each
    other student; it must have the form ``<value> <source>``.  Sources whose
    value exceeds 1e-10 are ordered by decreasing value (ties keep the order
    in which the students were read) and the best ``TOP_NUM`` are written to
    ``<targetDir>/desc/<prefix>-<target>.desc`` followed by a
    ``TwoStageTrAdaBoost`` line.  ``studentFile`` is a module-level name
    defined outside this function.

    Args:
        targetDir: directory with the ``boostIndependent`` and ``desc`` subdirs.
        prefix: file-name prefix of the generated order files.
        jobInd: index of the only target student to process, or None for all.

    Exits with status 1 if an output file already exists.
    """
    TOP_NUM = 30
    students = getUniqueStudents(studentFile)
    # A dedicated index for the target loop: the previous code reused ``i``
    # for the insertion position too, which corrupted the job counter and
    # could make later targets match jobInd by accident.
    for targetInd, targetStudent in enumerate(students):
        if (jobInd is not None) and (targetInd != jobInd):
            continue
        ranked = []
        for sourceStudent in students:
            if targetStudent == sourceStudent:
                continue
            descFile = os.path.join(
                targetDir, 'boostIndependent',
                targetStudent + '-' + sourceStudent + '.desc')
            with open(descFile, 'r') as f:
                line = f.readlines()[-1].strip()
            val, student = line.split(' ')
            val = float(val)
            assert student == sourceStudent
            if val > 1e-10:
                ranked.append((val, sourceStudent))
        # stable sort: largest value first, equal values keep reading order
        ranked.sort(key=lambda pair: -pair[0])
        orderedVals = [v for v, _ in ranked]
        orderedStudents = [s for _, s in ranked]
        print(targetStudent)
        print('   %s' % (orderedVals,))
        print('   %s' % (orderedStudents,))
        print('   %s' % (len(orderedVals),))
        print('\n')
        outFile = os.path.join(targetDir, 'desc',
                               prefix + '-' + targetStudent + '.desc')
        if os.path.exists(outFile):
            sys.stderr.write('FILE EXISTS: %s\n' % (outFile,))
            sys.exit(1)
        with open(outFile, 'w') as f:
            for v, s in ranked[:TOP_NUM]:
                f.write('%s %s\n' % (v, s))
            f.write('TwoStageTrAdaBoost\n')
def main(targetDir, prefix, jobInd):
    """Write, per target student, the ranked list of useful source students.

    The last line of each
    ``<targetDir>/boostIndependent/<target>-<source>.desc`` file is parsed as
    ``<value> <source>``.  Sources with a value above 1e-10 are sorted by
    decreasing value (ties keep reading order); the first ``TOP_NUM`` entries
    plus a trailing ``TwoStageTrAdaBoost`` line are written to
    ``<targetDir>/desc/<prefix>-<target>.desc``.  ``studentFile`` is a
    module-level name defined outside this function.

    Args:
        targetDir: directory with the ``boostIndependent`` and ``desc`` subdirs.
        prefix: file-name prefix of the generated order files.
        jobInd: index of the only target student to process, or None for all.

    Exits with status 1 if an output file already exists.
    """
    TOP_NUM = 30
    students = getUniqueStudents(studentFile)
    # The target index gets its own name: sharing ``i`` with the insertion
    # loop used to overwrite the job counter after the first processed target.
    for targetInd, targetStudent in enumerate(students):
        if (jobInd is not None) and (targetInd != jobInd):
            continue
        ranked = []
        for sourceStudent in students:
            if targetStudent == sourceStudent:
                continue
            descFile = os.path.join(targetDir, 'boostIndependent',
                                    targetStudent + '-' + sourceStudent + '.desc')
            with open(descFile, 'r') as f:
                line = f.readlines()[-1].strip()
            val, student = line.split(' ')
            val = float(val)
            assert student == sourceStudent
            if val > 1e-10:
                ranked.append((val, sourceStudent))
        # stable sort: largest value first, equal values keep reading order
        ranked.sort(key=lambda pair: -pair[0])
        orderedVals = [v for v, _ in ranked]
        orderedStudents = [s for _, s in ranked]
        print(targetStudent)
        print('   %s' % (orderedVals,))
        print('   %s' % (orderedStudents,))
        print('   %s' % (len(orderedVals),))
        print('\n')
        outFile = os.path.join(targetDir, 'desc', prefix + '-' + targetStudent + '.desc')
        if os.path.exists(outFile):
            sys.stderr.write('FILE EXISTS: %s\n' % (outFile,))
            sys.exit(1)
        with open(outFile, 'w') as f:
            for v, s in ranked[:TOP_NUM]:
                f.write('%s %s\n' % (v, s))
            f.write('TwoStageTrAdaBoost\n')
def main(targetDir, sourceDir, destDir, prefix, studentInd):
    """Run the ``boostTest`` binary per selected student and extract the tree.

    Uses the module-level names ``studentFile`` and ``sourceData`` (defined
    outside this function).  ``prefix`` is extended with ``-using<N>Source``
    where ``<N>`` is ``sourceData`` when positive, else ``All``.  Output goes
    to ``<destDir>/desc/<prefix>-<student>.desc`` and, via ``extractTree``, to
    ``<destDir>/weighted/<prefix>-<student>.weka``.

    Args:
        targetDir: passed through to the binary.
        sourceDir: passed through to the binary.
        destDir: directory holding the ``desc`` and ``weighted`` subdirs.
        prefix: base file-name prefix of the generated files.
        studentInd: index of the only student to process, or None for all.
    """
    students = getUniqueStudents(studentFile)
    prefix = prefix + '-using%sSource' % (sourceData
                                          if sourceData > 0 else 'All')
    for i, student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        print('-------------------')
        print(student)
        print('-------------------')
        cmd = [
            'bin/%s/boostTest' % getArch(), student, studentFile, targetDir,
            sourceDir,
            str(sourceData)
        ]
        descFile = os.path.join(destDir, 'desc',
                                prefix + '-' + student + '.desc')
        resultFile = os.path.join(destDir, 'weighted',
                                  prefix + '-' + student + '.weka')
        # the handle is closed (and flushed) before extractTree reads the file
        with open(descFile, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(descFile, resultFile)
Exemple #14
0
def main(targetDir, sourceDir, destDir, prefix, maxNumBoosts,
         numTargetInstances, studentInd):
    """Run the ``boostGivenOrder`` binary per selected student.

    The order file is ``<destDir>/desc/combined-<student>.desc``.  The
    binary's stdout is saved as ``<destDir>/desc/<prefix>-<student>.desc`` and
    ``extractTree`` derives ``<destDir>/weighted/<prefix>-<student>.weka``
    from it.  ``studentFile`` is a module-level name defined outside this
    function.

    Args:
        targetDir: passed through to the binary.
        sourceDir: passed through to the binary.
        destDir: directory holding the ``desc`` and ``weighted`` subdirs.
        prefix: file-name prefix of the generated files.
        maxNumBoosts: passed to the binary as a string.
        numTargetInstances: passed to the binary as a string.
        studentInd: index of the only student to process, or None for all.
    """
    students = getUniqueStudents(studentFile)
    for i, student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        print('-------------------')
        print(student)
        print('-------------------')
        orderFile = os.path.join(destDir, 'desc', 'combined-%s.desc' % student)
        cmd = [
            'bin/%s/boostGivenOrder' % getArch(), student, orderFile,
            targetDir, sourceDir,
            str(maxNumBoosts),
            str(numTargetInstances)
        ]
        descFile = os.path.join(destDir, 'desc',
                                prefix + '-' + student + '.desc')
        resultFile = os.path.join(destDir, 'weighted',
                                  prefix + '-' + student + '.weka')
        # the handle is closed (and flushed) before extractTree reads the file
        with open(descFile, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(descFile, resultFile)
def main(dataBasename, stayWeight, outliers, treeOptions, useWeka):
    """Process the pooled non-outlier data and each outlier student separately.

    Files are looked up as ``data/dt-train/<dataBasename>/<student>.arff``.
    Data lines of all non-outlier students are appended behind one header and
    passed to ``process`` as ``'common'``; every outlier file is passed on its
    own as ``'outlier-<student>'``.

    Args:
        dataBasename: sub-directory name under ``data/dt-train``.
        stayWeight: forwarded to ``process``.
        outliers: collection of student names to treat separately.
        treeOptions: forwarded to ``process``.
        useWeka: forwarded to ``process``.

    Exits with status 1 if any student file is missing.
    """
    # get the students
    students = getUniqueStudents()

    dataDir = os.path.join('data', 'dt-train', dataBasename)
    # get the filenames
    commonFiles = []
    outlierFiles = []
    for student in students:
        filename = os.path.join(dataDir, '%s.arff' % student)
        if student in outliers:
            outlierFiles.append(filename)
        else:
            commonFiles.append(filename)

    assert len(outlierFiles) == len(outliers), \
        'outliers must be distinct, known students'

    # make sure all of the student files exist before starting
    for path in outlierFiles + commonFiles:
        if not os.path.exists(path):
            sys.stderr.write('Student file missing: %s\n' % (path,))
            sys.exit(1)

    # read in the files and create the trees; with no non-outlier student
    # there is no header to prepend, so the common tree is skipped instead of
    # failing with a NameError
    if commonFiles:
        commonData = []
        for filename in commonFiles:
            # the header of the last file read is the one that is kept
            header, lines = readFile(filename)
            commonData.extend(lines)
        process(dataBasename, 'common', header + commonData, stayWeight,
                treeOptions, useWeka)

    for filename in outlierFiles:
        student = os.path.splitext(os.path.basename(filename))[0]
        header, lines = readFile(filename)
        process(dataBasename, 'outlier-%s' % student, header + lines,
                stayWeight, treeOptions, useWeka)
def parseArgs(args,
              parserOptions=[],
              numAdditionalArgs=0,
              additionalArgsString=''):
    """Parse ``[options] classifier student`` and derive the run settings.

    Args:
        args: list of command-line strings to parse.
        parserOptions: extra option objects, each handed to
            ``parser.add_option`` (the default list is never modified).
        numAdditionalArgs: number of positional arguments expected after
            ``classifier student``.
        additionalArgsString: text appended to the usage line.

    Returns:
        ``(options, extraArgs)``: the populated options object and the
        positional arguments following ``classifier student``.

    Exits with status 1 on a wrong argument count or a missing base/fallback
    learner, and with status 2 on a missing data directory or an unknown
    student.  Raises AssertionError when ``--target``/``--source`` is missing
    or a base/fallback learner itself needs a base or fallback learner.
    """
    from optparse import OptionParser
    parser = OptionParser('%prog [options] classifier student ' +
                          additionalArgsString)
    parser.add_option('-b', '--baseLearner', action='store',
                      dest='baseLearner', type='str', default=None,
                      help='use the classifier as the base learner')
    parser.add_option('-f', '--fallbackLearner', action='store',
                      dest='fallbackLearner', type='str', default=None,
                      help='use the classifier file as the fallback learner')
    parser.add_option('-s', '--source', action='store', dest='numSource',
                      type='int', default=None, help='num source instances')
    parser.add_option('-t', '--target', action='store', dest='numTarget',
                      type='int', default=None, help='num target instances')
    parser.add_option('--no-source', action='store_false', dest='useSource',
                      default=True,
                      help="don't use the source data, but use the name")
    parser.add_option('--debug', action='store_true', dest='debug',
                      default=False,
                      help="print the cmd and don't remove the combined config")
    parser.add_option('--fracSourceData', action='store',
                      dest='fracSourceData', default=None,
                      help='frac of source data to use')
    parser.add_option('--partialMax', action='store', dest='partialMax',
                      type='int', default=None, help='num partial runs')
    parser.add_option('--no-save', action='store_false', dest='save',
                      default=True, help='disable saving of the classifier')
    parser.add_option('--catchOutput', action='store_true',
                      dest='catchOutput', default=False,
                      help='catch the output of the training')
    parser.add_option('--ignorePartialMax', action='store_true',
                      dest='ignorePartialMax', default=False,
                      help='ignore partialmax')
    for option in parserOptions:
        parser.add_option(option)
    options, args = parser.parse_args(args)

    numExpectedArgs = 2
    numRequiredArgs = numExpectedArgs + numAdditionalArgs
    if len(args) != numRequiredArgs:
        sys.stderr.write(
            'Incorrect number of arguments expected %i but got %i\n' %
            (numRequiredArgs, len(args)))
        # print_help() instead of parse_args(['--help']): the latter exits
        # with status 0 before the error exit below can run
        parser.print_help()
        sys.exit(1)

    classifierType, student = args[:numExpectedArgs]
    # num training should be an int, and the filename should exist
    options.base = 'studentsNew29-unperturbed-%i'
    assert (options.numTarget is not None), 'numTarget unspecified'
    options.targetBase = options.base % options.numTarget
    if not baseExists(options.targetBase):
        sys.stderr.write("Dir for numTarget doesn't exist at: %s\n" %
                         (options.targetBase,))
        sys.exit(2)
    assert (options.numSource is not None), 'numSource unspecified'
    if options.numSource != 0:
        options.sourceBase = options.base % options.numSource
        if not baseExists(options.sourceBase):
            sys.stderr.write("Dir for numSource doesn't exist at: %s\n" %
                             (options.sourceBase,))
            sys.exit(2)
    # get students and check provided student: either an index into the
    # student list or a student name
    students = getUniqueStudents()
    try:
        ind = int(student)
        if (options.partialMax is not None) and not options.ignorePartialMax:
            # the index encodes both the student and the partial run
            options.partialInd = ind % options.partialMax
            ind //= options.partialMax
            print('ind: %i partialInd: %i' % (ind, options.partialInd))
        student = students[ind]
    except (ValueError, IndexError):
        # not a number, or an index outside the list: must be a known name
        if student not in students:
            sys.stderr.write('Unknown student: %s\n' % (student,))
            sys.exit(2)
    options.student = student
    options.studentInd = students.index(student)
    options.otherStudents = list(students)
    options.otherStudents.remove(student)
    # check provided classifiers; entries 1 and 2 of the classifier record
    # flag whether a base / fallback learner is required
    options.classifier = getClassifier(classifierType)
    if (options.classifier[1] == True) and (options.baseLearner is None):
        sys.stderr.write('Missing base learner: %s\n' % (classifierType,))
        sys.exit(1)
    if (options.classifier[2] == True) and (options.fallbackLearner is None):
        sys.stderr.write('Missing fallback learner: %s\n' % (classifierType,))
        sys.exit(1)
    if options.baseLearner is not None:
        temp = getClassifier(options.baseLearner)
        assert ((not temp[1]) and (not temp[2]))
    if options.fallbackLearner is not None:
        temp = getClassifier(options.fallbackLearner)
        assert ((not temp[1]) and (not temp[2]))
    # get name
    name = options.classifier[0]
    if name == 'trbagg-partialLoad':
        name = 'trbagg'
    if name == 'twostagetradaboost-partial':
        name = 'twostagetradaboost'
    if options.baseLearner is not None:
        name += '_base' + options.baseLearner
    if options.fallbackLearner is not None:
        name += '_fb' + options.fallbackLearner
    options.name = name
    options.saveName = name + '-target%i-source%i' % (options.numTarget,
                                                      options.numSource)

    # get the arguments
    options.saveConfigFilename = getConfigFilename('saved/' +
                                                   options.saveName + '-' +
                                                   student)
    options.saveFile = getSaveFilename(options)

    return options, args[numExpectedArgs:]
Exemple #17
0
#!/usr/bin/env python

import sys, os
from common import getUniqueStudents

# Report, per student, which of the numJobs expected
# "<prefix>-<student>-<job>.weka" files are missing from a directory.
# usage: <script> directory prefix
if len(sys.argv) < 3:
    # fail with a usage line instead of an IndexError traceback
    sys.stderr.write('usage: %s directory prefix\n' % (sys.argv[0],))
    sys.exit(1)
directory = sys.argv[1]
if not os.path.exists(directory):
    print('Directory does not exist')
    sys.exit(2)
prefix = sys.argv[2]
numJobs = 1000
students = getUniqueStudents()
for student in students:
    missing = []
    for job in range(numJobs):
        path = os.path.join(directory,
                            '%s-%s-%i.weka' % (prefix, student, job))
        if not os.path.exists(path):
            missing.append(job)
    # nothing at all found for a student: the inputs are most likely wrong
    if len(missing) == numJobs:
        print('Probably a problem with your directory or prefix')
        sys.exit(3)
    if len(missing) != 0:
        print('%s %s' % (student, ' '.join(map(str, missing))))
#!/usr/bin/env python

import sys, os
from common import getUniqueStudents

# For every student, list the job numbers (0 .. numJobs-1) whose
# "<prefix>-<student>-<job>.weka" file does not exist in the directory.
# usage: <script> directory prefix
if len(sys.argv) < 3:
    # fail with a usage line instead of an IndexError traceback
    sys.stderr.write('usage: %s directory prefix\n' % (sys.argv[0],))
    sys.exit(1)
directory = sys.argv[1]
if not os.path.exists(directory):
    print('Directory does not exist')
    sys.exit(2)
prefix = sys.argv[2]
numJobs = 1000
students = getUniqueStudents()
for student in students:
    missing = []
    for job in range(numJobs):
        path = os.path.join(directory, '%s-%s-%i.weka' % (prefix, student, job))
        if not os.path.exists(path):
            missing.append(job)
    # every single file missing points at a wrong directory or prefix
    if len(missing) == numJobs:
        print('Probably a problem with your directory or prefix')
        sys.exit(3)
    if len(missing) != 0:
        print('%s %s' % (student, ' '.join(map(str, missing))))
Exemple #19
0
def main(args=sys.argv[1:]):
    """Build TrBagg trees from resampled target/source student data.

    Command line: ``[--balanced] numTarget numSource jobStart [numJobs]``.
    Job numbers ``jobStart * numJobs .. jobStart * numJobs + numJobs - 1`` are
    processed; job ``n`` maps to student ``n // 1000`` and classifier
    ``n % 1000``.  For each job, ``numTarget`` draws are made from the
    per-student probabilities (``calcBalancedProbs`` with ``--balanced``,
    otherwise ``calcProbs``), every student's TRAIN file is resampled with
    the resulting proportion, the pieces are merged into one ARFF file and
    ``makeTree`` is called on it.  Temporary files and the generated DESC
    file are removed afterwards.

    Args:
        args: list of command-line strings; it is not modified.

    Exits with status 1 on a wrong number of arguments.  Returns early once
    the student index runs past the list of students.
    """
    usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
    # Work on a copy: the default list is created once at definition time and
    # a caller's list must not lose its '--balanced' entry.
    args = list(args)
    balanced = '--balanced' in args
    if balanced:
        args.remove('--balanced')
    if (len(args) < 3) or (len(args) > 4):
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    numClassifiers = 1000
    numTarget = int(args[0])
    numSource = int(args[1])
    jobStart = int(args[2])
    numJobs = int(args[3]) if len(args) >= 4 else 1
    jobStart *= numJobs
    base = 'studentsNew29-unperturbed-%i'
    outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (
        numTarget, numSource)
    for subDir in ('/desc', '/weighted'):
        if not os.path.exists('data/dt/' + outputBase + subDir):
            os.makedirs('data/dt/' + outputBase + subDir)

    for jobOffset in range(numJobs):
        jobNum = jobStart + jobOffset
        # floor division keeps the index an int on Python 3 as well
        studentInd = jobNum // numClassifiers
        classifierInd = jobNum % numClassifiers
        print('job,student,classifier: %s %s %s' %
              (jobNum, studentInd, classifierInd))

        students = getUniqueStudents()
        if studentInd >= len(students):
            return
        if balanced:
            probs = calcBalancedProbs(students, numSource, numTarget,
                                      studentInd)
        else:
            probs = calcProbs(students, numSource, numTarget, studentInd)
        print('probs: %s %s' % (probs, sum(probs)))

        # draw numTarget samples; counts[k] = draws that fell on student k
        counts = [0 for s in students]
        for i in range(numTarget):
            r = random.random()
            total = 0
            for j, p in enumerate(probs):
                total += p
                if r < total:
                    ind = j
                    break
            else:
                # r was not below the accumulated total: use the last student
                ind = len(probs) - 1
            counts[ind] += 1
        eps = 1e-10
        props = [
            eps + float(c) / (numTarget if i == studentInd else numSource)
            for i, c in enumerate(counts)
        ]

        # None until created, so that cleanup only touches what exists
        arffFilename = None
        tempFile = None
        name = None
        try:
            arffFilename = makeTemp('.arff')
            tempFile = makeTemp('.arff')
            with open(arffFilename, 'w') as arffFile:
                for i, (student, prop) in enumerate(zip(students, props)):
                    inFile = getFilename(
                        base % (numTarget if student == students[studentInd]
                                else numSource), student, TRAIN)
                    print('resampling %s' % (student,))
                    resample(inFile, tempFile, prop)
                    with open(tempFile, 'r') as f:
                        if i != 0:
                            # keep only one header: skip through '@data'
                            for line in f:
                                if line.strip() == '@data':
                                    break
                        for line in f:
                            arffFile.write(line)
            print('removing trial step')
            if balanced:
                name = 'trBagg-balanced-%s-%i' % (students[studentInd],
                                                  classifierInd)
            else:
                name = 'trBagg-%s-%i' % (students[studentInd], classifierInd)
            makeTree(arffFilename, True, None, outputBase, name, [], False,
                     1.0)
        finally:
            # Remove only files that were actually created, so an early
            # failure is not masked by a NameError/OSError raised here.
            leftovers = [arffFilename, tempFile]
            if name is not None:
                leftovers.append(getFilename(outputBase, name, DESC))
            for leftover in leftovers:
                if leftover is not None and os.path.exists(leftover):
                    os.remove(leftover)
def parseArgs(args, parserOptions=[], numAdditionalArgs=0, additionalArgsString=""):
    """Parse ``[options] classifier student`` and fill in the derived settings.

    Args:
        args: list of command-line strings to parse.
        parserOptions: extra option objects, each handed to
            ``parser.add_option`` (the default list is never modified).
        numAdditionalArgs: number of positional arguments expected after
            ``classifier student``.
        additionalArgsString: text appended to the usage line.

    Returns:
        ``(options, extraArgs)``: the populated options object and the
        positional arguments following ``classifier student``.

    Exits with status 1 on a wrong argument count or a missing base/fallback
    learner, and with status 2 on a missing data directory or an unknown
    student.  Raises AssertionError when ``--target``/``--source`` is missing
    or a base/fallback learner itself needs a base or fallback learner.
    """
    from optparse import OptionParser

    parser = OptionParser("%prog [options] classifier student " + additionalArgsString)
    parser.add_option(
        "-b",
        "--baseLearner",
        action="store",
        dest="baseLearner",
        type="str",
        default=None,
        help="use the classifier as the base learner",
    )
    parser.add_option(
        "-f",
        "--fallbackLearner",
        action="store",
        dest="fallbackLearner",
        type="str",
        default=None,
        help="use the classifier file as the fallback learner",
    )
    parser.add_option(
        "-s", "--source", action="store", dest="numSource", type="int", default=None, help="num source instances"
    )
    parser.add_option(
        "-t", "--target", action="store", dest="numTarget", type="int", default=None, help="num target instances"
    )
    parser.add_option(
        "--no-source",
        action="store_false",
        dest="useSource",
        default=True,
        help="don't use the source data, but use the name",
    )
    parser.add_option(
        "--debug",
        action="store_true",
        dest="debug",
        default=False,
        help="print the cmd and don't remove the combined config",
    )
    parser.add_option(
        "--fracSourceData", action="store", dest="fracSourceData", default=None, help="frac of source data to use"
    )
    parser.add_option(
        "--partialMax", action="store", dest="partialMax", type="int", default=None, help="num partial runs"
    )
    parser.add_option(
        "--no-save", action="store_false", dest="save", default=True, help="disable saving of the classifier"
    )
    parser.add_option(
        "--catchOutput", action="store_true", dest="catchOutput", default=False, help="catch the output of the training"
    )
    parser.add_option(
        "--ignorePartialMax", action="store_true", dest="ignorePartialMax", default=False, help="ignore partialmax"
    )
    for option in parserOptions:
        parser.add_option(option)
    options, args = parser.parse_args(args)

    numExpectedArgs = 2
    numRequiredArgs = numExpectedArgs + numAdditionalArgs
    if len(args) != numRequiredArgs:
        sys.stderr.write("Incorrect number of arguments expected %i but got %i\n" % (numRequiredArgs, len(args)))
        # print_help() rather than parse_args(["--help"]): the latter exits
        # with status 0 and the error exit below would never be reached
        parser.print_help()
        sys.exit(1)

    classifierType, student = args[:numExpectedArgs]
    # num training should be an int, and the filename should exist
    options.base = "studentsNew29-unperturbed-%i"
    assert options.numTarget is not None, "numTarget unspecified"
    options.targetBase = options.base % options.numTarget
    if not baseExists(options.targetBase):
        sys.stderr.write("Dir for numTarget doesn't exist at: %s\n" % (options.targetBase,))
        sys.exit(2)
    assert options.numSource is not None, "numSource unspecified"
    if options.numSource != 0:
        options.sourceBase = options.base % options.numSource
        if not baseExists(options.sourceBase):
            sys.stderr.write("Dir for numSource doesn't exist at: %s\n" % (options.sourceBase,))
            sys.exit(2)
    # get students and check provided student: either an index into the
    # student list or a student name
    students = getUniqueStudents()
    try:
        ind = int(student)
        if (options.partialMax is not None) and not options.ignorePartialMax:
            # the index encodes both the student and the partial run
            options.partialInd = ind % options.partialMax
            ind //= options.partialMax
            print("ind: %i partialInd: %i" % (ind, options.partialInd))
        student = students[ind]
    except (ValueError, IndexError):
        # not a number, or an index outside the list: must be a known name
        if student not in students:
            sys.stderr.write("Unknown student: %s\n" % (student,))
            sys.exit(2)
    options.student = student
    options.studentInd = students.index(student)
    options.otherStudents = list(students)
    options.otherStudents.remove(student)
    # check provided classifiers; entries 1 and 2 of the classifier record
    # flag whether a base / fallback learner is required
    options.classifier = getClassifier(classifierType)
    if (options.classifier[1] == True) and (options.baseLearner is None):
        sys.stderr.write("Missing base learner: %s\n" % (classifierType,))
        sys.exit(1)
    if (options.classifier[2] == True) and (options.fallbackLearner is None):
        sys.stderr.write("Missing fallback learner: %s\n" % (classifierType,))
        sys.exit(1)
    if options.baseLearner is not None:
        temp = getClassifier(options.baseLearner)
        assert (not temp[1]) and (not temp[2])
    if options.fallbackLearner is not None:
        temp = getClassifier(options.fallbackLearner)
        assert (not temp[1]) and (not temp[2])
    # get name
    name = options.classifier[0]
    if name == "trbagg-partialLoad":
        name = "trbagg"
    if name == "twostagetradaboost-partial":
        name = "twostagetradaboost"
    if options.baseLearner is not None:
        name += "_base" + options.baseLearner
    if options.fallbackLearner is not None:
        name += "_fb" + options.fallbackLearner
    options.name = name
    options.saveName = name + "-target%i-source%i" % (options.numTarget, options.numSource)

    # get the arguments
    options.saveConfigFilename = getConfigFilename("saved/" + options.saveName + "-" + student)
    options.saveFile = getSaveFilename(options)

    return options, args[numExpectedArgs:]
Exemple #21
0
def main(args=sys.argv[1:]):
    """Create TrBagg trees for a range of job numbers.

    Command line: ``[--balanced] numTarget numSource jobStart [numJobs]``.
    The jobs ``jobStart * numJobs`` up to ``jobStart * numJobs + numJobs - 1``
    are run; job ``n`` belongs to student ``n // 1000`` and classifier
    ``n % 1000``.  Per job, ``numTarget`` random draws over the students
    (probabilities from ``calcBalancedProbs`` when ``--balanced`` is given,
    else ``calcProbs``) determine the proportion with which each student's
    TRAIN file is resampled; the resampled data is merged into one ARFF file
    that is passed to ``makeTree``.  The temporary files and the generated
    DESC file are deleted afterwards.

    Args:
        args: list of command-line strings; it is not modified.

    Exits with status 1 on a wrong number of arguments.  Returns early once
    the student index runs past the list of students.
    """
    usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
    # Copy first: the default list is shared between calls and a caller's
    # list must not be changed by stripping '--balanced'.
    args = list(args)
    balanced = '--balanced' in args
    if balanced:
        args.remove('--balanced')
    if (len(args) < 3) or (len(args) > 4):
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    numClassifiers = 1000
    numTarget = int(args[0])
    numSource = int(args[1])
    jobStart = int(args[2])
    numJobs = int(args[3]) if len(args) >= 4 else 1
    jobStart *= numJobs
    base = 'studentsNew29-unperturbed-%i'
    outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (numTarget, numSource)
    for subDir in ('/desc', '/weighted'):
        if not os.path.exists('data/dt/' + outputBase + subDir):
            os.makedirs('data/dt/' + outputBase + subDir)

    for jobOffset in range(numJobs):
        jobNum = jobStart + jobOffset
        # floor division keeps the index an int on Python 3 as well
        studentInd = jobNum // numClassifiers
        classifierInd = jobNum % numClassifiers
        print('job,student,classifier: %s %s %s' % (jobNum, studentInd, classifierInd))

        students = getUniqueStudents()
        if studentInd >= len(students):
            return
        if balanced:
            probs = calcBalancedProbs(students, numSource, numTarget, studentInd)
        else:
            probs = calcProbs(students, numSource, numTarget, studentInd)
        print('probs: %s %s' % (probs, sum(probs)))

        # draw numTarget samples; counts[k] = draws that fell on student k
        counts = [0 for s in students]
        for i in range(numTarget):
            r = random.random()
            total = 0
            for j, p in enumerate(probs):
                total += p
                if r < total:
                    ind = j
                    break
            else:
                # r was not below the accumulated total: use the last student
                ind = len(probs) - 1
            counts[ind] += 1
        eps = 1e-10
        props = [eps + float(c) / (numTarget if i == studentInd else numSource) for i, c in enumerate(counts)]

        # None until created, so that cleanup only touches what exists
        arffFilename = None
        tempFile = None
        name = None
        try:
            arffFilename = makeTemp('.arff')
            tempFile = makeTemp('.arff')
            with open(arffFilename, 'w') as arffFile:
                for i, (student, prop) in enumerate(zip(students, props)):
                    inFile = getFilename(base % (numTarget if student == students[studentInd] else numSource), student, TRAIN)
                    print('resampling %s' % (student,))
                    resample(inFile, tempFile, prop)
                    with open(tempFile, 'r') as f:
                        if i != 0:
                            # keep only one header: skip through '@data'
                            for line in f:
                                if line.strip() == '@data':
                                    break
                        for line in f:
                            arffFile.write(line)
            print('removing trial step')
            if balanced:
                name = 'trBagg-balanced-%s-%i' % (students[studentInd], classifierInd)
            else:
                name = 'trBagg-%s-%i' % (students[studentInd], classifierInd)
            makeTree(arffFilename, True, None, outputBase, name, [], False, 1.0)
        finally:
            # Remove only files that were actually created, so an early
            # failure is not masked by a NameError/OSError raised here.
            leftovers = [arffFilename, tempFile]
            if name is not None:
                leftovers.append(getFilename(outputBase, name, DESC))
            for leftover in leftovers:
                if leftover is not None and os.path.exists(leftover):
                    os.remove(leftover)