def monitorCondor(args, suffix):
    """Poll condor until every student's save file exists, resubmitting as needed.

    One slot is tracked per entry returned by ``getUniqueStudents()``.  For
    each slot the expected save file is taken from the options produced by
    ``parseArgs(addArgs(args, suffix, index))``.  Every 20 seconds the output
    of ``condor_q sbarrett`` is searched for the text form of each slot's
    last job id (initially ``-1``); when the id is not found, the slot is
    either done (save file exists) or gets (re)submitted via ``submit``.

    args   -- argument list forwarded to ``addArgs`` and ``submit``
    suffix -- forwarded unchanged to ``addArgs`` and ``submit``
    """
    studentCount = len(getUniqueStudents())
    jobIds = [-1] * studentCount
    unfinished = list(range(studentCount))
    saveFiles = [
        parseArgs(addArgs(args, suffix, ind))[0].saveFile
        for ind in range(studentCount)
    ]

    while len(unfinished) > 0:
        # snapshot of the queue for this polling round
        queueProc = subprocess.Popen(['condor_q', 'sbarrett'],
                                     stdout=subprocess.PIPE)
        queueText = queueProc.communicate()[0]

        toSubmit = []
        # iterate over a copy because finished slots are removed on the fly
        for ind in list(unfinished):
            if str(jobIds[ind]) in queueText:
                # id text still shows up in the queue listing: leave it alone
                continue
            if os.path.exists(saveFiles[ind]):
                unfinished.remove(ind)
            else:
                toSubmit.append(ind)

        # (re)submit everything that is neither queued nor saved
        for ind in toSubmit:
            jobIds[ind] = submit([str(ind)] + args, suffix)

        # wait before the next poll
        time.sleep(20)
def main(targetDir, sourceDir, inPrefix, outPrefix, studentInd, origNumSource):
    """Run ``boostWeights`` for each student using weights parsed from a desc file.

    For every student from ``getUniqueStudents('data/newStudents29.txt')``
    (or only the one at position ``studentInd`` when it is not None), the
    file ``<targetDir>/desc/<inPrefix>-<student>.desc`` is parsed with
    ``parseDesc``.  Each parsed value is scaled by
    ``origNumSource / (4 * 50000.0)`` and passed, with its key, on the
    ``boostWeights`` command line.  The command's stdout is written to
    ``<targetDir>/desc/<outPrefix>-<student>.desc`` and then handed to
    ``extractTree`` to produce
    ``<targetDir>/weighted/<outPrefix>-<student>.weka``.

    Students whose desc file parses to None are reported on stderr and
    skipped.  Raises ``subprocess.CalledProcessError`` if the command fails.
    """
    # example dirs: sourceDir = 'data/dt/perturbed-noop0.1-50000',
    #               targetDir = 'data/dt/perturbed-noop0.1-1000'
    inFilename = inPrefix + '-%s.desc'
    outDesc = outPrefix + '-%s.desc'
    outTree = outPrefix + '-%s.weka'
    studentFile = 'data/newStudents29.txt'
    factor = origNumSource / (4 * 50000.0)
    students = getUniqueStudents(studentFile)
    for i, student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        filename = os.path.join(targetDir, 'desc', inFilename % student)
        res = parseDesc(filename)
        if res is None:
            sys.stderr.write('SKIPPING %s\n' % (student,))
            continue
        cmd = ['bin/%s/boostWeights' % getArch(), student, targetDir, sourceDir]
        for k, v in res.iteritems():
            cmd += [k, str(v * factor)]
        outDescPath = os.path.join(targetDir, 'desc', outDesc % student)
        outTreePath = os.path.join(targetDir, 'weighted', outTree % student)
        # Close (and therefore flush) the desc file deterministically before
        # extractTree reads it, instead of relying on garbage collection.
        with open(outDescPath, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(outDescPath, outTreePath)
def monitorCondor(args, suffix):
    """Babysit the condor jobs: loop until each student's save file is present.

    The number of tracked slots equals ``len(getUniqueStudents())``.  The
    save file for slot ``k`` comes from ``parseArgs(addArgs(args, suffix, k))``.
    On each pass the ``condor_q sbarrett`` listing is fetched; a slot whose
    stored job id (as text, ``-1`` before the first submission) does not
    appear in the listing is marked finished if its save file exists and is
    otherwise submitted again through ``submit``.  Passes are 20 s apart.

    args   -- argument list forwarded to ``addArgs`` and ``submit``
    suffix -- forwarded unchanged to ``addArgs`` and ``submit``
    """
    total = len(getUniqueStudents())
    lastJob = [-1 for _ in range(total)]
    pending = list(range(total))

    expectedFiles = []
    for slot in range(total):
        parsed = parseArgs(addArgs(args, suffix, slot))
        expectedFiles.append(parsed[0].saveFile)

    while len(pending) > 0:
        listing = subprocess.Popen(
            ['condor_q', 'sbarrett'], stdout=subprocess.PIPE).communicate()[0]

        resubmit = []
        # copy: entries are removed from ``pending`` while scanning
        for slot in list(pending):
            stillQueued = listing.find(str(lastJob[slot])) >= 0
            if stillQueued:
                continue
            if not os.path.exists(expectedFiles[slot]):
                resubmit.append(slot)
            else:
                pending.remove(slot)

        for slot in resubmit:
            lastJob[slot] = submit([str(slot)] + args, suffix)

        # pause between polls
        time.sleep(20)
def main(dataBasename, stayWeight, outliers, treeOptions, useWeka):
    """Build one tree from the pooled non-outlier students and one per outlier.

    Student data is read from ``data/dt-train/<dataBasename>/<student>.arff``.
    Students listed in ``outliers`` are processed individually under the name
    ``outlier-<student>``; the lines of all remaining students are pooled and
    processed under the name ``common``.  Exits with status 1 (message on
    stderr) if any expected file is missing.
    """
    dataDir = os.path.join('data', 'dt-train', dataBasename)

    # split the per-student files into the two groups
    commonFiles = []
    outlierFiles = []
    for student in getUniqueStudents():
        path = os.path.join(dataDir, '%s.arff' % student)
        bucket = outlierFiles if student in outliers else commonFiles
        bucket.append(path)
    # every requested outlier must be a known student
    assert len(outlierFiles) == len(outliers)

    # fail early, before any tree is built, if a file is absent
    for studentFile in outlierFiles + commonFiles:
        if os.path.exists(studentFile):
            continue
        sys.stderr.write('Student file missing: %s\n' % studentFile)
        sys.exit(1)

    # pooled tree; the header of the last common file read is reused
    # NOTE: ``header`` is unbound here if there are no common files
    pooledLines = []
    for path in commonFiles:
        header, lines = readFile(path)
        pooledLines.extend(lines)
    commonData = header + pooledLines
    process(dataBasename, 'common', commonData, stayWeight, treeOptions,
            useWeka)

    # one tree per outlier, named after the file's base name
    for path in outlierFiles:
        student = os.path.splitext(os.path.basename(path))[0]
        header, lines = readFile(path)
        process(dataBasename, 'outlier-%s' % student, header + lines,
                stayWeight, treeOptions, useWeka)
def main(targetDir,sourceDir,inPrefix,outPrefix,studentInd,origNumSource):
    """Re-run boosting with rescaled weights taken from existing desc files.

    Students come from ``getUniqueStudents('data/newStudents29.txt')``; when
    ``studentInd`` is not None only the student at that position is handled.
    ``parseDesc`` is applied to ``<targetDir>/desc/<inPrefix>-<student>.desc``
    and each resulting value is multiplied by
    ``origNumSource / (4 * 50000.0)`` before being appended (after its key)
    to the ``boostWeights`` command line.  Stdout of the command goes to
    ``<targetDir>/desc/<outPrefix>-<student>.desc``; ``extractTree`` then
    writes ``<targetDir>/weighted/<outPrefix>-<student>.weka``.

    A student whose desc file parses to None is reported on stderr and
    skipped.  Raises ``subprocess.CalledProcessError`` if the command fails.
    """
    # example dirs: sourceDir = 'data/dt/perturbed-noop0.1-50000',
    #               targetDir = 'data/dt/perturbed-noop0.1-1000'
    inFilename = inPrefix + '-%s.desc'
    outDesc = outPrefix + '-%s.desc'
    outTree = outPrefix + '-%s.weka'
    studentFile = 'data/newStudents29.txt'
    factor = origNumSource / (4 * 50000.0)
    students = getUniqueStudents(studentFile)
    for i,student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        filename = os.path.join(targetDir,'desc',inFilename % student)
        res = parseDesc(filename)
        if res is None:
            sys.stderr.write('SKIPPING %s\n' % (student,))
            continue
        cmd = ['bin/%s/boostWeights' % getArch(),student,targetDir,sourceDir]
        for k,v in res.iteritems():
            cmd += [k,str(v * factor)]
        outDescPath = os.path.join(targetDir,'desc',outDesc % student)
        outTreePath = os.path.join(targetDir,'weighted',outTree % student)
        # the context manager guarantees the desc file is closed before
        # extractTree opens it (the original never closed the handle itself)
        with open(outDescPath,'w') as descOut:
            subprocess.check_call(cmd,stdout=descOut)
        extractTree(outDescPath,outTreePath)
def main(basename, dataDir, stayWeight, treeOptions, options):
    """Create a decision tree from each student's own training data.

    Iterates over ``getUniqueStudents()``.  When ``options.studentInd`` is
    not None only the student at that position is handled.  A banner with
    the student name is printed, then ``createDT`` is called on that
    student's TRAIN file under the name ``only-<student>``, forwarding the
    tree-related settings read from ``options``.
    """
    banner = '-------------------'
    for position, student in enumerate(getUniqueStudents()):
        selected = (options.studentInd is None) or (position == options.studentInd)
        if not selected:
            continue

        print(banner)
        print(student)
        print(banner)

        trainFile = getFilename(dataDir, student, TRAIN)
        treeName = 'only-%s' % student
        createDT(
            trainFile,
            basename,
            treeName,
            stayWeight,
            treeOptions,
            options.useWeka,
            options.numInstances,
            options.randomTree,
            options.numRandomTrees,
            options.featureFrac,
            options.resampleFrac,
            options.randomTreeInd,
        )
def main(targetDir,sourceDir,destDir,prefix,maxNumBoosts,numTargetInstances,studentInd):
    """Run ``boostGivenOrder`` per student and extract the resulting tree.

    Students come from ``getUniqueStudents(studentFile)`` (``studentFile`` is
    a module-level name).  When ``studentInd`` is not None only the student
    at that position is processed.  The command receives the student, the
    order file ``<destDir>/desc/combined-<student>.desc``, both data dirs,
    ``maxNumBoosts`` and ``numTargetInstances``.  Its stdout is stored in
    ``<destDir>/desc/<prefix>-<student>.desc`` and passed to ``extractTree``
    to write ``<destDir>/weighted/<prefix>-<student>.weka``.

    Raises ``subprocess.CalledProcessError`` if the command fails.
    """
    students = getUniqueStudents(studentFile)
    for i,student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        print('-------------------')
        print(student)
        print('-------------------')
        orderFile = os.path.join(destDir,'desc','combined-%s.desc' % student)
        cmd = ['bin/%s/boostGivenOrder' % getArch(),student,orderFile,targetDir,sourceDir,str(maxNumBoosts),str(numTargetInstances)]
        descFile = os.path.join(destDir,'desc',prefix + '-' + student + '.desc')
        resultFile = os.path.join(destDir,'weighted',prefix + '-' + student + '.weka')
        # close the output file explicitly before extractTree reads it
        with open(descFile,'w') as descOut:
            subprocess.check_call(cmd,stdout=descOut)
        extractTree(descFile,resultFile)
def main(targetDir,sourceDir,destDir,prefix,studentInd):
    """Run ``boostTest`` per student and extract the resulting tree.

    Relies on the module-level names ``studentFile`` and ``sourceData``.
    ``prefix`` is extended with ``-using<N>Source`` where ``<N>`` is
    ``sourceData`` when it is positive and ``All`` otherwise.  When
    ``studentInd`` is not None only the student at that position is
    processed.  Stdout of the command is written to
    ``<destDir>/desc/<prefix>-<student>.desc``, from which ``extractTree``
    produces ``<destDir>/weighted/<prefix>-<student>.weka``.

    Raises ``subprocess.CalledProcessError`` if the command fails.
    """
    students = getUniqueStudents(studentFile)
    prefix = prefix + '-using%sSource' % (sourceData if sourceData > 0 else 'All')
    for i,student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        print('-------------------')
        print(student)
        print('-------------------')
        cmd = ['bin/%s/boostTest' % getArch(),student,studentFile,targetDir,sourceDir,str(sourceData)]
        descFile = os.path.join(destDir,'desc',prefix + '-' + student + '.desc')
        resultFile = os.path.join(destDir,'weighted',prefix + '-' + student + '.weka')
        # close the output file explicitly before extractTree reads it
        with open(descFile,'w') as descOut:
            subprocess.check_call(cmd,stdout=descOut)
        extractTree(descFile,resultFile)
def main(targetDir,sourceDir,destDir,numTargetInstances,jobInd):
    """Run ``boostIndependent`` for ordered (target, source) student pairs.

    Pairs of distinct students from ``getUniqueStudents(studentFile)``
    (``studentFile`` is a module-level name) are numbered from 0 in nested
    iteration order, target outermost.  When ``jobInd`` is not None only the
    pair with that number is run.  Stdout of each run is written to
    ``<destDir>/boostIndependent/<target>-<source>.desc``.

    Raises ``subprocess.CalledProcessError`` if the command fails.
    """
    students = getUniqueStudents(studentFile)
    i = -1
    for targetStudent in students:
        for sourceStudent in students:
            if (targetStudent == sourceStudent):
                continue
            # pairs are counted only after self-pairs have been skipped
            i += 1
            if (jobInd is not None) and (i != jobInd):
                continue
            print('%s %s' % (targetStudent, sourceStudent))
            cmd = ['bin/%s/boostIndependent' % getArch(),targetStudent,sourceStudent,targetDir,sourceDir,str(numTargetInstances)]
            descFile = os.path.join(destDir,'boostIndependent',targetStudent + '-' + sourceStudent + '.desc')
            # the handle was previously left for the garbage collector to close
            with open(descFile,'w') as descOut:
                subprocess.check_call(cmd,stdout=descOut)
def main(basename, dataDir, stayWeight, treeOptions, options):
    """Train a per-student decision tree for every (or one selected) student.

    The student list is ``getUniqueStudents()``.  A non-None
    ``options.studentInd`` restricts the run to the student at that index.
    For each handled student a three-line banner is printed and ``createDT``
    is invoked on the student's TRAIN file with the tree name
    ``only-<student>`` and the sampling/tree settings stored on ``options``.
    """
    rule = '-------------------'
    students = getUniqueStudents()
    for index in range(len(students)):
        student = students[index]
        if options.studentInd is not None:
            if index != options.studentInd:
                continue

        for text in (rule, student, rule):
            print(text)

        dataFile = getFilename(dataDir, student, TRAIN)
        createDT(dataFile, basename, 'only-%s' % student, stayWeight,
                 treeOptions, options.useWeka, options.numInstances,
                 options.randomTree, options.numRandomTrees,
                 options.featureFrac, options.resampleFrac,
                 options.randomTreeInd)
def main(targetDir, prefix, jobInd):
    """Rank source students per target student and write a combined desc file.

    For each target student (or only the one at position ``jobInd`` when it
    is not None) the last line of every
    ``<targetDir>/boostIndependent/<target>-<source>.desc`` file is read as
    ``"<value> <student>"``.  Sources with a value above 1e-10 are kept in
    descending order of value.  The best ``TOP_NUM`` (30) are written, one
    ``"<value> <student>"`` per line followed by a ``TwoStageTrAdaBoost``
    line, to ``<targetDir>/desc/<prefix>-<target>.desc``.

    Exits with status 1 if the output file already exists.
    """
    TOP_NUM = 30
    students = getUniqueStudents(studentFile)
    # The job counter must not share a name with the insertion index used
    # below: previously both were ``i``, so after handling the requested job
    # the counter was overwritten and a later student could match ``jobInd``
    # again and be processed by mistake.
    for jobNum, targetStudent in enumerate(students):
        if (jobInd is not None) and (jobNum != jobInd):
            continue
        orderedVals = []
        orderedStudents = []
        for sourceStudent in students:
            if (targetStudent == sourceStudent):
                continue
            descFile = os.path.join(
                targetDir, 'boostIndependent',
                targetStudent + '-' + sourceStudent + '.desc')
            with open(descFile, 'r') as f:
                line = f.readlines()[-1].strip()
            val, student = line.split(' ')
            val = float(val)
            assert (student == sourceStudent)
            if val > 1e-10:
                # insert before the first strictly smaller value; append
                # when there is none, keeping the lists sorted descending
                pos = len(orderedVals)
                for k, v in enumerate(orderedVals):
                    if val > v:
                        pos = k
                        break
                orderedVals.insert(pos, val)
                orderedStudents.insert(pos, sourceStudent)
        print(targetStudent)
        print('%s %s' % (' ', orderedVals))
        print('%s %s' % (' ', orderedStudents))
        print('%s %s' % (' ', len(orderedVals)))
        print('\n')
        outFile = os.path.join(targetDir, 'desc',
                               prefix + '-' + targetStudent + '.desc')
        # never overwrite an existing result
        if os.path.exists(outFile):
            sys.stderr.write('FILE EXISTS: %s\n' % (outFile,))
            sys.exit(1)
        with open(outFile, 'w') as f:
            for v, s in zip(orderedVals[:TOP_NUM], orderedStudents[:TOP_NUM]):
                f.write('%s %s\n' % (v, s))
            f.write('TwoStageTrAdaBoost\n')
def main(targetDir,prefix,jobInd):
    """Combine the per-pair boostIndependent results into one ranking per target.

    Students come from ``getUniqueStudents(studentFile)`` (module-level
    name).  When ``jobInd`` is not None only the target student at that
    position is handled.  The final line of each
    ``<targetDir>/boostIndependent/<target>-<source>.desc`` must read
    ``"<value> <source>"``; values above 1e-10 are collected in descending
    order.  At most ``TOP_NUM`` (30) entries plus a trailing
    ``TwoStageTrAdaBoost`` line are written to
    ``<targetDir>/desc/<prefix>-<target>.desc``.

    Exits with status 1 if the output file already exists.
    """
    TOP_NUM = 30
    students = getUniqueStudents(studentFile)
    # ``targetInd`` is kept separate from the insertion index ``slot``.  The
    # original used ``i`` for both, which corrupted the job counter after the
    # first processed student and could select further students when
    # ``jobInd`` was given.
    for targetInd,targetStudent in enumerate(students):
        if (jobInd is not None) and (targetInd != jobInd):
            continue
        orderedVals = []
        orderedStudents = []
        for sourceStudent in students:
            if (targetStudent == sourceStudent):
                continue
            descFile = os.path.join(targetDir,'boostIndependent',targetStudent + '-' + sourceStudent + '.desc')
            with open(descFile,'r') as f:
                line = f.readlines()[-1].strip()
            val,student = line.split(' ')
            val = float(val)
            assert(student == sourceStudent)
            if val > 1e-10:
                # position of the first strictly smaller value, else the end
                slot = len(orderedVals)
                for k,v in enumerate(orderedVals):
                    if val > v:
                        slot = k
                        break
                orderedVals.insert(slot,val)
                orderedStudents.insert(slot,sourceStudent)
        print(targetStudent)
        print('%s %s' % (' ',orderedVals))
        print('%s %s' % (' ',orderedStudents))
        print('%s %s' % (' ',len(orderedVals)))
        print('\n')
        outFile = os.path.join(targetDir,'desc',prefix + '-' + targetStudent + '.desc')
        # refuse to clobber an earlier result
        if os.path.exists(outFile):
            sys.stderr.write('FILE EXISTS: %s\n' % (outFile,))
            sys.exit(1)
        with open(outFile,'w') as f:
            for v,s in zip(orderedVals[:TOP_NUM],orderedStudents[:TOP_NUM]):
                f.write('%s %s\n' % (v,s))
            f.write('TwoStageTrAdaBoost\n')
def main(targetDir, sourceDir, destDir, prefix, studentInd):
    """Invoke ``boostTest`` for each student and extract the weighted tree.

    Uses the module-level names ``studentFile`` and ``sourceData``.  The
    output prefix becomes ``<prefix>-using<N>Source`` with ``<N>`` equal to
    ``sourceData`` if it is positive, else ``All``.  A non-None
    ``studentInd`` limits the run to the student at that position.  The
    command's stdout is saved as ``<destDir>/desc/<prefix>-<student>.desc``
    and ``extractTree`` turns it into
    ``<destDir>/weighted/<prefix>-<student>.weka``.

    Raises ``subprocess.CalledProcessError`` if the command fails.
    """
    students = getUniqueStudents(studentFile)
    prefix = prefix + '-using%sSource' % (sourceData
                                          if sourceData > 0 else 'All')
    for i, student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        print('-------------------')
        print(student)
        print('-------------------')
        cmd = [
            'bin/%s/boostTest' % getArch(), student, studentFile, targetDir,
            sourceDir,
            str(sourceData)
        ]
        descFile = os.path.join(destDir, 'desc',
                                prefix + '-' + student + '.desc')
        resultFile = os.path.join(destDir, 'weighted',
                                  prefix + '-' + student + '.weka')
        # ensure the desc file is closed before extractTree consumes it
        with open(descFile, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(descFile, resultFile)
def main(targetDir, sourceDir, destDir, prefix, maxNumBoosts,
         numTargetInstances, studentInd):
    """Invoke ``boostGivenOrder`` for each student and extract the weighted tree.

    The student list is ``getUniqueStudents(studentFile)`` (module-level
    name); a non-None ``studentInd`` limits the run to the student at that
    position.  The command is given the student, the order file
    ``<destDir>/desc/combined-<student>.desc``, ``targetDir``, ``sourceDir``,
    ``maxNumBoosts`` and ``numTargetInstances``.  Its stdout is saved as
    ``<destDir>/desc/<prefix>-<student>.desc`` and ``extractTree`` turns it
    into ``<destDir>/weighted/<prefix>-<student>.weka``.

    Raises ``subprocess.CalledProcessError`` if the command fails.
    """
    students = getUniqueStudents(studentFile)
    for i, student in enumerate(students):
        if (studentInd is not None) and (i != studentInd):
            continue
        print('-------------------')
        print(student)
        print('-------------------')
        orderFile = os.path.join(destDir, 'desc',
                                 'combined-%s.desc' % student)
        cmd = [
            'bin/%s/boostGivenOrder' % getArch(), student, orderFile,
            targetDir, sourceDir,
            str(maxNumBoosts),
            str(numTargetInstances)
        ]
        descFile = os.path.join(destDir, 'desc',
                                prefix + '-' + student + '.desc')
        resultFile = os.path.join(destDir, 'weighted',
                                  prefix + '-' + student + '.weka')
        # ensure the desc file is closed before extractTree consumes it
        with open(descFile, 'w') as descOut:
            subprocess.check_call(cmd, stdout=descOut)
        extractTree(descFile, resultFile)
def main(dataBasename, stayWeight, outliers, treeOptions, useWeka):
    """Process the pooled "common" students and each outlier student separately.

    Input files are ``data/dt-train/<dataBasename>/<student>.arff`` for every
    student from ``getUniqueStudents()``.  Files of students named in
    ``outliers`` are each passed to ``process`` as ``outlier-<student>``;
    the lines of all other files are concatenated behind a single header and
    passed to ``process`` as ``common``.  A missing input file is reported on
    stderr and ends the program with status 1 before any processing starts.
    """
    students = getUniqueStudents()
    dataDir = os.path.join('data', 'dt-train', dataBasename)

    def arffPath(student):
        return os.path.join(dataDir, '%s.arff' % student)

    # partition the files, preserving student order within each group
    outlierFiles = [arffPath(s) for s in students if s in outliers]
    commonFiles = [arffPath(s) for s in students if s not in outliers]
    # each outlier must correspond to exactly one known student
    assert (len(outlierFiles) == len(outliers))

    # check that everything exists before doing any work
    for studentFile in outlierFiles + commonFiles:
        if not os.path.exists(studentFile):
            sys.stderr.write('Student file missing: %s\n' % studentFile)
            sys.exit(1)

    # common tree: header of the last file read + lines of all common files
    # NOTE: with no common files ``header`` is never assigned
    body = []
    for filename in commonFiles:
        header, lines = readFile(filename)
        body.extend(lines)
    commonData = header + body
    process(dataBasename, 'common', commonData, stayWeight, treeOptions,
            useWeka)

    # outlier trees: one per file, labelled with the file's stem
    for filename in outlierFiles:
        stem, _ext = os.path.splitext(os.path.basename(filename))
        header, lines = readFile(filename)
        process(dataBasename, 'outlier-%s' % stem, header + lines,
                stayWeight, treeOptions, useWeka)
def parseArgs(args,
              parserOptions=[],
              numAdditionalArgs=0,
              additionalArgsString=''):
    """Parse the common ``classifier student`` command line.

    args                 -- list of command-line tokens (without program name)
    parserOptions        -- extra optparse ``Option`` objects to register
                            (only iterated, never modified)
    numAdditionalArgs    -- number of positional arguments expected after
                            ``classifier`` and ``student``
    additionalArgsString -- text appended to the usage line for those extras

    Returns ``(options, extraArgs)`` where ``extraArgs`` are the positional
    arguments after the first two.  Besides the parsed flags, ``options``
    gains: ``base``, ``targetBase``, ``sourceBase`` (only when numSource is
    non-zero), ``partialInd`` (only when a numeric student is given together
    with ``--partialMax`` and without ``--ignorePartialMax``), ``student``,
    ``studentInd``, ``otherStudents``, ``classifier``, ``name``,
    ``saveName``, ``saveConfigFilename`` and ``saveFile``.

    ``student`` may be a student name or an integer index into
    ``getUniqueStudents()``.

    Exits with status 1 on a wrong argument count or a missing base/fallback
    learner, and with status 2 on a missing data directory or an unknown
    student.  Raises AssertionError when ``-t``/``-s`` are not given or when
    a base/fallback learner itself requires a learner.
    """
    from optparse import OptionParser
    parser = OptionParser('%prog [options] classifier student ' +
                          additionalArgsString)
    parser.add_option('-b', '--baseLearner', action='store',
                      dest='baseLearner', type='str', default=None,
                      help='use the classifier as the base learner')
    parser.add_option('-f', '--fallbackLearner', action='store',
                      dest='fallbackLearner', type='str', default=None,
                      help='use the classifier file as the fallback learner')
    parser.add_option('-s', '--source', action='store', dest='numSource',
                      type='int', default=None, help='num source instances')
    parser.add_option('-t', '--target', action='store', dest='numTarget',
                      type='int', default=None, help='num target instances')
    parser.add_option('--no-source', action='store_false', dest='useSource',
                      default=True,
                      help='don\'t use the source data, but use the name')
    parser.add_option(
        '--debug', action='store_true', dest='debug', default=False,
        help='print the cmd and don\'t remove the combined config')
    parser.add_option('--fracSourceData', action='store',
                      dest='fracSourceData', default=None,
                      help='frac of source data to use')
    parser.add_option('--partialMax', action='store', dest='partialMax',
                      type='int', default=None, help='num partial runs')
    parser.add_option('--no-save', action='store_false', dest='save',
                      default=True, help='disable saving of the classifier')
    parser.add_option('--catchOutput', action='store_true',
                      dest='catchOutput', default=False,
                      help='catch the output of the training')
    parser.add_option('--ignorePartialMax', action='store_true',
                      dest='ignorePartialMax', default=False,
                      help='ignore partialmax')
    for option in parserOptions:
        parser.add_option(option)

    options, args = parser.parse_args(args)
    numExpectedArgs = 2
    totalExpected = numExpectedArgs + numAdditionalArgs
    if len(args) != totalExpected:
        # report the count that is actually enforced (including extras)
        sys.stderr.write(
            'Incorrect number of arguments expected %i but got %i\n' %
            (totalExpected, len(args)))
        # parse_args(['--help']) would exit with status 0 by itself and make
        # the failure status below unreachable, so print the help directly
        parser.print_help()
        sys.exit(1)
    classifierType, student = args[:numExpectedArgs]

    # num training should be an int, and the filename should exist
    options.base = 'studentsNew29-unperturbed-%i'
    assert (options.numTarget is not None), 'numTarget unspecified'
    options.targetBase = options.base % options.numTarget
    if not (baseExists(options.targetBase)):
        sys.stderr.write('Dir for numTarget doesn\'t exist at: %s\n' %
                         (options.targetBase,))
        sys.exit(2)
    assert (options.numSource is not None), 'numSource unspecified'
    if options.numSource != 0:
        options.sourceBase = options.base % options.numSource
        if not (baseExists(options.sourceBase)):
            sys.stderr.write('Dir for numSource doesn\'t exist at: %s\n' %
                             (options.sourceBase,))
            sys.exit(2)

    # get students and check provided student
    students = getUniqueStudents()
    try:
        ind = int(student)
        if (options.partialMax is not None) and not options.ignorePartialMax:
            # the numeric argument encodes both the student index and the
            # partial-run index
            options.partialInd = ind % options.partialMax
            ind //= options.partialMax
            print('ind: %i partialInd: %i' % (ind, options.partialInd))
        student = students[ind]
    except (ValueError, IndexError):
        # not a usable index: it must then be a literal student name
        if student not in students:
            sys.stderr.write('Unknown student: %s\n' % (student,))
            sys.exit(2)
    options.student = student
    options.studentInd = students.index(student)
    options.otherStudents = list(students)
    options.otherStudents.remove(student)

    # check provided classifiers
    options.classifier = getClassifier(classifierType)
    if (options.classifier[1] == True) and (options.baseLearner is None):
        sys.stderr.write('Missing base learner: %s\n' % (classifierType,))
        sys.exit(1)
    if (options.classifier[2] == True) and (options.fallbackLearner is None):
        sys.stderr.write('Missing fallback learner: %s\n' % (classifierType,))
        sys.exit(1)
    if options.baseLearner is not None:
        temp = getClassifier(options.baseLearner)
        assert ((not temp[1]) and (not temp[2]))
    if options.fallbackLearner is not None:
        temp = getClassifier(options.fallbackLearner)
        assert ((not temp[1]) and (not temp[2]))

    # get name; the partial variants share the name of their full variant
    name = options.classifier[0]
    if name == 'trbagg-partialLoad':
        name = 'trbagg'
    if name == 'twostagetradaboost-partial':
        name = 'twostagetradaboost'
    if options.baseLearner is not None:
        name += '_base' + options.baseLearner
    if options.fallbackLearner is not None:
        name += '_fb' + options.fallbackLearner
    options.name = name
    options.saveName = name + '-target%i-source%i' % (options.numTarget,
                                                      options.numSource)

    # get the arguments
    options.saveConfigFilename = getConfigFilename(
        'saved/' + options.saveName + '-' + student)
    options.saveFile = getSaveFilename(options)
    return options, args[numExpectedArgs:]
#!/usr/bin/env python
# Report, per student, which of the numbered ``.weka`` job outputs are missing.
#
# usage: <script> directory prefix
#
# For every student from getUniqueStudents() the files
# ``<directory>/<prefix>-<student>-<job>.weka`` for job 0..999 are checked.
# Students with missing jobs are printed followed by the missing job numbers.
# Exit status 2: directory does not exist; 3: a student has no files at all.
import os
import sys

from common import getUniqueStudents

directory = sys.argv[1]
if not os.path.exists(directory):
    print('Directory does not exist')
    sys.exit(2)
prefix = sys.argv[2]
numJobs = 1000
students = getUniqueStudents()

for student in students:
    missing = [
        job for job in range(numJobs)
        if not os.path.exists(
            os.path.join(directory, '%s-%s-%i.weka' % (prefix, student, job)))
    ]
    if not missing:
        continue
    # nothing at all for this student: the inputs are most likely wrong
    if len(missing) == numJobs:
        print('Probably a problem with your directory or prefix')
        sys.exit(3)
    print('%s %s' % (student, ' '.join(str(job) for job in missing)))
#!/usr/bin/env python
# List the job numbers whose ``.weka`` result file is absent, student by student.
#
# Arguments: directory prefix
#
# Expected files are ``<directory>/<prefix>-<student>-<job>.weka`` with job
# ranging over 0..999 and student over getUniqueStudents().
# Exits with 2 if the directory is missing and with 3 as soon as a student
# turns out to have none of the expected files.
import sys
import os
from common import getUniqueStudents

directory = sys.argv[1]
if not os.path.exists(directory):
    print('Directory does not exist')
    sys.exit(2)
prefix = sys.argv[2]
numJobs = 1000
students = getUniqueStudents()


def _resultPath(student, job):
    # location of one job's output for one student
    return os.path.join(directory, '%s-%s-%i.weka' % (prefix, student, job))


for student in students:
    missing = []
    for job in range(numJobs):
        if os.path.exists(_resultPath(student, job)):
            continue
        missing.append(job)
    absent = len(missing)
    if absent == numJobs:
        # every single job missing points at a bad directory/prefix
        print('Probably a problem with your directory or prefix')
        sys.exit(3)
    if absent != 0:
        print('%s %s' % (student, ' '.join(map(str, missing))))
def main(args=sys.argv[1:]):
    """Build TrBagg trees from resampled target/source student data.

    args -- ``[--balanced] numTarget numSource jobStart [numJobs]``

    Job numbers ``jobStart * numJobs .. jobStart * numJobs + numJobs - 1``
    are processed.  Each job number maps to a student index
    (``jobNum // 1000``) and a classifier index (``jobNum % 1000``); the
    function returns as soon as the student index runs past the student
    list.  For each job, ``numTarget`` draws are distributed over the
    students according to ``calcBalancedProbs``/``calcProbs``, every
    student's TRAIN file is resampled with the resulting proportion, the
    pieces are concatenated into a temporary ARFF file (the header is kept
    only from the first student) and ``makeTree`` is run on it.  The
    temporary files and the tree's DESC file are removed afterwards.

    Prints the usage on stderr and exits with status 1 when the number of
    positional arguments is not 3 or 4.
    """
    usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
    # Work on a copy: removing '--balanced' in place would alter the
    # caller's list, and for the default value the list shared by all calls.
    args = list(args)
    balanced = False
    if '--balanced' in args:
        args.remove('--balanced')
        balanced = True
    if (len(args) < 3) or (len(args) > 4):
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    numClassifiers = 1000
    numTarget = int(args[0])
    numSource = int(args[1])
    jobStart = int(args[2])
    if len(args) >= 4:
        numJobs = int(args[3])
    else:
        numJobs = 1
    jobStart *= numJobs
    base = 'studentsNew29-unperturbed-%i'
    outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (
        numTarget, numSource)
    if not os.path.exists('data/dt/' + outputBase + '/desc'):
        os.makedirs('data/dt/' + outputBase + '/desc')
    if not os.path.exists('data/dt/' + outputBase + '/weighted'):
        os.makedirs('data/dt/' + outputBase + '/weighted')
    for jobOffset in range(numJobs):
        jobNum = jobStart + jobOffset
        studentInd = jobNum // numClassifiers
        classifierInd = jobNum % numClassifiers
        print('job,student,classifier: %s %s %s' %
              (jobNum, studentInd, classifierInd))
        students = getUniqueStudents()
        if studentInd >= len(students):
            return
        if balanced:
            probs = calcBalancedProbs(students, numSource, numTarget,
                                      studentInd)
        else:
            probs = calcProbs(students, numSource, numTarget, studentInd)
        print('probs: %s %s' % (probs, sum(probs)))
        # draw numTarget samples from the categorical distribution ``probs``
        counts = [0 for s in students]
        for i in range(numTarget):
            r = random.random()
            total = 0
            for j, p in enumerate(probs):
                total += p
                if r < total:
                    ind = j
                    break
            else:
                # rounding left r above the accumulated sum: use last entry
                ind = len(probs) - 1
            counts[ind] += 1
        # eps keeps every proportion strictly positive
        eps = 1e-10
        props = [
            eps + float(c) / (numTarget if i == studentInd else numSource)
            for i, c in enumerate(counts)
        ]
        # Paths are registered for cleanup only once they are known, so a
        # failure part-way through cannot make the cleanup itself fail on an
        # unassigned name and hide the original error.
        cleanup = []
        try:
            arffFilename = makeTemp('.arff')
            cleanup.append(arffFilename)
            tempFile = makeTemp('.arff')
            cleanup.append(tempFile)
            with open(arffFilename, 'w') as arffFile:
                for i, (student, prop) in enumerate(zip(students, props)):
                    inFile = getFilename(
                        base % (numTarget if student == students[studentInd]
                                else numSource), student, TRAIN)
                    print('resampling %s' % (student,))
                    resample(inFile, tempFile, prop)
                    with open(tempFile, 'r') as f:
                        if i != 0:
                            # skip the ARFF header of all but the first file
                            for line in f:
                                if line.strip() == '@data':
                                    break
                        for line in f:
                            arffFile.write(line)
            print('removing trial step')
            if balanced:
                name = 'trBagg-balanced-%s-%i' % (students[studentInd],
                                                  classifierInd)
            else:
                name = 'trBagg-%s-%i' % (students[studentInd], classifierInd)
            # presumably getFilename only builds the path -- TODO confirm
            cleanup.append(getFilename(outputBase, name, DESC))
            makeTree(arffFilename, True, None, outputBase, name, [], False,
                     1.0)
        finally:
            for path in cleanup:
                if os.path.exists(path):
                    os.remove(path)
def parseArgs(args, parserOptions=[], numAdditionalArgs=0, additionalArgsString=""):
    """Parse and validate the shared ``classifier student`` command line.

    args                 -- command-line tokens, program name excluded
    parserOptions        -- additional optparse ``Option`` objects to add
                            (read-only here)
    numAdditionalArgs    -- how many positional arguments must follow
                            ``classifier`` and ``student``
    additionalArgsString -- usage text describing those extra arguments

    Returns ``(options, extraArgs)``; ``extraArgs`` holds the positional
    arguments beyond the first two.  ``options`` is the optparse values
    object extended with ``base``, ``targetBase``, ``sourceBase`` (only for
    non-zero numSource), ``partialInd`` (only for a numeric student with
    ``--partialMax`` active), ``student``, ``studentInd``,
    ``otherStudents``, ``classifier``, ``name``, ``saveName``,
    ``saveConfigFilename`` and ``saveFile``.

    The ``student`` argument is either a name or an integer index into
    ``getUniqueStudents()``.

    Exit status 1: wrong argument count or missing base/fallback learner.
    Exit status 2: data directory missing or student unknown.
    AssertionError: ``-t``/``-s`` omitted, or a base/fallback learner that
    itself needs a learner.
    """
    from optparse import OptionParser

    parser = OptionParser("%prog [options] classifier student " + additionalArgsString)
    parser.add_option(
        "-b",
        "--baseLearner",
        action="store",
        dest="baseLearner",
        type="str",
        default=None,
        help="use the classifier as the base learner",
    )
    parser.add_option(
        "-f",
        "--fallbackLearner",
        action="store",
        dest="fallbackLearner",
        type="str",
        default=None,
        help="use the classifier file as the fallback learner",
    )
    parser.add_option(
        "-s", "--source", action="store", dest="numSource", type="int", default=None, help="num source instances"
    )
    parser.add_option(
        "-t", "--target", action="store", dest="numTarget", type="int", default=None, help="num target instances"
    )
    parser.add_option(
        "--no-source",
        action="store_false",
        dest="useSource",
        default=True,
        help="don't use the source data, but use the name",
    )
    parser.add_option(
        "--debug",
        action="store_true",
        dest="debug",
        default=False,
        help="print the cmd and don't remove the combined config",
    )
    parser.add_option(
        "--fracSourceData", action="store", dest="fracSourceData", default=None, help="frac of source data to use"
    )
    parser.add_option(
        "--partialMax", action="store", dest="partialMax", type="int", default=None, help="num partial runs"
    )
    parser.add_option(
        "--no-save", action="store_false", dest="save", default=True, help="disable saving of the classifier"
    )
    parser.add_option(
        "--catchOutput", action="store_true", dest="catchOutput", default=False, help="catch the output of the training"
    )
    parser.add_option(
        "--ignorePartialMax", action="store_true", dest="ignorePartialMax", default=False, help="ignore partialmax"
    )
    for option in parserOptions:
        parser.add_option(option)

    options, args = parser.parse_args(args)
    numExpectedArgs = 2
    numRequired = numExpectedArgs + numAdditionalArgs
    if len(args) != numRequired:
        # the message now states the count that is really being checked
        sys.stderr.write("Incorrect number of arguments expected %i but got %i\n" % (numRequired, len(args)))
        # print_help() instead of parse_args(["--help"]): the latter exits
        # with status 0 on its own, so the error status was never reached
        parser.print_help()
        sys.exit(1)
    classifierType, student = args[:numExpectedArgs]

    # num training should be an int, and the filename should exist
    options.base = "studentsNew29-unperturbed-%i"
    assert options.numTarget is not None, "numTarget unspecified"
    options.targetBase = options.base % options.numTarget
    if not (baseExists(options.targetBase)):
        sys.stderr.write("Dir for numTarget doesn't exist at: %s\n" % (options.targetBase,))
        sys.exit(2)
    assert options.numSource is not None, "numSource unspecified"
    if options.numSource != 0:
        options.sourceBase = options.base % options.numSource
        if not (baseExists(options.sourceBase)):
            sys.stderr.write("Dir for numSource doesn't exist at: %s\n" % (options.sourceBase,))
            sys.exit(2)

    # get students and check provided student
    students = getUniqueStudents()
    try:
        ind = int(student)
        if (options.partialMax is not None) and not options.ignorePartialMax:
            # a numeric argument packs student index and partial-run index
            options.partialInd = ind % options.partialMax
            ind //= options.partialMax
            print("ind: %i partialInd: %i" % (ind, options.partialInd))
        student = students[ind]
    except (ValueError, IndexError):
        # neither a number nor a valid index: treat it as a student name
        if student not in students:
            sys.stderr.write("Unknown student: %s\n" % (student,))
            sys.exit(2)
    options.student = student
    options.studentInd = students.index(student)
    options.otherStudents = list(students)
    options.otherStudents.remove(student)

    # check provided classifiers
    options.classifier = getClassifier(classifierType)
    if (options.classifier[1] == True) and (options.baseLearner is None):
        sys.stderr.write("Missing base learner: %s\n" % (classifierType,))
        sys.exit(1)
    if (options.classifier[2] == True) and (options.fallbackLearner is None):
        sys.stderr.write("Missing fallback learner: %s\n" % (classifierType,))
        sys.exit(1)
    if options.baseLearner is not None:
        temp = getClassifier(options.baseLearner)
        assert (not temp[1]) and (not temp[2])
    if options.fallbackLearner is not None:
        temp = getClassifier(options.fallbackLearner)
        assert (not temp[1]) and (not temp[2])

    # get name; partial variants are saved under the full variant's name
    name = options.classifier[0]
    if name == "trbagg-partialLoad":
        name = "trbagg"
    if name == "twostagetradaboost-partial":
        name = "twostagetradaboost"
    if options.baseLearner is not None:
        name += "_base" + options.baseLearner
    if options.fallbackLearner is not None:
        name += "_fb" + options.fallbackLearner
    options.name = name
    options.saveName = name + "-target%i-source%i" % (options.numTarget, options.numSource)

    # get the arguments
    options.saveConfigFilename = getConfigFilename("saved/" + options.saveName + "-" + student)
    options.saveFile = getSaveFilename(options)
    return options, args[numExpectedArgs:]
def main(args=sys.argv[1:]):
    """Create TrBagg classifiers by resampling target and source student data.

    args -- ``[--balanced] numTarget numSource jobStart [numJobs]``

    ``numJobs`` consecutive job numbers starting at ``jobStart * numJobs``
    are handled.  A job number selects a student (``jobNum // 1000``) and a
    classifier index (``jobNum % 1000``); once the student index exceeds the
    student list the function returns.  Per job, ``numTarget`` draws are
    spread over the students using the probabilities from
    ``calcBalancedProbs`` (with ``--balanced``) or ``calcProbs``; each
    student's TRAIN file is resampled accordingly, the results are merged
    into one temporary ARFF file (only the first student's header is kept),
    and ``makeTree`` builds the tree.  Temporary files and the tree's DESC
    file are deleted afterwards.

    Writes the usage to stderr and exits with status 1 unless 3 or 4
    positional arguments are given.
    """
    usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
    # copy first: stripping '--balanced' must not modify the caller's list
    # nor the default list object shared between calls
    args = list(args)
    balanced = False
    if '--balanced' in args:
        args.remove('--balanced')
        balanced = True
    if (len(args) < 3) or (len(args) > 4):
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    numClassifiers = 1000
    numTarget = int(args[0])
    numSource = int(args[1])
    jobStart = int(args[2])
    if len(args) >= 4:
        numJobs = int(args[3])
    else:
        numJobs = 1
    jobStart *= numJobs
    base = 'studentsNew29-unperturbed-%i'
    outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (numTarget,numSource)
    if not os.path.exists('data/dt/' + outputBase + '/desc'):
        os.makedirs('data/dt/' + outputBase + '/desc')
    if not os.path.exists('data/dt/' + outputBase + '/weighted'):
        os.makedirs('data/dt/' + outputBase + '/weighted')
    for jobOffset in range(numJobs):
        jobNum = jobStart + jobOffset
        studentInd = jobNum // numClassifiers
        classifierInd = jobNum % numClassifiers
        print('job,student,classifier: %s %s %s' % (jobNum,studentInd,classifierInd))
        students = getUniqueStudents()
        if studentInd >= len(students):
            return
        if balanced:
            probs = calcBalancedProbs(students,numSource,numTarget,studentInd)
        else:
            probs = calcProbs(students,numSource,numTarget,studentInd)
        print('probs: %s %s' % (probs,sum(probs)))
        # sample numTarget student indices according to ``probs``
        counts = [0 for s in students]
        for i in range(numTarget):
            r = random.random()
            total = 0
            for j,p in enumerate(probs):
                total += p
                if r < total:
                    ind = j
                    break
            else:
                # r was not below the running sum: fall back to last entry
                ind = len(probs) - 1
            counts[ind] += 1
        # eps avoids a proportion of exactly zero
        eps = 1e-10
        props = [eps + float(c) / (numTarget if i == studentInd else numSource) for i,c in enumerate(counts)]
        # Only paths that have actually been determined are cleaned up; the
        # original ``finally`` could hit unassigned names (tempFile, name)
        # and replace the real error with a NameError.
        toRemove = []
        try:
            arffFilename = makeTemp('.arff')
            toRemove.append(arffFilename)
            tempFile = makeTemp('.arff')
            toRemove.append(tempFile)
            with open(arffFilename,'w') as arffFile:
                for i,(student,prop) in enumerate(zip(students,props)):
                    inFile = getFilename(base % (numTarget if student == students[studentInd] else numSource),student,TRAIN)
                    print('resampling %s' % (student,))
                    resample(inFile,tempFile,prop)
                    with open(tempFile,'r') as f:
                        if i != 0:
                            # drop everything up to and including '@data'
                            for line in f:
                                if line.strip() == '@data':
                                    break
                        for line in f:
                            arffFile.write(line)
            print('removing trial step')
            if balanced:
                name = 'trBagg-balanced-%s-%i' % (students[studentInd],classifierInd)
            else:
                name = 'trBagg-%s-%i' % (students[studentInd],classifierInd)
            # presumably getFilename only builds the path -- TODO confirm
            toRemove.append(getFilename(outputBase,name,DESC))
            makeTree(arffFilename,True,None,outputBase,name,[],False,1.0)
        finally:
            for path in toRemove:
                if os.path.exists(path):
                    os.remove(path)