def main(inFile,base,name,stayWeight=None,treeOptions=None,useWeka=False,numInstances=None,randomTree=False,numRandomTrees=10,featureFrac=0.8,resampleFrac=0.5,randomTreeInd=None):
    """Build one tree, or a series of random trees, from the data in inFile.

    The first five lines of inFile are scanned for the text 'Trial'.  If it
    is found, removeTrialStep writes a filtered copy (given numInstances) to
    a temporary file and that copy is used; otherwise inFile is used as is.

    With randomTree set, indices 0..numRandomTrees-1 are visited; for each
    one the data is resampled with resampleFrac into a second temporary file
    and makeTree is called with name + '-<index>'.  A non-None randomTreeInd
    restricts the run to that single index.  Without randomTree, makeTree is
    called once on the data.

    treeOptions defaults to a new empty list on every call.  Every temporary
    file created here is removed on exit, also when an exception propagates.
    """
    # The previous ``treeOptions=[]`` default was a single list shared by all
    # calls; build a fresh one per call instead.
    if treeOptions is None:
        treeOptions = []
    removeFiles = []
    try:
        with open(inFile,'r') as f:
            content = ' '.join([f.readline() for _ in range(5)])
        if 'Trial' in content:
            # Only create the temporary copy when it is actually needed, so
            # no unused temp file is left behind for inputs without 'Trial'.
            print('Removing trial and step features')
            tmpData = makeTemp('.arff')
            removeFiles.append(tmpData)
            removeTrialStep(inFile,tmpData,numInstances)
        else:
            # inFile is used directly and must never be deleted by cleanup.
            tmpData = inFile
        if randomTree:
            tmpDataSampled = makeTemp('-sampled.arff')
            removeFiles.append(tmpDataSampled)
            for i in range(numRandomTrees):
                if (randomTreeInd is not None) and (i != randomTreeInd):
                    continue
                print('*** Random Tree %i' % i)
                resample(tmpData,tmpDataSampled,resampleFrac)
                makeTree(tmpDataSampled,useWeka,stayWeight,base,name + '-%i' % i,treeOptions,randomTree,featureFrac)
        else:
            makeTree(tmpData,useWeka,stayWeight,base,name,treeOptions,randomTree,featureFrac)
        print('Done.')
    finally:
        for f in removeFiles:
            os.remove(f)
def processStudent(base,dest,i,header,studentData,treeOptions,stayWeight,useWeka):
    """Write the given data to a temporary .arff file and run createDT on it.

    header, studentData and i are passed through to writeData; base, dest,
    stayWeight, treeOptions and useWeka are passed through to createDT.  The
    temporary file is removed afterwards, also when either call raises.
    """
    arffPath = makeTemp('.arff')
    try:
        writeData(header,studentData,arffPath,i)
        createDT(arffPath,base,dest,stayWeight,treeOptions,useWeka)
    finally:
        # the temp file is only an intermediate input for createDT
        os.remove(arffPath)
def processStudent(base, dest, i, header, studentData, treeOptions, stayWeight, useWeka):
    """Run createDT on a temporary .arff file produced by writeData.

    The file is filled by writeData(header, studentData, <temp>, i), handed
    to createDT together with base, dest, stayWeight, treeOptions and
    useWeka, and deleted again whether or not those calls succeed.
    """
    scratchArff = makeTemp('.arff')
    try:
        # step 1: dump the data into the scratch file
        writeData(header, studentData, scratchArff, i)
        # step 2: build from the scratch file
        createDT(scratchArff, base, dest, stayWeight, treeOptions, useWeka)
    finally:
        os.remove(scratchArff)
def main(inFile, base, name, stayWeight=None, treeOptions=None, useWeka=False,
         numInstances=None, randomTree=False, numRandomTrees=10,
         featureFrac=0.8, resampleFrac=0.5, randomTreeInd=None):
    """Build one tree, or a series of random trees, from the data in inFile.

    The first five lines of inFile are checked for the text 'Trial'.  When
    it occurs, removeTrialStep(inFile, <temp>, numInstances) produces a
    temporary copy that is used from then on; otherwise inFile itself is
    used.

    If randomTree is true, each index in range(numRandomTrees) (or only
    randomTreeInd when that is not None) gets its own resample() of the data
    with resampleFrac, followed by makeTree() under the name
    name + '-<index>'.  If randomTree is false, makeTree() runs once.

    treeOptions defaults to a fresh empty list per call.  Temporary files
    created by this function are deleted in all cases, including errors.
    """
    # A literal ``[]`` default would be one list object shared by every
    # call; use None as the sentinel and allocate per call.
    if treeOptions is None:
        treeOptions = []
    removeFiles = []
    try:
        with open(inFile, 'r') as f:
            lines = [f.readline() for _ in range(5)]
        content = ' '.join(lines)
        if 'Trial' not in content:
            # Use the input directly.  No temp file is created in this case,
            # so nothing is left behind and inFile is never put on the
            # cleanup list.
            tmpData = inFile
        else:
            print('Removing trial and step features')
            tmpData = makeTemp('.arff')
            removeFiles.append(tmpData)
            removeTrialStep(inFile, tmpData, numInstances)
        if randomTree:
            tmpDataSampled = makeTemp('-sampled.arff')
            removeFiles.append(tmpDataSampled)
            for i in range(numRandomTrees):
                if (randomTreeInd is not None) and (i != randomTreeInd):
                    continue
                print('*** Random Tree %i' % i)
                resample(tmpData, tmpDataSampled, resampleFrac)
                makeTree(tmpDataSampled, useWeka, stayWeight, base,
                         name + '-%i' % i, treeOptions, randomTree,
                         featureFrac)
        else:
            makeTree(tmpData, useWeka, stayWeight, base, name, treeOptions,
                     randomTree, featureFrac)
        print('Done.')
    finally:
        for f in removeFiles:
            os.remove(f)
def main(base, suffix, stayWeight, treeOptions, students, useWeka):
    """Run createDT on the combined data of the given students.

    makeDirs(base, False) is called first, then readStudents(base, students)
    supplies the header and data that writeData puts into a temporary .arff
    file.  The destination name is 'common' followed by suffix, where a
    non-empty suffix gets a leading '-' if it does not already start with
    one.  The temporary file is removed afterwards in every case.
    """
    makeDirs(base, False)
    header, studentData = readStudents(base, students)
    arffPath = makeTemp('.arff')
    try:
        writeData(header, studentData, arffPath)
        needsDash = len(suffix) > 0 and suffix[0] != '-'
        if needsDash:
            suffix = '-' + suffix
        dest = 'common%s' % suffix
        createDT(arffPath, base, dest, stayWeight, treeOptions, useWeka)
    finally:
        os.remove(arffPath)
def main(base,suffix,stayWeight,treeOptions,students,useWeka):
    """Write the students' data to a temp .arff file and run createDT on it.

    Steps: makeDirs(base,False); readStudents(base,students) for the header
    and data; writeData into a temporary file; createDT with destination
    'common' + suffix.  A non-empty suffix that does not begin with '-' is
    prefixed with one.  The temporary file is always deleted at the end.
    """
    makeDirs(base,False)
    header,studentData = readStudents(base,students)
    scratch = makeTemp('.arff')
    try:
        writeData(header,studentData,scratch)
        # normalise the suffix so the result reads 'common-<suffix>'
        if len(suffix) > 0:
            suffix = suffix if suffix[0] == '-' else '-' + suffix
        createDT(scratch,base,'common%s' % suffix,stayWeight,treeOptions,useWeka)
    finally:
        os.remove(scratch)
def combineConfigs(saveFile, options, convertFunction):
    """Assemble a learner config, write two variants of it, return a path.

    The config text for options.classifier[0] is read via getConfigFilename,
    its $(...) placeholders are substituted, and convertFunction is applied.
    The configs named by options.baseLearner and options.fallbackLearner
    (each skipped when None) are then inserted before the final closing
    brace under the keys "baseLearner" and "fallbackLearner".

    The combined text is written to a new makeTemp() file, whose path is
    returned.  A second variant, with a "filename" entry for saveFile added
    after the opening brace and every line mentioning "partialFilename"
    removed, is written to options.saveConfigFilename.
    """
    learnerName = options.classifier[0]
    with open(getConfigFilename(learnerName), 'r') as learnerFile:
        text = learnerFile.read().strip()

    # placeholder substitution
    if learnerName == 'trbagg-partialLoad':
        partialPath = 'data/dt/studentsNew29-unperturbed-transfer/target10000-source50000/weighted/trBagg-%s.weka' % options.student
        text = text.replace('$(PARTIAL_FILENAME)', partialPath)
    if learnerName == 'twostagetradaboost-partial':
        text = text.replace('$(BEST_T)', str(options.partialInd))
    if options.save:
        evaluateBestT = 'false'
    else:
        evaluateBestT = 'true'
    text = text.replace('$(EVALUATE_BEST_T)', evaluateBestT)
    text = convertFunction(text)

    # cut off the last '}' (and a newline directly in front of it) so that
    # further entries can be appended before the object is closed again
    cut = text.rfind('}')
    if text[cut - 1] == '\n':
        cut -= 1
    combined = text[:cut]

    nested = [
        ('"baseLearner":', options.baseLearner),
        ('"fallbackLearner":', options.fallbackLearner),
    ]
    for prefix, conf in nested:
        if conf is not None:
            combined += ',\n ' + prefix + ' '
            with open(getConfigFilename(conf), 'r') as nestedFile:
                for lineNo, line in enumerate(nestedFile):
                    if lineNo == 0:
                        combined += line
                    else:
                        combined += ' ' + line
            combined = combined.strip()
    combined += '\n}'

    filename = makeTemp()
    with open(filename, 'w') as out:
        out.write(combined)

    # for save config
    pos = combined.find('{') + 1
    while combined[pos] in ('\n', '\r', ' ', '\t'):
        pos += 1
    filenameLine = '"filename": "%s",\n ' % saveFile
    saveText = combined[:pos] + filenameLine + combined[pos:]
    saveText = re.sub('.*"partialFilename".*\n', '', saveText)
    with open(options.saveConfigFilename, 'w') as out:
        out.write(saveText)
    return filename
def combineConfigs(saveFile, options, convertFunction):
    """Build the combined learner config and write it out twice.

    Reads the config for options.classifier[0], replaces its $(...)
    placeholders, runs convertFunction over the text, and splices in the
    configs for options.baseLearner / options.fallbackLearner (those that
    are not None) ahead of the final closing brace.

    Returns the path of a makeTemp() file holding the combined text.  As a
    side effect, options.saveConfigFilename receives the same text with a
    "filename" entry for saveFile inserted after the opening brace and with
    all lines containing "partialFilename" dropped.
    """
    learner = options.classifier[0]
    with open(getConfigFilename(learner), "r") as src:
        body = src.read().strip()

    if learner == "trbagg-partialLoad":
        body = body.replace(
            "$(PARTIAL_FILENAME)",
            "data/dt/studentsNew29-unperturbed-transfer/target10000-source50000/weighted/trBagg-%s.weka"
            % options.student,
        )
    if learner == "twostagetradaboost-partial":
        body = body.replace("$(BEST_T)", str(options.partialInd))
    body = body.replace("$(EVALUATE_BEST_T)", "false" if options.save else "true")
    body = convertFunction(body)

    # Re-open the object: drop the last "}" and, if present, the newline
    # immediately before it.
    lastBrace = body.rfind("}")
    endInd = lastBrace - 1 if body[lastBrace - 1] == "\n" else lastBrace
    merged = body[:endInd]

    prefixes = ['"baseLearner":', '"fallbackLearner":']
    subConfigs = [options.baseLearner, options.fallbackLearner]
    for subConfig, prefix in zip(subConfigs, prefixes):
        if subConfig is None:
            continue
        with open(getConfigFilename(subConfig), "r") as src:
            # joining with " " puts the separator in front of every line
            # except the first one
            nestedText = " ".join(src)
        merged = (merged + ",\n " + prefix + " " + nestedText).strip()
    merged += "\n}"

    filename = makeTemp()
    with open(filename, "w") as dst:
        dst.write(merged)

    # for save config
    insertAt = merged.find("{") + 1
    while merged[insertAt] in ["\n", "\r", " ", "\t"]:
        insertAt += 1
    saveConfig = merged[:insertAt] + '"filename": "%s",\n ' % saveFile + merged[insertAt:]
    saveConfig = re.sub('.*"partialFilename".*\n', "", saveConfig)
    with open(options.saveConfigFilename, "w") as dst:
        dst.write(saveConfig)
    return filename
def main(args=sys.argv[1:]):
    """Build TrBagg trees for a range of job numbers.

    Usage: createTrBagg [--balanced] numTarget numSource jobStart [numJobs]

    Jobs jobStart*numJobs .. jobStart*numJobs + numJobs - 1 are processed
    (numJobs defaults to 1).  Each job number is split into a student index
    (job // 1000) and a classifier index (job % 1000).  The function returns
    as soon as a student index is past the end of getUniqueStudents().

    Per job: sampling probabilities come from calcBalancedProbs (with
    --balanced) or calcProbs; numTarget draws are made from them to get a
    count per student; each student's training file is resampled with the
    resulting proportion and the results are concatenated into one temporary
    ARFF file, which is passed to makeTree.  The temporary files and the
    DESC file for the tree name are removed afterwards.

    Prints the usage text to stderr and exits with status 1 when the number
    of positional arguments is not 3 or 4.
    """
    usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
    # Work on a copy: removing '--balanced' must not modify the caller's
    # list, nor the default list, which is built once at definition time.
    args = list(args)
    balanced = False
    if '--balanced' in args:
        args.remove('--balanced')
        balanced = True
    if (len(args) < 3) or (len(args) > 4):
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    numClassifiers = 1000
    numTarget = int(args[0])
    numSource = int(args[1])
    jobStart = int(args[2])
    if len(args) >= 4:
        numJobs = int(args[3])
    else:
        numJobs = 1
    jobStart *= numJobs
    base = 'studentsNew29-unperturbed-%i'
    outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (
        numTarget, numSource)
    for sub in ('/desc', '/weighted'):
        outDir = 'data/dt/' + outputBase + sub
        # Another process may create the directory between an existence
        # check and makedirs; attempt the creation and only fail when the
        # directory is really missing afterwards.
        try:
            os.makedirs(outDir)
        except OSError:
            if not os.path.isdir(outDir):
                raise
    for jobOffset in range(numJobs):
        jobNum = jobStart + jobOffset
        # floor division: the result is used as a list index
        studentInd = jobNum // numClassifiers
        classifierInd = jobNum % numClassifiers
        print('job,student,classifier: %s %s %s' %
              (jobNum, studentInd, classifierInd))
        students = getUniqueStudents()
        if studentInd >= len(students):
            return
        if balanced:
            probs = calcBalancedProbs(students, numSource, numTarget,
                                      studentInd)
        else:
            probs = calcProbs(students, numSource, numTarget, studentInd)
        print('probs: %s %s' % (probs, sum(probs)))
        # Draw numTarget samples from the distribution given by probs.
        counts = [0] * len(students)
        for _ in range(numTarget):
            r = random.random()
            total = 0
            for j, p in enumerate(probs):
                total += p
                if r < total:
                    ind = j
                    break
            else:
                # r was not below the accumulated total: use the last entry
                ind = len(probs) - 1
            counts[ind] += 1
        # eps keeps every proportion strictly positive
        # (presumably so resample never gets exactly 0 -- TODO confirm)
        eps = 1e-10
        props = [
            eps + float(c) / (numTarget if i == studentInd else numSource)
            for i, c in enumerate(counts)
        ]
        # Cleanup state is set up before the try block so that the finally
        # clause never touches an unbound name and cannot hide the original
        # exception behind a NameError.
        tempFiles = []
        name = None
        try:
            arffFilename = makeTemp('.arff')
            tempFiles.append(arffFilename)
            tempFile = makeTemp('.arff')
            tempFiles.append(tempFile)
            with open(arffFilename, 'w') as arffFile:
                for i, (student, prop) in enumerate(zip(students, props)):
                    inFile = getFilename(
                        base % (numTarget if student == students[studentInd]
                                else numSource), student, TRAIN)
                    print('resampling %s' % (student,))
                    resample(inFile, tempFile, prop)
                    with open(tempFile, 'r') as f:
                        if i != 0:
                            # only the first file contributes its header:
                            # skip everything up to and including '@data'
                            for line in f:
                                if line.strip() == '@data':
                                    break
                        for line in f:
                            arffFile.write(line)
            # NOTE: message kept for unchanged output; no trial/step removal
            # is performed at this point.
            print('removing trial step')
            if balanced:
                name = 'trBagg-balanced-%s-%i' % (students[studentInd],
                                                  classifierInd)
            else:
                name = 'trBagg-%s-%i' % (students[studentInd], classifierInd)
            makeTree(arffFilename, True, None, outputBase, name, [], False,
                     1.0)
        finally:
            for path in tempFiles:
                os.remove(path)
            if name is not None:
                # The DESC file may be missing if makeTree failed early.
                descFile = getFilename(outputBase, name, DESC)
                if os.path.exists(descFile):
                    os.remove(descFile)
def main(args=sys.argv[1:]):
    """Build TrBagg trees for a range of job numbers.

    Usage: createTrBagg [--balanced] numTarget numSource jobStart [numJobs]

    numJobs defaults to 1; the jobs handled are jobStart*numJobs up to
    jobStart*numJobs + numJobs - 1.  A job number maps to student index
    job // 1000 and classifier index job % 1000.  Processing stops (the
    function returns) once the student index is not a valid index into
    getUniqueStudents().

    For each job the per-student sampling probabilities are obtained from
    calcBalancedProbs (--balanced) or calcProbs, numTarget draws are turned
    into per-student counts and then proportions, every student's training
    file is resampled with its proportion, and the pieces are concatenated
    into a single temporary ARFF file for makeTree.  Temporary files and the
    DESC file belonging to the tree name are deleted afterwards.

    With fewer than 3 or more than 4 positional arguments the usage text is
    written to stderr and the process exits with status 1.
    """
    usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
    # Copy first: the default list is created once at definition time and a
    # caller's list should not lose its '--balanced' entry either.
    args = list(args)
    balanced = '--balanced' in args
    if balanced:
        args.remove('--balanced')
    if (len(args) < 3) or (len(args) > 4):
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    numClassifiers = 1000
    numTarget = int(args[0])
    numSource = int(args[1])
    jobStart = int(args[2])
    numJobs = int(args[3]) if len(args) >= 4 else 1
    jobStart *= numJobs
    base = 'studentsNew29-unperturbed-%i'
    outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (numTarget,numSource)
    for outDir in ('data/dt/' + outputBase + '/desc','data/dt/' + outputBase + '/weighted'):
        # Create-then-verify instead of check-then-create, so a directory
        # made concurrently by another process is not treated as an error.
        try:
            os.makedirs(outDir)
        except OSError:
            if not os.path.isdir(outDir):
                raise
    for jobOffset in range(numJobs):
        jobNum = jobStart + jobOffset
        # '//' keeps the index an integer regardless of division semantics
        studentInd = jobNum // numClassifiers
        classifierInd = jobNum % numClassifiers
        print('job,student,classifier: %s %s %s' % (jobNum,studentInd,classifierInd))
        students = getUniqueStudents()
        if studentInd >= len(students):
            return
        if balanced:
            probs = calcBalancedProbs(students,numSource,numTarget,studentInd)
        else:
            probs = calcProbs(students,numSource,numTarget,studentInd)
        print('probs: %s %s' % (probs,sum(probs)))
        # numTarget draws from probs, tallied per student
        counts = [0] * len(students)
        for _ in range(numTarget):
            r = random.random()
            total = 0
            for j,p in enumerate(probs):
                total += p
                if r < total:
                    ind = j
                    break
            else:
                # no cumulative total exceeded r: fall back to the last entry
                ind = len(probs) - 1
            counts[ind] += 1
        # eps makes every proportion strictly positive
        # (presumably required by resample -- TODO confirm)
        eps = 1e-10
        props = [eps + float(c) / (numTarget if i == studentInd else numSource) for i,c in enumerate(counts)]
        # Everything the finally clause reads is bound before the try, so a
        # failure part-way through cannot be masked by a NameError there.
        tempFiles = []
        name = None
        try:
            arffFilename = makeTemp('.arff')
            tempFiles.append(arffFilename)
            tempFile = makeTemp('.arff')
            tempFiles.append(tempFile)
            with open(arffFilename,'w') as arffFile:
                for i,(student,prop) in enumerate(zip(students,props)):
                    inFile = getFilename(base % (numTarget if student == students[studentInd] else numSource),student,TRAIN)
                    print('resampling %s' % (student,))
                    resample(inFile,tempFile,prop)
                    with open(tempFile,'r') as f:
                        if i != 0:
                            # keep the header of the first file only: drop
                            # lines up to and including '@data'
                            for line in f:
                                if line.strip() == '@data':
                                    break
                        for line in f:
                            arffFile.write(line)
            # NOTE: output kept as before; nothing is filtered at this step.
            print('removing trial step')
            if balanced:
                name = 'trBagg-balanced-%s-%i' % (students[studentInd],classifierInd)
            else:
                name = 'trBagg-%s-%i' % (students[studentInd],classifierInd)
            makeTree(arffFilename,True,None,outputBase,name,[],False,1.0)
        finally:
            for path in tempFiles:
                os.remove(path)
            if name is not None:
                # makeTree may have failed before writing the DESC file
                descFile = getFilename(outputBase,name,DESC)
                if os.path.exists(descFile):
                    os.remove(descFile)