예제 #1
0
def main(inFile,base,name,stayWeight=None,treeOptions=None,useWeka=False,numInstances=None,randomTree=False,numRandomTrees=10,featureFrac=0.8,resampleFrac=0.5,randomTreeInd=None):
  """Build one tree from inFile, or a series of trees from resampled copies of it.

  If the string 'Trial' occurs in the first five lines of inFile, the data is
  first passed through removeTrialStep(inFile, <temp file>, numInstances) and
  the filtered temporary file is used; otherwise inFile is used unchanged.

  When randomTree is false, makeTree is called once with `name`.  When it is
  true, for every i in range(numRandomTrees) the data is resampled with
  resampleFrac and makeTree is called with the name '<name>-<i>'; if
  randomTreeInd is not None, only the iteration i == randomTreeInd is run.

  stayWeight, base, useWeka, treeOptions and featureFrac are forwarded to
  makeTree as given.  treeOptions defaults to a fresh empty list on each call.
  Temporary files obtained from makeTemp are removed before returning, also
  when an exception is raised.
  """
  # A list literal as default would be shared between calls; create it here.
  if treeOptions is None:
    treeOptions = []

  # create the temporary files we need
  tmpData = makeTemp('.arff')
  # Every scratch path stays in this list so that it is always cleaned up;
  # inFile itself is never added and therefore never deleted.
  removeFiles = [tmpData]
  try:
    with open(inFile,'r') as f:
      lines = [f.readline() for _ in range(5)]
    content = ' '.join(lines)
    if 'Trial' not in content:
      # nothing to filter out: work directly on the caller's file
      dataFile = inFile
    else:
      print('Removing trial and step features')
      removeTrialStep(inFile,tmpData,numInstances)
      dataFile = tmpData
    if randomTree:
      tmpDataSampled = makeTemp('-sampled.arff')
      removeFiles.append(tmpDataSampled)
      for i in range(numRandomTrees):
        if (randomTreeInd is not None) and (i != randomTreeInd):
          continue
        print('*** Random Tree %i' % i)
        resample(dataFile,tmpDataSampled,resampleFrac)
        makeTree(tmpDataSampled,useWeka,stayWeight,base,name + '-%i' % i,treeOptions,randomTree,featureFrac)
    else:
      makeTree(dataFile,useWeka,stayWeight,base,name,treeOptions,randomTree,featureFrac)
    print('Done.')
  finally:
    for path in removeFiles:
      # Whether makeTemp creates the file on disk is not visible from here,
      # so only remove what exists; a missing file must not abort the cleanup
      # of the remaining ones or hide an exception raised above.
      if os.path.exists(path):
        os.remove(path)
예제 #2
0
def processStudent(base,dest,i,header,studentData,treeOptions,stayWeight,useWeka):
  """Dump the student data to a scratch ARFF file and hand it to createDT.

  The scratch path comes from makeTemp('.arff'); writeData fills it from
  header, studentData and i, and createDT receives it together with base,
  dest, stayWeight, treeOptions and useWeka.  The scratch file is removed
  afterwards whether or not those calls succeed.
  """
  arffPath = makeTemp('.arff')
  try:
    writeData(header, studentData, arffPath, i)
    createDT(arffPath, base, dest, stayWeight, treeOptions, useWeka)
  finally:
    # the scratch file is only needed while createDT runs
    os.remove(arffPath)
예제 #3
0
def processStudent(base, dest, i, header, studentData, treeOptions, stayWeight,
                   useWeka):
    """Dump the student data to a scratch ARFF file and hand it to createDT.

    The scratch path comes from makeTemp('.arff'); writeData fills it from
    header, studentData and i, and createDT receives it together with base,
    dest, stayWeight, treeOptions and useWeka.  The scratch file is removed
    afterwards whether or not those calls succeed.
    """
    arffPath = makeTemp('.arff')
    try:
        writeData(header, studentData, arffPath, i)
        createDT(arffPath, base, dest, stayWeight, treeOptions, useWeka)
    finally:
        # the scratch file is only needed while createDT runs
        os.remove(arffPath)
예제 #4
0
def main(inFile,
         base,
         name,
         stayWeight=None,
         treeOptions=None,
         useWeka=False,
         numInstances=None,
         randomTree=False,
         numRandomTrees=10,
         featureFrac=0.8,
         resampleFrac=0.5,
         randomTreeInd=None):
    """Build one tree from inFile, or a series of trees from resampled copies.

    If the string 'Trial' occurs in the first five lines of inFile, the data
    is first passed through removeTrialStep(inFile, <temp file>, numInstances)
    and the filtered temporary file is used; otherwise inFile is used
    unchanged.

    When randomTree is false, makeTree is called once with `name`.  When it is
    true, for every i in range(numRandomTrees) the data is resampled with
    resampleFrac and makeTree is called with the name '<name>-<i>'; if
    randomTreeInd is not None, only the iteration i == randomTreeInd is run.

    stayWeight, base, useWeka, treeOptions and featureFrac are forwarded to
    makeTree as given.  treeOptions defaults to a fresh empty list on each
    call.  Temporary files obtained from makeTemp are removed before
    returning, also when an exception is raised.
    """
    # A list literal as default would be shared between calls; create it here.
    if treeOptions is None:
        treeOptions = []

    # create the temporary files we need
    tmpData = makeTemp('.arff')
    # Every scratch path stays in this list so that it is always cleaned up;
    # inFile itself is never added and therefore never deleted.
    removeFiles = [tmpData]
    try:
        with open(inFile, 'r') as f:
            lines = [f.readline() for _ in range(5)]
        content = ' '.join(lines)
        if 'Trial' not in content:
            # nothing to filter out: work directly on the caller's file
            dataFile = inFile
        else:
            print('Removing trial and step features')
            removeTrialStep(inFile, tmpData, numInstances)
            dataFile = tmpData
        if randomTree:
            tmpDataSampled = makeTemp('-sampled.arff')
            removeFiles.append(tmpDataSampled)
            for i in range(numRandomTrees):
                if (randomTreeInd is not None) and (i != randomTreeInd):
                    continue
                print('*** Random Tree %i' % i)
                resample(dataFile, tmpDataSampled, resampleFrac)
                makeTree(tmpDataSampled, useWeka, stayWeight, base,
                         name + '-%i' % i, treeOptions, randomTree,
                         featureFrac)
        else:
            makeTree(dataFile, useWeka, stayWeight, base, name, treeOptions,
                     randomTree, featureFrac)
        print('Done.')
    finally:
        for path in removeFiles:
            # Whether makeTemp creates the file on disk is not visible from
            # here, so only remove what exists; a missing file must not abort
            # the cleanup of the remaining ones or hide an exception raised
            # above.
            if os.path.exists(path):
                os.remove(path)
예제 #5
0
def main(base, suffix, stayWeight, treeOptions, students, useWeka):
    """Write the data of the given students to one ARFF file and run createDT.

    makeDirs(base, False) is called first, then readStudents(base, students)
    supplies the header and data that writeData stores in a scratch file.
    createDT is invoked on that file with the destination name
    'common<suffix>', where a non-empty suffix is given a leading '-' if it
    does not already have one.  The scratch file is removed afterwards, also
    on error.
    """
    makeDirs(base, False)
    header, studentData = readStudents(base, students)
    arffPath = makeTemp('.arff')
    try:
        writeData(header, studentData, arffPath)
        # make sure a non-empty suffix is separated from 'common' by a dash
        if len(suffix) > 0 and suffix[0] != '-':
            suffix = '-' + suffix
        createDT(arffPath, base, 'common%s' % (suffix), stayWeight,
                 treeOptions, useWeka)
    finally:
        os.remove(arffPath)
예제 #6
0
def main(base,suffix,stayWeight,treeOptions,students,useWeka):
  """Write the data of the given students to one ARFF file and run createDT.

  makeDirs(base, False) is called first, then readStudents(base, students)
  supplies the header and data that writeData stores in a scratch file.
  createDT is invoked on that file with the destination name
  'common<suffix>', where a non-empty suffix is given a leading '-' if it
  does not already have one.  The scratch file is removed afterwards, also
  on error.
  """
  makeDirs(base, False)
  header, studentData = readStudents(base, students)
  arffPath = makeTemp('.arff')
  try:
    writeData(header, studentData, arffPath)
    # make sure a non-empty suffix is separated from 'common' by a dash
    if len(suffix) > 0 and suffix[0] != '-':
      suffix = '-' + suffix
    createDT(arffPath, base, 'common%s' % (suffix), stayWeight, treeOptions, useWeka)
  finally:
    os.remove(arffPath)
예제 #7
0
def combineConfigs(saveFile, options, convertFunction):
    """Assemble a learner configuration and write it to two files.

    The config text of options.classifier[0] is read, its placeholders are
    substituted for the two special learners handled below, and the result is
    passed through convertFunction.  The configs named by options.baseLearner
    and options.fallbackLearner (each skipped when None) are then spliced in
    before the final closing brace under the keys "baseLearner" and
    "fallbackLearner".

    The combined text is written to a new file from makeTemp(), whose name is
    returned.  A second variant, with a "filename" entry for saveFile inserted
    after the opening brace and every line mentioning "partialFilename"
    removed, is written to options.saveConfigFilename.
    """
    learner = options.classifier[0]
    with open(getConfigFilename(learner), 'r') as src:
        text = src.read().strip()

    # learner-specific placeholder substitution
    if learner == 'trbagg-partialLoad':
        partialPath = (
            'data/dt/studentsNew29-unperturbed-transfer/target10000-source50000/weighted/trBagg-%s.weka'
            % options.student)
        text = text.replace('$(PARTIAL_FILENAME)', partialPath)
    if learner == 'twostagetradaboost-partial':
        text = text.replace('$(BEST_T)', str(options.partialInd))
        evaluateBest = 'false' if options.save else 'true'
        text = text.replace('$(EVALUATE_BEST_T)', evaluateBest)
    text = convertFunction(text)

    # cut off the last closing brace (and the newline right before it) so
    # that further entries can be appended
    cut = text.rfind('}')
    if text[cut - 1] == '\n':
        cut -= 1
    combined = text[:cut]

    nested = (
        ('"baseLearner":', options.baseLearner),
        ('"fallbackLearner":', options.fallbackLearner),
    )
    for key, subLearner in nested:
        if subLearner is None:
            continue
        combined += ',\n  ' + key + ' '
        with open(getConfigFilename(subLearner), 'r') as sub:
            # every line after the first gets two extra spaces of indent
            combined += '  '.join(sub)
        combined = combined.strip()
    combined += '\n}'

    tempName = makeTemp()
    with open(tempName, 'w') as out:
        out.write(combined)

    # for save config: insert the filename entry at the first non-whitespace
    # character after the opening brace
    pos = combined.find('{') + 1
    while combined[pos] in ('\n', '\r', ' ', '\t'):
        pos += 1
    filenameLine = '"filename": "%s",\n  ' % saveFile
    saveText = combined[:pos] + filenameLine + combined[pos:]
    saveText = re.sub('.*"partialFilename".*\n', '', saveText)
    with open(options.saveConfigFilename, 'w') as out:
        out.write(saveText)

    return tempName
예제 #8
0
def combineConfigs(saveFile, options, convertFunction):
    """Assemble a learner configuration and write it to two files.

    The config text of options.classifier[0] is read, its placeholders are
    substituted for the two special learners handled below, and the result is
    passed through convertFunction.  The configs named by options.baseLearner
    and options.fallbackLearner (each skipped when None) are then spliced in
    before the final closing brace under the keys "baseLearner" and
    "fallbackLearner".

    The combined text is written to a new file from makeTemp(), whose name is
    returned.  A second variant, with a "filename" entry for saveFile inserted
    after the opening brace and every line mentioning "partialFilename"
    removed, is written to options.saveConfigFilename.
    """
    learner = options.classifier[0]
    with open(getConfigFilename(learner), "r") as src:
        text = src.read().strip()

    # learner-specific placeholder substitution
    if learner == "trbagg-partialLoad":
        partialPath = (
            "data/dt/studentsNew29-unperturbed-transfer/target10000-source50000/weighted/trBagg-%s.weka"
            % options.student
        )
        text = text.replace("$(PARTIAL_FILENAME)", partialPath)
    if learner == "twostagetradaboost-partial":
        text = text.replace("$(BEST_T)", str(options.partialInd))
        evaluateBest = "false" if options.save else "true"
        text = text.replace("$(EVALUATE_BEST_T)", evaluateBest)
    text = convertFunction(text)

    # cut off the last closing brace (and the newline right before it) so
    # that further entries can be appended
    cut = text.rfind("}")
    if text[cut - 1] == "\n":
        cut -= 1
    combined = text[:cut]

    nested = (
        ('"baseLearner":', options.baseLearner),
        ('"fallbackLearner":', options.fallbackLearner),
    )
    for key, subLearner in nested:
        if subLearner is None:
            continue
        combined += ",\n  " + key + " "
        with open(getConfigFilename(subLearner), "r") as sub:
            # every line after the first gets two extra spaces of indent
            combined += "  ".join(sub)
        combined = combined.strip()
    combined += "\n}"

    tempName = makeTemp()
    with open(tempName, "w") as out:
        out.write(combined)

    # for save config: insert the filename entry at the first non-whitespace
    # character after the opening brace
    pos = combined.find("{") + 1
    while combined[pos] in ("\n", "\r", " ", "\t"):
        pos += 1
    filenameLine = '"filename": "%s",\n  ' % saveFile
    saveText = combined[:pos] + filenameLine + combined[pos:]
    saveText = re.sub('.*"partialFilename".*\n', "", saveText)
    with open(options.saveConfigFilename, "w") as out:
        out.write(saveText)

    return tempName
예제 #9
0
def main(args=sys.argv[1:]):
    """Command-line entry point of createTrBagg.

    Usage: createTrBagg [--balanced] numTarget numSource jobStart [numJobs]

    jobStart is a batch index: the jobs processed are
    jobStart * numJobs ... jobStart * numJobs + numJobs - 1 (numJobs defaults
    to 1).  A job number is split into a student index (jobNum // 1000) and a
    classifier index (jobNum % 1000).  For each job:

    * selection probabilities over the students are obtained from
      calcBalancedProbs (with --balanced) or calcProbs;
    * numTarget draws from that distribution give a count per student, which
      is turned into a resampling proportion relative to numTarget (for the
      job's own student) or numSource (for every other student);
    * each student's training file is resampled with its proportion and the
      results are concatenated into one ARFF file (header kept only from the
      first student);
    * makeTree is run on the concatenated file.

    The temporary files and the DESC file of the tree are removed after each
    job.  The function returns as soon as a job's student index is past the
    end of the list from getUniqueStudents().  With a wrong number of
    arguments the usage string is written to stderr and the process exits
    with status 1.
    """
    usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
    # Work on a copy: the default list is created once at definition time, so
    # removing '--balanced' in place would carry over into later calls (and
    # would also modify a list passed in by the caller).
    args = list(args)
    balanced = False
    if '--balanced' in args:
        args.remove('--balanced')
        balanced = True
    if (len(args) < 3) or (len(args) > 4):
        sys.stderr.write(usage + '\n')
        sys.exit(1)
    numClassifiers = 1000
    numTarget = int(args[0])
    numSource = int(args[1])
    jobStart = int(args[2])
    if len(args) >= 4:
        numJobs = int(args[3])
    else:
        numJobs = 1
    jobStart *= numJobs
    base = 'studentsNew29-unperturbed-%i'
    outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (
        numTarget, numSource)
    if not os.path.exists('data/dt/' + outputBase + '/desc'):
        os.makedirs('data/dt/' + outputBase + '/desc')
    if not os.path.exists('data/dt/' + outputBase + '/weighted'):
        os.makedirs('data/dt/' + outputBase + '/weighted')

    for jobOffset in range(numJobs):
        jobNum = jobStart + jobOffset
        # floor division: the result is used as a list index
        studentInd = jobNum // numClassifiers
        classifierInd = jobNum % numClassifiers
        print('job,student,classifier: %s %s %s' %
              (jobNum, studentInd, classifierInd))

        students = getUniqueStudents()
        if studentInd >= len(students):
            return
        if balanced:
            probs = calcBalancedProbs(students, numSource, numTarget,
                                      studentInd)
        else:
            probs = calcProbs(students, numSource, numTarget, studentInd)
        print('probs: %s %s' % (probs, sum(probs)))

        # Draw numTarget student indices according to probs by walking the
        # cumulative distribution.
        counts = [0 for s in students]
        for i in range(numTarget):
            r = random.random()
            total = 0
            for j, p in enumerate(probs):
                total += p
                if r < total:
                    ind = j
                    break
            else:
                # r was not below the cumulative sum of all probabilities;
                # fall back to the last entry
                ind = len(probs) - 1
            counts[ind] += 1
        # eps keeps every proportion strictly positive
        eps = 1e-10
        props = [
            eps + float(c) / (numTarget if i == studentInd else numSource)
            for i, c in enumerate(counts)
        ]

        # Determined before the try block so that the cleanup below can always
        # refer to it, whatever fails inside.
        if balanced:
            name = 'trBagg-balanced-%s-%i' % (students[studentInd],
                                              classifierInd)
        else:
            name = 'trBagg-%s-%i' % (students[studentInd], classifierInd)

        tempFiles = []
        try:
            arffFilename = makeTemp('.arff')
            tempFiles.append(arffFilename)
            tempFile = makeTemp('.arff')
            tempFiles.append(tempFile)
            with open(arffFilename, 'w') as arffFile:
                for i, (student, prop) in enumerate(zip(students, props)):
                    # the job's own student is read from the numTarget data
                    # set, all others from the numSource data set
                    inFile = getFilename(
                        base % (numTarget if student == students[studentInd]
                                else numSource), student, TRAIN)
                    print('resampling %s' % (student,))
                    resample(inFile, tempFile, prop)
                    with open(tempFile, 'r') as f:
                        if i != 0:
                            # skip the header, which was already copied from
                            # the first student's file
                            for line in f:
                                if line.strip() == '@data':
                                    break
                        for line in f:
                            arffFile.write(line)
            # NOTE: no trial/step filtering is performed here; the message is
            # kept because it is part of the program's output.
            print('removing trial step')
            makeTree(arffFilename, True, None, outputBase, name, [], False,
                     1.0)
        finally:
            # Only remove what was actually created, so that a failure above
            # is not replaced by a NameError/OSError raised during cleanup.
            for path in tempFiles:
                if os.path.exists(path):
                    os.remove(path)
            descFilename = getFilename(outputBase, name, DESC)
            if os.path.exists(descFilename):
                os.remove(descFilename)
예제 #10
0
def main(args=sys.argv[1:]):
  """Command-line entry point of createTrBagg.

  Usage: createTrBagg [--balanced] numTarget numSource jobStart [numJobs]

  jobStart is a batch index: the jobs processed are
  jobStart * numJobs ... jobStart * numJobs + numJobs - 1 (numJobs defaults
  to 1).  A job number is split into a student index (jobNum // 1000) and a
  classifier index (jobNum % 1000).  For each job:

  * selection probabilities over the students are obtained from
    calcBalancedProbs (with --balanced) or calcProbs;
  * numTarget draws from that distribution give a count per student, which is
    turned into a resampling proportion relative to numTarget (for the job's
    own student) or numSource (for every other student);
  * each student's training file is resampled with its proportion and the
    results are concatenated into one ARFF file (header kept only from the
    first student);
  * makeTree is run on the concatenated file.

  The temporary files and the DESC file of the tree are removed after each
  job.  The function returns as soon as a job's student index is past the end
  of the list from getUniqueStudents().  With a wrong number of arguments the
  usage string is written to stderr and the process exits with status 1.
  """
  usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
  # Work on a copy: the default list is created once at definition time, so
  # removing '--balanced' in place would carry over into later calls (and
  # would also modify a list passed in by the caller).
  args = list(args)
  balanced = False
  if '--balanced' in args:
    args.remove('--balanced')
    balanced = True
  if (len(args) < 3) or (len(args) > 4):
    sys.stderr.write(usage + '\n')
    sys.exit(1)
  numClassifiers = 1000
  numTarget = int(args[0])
  numSource = int(args[1])
  jobStart = int(args[2])
  if len(args) >= 4:
    numJobs = int(args[3])
  else:
    numJobs = 1
  jobStart *= numJobs
  base = 'studentsNew29-unperturbed-%i'
  outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (numTarget,numSource)
  if not os.path.exists('data/dt/' + outputBase + '/desc'):
    os.makedirs('data/dt/' + outputBase + '/desc')
  if not os.path.exists('data/dt/' + outputBase + '/weighted'):
    os.makedirs('data/dt/' + outputBase + '/weighted')

  for jobOffset in range(numJobs):
    jobNum = jobStart + jobOffset
    # floor division: the result is used as a list index
    studentInd = jobNum // numClassifiers
    classifierInd = jobNum % numClassifiers
    print('job,student,classifier: %s %s %s' % (jobNum,studentInd,classifierInd))

    students = getUniqueStudents()
    if studentInd >= len(students):
      return
    if balanced:
      probs = calcBalancedProbs(students,numSource,numTarget,studentInd)
    else:
      probs = calcProbs(students,numSource,numTarget,studentInd)
    print('probs: %s %s' % (probs,sum(probs)))

    # Draw numTarget student indices according to probs by walking the
    # cumulative distribution.
    counts = [0 for s in students]
    for i in range(numTarget):
      r = random.random()
      total = 0
      for j,p in enumerate(probs):
        total += p
        if r < total:
          ind = j
          break
      else:
        # r was not below the cumulative sum of all probabilities;
        # fall back to the last entry
        ind = len(probs) - 1
      counts[ind] += 1
    # eps keeps every proportion strictly positive
    eps = 1e-10
    props = [eps + float(c) / (numTarget if i == studentInd else numSource) for i,c in enumerate(counts)]

    # Determined before the try block so that the cleanup below can always
    # refer to it, whatever fails inside.
    if balanced:
      name = 'trBagg-balanced-%s-%i' % (students[studentInd],classifierInd)
    else:
      name = 'trBagg-%s-%i' % (students[studentInd],classifierInd)

    tempFiles = []
    try:
      arffFilename = makeTemp('.arff')
      tempFiles.append(arffFilename)
      tempFile = makeTemp('.arff')
      tempFiles.append(tempFile)
      with open(arffFilename,'w') as arffFile:
        for i,(student,prop) in enumerate(zip(students,props)):
          # the job's own student is read from the numTarget data set,
          # all others from the numSource data set
          inFile = getFilename(base % (numTarget if student == students[studentInd] else numSource),student,TRAIN)
          print('resampling %s' % (student,))
          resample(inFile,tempFile,prop)
          with open(tempFile,'r') as f:
            if i != 0:
              # skip the header, which was already copied from the first
              # student's file
              for line in f:
                if line.strip() == '@data':
                  break
            for line in f:
              arffFile.write(line)
      # NOTE: no trial/step filtering is performed here; the message is kept
      # because it is part of the program's output.
      print('removing trial step')
      makeTree(arffFilename,True,None,outputBase,name,[],False,1.0)
    finally:
      # Only remove what was actually created, so that a failure above is not
      # replaced by a NameError/OSError raised during cleanup.
      for path in tempFiles:
        if os.path.exists(path):
          os.remove(path)
      descFilename = getFilename(outputBase,name,DESC)
      if os.path.exists(descFilename):
        os.remove(descFilename)