Example #1
0
class TaggerHandler:
  """Run a Tagger over every train/test file pair and log accuracies to a table.

  Each evaluated pair appends one tab-separated row to the results table: a
  setting label taken from the training file path, followed by the values
  returned by the tagger's test() call.  run() closes the table when it
  finishes, so an instance supports a single run() call.
  """

  def __init__(self, dataDir, table):
    """Create the tagger and open the results table for writing.

    Args:
      dataDir: Prefix prepended to every file name read by run().  It is
        concatenated as-is; no path separator is inserted.
      table: Path of the results file, opened in 'w' mode (existing content
        is discarded).
    """
    sys.stderr.write("TaggerHandler: Constructor\n")
    self.__TaggerInstance = Tagger()
    self.__dataDir = dataDir
    self.__tableFile = open(table, 'w')

  def __updateTable(self, setting, accuracies):
    """Append one row: the setting label, then each accuracy, tab-separated."""
    self.__tableFile.write(setting + '\t' + '\t'.join(map(str, accuracies)) + '\n')

  def __readFileList(self, listFile):
    """Return the non-blank lines of listFile, each prefixed with the data dir.

    The list file is closed before returning.  Blank lines are skipped so a
    trailing newline does not turn into a bare data-directory "file".
    """
    with open(listFile) as handle:
      names = [line.strip() for line in handle]
    return [self.__dataDir + name for name in names if name]

  def __runTagger(self, trainFile, testFile):
    """Train on trainFile, test on testFile and log the resulting accuracies.

    NOTE: the row label is the text before the first underscore of the full
    training path (data directory included); the test file is not recorded
    in the row.
    """
    self.__TaggerInstance.loadData(trainFile, testFile)
    self.__TaggerInstance.train()
    accuracies = self.__TaggerInstance.test()
    setting = trainFile.split("_")[0]
    self.__updateTable(setting, accuracies)

  def run(self, trainFiles, testFiles):
    """Evaluate every (train, test) combination and close the results table.

    Args:
      trainFiles: Path of a text file listing one training file name per line.
      testFiles: Path of a text file listing one test file name per line.

    The table is closed even when reading a list or running the tagger
    raises; the exception itself still propagates to the caller.
    """
    try:
      trainPaths = self.__readFileList(trainFiles)
      testPaths = self.__readFileList(testFiles)
      for trainFile in trainPaths:
        for testFile in testPaths:
          self.__runTagger(trainFile, testFile)
    finally:
      self.__tableFile.close()
Example #2
0
class TaggerHandler:
  """Run a Tagger over train/test file pairs and log one table row per run.

  The results table is tab-separated.  Its header is written by the
  constructor; every evaluated pair appends a row describing the training
  file, the experiment type, the test file, the tagset and the values
  returned by the tagger's test() call (presumably six values matching the
  last six header columns -- confirm against Tagger.test()).

  run() and run2() both close the table when they finish, so an instance
  supports exactly one run()/run2() call.
  """

  # Column names of the results table, in row order.
  __COLUMNS = (
    'TrainCSType', 'TrainPureCSSplit', 'TrainSize', 'ExperimentType',
    'TestCSType', 'TestPureCSSplit', 'TestSize', 'Tagset',
    'OverallAccuracy', 'SameContextAccuracy', 'DifferentContextAccuracy',
    'PrevWordDifferentAccuracy', 'PrePrevWordDifferentAccuracy', 'Unknowns',
  )

  # Placeholder written to the ExperimentType column by run(), which has no
  # experiment/control distinction.  Without it run() rows would be one field
  # short and every later column would be shifted against the header.
  __NO_EXPERIMENT = 'NA'

  def __init__(self, dataDir, table):
    """Create the tagger, open the results table and write its header.

    Args:
      dataDir: Prefix prepended (as-is, no separator inserted) to the file
        names used by run2().  run() uses the listed names unchanged.
      table: Path of the results file, opened in 'w' mode with line
        buffering so each row reaches disk as soon as it is written.
    """
    sys.stderr.write("TaggerHandler: Constructor\n")
    self.__TaggerInstance = Tagger()
    self.__dataDir = dataDir
    self.__tableFile = open(table, 'w', 1)
    self.__tableFile.write('\t'.join(self.__COLUMNS) + '\n')

  def __updateTable(self, setting, accuracies):
    """Append one row: the setting columns, then each accuracy, tab-separated."""
    self.__tableFile.write(setting + '\t' + '\t'.join(map(str, accuracies)) + '\n')

  def __readList(self, listFile):
    """Return the stripped, non-blank lines of listFile; the file is closed."""
    with open(listFile) as handle:
      names = [line.strip() for line in handle]
    return [name for name in names if name]

  def __getSetting(self, string):
    """Extract 'CSType<TAB>Pure-CS split<TAB>total size' from a file path.

    Only the base name is inspected, and it must contain the markers
    'TrainCS', 'CS', 'Pure' and 'Total' in that order, e.g.
    '.../TrainCSType1CS30Pure70Total1000_x' -> 'Type1\\t70-30\\t1000'.
    A name without those markers raises IndexError.

    NOTE(review): this is also applied to test files, so they apparently
    share the 'TrainCS...' naming scheme -- confirm against the data set.
    """
    string = string.split("/")[-1].split("TrainCS")[1]
    cstype = string.split("CS")[0]
    csSplit = string.split("CS")[1].split("Pure")[0]
    pureSplit = string.split("Pure")[1].split("Total")[0]
    pureCSSplit = pureSplit + '-' + csSplit
    totalSize = string.split("Total")[1].split("_")[0]
    return '\t'.join([cstype, pureCSSplit, totalSize])

  def __tagset(self, string):
    """Return the tagset label encoded in a file name.

    'Universal' when the second dot-separated field of the base name is
    'uni', otherwise 'Mixed'; '.uniq' is appended when the base name
    contains '.uniq'.  Only the base name is examined so that dots in the
    directory part (e.g. './data/' or '../data/') cannot hide the suffix.
    """
    baseName = string.split("/")[-1]
    fields = baseName.split(".")
    tagset = "Mixed"
    if len(fields) > 1 and fields[1] == "uni":
      tagset = 'Universal'
    if ".uniq" in baseName:
      tagset += ".uniq"
    return tagset

  def __evaluate(self, trainFile, testFile, expType):
    """Train, test and log one row for the given pair and experiment type."""
    self.__TaggerInstance.loadData(trainFile, testFile)
    self.__TaggerInstance.train()
    accuracies = self.__TaggerInstance.test()
    setting = '\t'.join([
      self.__getSetting(trainFile),
      expType,
      self.__getSetting(testFile),
      self.__tagset(trainFile),
    ])
    self.__updateTable(setting, accuracies)

  def __runTagger(self, trainFile, testFile):
    """Evaluate one pair using the paths exactly as given (no data dir)."""
    self.__evaluate(trainFile, testFile, self.__NO_EXPERIMENT)

  def __runTagger2(self, trainFile, testFile, expType):
    """Evaluate one pair after prefixing both names with the data dir."""
    self.__evaluate(self.__dataDir + trainFile, self.__dataDir + testFile, expType)

  def run(self, trainFiles, testFiles):
    """Evaluate every Mixed-tagset (train, test) pair, then close the table.

    Args:
      trainFiles: Path of a text file listing one training file per line.
      testFiles: Path of a text file listing one test file per line.

    Listed names are passed to the tagger unchanged (the data dir is not
    prepended here).  Pairs are skipped unless both files have the plain
    'Mixed' tagset.  The table is closed even if an error is raised.
    """
    try:
      trainNames = self.__readList(trainFiles)
      testNames = self.__readList(testFiles)
      for trainFile in trainNames:
        if self.__tagset(trainFile) != "Mixed":
          continue
        for testFile in testNames:
          if self.__tagset(testFile) != "Mixed":
            continue
          self.__runTagger(trainFile, testFile)
    finally:
      self.__tableFile.close()

  def run2(self, trainFiles, testFiles):
    """Evaluate each same-tagset pair plus its control run, then close the table.

    Args:
      trainFiles: Path of a text file listing one training file per line.
      testFiles: Path of a text file listing one test file per line.

    For every pair whose tagsets match, two rows are logged: 'Experiment'
    for the listed training file and 'Control' for its control counterpart.
    The control name is '<train>_Control', or for Universal files the
    '_Control' marker inserted before the '.uni' suffix.  Names are
    prefixed with the data dir.  The table is closed even on error.
    """
    try:
      trainNames = self.__readList(trainFiles)
      testNames = self.__readList(testFiles)
      for trainFile in trainNames:
        trainTagset = self.__tagset(trainFile)
        if trainTagset == "Universal":
          controlTrainFile = trainFile.split(".uni")[0] + "_Control" + ".uni"
        else:
          controlTrainFile = trainFile + "_Control"
        for testFile in testNames:
          if self.__tagset(testFile) != trainTagset:
            continue
          self.__runTagger2(trainFile, testFile, "Experiment")
          self.__runTagger2(controlTrainFile, testFile, "Control")
    finally:
      self.__tableFile.close()