예제 #1
0
  def readLabelFile(self, labelFilename, entityTypes):
    """ Read a mallet label file and return the list of labels.

    Every non-blank line must start with an integer class index followed
    by a probability; any further fields on the line are ignored.

    Arguments:
      labelFilename -- path of the label file to read
      entityTypes   -- sequence of entity type names. Class index 0 maps
                       to 'other' and class index i (i >= 1) maps to
                       entityTypes[i-1].

    When exactly one entity type is given, the class index stored in the
    file is not used to pick the label: the label is 'other' if the
    probability is below self.binaryThreshold and the entity type
    otherwise.

    Returns a list with one entry per non-blank line. Each entry is a
    one-element list holding a TokenLabel whose prob is the probability
    read from the line and whose sequenceProb is 1.

    Raises ValueError or IndexError if a non-blank line is malformed
    (missing field, non-numeric field, class index out of range).
    """
    binaryClassification = (len(entityTypes) == 1)

    # position in this list == class index used in the label file
    labelConversionList = ['other'] + list(entityTypes)

    labelList = []
    # the with-block guarantees the file is closed, even when a line
    # fails to parse and an exception propagates to the caller
    with open(labelFilename, 'r') as labelFile:
      for line in labelFile:
        parsedLine = line.strip().split()
        if not parsedLine:
          continue  # skip blank lines

        # both fields are always parsed so malformed lines are still
        # rejected in the binary case
        mClass = int(parsedLine[0])
        prob = float(parsedLine[1])
        if binaryClassification:
          # the threshold, not the class stored in the file, decides
          mClass = 0 if prob < self.binaryThreshold else 1

        tLabel = TokenLabel(labelConversionList[mClass])
        tLabel.prob = prob
        tLabel.sequenceProb = 1

        labelList.append([tLabel])

    return labelList
예제 #2
0
  def readLabelFile(self, labelFilename, entityTypes):
    """ Read a mallet top-k label file and return the list of labels.

    Two kinds of non-blank lines are recognised:

      header line:  k <K> <p1> ... <pK>
        Sets the current number of candidates to K and stores the K
        sequence probabilities; they apply to the token lines that
        follow until the next header line.

      token line:   <label1> <prob1> ... <labelK> <probK>
        Exactly 2*K whitespace separated fields, where K is the current
        number of candidates (initially self.topK).

    Any other line is ignored. A line that raises an exception while
    being processed is reported on standard output together with its
    1-based line number and contributes nothing to the result; reading
    then continues with the next line.

    Arguments:
      labelFilename -- path of the label file to read
      entityTypes   -- not used by this reader

    Returns a list with one entry per token line. Each entry is a list
    of K TokenLabel objects; the i-th one carries the i-th label, its
    probability in prob and the i-th sequence probability of the most
    recent header line in sequenceProb (0.0 if no header was seen yet).
    """
    currentTopK = self.topK
    sequenceProb = [0.0] * currentTopK
    labels = []

    # read everything up front inside a with-block so the file handle is
    # always released
    with open(labelFilename, 'r') as labelFile:
      labelLines = labelFile.readlines()

    for lineNo, line in enumerate(labelLines, 1):
      try:
        topKLabels = line.strip().split()
        if not topKLabels:
          continue  # blank line

        if topKLabels[0] == 'k':
          newTopK = int(topKLabels[1])
          if newTopK != currentTopK:
            currentTopK = newTopK
            sequenceProb = [0.0] * currentTopK
          # the fields after "k <K>" are the sequence probabilities
          for i in range(currentTopK):
            sequenceProb[i] = float(topKLabels[i + 2])
        elif len(topKLabels) == 2 * currentTopK:
          tokenLabelList = []
          for i in range(0, currentTopK * 2, 2):
            tLabel = TokenLabel(topKLabels[i])
            tLabel.prob = float(topKLabels[i + 1])
            # i walks over label/prob pairs; floor division gives the
            # candidate index as an int on both Python 2 and Python 3
            tLabel.sequenceProb = sequenceProb[i // 2]
            tokenLabelList.append(tLabel)

          labels.append(tokenLabelList)
      except Exception:
        # best effort: report the bad line and carry on. Exception (not
        # a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print('%s: Error at line number %d' % (labelFilename, lineNo))
    return labels