Ejemplo n.º 1
0
  def readLabelFile(self, labelFilename, entityTypes):
    """ Read a mallet label file and return the list of labels.

        Every non-blank line must hold at least two whitespace-separated
        fields: an integer class index followed by a probability.

        labelFilename -- path of the label file to read.
        entityTypes   -- sequence of entity type names. Class index 0 maps
                         to 'other' and class index i+1 maps to entityTypes[i].

        When exactly one entity type is given, the class index from the file
        is overridden: the label is 'other' if the probability is below
        self.binaryThreshold, otherwise it is the single entity type.

        Returns a list with one entry per non-blank line; each entry is a
        one-element list holding a TokenLabel whose prob is the parsed
        probability and whose sequenceProb is 1.

        Raises ValueError if a field cannot be parsed as a number and
        IndexError if a line has fewer than two fields or the class index
        falls outside the conversion list.
    """
    # Read everything up front and close the handle deterministically
    # rather than leaving it to the garbage collector.
    with open(labelFilename, 'r') as labelFile:
      labelLines = labelFile.readlines()

    binaryClassification = (len(entityTypes) == 1)

    # Position in this list == class index used in the label file.
    labelConversionList = ['other'] + list(entityTypes)

    labelList = []
    for line in labelLines:
      parsedLine = line.strip().split()
      if not parsedLine:
        # blank line: nothing to label
        continue

      mClass = int(parsedLine[0])
      prob = float(parsedLine[1])
      if binaryClassification:
        # Ignore the class index from the file; decide by threshold instead.
        mClass = 0 if prob < self.binaryThreshold else 1

      tLabel = TokenLabel(labelConversionList[mClass])
      tLabel.prob = prob
      tLabel.sequenceProb = 1

      # Each token gets a list of labels; here it always has one element.
      labelList.append([tLabel])

    return labelList
Ejemplo n.º 2
0
  def readLabelFile(self, labelFilename, entityTypes):
    """ Read a top-k mallet label file and return the list of labels.

        Two kinds of non-blank lines are recognised:

          k <K> <p_1> ... <p_K>
              Sets the current number of labelings to K and records the K
              sequence probabilities that apply to the label lines after it.

          <label_1> <prob_1> ... <label_K> <prob_K>
              Exactly 2*K fields, where K is the current number of
              labelings. Produces one list of K TokenLabel objects; the j-th
              label gets prob_j as prob and the j-th recorded sequence
              probability as sequenceProb.

        Lines matching neither form are ignored. The initial K is self.topK
        and the initial sequence probabilities are all 0.0.

        labelFilename -- path of the label file to read.
        entityTypes   -- accepted for interface compatibility; not used here.

        A line that fails to parse is reported on stdout with its 1-based
        line number and skipped; reading continues with the next line.

        Returns a list with one list of TokenLabel objects per label line.
    """
    # Read everything up front and close the handle deterministically
    # rather than leaving it to the garbage collector.
    with open(labelFilename, 'r') as labelFile:
      labelLines = labelFile.readlines()

    labels = []
    currentTopK = self.topK
    sequenceProb = [0.0] * currentTopK

    for lineNo, line in enumerate(labelLines, 1):
      try:
        topKLabels = line.strip().split()
        if not topKLabels:
          continue

        if topKLabels[0] == 'k':
          newTopK = int(topKLabels[1])
          if newTopK != currentTopK:
            currentTopK = newTopK
            sequenceProb = [0.0] * currentTopK
          # the fields after 'k <K>' are the sequence probabilities
          for i in range(currentTopK):
            sequenceProb[i] = float(topKLabels[i+2])
        elif len(topKLabels) == 2*currentTopK:
          tokenLabelList = []
          for i in range(0, currentTopK*2, 2):
            label = topKLabels[i]
            prob = float(topKLabels[i+1])
            tLabel = TokenLabel(label)
            tLabel.prob = prob
            # Fields come in (label, prob) pairs, so pair number is i//2.
            # Floor division keeps this an int even under true division.
            tLabel.sequenceProb = sequenceProb[i//2]
            tokenLabelList.append(tLabel)
          labels.append(tokenLabelList)
      except Exception:
        # Best effort: report the bad line and keep going. Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        print('%s: Error at line number %d' % (labelFilename, lineNo))

    return labels
Ejemplo n.º 3
0
 def getTopKLabelings(self, sentence, finder, topK):
   """ Build up to topK sequence labelings for the sentence.

       sentence -- iterable of tokens; each token has a topKLabels mapping.
       finder   -- supplies entityTypesString (the key looked up in each
                   token's topKLabels) and tokenClassifier.topK.
       topK     -- maximum number of labelings requested; the number
                   actually considered is capped by finder.tokenClassifier.topK.

       Returns a list of labelings. Each labeling is a list with one label
       per token, in sentence order.
   """
   nLabelings = min(topK, finder.tokenClassifier.topK)
   typesKey = finder.entityTypesString

   labelings = []
   for rank in range(nLabelings):
     sequenceLabels = []
     nFromToken = 0
     for token in sentence:
       hasRankedLabel = (typesKey in token.topKLabels
                         and rank < len(token.topKLabels[typesKey]))
       if not hasRankedLabel:
         # Every token needs a label. Number finder labelings typically
         # only label numbers, so fill the gap with a zero-probability 'other'.
         filler = TokenLabel('other')
         filler.prob = 0
         sequenceLabels.append(filler)
         continue
       sequenceLabels.append(token.topKLabels[typesKey][rank])
       nFromToken += 1

     # The first labeling is always kept (it may be all 'other'); later
     # ones are kept only if at least one token supplied a label at this rank.
     if rank == 0 or nFromToken > 0:
       labelings.append(sequenceLabels)

   return labelings
Ejemplo n.º 4
0
  def test(self, absList, modelfilename='', fold=None):
    """ Apply ensemble of classifiers to given list of abstracts. 
        Ignores any given model file.
        """           
    for i in range(self.nClassifiers):
      print 'test:', self.entityTypesString, i
      if self.type == 'abstract':
        self.finder.test(absList, self.modelFilenames[i])
      else:
        self.useBaggedFeatures(self.baggedFeatures[i], absList, self.modelFilenames[i], self.finder.test)
      self.renameLabels(absList, i)
      
#    resultFilename = '%s%s.r%d.ensemble.txt'%(self.entityTypesString, self.getFoldString(fold), self.randomSeed)
#    resultsOut = open(resultFilename,'w')
    
    print self.entityTypesString
    for abstract in absList:
#      resultsOut.write('---%s---' % abstract.id)
      for sentence in abstract.sentences:
        for token in sentence:
          token.topKLabels[self.entityTypesString] = []  
          for i in range(self.nClassifiers):
            token.topKLabels[self.entityTypesString].append(TokenLabel('other'))
          
          eLabelMatches = token.getLabelMatches(self.ensembleTypes)
          
          for eLabel in eLabelMatches:
            [label, i] = self.toRegularLabel(eLabel)
            tLabel = TokenLabel(label)
            token.topKLabels[self.entityTypesString][i] = tLabel
#              tLabel.prob = prob
#              tLabel.sequenceProb = sequenceProb[i/2]            
            token.removeLabel(eLabel)
#            if label != 'other':
#              token.addLabel(label)            
          
#          resultsOut.write(str(ensembleLabels)+'\n')  
#          resultsOut.write('%s,  %s\n' %(token.text.ljust(12), eLabelMatches))
    self.finder.rerankLabelsAndAssign(absList, rerankType=self.rerankType, topKMax=5, fold=fold, countOther=self.countOther)