def readLabelFile(self, labelFilename, entityTypes):
    """Read a mallet label file and return the list of labels.

    Every non-blank line must hold a class index followed by a probability,
    separated by whitespace.  Blank lines are skipped.

    Args:
        labelFilename: path of the label file to read.
        entityTypes: sequence of entity type names.  Class index 0 maps to
            'other'; index n (n >= 1) maps to entityTypes[n - 1].

    Returns:
        A list with one entry per non-blank line.  Each entry is a
        one-element list holding a TokenLabel whose `prob` is the
        probability read from the line and whose `sequenceProb` is 1.

    Raises:
        ValueError: a class index or probability is not numeric.
        IndexError: a line has fewer than two fields, or the class index
            has no matching entry in the conversion list.
    """
    # Index 0 is the catch-all class; the entity types follow in order.
    labelConversionList = ['other'] + list(entityTypes)

    # With a single entity type the class index stored in the file is
    # ignored and the decision is made from the probability alone.
    binaryClassification = len(entityTypes) == 1

    labelList = []
    # The context manager closes the file even if a line fails to parse.
    with open(labelFilename, 'r') as labelFile:
        for line in labelFile:
            parsedLine = line.split()
            if not parsedLine:
                continue

            mClass = int(parsedLine[0])
            prob = float(parsedLine[1])
            if binaryClassification:
                mClass = 0 if prob < self.binaryThreshold else 1

            tLabel = TokenLabel(labelConversionList[mClass])
            tLabel.prob = prob
            tLabel.sequenceProb = 1
            # Wrapped in a list: one (and only one) candidate label per token.
            labelList.append([tLabel])
    return labelList
def readLabelFile(self, labelFilename, entityTypes):
    """Read a mallet top-k label file and return the list of labels.

    Two kinds of non-blank lines are recognised:

    * Header lines whose first field is 'k':
      ``k <topK> <seqProb 1> ... <seqProb topK>``.  They set the number of
      labelings in effect for the following lines and the probability of
      each labeling.
    * Token lines with exactly ``2 * topK`` fields, read as alternating
      ``<label> <prob>`` pairs, one pair per labeling.

    Lines with any other number of fields are skipped.  A line that cannot
    be parsed is reported on stdout with its line number and then skipped.

    Args:
        labelFilename: path of the label file to read.
        entityTypes: not used by this reader.

    Returns:
        A list with one entry per token line.  Each entry is a list of
        TokenLabel objects (one per labeling) carrying `prob` and the
        `sequenceProb` of the labeling they belong to.
    """
    labels = []
    # Until a header line says otherwise, assume the classifier's own top-k.
    currentTopK = self.topK
    sequenceProb = [0.0] * currentTopK

    # The context manager closes the file even if reading is interrupted.
    with open(labelFilename, 'r') as labelFile:
        for lineNo, line in enumerate(labelFile, 1):
            fields = line.split()
            if not fields:
                continue
            try:
                if fields[0] == 'k':
                    newTopK = int(fields[1])
                    if newTopK != currentTopK:
                        currentTopK = newTopK
                        sequenceProb = [0.0] * currentTopK
                    # The remaining fields are the sequence probabilities.
                    for i in range(currentTopK):
                        sequenceProb[i] = float(fields[i + 2])
                elif len(fields) == 2 * currentTopK:
                    tokenLabelList = []
                    for i in range(0, 2 * currentTopK, 2):
                        prob = float(fields[i + 1])
                        tLabel = TokenLabel(fields[i])
                        tLabel.prob = prob
                        # Floor division keeps the index an int under both
                        # classic and true division.
                        tLabel.sequenceProb = sequenceProb[i // 2]
                        tokenLabelList.append(tLabel)
                    labels.append(tokenLabelList)
            except (ValueError, IndexError):
                # Only malformed numbers / missing fields are tolerated;
                # anything else is a real error and is allowed to propagate.
                print('%s: Error at line number %d' % (labelFilename, lineNo))
    return labels
def getTopKLabelings(self, sentence, finder, topK):
    """Return the list of top-k sequence labelings for the sentence.

    At most min(topK, finder.tokenClassifier.topK) labelings are produced.
    Labeling k takes, for every token, the k-th entry of
    token.topKLabels[finder.entityTypesString]; tokens without such an
    entry receive a fresh 'other' TokenLabel with prob 0.

    Returns:
        A list of labelings, each a list with one label per token.
    """
    rankings = []
    limit = min(topK, finder.tokenClassifier.topK)
    for rank in range(limit):
        foundRanked = False
        candidate = []
        for token in sentence:
            key = finder.entityTypesString
            if key not in token.topKLabels or rank >= len(token.topKLabels[key]):
                # Every token needs a label; some finders (e.g. number
                # finders) typically label only a few tokens.
                chosen = TokenLabel('other')
                chosen.prob = 0
            else:
                chosen = token.topKLabels[key][rank]
                foundRanked = True
            candidate.append(chosen)
        # The first labeling is always kept (it may be all 'other');
        # later ones only when at least one token had a label at this rank.
        if rank == 0 or foundRanked:
            rankings.append(candidate)
    return rankings
def test(self, absList, modelfilename='', fold=None):
    """Apply the ensemble of classifiers to the given list of abstracts.

    Each of the self.nClassifiers classifiers is run over absList, then the
    per-classifier labels on every token are collected into
    token.topKLabels[self.entityTypesString] (one slot per classifier,
    defaulting to 'other'), and finally the labels are reranked and
    assigned through self.finder.rerankLabelsAndAssign.

    Args:
        absList: abstracts to label; each exposes `sentences`, an iterable
            of token sequences.
        modelfilename: ignored; the ensemble uses self.modelFilenames.
        fold: passed through to the reranker.
    """
    for classifierIdx in range(self.nClassifiers):
        print('test: %s %s' % (self.entityTypesString, classifierIdx))
        modelFilename = self.modelFilenames[classifierIdx]
        if self.type == 'abstract':
            self.finder.test(absList, modelFilename)
        else:
            self.useBaggedFeatures(self.baggedFeatures[classifierIdx], absList,
                                   modelFilename, self.finder.test)
        # NOTE(review): presumably tags the labels just assigned with the
        # classifier index so they can be told apart below -- confirm.
        self.renameLabels(absList, classifierIdx)

    print(self.entityTypesString)
    for abstract in absList:
        for sentence in abstract.sentences:
            for token in sentence:
                # One slot per classifier; 'other' unless that classifier
                # assigned an ensemble label to this token.
                slots = [TokenLabel('other') for _ in range(self.nClassifiers)]
                token.topKLabels[self.entityTypesString] = slots

                for eLabel in token.getLabelMatches(self.ensembleTypes):
                    label, slotIdx = self.toRegularLabel(eLabel)
                    slots[slotIdx] = TokenLabel(label)
                    # The ensemble-specific label has served its purpose.
                    token.removeLabel(eLabel)

    self.finder.rerankLabelsAndAssign(absList, rerankType=self.rerankType,
                                      topKMax=5, fold=fold,
                                      countOther=self.countOther)