Esempio n. 1
0
  def write(self, filename, separator='  ', computeTotal=False, computeAverage=False):
    """ Write all collected statistics to a text file.

        First, for every name in self.irStats (ascending order) a "name:"
        header line is written, followed by each stat in that name's list
        (each stat writes itself through its writerpf() method).
        Then, for every name in self.labeledStats (descending order) a
        "name:" header, a row of labels and the rows of values are written.

        filename = path of the file to create (an existing file is overwritten)
        separator = string written in front of and between fields
        computeTotal = if True, also write the stats obtained by adding up
                       the name's list with IRstats.addStats()
        computeAverage = if True, also write the mean recall, precision,
                         f-score and precision-without-duplicates of the
                         name's list; skipped when the list is empty
        """
    # the with-block closes the file even when one of the writes raises
    with open(filename, 'w') as out:
      # sorted() accepts both the list and the view flavour of dict keys
      for name in sorted(self.irStats):
        out.write(name+':\n')
        statList = self.irStats[name]
        for irStat in statList:
          out.write(separator)
          irStat.writerpf(out, separator)

        if computeTotal:
          totalStat = IRstats()
          for irStat in statList:
            totalStat.addStats(irStat)
          out.write(separator)
          totalStat.writerpf(out, separator)

        # an empty list has no meaningful average (and would divide by zero)
        if computeAverage and statList:
          rSum = 0.0
          pSum = 0.0
          fSum = 0.0
          dupPSum = 0.0
          for irStat in statList:
            rSum += irStat.recall()
            pSum += irStat.precision()
            fSum += irStat.fscore()
            dupPSum += irStat.precisionWithoutDuplicates()
          nStat = len(statList)
          # the three padded separators presumably stand in for the count
          # columns that writerpf() emits -- TODO confirm against writerpf
          out.write('%s%s%s%s%.2f%s%.2f%s%.2f%4s%.2f\n'
                    % (separator.ljust(5), separator.ljust(5), separator.ljust(5),
                       separator, rSum/nStat, separator, pSum/nStat, separator,
                       fSum/nStat, separator, dupPSum/nStat))

      for name in sorted(self.labeledStats, reverse=True):
        out.write(name+':\n')
        lists = self.labeledStats[name]
        if not lists:
          # nothing recorded under this name: no label row can be derived
          continue

        # header row: the labels of the first list
        out.write(separator)
        for label, _ in lists[0]:
          out.write(label+separator)
        out.write('\n')

        # NOTE(review): only the first value row receives the leading
        # separator; kept as is so the file layout does not change
        out.write(separator)
        for row in lists:
          for _, value in row:
            if isinstance(value, float):
              out.write('%.4f%s' % (value, separator))
            else:
              out.write('%d%s' % (value, separator))
          out.write('\n')
Esempio n. 2
0
    def countAgeMatches(self, aAgeTemplates, errorOut):
        """ Compare the detected age values with the annotated ones.

            aAgeTemplates = annotated age templates; the trueValues of each
                            one maps an age value type ('min', 'max', 'mean',
                            'median') to a list of annotated values
            errorOut = file stream that receives one line per TP, FP and FN

            Sets self.nTrueAgeValues to the number of annotated values
            (duplicates within one type are collapsed by the set).
            Detected values whose source is 'trial_registry' are not scored.

            return IRstats with the TP/FP/FN counts
            """
        # annotated values pooled over all templates, one set per value type
        annotatedByType = {
            'min': set(),
            'max': set(),
            'mean': set(),
            'median': set()
        }
        for template in aAgeTemplates:
            for valueType, trueValues in template.trueValues.items():
                for trueValue in trueValues:
                    annotatedByType[valueType].add(trueValue)

        # remember for every annotated value whether a detected one hit it
        wasDetected = {}
        nAnnotated = 0
        for valueSet in annotatedByType.values():
            for trueValue in valueSet:
                wasDetected[trueValue] = False
            nAnnotated += len(valueSet)
        self.nTrueAgeValues = nAnnotated

        # score each detected value against the annotated ones of its type
        stats = IRstats()
        for valueType, detected in self.ageValues.items():
            if detected.source == 'trial_registry':
                continue
            hits = [candidate for candidate in annotatedByType[valueType]
                    if detected.value == candidate.value]
            for candidate in hits:
                stats.incTP()
                errorOut.write('  +TP: %s = %d\n' % (valueType, detected.value))
                wasDetected[candidate] = True
                detected.evaluation.markCorrect()
            if not hits:
                stats.incFP()
                errorOut.write('  -FP: %s = %d\n' % (valueType, detected.value))
                detected.evaluation.markIncorrect()

        # whatever was never hit is a false negative
        for trueValue, hit in wasDetected.items():
            if not hit:
                stats.incFN()
                errorOut.write('  -FN: %s = %d\n' % (trueValue.type, trueValue.value))

        return stats
  def countAgeMatches(self, aAgeTemplates, errorOut):
    """ Score the detected age values against a set of annotated age templates.

        aAgeTemplates = annotated templates whose trueValues hash maps a value
                        type ('min', 'max', 'mean', 'median') to its values
        errorOut = file stream for the TP, FP and FN lines

        Stores the number of annotated values in self.nTrueAgeValues.
        Detected values that come from the 'trial_registry' source are skipped.

        return IRstats holding the TP, FP and FN counts
        """
    # gather the annotated values of all templates, grouped by value type
    trueValuesByType = dict((kind, set()) for kind in ('min', 'max', 'mean', 'median'))
    for template in aAgeTemplates:
      for kind, values in template.trueValues.items():
        for truth in values:
          trueValuesByType[kind].add(truth)

    # no annotated value has been matched yet
    self.nTrueAgeValues = 0
    matched = {}
    for truthSet in trueValuesByType.values():
      for truth in truthSet:
        self.nTrueAgeValues += 1
        matched[truth] = False

    stats = IRstats()
    for kind, detected in self.ageValues.items():
      if detected.source == 'trial_registry':
        continue

      # every annotated value with the same number counts as a true positive
      nHits = 0
      for truth in trueValuesByType[kind]:
        if detected.value == truth.value:
          nHits += 1
          stats.incTP()
          errorOut.write('  +TP: %s = %d\n' % (kind, detected.value))
          matched[truth] = True
          detected.evaluation.markCorrect()

      if nHits == 0:
        stats.incFP()
        errorOut.write('  -FP: %s = %d\n' % (kind, detected.value))
        detected.evaluation.markIncorrect()

    # annotated values that were never matched were missed by the system
    missed = [truth for truth, isMatched in matched.items() if not isMatched]
    for truth in missed:
      stats.incFN()
      errorOut.write('  -FN: %s = %d\n' % (truth.type, truth.value))

    return stats
Esempio n. 4
0
  def computeTupleMentionError(self, recomputeAnnotatedMentions, errorWeights=None):
    """ Compute the weighted number of FP, FN and duplicate mentions in the sentence.

        The detected 'group', 'outcome', 'eventrate', 'on' and 'gs' mentions
        are compared with the annotated mentions of the sentence by the
        finder that belongs to each labeling.

        recomputeAnnotatedMentions = passed on to sentence.getAnnotatedMentions()
                                     as its recomputeMentions argument
        errorWeights = hash keyed by mention type, each entry a hash with the
                       weights for 'fp', 'fn' and 'dup'. None or an empty hash
                       means weight 1 everywhere. A non-empty hash must have an
                       entry for each of the five mention types.

        return weighted FP + weighted duplicates + weighted FN over all types
        """
    if not errorWeights:
      # Build the unit weights locally: a mutable default (or an empty hash
      # owned by the caller) must not be filled in behind the caller's back.
      errorWeights = {}
      for mType in ('group', 'outcome', 'eventrate', 'on', 'gs'):
        errorWeights[mType] = {'fp':1, 'fn':1, 'dup':1}

    # detected mentions of each type with the finder that can compare them
    mentions = {
      'group': (self.groupLabeling.entities['group'], self.groupLabeling.finder),
      'outcome': (self.outcomeLabeling.entities['outcome'], self.outcomeLabeling.finder),
      'eventrate': (self.eventrateLabeling.entities['eventrate'], self.eventrateLabeling.finder),
      'on': (self.numberLabeling.entities['on'], self.numberLabeling.finder),
      'gs': (self.numberLabeling.entities['gs'], self.numberLabeling.finder),
    }

    totalFP = 0
    totalFN = 0
    totalDuplicates = 0
    for mType, (dList, finder) in mentions.items():
      aList = self.sentence.getAnnotatedMentions(mType, recomputeMentions=recomputeAnnotatedMentions)
      stats = IRstats()
      # compareMentionLists() is handed a fresh IRstats whose counts are read below
      finder.compareMentionLists(dList, aList, mType, stats)
      weights = errorWeights[mType]
      totalFP += stats.fp * weights['fp']
      totalFN += stats.fn * weights['fn']
      totalDuplicates += stats.duplicates * weights['dup']

    return totalFP + totalDuplicates + totalFN
Esempio n. 5
0
 def __init__(self, entityTypes):
     """ Begin a new round of RPF statistics with one fresh IRstats per entity type.

         entityTypes = the entity types to keep statistics for; each one
                       becomes a key of self.irstats
         """
     self.irstats = dict((mType, IRstats()) for mType in entityTypes)
     self.entityTypes = entityTypes
Esempio n. 6
0
  def computeStatistics(self, errorOut):
    """ Count RPF statistics for the age, condition, group and group size
        entities of this abstract.

        errorOut = file stream for TPs, FPs, FNs

        Also sets self.nTrueConditions, self.nTrueGroups and
        self.nTrueGroupSizes.

        return hash of IRstats keyed by 'age', 'condition', 'group' and
        'group size'
        """
    results = {}
    self.nTrueGroupSizes = 0

    # age values are scored by the age info object itself
    annotatedAges = createAnnotatedMergedList(self.abstract, 'age')
    errorOut.write('age:\n')
    results['age'] = self.ageInfo.countAgeMatches(annotatedAges, errorOut)

    # conditions and groups go through the shared template matcher
    errorOut.write('condition:\n')
    annotatedConditions = self.abstract.annotatedEntities.getList('condition')
    results['condition'] = countMatches(annotatedConditions,
                                        self.conditionTemplates, errorOut)

    errorOut.write('group:\n')
    annotatedGroups = self.abstract.annotatedEntities.getList('group')
    results['group'] = countMatches(annotatedGroups, self.groupTemplates, errorOut)

    self.nTrueConditions = len(annotatedConditions)
    self.nTrueGroups = len(annotatedGroups)

    errorOut.write('group size:\n')
    sizeStats = IRstats()
    correctlySized = set()   # annotated groups whose size was detected correctly
    for detected in self.groupTemplates:
      detectedSize = detected.getSize(maxSize=True)
      if detectedSize == 0:
        # no size was detected for this group, nothing to score
        continue

      # correct only if the matched annotated group lists this very size
      annotated = detected.matchedTemplate
      sizeIsCorrect = annotated != None and any(
          detectedSize == trueSize.value for trueSize in annotated.sizes)

      if sizeIsCorrect:
        sizeStats.incTP()
        errorOut.write('  +TP: %s size = %d\n' % (detected.name, detectedSize))
        detected.groupSizeEvaluation.markCorrect()
        correctlySized.add(annotated)
      else:
        sizeStats.incFP()
        errorOut.write('  -FP: %s size = %d\n' % (detected.name, detectedSize))
        detected.groupSizeEvaluation.markIncorrect()

    # false negatives: annotated groups that were matched and have a size,
    # but whose size was not detected correctly
    for annotated in annotatedGroups:
      sizeWasMissed = (annotated not in correctlySized
                       and annotated.matchedTemplate != None
                       and annotated.getSize() > 0)
      if sizeWasMissed:
        sizeStats.incFN()
        errorOut.write('  -FN: %s size = %d\n'
                       % (annotated.name, annotated.getSize()))

    results['group size'] = sizeStats
    self.nTrueGroupSizes = sizeStats.tp + sizeStats.fn
    # population entities are not scored here
    return results
Esempio n. 7
0
#             if token.hasAnnotation(entityType):
#               verbRuleCounts[depToken.lemma].incTP()
#             else:
#               verbRuleCounts[depToken.lemma].incFP()

      for dep in token.governors:
        if dep.isRoot() == False and dep.type == 'pobj':
          depToken = token.sentence[dep.index]
#          print depToken.text, token.text
          for g in depToken.governors:
            if g.isRoot() == False:# and g.type == 'prep':
              gToken = token.sentence[g.index]
#              print gToken.text+'_'+g.type, depToken.text, token.text
              if gToken.pos[0:2] == 'VB':
                if gToken.lemma not in verbRuleCounts:
                  verbRuleCounts[gToken.lemma] = IRstats()  
                if token.hasAnnotation(entityType):
                  verbRuleCounts[gToken.lemma].incTP()
                else:
                  verbRuleCounts[gToken.lemma].incFP()
    
    for token in sentence:
    
      for type in entityTypes:
        if token.hasAnnotation(type):
          entityTokenCounts[type] += 1
        
    # for token in sentence 
#     for token in sentence:
#       if token.text != 'greater' and token.text != 'less':
#         continue