def aggregateResults(results):
    """Merges crosswikis rows whose anchors differ only by letter case.

    Each anchor is lower-cased, and rows that then share the same
    (anchor, entity) pair are combined: their numerators are accumulated, and
    their probabilities are accumulated weighted by the row's numerator and
    then divided by the grand total of all numerators.

    Args:
      results: an iterable of (anchor, entity, info, cprob) rows. `info` is
        passed to getLabelCounts, whose result must provide .values() yielding
        (num, denom) pairs; the row's numerator is the sum of those nums.

    Returns:
      a new list of the form [(anchor, entity, cprob, num, denom)], one entry
      per distinct (lower-cased anchor, entity) pair, where denom is the sum
      of the numerators over all pairs.
    """
    numByLink = {}
    weightedCprobByLink = {}
    for rawAnchor, entity, info, cprob in results:
        link = (rawAnchor.lower(), entity)
        labelCounts = getLabelCounts(info)
        rowNum = sum(count for count, _ in labelCounts.values())
        # presumably addToDict adds the value to the entry stored under the
        # key (creating it if missing) -- confirm against myutils
        myutils.addToDict(numByLink, link, rowNum)
        myutils.addToDict(weightedCprobByLink, link, rowNum * cprob)

    total = sum(numByLink.values())
    # NOTE(review): the weighted sum is divided by the total over ALL links,
    # not by the link's own numerator sum -- confirm this is the intended
    # normalisation. A total of zero with at least one link raises
    # ZeroDivisionError.
    return [
        (anchor, entity, weightedCprobByLink[(anchor, entity)] / total,
         num, total)
        for (anchor, entity), num in numByLink.items()
    ]
def getNumInstances(instancesFile):
    """Collects per-string counts from a tab-separated instances file.

    Args:
      instancesFile: an iterable of text lines. After surrounding whitespace
        is stripped, the first tab-separated field of each line is the string
        and the second field is parsed as an integer count. Any further
        fields are ignored.

    Returns:
      a dict keyed by string, populated through myutils.addToDict.

    Raises:
      IndexError: if a line has no second tab-separated field.
      ValueError: if the second field is not a valid integer.
    """
    totals = {}
    for rawLine in instancesFile:
        fields = rawLine.strip().split('\t')
        # presumably addToDict sums the counts of repeated strings --
        # confirm against myutils
        myutils.addToDict(totals, fields[0], int(fields[1]))
    return totals
def dedupePr():
    """Writes one line per distinct (precision, recall) pair in the PR output.

    Rows of (p, c, t, precision, recall) are obtained by passing the file at
    PR_OUTPUT_PATH to readPrFile. For every distinct (precision, recall) pair
    a tab-separated line is written to PR_DEDUPED_PATH containing a parameter
    string, the precision, the recall and the count recorded for the pair.
    The parameter string is 'p=..., c=..., t=...' taken from the last row
    seen with that pair.

    NOTE(review): the reviewed portion of this file defines dedupePr twice
    with the same logic; the later definition is the one that stays bound.
    """
    # Both files are managed by one `with` so they are always closed, and the
    # output is flushed, even if reading or formatting raises.
    with open(PR_OUTPUT_PATH) as prFile, \
            open(PR_DEDUPED_PATH, 'w') as dedupedPrFile:
        prCounts = {}
        prParams = {}
        # Rows are consumed while prFile is still open, in case readPrFile
        # reads lazily.
        for p, c, t, precision, recall in readPrFile(prFile):
            # presumably addToDict increments the entry for the key --
            # confirm against myutils
            myutils.addToDict(prCounts, (precision, recall), 1)
            prParams[(precision, recall)] = 'p={}, c={}, t={}'.format(p, c, t)
        for (precision, recall), count in prCounts.items():
            paramString = prParams[(precision, recall)]
            print('{}\t{}\t{}\t{}'.format(paramString, precision, recall,
                                          count),
                  file=dedupedPrFile)
def dedupePr():
    """Writes one line per distinct (precision, recall) pair in the PR output.

    Rows of (p, c, t, precision, recall) are obtained by passing the file at
    PR_OUTPUT_PATH to readPrFile. For every distinct (precision, recall) pair
    a tab-separated line is written to PR_DEDUPED_PATH containing a parameter
    string, the precision, the recall and the count recorded for the pair.
    The parameter string is 'p=..., c=..., t=...' taken from the last row
    seen with that pair.

    NOTE(review): an earlier definition of dedupePr with the same logic
    appears in the reviewed portion of this file; this later one replaces it
    when the module is loaded.
    """
    # A single `with` guarantees both files are closed, and the output is
    # flushed, even if reading or formatting raises.
    with open(PR_OUTPUT_PATH) as prFile, \
            open(PR_DEDUPED_PATH, 'w') as dedupedPrFile:
        prCounts = {}
        prParams = {}
        # Rows are consumed while prFile is still open, in case readPrFile
        # reads lazily.
        for p, c, t, precision, recall in readPrFile(prFile):
            # presumably addToDict increments the entry for the key --
            # confirm against myutils
            myutils.addToDict(prCounts, (precision, recall), 1)
            prParams[(precision, recall)] = 'p={}, c={}, t={}'.format(p, c, t)
        for (precision, recall), count in prCounts.items():
            paramString = prParams[(precision, recall)]
            print('{}\t{}\t{}\t{}'.format(
                paramString, precision, recall, count
                ),
                file=dedupedPrFile
            )