def subseqMatchStats(reportedSeqs, trueSeqs, matchFunc=None,
                     spoofNumReported=-1, spoofNumTrue=-1,
                     ignorePositions=False, matchUpLabels=False,
                     matchAllClasses=False, minOverlapFraction=.5,
                     requireContainment=False, **sink):
    """[PatternInstance] x [PatternInstance] -> numReported, numTrue, numMatches

    Count how many reported instances match ground-truth instances.

    NOTE: a second definition with this same name appears later in this
    module; being defined later, it is the one callers actually get.

    Args:
        reportedSeqs: reported instances; each must expose ``.label`` when
            matchUpLabels is set.
        trueSeqs: ground-truth instances (same requirements).
        matchFunc: forwarded to computeNumMatches. If falsy and
            ignorePositions is set, matchIgnoringPositions is used instead.
        spoofNumReported: if >= 0, returned in place of len(reportedSeqs).
        spoofNumTrue: if >= 0, returned in place of the computed numTrue.
        ignorePositions: see matchFunc.
        matchUpLabels: group both inputs by label and, for every reported
            label, credit the ground-truth label (among those occurring at
            least twice) that yields the most matches.
        matchAllClasses: retained for interface compatibility. Without
            matchUpLabels, numTrue is len(trueSeqs) whether or not this is
            set (see the review note in the body).
        minOverlapFraction: forwarded to computeNumMatches.
        requireContainment: forwarded to computeNumMatches.
        **sink: extra keyword arguments are accepted and ignored.

    Returns:
        Tuple (numReported, numTrue, numMatches).
    """
    if (not matchFunc) and ignorePositions:
        matchFunc = matchIgnoringPositions

    if matchUpLabels:  # might find pattern, but not know what to label it
        lbl2reported = sequence.splitElementsBy(
            lambda inst: inst.label, reportedSeqs)
        lbl2truth = sequence.splitElementsBy(
            lambda inst: inst.label, trueSeqs)

        # XXX if we report more than one label, there isn't necessarily a
        # bijective mapping between reported and ground truth labels; it's
        # possible to get more matches than true labels here, among other
        # issues

        # Every reported label contributes at least this many "true"
        # instances. The value does not depend on the reported label, so it
        # is computed once; the emptiness guard avoids min() raising
        # ValueError when there is no ground truth at all.
        truthCounts = [len(seqs) for seqs in lbl2truth.values()]
        minNumTruth = max(min(truthCounts) if truthCounts else 0, 2)

        numMatches = 0
        numTrue = 0
        for repSeqs in lbl2reported.values():
            bestNumMatches = -1  # below 0 so a label with 0 matches can win
            bestNumTruth = 0
            for truthSeqs in lbl2truth.values():
                if len(truthSeqs) < 2:  # ignore patterns that happen once
                    continue
                numMatchesForLabel = computeNumMatches(
                    repSeqs, truthSeqs, matchFunc, ignoreLabel=True,
                    minOverlapFraction=minOverlapFraction,
                    requireContainment=requireContainment)
                if numMatchesForLabel > bestNumMatches:
                    bestNumMatches = numMatchesForLabel
                    bestNumTruth = len(truthSeqs)
            numMatches += max(0, bestNumMatches)
            numTrue += max(minNumTruth, bestNumTruth)
    else:
        numMatches = computeNumMatches(
            reportedSeqs, trueSeqs, matchFunc,
            minOverlapFraction=minOverlapFraction,
            requireContainment=requireContainment)
        # numTrue used to be assigned only when matchAllClasses was set, so
        # the default call raised UnboundLocalError unless spoofNumTrue was
        # supplied. Count every ground-truth instance in both cases.
        # NOTE(review): confirm whether matchAllClasses=False was meant to
        # restrict this count to a single class.
        numTrue = len(trueSeqs)

    if spoofNumReported >= 0:
        numReported = spoofNumReported
    else:
        numReported = len(reportedSeqs)
    if spoofNumTrue >= 0:
        numTrue = spoofNumTrue
    return numReported, numTrue, numMatches
def old_matchingSubseqs(reportedSeqs, trueSeqs, matchFunc=None):
    """Greedily pair reported tuples with ground-truth tuples.

    Given the (seqId, start, end, class) tuples reported by a classifier
    and the true (seqId, start, end, class) tuples, compute which of the
    true tuples each reported tuple corresponds to (-1 if none of them).

    seqId is a unique ID for each input sequence, start and end are
    indices within this sequence, and matchFunc is the function used to
    determine whether a reported and ground truth tuple match. Tuples are
    split by seqId, so matchFunc need only assess start and end indices
    and class. By default, matchFunc is subseqsMatch (also in this file).

    Matches are assigned greedily from beginning to end, sorted by start
    index; each ground-truth tuple can be matched at most once.

    Returns:
        dict: seqId -> list with one entry per reported tuple of that
        sequence (in start-sorted order). Each entry is the index of the
        matched tuple within that sequence's start-sorted ground truth, or
        -1 if nothing matched. Sequences with no ground-truth tuples are
        omitted from the dict.
    """
    # make sure we have a func to test for matches
    matchFunc = matchFunc or subseqsMatch

    # group reported and true seqs by sequence id (in position 0)
    seq2reported = sequence.splitElementsBy(lambda tup: tup[0], reportedSeqs)
    seq2truth = sequence.splitElementsBy(lambda tup: tup[0], trueSeqs)

    matchesDict = {}
    for seqId, reported in seq2reported.items():
        truth = seq2truth.get(seqId)
        if not truth:  # ground truth has no instances in this sequence
            continue

        # sort by start time
        reported = sorted(reported, key=lambda x: x[1])
        truth = sorted(truth, key=lambda x: x[1])

        # Mark matched truth tuples instead of deleting them. Deleting
        # shifted the remaining entries, so later indices pointed into a
        # shrunken list and different truth tuples could share an index.
        claimed = [False] * len(truth)
        matches = []
        for repSeq in reported:
            matchIdx = -1
            for j, trueSeq in enumerate(truth):
                if claimed[j]:  # can't match the same thing twice
                    continue
                if matchFunc(repSeq, trueSeq):
                    claimed[j] = True
                    matchIdx = j
                    break
            matches.append(matchIdx)
        matchesDict[seqId] = matches

    return matchesDict
def subseqIOUStats(reportedSeqs, trueSeqs, matchUpLabels=False,
                   returnMoreStats=False):
    """Compute intersection-over-union stats for reported vs. true instances.

    Args:
        reportedSeqs: reported instances; each must expose ``.label`` when
            matchUpLabels is set.
        trueSeqs: ground-truth instances (same requirements).
        matchUpLabels: if set, group both inputs by label and, for every
            reported label, use the ground-truth label with the highest
            IOU (as computed by computeIOU with ignoreLabel=True). The
            per-label intersections, unions and sizes are summed and the
            overall IOU is the ratio of the summed intersection to the
            summed union. Otherwise computeIOU is applied once to the full
            inputs with ignoreLabel=False.
        returnMoreStats: if set, also return the total reported and
            ground-truth sizes (from totalInstancesSize).

    Returns:
        (intersectionSize, unionSize, iou), or
        (intersectionSize, unionSize, iou, reportedSize, truthSize) when
        returnMoreStats is set. With matchUpLabels, iou is 0. when the
        summed union is zero.
    """
    if matchUpLabels:  # might find pattern, but not know what to label it
        lbl2reported = sequence.splitElementsBy(
            lambda inst: inst.label, reportedSeqs)
        lbl2truth = sequence.splitElementsBy(
            lambda inst: inst.label, trueSeqs)

        intersectionSize = 0.
        unionSize = 0.
        reportedSize = 0.
        truthSize = 0.
        for repSeqs in lbl2reported.values():
            bestIntersection = 0
            bestUnion = 0
            bestReportedSize = 0
            bestTruthSize = 0
            bestIOU = 0.
            for truthSeqs in lbl2truth.values():
                interSz, unionSz, iou = computeIOU(
                    repSeqs, truthSeqs, ignoreLabel=True)
                if iou >= bestIOU:  # so that ties for 0 will replace stuff
                    bestIntersection = interSz
                    bestUnion = unionSz
                    bestReportedSize = totalInstancesSize(repSeqs)
                    bestTruthSize = totalInstancesSize(truthSeqs)
                    bestIOU = iou
            intersectionSize += bestIntersection
            unionSize += bestUnion
            reportedSize += bestReportedSize
            truthSize += bestTruthSize

        # With no reported labels or no truth labels nothing is accumulated
        # and the union is zero; report an IOU of 0 rather than raising
        # ZeroDivisionError.
        if unionSize:
            iou = float(intersectionSize) / unionSize
        else:
            iou = 0.
    else:
        intersectionSize, unionSize, iou = computeIOU(
            reportedSeqs, trueSeqs, ignoreLabel=False)
        reportedSize = totalInstancesSize(reportedSeqs)
        truthSize = totalInstancesSize(trueSeqs)

    if returnMoreStats:
        return intersectionSize, unionSize, iou, reportedSize, truthSize
    return intersectionSize, unionSize, iou
def subseqMatchStats(reportedSeqs, trueSeqs, matchFunc=None,
                     spoofNumReported=-1, spoofNumTrue=-1,
                     ignorePositions=False, matchUpLabels=False,
                     matchAllClasses=False, minOverlapFraction=.5,
                     requireContainment=False, **sink):
    """[PatternInstance] x [PatternInstance] -> numReported, numTrue, numMatches

    Count how many reported instances match ground-truth instances.

    NOTE: an earlier definition with this same name appears above in this
    module; this later one replaces it at import time.

    Args:
        reportedSeqs: reported instances; each must expose ``.label`` when
            matchUpLabels is set.
        trueSeqs: ground-truth instances (same requirements).
        matchFunc: forwarded to computeNumMatches. If falsy and
            ignorePositions is set, matchIgnoringPositions is used instead.
        spoofNumReported: if >= 0, returned in place of len(reportedSeqs).
        spoofNumTrue: if >= 0, returned in place of the computed numTrue.
        ignorePositions: see matchFunc.
        matchUpLabels: group both inputs by label and, for every reported
            label, credit the ground-truth label (among those occurring at
            least twice) that yields the most matches.
        matchAllClasses: retained for interface compatibility. Without
            matchUpLabels, numTrue is len(trueSeqs) whether or not this is
            set (see the review note in the body).
        minOverlapFraction: forwarded to computeNumMatches.
        requireContainment: forwarded to computeNumMatches.
        **sink: extra keyword arguments are accepted and ignored.

    Returns:
        Tuple (numReported, numTrue, numMatches).
    """
    if (not matchFunc) and ignorePositions:
        matchFunc = matchIgnoringPositions

    if matchUpLabels:  # might find pattern, but not know what to label it
        lbl2reported = sequence.splitElementsBy(
            lambda inst: inst.label, reportedSeqs)
        lbl2truth = sequence.splitElementsBy(
            lambda inst: inst.label, trueSeqs)

        # XXX if we report more than one label, there isn't necessarily a
        # bijective mapping between reported and ground truth labels; it's
        # possible to get more matches than true labels here, among other
        # issues

        # Every reported label contributes at least this many "true"
        # instances. The value does not depend on the reported label, so it
        # is computed once; the emptiness guard avoids min() raising
        # ValueError when there is no ground truth at all.
        truthCounts = [len(seqs) for seqs in lbl2truth.values()]
        minNumTruth = max(min(truthCounts) if truthCounts else 0, 2)

        numMatches = 0
        numTrue = 0
        for repSeqs in lbl2reported.values():
            bestNumMatches = -1  # below 0 so a label with 0 matches can win
            bestNumTruth = 0
            for truthSeqs in lbl2truth.values():
                if len(truthSeqs) < 2:  # ignore patterns that happen once
                    continue
                numMatchesForLabel = computeNumMatches(
                    repSeqs, truthSeqs, matchFunc, ignoreLabel=True,
                    minOverlapFraction=minOverlapFraction,
                    requireContainment=requireContainment)
                if numMatchesForLabel > bestNumMatches:
                    bestNumMatches = numMatchesForLabel
                    bestNumTruth = len(truthSeqs)
            numMatches += max(0, bestNumMatches)
            numTrue += max(minNumTruth, bestNumTruth)
    else:
        numMatches = computeNumMatches(
            reportedSeqs, trueSeqs, matchFunc,
            minOverlapFraction=minOverlapFraction,
            requireContainment=requireContainment)
        # numTrue used to be assigned only when matchAllClasses was set, so
        # the default call raised UnboundLocalError unless spoofNumTrue was
        # supplied. Count every ground-truth instance in both cases.
        # NOTE(review): confirm whether matchAllClasses=False was meant to
        # restrict this count to a single class.
        numTrue = len(trueSeqs)

    if spoofNumReported >= 0:
        numReported = spoofNumReported
    else:
        numReported = len(reportedSeqs)
    if spoofNumTrue >= 0:
        numTrue = spoofNumTrue
    return numReported, numTrue, numMatches