Python splitElementsBy Examples, sequence.splitElementsBy Python Examples

Example #1

0

Show file

File: evaluate.py Project: dblalock/dig

def subseqMatchStats(reportedSeqs, trueSeqs, matchFunc=None,
	spoofNumReported=-1, spoofNumTrue=-1, ignorePositions=False,
	matchUpLabels=False, matchAllClasses=False, minOverlapFraction=.5,
	requireContainment=False, **sink):
	"""[PatternInstance] x [PatternInstance] -> numReported, numTrue, numMatches

	Count how many reported instances match ground-truth instances.

	Args:
		reportedSeqs: instances reported by the algorithm; each must expose a
			`label` attribute when matchUpLabels is set.
		trueSeqs: ground-truth instances (same requirement).
		matchFunc: forwarded to computeNumMatches. When falsy and
			ignorePositions is set, matchIgnoringPositions is used instead.
		spoofNumReported: if >= 0, returned as numReported in place of
			len(reportedSeqs).
		spoofNumTrue: if >= 0, returned as numTrue in place of the computed
			value.
		ignorePositions: see matchFunc.
		matchUpLabels: if true, reported labels are not trusted; each group
			of reported instances sharing a label is scored against every
			ground-truth label that occurs at least twice, and the truth
			label yielding the most matches is used for that group.
		matchAllClasses: if true, numTrue is len(trueSeqs) regardless of the
			label matching above.
		minOverlapFraction: forwarded to computeNumMatches.
		requireContainment: forwarded to computeNumMatches.
		**sink: extra keyword arguments are accepted and ignored.

	Returns:
		The tuple (numReported, numTrue, numMatches).
	"""

	if (not matchFunc) and ignorePositions:
		matchFunc = matchIgnoringPositions

	if matchUpLabels: # might find pattern, but not know what to label it
		lbl2reported = sequence.splitElementsBy(lambda inst: inst.label, reportedSeqs)
		lbl2truth = sequence.splitElementsBy(lambda inst: inst.label, trueSeqs)
		truthGroups = list(lbl2truth.values())

		# Each reported label contributes at least this many true instances:
		# the size of the smallest truth label, but never fewer than 2. It
		# does not depend on the reported label, so compute it once. With no
		# truth labels at all, fall back to the floor of 2 rather than
		# calling min() on an empty sequence.
		if truthGroups:
			minNumTruth = max(min(len(group) for group in truthGroups), 2)
		else:
			minNumTruth = 2

		# XXX if we report more than one label, there isn't necessarily a
		# bijective mapping between reported and ground truth labels; it's
		# possible to get more matches than true labels here, among other issues
		numMatches = 0 # or avg overlap fraction (IOU)
		numTrue = 0
		for repSeqs in lbl2reported.values():
			# start below 0 so that a truth label with 0 matches still gets
			# selected and sets bestNumTruth
			bestNumMatches = -1
			bestNumTruth = 0
			for truthSeqs in truthGroups:
				if len(truthSeqs) < 2: # ignore patterns that only happen once
					continue
				numMatchesForLabel = computeNumMatches(repSeqs, truthSeqs, matchFunc,
					ignoreLabel=True, minOverlapFraction=minOverlapFraction,
					requireContainment=requireContainment)
				if numMatchesForLabel > bestNumMatches:
					bestNumMatches = numMatchesForLabel
					bestNumTruth = len(truthSeqs)
			numMatches += max(0, bestNumMatches)
			numTrue += max(minNumTruth, bestNumTruth)
	else:
		numMatches = computeNumMatches(reportedSeqs, trueSeqs, matchFunc,
			minOverlapFraction=minOverlapFraction,
			requireContainment=requireContainment)
		# Without label matching every ground-truth instance counts; this
		# was previously left unassigned, raising UnboundLocalError below
		# unless matchAllClasses or spoofNumTrue was supplied.
		numTrue = len(trueSeqs)

	if matchAllClasses:
		numTrue = len(trueSeqs)

	numReported = spoofNumReported if spoofNumReported >= 0 else len(reportedSeqs)
	numTrue = spoofNumTrue if spoofNumTrue >= 0 else numTrue

	return numReported, numTrue, numMatches

Example #2

0

Show file

File: evaluate.py Project: yiweichung/extract

def old_matchingSubseqs(reportedSeqs, trueSeqs, matchFunc=None):
    """
    Given the (seqId, start, end, class) tuples reported by a classifier and
    the true (seqId, start, end, class) tuples, compute which of the true
    tuples each reported tuple corresponds to (-1 if none of them).

    seqId is a unique ID for each input sequence, start and end are indices
    within this sequence, and matchFunc is the function used to determine
    whether a reported and ground truth tuple match. Tuples are split by
    seqId, so matchFunc need only assess start and end indices and class.
    When matchFunc is falsy, subseqsMatch (also in this file) is used.

    Matches are assigned greedily from beginning to end, with both the
    reported and the true tuples of a sequence sorted by start index. Each
    true tuple can be matched by at most one reported tuple.

    Returns a dict: seqId -> list with one entry per reported tuple (in
    start-sorted order) holding the index of the matched true tuple within
    that sequence's start-sorted true tuples, or -1 if nothing matched.
    Sequences that have reported tuples but no true tuples are omitted from
    the dict.
    """

    # make sure we have a func to test for matches
    matchFunc = matchFunc or subseqsMatch

    # group reported and true seqs by sequence id (in position 0)
    seq2reported = sequence.splitElementsBy(lambda tup: tup[0], reportedSeqs)
    seq2truth = sequence.splitElementsBy(lambda tup: tup[0], trueSeqs)

    matchesDict = {}
    for seqId, reported in seq2reported.items():
        truth = seq2truth.get(seqId)
        if not truth:  # ground truth has no instances in this sequence
            continue

        # sort by start time
        reported = sorted(reported, key=lambda x: x[1])
        truth = sorted(truth, key=lambda x: x[1])

        # Track consumed truth tuples with flags instead of deleting them:
        # deleting shifts the positions of the remaining tuples, which made
        # every index recorded after the first match ambiguous.
        alreadyMatched = [False] * len(truth)
        matches = []
        for repSeq in reported:
            matchIdx = -1
            for j, trueSeq in enumerate(truth):
                if alreadyMatched[j]:  # can't match the same thing twice
                    continue
                if matchFunc(repSeq, trueSeq):
                    alreadyMatched[j] = True
                    matchIdx = j
                    break
            matches.append(matchIdx)

        matchesDict[seqId] = matches

    return matchesDict

Example #3

0

Show file

File: evaluate.py Project: dblalock/dig

def old_matchingSubseqs(reportedSeqs, trueSeqs, matchFunc=None):
	"""
	Given the (seqId, start, end, class) tuples reported by a classifier and the
	true (seqId, start, end, class) tuples, compute which of the true tuples
	each reported tuple corresponds to (-1 if none of them).

	seqId is a unique ID for each input sequence, start and end are indices
	within this sequence, and matchFunc is the function used to determine
	whether a reported and ground truth tuple match. Tuples are split by
	seqId, so matchFunc need only assess start and end indices and class.
	When matchFunc is falsy, subseqsMatch (also in this file) is used.

	Matches are assigned greedily from beginning to end, with both the
	reported and the true tuples of a sequence sorted by start index. A true
	tuple is never assigned to more than one reported tuple.

	Returns a dict: seqId -> list with one entry per reported tuple (in
	start-sorted order) giving the index of its matching truth tuple among
	that sequence's start-sorted truth tuples, or -1 if there is none.
	Sequences with reported tuples but no truth tuples get no entry.
	"""

	# make sure we have a func to test for matches
	matchFunc = matchFunc or subseqsMatch

	# group reported and true seqs by sequence id (in position 0)
	seq2reported = sequence.splitElementsBy(lambda tup: tup[0], reportedSeqs)
	seq2truth = sequence.splitElementsBy(lambda tup: tup[0], trueSeqs)

	matchesDict = {}
	for seqId, reported in seq2reported.items():
		truth = seq2truth.get(seqId)
		if not truth: # ground truth has no instances in this sequence
			continue

		# sort by start time
		reported = sorted(reported, key=lambda x: x[1])
		truth = sorted(truth, key=lambda x: x[1])

		# Remember which truth indices are taken rather than deleting the
		# tuples; deletion shifted later tuples down, so the indices stored
		# after the first match no longer identified a unique truth tuple.
		usedIdxs = set()
		matches = []
		for repSeq in reported:
			matchIdx = -1
			for j, trueSeq in enumerate(truth):
				if j in usedIdxs: # can't match the same thing twice
					continue
				if matchFunc(repSeq, trueSeq):
					usedIdxs.add(j)
					matchIdx = j
					break
			matches.append(matchIdx)

		matchesDict[seqId] = matches

	return matchesDict

Example #4

0

Show file

File: evaluate.py Project: yiweichung/extract

def subseqIOUStats(reportedSeqs,
                   trueSeqs,
                   matchUpLabels=False,
                   returnMoreStats=False):
    """Compute intersection / union statistics of reported vs. true instances.

    Args:
        reportedSeqs: instances reported by the algorithm; each must expose
            a `label` attribute when matchUpLabels is set.
        trueSeqs: ground-truth instances (same requirement).
        matchUpLabels: if true, reported labels are not trusted. Reported
            instances are grouped by label, each group is scored with
            computeIOU (ignoreLabel=True) against every ground-truth label,
            and the statistics of the truth label with the highest IOU are
            added to the running totals. Otherwise a single computeIOU call
            with ignoreLabel=False is made on all instances.
        returnMoreStats: if true, also return the total reported and truth
            sizes as computed by totalInstancesSize.

    Returns:
        (intersectionSize, unionSize, iou), or
        (intersectionSize, unionSize, iou, reportedSize, truthSize) when
        returnMoreStats is set. With matchUpLabels, iou is the ratio of the
        summed intersection to the summed union, and is 0. when that union
        is empty.
    """

    if matchUpLabels:  # might find pattern, but not know what to label it
        lbl2reported = sequence.splitElementsBy(lambda inst: inst.label,
                                                reportedSeqs)
        lbl2truth = sequence.splitElementsBy(lambda inst: inst.label, trueSeqs)
        truthGroups = list(lbl2truth.values())

        intersectionSize = 0.
        unionSize = 0.
        reportedSize = 0.
        truthSize = 0.
        for repSeqs in lbl2reported.values():
            # size of this reported group is the same for every truth label
            repSize = totalInstancesSize(repSeqs)

            bestIntersection = 0
            bestUnion = 0
            bestReportedSize = 0
            bestTruthSize = 0
            bestIOU = 0.
            for truthSeqs in truthGroups:
                interSz, unionSz, iou = computeIOU(repSeqs,
                                                   truthSeqs,
                                                   ignoreLabel=True)
                if iou >= bestIOU:  # so that ties for 0 will replace stuff
                    bestIntersection = interSz
                    bestUnion = unionSz
                    bestReportedSize = repSize
                    bestTruthSize = totalInstancesSize(truthSeqs)
                    bestIOU = iou
            intersectionSize += bestIntersection
            unionSize += bestUnion
            reportedSize += bestReportedSize
            truthSize += bestTruthSize

        # Nothing reported (or nothing to compare against) leaves the union
        # at 0; report an IOU of 0 instead of dividing by zero.
        if unionSize:
            iou = float(intersectionSize) / unionSize
        else:
            iou = 0.
    else:
        intersectionSize, unionSize, iou = computeIOU(reportedSeqs,
                                                      trueSeqs,
                                                      ignoreLabel=False)
        reportedSize = totalInstancesSize(reportedSeqs)
        truthSize = totalInstancesSize(trueSeqs)

    if returnMoreStats:
        return intersectionSize, unionSize, iou, reportedSize, truthSize

    return intersectionSize, unionSize, iou

Example #5

0

Show file

File: evaluate.py Project: dblalock/dig

def subseqIOUStats(reportedSeqs, trueSeqs, matchUpLabels=False, returnMoreStats=False):
	"""Compute intersection / union statistics of reported vs. true instances.

	Args:
		reportedSeqs: instances reported by the algorithm; each must expose a
			`label` attribute when matchUpLabels is set.
		trueSeqs: ground-truth instances (same requirement).
		matchUpLabels: if true, reported labels are not trusted. Reported
			instances are grouped by label, each group is scored with
			computeIOU (ignoreLabel=True) against every ground-truth label,
			and the statistics of the truth label with the highest IOU are
			added to the running totals. Otherwise a single computeIOU call
			with ignoreLabel=False is made on all instances.
		returnMoreStats: if true, also return the total reported and truth
			sizes as computed by totalInstancesSize.

	Returns:
		(intersectionSize, unionSize, iou), or
		(intersectionSize, unionSize, iou, reportedSize, truthSize) when
		returnMoreStats is set. With matchUpLabels, iou is the summed
		intersection divided by the summed union, or 0. if that union is
		empty.
	"""

	if matchUpLabels: # might find pattern, but not know what to label it
		lbl2reported = sequence.splitElementsBy(lambda inst: inst.label, reportedSeqs)
		lbl2truth = sequence.splitElementsBy(lambda inst: inst.label, trueSeqs)
		truthGroups = list(lbl2truth.values())

		intersectionSize = 0.
		unionSize = 0.
		reportedSize = 0.
		truthSize = 0.
		for repSeqs in lbl2reported.values():
			# the reported group's size does not depend on the truth label
			repSize = totalInstancesSize(repSeqs)

			bestIntersection = 0
			bestUnion = 0
			bestReportedSize = 0
			bestTruthSize = 0
			bestIOU = 0.
			for truthSeqs in truthGroups:
				interSz, unionSz, iou = computeIOU(repSeqs, truthSeqs,
					ignoreLabel=True)
				if iou >= bestIOU: # so that ties for 0 will replace stuff
					bestIntersection = interSz
					bestUnion = unionSz
					bestReportedSize = repSize
					bestTruthSize = totalInstancesSize(truthSeqs)
					bestIOU = iou
			intersectionSize += bestIntersection
			unionSize += bestUnion
			reportedSize += bestReportedSize
			truthSize += bestTruthSize

		# An empty union (e.g. nothing was reported) used to raise
		# ZeroDivisionError here; define the IOU as 0 in that case.
		if unionSize:
			iou = float(intersectionSize) / unionSize
		else:
			iou = 0.
	else:
		intersectionSize, unionSize, iou = computeIOU(reportedSeqs, trueSeqs,
			ignoreLabel=False)
		reportedSize = totalInstancesSize(reportedSeqs)
		truthSize = totalInstancesSize(trueSeqs)

	if returnMoreStats:
		return intersectionSize, unionSize, iou, reportedSize, truthSize

	return intersectionSize, unionSize, iou

Example #6

0

Show file

File: evaluate.py Project: yiweichung/extract

def subseqMatchStats(reportedSeqs,
                     trueSeqs,
                     matchFunc=None,
                     spoofNumReported=-1,
                     spoofNumTrue=-1,
                     ignorePositions=False,
                     matchUpLabels=False,
                     matchAllClasses=False,
                     minOverlapFraction=.5,
                     requireContainment=False,
                     **sink):
    """[PatternInstance] x [PatternInstance] -> numReported, numTrue, numMatches

    Count how many reported instances match ground-truth instances.

    Args:
        reportedSeqs: instances reported by the algorithm; each must expose
            a `label` attribute when matchUpLabels is set.
        trueSeqs: ground-truth instances (same requirement).
        matchFunc: forwarded to computeNumMatches. When falsy and
            ignorePositions is set, matchIgnoringPositions is used instead.
        spoofNumReported: if >= 0, returned as numReported in place of
            len(reportedSeqs).
        spoofNumTrue: if >= 0, returned as numTrue in place of the computed
            value.
        ignorePositions: see matchFunc.
        matchUpLabels: if true, reported labels are not trusted; each group
            of reported instances sharing a label is scored against every
            ground-truth label that occurs at least twice, and the truth
            label yielding the most matches is used for that group.
        matchAllClasses: if true, numTrue is len(trueSeqs) regardless of the
            label matching above.
        minOverlapFraction: forwarded to computeNumMatches.
        requireContainment: forwarded to computeNumMatches.
        **sink: extra keyword arguments are accepted and ignored.

    Returns:
        The tuple (numReported, numTrue, numMatches).
    """

    if (not matchFunc) and ignorePositions:
        matchFunc = matchIgnoringPositions

    if matchUpLabels:  # might find pattern, but not know what to label it
        lbl2reported = sequence.splitElementsBy(lambda inst: inst.label,
                                                reportedSeqs)
        lbl2truth = sequence.splitElementsBy(lambda inst: inst.label, trueSeqs)
        truthGroups = list(lbl2truth.values())

        # Every reported label adds at least this many true instances: the
        # size of the smallest truth label, floored at 2. The value is the
        # same for all reported labels, so it is computed once up front.
        # With no truth labels, use the floor directly instead of letting
        # min() fail on an empty sequence.
        if truthGroups:
            minNumTruth = max(min(len(group) for group in truthGroups), 2)
        else:
            minNumTruth = 2

        # XXX if we report more than one label, there isn't necessarily a
        # bijective mapping between reported and ground truth labels; it's
        # possible to get more matches than true labels here, among other issues
        numMatches = 0  # or avg overlap fraction (IOU)
        numTrue = 0
        for repSeqs in lbl2reported.values():
            # start below 0 so that a truth label with 0 matches is still
            # selected and sets bestNumTruth
            bestNumMatches = -1
            bestNumTruth = 0
            for truthSeqs in truthGroups:
                if len(truthSeqs) < 2:  # ignore patterns that only happen once
                    continue
                numMatchesForLabel = computeNumMatches(
                    repSeqs,
                    truthSeqs,
                    matchFunc,
                    ignoreLabel=True,
                    minOverlapFraction=minOverlapFraction,
                    requireContainment=requireContainment)
                if numMatchesForLabel > bestNumMatches:
                    bestNumMatches = numMatchesForLabel
                    bestNumTruth = len(truthSeqs)
            numMatches += max(0, bestNumMatches)
            numTrue += max(minNumTruth, bestNumTruth)
    else:
        numMatches = computeNumMatches(reportedSeqs,
                                       trueSeqs,
                                       matchFunc,
                                       minOverlapFraction=minOverlapFraction,
                                       requireContainment=requireContainment)
        # Without label matching every ground-truth instance counts. This
        # name used to be left unassigned on this path, which raised
        # UnboundLocalError below unless matchAllClasses or spoofNumTrue
        # was supplied.
        numTrue = len(trueSeqs)

    if matchAllClasses:
        numTrue = len(trueSeqs)

    numReported = spoofNumReported if spoofNumReported >= 0 else len(
        reportedSeqs)
    numTrue = spoofNumTrue if spoofNumTrue >= 0 else numTrue

    return numReported, numTrue, numMatches