Esempio n. 1
0
def find_curve_range_intersection(curve_1, curve_2, cut_at_inflection=False):
    """
    Return intersections of x- and y-ranges of two real curves,
    which are parametric curves on the xy-plane given as
    (x_array, y_array), a tuple of NumPy arrays.
    """
    x1, y1 = curve_1
    x2, y2 = curve_2

    if cut_at_inflection is True:
        x1_min, x1_max = sorted([x1[0], x1[-1]])
        x2_min, x2_max = sorted([x2[0], x2[-1]])

        y1_min, y1_may = sorted([y1[0], y1[-1]])
        y2_min, y2_may = sorted([y2[0], y2[-1]])
    else:
        x1_min, x1_max = numpy.sort(x1)[[0, -1]]
        x2_min, x2_max = numpy.sort(x2)[[0, -1]]

        y1_min, y1_may = numpy.sort(y1)[[0, -1]]
        y2_min, y2_may = numpy.sort(y2)[[0, -1]]

    x1_interval = Interval(x1_min, x1_max)
    x2_interval = Interval(x2_min, x2_max)

    y1_interval = Interval(y1_min, y1_may)
    y2_interval = Interval(y2_min, y2_may)

    x_range = x1_interval.intersect(x2_interval)
    y_range = y1_interval.intersect(y2_interval)

    return (x_range, y_range)
Esempio n. 2
0
def find_curve_range_intersection(curve_1, curve_2, cut_at_inflection=False):
    """
    Return intersections of x- and y-ranges of two real curves,
    which are parametric curves on the xy-plane given as 
    (x_array, y_array), a tuple of NumPy arrays.
    """
    x1, y1 = curve_1
    x2, y2 = curve_2

    if cut_at_inflection is True:
        x1_min, x1_max = sorted([x1[0], x1[-1]])
        x2_min, x2_max = sorted([x2[0], x2[-1]])

        y1_min, y1_may = sorted([y1[0], y1[-1]])
        y2_min, y2_may = sorted([y2[0], y2[-1]])
    else:
        x1_min, x1_max = numpy.sort(x1)[[0, -1]]
        x2_min, x2_max = numpy.sort(x2)[[0, -1]]

        y1_min, y1_may = numpy.sort(y1)[[0, -1]]
        y2_min, y2_may = numpy.sort(y2)[[0, -1]]

    x1_interval = Interval(x1_min, x1_max)
    x2_interval = Interval(x2_min, x2_max)

    y1_interval = Interval(y1_min, y1_may)
    y2_interval = Interval(y2_min, y2_may)

    x_range = x1_interval.intersect(x2_interval)
    y_range = y1_interval.intersect(y2_interval)

    return (x_range, y_range)
Esempio n. 3
0
def test_measure():
    a = Symbol('a', real=True)

    assert Interval(1, 3).measure == 2
    assert Interval(0, a).measure == a
    assert Interval(1, a).measure == a - 1

    assert Union(Interval(1, 2), Interval(3, 4)).measure == 2
    assert Union(Interval(1, 2), Interval(3, 4), FiniteSet(5, 6, 7)).measure \
        == 2

    assert FiniteSet(1, 2, oo, a, -oo, -5).measure == 0

    assert S.EmptySet.measure == 0

    square = Interval(0, 10) * Interval(0, 10)
    offsetsquare = Interval(5, 15) * Interval(5, 15)
    band = Interval(-oo, oo) * Interval(2, 4)

    assert square.measure == offsetsquare.measure == 100
    assert (square + offsetsquare).measure == 175  # there is some overlap
    assert (square - offsetsquare).measure == 75
    assert (square * FiniteSet(1, 2, 3)).measure == 0
    assert (square.intersect(band)).measure == 20
    assert (square + band).measure == oo
    assert (band * FiniteSet(1, 2, 3)).measure == nan
Esempio n. 4
0
def test_union():
    assert Union(Interval(1, 2), Interval(2, 3)) == Interval(1, 3)
    assert Union(Interval(1, 2), Interval(2, 3, True)) == Interval(1, 3)
    assert Union(Interval(1, 3), Interval(2, 4)) == Interval(1, 4)
    assert Union(Interval(1, 2), Interval(1, 3)) == Interval(1, 3)
    assert Union(Interval(1, 3), Interval(1, 2)) == Interval(1, 3)
    assert Union(Interval(1, 3, False, True), Interval(1, 2)) == \
        Interval(1, 3, False, True)
    assert Union(Interval(1, 3), Interval(1, 2, False, True)) == Interval(1, 3)
    assert Union(Interval(1, 2, True), Interval(1, 3)) == Interval(1, 3)
    assert Union(Interval(1, 2, True), Interval(1, 3, True)) == \
        Interval(1, 3, True)
    assert Union(Interval(1, 2, True), Interval(1, 3, True, True)) == \
        Interval(1, 3, True, True)
    assert Union(Interval(1, 2, True, True), Interval(1, 3, True)) == \
        Interval(1, 3, True)
    assert Union(Interval(1, 3), Interval(2, 3)) == Interval(1, 3)
    assert Union(Interval(1, 3, False, True), Interval(2, 3)) == \
        Interval(1, 3)
    assert Union(Interval(1, 2, False, True), Interval(2, 3, True)) != \
        Interval(1, 3)
    assert Union(Interval(1, 2), S.EmptySet) == Interval(1, 2)
    assert Union(S.EmptySet) == S.EmptySet

    assert Union(Interval(0, 1), [FiniteSet(1.0/n) for n in range(1, 10)]) == \
        Interval(0, 1)

    assert Interval(1, 2).union(Interval(2, 3)) == \
        Interval(1, 2) + Interval(2, 3)

    assert Interval(1, 2).union(Interval(2, 3)) == Interval(1, 3)

    assert Union(Set()) == Set()

    assert FiniteSet(1) + FiniteSet(2) + FiniteSet(3) == FiniteSet(1, 2, 3)
    assert FiniteSet(['ham']) + FiniteSet(['eggs']) == FiniteSet('ham', 'eggs')
    assert FiniteSet(1, 2, 3) + S.EmptySet == FiniteSet(1, 2, 3)

    assert FiniteSet(1, 2, 3) & FiniteSet(2, 3, 4) == FiniteSet(2, 3)
    assert FiniteSet(1, 2, 3) | FiniteSet(2, 3, 4) == FiniteSet(1, 2, 3, 4)

    x = Symbol("x")
    y = Symbol("y")
    z = Symbol("z")
    assert S.EmptySet | FiniteSet(x, FiniteSet(y, z)) == \
        FiniteSet(x, FiniteSet(y, z))

    # Test that Intervals and FiniteSets play nicely
    assert Interval(1, 3) + FiniteSet(2) == Interval(1, 3)
    assert Interval(1, 3, True, True) + FiniteSet(3) == \
        Interval(1, 3, True, False)
    X = Interval(1, 3) + FiniteSet(5)
    Y = Interval(1, 2) + FiniteSet(3)
    XandY = X.intersect(Y)
    assert 2 in X and 3 in X and 3 in XandY
    assert X.subset(XandY) and Y.subset(XandY)

    raises(TypeError, lambda: Union(1, 2, 3))

    assert X.is_iterable is False
Esempio n. 5
0
def test_complement():
    assert Interval(0, 1).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, oo, True, True))
    assert Interval(0, 1, True, False).complement == \
           Union(Interval(-oo, 0, True, False), Interval(1, oo, True, True))
    assert Interval(0, 1, False, True).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, oo, False, True))
    assert Interval(0, 1, True, True).complement == \
           Union(Interval(-oo, 0, True, False), Interval(1, oo, False, True))

    assert -S.EmptySet == S.EmptySet.complement
    assert ~S.EmptySet == S.EmptySet.complement

    assert S.EmptySet.complement == Interval(-oo, oo)

    assert Union(Interval(0, 1), Interval(2, 3)).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, 2, True, True),
                 Interval(3, oo, True, True))

    assert FiniteSet(0).complement == Union(Interval(-oo, 0, True, True),
                                            Interval(0, oo, True, True))

    assert (FiniteSet(5) + Interval(S.NegativeInfinity, 0)).complement == \
            Interval(0, 5, True, True) + Interval(5, S.Infinity, True,True)

    assert FiniteSet(1, 2, 3).complement == Interval(
        S.NegativeInfinity, 1, True, True) + Interval(
            1, 2, True, True) + Interval(2, 3, True, True) + Interval(
                3, S.Infinity, True, True)

    X = Interval(1, 3) + FiniteSet(5)
    assert X.intersect(X.complement) == S.EmptySet
Esempio n. 6
0
def test_union():
    assert Union(Interval(1, 2), Interval(2, 3)) == Interval(1, 3)
    assert Union(Interval(1, 2), Interval(2, 3, True)) == Interval(1, 3)
    assert Union(Interval(1, 3), Interval(2, 4)) == Interval(1, 4)
    assert Union(Interval(1, 2), Interval(1, 3)) == Interval(1, 3)
    assert Union(Interval(1, 3), Interval(1, 2)) == Interval(1, 3)
    assert Union(Interval(1, 3, False, True), Interval(1, 2)) == \
        Interval(1, 3, False, True)
    assert Union(Interval(1, 3), Interval(1, 2, False, True)) == Interval(1, 3)
    assert Union(Interval(1, 2, True), Interval(1, 3)) == Interval(1, 3)
    assert Union(Interval(1, 2, True), Interval(1, 3, True)) == \
        Interval(1, 3, True)
    assert Union(Interval(1, 2, True), Interval(1, 3, True, True)) == \
        Interval(1, 3, True, True)
    assert Union(Interval(1, 2, True, True), Interval(1, 3, True)) == \
        Interval(1, 3, True)
    assert Union(Interval(1, 3), Interval(2, 3)) == Interval(1, 3)
    assert Union(Interval(1, 3, False, True), Interval(2, 3)) == \
        Interval(1, 3)
    assert Union(Interval(1, 2, False, True), Interval(2, 3, True)) != \
        Interval(1, 3)
    assert Union(Interval(1, 2), S.EmptySet) == Interval(1, 2)
    assert Union(S.EmptySet) == S.EmptySet

    assert Union(Interval(0, 1), [FiniteSet(1.0/n) for n in range(1, 10)]) == \
        Interval(0, 1)

    assert Interval(1, 2).union(Interval(2, 3)) == \
        Interval(1, 2) + Interval(2, 3)

    assert Interval(1, 2).union(Interval(2, 3)) == Interval(1, 3)

    assert Union(Set()) == Set()

    assert FiniteSet(1) + FiniteSet(2) + FiniteSet(3) == FiniteSet(1, 2, 3)
    assert FiniteSet('ham') + FiniteSet('eggs') == FiniteSet('ham', 'eggs')
    assert FiniteSet(1, 2, 3) + S.EmptySet == FiniteSet(1, 2, 3)

    assert FiniteSet(1, 2, 3) & FiniteSet(2, 3, 4) == FiniteSet(2, 3)
    assert FiniteSet(1, 2, 3) | FiniteSet(2, 3, 4) == FiniteSet(1, 2, 3, 4)

    x = Symbol("x")
    y = Symbol("y")
    z = Symbol("z")
    assert S.EmptySet | FiniteSet(x, FiniteSet(y, z)) == \
        FiniteSet(x, FiniteSet(y, z))

    # Test that Intervals and FiniteSets play nicely
    assert Interval(1, 3) + FiniteSet(2) == Interval(1, 3)
    assert Interval(1, 3, True, True) + FiniteSet(3) == \
        Interval(1, 3, True, False)
    X = Interval(1, 3) + FiniteSet(5)
    Y = Interval(1, 2) + FiniteSet(3)
    XandY = X.intersect(Y)
    assert 2 in X and 3 in X and 3 in XandY
    assert XandY.is_subset(X) and XandY.is_subset(Y)

    raises(TypeError, lambda: Union(1, 2, 3))

    assert X.is_iterable is False
Esempio n. 7
0
class WordInterval(object):
    SILENCE_WORD = '#'

    def __init__(self, inf, sup, word):
        self.word = word
        self.interval = Interval(inf, sup)

    @property
    def is_silent(self):
        return self.word == WordInterval.SILENCE_WORD

    @property
    def inf(self):
        return self.interval.inf

    @property
    def sup(self):
        return self.interval.sup

    def intersect(self, another_interval):
        return self.interval.intersect(another_interval)

    def __eq__(self, other):
        return (self.interval == other.interval) and (self.word == other.word)

    def __str__(self):
        return "%s -> %s" % (self.interval, self.word)

    def __repr__(self):
        return self.__str__()
Esempio n. 8
0
class WordInterval(object):
    SILENCE_WORD = '#'

    def __init__(self, inf, sup, word):
        self.word = word
        self.interval = Interval(inf, sup)

    @property
    def is_silent(self):
        return self.word == WordInterval.SILENCE_WORD

    @property
    def inf(self):
        return self.interval.inf

    @property
    def sup(self):
        return self.interval.sup

    def intersect(self, another_interval):
        return self.interval.intersect(another_interval)


    def __eq__(self, other):
        return (self.interval == other.interval) and (self.word == other.word)

    def __str__(self):
        return "%s -> %s" % (self.interval, self.word)

    def __repr__(self):
        return self.__str__()
Esempio n. 9
0
def test_complement():
    assert Interval(0, 1).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, oo, True, True))
    assert Interval(0, 1, True, False).complement == \
           Union(Interval(-oo, 0, True, False), Interval(1, oo, True, True))
    assert Interval(0, 1, False, True).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, oo, False, True))
    assert Interval(0, 1, True, True).complement == \
           Union(Interval(-oo, 0, True, False), Interval(1, oo, False, True))

    assert -S.EmptySet == S.EmptySet.complement
    assert ~S.EmptySet == S.EmptySet.complement

    assert S.EmptySet.complement == Interval(-oo, oo)

    assert Union(Interval(0, 1), Interval(2, 3)).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, 2, True, True),
                 Interval(3, oo, True, True))

    assert FiniteSet(0).complement == Union(Interval(-oo,0, True,True) ,
            Interval(0,oo, True, True))

    assert (FiniteSet(5) + Interval(S.NegativeInfinity, 0)).complement == \
            Interval(0, 5, True, True) + Interval(5, S.Infinity, True,True)

    assert FiniteSet(1,2,3).complement == Interval(S.NegativeInfinity,1, True,True) + Interval(1,2, True,True) + Interval(2,3, True,True) + Interval(3,S.Infinity, True,True)

    X = Interval(1,3)+FiniteSet(5)
    assert X.intersect(X.complement) == S.EmptySet
Esempio n. 10
0
def test_union():
    assert Union(Interval(1, 2), Interval(2, 3)) == Interval(1, 3)
    assert Union(Interval(1, 2), Interval(2, 3, True)) == Interval(1, 3)
    assert Union(Interval(1, 3), Interval(2, 4)) == Interval(1, 4)
    assert Union(Interval(1, 2), Interval(1, 3)) == Interval(1, 3)
    assert Union(Interval(1, 3), Interval(1, 2)) == Interval(1, 3)
    assert Union(Interval(1, 3, False, True), Interval(1, 2)) == \
           Interval(1, 3, False, True)
    assert Union(Interval(1, 3), Interval(1, 2, False, True)) == Interval(1, 3)
    assert Union(Interval(1, 2, True), Interval(1, 3)) == Interval(1, 3)
    assert Union(Interval(1, 2, True), Interval(1, 3, True)) == Interval(1, 3, True)
    assert Union(Interval(1, 2, True), Interval(1, 3, True, True)) == \
           Interval(1, 3, True, True)
    assert Union(Interval(1, 2, True, True), Interval(1, 3, True)) == \
           Interval(1, 3, True)
    assert Union(Interval(1, 3), Interval(2, 3)) == Interval(1, 3)
    assert Union(Interval(1, 3, False, True), Interval(2, 3)) == \
           Interval(1, 3)
    assert Union(Interval(1, 2, False, True), Interval(2, 3, True)) != \
           Interval(1, 3)
    assert Union(Interval(1, 2), S.EmptySet) == Interval(1, 2)
    assert Union(S.EmptySet) == S.EmptySet

    assert Union(Interval(0,1), [FiniteSet(1.0/n) for n in range(1,10)]) == \
            Interval(0,1)

    assert Interval(1, 2).union(Interval(2, 3)) == \
           Interval(1, 2) + Interval(2, 3)

    assert Interval(1, 2).union(Interval(2, 3)) == Interval(1, 3)

    assert Union(Set()) == Set()

    assert FiniteSet(1) + FiniteSet(2) + FiniteSet(3) == FiniteSet(1,2,3)
    assert FiniteSet(['ham']) + FiniteSet(['eggs']) == FiniteSet('ham', 'eggs')
    assert FiniteSet(1,2,3) + S.EmptySet == FiniteSet(1,2,3)

    assert FiniteSet(1,2,3) & FiniteSet(2,3,4) == FiniteSet(2,3)
    assert FiniteSet(1,2,3) | FiniteSet(2,3,4) == FiniteSet(1,2,3,4)


    # Test that Intervals and FiniteSets play nicely
    assert Interval(1,3) + FiniteSet(2) == Interval(1,3)
    assert Interval(1,3, True,True) + FiniteSet(3) == Interval(1,3, True,False)
    X = Interval(1,3)+FiniteSet(5)
    Y = Interval(1,2)+FiniteSet(3)
    XandY = X.intersect(Y)
    assert 2 in X and 3 in X and 3 in XandY
    assert X.subset(XandY) and Y.subset(XandY)


    raises(TypeError, "Union(1, 2, 3)")
Esempio n. 11
0
def test_complement():
    assert Interval(0, 1).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, oo, True, True))
    assert Interval(0, 1, True, False).complement == \
           Union(Interval(-oo, 0, True, False), Interval(1, oo, True, True))
    assert Interval(0, 1, False, True).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, oo, False, True))
    assert Interval(0, 1, True, True).complement == \
           Union(Interval(-oo, 0, True, False), Interval(1, oo, False, True))

    assert -S.EmptySet == S.EmptySet.complement
    assert ~S.EmptySet == S.EmptySet.complement

    assert S.EmptySet.complement == S.UniversalSet
    assert S.UniversalSet.complement == S.EmptySet

    assert Union(Interval(0, 1), Interval(2, 3)).complement == \
           Union(Interval(-oo, 0, True, True), Interval(1, 2, True, True),
                 Interval(3, oo, True, True))

    assert FiniteSet(0).complement == Union(Interval(-oo, 0, True, True),
                                            Interval(0, oo, True, True))

    assert (FiniteSet(5) + Interval(S.NegativeInfinity, 0)).complement == \
            Interval(0, 5, True, True) + Interval(5, S.Infinity, True,True)

    assert FiniteSet(1, 2, 3).complement == Interval(
        S.NegativeInfinity, 1, True, True) + Interval(
            1, 2, True, True) + Interval(2, 3, True, True) + Interval(
                3, S.Infinity, True, True)

    X = Interval(1, 3) + FiniteSet(5)
    assert X.intersect(X.complement) == S.EmptySet

    square = Interval(0, 1) * Interval(0, 1)
    notsquare = square.complement

    assert all(pt in square for pt in [(0, 0), (.5, .5), (1, 0), (1, 1)])
    assert not any(pt in notsquare
                   for pt in [(0, 0), (.5, .5), (1, 0), (1, 1)])
    assert not any(pt in square for pt in [(-1, 0), (1.5, .5), (10, 10)])
    assert all(pt in notsquare for pt in [(-1, 0), (1.5, .5), (10, 10)])
Esempio n. 12
0
def test_complement():
    assert Interval(0, 1).complement == \
        Union(Interval(-oo, 0, True, True), Interval(1, oo, True, True))
    assert Interval(0, 1, True, False).complement == \
        Union(Interval(-oo, 0, True, False), Interval(1, oo, True, True))
    assert Interval(0, 1, False, True).complement == \
        Union(Interval(-oo, 0, True, True), Interval(1, oo, False, True))
    assert Interval(0, 1, True, True).complement == \
        Union(Interval(-oo, 0, True, False), Interval(1, oo, False, True))

    assert -S.EmptySet == S.EmptySet.complement
    assert ~S.EmptySet == S.EmptySet.complement

    assert S.EmptySet.complement == S.UniversalSet
    assert S.UniversalSet.complement == S.EmptySet

    assert Union(Interval(0, 1), Interval(2, 3)).complement == \
        Union(Interval(-oo, 0, True, True), Interval(1, 2, True, True),
              Interval(3, oo, True, True))

    assert FiniteSet(0).complement == Union(Interval(-oo, 0, True, True),
            Interval(0, oo, True, True))

    assert (FiniteSet(5) + Interval(S.NegativeInfinity, 0)).complement == \
        Interval(0, 5, True, True) + Interval(5, S.Infinity, True, True)

    assert FiniteSet(1, 2, 3).complement == \
        Interval(S.NegativeInfinity, 1, True, True) + Interval(1, 2, True, True) + \
        Interval(2, 3, True, True) + Interval(3, S.Infinity, True, True)

    X = Interval(1, 3) + FiniteSet(5)
    assert X.intersect(X.complement) == S.EmptySet

    square = Interval(0, 1) * Interval(0, 1)
    notsquare = square.complement

    assert all(pt in square for pt in [(0, 0), (.5, .5), (1, 0), (1, 1)])
    assert not any(
        pt in notsquare for pt in [(0, 0), (.5, .5), (1, 0), (1, 1)])
    assert not any(pt in square for pt in [(-1, 0), (1.5, .5), (10, 10)])
    assert all(pt in notsquare for pt in [(-1, 0), (1.5, .5), (10, 10)])
Esempio n. 13
0
def compare_single_doc(gs, ann):
    """
    Report on what proportion of a gold standard dataframe is
    covered by the given annotations.
    Assumes all annotations correspond to the same single document.

    Evaluation
    - exact - 0/1 an exact match on interval (1)
    - partial - 0/1 partially detected (counts as miss)

    False positives
    - proportion of string incorrectly labelled as PHI (excluding white space)
    - or proportion of labelled text beyond true labels
    """

    # short circuit comparisons if trivial cases
    if (gs.shape[0] == 0) | (ann.shape[0] == 0):
        # if both df are empty, we will output an empty dataframe
        performance = []
        if gs.shape[0] > 0:
            doc_id = gs['document_id'].values[0]
            # append gold standard rows as misses
            for i, row in gs.iterrows():
                span = '{} {}'.format(row['start'], row['stop'])
                performance.append(
                    [doc_id, row['annotation_id'], 0, 0, 1, span]
                )

        # if ann.shape[0] == 0:
        #     # append ann rows as false positives
        #     for i, row in ann.iterrows():
        #         span = '{} {}'.format(row['start'], row['stop'])
        #         performance.append(
        #             [doc_id, row['annotation_id'], 0, 0, 0, span])

        performance = pd.DataFrame.from_records(
            performance,
            columns=[
                'document_id', 'annotation_id', 'exact', 'partial', 'missed',
                'span'
            ]
        )
        return performance

    # performance is list of lists
    # index, document_id, exact, partial, missed, start, stop
    # if partial, start/stop denote the missed section
    # otherwise, they encompass the entire entity
    performance = list()
    if gs.shape[0] > 0:
        doc_id = gs['document_id'].values[0]
    else:
        doc_id = ann['document_id'].values[0]

    # create right-open intervals
    cmp_intervals = [
        [x[0], x[1], False, True] for x in ann.loc[:, ['start', 'stop']].values
    ]
    cmp_intervals = [Interval(*c) for c in cmp_intervals]
    cmp_intervals = Union(*cmp_intervals)

    for _, row in gs.iterrows():
        # indices are right open
        i = Interval(row['start'], row['stop'], False, True)
        overlap = i.intersect(cmp_intervals)
        mismatch = i - overlap
        # exact match
        if mismatch.is_EmptySet:
            span = '{} {}'.format(row['start'], row['stop'])
            performance.append([doc_id, row['annotation_id'], 1, 0, 0, span])
        # partial match
        else:
            # no match
            if mismatch == i:
                span = '{} {}'.format(row['start'], row['stop'])
                performance.append(
                    [doc_id, row['annotation_id'], 0, 0, 1, span]
                )
            else:
                if type(mismatch) is Union:
                    # we have non-continuous segments in our mismatch
                    span = []
                    for m in mismatch.args:
                        m = adjust_interval(m)
                        span.append('{} {}'.format(m.left, m.right))
                    # brat format: non-contiguous segments are delimited by ';'
                    span = ';'.join(span)
                else:
                    mismatch = adjust_interval(mismatch)
                    span = '{} {}'.format(mismatch.left, mismatch.right)

                performance.append(
                    [doc_id, row['annotation_id'], 0, 1, 0, span]
                )

    # convert back to a dataframe with same index as gs
    performance = pd.DataFrame.from_records(
        performance,
        columns=[
            'document_id', 'annotation_id', 'exact', 'partial', 'missed', 'span'
        ]
    )
    return performance
Esempio n. 14
0
def compare(goldstandard, comparison):
    """
    Report on what proportion of the gold standard corpus is
    covered by the given annotations.

    Evaluation
    - exact - 0/1 an exact match on interval (1)
    - partial - 0/1 partially detected (counts as miss)

    False positives
    - proportion of string incorrectly labelled as PHI (excluding white space)
    - or proportion of labelled text beyond true labels
    """

    group_names = ['document_id']
    cols = ['document_id', 'annotation_id', 'start', 'stop', 'entity']

    # stack gold standard and annotations into the same dataframe
    comparison['annotation_id'] = comparison['annotator']
    df = pd.concat(
        [goldstandard[cols], comparison[cols]], ignore_index=True, axis=0
    )

    # delineate gold standard from annotation
    df['source'] = 'gs'
    df.loc[goldstandard.shape[0]:, 'source'] = 'ann'

    # performance is list of lists
    # index, document_id, exact, partial, missed, start, stop
    # if partial, start/stop denote the missed section
    # otherwise, they encompass the entire entity
    performance = list()

    n_groups = df[group_names].drop_duplicates().shape[0]
    # iterate through each document
    for grp_idx, grp in tqdm(df.groupby(group_names), total=n_groups):
        idxG = grp['source'] == 'gs'
        # create right-open intervals
        cmp_intervals = [
            [x[0], x[1], False, True]
            for x in grp.loc[~idxG, ['start', 'stop']].values
        ]
        cmp_intervals = [Interval(*c) for c in cmp_intervals]
        cmp_intervals = Union(*cmp_intervals)

        for idx, row in grp.loc[idxG, :].iterrows():
            # indices are right open
            i = Interval(row['start'], row['stop'], False, True)
            overlap = i.intersect(cmp_intervals)
            mismatch = i - overlap
            # exact match
            if mismatch.is_EmptySet:
                span = '{} {}'.format(row['start'], row['stop'])
                performance.append(
                    [grp_idx, row['annotation_id'], 1, 0, 0, span]
                )
            # partial match
            else:
                # no match
                if mismatch == i:
                    span = '{} {}'.format(row['start'], row['stop'])
                    performance.append(
                        [grp_idx, row['annotation_id'], 0, 0, 1, span]
                    )
                else:
                    if type(mismatch) is Union:
                        # we have non-continuous segments in our mismatch
                        span = []
                        for m in mismatch.args:
                            m = adjust_interval(m)
                            span.append('{} {}'.format(m.left, m.right))
                        # brat format: non-contiguous segments are delimited by ';'
                        span = ';'.join(span)
                    else:
                        mismatch = adjust_interval(mismatch)
                        span = '{} {}'.format(mismatch.left, mismatch.right)

                    performance.append(
                        [grp_idx, row['annotation_id'], 0, 1, 0, span]
                    )

    # convert back to a dataframe with same index as gs
    performance = pd.DataFrame.from_records(
        performance,
        columns=[
            'document_id', 'annotation_id', 'exact', 'partial', 'missed', 'span'
        ]
    )
    return performance