Example #1
def take_stats(txt, ann, fn, stats, options):
    """Collect annotation statistics for one .ann document into stats."""
    annotations = []
    for ln, line in enumerate(ann.splitlines(), start=1):
        if not line or line.isspace() or line[0] not in 'TN':
            info('skipping line {} in {}: {}'.format(ln, fn, line))
            continue
        if line[0] == 'T':
            id_, type_span, text = line.split('\t')
            type_, span = type_span.split(' ', 1)
            stats[ENTITY_TYPE][type_] += 1
            stats[ENTITY_TEXT][text] += 1
            stats[TEXT_BY_TYPE.format(type_)][text] += 1
            stats[TOTALS]['textbounds'] += 1
            if len(span.split(';')) > 1:
                stats[FRAGMENTED_SPAN][type_] += 1
            annotations.append(Textbound(id_, type_, span, text))
        elif line[0] == 'N':
            id_, type_rid_tid, text = line.split('\t')
            type_, rid, tid = type_rid_tid.split(' ')
            if (tid.startswith(TAXONOMY_PREFIX)
                    and options.taxdata is not None):
                tax_id = tid[len(TAXONOMY_PREFIX):]
                rank = options.taxdata.get_rank(tax_id)
                if rank == '<UNKNOWN>':
                    stats[TAXONOMY_UNKNOWN][tax_id] += 1
                division = options.taxdata.get_division(tax_id)
                stats[TAXONOMY_RANK][rank] += 1
                stats[TAXONOMY_DIV][division] += 1
                stats[TAXONOMY_RANK_DIV]['/'.join([rank, division])] += 1
                stats[TEXT_BY_RANK.format(rank)][text] += 1
            stats[TOTALS]['normalizations'] += 1
        else:
            assert False, 'internal error'
    stats[TOTALS]['documents'] += 1

    is_consistent = True
    overlapping = find_overlapping(annotations)
    for t1, t2 in overlapping:
        sorted_types = '{}-{}'.format(*sorted([t1.type, t2.type]))
        if t1.span_matches(t2):
            if t1.type != t2.type:
                # same span but different types is treated as inconsistent
                is_consistent = False
            stats[SAME_SPAN][sorted_types] += 1
            stats[SAME_SPAN_TEXT][t1.text] += 1
        elif t1.contains(t2):
            stats[CONTAINMENT]['{} in {}'.format(t2.type, t1.type)] += 1
            stats[CONTAINMENT_TEXT]['{} in {}'.format(t2.text, t1.text)] += 1
        elif t2.contains(t1):
            stats[CONTAINMENT]['{} in {}'.format(t1.type, t2.type)] += 1
            stats[CONTAINMENT_TEXT]['{} in {}'.format(t1.text, t2.text)] += 1
        elif t1.span_crosses(t2):
            is_consistent = False
            stats[CROSSING_SPAN]['{}/{}'.format(t1.type, t2.type)] += 1
            stats[CROSSING_SPAN_TEXT]['{}/{}'.format(t1.text, t2.text)] += 1
        else:
            assert False, 'internal error'
    if is_consistent:
        stats[CONSISTENCY]['consistent'] += 1
    else:
        stats[CONSISTENCY]['inconsistent'] += 1
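
A minimal driver sketch for the function above, assuming stats is a two-level counter mapping (e.g. defaultdict(Counter) keyed by section constants such as TOTALS) and that each document is stored as a .txt/.ann file pair; the directory layout and the collect_stats name are hypothetical:

import os
from collections import Counter, defaultdict

def collect_stats(ann_dir, options):
    # Assumed layout: stats[SECTION][key] -> count, as used by take_stats above.
    stats = defaultdict(Counter)
    for fn in sorted(os.listdir(ann_dir)):
        if not fn.endswith('.ann'):
            continue
        with open(os.path.join(ann_dir, fn)) as f:
            ann = f.read()
        with open(os.path.join(ann_dir, fn[:-4] + '.txt')) as f:
            txt = f.read()
        take_stats(txt, ann, fn, stats, options)
    return stats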
Example #2
    def standoffs(self, index):
        """Return sentence annotations as list of Standoff objects."""

        textbounds = []
        for type_, start, end in self.get_tagged():
            tstart, tend = start-self.base_offset, end-self.base_offset
            textbounds.append(Textbound('T%d' % index, type_, start, end,
                                        self.text[tstart:tend]))
            index += 1
        return textbounds
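
Note that the offsets written into each Textbound stay document-level (start, end), while the covered text is sliced with sentence-local coordinates (tstart, tend). A small sketch of that arithmetic, assuming a five-field Textbound tuple and a sentence starting at document offset 100 (both made up for illustration):

from collections import namedtuple

Textbound = namedtuple('Textbound', 'id type start end text')  # assumed shape

text = 'Cats purr.'    # sentence text only
base_offset = 100      # document offset of the sentence start
start, end = 100, 104  # document-level offsets of the tagged span
tstart, tend = start - base_offset, end - base_offset
print(Textbound('T1', 'Animal', start, end, text[tstart:tend]))
# Textbound(id='T1', type='Animal', start=100, end=104, text='Cats')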
Example #3
def make_textbound(type_, span_str, text):
    """Create a Textbound for span_str, collapsing a fragmented span to its full extent."""
    id_ = generate_id('T')
    spans = []
    for span in span_str.split(';'):
        start, end = (int(i) for i in span.split())
        spans.append((start, end))
    min_start = min(s[0] for s in spans)
    max_end = max(s[1] for s in spans)
    if len(spans) > 1:
        warning('replacing fragmented span {} with {} {}'.format(
            span_str, min_start, max_end))
    return Textbound(id_, type_, min_start, max_end, text)
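
The generate_id helper is assumed here; a minimal sketch of one possible implementation, issuing sequential IDs per prefix (the real one may differ):

from collections import defaultdict
from itertools import count

_id_counters = defaultdict(lambda: count(1))

def generate_id(prefix):
    # 'T' -> 'T1', 'T2', ...; independent counters per prefix.
    return '{}{}'.format(prefix, next(_id_counters[prefix]))

With this in place, a fragmented span such as '10 15;20 25' logs a warning and yields a single Textbound covering offsets 10-25.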
Example #4
from collections import defaultdict
from itertools import count

def mentions_to_standoffs(mentions, options):
    standoffs = []
    # Mentions with identical span and type map to one textbound with
    # multiple normalizations.
    grouped = defaultdict(list)
    for m in mentions:
        grouped[(m.start, m.end, m.typename, m.text)].append(m)
    t_idx, n_idx = count(1), count(1)
    for (start, end, type_, text), group in sorted(grouped.items()):
        t_id = 'T{}'.format(next(t_idx))
        standoffs.append(Textbound(t_id, type_, start, end, text))
        for m in group:
            n_id = 'N{}'.format(next(n_idx))
            n_name = get_norm_name(m.serial, m.text, options)
            # if we have a species name, add it to the norm text
            if m.species:
                n_name = n_name + ' ({})'.format(m.species)
            norm_id = get_norm_id(m.serial, 'TAGGER:{}'.format(m.serial),
                                  options)
            norm_id = rewrite_norm_id(norm_id, type_, m.species)
            standoffs.append(Normalization(n_id, t_id, norm_id, n_name))
    return standoffs
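
To see the grouping step in isolation: mentions with identical span, type, and text collapse to a single key, so they share one textbound and each contributes one normalization. A toy sketch with a hypothetical Mention tuple (the normalization helpers are omitted):

from collections import defaultdict, namedtuple

Mention = namedtuple('Mention', 'start end typename text serial species')  # hypothetical shape

mentions = [
    Mention(0, 12, 'Species', 'Homo sapiens', 101, None),
    Mention(0, 12, 'Species', 'Homo sapiens', 102, None),
]
grouped = defaultdict(list)
for m in mentions:
    grouped[(m.start, m.end, m.typename, m.text)].append(m)
# One key, two mentions: one 'T' line with two 'N' lines in the output.
print({k: len(v) for k, v in grouped.items()})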