def ner(xml, start_id=1):
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)

    # Stanford only has Inside and Outside tags, so conversion is easy
    nes = []
    last_ne_tok = None
    prev_tok = None
    for _, _, tok in _tok_it(token_by_ids):
        if tok.ner != 'O':
            if last_ne_tok is None:
                # Start of an NE from nothing
                last_ne_tok = tok
            elif tok.ner != last_ne_tok.ner:
                # Change in NE type
                nes.append((last_ne_tok.start, prev_tok.end, last_ne_tok.ner, ))
                last_ne_tok = tok
            else:
                # Continuation of the last NE, move along
                pass
        elif last_ne_tok is not None:
            # NE ended
            nes.append((last_ne_tok.start, prev_tok.end, last_ne_tok.ner, ))
            last_ne_tok = None
        prev_tok = tok
    else:
        # Do we need to terminate the last named entity?
        if last_ne_tok is not None:
            nes.append((last_ne_tok.start, prev_tok.end, last_ne_tok.ner, ))

    curr_id = start_id
    for start, end, _type in nes:
        yield TextBoundAnnotation(((start, end), ), 'T%s' % curr_id, _type, '')
        curr_id += 1
def _pos(xml, start_id=1):
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)

    curr_id = start_id
    for s_id, t_id, tok in _tok_it(token_by_ids):
        yield s_id, t_id, TextBoundAnnotation(((tok.start, tok.end, ), ),
                'T%s' % curr_id, tok.pos, '')
        curr_id += 1
def coref(xml, start_id=1):
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)
    
    docs_e = soup.findall('document')
    assert len(docs_e) == 1
    docs_e = docs_e[0]
    # Despite the name, this element contains conferences (note the "s")
    corefs_e = docs_e.findall('coreference')
    if not corefs_e:
        # No coreferences to process
        raise StopIteration
    assert len(corefs_e) == 1
    corefs_e = corefs_e[0]

    curr_id = start_id
    for coref_e in corefs_e:
        if corefs_e.tag != 'coreference':
            # To be on the safe side
            continue

        # This tag is now a full corference chain
        chain = []
        for mention_e in coref_e.getiterator('mention'):
            # Note: There is a "representative" attribute signalling the most
            #   "suitable" mention, we are currently not using this
            # Note: We don't use the head information for each mention
            sentence_id = int(mention_e.find('sentence').text)
            start_tok_id = int(mention_e.find('start').text)
            end_tok_id = int(mention_e.find('end').text) - 1

            mention_id = 'T%s' % (curr_id, )
            chain.append(mention_id)
            curr_id += 1
            yield TextBoundAnnotation(
                    ((token_by_ids[sentence_id][start_tok_id].start,
                    token_by_ids[sentence_id][end_tok_id].end), ),
                    mention_id, 'Mention', '')

        yield EquivAnnotation('Coreference', chain, '')