def ner(xml, start_id=1):
    """Yield one TextBoundAnnotation per named entity found in `xml`.

    Consecutive tokens that carry the same non-'O' `ner` value are merged
    into a single entity stretching from the `start` of the first token to
    the `end` of the last one.

    Arguments:
    xml -- input handed to _soup() for parsing
    start_id -- number used for the first 'T<n>' annotation id (default 1);
        each further entity gets the next number
    """
    tokens = _token_by_ids(_soup(xml))

    # Stanford tags are plain inside/outside ('O'), so an entity is simply a
    # maximal run of equally tagged non-'O' tokens.
    spans = []
    run_head = None   # first token of the entity being collected, if any
    previous = None   # token handled in the previous iteration
    for _s_id, _t_id, token in _tok_it(tokens):
        label = token.ner
        inside = label != 'O'

        # Close the open entity when we hit an 'O' token or a new NE type;
        # it ends at the token seen just before this one.
        if run_head is not None and (not inside or label != run_head.ner):
            spans.append((run_head.start, previous.end, run_head.ner))
            run_head = None

        # Open a new entity, either from nothing or right after a closed one
        if run_head is None and inside:
            run_head = token

        previous = token

    # An entity reaching the very last token still has to be emitted
    if run_head is not None:
        spans.append((run_head.start, previous.end, run_head.ner))

    next_num = start_id
    for begin, finish, label in spans:
        yield TextBoundAnnotation(((begin, finish), ), 'T%s' % next_num,
                label, '')
        next_num += 1
def _pos(xml, start_id=1):
    """Yield a (s_id, t_id, annotation) triple for every token in `xml`.

    The first two items are passed through unchanged from _tok_it()
    (presumably sentence and token ids). The annotation is a
    TextBoundAnnotation covering the token's `start`..`end` span, typed with
    the token's `pos` attribute.

    Arguments:
    xml -- input handed to _soup() for parsing
    start_id -- number used for the first 'T<n>' annotation id (default 1);
        each further token gets the next number
    """
    tokens = _token_by_ids(_soup(xml))

    next_num = start_id
    for sentence_id, token_id, token in _tok_it(tokens):
        span = ((token.start, token.end, ), )
        annotation = TextBoundAnnotation(span, 'T%s' % next_num, token.pos,
                '')
        yield sentence_id, token_id, annotation
        next_num += 1
def coref(xml, start_id=1):
    """Yield mention and coreference-chain annotations parsed from `xml`.

    For every chain found under the document's outer <coreference> element,
    one 'Mention' TextBoundAnnotation is yielded per <mention>, followed by
    a single 'Coreference' EquivAnnotation listing the ids of that chain's
    mentions.

    Arguments:
    xml -- input handed to _soup() for parsing
    start_id -- number used for the first 'T<n>' mention id (default 1);
        later mentions count upwards from it, across all chains

    Nothing is yielded when the document has no <coreference> element.
    The function asserts that there is exactly one <document> element and
    at most one outer <coreference> element.
    """
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)

    docs_e = soup.findall('document')
    assert len(docs_e) == 1
    docs_e = docs_e[0]
    # Despite the name, this element contains coreferences (note the "s")
    corefs_e = docs_e.findall('coreference')
    if not corefs_e:
        # No coreferences to process. A plain return ends the generator;
        # raising StopIteration here turns into a RuntimeError under
        # PEP 479 (Python 3.7+).
        return
    assert len(corefs_e) == 1
    corefs_e = corefs_e[0]

    curr_id = start_id
    for coref_e in corefs_e:
        # To be on the safe side, skip children that are not chains. The
        # check must look at the child; the container's own tag is always
        # 'coreference' since that is how it was found.
        if coref_e.tag != 'coreference':
            continue

        # This tag is now a full coreference chain
        chain = []
        # Element.getiterator() was removed in Python 3.9; fall back to it
        # only for ElementTree versions that predate Element.iter().
        try:
            mentions = coref_e.iter('mention')
        except AttributeError:
            mentions = coref_e.getiterator('mention')

        for mention_e in mentions:
            # Note: There is a "representative" attribute signalling the most
            #   "suitable" mention, we are currently not using this
            # Note: We don't use the head information for each mention
            sentence_id = int(mention_e.find('sentence').text)
            start_tok_id = int(mention_e.find('start').text)
            # Step back one so that we address the last token of the mention
            # (presumably the XML end index is exclusive)
            end_tok_id = int(mention_e.find('end').text) - 1

            mention_id = 'T%s' % (curr_id, )
            chain.append(mention_id)
            curr_id += 1

            sentence_toks = token_by_ids[sentence_id]
            yield TextBoundAnnotation(
                    ((sentence_toks[start_tok_id].start,
                    sentence_toks[end_tok_id].end), ),
                    mention_id, 'Mention', '')

        yield EquivAnnotation('Coreference', chain, '')