def from_text(cls, text, base_offset=0, tokenization_re=None):
    """Alternate constructor: tokenize *text* and build an instance.

    Each non-whitespace token becomes a ``Token`` carrying its absolute
    character offset (position within *text* plus *base_offset*).
    Whitespace tokens are skipped but still advance the offset, so the
    recorded positions stay aligned with the original string.

    Returns ``cls(text, base_offset, tokens)``.
    """
    tokens = []
    cursor = 0
    for piece in sentence_to_tokens(text, tokenization_re):
        if not piece.isspace():
            tokens.append(Token.from_text(piece, base_offset + cursor))
        # advance past every piece, whitespace included, to keep offsets exact
        cursor += len(piece)
    return cls(text, base_offset, tokens)
def convert_documents(document, textbounds, relations_dict):
    """Convert one annotated document into per-sentence dicts.

    For every sentence with tokens, emits a dict with keys:
      * ``tokens``    — list of token strings,
      * ``entities``  — dicts ``{type, start, end}`` in token indices
                        (end exclusive),
      * ``relations`` — dicts ``{type, head, tail}`` referring to indices
                        into ``entities``.

    Parameters
    ----------
    document : object with a ``sentences`` attribute; each sentence has
        ``tokens`` (with ``start``/``end``/``text``) and ``text``.
    textbounds : iterable of entity spans with ``id``, ``type``, ``start``,
        ``end`` and ``text`` attributes (character offsets).
    relations_dict : mapping from entity id to ``(relation, arg1, arg2)``.

    Raises
    ------
    Exception
        If an entity span cannot be matched to a token start in its sentence.
    """
    converted_documents = []
    for sentence in document.sentences:
        if not sentence.tokens:
            continue
        sent_start = sentence.tokens[0].start
        sent_end = sentence.tokens[-1].end
        # NOTE: do not shadow the `document` parameter — use a local name.
        converted = dict(tokens=[t.text for t in sentence.tokens],
                         entities=[], relations=[])
        entity_idx = {}
        for tb in textbounds:
            # Skip spans that lie entirely outside this sentence.
            if tb.end <= sent_start:
                continue
            if tb.start > sent_end:
                continue
            entity_tokens = [
                t for t in sentence_to_tokens(tb.text) if not t.isspace()
            ]
            idxs = [
                j for j, t in enumerate(sentence.tokens)
                if t.start == tb.start
            ]
            if len(idxs) == 0:
                raise Exception("Not exact match! " + tb.text +
                                " IN: " + sentence.text)
            assert len(idxs) == 1, "Something goes wrong. Ambigues choice!"
            s = idxs[0]
            entity_idx[tb.id] = len(converted['entities'])
            converted['entities'].append(
                dict(type=tb.type, start=s, end=s + len(entity_tokens)))
        for entity_id in entity_idx:
            rel = relations_dict.get(entity_id)
            if rel is None:
                continue
            (relation, arg1, arg2) = rel
            # Membership tests, not truthiness: entity index 0 is falsy,
            # and the original `not entity_idx.get(arg2)` silently dropped
            # relations whose tail was the sentence's first entity. Also
            # guard arg1, which previously could raise KeyError.
            if arg1 not in entity_idx or arg2 not in entity_idx:
                continue
            converted['relations'].append(
                dict(type=relation,
                     head=entity_idx[arg1],
                     tail=entity_idx[arg2]))
        converted_documents.append(converted)
    return converted_documents