Esempio n. 1
0
    def from_text(cls, text, base_offset=0, tokenization_re=None):
        """Alternate constructor: tokenize *text* and wrap the result.

        The text is split with ``sentence_to_tokens(text, tokenization_re)``.
        Whitespace-only pieces are not turned into tokens, but their length
        still advances the running position, so every kept token is created
        at ``base_offset`` plus the summed length of all pieces before it.

        Args:
            text: Raw text to tokenize.
            base_offset: Value added to every token position (default 0).
            tokenization_re: Passed through to ``sentence_to_tokens``.

        Returns:
            ``cls(text, base_offset, tokens)`` where ``tokens`` holds the
            ``Token.from_text(piece, position)`` results in order.
        """
        # NOTE(review): positions are only true character offsets if the
        # pieces returned by sentence_to_tokens concatenate back to `text`
        # -- confirm against that helper.
        position = base_offset
        kept_tokens = []
        for piece in sentence_to_tokens(text, tokenization_re):
            piece_is_blank = piece.isspace()
            if not piece_is_blank:
                kept_tokens.append(Token.from_text(piece, position))
            # Blank pieces are skipped above but still consume width.
            position += len(piece)

        return cls(text, base_offset, kept_tokens)
Esempio n. 2
0
def convert_documents(document, textbounds, relations_dict):
    """Convert a document into one token/entity/relation dict per sentence.

    Every sentence of ``document.sentences`` that has at least one token
    yields a dict with three keys:

    * ``tokens``    -- the ``text`` of each sentence token, in order;
    * ``entities``  -- ``dict(type, start, end)`` for each textbound placed in
      the sentence, where ``start`` is the index of the sentence token whose
      ``start`` equals the textbound's ``start`` and ``end`` is ``start`` plus
      the number of non-whitespace tokens in the textbound's own text;
    * ``relations`` -- ``dict(type, head, tail)`` where ``head`` and ``tail``
      are positions in this sentence's ``entities`` list.

    Args:
        document: Object exposing ``sentences``; each sentence exposes
            ``tokens`` (items with ``text``, ``start``, ``end``) and ``text``.
        textbounds: Iterable of objects with ``id``, ``type``, ``text``,
            ``start`` and ``end``.
        relations_dict: Mapping from an entity id to a
            ``(relation, arg1, arg2)`` tuple of relation type and entity ids.

    Returns:
        List of per-sentence dicts; sentences without tokens are omitted.

    Raises:
        Exception: A textbound overlaps the sentence span but no sentence
            token starts at the textbound's start offset.
        AssertionError: More than one sentence token starts at the
            textbound's start offset.
    """
    converted_documents = []
    for sentence in document.sentences:
        if not sentence.tokens:
            continue

        sent_start = sentence.tokens[0].start
        sent_end = sentence.tokens[-1].end
        # Use a distinct name: the original rebound the `document` parameter
        # here while still iterating over `document.sentences`.
        converted = dict(
            tokens=[t.text for t in sentence.tokens],
            entities=[],
            relations=[],
        )
        # Maps textbound id -> position in converted['entities'].
        entity_idx = {}

        for tb in textbounds:
            # Skip textbounds lying entirely before or after this sentence.
            # NOTE(review): `tb.start > sent_end` keeps a textbound that
            # starts exactly at `sent_end`; if token ends are exclusive this
            # should probably be `>=` -- confirm against Token.
            if tb.end <= sent_start or tb.start > sent_end:
                continue

            entity_tokens = [
                t for t in sentence_to_tokens(tb.text) if not t.isspace()
            ]
            idxs = [
                idx for idx, t in enumerate(sentence.tokens)
                if t.start == tb.start
            ]
            if not idxs:
                raise Exception("Not exact match! " + tb.text + " IN: " +
                                sentence.text)
            if len(idxs) != 1:
                # Raised explicitly so the check survives `python -O`.
                raise AssertionError("Something goes wrong. Ambigues choice!")

            first_token = idxs[0]
            # Only the first token is matched by offset; the entity length is
            # taken from tokenizing the textbound text itself.
            entity_idx[tb.id] = len(converted['entities'])
            converted['entities'].append(
                dict(type=tb.type,
                     start=first_token,
                     end=first_token + len(entity_tokens)))

        for entity_id in entity_idx:
            relation_entry = relations_dict.get(entity_id)
            if not relation_entry:
                continue

            relation, arg1, arg2 = relation_entry
            # Membership test, not truthiness: entity position 0 is a valid
            # index and must not be mistaken for "missing". Relations with an
            # endpoint outside this sentence are skipped.
            if arg1 not in entity_idx or arg2 not in entity_idx:
                continue

            converted['relations'].append(
                dict(type=relation,
                     head=entity_idx[arg1],
                     tail=entity_idx[arg2]))

        converted_documents.append(converted)

    return converted_documents