コード例 #1
0
ファイル: document.py プロジェクト: zy200459/standoff2conll
    def from_standoff(cls, text, annotations, sentence_split=True,
                      discont_rule=None, overlap_rule=None,
                      filter_types=None, exclude_types=None,
                      tokenization_re=None, document_id=None):
        """Build a Document from raw text plus standoff annotations.

        The text is first turned into a Document whose tokens all carry
        the "out" tag ("O"); the parsed textbounds are then applied to
        re-tag those tokens.
        """
        bounds = parse_textbounds(annotations, discont_rule)

        doc = cls.from_text(text, sentence_split, bounds,
                            tokenization_re=tokenization_re)

        if document_id is not None:
            doc.id = document_id

        # Narrow the annotation set before verification and tagging.
        if filter_types:
            bounds = filter_textbounds(bounds, filter_types)
        if exclude_types:
            bounds = filter_textbounds(bounds, exclude_types, exclude=True)

        verify_textbounds(bounds, text)
        bounds = eliminate_overlaps(bounds, overlap_rule)
        retag_document(doc, bounds)

        return doc
コード例 #2
0
    def from_standoff_to_spert(cls,
                               text,
                               annotations,
                               sentence_split=True,
                               discont_rule=None,
                               overlap_rule=None,
                               filter_types=None,
                               exclude_types=None,
                               tokenization_re=None,
                               document_id=None):
        """Return SpERT-style documents built from text and standoff annotations."""

        # TODO: remove the workaround of passing exclude_types as
        # exclude_relations here (translated from the original Russian note).
        bounds, relations = parse_textbounds(
            annotations, discont_rule, exclude_relations=exclude_types)

        doc = cls.from_text(text,
                            sentence_split,
                            bounds,
                            tokenization_re=tokenization_re)

        if document_id is not None:
            doc.id = document_id

        # Narrow the annotation set before verification.
        if filter_types:
            bounds = filter_textbounds(bounds, filter_types)
        if exclude_types:
            bounds = filter_textbounds(bounds, exclude_types, exclude=True)

        verify_textbounds(bounds, text)

        # Index each relation triple by its first argument.
        # NOTE(review): if several relations share the same arg1, only the
        # last one survives — behavior preserved from the original; confirm
        # this is intended against convert_documents.
        relation_dict = {}
        for relation, arg1, arg2 in relations:
            relation_dict[arg1] = (relation, arg1, arg2)

        return convert_documents(doc, bounds, relation_dict)