Code example #1
    def annotated_text(
        self,
        text: str,
        annotations: Collection[AnnotatedSpan],
        *,
        text_offsets: Optional[Span] = None,
    ) -> str:
        """
        Mark annotations on text in an HTML-like style.

        Each annotation becomes an HTML tag wrapping the text at the corresponding offsets.
        Any attributes will become HTML attributes.

        This does not add any other HTML annotations (`head`, `body`, etc.), so if desired the
        user should add them afterwards.

        If `text_offsets` is specified, the annotations are assumed to have offsets with respect
        to some larger string, where `text` is a substring of that string with offsets
        `text_offsets` relative to it.  You might use this, for example, to render a single
        paragraph from a document.
        """
        # Compare against None explicitly: a zero-length Span would be falsy under
        # `not`, and silently replacing it with the full-text span would be wrong.
        if text_offsets is None:
            text_offsets = Span.from_inclusive_to_exclusive(0, len(text))
        check_arg(
            len(text_offsets) == len(text),
            f"Text offsets length {len(text_offsets)} "
            f"does not match text length {len(text)}",
        )

        # we process the annotations to (a) ensure they all fit within the requested snippet
        # and (b) shift their offsets so that all offsets are relative to the text being
        # formatted
        processed_annotations = self._clip_to_offsets_and_shift(
            annotations, text_offsets)

        ret = io.StringIO()
        last_uncopied_offset = 0
        for tag in self._tag_sequence(processed_annotations):
            # Copy the raw text lying between the previous tag and this one.
            if last_uncopied_offset < tag.offset:
                ret.write(text[last_uncopied_offset:tag.offset])
                last_uncopied_offset = tag.offset

            ret.write(tag.string)

        # Copy any trailing text after the last tag.  Offsets here are relative to
        # `text` (they were shifted above), so the bound is len(text), not
        # text_offsets.end, which lives in the larger string's coordinate space.
        # (The original comparison against text_offsets.end only produced correct
        # output because Python slicing clamps out-of-range stop indices.)
        if last_uncopied_offset < len(text):
            ret.write(text[last_uncopied_offset:len(text)])
        return ret.getvalue()
Code example #2
def parse_text_from_source(text_justification_lookup: "TextJustificationLookup",
                           inf_just_pattern,
                           inf_just_span,
                           context_window: int = 50):
    """
    Look up the justification text described by a span string.

    `inf_just_pattern` is matched against `inf_just_span`; groups 2-4 of the
    match are expected to be the document id, the inclusive start offset, and
    the inclusive end offset (hence the `end + 1` below).

    Returns a pair `(spanning_tokens, original_text)` on success, or the
    sentinel pair `('None', 'None')` when the pattern does not match or the
    lookup raises.

    `context_window` is forwarded to `text_for_justification`; it defaults to
    the previously hard-coded value of 50, so existing callers are unaffected.
    """
    match = re.search(inf_just_pattern, inf_just_span)
    # Guard clause: a single sentinel return replaces the duplicated else branch.
    if not match:
        return 'None', 'None'

    # group(1) is the source id, which this function does not need.
    document = match.group(2)
    start = int(match.group(3))
    end = int(match.group(4))
    text_descriptor = TextDescriptor(doceid=document,
                                     # end offset in the span string is inclusive
                                     span=Span.from_inclusive_to_exclusive(start, end + 1),
                                     language=None)
    try:
        lookup = text_justification_lookup.text_for_justification(text_descriptor,
                                                                  context_window)
        return lookup.spanning_tokens, lookup.original_text
    except (RuntimeError, AttributeError):
        # Best-effort: lookup failures degrade to the sentinel pair rather than crash.
        return 'None', 'None'
Code example #3
    metadata_from_wordpiece = doc.metadata_for(tokens_from_wordpiece)
    tokens_from_spacy = metadata_from_wordpiece[
        WordPieceTokenizationAnnotator.EXISTING_TOKEN_THEORY_USED_FOR_WORDPIECE
    ]
    map_spacy_to_wordpiece_indices = metadata_from_wordpiece[
        WordPieceTokenizationAnnotator.MULTIMAP_FROM_EXISTING_TO_WORDPIECE_TOKENIZATION
    ]
    mentions_from_apf = doc_with_lots_of_mention_algos.mentions(algorithm(ApfIngester))
    mentions_from_corenlp = doc_with_lots_of_mention_algos.mentions(
        algorithm(CoreNLPNameFinder)
    )
    mentions_from_spacy = doc_with_lots_of_mention_algos.mentions(
        algorithm(SpacyAnnotator)
    )

    s = Span.from_inclusive_to_exclusive(2778, 2915)
    print("== ACE ==")
    print(
        "\n".join(
            str(mention)
            for mention in get_items_overlapping_with_this_one(s, mentions_from_apf)
        )
    )
    print("== CoreNLP ==")
    print(
        "\n".join(
            str(mention)
            for mention in get_items_overlapping_with_this_one(s, mentions_from_corenlp)
        )
    )
    print("== spaCy ==")