def annotated_text(
    self,
    text: str,
    annotations: Collection[AnnotatedSpan],
    *,
    text_offsets: Optional[Span] = None,
) -> str:
    """
    Mark annotations on text in an HTML-like style.

    Each annotation becomes an HTML tag wrapping the text at the corresponding
    offsets; any attributes become HTML attributes.  This does not add any
    other HTML scaffolding (``head``, ``body``, etc.), so if desired the user
    should add them afterwards.

    If `text_offsets` is specified, the annotations are assumed to have
    offsets with respect to some larger string, where `text` is a substring
    of that string with offsets `text_offsets` relative to it.  You might use
    this, for example, to render a single paragraph from a document.
    """
    # Explicit None check: a zero-length Span is falsy, so the naive
    # `if not text_offsets` would silently replace a deliberately-empty
    # span with the full-text span.
    if text_offsets is None:
        text_offsets = Span.from_inclusive_to_exclusive(0, len(text))
    check_arg(
        len(text_offsets) == len(text),
        f"Text offsets length {len(text_offsets)} "
        f"does not match text length {len(text)}",
    )
    # We process the annotations to (a) ensure they all fit within the
    # requested snippet and (b) shift their offsets so that all offsets are
    # relative to the text being formatted.
    processed_annotations = self._clip_to_offsets_and_shift(annotations, text_offsets)

    ret = io.StringIO()
    last_uncopied_offset = 0
    for tag in self._tag_sequence(processed_annotations):
        if last_uncopied_offset < tag.offset:
            # Copy the plain text lying between the previous tag and this one.
            ret.write(text[last_uncopied_offset : tag.offset])
            last_uncopied_offset = tag.offset
        ret.write(tag.string)

    # Copy any trailing text after the last tag.  Offsets here are relative
    # to `text` itself (they were shifted above), so we slice to len(text);
    # `text_offsets.end` is relative to the larger source string and only
    # worked previously because it is always >= len(text).
    if last_uncopied_offset < len(text):
        ret.write(text[last_uncopied_offset:])
    return ret.getvalue()
def parse_text_from_source(text_justification_lookup: TextJustificationLookup, inf_just_pattern, inf_just_span): match = re.search(inf_just_pattern, inf_just_span) if match: # source = match.group(1) document = match.group(2) start = int(match.group(3)) end = int(match.group(4)) text_descriptor = TextDescriptor(doceid=document, span=Span.from_inclusive_to_exclusive(start, end + 1), language=None) try: lookup = text_justification_lookup.text_for_justification(text_descriptor, 50) return lookup.spanning_tokens, lookup.original_text except (RuntimeError, AttributeError): return 'None', 'None' else: return 'None', 'None'
# --- Compare mention annotations from several algorithms over one span ---

# Recover, from the wordpiece tokenization's metadata, the existing token
# theory it was built from and the multimap from those tokens to wordpieces.
metadata_from_wordpiece = doc.metadata_for(tokens_from_wordpiece)
tokens_from_spacy = metadata_from_wordpiece[
    WordPieceTokenizationAnnotator.EXISTING_TOKEN_THEORY_USED_FOR_WORDPIECE
]
map_spacy_to_wordpiece_indices = metadata_from_wordpiece[
    WordPieceTokenizationAnnotator.MULTIMAP_FROM_EXISTING_TO_WORDPIECE_TOKENIZATION
]

# Pull the mention sets produced by three different algorithms
# (APF/ACE ingest, CoreNLP name finding, spaCy) from the same document.
mentions_from_apf = doc_with_lots_of_mention_algos.mentions(algorithm(ApfIngester))
mentions_from_corenlp = doc_with_lots_of_mention_algos.mentions(
    algorithm(CoreNLPNameFinder)
)
mentions_from_spacy = doc_with_lots_of_mention_algos.mentions(
    algorithm(SpacyAnnotator)
)

# Character span of interest; print every mention from each algorithm that
# overlaps it.  NOTE(review): 2778/2915 look like hand-picked debug offsets
# for one particular document — confirm before reusing elsewhere.
s = Span.from_inclusive_to_exclusive(2778, 2915)
print("== ACE ==")
print(
    "\n".join(
        str(mention)
        for mention in get_items_overlapping_with_this_one(s, mentions_from_apf)
    )
)
print("== CoreNLP ==")
print(
    "\n".join(
        str(mention)
        for mention in get_items_overlapping_with_this_one(s, mentions_from_corenlp)
    )
)
print("== spaCy ==")