def test_format(self):
    # we show how we can format a snippet (below) when the
    # annotations are relative to a larger enclosing text
    original_text = (
        "Eliot: Time present and time past / "
        "Are both perhaps present in time future..."
    )
    snippet_text = (
        "Time present and time past / "
        "Are both perhaps present in time future..."
    )
    annotations = [
        AnnotatedSpan("SIR_NOT_APPEARING_IN_THIS_SNIPPET", Span(1, 5)),
        AnnotatedSpan("LINE", Span(7, 33)),
        AnnotatedSpan("BAR", Span(8, 9)),
        AnnotatedSpan("LINE", Span(36, 75)),
        AnnotatedSpan("PP", Span(61, 75)),
        AnnotatedSpan("ELLIPSES", Span(75, 78)),
        AnnotatedSpan("FOO", Span(7, 19)),
        AnnotatedSpan("SINGLE_CHAR", Span(61, 62)),
    ]
    expected_result = (
        "<LINE><FOO>T<BAR>i</BAR>me present</FOO> and time past</LINE> "
        "/ <LINE>Are both perhaps present <PP><SINGLE_CHAR>i</SINGLE_CHAR>n "
        "time future</PP></LINE><ELLIPSES>...</ELLIPSES>"
    )
    self.assertEqual(
        expected_result,
        HTMLStyleAnnotationFormatter().annotated_text(
            snippet_text, annotations, text_offsets=Span(7, len(original_text))
        ),
    )
def get_sentence_spans(document_text: str) -> List[Span]:
    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
    return [
        Span(*s)
        for s in tokenizer.span_tokenize(document_text, realign_boundaries=True)
    ]
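
# A minimal usage sketch of get_sentence_spans. The sample string is illustrative
# only, and the sketch assumes the same imports (nltk, Span) as the function above.
text = "Time present and time past. Are both perhaps present in time future."
for sentence_span in get_sentence_spans(text):
    # Each Span holds character offsets into the original text, with an exclusive end.
    print(sentence_span, repr(text[sentence_span.start:sentence_span.end]))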
def contexts_from_justifications(
    justifications: ImmutableDict[str, Span], document
) -> ImmutableDict[str, Span]:
    document_text = document["fulltext"]
    sentence_spans = get_sentence_spans(document_text)
    contexts: Dict[str, Span] = {}
    for justification_id, justification_span in justifications.items():
        for s_span in sentence_spans:
            if s_span.contains_span(justification_span):
                # the sentence tokenizer doesn't recognize double newline as a potential
                # sentence boundary, so we split on double newlines and return the parts
                # of the pre/post context that are closest to the mention
                precontext_lines = document_text[
                    s_span.start : justification_span.start
                ].split("\n\n")
                precontext_extra = (
                    "\n\n".join(precontext_lines[:-1])
                    if len(precontext_lines) > 1
                    else ""
                )
                postcontext_lines = document_text[
                    justification_span.end : s_span.end
                ].split("\n\n")
                postcontext_extra = (
                    "\n\n".join(postcontext_lines[1:])
                    if len(postcontext_lines) > 1
                    else ""
                )
                modified_sentence_start = s_span.start + len(precontext_extra)
                modified_sentence_end = s_span.end - len(postcontext_extra)
                assert (
                    justification_id not in contexts
                ), "justification should not be overlapping with more than one sentence"
                contexts[justification_id] = Span(
                    modified_sentence_start, modified_sentence_end
                )
    return immutabledict(contexts)
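
# A hypothetical usage sketch of contexts_from_justifications. The single-key dict
# below is a stand-in for whatever document record the real pipeline supplies;
# only its "fulltext" field is read here.
doc = {
    "fulltext": "Header line\n\nThe quick brown fox jumps. It was seen yesterday."
}
full_text = doc["fulltext"]
justifications = immutabledict(
    # Build the justification span from the text itself so the offsets stay correct.
    {"j1": Span(full_text.index("brown"), full_text.index("fox") + len("fox"))}
)
contexts = contexts_from_justifications(justifications, doc)
# contexts["j1"] covers the sentence containing "brown fox", with the text that is
# separated from it by a double newline trimmed away.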
def _render_html(
    corpus: Corpus, output_dir: Path, parent_or_child_id: str, start: int, end: int
) -> Tuple[Path, str]:
    """Outputs either the whole document rendered in HTML or a subspan.

    `end` is inclusive."""
    document = _get_document(corpus, parent_or_child_id)
    if not document:
        # Use the requested ID here: if no document was found, we cannot index into it.
        raise ValueError(
            f"{parent_or_child_id} not present in the document database."
        )
    justification_spans: ImmutableDict[str, Span] = immutabledict(
        {f"{start}:{end}": Span(start, end + 1)}
    )
    contexts = contexts_from_justifications(justification_spans, document)
    to_render, _ = render_document(document["fulltext"], justification_spans, contexts)
    if not to_render:
        raise ValueError("Could not find anything to render.")
    final_html = _render_template(
        document=immutabledict(
            {
                "id": document["parent_id"],
                "title": document["title"],
                "html": to_render,
                "span": f"{start}:{end}",
            }
        )
    )
    output_file = output_dir / f"{document['parent_id']}_{start}-{end}.html"
    output_file.write_text(final_html)
    return output_file, document["fulltext"][start : end + 1]
def match_against_tokens(
    self,
    token_sequence_to_match_against: Tuple[str, ...],
    *,
    slots_to_filler_spans: Mapping[SyntaxSemanticsVariable, Span],
) -> Optional[Span]:
    """
    Gets the token indices, if any, for the first match of this template against
    *token_sequence_to_match_against*, assuming any slots are filled by the tokens
    given by *slots_to_filler_spans*.
    """
    # First, we turn the template into a token sequence to search for
    # by filling in all the slots from the provided token span mapping.
    tokens_to_match = []
    for element in self.elements:
        if isinstance(element, str):
            tokens_to_match.append(element)
        else:
            slot_filler_span = slots_to_filler_spans.get(element)
            if slot_filler_span:
                # endpoints are exclusive
                tokens_to_match.extend(
                    token_sequence_to_match_against[
                        slot_filler_span.start : slot_filler_span.end
                    ]
                )
            else:
                raise RuntimeError(
                    f"Template contained variable {element}, "
                    f"but it was not found in the mapping of slots to spans: "
                    f"{slots_to_filler_spans}"
                )
    # Now we need to check if the tokens to match occur in the given token sequence to
    # match against. We don't expect these sequences to be long, so an inefficient
    # solution is okay.
    if not tokens_to_match:
        raise RuntimeError("Don't know how to match an empty token sequence")
    next_idx_to_search_from = 0
    while next_idx_to_search_from < len(token_sequence_to_match_against):
        try:
            index_of_first_token = token_sequence_to_match_against.index(
                tokens_to_match[0], next_idx_to_search_from
            )
            candidate_match_exclusive_end = index_of_first_token + len(tokens_to_match)
            if candidate_match_exclusive_end <= len(token_sequence_to_match_against):
                if tokens_to_match == list(
                    token_sequence_to_match_against[
                        index_of_first_token:candidate_match_exclusive_end
                    ]
                ):
                    # span endpoints are exclusive
                    return Span(index_of_first_token, candidate_match_exclusive_end)
            # False alarm - the first token matched, but not the whole sequence.
            next_idx_to_search_from = index_of_first_token + 1
        except ValueError:
            # If we can't even find the first token of what we are searching for,
            # we definitely have no match.
            return None
    # We got all the way to the end without finding a match
    return None
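
# A hypothetical usage sketch of match_against_tokens, assuming it is defined on
# SurfaceTemplate as the surrounding snippets suggest. The constructor arguments
# mirror those used elsewhere in these snippets; SLOT1 and LanguageMode.ENGLISH are
# assumed importable from the same package. This is not taken from the library's
# own tests.
template = SurfaceTemplate(
    elements=["take", SLOT1],
    determiner_prefix_slots=[],
    language_mode=LanguageMode.ENGLISH,
)
tokens = ("you", "take", "ball")
# SLOT1 is filled by the token span covering "ball" (exclusive end).
match = template.match_against_tokens(
    tokens, slots_to_filler_spans={SLOT1: Span(2, 3)}
)
# The first match is returned as a token-index Span: here Span(1, 3),
# covering "take ball".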
def _sort_mapping_by_token_spans(pairs) -> ImmutableDict[ObjectSemanticNode, Span]:
    # we type: ignore because the proper typing of pairs is huge and mypy is going to
    # screw it up anyway.
    unsorted = immutabledict(pairs)  # type: ignore
    return immutabledict(
        (matched_node, token_span)
        for (matched_node, token_span) in sorted(
            unsorted.items(),
            key=lambda item: Span.earliest_then_longest_first_key(item[1]),
        )
    )
def annotated_text(
    self,
    text: str,
    annotations: Collection[AnnotatedSpan],
    *,
    text_offsets: Optional[Span] = None,
) -> str:
    """
    Mark annotations on text in an HTML-like style.

    Each annotation will become an HTML tag wrapping the text at the corresponding
    offsets. Any attributes will become HTML attributes.

    This does not add any other HTML annotations (`head`, `body`, etc.), so if desired
    the user should add them afterwards.

    If `text_offsets` is specified, the annotations are assumed to have offsets with
    respect to some larger string, where `text` is a substring of that string with
    offsets `text_offsets` relative to it. You might use this, for example, to render
    a single paragraph from a document.
    """
    if not text_offsets:
        text_offsets = Span.from_inclusive_to_exclusive(0, len(text))
    check_arg(
        len(text_offsets) == len(text),
        f"Text offsets length {len(text_offsets)} "
        f"does not match text length {len(text)}",
    )
    # we process the annotations to (a) ensure they all fit within the requested snippet
    # and (b) shift their offsets so that all offsets are relative to the text being
    # formatted
    processed_annotations = self._clip_to_offsets_and_shift(annotations, text_offsets)

    ret = io.StringIO()
    last_uncopied_offset = 0
    for tag in self._tag_sequence(processed_annotations):
        if last_uncopied_offset < tag.offset:
            ret.write(text[last_uncopied_offset : tag.offset])
            last_uncopied_offset = tag.offset
        ret.write(tag.string)

    # get any trailing text after the last tag
    if last_uncopied_offset < text_offsets.end:
        ret.write(text[last_uncopied_offset : text_offsets.end])
    return ret.getvalue()
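
# A minimal usage sketch of the formatter without text_offsets, so the annotation
# offsets are relative to the string being formatted. It reuses only the names
# already seen in these snippets (HTMLStyleAnnotationFormatter, AnnotatedSpan, Span).
formatter = HTMLStyleAnnotationFormatter()
marked_up = formatter.annotated_text(
    "time present and time past",
    [AnnotatedSpan("NP", Span(0, 12))],
)
# Expected output (roughly): "<NP>time present</NP> and time past"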
def test_intersection(self) -> None:
    s1, s2, s3, s4 = (Span(0, 3), Span(2, 25), Span(25, 30), Span(10, 20))
    s1_s2_intersection = Span(2, 3)
    s2_s4_intersection = Span(10, 20)
    self.assertIsNone(s1.intersection(s3))
    self.assertIsNone(s2.intersection(s3))
    self.assertEqual(s1.intersection(s2), s1_s2_intersection)
    self.assertEqual(s2.intersection(s4), s2_s4_intersection)
    self.assertEqual(s4.intersection(s2), s2_s4_intersection)
def parse_text_from_source(
    text_justification_lookup: TextJustificationLookup, inf_just_pattern, inf_just_span
):
    match = re.search(inf_just_pattern, inf_just_span)
    if match:
        # source = match.group(1)
        document = match.group(2)
        start = int(match.group(3))
        end = int(match.group(4))
        text_descriptor = TextDescriptor(
            doceid=document,
            span=Span.from_inclusive_to_exclusive(start, end + 1),
            language=None,
        )
        try:
            lookup = text_justification_lookup.text_for_justification(
                text_descriptor, 50
            )
            return lookup.spanning_tokens, lookup.original_text
        except (RuntimeError, AttributeError):
            return 'None', 'None'
    else:
        return 'None', 'None'
def render_single_justification_document(
    document: dict, justification: Justification
) -> str:
    span_start = justification.span_start
    span_end = justification.span_end
    # Check explicitly for None so that a legitimate span_start of 0 is not rejected.
    if span_start is None or span_end is None:
        raise ValueError(
            "Justification to render must have values for span_start and span_end."
        )
    justification_spans: ImmutableDict[str, Span] = immutabledict(
        {f"{span_start}:{span_end}": Span(span_start, span_end + 1)}
    )
    contexts = contexts_from_justifications(justification_spans, document)
    to_render, _ = render_document(document["fulltext"], justification_spans, contexts)
    if not to_render:
        raise ValueError("Could not find anything to render.")
    return to_render
def is_legal_template_span(
    candidate_token_span: Span, *, invalid_token_spans: ImmutableSet[Span]
) -> bool:
    # A template token span can't exceed the bounds of the utterance
    if candidate_token_span.start < 0:
        return False
    if candidate_token_span.end > len(sentence_tokens):
        return False
    # or be bigger than our maximum template size...
    if len(candidate_token_span) > max_length:
        return False
    # or contain a span we already know to be invalid...
    for span in invalid_token_spans:
        if candidate_token_span.contains_span(span):
            return False
    # or we have already aligned any of the tokens in between the objects
    # to some other meaning.
    for token_index in range(candidate_token_span.start, candidate_token_span.end):
        if language_concept_alignment.token_index_is_aligned(token_index):
            return False
    return True
metadata_from_wordpiece = doc.metadata_for(tokens_from_wordpiece)
tokens_from_spacy = metadata_from_wordpiece[
    WordPieceTokenizationAnnotator.EXISTING_TOKEN_THEORY_USED_FOR_WORDPIECE
]
map_spacy_to_wordpiece_indices = metadata_from_wordpiece[
    WordPieceTokenizationAnnotator.MULTIMAP_FROM_EXISTING_TO_WORDPIECE_TOKENIZATION
]
mentions_from_apf = doc_with_lots_of_mention_algos.mentions(algorithm(ApfIngester))
mentions_from_corenlp = doc_with_lots_of_mention_algos.mentions(
    algorithm(CoreNLPNameFinder)
)
mentions_from_spacy = doc_with_lots_of_mention_algos.mentions(
    algorithm(SpacyAnnotator)
)
s = Span.from_inclusive_to_exclusive(2778, 2915)
print("== ACE ==")
print(
    "\n".join(
        str(mention)
        for mention in get_items_overlapping_with_this_one(s, mentions_from_apf)
    )
)
print("== CoreNLP ==")
print(
    "\n".join(
        str(mention)
        for mention in get_items_overlapping_with_this_one(s, mentions_from_corenlp)
    )
)
print("== spaCy ==")
def process_aligned_objects_with_template(
    candidate_template: Tuple[AlignmentSlots, ...],
    aligned_nodes: Tuple[SemanticNodeWithSpan, ...],
    *,
    invalid_token_spans: ImmutableSet[Span],
) -> Iterable[Optional[SurfaceTemplateBoundToSemanticNodes]]:
    aligned_node_index = 0
    template_elements: List[Union[str, SyntaxSemanticsVariable]] = []
    slot_to_semantic_node: List[Tuple[SyntaxSemanticsVariable, SemanticNode]] = []

    # We need to handle fixed strings that are a prefix or postfix to the rest of the
    # sentence differently, as they don't have a fixed length, so we could generate
    # multiple options.
    prefix_string_end = None
    postfix_string_start = None
    # In the event we generate a candidate template like A, F, F, A,
    # then we want to compute this like A, F, A,
    # so we keep track of whether the previous token was a FixedString indicator.
    previous_node_was_string = False

    for token in candidate_template:
        # If the token in our template is an argument, we need to assign it a
        # unique SyntaxSemanticsVariable and map it to the SemanticNode.
        if token == AlignmentSlots.Argument:
            slot_semantic_variable = STANDARD_SLOT_VARIABLES[aligned_node_index]
            template_elements.append(slot_semantic_variable)
            aligned_node = aligned_nodes[aligned_node_index].node
            if not isinstance(aligned_node, ObjectSemanticNode):
                logging.debug(
                    f"Attempted to make template where an Argument is not an "
                    f"ObjectSemanticNode. Invalid node: {aligned_node}"
                )
                # Log this failure and then ignore this attempt
                yield None
            slot_to_semantic_node.append((slot_semantic_variable, aligned_node))
            aligned_node_index += 1
            previous_node_was_string = False
        else:
            # We ignore this case to process A, F, F, A like A, F, A
            if previous_node_was_string:
                continue
            # We make a note of where the end of our prefix string can be,
            # then continue, as we'll handle this case afterwards.
            elif aligned_node_index == 0:
                prefix_string_end = aligned_nodes[aligned_node_index].span.start
            # Similarly to above, we instead mark the start of the postfix string.
            elif aligned_node_index == len(aligned_nodes):
                postfix_string_start = aligned_nodes[aligned_node_index - 1].span.end
            else:
                # If our FixedString is flanked by two Arguments, we just want to
                # acquire all the tokens between them.
                if (
                    aligned_nodes[aligned_node_index - 1].span.end
                    != aligned_nodes[aligned_node_index].span.start
                ):
                    candidate_token_span = Span(
                        aligned_nodes[aligned_node_index - 1].span.end,
                        aligned_nodes[aligned_node_index].span.start,
                    )
                    if not is_legal_template_span(
                        candidate_token_span, invalid_token_spans=invalid_token_spans
                    ):
                        # If not a valid span, ignore this attempt
                        continue
                    template_elements.extend(
                        sentence_tokens[
                            candidate_token_span.start : candidate_token_span.end
                        ]
                    )
                previous_node_was_string = True

    # We need to handle searching before or after the aligned token,
    # and we could generate multiple options of different lengths
    # between 1 and _MAXIMUM_ACTION_TEMPLATE_TOKEN_LENGTH.
    if prefix_string_end and postfix_string_start:
        for max_token_length_for_template_prefix in range(1, max_length + 1):
            prefix_candidate_token_span = Span(
                prefix_string_end - max_token_length_for_template_prefix,
                prefix_string_end,
            )
            if is_legal_template_span(
                prefix_candidate_token_span, invalid_token_spans=invalid_token_spans
            ):
                for max_token_length_for_template_postfix in range(1, max_length + 1):
                    postfix_candidate_token_span = Span(
                        postfix_string_start,
                        postfix_string_start + max_token_length_for_template_postfix,
                    )
                    if is_legal_template_span(
                        postfix_candidate_token_span,
                        invalid_token_spans=invalid_token_spans,
                    ):
                        final_template_elements: List[
                            Union[str, SyntaxSemanticsVariable]
                        ] = list(
                            sentence_tokens[
                                prefix_candidate_token_span.start : prefix_candidate_token_span.end
                            ]
                        )
                        final_template_elements.extend(template_elements)
                        final_template_elements.extend(
                            sentence_tokens[
                                postfix_candidate_token_span.start : postfix_candidate_token_span.end
                            ]
                        )
                        yield SurfaceTemplateBoundToSemanticNodes(
                            surface_template=SurfaceTemplate(
                                elements=final_template_elements,
                                determiner_prefix_slots=[
                                    SLOT for (SLOT, _) in slot_to_semantic_node
                                ],
                                language_mode=language_mode,
                            ),
                            slot_to_semantic_node=slot_to_semantic_node,
                        )
    elif prefix_string_end:
        for max_token_length_for_template_prefix in range(1, max_length + 1):
            prefix_candidate_token_span = Span(
                prefix_string_end - max_token_length_for_template_prefix,
                prefix_string_end,
            )
            if is_legal_template_span(
                prefix_candidate_token_span, invalid_token_spans=invalid_token_spans
            ):
                final_template_elements = list(
                    sentence_tokens[
                        prefix_candidate_token_span.start : prefix_candidate_token_span.end
                    ]
                )
                final_template_elements.extend(template_elements)
                yield SurfaceTemplateBoundToSemanticNodes(
                    surface_template=SurfaceTemplate(
                        elements=final_template_elements,
                        determiner_prefix_slots=[
                            SLOT for (SLOT, _) in slot_to_semantic_node
                        ],
                        language_mode=language_mode,
                    ),
                    slot_to_semantic_node=slot_to_semantic_node,
                )
    elif postfix_string_start:
        for max_token_length_for_template_postfix in range(1, max_length + 1):
            postfix_candidate_token_span = Span(
                postfix_string_start,
                postfix_string_start + max_token_length_for_template_postfix,
            )
            if is_legal_template_span(
                postfix_candidate_token_span, invalid_token_spans=invalid_token_spans
            ):
                final_template_elements = list(template_elements)
                final_template_elements.extend(
                    sentence_tokens[
                        postfix_candidate_token_span.start : postfix_candidate_token_span.end
                    ]
                )
                yield SurfaceTemplateBoundToSemanticNodes(
                    surface_template=SurfaceTemplate(
                        elements=final_template_elements,
                        determiner_prefix_slots=[
                            SLOT for (SLOT, _) in slot_to_semantic_node
                        ],
                        language_mode=language_mode,
                    ),
                    slot_to_semantic_node=slot_to_semantic_node,
                )
    else:
        yield SurfaceTemplateBoundToSemanticNodes(
            surface_template=SurfaceTemplate(
                elements=template_elements,
                determiner_prefix_slots=[SLOT for (SLOT, _) in slot_to_semantic_node],
                language_mode=language_mode,
            ),
            slot_to_semantic_node=slot_to_semantic_node,
        )
def _candidate_templates(
    self, language_perception_semantic_alignment: LanguagePerceptionSemanticAlignment
) -> AbstractSet[SurfaceTemplateBoundToSemanticNodes]:
    ret = []
    language_concept_alignment = (
        language_perception_semantic_alignment.language_concept_alignment
    )
    # Find all objects we have recognized...
    for (
        object_node,
        span_for_object,
    ) in language_concept_alignment.node_to_language_span.items():
        if isinstance(object_node, ObjectSemanticNode):
            try:
                # Any words immediately before them or after them are candidate attributes.
                # See https://github.com/isi-vista/adam/issues/791 .
                preceding_token_index = span_for_object.start - 1
                if (
                    preceding_token_index >= 0
                    and not language_concept_alignment.token_index_is_aligned(
                        preceding_token_index
                    )
                ):
                    ret.append(
                        SurfaceTemplateBoundToSemanticNodes(
                            language_concept_alignment.to_surface_template(
                                {object_node: SLOT1},
                                restrict_to_span=Span(
                                    preceding_token_index, span_for_object.end
                                ),
                                language_mode=self._language_mode,
                            ),
                            {SLOT1: object_node},
                        )
                    )
                following_token_index = span_for_object.end + 1
                if following_token_index < len(
                    language_concept_alignment.language.as_token_sequence()
                ) and not language_concept_alignment.token_index_is_aligned(
                    following_token_index
                ):
                    ret.append(
                        SurfaceTemplateBoundToSemanticNodes(
                            language_concept_alignment.to_surface_template(
                                {object_node: SLOT1},
                                restrict_to_span=Span(
                                    span_for_object.start, following_token_index
                                ),
                                language_mode=self._language_mode,
                            ),
                            {SLOT1: object_node},
                        )
                    )
            # Catches errors in to_surface_template() - we skip this case to prevent
            # the learning from breaking.
            except RuntimeError:
                continue
    return immutableset(
        bound_surface_template
        for bound_surface_template in ret
        # For now, we require templates to account for the entire utterance.
        # See https://github.com/isi-vista/adam/issues/789
        if covers_entire_utterance(
            bound_surface_template,
            language_concept_alignment,
            # We need to explicitly ignore determiners here for some reason.
            # See: https://github.com/isi-vista/adam/issues/871
            ignore_determiners=True,
        )
        # This keeps the relation learner from learning things such as "a_slot1", which
        # would pose an issue for later learning of attributes: the learner may consider
        # both the attribute and the object to be objects initially, leading it to try
        # to match two objects with a template that only has one slot.
        and not all(
            (e in ENGLISH_DETERMINERS or isinstance(e, SyntaxSemanticsVariable))
            for e in bound_surface_template.surface_template.elements
        )
    )
def match_against_tokens(
    self,
    token_sequence_to_match_against: Tuple[str, ...],
    *,
    slots_to_filler_spans: Mapping[SyntaxSemanticsVariable, Span],
) -> Optional[Span]:
    """
    Gets the token indices, if any, for the first match of this template against
    *token_sequence_to_match_against*, assuming any slots are filled by the tokens
    given by *slots_to_filler_spans*.
    """
    # First, we turn the template into a token sequence to search for
    # by filling in all the slots from the provided token span mapping.
    tokens_to_match = []
    for element in self.elements:
        if isinstance(element, str):
            # Hack to handle determiners.
            #
            # This may not handle Chinese properly; see
            # https://github.com/isi-vista/adam/issues/993
            try:
                index = token_sequence_to_match_against.index(element)
                if (
                    index - 1 >= 0
                    and token_sequence_to_match_against[index - 1]
                    in ENGLISH_DETERMINERS
                ):
                    tokens_to_match.append(token_sequence_to_match_against[index - 1])
            except ValueError:
                pass
            finally:
                tokens_to_match.append(element)
        else:
            slot_filler_span = slots_to_filler_spans.get(element)
            if slot_filler_span:
                # endpoints are exclusive
                start = slot_filler_span.start
                # Hack to handle determiners
                #
                # This may not handle Chinese properly; see
                # https://github.com/isi-vista/adam/issues/993
                if (
                    slot_filler_span.start - 1 >= 0
                    and token_sequence_to_match_against[slot_filler_span.start - 1]
                    in ENGLISH_DETERMINERS
                ):
                    start -= 1
                tokens_to_match.extend(
                    token_sequence_to_match_against[start : slot_filler_span.end]
                )
            # If the template contains an element not found in the mapping of slots to
            # spans, we could return None here. We don't want to do this now because of
            # generics.
            # else:
            #     return None
    # Now we need to check if the tokens to match occur in the given token sequence to
    # match against. We don't expect these sequences to be long, so an inefficient
    # solution is okay.
    if not tokens_to_match:
        raise RuntimeError("Don't know how to match an empty token sequence")
    next_idx_to_search_from = 0
    while next_idx_to_search_from < len(token_sequence_to_match_against):
        try:
            index_of_first_token = token_sequence_to_match_against.index(
                tokens_to_match[0], next_idx_to_search_from
            )
            candidate_match_exclusive_end = index_of_first_token + len(tokens_to_match)
            if candidate_match_exclusive_end <= len(token_sequence_to_match_against):
                if tokens_to_match == list(
                    token_sequence_to_match_against[
                        index_of_first_token:candidate_match_exclusive_end
                    ]
                ):
                    # span endpoints are exclusive
                    return Span(index_of_first_token, candidate_match_exclusive_end)
            # False alarm - the first token matched, but not the whole sequence.
            next_idx_to_search_from = index_of_first_token + 1
        except ValueError:
            # If we can't even find the first token of what we are searching for,
            # we definitely have no match.
            return None
    # We got all the way to the end without finding a match
    return None
def test_index(self) -> None:
    overlapping_items = (
        Foo(Span(0, 10)),
        Foo(Span(5, 25)),
        Foo(Span(20, 30)),
        Bar(Span(20, 30)),
    )
    index = HasSpanIndex.index(overlapping_items)
    self.assertEqual(
        immutableset([overlapping_items[0]]), index.get_exactly_matching(Span(0, 10))
    )
    self.assertEqual(
        immutableset([overlapping_items[1]]), index.get_exactly_matching(Span(5, 25))
    )
    self.assertEqual(
        immutableset([overlapping_items[2], overlapping_items[3]]),
        index.get_exactly_matching(Span(20, 30)),
    )
    self.assertEqual(immutableset(), index.get_exactly_matching(Span(6, 26)))
    self.assertEqual(immutableset(), index.get_overlapping(Span(31, 35)))
    self.assertEqual(
        immutableset([overlapping_items[2], overlapping_items[3]]),
        index.get_overlapping(Span(29, 32)),
    )
    self.assertEqual(immutableset(), index.get_contained(Span(25, 30)))
    self.assertEqual(
        immutableset([overlapping_items[2], overlapping_items[3]]),
        index.get_contained(Span(20, 30)),
    )
    self.assertEqual(
        immutableset(
            [
                overlapping_items[0],
                overlapping_items[1],
                overlapping_items[2],
                overlapping_items[3],
            ]
        ),
        index.get_contained(Span(0, 30)),
    )
    self.assertEqual(immutableset(), index.get_containing(Span(0, 15)))
    self.assertEqual(
        immutableset(
            [overlapping_items[1], overlapping_items[2], overlapping_items[3]]
        ),
        index.get_containing(Span(21, 24)),
    )
def span(self, start_index: int, *, end_index_exclusive: int) -> Span:
    return Span(start_index, end_index_exclusive)
def test_disjoint_index(self) -> None:
    overlapping_items = (
        Foo(Span(0, 10)),
        Foo(Span(5, 25)),
        Foo(Span(20, 30)),
        Bar(Span(20, 30)),
    )
    with self.assertRaisesRegex(
        ValueError, "Some range keys are connected or overlapping"
    ):
        HasSpanIndex.index_disjoint(overlapping_items)
    s1, s2, s3 = (Span(0, 3), Span(5, 25), Span(25, 30))
    s2_within = Span(5, 10)
    s4_contains = Span(5, 30)
    fs1, fs2, fs3 = Foo(s1), Foo(s2), Foo(s3)
    index = HasSpanIndex.index_disjoint((fs1, fs2, fs3))
    self.assertIsNone(index.get_exactly_matching(s2_within))
    self.assertEqual(fs3, index.get_exactly_matching(s3))
    self.assertEqual(immutableset(), index.get_overlapping(Span(35, 40)))
    self.assertEqual(immutableset([fs3]), index.get_overlapping(Span(28, 35)))
    self.assertEqual(immutableset([fs1, fs2]), index.get_overlapping(Span(2, 7)))
    self.assertEqual(immutableset(), index.get_contained(s2_within))
    self.assertEqual(immutableset([fs2, fs3]), index.get_contained(s4_contains))
    self.assertIsNone(index.get_containing(s4_contains))
    self.assertEqual(fs2, index.get_containing(s2_within))