Example #1
    def test_format(self):
        # we show how we can format a snippet (below) when the
        # annotations are relative to a larger enclosing text
        original_text = (
            "Eliot: Time present and time past / "
            "Are both perhaps present in time future..."
        )
        snippet_text = (
            "Time present and time past / " "Are both perhaps present in time future..."
        )

        annotations = [
            AnnotatedSpan("SIR_NOT_APPEARING_IN_THIS_SNIPPET", Span(1, 5)),
            AnnotatedSpan("LINE", Span(7, 33)),
            AnnotatedSpan("BAR", Span(8, 9)),
            AnnotatedSpan("LINE", Span(36, 75)),
            AnnotatedSpan("PP", Span(61, 75)),
            AnnotatedSpan("ELLIPSES", Span(75, 78)),
            AnnotatedSpan("FOO", Span(7, 19)),
            AnnotatedSpan("SINGLE_CHAR", Span(61, 62)),
        ]

        expected_result = (
            "<LINE><FOO>T<BAR>i</BAR>me present</FOO> and time past</LINE> "
            "/ <LINE>Are both perhaps present <PP><SINGLE_CHAR>i</SINGLE_CHAR>n "
            "time future</PP></LINE><ELLIPSES>...</ELLIPSES>"
        )

        self.assertEqual(
            expected_result,
            HTMLStyleAnnotationFormatter().annotated_text(
                snippet_text, annotations, text_offsets=Span(7, len(original_text))
            ),
        )
Example #2
def get_sentence_spans(document_text: str) -> List[Span]:
    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

    return [
        Span(*s) for s in tokenizer.span_tokenize(document_text,
                                                  realign_boundaries=True)
    ]
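
A quick usage sketch for the helper above (the sample text is invented): each returned Span holds character offsets into the input string, so slicing with a span's start and end recovers one sentence. This assumes the nltk and Span imports already used above.

# A minimal sketch, assuming get_sentence_spans() as defined above; the sample text is made up.
sample_text = "Time present and time past. Are both perhaps present in time future."
for sentence_span in get_sentence_spans(sample_text):
    # Span endpoints are exclusive, so this slice covers exactly one sentence.
    print(sentence_span.start, sentence_span.end,
          sample_text[sentence_span.start:sentence_span.end])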
Example #3
def contexts_from_justifications(justifications: ImmutableDict[str, Span],
                                 document) -> ImmutableDict[str, Span]:
    document_text = document["fulltext"]
    sentence_spans = get_sentence_spans(document_text)
    contexts: Dict[str, Span] = {}

    for justification_id, justification_span in justifications.items():
        for s_span in sentence_spans:
            if s_span.contains_span(justification_span):
                # the sentence tokenizer doesn't recognize double newline as a potential sentence boundary,
                # so we split on double newlines and return the parts of the pre/post context
                # that are closest to the mention
                precontext_lines = document_text[
                    s_span.start:justification_span.start].split("\n\n")
                precontext_extra = ("\n\n".join(precontext_lines[:-1])
                                    if len(precontext_lines) > 1 else "")

                postcontext_lines = document_text[
                    justification_span.end:s_span.end].split("\n\n")
                postcontext_extra = ("\n\n".join(postcontext_lines[1:])
                                     if len(postcontext_lines) > 1 else "")

                modified_sentence_start = s_span.start + len(precontext_extra)
                modified_sentence_end = s_span.end - len(postcontext_extra)

                assert (
                    justification_id not in contexts
                ), "justification should not be overlapping with more than one sentence"
                contexts[justification_id] = Span(modified_sentence_start,
                                                  modified_sentence_end)

    return immutabledict(contexts)
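
A hedged usage sketch for the function above: the document dictionary and the justification offsets below are invented, and `immutabledict` is assumed to be the one imported alongside `ImmutableDict` in these examples. Exact sentence boundaries depend on the NLTK Punkt tokenizer.

# A minimal sketch; "fake_document" and the justification span are made up for illustration.
fake_document = {"fulltext": "First sentence here. The mention we care about sits in this sentence."}
justifications = immutabledict({"just-1": Span(25, 32)})  # roughly the word "mention"
contexts = contexts_from_justifications(justifications, fake_document)
for justification_id, context_span in contexts.items():
    # Each context is the containing sentence, trimmed at double newlines.
    print(justification_id, fake_document["fulltext"][context_span.start:context_span.end])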
Example #4
def _render_html(corpus: Corpus, output_dir: Path, parent_or_child_id: str,
                 start: int, end: int) -> Tuple[Path, str]:
    """Outputs either the whole document rendered in HTML or a subspan. `end` is inclusive."""

    document = _get_document(corpus, parent_or_child_id)
    if not document:
        raise ValueError(
            f"{parent_or_child_id} not present in the document database.")

    justification_spans: ImmutableDict[str, Span] = immutabledict(
        {f"{start}:{end}": Span(start, end + 1)})

    contexts = contexts_from_justifications(justification_spans, document)

    to_render, _ = render_document(document["fulltext"], justification_spans,
                                   contexts)
    if not to_render:
        raise ValueError("Could not find anything to render.")

    final_html = _render_template(
        document=immutabledict({
            "id": document["parent_id"],
            "title": document["title"],
            "html": to_render,
            "span": f"{start}:{end}",
        }))
    output_file = output_dir / f"{document['parent_id']}_{start}-{end}.html"
    output_file.write_text(final_html)

    return output_file, document["fulltext"][start:end + 1]
Example #5
    def match_against_tokens(
        self,
        token_sequence_to_match_against: Tuple[str, ...],
        *,
        slots_to_filler_spans: Mapping[SyntaxSemanticsVariable, Span],
    ) -> Optional[Span]:
        """
        Gets the token indices, if any, for the first match of this template against
        *token_sequence_to_match_against*, assuming any slots are filled by the tokens given by
        *slots_to_filler_spans*.
        """
        # First, we turn the template into a token sequence to search for
        # by filling in all the slots from the provided token span mapping.
        tokens_to_match = []
        for element in self.elements:
            if isinstance(element, str):
                tokens_to_match.append(element)
            else:
                slot_filler_span = slots_to_filler_spans.get(element)
                if slot_filler_span:
                    # endpoints are exclusive
                    tokens_to_match.extend(token_sequence_to_match_against[
                        slot_filler_span.start:slot_filler_span.end])
                else:
                    raise RuntimeError(
                        f"Template contained variable {element}, "
                        f"but it was not found in the mapping of slots to spans: "
                        f"{slots_to_filler_spans}")

        # Now we need to check if the tokens to match occur in the given token sequence to
        # match against.  We don't expect these sequences to be long, so an inefficient solution
        # is okay.
        if not tokens_to_match:
            raise RuntimeError(
                "Don't know how to match any empty token sequence")
        next_idx_to_search_from = 0
        while next_idx_to_search_from < len(token_sequence_to_match_against):
            try:
                index_of_first_token = token_sequence_to_match_against.index(
                    tokens_to_match[0], next_idx_to_search_from)
                candidate_match_exclusive_end = index_of_first_token + len(
                    tokens_to_match)
                if candidate_match_exclusive_end <= len(
                        token_sequence_to_match_against):
                    if tokens_to_match == list(
                            token_sequence_to_match_against[
                                index_of_first_token:candidate_match_exclusive_end]):
                        # span endpoints are exclusive
                        return Span(index_of_first_token,
                                    candidate_match_exclusive_end)
                # False alarm - the first token matched, but not the whole sequence.
                next_idx_to_search_from = index_of_first_token + 1
            except ValueError:
                # If we can't even find the first token of what we are searching for,
                # we definitely have no match.
                return None
        # We got all the way to the end without finding a match
        return None
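
The search loop above is just a naive subsequence scan over the token sequence. The same idea can be sketched standalone (the function name and sample tokens below are illustrative only, not part of the original API):

# A minimal sketch of the naive subsequence search used above, independent of the template class.
def find_subsequence(needle, haystack):
    next_idx = 0
    while next_idx < len(haystack):
        try:
            first = haystack.index(needle[0], next_idx)
        except ValueError:
            return None  # the first token never occurs again, so there is no match
        end = first + len(needle)
        if end <= len(haystack) and list(haystack[first:end]) == list(needle):
            return (first, end)  # endpoints are exclusive, like Span
        next_idx = first + 1  # false alarm: the first token matched, but not the whole sequence
    return None

print(find_subsequence(["time", "past"], ["time", "present", "and", "time", "past"]))  # (3, 5)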
Example #6
def _sort_mapping_by_token_spans(
        pairs) -> ImmutableDict[ObjectSemanticNode, Span]:
    # we type: ignore because the proper typing of pairs is huge and mypy is going to screw it up
    # anyway.
    unsorted = immutabledict(pairs)  # type: ignore
    return immutabledict(
        (matched_node, token_span) for (matched_node, token_span) in sorted(
            unsorted.items(),
            key=lambda item: Span.earliest_then_longest_first_key(item[1]),
        ))
Example #7
    def annotated_text(
        self,
        text: str,
        annotations: Collection[AnnotatedSpan],
        *,
        text_offsets: Optional[Span] = None,
    ) -> str:
        """
        Mark annotations on text in an HTML-like style.

        Each annotation will become an HTML tag wrapping the text at the corresponding offsets.
        Any attributes will become HTML attributes.

        This does not add any surrounding HTML elements (`head`, `body`, etc.), so if desired the
        user should add them afterwards.

        If `text_offsets` is specified, the annotations are assumed to have offsets relative to
        some larger string, of which `text` is the substring located at `text_offsets`.  You might
        use this, for example, to render a single paragraph from a document.
        """
        if not text_offsets:
            text_offsets = Span.from_inclusive_to_exclusive(0, len(text))
        check_arg(
            len(text_offsets) == len(text),
            f"Text offsets length {len(text_offsets)} "
            f"does not match text length {len(text)}",
        )

        # we process the annotations to (a) ensure they all fit within the requested snippet
        # and (b) shift their offsets so that all offsets are relative to the text being
        # formatted
        processed_annotations = self._clip_to_offsets_and_shift(
            annotations, text_offsets)

        ret = io.StringIO()
        last_uncopied_offset = 0
        for tag in self._tag_sequence(processed_annotations):
            if last_uncopied_offset < tag.offset:
                ret.write(text[last_uncopied_offset:tag.offset])
                last_uncopied_offset = tag.offset

            ret.write(tag.string)

        # get any trailing text after last tag
        if last_uncopied_offset < text_offsets.end:
            ret.write(text[last_uncopied_offset:text_offsets.end])
        return ret.getvalue()
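
A short, hedged example of calling `annotated_text` directly (the text, tag name, and offsets below are invented): when `text_offsets` is omitted, annotation spans are read as offsets into `text` itself.

# A minimal sketch, assuming HTMLStyleAnnotationFormatter, AnnotatedSpan, and Span as used in Example #1.
formatter = HTMLStyleAnnotationFormatter()
marked_up = formatter.annotated_text(
    "Time present and time past",
    [AnnotatedSpan("LINE", Span(0, 12))],
)
# Expected, per the semantics described above: "<LINE>Time present</LINE> and time past"
print(marked_up)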
Example #8
    def test_intersection(self) -> None:
        s1, s2, s3, s4 = (Span(0, 3), Span(2, 25), Span(25, 30), Span(10, 20))
        s1_s2_intersection = Span(2, 3)
        s2_s4_intersection = Span(10, 20)

        self.assertIsNone(s1.intersection(s3))
        self.assertIsNone(s2.intersection(s3))
        self.assertEqual(s1.intersection(s2), s1_s2_intersection)
        self.assertEqual(s2.intersection(s4), s2_s4_intersection)
        self.assertEqual(s4.intersection(s2), s2_s4_intersection)
Example #9
def parse_text_from_source(text_justification_lookup: TextJustificationLookup,
                           inf_just_pattern,
                           inf_just_span):
    match = re.search(inf_just_pattern, inf_just_span)
    if match:
        # source = match.group(1)
        document = match.group(2)
        start = int(match.group(3))
        end = int(match.group(4))
        text_descriptor = TextDescriptor(doceid=document,
                                         span=Span.from_inclusive_to_exclusive(start, end + 1),
                                         language=None)
        try:
            lookup = text_justification_lookup.text_for_justification(text_descriptor, 50)
            return lookup.spanning_tokens, lookup.original_text
        except (RuntimeError, AttributeError):
            return 'None', 'None'
    else:
        return 'None', 'None'
Example #10
def render_single_justification_document(document: dict,
                                         justification: Justification) -> str:

    span_start = justification.span_start
    span_end = justification.span_end

    if span_start is None or span_end is None:
        raise ValueError(
            "Justification to render must have values for span_start and span_end."
        )

    justification_spans: ImmutableDict[str, Span] = immutabledict(
        {f"{span_start}:{span_end}": Span(span_start, span_end + 1)})

    contexts = contexts_from_justifications(justification_spans, document)

    to_render, _ = render_document(document["fulltext"], justification_spans,
                                   contexts)
    if not to_render:
        raise ValueError("Could not find anything to render.")

    return to_render
Example #11
    def is_legal_template_span(
        candidate_token_span: Span, *, invalid_token_spans: ImmutableSet[Span]
    ) -> bool:
        # A template token span can't exceed the bounds of the utterance
        if candidate_token_span.start < 0:
            return False
        if candidate_token_span.end > len(sentence_tokens):
            return False
        # or be bigger than our maximum template size...
        if len(candidate_token_span) > max_length:
            return False

        # ...or contain any span known to be invalid...
        for span in invalid_token_spans:
            if candidate_token_span.contains_span(span):
                return False

        # ...or include any token that we have already aligned to some other meaning.
        for token_index in range(candidate_token_span.start, candidate_token_span.end):
            if language_concept_alignment.token_index_is_aligned(token_index):
                return False

        return True
Example #12
    metadata_from_wordpiece = doc.metadata_for(tokens_from_wordpiece)
    tokens_from_spacy = metadata_from_wordpiece[
        WordPieceTokenizationAnnotator.EXISTING_TOKEN_THEORY_USED_FOR_WORDPIECE
    ]
    map_spacy_to_wordpiece_indices = metadata_from_wordpiece[
        WordPieceTokenizationAnnotator.MULTIMAP_FROM_EXISTING_TO_WORDPIECE_TOKENIZATION
    ]
    mentions_from_apf = doc_with_lots_of_mention_algos.mentions(algorithm(ApfIngester))
    mentions_from_corenlp = doc_with_lots_of_mention_algos.mentions(
        algorithm(CoreNLPNameFinder)
    )
    mentions_from_spacy = doc_with_lots_of_mention_algos.mentions(
        algorithm(SpacyAnnotator)
    )

    s = Span.from_inclusive_to_exclusive(2778, 2915)
    print("== ACE ==")
    print(
        "\n".join(
            str(mention)
            for mention in get_items_overlapping_with_this_one(s, mentions_from_apf)
        )
    )
    print("== CoreNLP ==")
    print(
        "\n".join(
            str(mention)
            for mention in get_items_overlapping_with_this_one(s, mentions_from_corenlp)
        )
    )
    print("== spaCy ==")
Example #13
    def process_aligned_objects_with_template(
        candidate_template: Tuple[AlignmentSlots, ...],
        aligned_nodes: Tuple[SemanticNodeWithSpan, ...],
        *,
        invalid_token_spans: ImmutableSet[Span],
    ) -> Iterable[Optional[SurfaceTemplateBoundToSemanticNodes]]:

        aligned_node_index = 0
        template_elements: List[Union[str, SyntaxSemanticsVariable]] = []
        slot_to_semantic_node: List[Tuple[SyntaxSemanticsVariable, SemanticNode]] = []

        # We need to handle fixed strings that are a prefix or postfix to the rest of the
        # sentence differently, as they don't have a fixed length, so we could generate
        # multiple options.
        prefix_string_end = None
        postfix_string_start = None
        # In the event we generate a candidate template like A, F, F, A,
        # we want to treat it like A, F, A,
        # so we keep track of whether the previous token was a FixedString indicator.
        previous_node_was_string = False

        for token in candidate_template:
            # If the token in our template is an argument we need to assign it a
            # unique SyntaxSemanticsVariable, and map it to the SemanticNode
            if token == AlignmentSlots.Argument:
                slot_semantic_variable = STANDARD_SLOT_VARIABLES[aligned_node_index]
                template_elements.append(slot_semantic_variable)
                aligned_node = aligned_nodes[aligned_node_index].node
                if not isinstance(aligned_node, ObjectSemanticNode):
                    logging.debug(
                        f"Attempted to make template where an Argument is not an ObjectSemanticNode."
                        f"Invalid node: {aligned_node}"
                    )
                    # Log this failure and then ignore this attempt
                    yield None
                slot_to_semantic_node.append((slot_semantic_variable, aligned_node))
                aligned_node_index += 1
                previous_node_was_string = False
            else:
                # We ignore this case to process A, F, F, A like A, F, A
                if previous_node_was_string:
                    continue
                # We make a note of where the end of our prefix string can be
                # Then continue as we'll handle this case afterwards
                elif aligned_node_index == 0:
                    prefix_string_end = aligned_nodes[aligned_node_index].span.start
                # Similarly to the above, we instead mark the start of the postfix string
                elif aligned_node_index == len(aligned_nodes):
                    postfix_string_start = aligned_nodes[aligned_node_index - 1].span.end
                else:
                    # If our FixedString is flanked by two Arguments we just want to acquire all the tokens
                    # between them
                    if (
                        aligned_nodes[aligned_node_index - 1].span.end
                        != aligned_nodes[aligned_node_index].span.start
                    ):
                        candidate_token_span = Span(
                            aligned_nodes[aligned_node_index - 1].span.end,
                            aligned_nodes[aligned_node_index].span.start,
                        )
                        if not is_legal_template_span(
                            candidate_token_span, invalid_token_spans=invalid_token_spans
                        ):
                            # If not a valid span, ignore this attempt
                            continue
                        template_elements.extend(
                            sentence_tokens[
                                candidate_token_span.start : candidate_token_span.end
                            ]
                        )
                    previous_node_was_string = True
        # We need to handle searching before or after the aligned token
        # And we could generate multiple options of different lengths
        # between 1 and _MAXIMUM_ACTION_TEMPLATE_TOKEN_LENGTH
        if prefix_string_end and postfix_string_start:
            for max_token_length_for_template_prefix in range(1, max_length + 1):
                prefix_candidate_token_span = Span(
                    prefix_string_end - max_token_length_for_template_prefix,
                    prefix_string_end,
                )
                if is_legal_template_span(
                    prefix_candidate_token_span, invalid_token_spans=invalid_token_spans
                ):
                    for max_token_length_for_template_postfix in range(1, max_length + 1):
                        postfix_candidate_token_span = Span(
                            postfix_string_start,
                            postfix_string_start + max_token_length_for_template_postfix,
                        )
                        if is_legal_template_span(
                            postfix_candidate_token_span,
                            invalid_token_spans=invalid_token_spans,
                        ):
                            final_template_elements: List[
                                Union[str, SyntaxSemanticsVariable]
                            ] = list(
                                sentence_tokens[
                                    prefix_candidate_token_span.start : prefix_candidate_token_span.end
                                ]
                            )
                            final_template_elements.extend(template_elements)
                            final_template_elements.extend(
                                sentence_tokens[
                                    postfix_candidate_token_span.start : postfix_candidate_token_span.end
                                ]
                            )
                            yield SurfaceTemplateBoundToSemanticNodes(
                                surface_template=SurfaceTemplate(
                                    elements=final_template_elements,
                                    determiner_prefix_slots=[
                                        SLOT for (SLOT, _) in slot_to_semantic_node
                                    ],
                                    language_mode=language_mode,
                                ),
                                slot_to_semantic_node=slot_to_semantic_node,
                            )
        elif prefix_string_end:
            for max_token_length_for_template_prefix in range(1, max_length + 1):
                prefix_candidate_token_span = Span(
                    prefix_string_end - max_token_length_for_template_prefix,
                    prefix_string_end,
                )
                if is_legal_template_span(
                    prefix_candidate_token_span, invalid_token_spans=invalid_token_spans
                ):
                    final_template_elements = list(
                        sentence_tokens[
                            prefix_candidate_token_span.start : prefix_candidate_token_span.end
                        ]
                    )
                    final_template_elements.extend(template_elements)
                    yield SurfaceTemplateBoundToSemanticNodes(
                        surface_template=SurfaceTemplate(
                            elements=final_template_elements,
                            determiner_prefix_slots=[
                                SLOT for (SLOT, _) in slot_to_semantic_node
                            ],
                            language_mode=language_mode,
                        ),
                        slot_to_semantic_node=slot_to_semantic_node,
                    )
        elif postfix_string_start:
            for max_token_length_for_template_postfix in range(1, max_length + 1):
                postfix_candidate_token_span = Span(
                    postfix_string_start,
                    postfix_string_start + max_token_length_for_template_postfix,
                )
                if is_legal_template_span(
                    postfix_candidate_token_span, invalid_token_spans=invalid_token_spans
                ):
                    final_template_elements = list(template_elements)
                    final_template_elements.extend(
                        sentence_tokens[
                            postfix_candidate_token_span.start : postfix_candidate_token_span.end
                        ]
                    )
                    yield SurfaceTemplateBoundToSemanticNodes(
                        surface_template=SurfaceTemplate(
                            elements=final_template_elements,
                            determiner_prefix_slots=[
                                SLOT for (SLOT, _) in slot_to_semantic_node
                            ],
                            language_mode=language_mode,
                        ),
                        slot_to_semantic_node=slot_to_semantic_node,
                    )
        else:
            yield SurfaceTemplateBoundToSemanticNodes(
                surface_template=SurfaceTemplate(
                    elements=template_elements,
                    determiner_prefix_slots=[SLOT for (SLOT, _) in slot_to_semantic_node],
                    language_mode=language_mode,
                ),
                slot_to_semantic_node=slot_to_semantic_node,
            )
Example #14
    def _candidate_templates(
        self, language_perception_semantic_alignment: LanguagePerceptionSemanticAlignment
    ) -> AbstractSet[SurfaceTemplateBoundToSemanticNodes]:
        ret = []
        language_concept_alignment = (
            language_perception_semantic_alignment.language_concept_alignment
        )
        # Find all objects we have recognized...
        for (
            object_node,
            span_for_object,
        ) in language_concept_alignment.node_to_language_span.items():
            if isinstance(object_node, ObjectSemanticNode):
                try:
                    # Any words immediately before them or after them are candidate attributes.
                    # See https://github.com/isi-vista/adam/issues/791 .
                    preceding_token_index = span_for_object.start - 1
                    if (
                        preceding_token_index >= 0
                        and not language_concept_alignment.token_index_is_aligned(
                            preceding_token_index
                        )
                    ):

                        ret.append(
                            SurfaceTemplateBoundToSemanticNodes(
                                language_concept_alignment.to_surface_template(
                                    {object_node: SLOT1},
                                    restrict_to_span=Span(
                                        preceding_token_index, span_for_object.end
                                    ),
                                    language_mode=self._language_mode,
                                ),
                                {SLOT1: object_node},
                            )
                        )
                    following_token_index = span_for_object.end + 1
                    if following_token_index < len(
                        language_concept_alignment.language.as_token_sequence()
                    ) and not language_concept_alignment.token_index_is_aligned(
                        following_token_index
                    ):
                        ret.append(
                            SurfaceTemplateBoundToSemanticNodes(
                                language_concept_alignment.to_surface_template(
                                    {object_node: SLOT1},
                                    restrict_to_span=Span(
                                        span_for_object.start, following_token_index
                                    ),
                                    language_mode=self._language_mode,
                                ),
                                {SLOT1: object_node},
                            )
                        )
                # Catches errors in to_surface_template() - we skip this case to prevent the learning from breaking.
                except RuntimeError:
                    continue
        return immutableset(
            bound_surface_template
            for bound_surface_template in ret
            # For now, we require templates to account for the entire utterance.
            # See https://github.com/isi-vista/adam/issues/789
            if covers_entire_utterance(
                bound_surface_template,
                language_concept_alignment,
                # We need to explicitly ignore determiners here for some reason
                # See: https://github.com/isi-vista/adam/issues/871
                ignore_determiners=True,
            )
            # this keeps the relation learner from learning things such as "a_slot1" which will pose an issue for
            # later learning of attributes since the learner may consider both the attribute and the object to be objects initially,
            # leading it to try to match two objects with a template that only has one slot
            and not all(
                (e in ENGLISH_DETERMINERS or isinstance(e, SyntaxSemanticsVariable))
                for e in bound_surface_template.surface_template.elements
            )
        )
Example #15
    def match_against_tokens(
        self,
        token_sequence_to_match_against: Tuple[str, ...],
        *,
        slots_to_filler_spans: Mapping[SyntaxSemanticsVariable, Span],
    ) -> Optional[Span]:
        """
        Gets the token indices, if any, for the first match of this template against
        *token_sequence_to_match_against*, assuming any slots are filled by the tokens given by
        *slots_to_filler_spans*.
        """
        # First, we turn the template into a token sequence to search for
        # by filling in all the slots from the provided token span mapping.
        tokens_to_match = []
        for element in self.elements:
            if isinstance(element, str):
                # Hack to handle determiners.
                #
                # This may not handle Chinese properly; see
                # https://github.com/isi-vista/adam/issues/993
                try:
                    index = token_sequence_to_match_against.index(element)
                    if (index - 1 >= 0
                            and token_sequence_to_match_against[index - 1]
                            in ENGLISH_DETERMINERS):
                        tokens_to_match.append(
                            token_sequence_to_match_against[index - 1])
                except ValueError:
                    pass
                finally:
                    tokens_to_match.append(element)
            else:
                slot_filler_span = slots_to_filler_spans.get(element)
                if slot_filler_span:
                    # endpoints are exclusive
                    start = slot_filler_span.start
                    # Hack to handle determiners
                    #
                    # This may not handle Chinese properly; see
                    # https://github.com/isi-vista/adam/issues/993
                    if (slot_filler_span.start - 1 >= 0
                            and token_sequence_to_match_against[
                                slot_filler_span.start - 1]
                            in ENGLISH_DETERMINERS):
                        start -= 1
                    tokens_to_match.extend(
                        token_sequence_to_match_against[start:slot_filler_span.end])
                # If template contains an element not found in the mapping of slots to spans, we can return empty here.
                # We don't want to do this now because of generics.
                # else:
                #   return None

        # Now we need to check if the tokens to match occur in the given token sequence to
        # match against.  We don't expect these sequences to be long, so an inefficient solution
        # is okay.
        if not tokens_to_match:
            raise RuntimeError(
                "Don't know how to match any empty token sequence")
        next_idx_to_search_from = 0
        while next_idx_to_search_from < len(token_sequence_to_match_against):
            try:
                index_of_first_token = token_sequence_to_match_against.index(
                    tokens_to_match[0], next_idx_to_search_from)
                candidate_match_exclusive_end = index_of_first_token + len(
                    tokens_to_match)
                if candidate_match_exclusive_end <= len(
                        token_sequence_to_match_against):
                    if tokens_to_match == list(
                            token_sequence_to_match_against[
                                index_of_first_token:candidate_match_exclusive_end]):
                        # span endpoints are exclusive
                        return Span(index_of_first_token,
                                    candidate_match_exclusive_end)
                # False alarm - the first token matched, but not the whole sequence.
                next_idx_to_search_from = index_of_first_token + 1
            except ValueError:
                # If we can't even find the first token of what we are searching for,
                # we definitely have no match.
                return None
        # We got all the way to the end without finding a match
        return None
Example #16
    def test_index(self) -> None:
        overlapping_items = (
            Foo(Span(0, 10)),
            Foo(Span(5, 25)),
            Foo(Span(20, 30)),
            Bar(Span(20, 30)),
        )
        index = HasSpanIndex.index(overlapping_items)
        self.assertEqual(immutableset([overlapping_items[0]]),
                         index.get_exactly_matching(Span(0, 10)))
        self.assertEqual(immutableset([overlapping_items[1]]),
                         index.get_exactly_matching(Span(5, 25)))
        self.assertEqual(
            immutableset([overlapping_items[2], overlapping_items[3]]),
            index.get_exactly_matching(Span(20, 30)),
        )
        self.assertEqual(immutableset(),
                         index.get_exactly_matching(Span(6, 26)))
        self.assertEqual(immutableset(), index.get_overlapping(Span(31, 35)))
        self.assertEqual(
            immutableset([overlapping_items[2], overlapping_items[3]]),
            index.get_overlapping(Span(29, 32)),
        )
        self.assertEqual(immutableset(), index.get_contained(Span(25, 30)))
        self.assertEqual(
            immutableset([overlapping_items[2], overlapping_items[3]]),
            index.get_contained(Span(20, 30)),
        )
        self.assertEqual(
            immutableset([
                overlapping_items[0],
                overlapping_items[1],
                overlapping_items[2],
                overlapping_items[3],
            ]),
            index.get_contained(Span(0, 30)),
        )
        self.assertEqual(immutableset(), index.get_containing(Span(0, 15)))
        self.assertEqual(
            immutableset([
                overlapping_items[1], overlapping_items[2],
                overlapping_items[3]
            ]),
            index.get_containing(Span(21, 24)),
        )
Example #17
    def span(self, start_index: int, *, end_index_exclusive: int) -> Span:
        return Span(start_index, end_index_exclusive)
Example #18
    def test_disjoint_index(self) -> None:
        overlapping_items = (
            Foo(Span(0, 10)),
            Foo(Span(5, 25)),
            Foo(Span(20, 30)),
            Bar(Span(20, 30)),
        )
        with self.assertRaisesRegex(
                ValueError, "Some range keys are connected or overlapping"):
            HasSpanIndex.index_disjoint(overlapping_items)

        s1, s2, s3 = (Span(0, 3), Span(5, 25), Span(25, 30))
        s2_within = Span(5, 10)
        s4_contains = Span(5, 30)
        fs1, fs2, fs3 = Foo(s1), Foo(s2), Foo(s3)
        index = HasSpanIndex.index_disjoint((fs1, fs2, fs3))

        self.assertIsNone(index.get_exactly_matching(s2_within))
        self.assertEqual(fs3, index.get_exactly_matching(s3))
        self.assertEqual(immutableset(), index.get_overlapping(Span(35, 40)))
        self.assertEqual(immutableset([fs3]),
                         index.get_overlapping(Span(28, 35)))
        self.assertEqual(immutableset([fs1, fs2]),
                         index.get_overlapping(Span(2, 7)))
        self.assertEqual(immutableset(), index.get_contained(s2_within))
        self.assertEqual(immutableset([fs2, fs3]),
                         index.get_contained(s4_contains))
        self.assertIsNone(index.get_containing(s4_contains))
        self.assertEqual(fs2, index.get_containing(s2_within))