Ejemplo n.º 1
0
    def _split_uniques_coalesce_rest(
        cls,
        split_file: List[IntermediateFileSlice],
        raw_occurrences: Dict[str, List[int]],
        templ_occurrences: Dict[str, List[int]],
        templated_str: str,
    ) -> Iterator[TemplatedFileSlice]:
        """Within each of the compound sections split on unique literals.

        For everything else we coalesce to the dominant type.

        Each intermediate slice is handled by the first strategy which
        applies, in this order:

        1. Zero length (in the templated file) slices are coalesced and
           yielded as a single point.
        2. Slices for which `try_simple()` succeeds are yielded directly.
        3. The ends are trimmed with `trim_ends()`. The head is yielded
           immediately, the tail is buffered and yielded after the rest of
           this slice has been handled. `try_simple()` is then retried on
           what remains.
        4. If there are no strings which occur exactly once in the raw
           portion (and at least once in the templated portion), the whole
           remainder is coalesced and yielded.
        5. If some of those strings also occur exactly once in the templated
           portion ("two way uniques"), we split on them and recurse into
           the sections between them.
        6. Otherwise there are only "one way uniques" (unique in the raw,
           repeated in the templated, i.e. loops). We walk through their
           templated occurrences, recursing where a matching portion of the
           slice buffer can be identified and otherwise over-estimating
           with a single "templated" slice.

        Args:
            split_file: The intermediate slices to process.
            raw_occurrences: Positions in the raw file, keyed by literal.
            templ_occurrences: Positions in the templated file, keyed by
                literal.
            templated_str: The templated file as a string.

        Yields:
            Slices made of the type of segment, the slice in the raw file
                and the slice in the templated file.

        """
        # A buffer to capture tail segments
        tail_buffer: List[TemplatedFileSlice] = []

        templater_logger.debug("    _split_uniques_coalesce_rest: %s", split_file)

        for int_file_slice in split_file:
            # Yield anything from the tail buffer. This is the tail trimmed
            # from the *previous* element, which has to come out before
            # anything belonging to this one.
            if tail_buffer:
                templater_logger.debug(
                    "        Yielding Tail Buffer [start]: %s", tail_buffer
                )
                yield from tail_buffer
                tail_buffer = []

            # Check whether we're handling a zero length slice.
            if (
                int_file_slice.templated_slice.stop
                - int_file_slice.templated_slice.start
                == 0
            ):
                point_combo = int_file_slice.coalesce()
                templater_logger.debug(
                    "        Yielding Point Combination: %s", point_combo
                )
                yield point_combo
                continue

            # Yield anything simple. `try_simple` raises ValueError when the
            # slice cannot be handled that way, in which case we carry on.
            try:
                simple_elem = int_file_slice.try_simple()
                templater_logger.debug("        Yielding Simple: %s", simple_elem)
                yield simple_elem
                continue
            except ValueError:
                pass

            # Trim ends and overwrite the current working copy.
            # NB: The tail buffer is only yielded at the start of the next
            # iteration (or after the loop for the final element).
            head_buffer, int_file_slice, tail_buffer = int_file_slice.trim_ends(
                templated_str=templated_str
            )
            if head_buffer:
                yield from head_buffer
            # Have we consumed the whole thing?
            if not int_file_slice.slice_buffer:
                continue

            # Try to yield simply again (post trim)
            try:
                simple_elem = int_file_slice.try_simple()
                templater_logger.debug("        Yielding Simple: %s", simple_elem)
                yield simple_elem
                continue
            except ValueError:
                pass

            templater_logger.debug("        Intermediate Slice: %s", int_file_slice)
            # Generate the coalesced version in case we need it
            coalesced = int_file_slice.coalesce()

            # Look for anchors.
            # NOTE(review): presumably `_filter_occurrences` restricts the
            # occurrences to those falling within the given slice - confirm
            # against its definition.
            raw_occs = cls._filter_occurrences(
                int_file_slice.source_slice, raw_occurrences
            )
            templ_occs = cls._filter_occurrences(
                int_file_slice.templated_slice, templ_occurrences
            )
            # Do we have any uniques to split on?
            # NB: We use `get` on the templated occurrences, because it's possible
            # that because of an if statement, something is in the source, but
            # not in the templated at all. In that case, we shouldn't use it.
            # One way uniques: exactly once in the raw, at least once in the
            # templated.
            one_way_uniques = [
                key
                for key in raw_occs.keys()
                if len(raw_occs[key]) == 1 and len(templ_occs.get(key, [])) >= 1
            ]
            # Two way uniques: additionally exactly once in the templated.
            two_way_uniques = [
                key for key in one_way_uniques if len(templ_occs[key]) == 1
            ]
            # if we don't have anything to anchor on, then just return (coalescing types)
            if not raw_occs or not templ_occs or not one_way_uniques:
                templater_logger.debug(
                    "        No Anchors or Uniques. Yielding Whole: %s", coalesced
                )
                yield coalesced
                continue

            # Deal with the inner segment itself.
            templater_logger.debug(
                "        Intermediate Slice [post trim]: %s: %r",
                int_file_slice,
                templated_str[int_file_slice.templated_slice],
            )
            templater_logger.debug("        One Way Uniques: %s", one_way_uniques)
            templater_logger.debug("        Two Way Uniques: %s", two_way_uniques)

            # Hang onto the starting position, which we'll advance as we go.
            # This is a (source position, templated position) pair.
            starts = (
                int_file_slice.source_slice.start,
                int_file_slice.templated_slice.start,
            )

            # Deal with two way uniques first, because they are easier.
            # If we do find any we use recursion, because we'll want to do
            # all of the above checks again.
            if two_way_uniques:
                # Yield the uniques and coalesce anything between.
                # `bookmark_idx` is the index of the first element of the
                # slice buffer which hasn't been handled yet.
                bookmark_idx = 0
                for idx, raw_slice in enumerate(int_file_slice.slice_buffer):
                    pos = 0
                    unq: Optional[str] = None
                    # Does this element contain one of our uniques? If so, where?
                    # NB: If it contains more than one, the last match in
                    # `two_way_uniques` is the one which is kept.
                    for unique in two_way_uniques:
                        if unique in raw_slice.raw:
                            pos = raw_slice.raw.index(unique)
                            unq = unique

                    if unq:
                        # Yes it does. Handle it.

                        # Get the position of the unique section, as a
                        # (source position, templated position) pair.
                        unique_position = (
                            raw_occs[unq][0],
                            templ_occs[unq][0],
                        )
                        templater_logger.debug(
                            "            Handling Unique: %r, %s, %s, %r",
                            unq,
                            pos,
                            unique_position,
                            raw_slice,
                        )

                        # Handle full slices up to this one
                        if idx > bookmark_idx:
                            # Recurse to deal with any loops separately
                            yield from cls._split_uniques_coalesce_rest(
                                [
                                    IntermediateFileSlice(
                                        "compound",
                                        # slice up to this unique
                                        slice(starts[0], unique_position[0] - pos),
                                        slice(starts[1], unique_position[1] - pos),
                                        int_file_slice.slice_buffer[bookmark_idx:idx],
                                    )
                                ],
                                raw_occs,
                                templ_occs,
                                templated_str,
                            )

                        # Handle any potential partial slice if we're part way through this one.
                        if pos > 0:
                            yield TemplatedFileSlice(
                                raw_slice.slice_type,
                                slice(unique_position[0] - pos, unique_position[0]),
                                slice(unique_position[1] - pos, unique_position[1]),
                            )

                        # Handle the unique itself and update the bookmark
                        starts = (
                            unique_position[0] + len(unq),
                            unique_position[1] + len(unq),
                        )
                        yield TemplatedFileSlice(
                            raw_slice.slice_type,
                            slice(unique_position[0], starts[0]),
                            slice(unique_position[1], starts[1]),
                        )
                        # Move the bookmark after this position
                        bookmark_idx = idx + 1

                        # Handle any remnant after the unique.
                        if raw_slice.raw[pos + len(unq) :]:
                            remnant_length = len(raw_slice.raw) - (len(unq) + pos)
                            _starts = starts
                            starts = (
                                starts[0] + remnant_length,
                                starts[1] + remnant_length,
                            )
                            yield TemplatedFileSlice(
                                raw_slice.slice_type,
                                slice(_starts[0], starts[0]),
                                slice(_starts[1], starts[1]),
                            )

                if bookmark_idx == 0:  # pragma: no cover
                    # This is a SAFETY VALVE. In Theory we should never be here
                    # and if we are it implies an error elsewhere. This clause
                    # should stop any potential infinite recursion in its tracks
                    # by simply classifying the whole of the current block as
                    # templated and just stopping here.
                    # Bugs triggering this eventuality have been observed in 0.4.0.
                    templater_logger.info(
                        "        Safety Value Info: %s, %r",
                        two_way_uniques,
                        templated_str[int_file_slice.templated_slice],
                    )
                    templater_logger.warning(
                        "        Python templater safety value unexpectedly triggered. "
                        "Please report your raw and compiled query on github for debugging."
                    )
                    # NOTE: If a bug is reported here, this will incorrectly
                    # classify more of the query as "templated" than it should.
                    yield coalesced
                    continue

                # At the end of the loop deal with any remaining slices.
                # The above "Safety Valve"TM should keep us safe from infinite
                # recursion.
                if len(int_file_slice.slice_buffer) > bookmark_idx:
                    # Recurse to deal with any loops separately
                    yield from cls._split_uniques_coalesce_rest(
                        [
                            IntermediateFileSlice(
                                "compound",
                                # Slicing is easy here, we have no choice
                                slice(starts[0], int_file_slice.source_slice.stop),
                                slice(starts[1], int_file_slice.templated_slice.stop),
                                # Calculate the subsection to deal with.
                                int_file_slice.slice_buffer[
                                    bookmark_idx : len(int_file_slice.slice_buffer)
                                ],
                            )
                        ],
                        raw_occs,
                        templ_occs,
                        templated_str,
                    )
                # We continue here because the buffer should be exhausted,
                # and if there's more to do we'll do it in the recursion.
                continue

            # If we get here, then there ARE uniques, but they are only ONE WAY.
            # This means loops. Loops are tricky.
            # We're very unlikely to get here (impossible?) with just python
            # formatting, but this class is also the base for the jinja templater
            # (and others?) so it may be used there.
            # One way uniques give us landmarks to try and estimate what to do with them.
            # NOTE(review): presumably this gives (literal, templated position)
            # pairs ordered by templated position - confirm against
            # `_sorted_occurrence_tuples`.
            owu_templ_tuples = cls._sorted_occurrence_tuples(
                {key: templ_occs[key] for key in one_way_uniques}
            )

            templater_logger.debug(
                "        Handling One Way Uniques: %s", owu_templ_tuples
            )

            # Hang onto out *ending* position too from here.
            stops = (
                int_file_slice.source_slice.stop,
                int_file_slice.templated_slice.stop,
            )

            # OWU in this context refers to "One Way Unique"
            this_owu_idx: Optional[int] = None
            last_owu_idx: Optional[int] = None
            # Iterate through occurrence tuples of the one-way uniques.
            for raw, template_idx in owu_templ_tuples:
                # The unique occurs exactly once in the raw, so take that one.
                raw_idx = raw_occs[raw][0]
                raw_len = len(raw)

                # Find the index of this owu in the slice_buffer, store the previous
                last_owu_idx = this_owu_idx
                try:
                    this_owu_idx = next(
                        idx
                        for idx, slc in enumerate(int_file_slice.slice_buffer)
                        if slc.raw == raw
                    )
                except StopIteration:
                    # This can happen if the unique was detected, but was introduced
                    # by a templater step. This is a false positive. Skip and move on.
                    templater_logger.info(
                        "One Way Unique %r not found in slice buffer. Skipping...", raw
                    )
                    continue

                templater_logger.debug(
                    "        Handling OWU: %r @%s (raw @%s) [this_owu_idx: %s, last_owu_dx: %s]",
                    raw,
                    template_idx,
                    raw_idx,
                    this_owu_idx,
                    last_owu_idx,
                )

                if template_idx > starts[1]:
                    # Yield the bit before this literal. We yield it
                    # all as a tuple, because if we could do any better
                    # we would have done it by now.

                    # Can we identify a meaningful portion of the patch
                    # to recurse a split?
                    sub_section: Optional[List[RawFileSlice]] = None
                    # If it's the start, the slicing is easy
                    # NOTE(review): this compares against `.stop` although
                    # the comment above talks about the start - confirm
                    # whether `.start` was intended. The branch is excluded
                    # from coverage.
                    if (
                        starts[1] == int_file_slice.templated_slice.stop
                    ):  # pragma: no cover TODO?
                        sub_section = int_file_slice.slice_buffer[:this_owu_idx]
                    # If we are AFTER the previous in the template, then it's
                    # also easy. [assuming it's not the same owu]
                    elif raw_idx > starts[0] and last_owu_idx != this_owu_idx:
                        # NOTE(review): a `last_owu_idx` of 0 is falsy and so
                        # takes the `else` branch, the same as `None` -
                        # confirm that is intended.
                        if last_owu_idx:
                            sub_section = int_file_slice.slice_buffer[
                                last_owu_idx + 1 : this_owu_idx
                            ]
                        else:
                            sub_section = int_file_slice.slice_buffer[:this_owu_idx]

                    # If we succeeded in one of the above, we can also recurse
                    # and be more intelligent with the other sections.
                    if sub_section:
                        # We found a non-empty run of raw slices preceding
                        # this unique, so recurse on just that portion.
                        templater_logger.debug(
                            "        Attempting Subsplit [pre]: %s, %r",
                            sub_section,
                            templated_str[slice(starts[1], template_idx)],
                        )
                        yield from cls._split_uniques_coalesce_rest(
                            [
                                IntermediateFileSlice(
                                    "compound",
                                    # Slicing is easy here, we have no choice
                                    slice(starts[0], raw_idx),
                                    slice(starts[1], template_idx),
                                    sub_section,
                                )
                            ],
                            raw_occs,
                            templ_occs,
                            templated_str,
                        )
                    # Otherwise, it's the tricky case.
                    else:
                        # In this case we've found a literal, coming AFTER another
                        # in the templated version, but BEFORE (or the same) in the
                        # raw version. This only happens during loops, but it means
                        # that identifying exactly what the intervening bit refers
                        # to is a bit arbitrary. In this case we're going to OVER
                        # estimate and refer to the whole loop segment.

                        # TODO: Maybe this should make two chunks instead, one
                        # working backward, and one working forward. But that's
                        # a job for another day.

                        # First find where we are starting this remainder
                        # in the template (as an index in the buffer).
                        # Any segments *after* cur_idx are involved.
                        if last_owu_idx is None or last_owu_idx + 1 >= len(
                            int_file_slice.slice_buffer
                        ):
                            cur_idx = 0  # pragma: no cover
                        else:
                            cur_idx = last_owu_idx + 1

                        # We need to know how many block_ends are after this.
                        # NB: Summing booleans counts the matching elements.
                        block_ends = sum(
                            slc[1] == "block_end"
                            for slc in int_file_slice.slice_buffer[cur_idx:]
                        )
                        # We can allow up to this number of preceding block starts
                        block_start_indices = [
                            idx
                            for idx, slc in enumerate(
                                int_file_slice.slice_buffer[:cur_idx]
                            )
                            if slc[1] == "block_start"
                        ]

                        # Trim anything which we're not allowed to use.
                        if len(block_start_indices) > block_ends:
                            offset = block_start_indices[-1 - block_ends] + 1
                            elem_sub_buffer = int_file_slice.slice_buffer[offset:]
                            # Keep cur_idx pointing at the same element of
                            # the (now shorter) sub buffer.
                            cur_idx -= offset
                        else:
                            elem_sub_buffer = int_file_slice.slice_buffer

                        # We also need to know whether any of the *starting*
                        # segments are involved.
                        # NOTE(review): index 2 of a raw slice is presumably
                        # its source position - confirm against the
                        # definition of RawFileSlice.
                        include_start = raw_idx > elem_sub_buffer[0][2]

                        # The ending point of this slice, is already decided.
                        end_point = elem_sub_buffer[-1].end_source_idx()

                        # If the unique sits after the first element of the
                        # sub buffer, start from the beginning of the sub buffer.
                        if include_start:
                            start_point = elem_sub_buffer[0].source_idx
                        # Otherwise start from the element at cur_idx.
                        else:
                            start_point = elem_sub_buffer[cur_idx].source_idx

                        tricky = TemplatedFileSlice(
                            "templated",
                            slice(start_point, end_point),
                            slice(starts[1], template_idx),
                        )

                        templater_logger.debug(
                            "        Yielding Tricky Case : %s",
                            tricky,
                        )

                        yield tricky

                # Yield the literal
                owu_literal_slice = TemplatedFileSlice(
                    "literal",
                    slice(raw_idx, raw_idx + raw_len),
                    slice(template_idx, template_idx + raw_len),
                )
                templater_logger.debug(
                    "    Yielding Unique: %r, %s",
                    raw,
                    owu_literal_slice,
                )
                yield owu_literal_slice
                # Update our bookmark
                starts = (
                    raw_idx + raw_len,
                    template_idx + raw_len,
                )

            # NB: `raw_idx` and `raw_len` below still hold the values from the
            # final iteration of the loop above, and `last_owu_idx` is the
            # index which was current *before* that final iteration.
            # NOTE(review): if the final iteration was skipped (unique not
            # found in the slice buffer) these refer to the skipped unique -
            # confirm that is acceptable.
            if starts[1] < stops[1] and last_owu_idx is not None:
                # Yield the end bit
                templater_logger.debug("        Attempting Subsplit [post].")
                yield from cls._split_uniques_coalesce_rest(
                    [
                        IntermediateFileSlice(
                            "compound",
                            # Slicing is easy here, we have no choice
                            slice(raw_idx + raw_len, stops[0]),
                            slice(starts[1], stops[1]),
                            int_file_slice.slice_buffer[last_owu_idx + 1 :],
                        )
                    ],
                    raw_occs,
                    templ_occs,
                    templated_str,
                )

        # Yield anything from the tail buffer. This is the tail of the final
        # element, which no further loop iteration would otherwise flush.
        if tail_buffer:
            templater_logger.debug(
                "        Yielding Tail Buffer [end]: %s", tail_buffer
            )
            yield from tail_buffer
Ejemplo n.º 2
0
    def _trim_end(
        self, templated_str: str, target_end: str = "head"
    ) -> Tuple["IntermediateFileSlice", List[TemplatedFileSlice]]:
        """Trim simple elements off one end of an intermediate segment.

        Working inwards from the chosen end, literals, block starts, block
        ends and comments are removed from the slice buffer and converted
        into `TemplatedFileSlice` objects. Trimming stops when:

        - the buffer is exhausted,
        - an element of any other type is reached,
        - a literal does not match the templated string at that position, or
        - a terminator has just been consumed ("block_start" when trimming
          the head, "block_end" when trimming the tail).

        This object is not modified; the trimming is done on a copy of its
        slice buffer.

        Args:
            templated_str: The templated file as a string, used to check
                that a literal appears verbatim at the end being trimmed.
            target_end: "head" to trim from the start of the segment. Any
                other value trims from the end.

        Returns:
            A tuple of a new "compound" intermediate slice covering whatever
            remains (possibly with an empty slice buffer) and the list of
            trimmed slices, in the order they appear in the file.
        """
        trimming_head = target_end == "head"
        target_idx = 0 if trimming_head else -1
        # NB: The trailing commas matter. Without them these are plain
        # strings rather than tuples, and `in` becomes a substring test.
        terminator_types = ("block_start",) if trimming_head else ("block_end",)
        main_source_slice = self.source_slice
        main_templated_slice = self.templated_slice
        # Work on a copy so that popping elements below doesn't silently
        # alter the buffer of the slice we were called on.
        slice_buffer = list(self.slice_buffer)

        end_buffer: List[TemplatedFileSlice] = []

        # Yield any leading literals, comments or blocks.
        while len(slice_buffer) > 0 and slice_buffer[target_idx].slice_type in (
            "literal",
            "block_start",
            "block_end",
            "comment",
        ):
            focus = slice_buffer[target_idx]
            templater_logger.debug("            %s Focus: %s", target_end, focus)
            # Is it a zero length item?
            if focus.slice_type in ("block_start", "block_end", "comment"):
                # Only add the length in the source space.
                templated_len = 0
            else:
                # Assume it's a literal, check the literal actually matches.
                templated_len = len(focus.raw)
                if trimming_head:
                    check_slice = slice(
                        main_templated_slice.start,
                        main_templated_slice.start + templated_len,
                    )
                else:
                    check_slice = slice(
                        main_templated_slice.stop - templated_len,
                        main_templated_slice.stop,
                    )

                if templated_str[check_slice] != focus.raw:
                    # It doesn't match, we can't use it. break
                    templater_logger.debug("                Nope")
                    break

            # If it does match, set up the new slices.
            # `division` is the (source, templated) position at which the
            # trimmed element is separated from the remainder.
            if trimming_head:
                division = (
                    main_source_slice.start + len(focus.raw),
                    main_templated_slice.start + templated_len,
                )
                new_slice = TemplatedFileSlice(
                    focus.slice_type,
                    slice(main_source_slice.start, division[0]),
                    slice(main_templated_slice.start, division[1]),
                )
                end_buffer.append(new_slice)
                main_source_slice = slice(division[0], main_source_slice.stop)
                main_templated_slice = slice(division[1], main_templated_slice.stop)
            else:
                division = (
                    main_source_slice.stop - len(focus.raw),
                    main_templated_slice.stop - templated_len,
                )
                new_slice = TemplatedFileSlice(
                    focus.slice_type,
                    slice(division[0], main_source_slice.stop),
                    slice(division[1], main_templated_slice.stop),
                )
                # We're working backwards from the end, so insert at the
                # front to keep the buffer in file order.
                end_buffer.insert(0, new_slice)
                main_source_slice = slice(main_source_slice.start, division[0])
                main_templated_slice = slice(main_templated_slice.start, division[1])

            slice_buffer.pop(target_idx)
            if focus.slice_type in terminator_types:
                break
        # Return a new Intermediate slice and the buffer.
        # NB: Don't check size of slice buffer here. We can do that later.
        new_intermediate = self.__class__(
            "compound", main_source_slice, main_templated_slice, slice_buffer
        )
        return new_intermediate, end_buffer
Ejemplo n.º 3
0
    def _split_invariants(
        cls,
        raw_sliced: List[RawFileSlice],
        literals: List[str],
        raw_occurrences: Dict[str, List[int]],
        templated_occurrences: Dict[str, List[int]],
        templated_str: str,
    ) -> Iterator[IntermediateFileSlice]:
        """Split a sliced file on its invariant literals.

        An invariant is a literal which occurs exactly once in the raw file
        and exactly once in the templated file. Invariants which appear in a
        different relative order in the two files are discarded. We
        prioritise the _longest_ invariants first as they are more likely to
        be the anchors.

        Yields:
            An "invariant" slice for each surviving invariant, and a
            "compound" slice for each non-empty run of other elements
            before, between or after them.
        """
        # Candidates occur exactly once on both sides.
        anchors = [
            candidate
            for candidate in literals
            if len(raw_occurrences[candidate]) == 1
            and len(templated_occurrences[candidate]) == 1
        ]
        # Check relative ordering, taking the longest candidates first.
        for longest in sorted(anchors, key=len, reverse=True):
            # An earlier (longer) candidate may already have evicted this one.
            if longest not in anchors:
                continue

            longest_src = raw_occurrences[longest]
            longest_templ = templated_occurrences[longest]
            # Iterate over a snapshot, because we remove from the real list.
            for other in list(anchors):
                if other == longest:
                    continue
                after_in_source = longest_src > raw_occurrences[other]
                after_in_template = longest_templ > templated_occurrences[other]
                # Anything ordered differently in the source and the
                # template, relative to the current anchor, is dropped.
                if after_in_source != after_in_template:
                    templater_logger.debug(
                        "          Invariant found out of order: %r", other
                    )
                    anchors.remove(other)

        # Elements seen since the last anchor, where that run started in the
        # source and where the last anchor finished in the template.
        pending: List[RawFileSlice] = []
        pending_start: Optional[int] = None
        templ_cursor = 0
        for raw, token_type, raw_pos, _ in raw_sliced:
            if raw not in anchors:
                # Not an anchor, just collect it for the next compound slice.
                pending.append(RawFileSlice(raw, token_type, raw_pos))
                if pending_start is None:
                    pending_start = raw_pos
                continue

            anchor_templ_pos = templated_occurrences[raw][0]
            # Flush whatever came before this anchor.
            if pending:
                yield IntermediateFileSlice(
                    "compound",
                    slice(pending_start, raw_pos),
                    slice(templ_cursor, anchor_templ_pos),
                    pending,
                )
            pending = []
            pending_start = None
            # Then the anchor itself.
            anchor_templ_end = anchor_templ_pos + len(raw)
            yield IntermediateFileSlice(
                "invariant",
                slice(raw_pos, raw_pos + len(raw)),
                slice(anchor_templ_pos, anchor_templ_end),
                [RawFileSlice(raw, token_type, anchor_templ_pos)],
            )
            templ_cursor = anchor_templ_end

        # Anything left after the final anchor runs to the end of the
        # templated string.
        if pending:
            tail_start = pending_start or 0
            tail_length = sum(len(slc.raw) for slc in pending)
            yield IntermediateFileSlice(
                "compound",
                slice(tail_start, tail_start + tail_length),
                slice(templ_cursor, len(templated_str)),
                pending,
            )
Ejemplo n.º 4
0
 def slice_file(
     cls,
     raw_str: str,
     templated_str: str,
     config=None,
 ) -> Tuple[List[RawFileSlice], List[TemplatedFileSlice], str]:
     """Slice the file to determine regions where we can fix.

     The raw string is sliced once. The mapping onto the templated string
     is then worked out in at most two passes: if `_check_for_wrapped`
     hands back a different templated string after the first pass, the
     mapping is recalculated against that new string.

     Args:
         raw_str: The raw (untemplated) file contents.
         templated_str: The templated file contents.
         config: Optional config object. When provided, the
             "unwrap_wrapped_queries" value of its "templater" section
             (default True) is passed on to `_check_for_wrapped`.

     Returns:
         A tuple of the raw slices, the templated file slices and the
         templated string returned by the last `_check_for_wrapped` call.
     """
     templater_logger.info("Slicing File Template")
     templater_logger.debug("    Raw String: %r", raw_str)
     templater_logger.debug("    Templated String: %r", templated_str)

     # Break the raw file up into its elements.
     raw_sliced = list(cls._slice_template(raw_str))
     templater_logger.debug("    Raw Sliced:")
     for position, element in enumerate(raw_sliced):
         templater_logger.debug("        %s: %r", position, element)

     # Only the literal elements are of interest as anchors.
     literals = [
         element.raw for element in raw_sliced if element.slice_type == "literal"
     ]
     templater_logger.debug("    Literals: %s", literals)

     for attempt in range(2):
         templater_logger.debug("    # Slice Loop %s", attempt)

         # Where does each literal occur on each side?
         raw_occurrences = cls._substring_occurrences(raw_str, literals)
         templated_occurrences = cls._substring_occurrences(templated_str, literals)
         templater_logger.debug(
             "    Occurrences: Raw: %s, Templated: %s",
             raw_occurrences,
             templated_occurrences,
         )

         # First pass: split on the invariants.
         invariant_split = cls._split_invariants(
             raw_sliced,
             literals,
             raw_occurrences,
             templated_occurrences,
             templated_str,
         )
         split_sliced = list(invariant_split)
         templater_logger.debug("    Split Sliced:")
         for position, section in enumerate(split_sliced):
             templater_logger.debug("        %s: %r", position, section)

         # Second pass: split on uniques and coalesce whatever is left.
         unique_split = cls._split_uniques_coalesce_rest(
             split_sliced, raw_occurrences, templated_occurrences, templated_str
         )
         sliced_file = list(unique_split)
         templater_logger.debug("    Fully Sliced:")
         for position, section in enumerate(sliced_file):
             templater_logger.debug("        %s: %r", position, section)

         # Without any config we default to unwrapping.
         if config is None:
             unwrap_wrapped = True
         else:
             unwrap_wrapped = config.get(
                 "unwrap_wrapped_queries", section="templater", default=True
             )
         sliced_file, new_templated_str = cls._check_for_wrapped(
             sliced_file, templated_str, unwrap_wrapped=unwrap_wrapped
         )

         # An unchanged templated string means there's nothing to redo.
         if new_templated_str == templated_str:
             break
         # Otherwise go round again using the new templated string.
         templated_str = new_templated_str

     return raw_sliced, sliced_file, new_templated_str