def build_mark_annot(f: File) -> Annot[Set[int]]:
    '''Produce an annotation spanning all of `f.text` whose labels are the
    sets of marked NodeIds that overlap each source location.

    Every position carries at least the base label: the empty set, or
    `{CRATE_NODE_ID}` when the crate itself appears in `f.marks`.'''
    base_label = (frozenset((CRATE_NODE_ID, ))
                  if CRATE_NODE_ID in f.marks else frozenset())

    # Begin with one span covering the whole file, then fold in a separate
    # annotation for each marked node, unioning label sets where they overlap.
    combined = [Span(0, len(f.text), base_label)]
    for raw_start, raw_end, node_id in f.unformatted_nodes:
        if node_id in f.marks:
            # Node positions are expressed in terms of the unformatted text,
            # so convert them to formatted-text positions first.
            node_span = Span(f.fmt_map_translate(raw_start),
                             f.fmt_map_translate(raw_end),
                             frozenset((node_id, )))
            node_annot = fill_annot([node_span], len(f.text),
                                    label=base_label)
            combined = zip_annot(combined, node_annot, f=lambda a, b: a | b)

    return combined
def cut_annot_at_points(orig: Annot[T], cut: List[Point[U]]) -> Annot[T]:
    '''Split the spans of `orig` at every position listed in `cut`.

    The returned annotation labels exactly the same regions with the same
    labels as `orig`; the only difference is that a span which crossed one or
    more cut points is replaced by consecutive subspans that meet at those
    points.'''
    pieces = []

    def push(piece):
        # Zero-length pieces carry no information (they can arise, for
        # example, when two cut points share a position), so drop them.
        if len(piece) > 0:
            pieces.append(piece)

    n_cuts = len(cut)
    next_cut = 0
    for span in orig:
        # Advance past cut points at or before the start of this span; they
        # cannot split it.
        while next_cut < n_cuts and cut[next_cut].pos <= span.start:
            next_cut += 1

        # Peel off the part before each cut point that falls inside the span,
        # and keep going with whatever is left after the point.
        rest = span
        while next_cut < n_cuts and cut[next_cut].pos < rest.end:
            at = cut[next_cut].pos
            push(Span(rest.start, at, rest.label))
            rest = Span(at, rest.end, rest.label)
            next_cut += 1
        push(rest)

    return pieces
def annotate_blocks(blocks: List[DiffBlock]) \
        -> Tuple[Annot[DiffBlock], Annot[DiffBlock]]:
    '''Return annotations on the old and new files, labeling each line with
    the block that contains it.

    Returns an `(old, new)` pair.  Each list holds one span per entry of
    `blocks`, in the same order: `old` uses the block's `old_span` bounds and
    `new` uses its `new_span` bounds.  The label of every span is the
    `DiffBlock` itself, so entries at the same index in `old` and `new` share
    the very same label object.'''
    old = []
    new = []
    for b in blocks:
        old.append(Span(b.old_span.start, b.old_span.end, b))
        new.append(Span(b.new_span.start, b.new_span.end, b))
    return old, new
def flush(): nonlocal old_start, new_start # This check means we can blindly call `flush()` without worrying about # cluttering the output with zero-length blocks. if old_cur - old_start > 0 or new_cur - new_start > 0: diff_blocks.append( DiffBlock(changed, Span(old_start, old_cur), Span(new_start, new_cur))) old_start = old_cur new_start = new_cur
def init_file_keep_mark_lines(f: File):
    '''Set `f.keep_mark_lines` to an annotation covering a window of lines
    around the start of every node that had a mark added or removed.'''
    # Nodes whose marks changed.  Their text is kept in the output even when
    # it falls outside the context of every hunk.
    changed_nodes = {
        node_id
        for node_id, (added, removed, kept) in f.mark_labels.items()
        if len(added) > 0 or len(removed) > 0
    }

    # Line number on which each of those nodes begins.
    first_lines = set()
    for u_start, u_end, node_id in f.unformatted_nodes:
        if node_id in changed_nodes:
            pos = f.fmt_map_translate(u_start)
            first_lines.add(lookup_span(f.line_annot, pos).label)

    # Keep 3 lines before and 6 lines from each start line onward, letting
    # the merger combine the windows it is given.
    merger = SpanMerger()
    for line in sorted(first_lines):
        merger.add(Span(line - 3, line + 6))
    f.set_keep_mark_lines(merger.finish())
def calc_tokenized_intra(l1: Line, l2: Line) -> Tuple[Annot[str], Annot[str]]:
    '''Compute token-aligned intraline edit annotations for `l1` and `l2`.

    `difflib.ndiff` pairs up similar lines well, but its intraline changes are
    character-based and often hard to read: it may rewrite `unsafe` to
    `malloc` as `uns` -> `m` plus `fe` -> `lloc` rather than as a single
    `unsafe` -> `malloc` replacement.  This function instead diffs the token
    sequences of the two lines (as produced by `token_annot`), so every edit
    starts and ends on a token boundary.

    Returns a pair of annotations, one per line.  Spans are labeled `'chg'`
    for replacements, `'del'` for text only in `l1`, and `'ins'` for text only
    in `l2`.'''
    def trim_blank(tokens, lo, hi):
        # Shrink the token range [lo, hi) so that it neither begins nor ends
        # with a whitespace-only token.
        while lo < hi and tokens[lo].isspace():
            lo += 1
        while hi > lo and tokens[hi - 1].isspace():
            hi -= 1
        return lo, hi

    spans_a = token_annot(l1)
    spans_b = token_annot(l2)
    words_a = [l1.text[s.start:s.end] for s in spans_a]
    words_b = [l2.text[s.start:s.end] for s in spans_b]

    edits_a = []
    edits_b = []

    matcher = difflib.SequenceMatcher(a=words_a, b=words_b)
    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if tag == 'equal':
            continue

        a_lo, a_hi = trim_blank(words_a, a_lo, a_hi)
        b_lo, b_hi = trim_blank(words_b, b_lo, b_hi)
        is_replace = tag == 'replace'

        # Cover from the first surviving token's start to the last one's end.
        if a_lo != a_hi:
            edits_a.append(Span(spans_a[a_lo].start, spans_a[a_hi - 1].end,
                                'chg' if is_replace else 'del'))
        if b_lo != b_hi:
            edits_b.append(Span(spans_b[b_lo].start, spans_b[b_hi - 1].end,
                                'chg' if is_replace else 'ins'))

    return (edits_a, edits_b)
def init_fmt_map(f: File):
    '''Populate the `fmt_map` of `f`, the table relating positions in the
    unformatted text to positions in the formatted text.

    The map has one `(unformatted_span, formatted_start)` entry for each run
    that `difflib` reports as identical between `f.unformatted` and `f.text`.
    A parallel list of the unformatted start positions is stored alongside
    it as the index.'''
    matcher = difflib.SequenceMatcher(a=f.unformatted, b=f.text)
    equal_runs = [
        (Span(u_lo, u_hi), f_lo)
        for tag, u_lo, u_hi, f_lo, f_hi in matcher.get_opcodes()
        if tag == 'equal'
    ]
    run_starts = [span.start for span, _ in equal_runs]
    f.set_fmt_map(equal_runs, run_starts)
def calc_file_keep(f, is_new):
    '''Compute the line spans of file `f` that should be kept.

    `is_new` selects whether `f` is treated as the new (True) or old (False)
    side when computing change context.  Reads `context_diff` and `d` from
    the enclosing scope.'''
    if context_diff:
        # Changed lines plus 5 lines of context, extended by any lines the
        # file asks to keep because of marks.
        keep = context_annot(d.blocks, is_new, 5)
        if f.keep_mark_lines is not None:
            keep = merge_annot(keep, f.keep_mark_lines)
    elif len(f.line_annot) > 0:
        # Not a context diff: keep everything up to the end of the last line.
        keep = [Span(0, f.line_annot[-1].end)]
    else:
        keep = []

    # Lines flagged as irrelevant are removed regardless of the mode.
    if f.drop_irrelevant_lines is not None:
        keep = sub_annot(keep, f.drop_irrelevant_lines)
    return keep
def annotate_irrelevant(f: File, start: str, end: str):
    '''Record on `f` the line ranges delimited by the `start` and `end`
    regular expressions.

    A region opens at the first line whose text matches `start` (anchored at
    the beginning of the line, as with `re.match`) and closes at the next
    line, possibly the same one, whose text matches `end`.  Each closed
    region is stored as a span of line numbers that includes the closing
    line.  A region still open at the end of the file is not recorded.'''
    opens = re.compile(start)
    closes = re.compile(end)

    regions = []
    region_start = None
    for line_no, line in enumerate(f.lines):
        text = line.text
        if region_start is None:
            if not opens.match(text):
                # Outside any region and this line does not open one.
                continue
            region_start = line_no
        if closes.match(text):
            regions.append(Span(region_start, line_no + 1, None))
            region_start = None

    f.set_drop_irrelevant_lines(regions)
def fmt_map_lookup(self, unformatted_pos: int) -> Tuple[Span[None], int]:
    '''Find the `fmt_map` entry for a position in the unformatted text.

    Returns a `(span, offset)` pair, where `span` is the last mapped span of
    unformatted text starting at or before `unformatted_pos` (so it is only a
    nearby span when the position lies in text that formatting changed), and
    `offset` is the formatted-text position corresponding to `span.start`.
    If no mapped span starts at or before the position, a dummy
    `(Span(0, 0), 0)` pair is returned.'''
    # Build the map on first use.
    if self.fmt_map is None:
        self._init_fmt_map()

    # Count of entries whose start is <= the requested position.
    n_before = bisect.bisect_right(self.fmt_map_index, unformatted_pos)
    if n_before > 0:
        return self.fmt_map[n_before - 1]
    # Dummy result
    return (Span(0, 0), 0)
def context_annot(blocks: List[DiffBlock], new: bool,
                  context_lines: int) -> Annot[None]:
    '''Build an annotation over the lines of the old or new file (the new
    file when `new` is true) marking the lines that are changed, or that lie
    within `context_lines` of a change.'''
    merger = SpanMerger()
    for block in blocks:
        changed, old_span, new_span = block
        if changed:
            side = new_span if new else old_span
            # Widen the changed range by the context distance on both ends.
            merger.add(Span(side.start - context_lines,
                            side.end + context_lines))
    return merger.finish()
def parse_intra_annot(s: str) -> Annot[str]:
    '''Convert an `ndiff` detail (`?`) line into an annotation describing the
    intraline edits of the line that precedes it.

    Each run matched by `RUN_RE` becomes one span, labeled `'ins'`, `'del'`
    or `'chg'` for inserted, deleted and changed characters respectively.'''
    # Translate the marker characters used by `ndiff` into readable labels.
    symbol_labels = {
        '+': 'ins',
        '-': 'del',
        '^': 'chg',
    }
    return [
        Span(run.start(), run.end(), symbol_labels[run.group(1)])
        for run in RUN_RE.finditer(s)
    ]
def highlight_file(f: File):
    '''Syntax-highlight `f` with the pygments `rust` lexer and store the
    resulting `highlight` annotation on each of its lines.'''
    lexer = pygments.lexers.get_lexer_by_name('rust')

    # One span per token over the whole file, leaving out whitespace tokens.
    token_spans = [
        Span(pos, pos + len(text), kind)
        for pos, kind, text in lexer.get_tokens_unprocessed(f.text)
        if kind != pygments.token.Whitespace
    ]

    # Break the file-wide annotation apart at line boundaries and hand each
    # line its own piece.
    for line_span, pieces in cut_annot(token_spans, f.line_annot):
        f.lines[line_span.label].set_highlight(pieces)

    # Every line must have received a highlight annotation by now.
    assert all(line.highlight is not None for line in f.lines)
def build_diff_hunks(d: Diff, context_diff: bool = True,
                     context_lines: int = 5):
    '''Build a list of output hunks, and assign it to `d.hunks`.

    `d` is the diff to process; its `blocks`, `old_file` and `new_file` are
    read, and the result is stored with `d.set_hunks`.

    If `context_diff` is true, each file keeps its changed lines plus
    `context_lines` lines of context around them.  In that mode, if
    `d.old_file` or `d.new_file` has a `keep_mark_lines` annotation, all
    annotated lines are kept as additional context.  If `context_diff` is
    false, every line of each file is kept.

    `context_lines` is the context distance used in context-diff mode.  It
    defaults to 5 and is ignored when `context_diff` is false.

    In either mode, lines covered by a file's `drop_irrelevant_lines`
    annotation are removed from the set of kept lines.'''

    # Find the set of lines each file wants to keep.
    def calc_file_keep(f, is_new):
        if context_diff:
            keep = context_annot(d.blocks, is_new, context_lines)
            if f.keep_mark_lines is not None:
                keep = merge_annot(keep, f.keep_mark_lines)
        else:
            if len(f.line_annot) > 0:
                keep = [Span(0, f.line_annot[-1].end)]
            else:
                keep = []
        if f.drop_irrelevant_lines is not None:
            keep = sub_annot(keep, f.drop_irrelevant_lines)

        return keep

    keep_old = calc_file_keep(d.old_file, False)
    keep_new = calc_file_keep(d.new_file, True)

    # In unchanged blocks, add each file's keep lines to the other file's set.
    # This works because unchanged blocks have the same number of lines on each
    # side.
    old_blocks, new_blocks = annotate_blocks(d.blocks)
    extra_keep_old = []
    extra_keep_new = []
    for block_span, keep_spans in cut_annot(keep_old, old_blocks):
        if block_span.label.changed:
            continue
        # `keep_spans` are shifted by the block's start on the other side to
        # turn them into positions in the other file.
        base = block_span.label.new_span.start
        extra_keep_new.extend(s + base for s in keep_spans)
    for block_span, keep_spans in cut_annot(keep_new, new_blocks):
        if block_span.label.changed:
            continue
        base = block_span.label.old_span.start
        extra_keep_old.extend(s + base for s in keep_spans)

    keep_old = merge_annot(keep_old, extra_keep_old)
    keep_new = merge_annot(keep_new, extra_keep_new)

    # For changed blocks, we can't match up lines from different files, so we
    # just hope for the best.  (Normally all changed lines are kept, so there's
    # no need to match - the only exception is when the `irrelevant_*_regex`
    # options are set.)

    # Build the filtered list of blocks.  There can be different numbers of
    # blocks on the old and new sides.  We use a fairly naive strategy to match
    # them up, but it generally seems to work okay.

    blocks = []
    for (old_block, old_keeps), (new_block, new_keeps) in zip(
            cut_annot(keep_old, old_blocks),
            cut_annot(keep_new, new_blocks)):
        # `old_blocks` and `new_blocks` have corresponding entries (based on
        # the same block) at corresponding positions.
        assert old_block.label is new_block.label
        block = old_block.label

        # Match up `old_keeps` and `new_keeps` entries by position.  In most
        # cases, the two lists will have the same length.
        for old_keep, new_keep in zip(old_keeps, new_keeps):
            blocks.append(
                DiffBlock(block.changed,
                          old_keep + block.old_span.start,
                          new_keep + block.new_span.start))
        # Leftover entries on one side are paired with an empty span placed
        # at the end of the block on the other side.
        for old_keep in old_keeps[len(new_keeps):]:
            blocks.append(
                DiffBlock(block.changed,
                          old_keep + block.old_span.start,
                          Span(block.new_span.end, block.new_span.end)))
        for new_keep in new_keeps[len(old_keeps):]:
            blocks.append(
                DiffBlock(block.changed,
                          Span(block.old_span.end, block.old_span.end),
                          new_keep + block.new_span.start))

    # Split the new blocks into hunks, and save them in the `Diff`.
    hunks = split_hunks(blocks)
    d.set_hunks(hunks)