def _parse_hunk_line(self, line): """ Given a hunk line in `git diff` output, return the line number at the start of the hunk. A hunk is a segment of code that contains changes. The format of the hunk line is: @@ -k,l +n,m @@ TEXT where `k,l` represent the start line and length before the changes and `n,m` represent the start line and length after the changes. `git diff` will sometimes put a code excerpt from within the hunk in the `TEXT` section of the line. """ # Split the line at the @@ terminators (start and end of the line) components = line.split('@@') # The first component should be an empty string, because # the line starts with '@@'. The second component should # be the hunk information, and any additional components # are excerpts from the code. if len(components) >= 2: hunk_info = components[1] groups = self.HUNK_LINE_RE.findall(hunk_info) if len(groups) == 1: try: return int(groups[0]) except ValueError: msg = "Could not parse '{}' as a line number".format( groups[0]) raise GitDiffError(msg) else: msg = "Could not find start of hunk in line '{}'".format(line) raise GitDiffError(msg) else: msg = "Could not parse hunk in line '{}'".format(line) raise GitDiffError(msg)
def _parse_source_line(self, line): """ Given a source line in `git diff` output, return the path to the source file. """ if "--git" in line: regex = self.SRC_FILE_RE elif "--cc" in line: regex = self.MERGE_CONFLICT_RE else: msg = f"Do not recognize format of source in line '{line}'" raise GitDiffError(msg) # Parse for the source file path groups = regex.findall(line) if len(groups) == 1: return groups[0] msg = f"Could not parse source path in line '{line}'" raise GitDiffError(msg)
def _parse_source_sections(self, diff_str): """ Given the output of `git diff`, return a dictionary with keys that are source file paths. Each value is a list of lines from the `git diff` output related to the source file. Raises a `GitDiffError` if `diff_str` is in an invalid format. """ # Create a dict to map source files to lines in the diff output source_dict = dict() # Keep track of the current source file src_path = None # Signal that we've found a hunk (after starting a source file) found_hunk = False # Parse the diff string into sections by source file for line in diff_str.split('\n'): # If the line starts with "diff --git" # or "diff --cc" (in the case of a merge conflict) # then it is the start of a new source file if line.startswith('diff --git') or line.startswith('diff --cc'): # Retrieve the name of the source file src_path = self._parse_source_line(line) # Create an entry for the source file, if we don't # already have one. if src_path not in source_dict: source_dict[src_path] = [] # Signal that we're waiting for a hunk for this source file found_hunk = False # Every other line is stored in the dictionary for this source file # once we find a hunk section else: # Only add lines if we're in a hunk section # (ignore index and files changed lines) if found_hunk or line.startswith('@@'): # Remember that we found a hunk found_hunk = True if src_path is not None: source_dict[src_path].append(line) else: # We tolerate other information before we have # a source file defined, unless it's a hunk line if line.startswith("@@"): msg = "Hunk has no source file: '{0}'".format(line) raise GitDiffError(msg) return source_dict