def parse_special_header(self, linenum, info):
    """Parse the CVS-specific lines that follow the generic special header.

    The parent class parses the generic header first. If that did not
    record an ``index`` entry in ``info``, nothing else is examined.
    Otherwise the current line must match ``self.regex_full`` or
    ``self.regex_small``; its first group is stored as
    ``info['filename']``. Any following ``retrieving ...`` lines and one
    optional ``diff ...`` line are skipped.

    Args:
        linenum (int):
            The line number to begin parsing.

        info (dict):
            The information dictionary for the file being parsed.

    Returns:
        int:
        The next line number to parse.

    Raises:
        DiffParserError:
            The expected RCS line was not found.
    """
    linenum = super(CVSDiffParser, self).parse_special_header(linenum,
                                                              info)

    if 'index' not in info:
        # Without an index entry, whatever follows is probably bogus too.
        return linenum

    rcs_line = self.lines[linenum]

    # Try the full form first, then fall back to the short form.
    rcs_match = (self.regex_full.match(rcs_line) or
                 self.regex_small.match(rcs_line))

    if not rcs_match:
        raise DiffParserError('Unable to find RCS line', linenum)

    info['filename'] = rcs_match.group(1)
    linenum += 1

    while self.lines[linenum].startswith(b'retrieving '):
        linenum += 1

    if self.lines[linenum].startswith(b'diff '):
        linenum += 1

    return linenum
def _parse_diff_git_line(self, diff_git_line, file_info, linenum): """Parses the "diff --git" line for filename information. Not all diffs have "---" and "+++" lines we can parse for the filenames. Git leaves these out if there aren't any changes made to the file. This function attempts to extract this information from the "diff --git" lines in the diff. It supports the following: * All filenames with quotes. * All filenames with a/ and b/ prefixes. * Filenames without quotes, prefixes, or spaces. * Filenames without quotes or prefixes, where the original and modified filenames are identical. """ for regex in self.DIFF_GIT_LINE_RES: m = regex.match(diff_git_line) if m: file_info.orig_filename = m.group('orig_filename') file_info.modified_filename = m.group('new_filename') return raise DiffParserError( 'Unable to parse the "diff --git" line for this file, due to ' 'the use of filenames with spaces or --no-prefix, --src-prefix, ' 'or --dst-prefix options.', linenum)
def parse(self):
    """Parse the diff and return the list of per-file entries.

    Each position is handed to ``self._parse_diff``. Lines that belong to
    no file are collected as a preamble and prepended to the data of the
    next file found. A diff that produced no file entry discards the
    preamble collected so far.

    Returns:
        list:
        The file entries found, also stored as ``self.files``.

    Raises:
        DiffParserError:
            No files were found but non-blank content was present.
    """
    self.files = []
    preamble = ''
    pos = 0

    while pos < len(self.lines):
        next_pos, file_info, new_diff = self._parse_diff(pos)

        if not file_info and not new_diff:
            # Not part of any diff; hold on to it for the next file.
            preamble += self.lines[pos] + '\n'
        else:
            if file_info:
                self._ensure_file_has_required_fields(file_info)

                if preamble:
                    file_info.data = preamble + file_info.data

                self.files.append(file_info)

            # The preamble is either consumed by the file above or
            # belonged to an empty diff with no file entry; drop it.
            preamble = ''

        pos = next_pos

    if not self.files and preamble.strip():
        # This is probably not an actual git diff file.
        raise DiffParserError('This does not appear to be a git diff', 0)

    return self.files
def parse_special_header(self, linenum, info):
    """Parse a Mercurial special header line.

    Four kinds of lines are recognised:

    * ``# Node ID <id>``: stored as ``self.new_changeset_id``.
    * ``# Parent <id>``: stored as ``self.orig_changeset_id``.
    * ``diff -r <rev> [-r <rev>] <filename>``: a plain hg diff. File and
      revision details are written to ``info``.
    * ``diff --git a/<orig> b/<new>``: a git-style diff. It is only
      handled once a parent changeset ID has been seen in the patch
      header.

    Only the two ``diff`` forms consume the line.

    Args:
        linenum (int):
            The line number to begin parsing.

        info (dict):
            The information dictionary for the file being parsed.

    Returns:
        int:
        The next line number to parse.

    Raises:
        DiffParserError:
            A ``diff -r`` line had no revision, or a ``diff --git`` line
            did not contain ``a/`` and ``b/`` prefixed filenames.
    """
    diff_line = self.lines[linenum]
    split_line = diff_line.split()

    # git style diffs are supported as long as the node ID and parent ID
    # are present in the patch header
    if diff_line.startswith(b"# Node ID") and len(split_line) == 4:
        self.new_changeset_id = split_line[3]
    elif diff_line.startswith(b"# Parent") and len(split_line) == 3:
        self.orig_changeset_id = split_line[2]
    elif diff_line.startswith(b"diff -r"):
        # diff between two revisions are in the following form:
        #  "diff -r abcdef123456 -r 123456abcdef filename"
        # diff between a revision and the working copy are like:
        #  "diff -r abcdef123456 filename"
        self.is_git_diff = False

        # Indexing a too-short token list raises IndexError, not
        # ValueError, so check the length up front.
        if len(split_line) < 3:
            raise DiffParserError(
                "The diff file is missing revision "
                "information", linenum)

        # ordinary hg diffs don't record renames, so
        # new file always == old file
        if len(split_line) > 4 and split_line[3] == b'-r':
            # Committed revision
            name_start_ix = 5
            info['newInfo'] = split_line[4]
        else:
            # Uncommitted revision
            name_start_ix = 3
            info['newInfo'] = "Uncommitted"

        info['newFile'] = info['origFile'] = \
            b' '.join(split_line[name_start_ix:])
        info['origInfo'] = split_line[2]
        info['orig_changeset_id'] = split_line[2]

        linenum += 1
    elif (diff_line.startswith(b"diff --git") and
          self.orig_changeset_id):
        # diff is in the following form:
        #  "diff --git a/origfilename b/newfilename"
        # possibly followed by:
        #  "{copy|rename} from origfilename"
        #  "{copy|rename} from newfilename"
        self.is_git_diff = True

        # The line is bytes, so the pattern must be bytes as well.
        match = re.search(
            br' a/(.*?) b/(.*?)( (copy|rename) from .*)?$',
            diff_line)

        if match is None:
            raise DiffParserError(
                'Unable to parse the "diff --git" line', linenum)

        info['origInfo'] = self.orig_changeset_id
        info['orig_changeset_id'] = self.orig_changeset_id

        if not self.new_changeset_id:
            info['newInfo'] = "Uncommitted"
        else:
            info['newInfo'] = self.new_changeset_id

        info['origFile'] = match.group(1)
        info['newFile'] = match.group(2)
        linenum += 1

    return linenum
def parse_special_header(self, linenum, info):
    """Parse a Mercurial special header line.

    Four kinds of lines are recognised:

    * ``# Node ID <id>``: stored as ``self.newChangesetId``.
    * ``# Parent <id>``: stored as ``self.origChangesetId``.
    * ``diff -r <rev> [-r <rev>] <filename>``: a plain hg diff. File and
      revision details are written to ``info``.
    * ``diff --git a/<orig> b/<new>``: a git-style diff. It is only
      handled once a parent changeset ID has been seen and both filename
      tokens carry their ``a/`` and ``b/`` prefixes.

    Only the two ``diff`` forms consume the line. Anything else,
    including a ``diff --git`` line that is too short to inspect, is
    left for the caller.

    Args:
        linenum (int):
            The line number to begin parsing.

        info (dict):
            The information dictionary for the file being parsed.

    Returns:
        int:
        The next line number to parse.

    Raises:
        DiffParserError:
            A ``diff -r`` line did not contain a revision.
    """
    line = self.lines[linenum]
    tokens = line.split()

    # git style diffs are supported as long as the node ID and parent ID
    # are present in the patch header
    if line.startswith("# Node ID") and len(tokens) == 4:
        self.newChangesetId = tokens[3]
    elif line.startswith("# Parent") and len(tokens) == 3:
        self.origChangesetId = tokens[2]
    elif line.startswith("diff -r"):
        # diff between two revisions are in the following form:
        #  "diff -r abcdef123456 -r 123456abcdef filename"
        # diff between a revision and the working copy are like:
        #  "diff -r abcdef123456 filename"
        self.isGitDiff = False

        # Indexing a too-short token list raises IndexError, not
        # ValueError, so check the length up front.
        if len(tokens) < 3:
            raise DiffParserError(
                "The diff file is missing revision "
                "information", linenum)

        # ordinary hg diffs don't record renames, so
        # new file always == old file
        is_committed = len(tokens) > 4 and tokens[3] == '-r'

        if is_committed:
            name_start_index = 5
            info['newInfo'] = tokens[4]
        else:
            name_start_index = 3
            info['newInfo'] = "Uncommitted"

        info['newFile'] = info['origFile'] = \
            ' '.join(tokens[name_start_index:])
        info['origInfo'] = tokens[2]
        info['origChangesetId'] = tokens[2]

        linenum += 1
    elif (line.startswith("diff --git") and
          self.origChangesetId and
          len(tokens) >= 4 and
          tokens[2].startswith("a/") and
          tokens[3].startswith("b/")):
        # diff is in the following form:
        #  "diff --git a/origfilename b/newfilename"
        # possibly followed by:
        #  "{copy|rename} from origfilename"
        #  "{copy|rename} from newfilename"
        self.isGitDiff = True
        info['origInfo'] = info['origChangesetId'] = self.origChangesetId

        if not self.newChangesetId:
            info['newInfo'] = "Uncommitted"
        else:
            info['newInfo'] = self.newChangesetId

        # Drop the two-character "a/" and "b/" prefixes.
        info['origFile'] = tokens[2][2:]
        info['newFile'] = tokens[3][2:]
        linenum += 1

    return linenum
def parse_special_header(self, linenum, parsed_file):
    """Parse a special diff header marking the start of a new file's info.

    This will look for:

    * An ``Index:`` line at the given line number, which must be present
      for any further processing of special headers

    * An ``RCS file:`` line, which specifies the filename that should be
      used for modified filenames, and for original filenames when
      parsing a binary file.

    * Any ``retrieving ...`` lines, or a ``diff`` line, all of which will
      be skipped.

    Args:
        linenum (int):
            The line number to begin parsing.

        parsed_file (reviewboard.diffviewer.parser.ParsedDiffFile):
            The file currently being parsed.

    Returns:
        int:
        The next line number to parse.

    Raises:
        reviewboard.diffviewer.errors.DiffParserError:
            There was an error parsing the special header. This may be
            a corrupted diff, or an error in the parsing implementation.
            Details are in the error message.
    """
    linenum = super(CVSDiffParser, self).parse_special_header(
        linenum, parsed_file)

    if not parsed_file.index_header_value:
        # Without an index, whatever follows is probably bogus too.
        return linenum

    lines = self.lines
    rcs_match = self.rcs_file_re.match(lines[linenum])

    if not rcs_match:
        raise DiffParserError('Unable to find RCS line',
                              linenum=linenum)

    parsed_file.rcs_filename = rcs_match.group('path')
    linenum += 1

    # Skip the informational lines that may follow the RCS line.
    while lines[linenum].startswith(b'retrieving '):
        linenum += 1

    if lines[linenum].startswith(b'diff '):
        linenum += 1

    return linenum
def parse(self):
    """Parse the diff and return the list of per-file entries.

    Each position is handed to ``self._parse_diff``. Lines that belong to
    no file are buffered as a preamble. When a file is found, buffered
    content is appended to the previous file if there is one, or
    prepended to the new file otherwise. A diff that produced no file
    entry discards the buffer. Content left over at the end is appended
    to the last file.

    Returns:
        list:
        The file entries found, also stored as ``self.files``.

    Raises:
        DiffParserError:
            No files were found but non-blank content was present.
    """
    def fresh_buffer(old):
        # Release the old buffer and hand back an empty one.
        old.close()
        return io.BytesIO()

    self.files = []
    preamble = io.BytesIO()
    pos = 0

    while pos < len(self.lines):
        next_pos, file_info, new_diff = self._parse_diff(pos)

        if file_info:
            if self.files:
                # Content seen since the previous file belongs to it.
                previous = self.files[-1]
                previous.append_data(preamble.getvalue())
                preamble = fresh_buffer(preamble)
                previous.finalize()

            self._ensure_file_has_required_fields(file_info)

            # The buffer is only non-empty here for the very first file.
            file_info.prepend_data(preamble.getvalue())
            preamble = fresh_buffer(preamble)

            self.files.append(file_info)
        elif new_diff:
            # We found a diff, but it was empty and has no file entry.
            # Reset the preamble.
            preamble = fresh_buffer(preamble)
        else:
            preamble.writelines((self.lines[pos], b'\n'))

        pos = next_pos

    try:
        if self.files:
            last = self.files[-1]
            last.append_data(preamble.getvalue())
            last.finalize()
        elif preamble.getvalue().strip():
            # This is probably not an actual git diff file.
            raise DiffParserError('This does not appear to be a git diff',
                                  0)
    finally:
        preamble.close()

    return self.files
def parse_special_header(self, linenum, info):
    """Parse a Mercurial special header line.

    Three kinds of lines are recognised:

    * ``# Node ID <id>``: stored as ``self.new_changeset_id``.
    * ``# Parent <id>``: stored as ``self.orig_changeset_id``.
    * ``diff -r <rev> [-r <rev>] <filename>``: file and revision details
      are written to ``info``, and the first revision also becomes
      ``self.orig_changeset_id``.

    Only the ``diff -r`` form consumes the line.

    Args:
        linenum (int):
            The line number to begin parsing.

        info (dict):
            The information dictionary for the file being parsed.

    Returns:
        int:
        The next line number to parse.

    Raises:
        DiffParserError:
            A ``diff -r`` line did not contain a revision.
    """
    diff_line = self.lines[linenum]
    split_line = diff_line.split()

    if diff_line.startswith(b"# Node ID") and len(split_line) == 4:
        self.new_changeset_id = split_line[3]
    elif diff_line.startswith(b"# Parent") and len(split_line) == 3:
        self.orig_changeset_id = split_line[2]
    elif diff_line.startswith(b"diff -r"):
        # diff between two revisions are in the following form:
        #  "diff -r abcdef123456 -r 123456abcdef filename"
        # diff between a revision and the working copy are like:
        #  "diff -r abcdef123456 filename"

        # Indexing a too-short token list raises IndexError, not
        # ValueError, so check the length up front.
        if len(split_line) < 3:
            raise DiffParserError(
                "The diff file is missing revision "
                "information", linenum)

        # ordinary hg diffs don't record renames, so
        # new file always == old file
        if len(split_line) > 4 and split_line[3] == b'-r':
            # Committed revision
            name_start_ix = 5
            info['newInfo'] = split_line[4]
        else:
            # Uncommitted revision
            name_start_ix = 3
            info['newInfo'] = b'Uncommitted'

        info['newFile'] = info['origFile'] = b' '.join(
            split_line[name_start_ix:])
        info['origInfo'] = split_line[2]
        info['origChangesetId'] = split_line[2]
        self.orig_changeset_id = split_line[2]

        linenum += 1

    return linenum
def _parse_git_diff(self, linenum):
    """Parse one "diff --git" section starting at ``linenum``.

    Sections that ``self._is_empty_change`` reports as empty are skipped
    entirely. Otherwise the filenames are taken from the last two tokens
    of the first line, the extended header lines and the index line are
    recorded, and the remaining lines are collected up to the next git
    diff, a binary patch marker, or the end of the input.

    Args:
        linenum (int):
            The line number of the section's first line.

    Returns:
        tuple:
        A 2-tuple of the next line number to parse and the populated
        ``File`` (or ``None`` when the section was skipped).

    Raises:
        DiffParserError:
            The filenames could not be extracted from the first line.
    """
    # First check if it is a new file with no content or
    # a file mode change with no content or
    # a deleted file with no content
    # then skip
    try:
        skip_section = self._is_empty_change(linenum)
    except IndexError:
        # This means this is the only bit left in the file
        skip_section = True

    if skip_section:
        return linenum + GIT_DIFF_EMPTY_CHANGESET_SIZE, None

    # Now we have a diff we are going to use so get the filenames + commits
    file_info = File()
    file_info.data = self.lines[linenum] + "\n"
    file_info.binary = False

    def keep(start, count=1):
        # Record ``count`` lines verbatim; return the line after them.
        for offset in range(count):
            file_info.data += self.lines[start + offset] + "\n"

        return start + count

    header_tokens = self.lines[linenum].split()

    try:
        # Need to remove the "a/" and "b/" prefix
        file_info.origFile = GIT_DIFF_PREFIX.sub("", header_tokens[-2])
        file_info.newFile = GIT_DIFF_PREFIX.sub("", header_tokens[-1])
    except ValueError:
        raise DiffParserError('The diff file is missing revision '
                              'information', linenum)

    linenum += 1

    # Save the new file, deleted file, mode change and index
    if self._is_new_file(linenum):
        linenum = keep(linenum)
    elif self._is_deleted_file(linenum):
        linenum = keep(linenum)
        file_info.deleted = True
    elif self._is_mode_change(linenum):
        linenum = keep(linenum, 2)

    if self._is_index_range_line(linenum):
        index_range = self.lines[linenum].split(None, 2)[1]

        if '..' in index_range:
            file_info.origInfo, file_info.newInfo = index_range.split("..")

        if self.pre_creation_regexp.match(file_info.origInfo):
            file_info.origInfo = PRE_CREATION

        linenum = keep(linenum)

    # Get the changes
    while linenum < len(self.lines):
        if self._is_git_diff(linenum):
            # The next file's section starts here.
            break

        if self._is_binary_patch(linenum):
            # The marker line is consumed but not recorded.
            file_info.binary = True
            linenum += 1
            break

        if (self._is_diff_fromfile_line(linenum) and
            self.lines[linenum].split()[1] == "/dev/null"):
            file_info.origInfo = PRE_CREATION

        linenum = keep(linenum)

    return linenum, file_info
def _parse_git_diff(self, linenum):
    """Parse one "diff --git" section starting at ``linenum``.

    The filenames are taken from the last two tokens of the first line
    and decoded as UTF-8. The extended header (new file, deleted file,
    mode change, move or copy) and the index line are recorded, then the
    remaining lines are processed up to the next git diff, a binary patch
    marker, or the end of the input.

    A section with no content is dropped unless it is a move or a copy.

    Args:
        linenum (int):
            The line number of the section's first line.

    Returns:
        tuple:
        A 2-tuple of the next line number to parse and the populated
        ``File``, or ``None`` in its place when the section was dropped.

    Raises:
        DiffParserError:
            The filenames could not be extracted or decoded.
    """
    # Now we have a diff we are going to use so get the filenames + commits
    file_info = File()
    file_info.data = self.lines[linenum] + b"\n"
    file_info.binary = False
    diff_line = self.lines[linenum].split()

    try:
        # Need to remove the "a/" and "b/" prefix
        file_info.origFile = GIT_DIFF_PREFIX.sub(b"", diff_line[-2])
        file_info.newFile = GIT_DIFF_PREFIX.sub(b"", diff_line[-1])

        if isinstance(file_info.origFile, six.binary_type):
            file_info.origFile = file_info.origFile.decode('utf-8')

        if isinstance(file_info.newFile, six.binary_type):
            file_info.newFile = file_info.newFile.decode('utf-8')
    except (IndexError, ValueError):
        # ValueError covers UnicodeDecodeError from the decodes above;
        # IndexError covers a first line with too few tokens.
        raise DiffParserError(
            'The diff file is missing revision '
            'information', linenum)

    linenum += 1

    # Check to make sure we haven't reached the end of the diff.
    if linenum >= len(self.lines):
        return linenum, None

    # Parse the extended header to save the new file, deleted file,
    # mode change, file move, and index.
    if self._is_new_file(linenum):
        file_info.data += self.lines[linenum] + b"\n"
        linenum += 1
    elif self._is_deleted_file(linenum):
        file_info.data += self.lines[linenum] + b"\n"
        linenum += 1
        file_info.deleted = True
    elif self._is_mode_change(linenum):
        file_info.data += self.lines[linenum] + b"\n"
        file_info.data += self.lines[linenum + 1] + b"\n"
        linenum += 2
    elif self._is_moved_file(linenum):
        file_info.data += self.lines[linenum] + b"\n"
        file_info.data += self.lines[linenum + 1] + b"\n"
        file_info.data += self.lines[linenum + 2] + b"\n"
        linenum += 3
        file_info.moved = True
    elif self._is_copied_file(linenum):
        file_info.data += self.lines[linenum] + b"\n"
        file_info.data += self.lines[linenum + 1] + b"\n"
        file_info.data += self.lines[linenum + 2] + b"\n"
        linenum += 3
        file_info.copied = True

    # Assume by default that the change is empty. If we find content
    # later, we'll clear this.
    empty_change = True

    if self._is_index_range_line(linenum):
        index_range = self.lines[linenum].split(None, 2)[1]

        # The lines are bytes, so the separator must be bytes as well.
        # A str operand raises TypeError on Python 3.
        if b'..' in index_range:
            file_info.origInfo, file_info.newInfo = \
                index_range.split(b"..")

        if self.pre_creation_regexp.match(file_info.origInfo):
            file_info.origInfo = PRE_CREATION

        file_info.data += self.lines[linenum] + b"\n"
        linenum += 1

    # Get the changes
    while linenum < len(self.lines):
        if self._is_git_diff(linenum):
            break
        elif self._is_binary_patch(linenum):
            file_info.binary = True
            file_info.data += self.lines[linenum] + b"\n"
            empty_change = False
            linenum += 1
            break
        elif self._is_diff_fromfile_line(linenum):
            if self.lines[linenum].split()[1] == b"/dev/null":
                file_info.origInfo = PRE_CREATION

            # Record this line together with the one that follows it.
            file_info.data += self.lines[linenum] + b'\n'
            file_info.data += self.lines[linenum + 1] + b'\n'
            linenum += 2
        else:
            empty_change = False
            linenum = self.parse_diff_line(linenum, file_info)

    if empty_change and not (file_info.moved or file_info.copied):
        # We didn't find any interesting content, so leave out this
        # file's info.
        #
        # Note that we may want to change this in the future to preserve
        # data like mode changes, but that will require filtering out
        # empty changes at the diff viewer level in a sane way.
        file_info = None

    return linenum, file_info
def _parse_git_diff(self, linenum):
    """Parse one "diff --git" section starting at ``linenum``.

    The filenames are taken from the last two tokens of the first line.
    The extended header (new file, deleted file, mode change or move) and
    the index line are recorded, then the remaining lines are processed
    up to the next git diff, a binary patch marker, or the end of the
    input.

    A section that ``self._is_empty_change`` reports as empty is skipped
    unless it is a deletion.

    Args:
        linenum (int):
            The line number of the section's first line.

    Returns:
        tuple:
        A 2-tuple of the next line number to parse and the populated
        ``File`` (or ``None`` when the section was skipped).

    Raises:
        DiffParserError:
            The filenames could not be extracted from the first line.
    """
    # First check if it is a new file with no content or
    # a file mode change with no content or
    # a deleted file with no content
    # then skip
    is_empty = self._is_empty_change(linenum)
    resume_at_if_empty = linenum + GIT_DIFF_EMPTY_CHANGESET_SIZE

    # Now we have a diff we are going to use so get the filenames + commits
    file_info = File()
    file_info.data = self.lines[linenum] + "\n"
    file_info.binary = False

    def keep(start, count=1):
        # Record ``count`` lines verbatim; return the line after them.
        for offset in range(count):
            file_info.data += self.lines[start + offset] + "\n"

        return start + count

    header_tokens = self.lines[linenum].split()

    try:
        # Need to remove the "a/" and "b/" prefix
        file_info.origFile = GIT_DIFF_PREFIX.sub("", header_tokens[-2])
        file_info.newFile = GIT_DIFF_PREFIX.sub("", header_tokens[-1])
    except ValueError:
        raise DiffParserError('The diff file is missing revision '
                              'information', linenum)

    linenum += 1

    # Parse the extended header to save the new file, deleted file,
    # mode change, file move, and index.
    if self._is_new_file(linenum):
        linenum = keep(linenum)
    elif self._is_deleted_file(linenum):
        linenum = keep(linenum)
        file_info.deleted = True
    elif self._is_mode_change(linenum):
        linenum = keep(linenum, 2)
    elif self._is_moved_file(linenum):
        linenum = keep(linenum, 3)
        file_info.moved = True

    # Only show interesting empty changes. Basically, deletions.
    # It's likely a binary file if we're at this point, and so we want
    # to process the rest of it.
    if is_empty and not file_info.deleted:
        return resume_at_if_empty, None

    if self._is_index_range_line(linenum):
        index_range = self.lines[linenum].split(None, 2)[1]

        if '..' in index_range:
            file_info.origInfo, file_info.newInfo = index_range.split("..")

        if self.pre_creation_regexp.match(file_info.origInfo):
            file_info.origInfo = PRE_CREATION

        linenum = keep(linenum)

    # Get the changes
    while linenum < len(self.lines):
        if self._is_git_diff(linenum):
            # The next file's section starts here.
            break

        if self._is_binary_patch(linenum):
            file_info.binary = True
            linenum = keep(linenum)
            break

        if self._is_diff_fromfile_line(linenum):
            if self.lines[linenum].split()[1] == "/dev/null":
                file_info.origInfo = PRE_CREATION

            # Record this line together with the one that follows it.
            linenum = keep(linenum, 2)
        else:
            linenum = self.parse_diff_line(linenum, file_info)

    return linenum, file_info
def parse_special_header(self, linenum, parsed_file):
    """Parse a special diff header marking the start of a new file's info.

    This looks for some special markers found in Mercurial diffs, trying
    to find a ``Parent`` or a ``diff -r`` line.

    A ``Parent`` line specifies a changeset ID that will be used as the
    source revision for all files.

    A ``diff -r`` line contains information identifying the file's name
    and other details.

    Args:
        linenum (int):
            The line number to begin parsing.

        parsed_file (reviewboard.diffviewer.parser.ParsedDiffFile):
            The file currently being parsed.

    Returns:
        int:
        The next line number to parse.

    Raises:
        reviewboard.diffviewer.errors.DiffParserError:
            There was an error parsing the special header. This may be
            a corrupted diff, or an error in the parsing implementation.
            Details are in the error message.
    """
    diff_line = self.lines[linenum]
    split_line = diff_line.split()

    if diff_line.startswith(b'# Parent') and len(split_line) == 3:
        self.orig_changeset_id = split_line[2]
    elif diff_line.startswith(b'diff -r'):
        # A diff between two revisions are in the following form:
        #
        #     diff -r abcdef123456 -r 123456abcdef filename
        #
        # A diff between a revision and the working copy:
        #
        #     diff -r abcdef123456 filename

        # Indexing a too-short token list raises IndexError, not
        # ValueError, so check the length up front.
        if len(split_line) < 3:
            raise DiffParserError(
                'The diff file is missing revision '
                'information',
                linenum=linenum)

        # Ordinary hg diffs don't record renames, so a new file
        # is always equivalent to an old file.
        if len(split_line) > 4 and split_line[3] == b'-r':
            # Committed revision
            name_start_ix = 5
            parsed_file.modified_file_details = split_line[4]
        else:
            # Uncommitted revision
            name_start_ix = 3
            parsed_file.modified_file_details = b'Uncommitted'

        filename = b' '.join(split_line[name_start_ix:])

        parsed_file.orig_filename = filename
        parsed_file.orig_file_details = split_line[2]
        parsed_file.modified_filename = filename

        self.orig_changeset_id = split_line[2]

        linenum += 1

    return linenum