def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                           revision_url, revision_map, file_to_revision_map):
  """Parses changelog by going over the JSON file.

  Args:
    range_start: Starting range of the regression.
    range_end: Ending range of the regression.
    changelog_url: The url to retrieve changelog from.
    revision_url: The url to retrieve individual revision from.
    revision_map: A map from a git hash number to its revision information.
    file_to_revision_map: A map from file to a git hash in which it occurs.
  """
  # Compute the URL from the given range and retrieve the changelog.
  # Stop if it fails.
  changelog_url %= (range_start, range_end)
  json_url = changelog_url + '?format=json'
  response = crash_utils.GetDataFromURL(json_url)
  if not response:
    return

  # Parse changelog from the returned object. The returned string should
  # start with ")]}'\n", so start from the 6th character.
  revisions = crash_utils.LoadJSON(response[5:])
  if not revisions:
    return

  # Parse each revision in the log.
  for revision in revisions['log']:
    githash = revision['commit']
    self.ParseRevision(revision_url, githash, revision_map,
                       file_to_revision_map)

  # Parse the revision with range_start, because googlesource ignores
  # that one.
  self.ParseRevision(revision_url, range_start, revision_map,
                     file_to_revision_map)
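# Illustrative note (not part of the parser): gitiles JSON responses carry an
# anti-XSSI prefix, which is why the code above skips the first five
# characters before loading JSON. A minimal sketch, assuming the standard
# ")]}'" prefix and using json.loads as a stand-in for crash_utils.LoadJSON:
#
#   response = ")]}'\n{\"log\": [{\"commit\": \"abc123\"}]}"
#   revisions = json.loads(response[5:])  # -> {'log': [{'commit': 'abc123'}]}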
def ParseMessage(self, message, codereview_api_url):
  """Parses the message.

  It checks the message to extract the code review website and list of
  reviewers, and it also checks if the CL is a revert of another CL.

  Args:
    message: The message to parse.
    codereview_api_url: URL to retrieve codereview data from.
  """
  for line in message.splitlines():
    line = line.strip()
    review_url_line_match = REVIEW_URL_PATTERN.match(line)

    # Check if the line has the code review information.
    if review_url_line_match:

      # Get the review number for the code review site from the line.
      issue_number = review_url_line_match.group(2)

      # Get JSON from the code review site, ignore the line if it fails.
      url = codereview_api_url % issue_number
      json_string = crash_utils.GetDataFromURL(url)
      if not json_string:
        logging.warning(
            'Failed to retrieve code review information from %s', url)
        continue

      # Load the JSON from the string, and get the list of reviewers.
      code_review = crash_utils.LoadJSON(json_string)
      if code_review:
        self.reviewers = code_review['reviewers']

    # Check if this CL is a revert of another CL.
    if line.lower().startswith('revert'):
      self.is_reverted = True

      # Check if the line says what CL this CL is a revert of.
      revert = self.REVERT_PATTERN.match(line)
      if revert:
        self.revert_of = revert.group(2)
      return
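# Illustrative note (not part of the parser): codereview_api_url is expected
# to point at an issue API that returns JSON containing a 'reviewers' list.
# The shape assumed here is inferred from the field read above, e.g.:
#
#   {"reviewers": ["foo@chromium.org", "bar@chromium.org"], ...}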
def ParseLineDiff(self, path, component, file_change_type, githash):
  changed_line_numbers = []
  changed_line_contents = []
  base_url = self.component_to_url_map[component]['repository']
  backup_url = (base_url + self.url_parts_map['revision_url']) % githash

  # If the file is added (not modified), treat it as if it is not changed.
  if file_change_type in ('A', 'C', 'R'):
    # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
    return (backup_url, changed_line_numbers, changed_line_contents)

  # Retrieve the diff data from the URL, and if it fails, return empty lists.
  url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
  data = crash_utils.GetDataFromURL(url + '?format=text')
  if not data:
    return (backup_url, changed_line_numbers, changed_line_contents)

  # Decode the returned object to get the line diff info.
  diff = base64.b64decode(data).splitlines()

  # Iterate through the lines in the diff. current_line == -1 means we are
  # not yet inside a diff chunk.
  current_line = -1
  for line in diff:
    line = line.strip()

    # If the line starts with @@, a new chunk starts.
    if line.startswith('@@'):
      current_line = int(line.split('+')[1].split(',')[0])

    # If we are in a chunk.
    elif current_line != -1:
      # If the line is either added or modified.
      if line.startswith('+'):
        changed_line_numbers.append(current_line)
        changed_line_contents.append(line[2:])

      # Do not increment the current line if the change is a deletion.
      if not line.startswith('-'):
        current_line += 1

  # Return the url without the '?format=text' parameter.
  return (url, changed_line_numbers, changed_line_contents)
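# Illustrative note (not part of the parser): the hunk-header parsing above
# pulls the starting line number of the new file out of a unified diff chunk
# header. For example, for the header "@@ -10,7 +12,8 @@":
#
#   "@@ -10,7 +12,8 @@".split('+')[1].split(',')[0]  # -> '12'
#
# so current_line starts at 12 for that chunk.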
def ParseRevision(self, revision_url, githash, revision_map,
                  file_to_revision_map):
  # Retrieve data from the URL, return if it fails.
  url = revision_url % githash
  response = crash_utils.GetDataFromURL(url + '?format=json')
  if not response:
    return

  # Load the JSON object from the string. If it fails, terminate the function.
  json_revision = crash_utils.LoadJSON(response[5:])
  if not json_revision:
    return

  # Create a map representing this revision and get the githash from the
  # JSON object.
  revision = {}
  githash = json_revision['commit']

  # Set author, message and URL of this CL.
  revision['author'] = json_revision['author']['name']
  revision['time'] = json_revision['author']['time']
  revision['message'] = json_revision['message']
  revision['url'] = url

  # Iterate through the changed files.
  for diff in json_revision['tree_diff']:
    file_path = diff['new_path']
    file_change_type = diff['type']

    # Normalize the file action so that it fits with svn_repository_parser.
    file_change_type = _ConvertToFileChangeType(file_change_type)

    # Add the file to the map.
    if file_path not in file_to_revision_map:
      file_to_revision_map[file_path] = []
    file_to_revision_map[file_path].append((githash, file_change_type))

  # Add this CL to the map.
  revision_map[githash] = revision
  return
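# Illustrative note (not part of the parser): each entry in 'tree_diff' of the
# gitiles commit JSON is expected to look roughly like the following (shape
# inferred from the fields read above, not from the gitiles documentation):
#
#   {"type": "modify", "old_path": "foo/bar.cc", "new_path": "foo/bar.cc"}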
def ParseBlameInfo(self, component, file_path, line, revision):
  base_url = self.component_to_url_map[component]['repository']

  # Retrieve the blame JSON file from googlesource. If it fails, return None.
  url_part = self.url_parts_map['blame_url'] % (revision, file_path)
  blame_url = base_url + url_part
  json_string = crash_utils.GetDataFromURL(blame_url)
  if not json_string:
    return

  # Parse the JSON object from the string. The returned string should
  # start with ")]}'\n", so start from the 6th character.
  annotation = crash_utils.LoadJSON(json_string[5:])
  if not annotation:
    return

  # Go through the regions, each of which is a group of consecutive lines
  # with the same author/revision.
  for blame_line in annotation['regions']:
    start = blame_line['start']
    count = blame_line['count']

    # For each region, check if the line we want the blame info of is in this
    # region.
    if start <= line and line <= start + count - 1:
      # If we are in the right region, get the information from the line.
      revision = blame_line['commit']
      author = blame_line['author']['name']
      revision_url_parts = self.url_parts_map['revision_url'] % revision
      revision_url = base_url + revision_url_parts
      # TODO(jeun): Add a way to get content from JSON object.
      content = None

      (revision_info, _) = self.ParseChangelog(component, revision, revision)
      message = revision_info[revision]['message']
      return (content, revision, author, revision_url, message)

  # Return None if no matching region is found.
  return None
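# Illustrative note (not part of the parser): a region in the gitiles blame
# JSON groups consecutive lines attributed to the same commit. The fields read
# above imply a shape roughly like:
#
#   {"start": 120, "count": 4, "commit": "abc123...",
#    "author": {"name": "someone@chromium.org"}}
#
# A line number L belongs to the region when start <= L <= start + count - 1.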
def ParseLineDiff(self, path, component, file_change_type, revision_number):
  changed_line_numbers = []
  changed_line_contents = []

  url_map = self.component_to_urls_map.get(component)
  if not url_map:
    return (None, None, None)

  # If the file is added (not modified), treat it as if it is not changed.
  backup_url = url_map['revision_url'] % revision_number
  if file_change_type == 'A':
    return (backup_url, changed_line_numbers, changed_line_contents)

  # Retrieve data from the url. If no data is retrieved, return empty lists.
  url = url_map['diff_url'] % (path, revision_number - 1, revision_number,
                               revision_number)
  data = crash_utils.GetDataFromURL(url)
  if not data:
    return (backup_url, changed_line_numbers, changed_line_contents)

  line_diff_html = minidom.parseString(data)
  tables = line_diff_html.getElementsByTagName('table')

  # If there are not NUM_TABLES_IN_LINEDIFF_PAGE tables in the html page,
  # there should be an error in the html page.
  if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
    return (backup_url, changed_line_numbers, changed_line_contents)

  # Diff content is in the second table. Each line of the diff content
  # is in a <tr>.
  trs = tables[1].getElementsByTagName('tr')
  prefix_len = len('vc_diff_')

  # Filter the trs so that they only contain diff chunks with contents.
  filtered_trs = []
  for tr in trs:
    tr_class = tr.getAttribute('class')

    # Check the classes of the <tr>s.
    if tr_class:
      tr_class = tr_class[prefix_len:]

      # Headers do not need to be added.
      if tr_class == 'header' or tr_class == 'chunk_header':
        continue

      # If the class of the tr is 'empty', this page does not have any change.
      if tr_class == 'empty':
        return (backup_url, changed_line_numbers, changed_line_contents)

    filtered_trs.append(tr)

  # Iterate through the filtered trs, and grab line diff information.
  for tr in filtered_trs:
    tds = tr.getElementsByTagName('td')

    # If there aren't 3 tds, this line should not contain line diff info.
    if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
      continue

    # If the line number information is not in a hyperlink, ignore this line.
    try:
      line_num = tds[0].getElementsByTagName('a')[0].firstChild.nodeValue
      left_diff_type = tds[1].getAttribute('class')[prefix_len:]
      right_diff_type = tds[2].getAttribute('class')[prefix_len:]
    except IndexError:
      continue

    # Treat the line as modified if the left and right cells have different
    # change types, or if both have type 'change'. Pure deletions are
    # filtered out below.
    if (left_diff_type != right_diff_type) or (
        left_diff_type == 'change' and right_diff_type == 'change'):

      # Check if the line content is not empty.
      try:
        new_line = tds[2].firstChild.nodeValue
      except AttributeError:
        new_line = ''

      if not (left_diff_type == 'remove' and right_diff_type == 'empty'):
        changed_line_numbers.append(int(line_num))
        changed_line_contents.append(new_line.strip())

  return (url, changed_line_numbers, changed_line_contents)
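# Illustrative note (not part of the parser): the ViewVC side-by-side diff
# page parsed above is assumed to contain NUM_TABLES_IN_LINEDIFF_PAGE tables,
# with the diff rows in the second one. Each data row is assumed to have
# three <td>s (line number, left content, right content) whose classes look
# like 'vc_diff_change', 'vc_diff_add', 'vc_diff_remove' or 'vc_diff_empty';
# stripping the 'vc_diff_' prefix yields the change types compared above.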
def ParseChangelog(self, component, range_start, range_end):
  file_to_revision_map = {}
  revision_map = {}

  # Check if the current component is supported by reading the components
  # parsed from the config file. If it is not, fail.
  url_map = self.component_to_urls_map.get(component)
  if not url_map:
    return (revision_map, file_to_revision_map)

  # Retrieve data from the url, return empty maps if it fails.
  revision_range_str = '%s:%s' % (range_start, range_end)
  url = url_map['changelog_url'] % revision_range_str
  response = crash_utils.GetDataFromURL(url)
  if not response:
    return (revision_map, file_to_revision_map)

  # Parse xml out of the returned string. If it fails, return empty maps.
  try:
    xml_revisions = minidom.parseString(response)
  except ExpatError:
    return (revision_map, file_to_revision_map)

  # Iterate through the returned XML object.
  revisions = xml_revisions.getElementsByTagName('logentry')
  for revision in revisions:
    # Create a new revision object for each revision.
    revision_object = {}

    # Set the author of the CL.
    revision_object['author'] = revision.getElementsByTagName(
        'author')[0].firstChild.nodeValue

    # Get the revision number from the xml.
    revision_number = int(revision.getAttribute('revision'))

    # Iterate through the changed paths in the CL.
    paths = revision.getElementsByTagName('paths')
    if paths:
      for changed_path in paths[0].getElementsByTagName('path'):
        # Get the path and file change type from the xml.
        file_path = changed_path.firstChild.nodeValue
        file_change_type = changed_path.getAttribute('action')

        if file_path.startswith('/trunk/'):
          file_path = file_path[len('/trunk/'):]

        # Add the file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append(
            (revision_number, file_change_type))

    # Set the commit message of the CL.
    revision_object['message'] = revision.getElementsByTagName(
        'msg')[0].firstChild.nodeValue

    # Set the url of this CL.
    revision_url = url_map['revision_url'] % revision_number
    revision_object['url'] = revision_url

    # Add this CL to the revision map.
    revision_map[revision_number] = revision_object

  return (revision_map, file_to_revision_map)
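# Illustrative note (not part of the parser): the changelog_url above is
# expected to return 'svn log --xml'-style output, roughly:
#
#   <logentry revision="12345">
#     <author>someone@chromium.org</author>
#     <paths><path action="M">/trunk/src/foo.cc</path></paths>
#     <msg>Commit message.</msg>
#   </logentry>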
def ParseBlameInfo(self, component, file_path, line, revision):
  url_map = self.component_to_urls_map.get(component)
  if not url_map:
    return None

  # Retrieve blame data from the url, return None if it fails.
  url = url_map['blame_url'] % (file_path, revision, revision)
  data = crash_utils.GetDataFromURL(url)
  if not data:
    return None

  blame_html = minidom.parseString(data)

  title = blame_html.getElementsByTagName('title')
  # If the returned html page is an exception page, return None.
  if title[0].firstChild.nodeValue == 'ViewVC Exception':
    return None

  # Each blame result is in a <tr>.
  blame_results = blame_html.getElementsByTagName('tr')
  try:
    blame_result = blame_results[line]
  except IndexError:
    return None

  # There must be 4 <td>s for each <tr>. If not, this page is wrong.
  tds = blame_result.getElementsByTagName('td')
  if len(tds) != 4:
    return None

  # The fourth <td> (tds[3]) has the line content, separated by <span>s.
  # Combine those to get a string of the changed line. If it has nothing, the
  # line is empty.
  line_content = ''
  if tds[3].hasChildNodes():
    contents = tds[3].childNodes

    for content in contents:
      # Node type 3 means it is a text node.
      if content.nodeType == minidom.Node.TEXT_NODE:
        line_content += content.nodeValue
      else:
        line_content += content.firstChild.nodeValue

    line_content = line_content.strip()

  # If the current line has the same author/revision as the previous lines,
  # the result is not shown. Propagate up until we find the line with info.
  while not tds[1].firstChild:
    line -= 1
    blame_result = blame_results[line]
    tds = blame_result.getElementsByTagName('td')
  author = tds[1].firstChild.nodeValue

  # The revision can either be in a hyperlink or in plain text.
  try:
    revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
  except IndexError:
    revision = tds[2].firstChild.nodeValue

  (revision_info, _) = self.ParseChangelog(component, revision, revision)
  message = revision_info[int(revision)]['message']

  # Return the parsed information.
  revision_url = url_map['revision_url'] % int(revision)
  return (line_content, revision, author, revision_url, message)
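# Illustrative note (not part of the parser): each row of the ViewVC annotate
# page is assumed to carry four <td>s: line number, author, revision and line
# content, with author/revision left blank when they repeat the previous row,
# which is why the loop above walks upward until it finds a populated row.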
def ParseChangelog(self, component_path, range_start, range_end):
  file_to_revision_map = {}
  revision_map = {}
  base_url = self.component_to_url_map[component_path]['repository']
  changelog_url = base_url + self.url_parts_map['changelog_url']
  revision_url = base_url + self.url_parts_map['revision_url']

  # Retrieve data from the url, return empty maps if it fails. The html url
  # is a url from which the changelog can be parsed as html.
  url = changelog_url % (range_start, range_end)
  html_url = url + '?pretty=fuller'
  response = crash_utils.GetDataFromURL(html_url)
  if not response:
    return (revision_map, file_to_revision_map)

  # Parse xml out of the returned string. If it fails, try parsing
  # from JSON objects.
  try:
    dom = minidom.parseString(response)
  except ExpatError:
    self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                revision_url, revision_map,
                                file_to_revision_map)
    return (revision_map, file_to_revision_map)

  # The revision information is in the divs from the third one to the
  # second-to-last one.
  divs = dom.getElementsByTagName('div')[2:-1]
  pres = dom.getElementsByTagName('pre')
  uls = dom.getElementsByTagName('ul')

  # Divs, pres and uls each contain revision information for one CL, so
  # they should have the same length.
  if not divs or len(divs) != len(pres) or len(pres) != len(uls):
    self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                revision_url, revision_map,
                                file_to_revision_map)
    return (revision_map, file_to_revision_map)

  # Iterate through the divs and parse the revisions.
  for (div, pre, ul) in zip(divs, pres, uls):
    # Create a new revision object for each revision.
    revision = {}

    # There must be three <tr>s. If not, this page is wrong.
    trs = div.getElementsByTagName('tr')
    if len(trs) != 3:
      continue

    # Retrieve the git hash.
    githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

    # Retrieve and set the author.
    author = trs[1].getElementsByTagName(
        'td')[0].firstChild.nodeValue.split('<')[0]
    revision['author'] = author
    revision['time'] = trs[1].getElementsByTagName(
        'td')[1].firstChild.nodeValue

    # Retrieve and set the message.
    revision['message'] = pre.firstChild.nodeValue

    # Set the url of this CL.
    revision_url_part = self.url_parts_map['revision_url'] % githash
    revision['url'] = base_url + revision_url_part

    # Go through the changed files; they are in <li>s.
    lis = ul.getElementsByTagName('li')
    for li in lis:
      # Retrieve the path and action of the changed file.
      file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
      file_change_type = li.getElementsByTagName('span')[0].getAttribute(
          'class')

      # Normalize the file action so that it matches the SVN parser.
      file_change_type = _ConvertToFileChangeType(file_change_type)

      # Add the changed file to the map.
      if file_path not in file_to_revision_map:
        file_to_revision_map[file_path] = []
      file_to_revision_map[file_path].append((githash, file_change_type))

    # Add this revision object to the map.
    revision_map[githash] = revision

  # Parse one revision for the start range, because googlesource does not
  # include the start of the range.
  self.ParseRevision(revision_url, range_start, revision_map,
                     file_to_revision_map)

  return (revision_map, file_to_revision_map)
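# Illustrative note (not part of the parser): the '?pretty=fuller' HTML log is
# assumed to render one <div> (commit metadata table), one <pre> (commit
# message) and one <ul> (changed files) per revision, which is why the three
# lists are zipped together above; when that layout does not hold, the code
# falls back to ParseChangelogFromJSON.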