def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses changelog by going over the JSON file.

    Args:
      range_start: Starting range of the regression.
      range_end: Ending range of the regression.
      changelog_url: The url to retrieve changelog from.
      revision_url: The url to retrieve individual revision from.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from file to a git hash in which it occurs.
    """
    # Compute the URLs from the given range and retrieve the changelog. Stop
    # if it fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      return

    # Parse changelog from the returned object. The returned string should
    # start with ")]}'\n", so start from the 6th character.
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      return

    # Parse each revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision at range_start separately, because googlesource
    # excludes it from the changelog.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)
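
# For illustration: gitiles-style JSON responses begin with the 5-character
# XSSI prefix ")]}'\n", hence the response[5:] slice above. The payload below
# is a fabricated stand-in, and stdlib json stands in for crash_utils.LoadJSON.
import json

_DEMO_RESPONSE = ")]}'\n" + json.dumps({'log': [{'commit': 'abc123'}]})
assert json.loads(_DEMO_RESPONSE[5:])['log'][0]['commit'] == 'abc123'
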
    def ParseMessage(self, message, codereview_api_url):
        """Parses the commit message.

        It checks the message to extract the code review site and the list of
        reviewers, and it also checks whether the CL is a revert of another
        CL.

        Args:
          message: The message to parse.
          codereview_api_url: URL to retrieve code review data from.
        """
        for line in message.splitlines():
            line = line.strip()
            review_url_line_match = REVIEW_URL_PATTERN.match(line)

            # Check if the line has the code review information.
            if review_url_line_match:

                # Get review number for the code review site from the line.
                issue_number = review_url_line_match.group(2)

                # Get JSON from the code review site; skip the line if it
                # fails.
                url = codereview_api_url % issue_number
                json_string = crash_utils.GetDataFromURL(url)
                if not json_string:
                    logging.warning(
                        'Failed to retrieve code review information from %s',
                        url)
                    continue

                # Load the JSON from the string, and get the list of reviewers.
                code_review = crash_utils.LoadJSON(json_string)
                if code_review:
                    self.reviewers = code_review['reviewers']

            # Check if this CL is a revert of another CL.
            if line.lower().startswith('revert'):
                self.is_reverted = True

                # Check if the line says what CL this CL is a revert of.
                revert = self.REVERT_PATTERN.match(line)
                if revert:
                    self.revert_of = revert.group(2)
                return
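
# Hypothetical approximations of the patterns referenced above; the real
# REVIEW_URL_PATTERN and REVERT_PATTERN are defined elsewhere in this module
# and may differ.
import re

_DEMO_REVIEW_URL_PATTERN = re.compile(
    r'Review URL:\s*(https?://codereview\.chromium\.org/(\d+))')
_match = _DEMO_REVIEW_URL_PATTERN.match(
    'Review URL: https://codereview.chromium.org/123456')
assert _match and _match.group(2) == '123456'

_DEMO_REVERT_PATTERN = re.compile(r'revert(ed)? r?(\d+)', re.IGNORECASE)
assert _DEMO_REVERT_PATTERN.match(
    'Revert 123456 - broke a test').group(2) == '123456'
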
  def ParseLineDiff(self, path, component, file_change_type, githash):
    """Parses the line diff of the given file at the given revision."""
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added, copied, or renamed (not modified), treat it as if
    # it is not changed.
    if file_change_type in ('A', 'C', 'R'):
      # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the URL; if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the returned base64 data into lines of diff text.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. A current_line of -1 means we
    # are not yet inside a diff chunk.
    current_line = -1
    for line in diff:
      line = line.strip()

      # If line starts with @@, a new chunk starts.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])

      # If we are in a chunk.
      elif current_line != -1:
        # If line is either added or modified.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[2:])

        # Do not increment current line if the change is 'delete'.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)
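
# A quick sanity check of the '@@' hunk-header parsing above: the number
# after '+' is the starting line of the hunk in the new version of the file.
_DEMO_HUNK_HEADER = '@@ -10,4 +12,6 @@'
assert int(_DEMO_HUNK_HEADER.split('+')[1].split(',')[0]) == 12
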
  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
    """Parses the given revision and adds its data to the maps."""

    # Retrieve data from the URL, return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      return

    # Load the JSON object from the string, skipping the ")]}'\n" prefix.
    # If it fails, terminate the function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      return

    # Create a map representing this revision, and get the git hash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set author, message and URL of this CL.
    revision['author'] = json_revision['author']['name']
    revision['time'] = json_revision['author']['time']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_change_type = diff['type']

      # Normalize file action so that it fits with svn_repository_parser.
      file_change_type = _ConvertToFileChangeType(file_change_type)

      # Add the file to the map.
      if file_path not in file_to_revision_map:
        file_to_revision_map[file_path] = []
      file_to_revision_map[file_path].append((githash, file_change_type))

    # Add this CL to the map.
    revision_map[githash] = revision

    return
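
# For reference, an assumed and simplified shape of the gitiles revision JSON
# that ParseRevision consumes; the key names follow the accesses above, the
# values are placeholders.
_DEMO_REVISION_JSON = {
    'commit': 'abc123',
    'author': {'name': 'someone@chromium.org',
               'time': 'Mon Jan 01 00:00:00 2001'},
    'message': 'Fix a crash.',
    'tree_diff': [{'new_path': 'content/browser/foo.cc', 'type': 'modify'}],
}

# Building file_to_revision_map from it (type normalization via
# _ConvertToFileChangeType omitted here):
_demo_file_map = {}
for _diff in _DEMO_REVISION_JSON['tree_diff']:
    _demo_file_map.setdefault(_diff['new_path'], []).append(
        (_DEMO_REVISION_JSON['commit'], _diff['type']))
assert _demo_file_map == {'content/browser/foo.cc': [('abc123', 'modify')]}
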
    def ParseBlameInfo(self, component, file_path, line, revision):
        """Parses blame information for the given file and line."""
        base_url = self.component_to_url_map[component]['repository']

        # Retrieve blame JSON file from googlesource. If it fails, return None.
        url_part = self.url_parts_map['blame_url'] % (revision, file_path)
        blame_url = base_url + url_part
        json_string = crash_utils.GetDataFromURL(blame_url)
        if not json_string:
            return

        # Parse JSON object from the string. The returned string should
        # start with ")]}'\n", so start from the 6th character.
        annotation = crash_utils.LoadJSON(json_string[5:])
        if not annotation:
            return

        # Go through the regions; each region is a block of consecutive lines
        # with the same author/revision.
        for blame_line in annotation['regions']:
            start = blame_line['start']
            count = blame_line['count']

            # For each region, check if the line we want blame information
            # for is in this region.
            if start <= line <= start + count - 1:
                # If we are in the right region, get the information from the line.
                revision = blame_line['commit']
                author = blame_line['author']['name']
                revision_url_parts = self.url_parts_map[
                    'revision_url'] % revision
                revision_url = base_url + revision_url_parts
                # TODO(jeun): Add a way to get content from JSON object.
                content = None

                (revision_info,
                 _) = self.ParseChangelog(component, revision, revision)
                message = revision_info[revision]['message']
                return (content, revision, author, revision_url, message)

        # Return None if no region contains the line.
        return None
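
# An assumed, simplified shape of the gitiles blame JSON used above: each
# region covers `count` consecutive lines starting at `start`, all from the
# same commit and author.
_DEMO_BLAME_JSON = {
    'regions': [
        {'start': 1, 'count': 5, 'commit': 'abc123',
         'author': {'name': 'someone@chromium.org'}},
        {'start': 6, 'count': 2, 'commit': 'def456',
         'author': {'name': 'someone.else@chromium.org'}},
    ],
}
# Line 7 falls in the second region, since 6 <= 7 <= 6 + 2 - 1.
_region = next(r for r in _DEMO_BLAME_JSON['regions']
               if r['start'] <= 7 <= r['start'] + r['count'] - 1)
assert _region['commit'] == 'def456'
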
    def ParseLineDiff(self, path, component, file_change_type,
                      revision_number):
        """Parses the line diff of the given file at the given revision."""
        changed_line_numbers = []
        changed_line_contents = []

        url_map = self.component_to_urls_map.get(component)
        if not url_map:
            return (None, None, None)

        # If the file is added (not modified), treat it as if it is not changed.
        backup_url = url_map['revision_url'] % revision_number
        if file_change_type == 'A':
            return (backup_url, changed_line_numbers, changed_line_contents)

        # Retrieve data from the url. If no data is retrieved, return empty lists.
        url = url_map['diff_url'] % (path, revision_number - 1,
                                     revision_number, revision_number)
        data = crash_utils.GetDataFromURL(url)
        if not data:
            return (backup_url, changed_line_numbers, changed_line_contents)

        line_diff_html = minidom.parseString(data)
        tables = line_diff_html.getElementsByTagName('table')
        # If there are not NUM_TABLES_IN_LINEDIFF_PAGE tables in the html
        # page, there must be an error in the html page.
        if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
            return (backup_url, changed_line_numbers, changed_line_contents)

        # Diff content is in the second table. Each line of the diff content
        # is in <tr>.
        trs = tables[1].getElementsByTagName('tr')
        prefix_len = len('vc_diff_')

        # Filter trs so that it only contains diff chunk with contents.
        filtered_trs = []
        for tr in trs:
            tr_class = tr.getAttribute('class')

            # Check for the classes of the <tr>s.
            if tr_class:
                tr_class = tr_class[prefix_len:]

                # Header rows carry no diff content; skip them.
                if tr_class == 'header' or tr_class == 'chunk_header':
                    continue

                # If the class is 'empty', this page does not have any
                # changes.
                if tr_class == 'empty':
                    return (backup_url, changed_line_numbers,
                            changed_line_contents)

            filtered_trs.append(tr)

        # Iterate through filtered trs, and grab line diff information.
        for tr in filtered_trs:
            tds = tr.getElementsByTagName('td')

            # If there aren't NUM_TDS_IN_LINEDIFF_PAGE tds, this line does not
            # contain line diff information.
            if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
                continue

            # If the line number information is not in a hyperlink, ignore
            # this line.
            try:
                line_num = tds[0].getElementsByTagName(
                    'a')[0].firstChild.nodeValue
                left_diff_type = tds[1].getAttribute('class')[prefix_len:]
                right_diff_type = tds[2].getAttribute('class')[prefix_len:]
            except IndexError:
                continue

            # Treat the line as changed if the left and right cells have
            # different change types, or if both are of type 'change'.
            # Deletions are filtered out below.
            if (left_diff_type !=
                    right_diff_type) or (left_diff_type == 'change'
                                         and right_diff_type == 'change'):

                # Check if the line content is not empty.
                try:
                    new_line = tds[2].firstChild.nodeValue
                except AttributeError:
                    new_line = ''

                if not (left_diff_type == 'remove'
                        and right_diff_type == 'empty'):
                    changed_line_numbers.append(int(line_num))
                    changed_line_contents.append(new_line.strip())

        return (url, changed_line_numbers, changed_line_contents)
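
# A small check of the 'vc_diff_' class-prefix stripping used above; the
# class names are assumed to follow ViewVC's 'vc_diff_<type>' convention.
_DEMO_PREFIX_LEN = len('vc_diff_')
assert 'vc_diff_change'[_DEMO_PREFIX_LEN:] == 'change'
assert 'vc_diff_empty'[_DEMO_PREFIX_LEN:] == 'empty'
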
    def ParseChangelog(self, component, range_start, range_end):
        """Parses the changelog for the given revision range."""
        file_to_revision_map = {}
        revision_map = {}

        # Check if the current component is supported by reading the
        # components parsed from the config file. If it is not, return empty
        # maps.

        url_map = self.component_to_urls_map.get(component)
        if not url_map:
            return (revision_map, file_to_revision_map)

        # Retrieve data from the url, and return empty maps if it fails.
        revision_range_str = '%s:%s' % (range_start, range_end)
        url = url_map['changelog_url'] % revision_range_str
        response = crash_utils.GetDataFromURL(url)
        if not response:
            return (revision_map, file_to_revision_map)

        # Parse xml out of the returned string. If it fails, return empty
        # maps.
        try:
            xml_revisions = minidom.parseString(response)
        except ExpatError:
            return (revision_map, file_to_revision_map)

        # Iterate through the returned XML object.
        revisions = xml_revisions.getElementsByTagName('logentry')
        for revision in revisions:
            # Create a new revision object for each revision.
            revision_object = {}

            # Set author of the CL.
            revision_object['author'] = revision.getElementsByTagName(
                'author')[0].firstChild.nodeValue

            # Get the revision number from xml.
            revision_number = int(revision.getAttribute('revision'))

            # Iterate through the changed paths in the CL.
            paths = revision.getElementsByTagName('paths')
            if paths:
                for changed_path in paths[0].getElementsByTagName('path'):
                    # Get path and file change type from the xml.
                    file_path = changed_path.firstChild.nodeValue
                    file_change_type = changed_path.getAttribute('action')

                    if file_path.startswith('/trunk/'):
                        file_path = file_path[len('/trunk/'):]

                    # Add file to the map.
                    if file_path not in file_to_revision_map:
                        file_to_revision_map[file_path] = []
                    file_to_revision_map[file_path].append(
                        (revision_number, file_change_type))

            # Set commit message of the CL.
            revision_object['message'] = revision.getElementsByTagName(
                'msg')[0].firstChild.nodeValue

            # Set url of this CL.
            revision_url = url_map['revision_url'] % revision_number
            revision_object['url'] = revision_url

            # Add this CL to the revision map.
            revision_map[revision_number] = revision_object

        return (revision_map, file_to_revision_map)
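
# An assumed, simplified example of the 'svn log --xml' style output that
# ParseChangelog parses; element names follow the accesses above.
from xml.dom import minidom

_DEMO_SVN_LOG = """<?xml version="1.0"?>
<log>
  <logentry revision="12345">
    <author>someone@chromium.org</author>
    <paths>
      <path action="M">/trunk/src/foo.cc</path>
    </paths>
    <msg>Fix a crash.</msg>
  </logentry>
</log>"""
_entry = minidom.parseString(_DEMO_SVN_LOG).getElementsByTagName('logentry')[0]
assert int(_entry.getAttribute('revision')) == 12345
assert _entry.getElementsByTagName('msg')[0].firstChild.nodeValue == (
    'Fix a crash.')
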
    def ParseBlameInfo(self, component, file_path, line, revision):
        """Parses blame information for the given file and line."""
        url_map = self.component_to_urls_map.get(component)
        if not url_map:
            return None

        # Retrieve blame data from the url, and return None if it fails.
        url = url_map['blame_url'] % (file_path, revision, revision)
        data = crash_utils.GetDataFromURL(url)
        if not data:
            return None

        blame_html = minidom.parseString(data)

        title = blame_html.getElementsByTagName('title')
        # If the returned html page is an exception page, return None.
        if title[0].firstChild.nodeValue == 'ViewVC Exception':
            return None

        # Each blame result is in a <tr>.
        blame_results = blame_html.getElementsByTagName('tr')
        try:
            blame_result = blame_results[line]
        except IndexError:
            return None

        # There must be 4 <td>s in each <tr>. If not, this page is malformed.
        tds = blame_result.getElementsByTagName('td')
        if len(tds) != 4:
            return None

        # The fourth <td> (tds[3]) has the line content, separated by
        # <span>s. Combine those to get a string of the changed line. If it
        # has no children, the line is empty.
        line_content = ''
        if tds[3].hasChildNodes():
            contents = tds[3].childNodes

            for content in contents:
                # Node type 3 (TEXT_NODE) means it is a text node.
                if content.nodeType == minidom.Node.TEXT_NODE:
                    line_content += content.nodeValue
                else:
                    line_content += content.firstChild.nodeValue

            line_content = line_content.strip()

        # If the current line has the same author/revision as the previous
        # lines, the cell is left blank. Walk up until we find a line with
        # the information.
        while not tds[1].firstChild:
            line -= 1
            blame_result = blame_results[line]
            tds = blame_result.getElementsByTagName('td')
        author = tds[1].firstChild.nodeValue

        # The revision can be either in a hyperlink or in plain text.
        try:
            revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
        except IndexError:
            revision = tds[2].firstChild.nodeValue

        (revision_info, _) = self.ParseChangelog(component, revision, revision)
        message = revision_info[int(revision)]['message']

        # Return the parsed information.
        revision_url = url_map['revision_url'] % int(revision)
        return (line_content, revision, author, revision_url, message)
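
# A small sketch of the mixed text/<span> extraction above, using an assumed
# fragment of ViewVC blame output.
from xml.dom import minidom

_demo_td = minidom.parseString(
    '<td>if (<span>x</span> == 0)</td>').documentElement
_demo_line = ''
for _node in _demo_td.childNodes:
    if _node.nodeType == minidom.Node.TEXT_NODE:
        _demo_line += _node.nodeValue
    else:
        _demo_line += _node.firstChild.nodeValue
assert _demo_line == 'if (x == 0)'
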
  def ParseChangelog(self, component_path, range_start, range_end):
    """Parses the changelog for the given revision range."""
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url, and return empty maps if it fails. The html
    # url is a url from which the changelog can be parsed as html.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, try parsing the
    # changelog from JSON instead.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # The revision information is in the third through the second-to-last
    # <div>s.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Each (div, pre, ul) triple contains the revision information for one
    # CL, so the three lists should have the same length.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through the divs and parse the revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create new revision object for each revision.
      revision = {}

      # There must be three <tr>s; if not, skip this malformed entry.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set author.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author
      revision['time'] = trs[1].getElementsByTagName(
          'td')[1].firstChild.nodeValue

      # Retrieve and set the message.
      revision['message'] = pre.firstChild.nodeValue

      # Set url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; each is in an <li>.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_change_type = li.getElementsByTagName('span')[
            0].getAttribute('class')

        # Normalize the file action so that it matches the SVN parser's.
        file_change_type = _ConvertToFileChangeType(file_change_type)

        # Add the changed file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append((githash, file_change_type))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start range, because googlesource does not
    # include the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)
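
# A hedged usage sketch of the (revision_map, file_to_revision_map) contract
# returned above; the maps below are illustrative placeholders, not values
# from a real parse.
_demo_revision_map = {
    'abc123': {'author': 'someone@chromium.org',
               'time': 'Mon Jan 01 00:00:00 2001',
               'message': 'Fix a crash.',
               'url': 'https://chromium.googlesource.com/placeholder'},
}
_demo_file_to_revision_map = {'content/browser/foo.cc': [('abc123', 'M')]}
for _path, _changes in _demo_file_to_revision_map.items():
    for (_githash, _change_type) in _changes:
        # Every change entry should point back into the revision map.
        assert _githash in _demo_revision_map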