Esempio n. 1
0
def load_buildpairs(dir_of_jsons: str, repo: str):
    """
    Load the build pairs for a repository from its JSON file.

    The filename is derived from the repo slug by replacing '/' with '-' and
    appending '.json' (e.g. 'owner/name' -> 'owner-name.json').

    :param dir_of_jsons: A directory containing JSON files of build pairs.
    :param repo: repo_slug name
    :return: A list of build pairs, or None if the file is missing or contains
             invalid JSON (both cases are logged, not raised).
    """
    filename = repo.replace('/', '-') + '.json'
    try:
        data = read_json(os.path.join(dir_of_jsons, filename))
    except json.decoder.JSONDecodeError:
        log.error('{} contains invalid JSON.'.format(filename))
        return None
    except FileNotFoundError:
        log.error('{} is not found.'.format(filename))
        return None

    # The original kept a dead `count` accumulator here; it was never read.
    if not data:
        log.warning('{} does not contain any build pairs.'.format(filename))
    log.info('Read {} build pairs from {}.'.format(len(data), filename))
    return data
Esempio n. 2
0
 def debug_when(branch, b, repo, utils):
     """
     Diagnostic helper: investigate builds where matching on committer
     datetime finds candidate trigger commits even though the HTML-scraping
     approach found none.

     Side effect: sets b.trigger_commit to the placeholder string 'mock' when
     the build has no trigger commit but at least one commit on the branch
     shares its committer datetime.

     :param branch: Branch-like object; reads github_commits, html_commits,
                    and pr_num.
     :param b: Build-like object; reads committed_at, trigger_commit, build_id.
     :param repo: Repository slug passed through to the GitHub helpers.
     :param utils: Helper object exposing utils.github API wrappers.
     """
     # DEBUG when matching committer date gets more result than HTML approach
     commits_with_same_datetime = [
         commit['sha'] for commit in branch.github_commits
         if commit['commit']['committer']['date'] == b.committed_at
     ]
     # AssignTriggerCommits.find_trigger_commit_by_matching_datetime(repo, utils, commits_with_same_datetime, b,
     #                                                               checked_commits)
     if not b.trigger_commit and commits_with_same_datetime:
         # Placeholder so downstream code sees a non-empty trigger commit.
         b.trigger_commit = 'mock'
         print(
             'why got matching committer date but did not find trigger from from html?',
             len(commits_with_same_datetime), commits_with_same_datetime[0])
         for c in commits_with_same_datetime:
             # Ask the GitHub API whether this candidate actually belongs to
             # the build; if so, the HTML scrape should have found it.
             if utils.github.is_commit_associated_with_build(
                     repo, c, b.build_id):
                 log.warning(
                     'this commit is confirmed to be the trigger but it didnt show up in HTML.'
                 )
                 AssignTriggerCommits.print_helpful_links_for_debugging(
                     repo, c, branch, b)
                 # Re-scrape the PR's HTML commits and check once more.
                 utils.github.get_pr_commits_by_html(
                     repo, str(branch.pr_num), branch)
                 for commit in branch.html_commits:
                     if str(b.build_id) == branch.html_commits[commit]:
                         log.debug(
                             'Tried again and found the commit in HTML this time.'
                         )
                 break
Esempio n. 3
0
 def _exceeded_api_quota(self) -> Tuple[bool, Optional[int]]:
     """
     Check whether the GitHub API quota for the current token is exhausted.

     :return: A 2-tuple. (True, number of seconds until the quota resets) if the API quota has been exceeded.
              (False, None) otherwise.
     :raises Exception: When an exception is raised by the request.
     """
     log.info('Checking GitHub API quota.')
     response = self._session.get('https://api.github.com/rate_limit')
     try:
         response.raise_for_status()
         payload = response.json()
         if 'resources' in payload:
             core = payload['resources']['core']
             if core['remaining'] <= 0:
                 # 'reset' is the time when the quota resets, in UTC epoch seconds.
                 reset_at = core['reset']
                 log.warning(
                     'GitHub API quota exceeded and will reset at UTC {}.'.
                     format(reset_at))
                 # Pad by a few seconds to be sure that we sleep long enough.
                 return True, (reset_at - int(time.time())) + 10
     except Exception as e:
         log.error('Exception while checking API quota:', e)
         raise
     return False, None
Esempio n. 4
0
 def _get(self, address, **kwargs):
     """
     GET the given Travis API address, retrying on rate limiting.

     On HTTP 429, the first attempt (or any attempt when no tokens are
     configured) sleeps for _SLEEP_SECONDS and retries; subsequent 429s
     rotate through the module-level _TOKENS deque instead of sleeping.

     :param address: The URL to request.
     :param kwargs: Forwarded as query-string parameters.
     :return: The JSON-decoded response body on HTTP 200.
     :raises requests.exceptions.HTTPError: For 404 responses (via raise_for_status).
     :raises requests.exceptions.ConnectionError: For any other non-200 status.
     """
     sleep_seconds = _SLEEP_SECONDS
     attempts = 0
     while True:
         response = self._session.get(address, params=kwargs)
         code = response.status_code
         if code == 200:
             return response.json()
         elif code == 404:
             log.error('Get request for {} returned 404 Not Found.'.format(
                 address))
             response.raise_for_status()
         elif code == 429:
             if attempts < 1 or not _TOKENS:
                 log.warning(
                     'The Travis API returned status code 429 Too Many Requests. '
                     'Retrying after sleeping for {} seconds.'.format(
                         sleep_seconds))
                 time.sleep(sleep_seconds)
                 attempts += 1
             else:
                 # Use another token if # of attempts for GET Requests >= 1, will use next token in list
                 # deque.pop() removes element from the right so we appendleft()
                 # NOTE(review): the header is set to the current head of
                 # _TOKENS and THEN the deque is rotated, so the next rotation
                 # uses the previously-last token — confirm the head here is
                 # not the token that just hit the rate limit.
                 self._session.headers['Authorization'] = 'token {}'.format(
                     _TOKENS[0])
                 _TOKENS.appendleft(_TOKENS.pop())
         else:
             log.error('Get request for {} returned {}.'.format(
                 address, code))
             raise requests.exceptions.ConnectionError(
                 '{} download failed. Error code is {}.'.format(
                     address, code))
Esempio n. 5
0
 def check_docker_disk_space_available(self, docker_storage_path):
     """
     Return True when enough free space exists at docker_storage_path for
     storing Docker images, or when the disk check is disabled in the config.
     """
     if self.config.skip_check_disk:
         return True
     _, _, free_b = shutil.disk_usage(docker_storage_path)
     if free_b >= self.config.docker_disk_space_requirement:
         return True
     free_gib = str(round(free_b / 1024 ** 3, 2))
     log.warning('Inadequate disk space available for storing Docker Images: {} GiB.'.format(free_gib))
     return False
Esempio n. 6
0
 def check_disk_space_available(self):
     """
     Return True when enough free space exists on the current disk for
     reproducing, or when the disk check is disabled in the config.
     """
     if self.config.skip_check_disk:
         return True
     _, _, free_b = shutil.disk_usage('.')
     if free_b >= self.config.disk_space_requirement:
         return True
     free_gib = str(round(free_b / 1024 ** 3, 2))
     log.warning('Inadequate disk space available for reproducing: {} GiB.'.format(free_gib))
     return False
Esempio n. 7
0
 def check_docker_disk_space_available(self, docker_storage_path):
     """
     Return True when at least 30% of the disk holding docker_storage_path
     is free; otherwise log a warning and return False.
     """
     total_b, _, free_b = shutil.disk_usage(docker_storage_path)
     fraction_free = free_b / total_b
     if fraction_free >= 0.3:
         return True
     percent = str(round(fraction_free * 100, 2))
     log.warning(
         'Inadequate disk space available for storing Docker Images: {}%.'
         .format(percent))
     return False
Esempio n. 8
0
 def check_disk_space_available():
     """
     Return True when at least 20% of the current working disk is free;
     otherwise log a warning and return False.
     """
     total_b, _, free_b = shutil.disk_usage('.')
     fraction_free = free_b / total_b
     if fraction_free >= 0.2:
         return True
     percent = str(round(fraction_free * 100, 2))
     log.warning(
         'Inadequate disk space available for reproducing: {}%.'.format(
             percent))
     return False
Esempio n. 9
0
    def push_image(self, image_tag):
        """
        Push the image with the given tag to Docker Hub and, when a private
        registry is configured, to the Docker Registry as well.

        :param image_tag: The tag of the Docker image to push.
        :raises ValueError: When the Docker API reports an error during a push.
        """
        # Push to Docker Hub
        self._push_image_to(DOCKER_HUB_REPO, image_tag,
                            self.docker_hub_auth_config, 'Docker Hub')
        # Push to Registry
        if not DOCKER_REGISTRY_REPO:
            log.warning(
                'Docker Private Registry info not being set. Skipping.')
            return
        # Bug fix: the original pushed DOCKER_HUB_REPO here (with the registry
        # auth config); the registry push must target DOCKER_REGISTRY_REPO.
        self._push_image_to(DOCKER_REGISTRY_REPO, image_tag,
                            self.docker_registry_auth_config, 'Docker Registry')

    def _push_image_to(self, repo, image_tag, auth_config, destination):
        """
        Push one image to `repo` and log the final status line reported by
        the Docker daemon.

        :param repo: The repository to push to.
        :param image_tag: The tag of the image to push.
        :param auth_config: Credentials passed to the Docker SDK.
        :param destination: Human-readable destination name for messages.
        :raises ValueError: When the Docker API reports an error.
        """
        try:
            result = self.client.images.push(
                repo,
                tag=image_tag,
                stream=False,
                auth_config=auth_config)
            # The push output is newline-separated dict literals; the last
            # line carries the final status or error.
            last_line = result.splitlines()[-1]
            status = ast.literal_eval(last_line)
            if 'error' in status.keys():
                log.error('Error: ', status.get('error'))
            elif 'status' in status.keys():
                log.info('Status: ', status.get('status'))

        except docker.errors.APIError:
            raise ValueError(
                'Encountered a Docker API error while pushing a Docker image to {}.'.format(destination)
            )
        except KeyboardInterrupt:
            # Best-effort: log and continue so a later push can still run.
            log.error(
                'Caught a KeyboardInterrupt while pushing a Docker image to {}.'.format(destination)
            )
Esempio n. 10
0
def _get_log_from_url(log_url: str, max_retries: int, retry_count: int = 0):
    """
    Download the build log at log_url.

    Retries (recursively) after a short sleep when the server resets the
    connection, up to max_retries times.

    :param log_url: The URL of the log to download.
    :param max_retries: Maximum number of retries after connection resets.
    :param retry_count: Current retry attempt; used internally by recursion.
    :return: The raw log content as bytes, or None if the download failed.
    """
    sleep_duration = 3  # Seconds.
    try:
        with urllib.request.urlopen(log_url) as url:
            result = url.read()
            log.info('Downloaded log from {}.'.format(log_url))
            return result
    except URLError:
        log.info('Could not download log from {}.'.format(log_url))
        return None
    except ConnectionResetError:
        if retry_count == max_retries:
            log.info('Could not download log from', log_url, 'after retrying', max_retries, 'times.')
            return None
        log.warning('The server reset the connection. Retrying after', sleep_duration, 'seconds.')
        time.sleep(sleep_duration)
        # Bug fix: return the recursive call's result. The original dropped
        # it, so even a successful retry implicitly returned None.
        return _get_log_from_url(log_url, max_retries, retry_count + 1)
Esempio n. 11
0
 def _get(self, address, **kwargs):
     """
     GET the given Travis API address, retrying with exponential backoff on
     HTTP 429 responses.

     :param address: The URL to request.
     :param kwargs: Forwarded as query-string parameters.
     :return: The JSON-decoded response body on HTTP 200.
     :raises requests.exceptions.HTTPError: For 404 responses (via raise_for_status).
     :raises requests.exceptions.ConnectionError: For any other non-200 status.
     """
     backoff = _SLEEP_SECONDS
     while True:
         resp = self._session.get(address, params=kwargs)
         status = resp.status_code
         if status == 200:
             return resp.json()
         if status == 404:
             log.error('Get request for {} returned 404 Not Found.'.format(address))
             resp.raise_for_status()
         elif status == 429:
             log.warning(
                 'The Travis API returned status code 429 Too Many Requests. '
                 'Retrying after sleeping for {} seconds.'.format(backoff))
             time.sleep(backoff)
             # Double the sleep each time, capped at _MAX_SLEEP_SECONDS.
             backoff = min(backoff * 2, _MAX_SLEEP_SECONDS)
         else:
             log.error('Get request for {} returned {}.'.format(address, status))
             raise requests.exceptions.ConnectionError('{} download failed. Error code is {}.'.format(address, status))
Esempio n. 12
0
    def load_buildpairs(dir_of_jsons: str, filename: str):
        """
        Load the build pairs stored in one JSON file.

        :param dir_of_jsons: A directory containing JSON files of build pairs.
        :param filename: the name of json file
        :raises json.decoder.JSONDecodeError: When the passed directory contains JSON files with invalid JSON.
        """
        # Parse the file that we expect to contain JSON.
        try:
            pairs = read_json(os.path.join(dir_of_jsons, filename))
        except json.decoder.JSONDecodeError:
            log.error('{} contains invalid JSON.'.format(filename))
            raise
        if not pairs:
            log.warning(
                '{} does not contain any build pairs.'.format(filename))
        loaded = list(pairs)
        log.info('Read {} build pairs from {}.'.format(len(loaded),
                                                       filename))
        return loaded
Esempio n. 13
0
 def _get_specific_language_analyzer(self, primary_language, lines, folds,
                                     job_id, build_system, trigger_sha,
                                     repo, force):
     # Update this function to extend to other languages.
     lang = str(primary_language.lower())
     use_java = ['java', 'scala', 'groovy', 'clojure']
     if force:
         log.warning('Forcing Java analyzer')
         return self._get_java_analyzer('java', lines, folds, job_id,
                                        build_system, trigger_sha, repo)
     if lang == 'ruby':
         # return RubyLogFileAnalyzer(log, folds)
         return None
     elif lang in use_java:
         return self._get_java_analyzer(primary_language, lines, folds,
                                        job_id, build_system, trigger_sha,
                                        repo)
     elif lang == 'python':
         return PythonLogFileAnalyzer(primary_language, folds, job_id)
     else:
         # log.warning('No primary language detected. lang =', lang)
         return None
Esempio n. 14
0
 def _create_session(self):
     """
     When the quota is exceeded for a token, the program will switch to another tokens and attempt to continue.
     If the quota is exceeded for all tokens, the program will wait for the token with the lowest wait time.
     """
     # Sentinel meaning "no wait required yet".
     # NOTE(review): if every token's wait time exceeds 9999 seconds, no token
     # is chosen and the RuntimeError below fires — confirm this is intended.
     min_wait_time = 9999
     chosen_token = None
     # Work on a copy so the rotation below does not disturb iteration order.
     updated_token = copy.deepcopy(self._tokens)
     for t in self._tokens:
         # Probe each token with a fresh cached session.
         self._session = cachecontrol.CacheControl(requests.Session())
         self._session.headers['Authorization'] = 'token %s' % t
         has_wait, wait_time = self._exceeded_api_quota()
         if not has_wait:
             # Token has quota left; take it immediately.
             chosen_token = t
             min_wait_time = 0
             # if a token is chosen, move it to the end
             updated_token.append(t)
             del updated_token[updated_token.index(t)]
             break
         if wait_time < min_wait_time:
             # Track the token whose quota resets soonest as a fallback.
             min_wait_time = wait_time
             chosen_token = t
             # if a token is chosen, move it to the end
             updated_token.append(t)
             del updated_token[updated_token.index(t)]
     self._tokens = updated_token
     if not chosen_token:
         raise RuntimeError(
             'Unexpected state: No GitHub token chosen in github.py.')
     log.debug('Chose token {}.'.format(chosen_token))
     if min_wait_time:
         # Sleep until the quota is reset. See https://developer.github.com/v3/#rate-limiting for more information.
         log.warning('Sleeping until the GitHub API quota is reset in',
                     min_wait_time / 60, 'minutes.')
         time.sleep(min_wait_time)
     self._session = cachecontrol.CacheControl(requests.Session())
     self._session.headers['Authorization'] = 'token %s' % chosen_token
Esempio n. 15
0
    def get(self, url: str):
        """
        Request a URL from the GitHub API.
        Handles retrying, waiting for quota to reset, and token switching.

        :param url: The GitHub API URL to request.
        :return: A 2-tuple of the resulting response and the JSON representation of the response body. If there was a
                 problem, the returned tuple is (None, None).
        :raises TypeError: When url is not a string.
        :raises ValueError: When url is not a GitHub API URL.
        """
        if not isinstance(url, str):
            raise TypeError('The provided URL must be a string.')
        if urlparse(url).netloc != 'api.github.com':
            raise ValueError('The provided URL is not for the GitHub API.')

        retry_back_off = 5  # Seconds.
        retry_count = 0
        while True:
            response = None
            try:
                response = self._session.get(url)
                response.raise_for_status()
                if not response.text:
                    return None, None
                return response, response.json()
            except Exception as e:
                # If the exception is a connection error, the server may have dropped the connection.
                # In this case, we should try resetting the session.
                # Bug fix: the original used 'e is requests.ConnectionError',
                # an identity comparison against the class that is always
                # False, so the session was never recreated here.
                if isinstance(e, requests.ConnectionError):
                    log.info('Recreating session.')
                    self._create_session()

                # Bug fix: response is None when self._session.get itself
                # raised; the original dereferenced response.status_code and
                # would crash with AttributeError in that case.
                status_code = response.status_code if response is not None else None

                if status_code == 404:
                    return None, None
                elif status_code == 451:  # Repository access blocked.
                    log.error('Repository access blocked:', url)
                    return None, None
                elif status_code == 401:  # Not authorized.
                    log.error('Invalid GitHub API token: ',
                              self._session.headers['Authorization'])
                    return None, None
                elif status_code == 422:
                    return None, None
                else:
                    log.error('Request for url failed:', url)
                    log.error('Exception:', e)

                # If the status code is 403 (Forbidden), then we may have exceeded our GitHub API quota.
                # In this case, we should verify that the quota was exceeded and, if so, wait until the quota is reset.
                if response is not None and response.status_code == 403:
                    result = response.json()
                    # Check whether GitHub's abuse detection mechanism was triggered.
                    if 'message' in result:
                        if 'abuse detection mechanism' in result['message']:
                            log.warning(
                                'Triggered the GitHub abuse detection mechanism. Sleeping for 1 minute.'
                            )
                            time.sleep(60)
                        if 'Not Found' == result['message']:
                            return None, None

                    quota_exceeded, sleep_duration = self._exceeded_api_quota()
                    if quota_exceeded:
                        # Pick another token.
                        self._create_session()
                time.sleep(retry_back_off)
                retry_count += 1
    def process(self, data: Any, context: dict) -> Optional[Any]:
        """
        Mine all jobs for a repository from the Travis API.

        Downloads the repository's build list and per-build info (each cached
        to a JSON file so reruns skip the download), flattens them into one
        dict per job, and records mining metrics on
        context['mined_project_builder'].

        :param data: Not read by this step.
        :param context: Pipeline context; reads 'repo' and writes metric
                        fields on 'mined_project_builder'.
        :return: The list of job dicts.
        :raises StepException: When a Travis API request fails or no jobs
                               were mined for the repository.
        """
        repo = context['repo']
        travis = TravisWrapper()

        # Reuse cached API results when present; otherwise download and cache.
        builds_json_file = Utils.get_repo_builds_api_result_file(repo)
        builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
        if os.path.isfile(builds_json_file):
            build_list = read_json(builds_json_file)
        else:
            log.info('Getting the list of builds...')
            start_time = time.time()
            try:
                builds = travis.get_builds_for_repo(repo)
            except RequestException:
                error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
                raise StepException(error_message)
            build_list = list(builds)
            write_json(builds_json_file, build_list)
            log.info('Got the list of builds in', time.time() - start_time, 'seconds.')

        if os.path.isfile(builds_info_json_file):
            build_list = read_json(builds_info_json_file)
        else:
            log.info('Downloading build info for',
                     len(build_list),
                     'builds... This step may take several minutes for large repositories.')
            start_time = time.time()
            for idx, build in enumerate(build_list):
                build_id = build['id']
                try:
                    build_info = travis.get_build_info(build_id)
                except RequestException:
                    error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                    raise StepException(error_message)
                build['build_info'] = build_info
                # Periodic progress logging for large repositories.
                if (idx + 1) % 500 == 0:
                    log.info('Downloaded build info for', idx + 1, 'builds so far...')
            write_json(builds_info_json_file, build_list)
            log.info('Downloaded build info in', time.time() - start_time, 'seconds.')

        # Now that we have data from the Travis API, restructure it so it appears as if it came from the database using
        # the following query:
        #   SELECT j.job_id, j.job_number, j.config, j.result,
        #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
        #          c.committed_at, c.compare_at, c.committer_name, c.message
        #   FROM jobs j
        #   LEFT JOIN builds b on b.build_id = j.build_id
        #   LEFT JOIN commits c on b.commit = c.sha
        #   WHERE j.repo_id = "<repo_id>"
        jobs = []
        for build in build_list:
            for job in build['build_info']['matrix']:
                j = {
                    'job_id': job['id'],
                    'job_number': job['number'],
                    'config': job['config'],
                    'result': job['result'],
                    'build_id': build['id'],
                    'number': build['number'],
                    'finished_at': job['finished_at'],
                    'commit': build['commit'],
                    'message': build['message'],
                    'branch': build['branch'],
                    'event_type': build['build_info']['event_type'],
                    'committed_at': build['build_info']['committed_at'],
                    'compare_at': build['build_info']['compare_url'],
                    'committer_name': build['build_info']['committer_name'],
                }
                # Fall back to 'ruby' (Travis' historical default) when the
                # job config does not declare a language.
                if 'language' in job['config']:
                    language = job['config']['language']
                else:
                    log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                    language = 'ruby'
                j['language'] = language
                jobs.append(j)

        # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
        # Do not raise a StepException before the context is populated.
        failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(build_list)
        failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(build_list)
        context['mined_project_builder'].builds = len(build_list)
        context['mined_project_builder'].jobs = len(jobs)
        context['mined_project_builder'].failed_builds = failed_builds
        context['mined_project_builder'].failed_jobs = failed_jobs
        context['mined_project_builder'].failed_pr_builds = failed_pr_builds
        context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs

        if not jobs:
            msg = 'Did not get any jobs for {}.'.format(repo)
            log.warning(msg)
            raise StepException(msg)

        return jobs