def load_buildpairs(dir_of_jsons: str, repo: str):
    """
    Load the build pairs mined for a repository from its JSON file.

    The file name is derived from the repo slug by flattening 'owner/name' to
    'owner-name.json'.

    :param dir_of_jsons: A directory containing JSON files of build pairs.
    :param repo: repo_slug name
    :return: The list of build pairs, or None when the file is missing or contains
             invalid JSON. (The original docstring claimed json.decoder.JSONDecodeError
             was raised; the code actually logs and returns None.)
    """
    all_buildpairs = []
    task_name = repo.replace('/', '-')
    filename = task_name + '.json'
    try:
        data = read_json(os.path.join(dir_of_jsons, filename))
    except json.decoder.JSONDecodeError:
        log.error('{} contains invalid JSON.'.format(filename))
        return None
    except FileNotFoundError:
        log.error('{} is not found.'.format(filename))
        return None
    all_buildpairs.extend(data)
    if not data:
        log.warning('{} does not contain any build pairs.'.format(filename))
    # Removed dead local 'count': it was incremented once and never read.
    log.info('Read {} build pairs from {}.'.format(len(all_buildpairs), filename))
    return all_buildpairs
def debug_when(branch, b, repo, utils):
    """
    Diagnostic helper for trigger-commit assignment.

    Investigates builds where matching on committer datetime yields candidate trigger
    commits even though the HTML-scraping approach found none. Side effect: sets
    b.trigger_commit to the sentinel string 'mock' when such candidates exist.
    NOTE(review): debug-only code — presumably the 'mock' sentinel marks the build for
    later inspection; confirm before reusing this on a production path.
    """
    # DEBUG when matching committer date gets more result than HTML approach
    commits_with_same_datetime = [
        commit['sha'] for commit in branch.github_commits
        if commit['commit']['committer']['date'] == b.committed_at
    ]
    # AssignTriggerCommits.find_trigger_commit_by_matching_datetime(repo, utils, commits_with_same_datetime, b,
    #                                                               checked_commits)
    if not b.trigger_commit and commits_with_same_datetime:
        b.trigger_commit = 'mock'
        print(
            'why got matching committer date but did not find trigger from from html?',
            len(commits_with_same_datetime), commits_with_same_datetime[0])
        for c in commits_with_same_datetime:
            # Ask the GitHub API whether this candidate actually triggered the build.
            if utils.github.is_commit_associated_with_build(
                    repo, c, b.build_id):
                log.warning(
                    'this commit is confirmed to be the trigger but it didnt show up in HTML.'
                )
                AssignTriggerCommits.print_helpful_links_for_debugging(
                    repo, c, branch, b)
                # Re-scrape the PR's commits and check whether the build shows up now.
                utils.github.get_pr_commits_by_html(
                    repo, str(branch.pr_num), branch)
                for commit in branch.html_commits:
                    if str(b.build_id) == branch.html_commits[commit]:
                        log.debug(
                            'Tried again and found the commit in HTML this time.'
                        )
                        break
def _exceeded_api_quota(self) -> Tuple[bool, Optional[int]]:
    """
    Query GitHub's rate-limit endpoint and report whether the core quota is spent.

    :return: A 2-tuple. (True, number of seconds until the quota resets) if the API
             quota has been exceeded. (False, None) otherwise.
    :raises Exception: When an exception is raised by the request.
    """
    log.info('Checking GitHub API quota.')
    response = self._session.get('https://api.github.com/rate_limit')
    try:
        response.raise_for_status()
        payload = response.json()
        if 'resources' in payload:
            core = payload['resources']['core']
            if core['remaining'] <= 0:
                # Time when the quota resets, in UTC epoch seconds.
                resets_at = core['reset']
                log.warning(
                    'GitHub API quota exceeded and will reset at UTC {}.'.
                    format(resets_at))
                # Pad by a few seconds so we are sure to sleep past the reset.
                return True, (resets_at - int(time.time())) + 10
    except Exception as e:
        log.error('Exception while checking API quota:', e)
        raise
    return False, None
def _get(self, address, **kwargs):
    """
    Issue a GET request against the Travis API and return the decoded JSON body.

    Retries on 429 (rate limited): the first retry sleeps for _SLEEP_SECONDS; once
    attempts >= 1 and tokens are configured, subsequent 429s rotate through the
    _TOKENS deque instead of sleeping.
    NOTE(review): the rotation branch never sleeps, so if every token is rate-limited
    this loop hammers the API back-to-back. Also the Authorization header is set to
    _TOKENS[0] *before* the deque is rotated, so the first switch re-uses the current
    head token — confirm this offset-by-one rotation is intended.

    :param address: Travis API URL to request.
    :param kwargs: Forwarded as query-string parameters.
    :raises requests.exceptions.HTTPError: Via raise_for_status() on a 404 response.
    :raises requests.exceptions.ConnectionError: For any other non-200 status code.
    """
    sleep_seconds = _SLEEP_SECONDS
    attempts = 0
    while True:
        response = self._session.get(address, params=kwargs)
        code = response.status_code
        if code == 200:
            return response.json()
        elif code == 404:
            log.error('Get request for {} returned 404 Not Found.'.format(
                address))
            response.raise_for_status()
        elif code == 429:
            if attempts < 1 or not _TOKENS:
                log.warning(
                    'The Travis API returned status code 429 Too Many Requests. '
                    'Retrying after sleeping for {} seconds.'.format(
                        sleep_seconds))
                time.sleep(sleep_seconds)
                attempts += 1
            else:
                # Use another token if # of attempts for GET Requests >= 1, will use next token in list
                # deque.pop() removes element from the right so we appendleft()
                self._session.headers['Authorization'] = 'token {}'.format(
                    _TOKENS[0])
                _TOKENS.appendleft(_TOKENS.pop())
        else:
            log.error('Get request for {} returned {}.'.format(
                address, code))
            raise requests.exceptions.ConnectionError(
                '{} download failed. Error code is {}.'.format(
                    address, code))
def check_docker_disk_space_available(self, docker_storage_path):
    """
    Return True when the Docker storage volume has at least the configured amount of
    free space (or when the disk check is disabled via config).
    """
    if self.config.skip_check_disk:
        return True
    _, _, free_bytes = shutil.disk_usage(docker_storage_path)
    if free_bytes >= self.config.docker_disk_space_requirement:
        return True
    free_gib = str(round(free_bytes / 1024**3, 2))
    log.warning('Inadequate disk space available for storing Docker Images: {} GiB.'.format(free_gib))
    return False
def check_disk_space_available(self):
    """
    Return True when the current working directory's volume has at least the configured
    amount of free space (or when the disk check is disabled via config).
    """
    if self.config.skip_check_disk:
        return True
    _, _, free_bytes = shutil.disk_usage('.')
    if free_bytes >= self.config.disk_space_requirement:
        return True
    free_gib = str(round(free_bytes / 1024**3, 2))
    log.warning('Inadequate disk space available for reproducing: {} GiB.'.format(free_gib))
    return False
def check_docker_disk_space_available(self, docker_storage_path):
    """Return True when at least 30% of the Docker storage volume is still free."""
    total_bytes, _, free_bytes = shutil.disk_usage(docker_storage_path)
    fraction_free = free_bytes / total_bytes
    if fraction_free >= 0.3:
        return True
    percent = str(round(fraction_free * 100, 2))
    log.warning(
        'Inadequate disk space available for storing Docker Images: {}%.'
        .format(percent))
    return False
def check_disk_space_available():
    """Return True when at least 20% of the current working directory's volume is free."""
    total_bytes, _, free_bytes = shutil.disk_usage('.')
    fraction_free = free_bytes / total_bytes
    if fraction_free >= 0.2:
        return True
    percent = str(round(fraction_free * 100, 2))
    log.warning(
        'Inadequate disk space available for reproducing: {}%.'.format(
            percent))
    return False
def push_image(self, image_tag):
    """
    Push the image tagged image_tag to Docker Hub and, when configured, to the private
    Docker registry.

    :param image_tag: Tag of the image to push.
    :raises ValueError: When the Docker daemon reports an API error during a push.
    """
    def _push_and_report(repository, auth_config, destination):
        # One push attempt; logs the final status line the daemon reports.
        try:
            result = self.client.images.push(
                repository,
                tag=image_tag,
                stream=False,
                auth_config=auth_config)
            # With stream=False the daemon's JSON progress lines come back as one
            # string; only the last line carries the final status or error.
            last_line = result.splitlines()[-1]
            dictionary = ast.literal_eval(last_line)
            if "error" in dictionary.keys():
                log.error('Error: ', dictionary.get('error'))
            elif "status" in dictionary.keys():
                log.info('Status: ', dictionary.get('status'))
        except docker.errors.APIError:
            raise ValueError(
                'Encountered a Docker API error while pushing a Docker image to {}.'.format(destination)
            )
        except KeyboardInterrupt:
            log.error(
                'Caught a KeyboardInterrupt while pushing a Docker image to {}.'.format(destination)
            )

    # Push to Docker Hub
    _push_and_report(DOCKER_HUB_REPO, self.docker_hub_auth_config, 'Docker Hub')

    # Push to Registry
    if not DOCKER_REGISTRY_REPO:
        log.warning(
            'Docker Private Registry info not being set. Skipping.')
        return
    # Fix: the original pushed DOCKER_HUB_REPO here as well (copy-paste bug). docker-py
    # addresses a registry via the repository name, so the private-registry push must
    # use DOCKER_REGISTRY_REPO.
    _push_and_report(DOCKER_REGISTRY_REPO, self.docker_registry_auth_config, 'Docker Registry')
def _get_log_from_url(log_url: str, max_retries: int, retry_count: int = 0):
    """
    Download a build log, retrying when the server resets the connection.

    :param log_url: URL of the log to download.
    :param max_retries: Maximum number of retries after ConnectionResetError.
    :param retry_count: Current retry depth (internal; callers use the default).
    :return: The raw log bytes, or None if the download failed.
    """
    sleep_duration = 3  # Seconds.
    try:
        with urllib.request.urlopen(log_url) as url:
            result = url.read()
            log.info('Downloaded log from {}.'.format(log_url))
            return result
    except URLError:
        log.info('Could not download log from {}.'.format(log_url))
        return None
    except ConnectionResetError:
        if retry_count == max_retries:
            log.info('Could not download log from', log_url, 'after retrying', max_retries, 'times.')
            return None
        log.warning('The server reset the connection. Retrying after', sleep_duration, 'seconds.')
        time.sleep(sleep_duration)
        # Fix: the original dropped the recursive call's result, so every retried
        # download returned None even when the retry succeeded.
        return _get_log_from_url(log_url, max_retries, retry_count + 1)
def _get(self, address, **kwargs):
    """
    Issue a GET request against the Travis API and return the decoded JSON body.

    Retries 429 responses with exponential backoff, doubling the sleep up to
    _MAX_SLEEP_SECONDS.

    :param address: Travis API URL to request.
    :param kwargs: Forwarded as query-string parameters.
    :raises requests.exceptions.HTTPError: Via raise_for_status() on a 404 response.
    :raises requests.exceptions.ConnectionError: For any other non-200 status code.
    """
    backoff = _SLEEP_SECONDS
    while True:
        response = self._session.get(address, params=kwargs)
        status = response.status_code
        if status == 200:
            return response.json()
        if status == 404:
            log.error('Get request for {} returned 404 Not Found.'.format(address))
            response.raise_for_status()
        elif status == 429:
            log.warning(
                'The Travis API returned status code 429 Too Many Requests. '
                'Retrying after sleeping for {} seconds.'.format(backoff))
            time.sleep(backoff)
            backoff = min(backoff * 2, _MAX_SLEEP_SECONDS)
        else:
            log.error('Get request for {} returned {}.'.format(address, status))
            raise requests.exceptions.ConnectionError('{} download failed. Error code is {}.'.format(address, status))
def load_buildpairs(dir_of_jsons: str, filename: str):
    """
    Load build pairs from a single JSON file inside dir_of_jsons.

    :param dir_of_jsons: A directory containing JSON files of build pairs.
    :param filename: the name of json file
    :raises json.decoder.JSONDecodeError: When the passed directory contains JSON files
                                          with invalid JSON.
    """
    path = os.path.join(dir_of_jsons, filename)
    try:
        data = read_json(path)
    except json.decoder.JSONDecodeError:
        log.error('{} contains invalid JSON.'.format(filename))
        raise
    all_buildpairs = list(data)
    if not data:
        log.warning(
            '{} does not contain any build pairs.'.format(filename))
    log.info('Read {} build pairs from {}.'.format(len(all_buildpairs), filename))
    return all_buildpairs
def _get_specific_language_analyzer(self, primary_language, lines, folds, job_id, build_system, trigger_sha, repo, force): # Update this function to extend to other languages. lang = str(primary_language.lower()) use_java = ['java', 'scala', 'groovy', 'clojure'] if force: log.warning('Forcing Java analyzer') return self._get_java_analyzer('java', lines, folds, job_id, build_system, trigger_sha, repo) if lang == 'ruby': # return RubyLogFileAnalyzer(log, folds) return None elif lang in use_java: return self._get_java_analyzer(primary_language, lines, folds, job_id, build_system, trigger_sha, repo) elif lang == 'python': return PythonLogFileAnalyzer(primary_language, folds, job_id) else: # log.warning('No primary language detected. lang =', lang) return None
def _create_session(self):
    """
    Create a requests session authorized with a token that has remaining API quota.

    When the quota is exceeded for a token, the program will switch to another token
    and attempt to continue. If the quota is exceeded for all tokens, the program will
    wait for the token with the lowest wait time.

    :raises RuntimeError: When no token could be chosen (only possible with an empty
                          token list).
    """
    # Fix: was 9999 — if every token's reset were further than 9999 seconds away, no
    # token would be chosen and a spurious RuntimeError was raised despite tokens
    # existing. float('inf') guarantees the minimum-wait token is always selected.
    min_wait_time = float('inf')
    chosen_token = None
    updated_token = copy.deepcopy(self._tokens)
    for t in self._tokens:
        self._session = cachecontrol.CacheControl(requests.Session())
        self._session.headers['Authorization'] = 'token %s' % t
        has_wait, wait_time = self._exceeded_api_quota()
        if not has_wait:
            chosen_token = t
            min_wait_time = 0
            # if a token is chosen, move it to the end
            updated_token.append(t)
            del updated_token[updated_token.index(t)]
            break
        if wait_time < min_wait_time:
            min_wait_time = wait_time
            chosen_token = t
            # if a token is chosen, move it to the end
            updated_token.append(t)
            del updated_token[updated_token.index(t)]
    self._tokens = updated_token
    if not chosen_token:
        raise RuntimeError(
            'Unexpected state: No GitHub token chosen in github.py.')
    log.debug('Chose token {}.'.format(chosen_token))
    if min_wait_time:
        # Sleep until the quota is reset. See https://developer.github.com/v3/#rate-limiting for more information.
        log.warning('Sleeping until the GitHub API quota is reset in',
                    min_wait_time / 60, 'minutes.')
        time.sleep(min_wait_time)
    self._session = cachecontrol.CacheControl(requests.Session())
    self._session.headers['Authorization'] = 'token %s' % chosen_token
def get(self, url: str):
    """
    Request a URL from the GitHub API. Handles retrying, waiting for quota to reset,
    and token switching.

    :param url: The GitHub API URL to request.
    :return: A 2-tuple of the resulting response and the JSON representation of the
             response body. If there was a problem, the returned tuple is (None, None).
    :raises TypeError: When url is not a string.
    :raises ValueError: When url is not a GitHub API URL.
    """
    if not isinstance(url, str):
        raise TypeError('The provided URL must be a string.')
    if urlparse(url).netloc != 'api.github.com':
        raise ValueError('The provided URL is not for the GitHub API.')
    retry_back_off = 5  # Seconds.
    retry_count = 0
    while True:
        response = None
        try:
            response = self._session.get(url)
            response.raise_for_status()
            if not response.text:
                # An empty body cannot be parsed as JSON.
                return None, None
            return response, response.json()
        except Exception as e:
            # If the exception is a connection error, the server may have dropped the
            # connection. In this case, we should try resetting the session.
            # Fix: the original used `e is requests.ConnectionError`, which compares an
            # instance to a class and is always False — the session was never recreated.
            if isinstance(e, requests.ConnectionError):
                log.info('Recreating session.')
                self._create_session()
            # Fix: the original dereferenced response.status_code unconditionally; when
            # the request itself raised (response still None), that AttributeError
            # escaped the retry loop instead of being handled.
            status = response.status_code if response is not None else None
            if status == 404:
                return None, None
            elif status == 451:  # Repository access blocked.
                log.error('Repository access blocked:', url)
                return None, None
            elif status == 401:  # Not authorized.
                log.error('Invalid GitHub API token: ',
                          self._session.headers['Authorization'])
                return None, None
            elif status == 422:
                return None, None
            else:
                log.error('Request for url failed:', url)
                log.error('Exception:', e)
            # If the status code is 403 (Forbidden), then we may have exceeded our
            # GitHub API quota. In this case, we should verify that the quota was
            # exceeded and, if so, wait until the quota is reset.
            if status == 403:
                result = response.json()
                # Check whether GitHub's abuse detection mechanism was triggered.
                if 'message' in result:
                    if 'abuse detection mechanism' in result['message']:
                        log.warning(
                            'Triggered the GitHub abuse detection mechanism. Sleeping for 1 minute.'
                        )
                        time.sleep(60)
                    if 'Not Found' == result['message']:
                        return None, None
                quota_exceeded, sleep_duration = self._exceeded_api_quota()
                if quota_exceeded:
                    # Pick another token.
                    self._create_session()
            time.sleep(retry_back_off)
            retry_count += 1
def process(self, data: Any, context: dict) -> Optional[Any]:
    """
    Pipeline step: fetch all builds and their build info for context['repo'] from the
    Travis API, then flatten them into one dict per job.

    Both API passes are cached on disk (builds_json_file / builds_info_json_file); when
    a cache file exists it is read instead of hitting the API, so re-runs resume cheaply.

    :param data: Unused by this step.
    :param context: Dict with 'repo' and a 'mined_project_builder' whose mining metrics
                    this step populates.
    :return: The list of job dicts.
    :raises StepException: When a Travis API request fails or no jobs were found.
    """
    repo = context['repo']
    travis = TravisWrapper()
    builds_json_file = Utils.get_repo_builds_api_result_file(repo)
    builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
    # Pass 1: the list of builds (cached).
    if os.path.isfile(builds_json_file):
        build_list = read_json(builds_json_file)
    else:
        log.info('Getting the list of builds...')
        start_time = time.time()
        try:
            builds = travis.get_builds_for_repo(repo)
        except RequestException:
            error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
            raise StepException(error_message)
        build_list = list(builds)
        write_json(builds_json_file, build_list)
        log.info('Got the list of builds in', time.time() - start_time, 'seconds.')
    # Pass 2: per-build detail, attached as build['build_info'] (cached).
    if os.path.isfile(builds_info_json_file):
        build_list = read_json(builds_info_json_file)
    else:
        log.info('Downloading build info for', len(build_list),
                 'builds... This step may take several minutes for large repositories.')
        start_time = time.time()
        for idx, build in enumerate(build_list):
            build_id = build['id']
            try:
                build_info = travis.get_build_info(build_id)
            except RequestException:
                error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                raise StepException(error_message)
            build['build_info'] = build_info
            # Progress heartbeat for large repositories.
            if (idx + 1) % 500 == 0:
                log.info('Downloaded build info for', idx + 1, 'builds so far...')
        write_json(builds_info_json_file, build_list)
        log.info('Downloaded build info in', time.time() - start_time, 'seconds.')
    # Now that we have data from the Travis API, restructure it so it appears as if it came from the database using
    # the following query:
    # SELECT j.job_id, j.job_number, j.config, j.result,
    #        b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
    #        c.committed_at, c.compare_at, c.committer_name, c.message
    # FROM jobs j
    # LEFT JOIN builds b on b.build_id = j.build_id
    # LEFT JOIN commits c on b.commit = c.sha
    # WHERE j.repo_id = "<repo_id>"
    jobs = []
    for build in build_list:
        for job in build['build_info']['matrix']:
            j = {
                'job_id': job['id'],
                'job_number': job['number'],
                'config': job['config'],
                'result': job['result'],
                'build_id': build['id'],
                'number': build['number'],
                'finished_at': job['finished_at'],
                'commit': build['commit'],
                'message': build['message'],
                'branch': build['branch'],
                'event_type': build['build_info']['event_type'],
                'committed_at': build['build_info']['committed_at'],
                'compare_at': build['build_info']['compare_url'],
                'committer_name': build['build_info']['committer_name'],
            }
            if 'language' in job['config']:
                language = job['config']['language']
            else:
                # NOTE(review): 'ruby' is Travis's historical default language; this
                # mirrors that assumption — confirm it still matches the API's behavior.
                log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                language = 'ruby'
            j['language'] = language
            jobs.append(j)
    # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
    # Do not raise a StepException before the context is populated.
    failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(build_list)
    failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(build_list)
    context['mined_project_builder'].builds = len(build_list)
    context['mined_project_builder'].jobs = len(jobs)
    context['mined_project_builder'].failed_builds = failed_builds
    context['mined_project_builder'].failed_jobs = failed_jobs
    context['mined_project_builder'].failed_pr_builds = failed_pr_builds
    context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs
    if not jobs:
        msg = 'Did not get any jobs for {}.'.format(repo)
        log.warning(msg)
        raise StepException(msg)
    return jobs