def _base_pre_run(self):
    if self.job_center.total_jobs < 1:
        log.info('No jobs to reproduce. Exiting.')
        return
    # Set up the required directories.
    os.makedirs(self.config.orig_logs_dir, exist_ok=True)
    os.makedirs(self.config.output_dir, exist_ok=True)
    self.utils.directories_setup()
    if os.path.isfile(self.utils.get_error_reason_file_path()):
        self.error_reasons = read_json(self.utils.get_error_reason_file_path())
    self.error_reasons = self.manager.dict(self.error_reasons)
    # Check that commands to Travis work.
    if not Utils.is_travis_installed():
        log.error(colored('Commands to Travis are failing unexpectedly. Try restarting your shell and ensure your '
                          'environment is provisioned correctly.', 'red'))
        raise Exception('Unexpected state: Commands to Travis are failing unexpectedly.')
    # Read travis_images.json.
    try:
        self.travis_images = read_json(self.config.travis_images_json)
    except FileNotFoundError:
        log.error(colored(self.config.travis_images_json + ' not found. Exiting.', 'red'))
        raise
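
# A minimal sketch of the Utils.is_travis_installed() check used above. This is an
# assumption: the real helper is not shown here, and its exact command and output
# parsing may differ; the sketch only illustrates "can we invoke the travis CLI at all".
import subprocess

def is_travis_installed_sketch() -> bool:
    try:
        result = subprocess.run(['travis', 'version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return result.returncode == 0
    except FileNotFoundError:
        return False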
def process(self, repo, builds_json_file, builds_info_json_file) -> Optional[Any]:
    # repo = context['repo']
    travis = TravisWrapper()
    if os.path.isfile(builds_json_file):
        build_list = read_json(builds_json_file)
    else:
        log.info('Getting the list of builds...')
        start_time = time.time()
        try:
            builds = travis.get_builds_for_repo(repo)
        except RequestException:
            error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
            raise StepException(error_message)
        build_list = list(builds)
        write_json(builds_json_file, build_list)
        log.info('Got the list of builds in', time.time() - start_time, 'seconds.')
    if os.path.isfile(builds_info_json_file):
        build_list = read_json(builds_info_json_file)
    else:
        log.info('Downloading build info for', len(build_list),
                 'builds... This step may take several minutes for large repositories.')
        start_time = time.time()
        for idx, build in enumerate(build_list):
            build_id = build['id']
            try:
                build_info = travis.get_build_info(build_id)
            except RequestException:
                error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                raise StepException(error_message)
            build['build_info'] = build_info
            if (idx + 1) % 500 == 0:
                log.info('Downloaded build info for', idx + 1, 'builds so far...')
        write_json(builds_info_json_file, build_list)
        log.info('Downloaded build info in', time.time() - start_time, 'seconds.')
def filter_non_exact_images(job_id, log_path):
    """
    Find the exact Docker image used by a job, based on its original log.

    Unlike the pair-level variant of this filter, this single-job version returns the
    matched image tag, or None if no exact image could be identified. It assumes the
    language specified in the Travis configuration does not change between the failed
    and passed builds.
    """
    log.debug('To detect non-exact pairs, we first extract the used images from the original logs.')
    travis_images = read_json(os.path.join(BASE_DIR, 'travis_images.json'))
    provisioned_strs = []
    for language in travis_images:
        provisioned_strs += travis_images[language].values()
    dockerhub_images = read_json(os.path.join(BASE_DIR, 'dockerhub_images.json'))
    no_original_log = 0
    error_reading_original_log = 0
    orig_log_path = os.path.join(log_path, str(job_id) + '.txt')
    if not download_log(job_id, orig_log_path):
        no_original_log += 1
        return None
    image = None
    # Try to find the image by timestamp. If found, return it.
    try:
        chooser = ExactImageChooserByTime(orig_log_path, travis_images, language)
        orig_log_image_provision_timestamp = chooser.find_image_datetime_from_log()
        image = chooser.get_image_tag()
        if image is not None:
            return image
    except OSError:
        # The original log file could not be read.
        error_reading_original_log += 1
    # Try to find the image by tag. If found, return it.
    if not image:
        chooser = ExactImageChooserByTag(orig_log_path)
        image = chooser.get_image_tag()
        if image is not None:
            return image
    # Try to find the image by GCE commit SHA. If found, return it.
    if not image:
        chooser = ExactImageChooserByCommitSHA(orig_log_path, dockerhub_images)
        image = chooser.get_image_tag()
        if image is not None:
            return image
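
# Hypothetical usage of the single-job variant above; the job ID and the log
# directory are made-up values for illustration only.
image_tag = filter_non_exact_images(123456789, '/tmp/orig_logs')
if image_tag:
    print('Job ran on image:', image_tag)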
def clean_bad_json_files(task_name):
    log.info('Cleaning bad JSON files.')
    count = 0
    task_dir = os.path.join(OUTPUT_DIR, task_name)
    for file in os.listdir(task_dir):
        if file.endswith('.json'):
            filepath = os.path.join(task_dir, file)
            try:
                read_json(filepath)
            except (json.decoder.JSONDecodeError, UnicodeDecodeError):
                log.info('Removing', filepath)
                os.remove(filepath)
                count += 1
    log.info('Removed', count, 'bad JSON files.')
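
# These snippets lean on read_json/write_json helpers (imported elsewhere, e.g. from
# bugswarm.common.json). A minimal sketch of the assumed behavior, for reference only;
# the real helpers may differ in formatting and error handling:
import json

def read_json(path):
    # Raises json.decoder.JSONDecodeError (or UnicodeDecodeError) on bad files,
    # which is exactly what clean_bad_json_files() above catches.
    with open(path) as f:
        return json.load(f)

def write_json(path, data):
    with open(path, 'w') as f:
        json.dump(data, f)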
def get_pr_commits_by_parsing_html(self):
    start_time = time.time()
    html_commits_json_file = self.utils.get_html_commits_json_file(self.repo)
    html_commits = {}
    if os.path.isfile(html_commits_json_file):
        html_commits = read_json(html_commits_json_file)
        for _, branch_obj in self.branches.items():
            if branch_obj.pr_num != -1:  # It is a PR branch.
                branch_obj.html_commits = html_commits[str(branch_obj.pr_num)]
    else:
        threads = [threading.Thread(target=self.utils.github.get_pr_commits_by_html,
                                    args=(self.repo, str(branch_obj.pr_num), branch_obj))
                   for _, branch_obj in self.branches.items()]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        for _, branch_obj in self.branches.items():
            if branch_obj.pr_num != -1:  # It is a PR branch.
                html_commits[str(branch_obj.pr_num)] = branch_obj.html_commits
        write_json(html_commits_json_file, html_commits)
    log.info('Got pull request commits (via HTML parsing) in', time.time() - start_time, 'seconds.')
def load_buildpairs(dir_of_jsons: str, repo: str):
    """
    :param dir_of_jsons: A directory containing JSON files of build pairs.
    :param repo: The repo slug, e.g. 'owner/name'.
    :return: A list of build pairs, or None if the file is missing or contains invalid JSON.
    """
    all_buildpairs = []
    task_name = repo.replace('/', '-')
    filename = task_name + '.json'
    try:
        data = read_json(os.path.join(dir_of_jsons, filename))
    except json.decoder.JSONDecodeError:
        log.error('{} contains invalid JSON.'.format(filename))
        return None
    except FileNotFoundError:
        log.error('{} is not found.'.format(filename))
        return None
    all_buildpairs.extend(data)
    if not data:
        log.warning('{} does not contain any build pairs.'.format(filename))
    log.info('Read {} build pairs from {}.'.format(len(all_buildpairs), filename))
    return all_buildpairs
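
# Example call (hypothetical directory and repo slug): load the mined pairs for one repo.
pairs = load_buildpairs('output/result_json', 'alibaba/fastjson')
if pairs is not None:
    print('Loaded {} build pairs.'.format(len(pairs)))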
def get_commit_info_for_virtual_commit(self):
    start_time = time.time()
    virtual_commits_info = {}
    virtual_commits_info_json_file = self.utils.get_virtual_commits_info_json_file(self.repo)
    has_json_file = os.path.isfile(virtual_commits_info_json_file)
    if has_json_file:
        virtual_commits_info = read_json(virtual_commits_info_json_file)
    for _, branch_obj in self.branches.items():
        if not branch_obj.pairs:
            continue
        for pair in branch_obj.pairs:
            builds = [pair.failed_build, pair.passed_build]
            for b in builds:
                if has_json_file:
                    if b.commit in virtual_commits_info:
                        b.virtual_commit_info = virtual_commits_info[b.commit]
                else:
                    c = self.utils.github.get_commit_info(self.repo, b.commit)
                    if c:
                        virtual_commits_info[b.commit] = c
                        b.virtual_commit_info = c
    if not has_json_file:
        write_json(virtual_commits_info_json_file, virtual_commits_info)
    log.info('Got commit info for virtual commits in', time.time() - start_time, 'seconds.')
def _load_jobs_from_pairs_for_repo(self, input_file):
    """
    Read the input file, which should contain mined pairs from the database.
    """
    try:
        buildpairs = read_json(input_file)
    except json.JSONDecodeError:
        log.error('Error reading input file {} in PairCenter. Exiting.'.format(input_file))
        raise
    for bp in buildpairs:
        # For debugging: uncomment these lines to skip non-PR pairs and reproduce only PR pairs.
        # if bp['pr_num'] == -1:
        #     continue
        repo = bp['repo']
        if repo not in self.repos:
            self.repos[repo] = Repo(repo)
            self.uninitialized_repos.put(repo)
        self._append_buildpair_and_jobpair_to_repo(repo, bp)
    self._init_names()
    self.set_skip_of_job_pairs()
    self._init_queue_of_repos()
    # Calculate buildpair and jobpair counts after loading from the file.
    self._calc_num_total_buildpairs()
    self._calc_num_total_jobpairs()
    self._calc_num_total_jobs()
    log.debug('pair_center.total_buildpairs =', self.total_buildpairs,
              'pair_center.total_jobpairs =', self.total_jobpairs,
              'pair_center.total_jobs =', self.total_jobs)
def get_commits_from_github_api(self):
    start_time = time.time()
    github_commits = {}
    get_github_commits = True
    github_commits_json_file = self.utils.get_github_commits_json_file(self.repo)
    if os.path.isfile(github_commits_json_file):
        github_commits = read_json(github_commits_json_file)
        get_github_commits = False
    for _, branch_obj in self.branches.items():
        if branch_obj.pr_num != -1:  # It is a PR branch.
            # Get commits from the GitHub API.
            if get_github_commits:
                github_commits[str(branch_obj.pr_num)] = self.utils.github.list_pr_commits(
                    self.repo, str(branch_obj.pr_num))
            branch_obj.github_commits = github_commits[str(branch_obj.pr_num)]
            # for commit in github_commits[str(branch_obj.pr_num)]:
            #     commit['build_ids'] = self.utils.github.get_build_ids_for_commit(self.repo, commit['sha'])
    write_json(github_commits_json_file, github_commits)
    log.info('Got pull request commits (via GitHub API calls) in', time.time() - start_time, 'seconds.')
def get_stats(output_file_path: str) -> Optional[dict]:
    try:
        return read_json(output_file_path)['stats']
    except FileNotFoundError:
        print('Cannot find output file at ' + output_file_path + '. Skipping.')
    except KeyError:
        print(output_file_path, 'does not have the key "stats". Skipping.')
    return None
def get_duration(output_file_path: str) -> Optional[int]:
    try:
        duration_string = read_json(output_file_path)['duration']
        duration = datetime.datetime.strptime(duration_string, '%Hh %Mm %S.%fs')
        delta = datetime.timedelta(hours=duration.hour, minutes=duration.minute, seconds=duration.second)
        return int(delta.total_seconds() // 60)
    except FileNotFoundError:
        print('Cannot find output file at ' + output_file_path + '. Skipping.')
    except KeyError:
        print(output_file_path, 'does not have the key "duration". Skipping.')
    return None
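
# Worked example of the duration format get_duration() expects (made-up value):
# '1h 02m 3.456s' parses with '%Hh %Mm %S.%fs' into hour=1, minute=2, second=3,
# i.e. 3723 seconds, which floor-divides to 62 whole minutes.
import datetime

d = datetime.datetime.strptime('1h 02m 3.456s', '%Hh %Mm %S.%fs')
delta = datetime.timedelta(hours=d.hour, minutes=d.minute, seconds=d.second)
assert int(delta.total_seconds() // 60) == 62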
def get_repr_metadata_dict(task_json_path, repr_metadata_dict):
    buildpairs = read_json(task_json_path)
    for bp in buildpairs:
        for jp in bp['jobpairs']:
            image_tag = bugswarmutils.get_image_tag(bp['repo'], jp['failed_job']['job_id'])
            failed_job = jp['failed_job']
            passed_job = jp['passed_job']
            jobs = [failed_job, passed_job]
            tag_metadata = dict()
            tag_metadata['repo'] = bp['repo']
            build_system = failed_job['orig_result']['tr_build_system'] if failed_job['orig_result'] else ''
            tag_metadata['build_system'] = build_system
            tag_metadata['failed_job'] = {'job_id': jobs[0]['job_id']}
            tag_metadata['passed_job'] = {'job_id': jobs[1]['job_id']}
            repr_metadata_dict[image_tag] = tag_metadata
    return repr_metadata_dict
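
# Hypothetical usage: accumulate metadata across several task JSON files (the paths
# are illustrative, not real output locations).
repr_metadata = {}
for task_json in ['output/task-a.json', 'output/task-b.json']:
    repr_metadata = get_repr_metadata_dict(task_json, repr_metadata)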
def load_buildpairs(dir_of_jsons: str, filename: str):
    """
    :param dir_of_jsons: A directory containing JSON files of build pairs.
    :param filename: The name of the JSON file to read.
    :raises json.decoder.JSONDecodeError: When the passed file contains invalid JSON.
    """
    all_buildpairs = []
    try:
        data = read_json(os.path.join(dir_of_jsons, filename))
    except json.decoder.JSONDecodeError:
        log.error('{} contains invalid JSON.'.format(filename))
        raise
    all_buildpairs.extend(data)
    if not data:
        log.warning('{} does not contain any build pairs.'.format(filename))
    log.info('Read {} build pairs from {}.'.format(len(all_buildpairs), filename))
    return all_buildpairs
def run(self):
    buildpairs = read_json(self.input_file)
    # Only check for skipping if CSV mode is disabled.
    to_insert = []
    for bp in buildpairs:
        for jp in bp['jobpairs']:
            image_tag = Utils.construct_jobpair_image_tag_from_dict(jp, bp['repo'])
            reproduce_successes, _, _ = Packager._calc_stability(jp)
            artifact_exists = Packager._is_artifact_in_db(image_tag)
            if artifact_exists and not reproduce_successes:
                log.info('Artifact', image_tag, 'already exists in the database.')
                continue
            to_insert.append((image_tag, artifact_exists, self._structure_artifact_data(image_tag, bp, jp)))
    if self.csv_mode:
        self._write_csv(to_insert)
    elif not to_insert:
        log.info('Done! No new metadata to insert.')
    else:
        inserts = 0
        errors = 0
        for artifact_data in to_insert:
            (image_tag, artifact_exists, artifact) = artifact_data
            if artifact_exists:
                if Packager._update_artifact(image_tag, artifact):
                    inserts += 1
                else:
                    errors += 1
            elif Packager._insert_artifact(artifact):
                inserts += 1
            else:
                errors += 1
        if errors == 0:
            log.info('Done! Inserted metadata for {} jobpairs with 0 errors.'.format(inserts))
        else:
            log.info('Done! Attempted to insert {} jobpairs into the database. {} insertions succeeded and {} '
                     'encountered an error.'.format(len(to_insert), inserts, errors))
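
# A hedged sketch of what Packager._calc_stability might derive from a jobpair; the
# real static method is not shown above. Assumption: the pair carries a match history
# mapping run -> match value, where a value of 1 means the run reproduced.
def calc_stability_sketch(jp):
    history = jp.get('match_history', {})
    successes = sum(1 for v in history.values() if v == 1)
    return successes, len(history), '{}/{}'.format(successes, len(history))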
def process(self, data: Dict[str, Branch], context: dict) -> Optional[Any]:
    repo = context['repo']
    utils = context['utils']
    branches = data
    # Get the merge state of each pull request.
    log.info('Getting merge state for all pull requests.')
    start_time = time.time()
    pr_list_json_file = utils.get_pr_list_json_file(repo)
    pr_dict = {}
    if os.path.isfile(pr_list_json_file):
        try:
            pr_dict = read_json(pr_list_json_file)
        except ValueError:
            os.remove(pr_list_json_file)
            raise StepException
    else:
        pr_entities = utils.github.list_pull_requests(repo)
        for pr_entity in pr_entities:
            pr_dict[str(pr_entity['number'])] = pr_entity
        write_json(pr_list_json_file, pr_dict)
    for branch_id, branch_obj in branches.items():
        if branch_obj.pr_num != -1:  # The branch is a pull request branch.
            if str(branch_obj.pr_num) in pr_dict:
                branch_obj.merged_at = pr_dict[str(branch_obj.pr_num)]['merged_at']
                branch_obj.base_branch = pr_dict[str(branch_obj.pr_num)]['base']['ref']
                branch_obj.pr_info = pr_dict[str(branch_obj.pr_num)]
    log.debug('Got merge state for all pull requests in', time.time() - start_time, 'seconds.')
    return branches
def process(self, data: Any, context: dict) -> Optional[Any]:
    repo = context['repo']
    travis = TravisWrapper()
    builds_json_file = Utils.get_repo_builds_api_result_file(repo)
    builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
    if os.path.isfile(builds_json_file):
        build_list = read_json(builds_json_file)
    else:
        log.info('Getting the list of builds...')
        start_time = time.time()
        try:
            builds = travis.get_builds_for_repo(repo)
        except RequestException:
            error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
            raise StepException(error_message)
        build_list = list(builds)
        write_json(builds_json_file, build_list)
        log.info('Got the list of builds in', time.time() - start_time, 'seconds.')
    if os.path.isfile(builds_info_json_file):
        build_list = read_json(builds_info_json_file)
    else:
        log.info('Downloading build info for', len(build_list),
                 'builds... This step may take several minutes for large repositories.')
        start_time = time.time()
        for idx, build in enumerate(build_list):
            build_id = build['id']
            try:
                build_info = travis.get_build_info(build_id)
            except RequestException:
                error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                raise StepException(error_message)
            build['build_info'] = build_info
            if (idx + 1) % 500 == 0:
                log.info('Downloaded build info for', idx + 1, 'builds so far...')
        write_json(builds_info_json_file, build_list)
        log.info('Downloaded build info in', time.time() - start_time, 'seconds.')

    # Now that we have data from the Travis API, restructure it so it appears as if it came from the database using
    # the following query:
    #   SELECT j.job_id, j.job_number, j.config, j.result,
    #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
    #          c.committed_at, c.compare_at, c.committer_name, c.message
    #   FROM jobs j
    #   LEFT JOIN builds b on b.build_id = j.build_id
    #   LEFT JOIN commits c on b.commit = c.sha
    #   WHERE j.repo_id = "<repo_id>"
    jobs = []
    for build in build_list:
        for job in build['build_info']['matrix']:
            j = {
                'job_id': job['id'],
                'job_number': job['number'],
                'config': job['config'],
                'result': job['result'],
                'build_id': build['id'],
                'number': build['number'],
                'finished_at': job['finished_at'],
                'commit': build['commit'],
                'message': build['message'],
                'branch': build['branch'],
                'event_type': build['build_info']['event_type'],
                'committed_at': build['build_info']['committed_at'],
                'compare_at': build['build_info']['compare_url'],
                'committer_name': build['build_info']['committer_name'],
            }
            if 'language' in job['config']:
                language = job['config']['language']
            else:
                log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                language = 'ruby'
            j['language'] = language
            jobs.append(j)

    # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
    # Do not raise a StepException before the context is populated.
    failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(build_list)
    failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(build_list)
    context['mined_project_builder'].builds = len(build_list)
    context['mined_project_builder'].jobs = len(jobs)
    context['mined_project_builder'].failed_builds = failed_builds
    context['mined_project_builder'].failed_jobs = failed_jobs
    context['mined_project_builder'].failed_pr_builds = failed_pr_builds
    context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs

    if not jobs:
        msg = 'Did not get any jobs for {}.'.format(repo)
        log.warning(msg)
        raise StepException(msg)

    return jobs
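
# A hedged sketch of the _count_failed_builds counter used above; the real static
# method is not shown here. Assumption: Travis marks a passed job with result == 0,
# and 'event_type' == 'pull_request' identifies PR builds.
def count_failed_builds_sketch(build_list):
    failed, failed_pr = 0, 0
    for build in build_list:
        info = build['build_info']
        if any(job['result'] not in (0, '0') for job in info['matrix']):
            failed += 1
            if info['event_type'] == 'pull_request':
                failed_pr += 1
    return failed, failed_pr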
def _write_output_json(self):
    log.info('Writing output JSON annotated with match history.')
    pairs = read_json(self.input_file)
    # Write default attributes.
    for p in pairs:
        for jp in p['jobpairs']:
            jp['match_history'] = {}
            jp['failed_job']['match_history'] = {}
            jp['passed_job']['match_history'] = {}
            jp['failed_job']['orig_result'] = ''
            jp['passed_job']['orig_result'] = ''
            jp['failed_job']['mismatch_attrs'] = []
            jp['passed_job']['mismatch_attrs'] = []
            jp['failed_job']['pip_patch'] = False
            jp['passed_job']['pip_patch'] = False
    for p in pairs:
        repo = p['repo']
        if repo not in self.pair_center.repos:
            continue
        # Try to find this build pair in the pair center.
        for bp in self.pair_center.repos[repo].buildpairs:
            if p['failed_build']['build_id'] == bp.builds[0].build_id:
                # Found the build pair in the pair center.
                # Optional: Write the buildpair match type.
                # This is not used since we switched to jobpair packaging.
                p['match'] = bp.match.value
                trigger_sha = p['failed_build']['head_sha']
                # Similarly, try to find each job pair of this build pair in the pair center.
                for jp in p['jobpairs']:
                    # For a build that has some jobs filtered and some jobs not filtered,
                    # the filtered jobs cannot be found in the pair center.
                    if jp['is_filtered']:
                        continue
                    found_in_paircenter = False
                    for jobpair in bp.jobpairs:
                        if str(jobpair.jobs[0].job_id) == str(jp['failed_job']['job_id']):
                            found_in_paircenter = True
                            # Write jobpair match history, analyzed results, and mismatched attributes.
                            jp['match_history'] = jobpair.match_history
                            jp['failed_job']['match_history'] = jobpair.failed_job_match_history
                            jp['passed_job']['match_history'] = jobpair.passed_job_match_history
                            jp['failed_job']['orig_result'] = jobpair.jobs[0].orig_result
                            jp['passed_job']['orig_result'] = jobpair.jobs[1].orig_result
                            jp['failed_job']['mismatch_attrs'] = jobpair.jobs[0].mismatch_attrs
                            jp['passed_job']['mismatch_attrs'] = jobpair.jobs[1].mismatch_attrs
                            jp['failed_job']['pip_patch'] = jobpair.jobs[0].pip_patch
                            jp['passed_job']['pip_patch'] = jobpair.jobs[1].pip_patch
                    if not found_in_paircenter:
                        # If not found in the pair center, this jobpair was filtered out.
                        # In this case, we still analyze the original log to get as many attributes as possible.
                        for i in range(2):
                            job_name = 'failed_job' if i == 0 else 'passed_job'
                            job_id = jp[job_name]['job_id']
                            original_log_path = self.utils.get_orig_log_path(job_id)
                            if not download_log(job_id, original_log_path):
                                continue
                            original_result = self.analyzer.analyze_single_log(original_log_path, job_id,
                                                                               trigger_sha, repo)
                            if 'not_in_supported_language' in original_result:
                                continue
                            jp[job_name]['orig_result'] = original_result
                        raise RuntimeError('Unexpected state: Jobpair not found in pair center. Exiting.')
    os.makedirs(self.config.result_json_dir, exist_ok=True)
    filename = self.config.task + '.json'
    filepath = os.path.join(self.config.result_json_dir, filename)
    write_json(filepath, pairs)
def process(self, data: Any, context: dict) -> Optional[Any]:
    repo = context['repo']
    mined_build_exists = False
    lock = Lock()
    with lock:
        travis = TravisWrapper()
        last_mined_build_number = 0
        if context['original_mined_project_metrics']['last_build_mined']['build_number']:
            last_mined_build_number = context['original_mined_project_metrics']['last_build_mined']['build_number']
            mined_build_exists = True
        builds_json_file = Utils.get_repo_builds_api_result_file(repo)
        builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
        if os.path.isfile(builds_json_file):
            build_list = read_json(builds_json_file)
        else:
            log.info('Getting the list of builds...')
            start_time = time.time()
            try:
                if not mined_build_exists:
                    # Get all builds for the project.
                    builds = travis.get_builds_for_repo(repo)
                else:
                    # Get the latest builds and stop mining once we reach our last mined build number.
                    builds = travis.get_builds_for_repo(repo, last_mined_build_number)
            except RequestException:
                error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
                raise StepException(error_message)
            build_list = list(builds)
            write_json(builds_json_file, build_list)
            log.info('Got the list of builds in', time.time() - start_time, 'seconds.')
        if not build_list:
            msg = 'Did not get any new builds for {}.'.format(repo)
            raise StepException(msg)
        if os.path.isfile(builds_info_json_file):
            build_list = read_json(builds_info_json_file)
        else:
            log.info('Downloading build info for', len(build_list),
                     'builds... This step may take several minutes for large repositories.')
            start_time = time.time()
            for idx, build in enumerate(build_list):
                build_id = build['id']
                try:
                    build_info = travis.get_build_info(build_id)
                except RequestException:
                    error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                    raise StepException(error_message)
                build['build_info'] = build_info
                if (idx + 1) % 500 == 0:
                    log.info('Downloaded build info for', idx + 1, 'builds so far...')
            write_json(builds_info_json_file, build_list)
            log.info('Downloaded build info in', time.time() - start_time, 'seconds.')

        # Now that we have data from the Travis API, restructure it so it appears as if it came from the database
        # using the following query:
        #   SELECT j.job_id, j.job_number, j.config, j.result,
        #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
        #          c.committed_at, c.compare_at, c.committer_name, c.message
        #   FROM jobs j
        #   LEFT JOIN builds b on b.build_id = j.build_id
        #   LEFT JOIN commits c on b.commit = c.sha
        #   WHERE j.repo_id = "<repo_id>"
        jobs = []
        leftover_build_list = []
        highest_build_number = 0
        highest_build_number_id = 0
        # 'build_list' contains at least 25 builds because the Travis API response is a page.
        # Always track the highest build number/ID, and skip builds that were mined previously
        # (i.e. those with build_number <= last_mined_build_number).
        for build in build_list:
            build_id = build['id']
            build_number = int(build['number'])
            if build_number > highest_build_number:
                highest_build_number_id = build_id
                highest_build_number = build_number
            if build_number <= last_mined_build_number:
                continue
            for job in build['build_info']['matrix']:
                j = {
                    'job_id': job['id'],
                    'job_number': job['number'],
                    'config': job['config'],
                    'result': job['result'],
                    'build_id': build['id'],
                    'number': build['number'],
                    'finished_at': job['finished_at'],
                    'commit': build['commit'],
                    'message': build['message'],
                    'branch': build['branch'],
                    'event_type': build['build_info']['event_type'],
                    'committed_at': build['build_info']['committed_at'],
                    'compare_at': build['build_info']['compare_url'],
                    'committer_name': build['build_info']['committer_name'],
                }
                if 'language' in job['config']:
                    language = job['config']['language']
                else:
                    log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                    language = 'ruby'
                j['language'] = language
                jobs.append(j)
            leftover_build_list.append(build)
        if not jobs:
            msg = 'Did not get any jobs for {}.'.format(repo)
            # If no jobs are found, set the build_number & build_id metrics to the latest build info we received.
            bugswarmapi = DatabaseAPI(DATABASE_PIPELINE_TOKEN)
            bugswarmapi.set_latest_build_info_metric(repo, highest_build_number, highest_build_number_id)
            raise StepException(msg)

        # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
        # Do not raise a StepException before the context is populated.
        failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(leftover_build_list)
        failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(leftover_build_list)
        metrics = context['original_mined_project_metrics']['progression_metrics']
        context['mined_project_builder'].builds = len(leftover_build_list) + metrics['builds']
        context['mined_project_builder'].jobs = len(jobs) + metrics['jobs']
        context['mined_project_builder'].failed_builds = failed_builds + metrics['failed_builds']
        context['mined_project_builder'].failed_jobs = failed_jobs + metrics['failed_jobs']
        context['mined_project_builder'].failed_pr_builds = failed_pr_builds + metrics['failed_pr_builds']
        context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs + metrics['failed_pr_jobs']
        context['mined_project_builder'].last_build_mined['build_id'] = highest_build_number_id
        context['mined_project_builder'].last_build_mined['build_number'] = highest_build_number
        return jobs
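
# Toy illustration of the incremental-mining skip logic above (made-up numbers):
# with last_mined_build_number = 120, builds 118 and 120 are skipped, builds 123 and
# 121 are mined, and the highest number/ID seen is still tracked across the page.
page = [{'id': 1, 'number': '118'}, {'id': 2, 'number': '123'}, {'id': 3, 'number': '121'}]
last_mined_build_number = 120
mined = [b for b in page if int(b['number']) > last_mined_build_number]
assert [b['number'] for b in mined] == ['123', '121']
assert max(int(b['number']) for b in page) == 123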
import unittest
import sys

from bugswarm.common.json import read_json

sys.path.append('../')
from pair_filter.image_chooser import (ExactImageChooserByCommitSHA, ExactImageChooserByTag,
                                       ExactImageChooserByTime)  # noqa: E402
from pair_filter.constants import DOCKERHUB_IMAGES_JSON, TRAVIS_IMAGES_JSON  # noqa: E402

_DOCKERHUB_IMAGES = read_json(DOCKERHUB_IMAGES_JSON)
_TRAVIS_IMAGES = read_json('../' + TRAVIS_IMAGES_JSON)


class Test(unittest.TestCase):

    def test_match_object_by_commit_sha_1(self):
        log = '566084454-orig.log'
        file_path = 'logs/' + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
        image = chooser.get_image_tag()
        assert image == 'travisci/ci-sardonyx:packer-1558623664-f909ac5'

    def test_match_object_by_commit_sha_2(self):
        log = '520562883-orig.log'
        file_path = 'logs/' + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
        image = chooser.get_image_tag()
        assert image == 'travisci/ci-sardonyx:packer-1558623664-f909ac5'

    def test_match_object_by_commit_sha_3(self):
        log = '100252761-orig.log'
import unittest
import sys

from bugswarm.common.json import read_json

sys.path.append('../')
from pair_filter.image_chooser import ExactImageChooserByCommitSHA, ExactImageChooserByTag  # noqa: E402
from pair_filter.constants import DOCKERHUB_IMAGES_JSON  # noqa: E402

_DOCKERHUB_IMAGES = read_json(DOCKERHUB_IMAGES_JSON)


class Test(unittest.TestCase):

    def test_match_object_by_commit_sha_1(self):
        log = '566084454-orig.log'
        file_path = 'logs/' + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
        image = chooser.get_image_tag()
        assert image == 'travisci/ci-sardonyx:packer-1558623664-f909ac5'

    def test_match_object_by_commit_sha_2(self):
        log = '520562883-orig.log'
        file_path = 'logs/' + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
        image = chooser.get_image_tag()
        assert image == 'travisci/ci-sardonyx:packer-1558623664-f909ac5'

    def test_match_object_by_commit_sha_3(self):
        log = '100252761-orig.log'
        file_path = 'logs/' + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
def _update_mined_project(repo: str, buildpairs: List):
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    file_name = utils.canonical_repo(repo)
    file_path = os.path.join(os.path.dirname(os.path.realpath('.')),
                             'pair-finder/output/original_metrics/{}.json'.format(file_name))
    original_d = read_json(file_path)

    def _key(filter_name: str, pr: bool):
        return 'filtered{}_{}'.format('_pr' if pr else '', filter_name)

    def _unfiltered_key(pr: bool):
        return 'unfiltered{}'.format('_pr' if pr else '')

    d = {
        'filtered_no_sha': 0,
        'filtered_same_commit': 0,
        'filtered_unavailable': 0,
        'filtered_no_original_log': 0,
        'filtered_error_reading_original_log': 0,
        'filtered_no_image_provision_timestamp': 0,
        'filtered_inaccessible_image': 0,
        'unfiltered': 0,
        'filtered_pr_no_sha': 0,
        'filtered_pr_same_commit': 0,
        'filtered_pr_unavailable': 0,
        'filtered_pr_no_original_log': 0,
        'filtered_pr_error_reading_original_log': 0,
        'filtered_pr_no_image_provision_timestamp': 0,
        'filtered_pr_inaccessible_image': 0,
        'unfiltered_pr': 0,
    }
    for bp in buildpairs:
        is_pr = bp['pr_num'] > 0
        d[_unfiltered_key(is_pr)] += utils.count_unfiltered_jobpairs([bp])
        for jp in bp['jobpairs']:
            reason = jp[FILTERED_REASON_KEY]
            if reason == reasons.NO_HEAD_SHA:
                d[_key('no_sha', is_pr)] += 1
            elif reason == reasons.SAME_COMMIT_PAIR:
                d[_key('same_commit', is_pr)] += 1
            elif reason == reasons.NOT_AVAILABLE:
                d[_key('unavailable', is_pr)] += 1
            elif reason == reasons.NO_ORIGINAL_LOG:
                d[_key('no_original_log', is_pr)] += 1
            elif reason == reasons.ERROR_READING_ORIGINAL_LOG:
                d[_key('error_reading_original_log', is_pr)] += 1
            elif reason == reasons.NO_IMAGE_PROVISION_TIMESTAMP:
                d[_key('no_image_provision_timestamp', is_pr)] += 1
            elif reason == reasons.INACCESSIBLE_IMAGE:
                d[_key('inaccessible_image', is_pr)] += 1
    for metric_name, metric_value in d.items():
        try:
            metric_value = metric_value + original_d['progression_metrics'][metric_name]
        except KeyError:
            pass
        if not bugswarmapi.set_mined_project_progression_metric(repo, metric_name, metric_value):
            log.error('Encountered an error while setting a progression metric. Exiting.')
            sys.exit(1)
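
# Standalone restatement of the nested key builders above, with example outputs showing
# how a filter reason and PR flag map onto keys of the metrics dict d:
def _key_example(filter_name, pr):
    return 'filtered{}_{}'.format('_pr' if pr else '', filter_name)

assert _key_example('no_sha', False) == 'filtered_no_sha'
assert _key_example('same_commit', True) == 'filtered_pr_same_commit'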
def filter_non_exact_images(pairs: List) -> Tuple[int, int, int, int]:
    """
    Check if all jobs in this pair (from both the failed and passed build) used images that are available.
    If an image is found to match a job in the pair, it is added to that job.

    This function assumes the language specified in the Travis configuration does not change between the failed
    and passed builds.

    Returns a 4-tuple of filter counts. The tuple members represent the following:
    1. The number of pairs filtered because the original log was not found
    2. The number of pairs filtered because of an error reading the original log
    3. The number of pairs filtered because the original log has no image provision timestamp
    4. The number of pairs filtered because they used a non-exact Docker image
    """
    log.debug('To detect non-exact pairs, we first extract the used images from the original logs.')
    travis_images = read_json(TRAVIS_IMAGES_JSON)
    provisioned_strs = []
    for language in travis_images:
        provisioned_strs += travis_images[language].values()
    dockerhub_images = read_json(DOCKERHUB_IMAGES_JSON)

    filtered = 0
    no_original_log = 0
    error_reading_original_log = 0
    no_image_provision_timestamp = 0
    inaccessible_image = 0
    exact_jobs = 0
    images_we_have = {}
    images_we_dont_have = {}

    log.debug('Analyzing original logs to extract used images.')
    processed = 0
    for p in pairs:
        config = p['failed_build']['jobs'][0]['config']
        # Travis defaults to the Ruby image if the language is not specified.
        # See https://github.com/travis-ci/travis-ci/issues/4895.
        language = config.get('language') or 'ruby'
        # Multiple languages can be specified by using a list. In this case, we take the first language in the list.
        # We should eventually consider supporting the behavior mentioned in
        # https://stackoverflow.com/a/44054333/5007059 if it becomes officially supported.
        if isinstance(language, list):
            language = language[0]
        if language == 'java':
            language = 'jvm'

        for jp in p['jobpairs']:
            # If the job pair has already been filtered, skip it.
            if utils.jobpair_is_filtered(jp):
                continue
            jobs = [jp['failed_job'], jp['passed_job']]
            for j in jobs:
                processed += 1
                job_id = j['job_id']
                orig_log_path = utils.get_orig_log_path(job_id)
                if not download_log(job_id, orig_log_path):
                    no_original_log += 1
                    jp[FILTERED_REASON_KEY] = reasons.NO_ORIGINAL_LOG
                    continue

                # Try to find the image by timestamp. If found, add it to the job pair.
                try:
                    chooser = ExactImageChooserByTime(orig_log_path, travis_images, language)
                    orig_log_image_provision_timestamp = chooser.find_image_datetime_from_log()
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image
                except OSError:
                    # The original log file was not found.
                    error_reading_original_log += 1
                    jp[FILTERED_REASON_KEY] = reasons.ERROR_READING_ORIGINAL_LOG
                    continue
                if not orig_log_image_provision_timestamp:
                    # Jobs older than 01/2015 did not use Docker, so the build log does not contain an image
                    # provision timestamp.
                    no_image_provision_timestamp += 1
                    jp[FILTERED_REASON_KEY] = reasons.NO_IMAGE_PROVISION_TIMESTAMP
                    continue
                # Try to find the image by tag. If found, add it to the job pair.
                if not image:
                    chooser = ExactImageChooserByTag(orig_log_path)
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image
                # Try to find the image by GCE commit SHA. If found, add it to the job pair.
                if not image:
                    chooser = ExactImageChooserByCommitSHA(orig_log_path, dockerhub_images)
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image
                # 'tr_build_image' is the attribute containing the provision timestamp extracted from a build log.
                if orig_log_image_provision_timestamp not in provisioned_strs and image is None:
                    # This image is inaccessible.
                    inaccessible_image += 1
                    if orig_log_image_provision_timestamp not in images_we_dont_have:
                        images_we_dont_have[orig_log_image_provision_timestamp] = 1
                    else:
                        images_we_dont_have[orig_log_image_provision_timestamp] += 1
                    jp[FILTERED_REASON_KEY] = reasons.INACCESSIBLE_IMAGE
                else:
                    exact_jobs += 1
                    if orig_log_image_provision_timestamp not in images_we_have:
                        images_we_have[orig_log_image_provision_timestamp] = 1
                    else:
                        images_we_have[orig_log_image_provision_timestamp] += 1
            if utils.jobpair_is_filtered(jp):
                filtered += 1

    # Print the images we have and do not have and how many times they are used by these jobs.
    log.debug('Stats about images that we have:')
    for k in images_we_have:
        log.debug('{} jobs use an image provisioned on {}.'.format(images_we_have[k], k))
    log.debug('Stats about images that we do not have:')
    for k in images_we_dont_have:
        log.debug('{} jobs use an unavailable image provisioned on {}.'.format(images_we_dont_have[k], k))
    log.debug('Total exact jobs:', exact_jobs)
    log.debug('Total non-exact jobs:', inaccessible_image)
    log.debug('Jobs with missing logs:', no_original_log)
    utils.log_filter_count(filtered, 'jobpairs that use non-exact images')
    return no_original_log, error_reading_original_log, no_image_provision_timestamp, inaccessible_image
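
# A minimal sketch of the utils.jobpair_is_filtered() check relied on above. This is an
# assumption: the real helper is not shown, and the sketch treats a pair as filtered
# once FILTERED_REASON_KEY has been set on it.
def jobpair_is_filtered_sketch(jp: dict) -> bool:
    return bool(jp.get(FILTERED_REASON_KEY))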