def get_pr_commits_by_parsing_html(self):
    start_time = time.time()
    html_commits_json_file = self.utils.get_html_commits_json_file(self.repo)
    html_commits = {}
    if os.path.isfile(html_commits_json_file):
        html_commits = read_json(html_commits_json_file)
        for _, branch_obj in self.branches.items():
            if branch_obj.pr_num != -1:  # If it is a PR branch.
                branch_obj.html_commits = html_commits[str(branch_obj.pr_num)]
    else:
        # Fetch the commits for each PR branch concurrently. Non-PR branches (pr_num == -1)
        # are skipped, matching the filter in the write-back loop below.
        threads = [
            threading.Thread(
                target=self.utils.github.get_pr_commits_by_html,
                args=(self.repo, str(branch_obj.pr_num), branch_obj))
            for _, branch_obj in self.branches.items() if branch_obj.pr_num != -1
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        for _, branch_obj in self.branches.items():
            if branch_obj.pr_num != -1:  # If it is a PR branch.
                # Key by the stringified PR number so lookups match the read path above.
                html_commits[str(branch_obj.pr_num)] = branch_obj.html_commits
        write_json(html_commits_json_file, html_commits)
    log.info('Got pull request commits (via HTML parsing) in', time.time() - start_time, 'seconds.')
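# A minimal, standalone sketch of the start/join thread fan-out used above. The fetch
# function, PR numbers, and payloads are hypothetical stand-ins for
# utils.github.get_pr_commits_by_html and the real branch objects; each thread writes its
# result into a shared dict keyed by PR number (safe here because the keys are distinct).
import threading

def fetch_pr_commits(pr_num, results):
    results[str(pr_num)] = ['sha-for-pr-{}'.format(pr_num)]  # Placeholder payload.

results = {}
threads = [threading.Thread(target=fetch_pr_commits, args=(pr, results)) for pr in (101, 202)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(results)  # {'101': ['sha-for-pr-101'], '202': ['sha-for-pr-202']}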
def get_commit_info_for_virtual_commit(self):
    start_time = time.time()
    virtual_commits_info = {}
    virtual_commits_info_json_file = self.utils.get_virtual_commits_info_json_file(self.repo)
    has_json_file = os.path.isfile(virtual_commits_info_json_file)
    if has_json_file:
        virtual_commits_info = read_json(virtual_commits_info_json_file)
    for _, branch_obj in self.branches.items():
        if not branch_obj.pairs:
            continue
        for pair in branch_obj.pairs:
            builds = [pair.failed_build, pair.passed_build]
            for b in builds:
                if has_json_file:
                    if b.commit in virtual_commits_info:
                        b.virtual_commit_info = virtual_commits_info[b.commit]
                else:
                    c = self.utils.github.get_commit_info(self.repo, b.commit)
                    if c:
                        virtual_commits_info[b.commit] = c
                        b.virtual_commit_info = c
    if not has_json_file:
        write_json(virtual_commits_info_json_file, virtual_commits_info)
    log.info('Got commit info for virtual commits in', time.time() - start_time, 'seconds.')
def get_commits_from_github_api(self):
    start_time = time.time()
    github_commits = {}
    get_github_commits = True
    github_commits_json_file = self.utils.get_github_commits_json_file(self.repo)
    if os.path.isfile(github_commits_json_file):
        github_commits = read_json(github_commits_json_file)
        get_github_commits = False
    for _, branch_obj in self.branches.items():
        if branch_obj.pr_num != -1:  # Whether it is a PR branch.
            # Get commits from the GitHub API unless they were already cached on disk.
            if get_github_commits:
                github_commits[str(branch_obj.pr_num)] = self.utils.github.list_pr_commits(
                    self.repo, str(branch_obj.pr_num))
            branch_obj.github_commits = github_commits[str(branch_obj.pr_num)]
            # for commit in github_commits[str(branch_obj.pr_num)]:
            #     commit['build_ids'] = self.utils.github.get_build_ids_for_commit(self.repo, commit['sha'])
    write_json(github_commits_json_file, github_commits)
    log.info('Got pull request commits (via GitHub API calls) in', time.time() - start_time, 'seconds.')
def process(self, repo, builds_json_file, builds_info_json_file) -> Optional[Any]:
    travis = TravisWrapper()
    if os.path.isfile(builds_json_file):
        build_list = read_json(builds_json_file)
    else:
        log.info('Getting the list of builds...')
        start_time = time.time()
        try:
            builds = travis.get_builds_for_repo(repo)
        except RequestException:
            error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
            raise StepException(error_message)
        build_list = list(builds)
        write_json(builds_json_file, build_list)
        log.info('Got the list of builds in', time.time() - start_time, 'seconds.')
    if os.path.isfile(builds_info_json_file):
        build_list = read_json(builds_info_json_file)
    else:
        log.info('Downloading build info for', len(build_list),
                 'builds... This step may take several minutes for large repositories.')
        start_time = time.time()
        for idx, build in enumerate(build_list):
            build_id = build['id']
            try:
                build_info = travis.get_build_info(build_id)
            except RequestException:
                error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                raise StepException(error_message)
            build['build_info'] = build_info
            if (idx + 1) % 500 == 0:
                log.info('Downloaded build info for', idx + 1, 'builds so far...')
        write_json(builds_info_json_file, build_list)
        log.info('Downloaded build info in', time.time() - start_time, 'seconds.')
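# The steps above all share a read-through cache pattern: reuse an on-disk JSON file when it
# exists, otherwise fetch the data and persist it for the next run. A minimal standalone
# sketch of that pattern; the cache path and fetch callable are hypothetical, not part of the
# pipeline's API.
import json
import os

def read_or_fetch(cache_path, fetch):
    # Reuse the cached result if it exists.
    if os.path.isfile(cache_path):
        with open(cache_path) as f:
            return json.load(f)
    # Otherwise fetch and persist for subsequent runs.
    data = fetch()
    with open(cache_path, 'w') as f:
        json.dump(data, f)
    return data

# Usage: read_or_fetch('builds.json', lambda: [{'id': 1}]) fetches once, then reuses the file.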
def generate_build_pair_json(repo):
    log.info('Getting build_pair from Database')
    dir_of_jsons = 'input/'
    task_name = repo.replace('/', '-')
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    buildpairs = bugswarmapi.filter_mined_build_pairs_for_repo(repo)
    os.makedirs(os.path.dirname(dir_of_jsons), exist_ok=True)
    write_json('{}{}.json'.format(dir_of_jsons, task_name), buildpairs)
    return dir_of_jsons
def main(argv=None):
    argv = argv or sys.argv
    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))
    output_path, repo, failed_job_id, passed_job_id = _validate_input(argv)
    log.info('Choosing pairs from {}.'.format(repo))
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    buildpairs = bugswarmapi.filter_mined_build_pairs_for_repo(repo)
    if not buildpairs:
        log.error('No mined build pairs exist in the database for {}. Exiting.'.format(repo))
        return 1
    filename = 'artifacts_for_comparing.json'
    if not os.path.isfile(filename):
        artifacts = bugswarmapi.list_artifacts()
        _create_static_artifacts_file(filename, artifacts)
    with open(filename, 'r') as file:
        artifacts = json.load(file)
    filtered_buildpairs = []
    filtered_jobpair_count = 0
    for bp in buildpairs:
        filtered_jobpairs = []
        for jp in bp['jobpairs']:
            if should_include_jobpair(jp, failed_job_id, passed_job_id):
                if not is_jp_unique(repo, jp, artifacts):
                    continue
                filtered_jobpairs.append(jp)
                filtered_jobpair_count += 1
        if filtered_jobpairs:
            bp['jobpairs'] = filtered_jobpairs
            filtered_buildpairs.append(bp)
    # Create any missing path components to the output file.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Write the output file.
    write_json(output_path, filtered_buildpairs)
    bp_pluralized = 'buildpair' if len(filtered_buildpairs) == 1 else 'buildpairs'
    jp_pluralized = 'jobpair' if filtered_jobpair_count == 1 else 'jobpairs'
    log.info('Wrote {} {} with {} {} to {}.'.format(
        len(filtered_buildpairs), bp_pluralized, filtered_jobpair_count, jp_pluralized, output_path))
    log.info('Done!')
def gen_script(utils, job, dependence_solver):
    """
    Invoke travis-build to generate the build script.
    """
    build_sh = os.path.join('reproduce_tmp', job.job_id + '.sh')
    reproducing_dir = utils.get_reproducing_repo_dir(job)
    if dependence_solver:
        from bugswarm.dependency_solver.dependency_solver import fix_dict
        pip_patch_result = os.path.join(utils.get_jobpair_dir(job), '{}-pip-patch.json'.format(job.job_id))
        commit_time = job.build.commit_time
        if not commit_time:
            github_wrapper = GitHubWrapper(GITHUB_TOKENS)
            _, commit_json = github_wrapper.get(
                'https://api.github.com/repos/{}/git/commits/{}'.format(job.repo, job.travis_merge_sha))
            commit_time = commit_json['committer']['date']
        yaml_path = os.path.join(reproducing_dir, '.travis.yml')
        yaml_dict = job.config
        fixed_yaml_dict, pip_patch, apt_patch = fix_dict(reproducing_dir, yaml_dict, commit_time)
        with open(yaml_path, 'w+') as f:
            yaml.dump(fixed_yaml_dict, f)
        if pip_patch:
            write_json(pip_patch_result, pip_patch)
        # Updated travis compile path based on https://github.com/travis-ci/travis-build/pull/1137.
        travis_command = '~/.travis/travis-build/bin/travis compile > {}'.format(build_sh)
    else:
        # The default travis compile command should include the build number and job number to resolve the matrix.
        travis_command = '~/.travis/travis-build/bin/travis compile {} > {}'.format(job.build_job, build_sh)
    cd_command = 'cd {}'.format(reproducing_dir)
    _, stderr, returncode = ShellWrapper.run_commands(cd_command, travis_command,
                                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    if returncode != 0:
        raise ReproduceError(
            'Encountered an error while generating the build script with travis-build: {}.'.format(stderr))
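# ShellWrapper.run_commands is defined elsewhere in the component. Judging only from how it is
# called above (several commands combined and run in one shell, returning stdout, stderr, and
# the return code), a plausible stand-in looks like the sketch below. This is an assumption
# about its behavior, not the real API.
import subprocess

def run_commands(*commands, **kwargs):
    # Chain the commands so later ones run in the earlier ones' context, e.g.
    # 'cd <reproducing_dir> && ~/.travis/travis-build/bin/travis compile ...'.
    combined = ' && '.join(commands)
    proc = subprocess.run(combined, **kwargs)  # The caller passes shell=True and the PIPEs.
    return proc.stdout, proc.stderr, proc.returncode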
def generate_build_pair_json(repo, orig_file=None):
    log.info('Getting build_pair from Database')
    dir_of_jsons = 'input/'
    task_name = repo.replace('/', '-')
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    buildpairs = bugswarmapi.filter_mined_build_pairs_for_repo(repo)
    if orig_file is not None:
        log.info('Filtering build pairs using local file', orig_file)
        with open(orig_file) as f:
            local_bps = json.load(f)
        # We use orig_file as a filter (instead of just copying it to dir_of_jsons) because we need the `_id`
        # attribute to patch our results to the database, and the JSON in orig_file does not have that attribute.
        bp_filter = [_build_pair_id_tuple(bp) for bp in local_bps]
        buildpairs = [bp for bp in buildpairs if _build_pair_id_tuple(bp) in bp_filter]
    os.makedirs(os.path.dirname(dir_of_jsons), exist_ok=True)
    write_json('{}{}.json'.format(dir_of_jsons, task_name), buildpairs)
    return dir_of_jsons
def process(self, data: Dict[str, Branch], context: dict) -> Optional[Any]:
    repo = context['repo']
    utils = context['utils']
    branches = data
    # Get the merge state of each pull request.
    log.info('Getting merge state for all pull requests.')
    start_time = time.time()
    pr_list_json_file = utils.get_pr_list_json_file(repo)
    pr_dict = {}
    if os.path.isfile(pr_list_json_file):
        try:
            pr_dict = read_json(pr_list_json_file)
        except ValueError:
            # The cached file is unreadable; remove it so the next run re-downloads the list.
            os.remove(pr_list_json_file)
            raise StepException
    else:
        pr_entities = utils.github.list_pull_requests(repo)
        for pr_entity in pr_entities:
            pr_dict[str(pr_entity['number'])] = pr_entity
        write_json(pr_list_json_file, pr_dict)
    for branch_id, branch_obj in branches.items():
        if branch_obj.pr_num != -1:  # Whether the branch is a pull request branch.
            if str(branch_obj.pr_num) in pr_dict:
                branch_obj.merged_at = pr_dict[str(branch_obj.pr_num)]['merged_at']
                branch_obj.base_branch = pr_dict[str(branch_obj.pr_num)]['base']['ref']
                branch_obj.pr_info = pr_dict[str(branch_obj.pr_num)]
    log.debug('Got merge state for all pull requests in', time.time() - start_time, 'seconds.')
    return branches
def _write_output_json(self):
    log.info('Writing output JSON annotated with match history.')
    pairs = read_json(self.input_file)
    # Write default attributes.
    for p in pairs:
        for jp in p['jobpairs']:
            jp['match_history'] = {}
            jp['failed_job']['match_history'] = {}
            jp['passed_job']['match_history'] = {}
            jp['failed_job']['orig_result'] = ''
            jp['passed_job']['orig_result'] = ''
            jp['failed_job']['mismatch_attrs'] = []
            jp['passed_job']['mismatch_attrs'] = []
            jp['failed_job']['pip_patch'] = False
            jp['passed_job']['pip_patch'] = False
    for p in pairs:
        repo = p['repo']
        if repo not in self.pair_center.repos:
            continue
        # Try to find this build pair in the pair center.
        for bp in self.pair_center.repos[repo].buildpairs:
            if p['failed_build']['build_id'] == bp.builds[0].build_id:
                # Found the build pair in the pair center.
                # Optional: Write buildpair match type.
                # This is not used since we switched to jobpair packaging.
                p['match'] = bp.match.value
                trigger_sha = p['failed_build']['head_sha']
                # Similarly, for each job pair in the build pair, try to find it in the pair center.
                for jp in p['jobpairs']:
                    # For a build that has some jobs filtered and some jobs not filtered,
                    # the job cannot be found in the pair center.
                    if jp['is_filtered']:
                        continue
                    found_in_paircenter = False
                    for jobpair in bp.jobpairs:
                        if str(jobpair.jobs[0].job_id) == str(jp['failed_job']['job_id']):
                            found_in_paircenter = True
                            # Write jobpair match history, analyzed results, and mismatched attributes.
                            jp['match_history'] = jobpair.match_history
                            jp['failed_job']['match_history'] = jobpair.failed_job_match_history
                            jp['passed_job']['match_history'] = jobpair.passed_job_match_history
                            jp['failed_job']['orig_result'] = jobpair.jobs[0].orig_result
                            jp['passed_job']['orig_result'] = jobpair.jobs[1].orig_result
                            jp['failed_job']['mismatch_attrs'] = jobpair.jobs[0].mismatch_attrs
                            jp['passed_job']['mismatch_attrs'] = jobpair.jobs[1].mismatch_attrs
                            jp['failed_job']['pip_patch'] = jobpair.jobs[0].pip_patch
                            jp['passed_job']['pip_patch'] = jobpair.jobs[1].pip_patch
                    if not found_in_paircenter:
                        # If not found in the pair center, this jobpair was filtered out.
                        # In this case, we still analyze the original log to get as many attributes as possible.
                        for i in range(2):
                            job_name = 'failed_job' if i == 0 else 'passed_job'
                            job_id = jp[job_name]['job_id']
                            original_log_path = self.utils.get_orig_log_path(job_id)
                            if not download_log(job_id, original_log_path):
                                continue
                            original_result = self.analyzer.analyze_single_log(
                                original_log_path, job_id, trigger_sha, repo)
                            if 'not_in_supported_language' in original_result:
                                continue
                            jp[job_name]['orig_result'] = original_result
                        raise RuntimeError('Unexpected state: Jobpair not found in pair center. Exiting.')
    os.makedirs(self.config.result_json_dir, exist_ok=True)
    filename = self.config.task + '.json'
    filepath = os.path.join(self.config.result_json_dir, filename)
    write_json(filepath, pairs)
def update_local_files(self):
    write_json(self.utils.get_error_reason_file_path(), Utils.deep_copy(self.error_reasons))
def write_empty_json(repo, task_name):
    write_json(Utils.output_file_path_from_repo(repo, task_name), [])
def output(repo: str, output_path: str, branches: Dict):
    total_buildpairs = 0
    resettable_buildpairs = 0
    total_jobpairs = 0
    output_pairs = []
    for _, branch_obj in branches.items():
        for p in branch_obj.pairs:
            # Exclude pairs that were marked in clean_pairs.py.
            if p.exclude_from_output:
                continue
            failed_build = p.failed_build
            passed_build = p.passed_build
            # For buildpair stats.
            total_buildpairs += 1
            if failed_build.resettable and passed_build.resettable:
                resettable_buildpairs += 1
            # For jobpair stats.
            total_jobpairs += len(p.jobpairs)
            jobpairs = []
            for jp in p.jobpairs:
                jobpairs.append({
                    'failed_job': {
                        'job_id': jp.failed_job.job_id,
                    },
                    'passed_job': {
                        'job_id': jp.passed_job.job_id,
                    },
                    'build_system': jp.build_system,
                })
            pair = {
                'repo': repo,
                'repo_mined_version': p.repo_mined_version,
                'pr_num': branch_obj.pr_num,
                'merged_at': branch_obj.merged_at,
                'branch': branch_obj.branch_name,
                'base_branch': branch_obj.base_branch,
                'is_error_pass': failed_build.errored(),
                'failed_build': {
                    'build_id': failed_build.build_id,
                    'travis_merge_sha': failed_build.commit if branch_obj.pr_num > 0 else None,
                    'base_sha': failed_build.base_commit,
                    'head_sha': failed_build.trigger_commit,
                    'github_archived': failed_build.github_archived,
                    'resettable': failed_build.resettable,
                    'committed_at': OutputManager._convert_datetime_to_github_timestamp(failed_build.committed_at)
                    if isinstance(failed_build.committed_at, datetime.datetime) else failed_build.committed_at,
                    'message': failed_build.message,
                },
                'passed_build': {
                    'build_id': passed_build.build_id,
                    'travis_merge_sha': passed_build.commit if branch_obj.pr_num > 0 else None,
                    'base_sha': passed_build.base_commit,
                    'head_sha': passed_build.trigger_commit,
                    'github_archived': passed_build.github_archived,
                    'resettable': passed_build.resettable,
                    'committed_at': OutputManager._convert_datetime_to_github_timestamp(passed_build.committed_at)
                    if isinstance(passed_build.committed_at, datetime.datetime) else passed_build.committed_at,
                    'message': passed_build.message,
                },
                'jobpairs': jobpairs,
            }
            builds = [failed_build, passed_build]
            for i in range(2):
                build_name = 'failed_build' if i == 0 else 'passed_build'
                pair[build_name]['jobs'] = []
                build = builds[i]
                for j in build.jobs:
                    job = {
                        'build_job': '{}.{}'.format(build.build_num, j.job_num),
                        'job_id': j.job_id,
                        'config': j.config,
                        'language': OutputManager.adjust_language(j.language),
                    }
                    pair[build_name]['jobs'].append(job)
            output_pairs.append(pair)
    # Write output JSON to file.
    log.info('Saving output to', output_path)
    write_json(output_path, output_pairs)
    log.info('Done writing output file.')
    log.info('Total build pairs found:', total_buildpairs)
    log.info('Total job pairs found:', total_jobpairs)
    log.debug('Total resettable build pairs found:', resettable_buildpairs)
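# OutputManager._convert_datetime_to_github_timestamp is defined elsewhere. GitHub's API
# reports timestamps in ISO-8601 UTC form (e.g. '2016-05-13T19:10:32Z'), so a plausible
# stand-in for the conversion used above is the hypothetical sketch below.
import datetime

def _convert_datetime_to_github_timestamp(dt: datetime.datetime) -> str:
    # Format a datetime the way GitHub's API would report it.
    return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

# _convert_datetime_to_github_timestamp(datetime.datetime(2016, 5, 13, 19, 10, 32))
# == '2016-05-13T19:10:32Z'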
def _thread_main(repo, task_name, log_level, skip_if_output_exists, keep_clone):
    log.config_logging(log_level, Utils.log_file_path_from_repo(repo))
    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('PairFinder'))
    log.info('Processing', repo)
    output_file_path = Utils.output_file_path_from_repo(repo, task_name)
    if skip_if_output_exists and os.path.exists(output_file_path) and os.path.getsize(output_file_path) > 0:
        log.info('Skipping', repo, 'because output already exists.')
        return
    start_time = time.time()
    in_context = {
        'repo': repo,
        'utils': Utils(),
        'keep_clone': keep_clone,
        'task_name': task_name,
        'mined_project_builder': MinedProjectBuilder(),
        'original_mined_project_metrics': MinedProjectBuilder.query_current_metrics(repo),
    }
    steps = [
        Preflight(),
        GetJobsFromTravisAPI(),
        GroupJobsByBranch(),
        ExtractAllBuildPairs(),
        AlignJobPairs(),
        GetPullRequestMergeStatuses(),
        DownloadPullRequestCommits(),
        AssignTriggerCommits(),
        AssignBaseCommits(),
        CleanPairs(),
        GetBuildSystemInfo(),
        Postflight(),
    ]
    pipeline = Pipeline(steps)
    result, out_context = pipeline.run(None, in_context)
    if not result:
        # A filter in the pipeline encountered a fatal error and made the pipeline exit early.
        # Skip writing the output file.
        log.info('Skipping', repo, 'as there are no jobs to continue mining.')
        return
    builder = out_context['mined_project_builder']
    builder.repo = repo
    builder.latest_mined_version = Utils.get_latest_commit_for_repo(repo)
    (mined_build_pairs, mined_job_pairs, mined_pr_build_pairs, mined_pr_job_pairs) = \
        Utils.count_mined_pairs_in_branches(result)
    builder.mined_job_pairs = mined_job_pairs + \
        in_context['original_mined_project_metrics']['progression_metrics']['mined_job_pairs']
    builder.mined_pr_job_pairs = mined_pr_job_pairs + \
        in_context['original_mined_project_metrics']['progression_metrics']['mined_pr_job_pairs']
    builder.mined_build_pairs = mined_build_pairs + \
        in_context['original_mined_project_metrics']['progression_metrics']['mined_build_pairs']
    builder.mined_pr_build_pairs = mined_pr_build_pairs + \
        in_context['original_mined_project_metrics']['progression_metrics']['mined_pr_build_pairs']
    mined_project = builder.build()
    OutputManager.output_to_database(mined_project)
    OutputManager.output(repo, output_path=output_file_path, branches=result)
    metrics_output_file_path = Utils.output_metrics_path_from_repo(repo, task_name)
    write_json(metrics_output_file_path, in_context['original_mined_project_metrics'])
    elapsed = time.time() - start_time
    log.info('Processed {} in {} seconds. Done!'.format(repo, elapsed))
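# Pipeline, the step classes, and StepException are defined elsewhere in PairFinder. Judging
# only from how they are used above (each step's process() transforms a data value while
# sharing a context dict, and a failing step aborts the run), a hypothetical sketch of the
# pattern looks like this; it is not the component's real implementation.
class StepException(Exception):
    pass

class Pipeline:
    def __init__(self, steps):
        self.steps = steps

    def run(self, data, context):
        for step in self.steps:
            try:
                data = step.process(data, context)
            except StepException:
                # A step hit a fatal error; report no result so the caller can skip the repo.
                return None, context
        return data, context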
def process(self, data: Any, context: dict) -> Optional[Any]:
    repo = context['repo']
    mined_build_exists = False
    lock = Lock()
    with lock:
        travis = TravisWrapper()
        last_mined_build_number = 0
        if context['original_mined_project_metrics']['last_build_mined']['build_number']:
            last_mined_build_number = context['original_mined_project_metrics']['last_build_mined']['build_number']
            mined_build_exists = True
    builds_json_file = Utils.get_repo_builds_api_result_file(repo)
    builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
    if os.path.isfile(builds_json_file):
        build_list = read_json(builds_json_file)
    else:
        log.info('Getting the list of builds...')
        start_time = time.time()
        try:
            if not mined_build_exists:
                # Get all builds for the project.
                builds = travis.get_builds_for_repo(repo)
            else:
                # Get the latest builds and stop mining after reaching our last mined build number.
                builds = travis.get_builds_for_repo(repo, last_mined_build_number)
        except RequestException:
            error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
            raise StepException(error_message)
        build_list = list(builds)
        write_json(builds_json_file, build_list)
        log.info('Got the list of builds in', time.time() - start_time, 'seconds.')
    if not build_list:
        msg = 'Did not get any new builds for {}.'.format(repo)
        raise StepException(msg)
    if os.path.isfile(builds_info_json_file):
        build_list = read_json(builds_info_json_file)
    else:
        log.info('Downloading build info for', len(build_list),
                 'builds... This step may take several minutes for large repositories.')
        start_time = time.time()
        for idx, build in enumerate(build_list):
            build_id = build['id']
            try:
                build_info = travis.get_build_info(build_id)
            except RequestException:
                error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                raise StepException(error_message)
            build['build_info'] = build_info
            if (idx + 1) % 500 == 0:
                log.info('Downloaded build info for', idx + 1, 'builds so far...')
        write_json(builds_info_json_file, build_list)
        log.info('Downloaded build info in', time.time() - start_time, 'seconds.')
    # Now that we have data from the Travis API, restructure it so it appears as if it came from the database
    # using the following query:
    #   SELECT j.job_id, j.job_number, j.config, j.result,
    #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
    #          c.committed_at, c.compare_at, c.committer_name, c.message
    #   FROM jobs j
    #   LEFT JOIN builds b on b.build_id = j.build_id
    #   LEFT JOIN commits c on b.commit = c.sha
    #   WHERE j.repo_id = "<repo_id>"
    jobs = []
    leftover_build_list = []
    highest_build_number = 0
    highest_build_number_id = 0
    # 'build_list' contains at least 25 builds because the Travis API response is paged.
    # Always update 'highest_build_number'/'highest_build_number_id', and skip builds that were mined
    # previously by checking whether 'build_number <= last_mined_build_number'.
    for build in build_list:
        build_id = build['id']
        build_number = int(build['number'])
        if build_number > highest_build_number:
            highest_build_number_id = build_id
            highest_build_number = build_number
        if build_number <= last_mined_build_number:
            continue
        for job in build['build_info']['matrix']:
            j = {
                'job_id': job['id'],
                'job_number': job['number'],
                'config': job['config'],
                'result': job['result'],
                'build_id': build['id'],
                'number': build['number'],
                'finished_at': job['finished_at'],
                'commit': build['commit'],
                'message': build['message'],
                'branch': build['branch'],
                'event_type': build['build_info']['event_type'],
                'committed_at': build['build_info']['committed_at'],
                'compare_at': build['build_info']['compare_url'],
                'committer_name': build['build_info']['committer_name'],
            }
            if 'language' in job['config']:
                language = job['config']['language']
            else:
                log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                language = 'ruby'
            j['language'] = language
            jobs.append(j)
        leftover_build_list.append(build)
    if not jobs:
        msg = 'Did not get any jobs for {}.'.format(repo)
        # Set the build_number & build_id metric to the latest build info we have received if no jobs are found.
        bugswarmapi = DatabaseAPI(DATABASE_PIPELINE_TOKEN)
        bugswarmapi.set_latest_build_info_metric(repo, highest_build_number, highest_build_number_id)
        raise StepException(msg)
    # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
    # Do not raise a StepException before the context is populated.
    failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(leftover_build_list)
    failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(leftover_build_list)
    context['mined_project_builder'].builds = len(leftover_build_list) + \
        context['original_mined_project_metrics']['progression_metrics']['builds']
    context['mined_project_builder'].jobs = len(jobs) + \
        context['original_mined_project_metrics']['progression_metrics']['jobs']
    context['mined_project_builder'].failed_builds = failed_builds + \
        context['original_mined_project_metrics']['progression_metrics']['failed_builds']
    context['mined_project_builder'].failed_jobs = failed_jobs + \
        context['original_mined_project_metrics']['progression_metrics']['failed_jobs']
    context['mined_project_builder'].failed_pr_builds = failed_pr_builds + \
        context['original_mined_project_metrics']['progression_metrics']['failed_pr_builds']
    context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs + \
        context['original_mined_project_metrics']['progression_metrics']['failed_pr_jobs']
    context['mined_project_builder'].last_build_mined['build_id'] = highest_build_number_id
    context['mined_project_builder'].last_build_mined['build_number'] = highest_build_number
    return jobs
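# A worked example of the incremental cutoff above: with last_mined_build_number = 120, only
# builds numbered above 120 are kept, while the highest build number/id are always tracked so
# the metric can advance even when nothing new is mined. Toy data, not real Travis builds.
last_mined_build_number = 120
build_list = [{'id': 9, 'number': '118'}, {'id': 10, 'number': '121'}, {'id': 11, 'number': '125'}]
highest_build_number = 0
highest_build_number_id = 0
leftover_build_list = []
for build in build_list:
    build_number = int(build['number'])
    if build_number > highest_build_number:
        highest_build_number_id = build['id']
        highest_build_number = build_number
    if build_number <= last_mined_build_number:
        continue  # Already mined on a previous run.
    leftover_build_list.append(build)
assert highest_build_number == 125 and highest_build_number_id == 11
assert [b['id'] for b in leftover_build_list] == [10, 11]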
def process(self, data: Any, context: dict) -> Optional[Any]:
    repo = context['repo']
    travis = TravisWrapper()
    builds_json_file = Utils.get_repo_builds_api_result_file(repo)
    builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
    if os.path.isfile(builds_json_file):
        build_list = read_json(builds_json_file)
    else:
        log.info('Getting the list of builds...')
        start_time = time.time()
        try:
            builds = travis.get_builds_for_repo(repo)
        except RequestException:
            error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
            raise StepException(error_message)
        build_list = list(builds)
        write_json(builds_json_file, build_list)
        log.info('Got the list of builds in', time.time() - start_time, 'seconds.')
    if os.path.isfile(builds_info_json_file):
        build_list = read_json(builds_info_json_file)
    else:
        log.info('Downloading build info for', len(build_list),
                 'builds... This step may take several minutes for large repositories.')
        start_time = time.time()
        for idx, build in enumerate(build_list):
            build_id = build['id']
            try:
                build_info = travis.get_build_info(build_id)
            except RequestException:
                error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                raise StepException(error_message)
            build['build_info'] = build_info
            if (idx + 1) % 500 == 0:
                log.info('Downloaded build info for', idx + 1, 'builds so far...')
        write_json(builds_info_json_file, build_list)
        log.info('Downloaded build info in', time.time() - start_time, 'seconds.')
    # Now that we have data from the Travis API, restructure it so it appears as if it came from the database
    # using the following query:
    #   SELECT j.job_id, j.job_number, j.config, j.result,
    #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
    #          c.committed_at, c.compare_at, c.committer_name, c.message
    #   FROM jobs j
    #   LEFT JOIN builds b on b.build_id = j.build_id
    #   LEFT JOIN commits c on b.commit = c.sha
    #   WHERE j.repo_id = "<repo_id>"
    jobs = []
    for build in build_list:
        for job in build['build_info']['matrix']:
            j = {
                'job_id': job['id'],
                'job_number': job['number'],
                'config': job['config'],
                'result': job['result'],
                'build_id': build['id'],
                'number': build['number'],
                'finished_at': job['finished_at'],
                'commit': build['commit'],
                'message': build['message'],
                'branch': build['branch'],
                'event_type': build['build_info']['event_type'],
                'committed_at': build['build_info']['committed_at'],
                'compare_at': build['build_info']['compare_url'],
                'committer_name': build['build_info']['committer_name'],
            }
            if 'language' in job['config']:
                language = job['config']['language']
            else:
                log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                language = 'ruby'
            j['language'] = language
            jobs.append(j)
    # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
    # Do not raise a StepException before the context is populated.
    failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(build_list)
    failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(build_list)
    context['mined_project_builder'].builds = len(build_list)
    context['mined_project_builder'].jobs = len(jobs)
    context['mined_project_builder'].failed_builds = failed_builds
    context['mined_project_builder'].failed_jobs = failed_jobs
    context['mined_project_builder'].failed_pr_builds = failed_pr_builds
    context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs
    if not jobs:
        msg = 'Did not get any jobs for {}.'.format(repo)
        log.warning(msg)
        raise StepException(msg)
    return jobs