Example #1
    def get_pr_commits_by_parsing_html(self):
        start_time = time.time()
        html_commits_json_file = self.utils.get_html_commits_json_file(self.repo)
        html_commits = {}
        if os.path.isfile(html_commits_json_file):
            html_commits = read_json(html_commits_json_file)
            for _, branch_obj in self.branches.items():
                if branch_obj.pr_num != -1:  # if it's a PR branch
                    branch_obj.html_commits = html_commits[str(branch_obj.pr_num)]
        else:
            threads = [
                threading.Thread(
                    target=self.utils.github.get_pr_commits_by_html,
                    args=(self.repo, str(branch_obj.pr_num), branch_obj))
                for _, branch_obj in self.branches.items()
                if branch_obj.pr_num != -1  # Only spawn threads for PR branches.
            ]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()

            for _, branch_obj in self.branches.items():
                if branch_obj.pr_num != -1:  # if it's a PR branch
                    html_commits[str(branch_obj.pr_num)] = branch_obj.html_commits
            write_json(html_commits_json_file, html_commits)
            log.info('Got pull request commits (via HTML parsing) in',
                     time.time() - start_time, 'seconds.')
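
A minimal sketch of the same fan-out pattern in isolation, assuming only a dict of branch objects with a `pr_num` attribute and a `fetch` callable (both stand-ins for `self.branches` and `self.utils.github.get_pr_commits_by_html` above): start every thread before joining any so the downloads overlap.

import threading

def fetch_all_pr_branches(branches, fetch):
    # 'branches' and 'fetch' are hypothetical stand-ins for the attributes above.
    threads = [
        threading.Thread(target=fetch, args=(branch_obj,))
        for branch_obj in branches.values()
        if branch_obj.pr_num != -1  # Only PR branches have commits to fetch.
    ]
    for thread in threads:
        thread.start()  # Start everything first so the requests run concurrently.
    for thread in threads:
        thread.join()   # Then wait for all of them to finish.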
Example #2
    def get_commit_info_for_virtual_commit(self):
        start_time = time.time()
        virtual_commits_info = {}
        virtual_commits_info_json_file = self.utils.get_virtual_commits_info_json_file(self.repo)
        has_json_file = os.path.isfile(virtual_commits_info_json_file)
        if has_json_file:
            virtual_commits_info = read_json(virtual_commits_info_json_file)

        for _, branch_obj in self.branches.items():
            if not branch_obj.pairs:
                continue
            for pair in branch_obj.pairs:
                builds = [pair.failed_build, pair.passed_build]
                for b in builds:
                    if has_json_file:
                        if b.commit in virtual_commits_info:
                            b.virtual_commit_info = virtual_commits_info[b.commit]
                    else:
                        c = self.utils.github.get_commit_info(self.repo, b.commit)
                        if c:
                            virtual_commits_info[b.commit] = c
                            b.virtual_commit_info = c
        if not has_json_file:
            write_json(virtual_commits_info_json_file, virtual_commits_info)
        log.info('Got commit info for virtual commits in',
                 time.time() - start_time, 'seconds.')
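
These examples lean on `read_json` and `write_json` helpers from the shared codebase; a plausible sketch of both, plus the cache-or-fetch idiom used above, follows (the helper bodies are assumptions, not the library's actual code).

import json
import os

def read_json(path):
    # Assumed shape of the shared helper: load one JSON document from a file.
    with open(path) as f:
        return json.load(f)

def write_json(path, data):
    # Assumed shape of the shared helper: persist one JSON document to a file.
    with open(path, 'w') as f:
        json.dump(data, f)

def cached_fetch(json_file, fetch):
    # The cache-or-fetch idiom: reuse the file when it exists, else fetch once and persist.
    if os.path.isfile(json_file):
        return read_json(json_file)
    result = fetch()
    write_json(json_file, result)
    return result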
Example #3
    def get_commits_from_github_api(self):
        start_time = time.time()
        github_commits = {}
        get_github_commits = True
        github_commits_json_file = self.utils.get_github_commits_json_file(
            self.repo)
        if os.path.isfile(github_commits_json_file):
            github_commits = read_json(github_commits_json_file)
            get_github_commits = False

        for _, branch_obj in self.branches.items():
            if branch_obj.pr_num != -1:  # If it is a PR branch.
                # Get commits from the GitHub API.
                if get_github_commits:
                    github_commits[str(branch_obj.pr_num)] = self.utils.github.list_pr_commits(
                        self.repo, str(branch_obj.pr_num))
                branch_obj.github_commits = github_commits[str(branch_obj.pr_num)]
                # for commit in github_commits[str(branch.pr_num)]:
                #     commit['build_ids'] = self.utils.github.get_build_ids_for_commit(self.repo, commit['sha'])

        write_json(github_commits_json_file, github_commits)
        log.info('Got pull request commits (via GitHub API calls) in',
                 time.time() - start_time, 'seconds.')
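
The repeated `str(branch_obj.pr_num)` conversions are deliberate: JSON object keys are always strings, so integer keys would stop matching once the cache is re-read from disk. A two-line round trip shows why:

import json

cache = {42: ['abc123']}                  # Integer key in memory.
restored = json.loads(json.dumps(cache))  # JSON serializes keys as strings.
assert '42' in restored and 42 not in restored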
Example #4
    def process(self, repo, builds_json_file, builds_info_json_file) -> Optional[Any]:
        # repo = context['repo']
        travis = TravisWrapper()
        if os.path.isfile(builds_json_file):
            build_list = read_json(builds_json_file)
        else:
            log.info('Getting the list of builds...')
            start_time = time.time()
            try:
                builds = travis.get_builds_for_repo(repo)
            except RequestException:
                error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
                raise StepException(error_message)
            build_list = list(builds)
            write_json(builds_json_file, build_list)
            log.info('Got the list of builds in', time.time() - start_time, 'seconds.')

        if os.path.isfile(builds_info_json_file):
            build_list = read_json(builds_info_json_file)
        else:
            log.info('Downloading build info for',
                     len(build_list),
                     'builds... This step may take several minutes for large repositories.')
            start_time = time.time()
            for idx, build in enumerate(build_list):
                build_id = build['id']
                try:
                    build_info = travis.get_build_info(build_id)
                except RequestException:
                    error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                    raise StepException(error_message)
                build['build_info'] = build_info
                if (idx + 1) % 500 == 0:
                    log.info('Downloaded build info for', idx + 1, 'builds so far...')
            write_json(builds_info_json_file, build_list)
            log.info('Downloaded build info in', time.time() - start_time, 'seconds.')
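
A sketch of the failure handling used by the later pipeline variants of this method, assuming `requests` and a pipeline-level `StepException` like the one those variants raise: build the message, then raise immediately so `builds` is never read while unbound.

from requests.exceptions import RequestException

class StepException(Exception):
    """Aborts the current pipeline step (modeled on the later examples)."""

def get_builds_or_abort(travis, repo):
    # 'travis' is assumed to be a TravisWrapper, as in the surrounding code.
    try:
        return list(travis.get_builds_for_repo(repo))
    except RequestException:
        raise StepException(
            'Encountered an error while downloading builds for repository {}.'.format(repo))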
Example #5
def generate_build_pair_json(repo):
    log.info('Getting build pairs from the database.')
    dir_of_jsons = 'input/'
    task_name = repo.replace('/', '-')
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    buildpairs = bugswarmapi.filter_mined_build_pairs_for_repo(repo)
    os.makedirs(dir_of_jsons, exist_ok=True)
    write_json('{}{}.json'.format(dir_of_jsons, task_name), buildpairs)
    return dir_of_jsons
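
Hypothetical usage; 'owner/project' stands in for a real repository slug.

dir_of_jsons = generate_build_pair_json('owner/project')
# The per-task JSON now lives at input/owner-project.json.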
Example #6
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    output_path, repo, failed_job_id, passed_job_id = _validate_input(argv)

    log.info('Choosing pairs from {}.'.format(repo))

    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    buildpairs = bugswarmapi.filter_mined_build_pairs_for_repo(repo)
    if not buildpairs:
        log.error('No mined build pairs exist in the database for {}. Exiting.'.format(repo))
        return 1

    filename = 'artifacts_for_comparing.json'
    if not os.path.isfile(filename):
        artifacts = bugswarmapi.list_artifacts()
        _create_static_artifacts_file(filename, artifacts)
    with open(filename, 'r') as file:
        artifacts = json.load(file)

    filtered_buildpairs = []
    filtered_jobpair_count = 0
    for bp in buildpairs:
        filtered_jobpairs = []
        for jp in bp['jobpairs']:
            if should_include_jobpair(jp, failed_job_id, passed_job_id):
                if not is_jp_unique(repo, jp, artifacts):
                    continue
                filtered_jobpairs.append(jp)
                filtered_jobpair_count += 1
        if filtered_jobpairs:
            bp['jobpairs'] = filtered_jobpairs
            filtered_buildpairs.append(bp)

    # Create any missing path components to the output file.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Write the output file.
    write_json(output_path, filtered_buildpairs)
    bp_pluralized = 'buildpair' if len(filtered_buildpairs) == 1 else 'buildpairs'
    jp_pluralized = 'jobpair' if filtered_jobpair_count == 1 else 'jobpairs'
    log.info('Wrote {} {} with {} {} to {}.'.format(len(filtered_buildpairs),
                                                    bp_pluralized,
                                                    filtered_jobpair_count,
                                                    jp_pluralized,
                                                    output_path))
    log.info('Done!')
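
`_validate_input` is referenced but not shown; a minimal sketch consistent with how `main` unpacks its result (the body is purely an assumption about its shape):

def _validate_input(argv):
    # Hypothetical sketch: main() expects (output_path, repo, failed_job_id, passed_job_id).
    # Assumes the module-level 'log' and 'sys' imports that main() already uses.
    if len(argv) != 5:
        log.error('Usage: <output_path> <repo> <failed_job_id> <passed_job_id>')
        sys.exit(1)
    return argv[1], argv[2], argv[3], argv[4]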
Example #7
def gen_script(utils, job, dependence_solver):
    """
    Invoke travis-build to generate the build script.
    """
    build_sh = os.path.join('reproduce_tmp', job.job_id + '.sh')
    reproducing_dir = utils.get_reproducing_repo_dir(job)

    if dependence_solver:
        from bugswarm.dependency_solver.dependency_solver import fix_dict
        pip_patch_result = os.path.join(utils.get_jobpair_dir(job),
                                        '{}-pip-patch.json'.format(job.job_id))
        commit_time = job.build.commit_time

        if not commit_time:
            github_wrapper = GitHubWrapper(GITHUB_TOKENS)
            _, commit_json = github_wrapper.get(
                'https://api.github.com/repos/{}/git/commits/{}'.format(
                    job.repo, job.travis_merge_sha))
            commit_time = commit_json['committer']['date']

        yaml_path = os.path.join(reproducing_dir, '.travis.yml')
        yaml_dict = job.config
        fixed_yaml_dict, pip_patch, apt_patch = fix_dict(
            reproducing_dir, yaml_dict, commit_time)
        with open(yaml_path, "w+") as f:
            yaml.dump(fixed_yaml_dict, f)

        if pip_patch:
            write_json(pip_patch_result, pip_patch)
        # Update the travis compile path per https://github.com/travis-ci/travis-build/pull/1137.
        travis_command = '~/.travis/travis-build/bin/travis compile > {}'.format(
            build_sh)
    else:
        # The default travis compile invocation should include the build and job number to
        # resolve the matrix entry.
        travis_command = '~/.travis/travis-build/bin/travis compile {} > {}'.format(
            job.build_job, build_sh)
    cd_command = 'cd {}'.format(reproducing_dir)
    _, stderr, returncode = ShellWrapper.run_commands(cd_command,
                                                      travis_command,
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE,
                                                      shell=True)

    if returncode != 0:
        raise ReproduceError(
            'Encountered an error while generating the build script with travis-build: {}.'
            .format(stderr))
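
A hedged invocation sketch; `utils`, `job`, `ReproduceError`, and `log` come from the reproducer's context and are assumed here:

try:
    gen_script(utils, job, dependence_solver=False)
except ReproduceError as e:
    log.error('travis-build could not generate the build script:', e)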
Example #8
def generate_build_pair_json(repo, orig_file=None):
    log.info('Getting build pairs from the database.')
    dir_of_jsons = 'input/'
    task_name = repo.replace('/', '-')
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    buildpairs = bugswarmapi.filter_mined_build_pairs_for_repo(repo)
    if orig_file is not None:
        log.info('Filtering build pairs using local file', orig_file)
        with open(orig_file) as f:
            local_bps = json.load(f)
        # We use orig_file as a filter (instead of just copying it to dir_of_jsons) because we need the `_id`
        # attribute to patch our results to the database, and the json in orig_file does not have that attribute.
        bp_filter = {_build_pair_id_tuple(bp) for bp in local_bps}
        buildpairs = [bp for bp in buildpairs if _build_pair_id_tuple(bp) in bp_filter]
    os.makedirs(dir_of_jsons, exist_ok=True)
    write_json('{}{}.json'.format(dir_of_jsons, task_name), buildpairs)
    return dir_of_jsons
Example #9
    def process(self, data: Dict[str, Branch], context: dict) -> Optional[Any]:
        repo = context['repo']
        utils = context['utils']

        branches = data

        # Get the merge state of each pull request.
        log.info('Getting merge state for all pull requests.')

        start_time = time.time()
        pr_list_json_file = utils.get_pr_list_json_file(repo)
        pr_dict = {}
        if os.path.isfile(pr_list_json_file):
            try:
                pr_dict = read_json(pr_list_json_file)
            except ValueError:
                # The cached file is corrupt; remove it so the next run can regenerate it.
                os.remove(pr_list_json_file)
                raise StepException(
                    'Removed corrupt file {}; rerun to regenerate it.'.format(pr_list_json_file))
        else:
            pr_entities = utils.github.list_pull_requests(repo)
            for pr_entity in pr_entities:
                pr_dict[str(pr_entity['number'])] = pr_entity
            write_json(pr_list_json_file, pr_dict)

        for branch_id, branch_obj in branches.items():
            if branch_obj.pr_num != -1:  # If the branch is a pull request branch.
                if str(branch_obj.pr_num) in pr_dict:
                    pr_info = pr_dict[str(branch_obj.pr_num)]
                    branch_obj.merged_at = pr_info['merged_at']
                    branch_obj.base_branch = pr_info['base']['ref']
                    branch_obj.pr_info = pr_info

        log.debug('Got merge state for all pull requests in',
                  time.time() - start_time, 'seconds.')
        return branches
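
The corrupt-cache recovery above in isolation, a sketch assuming the same `read_json` helper and `StepException` as the rest of the pipeline: remove the unreadable file so the next run regenerates it from the API.

import os

def read_json_or_invalidate(path):
    try:
        return read_json(path)  # Assumed helper from the shared codebase.
    except ValueError:
        os.remove(path)  # Drop the corrupt cache so a rerun rebuilds it.
        raise StepException('Removed corrupt cache file {}; rerun to regenerate it.'.format(path))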
Example #10
    def _write_output_json(self):
        log.info('Writing output JSON annotated with match history.')
        pairs = read_json(self.input_file)
        # Write default attributes.
        for p in pairs:
            for jp in p['jobpairs']:
                jp['match_history'] = {}
                jp['failed_job']['match_history'] = {}
                jp['passed_job']['match_history'] = {}
                jp['failed_job']['orig_result'] = ''
                jp['passed_job']['orig_result'] = ''
                jp['failed_job']['mismatch_attrs'] = []
                jp['passed_job']['mismatch_attrs'] = []
                jp['failed_job']['pip_patch'] = False
                jp['passed_job']['pip_patch'] = False

        for p in pairs:
            repo = p['repo']
            if repo not in self.pair_center.repos:
                continue

            # Try to find this build pair in pair center.
            for bp in self.pair_center.repos[repo].buildpairs:
                if p['failed_build']['build_id'] == bp.builds[0].build_id:
                    # Found build pair in pair center.

                    # Optional: Write buildpair match type.
                    # This is not used since we switched to jobpair packaging.
                    p['match'] = bp.match.value
                    trigger_sha = p['failed_build']['head_sha']
                    # Similarly, for each job pair in build pair, try to find it in the pair center.
                    for jp in p['jobpairs']:
                        # For a build that has some jobs filtered and some jobs not filtered,
                        # the job cannot be found in paircenter.
                        if jp['is_filtered']:
                            continue

                        found_in_paircenter = False
                        for jobpair in bp.jobpairs:
                            if str(jobpair.jobs[0].job_id) == str(jp['failed_job']['job_id']):
                                found_in_paircenter = True
                                # Write jobpair match history, analyzed results, and mismatched attributes.
                                jp['match_history'] = jobpair.match_history
                                jp['failed_job']['match_history'] = jobpair.failed_job_match_history
                                jp['passed_job']['match_history'] = jobpair.passed_job_match_history
                                jp['failed_job']['orig_result'] = jobpair.jobs[0].orig_result
                                jp['passed_job']['orig_result'] = jobpair.jobs[1].orig_result
                                jp['failed_job']['mismatch_attrs'] = jobpair.jobs[0].mismatch_attrs
                                jp['passed_job']['mismatch_attrs'] = jobpair.jobs[1].mismatch_attrs
                                jp['failed_job']['pip_patch'] = jobpair.jobs[0].pip_patch
                                jp['passed_job']['pip_patch'] = jobpair.jobs[1].pip_patch
                                break  # Matching jobpair found; stop scanning.

                        if not found_in_paircenter:
                            # If not found in pair center, this jobpair was filtered out.
                            # In this case, we still analyze the original log to get as many attributes as possible.
                            for i in range(2):
                                job_name = 'failed_job' if i == 0 else 'passed_job'
                                job_id = jp[job_name]['job_id']
                                original_log_path = self.utils.get_orig_log_path(job_id)
                                if not download_log(job_id, original_log_path):
                                    continue
                                original_result = self.analyzer.analyze_single_log(original_log_path, job_id,
                                                                                   trigger_sha, repo)
                                if 'not_in_supported_language' in original_result:
                                    continue
                                jp[job_name]['orig_result'] = original_result

        os.makedirs(self.config.result_json_dir, exist_ok=True)
        filename = self.config.task + '.json'
        filepath = os.path.join(self.config.result_json_dir, filename)
        write_json(filepath, pairs)
Example #11
    def update_local_files(self):
        write_json(self.utils.get_error_reason_file_path(),
                   Utils.deep_copy(self.error_reasons))
Example #12
def write_empty_json(repo, task_name):
    write_json(Utils.output_file_path_from_repo(repo, task_name), [])
Example #13
    def output(repo: str, output_path: str, branches: Dict):
        total_buildpairs = 0
        resettable_buildpairs = 0
        total_jobpairs = 0
        output_pairs = []
        for _, branch_obj in branches.items():
            for p in branch_obj.pairs:
                # Exclude pairs that were marked in clean_pairs.py.
                if p.exclude_from_output:
                    continue

                failed_build = p.failed_build
                passed_build = p.passed_build

                # For buildpair stats.
                total_buildpairs += 1
                if failed_build.resettable and passed_build.resettable:
                    resettable_buildpairs += 1

                # For jobpair stats.
                total_jobpairs += len(p.jobpairs)

                jobpairs = []
                for jp in p.jobpairs:
                    jobpairs.append({
                        'failed_job': {
                            'job_id': jp.failed_job.job_id,
                        },
                        'passed_job': {
                            'job_id': jp.passed_job.job_id,
                        },
                        'build_system': jp.build_system
                    })

                pair = {
                    'repo': repo,
                    'repo_mined_version': p.repo_mined_version,
                    'pr_num': branch_obj.pr_num,
                    'merged_at': branch_obj.merged_at,
                    'branch': branch_obj.branch_name,
                    'base_branch': branch_obj.base_branch,
                    'is_error_pass': failed_build.errored(),
                    'failed_build': {
                        'build_id': failed_build.build_id,
                        'travis_merge_sha': failed_build.commit if branch_obj.pr_num > 0 else None,
                        'base_sha': failed_build.base_commit,
                        'head_sha': failed_build.trigger_commit,
                        'github_archived': failed_build.github_archived,
                        'resettable': failed_build.resettable,
                        'committed_at': OutputManager._convert_datetime_to_github_timestamp(
                            failed_build.committed_at)
                        if isinstance(failed_build.committed_at, datetime.datetime)
                        else failed_build.committed_at,
                        'message': failed_build.message,
                    },
                    'passed_build': {
                        'build_id': passed_build.build_id,
                        'travis_merge_sha': passed_build.commit if branch_obj.pr_num > 0 else None,
                        'base_sha': passed_build.base_commit,
                        'head_sha': passed_build.trigger_commit,
                        'github_archived': passed_build.github_archived,
                        'resettable': passed_build.resettable,
                        'committed_at': OutputManager._convert_datetime_to_github_timestamp(
                            passed_build.committed_at)
                        if isinstance(passed_build.committed_at, datetime.datetime)
                        else passed_build.committed_at,
                        'message': passed_build.message,
                    },
                    'jobpairs': jobpairs,
                }
                builds = [failed_build, passed_build]
                for i in range(2):
                    build_name = 'failed_build' if i == 0 else 'passed_build'
                    pair[build_name]['jobs'] = []
                    build = builds[i]
                    for j in build.jobs:
                        job = {
                            'build_job':
                            '{}.{}'.format(build.build_num, j.job_num),
                            'job_id': j.job_id,
                            'config': j.config,
                            'language':
                            OutputManager.adjust_language(j.language),
                        }
                        pair[build_name]['jobs'].append(job)

                output_pairs.append(pair)

        # Write output JSON to file.
        log.info('Saving output to', output_path)
        write_json(output_path, output_pairs)
        log.info('Done writing output file.')

        log.info('Total build pairs found:', total_buildpairs)
        log.info('Total job pairs found:', total_jobpairs)
        log.debug('Total resettable build pairs found:', resettable_buildpairs)
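
`OutputManager._convert_datetime_to_github_timestamp` is referenced but not shown; GitHub's API uses ISO 8601 UTC timestamps (e.g. '2015-10-21T07:28:00Z'), so a plausible sketch is:

import datetime

def _convert_datetime_to_github_timestamp(dt: datetime.datetime) -> str:
    # Assumed implementation matching GitHub's ISO 8601 UTC format.
    return dt.strftime('%Y-%m-%dT%H:%M:%SZ')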
Example #14
def _thread_main(repo, task_name, log_level, skip_if_output_exists,
                 keep_clone):
    log.config_logging(log_level, Utils.log_file_path_from_repo(repo))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('PairFinder'))

    log.info('Processing', repo)
    output_file_path = Utils.output_file_path_from_repo(repo, task_name)
    if (skip_if_output_exists and os.path.exists(output_file_path)
            and os.path.getsize(output_file_path) > 0):
        log.info('Skipping', repo, 'because output already exists.')
        return

    start_time = time.time()

    in_context = {
        'repo': repo,
        'utils': Utils(),
        'keep_clone': keep_clone,
        'task_name': task_name,
        'mined_project_builder': MinedProjectBuilder(),
        'original_mined_project_metrics': MinedProjectBuilder.query_current_metrics(repo),
    }
    steps = [
        Preflight(),
        GetJobsFromTravisAPI(),
        GroupJobsByBranch(),
        ExtractAllBuildPairs(),
        AlignJobPairs(),
        GetPullRequestMergeStatuses(),
        DownloadPullRequestCommits(),
        AssignTriggerCommits(),
        AssignBaseCommits(),
        CleanPairs(),
        GetBuildSystemInfo(),
        Postflight(),
    ]
    pipeline = Pipeline(steps)

    result, out_context = pipeline.run(None, in_context)
    if not result:
        # A filter in the pipeline encountered a fatal error and made the pipeline exit early.
        # Skip writing the output file.
        log.info('Skipping', repo, 'as there are no jobs to continue mining.')
        return

    builder = out_context['mined_project_builder']
    builder.repo = repo
    builder.latest_mined_version = Utils.get_latest_commit_for_repo(repo)
    (mined_build_pairs, mined_job_pairs, mined_pr_build_pairs,
     mined_pr_job_pairs) = Utils.count_mined_pairs_in_branches(result)
    builder.mined_job_pairs = mined_job_pairs + \
        in_context['original_mined_project_metrics']['progression_metrics']['mined_job_pairs']
    builder.mined_pr_job_pairs = mined_pr_job_pairs + \
        in_context['original_mined_project_metrics']['progression_metrics']['mined_pr_job_pairs']
    builder.mined_build_pairs = mined_build_pairs + \
        in_context['original_mined_project_metrics']['progression_metrics']['mined_build_pairs']
    builder.mined_pr_build_pairs = mined_pr_build_pairs + \
        in_context['original_mined_project_metrics']['progression_metrics']['mined_pr_build_pairs']
    mined_project = builder.build()
    OutputManager.output_to_database(mined_project)
    OutputManager.output(repo, output_path=output_file_path, branches=result)
    metrics_output_file_path = Utils.output_metrics_path_from_repo(repo, task_name)
    write_json(metrics_output_file_path,
               in_context['original_mined_project_metrics'])

    elapsed = time.time() - start_time
    log.info('Processed {} in {} seconds. Done!'.format(repo, elapsed))
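
The name `_thread_main` suggests it runs as a thread target; one plausible dispatcher over several repositories, sketched under that assumption:

import threading

def mine_repos(repos, task_name, log_level, skip_if_output_exists=False, keep_clone=False):
    # Hypothetical dispatcher: one worker thread per repository slug.
    threads = [
        threading.Thread(target=_thread_main,
                         args=(repo, task_name, log_level, skip_if_output_exists, keep_clone))
        for repo in repos
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
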
    def process(self, data: Any, context: dict) -> Optional[Any]:
        repo = context['repo']
        mined_build_exists = False
        lock = Lock()
        with lock:
            travis = TravisWrapper()

        last_mined_build_number = 0
        last_build_mined = context['original_mined_project_metrics']['last_build_mined']
        if last_build_mined['build_number']:
            last_mined_build_number = last_build_mined['build_number']
            mined_build_exists = True

        builds_json_file = Utils.get_repo_builds_api_result_file(repo)
        builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
        if os.path.isfile(builds_json_file):
            build_list = read_json(builds_json_file)
        else:
            log.info('Getting the list of builds...')
            start_time = time.time()
            try:
                if not mined_build_exists:
                    # Get all builds for the project.
                    builds = travis.get_builds_for_repo(repo)
                else:
                    # Get the latest builds, stopping once we reach the last mined build number.
                    builds = travis.get_builds_for_repo(repo, last_mined_build_number)
            except RequestException:
                error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
                raise StepException(error_message)
            build_list = list(builds)
            write_json(builds_json_file, build_list)
            log.info('Got the list of builds in', time.time() - start_time, 'seconds.')

        if not build_list:
            msg = 'Did not get any new builds for {}.'.format(repo)
            raise StepException(msg)

        if os.path.isfile(builds_info_json_file):
            build_list = read_json(builds_info_json_file)
        else:
            log.info('Downloading build info for',
                     len(build_list),
                     'builds... This step may take several minutes for large repositories.')
            start_time = time.time()
            for idx, build in enumerate(build_list):
                build_id = build['id']
                try:
                    build_info = travis.get_build_info(build_id)
                except RequestException:
                    error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                    raise StepException(error_message)
                build['build_info'] = build_info
                if (idx + 1) % 500 == 0:
                    log.info('Downloaded build info for', idx + 1, 'builds so far...')
            write_json(builds_info_json_file, build_list)
            log.info('Downloaded build info in', time.time() - start_time, 'seconds.')

        # Now that we have data from the Travis API, restructure it so it appears as if it came from the database using
        # the following query:
        #   SELECT j.job_id, j.job_number, j.config, j.result,
        #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
        #          c.committed_at, c.compare_at, c.committer_name, c.message
        #   FROM jobs j
        #   LEFT JOIN builds b on b.build_id = j.build_id
        #   LEFT JOIN commits c on b.commit = c.sha
        #   WHERE j.repo_id = "<repo_id>"
        jobs = []
        leftover_build_list = []
        highest_build_number = 0
        highest_build_number_id = 0

        # 'build_list' contains at least 25 builds because the Travis API returns results in pages.
        # Always update 'highest_build_number' and its ID, and skip builds that were mined
        # previously (those with build_number <= last_mined_build_number).
        for build in build_list:
            build_id = build['id']
            build_number = int(build['number'])

            if build_number > highest_build_number:
                highest_build_number_id = build_id
                highest_build_number = build_number
            if build_number <= last_mined_build_number:
                continue

            for job in build['build_info']['matrix']:
                j = {
                    'job_id': job['id'],
                    'job_number': job['number'],
                    'config': job['config'],
                    'result': job['result'],
                    'build_id': build['id'],
                    'number': build['number'],
                    'finished_at': job['finished_at'],
                    'commit': build['commit'],
                    'message': build['message'],
                    'branch': build['branch'],
                    'event_type': build['build_info']['event_type'],
                    'committed_at': build['build_info']['committed_at'],
                    'compare_at': build['build_info']['compare_url'],
                    'committer_name': build['build_info']['committer_name'],
                }
                if 'language' in job['config']:
                    language = job['config']['language']
                else:
                    log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                    language = 'ruby'
                j['language'] = language
                jobs.append(j)

            leftover_build_list.append(build)

        if not jobs:
            msg = 'Did not get any jobs for {}.'.format(repo)
            # Set the build_number & build_id metric to the latest build info we've received if no jobs are found.
            bugswarmapi = DatabaseAPI(DATABASE_PIPELINE_TOKEN)
            bugswarmapi.set_latest_build_info_metric(repo, highest_build_number, highest_build_number_id)
            raise StepException(msg)

        # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
        # Do not raise a StepException before the context is populated.
        failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(leftover_build_list)
        failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(leftover_build_list)
        context['mined_project_builder'].builds = len(leftover_build_list) + \
            context['original_mined_project_metrics']['progression_metrics']['builds']
        context['mined_project_builder'].jobs = len(jobs) + \
            context['original_mined_project_metrics']['progression_metrics']['jobs']
        context['mined_project_builder'].failed_builds = failed_builds + \
            context['original_mined_project_metrics']['progression_metrics']['failed_builds']
        context['mined_project_builder'].failed_jobs = failed_jobs + \
            context['original_mined_project_metrics']['progression_metrics']['failed_jobs']
        context['mined_project_builder'].failed_pr_builds = failed_pr_builds + \
            context['original_mined_project_metrics']['progression_metrics']['failed_pr_builds']
        context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs + \
            context['original_mined_project_metrics']['progression_metrics']['failed_pr_jobs']
        context['mined_project_builder'].last_build_mined['build_id'] = highest_build_number_id
        context['mined_project_builder'].last_build_mined['build_number'] = highest_build_number

        return jobs

    def process(self, data: Any, context: dict) -> Optional[Any]:
        repo = context['repo']
        travis = TravisWrapper()

        builds_json_file = Utils.get_repo_builds_api_result_file(repo)
        builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
        if os.path.isfile(builds_json_file):
            build_list = read_json(builds_json_file)
        else:
            log.info('Getting the list of builds...')
            start_time = time.time()
            try:
                builds = travis.get_builds_for_repo(repo)
            except RequestException:
                error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
                raise StepException(error_message)
            build_list = list(builds)
            write_json(builds_json_file, build_list)
            log.info('Got the list of builds in', time.time() - start_time, 'seconds.')

        if os.path.isfile(builds_info_json_file):
            build_list = read_json(builds_info_json_file)
        else:
            log.info('Downloading build info for',
                     len(build_list),
                     'builds... This step may take several minutes for large repositories.')
            start_time = time.time()
            for idx, build in enumerate(build_list):
                build_id = build['id']
                try:
                    build_info = travis.get_build_info(build_id)
                except RequestException:
                    error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                    raise StepException(error_message)
                build['build_info'] = build_info
                if (idx + 1) % 500 == 0:
                    log.info('Downloaded build info for', idx + 1, 'builds so far...')
            write_json(builds_info_json_file, build_list)
            log.info('Downloaded build info in', time.time() - start_time, 'seconds.')

        # Now that we have data from the Travis API, restructure it so it appears as if it came from the database using
        # the following query:
        #   SELECT j.job_id, j.job_number, j.config, j.result,
        #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
        #          c.committed_at, c.compare_at, c.committer_name, c.message
        #   FROM jobs j
        #   LEFT JOIN builds b on b.build_id = j.build_id
        #   LEFT JOIN commits c on b.commit = c.sha
        #   WHERE j.repo_id = "<repo_id>"
        jobs = []
        for build in build_list:
            for job in build['build_info']['matrix']:
                j = {
                    'job_id': job['id'],
                    'job_number': job['number'],
                    'config': job['config'],
                    'result': job['result'],
                    'build_id': build['id'],
                    'number': build['number'],
                    'finished_at': job['finished_at'],
                    'commit': build['commit'],
                    'message': build['message'],
                    'branch': build['branch'],
                    'event_type': build['build_info']['event_type'],
                    'committed_at': build['build_info']['committed_at'],
                    'compare_at': build['build_info']['compare_url'],
                    'committer_name': build['build_info']['committer_name'],
                }
                if 'language' in job['config']:
                    language = job['config']['language']
                else:
                    log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                    language = 'ruby'
                j['language'] = language
                jobs.append(j)

        # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
        # Do not raise a StepException before the context is populated.
        failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(build_list)
        failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(build_list)
        context['mined_project_builder'].builds = len(build_list)
        context['mined_project_builder'].jobs = len(jobs)
        context['mined_project_builder'].failed_builds = failed_builds
        context['mined_project_builder'].failed_jobs = failed_jobs
        context['mined_project_builder'].failed_pr_builds = failed_pr_builds
        context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs

        if not jobs:
            msg = 'Did not get any jobs for {}.'.format(repo)
            log.warning(msg)
            raise StepException(msg)

        return jobs