Example #1
def _modify_script(utils: Utils, jobpair: JobPair):
    # Rewrite the repository path lines in each job's build script (via _replace_repo_path) and
    # write the modified script to <job_id>-p.sh in the jobpair directory.
    for j in jobpair.jobs:
        script_path = join(utils.get_jobpair_dir(jobpair.jobs[0]),
                           j.job_id + '.sh')
        if not isfile(script_path):
            # Raise instead of silently returning so the dispatcher records the failure,
            # consistent with the other packaging helpers.
            raise ReproduceError('Script file not found at {}.'.format(script_path))

        lines = []
        with open(script_path) as f:
            found_cd_line = False
            for l in f:
                if r'travis_cmd cd\ ' + j.repo in l:
                    found_cd_line = True
                    lines.append(_replace_repo_path(j, l))
                elif 'export TRAVIS_BUILD_DIR=$HOME/build/' in l:
                    lines.append(_replace_repo_path(j, l))
                else:
                    lines.append(l)

            if not found_cd_line:
                raise ReproduceError('found_cd_line is False for {}'.format(
                    j.job_id))

        with open(
                join(utils.get_jobpair_dir(jobpair.jobs[0]),
                     j.job_id + '-p.sh'), 'w') as f:
            for l in lines:
                f.write(l)
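
The helper _replace_repo_path is referenced above but not shown. Below is a minimal, hypothetical sketch of what it might do, assuming (this is an assumption, not the pipeline's actual implementation) that the packaged image keeps each repository under /home/travis/build/failed or /home/travis/build/passed as in the Dockerfile in Example #6, and that the job object exposes the repository slug as j.repo and a failed/passed marker as j.f_or_p:

def _replace_repo_path(j, line: str) -> str:
    # Hypothetical sketch only: point the build script's repository path at the directory
    # where the packaged image ADDs the repository ('failed' or 'passed').
    # Both attribute names (repo, f_or_p) are assumptions made for illustration.
    return line.replace(j.repo, j.f_or_p)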
Example #2
 def __init__(self, input_file, task_name, threads=1, keep=False, package_mode=False, dependency_solver=False,
              skip_check_disk=False):
     """
     Initializes JobDispatcher with user specified input and starts work.
     If `threads` is specified, JobDispatcher will dispatch jobs to be reproduced in each thread. Otherwise, each job
     will be reproduced sequentially.
     """
     log.info('Initializing job dispatcher.')
     self.input_file = input_file
     self.thread_num = threads
     self.keep = keep
     self.package_mode = package_mode
     self.dependency_solver = dependency_solver
     # -----
     self.config = Config(task_name)
     self.config.skip_check_disk = skip_check_disk
     self.utils = Utils(self.config)
     self.items_processed = Value('i', 0)
     self.reproduce_err = Value('i', 0)
     self.job_time_acc = 0
     self.start_time = time.time()
     self.docker = DockerWrapper(self.utils)
     self.docker_storage_path = self.docker.setup_docker_storage_path()
     self.terminate = Value('i', 0)
     self.manager = Manager()
     self.lock = Lock()
     self.workspace_locks = self.manager.dict()
     self.cloned_repos = self.manager.dict()
     self.threads = {}
     self.error_reasons = {}
     self.alive_threads = 0
     self.travis_images = None
     self.job_center = PairCenter(self.input_file, self.utils, self.package_mode)
Example #3
def _copy_repo_tar(utils: Utils, jobpair: JobPair):
    for j in jobpair.jobs:
        if not utils.get_repo_tar_path_in_task(j):
            raise ReproduceError(
                'Cannot find the repository tar file to copy for {}.'.format(
                    j.job_id))
        utils.copy_repo_tar_from_storage_into_jobpair_dir(j)
Example #4
def _copy_original_log(utils: Utils, jobpair: JobPair):
    for j in jobpair.jobs:
        original_log_path = utils.get_orig_log_path(j.job_id)
        if not download_log(j.job_id, original_log_path):
            raise ReproduceError(
                'Error while copying the original log for {}.'.format(
                    j.job_id))
        utils.copy_orig_log_into_jobpair_dir(j)
Example #5
class ArtifactLogAdder(object):
    def __init__(self, task_name):
        log.info('Initializing ArtifactLogAdder.')
        self.config = Config(task_name)
        self.utils = Utils(self.config)
        self.task = task_name

    def run(self):
        bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)

        csv_path = '../cache-dependency/output/{}.csv'.format(self.task)
        if not os.path.isfile(csv_path):
            log.error('cache-dependency output CSV does not exist for task {}'.format(self.task))
            sys.exit()

        cached_image_tags = set()
        with open(csv_path) as f:
            for row in f:
                # This assumes format '<image tag>, <succeed/error>, <size>, <size increase>'
                row_list = row.split(', ')
                if row_list[1] == 'succeed':
                    cached_image_tags.add(row_list[0])

        for image_tag in cached_image_tags:
            response = bugswarmapi.find_artifact(image_tag)
            if not response.ok:
                log.error('Artifact not found: {}'.format(image_tag))
                continue

            artifact = response.json()
            job_id = {
                'failed': artifact['failed_job']['job_id'],
                'passed': artifact['passed_job']['job_id'],
            }

            job_orig_log = {
                'failed':
                os.getcwd() + '/' +
                self.utils.get_orig_log_path(job_id['failed']),
                'passed':
                os.getcwd() + '/' +
                self.utils.get_orig_log_path(job_id['passed']),
            }

            for f_or_p in ['failed', 'passed']:
                response = bugswarmapi.set_build_log(str(job_id[f_or_p]),
                                                     job_orig_log[f_or_p])
                if response.ok:
                    log.info(
                        '{} build log with ID {} successfully set for artifact: {}'
                        .format(f_or_p, job_id[f_or_p], image_tag))
                else:
                    log.error(
                        'Error {} attempting to set {} build log with ID {} for artifact: {}'
                        .format(str(response), f_or_p, job_id[f_or_p],
                                image_tag))
Example #6
def _write_package_dockerfile(utils: Utils, jobpair: JobPair):
    failed_job_id = jobpair.jobs[0].job_id
    passed_job_id = jobpair.jobs[1].job_id

    failed_dockerfile_path = join(utils.get_jobpair_dir(jobpair.jobs[0]),
                                  failed_job_id + '-Dockerfile')
    passed_dockerfile_path = join(utils.get_jobpair_dir(jobpair.jobs[1]),
                                  passed_job_id + '-Dockerfile')

    with open(failed_dockerfile_path) as f:
        failed_lines = list(map(str.strip, f.readlines()))

    with open(passed_dockerfile_path) as f:
        passed_lines = list(map(str.strip, f.readlines()))

    # Check that both the failed job and the passed job used the same Travis Docker image.
    if failed_lines[0] != passed_lines[0]:
        raise ReproduceError(
            'The failed job and the passed job used different Travis Docker images.'
        )

    lines = [
        failed_lines[0],
        # Remove PPA and clean APT
        'RUN sudo rm -rf /var/lib/apt/lists/*',
        'RUN sudo rm -rf /etc/apt/sources.list.d/*',
        'RUN sudo apt-get clean',

        # Update OpenSSL and libssl to avoid using deprecated versions of TLS (TLSv1.0 and TLSv1.1).
        # TODO: Do we actually only want to do this when deriving from an image that has an out-of-date version of TLS?
        'RUN sudo apt-get update && sudo apt-get install --only-upgrade openssl libssl-dev',

        # Add the repositories.
        'ADD failed.tar /home/travis/build/failed/',
        'ADD passed.tar /home/travis/build/passed/',

        # Add the original logs.
        'ADD {}-orig.log /home/travis/build/'.format(failed_job_id),
        'ADD {}-orig.log /home/travis/build/'.format(passed_job_id),
        'RUN chmod 777 -R /home/travis/build',

        # Add the build scripts.
        'ADD {}-p.sh /usr/local/bin/run_failed.sh'.format(failed_job_id),
        'ADD {}-p.sh /usr/local/bin/run_passed.sh'.format(passed_job_id),
        'RUN chmod +x /usr/local/bin/run_failed.sh',
        'RUN chmod +x /usr/local/bin/run_passed.sh',

        # Set the user to use when running the image.
        'USER travis',
    ]
    # Append a newline to each line and then concatenate all the lines.
    content = ''.join(map(lambda l: l + '\n', lines))
    package_dockerfile = utils.get_abs_jobpair_dockerfile_path(jobpair)
    with open(package_dockerfile, 'w') as f:
        f.write(content)
Example #7
def modify_build_sh(repo: str, build_sh_path: str):
    """
    Travis builds are sometimes not reproducible when run using the build.sh script ("the build script") generated by
    travis-build. Thus, to increase the number of reproducible builds, we modify the build script.

    This function applies the necessary modifications to the build script located at `build_sh_path` and writes the
    modified file to that same location.

    This set of modifications was developed over time as we observed more build reproducibility issues that were likely
    caused by using the build script generated by travis-build.

    There's a file in Google Drive that explains the modifications applied to the build script.

    A previous version of this function included several additional build script modifications that have since been
    removed because travis-build was patched to generate build scripts that need fewer modifications. The BugSwarm
    pipeline utilizes the patched version of travis-build to generate build scripts.

    :param repo: A GitHub repository slug
    :param build_sh_path: The path to the unmodified build script (generated by travis-build).
    """
    if not isinstance(repo, str):
        raise TypeError
    if not repo:
        raise ValueError
    if not isinstance(build_sh_path, str):
        raise TypeError
    if not build_sh_path:
        raise ValueError

    log.debug('Modifying build script at {}.'.format(build_sh_path))

    # Read and modify the original build script.
    lines = []
    with open(build_sh_path) as f:
        skip = False
        for line in f:
            if 'start git.checkout' in line:
                skip = True
            elif 'travis_fold end git.checkout' in line:
                skip = False
                lines.append(r'travis_cmd cd\ ' + repo + ' --assert --echo\n')
            else:
                if not skip:
                    lines.append(line)

    # Overwrite the original build script with the modified build script.
    Utils.remove_file(build_sh_path)
    with open(build_sh_path, 'w') as f2:
        for l in lines:
            f2.write(l)
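
To illustrate the effect of modify_build_sh, an abridged and purely illustrative git checkout section of a generated build script such as

    travis_fold start git.checkout
    travis_cmd git\ clone\ --depth\=50\ https://github.com/owner/repo.git\ owner/repo --assert --echo --retry --timing
    travis_cmd cd\ owner/repo --assert --echo
    travis_fold end git.checkout

would be collapsed by the loop above into the single line

    travis_cmd cd\ owner/repo --assert --echo

so the script changes into the repository that is already present in the reproduction environment instead of cloning it again. (The exact checkout commands emitted by travis-build vary; the snippet above is only an example of the pattern the loop matches on.)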
Example #8
def package_jobpair_image(utils: Utils, docker: DockerWrapper,
                          jobpair: JobPair):
    _copy_repo_tar(utils, jobpair)
    _copy_original_log(utils, jobpair)
    _modify_script(utils, jobpair)
    _write_package_dockerfile(utils, jobpair)

    image_tag = utils.construct_jobpair_image_tag(jobpair)
    full_image_name = utils.construct_full_image_name(image_tag)

    docker.build_image(utils.get_abs_jobpair_dir(jobpair.jobs[0]),
                       utils.get_abs_jobpair_dockerfile_path(jobpair),
                       full_image_name)
    docker.push_image(image_tag)

    _clean_after_package(utils, docker, jobpair, full_image_name)
Example #9
    def _base_pre_run(self):
        if self.job_center.total_jobs < 1:
            log.info('No jobs to reproduce. Exiting.')
            return

        # Set up the required directories.
        os.makedirs(self.config.orig_logs_dir, exist_ok=True)
        os.makedirs(self.config.output_dir, exist_ok=True)
        self.utils.directories_setup()
        if os.path.isfile(self.utils.get_error_reason_file_path()):
            self.error_reasons = read_json(
                self.utils.get_error_reason_file_path())
        self.error_reasons = self.manager.dict(self.error_reasons)
        # Check if commands to Travis work.
        if not Utils.is_travis_installed():
            log.error(
                colored(
                    'Commands to Travis are failing unexpectedly. Try restarting your shell and ensure your '
                    'environment is provisioned correctly.',
                    'red'))
            raise Exception(
                'Unexpected state: Commands to Travis are failing unexpectedly.'
            )
        # Read travis_images.json.
        try:
            self.travis_images = read_json(self.config.travis_images_json)
        except FileNotFoundError:
            log.error(
                colored(
                    self.config.travis_images_json + ' not found. Exiting.',
                    'red'))
            raise
Example #10
    def __init__(self, input_file, runs, task_name):
        log.info('Initializing ReproducedResultsAnalyzer.')
        self.input_file = input_file
        self.runs = runs
        self.config = Config(task_name)
        self.utils = Utils(self.config)
        self.analyzer = analyzer.Analyzer()
        # Initializing pair_center should not be in _pre_analyze because we want the pairs to maintain state between
        # analyzing each run.
        self.pair_center = PairCenter(input_file, self.utils)

        # The below attributes are initialized in _pre_analyze.
        self.start_time = None
        self.reproduced_logs = None
        self.reproduced_logs_analyzed = None
        self.error_count = None
Example #11
def _clean_after_package(utils: Utils, docker: DockerWrapper, jobpair: JobPair,
                         image_name: str):
    for j in jobpair.jobs:
        # Remove repo.
        Utils.remove_file(utils.get_tar_file_in_jobpair_dir(j))
        # Remove original log.
        Utils.remove_file(utils.get_orig_log_path_in_jobpair_dir(j))
Example #12
    def run(self):
        buildpairs = read_json(self.input_file)
        # Skip jobpairs whose artifact already exists in the database but had no reproduce successes.
        to_insert = []
        for bp in buildpairs:
            for jp in bp['jobpairs']:
                image_tag = Utils.construct_jobpair_image_tag_from_dict(jp, bp['repo'])
                reproduce_successes, _, _ = Packager._calc_stability(jp)
                artifact_exists = Packager._is_artifact_in_db(image_tag)
                if artifact_exists and not reproduce_successes:
                    log.info('Artifact', image_tag, 'already exists in the database.')
                    continue
                to_insert.append((image_tag, artifact_exists, self._structure_artifact_data(image_tag, bp, jp)))

        if self.csv_mode:
            self._write_csv(to_insert)
        elif not to_insert:
            log.info('Done! No new metadata to insert.')
        else:
            inserts = 0
            errors = 0
            for artifact_data in to_insert:
                (image_tag, artifact_exists, artifact) = artifact_data
                if artifact_exists:
                    if Packager._update_artifact(image_tag, artifact):
                        inserts += 1
                    else:
                        errors += 1
                elif Packager._insert_artifact(artifact):
                    inserts += 1
                else:
                    errors += 1
            if errors == 0:
                log.info('Done! Inserted metadata for {} jobpairs with 0 errors.'.format(inserts))
            else:
                log.info('Done! Attempted to insert {} jobpairs into the database. {} insertions succeeded and {} '
                         'encountered an error.'.format(len(to_insert), inserts, errors))
Example #13
 def update_local_files(self):
     write_json(self.utils.get_error_reason_file_path(),
                Utils.deep_copy(self.error_reasons))
Example #14
 def __init__(self, input_file, csv_mode=False):
     self.input_file = input_file
     self.csv_mode = csv_mode
     self.task = str(os.path.splitext(input_file)[0].split('/')[-1])
     self.config = Config(self.task)
     self.utils = Utils(self.config)
Example #15
    def _structure_artifact_data(self, image_tag, bp, jp):
        log.info('Extracting metadata for jobpair', image_tag + '.')

        reproduced = Packager._is_jobpair_reproduced(jp)
        repo = bp['repo']
        reproduce_successes, reproduce_attempts, stability = Packager._calc_stability(jp)
        failed_job = jp['failed_job']
        passed_job = jp['passed_job']
        builds = [bp['failed_build'], bp['passed_build']]
        jobs = [failed_job, passed_job]

        today = str(date.today())
        status = 'Unreproducible'
        if reproduce_successes == 5:
            status = 'Reproducible'
        elif 0 < reproduce_successes < 5:
            status = 'Flaky'
        current_status = {
            'time_stamp': today,
            'status': status
        }

        d = {
            'tag': image_tag,
            'image_tag': image_tag,
            'repo': repo,
            'repo_mined_version': bp['repo_mined_version'],
            'pr_num': int(bp['pr_num']),
            'branch': bp['branch'],
            'base_branch': bp['base_branch'],
            'lang': [j['language'] for j in bp['failed_build']['jobs']
                     if j['job_id'] == failed_job['job_id']][0].title(),
            # Assume the build system and test framework are the same for both jobs. In some rare cases, this assumption
            # will not hold.
            'build_system': failed_job['orig_result']['tr_build_system'] if failed_job['orig_result'] else '',
            'test_framework': failed_job['orig_result']['tr_log_frameworks'] if failed_job['orig_result'] else '',
            'merged_at': bp['merged_at'],
            'is_error_pass': bp['is_error_pass'],
            'reproduced': reproduced,

            'match': Packager._calc_match_over_run(jp) if reproduced else '',
            'reproduce_successes': reproduce_successes,
            'reproduce_attempts': reproduce_attempts,
            'stability': stability,
            'creation_time': int(time.time()),

            # Evaluation info.
            'filtered_reason': jp.get('filtered_reason', ''),

            # Metrics. Empty by default and will be populated later by other components during post-processing.
            'metrics': {},
            'current_status': current_status
        }

        for i in range(2):
            job_key = 'failed_job' if i == 0 else 'passed_job'
            patches = {}
            job = jobs[i]

            # Find index of job in builds[i]['jobs'].
            job_id = job['job_id']
            jobs_index = next(i for i, j in enumerate(builds[i]['jobs']) if j['job_id'] == job_id)

            # Add patch information if the job uses Java 7 (oraclejdk7 or openjdk7).
            job_config = builds[i]['jobs'][jobs_index]['config']
            if job_config.get('jdk') in ['oraclejdk7', 'openjdk7']:
                # TODO: Collect patch names as each patch is applied. That is, do not wait until this method because the
                # patches created now may not exactly match the patches applied.
                patches['mvn-tls'] = today

            if job.get('pip_patch'):
                patches['pip-yaml-patch'] = today

            patches['remove-ppa'] = today

            d[job_key] = {
                'base_sha': builds[i]['base_sha'],
                'build_id': builds[i]['build_id'],
                'build_job': [j['build_job'] for j in builds[i]['jobs'] if j['job_id'] == jobs[i]['job_id']][0],
                'committed_at': builds[i]['committed_at'],
                'failed_tests': jobs[i]['orig_result']['tr_log_tests_failed'] if jobs[i]['orig_result'] else '',
                'job_id': job_id,
                'message': builds[i]['message'],
                'mismatch_attrs': jobs[i]['mismatch_attrs'],
                'num_tests_failed': jobs[i]['orig_result']['tr_log_num_tests_failed'] if jobs[i]['orig_result'] else '',
                'num_tests_run': jobs[i]['orig_result']['tr_log_num_tests_run'] if jobs[i]['orig_result'] else '',
                'trigger_sha': builds[i]['travis_merge_sha'] if builds[i]['travis_merge_sha'] else
                builds[i]['head_sha'],
                'is_git_repo': Packager._artifact_is_git_repo(builds[i]),
                'config': job_config,
                'patches': patches,
                'component_versions': {
                    'analyzer': Utils.get_analyzer_version(),
                    'reproducer': Utils.get_reproducer_version(),
                },
            }
        return d
Example #16
 def __init__(self, task_name):
     log.info('Initializing ArtifactLogAdder.')
     self.config = Config(task_name)
     self.utils = Utils(self.config)
     self.task = task_name
Example #17
class ReproducedResultsAnalyzer(object):
    def __init__(self, input_file, runs, task_name):
        log.info('Initializing ReproducedResultsAnalyzer.')
        self.input_file = input_file
        self.runs = runs
        self.config = Config(task_name)
        self.utils = Utils(self.config)
        self.analyzer = analyzer.Analyzer()
        # Initializing pair_center should not be in _pre_analyze because we want the pairs to maintain state between
        # analyzing each run.
        self.pair_center = PairCenter(input_file, self.utils)

        # The below attributes are initialized in _pre_analyze.
        self.start_time = None
        self.reproduced_logs = None
        self.reproduced_logs_analyzed = None
        self.error_count = None

    def run(self):
        for i in range(1, self.runs + 1):
            self._pre_analyze()
            self._analyze(i)
            self._post_analyze(i)
        self._show_reproducibility()
        self._write_output_json()
        log.info('Done!')

    def _pre_analyze(self):
        """
        Reset state before analyzing the next run.
        """
        self.start_time = time.time()
        self.reproduced_logs = {}
        self.reproduced_logs_analyzed = 0
        self.error_count = 0

        # Reset the match type flag before each run
        for r in self.pair_center.repos:
            for bp in self.pair_center.repos[r].buildpairs:
                bp.set_match_type.value = False

    def _analyze(self, run):
        """
        Analyze a single run of reproduced results.
        For each job in a jobpair, check if the reproduced log exists in the task folder. If it does, then download the
        original Travis log. Finally, analyze and compare the two logs.
        """
        for r in self.pair_center.repos:
            for bp in self.pair_center.repos[r].buildpairs:
                for jp in bp.jobpairs:
                    for j in jp.jobs:
                        try:
                            analyzed_reproduced_log = analyze_and_compare(self, j, run)
                            if analyzed_reproduced_log:
                                self.reproduced_logs_analyzed += 1
                        except Exception as e:
                            log.error('Encountered an error while analyzing and comparing {}: {}'.format(j.job_name, e))
                            self.error_count += 1
        self.pair_center.update_buildpair_done_status()
        self.pair_center.assign_pair_match_types()
        self.pair_center.assign_pair_match_history(run)
        self.pair_center.assign_pair_patch_history(run)

    def _post_analyze(self, run):
        """
        This function is called after analyzing each run. Print statistics like how many pairs matched and time elapsed
        and then visualize the match history after this run.
        """
        log.info('Done analyzing run {}.'.format(run))
        self._visualize_match_history()
        log.info('{} reproduced logs analyzed and {} errors in run {}.'
                 .format(self.reproduced_logs_analyzed, self.error_count, run))
        # Print a blank line to separate each run.
        log.info()
        mmm = self.utils.construct_mmm_count(self.pair_center)
        aaa = self.utils.construct_aaa_count(self.pair_center)
        log.debug('Match types in run {}: m1-m2-m3: {} a1-a2-a3: {}.'.format(run, mmm, aaa))

    def _write_output_json(self):
        log.info('Writing output JSON annotated with match history.')
        pairs = read_json(self.input_file)
        # Write default attributes.
        for p in pairs:
            for jp in p['jobpairs']:
                jp['match_history'] = {}
                jp['failed_job']['match_history'] = {}
                jp['passed_job']['match_history'] = {}
                jp['failed_job']['orig_result'] = ''
                jp['passed_job']['orig_result'] = ''
                jp['failed_job']['mismatch_attrs'] = []
                jp['passed_job']['mismatch_attrs'] = []
                jp['failed_job']['pip_patch'] = False
                jp['passed_job']['pip_patch'] = False

        for p in pairs:
            repo = p['repo']
            if repo not in self.pair_center.repos:
                continue

            # Try to find this build pair in pair center.
            for bp in self.pair_center.repos[repo].buildpairs:
                if p['failed_build']['build_id'] == bp.builds[0].build_id:
                    # Found build pair in pair center.

                    # Optional: Write buildpair match type.
                    # This is not used since we switched to jobpair packaging.
                    p['match'] = bp.match.value
                    trigger_sha = p['failed_build']['head_sha']
                    # Similarly, for each job pair in build pair, try to find it in the pair center.
                    for jp in p['jobpairs']:
                        # For a build where some jobs are filtered and some are not,
                        # the filtered jobpairs will not be found in the pair center.
                        if jp['is_filtered']:
                            continue

                        found_in_paircenter = False
                        for jobpair in bp.jobpairs:
                            if str(jobpair.jobs[0].job_id) == str(jp['failed_job']['job_id']):
                                found_in_paircenter = True
                                # Write jobpair match history, analyzed results, and mismatched attributes.
                                jp['match_history'] = jobpair.match_history
                                jp['failed_job']['match_history'] = jobpair.failed_job_match_history
                                jp['passed_job']['match_history'] = jobpair.passed_job_match_history
                                jp['failed_job']['orig_result'] = jobpair.jobs[0].orig_result
                                jp['passed_job']['orig_result'] = jobpair.jobs[1].orig_result
                                jp['failed_job']['mismatch_attrs'] = jobpair.jobs[0].mismatch_attrs
                                jp['passed_job']['mismatch_attrs'] = jobpair.jobs[1].mismatch_attrs
                                jp['failed_job']['pip_patch'] = jobpair.jobs[0].pip_patch
                                jp['passed_job']['pip_patch'] = jobpair.jobs[1].pip_patch

                        if not found_in_paircenter:
                            # If not found in pair center, this jobpair was filtered out.
                            # In this case, we still analyze the original log to get as many attributes as possible.
                            for i in range(2):
                                job_name = 'failed_job' if i == 0 else 'passed_job'
                                job_id = jp[job_name]['job_id']
                                original_log_path = self.utils.get_orig_log_path(job_id)
                                if not download_log(job_id, original_log_path):
                                    continue
                                original_result = self.analyzer.analyze_single_log(original_log_path, job_id,
                                                                                   trigger_sha, repo)
                                if 'not_in_supported_language' in original_result:
                                    continue
                                jp[job_name]['orig_result'] = original_result
                            raise RuntimeError('Unexpected state: Jobpair not found in pair center. Exiting.')

        os.makedirs(self.config.result_json_dir, exist_ok=True)
        filename = self.config.task + '.json'
        filepath = os.path.join(self.config.result_json_dir, filename)
        write_json(filepath, pairs)

    def _get_all_jobpairs_and_all_runs(self) -> Tuple[List[JobPair], List[str]]:
        all_jobpairs = []
        for r in self.pair_center.repos:
            for bp in self.pair_center.repos[r].buildpairs:
                for jp in bp.jobpairs:
                    all_jobpairs.append(jp)
        all_runs = []
        for jp in all_jobpairs:
            for run in jp.match_history:
                all_runs.append(run)
        all_runs = list(set(all_runs))
        all_runs.sort()
        return all_jobpairs, all_runs

    def _visualize_match_history(self):
        log.info('Visualizing match history:')
        log.info('N means no reproduced log exists. (An error occurred in reproducer while reproducing the job.)')
        all_jobpairs, all_runs = self._get_all_jobpairs_and_all_runs()
        for jp in all_jobpairs:
            log.info(jp.full_name)
            match_histories = [
                (jp.match_history, 'Job pair'),
                (jp.failed_job_match_history, 'Failed job'),
                (jp.passed_job_match_history, 'Passed job'),
            ]
            for match_history, history_name in match_histories:
                # match_history is keyed by run number (runs are numbered 1 through self.runs).
                mh = [str(match_history.get(run, 'N')) for run in all_runs]
                if mh:
                    full_history_name = '{} match history'.format(history_name)
                    log.info('{:>24}:'.format(full_history_name), ' -> '.join(mh))
                else:
                    log.info('No match history. (This jobpair was not reproduced.)')

    def _show_reproducibility(self):
        log.info('Visualizing reproducibility:')
        all_jobpairs, all_runs = self._get_all_jobpairs_and_all_runs()
        if not all_jobpairs:
            log.info('Nothing to visualize since no jobs were run.')
        else:
            full_name_max_length = max([len(jp.full_name) for jp in all_jobpairs])
            for jp in all_jobpairs:
                mh = []
                for run in all_runs:
                    run_result = jp.match_history.get(run)
                    # run_result could be 'N', 0, or 1
                    if run_result != 1:
                        mh.append(0)
                    else:
                        mh.append(run_result)

                # No reproducing runs were successful
                if all(v == 0 for v in mh):
                    reproducibility = 'Unreproducible'
                # match history is all 1s, all runs reproducible
                elif all(mh):
                    reproducibility = 'Reproducible'
                else:
                    reproducibility = 'Flaky'
                log.info('{full_name: >{width}} job pair reproducibility: {result}'
                         .format(width=full_name_max_length, full_name=jp.full_name, result=reproducibility))
        # Print a blank separator line.
        log.info()
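
For reference, _visualize_match_history logs each job pair name followed by three right-aligned history rows; with three runs the output might look roughly like this (illustrative names and values only):

    owner-repo-140887444-140887443
      Job pair match history: 1 -> 1 -> N
    Failed job match history: 1 -> 1 -> N
    Passed job match history: 1 -> 0 -> N

Here 1 and 0 are the per-run match results and N marks a run with no reproduced log, as noted above.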
Example #18
class JobDispatcher(object):
    """
    JobDispatcher controls the entire reproducing workflow by dispatching tasks to a pool of worker threads.
    Subclasses determine the specific task.
    """

    def __init__(self, input_file, task_name, threads=1, keep=False, package_mode=False, dependency_solver=False,
                 skip_check_disk=False):
        """
        Initializes JobDispatcher with user specified input and starts work.
        If `threads` is specified, JobDispatcher will dispatch jobs to be reproduced in each thread. Otherwise, each job
        will be reproduced sequentially.
        """
        log.info('Initializing job dispatcher.')
        self.input_file = input_file
        self.thread_num = threads
        self.keep = keep
        self.package_mode = package_mode
        self.dependency_solver = dependency_solver
        # -----
        self.config = Config(task_name)
        self.config.skip_check_disk = skip_check_disk
        self.utils = Utils(self.config)
        self.items_processed = Value('i', 0)
        self.reproduce_err = Value('i', 0)
        self.job_time_acc = 0
        self.start_time = time.time()
        self.docker = DockerWrapper(self.utils)
        self.docker_storage_path = self.docker.setup_docker_storage_path()
        self.terminate = Value('i', 0)
        self.manager = Manager()
        self.lock = Lock()
        self.workspace_locks = self.manager.dict()
        self.cloned_repos = self.manager.dict()
        self.threads = {}
        self.error_reasons = {}
        self.alive_threads = 0
        self.travis_images = None
        self.job_center = PairCenter(self.input_file, self.utils, self.package_mode)

    def run(self):
        """
        The entry point for reproducing jobs. Calls post_run() after all items are processed.

        Subclasses must not override this method.
        """
        self._base_pre_run()
        self.pre_run()
        try:
            while self.job_center.get_num_remaining_items(self.package_mode):
                log.info('Ready to initialize threads.')
                if not self.utils.check_disk_space_available():
                    self.utils.clean_disk_usage(self)
                    if not self.utils.check_disk_space_available():
                        msg = 'Still inadequate disk space after removing temporary Reproducer files. Exiting.'
                        log.error(msg)
                        raise OSError(msg)
                if not self.utils.check_docker_disk_space_available(self.docker_storage_path):
                    self.utils.clean_docker_disk_usage(self.docker)
                    if not self.utils.check_docker_disk_space_available(self.docker_storage_path):
                        msg = 'Still inadequate disk space after removing inactive Docker Images. Exiting.'
                        log.error(msg)
                        raise OSError(msg)
                self._init_threads()
        except KeyboardInterrupt:
            log.info('Caught KeyboardInterrupt. Cleaning up before terminating.')
            self.terminate.value = 1
        else:
            self.post_run()
            log.info('Done!')
        finally:
            log.info(self.progress_str())

    def _spawn(self, tid):
        t = Process(target=self._thread_main, args=(tid,))
        thread = {'process': t, 'exit_reason': ''}
        self.threads[tid] = thread
        t.start()

    def _thread_watcher(self):
        """
        Repeatedly check whether each worker process is alive, restarting or joining it as needed.
        """
        log.info('Initialized', len(self.threads), 'threads.')
        count = 0
        old_str = self.progress_str()
        while True:
            time.sleep(3)
            count += 1
            if count == 6:
                count = 0
                self.update_local_files()  # Update local files every 3*6 seconds.
                if self.terminate.value:
                    log.info(colored('Waiting for threads...', 'blue'))
                # elif not self.utils.check_disk_space_available():
                #     log.warning(colored('Not enough disk space. Joining threads...', 'yellow'))
                #     self.terminate.value = 1

            alive_threads = 0
            for tid in self.threads:
                p = self.threads[tid]['process']
                if p.is_alive():
                    alive_threads += 1
                else:
                    if p.exitcode is None:  # Not finished and not running.
                        # The process has not finished and is not running; restart it by spawning a new process for this tid.
                        self.threads[tid]['exit_reason'] = 'not finished and not running'
                        self._spawn(tid)
                    elif p.exitcode != 0:
                        self.threads[tid]['exit_reason'] = 'errored or terminated'
                        # Handle this either by restarting or deleting the entry so it is removed from list.
                        self._spawn(tid)
                    else:
                        self.threads[tid]['exit_reason'] = 'finished'
                        self.terminate.value = 1
                        p.join()  # Allow cleanup.

            self.alive_threads = alive_threads
            if not alive_threads:
                break

            curr_str = self.progress_str()
            if curr_str != old_str:
                old_str = curr_str
                if curr_str:
                    log.info(curr_str)

    def _init_threads(self):
        """
        Initialize min(num_threads, number of jobs to reproduce) threads.
        """
        self.lock = Lock()
        self.workspace_locks = self.manager.dict()
        self.cloned_repos = self.manager.dict()
        self.threads = {}
        self.terminate.value = 0
        num_remaining_items = self.job_center.get_num_remaining_items(self.package_mode)
        if not num_remaining_items:
            log.info('No remaining items. Exiting.')
            return 0
        self.thread_num = min(self.thread_num, num_remaining_items)
        self.job_center.init_queues_for_threads(self.thread_num, self.package_mode)
        # Begin initializing threads.
        for tid in range(self.thread_num):
            self._spawn(tid)
        self._thread_watcher()

    def _thread_main(self, tid):
        """
        This is the target function for each thread.
        It receives the workload (a queue) for a given thread from job_center.thread_workloads.
        For each item, it calls self.process_item().
        :param tid: Thread ID
        """
        workload = self.job_center.thread_workloads[tid]
        while not workload.empty():
            # Break out of the loop if the terminate flag is set.
            if self.terminate.value:
                return 0
            item = workload.get()

            # Intentionally catch ReproduceError but allow KeyboardInterrupt to propagate.
            try:
                self.process_item(item, tid)
            except ReproduceError as e:
                log.info(colored('[THREAD {}] {} {}'.format(tid, item, e), 'red'))
                self.reproduce_err.value += 1
                self.record_error_reason(item, str(e))
                # Optionally handle failed reproducing here.
        log.info('[THREAD {}] Workload complete. Exiting thread.'.format(tid))

    def _base_pre_run(self):
        if self.job_center.total_jobs < 1:
            log.info('No jobs to reproduce. Exiting.')
            return

        # Set up the required directories.
        os.makedirs(self.config.orig_logs_dir, exist_ok=True)
        os.makedirs(self.config.output_dir, exist_ok=True)
        self.utils.directories_setup()
        if os.path.isfile(self.utils.get_error_reason_file_path()):
            self.error_reasons = read_json(self.utils.get_error_reason_file_path())
        self.error_reasons = self.manager.dict(self.error_reasons)
        # Check if commands to Travis work.
        if not Utils.is_travis_installed():
            log.error(colored('Commands to Travis are failing unexpectedly. Try restarting your shell and ensure your '
                              'environment is provisioned correctly.', 'red'))
            raise Exception('Unexpected state: Commands to Travis are failing unexpectedly.')
        # Read travis_images.json.
        try:
            self.travis_images = read_json(self.config.travis_images_json)
        except FileNotFoundError:
            log.error(colored(self.config.travis_images_json + ' not found. Exiting.', 'red'))
            raise

    def pre_run(self):
        """
        Called before any items have been processed.

        Overriding is optional. Defaults to no-op.
        """
        pass

    def progress_str(self) -> Optional[str]:
        """
        Subclasses should return a string, which will be logged, representing progress at the time the method is called.
        Returns None by default, which indicates to the caller that logging the progress should be skipped.

        Overriding is optional.
        :return: A string representing the dispatcher's progress or None to skip logging the progress.
        """
        return None

    def update_local_files(self):
        """
        Called periodically to allow the dispatcher to update local files as needed.

        Overriding is optional. Defaults to no-op.
        """
        pass

    def process_item(self, item, tid):
        """
        Subclasses must override this method to process each item in the workload.
        :param item: The item to process.
        :param tid: The thread ID tasked with processing the item.
        """
        raise NotImplementedError

    def record_error_reason(self, item, message):
        """
        Overriding is optional. Defaults to no-op.
        :param item: The item for which to record an error message.
        :param message: The error message to record.
        """
        pass

    def post_run(self):
        """
        Called after all items have been processed.

        Overriding is optional. Defaults to no-op.
        """
        pass
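
The override points documented above can be summarized with a minimal, hypothetical subclass (a sketch only, assuming the constructor shown above; the pipeline's real reproducer and packager dispatchers are not reproduced here):

class LoggingDispatcher(JobDispatcher):
    """Hypothetical example subclass that only logs each item instead of reproducing it."""

    def process_item(self, item, tid):
        # Required override. Raising ReproduceError here would make the dispatcher count the
        # failure and call record_error_reason().
        log.info('[THREAD {}] Would process {}.'.format(tid, item))

    def progress_str(self):
        # Optional override; the returned string is logged by the thread watcher.
        return '{} items processed, {} errors.'.format(self.items_processed.value,
                                                       self.reproduce_err.value)

    def post_run(self):
        # Optional override, called once after all items have been processed.
        log.info('All items logged.')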
Example #19
    def _structure_artifact_data(self, image_tag, bp, jp):
        log.info('Extracting metadata for jobpair', image_tag + '.')

        reproduced = Packager._is_jobpair_reproduced(jp)
        repo = bp['repo']
        reproduce_successes, reproduce_attempts, stability = Packager._calc_stability(jp)
        failed_job = jp['failed_job']
        passed_job = jp['passed_job']
        builds = [bp['failed_build'], bp['passed_build']]
        jobs = [failed_job, passed_job]

        # When pair-classifier.py determines that no files are changed, the jobpair has no 'classification' key.
        # Insert a blank template in case the artifact is later published to our website.
        try:
            classification = jp['classification']
            classification = {
                'test': classification['test'],
                'build': classification['build'],
                'code': classification['code'],
                'exceptions': classification['exceptions']
            }
        except KeyError:
            classification = {
                'test': 'NA',
                'build': 'NA',
                'code': 'NA',
                'exceptions': []
            }

        today = str(date.today())
        status = 'Unreproducible'
        if reproduce_successes == 5:
            status = 'Reproducible'
        elif 0 < reproduce_successes < 5:
            status = 'Flaky'
        current_status = {
            'time_stamp': today,
            'status': status
        }

        # Previously mined jobpairs that are run through the Reproducer (for example, from generated pairs)
        # may not contain a 'metrics' key, so we add a blank template for updating.
        try:
            metrics = jp['metrics']
        except KeyError:
            metrics = {}

        d = {
            'current_image_tag': image_tag,
            'image_tag': image_tag,
            'repo': repo,
            'repo_mined_version': bp['repo_mined_version'],
            'pr_num': int(bp['pr_num']),
            'branch': bp['branch'],
            'base_branch': bp['base_branch'],
            'lang': [j['language'] for j in bp['failed_build']['jobs']
                     if j['job_id'] == failed_job['job_id']][0].title(),
            # Assume the build system and test framework are the same for both jobs. In some rare cases, this assumption
            # will not hold.
            'build_system': failed_job['orig_result']['tr_build_system'] if failed_job['orig_result'] else '',
            'test_framework': failed_job['orig_result']['tr_log_frameworks'] if failed_job['orig_result'] else '',
            'merged_at': bp['merged_at'],
            'is_error_pass': bp['is_error_pass'],
            'reproduced': reproduced,

            'match': Packager._calc_match_over_run(jp) if reproduced else '',
            'reproduce_successes': reproduce_successes,
            'reproduce_attempts': reproduce_attempts,
            'stability': stability,
            'creation_time': int(time.time()),

            # Evaluation info.
            'filtered_reason': jp.get('filtered_reason', ''),

            'metrics': metrics,
            'current_status': current_status,
            'classification': classification
        }

        for i in range(2):
            job_key = 'failed_job' if i == 0 else 'passed_job'
            patches = {}
            job = jobs[i]

            # Find index of job in builds[i]['jobs'].
            job_id = job['job_id']
            jobs_index = next(i for i, j in enumerate(builds[i]['jobs']) if j['job_id'] == job_id)

            # Add patch information if the job uses Java 7 (oraclejdk7 or openjdk7).
            job_config = builds[i]['jobs'][jobs_index]['config']
            if job_config.get('jdk') in ['oraclejdk7', 'openjdk7']:
                # TODO: Collect patch names as each patch is applied. That is, do not wait until this method because the
                # patches created now may not exactly match the patches applied.
                patches['mvn-tls'] = today

            if job.get('pip_patch'):
                patches['pip-yaml-patch'] = today

            patches['remove-ppa'] = today

            d[job_key] = {
                'base_sha': builds[i]['base_sha'],
                'build_id': builds[i]['build_id'],
                'build_job': [j['build_job'] for j in builds[i]['jobs'] if j['job_id'] == jobs[i]['job_id']][0],
                'committed_at': builds[i]['committed_at'],
                'failed_tests': jobs[i]['orig_result']['tr_log_tests_failed'] if jobs[i]['orig_result'] else '',
                'job_id': job_id,
                'message': builds[i]['message'],
                'mismatch_attrs': jobs[i]['mismatch_attrs'],
                'num_tests_failed': jobs[i]['orig_result']['tr_log_num_tests_failed'] if jobs[i]['orig_result'] else '',
                'num_tests_run': jobs[i]['orig_result']['tr_log_num_tests_run'] if jobs[i]['orig_result'] else '',
                'trigger_sha': builds[i]['travis_merge_sha'] if builds[i]['travis_merge_sha'] else
                builds[i]['head_sha'],
                'is_git_repo': Packager._artifact_is_git_repo(builds[i]),
                'config': job_config,
                'patches': patches,
                'component_versions': {
                    'analyzer': Utils.get_analyzer_version(),
                    'reproducer': Utils.get_reproducer_version(),
                },
            }
        return d