Example #1
def download_orig_log(art, img_tag, f_or_p):
    job_id = art['{}_job'.format(f_or_p)]['job_id']
    original_log_name = '{}.{}.orig.log'.format(img_tag, f_or_p)
    downloaded = download_log(job_id, join(os.getcwd(), original_log_name))
    if downloaded:
        return join(os.getcwd(), original_log_name)
    return None
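Taken together, these examples imply a common contract for download_log: it takes a job ID and a destination path and returns a truthy value on success. Below is a minimal usage sketch (not from the original sources), assuming the download_orig_log helper from Example #1 and with art and img_tag standing in for an artifact dict and its image tag.

# Hypothetical usage sketch: fetch both original logs for an artifact.
for f_or_p in ('failed', 'passed'):
    log_path = download_orig_log(art, img_tag, f_or_p)
    if log_path is None:
        print('Could not download the original {} log.'.format(f_or_p))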
Example #2
def _copy_original_log(utils: Utils, jobpair: JobPair):
    for j in jobpair.jobs:
        original_log_path = utils.get_orig_log_path(j.job_id)
        if not download_log(j.job_id, original_log_path):
            raise ReproduceError(
                'Error while copying the original log for {}.'.format(
                    j.job_id))
        utils.copy_orig_log_into_jobpair_dir(j)
Example #3
def filter_non_exact_images(job_id, log_path):
    """
    Check if all jobs in this pair (from both the failed and passed build) used images that are available.

    If an image is found to match the job pair then it gets added to the job pair.

    This function assumes the language specified in the Travis configuration does not change between the failed and
    passed builds.

    Returns the exact Docker image tag parsed from the original log, or None if no matching image is found.
    """
    log.debug(
        'To detect non-exact pairs, we first extract the used images from the original logs.'
    )
    travis_images = read_json(os.path.join(BASE_DIR, "travis_images.json"))
    provisioned_strs = []
    for language in travis_images:
        provisioned_strs += travis_images[language].values()
    dockerhub_images = read_json(
        os.path.join(BASE_DIR, "dockerhub_images.json"))
    no_original_log = 0
    error_reading_original_log = 0
    orig_log_path = os.path.join(log_path, str(job_id) + ".txt")
    if not download_log(job_id, orig_log_path):
        no_original_log += 1
        # Without an original log there is nothing to parse.
        return None

    # Try to find the image by timestamp. If found, return it.
    # NOTE: `language` here is whatever key the travis_images loop above ended on;
    # in the full pipeline (see Example #8) it comes from the job's Travis configuration.
    image = None
    try:
        chooser = ExactImageChooserByTime(orig_log_path, travis_images,
                                          language)
        orig_log_image_provision_timestamp = chooser.find_image_datetime_from_log()
        image = chooser.get_image_tag()
        if image is not None:
            return image
    except OSError:
        # The original log file could not be read.
        error_reading_original_log += 1
        return None

    # Try to find the image by tag. If found, return it.
    if not image:
        chooser = ExactImageChooserByTag(orig_log_path)
        image = chooser.get_image_tag()
        if image is not None:
            return image
    # Try to find the image by GCE commit SHA. If found, return it.
    if not image:
        chooser = ExactImageChooserByCommitSHA(orig_log_path, dockerhub_images)
        image = chooser.get_image_tag()
        if image is not None:
            return image
    return None
Example #4
def main(argv=None):
    argv = argv or sys.argv

    job_pair_file, output_file = _validate_input(argv)

    log_list = []
    log_dict = {}
    logs_with_fail_tests = []

    # Make temp directory to store downloaded logs
    with tempfile.TemporaryDirectory() as tmpdirname:

        # Download the failing log for each build in the csv file
        with open(job_pair_file) as jp_file:
            for line in jp_file:
                line_split = line.strip().split(',')
                project_name = line_split[0].replace('/', '-')
                failing_job_id = line_split[1]
                passing_job_id = line_split[2]

                log_filename = '{}-{}'.format(project_name, failing_job_id)
                log_path = os.path.join(tmpdirname, log_filename)

                # Download the log
                if log_downloader.download_log(failing_job_id, log_path):
                    log_list.append(log_path)
                    log_dict[log_path] = '{},{},{}'.format(
                        project_name, failing_job_id, passing_job_id)
                else:
                    log.error(
                        'Error trying to download: {}'.format(log_filename))

        # Run Analyzer on each log and filter
        analyzer = Analyzer()
        for fail_log in log_list:
            result = analyzer.analyze_single_log(
                fail_log)['tr_log_num_tests_failed']

            try:
                result_int = int(result)
            # Catch NA's and set to 0
            except ValueError:
                result_int = 0

            if result_int > 0:
                logs_with_fail_tests.append(fail_log)

        # Write out the logs that have at least one failing test
        with open(output_file, 'w') as out_file:
            for fail_log in logs_with_fail_tests:
                out_file.write('{}\n'.format(log_dict[fail_log]))
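The parsing loop above implies a three-column CSV input with one job pair per line. A small illustration of the assumed format follows; the values are made up.

# Hypothetical input line for Example #4: '<owner>/<repo>,<failing_job_id>,<passing_job_id>'
line = 'octocat/Hello-World,123456,123457\n'
project_name, failing_job_id, passing_job_id = line.strip().split(',')
project_name = project_name.replace('/', '-')  # 'octocat-Hello-World'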
Example #5
def download_artifact_log(artifact):
    failed_job_id = artifact["failed_job"]["job_id"]
    passed_job_id = artifact["passed_job"]["job_id"]

    mkdir('{}/{}'.format(_TMP_DIR, failed_job_id))
    mkdir('{}/{}'.format(_TMP_DIR, passed_job_id))

    failed_job_orig_log_path = '{}/{}/log-failed.log'.format(
        _TMP_DIR, failed_job_id)
    passed_job_orig_log_path = '{}/{}/log-passed.log'.format(
        _TMP_DIR, passed_job_id)

    result = download_log(failed_job_id, failed_job_orig_log_path)
    if not result:
        print_error(
            'Error downloading log for failed_job_id {}'.format(failed_job_id))
        return -1, failed_job_orig_log_path, passed_job_orig_log_path

    result = download_log(passed_job_id, passed_job_orig_log_path)
    if not result:
        print_error(
            'Error downloading log for passed_job_id {}'.format(passed_job_id))
        return -1, failed_job_orig_log_path, passed_job_orig_log_path
    return 1, failed_job_orig_log_path, passed_job_orig_log_path
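Example #5 signals failure through the first member of its return tuple (-1 on a failed download, 1 on success) rather than by raising. A hedged caller sketch, with artifact standing in for an artifact dict as used above:

# Hypothetical caller sketch for Example #5 (not from the original sources).
status, failed_log, passed_log = download_artifact_log(artifact)
if status != 1:
    print_error('Skipping artifact: original logs unavailable.')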
Example #6
def _get_original_result(analyzer, utils, job_id, trigger_sha, repo):
    original_log_path = utils.get_orig_log_path(job_id)

    # If the original log does not exist in the expected location, try to download it to that location. If the log
    # cannot be downloaded, return error.
    if not os.path.isfile(original_log_path):
        log.debug('Original log not found at {}.'.format(original_log_path))
        log.info('Download original log.')
        if not download_log(job_id, original_log_path):
            log.info('Could not download original log.')
            return None, original_log_path

    original_result = analyzer.analyze_single_log(original_log_path,
                                                  job_id,
                                                  trigger_sha=trigger_sha,
                                                  repo=repo)
    if original_result.get('not_in_supported_language') is True:
        raise ReproduceError(
            'Original log was not generated from a job in a supported programming language. '
            'The primary language was "{}."'.format(
                original_result['primary_language']))
    return original_result, original_log_path
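In Example #6, a missing or undownloadable original log surfaces as a None result rather than an exception, so callers are expected to check for it. A minimal sketch of that check, assuming the same analyzer, utils, job_id, trigger_sha, and repo objects:

# Hypothetical caller sketch for Example #6 (not from the original sources).
original_result, original_log_path = _get_original_result(
    analyzer, utils, job_id, trigger_sha, repo)
if original_result is None:
    log.error('No original result for job {}; log at {} unavailable.'.format(
        job_id, original_log_path))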
Example #7
    def _write_output_json(self):
        log.info('Writing output JSON annotated with match history.')
        pairs = read_json(self.input_file)
        # Write default attributes.
        for p in pairs:
            for jp in p['jobpairs']:
                jp['match_history'] = {}
                jp['failed_job']['match_history'] = {}
                jp['passed_job']['match_history'] = {}
                jp['failed_job']['orig_result'] = ''
                jp['passed_job']['orig_result'] = ''
                jp['failed_job']['mismatch_attrs'] = []
                jp['passed_job']['mismatch_attrs'] = []
                jp['failed_job']['pip_patch'] = False
                jp['passed_job']['pip_patch'] = False

        for p in pairs:
            repo = p['repo']
            if repo not in self.pair_center.repos:
                continue

            # Try to find this build pair in pair center.
            for bp in self.pair_center.repos[repo].buildpairs:
                if p['failed_build']['build_id'] == bp.builds[0].build_id:
                    # Found build pair in pair center.

                    # Optional: Write buildpair match type.
                    # This is not used since we switched to jobpair packaging.
                    p['match'] = bp.match.value
                    trigger_sha = p['failed_build']['head_sha']
                    # Similarly, for each job pair in build pair, try to find it in the pair center.
                    for jp in p['jobpairs']:
                        # For a build that has some jobs filtered and some jobs not filtered,
                        # the job cannot be found in paircenter.
                        if jp['is_filtered']:
                            continue

                        found_in_paircenter = False
                        for jobpair in bp.jobpairs:
                            if str(jobpair.jobs[0].job_id) == str(jp['failed_job']['job_id']):
                                found_in_paircenter = True
                                # Write jobpair match history, analyzed results, and mismatched attributes.
                                jp['match_history'] = jobpair.match_history
                                jp['failed_job']['match_history'] = jobpair.failed_job_match_history
                                jp['passed_job']['match_history'] = jobpair.passed_job_match_history
                                jp['failed_job']['orig_result'] = jobpair.jobs[0].orig_result
                                jp['passed_job']['orig_result'] = jobpair.jobs[1].orig_result
                                jp['failed_job']['mismatch_attrs'] = jobpair.jobs[0].mismatch_attrs
                                jp['passed_job']['mismatch_attrs'] = jobpair.jobs[1].mismatch_attrs
                                jp['failed_job']['pip_patch'] = jobpair.jobs[0].pip_patch
                                jp['passed_job']['pip_patch'] = jobpair.jobs[1].pip_patch

                        if not found_in_paircenter:
                            # If not found in pair center, this jobpair was filtered out.
                            # In this case, we still analyze the original log to get as many attributes as possible.
                            for i in range(2):
                                job_name = 'failed_job' if i == 0 else 'passed_job'
                                job_id = jp[job_name]['job_id']
                                original_log_path = self.utils.get_orig_log_path(job_id)
                                if not download_log(job_id, original_log_path):
                                    continue
                                original_result = self.analyzer.analyze_single_log(original_log_path, job_id,
                                                                                   trigger_sha, repo)
                                if 'not_in_supported_language' in original_result:
                                    continue
                                jp[job_name]['orig_result'] = original_result
                            raise RuntimeError('Unexpected state: Jobpair not found in pair center. Exiting.')

        os.makedirs(self.config.result_json_dir, exist_ok=True)
        filename = self.config.task + '.json'
        filepath = os.path.join(self.config.result_json_dir, filename)
        write_json(filepath, pairs)
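The "Write default attributes" loop in Example #7 gives every job a fixed set of annotation fields before any match data is filled in. For reference, the per-job defaults amount to the following shape (an illustration, not code from the sources):

# Default per-job annotation fields written by Example #7 before matching.
default_job_annotation = {
    'match_history': {},
    'orig_result': '',
    'mismatch_attrs': [],
    'pip_patch': False,
}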
Example #8
def filter_non_exact_images(pairs: List) -> Tuple[int, int, int, int]:
    """
    Check if all jobs in this pair (from both the failed and passed build) used images that are available.

    If an image is found to match the job pair then it gets added to the job pair.

    This function assumes the language specified in the Travis configuration does not change between the failed and
    passed builds.

    Returns a 4-tuple of filter counts. The tuple members represent the following:
    1. The number of pairs filtered due to original log not found
    2. The number of pairs filtered due to an error reading the original log
    3. The number of pairs filtered due to no image provision timestamp in the original log
    4. The number of pairs filtered due to usage of a non-exact Docker image.
    """
    log.debug('To detect non-exact pairs, we first extract the used images from the original logs.')

    travis_images = read_json(TRAVIS_IMAGES_JSON)
    provisioned_strs = []
    for language in travis_images:
        provisioned_strs += travis_images[language].values()

    dockerhub_images = read_json(DOCKERHUB_IMAGES_JSON)
    filtered = 0
    no_original_log = 0
    error_reading_original_log = 0
    no_image_provision_timestamp = 0
    inaccessible_image = 0
    exact_jobs = 0
    images_we_have = {}
    images_we_dont_have = {}

    log.debug('Analyzing original logs to extract used images.')
    processed = 0
    for p in pairs:
        config = p['failed_build']['jobs'][0]['config']

        # Travis defaults to the Ruby image if the language is not specified.
        # See https://github.com/travis-ci/travis-ci/issues/4895.
        language = config.get('language') or 'ruby'

        # Multiple languages can be specified by using a list. In this case, we take the first language in the list.
        # We should eventually consider supporting the behavior mentioned in
        # https://stackoverflow.com/a/44054333/5007059 if it becomes officially supported.
        if isinstance(language, list):
            language = language[0]

        if language == 'java':
            language = 'jvm'

        for jp in p['jobpairs']:
            # If the job pair has already been filtered, skip it.
            if utils.jobpair_is_filtered(jp):
                continue

            jobs = [jp['failed_job'], jp['passed_job']]
            for j in jobs:
                processed += 1

                job_id = j['job_id']
                orig_log_path = utils.get_orig_log_path(job_id)
                if not download_log(job_id, orig_log_path):
                    no_original_log += 1
                    jp[FILTERED_REASON_KEY] = reasons.NO_ORIGINAL_LOG
                    continue

                # Try to find the image by timestamp. If found, add it to the job pair.
                try:
                    chooser = ExactImageChooserByTime(orig_log_path, travis_images, language)
                    orig_log_image_provision_timestamp = chooser.find_image_datetime_from_log()
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image
                except OSError:
                    # The original log file was not found.
                    error_reading_original_log += 1
                    jp[FILTERED_REASON_KEY] = reasons.ERROR_READING_ORIGINAL_LOG
                    continue

                if not orig_log_image_provision_timestamp:
                    # Jobs older than 01/2015 did not use Docker, so the build log does not contain an image provision
                    # timestamp.
                    no_image_provision_timestamp += 1
                    jp[FILTERED_REASON_KEY] = reasons.NO_IMAGE_PROVISION_TIMESTAMP
                    continue

                # Try to find image by tag. If found, add it to the job pair.
                if not image:
                    chooser = ExactImageChooserByTag(orig_log_path)
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image
                # Try to find image by GCE commit SHA. If found, add it to the job pair.
                if not image:
                    chooser = ExactImageChooserByCommitSHA(orig_log_path, dockerhub_images)
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image

                # 'tr_build_image' is the attribute containing the provision timestamp extracted from a build log.
                if orig_log_image_provision_timestamp not in provisioned_strs and image is None:
                    # This image is inaccessible.
                    inaccessible_image += 1
                    if orig_log_image_provision_timestamp not in images_we_dont_have and image is None:
                        images_we_dont_have[orig_log_image_provision_timestamp] = 1
                    else:
                        images_we_dont_have[orig_log_image_provision_timestamp] += 1
                    jp[FILTERED_REASON_KEY] = reasons.INACCESSIBLE_IMAGE
                else:
                    exact_jobs += 1
                    if orig_log_image_provision_timestamp not in images_we_have:
                        images_we_have[orig_log_image_provision_timestamp] = 1
                    else:
                        images_we_have[orig_log_image_provision_timestamp] += 1

            if utils.jobpair_is_filtered(jp):
                filtered += 1

    # Print the images we have and do not have and how many times they are used by these jobs.
    log.debug('Stats about images that we have:')
    for k in images_we_have:
        log.debug('{} jobs use an image provisioned on {}.'.format(images_we_have[k], k))
    log.debug('Stats about images that we do not have:')
    for k in images_we_dont_have:
        log.debug('{} jobs use an unavailable image provisioned on {}.'.format(images_we_dont_have[k], k))
    log.debug('Total exact jobs: {}'.format(exact_jobs))
    log.debug('Total non-exact jobs: {}'.format(inaccessible_image))
    log.debug('Jobs with missing logs: {}'.format(no_original_log))
    utils.log_filter_count(filtered, 'jobpairs that use non-exact images')
    return no_original_log, error_reading_original_log, no_image_provision_timestamp, inaccessible_image
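Per its docstring, Example #8 returns the four filter counts in a fixed order. A hedged sketch of how a caller might report them, assuming pairs was loaded with read_json:

# Hypothetical usage sketch for Example #8 (not from the original sources).
no_log, read_err, no_timestamp, non_exact = filter_non_exact_images(pairs)
log.info('Filtered {} pairs with no original log, {} with unreadable logs, '
         '{} without a provision timestamp, and {} using non-exact images.'.format(
             no_log, read_err, no_timestamp, non_exact))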
Example #9
    def cache_artifact_dependency(self):
        response = bugswarmapi.find_artifact(self.image_tag)
        if not response.ok:
            raise CachingScriptError('Unable to get artifact data')

        artifact = response.json()
        build_system = artifact['build_system']
        job_id = {
            'failed': artifact['failed_job']['job_id'],
            'passed': artifact['passed_job']['job_id'],
        }
        repo = artifact['repo']

        job_orig_log = {
            'failed':
            '{}/orig-failed-{}.log'.format(self.workdir, job_id['failed']),
            'passed':
            '{}/orig-passed-{}.log'.format(self.workdir, job_id['passed']),
        }
        if not download_log(job_id['failed'], job_orig_log['failed']):
            raise CachingScriptError(
                'Error downloading log for failed job {}'.format(
                    job_id['failed']))
        if not download_log(job_id['passed'], job_orig_log['passed']):
            raise CachingScriptError(
                'Error downloading log for passed job {}'.format(
                    job_id['passed']))

        docker_image_tag = '{}:{}'.format(self.args.src_repo, self.image_tag)
        original_size = self.pull_image(docker_image_tag)

        # Start a failed build and a passed build to collect cached files
        caching_build_log_path = {
            'failed': '{}/cache-failed.log'.format(self.workdir),
            'passed': '{}/cache-passed.log'.format(self.workdir),
        }
        for fail_or_pass in ['failed', 'passed']:
            container_id = self.create_container(docker_image_tag, 'cache',
                                                 fail_or_pass)
            src = os.path.join(procutils.HOST_SANDBOX, _COPY_DIR,
                               _PROCESS_SCRIPT)
            des = os.path.join(_TRAVIS_DIR, _PROCESS_SCRIPT)
            self.copy_file_to_container(container_id, src, des)
            self._run_patch_script(container_id, repo, ['add-mvn-local-repo'])
            build_result = self.run_build_script(
                container_id, fail_or_pass,
                caching_build_log_path[fail_or_pass],
                job_orig_log[fail_or_pass], job_id[fail_or_pass], build_system)
            if not build_result and not self.args.ignore_cache_error:
                raise CachingScriptError(
                    'Run build script not reproducible for caching {}'.format(
                        fail_or_pass))
            for name, path_func in CACHE_DIRECTORIES.items():
                cont_path = path_func(fail_or_pass, repo)
                cont_tar = '{}/{}.tar'.format(_TRAVIS_DIR, name)
                host_tar = '{}/{}-{}.tar'.format(self.workdir, name,
                                                 fail_or_pass)
                _, stdout, stderr, ok = self.run_command(
                    'docker exec {} tar -cvf {} {}'.format(
                        container_id, cont_tar, cont_path),
                    fail_on_error=False,
                    print_on_error=False)
                if not ok:
                    # Check whether path does not exist
                    _, _, _, ok = self.run_command(
                        'docker exec {} ls -d {}'.format(
                            container_id, cont_path),
                        fail_on_error=False,
                        print_on_error=False)
                    if ok:
                        self.print_error('Cannot tar {}'.format(cont_path),
                                         stdout, stderr)
                        raise CachingScriptError(
                            'Cannot tar {}'.format(cont_path))
                else:
                    self.copy_file_out_of_container(container_id, cont_tar,
                                                    host_tar)
            self.remove_container(container_id)

        # Create a new container and place files into it
        container_id = self.create_container(docker_image_tag, 'pack')
        # Run patching script (add localRepository and offline)
        src = os.path.join(procutils.HOST_SANDBOX, _COPY_DIR, _PROCESS_SCRIPT)
        des = os.path.join(_TRAVIS_DIR, _PROCESS_SCRIPT)
        self.copy_file_to_container(container_id, src, des)
        self._run_patch_script(container_id, repo,
                               ['add-mvn-local-repo', 'offline-maven'])
        self.remove_file_from_container(container_id, des)
        # Copy files to the new container
        for fail_or_pass in ['failed', 'passed']:
            container_tar_files = []
            for name, path_func in CACHE_DIRECTORIES.items():
                cont_path = path_func(fail_or_pass, repo)
                cont_tar = '{}/{}.tar'.format(_TRAVIS_DIR, name)
                host_tar = '{}/{}-{}.tar'.format(self.workdir, name,
                                                 fail_or_pass)
                if self.args.__getattribute__('no_copy_' +
                                              name.replace('-', '_')):
                    self.logger.info(
                        'Skipping {} because of command line arguments'.format(name))
                    continue
                if not os.path.exists(host_tar):
                    self.logger.info('{} does not exist'.format(host_tar))
                    continue
                self.copy_file_to_container(container_id, host_tar, cont_tar)
                if self.args.separate_passed_failed:
                    container_tar_files.append(cont_tar)
                    continue
                # Without --separate-passed-failed: untar the tar file now
                _, stdout, stderr, ok = self.run_command(
                    'docker exec {} tar --directory / -xkvf {}'.format(
                        container_id, cont_tar),
                    fail_on_error=False,
                    loglevel=logging.INFO)
                if not ok:
                    # Ignore error because tar's -k may return non-zero values
                    self.logger.info('Tar xkvf failed for {}, {}'.format(
                        fail_or_pass, name))
                self.remove_file_from_container(container_id, cont_tar)
            if self.args.separate_passed_failed:
                # With --separate-passed-failed: untar the tar file at the start of build script
                # This can fix some caching errors when failed and passed caches conflict (e.g. in ~/.gradle/)
                self._add_untar_to_build_script(container_id, fail_or_pass,
                                                container_tar_files)
        if not self.args.no_remove_maven_repositories:
            self._remove_container_maven_repositories(container_id,
                                                      '/home/travis/.m2/')
        # Commit cached image
        cached_tag, cached_id = self.docker_commit(self.image_tag,
                                                   container_id)
        self.remove_container(container_id)

        # Start two runs to test cached image
        test_build_log_path = {
            'failed': '{}/test-failed.log'.format(self.workdir),
            'passed': '{}/test-passed.log'.format(self.workdir),
        }
        for fail_or_pass in ['failed', 'passed']:
            container_id = self.create_container(cached_tag, 'test',
                                                 fail_or_pass)
            # When testing here, we by default apply a stricter patch (offline-all-maven, offline-all-gradle)
            if not self.args.no_strict_offline_test:
                src = os.path.join(procutils.HOST_SANDBOX, _COPY_DIR,
                                   _PROCESS_SCRIPT)
                des = os.path.join(_TRAVIS_DIR, _PROCESS_SCRIPT)
                self.copy_file_to_container(container_id, src, des)
                self._run_patch_script(
                    container_id, repo,
                    ['offline-all-maven', 'offline-all-gradle'])
            build_result = self.run_build_script(
                container_id, fail_or_pass, test_build_log_path[fail_or_pass],
                job_orig_log[fail_or_pass], job_id[fail_or_pass], build_system)
            if not build_result:
                raise CachingScriptError(
                    'Run build script not reproducible for testing {}'.format(
                        fail_or_pass))

        # Push image
        latest_layer_size = self.get_last_layer_size(cached_tag)
        self.tag_and_push_cached_image(self.image_tag, cached_tag)
        self.write_output(
            self.image_tag, 'succeed, {}, {}'.format(original_size,
                                                     latest_layer_size))
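Example #9 iterates over CACHE_DIRECTORIES, treating each entry as a name mapped to a function that yields the in-container path for a given run ('failed' or 'passed') and repo. A hypothetical illustration of that shape follows; the names and paths are assumptions, not the project's actual mapping.

# Hypothetical shape of CACHE_DIRECTORIES as consumed by Example #9.
CACHE_DIRECTORIES = {
    'home-m2': lambda fail_or_pass, repo: '/home/travis/.m2',
    'home-gradle': lambda fail_or_pass, repo: '/home/travis/.gradle',
}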
Example #10
    def run(repo: str, dir_of_jsons: str, args: dict):
        git_wrapper = github_wrapper.GitHubWrapper(credentials.GITHUB_TOKENS)
        task_name = repo.replace('/', '-')
        analyzer = Analyzer()
        bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)

        try:
            buildpairs = PairClassifier.load_buildpairs(
                dir_of_jsons, '{}.json'.format(task_name))
        except json.decoder.JSONDecodeError:
            log.error(
                '{} contains invalid JSON. Exiting.'.format(dir_of_jsons))
            sys.exit(1)

        for bp in buildpairs:
            bp_id = bp['_id']
            for jp in bp['jobpairs']:
                _ = jp.setdefault('build_system', "NA")
                files_changed, files_deleted, files_added = [], [], []
                if jp['is_filtered']:
                    continue

                failed_sha = bp['failed_build']['travis_merge_sha'] if bp['failed_build']['travis_merge_sha'] else \
                    bp['failed_build']['head_sha']
                passed_sha = bp['passed_build']['travis_merge_sha'] if bp['passed_build']['travis_merge_sha'] else \
                    bp['passed_build']['head_sha']
                url = 'https://api.github.com/repos/{}/compare/{}...{}'.format(
                    repo, failed_sha, passed_sha)
                status, json_data = git_wrapper.get(url)
                if status is None or not status.ok:
                    print('Network Err: {}'.format(status))
                    continue

                for f in json_data['files']:
                    if f['status'] == 'added':
                        files_added.append(f['filename'])
                    elif f['status'] == 'modified':
                        files_changed.append(f['filename'])
                    elif f['status'] == 'deleted':
                        files_deleted.append(f['filename'])
                failed_job_id = jp['failed_job']['job_id']
                passed_job_id = jp['passed_job']['job_id']

                file_list = [
                    '{}-orig.log'.format(failed_job_id),
                    '{}-orig.log'.format(passed_job_id)
                ]

                origin_log_dir = args.get('log_path')

                # If origin_log_dir is not provided, download the logs.
                if origin_log_dir is None:
                    origin_log_dir = 'original-logs/'
                    os.makedirs(os.path.dirname(origin_log_dir), exist_ok=True)
                    if not download_log(failed_job_id, '{}/{}-orig.log'.format(origin_log_dir, failed_job_id)) \
                            or not download_log(passed_job_id, '{}/{}-orig.log'.format(origin_log_dir, passed_job_id)):
                        print("Error: Log cannot be download for {}".format(
                            failed_job_id))
                        continue

                failed_log = process_logs(origin_log_dir, file_list)
                if failed_log is None:
                    continue
                try:
                    language = bp['failed_build']['jobs'][0]['language']
                except KeyError:
                    log.info("Language not detected")
                    continue
                if language not in ['python', 'java']:
                    print('Lang is :{}'.format(language))
                    continue

                # CLASSIFICATION
                files_modified = []
                files_modified.extend(files_changed)
                files_modified.extend(files_deleted)
                files_modified.extend(files_added)
                files_modified = list(
                    filter(lambda x: '.git' not in x, files_modified))
                is_test, test_confidence, remain_files = classify_test(
                    files_modified)
                is_build, build_confidence, remain_files = classify_build(
                    remain_files, files_modified)
                is_code, code_confidence = classify_code(
                    remain_files, files_modified)
                error_dict, userdefined, _ = process_error(
                    language, failed_log)
                test_confidence = PairClassifier._get_category_confidence(
                    test_confidence)
                build_confidence = PairClassifier._get_category_confidence(
                    build_confidence)
                code_confidence = PairClassifier._get_category_confidence(
                    code_confidence)

                # default to be -1
                num_tests_failed = -1

                try:
                    result = analyzer.analyze_single_log(
                        '{}/{}-orig.log'.format(origin_log_dir, failed_job_id),
                        failed_job_id)
                except BaseException:
                    log.error(
                        'Error analyzing log for {}'.format(failed_job_id))
                    continue
                if 'tr_log_num_tests_failed' in result and not result[
                        'tr_log_num_tests_failed'] == 'NA':
                    num_tests_failed = result['tr_log_num_tests_failed']

                classification = {
                    'code': code_confidence,
                    'test': test_confidence,
                    'build': build_confidence,
                    'exceptions': list(error_dict.keys()),
                    'tr_log_num_tests_failed': num_tests_failed
                }
                jp['classification'] = classification

            log.info('patching job pairs to the database.')
            resp = bugswarmapi.patch_job_pairs(bp_id, bp['jobpairs'])
            if not resp.ok:
                print(resp)

        log.info('Finished classification.')
        log.info('Writing build pairs to the database.')
        log.info('Saving classified json.')
        PairClassifier._save_output(repo, buildpairs)
        log.info('Finished')
Example #11
def gen_files_for_job(job_dispatcher,
                      job,
                      copy_files=False,
                      dependency_solver=False):
    """
    This function generates the files needed to reproduce a job.
    It begins running the steps to reproduce a job. The steps are explained in the comments.
    The steps are mainly calling functions written in the `pipeline` folder.
    Steps:
      Pre-job step: check for skipping
      1. Setup workspace repository: copy, reset, and tar the repository
      2. Ensure .travis.yml exists. Otherwise, skip the job
      3. Download the original log.
      4. Generate the build script with travis-build.
      5. Generate the Dockerfile.
      6. Build the Docker image.
      7. Spawn the Docker container.
      Post-job step: copying files

    :param job_dispatcher:
    :param job:
    :param copy_files:
    :param dependency_solver:
    """
    job_dispatcher.utils.setup_jobpair_dir(job)

    # If all three essential items to build a job are in the task folder, copy them to the workspace folder and return.
    repo_in_task_path = job_dispatcher.utils.get_repo_tar_path_in_task(job)
    build_sh_in_task_path = job_dispatcher.utils.get_build_sh_path_in_task(job)
    dockerfile_in_task_path = job_dispatcher.utils.get_dockerfile_in_task_path(
        job)
    if isfile(repo_in_task_path) and isfile(build_sh_in_task_path) and isfile(
            dockerfile_in_task_path):
        # Before copying from the task directory into the workspace directory, make the workspace folder. If this branch
        # is not executed, the directory is created in `setup_repo`.
        os.makedirs(job_dispatcher.utils.get_reproduce_tmp_dir(job),
                    exist_ok=True)
        job_dispatcher.utils.copy_repo_from_task_into_workspace(job)
        job_dispatcher.utils.copy_build_sh_from_task_into_workspace(job)
        job_dispatcher.utils.copy_dockerfile_from_task_into_workspace(job)
        return

    # STEP 1: Clone, copy, and reset the repository.
    setup_repo(job, job_dispatcher.utils, job_dispatcher)
    reproduce_tmp_path = job_dispatcher.utils.get_reproduce_tmp_dir(job)
    os.makedirs(reproduce_tmp_path, exist_ok=True)

    # STEP 2: Download the original log if we do not yet have it.
    original_log_path = job_dispatcher.utils.get_orig_log_path(job.job_id)
    download_log(job.job_id, original_log_path)

    # STEP 3: Generate the build script with travis-build and then modify and patch it.
    build_sh_path = job_dispatcher.utils.get_build_sh_path(job)
    if not isfile(build_sh_path):
        gen_script(job_dispatcher.utils, job, dependency_solver)
        modify_build_sh(job.repo, build_sh_path)
        # Check if job is java and is jdk7. If so, then patch the build.sh file by adding flags to mvn command to use
        # TLSv1.2 instead of the default TLSv1.0.
        if job.config.get('jdk') in ['oraclejdk7', 'openjdk7']:
            patch_build_script(build_sh_path)

    # STEP 3.5: Tar the repository.
    tar_path = job_dispatcher.utils.get_repo_tar_path(job)
    if not isfile(tar_path):
        tar_repo(job, job_dispatcher.utils)

    # STEP 4: Generate the Dockerfile.
    dockerfile_path = job_dispatcher.utils.get_dockerfile_path(job)
    if not isfile(dockerfile_path):
        gen_dockerfile(job.image_tag, job.job_id, dockerfile_path)

    # Post-job step.
    if copy_files:
        _copy_workspace_files(job_dispatcher.utils, job)
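A minimal caller sketch for Example #11 (not from the original sources), assuming a job_dispatcher and job set up as in the reproducer pipeline; copy_files=True triggers the post-job copy step described in the docstring.

# Hypothetical caller sketch for Example #11.
gen_files_for_job(job_dispatcher, job, copy_files=True)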
Example #12
    def cache_artifact_dependency(self):
        response = bugswarmapi.find_artifact(self.image_tag)
        if not response.ok:
            raise CachingScriptError('Unable to get artifact data')

        artifact = response.json()
        job_id = {
            'failed': artifact['failed_job']['job_id'],
            'passed': artifact['passed_job']['job_id'],
        }
        repo = artifact['repo']

        job_orig_log = {
            'failed': '{}/orig-failed-{}.log'.format(self.workdir, job_id['failed']),
            'passed': '{}/orig-passed-{}.log'.format(self.workdir, job_id['passed']),
        }
        if not download_log(job_id['failed'], job_orig_log['failed']):
            raise CachingScriptError('Error downloading log for failed job {}'.format(job_id['failed']))
        if not download_log(job_id['passed'], job_orig_log['passed']):
            raise CachingScriptError('Error downloading log for passed job {}'.format(job_id['passed']))

        docker_image_tag = '{}:{}'.format(self.args.src_repo, self.image_tag)
        original_size = self.pull_image(docker_image_tag)

        # Reproduce the artifact if needed
        logs_to_parse = {}
        for fail_or_pass in ['failed', 'passed']:
            if self.args.parse_new_log:
                logs_to_parse[fail_or_pass] = '{}/repr-{}-{}.log'.format(self.workdir, fail_or_pass,
                                                                         job_id[fail_or_pass])
                container_id = self.create_container(docker_image_tag, 'repr', fail_or_pass)
                build_result = self.run_build_script(container_id, fail_or_pass, logs_to_parse[fail_or_pass],
                                                     job_orig_log[fail_or_pass], job_id[fail_or_pass], None)
                if not build_result:
                    raise CachingScriptError('Cannot reproduce {}'.format(fail_or_pass))
            else:
                logs_to_parse[fail_or_pass] = job_orig_log[fail_or_pass]

        # Download cached files
        for fail_or_pass in ['failed', 'passed']:
            pip_packages = get_dependencies(logs_to_parse[fail_or_pass])
            self.download_dependencies(fail_or_pass, pip_packages, docker_image_tag)

        # Create a new container and place files into it
        container_id = self.create_container(docker_image_tag, 'pack')
        # Run patching script (add localRepository and offline)
        src = os.path.join(procutils.HOST_SANDBOX, _COPY_DIR, _PROCESS_SCRIPT)
        des = os.path.join(_TRAVIS_DIR, _PROCESS_SCRIPT)
        self.copy_file_to_container(container_id, src, des)
        for fail_or_pass in ['failed', 'passed']:
            self._run_patch_script(container_id, repo, fail_or_pass)
        self.remove_file_from_container(container_id, des)
        # Copy files to the new container
        for fail_or_pass in ['failed', 'passed']:
            self.move_dependencies_into_container(container_id, fail_or_pass)
        # Commit cached image
        cached_tag, cached_id = self.docker_commit(self.image_tag, container_id)
        self.remove_container(container_id)

        # Start two runs to test cached image
        test_build_log_path = {
            'failed': '{}/test-failed.log'.format(self.workdir),
            'passed': '{}/test-passed.log'.format(self.workdir),
        }
        for fail_or_pass in ['failed', 'passed']:
            container_id = self.create_container(cached_tag, 'test', fail_or_pass)
            build_result = self.run_build_script(container_id, fail_or_pass, test_build_log_path[fail_or_pass],
                                                 job_orig_log[fail_or_pass], job_id[fail_or_pass], None)
            if not build_result:
                raise CachingScriptError('Run build script not reproducible for testing {}'.format(fail_or_pass))

        # Push image
        latest_layer_size = self.get_last_layer_size(cached_tag)
        self.tag_and_push_cached_image(self.image_tag, cached_tag)
        self.write_output(self.image_tag, 'succeed, {}, {}'.format(original_size, latest_layer_size))