Example #1
def main(args=None):
    # Avoid a mutable default argument; fall back to an empty dict.
    args = args or {}
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Classifier'))

    repo_list, pipeline = _validate_input(args)
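    # Directory where the pair-filter stage writes its per-repo JSON output.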
    filter_output_dir = os.path.join(os.path.dirname(__file__),
                                     '../pair-filter/output-json/')

    if pipeline and not os.path.exists(filter_output_dir):
        log.error(
            'pipeline == True, but the pair-filter output directory ({}) does '
            'not exist. Exiting PairClassifier.'.format(filter_output_dir))
        return

    for repo in repo_list:
        if pipeline:
            task_name = repo.replace('/', '-')
            json_path = os.path.join(filter_output_dir, task_name + '.json')
            if not os.path.exists(json_path):
                log.error(json_path, 'does not exist. Repo', repo,
                          'will be skipped.')
                continue
            # Get the input json from the file generated by pair-filter.
            dir_of_jsons = generate_build_pair_json(repo, json_path)
        else:
            # Get the input json from the DB.
            dir_of_jsons = generate_build_pair_json(repo)
        PairClassifier.run(repo, dir_of_jsons, args)
Example #2
def main(argv):
    log.config_logging(getattr(logging, 'INFO', None))
    in_paths, out_path = _validate_input(argv)

    buildpairs = []
    tasks = []
    for path in in_paths:
        with open(path) as f:
            # Get task names to check for previous caching output CSVs
            tasks.append(os.path.basename(os.path.splitext(path)[0]))
            buildpairs += json.load(f)

    to_be_cached = []
    for bp in buildpairs:
        # Only accept reproducible build pairs
        if bp.get('match') != 1:
            continue

        # Make sure language is Java
        java_jobs = []
        for job in bp['failed_build']['jobs']:
            if job['language'] == 'java':
                java_jobs.append(job['job_id'])
        for job in bp['passed_build']['jobs']:
            if job['language'] == 'java':
                java_jobs.append(job['job_id'])

        # Cache all reproducible & unfiltered job pairs that use Java & Maven
        prefix = bp['repo'].replace('/', '-') + '-'
        for jp in bp['jobpairs']:
            should_be_cached = (not jp['is_filtered']
                                and jp['build_system'] == 'Maven'
                                and jp['failed_job']['job_id'] in java_jobs
                                and jp['passed_job']['job_id'] in java_jobs)
            if should_be_cached:
                to_be_cached.append(prefix + str(jp['failed_job']['job_id']))

    try:
        os.mkdir('input')
    except FileExistsError:
        pass

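    # Collect image tags that a previous cache-dependency run has already
    # cached successfully, so they can be skipped when writing the output.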
    cached_image_tags = set()
    for task in tasks:
        if os.path.isfile('../cache-dependency/output/{}.csv'.format(task)):
            with open('../cache-dependency/output/{}.csv'.format(task)) as f:
                for row in f:
                    row_list = row.split(', ')
                    if row_list[1] == 'succeed':
                        cached_image_tags.add(row_list[0])

    with open(out_path, 'w') as f:
        for image_tag in to_be_cached:
            if image_tag not in cached_image_tags:
                f.write(image_tag + '\n')
    log.info('Wrote file to {}/{}'.format(os.getcwd(), out_path))
Example #3
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('ReproducedResultsAnalyzer'))

    input_file, runs, task_name = _validate_input(argv)
    ReproducedResultsAnalyzer(input_file, runs, task_name).run()
Example #4
def main(argv=None):
    log.config_logging(getattr(logging, 'INFO', None))
    argv = argv or sys.argv

    image_tags_file, num_workers = _validate_input(argv)

    t_start = time.time()
    ReproducePairRunner(image_tags_file, workers=num_workers).run()
    t_end = time.time()
    print('Running run_reproduce_pair_wrapper took {}s'.format(t_end - t_start))
Example #5
def main(argv=None):
    log.config_logging(getattr(logging, 'INFO', None))

    argv = argv or sys.argv
    image_tags_file, output_file, args = validate_input(argv, 'python')

    t_start = time.time()
    PatchArtifactRunner(PatchArtifactPythonTask, image_tags_file, _COPY_DIR, output_file, args,
                        workers=args.workers).run()
    t_end = time.time()
    log.info('Running patch took {}s'.format(t_end - t_start))
Example #6
def main(argv=None):
    argv = argv or sys.argv

    if len(argv) != 2:
        log.info('Usage: add_artifact_logs.py <task_name>')
        sys.exit()

    log.config_logging(getattr(logging, 'INFO', None))

    task_name = argv[1]
    ArtifactLogAdder(task_name=task_name).run()
Example #7
def main(args=None):
    # Avoid a mutable default argument; fall back to an empty dict.
    args = args or {}
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Classifier'))

    repo_list = _validate_input(args)

    for repo in repo_list:
        # Get the input json from the DB.
        dir_of_jsons = generate_build_pair_json(repo)
        PairClassifier.run(repo, dir_of_jsons, args)
Example #8
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    output_path, repo, failed_job_id, passed_job_id = _validate_input(argv)

    log.info('Choosing pairs from {}.'.format(repo))

    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    buildpairs = bugswarmapi.filter_mined_build_pairs_for_repo(repo)
    if not buildpairs:
        log.error(
            'No mined build pairs exist in the database for {}. Exiting.'.
            format(repo))
        return 1

    filename = 'artifacts_for_comparing.json'
    if not os.path.isfile(filename):
        artifacts = bugswarmapi.list_artifacts()
        _create_static_artifacts_file(filename, artifacts)
    with open(filename, 'r') as file:
        artifacts = json.load(file)

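    # Keep only job pairs that match the requested failed/passed job IDs and
    # are not already represented by an existing artifact.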
    filtered_buildpairs = []
    filtered_jobpair_count = 0
    for bp in buildpairs:
        filtered_jobpairs = []
        for jp in bp['jobpairs']:
            if should_include_jobpair(jp, failed_job_id, passed_job_id):
                if not is_jp_unique(repo, jp, artifacts):
                    continue
                filtered_jobpairs.append(jp)
                filtered_jobpair_count += 1
        if filtered_jobpairs:
            bp['jobpairs'] = filtered_jobpairs
            filtered_buildpairs.append(bp)

    # Create any missing path components to the output file.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Write the output file.
    write_json(output_path, filtered_buildpairs)
    bp_pluralized = 'buildpair' if len(filtered_buildpairs) == 1 else 'buildpairs'
    jp_pluralized = 'jobpair' if filtered_jobpair_count == 1 else 'jobpairs'
    log.info('Wrote {} {} with {} {} to {}.'.format(len(filtered_buildpairs),
                                                    bp_pluralized,
                                                    filtered_jobpair_count,
                                                    jp_pluralized,
                                                    output_path))
    log.info('Done!')
Example #9
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('MetadataPackager'))

    # Decrease logging severity from the requests library.
    logging.getLogger('requests').setLevel(logging.WARNING)

    input_file, csv_mode = _validate_input(argv)
    Packager(input_file, csv_mode).run()
Example #10
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))
    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('PairFilter'))
    if not path.exists(DOCKERHUB_IMAGES_JSON):
        log.info('File {} not found. Please run gen_image_list.py.'.format(
            DOCKERHUB_IMAGES_JSON))

    repo, dir_of_jsons = _validate_input(argv)
    PairFilter.run(repo, dir_of_jsons)
Example #11
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Analyzer'))

    (mode, reproduced, orig, log_filename, print_result, job_id, build_system,
     trigger_sha, repo) = _validate_input(argv)
    analyzer = Analyzer()
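    # Mode 0 compares a reproduced log against the original log; mode 1
    # analyzes a single log on its own.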
    if mode == 0:
        analyzer.compare_single_log(reproduced, orig, job_id, build_system, trigger_sha, repo, print_result)
    elif mode == 1:
        analyzer.analyze_single_log(log_filename, job_id, build_system, trigger_sha, repo, print_result)
    else:
        raise Exception('Unsupported mode: {}.'.format(mode))
Example #12
def main(argv=None):
    log.config_logging(getattr(logging, 'INFO', None))

    argv = argv or sys.argv
    image_tags_file, output_file, args = validate_input(argv, 'maven')

    # Remains empty if run outside of reproducer pipeline
    repr_metadata_dict = dict()
    # Task JSON path will be an empty string by default
    if args.task_json:
        log.info('Writing pairs to reference dict from ReproducedResultsAnalyzer JSON')
        repr_metadata_dict = get_repr_metadata_dict(args.task_json, repr_metadata_dict)
    t_start = time.time()
    PatchArtifactRunner(PatchArtifactMavenTask, image_tags_file, _COPY_DIR, output_file, repr_metadata_dict,
                        args, workers=args.workers).run()
    t_end = time.time()
    log.info('Running patch took {}s'.format(t_end - t_start))
Example #13
def main(argv=None):
    log.config_logging(getattr(logging, 'INFO', None))

    argv = argv or sys.argv
    image_tags_file, output_file, args = validate_input(argv, 'python')
    # This argument to PatchArtifactRunner is only used for Java at the moment.
    repr_metadata_dict = dict()

    t_start = time.time()
    PatchArtifactRunner(PatchArtifactPythonTask,
                        image_tags_file,
                        _COPY_DIR,
                        output_file,
                        repr_metadata_dict,
                        args,
                        workers=args.workers).run()
    t_end = time.time()
    log.info('Running patch took {}s'.format(t_end - t_start))
Example #14
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    repo_list, output_path, include_attempted, include_archived_only, include_resettable, include_test_failures_only, \
        include_different_base_image, restrict_classified_build, restrict_classified_code, restrict_classified_test, \
        restrict_classified_exception, restrict_build_system, restrict_os_version, restrict_diff_size = \
        _validate_input(argv)

    # create tmp folder to store logs
    os.makedirs('tmp/', exist_ok=True)

    # Returns '1 project' or 'n projects' where n is not 1.
    def _pluralize(n: int):
        s = '{} '.format(n)
        return s + 'project' if n == 1 else s + 'projects'

    # Returns 'Including' if the parameter is truthy and 'Excluding' otherwise.
    def _including_or_excluding(include: bool):
        return 'Including' if include else 'Excluding'

    # Print some context for the upcoming operation.
    log.info('Choosing pairs from {}.'.format(_pluralize(len(repo_list))))
    log.info('{} pairs with at least one reproduce attempt.'.format(
        _including_or_excluding(include_attempted)))
    log.info('{} pairs that are only archived by GitHub.'.format(
        _including_or_excluding(include_archived_only)))
    log.info('{} pairs that are resettable.'.format(
        _including_or_excluding(include_resettable)))
    log.info('{} pairs that have different base images.'.format(
        _including_or_excluding(include_different_base_image)))
    log.info('Excluding pairs that were filtered by PairFilter.')
    if include_test_failures_only:
        log.info('Restricted to test_failures')
    if restrict_classified_build:
        log.info('Restricted to classified build')
    if restrict_classified_test:
        log.info('Restricted to classified test')
    if restrict_classified_code:
        log.info('Restricted to classified code')
    if restrict_classified_exception != '':
        log.info('Restricted to classified exception: {}'.format(
            restrict_classified_exception))
    if restrict_build_system != '':
        log.info(
            'Restricted to build system: {}'.format(restrict_build_system))
    if restrict_os_version != '':
        log.info('Restricted OS version to: {}'.format(restrict_os_version))
    if restrict_diff_size != '':
        log.info('Restricted diff size: {}'.format(restrict_diff_size))
    log.info()

    with ThreadPoolExecutor(max_workers=min(len(repo_list), 64)) as executor:
        future_to_repo = {
            executor.submit(_choose_pairs_from_repo, repo, include_attempted,
                            include_archived_only, include_resettable,
                            include_test_failures_only,
                            include_different_base_image,
                            restrict_classified_build,
                            restrict_classified_code, restrict_classified_test,
                            restrict_classified_exception,
                            restrict_build_system, restrict_os_version,
                            restrict_diff_size): repo
            for repo in repo_list
        }

        errored = 0
        all_lines = []
        skipped_repos = []
        for future in as_completed(future_to_repo):
            try:
                lines, skipped_repo = future.result()
            except Exception as e:
                # Count and log the failure instead of re-raising, so the
                # summary below can report how many repos errored.
                errored += 1
                log.error('Choosing pairs from {} resulted in an error: {}'
                          .format(future_to_repo[future], e))
                continue
            all_lines += lines
            if skipped_repo:
                skipped_repos.append(skipped_repo)

    # Create any missing path components to the output file.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Sort the lines and then append a newline to each line.
    all_lines = [line + '\n' for line in sorted(all_lines)]
    # Write the output file.
    with open(output_path, 'w') as f:
        f.writelines(all_lines)

    # Print some context for the result of the operation.
    log.info()
    log.info('Attempted to choose pairs from {}.'.format(
        _pluralize(len(repo_list))))
    log.info('{} resulted in an error.'.format(_pluralize(errored)))
    if skipped_repos:
        log.info('{} were skipped because they have not been mined:'.format(
            _pluralize(len(skipped_repos))))
        for r in skipped_repos:
            log.info('    {}'.format(r))
    else:
        log.info('0 projects were skipped because they have not been mined.')
    log.info('Wrote {} lines to {}.'.format(len(all_lines), output_path))
    log.info('Done!')
Example #15
def cli():
    """A command line interface for the BugSwarm dataset."""
    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))
Example #16
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Reproducer'))

    # Parse input.
    shortopts = 'i:t:o:kpds'
    longopts = ('input-file= threads= task-name= keep package '
                'dependency-solver skip-check-disk').split()
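    # In getopt, a long option ending in '=' requires an argument.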
    input_file = None
    threads = 1
    task_name = None
    keep = False
    package_mode = False
    dependency_solver = False
    skip_check_disk = False
    try:
        optlist, args = getopt.getopt(argv[1:], shortopts, longopts)
    except getopt.GetoptError:
        log.error('Error parsing arguments. Exiting.')
        print_usage()
        sys.exit(2)
    for opt, arg in optlist:
        if opt in ['-i', '--input-file']:
            input_file = arg
        if opt in ['-t', '--threads']:
            threads = int(arg)
        if opt in ['-o', '--task-name']:
            task_name = arg
        if opt in ['-k', '--keep']:
            keep = True
        if opt in ['-p', '--package']:
            package_mode = True
        if opt in ['-d', '--dependency-solver']:
            dependency_solver = True
        if opt in ['-s', '--skip-check-disk']:
            skip_check_disk = True

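    # input_file and task_name are required; threads must be a positive integer.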
    if not input_file:
        print_usage()
        sys.exit(2)
    if threads <= 0:
        log.error('The threads argument must be greater than 0. Exiting.')
        sys.exit(1)
    if not os.path.isfile(input_file):
        log.error(
            'The input_file argument is not a file or does not exist. Exiting.'
        )
        sys.exit(1)
    if not task_name:
        print_usage()
        sys.exit(2)

    # Initialize JobDispatcher.
    if package_mode:
        reproducer = ImagePackager(input_file, task_name, threads, keep,
                                   package_mode, dependency_solver,
                                   skip_check_disk)
    else:
        reproducer = JobReproducer(input_file, task_name, threads, keep,
                                   package_mode, dependency_solver,
                                   skip_check_disk)
    reproducer.run()
Example #17
def _thread_main(repo, task_name, log_level, skip_if_output_exists,
                 keep_clone):
    log.config_logging(log_level, Utils.log_file_path_from_repo(repo))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('PairFinder'))

    log.info('Processing', repo)
    output_file_path = Utils.output_file_path_from_repo(repo, task_name)
    if (skip_if_output_exists and os.path.exists(output_file_path)
            and os.path.getsize(output_file_path) > 0):
        log.info('Skipping', repo, 'because output already exists.')
        return

    start_time = time.time()

    in_context = {
        'repo': repo,
        'utils': Utils(),
        'keep_clone': keep_clone,
        'task_name': task_name,
        'mined_project_builder': MinedProjectBuilder(),
    }
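    # Each step of the mining pipeline consumes and enriches the shared
    # context assembled above.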
    steps = [
        Preflight(),
        GetJobsFromTravisAPI(),
        GroupJobsByBranch(),
        ExtractAllBuildPairs(),
        AlignJobPairs(),
        GetPullRequestMergeStatuses(),
        DownloadPullRequestCommits(),
        AssignTriggerCommits(),
        AssignBaseCommits(),
        CleanPairs(),
        GetBuildSystemInfo(),
        Postflight(),
    ]
    pipeline = Pipeline(steps)

    result, out_context = pipeline.run(None, in_context)

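    # Record per-project mining statistics and persist them to the database.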
    builder = out_context['mined_project_builder']
    builder.repo = repo
    builder.latest_mined_version = Utils.get_latest_commit_for_repo(repo)
    (mined_build_pairs, mined_job_pairs, mined_pr_build_pairs,
     mined_pr_job_pairs) = Utils.count_mined_pairs_in_branches(result)
    builder.mined_job_pairs = mined_job_pairs
    builder.mined_pr_job_pairs = mined_pr_job_pairs
    builder.mined_build_pairs = mined_build_pairs
    builder.mined_pr_build_pairs = mined_pr_build_pairs
    mined_project = builder.build()
    OutputManager.output_to_database(mined_project)

    if not result:
        # A filter in the pipeline encountered a fatal error and made the pipeline exit early.
        # Skip writing the output file.
        return

    OutputManager.output(repo, output_path=output_file_path, branches=result)

    elapsed = time.time() - start_time
    log.info('Processed {} in {} seconds. Done!'.format(repo, elapsed))