def main(args=None):
    # Use None instead of a mutable default argument; fall back to an empty dict.
    args = args if args is not None else {}
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Classifier'))

    repo_list, pipeline = _validate_input(args)
    filter_output_dir = os.path.join(os.path.dirname(__file__), '../pair-filter/output-json/')

    if pipeline and not os.path.exists(filter_output_dir):
        log.error('pipeline == true, but output_file_path ({}) does not exist. '
                  'Exiting PairClassifier.'.format(filter_output_dir))
        return

    for repo in repo_list:
        if pipeline:
            task_name = repo.replace('/', '-')
            json_path = os.path.join(filter_output_dir, task_name + '.json')
            if not os.path.exists(json_path):
                log.error(json_path, 'does not exist. Repo', repo, 'will be skipped.')
                continue
            # Get the input JSON from the file generated by pair-filter.
            dir_of_jsons = generate_build_pair_json(repo, json_path)
        else:
            # Get the input JSON from the DB.
            dir_of_jsons = generate_build_pair_json(repo)
        PairClassifier.run(repo, dir_of_jsons, args)
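# Illustration of the path convention the pipeline branch above relies on;
# 'square/okhttp' is a hypothetical example repo, not taken from the source.
# In pipeline mode the classifier expects the pair-filter output at:
#
#   >>> 'square/okhttp'.replace('/', '-') + '.json'
#   'square-okhttp.json'
#
# i.e. ../pair-filter/output-json/square-okhttp.json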
def main(argv):
    log.config_logging(getattr(logging, 'INFO', None))
    in_paths, out_path = _validate_input(argv)

    buildpairs = []
    tasks = []
    for path in in_paths:
        with open(path) as f:
            # Record task names so we can check for previous caching output CSVs.
            tasks.append(os.path.basename(os.path.splitext(path)[0]))
            buildpairs += json.load(f)

    to_be_cached = []
    for bp in buildpairs:
        # Only accept reproducible build pairs.
        if 'match' not in bp or bp['match'] != 1:
            continue
        # Collect the IDs of jobs whose language is Java.
        java_jobs = []
        for job in bp['failed_build']['jobs']:
            if job['language'] == 'java':
                java_jobs.append(job['job_id'])
        for job in bp['passed_build']['jobs']:
            if job['language'] == 'java':
                java_jobs.append(job['job_id'])
        # Cache all reproducible & unfiltered job pairs that use Java & Maven.
        prefix = bp['repo'].replace('/', '-') + '-'
        for jp in bp['jobpairs']:
            should_be_cached = (not jp['is_filtered']
                                and jp['build_system'] == 'Maven'
                                and jp['failed_job']['job_id'] in java_jobs
                                and jp['passed_job']['job_id'] in java_jobs)
            if should_be_cached:
                to_be_cached.append(prefix + str(jp['failed_job']['job_id']))

    try:
        os.mkdir('input')
    except FileExistsError:
        pass

    cached_image_tags = set()
    for task in tasks:
        if os.path.isfile('../cache-dependency/output/{}.csv'.format(task)):
            with open('../cache-dependency/output/{}.csv'.format(task)) as f:
                for row in f:
                    # Strip the trailing newline before splitting the row.
                    row_list = row.strip().split(', ')
                    if row_list[1] == 'succeed':
                        cached_image_tags.add(row_list[0])

    with open(out_path, 'w') as f:
        for image_tag in to_be_cached:
            if image_tag not in cached_image_tags:
                f.write(image_tag + '\n')
    log.info('Wrote file to {}/{}'.format(os.getcwd(), out_path))
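# Sketch of the cache-dependency output CSV format that the parsing loop above
# assumes: one row per attempted image tag, with the outcome in the second
# column. The concrete rows below are hypothetical illustrations.
#
#   square-okhttp-123456789, succeed
#   square-okhttp-987654321, failed
#
# Image tags follow '<org>-<repo>-<failed_job_id>', matching the
# prefix + failed-job-id construction used to populate to_be_cached.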
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('ReproducedResultsAnalyzer'))

    input_file, runs, task_name = _validate_input(argv)
    ReproducedResultsAnalyzer(input_file, runs, task_name).run()
def main(argv=None):
    log.config_logging(getattr(logging, 'INFO', None))
    argv = argv or sys.argv
    image_tags_file, num_workers = _validate_input(argv)

    t_start = time.time()
    ReproducePairRunner(image_tags_file, workers=num_workers).run()
    t_end = time.time()
    log.info('Running run_reproduce_pair_wrapper took {}s'.format(t_end - t_start))
def main(argv=None):
    log.config_logging(getattr(logging, 'INFO', None))
    argv = argv or sys.argv
    image_tags_file, output_file, args = validate_input(argv, 'python')

    t_start = time.time()
    PatchArtifactRunner(PatchArtifactPythonTask, image_tags_file, _COPY_DIR, output_file, args,
                        workers=args.workers).run()
    t_end = time.time()
    log.info('Running patch took {}s'.format(t_end - t_start))
def main(argv=None):
    argv = argv or sys.argv
    if len(argv) != 2:
        log.info('Usage: add_artifact_logs.py <task_name>')
        sys.exit()
    log.config_logging(getattr(logging, 'INFO', None))
    task_name = argv[1]
    ArtifactLogAdder(task_name=task_name).run()
def main(args=None):
    # Use None instead of a mutable default argument; fall back to an empty dict.
    args = args if args is not None else {}
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Classifier'))

    repo_list = _validate_input(args)
    for repo in repo_list:
        # Get the input JSON from the DB.
        dir_of_jsons = generate_build_pair_json(repo)
        PairClassifier.run(repo, dir_of_jsons, args)
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    output_path, repo, failed_job_id, passed_job_id = _validate_input(argv)

    log.info('Choosing pairs from {}.'.format(repo))
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    buildpairs = bugswarmapi.filter_mined_build_pairs_for_repo(repo)
    if not buildpairs:
        log.error('No mined build pairs exist in the database for {}. Exiting.'.format(repo))
        return 1

    filename = 'artifacts_for_comparing.json'
    if not os.path.isfile(filename):
        artifacts = bugswarmapi.list_artifacts()
        _create_static_artifacts_file(filename, artifacts)

    with open(filename, 'r') as file:
        artifacts = json.load(file)

    filtered_buildpairs = []
    filtered_jobpair_count = 0
    for bp in buildpairs:
        filtered_jobpairs = []
        for jp in bp['jobpairs']:
            if should_include_jobpair(jp, failed_job_id, passed_job_id):
                if not is_jp_unique(repo, jp, artifacts):
                    continue
                filtered_jobpairs.append(jp)
                filtered_jobpair_count += 1
        if filtered_jobpairs:
            bp['jobpairs'] = filtered_jobpairs
            filtered_buildpairs.append(bp)

    # Create any missing path components to the output file.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Write the output file.
    write_json(output_path, filtered_buildpairs)

    bp_pluralized = 'buildpair' if len(filtered_buildpairs) == 1 else 'buildpairs'
    jp_pluralized = 'jobpair' if filtered_jobpair_count == 1 else 'jobpairs'
    log.info('Wrote {} {} with {} {} to {}.'.format(
        len(filtered_buildpairs), bp_pluralized, filtered_jobpair_count, jp_pluralized, output_path))
    log.info('Done!')
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('MetadataPackager'))

    # Decrease logging severity from the requests library.
    logging.getLogger('requests').setLevel(logging.WARNING)

    input_file, csv_mode = _validate_input(argv)
    Packager(input_file, csv_mode).run()
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('PairFilter'))

    if not path.exists(DOCKERHUB_IMAGES_JSON):
        log.info('File {} not found. Please run gen_image_list.py.'.format(DOCKERHUB_IMAGES_JSON))

    repo, dir_of_jsons = _validate_input(argv)
    PairFilter.run(repo, dir_of_jsons)
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Analyzer'))

    mode, reproduced, orig, log_filename, print_result, job_id, build_system, trigger_sha, repo = \
        _validate_input(argv)

    analyzer = Analyzer()
    if mode == 0:
        analyzer.compare_single_log(reproduced, orig, job_id, build_system, trigger_sha, repo, print_result)
    elif mode == 1:
        analyzer.analyze_single_log(log_filename, job_id, build_system, trigger_sha, repo, print_result)
    else:
        raise Exception('Unsupported mode: {}.'.format(mode))
def main(argv=None):
    log.config_logging(getattr(logging, 'INFO', None))
    argv = argv or sys.argv
    image_tags_file, output_file, args = validate_input(argv, 'maven')

    # Remains empty if run outside of the reproducer pipeline.
    repr_metadata_dict = dict()
    # The task JSON path is an empty string by default.
    if args.task_json:
        log.info('Writing pairs to reference dict from ReproducedResultsAnalyzer JSON')
        repr_metadata_dict = get_repr_metadata_dict(args.task_json, repr_metadata_dict)

    t_start = time.time()
    PatchArtifactRunner(PatchArtifactMavenTask, image_tags_file, _COPY_DIR, output_file,
                        repr_metadata_dict, args, workers=args.workers).run()
    t_end = time.time()
    log.info('Running patch took {}s'.format(t_end - t_start))
def main(argv=None):
    log.config_logging(getattr(logging, 'INFO', None))
    argv = argv or sys.argv
    image_tags_file, output_file, args = validate_input(argv, 'python')

    # This PatchArtifactRunner argument is only used for Java at the moment.
    repr_metadata_dict = dict()

    t_start = time.time()
    PatchArtifactRunner(PatchArtifactPythonTask, image_tags_file, _COPY_DIR, output_file,
                        repr_metadata_dict, args, workers=args.workers).run()
    t_end = time.time()
    log.info('Running patch took {}s'.format(t_end - t_start))
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    repo_list, output_path, include_attempted, include_archived_only, include_resettable, \
        include_test_failures_only, include_different_base_image, restrict_classified_build, \
        restrict_classified_code, restrict_classified_test, restrict_classified_exception, \
        restrict_build_system, restrict_os_version, restrict_diff_size = _validate_input(argv)

    # Create a tmp folder to store logs.
    os.makedirs('tmp/', exist_ok=True)

    # Returns '1 project' or 'n projects' where n is not 1.
    def _pluralize(n: int):
        s = '{} '.format(n)
        return s + 'project' if n == 1 else s + 'projects'

    # Returns 'Including' if the parameter is truthy and 'Excluding' otherwise.
    def _including_or_excluding(include: bool):
        return 'Including' if include else 'Excluding'

    # Print some context for the upcoming operation.
    log.info('Choosing pairs from {}.'.format(_pluralize(len(repo_list))))
    log.info('{} pairs with at least one reproduce attempt.'.format(_including_or_excluding(include_attempted)))
    log.info('{} pairs that are only archived by GitHub.'.format(_including_or_excluding(include_archived_only)))
    log.info('{} pairs that are resettable.'.format(_including_or_excluding(include_resettable)))
    log.info('{} pairs that have different base images.'.format(_including_or_excluding(include_different_base_image)))
    log.info('Excluding pairs that were filtered by PairFilter.')
    if include_test_failures_only:
        log.info('Restricted to test failures.')
    if restrict_classified_build:
        log.info('Restricted to classified build.')
    if restrict_classified_test:
        log.info('Restricted to classified test.')
    if restrict_classified_code:
        log.info('Restricted to classified code.')
    if restrict_classified_exception != '':
        log.info('Restricted to classified exception: {}'.format(restrict_classified_exception))
    if restrict_build_system != '':
        log.info('Restricted to build system: {}'.format(restrict_build_system))
    if restrict_os_version != '':
        log.info('Restricted OS version to: {}'.format(restrict_os_version))
    if restrict_diff_size != '':
        log.info('Restricted diff size: {}'.format(restrict_diff_size))
    log.info()

    with ThreadPoolExecutor(max_workers=min(len(repo_list), 64)) as executor:
        future_to_repo = {
            executor.submit(_choose_pairs_from_repo, repo, include_attempted, include_archived_only,
                            include_resettable, include_test_failures_only, include_different_base_image,
                            restrict_classified_build, restrict_classified_code, restrict_classified_test,
                            restrict_classified_exception, restrict_build_system, restrict_os_version,
                            restrict_diff_size): repo
            for repo in repo_list
        }
        errored = 0
        all_lines = []
        skipped_repos = []
        for future in as_completed(future_to_repo):
            try:
                lines, skipped_repo = future.result()
            except Exception as e:
                # Count the failure and keep going so the summary below can
                # report how many repos resulted in an error.
                errored += 1
                log.error('Choosing pairs from {} resulted in an error: {}'.format(future_to_repo[future], e))
                continue
            all_lines += lines
            if skipped_repo:
                skipped_repos.append(skipped_repo)

    # Create any missing path components to the output file.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Sort the lines and then append a newline to each line.
    all_lines = [line + '\n' for line in sorted(all_lines)]

    # Write the output file.
    with open(output_path, 'w') as f:
        f.writelines(all_lines)

    # Print some context for the result of the operation.
    log.info()
    log.info('Attempted to choose pairs from {}.'.format(_pluralize(len(repo_list))))
    log.info('{} resulted in an error.'.format(_pluralize(errored)))
    if len(skipped_repos):
        log.info('{} were skipped because they have not been mined:'.format(_pluralize(len(skipped_repos))))
        for r in skipped_repos:
            log.info('  {}'.format(r))
    else:
        log.info('0 projects were skipped because they have not been mined.')
    log.info('Wrote {} lines to {}.'.format(len(all_lines), output_path))
    log.info('Done!')
def cli():
    """A command line interface for the BugSwarm dataset."""
    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))
def main(argv=None):
    argv = argv or sys.argv

    # Configure logging.
    log.config_logging(getattr(logging, 'INFO', None))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Reproducer'))

    # Parse input.
    shortopts = 'i:t:o:kpds'
    longopts = 'input-file= threads= task-name= keep package dependency-solver skip-check-disk'.split()
    input_file = None
    threads = 1
    task_name = None
    keep = False
    package_mode = False
    dependency_solver = False
    skip_check_disk = False

    try:
        optlist, args = getopt.getopt(argv[1:], shortopts, longopts)
    except getopt.GetoptError:
        log.error('Error parsing arguments. Exiting.')
        print_usage()
        sys.exit(2)

    for opt, arg in optlist:
        if opt in ['-i', '--input-file']:
            input_file = arg
        if opt in ['-t', '--threads']:
            threads = int(arg)
        if opt in ['-o', '--task-name']:
            task_name = arg
        if opt in ['-k', '--keep']:
            keep = True
        if opt in ['-p', '--package']:
            package_mode = True
        if opt in ['-d', '--dependency-solver']:
            dependency_solver = True
        if opt in ['-s', '--skip-check-disk']:
            skip_check_disk = True

    if not input_file:
        print_usage()
        sys.exit(2)
    if threads <= 0:
        log.error('The threads argument must be greater than 0. Exiting.')
        sys.exit(1)
    if not os.path.isfile(input_file):
        log.error('The input_file argument is not a file or does not exist. Exiting.')
        sys.exit(1)
    if not task_name:
        print_usage()
        sys.exit(2)

    # Initialize the job dispatcher.
    if package_mode:
        reproducer = ImagePackager(input_file, task_name, threads, keep, package_mode,
                                   dependency_solver, skip_check_disk)
    else:
        reproducer = JobReproducer(input_file, task_name, threads, keep, package_mode,
                                   dependency_solver, skip_check_disk)
    reproducer.run()
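# Example invocation, assembled from the shortopts/longopts defined above
# (the script name and argument values are hypothetical):
#
#   python entry.py --input-file pairs.json --task-name my-task --threads 4 --keep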
def _thread_main(repo, task_name, log_level, skip_if_output_exists, keep_clone):
    log.config_logging(log_level, Utils.log_file_path_from_repo(repo))

    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('PairFinder'))

    log.info('Processing', repo)
    output_file_path = Utils.output_file_path_from_repo(repo, task_name)
    if skip_if_output_exists and os.path.exists(output_file_path) and os.path.getsize(output_file_path) > 0:
        log.info('Skipping', repo, 'because output already exists.')
        return

    start_time = time.time()

    in_context = {
        'repo': repo,
        'utils': Utils(),
        'keep_clone': keep_clone,
        'task_name': task_name,
        'mined_project_builder': MinedProjectBuilder(),
    }
    steps = [
        Preflight(),
        GetJobsFromTravisAPI(),
        GroupJobsByBranch(),
        ExtractAllBuildPairs(),
        AlignJobPairs(),
        GetPullRequestMergeStatuses(),
        DownloadPullRequestCommits(),
        AssignTriggerCommits(),
        AssignBaseCommits(),
        CleanPairs(),
        GetBuildSystemInfo(),
        Postflight(),
    ]
    pipeline = Pipeline(steps)

    result, out_context = pipeline.run(None, in_context)

    builder = out_context['mined_project_builder']
    builder.repo = repo
    builder.latest_mined_version = Utils.get_latest_commit_for_repo(repo)
    (mined_build_pairs, mined_job_pairs, mined_pr_build_pairs, mined_pr_job_pairs) = \
        Utils.count_mined_pairs_in_branches(result)
    builder.mined_job_pairs = mined_job_pairs
    builder.mined_pr_job_pairs = mined_pr_job_pairs
    builder.mined_build_pairs = mined_build_pairs
    builder.mined_pr_build_pairs = mined_pr_build_pairs
    mined_project = builder.build()
    OutputManager.output_to_database(mined_project)

    if not result:
        # A filter in the pipeline encountered a fatal error and made the pipeline exit early.
        # Skip writing the output file.
        return

    OutputManager.output(repo, output_path=output_file_path, branches=result)

    elapsed = time.time() - start_time
    log.info('Processed {} in {} seconds. Done!'.format(repo, elapsed))
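# A minimal sketch of the step/pipeline contract that _thread_main plugs into,
# written as an assumption for illustration: each step exposes a
# process(data, context) method, and a step can abort the run by returning a
# falsy result, which Pipeline.run propagates as (result, context). The real
# interface lives in the pipeline module and may differ.
class SketchStep:
    def process(self, data, context):
        # Transform `data`, optionally reading from or writing to `context`.
        return data


class SketchPipeline:
    def __init__(self, steps):
        self.steps = steps

    def run(self, data, context):
        for step in self.steps:
            data = step.process(data, context)
            if not data:
                # A step signaled a fatal error; exit the pipeline early.
                break
        return data, context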