def test_log() -> None:
    with tempfile.NamedTemporaryFile("w") as f_tmp:
        flutes.set_log_file(f_tmp.name)
        # Setting the log file a second time presumably exercises replacing an
        # already-configured log file.
        flutes.set_log_file(f_tmp.name)
        flutes.set_logging_level("warning")
        flutes.log("info output", "info")
        flutes.log("warning output", "warning")
        flutes.log("error output", "error")
        flutes.log("success output", "success")
def main() -> None:
    if args.n_procs == 0:
        # Only do this in the single-threaded case.
        flutes.register_ipython_excepthook()
    flutes.log(f"Running with {args.n_procs} worker processes", "warning")

    # Check for/create output directories
    make_directory(args.output_dir)

    # Use RAM-backed memory for tmp if available
    if os.path.exists('/dev/shm'):
        tempfile.tempdir = '/dev/shm'

    flutes.set_log_file(args.log_file)
    write_pseudo_registry()

    # Obtain a list of all binaries
    binaries = get_binary_mapping(args.binary_mapping_cache_file)
    flutes.log(f"{len(binaries)} binaries to process.")
    file_count = 0
    db = ghcc.BinaryDB()

    with flutes.safe_pool(args.n_procs, closing=[db]) as pool:
        decompile_fn: Callable[[BinaryInfo], DecompilationResult] = functools.partial(
            decompile, output_dir=args.output_dir, binary_dir=args.binaries_dir,
            timeout=args.timeout)
        for result in pool.imap_unordered(decompile_fn, iter_binaries(db, binaries)):
            file_count += 1
            if result is not None:
                db.add_binary(result.info["repo_owner"], result.info["repo_name"], result.hash,
                              result.status is DecompilationStatus.Success)
            if file_count % 100 == 0:
                flutes.log(f"Processed {file_count} binaries", force_console=True)
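
# A minimal, self-contained sketch of the pool pattern used in `main()` above,
# relying only on the `flutes.safe_pool` / `functools.partial` / `imap_unordered`
# calls already shown in this file. `process_item`, `scale`, and
# `run_pool_sketch` are hypothetical stand-ins for `decompile` and its keyword
# arguments; this illustrates the pattern, it is not part of the pipeline.
import functools

import flutes


def process_item(item: int, *, scale: int) -> int:
    # Stand-in for the real worker function (e.g. `decompile`).
    return item * scale


def run_pool_sketch(n_procs: int) -> None:
    # Bind the fixed keyword arguments up front so each work item only needs a
    # single positional argument, mirroring `functools.partial(decompile, ...)`.
    fn = functools.partial(process_item, scale=10)
    with flutes.safe_pool(n_procs) as pool:
        for count, result in enumerate(pool.imap_unordered(fn, range(1000)), start=1):
            if count % 100 == 0:
                flutes.log(f"Processed {count} items", force_console=True)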
def main() -> None:
    if not ghcc.utils.verify_docker_image(verbose=True):
        exit(1)

    sys.setrecursionlimit(10000)
    args = Arguments()
    if args.pdb:
        flutes.register_ipython_excepthook()
    if args.n_procs == 0:
        globals()['match_functions'] = match_functions.__wrapped__

    if not args.verbose:
        flutes.set_logging_level("quiet", console=True, file=False)
    flutes.set_log_file(args.log_file)
    flutes.log("Running with arguments:\n" + args.to_string(), force_console=True)

    if os.path.exists(args.temp_dir):
        flutes.log(f"Removing contents of temporary folder '{args.temp_dir}'...",
                   "warning", force_console=True)
        ghcc.utils.run_docker_command(["rm", "-rf", "/usr/src/*"], user=0,
                                      directory_mapping={args.temp_dir: "/usr/src"})

    db = ghcc.MatchFuncDB()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    manager = flutes.ProgressBarManager(
        verbose=args.show_progress,
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}{postfix}]")
    with flutes.safe_pool(args.n_procs, closing=[db, manager]) as pool:
        iterator, stats = iter_repos(
            db, args.max_repos, skip_to=args.skip_to,
            cache_path=args.repo_binary_info_cache_path,
            force_reprocess=args.force_reprocess)
        match_fn: Callable[[RepoInfo], Result] = functools.partial(
            match_functions,
            archive_folder=args.archive_dir, temp_folder=args.temp_dir,
            decompile_folder=args.decompile_dir,
            use_fake_libc_headers=args.use_fake_libc_headers,
            preprocess_timeout=args.preprocess_timeout,
            progress_bar=manager.proxy)

        repo_count = stats.repo_count
        func_count = stats.func_count
        func_without_ast_count = stats.func_without_ast_count
        for result in pool.imap_unordered(match_fn, iterator):
            if result is None:
                # Exception occurred.
                if args.exit_on_exception:
                    flutes.log("Exception occurred, exiting because 'exit_on_exception' is True",
                               "warning")
                    break
                continue

            # Write the matched functions to disk.
            result: Result  # type: ignore
            repo_dir = output_dir / result.repo_owner / result.repo_name
            repo_dir.mkdir(parents=True, exist_ok=True)
            with (repo_dir / "matched_funcs.jsonl").open("w") as f:
                for matched_func in result.matched_functions:
                    f.write(json.dumps(matched_func._asdict(), separators=(',', ':')) + "\n")
            for sha, code in result.preprocessed_original_code.items():
                with (repo_dir / f"{sha}.c").open("w") as f:
                    pos = code.rfind(ghcc.parse.FAKE_LIBC_END_LINE)
                    if pos != -1:
                        code = code[(pos + len(ghcc.parse.FAKE_LIBC_END_LINE)):]
                    f.write(code)
            if args.write_db:
                db.add_repo(result.repo_owner, result.repo_name,
                            files_found=result.files_found,
                            funcs_found=result.functions_found,
                            funcs_matched=len(result.matched_functions),
                            funcs_matched_without_ast=result.funcs_without_asts)

            repo_count += 1
            func_count += len(result.matched_functions)
            func_without_ast_count += result.funcs_without_asts
            if repo_count % 100 == 0:
                flutes.log(f"Processed {repo_count} repositories, {func_count} functions matched "
                           f"({func_without_ast_count} w/o AST)", force_console=True)
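
# A small sketch of the "matched_funcs.jsonl" output format produced in
# `main()` above: one compact JSON object per line, using the same
# `separators=(',', ':')` setting. `write_jsonl` and `records` are hypothetical
# names introduced only for illustration.
import json
from pathlib import Path
from typing import Any, Dict, Iterable


def write_jsonl(records: Iterable[Dict[str, Any]], path: Path) -> None:
    # Each record occupies exactly one line, so the file can later be streamed
    # and parsed line by line without loading everything into memory.
    with path.open("w") as f:
        for record in records:
            f.write(json.dumps(record, separators=(',', ':')) + "\n")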
def main() -> None:
    if not ghcc.utils.verify_docker_image(verbose=True):
        exit(1)

    args = Arguments()
    if args.n_procs == 0:
        # Only do this in the single-threaded case.
        flutes.register_ipython_excepthook()
    flutes.set_log_file(args.log_file)
    flutes.set_logging_level(args.logging_level, console=True, file=False)
    flutes.log("Running with arguments:\n" + args.to_string(), force_console=True)

    if os.path.exists(args.clone_folder):
        flutes.log(f"Removing contents of clone folder '{args.clone_folder}'...",
                   "warning", force_console=True)
        ghcc.utils.run_docker_command(["rm", "-rf", "/usr/src/*"], user=0,
                                      directory_mapping={args.clone_folder: "/usr/src"})

    flutes.log("Crawling starts...", "warning", force_console=True)
    db = ghcc.RepoDB()
    libraries: Set[str] = set()
    if args.record_libraries is not None and os.path.exists(args.record_libraries):
        with open(args.record_libraries, "r") as f:
            libraries = set(f.read().split())

    def flush_libraries():
        if args.record_libraries is not None:
            with open(args.record_libraries, "w") as f:
                f.write("\n".join(libraries))

    with flutes.safe_pool(args.n_procs, closing=[db, flush_libraries]) as pool:
        iterator = iter_repos(db, args.repo_list_file, args.max_repos)
        pipeline_fn: Callable[[RepoInfo], Optional[PipelineResult]] = functools.partial(
            clone_and_compile,
            clone_folder=args.clone_folder, binary_folder=args.binary_folder,
            archive_folder=args.archive_folder,
            recursive_clone=args.recursive_clone,
            clone_timeout=args.clone_timeout, compile_timeout=args.compile_timeout,
            force_reclone=args.force_reclone, force_recompile=args.force_recompile,
            docker_batch_compile=args.docker_batch_compile,
            max_archive_size=args.max_archive_size, compression_type=args.compression_type,
            record_libraries=(args.record_libraries is not None),
            record_metainfo=args.record_metainfo,
            gcc_override_flags=args.gcc_override_flags)
        repo_count = 0
        meta_info = MetaInfo()

        for result in pool.imap_unordered(pipeline_fn, iterator):
            repo_count += 1
            if repo_count % 100 == 0:
                flutes.log(f"Processed {repo_count} repositories", force_console=True)
            if result is None:
                continue
            repo_owner, repo_name = result.repo_info.repo_owner, result.repo_info.repo_name
            if args.write_db:
                if result.clone_success is not None or result.repo_info.db_result is None:
                    # There's probably an inconsistency somewhere if we didn't clone while
                    # `db_result` is None. To prevent more errors, just add it to the DB.
                    repo_size = result.repo_size or -1  # a value of zero is probably also wrong
                    clone_success = result.clone_success if result.clone_success is not None else True
                    db.add_repo(repo_owner, repo_name, clone_success, repo_size=repo_size)
                    flutes.log(f"Added {repo_owner}/{repo_name} to DB")
                if result.makefiles is not None:
                    update_result = db.update_makefile(
                        repo_owner, repo_name, result.makefiles, ignore_length_mismatch=True)
                    if not update_result:
                        flutes.log(f"Makefiles of {repo_owner}/{repo_name} not saved to DB due to "
                                   f"Unicode encoding errors", "error")
            if result.libraries is not None:
                libraries.update(result.libraries)
                if repo_count % 10 == 0:  # flush every 10 repos
                    flush_libraries()
            if args.record_metainfo:
                meta_info.add_repo(result)
                if repo_count % 100 == 0:
                    flutes.log(repr(meta_info), force_console=True)

    flutes.log(repr(meta_info), force_console=True)
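
# A standalone sketch of the periodic library-flush idea from `main()` above,
# written against plain files only: accumulate names in memory and rewrite the
# file every few updates so a crash loses at most one batch. `LibraryRecorder`,
# `record`, and `every` are hypothetical names; the real script keeps the same
# state in a set plus the `flush_libraries` closure registered with the pool.
from typing import Optional, Set


class LibraryRecorder:
    def __init__(self, path: Optional[str], every: int = 10) -> None:
        self.path = path
        self.every = every
        self.count = 0
        self.libraries: Set[str] = set()

    def record(self, new_libraries: Set[str]) -> None:
        self.libraries.update(new_libraries)
        self.count += 1
        # Rewrite the file every `every` updates instead of on every update.
        if self.count % self.every == 0:
            self.flush()

    def flush(self) -> None:
        if self.path is not None:
            with open(self.path, "w") as f:
                f.write("\n".join(sorted(self.libraries)))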