def main():
    if args.single_process:
        makefiles = list(compile_makefiles())
    else:
        q = mp.Queue()
        process = mp.Process(target=worker, args=(q,))
        process.start()
        start_time = time.time()
        makefiles: List[ghcc.RepoDB.MakefileEntry] = []
        while process.is_alive():
            time.sleep(2)  # no rush
            cur_time = time.time()
            # Get stuff out of the queue before possible termination -- otherwise it might deadlock.
            # See https://docs.python.org/3/library/multiprocessing.html#multiprocessing-programming,
            # the "Joining processes that use queues" section.
            read_queue(makefiles, q)
            # Note that it's still possible to have deadlocks if the child process pushes new elements into the
            # queue after we read and before we terminate. A better solution would be to send a message to the
            # child asking it to quit, and only terminate it when it doesn't respond. However, the current
            # implementation is probably good enough for most cases.
            if cur_time - start_time > args.compile_timeout + TIMEOUT_TOLERANCE:
                process.terminate()
                print(f"Timeout ({args.compile_timeout}s), killed", flush=True)
                ghcc.clean(REPO_PATH)  # clean up after the worker process
                break
        read_queue(makefiles, q)
        flutes.kill_proc_tree(os.getpid(), including_parent=False)  # make sure all subprocesses are dead

    with open(os.path.join(BINARY_PATH, "log.pkl"), "wb") as f:
        pickle.dump(makefiles, f)
    flutes.run_command(["chmod", "-R", "g+w", BINARY_PATH])
    flutes.run_command(["chmod", "-R", "g+w", REPO_PATH])
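# NOTE: `main` above calls a `read_queue` helper that is not shown in this section. A minimal sketch,
# assuming it only drains whatever is currently in the queue into the result list without blocking
# (the actual helper may differ):
import queue  # for the `queue.Empty` exception raised by `get_nowait`


def read_queue(makefiles: List[ghcc.RepoDB.MakefileEntry], q: 'mp.Queue') -> None:
    try:
        while True:
            makefiles.append(q.get_nowait())  # non-blocking; raises `queue.Empty` when drained
    except queue.Empty:
        pass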
def test_run_command() -> None:
    with open(__file__, "rb") as f:
        code = f.read()
    result = flutes.run_command(["cat", __file__], verbose=True, return_output=True)
    assert result.return_code == 0
    assert result.captured_output == code

    with pytest.raises(subprocess.CalledProcessError,
                       match=r"Captured output:\n\s+Test output"):
        flutes.run_command(["sh", "-c", "echo 'Test output'; exit 1"], verbose=True)
def test_gcc_library_log(self) -> None:
    from ghcc.compile import MOCK_PATH

    library_log_path = os.path.join(self.tempdir.name, "libraries.txt")
    env = {
        "PATH": f"{MOCK_PATH}:{os.environ['PATH']}",
        "MOCK_GCC_LIBRARY_LOG": library_log_path,
    }
    libraries = ["pthread", "m", "opencv", "openmp", "library_with_random_name"]
    try:
        flutes.run_command(
            ["gcc", *[f"-l{lib}" for lib in libraries], "nonexistent_file.c"], env=env)
    except subprocess.CalledProcessError:
        pass  # an error must occur because the source file does not exist
    assert os.path.exists(library_log_path)
    with open(library_log_path) as f:
        recorded_libraries = f.read().split()
    assert set(libraries) == set(recorded_libraries)
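# NOTE: The test above relies on a mock `gcc` placed in `MOCK_PATH` (prepended to PATH) that records every
# `-l<lib>` argument to the file named by the MOCK_GCC_LIBRARY_LOG environment variable and then delegates
# to the real compiler. The wrapper itself is not shown in this section; the following is only a sketch of
# such a shim under those assumptions -- the actual mock in the repository may behave differently.
import os
import subprocess
import sys


def mock_gcc_main() -> int:
    log_path = os.environ.get("MOCK_GCC_LIBRARY_LOG")
    if log_path is not None:
        libs = [arg[2:] for arg in sys.argv[1:] if arg.startswith("-l")]
        with open(log_path, "a") as f:
            f.writelines(lib + "\n" for lib in libs)
    # Forward all arguments to the real compiler; the hard-coded path is an assumption.
    return subprocess.call(["/usr/bin/gcc", *sys.argv[1:]])


if __name__ == "__main__":
    sys.exit(mock_gcc_main())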
def test_copy_tree() -> None:
    with tempfile.TemporaryDirectory() as tempdir:
        path = Path(tempdir)
        result = flutes.run_command(
            ["git", "clone", "https://github.com/huzecong/flutes"], cwd=path)
        assert result.return_code == 0
        flutes.copy_tree(path / "flutes", path / "flutes_copy")
        assert flutes.get_folder_size(path / "flutes") == flutes.get_folder_size(path / "flutes_copy")
def run_decompiler(file_name: str, script: str, env: Optional[EnvDict] = None,
                   timeout: Optional[int] = None):
    r"""Run a decompiler script.

    :param file_name: The binary to be decompiled.
    :param script: The script file to run.
    :param env: An `os.environ` mapping, useful for passing arguments.
    :param timeout: Timeout in seconds (default: no timeout).
    """
    idacall = [args.ida, '-B', f'-S{script}', file_name]
    try:
        flutes.run_command(idacall, env=env, timeout=timeout)
    except subprocess.CalledProcessError as e:
        if b"Traceback (most recent call last):" in e.output:
            # Exception raised by the Python script called by IDA; propagate it.
            raise e
        flutes.run_command(['rm', '-f', f'{file_name}.i64'])
        if b"Corrupted pseudo-registry file" in e.output:
            write_pseudo_registry()
        # Run again without try-except; if it fails, it should crash.
        flutes.run_command(idacall, env=env, timeout=timeout)
def match_functions(repo_info: RepoInfo, archive_folder: str, temp_folder: str,
                    decompile_folder: str, use_fake_libc_headers: bool = True,
                    preprocess_timeout: Optional[int] = None, *,
                    progress_bar: Optional[flutes.ProgressBarManager.Proxy] = None) -> Result:
    # Directions:
    # 1. Clone or extract from archive.
    # 2. For each Makefile, rerun the compilation process with the flag "-E", so only the preprocessor is run.
    #    This probably won't take long, as the compiler exits after running the preprocessor and linking would fail.
    #    Also, consider using "-nostdlib -Ipath/to/fake_libc_include" as suggested by `pycparser`.
    # 3. The .o files are now preprocessed C code. Parse them using `pycparser` to obtain a list of functions.

    start_time = time.time()
    total_files = sum(len(makefile) for makefile in repo_info.makefiles.values())
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    archive_path = (Path(archive_folder) / f"{repo_full_name}.tar.gz").absolute()
    repo_dir = (Path(temp_folder) / repo_folder_name).absolute()
    repo_src_path = repo_dir / "src"
    repo_binary_dir = repo_dir / "bin"
    repo_binary_dir.mkdir(parents=True, exist_ok=True)
    has_error = False

    if progress_bar is not None:
        worker_id = flutes.get_worker_id()
        process_name = f"Worker {worker_id}" if worker_id is not None else "Main Process"
        progress_bar.new(total=total_files, desc=process_name + f" [{repo_full_name}]")
    flutes.log(f"Begin processing {repo_full_name} ({total_files} files)")

    if os.path.exists(archive_path):
        # Extract the archive.
        flutes.run_command(["tar", "xzf", str(archive_path)], cwd=str(repo_dir))
        (repo_dir / repo_folder_name).rename(repo_src_path)
    else:
        # Clone the repository.
        if repo_src_path.exists():
            shutil.rmtree(repo_src_path)
        ret = ghcc.clone(repo_info.repo_owner, repo_info.repo_name,
                         clone_folder=str(repo_dir), folder_name="src")
        if ret.error_type not in [None, ghcc.CloneErrorType.SubmodulesFailed]:
            flutes.log(f"Failed to clone {repo_full_name}: error type {ret.error_type}", "error")
            # Return a dummy result so this repo is ignored in the future.
            return Result(repo_info.repo_owner, repo_info.repo_name, [], {}, 0, 0, 0)

    # Write Makefile info to a pickle file.
    with (repo_binary_dir / "makefiles.pkl").open("wb") as f_pkl:
        pickle.dump(repo_info.makefiles, f_pkl)

    gcc_flags = "-E"
    directory_mapping = None
    if use_fake_libc_headers:
        gcc_flags = "-E -nostdlib -I/usr/src/libc"
        directory_mapping = {ghcc.parse.FAKE_LIBC_PATH: "/usr/src/libc"}

    if progress_bar is not None:
        progress_bar.update(postfix={"status": "preprocessing"})
    makefiles = ghcc.docker_batch_compile(
        str(repo_binary_dir), str(repo_src_path), compile_timeout=preprocess_timeout,
        gcc_override_flags=gcc_flags, use_makefile_info_pkl=True,
        directory_mapping=directory_mapping,
        user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
        exception_log_fn=functools.partial(exception_handler, repo_info=repo_info))

    parser = CParser(lexer=ghcc.parse.CachedCLexer)
    lexer = ghcc.parse.LexerWrapper()
    decompile_path = Path(decompile_folder)
    extractor = ghcc.parse.FunctionExtractor()
    matched_functions: List[MatchedFunction] = []
    preprocessed_original_code: Dict[str, str] = {}
    files_found = 0
    functions_found = 0
    for makefile in makefiles:
        mkfile_dir = Path(makefile['directory'])
        for path, sha in zip(makefile["binaries"], makefile["sha256"]):
            # Load and parse preprocessed original code.
            code_path = str(mkfile_dir / path)
            json_path = decompile_path / (sha + ".jsonl")
            preprocessed_code_path = repo_binary_dir / sha
            if progress_bar is not None:
                progress_bar.update(1, postfix={"file": code_path})
            if not json_path.exists() or not preprocessed_code_path.exists():
                continue
            try:
                with preprocessed_code_path.open("r") as f:
                    code = f.read()
                code = LINE_CONTROL_REGEX.sub("", code)
            except UnicodeDecodeError:
                continue  # probably a real binary file
            preprocessed_original_code[sha] = code
            try:
                original_ast: ASTNode = parser.parse(code, filename=os.path.join(repo_full_name, path))
            except (pycparser.c_parser.ParseError, AssertionError) as e:
                # For some reason `pycparser` uses `assert`s in places where there should have been a check.
                flutes.log(f"{repo_full_name}: Parser error when processing file "
                           f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True
                continue  # ignore parsing errors
            original_tokens = ghcc.parse.convert_to_tokens(code, parser.clex.cached_tokens)
            files_found += 1
            function_asts = extractor.find_functions(original_ast)
            functions_found += len(function_asts)

            # Collect decompiled functions with matching original code.
            with json_path.open("r") as f:
                decompiled_json = [line for line in f if line]  # don't decode; we only need the function name
            decompiled_funcs: Dict[str, str] = {}  # (func_name) -> decompiled_code
            # (func_name) -> (var_id) -> (decompiled_name, original_name)
            decompiled_var_names: Dict[str, Dict[str, Tuple[str, str]]] = {}
            for line_num, j in enumerate(decompiled_json):
                # Find the function name from the JSON line without parsing it.
                match = JSON_FUNC_NAME_REGEX.search(j)
                assert match is not None
                func_name = match.group(1)
                if func_name not in function_asts:
                    continue

                try:
                    decompiled_data = json.loads(j)
                except json.JSONDecodeError as e:
                    flutes.log(f"{repo_full_name}: Decode error when reading JSON file at {json_path}: "
                               f"{str(e)}", "error")
                    continue
                decompiled_code = decompiled_data["raw_code"]
                # Store the variable names used in the function.
                # We use a random string as the identifier prefix. Sadly, C89 (and `pycparser`) doesn't
                # support Unicode.
                for length in range(3, 10 + 1):
                    var_identifier_prefix = "v" + "".join(random.choices(string.ascii_lowercase, k=length))
                    if var_identifier_prefix not in decompiled_code:
                        break
                else:
                    # No way this is happening, right?
                    flutes.log(f"{repo_full_name}: Could not find a valid identifier prefix for "
                               f"{func_name} in {code_path} ({sha})", "error")
                    continue
                variables: Dict[str, Tuple[str, str]] = {}  # (var_id) -> (decompiled_name, original_name)
                for match in DECOMPILED_VAR_REGEX.finditer(decompiled_code):
                    var_id, decompiled_name, original_name = match.groups()
                    var_id = f"{var_identifier_prefix}_{var_id}"
                    if var_id in variables:
                        assert variables[var_id] == (decompiled_name, original_name)
                    else:
                        variables[var_id] = (decompiled_name, original_name)
                decompiled_var_names[func_name] = variables
                # Remove irregularities in the decompiled code to make it parsable:
                # - Replace `@@VAR` annotations with special identifiers (literally any identifier that
                #   doesn't clash).
                # - Remove the register allocation indication in `var@<rdi>`.
                decompiled_code = DECOMPILED_VAR_REGEX.sub(rf"{var_identifier_prefix}_\1", decompiled_code)
                decompiled_code = DECOMPILED_REG_ALLOC_REGEX.sub("", decompiled_code)
                if func_name.startswith("_"):
                    # For some reason, Hex-Rays chomps off one leading underscore from function names in its
                    # generated code, which might lead to corrupt code (`_01inverse` -> `01inverse`).
                    # Here we heuristically try to find and replace the changed function name.
                    decompiled_code = re.sub(  # replace all identifiers with the matching name
                        r"(?<![a-zA-Z0-9_])" + func_name[1:] + r"(?![a-zA-Z0-9_])",
                        func_name, decompiled_code)
                    # Note that this doesn't fix references to the function in other functions. But really,
                    # why would someone name their function `_01inverse`?
                decompiled_funcs[func_name] = decompiled_code

            # Generate code replacing original functions with decompiled functions.
            replacer = ghcc.parse.FunctionReplacer(decompiled_funcs)
            replaced_code = replacer.visit(original_ast)

            # Obtain the AST for decompiled code by parsing it again.
            code_to_preprocess = DECOMPILED_CODE_HEADER + "\n" + replaced_code
            try:
                code_to_parse = ghcc.parse.preprocess(code_to_preprocess)
            except ghcc.parse.PreprocessError as e:
                msg = (f"{repo_full_name}: GCC returned a non-zero value for decompiled code of "
                       f"{code_path} ({sha})")
                if len(e.args) > 0:
                    msg += ":\n" + str(e)
                flutes.log(msg, "error")
                has_error = True
                continue

            try:
                decompiled_ast, code_to_parse = ghcc.parse.parse_decompiled_code(code_to_parse, lexer, parser)
                decompiled_tokens = ghcc.parse.convert_to_tokens(code_to_parse, parser.clex.cached_tokens)
            except (ValueError, pycparser.c_parser.ParseError) as e:
                flutes.log(f"{repo_full_name}: Could not parse decompiled code for "
                           f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True
                # We don't have ASTs for the decompiled functions, but we can still dump the code.
                # Use the dummy typedefs to extract functions.
                code_lines = code_to_parse.split("\n")
                func_begin_end: Dict[str, List[Optional[int]]] = defaultdict(lambda: [None, None])
                for idx, line in enumerate(code_lines):
                    name, is_begin = replacer.extract_func_name(line)
                    if name is not None:
                        func_begin_end[name][0 if is_begin else 1] = idx
                for func_name, (begin, end) in func_begin_end.items():
                    if begin is not None and end is not None and func_name in function_asts:
                        decompiled_func_tokens = lexer.lex("\n".join(code_lines[(begin + 1):end]))
                        original_func_ast = function_asts[func_name]
                        original_ast_json, original_func_tokens = serialize(original_func_ast, original_tokens)
                        matched_func = MatchedFunction(
                            file_path=code_path, binary_hash=sha, func_name=func_name,
                            variable_names=decompiled_var_names[func_name],
                            original_tokens=original_func_tokens, decompiled_tokens=decompiled_func_tokens,
                            original_ast_json=original_ast_json, decompiled_ast_json=None)
                        matched_functions.append(matched_func)
            else:
                # We've successfully parsed the decompiled code.
                decompiled_func_asts = extractor.find_functions(decompiled_ast)
                for func_name in decompiled_funcs.keys():
                    original_func_ast = function_asts[func_name]
                    if func_name not in decompiled_func_asts:
                        # Maybe there are other Hex-Rays-renamed functions that we didn't fix; just ignore them.
                        continue
                    decompiled_func_ast = decompiled_func_asts[func_name]
                    original_ast_json, original_func_tokens = serialize(original_func_ast, original_tokens)
                    decompiled_ast_json, decompiled_func_tokens = serialize(decompiled_func_ast, decompiled_tokens)
                    matched_func = MatchedFunction(
                        file_path=code_path, binary_hash=sha, func_name=func_name,
                        variable_names=decompiled_var_names[func_name],
                        original_tokens=original_func_tokens, decompiled_tokens=decompiled_func_tokens,
                        original_ast_json=original_ast_json, decompiled_ast_json=decompiled_ast_json)
                    matched_functions.append(matched_func)

    # Clean up the folders; if errors occurred, keep the preprocessed code.
status = ("success" if not has_error and len(matched_functions) > 0 else ( "warning" if not has_error or len(matched_functions) > 0 else "error")) shutil.rmtree(repo_dir) end_time = time.time() funcs_without_asts = sum(matched_func.decompiled_ast_json is None for matched_func in matched_functions) flutes.log( f"[{end_time - start_time:6.2f}s] " f"{repo_full_name}: " f"Files found: {files_found}/{total_files}, " f"functions matched: {len(matched_functions)}/{functions_found} " f"({funcs_without_asts} w/o ASTs)", status, force_console=True) return Result(repo_owner=repo_info.repo_owner, repo_name=repo_info.repo_name, matched_functions=matched_functions, preprocessed_original_code=preprocessed_original_code, files_found=files_found, functions_found=functions_found, funcs_without_asts=funcs_without_asts)
def _test_debug_info(self, elf_paths: List[str]) -> None:
    # Check whether the binaries contain debugging information (i.e., whether the mock GCC works).
    for elf in elf_paths:
        # NOTE: This doesn't work under macOS.
        ret = flutes.run_command(f"objdump --syms {elf} | grep debug | wc -l",
                                 return_output=True, shell=True)
        assert int(ret.captured_output.decode('utf-8')) > 0
def clone_and_compile(repo_info: RepoInfo, clone_folder: str, binary_folder: str, archive_folder: str,
                      recursive_clone: bool = True,
                      clone_timeout: Optional[float] = None, compile_timeout: Optional[float] = None,
                      force_reclone: bool = False, force_recompile: bool = False,
                      docker_batch_compile: bool = True,
                      max_archive_size: Optional[int] = None, compression_type: str = "gzip",
                      record_libraries: bool = False, record_metainfo: bool = False,
                      gcc_override_flags: Optional[str] = None) -> PipelineResult:
    r"""Perform the entire pipeline.

    :param repo_info: Information about the repository.
    :param clone_folder: Path to the folder where the repository will be stored. The actual destination folder
        will be ``clone_folder/repo_owner_____repo_name``, e.g., ``clone_folder/torvalds_____linux``. This strange
        notation is used in order to have a flat directory hierarchy, so we're not left with a bunch of empty
        folders for repository owners.
    :param binary_folder: Path to the folder where compiled binaries will be stored. The actual destination folder
        will be ``binary_folder/repo_owner/repo_name``, e.g., ``binary_folder/torvalds/linux``.
    :param archive_folder: Path to the folder where archived repositories will be stored. The actual archive file
        will be ``archive_folder/repo_owner/repo_name.tar.xz``, e.g., ``archive_folder/torvalds/linux.tar.xz``.
    :param recursive_clone: If ``True``, use ``--recursive`` when cloning.
    :param clone_timeout: Timeout for cloning, or ``None`` (default) for unlimited time.
    :param compile_timeout: Timeout for compilation, or ``None`` (default) for unlimited time.
    :param force_reclone: If ``True``, always clone a fresh copy for compilation. If ``False``, only clone when
        there are no matching archives.
    :param force_recompile: If ``True``, the repository is compiled regardless of the value in the DB.
    :param docker_batch_compile: If ``True``, compile all Makefiles within a repository in a single Docker
        container.
    :param max_archive_size: If specified, only archive repositories whose size is not larger than the given
        value (in bytes).
    :param compression_type: The file type of the archive to produce. Valid values are ``"gzip"`` (faster) and
        ``"xz"`` (smaller).
    :param record_libraries: If ``True``, record the libraries used in compilation.
    :param record_metainfo: If ``True``, record meta-info values.
    :param gcc_override_flags: If not ``None``, these flags will be appended to each invocation of GCC.
    :return: An entry to insert into the DB, or ``None`` if no operations are required.
    """
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_path = os.path.join(clone_folder, repo_folder_name)
    if compression_type == "xz":
        archive_extension = ".tar.xz"
        tar_type_flag = "J"
    elif compression_type == "gzip":
        archive_extension = ".tar.gz"
        tar_type_flag = "z"
    else:
        raise ValueError(f"Invalid compression type '{compression_type}'")
    archive_path = os.path.abspath(os.path.join(archive_folder, f"{repo_full_name}{archive_extension}"))

    repo_entry = repo_info.db_result
    clone_success = None

    # Skip repos that are fully processed.
    if (repo_entry is not None and
            (repo_entry["clone_successful"] and not force_reclone) and
            (repo_entry["compiled"] and not force_recompile)):
        return PipelineResult(repo_info)

    # Stage 1: Cloning from GitHub.
    if not force_reclone and os.path.exists(archive_path):
        # Extract the archive instead of cloning.
try: flutes.run_command(["tar", f"x{tar_type_flag}f", archive_path], timeout=clone_timeout, cwd=clone_folder) flutes.log(f"{repo_full_name} extracted from archive", "success") except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e: flutes.log( f"Unknown error when extracting {repo_full_name}. Captured output: '{e.output}'", "error") shutil.rmtree(repo_path) return PipelineResult(repo_info) # return dummy info repo_size = flutes.get_folder_size(repo_path) elif (repo_entry is None or # not processed force_reclone or (repo_entry["clone_successful"] and # not compiled (not repo_entry["compiled"] or force_recompile) and not os.path.exists(repo_path))): clone_result = ghcc.clone(repo_info.repo_owner, repo_info.repo_name, clone_folder=clone_folder, folder_name=repo_folder_name, timeout=clone_timeout, skip_if_exists=False, recursive=recursive_clone) clone_success = clone_result.success if not clone_result.success: if clone_result.error_type is CloneErrorType.FolderExists: flutes.log(f"{repo_full_name} skipped because folder exists", "warning") elif clone_result.error_type is CloneErrorType.PrivateOrNonexistent: flutes.log( f"Failed to clone {repo_full_name} because repository is private or nonexistent", "warning") else: if clone_result.error_type is CloneErrorType.Unknown: msg = f"Failed to clone {repo_full_name} with unknown error" else: # CloneErrorType.Timeout msg = f"Time expired ({clone_timeout}s) when attempting to clone {repo_full_name}" if clone_result.captured_output is not None: msg += f". Captured output: '{clone_result.captured_output!r}'" flutes.log(msg, "error") if clone_result.error_type is CloneErrorType.Unknown: return PipelineResult(repo_info) # return dummy info return PipelineResult(repo_info, clone_success=clone_success) elif clone_result.error_type is CloneErrorType.SubmodulesFailed: msg = f"Submodules in {repo_full_name} ignored due to error" if clone_result.captured_output is not None: msg += f". Captured output: '{clone_result.captured_output!r}'" flutes.log(msg, "warning") repo_size = flutes.get_folder_size(repo_path) flutes.log( f"{repo_full_name} successfully cloned ({clone_result.time:.2f}s, " f"{flutes.readable_size(repo_size)})", "success") else: if not repo_entry["clone_successful"]: return PipelineResult(repo_info) # return dummy info repo_size = flutes.get_folder_size(repo_path) makefiles = None libraries = None meta_info: Optional[PipelineMetaInfo] = None if not repo_entry or not repo_entry["compiled"] or force_recompile: # # SPECIAL CHECK: Do not attempt to compile OS kernels! # kernel_name = None # if contains_in_file(os.path.join(repo_path, "README"), "Linux kernel release"): # kernel_name = "Linux" # elif contains_in_file(os.path.join(repo_path, "README"), "FreeBSD source directory"): # kernel_name = "FreeBSD" # if kernel_name is not None: # shutil.rmtree(repo_path) # ghcc.log(f"Found {kernel_name} kernel in {repo_full_name}, will not attempt to compile. " # f"Repository deleted", "warning") # return PipelineResult(repo_info, clone_success=clone_success, makefiles=[]) # Stage 2: Finding Makefiles. makefile_dirs = ghcc.find_makefiles(repo_path) if len(makefile_dirs) == 0: # Repo has no Makefiles, delete. shutil.rmtree(repo_path) flutes.log( f"No Makefiles found in {repo_full_name}, repository deleted", "warning") return PipelineResult(repo_info, clone_success=clone_success, makefiles=[]) else: pass # Stage 3: Compile each Makefile. 
        repo_binary_dir = os.path.join(binary_folder, repo_full_name)
        if not os.path.exists(repo_binary_dir):
            os.makedirs(repo_binary_dir)
        flutes.log(f"Starting compilation for {repo_full_name}...")

        if docker_batch_compile:
            makefiles = ghcc.docker_batch_compile(
                repo_binary_dir, repo_path, compile_timeout, record_libraries, gcc_override_flags,
                user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
                exception_log_fn=functools.partial(exception_handler, repo_info=repo_info))
        else:
            makefiles = list(ghcc.compile_and_move(
                repo_binary_dir, repo_path, makefile_dirs, compile_timeout,
                record_libraries, gcc_override_flags))
        num_succeeded = sum(makefile["success"] for makefile in makefiles)
        if record_libraries:
            library_log_path = os.path.join(repo_binary_dir, "libraries.txt")
            if os.path.exists(library_log_path):
                with open(library_log_path) as f:
                    libraries = list(set(f.read().split()))
            else:
                libraries = []
        num_binaries = sum(len(makefile["binaries"]) for makefile in makefiles)

        msg = (f"{num_succeeded} ({len(makefiles)}) out of {len(makefile_dirs)} Makefile(s) "
               f"in {repo_full_name} compiled (partially), yielding {num_binaries} binaries")
        flutes.log(msg, "success" if num_succeeded == len(makefile_dirs) else "warning")

        if record_metainfo:
            meta_info = PipelineMetaInfo({
                "num_makefiles": len(makefile_dirs),
                "has_gitmodules": os.path.exists(os.path.join(repo_path, ".gitmodules")),
                "makefiles_using_automake": sum(
                    ghcc.contains_files(directory, ["configure.ac", "configure.in"])
                    for directory in makefile_dirs),
            })

    # Stage 4: Clean and zip the repo.
    if max_archive_size is not None and repo_size > max_archive_size:
        shutil.rmtree(repo_path)
        flutes.log(f"Removed {repo_full_name} because repository size ({flutes.readable_size(repo_size)}) "
                   f"exceeds limits", "info")
    else:
        # The repository is already cleaned in the compile stage.
        os.makedirs(os.path.split(archive_path)[0], exist_ok=True)
        compress_success = False
        try:
            flutes.run_command(["tar", f"c{tar_type_flag}f", archive_path, repo_folder_name],
                               timeout=clone_timeout, cwd=clone_folder)
            compress_success = True
        except subprocess.TimeoutExpired:
            flutes.log(f"Compression timeout for {repo_full_name}, giving up", "error")
        except subprocess.CalledProcessError as e:
            flutes.log(f"Unknown error when compressing {repo_full_name}. Captured output: '{e.output}'",
                       "error")
        shutil.rmtree(repo_path)
        if compress_success:
            flutes.log(f"Compressed {repo_full_name}, folder removed", "info")
        elif os.path.exists(archive_path):
            os.remove(archive_path)

    return PipelineResult(repo_info, clone_success=clone_success, repo_size=repo_size,
                          makefiles=makefiles, libraries=libraries, meta_info=meta_info)
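# NOTE: `clone_and_compile` and `match_functions` both build their `exception_log_fn` from an
# `exception_handler` that is not shown in this section. A minimal sketch, assuming the callback simply
# receives the raised exception and logs it together with the repository it came from (the real handler
# may also persist a full traceback):
def exception_handler(e: Exception, repo_info: RepoInfo) -> None:
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    flutes.log(f"Exception occurred when processing {repo_full_name}: {type(e).__name__}: {e}", "error")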
def decompile(binary_info: BinaryInfo, output_dir: str, binary_dir: str,
              timeout: Optional[int] = None) -> DecompilationResult:
    binary_path = binary_info["path"]
    original_path = binary_info["path_in_repo"]
    binary_hash = os.path.split(binary_path)[1]

    def create_result(status: DecompilationStatus,
                      time: Optional[datetime.timedelta] = None) -> DecompilationResult:
        return DecompilationResult(binary_info, binary_hash, status, time)

    output_path = os.path.join(output_dir, f"{binary_hash}.jsonl")
    if os.path.exists(output_path):
        # Binary already decompiled, but for some reason it wasn't written to the DB.
        return create_result(DecompilationStatus.Success)

    start = datetime.datetime.now()
    env: EnvDict = os.environ.copy()
    env['IDALOG'] = '/dev/stdout'
    env['PREFIX'] = binary_hash
    file_path = os.path.join(binary_dir, binary_path)

    # Create a temporary directory, since the decompiler makes a lot of additional
    # files that we can't clean up from here.
    with tempfile.TemporaryDirectory() as tempdir:
        # Put the output JSONL file here as well to prevent partially-generated files.
        env['OUTPUT_DIR'] = os.path.abspath(tempdir)
        with tempfile.NamedTemporaryFile(dir=tempdir) as collected_vars:
            # First, collect variables.
            env['COLLECTED_VARS'] = collected_vars.name
            with tempfile.NamedTemporaryFile(dir=tempdir) as orig:
                flutes.run_command(['cp', file_path, orig.name])
                # Timeout after 30 seconds for the first run.
                try:
                    run_decompiler(orig.name, COLLECT, env=env, timeout=timeout)
                except subprocess.TimeoutExpired:
                    flutes.log(f"[TIMED OUT] {original_path} ({binary_path})", "warning")
                    return create_result(DecompilationStatus.TimedOut)
                try:
                    assert pickle.load(collected_vars)  # non-empty
                except:
                    flutes.log(f"[NO VARS] {original_path} ({binary_path})", "warning")
                    return create_result(DecompilationStatus.NoVariables)

            # Make a new stripped copy and pass it the collected vars.
            with tempfile.NamedTemporaryFile(dir=tempdir) as stripped:
                flutes.run_command(['cp', file_path, stripped.name])
                flutes.run_command(['strip', '--strip-debug', stripped.name])
                # Dump the trees.
                # No timeout here; we know it'll run in a reasonable amount of
                # time and don't want mismatched files.
                run_decompiler(stripped.name, DUMP_TREES, env=env)

        # Copy the result out of the temporary directory before it is deleted.
        jsonl_path = os.path.join(tempdir, f"{binary_hash}.jsonl")
        flutes.run_command(['cp', jsonl_path, output_path])

    end = datetime.datetime.now()
    duration = end - start
    flutes.log(f"[OK {duration.total_seconds():5.2f}s] {original_path} ({binary_path})", "success")
    return create_result(DecompilationStatus.Success, duration)
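# NOTE: Sketches of two supporting definitions referenced by `decompile` and `run_decompiler`, assuming the
# simplest forms consistent with how they are used above. The actual definitions may carry additional
# members or fields (e.g., an error status for unknown failures).
import enum
from typing import Dict

EnvDict = Dict[str, str]  # `os.environ`-style mapping passed to subprocesses


class DecompilationStatus(enum.Enum):
    Success = enum.auto()
    TimedOut = enum.auto()
    NoVariables = enum.auto()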