Example #1
def main():
    if args.single_process:
        makefiles = list(compile_makefiles())
    else:
        q = mp.Queue()
        process = mp.Process(target=worker, args=(q, ))
        process.start()
        start_time = time.time()

        makefiles: List[ghcc.RepoDB.MakefileEntry] = []
        while process.is_alive():
            time.sleep(2)  # no rush
            cur_time = time.time()
            # Get stuff out of the queue before possible termination -- otherwise it might deadlock.
            # See https://docs.python.org/3/library/multiprocessing.html#multiprocessing-programming,
            # the "Joining processes that use queues" section.
            read_queue(makefiles, q)
            # Note that it's still possible to have deadlocks if the child process pushed new elements into the queue
            # after we read and before we terminate. A better solution would be to send a message to the child and ask
            # it to quit, and only terminate when it doesn't respond. However, this current implementation is probably
            # good enough for most cases.
            if cur_time - start_time > args.compile_timeout + TIMEOUT_TOLERANCE:
                process.terminate()
                print(f"Timeout ({args.compile_timeout}s), killed", flush=True)
                ghcc.clean(REPO_PATH)  # clean up after the worker process
                break
        read_queue(makefiles, q)

    flutes.kill_proc_tree(
        os.getpid(),
        including_parent=False)  # make sure all subprocesses are dead
    with open(os.path.join(BINARY_PATH, "log.pkl"), "wb") as f:
        pickle.dump(makefiles, f)
    flutes.run_command(["chmod", "-R", "g+w", BINARY_PATH])
    flutes.run_command(["chmod", "-R", "g+w", REPO_PATH])
Example #2
def test_run_command() -> None:
    with open(__file__, "rb") as f:
        code = f.read()
    result = flutes.run_command(["cat", __file__], verbose=True, return_output=True)
    assert result.return_code == 0
    assert result.captured_output == code

    with pytest.raises(subprocess.CalledProcessError, match=r"Captured output:\n\s+Test output"):
        flutes.run_command(["sh", "-c", "echo 'Test output'; exit 1"], verbose=True)
Example #3
def test_gcc_library_log(self) -> None:
    from ghcc.compile import MOCK_PATH
    library_log_path = os.path.join(self.tempdir.name, "libraries.txt")
    env = {
        "PATH": f"{MOCK_PATH}:{os.environ['PATH']}",
        "MOCK_GCC_LIBRARY_LOG": library_log_path,
    }
    libraries = ["pthread", "m", "opencv", "openmp", "library_with_random_name"]
    try:
        flutes.run_command(
            ["gcc", *[f"-l{lib}" for lib in libraries], "nonexistent_file.c"], env=env)
    except subprocess.CalledProcessError:
        pass  # error must occur because file is nonexistent
    assert os.path.exists(library_log_path)
    with open(library_log_path) as f:
        recorded_libraries = f.read().split()
        assert set(libraries) == set(recorded_libraries)
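
The test above relies on a PATH override: a directory containing a mock `gcc` is placed in front of the real one, so the command under test resolves to the mock, which records the `-l` flags. A minimal sketch of the same trick with plain `subprocess` (the `/path/to/mocks` directory and its fake executable are hypothetical):

import os
import subprocess

mock_dir = "/path/to/mocks"  # hypothetical directory containing an executable named "gcc"
env = {**os.environ, "PATH": f"{mock_dir}:{os.environ['PATH']}"}
# Whatever "gcc" now resolves to is the mock, not the system compiler.
subprocess.run(["gcc", "--version"], env=env, check=False)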
Example #4
def test_copy_tree() -> None:
    with tempfile.TemporaryDirectory() as tempdir:
        path = Path(tempdir)
        result = flutes.run_command(
            ["git", "clone", "https://github.com/huzecong/flutes"], cwd=path)
        assert result.return_code == 0
        flutes.copy_tree(path / "flutes", path / "flutes_copy")
        assert flutes.get_folder_size(
            path / "flutes") == flutes.get_folder_size(path / "flutes_copy")
Example #5
def run_decompiler(file_name: str,
                   script: str,
                   env: Optional[EnvDict] = None,
                   timeout: Optional[int] = None):
    r"""Run a decompiler script.

    :param file_name: The binary to be decompiled.
    :param script: The script file to run.
    :param env: An `os.environ` mapping, useful for passing arguments.
    :param timeout: Timeout in seconds (default no timeout).
    """
    idacall = [args.ida, '-B', f'-S{script}', file_name]
    try:
        flutes.run_command(idacall, env=env, timeout=timeout)
    except subprocess.CalledProcessError as e:
        if b"Traceback (most recent call last):" in e.output:
            # Exception raised by the Python script called by IDA; re-raise it.
            raise e
        flutes.run_command(['rm', '-f', f'{file_name}.i64'])
        if b"Corrupted pseudo-registry file" in e.output:
            write_pseudo_registry()
            # Run again without try-catch; if it fails, it should crash.
            flutes.run_command(idacall, env=env, timeout=timeout)
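
The error handling above follows a retry-on-known-failure pattern: inspect the captured output of the failed command, repair the known cause, and rerun once without a safety net. A minimal generic sketch of that pattern (the `fix_known_issue` helper and the marker string are hypothetical):

import subprocess

import flutes


def fix_known_issue() -> None:
    # Hypothetical repair step, e.g. rewriting a corrupted state file.
    pass


def run_with_one_retry(cmd, env=None, timeout=None):
    try:
        return flutes.run_command(cmd, env=env, timeout=timeout)
    except subprocess.CalledProcessError as e:
        if b"known transient error" not in e.output:
            raise
        fix_known_issue()
        # Second attempt runs unguarded; if it fails again, let it crash.
        return flutes.run_command(cmd, env=env, timeout=timeout)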
Example #6
def match_functions(
        repo_info: RepoInfo,
        archive_folder: str,
        temp_folder: str,
        decompile_folder: str,
        use_fake_libc_headers: bool = True,
        preprocess_timeout: Optional[int] = None,
        *,
        progress_bar: Optional[flutes.ProgressBarManager.Proxy] = None
) -> Result:
    # Directions:
    # 1. Clone or extract from archive.
    # 2. For each Makefile, rerun the compilation process with the flag "-E", so only the preprocessor is run.
    #    This probably won't take long, as the compiler exits after running the preprocessor, and linking would fail.
    #    Also, consider using "-nostdlib -Ipath/to/fake_libc_include" as suggested by `pycparser`.
    # 3. The .o files are now preprocessed C code. Parse them using `pycparser` to obtain a list of functions.

    start_time = time.time()
    total_files = sum(
        len(makefile) for makefile in repo_info.makefiles.values())
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    archive_path = (Path(archive_folder) /
                    f"{repo_full_name}.tar.gz").absolute()
    repo_dir = (Path(temp_folder) / repo_folder_name).absolute()
    repo_src_path = repo_dir / "src"
    repo_binary_dir = repo_dir / "bin"
    repo_binary_dir.mkdir(parents=True, exist_ok=True)
    has_error = False

    if progress_bar is not None:
        worker_id = flutes.get_worker_id()
        process_name = f"Worker {worker_id}" if worker_id is not None else "Main Process"
        progress_bar.new(total=total_files,
                         desc=process_name + f" [{repo_full_name}]")

    flutes.log(f"Begin processing {repo_full_name} ({total_files} files)")

    if os.path.exists(archive_path):
        # Extract archive
        flutes.run_command(["tar", f"xzf", str(archive_path)],
                           cwd=str(repo_dir))
        (repo_dir / repo_folder_name).rename(repo_src_path)
    else:
        # Clone repo
        if repo_src_path.exists():
            shutil.rmtree(repo_src_path)
        ret = ghcc.clone(repo_info.repo_owner,
                         repo_info.repo_name,
                         clone_folder=str(repo_dir),
                         folder_name="src")
        if ret.error_type not in [None, ghcc.CloneErrorType.SubmodulesFailed]:
            flutes.log(
                f"Failed to clone {repo_full_name}: error type {ret.error_type}",
                "error")
            # Return a dummy result so this repo is ignored in the future.
            return Result(repo_info.repo_owner, repo_info.repo_name, [], {}, 0,
                          0, 0)

    # Write makefile info to pickle
    with (repo_binary_dir / "makefiles.pkl").open("wb") as f_pkl:
        pickle.dump(repo_info.makefiles, f_pkl)

    gcc_flags = "-E"
    directory_mapping = None
    if use_fake_libc_headers:
        gcc_flags = f"-E -nostdlib -I/usr/src/libc"
        directory_mapping = {ghcc.parse.FAKE_LIBC_PATH: "/usr/src/libc"}

    if progress_bar is not None:
        progress_bar.update(postfix={"status": "preprocessing"})
    makefiles = ghcc.docker_batch_compile(
        str(repo_binary_dir),
        str(repo_src_path),
        compile_timeout=preprocess_timeout,
        gcc_override_flags=gcc_flags,
        use_makefile_info_pkl=True,
        directory_mapping=directory_mapping,
        user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
        exception_log_fn=functools.partial(exception_handler,
                                           repo_info=repo_info))

    parser = CParser(lexer=ghcc.parse.CachedCLexer)
    lexer = ghcc.parse.LexerWrapper()
    decompile_path = Path(decompile_folder)
    extractor = ghcc.parse.FunctionExtractor()
    matched_functions: List[MatchedFunction] = []
    preprocessed_original_code: Dict[str, str] = {}
    files_found = 0
    functions_found = 0
    for makefile in makefiles:
        mkfile_dir = Path(makefile['directory'])
        for path, sha in zip(makefile["binaries"], makefile["sha256"]):
            # Load and parse preprocessed original code.
            code_path = str(mkfile_dir / path)
            json_path = decompile_path / (sha + ".jsonl")
            preprocessed_code_path = repo_binary_dir / sha
            if progress_bar is not None:
                progress_bar.update(1, postfix={"file": code_path})
            if not json_path.exists() or not preprocessed_code_path.exists():
                continue
            try:
                with preprocessed_code_path.open("r") as f:
                    code = f.read()
                code = LINE_CONTROL_REGEX.sub("", code)
            except UnicodeDecodeError:
                continue  # probably a real binary file
            preprocessed_original_code[sha] = code
            try:
                original_ast: ASTNode = parser.parse(code,
                                                     filename=os.path.join(
                                                         repo_full_name, path))
            except (pycparser.c_parser.ParseError, AssertionError) as e:
                # For some reason `pycparser` uses `assert`s in places where there should have been a check.
                flutes.log(
                    f"{repo_full_name}: Parser error when processing file "
                    f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True
                continue  # ignore parsing errors
            original_tokens = ghcc.parse.convert_to_tokens(
                code, parser.clex.cached_tokens)
            files_found += 1
            function_asts = extractor.find_functions(original_ast)
            functions_found += len(function_asts)

            # Collect decompiled functions with matching original code.
            with json_path.open("r") as f:
                decompiled_json = [
                    line for line in f if line
                ]  # don't decode, as we only need the function name
            decompiled_funcs: Dict[str, str] = {}  # (func_name) -> decompiled_code
            # (func_name) -> (var_id) -> (decomp_name, orig_name)
            decompiled_var_names: Dict[str, Dict[str, Tuple[str, str]]] = {}

            for line_num, j in enumerate(decompiled_json):
                # Find function name from JSON line without parsing.
                match = JSON_FUNC_NAME_REGEX.search(j)
                assert match is not None
                func_name = match.group(1)
                if func_name not in function_asts:
                    continue

                try:
                    decompiled_data = json.loads(j)
                except json.JSONDecodeError as e:
                    flutes.log(
                        f"{repo_full_name}: Decode error when reading JSON file at {json_path}: "
                        f"{str(e)}", "error")
                    continue
                decompiled_code = decompiled_data["raw_code"]
                # Store the variable names used in the function.
                # We use a random string as the identifier prefix. Sadly, C89 (and `pycparser`) doesn't support Unicode.
                for length in range(3, 10 + 1):
                    var_identifier_prefix = "v" + "".join(
                        random.choices(string.ascii_lowercase, k=length))
                    if var_identifier_prefix not in decompiled_code:
                        break
                else:
                    # No way this is happening, right?
                    flutes.log(
                        f"{repo_full_name}: Could not find valid identifier prefix for "
                        f"{func_name} in {code_path} ({sha})", "error")
                    continue
                # (var_id) -> (decompiled_name, original_name)
                variables: Dict[str, Tuple[str, str]] = {}
                for match in DECOMPILED_VAR_REGEX.finditer(decompiled_code):
                    var_id, decompiled_name, original_name = match.groups()
                    var_id = f"{var_identifier_prefix}_{var_id}"
                    if var_id in variables:
                        assert variables[var_id] == (decompiled_name,
                                                     original_name)
                    else:
                        variables[var_id] = (decompiled_name, original_name)
                decompiled_var_names[func_name] = variables
                # Remove irregularities in decompiled code to make it parsable:
                # - Replace `@@VAR` with special identifiers (literally any identifier that doesn't clash).
                # - Remove the register allocation indication in `var@<rdi>`.
                decompiled_code = DECOMPILED_VAR_REGEX.sub(
                    rf"{var_identifier_prefix}_\1", decompiled_code)
                decompiled_code = DECOMPILED_REG_ALLOC_REGEX.sub(
                    "", decompiled_code)
                if func_name.startswith("_"):
                    # For some reason, Hexrays would chomp off one leading underscore from function names in their
                    # generated code, which might lead to corrupt code (`_01inverse` -> `01inverse`). Here we
                    # heuristically try to find and replace the changed function name.
                    decompiled_code = re.sub(  # replace all identifiers with matching name
                        r"(?<![a-zA-Z0-9_])" + func_name[1:] +
                        r"(?![a-zA-Z0-9_])", func_name, decompiled_code)
                    # Note that this doesn't fix references of the function in other functions. But really, why would
                    # someone name their function `_01inverse`?
                decompiled_funcs[func_name] = decompiled_code

            # Generate code replacing original functions with decompiled functions.
            replacer = ghcc.parse.FunctionReplacer(decompiled_funcs)
            replaced_code = replacer.visit(original_ast)

            # Obtain AST for decompiled code by parsing it again.
            code_to_preprocess = DECOMPILED_CODE_HEADER + "\n" + replaced_code
            try:
                code_to_parse = ghcc.parse.preprocess(code_to_preprocess)
            except ghcc.parse.PreprocessError as e:
                msg = (
                    f"{repo_full_name}: GCC return value nonzero for decompiled code of "
                    f"{code_path} ({sha})")
                if len(e.args) > 0:
                    msg += ":\n" + str(e)
                flutes.log(msg, "error")
                has_error = True
                continue

            try:
                decompiled_ast, code_to_parse = ghcc.parse.parse_decompiled_code(
                    code_to_parse, lexer, parser)
                decompiled_tokens = ghcc.parse.convert_to_tokens(
                    code_to_parse, parser.clex.cached_tokens)
            except (ValueError, pycparser.c_parser.ParseError) as e:
                flutes.log(
                    f"{repo_full_name}: Could not parse decompiled code for "
                    f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True

                # We don't have ASTs for decompiled functions, but we can still dump the code.
                # Use the dummy typedefs to extract functions.
                code_lines = code_to_parse.split("\n")
                func_begin_end: Dict[str, List[Optional[int]]] = defaultdict(
                    lambda: [None, None])
                for idx, line in enumerate(code_lines):
                    name, is_begin = replacer.extract_func_name(line)
                    if name is not None:
                        func_begin_end[name][0 if is_begin else 1] = idx
                for func_name, (begin, end) in func_begin_end.items():
                    if begin is not None and end is not None and func_name in function_asts:
                        decompiled_func_tokens = lexer.lex("\n".join(
                            code_lines[(begin + 1):end]))
                        original_func_ast = function_asts[func_name]
                        original_ast_json, original_func_tokens = serialize(
                            original_func_ast, original_tokens)
                        matched_func = MatchedFunction(
                            file_path=code_path,
                            binary_hash=sha,
                            func_name=func_name,
                            variable_names=decompiled_var_names[func_name],
                            original_tokens=original_func_tokens,
                            decompiled_tokens=decompiled_func_tokens,
                            original_ast_json=original_ast_json,
                            decompiled_ast_json=None)
                        matched_functions.append(matched_func)

            else:
                # We've successfully parsed decompiled code.
                decompiled_func_asts = extractor.find_functions(decompiled_ast)
                for func_name in decompiled_funcs.keys():
                    original_func_ast = function_asts[func_name]
                    if func_name not in decompiled_func_asts:
                        # Maybe there are other Hexrays-renamed functions that we didn't fix; just ignore them.
                        continue
                    decompiled_func_ast = decompiled_func_asts[func_name]
                    original_ast_json, original_func_tokens = serialize(
                        original_func_ast, original_tokens)
                    decompiled_ast_json, decompiled_func_tokens = serialize(
                        decompiled_func_ast, decompiled_tokens)
                    matched_func = MatchedFunction(
                        file_path=code_path,
                        binary_hash=sha,
                        func_name=func_name,
                        variable_names=decompiled_var_names[func_name],
                        original_tokens=original_func_tokens,
                        decompiled_tokens=decompiled_func_tokens,
                        original_ast_json=original_ast_json,
                        decompiled_ast_json=decompiled_ast_json)
                    matched_functions.append(matched_func)

    # Cleanup the folders; if errors occurred, keep the preprocessed code.
    status = ("success" if not has_error and len(matched_functions) > 0 else (
        "warning" if not has_error or len(matched_functions) > 0 else "error"))
    shutil.rmtree(repo_dir)

    end_time = time.time()
    funcs_without_asts = sum(matched_func.decompiled_ast_json is None
                             for matched_func in matched_functions)
    flutes.log(
        f"[{end_time - start_time:6.2f}s] "
        f"{repo_full_name}: "
        f"Files found: {files_found}/{total_files}, "
        f"functions matched: {len(matched_functions)}/{functions_found} "
        f"({funcs_without_asts} w/o ASTs)",
        status,
        force_console=True)
    return Result(repo_owner=repo_info.repo_owner,
                  repo_name=repo_info.repo_name,
                  matched_functions=matched_functions,
                  preprocessed_original_code=preprocessed_original_code,
                  files_found=files_found,
                  functions_found=functions_found,
                  funcs_without_asts=funcs_without_asts)
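
One small technique from the function above that is easy to miss: when fixing the function names whose leading underscore Hexrays chomps off, the substitution uses look-around assertions so that only whole identifiers are replaced. A minimal standalone sketch of that regex trick (the inputs are made up):

import re


def rename_identifier(code: str, old: str, new: str) -> str:
    # Replace `old` only where it is not embedded in a longer identifier.
    pattern = r"(?<![a-zA-Z0-9_])" + re.escape(old) + r"(?![a-zA-Z0-9_])"
    return re.sub(pattern, new, code)


print(rename_identifier("01inverse(x); my01inverse(y);", "01inverse", "_01inverse"))
# prints: _01inverse(x); my01inverse(y);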
Example #7
def _test_debug_info(self, elf_paths: List[str]) -> None:
    # Check if binaries contain debugging information (whether mock GCC works).
    for elf in elf_paths:
        # NOTE: This doesn't work under macOS.
        ret = flutes.run_command(f"objdump --syms {elf} | grep debug | wc -l", return_output=True, shell=True)
        assert int(ret.captured_output.decode('utf-8')) > 0
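
For reference, the same debug-symbol check can be done without a shell pipeline by counting the matching lines in Python. A minimal sketch with plain `subprocess` (the binary path is a placeholder):

import subprocess

out = subprocess.run(["objdump", "--syms", "/path/to/binary"],
                     capture_output=True, check=True).stdout.decode()
num_debug_symbols = sum("debug" in line for line in out.splitlines())
assert num_debug_symbols > 0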
Example #8
def clone_and_compile(
        repo_info: RepoInfo,
        clone_folder: str,
        binary_folder: str,
        archive_folder: str,
        recursive_clone: bool = True,
        clone_timeout: Optional[float] = None,
        compile_timeout: Optional[float] = None,
        force_reclone: bool = False,
        force_recompile: bool = False,
        docker_batch_compile: bool = True,
        max_archive_size: Optional[int] = None,
        compression_type: str = "gzip",
        record_libraries: bool = False,
        record_metainfo: bool = False,
        gcc_override_flags: Optional[str] = None) -> PipelineResult:
    r"""Perform the entire pipeline.

    :param repo_info: Information about the repository.
    :param clone_folder: Path to the folder where the repository will be stored. The actual destination folder will be
        ``clone_folder/repo_owner_____repo_name``, e.g., ``clone_folder/torvalds_____linux``.
        This strange notation is used in order to have a flat directory hierarchy, so we're not left with a bunch of
        empty folders for repository owners.
    :param binary_folder: Path to the folder where compiled binaries will be stored. The actual destination folder will
        be ``binary_folder/repo_owner/repo_name``, e.g., ``binary_folder/torvalds/linux``.
    :param archive_folder: Path to the folder where archived repositories will be stored. The actual archive file will
        be ``archive_folder/repo_owner/repo_name.tar.xz``, e.g., ``archive_folder/torvalds/linux.tar.xz``.

    :param recursive_clone: If ``True``, uses ``--recursive`` when cloning.
    :param clone_timeout: Timeout for cloning, or `None` (default) for unlimited time.
    :param compile_timeout: Timeout for compilation, or `None` (default) for unlimited time.
    :param force_reclone: If ``True``, always clone a fresh copy for compilation. If ``False``, only clone when there
        are no matching archives.
    :param force_recompile: If ``True``, the repository is compiled regardless of the value in DB.
    :param docker_batch_compile: If ``True``, compile all Makefiles within a repository in a single Docker container.
    :param max_archive_size: If specified, only archive repositories whose size is not larger than the given
        value (in bytes).
    :param compression_type: The file type of the archive to produce. Valid values are ``"gzip"`` (faster) and
        ``"xz"`` (smaller).
    :param record_libraries: If ``True``, record the libraries used in compilation.
    :param record_metainfo: If ``True``, record meta-info values.
    :param gcc_override_flags: If not ``None``, these flags will be appended to each invocation of GCC.

    :return: An entry to insert into the DB, or `None` if no operations are required.
    """
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_path = os.path.join(clone_folder, repo_folder_name)
    if compression_type == "xz":
        archive_extension = ".tar.xz"
        tar_type_flag = "J"
    elif compression_type == "gzip":
        archive_extension = ".tar.gz"
        tar_type_flag = "z"
    else:
        raise ValueError(f"Invalid compression type '{compression_type}'")
    archive_path = os.path.abspath(
        os.path.join(archive_folder, f"{repo_full_name}{archive_extension}"))

    repo_entry = repo_info.db_result
    clone_success = None

    # Skip repos that are fully processed
    if (repo_entry is not None
            and (repo_entry["clone_successful"] and not force_reclone)
            and (repo_entry["compiled"] and not force_recompile)):
        return PipelineResult(repo_info)

    # Stage 1: Cloning from GitHub.
    if not force_reclone and os.path.exists(archive_path):
        # Extract the archive instead of cloning.
        try:
            flutes.run_command(["tar", f"x{tar_type_flag}f", archive_path],
                               timeout=clone_timeout,
                               cwd=clone_folder)
            flutes.log(f"{repo_full_name} extracted from archive", "success")
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
            flutes.log(
                f"Unknown error when extracting {repo_full_name}. Captured output: '{e.output}'",
                "error")
            shutil.rmtree(repo_path)
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)
    elif (repo_entry is None or  # not processed
          force_reclone or (repo_entry["clone_successful"] and  # not compiled
                            (not repo_entry["compiled"] or force_recompile) and
                            not os.path.exists(repo_path))):
        clone_result = ghcc.clone(repo_info.repo_owner,
                                  repo_info.repo_name,
                                  clone_folder=clone_folder,
                                  folder_name=repo_folder_name,
                                  timeout=clone_timeout,
                                  skip_if_exists=False,
                                  recursive=recursive_clone)
        clone_success = clone_result.success
        if not clone_result.success:
            if clone_result.error_type is CloneErrorType.FolderExists:
                flutes.log(f"{repo_full_name} skipped because folder exists",
                           "warning")
            elif clone_result.error_type is CloneErrorType.PrivateOrNonexistent:
                flutes.log(
                    f"Failed to clone {repo_full_name} because repository is private or nonexistent",
                    "warning")
            else:
                if clone_result.error_type is CloneErrorType.Unknown:
                    msg = f"Failed to clone {repo_full_name} with unknown error"
                else:  # CloneErrorType.Timeout
                    msg = f"Time expired ({clone_timeout}s) when attempting to clone {repo_full_name}"
                if clone_result.captured_output is not None:
                    msg += f". Captured output: '{clone_result.captured_output!r}'"
                flutes.log(msg, "error")

                if clone_result.error_type is CloneErrorType.Unknown:
                    return PipelineResult(repo_info)  # return dummy info

            return PipelineResult(repo_info, clone_success=clone_success)

        elif clone_result.error_type is CloneErrorType.SubmodulesFailed:
            msg = f"Submodules in {repo_full_name} ignored due to error"
            if clone_result.captured_output is not None:
                msg += f". Captured output: '{clone_result.captured_output!r}'"
            flutes.log(msg, "warning")

        repo_size = flutes.get_folder_size(repo_path)
        flutes.log(
            f"{repo_full_name} successfully cloned ({clone_result.time:.2f}s, "
            f"{flutes.readable_size(repo_size)})", "success")
    else:
        if not repo_entry["clone_successful"]:
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)

    makefiles = None
    libraries = None
    meta_info: Optional[PipelineMetaInfo] = None
    if not repo_entry or not repo_entry["compiled"] or force_recompile:
        # # SPECIAL CHECK: Do not attempt to compile OS kernels!
        # kernel_name = None
        # if contains_in_file(os.path.join(repo_path, "README"), "Linux kernel release"):
        #     kernel_name = "Linux"
        # elif contains_in_file(os.path.join(repo_path, "README"), "FreeBSD source directory"):
        #     kernel_name = "FreeBSD"
        # if kernel_name is not None:
        #     shutil.rmtree(repo_path)
        #     ghcc.log(f"Found {kernel_name} kernel in {repo_full_name}, will not attempt to compile. "
        #              f"Repository deleted", "warning")
        #     return PipelineResult(repo_info, clone_success=clone_success, makefiles=[])

        # Stage 2: Finding Makefiles.
        makefile_dirs = ghcc.find_makefiles(repo_path)
        if len(makefile_dirs) == 0:
            # Repo has no Makefiles, delete.
            shutil.rmtree(repo_path)
            flutes.log(
                f"No Makefiles found in {repo_full_name}, repository deleted",
                "warning")
            return PipelineResult(repo_info,
                                  clone_success=clone_success,
                                  makefiles=[])

        # Stage 3: Compile each Makefile.
        repo_binary_dir = os.path.join(binary_folder, repo_full_name)
        if not os.path.exists(repo_binary_dir):
            os.makedirs(repo_binary_dir)
        flutes.log(f"Starting compilation for {repo_full_name}...")

        if docker_batch_compile:
            makefiles = ghcc.docker_batch_compile(
                repo_binary_dir,
                repo_path,
                compile_timeout,
                record_libraries,
                gcc_override_flags,
                user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
                exception_log_fn=functools.partial(exception_handler,
                                                   repo_info=repo_info))
        else:
            makefiles = list(
                ghcc.compile_and_move(repo_binary_dir, repo_path,
                                      makefile_dirs, compile_timeout,
                                      record_libraries, gcc_override_flags))
        num_succeeded = sum(makefile["success"] for makefile in makefiles)
        if record_libraries:
            library_log_path = os.path.join(repo_binary_dir, "libraries.txt")
            if os.path.exists(library_log_path):
                with open(library_log_path) as f:
                    libraries = list(set(f.read().split()))
            else:
                libraries = []
        num_binaries = sum(len(makefile["binaries"]) for makefile in makefiles)

        msg = f"{num_succeeded} ({len(makefiles)}) out of {len(makefile_dirs)} Makefile(s) " \
              f"in {repo_full_name} compiled (partially), yielding {num_binaries} binaries"
        flutes.log(
            msg,
            "success" if num_succeeded == len(makefile_dirs) else "warning")

        if record_metainfo:
            meta_info = PipelineMetaInfo({
                "num_makefiles": len(makefile_dirs),
                "has_gitmodules": os.path.exists(os.path.join(repo_path, ".gitmodules")),
                "makefiles_using_automake": sum(
                    ghcc.contains_files(directory, ["configure.ac", "configure.in"])
                    for directory in makefile_dirs),
            })

        # Stage 4: Clean and zip repo.
        if max_archive_size is not None and repo_size > max_archive_size:
            shutil.rmtree(repo_path)
            flutes.log(
                f"Removed {repo_full_name} because repository size ({flutes.readable_size(repo_size)}) "
                f"exceeds limits", "info")
        else:
            # Repository is already cleaned in the compile stage.
            os.makedirs(os.path.split(archive_path)[0], exist_ok=True)
            compress_success = False
            try:
                flutes.run_command(
                    ["tar", f"c{tar_type_flag}f", archive_path, repo_folder_name],
                    timeout=clone_timeout,
                    cwd=clone_folder)
                compress_success = True
            except subprocess.TimeoutExpired:
                flutes.log(
                    f"Compression timeout for {repo_full_name}, giving up",
                    "error")
            except subprocess.CalledProcessError as e:
                flutes.log(
                    f"Unknown error when compressing {repo_full_name}. Captured output: '{e.output}'",
                    "error")
            shutil.rmtree(repo_path)
            if compress_success:
                flutes.log(f"Compressed {repo_full_name}, folder removed",
                           "info")
            elif os.path.exists(archive_path):
                os.remove(archive_path)

    return PipelineResult(repo_info,
                          clone_success=clone_success,
                          repo_size=repo_size,
                          makefiles=makefiles,
                          libraries=libraries,
                          meta_info=meta_info)
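
The docstring above fixes a naming convention for the three folders involved. A minimal sketch of how those paths are laid out for one repository (the folder arguments are hypothetical; the gzip extension corresponds to the default `compression_type`):

import os

repo_owner, repo_name = "torvalds", "linux"
clone_folder, binary_folder, archive_folder = "repos", "binaries", "archives"

repo_path = os.path.join(clone_folder, f"{repo_owner}_____{repo_name}")
# -> repos/torvalds_____linux  (flat hierarchy, no per-owner folders)
binary_dir = os.path.join(binary_folder, repo_owner, repo_name)
# -> binaries/torvalds/linux
archive_path = os.path.join(archive_folder, repo_owner, f"{repo_name}.tar.gz")
# -> archives/torvalds/linux.tar.gz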
Example #9
def decompile(binary_info: BinaryInfo,
              output_dir: str,
              binary_dir: str,
              timeout: Optional[int] = None) -> DecompilationResult:
    binary_path = binary_info["path"]
    original_path = binary_info["path_in_repo"]
    binary_hash = os.path.split(binary_path)[1]

    def create_result(
            status: DecompilationStatus,
            time: Optional[datetime.timedelta] = None) -> DecompilationResult:
        return DecompilationResult(binary_info, binary_hash, status, time)

    output_path = os.path.join(output_dir, f"{binary_hash}.jsonl")
    if os.path.exists(output_path):
        # Binary already decompiled, but for some reason it wasn't written to the DB.
        return create_result(DecompilationStatus.Success)

    start = datetime.datetime.now()
    env: EnvDict = os.environ.copy()
    env['IDALOG'] = '/dev/stdout'
    env['PREFIX'] = binary_hash
    file_path = os.path.join(binary_dir, binary_path)

    # Create a temporary directory, since the decompiler makes a lot of additional
    # files that we can't clean up from here.
    with tempfile.TemporaryDirectory() as tempdir:
        # Put the output JSONL file here as well to prevent partially-generated files.
        env['OUTPUT_DIR'] = os.path.abspath(tempdir)
        with tempfile.NamedTemporaryFile(dir=tempdir) as collected_vars:
            # First collect variables.
            env['COLLECTED_VARS'] = collected_vars.name
            with tempfile.NamedTemporaryFile(dir=tempdir) as orig:
                flutes.run_command(['cp', file_path, orig.name])
                # Apply the timeout (if given) to this first run only.
                try:
                    run_decompiler(orig.name,
                                   COLLECT,
                                   env=env,
                                   timeout=timeout)
                except subprocess.TimeoutExpired:
                    flutes.log(f"[TIMED OUT] {original_path} ({binary_path})",
                               "warning")
                    return create_result(DecompilationStatus.TimedOut)
                try:
                    assert pickle.load(collected_vars)  # non-empty
                except Exception:
                    flutes.log(f"[NO VARS] {original_path} ({binary_path})",
                               "warning")
                    return create_result(DecompilationStatus.NoVariables)
            # Make a new stripped copy and pass it the collected vars.
            with tempfile.NamedTemporaryFile(dir=tempdir) as stripped:
                flutes.run_command(['cp', file_path, stripped.name])
                flutes.run_command(['strip', '--strip-debug', stripped.name])
                # Dump the trees.
                # No timeout here, we know it'll run in a reasonable amount of
                # time and don't want mismatched files.
                run_decompiler(stripped.name, DUMP_TREES, env=env)
        jsonl_path = os.path.join(tempdir, f"{binary_hash}.jsonl")
        flutes.run_command(['cp', jsonl_path, output_path])
    end = datetime.datetime.now()
    duration = end - start
    flutes.log(
        f"[OK {duration.total_seconds():5.2f}s] {original_path} ({binary_path})",
        "success")
    return create_result(DecompilationStatus.Success, duration)
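
The structure of `decompile` above boils down to one pattern: keep every scratch file inside a single temporary directory, hand its paths to the child process through environment variables, and let the context managers clean everything up on exit. A minimal sketch of that pattern (the tool invocation and variable names are placeholders):

import os
import subprocess
import tempfile

with tempfile.TemporaryDirectory() as tempdir:
    env = os.environ.copy()
    env["OUTPUT_DIR"] = os.path.abspath(tempdir)
    with tempfile.NamedTemporaryFile(dir=tempdir) as scratch:
        env["SCRATCH_FILE"] = scratch.name
        # Placeholder for the real tool invocation; it reads the env vars above.
        subprocess.run(["true"], env=env, check=True)
# Everything under tempdir has been removed at this point.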