def test_clone(self) -> None:
    """Exercise ``ghcc.clone`` on an existing repo, a missing repo, and a timeout."""
    clone_root = self.tempdir.name

    # Case 1: a repository that exists; a known file must be present afterwards.
    existing = ghcc.clone("huzecong", "memes", clone_folder=clone_root)
    self.assertTrue(existing.success, msg=existing.captured_output)
    expected_file = os.path.join(clone_root, "huzecong", "memes", "Get Memes.scpt")
    self.assertTrue(os.path.exists(expected_file), msg=existing.captured_output)

    # Case 2: a repository that does not exist.
    missing = ghcc.clone("huzecong", "non-existent-repo", clone_folder=clone_root)
    self.assertFalse(missing.success, msg=missing.captured_output)
    self.assertEqual(
        ghcc.CloneErrorType.PrivateOrNonexistent,
        missing.error_type,
        msg=missing.captured_output,
    )

    # Case 3: a clone given a timeout of 1, expected to end in a Timeout error.
    timed_out = ghcc.clone("torvalds", "linux", clone_folder=clone_root, timeout=1)
    self.assertFalse(timed_out.success, msg=timed_out.captured_output)
    self.assertEqual(
        ghcc.CloneErrorType.Timeout,
        timed_out.error_type,
        msg=timed_out.captured_output,
    )
def main():
    """Sample failed repositories, clone them, and write a per-Makefile status CSV.

    The function:

    1. Seeds ``random`` and ``numpy.random`` with ``ghcc.__MAGIC__``.
    2. Analyzes the log file given by ``args.log_file`` and selects repositories whose
       last ``n_partial`` entry is smaller than their last ``n_total`` entry.
    3. Randomly samples up to 100 of them (without replacement) and drops those with
       more than 50 Makefiles.
    4. Clones each remaining repository into ``test_compile``.
    5. Writes ``repo_samples.csv`` with one row per Makefile found in the clone. The
       ``Status`` column is empty when the Makefile's directory is listed in the DB
       entry and ``"Failed"`` otherwise. The ``Failed Reason?`` column is not filled in
       by this function.
    """
    flutes.register_ipython_excepthook()

    random.seed(ghcc.__MAGIC__)
    np.random.seed(ghcc.__MAGIC__)

    repo_info = analyze_logs(args.log_file)
    # The return value is not used here; the call is kept in case it has side effects.
    changed_repos(repo_info)

    # Sample (up to) 100 failed repositories.
    # NOTE(review): the last `n_total` entry is unpacked as a 2-tuple further below, so this
    # comparison may be between tuples rather than plain counts -- confirm this is intended.
    repos_with_fail = [
        repo for repo, info in repo_info.items()
        if info["n_partial"][-1] < info["n_total"][-1]
    ]
    # `np.random.choice(..., replace=False)` raises `ValueError` when asked for more samples
    # than the population holds, so cap the sample size at the number of candidates.
    sample_size = min(100, len(repos_with_fail))
    samples = np.random.choice(len(repos_with_fail), sample_size, replace=False)
    _repo_samples = [repos_with_fail[x] for x in samples]

    # Remove repositories with more than 50 Makefiles.
    repo_samples = []
    for repo in _repo_samples:
        _, val = repo_info[repo]["n_total"][-1]
        if val <= 50:
            repo_samples.append(repo)
        else:
            print(f"{repo} contains {val} Makefiles, skipping")

    # Clone the repositories.
    for repo in tqdm(repo_samples, desc="Cloning repos"):
        owner, name = repo.split("/")
        ghcc.clone(owner, name, "test_compile")

    # Write repository information into a CSV file.
    # Each line is a separate Makefile.
    db = ghcc.RepoDB()
    # `newline=""` is what the `csv` module requires of files handed to `csv.writer`;
    # without it, platforms that translate "\n" emit an extra "\r" per row.
    with open("repo_samples.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Repo", "Makefile", "Status", "Failed Reason?"])
        for repo in tqdm(repo_samples, desc="Writing CSV"):
            makefiles = ghcc.find_makefiles(os.path.join("test_compile", repo))
            owner, name = repo.split("/")
            entry = db.get(owner, name)
            # Directories of Makefiles recorded in the DB entry, rewritten as
            # "owner/name/<rest>" so they are comparable with the local paths below.
            success_makefiles = set()
            for makefile_info in entry['makefiles']:
                directory = makefile_info["directory"]
                # Drops the first four path components of the stored directory
                # (presumably a fixed prefix of the original clone location -- TODO confirm).
                directory = "/".join([owner, name] + directory.split("/")[4:])
                success_makefiles.add(directory)
            for makefile in makefiles:
                # Strip the leading "test_compile" component.
                directory = "/".join(makefile.split("/")[1:])
                status = "" if directory in success_makefiles else "Failed"
                writer.writerow([repo, directory, status])
                print(repo, directory, status)
def setUp(self) -> None:
    """Clone ``pjreddie/uwimg`` into a fresh temporary directory and record fixture paths."""
    self.tempdir = tempfile.TemporaryDirectory()
    self.repo_owner = "pjreddie"
    self.repo_name = "uwimg"

    # Clone an existing repo; abort the fixture if the clone does not succeed.
    clone_result = ghcc.clone(
        self.repo_owner,
        self.repo_name,
        clone_folder=self.tempdir.name,
        skip_if_exists=False,
    )
    assert clone_result.success is True, clone_result.captured_output

    self.directory = os.path.join(self.tempdir.name, self.repo_owner, self.repo_name)

    # Relative paths stored in `target_elfs`: the shared library first, then the
    # object files, then the executable.
    shared_library = ["libuwimg.so"]
    object_files = [
        "obj/args.o",
        "obj/classifier.o",
        "obj/data.o",
        "obj/filter_image.o",
        "obj/flow_image.o",
        "obj/harris_image.o",
        "obj/image_opencv.o",
        "obj/list.o",
        "obj/load_image.o",
        "obj/main.o",
        "obj/matrix.o",
        "obj/panorama_image.o",
        "obj/process_image.o",
        "obj/resize_image.o",
        "obj/test.o",
    ]
    executable = ["uwimg"]
    self.target_elfs = shared_library + object_files + executable
def test_serialization(self) -> None:
    """Check that ASTs survive a dict round trip (``ast_to_dict`` then ``dict_to_ast``)."""
    # Clone the `pycparser` repo, whose example C files serve as inputs.
    clone_result = ghcc.clone("eliben", "pycparser", clone_folder=self.tempdir.name)
    assert clone_result.success

    def _assert_round_trip(source: str):
        # Parse, serialize, deserialize, then compare against the originally parsed AST.
        parsed = self.parser.parse(source)
        as_dict = ghcc.parse.ast_to_dict(parsed)
        restored = ghcc.parse.dict_to_ast(as_dict)
        self._test_ast_equivalent(parsed, restored)

    c_files_dir = Path(self.tempdir.name) / "eliben" / "pycparser" / "examples" / "c_files"
    for c_file in c_files_dir.iterdir():
        _assert_round_trip(ghcc.parse.preprocess_file(str(c_file)))

    for snippet, _ in EXAMPLE_CODE:
        _assert_round_trip(ghcc.parse.preprocess(snippet))
def match_functions(
        repo_info: RepoInfo,
        archive_folder: str,
        temp_folder: str,
        decompile_folder: str,
        use_fake_libc_headers: bool = True,
        preprocess_timeout: Optional[int] = None,
        *,
        progress_bar: Optional[flutes.ProgressBarManager.Proxy] = None
) -> Result:
    r"""Match decompiled functions of one repository against its preprocessed original code.

    The repository source is taken from ``archive_folder/<owner>/<name>.tar.gz`` when that file
    exists, and cloned otherwise. The Makefiles are then re-run through
    ``ghcc.docker_batch_compile`` with the GCC flags overridden to ``-E``, so that the produced
    "binaries" are preprocessed C code. Each such file for which
    ``decompile_folder/<sha>.jsonl`` also exists is parsed, and every function found in both
    the original code and the JSONL file yields one :class:`MatchedFunction`.

    :param repo_info: Information about the repository. This function reads its ``repo_owner``,
        ``repo_name``, ``makefiles`` and ``idx`` attributes.
    :param archive_folder: Folder that is searched for ``<owner>/<name>.tar.gz``.
    :param temp_folder: Working folder. The sub-folder ``<owner>_____<name>`` is created inside it
        and removed again before a normal return.
    :param decompile_folder: Folder containing the ``<sha>.jsonl`` files with decompiled code.
    :param use_fake_libc_headers: If ``True``, the GCC flags become ``-E -nostdlib -I/usr/src/libc``
        and ``ghcc.parse.FAKE_LIBC_PATH`` is mapped to ``/usr/src/libc``.
    :param preprocess_timeout: Passed to ``ghcc.docker_batch_compile`` as ``compile_timeout``.
    :param progress_bar: Optional progress bar proxy; updated once per processed file.
    :return: A :class:`Result` with the matched functions, the preprocessed original code keyed by
        SHA, and the counters ``files_found``, ``functions_found`` and ``funcs_without_asts``.
        If cloning fails (other than a submodule failure), an empty dummy result is returned.
    """
    # Directions:
    # 1. Clone or extract from archive.
    # 2. For each Makefile, rerun the compilation process with the flag "-E", so only the preprocessor is run.
    #    This probably won't take long as the compiler exits after running the processor, and linking would fail.
    #    Also, consider using "-nostdlib -Ipath/to/fake_libc_include" as suggested by `pycparser`.
    # 3. The .o files are now preprocessed C code. Parse them using `pycparser` to obtain a list of functions.

    start_time = time.time()
    # Sum of the lengths of all values in `repo_info.makefiles`; used as the progress bar total.
    total_files = sum(
        len(makefile) for makefile in repo_info.makefiles.values())
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    archive_path = (Path(archive_folder) / f"{repo_full_name}.tar.gz").absolute()
    repo_dir = (Path(temp_folder) / repo_folder_name).absolute()
    repo_src_path = repo_dir / "src"
    repo_binary_dir = repo_dir / "bin"
    repo_binary_dir.mkdir(parents=True, exist_ok=True)
    # Set when any file fails to parse/preprocess; only affects the final log status.
    has_error = False

    if progress_bar is not None:
        worker_id = flutes.get_worker_id()
        process_name = f"Worker {worker_id}" if worker_id is not None else "Main Process"
        progress_bar.new(total=total_files, desc=process_name + f" [{repo_full_name}]")

    flutes.log(f"Begin processing {repo_full_name} ({total_files} files)")

    if os.path.exists(archive_path):
        # Extract archive into `repo_dir`, then rename the extracted folder to `src`.
        flutes.run_command(["tar", f"xzf", str(archive_path)], cwd=str(repo_dir))
        (repo_dir / repo_folder_name).rename(repo_src_path)
    else:
        # Clone repo (removing any stale `src` folder first).
        if repo_src_path.exists():
            shutil.rmtree(repo_src_path)
        ret = ghcc.clone(repo_info.repo_owner, repo_info.repo_name, clone_folder=str(repo_dir), folder_name="src")
        # A submodule failure is tolerated; any other error aborts processing of this repo.
        if ret.error_type not in [None, ghcc.CloneErrorType.SubmodulesFailed]:
            flutes.log(
                f"Failed to clone {repo_full_name}: error type {ret.error_type}", "error")
            # Return a dummy result so this repo is ignored in the future.
            # NOTE(review): `repo_dir` is not removed on this path.
            return Result(repo_info.repo_owner, repo_info.repo_name, [], {}, 0, 0, 0)

    # Write makefile info to pickle
    with (repo_binary_dir / "makefiles.pkl").open("wb") as f_pkl:
        pickle.dump(repo_info.makefiles, f_pkl)

    gcc_flags = "-E"
    directory_mapping = None
    if use_fake_libc_headers:
        gcc_flags = f"-E -nostdlib -I/usr/src/libc"
        directory_mapping = {ghcc.parse.FAKE_LIBC_PATH: "/usr/src/libc"}

    if progress_bar is not None:
        progress_bar.update(postfix={"status": "preprocessing"})
    makefiles = ghcc.docker_batch_compile(
        str(repo_binary_dir), str(repo_src_path), compile_timeout=preprocess_timeout,
        gcc_override_flags=gcc_flags, use_makefile_info_pkl=True, directory_mapping=directory_mapping,
        user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
        exception_log_fn=functools.partial(exception_handler, repo_info=repo_info))

    parser = CParser(lexer=ghcc.parse.CachedCLexer)
    lexer = ghcc.parse.LexerWrapper()
    decompile_path = Path(decompile_folder)
    extractor = ghcc.parse.FunctionExtractor()
    matched_functions: List[MatchedFunction] = []
    preprocessed_original_code: Dict[str, str] = {}  # (sha) -> preprocessed original code
    files_found = 0
    functions_found = 0
    for makefile in makefiles:
        mkfile_dir = Path(makefile['directory'])
        for path, sha in zip(makefile["binaries"], makefile["sha256"]):
            # Load and parse preprocessed original code.
            code_path = str(mkfile_dir / path)
            json_path = decompile_path / (sha + ".jsonl")
            preprocessed_code_path = repo_binary_dir / sha
            if progress_bar is not None:
                progress_bar.update(1, postfix={"file": code_path})
            # Both the decompiled JSONL and the preprocessed code are required.
            if not json_path.exists() or not preprocessed_code_path.exists():
                continue
            try:
                with preprocessed_code_path.open("r") as f:
                    code = f.read()
                code = LINE_CONTROL_REGEX.sub("", code)
            except UnicodeDecodeError:
                continue  # probably a real binary file
            preprocessed_original_code[sha] = code
            try:
                original_ast: ASTNode = parser.parse(code, filename=os.path.join(
                    repo_full_name, path))
            except (pycparser.c_parser.ParseError, AssertionError) as e:
                # For some reason `pycparser` uses `assert`s in places where there should have been a check.
                flutes.log(
                    f"{repo_full_name}: Parser error when processing file "
                    f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True
                continue  # ignore parsing errors
            # Must be read right after parsing: `parser.clex.cached_tokens` is overwritten by
            # the later parse of the decompiled code.
            original_tokens = ghcc.parse.convert_to_tokens(
                code, parser.clex.cached_tokens)
            files_found += 1
            function_asts = extractor.find_functions(original_ast)
            functions_found += len(function_asts)

            # Collect decompiled functions with matching original code.
            with json_path.open("r") as f:
                decompiled_json = [
                    line for line in f if line
                ]  # don't decode, as we only need the function name
            decompiled_funcs: Dict[str, str] = {}  # (func_name) -> decompiled_code
            decompiled_var_names: Dict[str, Dict[str, Tuple[str, str]]] = {} \
                # (func_name) -> (var_id) -> (decomp_name, orig_name)
            for line_num, j in enumerate(decompiled_json):
                # Find function name from JSON line without parsing.
                match = JSON_FUNC_NAME_REGEX.search(j)
                assert match is not None
                func_name = match.group(1)
                # Only functions that also exist in the original code are of interest.
                if func_name not in function_asts:
                    continue

                try:
                    decompiled_data = json.loads(j)
                except json.JSONDecodeError as e:
                    flutes.log(
                        f"{repo_full_name}: Decode error when reading JSON file at {json_path}: "
                        f"{str(e)}", "error")
                    continue
                decompiled_code = decompiled_data["raw_code"]
                # Store the variable names used in the function.
                # We use a random string as the identifier prefix. Sadly, C89 (and `pycparser`) doesn't support Unicode.
                # Try prefixes of increasing length until one does not occur in the decompiled code.
                for length in range(3, 10 + 1):
                    var_identifier_prefix = "v" + "".join(
                        random.choices(string.ascii_lowercase, k=length))
                    if var_identifier_prefix not in decompiled_code:
                        break
                else:
                    # No way this is happening, right?
                    flutes.log(
                        f"{repo_full_name}: Could not find valid identifier prefix for "
                        f"{func_name} in {code_path} ({sha})", "error")
                    continue  # skips this JSON line (the `for`/`else` runs only when no `break` occurred)
                variables: Dict[str, Tuple[str, str]] = {
                }  # (var_id) -> (decompiled_name, original_name)
                for match in DECOMPILED_VAR_REGEX.finditer(decompiled_code):
                    var_id, decompiled_name, original_name = match.groups()
                    var_id = f"{var_identifier_prefix}_{var_id}"
                    if var_id in variables:
                        # Repeated occurrences of the same variable must agree on both names.
                        assert variables[var_id] == (decompiled_name, original_name)
                    else:
                        variables[var_id] = (decompiled_name, original_name)
                decompiled_var_names[func_name] = variables
                # Remove irregularities in decompiled code to make it parsable:
                # - Replace `@@VAR` with special identifiers (literally any identifier that doesn't clash).
                # - Remove the register allocation indication in `var@<rdi>`.
                decompiled_code = DECOMPILED_VAR_REGEX.sub(
                    rf"{var_identifier_prefix}_\1", decompiled_code)
                decompiled_code = DECOMPILED_REG_ALLOC_REGEX.sub(
                    "", decompiled_code)
                if func_name.startswith("_"):
                    # For some reason, Hexrays would chomp off one leading underscore from function names in their
                    # generated code, which might lead to corrupt code (`_01inverse` -> `01inverse`). Here we
                    # heuristically try to find and replace the changed function name.
                    decompiled_code = re.sub(  # replace all identifiers with matching name
                        r"(?<![a-zA-Z0-9_])" + func_name[1:] + r"(?![a-zA-Z0-9_])",
                        func_name, decompiled_code)
                    # Note that this doesn't fix references of the function in other functions. But really, why would
                    # someone name their function `_01inverse`?
                decompiled_funcs[func_name] = decompiled_code

            # Generate code replacing original functions with decompiled functions.
            replacer = ghcc.parse.FunctionReplacer(decompiled_funcs)
            replaced_code = replacer.visit(original_ast)

            # Obtain AST for decompiled code by parsing it again.
            code_to_preprocess = DECOMPILED_CODE_HEADER + "\n" + replaced_code
            try:
                code_to_parse = ghcc.parse.preprocess(code_to_preprocess)
            except ghcc.parse.PreprocessError as e:
                msg = (
                    f"{repo_full_name}: GCC return value nonzero for decompiled code of "
                    f"{code_path} ({sha})")
                if len(e.args) > 0:
                    msg += ":\n" + str(e)
                flutes.log(msg, "error")
                has_error = True
                continue

            try:
                decompiled_ast, code_to_parse = ghcc.parse.parse_decompiled_code(
                    code_to_parse, lexer, parser)
                decompiled_tokens = ghcc.parse.convert_to_tokens(
                    code_to_parse, parser.clex.cached_tokens)
            except (ValueError, pycparser.c_parser.ParseError) as e:
                flutes.log(
                    f"{repo_full_name}: Could not parse decompiled code for "
                    f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True

                # We don't have ASTs for decompiled functions, but we can still dump the code.
                # Use the dummy typedefs to extract functions.
                code_lines = code_to_parse.split("\n")
                # (func_name) -> [begin line index, end line index]; `None` until the marker is seen.
                func_begin_end: Dict[str, List[Optional[int]]] = defaultdict(
                    lambda: [None, None])
                for idx, line in enumerate(code_lines):
                    name, is_begin = replacer.extract_func_name(line)
                    if name is not None:
                        func_begin_end[name][0 if is_begin else 1] = idx
                for func_name, (begin, end) in func_begin_end.items():
                    if begin is not None and end is not None and func_name in function_asts:
                        # Lex only the lines strictly between the begin and end markers.
                        decompiled_func_tokens = lexer.lex("\n".join(
                            code_lines[(begin + 1):end]))
                        original_func_ast = function_asts[func_name]
                        original_ast_json, original_func_tokens = serialize(
                            original_func_ast, original_tokens)
                        matched_func = MatchedFunction(
                            file_path=code_path, binary_hash=sha, func_name=func_name,
                            variable_names=decompiled_var_names[func_name],
                            original_tokens=original_func_tokens, decompiled_tokens=decompiled_func_tokens,
                            original_ast_json=original_ast_json, decompiled_ast_json=None)
                        matched_functions.append(matched_func)
            else:
                # We've successfully parsed decompiled code.
                decompiled_func_asts = extractor.find_functions(decompiled_ast)
                for func_name in decompiled_funcs.keys():
                    original_func_ast = function_asts[func_name]
                    if func_name not in decompiled_func_asts:
                        # Maybe there's other Hexrays-renamed functions that we didn't fix, just ignore them.
                        continue
                    decompiled_func_ast = decompiled_func_asts[func_name]
                    original_ast_json, original_func_tokens = serialize(
                        original_func_ast, original_tokens)
                    decompiled_ast_json, decompiled_func_tokens = serialize(
                        decompiled_func_ast, decompiled_tokens)
                    matched_func = MatchedFunction(
                        file_path=code_path, binary_hash=sha, func_name=func_name,
                        variable_names=decompiled_var_names[func_name],
                        original_tokens=original_func_tokens, decompiled_tokens=decompiled_func_tokens,
                        original_ast_json=original_ast_json, decompiled_ast_json=decompiled_ast_json)
                    matched_functions.append(matched_func)

    # Cleanup the folders; if errors occurred, keep the preprocessed code.
    # NOTE(review): the `rmtree` below is unconditional, so nothing is actually kept on error here --
    # confirm whether the comment above or the code reflects the intent.
    # Status: "success" = no errors and at least one match; "error" = errors and no match;
    # "warning" otherwise.
    status = ("success" if not has_error and len(matched_functions) > 0 else (
        "warning" if not has_error or len(matched_functions) > 0 else "error"))
    shutil.rmtree(repo_dir)

    end_time = time.time()
    # Matches produced by the fallback path (decompiled code could not be parsed) carry no AST.
    funcs_without_asts = sum(matched_func.decompiled_ast_json is None for matched_func in matched_functions)
    flutes.log(
        f"[{end_time - start_time:6.2f}s] "
        f"{repo_full_name}: "
        f"Files found: {files_found}/{total_files}, "
        f"functions matched: {len(matched_functions)}/{functions_found} "
        f"({funcs_without_asts} w/o ASTs)", status, force_console=True)
    return Result(repo_owner=repo_info.repo_owner, repo_name=repo_info.repo_name,
                  matched_functions=matched_functions,
                  preprocessed_original_code=preprocessed_original_code,
                  files_found=files_found, functions_found=functions_found,
                  funcs_without_asts=funcs_without_asts)
def clone_and_compile(
        repo_info: RepoInfo,
        clone_folder: str,
        binary_folder: str,
        archive_folder: str,
        recursive_clone: bool = True,
        clone_timeout: Optional[float] = None,
        compile_timeout: Optional[float] = None,
        force_reclone: bool = False,
        force_recompile: bool = False,
        docker_batch_compile: bool = True,
        max_archive_size: Optional[int] = None,
        compression_type: str = "gzip",
        record_libraries: bool = False,
        record_metainfo: bool = False,
        gcc_override_flags: Optional[str] = None) -> PipelineResult:
    r"""Perform the entire pipeline: obtain the repository, find and compile its Makefiles, then
    archive (or remove) the working copy.

    :param repo_info: Information about the repository.
    :param clone_folder: Path to the folder where the repository will be stored. The actual destination folder will be
        ``clone_folder/repo_owner_____repo_name``, e.g., ``clone_folder/torvalds_____linux``.
        This strange notation is used in order to have a flat directory hierarchy, so we're not left with a bunch of
        empty folders for repository owners.
    :param binary_folder: Path to the folder where compiled binaries will be stored. The actual destination folder will
        be ``binary_folder/repo_owner/repo_name``, e.g., ``binary_folder/torvalds/linux``.
    :param archive_folder: Path to the folder where archived repositories will be stored. The actual archive file will
        be ``archive_folder/repo_owner/repo_name`` followed by ``.tar.gz`` or ``.tar.xz`` depending on
        :attr:`compression_type`, e.g., ``archive_folder/torvalds/linux.tar.gz``.
    :param recursive_clone: If ``True``, uses ``--recursive`` when cloning.
    :param clone_timeout: Timeout for cloning, or `None` (default) for unlimited time. The same timeout is applied
        to extracting an existing archive and to compressing the repository.
    :param compile_timeout: Timeout for compilation, or `None` (default) for unlimited time.
    :param force_reclone: If ``True``, always clone a fresh copy for compilation. If ``False``, only clone when there
        are no matching archives.
    :param force_recompile: If ``True``, the repository is compiled regardless of the value in DB.
    :param docker_batch_compile: If ``True``, compile all Makefiles within a repository in a single Docker container.
    :param max_archive_size: If specified, only archive repositories whose size is not larger than the given value (in
        bytes).
    :param compression_type: The file type of the archive to produce. Valid values are ``"gzip"`` (faster) and
        ``"xz"`` (smaller).
    :param record_libraries: If ``True``, record the libraries used in compilation.
    :param record_metainfo: If ``True``, record meta-info values.
    :param gcc_override_flags: If not ``None``, these flags will be appended to each invocation of GCC.
    :raises ValueError: If :attr:`compression_type` is neither ``"gzip"`` nor ``"xz"``.
    :return: A :class:`PipelineResult`. When nothing was done or an unrecoverable error occurred, a "dummy" result
        holding only :attr:`repo_info` is returned.
    """
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_path = os.path.join(clone_folder, repo_folder_name)
    if compression_type == "xz":
        archive_extension = ".tar.xz"
        tar_type_flag = "J"
    elif compression_type == "gzip":
        archive_extension = ".tar.gz"
        tar_type_flag = "z"
    else:
        raise ValueError(f"Invalid compression type '{compression_type}'")
    archive_path = os.path.abspath(
        os.path.join(archive_folder, f"{repo_full_name}{archive_extension}"))

    repo_entry = repo_info.db_result
    clone_success = None

    # Skip repos that are fully processed
    if (repo_entry is not None and
            (repo_entry["clone_successful"] and not force_reclone) and
            (repo_entry["compiled"] and not force_recompile)):
        return PipelineResult(repo_info)

    # Stage 1: Cloning from GitHub.
    if not force_reclone and os.path.exists(archive_path):
        # Extract the archive instead of cloning.
        try:
            flutes.run_command(["tar", f"x{tar_type_flag}f", archive_path],
                               timeout=clone_timeout, cwd=clone_folder)
            flutes.log(f"{repo_full_name} extracted from archive", "success")
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
            flutes.log(
                f"Unknown error when extracting {repo_full_name}. Captured output: '{e.output}'", "error")
            # The extraction may have failed before creating the folder, so a missing
            # `repo_path` must not turn this handled failure into a crash.
            shutil.rmtree(repo_path, ignore_errors=True)
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)
    elif (repo_entry is None or  # not processed
          force_reclone or
          (repo_entry["clone_successful"] and  # not compiled
           (not repo_entry["compiled"] or force_recompile) and not os.path.exists(repo_path))):
        clone_result = ghcc.clone(
            repo_info.repo_owner, repo_info.repo_name, clone_folder=clone_folder, folder_name=repo_folder_name,
            timeout=clone_timeout, skip_if_exists=False, recursive=recursive_clone)
        clone_success = clone_result.success
        if not clone_result.success:
            if clone_result.error_type is CloneErrorType.FolderExists:
                flutes.log(f"{repo_full_name} skipped because folder exists", "warning")
            elif clone_result.error_type is CloneErrorType.PrivateOrNonexistent:
                flutes.log(
                    f"Failed to clone {repo_full_name} because repository is private or nonexistent", "warning")
            else:
                if clone_result.error_type is CloneErrorType.Unknown:
                    msg = f"Failed to clone {repo_full_name} with unknown error"
                else:  # CloneErrorType.Timeout
                    msg = f"Time expired ({clone_timeout}s) when attempting to clone {repo_full_name}"
                if clone_result.captured_output is not None:
                    msg += f". Captured output: '{clone_result.captured_output!r}'"
                flutes.log(msg, "error")

            # Unknown errors yield a dummy result (no `clone_success`); all other
            # failures report the failed clone.
            if clone_result.error_type is CloneErrorType.Unknown:
                return PipelineResult(repo_info)  # return dummy info
            return PipelineResult(repo_info, clone_success=clone_success)
        elif clone_result.error_type is CloneErrorType.SubmodulesFailed:
            # The clone itself succeeded; only the submodules could not be fetched.
            msg = f"Submodules in {repo_full_name} ignored due to error"
            if clone_result.captured_output is not None:
                msg += f". Captured output: '{clone_result.captured_output!r}'"
            flutes.log(msg, "warning")

        repo_size = flutes.get_folder_size(repo_path)
        flutes.log(
            f"{repo_full_name} successfully cloned ({clone_result.time:.2f}s, "
            f"{flutes.readable_size(repo_size)})", "success")
    else:
        # The DB entry exists and neither extraction nor cloning applies: use the folder as-is.
        if not repo_entry["clone_successful"]:
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)

    makefiles = None
    libraries = None
    meta_info: Optional[PipelineMetaInfo] = None
    if not repo_entry or not repo_entry["compiled"] or force_recompile:
        # # SPECIAL CHECK: Do not attempt to compile OS kernels!
        # kernel_name = None
        # if contains_in_file(os.path.join(repo_path, "README"), "Linux kernel release"):
        #     kernel_name = "Linux"
        # elif contains_in_file(os.path.join(repo_path, "README"), "FreeBSD source directory"):
        #     kernel_name = "FreeBSD"
        # if kernel_name is not None:
        #     shutil.rmtree(repo_path)
        #     ghcc.log(f"Found {kernel_name} kernel in {repo_full_name}, will not attempt to compile. "
        #              f"Repository deleted", "warning")
        #     return PipelineResult(repo_info, clone_success=clone_success, makefiles=[])

        # Stage 2: Finding Makefiles.
        makefile_dirs = ghcc.find_makefiles(repo_path)
        if len(makefile_dirs) == 0:
            # Repo has no Makefiles, delete.
            shutil.rmtree(repo_path)
            flutes.log(
                f"No Makefiles found in {repo_full_name}, repository deleted", "warning")
            return PipelineResult(repo_info, clone_success=clone_success, makefiles=[])

        # Stage 3: Compile each Makefile.
        repo_binary_dir = os.path.join(binary_folder, repo_full_name)
        # `exist_ok=True` avoids the check-then-create race when several workers run at once.
        os.makedirs(repo_binary_dir, exist_ok=True)
        flutes.log(f"Starting compilation for {repo_full_name}...")

        if docker_batch_compile:
            makefiles = ghcc.docker_batch_compile(
                repo_binary_dir, repo_path, compile_timeout, record_libraries, gcc_override_flags,
                user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
                exception_log_fn=functools.partial(exception_handler, repo_info=repo_info))
        else:
            makefiles = list(
                ghcc.compile_and_move(repo_binary_dir, repo_path, makefile_dirs,
                                      compile_timeout, record_libraries, gcc_override_flags))
        num_succeeded = sum(makefile["success"] for makefile in makefiles)
        if record_libraries:
            # Library names are read (whitespace-separated, de-duplicated) from `libraries.txt`
            # in the binary folder, if that file exists.
            library_log_path = os.path.join(repo_binary_dir, "libraries.txt")
            if os.path.exists(library_log_path):
                with open(library_log_path) as f:
                    libraries = list(set(f.read().split()))
            else:
                libraries = []
        num_binaries = sum(len(makefile["binaries"]) for makefile in makefiles)

        msg = f"{num_succeeded} ({len(makefiles)}) out of {len(makefile_dirs)} Makefile(s) " \
              f"in {repo_full_name} compiled (partially), yielding {num_binaries} binaries"
        flutes.log(
            msg, "success" if num_succeeded == len(makefile_dirs) else "warning")

        if record_metainfo:
            meta_info = PipelineMetaInfo({
                "num_makefiles": len(makefile_dirs),
                "has_gitmodules": os.path.exists(os.path.join(repo_path, ".gitmodules")),
                "makefiles_using_automake": sum(
                    ghcc.contains_files(directory, ["configure.ac", "configure.in"])
                    for directory in makefile_dirs)
            })

    # Stage 4: Clean and zip repo.
    if max_archive_size is not None and repo_size > max_archive_size:
        shutil.rmtree(repo_path)
        flutes.log(
            f"Removed {repo_full_name} because repository size ({flutes.readable_size(repo_size)}) "
            f"exceeds limits", "info")
    else:
        # Repository is already cleaned in the compile stage.
        os.makedirs(os.path.split(archive_path)[0], exist_ok=True)
        compress_success = False
        try:
            flutes.run_command([
                "tar", f"c{tar_type_flag}f", archive_path, repo_folder_name
            ], timeout=clone_timeout, cwd=clone_folder)
            compress_success = True
        except subprocess.TimeoutExpired:
            flutes.log(
                f"Compression timeout for {repo_full_name}, giving up", "error")
        except subprocess.CalledProcessError as e:
            flutes.log(
                f"Unknown error when compressing {repo_full_name}. Captured output: '{e.output}'", "error")
        # The working copy is removed whether or not compression succeeded.
        shutil.rmtree(repo_path)
        if compress_success:
            flutes.log(f"Compressed {repo_full_name}, folder removed", "info")
        elif os.path.exists(archive_path):
            # Do not leave a partial archive behind after a failed compression.
            os.remove(archive_path)

    return PipelineResult(repo_info, clone_success=clone_success, repo_size=repo_size,
                          makefiles=makefiles, libraries=libraries, meta_info=meta_info)