def _download(urls: List[str], sha256: Optional[str], max_retries: int) -> bytes:
    if not urls:
        raise ValueError("No URLs to download")

    # Cache hit.
    if sha256 and cache_path(f"downloads/{sha256}").is_file():
        with open(str(cache_path(f"downloads/{sha256}")), "rb") as f:
            return f.read()

    # A retry loop, and loop over all URLs provided.
    last_exception = None
    wait_time = 10
    for _ in range(max(max_retries, 1)):
        for url in urls:
            try:
                return _do_download_attempt(url, sha256)
            except TooManyRequests as e:
                last_exception = e
                logger.info(
                    "Download attempt failed with Too Many Requests error. "
                    "Waiting %.1f seconds",
                    wait_time,
                )
                sleep(wait_time)
                wait_time *= 1.5
            except DownloadFailed as e:
                logger.info("Download attempt failed: %s", truncate(e))
                last_exception = e
    raise last_exception
def download_and_unpack_database(db: str, sha256: str) -> Path:
    """Download the given database, unpack it to the local filesystem, and
    return the path.
    """
    local_dir = cache_path(f"state_transition_dataset/{sha256}")
    with _DB_DOWNLOAD_LOCK, InterProcessLock(
        transient_cache_path(".state_transition_database_download.LOCK")
    ):
        if not (local_dir / ".installed").is_file():
            tar_data = io.BytesIO(download(db, sha256))

            local_dir.mkdir(parents=True, exist_ok=True)

            logger.info("Unpacking database to %s ...", local_dir)
            with tarfile.open(fileobj=tar_data, mode="r:bz2") as arc:
                arc.extractall(str(local_dir))

            (local_dir / ".installed").touch()

    unpacked = [f for f in local_dir.iterdir() if f.name != ".installed"]
    if len(unpacked) != 1:
        print(
            f"fatal: Archive {db} expected to contain one file, contains: {len(unpacked)}",
            file=sys.stderr,
        )
    return unpacked[0]
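# A minimal usage sketch for the helper above. The URL and checksum are
# hypothetical placeholders; any bzip2 tarball published with a known sha256
# would be handled the same way.
def _example_fetch_database() -> Path:  # hypothetical helper, illustration only
    example_url = "https://example.com/state_transition_dataset.tar.bz2"  # placeholder
    example_sha256 = "0" * 64  # placeholder checksum
    return download_and_unpack_database(example_url, example_sha256)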
def test_download_cache_hit(mocker):
    """Check that download is not repeated on cache hit."""
    data = b"Hello, world"
    data_checksum = "4ae7c3b6ac0beff671efa8cf57386151c06e58ca53a78d83f36107316cec125f"
    cached_path = cache_path(f"downloads/{data_checksum}")

    # Tidy up from a previous test, if applicable.
    if cached_path.is_file():
        cached_path.unlink()

    def patched_download(*args):
        return data

    mocker.patch.object(download, "_get_url_data", patched_download)
    mocker.spy(download, "_get_url_data")

    assert (
        download.download(
            "example",
            sha256="4ae7c3b6ac0beff671efa8cf57386151c06e58ca53a78d83f36107316cec125f",
        )
        == data
    )
    download._get_url_data.assert_called_once_with("example")
    assert cached_path.is_file()

    # Cache hit.
    assert (
        download.download(
            "example",
            sha256="4ae7c3b6ac0beff671efa8cf57386151c06e58ca53a78d83f36107316cec125f",
        )
        == data
    )
    assert download._get_url_data.call_count == 1
def download_llvm_files() -> Path:
    """Download and unpack the LLVM data pack."""
    global _LLVM_UNPACKED_LOCATION

    unpacked_location = site_data_path("llvm-v0")
    # Fast path for repeated calls.
    if _LLVM_UNPACKED_LOCATION == unpacked_location:
        return unpacked_location

    with _LLVM_DOWNLOAD_LOCK:
        # Fast path for first call. This check will be repeated inside the
        # locked region if required.
        if (unpacked_location / ".unpacked").is_file():
            _LLVM_UNPACKED_LOCATION = unpacked_location
            return unpacked_location

        with InterProcessLock(cache_path(".llvm-v0-install.LOCK")):
            # Now that the lock is acquired, repeat the check to see if it is
            # necessary to download the dataset.
            if (unpacked_location / ".unpacked").is_file():
                return unpacked_location

            _download_llvm_files(unpacked_location)

            # Create the marker file to indicate that the directory is
            # unpacked and ready to go.
            (unpacked_location / ".unpacked").touch()

        _LLVM_UNPACKED_LOCATION = unpacked_location

    return unpacked_location
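# A sketch of how the data pack is typically consumed: resolve the unpacked
# location once, then build paths relative to it. The call is idempotent and
# cheap after the first invocation. The "bin/<tool>" subpath below is an
# assumption for illustration; the actual layout depends on the data pack.
def _example_llvm_tool_path(tool: str = "clang") -> Path:  # hypothetical helper
    unpacked = download_llvm_files()
    return unpacked / "bin" / tool  # assumed layout, not guaranteed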
def make_working_dir():
    """Make a working directory for a service. The calling code is responsible
    for removing this directory when done.
    """
    service_directory = cache_path("service")
    timestamp = datetime.now().isoformat()
    random_hash = random.getrandbits(32)
    working_dir = Path(service_directory / f"{timestamp}-{random_hash:08x}")
    (working_dir / "logs").mkdir(parents=True, exist_ok=False)
    return working_dir
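import shutil  # stdlib; needed only for the illustrative cleanup sketch below


# Since make_working_dir() leaves removal to the caller, wrapping the directory
# in a try/finally is the usual pattern. This is a sketch only, not part of the
# service API; the log file name is arbitrary.
def _example_with_working_dir():
    working_dir = make_working_dir()
    try:
        (working_dir / "logs" / "example.log").write_text("hello\n")
    finally:
        shutil.rmtree(working_dir, ignore_errors=True)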
def get_storage_paths() -> List[Path]:
    """Return the list of paths used by CompilerGym for filesystem storage.

    :return: A list of filesystem paths that CompilerGym uses to store files.
    """
    return sorted(
        {
            runfiles_path.cache_path("."),
            runfiles_path.transient_cache_path("."),
            runfiles_path.site_data_path("."),
        }
    )
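# One way a caller might use get_storage_paths(): summing the size of everything
# CompilerGym has written to disk. A sketch only; races with files being deleted
# mid-walk are not handled.
def _example_total_storage_bytes() -> int:  # hypothetical helper
    total = 0
    for root in get_storage_paths():
        for f in root.rglob("*"):
            if f.is_file():
                total += f.stat().st_size
    return total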
def download(url: str, sha256: Optional[str] = None) -> bytes:
    """Download a file and return its contents.

    If :code:`sha256` is provided and the download succeeds, the file contents
    are cached locally in :code:`$cache_path/downloads/$sha256`. See
    :func:`compiler_gym.cache_path`.

    An inter-process lock ensures that only a single call to this function may
    execute at a time.

    :param url: The URL of the file to download.
    :param sha256: The expected sha256 checksum of the file.
    :return: The contents of the downloaded file.
    :raises OSError: If the download fails, or if the downloaded content does
        not match the expected :code:`sha256` checksum.
    """
    # Cache hit.
    if sha256 and cache_path(f"downloads/{sha256}").is_file():
        with open(str(cache_path(f"downloads/{sha256}")), "rb") as f:
            return f.read()

    logging.info(f"Downloading {url} ...")
    content = _download(url)
    if sha256:
        # Validate the checksum.
        checksum = hashlib.sha256()
        checksum.update(content)
        actual_sha256 = checksum.hexdigest()
        if sha256 != actual_sha256:
            raise OSError(
                f"Checksum of downloaded dataset does not match:\n"
                f"Url: {url}\n"
                f"Expected: {sha256}\n"
                f"Actual: {actual_sha256}"
            )

        # Cache the downloaded file.
        cache_path("downloads").mkdir(parents=True, exist_ok=True)
        with open(str(cache_path(f"downloads/{sha256}")), "wb") as f:
            f.write(content)

    logging.info(f"Downloaded {url}")
    return content
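# A hedged usage sketch: downloading a file with checksum verification. The URL
# and checksum below are placeholders; with a real pair, a second call would be
# served from $cache_path/downloads/<sha256> without touching the network.
def _example_cached_download() -> bytes:  # hypothetical, illustration only
    url = "https://example.com/data.tar.bz2"  # placeholder
    sha256 = "0" * 64  # placeholder checksum
    return download(url, sha256=sha256)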
def __init__(self):
    self.path = _create_timestamped_unique_service_dir(transient_cache_path("."))
    (self.path / "logs").mkdir()

    self._directories_to_remove = [self.path]

    if is_in_memory(self.path):
        disk = _create_timestamped_unique_service_dir(cache_path("."))
        self._directories_to_remove.append(disk)
        os.symlink(disk, self.path / "disk")
    else:
        (self.path / "disk").mkdir()
def download(
    urls: Union[str, List[str]], sha256: Optional[str] = None, max_retries: int = 5
) -> bytes:
    """Download a file and return its contents.

    If :code:`sha256` is provided and the download succeeds, the file contents
    are cached locally in :code:`$cache_path/downloads/$sha256`. See
    :func:`compiler_gym.cache_path`.

    An inter-process lock ensures that only a single call to this function may
    execute at a time.

    :param urls: Either a single URL of the file to download, or a list of URLs
        to download.
    :param sha256: The expected sha256 checksum of the file.
    :param max_retries: The maximum number of passes to make over the list of
        URLs before giving up.
    :return: The contents of the downloaded file.
    :raises IOError: If the download fails, or if the downloaded content does
        not match the expected :code:`sha256` checksum.
    """
    # Convert a singular string into a list of strings.
    urls = [urls] if not isinstance(urls, list) else urls

    # Only a single process may download a file at a time. The idea here is to
    # prevent redundant downloads when multiple simultaneous processes all try
    # and download the same resource. If we don't have an ID for the resource
    # then we just lock globally to reduce NIC thrashing.
    if sha256:
        with fasteners.InterProcessLock(cache_path(f"downloads/.{sha256}.lock")):
            return _download(urls, sha256, max_retries)
    else:
        with fasteners.InterProcessLock(cache_path("downloads/.lock")):
            return _download(urls, None, max_retries)
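# Usage sketch for the mirror-list form: the URLs are tried in order on each
# retry pass, so a flaky primary can fall back to a mirror. The URLs and
# checksum are placeholders, not real resources.
def _example_download_with_mirrors() -> bytes:  # hypothetical, illustration only
    mirrors = [
        "https://example.com/dataset.tar.bz2",  # placeholder primary
        "https://mirror.example.org/dataset.tar.bz2",  # placeholder mirror
    ]
    return download(mirrors, sha256="0" * 64, max_retries=3)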
def _do_download_attempt(url: str, sha256: Optional[str]) -> bytes:
    logger.info("Downloading %s ...", url)
    content = _get_url_data(url)
    if sha256:
        # Validate the checksum.
        checksum = hashlib.sha256()
        checksum.update(content)
        actual_sha256 = checksum.hexdigest()
        if sha256 != actual_sha256:
            raise DownloadFailed(
                f"Checksum of download does not match:\n"
                f"Url: {url}\n"
                f"Expected: {sha256}\n"
                f"Actual: {actual_sha256}"
            )

        # Cache the downloaded file.
        path = cache_path(f"downloads/{sha256}")
        path.parent.mkdir(parents=True, exist_ok=True)
        with atomic_file_write(path, fileobj=True) as f:
            f.write(content)

    logger.debug(f"Downloaded {url}")
    return content
def _download(url: str) -> bytes:
    req = requests.get(url)
    try:
        if req.status_code != 200:
            raise OSError(f"GET returned status code {req.status_code}: {url}")
        return req.content
    finally:
        req.close()


# Only a single process may download at a time. The idea here is to prevent
# overloading the NIC when, for example, you launch a bunch of simultaneous
# learning processes which all require the same dataset.
@fasteners.interprocess_locked(cache_path("downloads/LOCK"))
def download(url: str, sha256: Optional[str] = None) -> bytes:
    """Download a file and return its contents.

    If :code:`sha256` is provided and the download succeeds, the file contents
    are cached locally in :code:`$cache_path/downloads/$sha256`. See
    :func:`compiler_gym.cache_path`.

    An inter-process lock ensures that only a single call to this function may
    execute at a time.

    :param url: The URL of the file to download.
    :param sha256: The expected sha256 checksum of the file.
    :return: The contents of the downloaded file.
    :raises OSError: If the download fails, or if the downloaded content does
        not match the expected :code:`sha256` checksum.
    """
def make_benchmark(
    inputs: Union[str, Path, ClangInvocation, List[Union[str, Path, ClangInvocation]]],
    copt: Optional[List[str]] = None,
    system_includes: bool = True,
    timeout: int = 600,
) -> Benchmark:
    """Create a benchmark for use by LLVM environments.

    This function takes one or more inputs and uses them to create a benchmark
    that can be passed to :meth:`compiler_gym.envs.LlvmEnv.reset`.

    For single-source C/C++ programs, you can pass the path of the source file:

        >>> benchmark = make_benchmark('my_app.c')
        >>> env = gym.make("llvm-v0")
        >>> env.reset(benchmark=benchmark)

    The clang invocation used is roughly equivalent to:

    .. code-block::

        $ clang my_app.c -O0 -c -emit-llvm -o benchmark.bc

    Additional compile-time arguments to clang can be provided using the
    :code:`copt` argument:

        >>> benchmark = make_benchmark('/path/to/my_app.cpp', copt=['-O2'])

    If you need more fine-grained control over the options, you can directly
    construct a :class:`ClangInvocation <compiler_gym.envs.llvm.ClangInvocation>`
    to pass a list of arguments to clang:

        >>> benchmark = make_benchmark(
                ClangInvocation(['/path/to/my_app.c'], timeout=10)
            )

    For multi-file programs, pass a list of inputs that will be compiled
    separately and then linked to a single module:

        >>> benchmark = make_benchmark([
                'main.c',
                'lib.cpp',
                'lib2.bc',
            ])

    If you already have prepared bitcode files, those can be linked and used
    directly:

        >>> benchmark = make_benchmark([
                'bitcode1.bc',
                'bitcode2.bc',
            ])

    .. note::
        LLVM bitcode compatibility is
        `not guaranteed <https://llvm.org/docs/DeveloperPolicy.html#ir-backwards-compatibility>`_,
        so you must ensure that any precompiled bitcodes are compatible with
        the LLVM version used by CompilerGym, which can be queried using
        :func:`LlvmEnv.compiler_version <compiler_gym.envs.CompilerEnv.compiler_version>`.

    :param inputs: An input, or list of inputs.
    :param copt: A list of command line options to pass to clang when compiling
        source files.
    :param system_includes: Whether to include the system standard libraries
        during compilation jobs. This requires a system toolchain. See
        :func:`get_system_includes`.
    :param timeout: The maximum number of seconds to allow clang to run before
        terminating.
    :return: A :code:`Benchmark` message.
    :raises FileNotFoundError: If any input sources are not found.
    :raises TypeError: If the inputs are of unsupported types.
    :raises OSError: If a compilation job fails.
    :raises TimeoutExpired: If a compilation job exceeds :code:`timeout`
        seconds.
    """
    copt = copt or []

    bitcodes: List[Path] = []
    clang_jobs: List[ClangInvocation] = []

    def _add_path(path: Path):
        # NOTE(cummins): There is some discussion about the best way to create
        # a bitcode that is unoptimized yet does not hinder downstream
        # optimization opportunities.
        # Here we are using a configuration based on -O0, yet there is a
        # suggestion that an optimized configuration can produce better results
        # if the optimizations themselves are explicitly disabled, as in:
        # ["-Oz", "-Xclang", "-disable-llvm-optzns"]
        # See: https://lists.llvm.org/pipermail/llvm-dev/2018-August/thread.html#125365
        DEFAULT_COPT = [
            "-O",
            "-Xclang",
            "-disable-O0-optnone",
            "-Xclang",
            "-disable-llvm-passes",
        ]

        if not path.is_file():
            raise FileNotFoundError(path)

        if path.suffix == ".bc":
            bitcodes.append(path)
        elif path.suffix in {".c", ".cxx", ".cpp", ".cc"}:
            clang_jobs.append(
                ClangInvocation(
                    [str(path)] + DEFAULT_COPT + copt,
                    system_includes=system_includes,
                    timeout=timeout,
                )
            )
        else:
            raise ValueError(f"Unrecognized file type: {path.name}")

    # Determine from inputs the list of pre-compiled bitcodes and the clang
    # invocations required to compile the bitcodes.
    if isinstance(inputs, str) or isinstance(inputs, Path):
        _add_path(Path(inputs))
    elif isinstance(inputs, ClangInvocation):
        clang_jobs.append(inputs)
    else:
        for input in inputs:
            if isinstance(input, str) or isinstance(input, Path):
                _add_path(Path(input))
            elif isinstance(input, ClangInvocation):
                clang_jobs.append(input)
            else:
                raise TypeError(f"Invalid input type: {type(input).__name__}")

    if not bitcodes and not clang_jobs:
        raise ValueError("No inputs")

    # Shortcut if we only have a single pre-compiled bitcode.
    if len(bitcodes) == 1 and not clang_jobs:
        bitcode = bitcodes[0]
        return Benchmark(
            uri=f"file:///{bitcode}", program=File(uri=f"file:///{bitcode}")
        )

    with tempfile.TemporaryDirectory(dir=cache_path(".")) as d:
        working_dir = Path(d)

        # Run the clang invocations in parallel.
        clang_outs = [
            working_dir / f"out-{i}.bc" for i in range(1, len(clang_jobs) + 1)
        ]
        clang_cmds = [
            (job.command(out), job.timeout) for job, out in zip(clang_jobs, clang_outs)
        ]
        with multiprocessing.Pool() as pool:
            list(pool.imap_unordered(_run_command, clang_cmds))

        # Check that the expected files were generated.
        for i, b in enumerate(clang_outs):
            if not b.is_file():
                raise OSError(
                    "Clang invocation failed to produce a file: "
                    f"{' '.join(clang_cmds[i][0])}"
                )

        if len(bitcodes + clang_outs) > 1:
            # Link all of the bitcodes into a single module.
            llvm_link_cmd = [str(LLVM_LINK), "-o", "-"] + [
                str(path) for path in bitcodes + clang_outs
            ]
            llvm_link = subprocess.Popen(
                llvm_link_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            bitcode, stderr = _communicate(llvm_link, timeout=timeout)
            if llvm_link.returncode:
                raise OSError(
                    f"Failed to link LLVM bitcodes with error: {stderr.decode('utf-8')}"
                )
        else:
            # We only have a single bitcode so read it.
            with open(str(list(bitcodes + clang_outs)[0]), "rb") as f:
                bitcode = f.read()

    timestamp = datetime.now().strftime(f"%Y%m%dT%H%M%S-{random.randrange(16**4):04x}")
    return Benchmark(
        uri=f"benchmark://user/{timestamp}", program=File(contents=bitcode)
    )
        )
    finally:
        binary.unlink()

    if process.returncode:
        try:
            output = stdout.decode("utf-8")
            msg = f"Benchmark exited with returncode {process.returncode}. Output: {output}"
        except UnicodeDecodeError:
            msg = f"Benchmark exited with returncode {process.returncode}"
        return BenchmarkExecutionResult(walltime_seconds=timer.time, error=msg)

    return BenchmarkExecutionResult(walltime_seconds=timer.time, output=stdout)


@fasteners.interprocess_locked(cache_path("cBench-v0-runtime-data.LOCK"))
def download_cBench_runtime_data() -> bool:
    """Download and unpack the cBench runtime dataset."""
    if _CBENCH_DATA.is_dir():
        return False
    else:
        tar_contents = io.BytesIO(download(_CBENCH_DATA_URL, sha256=_CBENCH_DATA_SHA256))
        with tarfile.open(fileobj=tar_contents, mode="r:bz2") as tar:
            _CBENCH_DATA.parent.mkdir(parents=True)
            tar.extractall(_CBENCH_DATA.parent)
        assert _CBENCH_DATA.is_dir()
        return True


def _make_cBench_validator(
def install(self):
    super().install()
    with _CBENCH_DOWNLOAD_THREAD_LOCK:
        with fasteners.InterProcessLock(cache_path(".cbench-v1-runtime-data.LOCK")):
            download_cBench_runtime_data()
def validator_cb(env: "LlvmEnv") -> Optional[ValidationError]:  # noqa: F821
    """The validation callback."""
    with _CBENCH_DOWNLOAD_THREAD_LOCK:
        with fasteners.InterProcessLock(cache_path(".cbench-v1-runtime-data.LOCK")):
            download_cBench_runtime_data()

    cbench_data = site_data_path("llvm-v0/cbench-v1-runtime-data/runtime_data")
    for input_file_name in input_files:
        path = cbench_data / input_file_name
        if not path.is_file():
            raise FileNotFoundError(f"Required benchmark input not found: {path}")

    # Create a temporary working directory to execute the benchmark in.
    with tempfile.TemporaryDirectory(dir=env.service.connection.working_dir) as d:
        cwd = Path(d)

        # Expand shell variable substitutions in the benchmark command.
        expanded_command = cmd.replace("$D", str(cbench_data))

        # Translate the output file names into paths inside the working
        # directory.
        output_paths = [cwd / o for o in output_files]

        if pre_execution_callback:
            pre_execution_callback(cwd)

        # Produce a gold-standard output using a reference version of the
        # benchmark.
        if compare_output or output_files:
            gs_env = env.fork()
            try:
                # Reset to the original benchmark state and compile it.
                gs_env.reset(benchmark=env.benchmark)
                gs_env.write_bitcode(cwd / "benchmark.bc")
                gold_standard = _compile_and_run_bitcode_file(
                    bitcode_file=cwd / "benchmark.bc",
                    cmd=expanded_command,
                    cwd=cwd,
                    num_runs=1,
                    # Use default optimizations for gold standard.
                    linkopts=linkopts + ["-O2"],
                    # Always assume safe.
                    sanitizer=None,
                    env=os_env,
                )
                if gold_standard.error:
                    return ValidationError(
                        type=f"Gold standard: {gold_standard.error.type}",
                        data=gold_standard.error.data,
                    )
            finally:
                gs_env.close()

            # Check that the reference run produced the expected output files.
            for path in output_paths:
                if not path.is_file():
                    try:
                        output = gold_standard.output
                    except UnicodeDecodeError:
                        output = "<binary>"
                    raise FileNotFoundError(
                        f"Expected file '{path.name}' not generated\n"
                        f"Benchmark: {env.benchmark}\n"
                        f"Command: {cmd}\n"
                        f"Output: {output}"
                    )
                path.rename(f"{path}.gold_standard")

        # Serialize the benchmark to a bitcode file that will then be compiled
        # to a binary.
        env.write_bitcode(cwd / "benchmark.bc")
        outcome = _compile_and_run_bitcode_file(
            bitcode_file=cwd / "benchmark.bc",
            cmd=expanded_command,
            cwd=cwd,
            num_runs=num_runs,
            linkopts=linkopts,
            sanitizer=sanitizer,
            env=os_env,
        )

        if outcome.error:
            return outcome.error

        # Run a user-specified validation hook.
        if validate_result:
            validate_result(outcome)

        # Difftest the console output.
        if compare_output and gold_standard.output != outcome.output:
            return ValidationError(
                type="Wrong output",
                data={"expected": gold_standard.output, "actual": outcome.output},
            )

        # Difftest the output files.
        for path in output_paths:
            if not path.is_file():
                return ValidationError(
                    type="Output not generated",
                    data={"path": path.name, "command": cmd},
                )
            diff = subprocess.Popen(
                ["diff", str(path), f"{path}.gold_standard"],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            stdout, _ = diff.communicate()
            if diff.returncode:
                try:
                    stdout = stdout.decode("utf-8")
                    return ValidationError(
                        type="Wrong output (file)",
                        data={"path": path.name, "diff": stdout},
                    )
                except UnicodeDecodeError:
                    return ValidationError(
                        type="Wrong output (file)",
                        data={"path": path.name, "diff": "<binary>"},
                    )