def __init__(self,
             *,
             name: Optional[str] = None,
             backend: Optional[ServiceBackend] = None,
             image: Optional[str] = None,
             cpus_per_job: Optional[Union[int, str]] = None,
             wait_on_exit: bool = True,
             cleanup_bucket: bool = True,
             project: Optional[str] = None):
    """Configure the executor and the remote directories it stages files in.

    Parameters
    ----------
    name:
        Executor name; when falsy, ``"BatchPoolExecutor-"`` plus a random
        four character suffix is used.
    backend:
        Backend used to run jobs. When falsy a new ``ServiceBackend()`` is
        created.
    image:
        Docker image for jobs. When ``None`` a ``hailgenetics/python-dill``
        image matching the local Python major/minor version is selected.
    cpus_per_job:
        Stored unchanged on ``self.cpus_per_job``.
    wait_on_exit:
        Stored unchanged on ``self.wait_on_exit``.
    cleanup_bucket:
        Stored unchanged on ``self.cleanup_bucket``.
    project:
        Forwarded to the filesystem as ``gcs_kwargs={'project': project}``.

    Raises
    ------
    ValueError
        If `image` is ``None`` and the local interpreter is not Python 3.6,
        3.7 or 3.8, or if `backend` is not a ``ServiceBackend``.
    """
    # Resolve the image *before* acquiring any resource: previously a
    # rejected Python version raised only after ServiceBackend() and the
    # filesystem had been created, leaving both open.
    version = sys.version_info
    if image is None:
        if version.major != 3 or version.minor not in (6, 7, 8):
            raise ValueError(
                f'You must specify an image if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})'
            )
        image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'

    self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
    self.backend = backend or ServiceBackend()
    if not isinstance(self.backend, ServiceBackend):
        raise ValueError(
            f'BatchPoolExecutor is not compatible with {type(backend)}')
    self.batches: List[Batch] = []

    # Everything this executor uploads or downloads lives below
    # <remote_tmpdir>batch-pool-executor/<name>/.
    self.directory = self.backend.remote_tmpdir + f'batch-pool-executor/{self.name}/'
    self.inputs = self.directory + 'inputs/'
    self.outputs = self.directory + 'outputs/'
    self.fs = RouterAsyncFS('file', gcs_kwargs={'project': project})

    self.futures: List[BatchPoolFuture] = []
    self.finished_future_count = 0
    self._shutdown = False

    self.image = image
    self.cpus_per_job = cpus_per_job
    self.cleanup_bucket = cleanup_bucket
    self.wait_on_exit = wait_on_exit
def setUp(self):
    """Prepare a unique remote test path, a 'gs' router filesystem and a client.

    The test path is ``<batch/remote_tmpdir>memory-tests/<uuid4>``.
    """
    tmpdir_root = get_user_config().get('batch', 'remote_tmpdir')
    unique_id = uuid.uuid4()
    self.test_path = f'{tmpdir_root}memory-tests/{unique_id}'

    gcs_fs = aiogoogle.GoogleStorageAsyncFS(project=PROJECT)
    self.fs = RouterAsyncFS('gs', filesystems=[gcs_fs])
    self.client = BlockingMemoryClient(fs=self.fs)

    self.temp_files = set()
async def copy_test_specs():
    """Run every copy test configuration and collect the annotated configs.

    For each configuration yielded by ``copy_test_configurations()`` a fresh
    scratch tree ``/tmp/<token>/{src,dest}/`` is created, ``run_test_spec`` is
    run against it, and its result is stored under ``config['result']``.

    Returns
    -------
    list
        The configurations, each with its ``'result'`` entry filled in.
    """
    test_specs = []

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS(
                'file', filesystems=[LocalAsyncFS(thread_pool)]) as fs:
            for config in copy_test_configurations():
                token = secrets.token_hex(16)

                base = f'/tmp/{token}/'
                src_base = f'{base}src/'
                dest_base = f'{base}dest/'

                await fs.mkdir(base)
                await fs.mkdir(src_base)
                await fs.mkdir(dest_base)
                # make sure dest_base exists
                async with await fs.create(f'{dest_base}keep'):
                    pass

                sema = asyncio.Semaphore(50)
                async with sema:
                    try:
                        result = await run_test_spec(sema, fs, config,
                                                     src_base, dest_base)
                        config['result'] = result
                        test_specs.append(config)
                    finally:
                        # Remove the scratch tree even when the spec raises,
                        # so a failing configuration does not leave
                        # /tmp/<token>/ behind.
                        await fs.rmtree(sema, base)
                    assert not await fs.isdir(base)

    return test_specs
async def gs_filesystem(request):
    """Yield ``(sema, fs, base)`` for a scratch directory in Google Storage.

    ``request.param`` selects the filesystem: a value starting with
    ``'router/'`` gives a router over local + Google Storage, anything else
    must end in ``'gs'`` and gives a bare Google Storage filesystem. The
    scratch directory lives under ``$HAIL_TEST_STORAGE_URI/tmp/<token>/`` and
    is removed after the consumer resumes the generator.
    """
    unique = secret_alnum_string()

    with ThreadPoolExecutor() as pool:
        if not request.param.startswith('router/'):
            assert request.param.endswith('gs')
            fs = GoogleStorageAsyncFS()
        else:
            backends = [LocalAsyncFS(pool), GoogleStorageAsyncFS()]
            fs = RouterAsyncFS('file', filesystems=backends)

        async with fs:
            storage_uri = os.environ['HAIL_TEST_STORAGE_URI']
            # Only gs:// locations are supported by this fixture.
            assert storage_uri.startswith('gs://')
            base = f'{storage_uri}/tmp/{unique}/'

            await fs.mkdir(base)

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)

            still_there = await fs.isdir(base)
            assert not still_there
def setUp(self):
    """Create the backend, derive remote paths and seed the input resources.

    Sets ``self.backend``, ``self.remote_tmpdir`` (always ending in ``/``),
    ``self.bucket``, ``self.cloud_input_dir``, ``self.cloud_output_path`` and
    ``self.cloud_output_dir``, then makes sure the three ``hello*.txt``
    resource files exist under ``self.cloud_input_dir``.
    """
    self.backend = ServiceBackend()

    remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
    if not remote_tmpdir.endswith('/'):
        remote_tmpdir += '/'
    self.remote_tmpdir = remote_tmpdir

    if remote_tmpdir.startswith('gs://'):
        self.bucket = re.fullmatch(
            'gs://(?P<bucket_name>[^/]+).*',
            remote_tmpdir).groupdict()['bucket_name']
    else:
        assert remote_tmpdir.startswith('hail-az://')
        storage_account, container_name = re.fullmatch(
            'hail-az://(?P<storage_account>[^/]+)/(?P<container_name>[^/]+).*',
            remote_tmpdir).groups()
        self.bucket = f'{storage_account}/{container_name}'

    self.cloud_input_dir = f'{self.remote_tmpdir}batch-tests/resources'

    token = uuid.uuid4()
    # NOTE(review): cloud_output_path starts with '/', and remote_tmpdir ends
    # with '/', so cloud_output_dir contains '//'. Left unchanged because
    # other tests may depend on the exact values.
    self.cloud_output_path = f'/batch-tests/{token}'
    self.cloud_output_dir = f'{self.remote_tmpdir}{self.cloud_output_path}'

    in_cluster_key_file = '/test-gsa-key/key.json'
    if not os.path.exists(in_cluster_key_file):
        in_cluster_key_file = None

    router_fs = RouterAsyncFS(
        'gs',
        gcs_kwargs={
            'project': 'hail-vdc',
            'credentials_file': in_cluster_key_file
        },
        azure_kwargs={'credential_file': in_cluster_key_file})

    # Resource files every test expects to find in cloud_input_dir.
    resources = (
        ('hello.txt', b'hello world'),
        ('hello spaces.txt', b'hello'),
        ('hello (foo) spaces.txt', b'hello'),
    )
    try:
        for filename, contents in resources:
            url = f'{self.remote_tmpdir}batch-tests/resources/{filename}'
            if not async_to_blocking(router_fs.exists(url)):
                async_to_blocking(router_fs.write(url, contents))
    finally:
        # The filesystem is only needed for seeding; close it here so each
        # test does not leave an open filesystem behind.
        async_to_blocking(router_fs.close())
class Tests(unittest.TestCase):
    """Exercise ``BlockingMemoryClient`` reads and writes against a 'gs' router filesystem."""

    def setUp(self):
        """Pick a unique remote test path and build the filesystem and client."""
        tmpdir_root = get_user_config().get('batch', 'remote_tmpdir')
        unique_id = uuid.uuid4()
        self.test_path = f'{tmpdir_root}memory-tests/{unique_id}'

        gcs_fs = aiogoogle.GoogleStorageAsyncFS(project=PROJECT)
        self.fs = RouterAsyncFS('gs', filesystems=[gcs_fs])
        self.client = BlockingMemoryClient(fs=self.fs)
        self.temp_files = set()

    def tearDown(self):
        """Delete everything under the test path, then close the client."""
        removal = self.fs.rmtree(None, self.test_path)
        async_to_blocking(removal)
        self.client.close()

    async def add_temp_file_from_string(self, name: str, str_value: bytes):
        """Write `str_value` to ``<test_path>/<name>`` and return that URL."""
        handle = f'{self.test_path}/{name}'
        async with await self.fs.create(handle) as writer:
            await writer.write(str_value)
        return handle

    def test_non_existent(self):
        """A missing file is reported as ``None`` on every lookup."""
        missing = f'{self.test_path}/nonexistent'
        for _ in range(3):
            self.assertIsNone(self.client._get_file_if_exists(missing))

    def test_small_write_around(self):
        """Files written directly to storage become readable through the client."""

        async def read(url):
            async with await self.fs.open(url) as reader:
                return await reader.read()

        cases = [('empty_file', b''), ('null', b'\0'),
                 ('small', b'hello world')]
        for file, data in cases:
            handle = async_to_blocking(
                self.add_temp_file_from_string(file, data))
            expected = async_to_blocking(read(handle))
            self.assertEqual(expected, data)

            # One initial lookup plus at most ten retries while the client
            # still reports the file as absent.
            cached = self.client._get_file_if_exists(handle)
            for _ in range(10):
                if cached is not None:
                    break
                cached = self.client._get_file_if_exists(handle)
            self.assertEqual(cached, expected)

    def test_small_write_through(self):
        """Files written through the client can be read back immediately."""
        cases = [('empty_file2', b''), ('null2', b'\0'),
                 ('small2', b'hello world')]
        for file, data in cases:
            filename = f'{self.test_path}/{file}'
            self.client.write_file(filename, data)
            self.assertEqual(self.client._get_file_if_exists(filename), data)
async def router_filesystem(
        request
) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]]]:
    """Yield ``(sema, fs, bases)`` for a router over local, GCS, S3 and Azure.

    ``bases`` maps each scheme (``'file'``, ``'gs'``, ``'s3'``, ``'hail-az'``)
    to a scratch directory ``.../tmp/<token>/``. Bucket, account and container
    names come from the ``HAIL_TEST_*`` environment variables. After the
    consumer resumes the generator all four directories are removed and
    asserted to be gone.
    """
    unique = secrets.token_hex(16)

    with ThreadPoolExecutor() as pool:
        backends = [
            LocalAsyncFS(pool),
            GoogleStorageAsyncFS(),
            S3AsyncFS(pool),
            AzureAsyncFS()
        ]
        async with RouterAsyncFS('file', filesystems=backends) as fs:
            file_base = f'/tmp/{unique}/'
            await fs.mkdir(file_base)

            gs_base = f"gs://{os.environ['HAIL_TEST_GCS_BUCKET']}/tmp/{unique}/"
            s3_base = f"s3://{os.environ['HAIL_TEST_S3_BUCKET']}/tmp/{unique}/"

            azure_account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
            azure_container = os.environ['HAIL_TEST_AZURE_CONTAINER']
            azure_base = f'hail-az://{azure_account}/{azure_container}/tmp/{unique}/'

            bases = {
                'file': file_base,
                'gs': gs_base,
                's3': s3_base,
                'hail-az': azure_base
            }
            # Kept separately from `bases`, which is handed to the consumer.
            roots = (file_base, gs_base, s3_base, azure_base)

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, bases)
                removals = [
                    functools.partial(fs.rmtree, sema, root) for root in roots
                ]
                await bounded_gather2(sema, *removals)

            for root in roots:
                assert not await fs.isdir(root)
async def filesystem(
        request) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, str]]:
    """Yield ``(sema, fs, base)`` for the filesystem named by ``request.param``.

    A parameter starting with ``'router/'`` selects a router over all four
    backends; otherwise ``'file'`` or a suffix of ``'gs'``, ``'s3'`` or
    ``'hail-az'`` selects a single backend. The suffix also selects where the
    scratch directory ``.../tmp/<token>/`` is created. The directory is
    removed after the consumer resumes the generator.
    """
    unique = secret_alnum_string()
    kind = request.param

    with ThreadPoolExecutor() as pool:
        fs: AsyncFS
        if kind.startswith('router/'):
            fs = RouterAsyncFS('file',
                               filesystems=[
                                   LocalAsyncFS(pool),
                                   GoogleStorageAsyncFS(),
                                   S3AsyncFS(pool),
                                   AzureAsyncFS()
                               ])
        elif kind == 'file':
            fs = LocalAsyncFS(pool)
        elif kind.endswith('gs'):
            fs = GoogleStorageAsyncFS()
        elif kind.endswith('s3'):
            fs = S3AsyncFS(pool)
        else:
            assert kind.endswith('hail-az')
            fs = AzureAsyncFS()

        async with fs:
            # The suffix decides the storage location, for router and
            # single-backend parameters alike.
            if kind.endswith('file'):
                base = f'/tmp/{unique}/'
            elif kind.endswith('gs'):
                gcs_bucket = os.environ['HAIL_TEST_GCS_BUCKET']
                base = f'gs://{gcs_bucket}/tmp/{unique}/'
            elif kind.endswith('s3'):
                s3_bucket = os.environ['HAIL_TEST_S3_BUCKET']
                base = f's3://{s3_bucket}/tmp/{unique}/'
            else:
                assert kind.endswith('hail-az')
                az_account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
                az_container = os.environ['HAIL_TEST_AZURE_CONTAINER']
                base = f'hail-az://{az_account}/{az_container}/tmp/{unique}/'

            await fs.mkdir(base)

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)

            still_there = await fs.isdir(base)
            assert not still_there
async def create(*,
                 billing_project: Optional[str] = None,
                 batch_client: Optional[aiohb.BatchClient] = None,
                 skip_logging_configuration: Optional[bool] = None,
                 disable_progress_bar: bool = True,
                 remote_tmpdir: Optional[str] = None,
                 flags: Optional[Dict[str, str]] = None):
    """Assemble a ``ServiceBackend`` from explicit arguments and user config.

    `billing_project` falls back to the ``batch/billing_project`` user
    setting; a missing value raises ``ValueError``. A batch client is created
    for that project unless `batch_client` is supplied. The per-version local
    reference cache directory is created if absent.
    `skip_logging_configuration` is accepted and discarded.
    """
    # Accepted for interface compatibility only.
    del skip_logging_configuration

    if billing_project is None:
        billing_project = get_user_config().get('batch',
                                                'billing_project',
                                                fallback=None)
        if billing_project is None:
            raise ValueError(
                "No billing project.  Call 'init_service' with the billing "
                "project or run 'hailctl config set batch/billing_project "
                "MY_BILLING_PROJECT'")

    async_fs = RouterAsyncFS('file')
    sync_fs = RouterFS(async_fs)

    if batch_client is None:
        batch_client = await aiohb.BatchClient.create(billing_project)
    blocking_client = hb.BatchClient.from_async(batch_client)

    attributes: Dict[str, str] = {}

    reference_cache = Path(get_user_local_cache_dir(), 'references', version())
    os.makedirs(reference_cache, exist_ok=True)

    resolved_tmpdir = get_remote_tmpdir('ServiceBackend',
                                        remote_tmpdir=remote_tmpdir)

    return ServiceBackend(
        billing_project=billing_project,
        sync_fs=sync_fs,
        async_fs=async_fs,
        bc=blocking_client,
        disable_progress_bar=disable_progress_bar,
        batch_attributes=attributes,
        user_local_reference_cache_dir=reference_cache,
        remote_tmpdir=resolved_tmpdir,
        flags=flags or {},
    )
def __init__(self,
             tmp_dir: str = '/tmp/',
             gsa_key_file: Optional[str] = None,
             extra_docker_run_flags: Optional[str] = None):
    """Record the temp directory and assemble the extra ``docker run`` flags.

    Parameters
    ----------
    tmp_dir:
        Temporary directory; trailing ``/`` characters are stripped.
    gsa_key_file:
        Key file mounted at ``/gsa-key/key.json``. Falls back to the
        ``HAIL_BATCH_GSA_KEY_FILE`` environment variable.
    extra_docker_run_flags:
        Extra flags. Falls back to the ``HAIL_BATCH_EXTRA_DOCKER_RUN_FLAGS``
        environment variable.
    """
    self._tmp_dir = tmp_dir.rstrip('/')

    # An explicit argument wins; the environment is consulted only when the
    # argument was not given.
    user_flags = extra_docker_run_flags
    if user_flags is None:
        user_flags = os.environ.get('HAIL_BATCH_EXTRA_DOCKER_RUN_FLAGS')

    docker_flags = ''
    if user_flags is not None:
        docker_flags += user_flags

    key_file = gsa_key_file
    if key_file is None:
        key_file = os.environ.get('HAIL_BATCH_GSA_KEY_FILE')
    if key_file is not None:
        docker_flags += f' -v {key_file}:/gsa-key/key.json'

    self._extra_docker_run_flags = docker_flags
    self.__fs: AsyncFS = RouterAsyncFS(default_scheme='file')
class BatchPoolExecutor:
    """An executor which executes Python functions in the cloud.

    :class:`.concurrent.futures.ProcessPoolExecutor` and
    :class:`.concurrent.futures.ThreadPoolExecutor` enable the use of all the
    computer cores available on a single computer. :class:`.BatchPoolExecutor`
    enables the use of an effectively arbitrary number of cloud computer cores.

    Functions provided to :meth:`.submit` are serialized using `dill
    <https://dill.readthedocs.io/en/latest/dill.html>`__, sent to a Python
    docker container in the cloud, deserialized, and executed. The results are
    serialized and returned to the machine from which :meth:`.submit` was
    called. The Python version in the docker container will share a major and
    minor version with the local process. The `image` parameter overrides this
    behavior.

    When used as a context manager (the ``with`` syntax), the executor will wait
    for all jobs to finish before finishing the ``with`` statement. This
    behavior can be controlled by the `wait_on_exit` parameter.

    This class creates a folder ``batch-pool-executor/<name>/`` under the
    remote temporary directory of the `backend`. This folder can be safely
    deleted after all jobs have completed.

    Examples
    --------

    Add ``3`` to ``6`` on a machine in the cloud and send the result back to
    this machine:

    >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
    ...     future_nine = bpe.submit(lambda: 3 + 6)
    >>> future_nine.result()  # doctest: +SKIP
    9

    :meth:`.map` facilitates the common case of executing a function on many
    values in parallel:

    >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
    ...     list(bpe.map(lambda x: x * 3, range(4)))
    [0, 3, 6, 9]

    Parameters
    ----------
    name:
        A name for the executor. Executors produce many batches and each batch
        will include this name as a prefix.
    backend:
        Backend used to execute the jobs. Must be a :class:`.ServiceBackend`.
    image:
        The name of a Docker image used for each submitted job. The image must
        include Python 3.7 or later and must have the ``dill`` Python package
        installed. If you intend to use ``numpy``, ensure that OpenBLAS is also
        installed. If unspecified, an image with a matching Python version and
        ``numpy``, ``scipy``, and ``sklearn`` installed is used. A default
        image is only selected when the local interpreter is Python 3.6, 3.7
        or 3.8; otherwise a :class:`ValueError` is raised.
    cpus_per_job:
        The number of CPU cores to allocate to each job. The default value is
        ``1``. The parameter is passed unaltered to :meth:`.Job.cpu`. This
        parameter's value is used to set several environment variables
        instructing BLAS and LAPACK to limit core use.
    wait_on_exit:
        If ``True`` or unspecified, wait for all jobs to complete when exiting a
        context. If ``False``, do not wait. This option has no effect if this
        executor is not used with the ``with`` syntax.
    cleanup_bucket:
        If ``True`` or unspecified, delete all temporary files in the cloud
        storage bucket when this executor fully shuts down. If Python crashes
        before the executor is shutdown, the files will not be deleted.
    project:
        If specified, the project to use when authenticating with Google
        Storage. Google Storage is used to transfer serialized values between
        this computer and the cloud machines that execute jobs.
    """

    def __init__(self,
                 *,
                 name: Optional[str] = None,
                 backend: Optional[ServiceBackend] = None,
                 image: Optional[str] = None,
                 cpus_per_job: Optional[Union[int, str]] = None,
                 wait_on_exit: bool = True,
                 cleanup_bucket: bool = True,
                 project: Optional[str] = None):
        # Resolve the image before acquiring any resource, so that an
        # unsupported Python version does not leave a freshly created
        # ServiceBackend or filesystem open.
        version = sys.version_info
        if image is None:
            if version.major != 3 or version.minor not in (6, 7, 8):
                raise ValueError(
                    f'You must specify an image if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})'
                )
            image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'

        self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
        self.backend = backend or ServiceBackend()
        if not isinstance(self.backend, ServiceBackend):
            raise ValueError(
                f'BatchPoolExecutor is not compatible with {type(backend)}')
        self.batches: List[Batch] = []
        self.directory = self.backend.remote_tmpdir + f'batch-pool-executor/{self.name}/'
        self.inputs = self.directory + 'inputs/'
        self.outputs = self.directory + 'outputs/'
        self.fs = RouterAsyncFS('file', gcs_kwargs={'project': project})
        self.futures: List[BatchPoolFuture] = []
        self.finished_future_count = 0
        self._shutdown = False
        # Guards _cleanup so that resources are released at most once.
        self._cleaned_up = False
        self.image = image
        self.cpus_per_job = cpus_per_job
        self.cleanup_bucket = cleanup_bucket
        self.wait_on_exit = wait_on_exit

    def __enter__(self):
        return self

    def map(self,
            fn: Callable,
            *iterables: Iterable[Any],
            timeout: Optional[Union[int, float]] = None,
            chunksize: int = 1):
        """Call `fn` on cloud machines with arguments from `iterables`.

        This function returns a generator which will produce each result in the
        same order as the `iterables`, only blocking if the result is not yet
        ready. You can convert the generator to a list with :class:`.list`.

        Examples
        --------

        Do nothing, but on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(lambda x: x, range(4)))
        [0, 1, 2, 3]

        Call a function with two parameters, on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(lambda x, y: x + y,
        ...                  ["white", "cat", "best"],
        ...                  ["house", "dog", "friend"]))
        ["whitehouse", "catdog", "bestfriend"]

        Generate products of random matrices, on the cloud:

        >>> def random_product(seed):
        ...     np.random.seed(seed)
        ...     w = np.random.rand(1, 100)
        ...     u = np.random.rand(100, 1)
        ...     return float(w @ u)
        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(random_product, range(4)))
        [24.440006386777277, 23.325755364428026, 23.920184804993806, 25.47912882125101]

        Parameters
        ----------
        fn:
            The function to execute.
        iterables:
            The `iterables` are zipped together and each tuple is used as
            arguments to `fn`. See the second example for more detail. It is not
            possible to pass keyword arguments. Each element of `iterables` must
            have the same length.
        timeout:
            This is roughly a timeout on how long we wait on each function
            call. Specifically, each call to the returned generator's
            :class:`.BatchPoolFuture` :meth:`.iterator.__next__` invokes
            :meth:`.BatchPoolFuture.result` with this `timeout`.
        chunksize:
            The number of tasks to schedule in the same docker container. Docker
            containers take about 5 seconds to start. Ideally, each task should
            take an order of magnitude more time than start-up time. You can
            make the chunksize larger to reduce parallelism but increase the
            amount of meaningful work done per-container.
        """
        agen = async_to_blocking(
            self.async_map(fn, iterables, timeout=timeout, chunksize=chunksize))

        def generator_from_async_generator(aiter):
            # Drive the async generator one item at a time from blocking code.
            try:
                while True:
                    yield async_to_blocking(aiter.__anext__())
            except StopAsyncIteration:
                return

        return generator_from_async_generator(agen.__aiter__())

    async def async_map(self,
                        fn: Callable,
                        iterables: Iterable[Iterable[Any]],
                        timeout: Optional[Union[int, float]] = None,
                        chunksize: int = 1):
        """Asyncio compatible version of :meth:`.map`.

        Returns an asynchronous generator of results, in the order of the
        inputs. If submitting or waiting on any call fails, the remaining
        submitted work is cancelled and the exception is re-raised.
        """
        if not iterables:
            # Return an *async* generator even when there is nothing to do:
            # `map` calls `__aiter__` on the result, which a plain
            # `iter([])` does not provide.
            async def no_results():
                return
                yield  # unreachable; makes this an async generator

            return no_results()

        if chunksize > 1:
            list_per_argument = [list(x) for x in iterables]
            n = len(list_per_argument[0])
            assert all(n == len(x) for x in list_per_argument)
            n_chunks = (n + chunksize - 1) // chunksize
            iterables_chunks = [
                list(partition(n_chunks, x)) for x in list_per_argument
            ]
            iterables_chunks = [
                chunks for chunks in iterables_chunks if len(chunks) > 0
            ]
            fn = chunk(fn)
            iterables = iterables_chunks

        submit_tasks = [
            asyncio.ensure_future(self.async_submit(fn, *arguments))
            for arguments in zip(*iterables)
        ]
        try:
            bp_futures = [await t for t in submit_tasks]
        except:  # deliberately broad: also undo work on cancellation
            for t in submit_tasks:
                if t.done() and not t.exception():
                    await t.result().async_cancel()
                elif not t.done():
                    t.cancel()
            raise

        async def async_result_or_cancel_all(future):
            try:
                return await future.async_result(timeout=timeout)
            except:  # deliberately broad: cancel siblings, then re-raise
                await asyncio.gather(
                    *[bp_fut.async_cancel() for bp_fut in bp_futures],
                    return_exceptions=True)
                raise

        if chunksize > 1:
            # Each future holds a list of results; flatten them.
            return (val for future in bp_futures
                    for val in await async_result_or_cancel_all(future))

        return (await async_result_or_cancel_all(future)
                for future in bp_futures)

    def submit(self, fn: Callable, *args: Any,
               **kwargs: Any) -> 'BatchPoolFuture':
        """Call `fn` on a cloud machine with all remaining arguments and keyword arguments.

        The function, any objects it references, the arguments, and the keyword
        arguments will be serialized to the cloud machine. Python modules are
        not serialized, so you must ensure any needed Python modules and
        packages are already present in the underlying Docker image. For more
        details see the `image` argument to :class:`.BatchPoolExecutor`.

        This function does not return the function's output, it returns a
        :class:`.BatchPoolFuture` whose :meth:`.BatchPoolFuture.result` method
        can be used to access the value.

        Examples
        --------

        Do nothing, but on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(lambda x: x, 4)
        ...     future.result()
        4

        Call a function with two arguments and one keyword argument, on the
        cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(lambda x, y, z: x + y + z,
        ...                         "poly", "ethyl", z="ene")
        ...     future.result()
        "polyethylene"

        Generate a product of two random matrices, on the cloud:

        >>> def random_product(seed):
        ...     np.random.seed(seed)
        ...     w = np.random.rand(1, 100)
        ...     u = np.random.rand(100, 1)
        ...     return float(w @ u)
        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(random_product, 1)
        ...     future.result()
        23.325755364428026

        Parameters
        ----------
        fn:
            The function to execute.
        args:
            Arguments for the function.
        kwargs:
            Keyword arguments for the function.
        """
        return async_to_blocking(self.async_submit(fn, *args, **kwargs))

    async def async_submit(self, unapplied: Callable, *args: Any,
                           **kwargs: Any) -> 'BatchPoolFuture':
        """Asyncio compatible version of :meth:`BatchPoolExecutor.submit`.

        Raises
        ------
        RuntimeError
            If the executor has already been shut down.
        """
        if self._shutdown:
            raise RuntimeError('BatchPoolExecutor has already been shutdown.')

        try:
            name = unapplied.__name__
        except AttributeError:
            name = '<anonymous>'
        name = f'{name}-{secret_alnum_string(4)}'
        batch = Batch(name=self.name + '-' + name,
                      backend=self.backend,
                      default_image=self.image)
        self.batches.append(batch)
        j = batch.new_job(name)

        # Pickle the fully applied call and upload it as the job's input.
        pipe = BytesIO()
        dill.dump(functools.partial(unapplied, *args, **kwargs),
                  pipe,
                  recurse=True)
        pipe.seek(0)
        pickledfun_remote = self.inputs + f'{name}/pickledfun'
        await self.fs.write(pickledfun_remote, pipe.getvalue())
        pickledfun_local = batch.read_input(pickledfun_remote)

        thread_limit = "1"
        if self.cpus_per_job:
            j.cpu(self.cpus_per_job)
            thread_limit = str(
                int(max(1.0, cpu_spec_to_float(self.cpus_per_job))))
        j.env("OMP_NUM_THREADS", thread_limit)
        j.env("OPENBLAS_NUM_THREADS", thread_limit)
        j.env("MKL_NUM_THREADS", thread_limit)
        j.env("VECLIB_MAXIMUM_THREADS", thread_limit)
        j.env("NUMEXPR_NUM_THREADS", thread_limit)

        j.command('set -ex')
        # The script runs the pickled call inside the container and always
        # writes a (value_or_exception, traceback_or_None) pair to j.ofile.
        j.command(f'''python3 -c "
import base64
import dill
import traceback
with open(\\"{j.ofile}\\", \\"wb\\") as out:
    try:
        with open(\\"{pickledfun_local}\\", \\"rb\\") as f:
            dill.dump((dill.load(f)(), None), out, recurse=True)
    except Exception as e:
        print(\\"BatchPoolExecutor encountered an exception:\\")
        traceback.print_exc()
        dill.dump((e, traceback.format_exception(type(e), e, e.__traceback__)), out, recurse=True)
"''')
        output_gcs = self.outputs + f'{name}/output'
        batch.write_output(j.ofile, output_gcs)
        backend_batch = batch.run(wait=False,
                                  disable_progress_bar=True)._async_batch
        try:
            return BatchPoolFuture(
                self, backend_batch,
                low_level_batch_client.Job.submitted_job(backend_batch, 1),
                output_gcs)
        except:  # deliberately broad: do not leave an orphaned running batch
            await backend_batch.cancel()
            raise

    def __exit__(self, exc_type: Optional[Type[BaseException]],
                 exc_value: Optional[BaseException],
                 traceback: Optional[TracebackType]):
        self.shutdown(wait=self.wait_on_exit)

    def _add_future(self, f):
        self.futures.append(f)

    def _finish_future(self):
        self.finished_future_count += 1
        # The last future to finish after shutdown triggers the cleanup.
        if self._shutdown and self.finished_future_count == len(self.futures):
            self._cleanup()

    def shutdown(self, wait: bool = True):
        """Allow temporary resources to be cleaned up.

        Until shutdown is called, some temporary cloud storage files will
        persist. After shutdown has been called *and* all outstanding jobs have
        completed, these files will be deleted.

        Calling this method more than once is safe; resources are released
        only once.

        Parameters
        ----------
        wait:
            If true, wait for all jobs to complete before returning from this
            method.
        """
        if wait:

            async def ignore_exceptions(f):
                # Only completion matters here; job failures are reported
                # through the futures themselves.
                try:
                    await f.async_result()
                except Exception:
                    pass

            async_to_blocking(
                asyncio.gather(*[ignore_exceptions(f) for f in self.futures]))
        if self.finished_future_count == len(self.futures):
            self._cleanup()
        self._shutdown = True

    def _cleanup(self):
        # Release resources at most once, e.g. when shutdown() is called
        # explicitly inside a ``with`` block and again by __exit__.
        if self._cleaned_up:
            return
        self._cleaned_up = True
        try:
            if self.cleanup_bucket:
                async_to_blocking(self.fs.rmtree(None, self.directory))
        finally:
            # Close the filesystem and the backend even if deleting the
            # temporary files failed.
            try:
                async_to_blocking(self.fs.close())
            finally:
                self.backend.close()
def __init__(self,
             *args,
             billing_project: Optional[str] = None,
             bucket: Optional[str] = None,
             remote_tmpdir: Optional[str] = None,
             google_project: Optional[str] = None,
             token: Optional[str] = None):
    """Resolve billing project and remote temp directory, then connect.

    Parameters
    ----------
    args:
        Deprecated positional form of (`billing_project`, `bucket`); at most
        two values are accepted and each use emits a warning.
    billing_project:
        Billing project; falls back to the ``batch/billing_project`` user
        setting.
    bucket:
        Deprecated. A bucket *name*; ``gs://<bucket>/batch/`` is then used as
        the remote temp directory.
    remote_tmpdir:
        Storage URI starting with ``gs://`` or ``hail-az://``; falls back to
        the ``batch/remote_tmpdir`` user setting, then to the deprecated
        ``batch/bucket`` setting. A trailing ``/`` is appended if missing.
    google_project:
        Forwarded to the filesystem as ``gcs_kwargs={'project': ...}``.
    token:
        Forwarded to ``BatchClient`` as ``_token``.

    Raises
    ------
    TypeError
        If more than two positional arguments are given, or an argument is
        given both positionally and by keyword.
    ValueError
        If no billing project or remote temp directory can be determined, if
        both `remote_tmpdir` and `bucket` are given, if `bucket` contains
        ``gs://``, or if `remote_tmpdir` has an unsupported scheme.
    """
    if len(args) > 2:
        raise TypeError(
            f'ServiceBackend() takes 2 positional arguments but {len(args)} were given'
        )
    if len(args) >= 1:
        if billing_project is not None:
            raise TypeError(
                "ServiceBackend() got multiple values for argument 'billing_project'"
            )
        warnings.warn(
            "Use of deprecated positional argument 'billing_project' in ServiceBackend(). Specify 'billing_project' as a keyword argument instead."
        )
        billing_project = args[0]
    if len(args) >= 2:
        if bucket is not None:
            raise TypeError(
                "ServiceBackend() got multiple values for argument 'bucket'")
        warnings.warn(
            "Use of deprecated positional argument 'bucket' in ServiceBackend(). Specify 'bucket' as a keyword argument instead."
        )
        bucket = args[1]

    if billing_project is None:
        billing_project = get_user_config().get('batch',
                                                'billing_project',
                                                fallback=None)
    if billing_project is None:
        raise ValueError(
            'the billing_project parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/billing_project '
            'MY_BILLING_PROJECT`')

    user_config = get_user_config()

    if bucket is not None:
        warnings.warn(
            "Use of deprecated argument 'bucket' in ServiceBackend(). Specify 'remote_tmpdir' as a keyword argument instead."
        )

    if remote_tmpdir is not None and bucket is not None:
        raise ValueError(
            "Cannot specify both 'remote_tmpdir' and 'bucket' in ServiceBackend(). Specify 'remote_tmpdir' as a keyword argument instead."
        )

    if bucket is None and remote_tmpdir is None:
        remote_tmpdir = user_config.get('batch',
                                        'remote_tmpdir',
                                        fallback=None)

    if remote_tmpdir is None:
        if bucket is None:
            bucket = user_config.get('batch', 'bucket', fallback=None)
            # Warn only when the deprecated setting is actually in use.
            # Previously the warning also fired when nothing was configured,
            # right before the "must be set" error below.
            if bucket is not None:
                warnings.warn(
                    "Using deprecated configuration setting 'batch/bucket'. Run `hailctl config set batch/remote_tmpdir` "
                    "to set the default for 'remote_tmpdir' instead.")
        if bucket is None:
            raise ValueError(
                "The 'remote_tmpdir' parameter of ServiceBackend must be set. "
                'Run `hailctl config set batch/remote_tmpdir REMOTE_TMPDIR`')
        if 'gs://' in bucket:
            raise ValueError(
                'The bucket parameter to ServiceBackend() should be a bucket name, not a path. '
                'Use the remote_tmpdir parameter to specify a path.')
        remote_tmpdir = f'gs://{bucket}/batch'
    else:
        schemes = {'gs', 'hail-az'}
        found_scheme = any(
            remote_tmpdir.startswith(f'{scheme}://') for scheme in schemes)
        if not found_scheme:
            raise ValueError(
                f'remote_tmpdir must be a storage uri path like gs://bucket/folder. Possible schemes include {schemes}'
            )
    if remote_tmpdir[-1] != '/':
        remote_tmpdir += '/'
    self.remote_tmpdir = remote_tmpdir

    # Create the client only after every argument has been validated, so a
    # configuration error does not leave an open client behind.
    self._batch_client = BatchClient(billing_project, _token=token)

    gcs_kwargs = {'project': google_project}
    self.__fs: AsyncFS = RouterAsyncFS(default_scheme='file',
                                       gcs_kwargs=gcs_kwargs)
async def parallel_file_exists_async(fpaths: List[str],
                                     parallelism: int = 750
                                     ) -> Dict[str, bool]:
    """
    Check the existence of many files concurrently.

    Paths ending in ``.ht`` or ``.mt`` are checked through their ``_SUCCESS``
    marker file. A progress bar is advanced once per checked path.

    :param fpaths: List of file paths to check. Files can be in local or Google cloud storage.
    :param parallelism: Value passed to `bounded_gather` as `parallelism`. Default is 750.
    :return: Dictionary of file paths (str) and whether the file exists (boolean).
    """

    async def async_file_exists(fs: AsyncFS, fpath: str) -> bool:
        """
        Report whether `fpath` can be stat'ed on `fs`.

        :param fs: AsyncFS object.
        :param fpath: Path to file to check.
        :return: Whether file exists.
        """
        _, extension = os.path.splitext(fpath)
        if extension in (".ht", ".mt"):
            fpath += "/_SUCCESS"
        try:
            await fs.statfile(fpath)
        except FileNotFoundError:
            return False
        return True

    progress = tqdm(total=len(fpaths),
                    desc="check files for existence",
                    disable=False)
    with progress as pbar, ThreadPoolExecutor() as thread_pool:
        backends = [LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()]
        async with RouterAsyncFS("file", filesystems=backends) as fs:

            def check_existence_and_update_pbar_thunk(
                    fpath: str) -> Callable:
                """
                Build a zero-argument coroutine function for one path.

                Returning a function rather than a coroutine delays coroutine
                creation to avoid creating too many live coroutines.

                :param fpath: Path to file to check.
                :return: Function that checks for file existence and updates progress bar.
                """

                async def unapplied_function():
                    exists = await async_file_exists(fs, fpath)
                    pbar.update(1)
                    return exists

                return unapplied_function

            pending_checks = [
                check_existence_and_update_pbar_thunk(path) for path in fpaths
            ]
            outcomes = await bounded_gather(*pending_checks,
                                            parallelism=parallelism)

    return dict(zip(fpaths, outcomes))