async def filesystem(request):
    token = secret_alnum_string()
    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS(
                'file', [LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        else:
            fs = GoogleStorageAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            else:
                assert request.param.endswith('gs')
                bucket = os.environ['HAIL_TEST_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'

            await fs.mkdir(base)

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)

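A minimal usage sketch, not taken from the source: assuming the fixture above is registered with pytest-asyncio as something like `@pytest.fixture(params=['file', 'gs', 'router/file', 'router/gs'])` (the parameter values are inferred from the branches in its body), a test consuming it could look like the following. The test name and the `keep` object name are placeholders; `create`, `isdir`, and `statfile` are the same calls the fixtures above already use.

import pytest


@pytest.mark.asyncio  # assumes pytest-asyncio is installed and configured for async fixtures
async def test_isdir_after_create(filesystem):
    sema, fs, base = filesystem
    # Create an object under base so the prefix is non-empty, then check it is visible.
    async with await fs.create(f'{base}keep'):
        pass
    assert await fs.isdir(base)
    await fs.statfile(f'{base}keep')
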
async def router_filesystem(request):
    token = secrets.token_hex(16)
    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS(
                'file', [LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()]) as fs:
            file_base = f'/tmp/{token}/'
            await fs.mkdir(file_base)

            bucket = os.environ['HAIL_TEST_BUCKET']
            gs_base = f'gs://{bucket}/tmp/{token}/'

            bases = {
                'file': file_base,
                'gs': gs_base
            }

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, bases)
                await bounded_gather2(sema,
                                      fs.rmtree(sema, file_base),
                                      fs.rmtree(sema, gs_base))

            assert not await fs.isdir(file_base)
            assert not await fs.isdir(gs_base)

async def gs_filesystem(request):
    token = secret_alnum_string()
    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file', filesystems=[
                LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()])
        else:
            assert request.param.endswith('gs')
            fs = GoogleStorageAsyncFS()
        async with fs:
            test_storage_uri = os.environ['HAIL_TEST_STORAGE_URI']
            protocol = 'gs://'
            assert test_storage_uri[:len(protocol)] == protocol
            base = f'{test_storage_uri}/tmp/{token}/'

            await fs.mkdir(base)

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)

async def copy_test_specs():
    test_specs = []

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS(
                'file', filesystems=[LocalAsyncFS(thread_pool)]) as fs:
            for config in copy_test_configurations():
                token = secrets.token_hex(16)

                base = f'/tmp/{token}/'
                src_base = f'{base}src/'
                dest_base = f'{base}dest/'

                await fs.mkdir(base)
                await fs.mkdir(src_base)
                await fs.mkdir(dest_base)
                # make sure dest_base exists
                async with await fs.create(f'{dest_base}keep'):
                    pass

                sema = asyncio.Semaphore(50)
                async with sema:
                    result = await run_test_spec(sema, fs, config, src_base, dest_base)
                    config['result'] = result

                    test_specs.append(config)

                    await fs.rmtree(sema, base)
                assert not await fs.isdir(base)

    return test_specs

def _fs(self) -> AsyncFS:
    if self._DEPRECATED_project is not None:
        if self._DEPRECATED_fs is None:
            self._DEPRECATED_fs = RouterAsyncFS('file', [
                LocalAsyncFS(ThreadPoolExecutor()),
                GoogleStorageAsyncFS(project=self._DEPRECATED_project)
            ])
        return self._DEPRECATED_fs
    return self._backend._fs

async def filesystem(request) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, str]]:
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        fs: AsyncFS
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file', filesystems=[
                LocalAsyncFS(thread_pool),
                GoogleStorageAsyncFS(),
                S3AsyncFS(thread_pool),
                AzureAsyncFS()
            ])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        elif request.param.endswith('gs'):
            fs = GoogleStorageAsyncFS()
        elif request.param.endswith('s3'):
            fs = S3AsyncFS(thread_pool)
        else:
            assert request.param.endswith('hail-az')
            fs = AzureAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            elif request.param.endswith('gs'):
                bucket = os.environ['HAIL_TEST_GCS_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'
            elif request.param.endswith('s3'):
                bucket = os.environ['HAIL_TEST_S3_BUCKET']
                base = f's3://{bucket}/tmp/{token}/'
            else:
                assert request.param.endswith('hail-az')
                account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
                container = os.environ['HAIL_TEST_AZURE_CONTAINER']
                base = f'hail-az://{account}/{container}/tmp/{token}/'

            await fs.mkdir(base)

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)

async def local_filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        async with LocalAsyncFS(thread_pool) as fs:
            base = f'/tmp/{token}/'
            await fs.mkdir(base)

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)

async def router_filesystem(
        request
) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]]]:
    token = secrets.token_hex(16)

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS('file', filesystems=[
                LocalAsyncFS(thread_pool),
                GoogleStorageAsyncFS(),
                S3AsyncFS(thread_pool),
                AzureAsyncFS()
        ]) as fs:
            file_base = f'/tmp/{token}/'
            await fs.mkdir(file_base)

            gs_bucket = os.environ['HAIL_TEST_GCS_BUCKET']
            gs_base = f'gs://{gs_bucket}/tmp/{token}/'

            s3_bucket = os.environ['HAIL_TEST_S3_BUCKET']
            s3_base = f's3://{s3_bucket}/tmp/{token}/'

            azure_account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
            azure_container = os.environ['HAIL_TEST_AZURE_CONTAINER']
            azure_base = f'hail-az://{azure_account}/{azure_container}/tmp/{token}/'

            bases = {
                'file': file_base,
                'gs': gs_base,
                's3': s3_base,
                'hail-az': azure_base
            }

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, bases)
                await bounded_gather2(
                    sema,
                    functools.partial(fs.rmtree, sema, file_base),
                    functools.partial(fs.rmtree, sema, gs_base),
                    functools.partial(fs.rmtree, sema, s3_base),
                    functools.partial(fs.rmtree, sema, azure_base))

            assert not await fs.isdir(file_base)
            assert not await fs.isdir(gs_base)
            assert not await fs.isdir(s3_base)
            assert not await fs.isdir(azure_base)

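A hypothetical test sketch, not from the source, showing how the bases dict and the bounded_gather2/functools.partial cleanup pattern above can be reused to exercise every scheme concurrently. The helper and test names are placeholders, and the import path for bounded_gather2 (hailtop.utils) is an assumption based on the snippet above.

import functools

import pytest
from hailtop.utils import bounded_gather2  # assumed import path


async def touch_and_stat(fs, base):
    # Create an empty object under this scheme's base, then confirm statfile can see it.
    async with await fs.create(f'{base}example'):
        pass
    await fs.statfile(f'{base}example')


@pytest.mark.asyncio  # assumes pytest-asyncio, as for the other fixtures
async def test_all_bases_writable(router_filesystem):
    sema, fs, bases = router_filesystem
    # Same concurrency pattern as the fixture's cleanup: thunks gathered under the semaphore.
    await bounded_gather2(
        sema,
        *[functools.partial(touch_and_stat, fs, base) for base in bases.values()])
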
def __init__(self,
             tmp_dir: str = '/tmp/',
             gsa_key_file: Optional[str] = None,
             extra_docker_run_flags: Optional[str] = None):
    self._tmp_dir = tmp_dir.rstrip('/')

    flags = ''

    if extra_docker_run_flags is not None:
        flags += extra_docker_run_flags
    elif os.environ.get('HAIL_BATCH_EXTRA_DOCKER_RUN_FLAGS') is not None:
        flags += os.environ['HAIL_BATCH_EXTRA_DOCKER_RUN_FLAGS']

    if gsa_key_file is None:
        gsa_key_file = os.environ.get('HAIL_BATCH_GSA_KEY_FILE')
    if gsa_key_file is not None:
        flags += f' -v {gsa_key_file}:/gsa-key/key.json'

    self._extra_docker_run_flags = flags
    self.__fs: AsyncFS = LocalAsyncFS(ThreadPoolExecutor())

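A hypothetical construction sketch, not from the source: assuming the constructor above belongs to hailtop.batch's LocalBackend (the class name is not shown in the snippet), the argument values below are placeholders illustrating the three parameters it handles.

backend = LocalBackend(
    tmp_dir='/tmp/batch/',
    gsa_key_file='/secrets/gsa-key.json',     # mounted into containers at /gsa-key/key.json
    extra_docker_run_flags='--network=host')  # appended to every docker run invocation
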
def __init__(self,
             *args,
             billing_project: Optional[str] = None,
             bucket: Optional[str] = None,
             remote_tmpdir: Optional[str] = None,
             google_project: Optional[str] = None,
             token: str = None):
    if len(args) > 2:
        raise TypeError(
            f'ServiceBackend() takes 2 positional arguments but {len(args)} were given')
    if len(args) >= 1:
        if billing_project is not None:
            raise TypeError(
                'ServiceBackend() got multiple values for argument \'billing_project\'')
        warnings.warn(
            'Use of deprecated positional argument \'billing_project\' in ServiceBackend(). '
            'Specify \'billing_project\' as a keyword argument instead.')
        billing_project = args[0]
    if len(args) >= 2:
        if bucket is not None:
            raise TypeError(
                'ServiceBackend() got multiple values for argument \'bucket\'')
        warnings.warn(
            'Use of deprecated positional argument \'bucket\' in ServiceBackend(). '
            'Specify \'bucket\' as a keyword argument instead.')
        bucket = args[1]

    if remote_tmpdir is not None and bucket is not None:
        raise ValueError(
            'Cannot specify both remote_tmpdir and bucket in ServiceBackend()')

    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        raise ValueError(
            'the billing_project parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/billing_project '
            'MY_BILLING_PROJECT`')
    self._batch_client = BatchClient(billing_project, _token=token)

    self.__fs: AsyncFS = RouterAsyncFS('file', [
        LocalAsyncFS(ThreadPoolExecutor()),
        GoogleStorageAsyncFS(project=google_project)
    ])

    if remote_tmpdir is None:
        if bucket is None:
            bucket = get_user_config().get('batch', 'bucket', fallback=None)
        if bucket is None:
            raise ValueError(
                'either the bucket or remote_tmpdir parameter of ServiceBackend '
                'must be set or run `hailctl config set batch/bucket MY_BUCKET`')
        if 'gs://' in bucket:
            raise ValueError(
                'The bucket parameter to ServiceBackend() should be a bucket name, not a path. '
                'Use the remote_tmpdir parameter to specify a path.')
        remote_tmpdir = f'gs://{bucket}/batch'
    else:
        if not remote_tmpdir.startswith('gs://'):
            raise ValueError(
                'remote_tmpdir must be a google storage path like gs://bucket/folder')

    if remote_tmpdir[-1] != '/':
        remote_tmpdir += '/'
    self.remote_tmpdir = remote_tmpdir

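A hypothetical construction sketch, not from the source, showing the keyword-only usage the __init__ above steers callers toward; the billing project and bucket names are placeholders.

backend = ServiceBackend(
    billing_project='my-billing-project',
    remote_tmpdir='gs://my-bucket/batch/tmp')

# The deprecated positional form still works but emits warnings and accepts at most
# two positional arguments (billing_project, bucket):
# backend = ServiceBackend('my-billing-project', 'my-bucket')
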
async def parallel_file_exists_async(
    fpaths: List[str], parallelism: int = 750
) -> Dict[str, bool]:
    """
    Check whether a large number of files exist.

    Created for use with hail Batch jobs.
    Normal `file_exists` function is very slow when checking a large number of files.

    :param fpaths: List of file paths to check. Files can be in local or Google cloud storage.
    :param parallelism: Integer that sets parallelism of file existence checking task. Default is 750.
    :return: Dictionary of file paths (str) and whether the file exists (boolean).
    """

    async def async_file_exists(fs: AsyncFS, fpath: str) -> bool:
        """
        Determine file existence.

        :param fs: AsyncFS object.
        :param fpath: Path to file to check.
        :return: Whether file exists.
        """
        fext = os.path.splitext(fpath)[1]
        if fext in [".ht", ".mt"]:
            fpath += "/_SUCCESS"
        try:
            await fs.statfile(fpath)
        except FileNotFoundError:
            return False
        else:
            return True

    with tqdm(total=len(fpaths), desc="check files for existence", disable=False) as pbar:
        with ThreadPoolExecutor() as thread_pool:
            async with RouterAsyncFS("file", filesystems=[
                    LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()]) as fs:

                def check_existence_and_update_pbar_thunk(fpath: str) -> Callable:
                    """
                    Create function to check if file exists and update progress bar in stdout.

                    Function delays coroutine creation to avoid creating too many live coroutines.

                    :param fpath: Path to file to check.
                    :return: Function that checks for file existence and updates progress bar.
                    """

                    async def unapplied_function():
                        x = await async_file_exists(fs, fpath)
                        pbar.update(1)
                        return x

                    return unapplied_function

                file_existence_checks = [
                    check_existence_and_update_pbar_thunk(fpath) for fpath in fpaths
                ]
                file_existence = await bounded_gather(*file_existence_checks,
                                                      parallelism=parallelism)

    return dict(zip(fpaths, file_existence))

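A hypothetical calling sketch, not from the source: driving the coroutine above from synchronous code with asyncio.run; the paths and parallelism value are placeholders.

import asyncio

paths = ['gs://my-bucket/data/table.ht', '/tmp/local_file.txt']  # placeholder paths
exists = asyncio.run(parallel_file_exists_async(paths, parallelism=100))
for path, found in exists.items():
    print(path, 'exists' if found else 'missing')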