Example 1
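The BatchPoolExecutor constructor: temporary inputs and outputs are placed under the backend's remote_tmpdir, and a RouterAsyncFS with default scheme 'file' is created with the Google project forwarded through gcs_kwargs. The full class appears in Example 11.
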
    def __init__(self,
                 *,
                 name: Optional[str] = None,
                 backend: Optional[ServiceBackend] = None,
                 image: Optional[str] = None,
                 cpus_per_job: Optional[Union[int, str]] = None,
                 wait_on_exit: bool = True,
                 cleanup_bucket: bool = True,
                 project: Optional[str] = None):
        self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
        self.backend = backend or ServiceBackend()
        if not isinstance(self.backend, ServiceBackend):
            raise ValueError(
                f'BatchPoolExecutor is not compatible with {type(backend)}')
        self.batches: List[Batch] = []
        self.directory = self.backend.remote_tmpdir + f'batch-pool-executor/{self.name}/'
        self.inputs = self.directory + 'inputs/'
        self.outputs = self.directory + 'outputs/'
        self.fs = RouterAsyncFS('file', gcs_kwargs={'project': project})
        self.futures: List[BatchPoolFuture] = []
        self.finished_future_count = 0
        self._shutdown = False
        version = sys.version_info
        if image is None:
            if version.major != 3 or version.minor not in (6, 7, 8):
                raise ValueError(
                    f'You must specify an image if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})'
                )
            self.image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'
        else:
            self.image = image
        self.cpus_per_job = cpus_per_job
        self.cleanup_bucket = cleanup_bucket
        self.wait_on_exit = wait_on_exit
Example 2
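A test setUp that builds a RouterAsyncFS over Google Cloud Storage (via aiogoogle.GoogleStorageAsyncFS) and wraps it in a BlockingMemoryClient; the surrounding test class appears in Example 6.
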
    def setUp(self):
        remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
        token = uuid.uuid4()
        self.test_path = f'{remote_tmpdir}memory-tests/{token}'
        self.fs = RouterAsyncFS(
            'gs',
            filesystems=[aiogoogle.GoogleStorageAsyncFS(project=PROJECT)])
        self.client = BlockingMemoryClient(fs=self.fs)
        self.temp_files = set()
Example 3
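copy_test_specs runs each copy test configuration against a purely local RouterAsyncFS backed by a LocalAsyncFS on a thread pool, creating and removing a temporary directory tree per configuration.
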
async def copy_test_specs():
    test_specs = []

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS(
                'file',
                filesystems=[LocalAsyncFS(thread_pool)]) as fs:
            for config in copy_test_configurations():
                token = secrets.token_hex(16)

                base = f'/tmp/{token}/'
                src_base = f'{base}src/'
                dest_base = f'{base}dest/'

                await fs.mkdir(base)
                await fs.mkdir(src_base)
                await fs.mkdir(dest_base)
                # make sure dest_base exists
                async with await fs.create(f'{dest_base}keep'):
                    pass

                sema = asyncio.Semaphore(50)
                async with sema:
                    result = await run_test_spec(sema, fs, config, src_base, dest_base)
                    config['result'] = result

                    test_specs.append(config)

                    await fs.rmtree(sema, base)
                    assert not await fs.isdir(base)

    return test_specs
Example 4
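A gs_filesystem fixture that, depending on the test parameter, yields either a RouterAsyncFS routing between local files and Google Cloud Storage or a bare GoogleStorageAsyncFS, rooted at a temporary path under HAIL_TEST_STORAGE_URI.
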
async def gs_filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file',
                               filesystems=[
                                   LocalAsyncFS(thread_pool),
                                   GoogleStorageAsyncFS()
                               ])
        else:
            assert request.param.endswith('gs')
            fs = GoogleStorageAsyncFS()
        async with fs:
            test_storage_uri = os.environ['HAIL_TEST_STORAGE_URI']
            protocol = 'gs://'
            assert test_storage_uri[:len(protocol)] == protocol
            base = f'{test_storage_uri}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example 5
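A batch test setUp that derives the bucket (or storage account and container) from remote_tmpdir, then uses a RouterAsyncFS configured with both gcs_kwargs and azure_kwargs to create the test resource files on demand.
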
    def setUp(self):
        self.backend = ServiceBackend()

        remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
        if not remote_tmpdir.endswith('/'):
            remote_tmpdir += '/'
        self.remote_tmpdir = remote_tmpdir

        if remote_tmpdir.startswith('gs://'):
            self.bucket = re.fullmatch(
                'gs://(?P<bucket_name>[^/]+).*',
                remote_tmpdir).groupdict()['bucket_name']
        else:
            assert remote_tmpdir.startswith('hail-az://')
            storage_account, container_name = re.fullmatch(
                'hail-az://(?P<storage_account>[^/]+)/(?P<container_name>[^/]+).*',
                remote_tmpdir).groups()
            self.bucket = f'{storage_account}/{container_name}'

        self.cloud_input_dir = f'{self.remote_tmpdir}batch-tests/resources'

        token = uuid.uuid4()
        self.cloud_output_path = f'/batch-tests/{token}'
        self.cloud_output_dir = f'{self.remote_tmpdir}{self.cloud_output_path}'

        in_cluster_key_file = '/test-gsa-key/key.json'
        if not os.path.exists(in_cluster_key_file):
            in_cluster_key_file = None

        router_fs = RouterAsyncFS(
            'gs',
            gcs_kwargs={
                'project': 'hail-vdc',
                'credentials_file': in_cluster_key_file
            },
            azure_kwargs={'credential_file': in_cluster_key_file})

        def sync_exists(url):
            return async_to_blocking(router_fs.exists(url))

        def sync_write(url, data):
            return async_to_blocking(router_fs.write(url, data))

        if not sync_exists(
                f'{self.remote_tmpdir}batch-tests/resources/hello.txt'):
            sync_write(f'{self.remote_tmpdir}batch-tests/resources/hello.txt',
                       b'hello world')
        if not sync_exists(
                f'{self.remote_tmpdir}batch-tests/resources/hello spaces.txt'):
            sync_write(
                f'{self.remote_tmpdir}batch-tests/resources/hello spaces.txt',
                b'hello')
        if not sync_exists(
                f'{self.remote_tmpdir}batch-tests/resources/hello (foo) spaces.txt'
        ):
            sync_write(
                f'{self.remote_tmpdir}batch-tests/resources/hello (foo) spaces.txt',
                b'hello')
Example 6
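A memory-service test suite extending the setUp from Example 2: the RouterAsyncFS is used both directly (create, open, rmtree) and indirectly through the BlockingMemoryClient cache.
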
class Tests(unittest.TestCase):
    def setUp(self):
        remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
        token = uuid.uuid4()
        self.test_path = f'{remote_tmpdir}memory-tests/{token}'
        self.fs = RouterAsyncFS(
            'gs',
            filesystems=[aiogoogle.GoogleStorageAsyncFS(project=PROJECT)])
        self.client = BlockingMemoryClient(fs=self.fs)
        self.temp_files = set()

    def tearDown(self):
        async_to_blocking(self.fs.rmtree(None, self.test_path))
        self.client.close()

    async def add_temp_file_from_string(self, name: str, str_value: bytes):
        handle = f'{self.test_path}/{name}'

        async with await self.fs.create(handle) as f:
            await f.write(str_value)

        return handle

    def test_non_existent(self):
        for _ in range(3):
            self.assertIsNone(
                self.client._get_file_if_exists(
                    f'{self.test_path}/nonexistent'))

    def test_small_write_around(self):
        async def read(url):
            async with await self.fs.open(url) as f:
                return await f.read()

        cases = [('empty_file', b''), ('null', b'\0'),
                 ('small', b'hello world')]
        for file, data in cases:
            handle = async_to_blocking(
                self.add_temp_file_from_string(file, data))
            expected = async_to_blocking(read(handle))
            self.assertEqual(expected, data)
            i = 0
            cached = self.client._get_file_if_exists(handle)
            while cached is None and i < 10:
                cached = self.client._get_file_if_exists(handle)
                i += 1
            self.assertEqual(cached, expected)

    def test_small_write_through(self):
        cases = [('empty_file2', b''), ('null2', b'\0'),
                 ('small2', b'hello world')]
        for file, data in cases:
            filename = f'{self.test_path}/{file}'
            self.client.write_file(filename, data)
            cached = self.client._get_file_if_exists(filename)
            self.assertEqual(cached, data)
Example 7
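A router_filesystem fixture that yields a single RouterAsyncFS routing across local files, Google Cloud Storage, S3, and Azure, together with per-scheme base URLs and a semaphore bounding concurrency; a test sketch follows the fixture.
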
async def router_filesystem(
    request
) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]]]:
    token = secrets.token_hex(16)

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS('file',
                                 filesystems=[
                                     LocalAsyncFS(thread_pool),
                                     GoogleStorageAsyncFS(),
                                     S3AsyncFS(thread_pool),
                                     AzureAsyncFS()
                                 ]) as fs:
            file_base = f'/tmp/{token}/'
            await fs.mkdir(file_base)

            gs_bucket = os.environ['HAIL_TEST_GCS_BUCKET']
            gs_base = f'gs://{gs_bucket}/tmp/{token}/'

            s3_bucket = os.environ['HAIL_TEST_S3_BUCKET']
            s3_base = f's3://{s3_bucket}/tmp/{token}/'

            azure_account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
            azure_container = os.environ['HAIL_TEST_AZURE_CONTAINER']
            azure_base = f'hail-az://{azure_account}/{azure_container}/tmp/{token}/'

            bases = {
                'file': file_base,
                'gs': gs_base,
                's3': s3_base,
                'hail-az': azure_base
            }

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, bases)
                await bounded_gather2(
                    sema, functools.partial(fs.rmtree, sema, file_base),
                    functools.partial(fs.rmtree, sema, gs_base),
                    functools.partial(fs.rmtree, sema, s3_base),
                    functools.partial(fs.rmtree, sema, azure_base))

            assert not await fs.isdir(file_base)
            assert not await fs.isdir(gs_base)
            assert not await fs.isdir(s3_base)
            assert not await fs.isdir(azure_base)
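
For reference, a minimal sketch of how a test might consume the router_filesystem fixture above, assuming it is registered as an async pytest fixture (for example via pytest-asyncio); the test name and file contents are illustrative only:

import pytest

@pytest.mark.asyncio
async def test_write_then_read(router_filesystem):
    # Unpack the concurrency semaphore, the RouterAsyncFS, and the per-scheme base URLs.
    sema, fs, bases = router_filesystem
    url = f"{bases['file']}hello.txt"
    # Write a small payload through the router, then read it back.
    async with await fs.create(url) as f:
        await f.write(b'hello')
    async with await fs.open(url) as f:
        assert await f.read() == b'hello'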
Example 8
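A parameterized filesystem fixture yielding either a full RouterAsyncFS or one of the individual filesystems (LocalAsyncFS, GoogleStorageAsyncFS, S3AsyncFS, AzureAsyncFS), each rooted at a fresh temporary base URL.
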
async def filesystem(
        request) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, str]]:
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        fs: AsyncFS
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file',
                               filesystems=[
                                   LocalAsyncFS(thread_pool),
                                   GoogleStorageAsyncFS(),
                                   S3AsyncFS(thread_pool),
                                   AzureAsyncFS()
                               ])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        elif request.param.endswith('gs'):
            fs = GoogleStorageAsyncFS()
        elif request.param.endswith('s3'):
            fs = S3AsyncFS(thread_pool)
        else:
            assert request.param.endswith('hail-az')
            fs = AzureAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            elif request.param.endswith('gs'):
                bucket = os.environ['HAIL_TEST_GCS_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'
            elif request.param.endswith('s3'):
                bucket = os.environ['HAIL_TEST_S3_BUCKET']
                base = f's3://{bucket}/tmp/{token}/'
            else:
                assert request.param.endswith('hail-az')
                account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
                container = os.environ['HAIL_TEST_AZURE_CONTAINER']
                base = f'hail-az://{account}/{container}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example 9
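An async create factory that resolves the billing project, builds a RouterAsyncFS with default scheme 'file' (wrapped in a blocking RouterFS), and returns a ServiceBackend assembled around an async BatchClient.
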
    async def create(*,
                     billing_project: Optional[str] = None,
                     batch_client: Optional[aiohb.BatchClient] = None,
                     skip_logging_configuration: Optional[bool] = None,
                     disable_progress_bar: bool = True,
                     remote_tmpdir: Optional[str] = None,
                     flags: Optional[Dict[str, str]] = None):
        del skip_logging_configuration

        if billing_project is None:
            billing_project = get_user_config().get('batch',
                                                    'billing_project',
                                                    fallback=None)
        if billing_project is None:
            raise ValueError(
                "No billing project.  Call 'init_service' with the billing "
                "project or run 'hailctl config set batch/billing_project "
                "MY_BILLING_PROJECT'")

        async_fs = RouterAsyncFS('file')
        sync_fs = RouterFS(async_fs)
        if batch_client is None:
            batch_client = await aiohb.BatchClient.create(billing_project)
        bc = hb.BatchClient.from_async(batch_client)
        batch_attributes: Dict[str, str] = dict()
        user_local_reference_cache_dir = Path(get_user_local_cache_dir(),
                                              'references', version())
        os.makedirs(user_local_reference_cache_dir, exist_ok=True)
        remote_tmpdir = get_remote_tmpdir('ServiceBackend',
                                          remote_tmpdir=remote_tmpdir)

        return ServiceBackend(
            billing_project=billing_project,
            sync_fs=sync_fs,
            async_fs=async_fs,
            bc=bc,
            disable_progress_bar=disable_progress_bar,
            batch_attributes=batch_attributes,
            user_local_reference_cache_dir=user_local_reference_cache_dir,
            remote_tmpdir=remote_tmpdir,
            flags=flags or {},
        )
Example 10
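A backend constructor that assembles extra docker run flags (optionally mounting a GSA key file at /gsa-key/key.json) and creates a RouterAsyncFS with default_scheme='file'.
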
    def __init__(self,
                 tmp_dir: str = '/tmp/',
                 gsa_key_file: Optional[str] = None,
                 extra_docker_run_flags: Optional[str] = None):
        self._tmp_dir = tmp_dir.rstrip('/')

        flags = ''

        if extra_docker_run_flags is not None:
            flags += extra_docker_run_flags
        elif os.environ.get('HAIL_BATCH_EXTRA_DOCKER_RUN_FLAGS') is not None:
            flags += os.environ['HAIL_BATCH_EXTRA_DOCKER_RUN_FLAGS']

        if gsa_key_file is None:
            gsa_key_file = os.environ.get('HAIL_BATCH_GSA_KEY_FILE')
        if gsa_key_file is not None:
            flags += f' -v {gsa_key_file}:/gsa-key/key.json'

        self._extra_docker_run_flags = flags
        self.__fs: AsyncFS = RouterAsyncFS(default_scheme='file')
Example 11
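The full BatchPoolExecutor class: submitted functions are pickled with dill, written to cloud storage through the executor's RouterAsyncFS, and executed in a Batch job whose pickled result is written to the executor's outputs directory; shutdown and _cleanup remove the temporary files with fs.rmtree.
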
class BatchPoolExecutor:
    """An executor which executes Python functions in the cloud.

    :class:`.concurrent.futures.ProcessPoolExecutor` and
    :class:`.concurrent.futures.ThreadPoolExecutor` enable the use of all the
    computer cores available on a single computer. :class:`.BatchPoolExecutor`
    enables the use of an effectively arbitrary number of cloud computer cores.

    Functions provided to :meth:`.submit` are serialized using `dill
    <https://dill.readthedocs.io/en/latest/dill.html>`__, sent to a Python
    docker container in the cloud, deserialized, and executed. The results are
    serialized and returned to the machine from which :meth:`.submit` was
    called. The Python version in the docker container will share a major and
    minor version with the local process. The `image` parameter overrides this
    behavior.

    When used as a context manager (the ``with`` syntax), the executor will wait
    for all jobs to finish before finishing the ``with`` statement. This
    behavior can be controlled by the `wait_on_exit` parameter.

    This class creates a folder ``batch-pool-executor`` at the root of the
    remote temporary directory specified by the `backend`. This folder can be
    safely deleted after all jobs have completed.

    Examples
    --------

    Add ``3`` to ``6`` on a machine in the cloud and send the result back to
    this machine:

    >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
    ...     future_nine = bpe.submit(lambda: 3 + 6)
    >>> future_nine.result()  # doctest: +SKIP
    9

    :meth:`.map` facilitates the common case of executing a function on many
    values in parallel:

    >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
    ...     list(bpe.map(lambda x: x * 3, range(4)))
    [0, 3, 6, 9]

    Parameters
    ----------
    name:
        A name for the executor. Executors produce many batches and each batch
        will include this name as a prefix.
    backend:
        Backend used to execute the jobs. Must be a :class:`.ServiceBackend`.
    image:
        The name of a Docker image used for each submitted job. The image must
        include Python 3.7 or later and must have the ``dill`` Python package
        installed. If you intend to use ``numpy``, ensure that OpenBLAS is also
        installed. If unspecified, an image with a matching Python version and
        ``numpy``, ``scipy``, and ``sklearn`` installed is used.
    cpus_per_job:
        The number of CPU cores to allocate to each job. The default value is
        ``1``. The parameter is passed unaltered to :meth:`.Job.cpu`. This
        parameter's value is used to set several environment variables
        instructing BLAS and LAPACK to limit core use.
    wait_on_exit:
        If ``True`` or unspecified, wait for all jobs to complete when exiting a
        context. If ``False``, do not wait. This option has no effect if this
        executor is not used with the ``with`` syntax.
    cleanup_bucket:
        If ``True`` or unspecified, delete all temporary files in the cloud
        storage bucket when this executor fully shuts down. If Python crashes
        before the executor is shut down, the files will not be deleted.
    project:
        If specified, the project to use when authenticating with Google
        Storage. Google Storage is used to transfer serialized values between
        this computer and the cloud machines that execute jobs.
    """
    def __init__(self,
                 *,
                 name: Optional[str] = None,
                 backend: Optional[ServiceBackend] = None,
                 image: Optional[str] = None,
                 cpus_per_job: Optional[Union[int, str]] = None,
                 wait_on_exit: bool = True,
                 cleanup_bucket: bool = True,
                 project: Optional[str] = None):
        self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
        self.backend = backend or ServiceBackend()
        if not isinstance(self.backend, ServiceBackend):
            raise ValueError(
                f'BatchPoolExecutor is not compatible with {type(backend)}')
        self.batches: List[Batch] = []
        self.directory = self.backend.remote_tmpdir + f'batch-pool-executor/{self.name}/'
        self.inputs = self.directory + 'inputs/'
        self.outputs = self.directory + 'outputs/'
        self.fs = RouterAsyncFS('file', gcs_kwargs={'project': project})
        self.futures: List[BatchPoolFuture] = []
        self.finished_future_count = 0
        self._shutdown = False
        version = sys.version_info
        if image is None:
            if version.major != 3 or version.minor not in (6, 7, 8):
                raise ValueError(
                    f'You must specify an image if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})'
                )
            self.image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'
        else:
            self.image = image
        self.cpus_per_job = cpus_per_job
        self.cleanup_bucket = cleanup_bucket
        self.wait_on_exit = wait_on_exit

    def __enter__(self):
        return self

    def map(self,
            fn: Callable,
            *iterables: Iterable[Any],
            timeout: Optional[Union[int, float]] = None,
            chunksize: int = 1):
        """Call `fn` on cloud machines with arguments from `iterables`.

        This function returns a generator which will produce each result in the
        same order as the `iterables`, only blocking if the result is not yet
        ready. You can convert the generator to a list with :class:`.list`.

        Examples
        --------

        Do nothing, but on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(lambda x: x, range(4)))
        [0, 1, 2, 3]

        Call a function with two parameters, on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(lambda x, y: x + y,
        ...                  ["white", "cat", "best"],
        ...                  ["house", "dog", "friend"]))
        ["whitehouse", "catdog", "bestfriend"]

        Generate products of random matrices, on the cloud:

        >>> def random_product(seed):
        ...     np.random.seed(seed)
        ...     w = np.random.rand(1, 100)
        ...     u = np.random.rand(100, 1)
        ...     return float(w @ u)
        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(random_product, range(4)))
        [24.440006386777277, 23.325755364428026, 23.920184804993806, 25.47912882125101]

        Parameters
        ----------
        fn:
            The function to execute.
        iterables:
            The `iterables` are zipped together and each tuple is used as
            arguments to `fn`. See the second example for more detail. It is not
            possible to pass keyword arguments. Each element of `iterables` must
            have the same length.
        timeout:
            This is roughly a timeout on how long we wait on each function
            call. Specifically, each call to the returned generator's
            :meth:`__next__` invokes :meth:`.BatchPoolFuture.result` with this
            `timeout`.
        chunksize:
            The number of tasks to schedule in the same docker container. Docker
            containers take about 5 seconds to start. Ideally, each task should
            take an order of magnitude more time than start-up time. You can
            make the chunksize larger to reduce parallelism but increase the
            amount of meaningful work done per-container.
        """

        agen = async_to_blocking(
            self.async_map(fn, iterables, timeout=timeout,
                           chunksize=chunksize))

        def generator_from_async_generator(aiter):
            try:
                while True:
                    yield async_to_blocking(aiter.__anext__())
            except StopAsyncIteration:
                return

        return generator_from_async_generator(agen.__aiter__())

    async def async_map(self,
                        fn: Callable,
                        iterables: Iterable[Iterable[Any]],
                        timeout: Optional[Union[int, float]] = None,
                        chunksize: int = 1):
        """Aysncio compatible version of :meth:`.map`."""
        if not iterables:
            return iter([])

        if chunksize > 1:
            list_per_argument = [list(x) for x in iterables]
            n = len(list_per_argument[0])
            assert all(n == len(x) for x in list_per_argument)
            n_chunks = (n + chunksize - 1) // chunksize
            iterables_chunks = [
                list(partition(n_chunks, x)) for x in list_per_argument
            ]
            iterables_chunks = [
                chunk for chunk in iterables_chunks if len(chunk) > 0
            ]
            fn = chunk(fn)
            iterables = iterables_chunks

        submit_tasks = [
            asyncio.ensure_future(self.async_submit(fn, *arguments))
            for arguments in zip(*iterables)
        ]
        try:
            bp_futures = [await t for t in submit_tasks]
        except:
            for t in submit_tasks:
                if t.done() and not t.exception():
                    await t.result().async_cancel()
                elif not t.done():
                    t.cancel()
            raise

        async def async_result_or_cancel_all(future):
            try:
                return await future.async_result(timeout=timeout)
            except:
                await asyncio.gather(
                    *[bp_fut.async_cancel() for bp_fut in bp_futures],
                    return_exceptions=True)
                raise

        if chunksize > 1:
            return (val for future in bp_futures
                    for val in await async_result_or_cancel_all(future))

        return (await async_result_or_cancel_all(future)
                for future in bp_futures)

    def submit(self, fn: Callable, *args: Any,
               **kwargs: Any) -> 'BatchPoolFuture':
        """Call `fn` on a cloud machine with all remaining arguments and keyword arguments.

        The function, any objects it references, the arguments, and the keyword
        arguments will be serialized to the cloud machine. Python modules are
        not serialized, so you must ensure any needed Python modules and
        packages are already present in the underlying Docker image. For more
        details, see the `image` argument to :class:`.BatchPoolExecutor`.

        This function does not return the function's output, it returns a
        :class:`.BatchPoolFuture` whose :meth:`.BatchPoolFuture.result` method
        can be used to access the value.

        Examples
        --------

        Do nothing, but on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(lambda x: x, 4)
        ...     future.result()
        4

        Call a function with two arguments and one keyword argument, on the
        cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(lambda x, y, z: x + y + z,
        ...                         "poly", "ethyl", z="ene")
        ...     future.result()
        "polyethylene"

        Generate a product of two random matrices, on the cloud:

        >>> def random_product(seed):
        ...     np.random.seed(seed)
        ...     w = np.random.rand(1, 100)
        ...     u = np.random.rand(100, 1)
        ...     return float(w @ u)
        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(random_product, 1)
        ...     future.result()
        23.325755364428026

        Parameters
        ----------
        fn:
            The function to execute.
        args:
            Arguments for the function.
        kwargs:
            Keyword arguments for the function.
        """
        return async_to_blocking(self.async_submit(fn, *args, **kwargs))

    async def async_submit(self, unapplied: Callable, *args: Any,
                           **kwargs: Any) -> 'BatchPoolFuture':
        """Aysncio compatible version of :meth:`BatchPoolExecutor.submit`."""
        if self._shutdown:
            raise RuntimeError('BatchPoolExecutor has already been shutdown.')

        try:
            name = unapplied.__name__
        except AttributeError:
            name = '<anonymous>'
        name = f'{name}-{secret_alnum_string(4)}'
        batch = Batch(name=self.name + '-' + name,
                      backend=self.backend,
                      default_image=self.image)
        self.batches.append(batch)
        j = batch.new_job(name)

        pipe = BytesIO()
        dill.dump(functools.partial(unapplied, *args, **kwargs),
                  pipe,
                  recurse=True)
        pipe.seek(0)
        pickledfun_remote = self.inputs + f'{name}/pickledfun'
        await self.fs.write(pickledfun_remote, pipe.getvalue())
        pickledfun_local = batch.read_input(pickledfun_remote)

        thread_limit = "1"
        if self.cpus_per_job:
            j.cpu(self.cpus_per_job)
            thread_limit = str(
                int(max(1.0, cpu_spec_to_float(self.cpus_per_job))))
        j.env("OMP_NUM_THREADS", thread_limit)
        j.env("OPENBLAS_NUM_THREADS", thread_limit)
        j.env("MKL_NUM_THREADS", thread_limit)
        j.env("VECLIB_MAXIMUM_THREADS", thread_limit)
        j.env("NUMEXPR_NUM_THREADS", thread_limit)

        j.command('set -ex')
        j.command(f'''python3 -c "
import base64
import dill
import traceback
with open(\\"{j.ofile}\\", \\"wb\\") as out:
    try:
        with open(\\"{pickledfun_local}\\", \\"rb\\") as f:
            dill.dump((dill.load(f)(), None), out, recurse=True)
    except Exception as e:
        print(\\"BatchPoolExecutor encountered an exception:\\")
        traceback.print_exc()
        dill.dump((e, traceback.format_exception(type(e), e, e.__traceback__)), out, recurse=True)
"''')
        output_gcs = self.outputs + f'{name}/output'
        batch.write_output(j.ofile, output_gcs)
        backend_batch = batch.run(wait=False,
                                  disable_progress_bar=True)._async_batch
        try:
            return BatchPoolFuture(
                self, backend_batch,
                low_level_batch_client.Job.submitted_job(backend_batch, 1),
                output_gcs)
        except:
            await backend_batch.cancel()
            raise

    def __exit__(self, exc_type: Optional[Type[BaseException]],
                 exc_value: Optional[BaseException],
                 traceback: Optional[TracebackType]):
        self.shutdown(wait=self.wait_on_exit)

    def _add_future(self, f):
        self.futures.append(f)

    def _finish_future(self):
        self.finished_future_count += 1
        if self._shutdown and self.finished_future_count == len(self.futures):
            self._cleanup()

    def shutdown(self, wait: bool = True):
        """Allow temporary resources to be cleaned up.

        Until shutdown is called, some temporary cloud storage files will
        persist. After shutdown has been called *and* all outstanding jobs have
        completed, these files will be deleted.

        Parameters
        ----------
        wait:
            If true, wait for all jobs to complete before returning from this
            method.
        """
        if wait:

            async def ignore_exceptions(f):
                try:
                    await f.async_result()
                except Exception:
                    pass

            async_to_blocking(
                asyncio.gather(*[ignore_exceptions(f) for f in self.futures]))
        if self.finished_future_count == len(self.futures):
            self._cleanup()
        self._shutdown = True

    def _cleanup(self):
        if self.cleanup_bucket:
            async_to_blocking(self.fs.rmtree(None, self.directory))
        async_to_blocking(self.fs.close())
        self.backend.close()
Example 12
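A ServiceBackend constructor built around a BatchClient: it resolves remote_tmpdir (honoring the deprecated bucket argument), validates the storage scheme, and creates a RouterAsyncFS with default_scheme='file' and the Google project passed via gcs_kwargs; a usage sketch follows the code.
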
    def __init__(self,
                 *args,
                 billing_project: Optional[str] = None,
                 bucket: Optional[str] = None,
                 remote_tmpdir: Optional[str] = None,
                 google_project: Optional[str] = None,
                 token: Optional[str] = None):
        if len(args) > 2:
            raise TypeError(
                f'ServiceBackend() takes 2 positional arguments but {len(args)} were given'
            )
        if len(args) >= 1:
            if billing_project is not None:
                raise TypeError(
                    'ServiceBackend() got multiple values for argument \'billing_project\''
                )
            warnings.warn(
                'Use of deprecated positional argument \'billing_project\' in ServiceBackend(). Specify \'billing_project\' as a keyword argument instead.'
            )
            billing_project = args[0]
        if len(args) >= 2:
            if bucket is not None:
                raise TypeError(
                    'ServiceBackend() got multiple values for argument \'bucket\''
                )
            warnings.warn(
                'Use of deprecated positional argument \'bucket\' in ServiceBackend(). Specify \'bucket\' as a keyword argument instead.'
            )
            bucket = args[1]

        if billing_project is None:
            billing_project = get_user_config().get('batch',
                                                    'billing_project',
                                                    fallback=None)
        if billing_project is None:
            raise ValueError(
                'the billing_project parameter of ServiceBackend must be set '
                'or run `hailctl config set batch/billing_project '
                'MY_BILLING_PROJECT`')
        self._batch_client = BatchClient(billing_project, _token=token)

        user_config = get_user_config()

        if bucket is not None:
            warnings.warn(
                'Use of deprecated argument \'bucket\' in ServiceBackend(). Specify \'remote_tmpdir\' as a keyword argument instead.'
            )

        if remote_tmpdir is not None and bucket is not None:
            raise ValueError(
                'Cannot specify both \'remote_tmpdir\' and \'bucket\' in ServiceBackend(). Specify \'remote_tmpdir\' as a keyword argument instead.'
            )

        if bucket is None and remote_tmpdir is None:
            remote_tmpdir = user_config.get('batch',
                                            'remote_tmpdir',
                                            fallback=None)

        if remote_tmpdir is None:
            if bucket is None:
                bucket = user_config.get('batch', 'bucket', fallback=None)
                warnings.warn(
                    'Using deprecated configuration setting \'batch/bucket\'. Run `hailctl config set batch/remote_tmpdir` '
                    'to set the default for \'remote_tmpdir\' instead.')
            if bucket is None:
                raise ValueError(
                    'The \'remote_tmpdir\' parameter of ServiceBackend must be set. '
                    'Run `hailctl config set batch/remote_tmpdir REMOTE_TMPDIR`'
                )
            if 'gs://' in bucket:
                raise ValueError(
                    'The bucket parameter to ServiceBackend() should be a bucket name, not a path. '
                    'Use the remote_tmpdir parameter to specify a path.')
            remote_tmpdir = f'gs://{bucket}/batch'
        else:
            schemes = {'gs', 'hail-az'}
            found_scheme = any(
                remote_tmpdir.startswith(f'{scheme}://') for scheme in schemes)
            if not found_scheme:
                raise ValueError(
                    f'remote_tmpdir must be a storage uri path like gs://bucket/folder. Possible schemes include {schemes}'
                )
        if remote_tmpdir[-1] != '/':
            remote_tmpdir += '/'
        self.remote_tmpdir = remote_tmpdir

        gcs_kwargs = {'project': google_project}
        self.__fs: AsyncFS = RouterAsyncFS(default_scheme='file',
                                           gcs_kwargs=gcs_kwargs)
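
A minimal usage sketch based on the constructor above; the billing project and bucket names are placeholders:

# Preferred form: keyword arguments with a full storage URI for remote_tmpdir
# (the constructor requires a gs:// or hail-az:// scheme and appends a trailing '/').
backend = ServiceBackend(billing_project='my-billing-project',
                         remote_tmpdir='gs://my-bucket/batch-tmp')

# Deprecated form: a bare bucket name, which the constructor expands to
# f'gs://{bucket}/batch' and accompanies with a deprecation warning.
legacy_backend = ServiceBackend(billing_project='my-billing-project',
                                bucket='my-bucket')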
Example 13
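parallel_file_exists_async checks many paths concurrently with bounded_gather over a RouterAsyncFS that routes between local files and Google Cloud Storage; a usage sketch follows the code.
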
async def parallel_file_exists_async(fpaths: List[str],
                                     parallelism: int = 750
                                     ) -> Dict[str, bool]:
    """
    Check whether a large number of files exist.

    Created for use with Hail Batch jobs.
    The normal `file_exists` function is very slow when checking a large number of files.

    :param fpaths: List of file paths to check. Files can be in local or Google cloud storage.
    :param parallelism: Integer that sets parallelism of file existence checking task. Default is 750.
    :return: Dictionary of file paths (str) and whether the file exists (boolean).
    """
    async def async_file_exists(fs: AsyncFS, fpath: str) -> bool:
        """
        Determine file existence.

        :param fs: AsyncFS object.
        :param fpath: Path to file to check.
        :return: Whether file exists.
        """
        fext = os.path.splitext(fpath)[1]
        if fext in [".ht", ".mt"]:
            fpath += "/_SUCCESS"
        try:
            await fs.statfile(fpath)
        except FileNotFoundError:
            return False
        else:
            return True

    with tqdm(total=len(fpaths),
              desc="check files for existence",
              disable=False) as pbar:
        with ThreadPoolExecutor() as thread_pool:
            async with RouterAsyncFS("file",
                                     filesystems=[
                                         LocalAsyncFS(thread_pool),
                                         GoogleStorageAsyncFS()
                                     ]) as fs:

                def check_existence_and_update_pbar_thunk(
                        fpath: str) -> Callable:
                    """
                    Create function to check if file exists and update progress bar in stdout.

                    Function delays coroutine creation to avoid creating too many live coroutines.

                    :param fpath: Path to file to check.
                    :return: Function that checks for file existence and updates progress bar.
                    """
                    async def unapplied_function():
                        x = await async_file_exists(fs, fpath)
                        pbar.update(1)
                        return x

                    return unapplied_function

                file_existence_checks = [
                    check_existence_and_update_pbar_thunk(fpath)
                    for fpath in fpaths
                ]
                file_existence = await bounded_gather(*file_existence_checks,
                                                      parallelism=parallelism)
    return dict(zip(fpaths, file_existence))
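
A minimal usage sketch, assuming the paths below are placeholders; as described in the docstring, .ht and .mt paths are checked via their _SUCCESS marker:

import asyncio

paths = [
    'gs://my-bucket/data/table.ht',      # checked as gs://my-bucket/data/table.ht/_SUCCESS
    'gs://my-bucket/data/samples.txt',
    '/tmp/local-file.txt',
]
exists = asyncio.run(parallel_file_exists_async(paths, parallelism=100))
for path, found in exists.items():
    print(path, found)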