Example #1
def _fs(self) -> AsyncFS:
    if self._DEPRECATED_project is not None:
        if self._DEPRECATED_fs is None:
            self._DEPRECATED_fs = RouterAsyncFS('file', [
                LocalAsyncFS(ThreadPoolExecutor()),
                GoogleStorageAsyncFS(project=self._DEPRECATED_project)
            ])
        return self._DEPRECATED_fs
    return self._backend._fs
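
The property above builds the deprecated RouterAsyncFS lazily and caches it, falling back to the backend-owned filesystem otherwise. A minimal, self-contained sketch of that lazy-caching pattern, with the Hail-specific classes replaced by a hypothetical Expensive type:

from typing import Optional

class Expensive:
    """Stands in for RouterAsyncFS: costly to construct, so build on demand."""

class Backend:
    def __init__(self) -> None:
        self.fs = Expensive()                   # the normal, backend-owned instance

class Owner:
    def __init__(self, backend: Backend, legacy_project: Optional[str] = None) -> None:
        self._backend = backend
        self._legacy_project = legacy_project   # analogous to _DEPRECATED_project
        self._cached: Optional[Expensive] = None

    @property
    def fs(self) -> Expensive:
        if self._legacy_project is not None:
            if self._cached is None:
                self._cached = Expensive()      # built at most once, only if needed
            return self._cached
        return self._backend.fs                 # normal path: delegate to the backend
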
Example #2
async def filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS(
                'file', [LocalAsyncFS(thread_pool),
                         GoogleStorageAsyncFS()])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        else:
            fs = GoogleStorageAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            else:
                assert request.param.endswith('gs')
                bucket = os.environ['HAIL_TEST_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
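
The excerpt omits how a test consumes the (sema, fs, base) tuple the fixture yields, and the @pytest.fixture(params=...) registration that supplies request.param. A hedged sketch, assuming pytest-asyncio drives the async tests; the only filesystem calls used are ones already present in the example (isdir, create):

import pytest

@pytest.mark.asyncio
async def test_base_directory_is_created(filesystem):
    sema, fs, base = filesystem
    # every parametrization yields a freshly created base directory
    assert await fs.isdir(base)

@pytest.mark.asyncio
async def test_can_create_a_file(filesystem):
    sema, fs, base = filesystem
    # touching a file is enough; the fixture removes the whole tree afterwards
    async with await fs.create(f'{base}touched'):
        pass
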
Example #3
async def router_filesystem(request):
    token = secrets.token_hex(16)

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS(
                'file', [LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()]) as fs:
            file_base = f'/tmp/{token}/'
            await fs.mkdir(file_base)

            bucket = os.environ['HAIL_TEST_BUCKET']
            gs_base = f'gs://{bucket}/tmp/{token}/'

            bases = {
                'file': file_base,
                'gs': gs_base
            }

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, bases)
                await bounded_gather2(sema,
                                      fs.rmtree(sema, file_base),
                                      fs.rmtree(sema, gs_base))

            assert not await fs.isdir(file_base)
            assert not await fs.isdir(gs_base)
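
This variant yields a dict of scheme-specific base paths instead of a single one. A consuming test might select a base by scheme; again a sketch under the assumption that pytest-asyncio runs the async tests:

import pytest

@pytest.mark.asyncio
@pytest.mark.parametrize('scheme', ['file', 'gs'])
async def test_scheme_base_exists(router_filesystem, scheme):
    sema, fs, bases = router_filesystem
    # the RouterAsyncFS dispatches on the URL scheme, so one fs handles both bases
    assert await fs.isdir(bases[scheme])
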
Example #4
async def copy_test_specs():
    test_specs = []

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS('file', [LocalAsyncFS(thread_pool)]) as fs:
            for config in copy_test_configurations():
                token = secrets.token_hex(16)

                base = f'/tmp/{token}/'
                src_base = f'{base}src/'
                dest_base = f'{base}dest/'

                await fs.mkdir(base)
                await fs.mkdir(src_base)
                await fs.mkdir(dest_base)
                # make sure dest_base exists
                async with await fs.create(f'{dest_base}keep'):
                    pass

                sema = asyncio.Semaphore(50)
                async with sema:
                    result = await run_test_spec(sema, fs, config, src_base,
                                                 dest_base)
                    config['result'] = result

                    test_specs.append(config)

                    await fs.rmtree(sema, base)
                    assert not await fs.isdir(base)

    return test_specs
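
copy_test_specs returns each configuration annotated with its result. If the surrounding harness wants the specs persisted, a small driver could serialize them; the output path is hypothetical and JSON-serializability of the results is an assumption:

import asyncio
import json

def write_copy_test_specs(path: str = 'copy_test_specs.json') -> None:
    # run the coroutine above to completion and dump the annotated configs
    specs = asyncio.run(copy_test_specs())
    with open(path, 'w') as f:
        json.dump(specs, f, indent=2)
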
Example #5
async def router_filesystem(request):
    token = secrets.token_hex(16)

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS('file', [
                LocalAsyncFS(thread_pool),
                GoogleStorageAsyncFS(),
                S3AsyncFS(thread_pool),
                AzureAsyncFS()
        ]) as fs:
            file_base = f'/tmp/{token}/'
            await fs.mkdir(file_base)

            gs_bucket = os.environ['HAIL_TEST_GCS_BUCKET']
            gs_base = f'gs://{gs_bucket}/tmp/{token}/'

            s3_bucket = os.environ['HAIL_TEST_S3_BUCKET']
            s3_base = f's3://{s3_bucket}/tmp/{token}/'

            azure_account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
            azure_container = os.environ['HAIL_TEST_AZURE_CONTAINER']
            azure_base = f'hail-az://{azure_account}/{azure_container}/tmp/{token}/'

            bases = {
                'file': file_base,
                'gs': gs_base,
                's3': s3_base,
                'hail-az': azure_base
            }

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, bases)
                await bounded_gather2(
                    sema, functools.partial(fs.rmtree, sema, file_base),
                    functools.partial(fs.rmtree, sema, gs_base),
                    functools.partial(fs.rmtree, sema, s3_base),
                    functools.partial(fs.rmtree, sema, azure_base))

            assert not await fs.isdir(file_base)
            assert not await fs.isdir(gs_base)
            assert not await fs.isdir(s3_base)
            assert not await fs.isdir(azure_base)
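
Unlike Example #3, the cleanup here wraps each rmtree call in functools.partial, so the coroutines are only created once the gather helper runs them under the semaphore. A minimal stand-in for that pattern using plain asyncio (not hailtop's bounded_gather2 implementation):

import asyncio
import functools

async def gather_bounded(sema: asyncio.Semaphore, *thunks):
    async def run_one(thunk):
        async with sema:
            return await thunk()          # the coroutine is created only here

    return await asyncio.gather(*(run_one(t) for t in thunks))

async def demo():
    sema = asyncio.Semaphore(2)

    async def work(i):
        await asyncio.sleep(0)
        return i

    # pass thunks, not live coroutines: nothing starts until gather_bounded runs them
    return await gather_bounded(sema, *(functools.partial(work, i) for i in range(5)))

assert asyncio.run(demo()) == [0, 1, 2, 3, 4]
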
Example #6
async def filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file', [
                LocalAsyncFS(thread_pool),
                GoogleStorageAsyncFS(),
                S3AsyncFS(thread_pool),
                AzureAsyncFS()
            ])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        elif request.param.endswith('gs'):
            fs = GoogleStorageAsyncFS()
        elif request.param.endswith('s3'):
            fs = S3AsyncFS(thread_pool)
        else:
            assert request.param.endswith('hail-az')
            fs = AzureAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            elif request.param.endswith('gs'):
                bucket = os.environ['HAIL_TEST_GCS_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'
            elif request.param.endswith('s3'):
                bucket = os.environ['HAIL_TEST_S3_BUCKET']
                base = f's3://{bucket}/tmp/{token}/'
            else:
                assert request.param.endswith('hail-az')
                account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
                container = os.environ['HAIL_TEST_AZURE_CONTAINER']
                base = f'hail-az://{account}/{container}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
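
The cloud branches depend on several environment variables. A consuming test (or the fixture itself) could skip cleanly when they are absent; pytest is an assumption, and the variable names come from the example above:

import os
import pytest

REQUIRED_ENV = {
    'gs': ['HAIL_TEST_GCS_BUCKET'],
    's3': ['HAIL_TEST_S3_BUCKET'],
    'hail-az': ['HAIL_TEST_AZURE_ACCOUNT', 'HAIL_TEST_AZURE_CONTAINER'],
}

def skip_if_unconfigured(param: str) -> None:
    # e.g. skip_if_unconfigured(request.param) at the top of the fixture
    for scheme, names in REQUIRED_ENV.items():
        if param.endswith(scheme):
            missing = [name for name in names if name not in os.environ]
            if missing:
                pytest.skip(f'missing environment variables: {missing}')
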
Example #7
    def __init__(self,
                 *args,
                 billing_project: Optional[str] = None,
                 bucket: Optional[str] = None,
                 remote_tmpdir: Optional[str] = None,
                 google_project: Optional[str] = None,
                 token: Optional[str] = None):
        if len(args) > 2:
            raise TypeError(
                f'ServiceBackend() takes 2 positional arguments but {len(args)} were given'
            )
        if len(args) >= 1:
            if billing_project is not None:
                raise TypeError(
                    'ServiceBackend() got multiple values for argument \'billing_project\''
                )
            warnings.warn(
                'Use of deprecated positional argument \'billing_project\' in ServiceBackend(). Specify \'billing_project\' as a keyword argument instead.'
            )
            billing_project = args[0]
        if len(args) >= 2:
            if bucket is not None:
                raise TypeError(
                    'ServiceBackend() got multiple values for argument \'bucket\''
                )
            warnings.warn(
                'Use of deprecated positional argument \'bucket\' in ServiceBackend(). Specify \'bucket\' as a keyword argument instead.'
            )
            bucket = args[1]

        if remote_tmpdir is not None and bucket is not None:
            raise ValueError(
                'Cannot specify both remote_tmpdir and bucket in ServiceBackend()'
            )

        if billing_project is None:
            billing_project = get_user_config().get('batch',
                                                    'billing_project',
                                                    fallback=None)
        if billing_project is None:
            raise ValueError(
                'the billing_project parameter of ServiceBackend must be set '
                'or run `hailctl config set batch/billing_project '
                'MY_BILLING_PROJECT`')
        self._batch_client = BatchClient(billing_project, _token=token)
        self.__fs: AsyncFS = RouterAsyncFS('file', [
            LocalAsyncFS(ThreadPoolExecutor()),
            GoogleStorageAsyncFS(project=google_project)
        ])
        if remote_tmpdir is None:
            if bucket is None:
                bucket = get_user_config().get('batch',
                                               'bucket',
                                               fallback=None)
            if bucket is None:
                raise ValueError(
                    'either the bucket or remote_tmpdir parameter of ServiceBackend '
                    'must be set or run `hailctl config set batch/bucket MY_BUCKET`'
                )
            if 'gs://' in bucket:
                raise ValueError(
                    'The bucket parameter to ServiceBackend() should be a bucket name, not a path. '
                    'Use the remote_tmpdir parameter to specify a path.')
            remote_tmpdir = f'gs://{bucket}/batch'
        else:
            if not remote_tmpdir.startswith('gs://'):
                raise ValueError(
                    'remote_tmpdir must be a google storage path like gs://bucket/folder'
                )
        if remote_tmpdir[-1] != '/':
            remote_tmpdir += '/'
        self.remote_tmpdir = remote_tmpdir
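
Grounded in the argument handling above: billing_project and bucket are deprecated as positional arguments, bucket and remote_tmpdir are mutually exclusive, and remote_tmpdir must be a gs:// path. A keyword-only construction might look like this (the project and bucket names are placeholders, and the import path is assumed to be hailtop.batch):

from hailtop.batch import ServiceBackend

backend = ServiceBackend(
    billing_project='my-billing-project',      # or: hailctl config set batch/billing_project ...
    remote_tmpdir='gs://my-bucket/batch-tmp',  # instead of bucket='my-bucket'
)
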
Example #8
class Batch:
    """
    Object representing the distributed acyclic graph (DAG) of jobs to run.

    Examples
    --------
    Create a batch object:

    >>> p = Batch()

    Create a new job that prints "hello":

    >>> t = p.new_job()
    >>> t.command(f'echo "hello" ')

    Execute the DAG:

    >>> p.run()

    Notes
    -----

    The methods :meth:`.Batch.read_input` and :meth:`.Batch.read_input_group`
    are for adding input files to a batch. An input file is a file that already
    exists before executing a batch and is not present in the docker container
    the job is being run in.

    Files generated by executing a job are temporary files and must be written
    to a permanent location using the method :meth:`.Batch.write_output`.

    Parameters
    ----------
    name:
        Name of the batch.
    backend:
        Backend used to execute the jobs. Default is :class:`.LocalBackend`.
    attributes:
        Key-value pairs of additional attributes. 'name' is not a valid keyword.
        Use the name argument instead.
    requester_pays_project:
        The name of the Google project to be billed when accessing requester pays buckets.
    default_image:
        Default docker image to use for Bash jobs. This must be the full name of the
        image including any repository prefix and tags if desired (default tag is `latest`).
    default_memory:
        Memory setting to use by default if not specified by a job. Only
        applicable if a docker image is specified for the :class:`.LocalBackend`
        or the :class:`.ServiceBackend`. See :meth:`.Job.memory`.
    default_cpu:
        CPU setting to use by default if not specified by a job. Only
        applicable if a docker image is specified for the :class:`.LocalBackend`
        or the :class:`.ServiceBackend`. See :meth:`.Job.cpu`.
    default_storage:
        Storage setting to use by default if not specified by a job. Only
        applicable for the :class:`.ServiceBackend`. See :meth:`.Job.storage`.
    default_timeout:
        Maximum time in seconds for a job to run before being killed. Only
        applicable for the :class:`.ServiceBackend`. If `None`, there is no
        timeout.
    default_python_image:
        Default image to use for all Python jobs. This must be the full name of the
        image including any repository prefix and tags if desired (default tag is `latest`).
        The image must have the `dill` Python package installed and have the same version of
        Python installed that is currently running. If `None`, a compatible Python image with
        `dill` pre-installed will automatically be used if the current Python version is
        3.6, 3.7, or 3.8.
    project:
        DEPRECATED: please specify `google_project` on the ServiceBackend instead. If specified,
        the project to use when authenticating with Google Storage. Google Storage is used to
        transfer serialized values between this computer and the cloud machines that execute Python
        jobs.
    cancel_after_n_failures:
        Automatically cancel the batch after N failures have occurred. By default,
        there is no limit on the number of failures. Only applicable for the
        :class:`.ServiceBackend`. Must be greater than 0.

    """

    _counter = 0
    _uid_prefix = "__BATCH__"
    _regex_pattern = r"(?P<BATCH>{}\d+)".format(_uid_prefix)

    @classmethod
    def _get_uid(cls):
        uid = "{}{}".format(cls._uid_prefix, cls._counter)
        cls._counter += 1
        return uid

    def __init__(self,
                 name: Optional[str] = None,
                 backend: Optional[_backend.Backend] = None,
                 attributes: Optional[Dict[str, str]] = None,
                 requester_pays_project: Optional[str] = None,
                 default_image: Optional[str] = None,
                 default_memory: Optional[Union[int, str]] = None,
                 default_cpu: Optional[Union[float, int, str]] = None,
                 default_storage: Optional[Union[int, str]] = None,
                 default_timeout: Optional[Union[float, int]] = None,
                 default_shell: Optional[str] = None,
                 default_python_image: Optional[str] = None,
                 project: Optional[str] = None,
                 cancel_after_n_failures: Optional[int] = None):
        self._jobs: List[job.Job] = []
        self._resource_map: Dict[str, _resource.Resource] = {}
        self._allocated_files: Set[str] = set()
        self._input_resources: Set[_resource.InputResourceFile] = set()
        self._uid = Batch._get_uid()
        self._job_tokens: Set[str] = set()

        self._backend = backend if backend else _backend.LocalBackend()

        self.name = name

        if attributes is None:
            attributes = {}
        if 'name' in attributes:
            raise BatchException(
                "'name' is not a valid attribute. Use the name argument instead."
            )
        self.attributes = attributes

        self.requester_pays_project = requester_pays_project

        self._default_image = default_image
        self._default_memory = default_memory
        self._default_cpu = default_cpu
        self._default_storage = default_storage
        self._default_timeout = default_timeout
        self._default_shell = default_shell
        self._default_python_image = default_python_image

        if project is not None:
            warnings.warn(
                'The project argument to Batch is deprecated, please instead use the google_project argument to '
                'ServiceBackend. Use of this argument may trigger warnings from aiohttp about unclosed objects.'
            )
        self._DEPRECATED_project = project
        self._DEPRECATED_fs: Optional[RouterAsyncFS] = None

        self._cancel_after_n_failures = cancel_after_n_failures

    def _unique_job_token(self, n=5):
        token = secret_alnum_string(n)
        while token in self._job_tokens:
            token = secret_alnum_string(n)
        return token

    @property
    def _fs(self) -> AsyncFS:
        if self._DEPRECATED_project is not None:
            if self._DEPRECATED_fs is None:
                self._DEPRECATED_fs = RouterAsyncFS('file', [
                    LocalAsyncFS(ThreadPoolExecutor()),
                    GoogleStorageAsyncFS(project=self._DEPRECATED_project)
                ])
            return self._DEPRECATED_fs
        return self._backend._fs

    def new_job(self,
                name: Optional[str] = None,
                attributes: Optional[Dict[str, str]] = None,
                shell: Optional[str] = None) -> job.BashJob:
        """
        Alias for :meth:`.Batch.new_bash_job`
        """

        return self.new_bash_job(name, attributes, shell)

    def new_bash_job(self,
                     name: Optional[str] = None,
                     attributes: Optional[Dict[str, str]] = None,
                     shell: Optional[str] = None) -> job.BashJob:
        """
        Initialize a :class:`.BashJob` object with default memory, storage,
        image, and CPU settings (defined in :class:`.Batch`) upon batch creation.

        Examples
        --------
        Create and execute a batch `b` with one job `j` that prints "hello world":

        >>> b = Batch()
        >>> j = b.new_bash_job(name='hello', attributes={'language': 'english'})
        >>> j.command('echo "hello world"')
        >>> b.run()

        Parameters
        ----------
        name:
            Name of the job.
        attributes:
            Key-value pairs of additional attributes. 'name' is not a valid keyword.
            Use the name argument instead.
        """

        if attributes is None:
            attributes = {}

        if shell is None:
            shell = self._default_shell

        token = self._unique_job_token()
        j = job.BashJob(batch=self,
                        token=token,
                        name=name,
                        attributes=attributes,
                        shell=shell)

        if self._default_image is not None:
            j.image(self._default_image)
        if self._default_memory is not None:
            j.memory(self._default_memory)
        if self._default_cpu is not None:
            j.cpu(self._default_cpu)
        if self._default_storage is not None:
            j.storage(self._default_storage)
        if self._default_timeout is not None:
            j.timeout(self._default_timeout)

        self._jobs.append(j)
        return j

    def new_python_job(
            self,
            name: Optional[str] = None,
            attributes: Optional[Dict[str, str]] = None) -> job.PythonJob:
        """
        Initialize a new :class:`.PythonJob` object with default
        Python image, memory, storage, and CPU settings (defined in :class:`.Batch`)
        upon batch creation.

        Examples
        --------
        Create and execute a batch `b` with one job `j` that prints "hello alice":

        .. code-block:: python

            b = Batch(default_python_image='gcr.io/hail-vdc/python-dill:3.7-slim')

            def hello(name):
                return f'hello {name}'

            j = b.new_python_job()
            output = j.call(hello, 'alice')

            # Write out the str representation of result to a file

            b.write_output(output.as_str(), 'hello.txt')

            b.run()

        Notes
        -----

        The image to use for Python jobs can be specified by `default_python_image`
        when constructing a :class:`.Batch`. The image specified must have the `dill`
        package installed. If ``default_python_image`` is not specified, then a Docker
        image will automatically be created for you with the base image
        `hailgenetics/python-dill:[major_version].[minor_version]-slim` and the Python
        packages specified by ``python_requirements`` will be installed. The default name
        of the image is `batch-python` with a random string for the tag unless ``python_build_image_name``
        is specified. If the :class:`.ServiceBackend` is the backend, the locally built
        image will be pushed to the repository specified by ``image_repository``.

        Parameters
        ----------
        name:
            Name of the job.
        attributes:
            Key-value pairs of additional attributes. 'name' is not a valid keyword.
            Use the name argument instead.
        """
        if attributes is None:
            attributes = {}

        token = self._unique_job_token()
        j = job.PythonJob(batch=self,
                          token=token,
                          name=name,
                          attributes=attributes)

        if self._default_python_image is not None:
            j.image(self._default_python_image)
        if self._default_memory is not None:
            j.memory(self._default_memory)
        if self._default_cpu is not None:
            j.cpu(self._default_cpu)
        if self._default_storage is not None:
            j.storage(self._default_storage)
        if self._default_timeout is not None:
            j.timeout(self._default_timeout)

        self._jobs.append(j)
        return j

    def _new_job_resource_file(self, source, value=None):
        if value is None:
            value = secret_alnum_string(5)
        jrf = _resource.JobResourceFile(value, source)
        self._resource_map[jrf._uid] = jrf  # pylint: disable=no-member
        return jrf

    def _new_input_resource_file(self, input_path, value=None):
        if value is None:
            value = f'{secret_alnum_string(5)}/{os.path.basename(input_path.rstrip("/"))}'
        irf = _resource.InputResourceFile(value)
        irf._add_input_path(input_path)
        self._resource_map[irf._uid] = irf  # pylint: disable=no-member
        self._input_resources.add(irf)
        return irf

    def _new_resource_group(self, source, mappings, root=None):
        assert isinstance(mappings, dict)
        if root is None:
            root = secret_alnum_string(5)
        d = {}
        new_resource_map = {}
        for name, code in mappings.items():
            if not isinstance(code, str):
                raise BatchException(
                    f"value for name '{name}' is not a string. Found '{type(code)}' instead."
                )
            r = self._new_job_resource_file(source=source,
                                            value=eval(f'f"""{code}"""'))  # pylint: disable=W0123
            d[name] = r
            new_resource_map[r._uid] = r  # pylint: disable=no-member

        self._resource_map.update(new_resource_map)
        rg = _resource.ResourceGroup(source, root, **d)
        self._resource_map.update({rg._uid: rg})
        return rg

    def _new_python_result(self, source, value=None) -> _resource.PythonResult:
        if value is None:
            value = secret_alnum_string(5)
        jrf = _resource.PythonResult(value, source)
        self._resource_map[jrf._uid] = jrf  # pylint: disable=no-member
        return jrf

    def read_input(self, path: str) -> _resource.InputResourceFile:
        """
        Create a new input resource file object representing a single file.

        .. warning::

            To avoid expensive egress charges, input files should be located in buckets
            that are multi-regional in the United States because Batch runs jobs in any
            US region.

        Examples
        --------

        Read the file `hello.txt`:

        >>> b = Batch()
        >>> input = b.read_input('data/hello.txt')
        >>> j = b.new_job()
        >>> j.command(f'cat {input}')
        >>> b.run()

        Parameters
        ----------
        path: :obj:`str`
            File path to read.
        """

        irf = self._new_input_resource_file(path)
        return irf

    def read_input_group(self, **kwargs: str) -> _resource.ResourceGroup:
        """Create a new resource group representing a mapping of identifier to
        input resource files.

        .. warning::

            To avoid expensive egress charges, input files should be located in buckets
            that are multi-regional in the United States because Batch runs jobs in any
            US region.

        Examples
        --------

        Read a binary PLINK file:

        >>> b = Batch()
        >>> bfile = b.read_input_group(bed="data/example.bed",
        ...                            bim="data/example.bim",
        ...                            fam="data/example.fam")
        >>> j = b.new_job()
        >>> j.command(f"plink --bfile {bfile} --geno --make-bed --out {j.geno}")
        >>> j.command(f"wc -l {bfile.fam}")
        >>> j.command(f"wc -l {bfile.bim}")
        >>> b.run() # doctest: +SKIP

        Read a FASTA file and its index (file extensions matter!):

        >>> fasta = b.read_input_group(**{'fasta': 'data/example.fasta',
        ...                               'fasta.idx': 'data/example.fasta.idx'})

        Create a resource group where the identifiers don't match the file extensions:

        >>> rg = b.read_input_group(foo='data/foo.txt',
        ...                         bar='data/bar.txt')

        `rg.foo` and `rg.bar` will not have the `.txt` file extension and
        instead will be `{root}.foo` and `{root}.bar` where `{root}` is a random
        identifier.

        Notes
        -----
        The identifier is used to refer to a specific resource file. For example,
        given the resource group `rg`, you can use the attribute notation
        `rg.identifier` or the get item notation `rg[identifier]`.

        The file extensions for each file are derived from the identifier.  This
        is equivalent to `"{root}.identifier"` from
        :meth:`.BashJob.declare_resource_group`. We are planning on adding
        flexibility to incorporate more complicated extensions in the future
        such as `.vcf.bgz`.  For now, use :meth:`.JobResourceFile.add_extension`
        to add an extension to a resource file.

        Parameters
        ----------
        kwargs:
            Keyword arguments where the name/key is the identifier and the value
            is the file path.
        """

        root = secret_alnum_string(5)
        new_resources = {
            name: self._new_input_resource_file(
                file, value=f'{root}/{os.path.basename(file.rstrip("/"))}')
            for name, file in kwargs.items()
        }
        rg = _resource.ResourceGroup(None, root, **new_resources)
        self._resource_map.update({rg._uid: rg})
        return rg

    def write_output(self, resource: _resource.Resource, dest: str):  # pylint: disable=R0201
        """
        Write resource file or resource file group to an output destination.

        Examples
        --------

        Write a single job intermediate to a permanent location:

        >>> b = Batch()
        >>> j = b.new_job()
        >>> j.command(f'echo "hello" > {j.ofile}')
        >>> b.write_output(j.ofile, 'output/hello.txt')
        >>> b.run()

        .. warning::

            To avoid expensive egress charges, output files should be located in buckets
            that are multi-regional in the United States because Batch runs jobs in any
            US region.

        Notes
        -----
        All :class:`.JobResourceFile` are temporary files and must be written
        to a permanent location using :meth:`.write_output` if the output needs
        to be saved.

        Parameters
        ----------
        resource:
            Resource to be written to a file.
        dest:
            Destination file path. For a single :class:`.ResourceFile`, this will
            simply be `dest`. For a :class:`.ResourceGroup`, `dest` is the file
            root and each resource file will be written to `{root}.identifier`
            where `identifier` is the identifier of the file in the
            :class:`.ResourceGroup` map.
        """

        if not isinstance(resource, _resource.Resource):
            raise BatchException(
                f"'write_output' only accepts Resource inputs. Found '{type(resource)}'."
            )
        if (isinstance(resource, _resource.JobResourceFile)
                and isinstance(resource._source, job.BashJob)
                and resource not in resource._source._mentioned):
            name = resource._source._resources_inverse[resource]
            raise BatchException(
                f"undefined resource '{name}'\n"
                f"Hint: resources must be defined within the "
                f"job methods 'command' or 'declare_resource_group'")
        if (isinstance(resource, _resource.PythonResult)
                and isinstance(resource._source, job.PythonJob)
                and resource not in resource._source._mentioned):
            name = resource._source._resources_inverse[resource]
            raise BatchException(f"undefined resource '{name}'\n"
                                 f"Hint: resources must be bound as a result "
                                 f"using the PythonJob 'call' method")

        if isinstance(self._backend, _backend.LocalBackend):
            if not dest.startswith('gs://'):
                dest = os.path.abspath(os.path.expanduser(dest))

        resource._add_output_path(dest)

    def select_jobs(self, pattern: str) -> List[job.Job]:
        """
        Select all jobs in the batch whose name matches `pattern`.

        Examples
        --------

        Select jobs in batch matching `qc`:

        >>> b = Batch()
        >>> j = b.new_job(name='qc')
        >>> qc_jobs = b.select_jobs('qc')
        >>> assert qc_jobs == [j]

        Parameters
        ----------
        pattern:
            Regex pattern matching job names.
        """

        return [
            job for job in self._jobs
            if job.name is not None and re.match(pattern, job.name) is not None
        ]

    def run(self,
            dry_run: bool = False,
            verbose: bool = False,
            delete_scratch_on_exit: bool = True,
            **backend_kwargs: Any):
        """
        Execute a batch.

        Examples
        --------

        Create a simple batch with one job and execute it:

        >>> b = Batch()
        >>> j = b.new_job()
        >>> j.command('echo "hello world"')
        >>> b.run()


        Parameters
        ----------
        dry_run:
            If `True`, don't execute code.
        verbose:
            If `True`, print debugging output.
        delete_scratch_on_exit:
            If `True`, delete temporary directories with intermediate files.
        backend_kwargs:
            See :meth:`.Backend._run` for backend-specific arguments.
        """

        seen = set()
        ordered_jobs = []

        def schedule_job(j):
            if j in seen:
                return
            seen.add(j)
            for p in j._dependencies:
                schedule_job(p)
            ordered_jobs.append(j)

        for j in self._jobs:
            schedule_job(j)

        assert len(seen) == len(self._jobs)

        job_index = {j: i for i, j in enumerate(ordered_jobs, start=1)}
        for j in ordered_jobs:
            i = job_index[j]
            j._job_id = i
            for d in j._dependencies:
                if job_index[d] >= i:
                    raise BatchException("cycle detected in dependency graph")

        self._jobs = ordered_jobs
        run_result = self._backend._run(self, dry_run, verbose,
                                        delete_scratch_on_exit,
                                        **backend_kwargs)  # pylint: disable=assignment-from-no-return
        if self._DEPRECATED_fs is not None:
            # best effort only because this is deprecated
            self._DEPRECATED_fs.close()
            self._DEPRECATED_fs = None
        return run_result

    def __str__(self):
        return self._uid
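
Pulling the documented methods together, a small end-to-end pipeline might look like this (file paths and attribute values are placeholders; with no backend argument the default LocalBackend is used, and referencing j1.ofile in the second job's command makes j2 run after j1):

from hailtop.batch import Batch

b = Batch(name='example-pipeline', attributes={'owner': 'docs'})

inp = b.read_input('data/hello.txt')

j1 = b.new_bash_job(name='count-lines')
j1.command(f'wc -l {inp} > {j1.ofile}')

j2 = b.new_bash_job(name='copy-result')
j2.command(f'cat {j1.ofile} > {j2.ofile}')

b.write_output(j2.ofile, 'output/line_count.txt')
b.run()
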