Code example #1
    async def submit(self,
                     max_bunch_bytesize=MAX_BUNCH_BYTESIZE,
                     max_bunch_size=MAX_BUNCH_SIZE,
                     disable_progress_bar=TQDM_DEFAULT_DISABLE):
        assert max_bunch_bytesize > 0
        assert max_bunch_size > 0
        if self._submitted:
            raise ValueError("cannot submit an already submitted batch")
        batch = await self._create()
        id = batch.id
        log.info(f'created batch {id}')
        byte_job_specs = [json.dumps(job_spec).encode('utf-8')
                          for job_spec in self._job_specs]
        byte_job_specs_bunches = []
        bunch_sizes = []
        bunch = []
        bunch_n_bytes = 0
        bunch_n_jobs = 0
        for spec in byte_job_specs:
            n_bytes = len(spec)
            assert n_bytes < max_bunch_bytesize, (
                f'every job spec must be less than max_bunch_bytesize,'
                f' { max_bunch_bytesize }B, but {spec} is larger')
            if bunch_n_bytes + n_bytes < max_bunch_bytesize and len(bunch) < max_bunch_size:
                bunch.append(spec)
                bunch_n_bytes += n_bytes
                bunch_n_jobs += 1
            else:
                byte_job_specs_bunches.append(bunch)
                bunch_sizes.append(bunch_n_jobs)
                bunch = [spec]
                bunch_n_bytes = n_bytes
                bunch_n_jobs = 1
        if bunch:
            byte_job_specs_bunches.append(bunch)
            bunch_sizes.append(bunch_n_jobs)

        with tqdm(total=len(self._job_specs),
                  disable=disable_progress_bar,
                  desc='jobs submitted to queue') as pbar:
            await bounded_gather(
                *[functools.partial(self._submit_jobs, id, bunch, size, pbar)
                  for bunch, size in zip(byte_job_specs_bunches, bunch_sizes)],
                parallelism=6)

        await self._client._patch(f'/api/v1alpha/batches/{id}/close')
        log.info(f'closed batch {id}')

        for j in self._jobs:
            j._job = j._job._submit(batch)

        self._job_specs = []
        self._jobs = []
        self._job_idx = 0

        self._submitted = True
        return batch
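
The loop above packs byte-encoded job specs into bunches bounded by both a total byte budget and a job-count budget. A minimal standalone sketch of that bunching strategy, using a hypothetical helper name that is not part of the Hail API:

from typing import List


def bunch_specs(specs: List[bytes], max_bytes: int, max_size: int) -> List[List[bytes]]:
    """Pack byte strings into bunches that stay under a byte budget and a count budget."""
    bunches: List[List[bytes]] = []
    current: List[bytes] = []
    current_bytes = 0
    for spec in specs:
        # Mirrors the assertion above: every spec must fit into a bunch on its own.
        assert len(spec) < max_bytes
        if current_bytes + len(spec) < max_bytes and len(current) < max_size:
            current.append(spec)
            current_bytes += len(spec)
        else:
            bunches.append(current)
            current = [spec]
            current_bytes = len(spec)
    if current:
        bunches.append(current)
    return bunches


# bunch_specs([b'a' * 40, b'b' * 40, b'c' * 40], max_bytes=100, max_size=10)
# -> [[b'aaa...', b'bbb...'], [b'ccc...']]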
Code example #2
File: copy.py Project: populationgenomics/hail
async def copy(requester_pays_project: Optional[str],
               transfers: List[Transfer]
               ) -> None:
    gcs_params = {'userProject': requester_pays_project} if requester_pays_project else None
    schemes = referenced_schemes(transfers)
    default_scheme = 'file' if 'file' in schemes else None
    with ThreadPoolExecutor() as thread_pool:
        filesystems = [filesystem_from_scheme(s,
                                              thread_pool=thread_pool,
                                              gcs_params=gcs_params)
                       for s in schemes]
        async with RouterAsyncFS(default_scheme, filesystems) as fs:
            sema = asyncio.Semaphore(50)
            async with sema:
                with tqdm(desc='files', leave=False, position=0, unit='file') as file_pbar, \
                     tqdm(desc='bytes', leave=False, position=1, unit='byte', unit_scale=True, smoothing=0.1) as byte_pbar:

                    copy_report = await fs.copy(
                        sema,
                        transfers,
                        files_listener=make_tqdm_listener(file_pbar),
                        bytes_listener=make_tqdm_listener(byte_pbar))
                copy_report.summarize()
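
A hedged usage sketch for the copy entrypoint above; the Transfer import path and its (source, destination) constructor arguments are assumptions that may differ between Hail versions, and the paths are hypothetical:

import asyncio
from hailtop.aiotools import Transfer  # assumption: Transfer is exported here

# Copy one hypothetical object from GCS to a local path.
asyncio.run(copy(
    requester_pays_project=None,
    transfers=[Transfer('gs://example-bucket/input.txt', '/tmp/input.txt')]))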
Code example #3
    async def wait(self, *, disable_progress_bar=TQDM_DEFAULT_DISABLE):
        i = 0
        with tqdm(total=self.n_jobs,
                  disable=disable_progress_bar,
                  desc='completed jobs') as pbar:
            while True:
                status = await self.status()
                pbar.update(status['n_completed'] - pbar.n)
                if status['complete']:
                    return status
                j = random.randrange(math.floor(1.1**i))
                await asyncio.sleep(0.100 * j)
                # max 44.5s
                if i < 64:
                    i = i + 1
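
The sleep above is a randomized exponential backoff with the exponent capped at 64 by the final if. A quick check of the bound mentioned in the comment:

import math

# With i capped at 64, randrange(math.floor(1.1 ** 64)) returns at most
# math.floor(1.1 ** 64) - 1 == 444, so the longest possible sleep is
# 0.100 * 444 == 44.4 seconds, which the comment rounds to 44.5 s.
print(math.floor(1.1 ** 64))                 # 445
print(0.100 * (math.floor(1.1 ** 64) - 1))   # 44.4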
Code example #4
File: backend.py Project: chrisvittal/hail
    async def _async_run(
            self,
            batch: 'batch.Batch',
            dry_run: bool,
            verbose: bool,
            delete_scratch_on_exit: bool,
            wait: bool = True,
            open: bool = False,
            disable_progress_bar: bool = False,
            callback: Optional[str] = None,
            token: Optional[str] = None,
            **backend_kwargs):  # pylint: disable-msg=too-many-statements
        if backend_kwargs:
            raise ValueError(
                f'ServiceBackend does not support any of these keywords: {backend_kwargs}'
            )

        build_dag_start = time.time()

        uid = uuid.uuid4().hex[:6]
        batch_remote_tmpdir = f'{self.remote_tmpdir}{uid}'
        local_tmpdir = f'/io/batch/{uid}'

        default_image = 'ubuntu:20.04'

        attributes = copy.deepcopy(batch.attributes)
        if batch.name is not None:
            attributes['name'] = batch.name

        bc_batch = self._batch_client.create_batch(
            attributes=attributes,
            callback=callback,
            token=token,
            cancel_after_n_failures=batch._cancel_after_n_failures)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        job_to_client_job_mapping: Dict[_job.Job, bc.Job] = {}
        jobs_to_command = {}
        commands = []

        bash_flags = 'set -e' + ('x' if verbose else '')

        def copy_input(r):
            if isinstance(r, resource.InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(batch_remote_tmpdir),
                     r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(local_tmpdir),
                     r._get_path(batch_remote_tmpdir))]

        def copy_external_output(r):
            if isinstance(r, resource.InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(local_tmpdir), dest)
                    for dest in r._output_paths]

        def symlink_input_resource_group(r):
            symlinks = []
            if isinstance(r, resource.ResourceGroup) and r._source is None:
                for name, irf in r._resources.items():
                    src = irf._get_path(local_tmpdir)
                    dest = f'{r._get_path(local_tmpdir)}.{name}'
                    symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
            return symlinks

        write_external_inputs = [
            x for r in batch._input_resources for x in copy_external_output(r)
        ]
        if write_external_inputs:
            transfers_bytes = orjson.dumps([{
                "from": src,
                "to": dest
            } for src, dest in write_external_inputs])
            transfers = transfers_bytes.decode('utf-8')
            write_cmd = [
                'python3', '-m', 'hailtop.aiotools.copy', 'null', transfers
            ]
            if dry_run:
                commands.append(' '.join(shq(x) for x in write_cmd))
            else:
                j = bc_batch.create_job(
                    image=HAIL_GENETICS_HAIL_IMAGE,
                    command=write_cmd,
                    attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = ' '.join(shq(x) for x in write_cmd)
                n_jobs_submitted += 1

        pyjobs = [j for j in batch._jobs if isinstance(j, _job.PythonJob)]
        for job in pyjobs:
            if job._image is None:
                version = sys.version_info
                if version.major != 3 or version.minor not in (6, 7, 8):
                    raise BatchException(
                        f"You must specify 'image' for Python jobs if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})"
                    )
                job._image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'

        with tqdm(total=len(batch._jobs),
                  desc='upload code',
                  disable=disable_progress_bar) as pbar:

            async def compile_job(job):
                used_remote_tmpdir = await job._compile(local_tmpdir,
                                                        batch_remote_tmpdir,
                                                        dry_run=dry_run)
                pbar.update(1)
                return used_remote_tmpdir

            used_remote_tmpdir_results = await bounded_gather(
                *[functools.partial(compile_job, j) for j in batch._jobs],
                parallelism=150)
            used_remote_tmpdir |= any(used_remote_tmpdir_results)

        for job in tqdm(batch._jobs,
                        desc='create job objects',
                        disable=disable_progress_bar):
            inputs = [x for r in job._inputs for x in copy_input(r)]

            outputs = [
                x for r in job._internal_outputs
                for x in copy_internal_output(r)
            ]
            if outputs:
                used_remote_tmpdir = True
            outputs += [
                x for r in job._external_outputs
                for x in copy_external_output(r)
            ]

            symlinks = [
                x for r in job._mentioned
                for x in symlink_input_resource_group(r)
            ]

            if job._image is None:
                if verbose:
                    print(
                        f"Using image '{default_image}' since no image was specified."
                    )

            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{job._dirname}'

            job_command = [cmd.strip() for cmd in job._wrapper_code]
            prepared_job_command = (f'{{\n{x}\n}}' for x in job_command)
            cmd = f'''
{bash_flags}
{make_local_tmpdir}
{"; ".join(symlinks)}
{" && ".join(prepared_job_command)}
'''

            user_code = '\n\n'.join(job._user_code) if job._user_code else None

            if dry_run:
                formatted_command = f'''
================================================================================
# Job {job._job_id} {f": {job.name}" if job.name else ''}

--------------------------------------------------------------------------------
## USER CODE
--------------------------------------------------------------------------------
{user_code}

--------------------------------------------------------------------------------
## COMMAND
--------------------------------------------------------------------------------
{cmd}
================================================================================
'''
                commands.append(formatted_command)
                continue

            parents = [job_to_client_job_mapping[j] for j in job._dependencies]

            attributes = copy.deepcopy(
                job.attributes) if job.attributes else {}
            if job.name:
                attributes['name'] = job.name

            resources: Dict[str, Any] = {}
            if job._cpu:
                resources['cpu'] = job._cpu
            if job._memory:
                resources['memory'] = job._memory
            if job._storage:
                resources['storage'] = job._storage
            if job._machine_type:
                resources['machine_type'] = job._machine_type
            if job._preemptible is not None:
                resources['preemptible'] = job._preemptible

            image = job._image if job._image else default_image
            image_ref = parse_docker_image_reference(image)
            if (image_ref.hosted_in('dockerhub')
                    and image_ref.name() not in HAIL_GENETICS_IMAGES):
                warnings.warn(f'Using an image {image} from Docker Hub. '
                              f'Jobs may fail due to Docker Hub rate limits.')

            env = {**job._env, 'BATCH_TMPDIR': local_tmpdir}

            j = bc_batch.create_job(
                image=image,
                command=[
                    job._shell if job._shell else DEFAULT_SHELL, '-c', cmd
                ],
                parents=parents,
                attributes=attributes,
                resources=resources,
                input_files=inputs if len(inputs) > 0 else None,
                output_files=outputs if len(outputs) > 0 else None,
                always_run=job._always_run,
                timeout=job._timeout,
                cloudfuse=job._cloudfuse if len(job._cloudfuse) > 0 else None,
                env=env,
                requester_pays_project=batch.requester_pays_project,
                mount_tokens=True,
                user_code=user_code)

            n_jobs_submitted += 1

            job_to_client_job_mapping[job] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return None

        if delete_scratch_on_exit and used_remote_tmpdir:
            parents = list(jobs_to_command.keys())
            j = bc_batch.create_job(image=HAIL_GENETICS_HAIL_IMAGE,
                                    command=[
                                        'python3', '-m',
                                        'hailtop.aiotools.delete',
                                        batch_remote_tmpdir
                                    ],
                                    parents=parents,
                                    attributes={'name': 'remove_tmpdir'},
                                    always_run=True)
            jobs_to_command[j] = cmd
            n_jobs_submitted += 1

        if verbose:
            print(
                f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - build_dag_start, 3)} seconds.'
            )

        submit_batch_start = time.time()
        batch_handle = bc_batch.submit(
            disable_progress_bar=disable_progress_bar)

        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            print(
                f'Submitted batch {batch_handle.id} with {n_jobs_submitted} jobs in {round(time.time() - submit_batch_start, 3)} seconds:'
            )
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')
            print('')

        deploy_config = get_deploy_config()
        url = deploy_config.url('batch', f'/batches/{batch_handle.id}')
        print(f'Submitted batch {batch_handle.id}, see {url}')

        if open:
            webbrowser.open(url)
        if wait:
            print(f'Waiting for batch {batch_handle.id}...')
            status = batch_handle.wait()
            print(f'batch {batch_handle.id} complete: {status["state"]}')
        return batch_handle
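
In the example above, each job's wrapper command is assembled from the bash flags, a mkdir for the job's local tmpdir, optional resource-group symlinks, and the job's wrapper code joined with '&&'. A small illustration with hypothetical values:

# The values below are hypothetical; in the real code they come from the job
# being compiled (job._dirname, job._wrapper_code, resource group paths).
bash_flags = 'set -ex'  # '-x' only when verbose
make_local_tmpdir = 'mkdir -p /io/batch/abc123/job-1'
symlinks = ["ln -sf '/io/batch/abc123/in.bam' '/io/batch/abc123/group.bam'"]
job_command = ['echo prepare inputs', 'echo run tool']
prepared_job_command = (f'{{\n{x}\n}}' for x in job_command)
cmd = f'''
{bash_flags}
{make_local_tmpdir}
{"; ".join(symlinks)}
{" && ".join(prepared_job_command)}
'''
print(cmd)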
Code example #5
async def parallel_file_exists_async(fpaths: List[str],
                                     parallelism: int = 750
                                     ) -> Dict[str, bool]:
    """
    Check whether a large number of files exist.

    Created for use with hail Batch jobs.
    Normal `file_exists` function is very slow when checking a large number of files.

    :param fpaths: List of file paths to check. Files can be in local or Google cloud storage.
    :param parallelism: Integer that sets parallelism of file existence checking task. Default is 750.
    :return: Dictionary of file paths (str) and whether the file exists (boolean).
    """
    async def async_file_exists(fs: AsyncFS, fpath: str) -> bool:
        """
        Determine file existence.

        :param fs: AsyncFS object.
        :param fpath: Path to file to check.
        :return: Whether file exists.
        """
        fext = os.path.splitext(fpath)[1]
        if fext in [".ht", ".mt"]:
            fpath += "/_SUCCESS"
        try:
            await fs.statfile(fpath)
        except FileNotFoundError:
            return False
        else:
            return True

    with tqdm(total=len(fpaths),
              desc="check files for existence",
              disable=False) as pbar:
        with ThreadPoolExecutor() as thread_pool:
            async with RouterAsyncFS("file",
                                     filesystems=[
                                         LocalAsyncFS(thread_pool),
                                         GoogleStorageAsyncFS()
                                     ]) as fs:

                def check_existence_and_update_pbar_thunk(fpath: str) -> Callable:
                    """
                    Create function to check if file exists and update progress bar in stdout.

                    Function delays coroutine creation to avoid creating too many live coroutines.

                    :param fpath: Path to file to check.
                    :return: Function that checks for file existence and updates progress bar.
                    """
                    async def unapplied_function():
                        x = await async_file_exists(fs, fpath)
                        pbar.update(1)
                        return x

                    return unapplied_function

                file_existence_checks = [
                    check_existence_and_update_pbar_thunk(fpath)
                    for fpath in fpaths
                ]
                file_existence = await bounded_gather(*file_existence_checks,
                                                      parallelism=parallelism)
    return dict(zip(fpaths, file_existence))
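
A hedged usage sketch for the helper above; the paths are hypothetical:

import asyncio

# Hail Tables/MatrixTables (.ht/.mt) are checked via their _SUCCESS file,
# as handled inside the helper above.
paths = ['gs://example-bucket/dataset.mt', '/data/local_file.txt']
exists = asyncio.run(parallel_file_exists_async(paths, parallelism=100))
for path, found in exists.items():
    print(path, 'exists' if found else 'missing')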
Code example #6
    async def submit(
        self,
        max_bunch_bytesize: int = MAX_BUNCH_BYTESIZE,
        max_bunch_size: int = MAX_BUNCH_SIZE,
        disable_progress_bar: Union[
            bool, None, TqdmDisableOption] = TqdmDisableOption.default,
    ) -> Batch:
        assert max_bunch_bytesize > 0
        assert max_bunch_size > 0
        if self._submitted:
            raise ValueError("cannot submit an already submitted batch")
        byte_job_specs = [
            json.dumps(job_spec).encode('utf-8')
            for job_spec in self._job_specs
        ]
        byte_job_specs_bunches: List[List[bytes]] = []
        bunch_sizes = []
        bunch: List[bytes] = []
        bunch_n_bytes = 0
        bunch_n_jobs = 0
        for spec in byte_job_specs:
            n_bytes = len(spec)
            assert n_bytes < max_bunch_bytesize, (
                'every job spec must be less than max_bunch_bytesize,'
                f' { max_bunch_bytesize }B, but {spec.decode()} is larger')
            if bunch_n_bytes + n_bytes < max_bunch_bytesize and len(bunch) < max_bunch_size:
                bunch.append(spec)
                bunch_n_bytes += n_bytes
                bunch_n_jobs += 1
            else:
                byte_job_specs_bunches.append(bunch)
                bunch_sizes.append(bunch_n_jobs)
                bunch = [spec]
                bunch_n_bytes = n_bytes
                bunch_n_jobs = 1
        if bunch:
            byte_job_specs_bunches.append(bunch)
            bunch_sizes.append(bunch_n_jobs)

        with tqdm(total=len(self._job_specs),
                  disable=disable_progress_bar,
                  desc='jobs submitted to queue') as pbar:
            if len(byte_job_specs_bunches) == 1:
                batch = await self._open_submit_close(
                    byte_job_specs_bunches[0], bunch_sizes[0], pbar)
                id = batch.id
            else:
                batch = await self._open_batch()
                id = batch.id
                await bounded_gather(
                    *[
                        functools.partial(self._submit_jobs, id, bunch, size,
                                          pbar) for bunch, size in
                        zip(byte_job_specs_bunches, bunch_sizes)
                    ],
                    parallelism=6,
                )
                await self._close_batch(id)

        log.info(f'created batch {id}')

        for j in self._jobs:
            j._job = j._job._submit(batch)

        self._job_specs = []
        self._jobs = []
        self._job_idx = 0

        self._submitted = True
        return batch