async def submit(self,
                 max_bunch_bytesize=MAX_BUNCH_BYTESIZE,
                 max_bunch_size=MAX_BUNCH_SIZE,
                 disable_progress_bar=TQDM_DEFAULT_DISABLE):
    """Create the batch, upload all queued job specs in bunches, and close it.

    Each job spec is JSON-encoded to UTF-8 bytes and packed greedily, in
    order, into bunches. A bunch holds at most ``max_bunch_size`` specs and
    its total encoded size stays strictly below ``max_bunch_bytesize``.

    Packing and the per-spec size check run *before* the batch is created,
    so an oversized spec fails fast instead of leaving behind a batch that
    was created but never closed.

    :param max_bunch_bytesize: exclusive upper bound, in bytes, on the total
        encoded size of one bunch (and therefore of any single spec).
    :param max_bunch_size: maximum number of specs in one bunch.
    :param disable_progress_bar: forwarded as ``disable`` to the progress bar.
    :return: the batch object returned by ``self._create()``.
    :raises ValueError: if this builder has already been submitted.
    :raises AssertionError: if a limit is not positive or a single encoded
        spec is not smaller than ``max_bunch_bytesize``.
    """
    assert max_bunch_bytesize > 0
    assert max_bunch_size > 0
    if self._submitted:
        raise ValueError("cannot submit an already submitted batch")

    byte_job_specs = [json.dumps(job_spec).encode('utf-8')
                      for job_spec in self._job_specs]

    # Greedy, order-preserving packing. `bunch` can never be flushed while
    # empty: an empty bunch always accepts the next spec because every spec
    # is individually smaller than the byte limit and max_bunch_size > 0.
    byte_job_specs_bunches = []
    bunch = []
    bunch_n_bytes = 0
    for spec in byte_job_specs:
        n_bytes = len(spec)
        assert n_bytes < max_bunch_bytesize, (
            f'every job spec must be less than max_bunch_bytesize,'
            f' { max_bunch_bytesize }B, but {spec} is larger')
        if bunch_n_bytes + n_bytes < max_bunch_bytesize and len(bunch) < max_bunch_size:
            bunch.append(spec)
            bunch_n_bytes += n_bytes
        else:
            byte_job_specs_bunches.append(bunch)
            bunch = [spec]
            bunch_n_bytes = n_bytes
    if bunch:
        byte_job_specs_bunches.append(bunch)

    # Only now touch the server: everything above is local and may raise.
    batch = await self._create()
    batch_id = batch.id
    log.info(f'created batch {batch_id}')

    with tqdm(total=len(self._job_specs),
              disable=disable_progress_bar,
              desc='jobs submitted to queue') as pbar:
        # functools.partial defers the call so that bounded_gather decides
        # when each upload actually starts.
        await bounded_gather(
            *[functools.partial(self._submit_jobs, batch_id, bunch, len(bunch), pbar)
              for bunch in byte_job_specs_bunches],
            parallelism=6)

    await self._client._patch(f'/api/v1alpha/batches/{batch_id}/close')
    log.info(f'closed batch {batch_id}')

    # Swap each job's placeholder for the handle bound to the created batch.
    for j in self._jobs:
        j._job = j._job._submit(batch)

    # Reset the builder; the _submitted flag blocks any further submit().
    self._job_specs = []
    self._jobs = []
    self._job_idx = 0
    self._submitted = True
    return batch
async def copy(requester_pays_project: Optional[str],
               transfers: List[Transfer]
               ) -> None:
    """Execute ``transfers`` through a scheme-routed async filesystem.

    One filesystem is built per scheme referenced by the transfers, all
    sharing a single thread pool. Two progress bars (files and bytes) are
    fed by the copy, and the resulting report is summarized at the end.

    :param requester_pays_project: when truthy, passed to the filesystems as
        ``gcs_params={'userProject': ...}``; otherwise ``gcs_params`` is None.
    :param transfers: the transfers handed to the filesystem's ``copy``.
    """
    if requester_pays_project:
        gcs_params = {'userProject': requester_pays_project}
    else:
        gcs_params = None

    schemes = referenced_schemes(transfers)
    # 'file' is the router's default scheme only when some transfer uses it.
    default_scheme = None
    if 'file' in schemes:
        default_scheme = 'file'

    with ThreadPoolExecutor() as thread_pool:
        filesystems = [
            filesystem_from_scheme(scheme, thread_pool=thread_pool, gcs_params=gcs_params)
            for scheme in schemes
        ]
        async with RouterAsyncFS(default_scheme, filesystems) as router:
            sema = asyncio.Semaphore(50)
            # This task holds one slot of the same semaphore it hands to
            # router.copy for the whole duration of the copy.
            async with sema:
                with tqdm(desc='files', leave=False, position=0, unit='file') as file_pbar:
                    with tqdm(desc='bytes', leave=False, position=1, unit='byte',
                              unit_scale=True, smoothing=0.1) as byte_pbar:
                        on_file = make_tqdm_listener(file_pbar)
                        on_bytes = make_tqdm_listener(byte_pbar)
                        report = await router.copy(
                            sema,
                            transfers,
                            files_listener=on_file,
                            bytes_listener=on_bytes)
                report.summarize()
async def wait(self, *, disable_progress_bar=TQDM_DEFAULT_DISABLE):
    """Poll ``self.status()`` until it reports completion and return it.

    A progress bar sized to ``self.n_jobs`` tracks ``status['n_completed']``.
    Between polls the coroutine sleeps for a random number of 100 ms ticks
    drawn from ``[0, floor(1.1 ** attempt))``; ``attempt`` grows by one per
    poll and is capped at 64.

    :param disable_progress_bar: forwarded as ``disable`` to the progress bar.
    :return: the first status dict whose ``'complete'`` entry is truthy.
    """
    attempt = 0
    with tqdm(total=self.n_jobs,
              disable=disable_progress_bar,
              desc='completed jobs') as pbar:
        while True:
            status = await self.status()
            # tqdm counts increments, so feed it the delta since the last poll.
            newly_completed = status['n_completed'] - pbar.n
            pbar.update(newly_completed)
            if status['complete']:
                return status

            tick_ceiling = math.floor(1.1 ** attempt)
            ticks = random.randrange(tick_ceiling)
            # max 44.5s
            await asyncio.sleep(0.100 * ticks)
            attempt = min(attempt + 1, 64)
async def _async_run(
        self,
        batch: 'batch.Batch',
        dry_run: bool,
        verbose: bool,
        delete_scratch_on_exit: bool,
        wait: bool = True,
        open: bool = False,
        disable_progress_bar: bool = False,
        callback: Optional[str] = None,
        token: Optional[str] = None,
        **backend_kwargs):  # pylint: disable-msg=too-many-statements
    """Translate ``batch`` into client jobs, submit them, and optionally wait.

    Steps, in order:

    1. Create a client-side batch builder from the batch's attributes.
    2. If any input resource has external output paths, add a
       ``write_external_inputs`` job that copies them.
    3. Assign a default image to Python jobs that have none.
    4. Compile every job concurrently, then create one client job per batch
       job with its inputs, outputs, symlinks, resources and environment.
    5. Optionally add an always-run ``remove_tmpdir`` job that depends on all
       previously created jobs.
    6. Submit, print the batch URL, optionally open it in a browser and
       optionally wait for completion.

    With ``dry_run`` no client jobs are created; the commands that would run
    are printed instead.

    :param batch: the batch to run.
    :param dry_run: print commands instead of creating and submitting jobs.
    :param verbose: add ``x`` to the bash flags and print timing and
        per-job commands.
    :param delete_scratch_on_exit: add the ``remove_tmpdir`` job when the
        remote temporary directory was used.
    :param wait: after submitting, wait for the batch and print its state.
    :param open: open the batch URL with ``webbrowser``.
    :param disable_progress_bar: forwarded to the progress bars and to
        ``submit``.
    :param callback: forwarded to ``create_batch``.
    :param token: forwarded to ``create_batch``.
    :param backend_kwargs: must be empty.
    :return: ``None`` for a dry run, otherwise the submitted batch handle.
    :raises ValueError: if any extra keyword argument is given.
    :raises BatchException: if a Python job has no image and the running
        interpreter is not Python 3.6, 3.7 or 3.8.
    """
    if backend_kwargs:
        raise ValueError(
            f'ServiceBackend does not support any of these keywords: {backend_kwargs}'
        )

    build_dag_start = time.time()

    # A short random suffix keeps this run's scratch directories apart from
    # those of other runs.
    uid = uuid.uuid4().hex[:6]
    batch_remote_tmpdir = f'{self.remote_tmpdir}{uid}'
    local_tmpdir = f'/io/batch/{uid}'

    default_image = 'ubuntu:20.04'

    # Deep-copy so that adding 'name' does not mutate the caller's dict.
    attributes = copy.deepcopy(batch.attributes)
    if batch.name is not None:
        attributes['name'] = batch.name

    bc_batch = self._batch_client.create_batch(
        attributes=attributes,
        callback=callback,
        token=token,
        cancel_after_n_failures=batch._cancel_after_n_failures)

    n_jobs_submitted = 0
    used_remote_tmpdir = False

    job_to_client_job_mapping: Dict[_job.Job, bc.Job] = {}
    # client job -> human-readable command, printed in verbose mode
    jobs_to_command = {}
    # commands collected for dry-run output
    commands = []

    bash_flags = 'set -e' + ('x' if verbose else '')

    def copy_input(r):
        """Return (source, destination) pairs that stage ``r`` into the job."""
        if isinstance(r, resource.InputResourceFile):
            return [(r._input_path, r._get_path(local_tmpdir))]
        assert isinstance(
            r, (resource.JobResourceFile, resource.PythonResult))
        return [(r._get_path(batch_remote_tmpdir),
                 r._get_path(local_tmpdir))]

    def copy_internal_output(r):
        """Return pairs that copy ``r`` from the job to the remote tmpdir."""
        assert isinstance(
            r, (resource.JobResourceFile, resource.PythonResult))
        return [(r._get_path(local_tmpdir),
                 r._get_path(batch_remote_tmpdir))]

    def copy_external_output(r):
        """Return pairs that copy ``r`` to each of its ``_output_paths``."""
        if isinstance(r, resource.InputResourceFile):
            return [(r._input_path, dest) for dest in r._output_paths]
        assert isinstance(
            r, (resource.JobResourceFile, resource.PythonResult))
        return [(r._get_path(local_tmpdir), dest)
                for dest in r._output_paths]

    def symlink_input_resource_group(r):
        """Return ``ln -sf`` commands for a source-less resource group."""
        symlinks = []
        if isinstance(r, resource.ResourceGroup) and r._source is None:
            for name, irf in r._resources.items():
                src = irf._get_path(local_tmpdir)
                dest = f'{r._get_path(local_tmpdir)}.{name}'
                symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
        return symlinks

    write_external_inputs = [
        x for r in batch._input_resources for x in copy_external_output(r)
    ]
    if write_external_inputs:
        transfers_bytes = orjson.dumps([{
            "from": src,
            "to": dest
        } for src, dest in write_external_inputs])
        transfers = transfers_bytes.decode('utf-8')
        write_cmd = [
            'python3', '-m', 'hailtop.aiotools.copy', 'null', transfers
        ]
        if dry_run:
            commands.append(' '.join(shq(x) for x in write_cmd))
        else:
            j = bc_batch.create_job(
                image=HAIL_GENETICS_HAIL_IMAGE,
                command=write_cmd,
                attributes={'name': 'write_external_inputs'})
            jobs_to_command[j] = ' '.join(shq(x) for x in write_cmd)
            n_jobs_submitted += 1

    pyjobs = [j for j in batch._jobs if isinstance(j, _job.PythonJob)]
    for job in pyjobs:
        if job._image is None:
            # The default image tag is derived from the local interpreter
            # version; only 3.6, 3.7 and 3.8 are accepted here.
            version = sys.version_info
            if version.major != 3 or version.minor not in (6, 7, 8):
                raise BatchException(
                    f"You must specify 'image' for Python jobs if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})"
                )
            job._image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'

    with tqdm(total=len(batch._jobs),
              desc='upload code',
              disable=disable_progress_bar) as pbar:

        async def compile_job(job):
            # This local shadows the outer flag; the per-job results are
            # folded into the outer flag after the gather.
            used_remote_tmpdir = await job._compile(local_tmpdir,
                                                    batch_remote_tmpdir,
                                                    dry_run=dry_run)
            pbar.update(1)
            return used_remote_tmpdir

        used_remote_tmpdir_results = await bounded_gather(
            *[functools.partial(compile_job, j) for j in batch._jobs],
            parallelism=150)
        used_remote_tmpdir |= any(used_remote_tmpdir_results)

    for job in tqdm(batch._jobs,
                    desc='create job objects',
                    disable=disable_progress_bar):
        inputs = [x for r in job._inputs for x in copy_input(r)]

        outputs = [
            x for r in job._internal_outputs
            for x in copy_internal_output(r)
        ]
        if outputs:
            # Internal outputs are written under the remote tmpdir.
            used_remote_tmpdir = True
        outputs += [
            x for r in job._external_outputs
            for x in copy_external_output(r)
        ]

        symlinks = [
            x for r in job._mentioned
            for x in symlink_input_resource_group(r)
        ]

        if job._image is None:
            if verbose:
                print(
                    f"Using image '{default_image}' since no image was specified."
                )

        make_local_tmpdir = f'mkdir -p {local_tmpdir}/{job._dirname}'

        job_command = [cmd.strip() for cmd in job._wrapper_code]
        # Wrap each command in a brace group so they can be chained with &&.
        prepared_job_command = (f'{{\n{x}\n}}' for x in job_command)
        cmd = f'''
{bash_flags}
{make_local_tmpdir}
{"; ".join(symlinks)}
{" && ".join(prepared_job_command)}
'''

        user_code = '\n\n'.join(
            job._user_code) if job._user_code else None

        if dry_run:
            formatted_command = f'''
================================================================================
# Job {job._job_id} {f": {job.name}" if job.name else ''}

--------------------------------------------------------------------------------
## USER CODE
--------------------------------------------------------------------------------
{user_code}

--------------------------------------------------------------------------------
## COMMAND
--------------------------------------------------------------------------------
{cmd}
================================================================================
'''
            commands.append(formatted_command)
            continue

        # Every dependency must already have a client job, so this lookup
        # relies on batch._jobs listing dependencies before dependents.
        parents = [job_to_client_job_mapping[j] for j in job._dependencies]

        attributes = copy.deepcopy(
            job.attributes) if job.attributes else {}
        if job.name:
            attributes['name'] = job.name

        resources: Dict[str, Any] = {}
        if job._cpu:
            resources['cpu'] = job._cpu
        if job._memory:
            resources['memory'] = job._memory
        if job._storage:
            resources['storage'] = job._storage
        if job._machine_type:
            resources['machine_type'] = job._machine_type
        if job._preemptible is not None:
            resources['preemptible'] = job._preemptible

        image = job._image if job._image else default_image
        image_ref = parse_docker_image_reference(image)
        if image_ref.hosted_in('dockerhub') and image_ref.name(
        ) not in HAIL_GENETICS_IMAGES:
            warnings.warn(f'Using an image {image} from Docker Hub. '
                          f'Jobs may fail due to Docker Hub rate limits.')

        env = {**job._env, 'BATCH_TMPDIR': local_tmpdir}

        j = bc_batch.create_job(
            image=image,
            command=[
                job._shell if job._shell else DEFAULT_SHELL, '-c', cmd
            ],
            parents=parents,
            attributes=attributes,
            resources=resources,
            input_files=inputs if len(inputs) > 0 else None,
            output_files=outputs if len(outputs) > 0 else None,
            always_run=job._always_run,
            timeout=job._timeout,
            cloudfuse=job._cloudfuse if len(job._cloudfuse) > 0 else None,
            env=env,
            requester_pays_project=batch.requester_pays_project,
            mount_tokens=True,
            user_code=user_code)

        n_jobs_submitted += 1

        job_to_client_job_mapping[job] = j
        jobs_to_command[j] = cmd

    if dry_run:
        print("\n\n".join(commands))
        return None

    if delete_scratch_on_exit and used_remote_tmpdir:
        # Depends on every job created so far and always runs, so the
        # scratch directory is removed regardless of how the jobs end.
        parents = list(jobs_to_command.keys())
        delete_cmd = [
            'python3', '-m', 'hailtop.aiotools.delete',
            batch_remote_tmpdir
        ]
        j = bc_batch.create_job(image=HAIL_GENETICS_HAIL_IMAGE,
                                command=delete_cmd,
                                parents=parents,
                                attributes={'name': 'remove_tmpdir'},
                                always_run=True)
        # Record this job's own command (not the last user job's script)
        # so the verbose listing is accurate.
        jobs_to_command[j] = ' '.join(shq(x) for x in delete_cmd)
        n_jobs_submitted += 1

    if verbose:
        print(
            f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - build_dag_start, 3)} seconds.'
        )

    submit_batch_start = time.time()
    # NOTE(review): submit() and, below, wait() are called without `await`;
    # presumably bc_batch is the blocking client - confirm.
    batch_handle = bc_batch.submit(
        disable_progress_bar=disable_progress_bar)

    # Re-key by job id, which is read only after submission.
    jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

    if verbose:
        print(
            f'Submitted batch {batch_handle.id} with {n_jobs_submitted} jobs in {round(time.time() - submit_batch_start, 3)} seconds:'
        )
        for jid, cmd in jobs_to_command.items():
            print(f'{jid}: {cmd}')
        print('')

    deploy_config = get_deploy_config()
    url = deploy_config.url('batch', f'/batches/{batch_handle.id}')
    print(f'Submitted batch {batch_handle.id}, see {url}')

    if open:
        webbrowser.open(url)
    if wait:
        print(f'Waiting for batch {batch_handle.id}...')
        status = batch_handle.wait()
        print(f'batch {batch_handle.id} complete: {status["state"]}')
    return batch_handle
async def parallel_file_exists_async(fpaths: List[str],
                                     parallelism: int = 750
                                     ) -> Dict[str, bool]:
    """
    Test many paths for existence concurrently.

    Intended for workloads with a large number of paths, where checking them
    one at a time is too slow. Paths are resolved through a router over a
    local filesystem and a Google Storage filesystem.

    :param fpaths: Paths to test.
    :param parallelism: Value passed to `bounded_gather` as its `parallelism`. Default is 750.
    :return: Mapping from each path to whether it exists.
    """

    async def path_exists(fs: AsyncFS, path: str) -> bool:
        """
        Report whether `path` can be stat-ed.

        For paths whose extension is ".ht" or ".mt", the "_SUCCESS" entry
        inside the path is checked instead of the path itself.

        :param fs: AsyncFS object used for the lookup.
        :param path: Path to test.
        :return: False if the lookup raises FileNotFoundError, True otherwise.
        """
        extension = os.path.splitext(path)[1]
        target = path + "/_SUCCESS" if extension in (".ht", ".mt") else path
        try:
            await fs.statfile(target)
        except FileNotFoundError:
            return False
        return True

    with tqdm(total=len(fpaths), desc="check files for existence", disable=False) as progress, \
            ThreadPoolExecutor() as pool:
        backends = [LocalAsyncFS(pool), GoogleStorageAsyncFS()]
        async with RouterAsyncFS("file", filesystems=backends) as router:

            def deferred_check(path: str) -> Callable:
                """
                Build a zero-argument coroutine function for one path.

                Returning a function, rather than a coroutine, postpones
                coroutine creation until the function is actually called.

                :param path: Path to test.
                :return: Function that tests the path and advances the progress bar.
                """

                async def run_check():
                    found = await path_exists(router, path)
                    progress.update(1)
                    return found

                return run_check

            outcomes = await bounded_gather(*map(deferred_check, fpaths),
                                            parallelism=parallelism)

    return dict(zip(fpaths, outcomes))
async def submit(
        self,
        max_bunch_bytesize: int = MAX_BUNCH_BYTESIZE,
        max_bunch_size: int = MAX_BUNCH_SIZE,
        disable_progress_bar: Union[
            bool, None, TqdmDisableOption] = TqdmDisableOption.default,
) -> Batch:
    """Upload the queued job specs in bunches and return the resulting batch.

    Specs are JSON-encoded to UTF-8 and packed in order into bunches. A new
    bunch is started whenever adding the next spec would bring the current
    bunch to ``max_bunch_bytesize`` bytes or more, or the current bunch
    already holds ``max_bunch_size`` specs.

    If everything fits in exactly one bunch, a single call to
    ``_open_submit_close`` is made. Otherwise the batch is opened, bunches
    are uploaded with bounded parallelism, and the batch is closed.

    :param max_bunch_bytesize: exclusive byte limit per bunch; must be > 0.
    :param max_bunch_size: maximum number of specs per bunch; must be > 0.
    :param disable_progress_bar: forwarded as ``disable`` to the progress bar.
    :return: the batch produced by opening/submitting.
    :raises ValueError: if this builder has already been submitted.
    """
    assert max_bunch_bytesize > 0
    assert max_bunch_size > 0
    if self._submitted:
        raise ValueError("cannot submit an already submitted batch")

    encoded_specs = [
        json.dumps(job_spec).encode('utf-8') for job_spec in self._job_specs
    ]

    bunches: List[List[bytes]] = []
    current: List[bytes] = []
    current_n_bytes = 0
    for spec in encoded_specs:
        n_bytes = len(spec)
        assert n_bytes < max_bunch_bytesize, (
            'every job spec must be less than max_bunch_bytesize,'
            f' { max_bunch_bytesize }B, but {spec.decode()} is larger')
        would_overflow = current_n_bytes + n_bytes >= max_bunch_bytesize
        is_full = len(current) >= max_bunch_size
        if would_overflow or is_full:
            # Seal the current bunch and start a new one with this spec.
            bunches.append(current)
            current = [spec]
            current_n_bytes = n_bytes
        else:
            current.append(spec)
            current_n_bytes += n_bytes
    if current:
        bunches.append(current)
    # The job count of a bunch is simply its length.
    sizes = [len(b) for b in bunches]

    with tqdm(total=len(self._job_specs),
              disable=disable_progress_bar,
              desc='jobs submitted to queue') as pbar:
        if len(bunches) == 1:
            # Single-bunch case: one combined call.
            batch = await self._open_submit_close(bunches[0], sizes[0], pbar)
            batch_id = batch.id
        else:
            batch = await self._open_batch()
            batch_id = batch.id
            uploads = [
                functools.partial(self._submit_jobs, batch_id, b, n, pbar)
                for b, n in zip(bunches, sizes)
            ]
            await bounded_gather(*uploads, parallelism=6)
            await self._close_batch(batch_id)

    log.info(f'created batch {batch_id}')

    # Swap each job's placeholder for the handle bound to the real batch.
    for job in self._jobs:
        job._job = job._job._submit(batch)

    # Reset the builder; the _submitted flag blocks any further submit().
    self._job_specs = []
    self._jobs = []
    self._job_idx = 0
    self._submitted = True
    return batch