def cpu_spec_to_float(spec: Union[int, str]) -> float:
    if isinstance(spec, str):
        mcpu = parse_cpu_in_mcpu(spec)
        assert mcpu is not None
        return mcpu / 1000
    return float(spec)
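# Usage sketch, for illustration only: cpu_spec_to_float accepts either a numeric
# core count or a Kubernetes-style millicpu string, assuming parse_cpu_in_mcpu
# maps '500m' to 500 millicores.
#
#     cpu_spec_to_float('500m')  # -> 0.5
#     cpu_spec_to_float(2)       # -> 2.0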
def __init__(self, batch_id, user, gsa_key, job_spec, format_version):
    self.batch_id = batch_id
    self.user = user
    self.gsa_key = gsa_key
    self.job_spec = job_spec
    self.format_version = format_version

    self.deleted = False

    self.token = uuid.uuid4().hex
    self.scratch = f'/batch/{self.token}'

    self.state = 'pending'
    self.error = None

    self.start_time = None
    self.end_time = None

    pvc_size = job_spec.get('pvc_size')
    input_files = job_spec.get('input_files')
    output_files = job_spec.get('output_files')

    copy_volume_mounts = []
    main_volume_mounts = []

    requester_pays_project = job_spec.get('requester_pays_project')

    if job_spec.get('mount_docker_socket'):
        main_volume_mounts.append('/var/run/docker.sock:/var/run/docker.sock')

    self.mount_io = (pvc_size or input_files or output_files)
    if self.mount_io:
        volume_mount = f'{self.io_host_path()}:/io'
        main_volume_mounts.append(volume_mount)
        copy_volume_mounts.append(volume_mount)

    gcsfuse = job_spec.get('gcsfuse')
    self.gcsfuse = gcsfuse
    if gcsfuse:
        for b in gcsfuse:
            main_volume_mounts.append(
                f'{self.gcsfuse_path(b["bucket"])}:{b["mount_path"]}:shared')

    secrets = job_spec.get('secrets')
    self.secrets = secrets
    if secrets:
        for secret in secrets:
            volume_mount = f'{self.secret_host_path(secret)}:{secret["mount_path"]}'
            main_volume_mounts.append(volume_mount)
            # this will be the user gsa-key
            if secret.get('mount_in_copy', False):
                copy_volume_mounts.append(volume_mount)

    env = []
    for item in job_spec.get('env', []):
        env.append(f'{item["name"]}={item["value"]}')

    req_cpu_in_mcpu = parse_cpu_in_mcpu(job_spec['resources']['cpu'])
    req_memory_in_bytes = parse_memory_in_bytes(job_spec['resources']['memory'])

    cpu_in_mcpu = adjust_cores_for_memory_request(
        req_cpu_in_mcpu, req_memory_in_bytes, worker_config.instance_type)
    cpu_in_mcpu = adjust_cores_for_packability(cpu_in_mcpu)

    self.cpu_in_mcpu = cpu_in_mcpu
    self.memory_in_bytes = cores_mcpu_to_memory_bytes(
        self.cpu_in_mcpu, worker_config.instance_type)
    self.resources = worker_config.resources(self.cpu_in_mcpu, self.memory_in_bytes)

    # create containers
    containers = {}

    if input_files:
        containers['input'] = copy_container(
            self, 'input', input_files, copy_volume_mounts,
            self.cpu_in_mcpu, self.memory_in_bytes, requester_pays_project)

    # main container
    main_spec = {
        'command': job_spec['command'],
        'image': job_spec['image'],
        'name': 'main',
        'env': env,
        'cpu': self.cpu_in_mcpu,
        'memory': self.memory_in_bytes,
        'volume_mounts': main_volume_mounts
    }
    port = job_spec.get('port')
    if port:
        main_spec['port'] = port
    timeout = job_spec.get('timeout')
    if timeout:
        main_spec['timeout'] = timeout
    containers['main'] = Container(self, 'main', main_spec)

    if output_files:
        containers['output'] = copy_container(
            self, 'output', output_files, copy_volume_mounts,
            self.cpu_in_mcpu, self.memory_in_bytes, requester_pays_project)

    self.containers = containers
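# Illustrative sketch of the minimal job_spec this constructor reads. Only the
# field names accessed above come from the source; the example values and the
# enclosing class name `Job` are assumptions.
#
#     job_spec = {
#         'command': ['/bin/sh', '-c', 'echo hello'],
#         'image': 'ubuntu:20.04',
#         'resources': {'cpu': '1', 'memory': '3.75G'},
#         # optional keys read above: 'pvc_size', 'input_files', 'output_files',
#         # 'requester_pays_project', 'mount_docker_socket', 'gcsfuse',
#         # 'secrets', 'env', 'port', 'timeout'
#     }
#     job = Job(batch_id, user, gsa_key, job_spec, format_version)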
def _run(self,
         batch: 'batch.Batch',
         dry_run: bool,
         verbose: bool,
         delete_scratch_on_exit: bool,
         **backend_kwargs):  # pylint: disable=R0915
    """
    Execute a batch.

    Warning
    -------
    This method should not be called directly. Instead, use :meth:`.batch.Batch.run`.

    Parameters
    ----------
    batch:
        Batch to execute.
    dry_run:
        If `True`, don't execute code.
    verbose:
        If `True`, print debugging output.
    delete_scratch_on_exit:
        If `True`, delete temporary directories with intermediate files.
    """
    if backend_kwargs:
        raise ValueError(f'LocalBackend does not support any of these keywords: {backend_kwargs}')

    tmpdir = self._get_scratch_dir()

    lines = ['set -e' + ('x' if verbose else ''),
             '\n',
             '# change cd to tmp directory',
             f"cd {tmpdir}",
             '\n']

    copied_input_resource_files = set()
    os.makedirs(tmpdir + '/inputs/', exist_ok=True)

    if batch.requester_pays_project:
        requester_pays_project = f'-u {batch.requester_pays_project}'
    else:
        requester_pays_project = ''

    def copy_input(job, r):
        if isinstance(r, resource.InputResourceFile):
            if r not in copied_input_resource_files:
                copied_input_resource_files.add(r)

                if r._input_path.startswith('gs://'):
                    return [f'gsutil {requester_pays_project} cp {shq(r._input_path)} {shq(r._get_path(tmpdir))}']

                absolute_input_path = os.path.realpath(r._input_path)

                dest = r._get_path(tmpdir)
                dir = os.path.dirname(dest)
                os.makedirs(dir, exist_ok=True)

                if job._image is not None:  # pylint: disable-msg=W0640
                    return [f'cp {shq(absolute_input_path)} {shq(dest)}']

                return [f'ln -sf {shq(absolute_input_path)} {shq(dest)}']
            return []

        assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
        return []

    def copy_external_output(r):
        def _cp(dest):
            if not dest.startswith('gs://'):
                dest = os.path.abspath(dest)
                directory = os.path.dirname(dest)
                os.makedirs(directory, exist_ok=True)
                return 'cp'
            return f'gsutil {requester_pays_project} cp'

        if isinstance(r, resource.InputResourceFile):
            return [f'{_cp(dest)} {shq(r._input_path)} {shq(dest)}'
                    for dest in r._output_paths]

        assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
        return [f'{_cp(dest)} {r._get_path(tmpdir)} {shq(dest)}'
                for dest in r._output_paths]

    def symlink_input_resource_group(r):
        symlinks = []
        if isinstance(r, resource.ResourceGroup) and r._source is None:
            for name, irf in r._resources.items():
                src = irf._get_path(tmpdir)
                dest = f'{r._get_path(tmpdir)}.{name}'
                symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
        return symlinks

    write_inputs = [x for r in batch._input_resources for x in copy_external_output(r)]
    if write_inputs:
        lines += ["# Write input resources to output destinations"]
        lines += write_inputs
        lines += ['\n']

    for job in batch._jobs:
        if isinstance(job, _job.PythonJob):
            job._compile(tmpdir, tmpdir)

        os.makedirs(f'{tmpdir}/{job._job_id}/', exist_ok=True)

        lines.append(f"# {job._job_id}: {job.name if job.name else ''}")

        lines += [x for r in job._inputs for x in copy_input(job, r)]
        lines += [x for r in job._mentioned for x in symlink_input_resource_group(r)]

        resource_defs = [r._declare(tmpdir) for r in job._mentioned]
        env = [f'export {k}={v}' for k, v in job._env.items()]

        job_shell = job._shell if job._shell else self._DEFAULT_SHELL

        defs = '; '.join(resource_defs) + '; ' if resource_defs else ''
        joined_env = '; '.join(env) + '; ' if env else ''

        cmd = " && ".join(f'{{\n{x}\n}}' for x in job._command)

        quoted_job_script = shq(joined_env + defs + cmd)

        if job._image:
            cpu = f'--cpus={job._cpu}' if job._cpu else ''

            memory = job._memory
            if memory is not None:
                memory_ratios = {'lowmem': 1024**3, 'standard': 4 * 1024**3, 'highmem': 7 * 1024**3}
                if memory in memory_ratios:
                    if job._cpu is not None:
                        mcpu = parse_cpu_in_mcpu(job._cpu)
                        if mcpu is not None:
                            memory = str(int(memory_ratios[memory] * (mcpu / 1000)))
                        else:
                            raise BatchException(f'invalid value for cpu: {job._cpu}')
                    else:
                        raise BatchException(f'must specify cpu when using {memory} to specify the memory')
            memory = f'-m {memory}' if memory else ''

            lines.append(f"docker run "
                         "--entrypoint='' "
                         f"{self._extra_docker_run_flags} "
                         f"-v {tmpdir}:{tmpdir} "
                         f"-w {tmpdir} "
                         f"{memory} "
                         f"{cpu} "
                         f"{job._image} "
                         f"{job_shell} -c {quoted_job_script}")
        else:
            lines.append(f"{job_shell} -c {quoted_job_script}")

        lines += [x for r in job._external_outputs for x in copy_external_output(r)]
        lines += ['\n']

    script = "\n".join(lines)

    if dry_run:
        print(script)
    else:
        try:
            sp.check_call(script, shell=True)
        except sp.CalledProcessError as e:
            print(e)
            print(e.output)
            raise
        finally:
            if delete_scratch_on_exit:
                sp.run(f'rm -rf {tmpdir}', shell=True, check=False)

    print('Batch completed successfully!')
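# Usage sketch, for illustration only: _run is normally reached through
# Batch.run() on a batch built against a LocalBackend, assuming the public
# hailtop.batch front-end API.
#
#     import hailtop.batch as hb
#     b = hb.Batch(backend=hb.LocalBackend(), name='example')
#     j = b.new_job(name='hello')
#     j.command(f'echo hello > {j.ofile}')
#     b.run()  # eventually calls LocalBackend._run(batch, dry_run=False, ...)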
def _run(self,
         batch: 'batch.Batch',
         dry_run: bool,
         verbose: bool,
         delete_scratch_on_exit: bool,
         **backend_kwargs) -> None:  # pylint: disable=R0915
    """
    Execute a batch.

    Warning
    -------
    This method should not be called directly. Instead, use :meth:`.batch.Batch.run`.

    Parameters
    ----------
    batch:
        Batch to execute.
    dry_run:
        If `True`, don't execute code.
    verbose:
        If `True`, print debugging output.
    delete_scratch_on_exit:
        If `True`, delete temporary directories with intermediate files.
    """
    if backend_kwargs:
        raise ValueError(f'LocalBackend does not support any of these keywords: {backend_kwargs}')

    tmpdir = self._get_scratch_dir()

    def new_code_block():
        return ['set -e' + ('x' if verbose else ''),
                '\n',
                '# change cd to tmp directory',
                f"cd {tmpdir}",
                '\n']

    def run_code(code):
        code = '\n'.join(code)
        if dry_run:
            print(code)
        else:
            try:
                sp.check_call(code, shell=True)
            except sp.CalledProcessError as e:
                print(e)
                print(e.output)
                raise

    copied_input_resource_files = set()
    os.makedirs(tmpdir + '/inputs/', exist_ok=True)

    requester_pays_project_json = orjson.dumps(batch.requester_pays_project).decode('utf-8')

    def copy_input(job, r):
        if isinstance(r, resource.InputResourceFile):
            if r not in copied_input_resource_files:
                copied_input_resource_files.add(r)

                input_scheme = url_scheme(r._input_path)
                if input_scheme != '':
                    transfers_bytes = orjson.dumps([{"from": r._input_path, "to": r._get_path(tmpdir)}])
                    transfers = transfers_bytes.decode('utf-8')
                    return [f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(transfers)}']

                absolute_input_path = os.path.realpath(os.path.expanduser(r._input_path))

                dest = r._get_path(os.path.expanduser(tmpdir))
                dir = os.path.dirname(dest)
                os.makedirs(dir, exist_ok=True)

                if job._image is not None:  # pylint: disable-msg=W0640
                    return [f'cp {shq(absolute_input_path)} {shq(dest)}']

                return [f'ln -sf {shq(absolute_input_path)} {shq(dest)}']
            return []

        assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
        return []

    def symlink_input_resource_group(r):
        symlinks = []
        if isinstance(r, resource.ResourceGroup) and r._source is None:
            for name, irf in r._resources.items():
                src = irf._get_path(tmpdir)
                dest = f'{r._get_path(tmpdir)}.{name}'
                symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
        return symlinks

    def transfer_dicts_for_resource_file(
            res_file: Union[resource.ResourceFile, resource.PythonResult]) -> List[dict]:
        if isinstance(res_file, resource.InputResourceFile):
            source = res_file._input_path
        else:
            assert isinstance(res_file, (resource.JobResourceFile, resource.PythonResult))
            source = res_file._get_path(tmpdir)

        return [{"from": source, "to": dest} for dest in res_file._output_paths]

    try:
        input_transfer_dicts = [
            transfer_dict
            for input_resource in batch._input_resources
            for transfer_dict in transfer_dicts_for_resource_file(input_resource)]

        if input_transfer_dicts:
            input_transfers = orjson.dumps(input_transfer_dicts).decode('utf-8')
            code = new_code_block()
            code += ["# Write input resources to output destinations"]
            code += [f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(input_transfers)}']
            code += ['\n']
            run_code(code)

        for job in batch._jobs:
            async_to_blocking(job._compile(tmpdir, tmpdir))

            os.makedirs(f'{tmpdir}/{job._dirname}/', exist_ok=True)

            code = new_code_block()

            code.append(f"# {job._job_id}: {job.name if job.name else ''}")

            if job._user_code:
                code.append('# USER CODE')
                user_code = [f'# {line}' for cmd in job._user_code for line in cmd.split('\n')]
                code.append('\n'.join(user_code))

            code += [x for r in job._inputs for x in copy_input(job, r)]
            code += [x for r in job._mentioned for x in symlink_input_resource_group(r)]

            env = {**job._env, 'BATCH_TMPDIR': tmpdir}
            env_declarations = [f'export {k}={v}' for k, v in env.items()]
            joined_env = '; '.join(env_declarations) + '; ' if env else ''

            job_shell = job._shell if job._shell else DEFAULT_SHELL

            cmd = " && ".join(f'{{\n{x}\n}}' for x in job._wrapper_code)

            quoted_job_script = shq(joined_env + cmd)

            if job._image:
                cpu = f'--cpus={job._cpu}' if job._cpu else ''

                memory = job._memory
                if memory is not None:
                    memory_ratios = {'lowmem': 1024**3, 'standard': 4 * 1024**3, 'highmem': 7 * 1024**3}
                    if memory in memory_ratios:
                        if job._cpu is not None:
                            mcpu = parse_cpu_in_mcpu(job._cpu)
                            if mcpu is not None:
                                memory = str(int(memory_ratios[memory] * (mcpu / 1000)))
                            else:
                                raise BatchException(f'invalid value for cpu: {job._cpu}')
                        else:
                            raise BatchException(f'must specify cpu when using {memory} to specify the memory')
                    memory = f'-m {memory}' if memory else ''
                else:
                    memory = ''

                code.append(f"docker run "
                            "--entrypoint='' "
                            f"{self._extra_docker_run_flags} "
                            f"-v {tmpdir}:{tmpdir} "
                            f"-w {tmpdir} "
                            f"{memory} "
                            f"{cpu} "
                            f"{job._image} "
                            f"{job_shell} -c {quoted_job_script}")
            else:
                code.append(f"{job_shell} -c {quoted_job_script}")

            output_transfer_dicts = [
                transfer_dict
                for output_resource in job._external_outputs
                for transfer_dict in transfer_dicts_for_resource_file(output_resource)]

            output_transfers = orjson.dumps(output_transfer_dicts).decode('utf-8')

            code += [f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(output_transfers)}']
            code += ['\n']
            run_code(code)
    finally:
        if delete_scratch_on_exit:
            sp.run(f'rm -rf {tmpdir}', shell=True, check=False)

    print('Batch completed successfully!')
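# Sketch of the copy invocation this backend emits, for illustration only: the
# first argument is the JSON-encoded requester-pays project (the literal string
# 'null' when unset) and the second is a JSON list of {"from": ..., "to": ...}
# transfers. Paths below are made up.
#
#     python3 -m hailtop.aiotools.copy 'null' \
#         '[{"from": "gs://my-bucket/input.txt", "to": "/tmp/batch-scratch/inputs/input.txt"}]'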