Example #1
def cpu_spec_to_float(spec: Union[int, str]) -> float:
    if isinstance(spec, str):
        mcpu = parse_cpu_in_mcpu(spec)
        assert mcpu is not None
        return mcpu / 1000
    return float(spec)
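A minimal, self-contained usage sketch (not from the original source): parse_cpu_in_mcpu is assumed to follow the Kubernetes CPU convention, where "500m" means 500 millicores and a bare number means whole cores, so cpu_spec_to_float normalizes both spellings to a float core count. The parser below is a simplified stand-in for illustration only.

# Simplified stand-in for hailtop's parse_cpu_in_mcpu (assumption:
# "500m" -> 500 millicores, "2" -> 2000), paired with the logic from above.
from typing import Optional, Union

def parse_cpu_in_mcpu(spec: str) -> Optional[int]:
    # accepts "<n>m" (millicores) or a plain number (cores)
    if spec.endswith('m'):
        return int(spec[:-1])
    return int(float(spec) * 1000)

def cpu_spec_to_float(spec: Union[int, str]) -> float:
    if isinstance(spec, str):
        mcpu = parse_cpu_in_mcpu(spec)
        assert mcpu is not None
        return mcpu / 1000
    return float(spec)

print(cpu_spec_to_float('500m'))  # 0.5
print(cpu_spec_to_float('2'))     # 2.0
print(cpu_spec_to_float(4))       # 4.0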
Example #2
    def __init__(self, batch_id, user, gsa_key, job_spec, format_version):
        self.batch_id = batch_id
        self.user = user
        self.gsa_key = gsa_key
        self.job_spec = job_spec
        self.format_version = format_version

        self.deleted = False

        self.token = uuid.uuid4().hex
        self.scratch = f'/batch/{self.token}'

        self.state = 'pending'
        self.error = None

        self.start_time = None
        self.end_time = None

        pvc_size = job_spec.get('pvc_size')
        input_files = job_spec.get('input_files')
        output_files = job_spec.get('output_files')

        copy_volume_mounts = []
        main_volume_mounts = []

        requester_pays_project = job_spec.get('requester_pays_project')

        if job_spec.get('mount_docker_socket'):
            main_volume_mounts.append(
                '/var/run/docker.sock:/var/run/docker.sock')

        self.mount_io = (pvc_size or input_files or output_files)
        if self.mount_io:
            volume_mount = f'{self.io_host_path()}:/io'
            main_volume_mounts.append(volume_mount)
            copy_volume_mounts.append(volume_mount)

        gcsfuse = job_spec.get('gcsfuse')
        self.gcsfuse = gcsfuse
        if gcsfuse:
            for b in gcsfuse:
                main_volume_mounts.append(
                    f'{self.gcsfuse_path(b["bucket"])}:{b["mount_path"]}:shared'
                )

        secrets = job_spec.get('secrets')
        self.secrets = secrets
        if secrets:
            for secret in secrets:
                volume_mount = f'{self.secret_host_path(secret)}:{secret["mount_path"]}'
                main_volume_mounts.append(volume_mount)
                # this will be the user gsa-key
                if secret.get('mount_in_copy', False):
                    copy_volume_mounts.append(volume_mount)

        env = []
        for item in job_spec.get('env', []):
            env.append(f'{item["name"]}={item["value"]}')

        # normalize the requested resources: raise the core request if the
        # memory request needs more memory per core than the instance type
        # provides, then round the core count so jobs pack onto the worker
        req_cpu_in_mcpu = parse_cpu_in_mcpu(job_spec['resources']['cpu'])
        req_memory_in_bytes = parse_memory_in_bytes(
            job_spec['resources']['memory'])

        cpu_in_mcpu = adjust_cores_for_memory_request(
            req_cpu_in_mcpu, req_memory_in_bytes, worker_config.instance_type)
        cpu_in_mcpu = adjust_cores_for_packability(cpu_in_mcpu)

        self.cpu_in_mcpu = cpu_in_mcpu
        self.memory_in_bytes = cores_mcpu_to_memory_bytes(
            self.cpu_in_mcpu, worker_config.instance_type)

        self.resources = worker_config.resources(self.cpu_in_mcpu,
                                                 self.memory_in_bytes)

        # create containers
        containers = {}

        if input_files:
            containers['input'] = copy_container(self, 'input', input_files,
                                                 copy_volume_mounts,
                                                 self.cpu_in_mcpu,
                                                 self.memory_in_bytes,
                                                 requester_pays_project)

        # main container
        main_spec = {
            'command': job_spec['command'],
            'image': job_spec['image'],
            'name': 'main',
            'env': env,
            'cpu': self.cpu_in_mcpu,
            'memory': self.memory_in_bytes,
            'volume_mounts': main_volume_mounts
        }
        port = job_spec.get('port')
        if port:
            main_spec['port'] = port
        timeout = job_spec.get('timeout')
        if timeout:
            main_spec['timeout'] = timeout
        containers['main'] = Container(self, 'main', main_spec)

        if output_files:
            containers['output'] = copy_container(self, 'output', output_files,
                                                  copy_volume_mounts,
                                                  self.cpu_in_mcpu,
                                                  self.memory_in_bytes,
                                                  requester_pays_project)

        self.containers = containers
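For orientation (an illustration, not part of the original source): each entry in main_volume_mounts and copy_volume_mounts is a docker-style host_path:container_path[:options] string. A minimal sketch of how such a list maps onto docker run flags, with made-up paths:

# Hypothetical mount strings in the same host:container[:options] form the
# constructor above builds; the paths are invented for illustration.
example_volume_mounts = [
    '/var/run/docker.sock:/var/run/docker.sock',     # mount_docker_socket
    '/batch/abc123/io:/io',                          # io volume (pvc/input/output files)
    '/batch/abc123/gcsfuse/my-bucket:/data:shared',  # gcsfuse bucket mount
]

docker_args = ['docker', 'run']
for mount in example_volume_mounts:
    docker_args += ['-v', mount]
print(' '.join(docker_args))
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v /batch/abc123/io:/io ...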
Example #3
    def _run(self,
             batch: 'batch.Batch',
             dry_run: bool,
             verbose: bool,
             delete_scratch_on_exit: bool,
             **backend_kwargs):  # pylint: disable=R0915
        """
        Execute a batch.

        Warning
        -------
        This method should not be called directly. Instead, use :meth:`.batch.Batch.run`.

        Parameters
        ----------
        batch:
            Batch to execute.
        dry_run:
            If `True`, don't execute code.
        verbose:
            If `True`, print debugging output.
        delete_scratch_on_exit:
            If `True`, delete temporary directories with intermediate files.
        """

        if backend_kwargs:
            raise ValueError(f'LocalBackend does not support any of these keywords: {backend_kwargs}')

        tmpdir = self._get_scratch_dir()

        lines = ['set -e' + ('x' if verbose else ''),
                 '\n',
                 '# change to the tmp directory',
                 f"cd {tmpdir}",
                 '\n']

        copied_input_resource_files = set()
        os.makedirs(tmpdir + '/inputs/', exist_ok=True)

        if batch.requester_pays_project:
            requester_pays_project = f'-u {batch.requester_pays_project}'
        else:
            requester_pays_project = ''

        def copy_input(job, r):
            if isinstance(r, resource.InputResourceFile):
                if r not in copied_input_resource_files:
                    copied_input_resource_files.add(r)

                    if r._input_path.startswith('gs://'):
                        return [f'gsutil {requester_pays_project} cp {shq(r._input_path)} {shq(r._get_path(tmpdir))}']

                    absolute_input_path = os.path.realpath(r._input_path)

                    dest = r._get_path(tmpdir)
                    dest_dir = os.path.dirname(dest)
                    os.makedirs(dest_dir, exist_ok=True)

                    if job._image is not None:  # pylint: disable-msg=W0640
                        return [f'cp {shq(absolute_input_path)} {shq(dest)}']

                    return [f'ln -sf {shq(absolute_input_path)} {shq(dest)}']

                return []

            assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
            return []

        def copy_external_output(r):
            def _cp(dest):
                if not dest.startswith('gs://'):
                    dest = os.path.abspath(dest)
                    directory = os.path.dirname(dest)
                    os.makedirs(directory, exist_ok=True)
                    return 'cp'
                return f'gsutil {requester_pays_project} cp'

            if isinstance(r, resource.InputResourceFile):
                return [f'{_cp(dest)} {shq(r._input_path)} {shq(dest)}'
                        for dest in r._output_paths]

            assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
            return [f'{_cp(dest)} {shq(r._get_path(tmpdir))} {shq(dest)}'
                    for dest in r._output_paths]

        def symlink_input_resource_group(r):
            symlinks = []
            if isinstance(r, resource.ResourceGroup) and r._source is None:
                for name, irf in r._resources.items():
                    src = irf._get_path(tmpdir)
                    dest = f'{r._get_path(tmpdir)}.{name}'
                    symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
            return symlinks

        write_inputs = [x for r in batch._input_resources for x in copy_external_output(r)]
        if write_inputs:
            lines += ["# Write input resources to output destinations"]
            lines += write_inputs
            lines += ['\n']

        for job in batch._jobs:
            if isinstance(job, _job.PythonJob):
                job._compile(tmpdir, tmpdir)

            os.makedirs(f'{tmpdir}/{job._job_id}/', exist_ok=True)

            lines.append(f"# {job._job_id}: {job.name if job.name else ''}")

            lines += [x for r in job._inputs for x in copy_input(job, r)]
            lines += [x for r in job._mentioned for x in symlink_input_resource_group(r)]

            resource_defs = [r._declare(tmpdir) for r in job._mentioned]
            env = [f'export {k}={v}' for k, v in job._env.items()]

            job_shell = job._shell if job._shell else self._DEFAULT_SHELL

            defs = '; '.join(resource_defs) + '; ' if resource_defs else ''
            joined_env = '; '.join(env) + '; ' if env else ''

            cmd = " && ".join(f'{{\n{x}\n}}' for x in job._command)

            quoted_job_script = shq(joined_env + defs + cmd)

            if job._image:
                cpu = f'--cpus={job._cpu}' if job._cpu else ''

                memory = job._memory
                if memory is not None:
                    # bytes of memory per requested core for each named tier
                    memory_ratios = {'lowmem': 1024**3, 'standard': 4 * 1024**3, 'highmem': 7 * 1024**3}
                    if memory in memory_ratios:
                        if job._cpu is not None:
                            mcpu = parse_cpu_in_mcpu(job._cpu)
                            if mcpu is not None:
                                memory = str(int(memory_ratios[memory] * (mcpu / 1000)))
                            else:
                                raise BatchException(f'invalid value for cpu: {job._cpu}')
                        else:
                            raise BatchException(f'must specify cpu when using {memory} to specify the memory')
                    memory = f'-m {memory}' if memory else ''
                else:
                    # no memory request: pass no -m flag rather than the string 'None'
                    memory = ''

                lines.append(f"docker run "
                             "--entrypoint=''"
                             f"{self._extra_docker_run_flags} "
                             f"-v {tmpdir}:{tmpdir} "
                             f"-w {tmpdir} "
                             f"{memory} "
                             f"{cpu} "
                             f"{job._image} "
                             f"{job_shell} -c {quoted_job_script}")
            else:
                lines.append(f"{job_shell} -c {quoted_job_script}")

            lines += [x for r in job._external_outputs for x in copy_external_output(r)]
            lines += ['\n']

        script = "\n".join(lines)

        if dry_run:
            print(script)
        else:
            try:
                sp.check_call(script, shell=True)
            except sp.CalledProcessError as e:
                print(e)
                print(e.output)
                raise
            finally:
                if delete_scratch_on_exit:
                    sp.run(f'rm -rf {tmpdir}', shell=True, check=False)

        print('Batch completed successfully!')
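A minimal sketch of the script-assembly pattern used by _run, assuming shq behaves like Python's shlex.quote: each job command is wrapped in { ... } and chained with &&, and the whole script is passed to the shell as a single quoted -c argument.

import shlex

def shq(s: str) -> str:
    # stand-in assumption: hailtop.utils.shq shell-quotes like shlex.quote
    return shlex.quote(s)

commands = ['echo preparing', 'echo "done" > result.txt']
# wrap each command in { ... } and chain with && so the job stops on failure
cmd = ' && '.join(f'{{\n{c}\n}}' for c in commands)
print(f'/bin/bash -c {shq(cmd)}')  # /bin/bash stands in for the default shell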
Example #4
    def _run(self, batch: 'batch.Batch', dry_run: bool, verbose: bool,
             delete_scratch_on_exit: bool, **backend_kwargs) -> None:  # pylint: disable=R0915
        """
        Execute a batch.

        Warning
        -------
        This method should not be called directly. Instead, use :meth:`.batch.Batch.run`.

        Parameters
        ----------
        batch:
            Batch to execute.
        dry_run:
            If `True`, don't execute code.
        verbose:
            If `True`, print debugging output.
        delete_scratch_on_exit:
            If `True`, delete temporary directories with intermediate files.
        """

        if backend_kwargs:
            raise ValueError(
                f'LocalBackend does not support any of these keywords: {backend_kwargs}'
            )

        tmpdir = self._get_scratch_dir()
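        # unlike Example #3 above, this version assembles and runs a separate
        # shell code block per job (plus one for the input transfers) via
        # run_code, rather than one monolithic script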

        def new_code_block():
            return [
                'set -e' + ('x' if verbose else ''), '\n',
                '# change to the tmp directory', f"cd {tmpdir}", '\n'
            ]

        def run_code(code):
            code = '\n'.join(code)
            if dry_run:
                print(code)
            else:
                try:
                    sp.check_call(code, shell=True)
                except sp.CalledProcessError as e:
                    print(e)
                    print(e.output)
                    raise

        copied_input_resource_files = set()
        os.makedirs(tmpdir + '/inputs/', exist_ok=True)

        requester_pays_project_json = orjson.dumps(
            batch.requester_pays_project).decode('utf-8')

        def copy_input(job, r):
            if isinstance(r, resource.InputResourceFile):
                if r not in copied_input_resource_files:
                    copied_input_resource_files.add(r)

                    input_scheme = url_scheme(r._input_path)
                    if input_scheme != '':
                        transfers_bytes = orjson.dumps(
                            [{"from": r._input_path, "to": r._get_path(tmpdir)}])
                        transfers = transfers_bytes.decode('utf-8')
                        return [
                            f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(transfers)}'
                        ]

                    absolute_input_path = os.path.realpath(
                        os.path.expanduser(r._input_path))

                    dest = r._get_path(os.path.expanduser(tmpdir))
                    dest_dir = os.path.dirname(dest)
                    os.makedirs(dest_dir, exist_ok=True)

                    if job._image is not None:  # pylint: disable-msg=W0640
                        return [f'cp {shq(absolute_input_path)} {shq(dest)}']

                    return [f'ln -sf {shq(absolute_input_path)} {shq(dest)}']

                return []

            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return []

        def symlink_input_resource_group(r):
            symlinks = []
            if isinstance(r, resource.ResourceGroup) and r._source is None:
                for name, irf in r._resources.items():
                    src = irf._get_path(tmpdir)
                    dest = f'{r._get_path(tmpdir)}.{name}'
                    symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
            return symlinks

        def transfer_dicts_for_resource_file(
            res_file: Union[resource.ResourceFile, resource.PythonResult]
        ) -> List[dict]:
            if isinstance(res_file, resource.InputResourceFile):
                source = res_file._input_path
            else:
                assert isinstance(res_file, (resource.JobResourceFile, resource.PythonResult))
                source = res_file._get_path(tmpdir)

            return [{"from": source, "to": dest}
                    for dest in res_file._output_paths]

        try:
            input_transfer_dicts = [
                transfer_dict for input_resource in batch._input_resources
                for transfer_dict in transfer_dicts_for_resource_file(
                    input_resource)
            ]

            if input_transfer_dicts:
                input_transfers = orjson.dumps(input_transfer_dicts).decode(
                    'utf-8')
                code = new_code_block()
                code += ["# Write input resources to output destinations"]
                code += [
                    f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(input_transfers)}'
                ]
                code += ['\n']
                run_code(code)

            for job in batch._jobs:
                async_to_blocking(job._compile(tmpdir, tmpdir))

                os.makedirs(f'{tmpdir}/{job._dirname}/', exist_ok=True)

                code = new_code_block()

                code.append(f"# {job._job_id}: {job.name if job.name else ''}")

                if job._user_code:
                    code.append('# USER CODE')
                    user_code = [
                        f'# {line}' for cmd in job._user_code
                        for line in cmd.split('\n')
                    ]
                    code.append('\n'.join(user_code))

                code += [x for r in job._inputs for x in copy_input(job, r)]
                code += [
                    x for r in job._mentioned
                    for x in symlink_input_resource_group(r)
                ]

                env = {**job._env, 'BATCH_TMPDIR': tmpdir}
                env_declarations = [f'export {k}={v}' for k, v in env.items()]
                joined_env = '; '.join(env_declarations) + '; ' if env else ''

                job_shell = job._shell if job._shell else DEFAULT_SHELL

                cmd = " && ".join(f'{{\n{x}\n}}' for x in job._wrapper_code)

                quoted_job_script = shq(joined_env + cmd)

                if job._image:
                    cpu = f'--cpus={job._cpu}' if job._cpu else ''

                    memory = job._memory
                    if memory is not None:
                        memory_ratios = {
                            'lowmem': 1024**3,
                            'standard': 4 * 1024**3,
                            'highmem': 7 * 1024**3
                        }
                        if memory in memory_ratios:
                            if job._cpu is not None:
                                mcpu = parse_cpu_in_mcpu(job._cpu)
                                if mcpu is not None:
                                    memory = str(
                                        int(memory_ratios[memory] *
                                            (mcpu / 1000)))
                                else:
                                    raise BatchException(
                                        f'invalid value for cpu: {job._cpu}')
                            else:
                                raise BatchException(
                                    f'must specify cpu when using {memory} to specify the memory'
                                )
                        memory = f'-m {memory}' if memory else ''
                    else:
                        memory = ''

                    code.append(f"docker run "
                                "--entrypoint=''"
                                f"{self._extra_docker_run_flags} "
                                f"-v {tmpdir}:{tmpdir} "
                                f"-w {tmpdir} "
                                f"{memory} "
                                f"{cpu} "
                                f"{job._image} "
                                f"{job_shell} -c {quoted_job_script}")
                else:
                    code.append(f"{job_shell} -c {quoted_job_script}")

                output_transfer_dicts = [
                    transfer_dict for output_resource in job._external_outputs
                    for transfer_dict in transfer_dicts_for_resource_file(
                        output_resource)
                ]
                output_transfers = orjson.dumps(output_transfer_dicts).decode(
                    'utf-8')

                code += [
                    f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(output_transfers)}'
                ]
                code += ['\n']

                run_code(code)
        finally:
            if delete_scratch_on_exit:
                sp.run(f'rm -rf {tmpdir}', shell=True, check=False)

        print('Batch completed successfully!')
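For reference (a sketch, not from the original source): the copy invocations built above pass two shell-quoted JSON arguments to python3 -m hailtop.aiotools.copy, the requester-pays project (null when unset) and a list of {"from": ..., "to": ...} transfers. A self-contained illustration with hypothetical paths:

import orjson
from shlex import quote as shq  # stand-in for hailtop.utils.shq

requester_pays_project_json = orjson.dumps(None).decode('utf-8')  # 'null' when unset
transfers = orjson.dumps([
    {"from": "gs://my-bucket/input.txt", "to": "/tmp/batch/inputs/input.txt"},
]).decode('utf-8')

print(f'python3 -m hailtop.aiotools.copy '
      f'{shq(requester_pays_project_json)} {shq(transfers)}')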