def handle_job(self, job):
    import kachery as ka
    # One duplex pipe per job: the child gets pipe_to_parent to report back on,
    # the parent keeps pipe_to_child to communicate with the worker
    pipe_to_parent, pipe_to_child = multiprocessing.Pipe()
    process = multiprocessing.Process(target=_pjh_run_job,
                                      args=(pipe_to_parent, job,
                                            ka.get_config()))
    # The process is not started here; it is queued for the handler to manage later
    self._processes.append(
        dict(job=job,
             process=process,
             pipe_to_child=pipe_to_child,
             pjh_status='pending'))
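The pipe pair here follows the standard multiprocessing pattern: the child is handed one end of the connection to report back on, while the parent keeps the other end to poll. A minimal self-contained sketch of the same pattern (the worker function and message shape are hypothetical stand-ins for hither's internal _pjh_run_job):

import multiprocessing

def _worker(pipe_to_parent, job):
    # Hypothetical stand-in for _pjh_run_job: do the work,
    # then report the result back through the pipe
    result = job['x'] * 2
    pipe_to_parent.send(dict(type='result', result=result))

if __name__ == '__main__':
    pipe_to_parent, pipe_to_child = multiprocessing.Pipe()
    process = multiprocessing.Process(target=_worker,
                                      args=(pipe_to_parent, dict(x=21)))
    process.start()
    # The parent polls its end of the pipe, as a job handler's iterate loop would
    if pipe_to_child.poll(5):
        print(pipe_to_child.recv())  # {'type': 'result', 'result': 42}
    process.join()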
Example 2
    def start(self) -> None:
        """Start the slurm process
        """
        import kachery as ka
        # This script is run by each worker (slurm task) in the batch
        srun_py_script = ShellScript("""
                #!/usr/bin/env python

                import os
                import time
                import json
                import random
                import traceback
                import kachery as ka
                from hither_sf import FileLock
                from hither_sf import _run_job, _deserialize_runnable_job

                working_dir = '{working_dir}'
                num_workers = {num_workers}
                running_fname = '{running_fname}'

                kachery_config = json.loads('{kachery_config_json}')
                try:
                    import kachery as ka
                    ka.set_config(**kachery_config)
                except:
                    pass

                slurm_started_fname = working_dir + '/slurm_started.txt'
                with FileLock(slurm_started_fname + '.lock', exclusive=True):
                    with open(slurm_started_fname, 'w') as f:
                        f.write('slurm is running.')

                # Let's claim a place and determine which worker number we are
                worker_num = None
                # wait a random amount of time before starting
                time.sleep(random.uniform(0, 0.1))
                for i in range(num_workers):
                    fname = working_dir + '/worker_{}_claimed.txt'.format(i)
                    if not os.path.exists(fname):
                        with FileLock(fname + '.lock', exclusive=True):
                            if not os.path.exists(fname):
                                with open(fname, 'w') as f:
                                    f.write('claimed')
                                worker_num = i
                                break
                if worker_num is None:
                    raise Exception('Unable to claim worker file.')

                job_fname = working_dir + '/worker_{}_job.json'.format(worker_num)
                result_fname = working_dir + '/worker_{}_result.json'.format(worker_num)

                try:  # We are going to catch any exceptions and report them back to the parent process
                    num_found = 0
                    num_exceptions = 0
                    while True:
                        # Check whether running file exists
                        try:
                            with FileLock(running_fname + '.lock', exclusive=False):
                                if not os.path.exists(running_fname):
                                    print('Stopping worker.')
                                    break
                        except:
                            if not os.path.exists(working_dir):
                                print('Working directory does not exist. Stopping worker.')
                                break
                            traceback.print_exc()
                            print('WARNING: Unexpected problem checking for running file in worker. Trying to continue.')
                            num_exceptions = num_exceptions + 1
                            if num_exceptions >= 5:
                                raise Exception('Problem checking for running file in worker. Too many exceptions. Aborting')
                            time.sleep(3)

                        # Check to see if we have a job to do
                        job_serialized = None
                        try:
                            with FileLock(job_fname + '.lock', exclusive=False):
                                if (os.path.exists(job_fname)) and not (os.path.exists(result_fname)):
                                    num_found = num_found + 1
                                    with open(job_fname, 'r') as f:
                                        job_serialized = json.load(f)
                        except:
                            traceback.print_exc()
                            print('WARNING: Unexpected problem loading job object file in worker. Trying to continue.')
                            num_exceptions = num_exceptions + 1
                            if num_exceptions >= 5:
                                raise Exception('Problem loading job object file in worker. Too many exceptions. Aborting')
                            time.sleep(3)
                        
                        # If we have a job to do, then let's do it
                        if job_serialized:
                            job = _deserialize_runnable_job(job_serialized)
                            _run_job(job)
                            result = job['result']
                            result_serialized = result.serialize()
                            with FileLock(result_fname + '.lock', exclusive=True):
                                with open(result_fname, 'w') as f:
                                    # Write the result
                                    json.dump(result_serialized, f)
                        time.sleep(0.2)
                except:
                    # report the exception back to the parent process by writing a _result.json.error file
                    with FileLock(result_fname + '.lock', exclusive=True):
                        with open(result_fname + ".error", 'w') as f:
                            f.write(traceback.format_exc())
            """,
                                     script_path=os.path.join(
                                         self._working_dir,
                                         'execute_batch_srun.py'))
        srun_py_script.substitute('{working_dir}', self._working_dir)
        srun_py_script.substitute('{num_workers}', self._num_workers)
        srun_py_script.substitute('{running_fname}',
                                  self._working_dir + '/running.txt')
        srun_py_script.substitute('{kachery_config_json}',
                                  json.dumps(ka.get_config()))
        srun_py_script.write()

        srun_opts = []
        srun_opts.extend(self._additional_srun_opts)
        srun_opts.append('-n {}'.format(self._num_workers))
        srun_opts.append('-c {}'.format(self._num_cores_per_job))
        if self._time_limit is not None:
            # srun --time takes minutes; the time limit here is in seconds
            srun_opts.append(
                '--time {}'.format(round(self._time_limit / 60) + 1))
        if self._use_slurm:
            # TODO: is this the right way to do it? Or should I use "exec srun..."
            srun_sh_script = ShellScript("""
                #!/bin/bash
                set -e

                _term() {
                    echo "Terminating srun process..."
                    kill -INT "$srun_pid" 2>/dev/null
                    # srun needs two signals
                    sleep 0.3
                    kill -INT "$srun_pid" 2>/dev/null
                }

                trap _term SIGINT SIGTERM

                export NUM_WORKERS={num_cores_per_job}
                export MKL_NUM_THREADS=$NUM_WORKERS
                export NUMEXPR_NUM_THREADS=$NUM_WORKERS
                export OMP_NUM_THREADS=$NUM_WORKERS

                export DISPLAY=""

                srun {srun_opts} {srun_py_script} &
                srun_pid=$!
                wait $srun_pid
            """,
                                         keep_temp_files=False)
            srun_sh_script.substitute('{srun_opts}', ' '.join(srun_opts))
            srun_sh_script.substitute('{srun_py_script}',
                                      srun_py_script.scriptPath())
            srun_sh_script.substitute('{num_cores_per_job}',
                                      self._num_cores_per_job)

            srun_sh_script.start()
            self._srun_sh_scripts = [srun_sh_script]
        else:
            self._srun_sh_scripts = []
            for _ in range(self._num_workers):
                srun_sh_script = ShellScript("""
                    #!/bin/bash
                    set -e

                    export NUM_WORKERS={num_cores_per_job}
                    export MKL_NUM_THREADS=$NUM_WORKERS
                    export NUMEXPR_NUM_THREADS=$NUM_WORKERS
                    export OMP_NUM_THREADS=$NUM_WORKERS

                    export DISPLAY=""

                    exec {srun_py_script}
                """,
                                             keep_temp_files=False)
                srun_sh_script.substitute('{srun_py_script}',
                                          srun_py_script.scriptPath())
                srun_sh_script.substitute('{num_cores_per_job}',
                                          self._num_cores_per_job)

                srun_sh_script.start()
                self._srun_sh_scripts.append(srun_sh_script)
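The worker-claim loop inside the srun script above is double-checked locking over the filesystem: an unlocked existence check first, then a re-check under an exclusive FileLock before writing the claim file, so concurrent slurm tasks cannot grab the same slot. A minimal sketch of just that pattern, assuming the FileLock context manager from hither_sf shown in the script:

import os
from hither_sf import FileLock

def claim_worker_slot(working_dir, num_workers):
    # Returns the claimed worker number, or None if every slot is taken
    for i in range(num_workers):
        fname = os.path.join(working_dir, 'worker_{}_claimed.txt'.format(i))
        if os.path.exists(fname):
            continue  # cheap check without taking the lock
        with FileLock(fname + '.lock', exclusive=True):
            if not os.path.exists(fname):  # re-check while holding the lock
                with open(fname, 'w') as f:
                    f.write('claimed')
                return i
    return None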
Example 3
def run_function_in_container(
        *,
        name: str,
        function,
        function_serialized: Union[Dict[str, Any], None],
        label: Union[str, None] = None,
        container: str,
        keyword_args: dict,
        input_file_keys: List[str],
        input_file_extensions: dict,
        output_file_keys: List[str],
        output_file_extensions: dict,
        additional_files: List[str] = [],
        local_modules: List[str] = [],
        gpu: bool = False,
        show_console: bool = True,
        timeout: Union[float, None] = None) -> Tuple[Union[Any, None], dict]:
    import kachery as ka

    if label is None:
        label = name

    # generate source code
    if function_serialized is None:
        if function is None:
            raise Exception(
                'Unexpected: function and function_serialized are both None for [{}]'
                .format(label))
        function_serialized = _serialize_runnable_function(
            function,
            name=name,
            additional_files=additional_files,
            local_modules=local_modules,
            container=container)

    code = function_serialized['code']
    container = function_serialized['container']

    remove = True
    if os.getenv('HITHER_DEBUG', None) == 'TRUE':
        remove = False
    with TemporaryDirectory(prefix='tmp_hither_run_in_container_' + name + '_',
                            remove=remove) as temp_path:
        _write_python_code_to_directory(
            os.path.join(temp_path, 'function_src'), code)

        keyword_args_adjusted = deepcopy(keyword_args)
        binds = dict()
        for iname in input_file_keys:
            if iname in keyword_args.keys():
                fname_outside = keyword_args[iname]
                if not _is_hash_url(fname_outside):
                    fname_inside = '/inputs/{}{}'.format(
                        iname, input_file_extensions[iname])
                    if container is not None:
                        keyword_args_adjusted[iname] = fname_inside
                        binds[fname_outside] = fname_inside
                    else:
                        keyword_args_adjusted[iname] = fname_outside
        outputs_tmp = os.path.join(temp_path, 'outputs')
        os.mkdir(outputs_tmp)
        binds[outputs_tmp] = '/outputs'
        outputs_to_copy = dict()
        for oname in output_file_keys:
            if oname in keyword_args.keys():
                fname_outside = keyword_args[oname]
                fname_inside = '/outputs/{}{}'.format(
                    oname, output_file_extensions[oname])
                fname_temp = '{}/{}{}'.format(outputs_tmp, oname,
                                              output_file_extensions[oname])
                if container is not None:
                    keyword_args_adjusted[oname] = fname_inside
                    outputs_to_copy[fname_temp] = fname_outside
                else:
                    keyword_args_adjusted[oname] = fname_outside

        if container is not None:
            run_in_container_path = '/run_in_container'
            env_vars_inside_container = dict(
                KACHERY_STORAGE_DIR='/kachery-storage',
                PYTHONPATH=
                f'{run_in_container_path}/function_src/_local_modules',
                HOME='$HOME')
        else:
            run_in_container_path = temp_path
            env_vars_inside_container = dict(
                PYTHONPATH=
                f'{run_in_container_path}/function_src/_local_modules')

        run_py_script = """
            #!/usr/bin/env python

            from function_src import {function_name}
            import sys
            import json
            import traceback
            from hither_sf import ConsoleCapture

            def main():
                _configure_kachery()
                kwargs = json.loads('{keyword_args_json}')
                with ConsoleCapture('{function_name}', show_console={show_console_str}) as cc:
                    print('###### RUNNING: {label}')
                    try:
                        retval = {function_name}(**kwargs)
                        status = 'finished'
                    except:
                        traceback.print_exc()
                        retval = None
                        status = 'error'
                
                runtime_info = cc.runtime_info()
                with open('{run_in_container_path}/result.json', 'w') as f:
                    json.dump(dict(retval=retval, status=status, runtime_info=runtime_info), f)
            
            def _configure_kachery():
                try:
                    import kachery as ka
                except:
                    return
                kachery_config = json.loads('{kachery_config_json}')
                ka.set_config(**kachery_config)

            if __name__ == "__main__":
                try:
                    main()
                except:
                    sys.stdout.flush()
                    sys.stderr.flush()
                    raise
        """.format(keyword_args_json=json.dumps(keyword_args_adjusted),
                   kachery_config_json=json.dumps(ka.get_config()),
                   function_name=name,
                   label=label,
                   show_console_str='True' if show_console else 'False',
                   run_in_container_path=run_in_container_path)

        # ShellScript unindents the embedded script before writing it to disk
        ShellScript(run_py_script).write(os.path.join(temp_path, 'run.py'))

        # See: https://wiki.bash-hackers.org/commands/builtin/exec
        run_inside_container_script = """
            #!/bin/bash
            set -e

            export NUM_WORKERS={num_workers_env}
            export MKL_NUM_THREADS=$NUM_WORKERS
            export NUMEXPR_NUM_THREADS=$NUM_WORKERS
            export OMP_NUM_THREADS=$NUM_WORKERS

            export {env_vars_inside_container}
            exec python3 {run_in_container_path}/run.py
        """.format(env_vars_inside_container=' '.join([
            '{}={}'.format(k, v) for k, v in env_vars_inside_container.items()
        ]),
                   num_workers_env=os.getenv('NUM_WORKERS', ''),
                   run_in_container_path=run_in_container_path)

        ShellScript(run_inside_container_script).write(
            os.path.join(temp_path, 'run.sh'))

        if not os.getenv('KACHERY_STORAGE_DIR'):
            raise Exception(
                'You must set the environment variable: KACHERY_STORAGE_DIR')

        docker_container_name = None

        # fancy_command = 'bash -c "((bash /run_in_container/run.sh | tee /run_in_container/stdout.txt) 3>&1 1>&2 2>&3 | tee /run_in_container/stderr.txt) 3>&1 1>&2 1>&3 | tee /run_in_container/console_out.txt"'
        if container is None:
            run_outside_container_script = """
                #!/bin/bash

                exec {run_in_container_path}/run.sh
            """.format(run_in_container_path=run_in_container_path)
        elif os.getenv('HITHER_USE_SINGULARITY', None) == 'TRUE':
            if gpu:
                gpu_opt = '--nv'
            else:
                gpu_opt = ''
            run_outside_container_script = """
                #!/bin/bash

                exec singularity exec -e {gpu_opt} \\
                    -B $KACHERY_STORAGE_DIR:/kachery-storage \\
                    -B {temp_path}:/run_in_container \\
                    {binds_str} \\
                    {container} \\
                    bash /run_in_container/run.sh
            """.format(gpu_opt=gpu_opt,
                       binds_str=' '.join([
                           '-B {}:{}'.format(a, b) for a, b in binds.items()
                       ]),
                       container=container,
                       temp_path=temp_path)
        else:
            if gpu:
                gpu_opt = '--gpus all'
            else:
                gpu_opt = ''
            docker_container_name = _random_string(8) + '_' + name
            # May not want to use -t below as it has the potential to mess up line feeds in the parent process!
            if (sys.platform == "win32"):
                winpath_ = lambda a: '/' + a.replace('\\', '/').replace(
                    ':', '')
                binds_str_ = ' '.join([
                    '-v {}:{}'.format(winpath_(a), b)
                    for a, b in binds.items()
                ])
                container_ = _docker_form_of_container_string(container)
                temp_path_ = winpath_(temp_path)
                kachery_storage_dir_ = winpath_(
                    os.getenv('KACHERY_STORAGE_DIR'))
                print('temp_path_: ' + temp_path_)
                run_outside_container_script = f'''
                    docker run --name {docker_container_name} -i {gpu_opt} ^
                    -v {kachery_storage_dir_}:/kachery-storage ^
                    -v {temp_path_}:/run_in_container ^
                    {binds_str_} ^
                    {container_} ^
                    bash /run_in_container/run.sh'''
            else:
                run_outside_container_script = """
                #!/bin/bash

                exec docker run --name {docker_container_name} -i {gpu_opt} \\
                    -v /etc/localtime:/etc/localtime:ro \\
                    -v /etc/passwd:/etc/passwd -u `id -u`:`id -g` \\
                    -v $KACHERY_STORAGE_DIR:/kachery-storage \\
                    -v {temp_path}:/run_in_container \\
                    -v /tmp:/tmp \\
                    -v $HOME:$HOME \\
                    {binds_str} \\
                    {container} \\
                    bash /run_in_container/run.sh
                """.format(
                    docker_container_name=docker_container_name,
                    gpu_opt=gpu_opt,
                    binds_str=' '.join(
                        ['-v {}:{}'.format(a, b) for a, b in binds.items()]),
                    container=_docker_form_of_container_string(container),
                    temp_path=temp_path)
        print('#############################################################')
        print(run_outside_container_script)
        print('#############################################################')

        ss = ShellScript(run_outside_container_script,
                         keep_temp_files=False,
                         label='run_outside_container',
                         docker_container_name=docker_container_name)
        ss.start()
        timer = time.time()
        did_timeout = False
        while True:
            retcode = ss.wait(1)
            if retcode is not None:
                break
            elapsed = time.time() - timer
            if timeout is not None:
                if elapsed > timeout:
                    print(f'Stopping job due to timeout {elapsed} > {timeout}')
                    did_timeout = True
                    ss.stop()

        if (retcode != 0) and (not did_timeout):
            raise Exception(
                'Non-zero exit code ({}) running [{}] in container {}'.format(
                    retcode, label, container))

        with open(os.path.join(temp_path, 'result.json')) as f:
            obj = json.load(f)
        retval = obj['retval']
        runtime_info = obj['runtime_info']
        status = obj['status']
        runtime_info['status'] = status

        if did_timeout:
            runtime_info['timed_out'] = True
            # Mark the timeout as an error in both status fields
            obj['status'] = 'error'
            runtime_info['status'] = 'error'
        else:
            runtime_info['timed_out'] = False

        if obj['status'] == 'error':
            pass
        else:
            for a, b in outputs_to_copy.items():
                shutil.copyfile(a, b)

        return retval, runtime_info
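For reference, a usage sketch of the signature above. The file names, the numpy dependency, and the container string are hypothetical; KACHERY_STORAGE_DIR must be set in the environment, and the extension dicts must cover every input/output file key:

def scale_vector(*, input_path, output_path, factor):
    import numpy as np  # must be available inside the container
    x = np.load(input_path)
    np.save(output_path, x * factor)

retval, runtime_info = run_function_in_container(
    name='scale_vector',
    function=scale_vector,
    function_serialized=None,  # serialized from `function` when None
    container='docker://python:3.8',  # hypothetical container string
    keyword_args=dict(input_path='in.npy', output_path='out.npy', factor=2.0),
    input_file_keys=['input_path'],
    input_file_extensions={'input_path': '.npy'},
    output_file_keys=['output_path'],
    output_file_extensions={'output_path': '.npy'},
    timeout=600)
print(runtime_info['status'])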
Example 4
def run_function_in_container(*,
        name: str, function,
        container: str,
        keyword_args: dict,
        input_file_keys: list,
        input_file_extensions: dict,
        output_file_keys: list,
        output_file_extensions: dict,
        additional_files: list=[],
        local_modules: list=[]
    ) -> Any:
    # generate source code
    with TemporaryDirectory(remove=True, prefix='tmp_hither_run_in_container_' + name) as temp_path:
        try:
            function_source_fname = os.path.abspath(inspect.getsourcefile(function))
        except:
            raise Exception('Unable to get source file for function {}. Cannot run in a container.'.format(name))

        function_source_dirname = os.path.dirname(function_source_fname)
        function_source_basename = os.path.basename(function_source_fname)
        function_source_basename_noext = os.path.splitext(function_source_basename)[0]
        code = _read_python_code_of_directory(
            function_source_dirname,
            additional_files=additional_files,
            exclude_init=True
        )
        code['files'].append(dict(
            name='__init__.py',
            content='from .{} import {}'.format(
                function_source_basename_noext, name)
        ))
        hither_dir = os.path.dirname(os.path.realpath(__file__))
        kachery_dir = os.path.dirname(os.path.realpath(__file__))
        local_module_paths = []
        for lm in local_modules:
            if os.path.isabs(lm):
                local_module_paths.append(lm)
            else:
                local_module_paths.append(os.path.join(function_source_dirname, lm))
        code['dirs'].append(dict(
            name='_local_modules',
            content=dict(
                files=[],
                dirs=[
                    dict(
                        name=os.path.basename(local_module_path),
                        content=_read_python_code_of_directory(os.path.join(function_source_dirname, local_module_path), exclude_init=False)
                    )
                    for local_module_path in local_module_paths + [hither_dir]
                ]
            )
        ))

        _write_python_code_to_directory(os.path.join(temp_path, 'function_src'), code)

        keyword_args_adjusted = deepcopy(keyword_args)
        binds = dict()
        for iname in input_file_keys:
            if iname in keyword_args.keys():
                fname_outside = keyword_args[iname]
                if not _is_hash_url(fname_outside):
                    fname_inside = '/inputs/{}{}'.format(iname, input_file_extensions[iname])
                    keyword_args_adjusted[iname] = fname_inside
                    binds[fname_outside] = fname_inside
        outputs_tmp = os.path.join(temp_path, 'outputs')
        os.mkdir(outputs_tmp)
        binds[outputs_tmp] = '/outputs'
        outputs_to_copy = dict()
        for oname in output_file_keys:
            if oname in keyword_args.keys():
                fname_outside = keyword_args[oname]
                fname_inside = '/outputs/{}{}'.format(oname, output_file_extensions[oname])
                fname_temp = '{}/{}{}'.format(outputs_tmp, oname, output_file_extensions[oname])
                keyword_args_adjusted[oname] = fname_inside
                outputs_to_copy[fname_temp] = fname_outside

        run_py_script = """
            #!/usr/bin/env python

            from function_src import {function_name}
            import sys
            import json

            def main():
                _configure_kachery()
                kwargs = json.loads('{keyword_args_json}')
                retval = {function_name}(**kwargs)
                with open('/run_in_container/retval.json', 'w') as f:
                    json.dump(dict(retval=retval), f)
            
            def _configure_kachery():
                try:
                    import kachery as ka
                except:
                    return
                kachery_config = json.loads('{kachery_config_json}')
                ka.set_config(**kachery_config)

            if __name__ == "__main__":
                try:
                    main()
                except:
                    sys.stdout.flush()
                    sys.stderr.flush()
                    raise
        """.format(
            keyword_args_json=json.dumps(keyword_args_adjusted),
            kachery_config_json=json.dumps(ka.get_config()),
            function_name=name
        )

        # ShellScript unindents the embedded script before writing it to disk
        ShellScript(run_py_script).write(os.path.join(temp_path, 'run.py'))

        env_vars_inside_container = dict(
            KACHERY_STORAGE_DIR='/kachery-storage',
            PYTHONPATH='/run_in_container/function_src/_local_modules',
            HOME='$HOME'
        )

        run_inside_script = """
            #!/bin/bash
            set -e

            {env_vars_inside_container} python3 /run_in_container/run.py
        """.format(
            env_vars_inside_container=' '.join(['{}={}'.format(k, v) for k, v in env_vars_inside_container.items()])
        )

        ShellScript(run_inside_script).write(os.path.join(temp_path, 'run.sh'))

        if not os.getenv('KACHERY_STORAGE_DIR'):
            raise Exception('You must set the environment variable: KACHERY_STORAGE_DIR')

        if os.getenv('HITHER_USE_SINGULARITY', None) == 'TRUE':
            run_outside_script = """
                #!/bin/bash

                singularity exec -e \\
                    -B $KACHERY_STORAGE_DIR:/kachery-storage \\
                    -B {temp_path}:/run_in_container \\
                    --nv \\
                    {binds_str} \\
                    {container} \\
                    bash /run_in_container/run.sh
            """.format(
                binds_str=' '.join(['-B {}:{}'.format(a, b) for a, b in binds.items()]),
                container=container,
                temp_path=temp_path
            )
        else:
            run_outside_script = """
                #!/bin/bash

                docker run -it \\
                    --gpus all \\
                    -v /etc/passwd:/etc/passwd -u `id -u`:`id -g` \\
                    -v $KACHERY_STORAGE_DIR:/kachery-storage \\
                    -v {temp_path}:/run_in_container \\
                    -v /tmp:/tmp \\
                    -v $HOME:$HOME \\
                    {binds_str} \\
                    {container} \\
                    bash /run_in_container/run.sh
            """.format(
                binds_str=' '.join(['-v {}:{}'.format(a, b) for a, b in binds.items()]),
                container=_docker_form_of_container_string(container),
                temp_path=temp_path
            )

        ss = ShellScript(run_outside_script, keep_temp_files=False)
        ss.start()
        retcode = ss.wait()

        if retcode != 0:
            raise Exception('Non-zero exit code ({}) running {} in container {}'.format(retcode, name, container))

        with open(os.path.join(temp_path, 'retval.json')) as f:
            obj = json.load(f)
        retval = obj['retval']

        for a, b in outputs_to_copy.items():
            shutil.copyfile(a, b)

        return retval
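Both variants branch on the same environment variables: KACHERY_STORAGE_DIR is required and is mounted into the container, while HITHER_USE_SINGULARITY='TRUE' selects singularity instead of docker. A minimal setup sketch (the storage path is a hypothetical example):

import os

# Required: where kachery keeps its content-addressed files
os.environ['KACHERY_STORAGE_DIR'] = '/tmp/kachery-storage'  # hypothetical path
# Optional: switch the container backend from docker to singularity
os.environ['HITHER_USE_SINGULARITY'] = 'TRUE'
# Optional (Example 3's variant only): keep the temporary run directory for debugging
os.environ['HITHER_DEBUG'] = 'TRUE'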