def handle_job(self, job):
    """Register *job* to be run in a dedicated child process.

    A fresh pipe pair is created: the first end is handed to the child
    (so it can report back), the second end is kept by the parent. The
    process is constructed here but not started; it is recorded in
    ``self._processes`` with status ``'pending'``.
    """
    import kachery as ka
    # Pipe ends: element 0 goes to the child process, element 1 stays here.
    parent_end, child_end = multiprocessing.Pipe()
    worker_process = multiprocessing.Process(
        target=_pjh_run_job,
        args=(parent_end, job, ka.get_config()),
    )
    record = dict(
        job=job,
        process=worker_process,
        pipe_to_child=child_end,
        pjh_status='pending',
    )
    self._processes.append(record)
def start(self) -> None:
    """Start the slurm batch (or local worker pool).

    Writes a Python worker script into the working directory, then launches
    it either through a single ``srun`` invocation (when ``self._use_slurm``
    is set) or as ``self._num_workers`` independent local processes. The
    ShellScript handles for whatever was launched are stored in
    ``self._srun_sh_scripts`` for later monitoring/stopping.
    """
    import kachery as ka
    # This script is run by each worker (slurm task) in the batch.
    # NOTE: the {named} placeholders are filled in below via
    # ShellScript.substitute (plain text replacement, not str.format); the
    # '{}'.format(...) calls inside the script execute later, on the worker.
    srun_py_script = ShellScript("""
        #!/usr/bin/env python

        import os
        import time
        import json
        import random
        import traceback
        import kachery as ka
        from hither_sf import FileLock
        from hither_sf import _run_job, _deserialize_runnable_job

        working_dir = '{working_dir}'
        num_workers = {num_workers}
        running_fname = '{running_fname}'
        kachery_config = json.loads('{kachery_config_json}')
        try:
            import kachery as ka
            ka.set_config(**kachery_config)
        except:
            pass

        slurm_started_fname = working_dir + '/slurm_started.txt'
        with FileLock(slurm_started_fname + '.lock', exclusive=True):
            with open(slurm_started_fname, 'w') as f:
                f.write('slurm is running.')

        # Let's claim a place and determine which worker number we are
        worker_num = None
        # wait a random amount of time before starting
        time.sleep(random.uniform(0, 0.1))
        for i in range(num_workers):
            fname = working_dir + '/worker_{}_claimed.txt'.format(i)
            if not os.path.exists(fname):
                with FileLock(fname + '.lock', exclusive=True):
                    if not os.path.exists(fname):
                        with open(fname, 'w') as f:
                            f.write('claimed')
                        worker_num = i
                        break
        if worker_num is None:
            raise Exception('Unable to claim worker file.')

        job_fname = working_dir + '/worker_{}_job.json'.format(worker_num)
        result_fname = working_dir + '/worker_{}_result.json'.format(worker_num)

        try:
            # We are going to catch any exceptions and report them back to the parent process
            num_found = 0
            num_exceptions = 0
            while True:
                # Check whether running file exists
                try:
                    with FileLock(running_fname + '.lock', exclusive=False):
                        if not os.path.exists(running_fname):
                            print('Stopping worker.')
                            break
                except:
                    if not os.path.exists(working_dir):
                        print('Working directory does not exist. Stopping worker.')
                        break
                    traceback.print_exc()
                    print('WARNING: Unexpected problem checking for running file in worker. Trying to continue.')
                    num_exceptions = num_exceptions + 1
                    if num_exceptions >= 5:
                        raise Exception('Problem checking for running file in worker. Too many exceptions. Aborting')
                    time.sleep(3)

                # Check to see if we have a job to do
                job_serialized = None
                try:
                    with FileLock(job_fname + '.lock', exclusive=False):
                        if (os.path.exists(job_fname)) and not (os.path.exists(result_fname)):
                            num_found = num_found + 1
                            with open(job_fname, 'r') as f:
                                job_serialized = json.load(f)
                except:
                    traceback.print_exc()
                    print('WARNING: Unexpected problem loading job object file in worker. Trying to continue.')
                    num_exceptions = num_exceptions + 1
                    if num_exceptions >= 5:
                        raise Exception('Problem loading job object file in worker. Too many exceptions. Aborting')
                    time.sleep(3)

                # If we have a job to do, then let's do it
                if job_serialized:
                    job = _deserialize_runnable_job(job_serialized)
                    _run_job(job)
                    result = job['result']
                    result_serialized = result.serialize()
                    with FileLock(result_fname + '.lock', exclusive=True):
                        with open(result_fname, 'w') as f:
                            # Write the result
                            json.dump(result_serialized, f)
                time.sleep(0.2)
        except:
            # report the exception back to the parent process by writing a _result.json.error file
            with FileLock(result_fname + '.lock', exclusive=True):
                with open(result_fname + ".error", 'w') as f:
                    f.write(traceback.format_exc())
    """, script_path=os.path.join(self._working_dir, 'execute_batch_srun.py'))
    srun_py_script.substitute('{working_dir}', self._working_dir)
    # NOTE(review): a non-str value is passed here -- presumably
    # ShellScript.substitute stringifies its argument; confirm.
    srun_py_script.substitute('{num_workers}', self._num_workers)
    srun_py_script.substitute('{running_fname}', self._working_dir + '/running.txt')
    srun_py_script.substitute('{kachery_config_json}', json.dumps(ka.get_config()))
    srun_py_script.write()

    # Assemble srun options: caller-supplied extras plus the number of
    # tasks (-n) and cores per task (-c).
    srun_opts = []
    srun_opts.extend(self._additional_srun_opts)
    srun_opts.append('-n {}'.format(self._num_workers))
    srun_opts.append('-c {}'.format(self._num_cores_per_job))
    if self._time_limit is not None:
        # assumes self._time_limit is in seconds while srun --time takes
        # minutes (hence /60, rounded up by one) -- TODO confirm units
        srun_opts.append('--time {}'.format(round(self._time_limit / 60) + 1))

    if self._use_slurm:
        # TODO: is this the right way to do it? Or should I use "exec srun..."
        srun_sh_script = ShellScript("""
            #!/bin/bash
            set -e

            _term() {
                echo "Terminating srun process..."
                kill -INT "$srun_pid" 2>/dev/null
                # srun needs two signals
                sleep 0.3
                kill -INT "$srun_pid" 2>/dev/null
            }
            trap _term SIGINT SIGTERM

            export NUM_WORKERS={num_cores_per_job}
            export MKL_NUM_THREADS=$NUM_WORKERS
            export NUMEXPR_NUM_THREADS=$NUM_WORKERS
            export OMP_NUM_THREADS=$NUM_WORKERS
            export DISPLAY=""

            srun {srun_opts} {srun_py_script} &
            srun_pid=$!
            wait $srun_pid
        """, keep_temp_files=False)
        srun_sh_script.substitute('{srun_opts}', ' '.join(srun_opts))
        srun_sh_script.substitute('{srun_py_script}', srun_py_script.scriptPath())
        srun_sh_script.substitute('{num_cores_per_job}', self._num_cores_per_job)
        srun_sh_script.start()
        self._srun_sh_scripts = [srun_sh_script]
    else:
        # Not using slurm: run one local copy of the worker script per worker.
        self._srun_sh_scripts = []
        for _ in range(self._num_workers):
            srun_sh_script = ShellScript("""
                #!/bin/bash
                set -e

                export NUM_WORKERS={num_cores_per_job}
                export MKL_NUM_THREADS=$NUM_WORKERS
                export NUMEXPR_NUM_THREADS=$NUM_WORKERS
                export OMP_NUM_THREADS=$NUM_WORKERS
                export DISPLAY=""

                exec {srun_py_script}
            """, keep_temp_files=False)
            srun_sh_script.substitute('{srun_py_script}', srun_py_script.scriptPath())
            srun_sh_script.substitute('{num_cores_per_job}', self._num_cores_per_job)
            srun_sh_script.start()
            self._srun_sh_scripts.append(srun_sh_script)
def run_function_in_container(
        *,
        name: str,
        function,
        function_serialized: Union[Dict[str, Any], None],
        label: Union[str, None] = None,
        container: str,
        keyword_args: dict,
        input_file_keys: List[str],
        input_file_extensions: dict,
        output_file_keys: List[str],
        output_file_extensions: dict,
        additional_files: List[str] = [],  # NOTE(review): mutable default -- read-only here, but prefer None sentinel
        local_modules: List[str] = [],  # NOTE(review): mutable default -- read-only here, but prefer None sentinel
        gpu: bool = False,
        show_console: bool = True,
        timeout: Union[float, None] = None) -> Tuple[Union[Any, None], dict]:
    """Run a serialized Python function inside a container (or directly).

    The function's source is materialized into a temp directory together
    with a generated run.py/run.sh pair; execution happens via docker,
    singularity (when HITHER_USE_SINGULARITY=TRUE), or directly on the
    host (when ``container`` resolves to None). Input/output file paths in
    ``keyword_args`` are remapped to bind-mounted container paths.

    Returns a tuple ``(retval, runtime_info)`` where ``retval`` is the
    function's return value (None on error/timeout) and ``runtime_info``
    is the dict produced by ConsoleCapture, augmented with 'status' and
    'timed_out' keys.

    Raises Exception when both ``function`` and ``function_serialized``
    are None, when KACHERY_STORAGE_DIR is unset, or when the container
    exits nonzero without having timed out.
    """
    import kachery as ka
    if label is None:
        label = name
    # generate source code (serialize the function on the fly if needed)
    if function_serialized is None:
        if function is None:
            raise Exception(
                'Unexpected: function and function_serialized are both None for [{}]'
                .format(label))
        function_serialized = _serialize_runnable_function(
            function,
            name=name,
            additional_files=additional_files,
            local_modules=local_modules,
            container=container)
    # The serialized form carries both the code and the (possibly rewritten)
    # container spec; the container may come back as None (= run on host).
    code = function_serialized['code']
    container = function_serialized['container']

    # Keep the temp directory around for post-mortem debugging when
    # HITHER_DEBUG=TRUE.
    remove = True
    if os.getenv('HITHER_DEBUG', None) == 'TRUE':
        remove = False
    with TemporaryDirectory(prefix='tmp_hither_run_in_container_' + name + '_', remove=remove) as temp_path:
        _write_python_code_to_directory(
            os.path.join(temp_path, 'function_src'), code)

        # Remap input-file kwargs to /inputs/... paths inside the container
        # and record the bind mounts. Hash URLs (kachery) are left as-is.
        keyword_args_adjusted = deepcopy(keyword_args)
        binds = dict()
        for iname in input_file_keys:
            if iname in keyword_args.keys():
                fname_outside = keyword_args[iname]
                if not _is_hash_url(fname_outside):
                    fname_inside = '/inputs/{}{}'.format(
                        iname, input_file_extensions[iname])
                    if container is not None:
                        keyword_args_adjusted[iname] = fname_inside
                        binds[fname_outside] = fname_inside
                    else:
                        keyword_args_adjusted[iname] = fname_outside
        # Outputs are written into a temp 'outputs' dir (bound to /outputs)
        # and copied to their requested destinations after a successful run.
        outputs_tmp = os.path.join(temp_path, 'outputs')
        os.mkdir(outputs_tmp)
        binds[outputs_tmp] = '/outputs'
        outputs_to_copy = dict()
        for oname in output_file_keys:
            if oname in keyword_args.keys():
                fname_outside = keyword_args[oname]
                fname_inside = '/outputs/{}{}'.format(
                    oname, output_file_extensions[oname])
                fname_temp = '{}/{}{}'.format(outputs_tmp, oname,
                                              output_file_extensions[oname])
                if container is not None:
                    keyword_args_adjusted[oname] = fname_inside
                    outputs_to_copy[fname_temp] = fname_outside
                else:
                    keyword_args_adjusted[oname] = fname_outside

        # Paths/env differ depending on whether we run inside a container
        # (fixed /run_in_container mount) or directly on the host.
        if container is not None:
            run_in_container_path = '/run_in_container'
            env_vars_inside_container = dict(
                KACHERY_STORAGE_DIR='/kachery-storage',
                PYTHONPATH=
                f'{run_in_container_path}/function_src/_local_modules',
                HOME='$HOME')
        else:
            run_in_container_path = temp_path
            env_vars_inside_container = dict(
                PYTHONPATH=
                f'{run_in_container_path}/function_src/_local_modules')

        # Generated entry point: runs the function under ConsoleCapture and
        # dumps retval/status/runtime_info to result.json. Note kwargs are
        # baked in as a JSON string literal.
        run_py_script = """
            #!/usr/bin/env python
            from function_src import {function_name}
            import sys
            import json
            import traceback
            from hither_sf import ConsoleCapture

            def main():
                _configure_kachery()
                kwargs = json.loads('{keyword_args_json}')
                with ConsoleCapture('{function_name}', show_console={show_console_str}) as cc:
                    print('###### RUNNING: {label}')
                    try:
                        retval = {function_name}(**kwargs)
                        status = 'finished'
                    except:
                        traceback.print_exc()
                        retval = None
                        status = 'error'
                runtime_info = cc.runtime_info()
                with open('{run_in_container_path}/result.json', 'w') as f:
                    json.dump(dict(retval=retval, status=status, runtime_info=runtime_info), f)

            def _configure_kachery():
                try:
                    import kachery as ka
                except:
                    return
                kachery_config = json.loads('{kachery_config_json}')
                ka.set_config(**kachery_config)

            if __name__ == "__main__":
                try:
                    main()
                except:
                    sys.stdout.flush()
                    sys.stderr.flush()
                    raise
        """.format(keyword_args_json=json.dumps(keyword_args_adjusted),
                   kachery_config_json=json.dumps(ka.get_config()),
                   function_name=name,
                   label=label,
                   show_console_str='True' if show_console else 'False',
                   run_in_container_path=run_in_container_path)

        # For unindenting
        ShellScript(run_py_script).write(os.path.join(temp_path, 'run.py'))

        # See: https://wiki.bash-hackers.org/commands/builtin/exec
        run_inside_container_script = """
            #!/bin/bash
            set -e

            export NUM_WORKERS={num_workers_env}
            export MKL_NUM_THREADS=$NUM_WORKERS
            export NUMEXPR_NUM_THREADS=$NUM_WORKERS
            export OMP_NUM_THREADS=$NUM_WORKERS
            export {env_vars_inside_container}
            exec python3 {run_in_container_path}/run.py
        """.format(env_vars_inside_container=' '.join([
            '{}={}'.format(k, v)
            for k, v in env_vars_inside_container.items()
        ]),
                   num_workers_env=os.getenv('NUM_WORKERS', ''),
                   run_in_container_path=run_in_container_path)
        ShellScript(run_inside_container_script).write(
            os.path.join(temp_path, 'run.sh'))

        if not os.getenv('KACHERY_STORAGE_DIR'):
            raise Exception(
                'You must set the environment variable: KACHERY_STORAGE_DIR')

        docker_container_name = None

        # fancy_command = 'bash -c "((bash /run_in_container/run.sh | tee /run_in_container/stdout.txt) 3>&1 1>&2 2>&3 | tee /run_in_container/stderr.txt) 3>&1 1>&2 1>&3 | tee /run_in_container/console_out.txt"'
        if container is None:
            # No container: just run the generated script directly.
            run_outside_container_script = """
                #!/bin/bash
                exec {run_in_container_path}/run.sh
            """.format(run_in_container_path=run_in_container_path)
        elif os.getenv('HITHER_USE_SINGULARITY', None) == 'TRUE':
            if gpu:
                gpu_opt = '--nv'
            else:
                gpu_opt = ''
            run_outside_container_script = """
                #!/bin/bash
                exec singularity exec -e {gpu_opt} \\
                    -B $KACHERY_STORAGE_DIR:/kachery-storage \\
                    -B {temp_path}:/run_in_container \\
                    {binds_str} \\
                    {container} \\
                    bash /run_in_container/run.sh
            """.format(gpu_opt=gpu_opt,
                       binds_str=' '.join([
                           '-B {}:{}'.format(a, b) for a, b in binds.items()
                       ]),
                       container=container,
                       temp_path=temp_path)
        else:
            if gpu:
                gpu_opt = '--gpus all'
            else:
                gpu_opt = ''
            docker_container_name = _random_string(8) + '_' + name
            # May not want to use -t below as it has the potential to mess up line feeds in the parent process!
            if (sys.platform == "win32"):
                # Docker Desktop on Windows wants /c/... style paths for -v.
                winpath_ = lambda a: '/' + a.replace('\\', '/').replace(
                    ':', '')
                binds_str_ = ' '.join([
                    '-v {}:{}'.format(winpath_(a), b)
                    for a, b in binds.items()
                ])
                container_ = _docker_form_of_container_string(container)
                temp_path_ = winpath_(temp_path)
                kachery_storage_dir_ = winpath_(
                    os.getenv('KACHERY_STORAGE_DIR'))
                print('temp_path_: ' + temp_path_)
                run_outside_container_script = f'''
                docker run --name {docker_container_name} -i {gpu_opt} ^
                -v {kachery_storage_dir_}:/kachery-storage ^
                -v {temp_path_}:/run_in_container ^
                {binds_str_} ^
                {container_} ^
                bash /run_in_container/run.sh'''
            else:
                run_outside_container_script = """
                    #!/bin/bash
                    exec docker run --name {docker_container_name} -i {gpu_opt} \\
                        -v /etc/localtime:/etc/localtime:ro \\
                        -v /etc/passwd:/etc/passwd -u `id -u`:`id -g` \\
                        -v $KACHERY_STORAGE_DIR:/kachery-storage \\
                        -v {temp_path}:/run_in_container \\
                        -v /tmp:/tmp \\
                        -v $HOME:$HOME \\
                        {binds_str} \\
                        {container} \\
                        bash /run_in_container/run.sh
                """.format(
                    docker_container_name=docker_container_name,
                    gpu_opt=gpu_opt,
                    binds_str=' '.join(
                        ['-v {}:{}'.format(a, b) for a, b in binds.items()]),
                    container=_docker_form_of_container_string(container),
                    temp_path=temp_path)
        print('#############################################################')
        print(run_outside_container_script)
        print('#############################################################')

        ss = ShellScript(run_outside_container_script,
                         keep_temp_files=False,
                         label='run_outside_container',
                         docker_container_name=docker_container_name)
        ss.start()
        # Poll once per second so we can enforce the timeout ourselves.
        timer = time.time()
        did_timeout = False
        while True:
            retcode = ss.wait(1)
            if retcode is not None:
                break
            elapsed = time.time() - timer
            if timeout is not None:
                if elapsed > timeout:
                    print(f'Stopping job due to timeout {elapsed} > {timeout}')
                    did_timeout = True
                    ss.stop()
        if (retcode != 0) and (not did_timeout):
            raise Exception(
                'Non-zero exit code ({}) running [{}] in container {}'.format(
                    retcode, label, container))

        # NOTE(review): if the job timed out, the container may have been
        # killed before result.json was written, making this open() raise
        # -- confirm whether that is handled upstream.
        with open(os.path.join(temp_path, 'result.json')) as f:
            obj = json.load(f)
        retval = obj['retval']
        runtime_info = obj['runtime_info']
        status = obj['status']
        runtime_info['status'] = status
        if did_timeout:
            runtime_info['timed_out'] = True
            obj['status'] = 'error'
        else:
            runtime_info['timed_out'] = False

        # Only copy outputs to their final destinations on success.
        if obj['status'] == 'error':
            pass
        else:
            for a, b in outputs_to_copy.items():
                shutil.copyfile(a, b)

        return retval, runtime_info
def run_function_in_container(*,
                              name: str,
                              function,
                              container: str,
                              keyword_args: dict,
                              input_file_keys: list,
                              input_file_extensions: dict,
                              output_file_keys: list,
                              output_file_extensions: dict,
                              additional_files: list = None,
                              local_modules: list = None) -> Any:
    """Run *function* inside *container* (docker or singularity).

    The function's source directory is copied into a temp directory along
    with a generated run.py/run.sh pair; input/output file paths in
    ``keyword_args`` are remapped to bind-mounted container paths. After
    the container exits, output files are copied to their requested
    destinations and the function's return value (read back from
    retval.json) is returned.

    Raises Exception when the function's source file cannot be determined,
    when KACHERY_STORAGE_DIR is unset, or when the container exits with a
    nonzero code.
    """
    # Avoid mutable default arguments; None means "no extra files/modules".
    if additional_files is None:
        additional_files = []
    if local_modules is None:
        local_modules = []
    # generate source code
    with TemporaryDirectory(remove=True, prefix='tmp_hither_run_in_container_' + name) as temp_path:
        try:
            function_source_fname = os.path.abspath(inspect.getsourcefile(function))
        except:
            # BUGFIX: previously `raise('...')` which raises TypeError
            # (exceptions must derive from BaseException), masking the
            # intended message.
            raise Exception('Unable to get source file for function {}. Cannot run in a container.'.format(name))
        function_source_dirname = os.path.dirname(function_source_fname)
        function_source_basename = os.path.basename(function_source_fname)
        function_source_basename_noext = os.path.splitext(function_source_basename)[0]
        code = _read_python_code_of_directory(
            function_source_dirname,
            additional_files=additional_files,
            exclude_init=True
        )
        # Synthesize an __init__.py that re-exports the target function.
        code['files'].append(dict(
            name='__init__.py',
            content='from .{} import {}'.format(
                function_source_basename_noext, name)
        ))
        hither_dir = os.path.dirname(os.path.realpath(__file__))
        # Resolve user-specified local modules relative to the function's
        # source directory; function_source_fname is absolute, so these
        # resulting paths are absolute too.
        local_module_paths = []
        for lm in local_modules:
            if os.path.isabs(lm):
                local_module_paths.append(lm)
            else:
                local_module_paths.append(os.path.join(function_source_dirname, lm))
        # hither itself is always shipped alongside the user's local modules.
        # NOTE(review): the inner os.path.join is redundant (the paths are
        # already absolute, so join returns them unchanged) -- kept for
        # behavioral parity.
        code['dirs'].append(dict(
            name='_local_modules',
            content=dict(
                files=[],
                dirs=[
                    dict(
                        name=os.path.basename(local_module_path),
                        content=_read_python_code_of_directory(os.path.join(function_source_dirname, local_module_path), exclude_init=False)
                    )
                    for local_module_path in local_module_paths + [hither_dir]
                ]
            )
        ))
        _write_python_code_to_directory(os.path.join(temp_path, 'function_src'), code)

        # Remap input-file kwargs to /inputs/... paths inside the container
        # and record the bind mounts. Hash URLs (kachery) are left as-is.
        keyword_args_adjusted = deepcopy(keyword_args)
        binds = dict()
        for iname in input_file_keys:
            if iname in keyword_args.keys():
                fname_outside = keyword_args[iname]
                if not _is_hash_url(fname_outside):
                    fname_inside = '/inputs/{}{}'.format(iname, input_file_extensions[iname])
                    keyword_args_adjusted[iname] = fname_inside
                    binds[fname_outside] = fname_inside
        # Outputs are written into a temp 'outputs' dir (bound to /outputs)
        # and copied to their requested destinations after the run.
        outputs_tmp = os.path.join(temp_path, 'outputs')
        os.mkdir(outputs_tmp)
        binds[outputs_tmp] = '/outputs'
        outputs_to_copy = dict()
        for oname in output_file_keys:
            if oname in keyword_args.keys():
                fname_outside = keyword_args[oname]
                fname_inside = '/outputs/{}{}'.format(oname, output_file_extensions[oname])
                fname_temp = '{}/{}{}'.format(outputs_tmp, oname, output_file_extensions[oname])
                keyword_args_adjusted[oname] = fname_inside
                outputs_to_copy[fname_temp] = fname_outside
        # Generated entry point: runs the function and dumps its return
        # value to retval.json. Note kwargs are baked in as a JSON literal.
        run_py_script = """
            #!/usr/bin/env python
            from function_src import {function_name}
            import sys
            import json

            def main():
                _configure_kachery()
                kwargs = json.loads('{keyword_args_json}')
                retval = {function_name}(**kwargs)
                with open('/run_in_container/retval.json', 'w') as f:
                    json.dump(dict(retval=retval), f)

            def _configure_kachery():
                try:
                    import kachery as ka
                except:
                    return
                kachery_config = json.loads('{kachery_config_json}')
                ka.set_config(**kachery_config)

            if __name__ == "__main__":
                try:
                    main()
                except:
                    sys.stdout.flush()
                    sys.stderr.flush()
                    raise
        """.format(
            keyword_args_json=json.dumps(keyword_args_adjusted),
            kachery_config_json=json.dumps(ka.get_config()),
            function_name=name
        )  # For unindenting
        ShellScript(run_py_script).write(os.path.join(temp_path, 'run.py'))

        env_vars_inside_container = dict(
            KACHERY_STORAGE_DIR='/kachery-storage',
            PYTHONPATH='/run_in_container/function_src/_local_modules',
            HOME='$HOME'
        )
        run_inside_script = """
            #!/bin/bash
            set -e

            {env_vars_inside_container} python3 /run_in_container/run.py
        """.format(
            env_vars_inside_container=' '.join(['{}={}'.format(k, v) for k, v in env_vars_inside_container.items()])
        )
        ShellScript(run_inside_script).write(os.path.join(temp_path, 'run.sh'))

        if not os.getenv('KACHERY_STORAGE_DIR'):
            raise Exception('You must set the environment variable: KACHERY_STORAGE_DIR')

        if os.getenv('HITHER_USE_SINGULARITY', None) == 'TRUE':
            run_outside_script = """
                #!/bin/bash
                singularity exec -e \\
                    -B $KACHERY_STORAGE_DIR:/kachery-storage \\
                    -B {temp_path}:/run_in_container \\
                    --nv \\
                    {binds_str} \\
                    {container} \\
                    bash /run_in_container/run.sh
            """.format(
                binds_str=' '.join(['-B {}:{}'.format(a, b) for a, b in binds.items()]),
                container=container,
                temp_path=temp_path
            )
        else:
            run_outside_script = """
                #!/bin/bash
                docker run -it \\
                    --gpus all \\
                    -v /etc/passwd:/etc/passwd -u `id -u`:`id -g` \\
                    -v $KACHERY_STORAGE_DIR:/kachery-storage \\
                    -v {temp_path}:/run_in_container \\
                    -v /tmp:/tmp \\
                    -v $HOME:$HOME \\
                    {binds_str} \\
                    {container} \\
                    bash /run_in_container/run.sh
            """.format(
                binds_str=' '.join(['-v {}:{}'.format(a, b) for a, b in binds.items()]),
                container=_docker_form_of_container_string(container),
                temp_path=temp_path
            )
        ss = ShellScript(run_outside_script, keep_temp_files=False)
        ss.start()
        retcode = ss.wait()
        if retcode != 0:
            raise Exception('Non-zero exit code ({}) running {} in container {}'.format(retcode, name, container))

        # Read back the function's return value written by run.py.
        with open(os.path.join(temp_path, 'retval.json')) as f:
            obj = json.load(f)
        retval = obj['retval']

        # Move output files from the temp outputs dir to their destinations.
        for a, b in outputs_to_copy.items():
            shutil.copyfile(a, b)

        return retval