def build_runtime(self, runtime_name, requirements_file=None):
    try:
        shutil.rmtree(az_config.BUILD_DIR)
    except Exception:
        pass

    action_name = self._format_action_name(runtime_name)

    build_dir = os.path.join(az_config.BUILD_DIR, action_name)
    os.makedirs(build_dir, exist_ok=True)
    logger.info('Building default runtime in {}'.format(build_dir))

    action_dir = os.path.join(build_dir, az_config.ACTION_DIR)
    os.makedirs(action_dir, exist_ok=True)

    req_file = os.path.join(build_dir, 'requirements.txt')
    with open(req_file, 'w') as reqf:
        reqf.write(az_config.REQUIREMENTS_FILE)
        if not is_unix_system():
            if 'dev' in lithops.__version__:
                reqf.write('git+https://github.com/lithops-cloud/lithops')
            else:
                reqf.write('lithops=={}'.format(lithops.__version__))

    host_file = os.path.join(build_dir, 'host.json')
    with open(host_file, 'w') as hstf:
        hstf.write(az_config.HOST_FILE)

    fn_file = os.path.join(action_dir, 'function.json')
    if self.invocation_type == 'event':
        with open(fn_file, 'w') as fnf:
            in_q_name = self._format_queue_name(action_name, az_config.IN_QUEUE)
            az_config.BINDINGS_QUEUE['bindings'][0]['queueName'] = in_q_name
            out_q_name = self._format_queue_name(action_name, az_config.OUT_QUEUE)
            az_config.BINDINGS_QUEUE['bindings'][1]['queueName'] = out_q_name
            fnf.write(json.dumps(az_config.BINDINGS_QUEUE))
    elif self.invocation_type == 'http':
        with open(fn_file, 'w') as fnf:
            fnf.write(json.dumps(az_config.BINDINGS_HTTP))

    entry_point = os.path.join(os.path.dirname(__file__), 'entry_point.py')
    main_file = os.path.join(action_dir, '__init__.py')
    shutil.copy(entry_point, main_file)

    if is_unix_system():
        mod_dir = os.path.join(build_dir, az_config.ACTION_MODULES_DIR)
        os.chdir(build_dir)
        cmd = '{} -m pip install -U -t {} -r requirements.txt'.format(sys.executable, mod_dir)
        if logger.getEffectiveLevel() != logging.DEBUG:
            cmd = cmd + " >{} 2>&1".format(os.devnull)
        os.system(cmd)
        create_handler_zip(az_config.FH_ZIP_LOCATION, entry_point, '__init__.py')
        archive = zipfile.ZipFile(az_config.FH_ZIP_LOCATION)
        archive.extractall(path=mod_dir)
        os.remove(mod_dir + '/__init__.py')
        os.remove(az_config.FH_ZIP_LOCATION)
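# The build directory produced above ends up with the following layout
# (illustrative; the actual folder names come from az_config constants):
#
#   <BUILD_DIR>/<action_name>/
#       host.json                 (az_config.HOST_FILE)
#       requirements.txt          (az_config.REQUIREMENTS_FILE, plus a lithops pin on Windows)
#       <ACTION_DIR>/
#           __init__.py           (copy of entry_point.py)
#           function.json         (queue or HTTP bindings)
#       <ACTION_MODULES_DIR>/     (pip-installed dependencies, Unix builds only)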
def __init__(self, config, executor_id, job_id, log_level):
    logging.basicConfig(filename=log_file, level=log_level)
    self.log_active = logger.getEffectiveLevel() != logging.WARNING
    self.config = config
    self.queue = Queue()
    self.use_threads = not is_unix_system()
    self.num_workers = self.config['lithops'].get('workers', CPU_COUNT)
    self.workers = []

    sys.stdout = open(log_file, 'a')
    sys.stderr = open(log_file, 'a')

    if self.use_threads:
        for worker_id in range(self.num_workers):
            p = Thread(target=self._process_runner, args=(worker_id,))
            self.workers.append(p)
            p.start()
    else:
        for worker_id in range(self.num_workers):
            p = Process(target=self._process_runner, args=(worker_id,))
            self.workers.append(p)
            p.start()

    logger.info('ExecutorID {} | JobID {} - Localhost Executor started - {} workers'
                .format(executor_id, job_id, self.num_workers))
def __init__(self, config, executor_id, internal_storage, compute_handler):
    super().__init__(config, executor_id, internal_storage, compute_handler)

    self.remote_invoker = self.config['serverless'].get('remote_invoker', False)
    self.use_threads = (self.is_lithops_worker
                        or not is_unix_system()
                        or mp.get_start_method() != 'fork')
    self.invokers = []
    self.ongoing_activations = 0

    if self.use_threads:
        self.token_bucket_q = queue.Queue()
        self.pending_calls_q = queue.Queue()
        self.running_flag = SimpleNamespace(value=0)
        self.INVOKER = Thread
    else:
        self.token_bucket_q = mp.Queue()
        self.pending_calls_q = mp.Queue()
        self.running_flag = mp.Value('i', 0)
        self.INVOKER = mp.Process

    self.job_monitor = JobMonitor(self.config, self.internal_storage, self.token_bucket_q)

    logger.debug('ExecutorID {} - Serverless invoker created'.format(self.executor_id))
def __init__(self, config, executor_id, internal_storage, compute_handler):
    self.log_active = logger.getEffectiveLevel() != logging.WARNING
    self.config = config
    self.executor_id = executor_id
    self.storage_config = extract_storage_config(self.config)
    self.internal_storage = internal_storage
    self.compute_handler = compute_handler
    self.is_lithops_worker = is_lithops_worker()
    self.invokers = []
    self.remote_invoker = self.config['serverless'].get('remote_invoker', False)

    self.workers = self.config['lithops'].get('workers')
    logger.debug('ExecutorID {} - Total available workers: {}'
                 .format(self.executor_id, self.workers))

    if not is_lithops_worker() and is_unix_system():
        self.token_bucket_q = multiprocessing.Queue()
        self.pending_calls_q = multiprocessing.Queue()
        self.running_flag = multiprocessing.Value('i', 0)
    else:
        self.token_bucket_q = queue.Queue()
        self.pending_calls_q = queue.Queue()
        self.running_flag = SimpleNamespace(value=0)

    self.ongoing_activations = 0

    self.job_monitor = JobMonitor(self.config, self.internal_storage, self.token_bucket_q)

    logger.debug('ExecutorID {} - Serverless invoker created'.format(self.executor_id))
def __init__(self, local_config):
    self.log_active = logger.getEffectiveLevel() != logging.WARNING
    self.config = local_config
    self.name = 'local'
    self.alive = True
    self.queue = Queue()
    self.logs_dir = os.path.join(STORAGE_FOLDER, LOGS_PREFIX)
    self.num_workers = self.config['workers']
    self.use_threads = not is_unix_system()
    self.workers = []

    atexit.register(self.close)

    if self.use_threads:
        for worker_id in range(self.num_workers):
            p = Thread(target=self._process_runner, args=(worker_id,))
            self.workers.append(p)
            p.daemon = True
            p.start()
    else:
        for worker_id in range(self.num_workers):
            p = Process(target=self._process_runner, args=(worker_id,))
            self.workers.append(p)
            p.start()

    log_msg = 'Lithops v{} init for Localhost - Total workers: {}'.format(
        __version__, self.num_workers)
    logger.info(log_msg)
    if not self.log_active:
        print(log_msg)
def __init__(self, config, executor_id, job_id):
    self.config = config
    self.executor_id = executor_id
    self.job_id = job_id
    self.use_threads = not is_unix_system()
    self.num_workers = self.config['lithops'].get('workers', CPU_COUNT)
    self.workers = []

    if self.use_threads:
        self.queue = queue.Queue()
        WORKER = Thread
        WORKER_RUNNER = self._thread_runner
    else:
        if 'fork' in mp.get_all_start_methods():
            mp.set_start_method('fork')
        self.queue = mp.Queue()
        WORKER = mp.Process
        WORKER_RUNNER = self._process_runner

    for worker_id in range(self.num_workers):
        p = WORKER(target=WORKER_RUNNER, args=(worker_id,))
        self.workers.append(p)
        p.start()

    logger.info('ExecutorID {} | JobID {} - Localhost runner started '
                '- {} workers'.format(self.executor_id, self.job_id, self.num_workers))
def _create_function(self, docker_image_name, memory, timeout):
    """
    Creates and publishes an Azure Function
    """
    action_name = self._format_action_name(docker_image_name, memory)
    logger.info('Creating new Lithops runtime for Azure Function: {}'.format(action_name))

    if self.invocation_type == 'event':
        try:
            in_q_name = self._format_queue_name(action_name, az_config.IN_QUEUE)
            logger.debug('Creating queue {}'.format(in_q_name))
            self.queue_service.create_queue(in_q_name)
        except Exception:
            in_queue = self.queue_service.get_queue_client(in_q_name)
            in_queue.clear_messages()
        try:
            out_q_name = self._format_queue_name(action_name, az_config.OUT_QUEUE)
            logger.debug('Creating queue {}'.format(out_q_name))
            self.queue_service.create_queue(out_q_name)
        except Exception:
            out_queue = self.queue_service.get_queue_client(out_q_name)
            out_queue.clear_messages()

    python_version = version_str(sys.version_info)
    cmd = ('az functionapp create --name {} --storage-account {} '
           '--resource-group {} --os-type Linux --runtime python '
           '--runtime-version {} --functions-version {} '
           '--consumption-plan-location {}'
           .format(action_name, self.storage_account_name, self.resource_group,
                   python_version, self.functions_version, self.location))
    if logger.getEffectiveLevel() != logging.DEBUG:
        cmd = cmd + " >{} 2>&1".format(os.devnull)

    res = os.system(cmd)
    if res != 0:
        raise Exception('There was an error creating the function in Azure. '
                        'cmd: {}'.format(cmd))

    logger.debug('Publishing function: {}'.format(action_name))
    build_dir = os.path.join(az_config.BUILD_DIR, action_name)
    os.chdir(build_dir)

    # Retry publishing until the Azure Functions Core Tools CLI reports success
    res = 1
    while res != 0:
        time.sleep(5)
        if is_unix_system():
            cmd = 'func azure functionapp publish {} --python --no-build'.format(action_name)
        else:
            cmd = 'func azure functionapp publish {} --python'.format(action_name)
        if logger.getEffectiveLevel() != logging.DEBUG:
            cmd = cmd + " >{} 2>&1".format(os.devnull)
        res = os.system(cmd)

    time.sleep(10)
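# With hypothetical values plugged in (action name, storage account, resource
# group and region are all placeholders), the creation command assembled above
# expands to something like:
#
#   az functionapp create --name lithops-runtime-2048 \
#       --storage-account mystorageacct --resource-group myrg \
#       --os-type Linux --runtime python --runtime-version 3.8 \
#       --functions-version 3 --consumption-plan-location westeurope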
def run(self, job_payload, job_filename):
    """
    Runs a job
    """
    executor_id = job_payload['executor_id']
    job_id = job_payload['job_id']
    total_calls = len(job_payload['call_ids'])
    job_key = job_payload['job_key']

    logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Running '
                 f'{total_calls} activations in the localhost worker')

    if not os.path.isfile(RUNNER):
        self.setup()

    tmp_path = Path(TEMP).as_posix()
    cmd = f'docker run --name lithops_{job_key} '
    cmd += f'--user {self.uid}:{self.gid} ' if is_unix_system() else ''
    cmd += (f'--rm -v {tmp_path}:/tmp --entrypoint "python3" '
            f'{self.runtime} /tmp/lithops/runner.py run {job_filename}')

    log = open(RN_LOG_FILE, 'a')
    process = sp.Popen(shlex.split(cmd), stdout=log, stderr=log,
                       start_new_session=True)
    self.jobs[job_key] = process

    return process
def __init__(self, config):
    self.config = config
    self.use_threads = not is_unix_system()
    self.num_workers = self.config['lithops'].get('workers', CPU_COUNT)
    self.workers = []

    log_file_stream = open(LH_LOG_FILE, 'a')
    sys.stdout = log_file_stream
    sys.stderr = log_file_stream

    if self.use_threads:
        self.queue = queue.Queue()
        for worker_id in range(self.num_workers):
            p = Thread(target=self._process_runner, args=(worker_id,))
            self.workers.append(p)
            p.start()
    else:
        self.queue = mp.Queue()
        for worker_id in range(self.num_workers):
            p = mp.Process(target=self._process_runner, args=(worker_id,))
            self.workers.append(p)
            p.start()

    logger.info('Localhost Executor started - {} workers'.format(self.num_workers))
def get_execution_cmd(self, runtime):
    if is_unix_system():
        cmd = ('docker run --user $(id -u):$(id -g) --rm -v {}:/tmp --entrypoint '
               '"python3" {} /tmp/lithops/runner.py'.format(TEMP, self.runtime))
    else:
        cmd = ('docker run --rm -v {}:/tmp --entrypoint "python3" {} '
               '/tmp/lithops/runner.py'.format(TEMP, self.runtime))
    return cmd
def get_execution_cmd(self, runtime):
    if is_unix_system():
        cmd = (f'docker run --user $(id -u):$(id -g) --rm -v {TEMP}:/tmp --entrypoint '
               f'"python3" {self.runtime} /tmp/lithops/runner.py')
    else:
        cmd = (f'docker run --rm -v {TEMP}:/tmp --entrypoint "python3" '
               f'{self.runtime} /tmp/lithops/runner.py')
    return cmd
def kill_job(self, job_key):
    if self.jobs[job_key].poll() is None:
        logger.debug(f'Killing job {job_key} with PID {self.jobs[job_key].pid}')
        PID = self.jobs[job_key].pid
        if is_unix_system():
            # The runner was started in its own session, so the whole
            # process group can be killed at once
            PGID = os.getpgid(PID)
            os.killpg(PGID, signal.SIGKILL)
        else:
            os.kill(PID, signal.SIGTERM)
    del self.jobs[job_key]
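# Why the Unix branch above can use killpg: run() launches the container
# runner with start_new_session=True, which places it in its own process
# group, so killing that group also reaps any children it forked. A
# standalone sketch of the same pattern ('sleep 60' is just a stand-in
# for the runner command):
import os
import shlex
import signal
import subprocess as sp

proc = sp.Popen(shlex.split('sleep 60'), start_new_session=True)
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)  # kills the whole group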
def _start_invoker_process(self):
    """
    Starts the invoker processes responsible for spawning pending calls
    in the background
    """
    if self.is_lithops_function or not is_unix_system():
        for inv_id in range(INVOKER_PROCESSES):
            p = Thread(target=self._run_invoker_process, args=(inv_id,))
            self.invokers.append(p)
            p.daemon = True
            p.start()
    else:
        for inv_id in range(INVOKER_PROCESSES):
            p = Process(target=self._run_invoker_process, args=(inv_id,))
            self.invokers.append(p)
            p.daemon = True
            p.start()
def preinstalls(self):
    if not os.path.isfile(RUNNER):
        self.setup()

    tmp_path = Path(TEMP).as_posix()
    cmd = 'docker run '
    cmd += f'--user {self.uid}:{self.gid} ' if is_unix_system() else ''
    cmd += (f'--rm -v {tmp_path}:/tmp --entrypoint "python3" '
            f'{self.runtime} /tmp/lithops/runner.py preinstalls')

    process = sp.run(shlex.split(cmd), check=True, stdout=sp.PIPE,
                     universal_newlines=True, start_new_session=True)
    runtime_meta = json.loads(process.stdout.strip())

    return runtime_meta
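# A minimal usage sketch (the class and image names are hypothetical): the
# returned metadata is whatever JSON document runner.py prints for the
# 'preinstalls' command, typically the Python version and the modules
# preinstalled inside the container.
env = DockerEnv('lithops-runtime:v1', pull_runtime=True)
runtime_meta = env.preinstalls()
print(runtime_meta)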
def get_memory_usage(formatted=True):
    """
    Gets the current memory usage of the runtime.
    To be used only in the action code.
    """
    if not is_unix_system():
        return
    split_args = False
    pids_to_show = None
    discriminate_by_pid = False

    ps_mem.verify_environment(pids_to_show)
    sorted_cmds, shareds, count, total, swaps, total_swap = \
        ps_mem.get_memory_usage(pids_to_show, split_args, discriminate_by_pid,
                                include_self=True, only_self=False)
    if formatted:
        return sizeof_fmt(int(ps_mem.human(total, units=1)))
    else:
        return int(ps_mem.human(total, units=1))
def get_memory_usage(formatted=True):
    """
    Gets the current memory usage of the runtime.
    To be used only in the action code.
    """
    from lithops.libs import ps_mem
    if not is_unix_system() or os.geteuid() != 0:
        # Non-Unix systems and non-root users can't run the ps_mem module
        return
    split_args = False
    pids_to_show = None
    discriminate_by_pid = False

    ps_mem.verify_environment(pids_to_show)
    sorted_cmds, shareds, count, total, swaps, total_swap = \
        ps_mem.get_memory_usage(pids_to_show, split_args, discriminate_by_pid,
                                include_self=True, only_self=False)
    if formatted:
        return sizeof_fmt(int(ps_mem.human(total, units=1)))
    else:
        return int(ps_mem.human(total, units=1))
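# A minimal usage sketch (hypothetical caller inside action code): the helper
# returns a human-readable string by default, a plain integer with
# formatted=False, and None on non-Unix systems or when not running as root.
mem_str = get_memory_usage()
mem_raw = get_memory_usage(formatted=False)
print(mem_str, mem_raw)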
def wait(self, fs=None, throw_except=True, return_when=ALL_COMPLETED,
         download_results=False, timeout=None, THREADPOOL_SIZE=128,
         WAIT_DUR_SEC=1):
    """
    Wait for the Future instances (possibly created by different Executor
    instances) given by fs to complete. Returns a named 2-tuple of sets.
    The first set, named done, contains the futures that completed (finished
    or cancelled futures) before the wait completed. The second set, named
    not_done, contains the futures that did not complete (pending or running
    futures). timeout can be used to control the maximum number of seconds
    to wait before returning.

    :param fs: Futures list. Default None
    :param throw_except: Re-raise exception if call raised. Default True.
    :param return_when: One of `ALL_COMPLETED`, `ANY_COMPLETED`, `ALWAYS`
    :param download_results: Download results. Default False (only get statuses)
    :param timeout: Timeout of waiting for results.
    :param THREADPOOL_SIZE: Number of threads to use. Default 128
    :param WAIT_DUR_SEC: Time interval between each check. Default 1 second

    :return: `(fs_done, fs_notdone)` where `fs_done` is a list of futures
        that have completed and `fs_notdone` is a list of futures that have
        not completed.
    :rtype: 2-tuple of list
    """
    futures = fs or self.futures
    if type(futures) != list:
        futures = [futures]

    if not futures:
        raise Exception('You must run the call_async(), map() or map_reduce(), '
                        'or provide a list of futures before calling the '
                        'wait()/get_result() method')

    if download_results:
        msg = 'ExecutorID {} - Getting results'.format(self.executor_id)
        fs_done = [f for f in futures if f.done]
        fs_not_done = [f for f in futures if not f.done]
        fs_not_ready = [f for f in futures if not f.ready]
    else:
        msg = 'ExecutorID {} - Waiting for functions to complete'.format(self.executor_id)
        fs_done = [f for f in futures if f.ready or f.done]
        fs_not_done = [f for f in futures if not f.ready and not f.done]
        fs_not_ready = [f for f in futures if not f.ready]

    if not fs_not_done:
        return fs_done, fs_not_done

    logger.info(msg)

    if is_unix_system() and timeout is not None:
        logger.debug('Setting waiting timeout to {} seconds'.format(timeout))
        error_msg = ('Timeout of {} seconds exceeded waiting for function '
                     'activations to finish'.format(timeout))
        signal.signal(signal.SIGALRM, partial(timeout_handler, error_msg))
        signal.alarm(timeout)

    pbar = None
    error = False
    if not self.is_lithops_worker and self.setup_progressbar and fs_not_ready:
        from tqdm.auto import tqdm
        if is_notebook():
            pbar = tqdm(bar_format='{n}/|/ {n_fmt}/{total_fmt}',
                        total=len(fs_not_done))  # ncols=800
        else:
            print()
            pbar = tqdm(bar_format=' {l_bar}{bar}| {n_fmt}/{total_fmt} ',
                        total=len(fs_not_done), disable=None)

    try:
        if self.rabbitmq_monitor:
            logger.debug('Using RabbitMQ to monitor function activations')
            wait_rabbitmq(futures, self.internal_storage,
                          rabbit_amqp_url=self.rabbit_amqp_url,
                          download_results=download_results,
                          throw_except=throw_except, pbar=pbar,
                          return_when=return_when,
                          THREADPOOL_SIZE=THREADPOOL_SIZE)
        else:
            wait_storage(futures, self.internal_storage,
                         download_results=download_results,
                         throw_except=throw_except,
                         return_when=return_when, pbar=pbar,
                         THREADPOOL_SIZE=THREADPOOL_SIZE,
                         WAIT_DUR_SEC=WAIT_DUR_SEC)

    except KeyboardInterrupt as e:
        if download_results:
            not_dones_call_ids = [(f.job_id, f.call_id)
                                  for f in futures if not f.done]
        else:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in futures
                                  if not f.ready and not f.done]
        msg = ('ExecutorID {} - Cancelled - Total Activations not done: {}'
               .format(self.executor_id, len(not_dones_call_ids)))
        if pbar:
            pbar.close()
            print()
        logger.info(msg)
        error = True
        if self.data_cleaner and not self.is_lithops_worker:
            self.clean(clean_cloudobjects=False, force=True)
        raise e

    except Exception as e:
        error = True
        if self.data_cleaner and not self.is_lithops_worker:
            self.clean(clean_cloudobjects=False, force=True)
        raise e

    finally:
        self.invoker.stop()
        if is_unix_system():
            signal.alarm(0)
        if pbar and not pbar.disable:
            pbar.close()
            if not is_notebook():
                print()
        if self.data_cleaner and not self.is_lithops_worker:
            self.clean(clean_cloudobjects=False)
        if not fs and error and is_notebook():
            del self.futures[len(self.futures) - len(futures):]

    if download_results:
        fs_done = [f for f in futures if f.done]
        fs_notdone = [f for f in futures if not f.done]
    else:
        fs_done = [f for f in futures if f.ready or f.done]
        fs_notdone = [f for f in futures if not f.ready and not f.done]

    return fs_done, fs_notdone
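# A minimal usage sketch (hypothetical map function, default config): with no
# fs argument, wait() operates on the executor's own stored futures and
# returns the done/not-done split.
import lithops

def increment(x):
    return x + 1

fexec = lithops.FunctionExecutor()
fexec.map(increment, range(8))
fs_done, fs_notdone = fexec.wait(download_results=True)
print(len(fs_done), 'done,', len(fs_notdone), 'pending')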
def __init__(self, docker_image, pull_runtime):
    logger.debug(f'Starting Docker Environment for {docker_image}')
    super().__init__(runtime=docker_image)
    self.pull_runtime = pull_runtime
    self.uid = os.getuid() if is_unix_system() else None
    self.gid = os.getgid() if is_unix_system() else None
def run_job(job):
    """
    Runs a single job within a separate process
    """
    start_tstamp = time.time()
    setup_lithops_logger(job.log_level)

    logger.info("Lithops v{} - Starting execution".format(__version__))
    logger.info("Execution ID: {}/{}".format(job.job_key, job.call_id))
    logger.debug("Runtime name: {}".format(job.runtime_name))
    if job.runtime_memory:
        logger.debug("Runtime memory: {}MB".format(job.runtime_memory))
    logger.debug("Function timeout: {}s".format(job.execution_timeout))

    env = job.extra_env
    env['LITHOPS_WORKER'] = 'True'
    env['PYTHONUNBUFFERED'] = 'True'
    env['LITHOPS_CONFIG'] = json.dumps(job.config)
    env['PYTHONPATH'] = "{}:{}".format(os.getcwd(), LITHOPS_LIBS_PATH)
    env['__LITHOPS_SESSION_ID'] = '-'.join([job.job_key, job.call_id])
    os.environ.update(env)

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(job.config, internal_storage)
    call_status.response['worker_start_tstamp'] = start_tstamp
    call_status.response['host_submit_tstamp'] = job.host_submit_tstamp
    call_status.response['call_id'] = job.call_id
    call_status.response['job_id'] = job.job_id
    call_status.response['executor_id'] = job.executor_id

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        if __version__ != job.lithops_version:
            msg = ("Lithops version mismatch. Host version: {} - Runtime version: {}"
                   .format(job.lithops_version, __version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker, args=(mm_conn,))
            memory_monitor.start()

        job.jr_stats_file = os.path.join(job.job_dir, 'jobrunner.stats.txt')
        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(job, jobrunner_conn, internal_storage)
        logger.debug('Starting JobRunner process')
        jrp = Process(target=jobrunner.run) if is_unix_system() \
            else Thread(target=jobrunner.run)
        jrp.start()
        jrp.join(job.execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the process is still alive after jrp.join(job.execution_timeout), kill it
            try:
                jrp.terminate()
            except Exception:
                # threads do not have a terminate method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(job.execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if not handler_conn.poll():
            logger.error('No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only 1 message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed,
            # 99% of the time due to an OOM, so we assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(job.jr_stats_file):
            with open(job.jr_stats_file, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ", 1)
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        call_status.response[key] = value
                    if key in ['exception', 'exc_pickle_fail', 'result', 'new_futures']:
                        call_status.response[key] = eval(value)

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------')
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------')
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure it can be unpickled
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        call_status.response['worker_end_tstamp'] = time.time()

        with open(job.log_file, 'rb') as lf:
            log_str = base64.b64encode(zlib.compress(lf.read())).decode()
            call_status.response['logs'] = log_str

        call_status.send('__end__')

        # Unset job-specific env vars
        for key in job.extra_env:
            os.environ.pop(key, None)
        os.environ.pop('__LITHOPS_TOTAL_EXECUTORS', None)

        logger.info("Finished")
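# The jobrunner.stats.txt file parsed above is a plain key/value text file,
# one space-separated entry per line; values parse as floats where possible,
# and the keys listed in the code are re-hydrated with eval(). Illustrative
# contents (the timestamp keys are assumptions, not taken from the source):
#
#   worker_func_start_tstamp 1650000000.123
#   worker_func_end_tstamp 1650000003.456
#   result True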
def run_job(job):
    """
    Runs a single job within a separate process
    """
    setup_lithops_logger(job.log_level)

    backend = os.environ.get('__LITHOPS_BACKEND', '')
    logger.info("Lithops v{} - Starting {} execution".format(__version__, backend))
    logger.info("Execution ID: {}/{}".format(job.job_key, job.call_id))

    env = job.extra_env
    env['LITHOPS_CONFIG'] = json.dumps(job.config)
    env['__LITHOPS_SESSION_ID'] = '-'.join([job.job_key, job.call_id])
    os.environ.update(env)

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)
    call_status = create_call_status(job, internal_storage)

    if job.runtime_memory:
        logger.debug('Runtime: {} - Memory: {}MB - Timeout: {} seconds'
                     .format(job.runtime_name, job.runtime_memory, job.execution_timeout))
    else:
        logger.debug('Runtime: {} - Timeout: {} seconds'
                     .format(job.runtime_name, job.execution_timeout))

    job_interrupted = False

    try:
        # send init status event
        call_status.send_init_event()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(job, jobrunner_conn, internal_storage)
        logger.debug('Starting JobRunner process')
        jrp = Process(target=jobrunner.run) if is_unix_system() \
            else Thread(target=jobrunner.run)
        jrp.start()
        jrp.join(job.execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the process is still alive after jrp.join(job.execution_timeout), kill it
            try:
                jrp.terminate()
            except Exception:
                # threads do not have a terminate method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(job.execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if not handler_conn.poll():
            logger.error('No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only 1 message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed,
            # 99% of the time due to an OOM, so we assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(job.stats_file):
            with open(job.stats_file, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ", 1)
                    try:
                        call_status.add(key, float(value))
                    except Exception:
                        call_status.add(key, value)
                    if key in ['exception', 'exc_pickle_fail', 'result']:
                        call_status.add(key, eval(value))

    except KeyboardInterrupt:
        job_interrupted = True
        logger.debug("Job interrupted")

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------')
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------')
        call_status.add('exception', True)

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure it can be unpickled
        call_status.add('exc_info', str(pickled_exc))

    finally:
        if not job_interrupted:
            call_status.add('worker_end_tstamp', time.time())

            # Flush the log stream and save it to the call status
            job.log_stream.flush()
            if os.path.isfile(job.log_file):
                with open(job.log_file, 'rb') as lf:
                    log_str = base64.b64encode(zlib.compress(lf.read())).decode()
                    call_status.add('logs', log_str)

            call_status.send_finish_event()

        logger.info("Finished")
def wait(fs, throw_except=True, return_when=ALL_COMPLETED,
         download_results=False, timeout=None, THREADPOOL_SIZE=128,
         WAIT_DUR_SEC=1, internal_storage=None):
    """
    Wait for the Future instances (possibly created by different Executor
    instances) given by fs to complete. Returns a named 2-tuple of sets.
    The first set, named done, contains the futures that completed (finished
    or cancelled futures) before the wait completed. The second set, named
    not_done, contains the futures that did not complete (pending or running
    futures). timeout can be used to control the maximum number of seconds
    to wait before returning.

    :param fs: Futures list.
    :param throw_except: Re-raise exception if call raised. Default True.
    :param return_when: One of `ALL_COMPLETED`, `ANY_COMPLETED`, `ALWAYS`
    :param download_results: Download results. Default False (only get statuses)
    :param timeout: Timeout of waiting for results.
    :param THREADPOOL_SIZE: Number of threads to use. Default 128
    :param WAIT_DUR_SEC: Time interval between each check. Default 1 second

    :return: `(fs_done, fs_notdone)` where `fs_done` is a list of futures
        that have completed and `fs_notdone` is a list of futures that have
        not completed.
    :rtype: 2-tuple of list
    """
    if type(fs) != list:
        fs = [fs]

    if not internal_storage:
        # fs is guaranteed to be a list at this point, so fs[0] is safe
        internal_storage = InternalStorage(fs[0].storage_config)

    setup_progressbar = (not is_lithops_worker()
                         and logger.getEffectiveLevel() == logging.INFO)

    if download_results:
        msg = 'Getting results from functions'
        fs_done = [f for f in fs if f.done]
        fs_not_done = [f for f in fs if not f.done]
        # fs_not_ready = [f for f in futures if not f.ready and not f.done]
    else:
        msg = 'Waiting for functions to complete'
        fs_done = [f for f in fs if f.ready or f.done]
        fs_not_done = [f for f in fs if not f.done]
        # fs_not_ready = [f for f in futures if not f.ready and not f.done]

    if not fs_not_done:
        return fs_done, fs_not_done

    logger.info(msg)

    if is_unix_system() and timeout is not None:
        logger.debug('Setting waiting timeout to {} seconds'.format(timeout))
        error_msg = ('Timeout of {} seconds exceeded waiting for function '
                     'activations to finish'.format(timeout))
        signal.signal(signal.SIGALRM, partial(timeout_handler, error_msg))
        signal.alarm(timeout)

    pbar = None
    if not is_lithops_worker() and setup_progressbar:
        from tqdm.auto import tqdm
        if not is_notebook():
            print()
        pbar = tqdm(bar_format=' {l_bar}{bar}| {n_fmt}/{total_fmt} ',
                    total=len(fs_not_done), disable=None)

    try:
        wait_storage(fs, internal_storage,
                     download_results=download_results,
                     throw_except=throw_except,
                     return_when=return_when, pbar=pbar,
                     THREADPOOL_SIZE=THREADPOOL_SIZE,
                     WAIT_DUR_SEC=WAIT_DUR_SEC)

    except KeyboardInterrupt as e:
        if download_results:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in fs if not f.done]
        else:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in fs
                                  if not f.ready and not f.done]
        msg = 'Cancelled - Total Activations not done: {}'.format(len(not_dones_call_ids))
        if pbar:
            pbar.close()
            print()
        logger.info(msg)
        raise e

    except Exception as e:
        raise e

    finally:
        if is_unix_system():
            signal.alarm(0)
        if pbar and not pbar.disable:
            pbar.close()
            if not is_notebook():
                print()

    if download_results:
        fs_done = [f for f in fs if f.done]
        fs_notdone = [f for f in fs if not f.done]
    else:
        fs_done = [f for f in fs if f.ready or f.done]
        fs_notdone = [f for f in fs if not f.ready and not f.done]

    return fs_done, fs_notdone
def wait(fs, internal_storage=None, throw_except=True, timeout=None,
         return_when=ALL_COMPLETED, download_results=False, job_monitor=None,
         threadpool_size=THREADPOOL_SIZE, wait_dur_sec=WAIT_DUR_SEC):
    """
    Wait for the Future instances (possibly created by different Executor
    instances) given by fs to complete. Returns a named 2-tuple of sets.
    The first set, named done, contains the futures that completed (finished
    or cancelled futures) before the wait completed. The second set, named
    not_done, contains the futures that did not complete (pending or running
    futures). timeout can be used to control the maximum number of seconds
    to wait before returning.

    :param fs: Futures list.
    :param throw_except: Re-raise exception if call raised. Default True.
    :param return_when: Percentage of done futures
    :param download_results: Download results. Default False (only get statuses)
    :param timeout: Timeout of waiting for results.
    :param threadpool_size: Number of threads to use. Default: THREADPOOL_SIZE
    :param wait_dur_sec: Time interval between each check.

    :return: `(fs_done, fs_notdone)` where `fs_done` is a list of futures
        that have completed and `fs_notdone` is a list of futures that have
        not completed.
    :rtype: 2-tuple of list
    """
    if not fs:
        return

    if type(fs) != list and type(fs) != FuturesList:
        fs = [fs]

    if download_results:
        msg = 'ExecutorID {} - Getting results from functions'.format(fs[0].executor_id)
        fs_done = [f for f in fs if f.done]
        fs_not_done = [f for f in fs if not f.done]
    else:
        msg = 'ExecutorID {} - Waiting for {}% of functions to complete'.format(
            fs[0].executor_id, return_when)
        fs_done = [f for f in fs if f.success or f.done]
        fs_not_done = [f for f in fs if not (f.success or f.done)]

    logger.info(msg)

    if not fs_not_done:
        return fs_done, fs_not_done

    if is_unix_system() and timeout is not None:
        logger.debug('Setting waiting timeout to {} seconds'.format(timeout))
        error_msg = ('Timeout of {} seconds exceeded waiting for function '
                     'activations to finish'.format(timeout))
        signal.signal(signal.SIGALRM, partial(timeout_handler, error_msg))
        signal.alarm(timeout)

    # Setup progress bar
    pbar = None
    if not is_lithops_worker() and logger.getEffectiveLevel() == logging.INFO:
        from tqdm.auto import tqdm
        if not is_notebook():
            print()
        pbar = tqdm(bar_format=' {l_bar}{bar}| {n_fmt}/{total_fmt} ',
                    total=len(fs), disable=None)
        pbar.update(len(fs_done))

    try:
        executors_data = _create_executors_data_from_futures(fs, internal_storage)

        if not job_monitor:
            for executor_data in executors_data:
                job_monitor = JobMonitor(
                    executor_id=executor_data.executor_id,
                    internal_storage=executor_data.internal_storage)
                job_monitor.start(fs=executor_data.futures)

        sleep_sec = wait_dur_sec if job_monitor.backend == 'storage' else 0.3

        if return_when == ALWAYS:
            for executor_data in executors_data:
                _get_executor_data(fs, executor_data, pbar=pbar,
                                   throw_except=throw_except,
                                   download_results=download_results,
                                   threadpool_size=threadpool_size)
        else:
            while not _check_done(fs, return_when, download_results):
                for executor_data in executors_data:
                    new_data = _get_executor_data(fs, executor_data, pbar=pbar,
                                                  throw_except=throw_except,
                                                  download_results=download_results,
                                                  threadpool_size=threadpool_size)
                time.sleep(0 if new_data else sleep_sec)

    except KeyboardInterrupt as e:
        if download_results:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in fs if not f.done]
        else:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in fs
                                  if not f.success and not f.done]
        msg = 'Cancelled - Total Activations not done: {}'.format(len(not_dones_call_ids))
        if pbar:
            pbar.close()
            print()
        logger.info(msg)
        raise e

    except Exception as e:
        raise e

    finally:
        if is_unix_system():
            signal.alarm(0)
        if pbar and not pbar.disable:
            pbar.close()
            if not is_notebook():
                print()

    if download_results:
        fs_done = [f for f in fs if f.done]
        fs_notdone = [f for f in fs if not f.done]
    else:
        fs_done = [f for f in fs if f.success or f.done]
        fs_notdone = [f for f in fs if not f.success and not f.done]

    return fs_done, fs_notdone
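# A minimal usage sketch (hypothetical map function, default config): since
# return_when is a completion percentage in this version, waiting for half of
# the activations looks like this.
import lithops

def square(x):
    return x * x

fexec = lithops.FunctionExecutor()
futures = fexec.map(square, range(10))
fs_done, fs_notdone = wait(futures, return_when=50)  # return once 50% are done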
import os
import sys
import pkgutil
import logging
import pickle
import subprocess
from contextlib import contextmanager

from lithops.version import __version__ as lithops_ver
from lithops.utils import sizeof_fmt, is_unix_system, b64str_to_bytes
from lithops.constants import LITHOPS_TEMP_DIR, MODULES_DIR

logger = logging.getLogger(__name__)

if is_unix_system():
    # Windows hosts can't use the ps_mem module
    import ps_mem


def get_function_and_modules(job, internal_storage):
    """
    Gets the function and the modules from storage
    """
    logger.debug("Getting function and modules")

    mode = job.config['lithops']['mode']
    customized_runtime = job.config[mode].get('customized_runtime', False)
    func_obj = None

    if customized_runtime:
def _init_runtime(self, docker_image_name):
    name = self._format_runtime_name(docker_image_name)

    if self._is_localhost:
        if is_unix_system():
            uid_cmd = "id -u $USER"
            uid = subprocess.check_output(uid_cmd, shell=True).decode().strip()

        if self.docker_client:
            running_containers = self.docker_client.containers.list(
                filters={'name': 'lithops'})
            running_runtimes = [c.name for c in running_containers]

            if name not in running_runtimes:
                self.docker_client.containers.run(
                    docker_image_name,
                    entrypoint='python',
                    command='/tmp/{}/__main__.py'.format(DOCKER_BASE_FOLDER),
                    volumes=['{}:/tmp'.format(TEMP)],
                    detach=True,
                    auto_remove=True,
                    user=uid,
                    name=name,
                    ports={'8080/tcp': docker_config.LITHOPS_SERVER_PORT})
                time.sleep(5)
        else:
            running_runtimes_cmd = "docker ps --format '{{.Names}}' -f name=lithops"
            running_runtimes = subprocess.run(
                running_runtimes_cmd, shell=True,
                stdout=subprocess.PIPE).stdout.decode()

            if name not in running_runtimes:
                if is_unix_system():
                    cmd = ('docker run -d --name {} --user {} -v {}:/tmp -p 8080:{}'
                           ' --entrypoint "python" {} /tmp/{}/__main__.py'
                           .format(name, uid, TEMP,
                                   docker_config.LITHOPS_SERVER_PORT,
                                   docker_image_name, DOCKER_BASE_FOLDER))
                else:
                    cmd = ('docker run -d --name {} -v {}:/tmp -p 8080:{}'
                           ' --entrypoint "python" {} /tmp/{}/__main__.py'
                           .format(name, TEMP,
                                   docker_config.LITHOPS_SERVER_PORT,
                                   docker_image_name, DOCKER_BASE_FOLDER))

                if not self.log_active:
                    cmd = cmd + " >{} 2>&1".format(os.devnull)

                res = os.system(cmd)
                if res != 0:
                    raise Exception('There was an error starting the runtime')
                time.sleep(5)
    else:
        running_runtimes_cmd = "docker ps --format '{{.Names}}' -f name=lithops"
        running_runtimes = self._ssh_run_remote_command(running_runtimes_cmd)

        used_runtimes_cmd = "docker ps -a --format '{{.Names}}' -f name=lithops"
        used_runtimes = self._ssh_run_remote_command(used_runtimes_cmd)

        if name not in running_runtimes and name in used_runtimes:
            cmd = 'docker rm -f {}'.format(name)
            self._ssh_run_remote_command(cmd)

        cmd = ('docker run -d --name {} --user $(id -u):$(id -g) -v {}:/tmp -p 8080:{}'
               ' --entrypoint "python" {} /tmp/{}/__main__.py'
               .format(name, LITHOPS_TEMP, docker_config.LITHOPS_SERVER_PORT,
                       docker_image_name, DOCKER_BASE_FOLDER))

        if name not in running_runtimes:
            self._ssh_run_remote_command(cmd)
            time.sleep(5)

        # install missing dependency
        cmd = 'docker exec {} pip install pyyaml'.format(name)
        try:
            self._ssh_run_remote_command(cmd)
        except Exception as e:
            if 'upgrade pip' in str(e):
                pass
            else:
                raise e