def delete_runtime(image_name, config=None):
    logger.info('Deleting runtime: {}'.format(image_name))

    if config is None:
        config = wrenconfig.default()
    else:
        config = wrenconfig.default(config)

    storage_config = wrenconfig.extract_storage_config(config)
    storage_client = storage.InternalStorage(storage_config)
    cf_config = wrenconfig.extract_cf_config(config)
    cf_client = CloudFunctions(cf_config)

    if image_name == 'default':
        image_name = _get_default_image_name()

    image_name_formatted = create_action_name(image_name)
    actions = cf_client.list_actions(PACKAGE)
    region = cf_client.endpoint.split('//')[1].split('.')[0]
    namespace = cf_client.namespace

    for action in actions:
        action_name, memory = action['name'].rsplit('-', 1)
        if image_name_formatted == action_name:
            memory = int(memory.replace('MB', ''))
            runtime_name = create_runtime_name(image_name, memory)
            storage_client.delete_runtime_info(region, namespace, runtime_name)
            action_name = create_action_name(runtime_name)
            cf_client.delete_action(action_name)
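# A minimal usage sketch for delete_runtime() (the image name below is
# hypothetical, not part of the original source). With config=None the
# function loads ~/.pywren_config through wrenconfig.default(), and the
# special name 'default' is resolved to the configured default image first.
#
#   >>> delete_runtime('myuser/pywren-runtime:3.6')
#   >>> delete_runtime('default')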
def clean_bucket(bucket, prefix, storage_config):
    """
    Wrapper of clean_os_bucket(). Use this method only when storage_config
    is in JSON format. In any other case, call clean_os_bucket() directly.
    """
    internal_storage = storage.InternalStorage(json.loads(storage_config))
    sys.stdout = open(os.devnull, 'w')
    clean_os_bucket(bucket, prefix, internal_storage)
    sys.stdout = sys.__stdout__
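# Usage sketch (bucket and prefix are hypothetical): clean_bucket() expects
# storage_config as a JSON *string*, since it calls json.loads() on it before
# building the InternalStorage client.
#
#   >>> sc = json.dumps(wrenconfig.extract_storage_config(wrenconfig.default()))
#   >>> clean_bucket('my-pywren-bucket', 'pywren.jobs', sc)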
def __init__(self, config=None, runtime=None, log_level=None,
             runtime_timeout=wrenconfig.CF_RUNTIME_TIMEOUT):
    """
    Initialize and return an executor class.

    :param config: Settings passed in here will override those in
        `pywren_config`. Default None.
    :param runtime: Runtime name to use. Default None.
    :param runtime_timeout: Max time per action. Default 600 seconds.
    :return: `executor` object.

    Usage
      >>> import pywren_ibm_cloud as pywren
      >>> pw = pywren.ibm_cf_executor()
    """
    self._state = ExecutorState.new

    if config is None:
        self.config = wrenconfig.default()
    else:
        self.config = wrenconfig.default(config)

    if runtime:
        self.config['ibm_cf']['action_name'] = runtime

    if log_level:
        wrenlogging.default_config(log_level)

    ibm_cf_config = self.config['ibm_cf']
    self.runtime = ibm_cf_config['action_name']
    self.cf_cluster = ibm_cf_config['is_cf_cluster']
    self.data_cleaner = self.config['pywren']['data_cleaner']

    retry_config = {}
    retry_config['invocation_retry'] = self.config['pywren']['invocation_retry']
    retry_config['retry_sleeps'] = self.config['pywren']['retry_sleeps']
    retry_config['retries'] = self.config['pywren']['retries']

    invoker = invokers.IBMCloudFunctionsInvoker(ibm_cf_config, retry_config)
    self.storage_config = wrenconfig.extract_storage_config(self.config)
    self.internal_storage = storage.InternalStorage(self.storage_config)
    self.executor = Executor(invoker, self.config, self.internal_storage,
                             runtime_timeout)
    self.executor_id = self.executor.executor_id

    self.futures = []
    self.reduce_future = None
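# Usage sketch for this constructor (the runtime name is hypothetical):
# `runtime` overrides config['ibm_cf']['action_name'], and `runtime_timeout`
# caps each action at the given number of seconds.
#
#   >>> import pywren_ibm_cloud as pywren
#   >>> pw = pywren.ibm_cf_executor(runtime='my-runtime_3.6', runtime_timeout=300)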
def extract_modules(image_name, config=None, pywren_location=None):
    # Extract installed Python modules from the docker image
    # and store them into storage

    # Create runtime_name from image_name
    username, appname = image_name.split('/')
    runtime_name = appname.replace(':', '_')

    # Load PyWren config from ~/.pywren_config
    if config is None:
        config = wrenconfig.default()
    else:
        config = wrenconfig.default(config)

    # Create storage_handler to upload modules file
    storage_config = wrenconfig.extract_storage_config(config)
    internal_storage = storage.InternalStorage(storage_config)

    # sys.stdout = open(os.devnull, 'w')
    if pywren_location is None:
        action_location = "extract_modules.py"
    else:
        action_location = os.path.join(pywren_location, "runtime",
                                       "extract_modules.py")

    with open(action_location, "r") as action_py:
        action_code = action_py.read()

    cf_client = CloudFunctions(config['ibm_cf'])
    action_name = runtime_name + '_modules'
    cf_client.create_action(action_name, code=action_code, kind='blackbox',
                            image=image_name, is_binary=False)
    runtime_meta = cf_client.invoke_with_result(action_name)
    internal_storage.put_runtime_info(runtime_name, runtime_meta)
    cf_client.delete_action(action_name)
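# Usage sketch (the image name is hypothetical): extract_modules() deploys a
# short-lived 'blackbox' action built from the given docker image, invokes it
# once to collect the preinstalled Python modules, stores that metadata, and
# then deletes the action again.
#
#   >>> extract_modules('myuser/pywren-runtime:3.6')
#   >>> extract_modules('myuser/pywren-runtime:3.6', pywren_location='/opt/pywren')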
def _extract_modules(image_name, memory, cf_client, config):
    # Extract installed Python modules from the docker image
    # and store them into storage

    # Create storage_handler to upload modules file
    storage_config = wrenconfig.extract_storage_config(config)
    internal_storage = storage.InternalStorage(storage_config)

    pywren_location = _get_pywren_location()
    action_location = os.path.join(pywren_location, "runtime",
                                   "extract_modules.py")

    with open(action_location, "r") as action_py:
        action_code = action_py.read()

    modules_action_name = '{}-modules'.format(create_action_name(image_name))

    # old_stdout = sys.stdout
    # sys.stdout = open(os.devnull, 'w')
    logger.debug("Creating action for extracting Python modules list: {}".format(modules_action_name))
    cf_client.create_action(modules_action_name, image_name, code=action_code,
                            is_binary=False)
    # sys.stdout = old_stdout

    region = cf_client.endpoint.split('//')[1].split('.')[0]
    namespace = cf_client.namespace
    memory = cf_client.default_runtime_memory if not memory else memory
    runtime_name = create_runtime_name(image_name, memory)

    logger.debug("Going to extract Python modules list from: {}".format(image_name))
    runtime_meta = cf_client.invoke_with_result(modules_action_name)
    internal_storage.put_runtime_info(region, namespace, runtime_name,
                                      runtime_meta)
    cf_client.delete_action(modules_action_name)
def run(self):
    """
    Runs the function
    """
    logger.info("Started")
    # initial output file in case the job fails
    output_dict = {'result': None, 'success': False}
    pickled_output = pickle.dumps(output_dict)

    try:
        self.internal_storage = storage.InternalStorage(self.storage_config)
        loaded_func_all = self._get_function_and_modules()
        self._save_modules(loaded_func_all['module_data'])
        function = self._unpickle_function(loaded_func_all['func'])
        data = self._load_data()
        data = self._create_storage_clients(function, data)

        if self.show_memory:
            logger.debug("Memory usage before calling the function: {}".format(get_current_memory_usage()))

        logger.info("Function: Going to execute '{}()'".format(str(function.__name__)))
        print('------------------- FUNCTION LOG -------------------', flush=True)
        func_exec_time_t1 = time.time()
        result = function(**data)
        func_exec_time_t2 = time.time()
        print('----------------------------------------------------', flush=True)
        logger.info("Function: Success execution")

        if self.show_memory:
            logger.debug("Memory usage after calling the function: {}".format(get_current_memory_usage()))

        self.stats.write('function_exec_time', round(func_exec_time_t2 - func_exec_time_t1, 8))
        output_dict = {'result': result, 'success': True}
        pickled_output = pickle.dumps(output_dict)

        # Check for new futures
        if isinstance(result, ResponseFuture):
            callgroup_id = result.callgroup_id
            self.stats.write('new_futures', '{}/{}'.format(callgroup_id, 1))
        elif type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture):
            callgroup_id = result[0].callgroup_id
            self.stats.write('new_futures', '{}/{}'.format(callgroup_id, len(result)))
        else:
            self.stats.write('new_futures', '{}/{}'.format(None, 0))

        if self.show_memory:
            logger.debug("Memory usage after output serialization: {}".format(get_current_memory_usage()))

    except Exception as e:
        print('------------------ EXCEPTION -------------------------')
        exc_type, exc_value, exc_traceback = sys.exc_info()
        # traceback.print_tb(exc_traceback)

        # Shockingly often, modules like subprocess don't properly
        # call the base Exception.__init__, which results in them
        # being unpickleable. As a result, we wrap this in a try/except
        # block and handle the exceptions more carefully if any part
        # of this save / test-reload fails
        logger.error("There was an exception: {}".format(str(e)))
        try:
            pickled_output = pickle.dumps({'result': e,
                                           'exc_type': exc_type,
                                           'exc_value': exc_value,
                                           'exc_traceback': exc_traceback,
                                           'sys.path': sys.path,
                                           'success': False})
            # this is just to make sure the exception can be unpickled
            pickle.loads(pickled_output)
        except Exception as pickle_exception:
            pickled_output = pickle.dumps({'result': str(e),
                                           'exc_type': str(exc_type),
                                           'exc_value': str(exc_value),
                                           'exc_traceback': exc_traceback,
                                           'exc_traceback_str': str(exc_traceback),
                                           'sys.path': sys.path,
                                           'pickle_fail': True,
                                           'pickle_exception': pickle_exception,
                                           'success': False})
    finally:
        store_result = True
        if 'STORE_RESULT' in os.environ:
            store_result = eval(os.environ['STORE_RESULT'])

        if store_result:
            output_upload_timestamp_t1 = time.time()
            logger.info("Storing {} - Size: {}".format(self.output_key, sizeof_fmt(len(pickled_output))))
            self.internal_storage.put_data(self.output_key, pickled_output)
            output_upload_timestamp_t2 = time.time()
            self.stats.write("output_upload_time",
                             round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))

        self.result_queue.put("Finished")
        logger.info("Finished")
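# The exception branch above relies on a "pickle, then test-unpickle" safety
# pattern: serialize the exception, immediately load it back, and fall back
# to plain strings if the round trip fails. A self-contained sketch of the
# same idea (safe_pickle is an illustrative helper, not part of the original
# source):

import pickle

def safe_pickle(obj):
    """Pickle obj, falling back to a string form if it cannot round-trip."""
    try:
        payload = pickle.dumps(obj)
        pickle.loads(payload)  # verify the payload can actually be unpickled
        return payload
    except Exception:
        return pickle.dumps({'result': str(obj), 'pickle_fail': True})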
def result(self, check_only=False, throw_except=True, internal_storage=None):
    """
    Return the value returned by the call.
    If the call raised an exception, this method will raise the same exception.
    If the future is cancelled before completing, CancelledError will be raised.

    :param check_only: Return None immediately if job is not complete. Default False.
    :param throw_except: Reraise exception if call raised. Default True.
    :param internal_storage: Storage handler to poll cloud storage. Default None.
    :return: Result of the call.
    :raises CancelledError: If the job is cancelled before completed.
    :raises TimeoutError: If job is not complete after `timeout` seconds.
    """
    if self._state == JobState.new:
        raise ValueError("job not yet invoked")

    if internal_storage is None:
        internal_storage = storage.InternalStorage(self.storage_config)

    self.status(check_only, throw_except, internal_storage)

    if self._state == JobState.success:
        return self._return_val

    if self._state == JobState.futures:
        return self._new_futures

    if self._state == JobState.error:
        if throw_except:
            raise FunctionException(self.executor_id, self.activation_id, self._exception)
        else:
            return None

    if not self._produce_output:
        return

    call_output_time = time.time()
    call_invoker_result = internal_storage.get_call_output(self.executor_id, self.callgroup_id, self.call_id)
    self.output_query_count += 1

    while call_invoker_result is None and self.output_query_count < self.GET_RESULT_MAX_RETRIES:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_invoker_result = internal_storage.get_call_output(self.executor_id, self.callgroup_id, self.call_id)
        self.output_query_count += 1

    if call_invoker_result is None:
        if throw_except:
            raise Exception('Unable to get the output of the function - Activation ID: {}'.format(self.activation_id))
        else:
            self._set_state(JobState.error)
            return None

    call_invoker_result = pickle.loads(call_invoker_result)
    call_output_time_done = time.time()
    self._call_invoker_result = call_invoker_result

    self.invoke_status['download_output_time'] = call_output_time_done - call_output_time
    self.invoke_status['output_query_count'] = self.output_query_count
    self.invoke_status['download_output_timestamp'] = call_output_time_done

    log_msg = ('Executor ID {} Got output from Function {} - Activation '
               'ID: {}'.format(self.executor_id, self.call_id, self.activation_id))
    logger.debug(log_msg)

    function_result = call_invoker_result['result']

    if isinstance(function_result, ResponseFuture):
        self._new_futures = [function_result]
        self._set_state(JobState.futures)
        self.invoke_status['status_done_timestamp'] = self.invoke_status['download_output_timestamp']
        del self.invoke_status['download_output_timestamp']
        return self._new_futures

    elif type(function_result) == list and len(function_result) > 0 \
            and isinstance(function_result[0], ResponseFuture):
        self._new_futures = function_result
        self._set_state(JobState.futures)
        self.invoke_status['status_done_timestamp'] = self.invoke_status['download_output_timestamp']
        del self.invoke_status['download_output_timestamp']
        return self._new_futures

    else:
        self._return_val = function_result
        self._set_state(JobState.success)
        return self._return_val
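# Usage sketch (the future object is hypothetical): result() first refreshes
# the state through status(); a call whose function returned new
# ResponseFuture objects lands in JobState.futures, and the new futures
# themselves are what gets returned.
#
#   >>> out = future.result(throw_except=False)
#   >>> if future._state == JobState.futures:
#   ...     out  # list of new ResponseFuture objects to wait on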
def status(self, check_only=False, throw_except=True, internal_storage=None):
    """
    Return the status returned by the call.
    If the call raised an exception, this method will raise the same exception.
    If the future is cancelled before completing, CancelledError will be raised.

    :param check_only: Return None immediately if job is not complete. Default False.
    :param throw_except: Reraise exception if call raised. Default True.
    :param internal_storage: Storage handler to poll cloud storage. Default None.
    :return: Status of the call.
    :raises CancelledError: If the job is cancelled before completed.
    :raises TimeoutError: If job is not complete after `timeout` seconds.
    """
    if self._state == JobState.new:
        raise ValueError("job not yet invoked")

    if self._state == JobState.ready or self._state == JobState.success:
        return self.run_status

    if internal_storage is None:
        internal_storage = storage.InternalStorage(self.storage_config)

    storage_utils.check_storage_path(internal_storage.get_storage_config(), self.storage_path)
    call_status = internal_storage.get_call_status(self.executor_id, self.callgroup_id, self.call_id)
    self.status_query_count += 1

    if check_only is True:
        if call_status is None:
            return None

    while call_status is None:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_status = internal_storage.get_call_status(self.executor_id, self.callgroup_id, self.call_id)
        self.status_query_count += 1

    self.invoke_status['status_done_timestamp'] = time.time()
    self.invoke_status['status_query_count'] = self.status_query_count
    self.run_status = call_status  # this is the remote status information

    total_time = format(round(call_status['end_time'] - call_status['start_time'], 2), '.2f')

    if call_status['exception']:
        # the action handler/jobrunner/function had an exception
        self._set_state(JobState.error)
        self._exception = pickle.loads(eval(call_status['exc_info']))
        msg = None

        if not call_status.get('exc_pickle_fail', False):
            exception_args = self._exception[1].args
            if exception_args[0] == "WRONGVERSION":
                msg = "PyWren version mismatch: remote expected version {}, local " \
                      "library is version {}".format(exception_args[2], exception_args[3])
            elif exception_args[0] == "OUTATIME":
                msg = "Process ran out of time"
            elif exception_args[0] == "OUTOFMEMORY":
                msg = "Process exceeded maximum memory and was killed"
        else:
            fault = Exception(self._exception['exc_value'])
            self._exception = (Exception, fault, self._exception['exc_traceback'])

        if throw_except:
            raise FunctionException(self.executor_id, self.activation_id, self._exception, msg)
        return None

    log_msg = ('Executor ID {} Response from Function {} - Activation '
               'ID: {} - Time: {} seconds'.format(self.executor_id, self.call_id,
                                                  self.activation_id, str(total_time)))
    logger.debug(log_msg)
    self._set_state(JobState.ready)

    if not call_status['result']:
        # Function did not produce output
        self._set_state(JobState.success)

    if 'new_futures' in call_status:
        unused_callgroup_id, total_new_futures = call_status['new_futures'].split('/')
        if int(total_new_futures) > 0:
            self.result(throw_except=throw_except, internal_storage=internal_storage)

    return self.run_status
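# Usage sketch for status() (the futures list is hypothetical): with
# check_only=True the call returns None immediately when the status object is
# not yet in storage, which allows non-blocking polling over many futures.
#
#   >>> done = [f for f in futures if f.status(check_only=True, throw_except=False)]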
def result(self, check_only=False, throw_except=True, internal_storage=None):
    """
    Return the value returned by the call.
    If the call raised an exception, this method will raise the same exception.
    If the future is cancelled before completing, CancelledError will be raised.

    :param check_only: Return None immediately if job is not complete. Default False.
    :param throw_except: Reraise exception if call raised. Default True.
    :param internal_storage: Storage handler to poll cloud storage. Default None.
    :return: Result of the call.
    :raises CancelledError: If the job is cancelled before completed.
    :raises TimeoutError: If job is not complete after `timeout` seconds.
    """
    if self._state == JobState.new:
        raise ValueError("job not yet invoked")

    if self._state == JobState.success:
        return self._return_val

    if self._state == JobState.futures:
        return self._new_futures

    if self._state == JobState.error:
        if throw_except:
            raise self._exception
        else:
            return None

    if internal_storage is None:
        internal_storage = storage.InternalStorage(self.storage_config)

    storage_utils.check_storage_path(internal_storage.get_storage_config(), self.storage_path)
    call_status = internal_storage.get_call_status(self.executor_id, self.callgroup_id, self.call_id)
    self.status_query_count += 1

    if check_only is True:
        if call_status is None:
            return None

    while call_status is None:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_status = internal_storage.get_call_status(self.executor_id, self.callgroup_id, self.call_id)
        self.status_query_count += 1

    self._invoke_metadata['status_done_timestamp'] = time.time()
    self._invoke_metadata['status_query_count'] = self.status_query_count

    self.run_status = call_status  # this is the remote status information
    self.invoke_status = self._invoke_metadata  # local status information

    total_time = format(round(call_status['end_time'] - call_status['start_time'], 2), '.2f')

    if call_status['exception'] is not None:
        # the wrenhandler had an exception
        self._set_state(JobState.error)
        exception_str = call_status['exception']
        exception_args = call_status['exception_args']

        log_msg = ('Executor ID {} Error in {} {} - Time: {} '
                   'seconds - Result: {}'.format(self.executor_id, self.call_id,
                                                 self.activation_id, str(total_time),
                                                 exception_args[0] + " " + exception_args[1]))
        logger.debug(log_msg)

        if exception_args[0] == "WRONGVERSION":
            if throw_except:
                raise Exception("PyWren version mismatch: remote expected version {}, "
                                "local library is version {}".format(exception_args[2],
                                                                     exception_args[3]))
            return None
        elif exception_args[0] == "OUTATIME":
            if throw_except:
                raise Exception("Process ran out of time - {} - {}".format(self.call_id,
                                                                           self.activation_id))
            return None
        elif exception_args[0] == "OUTOFMEMORY":
            if throw_except:
                raise Exception("Process exceeded maximum memory and was "
                                "killed - {} - {}".format(self.call_id, self.activation_id))
            return None
        else:
            if 'exception_traceback' in call_status:
                self._exception = Exception(exception_str, *exception_args)
            if throw_except:
                raise self._exception
            return None

    call_output_time = time.time()
    call_invoker_result = internal_storage.get_call_output(self.executor_id, self.callgroup_id, self.call_id)
    self.output_query_count += 1

    while call_invoker_result is None and self.output_query_count < self.GET_RESULT_MAX_RETRIES:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_invoker_result = internal_storage.get_call_output(self.executor_id, self.callgroup_id, self.call_id)
        self.output_query_count += 1

    if call_invoker_result is None:
        if throw_except:
            raise Exception('Unable to get the output of the function - Activation ID: {}'.format(self.activation_id))
        else:
            self._set_state(JobState.error)
            return None

    call_invoker_result = pickle.loads(call_invoker_result)
    call_output_time_done = time.time()
    self._call_invoker_result = call_invoker_result

    self._invoke_metadata['download_output_time'] = call_output_time_done - call_output_time
    self._invoke_metadata['output_query_count'] = self.output_query_count
    self._invoke_metadata['download_output_timestamp'] = call_output_time_done

    call_success = call_invoker_result['success']
    self.invoke_status = self._invoke_metadata  # local status information

    if call_success:
        log_msg = ('Executor ID {} Response from Function {} - Activation '
                   'ID: {} - Time: {} seconds'.format(self.executor_id, self.call_id,
                                                      self.activation_id, str(total_time)))
        logger.debug(log_msg)
        function_result = call_invoker_result['result']

        if isinstance(function_result, ResponseFuture):
            self._new_futures = [function_result]
            self._set_state(JobState.futures)
            return self._new_futures

        elif type(function_result) == list and len(function_result) > 0 \
                and isinstance(function_result[0], ResponseFuture):
            self._new_futures = function_result
            self._set_state(JobState.futures)
            return self._new_futures

        else:
            self._return_val = function_result
            self._set_state(JobState.success)
            return self._return_val

    elif throw_except:
        self._exception = call_invoker_result['result']
        self._traceback = (call_invoker_result['exc_type'],
                           call_invoker_result['exc_value'],
                           call_invoker_result['exc_traceback'])
        self._set_state(JobState.error)

        if call_invoker_result.get('pickle_fail', False):
            fault = Exception(call_invoker_result['exc_value'])
            reraise(Exception, fault, call_invoker_result['exc_traceback'])
        else:
            reraise(*self._traceback)
    else:
        self._set_state(JobState.error)
        return None  # nothing, don't raise, no value
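# Usage sketch for result() (the future object is hypothetical): with
# throw_except=False a remote failure yields None and moves the future to
# JobState.error instead of re-raising the remote exception locally.
#
#   >>> value = future.result(throw_except=False)
#   >>> if value is None:
#   ...     print('call failed or produced no output')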
def ibm_cloud_function_handler(event):
    start_time = time.time()
    logger.info("Starting handler")
    response_status = {'exception': None}
    response_status['start_time'] = start_time

    context_dict = {
        'ibm_cf_request_id': os.environ.get("__OW_ACTIVATION_ID"),
        'ibm_cf_hostname': os.environ.get("HOSTNAME"),
        'ibm_cf_python_version': os.environ.get("PYTHON_VERSION"),
    }

    config = event['config']
    storage_config = wrenconfig.extract_storage_config(config)

    custom_handler_env = {'PYWREN_CONFIG': json.dumps(config),
                          'STORAGE_CONFIG': json.dumps(storage_config),
                          'PYWREN_EXECUTOR_ID': event['executor_id']}
    os.environ.update(custom_handler_env)

    # taken out of the try block so it is always defined in the finally block
    extra_env = event.get('extra_env', {})

    try:
        status_key = event['status_key']
        func_key = event['func_key']
        data_key = event['data_key']
        data_byte_range = event['data_byte_range']
        output_key = event['output_key']

        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "PyWren version mismatch",
                            version.__version__, event['pywren_version'])

        job_max_runtime = event.get("job_max_runtime", 550)  # default for CF

        response_status['func_key'] = func_key
        response_status['data_key'] = data_key
        response_status['output_key'] = output_key
        response_status['status_key'] = status_key

        # free_disk_bytes = free_disk_space("/tmp")
        # response_status['free_disk_bytes'] = free_disk_bytes

        extra_env['PYTHONPATH'] = "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH)
        extra_env['PYTHONUNBUFFERED'] = 'True'

        call_id = event['call_id']
        callgroup_id = event['callgroup_id']
        executor_id = event['executor_id']
        response_status['call_id'] = call_id
        response_status['callgroup_id'] = callgroup_id
        response_status['executor_id'] = executor_id

        # pass a full json blob
        jobrunner_config = {'func_key': func_key,
                            'data_key': data_key,
                            'data_byte_range': data_byte_range,
                            'python_module_path': PYTHON_MODULE_PATH,
                            'output_key': output_key,
                            'stats_filename': JOBRUNNER_STATS_FILENAME}

        with open(JOBRUNNER_CONFIG_FILENAME, 'w') as jobrunner_fid:
            json.dump(jobrunner_config, jobrunner_fid)

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        cmdstr = "python {} {}".format(JOBRUNNER_PATH, JOBRUNNER_CONFIG_FILENAME)

        logger.info("About to execute '{}'".format(cmdstr))
        setup_time = time.time()
        response_status['setup_time'] = setup_time - start_time

        local_env = os.environ.copy()
        local_env.update(extra_env)

        """
        stdout = os.popen(cmdstr).read()
        print(stdout)
        process = subprocess.run(cmdstr, shell=True, env=local_env, bufsize=1,
                                 stdout=subprocess.PIPE, preexec_fn=os.setsid,
                                 universal_newlines=True, timeout=job_max_runtime)
        print(process.stdout)
        """

        # This is copied from http://stackoverflow.com/a/17698359/4577954
        # reasons for setting process group: http://stackoverflow.com/a/4791612
        process = subprocess.Popen(cmdstr, shell=True, env=local_env, bufsize=1,
                                   stdout=subprocess.PIPE, preexec_fn=os.setsid,
                                   universal_newlines=True)

        logger.info("launched process")

        def consume_stdout(stdout, queue):
            with stdout:
                for line in stdout:
                    print(line, end='')
                    queue.put(line)

        q = Queue()
        t = Thread(target=consume_stdout, args=(process.stdout, q))
        t.daemon = True
        t.start()
        t.join(job_max_runtime)

        if t.is_alive():
            # If the process is still alive after t.join(job_max_runtime), kill it
            logger.error("Process exceeded maximum runtime of {} sec".format(job_max_runtime))
            # Send the signal to all the process groups
            os.killpg(os.getpgid(process.pid), signal.SIGTERM)
            raise Exception("OUTATIME", "Process executed for too long and was killed")

        if not q.empty():
            if 'Jobrunner finished' not in q.queue[q.qsize() - 1].strip():
                raise Exception("OUTOFMEMORY", "Process exceeded maximum memory and was killed")

        logger.info("Command execution finished")

        # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True))

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ")
                    float_value = float(value)
                    response_status[key] = float_value

        response_status['exec_time'] = time.time() - setup_time
        response_status['host_submit_time'] = event['host_submit_time']
        # response_status['server_info'] = get_server_info()
        response_status.update(context_dict)
        response_status['end_time'] = time.time()

    except Exception as e:
        # internal runtime exceptions
        logger.error("There was an exception: {}".format(str(e)))
        response_status['end_time'] = time.time()
        response_status['exception'] = str(e)
        response_status['exception_args'] = e.args
        response_status['exception_traceback'] = traceback.format_exc()

    finally:
        store_status = True
        if 'STORE_STATUS' in extra_env:
            store_status = eval(extra_env['STORE_STATUS'])

        if store_status:
            internal_storage = storage.InternalStorage(storage_config)
            internal_storage.put_data(status_key, json.dumps(response_status))
def __init__(self, config=None, runtime=None, runtime_memory=None,
             log_level=None, rabbitmq_monitor=False):
    """
    Initialize and return an executor class.

    :param config: Settings passed in here will override those in
        `pywren_config`. Default None.
    :param runtime: Runtime name to use. Default None.
    :param runtime_memory: Memory (in MB) to use in the runtime.
    :param log_level: Log level to use during the execution.
    :param rabbitmq_monitor: Use RabbitMQ as the monitoring system.
    :return: `executor` object.

    Usage
      >>> import pywren_ibm_cloud as pywren
      >>> pw = pywren.ibm_cf_executor()
    """
    self.start_time = time.time()
    self._state = ExecutorState.new

    if config is None:
        self.config = wrenconfig.default()
    else:
        self.config = wrenconfig.default(config)

    self.is_cf_cluster = is_cf_cluster()
    self.data_cleaner = self.config['pywren']['data_cleaner']

    # Overwrite runtime variables
    if runtime:
        self.config['pywren']['runtime'] = runtime
    if runtime_memory:
        self.config['pywren']['runtime_memory'] = int(runtime_memory)

    # Log level configuration
    self.log_level = log_level
    if not self.log_level:
        if logger.getEffectiveLevel() != logging.WARNING:
            self.log_level = logging.getLevelName(logger.getEffectiveLevel())
    if self.log_level:
        os.environ["PYWREN_LOG_LEVEL"] = self.log_level
        if not self.is_cf_cluster:
            wrenlogging.default_config(self.log_level)

    # RabbitMQ monitor configuration
    self.rabbitmq_monitor = rabbitmq_monitor
    if self.rabbitmq_monitor:
        if self.config['rabbitmq']['amqp_url']:
            os.environ["PYWREN_RABBITMQ_MONITOR"] = 'True'
        else:
            self.rabbitmq_monitor = False
    else:
        self.config['rabbitmq']['amqp_url'] = None

    storage_config = wrenconfig.extract_storage_config(self.config)
    self.internal_storage = storage.InternalStorage(storage_config)

    invoker = invokers.IBMCloudFunctionsInvoker(self.config)
    self.executor = Executor(invoker, self.config, self.internal_storage)
    self.executor_id = self.executor.executor_id
    self.futures = []
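# Usage sketch for this constructor (values are hypothetical): runtime_memory
# selects the deployed runtime of that memory size, and rabbitmq_monitor is
# only honoured when config['rabbitmq']['amqp_url'] is set.
#
#   >>> import pywren_ibm_cloud as pywren
#   >>> pw = pywren.ibm_cf_executor(runtime_memory=1024, rabbitmq_monitor=True)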
def function_handler(event):
    start_time = time.time()
    logger.debug("Action handler started")
    response_status = {'exception': False}
    response_status['host_submit_time'] = event['host_submit_time']
    response_status['start_time'] = start_time

    context_dict = {
        'ibm_cf_request_id': os.environ.get("__OW_ACTIVATION_ID"),
        'ibm_cf_python_version': os.environ.get("PYTHON_VERSION"),
    }

    config = event['config']
    storage_config = wrenconfig.extract_storage_config(config)

    log_level = event['log_level']
    wrenlogging.ow_config(log_level)

    call_id = event['call_id']
    callgroup_id = event['callgroup_id']
    executor_id = event['executor_id']
    logger.info("Execution ID: {}/{}/{}".format(executor_id, callgroup_id, call_id))
    job_max_runtime = event.get("job_max_runtime", 590)  # default for CF
    status_key = event['status_key']
    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']
    output_key = event['output_key']
    extra_env = event.get('extra_env', {})

    response_status['call_id'] = call_id
    response_status['callgroup_id'] = callgroup_id
    response_status['executor_id'] = executor_id
    # response_status['func_key'] = func_key
    # response_status['data_key'] = data_key
    # response_status['output_key'] = output_key
    # response_status['status_key'] = status_key

    try:
        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "PyWren version mismatch",
                            version.__version__, event['pywren_version'])

        # response_status['free_disk_bytes'] = free_disk_space("/tmp")

        custom_env = {'PYWREN_CONFIG': json.dumps(config),
                      'PYWREN_EXECUTOR_ID': executor_id,
                      'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH),
                      'PYTHONUNBUFFERED': 'True'}
        os.environ.update(custom_env)
        os.environ.update(extra_env)

        # pass a full json blob
        jobrunner_config = {'func_key': func_key,
                            'data_key': data_key,
                            'log_level': log_level,
                            'data_byte_range': data_byte_range,
                            'python_module_path': PYTHON_MODULE_PATH,
                            'output_key': output_key,
                            'stats_filename': JOBRUNNER_STATS_FILENAME}

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        setup_time = time.time()
        response_status['setup_time'] = round(setup_time - start_time, 8)

        result_queue = multiprocessing.Queue()
        jr = jobrunner(jobrunner_config, result_queue)
        jr.daemon = True
        logger.info("Starting jobrunner process")
        jr.start()
        jr.join(job_max_runtime)
        response_status['exec_time'] = round(time.time() - setup_time, 8)

        if jr.is_alive():
            # If the process is still alive after jr.join(job_max_runtime), kill it
            logger.error("Process exceeded maximum runtime of {} seconds".format(job_max_runtime))
            # Send the signal to all the process groups
            jr.terminate()
            raise Exception("OUTATIME", "Process executed for too long and was killed")

        try:
            # Only one message is returned by the jobrunner
            result_queue.get(block=False)
        except Exception:
            # If there is no message, the process was killed due to memory usage
            raise Exception("OUTOFMEMORY", "Process exceeded maximum memory and was killed")

        # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True))

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ", 1)
                    try:
                        response_status[key] = float(value)
                    except Exception:
                        response_status[key] = value
                    if key in ('exception', 'exc_pickle_fail', 'result'):
                        response_status[key] = eval(value)

        # response_status['server_info'] = get_server_info()
        response_status.update(context_dict)
        response_status['end_time'] = time.time()

    except Exception as e:
        # internal runtime exceptions
        logger.error("There was an exception: {}".format(str(e)))
        response_status['end_time'] = time.time()
        response_status['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure it can be unpickled
        response_status['exc_info'] = str(pickled_exc)

    finally:
        store_status = strtobool(os.environ.get('STORE_STATUS', 'True'))
        rabbit_amqp_url = config['rabbitmq'].get('amqp_url')
        dmpd_response_status = json.dumps(response_status)
        drs = sizeof_fmt(len(dmpd_response_status))

        if rabbit_amqp_url and store_status:
            status_sent = False
            output_query_count = 0
            while not status_sent and output_query_count < 5:
                output_query_count += 1
                try:
                    params = pika.URLParameters(rabbit_amqp_url)
                    connection = pika.BlockingConnection(params)
                    channel = connection.channel()
                    channel.queue_declare(queue=executor_id, auto_delete=True)
                    channel.basic_publish(exchange='', routing_key=executor_id,
                                          body=dmpd_response_status)
                    connection.close()
                    logger.info("Execution stats sent to rabbitmq - Size: {}".format(drs))
                    status_sent = True
                except Exception as e:
                    logger.error("Unable to send status to rabbitmq")
                    logger.error(str(e))
                    logger.info('Retrying to send stats to rabbitmq...')
                    time.sleep(0.2)

        if store_status:
            internal_storage = storage.InternalStorage(storage_config)
            logger.info("Storing execution stats - status.json - Size: {}".format(drs))
            internal_storage.put_data(status_key, dmpd_response_status)
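# For reference, a sketch of the event dict function_handler() expects, with
# every key taken from the reads above (all values are illustrative):
#
#   event = {'config': {...},                # full PyWren config blob
#            'log_level': 'INFO',
#            'call_id': '00000', 'callgroup_id': '...', 'executor_id': '...',
#            'host_submit_time': 0.0,
#            'pywren_version': '...',        # checked against version.__version__
#            'status_key': '...', 'func_key': '...', 'data_key': '...',
#            'data_byte_range': None, 'output_key': '...',
#            'extra_env': {},                # optional
#            'job_max_runtime': 590}         # optional, seconds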
def status(self, check_only=False, throw_except=True, internal_storage=None):
    """
    Return the status returned by the call.
    If the call raised an exception, this method will raise the same exception.
    If the future is cancelled before completing, CancelledError will be raised.

    :param check_only: Return None immediately if job is not complete. Default False.
    :param throw_except: Reraise exception if call raised. Default True.
    :param internal_storage: Storage handler to poll cloud storage. Default None.
    :return: Status of the call.
    :raises CancelledError: If the job is cancelled before completed.
    :raises TimeoutError: If job is not complete after `timeout` seconds.
    """
    if self.ready or self.done:
        return self.run_status

    if internal_storage is None:
        internal_storage = storage.InternalStorage(self.storage_config)

    storage_utils.check_storage_path(internal_storage.get_storage_config(), self.storage_path)
    call_status = internal_storage.get_call_status(self.executor_id, self.callgroup_id, self.call_id)
    self.status_query_count += 1

    if check_only is True:
        if call_status is None:
            return None

    while call_status is None:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_status = internal_storage.get_call_status(self.executor_id, self.callgroup_id, self.call_id)
        self.status_query_count += 1

    self.invoke_status['status_done_timestamp'] = time.time()
    self.invoke_status['status_query_count'] = self.status_query_count
    self.run_status = call_status  # this is the remote status information

    total_time = format(round(call_status['end_time'] - call_status['start_time'], 2), '.2f')

    if call_status['exception'] is not None:
        # the wrenhandler had an exception
        self._set_state(JobState.error)
        exception_str = call_status['exception']
        exception_args = call_status['exception_args']

        log_msg = ('Executor ID {} Error in {} {} - Time: {} '
                   'seconds - Result: {}'.format(self.executor_id, self.call_id,
                                                 self.activation_id, str(total_time),
                                                 exception_args[0] + " " + exception_args[1]))
        logger.debug(log_msg)

        if exception_args[0] == "WRONGVERSION":
            if throw_except:
                raise Exception("PyWren version mismatch: remote expected version {}, "
                                "local library is version {}".format(exception_args[2],
                                                                     exception_args[3]))
            return None
        elif exception_args[0] == "OUTATIME":
            if throw_except:
                raise Exception("Process ran out of time - {} - {}".format(self.call_id,
                                                                           self.activation_id))
            return None
        elif exception_args[0] == "OUTOFMEMORY":
            if throw_except:
                raise Exception("Process exceeded maximum memory and was "
                                "killed - {} - {}".format(self.call_id, self.activation_id))
            return None
        else:
            if 'exception_traceback' in call_status:
                self._exception = Exception(exception_str, *exception_args)
            if throw_except:
                raise self._exception
            return None

    log_msg = ('Executor ID {} Response from Function {} - Activation '
               'ID: {} - Time: {} seconds'.format(self.executor_id, self.call_id,
                                                  self.activation_id, str(total_time)))
    logger.debug(log_msg)
    self._set_state(JobState.ready)

    if 'new_futures' in call_status:
        unused_callgroup_id, total_new_futures = call_status['new_futures'].split('/')
        if int(total_new_futures) > 0:
            self.result(throw_except=throw_except, internal_storage=internal_storage)

    return self.run_status