def _init_heartbeat_parameters(self, heartbeat_params): if "heartbeat_method" not in heartbeat_params: self._logger.debug( "No heartbeat method is specified; disabling heartbeat.") return else: self._heartbeat_enabled = True self._heartbeat_method = heartbeat_params["heartbeat_method"] #self._logger.debug("[SessionHelperThread] New heartbeat method: " + str(self._heartbeat_method)) if self._heartbeat_method == "function": if "heartbeat_function" in heartbeat_params: # enable function related heartbeat self._heartbeat_function = heartbeat_params[ "heartbeat_function"] #self._logger.debug("[SessionHelperThread] New heartbeat function: " + str(self._heartbeat_function)) if self._local_queue_client_heartbeat is None: self._local_queue_client_heartbeat = LocalQueueClient( connect=self._queue_service) # disable data layer related heartbeat if self._data_layer_client_heartbeat is not None: self._data_layer_client_heartbeat.delete( self._heartbeat_data_layer_key) self._heartbeat_data_layer_key = None self._data_layer_client_heartbeat.shutdown() self._data_layer_client_heartbeat = None elif self._heartbeat_method == "data_layer": # needs to be unique among session functions, so use session id + session function id # TODO: how do you check the heartbeat in the data layer? # checker service or user function needs to know the key # OR keep a new map for heartbeats of the session functions # so that the checker can retrieve the keys and their values (e.g., timestamps) # if a session function misses a heartbeat, the checker function reports to policy handler # enable data layer related heartbeat self._heartbeat_data_layer_key = "heartbeat_" + self._session_id + "_" + self._session_function_id if self._data_layer_client_heartbeat is None: self._data_layer_client_heartbeat = DataLayerClient( locality=1, for_mfn=True, sid=self._sandboxid, connect=self._datalayer) # disable function related heartbeat if self._local_queue_client_heartbeat is not None: self._local_queue_client_heartbeat.shutdown() self._local_queue_client_heartbeat = None self._heartbeat_function = None else: raise MicroFunctionsSessionAPIException( "Unsupported heartbeat method for session function.") # must be in milliseconds if "heartbeat_interval_ms" in heartbeat_params: self._heartbeat_interval = heartbeat_params[ "heartbeat_interval_ms"] self._local_poll_timeout = self._heartbeat_interval / 2.0
def __init__(self, hostname, queue, datalayer, sandboxid, userid, workflowid, elasticsearch, workflowname, endpoint_key): self._start = time.time() self._python_version = sys.version_info self._hostname = hostname self._queue = queue self._datalayer = datalayer self._elasticsearch = elasticsearch self._userid = userid self._sandboxid = sandboxid self._workflowid = workflowid self._workflowname = workflowname # _XXX_: we'll use the endpoint_key to look up our endpoint self._endpoint_key = endpoint_key self._deployment_info_key = "deployment_info_workflow_" + self._workflowid self._logger = logging_helpers.setup_logger(self._sandboxid, LOG_FILENAME) self._fluentbit_process, self._command_args_map_fluentbit = logging_helpers.setup_fluentbit_and_elasticsearch_index( self._logger, FLUENTBIT_FOLDER, self._elasticsearch, ELASTICSEARCH_INDEX_WF, ELASTICSEARCH_INDEX_FE) self._logger.info("hostname (and container name): %s", self._hostname) self._logger.info("elasticsearch nodes: %s", self._elasticsearch) self._logger.info("queueservice: %s", self._queue) self._logger.info("datalayer: %s", self._datalayer) self._logger.info("user id: %s", self._userid) self._logger.info("sandbox id: %s", self._sandboxid) self._logger.info("workflow id: %s", self._workflowid) self._logger.info("workflow name: %s", self._workflowname) self._logger.info("endpoint_key: %s", self._endpoint_key) self._instructions_topic = "instructions_" + self._sandboxid self._management_data_layer_client = DataLayerClient( locality=1, sid="Management", wid="Management", is_wf_private=True, connect=self._datalayer) self._logger.info("Management data layer client connected after %s s", str(time.time() - self._start)) # to be declared later self._local_queue_client = None self._deployment = None self._queue_service_process = None self._frontend_process = None # visible to the outside world: either kubernetes assigned URL or bare-metal host address + exposed port self._external_endpoint = None # visible internally: kubernetes node address or same as bare-metal external endpoint self._internal_endpoint = None self._is_running = False self._shutting_down = False
def get_backup_data_layer_client(self): if self._backup_data_layer_client is None: # locality = -1 means that the writes happen to the local data layer first and then asynchronously to the global data layer self._backup_data_layer_client = DataLayerClient( locality=-1, for_mfn=True, sid=self._sandboxid, connect=self._datalayer) return self._backup_data_layer_client
def _get_data_layer_client(self, is_private=False): ''' Return the data layer client, so that it can be used to commit to the data layer when the function instance finishes. If it is not initialized yet, it will be initialized here. ''' # TODO: need also the locality information if is_private: if self._data_layer_client_private is None: self._data_layer_client_private = DataLayerClient(locality=1, sid=self._sandboxid, wid=self._workflowid, is_wf_private=True, connect=self._datalayer) return self._data_layer_client_private if self._data_layer_client is None: self._data_layer_client = DataLayerClient(locality=1, suid=self._storage_userid, is_wf_private=False, connect=self._datalayer) return self._data_layer_client
def __init__(self, deployment_info, hostname, userid, sandboxid, workflowid, workflowname, queue, datalayer, logger, external_endpoint, internal_endpoint): self._logger = logger self._deployment_info = deployment_info self._hostname = hostname self._userid = userid self._sandboxid = sandboxid self._workflowid = workflowid self._workflowname = workflowname self._queue = queue self._datalayer = datalayer self._external_endpoint = external_endpoint self._internal_endpoint = internal_endpoint self._python_version = sys.version_info self._storage_userid = self._userid.replace("@", "AT") self._storage_userid = self._storage_userid.replace("-", "_").replace( ".", "_") self._process_id = os.getpid() self._functionworker_process_map = {} self._javarequesthandler_process_list = [] self._queue_service_process = None self._frontend_process = None self._fluentbit_process = None # it will be probably updated to be something else self._fluentbit_actual_pid = -1 self._child_process_command_args_map = {} # to be declared later when parsing the deployment info self._workflow = None self._global_data_layer_client = DataLayerClient( locality=1, suid=self._storage_userid, connect=self._datalayer) self._local_queue_client = None
def __init__(self, hostname, uid, sid, wid, logger, funcstatename, functopic, key, session_id, publication_utils, queue, datalayer, internal_endpoint): self._logger = logger self._queue = queue self._datalayer = datalayer self._session_id = session_id self._session_function_id = None self._hostname = hostname self._userid = uid self._sandboxid = sid self._workflowid = wid self._function_state_name = funcstatename self._function_topic = functopic self._internal_endpoint = internal_endpoint self._key = key self._publication_utils = publication_utils self._is_session_function_running = False self._helper_thread = None self._global_data_layer_client = DataLayerClient( locality=1, sid=sid, for_mfn=True, connect=self._datalayer) # only valid if this is a session function (i.e., session_function_id is not None) self._local_topic_communication = None self._session_function_parameters = None if self._session_id is None: self._generate_session_id() self._setup_metadata_tablenames()
class PublicationUtils(): def __init__(self, sandboxid, workflowid, functopic, funcruntime, wfnext, wfpotnext, wflocal, wflist, wfexit, cpon, stateutils, logger, queue, datalayer): self._logger = logger self._function_topic = functopic self._sandboxid = sandboxid self._workflowid = workflowid self._function_runtime = funcruntime self._prefix = self._sandboxid + "-" + self._workflowid + "-" self._wf_next = wfnext self._wf_pot_next = wfpotnext self._wf_local = wflocal self._wf_function_list = wflist self._wf_exit = wfexit # whether we should store backups of triggers before publishing the output self._should_checkpoint = cpon # the topic to send out messages to remote functions # TODO: pub_topic_global becomes a new request to another sandbox? # via header? self._pub_topic_global = "pub_global" self._recovery_manager_topic = "RecoveryManager" self._state_utils = stateutils self._metadata = None self._queue = queue self._local_queue_client = None self._datalayer = datalayer self._sapi = None self._output_counter_map = {} self._dynamic_workflow = [] self._backup_data_layer_client = None self._execution_info_map_name = None self._next_backup_list = [] #self._logger.debug("[PublicationUtils] init done.") # only to be called from the function worker def set_sapi(self, sapi): self._sapi = sapi def set_metadata(self, metadata): self._metadata = metadata self._execution_info_map_name = "execution_info_map_" + self._metadata[ "__execution_id"] def update_metadata(self, metadata_name, metadata_value, is_privileged=False): if is_privileged: self._metadata[metadata_name] = metadata_value else: if "__mfnusermetadata" not in self._metadata: self._metadata["__mfnusermetadata"] = {} self._metadata["__mfnusermetadata"][metadata_name] = metadata_value def _get_local_queue_client(self): if self._local_queue_client is None: self._local_queue_client = LocalQueueClient(connect=self._queue) return self._local_queue_client def _shutdown_local_queue_client(self): if self._local_queue_client is not None: self._local_queue_client.shutdown() def get_backup_data_layer_client(self): if self._backup_data_layer_client is None: # locality = -1 means that the writes happen to the local data layer first and then asynchronously to the global data layer self._backup_data_layer_client = DataLayerClient( locality=-1, for_mfn=True, sid=self._sandboxid, connect=self._datalayer) return self._backup_data_layer_client def shutdown_backup_data_layer_client(self): if self._backup_data_layer_client is not None: self._backup_data_layer_client.shutdown() def convert_api_message_to_python_object(self, message): # _XXX_: Java objects need to be serialized and passed to python; however, API functions expect python objects # we make the conversion according to the runtime val = message if self._function_runtime == "java": val = json.loads(message) val = val["value"] return val def is_valid_value(self, value): if not (py3utils.is_string(value) \ or isinstance(value, (dict, list, int, float)) \ or value is None): return False return True def _is_valid_trigger_destination(self, destination): if not (py3utils.is_string(destination) and destination != ""): return False return True def _is_allowed_or_privileged(self, destination, send_now): # Management service is privileged, so allow # 1) asynchronous execution # 2) Recovery manager topic # @ returns a tuple with (is_allowed, is_privileged) if self._sandboxid == "Management" and self._workflowid == "Management": if destination[0:6] == "async_" or\ destination == self._recovery_manager_topic: # next[0:6] == "async_" # next == self._recovery_manager_topic: return (True, True) return (True, False) if send_now: if destination not in self._wf_function_list: return (False, False) elif destination not in self._wf_pot_next: return (False, False) return (True, False) def is_valid_trigger_message(self, next, value, send_now): is_valid = True errmsg = "" if not self._is_valid_trigger_destination(next): is_valid = False errmsg = "Malformed dynamic trigger definition; 'next' must be a string." is_allowed, is_privileged = self._is_allowed_or_privileged( next, send_now) if not is_allowed: is_valid = False if send_now: errmsg = errmsg + "\n" + "Destination is not in workflow: " + next errmsg = errmsg + "\n" + "Can only send an immediate trigger message to an existing function or the workflow end." else: errmsg = errmsg + "\n" + "Workflow does not match generated 'next': " + next if not self.is_valid_value(value): is_valid = False errmsg = errmsg + "\n" + "Malformed dynamic trigger definition; 'value' must be a python data type (dict, list, str, int, float, or None)." return is_valid, is_privileged, errmsg def decode_input(self, encoded_input): if encoded_input == '': encoded_input = '{}' #if isinstance(encoded_input,dict): # encoded_input = json.dumps(encoded_input) # if encoded_input aready is a dict, convert to JSON Text #if encoded_input.startswith("null"): # encoded_input = encoded_input.replace("null","") #Decode input. Input (value) must be a valid JSON Text. #however, post-commit hook published value has the format key; value #print ("Encoded State Input: " + str(encoded_input).replace("null","")) #print ("Encoded State Input: " + str(encoded_input) + str(type(encoded_input))) #self._logger.debug("received user input in decode_input: " + str(encoded_input)) try: #if isinstance(encoded_input,str): raw_state_input = json.loads(encoded_input) #if isinstance(encoded_input,dict): # raw_state_input = encoded_input return raw_state_input except Exception as exc: #self._logger.exception("User Input is not a valid JSON Text") #self._logger.exception(exc) raise Exception("User Input is not a valid JSON Text: " + str(exc)) def encode_output(self, raw_state_output): #Produce output JSON Text from raw_state_output try: value_output = json.dumps(raw_state_output) return value_output except Exception as exc: #self._logger.exception("Error while encoding state output") #self._logger.exception(exc) raise Exception("Error while encoding state output: " + str(exc)) def decapsulate_input(self, encoded_encapsulated_input): # The actual user input is encapsulated in a dict of the form: # { "__mfnuserdata": actual_user_input, # "__mfnmetadata": system_specific_metadata } # This encapsulation is invisible to the user and is added, # maintained, and removed by the frontend and function worker. if encoded_encapsulated_input == '': #self._logger.exception("Invalid encapsulation of user input") raise MicroFunctionsException( "Invalid encapsulation of user input.") else: try: encapsulated_input = json.loads(encoded_encapsulated_input) userdata = encapsulated_input['__mfnuserdata'] metadata = encapsulated_input['__mfnmetadata'] return userdata, metadata except Exception as exc: #self._logger.exception("Unable to decode encapsulated user input") #self._logger.exception(e) raise MicroFunctionsException( "Unable to decode encapsulated user input: " + str(exc)) def encapsulate_output(self, encoded_state_output, metadata): try: value = { "__mfnuserdata": encoded_state_output, "__mfnmetadata": metadata } value_output = json.dumps(value) return value_output except Exception as exc: #self._logger.exception("Error while encoding state output") #self._logger.exception(e) raise MicroFunctionsException( "Error while encoding state output: " + str(exc)) def get_dynamic_workflow(self): ''' Return the dynamically generated workflow information, so that this function instance can trigger other functions when it finishes. ''' return self._dynamic_workflow def send_message_to_running_function(self, trigger): self.send_to_function_now("-1l", trigger, lqcpub=None) def append_trigger(self, trigger): trigger["value"] = self.encode_output(trigger["value"]) self._dynamic_workflow.append(trigger) def _convert_function_output_static_workflow(self, function_output): converted_function_output = [] for wfnext in self._wf_next: converted_function_output.append({ "next": wfnext, "value": function_output }) return converted_function_output def _store_output_data(self): data_out = self._sapi.get_transient_data_output() to_be_deleted = self._sapi.get_data_to_be_deleted() if data_out or to_be_deleted: dlc = self._sapi._get_data_layer_client() for k in data_out: dlc.put(k, data_out.get(k)) for k in to_be_deleted: dlc.delete(k) data_out_private = self._sapi.get_transient_data_output( is_private=True) to_be_deleted_private = self._sapi.get_data_to_be_deleted( is_private=True) if data_out_private or to_be_deleted_private: dlc_private = self._sapi._get_data_layer_client(is_private=True) for k in data_out_private: dlc_private.put(k, data_out_private.get(k)) for k in to_be_deleted_private: dlc_private.delete(k) self._sapi._shutdown_data_layer_client() def _send_local_queue_message(self, lqcpub, lqtopic, key, value): # construct a LocalQueueClientMessage(key, value) # and send it to the local queue topic via the local queue client lqcm = LocalQueueClientMessage(key=key, value=value) #lqcpub.addMessage(lqtopic, lqcm, False) ack = lqcpub.addMessage(lqtopic, lqcm, True) while not ack: ack = lqcpub.addMessage(lqtopic, lqcm, True) def _send_remote_message(self, remote_address, message_type, lqtopic, key, value): # form a http request to send to remote host # need to set async=true in request URL, so that the frontend does not have a sync object waiting if message_type == "session_update": # if a session update message, set headers appropriately action_data = {} action_data["topic"] = lqtopic action_data["key"] = key action_data["value"] = value resp = requests.post(remote_address, params={"async": 1}, json={}, headers={ "X-MFN-Action": "Session-Update", "X-MFN-Action-Data": json.dumps(action_data) }) elif message_type == "global_pub": # TODO: if global publishing, set headers appropriately (e.g., for load balancing) pass return def _publish_privileged_output(self, function_output, lqcpub): next = function_output["next"] output = {} # init metadata for the workflow (similar to the frontend) metadata = {} metadata["__result_topic"] = self._metadata["__result_topic"] metadata["__execution_id"] = self._metadata["__execution_id"] metadata["__function_execution_id"] = self._metadata["__execution_id"] if next[:6] == "async_": # backup of the 'input' and 'next' has been done by executeWorkflowAsync in management service metadata["__async_execution"] = True output["topicNext"] = next[6:] elif next == self._recovery_manager_topic: metadata["__async_execution"] = self._metadata["__async_execution"] output["topicNext"] = next output["value"] = self.encapsulate_output(function_output["value"], metadata) outkey = self._metadata["__execution_id"] # publish to pub manager's separate queue for global next outputstr = json.dumps(output) self._send_local_queue_message(lqcpub, self._pub_topic_global, outkey, outputstr) return (None, None) def _generate_trigger_metadata(self, topic_next): # keep track of the output instances of the next topic # e.g., funcA -> funcB with input1 (instance 0) and funcB with input2 (instance 1) if topic_next not in self._output_counter_map: self._output_counter_map[topic_next] = 0 output_instance_id = self._output_counter_map[topic_next] next_function_execution_id = self._metadata[ "__function_execution_id"] + "_" + str(output_instance_id) # get current state type. if map state add marker to execution Id state_type = self._state_utils.functionstatetype self._logger.debug("self._state_utils.functionstatetype: " + str(state_type)) if state_type == 'Map': next_function_execution_id = self._metadata[ "__function_execution_id"] + "_" + str( output_instance_id) + "-M" self._output_counter_map[topic_next] += 1 trigger_metadata = copy.deepcopy(self._metadata) trigger_metadata[ "__function_execution_id"] = next_function_execution_id #self._logger.debug("trigger metadata: " + str(trigger_metadata)) return (next_function_execution_id, trigger_metadata) def _publish_output(self, key, trigger, lqcpub, timestamp_map=None): if timestamp_map is not None: timestamp_map['t_pub_output'] = time.time() * 1000.0 next = trigger["next"] if "to_running_function" in trigger and trigger["to_running_function"]: # SessionUtils API calls have already determined the locality # this is for a running function instance on a remote host if "is_local" in trigger and trigger["is_local"]: trigger["value"] = self.encapsulate_output( trigger["value"], self._metadata) # this is for a running function on the local host # SessionUtils has already created the appropriate next if timestamp_map is not None: timestamp_map['t_pub_localqueue'] = time.time() * 1000.0 self._send_local_queue_message(lqcpub, next, key, trigger["value"]) else: # send it to the remote host with a special header self._send_remote_message(trigger["remote_address"], "session_update", next, key, trigger["value"]) return (None, None) elif "is_privileged" in trigger and trigger["is_privileged"]: # next[0:6] == "async_" # next == self._recovery_manager_topic: return self._publish_privileged_output(trigger, lqcpub) else: topic_next = self._prefix + next output = {} output["topicNext"] = topic_next next_function_execution_id, trigger_metadata = self._generate_trigger_metadata( topic_next) output["value"] = self.encapsulate_output(trigger["value"], trigger_metadata) # check whether next is local or not if topic_next in self._wf_local: # event message directly to the next function's local queue topic if timestamp_map is not None: timestamp_map['t_pub_localqueue'] = time.time() * 1000.0 self._send_local_queue_message(lqcpub, topic_next, key, output["value"]) else: # check if 'next' is exit topic and modify output["topicNext"] accordingly isExitTopic = False if next == self._wf_exit: isExitTopic = True if self._metadata["__execution_id"] != key: key = self._metadata["__execution_id"] dlc = self.get_backup_data_layer_client() # store the workflow's final result dlc.put("result_" + key, output["value"]) #self._logger.debug("[__mfn_backup] [exitresult] [%s] %s", "result_" + key, output["value"]) # _XXX_: this is not handled properly by the frontend # this was an async execution # just send an empty message to the frontend to signal end of execution #if "__async_execution" in self._metadata and self._metadata["__async_execution"]: # output["value"] = "" if isExitTopic and timestamp_map is not None: timestamp_map['t_pub_exittopic'] = time.time() * 1000.0 timestamp_map['exitsize'] = len(output["value"]) self._send_local_queue_message(lqcpub, topic_next, key, output["value"]) return (next_function_execution_id, output) def _store_trigger_backups(self, dlc, input_backup_map, current_function_instance_id, store_next_backup_list=False): # keep track of the execution instances with their updated keys # i.e., keys that contains the output instance ids # use this set to describe the execution details if self._execution_info_map_name is not None: # dump the backups into the data layer for input_backup_key in input_backup_map: dlc.putMapEntry(self._execution_info_map_name, input_backup_key, input_backup_map[input_backup_key]) # if there is any new next, store them # if a next was generated by sending a message immediately, # this next will have been appended to our list in memory # and the backup will be overwritten # if one or more nexts were generated when publishing # at the end of execution, they will have been appended to our list # in memory and we will store the backup once for the entire list if store_next_backup_list: dlc.putMapEntry(self._execution_info_map_name, "next_" + current_function_instance_id, json.dumps(self._next_backup_list)) def _send_message_to_recovery_manager(self, key, message_type, topic, func_exec_id, has_error, error_type, lqcpub): return message_rec = {} message_rec["messageType"] = message_type message_rec["currentTopic"] = topic message_rec["currentFunctionExecutionId"] = func_exec_id message_rec["hasError"] = has_error message_rec["errorType"] = error_type output = {} output["topicNext"] = self._recovery_manager_topic output["value"] = json.dumps(message_rec) outputstr = json.dumps(output) # message via global publisher to pub manager's queue for backups self._send_local_queue_message(lqcpub, self._pub_topic_global, key, outputstr) # need to store backups of inputs and send message to recovery manager def send_to_function_now(self, key, trigger, lqcpub=None, dlc=None): trigger["value"] = self.encode_output(trigger["value"]) # get a local queue client if lqcpub is None: lqcpub = self._get_local_queue_client() current_function_instance_id = self._metadata[ "__function_execution_id"] + "_" + self._function_topic # if next_function_execution_id and output are None only if: # 1) message was sent to a running function (i.e., session function update message) # 2) message was a privileged message any_next = False next_function_execution_id, output = self._publish_output( key, trigger, lqcpub) if self._should_checkpoint: input_backup_map = {} starting_next = {} if dlc is None: dlc = self.get_backup_data_layer_client() if next_function_execution_id is not None and output is not None: # here, output MUST contain "topicNext" and "value"; otherwise, # we wouldn't have been able to publish it in publish_output() # use the updated topicNext for globally published messages starting_next[next_function_execution_id] = output["topicNext"] next_function_instance_id = next_function_execution_id + "_" + output[ "topicNext"] input_backup_map["input_" + next_function_instance_id] = output["value"] self._next_backup_list.append(next_function_instance_id) any_next = True self._store_trigger_backups(dlc, input_backup_map, current_function_instance_id, store_next_backup_list=any_next) for next_func_exec_id in starting_next: next_func_topic = starting_next[next_func_exec_id] self._send_message_to_recovery_manager(key, "start", next_func_topic, next_func_exec_id, False, "", lqcpub) self._send_message_to_recovery_manager( key, "running", self._function_topic, self._metadata["__function_execution_id"], False, "", lqcpub) # utilize the workflow to publish directly to the next function's topic # publish directly to the next function's topic, accumulate backups # publish backups at the end with a 'fin' flag, which also indicates that all have been published # also, handle global queue events def publish_output_direct(self, key, value_output, has_error, error_type, timestamp_map): timestamp_map["t_pub_start"] = timestamp_map[ "t_start_pub"] = time.time() * 1000.0 # if we already have a local queue client (because of immediately sent messages) and backup data layer client, # re-use them # if not, then the call to get them will initialize them lqcpub = self._get_local_queue_client() # _XXX_: 'function instance id' is uniquely identified via: # 1) (workflow) execution id (i.e., uuid set by frontend) # 2) output instance id (depends on the number of 'next' using the same function) # 3) function topic # 1) and 2) => '__function_execution_id' in metadata; # set by the previous function (or frontend if we're the first function) in the metadata current_function_instance_id = self._metadata[ "__function_execution_id"] + "_" + self._function_topic if has_error: timestamp_map["t_start_dlcbackup"] = time.time() * 1000.0 dlc = self.get_backup_data_layer_client() # set data layer flag to stop further execution of function instances # that may have been triggered concurrently via a new message dlc.put("workflow_execution_stop_" + key, "1") # dump the result into the data layer result = {} result["has_error"] = has_error result["error_type"] = error_type encoded_result = self.encode_output(result) encapsulated_result = self.encapsulate_output( encoded_result, self._metadata) #dlc.put("result_" + current_function_instance_id, encapsulated_result) dlc.putMapEntry(self._execution_info_map_name, "result_" + current_function_instance_id, encapsulated_result) # publish a message to the 'exit' topic trigger = {} trigger["next"] = self._wf_exit trigger["value"] = encoded_result # don't need next_function_execution_id, because we'll stop execution anyway # similarly, we don't need to do any backups next_function_execution_id, output = self._publish_output( key, trigger, lqcpub, timestamp_map) # store the workflow's final result # which has been encapsulated dlc.put("result_" + key, output["value"]) timestamp_map["hasError"] = True else: # dump the result into the data layer timestamp_map["t_start_encapsulate"] = time.time() * 1000.0 encapsulated_value_output = self.encapsulate_output( value_output, self._metadata) if self._should_checkpoint: timestamp_map["t_start_dlcbackup"] = time.time() * 1000.0 dlc = self.get_backup_data_layer_client() #dlc.put("result_" + current_function_instance_id, encapsulated_value_output) timestamp_map["t_start_resultmap"] = time.time() * 1000.0 dlc.putMapEntry(self._execution_info_map_name, "result_" + current_function_instance_id, encapsulated_value_output) #self._logger.debug("[__mfn_backup] [%s] [%s] %s", self._execution_info_map_name, "result_" + current_function_instance_id, encapsulated_value_output) timestamp_map["t_start_storeoutput"] = time.time() * 1000.0 # store self._sapi.transient_output into the data layer self._store_output_data() # get the combined (next, value) tuple list for the output # use here the original output: # we'll update the metadata separately for each trigger and encapsulate the output with it timestamp_map["t_start_generatenextlist"] = time.time() * 1000.0 converted_function_output = self._convert_function_output_static_workflow( value_output) choice_next_list = self._state_utils.getChoiceResults(value_output) converted_function_output = converted_function_output + self._dynamic_workflow + choice_next_list check_error_flag = True continue_publish_flag = True # if we are sending the result ONLY to the workflow exit, then there is no point in checking the error flag if len( converted_function_output ) == 1 and converted_function_output[0]["next"] == self._wf_exit: check_error_flag = False if check_error_flag: timestamp_map["t_start_dlcbackup_err"] = time.time() * 1000.0 dlc = self.get_backup_data_layer_client() # check the workflow stop flag # if some other function execution had an error and we had been # simultaneously triggered, we can finish but don't need to publish # to the next function in the workflow, so we can stop execution of the workflow timestamp_map["t_start_dlcbackup_err_flag"] = time.time( ) * 1000.0 workflow_exec_stop = dlc.get("workflow_execution_stop_" + key, locality=0) if workflow_exec_stop is not None and workflow_exec_stop != "": self._logger.info( "Not continuing because workflow execution has been stopped... %s", key) continue_publish_flag = False # if we didn't have to check the error, or we checked it, but there was not one, then continue publishing the output # to the next functions # if we checked the error and there was one, then don't publish to the next functions if continue_publish_flag: # converted_function_output can only contain next values from static (_wf_next) and dynamic next (_wf_pot_next) # static next values would have been already defined and checked before deploying workflow # dynamic next values are checked when creating the trigger in MicroFunctionsAPI.add_workflow_next() # so there is no need for another check if self._should_checkpoint: # we are going to accummulate any input backups in this map input_backup_map = {} # we are going to accummulate any new starting functions in this map starting_next = {} timestamp_map["t_start_pubnextlist"] = time.time() * 1000.0 any_next = False # parse the converted_function_output to determine the next and publish directly for function_output in converted_function_output: next_function_execution_id, output = self._publish_output( key, function_output, lqcpub, timestamp_map) if self._should_checkpoint: if next_function_execution_id is not None and output is not None: # here, output MUST contain "topicNext" and "value"; otherwise, # we wouldn't have been able to publish it in publish_output() # use the updated topicNext for globally published messages starting_next[next_function_execution_id] = output[ "topicNext"] next_function_instance_id = next_function_execution_id + "_" + output[ "topicNext"] input_backup_map[ "input_" + next_function_instance_id] = output["value"] self._next_backup_list.append( next_function_instance_id) any_next = True if self._should_checkpoint: timestamp_map["t_start_backtrigger"] = time.time() * 1000.0 # backups for next of successfully completed function execution instances self._store_trigger_backups( dlc, input_backup_map, current_function_instance_id, store_next_backup_list=any_next) for next_func_exec_id in starting_next: next_func_topic = starting_next[next_func_exec_id] self._send_message_to_recovery_manager( key, "start", next_func_topic, next_func_exec_id, False, "", lqcpub) if self._should_checkpoint: # regardless whether this function execution had an error or not, we are finished and need to let the recovery manager know self._send_message_to_recovery_manager( key, "finish", self._function_topic, self._metadata["__function_execution_id"], has_error, error_type, lqcpub) # log the timestamps timestamp_map["t_pub_end"] = timestamp_map[ "t_end_pub"] = timestamp_map["t_end_fork"] = time.time() * 1000.0 timestamp_map["function_instance_id"] = current_function_instance_id timestamp_map_str = json.dumps(timestamp_map) self._logger.info("[__mfn_progress] %s %s", timestamp_map["function_instance_id"], timestamp_map_str) size = 0 if 'exitsize' in timestamp_map and 't_pub_exittopic' in timestamp_map: size = timestamp_map['exitsize'] self._logger.info( "[__mfn_tracing] [ExecutionId] [%s] [Size] [%s] [TimestampMap] [%s] [%s]", key, str(size), timestamp_map_str, timestamp_map["function_instance_id"]) # also put them to the data layer # (can skip, but need to update "getExecutionDescription.py" in ManagementService) #dlc.put("timestamps_" + current_function_instance_id, json.dumps(timestamp_map)) # shut down the local queue client self._shutdown_local_queue_client() self.shutdown_backup_data_layer_client()
class SessionHelperThread(threading.Thread): def __init__(self, helper_params, logger, pubutils, sessutils, queueservice, datalayer): self._logger = logger #self._logger.debug("[SessionHelperThread] " + str(helper_params)) self._publication_utils = pubutils self._session_utils = sessutils self._queue_service = queueservice self._datalayer = datalayer self._sandboxid = helper_params["sandboxid"] self._workflowid = helper_params["workflowid"] self._session_function_id = helper_params["session_function_id"] self._session_id = helper_params["session_id"] # initialize only needed # need a separate backup data layer client from the publication utils; otherwise, we run into concurrent modification # problems from Thrift # locality = -1 means that the writes happen to the local data layer first and then asynchronously to the global data layer # will only initialize if heartbeats are enabled self._backup_data_layer_client = None # set up heartbeat parameters self._heartbeat_enabled = False self._heartbeat_method = None # our own local queue client to be used when sending a heartbeat # TODO: double check if we can just reuse the one we're polling # probably yes self._local_queue_client_heartbeat = None self._heartbeat_function = None self._heartbeat_data_layer_key = None self._data_layer_client_heartbeat = None self._init_heartbeat_parameters(helper_params["heartbeat_parameters"]) # set up communication parameters self._communication_params = helper_params["communication_parameters"] # similar to the data layer rendezvous point for message delivery, we listen to a local topic # allowing us to queue messages and deliver multiple messages to the session function if desired self._local_topic_communication = self._communication_params[ "local_topic_communication"] # by default, assign a simple poll timeout # if the heartbeat is specified, it will be updated to the heartbeat to ensure # we can send regular heartbeats self._local_poll_timeout = py3utils.ensure_long(10000) # use a queue to keep the incoming update messages for blocking and/or blocking get_update_messages() requests self._message_queue = queue.Queue() self._local_queue_client = LocalQueueClient( connect=self._queue_service) self._special_messages = {} self._special_messages["--stop"] = True self._special_messages["--update-heartbeat"] = True self._is_running = False #self._logger.debug("[SessionHelperThread] init done.") threading.Thread.__init__(self) def _init_heartbeat_parameters(self, heartbeat_params): if "heartbeat_method" not in heartbeat_params: self._logger.debug( "No heartbeat method is specified; disabling heartbeat.") return else: self._heartbeat_enabled = True self._heartbeat_method = heartbeat_params["heartbeat_method"] #self._logger.debug("[SessionHelperThread] New heartbeat method: " + str(self._heartbeat_method)) if self._heartbeat_method == "function": if "heartbeat_function" in heartbeat_params: # enable function related heartbeat self._heartbeat_function = heartbeat_params[ "heartbeat_function"] #self._logger.debug("[SessionHelperThread] New heartbeat function: " + str(self._heartbeat_function)) if self._backup_data_layer_client is None: self._backup_data_layer_client = DataLayerClient( locality=-1, for_mfn=True, sid=self._sandboxid, connect=self._datalayer) if self._local_queue_client_heartbeat is None: self._local_queue_client_heartbeat = LocalQueueClient( connect=self._queue_service) # disable data layer related heartbeat if self._data_layer_client_heartbeat is not None: self._data_layer_client_heartbeat.delete( self._heartbeat_data_layer_key) self._heartbeat_data_layer_key = None self._data_layer_client_heartbeat.shutdown() self._data_layer_client_heartbeat = None elif self._heartbeat_method == "data_layer": # needs to be unique among session functions, so use session id + session function id # TODO: how do you check the heartbeat in the data layer? # checker service or user function needs to know the key # OR keep a new map for heartbeats of the session functions # so that the checker can retrieve the keys and their values (e.g., timestamps) # if a session function misses a heartbeat, the checker function reports to policy handler # enable data layer related heartbeat self._heartbeat_data_layer_key = "heartbeat_" + self._session_id + "_" + self._session_function_id if self._data_layer_client_heartbeat is None: self._data_layer_client_heartbeat = DataLayerClient( locality=1, for_mfn=True, sid=self._sandboxid, connect=self._datalayer) # disable function related heartbeat if self._local_queue_client_heartbeat is not None: self._local_queue_client_heartbeat.shutdown() self._local_queue_client_heartbeat = None self._heartbeat_function = None if self._backup_data_layer_client is not None: self._backup_data_layer_client.shutdown() self._backup_data_layer_client = None else: raise MicroFunctionsSessionAPIException( "Unsupported heartbeat method for session function.") # must be in milliseconds if "heartbeat_interval_ms" in heartbeat_params: self._heartbeat_interval = heartbeat_params[ "heartbeat_interval_ms"] self._local_poll_timeout = self._heartbeat_interval / 2.0 #self._logger.debug("[SessionHelperThread] New heartbeat interval: " + str(self._heartbeat_interval)) def run(self): self._is_running = True # initially, it is the heartbeat_interval / 2 poll_timeout = self._local_poll_timeout if self._heartbeat_enabled: t_cur = time.time() * 1000.0 self._send_heartbeat() last_heartbeat_time = t_cur # _XXX_: our location is stored as part of our metadata # so that the remote functions can # look it up and send their message via that that location # first, create local topic self._local_queue_client.addTopic(self._local_topic_communication) while self._is_running: #self._logger.debug("[SessionHelperThread] polling new session update messages...") # wait until the polling interval finishes # the polling interval depends on the heartbeat interval and when we actually receive a message # if we get a message before, then update the polling interval as (heartbeat_interval - passed_time) lqm = self._local_queue_client.getMessage( self._local_topic_communication, poll_timeout) # double check we are still running # if the long-running function finished while we were polling, no need to send another heartbeat if not self._is_running: break if lqm is not None: self._process_message(lqm) if self._heartbeat_enabled: # send heartbeat # this is part of the message loop, such that we can have a more precise heartbeat # if it was only after the message loop, then there is a corner case, where the # processing of the messages would take more than the heartbeat interval, # meaning we would miss our deadline t_cur = time.time() * 1000.0 if (t_cur - last_heartbeat_time) >= self._heartbeat_interval: self._send_heartbeat() last_heartbeat_time = t_cur if self._heartbeat_enabled: # send heartbeat # even if there are no messages, we might need to send a heartbeat t_cur = time.time() * 1000.0 if (t_cur - last_heartbeat_time) >= self._heartbeat_interval: self._send_heartbeat() last_heartbeat_time = t_cur # update the poll time # if we sent a heartbeat recently, last_heartbeat and t_cur will cancel each other out poll_timeout = py3utils.ensure_long(last_heartbeat_time + self._local_poll_timeout - t_cur) #self._logger.debug("updated poll timeout: " + str(poll_timeout)) self._cleanup() def _process_message(self, lqm): try: lqcm = LocalQueueClientMessage(lqm=lqm) value = lqcm.get_value() #key = lqcm.get_key() #self._logger.debug("[SessionHelperThread] new message: " + key + " " + value) except Exception as exc: self._logger.exception( "Exception in handling message to running function: " + str(self._session_function_id) + " " + str(exc)) # we need to decapsulate and decode this message, # because it has been delivered # to us without going through the function worker value, metadata = self._publication_utils.decapsulate_input(value) #self._logger.debug("metadata for session function message: " + str(metadata)) # need to handle the special messages here # check if the message is in json is_json = True try: msg = json.loads(value) #self._logger.debug("[SessionHelperThread] JSON value: " + str(msg)) except Exception as exc: is_json = False msg = value self._logger.debug("[SessionHelperThread] non-JSON value: " + str(msg)) # cannot be a special message; queue whatever it is # _XXX_: we are encoding/decoding the delivered message; should not actually execute this code # it is here for not envisioned corner case (i.e., let the user code deal with it) if not is_json: self._store_message(msg) self._publication_utils.set_metadata(metadata) else: # the message is json encoded, but it doesn't guarantee that it is a special message if "action" in msg and msg["action"] in self._special_messages: self._handle_special_message(msg) else: self._store_message(msg) self._publication_utils.set_metadata(metadata) def _store_message(self, msg): self._message_queue.put(msg) def _handle_special_message(self, msg): action = msg["action"] if action == "--stop": self._session_utils.set_session_function_running(False) self.shutdown() elif action == "--update-heartbeat": self._init_heartbeat_parameters(msg["heartbeat_parameters"]) def get_messages(self, count=1, block=False): messages = [] for i in range(count): try: msg = self._message_queue.get(block=block) messages.append(msg) self._message_queue.task_done() except Exception as exc: pass #self._logger.debug("returning messages: " + str(messages)) return messages def _send_heartbeat(self): # check if heartbeat is enabled. if not, just return # if heartbeat is enabled, then double check we are still running # if the long-running function finished while we were processing messages, no need to send another heartbeat if not self._heartbeat_enabled or not self._is_running: return #self._logger.debug("[SessionHelperThread] sending heartbeat to function: " + self._heartbeat_function) hb_message = self._get_heartbeat_message() # either to another function via a local queue client or to data layer or another method if self._heartbeat_method == "function": self._send_heartbeat_to_function(hb_message) elif self._heartbeat_method == "data_layer": self._send_heartbeat_to_data_layer(hb_message) def _get_heartbeat_message(self): hb_message = {} hb_message["session_id"] = self._session_id hb_message["session_function_id"] = self._session_function_id hb_message["timestamp"] = time.time() * 1000.0 hb_message["action"] = "--heartbeat" #self._logger.debug("heartbeat msg: "+ json.dumps(hb_message)) return hb_message def _send_heartbeat_to_function(self, hb_message): # TODO: what if the heartbeat function is a session function as well? # either running and/or not started yet, but will continue running after the first message # pass our own local queue client, so that there won't be any concurrent access # to publication utils' local queue client trigger_hb = {} trigger_hb["next"] = self._heartbeat_function trigger_hb["value"] = hb_message self._publication_utils.send_to_function_now( "-1l", trigger_hb, self._local_queue_client_heartbeat, self._backup_data_layer_client) def _send_heartbeat_to_data_layer(self, hb_message): self._data_layer_client_heartbeat.put(self._heartbeat_data_layer_key, json.dumps(hb_message)) def _cleanup(self): #self._logger.debug("[SessionHelperThread] cleaning up...") # clean up connections if self._data_layer_client_heartbeat is not None: self._data_layer_client_heartbeat.delete( self._heartbeat_data_layer_key) self._heartbeat_data_layer_key = None self._data_layer_client_heartbeat.shutdown() self._data_layer_client_heartbeat = None if self._local_queue_client_heartbeat is not None: self._local_queue_client_heartbeat.shutdown() self._local_queue_client_heartbeat = None if self._backup_data_layer_client is not None: self._backup_data_layer_client.shutdown() self._backup_data_layer_client = None # remove/unregister the topic self._local_queue_client.removeTopic(self._local_topic_communication) self._local_queue_client.shutdown() self._local_queue_client = None def shutdown(self): self._is_running = False
class SandboxAgent: def __init__(self, hostname, queue, datalayer, sandboxid, userid, workflowid, elasticsearch, workflowname, endpoint_key): self._start = time.time() self._python_version = sys.version_info self._hostname = hostname self._queue = queue self._datalayer = datalayer self._elasticsearch = elasticsearch self._userid = userid self._sandboxid = sandboxid self._workflowid = workflowid self._workflowname = workflowname # _XXX_: we'll use the endpoint_key to look up our endpoint self._endpoint_key = endpoint_key self._deployment_info_key = "deployment_info_workflow_" + self._workflowid self._logger = logging_helpers.setup_logger(self._sandboxid, LOG_FILENAME) self._fluentbit_process, self._command_args_map_fluentbit = logging_helpers.setup_fluentbit_and_elasticsearch_index( self._logger, FLUENTBIT_FOLDER, self._elasticsearch, ELASTICSEARCH_INDEX_WF, ELASTICSEARCH_INDEX_FE) self._logger.info("hostname (and container name): %s", self._hostname) self._logger.info("elasticsearch nodes: %s", self._elasticsearch) self._logger.info("queueservice: %s", self._queue) self._logger.info("datalayer: %s", self._datalayer) self._logger.info("user id: %s", self._userid) self._logger.info("sandbox id: %s", self._sandboxid) self._logger.info("workflow id: %s", self._workflowid) self._logger.info("workflow name: %s", self._workflowname) self._logger.info("endpoint_key: %s", self._endpoint_key) self._instructions_topic = "instructions_" + self._sandboxid self._management_data_layer_client = DataLayerClient( locality=1, sid="Management", wid="Management", is_wf_private=True, connect=self._datalayer) self._logger.info("Management data layer client connected after %s s", str(time.time() - self._start)) # to be declared later self._local_queue_client = None self._deployment = None self._queue_service_process = None self._frontend_process = None # visible to the outside world: either kubernetes assigned URL or bare-metal host address + exposed port self._external_endpoint = None # visible internally: kubernetes node address or same as bare-metal external endpoint self._internal_endpoint = None self._is_running = False self._shutting_down = False def _handle_instruction(self, instruction): error = None action = instruction["action"] if "parameters" in instruction: parameters = instruction["parameters"] if action == "stop-function-worker": self._deployment.stop_function_worker(parameters["functionTopic"]) elif action == "shutdown": self.shutdown() else: error = "Unsupported 'action' in instruction: " + action return error def _get_and_handle_message(self): error = None lqm = self._local_queue_client.getMessage(self._instructions_topic, POLL_TIMEOUT) if lqm is not None: lqcm = LocalQueueClientMessage(lqm) key = lqcm.get_key() value = lqcm.get_value() self._logger.info(key + " " + value) try: instruction = json.loads(value) error = self._handle_instruction(instruction) except Exception as exc: error = "Couldn't decode instruction: " + str(exc) self._logger.error(error) if error is None: self._logger.info( "Handled instruction successfully at t+ %s s", str(time.time() - self._start)) def _process_deployment_info(self): has_error = False errmsg = "" deployment_info = self._management_data_layer_client.get( self._deployment_info_key) num_trials = 0 sleep_time = 1.0 while num_trials < 5 and (deployment_info is None or deployment_info == ""): time.sleep(sleep_time) deployment_info = self._management_data_layer_client.get( self._deployment_info_key) num_trials = num_trials + 1 sleep_time = sleep_time * 2 if num_trials == 5: has_error = True errmsg = "Could not retrieve deployment info: " + self._deployment_info_key if not has_error: # if we're running on kubernetes, the endpoint will correspond to the assigned url # if we're running on bare-metal, the endpoint will correspond to the hostip + docker-mapped port self._external_endpoint = self._management_data_layer_client.getMapEntry( self._workflowid + "_workflow_endpoint_map", endpoint_key) num_trials = 0 sleep_time = 1.0 while num_trials < 5 and (self._external_endpoint is None or self._external_endpoint == ""): time.sleep(sleep_time) self._external_endpoint = self._management_data_layer_client.getMapEntry( self._workflowid + "_workflow_endpoint_map", endpoint_key) num_trials = num_trials + 1 sleep_time = sleep_time * 2 if num_trials == 5: has_error = True errmsg = "Could not retrieve endpoint: " + self._endpoint_key # in Kubernetes, endpoint is the externally visible URL # in bare-metal, endpoint is the current host's address # for session support, in FunctionWorker, we need current host address (bare-metal) # or current node address (kubernetes) # for parallel state support, in FunctionWorker, either would be fine # As such, let the FunctionWorker know both and let it decide what to do if 'KUBERNETES_SERVICE_HOST' in os.environ: # get current node's internal address self._internal_endpoint = "http://" + socket.gethostbyname( socket.gethostname()) + ":" + str(os.getenv("PORT", "8080")) else: # bare-metal mode: the current host's address and external address are the same self._internal_endpoint = self._external_endpoint if not has_error: self._logger.info("External endpoint: %s", self._external_endpoint) self._logger.info("Internal endpoint: %s", self._internal_endpoint) self._deployment = Deployment(deployment_info,\ self._hostname, self._userid, self._sandboxid, self._workflowid,\ self._workflowname, self._queue, self._datalayer, \ self._logger, self._external_endpoint, self._internal_endpoint) self._deployment.set_child_process( "fb", self._fluentbit_process, self._command_args_map_fluentbit) has_error, errmsg = self._deployment.process_deployment_info() return has_error, errmsg # SIGTERM kills Thrift before we can handle stuff def sigterm(self, signum, frame): self.shutdown() # raise interrupt to kill main sequence when shutdown was not received through the queue raise InterruptedError def sigchld(self, signum, _): if not self._shutting_down: should_shutdown, pid = self._deployment.check_child_process() if should_shutdown: self._update_deployment_status( True, "A sandbox process stopped unexpectedly.") self.shutdown(reason="Process with pid: " + str(pid) + " stopped unexpectedly.") def shutdown(self, reason=None): self._shutting_down = True if reason is not None: self._logger.error("Shutting down sandboxagent due to reason: " + reason) else: self._logger.info("Gracefully shutting down sandboxagent") self._logger.info("Shutting down the frontend...") if self._frontend_process is not None: self._frontend_process.terminate() self._logger.info("Shutting down the function worker(s)...") self._deployment.shutdown() # shut down the local queue client, so that we can also shut down the queue service self._local_queue_client.removeTopic(self._instructions_topic) self._local_queue_client.shutdown() self._logger.info("Shutting down the queue service...") if self._queue_service_process is not None: process_utils.terminate_and_wait_child(self._queue_service_process, "queue service", 5, self._logger) # we can't do this here, because there may be other sandboxes running the same workflow #self._management_data_layer_client.put("workflow_status_" + self._workflowid, "undeployed") self._management_data_layer_client.shutdown() self._logger.info("Shutting down fluent-bit...") time.sleep(2) # flush interval of fluent-bit process_utils.terminate_and_wait_child(self._fluentbit_process, "fluent-bit", 5, self._logger) self._is_running = False try: self._frontend_process.wait(30) except subprocess.TimeoutExpired as exc: self._frontend_process.kill() _, _ = self._frontend_process.communicate() self._logger.info("Shutdown complete") def _stop_deployment(self, reason, errmsg): self._logger.error( "Stopping deployment due to error in launching %s...", reason) self._logger.error(errmsg) self._update_deployment_status(True, errmsg) self._management_data_layer_client.shutdown() os._exit(1) def _update_deployment_status(self, has_error, errmsg): sbstatus = {} sbstatus["errmsg"] = errmsg if has_error: sbstatus["status"] = "failed" else: sbstatus["status"] = "deployed" # set our own status in the map self._management_data_layer_client.putMapEntry( self._workflowid + "_sandbox_status_map", self._endpoint_key, json.dumps(sbstatus)) def run(self): has_error = False errmsg = "" ts_qs_launch = time.time() # 1. launch the QueueService here self._logger.info("Launching QueueService...") cmdqs = "java -jar /opt/mfn/queueservice.jar" command_args_map_qs = {} command_args_map_qs["command"] = cmdqs command_args_map_qs["wait_until"] = "Starting local queue..." error, self._queue_service_process = process_utils.run_command( cmdqs, self._logger, wait_until="Starting local queue...") if error is not None: has_error = True errmsg = "Could not start the sandbox queue service: " + str(error) if has_error: self._stop_deployment("queue service", errmsg) ts_fw_launch = time.time() # 2. process the deployment info and start function workers self._logger.info( "Going to parse the deployment info and get the endpoint...") has_error, errmsg = self._process_deployment_info() if has_error: self._stop_deployment("workflow", errmsg) ts_fe_launch = time.time() # 3. launch the frontend self._logger.info("Launching frontend...") cmdweb = "/opt/mfn/frontend" fenv = dict(os.environ) workflow = self._deployment.get_workflow() fenv["MFN_ENTRYTOPIC"] = workflow.getWorkflowEntryTopic() fenv["MFN_RESULTTOPIC"] = workflow.getWorkflowExitTopic() fenv["MFN_QUEUE"] = self._queue # MFN_DATALAYER already set command_args_map_fe = {} command_args_map_fe["command"] = cmdweb command_args_map_fe["custom_env"] = fenv command_args_map_fe[ "wait_until"] = "Frontend is ready to handle requests" error, self._frontend_process = process_utils.run_command( cmdweb, self._logger, custom_env=fenv, wait_until="Frontend is ready to handle requests") if error is not None: has_error = True errmsg = "Could not start the frontend: " + str(error) if has_error: self._stop_deployment("frontend", errmsg) self._logger.info("frontend started") t_fe = (time.time() - ts_fe_launch) * 1000.0 t_fw = (ts_fe_launch - ts_fw_launch) * 1000.0 t_qs = (ts_fw_launch - ts_qs_launch) * 1000.0 self._logger.info( "QS launch time: %s (ms), FWs download + launch time: %s (ms), FE launch time: %s (ms)", str(t_qs), str(t_fw), str(t_fe)) self._deployment.set_child_process("qs", self._queue_service_process, command_args_map_qs) self._deployment.set_child_process("fe", self._frontend_process, command_args_map_fe) # 4. start listening for additional instructions if any self._local_queue_client = LocalQueueClient(connect=self._queue) self._local_queue_client.addTopic(self._instructions_topic) self._is_running = True signal.signal(signal.SIGTERM, self.sigterm) children_pids = self._deployment.get_all_children_pids() children_pids.sort() self._logger.info("All children pids: " + str(children_pids)) signal.signal(signal.SIGCHLD, self.sigchld) # update our own sandbox status self._update_deployment_status(False, errmsg) #self._management_data_layer_client.put("workflow_status_" + self._workflowid, "deployed") #self._management_data_layer_client.delete("workflow_status_error_" + self._workflowid) self._logger.info("Successfully deployed.") while self._is_running: try: self._get_and_handle_message() except Exception as exc: self._logger.error("%s", str(exc)) time.sleep(2)
class DataLayerOperator: def __init__(self, suid, sid, wid, datalayer): self._storage_userid = suid self._sandboxid = sid self._workflowid = wid self._datalayer = datalayer # global data layer clients for either workflow-private data or user storage self._data_layer_client = None self._data_layer_client_private = None # TODO (?): use the local data layer for operations regarding KV, maps, sets and counters instead of in-memory data structures (e.g., transient_data_output) # and store the operations/data for is_queued = True operations, # so that we can synchronize it with the global data layer # (key, value) store self.transient_data_output = {} self.transient_data_output_private = {} self.data_to_be_deleted = {} self.data_to_be_deleted_private = {} self.map_output = {} self.set_output = {} self.counter_output = {} self.map_output_delete = {} self.set_output_delete = {} self.counter_output_delete = {} # TODO: update to use local data layer for (key, value) operations def put(self, key, value, is_private=False, is_queued=False, table=None): if is_queued: if is_private: self.transient_data_output_private[key] = value if key in self.data_to_be_deleted_private: self.data_to_be_deleted_private.pop(key, None) else: self.transient_data_output[key] = value if key in self.data_to_be_deleted: self.data_to_be_deleted.pop(key, None) else: data_layer_client = self._get_data_layer_client(is_private) data_layer_client.put(key, value, tableName=table) def get(self, key, is_private=False, table=None): # check first transient_output # if not there, return the actual (global) data layer data item # if not there either, return empty string (as defined in the DataLayerClient) value = None # if the put() or delete() were called with is_queued=False (default), # then the below checks will still result in 'value is None' # if not, then value will be obtained from the transient output if is_private: if key in self.data_to_be_deleted_private: return "" value = self.transient_data_output_private.get(key) else: if key in self.data_to_be_deleted: return "" value = self.transient_data_output.get(key) if value is None: data_layer_client = self._get_data_layer_client(is_private) value = data_layer_client.get(key, tableName=table) return value def delete(self, key, is_private=False, is_queued=False, table=None): if is_queued: if is_private: self.transient_data_output_private.pop(key, None) self.data_to_be_deleted_private[key] = True else: self.transient_data_output.pop(key, None) self.data_to_be_deleted[key] = True else: data_layer_client = self._get_data_layer_client(is_private) data_layer_client.delete(key, tableName=table) # map operations def createMap(self, mapname, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.createMap(mapname) def putMapEntry(self, mapname, key, value, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.putMapEntry(mapname, key, value) def getMapEntry(self, mapname, key, is_private=False): value = None # TODO: check transient data structure first if value is None: dlc = self._get_data_layer_client(is_private) value = dlc.getMapEntry(mapname, key) return value def deleteMapEntry(self, mapname, key, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.deleteMapEntry(mapname, key) def containsMapKey(self, mapname, key, is_private=False): ret = False # TODO: check transient data structure first if not ret: dlc = self._get_data_layer_client(is_private) ret = dlc.containsMapKey(mapname, key) return ret def retrieveMap(self, mapname, is_private=False): retmap = {} # XXX: should follow "read your writes" # the final result should include: # 1. all created locally # 2. all existing globally minus the ones deleted locally # TODO: 1. check local data layer first: get locally created and deleted # 2. retrieve all existing globally dlc = self._get_data_layer_client(is_private) retmap2 = dlc.retrieveMap(mapname) if retmap2 is not None: for k in retmap2: retmap[k] = retmap2[k] # TODO: 3. remove the ones deleted locally return retmap def getMapKeys(self, mapname, is_private=False): keys = set() # XXX: should follow "read your writes" # the final result should include: # 1. all created locally # 2. all existing globally minus the ones deleted locally # TODO: 1. check local data layer first: get locally created and deleted # 2. retrieve all existing globally dlc = self._get_data_layer_client(is_private) k2 = dlc.getMapKeys(mapname) if k2 is not None: # TODO: 3. remove the ones deleted locally keys = keys.union(k2) return keys def clearMap(self, mapname, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.clearMap(mapname) def deleteMap(self, mapname, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.deleteMap(mapname) def getMapNames(self, start_index=0, end_index=2147483647, is_private=False): maps = set() # XXX: should follow "read your writes" # the final result should include: # 1. all created locally # 2. all existing globally minus the ones deleted locally # TODO: 1. check local data layer first: get locally created and deleted # 2. retrieve all existing globally dlc = self._get_data_layer_client(is_private) m2 = dlc.getMapNames(start_index, end_index) if m2 is not None: # TODO: 3. remove the ones deleted locally maps = maps.union(m2) return list(maps) # set operations def createSet(self, setname, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.createSet(setname) def addSetEntry(self, setname, item, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.addSetEntry(setname, item) def removeSetEntry(self, setname, item, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.removeSetEntry(setname, item) def containsSetItem(self, setname, item, is_private=False): ret = False # TODO: check transient data structure first if not ret: dlc = self._get_data_layer_client(is_private) ret = dlc.containsSetItem(setname, item) return ret def retrieveSet(self, setname, is_private=False): items = set() # XXX: should follow "read your writes" # the final result should include: # 1. all created locally # 2. all existing globally minus the ones deleted locally # TODO: 1. check local data layer first: get locally created and deleted # 2. retrieve all existing globally dlc = self._get_data_layer_client(is_private) i2 = dlc.retrieveSet(setname) if i2 is not None: # TODO: 3. remove the ones deleted locally items = items.union(i2) return items def clearSet(self, setname, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.clearSet(setname) def deleteSet(self, setname, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.deleteSet(setname) def getSetNames(self, start_index=0, end_index=2147483647, is_private=False): sets = set() # XXX: should follow "read your writes" # the final result should include: # 1. all created locally # 2. all existing globally minus the ones deleted locally # TODO: 1. check local data layer first: get locally created and deleted # 2. retrieve all existing globally dlc = self._get_data_layer_client(is_private) s2 = dlc.getSetNames(start_index, end_index) if s2 is not None: # TODO: 3. remove the ones deleted locally sets = sets.union(s2) return list(sets) # counter operations def createCounter(self, countername, count, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.createCounter(countername, count) def getCounterValue(self, countername, is_private=False): value = 0 # TODO: check transient data structure first and apply any changes to the global value dlc = self._get_data_layer_client(is_private) value = dlc.getCounter(countername) return value def incrementCounter(self, countername, increment, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.incrementCounter(countername, increment) def decrementCounter(self, countername, decrement, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.decrementCounter(countername, decrement) def deleteCounter(self, countername, is_private=False, is_queued=False): if is_queued: # TODO: use transient data structure in memory when the operation is queued pass else: dlc = self._get_data_layer_client(is_private) dlc.deleteCounter(countername) def getCounterNames(self, start_index=0, end_index=2147483647, is_private=False): counters = set() # XXX: should follow "read your writes" # the final result should include: # 1. all created locally # 2. all existing globally minus the ones deleted locally # TODO: 1. check local data layer first: get locally created and deleted # 2. retrieve all existing globally dlc = self._get_data_layer_client(is_private) c2 = dlc.getCounterNames(start_index, end_index) if c2 is not None: # TODO: 3. remove the ones deleted locally counters = counters.union(c2) return list(counters) def get_transient_data_output(self, is_private=False): ''' Return the transient data, so that it can be committed to the data layer when the function instance finishes. ''' if is_private: return self.transient_data_output_private return self.transient_data_output def get_data_to_be_deleted(self, is_private=False): ''' Return the list of deleted data items, so that they can be committed to the data layer when the function instance finishes. ''' if is_private: return self.data_to_be_deleted_private return self.data_to_be_deleted def _get_data_layer_client(self, is_private=False): ''' Return the data layer client, so that it can be used to commit to the data layer when the function instance finishes. If it is not initialized yet, it will be initialized here. ''' # TODO: need also the locality information if is_private: if self._data_layer_client_private is None: self._data_layer_client_private = DataLayerClient( locality=1, sid=self._sandboxid, wid=self._workflowid, is_wf_private=True, connect=self._datalayer) return self._data_layer_client_private if self._data_layer_client is None: self._data_layer_client = DataLayerClient( locality=1, suid=self._storage_userid, is_wf_private=False, connect=self._datalayer) return self._data_layer_client def _shutdown_data_layer_client(self): ''' Shut down the data layer client if it has been initialized after the function instance finishes committing changes to the data layer. ''' if self._data_layer_client_private is not None: self._data_layer_client_private.shutdown() self._data_layer_client_private = None if self._data_layer_client is not None: self._data_layer_client.shutdown() self._data_layer_client = None
def _initialize_data_layer_storage(self): # each data layer client will automatically create the local keyspace and tables # upon instantiation # mfn internal tables local_dlc = DataLayerClient(locality=0, for_mfn=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True) local_dlc.shutdown() # user storage tables local_dlc = DataLayerClient(locality=0, suid=self._storage_userid, connect=self._datalayer, init_tables=True) local_dlc.shutdown() # workflow private tables local_dlc = DataLayerClient(locality=0, is_wf_private=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True) local_dlc.shutdown() # for global access, (re)create; it's okay because the operations are idempotent # user storage is created by management service # mfn internal tables global_dlc = DataLayerClient(locality=1, for_mfn=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True) global_dlc.shutdown() # workflow private tables global_dlc = DataLayerClient(locality=1, is_wf_private=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True) global_dlc.shutdown()
class Deployment: def __init__(self, deployment_info, hostname, userid, sandboxid, workflowid, workflowname, queue, datalayer, logger, external_endpoint, internal_endpoint, management_endpoints): self._logger = logger self._deployment_info = deployment_info self._hostname = hostname self._userid = userid self._sandboxid = sandboxid self._workflowid = workflowid self._workflowname = workflowname self._queue = queue self._datalayer = datalayer self._external_endpoint = external_endpoint self._internal_endpoint = internal_endpoint self._management_endpoints = management_endpoints self._python_version = sys.version_info self._storage_userid = self._userid.replace("@", "AT") self._storage_userid = self._storage_userid.replace("-", "_").replace(".", "_") self._process_id = os.getpid() self._functionworker_process_map = {} self._javarequesthandler_process_list = [] self._queue_service_process = None self._frontend_process = None self._fluentbit_process = None # it will be probably updated to be something else self._fluentbit_actual_pid = -1 self._child_process_command_args_map = {} # to be declared later when parsing the deployment info self._workflow = None self._global_data_layer_client = DataLayerClient(locality=1, suid=self._storage_userid, connect=self._datalayer) self._local_queue_client = None def get_workflow(self): return self._workflow def set_child_process(self, which, process, command_args_map): pid = process.pid if which == "qs": self._queue_service_process = process elif which == "fe": self._frontend_process = process elif which == "fb": self._fluentbit_process = process output, error = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger) fbpid = int(output.strip()) self._fluentbit_actual_pid = fbpid pid = fbpid # store command and args self._child_process_command_args_map[pid] = command_args_map def get_all_children_pids(self): children_pids = [] for state in self._functionworker_process_map: p = self._functionworker_process_map[state] children_pids.append(p.pid) for jrhp in self._javarequesthandler_process_list: children_pids.append(jrhp.pid) children_pids.append(self._queue_service_process.pid) children_pids.append(self._frontend_process.pid) # looks like this pid does not match the actual process; perhaps because it also spawns another process? #children_pids.append(self._fluentbit_process.pid) ## find actual fluentbit pid output, error = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger) fbpid = int(output.strip()) self._fluentbit_actual_pid = fbpid children_pids.append(fbpid) return children_pids def check_child_process(self): pid, status = os.waitpid(-1, os.WNOHANG|os.WUNTRACED|os.WCONTINUED) failed_process_name = "" if os.WIFCONTINUED(status) or os.WIFSTOPPED(status): return False, _ if os.WIFSIGNALED(status) or os.WIFEXITED(status): self._logger.error("Process with pid: " + str(pid) + " stopped.") if pid == self._fluentbit_actual_pid: failed_process_name = "Fluent-bit" elif pid == self._queue_service_process.pid: failed_process_name = "Queue service" elif pid == self._frontend_process.pid: failed_process_name = "Frontend" else: for jrhp in self._javarequesthandler_process_list: if pid == jrhp.pid: failed_process_name = "Java request handler" break for state_name in self._functionworker_process_map: process = self._functionworker_process_map[state_name] if pid == process.pid: failed_process_name = "Function worker (" + state_name + ")" del self._functionworker_process_map[state_name] break self._logger.error("Failed process name: " + failed_process_name) if os.path.exists('/var/run/secrets/kubernetes.io'): return True, pid, failed_process_name else: # TODO: try to relaunch some of the processes (FWs, fluentbit, frontend) self._logger.info(self._child_process_command_args_map[pid]) return True, pid, failed_process_name def shutdown(self): shutdown_message = {} shutdown_message["action"] = "stop" lqcm_shutdown = LocalQueueClientMessage(key="0l", value=json.dumps(shutdown_message)) workflow_nodes = self._workflow.getWorkflowNodeMap() for function_topic in workflow_nodes: ack = self._local_queue_client.addMessage(function_topic, lqcm_shutdown, True) while not ack: ack = self._local_queue_client.addMessage(function_topic, lqcm_shutdown, True) self._logger.info("Waiting for function workers to shutdown") self._wait_for_child_processes() for jrh_process in self._javarequesthandler_process_list: process_utils.terminate_and_wait_child(jrh_process, "JavaRequestHandler", 5, self._logger) self._local_queue_client.shutdown() def force_shutdown(self): # called when the queue service has crashed and we need to shut down the function workers for state in self._functionworker_process_map: p = self._functionworker_process_map[state] process_utils.terminate_and_wait_child(p, "FunctionWorker", 5, self._logger) for jrh_process in self._javarequesthandler_process_list: process_utils.terminate_and_wait_child(jrh_process, "JavaRequestHandler", 5, self._logger) self._local_queue_client.shutdown() def _wait_for_child_processes(self): output, error = process_utils.run_command_return_output('pgrep -P ' + str(self._process_id), self._logger) if error is not None: self._logger.error("[SandboxAgent] wait_for_child_processes: Failed to get children process ids: %s", str(error)) return children_pids = set(output.split()) self._logger.info("[SandboxAgent] wait_for_child_processes: Parent pid: %s, Children pids: %s", str(self._process_id), str(children_pids)) for jrh_process in self._javarequesthandler_process_list: if str(jrh_process.pid) in children_pids: children_pids.remove(str(jrh_process.pid)) self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on JavaRequestHandler pid: %s", str(jrh_process.pid)) ## find fluentbit PID output, error = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger) fbpid = output.strip() if fbpid in children_pids: children_pids.remove(fbpid) self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on fluent-bit pid: %s", fbpid) if self._queue_service_process is not None: if str(self._queue_service_process.pid) in children_pids: children_pids.remove(str(self._queue_service_process.pid)) self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on queue service pid: %s", str(self._queue_service_process.pid)) if self._frontend_process is not None: if str(self._frontend_process.pid) in children_pids: children_pids.remove(str(self._frontend_process.pid)) self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on frontend pid: %s", str(self._frontend_process.pid)) if not children_pids: self._logger.info("[SandboxAgent] wait_for_child_processes: No remaining pids to wait for") return while True: try: cpid, status = os.waitpid(-1, 0) self._logger.info("[SandboxAgent] wait_for_child_processes: Status changed for pid: %s, Status: %s", str(cpid), str(status)) if str(cpid) not in children_pids: #print('wait_for_child_processes: ' + str(cpid) + "Not found in children_pids") continue children_pids.remove(str(cpid)) if not children_pids: self._logger.info("[SandboxAgent] wait_for_child_processes: No remaining pids to wait for") break except Exception as exc: self._logger.error('[SandboxAgent] wait_for_child_processes: %s', str(exc)) def _start_python_function_worker(self, worker_params, env_var_list): error = None function_name = worker_params["fname"] state_name = worker_params["functionstatename"] custom_env = os.environ.copy() old_ld_library_path = "" if "LD_LIBRARY_PATH" in custom_env: old_ld_library_path = custom_env["LD_LIBRARY_PATH"] custom_env["LD_LIBRARY_PATH"] = "/opt/mfn/workflow/states/" + state_name + "/" + function_name + ":/opt/mfn/workflow/states/" + state_name + "/" + function_name + "/lib" if old_ld_library_path != "": custom_env["LD_LIBRARY_PATH"] = custom_env["LD_LIBRARY_PATH"] + ":" + old_ld_library_path #custom_env["PYTHONPATH"] = "/opt/mfn/workflow/states/" + state_name + "/" + function_name for env_var in env_var_list: idx = env_var.find("=") if idx == -1: continue env_var_key = env_var[0:idx] env_var_value = env_var[idx+1:] custom_env[env_var_key] = env_var_value #self._logger.info("environment variables (after user env vars): %s", str(custom_env)) if self._python_version >= (3, ): cmd = "python3 " else: cmd = "python " cmd = cmd + "/opt/mfn/FunctionWorker/python/FunctionWorker.py" cmd = cmd + " " + '\"/opt/mfn/workflow/states/%s/worker_params.json\"' % state_name # state_name can contain whitespace filename = '/opt/mfn/logs/function_' + state_name + '.log' log_handle = open(filename, 'a') # store command arguments for when/if we need to restart the process if it fails command_args_map = {} command_args_map["command"] = cmd command_args_map["custom_env"] = custom_env command_args_map["log_filename"] = filename #self._logger.info("Starting function worker: " + state_name + " with stdout/stderr redirected to: " + filename) error, process = process_utils.run_command(cmd, self._logger, custom_env=custom_env, process_log_handle=log_handle) if error is None: self._functionworker_process_map[state_name] = process self._child_process_command_args_map[process.pid] = command_args_map self._logger.info("Started function worker: %s, pid: %s, with stdout/stderr redirected to: %s", state_name, str(process.pid), filename) return error def _start_function_worker(self, worker_params, runtime, env_var_list): error = None if runtime.find("python") != -1: error = self._start_python_function_worker(worker_params, env_var_list) elif runtime.find("java") != -1: # TODO: environment/JVM variables need to be utilized by the java request handler, not by the function worker if SINGLE_JVM_FOR_FUNCTIONS: # _XXX_: we'll launch the single JVM handling all java functions later error = self._start_python_function_worker(worker_params, env_var_list) else: # if jar, the contents have already been extracted as if it was a zip archive # start the java request handler if self._function_runtime == "java" # we wrote the parameters to json file at the state directory self._logger.info("Launching JavaRequestHandler for state: %s", worker_params["functionstatename"]) cmdjavahandler = "java -jar /opt/mfn/JavaRequestHandler/target/javaworker.jar " cmdjavahandler += "/opt/mfn/workflow/states/" + worker_params["functionstatename"] + "/java_worker_params.json" error, process = process_utils.run_command(cmdjavahandler, self._logger, wait_until="Waiting for requests on:") if error is not None: error = "Could not launch JavaRequestHandler: " + worker_params["fname"] + " " + error self._logger.error(error) else: self._javarequesthandler_process_list.append(process) error = self._start_python_function_worker(worker_params, env_var_list) else: error = "Unsupported function runtime: " + runtime return error def _prepare_update_for_locally_running(self, local_functions): update = {} update["action"] = "update-local-functions" update["localFunctions"] = local_functions update = json.dumps(update) lqcm_update = LocalQueueClientMessage(key="0l", value=update) return lqcm_update def _update_function_worker(self, topic, lqcm_update): ack = self._local_queue_client.addMessage(topic, lqcm_update, True) while not ack: ack = self._local_queue_client.addMessage(topic, lqcm_update, True) def _update_remaining_function_workers(self, excluded_function_topic, lqcm_update=None): local_functions = self._workflow.getWorkflowLocalFunctions() if lqcm_update is None: lqcm_update = self._prepare_update_for_locally_running(local_functions) for locally_running_ft in local_functions: if locally_running_ft == excluded_function_topic: continue self._update_function_worker(locally_running_ft, lqcm_update) def stop_function_worker(self, function_topic): # remove from locally running functions self._workflow.removeLocalFunction(function_topic) # first, update locally running functions with remaining functions self._update_remaining_function_workers(function_topic) # send stop message to function worker's queue stop = {} stop["action"] = "stop" stop = json.dumps(stop) lqcm_update = LocalQueueClientMessage(key="0l", value=stop) self._update_function_worker(function_topic, lqcm_update) def _install_sandbox_requirements(self, parameters): error = None installer = parameters["installer"] requirements = parameters["requirements"] additional_installer_options = {} if "additional_installer_options" in parameters: additional_installer_options = parameters["additional_installer_options"] if requirements: # TODO: other installers (e.g., apt-get)? if installer == "pip": # launch 'pip install' with any parameters related to proxy etc. # store requirements into /opt/mfn/requirements.txt reqfname = "/opt/mfn/requirements.txt" with open(reqfname, "w+") as reqf: for req in requirements: reqf.write(req + "\n") # modify command to add additional installer options if self._python_version >= (3, ): cmd = "python3 " else: cmd = "python " cmd = cmd + "-m pip install --user" cmd += " --no-compile --no-clean" for opt in additional_installer_options: cmd = cmd + " " + opt + " " + additional_installer_options[opt] cmd = cmd + " -r " + reqfname # launch 'pip install [additional_options] -r /opt/mfn/requirements.txt error, _ = process_utils.run_command(cmd, self._logger, wait_output=True) else: error = "Unsupported installer: " + installer return error def _retrieve_and_store_function_code(self, resource_name, resource_info): error = None rpath = "/opt/mfn/code/resources/" + resource_name + "/" fpath = rpath + resource_name if resource_info["runtime"].find("python") != -1: fpath = fpath + ".py" elif resource_info["runtime"].find("java") != -1: fpath = fpath + ".java" else: error = "Unsupported runtime: " + resource_info["runtime"] return (error, None) if not os.path.exists(os.path.dirname(fpath)): try: os.makedirs(os.path.dirname(fpath)) except OSError as err: if err.errno != os.errno.EEXIST: error = err return (error, None) resource_code = self._global_data_layer_client.get(resource_info["ref"]) if resource_code is None: error = "Empty function code." return (error, None) try: resource_code = base64.b64decode(resource_code).decode() except Exception as exc: error = "Invalid value for code: " + str(exc) self._logger.error(error) return (error, None) with open(fpath, "w") as funcf: funcf.write(resource_code) return (error, rpath) def _retrieve_and_store_function_zip(self, resource_name, resource_info): error = None zipref = resource_info["ref"] num_chunks_str = self._global_data_layer_client.get(zipref) try: num_chunks = int(num_chunks_str) except Exception as exc: error = "Invalid value for key " + zipref + "; expected number of chunks: " + str(exc) self._logger.error(error) return (error, None) zip_content = "" ind = zipref.find("num_chunks_") gid = zipref[ind+11:] pref = zipref[0:ind] + gid + "_chunk_" for i in range(num_chunks): chunkref = pref + str(i) chunk = self._global_data_layer_client.get(chunkref) if chunk is None: error = "Empty zip chunk." return (error, None) zip_content = zip_content + chunk old_len = len(zip_content) rem = old_len % 4 if rem > 0: num_pad = 4 - rem for i in range(num_pad): zip_content = zip_content + "=" try: decodedzip = base64.b64decode(zip_content) except Exception as exc: error = "Invalid value for assembled chunks; couldn't decode base64: " + str(exc) self._logger.error(error) return (error, None) runtime = resource_info["runtime"] # 1. store zip file zipfname = "/opt/mfn/code/zips/" + resource_name + ".zip" if not os.path.exists(os.path.dirname(zipfname)): try: os.makedirs(os.path.dirname(zipfname)) except OSError as err: if err.errno != os.errno.EEXIST: error = err return (error, None) with open(zipfname, "wb") as zipfile: zipfile.write(decodedzip) gextractedpath = "/opt/mfn/code/resources/" + resource_name + "/" # 2. extract zip file if not os.path.exists(os.path.dirname(gextractedpath)): try: os.makedirs(os.path.dirname(gextractedpath)) except OSError as err: if err.errno != os.errno.EEXIST: error = err return (error, None) cmdunzip = "unzip " + zipfname + " -d " + gextractedpath error, _ = process_utils.run_command(cmdunzip, self._logger, wait_output=True) if error is not None: error = "Could not extract zip file: " + resource_name + " " + error self._logger.error(error) return (error, None) # 3. need to set executable permissions for the extracted libs cmdperm = "sh -c \"find " + gextractedpath + "| xargs -I {} file {}" cmdperm = cmdperm + "| grep ELF" + "| grep -v grep" cmdperm = cmdperm + "| awk -F ':' '{print $1}'" cmdperm = cmdperm + "| xargs -I {} chmod +x {}\"" error, _ = process_utils.run_command(cmdperm, self._logger, wait_output=True) if error is not None: error = "Could not set lib permissions: " + resource_name + " " + error self._logger.error(error) return (error, None) if runtime.find("python") != -1: fpath = gextractedpath + resource_name fpath = fpath + ".py" resource_code = self._global_data_layer_client.get("grain_source_" + resource_info["id"]) if resource_code is not None or resource_code != "": try: resource_code = base64.b64decode(resource_code).decode() except Exception as exc: error = "Invalid value for function code: " + str(exc) self._logger.error(error) return (error, None) self._logger.info("Overwriting zip resource file with the updated resource code...") with open(fpath, "w") as funcf: funcf.write(resource_code) elif runtime.find("java") != -1: # TODO: try to retrieve the updated resource? # To do that, we'd need to know the actual state name (i.e., in the workflow description), # which (for now) has to be the same as the Java class. # This class name can differ from the resource name # (e.g., one jar containing multiple classes with handle functions, such that each function is used as a separate state) # that means, we'd need to do the code update just at the beginning of when we create the state and also the compilation, # but before copying the resource to each state's separate location # TODO: double check whether this is also the case for python pass else: error = "Unsupported runtime: " + resource_info["runtime"] return (error, None) return (error, gextractedpath) def _initialize_data_layer_storage(self): # each data layer client will automatically create the local keyspace and tables # upon instantiation # mfn internal tables local_dlc = DataLayerClient(locality=0, for_mfn=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True) local_dlc.shutdown() # user storage tables local_dlc = DataLayerClient(locality=0, suid=self._storage_userid, connect=self._datalayer, init_tables=True) local_dlc.shutdown() # workflow private tables local_dlc = DataLayerClient(locality=0, is_wf_private=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True) local_dlc.shutdown() # for global access, (re)create; it's okay because the operations are idempotent # user storage is created by management service # mfn internal tables global_dlc = DataLayerClient(locality=1, for_mfn=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True) global_dlc.shutdown() # workflow private tables global_dlc = DataLayerClient(locality=1, is_wf_private=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True) global_dlc.shutdown() def _populate_worker_params(self, function_topic, wf_node, state): worker_params = {} worker_params["userid"] = self._userid worker_params["storageuserid"] = self._storage_userid worker_params["sandboxid"] = self._sandboxid worker_params["workflowid"] = self._workflowid worker_params["workflowname"] = self._workflowname worker_params["ffolder"] = state["resource_dirpath"] worker_params["fpath"] = state["resource_filepath"] worker_params["fname"] = state["resource_filename"] worker_params["fruntime"] = state["resource_runtime"] worker_params["ftopic"] = function_topic worker_params["hostname"] = self._hostname worker_params["queue"] = self._queue worker_params["datalayer"] = self._datalayer worker_params["externalendpoint"] = self._external_endpoint worker_params["internalendpoint"] = self._internal_endpoint worker_params["managementendpoints"] = self._management_endpoints worker_params["fnext"] = wf_node.getNextMap() worker_params["fpotnext"] = wf_node.getPotentialNextMap() worker_params["functionstatetype"] = wf_node.getGWFType() worker_params["functionstatename"] = wf_node.getGWFStateName() worker_params["functionstateinfo"] = wf_node.getGWFStateInfo() worker_params["workflowfunctionlist"] = self._workflow.getWorkflowFunctionMap() worker_params["workflowexit"] = self._workflow.getWorkflowExitPoint() worker_params["sessionworkflow"] = self._workflow.is_session_workflow() worker_params["sessionfunction"] = wf_node.is_session_function() worker_params["sessionfunctionparameters"] = wf_node.get_session_function_parameters() worker_params["shouldcheckpoint"] = self._workflow.are_checkpoints_enabled() return worker_params def _compile_java_resources_if_necessary(self, resource, mvndeps): error = None cmdmkdir = "mkdir -p " + resource["dirpath"] + "target/classes" self._logger.info("Preparing for compilation of Java function resources: %s", resource["name"]) error, _ = process_utils.run_command(cmdmkdir, self._logger, wait_output=True) if error is not None: error = "Could not create target directory for resource: " + resource["name"] + " " + error self._logger.error(error) return error #cmdjavac = "javac -classpath /opt/mfn/JavaRequestHandler/mfnapi.jar -d " + resource["dirpath"] + "target/classes " #cmdjavac += resource["dirpath"] + resource["name"] + ".java" cmdfind = "find " + resource["dirpath"] + " -name *.java" output, error = process_utils.run_command_return_output(cmdfind, self._logger) if error is not None: self._logger.error("[SandboxAgent] could not search for any Java sources: %s", str(error)) error = "Could not search for any Java sources: " + resource["name"] + " " + str(error) return error source_files = set(output.split("\n")) source_files = ' '.join(source_files).strip() should_compile = False if source_files != "": should_compile = True self._logger.info("Found following Java sources: %s", str(source_files)) else: self._logger.info("No java sources to compile.") # 2. check for pom.xml or the requirements; if it is there, then: if mvndeps is not None and not os.path.exists(resource["dirpath"] + "pom.xml"): # write the content of mvndeps into the pom.xml self._logger.info("Writing maven build file: %spom.xml", resource["dirpath"]) with open(resource["dirpath"] + "pom.xml", "w") as fpom: fpom.write(mvndeps) # we either had a pom.xml file in the archive or non-empty mvndeps from uploaded requirements, which we wrote as the pom.xml file # regardless, if there is a pom file, then resolve and copy maven dependencies if os.path.exists(resource["dirpath"] + "pom.xml"): cmdmvn = "mvn -Duser.home=/tmp -DskipTests -gs /opt/mfn/JavaRequestHandler/maven/sandbox-mvn-settings.xml -f " + resource["dirpath"] cmdmvn += " dependency:copy-dependencies -DoutputDirectory=" + resource["dirpath"] + "target/classes" self._logger.info("Copying maven dependencies for Java function: %s", resource["name"]) error, _ = process_utils.run_command(cmdmvn, self._logger, wait_output=True) if error is not None: error = "Could not copy maven dependencies: " + resource["name"] + " " + error self._logger.error(error) return error self._logger.info("Finished copying dependencies for Java function: %s", resource["name"]) if should_compile: cmdjavac = "javac -classpath /opt/mfn/JavaRequestHandler/mfnapi.jar:" cmdjavac += resource["dirpath"] + "target/classes/* " cmdjavac += "-d " + resource["dirpath"] + "target/classes " + source_files self._logger.info("Compiling Java function resources: %s", resource["name"]) self._logger.info(cmdjavac) error, _ = process_utils.run_command(cmdjavac, self._logger, wait_output=True) if error is not None: error = "Could not compile resource: " + resource["name"] + " " + error self._logger.error(error) return error self._logger.info("Finished compiling Java function resources: %s", resource["name"]) return error def process_deployment_info(self): has_error = False errmsg = "" if self._deployment_info is not None and self._deployment_info != "": try: self._deployment_info = json.loads(self._deployment_info) self._logger.debug("Deployment info: %s", json.dumps(self._deployment_info)) except Exception as exc: errmsg = "Could not parse deployment info: " + str(exc) self._logger.error(errmsg) has_error = True return has_error, errmsg else: errmsg = "Empty deployment info." has_error = True return has_error, errmsg if "workflow" not in self._deployment_info or "resources" not in self._deployment_info: errmsg = "Incomplete deployment info: " + json.dumps(self._deployment_info) self._logger.error(errmsg) has_error = True return has_error, errmsg # get workflow info workflow_info = self._deployment_info["workflow"] sid = workflow_info["sandboxId"] if sid != self._sandboxid: warnmsg = "WARN: workflow info sandboxid doesn't match provided sandboxid ("+sid+" <-> "+workflow_info["sandboxId"]+")" self._logger.info(warnmsg) wid = workflow_info["workflowId"] if wid != self._workflowid: warnmsg = "WARN: workflow info workflowid doesn't match provided workflowid ("+wid+" <-> "+workflow_info["workflowId"]+")" print(warnmsg) wf_type = workflow_info["workflowType"] usertoken = '' if "usertoken" in workflow_info: usertoken = workflow_info["usertoken"] os.environ["USERTOKEN"] = usertoken # get workflow json, parse workflow json and init params workflow_json = self._global_data_layer_client.get(workflow_info["json_ref"]) if workflow_json is None or workflow_json == "": has_error = True errmsg = "Empty workflow description." return has_error, errmsg try: workflow_json = base64.b64decode(workflow_json).decode() except Exception as exc: has_error = True errmsg = "Invalid value for workflow json: " + str(exc) return has_error, errmsg self._workflow = Workflow(self._userid, sid, wid, wf_type, workflow_json, self._logger) has_error = self._workflow.has_error() if has_error: errmsg = "Problem in workflow description: " + str(workflow_json) self._logger.error(errmsg) return has_error, errmsg # get workflow nodes workflow_nodes = self._workflow.getWorkflowNodeMap() # get resources info and find functions resource_map = {} resource_info_map = self._deployment_info["resources"] if any(resource_info_map[res_name]["runtime"] == "Java" for res_name in resource_info_map): # run setup_maven.sh to update the proxy settings at runtime # (i.e., the sandbox image may have been built on a machine with a proxy, or vice versa) cmd_maven_proxy_initer = "/opt/mfn/JavaRequestHandler/./setup_maven.sh" self._logger.info("Updating maven proxy settings...") error, _ = process_utils.run_command(cmd_maven_proxy_initer, self._logger, wait_output=True) if error is not None: has_error = True errmsg = "Could not reinitialize maven proxy settings: " + error return has_error, errmsg self._logger.info("Finished updating maven proxy settings.") # for pip installable dependencies for python functions req_map = {} t_start_download = time.time() # store functions in local filesystem for resource_name in resource_info_map: resource_info = resource_info_map[resource_name] resource_info["runtime"] = resource_info["runtime"].lower() if resource_info["type"] == "code": error, resource_dirpath = self._retrieve_and_store_function_code(resource_name, resource_info) else: error, resource_dirpath = self._retrieve_and_store_function_zip(resource_name, resource_info) if error is not None: errmsg = "Could not retrieve and store function: " + resource_name + " " + error self._logger.error(errmsg) has_error = True return has_error, errmsg # these requirements can now be also for java maven dependencies resource_id = resource_info["id"] greq = self._global_data_layer_client.get("grain_requirements_" + resource_id) mvndeps = None if greq is not None and greq != "": greq = base64.b64decode(greq).decode() if resource_info["runtime"].find("python") == 0: # get function requirements and put it into a map lines = greq.strip().split("\n") for line in lines: req_map[line] = True elif resource_info["runtime"].find("java") == 0: mvndeps = greq # get function environment variables env_var_list = [] genv = self._global_data_layer_client.get("grain_environment_variables_" + resource_id) if genv is not None and genv != "": genv = base64.b64decode(genv).decode() lines = genv.split("\n") env_var_list = lines resource = {} resource["name"] = resource_name resource["dirpath"] = resource_dirpath resource["runtime"] = resource_info["runtime"] resource["env_var_list"] = env_var_list resource_map[resource_name] = resource # compile the java sources if resource["runtime"].find("java") == 0: # even if it was just a single java file # or a jar file uploaded with source files # or a jar file with just class files, # the following function will # 1. download maven dependencies (if there is a pom.xml in the jar or was separately uploaded) # 2. compile the source files if any error = self._compile_java_resources_if_necessary(resource, mvndeps) if error is not None: errmsg = "Could not compile Java function resources: " + resource_name + " " + error self._logger.error(errmsg) has_error = True return has_error, errmsg total_time_download = (time.time() - t_start_download) * 1000.0 self._logger.info("Download time for all function code: %s (ms)", str(total_time_download)) t_start_requirements = time.time() # this list will only contain pip installable dependencies # java maven dependencies will be handled while compiling the java resources sbox_req_list = [] for req_line in req_map: sbox_req_list.append(req_line) # install sandbox requirements req = workflow_info["sandbox_requirements"] req["requirements"] = sbox_req_list error = self._install_sandbox_requirements(req) if error is not None: errmsg = "Could not install sandbox requirements. " + str(error) self._logger.error(errmsg) has_error = True return has_error, errmsg total_time_requirements = (time.time() - t_start_requirements) * 1000.0 self._logger.info("Requirements install time: %s (ms)", str(total_time_requirements)) t_start_storage = time.time() # initialize local data layer space for user and workflow self._initialize_data_layer_storage() total_time_storage = (time.time() - t_start_storage) * 1000.0 self._logger.info("Storage initialization time: %s (ms)", str(total_time_storage)) self._local_queue_client = LocalQueueClient(connect=self._queue) self._local_queue_client.addTopic(self._workflow.getWorkflowExitTopic()) t_start_launch = time.time() # accummulate all java worker params into one # later, we'll launch a single JVM to handle all java functions if SINGLE_JVM_FOR_FUNCTIONS: single_jvm_worker_params = {} any_java_function = False total_time_state = 0.0 for function_topic in workflow_nodes: wf_node = workflow_nodes[function_topic] resource_name = wf_node.get_resource_name() t_start_state = time.time() if resource_name == "": # this is an ASL state without a resource (i.e., function) attached to it error, resource = state_utils.create_dummy_resource_for_asl_state(wf_node) if error is not None: errmsg = "Could not create non-resource state. " + str(error) self._logger.error(errmsg) has_error = True return has_error, errmsg else: resource = resource_map[resource_name] error, state = state_utils.create_state(wf_node, resource, self._logger) if error is not None: errmsg = "Could not create state: " + str(error) self._logger.error(errmsg) has_error = True return has_error, errmsg total_time_state += (time.time() - t_start_state) * 1000.0 self._local_queue_client.addTopic(function_topic) # compile worker parameters worker_params = self._populate_worker_params(function_topic, wf_node, state) # store worker parameters as a local file params_filename = state["dirpath"] + "worker_params.json" with open(params_filename, "w") as paramsf: json.dump(worker_params, paramsf, indent=4) if state["resource_runtime"].find("java") != -1: java_worker_params = {} java_worker_params["functionPath"] = worker_params["ffolder"] java_worker_params["functionName"] = worker_params["fname"] java_worker_params["serverSocketFilename"] = "/tmp/java_handler_" + worker_params["functionstatename"] + ".uds" if SINGLE_JVM_FOR_FUNCTIONS: any_java_function = True single_jvm_worker_params[worker_params["functionstatename"]] = java_worker_params else: java_params_filename = state["dirpath"] + "java_worker_params.json" with open(java_params_filename, "w") as javaparamsf: json.dump(java_worker_params, javaparamsf, indent=4) # launch function workers with the params parsed from workflow info error = self._start_function_worker(worker_params, state["resource_runtime"], state["resource_env_var_list"]) if error is not None: errmsg = "Problem launching function worker for: " + worker_params["fname"] self._logger.error(errmsg) has_error = True return has_error, errmsg # add the new function worker to the local list self._workflow.addLocalFunction(function_topic) # all function workers have been launched; update them with locally running functions # prepare update message to be used by all local_functions = self._workflow.getWorkflowLocalFunctions() lqcm_update = self._prepare_update_for_locally_running(local_functions) for function_topic in workflow_nodes: self._update_function_worker(function_topic, lqcm_update) if SINGLE_JVM_FOR_FUNCTIONS: if any_java_function: single_jvm_params_filename = "/opt/mfn/workflow/states/single_jvm_worker_params.json" with open(single_jvm_params_filename, "w") as jvmparamsf: json.dump(single_jvm_worker_params, jvmparamsf, indent=4) self._logger.info("Launching a single JavaRequestHandler for all Java states...") cmdjavahandler = "java -jar /opt/mfn/JavaRequestHandler/target/javaworker.jar " cmdjavahandler += single_jvm_params_filename error, process = process_utils.run_command(cmdjavahandler, self._logger, wait_until="Waiting for requests on:") if error is not None: errmsg = "Problem launching JavaRequestHandler for Java states: " + error self._logger.error(errmsg) has_error = True return has_error, errmsg else: self._javarequesthandler_process_list.append(process) self._logger.info("State creation for all function workers: %s (ms)", str(total_time_state)) total_time_launch = (time.time() - t_start_launch) * 1000.0 self._logger.info("Launch time for all function workers: %s (ms)", str(total_time_launch)) if not has_error: # check whether all function workers have launched successfully # give some time for function workers to come up cmd = "pgrep -P " + str(self._process_id) + " -a" output, error = process_utils.run_command_return_output(cmd, self._logger) if error is not None: self._logger.error("[SandboxAgent] check health of function workers: failed to get FunctionWorker processes: %s", str(error)) has_error = True errmsg = "Could not get FunctionWorker processes." if not has_error: fwlines = set(output.split("\n")) fwpids = [] for line in fwlines: if "FunctionWorker.py" in line: pid = line.split(" ")[0] fwpids.append(pid) if str(self._fluentbit_process.pid) in fwpids: fwpids.remove(str(self._fluentbit_process.pid)) self._logger.info(str(len(fwpids)) + " " + str(len(self._functionworker_process_map))) #self._logger.info(str(fwpids) + " " + str(self._functionworker_process_map)) if len(fwpids) != len(self._functionworker_process_map): has_error = True errmsg = "One or more function workers could not be launched:\n" for state_name in self._functionworker_process_map: fwp = self._functionworker_process_map[state_name] if fwp.pid not in fwpids: errmsg += state_name + "\n" self._global_data_layer_client.shutdown() return has_error, errmsg
class SessionUtils: def __init__(self, hostname, uid, sid, wid, logger, funcstatename, functopic, key, session_id, publication_utils, queue, datalayer, internal_endpoint): self._logger = logger self._queue = queue self._datalayer = datalayer self._session_id = session_id self._session_function_id = None self._hostname = hostname self._userid = uid self._sandboxid = sid self._workflowid = wid self._function_state_name = funcstatename self._function_topic = functopic self._internal_endpoint = internal_endpoint self._key = key self._publication_utils = publication_utils self._is_session_function_running = False self._helper_thread = None self._global_data_layer_client = DataLayerClient( locality=1, sid=sid, for_mfn=True, connect=self._datalayer) # only valid if this is a session function (i.e., session_function_id is not None) self._local_topic_communication = None self._session_function_parameters = None if self._session_id is None: self._generate_session_id() self._setup_metadata_tablenames() # _XXX_: the following does not have any effect and makes unnecessary calls # to the data layer # the main reason is that the backend at the data layer does not create # sets and maps (i.e., createSet, createMap) until an entry is made # the addition of the entries will succeed without requiring the # corresponding set/map to have been created. #self._create_metadata_tables() #self._logger.debug("[SessionUtils] init done.") ########################### # # Alias operations with a given session id? # probably not needed. the session id that is generated at session start # would be returned to the client, # which would send it back in the future to set the context correctly # (i.e., happens implicitly during function instantiation and/or communication). # the application can then also set a session alias and return it to the client, # which can use it in the future to set the context. # however, the session will be implicitly identified via the client sending back # the session id and/or the alias. # no need to allow other explicit access to alias operations. # # How to deal with access control between sessions? # (i.e., a function session A should not be able to set an alias for session B). # when the context is correctly set via the session id and/or the session alias # and with no explicit access to alias operations with a given session id, # this cannot happen. # # Alias operations with a session function id? # in a given session, any function may assign an alias to another session function instance # in other words, it doesn't need to be the actual session function instance that is setting # its alias; it could be a regular function that is assigning aliases to session function instances. # when that happens, we'd need to update the relevant session function with its new alias, # Actually, just keep all aliases in the data layer, so that get() operations read it from there # and set() operations update it there (i.e., no need to keep localized versions) # keeping the localized versions up-to-date with the data layer would require # synchronization when there is an update (most probably via an immediate special message) ########################### def set_session_alias(self, alias): # update metadata (session alias -> session id) mapping # check whether it is already in use old_session_id = self._global_data_layer_client.getMapEntry( self._map_name_session_alias_id, alias) if old_session_id is not None and old_session_id != "" and old_session_id != self._session_id: self._logger.warning( "Cannot overwrite alias (" + alias + ") that is in use by another session (existing session id: " + old_session_id + ").") return self._global_data_layer_client.putMapEntry( self._map_name_session_alias_id, alias, self._session_id) self._global_data_layer_client.putMapEntry( self._map_name_session_id_alias, self._session_id, alias) def get_session_alias(self): session_alias = self._global_data_layer_client.getMapEntry( self._map_name_session_id_alias, self._session_id) if session_alias == "": session_alias = None return session_alias def unset_session_alias(self): # update metadata session_alias = self.get_session_alias() if session_alias is not None: self._global_data_layer_client.deleteMapEntry( self._map_name_session_alias_id, session_alias) self._global_data_layer_client.deleteMapEntry( self._map_name_session_id_alias, self._session_id) def set_session_function_alias(self, alias, session_function_id=None): # handle setting an alias for another session function if session_function_id is None: session_function_id = self._session_function_id else: # check whether the session function id actually exists in the session functions list rgidlist = self.get_all_session_function_ids() if session_function_id not in rgidlist: self._logger.warning("Cannot find session function with id: " + str(session_function_id) + " for setting its alias.") return # check whether it is already in use; cannot have the same alias for two different instances old_session_function_id = self._global_data_layer_client.getMapEntry( self._map_name_session_function_alias_id, alias) if old_session_function_id is not None and old_session_function_id != "" and old_session_function_id != session_function_id: self._logger.warning( "Cannot use alias (" + alias + ") that is in use by another session function (existing session function id: " + old_session_function_id + ").") return # update metadata (session function alias -> session function id) mapping # also (session function id -> session function alias) mapping self._global_data_layer_client.putMapEntry( self._map_name_session_function_alias_id, alias, session_function_id) self._global_data_layer_client.putMapEntry( self._map_name_session_function_id_alias, session_function_id, alias) def get_session_function_alias(self, session_function_id=None): # handle setting an alias for another session function if session_function_id is None: session_function_id = self._session_function_id else: # check whether the session function id actually exists in the session functions list rgidlist = self.get_all_session_function_ids() if session_function_id not in rgidlist: self._logger.warning("Cannot find session function with id: " + str(session_function_id) + " for getting its alias.") return None # handle getting an alias for another session function alias = self._global_data_layer_client.getMapEntry( self._map_name_session_function_id_alias, session_function_id) if alias == "": alias = None return alias def unset_session_function_alias(self, session_function_id=None): # handle unsetting the alias for another session function if session_function_id is None: session_function_id = self._session_function_id else: # check whether the session function id actually exists in the session functions list rgidlist = self.get_all_session_function_ids() if session_function_id not in rgidlist: self._logger.warning("Cannot find session function with id: " + str(session_function_id) + " for unsetting its alias.") return # update metadata session_function_alias = self.get_session_function_alias( session_function_id) if session_function_alias is not None: self._global_data_layer_client.deleteMapEntry( self._map_name_session_function_alias_id, session_function_alias) self._global_data_layer_client.deleteMapEntry( self._map_name_session_function_id_alias, session_function_id) def get_session_id(self): return self._session_id def get_session_function_id(self): return self._session_function_id def get_session_function_id_with_alias(self, alias=None): if alias is None: return self._session_function_id sgid = self._global_data_layer_client.getMapEntry( self._map_name_session_function_alias_id, alias) return sgid def get_all_session_function_ids(self): rgidset = self._global_data_layer_client.getMapKeys( self._map_name_session_functions) rgidlist = list(rgidset) return rgidlist def get_all_session_function_aliases(self): alias_map = {} alias_map = self._global_data_layer_client.retrieveMap( self._map_name_session_function_alias_id) return alias_map def get_alias_summary(self): alias_summary = {} # 1. add current session alias alias_summary["session"] = {} session_alias = self.get_session_alias() if session_alias is None: session_alias = "" alias_summary["session"][self._session_id] = session_alias # 2. add current session function aliases alias_summary["session_functions"] = {} # 2.1. get all session function ids rgidlist = self.get_all_session_function_ids() for rgid in rgidlist: alias_summary["session_functions"][rgid] = "" # 2.2. get assigned aliases to all session functions alias_map = self.get_all_session_function_aliases() # 2.3. merge 2.1 and 2.2 # it is possible that some session functions will have no alias for alias in alias_map.keys(): rgid = alias_map[alias] alias_summary["session_functions"][rgid] = alias return alias_summary # every function in a session workflow will call this, setting up the metadata tablenames def _generate_session_id(self): if self._session_id is None: # MUST be unique and deterministic (so that multiple, concurrent instances generate the same) # uid + sid + wid + key # emitting messages during execution MUST use existing session id # due to key being different for each request to the workflow plain_session_id_bytes = (self._userid + "_" + self._sandboxid + "_" + self._workflowid + "_" + self._key).encode() self._session_id = hashlib.sha256( plain_session_id_bytes).hexdigest() self._logger.debug("[SessionUtils] Session id: " + self._session_id) def _generate_session_function_id(self): if self._session_function_id is None: # this cannot be just instanceid (i.e., key of the request); multiple functions receive the same instance id # should include some randomness, so that the same function can be instantiated more than once # need to use (gname + key + random) # we are only interested in keeping the session function ids of the same sandbox/workflow/session random.seed() plain_session_function_id_bytes = ( self._function_state_name + "_" + self._key + "_" + str(random.uniform(0, 100000))).encode() self._session_function_id = hashlib.sha256( plain_session_function_id_bytes).hexdigest() self._logger.debug("[SessionUtils] Session function id: " + self._session_function_id) # these calls don't have an effect until an entry is added # and the entries still succeed even without calling to createSet or createMap # making these calls unnecessary def _create_metadata_tables(self): # create the metadata tables if necessary names_sets = self._global_data_layer_client.getSetNames() names_maps = self._global_data_layer_client.getMapNames() if self._map_name_session_functions not in names_maps: self._global_data_layer_client.createMap( self._map_name_session_functions) if self._map_name_session_function_name_id_sets not in names_maps: self._global_data_layer_client.createMap( self._map_name_session_function_name_id_sets) if self._set_name_session_function_name_ids not in names_sets: self._global_data_layer_client.createSet( self._set_name_session_function_name_ids) if self._map_name_session_alias_id not in names_maps: self._global_data_layer_client.createMap( self._map_name_session_alias_id) if self._map_name_session_id_alias not in names_maps: self._global_data_layer_client.createMap( self._map_name_session_id_alias) if self._map_name_session_function_alias_id not in names_maps: self._global_data_layer_client.createMap( self._map_name_session_function_alias_id) if self._map_name_session_function_id_alias not in names_maps: self._global_data_layer_client.createMap( self._map_name_session_function_id_alias) def _setup_metadata_tablenames(self): # set up metadata tables # we know the session id, so each metadata table has it in its name # 1. session function instance id -> function instance metadata as 'map' # 2. session function name -> ref to set name of instance ids as 'map' # 3. session function instance ids as 'set' (with session function name as 'set' name) # 4. session alias -> session id metadata as 'map' # 5. session function alias -> session function id metadata as 'map' # 0. set of session function instance ids # we just expose the function instance ids to the application via the map keys # 1. map of session function instances and metadata (key = session function instance id, value = name, location, ...) self._map_name_session_functions = "SessionFunctionInstanceIdMap_" + self._session_id # 2. map of function names and ref to set of instance ids self._map_name_session_function_name_id_sets = "SessionFunctionNameIdSetsMap_" + self._session_id # 3. set of function instance ids of a function; referenced by SessionFunctionNameIdSetsMap self._set_name_session_function_name_ids = "SessionFunctionNameIdsSet_" + self._session_id + "_" + self._function_state_name # 4. session alias -> session id mapping; needs to be sandbox-level (i.e., without self._session_id) self._map_name_session_alias_id = "SessionAliasIdMap_" + self._sandboxid # 5. session id -> session alias mapping; needs to be sandbox-level (i.e., without self._session_id) self._map_name_session_id_alias = "SessionIdAliasMap_" + self._sandboxid # 6. session function alias -> session function id mapping self._map_name_session_function_alias_id = "SessionFunctionAliasIdMap_" + self._session_id # 7. session function id -> session function alias mapping self._map_name_session_function_id_alias = "SessionFunctionIdAliasMap_" + self._session_id def _store_metadata(self): # add yourself to the metadata in the data layer # 1. add yourself to the metadata map # use this information in host agent to find the correct host and deliver new messages correctly # need to include the global queue topic name, so that messages # can be also delivered from remote hosts function_metadata = {} function_metadata["hostname"] = self._hostname function_metadata["sandboxId"] = self._sandboxid function_metadata["workflowId"] = self._workflowid function_metadata["sessionId"] = self._session_id function_metadata["functionName"] = self._function_state_name function_metadata[ "communicationTopic"] = self._local_topic_communication function_metadata["remote_address"] = self._internal_endpoint metadata = json.dumps(function_metadata) #self._logger.debug("[SessionUtils] Session function metadata: " + metadata) self._global_data_layer_client.putMapEntry( self._map_name_session_functions, self._session_function_id, metadata) # 2. put the reference to the set of instance ids with our name self._global_data_layer_client.putMapEntry( self._map_name_session_function_name_id_sets, self._function_state_name, self._set_name_session_function_name_ids) # 3. update the set of instance ids with our session function id self._global_data_layer_client.addSetEntry( self._set_name_session_function_name_ids, self._session_function_id) def _remove_metadata(self): # remove any session function alias mappings self.unset_session_function_alias() #if self._key_update_message is not None: # self._local_data_layer_client.delete(self._key_update_message) self._global_data_layer_client.removeSetEntry( self._set_name_session_function_name_ids, self._session_function_id) self._global_data_layer_client.deleteMapEntry( self._map_name_session_function_name_id_sets, self._function_state_name) self._global_data_layer_client.deleteMapEntry( self._map_name_session_functions, self._session_function_id) # TODO: we also need to remove the metadata tables at session end as well as the session alias mappings # i.e., when all functions in the session have been finished. def _setup_session_function_helper(self): params = {} params["sandboxid"] = self._sandboxid params["workflowid"] = self._workflowid params["session_id"] = self._session_id params["session_function_id"] = self._session_function_id # obtain parameters from the function worker params["heartbeat_parameters"] = self._session_function_parameters params["communication_parameters"] = {} params["communication_parameters"][ "local_topic_communication"] = self._local_topic_communication self._helper_thread = SessionHelperThread(params, self._logger, self._publication_utils, self, self._queue, self._datalayer) self._helper_thread.daemon = False self._helper_thread.start() def shutdown_helper_thread(self): self._helper_thread.shutdown() def cleanup(self): self._remove_metadata() self._global_data_layer_client.shutdown() # only to be called from the function worker when it is a session function def setup_session_function(self, session_function_parameters): self._session_function_parameters = session_function_parameters # generate a new session function id self._generate_session_function_id() # for receiving update messages # also set up a global queue topic name, so that this session # function can be sent messages from remote hosts #self._key_update_message = "UpdateMessage_" + self._session_function_id self._local_topic_communication = "SessionFunctionUpdateTopic_" + self._session_function_id # set up metadata tables if necessary and register yourself # maybe first fork? need to have its own global data layer client # no, because this setup is crucial for the operation of the session function # if it fails, we'd need to stop everything else. self._store_metadata() self._is_session_function_running = True # set up the helper thread self._setup_session_function_helper() def set_session_function_running(self, is_running): self._is_session_function_running = is_running def is_session_function_running(self): return self._is_session_function_running # API to send a message to another session function # check the locally running functions, and send them the message locally if so # otherwise, send it to the EventGlobalPublisher's queue def send_to_running_function_in_session(self, session_function_id, message, send_now=False): #self._logger.debug("[SessionUtils] Sending message to running function: " + str(session_function_id) + " now: " + str(send_now)) # send the message to the specific running function id function_metadatastr = self._global_data_layer_client.getMapEntry( self._map_name_session_functions, session_function_id) try: #self._logger.debug("[SessionUtils] function metadata: " + function_metadatastr) function_metadata = json.loads(function_metadatastr) except Exception as exc: self._logger.warning( "[SessionUtils] No such running function instance: " + session_function_id + " " + str(exc)) return # we can use the 'globalTopic' in metadata to also deliver # the message directly to the locally running session function instances # that means, we can skip the delivery by the function worker # that however also means, that the decapsulation of the message # has to happen at the session function's helper thread trigger = {} trigger["value"] = message trigger["to_running_function"] = True trigger["next"] = function_metadata["communicationTopic"] if self._hostname == function_metadata["hostname"]: # local function instance; send it via local queue #self._logger.debug("[SessionUtils] Local session function: " + str(session_function_id)) trigger["is_local"] = True else: # remote function instance #self._logger.debug("[SessionUtils] Remote session function: " + str(session_function_id)) trigger["is_local"] = False trigger["remote_address"] = function_metadata["remote_address"] if send_now: self._publication_utils.send_to_function_now("-1l", trigger) else: self._publication_utils.append_trigger(trigger) def send_to_all_running_functions_in_session_with_function_name( self, session_function_name, message, send_now=False): # get the function ids and send message rgidsetname = self._global_data_layer_client.getMapEntry( self._map_name_session_function_name_id_sets, session_function_name) rgidset = self._global_data_layer_client.retrieveSet(rgidsetname) rgidlist = list(rgidset) for rgid in rgidlist: self.send_to_running_function_in_session(rgid, message, send_now) def send_to_all_running_functions_in_session(self, message, send_now=False): # get the function ids and send message rgidset = self._global_data_layer_client.getMapKeys( self._map_name_session_functions) rgidlist = list(rgidset) for rgid in rgidlist: self.send_to_running_function_in_session(rgid, message, send_now) def send_to_running_function_in_session_with_alias(self, session_function_alias, message, send_now=False): # lookup the session function id and then send to it rgid = self._global_data_layer_client.getMapEntry( self._map_name_session_function_alias_id, session_function_alias) if rgid == "": self._logger.warning( "Cannot send message to session function with alias; no session function with that alias." ) return self.send_to_running_function_in_session(rgid, message, send_now) def get_session_update_messages_with_local_queue(self, count=1, block=False): if self._session_function_id is not None: messages = self._helper_thread.get_messages(count=count, block=block) return messages return None
print("Waiting on DataLayer") while True: host, port = os.getenv("MFN_DATALAYER", hostname + ":4998").rsplit(":", 1) try: addr = socket.gethostbyname(host) connect = addr + ":" + port break except: traceback.print_exc() print("Waiting another 5s for " + host + " to be resolvable") time.sleep(5) # client for bucket "storage_" + get_storage_userid(email) + ";defaultTable" DLCLIENT = DataLayerClient(locality=1, suid="adminATmanagement", connect=connect, init_tables=True) # client for bucket "sbox_Management;wf_Management" DLCLIENT_MANAGEMENT = DataLayerClient(locality=1, sid="Management", wid="Management", is_wf_private=True, connect=connect, init_tables=True) # client for mfn internal storage (for completeness) DLCLIENT_MFN = DataLayerClient(locality=1, sid="Management", for_mfn=True, connect=connect, init_tables=True) DLCLIENT_MFN.shutdown()
def __init__(self, helper_params, logger, pubutils, sessutils, queue, datalayer): self._logger = logger #self._logger.debug("[SessionHelperThread] " + str(helper_params)) self._publication_utils = pubutils self._session_utils = sessutils self._queue = queue self._datalayer = datalayer self._sandboxid = helper_params["sandboxid"] self._workflowid = helper_params["workflowid"] self._session_function_id = helper_params["session_function_id"] self._session_id = helper_params["session_id"] # initialize only needed # need a separate backup data layer client from the publication utils; otherwise, we run into concurrent modification # problems from Thrift # locality = -1 means that the writes happen to the local data layer first and then asynchronously to the global data layer self._backup_data_layer_client = DataLayerClient( locality=-1, for_mfn=True, sid=self._sandboxid, connect=self._datalayer) # set up heartbeat parameters self._heartbeat_enabled = False self._heartbeat_method = None # our own local queue client to be used when sending a heartbeat # TODO: double check if we can just reuse the one we're polling # probably yes self._local_queue_client_heartbeat = None self._heartbeat_function = None self._heartbeat_data_layer_key = None self._data_layer_client_heartbeat = None self._init_heartbeat_parameters(helper_params["heartbeat_parameters"]) # set up communication parameters self._communication_params = helper_params["communication_parameters"] # similar to the data layer rendezvous point for message delivery, we listen to a local topic # allowing us to queue messages and deliver multiple messages to the session function if desired self._local_topic_communication = self._communication_params[ "local_topic_communication"] # by default, assign a simple poll timeout # if the heartbeat is specified, it will be updated to the heartbeat to ensure # we can send regular heartbeats self._local_poll_timeout = py3utils.ensure_long(10000) # use a deque to keep the list of messages # updating the list and retrieving the list would be done by two threads # this should be safe without lock because of the global interpreter lock in python self._message_queue = deque() self._local_queue_client = LocalQueueClient(connect=self._queue) self._special_messages = {} self._special_messages["--stop"] = True self._special_messages["--update-heartbeat"] = True self._is_running = False #self._logger.debug("[SessionHelperThread] init done.") threading.Thread.__init__(self)