def _init_heartbeat_parameters(self, heartbeat_params):
        if "heartbeat_method" not in heartbeat_params:
            self._logger.debug(
                "No heartbeat method is specified; disabling heartbeat.")
            return
        else:
            self._heartbeat_enabled = True
            self._heartbeat_method = heartbeat_params["heartbeat_method"]
            #self._logger.debug("[SessionHelperThread] New heartbeat method: " + str(self._heartbeat_method))

        if self._heartbeat_method == "function":
            if "heartbeat_function" in heartbeat_params:
                # enable function related heartbeat
                self._heartbeat_function = heartbeat_params[
                    "heartbeat_function"]
                #self._logger.debug("[SessionHelperThread] New heartbeat function: " + str(self._heartbeat_function))
                if self._local_queue_client_heartbeat is None:
                    self._local_queue_client_heartbeat = LocalQueueClient(
                        connect=self._queue_service)

                # disable data layer related heartbeat
                if self._data_layer_client_heartbeat is not None:
                    self._data_layer_client_heartbeat.delete(
                        self._heartbeat_data_layer_key)
                    self._heartbeat_data_layer_key = None
                    self._data_layer_client_heartbeat.shutdown()
                    self._data_layer_client_heartbeat = None
        elif self._heartbeat_method == "data_layer":
            # needs to be unique among session functions, so use session id + session function id
            # TODO: how do you check the heartbeat in the data layer?
            # checker service or user function needs to know the key
            # OR keep a new map for heartbeats of the session functions
            # so that the checker can retrieve the keys and their values (e.g., timestamps)
            # if a session function misses a heartbeat, the checker function reports to policy handler

            # enable data layer related heartbeat
            self._heartbeat_data_layer_key = "heartbeat_" + self._session_id + "_" + self._session_function_id
            if self._data_layer_client_heartbeat is None:
                self._data_layer_client_heartbeat = DataLayerClient(
                    locality=1,
                    for_mfn=True,
                    sid=self._sandboxid,
                    connect=self._datalayer)

            # disable function related heartbeat
            if self._local_queue_client_heartbeat is not None:
                self._local_queue_client_heartbeat.shutdown()
                self._local_queue_client_heartbeat = None
                self._heartbeat_function = None

        else:
            raise MicroFunctionsSessionAPIException(
                "Unsupported heartbeat method for session function.")

        # must be in milliseconds
        if "heartbeat_interval_ms" in heartbeat_params:
            self._heartbeat_interval = heartbeat_params[
                "heartbeat_interval_ms"]
            self._local_poll_timeout = self._heartbeat_interval / 2.0
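A minimal sketch of the heartbeat_params dictionary consumed by the method above; the key names come from the checks in the code, while the concrete values (and the commented-out function name) are illustrative assumptions only:

# Hypothetical input for _init_heartbeat_parameters(); values are placeholders.
example_heartbeat_params = {
    "heartbeat_method": "data_layer",               # or "function"
    # "heartbeat_function": "heartbeat_checker",    # only when heartbeat_method == "function"; name is made up
    "heartbeat_interval_ms": 10000,                 # must be in milliseconds; the local poll timeout becomes half of this
}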
Example 2
    def __init__(self, hostname, queue, datalayer, sandboxid, userid,
                 workflowid, elasticsearch, workflowname, endpoint_key):

        self._start = time.time()

        self._python_version = sys.version_info

        self._hostname = hostname
        self._queue = queue
        self._datalayer = datalayer
        self._elasticsearch = elasticsearch
        self._userid = userid
        self._sandboxid = sandboxid
        self._workflowid = workflowid
        self._workflowname = workflowname
        # _XXX_: we'll use the endpoint_key to look up our endpoint
        self._endpoint_key = endpoint_key
        self._deployment_info_key = "deployment_info_workflow_" + self._workflowid

        self._logger = logging_helpers.setup_logger(self._sandboxid,
                                                    LOG_FILENAME)
        self._fluentbit_process, self._command_args_map_fluentbit = logging_helpers.setup_fluentbit_and_elasticsearch_index(
            self._logger, FLUENTBIT_FOLDER, self._elasticsearch,
            ELASTICSEARCH_INDEX_WF, ELASTICSEARCH_INDEX_FE)

        self._logger.info("hostname (and container name): %s", self._hostname)
        self._logger.info("elasticsearch nodes: %s", self._elasticsearch)
        self._logger.info("queueservice: %s", self._queue)
        self._logger.info("datalayer: %s", self._datalayer)
        self._logger.info("user id: %s", self._userid)
        self._logger.info("sandbox id: %s", self._sandboxid)
        self._logger.info("workflow id: %s", self._workflowid)
        self._logger.info("workflow name: %s", self._workflowname)
        self._logger.info("endpoint_key: %s", self._endpoint_key)

        self._instructions_topic = "instructions_" + self._sandboxid

        self._management_data_layer_client = DataLayerClient(
            locality=1,
            sid="Management",
            wid="Management",
            is_wf_private=True,
            connect=self._datalayer)
        self._logger.info("Management data layer client connected after %s s",
                          str(time.time() - self._start))

        # to be declared later
        self._local_queue_client = None
        self._deployment = None
        self._queue_service_process = None
        self._frontend_process = None
        # visible to the outside world: either kubernetes assigned URL or bare-metal host address + exposed port
        self._external_endpoint = None
        # visible internally: kubernetes node address or same as bare-metal external endpoint
        self._internal_endpoint = None

        self._is_running = False
        self._shutting_down = False
Example 3
    def get_backup_data_layer_client(self):
        if self._backup_data_layer_client is None:
            # locality = -1 means that the writes happen to the local data layer first and then asynchronously to the global data layer
            self._backup_data_layer_client = DataLayerClient(
                locality=-1,
                for_mfn=True,
                sid=self._sandboxid,
                connect=self._datalayer)
        return self._backup_data_layer_client
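A minimal usage sketch of the lazily initialized backup client above, assuming the surrounding object (here called pubutils) also exposes the matching put/shutdown calls shown elsewhere in these examples; the key and value are placeholders:

# Hypothetical usage; 'pubutils', the key, and the value are placeholders.
dlc = pubutils.get_backup_data_layer_client()     # created on first call, reused afterwards
dlc.put("result_" + "some_execution_id", "{}")    # locality=-1: local write first, then asynchronously global
pubutils.shutdown_backup_data_layer_client()      # close the connection when the function instance finishes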
Example 4
    def _get_data_layer_client(self, is_private=False):
        '''
        Return the data layer client, so that it can be used to commit to the data layer
        when the function instance finishes.
        If it is not initialized yet, it will be initialized here.
        '''
        # TODO: need also the locality information
        if is_private:
            if self._data_layer_client_private is None:
                self._data_layer_client_private = DataLayerClient(locality=1, sid=self._sandboxid, wid=self._workflowid, is_wf_private=True, connect=self._datalayer)
            return self._data_layer_client_private

        if self._data_layer_client is None:
            self._data_layer_client = DataLayerClient(locality=1, suid=self._storage_userid, is_wf_private=False, connect=self._datalayer)
        return self._data_layer_client
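A brief sketch of how these clients are used when a function finishes (compare _store_output_data in the later PublicationUtils example); 'sapi' and the keys/values are placeholders:

# Hypothetical usage; 'sapi' and the keys/values are placeholders.
dlc = sapi._get_data_layer_client()                          # user-scoped storage (suid-based)
dlc.put("user_key", "user_value")
dlc_private = sapi._get_data_layer_client(is_private=True)   # workflow-private storage (sid/wid-based)
dlc_private.put("wf_private_key", "wf_private_value")
sapi._shutdown_data_layer_client()                           # shut down the client(s) when done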
Example 5
    def __init__(self, deployment_info, hostname, userid, sandboxid,
                 workflowid, workflowname, queue, datalayer, logger,
                 external_endpoint, internal_endpoint):
        self._logger = logger
        self._deployment_info = deployment_info
        self._hostname = hostname
        self._userid = userid
        self._sandboxid = sandboxid
        self._workflowid = workflowid
        self._workflowname = workflowname
        self._queue = queue
        self._datalayer = datalayer
        self._external_endpoint = external_endpoint
        self._internal_endpoint = internal_endpoint

        self._python_version = sys.version_info

        self._storage_userid = self._userid.replace("@", "AT")
        self._storage_userid = self._storage_userid.replace("-", "_").replace(
            ".", "_")

        self._process_id = os.getpid()

        self._functionworker_process_map = {}
        self._javarequesthandler_process_list = []
        self._queue_service_process = None
        self._frontend_process = None
        self._fluentbit_process = None
        # it will probably be updated to be something else
        self._fluentbit_actual_pid = -1

        self._child_process_command_args_map = {}

        # to be declared later when parsing the deployment info
        self._workflow = None

        self._global_data_layer_client = DataLayerClient(
            locality=1, suid=self._storage_userid, connect=self._datalayer)

        self._local_queue_client = None
Example 6
    def __init__(self, hostname, uid, sid, wid, logger, funcstatename,
                 functopic, key, session_id, publication_utils, queue,
                 datalayer, internal_endpoint):

        self._logger = logger

        self._queue = queue
        self._datalayer = datalayer

        self._session_id = session_id
        self._session_function_id = None

        self._hostname = hostname
        self._userid = uid
        self._sandboxid = sid
        self._workflowid = wid
        self._function_state_name = funcstatename
        self._function_topic = functopic
        self._internal_endpoint = internal_endpoint
        self._key = key

        self._publication_utils = publication_utils

        self._is_session_function_running = False

        self._helper_thread = None

        self._global_data_layer_client = DataLayerClient(
            locality=1, sid=sid, for_mfn=True, connect=self._datalayer)

        # only valid if this is a session function (i.e., session_function_id is not None)
        self._local_topic_communication = None

        self._session_function_parameters = None

        if self._session_id is None:
            self._generate_session_id()

        self._setup_metadata_tablenames()
Example 7
class PublicationUtils():
    def __init__(self, sandboxid, workflowid, functopic, funcruntime, wfnext,
                 wfpotnext, wflocal, wflist, wfexit, cpon, stateutils, logger,
                 queue, datalayer):
        self._logger = logger

        self._function_topic = functopic
        self._sandboxid = sandboxid
        self._workflowid = workflowid

        self._function_runtime = funcruntime

        self._prefix = self._sandboxid + "-" + self._workflowid + "-"

        self._wf_next = wfnext
        self._wf_pot_next = wfpotnext
        self._wf_local = wflocal
        self._wf_function_list = wflist
        self._wf_exit = wfexit

        # whether we should store backups of triggers before publishing the output
        self._should_checkpoint = cpon

        # the topic to send out messages to remote functions
        # TODO: pub_topic_global becomes a new request to another sandbox?
        # via header?
        self._pub_topic_global = "pub_global"

        self._recovery_manager_topic = "RecoveryManager"

        self._state_utils = stateutils
        self._metadata = None

        self._queue = queue
        self._local_queue_client = None
        self._datalayer = datalayer

        self._sapi = None

        self._output_counter_map = {}

        self._dynamic_workflow = []

        self._backup_data_layer_client = None
        self._execution_info_map_name = None
        self._next_backup_list = []

        #self._logger.debug("[PublicationUtils] init done.")

    # only to be called from the function worker
    def set_sapi(self, sapi):
        self._sapi = sapi

    def set_metadata(self, metadata):
        self._metadata = metadata
        self._execution_info_map_name = "execution_info_map_" + self._metadata[
            "__execution_id"]

    def update_metadata(self,
                        metadata_name,
                        metadata_value,
                        is_privileged=False):
        if is_privileged:
            self._metadata[metadata_name] = metadata_value
        else:
            if "__mfnusermetadata" not in self._metadata:
                self._metadata["__mfnusermetadata"] = {}
            self._metadata["__mfnusermetadata"][metadata_name] = metadata_value

    def _get_local_queue_client(self):
        if self._local_queue_client is None:
            self._local_queue_client = LocalQueueClient(connect=self._queue)
        return self._local_queue_client

    def _shutdown_local_queue_client(self):
        if self._local_queue_client is not None:
            self._local_queue_client.shutdown()

    def get_backup_data_layer_client(self):
        if self._backup_data_layer_client is None:
            # locality = -1 means that the writes happen to the local data layer first and then asynchronously to the global data layer
            self._backup_data_layer_client = DataLayerClient(
                locality=-1,
                for_mfn=True,
                sid=self._sandboxid,
                connect=self._datalayer)
        return self._backup_data_layer_client

    def shutdown_backup_data_layer_client(self):
        if self._backup_data_layer_client is not None:
            self._backup_data_layer_client.shutdown()

    def convert_api_message_to_python_object(self, message):
        # _XXX_: Java objects need to be serialized and passed to python; however, API functions expect python objects
        # we make the conversion according to the runtime
        val = message
        if self._function_runtime == "java":
            val = json.loads(message)
            val = val["value"]

        return val

    def is_valid_value(self, value):
        if not (py3utils.is_string(value) \
            or isinstance(value, (dict, list, int, float)) \
            or value is None):
            return False

        return True

    def _is_valid_trigger_destination(self, destination):
        if not (py3utils.is_string(destination) and destination != ""):
            return False

        return True

    def _is_allowed_or_privileged(self, destination, send_now):
        # Management service is privileged, so allow
        # 1) asynchronous execution
        # 2) Recovery manager topic
        # @ returns a tuple with (is_allowed, is_privileged)
        if self._sandboxid == "Management" and self._workflowid == "Management":
            if destination[0:6] == "async_" or\
                destination == self._recovery_manager_topic:
                # next[0:6] == "async_"
                # next == self._recovery_manager_topic:
                return (True, True)
            return (True, False)

        if send_now:
            if destination not in self._wf_function_list:
                return (False, False)
        elif destination not in self._wf_pot_next:
            return (False, False)

        return (True, False)

    def is_valid_trigger_message(self, next, value, send_now):
        is_valid = True
        errmsg = ""

        if not self._is_valid_trigger_destination(next):
            is_valid = False
            errmsg = "Malformed dynamic trigger definition; 'next' must be a string."

        is_allowed, is_privileged = self._is_allowed_or_privileged(
            next, send_now)
        if not is_allowed:
            is_valid = False
            if send_now:
                errmsg = errmsg + "\n" + "Destination is not in workflow: " + next
                errmsg = errmsg + "\n" + "Can only send an immediate trigger message to an existing function or the workflow end."
            else:
                errmsg = errmsg + "\n" + "Workflow does not match generated 'next': " + next

        if not self.is_valid_value(value):
            is_valid = False
            errmsg = errmsg + "\n" + "Malformed dynamic trigger definition; 'value' must be a python data type (dict, list, str, int, float, or None)."

        return is_valid, is_privileged, errmsg

    def decode_input(self, encoded_input):
        if encoded_input == '':
            encoded_input = '{}'
        #if isinstance(encoded_input,dict):
        #    encoded_input = json.dumps(encoded_input)
        # if encoded_input already is a dict, convert to JSON Text
        #if encoded_input.startswith("null"):
        #    encoded_input = encoded_input.replace("null","")
        #Decode input. Input (value) must be a valid JSON Text.
        #however, post-commit hook published value has the format key; value
        #print ("Encoded State Input: " + str(encoded_input).replace("null",""))
        #print ("Encoded State Input: " + str(encoded_input) + str(type(encoded_input)))
        #self._logger.debug("received user input in decode_input: " + str(encoded_input))
        try:
            #if isinstance(encoded_input,str):
            raw_state_input = json.loads(encoded_input)
            #if isinstance(encoded_input,dict):
            #    raw_state_input = encoded_input
            return raw_state_input

        except Exception as exc:
            #self._logger.exception("User Input is not a valid JSON Text")
            #self._logger.exception(exc)
            raise Exception("User Input is not a valid JSON Text: " + str(exc))

    def encode_output(self, raw_state_output):
        #Produce output JSON Text from raw_state_output
        try:
            value_output = json.dumps(raw_state_output)
            return value_output
        except Exception as exc:
            #self._logger.exception("Error while encoding state output")
            #self._logger.exception(exc)
            raise Exception("Error while encoding state output: " + str(exc))

    def decapsulate_input(self, encoded_encapsulated_input):
        # The actual user input is encapsulated in a dict of the form:
        # { "__mfnuserdata": actual_user_input,
        # "__mfnmetadata": system_specific_metadata }
        # This encapsulation is invisible to the user and is added,
        # maintained, and removed by the frontend and function worker.

        if encoded_encapsulated_input == '':
            #self._logger.exception("Invalid encapsulation of user input")
            raise MicroFunctionsException(
                "Invalid encapsulation of user input.")
        else:
            try:
                encapsulated_input = json.loads(encoded_encapsulated_input)
                userdata = encapsulated_input['__mfnuserdata']
                metadata = encapsulated_input['__mfnmetadata']
                return userdata, metadata
            except Exception as exc:
                #self._logger.exception("Unable to decode encapsulated user input")
                #self._logger.exception(e)
                raise MicroFunctionsException(
                    "Unable to decode encapsulated user input: " + str(exc))

    def encapsulate_output(self, encoded_state_output, metadata):
        try:
            value = {
                "__mfnuserdata": encoded_state_output,
                "__mfnmetadata": metadata
            }
            value_output = json.dumps(value)
            return value_output
        except Exception as exc:
            #self._logger.exception("Error while encoding state output")
            #self._logger.exception(e)
            raise MicroFunctionsException(
                "Error while encoding state output: " + str(exc))

    def get_dynamic_workflow(self):
        '''
        Return the dynamically generated workflow information,
        so that this function instance can trigger other functions when it finishes.
        '''
        return self._dynamic_workflow

    def send_message_to_running_function(self, trigger):
        self.send_to_function_now("-1l", trigger, lqcpub=None)

    def append_trigger(self, trigger):
        trigger["value"] = self.encode_output(trigger["value"])
        self._dynamic_workflow.append(trigger)

    def _convert_function_output_static_workflow(self, function_output):
        converted_function_output = []
        for wfnext in self._wf_next:
            converted_function_output.append({
                "next": wfnext,
                "value": function_output
            })
        return converted_function_output

    def _store_output_data(self):
        data_out = self._sapi.get_transient_data_output()
        to_be_deleted = self._sapi.get_data_to_be_deleted()

        if data_out or to_be_deleted:
            dlc = self._sapi._get_data_layer_client()

            for k in data_out:
                dlc.put(k, data_out.get(k))

            for k in to_be_deleted:
                dlc.delete(k)

        data_out_private = self._sapi.get_transient_data_output(
            is_private=True)
        to_be_deleted_private = self._sapi.get_data_to_be_deleted(
            is_private=True)

        if data_out_private or to_be_deleted_private:
            dlc_private = self._sapi._get_data_layer_client(is_private=True)

            for k in data_out_private:
                dlc_private.put(k, data_out_private.get(k))

            for k in to_be_deleted_private:
                dlc_private.delete(k)

        self._sapi._shutdown_data_layer_client()

    def _send_local_queue_message(self, lqcpub, lqtopic, key, value):
        # construct a LocalQueueClientMessage(key, value)
        # and send it to the local queue topic via the local queue client
        lqcm = LocalQueueClientMessage(key=key, value=value)

        #lqcpub.addMessage(lqtopic, lqcm, False)
        ack = lqcpub.addMessage(lqtopic, lqcm, True)
        while not ack:
            ack = lqcpub.addMessage(lqtopic, lqcm, True)

    def _send_remote_message(self, remote_address, message_type, lqtopic, key,
                             value):
        # form a http request to send to remote host
        # need to set async=true in request URL, so that the frontend does not have a sync object waiting
        if message_type == "session_update":
            # if a session update message, set headers appropriately
            action_data = {}
            action_data["topic"] = lqtopic
            action_data["key"] = key
            action_data["value"] = value

            resp = requests.post(remote_address,
                                 params={"async": 1},
                                 json={},
                                 headers={
                                     "X-MFN-Action": "Session-Update",
                                     "X-MFN-Action-Data":
                                     json.dumps(action_data)
                                 })

        elif message_type == "global_pub":
            # TODO: if global publishing, set headers appropriately (e.g., for load balancing)
            pass

        return

    def _publish_privileged_output(self, function_output, lqcpub):
        next = function_output["next"]

        output = {}

        # init metadata for the workflow (similar to the frontend)
        metadata = {}
        metadata["__result_topic"] = self._metadata["__result_topic"]
        metadata["__execution_id"] = self._metadata["__execution_id"]
        metadata["__function_execution_id"] = self._metadata["__execution_id"]

        if next[:6] == "async_":
            # backup of the 'input' and 'next' has been done by executeWorkflowAsync in management service
            metadata["__async_execution"] = True
            output["topicNext"] = next[6:]
        elif next == self._recovery_manager_topic:
            metadata["__async_execution"] = self._metadata["__async_execution"]
            output["topicNext"] = next

        output["value"] = self.encapsulate_output(function_output["value"],
                                                  metadata)

        outkey = self._metadata["__execution_id"]
        # publish to pub manager's separate queue for global next
        outputstr = json.dumps(output)
        self._send_local_queue_message(lqcpub, self._pub_topic_global, outkey,
                                       outputstr)

        return (None, None)

    def _generate_trigger_metadata(self, topic_next):
        # keep track of the output instances of the next topic
        # e.g., funcA -> funcB with input1 (instance 0) and funcB with input2 (instance 1)
        if topic_next not in self._output_counter_map:
            self._output_counter_map[topic_next] = 0

        output_instance_id = self._output_counter_map[topic_next]
        next_function_execution_id = self._metadata[
            "__function_execution_id"] + "_" + str(output_instance_id)

        # get current state type. if map state add marker to execution Id
        state_type = self._state_utils.functionstatetype
        self._logger.debug("self._state_utils.functionstatetype: " +
                           str(state_type))

        if state_type == 'Map':
            next_function_execution_id = self._metadata[
                "__function_execution_id"] + "_" + str(
                    output_instance_id) + "-M"
        self._output_counter_map[topic_next] += 1

        trigger_metadata = copy.deepcopy(self._metadata)
        trigger_metadata[
            "__function_execution_id"] = next_function_execution_id

        #self._logger.debug("trigger metadata: " + str(trigger_metadata))

        return (next_function_execution_id, trigger_metadata)
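    # Illustrative example (hypothetical values): with "__function_execution_id" == "exec123_0" and a second
    # trigger to the same topic_next, output_instance_id is 1, so the next execution id becomes "exec123_0_1"
    # (or "exec123_0_1-M" when the current state is a Map state).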

    def _publish_output(self, key, trigger, lqcpub, timestamp_map=None):
        if timestamp_map is not None:
            timestamp_map['t_pub_output'] = time.time() * 1000.0
        next = trigger["next"]

        if "to_running_function" in trigger and trigger["to_running_function"]:
            # SessionUtils API calls have already determined the locality
            # this is for a running function instance on a remote host
            if "is_local" in trigger and trigger["is_local"]:
                trigger["value"] = self.encapsulate_output(
                    trigger["value"], self._metadata)
                # this is for a running function on the local host
                # SessionUtils has already created the appropriate next
                if timestamp_map is not None:
                    timestamp_map['t_pub_localqueue'] = time.time() * 1000.0
                self._send_local_queue_message(lqcpub, next, key,
                                               trigger["value"])
            else:
                # send it to the remote host with a special header
                self._send_remote_message(trigger["remote_address"],
                                          "session_update", next, key,
                                          trigger["value"])
            return (None, None)
        elif "is_privileged" in trigger and trigger["is_privileged"]:
            # next[0:6] == "async_"
            # next == self._recovery_manager_topic:
            return self._publish_privileged_output(trigger, lqcpub)
        else:
            topic_next = self._prefix + next

            output = {}
            output["topicNext"] = topic_next

            next_function_execution_id, trigger_metadata = self._generate_trigger_metadata(
                topic_next)

            output["value"] = self.encapsulate_output(trigger["value"],
                                                      trigger_metadata)

            # check whether next is local or not
            if topic_next in self._wf_local:
                # event message directly to the next function's local queue topic
                if timestamp_map is not None:
                    timestamp_map['t_pub_localqueue'] = time.time() * 1000.0
                self._send_local_queue_message(lqcpub, topic_next, key,
                                               output["value"])
            else:
                # check if 'next' is exit topic and modify output["topicNext"] accordingly
                isExitTopic = False
                if next == self._wf_exit:
                    isExitTopic = True

                    if self._metadata["__execution_id"] != key:
                        key = self._metadata["__execution_id"]

                    dlc = self.get_backup_data_layer_client()

                    # store the workflow's final result
                    dlc.put("result_" + key, output["value"])
                    #self._logger.debug("[__mfn_backup] [exitresult] [%s] %s", "result_" + key, output["value"])

                    # _XXX_: this is not handled properly by the frontend
                    # this was an async execution
                    # just send an empty message to the frontend to signal end of execution
                    #if "__async_execution" in self._metadata and self._metadata["__async_execution"]:
                    #    output["value"] = ""

                if isExitTopic and timestamp_map is not None:
                    timestamp_map['t_pub_exittopic'] = time.time() * 1000.0
                    timestamp_map['exitsize'] = len(output["value"])
                self._send_local_queue_message(lqcpub, topic_next, key,
                                               output["value"])

            return (next_function_execution_id, output)

    def _store_trigger_backups(self,
                               dlc,
                               input_backup_map,
                               current_function_instance_id,
                               store_next_backup_list=False):
        # keep track of the execution instances with their updated keys
        # i.e., keys that contain the output instance ids
        # use this set to describe the execution details

        if self._execution_info_map_name is not None:
            # dump the backups into the data layer
            for input_backup_key in input_backup_map:
                dlc.putMapEntry(self._execution_info_map_name,
                                input_backup_key,
                                input_backup_map[input_backup_key])

            # if there is any new next, store them
            # if a next was generated by sending a message immediately,
            # this next will have been appended to our list in memory
            # and the backup will be overwritten
            # if one or more nexts were generated when publishing
            # at the end of execution, they will have been appended to our list
            # in memory and we will store the backup once for the entire list
            if store_next_backup_list:
                dlc.putMapEntry(self._execution_info_map_name,
                                "next_" + current_function_instance_id,
                                json.dumps(self._next_backup_list))

    def _send_message_to_recovery_manager(self, key, message_type, topic,
                                          func_exec_id, has_error, error_type,
                                          lqcpub):
        return
        message_rec = {}
        message_rec["messageType"] = message_type
        message_rec["currentTopic"] = topic
        message_rec["currentFunctionExecutionId"] = func_exec_id
        message_rec["hasError"] = has_error
        message_rec["errorType"] = error_type

        output = {}
        output["topicNext"] = self._recovery_manager_topic
        output["value"] = json.dumps(message_rec)
        outputstr = json.dumps(output)
        # message via global publisher to pub manager's queue for backups
        self._send_local_queue_message(lqcpub, self._pub_topic_global, key,
                                       outputstr)

    # need to store backups of inputs and send message to recovery manager
    def send_to_function_now(self, key, trigger, lqcpub=None, dlc=None):
        trigger["value"] = self.encode_output(trigger["value"])

        # get a local queue client
        if lqcpub is None:
            lqcpub = self._get_local_queue_client()

        current_function_instance_id = self._metadata[
            "__function_execution_id"] + "_" + self._function_topic

        # next_function_execution_id and output are None only if:
        # 1) message was sent to a running function (i.e., session function update message)
        # 2) message was a privileged message
        any_next = False
        next_function_execution_id, output = self._publish_output(
            key, trigger, lqcpub)
        if self._should_checkpoint:
            input_backup_map = {}
            starting_next = {}

            if dlc is None:
                dlc = self.get_backup_data_layer_client()

            if next_function_execution_id is not None and output is not None:
                # here, output MUST contain "topicNext" and "value"; otherwise,
                # we wouldn't have been able to publish it in publish_output()
                # use the updated topicNext for globally published messages
                starting_next[next_function_execution_id] = output["topicNext"]
                next_function_instance_id = next_function_execution_id + "_" + output[
                    "topicNext"]
                input_backup_map["input_" +
                                 next_function_instance_id] = output["value"]
                self._next_backup_list.append(next_function_instance_id)
                any_next = True

            self._store_trigger_backups(dlc,
                                        input_backup_map,
                                        current_function_instance_id,
                                        store_next_backup_list=any_next)

            for next_func_exec_id in starting_next:
                next_func_topic = starting_next[next_func_exec_id]
                self._send_message_to_recovery_manager(key, "start",
                                                       next_func_topic,
                                                       next_func_exec_id,
                                                       False, "", lqcpub)

            self._send_message_to_recovery_manager(
                key, "running", self._function_topic,
                self._metadata["__function_execution_id"], False, "", lqcpub)

    # utilize the workflow to publish directly to the next function's topic
    # publish directly to the next function's topic, accumulate backups
    # publish backups at the end with a 'fin' flag, which also indicates that all have been published
    # also, handle global queue events
    def publish_output_direct(self, key, value_output, has_error, error_type,
                              timestamp_map):
        timestamp_map["t_pub_start"] = timestamp_map[
            "t_start_pub"] = time.time() * 1000.0

        # if we already have a local queue client (because of immediately sent messages) and backup data layer client,
        # re-use them
        # if not, then the call to get them will initialize them
        lqcpub = self._get_local_queue_client()

        # _XXX_: 'function instance id' is uniquely identified via:
        # 1) (workflow) execution id (i.e., uuid set by frontend)
        # 2) output instance id (depends on the number of 'next' using the same function)
        # 3) function topic
        # 1) and 2) => '__function_execution_id' in metadata;
        # set by the previous function (or frontend if we're the first function) in the metadata
        current_function_instance_id = self._metadata[
            "__function_execution_id"] + "_" + self._function_topic

        if has_error:
            timestamp_map["t_start_dlcbackup"] = time.time() * 1000.0
            dlc = self.get_backup_data_layer_client()

            # set data layer flag to stop further execution of function instances
            # that may have been triggered concurrently via a new message
            dlc.put("workflow_execution_stop_" + key, "1")

            # dump the result into the data layer
            result = {}
            result["has_error"] = has_error
            result["error_type"] = error_type

            encoded_result = self.encode_output(result)

            encapsulated_result = self.encapsulate_output(
                encoded_result, self._metadata)

            #dlc.put("result_" + current_function_instance_id, encapsulated_result)
            dlc.putMapEntry(self._execution_info_map_name,
                            "result_" + current_function_instance_id,
                            encapsulated_result)

            # publish a message to the 'exit' topic
            trigger = {}
            trigger["next"] = self._wf_exit
            trigger["value"] = encoded_result

            # don't need next_function_execution_id, because we'll stop execution anyway
            # similarly, we don't need to do any backups
            next_function_execution_id, output = self._publish_output(
                key, trigger, lqcpub, timestamp_map)

            # store the workflow's final result
            # which has been encapsulated
            dlc.put("result_" + key, output["value"])
            timestamp_map["hasError"] = True

        else:
            # dump the result into the data layer
            timestamp_map["t_start_encapsulate"] = time.time() * 1000.0
            encapsulated_value_output = self.encapsulate_output(
                value_output, self._metadata)

            if self._should_checkpoint:
                timestamp_map["t_start_dlcbackup"] = time.time() * 1000.0
                dlc = self.get_backup_data_layer_client()

                #dlc.put("result_" + current_function_instance_id, encapsulated_value_output)
                timestamp_map["t_start_resultmap"] = time.time() * 1000.0
                dlc.putMapEntry(self._execution_info_map_name,
                                "result_" + current_function_instance_id,
                                encapsulated_value_output)
                #self._logger.debug("[__mfn_backup] [%s] [%s] %s", self._execution_info_map_name, "result_" + current_function_instance_id, encapsulated_value_output)

            timestamp_map["t_start_storeoutput"] = time.time() * 1000.0
            # store self._sapi.transient_output into the data layer
            self._store_output_data()

            # get the combined (next, value) tuple list for the output
            # use here the original output:
            # we'll update the metadata separately for each trigger and encapsulate the output with it
            timestamp_map["t_start_generatenextlist"] = time.time() * 1000.0
            converted_function_output = self._convert_function_output_static_workflow(
                value_output)
            choice_next_list = self._state_utils.getChoiceResults(value_output)
            converted_function_output = converted_function_output + self._dynamic_workflow + choice_next_list

            check_error_flag = True
            continue_publish_flag = True
            # if we are sending the result ONLY to the workflow exit, then there is no point in checking the error flag
            if len(converted_function_output) == 1 and \
                    converted_function_output[0]["next"] == self._wf_exit:
                check_error_flag = False

            if check_error_flag:
                timestamp_map["t_start_dlcbackup_err"] = time.time() * 1000.0
                dlc = self.get_backup_data_layer_client()
                # check the workflow stop flag
                # if some other function execution had an error and we had been
                # simultaneously triggered, we can finish but don't need to publish
                # to the next function in the workflow, so we can stop execution of the workflow
                timestamp_map["t_start_dlcbackup_err_flag"] = time.time(
                ) * 1000.0
                workflow_exec_stop = dlc.get("workflow_execution_stop_" + key,
                                             locality=0)
                if workflow_exec_stop is not None and workflow_exec_stop != "":
                    self._logger.info(
                        "Not continuing because workflow execution has been stopped... %s",
                        key)
                    continue_publish_flag = False

            # if we didn't have to check the error, or we checked it, but there was not one, then continue publishing the output
            # to the next functions
            # if we checked the error and there was one, then don't publish to the next functions
            if continue_publish_flag:
                # converted_function_output can only contain next values from static (_wf_next) and dynamic next (_wf_pot_next)
                # static next values would have been already defined and checked before deploying workflow
                # dynamic next values are checked when creating the trigger in MicroFunctionsAPI.add_workflow_next()
                # so there is no need for another check

                if self._should_checkpoint:
                    # we are going to accumulate any input backups in this map
                    input_backup_map = {}
                    # we are going to accumulate any new starting functions in this map
                    starting_next = {}

                timestamp_map["t_start_pubnextlist"] = time.time() * 1000.0
                any_next = False
                # parse the converted_function_output to determine the next and publish directly
                for function_output in converted_function_output:
                    next_function_execution_id, output = self._publish_output(
                        key, function_output, lqcpub, timestamp_map)
                    if self._should_checkpoint:
                        if next_function_execution_id is not None and output is not None:
                            # here, output MUST contain "topicNext" and "value"; otherwise,
                            # we wouldn't have been able to publish it in publish_output()
                            # use the updated topicNext for globally published messages
                            starting_next[next_function_execution_id] = output[
                                "topicNext"]
                            next_function_instance_id = next_function_execution_id + "_" + output[
                                "topicNext"]
                            input_backup_map[
                                "input_" +
                                next_function_instance_id] = output["value"]
                            self._next_backup_list.append(
                                next_function_instance_id)
                            any_next = True

                if self._should_checkpoint:
                    timestamp_map["t_start_backtrigger"] = time.time() * 1000.0
                    # backups for next of successfully completed function execution instances
                    self._store_trigger_backups(
                        dlc,
                        input_backup_map,
                        current_function_instance_id,
                        store_next_backup_list=any_next)

                    for next_func_exec_id in starting_next:
                        next_func_topic = starting_next[next_func_exec_id]
                        self._send_message_to_recovery_manager(
                            key, "start", next_func_topic, next_func_exec_id,
                            False, "", lqcpub)

        if self._should_checkpoint:
            # regardless of whether this function execution had an error or not, we are finished and need to let the recovery manager know
            self._send_message_to_recovery_manager(
                key, "finish", self._function_topic,
                self._metadata["__function_execution_id"], has_error,
                error_type, lqcpub)

        # log the timestamps
        timestamp_map["t_pub_end"] = timestamp_map[
            "t_end_pub"] = timestamp_map["t_end_fork"] = time.time() * 1000.0
        timestamp_map["function_instance_id"] = current_function_instance_id
        timestamp_map_str = json.dumps(timestamp_map)
        self._logger.info("[__mfn_progress] %s %s",
                          timestamp_map["function_instance_id"],
                          timestamp_map_str)
        size = 0
        if 'exitsize' in timestamp_map and 't_pub_exittopic' in timestamp_map:
            size = timestamp_map['exitsize']
        self._logger.info(
            "[__mfn_tracing] [ExecutionId] [%s] [Size] [%s] [TimestampMap] [%s] [%s]",
            key, str(size), timestamp_map_str,
            timestamp_map["function_instance_id"])
        # also put them to the data layer
        # (can skip, but need to update "getExecutionDescription.py" in ManagementService)
        #dlc.put("timestamps_" + current_function_instance_id, json.dumps(timestamp_map))

        # shut down the local queue client
        self._shutdown_local_queue_client()
        self.shutdown_backup_data_layer_client()
Example 8
class SessionHelperThread(threading.Thread):
    def __init__(self, helper_params, logger, pubutils, sessutils,
                 queueservice, datalayer):

        self._logger = logger

        #self._logger.debug("[SessionHelperThread] " + str(helper_params))

        self._publication_utils = pubutils

        self._session_utils = sessutils

        self._queue_service = queueservice
        self._datalayer = datalayer

        self._sandboxid = helper_params["sandboxid"]
        self._workflowid = helper_params["workflowid"]
        self._session_function_id = helper_params["session_function_id"]
        self._session_id = helper_params["session_id"]

        # initialized only when needed
        # need a separate backup data layer client from the publication utils; otherwise, we run into concurrent modification
        # problems from Thrift
        # locality = -1 means that the writes happen to the local data layer first and then asynchronously to the global data layer
        # will only initialize if heartbeats are enabled
        self._backup_data_layer_client = None

        # set up heartbeat parameters
        self._heartbeat_enabled = False
        self._heartbeat_method = None
        # our own local queue client to be used when sending a heartbeat
        # TODO: double check if we can just reuse the one we're polling
        # probably yes
        self._local_queue_client_heartbeat = None
        self._heartbeat_function = None
        self._heartbeat_data_layer_key = None
        self._data_layer_client_heartbeat = None

        self._init_heartbeat_parameters(helper_params["heartbeat_parameters"])

        # set up communication parameters
        self._communication_params = helper_params["communication_parameters"]
        # similar to the data layer rendezvous point for message delivery, we listen to a local topic
        # allowing us to queue messages and deliver multiple messages to the session function if desired
        self._local_topic_communication = self._communication_params[
            "local_topic_communication"]
        # by default, assign a simple poll timeout
        # if a heartbeat is specified, it will be updated based on the heartbeat interval to ensure
        # we can send regular heartbeats
        self._local_poll_timeout = py3utils.ensure_long(10000)

        # use a queue to keep the incoming update messages for blocking and/or non-blocking get_update_messages() requests
        self._message_queue = queue.Queue()

        self._local_queue_client = LocalQueueClient(
            connect=self._queue_service)

        self._special_messages = {}
        self._special_messages["--stop"] = True
        self._special_messages["--update-heartbeat"] = True

        self._is_running = False

        #self._logger.debug("[SessionHelperThread] init done.")

        threading.Thread.__init__(self)

    def _init_heartbeat_parameters(self, heartbeat_params):
        if "heartbeat_method" not in heartbeat_params:
            self._logger.debug(
                "No heartbeat method is specified; disabling heartbeat.")
            return
        else:
            self._heartbeat_enabled = True
            self._heartbeat_method = heartbeat_params["heartbeat_method"]
            #self._logger.debug("[SessionHelperThread] New heartbeat method: " + str(self._heartbeat_method))

        if self._heartbeat_method == "function":
            if "heartbeat_function" in heartbeat_params:
                # enable function related heartbeat
                self._heartbeat_function = heartbeat_params[
                    "heartbeat_function"]
                #self._logger.debug("[SessionHelperThread] New heartbeat function: " + str(self._heartbeat_function))
                if self._backup_data_layer_client is None:
                    self._backup_data_layer_client = DataLayerClient(
                        locality=-1,
                        for_mfn=True,
                        sid=self._sandboxid,
                        connect=self._datalayer)
                if self._local_queue_client_heartbeat is None:
                    self._local_queue_client_heartbeat = LocalQueueClient(
                        connect=self._queue_service)

                # disable data layer related heartbeat
                if self._data_layer_client_heartbeat is not None:
                    self._data_layer_client_heartbeat.delete(
                        self._heartbeat_data_layer_key)
                    self._heartbeat_data_layer_key = None
                    self._data_layer_client_heartbeat.shutdown()
                    self._data_layer_client_heartbeat = None
        elif self._heartbeat_method == "data_layer":
            # needs to be unique among session functions, so use session id + session function id
            # TODO: how do you check the heartbeat in the data layer?
            # checker service or user function needs to know the key
            # OR keep a new map for heartbeats of the session functions
            # so that the checker can retrieve the keys and their values (e.g., timestamps)
            # if a session function misses a heartbeat, the checker function reports to policy handler

            # enable data layer related heartbeat
            self._heartbeat_data_layer_key = "heartbeat_" + self._session_id + "_" + self._session_function_id
            if self._data_layer_client_heartbeat is None:
                self._data_layer_client_heartbeat = DataLayerClient(
                    locality=1,
                    for_mfn=True,
                    sid=self._sandboxid,
                    connect=self._datalayer)

            # disable function related heartbeat
            if self._local_queue_client_heartbeat is not None:
                self._local_queue_client_heartbeat.shutdown()
                self._local_queue_client_heartbeat = None
                self._heartbeat_function = None
            if self._backup_data_layer_client is not None:
                self._backup_data_layer_client.shutdown()
                self._backup_data_layer_client = None

        else:
            raise MicroFunctionsSessionAPIException(
                "Unsupported heartbeat method for session function.")

        # must be in milliseconds
        if "heartbeat_interval_ms" in heartbeat_params:
            self._heartbeat_interval = heartbeat_params[
                "heartbeat_interval_ms"]
            self._local_poll_timeout = self._heartbeat_interval / 2.0
            #self._logger.debug("[SessionHelperThread] New heartbeat interval: " + str(self._heartbeat_interval))

    def run(self):
        self._is_running = True

        # initially, it is heartbeat_interval / 2 if a heartbeat interval was set; otherwise, the default poll timeout
        poll_timeout = self._local_poll_timeout

        if self._heartbeat_enabled:
            t_cur = time.time() * 1000.0
            self._send_heartbeat()
            last_heartbeat_time = t_cur

        # _XXX_: our location is stored as part of our metadata
        # so that the remote functions can
        # look it up and send their message via that location
        # first, create local topic
        self._local_queue_client.addTopic(self._local_topic_communication)

        while self._is_running:
            #self._logger.debug("[SessionHelperThread] polling new session update messages...")
            # wait until the polling interval finishes
            # the polling interval depends on the heartbeat interval and when we actually receive a message
            # if we get a message before, then update the polling interval as (heartbeat_interval - passed_time)
            lqm = self._local_queue_client.getMessage(
                self._local_topic_communication, poll_timeout)

            # double check we are still running
            # if the long-running function finished while we were polling, no need to send another heartbeat
            if not self._is_running:
                break

            if lqm is not None:
                self._process_message(lqm)

            if self._heartbeat_enabled:
                # send heartbeat
                # this is part of the message loop, such that we can have a more precise heartbeat
                # if it was only after the message loop, then there is a corner case, where the
                # processing of the messages would take more than the heartbeat interval,
                # meaning we would miss our deadline
                t_cur = time.time() * 1000.0
                if (t_cur - last_heartbeat_time) >= self._heartbeat_interval:
                    self._send_heartbeat()
                    last_heartbeat_time = t_cur

            if self._heartbeat_enabled:
                # send heartbeat
                # even if there are no messages, we might need to send a heartbeat
                t_cur = time.time() * 1000.0
                if (t_cur - last_heartbeat_time) >= self._heartbeat_interval:
                    self._send_heartbeat()
                    last_heartbeat_time = t_cur
                # update the poll time
                # if we sent a heartbeat recently, last_heartbeat and t_cur will cancel each other out
                poll_timeout = py3utils.ensure_long(last_heartbeat_time +
                                                    self._local_poll_timeout -
                                                    t_cur)
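                # Illustrative arithmetic (hypothetical numbers): with heartbeat_interval = 10000 ms,
                # _local_poll_timeout is 5000 ms; if a message arrived 2000 ms after the last heartbeat,
                # the next poll waits last_heartbeat_time + 5000 - t_cur = 3000 ms before timing out.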
                #self._logger.debug("updated poll timeout: " + str(poll_timeout))

        self._cleanup()

    def _process_message(self, lqm):
        try:
            lqcm = LocalQueueClientMessage(lqm=lqm)
            value = lqcm.get_value()
            #key = lqcm.get_key()
            #self._logger.debug("[SessionHelperThread] new message: " + key + " " + value)
        except Exception as exc:
            self._logger.exception(
                "Exception in handling message to running function: " +
                str(self._session_function_id) + " " + str(exc))
            # cannot decapsulate a message that could not be read; stop processing it
            return

        # we need to decapsulate and decode this message,
        # because it has been delivered
        # to us without going through the function worker
        value, metadata = self._publication_utils.decapsulate_input(value)
        #self._logger.debug("metadata for session function message: " + str(metadata))

        # need to handle the special messages here
        # check if the message is in json
        is_json = True
        try:
            msg = json.loads(value)
            #self._logger.debug("[SessionHelperThread] JSON value: " + str(msg))
        except Exception as exc:
            is_json = False
            msg = value
            self._logger.debug("[SessionHelperThread] non-JSON value: " +
                               str(msg))

        # cannot be a special message; queue whatever it is
        # _XXX_: we are encoding/decoding the delivered message; should not actually execute this code
        # it is here for corner cases that were not envisioned (i.e., let the user code deal with them)
        if not is_json:
            self._store_message(msg)
            self._publication_utils.set_metadata(metadata)
        else:
            # the message is json encoded, but it doesn't guarantee that it is a special message
            if "action" in msg and msg["action"] in self._special_messages:
                self._handle_special_message(msg)
            else:
                self._store_message(msg)
                self._publication_utils.set_metadata(metadata)

    def _store_message(self, msg):
        self._message_queue.put(msg)

    def _handle_special_message(self, msg):
        action = msg["action"]

        if action == "--stop":
            self._session_utils.set_session_function_running(False)
            self.shutdown()

        elif action == "--update-heartbeat":
            self._init_heartbeat_parameters(msg["heartbeat_parameters"])

    def get_messages(self, count=1, block=False):
        messages = []

        for i in range(count):
            try:
                msg = self._message_queue.get(block=block)
                messages.append(msg)
                self._message_queue.task_done()
            except Exception as exc:
                pass

        #self._logger.debug("returning messages: " + str(messages))
        return messages

    def _send_heartbeat(self):
        # check if heartbeat is enabled. if not, just return
        # if heartbeat is enabled, then double check we are still running
        # if the long-running function finished while we were processing messages, no need to send another heartbeat
        if not self._heartbeat_enabled or not self._is_running:
            return

        #self._logger.debug("[SessionHelperThread] sending heartbeat to function: " + self._heartbeat_function)

        hb_message = self._get_heartbeat_message()

        # either to another function via a local queue client or to data layer or another method
        if self._heartbeat_method == "function":
            self._send_heartbeat_to_function(hb_message)
        elif self._heartbeat_method == "data_layer":
            self._send_heartbeat_to_data_layer(hb_message)

    def _get_heartbeat_message(self):
        hb_message = {}
        hb_message["session_id"] = self._session_id
        hb_message["session_function_id"] = self._session_function_id
        hb_message["timestamp"] = time.time() * 1000.0
        hb_message["action"] = "--heartbeat"

        #self._logger.debug("heartbeat msg: "+ json.dumps(hb_message))

        return hb_message

    def _send_heartbeat_to_function(self, hb_message):
        # TODO: what if the heartbeat function is a session function as well?
        # it could be already running, or not yet started but it will keep running after its first message

        # pass our own local queue client, so that there won't be any concurrent access
        # to publication utils' local queue client
        trigger_hb = {}
        trigger_hb["next"] = self._heartbeat_function
        trigger_hb["value"] = hb_message
        self._publication_utils.send_to_function_now(
            "-1l", trigger_hb, self._local_queue_client_heartbeat,
            self._backup_data_layer_client)

    def _send_heartbeat_to_data_layer(self, hb_message):
        self._data_layer_client_heartbeat.put(self._heartbeat_data_layer_key,
                                              json.dumps(hb_message))

    def _cleanup(self):
        #self._logger.debug("[SessionHelperThread] cleaning up...")
        # clean up connections
        if self._data_layer_client_heartbeat is not None:
            self._data_layer_client_heartbeat.delete(
                self._heartbeat_data_layer_key)
            self._heartbeat_data_layer_key = None
            self._data_layer_client_heartbeat.shutdown()
            self._data_layer_client_heartbeat = None

        if self._local_queue_client_heartbeat is not None:
            self._local_queue_client_heartbeat.shutdown()
            self._local_queue_client_heartbeat = None

        if self._backup_data_layer_client is not None:
            self._backup_data_layer_client.shutdown()
            self._backup_data_layer_client = None

        # remove/unregister the topic
        self._local_queue_client.removeTopic(self._local_topic_communication)

        self._local_queue_client.shutdown()
        self._local_queue_client = None

    def shutdown(self):
        self._is_running = False
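
# A minimal, hypothetical sketch (not part of the original code) of how a checker
# could consume the heartbeat records written by _send_heartbeat_to_data_layer():
# it reads each heartbeat key, parses the JSON produced by _get_heartbeat_message(),
# and reports session functions whose last heartbeat is older than a threshold.
# The data layer client and the list of heartbeat keys are passed in, because how
# a checker discovers the keys is left open above; json and time are assumed to be
# imported at module scope (they are used throughout this file).
def _example_find_stale_session_functions(data_layer_client, heartbeat_keys,
                                          max_age_ms=30000.0):
    stale = []
    now_ms = time.time() * 1000.0
    for key in heartbeat_keys:
        raw = data_layer_client.get(key)
        if raw is None or raw == "":
            # no heartbeat written yet, or already cleaned up
            continue
        try:
            hb = json.loads(raw)
        except ValueError:
            # not a valid heartbeat record; skip it
            continue
        if (now_ms - hb.get("timestamp", 0.0)) > max_age_ms:
            stale.append((hb.get("session_id"), hb.get("session_function_id")))
    return stale
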
class SandboxAgent:
    def __init__(self, hostname, queue, datalayer, sandboxid, userid,
                 workflowid, elasticsearch, workflowname, endpoint_key):

        self._start = time.time()

        self._python_version = sys.version_info

        self._hostname = hostname
        self._queue = queue
        self._datalayer = datalayer
        self._elasticsearch = elasticsearch
        self._userid = userid
        self._sandboxid = sandboxid
        self._workflowid = workflowid
        self._workflowname = workflowname
        # _XXX_: we'll use the endpoint_key to look up our endpoint
        self._endpoint_key = endpoint_key
        self._deployment_info_key = "deployment_info_workflow_" + self._workflowid

        self._logger = logging_helpers.setup_logger(self._sandboxid,
                                                    LOG_FILENAME)
        self._fluentbit_process, self._command_args_map_fluentbit = logging_helpers.setup_fluentbit_and_elasticsearch_index(
            self._logger, FLUENTBIT_FOLDER, self._elasticsearch,
            ELASTICSEARCH_INDEX_WF, ELASTICSEARCH_INDEX_FE)

        self._logger.info("hostname (and container name): %s", self._hostname)
        self._logger.info("elasticsearch nodes: %s", self._elasticsearch)
        self._logger.info("queueservice: %s", self._queue)
        self._logger.info("datalayer: %s", self._datalayer)
        self._logger.info("user id: %s", self._userid)
        self._logger.info("sandbox id: %s", self._sandboxid)
        self._logger.info("workflow id: %s", self._workflowid)
        self._logger.info("workflow name: %s", self._workflowname)
        self._logger.info("endpoint_key: %s", self._endpoint_key)

        self._instructions_topic = "instructions_" + self._sandboxid

        self._management_data_layer_client = DataLayerClient(
            locality=1,
            sid="Management",
            wid="Management",
            is_wf_private=True,
            connect=self._datalayer)
        self._logger.info("Management data layer client connected after %s s",
                          str(time.time() - self._start))

        # to be declared later
        self._local_queue_client = None
        self._deployment = None
        self._queue_service_process = None
        self._frontend_process = None
        # visible to the outside world: either kubernetes assigned URL or bare-metal host address + exposed port
        self._external_endpoint = None
        # visible internally: kubernetes node address or same as bare-metal external endpoint
        self._internal_endpoint = None

        self._is_running = False
        self._shutting_down = False

    def _handle_instruction(self, instruction):
        error = None

        action = instruction["action"]
        parameters = instruction.get("parameters", {})

        if action == "stop-function-worker":
            self._deployment.stop_function_worker(parameters["functionTopic"])
        elif action == "shutdown":
            self.shutdown()
        else:
            error = "Unsupported 'action' in instruction: " + action

        return error
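
    # Hedged examples of instruction payloads understood above (the topic value
    # is illustrative, not taken from a real deployment):
    #
    #   {"action": "stop-function-worker",
    #    "parameters": {"functionTopic": "<a function worker topic>"}}
    #
    #   {"action": "shutdown"}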

    def _get_and_handle_message(self):
        error = None

        lqm = self._local_queue_client.getMessage(self._instructions_topic,
                                                  POLL_TIMEOUT)
        if lqm is not None:
            lqcm = LocalQueueClientMessage(lqm)
            key = lqcm.get_key()
            value = lqcm.get_value()
            self._logger.info(key + " " + value)
            try:
                instruction = json.loads(value)
                error = self._handle_instruction(instruction)
            except Exception as exc:
                error = "Couldn't decode instruction: " + str(exc)
                self._logger.error(error)

            if error is None:
                self._logger.info(
                    "Handled instruction successfully at t+ %s s",
                    str(time.time() - self._start))

    def _process_deployment_info(self):
        has_error = False
        errmsg = ""

        deployment_info = self._management_data_layer_client.get(
            self._deployment_info_key)
        num_trials = 0
        sleep_time = 1.0
        while num_trials < 5 and (deployment_info is None
                                  or deployment_info == ""):
            time.sleep(sleep_time)
            deployment_info = self._management_data_layer_client.get(
                self._deployment_info_key)
            num_trials = num_trials + 1
            sleep_time = sleep_time * 2

        if deployment_info is None or deployment_info == "":
            has_error = True
            errmsg = "Could not retrieve deployment info: " + self._deployment_info_key

        if not has_error:
            # if we're running on kubernetes, the endpoint will correspond to the assigned url
            # if we're running on bare-metal, the endpoint will correspond to the hostip + docker-mapped port
            self._external_endpoint = self._management_data_layer_client.getMapEntry(
                self._workflowid + "_workflow_endpoint_map", self._endpoint_key)
            num_trials = 0
            sleep_time = 1.0
            while num_trials < 5 and (self._external_endpoint is None
                                      or self._external_endpoint == ""):
                time.sleep(sleep_time)
                self._external_endpoint = self._management_data_layer_client.getMapEntry(
                    self._workflowid + "_workflow_endpoint_map", self._endpoint_key)
                num_trials = num_trials + 1
                sleep_time = sleep_time * 2

            if self._external_endpoint is None or self._external_endpoint == "":
                has_error = True
                errmsg = "Could not retrieve endpoint: " + self._endpoint_key

        # in Kubernetes, endpoint is the externally visible URL
        # in bare-metal, endpoint is the current host's address

        # for session support, in FunctionWorker, we need current host address (bare-metal)
        # or current node address (kubernetes)

        # for parallel state support, in FunctionWorker, either would be fine

        # As such, let the FunctionWorker know both and let it decide what to do
        if 'KUBERNETES_SERVICE_HOST' in os.environ:
            # get current node's internal address
            self._internal_endpoint = "http://" + socket.gethostbyname(
                socket.gethostname()) + ":" + str(os.getenv("PORT", "8080"))
        else:
            # bare-metal mode: the current host's address and external address are the same
            self._internal_endpoint = self._external_endpoint

        if not has_error:
            self._logger.info("External endpoint: %s", self._external_endpoint)
            self._logger.info("Internal endpoint: %s", self._internal_endpoint)
            self._deployment = Deployment(deployment_info,\
                self._hostname, self._userid, self._sandboxid, self._workflowid,\
                self._workflowname, self._queue, self._datalayer, \
                self._logger, self._external_endpoint, self._internal_endpoint)
            self._deployment.set_child_process(
                "fb", self._fluentbit_process,
                self._command_args_map_fluentbit)
            has_error, errmsg = self._deployment.process_deployment_info()

        return has_error, errmsg

    # SIGTERM kills Thrift before we can handle stuff
    def sigterm(self, signum, frame):
        self.shutdown()
        # raise interrupt to kill main sequence when shutdown was not received through the queue
        raise InterruptedError

    def sigchld(self, signum, _):
        if not self._shutting_down:
            should_shutdown, pid = self._deployment.check_child_process()

            if should_shutdown:
                self._update_deployment_status(
                    True, "A sandbox process stopped unexpectedly.")
                self.shutdown(reason="Process with pid: " + str(pid) +
                              " stopped unexpectedly.")

    def shutdown(self, reason=None):
        self._shutting_down = True
        if reason is not None:
            self._logger.error("Shutting down sandboxagent due to reason: " +
                               reason)
        else:
            self._logger.info("Gracefully shutting down sandboxagent")

        self._logger.info("Shutting down the frontend...")
        if self._frontend_process is not None:
            self._frontend_process.terminate()

        self._logger.info("Shutting down the function worker(s)...")
        self._deployment.shutdown()

        # shut down the local queue client, so that we can also shut down the queue service
        self._local_queue_client.removeTopic(self._instructions_topic)
        self._local_queue_client.shutdown()

        self._logger.info("Shutting down the queue service...")
        if self._queue_service_process is not None:
            process_utils.terminate_and_wait_child(self._queue_service_process,
                                                   "queue service", 5,
                                                   self._logger)

        # we can't do this here, because there may be other sandboxes running the same workflow
        #self._management_data_layer_client.put("workflow_status_" + self._workflowid, "undeployed")
        self._management_data_layer_client.shutdown()

        self._logger.info("Shutting down fluent-bit...")
        time.sleep(2)  # flush interval of fluent-bit
        process_utils.terminate_and_wait_child(self._fluentbit_process,
                                               "fluent-bit", 5, self._logger)
        self._is_running = False

        if self._frontend_process is not None:
            try:
                self._frontend_process.wait(30)
            except subprocess.TimeoutExpired:
                self._frontend_process.kill()
                _, _ = self._frontend_process.communicate()
        self._logger.info("Shutdown complete")

    def _stop_deployment(self, reason, errmsg):
        self._logger.error(
            "Stopping deployment due to error in launching %s...", reason)
        self._logger.error(errmsg)
        self._update_deployment_status(True, errmsg)
        self._management_data_layer_client.shutdown()
        os._exit(1)

    def _update_deployment_status(self, has_error, errmsg):
        sbstatus = {}
        sbstatus["errmsg"] = errmsg
        if has_error:
            sbstatus["status"] = "failed"
        else:
            sbstatus["status"] = "deployed"
        # set our own status in the map
        self._management_data_layer_client.putMapEntry(
            self._workflowid + "_sandbox_status_map", self._endpoint_key,
            json.dumps(sbstatus))
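
    # The entry written above can be read back with getMapEntry() on the
    # "<workflowid>_sandbox_status_map" map using the same endpoint key;
    # hypothetical example values:
    #   {"status": "deployed", "errmsg": ""}
    #   {"status": "failed", "errmsg": "Could not start the frontend: ..."}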

    def run(self):
        has_error = False
        errmsg = ""

        ts_qs_launch = time.time()
        # 1. launch the QueueService here
        self._logger.info("Launching QueueService...")
        cmdqs = "java -jar /opt/mfn/queueservice.jar"
        command_args_map_qs = {}
        command_args_map_qs["command"] = cmdqs
        command_args_map_qs["wait_until"] = "Starting local queue..."
        error, self._queue_service_process = process_utils.run_command(
            cmdqs, self._logger, wait_until="Starting local queue...")
        if error is not None:
            has_error = True
            errmsg = "Could not start the sandbox queue service: " + str(error)

        if has_error:
            self._stop_deployment("queue service", errmsg)

        ts_fw_launch = time.time()
        # 2. process the deployment info and start function workers
        self._logger.info(
            "Going to parse the deployment info and get the endpoint...")
        has_error, errmsg = self._process_deployment_info()

        if has_error:
            self._stop_deployment("workflow", errmsg)

        ts_fe_launch = time.time()
        # 3. launch the frontend
        self._logger.info("Launching frontend...")

        cmdweb = "/opt/mfn/frontend"
        fenv = dict(os.environ)
        workflow = self._deployment.get_workflow()
        fenv["MFN_ENTRYTOPIC"] = workflow.getWorkflowEntryTopic()
        fenv["MFN_RESULTTOPIC"] = workflow.getWorkflowExitTopic()
        fenv["MFN_QUEUE"] = self._queue
        # MFN_DATALAYER already set

        command_args_map_fe = {}
        command_args_map_fe["command"] = cmdweb
        command_args_map_fe["custom_env"] = fenv
        command_args_map_fe[
            "wait_until"] = "Frontend is ready to handle requests"
        error, self._frontend_process = process_utils.run_command(
            cmdweb,
            self._logger,
            custom_env=fenv,
            wait_until="Frontend is ready to handle requests")
        if error is not None:
            has_error = True
            errmsg = "Could not start the frontend: " + str(error)

        if has_error:
            self._stop_deployment("frontend", errmsg)

        self._logger.info("frontend started")

        t_fe = (time.time() - ts_fe_launch) * 1000.0
        t_fw = (ts_fe_launch - ts_fw_launch) * 1000.0
        t_qs = (ts_fw_launch - ts_qs_launch) * 1000.0

        self._logger.info(
            "QS launch time: %s (ms), FWs download + launch time: %s (ms), FE launch time: %s (ms)",
            str(t_qs), str(t_fw), str(t_fe))

        self._deployment.set_child_process("qs", self._queue_service_process,
                                           command_args_map_qs)
        self._deployment.set_child_process("fe", self._frontend_process,
                                           command_args_map_fe)

        # 4. start listening for additional instructions if any
        self._local_queue_client = LocalQueueClient(connect=self._queue)
        self._local_queue_client.addTopic(self._instructions_topic)

        self._is_running = True

        signal.signal(signal.SIGTERM, self.sigterm)

        children_pids = self._deployment.get_all_children_pids()
        children_pids.sort()
        self._logger.info("All children pids: " + str(children_pids))

        signal.signal(signal.SIGCHLD, self.sigchld)

        # update our own sandbox status
        self._update_deployment_status(False, errmsg)

        #self._management_data_layer_client.put("workflow_status_" + self._workflowid, "deployed")
        #self._management_data_layer_client.delete("workflow_status_error_" + self._workflowid)

        self._logger.info("Successfully deployed.")

        while self._is_running:
            try:
                self._get_and_handle_message()
            except Exception as exc:
                self._logger.error("%s", str(exc))
                time.sleep(2)
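
# A minimal, hypothetical sketch (not part of the original code) of how another
# component could deliver an instruction to a running SandboxAgent: it publishes
# a JSON message to the sandbox's "instructions_<sandboxid>" topic, which
# _get_and_handle_message() polls and hands to _handle_instruction(). The queue
# address and sandbox id are assumptions supplied by the caller; LocalQueueClient,
# LocalQueueClientMessage and json are assumed to be imported at module scope.
def _example_send_instruction(queue_address, sandboxid, action, parameters=None):
    instruction = {"action": action}
    if parameters is not None:
        instruction["parameters"] = parameters

    lqc = LocalQueueClient(connect=queue_address)
    lqcm = LocalQueueClientMessage(key="0l", value=json.dumps(instruction))
    ack = lqc.addMessage("instructions_" + sandboxid, lqcm, True)
    lqc.shutdown()
    return ack

# Example usage (illustrative values):
#   _example_send_instruction("localhost:4999", "sandbox-1", "shutdown")
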
class DataLayerOperator:
    def __init__(self, suid, sid, wid, datalayer):
        self._storage_userid = suid
        self._sandboxid = sid
        self._workflowid = wid
        self._datalayer = datalayer

        # global data layer clients for either workflow-private data or user storage
        self._data_layer_client = None
        self._data_layer_client_private = None

        # TODO (?): use the local data layer for operations regarding KV, maps, sets and counters instead of in-memory data structures (e.g., transient_data_output)
        # and store the operations/data for is_queued = True operations,
        # so that we can synchronize it with the global data layer
        # (key, value) store
        self.transient_data_output = {}
        self.transient_data_output_private = {}

        self.data_to_be_deleted = {}
        self.data_to_be_deleted_private = {}

        self.map_output = {}
        self.set_output = {}
        self.counter_output = {}

        self.map_output_delete = {}
        self.set_output_delete = {}
        self.counter_output_delete = {}

    # TODO: update to use local data layer for (key, value) operations
    def put(self, key, value, is_private=False, is_queued=False, table=None):
        if is_queued:
            if is_private:
                self.transient_data_output_private[key] = value
                if key in self.data_to_be_deleted_private:
                    self.data_to_be_deleted_private.pop(key, None)
            else:
                self.transient_data_output[key] = value
                if key in self.data_to_be_deleted:
                    self.data_to_be_deleted.pop(key, None)
        else:
            data_layer_client = self._get_data_layer_client(is_private)
            data_layer_client.put(key, value, tableName=table)

    def get(self, key, is_private=False, table=None):
        # check first transient_output
        # if not there, return the actual (global) data layer data item
        # if not there either, return empty string (as defined in the DataLayerClient)
        value = None
        # if the put() or delete() were called with is_queued=False (default),
        # then the below checks will still result in 'value is None'
        # if not, then value will be obtained from the transient output
        if is_private:
            if key in self.data_to_be_deleted_private:
                return ""
            value = self.transient_data_output_private.get(key)
        else:
            if key in self.data_to_be_deleted:
                return ""
            value = self.transient_data_output.get(key)

        if value is None:
            data_layer_client = self._get_data_layer_client(is_private)
            value = data_layer_client.get(key, tableName=table)

        return value

    def delete(self, key, is_private=False, is_queued=False, table=None):
        if is_queued:
            if is_private:
                self.transient_data_output_private.pop(key, None)
                self.data_to_be_deleted_private[key] = True
            else:
                self.transient_data_output.pop(key, None)
                self.data_to_be_deleted[key] = True
        else:
            data_layer_client = self._get_data_layer_client(is_private)
            data_layer_client.delete(key, tableName=table)
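
    # Hedged illustration of the queued (transient) semantics implemented by
    # put()/get()/delete() above, assuming 'op' is a DataLayerOperator instance:
    #
    #   op.put("k", "v1", is_queued=True)   # buffered in transient_data_output
    #   op.get("k")                         # -> "v1" (served from the buffer)
    #   op.delete("k", is_queued=True)      # marked in data_to_be_deleted
    #   op.get("k")                         # -> "" (hidden until committed)
    #
    # Non-queued calls (is_queued=False, the default) go straight to the data
    # layer client returned by _get_data_layer_client().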

    # map operations
    def createMap(self, mapname, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.createMap(mapname)

    def putMapEntry(self,
                    mapname,
                    key,
                    value,
                    is_private=False,
                    is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.putMapEntry(mapname, key, value)

    def getMapEntry(self, mapname, key, is_private=False):
        value = None

        # TODO: check transient data structure first

        if value is None:
            dlc = self._get_data_layer_client(is_private)
            value = dlc.getMapEntry(mapname, key)

        return value

    def deleteMapEntry(self, mapname, key, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.deleteMapEntry(mapname, key)

    def containsMapKey(self, mapname, key, is_private=False):
        ret = False

        # TODO: check transient data structure first

        if not ret:
            dlc = self._get_data_layer_client(is_private)
            ret = dlc.containsMapKey(mapname, key)

        return ret

    def retrieveMap(self, mapname, is_private=False):
        retmap = {}

        # XXX: should follow "read your writes"
        # the final result should include:
        # 1. all created locally
        # 2. all existing globally minus the ones deleted locally

        # TODO: 1. check local data layer first: get locally created and deleted

        # 2. retrieve all existing globally
        dlc = self._get_data_layer_client(is_private)
        retmap2 = dlc.retrieveMap(mapname)
        if retmap2 is not None:
            for k in retmap2:
                retmap[k] = retmap2[k]
            # TODO: 3. remove the ones deleted locally

        return retmap

    def getMapKeys(self, mapname, is_private=False):
        keys = set()

        # XXX: should follow "read your writes"
        # the final result should include:
        # 1. all created locally
        # 2. all existing globally minus the ones deleted locally

        # TODO: 1. check local data layer first: get locally created and deleted

        # 2. retrieve all existing globally
        dlc = self._get_data_layer_client(is_private)
        k2 = dlc.getMapKeys(mapname)
        if k2 is not None:
            # TODO: 3. remove the ones deleted locally
            keys = keys.union(k2)

        return keys

    def clearMap(self, mapname, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.clearMap(mapname)

    def deleteMap(self, mapname, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.deleteMap(mapname)

    def getMapNames(self,
                    start_index=0,
                    end_index=2147483647,
                    is_private=False):
        maps = set()

        # XXX: should follow "read your writes"
        # the final result should include:
        # 1. all created locally
        # 2. all existing globally minus the ones deleted locally

        # TODO: 1. check local data layer first: get locally created and deleted

        # 2. retrieve all existing globally
        dlc = self._get_data_layer_client(is_private)
        m2 = dlc.getMapNames(start_index, end_index)
        if m2 is not None:
            # TODO: 3. remove the ones deleted locally
            maps = maps.union(m2)

        return list(maps)

    # set operations
    def createSet(self, setname, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.createSet(setname)

    def addSetEntry(self, setname, item, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.addSetEntry(setname, item)

    def removeSetEntry(self, setname, item, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.removeSetEntry(setname, item)

    def containsSetItem(self, setname, item, is_private=False):
        ret = False

        # TODO: check transient data structure first

        if not ret:
            dlc = self._get_data_layer_client(is_private)
            ret = dlc.containsSetItem(setname, item)

        return ret

    def retrieveSet(self, setname, is_private=False):
        items = set()

        # XXX: should follow "read your writes"
        # the final result should include:
        # 1. all created locally
        # 2. all existing globally minus the ones deleted locally

        # TODO: 1. check local data layer first: get locally created and deleted

        # 2. retrieve all existing globally
        dlc = self._get_data_layer_client(is_private)
        i2 = dlc.retrieveSet(setname)
        if i2 is not None:
            # TODO: 3. remove the ones deleted locally
            items = items.union(i2)

        return items

    def clearSet(self, setname, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.clearSet(setname)

    def deleteSet(self, setname, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.deleteSet(setname)

    def getSetNames(self,
                    start_index=0,
                    end_index=2147483647,
                    is_private=False):
        sets = set()

        # XXX: should follow "read your writes"
        # the final result should include:
        # 1. all created locally
        # 2. all existing globally minus the ones deleted locally

        # TODO: 1. check local data layer first: get locally created and deleted

        # 2. retrieve all existing globally
        dlc = self._get_data_layer_client(is_private)
        s2 = dlc.getSetNames(start_index, end_index)
        if s2 is not None:
            # TODO: 3. remove the ones deleted locally
            sets = sets.union(s2)

        return list(sets)

    # counter operations
    def createCounter(self,
                      countername,
                      count,
                      is_private=False,
                      is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.createCounter(countername, count)

    def getCounterValue(self, countername, is_private=False):
        value = 0

        # TODO: check transient data structure first and apply any changes to the global value

        dlc = self._get_data_layer_client(is_private)
        value = dlc.getCounter(countername)

        return value

    def incrementCounter(self,
                         countername,
                         increment,
                         is_private=False,
                         is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.incrementCounter(countername, increment)

    def decrementCounter(self,
                         countername,
                         decrement,
                         is_private=False,
                         is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.decrementCounter(countername, decrement)

    def deleteCounter(self, countername, is_private=False, is_queued=False):
        if is_queued:
            # TODO: use transient data structure in memory when the operation is queued
            pass
        else:
            dlc = self._get_data_layer_client(is_private)
            dlc.deleteCounter(countername)

    def getCounterNames(self,
                        start_index=0,
                        end_index=2147483647,
                        is_private=False):
        counters = set()

        # XXX: should follow "read your writes"
        # the final result should include:
        # 1. all created locally
        # 2. all existing globally minus the ones deleted locally

        # TODO: 1. check local data layer first: get locally created and deleted

        # 2. retrieve all existing globally
        dlc = self._get_data_layer_client(is_private)
        c2 = dlc.getCounterNames(start_index, end_index)
        if c2 is not None:
            # TODO: 3. remove the ones deleted locally
            counters = counters.union(c2)

        return list(counters)

    def get_transient_data_output(self, is_private=False):
        '''
        Return the transient data, so that it can be committed to the data layer
        when the function instance finishes.
        '''
        if is_private:
            return self.transient_data_output_private

        return self.transient_data_output

    def get_data_to_be_deleted(self, is_private=False):
        '''
        Return the list of deleted data items, so that they can be committed to the data layer
        when the function instance finishes.
        '''
        if is_private:
            return self.data_to_be_deleted_private

        return self.data_to_be_deleted

    def _get_data_layer_client(self, is_private=False):
        '''
        Return the data layer client, so that it can be used to commit to the data layer
        when the function instance finishes.
        If it is not initialized yet, it will be initialized here.
        '''
        # TODO: need also the locality information
        if is_private:
            if self._data_layer_client_private is None:
                self._data_layer_client_private = DataLayerClient(
                    locality=1,
                    sid=self._sandboxid,
                    wid=self._workflowid,
                    is_wf_private=True,
                    connect=self._datalayer)
            return self._data_layer_client_private

        if self._data_layer_client is None:
            self._data_layer_client = DataLayerClient(
                locality=1,
                suid=self._storage_userid,
                is_wf_private=False,
                connect=self._datalayer)
        return self._data_layer_client

    def _shutdown_data_layer_client(self):
        '''
        Shut down the data layer client if it has been initialized
        after the function instance finishes committing changes
        to the data layer.
        '''
        if self._data_layer_client_private is not None:
            self._data_layer_client_private.shutdown()
            self._data_layer_client_private = None

        if self._data_layer_client is not None:
            self._data_layer_client.shutdown()
            self._data_layer_client = None
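
# A minimal, hypothetical sketch (not part of the original code) of how the
# queued (key, value) operations buffered by DataLayerOperator could be committed
# to the data layer once the function instance finishes, using only the accessors
# defined above; a real commit would presumably also cover maps, sets and counters.
def _example_commit_queued_operations(dl_operator, is_private=False):
    dlc = dl_operator._get_data_layer_client(is_private)

    # write back everything that was put() with is_queued=True
    for key, value in dl_operator.get_transient_data_output(is_private).items():
        dlc.put(key, value, tableName=None)

    # apply the deletions that were queued with is_queued=True
    for key in dl_operator.get_data_to_be_deleted(is_private):
        dlc.delete(key, tableName=None)

    # close the client(s) opened for the commit
    dl_operator._shutdown_data_layer_client()
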
class Deployment:

    def __init__(self, deployment_info, hostname, userid, sandboxid, workflowid, workflowname, queue, datalayer, logger, external_endpoint, internal_endpoint, management_endpoints):
        self._logger = logger
        self._deployment_info = deployment_info
        self._hostname = hostname
        self._userid = userid
        self._sandboxid = sandboxid
        self._workflowid = workflowid
        self._workflowname = workflowname
        self._queue = queue
        self._datalayer = datalayer
        self._external_endpoint = external_endpoint
        self._internal_endpoint = internal_endpoint
        self._management_endpoints = management_endpoints

        self._python_version = sys.version_info

        self._storage_userid = self._userid.replace("@", "AT")
        self._storage_userid = self._storage_userid.replace("-", "_").replace(".", "_")

        self._process_id = os.getpid()

        self._functionworker_process_map = {}
        self._javarequesthandler_process_list = []
        self._queue_service_process = None
        self._frontend_process = None
        self._fluentbit_process = None
        # it will probably be updated later to the actual fluent-bit pid
        self._fluentbit_actual_pid = -1

        self._child_process_command_args_map = {}

        # to be declared later when parsing the deployment info
        self._workflow = None

        self._global_data_layer_client = DataLayerClient(locality=1, suid=self._storage_userid, connect=self._datalayer)

        self._local_queue_client = None

    def get_workflow(self):
        return self._workflow

    def set_child_process(self, which, process, command_args_map):
        pid = process.pid
        if which == "qs":
            self._queue_service_process = process
        elif which == "fe":
            self._frontend_process = process
        elif which == "fb":
            self._fluentbit_process = process
            output, error = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger)
            fbpid = int(output.strip())
            self._fluentbit_actual_pid = fbpid
            pid = fbpid

        # store command and args
        self._child_process_command_args_map[pid] = command_args_map

    def get_all_children_pids(self):
        children_pids = []
        for state in self._functionworker_process_map:
            p = self._functionworker_process_map[state]
            children_pids.append(p.pid)

        for jrhp in self._javarequesthandler_process_list:
            children_pids.append(jrhp.pid)

        children_pids.append(self._queue_service_process.pid)
        children_pids.append(self._frontend_process.pid)

        # looks like this pid does not match the actual process; perhaps because it also spawns another process?
        #children_pids.append(self._fluentbit_process.pid)
        ## find actual fluentbit pid
        output, error = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger)
        fbpid = int(output.strip())
        self._fluentbit_actual_pid = fbpid
        children_pids.append(fbpid)

        return children_pids

    def check_child_process(self):
        pid, status = os.waitpid(-1, os.WNOHANG|os.WUNTRACED|os.WCONTINUED)
        failed_process_name = ""
        if os.WIFCONTINUED(status) or os.WIFSTOPPED(status):
            return False, pid, failed_process_name
        if os.WIFSIGNALED(status) or os.WIFEXITED(status):
            self._logger.error("Process with pid: " + str(pid) + " stopped.")
            if pid == self._fluentbit_actual_pid:
                failed_process_name = "Fluent-bit"
            elif pid == self._queue_service_process.pid:
                failed_process_name = "Queue service"
            elif pid == self._frontend_process.pid:
                failed_process_name = "Frontend"
            else:
                for jrhp in self._javarequesthandler_process_list:
                    if pid == jrhp.pid:
                        failed_process_name = "Java request handler"
                        break
                for state_name in self._functionworker_process_map:
                    process = self._functionworker_process_map[state_name]
                    if pid == process.pid:
                        failed_process_name = "Function worker (" + state_name + ")"
                        del self._functionworker_process_map[state_name]
                        break

            self._logger.error("Failed process name: " + failed_process_name)

        if os.path.exists('/var/run/secrets/kubernetes.io'):
            return True, pid, failed_process_name
        else:
            # TODO: try to relaunch some of the processes (FWs, fluentbit, frontend)
            self._logger.info(self._child_process_command_args_map[pid])
            return True, pid, failed_process_name
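
    # Hedged sketch of how the stored command/args could be used to relaunch a
    # failed child process (the TODO above; not implemented here), where 'args'
    # would be self._child_process_command_args_map[pid]:
    #
    #   error, process = process_utils.run_command(
    #       args["command"], self._logger,
    #       custom_env=args.get("custom_env"),
    #       wait_until=args.get("wait_until"))
    #
    # Function workers additionally store "log_filename", which would be reopened
    # and passed as process_log_handle.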

    def shutdown(self):
        shutdown_message = {}
        shutdown_message["action"] = "stop"

        lqcm_shutdown = LocalQueueClientMessage(key="0l", value=json.dumps(shutdown_message))

        workflow_nodes = self._workflow.getWorkflowNodeMap()
        for function_topic in workflow_nodes:
            ack = self._local_queue_client.addMessage(function_topic, lqcm_shutdown, True)
            while not ack:
                ack = self._local_queue_client.addMessage(function_topic, lqcm_shutdown, True)

        self._logger.info("Waiting for function workers to shutdown")
        self._wait_for_child_processes()

        for jrh_process in self._javarequesthandler_process_list:
            process_utils.terminate_and_wait_child(jrh_process, "JavaRequestHandler", 5, self._logger)

        self._local_queue_client.shutdown()

    def force_shutdown(self):
        # called when the queue service has crashed and we need to shut down the function workers
        for state in self._functionworker_process_map:
            p = self._functionworker_process_map[state]
            process_utils.terminate_and_wait_child(p, "FunctionWorker", 5, self._logger)

        for jrh_process in self._javarequesthandler_process_list:
            process_utils.terminate_and_wait_child(jrh_process, "JavaRequestHandler", 5, self._logger)

        self._local_queue_client.shutdown()

    def _wait_for_child_processes(self):
        output, error = process_utils.run_command_return_output('pgrep -P ' + str(self._process_id), self._logger)
        if error is not None:
            self._logger.error("[SandboxAgent] wait_for_child_processes: Failed to get children process ids: %s", str(error))
            return

        children_pids = set(output.split())
        self._logger.info("[SandboxAgent] wait_for_child_processes: Parent pid: %s, Children pids: %s", str(self._process_id), str(children_pids))

        for jrh_process in self._javarequesthandler_process_list:
            if str(jrh_process.pid) in children_pids:
                children_pids.remove(str(jrh_process.pid))
                self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on JavaRequestHandler pid: %s", str(jrh_process.pid))

        ## find fluentbit PID
        output, error = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger)
        fbpid = output.strip()
        if fbpid in children_pids:
            children_pids.remove(fbpid)
            self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on fluent-bit pid: %s", fbpid)

        if self._queue_service_process is not None:
            if str(self._queue_service_process.pid) in children_pids:
                children_pids.remove(str(self._queue_service_process.pid))
                self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on queue service pid: %s", str(self._queue_service_process.pid))

        if self._frontend_process is not None:
            if str(self._frontend_process.pid) in children_pids:
                children_pids.remove(str(self._frontend_process.pid))
                self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on frontend pid: %s", str(self._frontend_process.pid))

        if not children_pids:
            self._logger.info("[SandboxAgent] wait_for_child_processes: No remaining pids to wait for")
            return

        while True:
            try:
                cpid, status = os.waitpid(-1, 0)
                self._logger.info("[SandboxAgent] wait_for_child_processes: Status changed for pid: %s, Status: %s", str(cpid), str(status))
                if str(cpid) not in children_pids:
                    #print('wait_for_child_processes: ' + str(cpid) + "Not found in children_pids")
                    continue
                children_pids.remove(str(cpid))
                if not children_pids:
                    self._logger.info("[SandboxAgent] wait_for_child_processes: No remaining pids to wait for")
                    break
            except Exception as exc:
                self._logger.error('[SandboxAgent] wait_for_child_processes: %s', str(exc))

    def _start_python_function_worker(self, worker_params, env_var_list):
        error = None
        function_name = worker_params["fname"]
        state_name = worker_params["functionstatename"]
        custom_env = os.environ.copy()
        old_ld_library_path = ""
        if "LD_LIBRARY_PATH" in custom_env:
            old_ld_library_path = custom_env["LD_LIBRARY_PATH"]
        custom_env["LD_LIBRARY_PATH"] = "/opt/mfn/workflow/states/" + state_name + "/" + function_name + ":/opt/mfn/workflow/states/" + state_name + "/" + function_name + "/lib"

        if old_ld_library_path != "":
            custom_env["LD_LIBRARY_PATH"] = custom_env["LD_LIBRARY_PATH"] + ":" + old_ld_library_path

        #custom_env["PYTHONPATH"] = "/opt/mfn/workflow/states/" + state_name + "/" + function_name

        for env_var in env_var_list:
            idx = env_var.find("=")
            if idx == -1:
                continue
            env_var_key = env_var[0:idx]
            env_var_value = env_var[idx+1:]
            custom_env[env_var_key] = env_var_value

        #self._logger.info("environment variables (after user env vars): %s", str(custom_env))

        if self._python_version >= (3, ):
            cmd = "python3 "
        else:
            cmd = "python "
        cmd = cmd + "/opt/mfn/FunctionWorker/python/FunctionWorker.py"
        cmd = cmd + " " + '\"/opt/mfn/workflow/states/%s/worker_params.json\"' % state_name # state_name can contain whitespace

        filename = '/opt/mfn/logs/function_' + state_name + '.log'
        log_handle = open(filename, 'a')

        # store command arguments for when/if we need to restart the process if it fails
        command_args_map = {}
        command_args_map["command"] = cmd
        command_args_map["custom_env"] = custom_env
        command_args_map["log_filename"] = filename

        #self._logger.info("Starting function worker: " + state_name + "  with stdout/stderr redirected to: " + filename)
        error, process = process_utils.run_command(cmd, self._logger, custom_env=custom_env, process_log_handle=log_handle)
        if error is None:
            self._functionworker_process_map[state_name] = process
            self._child_process_command_args_map[process.pid] = command_args_map
            self._logger.info("Started function worker: %s, pid: %s, with stdout/stderr redirected to: %s", state_name, str(process.pid), filename)
        return error

    def _start_function_worker(self, worker_params, runtime, env_var_list):
        error = None

        if runtime.find("python") != -1:
            error = self._start_python_function_worker(worker_params, env_var_list)
        elif runtime.find("java") != -1:
            # TODO: environment/JVM variables need to be utilized by the java request handler, not by the function worker

            if SINGLE_JVM_FOR_FUNCTIONS:
                # _XXX_: we'll launch the single JVM handling all java functions later
                error = self._start_python_function_worker(worker_params, env_var_list)
            else:
                # if jar, the contents have already been extracted as if it was a zip archive
                # start the java request handler if self._function_runtime == "java"
                # we wrote the parameters to json file at the state directory
                self._logger.info("Launching JavaRequestHandler for state: %s", worker_params["functionstatename"])
                cmdjavahandler = "java -jar /opt/mfn/JavaRequestHandler/target/javaworker.jar "
                cmdjavahandler += "/opt/mfn/workflow/states/" + worker_params["functionstatename"] + "/java_worker_params.json"

                error, process = process_utils.run_command(cmdjavahandler, self._logger, wait_until="Waiting for requests on:")
                if error is not None:
                    error = "Could not launch JavaRequestHandler: " + worker_params["fname"] + " " + error
                    self._logger.error(error)
                else:
                    self._javarequesthandler_process_list.append(process)
                    error = self._start_python_function_worker(worker_params, env_var_list)
        else:
            error = "Unsupported function runtime: " + runtime

        return error

    def _prepare_update_for_locally_running(self, local_functions):
        update = {}
        update["action"] = "update-local-functions"
        update["localFunctions"] = local_functions
        update = json.dumps(update)

        lqcm_update = LocalQueueClientMessage(key="0l", value=update)

        return lqcm_update

    def _update_function_worker(self, topic, lqcm_update):
        ack = self._local_queue_client.addMessage(topic, lqcm_update, True)
        while not ack:
            ack = self._local_queue_client.addMessage(topic, lqcm_update, True)

    def _update_remaining_function_workers(self, excluded_function_topic, lqcm_update=None):
        local_functions = self._workflow.getWorkflowLocalFunctions()
        if lqcm_update is None:
            lqcm_update = self._prepare_update_for_locally_running(local_functions)

        for locally_running_ft in local_functions:
            if locally_running_ft == excluded_function_topic:
                continue
            self._update_function_worker(locally_running_ft, lqcm_update)

    def stop_function_worker(self, function_topic):
        # remove from locally running functions
        self._workflow.removeLocalFunction(function_topic)

        # first, update locally running functions with remaining functions
        self._update_remaining_function_workers(function_topic)

        # send stop message to function worker's queue
        stop = {}
        stop["action"] = "stop"
        stop = json.dumps(stop)
        lqcm_update = LocalQueueClientMessage(key="0l", value=stop)
        self._update_function_worker(function_topic, lqcm_update)

    def _install_sandbox_requirements(self, parameters):
        error = None
        installer = parameters["installer"]
        requirements = parameters["requirements"]
        additional_installer_options = {}
        if "additional_installer_options" in parameters:
            additional_installer_options = parameters["additional_installer_options"]

        if requirements:
            # TODO: other installers (e.g., apt-get)?
            if installer == "pip":
                # launch 'pip install' with any parameters related to proxy etc.
                # store requirements into /opt/mfn/requirements.txt
                reqfname = "/opt/mfn/requirements.txt"
                with open(reqfname, "w+") as reqf:
                    for req in requirements:
                        reqf.write(req + "\n")

                # modify command to add additional installer options
                if self._python_version >= (3, ):
                    cmd = "python3 "
                else:
                    cmd = "python "
                cmd = cmd + "-m pip install --user"
                cmd += " --no-compile --no-clean"
                for opt in additional_installer_options:
                    cmd = cmd + " " + opt + " " + additional_installer_options[opt]

                cmd = cmd + " -r " + reqfname

                # launch 'pip install [additional_options] -r /opt/mfn/requirements.txt
                error, _ = process_utils.run_command(cmd, self._logger, wait_output=True)

            else:
                error = "Unsupported installer: " + installer

        return error
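
    # Hedged example of the 'parameters' argument expected above; all values are
    # illustrative assumptions:
    #
    #   {"installer": "pip",
    #    "requirements": ["requests==2.31.0", "simplejson"],
    #    "additional_installer_options": {"--proxy": "http://proxy.example.com:8080"}}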

    def _retrieve_and_store_function_code(self, resource_name, resource_info):
        error = None

        rpath = "/opt/mfn/code/resources/" + resource_name + "/"
        fpath = rpath + resource_name

        if resource_info["runtime"].find("python") != -1:
            fpath = fpath + ".py"
        elif resource_info["runtime"].find("java") != -1:
            fpath = fpath + ".java"
        else:
            error = "Unsupported runtime: " + resource_info["runtime"]
            return (error, None)

        if not os.path.exists(os.path.dirname(fpath)):
            try:
                os.makedirs(os.path.dirname(fpath))
            except OSError as err:
                if err.errno != os.errno.EEXIST:
                    error = err
                    return (error, None)

        resource_code = self._global_data_layer_client.get(resource_info["ref"])

        if resource_code is None:
            error = "Empty function code."
            return (error, None)

        try:
            resource_code = base64.b64decode(resource_code).decode()
        except Exception as exc:
            error = "Invalid value for code: " + str(exc)
            self._logger.error(error)
            return (error, None)

        with open(fpath, "w") as funcf:
            funcf.write(resource_code)

        return (error, rpath)

    def _retrieve_and_store_function_zip(self, resource_name, resource_info):
        error = None

        zipref = resource_info["ref"]
        num_chunks_str = self._global_data_layer_client.get(zipref)

        try:
            num_chunks = int(num_chunks_str)
        except Exception as exc:
            error = "Invalid value for key " + zipref + "; expected number of chunks: " + str(exc)
            self._logger.error(error)
            return (error, None)

        zip_content = ""
        ind = zipref.find("num_chunks_")
        gid = zipref[ind+11:]
        pref = zipref[0:ind] + gid + "_chunk_"
        for i in range(num_chunks):
            chunkref = pref + str(i)
            chunk = self._global_data_layer_client.get(chunkref)
            if chunk is None:
                error = "Empty zip chunk."
                return (error, None)

            zip_content = zip_content + chunk

        old_len = len(zip_content)
        rem = old_len % 4
        if rem > 0:
            num_pad = 4 - rem
            for i in range(num_pad):
                zip_content = zip_content + "="

        try:
            decodedzip = base64.b64decode(zip_content)
        except Exception as exc:
            error = "Invalid value for assembled chunks; couldn't decode base64: " + str(exc)
            self._logger.error(error)
            return (error, None)

        runtime = resource_info["runtime"]

        # 1. store zip file
        zipfname = "/opt/mfn/code/zips/" + resource_name + ".zip"
        if not os.path.exists(os.path.dirname(zipfname)):
            try:
                os.makedirs(os.path.dirname(zipfname))
            except OSError as err:
                if err.errno != os.errno.EEXIST:
                    error = err
                    return (error, None)

        with open(zipfname, "wb") as zipfile:
            zipfile.write(decodedzip)

        gextractedpath = "/opt/mfn/code/resources/" + resource_name + "/"
        # 2. extract zip file
        if not os.path.exists(os.path.dirname(gextractedpath)):
            try:
                os.makedirs(os.path.dirname(gextractedpath))
            except OSError as err:
                if err.errno != os.errno.EEXIST:
                    error = err
                    return (error, None)

        cmdunzip = "unzip " + zipfname + " -d " + gextractedpath
        error, _ = process_utils.run_command(cmdunzip, self._logger, wait_output=True)

        if error is not None:
            error = "Could not extract zip file: " + resource_name + " " + error
            self._logger.error(error)
            return (error, None)

        # 3. need to set executable permissions for the extracted libs
        cmdperm = "sh -c \"find " + gextractedpath + "| xargs -I {} file {}"
        cmdperm = cmdperm + "| grep ELF" + "| grep -v grep"
        cmdperm = cmdperm + "| awk -F ':' '{print $1}'"
        cmdperm = cmdperm + "| xargs -I {} chmod +x {}\""

        error, _ = process_utils.run_command(cmdperm, self._logger, wait_output=True)

        if error is not None:
            error = "Could not set lib permissions: " + resource_name + " " + error
            self._logger.error(error)
            return (error, None)

        if runtime.find("python") != -1:
            fpath = gextractedpath + resource_name
            fpath = fpath + ".py"

            resource_code = self._global_data_layer_client.get("grain_source_" + resource_info["id"])
            if resource_code is not None and resource_code != "":
                try:
                    resource_code = base64.b64decode(resource_code).decode()
                except Exception as exc:
                    error = "Invalid value for function code: " + str(exc)
                    self._logger.error(error)
                    return (error, None)

                self._logger.info("Overwriting zip resource file with the updated resource code...")
                with open(fpath, "w") as funcf:
                    funcf.write(resource_code)

        elif runtime.find("java") != -1:
            # TODO: try to retrieve the updated resource?
            # To do that, we'd need to know the actual state name (i.e., in the workflow description),
            # which (for now) has to be the same as the Java class name.
            # The class name can differ from the resource name
            # (e.g., one jar may contain multiple classes with handle functions, each used as a separate state).
            # That means the code update and the compilation would have to happen when we create the state,
            # but before copying the resource to each state's separate location.
            # TODO: double check whether this is also the case for python
            pass

        else:
            error = "Unsupported runtime: " + resource_info["runtime"]
            return (error, None)

        return (error, gextractedpath)

    def _initialize_data_layer_storage(self):
        # each data layer client will automatically create the local keyspace and tables
        # upon instantiation

        # mfn internal tables
        local_dlc = DataLayerClient(locality=0, for_mfn=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True)
        local_dlc.shutdown()

        # user storage tables
        local_dlc = DataLayerClient(locality=0, suid=self._storage_userid, connect=self._datalayer, init_tables=True)
        local_dlc.shutdown()

        # workflow private tables
        local_dlc = DataLayerClient(locality=0, is_wf_private=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True)
        local_dlc.shutdown()

        # for global access, (re)create; it's okay because the operations are idempotent
        # user storage is created by management service
        # mfn internal tables
        global_dlc = DataLayerClient(locality=1, for_mfn=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True)
        global_dlc.shutdown()

        # workflow private tables
        global_dlc = DataLayerClient(locality=1, is_wf_private=True, sid=self._sandboxid, wid=self._workflowid, connect=self._datalayer, init_tables=True)
        global_dlc.shutdown()

    def _populate_worker_params(self, function_topic, wf_node, state):
        worker_params = {}
        worker_params["userid"] = self._userid
        worker_params["storageuserid"] = self._storage_userid
        worker_params["sandboxid"] = self._sandboxid
        worker_params["workflowid"] = self._workflowid
        worker_params["workflowname"] = self._workflowname
        worker_params["ffolder"] = state["resource_dirpath"]
        worker_params["fpath"] = state["resource_filepath"]
        worker_params["fname"] = state["resource_filename"]
        worker_params["fruntime"] = state["resource_runtime"]
        worker_params["ftopic"] = function_topic
        worker_params["hostname"] = self._hostname
        worker_params["queue"] = self._queue
        worker_params["datalayer"] = self._datalayer
        worker_params["externalendpoint"] = self._external_endpoint
        worker_params["internalendpoint"] = self._internal_endpoint
        worker_params["managementendpoints"] = self._management_endpoints
        worker_params["fnext"] = wf_node.getNextMap()
        worker_params["fpotnext"] = wf_node.getPotentialNextMap()
        worker_params["functionstatetype"] = wf_node.getGWFType()
        worker_params["functionstatename"] = wf_node.getGWFStateName()
        worker_params["functionstateinfo"] = wf_node.getGWFStateInfo()
        worker_params["workflowfunctionlist"] = self._workflow.getWorkflowFunctionMap()
        worker_params["workflowexit"] = self._workflow.getWorkflowExitPoint()
        worker_params["sessionworkflow"] = self._workflow.is_session_workflow()
        worker_params["sessionfunction"] = wf_node.is_session_function()
        worker_params["sessionfunctionparameters"] = wf_node.get_session_function_parameters()
        worker_params["shouldcheckpoint"] = self._workflow.are_checkpoints_enabled()

        return worker_params
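
    # Note: process_deployment_info() below serializes the dictionary returned by
    # _populate_worker_params() to each state's worker_params.json, and derives the
    # java-specific parameters for Java states from a subset of these fields.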

    def _compile_java_resources_if_necessary(self, resource, mvndeps):
        error = None

        cmdmkdir = "mkdir -p " + resource["dirpath"] + "target/classes"

        self._logger.info("Preparing for compilation of Java function resources: %s", resource["name"])
        error, _ = process_utils.run_command(cmdmkdir, self._logger, wait_output=True)
        if error is not None:
            error = "Could not create target directory for resource: " + resource["name"] + " " + error
            self._logger.error(error)
            return error

        #cmdjavac = "javac -classpath /opt/mfn/JavaRequestHandler/mfnapi.jar -d " + resource["dirpath"] + "target/classes "
        #cmdjavac += resource["dirpath"] + resource["name"] + ".java"

        cmdfind = "find " + resource["dirpath"] + " -name '*.java'"
        output, error = process_utils.run_command_return_output(cmdfind, self._logger)
        if error is not None:
            self._logger.error("[SandboxAgent] could not search for any Java sources: %s", str(error))
            error = "Could not search for any Java sources: " + resource["name"] + " " + str(error)
            return error
        source_files = set(output.split("\n"))
        source_files = ' '.join(source_files).strip()
        should_compile = False
        if source_files != "":
            should_compile = True
            self._logger.info("Found following Java sources: %s", str(source_files))
        else:
            self._logger.info("No java sources to compile.")

        # 2. check the uploaded maven requirements; if the archive did not contain a pom.xml, write them as the pom.xml
        if mvndeps is not None and not os.path.exists(resource["dirpath"] + "pom.xml"):
            # write the content of mvndeps into the pom.xml
            self._logger.info("Writing maven build file: %spom.xml", resource["dirpath"])
            with open(resource["dirpath"] + "pom.xml", "w") as fpom:
                fpom.write(mvndeps)

        # we either had a pom.xml file in the archive or non-empty mvndeps from uploaded requirements, which we wrote as the pom.xml file
        # regardless, if there is a pom file, then resolve and copy maven dependencies
        if os.path.exists(resource["dirpath"] + "pom.xml"):
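            # Maven flag notes: -Duser.home=/tmp keeps the local repository under
            # /tmp/.m2, -gs points to the sandbox's global settings file, and
            # dependency:copy-dependencies drops the resolved jars into target/classes
            # so that the javac classpath below can pick them up.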
            cmdmvn = "mvn -Duser.home=/tmp -DskipTests -gs /opt/mfn/JavaRequestHandler/maven/sandbox-mvn-settings.xml -f " + resource["dirpath"]
            cmdmvn += " dependency:copy-dependencies -DoutputDirectory=" + resource["dirpath"] + "target/classes"

            self._logger.info("Copying maven dependencies for Java function: %s", resource["name"])
            error, _ = process_utils.run_command(cmdmvn, self._logger, wait_output=True)
            if error is not None:
                error = "Could not copy maven dependencies: " + resource["name"] + " " + error
                self._logger.error(error)
                return error
            self._logger.info("Finished copying dependencies for Java function: %s", resource["name"])

        if should_compile:
            cmdjavac = "javac -classpath /opt/mfn/JavaRequestHandler/mfnapi.jar:"
            cmdjavac += resource["dirpath"] + "target/classes/* "
            cmdjavac += "-d " +  resource["dirpath"] + "target/classes " + source_files

            self._logger.info("Compiling Java function resources: %s", resource["name"])
            self._logger.info(cmdjavac)
            error, _ = process_utils.run_command(cmdjavac, self._logger, wait_output=True)
            if error is not None:
                error = "Could not compile resource: " + resource["name"] + " " + error
                self._logger.error(error)
                return error
            self._logger.info("Finished compiling Java function resources: %s", resource["name"])

        return error

    def process_deployment_info(self):
        has_error = False
        errmsg = ""

        if self._deployment_info is not None and self._deployment_info != "":
            try:
                self._deployment_info = json.loads(self._deployment_info)
                self._logger.debug("Deployment info: %s", json.dumps(self._deployment_info))
            except Exception as exc:
                errmsg = "Could not parse deployment info: " + str(exc)
                self._logger.error(errmsg)
                has_error = True
                return has_error, errmsg
        else:
            errmsg = "Empty deployment info."
            has_error = True
            return has_error, errmsg

        if "workflow" not in self._deployment_info or "resources" not in self._deployment_info:
            errmsg = "Incomplete deployment info: " + json.dumps(self._deployment_info)
            self._logger.error(errmsg)
            has_error = True
            return has_error, errmsg

        # get workflow info
        workflow_info = self._deployment_info["workflow"]
        sid = workflow_info["sandboxId"]
        if sid != self._sandboxid:
            warnmsg = "WARN: workflow info sandboxid doesn't match provided sandboxid ("+sid+" <-> "+workflow_info["sandboxId"]+")"
            self._logger.info(warnmsg)
        wid = workflow_info["workflowId"]
        if wid != self._workflowid:
            warnmsg = "WARN: workflow info workflowid doesn't match provided workflowid ("+wid+" <-> "+workflow_info["workflowId"]+")"
            print(warnmsg)
        wf_type = workflow_info["workflowType"]

        usertoken = ''
        if "usertoken" in workflow_info:
            usertoken = workflow_info["usertoken"]
        os.environ["USERTOKEN"] = usertoken

        # get workflow json, parse workflow json and init params
        workflow_json = self._global_data_layer_client.get(workflow_info["json_ref"])
        if workflow_json is None or workflow_json == "":
            has_error = True
            errmsg = "Empty workflow description."
            return has_error, errmsg

        try:
            workflow_json = base64.b64decode(workflow_json).decode()
        except Exception as exc:
            has_error = True
            errmsg = "Invalid value for workflow json: " + str(exc)
            return has_error, errmsg

        self._workflow = Workflow(self._userid, sid, wid, wf_type, workflow_json, self._logger)

        has_error = self._workflow.has_error()
        if has_error:
            errmsg = "Problem in workflow description: " + str(workflow_json)
            self._logger.error(errmsg)
            return has_error, errmsg

        # get workflow nodes
        workflow_nodes = self._workflow.getWorkflowNodeMap()

        # get resources info and find functions
        resource_map = {}
        resource_info_map = self._deployment_info["resources"]

        if any(resource_info_map[res_name]["runtime"].lower().startswith("java") for res_name in resource_info_map):
            # run setup_maven.sh to update the proxy settings at runtime
            # (i.e., the sandbox image may have been built on a machine with a proxy, or vice versa)
            cmd_maven_proxy_initer = "/opt/mfn/JavaRequestHandler/./setup_maven.sh"
            self._logger.info("Updating maven proxy settings...")
            error, _ = process_utils.run_command(cmd_maven_proxy_initer, self._logger, wait_output=True)
            if error is not None:
                has_error = True
                errmsg = "Could not reinitialize maven proxy settings: " + error
                return has_error, errmsg
            self._logger.info("Finished updating maven proxy settings.")

        # for pip installable dependencies for python functions
        req_map = {}
        t_start_download = time.time()
        # store functions in local filesystem
        for resource_name in resource_info_map:
            resource_info = resource_info_map[resource_name]
            resource_info["runtime"] = resource_info["runtime"].lower()

            if resource_info["type"] == "code":
                error, resource_dirpath = self._retrieve_and_store_function_code(resource_name, resource_info)
            else:
                error, resource_dirpath = self._retrieve_and_store_function_zip(resource_name, resource_info)

            if error is not None:
                errmsg = "Could not retrieve and store function: " + resource_name + " " + error
                self._logger.error(errmsg)
                has_error = True
                return has_error, errmsg

            # these requirements can now be also for java maven dependencies
            resource_id = resource_info["id"]
            greq = self._global_data_layer_client.get("grain_requirements_" + resource_id)
            mvndeps = None
            if greq is not None and greq != "":
                greq = base64.b64decode(greq).decode()
                if resource_info["runtime"].find("python") == 0:
                    # get function requirements and put it into a map
                    lines = greq.strip().split("\n")
                    for line in lines:
                        req_map[line] = True
                elif resource_info["runtime"].find("java") == 0:
                    mvndeps = greq

            # get function environment variables
            env_var_list = []
            genv = self._global_data_layer_client.get("grain_environment_variables_" + resource_id)
            if genv is not None and genv != "":
                genv = base64.b64decode(genv).decode()
                lines = genv.split("\n")
                env_var_list = lines

            resource = {}
            resource["name"] = resource_name
            resource["dirpath"] = resource_dirpath
            resource["runtime"] = resource_info["runtime"]
            resource["env_var_list"] = env_var_list
            resource_map[resource_name] = resource

            # compile the java sources
            if resource["runtime"].find("java") == 0:
                # even if it was just a single java file
                # or a jar file uploaded with source files
                # or a jar file with just class files,
                # the following function will
                # 1. download maven dependencies (if there is a pom.xml in the jar or was separately uploaded)
                # 2. compile the source files if any
                error = self._compile_java_resources_if_necessary(resource, mvndeps)

                if error is not None:
                    errmsg = "Could not compile Java function resources: " + resource_name + " " + error
                    self._logger.error(errmsg)
                    has_error = True
                    return has_error, errmsg

        total_time_download = (time.time() - t_start_download) * 1000.0
        self._logger.info("Download time for all function code: %s (ms)", str(total_time_download))

        t_start_requirements = time.time()
        # this list will only contain pip installable dependencies
        # java maven dependencies will be handled while compiling the java resources
        sbox_req_list = []
        for req_line in req_map:
            sbox_req_list.append(req_line)

        # install sandbox requirements
        req = workflow_info["sandbox_requirements"]
        req["requirements"] = sbox_req_list
        error = self._install_sandbox_requirements(req)
        if error is not None:
            errmsg = "Could not install sandbox requirements. " + str(error)
            self._logger.error(errmsg)
            has_error = True
            return has_error, errmsg

        total_time_requirements = (time.time() - t_start_requirements) * 1000.0
        self._logger.info("Requirements install time: %s (ms)", str(total_time_requirements))

        t_start_storage = time.time()
        # initialize local data layer space for user and workflow
        self._initialize_data_layer_storage()
        total_time_storage = (time.time() - t_start_storage) * 1000.0
        self._logger.info("Storage initialization time: %s (ms)", str(total_time_storage))

        self._local_queue_client = LocalQueueClient(connect=self._queue)

        self._local_queue_client.addTopic(self._workflow.getWorkflowExitTopic())

        t_start_launch = time.time()
        # accumulate all java worker params into one;
        # later, we'll launch a single JVM to handle all java functions
        if SINGLE_JVM_FOR_FUNCTIONS:
            single_jvm_worker_params = {}
            any_java_function = False

        total_time_state = 0.0
        for function_topic in workflow_nodes:
            wf_node = workflow_nodes[function_topic]
            resource_name = wf_node.get_resource_name()

            t_start_state = time.time()
            if resource_name == "":
                # this is an ASL state without a resource (i.e., function) attached to it
                error, resource = state_utils.create_dummy_resource_for_asl_state(wf_node)
                if error is not None:
                    errmsg = "Could not create non-resource state. " + str(error)
                    self._logger.error(errmsg)
                    has_error = True
                    return has_error, errmsg
            else:
                resource = resource_map[resource_name]

            error, state = state_utils.create_state(wf_node, resource, self._logger)
            if error is not None:
                errmsg = "Could not create state: " + str(error)
                self._logger.error(errmsg)
                has_error = True
                return has_error, errmsg

            total_time_state += (time.time() - t_start_state) * 1000.0

            self._local_queue_client.addTopic(function_topic)

            # compile worker parameters
            worker_params = self._populate_worker_params(function_topic, wf_node, state)
            # store worker parameters as a local file
            params_filename = state["dirpath"] + "worker_params.json"

            with open(params_filename, "w") as paramsf:
                json.dump(worker_params, paramsf, indent=4)

            if state["resource_runtime"].find("java") != -1:
                java_worker_params = {}
                java_worker_params["functionPath"] = worker_params["ffolder"]
                java_worker_params["functionName"] = worker_params["fname"]
                java_worker_params["serverSocketFilename"] = "/tmp/java_handler_" + worker_params["functionstatename"] + ".uds"

                if SINGLE_JVM_FOR_FUNCTIONS:
                    any_java_function = True
                    single_jvm_worker_params[worker_params["functionstatename"]] = java_worker_params
                else:
                    java_params_filename = state["dirpath"] + "java_worker_params.json"
                    with open(java_params_filename, "w") as javaparamsf:
                        json.dump(java_worker_params, javaparamsf, indent=4)

            # launch function workers with the params parsed from workflow info
            error = self._start_function_worker(worker_params, state["resource_runtime"], state["resource_env_var_list"])

            if error is not None:
                errmsg = "Problem launching function worker for: " + worker_params["fname"]
                self._logger.error(errmsg)
                has_error = True
                return has_error, errmsg

            # add the new function worker to the local list
            self._workflow.addLocalFunction(function_topic)

        # all function workers have been launched; update them with locally running functions
        # prepare update message to be used by all
        local_functions = self._workflow.getWorkflowLocalFunctions()
        lqcm_update = self._prepare_update_for_locally_running(local_functions)
        for function_topic in workflow_nodes:
            self._update_function_worker(function_topic, lqcm_update)

        if SINGLE_JVM_FOR_FUNCTIONS:
            if any_java_function:
                single_jvm_params_filename = "/opt/mfn/workflow/states/single_jvm_worker_params.json"
                with open(single_jvm_params_filename, "w") as jvmparamsf:
                    json.dump(single_jvm_worker_params, jvmparamsf, indent=4)

                self._logger.info("Launching a single JavaRequestHandler for all Java states...")
                cmdjavahandler = "java -jar /opt/mfn/JavaRequestHandler/target/javaworker.jar "
                cmdjavahandler += single_jvm_params_filename

                error, process = process_utils.run_command(cmdjavahandler, self._logger, wait_until="Waiting for requests on:")
                if error is not None:
                    errmsg = "Problem launching JavaRequestHandler for Java states: " + error
                    self._logger.error(errmsg)
                    has_error = True
                    return has_error, errmsg
                else:
                    self._javarequesthandler_process_list.append(process)

        self._logger.info("State creation for all function workers: %s (ms)", str(total_time_state))

        total_time_launch = (time.time() - t_start_launch) * 1000.0
        self._logger.info("Launch time for all function workers: %s (ms)", str(total_time_launch))

        if not has_error:
            # check whether all function workers have launched successfully
            # give some time for function workers to come up
            cmd = "pgrep -P " + str(self._process_id) + " -a"
            output, error = process_utils.run_command_return_output(cmd, self._logger)
            if error is not None:
                self._logger.error("[SandboxAgent] check health of function workers: failed to get FunctionWorker processes: %s", str(error))
                has_error = True
                errmsg = "Could not get FunctionWorker processes."

        if not has_error:
            fwlines = set(output.split("\n"))
            fwpids = []
            for line in fwlines:
                if "FunctionWorker.py" in line:
                    pid = line.split(" ")[0]
                    fwpids.append(pid)

            if str(self._fluentbit_process.pid) in fwpids:
                fwpids.remove(str(self._fluentbit_process.pid))

            self._logger.info("Running FunctionWorker processes: %s, expected: %s", str(len(fwpids)), str(len(self._functionworker_process_map)))
            #self._logger.info(str(fwpids) + " " + str(self._functionworker_process_map))

            if len(fwpids) != len(self._functionworker_process_map):
                has_error = True
                errmsg = "One or more function workers could not be launched:\n"

                for state_name in self._functionworker_process_map:
                    fwp = self._functionworker_process_map[state_name]
                    if str(fwp.pid) not in fwpids:
                        errmsg += state_name + "\n"

        self._global_data_layer_client.shutdown()

        return has_error, errmsg
Example 13
class SessionUtils:
    def __init__(self, hostname, uid, sid, wid, logger, funcstatename,
                 functopic, key, session_id, publication_utils, queue,
                 datalayer, internal_endpoint):

        self._logger = logger

        self._queue = queue
        self._datalayer = datalayer

        self._session_id = session_id
        self._session_function_id = None

        self._hostname = hostname
        self._userid = uid
        self._sandboxid = sid
        self._workflowid = wid
        self._function_state_name = funcstatename
        self._function_topic = functopic
        self._internal_endpoint = internal_endpoint
        self._key = key

        self._publication_utils = publication_utils

        self._is_session_function_running = False

        self._helper_thread = None

        self._global_data_layer_client = DataLayerClient(
            locality=1, sid=sid, for_mfn=True, connect=self._datalayer)

        # only valid if this is a session function (i.e., session_function_id is not None)
        self._local_topic_communication = None

        self._session_function_parameters = None

        if self._session_id is None:
            self._generate_session_id()

        self._setup_metadata_tablenames()

        # _XXX_: the following has no effect and makes unnecessary calls to the data layer.
        # The reason is that the data layer backend does not create sets and maps
        # (i.e., createSet, createMap) until an entry is added, and adding entries
        # succeeds without requiring the corresponding set/map to have been created.
        #self._create_metadata_tables()

        #self._logger.debug("[SessionUtils] init done.")

    ###########################
    #
    # Alias operations with a given session id?
    # probably not needed. the session id that is generated at session start
    # would be returned to the client,
    # which would send it back in the future to set the context correctly
    # (i.e., happens implicitly during function instantiation and/or communication).
    # the application can then also set a session alias and return it to the client,
    # which can use it in the future to set the context.
    # however, the session will be implicitly identified via the client sending back
    # the session id and/or the alias.
    # no need to allow other explicit access to alias operations.
    #
    # How to deal with access control between sessions?
    # (i.e., a function session A should not be able to set an alias for session B).
    # when the context is correctly set via the session id and/or the session alias
    # and with no explicit access to alias operations with a given session id,
    # this cannot happen.
    #
    # Alias operations with a session function id?
    # in a given session, any function may assign an alias to another session function instance.
    # in other words, it doesn't need to be the actual session function instance that sets
    # its own alias; a regular function may assign aliases to session function instances.
    # when that happens, we'd need to update the relevant session function with its new alias.
    # instead, just keep all aliases in the data layer, so that get() operations read them from there
    # and set() operations update them there (i.e., no need to keep localized versions);
    # keeping localized versions up-to-date with the data layer would require
    # synchronization on every update (most probably via an immediate special message)
    ###########################

    def set_session_alias(self, alias):
        # update metadata (session alias -> session id) mapping
        # check whether it is already in use
        old_session_id = self._global_data_layer_client.getMapEntry(
            self._map_name_session_alias_id, alias)
        if old_session_id is not None and old_session_id != "" and old_session_id != self._session_id:
            self._logger.warning(
                "Cannot overwrite alias (" + alias +
                ") that is in use by another session (existing session id: " +
                old_session_id + ").")
            return

        self._global_data_layer_client.putMapEntry(
            self._map_name_session_alias_id, alias, self._session_id)
        self._global_data_layer_client.putMapEntry(
            self._map_name_session_id_alias, self._session_id, alias)

    def get_session_alias(self):
        session_alias = self._global_data_layer_client.getMapEntry(
            self._map_name_session_id_alias, self._session_id)
        if session_alias == "":
            session_alias = None
        return session_alias

    def unset_session_alias(self):
        # update metadata
        session_alias = self.get_session_alias()
        if session_alias is not None:
            self._global_data_layer_client.deleteMapEntry(
                self._map_name_session_alias_id, session_alias)
            self._global_data_layer_client.deleteMapEntry(
                self._map_name_session_id_alias, self._session_id)

    def set_session_function_alias(self, alias, session_function_id=None):
        # handle setting an alias for another session function
        if session_function_id is None:
            session_function_id = self._session_function_id
        else:
            # check whether the session function id actually exists in the session functions list
            rgidlist = self.get_all_session_function_ids()
            if session_function_id not in rgidlist:
                self._logger.warning("Cannot find session function with id: " +
                                     str(session_function_id) +
                                     " for setting its alias.")
                return

        # check whether it is already in use; cannot have the same alias for two different instances
        old_session_function_id = self._global_data_layer_client.getMapEntry(
            self._map_name_session_function_alias_id, alias)
        if old_session_function_id is not None and old_session_function_id != "" and old_session_function_id != session_function_id:
            self._logger.warning(
                "Cannot use alias (" + alias +
                ") that is in use by another session function (existing session function id: "
                + old_session_function_id + ").")
            return

        # update metadata (session function alias -> session function id) mapping
        # also (session function id -> session function alias) mapping
        self._global_data_layer_client.putMapEntry(
            self._map_name_session_function_alias_id, alias,
            session_function_id)
        self._global_data_layer_client.putMapEntry(
            self._map_name_session_function_id_alias, session_function_id,
            alias)

    def get_session_function_alias(self, session_function_id=None):
        # handle setting an alias for another session function
        if session_function_id is None:
            session_function_id = self._session_function_id
        else:
            # check whether the session function id actually exists in the session functions list
            rgidlist = self.get_all_session_function_ids()
            if session_function_id not in rgidlist:
                self._logger.warning("Cannot find session function with id: " +
                                     str(session_function_id) +
                                     " for getting its alias.")
                return None

        # handle getting an alias for another session function
        alias = self._global_data_layer_client.getMapEntry(
            self._map_name_session_function_id_alias, session_function_id)
        if alias == "":
            alias = None
        return alias

    def unset_session_function_alias(self, session_function_id=None):
        # handle unsetting the alias for another session function
        if session_function_id is None:
            session_function_id = self._session_function_id
        else:
            # check whether the session function id actually exists in the session functions list
            rgidlist = self.get_all_session_function_ids()
            if session_function_id not in rgidlist:
                self._logger.warning("Cannot find session function with id: " +
                                     str(session_function_id) +
                                     " for unsetting its alias.")
                return

        # update metadata
        session_function_alias = self.get_session_function_alias(
            session_function_id)
        if session_function_alias is not None:
            self._global_data_layer_client.deleteMapEntry(
                self._map_name_session_function_alias_id,
                session_function_alias)
            self._global_data_layer_client.deleteMapEntry(
                self._map_name_session_function_id_alias, session_function_id)

    def get_session_id(self):
        return self._session_id

    def get_session_function_id(self):
        return self._session_function_id

    def get_session_function_id_with_alias(self, alias=None):
        if alias is None:
            return self._session_function_id

        sgid = self._global_data_layer_client.getMapEntry(
            self._map_name_session_function_alias_id, alias)
        return sgid

    def get_all_session_function_ids(self):
        rgidset = self._global_data_layer_client.getMapKeys(
            self._map_name_session_functions)
        rgidlist = list(rgidset)
        return rgidlist

    def get_all_session_function_aliases(self):
        alias_map = {}
        alias_map = self._global_data_layer_client.retrieveMap(
            self._map_name_session_function_alias_id)
        return alias_map

    def get_alias_summary(self):
        alias_summary = {}
        # 1. add current session alias
        alias_summary["session"] = {}
        session_alias = self.get_session_alias()
        if session_alias is None:
            session_alias = ""
        alias_summary["session"][self._session_id] = session_alias

        # 2. add current session function aliases
        alias_summary["session_functions"] = {}

        # 2.1. get all session function ids
        rgidlist = self.get_all_session_function_ids()

        for rgid in rgidlist:
            alias_summary["session_functions"][rgid] = ""

        # 2.2. get assigned aliases to all session functions
        alias_map = self.get_all_session_function_aliases()

        # 2.3. merge 2.1 and 2.2
        # it is possible that some session functions will have no alias
        for alias in alias_map.keys():
            rgid = alias_map[alias]
            alias_summary["session_functions"][rgid] = alias

        return alias_summary

    # every function in a session workflow will call this, setting up the metadata tablenames
    def _generate_session_id(self):
        if self._session_id is None:
            # MUST be unique and deterministic (so that multiple, concurrent instances generate the same id)
            # uid + sid + wid + key
            # emitting messages during execution MUST use existing session id
            # due to key being different for each request to the workflow
            plain_session_id_bytes = (self._userid + "_" + self._sandboxid +
                                      "_" + self._workflowid + "_" +
                                      self._key).encode()
            self._session_id = hashlib.sha256(
                plain_session_id_bytes).hexdigest()
            self._logger.debug("[SessionUtils] Session id: " +
                               self._session_id)
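
        # A minimal standalone sketch of the derivation above, with hypothetical
        # input values (the real ones come from the workflow deployment):
        #
        #   import hashlib
        #   session_id = hashlib.sha256(
        #       "user-1_sandbox-1_workflow-1_request-key-1".encode()).hexdigest()
        #
        # Any concurrent instance that sees the same (userid, sandboxid, workflowid,
        # key) tuple computes the same digest, which is what makes the id deterministic.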

    def _generate_session_function_id(self):
        if self._session_function_id is None:
            # this cannot be just instanceid (i.e., key of the request); multiple functions receive the same instance id
            # should include some randomness, so that the same function can be instantiated more than once
            # need to use (function state name + key + random)
            # we are only interested in keeping the session function ids of the same sandbox/workflow/session
            random.seed()
            plain_session_function_id_bytes = (
                self._function_state_name + "_" + self._key + "_" +
                str(random.uniform(0, 100000))).encode()
            self._session_function_id = hashlib.sha256(
                plain_session_function_id_bytes).hexdigest()
            self._logger.debug("[SessionUtils] Session function id: " +
                               self._session_function_id)

    # these calls don't have an effect until an entry is added,
    # and adding entries succeeds even without calling createSet or createMap,
    # which makes these calls unnecessary
    def _create_metadata_tables(self):
        # create the metadata tables if necessary
        names_sets = self._global_data_layer_client.getSetNames()
        names_maps = self._global_data_layer_client.getMapNames()

        if self._map_name_session_functions not in names_maps:
            self._global_data_layer_client.createMap(
                self._map_name_session_functions)

        if self._map_name_session_function_name_id_sets not in names_maps:
            self._global_data_layer_client.createMap(
                self._map_name_session_function_name_id_sets)

        if self._set_name_session_function_name_ids not in names_sets:
            self._global_data_layer_client.createSet(
                self._set_name_session_function_name_ids)

        if self._map_name_session_alias_id not in names_maps:
            self._global_data_layer_client.createMap(
                self._map_name_session_alias_id)

        if self._map_name_session_id_alias not in names_maps:
            self._global_data_layer_client.createMap(
                self._map_name_session_id_alias)

        if self._map_name_session_function_alias_id not in names_maps:
            self._global_data_layer_client.createMap(
                self._map_name_session_function_alias_id)

        if self._map_name_session_function_id_alias not in names_maps:
            self._global_data_layer_client.createMap(
                self._map_name_session_function_id_alias)

    def _setup_metadata_tablenames(self):
        # set up metadata tables
        # we know the session id, so each metadata table has it in its name
        # 1. session function instance id -> function instance metadata as 'map'
        # 2. session function name -> ref to set name of instance ids as 'map'
        # 3. session function instance ids as 'set' (with session function name as 'set' name)
        # 4. session alias -> session id metadata as 'map'
        # 5. session function alias -> session function id metadata as 'map'

        # 0. set of session function instance ids
        # we just expose the function instance ids to the application via the map keys
        # 1. map of session function instances and metadata (key = session function instance id, value = name, location, ...)
        self._map_name_session_functions = "SessionFunctionInstanceIdMap_" + self._session_id

        # 2. map of function names and ref to set of instance ids
        self._map_name_session_function_name_id_sets = "SessionFunctionNameIdSetsMap_" + self._session_id

        # 3. set of function instance ids of a function; referenced by SessionFunctionNameIdSetsMap
        self._set_name_session_function_name_ids = "SessionFunctionNameIdsSet_" + self._session_id + "_" + self._function_state_name

        # 4. session alias -> session id mapping; needs to be sandbox-level (i.e., without self._session_id)
        self._map_name_session_alias_id = "SessionAliasIdMap_" + self._sandboxid

        # 5. session id -> session alias mapping; needs to be sandbox-level (i.e., without self._session_id)
        self._map_name_session_id_alias = "SessionIdAliasMap_" + self._sandboxid

        # 6. session function alias -> session function id mapping
        self._map_name_session_function_alias_id = "SessionFunctionAliasIdMap_" + self._session_id

        # 7. session function id -> session function alias mapping
        self._map_name_session_function_id_alias = "SessionFunctionIdAliasMap_" + self._session_id
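
        # For illustration only (hypothetical ids): with session id "abc123",
        # sandbox id "sb1", and function state name "Counter", these become:
        #   SessionFunctionInstanceIdMap_abc123
        #   SessionFunctionNameIdSetsMap_abc123
        #   SessionFunctionNameIdsSet_abc123_Counter
        #   SessionAliasIdMap_sb1
        #   SessionIdAliasMap_sb1
        #   SessionFunctionAliasIdMap_abc123
        #   SessionFunctionIdAliasMap_abc123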

    def _store_metadata(self):
        # add yourself to the metadata in the data layer
        # 1. add yourself to the metadata map
        # use this information in host agent to find the correct host and deliver new messages correctly
        # need to include the global queue topic name, so that messages
        # can be also delivered from remote hosts
        function_metadata = {}
        function_metadata["hostname"] = self._hostname
        function_metadata["sandboxId"] = self._sandboxid
        function_metadata["workflowId"] = self._workflowid
        function_metadata["sessionId"] = self._session_id
        function_metadata["functionName"] = self._function_state_name
        function_metadata[
            "communicationTopic"] = self._local_topic_communication
        function_metadata["remote_address"] = self._internal_endpoint
        metadata = json.dumps(function_metadata)
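        # Hypothetical example of the serialized metadata (all values invented):
        # {"hostname": "host-1", "sandboxId": "sb1", "workflowId": "wf1",
        #  "sessionId": "abc123", "functionName": "Counter",
        #  "communicationTopic": "SessionFunctionUpdateTopic_<session function id>",
        #  "remote_address": "10.0.0.5:<port>"}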

        #self._logger.debug("[SessionUtils] Session function metadata: " + metadata)

        self._global_data_layer_client.putMapEntry(
            self._map_name_session_functions, self._session_function_id,
            metadata)

        # 2. put the reference to the set of instance ids with our name
        self._global_data_layer_client.putMapEntry(
            self._map_name_session_function_name_id_sets,
            self._function_state_name,
            self._set_name_session_function_name_ids)

        # 3. update the set of instance ids with our session function id
        self._global_data_layer_client.addSetEntry(
            self._set_name_session_function_name_ids,
            self._session_function_id)

    def _remove_metadata(self):
        # remove any session function alias mappings
        self.unset_session_function_alias()

        #if self._key_update_message is not None:
        #    self._local_data_layer_client.delete(self._key_update_message)

        self._global_data_layer_client.removeSetEntry(
            self._set_name_session_function_name_ids,
            self._session_function_id)
        self._global_data_layer_client.deleteMapEntry(
            self._map_name_session_function_name_id_sets,
            self._function_state_name)
        self._global_data_layer_client.deleteMapEntry(
            self._map_name_session_functions, self._session_function_id)

        # TODO: we also need to remove the metadata tables at session end as well as the session alias mappings
        # i.e., when all functions in the session have been finished.

    def _setup_session_function_helper(self):
        params = {}
        params["sandboxid"] = self._sandboxid
        params["workflowid"] = self._workflowid
        params["session_id"] = self._session_id
        params["session_function_id"] = self._session_function_id

        # obtain parameters from the function worker
        params["heartbeat_parameters"] = self._session_function_parameters

        params["communication_parameters"] = {}
        params["communication_parameters"][
            "local_topic_communication"] = self._local_topic_communication

        self._helper_thread = SessionHelperThread(params, self._logger,
                                                  self._publication_utils,
                                                  self, self._queue,
                                                  self._datalayer)
        self._helper_thread.daemon = False
        self._helper_thread.start()

    def shutdown_helper_thread(self):
        self._helper_thread.shutdown()

    def cleanup(self):
        self._remove_metadata()

        self._global_data_layer_client.shutdown()

    # only to be called from the function worker when it is a session function
    def setup_session_function(self, session_function_parameters):
        self._session_function_parameters = session_function_parameters
        # generate a new session function id
        self._generate_session_function_id()

        # for receiving update messages
        # also set up a global queue topic name, so that this session
        # function can be sent messages from remote hosts
        #self._key_update_message = "UpdateMessage_" + self._session_function_id
        self._local_topic_communication = "SessionFunctionUpdateTopic_" + self._session_function_id

        # set up metadata tables if necessary and register yourself
        # maybe first fork? need to have its own global data layer client
        # no, because this setup is crucial for the operation of the session function
        # if it fails, we'd need to stop everything else.
        self._store_metadata()

        self._is_session_function_running = True

        # set up the helper thread
        self._setup_session_function_helper()

    def set_session_function_running(self, is_running):
        self._is_session_function_running = is_running

    def is_session_function_running(self):
        return self._is_session_function_running

    # API to send a message to another session function
    # if the target function is running locally, deliver the message via the local queue;
    # otherwise, send it to the EventGlobalPublisher's queue
    def send_to_running_function_in_session(self,
                                            session_function_id,
                                            message,
                                            send_now=False):
        #self._logger.debug("[SessionUtils] Sending message to running function: " + str(session_function_id) + " now: " + str(send_now))
        # send the message to the specific running function id
        function_metadatastr = self._global_data_layer_client.getMapEntry(
            self._map_name_session_functions, session_function_id)
        try:
            #self._logger.debug("[SessionUtils] function metadata: " + function_metadatastr)
            function_metadata = json.loads(function_metadatastr)
        except Exception as exc:
            self._logger.warning(
                "[SessionUtils] No such running function instance: " +
                session_function_id + " " + str(exc))
            return

        # we can use the communication topic in the metadata to also deliver
        # the message directly to locally running session function instances.
        # that means we can skip the delivery by the function worker;
        # however, it also means that the decapsulation of the message
        # has to happen in the session function's helper thread
        trigger = {}
        trigger["value"] = message
        trigger["to_running_function"] = True
        trigger["next"] = function_metadata["communicationTopic"]
        if self._hostname == function_metadata["hostname"]:
            # local function instance; send it via local queue
            #self._logger.debug("[SessionUtils] Local session function: " + str(session_function_id))
            trigger["is_local"] = True
        else:
            # remote function instance
            #self._logger.debug("[SessionUtils] Remote session function: " + str(session_function_id))
            trigger["is_local"] = False
            trigger["remote_address"] = function_metadata["remote_address"]

        if send_now:
            self._publication_utils.send_to_function_now("-1l", trigger)
        else:
            self._publication_utils.append_trigger(trigger)

    def send_to_all_running_functions_in_session_with_function_name(
            self, session_function_name, message, send_now=False):
        # get the function ids and send message
        rgidsetname = self._global_data_layer_client.getMapEntry(
            self._map_name_session_function_name_id_sets,
            session_function_name)
        rgidset = self._global_data_layer_client.retrieveSet(rgidsetname)
        rgidlist = list(rgidset)
        for rgid in rgidlist:
            self.send_to_running_function_in_session(rgid, message, send_now)

    def send_to_all_running_functions_in_session(self,
                                                 message,
                                                 send_now=False):
        # get the function ids and send message
        rgidset = self._global_data_layer_client.getMapKeys(
            self._map_name_session_functions)
        rgidlist = list(rgidset)
        for rgid in rgidlist:
            self.send_to_running_function_in_session(rgid, message, send_now)

    def send_to_running_function_in_session_with_alias(self,
                                                       session_function_alias,
                                                       message,
                                                       send_now=False):
        # lookup the session function id and then send to it
        rgid = self._global_data_layer_client.getMapEntry(
            self._map_name_session_function_alias_id, session_function_alias)

        if rgid is None or rgid == "":
            self._logger.warning(
                "Cannot send message to session function with alias; no session function with that alias."
            )
            return

        self.send_to_running_function_in_session(rgid, message, send_now)

    def get_session_update_messages_with_local_queue(self,
                                                     count=1,
                                                     block=False):
        if self._session_function_id is not None:
            messages = self._helper_thread.get_messages(count=count,
                                                        block=block)
            return messages
        return None
Example 14
    print("Waiting on DataLayer")
    while True:
        host, port = os.getenv("MFN_DATALAYER",
                               hostname + ":4998").rsplit(":", 1)
        try:
            addr = socket.gethostbyname(host)
            connect = addr + ":" + port
            break
        except Exception:
            traceback.print_exc()
            print("Waiting another 5s for " + host + " to be resolvable")
            time.sleep(5)

    # client for bucket "storage_" + get_storage_userid(email) + ";defaultTable"
    DLCLIENT = DataLayerClient(locality=1,
                               suid="adminATmanagement",
                               connect=connect,
                               init_tables=True)
    # client for bucket "sbox_Management;wf_Management"
    DLCLIENT_MANAGEMENT = DataLayerClient(locality=1,
                                          sid="Management",
                                          wid="Management",
                                          is_wf_private=True,
                                          connect=connect,
                                          init_tables=True)
    # client for mfn internal storage (for completeness)
    DLCLIENT_MFN = DataLayerClient(locality=1,
                                   sid="Management",
                                   for_mfn=True,
                                   connect=connect,
                                   init_tables=True)
    DLCLIENT_MFN.shutdown()
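    # Only the mfn-internal client is closed here; DLCLIENT and DLCLIENT_MANAGEMENT
    # are presumably kept open for use by the rest of the setup (not shown in this excerpt).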
Example 15
    def __init__(self, helper_params, logger, pubutils, sessutils, queue,
                 datalayer):

        self._logger = logger

        #self._logger.debug("[SessionHelperThread] " + str(helper_params))

        self._publication_utils = pubutils

        self._session_utils = sessutils

        self._queue = queue
        self._datalayer = datalayer

        self._sandboxid = helper_params["sandboxid"]
        self._workflowid = helper_params["workflowid"]
        self._session_function_id = helper_params["session_function_id"]
        self._session_id = helper_params["session_id"]

        # initialize only what is needed
        # we need a backup data layer client separate from the one in the publication utils;
        # otherwise, we run into concurrent modification problems from Thrift
        # locality = -1 means that writes happen to the local data layer first and then asynchronously to the global data layer
        self._backup_data_layer_client = DataLayerClient(
            locality=-1,
            for_mfn=True,
            sid=self._sandboxid,
            connect=self._datalayer)

        # set up heartbeat parameters
        self._heartbeat_enabled = False
        self._heartbeat_method = None
        # our own local queue client to be used when sending a heartbeat
        # TODO: double check if we can just reuse the one we're polling
        # probably yes
        self._local_queue_client_heartbeat = None
        self._heartbeat_function = None
        self._heartbeat_data_layer_key = None
        self._data_layer_client_heartbeat = None

        self._init_heartbeat_parameters(helper_params["heartbeat_parameters"])

        # set up communication parameters
        self._communication_params = helper_params["communication_parameters"]
        # similar to the data layer rendezvous point for message delivery, we listen to a local topic
        # allowing us to queue messages and deliver multiple messages to the session function if desired
        self._local_topic_communication = self._communication_params[
            "local_topic_communication"]
        # by default, assign a simple poll timeout;
        # if a heartbeat is specified, the timeout will be updated according to the
        # heartbeat interval to ensure we can send regular heartbeats
        self._local_poll_timeout = py3utils.ensure_long(10000)

        # use a deque to keep the list of messages
        # the list is updated and consumed by two different threads;
        # this is safe without a lock because deque append/pop operations are atomic in CPython
        self._message_queue = deque()

        self._local_queue_client = LocalQueueClient(connect=self._queue)

        self._special_messages = {}
        self._special_messages["--stop"] = True
        self._special_messages["--update-heartbeat"] = True

        self._is_running = False

        #self._logger.debug("[SessionHelperThread] init done.")

        threading.Thread.__init__(self)