def _wait_for_child_processes(self):
    """Block until the child processes this agent still has to reap have exited.

    The direct children of ``self._process_id`` are listed with ``pgrep -P``.
    The pids of the JavaRequestHandler processes, of fluent-bit, of the queue
    service and of the frontend are then removed from that set; whatever
    remains is waited for with ``os.waitpid`` until the set is empty.

    Returns:
        None. The method returns early if the children cannot be listed or
        if there is nothing left to wait for.
    """
    output, error = process_utils.run_command_return_output('pgrep -P ' + str(self._process_id), self._logger)
    if error is not None:
        self._logger.error("[SandboxAgent] wait_for_child_processes: Failed to get children process ids: %s", str(error))
        return

    # pids are kept as strings throughout, exactly as printed by pgrep/ps
    children_pids = set(output.split())
    self._logger.info("[SandboxAgent] wait_for_child_processes: Parent pid: %s, Children pids: %s", str(self._process_id), str(children_pids))

    for jrh_process in self._javarequesthandler_process_list:
        jrh_pid = str(jrh_process.pid)
        if jrh_pid in children_pids:
            children_pids.remove(jrh_pid)
            self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on JavaRequestHandler pid: %s", jrh_pid)

    # find fluentbit PID(s); the lookup is best-effort, so an empty or missing
    # output leaves children_pids untouched instead of raising, and every pid
    # printed by ps is considered separately (one per line)
    output, error = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger)
    for fbpid in (output.split() if output else []):
        if fbpid in children_pids:
            children_pids.remove(fbpid)
            self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on fluent-bit pid: %s", fbpid)

    if self._queue_service_process is not None:
        qs_pid = str(self._queue_service_process.pid)
        if qs_pid in children_pids:
            children_pids.remove(qs_pid)
            self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on queue service pid: %s", qs_pid)

    if self._frontend_process is not None:
        fe_pid = str(self._frontend_process.pid)
        if fe_pid in children_pids:
            children_pids.remove(fe_pid)
            self._logger.info("[SandboxAgent] wait_for_child_processes: Not waiting on frontend pid: %s", fe_pid)

    if not children_pids:
        self._logger.info("[SandboxAgent] wait_for_child_processes: No remaining pids to wait for")
        return

    while True:
        try:
            cpid, status = os.waitpid(-1, 0)
        except ChildProcessError as exc:
            # There is no child left to wait for. Retrying can never succeed,
            # so stop here instead of spinning in the loop and flooding the log.
            self._logger.error('[SandboxAgent] wait_for_child_processes: %s', str(exc))
            break
        except Exception as exc:
            # any other failure is logged and the wait is retried
            self._logger.error('[SandboxAgent] wait_for_child_processes: %s', str(exc))
            continue

        self._logger.info("[SandboxAgent] wait_for_child_processes: Status changed for pid: %s, Status: %s", str(cpid), str(status))
        if str(cpid) not in children_pids:
            # a child we were not tracking (e.g., one of the excluded helpers)
            continue

        children_pids.remove(str(cpid))
        if not children_pids:
            self._logger.info("[SandboxAgent] wait_for_child_processes: No remaining pids to wait for")
            break
def set_child_process(self, which, process, command_args_map):
    """Remember a launched helper process and the command/args used to start it.

    Args:
        which: "qs" stores the process as the queue service, "fe" as the
            frontend and "fb" as fluent-bit; any other value stores no
            process reference.
        process: object exposing a ``pid`` attribute.
        command_args_map: value recorded in
            ``self._child_process_command_args_map`` under the process pid.

    For "fb" the pid used as the key is not ``process.pid`` but the pid
    reported by ``ps`` for the ``fluent-bit`` command, which is also stored in
    ``self._fluentbit_actual_pid``.
    """
    registered_pid = process.pid

    if which == "fb":
        self._fluentbit_process = process
        ps_output, _ = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger)
        registered_pid = int(ps_output.strip())
        self._fluentbit_actual_pid = registered_pid
    elif which == "qs":
        self._queue_service_process = process
    elif which == "fe":
        self._frontend_process = process

    # store command and args
    self._child_process_command_args_map[registered_pid] = command_args_map
def get_all_children_pids(self):
    """Return the pids of all processes launched by this agent, as a list.

    The list contains, in order: the pid of every function worker, the pid of
    every JavaRequestHandler, the queue service pid, the frontend pid and the
    fluent-bit pid. The fluent-bit pid is looked up with ``ps`` on each call
    and also stored in ``self._fluentbit_actual_pid``.
    """
    pids = [self._functionworker_process_map[state].pid for state in self._functionworker_process_map]
    pids.extend(handler.pid for handler in self._javarequesthandler_process_list)
    pids.append(self._queue_service_process.pid)
    pids.append(self._frontend_process.pid)

    # self._fluentbit_process.pid is deliberately not used: it looks like that
    # pid does not match the actual process (perhaps because it also spawns
    # another process?), so the actual fluentbit pid is asked from ps instead
    ps_output, _ = process_utils.run_command_return_output('ps --no-headers -o pid -C fluent-bit', self._logger)
    self._fluentbit_actual_pid = int(ps_output.strip())
    pids.append(self._fluentbit_actual_pid)

    return pids
def process_deployment_info(self):
    """Parse the deployment info and launch everything needed to run the workflow.

    Steps, in order:
      1. decode ``self._deployment_info`` from JSON (the attribute is replaced
         by the parsed object) and check for the "workflow" and "resources" keys;
      2. export the user token as the ``USERTOKEN`` environment variable;
      3. fetch and base64-decode the workflow description from the global data
         layer and build ``self._workflow``;
      4. store the code of every resource locally, collect python requirements,
         and compile Java resources;
      5. install the sandbox requirements and initialize data layer storage;
      6. create the local queue client and a topic per workflow node, create
         the state of each node, write its worker parameters to disk and start
         its function worker;
      7. if ``SINGLE_JVM_FOR_FUNCTIONS`` is set and there is any Java state,
         launch one JavaRequestHandler for all of them;
      8. verify with ``pgrep`` that the number of running ``FunctionWorker.py``
         children matches the number of launched function workers.

    Returns:
        tuple ``(has_error, errmsg)``: ``has_error`` is True when a step
        failed, in which case ``errmsg`` describes the failure; otherwise
        ``errmsg`` is the empty string.

    NOTE(review): the global data layer client is only shut down when the end
    of the method is reached; the early error returns leave it open.
    """
    has_error = False
    errmsg = ""

    if self._deployment_info is not None and self._deployment_info != "":
        try:
            self._deployment_info = json.loads(self._deployment_info)
            self._logger.debug("Deployment info: %s", json.dumps(self._deployment_info))
        except Exception as exc:
            errmsg = "Could not parse deployment info: " + str(exc)
            self._logger.error(errmsg)
            has_error = True
            return has_error, errmsg
    else:
        errmsg = "Empty deployment info."
        has_error = True
        return has_error, errmsg

    if "workflow" not in self._deployment_info or "resources" not in self._deployment_info:
        errmsg = "Incomplete deployment info: " + json.dumps(self._deployment_info)
        self._logger.error(errmsg)
        has_error = True
        return has_error, errmsg

    # get workflow info
    workflow_info = self._deployment_info["workflow"]
    sid = workflow_info["sandboxId"]
    if sid != self._sandboxid:
        # report both sides of the mismatch: the id found in the workflow info
        # and the id this agent was given
        warnmsg = "WARN: workflow info sandboxid doesn't match provided sandboxid (" + str(sid) + " <-> " + str(self._sandboxid) + ")"
        self._logger.info(warnmsg)
    wid = workflow_info["workflowId"]
    if wid != self._workflowid:
        warnmsg = "WARN: workflow info workflowid doesn't match provided workflowid (" + str(wid) + " <-> " + str(self._workflowid) + ")"
        self._logger.info(warnmsg)
    wf_type = workflow_info["workflowType"]

    usertoken = ''
    if "usertoken" in workflow_info:
        usertoken = workflow_info["usertoken"]
    os.environ["USERTOKEN"] = usertoken

    # get workflow json, parse workflow json and init params
    workflow_json = self._global_data_layer_client.get(workflow_info["json_ref"])
    if workflow_json is None or workflow_json == "":
        has_error = True
        errmsg = "Empty workflow description."
        return has_error, errmsg

    try:
        workflow_json = base64.b64decode(workflow_json).decode()
    except Exception as exc:
        has_error = True
        errmsg = "Invalid value for workflow json: " + str(exc)
        return has_error, errmsg

    self._workflow = Workflow(self._userid, sid, wid, wf_type, workflow_json, self._logger)

    has_error = self._workflow.has_error()
    if has_error:
        errmsg = "Problem in workflow description: " + str(workflow_json)
        self._logger.error(errmsg)
        return has_error, errmsg

    # get workflow nodes
    workflow_nodes = self._workflow.getWorkflowNodeMap()

    # get resources info and find functions
    resource_map = {}
    resource_info_map = self._deployment_info["resources"]

    # NOTE(review): this comparison is case-sensitive and runs before the
    # runtime names are lower-cased below -- confirm "Java" is the only spelling
    if any(resource_info_map[res_name]["runtime"] == "Java" for res_name in resource_info_map):
        # run setup_maven.sh to update the proxy settings at runtime
        # (i.e., the sandbox image may have been built on a machine with a proxy, or vice versa)
        cmd_maven_proxy_initer = "/opt/mfn/JavaRequestHandler/./setup_maven.sh"
        self._logger.info("Updating maven proxy settings...")
        error, _ = process_utils.run_command(cmd_maven_proxy_initer, self._logger, wait_output=True)
        if error is not None:
            has_error = True
            errmsg = "Could not reinitialize maven proxy settings: " + error
            return has_error, errmsg
        self._logger.info("Finished updating maven proxy settings.")

    # for pip installable dependencies for python functions
    # (a dict is used as an insertion-ordered set of requirement lines)
    req_map = {}
    t_start_download = time.time()
    # store functions in local filesystem
    for resource_name in resource_info_map:
        resource_info = resource_info_map[resource_name]
        resource_info["runtime"] = resource_info["runtime"].lower()

        if resource_info["type"] == "code":
            error, resource_dirpath = self._retrieve_and_store_function_code(resource_name, resource_info)
        else:
            error, resource_dirpath = self._retrieve_and_store_function_zip(resource_name, resource_info)

        if error is not None:
            errmsg = "Could not retrieve and store function: " + resource_name + " " + error
            self._logger.error(errmsg)
            has_error = True
            return has_error, errmsg

        # these requirements can now be also for java maven dependencies
        resource_id = resource_info["id"]
        greq = self._global_data_layer_client.get("grain_requirements_" + resource_id)
        mvndeps = None
        if greq is not None and greq != "":
            greq = base64.b64decode(greq).decode()
            if resource_info["runtime"].find("python") == 0:
                # get function requirements and put it into a map
                lines = greq.strip().split("\n")
                for line in lines:
                    req_map[line] = True
            elif resource_info["runtime"].find("java") == 0:
                mvndeps = greq

        # get function environment variables
        env_var_list = []
        genv = self._global_data_layer_client.get("grain_environment_variables_" + resource_id)
        if genv is not None and genv != "":
            genv = base64.b64decode(genv).decode()
            env_var_list = genv.split("\n")

        resource = {}
        resource["name"] = resource_name
        resource["dirpath"] = resource_dirpath
        resource["runtime"] = resource_info["runtime"]
        resource["env_var_list"] = env_var_list
        resource_map[resource_name] = resource

        # compile the java sources
        if resource["runtime"].find("java") == 0:
            # even if it was just a single java file
            # or a jar file uploaded with source files
            # or a jar file with just class files,
            # the following function will
            # 1. download maven dependencies (if there is a pom.xml in the jar or was separately uploaded)
            # 2. compile the source files if any
            error = self._compile_java_resources_if_necessary(resource, mvndeps)

            if error is not None:
                errmsg = "Could not compile Java function resources: " + resource_name + " " + error
                self._logger.error(errmsg)
                has_error = True
                return has_error, errmsg

    total_time_download = (time.time() - t_start_download) * 1000.0
    self._logger.info("Download time for all function code: %s (ms)", str(total_time_download))

    t_start_requirements = time.time()
    # this list will only contain pip installable dependencies
    # java maven dependencies will be handled while compiling the java resources
    sbox_req_list = list(req_map)

    # install sandbox requirements
    req = workflow_info["sandbox_requirements"]
    req["requirements"] = sbox_req_list
    error = self._install_sandbox_requirements(req)
    if error is not None:
        errmsg = "Could not install sandbox requirements. " + str(error)
        self._logger.error(errmsg)
        has_error = True
        return has_error, errmsg

    total_time_requirements = (time.time() - t_start_requirements) * 1000.0
    self._logger.info("Requirements install time: %s (ms)", str(total_time_requirements))

    t_start_storage = time.time()
    # initialize local data layer space for user and workflow
    self._initialize_data_layer_storage()
    total_time_storage = (time.time() - t_start_storage) * 1000.0
    self._logger.info("Storage initialization time: %s (ms)", str(total_time_storage))

    self._local_queue_client = LocalQueueClient(connect=self._queue)

    self._local_queue_client.addTopic(self._workflow.getWorkflowExitTopic())

    t_start_launch = time.time()
    # accumulate all java worker params into one
    # later, we'll launch a single JVM to handle all java functions
    if SINGLE_JVM_FOR_FUNCTIONS:
        single_jvm_worker_params = {}
        any_java_function = False

    total_time_state = 0.0
    for function_topic in workflow_nodes:
        wf_node = workflow_nodes[function_topic]
        resource_name = wf_node.get_resource_name()

        t_start_state = time.time()
        if resource_name == "":
            # this is an ASL state without a resource (i.e., function) attached to it
            error, resource = state_utils.create_dummy_resource_for_asl_state(wf_node)
            if error is not None:
                errmsg = "Could not create non-resource state. " + str(error)
                self._logger.error(errmsg)
                has_error = True
                return has_error, errmsg
        else:
            resource = resource_map[resource_name]

        error, state = state_utils.create_state(wf_node, resource, self._logger)
        if error is not None:
            errmsg = "Could not create state: " + str(error)
            self._logger.error(errmsg)
            has_error = True
            return has_error, errmsg

        total_time_state += (time.time() - t_start_state) * 1000.0

        self._local_queue_client.addTopic(function_topic)

        # compile worker parameters
        worker_params = self._populate_worker_params(function_topic, wf_node, state)
        # store worker parameters as a local file
        params_filename = state["dirpath"] + "worker_params.json"

        with open(params_filename, "w") as paramsf:
            json.dump(worker_params, paramsf, indent=4)

        if state["resource_runtime"].find("java") != -1:
            java_worker_params = {}
            java_worker_params["functionPath"] = worker_params["ffolder"]
            java_worker_params["functionName"] = worker_params["fname"]
            java_worker_params["serverSocketFilename"] = "/tmp/java_handler_" + worker_params["functionstatename"] + ".uds"

            if SINGLE_JVM_FOR_FUNCTIONS:
                any_java_function = True
                single_jvm_worker_params[worker_params["functionstatename"]] = java_worker_params
            else:
                java_params_filename = state["dirpath"] + "java_worker_params.json"
                with open(java_params_filename, "w") as javaparamsf:
                    json.dump(java_worker_params, javaparamsf, indent=4)

        # launch function workers with the params parsed from workflow info
        error = self._start_function_worker(worker_params, state["resource_runtime"], state["resource_env_var_list"])

        if error is not None:
            errmsg = "Problem launching function worker for: " + worker_params["fname"]
            self._logger.error(errmsg)
            has_error = True
            return has_error, errmsg

        # add the new function worker to the local list
        self._workflow.addLocalFunction(function_topic)

    # all function workers have been launched; update them with locally running functions
    # prepare update message to be used by all
    local_functions = self._workflow.getWorkflowLocalFunctions()
    lqcm_update = self._prepare_update_for_locally_running(local_functions)
    for function_topic in workflow_nodes:
        self._update_function_worker(function_topic, lqcm_update)

    if SINGLE_JVM_FOR_FUNCTIONS:
        if any_java_function:
            single_jvm_params_filename = "/opt/mfn/workflow/states/single_jvm_worker_params.json"
            with open(single_jvm_params_filename, "w") as jvmparamsf:
                json.dump(single_jvm_worker_params, jvmparamsf, indent=4)

            self._logger.info("Launching a single JavaRequestHandler for all Java states...")
            cmdjavahandler = "java -jar /opt/mfn/JavaRequestHandler/target/javaworker.jar "
            cmdjavahandler += single_jvm_params_filename

            error, process = process_utils.run_command(cmdjavahandler, self._logger, wait_until="Waiting for requests on:")
            if error is not None:
                errmsg = "Problem launching JavaRequestHandler for Java states: " + error
                self._logger.error(errmsg)
                has_error = True
                return has_error, errmsg
            else:
                self._javarequesthandler_process_list.append(process)

    self._logger.info("State creation for all function workers: %s (ms)", str(total_time_state))

    total_time_launch = (time.time() - t_start_launch) * 1000.0
    self._logger.info("Launch time for all function workers: %s (ms)", str(total_time_launch))

    if not has_error:
        # check whether all function workers have launched successfully
        # give some time for function workers to come up
        cmd = "pgrep -P " + str(self._process_id) + " -a"
        output, error = process_utils.run_command_return_output(cmd, self._logger)
        if error is not None:
            self._logger.error("[SandboxAgent] check health of function workers: failed to get FunctionWorker processes: %s", str(error))
            has_error = True
            errmsg = "Could not get FunctionWorker processes."

    if not has_error:
        # each "pgrep -a" line starts with the pid, followed by the command line;
        # the pids collected here are therefore strings
        fwlines = set(output.split("\n"))
        fwpids = []
        for line in fwlines:
            if "FunctionWorker.py" in line:
                pid = line.split(" ")[0]
                fwpids.append(pid)

        if str(self._fluentbit_process.pid) in fwpids:
            fwpids.remove(str(self._fluentbit_process.pid))

        self._logger.info(str(len(fwpids)) + " " + str(len(self._functionworker_process_map)))

        if len(fwpids) != len(self._functionworker_process_map):
            has_error = True
            errmsg = "One or more function workers could not be launched:\n"
            for state_name in self._functionworker_process_map:
                fwp = self._functionworker_process_map[state_name]
                # compare as strings: fwpids holds the textual pids printed by
                # pgrep, so the raw pid value would never match
                if str(fwp.pid) not in fwpids:
                    errmsg += state_name + "\n"

    self._global_data_layer_client.shutdown()

    return has_error, errmsg
def _compile_java_resources_if_necessary(self, resource, mvndeps):
    """Copy maven dependencies for a Java resource and compile its sources, if any.

    Steps, in order:
      1. create ``<dirpath>target/classes`` with ``mkdir -p``;
      2. look for ``*.java`` files under ``<dirpath>`` with ``find``;
      3. if ``mvndeps`` is given and ``<dirpath>pom.xml`` does not exist,
         write ``mvndeps`` to that file;
      4. if ``<dirpath>pom.xml`` exists, copy its maven dependencies into
         ``target/classes``;
      5. if any Java source was found, compile it with ``javac`` into
         ``target/classes``.

    Args:
        resource: mapping; the "dirpath" and "name" entries are read. The
            dirpath is used as a plain string prefix (presumably it ends with
            a path separator -- verify against the caller).
        mvndeps: content for the pom.xml file, or None.

    Returns:
        None when every executed step succeeded, otherwise a string describing
        the step that failed.
    """
    dirpath = resource["dirpath"]
    name = resource["name"]
    pom_path = dirpath + "pom.xml"

    # step 1: output directory for class files and copied dependencies
    self._logger.info("Preparing for compilation of Java function resources: %s", name)
    mkdir_error, _ = process_utils.run_command("mkdir -p " + dirpath + "target/classes", self._logger, wait_output=True)
    if mkdir_error is not None:
        failure = "Could not create target directory for resource: " + name + " " + mkdir_error
        self._logger.error(failure)
        return failure

    # step 2: collect the java sources into one space-separated string
    find_output, find_error = process_utils.run_command_return_output("find " + dirpath + " -name *.java", self._logger)
    if find_error is not None:
        self._logger.error("[SandboxAgent] could not search for any Java sources: %s", str(find_error))
        return "Could not search for any Java sources: " + name + " " + str(find_error)

    source_files = ' '.join(set(find_output.split("\n"))).strip()
    should_compile = source_files != ""
    if should_compile:
        self._logger.info("Found following Java sources: %s", str(source_files))
    else:
        self._logger.info("No java sources to compile.")

    # step 3: uploaded requirements become the pom.xml, unless one already exists
    if mvndeps is not None and not os.path.exists(pom_path):
        self._logger.info("Writing maven build file: %spom.xml", dirpath)
        with open(pom_path, "w") as fpom:
            fpom.write(mvndeps)

    # step 4: whichever way the pom.xml got there, resolve and copy its dependencies
    if os.path.exists(pom_path):
        cmdmvn = "".join([
            "mvn -Duser.home=/tmp -DskipTests -gs /opt/mfn/JavaRequestHandler/maven/sandbox-mvn-settings.xml -f ",
            dirpath,
            " dependency:copy-dependencies -DoutputDirectory=",
            dirpath,
            "target/classes",
        ])
        self._logger.info("Copying maven dependencies for Java function: %s", name)
        mvn_error, _ = process_utils.run_command(cmdmvn, self._logger, wait_output=True)
        if mvn_error is not None:
            failure = "Could not copy maven dependencies: " + name + " " + mvn_error
            self._logger.error(failure)
            return failure
        self._logger.info("Finished copying dependencies for Java function: %s", name)

    # step 5: nothing left to do when there are no sources
    if not should_compile:
        return None

    cmdjavac = "".join([
        "javac -classpath /opt/mfn/JavaRequestHandler/mfnapi.jar:",
        dirpath,
        "target/classes/* ",
        "-d ",
        dirpath,
        "target/classes ",
        source_files,
    ])
    self._logger.info("Compiling Java function resources: %s", name)
    self._logger.info(cmdjavac)
    javac_error, _ = process_utils.run_command(cmdjavac, self._logger, wait_output=True)
    if javac_error is not None:
        failure = "Could not compile resource: " + name + " " + javac_error
        self._logger.error(failure)
        return failure
    self._logger.info("Finished compiling Java function resources: %s", name)

    return None