def waitForVmwareVmx():
    """!
    Block until the vmware-vmx process of the ESXi VM is gone.

    The process id is looked up once from the esxi.vmx configuration path;
    the process is then polled every 5 seconds until it is no longer
    reported as running.
    """
    conf_path = "%s/esxi.vmx" % Vsp.get_esxi_dir()
    vmx_pid = Vsp.get_vmx_proc_id(conf_path)

    while True:
        if not Vsp.is_process_running(vmx_pid):
            break
        Logging.log(Logging.LOG_DEBUG, "vmware-vmx is still running")
        time.sleep(5)

    # Falling out of the loop means the process has exited
    Logging.log(Logging.LOG_DEBUG, "vmware-vmx is not running")
def main():
    """!
    Entry point of the wrapper.

    Initializes the logger and the signal handlers, verifies that the VM
    configuration exists, starts vmware-vmx when no instance is running for
    that configuration yet, and finally monitors the process.
    """
    Logging.log_init('vmware_vmx_wrapper', 'vmware_vmx_wrapper', 0,
                     Logging.component_id(Logging.LCI_VSP),
                     Logging.LOG_DEBUG, Logging.LOG_LOCAL0,
                     Logging.LCT_SYSLOG)
    Logging.log(Logging.LOG_INFO, "vsp_vmware_vmx_wrapper started")

    # Signal -> handler table, registered in the listed order
    handler_table = (
        (signal.SIGINT, terminate_term_handler),
        (signal.SIGTERM, terminate_term_handler),
        (signal.SIGQUIT, terminate_quit_handler),
        (signal.SIGUSR1, terminate_usr1_handler),
    )
    for signum, handler in handler_table:
        signal.signal(signum, handler)

    # Locate the VM configuration inside the esxi dir
    esxi_dir = Vsp.get_esxi_dir()
    conf_path = "%s/%s" % (esxi_dir, ESXI_VMX_NAME)

    # Nothing to do without a VM configuration
    if not os.path.exists(conf_path):
        Logging.log(Logging.LOG_ERR,
                    "VM configuration %s doesn't exist" % conf_path)
        sys.exit(1)

    # Check if vmware-vmx is already running. The lookup returns None if no
    # process id exists or if there are multiple process ids associated
    # with the vmx configuration. However, it is not possible to launch two
    # running vmware-vmx instances from the same configuration, so the
    # chance of getting None because of multiple process ids is almost
    # nonexistent.
    vmx_pid = Vsp.get_vmx_proc_id(conf_path)

    if vmx_pid is None:
        # No instance yet: clean up the "shutting down" file used for the
        # ESXi HPN dependency, clear the locks and start vmware-vmx
        if os.path.exists(SHUTDOWN_MARKER):
            os.remove(SHUTDOWN_MARKER)

        cleanup_locks(esxi_dir)
        start_vmware_vmx(conf_path)

        # Pick up the process id of the instance just started
        vmx_pid = Vsp.get_vmx_proc_id(conf_path)

    monitor_vmware_vmx(vmx_pid)
def event(reason, version_info):
    """!
    State machine event loop.

    The three possible states are:
        1. initial
        2. disconnected
        3. connected

    Transitions (as sketched by the original state diagram):
        initial      --init_connect_timeout--> disconnected
        initial      --wdt_connect-----------> connected
        disconnected --wdt_trigger-----------> connected
        connected    --wdt_trigger-----------> disconnected
        disconnected / connected --resend_event--> same state

    The handler registered in STATE_HANDLER_DICT for the current state is
    called with the given input and returns the state to move to. When the
    state changes, set_state() is invoked; otherwise check_resend_event()
    decides whether the event has to be sent again.

    The watchdog exits when the monitored mgmtd pid is gone or when the
    current state has no registered handler.

    @param reason        input handed to the state handler, set_state() and
                         check_resend_event()
    @param version_info  version information handed along with reason
    """
    global g_curr_wdt_state

    old_wdt_state = g_curr_wdt_state

    # The watchdog must not outlive the mgmtd instance it was started with
    mgmtd_pids = Vsp.get_pids('mgmtd')
    if mgmtd_pids is None or g_mgmtd_pid not in mgmtd_pids:
        Logging.log(Logging.LOG_ERR,
                    "Unexpected termination of mgmtd, kill watchdog!")
        sys.exit()

    # Unknown state detected: there is no way to recover, so terminate.
    # The message states what actually happens (exit, not a reset).
    if g_curr_wdt_state not in STATE_HANDLER_DICT:
        error_str = "Unknown state: %s, exiting watchdog" % g_curr_wdt_state
        Logging.log(Logging.LOG_ERR, error_str)
        sys.exit()

    # Dispatch on the current state. The handler checks, based on the given
    # input, whether the state needs to change and returns the new state.
    new_wdt_state = STATE_HANDLER_DICT[g_curr_wdt_state](reason, version_info)

    if old_wdt_state != new_wdt_state:
        set_state(new_wdt_state, reason, version_info)
    else:
        # No state change detected. Verify if we need to resend event.
        check_resend_event(reason, version_info)
def start_vmware_vmx(path):
    """!
    Start vmware-vmx with given vm

    A fresh VSP ramfs is mounted first; a ramfs that is still mounted is
    unmounted beforehand. When the ramfs cannot be mounted the function
    returns without starting vmware-vmx and the caller is expected to look
    at the vmx status.

    @param path  path of the vm configuration passed to the vmware-vmx
                 binary
    """
    Logging.log(Logging.LOG_INFO, "Starting vm %s" % path)

    vsp_ramfs = RamFs.RamFs(vsp_ramfs_path)

    if vsp_ramfs.is_mounted():
        # we generally should not hit this path, we unmount the ramfs when
        # we stop vmware-vmx
        Logging.log(Logging.LOG_INFO,
                    "VSP ramfs is already mounted %s, unmounting" %
                    vsp_ramfs_path)
        try:
            vsp_ramfs.unmount_ramfs()
        except RamFs.RamFsCmdException as e:
            # we'll proceed with starting vmx even if we can't unmount
            Logging.log(Logging.LOG_ERR, e.msg)

    if not vsp_ramfs.is_mounted():
        try:
            vsp_ramfs.mount_ramfs(vsp_ovhd_ramfs_min_size_mb)
        except (OSError, RamFs.RamFsCmdException) as e:
            Logging.log(Logging.LOG_ERR, str(e))
            Logging.log(Logging.LOG_ERR, "Unable to create ramfs %s"
                        " not starting VMX" % vsp_ramfs_path)
            # skip starting VMX, the caller will look for vmx status
            return

    # Link in performance tweaks library
    env_dict = os.environ.copy()

    Mgmt.open()
    try:
        if Vsp.is_memlock_enabled():
            # Put the tweaks library in front of anything already preloaded
            if "LD_PRELOAD" in env_dict:
                env_dict["LD_PRELOAD"] = \
                    vmperf_path + " " + env_dict["LD_PRELOAD"]
            else:
                env_dict["LD_PRELOAD"] = vmperf_path

        # Check the ESXi debug option to see which binary we need to run
        vmx_option = get_debug_option()
    finally:
        # Always release the session, even if one of the queries raised
        Mgmt.close()

    binary_path = option_to_path[vmx_option]
    Logging.log(Logging.LOG_DEBUG, "BINARY PATH: %s" % binary_path)

    pobj = subprocess.Popen([binary_path, "-qx", path], env=env_dict)
    pobj.wait()
def stop_vmware_vmx():
    """!
    Stop vmware-vmx by running "vmrun stop" on the ESXi vm configuration
    and record that a shutdown has been requested.
    """
    global g_shutdown_requested

    # vmrun stop simply terminates the vm for now; this will change once
    # graceful shutdown is handled
    vmx_conf = "%s/%s" % (Vsp.get_esxi_dir(), ESXI_VMX_NAME)
    Logging.log(Logging.LOG_INFO, "Stopping vm %s" % vmx_conf)

    stop_cmd = [vmrun_path, "stop", vmx_conf]
    subprocess.Popen(stop_cmd).wait()

    g_shutdown_requested = True
def main():
    """!
    Entry point to the watchdog.

    Initializes the logger, determines the pid of mgmtd, opens the
    management session, installs the termination signal handlers,
    invalidates a leftover session file and starts attempting to
    communicate with ESXi.

    The watchdog exits when no single mgmtd pid could be determined.
    """
    global g_mgmtd_pid
    g_mgmtd_pid = None

    Logging.log_init('esxi_watchdog', 'esxi_watchdog', 0,
                     Logging.component_id(Logging.LCI_VSP),
                     Logging.LOG_DEBUG, Logging.LOG_LOCAL0,
                     Logging.LCT_SYSLOG)
    Logging.log(Logging.LOG_INFO, "esxi watchdog started")

    # Bug 117274: It may happen that we get multiple pids for mgmtd process,
    # pidof ran between fork-exec call, retry to allow mgmtd to settle
    # NOTE(review): range(1, N) yields N - 1 attempts - confirm intended
    for _ in range(1, MAX_MGMTD_SETTLE_RETRY):
        mgmtd_pids = Vsp.get_pids('mgmtd')

        if not mgmtd_pids:
            # No pid at all (None or empty result): mgmtd is not up.
            # Leave g_mgmtd_pid unset so the check below reports it,
            # instead of failing on len()/indexing.
            break

        if len(mgmtd_pids) > 1:
            # multiple pids detected, give mgmtd some time to settle
            time.sleep(MGMTD_SETTLE_TIMEOUT)
            continue

        g_mgmtd_pid = mgmtd_pids[0]
        break

    # Bug 112192: monitor mgmtd pid, if mgmtd crashes/exits
    # terminate watchdog as well
    if g_mgmtd_pid is None:
        # mgmtd not up kill watchdog process
        Logging.log(Logging.LOG_ERR, "Mgmtd is not ready, kill watchdog!")
        sys.exit()

    Mgmt.open()

    signal.signal(signal.SIGINT, terminate_handler)
    signal.signal(signal.SIGTERM, terminate_handler)
    signal.signal(signal.SIGQUIT, terminate_handler)

    # Invalidate the session file if it exists on startup
    if os.path.exists(SESSION_FILE):
        os.remove(SESSION_FILE)

    monitor_esxi()
    Mgmt.close()
def run_esxcli_command(command, use_session):
    """!
    Run esxcli command to determine connectivity

    @param command      argument list handed to subprocess.Popen
    @param use_session  when False, the environment for the command is
                        built with Vsp.make_esxcli_env_vars(); otherwise
                        the command inherits the current environment
    @return tuple (reason, version_info). reason is None when no failure
            was detected, "invalid ESXi password" when no environment
            could be built, "disconnected" on timeout, or a key of
            REASONS_DICT whose message was found in the first output line.
            version_info is "<version>.<build>" on success and "unknown"
            otherwise.
    """
    version = "unknown"
    build = "unknown"
    version_info = "unknown"
    curr_reason = None
    env_vars = None

    # Equality comparison kept as in the original: only a value equal to
    # False requests a freshly built environment.
    if use_session == False:
        env_vars = Vsp.make_esxcli_env_vars()
        if env_vars is None:
            curr_reason = "invalid ESXi password"
            return (curr_reason, version_info)

    pobj = subprocess.Popen(command, env=env_vars,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # communicate() drains both pipes while waiting; a plain wait() can
    # block forever once the child fills one of the pipe buffers.
    stdout_data, _ = pobj.communicate()
    ret_code = pobj.returncode

    # run_for will return 99 if timed out
    if ret_code == 99:
        Logging.log(Logging.LOG_INFO, "Watchdog timed out connecting to ESXi")
        curr_reason = "disconnected"

    output_lines = stdout_data.splitlines()

    # The first output line is matched against the known error messages.
    # This also runs after a timeout; when several messages match, the
    # last one iterated wins.
    error_message = output_lines[0] if output_lines else ""
    for (reason, message) in REASONS_DICT.items():
        if error_message.find(message) != -1:
            curr_reason = reason

    # if we can successfully get the version
    # extract Version and Build information from stdout:
    # Product: VMware ESXi
    # Version: major-ver.minor-ver.maintenance-ver
    # Build: build#
    if not curr_reason:
        # The first line was consumed by the error check above
        for line in output_lines[1:]:
            fields = line.split()
            if len(fields) < 2:
                # No value on this line; keep the current "unknown"
                continue
            if line.find("Version") != -1:
                version = fields[1]
            if line.find("Build") != -1:
                build = fields[1].replace('Releasebuild-', '')
        version_info = version + "." + build

    return (curr_reason, version_info)
def monitor_vmware_vmx(proc_id):
    """!
    Poll vmware-vmx every vmx_poll_time seconds until it stops running,
    then shut down the VSP ramfs and remove the shutdown marker.

    When vmware-vmx stopped without a shutdown having been requested the
    wrapper exits with status 1.

    @param proc_id  process id of the vmware-vmx instance to watch
    """
    still_running = Vsp.is_process_running(proc_id)
    while still_running:
        Logging.log(Logging.LOG_DEBUG, "vmware-vmx is running")
        time.sleep(vmx_poll_time)
        still_running = Vsp.is_process_running(proc_id)

    # Leaving the loop means the process has exited
    Logging.log(Logging.LOG_DEBUG, "vmware-vmx is not running")

    # Whenever vmware-vmx stops the ramfs is to be unmounted; before
    # unmounting, the vix logs are to be saved off in the vmware-admin
    # directory
    shutdown_vsp_ramfs()

    # Clean up the "shutting down" file used for ESXi HPN dependency
    marker_present = os.path.exists(SHUTDOWN_MARKER)
    if marker_present:
        os.remove(SHUTDOWN_MARKER)

    if g_shutdown_requested:
        return
    sys.exit(1)
def _run_host_command(command, env_vars):
    """!
    Run a host command to completion and return its exit status.

    The output of the command is captured and discarded. communicate() is
    used instead of wait() so that a command producing a lot of output
    cannot block on a full pipe.
    """
    pobj = subprocess.Popen(command, env=env_vars,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    pobj.communicate()
    return pobj.returncode


def terminate_term_handler(signum, frame):
    """!
    Signal handler for SIGTERM and SIGINT.

    Whenever one of the signals occur, we will attempt to make the
    vicfg-hostops call to gracefully power down the host. If there is an
    issue with the password, we will immediately power down the host with
    vmrun stop. If the issue is connection related, we will not do anything
    and will let PM retry again (if possible) when the next signal is sent.

    @param signum  signal number (unused)
    @param frame   interrupted stack frame (unused)
    """
    global g_shutdown_requested

    do_forceful = False
    Logging.log(Logging.LOG_DEBUG, "Wrapper: got TERM signal")

    # We used to open and close the session in main(). However, if mgmtd
    # crashes and is restarted by PM, the session we created will be
    # stale and any queries will fail miserably. To mitigate this issue,
    # we'll open and close the session as tightly as possible
    Mgmt.open()
    try:
        env_vars = Vsp.make_vicfg_env_vars()
        # NOTE(review): env_vars presumably carries the ESXi credentials;
        # consider redacting them before logging - confirm.
        Logging.log(Logging.LOG_DEBUG, "Wrapper env: %s" % env_vars)
    finally:
        # Release the session even if the query raised
        Mgmt.close()

    # Send signal to active VM migration task so it can clean up
    stop_migrate_deploy()

    if env_vars is None or not os.path.exists(CONNECTED_MARKER):
        Logging.log(Logging.LOG_NOTICE,
                    "Cannot get password or currently disconnected from ESXi")
        do_forceful = True

    # Do one last check for connectivity just in case the watchdog
    # says we are connected when really we are not due to a change of
    # IP or password on the ESXi side (but our sessionfile is still valid)
    # XXX/rcenteno Enhance with the session file
    if not do_forceful and not check_connectivity(env_vars):
        Logging.log(Logging.LOG_NOTICE, "ESXi connectivity not found")
        do_forceful = True

    if do_forceful:
        Logging.log(Logging.LOG_NOTICE, "Performing forceful power off")
        # We cannot get the ESXi password or cannot connect to ESXi,
        # so we must forcibly power down
        stop_vmware_vmx()
        return

    # update IQN cache value
    iqn = Vsp.get_iqn(env_vars, RUNFOR_TIMEOUT)
    if iqn:
        try:
            # The with block closes the file even when the write fails
            with open(iqn_cache_path, 'w') as iqn_cache:
                iqn_cache.write(iqn)
        except Exception as e:
            # Best effort: a failed cache update must not stop the shutdown
            Logging.log(Logging.LOG_ERR,
                        "Exception while updating IQN cache file: %s" % e)

    # Create the shutdown marker that the hpn will use to know it should
    # wait for ESXi shutdown first
    open(SHUTDOWN_MARKER, 'w').close()

    # Enable SSH on the host
    if _run_host_command(ENABLE_SSH_COMMAND, env_vars) == 0:
        # Save the state on the host
        if _run_host_command(SAVE_STATE_COMMAND, env_vars) == 0:
            Logging.log(Logging.LOG_INFO, "Saved state on the host")

        # Disable SSH on the host
        # NOTE(review): SSH is turned back off only when enabling it
        # succeeded - confirm this nesting is the intended one.
        _run_host_command(DISABLE_SSH_COMMAND, env_vars)

    # Regardless of what happened earlier, send the host operations command
    Logging.log(Logging.LOG_INFO, "ESXi graceful shutdown in progress")
    ret_code = _run_host_command(SHUTDOWN_COMMAND, env_vars)

    if ret_code == 0:
        g_shutdown_requested = True
    elif ret_code == 99:
        # the command timed out
        Logging.log(Logging.LOG_INFO,
                    "Timed out trying to send graceful shutdown request")