def _GlobalWatcher(opts):
  """Main function for global watcher.

  Starts the per-node daemons and runs the watcher hooks and (when due) node
  maintenance on every node. Only when a luxi client can be obtained, i.e. on
  the master node, it additionally makes sure RAPI and WConfD are running and
  responsive (one restart attempt each), checks the master, archives old jobs
  and finally spawns child processes for every node group.

  @param opts: parsed options; C{opts.job_age} is handed to L{_ArchiveJobs}
      and C{opts.wait_children} to L{_StartGroupChildren}
  @return: C{constants.EXIT_SUCCESS}, both on non-master nodes (early exit)
      and after the master-only steps completed

  """
  StartNodeDaemons()
  RunWatcherHooks()

  # Run node maintenance in all cases, even if master, so that old masters can
  # be properly cleaned up
  if nodemaint.NodeMaintenance.ShouldRun(): # pylint: disable=E0602
    nodemaint.NodeMaintenance().Exec() # pylint: disable=E0602

  try:
    client = GetLuxiClient(True)
  except NotMasterError:
    # Don't proceed on non-master nodes
    return constants.EXIT_SUCCESS

  # we are on master now
  utils.EnsureDaemon(constants.RAPI)
  utils.EnsureDaemon(constants.WCONFD)

  # If RAPI isn't responding to queries, try one restart
  logging.debug("Attempting to talk to remote API on %s",
                constants.IP4_ADDRESS_LOCALHOST)
  rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
  if not rapi_ok:
    logging.warning("Couldn't get answer from remote API, restarting daemon")
    utils.StopDaemon(constants.RAPI)
    utils.EnsureDaemon(constants.RAPI)
    logging.debug("Second attempt to talk to remote API")
    rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
    if not rapi_ok:
      # Only logged; the watcher keeps going with the remaining checks
      logging.fatal("RAPI is not responding")
  if rapi_ok:
    # Report success only if one of the probes above really succeeded, so the
    # log never claims success right after "RAPI is not responding"
    logging.debug("Successfully talked to remote API")

  # If WConfD isn't responding to queries, try one restart
  logging.debug("Attempting to talk to WConfD")
  if not IsWconfdResponding():
    logging.warning("WConfD not responsive, restarting daemon")
    utils.StopDaemon(constants.WCONFD)
    utils.EnsureDaemon(constants.WCONFD)
    logging.debug("Second attempt to talk to WConfD")
    if not IsWconfdResponding():
      # Only logged; the master checks below are still attempted
      logging.fatal("WConfD is not responding")

  _CheckMaster(client)
  _ArchiveJobs(client, opts.job_age)

  # Spawn child processes for all node groups
  _StartGroupChildren(client, opts.wait_children)

  return constants.EXIT_SUCCESS
def StartNodeDaemons():
  """Make sure the daemons expected on every node are running.

  The node daemon is always ensured. The confd and mond daemons are ensured
  only when the corresponding C{constants.ENABLE_*} flag is set.

  """
  ensure = utils.EnsureDaemon

  # The node daemon is needed regardless of whether this is the master
  ensure(constants.NODED)

  # confd runs everywhere too; on non-candidates it stays in disabled mode
  if constants.ENABLE_CONFD:
    ensure(constants.CONFD)

  # mond is wanted on all nodes, since every node needs monitoring
  if constants.ENABLE_MOND:
    ensure(constants.MOND)
def GetLuxiClient(try_restart):
  """Obtain a client connection to the luxi daemon.

  If the daemon cannot be reached and C{try_restart} is set, the luxi daemon
  is (re)started via L{utils.EnsureDaemon} and the connection is attempted
  exactly once more.

  @type try_restart: bool
  @param try_restart: Whether to attempt to restart the master daemon
  @return: whatever C{cli.GetClient} returns
  @raise NotMasterError: if C{cli.GetClient} fails with
      C{errors.OpPrereqError} on the first attempt
  @raise errors.GenericError: if the daemon could not be started

  """
  try:
    luxi_client = cli.GetClient()
  except errors.OpPrereqError as prereq_err:
    # cli.GetClient signals the "not on master" case with this error
    raise NotMasterError("Not on master node (%s)" % prereq_err)
  except (rpcerr.NoMasterError, rpcerr.TimeoutError) as conn_err:
    if try_restart:
      logging.warning("Luxi daemon seems to be down (%s), trying to restart",
                      conn_err)

      daemon_running = utils.EnsureDaemon(constants.LUXID)
      if daemon_running:
        # Second and last connection attempt; errors propagate unchanged
        return cli.GetClient()

      raise errors.GenericError("Can't start the master daemon")

    # Restart not requested: hand the original error to the caller
    raise
  else:
    return luxi_client
""" try: return cli.GetClient(query=query) except errors.OpPrereqError, err: # this is, from cli.GetClient, a not-master case raise NotMasterError("Not on master node (%s)" % err) except rpcerr.NoMasterError, err: if not try_restart: raise logging.warning( "Master daemon seems to be down (%s), trying to restart", err) if not utils.EnsureDaemon(constants.MASTERD): raise errors.GenericError("Can't start the master daemon") # Retry the connection return cli.GetClient(query=query) def _StartGroupChildren(cl, wait): """Starts a new instance of the watcher for every node group. """ assert not compat.any( arg.startswith(cli.NODEGROUP_OPT_NAME) for arg in sys.argv) result = cl.QueryGroups([], ["name", "uuid"], False)
""" try: return cli.GetClient() except errors.OpPrereqError, err: # this is, from cli.GetClient, a not-master case raise NotMasterError("Not on master node (%s)" % err) except rpcerr.NoMasterError, err: if not try_restart: raise logging.warning("Luxi daemon seems to be down (%s), trying to restart", err) if not utils.EnsureDaemon(constants.LUXID): raise errors.GenericError("Can't start the master daemon") # Retry the connection return cli.GetClient() def _StartGroupChildren(cl, wait): """Starts a new instance of the watcher for every node group. """ assert not compat.any( arg.startswith(cli.NODEGROUP_OPT_NAME) for arg in sys.argv) result = cl.QueryGroups([], ["name", "uuid"], False)