Example #1
0
def _GlobalWatcher(opts):
    """Main function for global watcher.

    Ensures the node-level daemons are running, runs the watcher hooks and
    (when due) node maintenance. If a luxi client can be obtained, i.e. on
    the master node, it additionally makes sure RAPI and WConfD are running
    and responding (restarting each at most once), runs the master check,
    archives jobs and, at the end, spawns child processes for every node
    group.

    @param opts: parsed command line options; C{job_age} and
        C{wait_children} are read here
    @return: C{constants.EXIT_SUCCESS}, on master and non-master nodes alike

    """
    StartNodeDaemons()
    RunWatcherHooks()

    # Run node maintenance in all cases, even if master, so that old masters can
    # be properly cleaned up
    if nodemaint.NodeMaintenance.ShouldRun():  # pylint: disable=E0602
        nodemaint.NodeMaintenance().Exec()  # pylint: disable=E0602

    try:
        client = GetLuxiClient(True)
    except NotMasterError:
        # Don't proceed on non-master nodes
        return constants.EXIT_SUCCESS

    # we are on master now
    utils.EnsureDaemon(constants.RAPI)
    utils.EnsureDaemon(constants.WCONFD)

    # If RAPI isn't responding to queries, try one restart
    logging.debug("Attempting to talk to remote API on %s",
                  constants.IP4_ADDRESS_LOCALHOST)
    rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)
    if not rapi_ok:
        logging.warning(
            "Couldn't get answer from remote API, restarting daemon")
        utils.StopDaemon(constants.RAPI)
        utils.EnsureDaemon(constants.RAPI)
        logging.debug("Second attempt to talk to remote API")
        rapi_ok = IsRapiResponding(constants.IP4_ADDRESS_LOCALHOST)

    # Report success only when one of the probes actually succeeded; a
    # failure is logged but does not abort the remaining master tasks
    if rapi_ok:
        logging.debug("Successfully talked to remote API")
    else:
        logging.fatal("RAPI is not responding")

    # If WConfD isn't responding to queries, try one restart
    logging.debug("Attempting to talk to WConfD")
    if not IsWconfdResponding():
        logging.warning("WConfD not responsive, restarting daemon")
        utils.StopDaemon(constants.WCONFD)
        utils.EnsureDaemon(constants.WCONFD)
        logging.debug("Second attempt to talk to WConfD")
        if not IsWconfdResponding():
            logging.fatal("WConfD is not responding")

    _CheckMaster(client)
    _ArchiveJobs(client, opts.job_age)

    # Spawn child processes for all node groups
    _StartGroupChildren(client, opts.wait_children)

    return constants.EXIT_SUCCESS
Example #2
0
def StartNodeDaemons():
    """Make sure the daemons expected on every node are running.

    The node daemon is always ensured; confd and mond are ensured only when
    the corresponding C{constants.ENABLE_*} flag is set.

    """
    ensure_daemon = utils.EnsureDaemon

    # The node daemon is needed regardless of whether this is the master
    ensure_daemon(constants.NODED)

    # confd runs everywhere too; on non candidates it stays in disabled mode
    if constants.ENABLE_CONFD:
        ensure_daemon(constants.CONFD)

    # Monitoring is wanted on all nodes, hence mond as well
    if constants.ENABLE_MOND:
        ensure_daemon(constants.MOND)
Example #3
0
def GetLuxiClient(try_restart):
    """Obtain a luxi client, optionally restarting the luxi daemon once.

    @type try_restart: bool
    @param try_restart: Whether to attempt to restart the master daemon
        when the connection attempt fails
    @return: the result of C{cli.GetClient()}
    @raise NotMasterError: if C{cli.GetClient} raises a prerequisite error
    @raise errors.GenericError: if C{utils.EnsureDaemon} reports failure for
        the luxi daemon

    """
    try:
        return cli.GetClient()
    except errors.OpPrereqError as prereq_err:
        # cli.GetClient signals the not-master case with this error
        raise NotMasterError("Not on master node (%s)" % prereq_err)
    except (rpcerr.NoMasterError, rpcerr.TimeoutError) as conn_err:
        if try_restart:
            logging.warning(
                "Luxi daemon seems to be down (%s), trying to restart",
                conn_err)

            if utils.EnsureDaemon(constants.LUXID):
                # Daemon is (back) up, retry the connection
                return cli.GetClient()

            raise errors.GenericError("Can't start the master daemon")

        # No restart requested: propagate the connection error unchanged
        raise
Example #4
0
  """
    try:
        return cli.GetClient(query=query)
    except errors.OpPrereqError, err:
        # this is, from cli.GetClient, a not-master case
        raise NotMasterError("Not on master node (%s)" % err)

    except rpcerr.NoMasterError, err:
        if not try_restart:
            raise

        logging.warning(
            "Master daemon seems to be down (%s), trying to restart", err)

        if not utils.EnsureDaemon(constants.MASTERD):
            raise errors.GenericError("Can't start the master daemon")

        # Retry the connection
        return cli.GetClient(query=query)


def _StartGroupChildren(cl, wait):
    """Starts a new instance of the watcher for every node group.

    @param cl: client object; its C{QueryGroups} method is used to list the
        node groups
    @param wait: presumably whether to wait for the spawned children (the
        global watcher passes C{opts.wait_children}) -- TODO confirm

    """
    # Sanity check: no command line argument of this process may start with
    # the node group option name
    assert not compat.any(
        arg.startswith(cli.NODEGROUP_OPT_NAME) for arg in sys.argv)

    # Request the "name" and "uuid" fields of the node groups; the empty list
    # presumably selects all groups and False presumably disables locking
    # (TODO confirm against the QueryGroups signature)
    result = cl.QueryGroups([], ["name", "uuid"], False)
Example #5
0
  """
    try:
        return cli.GetClient()
    except errors.OpPrereqError, err:
        # this is, from cli.GetClient, a not-master case
        raise NotMasterError("Not on master node (%s)" % err)

    except rpcerr.NoMasterError, err:
        if not try_restart:
            raise

        logging.warning("Luxi daemon seems to be down (%s), trying to restart",
                        err)

        if not utils.EnsureDaemon(constants.LUXID):
            raise errors.GenericError("Can't start the master daemon")

        # Retry the connection
        return cli.GetClient()


def _StartGroupChildren(cl, wait):
    """Starts a new instance of the watcher for every node group.

    @param cl: client object; its C{QueryGroups} method is used to list the
        node groups
    @param wait: presumably whether to wait for the spawned children (the
        global watcher passes C{opts.wait_children}) -- TODO confirm

    """
    # Sanity check: no command line argument of this process may start with
    # the node group option name
    assert not compat.any(
        arg.startswith(cli.NODEGROUP_OPT_NAME) for arg in sys.argv)

    # Request the "name" and "uuid" fields of the node groups; the empty list
    # presumably selects all groups and False presumably disables locking
    # (TODO confirm against the QueryGroups signature)
    result = cl.QueryGroups([], ["name", "uuid"], False)