Example #1
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu',
                                                  'db_path'))  # noqa
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:

            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:

                            def blackhole(*args, **kwargs):
                                pass

                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)
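
The NullPersister defined above is the fallback used when SQLAlchemy or a db_path is unavailable: every public method call on it silently becomes a no-op, so the rest of the manager can call persister methods unconditionally. A minimal, self-contained sketch of the same blackhole pattern (the class and method names here are illustrative, not Calamari's):

class NullSink(object):
    """Turns any public attribute it does not actually have into a no-op callable."""

    def __getattribute__(self, item):
        if item.startswith('_'):
            return object.__getattribute__(self, item)
        try:
            return object.__getattribute__(self, item)
        except AttributeError:
            def blackhole(*args, **kwargs):
                pass
            return blackhole

sink = NullSink()
sink.save("anything")   # silently does nothing instead of raising AttributeError
sink.flush()            # also a no-op
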
Example #2
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))  # noqa
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer, self.requests)
Example #3
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = DiscoveryThread(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))
            Session.configure(bind=engine)

            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            log.error("Database error: %s" % e)
            raise

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer)
Example #4
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = DiscoveryThread(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))
            Session.configure(bind=engine)

            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            log.error("Database error: %s" % e)
            raise

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer)
Example #5
    def cancel(self, request_id):
        """
        Immediately mark a request as cancelled, and in the background
        try and cancel any outstanding JID for it.
        """
        request = self._by_request_id[request_id]

        # Idempotent behaviour: no-op if already cancelled
        if request.state == request.COMPLETE:
            return

        with self._update_index(request):
            # I will take over cancelling the JID from the request
            cancel_jid = request.jid
            request.jid = None

            # Request is now done, no further calls
            request.set_error("Cancelled")
            request.complete()

            # In the background, try to cancel the request's JID on a best-effort basis
            if cancel_jid:
                client = LocalClient(config.get('cthulhu', 'salt_config_path'))
                client.run_job(request.minion_id, 'saltutil.kill_job',
                               [cancel_jid])
Example #6
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(
            minion_id, 'ceph.get_cluster_object',
            condition_kwarg(
                [], {
                    'cluster_name': self._cluster_name,
                    'sync_type': sync_type.str,
                    'since': None
                }))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" %
                      (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should always
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" %
                      (pub_data['jid'], pub_data['minions']))
Example #7
    def __init__(self):
        super(Persister, self).__init__()

        self._queue = gevent.queue.Queue()
        self._complete = gevent.event.Event()

        self._session = Session()

        # Plumb the sqlalchemy logger into our cthulhu logger's output
        logging.getLogger("sqlalchemy.engine").setLevel(logging.getLevelName(config.get("cthulhu", "db_log_level")))
        for handler in log.handlers:
            logging.getLogger("sqlalchemy.engine").addHandler(handler)
Example #8
    def __init__(self):
        super(Persister, self).__init__()

        self._queue = gevent.queue.Queue()
        self._complete = gevent.event.Event()

        self._session = Session()

        # Plumb the sqlalchemy logger into our cthulhu logger's output
        logging.getLogger('sqlalchemy.engine').setLevel(
            logging.getLevelName(config.get('cthulhu', 'db_log_level')))
        for handler in log.handlers:
            logging.getLogger('sqlalchemy.engine').addHandler(handler)
Example #9
    def _submit(self):
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(self._minion_id, self._cmd, self._args)
        if not pub_data:
            # FIXME: LocalClient uses 'print' to record the
            # details of what went wrong :-(
            raise PublishError("Failed to publish job")

        self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))

        self.alive_at = now()
        self.jid = pub_data['jid']

        return self.jid
Example #10
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" %
                      len(self._by_jid))

        # Identify JIDs that haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(
                    seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" %
                          (request.id, request.jid, _now, request.alive_at))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs, next tick we
        # will see if we got updates to the alive_at attribute to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".
                     format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions),
                                      'saltutil.running', [],
                                      expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(
                    query_minions))
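
The staleness rule applied by tick() is simply "no saltutil.running response for more than three tick periods". Expressed as an isolated predicate (the TICK_PERIOD value here is illustrative; the real one lives in the request_collection module):

import datetime

TICK_PERIOD = 20  # illustrative value only

def is_stale(alive_at, now):
    """True once a request has gone more than three tick periods without
    a saltutil.running response, matching the check in tick() above."""
    return now - alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3)
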
Example #11
def clear(args):
    if not args.yes_i_am_sure:
        log.warn("This will remove all stored Calamari monitoring status and history.  Use '--yes-i-am-sure' to proceed")
        return

    log.info("Loading configuration..")
    config = CalamariConfig()

    log.info("Dropping tables")
    db_path = config.get('cthulhu', 'db_path')
    engine = create_engine(db_path)
    Base.metadata.drop_all(engine)
    Base.metadata.reflect(engine)
    if ALEMBIC_TABLE in Base.metadata.tables:
        Base.metadata.tables[ALEMBIC_TABLE].drop(engine)
    log.info("Complete.  Now run `%s initialize`" % os.path.basename(sys.argv[0]))
Example #12
def update_connected_minions():
    from cthulhu.manager import config
    from calamari_common.salt_wrapper import Key, master_config
    if len(Key(master_config(config.get('cthulhu', 'salt_config_path'))).list_keys()['minions']) == 0:
        # no minions to update
        return

    message = "Updating already connected nodes."
    log.info(message)
    p = subprocess.Popen(["salt", "*", "state.highstate"],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    log.debug("{message} salt stdout: {out}".format(message=message, out=out))
    log.debug("{message} salt stderr: {err}".format(message=message, err=err))
    if p.returncode != 0:
        raise RuntimeError("{message} failed with rc={rc}".format(message=message, rc=p.returncode))
Example #13
    def _submit(self, commands):
        self.log.debug("Request._submit: %s/%s/%s" % (self._minion_id, self._cluster_name, commands))

        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(self._minion_id, 'ceph.rados_commands',
                                  [self._fsid, self._cluster_name, commands])
        if not pub_data:
            # FIXME: LocalClient uses 'print' to record the
            # details of what went wrong :-(
            raise PublishError("Failed to publish job")

        self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))

        self.alive_at = now()
        self.jid = pub_data['jid']

        return self.jid
Example #14
    def cancel(self, request_id):
        """
        Immediately mark a request as cancelled, and in the background
        try and cancel any outstanding JID for it.
        """
        request = self._by_request_id[request_id]
        with self._update_index(request):
            request.set_error("Cancelled")
            request.complete()

            if request.jid:
                client = LocalClient(config.get('cthulhu', 'salt_config_path'))
                client.run_job(request.minion_id, 'saltutil.kill_job',
                               [request.jid])
                # We don't check for completion or errors from kill_job; it's best-effort.  If we're
                # cancelling something we will do our best to kill any subprocess, but we can't make
                # any guarantees because running nodes may be out of touch with the calamari server.
                request.jid = None
Example #15
    def cancel(self, request_id):
        """
        Immediately mark a request as cancelled, and in the background
        try and cancel any outstanding JID for it.
        """
        request = self._by_request_id[request_id]
        with self._update_index(request):
            request.set_error("Cancelled")
            request.complete()

            if request.jid:
                client = LocalClient(config.get('cthulhu', 'salt_config_path'))
                client.run_job(request.minion_id, 'saltutil.kill_job',
                               [request.jid])
                # We don't check for completion or errors from kill_job; it's best-effort.  If we're
                # cancelling something we will do our best to kill any subprocess, but we can't make
                # any guarantees because running nodes may be out of touch with the calamari server.
                request.jid = None
Example #16
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                                  condition_kwarg([], {'cluster_name': self._cluster_name,
                                                       'sync_type': sync_type.str,
                                                       'since': None}))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should always
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" % (pub_data['jid'], pub_data['minions']))
Example #17
def clear(args):
    if not args.yes_i_am_sure:
        log.warn(
            "This will remove all stored Calamari monitoring status and history.  Use '--yes-i-am-sure' to proceed"
        )
        return

    log.info("Loading configuration..")
    config = CalamariConfig()

    log.info("Dropping tables")
    db_path = config.get('cthulhu', 'db_path')
    engine = create_engine(db_path)
    Base.metadata.drop_all(engine)
    Base.metadata.reflect(engine)
    if ALEMBIC_TABLE in Base.metadata.tables:
        Base.metadata.tables[ALEMBIC_TABLE].drop(engine)
    log.info("Complete.  Now run `%s initialize`" %
             os.path.basename(sys.argv[0]))
Example #18
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

        # Identify JIDs that haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                    request.id, request.jid, _now, request.alive_at
                ))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs, next tick we
        # will see if we got updates to the alive_at attribute to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
Example #19
    def on_tick(self):
        # This procedure is to catch the annoying case of AES key changes (#7836), which are otherwise
        # ignored by minions which are doing only minion->master messaging.  To ensure they
        # pick up on key changes, we actively send them something (doesn't matter what).  To
        # avoid doing this constantly, we only send things to minions which seem to be a little
        # late

        # After this length of time, doubt a minion enough to send it a message in case
        # it needs a kick to update its key
        def _ping_period(fqdn):
            return datetime.timedelta(seconds=self.get_contact_period(fqdn) * 2)

        t = now()
        late_servers = [s.fqdn for s in self.servers.values() if s.last_contact and (t - s.last_contact) > _ping_period(s.fqdn)]
        log.debug("late servers: %s" % late_servers)
        if late_servers:
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub = client.pub(late_servers, "test.ping", expr_form='list')
            log.debug(pub)
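
The "late server" test above only pings servers that have reported in at least once and have then been silent for more than twice their expected contact period. Restated as a standalone predicate (the helper name and arguments are hypothetical):

import datetime

def is_late(last_contact, contact_period_seconds, now):
    """Mirror of the list comprehension in on_tick: late means a known
    server silent for more than twice its expected contact period."""
    if last_contact is None:
        return False
    return now - last_contact > datetime.timedelta(seconds=contact_period_seconds * 2)
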
Example #20
def update_connected_minions():
    from cthulhu.manager import config
    from calamari_common.salt_wrapper import Key, master_config
    if len(
            Key(master_config(config.get(
                'cthulhu', 'salt_config_path'))).list_keys()['minions']) == 0:
        # no minions to update
        return

    message = "Updating already connected nodes."
    log.info(message)
    p = subprocess.Popen(["salt", "*", "state.highstate"],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    log.debug("{message} salt stdout: {out}".format(message=message, out=out))
    log.debug("{message} salt stderr: {err}".format(message=message, err=err))
    if p.returncode != 0:
        raise RuntimeError("{message} failed with rc={rc}".format(
            message=message, rc=p.returncode))
Example #21
    def _submit(self, commands=None):
        if commands is None:
            commands = self._commands

        self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__,
                                                 self._minion_id, self._cluster_name, commands))

        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(self._minion_id, 'ceph.rados_commands',
                                  [self.fsid, self._cluster_name, commands])
        if not pub_data:
            # FIXME: LocalClient uses 'print' to record the
            # details of what went wrong :-(
            raise PublishError("Failed to publish job")

        self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))

        self.alive_at = now()
        self.jid = pub_data['jid']

        return self.jid
Example #22
    def on_tick(self):
        # This procedure is to catch the annoying case of AES key changes (#7836), which are otherwise
        # ignored by minions which are doing only minion->master messaging.  To ensure they
        # pick up on key changes, we actively send them something (doesn't matter what).  To
        # avoid doing this constantly, we only send things to minions which seem to be a little
        # late

        # After this length of time, doubt a minion enough to send it a message in case
        # it needs a kick to update its key
        def _ping_period(fqdn):
            return datetime.timedelta(seconds=self.get_contact_period(fqdn) *
                                      2)

        t = now()
        late_servers = [
            s.fqdn for s in self.servers.values()
            if s.last_contact and (t - s.last_contact) > _ping_period(s.fqdn)
        ]
        log.debug("late servers: %s" % late_servers)
        if late_servers:
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub = client.pub(late_servers, "test.ping", expr_form='list')
            log.debug(pub)
Example #23
    def load_plugins(self):

        """
        Try to load a status_processor from each module in plugin_path, store keyed by module_name
        """
        loaded_plugins = []
        # FIXME this assumes that plugin_path has been added to PYTHONPATH and/or is in site-packages
        plugin_path = config.get('cthulhu', 'plugin_path')

        if os.path.exists(plugin_path):
            for plugin in os.listdir(plugin_path):
                plugin = plugin.split('.')[0]
                if plugin in ('__init__', 'README'):
                    continue

                status_processor = None
                try:
                    plugin_module = importlib.import_module('.'.join((plugin, 'status_processor')))
                    status_processor = plugin_module.StatusProcessor()
                except ImportError as e:
                    log.info("Error importing plugin %s %s" % (plugin, str(e)))

                if status_processor is not None:
                    loaded_plugins.append((plugin, status_processor))
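
load_plugins expects each entry under plugin_path to be an importable package that contains a status_processor module exposing a StatusProcessor class. Based only on the loading code shown, a minimal plugin skeleton would look like this (the package name is hypothetical):

# <plugin_path>/example_plugin/__init__.py        (empty)
# <plugin_path>/example_plugin/status_processor.py

class StatusProcessor(object):
    """Minimal hook; the loader above only needs to import and instantiate it."""
    pass
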
Example #24
from cthulhu.manager import config
from cthulhu.util import now
from distutils.util import strtobool

# The tick handler is very cheap (no I/O) so we call
# it quite frequently.
TICK_SECONDS = 10

# The time-based checks don't kick in until after
# a grace period, to avoid generating complaints
# about "stale" timestamps immediately after startup
GRACE_PERIOD = 30

# How long must a [server|cluster] be out of contact before
# we generate an event?
CONTACT_THRESHOLD_FACTOR = int(config.get(
    'cthulhu', 'server_timeout_factor'))  # multiple of contact period
CLUSTER_CONTACT_THRESHOLD = int(
    config.get('cthulhu', 'cluster_contact_threshold'))  # in seconds

MINION_CONFIG = str(config.get('cthulhu', 'salt_config_path')).replace(
    'master', 'minion')
EMIT_EVENTS_TO_SALT_EVENT_BUS = bool(
    strtobool(config.get('cthulhu', 'emit_events_to_salt_event_bus')))
EVENT_TAG_PREFIX = str(config.get('cthulhu', 'event_tag_prefix'))

if EMIT_EVENTS_TO_SALT_EVENT_BUS:
    try:
        # TODO move this to import
        # from calamari_common import Caller
        import salt.client
    except ImportError as e:
        EMIT_EVENTS_TO_SALT_EVENT_BUS = False
        log.error("Could not import salt.client: %s. Events cannot be emitted to salt event bus", str(e))
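
EMIT_EVENTS_TO_SALT_EVENT_BUS above is parsed with distutils' strtobool, which maps the usual truthy/falsy config strings to 1 or 0; wrapping it in bool() yields a real boolean. For example:

from distutils.util import strtobool

bool(strtobool("yes"))    # True
bool(strtobool("True"))   # True
bool(strtobool("0"))      # False
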
Example #25
from calamari_common.db.event import Event, ERROR, WARNING, RECOVERY, INFO, severity_str
from cthulhu.util import now


# The tick handler is very cheap (no I/O) so we call
# it quite frequently.
TICK_SECONDS = 10

# The time-based checks don't kick in until after
# a grace period, to avoid generating complaints
# about "stale" timestamps immediately after startup
GRACE_PERIOD = 30

# How long must a [server|cluster] be out of contact before
# we generate an event?
CONTACT_THRESHOLD_FACTOR = int(config.get('cthulhu', 'server_timeout_factor'))  # multiple of contact period
CLUSTER_CONTACT_THRESHOLD = int(config.get('cthulhu', 'cluster_contact_threshold'))  # in seconds


class Eventer(gevent.greenlet.Greenlet):
    """
    I listen to changes from ClusterMonitor and ServerMonitor, and feed
    events into the event log.  I also periodically check some time-based
    conditions in my on_tick method.
    """

    def __init__(self, manager):
        super(Eventer, self).__init__()
        self._manager = manager

        self._complete = gevent.event.Event()
Example #26
from sqlalchemy.orm import sessionmaker
from cthulhu.manager import config

from cthulhu.persistence.sync_objects import SyncObject
from cthulhu.persistence.servers import Server, Service

from cthulhu.util import now
from cthulhu.log import log

Session = sessionmaker()

DeferredCall = namedtuple("DeferredCall", ["fn", "args", "kwargs"])


CLUSTER_MAP_RETENTION = datetime.timedelta(seconds=int(config.get("cthulhu", "cluster_map_retention")))


class Persister(gevent.greenlet.Greenlet):
    """
    Asynchronously persist a queue of updates.  This is for use by classes
    that maintain the primary copy of state in memory, but also lazily update
    the DB so that they can recover from it on restart.
    """

    def __init__(self):
        super(Persister, self).__init__()

        self._queue = gevent.queue.Queue()
        self._complete = gevent.event.Event()
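
DeferredCall above is just a record of a callable plus its arguments, so a write can be queued immediately and executed later when the Persister greenlet drains its queue. A small illustration of the pattern outside Calamari (the save function is made up):

from collections import namedtuple

DeferredCall = namedtuple("DeferredCall", ["fn", "args", "kwargs"])

def save(record, flush=False):
    print("saving %r (flush=%s)" % (record, flush))

queued = DeferredCall(fn=save, args=("server-123",), kwargs={"flush": True})
# ... later, when the queue is drained:
queued.fn(*queued.args, **queued.kwargs)
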
Example #27
from gevent import event
import salt.utils.event
import salt.utils.master
from salt.client import LocalClient

from cthulhu.gevent_util import nosleep
from cthulhu.log import log as cthulhu_log
from cthulhu.manager import salt_config, config

# The type name for hosts and osds in the CRUSH map (if users have their
# own crush map they may have changed this), Ceph defaults are 'host' and 'osd'
from calamari_common.types import OsdMap, MonMap, ServiceId
from cthulhu.persistence.servers import Server, Service
from cthulhu.util import now, SaltEventSource

CRUSH_HOST_TYPE = config.get('cthulhu', 'crush_host_type')
CRUSH_OSD_TYPE = config.get('cthulhu', 'crush_osd_type')

TICK_PERIOD = 10

# Ignore changes in boot time below this threshold, to avoid mistaking clock
# adjustments for reboots.
REBOOT_THRESHOLD = datetime.timedelta(seconds=10)

# getChild isn't in 2.6
log = logging.getLogger('.'.join((cthulhu_log.name, 'server_monitor')))


class GrainsNotFound(Exception):
    pass
Example #28
from sqlalchemy.orm import sessionmaker
from cthulhu.manager import config

from cthulhu.persistence.sync_objects import SyncObject
from cthulhu.persistence.servers import Server, Service

from cthulhu.util import now
from cthulhu.log import log

Session = sessionmaker()

DeferredCall = namedtuple('DeferredCall', ['fn', 'args', 'kwargs'])

CLUSTER_MAP_RETENTION = datetime.timedelta(
    seconds=int(config.get('cthulhu', 'cluster_map_retention')))


class Persister(gevent.greenlet.Greenlet):
    """
    Asynchronously persist a queue of updates.  This is for use by classes
    that maintain the primary copy of state in memory, but also lazily update
    the DB so that they can recover from it on restart.
    """
    def __init__(self):
        super(Persister, self).__init__()

        self._queue = gevent.queue.Queue()
        self._complete = gevent.event.Event()

        self._session = Session()
Example #29
    def _salt_key(self):
        return Key(master_config(config.get("cthulhu", "salt_config_path")))
Example #30
def initialize(args):
    """
    This command exists to:

    - Prevent the user having to type more than one thing
    - Prevent the user seeing internals like 'manage.py' which we would
      rather people were not messing with on production systems.
    """
    log.info("Loading configuration..")
    config = CalamariConfig()

    # Generate django's SECRET_KEY setting
    # Do this first, otherwise subsequent django ops will raise ImproperlyConfigured.
    # Write into a file instead of directly, so that package upgrades etc won't spuriously
    # prompt for modified config unless it really is modified.
    if not os.path.exists(config.get('calamari_web', 'secret_key_path')):
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*(-_=+)'
        open(config.get('calamari_web', 'secret_key_path'), 'w').write(get_random_string(50, chars))

    run_local_salt(sls=RELAX_SALT_PERMS_SLS, message='salt')
    run_local_salt(sls=POSTGRES_SLS, message='postgres')

    # Cthulhu's database
    db_path = config.get('cthulhu', 'db_path')
    engine = create_engine(db_path)
    Base.metadata.reflect(engine)
    alembic_config = AlembicConfig()
    if ALEMBIC_TABLE in Base.metadata.tables:
        log.info("Updating database...")
        # Database already populated, migrate forward
        command.upgrade(alembic_config, "head")
    else:
        log.info("Initializing database...")
        # Blank database, do initial population
        Base.metadata.create_all(engine)
        command.stamp(alembic_config, "head")

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "calamari_web.settings")

    # Django's database
    with quiet():
        execute_from_command_line(["", "syncdb", "--noinput"])

    create_default_roles()
    create_admin_users(args)
    log.info("Initializing web interface...")

    # Django's static files
    with quiet():
        execute_from_command_line(["", "collectstatic", "--noinput"])

    # Because we've loaded Django, it will have written log files as
    # this user (probably root).  Fix it so that apache can write them later.
    apache_user = pwd.getpwnam(config.get('calamari_web', 'username'))
    os.chown(config.get('calamari_web', 'log_path'), apache_user.pw_uid, apache_user.pw_gid)

    # Handle SQLite case, otherwise no chown is needed
    if config.get('calamari_web', 'db_engine').endswith("sqlite3"):
        os.chown(config.get('calamari_web', 'db_name'), apache_user.pw_uid, apache_user.pw_gid)

    # Start services, configure to run on boot
    run_local_salt(sls=SERVICES_SLS, message='services')

    # During an upgrade: update minions that were connected previously
    update_connected_minions()

    # Signal supervisor to restart cthulhu as we have created its database
    log.info("Restarting services...")
    subprocess.call(['supervisorctl', 'restart', 'cthulhu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # TODO: optionally generate or install HTTPS certs + hand to apache
    log.info("Complete.")
Example #31
def initialize(args):
    """
    This command exists to:

    - Prevent the user having to type more than one thing
    - Prevent the user seeing internals like 'manage.py' which we would
      rather people were not messing with on production systems.
    """
    log.info("Loading configuration..")
    config = CalamariConfig()

    # Generate django's SECRET_KEY setting
    # Do this first, otherwise subsequent django ops will raise ImproperlyConfigured.
    # Write into a file instead of directly, so that package upgrades etc won't spuriously
    # prompt for modified config unless it really is modified.
    if not os.path.exists(config.get('calamari_web', 'secret_key_path')):
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*(-_=+)'
        open(config.get('calamari_web', 'secret_key_path'),
             'w').write(get_random_string(50, chars))

    run_local_salt(sls=RELAX_SALT_PERMS_SLS, message='salt')
    run_local_salt(sls=POSTGRES_SLS, message='postgres')

    # Cthulhu's database
    db_path = config.get('cthulhu', 'db_path')
    engine = create_engine(db_path)
    Base.metadata.reflect(engine)
    alembic_config = AlembicConfig()
    if ALEMBIC_TABLE in Base.metadata.tables:
        log.info("Updating database...")
        # Database already populated, migrate forward
        command.upgrade(alembic_config, "head")
    else:
        log.info("Initializing database...")
        # Blank database, do initial population
        Base.metadata.create_all(engine)
        command.stamp(alembic_config, "head")

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "calamari_web.settings")

    # Django's database
    with quiet():
        execute_from_command_line(["", "syncdb", "--noinput"])

    create_default_roles()
    create_admin_users(args)
    log.info("Initializing web interface...")

    # Django's static files
    with quiet():
        execute_from_command_line(["", "collectstatic", "--noinput"])

    # Because we've loaded Django, it will have written log files as
    # this user (probably root).  Fix it so that apache can write them later.
    apache_user = pwd.getpwnam(config.get('calamari_web', 'username'))
    os.chown(config.get('calamari_web', 'log_path'), apache_user.pw_uid,
             apache_user.pw_gid)

    # Handle SQLite case, otherwise no chown is needed
    if config.get('calamari_web', 'db_engine').endswith("sqlite3"):
        os.chown(config.get('calamari_web', 'db_name'), apache_user.pw_uid,
                 apache_user.pw_gid)

    # Start services, configure to run on boot
    run_local_salt(sls=SERVICES_SLS, message='services')

    # During an upgrade: update minions that were connected previously
    update_connected_minions()

    # Signal supervisor to restart cthulhu as we have created its database
    log.info("Restarting services...")
    subprocess.call(['supervisorctl', 'restart', 'cthulhu'],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)

    # TODO: optionally generate or install HTTPS certs + hand to apache
    log.info("Complete.")
Example #32
    def bind(self):
        log.info("%s bind..." % self.__class__.__name__)
        self._server.bind(config.get('cthulhu', 'rpc_url'))
        self._bound = True
Example #33
    def get_server_log(self, fqdn, log_path, lines):
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        results = client.cmd(fqdn, "log_tail.tail", [log_path, lines])
        return results
Example #34
from gevent import greenlet
from gevent import event

from cthulhu.gevent_util import nosleep
from cthulhu.log import log as cthulhu_log
from cthulhu.manager import salt_config, config

# The type name for hosts and osds in the CRUSH map (if users have their
# own crush map they may have changed this), Ceph defaults are 'host' and 'osd'
from calamari_common.types import OsdMap, MonMap, ServiceId
from calamari_common.salt_wrapper import SaltEventSource, MasterPillarUtil
from cthulhu.persistence.servers import Server, Service
from cthulhu.util import now

CRUSH_HOST_TYPE = config.get('cthulhu', 'crush_host_type')
CRUSH_OSD_TYPE = config.get('cthulhu', 'crush_osd_type')

# Ignore changes in boot time below this threshold, to avoid mistaking clock
# adjustments for reboots.
REBOOT_THRESHOLD = datetime.timedelta(seconds=10)


# getChild isn't in 2.6
log = logging.getLogger('.'.join((cthulhu_log.name, 'server_monitor')))


class GrainsNotFound(Exception):
    pass

Example #35
    def _salt_key(self):
        return Key(master_config(config.get('cthulhu', 'salt_config_path')))
Example #36
    def get_server_log(self, fqdn, log_path, lines):
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        results = client.cmd(fqdn, "log_tail.tail", [log_path, lines])
        return results
Example #37
    def list_server_logs(self, fqdn):
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        results = client.cmd(fqdn, "log_tail.list_logs", ["."])
        log.debug('list_server_log result !!! {results}'.format(results=str(results)))
        return results
Example #38
    def bind(self):
        log.info("%s bind..." % self.__class__.__name__)
        self._server.bind(config.get('cthulhu', 'rpc_url'))
        self._bound = True
Example #39
    def _salt_key(self):
        return Key(master_config(config.get('cthulhu', 'salt_config_path')))
Example #40
    def salt_client(self):
        return salt.client.LocalClient(config.get('cthulhu', 'salt_config_path'))
Example #41
from calamari_common.salt_wrapper import condition_kwarg, LocalClient, SaltEventSource

from cthulhu.gevent_util import nosleep, nosleep_mgr
from cthulhu.log import log
from cthulhu.manager.crush_node_request_factory import CrushNodeRequestFactory
from cthulhu.manager.crush_request_factory import CrushRequestFactory
from cthulhu.manager.osd_request_factory import OsdRequestFactory
from cthulhu.manager.pool_request_factory import PoolRequestFactory
from cthulhu.manager.plugin_monitor import PluginMonitor
from calamari_common.types import CRUSH_NODE, CRUSH_MAP, SYNC_OBJECT_STR_TYPE, SYNC_OBJECT_TYPES, OSD, POOL, OsdMap, MdsMap, MonMap
from cthulhu.manager import config, salt_config
from cthulhu.util import now


FAVORITE_TIMEOUT_FACTOR = int(config.get('cthulhu', 'favorite_timeout_factor'))


class ClusterUnavailable(Exception):
    pass


class SyncObjects(object):
    """
    A collection of versioned objects, keyed by their class (which
    must be a SyncObject subclass).

    The objects are immutable, so it is safe to hand out references: new
    versions are new objects.
    """
Example #42
from cthulhu.util import now
from distutils.util import strtobool


# The tick handler is very cheap (no I/O) so we call
# it quite frequently.
TICK_SECONDS = 10

# The time-based checks don't kick in until after
# a grace period, to avoid generating complaints
# about "stale" timestamps immediately after startup
GRACE_PERIOD = 30

# How long must a [server|cluster] be out of contact before
# we generate an event?
CONTACT_THRESHOLD_FACTOR = int(config.get('cthulhu', 'server_timeout_factor'))  # multiple of contact period
CLUSTER_CONTACT_THRESHOLD = int(config.get('cthulhu', 'cluster_contact_threshold'))  # in seconds

MINION_CONFIG = str(config.get('cthulhu', 'salt_config_path')).replace('master', 'minion')
EMIT_EVENTS_TO_SALT_EVENT_BUS = bool(strtobool(config.get('cthulhu', 'emit_events_to_salt_event_bus')))
EVENT_TAG_PREFIX = str(config.get('cthulhu', 'event_tag_prefix'))


if EMIT_EVENTS_TO_SALT_EVENT_BUS:
    try:
        # TODO move this to import
        # from calamari_common import Caller
        import salt.client
    except ImportError as e:
        EMIT_EVENTS_TO_SALT_EVENT_BUS = False
        log.error("Could not import salt.client: %s. Events cannot be emitted to salt event bus", str(e))
Example #43
from cthulhu.manager import config
from calamari_common.db.event import Event, ERROR, WARNING, RECOVERY, INFO, severity_str
from cthulhu.util import now

# The tick handler is very cheap (no I/O) so we call
# it quite frequently.
TICK_SECONDS = 10

# The time-based checks don't kick in until after
# a grace period, to avoid generating complaints
# about "stale" timestamps immediately after startup
GRACE_PERIOD = 30

# How long must a [server|cluster] be out of contact before
# we generate an event?
CONTACT_THRESHOLD_FACTOR = int(config.get(
    'cthulhu', 'server_timeout_factor'))  # multiple of contact period
CLUSTER_CONTACT_THRESHOLD = int(
    config.get('cthulhu', 'cluster_contact_threshold'))  # in seconds


class Eventer(gevent.greenlet.Greenlet):
    """
    I listen to changes from ClusterMonitor and ServerMonitor, and feed
    events into the event log.  I also periodically check some time-based
    conditions in my on_tick method.
    """
    def __init__(self, manager):
        super(Eventer, self).__init__()
        self._manager = manager

        self._complete = gevent.event.Event()
Example #44
    def list_server_logs(self, fqdn):
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        results = client.cmd(fqdn, "log_tail.list_logs", ["."])
        log.debug('list_server_log result !!! {results}'.format(
            results=str(results)))
        return results