Example #1: Manager.__init__ with optional SQLAlchemy persistence and a NullPersister fallback
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(db_path)
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:

            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:

                            def blackhole(*args, **kwargs):
                                pass

                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)
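
The NullPersister above is worth a note: its __getattribute__ override resolves any public attribute that is not explicitly defined to a do-nothing callable, so the rest of Manager can call persister methods unconditionally when persistence is disabled. A minimal sketch of the same pattern in isolation (the class and the method being called are illustrative, not calamari APIs):

class Blackhole(object):
    def __getattribute__(self, item):
        # Private attributes resolve normally; any other missing
        # attribute becomes a no-op function instead of raising.
        if item.startswith('_'):
            return object.__getattribute__(self, item)
        try:
            return object.__getattribute__(self, item)
        except AttributeError:
            def blackhole(*args, **kwargs):
                pass
            return blackhole

sink = Blackhole()
sink.update_sync_object('fsid', 'osd_map', {})  # silently ignored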
Example #2: TestEventer covering OSD added/removed events
class TestEventer(TestCase):
    def setUp(self):
        self.eventer = Eventer(MagicMock())

    def tearDown(self):
        pass

    @skipIf(
        os.environ.get('CALAMARI_CONFIG') is None, "needs CALAMARI_CONFIG set")
    def testCreateManager(self):
        assert self.eventer is not None

    def test_that_it_emits_deleted_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        old.data = {}
        old.data['osds'] = [{'osd': 0}]
        self.eventer._on_osd_map(12345, new, old)
        self.assertIn(
            'removed from the cluster map',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

    def test_that_it_emits_added_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        new.data = {}
        new.data['osds'] = [{'osd': 0}]
        self.eventer._on_osd_map(12345, new, old)
        self.assertIn(
            'added to the cluster map',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))
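
Both tests use the same recipe: replace Eventer._emit with a MagicMock, trigger the map handler, then search the stringified call list for the expected message. The inspection step on its own, using mock's real call_args_list attribute (the message text here is just an example):

from mock import MagicMock  # unittest.mock on Python 3

emit = MagicMock()
emit('osd.0 removed from the cluster map')
recorded = '\n'.join(str(call) for call in emit.call_args_list)
assert 'removed from the cluster map' in recorded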
Example #3: Manager.__init__ with mandatory persistence (raises on a database error)
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = DiscoveryThread(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))
            Session.configure(bind=engine)

            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            log.error("Database error: %s" % e)
            raise

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer)
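
The except clause exists because create_engine() validates its connection string eagerly: an unparseable URL raises sqlalchemy.exc.ArgumentError before any connection is attempted. A minimal reproduction, assuming sqlalchemy is installed:

import sqlalchemy
from sqlalchemy import create_engine

try:
    create_engine('not-a-valid-database-url')
except sqlalchemy.exc.ArgumentError as e:
    print('Database error: %s' % e)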
Example #4: TestEventer covering OSD added/removed events (single-line assertions)
class TestEventer(TestCase):
    def setUp(self):
        self.eventer = Eventer(MagicMock())

    def tearDown(self):
        pass

    @skipIf(os.environ.get('CALAMARI_CONFIG') is None, "needs CALAMARI_CONFIG set")
    def testCreateManager(self):
        assert self.eventer is not None

    def test_that_it_emits_deleted_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        old.data = {}
        old.data['osds'] = [{'osd': 0}]
        self.eventer._on_osd_map(12345, new, old)
        self.assertIn('removed from the cluster map', '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

    def test_that_it_emits_added_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        new.data = {}
        new.data['osds'] = [{'osd': 0}]
        self.eventer._on_osd_map(12345, new, old)
        self.assertIn('added to the cluster map', '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))
Example #5: Manager.__init__ with mandatory persistence, identical to Example #3
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = DiscoveryThread(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))
            Session.configure(bind=engine)

            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            log.error("Database error: %s" % e)
            raise

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer)
Example #6: Manager.__init__ with optional persistence, a compact variant of Example #1
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(db_path)
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer, self.requests)
Example #7: TestEventer.setUp
    def setUp(self):
        self.eventer = Eventer(MagicMock())
Example #8: TestEventer including quorum-status and pool event tests
class TestEventer(TestCase):
    def setUp(self):
        self.eventer = Eventer(MagicMock())

    def tearDown(self):
        pass

    @skipIf(
        os.environ.get('CALAMARI_CONFIG') is None, "needs CALAMARI_CONFIG set")
    def testCreateManager(self):
        assert self.eventer is not None

    def test_that_it_emits_deleted_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        old.data = {}
        old.data['osds'] = [{'osd': 0}]
        self.eventer._on_osd_map(12345, new, old)
        self.assertIn(
            'removed from the cluster map',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

    def test_that_it_emits_added_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        new.data = {}
        new.data['osds'] = [{'osd': 0}]
        self.eventer._on_osd_map(12345, new, old)
        self.assertIn(
            'added to the cluster map',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

    @patch('cthulhu.manager.eventer.salt.client')
    def test_that_it_emits_quorum_status_events(self, client):
        new = MagicMock()
        old = MagicMock()
        old.data = {
            "election_epoch": 2,
            "monmap": {
                "created": "0.000000",
                "epoch": 1,
                "fsid": "fc0dc0f5-fe35-48c1-8c9c-f2ae0770fce7",
                "modified": "0.000000",
                "mons": [{
                    "addr": "198.199.75.124:6789/0",
                    "name": "vagrant-ubuntu-trusty-64",
                    "rank": 0
                }]
            },
            "quorum": [0],
            "quorum_leader_name": "",
            "quorum_names": ["vagrant-ubuntu-trusty-64"]
        }

        new.data = {
            "election_epoch": 2,
            "monmap": {
                "created": "0.000000",
                "epoch": 1,
                "fsid": "fc0dc0f5-fe35-48c1-8c9c-f2ae0770fce7",
                "modified": "0.000000",
                "mons": [{
                    "addr": "198.199.75.124:6789/0",
                    "name": "vagrant-ubuntu-trusty-64",
                    "rank": 0
                }]
            },
            "quorum": [0],
            "quorum_leader_name": "vagrant-ubuntu-trusty-64",
            "quorum_names": ["vagrant-ubuntu-trusty-64"]
        }

        self.eventer._emit = MagicMock()
        self.eventer._on_quorum_status(12345, new, new)
        self.assertFalse(self.eventer._emit.called)

        self.eventer._on_quorum_status(12345, new, old)
        message = '\n'.join(
            [str(x) for x in self.eventer._emit.call_args_list])
        print(message)
        self.assertIn('now quorum leader', message)

    def test_that_it_emits_pool_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        old.data = {}
        old.data["pools"] = [{
            "auid": 0,
            "cache_min_evict_age": 0,
            "cache_min_flush_age": 0,
            "cache_mode": "none",
            "cache_target_dirty_high_ratio_micro": 600000,
            "cache_target_dirty_ratio_micro": 400000,
            "cache_target_full_ratio_micro": 800000,
            "crash_replay_interval": 0,
            "crush_ruleset": 0,
            "erasure_code_profile": "",
            "expected_num_objects": 0,
            "fast_read": False,
            "flags": 1,
            "flags_names": "hashpspool",
            "hit_set_count": 0,
            "hit_set_params": {
                "type": "none"
            },
            "hit_set_period": 0,
            "last_change": "7",
            "last_force_op_resend": "0",
            "min_read_recency_for_promote": 0,
            "min_size": 1,
            "min_write_recency_for_promote": 0,
            "object_hash": 2,
            "pg_num": 64,
            "pg_placement_num": 64,
            "pool": 1,
            "pool_name": "data",
            "pool_snaps": [],
            "quota_max_bytes": 0,
            "quota_max_objects": 0,
            "read_tier": -1,
            "removed_snaps": "[]",
            "size": 1,
            "snap_epoch": 0,
            "snap_mode": "selfmanaged",
            "snap_seq": 0,
            "stripe_width": 0,
            "target_max_bytes": 0,
            "target_max_objects": 0,
            "tier_of": -1,
            "tiers": [],
            "type": 1,
            "use_gmt_hitset": True,
            "write_tier": -1
        }]

        new.data = {}
        new.data["pools"] = [{
            "auid": 0,
            "cache_min_evict_age": 0,
            "cache_min_flush_age": 0,
            "cache_mode": "none",
            "cache_target_dirty_high_ratio_micro": 0,
            "cache_target_dirty_ratio_micro": 0,
            "cache_target_full_ratio_micro": 0,
            "crash_replay_interval": 0,
            "crush_ruleset": 0,
            "erasure_code_profile": "",
            "expected_num_objects": 0,
            "fast_read": False,
            "flags": 1,
            "flags_names": "hashpspool",
            "hit_set_count": 0,
            "hit_set_params": {
                "type": "none"
            },
            "hit_set_period": 0,
            "last_change": "1",
            "last_force_op_resend": "0",
            "min_read_recency_for_promote": 0,
            "min_size": 1,
            "min_write_recency_for_promote": 0,
            "object_hash": 2,
            "pg_num": 64,
            "pg_placement_num": 64,
            "pool": 0,
            "pool_name": "rbd",
            "pool_snaps": [],
            "quota_max_bytes": 0,
            "quota_max_objects": 0,
            "read_tier": -1,
            "removed_snaps": "[]",
            "size": 1,
            "snap_epoch": 0,
            "snap_mode": "selfmanaged",
            "snap_seq": 0,
            "stripe_width": 0,
            "target_max_bytes": 0,
            "target_max_objects": 0,
            "tier_of": -1,
            "tiers": [],
            "type": 1,
            "use_gmt_hitset": True,
            "write_tier": -1
        }, {
            "auid": 0,
            "cache_min_evict_age": 0,
            "cache_min_flush_age": 0,
            "cache_mode": "none",
            "cache_target_dirty_high_ratio_micro": 600000,
            "cache_target_dirty_ratio_micro": 400000,
            "cache_target_full_ratio_micro": 800000,
            "crash_replay_interval": 0,
            "crush_ruleset": 0,
            "erasure_code_profile": "",
            "expected_num_objects": 0,
            "fast_read": False,
            "flags": 1,
            "flags_names": "hashpspool",
            "hit_set_count": 0,
            "hit_set_params": {
                "type": "none"
            },
            "hit_set_period": 0,
            "last_change": "7",
            "last_force_op_resend": "0",
            "min_read_recency_for_promote": 0,
            "min_size": 1,
            "min_write_recency_for_promote": 0,
            "object_hash": 2,
            "pg_num": 64,
            "pg_placement_num": 64,
            "pool": 1,
            "pool_name": "data",
            "pool_snaps": [],
            "quota_max_bytes": 0,
            "quota_max_objects": 0,
            "read_tier": -1,
            "removed_snaps": "[]",
            "size": 1,
            "snap_epoch": 0,
            "snap_mode": "selfmanaged",
            "snap_seq": 0,
            "stripe_width": 0,
            "target_max_bytes": 0,
            "target_max_objects": 0,
            "tier_of": -1,
            "tiers": [],
            "type": 1,
            "use_gmt_hitset": True,
            "write_tier": -1,
        }]

        self.eventer._on_pool_status(12345, old, old)
        self.assertFalse(self.eventer._emit.called)

        self.eventer._on_pool_status(12345, new, old)
        self.assertIn(
            'added to cluster',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))
        self.eventer._on_pool_status(12345, old, new)
        self.assertIn(
            'removed from cluster',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))
Example #9: the full Manager class (lifecycle, recovery, and discovery)
class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.
    """
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        if sqlalchemy is not None:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:

            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:

                            def blackhole(*args, **kwargs):
                                pass

                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)

    def delete_cluster(self, fs_id):
        """
        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.notifier.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(
                ServerState(fqdn=server.fqdn,
                            hostname=server.hostname,
                            managed=server.managed,
                            last_contact=server.last_contact,
                            boot_time=server.boot_time,
                            ceph_version=server.ceph_version))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" %
                      (service.fsid, service.service_type, service.service_id,
                       server.fqdn if server else None))
            self.servers.inject_service(
                ServiceState(fsid=service.fsid,
                             service_type=service.service_type,
                             service_id=service.service_id),
                server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(
            SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.notifier,
                                             self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [
                row[0]
                for row in session.query(SyncObject.sync_type).filter_by(
                    fsid=fsid).distinct()
            ]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid,
                    sync_type=sync_type).order_by(SyncObject.version.desc(),
                                                  SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        # Before we start listening to the outside world, recover
        # our last known state from persistent storage
        try:
            self._recover()
        except Exception:
            log.exception("Recovery failed")
            os._exit(-1)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.notifier.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()

    def join(self):
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.notifier.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        log.info("on_discovery: {0}/{1}".format(minion_id,
                                                heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'],
                                         heartbeat_data['name'], self.notifier,
                                         self.persister, self.servers,
                                         self.eventer, self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)
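
Reading start(), join(), and stop() together gives the intended lifecycle: recover persisted state, start every worker thread, block on them, and tear everything down on shutdown. A hypothetical driver (the real calamari entry point is not part of these examples):

manager = Manager()
manager.start()      # runs _recover(), then starts all worker threads
try:
    manager.join()   # block until the worker threads exit
finally:
    manager.stop()   # stop monitors, threads, and the request ticker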
Example #10: TestEventer.setUp
    def setUp(self):
        self.eventer = Eventer(MagicMock())
Example #11: the full TestEventer (single-line formatting)
class TestEventer(TestCase):
    def setUp(self):
        self.eventer = Eventer(MagicMock())

    def tearDown(self):
        pass

    @skipIf(os.environ.get('CALAMARI_CONFIG') is None, "needs CALAMARI_CONFIG set")
    def testCreateManager(self):
        assert self.eventer is not None

    def test_that_it_emits_deleted_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        old.data = {}
        old.data['osds'] = [{'osd': 0}]
        self.eventer._on_osd_map(12345, new, old)
        self.assertIn('removed from the cluster map', '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

    def test_that_it_emits_added_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        new.data = {}
        new.data['osds'] = [{'osd': 0}]
        self.eventer._on_osd_map(12345, new, old)
        self.assertIn('added to the cluster map', '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

    @patch('cthulhu.manager.eventer.salt.client')
    def test_that_it_emits_quorum_status_events(self, client):
        new = MagicMock()
        old = MagicMock()
        old.data = {
            "election_epoch": 2,
            "monmap": {
                "created": "0.000000",
                "epoch": 1,
                "fsid": "fc0dc0f5-fe35-48c1-8c9c-f2ae0770fce7",
                "modified": "0.000000",
                "mons": [
                    {
                        "addr": "198.199.75.124:6789/0",
                        "name": "vagrant-ubuntu-trusty-64",
                        "rank": 0
                    }
                ]
            },
            "quorum": [
                0
            ],
            "quorum_leader_name": "",
            "quorum_names": [
                "vagrant-ubuntu-trusty-64"
            ]
        }

        new.data = {
            "election_epoch": 2,
            "monmap": {
                "created": "0.000000",
                "epoch": 1,
                "fsid": "fc0dc0f5-fe35-48c1-8c9c-f2ae0770fce7",
                "modified": "0.000000",
                "mons": [
                    {
                        "addr": "198.199.75.124:6789/0",
                        "name": "vagrant-ubuntu-trusty-64",
                        "rank": 0
                    }
                ]
            },
            "quorum": [
                0
            ],
            "quorum_leader_name": "vagrant-ubuntu-trusty-64",
            "quorum_names": [
                "vagrant-ubuntu-trusty-64"
            ]
        }

        self.eventer._emit = MagicMock()
        self.eventer._on_quorum_status(12345, new, new)
        self.assertFalse(self.eventer._emit.called)

        self.eventer._on_quorum_status(12345, new, old)
        message = '\n'.join([str(x) for x in self.eventer._emit.call_args_list])
        print(message)
        self.assertIn('now quorum leader', message)

    def test_that_it_emits_pool_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        old.data = {}
        old.data["pools"] = [
            {
                "auid": 0,
                "cache_min_evict_age": 0,
                "cache_min_flush_age": 0,
                "cache_mode": "none",
                "cache_target_dirty_high_ratio_micro": 600000,
                "cache_target_dirty_ratio_micro": 400000,
                "cache_target_full_ratio_micro": 800000,
                "crash_replay_interval": 0,
                "crush_ruleset": 0,
                "erasure_code_profile": "",
                "expected_num_objects": 0,
                "fast_read": False,
                "flags": 1,
                "flags_names": "hashpspool",
                "hit_set_count": 0,
                "hit_set_params": {
                    "type": "none"
                },
                "hit_set_period": 0,
                "last_change": "7",
                "last_force_op_resend": "0",
                "min_read_recency_for_promote": 0,
                "min_size": 1,
                "min_write_recency_for_promote": 0,
                "object_hash": 2,
                "pg_num": 64,
                "pg_placement_num": 64,
                "pool": 1,
                "pool_name": "data",
                "pool_snaps": [],
                "quota_max_bytes": 0,
                "quota_max_objects": 0,
                "read_tier": -1,
                "removed_snaps": "[]",
                "size": 1,
                "snap_epoch": 0,
                "snap_mode": "selfmanaged",
                "snap_seq": 0,
                "stripe_width": 0,
                "target_max_bytes": 0,
                "target_max_objects": 0,
                "tier_of": -1,
                "tiers": [],
                "type": 1,
                "use_gmt_hitset": True,
                "write_tier": -1
            }]

        new.data = {}
        new.data["pools"] = [
            {
                "auid": 0,
                "cache_min_evict_age": 0,
                "cache_min_flush_age": 0,
                "cache_mode": "none",
                "cache_target_dirty_high_ratio_micro": 0,
                "cache_target_dirty_ratio_micro": 0,
                "cache_target_full_ratio_micro": 0,
                "crash_replay_interval": 0,
                "crush_ruleset": 0,
                "erasure_code_profile": "",
                "expected_num_objects": 0,
                "fast_read": False,
                "flags": 1,
                "flags_names": "hashpspool",
                "hit_set_count": 0,
                "hit_set_params": {
                    "type": "none"
                },
                "hit_set_period": 0,
                "last_change": "1",
                "last_force_op_resend": "0",
                "min_read_recency_for_promote": 0,
                "min_size": 1,
                "min_write_recency_for_promote": 0,
                "object_hash": 2,
                "pg_num": 64,
                "pg_placement_num": 64,
                "pool": 0,
                "pool_name": "rbd",
                "pool_snaps": [],
                "quota_max_bytes": 0,
                "quota_max_objects": 0,
                "read_tier": -1,
                "removed_snaps": "[]",
                "size": 1,
                "snap_epoch": 0,
                "snap_mode": "selfmanaged",
                "snap_seq": 0,
                "stripe_width": 0,
                "target_max_bytes": 0,
                "target_max_objects": 0,
                "tier_of": -1,
                "tiers": [],
                "type": 1,
                "use_gmt_hitset": True,
                "write_tier": -1
            },
            {
                "auid": 0,
                "cache_min_evict_age": 0,
                "cache_min_flush_age": 0,
                "cache_mode": "none",
                "cache_target_dirty_high_ratio_micro": 600000,
                "cache_target_dirty_ratio_micro": 400000,
                "cache_target_full_ratio_micro": 800000,
                "crash_replay_interval": 0,
                "crush_ruleset": 0,
                "erasure_code_profile": "",
                "expected_num_objects": 0,
                "fast_read": False,
                "flags": 1,
                "flags_names": "hashpspool",
                "hit_set_count": 0,
                "hit_set_params": {
                    "type": "none"
                },
                "hit_set_period": 0,
                "last_change": "7",
                "last_force_op_resend": "0",
                "min_read_recency_for_promote": 0,
                "min_size": 1,
                "min_write_recency_for_promote": 0,
                "object_hash": 2,
                "pg_num": 64,
                "pg_placement_num": 64,
                "pool": 1,
                "pool_name": "data",
                "pool_snaps": [],
                "quota_max_bytes": 0,
                "quota_max_objects": 0,
                "read_tier": -1,
                "removed_snaps": "[]",
                "size": 1,
                "snap_epoch": 0,
                "snap_mode": "selfmanaged",
                "snap_seq": 0,
                "stripe_width": 0,
                "target_max_bytes": 0,
                "target_max_objects": 0,
                "tier_of": -1,
                "tiers": [],
                "type": 1,
                "use_gmt_hitset": True,
                "write_tier": -1,
            }]

        self.eventer._on_pool_status(12345, old, old)
        self.assertFalse(self.eventer._emit.called)

        self.eventer._on_pool_status(12345, new, old)
        self.assertIn('added to cluster', '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))
        self.eventer._on_pool_status(12345, old, new)
        self.assertIn('removed from cluster', '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))
Example #12: a full Manager variant without the NotificationThread
class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.
    """

    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(db_path)
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer, self.requests)

    def delete_cluster(self, fs_id):
        """
        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        if sqlalchemy is None:
            return
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version
            ))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" % (
                service.fsid, service.service_type, service.service_id, server.fqdn if server else None
            ))
            self.servers.inject_service(ServiceState(
                fsid=service.fsid,
                service_type=service.service_type,
                service_id=service.service_id
            ), server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [row[0] for row in session.query(SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                    SyncObject.version.desc(), SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(None, sync_type, version, msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" % (monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()
        return True

    def join(self):
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        log.info("on_discovery: {0}/{1}".format(minion_id, heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'], heartbeat_data['name'],
                                         self.persister, self.servers, self.eventer, self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)