def __init__(self):
    self._complete = gevent.event.Event()

    self._rpc_thread = RpcThread(self)
    self._discovery_thread = DiscoveryThread(self)
    self._process_monitor = ProcessMonitorThread()

    self.notifier = NotificationThread()
    try:
        # Prepare persistence
        engine = create_engine(config.get('cthulhu', 'db_path'))
        Session.configure(bind=engine)
        self.persister = Persister()
    except sqlalchemy.exc.ArgumentError as e:
        log.error("Database error: %s" % e)
        raise

    # FSID to ClusterMonitor
    self.clusters = {}

    # Generate events on state changes
    self.eventer = Eventer(self)

    # Handle all ceph/server messages
    self.servers = ServerMonitor(self.persister, self.eventer)
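# --- Illustrative sketch (not taken from the project): how the engine/Session
# wiring used above is typically set up.  Session is assumed to be a
# sessionmaker, since it exposes .configure(bind=...); the sqlite URL is a
# placeholder, not the project's db_path.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

Session = sessionmaker()
engine = create_engine('sqlite://')   # in-memory database, illustration only
Session.configure(bind=engine)        # later code can now call Session() directly

session = Session()                   # e.g. what _recover()/_expunge() would open
session.close()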
import os
# NOTE: the import paths below are assumptions added for completeness; the
# Eventer path matches the patch target used further down, and `mock` is the
# standalone package of that era (unittest.mock on Python 3).
from unittest import TestCase, skipIf

from mock import MagicMock, patch

from cthulhu.manager.eventer import Eventer


class TestEventer(TestCase):
    def setUp(self):
        self.eventer = Eventer(MagicMock())

    def tearDown(self):
        pass

    @skipIf(
        os.environ.get('CALAMARI_CONFIG') is None,
        "needs CALAMARI_CONFIG set")
    def testCreateManager(self):
        assert self.eventer is not None

    def test_that_it_emits_deleted_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        old.data = {}
        old.data['osds'] = [{'osd': 0}]

        self.eventer._on_osd_map(12345, new, old)

        self.assertIn(
            'removed from the cluster map',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

    def test_that_it_emits_added_osd_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        new.data = {}
        new.data['osds'] = [{'osd': 0}]

        self.eventer._on_osd_map(12345, new, old)

        self.assertIn(
            'added to the cluster map',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

    @patch('cthulhu.manager.eventer.salt.client')
    def test_that_it_emits_quorum_status_events(self, client):
        new = MagicMock()
        old = MagicMock()
        old.data = {
            "election_epoch": 2,
            "monmap": {
                "created": "0.000000",
                "epoch": 1,
                "fsid": "fc0dc0f5-fe35-48c1-8c9c-f2ae0770fce7",
                "modified": "0.000000",
                "mons": [{
                    "addr": "198.199.75.124:6789/0",
                    "name": "vagrant-ubuntu-trusty-64",
                    "rank": 0
                }]
            },
            "quorum": [0],
            "quorum_leader_name": "",
            "quorum_names": ["vagrant-ubuntu-trusty-64"]
        }
        new.data = {
            "election_epoch": 2,
            "monmap": {
                "created": "0.000000",
                "epoch": 1,
                "fsid": "fc0dc0f5-fe35-48c1-8c9c-f2ae0770fce7",
                "modified": "0.000000",
                "mons": [{
                    "addr": "198.199.75.124:6789/0",
                    "name": "vagrant-ubuntu-trusty-64",
                    "rank": 0
                }]
            },
            "quorum": [0],
            "quorum_leader_name": "vagrant-ubuntu-trusty-64",
            "quorum_names": ["vagrant-ubuntu-trusty-64"]
        }

        self.eventer._emit = MagicMock()
        # No change between the two maps: nothing should be emitted.
        self.eventer._on_quorum_status(12345, new, new)
        self.assertFalse(self.eventer._emit.called)

        self.eventer._on_quorum_status(12345, new, old)
        message = '\n'.join(
            [str(x) for x in self.eventer._emit.call_args_list])
        print(message)
        self.assertIn('now quorum leader', message)

    def test_that_it_emits_pool_events(self):
        self.eventer._emit = MagicMock()
        new = MagicMock()
        old = MagicMock()
        old.data = {}
        old.data["pools"] = [{
            "auid": 0, "cache_min_evict_age": 0, "cache_min_flush_age": 0,
            "cache_mode": "none",
            "cache_target_dirty_high_ratio_micro": 600000,
            "cache_target_dirty_ratio_micro": 400000,
            "cache_target_full_ratio_micro": 800000,
            "crash_replay_interval": 0, "crush_ruleset": 0,
            "erasure_code_profile": "", "expected_num_objects": 0,
            "fast_read": False, "flags": 1, "flags_names": "hashpspool",
            "hit_set_count": 0, "hit_set_params": {"type": "none"},
            "hit_set_period": 0, "last_change": "7",
            "last_force_op_resend": "0", "min_read_recency_for_promote": 0,
            "min_size": 1, "min_write_recency_for_promote": 0,
            "object_hash": 2, "pg_num": 64, "pg_placement_num": 64,
            "pool": 1, "pool_name": "data", "pool_snaps": [],
            "quota_max_bytes": 0, "quota_max_objects": 0, "read_tier": -1,
            "removed_snaps": "[]", "size": 1, "snap_epoch": 0,
            "snap_mode": "selfmanaged", "snap_seq": 0, "stripe_width": 0,
            "target_max_bytes": 0, "target_max_objects": 0, "tier_of": -1,
            "tiers": [], "type": 1, "use_gmt_hitset": True, "write_tier": -1
        }]
        new.data = {}
        new.data["pools"] = [{
            "auid": 0, "cache_min_evict_age": 0, "cache_min_flush_age": 0,
            "cache_mode": "none",
            "cache_target_dirty_high_ratio_micro": 0,
            "cache_target_dirty_ratio_micro": 0,
            "cache_target_full_ratio_micro": 0,
            "crash_replay_interval": 0, "crush_ruleset": 0,
            "erasure_code_profile": "", "expected_num_objects": 0,
            "fast_read": False, "flags": 1, "flags_names": "hashpspool",
            "hit_set_count": 0, "hit_set_params": {"type": "none"},
            "hit_set_period": 0, "last_change": "1",
            "last_force_op_resend": "0", "min_read_recency_for_promote": 0,
            "min_size": 1, "min_write_recency_for_promote": 0,
            "object_hash": 2, "pg_num": 64, "pg_placement_num": 64,
            "pool": 0, "pool_name": "rbd", "pool_snaps": [],
            "quota_max_bytes": 0, "quota_max_objects": 0, "read_tier": -1,
            "removed_snaps": "[]", "size": 1, "snap_epoch": 0,
            "snap_mode": "selfmanaged", "snap_seq": 0, "stripe_width": 0,
            "target_max_bytes": 0, "target_max_objects": 0, "tier_of": -1,
            "tiers": [], "type": 1, "use_gmt_hitset": True, "write_tier": -1
        }, {
            "auid": 0, "cache_min_evict_age": 0, "cache_min_flush_age": 0,
            "cache_mode": "none",
            "cache_target_dirty_high_ratio_micro": 600000,
            "cache_target_dirty_ratio_micro": 400000,
            "cache_target_full_ratio_micro": 800000,
            "crash_replay_interval": 0, "crush_ruleset": 0,
            "erasure_code_profile": "", "expected_num_objects": 0,
            "fast_read": False, "flags": 1, "flags_names": "hashpspool",
            "hit_set_count": 0, "hit_set_params": {"type": "none"},
            "hit_set_period": 0, "last_change": "7",
            "last_force_op_resend": "0", "min_read_recency_for_promote": 0,
            "min_size": 1, "min_write_recency_for_promote": 0,
            "object_hash": 2, "pg_num": 64, "pg_placement_num": 64,
            "pool": 1, "pool_name": "data", "pool_snaps": [],
            "quota_max_bytes": 0, "quota_max_objects": 0, "read_tier": -1,
            "removed_snaps": "[]", "size": 1, "snap_epoch": 0,
            "snap_mode": "selfmanaged", "snap_seq": 0, "stripe_width": 0,
            "target_max_bytes": 0, "target_max_objects": 0, "tier_of": -1,
            "tiers": [], "type": 1, "use_gmt_hitset": True, "write_tier": -1
        }]

        # Identical pool lists: nothing should be emitted.
        self.eventer._on_pool_status(12345, old, old)
        self.assertFalse(self.eventer._emit.called)

        self.eventer._on_pool_status(12345, new, old)
        self.assertIn(
            'added to cluster',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))

        self.eventer._on_pool_status(12345, old, new)
        self.assertIn(
            'removed from cluster',
            '\n'.join([str(x) for x in self.eventer._emit.call_args_list]))
class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.
    """

    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        if sqlalchemy is not None:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))
                Session.configure(bind=engine)
                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)

    def delete_cluster(self, fs_id):
        """
        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.notifier.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(
                ServerState(fqdn=server.fqdn,
                            hostname=server.hostname,
                            managed=server.managed,
                            last_contact=server.last_contact,
                            boot_time=server.boot_time,
                            ceph_version=server.ceph_version))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" %
                      (service.fsid, service.service_type, service.service_id,
                       server.fqdn if server else None))
            self.servers.inject_service(
                ServiceState(fsid=service.fsid,
                             service_type=service.service_type,
                             service_id=service.service_id),
                server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(
            SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.notifier,
                                             self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [
                row[0] for row in session.query(SyncObject.sync_type).filter_by(
                    fsid=fsid).distinct()
            ]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                        SyncObject.version.desc(), SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store
                # their 'version' if it's a real counter version; the
                # underlying problem is that we have data (health, pg_brief)
                # without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        # Before we start listening to the outside world, recover
        # our last known state from persistent storage
        try:
            self._recover()
        except:
            log.exception("Recovery failed")
            os._exit(-1)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()

        self.notifier.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()

    def join(self):
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.notifier.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        log.info("on_discovery: {0}/{1}".format(minion_id, heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'],
                                         heartbeat_data['name'],
                                         self.notifier, self.persister,
                                         self.servers, self.eventer,
                                         self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)
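# --- Standalone illustration of the version fallback used in _recover()
# above: sync types without a real version counter (e.g. health, pg_brief)
# are keyed by an md5 of the persisted payload instead.  The function name
# effective_version is hypothetical.
import hashlib


def effective_version(counter_version, raw_data):
    if counter_version:
        return counter_version
    hasher = hashlib.md5()
    hasher.update(raw_data)
    return hasher.hexdigest()


assert effective_version(42, b'ignored') == 42
assert len(effective_version(None, b'{"health": "HEALTH_OK"}')) == 32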
class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.
    """

    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))  # noqa
                Session.configure(bind=engine)
                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)

    def delete_cluster(self, fs_id):
        """
        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        if sqlalchemy is None:
            return
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" % (
                service.fsid, service.service_type, service.service_id,
                server.fqdn if server else None))
            self.servers.inject_service(ServiceState(
                fsid=service.fsid,
                service_type=service.service_type,
                service_id=service.service_id
            ), server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(
            SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.persister,
                                             self.servers, self.eventer,
                                             self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [row[0] for row in session.query(
                SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                        SyncObject.version.desc(), SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store
                # their 'version' if it's a real counter version; the
                # underlying problem is that we have data (health, pg_brief)
                # without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        log.info("%s starting" % self.__class__.__name__)
        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()

        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()
        return True

    def join(self):
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        log.info("on_discovery: {0}/{1}".format(minion_id, heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'],
                                         heartbeat_data['name'],
                                         self.persister, self.servers,
                                         self.eventer, self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)
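# --- Hedged sketch of how a Manager instance might be driven; the signal
# handling below is illustrative and not taken from the project's entry point.
import signal


def main():
    manager = Manager()
    manager.start()

    def on_signal(signum, frame):
        # Ask all threads and cluster monitors to shut down cleanly.
        manager.stop()

    signal.signal(signal.SIGTERM, on_signal)
    signal.signal(signal.SIGINT, on_signal)

    # Block until every thread has finished.
    manager.join()


if __name__ == '__main__':
    main()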