def test_get_osd_to_host_mapping_osd_metadata_partial_exists(self, mocket):
        """
        That every OSD is mapped under the resolved (fqdn, hostname) key
        even when osd_metadata only describes a subset of the OSDs.
        """
        mocket.getnameinfo.return_value = [OSD_HOSTNAME]
        mocket.getfqdn.return_value = OSD_FQDN

        # Metadata exists for osd.0 only; osd.1 must still be mapped.
        metadata_for_osd0 = {
            "back_addr": "192.34.58.142:6808/14001122",
            "hostname": "gravel2",
            "id": 0,
            "hb_back_addr": "192.34.58.142:6809/14001122",
            "hb_front_addr": "192.34.58.142:6810/14001122",
            "front_addr": "192.34.58.142:6800/1122"
        }
        osd_map = {
            "osds": [
                {"cluster_addr": "192.34.58.142:6808/14001122", "osd": 0},
                {"cluster_addr": "192.34.58.142:6802/17383", "osd": 1},
            ],
            "osd_metadata": [metadata_for_osd0],
        }

        expected = {
            ('gravel2.rockery', 'gravel2'): [
                {'cluster_addr': '192.34.58.142:6808/14001122', 'osd': 0},
                {'cluster_addr': '192.34.58.142:6802/17383', 'osd': 1},
            ]
        }
        sm = ServerMonitor(Mock(), Mock(), Mock())
        self.assertEqual(expected, sm.get_hostname_to_osds(osd_map))
Beispiel #2
0
    def __init__(self):
        # Event set once the manager has fully shut down.
        self._complete = gevent.event.Event()

        # Background workers: RPC server, top-level ceph event feed,
        # and child process supervision.
        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu',
                                                  'db_path'))  # noqa
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:

            # No sqlalchemy / no DB configured: substitute an object whose
            # lifecycle methods are no-ops and whose every other attribute
            # access yields a do-nothing callable, so callers need no
            # special-casing.
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    # Private attributes resolve normally; anything else
                    # falls back to a swallow-everything function.
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:

                            def blackhole(*args, **kwargs):
                                pass

                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)
    def test_get_osd_to_host_mapping_osd_down_and_out_from_epoch1(
            self, mocket):
        """
        That an OSD which has never come up (blank ":/0" address, down
        and out since epoch) produces no host mapping at all.
        """
        mocket.getnameinfo.return_value = [OSD_HOSTNAME]
        mocket.getfqdn.return_value = OSD_FQDN

        never_booted = {
            "cluster_addr": ":/0",
            "down_at": 0,
            "heartbeat_back_addr": ":/0",
            "heartbeat_front_addr": ":/0",
            "in": 0,
            "last_clean_begin": 0,
            "last_clean_end": 0,
            "lost_at": 0,
            "osd": 0,
            "primary_affinity": 1.0,
            "public_addr": ":/0",
            "state": ["exists", "new"],
            "up": 0,
            "up_from": 0,
            "up_thru": 0,
            "uuid": "f53e0a25-d29c-4aa3-9a2e-f6ebee538f8e",
            "weight": 0.0
        }
        osd_map = {"osd_metadata": [], "osds": [never_booted]}

        sm = ServerMonitor(Mock(), Mock(), Mock())
        self.assertEqual({}, sm.get_hostname_to_osds(osd_map))
    def test_get_osd_to_host_mapping_osd_metadata_exists(self, mocket):
        """
        That the host mapping is derived from osd_metadata when present,
        with no DNS/socket lookups performed at all.
        """
        osd_map = {
            "osds": [
                {"cluster_addr": "192.34.58.142:6808/14001122", "osd": 0},
            ],
            "osd_metadata": [{
                "back_addr": "192.34.58.142:6808/14001122",
                "hostname": "gravel2.rockery",
                "id": 0,
                "hb_back_addr": "192.34.58.142:6809/14001122",
                "hb_front_addr": "192.34.58.142:6810/14001122",
                "front_addr": "192.34.58.142:6800/1122"
            }],
        }

        expected = {
            ('gravel2.rockery', 'gravel2'): [
                {'cluster_addr': '192.34.58.142:6808/14001122', 'osd': 0},
            ]
        }
        sm = ServerMonitor(Mock(), Mock(), Mock())
        self.assertEqual(expected, sm.get_hostname_to_osds(osd_map))

        # Metadata alone was sufficient: the socket mock was never touched.
        self.assertEqual(False, mocket.called)
 def test_get_osd_to_host_mapping_empty(self):
     """
     That an osd_map with no OSDs yields an empty hostname mapping.
     """
     empty_map = {'tree': {'nodes': []}, 'osds': []}
     sm = ServerMonitor(Mock(), Mock(), Mock())
     self.assertEqual({}, sm.get_hostname_to_osds(empty_map))
Beispiel #6
0
    def test_unmanaged_service_relocate(self):
        """
        That when an OSD disappears from one server's salt.services output
        and reappears on another server, this is reflected in the state.
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_osd_map(OSD_MAP)

        # osd.1 initially on unmanaged server OSD
        self.assertEqual(sm.services[ServiceId(FSID, 'osd', "1")].server_state.fqdn, OSD_HOSTNAME)

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES_MIGRATED)
        sm.on_osd_map(OSD_MAP_MIGRATED)

        # osd.1 now on managed server MON
        self.assertEqual(sm.services[ServiceId(FSID, 'osd', "1")].server_state.fqdn, MON_FQDN)

        # Check the servers' lists of services are up to date too.
        # list() around .keys() so the comparison also works on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
        self.assertListEqual(list(sm.servers[OSD_HOSTNAME].services.keys()), [
            ServiceId(FSID, 'osd', '0')
        ])
Beispiel #7
0
    def __init__(self):
        # Event set once the manager has fully shut down.
        self._complete = gevent.event.Event()

        # Background workers: RPC server, salt discovery feed, and child
        # process supervision.
        self._rpc_thread = RpcThread(self)
        self._discovery_thread = DiscoveryThread(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))
            Session.configure(bind=engine)

            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            # A malformed db_path is fatal: log and propagate.
            log.error("Database error: %s" % e)
            raise

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer)
    def test_unmanaged_service_relocate(self):
        """
        That when an OSD disappears from one server's salt.services output
        and reappears on another server, this is reflected in the state.
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_osd_map(OSD_MAP)

        # osd.1 initially on unmanaged server OSD
        self.assertEqual(sm.services[ServiceId(FSID, 'osd', "1")].server_state.fqdn, OSD_HOSTNAME)

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES_MIGRATED)
        sm.on_osd_map(OSD_MAP_MIGRATED)

        # osd.1 now on managed server MON
        self.assertEqual(sm.services[ServiceId(FSID, 'osd', "1")].server_state.fqdn, MON_FQDN)

        # Check the servers' lists of services are up to date too.
        # list() around .keys() so the comparison also works on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
        self.assertListEqual(list(sm.servers[OSD_HOSTNAME].services.keys()), [
            ServiceId(FSID, 'osd', '0')
        ])
Beispiel #9
0
    def test_unmanaged_managed_transition(self):
        """
        That when a pesky user doesn't initially install salt on OSD servers
        but later adds it, we correctly transition from paying attention
        to the CRUSH config to paying attention to the salt data, and
        fill in the correct FQDNs.
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_osd_map(OSD_MAP)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_HOSTNAME])
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
        self.assertListEqual(list(sm.servers[OSD_HOSTNAME].services.keys()), [
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'osd', '0')
        ])

        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

        # After the heartbeat, the unmanaged hostname entry is replaced
        # by a managed entry keyed on the real FQDN.
        self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_FQDN])
        self.assertEqual(sm.servers[OSD_FQDN].fqdn, OSD_FQDN)
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
        self.assertListEqual(list(sm.servers[OSD_FQDN].services.keys()), [
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'osd', '0')
        ])
Beispiel #10
0
    def __init__(self):
        # Event set once the manager has fully shut down.
        self._complete = gevent.event.Event()

        # Background workers: RPC server, salt discovery feed, and child
        # process supervision.
        self._rpc_thread = RpcThread(self)
        self._discovery_thread = DiscoveryThread(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))
            Session.configure(bind=engine)

            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            # A malformed db_path is fatal: log and propagate.
            log.error("Database error: %s" % e)
            raise

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer)
    def test_unmanaged_managed_transition(self):
        """
        That when a pesky user doesn't initially install salt on OSD servers
        but later adds it, we correctly transition from paying attention
        to the CRUSH config to paying attention to the salt data, and
        fill in the correct FQDNs.
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_osd_map(OSD_MAP)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_HOSTNAME])
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
        self.assertListEqual(list(sm.servers[OSD_HOSTNAME].services.keys()), [
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'osd', '0')
        ])

        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

        # After the heartbeat, the unmanaged hostname entry is replaced
        # by a managed entry keyed on the real FQDN.
        self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_FQDN])
        self.assertEqual(sm.servers[OSD_FQDN].fqdn, OSD_FQDN)
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
        self.assertListEqual(list(sm.servers[OSD_FQDN].services.keys()), [
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'osd', '0')
        ])
Beispiel #12
0
    def __init__(self):
        # Event set once the manager has fully shut down.
        self._complete = gevent.event.Event()

        # Background workers: RPC server, top-level ceph event feed,
        # and child process supervision.
        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))  # noqa
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            # No sqlalchemy / no DB configured: substitute an object whose
            # lifecycle methods are no-ops and whose every other attribute
            # access yields a do-nothing callable, so callers need no
            # special-casing.
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    # Private attributes resolve normally; anything else
                    # falls back to a swallow-everything function.
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer, self.requests)
 def test_get_osd_to_host_mapping_osd_metadata_absent(self, mocket):
     """
     That OSDs are mapped via reverse DNS on their cluster address when
     the osd_map carries no osd_metadata section at all.
     """
     mocket.getnameinfo.return_value = [OSD_HOSTNAME]
     mocket.getfqdn.return_value = OSD_FQDN

     osd_map = {
         "osds": [
             {"cluster_addr": "192.34.58.142:6808/14001122", "osd": 0},
         ]
     }
     expected = {
         ('gravel2.rockery', 'gravel2'): [
             {'cluster_addr': '192.34.58.142:6808/14001122', 'osd': 0},
         ]
     }
     sm = ServerMonitor(Mock(), Mock(), Mock())
     self.assertEqual(expected, sm.get_hostname_to_osds(osd_map))
    def test_unmanaged_servers(self):
        """
        That when only the mons are sending salt messages, we generate
        a correct view of service locations including OSDs.
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_osd_map(OSD_MAP)

        self.assertEqual(len(sm.servers), 2)
        self.assertEqual(len(sm.services), 3)
        self.assertEqual(len(sm.fsid_services), 1)
        self.assertEqual(len(sm.hostname_to_server), 2)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
        self.assertListEqual(list(sm.servers[OSD_HOSTNAME].services.keys()), [
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'osd', '0')
        ])
    def test_managed_servers(self):
        """
        That managed servers (those sending salt messages) generate
        a correct view of service locations
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())
        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

        self.assertEqual(len(sm.servers), 2)
        self.assertEqual(len(sm.services), 3)
        self.assertEqual(len(sm.fsid_services), 1)
        self.assertEqual(len(sm.hostname_to_server), 2)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_FQDN])
        self.assertEqual(sm.servers[OSD_FQDN].fqdn, OSD_FQDN)
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()),
                             [ServiceId(FSID, 'mon', MON_HOSTNAME)])
        self.assertListEqual(
            list(sm.servers[OSD_FQDN].services.keys()),
            [ServiceId(FSID, 'osd', '1'),
             ServiceId(FSID, 'osd', '0')])
    def test_delete_managed(self):
        """
        That when a managed server is removed, it no longer appears
        in the server/service data.
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

        sm.delete(OSD_FQDN)

        # The two OSD services, and the 'osd' server should be gone
        self.assertEqual(len(sm.servers), 1)
        self.assertEqual(len(sm.services), 1)
        self.assertEqual(len(sm.fsid_services), 1)
        self.assertEqual(len(sm.hostname_to_server), 1)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers.keys()), [MON_FQDN])
        self.assertListEqual(list(sm.services.keys()), [ServiceId(FSID, 'mon', MON_HOSTNAME)])
        self.assertListEqual([s.id for s in sm.fsid_services[FSID]], [ServiceId(FSID, 'mon', MON_HOSTNAME)])
        self.assertListEqual(list(sm.hostname_to_server.keys()), [MON_HOSTNAME])
    def test_remove_osd(self):
        """
        That when an OSD disappears from the OSD map, it is also removed
        from ServerMonitor's worldview
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'osd', '0'),
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])

        sm.on_osd_map(OSD_MAP_1_REMOVED)

        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'osd', '0'),
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
    def test_managed_servers(self):
        """
        That managed servers (those sending salt messages) generate
        a correct view of service locations
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())
        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

        self.assertEqual(len(sm.servers), 2)
        self.assertEqual(len(sm.services), 3)
        self.assertEqual(len(sm.fsid_services), 1)
        self.assertEqual(len(sm.hostname_to_server), 2)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_FQDN])
        self.assertEqual(sm.servers[OSD_FQDN].fqdn, OSD_FQDN)
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
        self.assertListEqual(list(sm.servers[OSD_FQDN].services.keys()), [
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'osd', '0')
        ])
    def test_remove_mds(self):
        """
        That when an mds disappears from the mds map, ServerMonitor notices
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MDS1_FQDN, MDS1_SERVICES)
        sm.on_server_heartbeat(MDS2_FQDN, MDS2_SERVICES)
        sm.on_mds_map(FSID, MDS_MAP)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'mds', MDS1_HOSTNAME),
            ServiceId(FSID, 'mds', MDS2_HOSTNAME)
        ])

        sm.on_mds_map(FSID, MDS_MAP_1_REMOVED)
        self.assertListEqual(list(sm.services.keys()),
                             [ServiceId(FSID, 'mds', MDS1_HOSTNAME)])
Beispiel #20
0
    def test_remove_mon(self):
        """
        That when a mon disappears from the mon map, ServerMonitor notices
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)
        sm.on_mon_map(MON_MAP)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'osd', '0'),
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])

        sm.on_mon_map(MON_MAP_1_REMOVED)
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'osd', '0'),
            ServiceId(FSID, 'osd', '1')
        ])
    def test_remove_mds(self):
        """
        That when an mds disappears from the mds map, ServerMonitor notices
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MDS1_FQDN, MDS1_SERVICES)
        sm.on_server_heartbeat(MDS2_FQDN, MDS2_SERVICES)
        sm.on_mds_map(FSID, MDS_MAP)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'mds', MDS1_HOSTNAME),
            ServiceId(FSID, 'mds', MDS2_HOSTNAME)
        ])

        sm.on_mds_map(FSID, MDS_MAP_1_REMOVED)
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'mds', MDS1_HOSTNAME)
        ])
    def test_remove_mon(self):
        """
        That when a mon disappears from the mon map, ServerMonitor notices
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)
        sm.on_mon_map(MON_MAP)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'osd', '0'),
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])

        sm.on_mon_map(MON_MAP_1_REMOVED)
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'osd', '0'),
            ServiceId(FSID, 'osd', '1')
        ])
    def test_on_osd_map(self, mocket):
        """
        That degenerate/malformed OSD maps are processed without raising.
        """
        def fake_getnameinfo(addr, _flags):
            # A blank ":/0" address resolves to no hostname.
            return [''] if addr == ":/0" else [OSD_HOSTNAME]

        def fake_getfqdn(name):
            return '' if name == [""] else OSD_FQDN

        mocket.getnameinfo = fake_getnameinfo
        mocket.getfqdn = fake_getfqdn

        sm = ServerMonitor(Mock(), Mock(), Mock())
        for bad_map in (BAD_MAP, BAD_MAP2, BAD_MAP3):
            sm.on_osd_map(bad_map)
    def test_unmanaged_servers(self):
        """
        That when only the mons are sending salt messages, we generate
        a correct view of service locations including OSDs.
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_osd_map(OSD_MAP)

        self.assertEqual(len(sm.servers), 2)
        self.assertEqual(len(sm.services), 3)
        self.assertEqual(len(sm.fsid_services), 1)
        self.assertEqual(len(sm.hostname_to_server), 2)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()),
                             [ServiceId(FSID, 'mon', MON_HOSTNAME)])
        self.assertListEqual(
            list(sm.servers[OSD_HOSTNAME].services.keys()),
            [ServiceId(FSID, 'osd', '1'),
             ServiceId(FSID, 'osd', '0')])
Beispiel #25
0
    def test_delete_managed(self):
        """
        That when a managed server is removed, it no longer appears
        in the server/service data.
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

        sm.delete(OSD_FQDN)

        # The two OSD services, and the 'osd' server should be gone
        self.assertEqual(len(sm.servers), 1)
        self.assertEqual(len(sm.services), 1)
        self.assertEqual(len(sm.fsid_services), 1)
        self.assertEqual(len(sm.hostname_to_server), 1)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.servers.keys()), [MON_FQDN])
        self.assertListEqual(list(sm.services.keys()), [ServiceId(FSID, 'mon', MON_HOSTNAME)])
        self.assertListEqual([s.id for s in sm.fsid_services[FSID]], [ServiceId(FSID, 'mon', MON_HOSTNAME)])
        self.assertListEqual(list(sm.hostname_to_server.keys()), [MON_HOSTNAME])
Beispiel #26
0
    def test_remove_osd(self):
        """
        That when an OSD disappears from the OSD map, it is also removed
        from ServerMonitor's worldview
        """
        sm = ServerMonitor(Mock(), Mock(), Mock())

        sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
        sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

        # list() around .keys() so comparisons also work on Python 3,
        # where dict.keys() returns a view rather than a list.
        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'osd', '0'),
            ServiceId(FSID, 'osd', '1'),
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])

        sm.on_osd_map(OSD_MAP_1_REMOVED)

        self.assertListEqual(list(sm.services.keys()), [
            ServiceId(FSID, 'osd', '0'),
            ServiceId(FSID, 'mon', MON_HOSTNAME)
        ])
Beispiel #27
0
class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.
    """

    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))  # noqa
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer, self.requests)

    def delete_cluster(self, fs_id):
        """
        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        if sqlalchemy is None:
            return
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version
            ))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" % (
                service.fsid, service.service_type, service.service_id, server.fqdn if server else None
            ))
            self.servers.inject_service(ServiceState(
                fsid=service.fsid,
                service_type=service.service_type,
                service_id=service.service_id
            ), server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [row[0] for row in session.query(SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                    SyncObject.version.desc(), SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(None, sync_type, version, msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" % (monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()
        return True

    def join(self):
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        log.info("on_discovery: {0}/{1}".format(minion_id, heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'], heartbeat_data['name'],
                                         self.persister, self.servers, self.eventer, self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)
Beispiel #28
0
class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.

    Lifecycle: construct, then start() (which first recovers persisted
    state), then stop() and join() to shut down cleanly.
    """
    def __init__(self):
        # Event signalled when the manager is finished; set elsewhere
        # (not in this class body) -- presumably by the shutdown path.
        self._complete = gevent.event.Event()

        # Worker threads/greenlets owned by this manager.
        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        if sqlalchemy is not None:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                # A bad db_path is fatal: log and propagate.
                log.error("Database error: %s" % e)
                raise
        else:

            class NullPersister(object):
                # Stand-in used when sqlalchemy is not installed: every
                # persister call becomes a silent no-op, so the rest of
                # the manager needs no "is persistence enabled?" checks.
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            # Any unknown public attribute resolves to a
                            # callable that discards its arguments.
                            def blackhole(*args, **kwargs):
                                pass

                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)

    def delete_cluster(self, fs_id):
        """
        Stop and discard the ClusterMonitor for `fs_id`, and purge its
        persisted sync objects.

        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        # Wait for the monitor's event loop to drain before dropping it.
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        """Request shutdown of all cluster monitors and worker threads.

        Does not wait for them to finish; call join() afterwards.
        """
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.notifier.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        """Delete all persisted SyncObject rows for `fsid`."""
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        """Rebuild in-memory state (servers, services, cluster monitors)
        from the database after a restart.

        No-op when sqlalchemy is unavailable (nothing was persisted).
        """
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(
                ServerState(fqdn=server.fqdn,
                            hostname=server.hostname,
                            managed=server.managed,
                            last_contact=server.last_contact,
                            boot_time=server.boot_time,
                            ceph_version=server.ceph_version))

        for service in session.query(Service).all():
            # A service may be recorded without a known host.
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" %
                      (service.fsid, service.service_type, service.service_id,
                       server.fqdn if server else None))
            self.servers.inject_service(
                ServiceState(fsid=service.fsid,
                             service_type=service.service_type,
                             service_id=service.service_id),
                server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(
            SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)
                 ]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.notifier,
                                             self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            # Distinct sync types persisted for this cluster.
            object_types = [
                row[0]
                for row in session.query(SyncObject.sync_type).filter_by(
                    fsid=fsid).distinct()
            ]
            for sync_type in object_types:
                # Newest record first by (version, when).
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid,
                    sync_type=sync_type).order_by(SyncObject.version.desc(),
                                                  SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    # No counter: use a content hash as a surrogate version.
                    version = md5(latest_record.data)

                when = latest_record.when
                # DB timestamps are stored naive; reinterpret as UTC.
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        """Recover persisted state, then start all worker components."""
        log.info("%s starting" % self.__class__.__name__)

        # Before we start listening to the outside world, recover
        # our last known state from persistent storage
        try:
            self._recover()
        # NOTE(review): bare `except:` also traps SystemExit and
        # KeyboardInterrupt before the hard exit below; consider
        # `except Exception:` -- confirm the broader catch is intended.
        except:
            log.exception("Recovery failed")
            # Hard exit: in-memory state is unreliable after a failed
            # recovery, so don't continue with partial state.
            os._exit(-1)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.notifier.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()

    def join(self):
        """Block until every worker component and cluster monitor exits."""
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.notifier.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        """Handle the first heartbeat from an unknown FSID: create,
        start, and prime a ClusterMonitor for it."""
        log.info("on_discovery: {0}/{1}".format(minion_id,
                                                heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'],
                                         heartbeat_data['name'], self.notifier,
                                         self.persister, self.servers,
                                         self.eventer, self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)