Beispiel #1
0
class EPUManagementService(object):
    """EPU management service interface

    Thin messaging front end: the constructor builds an ``EPUManagement``
    instance from configuration, and ``start()`` exposes the public methods
    of this class as dashi handlers. Every public operation simply forwards
    to the matching ``msg_*`` method of ``self.epumanagement``.

    See: https://confluence.oceanobservatories.org/display/syseng/CIAD+CEI+OV+Elastic+Computing
    """

    def __init__(self):
        """Load configuration and build the clients, the store and the EPUM core."""
        configs = ["service", "epumanagement"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)
        epum_cfg = self.CFG.epumanagement

        self.dashi = bootstrap.dashi_connect(epum_cfg.service_name, self.CFG)

        # Goes through the property setter below; may legitimately be None.
        self.default_user = epum_cfg.get('default_user')

        # TODO: create ION class here or depend on epuagent repo as a dep
        ou_client = MockOUAgentClient()

        statsd_cfg = self.CFG.get('statsd')

        # A truthy 'mock_provisioner' setting selects the in-process mock;
        # a missing or falsy setting selects the real provisioner client.
        if epum_cfg.get('mock_provisioner'):
            prov_client = MockProvisionerClient()
        else:
            provisioner_topic = epum_cfg.provisioner_service_name
            prov_client = ProvisionerClient(self.dashi, topic=provisioner_topic,
                                            statsd_cfg=statsd_cfg,
                                            client_name="epumanagement")

        self.service_name = epum_cfg.get(EPUM_INITIALCONF_SERVICE_NAME,
                                         EPUM_DEFAULT_SERVICE_NAME)
        self.proc_name = epum_cfg.get(EPUM_INITIALCONF_PROC_NAME, None)

        self.store = get_epum_store(self.CFG, service_name=self.service_name,
                                    proc_name=self.proc_name)
        self.store.initialize()

        dtrs_client = DTRSClient(self.dashi, statsd_cfg=statsd_cfg,
                                 client_name=epum_cfg.service_name)

        self.epumanagement = EPUManagement(epum_cfg, SubscriberNotifier(self.dashi),
                                           prov_client, ou_client, dtrs_client,
                                           store=self.store, statsd_cfg=statsd_cfg)

        # hack to inject epum reference for mock prov client
        if isinstance(prov_client, MockProvisionerClient):
            prov_client._set_epum(self.epumanagement)

    def start(self):
        """Register the handlers, load boot-time configuration and consume messages.

        Does not return until ``self.dashi.consume()`` returns.
        """
        epu.dashiproc.link_dashi_exceptions(self.dashi)

        self._register_handlers()

        # this may spawn some background threads
        self.epumanagement.initialize()

        self._load_initial_definitions()
        self._load_initial_domains()

        # blocks til dashi.cancel() is called
        self.dashi.consume()

    def _register_handlers(self):
        """Expose every public operation of this service as a dashi handler."""
        operations = (
            self.subscribe_domain,
            self.unsubscribe_domain,
            self.add_domain,
            self.remove_domain,
            self.list_domains,
            self.describe_domain,
            self.reconfigure_domain,
            self.add_domain_definition,
            self.remove_domain_definition,
            self.list_domain_definitions,
            self.describe_domain_definition,
            self.update_domain_definition,
            self.ou_heartbeat,
            self.instance_info,
        )
        for operation in operations:
            self.dashi.handle(operation)

    def _load_initial_definitions(self):
        """Add the domain definitions listed under ``initial_definitions``.

        Failures are logged and do not stop the remaining definitions from
        being loaded.
        """
        # hack to load some domain definitions at boot. later this should be client driven.
        initial_definitions = self.CFG.epumanagement.initial_definitions
        # items() rather than iteritems() so the loop runs on Python 2 and 3.
        for definition_id, definition in initial_definitions.items():
            log.info("Loading Domain Definition %s", definition_id)
            try:
                self.epumanagement.msg_add_domain_definition(definition_id, definition)
            except WriteConflictError:
                log.warning("Conflict while loading domain definition. It probably exists.",
                            exc_info=True)
            except Exception:
                log.exception("Failed to load Domain Definition %s", definition_id)

    def _load_initial_domains(self):
        """Add the domains listed under ``initial_domains`` for the default user.

        Each entry must provide the ``'definition'`` and ``'config'`` keys; a
        missing key raises ``KeyError`` out of this method. Failures of the
        add operation itself (including a missing default user) are logged
        and do not stop the remaining domains from being loaded.
        """
        # hack to load some domains at boot. later this should be client driven.
        initial_domains = self.CFG.epumanagement.initial_domains
        for domain_id, params in initial_domains.items():
            log.info("Loading Domain %s", domain_id)
            definition_id = params['definition']
            config = params['config']
            try:
                self.epumanagement.msg_add_domain(self.default_user, domain_id,
                                                  definition_id, config)
            except WriteConflictError:
                # This message previously said "domain definition", copied
                # from the definitions loop, which was misleading here.
                log.warning("Conflict while loading domain. It probably exists.",
                            exc_info=True)
            except Exception:
                log.exception("Failed to load Domain %s", domain_id)

    @property
    def default_user(self):
        """The user assumed when an operation is called without a caller.

        Raises UserNotPermittedError when no (truthy) default user is set.
        """
        if not self._default_user:
            msg = "Operation called for the default user, but none is defined."
            raise UserNotPermittedError(msg)
        else:
            return self._default_user

    @default_user.setter  # noqa
    def default_user(self, default_user):
        self._default_user = default_user

    def subscribe_domain(self, domain_id, subscriber_name, subscriber_op, caller=None):
        """Forward a subscribe request; a falsy caller means the default user."""
        caller = caller or self.default_user

        self.epumanagement.msg_subscribe_domain(caller, domain_id,
                                                subscriber_name, subscriber_op)

    def unsubscribe_domain(self, domain_id, subscriber_name, caller=None):
        """Forward an unsubscribe request; a falsy caller means the default user."""
        caller = caller or self.default_user

        self.epumanagement.msg_unsubscribe_domain(caller, domain_id, subscriber_name)

    def list_domains(self, caller=None):
        """Return a list of domains in the system

        A falsy caller means the default user.
        """
        caller = caller or self.default_user
        return self.epumanagement.msg_list_domains(caller=caller)

    def describe_domain(self, domain_id, caller=None):
        """Return a state structure for a domain, or None

        A falsy caller means the default user.
        """
        caller = caller or self.default_user
        return self.epumanagement.msg_describe_domain(caller, domain_id)

    def add_domain(self, domain_id, definition_id, config, subscriber_name=None,
                   subscriber_op=None, caller=None):
        """Forward an add-domain request; a falsy caller means the default user."""
        caller = caller or self.default_user
        self.epumanagement.msg_add_domain(caller, domain_id, definition_id, config,
                                          subscriber_name=subscriber_name,
                                          subscriber_op=subscriber_op)

    def remove_domain(self, domain_id, caller=None):
        """Forward a remove-domain request; a falsy caller means the default user."""
        caller = caller or self.default_user
        self.epumanagement.msg_remove_domain(caller, domain_id)

    def reconfigure_domain(self, domain_id, config, caller=None):
        """Forward a reconfigure request; a falsy caller means the default user."""
        caller = caller or self.default_user
        self.epumanagement.msg_reconfigure_domain(caller, domain_id, config)

    def list_domain_definitions(self):
        """Return whatever ``msg_list_domain_definitions`` returns."""
        return self.epumanagement.msg_list_domain_definitions()

    def describe_domain_definition(self, definition_id):
        """Return whatever ``msg_describe_domain_definition`` returns for the id."""
        return self.epumanagement.msg_describe_domain_definition(definition_id)

    def add_domain_definition(self, definition_id, definition):
        """Forward an add-domain-definition request."""
        self.epumanagement.msg_add_domain_definition(definition_id, definition)

    def remove_domain_definition(self, definition_id):
        """Forward a remove-domain-definition request."""
        self.epumanagement.msg_remove_domain_definition(definition_id)

    def update_domain_definition(self, definition_id, definition):
        """Forward an update-domain-definition request."""
        self.epumanagement.msg_update_domain_definition(definition_id, definition)

    def ou_heartbeat(self, heartbeat):
        """Forward a heartbeat message; no caller is supplied (None is passed)."""
        self.epumanagement.msg_heartbeat(None, heartbeat)  # epum parses

    def instance_info(self, record):
        """Forward an instance record; no caller is supplied (None is passed)."""
        self.epumanagement.msg_instance_info(None, record)  # epum parses
Beispiel #2
0
class HeartbeatMonitorTests(unittest.TestCase):
    """Health-monitoring tests driven through an EPUManagement instance.

    Each test injects fake instance records into a FakeDomainStore, feeds
    heartbeat messages through ``msg_heartbeat`` and advances a fake clock by
    calling ``_doctor_appt(timestamp)``, then checks the resulting health
    state of each instance.
    """

    def setUp(self):
        self.domain_name = "epuX"
        self.domain_owner = "david"
        self.domain_key = (self.domain_owner, self.domain_name)
        config = self._dom_config(health_init_time=100)
        self.state = FakeDomainStore(self.domain_owner, self.domain_name, config)

        initial_conf = {EPUM_INITIALCONF_EXTERNAL_DECIDE: True}
        self.notifier = MockSubscriberNotifier()
        self.provisioner_client = MockProvisionerClient()
        self.dtrs_client = MockDTRSClient()
        self.ou_client = MockOUAgentClient()
        self.epum_store = LocalEPUMStore(EPUM_DEFAULT_SERVICE_NAME)
        self.epum_store.initialize()
        self.epum = EPUManagement(
            initial_conf, self.notifier, self.provisioner_client,
            self.ou_client, self.dtrs_client, store=self.epum_store)
        self.provisioner_client._set_epum(self.epum)
        self.ou_client._set_epum(self.epum)

        # inject the FakeState instance directly instead of using msg_add_domain()
        self.epum.epum_store.domains[(self.domain_owner, self.domain_name)] = self.state

    def _dom_config(self, health_init_time=0):
        """Build a domain config with health monitoring enabled.

        The timeouts are fixed (boot 10, missing 5, zombie 10, really-missing
        3); only the test-only health init time is parameterized.
        """
        general = {EPUM_CONF_ENGINE_CLASS: MOCK_PKG + ".MockDecisionEngine01"}
        health = {EPUM_CONF_HEALTH_MONITOR: True, EPUM_CONF_HEALTH_BOOT: 10,
                  EPUM_CONF_HEALTH_MISSING: 5, EPUM_CONF_HEALTH_ZOMBIE: 10,
                  EPUM_CONF_HEALTH_REALLY_MISSING: 3,
                  TESTCONF_HEALTH_INIT_TIME: health_init_time}
        engine = {CONF_PRESERVE_N: 1}
        return {EPUM_CONF_GENERAL: general, EPUM_CONF_ENGINE: engine, EPUM_CONF_HEALTH: health}

    def test_recovery(self):
        # Health states of instances that already existed before the monitor's
        # init time (100) must be re-evaluated relative to that init time.
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        nodes = ["n" + str(i + 1) for i in range(7)]
        n1, n2, n3, n4, n5, n6, n7 = nodes

        # set up some instances that reached their iaas_state before the
        # init time (100)

        # this one has been running for well longer than the missing timeout
        # and we will have not received a heartbeat. It shouldn't be marked
        # OUT_OF_CONTACT until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        # this has been running for 10 seconds before the init time but we
        # have never received a heartbeat. It should be marked as OUT_OF_CONTACT
        # after the boot timeout expires, starting from the init time.
        self.state.new_fake_instance_state(n2, InstanceState.RUNNING, 90,
                                           InstanceHealthState.UNKNOWN)

        # is terminated and nothing should happen
        self.state.new_fake_instance_state(n3, InstanceState.TERMINATED, 90,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 110, just before it would be
        # marked OUT_OF_CONTACT
        self.state.new_fake_instance_state(n4, InstanceState.RUNNING, 95,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 105, just before it would be
        # marked OUT_OF_CONTACT
        self.state.new_fake_instance_state(n5, InstanceState.RUNNING, 95,
                                           InstanceHealthState.OK)

        # this instance was already marked as errored before the recovery
        self.state.new_fake_instance_state(n6, InstanceState.RUNNING, 95,
                                           InstanceHealthState.PROCESS_ERROR)

        # this instance was a ZOMBIE, it should be initially marked back as
        # UNKNOWN and then if a heartbeat arrives it should be ZOMBIE again
        self.state.new_fake_instance_state(n7, InstanceState.TERMINATED, 80,
                                           InstanceHealthState.ZOMBIE)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        # exactly at init_time + missing timeout: nothing changes yet
        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        self.ok_heartbeat(n5, 105)
        self.ok_heartbeat(n7, 105)  # this one will be relabeled as a zombie

        self.err_heartbeat(n6, 105, procs=['a'])
        self.epum._doctor_appt(106)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n5, 110)
        self.epum._doctor_appt(110)
        self.assertNodeState(InstanceHealthState.OK, n5)

        # n1 has now been "out of contact" too long and is past the "really missing"
        # threshold, so it should now be MISSING
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n4, 110)
        self.err_heartbeat(n6, 110, procs=['a'])
        self.epum._doctor_appt(111)
        self.assertNodeState(InstanceHealthState.OK, n5, n4)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n2)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n3)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

    def test_basic(self):
        # Walk three freshly started nodes through UNKNOWN -> OK ->
        # OUT_OF_CONTACT and, after IaaS termination, ZOMBIE -> UNKNOWN.
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, self._dom_config())

        nodes = [str(uuid.uuid4()) for _ in range(3)]
        n1, n2, n3 = nodes

        # not using real timestamps
        now = 0

        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceState.RUNNING, now)

        # all nodes are running but haven't been heard from
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)
        self.epum._doctor_appt(now)
        self.assertEqual(0, self.epum.doctor.monitors[self.domain_key].init_time)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        now = 5
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        # first heartbeat to n1
        self.ok_heartbeat(n1, now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        now = 10
        self.epum._doctor_appt(now)

        self.assertNodeState(InstanceHealthState.OK, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3)

        self.ok_heartbeat(n1, now)  # n1 makes it in under the wire
        self.ok_heartbeat(n2, now)
        now = 11
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n2)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n3)

        self.ok_heartbeat(n3, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        # ok don't hear from n2 for a while, should go missing
        now = 13
        self.ok_heartbeat(n1, now)

        now = 16
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n3)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n2)

        self.ok_heartbeat(n2, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 20

        # roll all nodes to terminated in IaaS
        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceState.TERMINATED, now)

        # been longer than missing window for n1 but shouldn't matter
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 30
        self.ok_heartbeat(n1, now)
        self.epum._doctor_appt(now)
        # not a zombie yet
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 31
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        self.ok_heartbeat(n1, now)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n1)

        now = 42
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n1)

    def test_error(self):
        # A heartbeat without failed processes reports MONITOR_ERROR and the
        # error text is recorded on the instance.
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, self._dom_config())

        node = str(uuid.uuid4())

        now = 1
        self.state.new_fake_instance_state(node, InstanceState.RUNNING, now)
        self.ok_heartbeat(node, now)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, node)

        now = 5
        self.err_heartbeat(node, now)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0], 'faiiiill')

        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node)

    def test_process_error(self):
        # A heartbeat with failed processes reports PROCESS_ERROR; a repeat
        # heartbeat lacking 'stderr' must not erase the recorded stderr.
        self.epum.initialize()
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, self._dom_config())

        node = str(uuid.uuid4())

        now = 1
        self.state.new_fake_instance_state(node, InstanceState.RUNNING, now)
        self.ok_heartbeat(node, now)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.OK, node)

        now = 5
        procs = [{'name': 'proc1', 'stderr': 'faaaaaail', 'state': 100,
                  'exitcode': -1, 'stop_timestamp': 25242}]
        self.err_heartbeat(node, now, procs)
        self.epum._doctor_appt(now)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0]['stderr'], 'faaaaaail')
        procs[0].pop('stderr')

        now = 8
        self.err_heartbeat(node, now, procs)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0]['stderr'], 'faaaaaail')

    def test_defibulator(self):
        # When the OU agent answers the dump-state request, the node that
        # would have gone OUT_OF_CONTACT ends up OK again.
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        self.ou_client.dump_state_called = 0
        self.ou_client.heartbeats_sent = 0
        self.ou_client.respond_to_dump_state = True

        # set up an instance that reached its iaas_state before the init time (100)
        n1 = "n1"

        # has been running for well longer than the missing timeout and we will
        # have not received a heartbeat. It shouldn't be marked OUT_OF_CONTACT
        # until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1)

        self.assertEqual(0, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)
        self.epum._doctor_appt(106)
        # back to OK
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(1, self.ou_client.heartbeats_sent)

    def test_defibulator_failure(self):
        # When the OU agent does not answer the dump-state request, the node
        # goes OUT_OF_CONTACT and then MISSING, with no further dump-state call.
        self.epum.initialize()
        dom_config = self._dom_config(health_init_time=100)
        self.epum.msg_reconfigure_domain(self.domain_owner, self.domain_name, dom_config)

        self.ou_client.dump_state_called = 0
        self.ou_client.heartbeats_sent = 0
        self.ou_client.respond_to_dump_state = False  # i.e., the node is really gone

        # set up an instance that reached its iaas_state before the init time (100)
        n1 = "Poor Yorick"

        # has been running for well longer than the missing timeout and we will
        # have not received a heartbeat. It shouldn't be marked OUT_OF_CONTACT
        # until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceState.RUNNING, 50,
                                           InstanceHealthState.OK)

        self.epum._doctor_appt(100)
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.epum._doctor_appt(105)
        self.assertNodeState(InstanceHealthState.OK, n1)

        self.assertEqual(0, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)
        self.epum._doctor_appt(106)
        self.assertNodeState(InstanceHealthState.OUT_OF_CONTACT, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)

        self.epum._doctor_appt(110)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertEqual(1, self.ou_client.dump_state_called)
        self.assertEqual(0, self.ou_client.heartbeats_sent)

    # ----------------------------------------------------------------------------------

    def assertNodeState(self, state, *node_ids):
        """Assert that every listed node currently has health ``state``."""
        for n in node_ids:
            self.assertEqual(state, self.state.instances[n].health)

    def ok_heartbeat(self, node_id, timestamp):
        """Deliver an OK heartbeat for ``node_id`` stamped with ``timestamp``."""
        msg = {'node_id': node_id, 'timestamp': timestamp,
               'state': InstanceHealthState.OK}
        self.epum.msg_heartbeat(None, msg, timestamp=timestamp)

    def err_heartbeat(self, node_id, timestamp, procs=None):
        """Deliver an error heartbeat for ``node_id`` stamped with ``timestamp``.

        With a non-empty ``procs`` the heartbeat reports PROCESS_ERROR and
        carries ``procs`` as ``failed_processes``; otherwise it reports
        MONITOR_ERROR with the fixed error text ``'faiiiill'``.
        """
        msg = {'node_id': node_id, 'timestamp': timestamp}
        if procs:
            msg['state'] = InstanceHealthState.PROCESS_ERROR
            msg['failed_processes'] = procs
        else:
            msg['state'] = InstanceHealthState.MONITOR_ERROR
            msg['error'] = 'faiiiill'

        self.epum.msg_heartbeat(None, msg, timestamp=timestamp)