Example #1
    def test_error_caught(self):
        self.loop = LoopingCall(self.looper)
        self.raise_this = Exception("too many sandwiches")

        self.loop.start(0)

        with self.condition:
            while self.calls < 3:
                self.condition.wait()

        self.loop.stop()
        self.assertGreaterEqual(self.calls, 3)
Example #2
    def _leader_initialize(self):
        """Performs initialization routines that may require async processing
        """
        if self.enable_loop:
            if not self.control_loop:
                self.control_loop = LoopingCall(self._loop_top)
            self.control_loop.start(300)
Example #3
    def start(self):
        log.info('EPUAgent starting')

        self.dashi.handle(self.heartbeat)

        self.loop = LoopingCall(self._loop)
        if self.start_beat:
            log.debug('Starting heartbeat loop - %s second interval',
                      self.period)
            self.loop.start(self.period)

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Exiting")
        else:
            log.info("Exiting normally.")
Example #4
    def test_start_stop(self):
        self.loop = loop = LoopingCall(self.looper, 1, hats=True)

        loop.start(1)
        loop.stop()

        with self.condition:
            if not self.calls:
                self.condition.wait(5)

        self.assertEqual(self.calls, 1)
        self.assertLastPassed(1, hats=True)
Example #5
    def start(self):
        log.info('EPUAgent starting')

        self.dashi.handle(self.heartbeat)

        self.loop = LoopingCall(self._loop)
        if self.start_beat:
            log.debug('Starting heartbeat loop - %s second interval', self.period)
            self.loop.start(self.period)

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Exiting")
        else:
            log.info("Exiting normally.")
Example #6
    def test_called(self):
        # looper will stop itself after 3 calls
        self.max_calls = 3
        self.loop = loop = LoopingCall(self.looper, 1, 2, anarg=5)

        # interval of 0 makes it not block
        loop.start(0)
        self.assertTrue(self.stopped.wait(5))

        # peek into looping call and join on thread
        thread = loop.thread
        if thread:
            thread.join()

        self.assertFalse(loop.running)
        self.assertEqual(self.calls, 3)
        self.assertPassed(0, 1, 2, anarg=5)
        self.assertPassed(1, 1, 2, anarg=5)
        self.assertPassed(2, 1, 2, anarg=5)
Example #7
    def start(self):

        log.info("starting high availability instance %s" % self)

        # Set up operations
        self.dashi.handle(self.reconfigure_policy)
        self.dashi.handle(self.dump)

        self.apply_policy_loop = LoopingCall(self.core.apply_policy)
        self.apply_policy_loop.start(self.policy_interval)

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            self.apply_policy_loop.stop()
            log.warning("Caught terminate signal. Bye!")
        else:
            self.apply_policy_loop.stop()
            log.info("Exiting normally. Bye!")
Example #8
    def _leader_initialize(self):
        """Performs initialization routines that may require async processing
        """

        # to make certain we have the latest records for instances, we request provisioner to dump state
        instance_ids = []
        for owner, domain_id in self.epum_store.list_domains():
            domain = self.epum_store.get_domain(owner, domain_id)

            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                for instance in domain.get_instances():
                    if instance.state < InstanceState.TERMINATED:
                        instance_ids.append(instance.instance_id)

        if instance_ids:
            self.provisioner_client.dump_state(nodes=instance_ids)

        # TODO: We need to make a decision about how an engine can be configured to fire vs. how the
        #       decider fires its top-loop.  The decider's granularity controls minimums.
        # WARN: For now the engine-specific "pulse" configuration is ignored.
        if self.enable_loop:
            if not self.control_loop:
                self.control_loop = LoopingCall(self._loop_top)
            self.control_loop.start(self.loop_interval)
Example #9
class EPUAgent(object):
    """Elastic Process Unit (EPU) Agent. Monitors vitals in running VMs.
    """
    def __init__(self, *args, **kwargs):

        configs = ["epuagent"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)

        topic = self.CFG.epuagent.get('service_name')
        self.topic = topic or "epu_agent_%s" % uuid.uuid4()

        heartbeat_dest = kwargs.get('heartbeat_dest')
        self.heartbeat_dest = heartbeat_dest or self.CFG.epuagent.heartbeat_dest

        node_id = kwargs.get('node_id')
        self.node_id = node_id or self.CFG.epuagent.node_id

        heartbeat_op = kwargs.get('heartbeat_op')
        self.heartbeat_op = heartbeat_op or self.CFG.epuagent.heartbeat_op

        period = kwargs.get('period_seconds')
        self.period = float(period or self.CFG.epuagent.period_seconds)

        # for testing, allow for not starting heartbeat automatically
        self.start_beat = kwargs.get('start_heartbeat', True)

        amqp_uri = kwargs.get('amqp_uri')

        sock = kwargs.get('supervisor_socket')
        sock = sock or self.CFG.epuagent.get('supervisor_socket')
        if sock:
            log.debug("monitoring a process supervisor at: %s", sock)
            self.supervisor = Supervisor(sock)
        else:
            log.debug("not monitoring process supervisor")
            self.supervisor = None

        self.core = EPUAgentCore(self.node_id, supervisor=self.supervisor)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG, amqp_uri)

    def start(self):
        log.info('EPUAgent starting')

        self.dashi.handle(self.heartbeat)

        self.loop = LoopingCall(self._loop)
        if self.start_beat:
            log.debug('Starting heartbeat loop - %s second interval',
                      self.period)
            self.loop.start(self.period)

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Exiting")
        else:
            log.info("Exiting normally.")

    def _loop(self):
        return self.heartbeat()

    def heartbeat(self):
        try:
            state = self.core.get_state()
            self.dashi.fire(self.heartbeat_dest,
                            self.heartbeat_op,
                            heartbeat=state)
        except Exception as e:
            # unhandled exceptions will terminate the LoopingCall
            log.error('Error heartbeating: %s', e, exc_info=True)
Example #10
class EPUMDecider(object):
    """The decider handles critical sections related to running decision engine cycles.

    In the future it may farm out subtasks to the EPUM workers (EPUMReactor) but currently all
    decision engine activity happens directly via the decider role.

    The instance of the EPUManagementService process that hosts a particular EPUMDecider instance
    might not be the elected decider.  When it is the elected decider, its EPUMDecider instance
    handles that functionality.  When it is not the elected decider, its EPUMDecider instance
    handles being available in the election.

    See: https://confluence.oceanobservatories.org/display/syseng/CIAD+CEI+OV+Elastic+Computing
    See: https://confluence.oceanobservatories.org/display/CIDev/EPUManagement+Refactor

    "I hear the voices [...] and I know the speculation.  But I'm the decider, and I decide what is best."
    """

    def __init__(self, epum_store, subscribers, provisioner_client, epum_client, dtrs_client,
                 disable_loop=False, base_provisioner_vars=None, loop_interval=5.0, statsd_cfg=None):
        """
        @param epum_store State abstraction for all domains
        @type epum_store EPUMStore
        @param subscribers A way to signal state changes
        @param provisioner_client A way to launch/destroy VMs
        @param epum_client A way to launch subtasks to EPUM workers (reactor roles)
        @param dtrs_client A way to get information from dtrs
        @param disable_loop For unit/integration tests, don't run a timed decision loop
        @param base_provisioner_vars base vars given to every launch
        """

        self.epum_store = epum_store
        self.subscribers = subscribers
        self.provisioner_client = provisioner_client
        self.epum_client = epum_client
        self.dtrs_client = dtrs_client

        self.control_loop = None
        self.enable_loop = not disable_loop
        self.loop_interval = float(loop_interval)
        self.is_leader = False

        # these are given to every launch after engine-provided vars are folded in
        self.base_provisioner_vars = base_provisioner_vars

        # The instances of Engine that make the control decisions for each domain
        self.engines = {}

        # the versions of the engine configs currently applied
        self.engine_config_versions = {}

        # The instances of Control (stateful) that are passed to each Engine to get info and execute cmds
        self.controls = {}

        self.statsd_client = None
        if statsd_cfg is not None:
            try:
                host = statsd_cfg["host"]
                port = statsd_cfg["port"]
                log.info("Setting up statsd client with host %s and port %d" % (host, port))
                self.statsd_client = StatsClient(host, port)
            except Exception:
                log.exception("Failed to set up statsd client")

    def recover(self):
        """Called whenever the whole EPUManagement instance is instantiated.
        """
        # For callbacks: "now_leader()" and "not_leader()"
        self.epum_store.register_decider(self)

    def now_leader(self, block=False):
        """Called when this instance becomes the decider leader.

        When block is true, waits until leader dies or is cancelled
        """
        log.info("Elected as Decider leader")
        self._leader_initialize()
        self.is_leader = True
        if block:
            if self.control_loop:
                self.control_loop.thread.join()
            else:
                raise ValueError("cannot block without a control loop")

    def not_leader(self):
        """Called when this instance is known not to be the decider leader.
        """
        if self.control_loop:
            self.control_loop.stop()
            self.control_loop = None
        self.is_leader = False

    def _leader_initialize(self):
        """Performs initialization routines that may require async processing
        """

        # to make certain we have the latest records for instances, we request provisioner to dump state
        instance_ids = []
        for owner, domain_id in self.epum_store.list_domains():
            domain = self.epum_store.get_domain(owner, domain_id)

            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                for instance in domain.get_instances():
                    if instance.state < InstanceState.TERMINATED:
                        instance_ids.append(instance.instance_id)

        if instance_ids:
            self.provisioner_client.dump_state(nodes=instance_ids)

        # TODO: We need to make a decision about how an engine can be configured to fire vs. how the
        #       decider fires its top-loop.  The decider's granularity controls minimums.
        # WARN: For now the engine-specific "pulse" configuration is ignored.
        if self.enable_loop:
            if not self.control_loop:
                self.control_loop = LoopingCall(self._loop_top)
            self.control_loop.start(self.loop_interval)

    def _loop_top(self):
        """Every iteration of the decider loop, the following happens:

        1. Refresh state.  The EPUM worker processes are constantly updating persistence about the
        state of instances.  We assume this is not an efficiency concern (there is no evidence to the contrary).

        2. In particular, refresh the master domain list.  Some may have been created/removed in the meantime.
        Or this could be the first time this decider is the leader and the engine instances need to be
        created.

        3. For each new domain, create an engine instance and initialize it.

        4. For each pre-existing domain that is not marked as removed:
           A. Check if it has been reconfigured in the meantime.  If so, call reconfigure on the engine.
           B. Run decision cycle.
        """

        before = time.time()
        domains = self.epum_store.get_all_domains()

        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return

        # look for domains that are not active anymore
        active_domains = {}
        for domain in domains:
            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                if domain.is_removed():
                    self._shutdown_domain(domain)
                else:
                    active_domains[domain.key] = domain

                    if domain.key not in self.engines:
                        # New engines (new to this decider instance, at least)
                        try:
                            self._new_engine(domain)
                        except Exception as e:
                            log.error("Error creating engine '%s' for user '%s': %s",
                                      domain.domain_id, domain.owner, str(e), exc_info=True)

        if self.statsd_client is not None:
            try:
                self.statsd_client.gauge("active_domains", len(active_domains))
            except Exception:
                log.exception("Failed to submit metrics")

        for key in self.engines:
            # Perhaps in the meantime, the leader connection failed, bail early
            if not self.is_leader:
                return

            domain = active_domains.get(key)
            if not domain:
                continue

            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                engine_conf, version = domain.get_versioned_engine_config()
                if version > self.engine_config_versions[key]:
                    try:
                        self.engines[key].reconfigure(self.controls[key], engine_conf)
                        self.engine_config_versions[key] = version
                    except Exception as e:
                        log.error("Error in reconfigure call for user '%s' domain '%s': %s",
                              domain.owner, domain.domain_id, str(e), exc_info=True)

                self._get_engine_sensor_state(domain)
                engine_state = domain.get_engine_state()
                self._retry_domain_pending_actions(domain, engine_state.instances)
                try:
                    self.engines[key].decide(self.controls[key], engine_state)

                except Exception as e:
                    # TODO: if failure, notify creator
                    # TODO: If initialization fails, the engine won't be added to the list and it will be
                    #       attempted over and over.  There could be a retry limit?  Or just once is enough.
                    log.error("Error in decide call for user '%s' domain '%s': %s",
                        domain.owner, domain.domain_id, str(e), exc_info=True)
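
The decider above never runs an election itself; it only registers for the now_leader()/not_leader() callbacks via epum_store.register_decider(). A minimal sketch of how a store might drive those callbacks (the store class and its election trigger here are hypothetical, not the project's actual code):

# Hypothetical store illustrating the callback contract assumed by
# EPUMDecider.recover(); the election trigger is invented for this sketch.
class SketchEpumStore(object):
    def __init__(self):
        self._decider = None

    def register_decider(self, decider):
        # EPUMDecider.recover() calls this once at startup
        self._decider = decider

    def on_election_result(self, won_election):
        # In the real system this would be driven by the leader election
        if won_election:
            self._decider.now_leader()
        else:
            self._decider.not_leader()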
Example #11
class EPUMReaper(object):
    """This process infrequently queries each domain in the datastore. It finds
    VM records in a terminal state past the threshold and removes them.

    The instance of the EPUManagementService process that hosts a particular EPUMReaper instance
    might not be the elected reaper.  When it is the elected reaper, this EPUMReaper instance
    handles that functionality.  When it is not the elected reaper, this EPUMReaper instance
    handles being available in the election.
    """

    def __init__(self, epum_store,
                 record_reaping_max_age, disable_loop=False):
        """
        @param epum_store State abstraction for all EPUs
        @param record_reaping_max_age Instance records older than record_reaping_max_age will be deleted
        @param disable_loop For unit/integration tests, don't run a timed decision loop
        """
        self.epum_store = epum_store
        self.record_reaping_max_age = record_reaping_max_age

        self.control_loop = None
        self.enable_loop = not disable_loop
        self.is_leader = False

    def recover(self):
        """Called whenever the whole EPUManagement instance is instantiated.
        """
        # For callbacks: "now_leader()" and "not_leader()"
        self.epum_store.register_reaper(self)

    def now_leader(self, block=False):
        """Called when this instance becomes the reaper leader.
        """
        log.info("Elected as Reaper leader")
        self._leader_initialize()
        self.is_leader = True
        if block:
            if self.control_loop:
                self.control_loop.thread.join()
            else:
                raise ValueError("cannot block without a control loop")

    def not_leader(self):
        """Called when this instance is known not to be the reaper leader.
        """
        if self.control_loop:
            self.control_loop.stop()
            self.control_loop = None
        self.is_leader = False

    def _leader_initialize(self):
        """Performs initialization routines that may require async processing
        """
        if self.enable_loop:
            if not self.control_loop:
                self.control_loop = LoopingCall(self._loop_top)
            self.control_loop.start(300)

    def _loop_top(self):
        """Run the reaper loop.

        Every time this runs, each domain is checked for instances in terminal
        states TERMINATED, FAILED, or REJECTED.  They are deleted if they are
        older than self.record_reaping_max_age.

        """
        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return

        now = time.time()
        domains = self.epum_store.get_all_domains()

        for domain in domains:
            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                if not domain.is_removed():
                    instances = domain.get_instances()
                    for instance in instances:
                        log.info("Instance is " + instance['state'])
                        if instance['state'] in [states.TERMINATED, states.FAILED, states.REJECTED]:
                            state_time = instance['state_time']
                            if now > state_time + self.record_reaping_max_age:
                                log.info("Removing instance %s with no state change for %f seconds",
                                         instance['instance_id'], now - state_time)
                                domain.remove_instance(instance['instance_id'])

        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return
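
The reaper's removal rule above reduces to a single age check on the record's last state change. A hypothetical illustration with invented values:

# Hypothetical values illustrating the reaper's age test above.
import time

record_reaping_max_age = 7200.0  # e.g. two hours
instance = {'instance_id': 'i-1', 'state': 'TERMINATED',
            'state_time': time.time() - 9000}

now = time.time()
if now > instance['state_time'] + record_reaping_max_age:
    # terminal for ~9000 seconds, past the 7200-second threshold: remove it
    print("reap %s after %.0f seconds" % (
        instance['instance_id'], now - instance['state_time']))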
Example #12
class EPUMDoctor(object):
    """The doctor handles critical sections related to 'pronouncing' a VM instance unhealthy.

    In the future it may farm out subtasks to the EPUM workers (EPUMReactor) but currently all
    health-check activity happens directly via the doctor role.

    The instance of the EPUManagementService process that hosts a particular EPUMDoctor instance
    might not be the elected doctor.  When it is the elected doctor, this EPUMDoctor instance
    handles that functionality.  When it is not the elected doctor, this EPUMDoctor instance
    handles being available in the election.

    See: https://confluence.oceanobservatories.org/display/syseng/CIAD+CEI+OV+Elastic+Computing
    See: https://confluence.oceanobservatories.org/display/CIDev/EPUManagement+Refactor
    """

    def __init__(self, epum_store, notifier, provisioner_client, epum_client,
                 ouagent_client, disable_loop=False):
        """
        @param epum_store State abstraction for all EPUs
        @param notifier A way to signal state changes
        @param provisioner_client A way to destroy VMs
        @param epum_client A way to launch subtasks to EPUM workers (reactor roles) (TODO: not sure if needed)
        @param ouagent_client See OUAgent dump_state() in architecture documentation
        @param disable_loop For unit/integration tests, don't run a timed decision loop
        """
        self.epum_store = epum_store
        self.notifier = notifier
        self.provisioner_client = provisioner_client
        self.epum_client = epum_client
        self.ouagent_client = ouagent_client

        self.control_loop = None
        self.enable_loop = not disable_loop
        self.is_leader = False

        # The instances of HealthMonitor that make the health decisions for each domain
        self.monitors = {}

    def recover(self):
        """Called whenever the whole EPUManagement instance is instantiated.
        """
        # For callbacks: "now_leader()" and "not_leader()"
        self.epum_store.register_doctor(self)

    def now_leader(self, block=False):
        """Called when this instance becomes the doctor leader.
        """
        log.info("Elected as Doctor leader")
        self._leader_initialize()
        self.is_leader = True
        if block:
            if self.control_loop:
                self.control_loop.thread.join()
            else:
                raise ValueError("cannot block without a control loop")

    def not_leader(self):
        """Called when this instance is known not to be the doctor leader.
        """
        if self.control_loop:
            self.control_loop.stop()
            self.control_loop = None
        self.is_leader = False

    def _leader_initialize(self):
        """Performs initialization routines that may require async processing
        """
        if self.enable_loop:
            if not self.control_loop:
                self.control_loop = LoopingCall(self._loop_top)
            self.control_loop.start(10)

    def _loop_top(self, timestamp=None):
        """
        Run the doctor decider loop.

        Every time this runs, each domain's health monitor is loaded and
        updated.
        """
        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return

        domains = self.epum_store.get_all_domains()
        active_domains = {}
        for domain in domains:
            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):

                if not domain.is_removed():
                    active_domains[domain.key] = domain

        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return

        # Monitors that are not active anymore
        for key in list(self.monitors.keys()):
            if key not in active_domains:
                del self.monitors[key]

        # New health monitors (new to this doctor instance, at least)
        for domain_key in filter(lambda x: x not in self.monitors,
                active_domains.keys()):
            try:
                self._new_monitor(active_domains[domain_key])
            except Exception as e:
                log.error("Error creating health monitor for '%s': %s",
                          domain_key, str(e), exc_info=True)

        for domain_key in self.monitors.keys():
            # Perhaps in the meantime, the leader connection failed, bail early
            if not self.is_leader:
                return
            try:
                self.monitors[domain_key].update(timestamp)
            except Exception as e:
                log.error("Error in doctor's update call for '%s': %s",
                          domain_key, str(e), exc_info=True)
Example #13
    def test_start_stop_2(self):
        self.loop = loop = LoopingCall(self.looper, 1, hats=True)

        loop.start(1, now=False)
        loop.stop()
        self.assertEqual(self.calls, 0)
Example #14
class LoopingCallTests(unittest.TestCase):

    def setUp(self):
        self.calls = 0
        self.passed = []
        self.condition = threading.Condition()

        self.loop = None

        # tests can set this to make looper stop itself after a specified
        # number of calls. self.loop must also be set.
        self.max_calls = None

        # tests can set this to make looper raise an exception
        self.raise_this = None

        # when looper kills itself, it will set this event
        self.stopped = threading.Event()

    def tearDown(self):
        if self.loop:
            # peek into loop and make sure thread is joined
            self.loop.stop()
            thread = self.loop.thread
            if thread:
                thread.join()

    def assertPassed(self, index, *args, **kwargs):
        passed_args, passed_kwargs = self.passed[index]
        self.assertEqual(args, passed_args)
        self.assertEqual(kwargs, passed_kwargs)

    def assertLastPassed(self, *args, **kwargs):
        self.assertPassed(-1, *args, **kwargs)

    def looper(self, *args, **kwargs):
        with self.condition:
            self.calls += 1
            self.passed.append((args, kwargs))
            self.condition.notify_all()

        if self.max_calls and self.calls >= self.max_calls:
            self.loop.stop()
            self.stopped.set()

        if self.raise_this:
            raise self.raise_this

    def test_start_stop(self):
        self.loop = loop = LoopingCall(self.looper, 1, hats=True)

        loop.start(1)
        loop.stop()

        with self.condition:
            if not self.calls:
                self.condition.wait(5)

        self.assertEqual(self.calls, 1)
        self.assertLastPassed(1, hats=True)

    def test_start_stop_2(self):
        self.loop = loop = LoopingCall(self.looper, 1, hats=True)

        loop.start(1, now=False)
        loop.stop()
        self.assertEqual(self.calls, 0)

    def test_called(self):
        # looper will stop itself after 3 calls
        self.max_calls = 3
        self.loop = loop = LoopingCall(self.looper, 1, 2, anarg=5)

        # interval of 0 makes it not block
        loop.start(0)
        self.assertTrue(self.stopped.wait(5))

        # peek into looping call and join on thread
        thread = loop.thread
        if thread:
            thread.join()

        self.assertFalse(loop.running)
        self.assertEqual(self.calls, 3)
        self.assertPassed(0, 1, 2, anarg=5)
        self.assertPassed(1, 1, 2, anarg=5)
        self.assertPassed(2, 1, 2, anarg=5)

    def test_error_caught(self):
        self.loop = LoopingCall(self.looper)
        self.raise_this = Exception("too many sandwiches")

        self.loop.start(0)

        with self.condition:
            while self.calls < 3:
                self.condition.wait()

        self.loop.stop()
        self.assertGreaterEqual(self.calls, 3)
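
Taken together, these tests pin down the LoopingCall contract: the constructor captures a callable with its arguments, start(interval, now=True) invokes it on a background thread exposed as .thread, stop() cancels the loop, .running reflects liveness, and an exception from the callable is logged rather than fatal (test_error_caught). A minimal sketch satisfying that contract, assuming thread-based semantics; it is an illustration, not the project's implementation:

# Minimal sketch of the LoopingCall contract exercised by the tests above.
# Illustration only; the project's real implementation may differ.
import logging
import threading

log = logging.getLogger(__name__)


class LoopingCall(object):
    def __init__(self, fun, *args, **kwargs):
        self.fun = fun
        self.args = args
        self.kwargs = kwargs
        self.thread = None
        self.running = False
        self._cancelled = threading.Event()

    def start(self, interval, now=True):
        """Call fun every `interval` seconds on a background thread."""
        self._cancelled.clear()
        self.running = True
        self.thread = threading.Thread(target=self._run, args=(interval, now))
        self.thread.daemon = True
        self.thread.start()

    def _run(self, interval, now):
        if now:
            self._call()
        # Event.wait() returns True as soon as stop() sets the event
        while not self._cancelled.wait(interval):
            self._call()
        self.running = False

    def _call(self):
        try:
            self.fun(*self.args, **self.kwargs)
        except Exception:
            # test_error_caught expects the loop to survive errors
            log.exception("Error in looping call")

    def stop(self):
        self._cancelled.set()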
Example #15
class HighAvailabilityService(object):

    def __init__(self, *args, **kwargs):

        configs = ["service", "highavailability"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)

        exchange = kwargs.get('exchange')
        if exchange:
            self.CFG.server.amqp.exchange = exchange

        self.topic = kwargs.get('service_name') or self.CFG.highavailability.get('service_name') or DEFAULT_TOPIC

        self.amqp_uri = kwargs.get('amqp_uri') or None
        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG, self.amqp_uri, sysname=kwargs.get('sysname'))

        process_dispatchers = (kwargs.get('process_dispatchers') or
                self.CFG.highavailability.processdispatchers)

        policy_name = self.CFG.highavailability.policy.name
        if policy_name.lower() not in policy_map:
            raise Exception("HA Service doesn't support '%s' policy" % policy_name)
        self.policy = policy_name.lower()

        policy_parameters = (kwargs.get('policy_parameters') or
                self.CFG.highavailability.policy.parameters)

        process_definition_id = (kwargs.get('process_definition_id') or
                self.CFG.highavailability.process_definition_id)

        self.policy_interval = (kwargs.get('policy_interval') or
                self.CFG.highavailability.policy.interval)

        self.control = DashiHAProcessControl(self.dashi, process_dispatchers)

        self.core = HighAvailabilityCore(self.CFG.highavailability, self.control,
            process_dispatchers, self.policy, parameters=policy_parameters,
            process_definition_id=process_definition_id)

    def start(self):

        log.info("starting high availability instance %s" % self)

        # Set up operations
        self.dashi.handle(self.reconfigure_policy)
        self.dashi.handle(self.dump)

        self.apply_policy_loop = LoopingCall(self.core.apply_policy)
        self.apply_policy_loop.start(self.policy_interval)

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            self.apply_policy_loop.stop()
            log.warning("Caught terminate signal. Bye!")
        else:
            self.apply_policy_loop.stop()
            log.info("Exiting normally. Bye!")

    def stop(self):
        self.dashi.cancel()
        self.dashi.disconnect()

    def reconfigure_policy(self, new_policy):
        """Service operation: Change the parameters of the policy used for service

        @param new_policy: parameters of policy
        @return:
        """
        self.core.reconfigure_policy(new_policy)

    def status(self):
        """Service operation: Get the status of the HA Service

        @return: {PENDING, READY, STEADY, BROKEN}
        """
        return self.core.status()

    def dump(self):
        """Dump state of ha core
        """
        return self.core.dump()
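
The methods registered through dashi.handle() become remote operations named after the handler. A hypothetical client-side invocation, modeled on the dashi.fire() usage elsewhere on this page; the topic name, the call() details, and the policy parameters below are assumptions:

# `dashi` is assumed to be an already-connected dashi client (compare
# bootstrap.dashi_connect in Example #9); the topic name and the policy
# parameter are invented for illustration.
dashi.call("high_availability", "reconfigure_policy",
           new_policy={"preserve_n": 2})
print(dashi.call("high_availability", "status"))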
Example #16
class EPUAgent(object):
    """Elastic Process Unit (EPU) Agent. Monitors vitals in running VMs.
    """

    def __init__(self, *args, **kwargs):

        configs = ["epuagent"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)

        topic = self.CFG.epuagent.get('service_name')
        self.topic = topic or "epu_agent_%s" % uuid.uuid4()

        heartbeat_dest = kwargs.get('heartbeat_dest')
        self.heartbeat_dest = heartbeat_dest or self.CFG.epuagent.heartbeat_dest

        node_id = kwargs.get('node_id')
        self.node_id = node_id or self.CFG.epuagent.node_id

        heartbeat_op = kwargs.get('heartbeat_op')
        self.heartbeat_op = heartbeat_op or self.CFG.epuagent.heartbeat_op

        period = kwargs.get('period_seconds')
        self.period = float(period or self.CFG.epuagent.period_seconds)

        # for testing, allow for not starting heartbeat automatically
        self.start_beat = kwargs.get('start_heartbeat', True)

        amqp_uri = kwargs.get('amqp_uri')

        sock = kwargs.get('supervisor_socket')
        sock = sock or self.CFG.epuagent.get('supervisor_socket')
        if sock:
            log.debug("monitoring a process supervisor at: %s", sock)
            self.supervisor = Supervisor(sock)
        else:
            log.debug("not monitoring process supervisor")
            self.supervisor = None

        self.core = EPUAgentCore(self.node_id, supervisor=self.supervisor)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG, amqp_uri)

    def start(self):
        log.info('EPUAgent starting')

        self.dashi.handle(self.heartbeat)

        self.loop = LoopingCall(self._loop)
        if self.start_beat:
            log.debug('Starting heartbeat loop - %s second interval', self.period)
            self.loop.start(self.period)

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Exiting")
        else:
            log.info("Exiting normally.")


    def _loop(self):
        return self.heartbeat()

    def heartbeat(self):
        try:
            state = self.core.get_state()
            self.dashi.fire(self.heartbeat_dest, self.heartbeat_op,
                    heartbeat=state)
        except Exception as e:
            # unhandled exceptions will terminate the LoopingCall
            log.error('Error heartbeating: %s', e, exc_info=True)