Exemple #1
0
    def __init__(self, comp_name, zkclient, nodepath,
                 operational=False, parent=None, interval=5):
        """
        Store the predicate's configuration and prepare its worker thread.

        The thread targets ``self._run_loop`` and is flagged as a daemon; it
        is created here but not started.

        :param comp_name: forwarded to ``SimplePredicate.__init__`` and used
            to build the logger name
        :type comp_name: str
        :param zkclient: kept on the instance as ``zkclient``
        :type zkclient: kazoo.client.KazooClient
        :param nodepath: kept on the instance as ``node``
        :type nodepath: str
        :param operational: forwarded to ``SimplePredicate.__init__``
        :type operational: bool
        :param parent: forwarded to ``SimplePredicate.__init__``
        :type parent: str or None
        :param interval: kept on the instance as ``interval``
            (presumably the polling period of ``_run_loop`` -- confirm)
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, operational=operational, parent=parent)
        self.node = nodepath
        self.zkclient = zkclient
        self.interval = interval
        self._start = None
        self._stop = None
        self._log = logging.getLogger('sent.{0}.pred.gut'.format(comp_name))
        # str(self) is evaluated here, after the attributes above are set.
        self._log.info('Registered {0}'.format(self))

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

        # Raw strings: "\d", "\-" and "\s" are not valid str escape sequences
        # and trigger Deprecation/SyntaxWarning on modern Python. The pattern
        # text itself is unchanged.
        # Matches an optional "YYYY-MM-DD " prefix followed by "HH:MM" and an
        # optional ":SS".
        self._datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?"
        )
Exemple #2
0
    def __init__(self, comp_name, zkclient, path,
                 operational=False, parent=None, interval=10):
        """
        Initialise the predicate and build its daemon worker thread.

        The thread (target ``self._run_loop``) is only created here; nothing
        in this method starts it.

        :type comp_name: str
        :type zkclient: kazoo.client.KazooClient
        :type path: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        self.zkclient = zkclient
        self.interval = interval
        self.path = path

        logger_name = 'sent.{0}.holiday'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)

        # Worker state: run flag, daemon thread, "already started" marker.
        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
        self._holidays = []
Exemple #3
0
    def __init__(self, comp_name, proc_client, interval,
                 operational=False, parent=None):
        """
        Initialise the process predicate and build its daemon worker thread.

        :type comp_name: str
        :type proc_client: zoom.agent.client.process_client.ProcessClient
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        logger_name = 'sent.{0}.pred.process'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        self._proc_client = proc_client

        # Lock for the synchronous decorator: reuse the process client's lock
        # when a client was supplied, otherwise create a private one.
        self.process_client_lock = (
            proc_client.process_client_lock if proc_client else Lock())

        self.interval = interval
        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
Exemple #4
0
    def __init__(self, comp_name, begin=None, end=None, weekdays=None,
                 operational=False, parent=None, interval=5):
        """
        Parse the time-window settings and build the daemon worker thread.

        ``begin`` and ``end`` are converted with ``get_datetime_object`` and
        ``weekdays`` with ``parse_range``; the thread is created, not started.

        :type comp_name: str
        :type begin: str or None
        :type end: str or None
        :type weekdays: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        self.begin = self.get_datetime_object(begin)
        self.end = self.get_datetime_object(end)
        self.day_range = self.parse_range(weekdays)
        self.interval = interval

        logger_name = 'sent.{0}.pred.timewin'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
Exemple #5
0
    def __init__(self, comp_name, url, verb='GET', expected_code=200,
                 interval=5.0, operational=False, parent=None):
        """
        Store the API check settings and build the daemon worker thread.

        Also raises the ``requests`` connection-pool logger to WARNING.

        :type comp_name: str
        :type url: str
        :type verb: str
        :type expected_code: int
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.api'.format(comp_name))

        pool_logger = logging.getLogger(
            'requests.packages.urllib3.connectionpool')
        pool_logger.setLevel(logging.WARNING)

        self.url = url
        self.verb = verb
        self.expected_code = expected_code
        self.interval = interval

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)
        self._started = False
Exemple #6
0
    def __init__(self, comp_name, command, interval, system,
                 operational=False, parent=None):
        """
        Store the health-check command, verify it and build the worker thread.

        ``self._verify()`` runs after the command, interval and system have
        been stored and before the thread object is created.

        :type comp_name: str
        :type command: str
        :type interval: int or float
        :type system: zoom.common.types.PlatformType
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        logger_name = 'sent.{0}.pred.health'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        self.interval = interval
        self.rawcmd = command
        self._runcmd = ''
        self._system = system
        self._verify()

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)
        self._started = False
Exemple #7
0
 def __init__(self, comp_name, queue, work_dict):
     """
     Build the 'work_manager' daemon thread and the component logger.

     The thread runs ``self._run`` with the run flag, the queue and the
     work dictionary as arguments; it is created here but not started.

     :type comp_name: str
     :type queue: zoom.agent.entities.unique_queue.UniqueQueue
     :type work_dict: dict
     """
     self._operate = ThreadSafeObject(True)
     worker_args = (self._operate, queue, work_dict)
     worker = Thread(target=self._run, name='work_manager', args=worker_args)
     worker.daemon = True
     self._thread = worker
     logger_name = 'sent.{0}.wm'.format(comp_name)
     self._log = logging.getLogger(logger_name)
Exemple #8
0
    def __init__(self, config, system, settings):
        """
        Read the component id and type from ``config`` and create the process.

        ``self._create_process()`` is called last, once every other
        attribute has been stored.

        :type config: xml.etree.ElementTree.Element
        :type system: zoom.common.types.PlatformType
        :type settings: dict
        """
        self._log = logging.getLogger('sent.child')
        self._action_queue = UniqueQueue()
        self._cancel_flag = ThreadSafeObject(False)

        component_id = verify_attribute(config, 'id')
        self.name = component_id
        component_type = verify_attribute(config, 'type')
        self._application_type = component_type

        self._config = config
        # Linux or Windows
        self._system = system
        self._settings = settings
        self._process = self._create_process()
Exemple #9
0
    def __init__(self, comp_name, settings, zkclient, nodepath, parent=None, interval=5):
        """
        Store the predicate's configuration and prepare its worker thread.

        The thread targets ``self._run_loop`` and is flagged as a daemon; it
        is created here but not started.

        :param comp_name: forwarded to ``SimplePredicate.__init__`` and used
            to build the logger name
        :type comp_name: str
        :param settings: forwarded to ``SimplePredicate.__init__``
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :param zkclient: kept on the instance as ``zkclient``
        :type zkclient: kazoo.client.KazooClient
        :param nodepath: kept on the instance as ``node``
        :type nodepath: str
        :param parent: forwarded to ``SimplePredicate.__init__``
        :type parent: str or None
        :param interval: kept on the instance as ``interval``
            (presumably the polling period of ``_run_loop`` -- confirm)
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, settings, parent=parent)
        self.node = nodepath
        self.zkclient = zkclient
        self.interval = interval
        self._start = None
        self._stop = None
        self._log = logging.getLogger('sent.{0}.pred.gut'.format(comp_name))
        # str(self) is evaluated here, after the attributes above are set.
        self._log.info('Registered {0}'.format(self))

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

        # Raw strings: "\d", "\-" and "\s" are not valid str escape sequences
        # and trigger Deprecation/SyntaxWarning on modern Python. The pattern
        # text itself is unchanged.
        # Matches an optional "YYYY-MM-DD " prefix followed by "HH:MM" and an
        # optional ":SS".
        self._datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?"
        )
Exemple #10
0
    def __init__(self, comp_name, operational=False, parent=None, interval=10):
        """
        Initialise the weekend predicate and build its daemon worker thread.

        The thread (target ``self._run_loop``) is created but not started.

        :type comp_name: str
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        self.interval = interval

        logger_name = 'sent.{0}.weekend'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
Exemple #11
0
    def setUp(self):
        """
        Build the fixtures: a Mox instance, four simple predicates and a
        PredicateFactory that is handed those predicates.

        ``predat`` / ``predbt`` have ``set_met(True)`` applied; ``predaf`` /
        ``predbf`` are left exactly as constructed.
        """
        def build_predicate(name, met):
            # set_met is only ever called with True, never with False.
            predicate = SimplePredicate(name, ThreadSafeObject({}))
            if met:
                predicate.set_met(True)
            return predicate

        self.mox = mox.Mox()
        self.comp_name = "Test Predicate Or"

        self.predat = build_predicate("a", True)
        self.predbt = build_predicate("b", True)

        self.predaf = build_predicate("a", False)
        self.predbf = build_predicate("b", False)

        self.list = [self.predaf, self.predbf, self.predat, self.predbt]

        factory_kwargs = dict(component_name="factory",
                              zkclient=None,
                              proc_client=None,
                              system=None,
                              pred_list=self.list,
                              settings={})
        self.factory = PredicateFactory(**factory_kwargs)
Exemple #12
0
    def __init__(self, config, settings, conn, queue, system, application_type):
        """
        Build the application object: read identifying attributes from the
        config, create the Zookeeper client and initialise the process
        client, the actions and the work manager.

        The Zookeeper client is created and given a listener here, but this
        method does not call ``start()`` on it.

        :param config: component configuration; 'id' and 'pagerduty_service'
            are read from it via ``verify_attribute``
        :type config: dict (xml)
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :param conn: passed on to ``_init_work_manager``
        :type conn: multiprocessing.Connection
        :param queue: stored as the action queue
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        """
        self.config = config
        self._settings = settings
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))
        # informational attributes
        self._host = platform.node().upper()
        self._fqdn = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None
        self._actions = dict()  # created in _reset_watches on zk connect
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        self._mode = ApplicationMode(ApplicationMode.MANUAL)
        self._state = ThreadSafeObject(ApplicationState.OK)
        self._trigger_time = ''     # Default to empty string for comparison
        self._login_user = '******'   # Default to Zoom
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        self._paths = self._init_paths(self.config, settings, application_type)

        # clients
        # Only the Linux branch passes a dedicated kazoo logger.
        # NOTE(review): if self._system is neither LINUX nor WINDOWS,
        # self.zkclient is never assigned and add_listener below raises
        # AttributeError.
        if self._system == PlatformType.LINUX:
            self.zkclient = KazooClient(
                hosts=ZK_CONN_STRING,
                handler=SequentialThreadingHandler(),
                logger=logging.getLogger('kazoo.app.{0}'.format(self.name)))
        elif self._system == PlatformType.WINDOWS:
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler())

        self.zkclient.add_listener(self._zk_listener)
        self._proc_client = self._init_proc_client(self.config,
                                                   settings,
                                                   application_type)

        # Replaces the empty dict assigned to self._actions above.
        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue, conn)
    def __init__(self, config, system, settings):
        """
        Read the component id and type from ``config`` and create the process.

        ``self._create_process()`` is called last, once every other
        attribute has been stored.

        :type config: xml.etree.ElementTree.Element
        :type system: zoom.common.types.PlatformType
        :type settings: dict
        """
        self._log = logging.getLogger("sent.child")
        self._action_queue = UniqueQueue()
        self._cancel_flag = ThreadSafeObject(False)

        component_id = verify_attribute(config, "id")
        self.name = component_id
        component_type = verify_attribute(config, "type")
        self._application_type = component_type

        self._config = config
        # Linux or Windows
        self._system = system
        self._settings = settings
        self._process = self._create_process()
Exemple #14
0
    def __init__(self, comp_name, operational=False, parent=None, interval=10):
        """
        Initialise the weekend predicate and build its daemon worker thread.

        The thread (target ``self._run_loop``) is created but not started.

        :type comp_name: str
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        self.interval = interval

        logger_name = "sent.{0}.weekend".format(comp_name)
        self._log = logging.getLogger(logger_name)
        registered_msg = "Registered {0}".format(self)
        self._log.info(registered_msg)

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
Exemple #15
0
    def __init__(self, comp_name, settings, zkclient, parent=None, interval=10):
        """
        Initialise the holiday predicate and build its daemon worker thread.

        The thread (target ``self._run_loop``) is created but not started;
        the holiday list starts out empty.

        :type comp_name: str
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type zkclient: kazoo.client.KazooClient
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, settings, parent=parent)
        self.zkclient = zkclient
        self.interval = interval

        logger_name = 'sent.{0}.holiday'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
        self._holidays = []
    def __init__(self,
                 comp_name,
                 begin=None,
                 end=None,
                 weekdays=None,
                 operational=False,
                 parent=None,
                 interval=5):
        """
        Parse the time-window settings and build the daemon worker thread.

        ``begin`` and ``end`` are converted with ``get_datetime_object`` and
        ``weekdays`` with ``parse_range``; the thread is created, not started.

        :type comp_name: str
        :type begin: str or None
        :type end: str or None
        :type weekdays: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        self.begin = self.get_datetime_object(begin)
        self.end = self.get_datetime_object(end)
        self.day_range = self.parse_range(weekdays)
        self.interval = interval

        logger_name = 'sent.{0}.pred.timewin'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
Exemple #17
0
    def __init__(self,
                 comp_name,
                 command,
                 interval,
                 system,
                 operational=False,
                 parent=None):
        """
        Store the health-check command, verify it and build the worker thread.

        ``self._verify()`` runs after the command, interval and system have
        been stored and before the thread object is created.

        :type comp_name: str
        :type command: str
        :type interval: int or float
        :type system: zoom.common.types.PlatformType
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        logger_name = 'sent.{0}.pred.health'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        self.interval = interval
        self.rawcmd = command
        self._runcmd = ''
        self._system = system
        self._verify()

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)
        self._started = False
Exemple #18
0
    def __init__(self,
                 comp_name,
                 settings,
                 start=None,
                 stop=None,
                 weekdays=None,
                 parent=None,
                 interval=5):
        """
        Parse the start/stop times and weekday range, then build the daemon
        worker thread (created here, not started).

        :type comp_name: str
        :type settings: ThreadSafeObject
        :type start: str or None
        :type stop: str or None
        :type weekdays: str or None
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, settings, parent=parent)
        self.start_time = self._get_datetime_object(start)
        self.stop_time = self._get_datetime_object(stop)
        self.day_range = self._parse_range(weekdays)
        self.interval = interval

        logger_name = 'sent.{0}.pred.time'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
Exemple #19
0
    def __init__(self, comp_name, settings, proc_client, interval, parent=None):
        """
        Initialise the process predicate and build its daemon worker thread.

        :type comp_name: str
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type proc_client: zoom.agent.client.process_client.ProcessClient
        :type interval: int or float
        :type parent: str or None
        """
        SimplePredicate.__init__(self, comp_name, settings, parent=parent)
        logger_name = 'sent.{0}.pred.process'.format(comp_name)
        self._log = logging.getLogger(logger_name)
        self._proc_client = proc_client

        # Lock for the synchronous decorator: reuse the process client's lock
        # when a client was supplied, otherwise create a private one.
        self.process_client_lock = (
            proc_client.process_client_lock if proc_client else Lock())

        self.interval = interval
        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False
Exemple #20
0
    def __init__(self):
        """
        Create the Sentinel daemon: connect to Zookeeper, build the task
        client and REST server, and install signal handlers.

        Unlike a plain attribute initialiser, this constructor starts the
        Zookeeper client, so the registered listener may fire before the
        constructor has finished.
        """
        self._log = logging.getLogger('sent.daemon')
        self._log.info('Creating Sentinel')

        self.children = dict()
        self._settings = ThreadSafeObject(dict())
        self._system = self._get_system()
        self._hostname = platform.node().upper()  # must be uppercase
        self._prev_state = None
        self.listener_lock = Lock()

        # Only the Linux branch passes a dedicated kazoo logger.
        # NOTE(review): if self._system is neither LINUX nor WINDOWS,
        # self.zkclient is never assigned and add_listener below raises
        # AttributeError.
        if self._system == PlatformType.LINUX:
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler(),
                                        logger=logging.getLogger('kazoo.daemon'))
        elif self._system == PlatformType.WINDOWS:
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler())

        self.zkclient.add_listener(self._zk_listener)
        # this will run self._reset_after_connection_loss
        self.zkclient.start()

        # The attribute is first set to None and then immediately replaced
        # by the real TaskClient.
        self.task_client = None
        self.task_client = TaskClient(self.children,
                                      self.zkclient,
                                      self._settings)

        self._rest_server = tornado.httpserver.HTTPServer(
            RestServer(self.children, self._settings))

        # SIGINT and SIGTERM share the same handler.
        signal.signal(signal.SIGINT, self._handle_sigint)
        signal.signal(signal.SIGTERM, self._handle_sigint)
        self._log.info('Created Sentinel')
Exemple #21
0
    def __init__(self,
                 comp_name,
                 url,
                 verb='GET',
                 expected_code=200,
                 interval=5.0,
                 operational=False,
                 parent=None):
        """
        Store the API check settings and build the daemon worker thread.

        Also raises the ``requests`` connection-pool logger to WARNING.

        :type comp_name: str
        :type url: str
        :type verb: str
        :type expected_code: int
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.api'.format(comp_name))

        pool_logger = logging.getLogger(
            'requests.packages.urllib3.connectionpool')
        pool_logger.setLevel(logging.WARNING)

        self.url = url
        self.verb = verb
        self.expected_code = expected_code
        self.interval = interval

        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        registered_msg = 'Registered {0}'.format(self)
        self._log.info(registered_msg)
        self._started = False
class Application(object):
    """
    Service object to represent a deployed service.
    """
    def __init__(self, config, settings, queue, system, application_type,
                 cancel_flag):
        """
        Build the application object: read attributes from the config,
        create the Zookeeper client and initialise the restart logic, the
        process client, the actions and the work manager.

        The Zookeeper client is created and given a listener here, but this
        method does not call ``start()`` on it.

        :param config: component configuration; 'id', 'restart_on_crash',
            'post_stop_sleep', 'pagerduty_service' and 'restartmax' are read
            from it via ``verify_attribute``
        :type config: dict (xml)
        :type settings: dict
        :param queue: stored as the action queue
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        :param cancel_flag: passed on to ``_init_proc_client``
        :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        """
        self.config = config
        self._settings = settings
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))
        # informational attributes
        self._host = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None
        self._actions = dict()  # created in _reset_watches on zk connect
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type
        self._restart_on_crash = \
            verify_attribute(self.config, 'restart_on_crash', none_allowed=True)
        # Sleep applied by stop() after the process client returns;
        # defaults to 5 when not configured.
        self._post_stop_sleep = verify_attribute(self.config, 'post_stop_sleep',
                                                 none_allowed=True, cast=int,
                                                 default=5)

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        # Mode, state and restart counter all share the same callback,
        # self._update_agent_node_with_app_details.
        self._mode = ApplicationMode(
            ApplicationMode.MANUAL,
            callback=self._update_agent_node_with_app_details)
        self._state = ThreadSafeObject(
            ApplicationState.OK,
            callback=self._update_agent_node_with_app_details)
        self._start_stop_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        self._user_set_in_react = False
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        restartmax = verify_attribute(config, 'restartmax', none_allowed=True,
                                      cast=int, default=3)
        self._rl = RestartLogic(
            self.name,
            restartmax,
            count_callback=self._update_agent_node_with_app_details)

        self._read_only = False

        self._paths = self._init_paths(self.config, settings, application_type)

        # clients
        self.zkclient = KazooClient(
            hosts=get_zk_conn_string(),
            timeout=60.0,
            handler=SequentialThreadingHandler(),
            logger=logging.getLogger('kazoo.app.{0}'.format(self.name)))

        self.zkclient.add_listener(self._zk_listener)
        self._proc_client = self._init_proc_client(self.config,
                                                   application_type,
                                                   cancel_flag)

        # Replaces the empty dict assigned to self._actions above.
        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue)

    def app_details(self):
        """
        Return a snapshot of this application's current attributes.

        :return: dict with the keys name, host, platform, mode, state,
            start_stop_time, login_user, read_only and restart_count
        :rtype: dict
        """
        details = {}
        details['name'] = self.name
        details['host'] = self._host
        details['platform'] = self._system
        details['mode'] = self._mode.value
        details['state'] = self._state.value
        details['start_stop_time'] = self._start_stop_time
        details['login_user'] = self._login_user
        details['read_only'] = self._read_only
        details['restart_count'] = self._rl.count
        return details

    def run(self):
        """
        Main loop of the application.

        - Start the zookeeper client.
        - Start every configured action and log whether all of them report
          ``started``.
        - Run ``_check_mode()``, then sleep in 5 second steps until
          ``self._running`` becomes false, and finally call
          ``uninitialize()``.

        Any exception raised along the way is logged as critical and not
        re-raised.
        """
        try:
            self.zkclient.start()
            # make all action objects start processing predicates
            self._log.info('Starting to process Actions.')
            # Explicit loop instead of map(): on Python 3 map() is lazy, so
            # using it purely for side effects would never call start().
            for action in self._actions.values():
                action.start()
            started = all([i.started for i in self._actions.values()])
            if not started:
                self._log.critical('All actions are not started!')
            else:
                self._log.info('All actions started.')
            self._check_mode()  # get global mode AFTER starting actions

            while self._running:
                sleep(5)

            self.uninitialize()
        except Exception as ex:
            self._log.critical('There was an exception in the main loop. '
                               'In a bad state. ({0})'.format(ex))

    @catch_exception(NodeExistsError)
    @connected
    def register(self, **kwargs):
        """
        Add this application's entry to the state tree.

        The ephemeral state node is only created when it does not exist yet
        and the action named by ``action_name`` is ready. After creating it,
        a RESOLVE alert node is created, the process client's counters are
        reset and the state is set to STARTED.

        :param kwargs: ``action_name`` (default 'register') selects the
            action whose readiness is checked
        :return: always 0
        """
        action_name = kwargs.get('action_name', 'register')
        state_path = self._paths['zk_state_path']

        if self.zkclient.exists(state_path):
            self._log.info('Already registered (node exists).')
            return 0

        if not self._action_is_ready(action_name):
            self._log.info('Action {0} is not ready. Not registering.'
                           .format(action_name))
            return 0

        self._log.info('Registering %s in state tree.' % self.name)
        self.zkclient.create(state_path, ephemeral=True, makepath=True)

        # resolve any pager duty alarms
        self._create_alert_node(AlertActionType.RESOLVE, AlertReason.RESOLVED)
        # reset restart counters, etc
        self._proc_client.reset_counters()

        self._state.set_value(ApplicationState.STARTED)
        return 0

    @catch_exception(NoNodeError)
    @connected
    def unregister(self, **kwargs):
        """
        Remove this application's entry from the state tree.

        Nothing is deleted unless the action named by ``action_name``
        (default 'unregister') is ready.

        :return: always 0
        """
        action_name = kwargs.get('action_name', 'unregister')
        if not self._action_is_ready(action_name):
            return 0

        self._log.info('Un-registering %s from state tree.' % self.name)
        state_path = self._paths['zk_state_path']
        self.zkclient.delete(state_path)
        return 0

    @catch_exception(RuntimeError)
    def uninitialize(self):
        """
        Gracefully stop this Zookeeper session, then free any resources
        held by the client.

        In order: stop the work manager, stop every action, drop the stored
        predicates, then stop and close the Zookeeper client.

        :return: always 0
        """
        self._log.info('Stopping Zookeeper client')
        self._work_manager.stop()
        # Explicit loop instead of map(): on Python 3 map() is lazy, so
        # using it purely for side effects would never call stop().
        for action in self._actions.values():
            action.stop()
        del self._predicates[:]  # make sure we delete old predicates
        self.zkclient.stop()
        self.zkclient.close()
        return 0

    @time_this
    def start(self, **kwargs):
        """
        Start the actual process and record the outcome in the app state.

        The start is skipped (returning 0) when the app was stopped through
        Zoom and should stay down, or when it crashed and restart-on-crash
        is not enabled.

        :param kwargs: passed from zoom.handlers.control_agent_handlers.
            Keys read here: ``reset`` (default True), ``pause`` (default
            False), ``pd_enabled`` (default True) and ``login_user``
            (default 'Zoom').
        :return: 0 when the start is skipped, otherwise whatever the process
            client's ``start()`` returned
        """
        # Restart from UI: ran_stop=True, stay_down=False
        # Stop from UI: ran_stop=True, stay_down=True
        # Crash: ran_stop=False, stay_down=False
        restart_logic = self._proc_client.restart_logic
        stopped_with_zoom = (restart_logic.ran_stop
                             and restart_logic.stay_down
                             and self._apptype == ApplicationType.APPLICATION)
        if stopped_with_zoom:
            self._log.info('Not starting. App was stopped with Zoom.')
            # set to OK just in case we're staggered
            self._state.set_value(ApplicationState.OK)
            return 0

        if restart_logic.crashed and not self._restart_on_crash:
            self._log.info('Not starting. The application has crashed.')
            self._state.set_value(ApplicationState.NOTIFY)
            return 0

        self._log.debug('Start allowed.')

        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        self._start_stop_time = self._get_current_time()

        # set login user if not set in react
        if not self._user_set_in_react:
            self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STARTING)

        result = self._proc_client.start()

        # Reset to global mode if restart with dep
        if self._run_check_mode:
            self._check_mode()
            self._run_check_mode = False

        succeeded = result == 0 or result == ApplicationStatus.CANCELLED
        if succeeded:
            self._state.set_value(ApplicationState.STARTED)
            return result

        self._state.set_value(ApplicationState.ERROR)
        if kwargs.get('pd_enabled', True):
            self._create_alert_node(AlertActionType.TRIGGER,
                                    AlertReason.FAILEDTOSTART)
        else:
            self._log.debug('PD is disabled, not sending alert.')
        return result

    @time_this
    def stop(self, **kwargs):
        """
        Stop the actual process and record the outcome in the app state.

        :param kwargs: Passed from:
            zoom.www.handlers.control_agent_handler.ControlAgentHandler,
            zoom.agent.action.action.Action
            Keys read here: ``reset`` (default True), ``pause`` (default
            False) and ``login_user`` (default 'Zoom'); the whole dict is
            also forwarded to the process client's ``stop()``.
        :return: whatever the process client's ``stop()`` returned
        """
        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        self._start_stop_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STOPPING)

        result = self._proc_client.stop(**kwargs)

        if result != ApplicationStatus.CANCELLED:
            # give everything time to catch up, not sure why anymore...
            pause_seconds = self._post_stop_sleep
            self._log.info('Sleeping for the configured {0}s after stop.'
                           .format(pause_seconds))
            sleep(self._post_stop_sleep)

        # reset this value back to False
        self._user_set_in_react = False

        # A cancelled stop and a zero result both end in STOPPED; any other
        # result is an ERROR.
        if result == ApplicationStatus.CANCELLED:
            final_state = ApplicationState.STOPPED
        elif result != 0:
            final_state = ApplicationState.ERROR
        else:
            final_state = ApplicationState.STOPPED
        self._state.set_value(final_state)

        return result

    def status(self):
        """
        Build a report framed by two STATUS banners containing this object's
        string form and the `status` of every configured action, log it at
        info level and return it.
        :rtype: str
        """
        banner = '#' * 40 + ' STATUS ' + '#' * 40

        pieces = ['\n', banner, '\n{0}'.format(self), '\n']
        pieces.extend('\n{0}'.format(action.status)
                      for action in self._actions.values())
        pieces.extend(['\n', banner, '\n'])

        report = ''.join(pieces)
        self._log.info(report)
        return report

    def restart(self, **kwargs):
        """
        Queue a restart as three tasks: stop, unregister, start.

        Anything already waiting in the action queue is cleared first. The
        caller's kwargs are attached to the stop and start tasks only.
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :return: always 0
        """
        # if not self._action_is_ready('restart', allow_undefined=True):
        #     self._log.info('Restart action not ready.')
        #     return

        self._log.info('Running Restart. Queuing stop, unregister, start.')
        queue = self._action_queue
        queue.clear()

        steps = (
            ('stop', {'kwargs': kwargs}),
            ('unregister', {}),
            ('start', {'kwargs': kwargs}),
        )
        for task_name, task_args in steps:
            queue.append_unique(Task(task_name, **task_args))
        return 0

    def dep_restart(self, **kwargs):
        """
        Queue a 'start_if_ready' task carrying the caller's kwargs and flag
        the next start() to re-run the mode check.
        :return: always 0
        """
        # only used in self.start()
        self._run_check_mode = True
        pending = Task('start_if_ready', kwargs=kwargs)
        self._action_queue.append(pending)
        return 0

    def start_if_ready(self, **kwargs):
        """
        Start now when the start action is ready or is not defined at all;
        otherwise queue a 'react' task with the same kwargs.
        :return: always 0
        """
        can_start = self._action_is_ready('start')
        if not can_start:
            # if start action doesn't exist, a.k.a. read only
            can_start = self._actions.get('start', None) is None

        if can_start:
            self.start(**kwargs)
            return 0

        self._action_queue.append(Task('react', kwargs=kwargs))
        return 0

    @time_this
    @connected
    def ignore(self, **kwargs):
        """
        Switch the application mode to manual and log the new mode.
        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :return: always 0
        """
        manual = ApplicationMode.MANUAL
        self._mode.set_value(manual)
        self._log.info('Mode is now "{0}"'.format(self._mode))
        return 0

    @time_this
    @connected
    def react(self, **kwargs):
        """
        Switch the application mode to auto, log the new mode and remember
        the requesting user.
        :param kwargs: passed from zoom.handlers.control_agent_handlers;
            'login_user' is read from it (default 'Zoom')
        :return: always 0
        """
        auto = ApplicationMode.AUTO
        self._mode.set_value(auto)
        self._log.info('Mode is now "{0}"'.format(self._mode))

        # when react is called through "restart with dependencies" command
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._user_set_in_react = True
        return 0

    @time_this
    @connected
    def notify(self, **kwargs):
        """
        Send notification based on arbitrary predicates.

        Recognized kwargs: action_name (default 'notify'), pd_enabled
        (default True) and pd_reason (default AlertReason.CRASHED).
        :return: 1 when the named action is not defined or not ready,
            otherwise 0
        """
        name = kwargs.get('action_name', 'notify')
        alerting_on = kwargs.get('pd_enabled', True)
        reason = kwargs.get('pd_reason', None)
        if reason is None:
            reason = AlertReason.CRASHED

        if not self._action_is_ready(name):
            self._log.info('notify action not defined or not ready.')
            return 1

        self._state.set_value(ApplicationState.NOTIFY)
        if not alerting_on:
            self._log.debug('PD is disabled, not sending alert.')
            return 0

        self._create_alert_node(AlertActionType.TRIGGER, reason)
        return 0

    @time_this
    @connected
    def ensure_running(self, **kwargs):
        """
        Essentially a clone of `notify`, but tailored for process monitoring.

        Nothing is done when the application is already in the ERROR state
        or when the named action is not ready. Otherwise, unless the restart
        logic reports that a stop was run (`ran_stop`), the state is set to
        NOTIFY and, if enabled, an alert node is created.

        Recognized kwargs: action_name (default 'ensure_running'),
        pd_enabled (default True) and pd_reason (default
        AlertReason.CRASHED).
        :rtype: int
        :return: 1 when skipped (ERROR state, or action not defined/ready),
            otherwise 0
        """
        # Application failed to start. Already sent PD alert
        if self._state == ApplicationState.ERROR:
            return 1

        action_name = kwargs.get('action_name', 'ensure_running')
        pd_enabled = kwargs.get('pd_enabled', True)
        pd_reason = kwargs.get('pd_reason', None)

        if pd_reason is None:
            pd_reason = AlertReason.CRASHED

        if not self._action_is_ready(action_name):
            # name the action actually checked rather than 'notify'
            self._log.info('{0} action not defined or not ready.'
                           .format(action_name))
            # report the skip with an int, as notify() does, instead of None
            return 1

        if not self._proc_client.restart_logic.ran_stop:
            # the application has crashed
            self._state.set_value(ApplicationState.NOTIFY)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER, pd_reason)
            else:
                self._log.debug('PD is disabled, not sending alert.')
        else:
            self._log.debug("Service shut down gracefully")

        return 0

    def terminate(self):
        """
        Terminate child thread/process.

        Only clears the `_running` flag; other methods of this class check
        that flag before doing work.
        :return: always 0
        """
        self._running = False
        return 0

    def _action_is_ready(self, action_name, allow_undefined=False):
        """
        Report whether the configured action named `action_name` is ready.

        :type action_name: str
        :param allow_undefined: when truthy, an action that is not configured
            counts as ready
        :type allow_undefined: bool
        :rtype: bool
        :return: the action's `ready` value, or for an unconfigured action
            True/False according to `allow_undefined`
        """
        action = self._actions.get(action_name)
        if action is None:
            # an undefined action is only "ready" when explicitly allowed
            return True if allow_undefined else False
        return action.ready

    @catch_exception(NoNodeError)
    @connected
    def _update_agent_node_with_app_details(self, event=None):
        """
        Register app data with the agent in the state tree.

        Reads the node at `zk_state_base` (creating it first when it is
        missing and this object is still running), flags a config conflict
        when the stored 'host' differs from this host, rewrites the node when
        its content differs from `app_details()`, and finally re-registers
        itself as the watch on the node, or terminates on a config conflict.
        :param event: not used by the body; present because this method is
            registered as its own ZooKeeper watch callback
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        # only (re)create the node while running
        if self._running and \
                not self.zkclient.exists(self._paths['zk_state_base']):
            self.zkclient.create(self._paths['zk_state_base'], makepath=True)

        data, stat = self.zkclient.get(self._paths['zk_state_base'])

        # node content that is not valid JSON is treated as "nothing stored"
        try:
            agent_apps = json.loads(data)
        except ValueError:
            agent_apps = dict()

        # check for config conflict
        other_host = agent_apps.get('host')
        if other_host is not None and self._host != other_host:
            self._log.error('There is a config conflict with {0}. Updates '
                            'will no longer be sent until it is resolved.'
                            .format(other_host))
            # run_callback=False is passed through to set_value as-is
            self._state.set_value(ApplicationState.CONFIG_ERROR,
                                  run_callback=False)

        # make sure data is the most recent
        if self.app_details() != agent_apps:
            self.zkclient.set(self._paths['zk_state_base'],
                              json.dumps(self.app_details()))
            self._log.debug('Registering app data {0}'
                            .format(self.app_details()))

        # set watch
        if self._state != ApplicationState.CONFIG_ERROR:
            # this get() exists only to register the watch; its result is
            # discarded
            self.zkclient.get(
                self._paths['zk_state_base'],
                watch=self._update_agent_node_with_app_details)
        else:
            self._log.error('Shutting down because of config error.')
            self.terminate()

    def _init_paths(self, config, settings, atype):
        """
        Build the ZooKeeper paths used by this component.

        The returned dict has the keys 'zk_state_base' (the configured
        'registrationpath', defaulting to <state>/<atype>/<name>),
        'zk_state_path', 'zk_config_path' and 'zk_agent_path'.
        :rtype: dict
        """
        zk_settings = settings.get('zookeeper', {})

        default_base = self._pathjoin(zk_settings.get('state'),
                                      atype, self.name)
        state_base = verify_attribute(config,
                                      'registrationpath',
                                      none_allowed=True,
                                      default=default_base)

        return {
            'zk_state_base': state_base,
            'zk_state_path': self._pathjoin(state_base, self._host),
            'zk_config_path': self._pathjoin(zk_settings.get('config'),
                                             atype, self.name),
            'zk_agent_path': self._pathjoin(zk_settings.get('agent_state'),
                                            self._host),
        }

    def _init_proc_client(self, config, atype, cancel_flag):
        """
        Create the process client from the start/stop/status commands and
        script found in `config` (each may be absent) plus this component's
        name, restart logic and graphite metric names.
        """
        command_keys = ('start_cmd', 'stop_cmd', 'status_cmd', 'script')
        client_args = dict(
            (key, verify_attribute(config, key, none_allowed=True))
            for key in command_keys
        )

        metric_names = self._get_graphite_metric_names()
        client_args.update(name=self.name,
                           apptype=atype,
                           restart_logic=self._rl,
                           graphite_metric_names=metric_names,
                           cancel_flag=cancel_flag)

        return ProcessClient(**client_args)

    def _init_actions(self, settings):
        """
        Create the actions for `self.config` through an ActionFactory wired
        to this component's collaborators, then update the read-only flag
        from the result.
        :rtype: dict
        """
        factory_args = dict(
            component=self,
            zkclient=self.zkclient,
            proc_client=self._proc_client,
            action_queue=self._action_queue,
            mode=self._mode,
            system=self._system,
            pred_list=self._predicates,
            app_state=self._state,
            settings=settings,
        )
        actions = ActionFactory(**factory_args).create(self.config)

        self._determine_read_only(actions)
        return actions

    def _determine_read_only(self, actions):
        """
        Set `_read_only` to True when `actions` has no 'start' entry or when
        that entry's `disabled` attribute is exactly True; False otherwise.
        """
        start_action = actions.get('start')
        self._read_only = (start_action is None
                           or start_action.disabled is True)

    def _init_work_manager(self, queue):
        """
        Create and start the WorkManager that consumes `queue`.

        The manager is given a name -> callable map: every configured action
        contributes its `run`, and every public callable attribute of this
        object is added under its own name unless an action already claimed
        that name.
        :rtype: zoom.agent.entities.work_manager.WorkManager
        """
        # actions have additional logic, so use those if available
        # (items() rather than the Python-2-only iteritems())
        acceptable_work = dict(
            (name, action.run) for name, action in self._actions.items()
        )

        # if action is not available, add public methods
        public_names = [a for a in dir(self) if not a.startswith('_')]
        for attribute in public_names:
            obj = getattr(self, attribute)
            if not callable(obj):
                continue
            if attribute in acceptable_work:
                self._log.debug('Method {0} already assigned to action.'
                                .format(attribute))
            else:
                acceptable_work[attribute] = obj

        manager = WorkManager(self.name, queue, acceptable_work)
        manager.start()
        return manager

    @connected
    def _check_mode(self, event=None):
        """
        Read the global run mode from ZooKeeper and apply it to `_mode`,
        re-registering this method as the watch on the mode node.

        Without a configured 'global_config' path only a warning is logged.
        A missing mode node is logged and leaves the mode untouched; any
        other exception is logged and swallowed.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        zk_settings = self._settings.get('zookeeper', {})
        global_path = zk_settings.get('global_config')
        if global_path is None:
            self._log.warning('Received no global config path. Zoom will be '
                              'unable to change the global mode.')
            return

        modepath = self._pathjoin(global_path, 'mode')
        try:
            raw, _stat = self.zkclient.get(modepath, watch=self._check_mode)
            parsed = json.loads(raw)
            self._log.info('Getting mode from Zookeeper from path: {0}'.
                           format(modepath))
            # a node without a 'mode' key falls back to manual
            new_mode = str(parsed.get(u'mode', ApplicationMode.MANUAL))
            self._mode.set_value(new_mode)
            self._log.info('Setting mode to "{0}"'.format(self._mode))
        except NoNodeError:
            self._log.info('ZK path {0} does not exist. Assuming mode "manual"'
                           .format(modepath))
        except Exception:
            self._log.exception('An uncaught exception has occurred.')

    def _pathjoin(self, *args):
        """
        Helper function to join paths: `os.path.join` on Linux, plain '/'
        joining on Windows.
        :rtype: str
        :return: the joined path, or None for any other platform value
        """
        system = self._system
        if system == PlatformType.LINUX:
            return os.path.join(*args)
        if system == PlatformType.WINDOWS:
            return '/'.join(args)
        return None

    def _get_graphite_metric_names(self):
        """
        Build the graphite metric names for this component.

        The part of 'zk_state_base' that follows the configured ZooKeeper
        state root is turned into a dotted name and substituted into the
        'result', 'runtime' and 'updown' templates of the 'graphite'
        settings. Without graphite settings every value stays None.
        :rtype: dict
        """
        metric_keys = ("result", "runtime", "updown")

        state_root = self._settings.get('zookeeper', {}).get('state')
        state_base = self._paths.get('zk_state_base')
        # keep only what follows "<state root>/"
        relative_path = state_base.split(state_root + '/', 1)[1]
        type_metric = relative_path.replace('/', '.')

        names = dict.fromkeys(metric_keys)
        graphite = self._settings.get('graphite')
        if graphite is None:
            return names

        for key in metric_keys:
            template = str(graphite.get(key))
            names[key] = template.format(type_metric)
        return names

    def _get_current_time(self):
        """
        Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'.
        :rtype: str
        """
        timestamp_format = "%Y-%m-%d %H:%M:%S"
        return datetime.now().strftime(timestamp_format)

    def _get_alert_details(self, alert_action, reason):
        """
        Assemble the alert payload for this component and host.

        :param alert_action: stored under 'action'
        :param reason: inserted as the issue text of the description/details
        :rtype: dict
        :return: dict with the keys 'action', 'service_key', 'incident_key',
            'description' and 'details'
        """
        name, host = self.name, self._host

        description = ('Sentinel Error: name={0}, host={1}, issue="{2}".'
                       .format(name, host, reason))
        details = ('Sentinel Error: name={0}, host={1}, issue="{2}".\n'
                   'Review the application log and contact the appropriate'
                   ' development group.'
                   .format(name, host, reason))

        payload = dict()
        payload["action"] = alert_action
        payload["service_key"] = self._pd_svc_key
        payload["incident_key"] = self._pathjoin('sentinel', name, host)
        payload["description"] = description
        payload["details"] = details
        return payload

    @catch_exception(NoNodeError)
    @connected
    def _create_alert_node(self, alert_action, reason):
        """
        Create Node in ZooKeeper that will result in a PagerDuty alarm.

        The node is written (created, or overwritten when it already exists)
        only when this environment is listed in the pagerduty
        'enabled_environments' setting; otherwise only the path that would
        have been used is logged. Without a configured alert path a warning
        is logged and nothing is written.
        :type alert_action: zoom.common.types.AlertActionType
        :param reason: forwarded to `_get_alert_details`
        """
        details = self._get_alert_details(alert_action, reason)

        alert_root = self._settings.get('zookeeper', {}).get('alert')
        if alert_root is None:
            self._log.warning('Was given no alert path. This sentinel will be '
                              'unable to forward alerts to Zoom.')
            return

        # path example: /foo/sentinel.bar.baz.HOSTFOO
        node_name = re.sub('/', '.', details['incident_key'])
        alert_path = self._pathjoin(alert_root, node_name)

        enabled_envs = (self._settings.get('pagerduty', {})
                        .get('enabled_environments', []))
        if self._env not in enabled_envs:
            self._log.info('Not creating alert "{0}" node for env: {1}'
                           .format(alert_action, self._env))
            self._log.info('Would have created path {0}'.format(alert_path))
            return

        self._log.info('Creating alert "{0}" node for env: {1}'
                       .format(alert_action, self._env))
        # overwrite an existing node, create a missing one
        if self.zkclient.exists(alert_path):
            write = self.zkclient.set
        else:
            write = self.zkclient.create
        write(alert_path, value=json.dumps(details))

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    def _reset_after_connection_loss(self):
        """
        Recreates all actions and predicates after connection loss.
        Recheck the mode and allowed instances.

        While running: every current action is stopped, the actions and the
        predicate list are discarded, the actions are rebuilt from the stored
        settings, the predicates are reset, the new actions are started and
        the mode is re-checked. When no longer running only a log line is
        written.
        """
        if not self._running:
            self._log.info('The daemon has called for termination. '
                           'Not trying to reset after connection loss.')
            return

        self._log.info('Application listener callback triggered')

        # Explicit loops: map() is lazy on Python 3, so using it purely for
        # side effects would silently do nothing there.
        for action in list(self._actions.values()):  # stop actions
            action.stop()
        self._actions.clear()
        self._predicates = []
        self._actions = self._init_actions(self._settings)
        for predicate in self._predicates:  # reset predicates
            predicate.reset()
        for action in list(self._actions.values()):  # start actions
            action.start()
        self._check_mode()

        self._log.info('Application listener callback complete!')

    def _zk_listener(self, state):
        """
        The callback function that runs when the connection state to Zookeeper
        changes.

        A transition to CONNECTED spawns `_reset_after_connection_loss` on
        the client's handler, except for the very first connection (no
        previous state). CONNECTED->SUSPENDED, CONNECTED->LOST and
        SUSPENDED->LOST are only recorded. Any other transition is logged as
        unknown and the previous state is left unchanged. Exceptions are
        logged, never raised.
        """
        try:
            previous = self._prev_state
            self._log.info('Zookeeper Connection went from {0} to {1}'
                           .format(previous, state))

            # transitions that need no action beyond being remembered
            recorded_only = (
                (KazooState.CONNECTED, KazooState.SUSPENDED),
                (KazooState.CONNECTED, KazooState.LOST),
                (KazooState.SUSPENDED, KazooState.LOST),
            )

            if state == KazooState.CONNECTED:
                # the initial connect has nothing to reset
                if previous is not None:
                    self.zkclient.handler.spawn(
                        self._reset_after_connection_loss)
            elif any(previous == was and state == now
                     for was, now in recorded_only):
                pass
            else:
                self._log.info('Zookeeper Connection in unknown state: {0}'
                               .format(state))
                return

            self._prev_state = state

        except Exception as ex:
            self._log.exception('An uncaught exception has occurred in the '
                                'listener: {0}'.format(ex))

    def __str__(self):
        """Return the same text as `__repr__`."""
        return repr(self)
    
    def __repr__(self):
        """Return 'ClassName(name=<name>, runmode=<mode>)'."""
        template = "{0}(name={1}, runmode={2})"
        class_name = self.__class__.__name__
        return template.format(class_name, self.name, self._mode)
Exemple #23
0
class ZookeeperGoodUntilTime(SimplePredicate):
    """
    Predicate driven by a ZooKeeper node that holds a JSON object with
    optional 'start' and 'stop' times. It is met while the current time is
    after the start bound and before the stop bound (whichever are given);
    with no bounds it is not met. The result is re-evaluated every
    `interval` seconds by a daemon thread and whenever the node changes.
    """

    def __init__(self, comp_name, zkclient, nodepath,
                 operational=False, parent=None, interval=5):
        """
        :type comp_name: str
        :type zkclient: kazoo.client.KazooClient
        :type nodepath: str
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, operational=operational, parent=parent)
        self.node = nodepath
        self.zkclient = zkclient
        self.interval = interval
        # window bounds parsed from the node; None means "no bound"
        self._start = None
        self._stop = None
        self._log = logging.getLogger('sent.{0}.pred.gut'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        # cleared by stop() to end the polling loop
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

        # raw strings: \d, \- and \s are regex escapes, not string escapes
        self._datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?"
        )

    @property
    def current_time(self):
        """Current local time of day (`datetime.time`)."""
        return datetime.datetime.now().time()

    @property
    def current_datetime(self):
        """Current local date and time (`datetime.datetime`)."""
        return datetime.datetime.now()

    def start(self):
        """Set the node watch and start the polling thread (once)."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._watch_node()
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Ask the polling loop to end and wait for the thread to finish."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """Thread body: re-evaluate the predicate every `interval` seconds."""
        # the comparison goes through the ThreadSafeObject wrapper
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done comparing guts.')

    def _process_met(self):
        """
        Evaluate the window against the current time and publish the result
        through `set_met`. Without any bound the predicate is not met.
        """
        results = []
        if self._start is not None:
            compare_to_start = self._get_comparison(self._start)
            results.append(self._start < compare_to_start)

        if self._stop is not None:
            compare_to_stop = self._get_comparison(self._stop)
            results.append(compare_to_stop < self._stop)

        if not results:
            results.append(False)

        self.set_met(all(results))  # every comparison returned True

    def _get_comparison(self, obj):
        """
        Return "now" in the same type as `obj`: a datetime for a datetime
        bound, a time of day for a time bound, None for anything else.
        """
        if isinstance(obj, datetime.datetime):
            return self.current_datetime
        elif isinstance(obj, datetime.time):
            return self.current_time

    def _parse_data(self, gut_data):
        """
        Update the window bounds from the node content.

        A bound that is absent from `gut_data` is cleared, so a bound removed
        from the node does not stay in force from an earlier update.
        :type gut_data: dict
        """
        start_data = gut_data.get(u'start', None)
        self._log.debug('raw start from zk is "{0}"'.format(start_data))
        if start_data is not None:
            self._start = TimeWindow.get_datetime_object(start_data)
        else:
            self._start = None

        stop_data = gut_data.get(u'stop', None)
        self._log.debug('raw stop from zk is "{0}"'.format(stop_data))
        if stop_data is not None:
            self._stop = TimeWindow.get_datetime_object(stop_data)
        else:
            self._stop = None

        if start_data is None and stop_data is None:
            self._log.error('Start and Stop time not specified!')

        self._log.info('The current time is: {0}. Start time is: {1}. '
                       'Stop time is: {2}'
                       .format(self.current_time, self._start, self._stop))

    @connected
    def _watch_node(self, event=None):
        """
        Read and parse the node (when it exists), re-registering this method
        as the watch, then re-evaluate the predicate in every case.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        try:
            exists = self.zkclient.exists(self.node, watch=self._watch_node)
            if exists:
                data, stat = self.zkclient.get(self.node,
                                               watch=self._watch_node)
                j = json.loads(data)
                self._parse_data(j)
            else:
                self._log.info('No gut node was found. Watcher is set at {0}'
                               .format(self.node))
        except ValueError as ex:
            # invalid JSON: keep the previously parsed bounds
            self._log.error('Invalid GUT JSON object: {0}'.format(ex))
        finally:
            self._process_met()

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, start="{3}", stop="{4}", '
                'zkpath={5}, started={6}, operational={7}, met={8})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self._start,
                        self._stop,
                        self.node,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        """Equal when `other` has the same type and watches the same node."""
        return all([
            type(self) == type(other),
            self.node == getattr(other, 'node', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.node != getattr(other, 'node', None)
        ])
Exemple #24
0
class PredicateHealth(SimplePredicate):
    """
    Predicate that runs a health-check command every `interval` seconds on a
    daemon thread and is met while the command exits with return code 0.
    """

    def __init__(self, comp_name, command, interval, system,
                 operational=False, parent=None):
        """
        :type comp_name: str
        :type command: str
        :type interval: int or float
        :type system: zoom.common.types.PlatformType
        :type operational: bool
        :type parent: str or None
        :raises OSError: when the command's executable cannot be found
        """
        SimplePredicate.__init__(self, comp_name, operational=operational, parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.health'.format(comp_name))
        self.interval = interval
        self.rawcmd = command
        self._runcmd = str()
        self._system = system
        self._verify()

        # cleared by stop() to end the polling loop
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._log.info('Registered {0}'.format(self))
        self._started = False

    def start(self):
        """Start the polling thread (once)."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Ask the polling loop to end and wait for the thread to finish."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _verify(self):
        """
        Prepare the command for Popen and make sure its executable exists,
        either as given or inside one of the PATH directories.
        :raises OSError: when the executable cannot be found
        """
        if self._system == PlatformType.LINUX:
            self._runcmd = shlex.split(self.rawcmd)
        elif self._system == PlatformType.WINDOWS:
            self._runcmd = self.rawcmd
        else:
            self._runcmd = ""

        exe = shlex.split(self.rawcmd)[0]
        exists = os.path.exists(exe)
        if not exists:
            # os.pathsep is ':' on POSIX and ';' on Windows; a missing PATH
            # simply means there is nowhere else to look
            search_dirs = os.environ.get('PATH', '').split(os.pathsep)
            exists = any(os.path.exists(os.path.join(directory, exe))
                         for directory in search_dirs)

        if not exists:
            err = ('Cannot register check "{0}". The path does not exist.'
                   .format(exe))
            self._log.error(err)
            raise OSError(err)

    def _run(self):
        """
        Run the check as a subprocess and publish the outcome through
        `set_met`, based on the return code (non-zero equals failure).
        Anything the check writes to stderr is logged as an error but does
        not decide the outcome.
        """
        p = Popen(self._runcmd, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()

        if err:
            self._log.error('There was some error with the check "{0}"\n{1}'
                            .format(self.rawcmd, err))

        # Publish exactly one result per run. Setting False for stderr output
        # and then True for a zero return code would flip the predicate
        # within a single check.
        passed = p.returncode == 0
        if passed:
            self._log.debug('Check "{0}" has succeeded.'.format(self.rawcmd))
        else:
            self._log.error('Check "{0}" has failed.'.format(self.rawcmd))
        self.set_met(passed)

    def _run_loop(self):
        """Thread body: run the check every `interval` seconds."""
        # the comparison goes through the ThreadSafeObject wrapper
        while self._operate == True:
            self._run()
            sleep(self.interval)
        self._log.info('Done running {0}'.format(self))

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, cmd="{3}", interval={4} '
                'started={5}, operational={6}, met={7})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.rawcmd,
                        self.interval,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        """Equal when type, raw command and interval all match."""
        return all([
            type(self) == type(other),
            self.rawcmd == getattr(other, 'rawcmd', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.rawcmd != getattr(other, 'rawcmd', None),
            self.interval != getattr(other, 'interval', None)
        ])
Exemple #25
0
class PredicateProcess(SimplePredicate):
    """
    Predicate that polls a process client every `interval` seconds on a
    daemon thread and is met while the client reports the process running.
    """

    def __init__(self, comp_name, settings, proc_client, interval, parent=None):
        """
        :type comp_name: str
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type proc_client: zoom.agent.client.process_client.ProcessClient
        :type interval: int or float
        :type parent: str or None
        """
        SimplePredicate.__init__(self, comp_name, settings, parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.process'.format(comp_name))
        self._proc_client = proc_client

        # lock for synchronous decorator: reuse the client's lock when a
        # client was given, otherwise fall back to a private one
        self.process_client_lock = (proc_client.process_client_lock
                                    if proc_client else Lock())

        self.interval = interval
        # cleared by stop() to end the polling loop
        self._operate = ThreadSafeObject(True)
        poller = Thread(target=self._run_loop, name=str(self))
        poller.daemon = True
        self._thread = poller
        self._started = False

    def running(self):
        """
        Ask the process client whether the process is running.

        With the synchronous decorator, this shares a Lock object with the
        ProcessClient. While ProcessClient.start is running, this will not
        return.
        :rtype: bool
        """
        return self._proc_client.running()

    def start(self):
        """Start the polling thread (once)."""
        if self._started is not False:
            self._log.debug('Already started {0}'.format(self))
            return
        self._log.debug('Starting {0}'.format(self))
        self._started = True
        self._thread.start()

    def stop(self):
        """Ask the polling loop to end and wait for the thread to finish."""
        if self._started is not True:
            self._log.debug('Already stopped {0}'.format(self))
            return
        self._log.info('Stopping {0}'.format(self))
        self._started = False
        self._operate.set_value(False)
        self._thread.join()
        self._log.info('{0} stopped'.format(self))

    def _run_loop(self):
        """Thread body: publish the running state every `interval` seconds."""
        # the comparison goes through the ThreadSafeObject wrapper
        while self._operate == True:
            is_up = self.running()
            self.set_met(is_up)
            sleep(self.interval)
        self._log.info('Done watching process.')

    def __repr__(self):
        template = ('{0}(component={1}, parent={2}, interval={3}, started={4}, '
                    'met={5}')
        return (template + ')').format(self.__class__.__name__,
                                       self._comp_name,
                                       self._parent,
                                       self.interval,
                                       self.started,
                                       self._met)

    def __eq__(self, other):
        """Equal when `other` has the same type and the same interval."""
        same_type = type(self) == type(other)
        same_interval = self.interval == getattr(other, 'interval', None)
        return all((same_type, same_interval))

    def __ne__(self, other):
        other_type = type(self) != type(other)
        other_interval = self.interval != getattr(other, 'interval', None)
        return any((other_type, other_interval))
Exemple #26
0
class APIPredicate(SimplePredicate):
    """
    Predicate that polls a url for a specific code.

    The url is requested every `interval` seconds on a daemon thread; the
    predicate is met while the response status equals `expected_code`.
    """
    def __init__(self,
                 comp_name,
                 url,
                 verb='GET',
                 expected_code=200,
                 interval=5.0,
                 operational=False,
                 parent=None):
        """
        :type comp_name: str
        :type url: str
        :type verb: str
        :type expected_code: int
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(self,
                                 comp_name,
                                 operational=operational,
                                 parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.api'.format(comp_name))
        # raise the threshold of the urllib3 connection-pool logger
        logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(
            logging.WARNING)
        self.url = url
        self.verb = verb
        self.expected_code = expected_code
        self.interval = interval

        # cleared by stop() to end the polling loop
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._log.info('Registered {0}'.format(self))
        self._started = False

    def start(self):
        """Start the polling thread (once)."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Ask the polling loop to end and wait for the thread to finish."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run(self):
        """
        Query the given url, and report whether we get the expected code.
        Any failure to complete the request counts as "not met".
        """
        try:
            r = requests.request(self.verb, self.url, timeout=2)
            self.set_met(r.status_code == self.expected_code)
        except requests.ConnectionError:
            self._log.debug('URL {0} is not available.'.format(self.url))
            self.set_met(False)
        except requests.Timeout:
            self._log.debug('Timed out to URL {0}.'.format(self.url))
            self.set_met(False)
        except requests.RequestException as ex:
            # Any other request failure (invalid url, too many redirects,
            # ...) would otherwise escape, end the polling thread and leave
            # the last result in place forever.
            self._log.warning('Request to URL {0} failed: {1}'
                              .format(self.url, ex))
            self.set_met(False)

    def _run_loop(self):
        """Thread body: query the url every `interval` seconds."""
        # the comparison goes through the ThreadSafeObject wrapper
        while self._operate == True:
            self._run()
            sleep(self.interval)
        self._log.info('Done querying {0}'.format(self.url))

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, url="{3}", verb={4}, '
                'interval={5} started={6}, operational={7}, met={8})'.format(
                    self.__class__.__name__, self._comp_name, self._parent,
                    self.url, self.verb, self.interval, self.started,
                    self._operational, self._met))

    def __eq__(self, other):
        """Equal when type, url, verb and interval all match."""
        return all([
            type(self) == type(other), self.url == getattr(other, 'url', None),
            self.verb == getattr(other, 'verb', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other), self.url != getattr(other, 'url', None),
            self.verb != getattr(other, 'verb', None),
            self.interval != getattr(other, 'interval', None)
        ])
Exemple #27
0
class PredicateHealth(SimplePredicate):
    """
    Predicate that repeatedly runs an external check command on a background
    thread; it is met while the command exits with return code 0.
    """
    def __init__(self,
                 comp_name,
                 command,
                 interval,
                 system,
                 operational=False,
                 parent=None):
        """
        :type comp_name: str
        :type command: str
        :type interval: int or float
        :type system: zoom.common.types.PlatformType
        :type operational: bool
        :type parent: str or None
        :raises OSError: if the command's executable cannot be found
        """
        SimplePredicate.__init__(self,
                                 comp_name,
                                 operational=operational,
                                 parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.health'.format(comp_name))
        self.interval = interval
        self.rawcmd = command
        self._runcmd = str()
        self._system = system
        self._verify()

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._log.info('Registered {0}'.format(self))
        self._started = False

    def start(self):
        """Start the background check thread unless it is already started."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Signal the check loop to end and wait for the thread to finish."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _verify(self):
        """
        Prepare the command for execution and make sure its executable exists.

        The executable (first token of the raw command) must exist either as
        given or inside one of the directories of the PATH environment
        variable.

        :raises OSError: if the executable cannot be found
        """
        if self._system == PlatformType.LINUX:
            self._runcmd = shlex.split(self.rawcmd)
        elif self._system == PlatformType.WINDOWS:
            self._runcmd = self.rawcmd
        else:
            self._runcmd = ""

        exe = shlex.split(self.rawcmd)[0]
        exists = os.path.exists(exe)
        if not exists:
            # os.pathsep is ':' on POSIX and ';' on Windows; an unset PATH
            # simply contributes no directories to search.
            searchpath = os.environ.get('PATH', '')
            exists = any(os.path.exists(os.path.join(directory, exe))
                         for directory in searchpath.split(os.pathsep))

        if not exists:
            err = ('Cannot register check "{0}". The path does not exist.'.
                   format(exe))
            self._log.error(err)
            raise OSError(err)

    def _run(self):
        """
        Run the check as a subprocess and set the predicate from its return
        code (non-zero means failure).

        Output on stderr is logged as an error but does not decide the result
        by itself. A command that cannot be launched counts as a failure.
        """
        try:
            p = Popen(self._runcmd, stdout=PIPE, stderr=PIPE)
            _, err = p.communicate()
        except OSError as e:
            # e.g. the executable disappeared after registration; keep the
            # check thread alive and report the predicate as not met
            self._log.error('Check "{0}" could not be executed: {1}'
                            .format(self.rawcmd, e))
            self.set_met(False)
            return

        if err:
            # Only log here: setting the predicate to False and then to True
            # below (stderr output with exit code 0) would make it flap.
            self._log.error(
                'There was some error with the check "{0}"\n{1}'.format(
                    self.rawcmd, err))

        if p.returncode != 0:
            self._log.error('Check "{0}" has failed.'.format(self.rawcmd))
            self.set_met(False)
        else:
            self._log.debug('Check "{0}" has succeeded.'.format(self.rawcmd))
            self.set_met(True)

    def _run_loop(self):
        """
        Thread body: run the check, sleep ``self.interval`` seconds, and
        repeat until the control object no longer compares equal to True.
        """
        while self._operate == True:
            self._run()
            sleep(self.interval)
        self._log.info('Done running {0}'.format(self))

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, cmd="{3}", interval={4} '
                'started={5}, operational={6}, met={7})'.format(
                    self.__class__.__name__, self._comp_name, self._parent,
                    self.rawcmd, self.interval, self.started,
                    self._operational, self._met))

    def __eq__(self, other):
        """Equal when type, raw command and interval all match."""
        return all([
            type(self) == type(other),
            self.rawcmd == getattr(other, 'rawcmd', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        """Unequal when type, raw command or interval differs."""
        return any([
            type(self) != type(other),
            self.rawcmd != getattr(other, 'rawcmd', None),
            self.interval != getattr(other, 'interval', None)
        ])
Exemple #28
0
class SentinelDaemon(object):
    """
    Top-level agent object: connects to Zookeeper, reads the host's
    configuration and manages one ChildProcess per configured component.
    """
    def __init__(self):
        """
        Read config and spawn child processes.
        """
        self._log = logging.getLogger('sent.daemon')
        self._log.info('Creating Sentinel')

        self.children = dict()
        self._settings = ThreadSafeObject(dict())
        self._system = self._get_system()
        self._hostname = platform.node().upper()  # must be uppercase
        self._prev_state = None
        self.listener_lock = Lock()
        # Must exist before the Zookeeper client is started: the connection
        # listener spawns _reset_after_connection_loss, which reads it.
        self.task_client = None

        # NOTE: no client is created for any other platform value, so the
        # add_listener call below fails in that case.
        if self._system == PlatformType.LINUX:
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler(),
                                        logger=logging.getLogger('kazoo.daemon'))
        elif self._system == PlatformType.WINDOWS:
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler())

        self.zkclient.add_listener(self._zk_listener)
        # this will run self._reset_after_connection_loss
        self.zkclient.start()

        self.task_client = TaskClient(self.children,
                                      self.zkclient,
                                      self._settings)

        self._rest_server = tornado.httpserver.HTTPServer(
            RestServer(self.children, self._settings))

        signal.signal(signal.SIGINT, self._handle_sigint)
        signal.signal(signal.SIGTERM, self._handle_sigint)
        self._log.info('Created Sentinel')

    def __enter__(self):
        """
        Start the REST server and return the daemon, so that
        ``with SentinelDaemon() as daemon:`` binds the instance.
        """
        logging.info('Starting Sentinel')
        self._rest_server.listen('9000')
        logging.info('Started Sentinel')
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """No cleanup on exit; exceptions are not suppressed."""
        pass

    def stop(self):
        """Terminate all child processes and exit."""
        self._log.info('Stopping Sentinel')
        self._terminate_children()
        self._rest_server.stop()
        self._log.info('Stopped Sentinel. Exiting.')
        sys.exit(0)

    def _handle_sigint(self, sig, frame):
        """Signal handler (SIGINT and SIGTERM): log the signal and stop."""
        self._log.info('Caught signal %s.' % sig)
        self.stop()

    @connected
    def _get_settings(self, event=None):
        """
        Populate self._settings dict.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        data, stat = self.zkclient.get(ZK_AGENT_CONFIG,
                                       watch=self._get_settings)
        self._settings.set_value(json.loads(data))
        self._log.info('Got settings:\n{0}'
                       .format(pprint.pformat(self._settings.value)))

    @catch_exception(NodeExistsException)
    @connected
    def _register(self, event=None):
        """
        Create this host's ephemeral node under the agent state path if it
        does not exist, watching the node with this method as callback.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        agent_state_path = self._settings.get('ZK_AGENT_STATE_PATH')
        path = '/'.join([agent_state_path, self._hostname])
        if not self.zkclient.exists(path, watch=self._register):
            self.zkclient.create(path,
                                 value=json.dumps({}),
                                 ephemeral=True)

    @connected
    def _get_config_and_run(self, event=None):
        """
        Grab config from Zookeeper. Spawn ChildProcess instances.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        agent_config_path = self._settings.get('ZK_AGENT_CONFIG_PATH')
        config_path = '/'.join([agent_config_path, self._hostname])
        try:
            if not self.zkclient.exists(config_path,
                                        watch=self._get_config_and_run):
                self._log.warning('Node does not exist at: {0}. Creating.'
                                  .format(config_path))
                self.zkclient.create(
                    config_path, value='<?xml version= "1.0"?><Application />')
                return

            data, stat = self.zkclient.get(config_path)
            config = ElementTree.fromstring(data.strip())

            # replace the running children with the ones from the new config
            self._terminate_children()
            self._spawn_children(config)

        except ParseError as e:
            self._log.error('Incomplete XML config found in path {0}: {1}'
                            .format(config_path, e))
        except ZookeeperError as e:
            self._log.error('ZK server returned a non-zero error code: {0}'
                            .format(e))
        except Exception as e:
            self._log.exception('There were some Exception: {0}'.format(e))

    def _spawn_children(self, config):
        """
        Populate the self.children dictionary
        :type config: xml.etree.ElementTree.Element
        """
        for component in config.iter('Component'):
            try:
                name = verify_attribute(component, 'id')
                self._log.info('Spawning %s' % name)
                self.children[name] = {
                    'config': component,
                    'process': ChildProcess(component,
                                            self._system,
                                            self._settings)
                }

            except ValueError as e:
                self._log.error('Error with ID in config: {0}'.format(e))
                continue

    def _terminate_children(self):
        """
        Stop all children in the self.children dictionary, and clear it.
        """
        self._log.info('Stopping children.')
        for child in self.children.values():
            process = child['process']
            self._log.info('Terminating child -- {0}'.format(process))
            process.stop()

        # sent stop to all, now wait for all to complete
        # (explicit loop: map() is lazy on Python 3 and would join nothing)
        for child in self.children.values():
            child['process'].join()
        self.children.clear()

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    @connected
    def _reset_after_connection_loss(self):
        """
        Used for spawning child process and resetting watches ZK connection
        changes. This includes the first connection to Zookeeper (on startup).
        """
        self._log.info('Daemon listener callback triggered')
        self._get_settings()
        self._register()
        self._get_config_and_run()
        if self.task_client is not None:
            self.task_client.reset_watches()
        self._log.info('Daemon listener callback complete!')

    def _zk_listener(self, state):
        """
        The callback function that runs when the connection state to Zookeeper
        changes.
        Either passes or immediately spawns a new thread that resets any
        watches, etc., so that it can listen to future connection state changes.
        """
        # transitions that are recorded but need no action
        quiet_transitions = (
            (KazooState.CONNECTED, KazooState.SUSPENDED),
            (KazooState.CONNECTED, KazooState.LOST),
            (KazooState.SUSPENDED, KazooState.LOST),
        )
        try:
            self._log.info('Zookeeper Connection went from {0} to {1}'
                           .format(self._prev_state, state))
            if state == KazooState.CONNECTED:
                # any transition into CONNECTED (including the first
                # connection) triggers a reset
                self.zkclient.handler.spawn(self._reset_after_connection_loss)
            elif (self._prev_state, state) in quiet_transitions:
                pass
            else:
                self._log.info('Zookeeper Connection in unknown state: {0}'
                               .format(state))
                # the previous state is deliberately left unchanged here
                return
            self._prev_state = state
        except Exception as e:
            self._log.error('Listener excepted out with error: {0}'.format(e))

    def _get_system(self):
        """
        Map the local platform string to a PlatformType value.
        :rtype: zoom.common.types.PlatformType
        """
        system_str = platform.platform(terse=True)
        if 'Linux' in system_str:
            return PlatformType.LINUX
        elif 'Windows' in system_str:
            return PlatformType.WINDOWS
        else:
            return PlatformType.UNKNOWN
Exemple #29
0
class Application(object):
    """
    Service object to represent an deployed service.
    """
    def __init__(self, config, settings, conn, queue, system, application_type):
        """
        Set up state, Zookeeper paths, the Zookeeper client, the process
        client, the actions and the work manager for one component. The
        Zookeeper client is created here but not started (see run()).

        :type config: dict (xml)
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type conn: multiprocessing.Connection
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        """
        self.config = config
        self._settings = settings
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))
        # informational attributes
        self._host = platform.node().upper()
        self._fqdn = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None
        self._actions = dict()  # created in _reset_watches on zk connect
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        self._mode = ApplicationMode(ApplicationMode.MANUAL)
        self._state = ThreadSafeObject(ApplicationState.OK)
        self._trigger_time = ''     # Default to empty string for comparison
        self._login_user = '******'   # Default to Zoom
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        # needs self._system, self._host and self.name, all set above
        self._paths = self._init_paths(self.config, settings, application_type)

        # clients
        # NOTE: no Zookeeper client is created for any other platform value,
        # so the add_listener call below fails in that case.
        if self._system == PlatformType.LINUX:
            self.zkclient = KazooClient(
                hosts=ZK_CONN_STRING,
                handler=SequentialThreadingHandler(),
                logger=logging.getLogger('kazoo.app.{0}'.format(self.name)))
        elif self._system == PlatformType.WINDOWS:
            self.zkclient = KazooClient(hosts=ZK_CONN_STRING,
                                        handler=SequentialThreadingHandler())

        self.zkclient.add_listener(self._zk_listener)
        # the process client reads self._paths (graphite metric names)
        self._proc_client = self._init_proc_client(self.config,
                                                   settings,
                                                   application_type)

        # actions are built from the zk client and process client created
        # above; the work manager in turn reads self._actions
        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue, conn)

    @property
    def app_details(self):
        return {'name': self.name,
                'host': self._host,
                'fqdn': self._fqdn,
                'platform': self._system,
                'mode': self._mode.value,
                'state': self._state.value,
                'trigger_time': self._trigger_time,
                'login_user': self._login_user}

    def run(self):
        """
        - Start the zookeeper client
        - Check for already running instances. 
        - Start main loop, periodically checking whether the process has failed.
        """
        self.zkclient.start()
        # make all action objects start processing predicates
        self._log.info('Starting to process Actions.')
        map(lambda x: x.start(), self._actions.values())  # start actions
        self._check_mode()  # get global mode AFTER starting actions

        while self._running:
            sleep(5)

        self.uninitialize()

    @catch_exception(NodeExistsError)
    @connected
    def register(self, **kwargs):
        """
        Create this application's ephemeral node in the state tree, unless it
        already exists or the 'register' action is not ready. After creating
        the node: send a RESOLVE alert, reset the process client's counters,
        set the state to OK and publish the app details.
        """
        state_path = self._paths['zk_state_path']
        if self.zkclient.exists(state_path):
            return
        if not self._action_is_ready('register'):
            return

        self._log.info('Registering %s in state tree.' % self.name)
        self.zkclient.create(state_path, ephemeral=True, makepath=True)

        # resolve any pager duty alarms
        self._create_alert_node(AlertActionType.RESOLVE, AlertReason.RESOLVED)
        # reset restart counters, etc
        self._proc_client.reset_counters()

        self._state.set_value(ApplicationState.OK)
        self._update_agent_node_with_app_details()

    @catch_exception(NoNodeError)
    @connected
    def unregister(self, **kwargs):
        """
        Delete this application's node from the state tree, provided the
        'unregister' action is ready.
        """
        if not self._action_is_ready('unregister'):
            return
        self._log.info('Un-registering %s from state tree.' % self.name)
        state_path = self._paths['zk_state_path']
        self.zkclient.delete(state_path)

    @catch_exception(RuntimeError)
    def uninitialize(self):
        """
        Gracefully stop this Zookeeper session, then free any resources
        held by the client. The work manager and all actions are stopped
        first.
        """
        self._log.info('Stopping Zookeeper client')
        self._work_manager.stop()
        # Explicit loop instead of map(): map() is lazy on Python 3, so the
        # actions would never actually be stopped there.
        for action in self._actions.values():
            action.stop()
        self.zkclient.stop()
        self.zkclient.close()

    @time_this
    def start(self, **kwargs):
        """
        Start the managed process and publish the resulting state.

        :param kwargs: passed from zoom.handlers.control_agent_handlers.
            Keys read here: 'reset' (default True), 'pause' (default False),
            'pd_enabled' (default True) and 'login_user' (default 'Zoom').
        :return: 0 when the start is skipped, otherwise the result of the
            process client's start()
        """
        # Same check as self.notify() but needed when start action is
        # called after process crashes and all predicates are met when on Auto
        # NOTE(review): the condition is "ran_stop is false" while the log
        # message says the app was stopped -- confirm the meaning of ran_stop.
        restart_logic = self._proc_client.restart_logic
        if (not restart_logic.ran_stop
                and self._apptype == ApplicationType.APPLICATION):
            self._log.info('Not starting. App was stopped with Zoom.')
            return 0
        self._log.debug('Start allowed.')

        reset_counters = kwargs.get('reset', True)
        pause = kwargs.get('pause', False)
        pd_enabled = kwargs.get('pd_enabled', True)

        if reset_counters:
            self._proc_client.reset_counters()
        if pause:
            self.ignore()

        self._trigger_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STARTING)
        self._update_agent_node_with_app_details()

        result = self._proc_client.start()

        if self._run_check_mode:  # Reset to global mode if restart with dep
            self._check_mode()
            self._run_check_mode = False

        started_ok = (result == 0)
        if started_ok:
            self._state.set_value(ApplicationState.OK)
        else:
            self._state.set_value(ApplicationState.ERROR)

        if not started_ok:
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER,
                                        AlertReason.FAILEDTOSTART)
            else:
                self._log.debug('PD is disabled, not sending alert.')

        self._update_agent_node_with_app_details()

        return result

    @time_this
    def stop(self, **kwargs):
        """
        Stop the managed process and publish the resulting state.

        :param kwargs: passed from zoom.handlers.control_agent_handlers and
            forwarded to the process client's stop(). Keys read here: 'reset'
            (default True), 'pause' (default False), 'login_user' (default
            'Zoom') and 'argument' (default 'false').
        :return: the result of the process client's stop()
        """
        reset_counters = kwargs.get('reset', True)
        pause = kwargs.get('pause', False)

        if reset_counters:
            self._proc_client.reset_counters()
        if pause:
            self.ignore()

        self._trigger_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STOPPING)
        self._update_agent_node_with_app_details()

        result = self._proc_client.stop(**kwargs)

        # a non-zero result only counts as an error when 'argument' is 'false'
        failed = result != 0 and kwargs.get('argument', 'false') == 'false'
        if failed:
            self._state.set_value(ApplicationState.ERROR)
        else:
            self._state.set_value(ApplicationState.OK)

        sleep(5)  # give everything time to catch up
        self._update_agent_node_with_app_details()

        return result

    def restart(self, **kwargs):
        """
        Replace whatever is queued with a stop, unregister, start sequence.

        :param kwargs: passed from zoom.handlers.control_agent_handlers and
            handed on to the queued 'stop' and 'start' tasks
        """
        # NOTE: a readiness check on a 'restart' action used to sit here and
        # is currently disabled:
        #   if not self._action_is_ready('restart', allow_undefined=True):
        #       self._log.info('Restart action not ready.')
        #       return

        self._log.info('Running Restart. Queuing stop, unregister, start.')
        self._action_queue.clear()
        steps = (
            ('stop', {'kwargs': kwargs}),
            ('unregister', {}),
            ('start', {'kwargs': kwargs}),
        )
        for work, task_options in steps:
            self._action_queue.append_unique(Task(work, **task_options))

    def dep_restart(self, **kwargs):
        """
        Queue a 'start_if_ready' task and flag that the global mode has to be
        re-read once the start has run.
        """
        self._run_check_mode = True  # only used in self.start()
        task = Task('start_if_ready', pipe=False)
        self._action_queue.append(task)

    def start_if_ready(self):
        if self._action_is_ready('start'):
            self.start()
        else:
            self._action_queue.append(Task('react', pipe=False))

    @time_this
    @connected
    def ignore(self, **kwargs):
        """
        Switch this application to manual mode and publish the change.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :return: 0
        """
        mode = self._mode
        mode.set_value(ApplicationMode.MANUAL)
        self._log.info('Mode is now "{0}"'.format(mode))
        self._update_agent_node_with_app_details()
        return 0

    @time_this
    @connected
    def react(self, **kwargs):
        """
        Switch this application to auto mode and publish the change.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :return: 0
        """
        mode = self._mode
        mode.set_value(ApplicationMode.AUTO)
        self._log.info('Mode is now "{0}"'.format(mode))
        self._update_agent_node_with_app_details()
        return 0

    @time_this
    @connected
    def notify(self, **kwargs):
        """
        Flag the application as crashed: set the NOTIFY state, publish it and
        (unless 'pd_enabled' is false) create a TRIGGER alert node.

        Nothing happens when the state is already ERROR, when the 'notify'
        action is not ready, or when the process client reports that a stop
        was run.
        """
        # Application failed to start. Already sent PD alert
        if self._state == ApplicationState.ERROR:
            return

        pd_enabled = kwargs.get('pd_enabled', True)

        if not self._action_is_ready('notify'):
            self._log.info('notify action not defined or not ready.')
            return

        if self._proc_client.restart_logic.ran_stop:
            self._log.debug("Service shut down gracefully")
            return

        # the application has crashed
        self._state.set_value(ApplicationState.NOTIFY)
        self._update_agent_node_with_app_details()
        if not pd_enabled:
            self._log.debug('PD is disabled, not sending alert.')
            return
        self._create_alert_node(AlertActionType.TRIGGER, AlertReason.CRASHED)

    def terminate(self):
        """Terminate child thread/process"""
        self._running = False   

    def _action_is_ready(self, action_name, allow_undefined=False):
        """
        Check if a configured action's predicates are met
        :type action_name: str
        :type allow_undefined: bool
        :rtype: bool
        """
        action = self._actions.get(action_name, None)
        if allow_undefined:
            if action is None:
                return True

        return action is not None and action.ready

    @connected
    def _update_agent_node_with_app_details(self, event=None):
        """
        Register app data with the agent in the state tree.

        Reads the node at self._paths['zk_state_base'], flags a config
        conflict when the data stored there names a different host, rewrites
        the node when its content differs from self.app_details, and finally
        either sets a watch on the node (with this method as callback) or,
        on a config conflict, calls terminate().

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        # create the base node on demand, but only while still running
        if self._running and \
                not self.zkclient.exists(self._paths['zk_state_base']):
            self.zkclient.create(self._paths['zk_state_base'])

        data, stat = self.zkclient.get(self._paths['zk_state_base'])

        try:
            agent_apps = json.loads(data)
        except ValueError:
            # node content is not valid JSON: treat it as empty
            agent_apps = dict()

        # check for config conflict
        other_host = agent_apps.get('host')
        if other_host is not None and self._host != other_host:
            self._log.error('There is a config conflict with {0}. Updates '
                            'will no longer be sent until it is resolved.'
                            .format(other_host))
            self._state.set_value(ApplicationState.CONFIG_ERROR)

        # make sure data is the most recent
        # NOTE(review): this write also happens right after a conflict was
        # detected above -- confirm that overwriting the node is intended.
        if self.app_details != agent_apps:
            self.zkclient.set(self._paths['zk_state_base'],
                              json.dumps(self.app_details))
            self._log.debug('Registering app data {0}'.format(self.app_details))

        # set watch
        if self._state != ApplicationState.CONFIG_ERROR:
            self.zkclient.get(
                self._paths['zk_state_base'],
                watch=self._update_agent_node_with_app_details)
        else:
            self._log.error('Shutting down because of config error.')
            self.terminate()

    def _init_paths(self, config, settings, atype):
        """
        Build the Zookeeper paths used by this application.

        The state base path comes from the config's 'registrationpath'
        attribute when present; otherwise it is composed from the
        ZK_STATE_PATH setting, the application type and the name.

        :return: dict with the keys 'zk_state_base', 'zk_state_path',
            'zk_config_path' and 'zk_agent_path'
        :rtype: dict
        """
        registrationpath = verify_attribute(config, 'registrationpath',
                                            none_allowed=True)

        if registrationpath is None:
            state_base = self._pathjoin(settings.get('ZK_STATE_PATH'),
                                        atype, self.name)
        else:
            state_base = registrationpath

        return {
            'zk_state_base': state_base,
            'zk_state_path': self._pathjoin(state_base, self._host),
            'zk_config_path': self._pathjoin(settings.get('ZK_CONFIG_PATH'),
                                             atype, self.name),
            'zk_agent_path': self._pathjoin(
                settings.get('ZK_AGENT_STATE_PATH'), self._host),
        }

    def _init_proc_client(self, config, settings, atype):
        """
        Build the ProcessClient from the 'command', 'script' and 'restartmax'
        config attributes. A missing 'restartmax' defaults to 3.
        """
        def optional(attribute, **extra):
            # every attribute read here is allowed to be absent
            return verify_attribute(config, attribute, none_allowed=True,
                                    **extra)

        command = optional('command')
        script = optional('script')
        restartmax = optional('restartmax', cast=int)

        if restartmax is None:
            self._log.info('Restartmax not specified. Assuming 3.')
            restartmax = 3

        metric_names = self._get_graphite_metric_names()
        restart_logic = RestartLogic(restartmax)

        return ProcessClient(name=self.name,
                             command=command,
                             script=script,
                             apptype=atype,
                             system=self._system,
                             restart_logic=restart_logic,
                             graphite_metric_names=metric_names,
                             settings=settings)

    def _init_actions(self, settings):
        """
        Create the actions described in self.config through an ActionFactory
        wired to this application's clients, queue, mode and predicate list.

        :return: whatever ActionFactory.create() returns for self.config
        :rtype: dict
        """
        factory_args = dict(
            component=self,
            zkclient=self.zkclient,
            proc_client=self._proc_client,
            action_queue=self._action_queue,
            mode=self._mode,
            system=self._system,
            pred_list=self._predicates,
            settings=settings,
        )
        return ActionFactory(**factory_args).create(self.config)

    def _init_work_manager(self, queue, pipe):
        """
        Build and start the WorkManager for this application.

        Each configured action contributes its ``run`` method. Every name in
        the ALLOWED_WORK setting that has no action falls back to the method
        of the same name on this object; names without such a method are
        logged as errors and skipped.

        :rtype: zoom.agent.entities.work_manager.WorkManager
        """
        # actions have additional logic, so use those if available
        # (.items() instead of the Python-2-only .iteritems())
        acceptable_work = dict(
            (name, action.run) for name, action in self._actions.items())

        # if action is not available, add the method from Application
        for w in self._settings.get('ALLOWED_WORK', []):
            if w in acceptable_work:
                self._log.debug('Method {0} already assigned to action.'
                                .format(w))
            elif hasattr(self, w):
                acceptable_work[w] = getattr(self, w)
            else:
                self._log.error('Class has no method {0}'.format(w))

        manager = WorkManager(self.name, queue, pipe, acceptable_work)
        manager.start()
        return manager

    @connected
    def _check_mode(self, event=None):
        """
        Read the global run mode from Zookeeper, apply it and publish the app
        details. The read sets a watch with this method as callback. A
        missing mode node is only logged; any other error is logged with its
        traceback.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        global_path = self._settings.get('ZK_GLOBAL_PATH')
        modepath = self._pathjoin(global_path, 'mode')
        try:
            raw, _stat = self.zkclient.get(modepath, watch=self._check_mode)
            parsed = json.loads(raw)
            self._log.info(
                'Getting mode from Zookeeper from path: {0}'.format(modepath))
            new_mode = str(parsed.get(u'mode', ApplicationMode.MANUAL))
            self._mode.set_value(new_mode)
            self._log.info('Setting mode to "{0}"'.format(self._mode))
            self._update_agent_node_with_app_details()
        except NoNodeError:
            self._log.info('ZK path {0} does not exist. Assuming mode "manual"'
                           .format(modepath))
        except Exception:
            self._log.exception('An uncaught exception has occurred.')
    def _pathjoin(self, *args):
        """
        Join path segments: os.path.join on Linux, a plain '/'-join on
        Windows. For any other platform value nothing is joined and None is
        returned.
        :rtype: str
        """
        system = self._system
        if system == PlatformType.LINUX:
            return os.path.join(*args)
        if system == PlatformType.WINDOWS:
            return '/'.join(args)
        return None

    def _get_graphite_metric_names(self):
        """
        splits the state path at 'application' and returns the latter index
        :rtype: dict
        """
        type_path = self._paths.get('zk_state_base')\
            .split(self._settings.get('ZK_STATE_PATH') + '/', 1)[1]
        type_metric = type_path.replace('/', '.')
        result_path = self._settings.get('GRAPHITE_RESULT_METRIC')
        runtime_path = self._settings.get('GRAPHITE_RUNTIME_METRIC')

        return {
            "result": result_path.format(type_metric),
            "runtime": runtime_path.format(type_metric)
        }

    def _get_current_time(self):
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _get_alert_details(self, alert_action, reason):
        return {
            "action": alert_action,
            "service_key": self._pd_svc_key,
            "incident_key": self._pathjoin('sentinel', self.name, self._host),
            "description": ('Sentinel Error: Application {0} {1} on host {2}.'
                            .format(self.name, reason, self._host)),
            "details": ('Sentinel Error: Application {0} {1} on host {2}.\n'
                        'Review the application log and contact the appropriate'
                        ' development group.'
                        .format(self.name, reason, self._host))
        }

    @catch_exception(NoNodeError)
    @connected
    def _create_alert_node(self, alert_action, reason):
        """
        Write the alert details for this application to its alert node in
        ZooKeeper (updating the node when it already exists), but only when
        the current environment is listed in the
        PAGERDUTY_ENABLED_ENVIRONMENTS setting. Otherwise only log what would
        have been created.

        :type alert_action: zoom.common.types.AlertActionType
        """
        alert_details = self._get_alert_details(alert_action, reason)
        # path example: /foo/sentinel.bar.baz.HOSTFOO
        alert_root = self._settings.get('ZK_ALERT_PATH')
        node_name = re.sub('/', '.', alert_details['incident_key'])
        alert_path = self._pathjoin(alert_root, node_name)

        enabled_envs = self._settings.get('PAGERDUTY_ENABLED_ENVIRONMENTS')
        if self._env not in enabled_envs:
            self._log.info('Not creating alert "{0}" node for env: {1}'
                           .format(alert_action, self._env))
            self._log.info('Would have created path {0}'.format(alert_path))
            return

        self._log.info('Creating alert "{0}" node for env: {1}'
                       .format(alert_action, self._env))

        if self.zkclient.exists(alert_path):
            write = self.zkclient.set
        else:
            write = self.zkclient.create
        write(alert_path, value=json.dumps(alert_details))

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    def _reset_after_connection_loss(self):
        """
        Stop and rebuild all actions and predicates after a connection loss,
        then re-check the mode.

        When ``self._running`` is false (the daemon has asked for
        termination) nothing is rebuilt and only a log line is written.
        """
        if not self._running:
            self._log.info('The daemon has called for termination. '
                           'Not trying to reset after connection loss.')
            return

        self._log.info('Application listener callback triggered')
        # Explicit loops instead of map(): map() is lazy on Python 3, so
        # using it purely for side effects would skip every call there.
        # list(...) snapshots the values so a callee may touch the dict.
        for action in list(self._actions.values()):  # stop actions
            action.stop()
        self._actions.clear()
        self._predicates = []
        self._actions = self._init_actions(self._settings)
        for predicate in self._predicates:  # reset predicates
            predicate.reset()
        for action in list(self._actions.values()):  # start actions
            action.start()
        self._check_mode()
        self._log.info('Application listener callback complete!')

    def _zk_listener(self, state):
        """
        Callback run when the connection state to Zookeeper changes.

        * Any change to CONNECTED, except the very first one (no previous
          state recorded), hands _reset_after_connection_loss to the
          client's handler via spawn().
        * CONNECTED -> SUSPENDED, CONNECTED -> LOST and SUSPENDED -> LOST
          need no action.
        * Every other transition is logged as unknown and the previous
          state is left unchanged.

        Exceptions are logged here and not re-raised.
        """
        try:
            previous = self._prev_state
            self._log.info('Zookeeper Connection went from {0} to {1}'
                           .format(previous, state))

            # (previous, new) pairs that are expected and need no reaction
            no_action_needed = (
                (KazooState.CONNECTED, KazooState.SUSPENDED),
                (KazooState.CONNECTED, KazooState.LOST),
                (KazooState.SUSPENDED, KazooState.LOST),
            )

            if state == KazooState.CONNECTED:
                # the initial connect (previous is None) has nothing to reset
                if previous is not None:
                    self.zkclient.handler.spawn(
                        self._reset_after_connection_loss)
            elif (previous, state) not in no_action_needed:
                self._log.info('Zookeeper Connection in unknown state: {0}'
                               .format(state))
                return

            self._prev_state = state

        except Exception:
            self._log.exception('An uncaught exception has occurred')

    def __str__(self):
        return self.__repr__()
    
    def __repr__(self):
        return ("{0}(name={1}, runmode={2}, actions={3})"
                .format(self.__class__.__name__,
                        self.name,
                        self._mode,
                        self._actions.keys())
                )
    def __init__(self, config, settings, queue, system, application_type,
                 cancel_flag):
        """
        Build the application object from its configuration.

        Reads the per-application attributes from ``config``, creates the
        state holders, the KazooClient (registering ``_zk_listener`` on it),
        the process client, the actions and the work manager. This method
        itself does not call start() on the ZooKeeper client.

        :type config: dict (xml)
        :type settings: dict
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        """
        self.config = config
        self._settings = settings
        # the 'id' attribute names the application and its logger
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))
        # informational attributes
        self._host = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None  # no connection state recorded yet
        self._actions = dict()  # placeholder; reassigned from _init_actions() below
        # environment name; 'Staging' when EnvironmentToUse is not set
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type
        self._restart_on_crash = \
            verify_attribute(self.config, 'restart_on_crash', none_allowed=True)
        self._post_stop_sleep = verify_attribute(self.config, 'post_stop_sleep',
                                                 none_allowed=True, cast=int,
                                                 default=5)

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        # mode, state and restart counter all share the same change callback
        self._mode = ApplicationMode(
            ApplicationMode.MANUAL,
            callback=self._update_agent_node_with_app_details)
        self._state = ThreadSafeObject(
            ApplicationState.OK,
            callback=self._update_agent_node_with_app_details)
        self._start_stop_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        self._user_set_in_react = False
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        restartmax = verify_attribute(config, 'restartmax', none_allowed=True,
                                      cast=int, default=3)
        self._rl = RestartLogic(
            self.name,
            restartmax,
            count_callback=self._update_agent_node_with_app_details)

        self._read_only = False

        self._paths = self._init_paths(self.config, settings, application_type)

        # clients
        self.zkclient = KazooClient(
            hosts=get_zk_conn_string(),
            timeout=60.0,
            handler=SequentialThreadingHandler(),
            logger=logging.getLogger('kazoo.app.{0}'.format(self.name)))

        # _zk_listener is called by the client on connection state changes
        self.zkclient.add_listener(self._zk_listener)
        self._proc_client = self._init_proc_client(self.config,
                                                   application_type,
                                                   cancel_flag)

        # replaces the empty placeholder dict assigned above
        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue)
Exemple #31
0
    def __init__(self, config, settings, queue, system, application_type,
                 cancel_flag):
        """
        Build the application object from its configuration.

        Reads the per-application attributes from ``config``, creates the
        state holders, the KazooClient (registering ``_zk_listener`` on it),
        the process client, the actions and the work manager. This method
        itself does not call start() on the ZooKeeper client.

        :type config: dict (xml)
        :type settings: dict
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        """
        self.config = config
        self._settings = settings
        # the 'id' attribute names the application and its logger
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))
        # informational attributes
        self._host = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None  # no connection state recorded yet
        self._actions = dict()  # placeholder; reassigned from _init_actions() below
        # environment name; 'Staging' when EnvironmentToUse is not set
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type
        self._restart_on_crash = \
            verify_attribute(self.config, 'restart_on_crash', none_allowed=True)
        self._post_stop_sleep = verify_attribute(self.config,
                                                 'post_stop_sleep',
                                                 none_allowed=True,
                                                 cast=int,
                                                 default=5)

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        # mode, state and restart counter all share the same change callback
        self._mode = ApplicationMode(
            ApplicationMode.MANUAL,
            callback=self._update_agent_node_with_app_details)
        self._state = ThreadSafeObject(
            ApplicationState.OK,
            callback=self._update_agent_node_with_app_details)
        self._start_stop_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        self._user_set_in_react = False
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config,
                                            'pagerduty_service',
                                            none_allowed=True)

        restartmax = verify_attribute(config,
                                      'restartmax',
                                      none_allowed=True,
                                      cast=int,
                                      default=3)
        self._rl = RestartLogic(
            self.name,
            restartmax,
            count_callback=self._update_agent_node_with_app_details)

        self._read_only = False

        self._paths = self._init_paths(self.config, settings, application_type)

        # clients
        self.zkclient = KazooClient(hosts=get_zk_conn_string(),
                                    timeout=60.0,
                                    handler=SequentialThreadingHandler(),
                                    logger=logging.getLogger(
                                        'kazoo.app.{0}'.format(self.name)))

        # _zk_listener is called by the client on connection state changes
        self.zkclient.add_listener(self._zk_listener)
        self._proc_client = self._init_proc_client(self.config,
                                                   application_type,
                                                   cancel_flag)

        # replaces the empty placeholder dict assigned above
        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue)
Exemple #32
0
class PredicateWeekend(SimplePredicate):
    """
    Predicate that is met while today's date falls on a weekend day.

    Between start() and stop() a daemon thread re-evaluates the predicate,
    sleeping ``interval`` between checks.
    """

    def __init__(self, comp_name, operational=False, parent=None, interval=10):
        """
        :type comp_name: str
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name,
                                 operational=operational, parent=parent)
        self.interval = interval

        logger_name = "sent.{0}.weekend".format(comp_name)
        self._log = logging.getLogger(logger_name)
        self._log.info("Registered {0}".format(self))

        # flag polled by the worker loop; stop() sets it to False
        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False

    @property
    def weekday(self):
        """
        Day of the week of today's date as returned by
        datetime.date.weekday(): 0=Monday, 1=Tuesday, ... 6=Sunday.
        :rtype: int
        """
        today = datetime.date.today()
        return today.weekday()

    def start(self):
        """Start the polling thread unless it has been started already."""
        if self._started is not False:
            self._log.debug("Already started {0}".format(self))
            return

        self._log.debug("Starting {0}".format(self))
        self._started = True
        self._thread.start()
        self._block_until_started()

    def stop(self):
        """Ask the polling thread to finish and wait for it to exit."""
        if self._started is not True:
            self._log.debug("Already stopped {0}".format(self))
            return

        self._log.info("Stopping {0}".format(self))
        self._started = False
        self._operate.set_value(False)
        # the loop notices the flag after its current sleep, so this can
        # block for up to one interval
        self._thread.join()
        self._log.info("{0} stopped".format(self))

    def _run_loop(self):
        """Re-evaluate the predicate until the operate flag is cleared."""
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info("Done checking for weekend.")

    def _process_met(self):
        """Set 'met' according to whether today is Saturday or Sunday."""
        # NOTE(review): assumes Weekdays uses the same numbering as
        # date.weekday() (Monday=0) - confirm against its definition.
        weekend_days = (Weekdays.SATURDAY, Weekdays.SUNDAY)
        self.set_met(self.weekday in weekend_days)

    def __repr__(self):
        template = ("{0}(component={1}, parent={2}, started={3}, "
                    "operational={4}, met={5})")
        return template.format(self.__class__.__name__,
                               self._comp_name,
                               self._parent,
                               self.started,
                               self._operational,
                               self._met)

    def __eq__(self, other):
        """Two weekend predicates are equal whenever their types match."""
        return type(other) == type(self)

    def __ne__(self, other):
        return type(other) != type(self)
class ChildProcess(object):
    """
    Wraps a threading.Thread, providing a Queue for communication between
    the SentinelDaemon and the ChildProcess.
    """

    def __init__(self, config, system, settings):
        """
        Read the component's id and type from ``config`` and immediately
        start the worker thread for it.

        :type config: xml.etree.ElementTree.Element
        :type system: zoom.common.types.PlatformType
        :type settings: dict
        :raises ValueError: if the configured type is neither an
            application nor a job
        """
        self._log = logging.getLogger("sent.child")
        self._action_queue = UniqueQueue()
        self._cancel_flag = ThreadSafeObject(False)

        self.name = verify_attribute(config, "id")
        self._application_type = verify_attribute(config, "type")
        self._config = config
        self._system = system  # Linux or Windows
        self._settings = settings
        self._process = self._create_process()

    def add_work(self, work, immediate=False):
        """
        Queue a task for the worker; returns whatever the queue's
        append_unique() returns.

        :type work: zoom.agent.task.task.Task
        :type immediate: bool
        :rtype: bool
        """
        return self._action_queue.append_unique(work, sender=str(self), first=immediate)

    def cancel_current_task(self):
        """
        Set the cancel flag that is used in the process client and drop all
        queued tasks except "register" and "unregister".
        """
        # this seems like a hack. There must be a better way of cancelling while
        #   still allowing the agent to report up/down status
        DONT_REMOVE = ("register", "unregister")
        self._log.info("Setting Cancel Flag and clearing queue.")
        self._cancel_flag.set_value(True)
        # iterate over a copy because the queue is modified inside the loop
        for i in list(self._action_queue):
            if i.name not in DONT_REMOVE:
                self._action_queue.remove(i)
                self._log.info("Removing task {0}".format(i))

    def stop(self):
        """
        Stops the Process/Thread by cancelling queued work and putting a
        blocking "terminate" task at the front of the queue.
        Failures are logged as warnings and not re-raised.
        """
        try:
            self._log.info("Terminating {0} child process".format(self.name))
            self.cancel_current_task()
            self.add_work(Task("terminate", block=True), immediate=True)
        except Exception as e:
            self._log.warning("Exception with stopping {0} child process: {1}".format(self.name, e))

    def join(self):
        """
        Block until underlying process completes.
        """
        self._process.join()
        self._log.info("{0} stopped.".format(self))

    def _create_process(self):
        """
        Create the Application or Job object for this component and start
        its run() method in a daemon thread.

        :rtype: threading.Thread
        :raises ValueError: if the configured type is not supported
        """
        self._log.debug("Starting worker process for %s" % self.name)

        if self._application_type == ApplicationType.APPLICATION:
            worker_cls = Application
        elif self._application_type == ApplicationType.JOB:
            worker_cls = Job
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError; fail with a message that names the cause.
            raise ValueError(
                "Unsupported application type {0!r} for {1}".format(
                    self._application_type, self.name))

        s = worker_cls(
            self._config,
            self._settings,
            self._action_queue,
            self._system,
            self._application_type,
            self._cancel_flag,
        )

        t = Thread(target=s.run, name=self.name)
        t.daemon = True
        t.start()
        return t

    def __str__(self):
        return "ChildProcess(name={0}, type={1})".format(self.name, self._application_type)
Exemple #34
0
class PredicateWeekend(SimplePredicate):
    """
    Predicate that is met while today's date falls on a weekend day.

    Between start() and stop() a daemon thread re-evaluates the predicate,
    sleeping ``interval`` between checks.
    """

    def __init__(self, comp_name, operational=False, parent=None, interval=10):
        """
        :type comp_name: str
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(
            self, comp_name, operational=operational, parent=parent)
        self.interval = interval

        log_name = 'sent.{0}.weekend'.format(comp_name)
        self._log = logging.getLogger(log_name)
        self._log.info('Registered {0}'.format(self))

        # flag polled by the worker loop; stop() sets it to False
        self._operate = ThreadSafeObject(True)
        poller = Thread(target=self._run_loop, name=str(self))
        poller.daemon = True
        self._thread = poller
        self._started = False

    @property
    def weekday(self):
        """
        Day of the week of today's date as returned by
        datetime.date.weekday(): 0=Monday, 1=Tuesday, ... 6=Sunday.
        :rtype: int
        """
        current_date = datetime.date.today()
        return current_date.weekday()

    def start(self):
        """Start the polling thread unless it has been started already."""
        if self._started is not False:
            self._log.debug('Already started {0}'.format(self))
            return

        self._log.debug('Starting {0}'.format(self))
        self._started = True
        self._thread.start()
        self._block_until_started()

    def stop(self):
        """Ask the polling thread to finish and wait for it to exit."""
        if self._started is not True:
            self._log.debug('Already stopped {0}'.format(self))
            return

        self._log.info('Stopping {0}'.format(self))
        self._started = False
        self._operate.set_value(False)
        # the loop notices the flag after its current sleep, so this can
        # block for up to one interval
        self._thread.join()
        self._log.info('{0} stopped'.format(self))

    def _run_loop(self):
        """Re-evaluate the predicate until the operate flag is cleared."""
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done checking for weekend.')

    def _process_met(self):
        """Set 'met' according to whether today is Saturday or Sunday."""
        # NOTE(review): assumes Weekdays uses the same numbering as
        # date.weekday() (Monday=0) - confirm against its definition.
        weekend_days = (Weekdays.SATURDAY, Weekdays.SUNDAY)
        self.set_met(self.weekday in weekend_days)

    def __repr__(self):
        fields = (self.__class__.__name__,
                  self._comp_name,
                  self._parent,
                  self.started,
                  self._operational,
                  self._met)
        return ('{0}(component={1}, parent={2}, started={3}, '
                'operational={4}, met={5})'.format(*fields))

    def __eq__(self, other):
        """Two weekend predicates are equal whenever their types match."""
        return type(other) == type(self)

    def __ne__(self, other):
        return type(other) != type(self)
class TimeWindow(SimplePredicate):
    """
    Predicate for comparing current time to begin/end times.

    'met' is set when every configured condition holds: the current time is
    after ``begin``, before ``end``, and today's weekday is in the configured
    day range. Conditions whose value is None are skipped; when none of the
    three is available the predicate is not met.
    """
    def __init__(self, comp_name, begin=None, end=None,
                 weekdays=None, operational=False, parent=None, interval=5):
        """
        :type comp_name: str
        :type begin: str or None
        :type end: str or None
        :type weekdays: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, operational=operational, parent=parent)
        self.begin = self.get_datetime_object(begin)
        self.end = self.get_datetime_object(end)
        self.day_range = self.parse_range(weekdays)
        self.interval = interval
        self._log = logging.getLogger('sent.{0}.pred.timewin'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        # flag polled by the worker loop; stop() sets it to False
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

    def weekday(self):
        """
        :rtype: int
            0=Monday, 1=Tuesday, etc.
        """
        return datetime.date.today().weekday()

    def start(self):
        """Start the polling thread unless it has been started already."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Ask the polling thread to finish and wait for it to exit."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """Re-evaluate the predicate until the operate flag is cleared."""
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done comparing times.')

    def _process_met(self):
        """Evaluate every configured condition and update 'met'."""
        results = []
        if self.begin is not None:
            compare_to_begin = self._get_comparison(self.begin)
            results.append(self.begin < compare_to_begin)

        if self.end is not None:
            compare_to_end = self._get_comparison(self.end)
            results.append(compare_to_end < self.end)

        if self.day_range is not None:
            results.append(self.weekday() in self.day_range)

        # nothing configured: all([]) would be True, so force "not met"
        if not results:
            results.append(False)

        self.set_met(all(results))  # every comparison returned True

    def _get_comparison(self, obj):
        """
        Return "now" in the same type as ``obj`` (datetime or time) so the
        two can be compared; None for any other type.
        """
        if isinstance(obj, datetime.datetime):
            return datetime.datetime.now()
        elif isinstance(obj, datetime.time):
            return datetime.datetime.now().time()

    @staticmethod
    def get_datetime_object(data):
        """
        Create datetime object from string value.

        A string with a date part yields a datetime.datetime, a time-only
        string yields a datetime.time. None is returned when ``data`` is None
        or does not match the expected format. Parsing problems are logged,
        not raised.
        :type data: str or None
        :rtype: datetime.datetime or datetime.time or None
        """
        if data is None:
            return None

        dt_object = None
        dt_dict = TimeWindow.create_datetime_dict(data)
        try:
            # All of year, month and day are not None
            if all([dt_dict.get(i, None) is not None
                    for i in ('year', 'month', 'day')]):
                dt_object = datetime.datetime(year=dt_dict['year'],
                                              month=dt_dict['month'],
                                              day=dt_dict['day'],
                                              hour=dt_dict['hour'],
                                              minute=dt_dict['minute'],
                                              microsecond=0)
                if dt_dict.get('second', None) is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])

            # both hour and minute are not None
            elif all([dt_dict.get(i, None) is not None
                      for i in ('hour', 'minute')]):
                dt_object = datetime.time(hour=dt_dict['hour'],
                                          minute=dt_dict['minute'],
                                          microsecond=0)
                if dt_dict.get('second', None) is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])
            else:
                logging.getLogger('PredicateTime').error(
                    'data "{0}" did not match regex. This will result in the '
                    'paramter returning as None. The predicate will never be '
                    'met for this parameter. '.format(data))

        except (ValueError, TypeError) as ex:
            logging.getLogger('PredicateTime').error(
                'Problem with parsing data "{0}": {1}'.format(data, ex))

        # Returned after the try block rather than from a ``finally``
        # clause: a return inside ``finally`` would silently discard any
        # other in-flight exception (including KeyboardInterrupt).
        return dt_object

    @staticmethod
    def create_datetime_dict(datetime_string):
        """
        Split a '[YYYY-MM-DD ]H[H]:MM[:SS]' string into its integer parts.

        Keys are year, month, day, hour, minute and second; parts missing
        from the string map to None. An empty dict is returned when the
        string does not match.
        :type datetime_string: str
        :rtype: dict
        """
        # raw strings: '\d', '\-' and '\s' are not valid string escapes
        datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?"
        )
        regex_dict = dict()
        match = re.search(datetime_regex, datetime_string)
        if match:
            regex_dict = match.groupdict()

        # convert all values to integers; items() works on Python 2 and 3,
        # and list() keeps the iteration independent of the assignments
        for k, v in list(regex_dict.items()):
            if v is not None:
                regex_dict[k] = int(v)

        logging.getLogger('PredicateTime').debug(
            'datetime_dict returning {0}'.format(regex_dict))
        return regex_dict

    @staticmethod
    def parse_range(astr):
        """
        https://www.darklaunch.com/2012/11/05/python-parse-range-and-parse-group-range
        Return a range list given a string such as '0-2,4'.
        As this is for weekdays, only return 0-6.
        An empty list is returned when a part is not an integer.

        :type astr: str or None
        :rtype: list or None
        """
        if astr is None:
            return None

        try:
            result = set()
            for part in astr.split(','):
                # 'a-b' expands to a..b inclusive; a single 'a' to just a
                x = part.split('-')
                result.update(range(int(x[0]), int(x[-1]) + 1))

            # only accept 0-6
            return [i for i in sorted(result) if 0 <= i <= 6]
        except ValueError:
            logging.warning('Error parsing day range. Returning [].')
            return []

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, begin="{3}", '
                'end="{4}", days={5}, started={6}, operational={7}, met={8})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.begin,
                        self.end,
                        self.day_range,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        """Equal when type, begin, end, day range and interval all match."""
        return all([
            type(self) == type(other),
            self.begin == getattr(other, 'begin', None),
            self.end == getattr(other, 'end', None),
            self.day_range == getattr(other, 'day_range', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.begin != getattr(other, 'begin', None),
            self.end != getattr(other, 'end', None),
            self.day_range != getattr(other, 'day_range', None),
            self.interval != getattr(other, 'interval', None)
        ])
Exemple #36
0
class ZookeeperGoodUntilTime(SimplePredicate):
    """
    Predicate driven by 'start' and 'stop' times stored as JSON in a
    ZooKeeper node.

    The predicate is met while the current time is after the parsed start
    and before the parsed stop (whichever of the two are available). With
    neither available it is not met.
    """
    def __init__(self, comp_name, settings, zkclient, nodepath, parent=None, interval=5):
        """
        :type comp_name: str
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type zkclient: kazoo.client.KazooClient
        :type nodepath: str
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, settings, parent=parent)
        self.node = nodepath
        self.zkclient = zkclient
        self.interval = interval
        self._start = None
        self._stop = None
        self._log = logging.getLogger('sent.{0}.pred.gut'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        # flag polled by the worker loop; stop() sets it to False
        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

        # raw strings: '\d', '\-' and '\s' are not valid string escapes
        self._datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?"
        )

    @property
    def current_time(self):
        return datetime.datetime.now().time()

    @property
    def current_datetime(self):
        return datetime.datetime.now()

    def start(self):
        """Read the node (setting watches), then start the polling thread."""
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._watch_node()
            self._thread.start()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """Ask the polling thread to finish and wait for it to exit."""
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """Re-evaluate the predicate until the operate flag is cleared."""
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done comparing guts.')

    def _process_met(self):
        """Compare now against the stored start/stop and update 'met'."""
        results = []
        if self._start is not None:
            compare_to_start = self._get_comparison(self._start)
            results.append(self._start < compare_to_start)

        if self._stop is not None:
            compare_to_stop = self._get_comparison(self._stop)
            results.append(compare_to_stop < self._stop)

        # nothing to compare: all([]) would be True, so force "not met"
        if not results:
            results.append(False)

        self.set_met(all(results))  # every comparison returned True

    def _create_dt_dict(self, datetime_string):
        """
        Split a '[YYYY-MM-DD ]HH:MM[:SS]' string into its integer parts.

        Keys are year, month, day, hour, minute and second; parts missing
        from the string map to None. An empty dict is returned when the
        string does not match.
        :type datetime_string: str
        :rtype: dict
        """
        regex_dict = dict()
        match = re.search(self._datetime_regex, datetime_string)
        if match:
            regex_dict = match.groupdict()

        # items() works on Python 2 and 3; list() keeps the iteration
        # independent of the assignments
        for k, v in list(regex_dict.items()):
            if v is not None:
                regex_dict[k] = int(v)

        self._log.debug('dt_dict returning {0}'.format(regex_dict))
        return regex_dict

    def _get_comparison(self, obj):
        """
        Return "now" in the same type as ``obj`` (datetime or time) so the
        two can be compared; None for any other type.
        """
        if isinstance(obj, datetime.datetime):
            return self.current_datetime
        elif isinstance(obj, datetime.time):
            return self.current_time

    def _get_datetime_object(self, data):
        """
        Parse ``data`` into a datetime.datetime (date part present) or a
        datetime.time (time only). Returns None when the string does not
        match; parsing problems are logged, not raised.
        :type data: str
        :rtype: datetime.datetime or datetime.time or None
        """
        dt_object = None
        dt_dict = self._create_dt_dict(data)
        try:
            # All of year, month and day are not None
            if all([dt_dict.get(i, None) is not None
                    for i in ('year', 'month', 'day')]):
                dt_object = datetime.datetime(year=dt_dict['year'],
                                              month=dt_dict['month'],
                                              day=dt_dict['day'],
                                              hour=dt_dict['hour'],
                                              minute=dt_dict['minute'],
                                              microsecond=0)

                if dt_dict.get('second', None) is not None:
                    # replace() returns a new object (datetimes are
                    # immutable), so the result has to be kept
                    dt_object = dt_object.replace(second=dt_dict['second'])

            # both hour and minute are not None
            elif all([dt_dict.get(i, None) is not None
                      for i in ('hour', 'minute')]):
                dt_object = datetime.time(hour=dt_dict['hour'],
                                          minute=dt_dict['minute'],
                                          microsecond=0)
                if dt_dict.get('second', None) is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])
            else:
                self._log.error('data "{0}" did not match regex'.format(data))

        except (ValueError, TypeError) as ex:
            self._log.error('Problem with parsing data "{0}": {1}'
                            .format(data, ex))

        # Returned after the try block rather than from a ``finally``
        # clause: a return inside ``finally`` would silently discard any
        # other in-flight exception (including KeyboardInterrupt).
        return dt_object

    def _parse_data(self, gut_data):
        """
        Update the stored start/stop times from the node's decoded JSON.
        A key that is absent leaves the corresponding stored value as is.
        :type gut_data: dict
        """
        start_data = gut_data.get(u'start', None)
        self._log.debug('raw start from zk is "{0}"'.format(start_data))
        if start_data is not None:
            self._start = self._get_datetime_object(start_data)

        stop_data = gut_data.get(u'stop', None)
        self._log.debug('raw stop from zk is "{0}"'.format(stop_data))

        if stop_data is not None:
            self._stop = self._get_datetime_object(stop_data)

        if start_data is None and stop_data is None:
            self._log.error('Start and Stop time not specified!')

        self._log.info('The current time is: {0}. Start time is: {1}. '
                       'Stop time is: {2}'
                       .format(self.current_time, self._start, self._stop))

    @connected
    def _watch_node(self, event=None):
        """
        Read the node, re-register this method as its watch and re-evaluate
        the predicate. Invalid JSON is logged and otherwise ignored.
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        try:
            exists = self.zkclient.exists(self.node, watch=self._watch_node)
            if exists:
                data, stat = self.zkclient.get(self.node,
                                               watch=self._watch_node)
                j = json.loads(data)
                self._parse_data(j)
            else:
                self._log.info('No gut node was found. Watcher is set at {0}'
                               .format(self.node))
        except ValueError as ex:
            self._log.error('Invalid GUT JSON object: {0}'.format(ex))
        finally:
            self._process_met()

    def __repr__(self):
        return ('{0}(component={1}, parent={2}, start="{3}", stop="{4}", '
                'zkpath={5}, started={6}, met={7})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self._start,
                        self._stop,
                        self.node,
                        self.started,
                        self._met))

    def __eq__(self, other):
        """Equal when both the type and the watched node path match."""
        return all([
            type(self) == type(other),
            self.node == getattr(other, 'node', None)
        ])

    def __ne__(self, other):
        return any([
            type(self) != type(other),
            self.node != getattr(other, 'node', None)
        ])
Exemple #37
0
class PredicateProcess(SimplePredicate):
    """
    Predicate that polls a ProcessClient from a background thread and sets
    its 'met' value to the result of ``running()``.
    """
    def __init__(self,
                 comp_name,
                 proc_client,
                 interval,
                 operational=False,
                 parent=None):
        """
        :type comp_name: str
        :type proc_client: zoom.agent.client.process_client.ProcessClient
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(self, comp_name,
                                 operational=operational, parent=parent)
        self._log = logging.getLogger(
            'sent.{0}.pred.process'.format(comp_name))
        self._proc_client = proc_client

        # lock for synchronous decorator: reuse the client's lock when a
        # client was supplied, otherwise create a private one
        self.process_client_lock = (proc_client.process_client_lock
                                    if proc_client else Lock())

        self.interval = interval
        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run_loop, name=str(self))
        worker.daemon = True
        self._thread = worker
        self._started = False

    def running(self):
        """
        Ask the ProcessClient whether the process is running.

        With the synchronous decorator, this shares a Lock object with the
        ProcessClient. While ProcessClient.start is running, this will not
        return.
        :rtype: bool
        """
        return self._proc_client.running()

    def start(self):
        """
        Start the polling thread once; later calls only log a message.
        """
        if self._started is not False:
            self._log.debug('Already started {0}'.format(self))
            return

        self._log.debug('Starting {0}'.format(self))
        self._started = True
        self._thread.start()
        self._block_until_started()

    def stop(self):
        """
        Tell the polling loop to finish and wait for the thread to exit.
        Does nothing but log when the predicate is not started.
        """
        if self._started is not True:
            self._log.debug('Already stopped {0}'.format(self))
            return

        self._log.info('Stopping {0}'.format(self))
        self._started = False
        self._operate.set_value(False)
        self._thread.join()
        self._log.info('{0} stopped'.format(self))

    def _run_loop(self):
        """
        Poll the process every ``interval`` seconds until told to stop.

        While the client's cancel flag is set, the status check is skipped
        for two consecutive iterations; on the third the flag is reset.
        """
        skipped = 0
        while self._operate == True:
            if self._proc_client.cancel_flag == False:
                self.set_met(self.running())
                skipped = 0
            elif skipped <= 1:
                skipped += 1
                self._log.info('Cancel Flag detected, skipping status check.')
            else:
                self._log.info('Waited long enough. Resetting cancel flag.')
                self._proc_client.cancel_flag.set_value(False)
                skipped = 0

            sleep(self.interval)
        self._log.info('Done watching process.')

    def __repr__(self):
        """
        :rtype: str
        """
        template = ('{0}(component={1}, parent={2}, interval={3}, started={4}, '
                    'operational={5}, met={6})')
        return template.format(self.__class__.__name__,
                               self._comp_name,
                               self._parent,
                               self.interval,
                               self.started,
                               self._operational,
                               self._met)

    def __eq__(self, other):
        """
        Equal when both objects have the same type and polling interval.

        :rtype: bool
        """
        same_type = type(self) == type(other)
        same_interval = self.interval == getattr(other, 'interval', None)
        return bool(same_type and same_interval)

    def __ne__(self, other):
        """
        Not equal when the types or the polling intervals differ.

        :rtype: bool
        """
        type_differs = type(self) != type(other)
        interval_differs = self.interval != getattr(other, 'interval', None)
        return bool(type_differs or interval_differs)
Exemple #38
0
class PredicateHoliday(SimplePredicate):
    """
    Predicate that is met when today's date (formatted YYYYMMDD) is one of
    the children of a Zookeeper node.
    """
    def __init__(self,
                 comp_name,
                 zkclient,
                 path,
                 operational=False,
                 parent=None,
                 interval=10):
        """
        :type comp_name: str
        :type zkclient: kazoo.client.KazooClient
        :type path: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self,
                                 comp_name,
                                 operational=operational,
                                 parent=parent)
        self.zkclient = zkclient
        self.interval = interval
        self.path = path
        self._log = logging.getLogger('sent.{0}.holiday'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False
        # children of the holiday node, as last read by _watch_node
        self._holidays = list()

    @property
    def date_string(self):
        """
        Today's date formatted as YYYYMMDD.

        :rtype: str
            Example: 20140101
        """
        return datetime.date.today().strftime('%Y%m%d')

    def start(self):
        """
        Read the holiday node, then start the polling thread. Later calls
        only log a message.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._watch_node()
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Tell the polling loop to finish and wait for the thread to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """
        Re-evaluate the predicate every ``interval`` seconds so that a date
        change is picked up without a Zookeeper event.
        """
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done checking for holiday.')

    def _process_met(self):
        """
        Set 'met' to whether today's date string is among the holidays.
        """
        self.set_met(self.date_string in self._holidays)

    @connected
    def _watch_node(self, event=None):
        """
        Refresh the holiday list from the children of ``self.path`` and
        re-register this method as the watch callback.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        if self.path is None:
            self._log.warning('No zookeeper path given. This predicate will'
                              ' never be met.')
            return

        exists = self.zkclient.exists(self.path, watch=self._watch_node)
        if exists:
            self._holidays = self.zkclient.get_children(self.path,
                                                        watch=self._watch_node)
            self._log.info('Got holidays {0}'.format(self._holidays))
            self._process_met()
        else:
            # The previously read holiday list is kept as-is in this case.
            self._log.info(
                'No holiday node was found. Watcher is set at {0}'.format(
                    self.path))

    def __repr__(self):
        """
        :rtype: str
        """
        return ('{0}(component={1}, parent={2}, started={3}, '
                'operational={4}, met={5})'.format(self.__class__.__name__,
                                                   self._comp_name,
                                                   self._parent, self.started,
                                                   self._operational,
                                                   self._met))

    def __eq__(self, other):
        """
        Equality is by type only.

        :rtype: bool
        """
        return type(self) == type(other)

    def __ne__(self, other):
        """
        :rtype: bool
        """
        return type(self) != type(other)
Exemple #39
0
class TimeWindow(SimplePredicate):
    """
    Predicate for comparing the current time to begin/end times.
    It will set the 'met' value based on begin < current_time < end and,
    when weekdays are given, on today's weekday being one of them.
    """
    def __init__(self,
                 comp_name,
                 begin=None,
                 end=None,
                 weekdays=None,
                 operational=False,
                 parent=None,
                 interval=5):
        """
        :type comp_name: str
        :type begin: str or None
        :type end: str or None
        :type weekdays: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self,
                                 comp_name,
                                 operational=operational,
                                 parent=parent)
        self.begin = self.get_datetime_object(begin)
        self.end = self.get_datetime_object(end)
        self.day_range = self.parse_range(weekdays)
        self.interval = interval
        self._log = logging.getLogger(
            'sent.{0}.pred.timewin'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False

    def weekday(self):
        """
        :rtype: int
            0=Monday, 1=Tuesday, etc.
        """
        return datetime.date.today().weekday()

    def start(self):
        """
        Start the polling thread once; later calls only log a message.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Tell the polling loop to finish and wait for the thread to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """
        Re-evaluate the predicate every ``interval`` seconds until stopped.
        """
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done comparing times.')

    def _process_met(self):
        """
        Evaluate every configured constraint (begin, end, weekdays) and set
        'met' to True only when all of them hold. When no constraint is
        configured the predicate is not met.
        """
        results = []
        if self.begin is not None:
            compare_to_begin = self._get_comparison(self.begin)
            results.append(self.begin < compare_to_begin)

        if self.end is not None:
            compare_to_end = self._get_comparison(self.end)
            results.append(compare_to_end < self.end)

        if self.day_range is not None:
            results.append(self.weekday() in self.day_range)

        if not results:
            results.append(False)

        self.set_met(all(results))  # every comparison returned True

    def _get_comparison(self, obj):
        """
        Return "now" in the same type as ``obj`` so the two can be compared:
        a datetime for a datetime, a time for a time. Any other type yields
        None.

        :type obj: datetime.datetime or datetime.time
        :rtype: datetime.datetime or datetime.time or None
        """
        if isinstance(obj, datetime.datetime):
            return datetime.datetime.now()
        elif isinstance(obj, datetime.time):
            return datetime.datetime.now().time()

    @staticmethod
    def get_datetime_object(data):
        """
        Create datetime object from string value.

        A string with a date part ("YYYY-MM-DD HH:MM[:SS]") yields a
        datetime.datetime; a time-only string ("HH:MM[:SS]") yields a
        datetime.time. Unparseable input is logged and yields None.

        :type data: str or None
        :rtype: datetime.datetime or datetime.time or None
        """
        if data is None:
            return None

        dt_object = None
        dt_dict = TimeWindow.create_datetime_dict(data)
        try:
            # All of year, month and day are not None
            if all(dt_dict.get(i, None) is not None
                   for i in ('year', 'month', 'day')):
                dt_object = datetime.datetime(year=dt_dict['year'],
                                              month=dt_dict['month'],
                                              day=dt_dict['day'],
                                              hour=dt_dict['hour'],
                                              minute=dt_dict['minute'],
                                              microsecond=0)
                if dt_dict['second'] is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])

            # both hour and minute are not None
            elif all(dt_dict.get(i, None) is not None
                     for i in ('hour', 'minute')):
                dt_object = datetime.time(hour=dt_dict['hour'],
                                          minute=dt_dict['minute'],
                                          microsecond=0)
                if dt_dict.get('second', None) is not None:
                    dt_object = dt_object.replace(second=dt_dict['second'])
            else:
                logging.getLogger('PredicateTime').error(
                    'data "{0}" did not match regex. This will result in the '
                    'paramter returning as None. The predicate will never be '
                    'met for this parameter. '.format(data))

        except (ValueError, TypeError) as ex:
            # Out-of-range fields (e.g. month 13) end up here. If only the
            # seconds were invalid, the object built without them is kept.
            logging.getLogger('PredicateTime').error(
                'Problem with parsing data "{0}": {1}'.format(data, ex))

        # Returned after the try block rather than from a ``finally`` so
        # that unexpected exceptions are not silently discarded.
        return dt_object

    @staticmethod
    def create_datetime_dict(datetime_string):
        """
        Split a "[YYYY-MM-DD ]H[H]:MM[:SS]" string into its integer parts.

        The result has the keys year, month, day, hour, minute and second;
        parts missing from the string are None. An empty dict is returned
        when the string does not match at all.

        :type datetime_string: str
        :rtype: dict
        """
        # Raw strings keep the regex escapes (\d, \s) out of Python's own
        # string-escape processing.
        datetime_regex = (
            r"^((?P<year>\d{4})\-(?P<month>\d{2})\-(?P<day>\d{2})\s)?"
            r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(:(?P<second>\d{2}))?")
        regex_dict = dict()
        match = re.search(datetime_regex, datetime_string)
        if match:
            # convert all matched values to integers, leave the rest as None
            regex_dict = dict(
                (key, int(value) if value is not None else None)
                for key, value in match.groupdict().items())

        logging.getLogger('PredicateTime').debug(
            'datetime_dict returning {0}'.format(regex_dict))
        return regex_dict

    @staticmethod
    def parse_range(astr):
        """
        https://www.darklaunch.com/2012/11/05/python-parse-range-and-parse-group-range
        Return a range list given a string.
        As this is for weekdays, only return 0-6

        Comma-separated parts are either single numbers or "low-high"
        ranges, e.g. "0-2,4" gives [0, 1, 2, 4]. A part that cannot be
        converted to integers makes the whole result [].

        :type astr: str or None
        :rtype: list or None
        """
        if astr is None:
            return None

        try:
            result = set()
            for part in astr.split(','):
                x = part.split('-')
                result.update(range(int(x[0]), int(x[-1]) + 1))

            # only accept 0-6
            return [i for i in sorted(result) if 0 <= i <= 6]
        except ValueError:
            logging.warning('Error parsing day range. Returning [].')
            return []

    def __repr__(self):
        """
        :rtype: str
        """
        return ('{0}(component={1}, parent={2}, begin="{3}", '
                'end="{4}", days={5}, started={6}, operational={7}, met={8})'.
                format(self.__class__.__name__, self._comp_name, self._parent,
                       self.begin, self.end, self.day_range, self.started,
                       self._operational, self._met))

    def __eq__(self, other):
        """
        Equal when type, begin, end, day range and interval all match.

        :rtype: bool
        """
        return all([
            type(self) == type(other),
            self.begin == getattr(other, 'begin', None),
            self.end == getattr(other, 'end', None),
            self.day_range == getattr(other, 'day_range', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        """
        Not equal when any of type, begin, end, day range or interval
        differs.

        :rtype: bool
        """
        return any([
            type(self) != type(other),
            self.begin != getattr(other, 'begin', None),
            self.end != getattr(other, 'end', None),
            self.day_range != getattr(other, 'day_range', None),
            self.interval != getattr(other, 'interval', None)
        ])
Exemple #40
0
class WorkManager(object):
    """
    Takes tasks off a queue, one at a time, on a dedicated daemon thread
    and runs each of them in its own ThreadWithReturn.
    """
    def __init__(self, comp_name, queue, work_dict):
        """
        :type comp_name: str
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type work_dict: dict
        """
        self._operate = ThreadSafeObject(True)
        worker = Thread(target=self._run,
                        name='work_manager',
                        args=(self._operate, queue, work_dict))
        worker.daemon = True
        self._thread = worker
        self._log = logging.getLogger('sent.{0}.wm'.format(comp_name))

    def start(self):
        """
        Start the worker thread.
        """
        self._log.info('starting work manager')
        self._thread.start()

    def stop(self):
        """
        Clear the operate flag and wait for the worker thread to exit.
        """
        self._log.info('Stopping work manager.')
        self._operate.set_value(False)
        self._thread.join()
        self._log.info('Stopped work manager.')

    def _run(self, operate, queue, work_dict):
        """
        Worker loop: while ``operate`` compares equal to True, handle the
        task at the head of the queue, or sleep one second when the queue
        is empty.

        The function to run is the task's own ``func`` when it has one,
        otherwise it is looked up in ``work_dict`` by the task name.

        :type operate: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type work_dict: dict
        """
        while operate == True:
            if not queue:  # nothing to do yet
                time.sleep(1)
                continue

            self._log.info('Current Task Queue:\n{0}'
                           .format(pprint.pformat(list(queue))))
            # peek at the head; it stays queued until it has been handled
            task = queue[0]

            if task.func is not None:
                func_to_run = task.func
            else:
                func_to_run = work_dict.get(task.name, None)

            if func_to_run is None:
                self._log.warning('Cannot do "{0}", it is not a valid '
                                  'action.'.format(task.name))
            else:
                self._log.info('Found work "{0}" in queue.'
                               .format(task.name))
                runner = ThreadWithReturn(target=func_to_run, name=task.name,
                                          args=task.args, kwargs=task.kwargs)
                runner.start()

                # only blocking tasks wait for, and record, the result
                if task.block:
                    task.result = runner.join()

            try:
                queue.remove(task)
            except ValueError:
                self._log.debug('Item no longer exists in the queue: {0}'
                                .format(task))

        self._log.info('Done listening for work.')
Exemple #41
0
    def __init__(self,
                 name,
                 component_name,
                 action,
                 xmlpart,
                 staggerpath=None,
                 staggertime=None,
                 mode_controlled=False,
                 action_q=None,
                 zkclient=None,
                 proc_client=None,
                 mode=None,
                 system=None,
                 pred_list=None,
                 settings=None,
                 disabled=False,
                 pd_enabled=True,
                 op_action=None,
                 pd_reason=None,
                 app_state=None):
        """
        Store the action's settings, create a StaggerLock when both stagger
        values are given, and build the predicate from the XML.

        :param action: The function to run when all the action's predicates are met
        :param xmlpart: The part of XML pertaining to this Action; the
            predicate is built from its './Dependency/Predicate' element
        :param mode_controlled: Whether or not the action will run based on the ApplicationMode
        :param op_action: The function to run if this action's operation dependencies go down.
        :type name: str
        :type component_name: str
        :type action: types.FunctionType
        :type xmlpart: xml.etree.ElementTree.Element
        :type staggerpath: str
        :type staggertime: int
        :type mode_controlled: bool
        :type action_q: zoom.agent.entities.unique_queue.UniqueQueue
        :type zkclient: kazoo.client.KazooClient
        :type proc_client: zoom.agent.client.process_client.ProcessClient
        :type mode: zoom.agent.entities.thread_safe_object.ApplicationMode
        :type system: zoom.common.types.PlatformType
        :type pred_list: list
        :type settings: dict
        :type disabled: bool
        :type pd_enabled: bool
        :type op_action: types.FunctionType or None
        :type pd_reason: str or None
        :type app_state: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        """
        # identity and logging
        self.name = name
        self.disabled = disabled
        self.component_name = component_name
        self._log = logging.getLogger('sent.{0}.act'.format(component_name))

        # what to run, and how it is controlled
        self._action = action
        self._action_queue = action_q
        self._mode_controlled = mode_controlled
        self._mode = mode
        self._pd_enabled = pd_enabled
        self._op_action = op_action
        self._pd_reason = pd_reason
        self._acquire_lock = ThreadSafeObject(True)

        # a stagger lock is only used when both its path and time are given
        if staggerpath is None or staggertime is None:
            self._stag_lock = None
        else:
            self._stag_lock = StaggerLock(staggerpath,
                                          staggertime,
                                          parent=self.component_name,
                                          acquire_lock=self._acquire_lock,
                                          app_state=app_state)
            self._log.info('Using {0}'.format(self._stag_lock))

        predicate_xml = xmlpart.find('./Dependency/Predicate')
        factory = PredicateFactory(component_name=component_name,
                                   action=self.name,
                                   zkclient=zkclient,
                                   proc_client=proc_client,
                                   system=system,
                                   pred_list=pred_list,
                                   settings=settings)
        self._predicate = factory.create(predicate_xml,
                                         callback=self._callback)
Exemple #42
0
 def test_no_match(self):
     """
     _ensure_new must return the very object it was given here
     (judging by the test name, the case where nothing matches).
     """
     candidate = SimplePredicate("c", ThreadSafeObject({}))
     returned = self.factory._ensure_new(candidate)
     self.assertTrue(returned is candidate)
Exemple #43
0
class PredicateHoliday(SimplePredicate):
    """
    Predicate that is met when today's date (formatted YYYYMMDD) is one of
    the children of a Zookeeper node.
    """
    def __init__(self, comp_name, zkclient, path,
                 operational=False, parent=None, interval=10):
        """
        :type comp_name: str
        :type zkclient: kazoo.client.KazooClient
        :type path: str or None
        :type operational: bool
        :type parent: str or None
        :type interval: int or float
        """
        SimplePredicate.__init__(self, comp_name, operational=operational, parent=parent)
        self.zkclient = zkclient
        self.interval = interval
        self.path = path
        self._log = logging.getLogger('sent.{0}.holiday'.format(comp_name))
        self._log.info('Registered {0}'.format(self))

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._started = False
        # children of the holiday node, as last read by _watch_node
        self._holidays = list()

    @property
    def date_string(self):
        """
        Today's date formatted as YYYYMMDD.

        :rtype: str
            Example: 20140101
        """
        return datetime.date.today().strftime('%Y%m%d')

    def start(self):
        """
        Read the holiday node, then start the polling thread. Later calls
        only log a message.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._watch_node()
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Tell the polling loop to finish and wait for the thread to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run_loop(self):
        """
        Re-evaluate the predicate every ``interval`` seconds so that a date
        change is picked up without a Zookeeper event.
        """
        while self._operate == True:
            self._process_met()
            sleep(self.interval)
        self._log.info('Done checking for holiday.')

    def _process_met(self):
        """
        Set 'met' to whether today's date string is among the holidays.
        """
        self.set_met(self.date_string in self._holidays)

    @connected
    def _watch_node(self, event=None):
        """
        Refresh the holiday list from the children of ``self.path`` and
        re-register this method as the watch callback.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        if self.path is None:
            self._log.warning('No zookeeper path given. This predicate will'
                              ' never be met.')
            return

        exists = self.zkclient.exists(self.path, watch=self._watch_node)
        if exists:
            self._holidays = self.zkclient.get_children(self.path,
                                                        watch=self._watch_node)
            self._log.info('Got holidays {0}'.format(self._holidays))
            self._process_met()
        else:
            # The previously read holiday list is kept as-is in this case.
            self._log.info('No holiday node was found. Watcher is set at {0}'
                           .format(self.path))

    def __repr__(self):
        """
        :rtype: str
        """
        return ('{0}(component={1}, parent={2}, started={3}, '
                'operational={4}, met={5})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        """
        Equality is by type only.

        :rtype: bool
        """
        return type(self) == type(other)

    def __ne__(self, other):
        """
        :rtype: bool
        """
        return type(self) != type(other)
Exemple #44
0
class APIPredicate(SimplePredicate):
    """
    Predicate that polls a url for a specific code.
    """
    def __init__(self, comp_name, url, verb='GET', expected_code=200,
                 interval=5.0, operational=False, parent=None):
        """
        :type comp_name: str
        :type url: str
        :type verb: str
        :type expected_code: int
        :type interval: int or float
        :type operational: bool
        :type parent: str or None
        """
        SimplePredicate.__init__(self, comp_name, operational=operational, parent=parent)
        self._log = logging.getLogger('sent.{0}.pred.api'.format(comp_name))
        # keep urllib3's connection pool logger at WARNING or above
        logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING)
        self.url = url
        self.verb = verb
        self.expected_code = expected_code
        self.interval = interval

        self._operate = ThreadSafeObject(True)
        self._thread = Thread(target=self._run_loop, name=str(self))
        self._thread.daemon = True
        self._log.info('Registered {0}'.format(self))
        self._started = False

    def start(self):
        """
        Start the polling thread once; later calls only log a message.
        """
        if self._started is False:
            self._log.debug('Starting {0}'.format(self))
            self._started = True
            self._thread.start()
            self._block_until_started()
        else:
            self._log.debug('Already started {0}'.format(self))

    def stop(self):
        """
        Tell the polling loop to finish and wait for the thread to exit.
        """
        if self._started is True:
            self._log.info('Stopping {0}'.format(self))
            self._started = False
            self._operate.set_value(False)
            self._thread.join()
            self._log.info('{0} stopped'.format(self))
        else:
            self._log.debug('Already stopped {0}'.format(self))

    def _run(self):
        """
        Query the given url, and report whether we get the expected code.

        Any failure of the request itself marks the predicate as not met.
        """
        try:
            r = requests.request(self.verb, self.url, timeout=2)
            self.set_met(r.status_code == self.expected_code)
        except requests.ConnectionError:
            self._log.debug('URL {0} is not available.'.format(self.url))
            self.set_met(False)
        except requests.Timeout:
            self._log.debug('Timed out to URL {0}.'.format(self.url))
            self.set_met(False)
        except requests.RequestException as ex:
            # Any other request failure (too many redirects, invalid URL,
            # ...) must not escape: an uncaught exception would end the
            # polling thread and freeze 'met' at its last value.
            self._log.warning('Request to URL {0} failed: {1}'
                              .format(self.url, ex))
            self.set_met(False)

    def _run_loop(self):
        """
        Query the url every ``interval`` seconds until told to stop.
        """
        while self._operate == True:
            self._run()
            sleep(self.interval)
        self._log.info('Done querying {0}'.format(self.url))

    def __repr__(self):
        """
        :rtype: str
        """
        return ('{0}(component={1}, parent={2}, url="{3}", verb={4}, '
                'interval={5} started={6}, operational={7}, met={8})'
                .format(self.__class__.__name__,
                        self._comp_name,
                        self._parent,
                        self.url,
                        self.verb,
                        self.interval,
                        self.started,
                        self._operational,
                        self._met))

    def __eq__(self, other):
        """
        Equal when type, url, verb and interval all match. The expected
        status code is not part of the comparison.

        :rtype: bool
        """
        return all([
            type(self) == type(other),
            self.url == getattr(other, 'url', None),
            self.verb == getattr(other, 'verb', None),
            self.interval == getattr(other, 'interval', None)
        ])

    def __ne__(self, other):
        """
        Not equal when any of type, url, verb or interval differs.

        :rtype: bool
        """
        return any([
            type(self) != type(other),
            self.url != getattr(other, 'url', None),
            self.verb != getattr(other, 'verb', None),
            self.interval != getattr(other, 'interval', None)
        ])
Exemple #45
0
class ChildProcess(object):
    """
    Wraps a threading.Thread, providing a Queue for communication between
    the SentinelDaemon and the ChildProcess.
    """
    def __init__(self, config, system, settings):
        """
        Read the component id and type from the config and immediately
        start the worker thread.

        :type config: xml.etree.ElementTree.Element
        :type system: zoom.common.types.PlatformType
        :type settings: dict
        """
        self._log = logging.getLogger('sent.child')
        self._action_queue = UniqueQueue()
        self._cancel_flag = ThreadSafeObject(False)

        self.name = verify_attribute(config, 'id')
        self._application_type = verify_attribute(config, 'type')
        self._config = config
        self._system = system  # Linux or Windows
        self._settings = settings
        self._process = self._create_process()

    def add_work(self, work, immediate=False):
        """
        Append a task to the action queue; ``immediate`` is passed to the
        queue as ``first``.

        :type work: zoom.agent.task.task.Task
        :type immediate: bool
        :rtype: bool
        """
        return self._action_queue.append_unique(work,
                                                sender=str(self),
                                                first=immediate)

    def cancel_current_task(self):
        """
        Set the cancel flag that is used in the process client, and drop
        every queued task except 'register' and 'unregister'.
        """
        # this seems like a hack. There must be a better way of cancelling while
        #   still allowing the agent to report up/down status
        DONT_REMOVE = ('register', 'unregister')
        self._log.info('Setting Cancel Flag and clearing queue.')
        self._cancel_flag.set_value(True)
        # iterate over a copy because the queue is modified in the loop
        for i in list(self._action_queue):
            if i.name not in DONT_REMOVE:
                self._action_queue.remove(i)
                self._log.info('Removing task {0}'.format(i))

    def stop(self):
        """
        Stops the Process/Thread by cancelling queued work and queueing a
        blocking 'terminate' task at the front. Failures are only logged.
        """
        try:
            self._log.info('Terminating {0} child process'.format(self.name))
            self.cancel_current_task()
            self.add_work(Task('terminate', block=True), immediate=True)
        except Exception as e:
            self._log.warning(
                'Exception with stopping {0} child process: {1}'.format(
                    self.name, e))

    def join(self):
        """
        Block until underlying process completes.
        """
        self._process.join()
        self._log.info('{0} stopped.'.format(self))

    def _create_process(self):
        """
        Build the Application or Job for this component and run it on a
        started daemon thread.

        :rtype: threading.Thread
        :raises ValueError: if the configured type is neither an
            application nor a job
        """
        self._log.debug('Starting worker process for %s' % self.name)

        if self._application_type == ApplicationType.APPLICATION:
            s = Application(self._config, self._settings, self._action_queue,
                            self._system, self._application_type,
                            self._cancel_flag)
        elif self._application_type == ApplicationType.JOB:
            s = Job(self._config, self._settings, self._action_queue,
                    self._system, self._application_type, self._cancel_flag)
        else:
            # Without this branch ``s`` would be unbound below and the
            # failure would surface as a confusing UnboundLocalError.
            raise ValueError(
                'Unknown application type "{0}" for component {1}'.format(
                    self._application_type, self.name))

        t = Thread(target=s.run, name=self.name)
        t.daemon = True
        t.start()
        return t

    def __str__(self):
        """
        :rtype: str
        """
        return 'ChildProcess(name={0}, type={1})'.format(
            self.name, self._application_type)
Exemple #46
0
class Application(object):
    """
    Service object to represent a deployed service.
    """
    def __init__(self, config, settings, queue, system, application_type,
                 cancel_flag):
        """
        Read the component's configuration, then build the ZooKeeper client,
        the process client, the actions and the work manager that drive it.

        :type config: dict (xml)
        :type settings: dict
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        """
        self.config = config
        self._settings = settings
        # 'id' is mandatory (none_allowed=False); it also names the loggers
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))
        # informational attributes
        self._host = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None  # last ZooKeeper state seen by _zk_listener
        # placeholder; replaced by _init_actions() at the end of __init__
        self._actions = dict()  # created in _reset_watches on zk connect
        # compared with pagerduty 'enabled_environments' in _create_alert_node
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type
        # start() refuses to start a crashed app when this is falsy
        self._restart_on_crash = \
            verify_attribute(self.config, 'restart_on_crash', none_allowed=True)
        # passed to sleep() in stop() after the process client returns
        self._post_stop_sleep = verify_attribute(self.config,
                                                 'post_stop_sleep',
                                                 none_allowed=True,
                                                 cast=int,
                                                 default=5)

        # tool-like attributes
        # named by @run_only_one('listener_lock') on
        # _reset_after_connection_loss
        self.listener_lock = Lock()
        self._action_queue = queue
        # mode, state and the restart counter are all given
        # _update_agent_node_with_app_details as their callback
        # (presumably invoked when the value changes -- confirm in
        # ApplicationMode / ThreadSafeObject / RestartLogic)
        self._mode = ApplicationMode(
            ApplicationMode.MANUAL,
            callback=self._update_agent_node_with_app_details)
        self._state = ThreadSafeObject(
            ApplicationState.OK,
            callback=self._update_agent_node_with_app_details)
        self._start_stop_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        # set by react(); makes start() keep the login user react() recorded
        self._user_set_in_react = False
        # set by dep_restart(); makes the next start() call _check_mode()
        self._run_check_mode = False
        # used as "service_key" in the alert payload (_get_alert_details)
        self._pd_svc_key = verify_attribute(config,
                                            'pagerduty_service',
                                            none_allowed=True)

        restartmax = verify_attribute(config,
                                      'restartmax',
                                      none_allowed=True,
                                      cast=int,
                                      default=3)
        # shared with the process client as restart_logic (_init_proc_client)
        self._rl = RestartLogic(
            self.name,
            restartmax,
            count_callback=self._update_agent_node_with_app_details)

        self._read_only = False  # recomputed by _determine_read_only

        # must be set before _init_proc_client(), which reads self._paths
        # through _get_graphite_metric_names()
        self._paths = self._init_paths(self.config, settings, application_type)

        # clients
        self.zkclient = KazooClient(hosts=get_zk_conn_string(),
                                    timeout=60.0,
                                    handler=SequentialThreadingHandler(),
                                    logger=logging.getLogger(
                                        'kazoo.app.{0}'.format(self.name)))

        # the client is only created here; it is started in run()
        self.zkclient.add_listener(self._zk_listener)
        self._proc_client = self._init_proc_client(self.config,
                                                   application_type,
                                                   cancel_flag)

        # actions need zkclient and the process client, so they come last;
        # the work manager in turn is built from self._actions
        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue)

    def app_details(self):
        return {
            'name': self.name,
            'host': self._host,
            'platform': self._system,
            'mode': self._mode.value,
            'state': self._state.value,
            'start_stop_time': self._start_stop_time,
            'login_user': self._login_user,
            'read_only': self._read_only,
            'restart_count': self._rl.count
        }

    def run(self):
        """
        - Start the zookeeper client
        - Check for already running instances.
        - Start main loop, periodically checking whether the process has failed.
        """
        try:
            self.zkclient.start()
            # make all action objects start processing predicates
            self._log.info('Starting to process Actions.')
            map(lambda x: x.start(), self._actions.values())  # start actions
            started = all([i.started for i in self._actions.values()])
            if not started:
                self._log.critical('All actions are not started!')
            else:
                self._log.info('All actions started.'.format(started))
            self._check_mode()  # get global mode AFTER starting actions

            while self._running:
                sleep(5)

            self.uninitialize()
        except Exception as ex:
            self._log.critical('There was an exception in the main loop. '
                               'In a bad state. ({0})'.format(ex))

    @catch_exception(NodeExistsError)
    @connected
    def register(self, **kwargs):
        """
        Create this host's ephemeral node in the state tree.

        Nothing is created when the node already exists or when the named
        action (``action_name`` kwarg, default 'register') is not ready.
        On creation a RESOLVE alert is raised, the process client's counters
        are reset and the state becomes STARTED.

        :rtype: int
        """
        action_name = kwargs.get('action_name', 'register')
        state_path = self._paths['zk_state_path']

        if self.zkclient.exists(state_path):
            self._log.info('Already registered (node exists).')
            return 0

        if not self._action_is_ready(action_name):
            self._log.info(
                'Action {0} is not ready. Not registering.'.format(
                    action_name))
            return 0

        self._log.info('Registering %s in state tree.' % self.name)
        self.zkclient.create(state_path, ephemeral=True, makepath=True)

        # resolve any pager duty alarms
        self._create_alert_node(AlertActionType.RESOLVE,
                                AlertReason.RESOLVED)
        # reset restart counters, etc
        self._proc_client.reset_counters()

        self._state.set_value(ApplicationState.STARTED)
        return 0

    @catch_exception(NoNodeError)
    @connected
    def unregister(self, **kwargs):
        """
        Delete this host's node from the state tree, provided the named
        action (``action_name`` kwarg, default 'unregister') is ready.

        :rtype: int
        """
        action_name = kwargs.get('action_name', 'unregister')
        if not self._action_is_ready(action_name):
            return 0

        self._log.info('Un-registering %s from state tree.' % self.name)
        self.zkclient.delete(self._paths['zk_state_path'])
        return 0

    @catch_exception(RuntimeError)
    def uninitialize(self):
        """
        Shut the component down: stop the work manager and every action,
        drop the predicates, then stop and close the Zookeeper client.

        :rtype: int
        """
        self._log.info('Stopping Zookeeper client')
        self._work_manager.stop()
        # explicit loop: map() is lazy on Python 3 and would stop nothing
        for action in list(self._actions.values()):
            action.stop()
        del self._predicates[:]  # make sure we delete old predicates
        self.zkclient.stop()
        self.zkclient.close()
        return 0

    @time_this
    def start(self, **kwargs):
        """
        Start the actual process through the process client.

        Refuses (returning 0) when an APPLICATION was stopped on purpose and
        told to stay down, or when it crashed and restart-on-crash is off.

        :param kwargs: passed from zoom.handlers.control_agent_handlers;
            reads 'reset', 'pause', 'pd_enabled' and 'login_user'
        :return: 0 when refused, otherwise the process client's result
        """
        # Restart from UI: ran_stop=True, stay_down=False
        # Stop from UI: ran_stop=True, stay_down=True
        # Crash: ran_stop=False, stay_down=False
        restart_logic = self._proc_client.restart_logic

        stopped_on_purpose = (
            restart_logic.ran_stop
            and restart_logic.stay_down
            and self._apptype == ApplicationType.APPLICATION)
        if stopped_on_purpose:
            self._log.info('Not starting. App was stopped with Zoom.')
            # set to OK just in case we're staggered
            self._state.set_value(ApplicationState.OK)
            return 0

        if restart_logic.crashed and not self._restart_on_crash:
            self._log.info('Not starting. The application has crashed.')
            self._state.set_value(ApplicationState.NOTIFY)
            return 0

        self._log.debug('Start allowed.')

        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()
        alerting_enabled = kwargs.get('pd_enabled', True)

        self._start_stop_time = self._get_current_time()

        # set login user if not set in react
        if not self._user_set_in_react:
            self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STARTING)

        result = self._proc_client.start()

        if self._run_check_mode:  # Reset to global mode if restart with dep
            self._check_mode()
            self._run_check_mode = False

        succeeded = result == 0 or result == ApplicationStatus.CANCELLED
        if succeeded:
            self._state.set_value(ApplicationState.STARTED)
            return result

        self._state.set_value(ApplicationState.ERROR)
        if alerting_enabled:
            self._create_alert_node(AlertActionType.TRIGGER,
                                    AlertReason.FAILEDTOSTART)
        else:
            self._log.debug('PD is disabled, not sending alert.')

        return result

    @time_this
    def stop(self, **kwargs):
        """
        Stop the actual process through the process client.

        Unless the stop was cancelled, sleeps for the configured
        post-stop delay afterwards. The final state is ERROR for a non-zero,
        non-cancelled result and STOPPED otherwise.

        :param kwargs: Passed from:
            zoom.www.handlers.control_agent_handler.ControlAgentHandler,
            zoom.agent.action.action.Action
        :return: the process client's result
        """
        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        self._start_stop_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STOPPING)

        result = self._proc_client.stop(**kwargs)
        was_cancelled = result == ApplicationStatus.CANCELLED

        if not was_cancelled:
            # give everything time to catch up, not sure why anymore...
            delay = self._post_stop_sleep
            self._log.info(
                'Sleeping for the configured {0}s after stop.'.format(delay))
            sleep(delay)

        # reset this value back to False
        self._user_set_in_react = False

        if was_cancelled or result == 0:
            self._state.set_value(ApplicationState.STOPPED)
        else:
            self._state.set_value(ApplicationState.ERROR)

        return result

    def status(self):
        """
        Log out the status of each configured action.
        :rtype: str
        """
        out = '\n'
        out += '#' * 40 + ' STATUS ' + '#' * 40
        out += '\n{0}'.format(self)
        out += '\n'
        for i in self._actions.values():
            out += '\n{0}'.format(i.status)
        out += '\n'
        out += '#' * 40 + ' STATUS ' + '#' * 40
        out += '\n'

        self._log.info(out)
        return out

    def restart(self, **kwargs):
        """
        Replace whatever is queued with a stop / unregister / start sequence.

        :param kwargs: passed from zoom.handlers.control_agent_handlers;
            forwarded to the 'stop' and 'start' tasks
        :rtype: int
        """
        # if not self._action_is_ready('restart', allow_undefined=True):
        #     self._log.info('Restart action not ready.')
        #     return

        self._log.info('Running Restart. Queuing stop, unregister, start.')
        queue = self._action_queue
        queue.clear()

        sequence = (
            ('stop', {'kwargs': kwargs}),
            ('unregister', {}),
            ('start', {'kwargs': kwargs}),
        )
        for task_name, task_options in sequence:
            queue.append_unique(Task(task_name, **task_options))
        return 0

    def dep_restart(self, **kwargs):
        """
        Queue a 'start_if_ready' task and flag the next start() to re-read
        the global mode.

        :rtype: int
        """
        self._run_check_mode = True  # only used in self.start()
        follow_up = Task('start_if_ready', kwargs=kwargs)
        self._action_queue.append(follow_up)
        return 0

    def start_if_ready(self, **kwargs):
        start_action = self._actions.get('start', None)
        if start_action is not None and start_action.ready:
            start_action.run(**kwargs)
        # if start action doesn't exist, a.k.a. read only
        elif start_action is None:
            self.start(**kwargs)
        else:
            self._action_queue.append(Task('react', kwargs=kwargs))
        return 0

    @time_this
    @connected
    def ignore(self, **kwargs):
        """
        Switch this component to MANUAL mode.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :rtype: int
        """
        mode = self._mode
        mode.set_value(ApplicationMode.MANUAL)
        self._log.info('Mode is now "{0}"'.format(mode))
        return 0

    @time_this
    @connected
    def react(self, **kwargs):
        """
        Switch this component to AUTO mode and record the requesting user.

        :param kwargs: passed from zoom.handlers.control_agent_handlers;
            reads 'login_user' (default 'Zoom')
        :rtype: int
        """
        mode = self._mode
        mode.set_value(ApplicationMode.AUTO)
        self._log.info('Mode is now "{0}"'.format(mode))

        # when react is called through "restart with dependencies" command
        requested_by = kwargs.get('login_user', 'Zoom')
        self._user_set_in_react = True
        self._login_user = requested_by
        return 0

    @time_this
    @connected
    def notify(self, **kwargs):
        """
        Send notification based on arbitrary predicates.

        Reads 'action_name' (default 'notify'), 'pd_enabled' (default True)
        and 'pd_reason' (default AlertReason.CRASHED) from kwargs.

        :return: 1 when the action is missing or not ready, otherwise 0
        :rtype: int
        """
        action_name = kwargs.get('action_name', 'notify')
        alerting_enabled = kwargs.get('pd_enabled', True)
        reason = kwargs.get('pd_reason')
        if reason is None:
            reason = AlertReason.CRASHED

        if self._action_is_ready(action_name):
            self._state.set_value(ApplicationState.NOTIFY)
            if alerting_enabled:
                self._create_alert_node(AlertActionType.TRIGGER, reason)
            else:
                self._log.debug('PD is disabled, not sending alert.')
            return 0

        self._log.info('notify action not defined or not ready.')
        return 1

    @time_this
    @connected
    def ensure_running(self, **kwargs):
        """
        Essentially a clone of `notify`, but tailored for process monitoring.

        Reads 'action_name' (default 'ensure_running'), 'pd_enabled'
        (default True) and 'pd_reason' (default AlertReason.CRASHED) from
        kwargs. The state only moves to NOTIFY (and an alert is only
        triggered) when the process went away without a stop having been
        run.

        :return: 1 when the app is already in ERROR or the action is missing
            or not ready, otherwise 0
        :rtype: int
        """
        # Application failed to start. Already sent PD alert
        if self._state == ApplicationState.ERROR:
            return 1

        action_name = kwargs.get('action_name', 'ensure_running')
        pd_enabled = kwargs.get('pd_enabled', True)
        pd_reason = kwargs.get('pd_reason', None)

        if pd_reason is None:
            pd_reason = AlertReason.CRASHED

        if not self._action_is_ready(action_name):
            # Same non-zero code as notify() and as the ERROR guard above;
            # this path used to return None and log a "notify" message.
            self._log.info(
                '{0} action not defined or not ready.'.format(action_name))
            return 1

        if not self._proc_client.restart_logic.ran_stop:
            # the application has crashed
            self._state.set_value(ApplicationState.NOTIFY)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER, pd_reason)
            else:
                self._log.debug('PD is disabled, not sending alert.')
        else:
            self._log.debug("Service shut down gracefully")

        return 0

    def terminate(self):
        """Terminate child thread/process"""
        self._running = False
        return 0

    def _action_is_ready(self, action_name, allow_undefined=False):
        """
        Check if a configured action's predicates are met
        :type action_name: str
        :type allow_undefined: bool
        :rtype: bool
        """
        action = self._actions.get(action_name, None)
        if allow_undefined:
            if action is None:
                return True

        return action is not None and action.ready

    @catch_exception(NoNodeError)
    @connected
    def _update_agent_node_with_app_details(self, event=None):
        """
        Register app data with the agent in the state tree.

        Creates the state-base node if it is missing (only while running),
        reads its data, flags a config conflict when the stored 'host'
        differs from this host, writes app_details() when the stored data
        differs, and finally either re-arms the watch on the node or, after
        a config conflict, terminates.

        This method is also the callback handed to the mode, state and
        restart-count objects in __init__, and its own ZooKeeper watch.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        if self._running and \
                not self.zkclient.exists(self._paths['zk_state_base']):
            self.zkclient.create(self._paths['zk_state_base'], makepath=True)

        # NOTE(review): if the node is missing and we are not running, this
        # get presumably raises NoNodeError, which the decorator above is
        # expected to absorb -- confirm in catch_exception
        data, stat = self.zkclient.get(self._paths['zk_state_base'])

        try:
            agent_apps = json.loads(data)
        except ValueError:
            # data that is not valid JSON is treated as "nothing stored yet"
            agent_apps = dict()

        # check for config conflict
        other_host = agent_apps.get('host')
        if other_host is not None and self._host != other_host:
            self._log.error(
                'There is a config conflict with {0}. Updates '
                'will no longer be sent until it is resolved.'.format(
                    other_host))
            # run_callback=False presumably keeps set_value from calling
            # back into this method -- confirm in ThreadSafeObject
            self._state.set_value(ApplicationState.CONFIG_ERROR,
                                  run_callback=False)

        # make sure data is the most recent
        # (this write still happens after a conflict was flagged above)
        if self.app_details() != agent_apps:
            self.zkclient.set(self._paths['zk_state_base'],
                              json.dumps(self.app_details()))
            self._log.debug('Registering app data {0}'.format(
                self.app_details()))

        # set watch
        if self._state != ApplicationState.CONFIG_ERROR:
            # re-register this method as the watch on the node
            self.zkclient.get(self._paths['zk_state_base'],
                              watch=self._update_agent_node_with_app_details)
        else:
            self._log.error('Shutting down because of config error.')
            self.terminate()

    def _init_paths(self, config, settings, atype):
        """
        Work out the ZooKeeper paths this component uses.

        The state base comes from the config's 'registrationpath' when set,
        otherwise from <zookeeper.state>/<atype>/<name>.

        :rtype: dict
        """
        zk_settings = settings.get('zookeeper', {})

        default_base = self._pathjoin(
            zk_settings.get('state'), atype, self.name)
        state_base = verify_attribute(config,
                                      'registrationpath',
                                      none_allowed=True,
                                      default=default_base)

        return {
            'zk_state_base': state_base,
            'zk_state_path': self._pathjoin(state_base, self._host),
            'zk_config_path': self._pathjoin(
                zk_settings.get('config'), atype, self.name),
            'zk_agent_path': self._pathjoin(
                zk_settings.get('agent_state'), self._host),
        }

    def _init_proc_client(self, config, atype, cancel_flag):
        """
        Create the process client from the optional command attributes in
        the config ('start_cmd', 'stop_cmd', 'status_cmd', 'script').
        """
        commands = {}
        for attribute in ('start_cmd', 'stop_cmd', 'status_cmd', 'script'):
            commands[attribute] = verify_attribute(config,
                                                   attribute,
                                                   none_allowed=True)

        metric_names = self._get_graphite_metric_names()

        return ProcessClient(name=self.name,
                             apptype=atype,
                             restart_logic=self._rl,
                             graphite_metric_names=metric_names,
                             cancel_flag=cancel_flag,
                             **commands)

    def _init_actions(self, settings):
        """
        Create the actions described by the config and refresh the
        read-only flag from them.

        :rtype: dict
        """
        factory_options = dict(component=self,
                               zkclient=self.zkclient,
                               proc_client=self._proc_client,
                               action_queue=self._action_queue,
                               mode=self._mode,
                               system=self._system,
                               pred_list=self._predicates,
                               app_state=self._state,
                               settings=settings)
        created = ActionFactory(**factory_options).create(self.config)
        self._determine_read_only(created)
        return created

    def _determine_read_only(self, actions):
        # Sentinel config may include either start or restart blocks, if either are disabled show as read-only
        start_action = actions.get('start', None)
        restart_action = actions.get('restart', None)

        # Two special cases - both start and restart and neither
        if start_action and restart_action:
            if start_action.disabled and restart_action.disabled:
                self._read_only = True
            else:
                self._read_only = False
            return

        elif not start_action and not restart_action:
            self._log.warning(
                'Sentinel config contains neither start nor restart predicates, assuming readonly'
            )
            self._read_only = True
            return

        # At this point either start action or restart action must exist
        if not start_action:
            if restart_action.disabled:
                self._read_only = True
            else:
                self._read_only = False

        elif not restart_action:
            if start_action.disabled:
                self._read_only = True
            else:
                self._read_only = False
        else:
            self._log.warning('Unhandled read-only configuration')
            self._read_only = False

    def _init_work_manager(self, queue):
        """
        Build and start the work manager.

        The work it accepts is every configured action's ``run`` plus every
        public callable attribute of this object whose name is not already
        taken by an action.

        :rtype: zoom.agent.entities.work_manager.WorkManager
        """
        acceptable_work = dict()
        # actions have additional logic, so use those if available
        # (.items() rather than .iteritems(), which Python 3 does not have)
        for action_name, action in self._actions.items():
            acceptable_work[action_name] = action.run

        # if action is not available, add public methods
        for attribute in dir(self):
            if attribute.startswith('_'):
                continue
            obj = getattr(self, attribute)
            if not callable(obj):
                continue
            if attribute in acceptable_work:
                self._log.debug(
                    'Method {0} already assigned to action.'.format(
                        attribute))
            else:
                acceptable_work[attribute] = obj

        manager = WorkManager(self.name, queue, acceptable_work)
        manager.start()
        return manager

    @connected
    def _check_mode(self, event=None):
        """
        Read the global run mode from ZooKeeper and apply it, leaving this
        method registered as the watch on the mode node.

        Without a configured global config path only a warning is logged.
        A missing node and any other error are logged, not raised.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        zk_settings = self._settings.get('zookeeper', {})
        global_path = zk_settings.get('global_config')
        if global_path is None:
            self._log.warning('Received no global config path. Zoom will be '
                              'unable to change the global mode.')
            return

        modepath = self._pathjoin(global_path, 'mode')
        try:
            raw, _ = self.zkclient.get(modepath, watch=self._check_mode)
            payload = json.loads(raw)
            self._log.info(
                'Getting mode from Zookeeper from path: {0}'.format(modepath))
            new_mode = str(payload.get(u'mode', ApplicationMode.MANUAL))
            self._mode.set_value(new_mode)
            self._log.info('Setting mode to "{0}"'.format(self._mode))
        except NoNodeError:
            self._log.info(
                'ZK path {0} does not exist. Assuming mode "manual"'.format(
                    modepath))
        except Exception:
            self._log.exception('An uncaught exception has occurred.')

    def _pathjoin(self, *args):
        """
        Join path components: os.path.join on Linux, a plain '/' join on
        Windows.

        NOTE(review): any other platform value yields None -- confirm that
        callers never run with one.

        :rtype: str
        """
        platform = self._system
        if platform == PlatformType.LINUX:
            return os.path.join(*args)
        if platform == PlatformType.WINDOWS:
            return '/'.join(args)
        return None

    def _get_graphite_metric_names(self):
        """
        splits the state path at 'application' and returns the latter index
        :rtype: dict
        """
        names = {"result": None, "runtime": None, "updown": None}

        type_path = self._paths.get('zk_state_base')\
            .split(self._settings.get('zookeeper', {}).get('state') + '/', 1)[1]
        type_metric = type_path.replace('/', '.')

        graphite = self._settings.get('graphite')
        if graphite is not None:
            result_path = str(graphite.get('result'))
            runtime_path = str(graphite.get('runtime'))
            updown_path = str(graphite.get('updown'))

            names["result"] = result_path.format(type_metric)
            names["runtime"] = runtime_path.format(type_metric)
            names["updown"] = updown_path.format(type_metric)

        return names

    def _get_current_time(self):
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _get_alert_details(self, alert_action, reason):
        return {
            "action":
            alert_action,
            "service_key":
            self._pd_svc_key,
            "incident_key":
            self._pathjoin('sentinel', self.name, self._host),
            "description":
            ('Sentinel Error: name={0}, host={1}, issue="{2}".'.format(
                self.name, self._host, reason)),
            "details":
            ('Sentinel Error: name={0}, host={1}, issue="{2}".\n'
             'Review the application log and contact the appropriate'
             ' development group.'.format(self.name, self._host, reason))
        }

    @catch_exception(NoNodeError)
    @connected
    def _create_alert_node(self, alert_action, reason):
        """
        Create Node in ZooKeeper that will result in a PagerDuty alarm.

        The node is only written when the current environment is listed in
        the pagerduty 'enabled_environments' setting; otherwise the path
        that would have been used is just logged. Without a configured
        alert path only a warning is logged.

        :type alert_action: zoom.common.types.AlertActionType
        """
        alert_details = self._get_alert_details(alert_action, reason)
        # path example: /foo/sentinel.bar.baz.HOSTFOO
        alert_root = self._settings.get('zookeeper', {}).get('alert')
        if alert_root is None:
            self._log.warning('Was given no alert path. This sentinel will be '
                              'unable to forward alerts to Zoom.')
            return

        node_name = re.sub('/', '.', alert_details['incident_key'])
        alert_path = self._pathjoin(alert_root, node_name)

        enabled_envs = self._settings.get('pagerduty', {}).get(
            'enabled_environments', [])
        if self._env not in enabled_envs:
            self._log.info('Not creating alert "{0}" node for env: {1}'.format(
                alert_action, self._env))
            self._log.info('Would have created path {0}'.format(alert_path))
            return

        self._log.info('Creating alert "{0}" node for env: {1}'.format(
            alert_action, self._env))

        node_exists = self.zkclient.exists(alert_path)
        payload = json.dumps(alert_details)
        if node_exists:
            self.zkclient.set(alert_path, value=payload)
        else:
            self.zkclient.create(alert_path, value=payload)

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    def _reset_after_connection_loss(self):
        """
        Recreate all actions and predicates after connection loss, then
        recheck the mode.

        Does nothing but log when the component is no longer running.
        """
        if not self._running:
            self._log.info('The daemon has called for termination. '
                           'Not trying to reset after connection loss.')
            return

        self._log.info('Application listener callback triggered')
        # explicit loops throughout: map() is lazy on Python 3 and would
        # silently skip the stop/reset/start calls
        for action in list(self._actions.values()):
            action.stop()
        self._actions.clear()
        # fresh list: _init_actions hands it to the factory as pred_list
        self._predicates = []
        self._actions = self._init_actions(self._settings)
        for predicate in self._predicates:
            predicate.reset()
        for action in list(self._actions.values()):
            action.start()
        self._check_mode()
        self._log.info('Application listener callback complete!')

    def _zk_listener(self, state):
        """
        Connection-state callback registered on the Zookeeper client.

        A transition to CONNECTED from any known previous state spawns
        ``_reset_after_connection_loss`` on the client's handler; the very
        first CONNECTED and the CONNECTED->SUSPENDED, CONNECTED->LOST and
        SUSPENDED->LOST transitions are only recorded. Any other transition
        is logged and the remembered state is left untouched.
        """
        try:
            previous = self._prev_state
            self._log.info('Zookeeper Connection went from {0} to {1}'.format(
                previous, state))

            if state == KazooState.CONNECTED:
                # first connect needs no reset; every reconnect does
                if previous is not None:
                    self.zkclient.handler.spawn(
                        self._reset_after_connection_loss)
            elif previous == KazooState.CONNECTED and (
                    state == KazooState.SUSPENDED
                    or state == KazooState.LOST):
                pass
            elif previous == KazooState.SUSPENDED and \
                    state == KazooState.LOST:
                pass
            else:
                self._log.info(
                    'Zookeeper Connection in unknown state: {0}'.format(state))
                return

            self._prev_state = state

        except Exception as ex:
            self._log.exception('An uncaught exception has occurred in the '
                                'listener: {0}'.format(ex))

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return ("{0}(name={1}, runmode={2})".format(self.__class__.__name__,
                                                    self.name, self._mode))