Beispiel #1
0
    def _walk(self, path, result):
        """
        Recursively descend from `path` and record the state of every leaf
        node in `result`, keyed by the state's configuration path. A watch
        (`self._on_update`) is registered on each node's children.

        A NoNodeError raised while handling `path` records that path as
        deleted; any other exception is logged and swallowed.

        :type path: str
        :type result: zoom.www.messages.application_states.ApplicationStatesMessage
        """
        try:
            child_names = self._zoo_keeper.get_children(
                path, watch=self._on_update)

            if not child_names:
                # leaf node: capture its application state
                leaf_state = self._get_application_state(path)
                result.update({
                    leaf_state.configuration_path: leaf_state.to_dictionary()
                })
            else:
                for name in child_names:
                    self._walk(zk_path_join(path, name), result)

        except NoNodeError:
            # publish a "delete" marker for the missing path
            removed = ApplicationState(configuration_path=path, delete=True)
            result.update({path: removed.to_dictionary()})

        except Exception:
            logging.exception('An unhandled Exception has occurred while '
                              'running ApplicationStateCache.walk.')
    def post(self, server):
        """
        @api {post} /api/v1/config/:host Create sentinel config
        @apiParam {String} XML A string containing the XML of the Sentinel Config
        @apiVersion 1.0.0
        @apiName CreateSentinel
        @apiGroup Sentinel Config
        """
        logging.info('Adding server {0} for client {1}'
                     .format(server, self.request.remote_ip))
        path = zk_path_join(self.agent_configuration_path, server)

        # Only create the node when it is not already present; an existing
        # config is left untouched.
        if self.zk.exists(path):
            output = 'Node {0} already exists'.format(server)
            # Tell the client too, not just the log: every other outcome of
            # this handler writes its message to the response.
            self.write(output)
            logging.info(output)
            return

        # the XML payload arrives as the "XML" request argument
        data = self.get_argument("XML")
        logging.info('Received XML configuration for {0}'.format(server))
        try:
            self.zk.create(path, bytes(data))
            self.write('Node successfully added.')
            logging.info('Added {0}'.format(server))
        except NoNodeError:
            # create() is not asked to build missing parents here
            output = 'Parent nodes are missing for {0}'.format(path)
            self.write(output)
            logging.info(output)
    def put(self, server):
        """
        @api {put} /api/v1/config/:host Create|Update sentinel config
        @apiVersion 1.0.0
        @apiName UpdateSentinel
        @apiGroup Sentinel Config
        """
        logging.info('Updating server {0} for client {1}'
                     .format(server, self.request.remote_ip))
        zk_path = zk_path_join(self.agent_configuration_path, server)

        try:
            # the request body is a JSON dictionary carrying the config under
            # the "XML" key
            request = json.loads(self.request.body)
            data = request.get("XML")
            logging.info('Received XML configuration for {0}'.format(server))

            # PUT doubles as create: make sure the node exists before set().
            # NOTE(review): the node is created before validation, so an
            # invalid config leaves an empty node behind - confirm intended.
            if not self.zk.exists(zk_path):
                self.zk.create(zk_path)

            if not self._is_valid(str(data), server):
                logging.warning('Not updating invalid config for server: {0}'
                                .format(server))
            else:
                self.zk.set(zk_path, str(data))
                self.write('Node successfully updated.')
                logging.info('Updated server {0}'.format(server))

        except NoNodeError:
            # the path (or one of its parents) is missing in ZooKeeper
            output = 'Node {0} does not exist.'.format(zk_path)
            logging.exception(output)
            self.write(output)
Beispiel #4
0
    def post(self, server):
        """
        @api {post} /api/v1/config/:host Create sentinel config
        @apiParam {String} XML A string containing the XML of the Sentinel Config
        @apiVersion 1.0.0
        @apiName CreateSentinel
        @apiGroup Sentinel Config
        """
        # normalise the hostname before it is used in the node path
        server = cap_hostname(server)
        logging.info('Adding server {0} for client {1}'.format(
            server, self.request.remote_ip))
        path = zk_path_join(self.agent_configuration_path, server)

        # Only create the node when it is not already present; an existing
        # config is left untouched.
        if self.zk.exists(path):
            output = 'Node {0} already exists'.format(server)
            # Tell the client too, not just the log: every other outcome of
            # this handler writes its message to the response.
            self.write(output)
            logging.info(output)
            return

        # the XML payload arrives as the "XML" request argument
        data = self.get_argument("XML")
        logging.info('Received XML configuration for {0}'.format(server))
        try:
            self.zk.create(path, bytes(data))
            self.write('Node successfully added.')
            logging.info('Added {0}'.format(server))
        except NoNodeError:
            # create() is not asked to build missing parents here
            output = 'Parent nodes are missing for {0}'.format(path)
            self.write(output)
            logging.info(output)
    def _walk(self, path, result):
        """
        Depth-first walk below `path`. Each leaf contributes its application
        state to `result`; a NoNodeError contributes a "delete" entry for
        `path` instead. Other exceptions are logged and not propagated.

        :type path: str
        :type result: zoom.www.messages.application_states.ApplicationStatesMessage
        """
        try:
            kids = self._zoo_keeper.get_children(path, watch=self._on_update)

            if not kids:
                # no children: this is a leaf, record its state
                state = self._get_application_state(path)
                entry = {state.configuration_path: state.to_dictionary()}
                result.update(entry)
            else:
                for kid in kids:
                    self._walk(zk_path_join(path, kid), result)

        except NoNodeError:
            gone = ApplicationState(configuration_path=path, delete=True)
            result.update({path: gone.to_dictionary()})

        except Exception:
            logging.exception('An unhandled Exception has occurred while '
                              'running ApplicationStateCache.walk.')
    def _double_check_config(self, server, id_to_find=None, reg_to_find=None):
        """
        Verify against the stored agent config that a component really
        belongs to `server`, since the ApplicationStateCache may hold a stale
        host value.

        Returns True when any `Component` element in the server's XML config
        has an `id` equal to `id_to_find` or a `registrationpath` equal to
        `reg_to_find`; False when nothing matches or the config node is
        missing.

        :type server: str
        :type id_to_find: str or None
        :type reg_to_find: str or None
        :rtype: bool
        """
        path = zk_path_join(self.agent_configuration_path, server)
        if not self.zk.exists(path):
            return False

        xmlstr, _ = self.zk.get(path)
        config = ElementTree.fromstring(xmlstr)

        for component in config.iter('Component'):
            id_hit = id_to_find and id_to_find == component.get('id')
            reg_hit = (reg_to_find and
                       reg_to_find == component.get('registrationpath'))
            if id_hit or reg_hit:
                return True

        return False
Beispiel #7
0
    def put(self, server):
        """
        @api {put} /api/v1/config/:host Create|Update sentinel config
        @apiVersion 1.0.0
        @apiName UpdateSentinel
        @apiGroup Sentinel Config
        """
        # normalise the hostname before it is used in the node path
        server = cap_hostname(server)
        logging.info('Updating server {0} for client {1}'.format(
            server, self.request.remote_ip))
        zk_path = zk_path_join(self.agent_configuration_path, server)

        try:
            # the request body is a JSON dictionary carrying the config under
            # the "XML" key
            request = json.loads(self.request.body)
            data = request.get("XML")
            logging.info('Received XML configuration for {0}'.format(server))

            # PUT doubles as create: make sure the node exists before set().
            # NOTE(review): the node is created before validation, so an
            # invalid config leaves an empty node behind - confirm intended.
            if not self.zk.exists(zk_path):
                self.zk.create(zk_path)

            if not self._is_valid(str(data), server):
                logging.warning(
                    'Not updating invalid config for server: {0}'.format(
                        server))
            else:
                self.zk.set(zk_path, str(data))
                self.write('Node successfully updated.')
                logging.info('Updated server {0}'.format(server))

        except NoNodeError:
            # the path (or one of its parents) is missing in ZooKeeper
            output = 'Node {0} does not exist.'.format(zk_path)
            logging.exception(output)
            self.write(output)
Beispiel #8
0
    def _submit_task(self, task):
        """
        Ensure a task node exists in ZooKeeper for the task's host.

        When the node is already there, the update handler is invoked with a
        synthetic event for that path. Otherwise the node is created with the
        task's JSON and a data watch is set on it. NoNodeError is ignored.

        :type task: zoom.agent.task.task.Task
        """
        try:
            node = zk_path_join(self._configuration.task_path, task.host)

            if not self._zoo_keeper.exists(node):
                logging.info("Creating task node for path {0}: {1}".format(
                    node, task))
                try:
                    self._zoo_keeper.create(node, value=task.to_json())
                except NodeExistsError:
                    # node appeared after the exists() check; watch it anyway
                    pass

                self._zoo_keeper.get(node, watch=self._on_update)
                return

            # node already present: let the update handler check whether the
            # existing task is done
            self._on_update(WatchedEvent(None, None, node))

        except NoNodeError:
            pass
Beispiel #9
0
    def _submit_task(self, task):
        """
        Submit `task` to ZooKeeper under the task path for its host.

        An existing node triggers `self._on_update` with a synthetic event;
        a missing node is created from `task.to_json()` and then watched.
        NoNodeError raised anywhere in the process is swallowed.

        :type task: zoom.agent.task.task.Task
        """
        try:
            target = zk_path_join(self._configuration.task_path, task.host)
            already_there = self._zoo_keeper.exists(target)

            if already_there:
                # hand the existing node to the update handler so it can
                # check whether that task is done
                synthetic = WatchedEvent(None, None, target)
                self._on_update(synthetic)
            else:
                message = "Creating task node for path {0}: {1}"
                logging.info(message.format(target, task))
                try:
                    self._zoo_keeper.create(target, value=task.to_json())
                except NodeExistsError:
                    # created in the meantime; fall through to the watch
                    pass

                self._zoo_keeper.get(target, watch=self._on_update)

        except NoNodeError:
            pass
Beispiel #10
0
    def _double_check_config(self, server, id_to_find=None, reg_to_find=None):
        """
        Guard against a stale host value in the ApplicationStateCache by
        consulting the server's actual config.

        The config XML stored for `server` is scanned for `Component`
        elements; a match on `id` (against `id_to_find`) or on
        `registrationpath` (against `reg_to_find`) yields True. A missing
        config node or no match yields False.

        :type server: str
        :type id_to_find: str or None
        :type reg_to_find: str or None
        :rtype: bool
        """
        path = zk_path_join(self.agent_configuration_path, server)
        if not self.zk.exists(path):
            return False

        xmlstr, _stat = self.zk.get(path)
        root = ElementTree.fromstring(xmlstr)

        for comp in root.iter('Component'):
            if id_to_find and comp.get('id') == id_to_find:
                return True
            if reg_to_find and comp.get('registrationpath') == reg_to_find:
                return True

        return False
Beispiel #11
0
 def clear_all_tasks(self):
     """
     Empty the local task queue, then delete every child node found under
     the configured task path in ZooKeeper.
     """
     self._task_queue.clear()
     task_root = self._configuration.task_path
     for name in self._zoo_keeper.get_children(task_root):
         stale = zk_path_join(task_root, name)
         logging.info('Deleting stale task node {0}'.format(stale))
         self._zoo_keeper.delete(stale)
Beispiel #12
0
 def clear_all_tasks(self):
     """
     Drop all queued tasks: clear the in-memory queue and remove each task
     node under the configured task path from ZooKeeper.
     """
     self._task_queue.clear()
     root = self._configuration.task_path
     node_names = self._zoo_keeper.get_children(root)
     for node_name in node_names:
         node_path = zk_path_join(root, node_name)
         logging.info('Deleting stale task node {0}'.format(node_path))
         self._zoo_keeper.delete(node_path)
Beispiel #13
0
    def live_tasks(self):
        """
        Return Tasks submitted to sentinel agents.

        Each child node of the configured task path becomes one entry: the
        child name maps to the JSON-decoded data stored in that node.

        :rtype: dict
        """
        task_root = self._configuration.task_path
        live = {}
        for host in self._zoo_keeper.get_children(task_root):
            payload, _ = self._zoo_keeper.get(zk_path_join(task_root, host))
            live[host] = json.loads(payload)
        return live
Beispiel #14
0
    def live_tasks(self):
        """
        Return Tasks submitted to sentinel agents, as a mapping from each
        child node name under the task path to that node's decoded JSON data.

        :rtype: dict
        """
        root = self._configuration.task_path
        names = self._zoo_keeper.get_children(root)
        # [0] picks the data out of the (data, stat) pair returned by get()
        return dict(
            (name, json.loads(self._zoo_keeper.get(zk_path_join(root, name))[0]))
            for name in names
        )
    def _walk(self, path, result):
        """
        Recursively visit the tree below `path`, watching each node's
        children with `self._on_update`, and load the dependencies of every
        leaf into `result`. A node that no longer exists is only logged.

        :type path: str
        :type result: ApplicationDependenciesMessage
        """
        try:
            child_names = self._zoo_keeper.get_children(
                path, watch=self._on_update)

            if not child_names:
                # leaf: parse its dependencies into the result
                self._get_application_dependency(path, result)
                return

            for name in child_names:
                self._walk(zk_path_join(path, name), result)
        except NoNodeError:
            logging.debug('Node at {0} no longer exists.'.format(path))
    def _get_application_dependency(self, path, result):
        """
        Load result object with application dependencies.

        Reads the XML stored at `path` (setting a data watch with
        `self._on_update`) and, for every `Automation/Component` element,
        adds an entry to `result` keyed by the component's registration path.
        A component without a `registrationpath` attribute is keyed by its
        `id` joined onto the configured application state path. A component
        without a start action gets an empty dependency list.

        Parsing errors are logged and swallowed; a missing node or empty
        data leaves `result` untouched.

        :type path: str
        :type result: ApplicationDependenciesMessage
        """
        if not self._zoo_keeper.exists(path):
            logging.warning("config path does not exist: {0}".format(path))
            return

        data, stat = self._zoo_keeper.get(path, watch=self._on_update)
        if not data:
            return

        try:
            root = ElementTree.fromstring(data)

            for node in root.findall('Automation/Component'):
                app_id = node.attrib.get('id')
                registrationpath = node.attrib.get('registrationpath')

                if registrationpath is None:
                    # fall back to the default location derived from the id
                    registrationpath = zk_path_join(
                        self._configuration.application_state_path, app_id)

                start_action = node.find('Actions/Action[@id="start"]')

                if start_action is None:
                    # logging.warn is a deprecated alias of logging.warning
                    logging.warning("No Start Action Found for {0}".format(
                        registrationpath))
                    dependencies = list()
                else:
                    dependencies = self._parse_dependencies(start_action)

                # separate name so the raw XML in `data` is not shadowed
                entry = {
                    "configuration_path": registrationpath,
                    "dependencies": dependencies,
                    "downstream": list()
                }

                result.update({registrationpath: entry})

        except Exception:
            logging.exception('An unhandled exception occurred')
Beispiel #17
0
 def _walk(self, node, node_list):
     """
     Recursively walk a ZooKeeper path, watching each node's children with
     `self._rewalk_tree`, and append matching leaf paths to `node_list`.

     For a non-ephemeral leaf the leaf path itself is matched against
     `self.nodepattern`; for an ephemeral leaf its parent path is matched
     (and appended) instead.

     :type node: str
     :param node_list: collection the matching paths are appended to
     """
     children = self.zkclient.get_children(node, watch=self._rewalk_tree)
     if children:
         for child in children:
             self._walk(zk_path_join(node, child), node_list)
         return

     _, stat = self.zkclient.get(node)
     # ephemeralOwner == 0 means the node is not ephemeral
     if stat.ephemeralOwner == 0:
         candidate = node
     else:
         candidate = os.path.dirname(node)

     if fnmatch.fnmatch(candidate, self.nodepattern):
         node_list.append(candidate)
Beispiel #18
0
 def _walk(self, node, node_list):
     """
     Depth-first walk of the ZooKeeper tree below `node`. Every node's
     children are watched with `self._rewalk_tree`. Each leaf adds one path
     to `node_list` if that path matches `self.nodepattern`: the leaf's own
     path when it is not ephemeral, its parent's path when it is.

     :type node: str
     :param node_list: collection the matching paths are appended to
     """
     kids = self.zkclient.get_children(node, watch=self._rewalk_tree)
     if not kids:
         data, stat = self.zkclient.get(node)
         is_persistent = stat.ephemeralOwner == 0
         target = node if is_persistent else os.path.dirname(node)
         if fnmatch.fnmatch(target, self.nodepattern):
             node_list.append(target)
     else:
         for kid in kids:
             self._walk(zk_path_join(node, kid), node_list)
    def delete(self, server):
        """
        @api {delete} /api/v1/config/:host Delete sentinel config
        @apiVersion 1.0.0
        @apiName DeleteSentinel
        @apiGroup Sentinel Config
        """
        # the format string needs a {1} placeholder, otherwise the client
        # address passed to format() is silently dropped
        logging.info('Deleting server {0} for client {1}'
                     .format(server, self.request.remote_ip))
        path = zk_path_join(self.agent_configuration_path, server)

        # NOTE(review): the intent was described as a recursive delete of the
        # server and its children, but delete() is called without any
        # recursive option - confirm whether children must be removed too.
        try:
            self.zk.delete(path)
            self.write('Node successfully deleted.')
            logging.info('Deleted {0}'.format(server))
        except NoNodeError:
            output = 'Node {0} does not exist.'.format(path)
            logging.error(output)
            self.write(output)
Beispiel #20
0
    def delete(self, server):
        """
        @api {delete} /api/v1/config/:host Delete sentinel config
        @apiVersion 1.0.0
        @apiName DeleteSentinel
        @apiGroup Sentinel Config
        """
        # normalise the hostname before it is used in the node path
        server = cap_hostname(server)
        # the format string needs a {1} placeholder, otherwise the client
        # address passed to format() is silently dropped
        logging.info('Deleting server {0} for client {1}'.format(
            server, self.request.remote_ip))
        path = zk_path_join(self.agent_configuration_path, server)

        # NOTE(review): the intent was described as a recursive delete of the
        # server and its children, but delete() is called without any
        # recursive option - confirm whether children must be removed too.
        try:
            self.zk.delete(path)
            self.write('Node successfully deleted.')
            logging.info('Deleted {0}'.format(server))
        except NoNodeError:
            output = 'Node {0} does not exist.'.format(path)
            logging.error(output)
            self.write(output)
Beispiel #21
0
    def _has_exception(self, key):
        """
        Check the override node to see if pagerduty alerts should be disabled.

        Returns the `pd_disabled` value stored in the override node for the
        application derived from `key`, or False when there is no such entry,
        the override node is missing, or its data is not valid JSON.

        :type key: str
        :rtype: bool
        """
        # TODO: change the key or add a different field so that we don't have to
        # do that messy construction below...
        try:
            # drop the first and the last '/'-separated segment of the key
            segments = key.split('/')
            app_id = '/'.join(segments[1:-1])
            app_state_path = zk_path_join(self._state_path, app_id)

            raw, _ = self._zk.get(self._override_path)
            overrides = json.loads(raw)
            app_overrides = overrides.get(app_state_path, {})
            return app_overrides.get('pd_disabled', False)

        except ValueError:
            logging.error('Node {0} has malformed JSON.'
                          .format(self._override_path))
            return False
        except NoNodeError:
            return False
    def get(self, server):
        """
        @api {get} /api/v1/config/:host Get sentinel config for server
        @apiVersion 1.0.0
        @apiName GetSentinel
        @apiGroup Sentinel Config
        """
        logging.info('Searching for server {0}'.format(server))
        path = zk_path_join(self.agent_configuration_path, server)

        if not self.zk.exists(path):
            output = 'Node does not exist.'
            logging.error(output)
            self.write(output)
            return

        # zk.get yields a (value, ZnodeStat) pair; only the value is returned
        config_data, _ = self.zk.get(path)
        logging.info(
            'Found server {0}. Outputting XML configuration.'.format(server))

        # the stored config is sent as a JSON-encoded string
        self.set_header('Content-Type', 'application/json')
        self.write(json.dumps(config_data))
Beispiel #23
0
    def add_task(self, task, is_cancel=False):
        """
        Queue `task` for its host and submit it to ZooKeeper.

        A UniqueQueue is created for the host on first use. When `is_cancel`
        is set, the host's queue is emptied and its task node deleted first,
        so the cancel task is the only one queued and submitted.

        :type task: zoom.agent.task.task.Task
        :type is_cancel: bool
        """
        queues = self._task_queue
        if task.host not in queues:
            queues[task.host] = UniqueQueue()

        pending = queues.get(task.host)

        if is_cancel:
            pending.clear()
            cancel_path = zk_path_join(self._configuration.task_path,
                                       task.host)
            try:
                self._zoo_keeper.delete(cancel_path)
            except NoNodeError:
                # no task node to remove for this host
                pass

        pending.append_unique(task, sender=task.host)
        self._submit_task(task)
Beispiel #24
0
    def add_task(self, task, is_cancel=False):
        """
        Add Task to the host's UniqueQueue and submit its node to ZooKeeper.

        If `is_cancel`, the host's queue is cleared and the existing task
        node (if any) is deleted before the cancel task is queued.

        :type task: zoom.agent.task.task.Task
        :type is_cancel: bool
        """
        if task.host not in self._task_queue:
            # first task for this host: give it its own queue
            self._task_queue[task.host] = UniqueQueue()

        host_queue = self._task_queue.get(task.host)

        if is_cancel:
            host_queue.clear()
            node = zk_path_join(self._configuration.task_path, task.host)
            try:
                self._zoo_keeper.delete(node)
            except NoNodeError:
                # nothing was submitted for this host
                pass

        host_queue.append_unique(task, sender=task.host)
        self._submit_task(task)
Beispiel #25
0
    def get(self, server):
        """
        @api {get} /api/v1/config/:host Get sentinel config for server
        @apiVersion 1.0.0
        @apiName GetSentinel
        @apiGroup Sentinel Config
        """
        server = cap_hostname(server)
        logging.info('Searching for server {0}'.format(server))
        path = zk_path_join(self.agent_configuration_path, server)

        if not self.zk.exists(path):
            output = 'Node does not exist.'
            logging.error(output)
            self.write(output)
            return

        # zk.get yields a (value, ZnodeStat) pair; only the value is returned
        stored, _stat = self.zk.get(path)
        found_message = 'Found server {0}. Outputting XML configuration.'
        logging.info(found_message.format(server))

        # the stored config is sent as a JSON-encoded string
        self.set_header('Content-Type', 'application/json')
        self.write(json.dumps(stored))
    def __init__(self, zookeeper, **kwargs):
        """
        Load the Zoom configuration stored in ZooKeeper at ZOOM_CONFIG and
        expose its settings as private attributes.

        Also creates a local "logs" directory if it is missing and configures
        logging from the "logging" section of the config.

        :type zookeeper: zoom.www.entities.zoo_keeper.ZooKeeper
        :param kwargs: kept in self._settings (NOTE(review): presumably
            consulted by self._get_setting as overrides - confirm)
        :raises ValueError: if the data at ZOOM_CONFIG is not valid JSON
        :raises Exception: any other failure is logged and re-raised
        """
        self._zookeeper = zookeeper
        self._settings = kwargs
        try:
            data, stat = self._zookeeper.get(ZOOM_CONFIG)
            config = json.loads(data)

            # create 'logs' directory if it does not exist
            if not os.path.exists("logs"):
                os.makedirs("logs")

            # initialize logging
            logging_config = config.get('logging')
            logging.config.dictConfig(logging_config)

            # get system type
            running_os = self._get_system()

            self._host = socket.gethostname()
            # web_server
            web_server_settings = config.get('web_server')
            self._port = self._get_setting('port',
                                           web_server_settings.get('port'))
            self._is_debug = web_server_settings.get('debug')

            # client and doc directories are siblings of the working directory
            parent_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)
            self._application_path = os.getcwd()
            self._client_path = zk_path_join(parent_dir, 'client')
            self._doc_path = zk_path_join(parent_dir, "doc")
            self._html_path = zk_path_join(self._client_path, "views")
            self._images_path = zk_path_join(self._client_path, "images")
            self._pid = os.getpid()
            self._environment = self._get_setting(
                'environment', os.environ.get('EnvironmentToUse', 'Staging'))

            # zookeeper
            zookeeper_settings = config.get('zookeeper')
            self._agent_configuration_path = zookeeper_settings.get(
                'agent_configuration_path')
            self._agent_state_path = zookeeper_settings.get('agent_state_path')
            self._task_path = zookeeper_settings.get('task_path')
            self._application_state_path = zookeeper_settings.get(
                'application_state_path')
            self._global_mode_path = zookeeper_settings.get('global_mode_path')
            self._pillar_path = zookeeper_settings.get('pillar_path')
            self._alert_path = zookeeper_settings.get('alert_path')
            self._override_node = zookeeper_settings.get(
                'override_node', '/spot/software/config/override')
            self._zookeeper_host = get_zk_conn_string(self._environment)

            # pagerduty
            pagerduty_settings = config.get('pagerduty')
            self._pagerduty_default_svc_key = pagerduty_settings.get(
                'pagerduty_default_svc_key')
            self._pagerduty_api_token = pagerduty_settings.get(
                'pagerduty_api_token')
            self._pagerduty_subdomain = pagerduty_settings.get(
                'pagerduty_subdomain')
            self._pagerduty_enabled_environments = pagerduty_settings.get(
                'pagerduty_enabled_environments')
            self._pagerduty_alert_footer = pagerduty_settings.get(
                'pagerduty_footer', '')

            # database
            db_settings = config.get('database')
            self._db_type = db_settings.get('db_type')
            if running_os == PlatformType.WINDOWS:
                self._sql_connection = db_settings.get('sql_connection_windows')
            elif running_os == PlatformType.LINUX:
                self._sql_connection = db_settings.get('sql_connection')
            else:
                # keep the attribute defined on any other platform instead of
                # failing later with an AttributeError
                self._sql_connection = None

            # authentication
            ad_settings = config.get('active_directory')
            self._ldap_server = ad_settings.get('host')
            self._ldap_port = ad_settings.get('port')

            # environment specific
            env_settings = config.get(self._environment.lower())
            self._read_write_groups = env_settings.get('read_write_groups')
            self._graphite_host = env_settings.get('graphite_host')
            self._graphite_recheck = env_settings.get('graphite_recheck', '5m')

            # chatops
            chatops_settings = env_settings.get('chatops', {})
            self._chatops_url = chatops_settings.get('url')
            self._chatops_group = chatops_settings.get('group')
            self._chatops_commands_to_chat = chatops_settings.get(
                'commands_to_chat')

            # message throttling
            throttle_settings = config.get('message_throttle')
            self._throttle_interval = throttle_settings.get('interval')

            # salt
            self._salt_settings = env_settings.get('saltREST')

        except ValueError:
            logging.error('Data at {0} is not valid JSON.'.format(ZOOM_CONFIG))
            # bare raise keeps the original traceback intact
            raise
        except Exception:
            logging.exception('An unhandled exception occurred.')
            raise
Beispiel #27
0
    def _get_application_state(self, path):
        """
        Build the ApplicationState for the node at `path`.

        A persistent node (ephemeralOwner == 0) is reported with status
        STOPPED, using the details stored at `path` itself. An ephemeral node
        is reported with status RUNNING: its basename is taken as the host
        and the application details are read from its parent path.

        :type path: str
        :rtype: zoom.entities.application_state.ApplicationState
        """
        data, stat = self._get_app_details(path)

        # persistent node
        if stat.ephemeralOwner == 0:
            # watch node to see if children are created
            self._zoo_keeper.get_children(path, watch=self._on_update)
            host = data.get('host', 'Unknown')
            name = data.get('name', os.path.basename(path))
            agent_path = zk_path_join(self._configuration.agent_state_path,
                                      host)

            # if the agent is down, update state and mode with unknown
            # (the exists() call also registers a watch on the agent node)
            agent_up = bool(
                self._zoo_keeper.exists(agent_path,
                                        watch=self._on_agent_state_update))

            # Each failing check below overrides state/mode in `data` and
            # marks the registration as not valid.
            valid = True
            if host in (None, 'Unknown'):
                # no usable host recorded for this application
                data['state'] = 'invalid'
                data['mode'] = 'unknown'
                valid = False
            elif not agent_up:
                # agent node is absent
                data['state'] = 'unknown'
                data['mode'] = 'unknown'
                valid = False
            elif agent_up:
                # agent node exists: the app must be listed among the
                # components stored in the agent node's JSON
                d, s = self._zoo_keeper.get(agent_path)
                registered_comps = json.loads(d).get('components', [])
                if name not in registered_comps:
                    data['state'] = 'invalid'
                    data['mode'] = 'unknown'
                    valid = False

            # NOTE(review): presumably records per-host validity of this
            # path; _update_mapping is not visible here - confirm.
            self._update_mapping(host, {path: valid})

            application_state = ApplicationState(
                application_name=name,
                configuration_path=path,
                application_status=ApplicationStatus.STOPPED,
                application_host=host,
                last_update=stat.last_modified,
                start_stop_time=data.get('start_stop_time', ''),
                error_state=data.get('state', 'unknown'),
                local_mode=data.get('mode', 'unknown'),
                login_user=data.get('login_user', 'Zoom'),
                read_only=data.get('read_only', False),
                last_command=self._get_last_command(data),
                pd_disabled=self._get_existing_attribute(path, 'pd_disabled'),
                grayed=self._get_existing_attribute(path, 'grayed'),
                platform=data.get('platform', 'unknown'),
                restart_count=data.get('restart_count', 0),
                load_times=self._time_estimate_cache.get_graphite_data(path))

        # ephemeral node
        else:
            # watch node to see if it goes away
            self._zoo_keeper.get_children(os.path.dirname(path),
                                          watch=self._on_update)

            host = os.path.basename(path)
            # if it is running, path = /app/path/HOSTNAME
            # need to convert to /app/path to get the app_details
            config_path = os.path.dirname(path)
            parent_data, parent_stat = self._get_app_details(config_path)

            # a running application is always mapped as valid for its host
            self._update_mapping(host, {config_path: True})

            # all details come from the parent node; only last_update uses
            # the stat of the ephemeral node itself
            application_state = ApplicationState(
                application_name=parent_data.get(
                    'name', os.path.basename(config_path)),
                configuration_path=config_path,
                application_status=ApplicationStatus.RUNNING,
                application_host=host,
                last_update=stat.last_modified,
                start_stop_time=parent_data.get('start_stop_time', ''),
                error_state=parent_data.get('state', 'unknown'),
                local_mode=parent_data.get('mode', 'unknown'),
                login_user=parent_data.get('login_user', 'Zoom'),
                read_only=parent_data.get('read_only', False),
                last_command=self._get_last_command(parent_data),
                pd_disabled=self._get_existing_attribute(
                    config_path, 'pd_disabled'),
                grayed=self._get_existing_attribute(config_path, 'grayed'),
                platform=parent_data.get('platform', 'unknown'),
                restart_count=parent_data.get('restart_count', 0),
                load_times=self._time_estimate_cache.get_graphite_data(
                    config_path))

        return application_state
Beispiel #28
0
 def _assemble_path(self, minion):
     """
     Return the path obtained by joining `minion` onto `self.pillar_path`.
     """
     minion_path = zk_path_join(self.pillar_path, minion)
     return minion_path
Beispiel #29
0
    def _handle_alerts(self, event=None):
        """
        Process every alert node under the watched path.

        Re-registers itself as the child watch on `self._path`. For each
        alert, a PagerDuty trigger or resolve call is started on a daemon
        thread according to the alert's `action`, and the alert node is then
        deleted. A trigger is skipped, but its node is still deleted, when
        `_has_exception` reports an override for its incident key. Alerts
        with an unknown action are left in place.

        :param event: not used by the body; present so the method can be
            registered as a watch callback
        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        # TODO: sort by ctime? Could there be a race condition here?
        self._clean_up_threads()

        try:
            alert_names = self._zk.get_children(self._path,
                                                watch=self._handle_alerts)
        except (SessionExpiredError, ConnectionClosedError):
            logging.info('Session with ZK has expired. Will not process alerts '
                         'until reconnect.')
            return

        for alert_name in alert_names:
            path = zk_path_join(self._path, alert_name)
            try:
                raw, _ = self._zk.get(path)
                alert_data = json.loads(raw)

                action = alert_data.get('action')
                i_key = alert_data.get('incident_key')

                # thread to start for this alert, if any
                worker = None

                if action == AlertActionType.TRIGGER:
                    if self._has_exception(i_key):
                        logging.info('Ignoring alert for {0}'.format(i_key))
                    else:
                        trigger_args = (alert_data.get('service_key'),
                                        i_key,
                                        alert_data.get('description'),
                                        alert_data.get('details'))
                        worker = Thread(target=self._pd.trigger,
                                        name='pd_{0}'.format(i_key),
                                        args=trigger_args)

                elif action == AlertActionType.RESOLVE:
                    resolve_args = (alert_data.get('service_key'), i_key)
                    worker = Thread(target=self._pd.resolve,
                                    name='pd_{0}'.format(i_key),
                                    args=resolve_args)
                else:
                    logging.warning('Unknown action type: {0}'.format(action))
                    continue

                if worker is not None:
                    worker.daemon = True
                    worker.start()
                    self._threads.append(worker)

                # the alert has been handled (or deliberately ignored)
                self._zk.delete(path)

            except NoNodeError:
                logging.info('No node at {0}. Skipping alert.'.format(path))
            except ValueError:
                logging.warning('Node at {0} has invalid JSON.'.format(path))
Beispiel #30
0
 def _assemble_path(self, minion):
     """
     Build the path for ``minion`` by joining it onto ``self.pillar_path``
     via ``zk_path_join``.

     :param minion: final path component to append to the pillar path
     :return: the joined path as produced by ``zk_path_join``
     """
     joined = zk_path_join(self.pillar_path, minion)
     return joined
    def _get_application_state(self, path):
        """
        Build the ApplicationState describing the node at ``path``.

        A node whose ``ephemeralOwner`` is non-zero is treated as a running
        instance: ``path`` is ``/app/path/HOSTNAME`` and the application
        details are read from its parent. Any other node is treated as a
        stopped application whose details live on the node itself; in that
        case the state is overridden with 'invalid' or 'unknown' when the
        host is missing, the agent node does not exist, or the agent does
        not list the application among its 'components'.

        :type path: str
        :rtype: zoom.entities.application_state.ApplicationState
        """
        details, node_stat = self._get_app_details(path)

        # ephemeral node
        if node_stat.ephemeralOwner != 0:
            # if it is running, path = /app/path/HOSTNAME
            # need to convert to /app/path to get the app_details
            config_path = os.path.dirname(path)

            # watch the parent to see if this node goes away
            self._zoo_keeper.get_children(config_path, watch=self._on_update)

            running_host = os.path.basename(path)
            parent_details, _ = self._get_app_details(config_path)

            self._update_mapping(running_host, {config_path: True})

            return ApplicationState(
                application_name=parent_details.get(
                    'name', os.path.basename(config_path)),
                configuration_path=config_path,
                application_status=ApplicationStatus.RUNNING,
                application_host=running_host,
                last_update=node_stat.last_modified,
                start_stop_time=parent_details.get('start_stop_time', ''),
                error_state=parent_details.get('state', 'unknown'),
                local_mode=parent_details.get('mode', 'unknown'),
                login_user=parent_details.get('login_user', 'Zoom'),
                read_only=parent_details.get('read_only', False),
                last_command=self._get_last_command(parent_details),
                pd_disabled=self._get_existing_attribute(
                    config_path, 'pd_disabled'),
                grayed=self._get_existing_attribute(config_path, 'grayed'),
                platform=parent_details.get('platform', 'unknown'),
                restart_count=parent_details.get('restart_count', 0),
                load_times=self._time_estimate_cache.get_graphite_data(
                    config_path),
            )

        # persistent node: watch it to see if children are created
        self._zoo_keeper.get_children(path, watch=self._on_update)
        app_host = details.get('host', 'Unknown')
        app_name = details.get('name', os.path.basename(path))
        agent_node = zk_path_join(self._configuration.agent_state_path,
                                  app_host)

        # the exists() call also registers the agent-state watch
        agent_alive = bool(self._zoo_keeper.exists(
            agent_node, watch=self._on_agent_state_update))

        # None means the application is valid and its state is left alone
        forced_state = None
        if app_host in (None, 'Unknown'):
            forced_state = 'invalid'
        elif not agent_alive:
            # if the agent is down, update state and mode with unknown
            forced_state = 'unknown'
        else:
            agent_raw, _ = self._zoo_keeper.get(agent_node)
            if app_name not in json.loads(agent_raw).get('components', []):
                forced_state = 'invalid'

        is_valid = forced_state is None
        if not is_valid:
            details['state'] = forced_state
            details['mode'] = 'unknown'

        self._update_mapping(app_host, {path: is_valid})

        return ApplicationState(
            application_name=app_name,
            configuration_path=path,
            application_status=ApplicationStatus.STOPPED,
            application_host=app_host,
            last_update=node_stat.last_modified,
            start_stop_time=details.get('start_stop_time', ''),
            error_state=details.get('state', 'unknown'),
            local_mode=details.get('mode', 'unknown'),
            login_user=details.get('login_user', 'Zoom'),
            read_only=details.get('read_only', False),
            last_command=self._get_last_command(details),
            pd_disabled=self._get_existing_attribute(path, 'pd_disabled'),
            grayed=self._get_existing_attribute(path, 'grayed'),
            platform=details.get('platform', 'unknown'),
            restart_count=details.get('restart_count', 0),
            load_times=self._time_estimate_cache.get_graphite_data(path),
        )
Beispiel #32
0
    def __init__(self, zookeeper, **kwargs):
        """
        Read the JSON configuration stored at ZOOM_CONFIG in ZooKeeper and
        copy its sections (logging, web_server, zookeeper, pagerduty,
        database, active_directory, the per-environment section, chatops,
        message_throttle, salt) onto private attributes of this instance.

        Side effects: creates a local "logs" directory if it is missing and
        configures the logging module from the 'logging' section.

        Every exception is logged and then re-raised unchanged.

        :type zookeeper: kazoo.client.KazooClient
        :param kwargs: stored as ``self._settings``
        :raises ValueError: if the data at ZOOM_CONFIG is not valid JSON
        :raises NoNodeError: if the ZOOM_CONFIG node does not exist
        """
        self._zookeeper = zookeeper
        self._settings = kwargs

        # Only the fetch and the parse are guarded by the JSON / missing-node
        # handlers, so that a ValueError raised further down (for example by
        # logging.config.dictConfig) is not misreported as invalid JSON.
        try:
            data, _ = self._zookeeper.get(ZOOM_CONFIG)
            config = json.loads(data)
        except ValueError:
            logging.error('Data at {0} is not valid JSON.'.format(ZOOM_CONFIG))
            raise
        except NoNodeError:
            logging.error('Config node missing: {}'.format(ZOOM_CONFIG))
            raise
        except Exception:
            logging.exception('An unhandled exception occurred.')
            raise

        try:
            # create 'logs' directory if it does not exist
            if not os.path.exists("logs"):
                os.makedirs("logs")

            # initialize logging
            logging_config = config.get('logging')
            logging.config.dictConfig(logging_config)

            # get system type
            running_os = self._get_system()

            self._host = socket.gethostname()
            # web_server
            web_server_settings = config.get('web_server')
            self._port = self._get_setting('port',
                                           web_server_settings.get('port'))
            self._is_debug = web_server_settings.get('debug')

            # the client and doc directories are siblings of the working
            # directory
            working_dir = os.getcwd()
            parent_dir = os.path.normpath(working_dir + os.sep + os.pardir)
            self._application_path = working_dir
            self._client_path = zk_path_join(parent_dir, 'client')
            self._doc_path = zk_path_join(parent_dir, "doc")
            self._html_path = zk_path_join(self._client_path, "views")
            self._images_path = zk_path_join(self._client_path, "images")
            self._pid = os.getpid()
            self._environment = self._get_setting(
                'environment', os.environ.get('EnvironmentToUse', 'Staging'))

            # zookeeper
            zookeeper_settings = config.get('zookeeper')
            self._zookeeper_paths = zookeeper_settings
            self._agent_configuration_path = zookeeper_settings.get(
                'agent_configuration_path')
            self._agent_state_path = zookeeper_settings.get('agent_state_path')
            self._task_path = zookeeper_settings.get('task_path')
            self._application_state_path = zookeeper_settings.get(
                'application_state_path')
            self._global_mode_path = zookeeper_settings.get('global_mode_path')
            self._pillar_path = zookeeper_settings.get('pillar_path')
            self._alert_path = zookeeper_settings.get('alert_path')
            self._override_node = zookeeper_settings.get(
                'override_node', '/spot/software/config/override')
            self._temp_directory = zookeeper_settings.get('temp_directory')
            self._zookeeper_host = get_zk_conn_string(self._environment)

            # pagerduty
            pagerduty_settings = config.get('pagerduty')
            self._pagerduty_default_svc_key = pagerduty_settings.get(
                'pagerduty_default_svc_key')
            self._pagerduty_api_token = pagerduty_settings.get(
                'pagerduty_api_token')
            self._pagerduty_subdomain = pagerduty_settings.get(
                'pagerduty_subdomain')
            self._pagerduty_enabled_environments = pagerduty_settings.get(
                'pagerduty_enabled_environments')
            self._pagerduty_alert_footer = pagerduty_settings.get(
                'pagerduty_footer', '')

            # database
            db_settings = config.get('database')
            self._db_type = db_settings.get('db_type')
            if running_os == PlatformType.WINDOWS:
                self._sql_connection = db_settings.get(
                    'sql_connection_windows')
            else:
                self._sql_connection = db_settings.get('sql_connection')

            # authentication
            ad_settings = config.get('active_directory')
            self._ldap_server = ad_settings.get('host')
            self._ldap_port = ad_settings.get('port')

            # environment specific: the section key is the lower-cased
            # environment name
            env_settings = config.get(self._environment.lower())
            self._read_write_groups = env_settings.get('read_write_groups')
            self._graphite_host = env_settings.get('graphite_host')
            self._graphite_recheck = env_settings.get('graphite_recheck', '5m')

            # chatops
            chatops_settings = env_settings.get('chatops', {})
            self._chatops_url = chatops_settings.get('url')
            self._chatops_group = chatops_settings.get('group')
            self._chatops_commands_to_chat = chatops_settings.get(
                'commands_to_chat')

            # message throttling
            throttle_settings = config.get('message_throttle')
            self._throttle_interval = throttle_settings.get('interval')

            # salt
            self._salt_settings = env_settings.get('saltREST')

        except Exception:
            logging.exception('An unhandled exception occurred.')
            raise