Example 1
    def __init__(self, config, type_info, sandesh_instance,
                 unit_names, update_process_list=False):
        """Initialize the node manager: wire Sandesh connectivity and
        logging, detect the process-info manager, and record the initial
        process state for every process group.

        :param config: parsed nodemgr configuration (collectors, log
            settings, etc.).
        :param type_info: per-node-type metadata (module name, node type,
            object table, extra sandesh packages).
        :param sandesh_instance: Sandesh generator instance to use.
        :param unit_names: process/unit names to be monitored.
        :param update_process_list: when True, the process-info manager
            refreshes its process list dynamically.

        Terminates the process with exit(-1) when no supported process
        manager (docker / kube pod) is detected.
        """
        self.config = config
        self.type_info = type_info
        # Core-file retention limits used when pruning per-process cores.
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.core_dir_modified_time = 0
        self.tick_count = 0
        # Bitmask of FAIL_STATUS_* flags; prev starts different from
        # current so the first status UVE is always sent.
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.collector_addr = self.config.collectors
        self.sandesh_instance = sandesh_instance
        self.curr_build_info = None
        self.new_build_info = None
        self.last_cpu = None
        self.last_time = 0
        self.own_version = None
        self.hostname = socket.gethostname()
        # Callbacks fired by the process-info manager on supervisor events.
        event_handlers = {}
        event_handlers['PROCESS_STATE'] = self.event_process_state
        event_handlers['PROCESS_COMMUNICATION'] = self.event_process_communication
        event_handlers['PROCESS_LIST_UPDATE'] = self.update_current_processes
        # NOTE(review): the callback is wrapped in staticmethod() here —
        # presumably ConnectionState.init unwraps it; confirm against the
        # sandesh library version in use.
        ConnectionState.init(self.sandesh_instance, self.hostname,
            self.type_info._module_name, self.instance_id,
            staticmethod(ConnectionState.get_conn_state_cb),
            NodeStatusUVE, NodeStatus, self.type_info._object_table,
            self.get_process_state_cb)
        self.sandesh_instance.init_generator(
            self.type_info._module_name, self.hostname,
            self.type_info._node_type_name, self.instance_id,
            self.collector_addr, self.type_info._module_name,
            ServiceHttpPortMap[self.type_info._module_name],
            ['nodemgr.common.sandesh'] + self.type_info._sandesh_packages,
            config=SandeshConfig.from_parser_arguments(self.config))
        self.sandesh_instance.set_logging_params(
            enable_local_log=self.config.log_local,
            category=self.config.log_category,
            level=self.config.log_level,
            file=self.config.log_file,
            enable_syslog=self.config.use_syslog,
            syslog_facility=self.config.syslog_facility)
        self.logger = self.sandesh_instance.logger()

        # Only containerized environments are supported by this variant;
        # anything else is a hard failure.
        if DockerProcessInfoManager and (utils.is_running_in_docker()
                                         or utils.is_running_in_kubepod()):
            self.process_info_manager = DockerProcessInfoManager(
                type_info._module_type, unit_names, event_handlers,
                update_process_list)
        else:
            self.msg_log('Node manager could not detect process manager',
                         SandeshLevel.SYS_ERR)
            exit(-1)

        # Snapshot current processes and announce initial node info per group.
        self.process_state_db = self.get_current_processes()
        for group in self.process_state_db:
            self.send_init_info(group)
Example 2
    def __init__(self,
                 config,
                 type_info,
                 unit_names,
                 update_process_list=False):
        """Initialize the node manager using the global sandesh instance:
        shuffle collectors, wire Sandesh connectivity and logging, pick a
        platform-specific process-info manager, and record initial state.

        :param config: parsed nodemgr configuration (hostip, hostname,
            collectors, log settings, corefile path, etc.).
        :param type_info: per-node-type metadata (module name, node type,
            object table, extra sandesh packages).
        :param unit_names: process/unit names to be monitored.
        :param update_process_list: when True, the process-info manager
            refreshes its process list dynamically.

        On Linux, terminates with exit(-1) when no supported process
        manager (docker / kube pod) is detected.
        """
        self.config = config
        self.type_info = type_info
        # Core-file retention limits used when pruning per-process cores.
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.tick_count = 0
        # Bitmask of FAIL_STATUS_* flags; prev starts different from
        # current so the first status UVE is always sent.
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.sandesh_instance = sandesh_global
        self.curr_build_info = None
        self.new_build_info = None
        self.hostip = self.config.hostip
        # Explicit hostname from config wins; otherwise resolve FQDN from IP.
        self.hostname = socket.getfqdn(self.hostip) if self.config.hostname is None \
                        else self.config.hostname

        # Checksum over the sorted collector list lets the SIGHUP handler
        # detect configuration changes; collectors are randomized for
        # load-spreading across nodes.
        # NOTE(review): md5 of a str works on Python 2 only — on Python 3
        # the joined string would need .encode(); confirm target runtime.
        self.collector_chksum = 0
        self.random_collectors = list()
        if config.collectors:
            config.collectors.sort()
            self.collector_chksum = hashlib.md5("".join(
                config.collectors)).hexdigest()
            self.random_collectors = random.sample(config.collectors,
                                                   len(config.collectors))

        # NOTE(review): the callback is wrapped in staticmethod() here —
        # presumably ConnectionState.init unwraps it; confirm against the
        # sandesh library version in use.
        ConnectionState.init(self.sandesh_instance, self.hostname,
                             self.type_info._module_name, self.instance_id,
                             staticmethod(ConnectionState.get_conn_state_cb),
                             NodeStatusUVE, NodeStatus,
                             self.type_info._object_table,
                             self._get_process_state_cb)
        self.sandesh_instance.init_generator(
            self.type_info._module_name,
            self.hostname,
            self.type_info._node_type_name,
            self.instance_id,
            self.random_collectors,
            self.type_info._module_name,
            ServiceHttpPortMap[self.type_info._module_name],
            ['nodemgr.common.sandesh'] + self.type_info._sandesh_packages,
            config=SandeshConfig.from_parser_arguments(self.config))
        self.sandesh_instance.set_logging_params(
            enable_local_log=self.config.log_local,
            category=self.config.log_category,
            level=self.config.log_level,
            file=self.config.log_file,
            enable_syslog=self.config.use_syslog,
            syslog_facility=self.config.syslog_facility)
        self.logger = self.sandesh_instance.logger()

        # Callbacks fired by the process-info manager on supervisor events.
        event_handlers = {}
        event_handlers['PROCESS_STATE'] = self._event_process_state
        event_handlers[
            'PROCESS_COMMUNICATION'] = self._event_process_communication
        event_handlers['PROCESS_LIST_UPDATE'] = self._update_current_processes
        if platform.system() == 'Windows':
            self.system_data = WindowsSysData()
            self.process_info_manager = WindowsProcessInfoManager(
                event_handlers)
        else:
            # SIGHUP triggers a collector-list reload (see
            # nodemgr_sighup_handler).
            gevent.signal(signal.SIGHUP, self.nodemgr_sighup_handler)
            self.system_data = LinuxSysData(self.msg_log,
                                            self.config.corefile_path)
            if DockerProcessInfoManager and (utils.is_running_in_docker()
                                             or utils.is_running_in_kubepod()):
                self.process_info_manager = DockerProcessInfoManager(
                    type_info._module_type, unit_names, event_handlers,
                    update_process_list)
            else:
                self.msg_log('Node manager could not detect process manager',
                             SandeshLevel.SYS_ERR)
                exit(-1)

        # Snapshot current processes and announce initial node info per group.
        self.process_state_db = self._get_current_processes()
        for group in self.process_state_db:
            self._send_init_info(group)
Example 3
class EventManager(object):
    """Monitor the node's managed processes and publish their state,
    resource usage and core-file information as Sandesh UVEs.

    A platform-specific process-info manager (Docker / Windows) feeds
    supervisor-style events into the `_event_*` handlers; periodic work
    (disk, NTP, mem/cpu sampling) runs through `do_periodic_events`.

    NOTE(review): `group_names` and `process_state_db` are class-level
    mutable attributes shared across instances. `__init__` rebinds
    `process_state_db` per instance, but `group_names` is mutated in
    place (`_get_current_processes`) — presumably this class is used as
    a singleton; confirm before creating multiple instances.
    """
    group_names = []
    process_state_db = {}
    # Bit flags OR-ed into fail_status_bits to describe node-level faults.
    FAIL_STATUS_DUMMY = 0x1
    FAIL_STATUS_DISK_SPACE = 0x2
    FAIL_STATUS_SERVER_PORT = 0x4
    FAIL_STATUS_NTP_SYNC = 0x8
    FAIL_STATUS_DISK_SPACE_NA = 0x10

    def __init__(self,
                 config,
                 type_info,
                 unit_names,
                 update_process_list=False):
        """Wire up Sandesh connectivity/logging, pick a platform-specific
        process-info manager, and record the initial process state.

        :param config: parsed nodemgr configuration.
        :param type_info: per-node-type metadata (module name, node type,
            object table, extra sandesh packages).
        :param unit_names: process/unit names to be monitored.
        :param update_process_list: when True, the process-info manager
            refreshes its process list dynamically.

        On Linux, terminates with exit(-1) when no supported process
        manager (docker / kube pod) is detected.
        """
        self.config = config
        self.type_info = type_info
        # Core-file retention limits used when pruning per-process cores.
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.tick_count = 0
        # prev starts different from current so the first status UVE is
        # always sent (see send_nodemgr_process_status).
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.sandesh_instance = sandesh_global
        self.curr_build_info = None
        self.new_build_info = None
        self.hostip = self.config.hostip
        # Explicit hostname from config wins; otherwise resolve FQDN from IP.
        self.hostname = socket.getfqdn(self.hostip) if self.config.hostname is None \
                        else self.config.hostname

        # Checksum over the sorted collector list lets the SIGHUP handler
        # detect config changes; the list is shuffled to spread collector
        # load across nodes.
        # NOTE(review): md5 of a str works on Python 2 only — Python 3
        # would need .encode(); confirm target runtime.
        self.collector_chksum = 0
        self.random_collectors = list()
        if config.collectors:
            config.collectors.sort()
            self.collector_chksum = hashlib.md5("".join(
                config.collectors)).hexdigest()
            self.random_collectors = random.sample(config.collectors,
                                                   len(config.collectors))

        # NOTE(review): callback wrapped in staticmethod() — presumably
        # ConnectionState.init unwraps it; confirm against the sandesh
        # library version in use.
        ConnectionState.init(self.sandesh_instance, self.hostname,
                             self.type_info._module_name, self.instance_id,
                             staticmethod(ConnectionState.get_conn_state_cb),
                             NodeStatusUVE, NodeStatus,
                             self.type_info._object_table,
                             self._get_process_state_cb)
        self.sandesh_instance.init_generator(
            self.type_info._module_name,
            self.hostname,
            self.type_info._node_type_name,
            self.instance_id,
            self.random_collectors,
            self.type_info._module_name,
            ServiceHttpPortMap[self.type_info._module_name],
            ['nodemgr.common.sandesh'] + self.type_info._sandesh_packages,
            config=SandeshConfig.from_parser_arguments(self.config))
        self.sandesh_instance.set_logging_params(
            enable_local_log=self.config.log_local,
            category=self.config.log_category,
            level=self.config.log_level,
            file=self.config.log_file,
            enable_syslog=self.config.use_syslog,
            syslog_facility=self.config.syslog_facility)
        self.logger = self.sandesh_instance.logger()

        # Callbacks fired by the process-info manager on supervisor events.
        event_handlers = {}
        event_handlers['PROCESS_STATE'] = self._event_process_state
        event_handlers[
            'PROCESS_COMMUNICATION'] = self._event_process_communication
        event_handlers['PROCESS_LIST_UPDATE'] = self._update_current_processes
        if platform.system() == 'Windows':
            self.system_data = WindowsSysData()
            self.process_info_manager = WindowsProcessInfoManager(
                event_handlers)
        else:
            # SIGHUP triggers a collector-list reload (see
            # nodemgr_sighup_handler).
            gevent.signal(signal.SIGHUP, self.nodemgr_sighup_handler)
            self.system_data = LinuxSysData(self.msg_log,
                                            self.config.corefile_path)
            if DockerProcessInfoManager and (utils.is_running_in_docker()
                                             or utils.is_running_in_kubepod()):
                self.process_info_manager = DockerProcessInfoManager(
                    type_info._module_type, unit_names, event_handlers,
                    update_process_list)
            else:
                self.msg_log('Node manager could not detect process manager',
                             SandeshLevel.SYS_ERR)
                exit(-1)

        # Snapshot current processes and announce initial node info per group.
        self.process_state_db = self._get_current_processes()
        for group in self.process_state_db:
            self._send_init_info(group)

    def msg_log(self, msg, level):
        """Log msg, translating the Sandesh level to a Python log level."""
        self.logger.log(SandeshLogger.get_py_logger_level(level), msg)

    def _get_process_name(self, process_info):
        """Return the canonical process key: 'group:name' when the name
        differs from its group, otherwise just the name."""
        if process_info['name'] != process_info['group']:
            process_name = process_info['group'] + ":" + process_info['name']
        else:
            process_name = process_info['name']
        return process_name

    # Get all the current processes in the node
    def _get_current_processes(self):
        """Build and return a fresh {group: {process_name: ProcessStat}}
        mapping from the process-info manager's current process list.

        Side effect: appends any newly-seen group to the (class-level)
        group_names list.
        """
        # Add all current processes to make sure nothing misses the radar
        process_state_db = {}
        # list of all processes on the node is made here
        all_processes = self.process_info_manager.get_all_processes()
        self.msg_log("DBG: get_current_processes: '%s'" % all_processes,
                     SandeshLevel.SYS_DEBUG)
        for proc_info in all_processes:
            proc_name = self._get_process_name(proc_info)
            proc_pid = int(proc_info['pid'])

            stat = ProcessStat(proc_name,
                               host_ip=self.hostip,
                               hostname=self.hostname)
            stat.process_state = proc_info['statename']
            if 'start' in proc_info:
                stat.start_time = str(proc_info['start'])
                stat.start_count += 1
            stat.pid = proc_pid
            if stat.group not in self.group_names:
                self.group_names.append(stat.group)
            if not stat.group in process_state_db:
                process_state_db[stat.group] = {}
            process_state_db[stat.group][proc_name] = stat
        return process_state_db

    # In case the processes in the Node can change, update current processes
    def _update_current_processes(self):
        """Diff the freshly-scanned process set against the stored one and
        dispatch delete/add handlers for processes that changed."""
        process_state_db = self._get_current_processes()
        msg = ("DBG: update_current_processes: process_state_db='%s'" %
               process_state_db)
        self.msg_log(msg, SandeshLevel.SYS_DEBUG)
        # Flatten both DBs to process-name sets to compute the delta.
        old_process_set = set(key for group in self.process_state_db
                              for key in self.process_state_db[group])
        new_process_set = set(key for group in process_state_db
                              for key in process_state_db[group])
        common_process_set = new_process_set.intersection(old_process_set)
        added_process_set = new_process_set - common_process_set
        deleted_process_set = old_process_set - common_process_set
        for deleted_process in deleted_process_set:
            self._delete_process_handler(deleted_process)
        for added_process in added_process_set:
            for group in process_state_db:
                if added_process in process_state_db[group]:
                    self._add_process_handler(
                        added_process, process_state_db[group][added_process])

    # process is deleted, send state & remove it from db
    def _delete_process_handler(self, deleted_process):
        """Mark the process deleted, publish the group's state, then drop
        it (and its group, when emptied) from the DB."""
        for group in self.process_state_db:
            if deleted_process in self.process_state_db[group]:
                self.process_state_db[group][deleted_process].deleted = True
                self.send_process_state_db([group])
                del self.process_state_db[group][deleted_process]
                if not self.process_state_db[group]:
                    del self.process_state_db[group]
                return

    # new process added, update db & send state
    def _add_process_handler(self, added_process, process_info):
        """Record a newly-appeared process and publish its group's state.

        NOTE(review): assumes process_info.group already exists in
        process_state_db — a brand-new group would raise KeyError here;
        confirm callers guarantee this.
        """
        group_val = process_info.group
        self.process_state_db[group_val][added_process] = process_info
        self.send_process_state_db([group_val])

    def _get_process_state_cb(self):
        """ConnectionState callback: map current fail bits to a
        (state, description) pair."""
        state, description = self._get_process_state(self.fail_status_bits)
        return state, description

    def _get_build_info(self):
        """Return the JSON-ish build-info string, derived from the
        installed package version ('<build-id>-<build-number>') and cached
        in curr_build_info."""
        # Retrieve build_info from package/rpm and cache it
        if self.curr_build_info is not None:
            return self.curr_build_info

        pkg_version = self._get_package_version()
        pkg_version_parts = pkg_version.split('-')
        build_id = pkg_version_parts[0]
        build_number = pkg_version_parts[1] if len(
            pkg_version_parts) > 1 else "unknown"
        # build_info is a module-level prefix string defined elsewhere.
        self.new_build_info = build_info + '"build-id" : "' + \
            build_id + '", "build-number" : "' + \
            build_number + '"}]}'
        if (self.new_build_info != self.curr_build_info):
            self.curr_build_info = self.new_build_info
        return self.curr_build_info

    def _update_process_core_file_list(self):
        """Rebuild each process's core_file_list from the on-disk core
        files, pruning oldest cores beyond max_cores, and store the result
        when it changed.

        :returns: True when any process's core_file_list was updated.
        """
        ret_value = False
        corenames = self.system_data.get_corefiles()
        # Work on a deep copy so the live DB is only touched when the
        # final list actually differs.
        process_state_db_tmp = copy.deepcopy(self.process_state_db)

        for group in process_state_db_tmp:
            for key in process_state_db_tmp[group]:
                process_state_db_tmp[group][key].core_file_list = []

        for corename in corenames:
            try:
                # Core naming convention: the executable name is the
                # second dot-separated component.
                exec_name = corename.split('.')[1]
            except IndexError:
                # Ignore the directories and the files that do not comply
                # with the core pattern
                continue
            for group in self.process_state_db:
                for key in self.process_state_db[group]:
                    if key.startswith(exec_name):
                        process_state_db_tmp[group][key].core_file_list.append(
                            corename.rstrip())

        # Keep at most max_cores files per process; delete the oldest
        # (first-listed) from disk as we trim.
        for group in self.process_state_db:
            for key in self.process_state_db[group]:
                while len(process_state_db_tmp[group]
                          [key].core_file_list) > self.max_cores:
                    try:
                        if os.path.isfile(process_state_db_tmp[group]
                                          [key].core_file_list[0]):
                            self.system_data.remove_corefiles([
                                process_state_db_tmp[group]
                                [key].core_file_list[0]
                            ])
                    except OSError as e:
                        sys.stderr.write('ERROR: ' + str(e) + '\n')
                    del process_state_db_tmp[group][key].core_file_list[0]

        for group in self.process_state_db:
            for key in self.process_state_db[group]:
                if set(process_state_db_tmp[group][key].core_file_list) != set(
                        self.process_state_db[group][key].core_file_list):
                    self.process_state_db[group][
                        key].core_file_list = process_state_db_tmp[group][
                            key].core_file_list
                    ret_value = True

        return ret_value

    def send_process_state_db(self, group_names):
        """Send one NodeStatus UVE per group in group_names, carrying a
        ProcessInfo entry for every process in that group.

        The UVE is marked deleted only when every process in the group is
        flagged deleted.
        """
        name = self.hostname
        for group in group_names:
            process_infos = []
            delete_status = True
            for key in self.process_state_db[group]:
                pstat = self.process_state_db[group][key]
                process_info = ProcessInfo()
                process_info.process_name = key
                process_info.process_state = pstat.process_state
                process_info.start_count = pstat.start_count
                process_info.stop_count = pstat.stop_count
                process_info.exit_count = pstat.exit_count
                process_info.last_start_time = pstat.start_time
                process_info.last_stop_time = pstat.stop_time
                process_info.last_exit_time = pstat.exit_time
                process_info.core_file_list = pstat.core_file_list
                process_infos.append(process_info)
                # in tor-agent case, we should use tor-agent name as uve key
                name = pstat.name
                if pstat.deleted == False:
                    delete_status = False

            if not process_infos:
                continue

            # send node UVE
            node_status = NodeStatus()
            node_status.name = name
            node_status.deleted = delete_status
            node_status.process_info = process_infos
            node_status.build_info = self._get_build_info()
            node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                            data=node_status)
            msg = ('send_process_state_db: Sending UVE: {}'.format(
                node_status_uve))
            self.msg_log(msg, SandeshLevel.SYS_INFO)
            node_status_uve.send()

    def _send_process_state(self, process_info):
        """Update (or create) the ProcessStat for a supervisor state-change
        event and publish the group's state for running/stopped/exited
        transitions."""
        pname = self._get_process_name(process_info)
        # update process stats
        if pname in list(key for group in self.process_state_db
                         for key in self.process_state_db[group]):
            for group in self.process_state_db:
                if pname in self.process_state_db[group]:
                    proc_stat = self.process_state_db[group][pname]
        else:
            proc_stat = ProcessStat(pname,
                                    host_ip=self.hostip,
                                    hostname=self.hostname)

        pstate = process_info['state']
        proc_stat.process_state = pstate

        send_uve = False
        if (pstate == 'PROCESS_STATE_RUNNING'):
            proc_stat.start_count += 1
            # Timestamps are stored as microsecond strings.
            proc_stat.start_time = str(int(time.time() * 1000000))
            send_uve = True
            proc_stat.pid = int(process_info['pid'])

        if (pstate == 'PROCESS_STATE_STOPPED'):
            proc_stat.stop_count += 1
            send_uve = True
            proc_stat.stop_time = str(int(time.time() * 1000000))
            proc_stat.last_exit_unexpected = False
            # Reset cpu sampling baseline for the stopped process.
            proc_stat.last_cpu = None
            proc_stat.last_time = 0

        if (pstate == 'PROCESS_STATE_EXITED'):
            proc_stat.exit_count += 1
            send_uve = True
            proc_stat.exit_time = str(int(time.time() * 1000000))
            proc_stat.last_cpu = None
            proc_stat.last_time = 0
            if not process_info['expected']:
                self.msg_log(
                    '%s with pid: %s exited abnormally' %
                    (pname, process_info['pid']), SandeshLevel.SYS_ERR)
                proc_stat.last_exit_unexpected = True
        send_init_uve = False
        # update process state database
        if not proc_stat.group in self.process_state_db:
            self.process_state_db[proc_stat.group] = {}
            # A brand-new group also needs the one-time init UVE.
            send_init_uve = True
        self.process_state_db[proc_stat.group][pname] = proc_stat
        if send_uve:
            if (send_init_uve):
                self._send_init_info(proc_stat.group)
            self.send_process_state_db([proc_stat.group])

    def send_nodemgr_process_status(self):
        """Publish the node manager's own ProcessStatus UVE, but only when
        fail_status_bits changed since the last send."""
        if self.prev_fail_status_bits == self.fail_status_bits:
            return

        self.prev_fail_status_bits = self.fail_status_bits
        fail_status_bits = self.fail_status_bits
        state, description = self._get_process_state(fail_status_bits)
        # Fold in connection-state health: any NON_FUNCTIONAL connection
        # makes the whole node NON_FUNCTIONAL.
        conn_infos = ConnectionState._connection_map.values()
        (cb_state,
         cb_description) = ConnectionState.get_conn_state_cb(conn_infos)
        if (cb_state == ProcessState.NON_FUNCTIONAL):
            state = ProcessState.NON_FUNCTIONAL
        if description != '':
            description += ' '
        description += cb_description

        process_status = ProcessStatus(module_id=self.type_info._module_name,
                                       instance_id=self.instance_id,
                                       state=ProcessStateNames[state],
                                       description=description,
                                       connection_infos=conn_infos)
        process_status_list = []
        process_status_list.append(process_status)
        node_status = NodeStatus(name=self.hostname,
                                 process_status=process_status_list)
        node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                        data=node_status)
        msg = ('send_nodemgr_process_status: Sending UVE:' +
               str(node_status_uve))
        self.msg_log(msg, SandeshLevel.SYS_INFO)
        node_status_uve.send()

    def _get_package_version(self):
        """Return the contrail-nodemgr package version, or "unknown"
        (with an error log) when it cannot be determined."""
        pkg_version = utils.get_package_version('contrail-nodemgr')
        if pkg_version is None:
            self.msg_log(
                'Error getting %s package version' % ('contrail-nodemgr'),
                SandeshLevel.SYS_ERR)
            pkg_version = "unknown"
        return pkg_version

    def _send_init_info(self, group_name):
        """Send the one-time NodeStatus UVE for a group: system CPU info,
        build info and installed/running package versions."""
        # Any process in the group supplies the UVE name.
        key = next(key for key in self.process_state_db[group_name])
        # system_cpu_info
        sys_cpu = SystemCpuInfo()
        sys_cpu.num_socket = self.system_data.get_num_socket()
        sys_cpu.num_cpu = self.system_data.get_num_cpu()
        sys_cpu.num_core_per_socket = self.system_data.get_num_core_per_socket(
        )
        sys_cpu.num_thread_per_core = self.system_data.get_num_thread_per_core(
        )

        node_status = NodeStatus(
            name=self.process_state_db[group_name][key].name,
            system_cpu_info=sys_cpu,
            build_info=self._get_build_info())

        # installed/running package version
        pkg_version = self._get_package_version()
        node_status.installed_package_version = pkg_version
        node_status.running_package_version = pkg_version

        node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                        data=node_status)
        node_status_uve.send()

    def _get_group_processes_mem_cpu_usage(self, group_name):
        """Sample mem/cpu usage for every RUNNING process in the group and
        return {process_name: mem_cpu_info}, updating each ProcessStat's
        cpu sampling baseline (last_cpu / last_time)."""
        process_mem_cpu_usage = {}
        for key in self.process_state_db[group_name]:
            pstat = self.process_state_db[group_name][key]
            if pstat.process_state != 'PROCESS_STATE_RUNNING':
                continue
            mem_cpu_usage_data = (
                self.process_info_manager.get_mem_cpu_usage_data(
                    pstat.pid, pstat.last_cpu, pstat.last_time))
            process_mem_cpu = mem_cpu_usage_data.get_process_mem_cpu_info()
            # NOTE(review): relies on ProcessStat exposing `pname` — the
            # local scan path only sets the constructor arg; confirm the
            # attribute name on ProcessStat.
            process_mem_cpu.__key = pstat.pname
            process_mem_cpu_usage[process_mem_cpu.__key] = process_mem_cpu
            pstat.last_cpu = mem_cpu_usage_data.last_cpu
            pstat.last_time = mem_cpu_usage_data.last_time

        return process_mem_cpu_usage

    def _get_process_state(self, fail_status_bits):
        """Translate fail bits into (ProcessState, description): FUNCTIONAL
        with an empty description when no bits are set, otherwise
        NON_FUNCTIONAL with node-specific descriptions joined by spaces."""
        if not fail_status_bits:
            return ProcessState.FUNCTIONAL, ''

        state = ProcessState.NON_FUNCTIONAL
        description = self.get_failbits_nodespecific_desc(fail_status_bits)
        if fail_status_bits & self.FAIL_STATUS_NTP_SYNC:
            description.append("NTP state unsynchronized.")
        return state, " ".join(description)

    # method to implement in children
    def get_failbits_nodespecific_desc(self, fail_status_bits):
        """Hook for subclasses: return a list of human-readable fault
        descriptions for node-specific fail bits (base: none)."""
        return list()

    def _event_process_state(self, process_info):
        """PROCESS_STATE event handler: log and forward to
        _send_process_state."""
        msg = ("DBG: event_process_state:" + process_info['name'] + "," +
               "group:" + process_info['group'] + "," + "state:" +
               process_info['state'])
        self.msg_log(msg, SandeshLevel.SYS_DEBUG)
        self._send_process_state(process_info)

    def _event_process_communication(self, pdata):
        """PROCESS_COMMUNICATION event handler: parse 'flag:value' payload
        and log it (no further action in the base class)."""
        flag_and_value = pdata.partition(":")
        msg = ("DBG: event_process_communication: Flag:" + flag_and_value[0] +
               " Value:" + flag_and_value[2])
        self.msg_log(msg, SandeshLevel.SYS_DEBUG)

    def _event_tick_60(self):
        """Once-a-minute tick: check NTP sync, refresh core-file lists,
        sample system/process mem-cpu usage and send one NodeStatus UVE
        per process group."""
        self.tick_count += 1
        for group in self.process_state_db:
            key = next(key for key in self.process_state_db[group])
            # get disk usage info periodically
            disk_usage_info = self.system_data.get_disk_usage()

            # typical ntp sync time is about 5 min - first time,
            # we scan only after 10 min
            if self.tick_count >= 10:
                if self.system_data.check_ntp_status():
                    self.fail_status_bits &= ~self.FAIL_STATUS_NTP_SYNC
                else:
                    self.fail_status_bits |= self.FAIL_STATUS_NTP_SYNC
                self.send_nodemgr_process_status()
            if self._update_process_core_file_list():
                self.send_process_state_db([group])

            process_mem_cpu_usage = self._get_group_processes_mem_cpu_usage(
                group)

            # get system mem/cpu usage
            system_mem_usage = self.system_data.get_sys_mem_info(
                self.type_info._uve_node_type)
            system_cpu_usage = self.system_data.get_sys_cpu_info(
                self.type_info._uve_node_type)

            # send above encoded buffer
            node_status = NodeStatus(
                name=self.process_state_db[group][key].name,
                disk_usage_info=disk_usage_info,
                system_mem_usage=system_mem_usage,
                system_cpu_usage=system_cpu_usage,
                process_mem_cpu_usage=process_mem_cpu_usage)
            # encode other core file
            if self.system_data.update_all_core_file():
                self.send_process_state_db(self.group_names)
                node_status.all_core_file_list = self.all_core_file_list

            node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                            data=node_status)

            self.msg_log('DBG: event_tick_60: node_status=%s' % node_status,
                         SandeshLevel.SYS_DEBUG)
            node_status_uve.send()

    def send_init_data(self):
        """Publish the initial nodemgr status and the full process-state
        DB for all known groups."""
        self.send_nodemgr_process_status()
        self.send_process_state_db(self.group_names)

    # can be overriden in children
    def do_periodic_events(self):
        """Run the periodic (per-minute) work; subclasses may extend."""
        self._event_tick_60()

    def run_periodically(self, function, interval, *args, **kwargs):
        """Run function(*args, **kwargs) forever, sleeping (via gevent) so
        iterations start roughly `interval` seconds apart; log an error
        when one iteration overruns the interval."""
        while True:
            before = time.time()
            function(*args, **kwargs)

            duration = time.time() - before
            if duration < interval:
                gevent.sleep(interval - duration)
            else:
                self.msg_log(
                    'function %s duration exceeded %f interval (took %f)' %
                    (function.__name__, interval, duration),
                    SandeshLevel.SYS_ERR)

    def runforever(self):
        """Main loop: repeatedly run the process-info manager's job,
        logging (not propagating) any exception, with a 5s gevent sleep
        between iterations."""
        while True:
            try:
                self.process_info_manager.run_job()
            except Exception as e:
                self.msg_log('Exception in periodic job: {}'.format(e),
                             SandeshLevel.SYS_WARN)
            gevent.sleep(seconds=5)

    def nodemgr_sighup_handler(self):
        """SIGHUP handler: re-read the collector list from the config file
        and, when it changed (by md5 checksum), reshuffle it and point the
        sandesh instance at the new collectors.

        NOTE(review): SafeConfigParser and md5-of-str are Python 2 idioms;
        a Python 3 port needs ConfigParser() and an encoded digest input.
        """
        collector_list = list()
        config = ConfigParser.SafeConfigParser()
        config.read([self.config.config_file_path])
        if 'COLLECTOR' in config.sections():
            try:
                collector = config.get('COLLECTOR', 'server_list')
                collector_list = collector.split()
            except ConfigParser.NoOptionError:
                pass
        collector_list.sort()
        new_chksum = hashlib.md5("".join(collector_list)).hexdigest()
        if new_chksum != self.collector_chksum:
            self.collector_chksum = new_chksum
            self.random_collectors = random.sample(collector_list,
                                                   len(collector_list))
            self.sandesh_instance.reconfig_collectors(self.random_collectors)
class EventManager(object):
    # Process groups discovered so far (class-level, shared across instances).
    group_names = []
    # group name -> {process name -> ProcessStat} (class-level, shared).
    process_state_db = {}
    # Bit flags OR-ed into fail_status_bits to describe node failure causes.
    FAIL_STATUS_DUMMY = 0x1
    FAIL_STATUS_DISK_SPACE = 0x2
    FAIL_STATUS_SERVER_PORT = 0x4
    FAIL_STATUS_NTP_SYNC = 0x8
    FAIL_STATUS_DISK_SPACE_NA = 0x10

    def __init__(self,
                 config,
                 type_info,
                 sandesh_instance,
                 unit_names,
                 update_process_list=False):
        """Wire up sandesh connectivity/logging and start watching processes.

        config: parsed nodemgr configuration (collectors, log settings, ...).
        type_info: node-type descriptor (module name, object table, ...).
        sandesh_instance: sandesh generator to initialize and log through.
        unit_names: process/unit names this manager should watch.
        update_process_list: passed through to DockerProcessInfoManager.
        """
        self.config = config
        self.type_info = type_info
        # Core-file retention limits used by send_process_state: keep at most
        # max_cores per process (max_old_cores oldest + max_new_cores newest).
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.core_dir_modified_time = 0
        self.tick_count = 0
        # prev != fail bits initially so the first status UVE is always sent.
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.collector_addr = self.config.collectors
        self.sandesh_instance = sandesh_instance
        self.curr_build_info = None
        self.new_build_info = None
        # last_cpu/last_time seed the system-level cpu-usage delta sampling.
        self.last_cpu = None
        self.last_time = 0
        self.own_version = None
        self.hostname = socket.gethostname()
        # Callbacks invoked by the process-info manager on events.
        event_handlers = {}
        event_handlers['PROCESS_STATE'] = self.event_process_state
        event_handlers[
            'PROCESS_COMMUNICATION'] = self.event_process_communication
        event_handlers['PROCESS_LIST_UPDATE'] = self.update_current_processes
        ConnectionState.init(
            self.sandesh_instance, self.hostname,
            self.type_info._module_name, self.instance_id,
            staticmethod(ConnectionState.get_process_state_cb), NodeStatusUVE,
            NodeStatus, self.type_info._object_table)
        self.sandesh_instance.init_generator(
            self.type_info._module_name,
            self.hostname,
            self.type_info._node_type_name,
            self.instance_id,
            self.collector_addr,
            self.type_info._module_name,
            ServiceHttpPortMap[self.type_info._module_name],
            ['nodemgr.common.sandesh'] + self.type_info._sandesh_packages,
            config=SandeshConfig.from_parser_arguments(self.config))
        self.sandesh_instance.set_logging_params(
            enable_local_log=self.config.log_local,
            category=self.config.log_category,
            level=self.config.log_level,
            file=self.config.log_file,
            enable_syslog=self.config.use_syslog,
            syslog_facility=self.config.syslog_facility)
        self.logger = self.sandesh_instance.logger()

        # Only containerized deployments are supported here; bail out hard
        # when no suitable process manager can be detected.
        if DockerProcessInfoManager and (utils.is_running_in_docker()
                                         or utils.is_running_in_kubepod()):
            self.process_info_manager = DockerProcessInfoManager(
                type_info._module_type, unit_names, event_handlers,
                update_process_list)
        else:
            self.msg_log('Node manager could not detect process manager',
                         SandeshLevel.SYS_ERR)
            exit(-1)

        # Seed the db with whatever is running now and announce each group.
        self.process_state_db = self.get_current_processes()
        for group in self.process_state_db:
            self.send_init_info(group)

    # end __init__

    def msg_log(self, msg, level):
        """Log *msg* via the sandesh logger, translating the sandesh level
        into the matching python logging level."""
        py_level = SandeshLogger.get_py_logger_level(level)
        self.logger.log(py_level, msg)

    # end msg_log

    def get_process_name(self, process_info):
        """Return the display name for a process: the bare name when it
        matches its group, otherwise '<group>:<name>'."""
        name = process_info['name']
        group = process_info['group']
        return name if name == group else group + ":" + name

    # end get_process_name

    # Get all the current processes in the node
    def get_current_processes(self):
        """Build a fresh {group: {process name: ProcessStat}} snapshot of
        every process reported by the process-info manager.

        Side effect: appends any newly seen group to the shared
        ``group_names`` list.
        """
        # Add all current processes to make sure nothing misses the radar
        process_state_db = {}
        # list of all processes on the node is made here
        all_processes = self.process_info_manager.get_all_processes()
        self.msg_log("DBG: get_current_processes: '%s'" % all_processes,
                     SandeshLevel.SYS_DEBUG)
        for proc_info in all_processes:
            proc_name = self.get_process_name(proc_info)
            proc_pid = int(proc_info['pid'])

            stat = ProcessStat(proc_name)
            stat.process_state = proc_info['statename']
            # 'start' is optional in the manager's process record.
            if 'start' in proc_info:
                stat.start_time = str(proc_info['start'])
                stat.start_count += 1
            stat.pid = proc_pid
            if stat.group not in self.group_names:
                self.group_names.append(stat.group)
            if not stat.group in process_state_db:
                process_state_db[stat.group] = {}
            process_state_db[stat.group][proc_name] = stat
        return process_state_db

    # end get_current_process

    # In case the processes in the Node can change, update current processes
    def update_current_processes(self):
        """Reconcile the cached process db against a fresh snapshot, invoking
        the delete/add handlers for processes that disappeared/appeared."""
        process_state_db = self.get_current_processes()
        msg = ("DBG: update_current_processes: process_state_db='%s'" %
               process_state_db)
        self.msg_log(msg, SandeshLevel.SYS_DEBUG)
        # Flatten both snapshots to sets of process names for the diff.
        old_process_set = set(key for group in self.process_state_db
                              for key in self.process_state_db[group])
        new_process_set = set(key for group in process_state_db
                              for key in process_state_db[group])
        common_process_set = new_process_set.intersection(old_process_set)
        added_process_set = new_process_set - common_process_set
        deleted_process_set = old_process_set - common_process_set
        for deleted_process in deleted_process_set:
            self.delete_process_handler(deleted_process)
        for added_process in added_process_set:
            # Locate the added process's group in the new snapshot.
            for group in process_state_db:
                if added_process in process_state_db[group]:
                    self.add_process_handler(
                        added_process, process_state_db[group][added_process])

    # end update_current_processes

    # process is deleted, send state & remove it from db
    def delete_process_handler(self, deleted_process):
        for group in self.process_state_db:
            if deleted_process in self.process_state_db[group]:
                self.process_state_db[group][deleted_process].deleted = True
                self.send_process_state_db([group])
                del self.process_state_db[group][deleted_process]
                if not self.process_state_db[group]:
                    del self.process_state_db[group]
                return

    # end delete_process_handler

    # new process added, update db & send state
    def add_process_handler(self, added_process, process_info):
        group_val = process_info.group
        self.process_state_db[group_val][added_process] = process_info
        self.send_process_state_db([group_val])

    # end add_process_handler

    def check_ntp_status(self):
        """Probe NTP sync state and update/publish the node failure bits.

        An ntpq peer line starting with '*' marks a selected sync peer, so
        grep's exit code alone tells us whether the clock is synchronized.
        """
        ntp_status_cmd = 'ntpq -n -c pe | grep "^*"'
        proc = Popen(ntp_status_cmd,
                     shell=True,
                     stdout=PIPE,
                     stderr=PIPE,
                     close_fds=True)
        # Output is discarded; only the return code matters.
        (_, _) = proc.communicate()
        if proc.returncode != 0:
            self.fail_status_bits |= self.FAIL_STATUS_NTP_SYNC
        else:
            self.fail_status_bits &= ~self.FAIL_STATUS_NTP_SYNC
        self.send_nodemgr_process_status()

    def get_build_info(self):
        # Retrieve build_info from package/rpm and cache it
        if self.curr_build_info is not None:
            return self.curr_build_info

        command = "contrail-version contrail-nodemgr | grep contrail-nodemgr"
        version = os.popen(command).read()
        version_partials = version.split()
        if len(version_partials) < 3:
            self.msg_log(
                'Not enough values to parse package version %s' % version,
                SandeshLevel.SYS_ERR)
            return ""
        else:
            _, rpm_version, build_num = version_partials
        self.new_build_info = build_info + '"build-id" : "' + \
            rpm_version + '", "build-number" : "' + \
            build_num + '"}]}'
        if (self.new_build_info != self.curr_build_info):
            self.curr_build_info = self.new_build_info
        return self.curr_build_info

    def get_corefile_path(self):
        """Return the directory where core files land: the configured path,
        or the directory from /proc/sys/kernel/core_pattern when that
        pattern is a plain path template (not a '|pipe' handler)."""
        self.core_file_path = self.config.corefile_path
        cat_command = "cat /proc/sys/kernel/core_pattern"
        (core_pattern, _) = Popen(cat_command.split(),
                                  stdout=PIPE,
                                  close_fds=True).communicate()
        if core_pattern is not None and not core_pattern.startswith('|'):
            # core_pattern is a file path template; take its directory part.
            dirname_cmd = "dirname " + core_pattern
            (self.core_file_path, _) = Popen(dirname_cmd.split(),
                                             stdout=PIPE,
                                             close_fds=True).communicate()
        # strip the trailing newline from the subprocess output
        return self.core_file_path.rstrip()

    def get_corefiles(self):
        """Return the full paths of core files in the core directory, oldest
        first; returns an empty list when the directory cannot be listed."""
        try:
            # Get the core files list in the chronological order
            ls_command = "ls -1tr " + self.get_corefile_path()
            (corenames, _) = Popen(ls_command.split(),
                                   stdout=PIPE,
                                   close_fds=True).communicate()
        except Exception as e:
            self.msg_log('Failed to get core files: %s' % (str(e)),
                         SandeshLevel.SYS_ERR)
            # BUG FIX: previously fell through and implicitly returned None,
            # which made callers (e.g. update_process_core_file_list) crash
            # while iterating the result. Return an empty list instead.
            return []
        return [
            self.get_corefile_path() + '/' + core
            for core in corenames.split()
        ]

    # end get_corefiles

    def remove_corefiles(self, core_files):
        """Delete each file in *core_files*, logging every deletion and any
        OSError encountered (an error does not stop the loop)."""
        for path in core_files:
            self.msg_log('deleting core file: %s' % (path),
                         SandeshLevel.SYS_ERR)
            try:
                os.remove(path)
            except OSError as err:
                self.msg_log(
                    'Failed to delete core file: %s - %s' %
                    (path, str(err)), SandeshLevel.SYS_ERR)

    # end remove_corefiles

    def update_process_core_file_list(self):
        """Re-associate core files on disk with the processes in the db.

        Returns True when any process's core_file_list changed (the caller
        should then republish state), False otherwise.
        """
        ret_value = False
        corenames = self.get_corefiles()
        # Collect into a copy first so the live db is only touched when a
        # change is actually detected below.
        process_state_db_tmp = copy.deepcopy(self.process_state_db)

        for corename in corenames:
            try:
                # Core names look like core.<exec>.<pid>...; field 1 is the
                # executable name.
                exec_name = corename.split('.')[1]
            except IndexError:
                # Ignore the directories and the files that do not comply
                # with the core pattern
                continue
            for group in self.process_state_db:
                for key in self.process_state_db[group]:
                    if key.startswith(exec_name):
                        process_state_db_tmp[group][key].core_file_list.append(
                            corename.rstrip())

        # Order-insensitive comparison; adopt the rebuilt list on change.
        for group in self.process_state_db:
            for key in self.process_state_db[group]:
                if set(process_state_db_tmp[group][key].core_file_list) != set(
                        self.process_state_db[group][key].core_file_list):
                    self.process_state_db[group][
                        key].core_file_list = process_state_db_tmp[group][
                            key].core_file_list
                    ret_value = True

        return ret_value

    # end update_process_core_file_list

    def send_process_state_db_base(self, group_names, ProcessInfo):
        """Build and send one NodeStatus UVE per group in *group_names*,
        carrying a ProcessInfo entry for every process in that group.

        ProcessInfo is passed as a parameter so node-type subclasses can
        supply their own sandesh type.
        """
        name = self.hostname
        for group in group_names:
            process_infos = []
            delete_status = True
            for key in self.process_state_db[group]:
                pstat = self.process_state_db[group][key]
                process_info = ProcessInfo()
                process_info.process_name = key
                process_info.process_state = pstat.process_state
                process_info.start_count = pstat.start_count
                process_info.stop_count = pstat.stop_count
                process_info.exit_count = pstat.exit_count
                process_info.last_start_time = pstat.start_time
                process_info.last_stop_time = pstat.stop_time
                process_info.last_exit_time = pstat.exit_time
                process_info.core_file_list = pstat.core_file_list
                process_infos.append(process_info)
                # in tor-agent case, we should use tor-agent name as uve key
                name = pstat.name
                # The UVE is marked deleted only when every process in the
                # group carries the deleted flag.
                if pstat.deleted == False:
                    delete_status = False

            if not process_infos:
                continue

            # send node UVE
            node_status = NodeStatus()
            node_status.name = name
            node_status.deleted = delete_status
            node_status.process_info = process_infos
            node_status.build_info = self.get_build_info()
            node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                            data=node_status)
            msg = ('send_process_state_db_base: Sending UVE: ' +
                   str(node_status_uve))
            self.msg_log(msg, SandeshLevel.SYS_INFO)
            node_status_uve.send()

    # end send_process_state_db_base

    def send_process_state_db(self, group_names):
        """Publish process state for *group_names* using the module's
        standard ProcessInfo sandesh type."""
        self.send_process_state_db_base(group_names, ProcessInfo)

    # end send_process_state_db

    def update_all_core_file(self):
        """Refresh the node-wide core-file list when the core directory's
        mtime changed; returns True iff a refresh (and UVE send) happened."""
        stat_command_option = "stat --printf=%Y " + self.get_corefile_path()
        modified_time = Popen(stat_command_option.split(),
                              stdout=PIPE,
                              close_fds=True).communicate()
        # communicate() yields (stdout, stderr); element [0] is the mtime
        # string printed by stat.
        if modified_time[0] == self.core_dir_modified_time:
            return False
        self.core_dir_modified_time = modified_time[0]
        self.all_core_file_list = self.get_corefiles()
        self.send_process_state_db(self.group_names)
        return True

    def send_process_state(self, process_info):
        """Update the cached ProcessStat for the event's process and, on a
        RUNNING/STOPPED/EXITED transition, publish the group's state.

        On an unexpected exit this also looks for a matching core file and
        trims the per-process core list to at most max_cores entries.
        """
        pname = self.get_process_name(process_info)
        # update process stats
        if pname in list(key for group in self.process_state_db
                         for key in self.process_state_db[group]):
            for group in self.process_state_db:
                if pname in self.process_state_db[group]:
                    proc_stat = self.process_state_db[group][pname]
        else:
            # first time we see this process
            proc_stat = ProcessStat(pname)

        pstate = process_info['state']
        proc_stat.process_state = pstate

        send_uve = False
        if (pstate == 'PROCESS_STATE_RUNNING'):
            proc_stat.start_count += 1
            # timestamps are microseconds-since-epoch strings
            proc_stat.start_time = str(int(time.time() * 1000000))
            send_uve = True
            proc_stat.pid = int(process_info['pid'])

        if (pstate == 'PROCESS_STATE_STOPPED'):
            proc_stat.stop_count += 1
            send_uve = True
            proc_stat.stop_time = str(int(time.time() * 1000000))
            proc_stat.last_exit_unexpected = False
            # reset cpu-usage sampling state for the next run
            proc_stat.last_cpu = None
            proc_stat.last_time = 0

        if (pstate == 'PROCESS_STATE_EXITED'):
            proc_stat.exit_count += 1
            send_uve = True
            proc_stat.exit_time = str(int(time.time() * 1000000))
            proc_stat.last_cpu = None
            proc_stat.last_time = 0
            if not process_info['expected']:
                self.msg_log(
                    '%s with pid: %s exited abnormally' %
                    (pname, process_info['pid']), SandeshLevel.SYS_ERR)
                proc_stat.last_exit_unexpected = True
                # check for core file for this exit
                find_command_option = ("find " + self.get_corefile_path() +
                                       " -name core.[A-Za-z]*." +
                                       process_info['pid'] + "*")
                self.msg_log(
                    'find command option for cores: %s' %
                    (find_command_option), SandeshLevel.SYS_DEBUG)
                (corename, _) = Popen(find_command_option.split(),
                                      stdout=PIPE,
                                      close_fds=True).communicate()

                if ((corename is not None) and (len(corename.rstrip()) >= 1)):
                    self.msg_log('core file: %s' % (corename),
                                 SandeshLevel.SYS_ERR)
                    # before adding to the core file list make
                    # sure that we do not have too many cores
                    self.msg_log(
                        'core_file_list: %s, max_cores: %d' %
                        (str(proc_stat.core_file_list), self.max_cores),
                        SandeshLevel.SYS_DEBUG)
                    if (len(proc_stat.core_file_list) >= self.max_cores):
                        # get rid of old cores: keep the max_old_cores oldest
                        # and leave room for max_new_cores new ones
                        start = self.max_old_cores
                        end = len(proc_stat.core_file_list) - \
                                self.max_new_cores + 1
                        core_files_to_be_deleted = \
                            proc_stat.core_file_list[start:end]
                        self.remove_corefiles(core_files_to_be_deleted)
                        # now delete the cores from the list as well
                        del proc_stat.core_file_list[start:end]
                    # now add the new core to the core file list
                    proc_stat.core_file_list.append(corename.rstrip())
                    self.msg_log(
                        '# of cores for %s: %d' %
                        (pname, len(proc_stat.core_file_list)),
                        SandeshLevel.SYS_DEBUG)

        send_init_uve = False
        # update process state database
        if not proc_stat.group in self.process_state_db:
            # first process seen for this group; announce the group too
            self.process_state_db[proc_stat.group] = {}
            send_init_uve = True
        self.process_state_db[proc_stat.group][pname] = proc_stat
        if send_uve:
            if (send_init_uve):
                self.send_init_info(proc_stat.group)
            self.send_process_state_db([proc_stat.group])

    def send_nodemgr_process_status_base(self, ProcessStateNames, ProcessState,
                                         ProcessStatus):
        """Publish nodemgr's own ProcessStatus UVE, but only when the
        failure bits changed since the previous send (deduplication).

        The sandesh types are parameters so node-type subclasses can pass
        their own.
        """
        if self.prev_fail_status_bits == self.fail_status_bits:
            return

        self.prev_fail_status_bits = self.fail_status_bits
        fail_status_bits = self.fail_status_bits
        state, description = self.get_process_state(fail_status_bits)
        process_status = ProcessStatus(module_id=self.type_info._module_name,
                                       instance_id=self.instance_id,
                                       state=state,
                                       description=description)
        process_status_list = []
        process_status_list.append(process_status)
        node_status = NodeStatus(name=self.hostname,
                                 process_status=process_status_list)
        node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                        data=node_status)
        msg = ('send_nodemgr_process_status_base: Sending UVE:' +
               str(node_status_uve))
        self.msg_log(msg, SandeshLevel.SYS_INFO)
        node_status_uve.send()

    # end send_nodemgr_process_status_base

    def send_nodemgr_process_status(self):
        """Publish nodemgr's own status using the module's standard
        process-state sandesh types."""
        self.send_nodemgr_process_status_base(ProcessStateNames, ProcessState,
                                              ProcessStatus)

    # end send_nodemgr_process_status

    def _get_package_version(self):
        """Return the installed contrail-nodemgr package version, logging and
        substituting a placeholder string when it cannot be determined."""
        version = utils.get_package_version('contrail-nodemgr')
        if version is None:
            self.msg_log(
                'Error getting %s package version' % ('contrail-nodemgr'),
                SandeshLevel.SYS_ERR)
            version = "package-version-unknown"
        return version

    def send_init_info(self, group_name):
        """Send the one-time NodeStatus UVE for *group_name*, carrying the
        system CPU topology, build info and package versions."""
        # Any process in the group supplies the UVE name below.
        key = next(key for key in self.process_state_db[group_name])
        # system_cpu_info
        mem_cpu_usage_data = SysMemCpuUsageData(self.last_cpu, self.last_time)
        sys_cpu = SystemCpuInfo()
        sys_cpu.num_socket = mem_cpu_usage_data.get_num_socket()
        sys_cpu.num_cpu = mem_cpu_usage_data.get_num_cpu()
        sys_cpu.num_core_per_socket = mem_cpu_usage_data.get_num_core_per_socket(
        )
        sys_cpu.num_thread_per_core = mem_cpu_usage_data.get_num_thread_per_core(
        )

        node_status = NodeStatus(
            name=self.process_state_db[group_name][key].name,
            system_cpu_info=sys_cpu,
            build_info=self.get_build_info())

        # installed/running package version
        own_version = self._get_package_version()
        self.own_version = own_version
        node_status.installed_package_version = own_version
        node_status.running_package_version = own_version

        node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                        data=node_status)
        node_status_uve.send()

    def get_group_processes_mem_cpu_usage(self, group_name):
        """Collect mem/cpu usage for every RUNNING process in *group_name*.

        Returns a dict keyed by process name; also updates each stat's
        last_cpu/last_time so the next sample can compute deltas.
        """
        process_mem_cpu_usage = {}
        for key in self.process_state_db[group_name]:
            pstat = self.process_state_db[group_name][key]
            if pstat.process_state != 'PROCESS_STATE_RUNNING':
                continue
            mem_cpu_usage_data = (
                self.process_info_manager.get_mem_cpu_usage_data(
                    pstat.pid, pstat.last_cpu, pstat.last_time))
            process_mem_cpu = mem_cpu_usage_data.get_process_mem_cpu_info()
            # NOTE(review): '__key' is name-mangled to '_EventManager__key'
            # inside this class; also other methods in this file read the
            # stat's name via 'pstat.name' — confirm ProcessStat really has
            # a 'pname' attribute.
            process_mem_cpu.__key = pstat.pname
            process_mem_cpu_usage[process_mem_cpu.__key] = process_mem_cpu
            pstat.last_cpu = mem_cpu_usage_data.last_cpu
            pstat.last_time = mem_cpu_usage_data.last_time

        return process_mem_cpu_usage

    def get_disk_usage(self):
        """Collect per-partition disk usage for local ext2/3/4 and xfs
        filesystems via ``df -PT``.

        Returns {partition name: DiskPartitionUsageStats}; lines whose
        numeric fields fail to parse are logged and skipped.
        """
        disk_usage_info = {}
        partition = subprocess.Popen("df -PT -t ext2 -t ext3 -t ext4 -t xfs",
                                     shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     close_fds=True)
        for line in partition.stdout:
            if 'Filesystem' in line:
                # skip the df header line
                continue
            # Split each line once instead of re-splitting per field.
            # df -PT columns: device, type, 1K-blocks, used, available, ...
            fields = line.rsplit()
            partition_name = fields[0]
            partition_type = fields[1]
            partition_space_used_1k = fields[3]
            partition_space_available_1k = fields[4]
            disk_usage_stat = DiskPartitionUsageStats()
            try:
                disk_usage_stat.partition_type = str(partition_type)
                # NOTE(review): '__key' is name-mangled inside this class.
                disk_usage_stat.__key = str(partition_name)
                disk_usage_stat.partition_space_used_1k = \
                    int(partition_space_used_1k)
                disk_usage_stat.partition_space_available_1k = \
                    int(partition_space_available_1k)
                total_disk_space = \
                    disk_usage_stat.partition_space_used_1k + \
                    disk_usage_stat.partition_space_available_1k
                disk_usage_stat.percentage_partition_space_used = \
                    int(round((float(disk_usage_stat.partition_space_used_1k) / \
                               float(total_disk_space)) * 100))
            except ValueError:
                self.msg_log('Failed to get local disk space usage',
                             SandeshLevel.SYS_ERR)
            else:
                disk_usage_info[partition_name] = disk_usage_stat
        return disk_usage_info

    # end get_disk_usage

    def get_process_state_base(self, fail_status_bits, ProcessStateNames,
                               ProcessState):
        if not fail_status_bits:
            return ProcessStateNames[ProcessState.FUNCTIONAL], ''

        state = ProcessStateNames[ProcessState.NON_FUNCTIONAL]
        description = self.get_failbits_nodespecific_desc(fail_status_bits)
        if fail_status_bits & self.FAIL_STATUS_NTP_SYNC:
            if not description:
                description = " "
            description += "NTP state unsynchronized."
        return state, description

    # end get_process_state_base

    def get_process_state(self, fail_status_bits):
        """Map *fail_status_bits* to (state, description) using the module's
        standard state names."""
        return self.get_process_state_base(fail_status_bits, ProcessStateNames,
                                           ProcessState)

    # end get_process_state

    def get_failbits_nodespecific_desc(self, fail_status_bits):
        """Node-type specific failure description; the base implementation
        has none. Subclasses override this."""
        return ""

    def event_process_state(self, process_info):
        """Handle a PROCESS_STATE event: log it, then forward the record to
        send_process_state."""
        msg = ("DBG: event_process_state:%s,group:%s,state:%s" %
               (process_info['name'], process_info['group'],
                process_info['state']))
        self.msg_log(msg, SandeshLevel.SYS_DEBUG)
        self.send_process_state(process_info)

    # end event_process_state

    def event_process_communication(self, pdata):
        """Handle a PROCESS_COMMUNICATION event by logging its payload,
        which is expected to be a 'flag:value' string."""
        flag, _, value = pdata.partition(":")
        msg = ("DBG: event_process_communication: Flag:" + flag +
               " Value:" + value)
        self.msg_log(msg, SandeshLevel.SYS_DEBUG)

    def event_tick_60(self):
        """Periodic (60s) housekeeping: refresh disk/NTP/core status and
        per-process resource usage, sending one NodeStatus UVE per group."""
        self.tick_count += 1
        own_version = None
        for group in self.process_state_db:
            # Any process in the group supplies the UVE name below.
            key = next(key for key in self.process_state_db[group])
            # get disk usage info periodically
            disk_usage_info = self.get_disk_usage()

            # typical ntp sync time is about 5 min - first time,
            # we scan only after 10 min
            if self.tick_count >= 10:
                self.check_ntp_status()
            if self.update_process_core_file_list():
                self.send_process_state_db([group])

            process_mem_cpu_usage = self.get_group_processes_mem_cpu_usage(
                group)

            # get system mem/cpu usage
            system_mem_cpu_usage_data = SysMemCpuUsageData(
                self.last_cpu, self.last_time)
            system_mem_usage = system_mem_cpu_usage_data.get_sys_mem_info(
                self.type_info._uve_node_type)
            system_cpu_usage = system_mem_cpu_usage_data.get_sys_cpu_info(
                self.type_info._uve_node_type)

            # update last_cpu/time after all processing is complete
            self.last_cpu = system_mem_cpu_usage_data.last_cpu
            self.last_time = system_mem_cpu_usage_data.last_time

            # send above encoded buffer
            node_status = NodeStatus(
                name=self.process_state_db[group][key].name,
                disk_usage_info=disk_usage_info,
                system_mem_usage=system_mem_usage,
                system_cpu_usage=system_cpu_usage,
                process_mem_cpu_usage=process_mem_cpu_usage)
            # encode other core file
            if self.update_all_core_file():
                node_status.all_core_file_list = self.all_core_file_list

            # only include the package version when it changed
            own_version = self._get_package_version()
            if (own_version != self.own_version):
                node_status.own_version = own_version
            node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                            data=node_status)

            self.msg_log('DBG: event_tick_60: node_status=%s' % node_status,
                         SandeshLevel.SYS_DEBUG)
            node_status_uve.send()
        # remember the version we just compared against for the next tick
        self.own_version = own_version

    def do_periodic_events(self):
        self.event_tick_60()

    def run_periodically(self, function, interval, *args, **kwargs):
        """Call *function* forever, once per *interval* seconds. Sleeps for
        the unused part of the interval; when a call overruns the interval,
        an error is logged and the next call starts immediately."""
        while True:
            start = time.time()
            function(*args, **kwargs)

            took = time.time() - start
            slack = interval - took
            if slack > 0:
                gevent.sleep(slack)
            else:
                self.msg_log(
                    'function %s duration exceeded %f interval (took %f)' %
                    (function.__name__, interval, took),
                    SandeshLevel.SYS_ERR)

    # end run_periodically

    def runforever(self):
        self.process_info_manager.runforever()

    # end runforever

    def nodemgr_sighup_handler(self):
        """SIGHUP handler: re-read the configuration and reconnect sandesh
        when the collector list changed (reconnecting also rebalances the
        collector connections)."""
        # BUG FIX: collector_list was only assigned inside the COLLECTOR
        # branch, so a config without that section (or without server_list)
        # raised NameError at the 'if collector_list' check below. The
        # sibling handler earlier in this file initializes it; do the same.
        collector_list = []
        config = ConfigParser.SafeConfigParser()
        config.read(self.config_file)
        if 'COLLECTOR' in config.sections():
            try:
                collector = config.get('COLLECTOR', 'server_list')
                collector_list = collector.split()
            except ConfigParser.NoOptionError:
                pass

        if collector_list:
            new_chksum = hashlib.md5("".join(collector_list)).hexdigest()
            if new_chksum != self.collector_chksum:
                self.collector_chksum = new_chksum
                self.random_collectors = \
                    random.sample(collector_list, len(collector_list))
            # Reconnect to achieve load-balance irrespective of list
            self.sandesh_instance.reconfig_collectors(self.random_collectors)
# Ejemplo n.º 5 (example-site separator; commented out so the stray marker
# and bare literal no longer break Python syntax)
# 0
    def __init__(self, config, type_info, unit_names, update_process_list=False):
        """Initialize sandesh (against the global instance), logging, system
        data collection and the platform-specific process-info manager.

        config: parsed nodemgr configuration.
        type_info: node-type descriptor (module name, object table, ...).
        unit_names: process/unit names this manager should watch.
        update_process_list: passed through to DockerProcessInfoManager.
        """
        self.config = config
        self.type_info = type_info
        # Core-file retention limits used by the core-handling logic.
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.tick_count = 0
        # prev != fail bits initially so the first status UVE is always sent.
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.sandesh_instance = sandesh_global
        self.curr_build_info = None
        self.new_build_info = None
        self.hostip = self.config.hostip
        self.hostname = socket.getfqdn(self.hostip)

        # Checksum of the sorted collector list; the SIGHUP handler uses it
        # to detect configuration changes.
        self.collector_chksum = 0
        self.random_collectors = list()
        if config.collectors:
            config.collectors.sort()
            self.collector_chksum = hashlib.md5("".join(config.collectors)).hexdigest()
            # Randomize collector order per instance for load balancing.
            self.random_collectors = random.sample(config.collectors, len(config.collectors))

        ConnectionState.init(self.sandesh_instance, self.hostname,
            self.type_info._module_name, self.instance_id,
            staticmethod(ConnectionState.get_conn_state_cb),
            NodeStatusUVE, NodeStatus, self.type_info._object_table,
            self._get_process_state_cb)
        self.sandesh_instance.init_generator(
            self.type_info._module_name, self.hostname,
            self.type_info._node_type_name, self.instance_id,
            self.random_collectors, self.type_info._module_name,
            ServiceHttpPortMap[self.type_info._module_name],
            ['nodemgr.common.sandesh'] + self.type_info._sandesh_packages,
            config=SandeshConfig.from_parser_arguments(self.config))
        self.sandesh_instance.set_logging_params(
            enable_local_log=self.config.log_local,
            category=self.config.log_category,
            level=self.config.log_level,
            file=self.config.log_file,
            enable_syslog=self.config.use_syslog,
            syslog_facility=self.config.syslog_facility)
        self.logger = self.sandesh_instance.logger()

        # Callbacks invoked by the process-info manager on events.
        event_handlers = {}
        event_handlers['PROCESS_STATE'] = self._event_process_state
        event_handlers['PROCESS_COMMUNICATION'] = self._event_process_communication
        event_handlers['PROCESS_LIST_UPDATE'] = self._update_current_processes
        if platform.system() == 'Windows':
            self.system_data = WindowsSysData()
            self.process_info_manager = WindowsProcessInfoManager(event_handlers)
        else:
            # SIGHUP triggers the collector-list reload on non-Windows hosts.
            gevent.signal(signal.SIGHUP, self.nodemgr_sighup_handler)
            self.system_data = LinuxSysData(self.msg_log, self.config.corefile_path)
            if DockerProcessInfoManager and (utils.is_running_in_docker()
                                             or utils.is_running_in_kubepod()):
                self.process_info_manager = DockerProcessInfoManager(
                    type_info._module_type, unit_names, event_handlers,
                    update_process_list)
            else:
                # No supported process manager detected; nodemgr cannot run.
                self.msg_log('Node manager could not detect process manager',
                            SandeshLevel.SYS_ERR)
                exit(-1)

        # Seed the db with whatever is running now and announce each group.
        self.process_state_db = self._get_current_processes()
        for group in self.process_state_db:
            self._send_init_info(group)
# Ejemplo n.º 6 (example-site separator; commented out so the stray marker
# and bare literal no longer break Python syntax)
# 0
class EventManager(object):
    """Base node-manager event loop: tracks monitored processes and
    publishes their state and resource usage as sandesh UVEs."""

    # NOTE(review): these are class-level (shared) attributes. __init__
    # rebinds process_state_db per instance, but group_names is mutated
    # in place by _get_current_processes and is therefore shared across
    # instances -- confirm single-instance usage.
    group_names = []
    process_state_db = {}
    # Bitmask values OR-ed into fail_status_bits.
    FAIL_STATUS_DUMMY = 0x1
    FAIL_STATUS_DISK_SPACE = 0x2
    FAIL_STATUS_SERVER_PORT = 0x4
    FAIL_STATUS_NTP_SYNC = 0x8
    FAIL_STATUS_DISK_SPACE_NA = 0x10

    def __init__(self, config, type_info, unit_names, update_process_list=False):
        """Bootstrap sandesh connectivity, logging, the platform-specific
        process-info manager and the initial process-state database.

        config: parsed nodemgr configuration (collectors, logging options,
            hostip, corefile_path, ...).
        type_info: per-node-type descriptor (module name, node type name,
            object table, sandesh packages, ...).
        unit_names: names of the units/processes to monitor.
        update_process_list: passed through to the process-info manager.

        Calls exit(-1) when no supported process manager is detected on a
        non-Windows host.
        """
        self.config = config
        self.type_info = type_info
        # Core-file retention policy used by _send_process_state: keep at
        # most max_cores files, preserving the oldest max_old_cores and
        # the newest max_new_cores.
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.tick_count = 0
        # prev(1) != fail(0) guarantees the first call to
        # send_nodemgr_process_status() actually publishes a status UVE.
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.sandesh_instance = sandesh_global
        self.curr_build_info = None
        self.new_build_info = None
        self.hostip = self.config.hostip
        self.hostname = socket.getfqdn(self.hostip)

        # Checksum of the sorted collector list; nodemgr_sighup_handler
        # compares against it to detect collector reconfiguration.
        self.collector_chksum = 0
        self.random_collectors = list()
        if config.collectors:
            config.collectors.sort()
            self.collector_chksum = hashlib.md5("".join(config.collectors)).hexdigest()
            # Randomized order spreads collector load across nodes.
            self.random_collectors = random.sample(config.collectors, len(config.collectors))

        # Register the per-node connection-state machinery, then start the
        # sandesh generator and its logging before anything can log.
        ConnectionState.init(self.sandesh_instance, self.hostname,
            self.type_info._module_name, self.instance_id,
            staticmethod(ConnectionState.get_conn_state_cb),
            NodeStatusUVE, NodeStatus, self.type_info._object_table,
            self._get_process_state_cb)
        self.sandesh_instance.init_generator(
            self.type_info._module_name, self.hostname,
            self.type_info._node_type_name, self.instance_id,
            self.random_collectors, self.type_info._module_name,
            ServiceHttpPortMap[self.type_info._module_name],
            ['nodemgr.common.sandesh'] + self.type_info._sandesh_packages,
            config=SandeshConfig.from_parser_arguments(self.config))
        self.sandesh_instance.set_logging_params(
            enable_local_log=self.config.log_local,
            category=self.config.log_category,
            level=self.config.log_level,
            file=self.config.log_file,
            enable_syslog=self.config.use_syslog,
            syslog_facility=self.config.syslog_facility)
        self.logger = self.sandesh_instance.logger()

        # Event callbacks handed to whichever process manager we detect.
        event_handlers = {}
        event_handlers['PROCESS_STATE'] = self._event_process_state
        event_handlers['PROCESS_COMMUNICATION'] = self._event_process_communication
        event_handlers['PROCESS_LIST_UPDATE'] = self._update_current_processes
        if platform.system() == 'Windows':
            self.system_data = WindowsSysData()
            self.process_info_manager = WindowsProcessInfoManager(event_handlers)
        else:
            # SIGHUP triggers a collector-list reload (see
            # nodemgr_sighup_handler).
            gevent.signal(signal.SIGHUP, self.nodemgr_sighup_handler)
            self.system_data = LinuxSysData(self.msg_log, self.config.corefile_path)
            if DockerProcessInfoManager and (utils.is_running_in_docker()
                                             or utils.is_running_in_kubepod()):
                self.process_info_manager = DockerProcessInfoManager(
                    type_info._module_type, unit_names, event_handlers,
                    update_process_list)
            else:
                self.msg_log('Node manager could not detect process manager',
                            SandeshLevel.SYS_ERR)
                exit(-1)

        # Seed the db with whatever is already running and announce it.
        self.process_state_db = self._get_current_processes()
        for group in self.process_state_db:
            self._send_init_info(group)

    def msg_log(self, msg, level):
        """Forward *msg* to the sandesh logger, translating the sandesh
        level into the corresponding python logging level."""
        py_level = SandeshLogger.get_py_logger_level(level)
        self.logger.log(py_level, msg)

    def _get_process_name(self, process_info):
        if process_info['name'] != process_info['group']:
            process_name = process_info['group'] + ":" + process_info['name']
        else:
            process_name = process_info['name']
        return process_name

    # Build a snapshot of every process currently known to the process
    # manager.
    def _get_current_processes(self):
        """Return {group: {process_name: ProcessStat}} for all processes
        reported by the process-info manager, registering any new group
        names along the way."""
        snapshot = {}
        all_processes = self.process_info_manager.get_all_processes()
        self.msg_log("DBG: get_current_processes: '%s'" % all_processes,
                     SandeshLevel.SYS_DEBUG)
        for info in all_processes:
            name = self._get_process_name(info)
            stat = ProcessStat(name, host_ip=self.hostip)
            stat.process_state = info['statename']
            if 'start' in info:
                stat.start_time = str(info['start'])
                stat.start_count += 1
            stat.pid = int(info['pid'])
            if stat.group not in self.group_names:
                self.group_names.append(stat.group)
            snapshot.setdefault(stat.group, {})[name] = stat
        return snapshot

    # Re-sync process_state_db when the set of processes on the node may
    # have changed.
    def _update_current_processes(self):
        """Diff a fresh process snapshot against process_state_db and
        dispatch delete/add handlers for processes that vanished or
        appeared."""
        latest_db = self._get_current_processes()
        msg = ("DBG: update_current_processes: process_state_db='%s'"
               % latest_db)
        self.msg_log(msg, SandeshLevel.SYS_DEBUG)
        old_names = set(name for group in self.process_state_db
                        for name in self.process_state_db[group])
        new_names = set(name for group in latest_db
                        for name in latest_db[group])
        # Deletions are handled before additions, as before.
        for name in old_names - new_names:
            self._delete_process_handler(name)
        for name in new_names - old_names:
            for procs in latest_db.values():
                if name in procs:
                    self._add_process_handler(name, procs[name])

    # process is deleted, send state & remove it from db
    def _delete_process_handler(self, deleted_process):
        for group in self.process_state_db:
            if deleted_process in self.process_state_db[group]:
                self.process_state_db[group][deleted_process].deleted = True
                self.send_process_state_db([group])
                del self.process_state_db[group][deleted_process]
                if not self.process_state_db[group]:
                    del self.process_state_db[group]
                return

    # new process added, update db & send state
    def _add_process_handler(self, added_process, process_info):
        group_val = process_info.group
        self.process_state_db[group_val][added_process] = process_info
        self.send_process_state_db([group_val])

    def _get_process_state_cb(self):
        state, description = self._get_process_state(self.fail_status_bits)
        return state, description

    def _get_build_info(self):
        """Return build-info JSON, computing it from the installed package
        version on first use and caching it afterwards."""
        # Fast path: already computed.
        if self.curr_build_info is not None:
            return self.curr_build_info

        version_parts = self._get_package_version().split('-')
        build_id = version_parts[0]
        build_number = version_parts[1] if len(version_parts) > 1 else "unknown"
        self.new_build_info = (build_info + '"build-id" : "' + build_id +
                               '", "build-number" : "' + build_number + '"}]}')
        if self.new_build_info != self.curr_build_info:
            self.curr_build_info = self.new_build_info
        return self.curr_build_info

    def _update_process_core_file_list(self):
        ret_value = False
        corenames = self.system_data.get_corefiles()
        process_state_db_tmp = copy.deepcopy(self.process_state_db)

        for corename in corenames:
            try:
                exec_name = corename.split('.')[1]
            except IndexError:
                # Ignore the directories and the files that do not comply
                # with the core pattern
                continue
            for group in self.process_state_db:
                for key in self.process_state_db[group]:
                    if key.startswith(exec_name):
                        process_state_db_tmp[group][key].core_file_list.append(corename.rstrip())

        for group in self.process_state_db:
            for key in self.process_state_db[group]:
                if set(process_state_db_tmp[group][key].core_file_list) != set(
                        self.process_state_db[group][key].core_file_list):
                    self.process_state_db[group][key].core_file_list = process_state_db_tmp[group][key].core_file_list
                    ret_value = True

        return ret_value

    def send_process_state_db(self, group_names):
        """Publish a NodeStatus UVE per group in *group_names*, carrying a
        ProcessInfo entry for each process in that group.

        A group's UVE is marked deleted only when every one of its
        processes is flagged deleted; groups with no processes are
        skipped entirely.
        """
        name = self.hostname
        for group in group_names:
            process_infos = []
            delete_status = True
            for key in self.process_state_db[group]:
                pstat = self.process_state_db[group][key]
                process_info = ProcessInfo()
                process_info.process_name = key
                process_info.process_state = pstat.process_state
                process_info.start_count = pstat.start_count
                process_info.stop_count = pstat.stop_count
                process_info.exit_count = pstat.exit_count
                process_info.last_start_time = pstat.start_time
                process_info.last_stop_time = pstat.stop_time
                process_info.last_exit_time = pstat.exit_time
                process_info.core_file_list = pstat.core_file_list
                process_infos.append(process_info)
                # in tor-agent case, we should use tor-agent name as uve key
                # NOTE(review): with several processes in a group, 'name'
                # keeps the last one's name -- presumably intended for the
                # single-process tor-agent case; confirm.
                name = pstat.name
                if pstat.deleted == False:
                    delete_status = False

            if not process_infos:
                continue

            # send node UVE
            node_status = NodeStatus()
            node_status.name = name
            node_status.deleted = delete_status
            node_status.process_info = process_infos
            node_status.build_info = self._get_build_info()
            node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                            data=node_status)
            msg = ('send_process_state_db: Sending UVE: {}'.format(node_status_uve))
            self.msg_log(msg, SandeshLevel.SYS_INFO)
            node_status_uve.send()

    def _send_process_state(self, process_info):
        """Update the stat record for one process-state event and, for
        RUNNING/STOPPED/EXITED transitions, publish the group's UVE.

        For unexpected exits, looks for a matching core file and appends
        it to the process's core_file_list, pruning middle entries when
        the list would exceed max_cores (oldest max_old_cores and newest
        max_new_cores are preserved).
        """
        pname = self._get_process_name(process_info)
        # update process stats
        # Reuse the existing stat when the process is known, otherwise
        # start a fresh one.
        if pname in list(key for group in self.process_state_db for key in self.process_state_db[group]):
            for group in self.process_state_db:
                if pname in self.process_state_db[group]:
                    proc_stat = self.process_state_db[group][pname]
        else:
            proc_stat = ProcessStat(pname, host_ip=self.hostip)

        pstate = process_info['state']
        proc_stat.process_state = pstate

        send_uve = False
        # Timestamps below are microseconds since the epoch, as strings.
        if (pstate == 'PROCESS_STATE_RUNNING'):
            proc_stat.start_count += 1
            proc_stat.start_time = str(int(time.time() * 1000000))
            send_uve = True
            proc_stat.pid = int(process_info['pid'])

        if (pstate == 'PROCESS_STATE_STOPPED'):
            proc_stat.stop_count += 1
            send_uve = True
            proc_stat.stop_time = str(int(time.time() * 1000000))
            proc_stat.last_exit_unexpected = False
            # Reset cpu bookkeeping so usage sampling restarts cleanly.
            proc_stat.last_cpu = None
            proc_stat.last_time = 0

        if (pstate == 'PROCESS_STATE_EXITED'):
            proc_stat.exit_count += 1
            send_uve = True
            proc_stat.exit_time = str(int(time.time() * 1000000))
            proc_stat.last_cpu = None
            proc_stat.last_time = 0
            if not process_info['expected']:
                self.msg_log('%s with pid: %s exited abnormally' %
                    (pname, process_info['pid']), SandeshLevel.SYS_ERR)
                proc_stat.last_exit_unexpected = True
                # check for core file for this exit
                # NOTE(review): the concatenation implies process_info['pid']
                # is a string here (unlike the int() cast above) -- confirm
                # the event payload type.
                corename = self.system_data.find_corefile(
                    "core.[A-Za-z]*." + process_info['pid'] + "*")

                if ((corename is not None) and (len(corename.rstrip()) >= 1)):
                    self.msg_log('core file: %s' % (corename),
                        SandeshLevel.SYS_ERR)
                    # before adding to the core file list make
                    # sure that we do not have too many cores
                    self.msg_log('core_file_list: %s, max_cores: %d' %
                        (str(proc_stat.core_file_list), self.max_cores),
                        SandeshLevel.SYS_DEBUG)
                    if (len(proc_stat.core_file_list) >= self.max_cores):
                        # get rid of old cores
                        # Keep [0:max_old_cores] and the newest
                        # max_new_cores-1 entries; delete the middle band.
                        start = self.max_old_cores
                        end = len(proc_stat.core_file_list) - \
                                self.max_new_cores + 1
                        core_files_to_be_deleted = \
                            proc_stat.core_file_list[start:end]
                        self.system_data.remove_corefiles(core_files_to_be_deleted)
                        # now delete the cores from the list as well
                        del proc_stat.core_file_list[start:end]
                    # now add the new core to the core file list
                    proc_stat.core_file_list.append(corename.rstrip())
                    self.msg_log('# of cores for %s: %d' % (pname,
                        len(proc_stat.core_file_list)), SandeshLevel.SYS_DEBUG)

        send_init_uve = False
        # update process state database
        # A previously unseen group also needs its one-time init UVE.
        if not proc_stat.group in self.process_state_db:
            self.process_state_db[proc_stat.group] = {}
            send_init_uve = True
        self.process_state_db[proc_stat.group][pname] = proc_stat
        if send_uve:
            if (send_init_uve):
                self._send_init_info(proc_stat.group)
            self.send_process_state_db([proc_stat.group])

    def send_nodemgr_process_status(self):
        """Publish the nodemgr's own ProcessStatus UVE, but only when
        fail_status_bits changed since the last send.

        The final state is the worse of the fail-bit-derived state and the
        connection-state callback's verdict; their descriptions are
        concatenated.
        """
        if self.prev_fail_status_bits == self.fail_status_bits:
            return

        self.prev_fail_status_bits = self.fail_status_bits
        fail_status_bits = self.fail_status_bits
        state, description = self._get_process_state(fail_status_bits)
        conn_infos = ConnectionState._connection_map.values()
        (cb_state, cb_description) = ConnectionState.get_conn_state_cb(conn_infos)
        # A NON_FUNCTIONAL connection state overrides the fail-bit state.
        if (cb_state == ProcessState.NON_FUNCTIONAL):
            state = ProcessState.NON_FUNCTIONAL
        if description != '':
            description += ' '
        description += cb_description

        process_status = ProcessStatus(
            module_id=self.type_info._module_name, instance_id=self.instance_id,
            state=ProcessStateNames[state], description=description)
        process_status_list = []
        process_status_list.append(process_status)
        node_status = NodeStatus(name=self.hostname,
                        process_status=process_status_list)
        node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                        data=node_status)
        msg = ('send_nodemgr_process_status: Sending UVE:'
               + str(node_status_uve))
        self.msg_log(msg, SandeshLevel.SYS_INFO)
        node_status_uve.send()

    def _get_package_version(self):
        """Return the installed contrail-nodemgr package version, or the
        string "unknown" (with an error logged) when it cannot be read."""
        version = utils.get_package_version('contrail-nodemgr')
        if version is not None:
            return version
        self.msg_log('Error getting %s package version' % (
            'contrail-nodemgr'), SandeshLevel.SYS_ERR)
        return "unknown"

    def _send_init_info(self, group_name):
        """Send the one-time NodeStatus UVE for *group_name*: system CPU
        topology, build info and package versions."""
        # Any member of the group supplies the UVE name; take the first.
        key = next(key for key in self.process_state_db[group_name])
        # system_cpu_info
        sys_cpu = SystemCpuInfo()
        sys_cpu.num_socket = self.system_data.get_num_socket()
        sys_cpu.num_cpu = self.system_data.get_num_cpu()
        sys_cpu.num_core_per_socket = self.system_data.get_num_core_per_socket()
        sys_cpu.num_thread_per_core = self.system_data.get_num_thread_per_core()

        node_status = NodeStatus(
            name=self.process_state_db[group_name][key].name,
            system_cpu_info=sys_cpu,
            build_info=self._get_build_info())

        # installed/running package version
        # Both fields carry the same value: only the installed version is
        # available here.
        pkg_version = self._get_package_version()
        node_status.installed_package_version = pkg_version
        node_status.running_package_version = pkg_version

        node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                        data=node_status)
        node_status_uve.send()

    def _get_group_processes_mem_cpu_usage(self, group_name):
        """Sample mem/cpu usage for every RUNNING process of *group_name*.

        Returns a dict of per-process usage objects keyed by process name,
        and updates each stat's last_cpu/last_time bookkeeping for the
        next sampling round.
        """
        process_mem_cpu_usage = {}
        for key in self.process_state_db[group_name]:
            pstat = self.process_state_db[group_name][key]
            if pstat.process_state != 'PROCESS_STATE_RUNNING':
                continue
            mem_cpu_usage_data = (
                self.process_info_manager.get_mem_cpu_usage_data(
                    pstat.pid, pstat.last_cpu, pstat.last_time))
            process_mem_cpu = mem_cpu_usage_data.get_process_mem_cpu_info()
            # NOTE(review): nothing in this file sets 'pstat.pname'
            # (ProcessStat is constructed with a name argument and read via
            # pstat.name elsewhere) -- confirm ProcessStat defines 'pname'.
            # Also, '__key' is name-mangled to '_EventManager__key' here.
            process_mem_cpu.__key = pstat.pname
            process_mem_cpu_usage[process_mem_cpu.__key] = process_mem_cpu
            pstat.last_cpu = mem_cpu_usage_data.last_cpu
            pstat.last_time = mem_cpu_usage_data.last_time

        return process_mem_cpu_usage

    def _get_process_state(self, fail_status_bits):
        """Translate *fail_status_bits* into a (state, description) pair:
        FUNCTIONAL with an empty description when no bits are set,
        otherwise NON_FUNCTIONAL with the joined failure reasons."""
        if not fail_status_bits:
            return ProcessState.FUNCTIONAL, ''

        reasons = self.get_failbits_nodespecific_desc(fail_status_bits)
        if fail_status_bits & self.FAIL_STATUS_NTP_SYNC:
            reasons.append("NTP state unsynchronized.")
        return ProcessState.NON_FUNCTIONAL, " ".join(reasons)

    # Hook for subclasses: return extra, node-type-specific failure
    # descriptions for the given fail bits. Base class contributes none.
    def get_failbits_nodespecific_desc(self, fail_status_bits):
        return []

    def _event_process_state(self, process_info):
        """PROCESS_STATE event handler: log the event and forward it to
        _send_process_state."""
        self.msg_log("DBG: event_process_state:%s,group:%s,state:%s"
                     % (process_info['name'], process_info['group'],
                        process_info['state']),
                     SandeshLevel.SYS_DEBUG)
        self._send_process_state(process_info)

    def _event_process_communication(self, pdata):
        """PROCESS_COMMUNICATION event handler: log the 'flag:value'
        payload; no further action is taken."""
        flag, _, value = pdata.partition(":")
        self.msg_log("DBG: event_process_communication: Flag:" + flag +
                     " Value:" + value, SandeshLevel.SYS_DEBUG)

    def _event_tick_60(self):
        """60-second periodic tick: per group, refresh NTP status, core
        file lists, disk/mem/cpu usage, and publish a NodeStatus UVE."""
        self.tick_count += 1
        for group in self.process_state_db:
            # Any member of the group supplies the UVE name below.
            key = next(key for key in self.process_state_db[group])
            # get disk usage info periodically
            disk_usage_info = self.system_data.get_disk_usage()

            # typical ntp sync time is about 5 min - first time,
            # we scan only after 10 min
            if self.tick_count >= 10:
                if self.system_data.check_ntp_status():
                    self.fail_status_bits &= ~self.FAIL_STATUS_NTP_SYNC
                else:
                    self.fail_status_bits |= self.FAIL_STATUS_NTP_SYNC
                self.send_nodemgr_process_status()
            if self._update_process_core_file_list():
                self.send_process_state_db([group])

            process_mem_cpu_usage = self._get_group_processes_mem_cpu_usage(group)

            # get system mem/cpu usage
            system_mem_usage = self.system_data.get_sys_mem_info(
                self.type_info._uve_node_type)
            system_cpu_usage = self.system_data.get_sys_cpu_info(
                self.type_info._uve_node_type)

            # send above encoded buffer
            node_status = NodeStatus(
                name=self.process_state_db[group][key].name,
                disk_usage_info=disk_usage_info,
                system_mem_usage=system_mem_usage,
                system_cpu_usage=system_cpu_usage,
                process_mem_cpu_usage=process_mem_cpu_usage)
            # encode other core file
            if self.system_data.update_all_core_file():
                self.send_process_state_db(self.group_names)
                node_status.all_core_file_list = self.all_core_file_list
            node_status_uve = NodeStatusUVE(table=self.type_info._object_table,
                                            data=node_status)

            self.msg_log('DBG: event_tick_60: node_status=%s' % node_status,
                         SandeshLevel.SYS_DEBUG)
            node_status_uve.send()

    def send_init_data(self):
        self.send_nodemgr_process_status()
        self.send_process_state_db(self.group_names)

    # can be overriden in children
    def do_periodic_events(self):
        self._event_tick_60()

    def run_periodically(self, function, interval, *args, **kwargs):
        while True:
            before = time.time()
            function(*args, **kwargs)

            duration = time.time() - before
            if duration < interval:
                gevent.sleep(interval - duration)
            else:
                self.msg_log(
                    'function %s duration exceeded %f interval (took %f)'
                        % (function.__name__, interval, duration),
                    SandeshLevel.SYS_ERR)

    def runforever(self):
        self.process_info_manager.runforever()

    def nodemgr_sighup_handler(self):
        """SIGHUP handler: re-read the config file and, when the COLLECTOR
        server_list changed, reshuffle the collector list and reconnect
        sandesh to it."""
        collector_list = list()
        config = ConfigParser.SafeConfigParser()
        config.read([self.config.config_file_path])
        if 'COLLECTOR' in config.sections():
            try:
                collector = config.get('COLLECTOR', 'server_list')
                collector_list = collector.split()
            except ConfigParser.NoOptionError:
                # No server_list option: treat as an empty collector list.
                pass
        # Checksum the sorted list so pure ordering changes in the config
        # do not trigger a reconnect.
        # NOTE(review): hashlib.md5 over a joined str works on Python 2
        # only; Python 3 requires bytes -- confirm target interpreter
        # (the ConfigParser.SafeConfigParser spelling is also Python 2).
        collector_list.sort()
        new_chksum = hashlib.md5("".join(collector_list)).hexdigest()
        if new_chksum != self.collector_chksum:
            self.collector_chksum = new_chksum
            # Randomized order spreads collector load across nodes.
            self.random_collectors = random.sample(collector_list, len(collector_list))
            self.sandesh_instance.reconfig_collectors(self.random_collectors)
# --- Example #7 (score: 0) -- scraped-example separator, commented out so the file parses ---
    def __init__(self, config, type_info, rule_file, sandesh_instance,
                 unit_names, update_process_list=False):
        self.config = config
        self.type_info = type_info
        self.stdin = sys.stdin
        self.stdout = sys.stdout
        self.stderr = sys.stderr
        self.rule_file = rule_file
        self.rules_data = {'Rules': []}
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.core_dir_modified_time = 0
        self.tick_count = 0
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.collector_addr = self.config.collectors
        self.sandesh_instance = sandesh_instance
        self.curr_build_info = None
        self.new_build_info = None
        self.last_cpu = None
        self.last_time = 0
        self.installed_package_version = None
        SupervisorEventsReq.handle_request = self.sandesh_supervisor_handle_request
        event_handlers = {}
        event_handlers['PROCESS_STATE'] = self.event_process_state
        event_handlers['PROCESS_COMMUNICATION'] = self.event_process_communication
        event_handlers['PROCESS_LIST_UPDATE'] = self.update_current_process
        ConnectionState.init(self.sandesh_instance, socket.gethostname(),
            self.type_info._module_name, self.instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE, NodeStatus, self.type_info._object_table)
        self.sandesh_instance.init_generator(
            self.type_info._module_name, socket.gethostname(),
            self.type_info._node_type_name, self.instance_id,
            self.collector_addr, self.type_info._module_name,
            ServiceHttpPortMap[self.type_info._module_name],
            ['nodemgr.common.sandesh'] + self.type_info._sandesh_packages,
            config=SandeshConfig.from_parser_arguments(self.config))
        self.sandesh_instance.set_logging_params(
            enable_local_log=self.config.log_local,
            category=self.config.log_category,
            level=self.config.log_level,
            file=self.config.log_file,
            enable_syslog=self.config.use_syslog,
            syslog_facility=self.config.syslog_facility)
        self.logger = self.sandesh_instance.logger()

        # TODO: handle difference between 'fat' containers and microservices smartly
        if DockerProcessInfoManager and (is_running_in_docker() or is_running_in_kubepod()):
            self.process_info_manager = DockerProcessInfoManager(
                unit_names, event_handlers, update_process_list)
        elif is_systemd_based():
            self.process_info_manager = SystemdProcessInfoManager(
                unit_names, event_handlers, update_process_list)
        elif 'SUPERVISOR_SERVER_URL' in os.environ:
            self.process_info_manager = SupervisorProcessInfoManager(
                self.stdin, self.stdout, self.type_info._supervisor_serverurl,
                event_handlers, update_process_list)
        else:
            self.msg_log('Node manager could not detect process manager',
                         SandeshLevel.SYS_ERR)
            exit(-1)
        self.add_current_process()
        for group in self.process_state_db:
            self.send_init_info(group)
        self.third_party_process_dict = self.type_info._third_party_processes