def _check_error_dict(self, force=False):
    """Flush accumulated python error records via mail.

    Scans the per-pid error dict; every entry that is forced or older
    than 60 seconds is turned into a mail (subject plus numbered error
    lines) and sent when ``log.send.errormails`` is enabled.  Flushed
    entries are removed from the dict afterwards.

    :param force: when True, flush every entry regardless of age
    """
    c_name = process_tools.get_cluster_name()
    mails_sent = 0
    s_time = time.time()
    ep_dels = []
    # iterate over a snapshot, consistent with the sibling implementation
    for ep, es in list(self.__eg_dict.items()):
        t_diff = s_time - es["last_update"]
        # negative t_diff guards against the clock jumping backwards
        if force or (t_diff < 0 or t_diff > 60):
            subject = "Python error for pid {:d} on {}@{} ({}, {})".format(
                ep,
                process_tools.get_fqdn()[0],
                c_name,
                process_tools.get_machine_name(),
                clusterid.get_cluster_id() or "N/A",
            )
            err_lines = "".join(es["error_str"]).split("\n")
            # number the error lines starting at 1
            msg_body = "\n".join(
                [
                    "Processinfo {}".format(self._get_process_info(es))
                ] + [
                    "{:3d} {}".format(line_num, line) for line_num, line in enumerate(err_lines, 1)
                ]
            )
            if self.CC.CS["log.send.errormails"]:
                self._send_mail(subject, msg_body)
                mails_sent += 1
            ep_dels.append(ep)
    for epd in ep_dels:
        del self.__eg_dict[epd]
    e_time = time.time()
    if mails_sent:
        self.log(
            "Sent {} in {}".format(
                logging_tools.get_plural("mail", mails_sent),
                logging_tools.get_diff_time_str(e_time - s_time),
            )
        )
def _check_error_dict(self, force=False):
    """Flush accumulated python error records via mail.

    Scans the error dict; every record that is forced or older than 60
    seconds is mailed (when ``log.send.errormails`` is enabled) and
    removed from the dict.

    :param force: when True, flush every entry regardless of age
    """
    mails_sent = 0
    s_time = time.time()
    ep_dels = []
    # snapshot the items so the dict can be modified safely afterwards
    for ep, es in list(self.__eg_dict.items()):
        t_diff = s_time - es.last_update
        # negative t_diff guards against the clock jumping backwards
        if force or (t_diff < 0 or t_diff > 60):
            # fixed typo: "occured" -> "occurred"
            subject = "An error occurred, PID={:d} on {}@{} ({})".format(
                es.pid,
                process_tools.get_fqdn()[0],
                clusterid.get_safe_cluster_id("N/A"),
                clusterid.get_safe_cluster_name("N/A"),
            )
            err_lines = es.error_str.split("\n")
            msg_body = "\n".join(
                [
                    "Processinfo {}".format(es.get_process_info())
                ] + [
                    "{:3d} {}".format(line_num, line) for line_num, line in enumerate(err_lines, 1)
                ]
            )
            if self.CC.CS["log.send.errormails"]:
                self._send_mail(subject, msg_body)
                mails_sent += 1
            ep_dels.append(ep)
    for epd in ep_dels:
        del self.__eg_dict[epd]
    e_time = time.time()
    if mails_sent:
        self.log(
            "Sent {} in {}".format(
                logging_tools.get_plural("mail", mails_sent),
                logging_tools.get_diff_time_str(e_time - s_time),
            )
        )
def __init__(self):
    """Set up the meta-server main process.

    Initializes config handling, network sockets, inotify watching,
    the mail sender and the periodic check timer.
    """
    self.__debug = global_config["DEBUG"]
    threading_tools.process_pool.__init__(self, "main")
    self.CC.init(icswServiceEnum.meta_server, global_config, native_logging=True)
    self.CC.check_config()
    # mirror selected config-store values into the global config
    self.CC.CS.copy_to_global_config(
        global_config,
        [
            ("meta.track.icsw.memory", "TRACK_CSW_MEMORY"),
            ("meta.check.time", "MIN_CHECK_TIME"),
            ("meta.check.memory.time", "MIN_MEMCHECK_TIME"),
        ]
    )
    global_config.add_config_entries(
        [
            ("STATE_DIR", configfile.str_c_var(os.path.join(self.CC.CS["meta.maindir"], ".srvstate"), source="dynamic")),
        ]
    )
    # check for correct rights
    self._check_dirs()
    self._init_network_sockets()
    self._init_inotify()
    self.register_exception("int_error", self._sigint)
    self.register_exception("term_error", self._sigint)
    # init stuff for mailing
    self.__new_mail = mail_tools.mail(
        None,
        "{}@{}".format(
            self.CC.CS["meta.mail.from.name"],
            process_tools.get_fqdn()[0],
        ),
        self.CC.CS["mail.target.address"],
    )
    self.__new_mail.set_server(self.CC.CS["mail.server"], self.CC.CS["mail.server"])
    # msi dict
    # backdate the timestamps so the first check run triggers immediately
    self.__last_update_time = time.time() - 2 * global_config["MIN_CHECK_TIME"]
    self.__last_memcheck_time = time.time() - 2 * global_config["MIN_MEMCHECK_TIME"]
    self._init_meminfo()
    self.CC.log_config()
    self._init_statemachine()
    self.__next_stop_is_restart = False
    # wait for transactions if necessary
    self.__exit_process = False
    self.__transition_timer = False
    # periodic service check every 30 seconds, first run immediately
    self.register_timer(self._check, 30, instant=True)
def _check_processes(self, service_list=None, force=False):
    """Check the state of monitored services and trigger transitions.

    :param service_list: restrict the check to these services; None / empty
                         checks all services (semantics of the empty list are
                         defined by container.check_system — verify there)
    :param force: force the state update regardless of throttling
    """
    self.__loopcount += 1
    act_time = time.time()
    # act_pid_dict = process_tools.get_proc_list()
    # memory tracking only every MIN_MEMCHECK_TIME seconds, and only if enabled
    _check_mem = act_time > self.__last_memcheck_time + global_config["MIN_MEMCHECK_TIME"] and global_config["TRACK_CSW_MEMORY"]
    if _check_mem:
        self.__last_memcheck_time = act_time
    if service_list is not None:
        self.def_ns.service = service_list
    else:
        self.def_ns.service = []
    _res_list = self.container.check_system(self.def_ns, self.server_instance)
    # always reset service to the empty list
    self.def_ns.service = []
    trans_list = self.service_state.update(
        _res_list,
        exclude=["meta-server"],
        throttle=[("uwsgi-init", 5)],
        # force first call
        force=(self.__loopcount == 1 or force),
    )
    if trans_list:
        self._new_transitions(trans_list)
        # notify by mail, but never on the very first loop or when forced
        if self.__loopcount > 1 and not force:
            _cluster_id = clusterid.get_cluster_id() or "N/A"
            mail_subject, mail_text = self.service_state.get_mail_text(trans_list)
            self.__new_mail.init_text()
            self.__new_mail.set_subject(
                "{} from {} ({})".format(
                    mail_subject,
                    process_tools.get_fqdn()[0],
                    _cluster_id,
                )
            )
            self.__new_mail.append_text(mail_text)
            _sm_stat, log_lines = self.__new_mail.send_mail()
            for line in log_lines:
                self.log(line)
    if _check_mem and _res_list:
        self._show_meminfo(_res_list)
    end_time = time.time()
    # log rounds that took more than one second
    if end_time - act_time > 1:
        self.log(
            "update {:d} took {}".format(
                self.__loopcount,
                logging_tools.get_diff_time_str(end_time - act_time),
            )
        )
def _send_mail(self, subject, msg_body):
    """Build and send one mail via the configured mail server.

    Send errors are logged with level CRITICAL and otherwise swallowed
    (best-effort delivery).

    :param subject: mail subject line
    :param msg_body: mail body text
    """
    new_mail = mail_tools.icswMail(
        subject,
        "{}@{}".format(
            self.CC.CS["log.mail.from.name"],
            process_tools.get_fqdn()[0],
        ),
        self.CC.CS["mail.target.address"],
        msg_body,
    )
    new_mail.set_server(self.CC.CS["mail.server"], self.CC.CS["mail.server"])
    try:
        send_stat, log_lines = new_mail.send_mail()
        for log_line in log_lines:
            self.log(" - ({:d}) {}".format(send_stat, log_line), logging_tools.LOG_LEVEL_OK)
    # narrowed from a bare except: so SystemExit / KeyboardInterrupt
    # still propagate
    except Exception:
        self.log(
            "error sending mail: {}".format(process_tools.get_except_info()),
            logging_tools.LOG_LEVEL_CRITICAL
        )
def __init__(self):
    """Set up the monitoring-server main process.

    Reads monitoring-related config vars from the database, initializes
    build control and network sockets and, when a monitoring daemon
    (MD_TYPE) is present, starts the worker processes.
    """
    long_host_name, mach_name = process_tools.get_fqdn()
    threading_tools.process_pool.__init__(self, "main", zmq=True)
    self.CC.init(icswServiceEnum.monitor_server, global_config)
    self.CC.check_config()
    db_tools.close_connection()
    self.CC.read_config_from_db(
        [
            ("NETSPEED_WARN_MULT", configfile.float_c_var(0.85)),
            ("NETSPEED_CRITICAL_MULT", configfile.float_c_var(0.95)),
            ("NETSPEED_DEFAULT_VALUE", configfile.int_c_var(10000000)),
            ("CHECK_HOST_ALIVE_PINGS", configfile.int_c_var(5)),
            ("CHECK_HOST_ALIVE_TIMEOUT", configfile.float_c_var(5.0)),
            ("ENABLE_COLLECTD", configfile.bool_c_var(False)),
            ("ENABLE_NAGVIS", configfile.bool_c_var(False)),
            ("ENABLE_FLAP_DETECTION", configfile.bool_c_var(False)),
            ("NAGVIS_DIR", configfile.str_c_var("/opt/nagvis4icinga")),
            ("NAGVIS_URL", configfile.str_c_var("/nagvis")),
            ("NONE_CONTACT_GROUP", configfile.str_c_var("none_group")),
            ("FROM_ADDR", configfile.str_c_var(long_host_name)),
            ("LOG_EXTERNAL_COMMANDS", configfile.bool_c_var(False)),
            ("LOG_PASSIVE_CHECKS", configfile.bool_c_var(False)),
            ("BUILD_CONFIG_ON_STARTUP", configfile.bool_c_var(True)),
            ("RELOAD_ON_STARTUP", configfile.bool_c_var(True)),
            ("RETAIN_HOST_STATUS", configfile.bool_c_var(True)),
            ("RETAIN_SERVICE_STATUS", configfile.bool_c_var(True)),
            ("PASSIVE_HOST_CHECKS_ARE_SOFT", configfile.bool_c_var(True)),
            ("RETAIN_PROGRAM_STATE", configfile.bool_c_var(False)),
            ("USE_HOST_DEPENDENCIES", configfile.bool_c_var(False)),
            ("USE_SERVICE_DEPENDENCIES", configfile.bool_c_var(False)),
            ("TRANSLATE_PASSIVE_HOST_CHECKS", configfile.bool_c_var(True)),
            ("USE_ONLY_ALIAS_FOR_ALIAS", configfile.bool_c_var(False)),
            ("HOST_DEPENDENCIES_FROM_TOPOLOGY", configfile.bool_c_var(False)),
            ("CCOLLCLIENT_TIMEOUT", configfile.int_c_var(10)),
            ("CSNMPCLIENT_TIMEOUT", configfile.int_c_var(20)),
            ("MAX_SERVICE_CHECK_SPREAD", configfile.int_c_var(5)),
            ("MAX_HOST_CHECK_SPREAD", configfile.int_c_var(5)),
            ("MAX_CONCURRENT_CHECKS", configfile.int_c_var(500)),
            ("CHECK_SERVICE_FRESHNESS", configfile.bool_c_var(True, help_string="enable service freshness checking")),
            ("CHECK_HOST_FRESHNESS", configfile.bool_c_var(True, help_string="enable host freshness checking")),
            ("SAFE_CC_NAME", configfile.bool_c_var(False)),
            ("SERVICE_FRESHNESS_CHECK_INTERVAL", configfile.int_c_var(60)),
            ("HOST_FRESHNESS_CHECK_INTERVAL", configfile.int_c_var(60)),
            ("SAFE_NAMES", configfile.bool_c_var(False, help_string="convert all command descriptions to safe names (without spaces), [%(default)s]")),
            ("ENABLE_ICINGA_LOG_PARSING", configfile.bool_c_var(True, help_string="collect icinga logs in the database (required for status history and kpis)")),
        ]
    )
    # copy flags
    self.__verbose = global_config["VERBOSE"]
    # log config
    self.CC.log_config()
    # re-insert config
    self.CC.re_insert_config()
    # init build control
    self.BC = BuildControl(self)
    self.register_exception("int_error", self._int_error)
    self.register_exception("term_error", self._int_error)
    self.register_exception("hup_error", self._hup_error)
    self._check_notification()
    self._check_special_commands()
    # sync master uuid
    self.__sync_master_uuid = None
    # from mixins
    self.VCM_check_md_version()
    self._init_network_sockets()
    if "MD_TYPE" in global_config:
        self.register_func("register_remote", self._register_remote)
        self.register_func("send_command", self._send_command)
        self.register_func("ocsp_results", self._ocsp_results)
        self.register_func("set_sync_master_uuid", self._set_sync_master_uuid)
        self.register_func("distribution_info", self._distribution_info)
        self.register_func("build_step", self.BC.build_step)
        self.add_process(SyncerProcess("syncer"), start=True)
        self.add_process(DynConfigProcess("dynconfig"), start=True)
        self.add_process(IcingaLogReader("IcingaLogReader"), start=True)
        self.add_process(KpiProcess("KpiProcess"), start=True)
        # wait for the processes to start
        time.sleep(0.5)
        self.register_timer(self._check_for_redistribute, 60 if global_config["DEBUG"] else 300)
        # only test code
        # self.send_to_remote_server(
        #     "cluster-server",
        #     unicode(server_command.srv_command(command="statusd")),
        # )
    else:
        # no monitoring daemon installed -> shut down
        self._int_error("no MD found")
def _send_vector(self, *args, **kwargs):
    """Serialize and push one machine vector to the configured target.

    Reads the send-struct for args[0] from self.cs, honours a
    ``pause_until`` back-off, emits either an XML or JSON vector (full
    info only every ``full_info_every``-th send) and, on send errors,
    pauses further sends for two minutes.

    :param args: args[0] is the send-struct id in self.cs
    """
    send_id = args[0]
    _struct = self.cs[send_id]
    _p_until = _struct.get("pause_until", 0)
    cur_time = int(time.time())
    if _p_until:
        if _p_until > cur_time:
            # still in back-off after a previous send error
            return
        else:
            self.log("clearing pause_until")
            del _struct["pause_until"]
    cur_id = _struct["sent"]
    # every full_info_every-th vector (default 10) carries full info
    full = cur_id % _struct.get("full_info_every", 10) == 0
    cur_id += 1
    _struct["sent"] = cur_id
    try:
        fqdn, _short_name = process_tools.get_fqdn()
    except Exception:
        # fall back to the plain machine name if FQDN lookup fails
        fqdn = process_tools.get_machine_name()
    send_format = _struct.get("format", "xml")
    if send_format == "xml":
        send_vector = self.build_xml(E, simple=not full)
        send_vector.attrib["name"] = _struct.get("send_name", fqdn) or fqdn
        send_vector.attrib["interval"] = "{:d}".format(_struct.get("send_every"))
        send_vector.attrib["uuid"] = self.module.main_proc.zeromq_id
    else:
        send_vector = self.build_json(simple=not full)
        send_vector[1]["name"] = _struct.get("send_name", fqdn) or fqdn
        send_vector[1]["interval"] = _struct.get("send_every")
        send_vector[1]["uuid"] = self.module.main_proc.zeromq_id
    # send to server
    t_host, t_port = (
        _struct.get("target", "127.0.0.1"),
        _struct.get("port", 8002),
    )
    try:
        if send_format == "xml":
            # serialize directly to a text string; the former
            # unicode(etree.tostring(...)) was a python-2 leftover and
            # raises NameError on python 3
            self.__socket_dict[send_id].send_unicode(
                etree.tostring(send_vector, encoding="unicode")
            )
        else:
            self.__socket_dict[send_id].send_unicode(json.dumps(send_vector))
    except Exception:
        exc_info = process_tools.get_except_info()
        self.log(
            "error sending to ({}, {:d}): {}".format(t_host, t_port, exc_info),
            logging_tools.LOG_LEVEL_ERROR
        )
        if exc_info.count("int_error"):
            # interrupt errors must propagate
            raise
        else:
            # problem sending, wait 2 minutes before the next attempt
            _diff_t = 120
            _w_time = cur_time + _diff_t
            self.log(
                "setting pause_until to {:d} (+{:d} seconds)".format(_w_time, _diff_t),
                logging_tools.LOG_LEVEL_WARN
            )
            _struct["pause_until"] = _w_time
            self.cs[send_id] = _struct
def __init__(self, options):
    """Set up the cluster-server main process.

    Runs in one of two modes: direct command execution (COMMAND set in
    the global config) or full server mode with network sockets, worker
    processes and the periodic update timer.

    :param options: parsed command-line options, stored for later use
    """
    threading_tools.process_pool.__init__(self, "main", zmq=True)
    long_host_name, mach_name = process_tools.get_fqdn()
    # COMMAND set -> execute one command directly instead of serving
    self.__run_command = True if global_config["COMMAND"].strip() else False
    # rewrite LOG_NAME if necessary
    if self.__run_command:
        self.CC.init(
            icswServiceEnum.cluster_server,
            global_config,
            init_msi_block=False,
            log_name_postfix="direct-{}".format(global_config["COMMAND"]),
        )
    else:
        self.CC.init(
            icswServiceEnum.cluster_server,
            global_config,
        )
    self.CC.check_config()
    # close DB conncetion (daemonize)
    if self.__run_command:
        global_config.mc_prefix = global_config["COMMAND"]
    else:
        # create hardware fingerprint
        self.CC.create_hfp()
        # enable memcache backend
        global_config.enable_mc()
    db_tools.close_connection()
    self.CC.read_config_from_db(
        [
            ("IMAGE_SOURCE_DIR", configfile.str_c_var("/opt/cluster/system/images")),
            ("MAILSERVER", configfile.str_c_var("localhost")),
            ("FROM_NAME", configfile.str_c_var("quotawarning")),
            ("FROM_ADDR", configfile.str_c_var(long_host_name)),
            ("VERSION", configfile.str_c_var(VERSION_STRING, database=False)),
            ("QUOTA_ADMINS", configfile.str_c_var("*****@*****.**")),
            ("MONITOR_QUOTA_USAGE", configfile.bool_c_var(False, info="enabled quota usage tracking")),
            ("TRACK_ALL_QUOTAS", configfile.bool_c_var(False, info="also track quotas without limit")),
            ("QUOTA_CHECK_TIME_SECS", configfile.int_c_var(3600)),
            ("USER_MAIL_SEND_TIME", configfile.int_c_var(3600, info="time in seconds between two mails")),
            ("SERVER_FULL_NAME", configfile.str_c_var(long_host_name, database=False)),
            ("SERVER_SHORT_NAME", configfile.str_c_var(mach_name, database=False)),
            ("DATABASE_DUMP_DIR", configfile.str_c_var("/opt/cluster/share/db_backup")),
            ("DATABASE_KEEP_DAYS", configfile.int_c_var(30)),
            ("USER_SCAN_TIMER", configfile.int_c_var(7200, info="time in seconds between two user_scan runs")),
            ("NEED_ALL_NETWORK_BINDS", configfile.bool_c_var(True, info="raise an error if not all bind() calls are successfull")),
        ]
    )
    if not self.__run_command:
        self.CC.re_insert_config()
    self.register_exception("int_error", self._int_error)
    self.register_exception("term_error", self._int_error)
    self.register_func("bg_finished", self._bg_finished)
    self._log_config()
    self._check_uuid()
    self._load_modules()
    self.__options = options
    self._set_next_backup_time(True)
    if self.__run_command:
        # direct mode: run the command once (instant), re-arm hourly
        self.register_timer(self._run_command, 3600, instant=True)
    else:
        self._init_network_sockets()
        if not self["exit_requested"]:
            self.init_notify_framework(global_config)
            self.add_process(CapabilityProcess("capability_process"), start=True)
            self.add_process(LicenseChecker("license_checker"), start=True)
            db_tools.close_connection()
            self.register_timer(
                self._update,
                2 if global_config["DEBUG"] else 30,
                instant=True
            )
def __init__(self):
    """Set up the mother-server (boot/node control) main process.

    Reads tftp/dhcp related config from the database, prepares the boot
    directory layout and system services (nfs, syslog, dhcp), then
    starts the kernel / command / control / icmp / snmp worker processes
    once the network sockets are bound.
    """
    _long_host_name, mach_name = process_tools.get_fqdn()
    threading_tools.icswProcessPool.__init__(self, "main")
    self.register_exception("int_error", self._int_error)
    self.register_exception("term_error", self._int_error)
    self.CC.init(icswServiceEnum.mother_server, global_config)
    self.CC.check_config()
    # close db connection (for daemonizing)
    db_tools.close_connection()
    self.debug = global_config["DEBUG"]
    self.srv_helper = service_tools.ServiceHelper(self.log)
    # command ports of the hoststatus / host-monitoring instances
    self.__hs_port = InstanceXML(quiet=True).get_port_dict(icswServiceEnum.hoststatus, command=True)
    self.__hm_port = InstanceXML(quiet=True).get_port_dict(icswServiceEnum.host_monitoring, command=True)
    # log config
    self.CC.read_config_from_db(
        [
            ("TFTP_LINK", configfile.StringConfigVar("/tftpboot")),
            ("TFTP_DIR", configfile.StringConfigVar(os.path.join(CLUSTER_DIR, "system", "tftpboot"))),
            ("CLUSTER_DIR", configfile.StringConfigVar(CLUSTER_DIR)),
            # in 10th of seconds
            ("NODE_BOOT_DELAY", configfile.IntegerConfigVar(50)),
            ("FANCY_PXE_INFO", configfile.BoolConfigVar(False)),
            ("SERVER_SHORT_NAME", configfile.StringConfigVar(mach_name)),
            ("WRITE_DHCP_CONFIG", configfile.BoolConfigVar(True)),
            ("DHCP_AUTHORITATIVE", configfile.BoolConfigVar(False)),
            ("DHCP_ONLY_BOOT_NETWORKS", configfile.BoolConfigVar(True)),
            ("MODIFY_NFS_CONFIG", configfile.BoolConfigVar(True)),
            ("NEED_ALL_NETWORK_BINDS", configfile.BoolConfigVar(True)),
        ]
    )
    # derived directories below TFTP_DIR / CLUSTER_DIR
    global_config.add_config_entries(
        [
            ("CONFIG_DIR", configfile.StringConfigVar(os.path.join(global_config["TFTP_DIR"], "config"))),
            ("ETHERBOOT_DIR", configfile.StringConfigVar(os.path.join(global_config["TFTP_DIR"], "etherboot"))),
            ("KERNEL_DIR", configfile.StringConfigVar(os.path.join(global_config["TFTP_DIR"], "kernels"))),
            ("SHARE_DIR", configfile.StringConfigVar(os.path.join(global_config["CLUSTER_DIR"], "share", "mother"))),
            ("NODE_SOURCE_IDX", configfile.IntegerConfigVar(LogSource.new("node").pk)),
        ]
    )
    self.CC.log_config()
    self.CC.re_insert_config()
    # prepare directories
    self._prepare_directories()
    # check netboot functionality
    self._check_netboot_functionality()
    # check nfs exports
    self._check_nfs_exports()
    # modify syslog config
    self._enable_syslog_config()
    # dhcp config
    self.write_dhcp_config()
    # check status entries
    self._check_status_entries()
    self.register_func("contact_hoststatus", self._contact_hoststatus)
    self.register_func("contact_hostmonitor", self._contact_hostmonitor)
    my_uuid = uuid_tools.get_uuid()
    self.log("cluster_device_uuid is '{}'".format(my_uuid.urn))
    if self._init_network_sockets():
        self.add_process(initat.mother.kernel.KernelSyncProcess("kernel"), start=True)
        self.add_process(initat.mother.command.ExternalCommandProcess("command"), start=True)
        self.add_process(initat.mother.control.NodeControlProcess("control"), start=True)
        self.add_process(initat.mother.control.ICMPProcess("icmp"), start=True)
        db_tools.close_connection()
        conf_dict = {
            key: global_config[key] for key in ["LOG_NAME", "LOG_DESTINATION", "VERBOSE"]
        }
        self.add_process(SNMPProcess("snmp_process", conf_dict=conf_dict), start=True)
        # send initial commands
        self.send_to_process(
            "kernel",
            "srv_command",
            str(server_command.srv_command(command="check_kernel_dir", insert_all_found="1"))
        )
        # restart hoststatus
        self.send_to_process("command", "delay_command", "/etc/init.d/hoststatus restart", delay_time=5)
        self.send_to_process("control", "refresh", refresh=False)
    else:
        self._int_error("bind problem")