def handle_info_action(self, action, srv_com):
    """Book-keep sync events on the current distribution record.

    ``sync_start`` stores transfer statistics taken from *srv_com*,
    ``sync_end`` stamps the end time and drops the record reference.
    Unknown actions are logged as errors.
    """
    cur_md = self.__md_struct
    if not cur_md:
        self.log("md_struct not set for action {}".format(action), logging_tools.LOG_LEVEL_WARN)
        return
    if action == "sync_start":
        cur_md.sync_start = cluster_timezone.localize(datetime.datetime.now())
        cur_md.num_files = int(srv_com["*num_files"])
        cur_md.size_data = int(srv_com["*size_data"])
        cur_md.num_transfers = 1
        cur_md.num_runs += 1
        cur_md.save(
            update_fields=[
                "sync_start", "num_files", "size_data", "num_transfers", "num_runs"
            ]
        )
    elif action == "sync_end":
        cur_md.sync_end = cluster_timezone.localize(datetime.datetime.now())
        cur_md.save(update_fields=["sync_end"])
        # record is complete, forget about it
        self.__md_struct = None
    else:
        self.log(
            "unknown action {} in handle_info_action()".format(action),
            logging_tools.LOG_LEVEL_ERROR
        )
def start_build(self, b_version, full_build, master=None):
    """Create the database record for a new config build and return it.

    :param b_version: build version tag, stored as self.config_version_build
    :param full_build: flag stored on the dist record
    :param master: mon_dist_master record, only used for slave builds
    :return: the freshly saved mon_dist_master / mon_dist_slave instance
    """
    # generate database entry for build
    self.config_version_build = b_version
    # NOTE(review): branch is taken on self.master (instance flag), the
    # master *parameter* is only consumed in the slave branch — verify
    if self.master:
        # re-check relayer version for master
        self.log("version for master is {}".format(self.vers_info))
        _md = mon_dist_master(
            device=self.monitor_server,
            version=self.config_version_build,
            full_build=full_build,
            build_start=cluster_timezone.localize(datetime.datetime.now()),
        )
    else:
        # slave build: remember the master record it belongs to
        self.__md_master = master
        self.log("version for slave {} is {}".format(
            self.monitor_server.full_name, self.vers_info))
        _md = mon_dist_slave(
            device=self.monitor_server,
            full_build=full_build,
            mon_dist_master=self.__md_master,
        )
    # copy version info from the raw info dict onto the record
    for _attr in ["relayer_version", "mon_version", "livestatus_version"]:
        setattr(_md, _attr, self.__raw_info["version"][_attr])
    _md.save()
    self.__md_struct = _md
    return self.__md_struct
def run(self, cur_bg):
    """Build one 'refresh' command per bootserver for the devices named in *cur_bg*.

    Returns a list of (background_job_run, srv_command, service_enum) tuples.
    """
    src_com = server_command.srv_command(source=cur_bg.command_xml)
    pk_list = [int(_pk) for _pk in src_com.xpath(".//ns:object/@pk")]
    # group the addressed devices by their bootserver
    by_bootserver = {}
    for cur_dev in device.objects.filter(Q(pk__in=pk_list)):
        if cur_dev.bootserver_id:
            by_bootserver.setdefault(cur_dev.bootserver_id, []).append(cur_dev)
    run_list = []
    for dev_list in by_bootserver.itervalues():
        # one target command per bootserver
        srv_com = server_command.srv_command(command="refresh")
        dev_elements = [
            srv_com.builder("device", name=dev.name, pk="{:d}".format(dev.pk))
            for dev in dev_list
        ]
        srv_com["devices"] = srv_com.builder("devices", *dev_elements)
        run_list.append(
            (
                background_job_run(
                    background_job=cur_bg,
                    server=dev_list[0].bootserver,
                    command_xml=unicode(srv_com),
                    start=cluster_timezone.localize(datetime.datetime.now()),
                ),
                srv_com,
                icswServiceEnum.mother_server,
            )
        )
    return run_list
def search_result(self, s_struct):
    """Parse the XML search output and store one package_search_result per solvable.

    Solvables pointing at a repository unknown to the database are logged
    as errors and skipped.
    """
    res_xml = etree.fromstring(s_struct.read())  # @UndefinedVariable
    # evaluate the solvable list once, it is used for counting and iteration
    solvables = res_xml.xpath(".//solvable", smart_strings=False)
    cur_search = s_struct.run_info["stuff"]
    cur_search.current_state = "done"
    cur_search.results = len(solvables)
    cur_search.last_search = cluster_timezone.localize(datetime.datetime.now())
    cur_search.save(
        update_fields=["last_search", "current_state", "results"])
    # map repository name -> repository record
    repo_dict = {cur_repo.name: cur_repo for cur_repo in package_repo.objects.all()}
    # delete previous search results
    cur_search.package_search_result_set.all().delete()
    self.log("found for {}: {:d}".format(cur_search.search_string,
                                         cur_search.results))
    for result in solvables:
        repo_name = result.attrib["repository"]
        if repo_name in repo_dict:
            package_search_result(
                name=result.attrib["name"],
                kind=result.attrib["kind"],
                arch=result.attrib["arch"],
                version=result.attrib["edition"],
                package_search=cur_search,
                copied=False,
                package_repo=repo_dict[repo_name],
            ).save()
        else:
            self.log(
                "unknown repository '{}' for package '{}'".format(
                    repo_name,
                    result.attrib["name"],
                ), logging_tools.LOG_LEVEL_ERROR)
def bg_check_notify(self): self.srv_routing.update() # step 1: delete pending jobs which are too old _timeout = background_job.objects.filter( Q(initiator=self.srv_routing.local_device.pk) & Q(state__in=["pre-init", "pending"]) & Q(valid_until__lte=cluster_timezone.localize( datetime.datetime.now()))) if _timeout.count(): self.log( "{} timeout".format( logging_tools.get_plural("background job", _timeout.count())), logging_tools.LOG_LEVEL_WARN) for _to in _timeout: _to.set_state("timeout") # print background_job.objects.filter(Q(initiator=self.srv_routing.local_device.pk) & Q(state="pre-init") & Q(valid_until_lt=datetime.datetime.now())) try: _pending = background_job.objects.filter( Q(initiator=self.srv_routing.local_device.pk) & Q(state="pre-init")).order_by("pk") # force evaluation _pc = _pending.count() except: self.log( "error accessing DB: {}".format( process_tools.get_except_info()), logging_tools.LOG_LEVEL_CRITICAL) # close connection db_tools.close_connection() else: if _pc: self.log("pending background jobs: {:d}".format(_pc)) for _cur_bg in _pending: self._handle_bgj(_cur_bg)
def bg_notify_handle_result(self, srv_com):
    """Persist the result of a finished background-job run and trigger the finish check."""
    res_str, res_state = srv_com.get_log_tuple()
    job_run_id = int(srv_com["*bgjrid"])
    self.__waiting_ids.remove(job_run_id)
    self.log(
        "got result for bgjrid {:d} ({:d}): {}".format(job_run_id, res_state, res_str),
        res_state
    )
    run_job = background_job_run.objects.select_related(
        "background_job"
    ).get(Q(pk=job_run_id))
    # store state / result / raw xml / end time on the run record
    run_job.state = server_command.log_level_to_srv_reply(res_state)
    run_job.result = res_str
    run_job.result_xml = str(srv_com)
    run_job.end = cluster_timezone.localize(datetime.datetime.now())
    run_job.save()
    self.bg_notify_check_for_bgj_finish(run_job.background_job)
def _interpret_qacct(self, cur_out, needed):
    """Parse qacct output into per-job dicts and feed them to _feed_qacct.

    Records are separated by lines starting with '==='. Each key/value
    line is cleaned up (ints, NONE markers, timestamps). If exactly
    *needed* records were found but none matched, old rms_job_run
    entries for that job are deleted and all records are re-fed with
    force=True.

    :param cur_out: raw qacct stdout
    :param needed: expected number of accounting records
    :return: (found, matched) tuple of counters
    """
    _found, _matched = (0, 0)
    _dict_list = []
    _dict = {}
    for _line in cur_out.split("\n"):
        if _line.startswith("==="):
            # record separator: flush the record collected so far
            if "jobnumber" in _dict:
                _found += 1
                _matched += self._feed_qacct(_dict)
                _dict_list.append(_dict)
            _dict = {}
        else:
            if _line.strip():
                _parts = _line.strip().split(None, 1)
                if len(_parts) > 1:
                    # simple cleanup
                    _key, _value = _parts
                    if _value.isdigit():
                        _value = int(_value)
                    elif _value in ["NONE", "undefined", "-/-"]:
                        _value = None
                    elif _key.endswith("time") and len(_value.split()) > 4:
                        # e.g. 'Thu Mar  3 10:00:00 2016'; assumes C-locale
                        # month/day names in qacct output — TODO confirm
                        _value = cluster_timezone.localize(
                            datetime.datetime.strptime(
                                _value, "%a %b %d %H:%M:%S %Y"))
                    _dict[_key] = _value
    # flush the trailing record (no terminating '===' line)
    if "jobnumber" in _dict:
        _found += 1
        _matched += self._feed_qacct(_dict)
        _dict_list.append(_dict)
    if needed == _found and not _matched:
        # all expected records present but none matched: wipe stale
        # rms_job_run rows and re-feed everything with force
        _to_del = rms_job_run.objects.filter(
            Q(rms_job__jobid=_dict_list[0]["jobnumber"]) &
            Q(rms_job__taskid=_dict_list[0]["taskid"]))
        self.log(
            " all matches found, removing old rms_job_run entries ({:d})"
            .format(_to_del.count()))
        _to_del.delete()
        _matched = 0
        for _dict in _dict_list:
            _matched += self._feed_qacct(_dict, force=True)
    return _found, _matched
def _load_cache(self):
    """Load monitoring hints for this host from the DB and compute cache validity."""
    self.__cache_created, self.__cache_age, self.__cache_valid = (0, 0, False)
    self.__cache = monitoring_hint.objects.filter(
        Q(device=self.host) & Q(m_type=self.ds_name)
    )
    # any hint that is neither cached ('c') nor persistent ('p') is
    # re-flagged as coming from the cache
    for hint in self.__cache:
        if hint.datasource not in ["c", "p"]:
            hint.datasource = "c"
            hint.save(update_fields=["datasource"])
    self.log(
        "loaded hints ({}) from db".format(
            logging_tools.get_plural("entry", len(self.__cache))
        )
    )
    if self.__cache:
        now = cluster_timezone.localize(datetime.datetime.now())
        # cache age is determined by the oldest entry
        self.__cache_age = max(
            [abs(now - hint.changed).total_seconds() for hint in self.__cache]
        )
        self.__cache_valid = self.__cache_age < self.Meta.cache_timeout
def _get_missing_dict(self):
    """Return a dict mapping job-id strings to lists of rms_job_run idx values
    for runs that still lack an accounting (qacct) record.

    Job ids are formatted '<jobid>.<taskid>' for array tasks, '<jobid>'
    otherwise.
    """
    # clean old jobs without a valid accounting log
    # NOTE(review): invalid_runs is only counted and logged here, never
    # deleted or modified despite the 'clean' comment — confirm whether
    # the cleanup happens elsewhere or is missing
    invalid_runs = rms_job_run.objects.filter(
        Q(qacct_called=False) &
        Q(end_time=None) &
        Q(start_time=None) &
        # older than 31 days
        Q(start_time_py__lt=cluster_timezone.localize(
            datetime.datetime.now()) -
          datetime.timedelta(seconds=31 * 24 * 3600)))
    self.log("invalid runs found: {:d}".format(invalid_runs.count()))
    _missing_ids = rms_job_run.objects.filter(
        Q(qacct_called=False)).values_list("idx", "rms_job__jobid",
                                           "rms_job__taskid")
    _mis_dict = {}
    for _entry in _missing_ids:
        if _entry[2]:
            # array job: include the task id
            _id = "{:d}.{:d}".format(
                _entry[1],
                _entry[2],
            )
        else:
            _id = "{:d}".format(_entry[1])
        _mis_dict.setdefault(_id, []).append(_entry[0])
    return _mis_dict
def search_result(self, s_struct):
    """Parse 'name version-release' result lines and store package_search_result rows.

    Fixes:
      * blank lines are filtered out up front instead of producing a
        spurious "cannot interpret line" error log per empty line
      * the result counter is only incremented after a successful save
      * dead locals (cur_mode / _ln) and the unused repo_dict removed
    """
    # keep only non-empty lines; each entry is the whitespace-split line
    found_packs = [
        _parts for _parts in (
            line.strip().split() for line in s_struct.read().split("\n")
        ) if _parts
    ]
    cur_search = s_struct.run_info["stuff"]
    cur_search.current_state = "done"
    _found = 0
    cur_search.results = _found
    cur_search.last_search = cluster_timezone.localize(datetime.datetime.now())
    cur_search.save(update_fields=["last_search", "current_state", "results"])
    # delete previous search results
    cur_search.package_search_result_set.all().delete()
    self.log("parsing results... ({:d} found)".format(len(found_packs)))
    for _parts in found_packs:
        try:
            p_name, p_ver = _parts
            version, release = p_ver.split("-", 1)
            new_sr = package_search_result(
                name=p_name,
                version="{}-{}".format(version, release),
                package_search=cur_search,
                copied=False,
            )
            new_sr.save()
        except:
            # broad catch kept on purpose: a malformed line or a failed
            # save is logged and must not abort the whole import
            self.log(
                "cannot interpret line '{}': {}".format(
                    _parts,
                    process_tools.get_except_info(),
                ),
                logging_tools.LOG_LEVEL_ERROR
            )
        else:
            # count only successfully stored results
            _found += 1
    cur_search.results = _found
    cur_search.save(update_fields=["results"])
    self.log("found for {}: {:d}".format(cur_search.search_string, cur_search.results))
    self.post_search()
def run(self, cur_bg):
    """Create a sync_sensor_threshold run on the collectd server, if one exists."""
    _src_com = server_command.srv_command(source=cur_bg.command_xml)
    # target command
    srv_com = server_command.srv_command(command="sync_sensor_threshold")
    srv_check = config_tools.icswServerCheck(service_type_enum=icswServiceEnum.collectd_server)
    run_list = []
    if srv_check.get_result().effective_device:
        # NOTE(review): the guard reads get_result().effective_device while
        # the run entry reads srv_check.effective_device — presumably the
        # same device, verify against config_tools
        run_list.append(
            (
                background_job_run(
                    background_job=cur_bg,
                    server=srv_check.effective_device,
                    command_xml=str(srv_com),
                    start=cluster_timezone.localize(datetime.datetime.now()),
                ),
                srv_com,
                icswServiceEnum.collectd_server,
            )
        )
    else:
        self.log("no valid rrd-collector found", logging_tools.LOG_LEVEL_ERROR)
    return run_list
def run(self, cur_bg):
    """Expand a SensorAction background job into per-bootserver mother commands.

    Looks up the SensorAction referenced by cur_bg.options; if it maps to
    a mother command, builds one srv_command per bootserver covering the
    devices addressed in the job's command xml.

    :return: list of (background_job_run, srv_command, service_enum) tuples
    """
    to_run = []
    sensor_action = SensorAction.objects.get(Q(pk=cur_bg.options))
    _mother_com = sensor_action.get_mother_command()
    if _mother_com is not None:
        _src_com = server_command.srv_command(source=cur_bg.command_xml)
        devs = device.objects.filter(
            Q(pk__in=[int(_pk) for _pk in _src_com.xpath(".//ns:object/@pk")]))
        # split for bootservers
        _boot_dict = {}
        for _dev in devs:
            if _dev.bootserver_id:
                _boot_dict.setdefault(_dev.bootserver_id, []).append(_dev)
        for srv_id, dev_list in _boot_dict.iteritems():
            # target command
            srv_com = server_command.srv_command(command=_mother_com[0])
            # only valid for one device
            # build_mother_element returns a list per device; flatten them
            srv_com["devices"] = srv_com.builder(
                "devices",
                *sum(
                    [
                        sensor_action.build_mother_element(srv_com.builder, dev)
                        for dev in dev_list
                    ],
                    []
                )
            )
            to_run.append(
                (
                    background_job_run(
                        background_job=cur_bg,
                        server=dev_list[0].bootserver,
                        command_xml=unicode(srv_com),
                        start=cluster_timezone.localize(datetime.datetime.now()),
                    ),
                    srv_com,
                    icswServiceEnum.mother_server,
                )
            )
    return to_run
def run(self, cur_bg):
    """Reload a virtual desktop on the cluster-server actually hosting it.

    :param cur_bg: background job whose command xml names the vdus pk
    :return: single-entry list of (background_job_run, srv_command, service_enum)
    """
    src_com = server_command.srv_command(source=cur_bg.command_xml)
    vdus = virtual_desktop_user_setting.objects.get(
        Q(pk=src_com.xpath(".//ns:object/@pk")[0])
    )
    srv_com = server_command.srv_command(command="reload_virtual_desktop")
    srv_com["vdus"] = vdus.pk
    return [
        (
            background_job_run(
                background_job=cur_bg,
                server=vdus.device,
                command_xml=unicode(srv_com),
                start=cluster_timezone.localize(datetime.datetime.now()),
            ),
            srv_com,
            icswServiceEnum.cluster_server,
        )
    ]
def _update_raw_data(self):
    """Read new icinga log data, handling first-run archive import and log rotation.

    Determines where the last read ended (or bootstraps from the archive),
    detects rotation via the logfile inode, catches up on archived files if
    needed, parses the current logfile, and finally checks whether the
    icinga process is alive, recording a 'down' entry if not.
    """
    self.log("checking icinga log")
    # collect warnings for not spamming in release mode
    self._warnings = defaultdict(lambda: 0)
    # check where we last have read for log rotation
    last_read = MonIcingaLastRead.get_last_read()
    if last_read:
        self.log("last icinga read until: {}".format(
            self._parse_timestamp(last_read.timestamp)))
    else:
        # first run: try to bootstrap the read position from the archive
        _arch_dir = IcingaLogReader.get_icinga_log_archive_dir()
        self.log("no earlier icinga log read, reading archive ({})".format(
            _arch_dir,
        ))
        files = glob.glob(
            os.path.join(_arch_dir, "{}*".format(global_config['MD_TYPE'])))
        last_read_element = self.parse_archive_files(files)
        if last_read_element:
            # store from archive but with empty position and line_number
            # (duplicate update, but guarantees a valid last_read here)
            last_read = self._update_last_read(0, last_read_element.timestamp,
                                               last_read_element.inode, 0)
        else:
            self.log("no earlier icinga log read and no archive data")
            # there was no earlier read and we weren't able to read anything
            # from the archive, so assume there is none
            last_read = MonIcingaLastRead()
            # safe time in past, but not too far cause we check logs of each day
            last_read.timestamp = int(
                ((datetime.datetime.now() - datetime.timedelta(days=1)) -
                 datetime.datetime(1970, 1, 1)).total_seconds())
            last_read.position = 0
            last_read.inode = 0
            last_read.line_number = 1
    try:
        logfile = codecs.open(self.get_icinga_log_file(), "r", "utf-8",
                              errors='replace')
    except IOError:
        self.log(
            "Failed to open log file {} : {}".format(
                self.get_icinga_log_file(),
                process_tools.get_except_info(),
            ), logging_tools.LOG_LEVEL_ERROR)
    else:
        # check for log rotation by comparing inodes
        logfile.seek(last_read.position)
        cur_inode = os.stat(self.get_icinga_log_file()).st_ino
        same_logfile_as_last_read = cur_inode == last_read.inode
        self.log("Inode check: current={:d}, last={:d}, {}".format(
            cur_inode,
            last_read.inode,
            "same" if same_logfile_as_last_read else "file changed",
        ))
        if same_logfile_as_last_read:
            self.log("continuing to read in current icinga log file")
            # no log rotation, continue reading current file
            # the current position of the file must be the next byte to read!
            self.parse_log_file(logfile, self.get_icinga_log_file(),
                                last_read.line_number)
        else:
            self.log("detected icinga log rotation")
            # cur log file does not correspond to where we last read.
            # we have to check the archive for whatever we have missed.
            last_read_date = datetime.datetime.utcfromtimestamp(
                last_read.timestamp)
            today_datetime = datetime.datetime.combine(
                datetime.date.today(), datetime.datetime.min.time())
            missed_timedelta = today_datetime - last_read_date
            files_to_check = []
            # get days by filename; +1 to include the last day
            for day_missed in range(missed_timedelta.days + 1):
                missed_log_day = last_read_date + datetime.timedelta(
                    days=day_missed)
                format_num = lambda num: "{:02d}".format(num)
                day_files = glob.glob(
                    os.path.join(
                        IcingaLogReader.get_icinga_log_archive_dir(),
                        "{}-{}-{}-{}-*".format(
                            global_config['MD_TYPE'],
                            format_num(missed_log_day.month),
                            format_num(missed_log_day.day),
                            format_num(missed_log_day.year))))
                files_to_check.extend(day_files)
            # read archive
            self.parse_archive_files(files_to_check,
                                     start_at=last_read.timestamp)
            if not self["exit_requested"]:
                self.log(
                    "finished catching up with archive, continuing with current icinga log file"
                )
                # start reading cur file from the beginning
                logfile.seek(0)
                self.parse_log_file(logfile, self.get_icinga_log_file(), 1)
    # check if icinga is even running
    # (we do this after parsing to have events in proper order in db, which is nice)
    icinga_lock_file_name = os.path.join(global_config["MD_BASEDIR"], "var",
                                         global_config["MD_LOCK_FILE"])
    try:
        pid = int(open(icinga_lock_file_name, "r").read().strip())
    except:
        # can't really tell if icinga is running this way
        pass
    else:
        try:
            psutil.Process(pid=pid)
        except psutil.NoSuchProcess:
            # assume not running
            msg = "icinga process (pid: {}) is not running".format(pid)
            self.log(msg)
            self._create_icinga_down_entry(cluster_timezone.localize(
                datetime.datetime.now()), msg, None, save=True)
    if self.always_collect_warnings or not global_config["DEBUG"]:
        if self._warnings:
            self.log("warnings while parsing:")
            for warning, multiplicity in self._warnings.items():
                self.log(" {} ({})".format(warning, multiplicity),
                         logging_tools.LOG_LEVEL_WARN)
            self.log("end of warnings while parsing")
def search_result(self, s_struct):
    """Parse yum-style search output into package_search_result rows.

    Output is grouped into package paragraphs separated by '====' headers
    or blank lines; the first line of a paragraph carries the package
    name, a later 'Repo : <name>' line carries the repository. Packages
    from the 'installed' pseudo-repo are skipped.
    """
    cur_mode, _ln = (0, None)
    found_packs = []
    for line in s_struct.read().split("\n"):
        if line.startswith("===="):
            # header done
            cur_mode = 1
            _ln = 0
        elif not line.strip():
            # empty line, check for new package
            cur_mode = 1
            _ln = 0
        else:
            if cur_mode == 1:
                _ln += 1
                if _ln == 1:
                    # first line of a paragraph: package name token
                    p_info = [line.strip().split()[0]]
                else:
                    if line.lower().startswith("repo") and line.count(":"):
                        # 'Repo : <name>' line completes the entry
                        p_info.append(line.strip().split(":")[1].strip())
                        found_packs.append(p_info)
    cur_search = s_struct.run_info["stuff"]
    cur_search.current_state = "done"
    _found = 0
    cur_search.results = _found
    cur_search.last_search = cluster_timezone.localize(datetime.datetime.now())
    cur_search.save(update_fields=["last_search", "current_state", "results"])
    # delete previous search results
    cur_search.package_search_result_set.all().delete()
    self.log("parsing results... ({:d} found)".format(len(found_packs)))
    repo_dict = {_repo.name: _repo for _repo in package_repo.objects.all()}
    for p_name, repo_name in found_packs:
        if repo_name == "installed":
            continue
        try:
            # split '<name>-<version>-<release>.<arch>' from the right
            parts = p_name.split("-")
            rel_arch = parts.pop(-1)
            arch = rel_arch.split(".")[-1]
            release = rel_arch[:-(len(arch) + 1)]
            version = parts.pop(-1)
            name = "-".join(parts)
        except:
            self.log(
                "cannot parse package name {}: {}".format(
                    p_name,
                    process_tools.get_except_info()
                ),
                logging_tools.LOG_LEVEL_ERROR
            )
        else:
            _found += 1
            new_sr = package_search_result(
                name=name,
                arch=arch,
                version="{}-{}".format(version, release),
                package_search=cur_search,
                copied=False,
                package_repo=repo_dict.get(repo_name, None)
            )
            new_sr.save()
    cur_search.results = _found
    cur_search.save(update_fields=["results"])
    self.log(
        "packages found for '{}': {:d}".format(
            cur_search.search_string,
            cur_search.results
        )
    )
    self.post_search()
def run(self, cur_bg):
    """Create pending user homes and trigger config syncs on responsible servers.

    Step 1 queues one 'create_user_home' command per active user whose
    home directory is missing; step 2 queues ldap / yp / http-user sync
    commands on each server carrying the respective config.

    :return: list of (background_job_run, srv_command, service_enum) tuples
    """
    # step 1: create user homes
    _uo = user.objects
    create_user_list = _uo.exclude(
        Q(export=None)
    ).filter(
        Q(home_dir_created=False) & Q(active=True) & Q(group__active=True)
    ).select_related(
        "export__device"
    )
    to_run = []
    if create_user_list.count():
        self.log("{} to create".format(logging_tools.get_plural("user home", len(create_user_list))))
        for create_user in create_user_list:
            srv_com = server_command.srv_command(command="create_user_home")
            srv_com["server_key:username"] = create_user.login
            to_run.append(
                (
                    background_job_run(
                        background_job=cur_bg,
                        server=create_user.export.device,
                        command_xml=unicode(srv_com),
                        start=cluster_timezone.localize(datetime.datetime.now()),
                    ),
                    srv_com,
                    icswServiceEnum.cluster_server,
                )
            )
    else:
        self.log("no user homes to create", logging_tools.LOG_LEVEL_WARN)
    # check directory sync requests
    no_device = []
    for _config, _command, _srv_type in [
        ("ldap_server", "sync_ldap_config", icswServiceEnum.cluster_server),
        ("yp_server", "write_yp_config", icswServiceEnum.cluster_server),
        ("monitor_server", "sync_http_users", icswServiceEnum.monitor_server),
    ]:
        _cdict = config_tools.device_with_config(_config)
        for _sc_list in _cdict.itervalues():
            for _sc in _sc_list:
                if _sc.effective_device:
                    self.log(
                        u"effective device for {} (command {}) is {}".format(
                            _config,
                            _command,
                            unicode(_sc.effective_device),
                        )
                    )
                    srv_com = server_command.srv_command(command=_command)
                    to_run.append(
                        (
                            background_job_run(
                                background_job=cur_bg,
                                server=_sc.effective_device,
                                command_xml=unicode(srv_com),
                                start=cluster_timezone.localize(datetime.datetime.now()),
                            ),
                            srv_com,
                            _srv_type,
                        )
                    )
        if not _cdict:
            no_device.append(_command)
    if no_device:
        self.log("no device(s) found for {}".format(", ".join(no_device)), logging_tools.LOG_LEVEL_WARN)
    return to_run
def end_build(self):
    """Stamp the build end time on the current distribution record, if any."""
    if not self.__md_struct:
        return
    self.__md_struct.build_end = cluster_timezone.localize(datetime.datetime.now())
    self.__md_struct.save()
def check_initrd(self):
    """Update initrd_built and module_list from the kernel's initrd files.

    Records which stage/flavour files are present, and — when initrd_built
    is still unset — determines the build time from the first present
    flavour's mtime and extracts the kernel module list, either by
    loop-mounting (lo/cramfs) or by listing the cpio archive.
    """
    # update initrd_built and module_list from initrd.gz
    # check for presence of stage-files
    present_dict = {}
    for initrd_flavour in KNOWN_INITRD_FLAVOURS + ["stage2"]:
        if initrd_flavour == "stage2":
            db_name = "stage2_present"
        else:
            db_name = "stage1_{}_present".format(initrd_flavour)
        present_flag = True if os.path.isfile(
            self.__initrd_paths[initrd_flavour]) else False
        present_dict[initrd_flavour] = True if present_flag else False
        self.__values[db_name] = present_flag
        self._update_kernel(**{db_name: present_flag})
    if self.__values["initrd_built"] is None:
        present_keys = sorted([
            key for key in ["cpio", "cramfs", "lo"]
            if present_dict.get(key, False)
        ])
        if present_keys:
            self.log(
                "{} for checking initrd: {}".format(
                    logging_tools.get_plural("key", len(present_keys)),
                    ", ".join([
                        "{} (file {})".format(
                            key,
                            os.path.basename(self.__initrd_paths[key]))
                        for key in present_keys
                    ])),
            )
            self.__checks.append("initrd")
            # build time taken from the mtime of the first present flavour
            initrd_built = cluster_timezone.localize(
                datetime.datetime.fromtimestamp(
                    os.stat(self.__initrd_paths[present_keys[0]])[
                        stat.ST_MTIME]))
            self._update_kernel(initrd_built=initrd_built)
            # temporary file and directory for mount-based extraction
            tmp_dir = tempfile.mkdtemp()
            tfile_name = os.path.join(tmp_dir, ".initrd_check")
            tdir_name = os.path.join(tmp_dir, ".initrd_mdir")
            if not os.path.isdir(tdir_name):
                os.mkdir(tdir_name)
            checked = False
            for present_key in present_keys:
                check_path = self.__initrd_paths[present_key]
                self.log(
                    "trying to get modules via {}-flavour ({})".format(
                        present_key, self.__initrd_paths[present_key]))
                # flavour-dependend mod_list extraction
                setup_ok, do_umount = (True, False)
                if present_key in ["lo", "cramfs"]:
                    # decompress the image and loop-mount it
                    try:
                        open(tfile_name,
                             "w").write(gzip.open(check_path, "r").read())
                    except:
                        self.log(
                            "error reading {}: {}".format(
                                check_path,
                                process_tools.get_except_info()),
                            logging_tools.LOG_LEVEL_ERROR)
                        setup_ok = False
                    else:
                        m_com = "mount -o loop {} {}".format(
                            tfile_name, tdir_name)
                        um_com = "umount {}".format(tdir_name)
                        cstat, out = subprocess.getstatusoutput(m_com)
                        if cstat:
                            self.log(
                                "error mounting tempfile {} to {}: {}".
                                format(tfile_name, tdir_name, out))
                            setup_ok = False
                        else:
                            do_umount = True
                else:
                    # no setup needed for cpio
                    setup_ok = True
                # check list
                if setup_ok:
                    mod_list = set()
                    if present_key in ["lo", "cramfs"]:
                        # walk the mounted tree for .o / .ko modules
                        for dir_name, dir_list, file_list in os.walk(
                                os.path.join(tdir_name, "lib", "modules")):
                            for mod_name in [
                                    file_name for file_name in file_list
                                    if file_name.endswith(".o")
                                    or file_name.endswith(".ko")
                            ]:
                                mod_list.add(
                                    mod_name[:-2] if mod_name.
                                    endswith(".o") else mod_name[:-3])
                        checked = True
                    else:
                        # list the cpio archive instead of mounting
                        c_stat, c_out = subprocess.getstatusoutput(
                            "gunzip -c {} | cpio -t".format(check_path))
                        if c_stat:
                            # NOTE(review): 'crom' in this message is a typo
                            # for 'from' (runtime string, left unchanged here)
                            self.log(
                                "error getting info crom cpio-archive {} ({:d}, {}): {}"
                                .format(check_path, c_stat, c_out,
                                        process_tools.get_except_info()),
                                logging_tools.LOG_LEVEL_ERROR)
                        else:
                            checked = True
                            mod_lines = [
                                os.path.basename("/{}".format(line))
                                for line in c_out.split("\n")
                                if line.startswith("lib/modules") and (
                                    line.endswith(".o")
                                    or line.endswith(".ko"))
                            ]
                            mod_list = set([
                                mod_name[:-2]
                                if mod_name.endswith(".o") else mod_name[:-3]
                                for mod_name in mod_lines
                            ])
                    # store as comma-separated sorted string
                    mod_list = ",".join(sorted(mod_list))
                    if mod_list:
                        self.log("found {}: {}".format(
                            logging_tools.get_plural(
                                "module", len(mod_list.split(","))),
                            mod_list))
                    else:
                        self.log("found no modules")
                    if self.__db_idx:
                        if mod_list != self.__values["module_list"]:
                            self._update_kernel(module_list=mod_list)
                    else:
                        self.__values["module_list"] = mod_list
                        self.__values["target_module_list"] = mod_list
                    if do_umount:
                        c_stat, c_out = subprocess.getstatusoutput(um_com)
                        if c_stat:
                            self.log(
                                "error unmounting tempfile {} from {} ({:d}): {}"
                                .format(tfile_name, tdir_name, c_stat,
                                        c_out),
                                logging_tools.LOG_LEVEL_ERROR)
                if checked:
                    # one flavour succeeded, no need to try the others
                    break
            # clean up the temporary mount point and file
            if os.path.isdir(tdir_name):
                os.rmdir(tdir_name)
            if os.path.isfile(tfile_name):
                os.unlink(tfile_name)
            os.rmdir(tmp_dir)
        else:
            # NOTE(review): message typo, should read 'no initrd-file found'
            # (runtime string, left unchanged here)
            self.log("not initrd-file found", logging_tools.LOG_LEVEL_WARN)
    else:
        self.log("initrd_built already set")
def config_ts(self, ts_type):
    """Store the current time as the config_build_<ts_type> timestamp on the build record."""
    if not self.__md_struct:
        return
    # set config timestamp
    field_name = "config_build_{}".format(ts_type)
    setattr(self.__md_struct, field_name,
            cluster_timezone.localize(datetime.datetime.now()))
    self.__md_struct.save()