def crash(self, s: str) -> bool:
    # TODO: write the error message to the datastore
    log.critical(s)
    for i in self.myThreads:
        if not self.myThreads[i]:
            log.critical(f"Core thread {i} is dead")

    log.debug('Calling shutdown')
    Thread(name="shutdown", target=shutdown_me, args=(1, '')).start()
    return True
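# A minimal sketch of the thread registry the loop above assumes: self.myThreads
# maps a core thread name to a liveness flag (the names below are hypothetical).
#
#   self.myThreads = {
#       'log_inspector': True,       # alive
#       'process_inspector': False,  # dead -> reported by crash()
#   }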
def log_inspector():
    log.debug("log_inspector started")
    selfName = 'log_inspector'
    while True:
        try:
            for taskName, task in cfg['tasks']['logTask'].items():
                log.info(f"Check log {taskName}")
                logFile = task['file']
                templates = task['tmpl']
                try:
                    # TODO: only reopen the file if it has changed
                    with open(logFile, encoding='utf-8') as f:
                        cnt = f.read()

                    for tmplName in templates:
                        tmpl = templater.get_tmpl(selfName, tmplName)
                        if tmpl in cnt:
                            ev = f"Found log expression {taskName}: {tmplName}"
                            log.warning(ev)
                            body = templater.tmpl_fill(selfName, 'error').replace(
                                '{{taskName}}', taskName, -1)
                            event = 'error'
                            new_toast('log_inspector', event)

                            if 'eventScript' in task:
                                allowSend, body = execute_event_script(
                                    log, task['eventScript'], taskName, event, body)
                            else:
                                allowSend = True

                            if allowSend:
                                send_notify(taskName, event, body)
                except FileNotFoundError:
                    log.error(f"Log file not found for {taskName}")
                except Exception as e:
                    log.error(f"Failed to parse log file {taskName}: {e}")

            sleep(intervalCheckMin * 2)
        except Exception:
            e = traceback.format_exc()
            log.critical(str(e))
            break
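# A minimal sketch of a logTask entry as read by log_inspector() above, assuming a
# dict-like cfg. The keys come from the code; the task name, file path and template
# names are hypothetical.
_EXAMPLE_LOG_TASK = {
    'myApp': {
        'file': '/var/log/myapp/app.log',  # log file to scan
        'tmpl': ['error'],                 # template names resolved via templater.get_tmpl()
        'eventScript': 'on_log_event.py',  # optional hook run via execute_event_script()
    },
}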
def process_inspector():
    def get_pid(exe: str, exePath: str, workDir: str = None) -> int:
        # if workDir is given, only processes with that working directory are checked
        for p in psutil.process_iter(["name", 'exe', 'cwd']):
            # if 'calc1' in p.info['name']:
            #     sout(f"{p.pid} | {p.info['name']} | {p.info['cwd']} | {p.info['exe']}", 'violet')
            if exe == p.info['name'].lower():
                if workDir:
                    if not p.info['cwd'].endswith('/'):
                        p.info['cwd'] = f"{p.info['cwd']}/"

                    if workDir.lower() == p.info['cwd'].replace('\\', '/', -1).lower():
                        return p.pid
                else:
                    if PLATFORM == 'nt':
                        exePath = f"{exePath}{exe}"
                    else:
                        exePath = exePath[:-1]

                    if exePath.lower() == p.info['exe'].replace('\\', '/', -1).lower():
                        return p.pid

    def restart(job: dict, exePid: int = None, killRecursive: bool = False) -> str:
        data = ""
        status = 0
        failList[taskName]['attemp'] += 1
        if exePid:
            try:
                assert exePid != os.getpid(), "won't kill myself"
                parent = psutil.Process(exePid)
                children = parent.children(recursive=killRecursive)
                children.append(parent)
                # TODO: try a soft kill before a hard kill
                for p in children:
                    try:
                        # p.send_signal(signal.SIGTERM)
                        p.kill()
                    except psutil.NoSuchProcess:
                        pass

                _, alive = psutil.wait_procs(children, timeout=60)
                if alive:
                    raise Exception(f"Failed to kill process {exe} (PID {exePid})")
            except Exception as e:
                data = f'Failed to restart process {exe}: {e}\n'
                log.error(data)
                status = 2

        if status == 0:
            log.debug(f"Launch application {taskName}")
            whatStart = job['whatStart']
            if whatStart == 'command':
                target = job['command']
            elif whatStart == 'exe':
                target = f"{job['exePath']}{exe} {job['exeKey']}"
            else:
                target = None

            if target:
                log.info(f"Starting {taskName}")
                try:
                    if PLATFORM == 'nt':
                        os.system(f"start cmd /c {target}")
                    else:
                        os.system(f"command {target} &")
                except Exception as e:
                    data = f"Failed to restart application {exe} ({taskName}): {e}\n"
                    status = 3
            else:
                log.info(f"Starting service {job['service']}")
                try:
                    if PLATFORM == 'nt':
                        win32serviceutil.StartService(job['service'])
                    else:
                        os.system(f"systemctl start {job['service']}")
                except Exception as e:
                    e = traceback.format_exc()
                    log.error(str(e))
                    status = 3
                    data = f"Failed to start service {job['service']} ({taskName}): {e}\n"

        # check that it has not crashed again right after the start
        # TODO: measure time-to-crash after the start
        if status == 0:
            sleep(restartTime)
            if get_pid(exe, checkPath, workDir):
                data += 'Successfully restarted application'
                failList[taskName]['isAlive'] = False
                failList[taskName]['attemp'] -= 1
                log.info(data)
            else:
                data += f'Failed to start {taskName}'
                log.error(data)
        else:
            log.error(data)

        new_toast(taskName, data)
        return data

    sleep(3)
    selfName = "process_inspector"
    failList = {}
    for job in jobList:
        failList[job] = {'isAlive': False, "attemp": 0}

    while True:
        try:
            for job in jobList.values():
                taskName = job['task']
                exe = job['exe'].lower()
                checkPath = job['checkPath']
                exePath = job['exePath']
                workDir = job['workDir']
                doRestart = job['doRestart']
                alwaysWork = job['alwaysWork']
                restartTime = job['restartTime']
                respTime = job['respTime']
                status = 0
                body = ''
                log.info(f'Check app {taskName}')
                exePid = get_pid(exe, checkPath, workDir)

                if exePid and not job['checkUrl']:
                    log.debug(f"{taskName} is fine.")
                elif exePid and job['checkUrl']:
                    log.debug(f"Found {taskName}. Check http status")
                    try:
                        res = requests.get(job['url'], timeout=respTime)
                        if res.status_code != 200:
                            raise Exception(f"Server returned status {res.status_code}")

                        log.debug(f"{taskName} is fine.")
                        if not failList[taskName]['isAlive']:
                            continue
                        else:
                            failList[taskName]['isAlive'] = False
                            data = templater.tmpl_fill(selfName, 'alive')
                    except Exception:
                        status = 1
                        data = f"{taskName} didn't respond or returned a wrong answer. Trying to restart the application\n"
                        new_toast(f'Restarting {taskName}', data)
                        log.warning(data)
                        body = templater.tmpl_fill(selfName, "badAnswer").replace(
                            "{{taskName}}", taskName, -1)
                        failList[taskName]['isAlive'] = True

                    if status != 0 and doRestart:
                        data += restart(job, exePid)

                    body += data
                    if 'eventScript' in job:
                        allowSend, body = execute_event_script(
                            log, job['eventScript'], taskName, 'badAnswer', body)
                    else:
                        allowSend = True

                    if allowSend:
                        send_notify(taskName, 'badAnswer', body)
                elif not exePid and alwaysWork:
                    body = templater.tmpl_fill(selfName, 'notFound').replace(
                        "{{taskName}}", taskName, -1)
                    data = f"Required application {taskName} not found. Trying to restart\n"
                    log.warning(data)
                    new_toast(f'Starting {taskName}', data)
                    data += restart(job, exePid)

                    body += data
                    new_toast('log_inspector', 'notFound')
                    if 'eventScript' in job:
                        allowSend, body = execute_event_script(
                            log, job['eventScript'], taskName, 'notFound', body)
                    else:
                        allowSend = True

                    if allowSend:
                        send_notify(taskName, 'notFound', body)

            sleep(intervalCheckMin)
        except Exception:
            e = traceback.format_exc()
            log.critical(str(e))
            break
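# A minimal sketch of a jobList entry consumed by process_inspector() above.
# The keys come from the code; every value shown here is hypothetical.
_EXAMPLE_PROCESS_JOB = {
    'task': 'myApp',                # display name used in logs and notifications
    'exe': 'myapp.exe',             # process name matched by get_pid()
    'exePath': 'C:/myapp/',         # used to build the start command when whatStart == 'exe'
    'checkPath': 'C:/myapp/',       # path compared against the running process
    'workDir': 'C:/myapp/',         # optional working-directory filter for get_pid()
    'checkUrl': True,               # if True, poll 'url' once the process is found
    'url': 'http://localhost:8080/health',
    'respTime': 10,                 # HTTP timeout, seconds
    'doRestart': True,              # restart on a bad answer
    'alwaysWork': True,             # restart if the process is not found at all
    'restartTime': 15,              # seconds to wait before re-checking after a restart
    'whatStart': 'exe',             # 'command', 'exe', or anything else to start a service
    'command': '',                  # used when whatStart == 'command'
    'exeKey': '--daemon',           # extra arguments when whatStart == 'exe'
    'service': 'myapp',             # service name used when whatStart names a service
    'eventScript': 'on_proc_event.py',  # optional hook run via execute_event_script()
}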
def disk_inspector():
    def fill_tmpl(event: str) -> str:
        body = templater.tmpl_fill(selfName, event)
        body = body.replace('{{critFree}}', str(critFree), -1)
        body = body.replace('{{diskFree}}', str(diskFree), -1)
        body = body.replace('{{diskUsage}}', diskUsage, -1)
        body = body.replace('{{taskName}}', taskName, -1)
        return body.replace('{{diskWarn}}', str(diskWarn), -1)

    log.debug("disk_inspector started")
    selfName = 'disk_inspector'
    while True:
        for taskName, task in cfg['tasks']['diskTask'].items():
            critFree = task['critFree']
            diskUsage = task['diskUsage']
            diskWarn = task['diskWarn']
            try:
                # free space in GB (1 GB = 1073741824 bytes)
                diskFree = round(shutil.disk_usage(diskUsage).free / 1073741824, 2)
                if diskFree < critFree:
                    log.error(f"Free disk space is critically low on {diskUsage}: {diskFree}GB")
                    event = 'critFree'
                    body = fill_tmpl(event)
                    new_toast(diskUsage, f"Free disk space is critically low: {diskFree}GB")

                    if 'eventScript' in task:
                        allowSend, body = execute_event_script(
                            log, task['eventScript'], taskName, event, body)
                    else:
                        allowSend = True

                    if allowSend:
                        send_notify(taskName, event, body)
                elif diskFree < diskWarn:
                    log.warning(f"Free disk space is running out on {diskUsage}: {diskFree}GB")
                    event = 'diskWarn'
                    body = fill_tmpl(event)
                    new_toast(diskUsage, f"Free disk space is running out: {diskFree}GB")

                    if 'eventScript' in task:
                        allowSend, body = execute_event_script(
                            log, task['eventScript'], taskName, event, body)
                    else:
                        allowSend = True

                    if allowSend:
                        send_notify(taskName, event, body)
                elif diskFree > diskWarn:
                    log.info(f"disk {diskUsage}: {diskFree}GB free")
            except FileNotFoundError:
                log.error(f'disk_inspector: wrong path: {diskUsage}')
            except Exception:
                log.critical(f'disk_inspector: {traceback.format_exc()}')
                shutdown_me(9, 9)

        sleep(intervalCheckMin)
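# A minimal sketch of a diskTask entry as read by disk_inspector() above, assuming a
# dict-like cfg. The keys come from the code; the task name, path and thresholds are
# hypothetical.
_EXAMPLE_DISK_TASK = {
    'systemDrive': {
        'diskUsage': 'C:/',  # path whose filesystem is checked with shutil.disk_usage()
        'diskWarn': 50,      # warn below this many GB free
        'critFree': 10,      # raise a critical event below this many GB free
        'eventScript': 'on_disk_event.py',  # optional hook
    },
}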
def _create_db(self) -> sqlite3.Connection:
    """
    Create the local database.
    Handles restoring it from disk and the forced-synchronization flag.
    :return: connection to the local datastore
    """
    try:
        cnx = sqlite3.connect(self.db)
    except Exception as e:
        log.critical(f"Can't open local datastore {self.db}: {e}")
        raise Exception(e)

    # on every start, check the status of the previous shutdown
    fail = False
    try:
        cur = cnx.cursor()
        cur.execute('SELECT self_status FROM lootnika')
    except Exception:
        if self.db != ':memory:':
            log.warning('Creating new tasks journal scheme')
        fail = True

    if fail:
        try:
            cur.executescript("""
                CREATE TABLE lootnika (self_status VARCHAR);
                CREATE TABLE tasks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE NOT NULL,
                    name VARCHAR NOT NULL,
                    start_time DATETIME,
                    end_time DATETIME,
                    status VARCHAR,
                    count_total INTEGER DEFAULT (0),
                    count_seen INTEGER DEFAULT (0),
                    count_new INTEGER DEFAULT (0),
                    count_differ INTEGER DEFAULT (0),
                    count_delete INTEGER DEFAULT (0),
                    count_task_error INTEGER DEFAULT (0),
                    count_export_error INTEGER DEFAULT (0),
                    last_doc_id VARCHAR);
                CREATE TRIGGER delete_till_100 INSERT ON tasks
                    WHEN (SELECT count(*) FROM tasks) > 100
                BEGIN
                    DELETE FROM tasks WHERE tasks.id IN (
                        SELECT id FROM tasks ORDER BY id
                        LIMIT (SELECT count(*) - 100 FROM tasks)
                    );
                END;
                CREATE TABLE "sphinxbuilder" (
                    "id" INTEGER,
                    "owner" TEXT,
                    "name" TEXT,
                    "path" TEXT,
                    "hash" TEXT,
                    PRIMARY KEY("id")
                );
                """)
            cnx.commit()
        except Exception as e:
            log.error(
                f'Unable to create datastore scheme in lootnika_tasks_journal.db: {e}')

    cur.execute('SELECT self_status FROM lootnika')
    rec = cur.fetchone()
    if rec is None:
        cur.execute("INSERT INTO lootnika('self_status') VALUES ('starting lootnika')")
    elif rec and rec[0] != 'shutdown successfully':
        log.warning(
            f'The previous shutdown was unexpected. Last lootnika status: {rec[0]}.')
        self.isRestored = True

    cur.execute("UPDATE lootnika SET self_status='starting lootnika'")
    cnx.commit()
    cur.close()
    return cnx
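# A minimal usage sketch for the tasks journal created above (illustration only;
# the query is an assumption about how a caller might read it, not part of the code):
#
#   cur = cnx.cursor()
#   cur.execute("SELECT name, status, end_time FROM tasks ORDER BY id DESC LIMIT 1")
#   last_task = cur.fetchone()
#   cur.close()
#
# The delete_till_100 trigger keeps only the 100 most recent rows in tasks.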