def __load_jobs(self):
    """
    Load every service joined with its machine, attach checks and relies,
    and publish the result as ``self.job_list``.

    ``self.job_list`` is only replaced (under ``self.lock``) after the
    services, checks and relies have all been loaded successfully.

    :return: True on success, None if any step failed
    """
    try:
        a_dict = dict()
        sql = "SELECT services.id, services.service_name, machines.ssh_user, machines.ssh_ip, machines.ssh_port," \
              "services.start_cmd, services.stop_cmd, services.is_active, services.auto_recover, services.mail_receiver " \
              "FROM services,machines WHERE services.machine_id = machines.id"
        Logger.info(sql)
        self.cur.execute(sql)
        for row in self.cur.fetchall():
            (job_id, service_name, ssh_user, ssh_ip, ssh_port, start_cmd, stop_cmd,
             is_active, auto_recover, mail_receiver) = row
            a_dict[job_id] = JobDetail(job_id, service_name, ssh_user, ssh_ip, ssh_port, start_cmd,
                                       stop_cmd, is_active, auto_recover, mail_receiver)

        if not self.__load_checks(a_dict):
            return None
        if not self.__load_relies(a_dict):
            return None

        with self.lock:
            self.job_list = list(a_dict.values())
        return True
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt,
        # SystemExit and greenlet kill signals are not swallowed here.
        Logger.error(traceback.format_exc())
        return None
def send(cls, receivers, subject, content, timeout=10):
    """
    Send a notification mail through the HTTP mail gateway.

    When ``config.fake_mail`` is set, the mail is only written to the log.
    Otherwise the content is prefixed with the current local timestamp and
    posted to the gateway. Failures are logged and never raised.

    :type receivers: string
    :type subject: string
    :type content: string
    :param timeout: seconds to wait for the gateway before giving up
    :return: None
    """
    try:
        if config.fake_mail:
            Logger.info('receivers=[%s], subject=[%s], content=[%s]' % (receivers, subject, content))
            return
        url = 'http://f**k.you.com/send_mail'
        a_dict = {
            'receiver': receivers,
            'subject': subject,
            'content': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ": " + content
        }
        # requests has no default timeout; without one an unresponsive
        # gateway would block the caller forever.
        ret = requests.post(url, data=a_dict, timeout=timeout)
        Logger.info("http_code[%s], http_response[%s]" % (ret.status_code, ret.text))
    except Exception:
        Logger.error(traceback.format_exc())
def __get_health(cls, item, raw_out):
    """
    Compare a check command's output with its expected value.

    Numeric operators (``<``, ``<=``, ``==``, ``>=``, ``>``) compare the
    stripped output and ``item.check_value`` as integers; ``include`` and
    ``exclude`` (case-insensitive) test for a substring. ``item.good_match``
    decides whether a match means healthy or unhealthy.

    :type item: CheckCmd
    :type raw_out: string
    :return: StatusCode.GREEN_CODE or StatusCode.RED_CODE for an evaluated
             check, StatusCode.WHITE_CODE when the operator is not
             recognised, StatusCode.YELLOW_CODE when evaluation raised
    """
    healthy_code = StatusCode.WHITE_CODE
    try:
        out = raw_out.strip()
        op = item.operator
        match = None
        if op == "<":
            match = int(out) < int(item.check_value)
        elif op == "<=":
            match = int(out) <= int(item.check_value)
        elif op == "==":
            match = int(out) == int(item.check_value)
        elif op == ">=":
            match = int(out) >= int(item.check_value)
        elif op == ">":
            match = int(out) > int(item.check_value)
        elif op.lower() == "include":
            match = item.check_value in out
        elif op.lower() == "exclude":
            match = item.check_value not in out

        # match stays None for an unknown operator; keep WHITE_CODE then.
        if match is not None:
            if match == bool(item.good_match):
                healthy_code = StatusCode.GREEN_CODE
            else:
                healthy_code = StatusCode.RED_CODE
    except Exception as e:
        Logger.error(e.message)
        healthy_code = StatusCode.YELLOW_CODE
    # The computed code was previously dropped, so callers always got None.
    return healthy_code
def logout(self):
    """
    Close the executor held in ``self.client``, if there is one.

    :rtype: bool | None
    :return: True when no error occurred, None if closing raised
    """
    try:
        if self.client:
            self.client.close()
        return True
    except Exception:
        # e.message is empty for multi-argument errors (e.g. socket errors);
        # log the full traceback instead. Local import keeps this self-contained.
        import traceback
        Logger.error(traceback.format_exc())
def login(self):
    """
    Create an SSHExecutor for this job's host and open it.

    The executor is built from the job's ssh ip, port and user plus
    ``config.key_file`` / ``config.key_pwd`` and stored in ``self.client``.

    :rtype: bool | None
    :return: True when the executor was opened, None if anything raised
    """
    try:
        self.client = SSHExecutor(self.__ssh_ip, self.__ssh_port, self.__ssh_user,
                                  config.key_file, config.key_pwd)
        self.client.open()
        return True
    except Exception:
        # Connection errors usually carry (errno, text), for which e.message
        # is an empty string; log the full traceback instead.
        import traceback
        Logger.error(traceback.format_exc())
def dispatch(self):
    """
    Offer every job in ``self.job_list`` to ``self.job_holder``.

    The list is walked while ``self.lock`` is held. Any exception is logged
    and swallowed.

    :return: None
    """
    try:
        with self.lock:
            for pending in self.job_list:
                self.job_holder.add_job(pending)
    except Exception as err:
        Logger.error(err.message)
def lod_job_from_mysql(job_loader):
    """
    Reload the jobs once, then re-schedule itself in a new greenlet after
    one second.

    A failure while loading is logged and does not stop the reload cycle.

    :type job_loader: JobLoader
    :return: None
    """
    try:
        Logger.info("into lod job from mysql")
        job_loader.load_job()
    except Exception:
        import traceback
        Logger.error(traceback.format_exc())
    # Sleep and re-spawn outside the try block: previously an exception
    # above skipped the spawn and silently ended the periodic reload.
    gevent.sleep(1)
    gevent.spawn(JobLoader.lod_job_from_mysql, job_loader)
def do_job(self):
    """
    Run all checks for ``self.a_job``, publish the status, and try to
    restart the service when it is unhealthy and auto-recover is enabled.

    Flow: check -> record status -> (healthy: done) -> (monitoring error:
    mail and stop) -> (no auto-recover: done) -> stop service -> verify
    relies are healthy -> start service -> mail on success.
    Any exception is logged and swallowed.

    :return: None
    """
    try:
        job = self.a_job
        job_id = job.get_id()
        service_name = job.get_service_name()
        healthy_code = job.do_all_check()

        # Refresh job status
        Logger.report('job_id[%s] [%s] is healthy_code[%s]' % (job_id, service_name, healthy_code))
        self.status_holder.set_one_status(job_id, healthy_code, job.get_check_cmd_healthy_code())

        # Success. Compared by value, like the other codes below; an
        # identity test only works if the constant happens to be shared.
        if healthy_code == StatusCode.GREEN_CODE:
            return

        # Monitor operation occur error
        if healthy_code in (StatusCode.WHITE_CODE, StatusCode.YELLOW_CODE):
            content = 'job_id[%s] [%s], healthy_code[%s] cat not be monitored successfully' % (
                job_id, service_name, healthy_code)
            Logger.error(content)
            SendMail.send(job.get_mail_receiver(), service_name, content)
            return

        # Do not need to be recovered
        if not job.get_auto_recover():
            return

        # Stopped process
        stopped = job.stop()
        if stopped is None:
            Logger.info('job_id[%s] [%s] stop failed' % (job_id, service_name))
            return

        # Check relies
        relies = job.get_all_rely()
        if not self.status_holder.is_group_healthy(relies):
            Logger.info('services job_id[%s] [%s] relying is not healthy' % (job_id, service_name))
            return

        # Start process
        if not job.start():
            Logger.info('job_id[%s] [%s] start failed' % (job_id, service_name))
            return

        # Start ok
        content = 'job_id[%s] [%s] start success' % (job_id, service_name)
        Logger.info(content)
        SendMail.send(job.get_mail_receiver(), service_name, content)
    except Exception:
        # e.message is empty for multi-argument exceptions; log the traceback.
        import traceback
        Logger.error(traceback.format_exc())
def del_job(self, job_id):
    """
    Remove ``job_id`` from the job dict while holding the lock.

    A missing id is not an error.

    :param job_id: id of the job to forget
    :rtype: bool | None
    :return: True when no error occurred, None if an exception was logged
    """
    try:
        with self.__lock:
            # pop with a default is a no-op for an unknown id.
            self.__job_dict.pop(job_id, None)
            return True
    except Exception as err:
        Logger.error(err.message)
def start(self):
    """
    Run the service's start command through ``self.client``.

    The command's stdout/stderr are logged but not inspected, so True only
    means the command was executed without raising.

    :rtype: bool | None
    :return: True if the command ran, None if executing it raised
    """
    try:
        std_out, std_err = self.client.execute(self.__start_cmd)
        Logger.info(
            "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]" %
            (self.__id, self.__ssh_ip, self.__start_cmd, std_out, std_err))
        return True
    except Exception:
        # e.message is empty for multi-argument errors; log the traceback.
        import traceback
        Logger.error(traceback.format_exc())
def get_job(self):
    """
    Take the last job off the job list, under the lock.

    :rtype: JobDetail | None
    :return: the popped job, or None when the list is empty or an
             exception was logged
    """
    try:
        with self.__lock:
            return self.__job_list.pop() if len(self.__job_list) > 0 else None
    except Exception as err:
        Logger.error(err.message)
def run(self):
    """
    Process ``self.a_job`` once and release it from the job holder.

    An inactive job only has its status cleared. An active job is logged
    in, checked via ``do_job`` and logged out again. The job is always
    removed from the job holder afterwards, even if a step raised.
    Any exception is logged and swallowed.

    :return: None
    """
    try:
        job_id = self.a_job.get_id()
        try:
            # Clear job status if job is not active
            if not self.a_job.is_active():
                self.status_holder.clear_one_status(job_id)
                return

            # Do one job
            self.a_job.login()
            with Defer(self.a_job.logout):
                self.do_job()
        finally:
            # Always release the job; before, any exception above skipped
            # this and left the job registered in the holder.
            self.job_holder.del_job(job_id)
    except Exception:
        import traceback
        Logger.error(traceback.format_exc())
def add_job(self, job):
    """
    Register ``job`` in both the job list and the job dict, under the lock.

    A job whose id is already in the job dict is rejected.

    :type job: JobDetail
    :rtype: bool | None
    :return: True if added, False if the id was already present, None if
             an exception was logged
    """
    try:
        with self.__lock:
            key = job.get_id()
            if key not in self.__job_dict:
                self.__job_list.append(job)
                self.__job_dict[key] = job
                return True
            return False
    except Exception as err:
        Logger.error(err.message)
def stop(self):
    """
    Run the service's stop command through ``self.client``.

    The command's stdout/stderr are logged but not inspected, and no
    follow-up check is made that the service actually stopped; True only
    means the command was executed without raising.

    :rtype: bool | None
    :return: True if the command ran, None if executing it raised
    """
    try:
        std_out, std_err = self.client.execute(self.__stop_cmd)
        Logger.info(
            "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]" %
            (self.__id, self.__ssh_ip, self.__stop_cmd, std_out, std_err))
        return True
    except Exception:
        # e.message is empty for multi-argument errors; log the traceback.
        import traceback
        Logger.error(traceback.format_exc())
def __load_relies(self, a_dict):
    """
    Read the ``service_rely`` table and register each rely id on its job.

    Rows whose ``service_id`` is not a key of ``a_dict`` are ignored.

    :type a_dict: dict[int, Job]
    :return: True on success, None if the query or processing failed
    """
    try:
        sql = 'SELECT service_id,rely_id FROM service_rely'
        Logger.info(sql)
        self.cur.execute(sql)
        for service_id, rely_id in self.cur.fetchall():
            a_job = a_dict.get(service_id)
            if a_job is not None:
                a_job.add_rely(rely_id)
        return True
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt,
        # SystemExit and greenlet kill signals are not swallowed here.
        Logger.error(traceback.format_exc())
        return None
def __load_checks(self, a_dict):
    """
    Read the ``check_cmd`` table and attach each check to its job.

    Rows whose ``service_id`` is not a key of ``a_dict`` are ignored.

    :type a_dict: dict[int, Job]
    :return: True on success, None if the query or processing failed
    """
    try:
        sql = "SELECT id,service_id,local_check,check_shell,operator,check_value,good_match FROM check_cmd"
        Logger.info(sql)
        self.cur.execute(sql)
        for row in self.cur.fetchall():
            a_id, service_id, local_check, check_shell, operator, check_value, good_match = row
            check = CheckCmd(a_id, service_id, local_check, check_shell, operator, check_value, good_match)
            a_job = a_dict.get(service_id)
            if a_job is not None:
                a_job.add_check(check)
        return True
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt,
        # SystemExit and greenlet kill signals are not swallowed here.
        Logger.error(traceback.format_exc())
        return None
def do_all_check(self):
    """
    Execute every check command of the job and fold the results into a
    StatusCode object.

    Local checks run through ``commands.getstatusoutput``; remote checks
    run through ``self.client.execute``. Each output is evaluated with
    ``__get_health`` and the resulting code is passed to
    ``status_code.set_code``. Any exception is logged and swallowed.

    NOTE(review): no value is returned here, yet callers appear to use the
    result as a health code -- confirm whether a trailing return of the
    aggregated status was lost.
    """
    status_code = StatusCode()
    try:
        # local checking
        for item in self.__local.values():
            status, output = commands.getstatusoutput(item.check_shell)
            Logger.info(
                "id[%s]: localhost[127.0.0.1] execute cmd[%s], status[%s], output[%s]" %
                (self.__id, item.check_shell, status, output))
            if status != 0:
                # Non-zero exit status: presumably marks the run as failed -- TODO confirm
                status_code.set_status(None)

            # check healthy
            healthy_code = self.__get_health(item, output)
            Logger.info("id[%s]: localhost[127.0.0.1] healthy_code[%s]" % (self.__id, healthy_code))
            status_code.set_code(healthy_code)

        # remote checking
        for item in self.__remote.values():
            std_out, std_err = self.client.execute(item.check_shell)
            Logger.info(
                "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]" %
                (self.__id, self.__ssh_ip, item.check_shell, std_out, std_err))
            # Only stderr and no stdout is treated as a failed command.
            if not std_out and std_err:
                status_code.set_status(None)

            # check healthy
            healthy_code = self.__get_health(item, std_out)
            Logger.info("id[%s]: remote[%s] healthy_code[%s]" % (self.__id, self.__ssh_ip, healthy_code))
            status_code.set_code(healthy_code)
    except Exception:
        # e.message is empty for multi-argument errors; log the traceback.
        import traceback
        Logger.error(traceback.format_exc())
def load_job(self):
    """
    Open a client via ``self.__open()`` and let it reload all jobs.

    The client is used as a context manager for the duration of the load.
    Any exception is logged and swallowed.

    :return: None
    """
    try:
        with self.__open() as client:
            client.__load_jobs()
    except Exception:
        # Connection errors typically carry (code, text), for which
        # e.message is an empty string; log the full traceback instead.
        import traceback
        Logger.error(traceback.format_exc())
def is_quit(self):
    """
    Read the quit flag while holding ``self.lock``.

    :return: the current value of ``self.quit``, or None if an exception
             was logged
    """
    try:
        with self.lock:
            flag = self.quit
        return flag
    except Exception as err:
        Logger.error(err.message)
def set_quit(self):
    """
    Set the quit flag to True while holding ``self.lock``.

    Any exception is logged and swallowed.

    :return: None
    """
    try:
        with self.lock:
            self.quit = True
    except Exception as err:
        Logger.error(err.message)