def shutdown():
    logger.info('Stopping http server')
    server.stop()
    ModelHandler.close_all_jobs()
    logger.info('Will shutdown in %s seconds ...',
                MAX_WAIT_SECONDS_BEFORE_SHUTDOWN)
    stop_loop(time.time() + MAX_WAIT_SECONDS_BEFORE_SHUTDOWN)

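# Hypothetical sketch (an assumption, not in the original source): main()
# below registers partial(sig_handler, server) for SIGTERM/SIGHUP/SIGINT,
# so sig_handler presumably defers shutdown() onto the IOLoop thread, the
# only thread that may safely stop the server:
def sig_handler(server, sig, frame):
    logger.warning('Caught signal: %s', sig)
    tornado.ioloop.IOLoop.current().add_callback_from_signal(shutdown)
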
def post(self):
    data = json.loads(self.request.body)
    logger.debug("request body: {data}".format(data=data))
    try:
        slack_channel = data.get("slack_channel", DEFAULT_CHANNEL)
        timeout = data.get("timeout", DEFAULT_JOB_TIMEOUT)
        job = Job(str(uuid.uuid4()), data["data_source"], data["model"],
                  data["metrics"], slack_channel, timeout)
    except KeyError as e:
        self.finish({"code": HTTP_MISS_ARGS,
                     "message": "missing arg %s" % e.args[0]})
        # Return early: without this, the code below would reference an
        # undefined job and call finish() twice.
        return
    try:
        logger.info("run new job:{job}".format(job=dict(job)))
        self.run(job)
        self.finish({"code": HTTP_OK, "message": "OK", "data": dict(job)})
    except Exception as e:
        logger.error("run job:{job} failed:{err}".format(job=dict(job), err=e))
        self.finish({"code": HTTP_FAIL,
                     "message": "run job:{job} failed:{err}".format(
                         job=dict(job), err=e)})

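# Hypothetical sketch: the Job class is not defined in this section. The
# handlers call dict(job), so Job must yield (key, value) pairs when
# iterated; a minimal shape consistent with the constructor call above:
class Job(object):
    def __init__(self, id, data_source, model, metrics, slack_channel,
                 timeout):
        self.id = id
        self.data_source = data_source
        self.model = model
        self.metrics = metrics
        self.slack_channel = slack_channel
        self.timeout = timeout

    def __iter__(self):
        # dict(job) consumes these (key, value) pairs.
        return iter(vars(self).items())
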
def timeout_action(self):
    # TODO: do some clean action after timeout
    with self.lock:
        logger.info(
            "[job-id:{id}] finish the job".format(id=sub_id(self.job.id)))
        # Flag the worker loops to exit and wake any event.wait() sleepers.
        self.__exit = True
        self.event.set()

def get(self): logger.info("list all running jobs") jobs = self.list_jobs() logger.info("running jobs:{jobs}" .format(jobs=jobs)) self.finish({"code": HTTP_OK, "message": "OK", "data": jobs})
def stop_loop(deadline):
    now = time.time()
    # Note: _callbacks and _timeouts are private IOLoop attributes; they were
    # removed in Tornado 5, so this check only works on older Tornado.
    if now < deadline and (io_loop._callbacks or io_loop._timeouts):
        logger.info('Waiting for next tick')
        io_loop.add_timeout(now + 1, stop_loop, deadline)
    else:
        io_loop.stop()
        logger.info('Shutdown finally')

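# A hedged alternative (an assumption, not the original code): on Tornado 5+
# the private attributes above no longer exist, so a purely time-based
# countdown achieves the same graceful delay:
def stop_loop_by_deadline(deadline):
    now = time.time()
    if now < deadline:
        io_loop.add_timeout(now + 1, stop_loop_by_deadline, deadline)
    else:
        io_loop.stop()
        logger.info('Shutdown finally')
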
def close(self):
    # TODO: close this job
    with self.lock:
        logger.info(
            "[job-id:{id}] closing the job".format(id=sub_id(self.job.id)))
        # Same exit protocol as timeout_action, plus cancelling the timeout
        # timer so timeout_action will not fire afterwards.
        self.__exit = True
        self.event.set()
        self.timer.cancel()

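# Hypothetical sketch: self.timer is not created in this section. Given that
# close() cancels it and timeout_action() exists, a threading.Timer armed
# with the job's timeout fits, e.g. in the job's start path (the method name
# and the threading import are assumptions):
def start(self):
    self.timer = threading.Timer(self.job.timeout, self.timeout_action)
    self.timer.start()
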
def parse_config():
    options.parse_command_line()
    if options.config != "":
        logger.info("parse config from config file: {config}".format(
            config=options.config))
        options.parse_config_file(options.config)
    if options.slack_token == "":
        logger.error("slack token is required!!")
        sys.exit(1)
    alert.SLACK_TOKEN = options.slack_token
    logger.info("config: {config}".format(config=options.items()))

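# Hypothetical sketch: the option names used above (config, slack_token,
# port) must be declared elsewhere with tornado.options.define; the defaults
# here are assumptions:
from tornado.options import define

define("config", default="", help="path to an optional config file")
define("port", default=8888, type=int, help="HTTP listen port")
define("slack_token", default="", help="Slack API token used for alerts")
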
def predict(self, metric, query_expr):
    logger.info("[job-id:{id}][metric:{metric}] starting to predict".format(
        id=sub_id(self.job.id), metric=metric))
    while not self.__exit:
        now = datetime.datetime.now()
        # Query the last 5 minutes of samples at a 15s resolution.
        query = PrometheusQuery(
            query_expr,
            time.mktime((now - datetime.timedelta(minutes=5)).timetuple()),
            time.mktime(now.timetuple()), "15s")
        if self.predict_task(metric, query) == 1:
            logger.info("[job-id:{id}][metric:{metric}] predict OK".format(
                id=sub_id(self.job.id), metric=metric))
        else:
            logger.info(
                "[job-id:{id}][metric:{metric}] predict error".format(
                    id=sub_id(self.job.id), metric=metric))
            self.callback(
                "[job] {job}, predict metric {metric} error in last {time}s"
                .format(job=dict(self.job), metric=metric,
                        time=self.predict_interval),
                self.job.slack_channel)
        # Sleep between rounds, but wake immediately if the job is closed.
        self.event.wait(self.predict_interval)
    logger.info("[job-id:{id}][metric:{metric}] stop".format(
        id=sub_id(self.job.id), metric=metric))

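# Hypothetical sketch: PrometheusQuery is not defined in this section. Its
# call sites pass (expression, start, end, step), mirroring Prometheus's
# range-query parameters, so a namedtuple is a natural fit; the field names
# are assumptions:
import collections

PrometheusQuery = collections.namedtuple(
    "PrometheusQuery", ["expr", "start", "end", "step"])
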
def train_task(self, metric, query):
    data_set = self.api.query(query)
    if len(data_set) > 0:
        values = [float(data) for data in data_set.values()]
        mean_value = np.mean(values)
        std_value = np.std(values)
        # Reduce the sampled window to a single (mean, std) feature pair.
        df_one = {"mean": mean_value, "std": std_value}
        logger.info(
            "[job-id:{id}][metric:{metric}] append data to train df:{df_one}"
            .format(id=sub_id(self.job.id), metric=metric, df_one=df_one))
        # Note: DataFrame.append was deprecated in pandas 1.4 and removed in
        # 2.0; see the pd.concat sketch below for newer pandas.
        self.df[metric] = self.df[metric].append(df_one, ignore_index=True)

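# A hedged equivalent of the row-append above for pandas 2.x, where
# DataFrame.append no longer exists (a sketch, not the original code):
self.df[metric] = pd.concat(
    [self.df[metric], pd.DataFrame([df_one])], ignore_index=True)
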
def main():
    parse_config()
    app = make_app()
    server = tornado.httpserver.HTTPServer(app)
    logger.info(
        "zeus server starting to listen on {port}...".format(port=options.port))
    server.listen(options.port)
    # Install handlers so SIGTERM/SIGHUP/SIGINT trigger a graceful shutdown.
    for sig in ('TERM', 'HUP', 'INT'):
        signal.signal(getattr(signal, 'SIG' + sig),
                      partial(sig_handler, server))
    tornado.ioloop.IOLoop.current().start()
    logger.info("Exit...")

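# Hypothetical sketch: make_app() is not defined in this section. A routing
# table consistent with the handlers above might look like this; the URL
# patterns and the Close/Detail handler names are assumptions (only
# ModelHandler is actually referenced, by shutdown() above):
def make_app():
    return tornado.web.Application([
        (r"/jobs", ModelHandler),                  # POST creates, GET lists
        (r"/jobs/close/(.*)", CloseJobHandler),    # assumed name
        (r"/jobs/detail/(.*)", DetailJobHandler),  # assumed name
    ])
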
def predict_task(self, metric, query):
    data_set = self.api.query(query)
    values = [float(data) for data in data_set.values()]
    mean_value = np.mean(values)
    std_value = np.std(values)
    # Build a single-sample feature matrix matching the training columns.
    predict_data = np.array([[mean_value, std_value]])
    logger.info(
        "[job-id:{id}][metric:{metric}] predict data:{predict_data}".format(
            id=sub_id(self.job.id), metric=metric, predict_data=predict_data))
    # predict() returns an array; take the single label (1 inlier, -1 outlier)
    # so callers can compare against a plain scalar.
    return self.ilf[metric].predict(predict_data)[0]

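# Minimal standalone example of the inlier/outlier contract relied on above:
# sklearn's IsolationForest labels inliers 1 and outliers -1 (the training
# data here is synthetic, for illustration only):
from sklearn.ensemble import IsolationForest

clf = IsolationForest(n_estimators=100).fit(np.random.rand(100, 2))
print(clf.predict(np.array([[0.5, 0.5]])))    # typically [1]: an inlier
print(clf.predict(np.array([[10.0, 10.0]])))  # typically [-1]: an outlier
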
def get(self, job_id):
    if job_id == "":
        logger.error("job id is required")
        self.finish({"code": HTTP_MISS_ARGS,
                     "message": "job id is required"})
        # Return early so we do not fall through and call finish() twice.
        return
    try:
        logger.info("close running job:{job_id}".format(job_id=job_id))
        self.close_job(job_id)
        self.finish({"code": HTTP_OK, "message": "OK"})
    except Exception as e:
        logger.error("close job:{job_id} failed:{err}".format(
            job_id=job_id, err=e))
        self.finish({"code": HTTP_FAIL,
                     "message": "close job:{job_id} failed:{err}".format(
                         job_id=job_id, err=e)})

def get(self, job_id):
    if job_id == "":
        logger.error("job id is required")
        self.finish({"code": HTTP_MISS_ARGS,
                     "message": "job id is required"})
        # Return early so we do not fall through and call finish() twice.
        return
    try:
        logger.info("get job:{job_id} detail".format(job_id=job_id))
        job = self.detail_job(job_id)
        self.finish({"code": HTTP_OK, "message": "OK", "data": job})
    except Exception as e:
        logger.error("get job:{job_id} detail failed:{err}".format(
            job_id=job_id, err=e))
        self.finish({"code": HTTP_FAIL,
                     "message": "get job:{job_id} detail failed:{err}".format(
                         job_id=job_id, err=e)})

def train(self, metric, query_expr):
    logger.info(
        "[job-id:{id}][metric:{metric}] starting to get sample data".format(
            id=sub_id(self.job.id), metric=metric))
    self.df[metric] = pd.DataFrame(columns=["mean", "std"])
    self.ilf[metric] = IsolationForest(n_estimators=100, n_jobs=-1, verbose=2)
    for index in range(0, self.train_count, 1):
        if self.__exit:
            logger.info("[job-id:{id}][metric:{metric}] stop".format(
                id=sub_id(self.job.id), metric=metric))
            return False
        now = datetime.datetime.now()
        # Sample the last 15 minutes at a 15s step for one training point.
        query = PrometheusQuery(
            query_expr,
            time.mktime((now - datetime.timedelta(minutes=15)).timetuple()),
            time.mktime(now.timetuple()), "15s")
        self.train_task(metric, query)
        if index % 10 == 0:
            # Every 10th round, inject a random (mean, std) sample,
            # presumably to give the forest some spread beyond observed data.
            mean_value = float(random.randint(0, 5000))
            std_value = float(random.randint(0, 10000))
            df_one = {"mean": mean_value, "std": std_value}
            self.df[metric] = self.df[metric].append(df_one,
                                                     ignore_index=True)
            logger.info(
                "[job-id:{id}][metric:{metric}] append data to train df:{df_one}"
                .format(id=sub_id(self.job.id), metric=metric, df_one=df_one))
        self.event.wait(self.train_interval)
    x_cols = ["mean", "std"]
    logger.info(
        "[job-id:{id}][metric:{metric}] starting to train sample data".format(
            id=sub_id(self.job.id), metric=metric))
    # Fix: fit on this metric's frame; self.df is a dict keyed by metric, so
    # the original self.df[x_cols] would raise a TypeError.
    self.ilf[metric].fit(self.df[metric][x_cols])
    return True

def query(self, query):
    # Assumption: data_source was a free (undefined) name in the original;
    # an instance attribute is the most plausible intent.
    logger.info("data_source: {data_source}".format(
        data_source=self.data_source))
    logger.info("query: {query}".format(query=query))

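# Hypothetical sketch: the HTTP call backing this query is not shown. A
# Prometheus range query hits /api/v1/query_range; the requests usage and
# the field names on q (matching the namedtuple sketch above) are
# assumptions:
import requests

def query_range(base_url, q):
    resp = requests.get(base_url + "/api/v1/query_range",
                        params={"query": q.expr, "start": q.start,
                                "end": q.end, "step": q.step})
    resp.raise_for_status()
    return resp.json()["data"]["result"]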