Exemple #1
0
 def shutdown():
     """Stop the HTTP server and all jobs, then schedule the IOLoop to stop."""
     logger.info('Stopping http server')
     server.stop()
     ModelHandler.close_all_jobs()
     logger.info('Will shutdown in %s seconds ...',
                 MAX_WAIT_SECONDS_BEFORE_SHUTDOWN)
     # Grace period: let in-flight work drain before the loop is forced down.
     deadline = time.time() + MAX_WAIT_SECONDS_BEFORE_SHUTDOWN
     stop_loop(deadline)
Exemple #2
0
    def post(self):
        """Create a Job from the JSON request body and start running it.

        Required body keys: data_source, model, metrics.
        Optional keys: slack_channel (default DEFAULT_CHANNEL) and
        timeout (default DEFAULT_JOB_TIMEOUT).
        """
        data = json.loads(self.request.body)
        print(data)
        try:
            slack_channel = data.get("slack_channel", DEFAULT_CHANNEL)
            timeout = data.get("timeout", DEFAULT_JOB_TIMEOUT)
            job = Job(str(uuid.uuid4()), data["data_source"],
                      data["model"], data["metrics"], slack_channel, timeout)
        except KeyError as e:
            self.finish({"code": HTTP_MISS_ARGS,
                         "message": "miss args %s" % e.args[0]})
            # BUG FIX: without this return, execution fell through with `job`
            # unbound, raising NameError below and calling finish() twice.
            return

        try:
            logger.info("run new job:{job}"
                        .format(job=dict(job)))
            self.run(job)
            self.finish({"code": HTTP_OK,
                         "message": "OK",
                         "data": dict(job)})
        except Exception as e:
            logger.error("run job:{job} failed:{err}"
                         .format(job=dict(job), err=e))
            self.finish({"code": HTTP_FAIL,
                         "message": "run job:{job} failed:{err}"
                         .format(job=dict(job), err=e)})
Exemple #3
0
 def timeout_action(self):
     """Mark the job finished when its timeout fires.

     TODO: do some clean action after timeout.
     """
     with self.lock:
         job_id = sub_id(self.job.id)
         logger.info("[job-id:{id}] finish the job".format(id=job_id))
         # Tell the worker loop to exit and wake anything waiting on the event.
         self.__exit = True
         self.event.set()
Exemple #4
0
 def get(self):
     """Return every currently running job as a JSON payload."""
     logger.info("list all running jobs")
     running = self.list_jobs()
     logger.info("running jobs:{jobs}".format(jobs=running))
     self.finish({
         "code": HTTP_OK,
         "message": "OK",
         "data": running,
     })
Exemple #5
0
 def stop_loop(deadline):
     """Stop io_loop once pending work drains, or at `deadline` at the latest.

     Re-schedules itself once per second while callbacks/timeouts remain.
     NOTE(review): this reads private IOLoop attributes (_callbacks,
     _timeouts); confirm they still exist on the Tornado version in use.
     """
     now = time.time()
     if now >= deadline or not (io_loop._callbacks or io_loop._timeouts):
         io_loop.stop()
         logger.info('Shutdown finally')
         return
     logger.info('Waiting for next tick')
     io_loop.add_timeout(now + 1, stop_loop, deadline)
Exemple #6
0
    def close(self):
        """Shut this job down: signal the worker to exit and cancel its timer.

        TODO: close this job.
        """
        with self.lock:
            job_id = sub_id(self.job.id)
            logger.info("[job-id:{id}] closing the job".format(id=job_id))
            # Flag the worker loop to stop and wake any waiters immediately.
            self.__exit = True
            self.event.set()
            # The timeout no longer applies once the job is closed explicitly.
            self.timer.cancel()
Exemple #7
0
def parse_config():
    """Parse command-line options (and an optional config file), then
    validate and apply the required settings.

    Exits the process with status 1 when no slack token was supplied.
    """
    options.parse_command_line()
    config_file = options.config
    if config_file != "":
        logger.info("parse config from config file: {config}".format(
            config=config_file))
        options.parse_config_file(config_file)

    if options.slack_token == "":
        logger.error("slack token is required!!")
        sys.exit(1)

    alert.SLACK_TOKEN = options.slack_token
    logger.info("config: {config}".format(config=options.items()))
Exemple #8
0
    def predict(self, metric, query_expr):
        """Periodically query the data source and run prediction for `metric`.

        Loops until the job is signalled to exit, querying a trailing
        5-minute window every `self.predict_interval` seconds and alerting
        the job's Slack channel whenever prediction reports an error.
        """
        # BUG FIX: the original format string was "[metric:metric}]" -- the
        # stray unmatched '}' makes str.format raise ValueError on entry.
        logger.info("[job-id:{id}][metric:{metric}] starting to predict".format(
            id=sub_id(self.job.id), metric=metric))
        while not self.__exit:
            now = datetime.datetime.now()
            # Trailing 5-minute window at 15s resolution.
            query = PrometheusQuery(
                query_expr,
                time.mktime((now - datetime.timedelta(minutes=5)).timetuple()),
                time.mktime(now.timetuple()), "15s")

            if self.predict_task(metric, query) == 1:
                logger.info("[job-id:{id}][metric:{metric}] predict OK".format(
                    id=sub_id(self.job.id), metric=metric))
            else:
                logger.info(
                    "[job-id:{id}][metric:{metric}] Predict Error".format(
                        id=sub_id(self.job.id), metric=metric))
                self.callback(
                    "[job] {job}, predict metric {metric} error in last {time}s"
                    .format(job=dict(self.job),
                            metric=metric,
                            time=self.predict_interval),
                    self.job.slack_channel)

            # Sleep until the next round; event.set() wakes us early on close.
            self.event.wait(self.predict_interval)
        logger.info("[job-id:{id}][metric:{metric}] stop".format(
            id=sub_id(self.job.id), metric=metric))
Exemple #9
0
    def train_task(self, metric, query):
        """Fetch one sample window and append its mean/std as a training row
        to the DataFrame kept for `metric`.

        Does nothing when the query returns no data.
        """
        data_set = self.api.query(query)
        if len(data_set) > 0:
            values = [float(data) for data in data_set.values()]

            mean_value = np.mean(values)
            std_value = np.std(values)
            df_one = {"mean": mean_value, "std": std_value}

            logger.info(
                "[job-id:{id}][metric:{metric}] append data to train df:{df_one}"
                .format(id=sub_id(self.job.id), metric=metric, df_one=df_one))
            # COMPAT FIX: DataFrame.append was deprecated in pandas 1.4 and
            # removed in 2.0; pd.concat is the supported equivalent.
            self.df[metric] = pd.concat(
                [self.df[metric], pd.DataFrame([df_one])], ignore_index=True)
Exemple #10
0
def main():
    """Entry point: parse config, start the HTTP server, run the IOLoop."""
    parse_config()

    application = make_app()
    server = tornado.httpserver.HTTPServer(application)
    logger.info(
        "zeus server start to listen {port}...".format(port=options.port))
    server.listen(options.port)

    # Install graceful-shutdown handlers for SIGTERM, SIGHUP and SIGINT.
    for name in ('TERM', 'HUP', 'INT'):
        signal.signal(getattr(signal, 'SIG' + name),
                      partial(sig_handler, server))

    tornado.ioloop.IOLoop.current().start()

    logger.info("Exit...")
Exemple #11
0
    def predict_task(self, metric, query):
        """Run one prediction round for `metric`: summarize the query window
        into (mean, std) and classify it with the metric's trained model."""
        data_set = self.api.query(query)
        samples = [float(value) for value in data_set.values()]

        mean_value = np.mean(samples)
        std_value = np.std(samples)
        # Single-row feature matrix matching the training columns.
        predict_data = np.array([[mean_value, std_value]])

        logger.info(
            "[job-id:{id}][metric:{metric}] predict data:{predict_data}".
            format(id=sub_id(self.job.id),
                   metric=metric,
                   predict_data=predict_data))
        return self.ilf[metric].predict(predict_data)
Exemple #12
0
    def get(self, job_id):
        """Close the running job identified by `job_id`."""
        if job_id == "":
            logger.error("job id is required")
            self.finish({"code": HTTP_MISS_ARGS,
                         "message": "job id is required"})
            # BUG FIX: without this return the handler fell through and
            # called finish() a second time, which Tornado rejects.
            return

        try:
            logger.info("close running job:{job_id}"
                        .format(job_id=job_id))
            self.close_job(job_id)
            self.finish({"code": HTTP_OK,
                         "message": "OK"})
        except Exception as e:
            logger.error("close job:{job_id} failed:{err}"
                         .format(job_id=job_id, err=e))
            self.finish({"code": HTTP_FAIL,
                         "message": "close job:{job_id} failed:{err}"
                         .format(job_id=job_id, err=e)})
Exemple #13
0
    def get(self, job_id):
        """Return the detail of the job identified by `job_id`."""
        if job_id == "":
            logger.error("job id is required")
            self.finish({"code": HTTP_MISS_ARGS,
                         "message": "job id is required"})
            # BUG FIX: without this return the handler fell through and
            # called finish() a second time, which Tornado rejects.
            return

        try:
            logger.info("get job:{job_id} detail"
                        .format(job_id=job_id))
            job = self.detail_job(job_id)
            self.finish({"code": HTTP_OK,
                         "message": "OK",
                         "data": job})
        except Exception as e:
            logger.error("get job:{job_id} detail failed:{err}"
                         .format(job_id=job_id, err=e))
            self.finish({"code": HTTP_FAIL,
                         "message": "get job:{job_id} detail failed:{err}"
                         .format(job_id=job_id, err=e)})
Exemple #14
0
    def train(self, metric, query_expr):
        """Collect training samples for `metric` and fit its IsolationForest.

        Polls the data source `self.train_count` times, once every
        `self.train_interval` seconds, periodically injecting random
        outlier rows so the forest has anomalous points to separate.

        Returns False when the job was asked to exit mid-training,
        True after a completed fit.
        """
        logger.info(
            "[job-id:{id}][metric:{metric}] starting to get sample data".
            format(id=sub_id(self.job.id), metric=metric))
        self.df[metric] = pd.DataFrame(columns=["mean", "std"])
        self.ilf[metric] = IsolationForest(n_estimators=100,
                                           n_jobs=-1,
                                           verbose=2)
        for index in range(0, self.train_count, 1):
            if self.__exit:
                logger.info("[job-id:{id}][metric:{metric}] stop".format(
                    id=sub_id(self.job.id), metric=metric))
                return False

            now = datetime.datetime.now()
            # Sample a trailing 15-minute window at 15s resolution.
            query = PrometheusQuery(
                query_expr,
                time.mktime(
                    (now - datetime.timedelta(minutes=15)).timetuple()),
                time.mktime(now.timetuple()), "15s")
            self.train_task(metric, query)

            if index % 10 == 0:
                # Synthetic outlier every 10th round.
                mean_value = float(random.randint(0, 5000))
                std_value = float(random.randint(0, 10000))
                df_one = {"mean": mean_value, "std": std_value}
                # COMPAT FIX: DataFrame.append was removed in pandas 2.0;
                # pd.concat is the supported equivalent.
                self.df[metric] = pd.concat(
                    [self.df[metric], pd.DataFrame([df_one])],
                    ignore_index=True)

                logger.info(
                    "[job-id:{id}][metric:{metric}] append data to train df:{df_one}"
                    .format(id=sub_id(self.job.id),
                            metric=metric,
                            df_one=df_one))

            self.event.wait(self.train_interval)
        x_cols = ["mean", "std"]
        logger.info(
            "[job-id:{id}][metric:{metric}] starting to train sample data".
            format(id=sub_id(self.job.id), metric=metric))
        # BUG FIX: self.df is a dict keyed by metric (see the assignment
        # above); indexing it with the column list raised TypeError
        # (unhashable list). Select the columns of this metric's DataFrame.
        self.ilf[metric].fit(self.df[metric][x_cols])
        return True
Exemple #15
0
 def query(self, query):
     """Log the data source and the query about to be executed.

     NOTE(review): `data_source` is not defined in this scope -- as written
     the first log line raises NameError unless a module-level global
     exists; presumably `self.data_source` was intended -- confirm against
     the enclosing class. The `query` parameter also shadows the method
     name, which is legal but confusing.
     """
     logger.info("data_source: {data_source}"
                 .format(data_source=data_source))
     logger.info("query: {query}".format(query=query))