Esempio n. 1
0
def Run():
    """Poll pending jobs forever and dispatch each one by its status.

    Every iteration:

    1. refreshes ``config["racks"]`` and ``config["skus"]`` from
       ``k8sUtils.get_node_labels``;
    2. fetches the pending jobs through a new ``DataHandler`` and calls
       ``SubmitJob`` / ``KillJob`` / ``UpdateJobStatus`` / ``AutoApproveJob``
       depending on ``job["jobStatus"]``;
    3. sleeps for one second.

    Errors are printed and the loop keeps going, so this function never
    returns normally.
    """
    while True:

        try:
            config["racks"] = k8sUtils.get_node_labels("rack")
            config["skus"] = k8sUtils.get_node_labels("sku")
        except Exception as e:
            # Parenthesised single-argument print works identically on
            # Python 2 and Python 3; the bare statement form is a syntax
            # error on Python 3.
            print(e)

        try:
            # NOTE(review): the handler is never explicitly closed here --
            # confirm whether DataHandler needs cleanup per iteration.
            dataHandler = DataHandler()
            pendingJobs = dataHandler.GetPendingJobs()
            printlog("updating status for %d jobs" % len(pendingJobs))
            for job in pendingJobs:
                # A failure on one job must not stop the rest of the batch.
                try:
                    print("Processing job: %s, status: %s" %
                          (job["jobId"], job["jobStatus"]))
                    if job["jobStatus"] == "queued":
                        SubmitJob(job)
                    elif job["jobStatus"] == "killing":
                        KillJob(job)
                    elif job["jobStatus"] in ("scheduling", "running"):
                        UpdateJobStatus(job)
                    elif job["jobStatus"] == "unapproved":
                        AutoApproveJob(job)
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)

        time.sleep(1)
Esempio n. 2
0
def Run():
    """Run the job manager loop: act on pending jobs, then update statuses.

    Setup: registers the stack trace dump, starts a ``notify.Notifier``
    built from ``config.get("job-manager")`` and creates the log.

    Every iteration (timed by ``manager_iteration_histogram``):

    1. touches the "job_manager" modification-time file;
    2. refreshes ``config["racks"]`` and ``config["skus"]``;
    3. calls ``TakeJobActions`` on the pending jobs, re-reads the pending
       jobs and dispatches each one by ``job["jobStatus"]``;
    4. closes the ``DataHandler`` created for this iteration;
    5. sleeps for one second.

    Errors are logged and the loop keeps going, so this function never
    returns normally.
    """
    register_stack_trace_dump()
    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()
    create_log()

    while True:
        update_file_modification_time("job_manager")

        with manager_iteration_histogram.labels("job_manager").time():
            try:
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception:
                logging.exception("get node labels failed")

            # Reset on every iteration so the cleanup below never touches an
            # unbound name or the previous iteration's already-closed handler
            # when DataHandler() itself fails.
            dataHandler = None
            try:
                dataHandler = DataHandler()
                pendingJobs = dataHandler.GetPendingJobs()
                TakeJobActions(pendingJobs)

                # Re-read: the actions above may have changed job statuses.
                pendingJobs = dataHandler.GetPendingJobs()
                logging.info("Updating status for %d jobs" % len(pendingJobs))
                for job in pendingJobs:
                    # A failure on one job must not stop the rest.
                    try:
                        logging.info("Processing job: %s, status: %s" %
                                     (job["jobId"], job["jobStatus"]))
                        if job["jobStatus"] == "killing":
                            KillJob(job["jobId"], "killed")
                        elif job["jobStatus"] == "pausing":
                            KillJob(job["jobId"], "paused")
                        elif job["jobStatus"] in ("scheduling", "running"):
                            UpdateJobStatus(job, notifier)
                        elif job["jobStatus"] == "unapproved":
                            ApproveJob(job)
                    except Exception as e:
                        logging.warning(e, exc_info=True)
            except Exception:
                logging.warning("Process job failed!", exc_info=True)
            finally:
                if dataHandler is not None:
                    # Best-effort close: log instead of silently swallowing,
                    # and let KeyboardInterrupt/SystemExit propagate.
                    try:
                        dataHandler.Close()
                    except Exception:
                        logging.warning("Close data handler failed",
                                        exc_info=True)

        time.sleep(1)
Esempio n. 3
0
def Run():
    """Poll pending jobs forever and dispatch each one by its status.

    Creates the log once, then on every iteration:

    1. refreshes ``config["racks"]`` and ``config["skus"]`` from
       ``k8sUtils.get_node_labels``;
    2. fetches the pending jobs through a new ``DataHandler`` and calls
       ``SubmitJob`` / ``KillJob`` / ``UpdateJobStatus`` / ``AutoApproveJob``
       depending on ``job["jobStatus"]``;
    3. sleeps for one second.

    Errors are printed and the loop keeps going, so this function never
    returns normally.
    """
    create_log()
    logging.info("start to process jobs ...")

    while True:

        try:
            config["racks"] = k8sUtils.get_node_labels("rack")
            config["skus"] = k8sUtils.get_node_labels("sku")
        except Exception as e:
            # Parenthesised single-argument print works identically on
            # Python 2 and Python 3; the bare statement form is a syntax
            # error on Python 3.
            print(e)

        try:
            # NOTE(review): the handler is never explicitly closed here --
            # confirm whether DataHandler needs cleanup per iteration.
            dataHandler = DataHandler()
            pendingJobs = dataHandler.GetPendingJobs()

            for job in pendingJobs:
                # A failure on one job must not stop the rest of the batch.
                try:
                    logging.info("to process one pendingJob.")
                    msg = "Processing job: %s, status: %s" % (str(
                        job["jobId"]), str(job["jobStatus"]))
                    logging.info(msg)

                    if job["jobStatus"] == "queued":
                        SubmitJob(job)
                    elif job["jobStatus"] == "killing":
                        KillJob(job)
                    elif job["jobStatus"] in ("scheduling", "running"):
                        UpdateJobStatus(job)
                    elif job["jobStatus"] == "unapproved":
                        AutoApproveJob(job)

                except Exception as e:
                    print(e)

        except Exception as e:
            print(e)

        time.sleep(1)
Esempio n. 4
0
def Run():
    """Endless job-manager loop: act on pending jobs, then update statuses.

    Each pass refreshes the rack/sku node labels in ``config``, runs
    ``TakeJobActions`` on the pending jobs, re-reads the pending jobs and
    dispatches every one according to ``job["jobStatus"]``. The
    ``DataHandler`` opened for the pass is always closed. Failures are
    printed (or logged for a single job) and the loop continues after a
    one-second sleep.
    """
    while True:

        # Refresh node labels; a failure on the first lookup skips the second.
        try:
            for config_key, label in (("racks", "rack"), ("skus", "sku")):
                config[config_key] = k8sUtils.get_node_labels(label)
        except Exception as err:
            print(err)

        try:
            handler = DataHandler()
            try:
                TakeJobActions(handler.GetPendingJobs())

                # Second read picks up whatever the actions above changed.
                jobs = handler.GetPendingJobs()
                logging.info("Updating status for %d jobs" % len(jobs))
                for job in jobs:
                    # Isolate each job so one failure does not stop the rest.
                    try:
                        logging.info("Processing job: %s, status: %s" %
                                     (job["jobId"], job["jobStatus"]))
                        status = job["jobStatus"]
                        if status == "killing":
                            KillJob(job, "killed")
                        elif status == "pausing":
                            KillJob(job, "paused")
                        elif status in ("scheduling", "running"):
                            UpdateJobStatus(job)
                        elif status == "unapproved":
                            AutoApproveJob(job)
                    except Exception as job_err:
                        logging.info(job_err)
            except Exception as batch_err:
                print(str(batch_err))
            finally:
                handler.Close()
        except Exception as outer_err:
            # Reached when creating or closing the handler fails.
            print(str(outer_err))

        time.sleep(1)
Esempio n. 5
0
def Run(redis_port, target_status):
    """Run the job manager loop for one target job status.

    Setup: registers the stack trace dump, creates the log under the
    process name ``"job_manager_" + target_status``, starts a
    ``notify.Notifier``, starts the launcher selected by
    ``config["job-manager"]["launcher"]`` (default ``"python"``) and opens
    a redis connection to ``localhost``. An unknown launcher type is logged
    and terminates the process with exit code 2.

    Every iteration (timed by ``manager_iteration_histogram``):

    1. touches the process's modification-time file;
    2. refreshes ``config["racks"]`` and ``config["skus"]``;
    3. waits for the launcher's previous batch of tasks;
    4. if ``target_status`` is ``"queued"``, passes all
       queued/scheduling/running jobs to ``take_job_actions``; otherwise
       fetches the jobs with ``status=target_status`` and dispatches each
       by ``job["jobStatus"]``;
    5. closes the ``DataHandler`` created for this iteration;
    6. sleeps for one second.

    Args:
        redis_port: Port of the redis server on ``localhost``.
        target_status: Job status this process is responsible for.
    """
    register_stack_trace_dump()
    process_name = "job_manager_" + target_status

    create_log(process_name=process_name)

    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()

    launcher_type = config.get("job-manager", {}).get("launcher", "python")
    if launcher_type == "python":
        launcher = PythonLauncher()
    elif launcher_type == "controller":
        launcher = LauncherStub()
    else:
        logger.error("unknown launcher_type %s", launcher_type)
        sys.exit(2)
    launcher.start()

    redis_conn = redis.StrictRedis(host="localhost", port=redis_port, db=0)

    while True:
        update_file_modification_time(process_name)

        with manager_iteration_histogram.labels(process_name).time():
            try:
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception:
                logger.exception("get node labels failed")

            # Reset on every iteration: wait_tasks_done() or DataHandler()
            # may raise before a handler exists, and the cleanup below must
            # not touch an unbound name or the previous iteration's
            # already-closed handler.
            data_handler = None
            try:
                # wait for tasks from previous batch done
                launcher.wait_tasks_done()

                data_handler = DataHandler()

                if target_status == "queued":
                    jobs = data_handler.GetJobList(
                        "all",
                        "all",
                        num=None,
                        status="queued,scheduling,running")
                    take_job_actions(data_handler, redis_conn, launcher, jobs)
                else:
                    jobs = data_handler.GetJobList("all",
                                                   "all",
                                                   num=None,
                                                   status=target_status)
                    logger.info("Updating status for %d %s jobs", len(jobs),
                                target_status)

                    for job in jobs:
                        logger.info("Processing job: %s, status: %s",
                                    job["jobId"], job["jobStatus"])
                        if job["jobStatus"] == "killing":
                            launcher.kill_job(job["jobId"], "killed")
                        elif job["jobStatus"] == "pausing":
                            launcher.kill_job(job["jobId"], "paused")
                        elif job["jobStatus"] in ("running", "scheduling"):
                            UpdateJobStatus(redis_conn,
                                            launcher,
                                            job,
                                            notifier,
                                            dataHandlerOri=data_handler)
                        elif job["jobStatus"] == "unapproved":
                            ApproveJob(redis_conn,
                                       job,
                                       dataHandlerOri=data_handler)
                        else:
                            logger.error("unknown job status %s for job %s",
                                         job["jobStatus"], job["jobId"])
            except Exception:
                logger.exception("Process jobs failed!")
            finally:
                if data_handler is not None:
                    # Best-effort close: log instead of silently swallowing,
                    # and let KeyboardInterrupt/SystemExit propagate.
                    try:
                        data_handler.Close()
                    except Exception:
                        logger.warning("Close data handler failed",
                                       exc_info=True)

        time.sleep(1)