def Run():
    """Main job-manager loop.

    Once per second: refresh the rack/sku node labels, then walk every
    pending job and advance it according to its status (submit, kill,
    update, or auto-approve). Runs forever; each layer of try/except
    isolates failures so one bad job or one bad iteration cannot kill
    the loop.
    """
    while True:
        try:
            # Refresh cluster topology labels consumed by the schedulers.
            config["racks"] = k8sUtils.get_node_labels("rack")
            config["skus"] = k8sUtils.get_node_labels("sku")
        except Exception:
            # Best-effort: stale labels are tolerable for one iteration.
            # (Was a bare Python-2 `print e`, which lost the traceback.)
            logging.exception("get node labels failed")

        try:
            dataHandler = DataHandler()
            try:
                pendingJobs = dataHandler.GetPendingJobs()
                printlog("updating status for %d jobs" % len(pendingJobs))
                for job in pendingJobs:
                    try:
                        logging.info("Processing job: %s, status: %s",
                                     job["jobId"], job["jobStatus"])
                        if job["jobStatus"] == "queued":
                            SubmitJob(job)
                        elif job["jobStatus"] == "killing":
                            KillJob(job)
                        elif job["jobStatus"] in ("scheduling", "running"):
                            UpdateJobStatus(job)
                        elif job["jobStatus"] == "unapproved":
                            AutoApproveJob(job)
                    except Exception:
                        # One failing job must not block the rest of the batch.
                        logging.exception("processing job failed")
            finally:
                # Fix: the original never closed the DataHandler, leaking a
                # DB connection every iteration; always release it.
                try:
                    dataHandler.Close()
                except Exception:
                    pass
        except Exception:
            logging.exception("processing pending jobs failed")

        time.sleep(1)
def Run():
    """Job-manager daemon loop.

    Starts the notifier, then once per second (each iteration timed by the
    manager histogram): refreshes rack/sku node labels, runs scheduling
    actions over the pending jobs, and advances each pending job according
    to its current status. Failures are logged and never stop the loop.
    """
    register_stack_trace_dump()
    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()
    create_log()

    def dispatch(job):
        # Advance one job based on its status; unrecognized statuses are
        # left untouched.
        logging.info("Processing job: %s, status: %s" %
                     (job["jobId"], job["jobStatus"]))
        status = job["jobStatus"]
        if status == "killing":
            KillJob(job["jobId"], "killed")
        elif status == "pausing":
            KillJob(job["jobId"], "paused")
        elif status in ("scheduling", "running"):
            UpdateJobStatus(job, notifier)
        elif status == "unapproved":
            ApproveJob(job)

    while True:
        update_file_modification_time("job_manager")
        with manager_iteration_histogram.labels("job_manager").time():
            try:
                # Topology labels feed the scheduling decisions below.
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception as e:
                logging.exception("get node labels failed")
            try:
                dataHandler = DataHandler()
                TakeJobActions(dataHandler.GetPendingJobs())
                # Re-read: TakeJobActions may have changed job statuses.
                pendingJobs = dataHandler.GetPendingJobs()
                logging.info("Updating status for %d jobs" % len(pendingJobs))
                for job in pendingJobs:
                    try:
                        dispatch(job)
                    except Exception as e:
                        # A single bad job must not abort the batch.
                        logging.warning(e, exc_info=True)
            except Exception as e:
                logging.warning("Process job failed!", exc_info=True)
            finally:
                # Best-effort close; also covers the case where DataHandler()
                # itself failed and dataHandler is unbound.
                try:
                    dataHandler.Close()
                except:
                    pass
        time.sleep(1)
def Run():
    """Job-processing loop.

    Sets up logging, then once per second refreshes the rack/sku node
    labels and advances every pending job by status (submit, kill, update,
    or auto-approve). Runs forever; failures are logged and skipped.
    """
    create_log()
    logging.info("start to process jobs ...")
    while True:
        try:
            config["racks"] = k8sUtils.get_node_labels("rack")
            config["skus"] = k8sUtils.get_node_labels("sku")
        except Exception:
            # Stale labels are acceptable for a single iteration.
            # (Was a Python-2 `print e`, inconsistent with logging use above.)
            logging.exception("get node labels failed")

        try:
            dataHandler = DataHandler()
            try:
                pendingJobs = dataHandler.GetPendingJobs()
                for job in pendingJobs:
                    try:
                        logging.info("Processing job: %s, status: %s",
                                     str(job["jobId"]), str(job["jobStatus"]))
                        if job["jobStatus"] == "queued":
                            SubmitJob(job)
                        elif job["jobStatus"] == "killing":
                            KillJob(job)
                        elif job["jobStatus"] in ("scheduling", "running"):
                            UpdateJobStatus(job)
                        elif job["jobStatus"] == "unapproved":
                            AutoApproveJob(job)
                    except Exception:
                        # Isolate per-job failures with full traceback.
                        logging.exception("processing job failed")
            finally:
                # Fix: the original leaked the DataHandler every iteration;
                # release it even when the batch fails.
                try:
                    dataHandler.Close()
                except Exception:
                    pass
        except Exception:
            logging.exception("processing pending jobs failed")

        time.sleep(1)
def Run():
    """Job-manager loop.

    Once per second: refresh rack/sku node labels, run scheduling actions
    over pending jobs, then kill/pause/update/approve each pending job by
    status. Errors at every layer are logged and the loop continues.
    """
    while True:
        try:
            config["racks"] = k8sUtils.get_node_labels("rack")
            config["skus"] = k8sUtils.get_node_labels("sku")
        except Exception:
            # Was print(e): inconsistent with the logging calls below and
            # lost the traceback.
            logging.exception("get node labels failed")

        try:
            dataHandler = DataHandler()
            try:
                TakeJobActions(dataHandler.GetPendingJobs())
                # Re-read: TakeJobActions may have changed job statuses.
                pendingJobs = dataHandler.GetPendingJobs()
                logging.info("Updating status for %d jobs", len(pendingJobs))
                for job in pendingJobs:
                    try:
                        logging.info("Processing job: %s, status: %s",
                                     job["jobId"], job["jobStatus"])
                        if job["jobStatus"] == "killing":
                            KillJob(job, "killed")
                        elif job["jobStatus"] == "pausing":
                            KillJob(job, "paused")
                        elif job["jobStatus"] in ("scheduling", "running"):
                            UpdateJobStatus(job)
                        elif job["jobStatus"] == "unapproved":
                            AutoApproveJob(job)
                    except Exception as e:
                        # Fix: was logging.info(e) — keep the traceback and
                        # a warning severity; one bad job must not stop the batch.
                        logging.warning(e, exc_info=True)
            except Exception:
                logging.exception("Process jobs failed!")
            finally:
                # Fix: guard Close() so a teardown error cannot escape the
                # iteration.
                try:
                    dataHandler.Close()
                except Exception:
                    pass
        except Exception:
            logging.exception("job manager iteration failed")

        time.sleep(1)
def Run(redis_port, target_status):
    """Per-status job-manager worker loop.

    One worker process handles one target_status. Sets up logging, the
    notifier, and a launcher (chosen from the "launcher" config key), then
    loops forever: refresh rack/sku node labels, fetch the jobs matching
    target_status, and advance each one. For target_status == "queued" the
    batch (queued/scheduling/running) is handed to take_job_actions; for
    every other status each job is dispatched individually.
    """
    register_stack_trace_dump()
    process_name = "job_manager_" + target_status
    create_log(process_name=process_name)
    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()

    # Select the launcher implementation from config; default is the
    # in-process Python launcher. Unknown values are a fatal config error.
    launcher_type = config.get("job-manager", {}).get("launcher", "python")
    if launcher_type == "python":
        launcher = PythonLauncher()
    elif launcher_type == "controller":
        launcher = LauncherStub()
    else:
        logger.error("unknown launcher_type %s", launcher_type)
        sys.exit(2)
    launcher.start()

    # Local redis used to share state with other manager processes
    # (passed through to UpdateJobStatus / ApproveJob / take_job_actions).
    redis_conn = redis.StrictRedis(host="localhost", port=redis_port, db=0)

    while True:
        # Heartbeat file + per-iteration latency histogram.
        update_file_modification_time(process_name)
        with manager_iteration_histogram.labels(process_name).time():
            try:
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception as e:
                # Best-effort: stale labels are tolerable for one iteration.
                logger.exception("get node labels failed")
            try:
                # Wait for tasks from the previous batch to be done before
                # starting a new batch.
                launcher.wait_tasks_done()
                data_handler = DataHandler()
                if target_status == "queued":
                    # The "queued" worker sees the wider scheduling picture
                    # (queued + scheduling + running) and delegates wholesale.
                    jobs = data_handler.GetJobList(
                        "all", "all", num=None,
                        status="queued,scheduling,running")
                    take_job_actions(data_handler, redis_conn, launcher, jobs)
                else:
                    jobs = data_handler.GetJobList(
                        "all", "all", num=None, status=target_status)
                    logger.info("Updating status for %d %s jobs",
                                len(jobs), target_status)
                    for job in jobs:
                        logger.info("Processing job: %s, status: %s" %
                                    (job["jobId"], job["jobStatus"]))
                        if job["jobStatus"] == "killing":
                            launcher.kill_job(job["jobId"], "killed")
                        elif job["jobStatus"] == "pausing":
                            launcher.kill_job(job["jobId"], "paused")
                        elif job["jobStatus"] == "running":
                            UpdateJobStatus(redis_conn, launcher, job,
                                            notifier,
                                            dataHandlerOri=data_handler)
                        elif job["jobStatus"] == "scheduling":
                            UpdateJobStatus(redis_conn, launcher, job,
                                            notifier,
                                            dataHandlerOri=data_handler)
                        elif job["jobStatus"] == "unapproved":
                            ApproveJob(redis_conn, job,
                                       dataHandlerOri=data_handler)
                        else:
                            # Status drifted between fetch and dispatch, or
                            # bad data; log and move on.
                            logger.error("unknown job status %s for job %s",
                                         job["jobStatus"], job["jobId"])
            except Exception as e:
                logger.exception("Process jobs failed!")
            finally:
                # Best-effort close; also covers the case where DataHandler()
                # failed and data_handler is unbound.
                try:
                    data_handler.Close()
                except:
                    pass
        time.sleep(1)