def Run(): register_stack_trace_dump() create_log() logger.info("start to update job logs ...") while True: update_file_modification_time("joblog_manager") with manager_iteration_histogram.labels("joblog_manager").time(): try: update_job_logs() except Exception as e: logger.exception("update job logs failed") time.sleep(1)
def Run(): register_stack_trace_dump() create_log() logger.info("start to update user directory...") while True: update_file_modification_time("user_manager") with manager_iteration_histogram.labels("user_manager").time(): try: set_user_directory() except Exception as e: logger.exception("set user directory failed") time.sleep(1)
def run():
    register_stack_trace_dump()
    create_log()

    while True:
        update_file_modification_time("db_manager")

        with manager_iteration_histogram.labels("db_manager").time():
            try:
                delete_old_cluster_status(CLUSTER_STATUS_EXPIRY)
                delete_old_inactive_jobs(JOBS_EXPIRY)
            except Exception:
                # logger.exception already records the traceback
                logger.exception(
                    "Deleting old cluster status/inactive jobs failed")
        time.sleep(86400)
def Run(): register_stack_trace_dump() create_log() logger.info("start to DoDataConvert...") while True: update_file_modification_time("DataConvert") with manager_iteration_histogram.labels("data_convert").time(): try: DoDataConvert() except Exception as e: logger.exception("do dataConvert failed") time.sleep(1)
def Run(): register_stack_trace_dump() create_log() while True: update_file_modification_time("endpoint_manager") with manager_iteration_histogram.labels("endpoint_manager").time(): # start endpoints start_endpoints() time.sleep(1) # clean up endpoints for jobs which is NOT running cleanup_endpoints() time.sleep(1)
def Run(): register_stack_trace_dump() create_log() logging.info("start to update nodes usage information ...") config["cluster_status"] = None while True: update_file_modification_time("node_manager") with manager_iteration_histogram.labels("node_manager").time(): try: get_cluster_status() except Exception as e: logging.exception("get cluster status failed") time.sleep(30)
def Run():
    register_stack_trace_dump()
    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()
    create_log()

    while True:
        update_file_modification_time("job_manager")

        with manager_iteration_histogram.labels("job_manager").time():
            try:
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception:
                logging.exception("get node labels failed")

            try:
                dataHandler = DataHandler()

                pendingJobs = dataHandler.GetPendingJobs()
                TakeJobActions(pendingJobs)

                # re-fetch: TakeJobActions may have changed job states
                pendingJobs = dataHandler.GetPendingJobs()
                logging.info("Updating status for %d jobs", len(pendingJobs))
                for job in pendingJobs:
                    try:
                        logging.info("Processing job: %s, status: %s",
                                     job["jobId"], job["jobStatus"])
                        if job["jobStatus"] == "killing":
                            KillJob(job["jobId"], "killed")
                        elif job["jobStatus"] == "pausing":
                            KillJob(job["jobId"], "paused")
                        elif job["jobStatus"] in ("scheduling", "running"):
                            UpdateJobStatus(job, notifier)
                        elif job["jobStatus"] == "unapproved":
                            ApproveJob(job)
                    except Exception as e:
                        logging.warning(e, exc_info=True)
            except Exception:
                logging.warning("Process job failed!", exc_info=True)
            finally:
                try:
                    dataHandler.Close()
                except Exception:
                    pass
        time.sleep(1)
def run():
    register_stack_trace_dump()
    create_log()
    update = lambda: update_file_modification_time("db_manager")

    while True:
        update()

        with manager_iteration_histogram.labels("db_manager").time():
            try:
                delete_old_cluster_status(CLUSTER_STATUS_EXPIRY)
                # the query below is too time consuming since lastUpdated
                # in the job table is not indexed
                # delete_old_inactive_jobs(JOBS_EXPIRY)
            except Exception:
                logger.exception("Deleting old cluster status failed")
        sleep_with_update(86400, update)
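# sleep_with_update replaces the plain time.sleep(86400) of the earlier run():
# sleeping a whole day without touching the marker file would make the
# liveness probe think db_manager had hung. A minimal sketch under that
# assumption; the real helper's chunk size is unknown:
import time

def sleep_with_update(total_seconds, update, interval=60):
    # sleep in short chunks, calling update() between chunks so the
    # db_manager marker stays fresh for the whole duration
    deadline = time.time() + total_seconds
    while True:
        update()
        remaining = deadline - time.time()
        if remaining <= 0:
            break
        time.sleep(min(interval, remaining))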
def Run(): register_stack_trace_dump() create_log() while True: update_file_modification_time("endpoint_manager") with manager_iteration_histogram.labels("endpoint_manager").time(): try: runnings = start_endpoints() fix_endpoints(runnings) # clean up endpoints for jobs which is NOT running cleanup_endpoints() except Exception: logger.exception("processing this round of endpoints failed") time.sleep(1)
def Run(): register_stack_trace_dump() create_log() while True: update_file_modification_time("command_manager") with manager_iteration_histogram.labels("command_manager").time(): try: dataHandler = DataHandler() pendingCommands = dataHandler.GetPendingCommands() for command in pendingCommands: try: logger.info("Processing command: %s", command["id"]) RunCommand(command) except Exception as e: logger.exception("run command failed") except Exception as e: logger.exception("getting command failed") time.sleep(1)
def Run(redis_port, target_status):
    register_stack_trace_dump()
    process_name = "job_manager_" + target_status

    create_log(process_name=process_name)

    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()

    launcher_type = config.get("job-manager", {}).get("launcher", "python")
    if launcher_type == "python":
        launcher = PythonLauncher()
    elif launcher_type == "controller":
        launcher = LauncherStub()
    else:
        logger.error("unknown launcher_type %s", launcher_type)
        sys.exit(2)
    launcher.start()

    redis_conn = redis.StrictRedis(host="localhost", port=redis_port, db=0)

    while True:
        update_file_modification_time(process_name)

        with manager_iteration_histogram.labels(process_name).time():
            try:
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception:
                logger.exception("get node labels failed")

            try:
                # wait for tasks from the previous batch to finish
                launcher.wait_tasks_done()

                data_handler = DataHandler()

                if target_status == "queued":
                    jobs = data_handler.GetJobList(
                        "all", "all", num=None,
                        status="queued,scheduling,running")
                    take_job_actions(data_handler, redis_conn, launcher, jobs)
                else:
                    jobs = data_handler.GetJobList("all", "all", num=None,
                                                   status=target_status)
                    logger.info("Updating status for %d %s jobs", len(jobs),
                                target_status)
                    for job in jobs:
                        logger.info("Processing job: %s, status: %s",
                                    job["jobId"], job["jobStatus"])
                        if job["jobStatus"] == "killing":
                            launcher.kill_job(job["jobId"], "killed")
                        elif job["jobStatus"] == "pausing":
                            launcher.kill_job(job["jobId"], "paused")
                        elif job["jobStatus"] in ("running", "scheduling"):
                            UpdateJobStatus(redis_conn, launcher, job,
                                            notifier,
                                            dataHandlerOri=data_handler)
                        elif job["jobStatus"] == "unapproved":
                            ApproveJob(redis_conn, job,
                                       dataHandlerOri=data_handler)
                        else:
                            logger.error("unknown job status %s for job %s",
                                         job["jobStatus"], job["jobId"])
            except Exception:
                logger.exception("Process jobs failed!")
            finally:
                try:
                    data_handler.Close()
                except Exception:
                    pass
        time.sleep(1)
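# Run(redis_port, target_status) takes the status bucket as a parameter, so
# one process can be started per bucket and the buckets polled independently.
# A hypothetical launch sketch; the status list, the helper name, and the use
# of multiprocessing are assumptions about how callers wire this up:
import multiprocessing

def start_job_managers(redis_port):  # hypothetical helper
    statuses = ["queued", "running", "scheduling", "killing", "pausing",
                "unapproved"]
    workers = [
        multiprocessing.Process(target=Run, args=(redis_port, status))
        for status in statuses
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()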