"""Sample script: load an sjs config, run the pre-queue checks and enqueue two sample jobs."""
import sys
from time import sleep

import sjs
from sample_job import job_that_takes_a_long_time

# Use the config file given as the first command-line argument, if any;
# otherwise fall back to the sjs default location.
filepath = sjs.DEFAULT_CONFIG_LOCATION
if len(sys.argv) > 1:
    filepath = sys.argv[1]

# sjs.load() returning a falsy value is treated as a fatal error.
if not sjs.load(filepath):
    raise SystemExit()

sjs.run_pre_queue_checks(exit_on_fail=True)

redis_conn = sjs.get_redis_conn()
q = sjs.get_job_queue()

# enqueue sample jobs
jobs = []
jobs.append(q.enqueue(job_that_takes_a_long_time, 10))
jobs.append(q.enqueue(job_that_takes_a_long_time, 60))

# NOTE:
# Just because a job is queued, doesn't mean there are any workers to run it. If you are testing,
# you should go ahead and start a worker with `rq worker`.

# WARNING: you would normally be done here!
# But for this test code, we're going to wait around, get the output and print.
# NOT AT ALL RECOMMENDED FOR PRODUCTION CODE
sleep(200)
def launch_workers(num_workers, burst, run_pre_checks, run_env_checks, interval):
    """Launch ``num_workers`` ``rq worker`` subprocesses on this host and supervise them.

    Each worker's stdout/stderr goes to ``logs/<hostname>_<timestamp>_<i>.log``.
    An environment record for this host is saved next to the run's records and,
    if requested, compared against the record taken at the start of the run.

    Args:
        num_workers: number of ``rq worker`` subprocesses to start.
        burst: if truthy, poll every ``interval`` seconds and shut everything
            down once all of this host's workers are idle and the queue is empty.
        run_pre_checks: if truthy, run ``sjs.run_pre_worker_checks`` first.
        run_env_checks: if truthy, abort when this host's env record differs
            from the one recorded at the start of the run.
        interval: seconds to sleep between polls of the worker/queue state.

    Raises:
        SystemExit: if no run is in progress, or the env records do not match
            while ``run_env_checks`` is set.
    """
    os.makedirs("logs", exist_ok=True)

    if run_pre_checks:
        print("Running pre-checks...")
        sjs.run_pre_worker_checks(exit_on_fail=True)
        print("OK!")
    else:
        print("Skipping pre-checks!")

    working_dir = get_sjs_running_file()
    if not working_dir:
        raise SystemExit("Currently there is no run started (i.e. there is no %s file). "
                         "Are you in the correct directory?" % SJS_RUNNING_FILE)

    hostname = os.uname()[1]
    timestamp = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")

    # compare env_record at start of run with this one
    env_record_dir = os.path.join(working_dir, 'env_records')
    env_record_path = os.path.join(env_record_dir, "%s_%s" % (hostname, timestamp))
    env = save_env_record(env_record_path)
    orig_env_record = read_env_record(
        os.path.join(env_record_dir, 'env_record_start.yaml'))

    if run_env_checks:
        print("Running env-checks...")
        if env != orig_env_record:
            print("env_record of this machine does not match env record of original machine! "
                  "Aborting launch workers! Please see %s to compare manually" % (env_record_path))
            raise SystemExit("Env records do not match, aborting launch workers!")
        else:
            print("OK!")
    else:
        print("Skipping env-checks!")

    print("")
    print("Running on hostname %s" % hostname)
    print("Running at timestamp %s" % timestamp)
    print("Log name template: %s_%s_*.log" % (hostname, timestamp))
    print("Env record path: %s" % env_record_path)
    if burst:
        print("Running in burst mode. Workers and launch_workers script will exit when all "
              "workers are idle and the queue is empty.")
    else:
        print("Workers and launch_workers script will stay alive until killed.")
    print("")

    worker_processes = []
    log_files = []

    sjs.load()
    sjs_config = sjs.get_sjs_config()
    redis_cfg = sjs_config['redis']
    redis_url = "redis://%s:%s/%s" % (redis_cfg['host'], redis_cfg['port'], redis_cfg['db'])
    cmd = ['rq', 'worker', "-u", redis_url, sjs_config['queue']]

    try:
        # Launching happens inside the try so that the ``finally`` below closes
        # every log file already opened even if starting a worker fails.
        for i in range(num_workers):
            logname = 'logs/%s_%s_%s.log' % (hostname, timestamp, i)
            print("Launching worker #%s with log file %s" % (i, logname))
            log = open(logname, 'w')
            log_files.append(log)
            proc = subprocess.Popen(cmd, stdout=log, stderr=log)
            worker_processes.append(proc)

        print("")
        print("Worker PIDS: %s" % [w.pid for w in worker_processes])

        conn = sjs.get_redis_conn()

        # The workers are started without rq's own burst flag, so burst shutdown
        # is implemented by the polling loop below; it must therefore run
        # whenever burst is requested (the original tested ``burst == False``).
        if 'min_seconds_per_job' in sjs_config or burst:
            # more complex case of either handling bursted workers, or handling
            # min_seconds_per_job timeout. Here we run a loop and check
            # conditions each run through the loop.
            while True:
                sleep(interval)

                if burst:
                    # there is no point killing workers on the node unless all of them are
                    # idle and we can kill all the workers and release the node. So here we
                    # poll for the current worker state and if all the workers are idle AND
                    # the queue is empty, then we shut the node down.
                    workers = [
                        w for w in Worker.all(connection=conn)
                        if w.name.startswith(hostname)
                    ]
                    idle_workers = [w for w in workers if w.state == 'idle']
                    if len(idle_workers) == len(workers) and len(sjs.get_job_queue()) == 0:
                        print("All workers idle; queue is empty.")
                        # NOTE(review): presumably this stops our own handlers from
                        # reacting to the SIGINTs sent to the whole process group in
                        # the SystemExit handler below -- confirm.
                        disable_signals()
                        raise SystemExit()

                if 'min_seconds_per_job' in sjs_config:
                    try:
                        results = subprocess.check_output(
                            "qstat -i $PBS_JOBID", shell=True, universal_newlines=True)
                        # The last 8 characters of the last output line are parsed
                        # as HH:MM:SS (assumed to be the remaining walltime column).
                        hours, minutes, seconds = results.strip().split(
                            "\n")[-1][-8:].split(":")
                        walltime_remaining = (int(hours) * 3600 + int(minutes) * 60
                                              + int(seconds))
                        if sjs_config['min_seconds_per_job'] > walltime_remaining:
                            print("walltime remaining is less than the min seconds required per "
                                  "job. Sending SIGINTs to workers so they exit when the "
                                  "currently running job is complete")
                            for worker in worker_processes:
                                os.kill(worker.pid, signal.SIGINT)
                            break
                    except Exception as e:
                        # Best effort: a failed qstat call or unparsable output must
                        # not take the launcher down; try again on the next poll.
                        print("Failure getting walltime", e)

        # the simplest case of just running the workers until they exit
        print("Waiting for workers to exit...")
        for w in worker_processes:
            w.wait()

    except SystemExit:
        # if this process is forced to exit, we kill the workers, and wait for them to
        # exit, before finally closing the log files.
        print("... killing any workers")

        # rq workers must be signaled twice to actually shutdown.
        # we sleep in between to avoid a signal getting lost.
        try:
            print("sending first SIGINT")
            os.killpg(os.getpgid(0), signal.SIGINT)
            sleep(1)
            print("sending second SIGINT")
            os.killpg(os.getpgid(0), signal.SIGINT)
        except ProcessLookupError:
            print("process already killed")

        for w in worker_processes:
            w.wait()
    finally:
        for f in log_files:
            f.close()

    print("")
    print("All done!")
    sys.stdout.flush()
def monitor(auto_finalize, auto_requeue_fails, interval, skip_run_check, next_gen_script):
    """Watch the current run in a curses screen until it looks complete.

    Every ``interval`` seconds (immediately on the first pass) the loop checks,
    in order: whether all workers are idle, whether the job queue is empty and
    whether the failed queue is empty. Only when all three hold is the optional
    ``next_gen_script`` run; a zero return code from it is taken to mean more
    jobs were queued and monitoring continues.

    Args:
        auto_finalize: if truthy, leave the loop and call ``end_run()`` as soon
            as the run looks complete; otherwise keep displaying status until
            the process is interrupted.
        auto_requeue_fails: if truthy, requeue jobs found in the failed queue.
        interval: seconds to sleep between polls.
        skip_run_check: if truthy, do not verify that a run has been started.
        next_gen_script: shell command run when the queues are drained, or a
            falsy value for none.

    Exits with status 64 if no run is started and ``skip_run_check`` is falsy.
    """
    global stdscr
    global maxyx

    if not skip_run_check and not run_started():
        print("There is no run to resume. Are you in the right directory?")
        print("Aborting.")
        sys.exit(64)

    # The original body read ``conn`` without binding it anywhere in this
    # function; fetch the connection explicitly, as launch_workers does.
    conn = sjs.get_redis_conn()

    first_time_through = True
    run_looks_complete = False

    try:
        with curses_fullscreen() as stdscr:
            maxyx = stdscr.getmaxyx()
            while True:
                # Poll immediately on the first pass, then once per interval.
                if first_time_through:
                    first_time_through = False
                else:
                    sleep(interval)

                workers = Worker.all(connection=conn)
                idle_workers = [w for w in workers if w.state == 'idle']
                if len(idle_workers) != len(workers):
                    print_status("Workers are still busy.")
                    continue

                if len(sjs.get_job_queue()) > 0:
                    print_status("Jobs still in queue.")
                    continue

                if len(sjs.jobs_failed()) > 0:
                    if auto_requeue_fails:
                        print_status("Unresolved jobs in failed queue: requeueing.")
                        sjs.requeue_failed_jobs()
                    else:
                        print_status("Unresolved jobs in failed queue.")
                    continue

                message = ""
                if next_gen_script:
                    message += "running user-provided next generation script %s..." % next_gen_script
                    results = subprocess.run(next_gen_script,
                                             shell=True,
                                             universal_newlines=True,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.STDOUT)
                    message += "\n" + results.stdout + "\n"
                    if results.returncode == 0:
                        message += "returncode is 0, indicating more jobs were requeued"
                        print_status(message)
                        continue
                    else:
                        message += "returncode == %s, indicating no more jobs were requeued\n" \
                                   "or the script is not executable, or cannot be found.\n" % results.returncode
                        print_status(message)

                run_looks_complete = True
                if auto_finalize:
                    break
                else:
                    message += "Run looks complete but --auto-finalize is off. Waiting for user to cntl-c."
                    print_status(message)
    except SystemExit as e:
        # Leaving the monitor via SystemExit is not an error: report the exit
        # code and fall through to the summary below.
        print(e.code)

    if run_looks_complete:
        if auto_finalize:
            print("Run looks complete.")
            print("Finalizing the run...")
            end_run()
            print("Finished!")
        else:
            print("The run looks complete but was not auto-finalized.")
            print("You should verify the run is complete, then run 'sjs finalize'")
def jobs_queued():
    """Return the job ids reported by the sjs job queue's ``get_job_ids()``."""
    return sjs.get_job_queue().get_job_ids()
def monitor(auto_finalize, auto_requeue_fails, interval, skip_run_check, next_gen_script):
    """Display run status in a curses screen and detect when the run looks complete.

    The loop polls once immediately and then every ``interval`` seconds. A run
    "looks complete" when every worker is idle, the job queue is empty, the
    failed queue is empty and, if ``next_gen_script`` is given, that script
    returned a non-zero code (zero is taken to mean it queued more jobs).

    Args:
        auto_finalize: if truthy, stop monitoring and call ``end_run()`` once
            the run looks complete; otherwise keep showing status until the
            process is interrupted.
        auto_requeue_fails: if truthy, requeue jobs found in the failed queue.
        interval: seconds to sleep between polls.
        skip_run_check: if truthy, skip the check that a run has been started.
        next_gen_script: shell command to run once the queues are drained, or
            a falsy value for none.

    Exits with status 64 if no run is started and ``skip_run_check`` is falsy.
    """
    global stdscr
    global maxyx

    if not skip_run_check and not run_started():
        print("There is no run to resume. Are you in the right directory?")
        print("Aborting.")
        sys.exit(64)

    # ``conn`` was used below without ever being bound in this function;
    # obtain it from sjs here so the worker query cannot hit a NameError.
    conn = sjs.get_redis_conn()

    first_time_through = True
    run_looks_complete = False

    try:
        with curses_fullscreen() as stdscr:
            maxyx = stdscr.getmaxyx()
            while True:
                # No delay before the very first poll.
                if first_time_through:
                    first_time_through = False
                else:
                    sleep(interval)

                workers = Worker.all(connection=conn)
                idle_workers = [w for w in workers if w.state == 'idle']
                if len(idle_workers) != len(workers):
                    print_status("Workers are still busy.")
                    continue

                if len(sjs.get_job_queue()) > 0:
                    print_status("Jobs still in queue.")
                    continue

                if len(sjs.jobs_failed()) > 0:
                    if auto_requeue_fails:
                        print_status(
                            "Unresolved jobs in failed queue: requeueing.")
                        sjs.requeue_failed_jobs()
                    else:
                        print_status("Unresolved jobs in failed queue.")
                    continue

                message = ""
                if next_gen_script:
                    message += "running user-provided next generation script %s..." % next_gen_script
                    results = subprocess.run(next_gen_script,
                                             shell=True,
                                             universal_newlines=True,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.STDOUT)
                    message += "\n" + results.stdout + "\n"
                    if results.returncode == 0:
                        message += "returncode is 0, indicating more jobs were requeued"
                        print_status(message)
                        continue
                    else:
                        message += "returncode == %s, indicating no more jobs were requeued\n" \
                                   "or the script is not executable, or cannot be found.\n" % results.returncode
                        print_status(message)

                run_looks_complete = True
                if auto_finalize:
                    break
                else:
                    message += "Run looks complete but --auto-finalize is off. Waiting for user to cntl-c."
                    print_status(message)
    except SystemExit as e:
        # A SystemExit raised while monitoring is reported, not propagated, so
        # the completion summary below is still printed.
        print(e.code)

    if run_looks_complete:
        if auto_finalize:
            print("Run looks complete.")
            print("Finalizing the run...")
            end_run()
            print("Finished!")
        else:
            print("The run looks complete but was not auto-finalized.")
            print(
                "You should verify the run is complete, then run 'sjs finalize'"
            )