Example #1
import sys
from time import sleep

import sjs
from sample_job import job_that_takes_a_long_time

filepath = sjs.DEFAULT_CONFIG_LOCATION
if len(sys.argv) > 1:
    filepath = sys.argv[1]

if not sjs.load(filepath):
    raise SystemExit()

sjs.run_pre_queue_checks(exit_on_fail=True)

redis_conn = sjs.get_redis_conn()
q = sjs.get_job_queue()

# enqueue sample jobs
jobs = []
jobs.append(q.enqueue(job_that_takes_a_long_time, 10))
jobs.append(q.enqueue(job_that_takes_a_long_time, 60))

# NOTE:
# Just because a job is queued doesn't mean there are any workers to run it. If you are testing,
# you should go ahead and start a worker with `rq worker`.

# WARNING: you would normally be done here!
# But for this test code, we're going to wait around, get the output and print.
# NOT AT ALL RECOMMENDED FOR PRODUCTION CODE

sleep(200)

# print whatever results the workers have produced so far (None for jobs that haven't finished)
for job in jobs:
    print(job.result)
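The `sample_job` module imported above is not shown on this page. A minimal sketch of what `job_that_takes_a_long_time` might look like, assuming it simply sleeps for the requested number of seconds and returns a small result (the body is an assumption, not the project's actual code):

# sample_job.py -- hypothetical sketch of the imported job function
from time import sleep

def job_that_takes_a_long_time(seconds):
    # simulate long-running work by sleeping, then return something printable
    sleep(seconds)
    return "slept for %s seconds" % seconds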
Example #2
def launch_workers(num_workers, burst, run_pre_checks, run_env_checks,
                   interval):
    os.makedirs("logs", exist_ok=True)

    if run_pre_checks:
        print("Running pre-checks...")
        sjs.run_pre_worker_checks(exit_on_fail=True)
        print("OK!")
    else:
        print("Skipping pre-checks!")

    working_dir = get_sjs_running_file()
    if not working_dir:
        raise SystemExit("Currently there is no run started (i.e. there is no %s file). " \
            "Are you in the correct directory?" % SJS_RUNNING_FILE)

    hostname = os.uname()[1]
    timestamp = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")

    # compare env_record at start of run with this one
    env_record_dir = os.path.join(working_dir, 'env_records')
    env_record_path = os.path.join(env_record_dir,
                                   "%s_%s" % (hostname, timestamp))
    env = save_env_record(env_record_path)
    orig_env_record = read_env_record(
        os.path.join(env_record_dir, 'env_record_start.yaml'))
    if run_env_checks:
        print("Running env-checks...")
        if env != orig_env_record:
            print("env_record of this machine does not match env record of original machine! " \
                "Aborting launch workers! Please see %s to compare manually" % (env_record_path))
            raise SystemExit(
                "Env records do not match, aborting launch workers!")
        else:
            print("OK!")
    else:
        print("Skipping env-checks!")

    print("")
    print("Running on hostname %s" % hostname)
    print("Running at timestamp %s" % timestamp)
    print("Log name template: %s_%s_*.log" % (hostname, timestamp))
    print("Env record path: %s" % env_record_path)
    if burst:
        print("Running in burst mode. Workers and launch_workers script will exit when all " \
              "workers are idle and the queue is empty.")
    else:
        print(
            "Workers and launch_workers script will stay alive until killed.")

    print("")
    worker_processes = []
    log_files = []

    sjs.load()
    sjs_config = sjs.get_sjs_config()
    redis_cfg = sjs_config['redis']
    redis_url = "redis://%s:%s/%s" % (redis_cfg['host'], redis_cfg['port'],
                                      redis_cfg['db'])
    cmd = ['rq', 'worker', "-u", redis_url, sjs_config['queue']]

    for i in range(num_workers):
        logname = 'logs/%s_%s_%s.log' % (hostname, timestamp, i)
        print("Launching worker #%s with log file %s" % (i, logname))

        log = open(logname, 'w')
        proc = subprocess.Popen(cmd, stdout=log, stderr=log)

        worker_processes.append(proc)
        log_files.append(log)

    print("")
    print("Worker PIDS: %s" % [w.pid for w in worker_processes])

    try:
        conn = sjs.get_redis_conn()

        if 'min_seconds_per_job' in sjs_config or not burst:
            # more complex case of either handling burst-mode workers or handling the
            # min_seconds_per_job timeout. Here we run a loop and check conditions on each pass.
            while True:
                sleep(interval)

                if burst:
                    # there is no point killing individual workers on the node; we can only release
                    # the node once all of them are idle. So here we poll the current worker state,
                    # and if all the workers are idle AND the queue is empty, we shut the node down.
                    workers = [
                        w for w in Worker.all(connection=conn)
                        if w.name.startswith(hostname)
                    ]
                    idle_workers = [w for w in workers if w.state == 'idle']
                    if len(idle_workers) == len(workers) and len(
                            sjs.get_job_queue()) == 0:
                        print("All workers idle; queue is empty.")
                        disable_signals()
                        raise SystemExit()

                if 'min_seconds_per_job' in sjs_config:
                    try:
                        results = subprocess.check_output(
                            "qstat -i $PBS_JOBID",
                            shell=True,
                            universal_newlines=True)
                        hours, minutes, seconds = results.strip().split(
                            "\n")[-1][-8:].split(":")
                        walltime_remaining = int(hours) * 3600 + int(
                            minutes) * 60 + int(seconds)

                        if sjs_config[
                                'min_seconds_per_job'] > walltime_remaining:
                            print("walltime remaining is less than the min seconds required per " \
                                  "job. Sending SIGINTs to workers so they exit when the " \
                                  "currently running job is complete")
                            for worker in worker_processes:
                                os.kill(worker.pid, signal.SIGINT)
                            break

                    except Exception as e:
                        print("Failure getting walltime", e)

        # the simplest case of just running the workers until they exit
        print("Waiting for workers to exit...")
        for w in worker_processes:
            w.wait()

    except SystemExit:
        # if this process is forced to exit, we kill the workers, and wait for them to
        # exit, before finally closing the log files.
        print("... killing any workers")

        # rq workers must be signaled twice to actually shut down.
        # we sleep in between to avoid a signal getting lost.
        try:
            print("sending first SIGINT")
            os.killpg(os.getpgid(0), signal.SIGINT)
            sleep(1)
            print("sending second SIGINT")
            os.killpg(os.getpgid(0), signal.SIGINT)
        except ProcessLookupError:
            print("process already killed")
        for w in worker_processes:
            w.wait()
    finally:
        for f in log_files:
            f.close()

    print("")
    print("All done!")
    sys.stdout.flush()
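launch_workers builds the `rq worker` command from `sjs_config['redis']` (host, port, db) and `sjs_config['queue']`, and optionally honors `min_seconds_per_job`. A minimal sketch of a matching configuration, assuming sjs reads a YAML file with exactly those keys (the file name and the values here are assumptions):

# make_sjs_config.py -- writes a hypothetical config containing the keys launch_workers reads
import yaml

config = {
    'redis': {'host': 'localhost', 'port': 6379, 'db': 0},
    'queue': 'sjs_demo_queue',            # name of the RQ queue the workers listen on
    # 'min_seconds_per_job': 120,         # optional; enables the walltime check in launch_workers
}

with open('sjs_config.yaml', 'w') as f:   # file name/location is an assumption
    yaml.safe_dump(config, f, default_flow_style=False)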
Example #3
def monitor(auto_finalize, auto_requeue_fails, interval, skip_run_check, next_gen_script):
    global stdscr
    global maxyx

    if not skip_run_check and not run_started():
        print("There is no run to resume. Are you in the right directory?")
        print("Aborting.")
        sys.exit(64)

    first_time_through = True
    run_looks_complete = False
    run_finalized = False
    # `conn` is not defined in this excerpt; assume the same sjs helper used in the other examples
    conn = sjs.get_redis_conn()
    try:
        with curses_fullscreen() as stdscr:
            maxyx = stdscr.getmaxyx()
            while True:
                if first_time_through:
                    first_time_through = False
                else:
                    sleep(interval)

                workers = Worker.all(connection=conn)
                idle_workers = [ w for w in workers if w.state == 'idle' ]
                if len(idle_workers) != len(workers):
                    print_status("Workers are still busy.")
                    continue

                if len(sjs.get_job_queue()) > 0:
                    print_status("Jobs still in queue.")
                    continue

                if len(sjs.jobs_failed()) > 0:
                    if auto_requeue_fails:
                        print_status("Unresolved jobs in failed queue: requeueing.")
                        sjs.requeue_failed_jobs()
                    else:
                        print_status("Unresolved jobs in failed queue.")
                    continue

                message = ""
                if next_gen_script:
                    message += "running user-provided next generation script %s..." % next_gen_script
                    results = subprocess.run(next_gen_script, shell=True, universal_newlines=True,
                                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                    message += "\n" + results.stdout + "\n"

                    if results.returncode == 0:
                        message += "returncode is 0, indicating more jobs were requeued"
                        print_status(message)
                        continue
                    else:
                        message += "returncode == %s, indicating no more jobs were requeued\n" \
                                   "or the script is not executable, or cannot be found.\n" % results.returncode
                        print_status(message)

                run_looks_complete = True
                if auto_finalize:
                    break
                else:
                    message += "Run looks complete but --auto-finalize is off. Waiting for the user to ctrl-c."
                    print_status(message)

    except SystemExit as e:
        print(e.code)

    if run_looks_complete:
        if auto_finalize:
            print("Run looks complete.")
            print("Finalizing the run...")
            end_run()
            print("Finished!")
        else:
            print("The run looks complete but was not auto-finalized.")
            print("You should verify the run is complete, then run 'sjs finalize'")
Example #4
def jobs_queued():
    jobs_queue = sjs.get_job_queue()
    return jobs_queue.get_job_ids()
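A short usage sketch for the helper above, assuming the sjs configuration has already been loaded as in Example #1 and that jobs_queued() lives in the same module as the calling code:

import sjs

if sjs.load():
    print("Queued job IDs:", jobs_queued())   # Queue.get_job_ids() returns a list of job id strings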
Example #5
def monitor(auto_finalize, auto_requeue_fails, interval, skip_run_check,
            next_gen_script):
    global stdscr
    global maxyx

    if not skip_run_check and not run_started():
        print("There is no run to resume. Are you in the right directory?")
        print("Aborting.")
        sys.exit(64)

    first_time_through = True
    run_looks_complete = False
    run_finalized = False
    # `conn` is not defined in this excerpt; assume the same sjs helper used in the other examples
    conn = sjs.get_redis_conn()
    try:
        with curses_fullscreen() as stdscr:
            maxyx = stdscr.getmaxyx()
            while True:
                if first_time_through:
                    first_time_through = False
                else:
                    sleep(interval)

                workers = Worker.all(connection=conn)
                idle_workers = [w for w in workers if w.state == 'idle']
                if len(idle_workers) != len(workers):
                    print_status("Workers are still busy.")
                    continue

                if len(sjs.get_job_queue()) > 0:
                    print_status("Jobs still in queue.")
                    continue

                if len(sjs.jobs_failed()) > 0:
                    if auto_requeue_fails:
                        print_status(
                            "Unresolved jobs in failed queue: requeueing.")
                        sjs.requeue_failed_jobs()
                    else:
                        print_status("Unresolved jobs in failed queue.")
                    continue

                message = ""
                if next_gen_script:
                    message += "running user-provided next generation script %s..." % next_gen_script
                    results = subprocess.run(next_gen_script,
                                             shell=True,
                                             universal_newlines=True,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.STDOUT)
                    message += "\n" + results.stdout + "\n"

                    if results.returncode == 0:
                        message += "returncode is 0, indicating more jobs were requeued"
                        print_status(message)
                        continue
                    else:
                        message += "returncode == %s, indicating no more jobs were requeued\n" \
                                   "or the script is not executable, or cannot be found.\n" % results.returncode
                        print_status(message)

                run_looks_complete = True
                if auto_finalize:
                    break
                else:
                    message += "Run looks complete but --auto-finalize is off. Waiting for the user to ctrl-c."
                    print_status(message)

    except SystemExit as e:
        print(e.code)

    if run_looks_complete:
        if auto_finalize:
            print("Run looks complete.")
            print("Finalizing the run...")
            end_run()
            print("Finished!")
        else:
            print("The run looks complete but was not auto-finalized.")
            print(
                "You should verify the run is complete, then run 'sjs finalize'"
            )