def main():
    """Entry point of the worker process.

    Parses the command line, starts ``args.jobs`` daemon worker threads and
    then loops forever: it (re)connects to Disque when the connection is down
    and dispatches jobs from the ``control::worker::<name>`` queue to
    ``handle_control_job``.

    On ``KeyboardInterrupt`` / ``SystemExit`` it sets the module-level
    ``shutdown`` flag, destroys the command server pool, nacks every job id
    still held in the working set and cleans up the job directories.
    """
    global shutdown
    global verbose
    global active_event

    args = parse_args()
    verbose = args.verbose - args.quiet

    cmd_server_pool = cmdserver.CmdServerPool(args.jobs)

    signal.signal(signal.SIGTERM, sigterm_handler)

    _dir = "/tmp/dwq.%s" % str(random.random())
    gitjobdir = GitJobDir(_dir, args.jobs)

    servers = ["localhost:7711"]
    try:
        Disque.connect(servers)
        vprint(1, "dwqw: connected.")
    except Exception:
        # Best effort only: the main loop below retries the connection.
        # Deliberately not a bare ``except`` so that Ctrl-C / SystemExit
        # during startup are not swallowed.
        pass

    working_set = SyncSet()

    for n in range(1, args.jobs + 1):
        threading.Thread(
            target=worker,
            args=(n, cmd_server_pool, gitjobdir, args, working_set),
            daemon=True,
        ).start()

    active_event.set()

    try:
        while True:
            if not Disque.connected():
                try:
                    vprint(1, "dwqw: connecting...")
                    Disque.connect(servers)
                    vprint(1, "dwqw: connected.")
                except RedisError:
                    time.sleep(1)
                    continue

            try:
                control_jobs = Job.get(["control::worker::%s" % args.name])
                for job in control_jobs or []:
                    handle_control_job(args, job)
            except RedisError:
                # Connection trouble is handled by the reconnect check at
                # the top of the loop.
                pass

    except (KeyboardInterrupt, SystemExit):
        vprint(1, "dwqw: shutting down")
        shutdown = True
        cmd_server_pool.destroy()
        vprint(1, "dwqw: nack'ing jobs")
        jobs = working_set.empty()
        try:
            # Only talk to Disque when there is something to nack; calling
            # nack_job() without any job id is at best a no-op.
            if jobs:
                Disque.get().nack_job(*jobs)
        except RedisError as e:
            vprint(1, "dwqw: error nack'ing jobs: %s" % e)
        finally:
            # Always remove the job directories, even if nack'ing failed.
            vprint(1, "dwqw: cleaning up job directories")
            gitjobdir.cleanup()
def worker(n, cmd_server_pool, gitjobdir, args, working_set):
    """Worker thread body: fetch jobs from ``args.queues`` and run them.

    Loops until the module-level ``shutdown`` flag is set. For every job it
    validates the body (``repo``, ``commit`` and ``command`` are required),
    obtains a job directory from ``gitjobdir``, writes the job's files, runs
    the command through ``cmd_server_pool`` and then either reports the
    result with ``job.done()`` or re-queues the job with ``job.nack()``.

    NOTE(review): this file appears to define ``worker`` a second time
    further down; if both live in the same module the later definition
    replaces this one at import time -- confirm which one is intended.

    Args:
        n: Worker number; used in log messages, as the default ``exclusive``
            key for the job directory and exported as ``DWQ_WORKER_THREAD``.
        cmd_server_pool: Object whose ``runcmd()`` starts the command and
            returns a handle offering ``wait(timeout=...)`` and a
            ``timeout`` attribute.
        gitjobdir: Object offering ``get(repo, commit, exclusive=...)`` and
            ``release(workdir)``.
        args: Parsed command line; ``args.queues`` and ``args.name`` are read.
        working_set: Set-like object (``add()`` / ``discard()``) holding the
            ids of the jobs currently being processed.
    """
    global active_event
    global shutdown

    print("worker %2i: started" % n)
    buildnum = 0
    while not shutdown:
        try:
            if not shutdown and not Disque.connected():
                time.sleep(1)
                continue
            while not shutdown:
                active_event.wait()
                jobs = Job.get(args.queues)
                for job in jobs or []:
                    if shutdown:
                        job.nack()
                        continue

                    if job.additional_deliveries > 2:
                        error = "too many deliveries (usual reason: timeout)"
                        vprint(2, "worker %2i: %s" % (n, error))
                        job.done({
                            "status": "error",
                            "output": "dwqw: %s\n" % error,
                            "worker": args.name,
                            "runtime": 0,
                            "body": job.body,
                        })
                        continue

                    buildnum += 1
                    working_set.add(job.job_id)
                    before = time.time()
                    vprint(
                        2,
                        "worker %2i: got job %s from queue %s"
                        % (n, job.job_id, job.queue_name),
                    )

                    try:
                        repo = job.body["repo"]
                        commit = job.body["commit"]
                        command = job.body["command"]
                    except KeyError:
                        vprint(2, "worker %2i: invalid job json body" % n)
                        job.done({
                            "status": "error",
                            "output": "worker.py: invalid job description",
                        })
                        # The job is finished; do not keep tracking it.
                        working_set.discard(job.job_id)
                        continue

                    vprint(2, 'worker %2i: command="%s"' % (n, command))

                    exclusive = None
                    options = job.body.get("options") or {}
                    # Only the literal value "exclusive" requests a private
                    # job directory (a random key keeps it from being shared).
                    if options.get("jobdir") == "exclusive":
                        exclusive = str(random.random())

                    unique = random.random()

                    _env = os.environ.copy()

                    try:
                        _env.update(job.body["env"])
                    except KeyError:
                        pass

                    # NOTE(review): a body without "control_queues" raises
                    # TypeError here and ends up in the outer handler --
                    # confirm whether that key is guaranteed by the client.
                    _env.update({
                        "DWQ_REPO": repo,
                        "DWQ_COMMIT": commit,
                        "DWQ_QUEUE": job.queue_name,
                        "DWQ_WORKER": args.name,
                        "DWQ_WORKER_BUILDNUM": str(buildnum),
                        "DWQ_WORKER_THREAD": str(n),
                        "DWQ_JOBID": job.job_id,
                        "DWQ_JOB_UNIQUE": str(unique),
                        "DWQ_CONTROL_QUEUE": job.body.get("control_queues")[0],
                    })

                    workdir = None
                    workdir_error = None
                    try:
                        try:
                            workdir = gitjobdir.get(
                                repo, commit, exclusive=exclusive or str(n))
                        except subprocess.CalledProcessError as e:
                            # e.output may be None or contain non-UTF-8
                            # bytes; never let error reporting itself fail.
                            workdir_error = (
                                "dwqw: error getting jobdir. output: \n"
                                + (e.output or b"").decode(
                                    "utf-8", "backslashreplace"))

                        if not workdir:
                            if job.nacks < options.get("max_retries", 2):
                                job.nack()
                                vprint(
                                    1,
                                    "worker %2i: error getting job dir, requeueing job"
                                    % n,
                                )
                            else:
                                job.done({
                                    "status": "error",
                                    "output": workdir_error
                                    or "dwqw: error getting jobdir\n",
                                    "worker": args.name,
                                    "runtime": 0,
                                    "body": job.body,
                                })
                                vprint(
                                    1,
                                    "worker %2i: cannot get job dir, erroring job"
                                    % n,
                                )
                            working_set.discard(job.job_id)
                            continue

                        util.write_files(options.get("files"), workdir)

                        # assets
                        asset_dir = os.path.join(
                            workdir, "assets",
                            "%s:%s" % (hash(job.job_id), str(unique)))
                        _env.update({"DWQ_ASSETS": asset_dir})

                        timeout = options.get("timeout", 300)
                        if timeout >= 8:
                            # send explicit nack before disque times us out
                            # but only if original timeout is not too small
                            timeout -= 2

                        handle = cmd_server_pool.runcmd(
                            command,
                            cwd=workdir,
                            shell=True,
                            env=_env,
                            start_new_session=True,
                        )
                        output, result = handle.wait(timeout=timeout)
                        if handle.timeout:
                            result = "timeout"
                            output = "dwqw: command timed out\n"

                        if (result not in {0, "0", "pass"}) and \
                                job.nacks < options.get("max_retries", 2):
                            vprint(
                                2,
                                "worker %2i: command:" % n,
                                command,
                                "result:",
                                result,
                                "nacks:",
                                job.nacks,
                                "re-queueing.",
                            )
                            job.nack()
                        else:
                            runtime = time.time() - before

                            # Do not send the (possibly large) job files
                            # back with the result.
                            body_options = job.body.get("options")
                            if body_options:
                                body_options.pop("files", None)

                            # remove options from body if it is now empty
                            if not body_options:
                                job.body.pop("options", None)

                            _result = {
                                "status": result,
                                "output": output,
                                "worker": args.name,
                                "runtime": runtime,
                                "body": job.body,
                                "unique": str(unique),
                            }

                            # pack assets
                            try:
                                asset_files = os.listdir(asset_dir)
                                if asset_files:
                                    _result.update({
                                        "assets": util.gen_file_data(
                                            asset_files, asset_dir)
                                    })
                                    shutil.rmtree(asset_dir, ignore_errors=True)
                            except FileNotFoundError:
                                # The command did not create an asset dir.
                                pass

                            job.done(_result)

                            vprint(
                                2,
                                "worker %2i: command:" % n,
                                command,
                                "result:",
                                result,
                                "runtime: %.1fs" % runtime,
                            )
                        working_set.discard(job.job_id)
                    finally:
                        # Give the job directory back on success and on
                        # error alike; nothing to release if none was
                        # obtained.
                        if workdir:
                            gitjobdir.release(workdir)

        except Exception:
            vprint(1, "worker %2i: uncaught exception" % n)
            traceback.print_exc()
            time.sleep(10)
            vprint(1, "worker %2i: restarting worker" % n)
def worker(n, cmd_server_pool, gitjobdir, args, working_set):
    """Worker thread body: fetch jobs from ``args.queues`` and run them.

    Loops until the module-level ``shutdown`` flag is set. For every job it
    validates the body (``command`` is required; ``repo`` and ``commit``
    must be given together or not at all), obtains a job directory from
    ``gitjobdir`` when a repo is given (otherwise ``/tmp`` is used), writes
    the job's files, runs the command with ``subprocess.run`` and then either
    reports the result with ``job.done()`` or re-queues the job with
    ``job.nack()``.

    Args:
        n: Worker number; used in log messages, as the default ``exclusive``
            key for the job directory and exported as ``DWQ_WORKER_THREAD``.
        cmd_server_pool: Not used by this implementation; kept so the
            signature stays compatible with callers.
        gitjobdir: Object offering ``get(repo, commit, exclusive=...)``
            (returning a ``(workdir, output)`` pair) and ``release(workdir)``.
        args: Parsed command line; ``args.queues`` and ``args.name`` are read.
        working_set: Set-like object (``add()`` / ``discard()``) holding the
            ids of the jobs currently being processed.
    """
    global active_event
    global shutdown

    worker_str = f"dwqw@{args.name}.{n}"
    print(f"{worker_str}: started")
    buildnum = 0
    while not shutdown:
        try:
            if not shutdown and not Disque.connected():
                time.sleep(1)
                continue
            while not shutdown:
                active_event.wait()
                jobs = Job.get(args.queues)
                for job in jobs or []:
                    # Hand back jobs fetched while shutting down or after
                    # the worker has been paused.
                    if shutdown or not active_event.is_set():
                        job.nack()
                        continue

                    if job.additional_deliveries > 2:
                        error = "too many deliveries (usual reason: timeout)"
                        vprint(2, f"{worker_str}: {error}")
                        job.done({
                            "status": "error",
                            "output": f"{worker_str}: {error}\n",
                            "worker": args.name,
                            "runtime": 0,
                            "body": job.body,
                        })
                        continue

                    buildnum += 1
                    working_set.add(job.job_id)
                    before = time.time()
                    vprint(
                        2,
                        f"{worker_str}: got job {job.job_id} from queue {job.queue_name}",
                    )

                    try:
                        command = job.body["command"]
                    except KeyError:
                        vprint(2, f"{worker_str}: invalid job json body")
                        job.done({
                            "status": "error",
                            "output": f'{worker_str} invalid job body: "{job.body}"',
                        })
                        # The job is finished; do not keep tracking it.
                        working_set.discard(job.job_id)
                        continue

                    vprint(2, f'{worker_str}: command="{command}"')

                    repo = None
                    commit = None

                    try:
                        repo = job.body["repo"]
                        commit = job.body["commit"]
                    except KeyError:
                        pass

                    if (repo is None) ^ (commit is None):
                        vprint(
                            2,
                            f"{worker_str}: invalid job json body, only one of repo and commit specified"
                        )
                        job.done({
                            "status": "error",
                            "output": f'{worker_str} invalid job body: "{job.body}"',
                        })
                        working_set.discard(job.job_id)
                        continue

                    exclusive = None
                    options = job.body.get("options") or {}
                    # Only the literal value "exclusive" requests a private
                    # job directory (a random key keeps it from being shared).
                    if options.get("jobdir") == "exclusive":
                        exclusive = str(random.random())

                    unique = random.random()

                    _env = os.environ.copy()

                    try:
                        _env.update(job.body["env"])
                    except KeyError:
                        pass

                    # NOTE(review): a body without "control_queues" raises
                    # TypeError here and ends up in the outer handler --
                    # confirm whether that key is guaranteed by the client.
                    _env.update({
                        "DWQ_QUEUE": job.queue_name,
                        "DWQ_WORKER": args.name,
                        "DWQ_WORKER_BUILDNUM": str(buildnum),
                        "DWQ_WORKER_THREAD": str(n),
                        "DWQ_JOBID": job.job_id,
                        "DWQ_JOB_UNIQUE": str(unique),
                        "DWQ_CONTROL_QUEUE": job.body.get("control_queues")[0],
                    })

                    workdir = None
                    workdir_output = None
                    workdir_error = None
                    try:
                        if repo is not None:
                            _env.update({
                                "DWQ_REPO": repo,
                                "DWQ_COMMIT": commit,
                            })

                            try:
                                (workdir, workdir_output) = gitjobdir.get(
                                    repo, commit, exclusive=exclusive or str(n))
                            except CalledProcessError as e:
                                # e.output is None when nothing was captured.
                                workdir_error = (
                                    f"{worker_str}: error getting jobdir. output:\n"
                                    + (e.output or b"").decode(
                                        "utf-8", "backslashreplace"))

                            if not workdir:
                                if job.nacks < options.get("max_retries", 2):
                                    job.nack()
                                    vprint(
                                        1,
                                        f"{worker_str}: error getting job dir, requeueing job",
                                    )
                                else:
                                    job.done({
                                        "status": "error",
                                        "output": workdir_error
                                        or f"{worker_str}: error getting jobdir\n",
                                        "worker": args.name,
                                        "runtime": 0,
                                        "body": job.body,
                                    })
                                    vprint(
                                        1,
                                        f"{worker_str}: cannot get job dir, erroring job",
                                    )
                                working_set.discard(job.job_id)
                                continue
                        else:
                            # Jobs without a repository run in /tmp.
                            workdir = "/tmp"

                        workdir_done_at = time.time()

                        files = options.get("files")
                        util.write_files(files, workdir)

                        write_files_done_at = time.time()

                        # assets
                        asset_dir = os.path.join(
                            workdir, "assets",
                            "%s:%s" % (hash(job.job_id), str(unique)))
                        _env.update({"DWQ_ASSETS": asset_dir})

                        timeout = options.get("timeout", 300)

                        # subtract time used for checkout and job files
                        timeout -= time.time() - before

                        # be sure to timeout a bit earlier, so transmit/network delays
                        # don't make disque time-out itself.
                        timeout -= 10

                        command_start_at = time.time()

                        if timeout > 0:
                            try:
                                res = run(
                                    command,
                                    stdout=PIPE,
                                    stderr=STDOUT,
                                    cwd=workdir,
                                    shell=True,
                                    env=_env,
                                    start_new_session=True,
                                    timeout=timeout,
                                )

                                result = res.returncode
                                output = res.stdout.decode(
                                    "utf-8", "backslashreplace")

                            except TimeoutExpired as e:
                                result = "timeout"
                                # TimeoutExpired.output stays None when the
                                # command printed nothing before timing out.
                                decoded = (e.output or b"").decode(
                                    "utf-8", "backslashreplace")
                                output = f"{decoded}{worker_str}: error: timed out\n"
                        else:
                            # Setup already used up the whole time budget.
                            result = "timeout"
                            output = f"{worker_str}: command timed out while setting up job\n"

                        command_done_at = time.time()

                        if (result not in {0, "0", "pass"}) and \
                                job.nacks < options.get("max_retries", 2):
                            vprint(
                                2,
                                f"{worker_str}: command:",
                                command,
                                "result:",
                                result,
                                "nacks:",
                                job.nacks,
                                "re-queueing.",
                            )
                            job.nack()
                        else:
                            cmd_runtime = command_done_at - command_start_at
                            workdir_setup_time = workdir_done_at - before
                            write_files_time = write_files_done_at - workdir_done_at

                            # Do not send the (possibly large) job files
                            # back with the result.
                            body_options = job.body.get("options")
                            if body_options:
                                body_options.pop("files", None)

                            # remove options from body if it is now empty
                            if not body_options:
                                job.body.pop("options", None)

                            _result = {
                                "status": result,
                                "output": output,
                                "worker": args.name,
                                "body": job.body,
                                "unique": str(unique),
                                "times": {
                                    "cmd_runtime": cmd_runtime,
                                },
                            }

                            if files:
                                _result["times"]["write_files"] = write_files_time

                            if workdir_output:
                                _result["workdir_output"] = workdir_output.decode(
                                    "utf-8", "backslashreplace")
                                _result["times"]["workdir_setup"] = workdir_setup_time

                            # pack assets
                            try:
                                asset_files = os.listdir(asset_dir)
                                if asset_files:
                                    before_assets = time.time()
                                    _result.update({
                                        "assets": util.gen_file_data(
                                            asset_files, asset_dir)
                                    })
                                    shutil.rmtree(asset_dir, ignore_errors=True)
                                    _result["times"]["assets"] = (
                                        time.time() - before_assets)
                            except FileNotFoundError:
                                # The command did not create an asset dir.
                                pass

                            runtime = time.time() - before
                            _result["runtime"] = runtime

                            job.done(_result)

                            vprint(
                                2,
                                f"{worker_str}: command:",
                                command,
                                "result:",
                                result,
                                "runtime: %.1fs" % runtime,
                            )
                        working_set.discard(job.job_id)
                    finally:
                        # Release only directories obtained from gitjobdir
                        # (never the "/tmp" fallback), on success and on
                        # error alike.
                        if workdir and repo is not None:
                            gitjobdir.release(workdir)

        except Exception:
            print(f"{worker_str}: uncaught exception")
            traceback.print_exc()
            time.sleep(2)
            print(f"{worker_str}: restarting worker")