Exemple #1
0
def main():
    """Start the worker daemon and process control jobs until shutdown.

    Parses the command line, starts ``args.jobs`` worker threads and then
    loops forever, (re)connecting to the disque server when necessary and
    dispatching jobs from this worker's control queue to
    ``handle_control_job``.

    On ``KeyboardInterrupt`` or ``SystemExit`` the global ``shutdown`` flag
    is set, the command server pool is destroyed, every job id still held in
    the working set is nack'ed, and the job directories are cleaned up.
    """
    global shutdown
    global verbose
    global active_event

    args = parse_args()
    verbose = args.verbose - args.quiet

    cmd_server_pool = cmdserver.CmdServerPool(args.jobs)

    signal.signal(signal.SIGTERM, sigterm_handler)

    _dir = "/tmp/dwq.%s" % str(random.random())
    gitjobdir = GitJobDir(_dir, args.jobs)

    servers = ["localhost:7711"]
    working_set = SyncSet()

    try:
        # The initial connection attempt is best effort: the main loop below
        # retries until it succeeds.  Only ordinary errors are ignored here,
        # so an interrupt or exit request during start-up is no longer
        # swallowed but takes the regular shutdown path instead.
        try:
            Disque.connect(servers)
            vprint(1, "dwqw: connected.")
        except Exception:
            vprint(1, "dwqw: initial connection failed, will retry.")

        for n in range(1, args.jobs + 1):
            threading.Thread(
                target=worker,
                args=(n, cmd_server_pool, gitjobdir, args, working_set),
                daemon=True,
            ).start()

        # Let the worker threads start fetching jobs.
        active_event.set()

        while True:
            if not Disque.connected():
                try:
                    vprint(1, "dwqw: connecting...")
                    Disque.connect(servers)
                    vprint(1, "dwqw: connected.")
                except RedisError:
                    time.sleep(1)
                    continue

            try:
                control_jobs = Job.get(["control::worker::%s" % args.name])
                for job in control_jobs or []:
                    handle_control_job(args, job)
            except RedisError:
                # NOTE(review): presumably a lost connection is picked up by
                # the Disque.connected() check on the next iteration.
                pass

    except (KeyboardInterrupt, SystemExit):
        vprint(1, "dwqw: shutting down")
        shutdown = True
        try:
            cmd_server_pool.destroy()
            vprint(1, "dwqw: nack'ing jobs")
            jobs = working_set.empty()
            # Nothing to nack when no job was in flight; calling nack_job()
            # without ids would only produce a server-side error.
            if jobs:
                Disque.get().nack_job(*jobs)
        finally:
            # Remove the job directories even if nack'ing failed.
            vprint(1, "dwqw: cleaning up job directories")
            gitjobdir.cleanup()
Exemple #2
0
def worker(n, cmd_server_pool, gitjobdir, args, working_set):
    """Run the job-processing loop of worker thread ``n``.

    Fetches jobs from ``args.queues``, checks out the requested repository
    state through ``gitjobdir``, runs the job's command through
    ``cmd_server_pool`` and reports the outcome with ``job.done()``, or
    re-queues the job with ``job.nack()`` while retries are left.

    Args:
        n: Number of this worker thread; used in log messages, exported as
            ``DWQ_WORKER_THREAD`` and used as the job directory key for
            non-exclusive jobs.
        cmd_server_pool: Pool whose ``runcmd()`` executes the job command.
        gitjobdir: Provider of work directories (``get()`` / ``release()``).
        args: Parsed command line arguments; ``name`` and ``queues`` are read.
        working_set: Set of job ids currently being processed by any worker.

    The loop ends once the global ``shutdown`` flag is set.  Any uncaught
    exception is logged and the loop restarts after a 10 second pause; in
    that case the job id intentionally stays in ``working_set``, because the
    job has been neither completed nor nack'ed.
    """
    global active_event
    global shutdown
    print("worker %2i: started" % n)
    buildnum = 0
    while not shutdown:
        try:
            if not shutdown and not Disque.connected():
                time.sleep(1)
                continue
            while not shutdown:
                # Block while the worker is paused.
                active_event.wait()
                jobs = Job.get(args.queues)
                for job in jobs:
                    if shutdown:
                        job.nack()
                        continue

                    if job.additional_deliveries > 2:
                        error = "too many deliveries (usual reason: timeout)"
                        vprint(2, "worker %2i: %s" % (n, error))
                        job.done({
                            "status": "error",
                            "output": "dwqw: %s\n" % error,
                            "worker": args.name,
                            "runtime": 0,
                            "body": job.body,
                        })
                        continue

                    buildnum += 1
                    working_set.add(job.job_id)
                    before = time.time()
                    vprint(
                        2,
                        "worker %2i: got job %s from queue %s" %
                        (n, job.job_id, job.queue_name),
                    )

                    try:
                        repo = job.body["repo"]
                        commit = job.body["commit"]
                        command = job.body["command"]
                    except KeyError:
                        vprint(2, "worker %2i: invalid job json body" % n)
                        job.done({
                            "status":
                            "error",
                            "output":
                            "worker.py: invalid job description",
                        })
                        # The job is finished, so it must not stay
                        # registered as in flight.
                        working_set.discard(job.job_id)
                        continue

                    vprint(2, 'worker %2i: command="%s"' % (n, command))

                    options = job.body.get("options") or {}
                    max_retries = options.get("max_retries", 2)

                    # Parenthesised on purpose: without the parentheses the
                    # comparison binds tighter than "or", and any truthy
                    # "jobdir" value would select an exclusive directory.
                    exclusive = None
                    if (options.get("jobdir") or "") == "exclusive":
                        exclusive = str(random.random())

                    unique = random.random()

                    _env = os.environ.copy()
                    _env.update(job.body.get("env") or {})

                    _env.update({
                        "DWQ_REPO":
                        repo,
                        "DWQ_COMMIT":
                        commit,
                        "DWQ_QUEUE":
                        job.queue_name,
                        "DWQ_WORKER":
                        args.name,
                        "DWQ_WORKER_BUILDNUM":
                        str(buildnum),
                        "DWQ_WORKER_THREAD":
                        str(n),
                        "DWQ_JOBID":
                        job.job_id,
                        "DWQ_JOB_UNIQUE":
                        str(unique),
                        # NOTE(review): assumes every job body carries a
                        # non-empty "control_queues" list -- confirm.
                        "DWQ_CONTROL_QUEUE":
                        job.body.get("control_queues")[0],
                    })

                    workdir = None
                    workdir_error = None
                    try:
                        try:
                            workdir = gitjobdir.get(repo,
                                                    commit,
                                                    exclusive=exclusive
                                                    or str(n))
                        except subprocess.CalledProcessError as e:
                            # The captured output may be missing or not
                            # valid UTF-8; never fail while reporting it.
                            workdir_error = (
                                "dwqw: error getting jobdir. output: \n" +
                                (e.output or b"").decode(
                                    "utf-8", "backslashreplace"))

                        if not workdir:
                            if job.nacks < max_retries:
                                job.nack()
                                vprint(
                                    1,
                                    "worker %2i: error getting job dir, requeueing job"
                                    % n,
                                )
                            else:
                                job.done({
                                    "status": "error",
                                    "output": workdir_error
                                    or "dwqw: error getting jobdir\n",
                                    "worker": args.name,
                                    "runtime": 0,
                                    "body": job.body,
                                })
                                vprint(
                                    1,
                                    "worker %2i: cannot get job dir, erroring job"
                                    % n,
                                )
                            working_set.discard(job.job_id)
                            continue

                        util.write_files(options.get("files"), workdir)

                        # assets
                        asset_dir = os.path.join(
                            workdir, "assets",
                            "%s:%s" % (hash(job.job_id), str(unique)))
                        _env.update({"DWQ_ASSETS": asset_dir})

                        timeout = options.get("timeout", 300)

                        if timeout >= 8:
                            # send explicit nack before disque times us out
                            # but only if original timeout is not too small
                            timeout -= 2

                        handle = cmd_server_pool.runcmd(
                            command,
                            cwd=workdir,
                            shell=True,
                            env=_env,
                            start_new_session=True,
                        )
                        output, result = handle.wait(timeout=timeout)
                        if handle.timeout:
                            result = "timeout"
                            output = "dwqw: command timed out\n"

                        if (result not in {0, "0", "pass"
                                           }) and job.nacks < max_retries:
                            vprint(
                                2,
                                "worker %2i: command:" % n,
                                command,
                                "result:",
                                result,
                                "nacks:",
                                job.nacks,
                                "re-queueing.",
                            )
                            job.nack()
                        else:
                            runtime = time.time() - before

                            # The job files are not echoed back in the result.
                            body_options = job.body.get("options")
                            if body_options:
                                body_options.pop("files", None)

                            # remove options from body if it is now empty
                            if not body_options:
                                job.body.pop("options", None)

                            _result = {
                                "status": result,
                                "output": output,
                                "worker": args.name,
                                "runtime": runtime,
                                "body": job.body,
                                "unique": str(unique),
                            }

                            # pack assets
                            try:
                                asset_files = os.listdir(asset_dir)
                                if asset_files:
                                    _result.update({
                                        "assets":
                                        util.gen_file_data(
                                            asset_files, asset_dir)
                                    })
                                    shutil.rmtree(asset_dir,
                                                  ignore_errors=True)
                            except FileNotFoundError:
                                pass

                            job.done(_result)

                            vprint(
                                2,
                                "worker %2i: command:" % n,
                                command,
                                "result:",
                                result,
                                "runtime: %.1fs" % runtime,
                            )

                        # Re-queued or completed: either way this worker no
                        # longer holds the job.
                        working_set.discard(job.job_id)
                    finally:
                        # Hand the work directory back on success and on
                        # error alike.
                        if workdir:
                            gitjobdir.release(workdir)

        except Exception:
            vprint(1, "worker %2i: uncaught exception" % n)
            traceback.print_exc()
            time.sleep(10)
            vprint(1, "worker %2i: restarting worker" % n)
Exemple #3
0
def worker(n, cmd_server_pool, gitjobdir, args, working_set):
    """Run the job-processing loop of worker thread ``n``.

    Fetches jobs from ``args.queues``, optionally checks out the requested
    repository state through ``gitjobdir`` (jobs without ``repo``/``commit``
    run in ``/tmp``), executes the job's command with ``subprocess.run`` and
    reports the outcome with ``job.done()``, or re-queues the job with
    ``job.nack()`` while retries are left.

    Args:
        n: Number of this worker thread; used in log messages, exported as
            ``DWQ_WORKER_THREAD`` and used as the job directory key for
            non-exclusive jobs.
        cmd_server_pool: Not used by this implementation; kept in the
            signature for callers.
        gitjobdir: Provider of work directories (``get()`` / ``release()``).
        args: Parsed command line arguments; ``name`` and ``queues`` are read.
        working_set: Set of job ids currently being processed by any worker.

    The loop ends once the global ``shutdown`` flag is set.  Any uncaught
    exception is logged and the loop restarts after a 2 second pause; in
    that case the job id intentionally stays in ``working_set``, because the
    job has been neither completed nor nack'ed.
    """
    global active_event
    global shutdown

    worker_str = f"dwqw@{args.name}.{n}"
    print(f"{worker_str}: started")
    buildnum = 0
    while not shutdown:
        try:
            if not shutdown and not Disque.connected():
                time.sleep(1)
                continue
            while not shutdown:
                # Block while the worker is paused.
                active_event.wait()
                jobs = Job.get(args.queues)
                for job in jobs:
                    # Hand back jobs fetched right before a shutdown/pause.
                    if shutdown or not active_event.is_set():
                        job.nack()
                        continue

                    if job.additional_deliveries > 2:
                        error = "too many deliveries (usual reason: timeout)"
                        vprint(2, f"{worker_str}: {error}")
                        job.done({
                            "status": "error",
                            "output": f"{worker_str}: {error}\n",
                            "worker": args.name,
                            "runtime": 0,
                            "body": job.body,
                        })
                        continue

                    buildnum += 1
                    working_set.add(job.job_id)
                    before = time.time()
                    vprint(
                        2,
                        f"{worker_str}: got job {job.job_id} from queue {job.queue_name}",
                    )

                    try:
                        command = job.body["command"]
                    except KeyError:
                        vprint(2, f"{worker_str}: invalid job json body")
                        job.done({
                            "status":
                            "error",
                            "output":
                            f'{worker_str} invalid job body: "{job.body}"',
                        })
                        # The job is finished, so it must not stay
                        # registered as in flight.
                        working_set.discard(job.job_id)
                        continue

                    vprint(2, f'{worker_str}: command="{command}"')

                    repo = None
                    commit = None

                    try:
                        repo = job.body["repo"]
                        commit = job.body["commit"]
                    except KeyError:
                        pass

                    # repo and commit are only meaningful as a pair.
                    if (repo is None) ^ (commit is None):
                        vprint(
                            2,
                            f"{worker_str}: invalid job json body, only one of repo and commit specified"
                        )
                        job.done({
                            "status":
                            "error",
                            "output":
                            f'{worker_str} invalid job body: "{job.body}"',
                        })
                        working_set.discard(job.job_id)
                        continue

                    options = job.body.get("options") or {}
                    max_retries = options.get("max_retries", 2)

                    # Parenthesised on purpose: without the parentheses the
                    # comparison binds tighter than "or", and any truthy
                    # "jobdir" value would select an exclusive directory.
                    exclusive = None
                    if (options.get("jobdir") or "") == "exclusive":
                        exclusive = str(random.random())

                    unique = random.random()

                    _env = os.environ.copy()
                    _env.update(job.body.get("env") or {})

                    _env.update({
                        "DWQ_QUEUE":
                        job.queue_name,
                        "DWQ_WORKER":
                        args.name,
                        "DWQ_WORKER_BUILDNUM":
                        str(buildnum),
                        "DWQ_WORKER_THREAD":
                        str(n),
                        "DWQ_JOBID":
                        job.job_id,
                        "DWQ_JOB_UNIQUE":
                        str(unique),
                        # NOTE(review): assumes every job body carries a
                        # non-empty "control_queues" list -- confirm.
                        "DWQ_CONTROL_QUEUE":
                        job.body.get("control_queues")[0],
                    })

                    workdir = None
                    workdir_output = None
                    workdir_error = None
                    try:
                        if repo is not None:
                            _env.update({
                                "DWQ_REPO": repo,
                                "DWQ_COMMIT": commit,
                            })

                            try:
                                (workdir, workdir_output) = gitjobdir.get(
                                    repo,
                                    commit,
                                    exclusive=exclusive or str(n))
                            except CalledProcessError as e:
                                # The captured output may be missing; never
                                # fail while reporting the error.
                                workdir_error = (
                                    f"{worker_str}: error getting jobdir. output:\n"
                                    + (e.output or b"").decode(
                                        "utf-8", "backslashreplace"))

                            if not workdir:
                                if job.nacks < max_retries:
                                    job.nack()
                                    vprint(
                                        1,
                                        f"{worker_str}: error getting job dir, requeueing job",
                                    )
                                else:
                                    job.done({
                                        "status": "error",
                                        "output": workdir_error or
                                        f"{worker_str}: error getting jobdir\n",
                                        "worker": args.name,
                                        "runtime": 0,
                                        "body": job.body,
                                    })
                                    vprint(
                                        1,
                                        f"{worker_str}: cannot get job dir, erroring job",
                                    )
                                working_set.discard(job.job_id)
                                continue
                        else:
                            workdir = "/tmp"

                        workdir_done_at = time.time()
                        files = options.get("files")
                        util.write_files(files, workdir)
                        write_files_done_at = time.time()

                        # assets
                        asset_dir = os.path.join(
                            workdir, "assets",
                            "%s:%s" % (hash(job.job_id), str(unique)))
                        _env.update({"DWQ_ASSETS": asset_dir})

                        timeout = options.get("timeout", 300)

                        # subtract time used for checkout and job files
                        timeout -= time.time() - before

                        # be sure to timeout a bit earlier, so transmit/network delays
                        # don't make disque time-out itself.
                        timeout -= 10

                        command_start_at = time.time()

                        if timeout > 0:
                            try:
                                res = run(
                                    command,
                                    stdout=PIPE,
                                    stderr=STDOUT,
                                    cwd=workdir,
                                    shell=True,
                                    env=_env,
                                    start_new_session=True,
                                    timeout=timeout,
                                )

                                result = res.returncode
                                output = res.stdout.decode(
                                    "utf-8", "backslashreplace")

                            except TimeoutExpired as e:
                                result = "timeout"
                                # e.output is None when the command printed
                                # nothing before the timeout expired.
                                decoded = (e.output or b"").decode(
                                    "utf-8", "backslashreplace")
                                output = f"{decoded}{worker_str}: error: timed out\n"

                        else:
                            # The setup phase already used up the time budget.
                            result = "timeout"
                            output = f"{worker_str}: command timed out while setting up job\n"

                        command_done_at = time.time()

                        if (result not in {0, "0", "pass"
                                           }) and job.nacks < max_retries:
                            vprint(
                                2,
                                f"{worker_str}: command:",
                                command,
                                "result:",
                                result,
                                "nacks:",
                                job.nacks,
                                "re-queueing.",
                            )
                            job.nack()
                        else:
                            cmd_runtime = command_done_at - command_start_at
                            workdir_setup_time = workdir_done_at - before
                            write_files_time = write_files_done_at - workdir_done_at

                            # The job files are not echoed back in the result.
                            body_options = job.body.get("options")
                            if body_options:
                                body_options.pop("files", None)

                            # remove options from body if it is now empty
                            if not body_options:
                                job.body.pop("options", None)

                            _result = {
                                "status": result,
                                "output": output,
                                "worker": args.name,
                                "body": job.body,
                                "unique": str(unique),
                                "times": {
                                    "cmd_runtime": cmd_runtime,
                                },
                            }

                            if files:
                                _result["times"][
                                    "write_files"] = write_files_time

                            if workdir_output:
                                _result[
                                    "workdir_output"] = workdir_output.decode(
                                        "utf-8", "backslashreplace")
                                _result["times"][
                                    "workdir_setup"] = workdir_setup_time

                            # pack assets
                            try:
                                asset_files = os.listdir(asset_dir)
                                if asset_files:
                                    before_assets = time.time()
                                    _result.update({
                                        "assets":
                                        util.gen_file_data(
                                            asset_files, asset_dir)
                                    })
                                    shutil.rmtree(asset_dir,
                                                  ignore_errors=True)
                                    _result["times"]["assets"] = (
                                        time.time() - before_assets)

                            except FileNotFoundError:
                                pass

                            runtime = time.time() - before
                            _result["runtime"] = runtime

                            job.done(_result)

                            vprint(
                                2,
                                f"{worker_str}: command:",
                                command,
                                "result:",
                                result,
                                "runtime: %.1fs" % runtime,
                            )

                        # Re-queued or completed: either way this worker no
                        # longer holds the job.
                        working_set.discard(job.job_id)
                    finally:
                        # Only directories obtained from gitjobdir are
                        # released; the "/tmp" fallback is not managed by it.
                        if repo and workdir:
                            gitjobdir.release(workdir)

        except Exception:
            print(f"{worker_str}: uncaught exception")
            traceback.print_exc()
            time.sleep(2)
            print(f"{worker_str}: restarting worker")