Example #1
0
def get_jobs(partial_job_ids):
    """
    Resolve each partial job ID to a concrete Job.

    IDs that match multiple jobs are disambiguated interactively, one at a
    time; IDs that match exactly one job are confirmed in a single batch
    prompt at the end (since the user never got a chance to inspect them).

    Raises RuntimeError if an ID matches nothing, if the user enters an
    invalid selection, or if they decline the final confirmation.
    """
    jobs = []
    need_confirmation = False
    for partial_job_id in partial_job_ids:
        matches = find_where(Job, id__like=f"%{partial_job_id}%")
        if len(matches) == 0:
            raise RuntimeError(f"No jobs found matching '{partial_job_id}'")
        elif len(matches) > 1:
            print(f"Multiple jobs found matching '{partial_job_id}':")
            for i, job in enumerate(matches, start=1):
                print(f"  {i}: {job.slug}")
            print()
            # Don't use `assert` to validate user input: asserts are stripped
            # when Python runs with -O, which would silently disable these
            # checks on a destructive operation
            try:
                index = int(input("Enter number: "))
            except ValueError:
                raise RuntimeError("Invalid number entered")
            if not 0 < index <= len(matches):
                raise RuntimeError("Number out of range")
            jobs.append(matches[index - 1])
        else:
            # Unambiguous match: the user hasn't seen this job yet, so ask
            # for confirmation of the whole batch before returning
            need_confirmation = True
            jobs.append(matches[0])
    if need_confirmation:
        print("About to kill jobs:")
        for job in jobs:
            print(f"  {job.slug}")
        confirm = input("\nEnter to continue, Ctrl-C to quit ")
        if confirm != "":
            raise RuntimeError("Aborted by user")
    return jobs
Example #2
0
def sync():
    """
    Pull active job requests from the job-server, create/update local jobs
    for them, then push the resulting job states back.
    """
    # Deliberately unpaginated: we assume the set of active job requests is
    # always small enough to fetch in a single request, so the extra
    # complexity of pagination isn't warranted
    response = api_get("job-requests", params={"backend": config.BACKEND})
    job_requests = [
        job_request_from_remote_format(item) for item in response["results"]
    ]

    # Nothing active means nothing to do
    if not job_requests:
        return

    for job_request in job_requests:
        with set_log_context(job_request=job_request):
            create_or_update_jobs(job_request)

    request_ids = [jr.id for jr in job_requests]
    jobs = find_where(Job, job_request_id__in=request_ids)
    payload = [job_to_remote_format(job) for job in jobs]
    log.debug(f"Syncing {len(payload)} jobs back to job-server")

    api_post("jobs", json=payload)
Example #3
0
def main(
    repo_url, actions, commit, branch, workspace, database, force_run_dependencies
):
    """
    Build a JobRequest from the given arguments, submit it, and display the
    jobs that were created for it.
    """
    # Local repo paths (no scheme/netloc) are made absolute
    parsed = urlparse(repo_url)
    if not (parsed.scheme or parsed.netloc):
        local_path = Path(parsed.path).resolve()
        # Normalise path separators in case we're on Windows
        repo_url = str(local_path).replace("\\", "/")
    # Resolve the branch to a concrete SHA when none was supplied
    commit = commit or get_sha_from_remote_ref(repo_url, branch)
    job_request = job_request_from_remote_format(
        dict(
            identifier=random_id(),
            sha=commit,
            workspace=dict(name=workspace, repo=repo_url, branch=branch, db=database),
            requested_actions=actions,
            force_run_dependencies=force_run_dependencies,
            cancelled_actions=[],
        )
    )
    print("Submitting JobRequest:\n")
    display_obj(job_request)
    create_or_update_jobs(job_request)
    jobs = find_where(Job, job_request_id=job_request.id)
    print(f"Created {len(jobs)} new jobs:\n")
    for job in jobs:
        display_obj(job)
Example #4
0
def get_reason_job_not_started(job):
    """
    Return a human-readable reason why `job` cannot start yet, or None when
    there is enough worker capacity for it.
    """
    log.debug("Querying for running jobs")
    running_jobs = find_where(Job, state=State.RUNNING)
    log.debug("Query done")
    required_resources = get_job_resource_weight(job)
    used_resources = sum(map(get_job_resource_weight, running_jobs))
    # Only report a reason when starting this job would exceed capacity
    if used_resources + required_resources > config.MAX_WORKERS:
        if required_resources > 1:
            return "Waiting on available workers for resource intensive job"
        return "Waiting on available workers"
Example #5
0
def calculate_workspace_state(workspace):
    """
    Return a list containing the most recent uncancelled job (if any) for
    each action in the workspace.

    Cancelled jobs are always ignored when considering the historical state
    of the system. Jobs for the dummy '__error__' action are also ignored:
    they exist only to communicate failure states back to the job-server
    (see create_or_update_jobs.create_failed_job()).
    """
    all_jobs = find_where(Job, workspace=workspace, cancelled=False)
    latest_jobs = []
    for action, jobs in group_by(all_jobs, attrgetter("action")):
        if action != "__error__":
            # Most recently created job wins for each action
            newest = max(jobs, key=attrgetter("created_at"))
            latest_jobs.append(newest)
    return latest_jobs
Example #6
0
def get_job(partial_job_id):
    """
    Resolve a partial job ID to a single Job, interacting with the user to
    disambiguate multiple matches or to confirm a single one.

    Raises RuntimeError if nothing matches, if the user enters an invalid
    selection, or if they decline the confirmation.
    """
    matches = find_where(Job, id__like=f"%{partial_job_id}%")
    if len(matches) == 0:
        raise RuntimeError("No matching jobs found")
    elif len(matches) > 1:
        print("Multiple matching jobs found:")
        for i, job in enumerate(matches, start=1):
            print(f"  {i}: {job.slug}")
        print()
        # Don't use `assert` to validate user input: asserts are stripped
        # when Python runs with -O, which would silently disable these checks
        try:
            index = int(input("Enter number: "))
        except ValueError:
            raise RuntimeError("Invalid number entered")
        if not 0 < index <= len(matches):
            raise RuntimeError("Number out of range")
        job = matches[index - 1]
    else:
        job = matches[0]
        print(f"About to reset job:\n  {job.slug}\n")
        confirm = input("Enter to continue, Ctrl-C to quit ")
        if confirm != "":
            raise RuntimeError("Aborted by user")
    return job
Example #7
0
def handle_jobs(api: Optional[ExecutorAPI]):
    """Process each PENDING/RUNNING job once and return the list handled."""
    log.debug("Querying database for active jobs")
    active_jobs = find_where(Job, state__in=[State.PENDING, State.RUNNING])
    log.debug("Done query")
    # Shuffling the job order is a crude but effective way of stopping a
    # single large job request from hogging all the workers. It's optional
    # because a predictable order is preferable when running locally
    if config.RANDOMISE_JOB_ORDER:
        random.shuffle(active_jobs)

    for job in active_jobs:
        # `set_log_context` makes sure every log message emitted further
        # down the stack carries `job`
        with set_log_context(job=job):
            handle_single_job(job, api)

    return active_jobs
Example #8
0
def main():
    """
    Kill all running jobs and reset them to PENDING, ready to be restarted
    after a reboot, cleaning up their Docker containers and volumes.

    Must only be run while the job-runner service is stopped.

    Raises RuntimeError if the user does not confirm.
    """
    print(
        "== DANGER ZONE ==\n"
        "\n"
        "This will kill all running jobs and reset them to the PENDING state, ready\n"
        "to be restarted following a reboot.\n"
        "\n"
        "It should only be run when the job-runner service has been stopped."
        "\n"
    )
    confirm = input("Are you sure you want to continue? (y/N)")
    # Don't use `assert` for this safety check: asserts are stripped when
    # Python runs with -O, which would make this destructive operation run
    # without any confirmation at all
    if confirm.strip().lower() != "y":
        raise RuntimeError("Aborted by user")
    # Reset all running jobs to pending
    update_where(Job, {"state": State.PENDING, "started_at": None}, state=State.RUNNING)
    # Make sure all containers and volumes are removed ready to freshly restart the jobs
    # after the reboot
    for job in find_where(Job, state=State.PENDING):
        docker.kill(container_name(job))
        docker.delete_container(container_name(job))
        docker.delete_volume(volume_name(job))
Example #9
0
def create_and_run_jobs(
    project_dir,
    actions,
    force_run_dependencies,
    continue_on_error,
    temp_dir,
    docker_label,
    clean_up_docker_objects=True,
    log_format=LOCAL_RUN_FORMAT,
    format_output_for_github=False,
):
    """
    Configure the environment for a local run, create jobs for the requested
    actions, run them to completion and pretty-print the results.

    Returns True when everything succeeded (or all actions had already
    completed); False on validation errors, missing Docker images/licenses
    or failed jobs.
    """
    # Fiddle with the configuration to suit what we need for running local jobs
    docker.LABEL = docker_label
    # It's more helpful in this context to have things consistent
    config.RANDOMISE_JOB_ORDER = False
    config.HIGH_PRIVACY_WORKSPACES_DIR = project_dir.parent
    config.DATABASE_FILE = project_dir / "metadata" / "db.sqlite"
    config.TMP_DIR = temp_dir
    config.JOB_LOG_DIR = temp_dir / "logs"
    config.BACKEND = "expectations"
    config.USING_DUMMY_DATA_BACKEND = True
    config.CLEAN_UP_DOCKER_OBJECTS = clean_up_docker_objects

    # We want to fetch any reusable actions code directly from Github so as to
    # avoid pushing unnecessary traffic through the proxy
    config.GIT_PROXY_DOMAIN = "github.com"
    # Rather than using the throwaway `temp_dir` to store git repos in we use a
    # consistent directory within the system tempdir. This means we don't have
    # to keep refetching commits and also avoids the complexity of deleting
    # git's read-only directories on Windows. We use the current username as a
    # crude means of scoping the directory to the user in order to avoid
    # potential permissions issues if multiple users share the same directory.
    config.GIT_REPO_DIR = Path(
        tempfile.gettempdir()).joinpath(f"opensafely_{getuser()}")

    # None of the below should be used when running locally
    config.WORKDIR = None
    config.HIGH_PRIVACY_STORAGE_BASE = None
    config.MEDIUM_PRIVACY_STORAGE_BASE = None
    config.MEDIUM_PRIVACY_WORKSPACES_DIR = None

    configure_logging(
        fmt=log_format,
        # All the other output we produce goes to stdout and it's a bit
        # confusing if the log messages end up on a separate stream
        stream=sys.stdout,
        # Filter out log messages in the local run context
        extra_filter=filter_log_messages,
    )

    # Any jobs that are running or pending must be left over from a previous run that was aborted either by an
    # unexpected and unhandled exception or by the researcher abruptly terminating the process. We can't reasonably
    # recover them (and the researcher may not want to -- maybe that's why they terminated), so we mark them as
    # cancelled. This causes the rest of the system to effectively ignore them.
    #
    # We do this here at the beginning rather than trying to catch these cases when the process exits because the
    # latter couldn't ever completely guarantee to catch every possible termination case correctly.
    database.update_where(
        Job,
        {
            "cancelled": True,
            "state": State.FAILED
        },
        state__in=[State.RUNNING, State.PENDING],
    )

    try:
        job_request, jobs = create_job_request_and_jobs(
            project_dir, actions, force_run_dependencies)
    except NothingToDoError:
        print("=> All actions already completed successfully")
        print("   Use -f option to force everything to re-run")
        return True
    except (ProjectValidationError, ReusableActionError, JobRequestError) as e:
        print(f"=> {type(e).__name__}")
        print(textwrap.indent(str(e), "   "))
        if hasattr(e, "valid_actions"):
            print("\n   Valid action names are:")
            for action in e.valid_actions:
                if action != RUN_ALL_COMMAND:
                    print(f"     {action}")
                else:
                    print(f"     {action} (runs all actions in project)")
        return False

    docker_images = get_docker_images(jobs)

    # Stata images need a license file to function; if one isn't already
    # configured, try to fetch it automatically before running anything
    uses_stata = any(
        i.startswith(f"{config.DOCKER_REGISTRY}/stata-mp:")
        for i in docker_images)
    if uses_stata and config.STATA_LICENSE is None:
        config.STATA_LICENSE = get_stata_license()
        if config.STATA_LICENSE is None:
            print(
                "The docker image 'stata-mp' requires a license to function.\n"
                "\n"
                "If you are a member of OpenSAFELY we should have been able to fetch\n"
                "the license automatically, so something has gone wrong. Please open\n"
                "a new discussion here so we can help:\n"
                "  https://github.com/opensafely/documentation/discussions\n"
                "\n"
                "If you are not a member of OpenSAFELY you will have to provide your\n"
                "own license. See the dicussion here for pointers:\n"
                " https://github.com/opensafely/documentation/discussions/299")
            return False

    # Pull any Docker images the jobs need that aren't already present locally
    for image in docker_images:
        if not docker.image_exists_locally(image):
            print(f"Fetching missing docker image: docker pull {image}")
            try:
                # We want to be chatty when running in the console so users can
                # see progress and quiet in CI so we don't spam the logs with
                # layer download noise
                docker.pull(image, quiet=not sys.stdout.isatty())
            except docker.DockerPullError as e:
                print("Failed with error:")
                print(e)
                return False

    action_names = [job.action for job in jobs]
    print(f"\nRunning actions: {', '.join(action_names)}\n")

    # Wrap all the log output inside an expandable block when running inside
    # Github Actions
    if format_output_for_github:
        print(
            f"::group::Job Runner Logs {ANSI.Grey}(click to view){ANSI.Reset}")

    # Run everything
    exit_condition = (no_jobs_remaining
                      if continue_on_error else job_failed_or_none_remaining)
    try:
        run_main(exit_callback=exit_condition)
    except KeyboardInterrupt:
        pass
    finally:
        if format_output_for_github:
            print("::endgroup::")

    # Gather every job that reached a terminal state so we can report on it
    final_jobs = find_where(Job,
                            state__in=[State.FAILED, State.SUCCEEDED],
                            job_request_id=job_request.id)
    # Always show failed jobs last, otherwise show in order run
    final_jobs.sort(key=lambda job: (
        1 if job.state == State.FAILED else 0,
        job.started_at or 0,
    ))

    # Pretty print details of each action
    print()
    if not final_jobs:
        print("=> No jobs completed")
    for job in final_jobs:
        log_file = f"{METADATA_DIR}/{job.action}.log"
        # If a job fails we don't want to clutter the output with its failed
        # dependants.
        if (job.state == State.FAILED
                and job.status_code == StatusCode.DEPENDENCY_FAILED):
            continue
        if format_output_for_github:
            print(f"{ANSI.Bold}=> {job.action}{ANSI.Reset}")
        else:
            print(f"=> {job.action}")
        print(textwrap.indent(job.status_message, "   "))
        # Where a job failed because expected outputs weren't found we show a
        # list of other outputs which were generated
        if job.unmatched_outputs:
            print(
                "\n   Did you mean to match one of these files instead?\n    - ",
                end="")
            print("\n    - ".join(job.unmatched_outputs))
        print()
        # Output the entire log file inside an expandable block when running
        # inside Github Actions
        if format_output_for_github:
            print(
                f"::group:: log file: {log_file} {ANSI.Grey}(click to view){ANSI.Reset}"
            )
            long_grey_line = ANSI.Grey + ("\u2015" * 80) + ANSI.Reset
            print(long_grey_line)
            print((project_dir / log_file).read_text())
            print(long_grey_line)
            print("::endgroup::")
        else:
            print(f"   log file: {log_file}")
        # Display matched outputs
        print("   outputs:")
        outputs = sorted(job.outputs.items()) if job.outputs else []
        print(
            tabulate(outputs, separator="  - ", indent=5,
                     empty="(no outputs)"))
        # If a job exited with an error code then try to display the end of the
        # log output in case that makes the problem immediately obvious
        if job.status_code == StatusCode.NONZERO_EXIT:
            logs, truncated = get_log_file_snippet(project_dir / log_file,
                                                   max_lines=32)
            if logs:
                print(f"\n   logs{' (truncated)' if truncated else ''}:\n")
                print(textwrap.indent(logs, "     "))
        print()

    success_flag = all(job.state == State.SUCCEEDED for job in final_jobs)
    return success_flag