def get_jobs(partial_job_ids):
    jobs = []
    need_confirmation = False
    for partial_job_id in partial_job_ids:
        matches = find_where(Job, id__like=f"%{partial_job_id}%")
        if len(matches) == 0:
            raise RuntimeError(f"No jobs found matching '{partial_job_id}'")
        elif len(matches) > 1:
            print(f"Multiple jobs found matching '{partial_job_id}':")
            for i, job in enumerate(matches, start=1):
                print(f"  {i}: {job.slug}")
            print()
            index = int(input("Enter number: "))
            assert 0 < index <= len(matches)
            jobs.append(matches[index - 1])
        else:
            need_confirmation = True
            jobs.append(matches[0])
    if need_confirmation:
        print("About to kill jobs:")
        for job in jobs:
            print(f"  {job.slug}")
        confirm = input("\nEnter to continue, Ctrl-C to quit ")
        assert confirm == ""
    return jobs
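# NOTE: `find_where` is a query helper from the surrounding codebase and is not
# shown in this excerpt. The sketch below is only an illustration of how its
# Django-style lookup suffixes (`id__like`, `state__in`, plain equality) are
# assumed to translate into SQL over a sqlite table; the function name, table
# argument and column layout are assumptions, not the real implementation.
def _find_where_sketch(conn, table, **filters):
    clauses, params = [], []
    for key, value in filters.items():
        field, _, op = key.partition("__")
        if op == "like":
            clauses.append(f"{field} LIKE ?")
            params.append(value)
        elif op == "in":
            placeholders = ", ".join("?" * len(value))
            clauses.append(f"{field} IN ({placeholders})")
            params.extend(value)
        else:
            # No suffix: simple equality match
            clauses.append(f"{field} = ?")
            params.append(value)
    where = " AND ".join(clauses) or "1=1"
    return conn.execute(f"SELECT * FROM {table} WHERE {where}", params).fetchall()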
def sync():
    response = api_get(
        "job-requests",
        # We're deliberately not paginating here on the assumption that the
        # set of active jobs is always going to be small enough that we can
        # fetch them in a single request and we don't need the extra
        # complexity
        params={"backend": config.BACKEND},
    )
    job_requests = [job_request_from_remote_format(i) for i in response["results"]]
    # Bail early if there's nothing to do
    if not job_requests:
        return
    job_request_ids = [i.id for i in job_requests]
    for job_request in job_requests:
        with set_log_context(job_request=job_request):
            create_or_update_jobs(job_request)
    jobs = find_where(Job, job_request_id__in=job_request_ids)
    jobs_data = [job_to_remote_format(i) for i in jobs]
    log.debug(f"Syncing {len(jobs_data)} jobs back to job-server")
    api_post("jobs", json=jobs_data)
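# NOTE: `api_get` and `api_post` are thin wrappers around the job-server HTTP
# API elsewhere in the codebase. The sketch below shows the assumed shape of
# those wrappers only; the base URL, endpoint paths and auth header here are
# illustrative assumptions, not the real configuration.
import requests


def _api_get_sketch(path, params=None, base_url="https://jobs.example.org/api", token=""):
    response = requests.get(
        f"{base_url}/{path}/",
        params=params,
        headers={"Authorization": token},
    )
    response.raise_for_status()
    return response.json()


def _api_post_sketch(path, json=None, base_url="https://jobs.example.org/api", token=""):
    response = requests.post(
        f"{base_url}/{path}/",
        json=json,
        headers={"Authorization": token},
    )
    response.raise_for_status()
    return response.json()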
def main(
    repo_url, actions, commit, branch, workspace, database, force_run_dependencies
):
    # Make paths to local repos absolute
    parsed = urlparse(repo_url)
    if not parsed.scheme and not parsed.netloc:
        path = Path(parsed.path).resolve()
        # In case we're on Windows
        repo_url = str(path).replace("\\", "/")
    if not commit:
        commit = get_sha_from_remote_ref(repo_url, branch)
    job_request = job_request_from_remote_format(
        dict(
            identifier=random_id(),
            sha=commit,
            workspace=dict(name=workspace, repo=repo_url, branch=branch, db=database),
            requested_actions=actions,
            force_run_dependencies=force_run_dependencies,
            cancelled_actions=[],
        )
    )
    print("Submitting JobRequest:\n")
    display_obj(job_request)
    create_or_update_jobs(job_request)
    jobs = find_where(Job, job_request_id=job_request.id)
    print(f"Created {len(jobs)} new jobs:\n")
    for job in jobs:
        display_obj(job)
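# NOTE: `get_sha_from_remote_ref` is defined elsewhere in the codebase. It is
# assumed to resolve a branch name to a commit SHA via `git ls-remote`, roughly
# as in this sketch (error handling and ref disambiguation omitted; the
# function name is illustrative).
import subprocess


def _get_sha_from_remote_ref_sketch(repo_url, ref):
    output = subprocess.check_output(
        ["git", "ls-remote", "--exit-code", repo_url, ref], text=True
    )
    # Each output line looks like "<sha>\t<refname>"; take the first match
    return output.splitlines()[0].split("\t")[0]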
def get_reason_job_not_started(job):
    log.debug("Querying for running jobs")
    running_jobs = find_where(Job, state=State.RUNNING)
    log.debug("Query done")
    used_resources = sum(
        get_job_resource_weight(running_job) for running_job in running_jobs
    )
    required_resources = get_job_resource_weight(job)
    if used_resources + required_resources > config.MAX_WORKERS:
        if required_resources > 1:
            return "Waiting on available workers for resource intensive job"
        else:
            return "Waiting on available workers"
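# NOTE: `get_job_resource_weight` is defined elsewhere. It is assumed to return
# a numeric weight per job (defaulting to 1) so that resource-intensive jobs
# consume more than one worker slot in the capacity check above. The sketch
# below, including the shape of the weights mapping, is purely an illustrative
# assumption.
def _get_job_resource_weight_sketch(job, weights=None):
    # e.g. weights = {("my-workspace", "heavy_action"): 4}
    weights = weights or {}
    return weights.get((job.workspace, job.action), 1)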
def calculate_workspace_state(workspace):
    """
    Return a list containing the most recent uncancelled job (if any) for each
    action in the workspace.

    We always ignore cancelled jobs when considering the historical state of
    the system. We also ignore jobs whose action is '__error__'; these are
    dummy jobs created only to help us communicate failure states back to the
    job-server (see create_or_update_jobs.create_failed_job()).
    """
    all_jobs = find_where(Job, workspace=workspace, cancelled=False)
    latest_jobs = []
    for action, jobs in group_by(all_jobs, attrgetter("action")):
        if action == "__error__":
            continue
        ordered_jobs = sorted(jobs, key=attrgetter("created_at"), reverse=True)
        latest_jobs.append(ordered_jobs[0])
    return latest_jobs
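# NOTE: `group_by` is a helper from the surrounding codebase. It is assumed to
# behave like `itertools.groupby` applied to the input sorted by the same key,
# i.e. it yields one (key, items) pair per distinct key. A minimal sketch of
# that assumption:
from itertools import groupby


def _group_by_sketch(iterable, key):
    return [
        (k, list(items))
        for k, items in groupby(sorted(iterable, key=key), key=key)
    ]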
def get_job(partial_job_id):
    matches = find_where(Job, id__like=f"%{partial_job_id}%")
    if len(matches) == 0:
        raise RuntimeError("No matching jobs found")
    elif len(matches) > 1:
        print("Multiple matching jobs found:")
        for i, job in enumerate(matches, start=1):
            print(f"  {i}: {job.slug}")
        print()
        index = int(input("Enter number: "))
        assert 0 < index <= len(matches)
        job = matches[index - 1]
    else:
        job = matches[0]
    print(f"About to reset job:\n  {job.slug}\n")
    confirm = input("Enter to continue, Ctrl-C to quit ")
    assert confirm == ""
    return job
def handle_jobs(api: Optional[ExecutorAPI]):
    log.debug("Querying database for active jobs")
    active_jobs = find_where(Job, state__in=[State.PENDING, State.RUNNING])
    log.debug("Done query")
    # Randomising the job order is a crude but effective way to ensure that a
    # single large job request doesn't hog all the workers. We make this
    # optional as, when running locally, having jobs run in a predictable
    # order is preferable
    if config.RANDOMISE_JOB_ORDER:
        random.shuffle(active_jobs)
    for job in active_jobs:
        # `set_log_context` ensures that all log messages triggered anywhere
        # further down the stack will have `job` set on them
        with set_log_context(job=job):
            handle_single_job(job, api)
    return active_jobs
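# NOTE: `set_log_context` is a helper from the surrounding codebase. It is
# assumed to work roughly like the sketch below: a context manager that stashes
# extra fields in a contextvar, which a logging filter then copies onto every
# record emitted while the context is active. All names here are illustrative
# assumptions rather than the real implementation.
import contextlib
import contextvars
import logging

_log_context = contextvars.ContextVar("log_context", default={})


@contextlib.contextmanager
def _set_log_context_sketch(**kwargs):
    # Merge the new fields over any existing context and restore on exit
    token = _log_context.set({**_log_context.get(), **kwargs})
    try:
        yield
    finally:
        _log_context.reset(token)


class _LogContextFilter(logging.Filter):
    def filter(self, record):
        # Copy the current context fields onto the log record
        for key, value in _log_context.get().items():
            setattr(record, key, value)
        return True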
def main():
    print(
        "== DANGER ZONE ==\n"
        "\n"
        "This will kill all running jobs and reset them to the PENDING state, ready\n"
        "to be restarted following a reboot.\n"
        "\n"
        "It should only be run when the job-runner service has been stopped."
        "\n"
    )
    confirm = input("Are you sure you want to continue? (y/N)")
    assert confirm.strip().lower() == "y"
    # Reset all running jobs to pending
    update_where(Job, {"state": State.PENDING, "started_at": None}, state=State.RUNNING)
    # Make sure all containers and volumes are removed ready to freshly
    # restart the jobs after the reboot
    for job in find_where(Job, state=State.PENDING):
        docker.kill(container_name(job))
        docker.delete_container(container_name(job))
        docker.delete_volume(volume_name(job))
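# NOTE: `docker.kill`, `docker.delete_container` and `docker.delete_volume` are
# thin wrappers around the docker CLI elsewhere in the codebase, and
# `container_name`/`volume_name` derive per-job resource names. The sketch
# below only illustrates the assumed "remove if present, ignore if absent"
# behaviour; the function names are hypothetical.
import subprocess


def _delete_container_sketch(name):
    # --force also kills a running container; errors (e.g. "no such
    # container") are deliberately ignored
    subprocess.run(
        ["docker", "container", "rm", "--force", name],
        capture_output=True,
        check=False,
    )


def _delete_volume_sketch(name):
    subprocess.run(
        ["docker", "volume", "rm", "--force", name],
        capture_output=True,
        check=False,
    )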
def create_and_run_jobs(
    project_dir,
    actions,
    force_run_dependencies,
    continue_on_error,
    temp_dir,
    docker_label,
    clean_up_docker_objects=True,
    log_format=LOCAL_RUN_FORMAT,
    format_output_for_github=False,
):
    # Fiddle with the configuration to suit what we need for running local
    # jobs
    docker.LABEL = docker_label
    # It's more helpful in this context to have things consistent
    config.RANDOMISE_JOB_ORDER = False
    config.HIGH_PRIVACY_WORKSPACES_DIR = project_dir.parent
    config.DATABASE_FILE = project_dir / "metadata" / "db.sqlite"
    config.TMP_DIR = temp_dir
    config.JOB_LOG_DIR = temp_dir / "logs"
    config.BACKEND = "expectations"
    config.USING_DUMMY_DATA_BACKEND = True
    config.CLEAN_UP_DOCKER_OBJECTS = clean_up_docker_objects
    # We want to fetch any reusable actions code directly from Github so as
    # to avoid pushing unnecessary traffic through the proxy
    config.GIT_PROXY_DOMAIN = "github.com"
    # Rather than using the throwaway `temp_dir` to store git repos in we use
    # a consistent directory within the system tempdir. This means we don't
    # have to keep refetching commits and also avoids the complexity of
    # deleting git's read-only directories on Windows. We use the current
    # username as a crude means of scoping the directory to the user in order
    # to avoid potential permissions issues if multiple users share the same
    # directory.
    config.GIT_REPO_DIR = Path(tempfile.gettempdir()).joinpath(
        f"opensafely_{getuser()}"
    )
    # None of the below should be used when running locally
    config.WORKDIR = None
    config.HIGH_PRIVACY_STORAGE_BASE = None
    config.MEDIUM_PRIVACY_STORAGE_BASE = None
    config.MEDIUM_PRIVACY_WORKSPACES_DIR = None
    configure_logging(
        fmt=log_format,
        # All the other output we produce goes to stdout and it's a bit
        # confusing if the log messages end up on a separate stream
        stream=sys.stdout,
        # Filter out log messages in the local run context
        extra_filter=filter_log_messages,
    )
    # Any jobs that are running or pending must be left over from a previous
    # run that was aborted either by an unexpected and unhandled exception or
    # by the researcher abruptly terminating the process. We can't reasonably
    # recover them (and the researcher may not want to -- maybe that's why
    # they terminated), so we mark them as cancelled. This causes the rest of
    # the system to effectively ignore them.
    #
    # We do this here at the beginning rather than trying to catch these cases
    # when the process exits because the latter couldn't ever completely
    # guarantee to catch every possible termination case correctly.
    database.update_where(
        Job,
        {"cancelled": True, "state": State.FAILED},
        state__in=[State.RUNNING, State.PENDING],
    )

    try:
        job_request, jobs = create_job_request_and_jobs(
            project_dir, actions, force_run_dependencies
        )
    except NothingToDoError:
        print("=> All actions already completed successfully")
        print("   Use -f option to force everything to re-run")
        return True
    except (ProjectValidationError, ReusableActionError, JobRequestError) as e:
        print(f"=> {type(e).__name__}")
        print(textwrap.indent(str(e), "   "))
        if hasattr(e, "valid_actions"):
            print("\n   Valid action names are:")
            for action in e.valid_actions:
                if action != RUN_ALL_COMMAND:
                    print(f"     {action}")
                else:
                    print(f"     {action} (runs all actions in project)")
        return False

    docker_images = get_docker_images(jobs)

    uses_stata = any(
        i.startswith(f"{config.DOCKER_REGISTRY}/stata-mp:") for i in docker_images
    )
    if uses_stata and config.STATA_LICENSE is None:
        config.STATA_LICENSE = get_stata_license()
        if config.STATA_LICENSE is None:
            print(
                "The docker image 'stata-mp' requires a license to function.\n"
                "\n"
                "If you are a member of OpenSAFELY we should have been able to fetch\n"
                "the license automatically, so something has gone wrong. Please open\n"
                "a new discussion here so we can help:\n"
                "  https://github.com/opensafely/documentation/discussions\n"
                "\n"
                "If you are not a member of OpenSAFELY you will have to provide your\n"
                "own license. See the discussion here for pointers:\n"
                "  https://github.com/opensafely/documentation/discussions/299"
            )
            return False

    for image in docker_images:
        if not docker.image_exists_locally(image):
            print(f"Fetching missing docker image: docker pull {image}")
            try:
                # We want to be chatty when running in the console so users
                # can see progress and quiet in CI so we don't spam the logs
                # with layer download noise
                docker.pull(image, quiet=not sys.stdout.isatty())
            except docker.DockerPullError as e:
                print("Failed with error:")
                print(e)
                return False

    action_names = [job.action for job in jobs]
    print(f"\nRunning actions: {', '.join(action_names)}\n")

    # Wrap all the log output inside an expandable block when running inside
    # Github Actions
    if format_output_for_github:
        print(f"::group::Job Runner Logs {ANSI.Grey}(click to view){ANSI.Reset}")

    # Run everything
    exit_condition = (
        no_jobs_remaining if continue_on_error else job_failed_or_none_remaining
    )
    try:
        run_main(exit_callback=exit_condition)
    except KeyboardInterrupt:
        pass
    finally:
        if format_output_for_github:
            print("::endgroup::")

    final_jobs = find_where(
        Job,
        state__in=[State.FAILED, State.SUCCEEDED],
        job_request_id=job_request.id,
    )
    # Always show failed jobs last, otherwise show in order run
    final_jobs.sort(
        key=lambda job: (
            1 if job.state == State.FAILED else 0,
            job.started_at or 0,
        )
    )

    # Pretty print details of each action
    print()
    if not final_jobs:
        print("=> No jobs completed")
    for job in final_jobs:
        log_file = f"{METADATA_DIR}/{job.action}.log"
        # If a job fails we don't want to clutter the output with its failed
        # dependants.
        if (
            job.state == State.FAILED
            and job.status_code == StatusCode.DEPENDENCY_FAILED
        ):
            continue
        if format_output_for_github:
            print(f"{ANSI.Bold}=> {job.action}{ANSI.Reset}")
        else:
            print(f"=> {job.action}")
        print(textwrap.indent(job.status_message, "   "))
        # Where a job failed because expected outputs weren't found we show a
        # list of other outputs which were generated
        if job.unmatched_outputs:
            print(
                "\n   Did you mean to match one of these files instead?\n    - ",
                end="",
            )
            print("\n    - ".join(job.unmatched_outputs))
        print()
        # Output the entire log file inside an expandable block when running
        # inside Github Actions
        if format_output_for_github:
            print(
                f"::group:: log file: {log_file} {ANSI.Grey}(click to view){ANSI.Reset}"
            )
            long_grey_line = ANSI.Grey + ("\u2015" * 80) + ANSI.Reset
            print(long_grey_line)
            print((project_dir / log_file).read_text())
            print(long_grey_line)
            print("::endgroup::")
        else:
            print(f"   log file: {log_file}")
        # Display matched outputs
        print("   outputs:")
        outputs = sorted(job.outputs.items()) if job.outputs else []
        print(tabulate(outputs, separator="  - ", indent=5, empty="(no outputs)"))
        # If a job exited with an error code then try to display the end of
        # the log output in case that makes the problem immediately obvious
        if job.status_code == StatusCode.NONZERO_EXIT:
            logs, truncated = get_log_file_snippet(
                project_dir / log_file, max_lines=32
            )
            if logs:
                print(f"\n   logs{' (truncated)' if truncated else ''}:\n")
                print(textwrap.indent(logs, "      "))
        print()

    success_flag = all(job.state == State.SUCCEEDED for job in final_jobs)
    return success_flag
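# NOTE: `no_jobs_remaining` and `job_failed_or_none_remaining` (passed to
# `run_main` as `exit_callback` above) are defined elsewhere in the codebase.
# They are assumed to receive the list of still-active jobs from each pass of
# the main loop and return True when the local run should stop. A minimal
# sketch of that assumption; the names and exact signature are hypothetical.
def _no_jobs_remaining_sketch(active_jobs):
    return len(active_jobs) == 0


def _job_failed_or_none_remaining_sketch(active_jobs):
    # Stop early as soon as any job fails, otherwise run until nothing is left
    if any(job.state == State.FAILED for job in active_jobs):
        return True
    return len(active_jobs) == 0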