def reattach(j: Jobs, rules: Rules, pending_jobs: List[Execution]) -> List[DelegateExecution]:
    """Reattach to executions that have an external reference; cancel the rest.

    For every execution in ``pending_jobs`` whose ``exec_xref`` is set, the
    rule for its transform is looked up, the rule's executor client is asked
    to ``reattach`` to that reference, and the resulting object is collected.
    Executions without an ``exec_xref`` are cancelled via
    ``j.cancel_execution``.

    Args:
        j: Job store; only ``cancel_execution`` is used here.
        rules: Provides ``get_rule`` and ``get_client``.
        pending_jobs: Executions to inspect.

    Returns:
        The objects returned by each client's ``reattach`` call, in the order
        the executions were visited.
    """
    executing = []
    for e in pending_jobs:
        if e.exec_xref is None:
            # Nothing external to reattach to, so the execution is cancelled.
            log.warning("Canceling %s", e.id)
            j.cancel_execution(e.id)
            continue

        rule = rules.get_rule(e.transform)
        client = rules.get_client(rule.executor)
        executing.append(client.reattach(e.exec_xref))
        log.warning("Reattaching existing job %s: %s", e.transform, e.exec_xref)
    return executing
def main_loop(jinja2_env: Environment,
              j: Jobs,
              new_object_listener: Callable,
              rules: Rules,
              state_dir: str,
              executing: List[DelegateExecution],
              capture_output: bool,
              req_confirm: bool,
              maxfail: int,
              maxstart: None,
              properties_to_add=None) -> int:
    """Start pending rule executions and poll running ones until nothing is left.

    Each pass of the loop: checks for an interrupt, checks the failure limit,
    logs a queue summary when it changes, starts every satisfiable job (up to
    ``maxstart``), then polls the running executions and records completions
    or failures. When nothing is running and nothing more can be started,
    deferred jobs are enabled and the loop either continues with them or ends.

    Args:
        jinja2_env: Forwarded to ``publish`` and ``execute``.
        j: Job store used to fetch pending executions and record their state.
        new_object_listener: Not used by this function.
        rules: Source of rules, executor clients and variables.
        state_dir: Directory holding ``timeline.log``; also passed to
            ``get_job_dir`` and ``xref.Resolver``.
        executing: Executions that are already running. This list is mutated
            in place as jobs start and finish.
        capture_output: Forwarded to ``execute``.
        req_confirm: When true, ``ui.confirm_execution`` is consulted before
            each job is started.
        maxfail: Once this many failures have been recorded the loop stops
            (asking the user first if executions are still running).
        maxstart: ``None`` for no limit, otherwise the maximum number of
            executions this call will start.
        properties_to_add: Forwarded to ``_amend_outputs`` for every
            successful completion. Defaults to an empty list.

    Returns:
        ``-1`` if at least one execution failed, otherwise ``0``.
    """
    from conseq.exec_client import create_publish_exec_client

    if properties_to_add is None:
        properties_to_add = []

    _client_for_publishing = Lazy(
        lambda: create_publish_exec_client(rules.get_vars()))

    resources_per_client = {
        name: client.resources
        for name, client in rules.exec_clients.items()
    }
    timings = TimelineLog(state_dir + "/timeline.log")
    resolver = xref.Resolver(state_dir, rules.vars)

    prev_msg = None
    abort = False
    success_count = 0
    failures = []
    start_count = 0
    job_ids_to_ignore = set()
    skip_remaining = False

    def get_pending():
        pending_jobs = j.get_pending()
        if skip_remaining:
            # Record what is being skipped *before* discarding the list so the
            # "skipped" count in the status message reflects these jobs.
            job_ids_to_ignore.update(pj.id for pj in pending_jobs)
            return []
        return [pj for pj in pending_jobs if pj.id not in job_ids_to_ignore]

    with ui.capture_sigint() as was_interrupted_fn:
        while not abort:
            if was_interrupted_fn():
                break

            if len(failures) >= maxfail:
                we_should_stop = True
                if executing:
                    # if we have other tasks which are still running, ask user
                    # if we really want to abort now.
                    we_should_stop, maxfail = ui.user_says_we_should_stop(
                        len(failures), executing)
                if we_should_stop:
                    break

            pending_jobs = get_pending()

            summary = get_execution_summary(executing)
            msg = "%d processes running (%s), %d executions pending, %d skipped" % (
                len(executing), summary, len(pending_jobs),
                len(job_ids_to_ignore))
            if prev_msg != msg:
                log.info(msg)
                if len(pending_jobs) + len(executing) > 0:
                    long_summary = get_long_execution_summary(
                        executing, pending_jobs)
                    log.info("Summary of queue:\n%s\n", long_summary)
            prev_msg = msg

            cannot_start_more = (maxstart is not None
                                 and start_count >= maxstart) or skip_remaining
            if not executing and (cannot_start_more or not pending_jobs):
                # now that we've completed everything, check for deferred jobs
                # by marking them as ready. If we have any, loop again
                j.enable_deferred()
                deferred_jobs = len(get_pending())
                if deferred_jobs > 0 and not cannot_start_more:
                    log.info("Marked deferred %d executions as ready",
                             deferred_jobs)
                    continue
                break

            did_useful_work = False

            # might be worth checking to see if the inputs are identical to previous call
            # to avoid wasting CPU time checking to schedule over and over when resources are exhausted.

            # also, the current design has an issue when rerunning part of the execution tree. Imagine
            # rule "A" produces "a1", "b1", and "c1", rule "T" transforms "a1" to "a2", "b1" to "b2", and "c1" to "c2".
            # Lastly rule F takes in a2, b2, and c2 and produces "f".
            # Now, everything is great if starting from a clean slate. But we've run once, in the artifact db we have
            # a1, a2, b1, b2, c1, c2, f. If we then rerun T, then we'll get the following executions: (new objects denoted with
            # "*", old objects from previous run have no star.)
            # T(a1) -> a2*
            # F(a2*, b2, c2) -> f*
            # T(b1) -> b2*
            # F(a2*, b2*, c2) -> f*
            # T(c1) -> c2*
            # F(a2*, b2*, c2*) -> f*
            #
            # So in the end the right thing would get done. However, we've run F three times as many as necessary. If we
            # had a priority queue for work, then we could just set each rule execution priority to be the max(input.id)
            # That would force a breadth-first execution of the graph. However, since jobs can execute in parallel,
            # prioritizing is not enough. (And we can't block based on priority or there'd be no parallelism!)
            #
            # ultimately, I don't think there's a shortcut, and we may need to check the DAG from the previous execution to see
            # if ancestor node is being re-executed, if so, prune that pending rule execution from the pending list until that
            # task is done.
            ready_jobs = get_satisfiable_jobs(rules, resources_per_client,
                                              pending_jobs, executing)
            for job in ready_jobs:
                assert isinstance(job, dep.RuleExecution)

                if maxstart is not None and start_count >= maxstart:
                    break

                did_useful_work = True

                rule = rules.get_rule(job.transform)

                timings.log(job.id, "preprocess_xrefs")
                # process xrefs which might require rewriting an artifact
                xrefs_resolved = exec_client.preprocess_xref_inputs(
                    j, resolver, job.inputs)
                if xrefs_resolved:
                    log.info(
                        "Resolved xrefs on rule, new version will be executed next pass"
                    )
                    timings.log(job.id, "resolved_xrefs")
                    continue

                timings.log(job.id, "preprocess_inputs")
                if rule.is_publish_rule:
                    client = _client_for_publishing()
                else:
                    # localize paths that will be used in scripts
                    client = rules.get_client(rule.executor)
                inputs, resolver_state = client.preprocess_inputs(
                    resolver, bind_inputs(rule, job.inputs))
                debug_log.log_input_preprocess(job.id, job.inputs, inputs)

                # if we require confirmation from the user, do this before we continue
                if req_confirm:
                    answer = ui.confirm_execution(job.transform, inputs)
                    if answer == "a":
                        # stop asking and run this job
                        req_confirm = False
                    elif answer == "q":
                        abort = True
                        break
                    elif answer == "s":
                        # skip only this job
                        job_ids_to_ignore.add(job.id)
                        continue
                    elif answer == "S":
                        # skip this job and everything still pending
                        skip_remaining = True
                        break

                if rule.is_publish_rule:
                    publish(jinja2_env, rule.publish_location,
                            rules.get_vars(), inputs)

                # maybe record_started and update_exec_xref should be merged so anything started
                # always has an xref
                exec_id = j.record_started(job.id)
                timings.log(job.id, "start")

                job_dir = get_job_dir(state_dir, exec_id)
                # exist_ok avoids the check-then-create race
                os.makedirs(job_dir, exist_ok=True)

                e = execute(job.transform, resolver, jinja2_env, exec_id,
                            job_dir, inputs, rule, rules.get_vars(),
                            capture_output, resolver_state, client)
                executing.append(e)
                j.update_exec_xref(e.id, e.get_external_id(), job_dir)
                start_count += 1

            # now poll the jobs which are running and look for which have completed.
            # Iterating in reverse keeps the remaining indices valid after each `del`.
            for i, e in reversed(list(enumerate(executing))):
                failure, completion = e.get_completion()

                if failure is None and completion is None:
                    # still running
                    continue

                del executing[i]
                timestamp = datetime.datetime.now().isoformat()

                if completion is not None:
                    rule = rules.get_rule(e.transform)
                    if not rule.has_for_all_input():
                        # only do this check if no inputs are marked as "for all"
                        # because we can have cases where a new artifact appears and we _do_ want
                        # to re-run the rule and clobber the output of the previous run.
                        # If we wanted to be very conservative, we could handle for-all by
                        # looking up which rule created the previous artifact and confirm that it was
                        # from a rule with the same inputs, only verifying the "all" parameters have
                        # changed. However, just ignoring clobbers from rules with "for all" is a cheap
                        # approximation.
                        _failures = []
                        for artifact in completion:
                            if j.get_existing_id(None, artifact) is not None:
                                _failure = f"Rule {e.transform} ({e.job_dir}) generated an output which already exists: {artifact}"
                                _failures.append(_failure)
                                log.error(_failure)
                        if _failures:
                            failure = ", ".join(_failures)

                if failure is not None:
                    job_id = j.record_completed(timestamp, e.id,
                                                dep.STATUS_FAILED, {})
                    failures.append((e.transform, e.job_dir))
                    debug_log.log_completed(job_id, dep.STATUS_FAILED,
                                            completion)
                    timings.log(job_id, "fail")
                elif completion is not None:
                    amended_outputs = _amend_outputs(completion,
                                                     properties_to_add)
                    job_id = j.record_completed(timestamp, e.id,
                                                dep.STATUS_COMPLETED,
                                                amended_outputs)
                    debug_log.log_completed(job_id, dep.STATUS_COMPLETED,
                                            completion)
                    success_count += 1
                    timings.log(job_id, "complete")

                did_useful_work = True

            j.refresh_rules()

            if not did_useful_work:
                # nothing started or finished this pass; avoid a busy loop
                time.sleep(0.5)

    if executing:
        ui.ask_user_to_cancel(j, executing)

    log.info("%d jobs successfully executed", success_count)
    if failures:
        # maybe also show summary of which jobs failed?
        log.warning(
            "%d jobs failed: %s", len(failures), ", ".join(
                "{} ({})".format(job_dir, transform)
                for transform, job_dir in failures))
        return -1

    return 0