def get_satisfiable_jobs(
        rules: Rules,
        resources_per_client: Dict[str, Dict[str, Union[float, int]]],
        pending_jobs: List[RuleExecution],
        executions: Union[List[Execution], List[DelegateExecution]]
) -> List[RuleExecution]:
    """Return the subset of pending_jobs whose resource requirements still fit.

    Starts from each client's full resource budget (resources_per_client),
    subtracts what the in-flight `executions` are consuming, then greedily
    admits pending jobs in order, charging each admitted job against its
    executor's remaining budget.
    """
    # Mutable copy of each client's budget; we decrement as capacity is consumed.
    remaining_by_client = {client: dict(budget)
                           for client, budget in resources_per_client.items()}

    def budget_for(job):
        # Remaining-resource dict for the client that runs this job's rule.
        return remaining_by_client[rules.get_rule(job.transform).executor]

    # Charge everything that is already running.
    for running in executions:
        pool = budget_for(running)
        for resource, amount in rules.get_rule(running.transform).resources.items():
            pool[resource] -= amount

    # Greedily admit pending jobs that still fit their executor's budget.
    ready = []
    for candidate in pending_jobs:
        needs = rules.get_rule(candidate.transform).resources
        pool = budget_for(candidate)
        if all(pool[resource] >= amount for resource, amount in needs.items()):
            for resource, amount in needs.items():
                pool[resource] -= amount
            ready.append(candidate)
    return ready
def reattach(j: Jobs, rules: Rules, pending_jobs: List[Execution]) -> List[DelegateExecution]:
    """Resume externally-running executions recorded in pending_jobs.

    Executions that carry an external reference (exec_xref) are reattached via
    their rule's executor client and returned; executions without one cannot be
    resumed and are canceled in the job store.

    Returns:
        The list of reattached delegate executions.
    """
    executing = []
    for e in pending_jobs:
        # fix: compare against None with `is not` (was `!= None`)
        if e.exec_xref is not None:
            rule = rules.get_rule(e.transform)
            client = rules.get_client(rule.executor)
            ee = client.reattach(e.exec_xref)
            executing.append(ee)
            # fix: log.warn is a deprecated alias of log.warning
            log.warning("Reattaching existing job {}: {}".format(
                e.transform, e.exec_xref))
        else:
            log.warning("Canceling {}".format(e.id))
            j.cancel_execution(e.id)
    return executing
def test_relative_file_paths(tmpdir):
    """A relative filename() input should be stored as an absolute path."""
    sample_rel_path = os.path.relpath(__file__, os.path.abspath("."))
    assert sample_rel_path[0] != "/"
    statements = parser.parse_str("""
    rule a:
        inputs: x=filename("{}")
    """.format(sample_rel_path))
    rules = Rules()
    _eval_stmts(rules, statements, "none", HashCache(str(tmpdir.join("hashcache"))))
    a = rules.get_rule("a")
    assert a is not None
    print(a.inputs)
    # fix: this line was a bare comparison expression (a no-op) — it needs
    # `assert` to actually check anything.
    assert a.inputs[0].json_obj["name"] == os.path.abspath(sample_rel_path)
def test_file_ref_with_copy_to(tmpdir):
    """filename(..., copy_to=...) should record the copy_to destination."""
    rules = Rules()
    localfile = tmpdir.join("xyz")
    localfile.write("x")
    source = """
    rule a:
        inputs: x=filename("{}", copy_to="z")
    """.format(localfile)
    _eval_stmts(rules, parser.parse_str(source), "none",
                HashCache(str(tmpdir.join("hashcache"))))
    rule_a = rules.get_rule("a")
    assert rule_a is not None
    assert rule_a.inputs[0].copy_to == "z"
def test_file_ref(tmpdir):
    """filename() should yield a $fileref artifact named relative to tmpdir."""
    rules = Rules()
    localfile = tmpdir.join("xyz")
    localfile.write("x")
    source = """
    rule a:
        inputs: x=filename("{}")
    """.format(localfile)
    working_dir = str(tmpdir) + "/none"
    _eval_stmts(rules, parser.parse_str(source), working_dir,
                HashCache(str(tmpdir.join("hashcache"))))
    rule_a = rules.get_rule("a")
    assert rule_a is not None
    print(rule_a.inputs)
    file_ref = rule_a.inputs[0]
    assert file_ref.json_obj["name"] == os.path.relpath(str(localfile), str(tmpdir))
    assert file_ref.json_obj["type"] == "$fileref"
    assert file_ref.copy_to is None
    assert len(rules.objs) == 1
def test_eval_if():
    """An if/else statement should take the else branch for a false condition."""
    from conseq.config import Rules, _eval_stmts

    rules = Rules()
    parsed = parser.parse_str("""
    if "'x' == 'y'":
        let a='1'
    else:
        let a='2'
    endif
    """)
    _eval_stmts(rules, parsed, "none", None)
    assert rules.vars["a"] == "2"
def test_parse_if():
    """Parsing with the explicit "declarations" start rule handles if/else."""
    from conseq.config import Rules, _eval_stmts

    rules = Rules()
    parsed = parser.parse_str(
        """
    if "'x' == 'y'":
        let a='1'
    else:
        let a='2'
    endif
    """, "declarations")
    _eval_stmts(rules, parsed, "none", None)
    assert rules.vars["a"] == "2"
def test_file_refs_with_vars(tmpdir):
    """Config variables should be expanded inside filename() inputs."""
    # make sure we can use variables work in filenames
    rules = Rules()
    rules.set_var("VARIABLE", str(tmpdir))
    rules.set_var("NUMBER", 2)
    localfile = tmpdir.join("xyz-2")
    localfile.write("x")
    statements = parser.parse_str("""
    rule a:
        inputs: x=filename("{{config.VARIABLE}}/xyz-{{config.NUMBER}}")
    """)
    _eval_stmts(rules, statements, "none", HashCache(str(tmpdir.join("hashcache"))))
    a = rules.get_rule("a")
    assert a is not None
    print(a.inputs)
    # fix: this line was a bare comparison expression (a no-op) — it needs
    # `assert` to actually check anything.
    assert a.inputs[0].json_obj["name"] == str(localfile)
def test_generic_eval():
    """An eval block can mutate config; a later if-condition observes it."""
    from conseq.config import Rules, _eval_stmts

    rules = Rules()
    script = """
    eval \"\"\"
    print('here')
    rules.set_var('x', 'y')
    print(config['x'])
    print(rules.vars)
    print(config)
    \"\"\"
    if "config.x == 'y'":
        let a='1'
    else:
        let a='2'
    endif
    """
    _eval_stmts(rules, parser.parse_str(script), "none", None)
    assert rules.vars["a"] == "1"
def main_loop(jinja2_env: Environment, j: Jobs, new_object_listener: Callable,
              rules: Rules, state_dir: str, executing: List[DelegateExecution],
              capture_output: bool, req_confirm: bool, maxfail: int,
              maxstart: None, properties_to_add=[]) -> None:
    """Main scheduler loop: start pending rule executions when resources allow,
    poll running executions, and record results until no work remains, the
    failure limit is reached, or the user aborts.

    Args:
        jinja2_env: template environment forwarded to publish() and execute().
        j: job/artifact store; queried for pending work, updated with results.
        new_object_listener: NOTE(review): not referenced anywhere in this body.
        rules: rule definitions plus executor clients and config vars.
        state_dir: directory holding per-execution job dirs and timeline.log.
        executing: executions already in flight; mutated in place as jobs
            start and complete.
        capture_output: forwarded to execute() — presumably controls whether
            stdout/stderr are captured; confirm at the execute() definition.
        req_confirm: when True, ask the user before starting each job.
        maxfail: stop scheduling once this many executions have failed
            (the user may raise it interactively via ui.user_says_we_should_stop).
        maxstart: cap on executions started in this call; annotated as None
            but treated as Optional[int] by the `is not None` checks below.
        properties_to_add: extra properties merged into completed outputs.
            NOTE(review): mutable default argument — shared across calls.

    Returns:
        0 on success, -1 if any execution failed.
        NOTE(review): contradicts the declared `-> None` return annotation.
    """
    from conseq.exec_client import create_publish_exec_client

    # The publish client is only needed by publish rules, so build it lazily.
    _client_for_publishing = Lazy(
        lambda: create_publish_exec_client(rules.get_vars()))

    # Static per-executor resource capacities, keyed by client name.
    resources_per_client = dict([
        (name, client.resources) for name, client in rules.exec_clients.items()
    ])

    timings = TimelineLog(state_dir + "/timeline.log")
    # NOTE(review): active_job_ids is added to below but never read in this
    # function — possibly vestigial.
    active_job_ids = set([e.id for e in executing])

    resolver = xref.Resolver(state_dir, rules.vars)

    prev_msg = None            # last status line logged; used to suppress repeats
    abort = False              # set when the user chooses to quit
    success_count = 0
    failures = []              # (transform, job_dir) pairs for failed executions
    start_count = 0            # executions started this call (checked vs maxstart)
    job_ids_to_ignore = set()  # jobs the user chose to skip
    skip_remaining = False     # user chose to skip all remaining jobs

    def get_pending():
        # Pending rule executions minus those the user has skipped.
        pending_jobs = j.get_pending()
        if skip_remaining:
            pending_jobs = []
            # NOTE(review): this update runs after pending_jobs was reset to [],
            # so it adds nothing — possibly intended to run before the reset.
            job_ids_to_ignore.update([pj.id for pj in pending_jobs])
        else:
            pending_jobs = [
                pj for pj in pending_jobs if pj.id not in job_ids_to_ignore
            ]
        return pending_jobs

    with ui.capture_sigint() as was_interrupted_fn:
        while not abort:
            interrupted = was_interrupted_fn()
            if interrupted:
                break

            if len(failures) >= maxfail:
                we_should_stop = True
                if len(executing) > 0:
                    # if we have other tasks which are still running, ask user
                    # if we really want to abort now.
                    we_should_stop, maxfail = ui.user_says_we_should_stop(
                        len(failures), executing)
                if we_should_stop:
                    break

            pending_jobs = get_pending()

            # Log a status line, but only when it changed since last iteration.
            summary = get_execution_summary(executing)
            msg = "%d processes running (%s), %d executions pending, %d skipped" % (
                len(executing), summary, len(pending_jobs),
                len(job_ids_to_ignore))
            if prev_msg != msg:
                log.info(msg)
                if len(pending_jobs) + len(executing) > 0:
                    long_summary = get_long_execution_summary(
                        executing, pending_jobs)
                    log.info("Summary of queue:\n%s\n", long_summary)
            prev_msg = msg

            cannot_start_more = (maxstart is not None
                                 and start_count >= maxstart) or skip_remaining
            if len(executing) == 0 and (cannot_start_more
                                        or len(pending_jobs) == 0):
                # now that we've completed everything, check for deferred jobs by
                # marking them as ready. If we have any, loop again
                j.enable_deferred()
                deferred_jobs = len(get_pending())
                if deferred_jobs > 0 and not cannot_start_more:
                    log.info("Marked deferred %d executions as ready",
                             deferred_jobs)
                    continue
                break

            did_useful_work = False

            # might be worth checking to see if the inputs are identical to previous call
            # to avoid wasting CPU time checking to schedule over and over when resources are exhausted.
            # also, the current design has an issue when rerunning part of of the execution tree. Imagine
            # rule "A" produces "a1", "b1", and "c1", rule "T" transforms "a1" to "a2", "b1" to "b2, and "c1" to "c2".
            # Lastly rule F takes in a2, b2, and c2 and produces "f".
            # Now, everything is great if starting from a clean slate. But we've run once, in the artifact db we have
            # a1, a2, b1, b2, c1, c2, f. If we then rerun T, then we'll get the following executions: (new objects denoted with
            # "*", old objects from previous run have no star.)
            # T(a1) -> a2*
            # F(a2*, b2, c2) -> f*
            # T(b1) -> b2*
            # F(a2*, b2*, c2) -> f*
            # T(c1) -> c2*
            # F(a2*, b2*, c2*) -> f*
            #
            # So in the end the right thing would get done. However, we've run F three times as many as necessary. If we
            # had a priority queue for work, then we could just set each rule execution priority to be the max(input.id)
            # That would force a breadth-first execution of the graph. However, since jobs can execute in parallel,
            # priortizing is not enough. (And we can't block based on priority or there'd be no parallelism!)
            #
            # ultimately, I don't think there's a shortcut, and we may need to check the DAG from the previous execution to see
            # if ancestor node is being re-executed, if so, prune that pending rule execution from the pending list until that
            # task is done.
            ready_jobs = get_satisfiable_jobs(rules, resources_per_client,
                                              pending_jobs, executing)
            for job in ready_jobs:
                assert isinstance(job, dep.RuleExecution)

                if maxstart is not None and start_count >= maxstart:
                    break

                active_job_ids.add(job.id)
                did_useful_work = True

                rule = rules.get_rule(job.transform)
                timings.log(job.id, "preprocess_xrefs")
                # process xrefs which might require rewriting an artifact
                xrefs_resolved = exec_client.preprocess_xref_inputs(
                    j, resolver, job.inputs)
                if xrefs_resolved:
                    log.info(
                        "Resolved xrefs on rule, new version will be executed next pass"
                    )
                    timings.log(job.id, "resolved_xrefs")
                    continue

                timings.log(job.id, "preprocess_inputs")
                if rule.is_publish_rule:
                    client = _client_for_publishing()
                else:
                    # localize paths that will be used in scripts
                    client = rules.get_client(rule.executor)
                inputs, resolver_state = client.preprocess_inputs(
                    resolver, bind_inputs(rule, job.inputs))
                debug_log.log_input_preprocess(job.id, job.inputs, inputs)

                # if we're required confirmation from the user, do this before
                # we continue
                if req_confirm:
                    answer = ui.confirm_execution(job.transform, inputs)
                    if answer == "a":
                        # "a" = always: stop asking for the rest of this run
                        req_confirm = False
                    elif answer == "q":
                        abort = True
                        break
                    elif answer == "s":
                        # "s" = skip just this job
                        job_ids_to_ignore.add(job.id)
                        continue
                    elif answer == "S":
                        # "S" = skip everything still pending
                        skip_remaining = True
                        break

                if rule.is_publish_rule:
                    publish(jinja2_env, rule.publish_location,
                            rules.get_vars(), inputs)

                # maybe record_started and update_exec_xref should be merged so
                # anything started always has an xref
                exec_id = j.record_started(job.id)
                timings.log(job.id, "start")

                job_dir = get_job_dir(state_dir, exec_id)
                if not os.path.exists(job_dir):
                    os.makedirs(job_dir)

                e = execute(job.transform, resolver, jinja2_env, exec_id,
                            job_dir, inputs, rule, rules.get_vars(),
                            capture_output, resolver_state, client)
                executing.append(e)
                j.update_exec_xref(e.id, e.get_external_id(), job_dir)
                start_count += 1

            # now poll the jobs which are running and look for which have
            # completed (iterate in reverse so `del executing[i]` is safe)
            for i, e in reversed(list(enumerate(executing))):
                failure, completion = e.get_completion()

                if failure is None and completion is None:
                    continue

                del executing[i]
                timestamp = datetime.datetime.now().isoformat()

                if completion is not None:
                    rule = rules.get_rule(e.transform)
                    if not rule.has_for_all_input():
                        # only do this check if no inputs are marked as "for all"
                        # because we can have cases where a new artifact appears and we _do_ want
                        # to re-run the rule and clobber the output of the previous run.
                        # If we wanted to be very conservative, we could handle for-all by
                        # looking up which rule created the previous artifact and confirm that it was
                        # from a rule with the same inputs, only verifying the "all" parameters have
                        # changed. However, just ignoring clobbers from rules with "for all" is a cheap
                        # approximation.
                        _failures = []
                        for artifact in completion:
                            if j.get_existing_id(None, artifact) is not None:
                                # j.gc()
                                # NOTE(review): message is missing a closing
                                # parenthesis after the job_dir.
                                _failure = f"Rule {e.transform} ({e.job_dir} generated an output which already exists: {artifact}"
                                _failures.append(_failure)
                                log.error(_failure)
                        if len(_failures) > 0:
                            # Treat clobbered outputs as a failure of this job.
                            failure = ", ".join(_failures)

                if failure is not None:
                    job_id = j.record_completed(timestamp, e.id,
                                                dep.STATUS_FAILED, {})
                    failures.append((e.transform, e.job_dir))
                    debug_log.log_completed(job_id, dep.STATUS_FAILED,
                                            completion)
                    timings.log(job_id, "fail")
                elif completion is not None:
                    amended_outputs = _amend_outputs(completion,
                                                     properties_to_add)
                    job_id = j.record_completed(timestamp, e.id,
                                                dep.STATUS_COMPLETED,
                                                amended_outputs)
                    debug_log.log_completed(job_id, dep.STATUS_COMPLETED,
                                            completion)
                    success_count += 1
                    timings.log(job_id, "complete")

                did_useful_work = True
                j.refresh_rules()

            if not did_useful_work:
                # Nothing started or finished this pass; avoid a busy-wait.
                time.sleep(0.5)

    if len(executing) > 0:
        ui.ask_user_to_cancel(j, executing)

    log.info("%d jobs successfully executed", success_count)
    if len(failures) > 0:
        # maybe also show summary of which jobs failed?
        log.warning(
            "%d jobs failed: %s", len(failures), ", ".join([
                "{} ({})".format(job_dir, transform)
                for transform, job_dir in failures
            ]))
        return -1

    return 0