def do_stuff(rel_time):
    """Advance the submission schedule by one step.

    If the head of ``workloads`` has a submit time that ``rel_time`` has
    reached, it is moved to ``pending``. Every pending workload admitted by
    ``accept_workload`` is then started, and ``alive`` is refreshed in place
    after each poll.

    Returns ``False`` once both ``workloads`` and ``pending`` are empty (after
    a final wait on ``alive`` with no timeout argument), ``True`` otherwise.
    """
    # Queue at most one workload per call: the head of the schedule, if due.
    if workloads:
        head, due_at, row = workloads[0]
        if due_at <= rel_time:
            workloads.pop(0)
            head.job_id = row["job_id"]
            tag = f'{head.output_name}.{head.batch_num}iter.{head.job_id}'
            logger.info(f'Queued workload {tag}')
            print(f'{datetime.now()}: Queued workload {tag}', file=f)
            pending.append(head)

    # Slice-assign so the enclosing scope's list object is updated in place.
    _gone, alive[:] = SalusServer.wait_workloads(alive, timeout=0, callback=workload_done)

    # Start pending workloads in FIFO order for as long as the head is accepted.
    while pending and accept_workload(pending[0], alive):
        nxt = pending.pop(0)
        tag = f'{nxt.output_name}.{nxt.batch_num}iter.{nxt.job_id}'

        logger.info(f'Started workload {tag}')
        print(f'{datetime.now()}: Started workload {tag}', file=f)

        nxt.run(tmp / f'{tag}.output')
        started.append(nxt)
        alive.append(nxt)

        # Refresh the alive set before evaluating the next candidate.
        _gone, alive[:] = SalusServer.wait_workloads(alive, timeout=0, callback=workload_done)

    if workloads or pending:
        return True

    # Nothing left to queue or start: wait on whatever is still alive.
    _gone, alive[:] = SalusServer.wait_workloads(alive, callback=workload_done)
    return False
def limit_concurrent(wls):
    # type: (Iterable[Workload]) -> None
    """Block until fewer than ``FLAGS.concurrent_jobs`` of `wls` are alive.

    Polls ``SalusServer.wait_workloads`` with ``timeout=0`` and pauses 0.25 s
    between polls. Returns as soon as a poll reports fewer alive workloads
    than the limit, without a trailing sleep.

    :param wls: workloads to watch; materialised once so that a one-shot
        iterator can be polled repeatedly.
    """
    # The declared type is Iterable, but every poll iterates it again.
    wls = list(wls)

    while True:
        _, alive = SalusServer.wait_workloads(wls, timeout=0)
        if len(alive) < FLAGS.concurrent_jobs:
            return
        # Only back off when we actually have to poll again.
        time.sleep(.25)
def run(self, workloads, **kwargs):
    """Carry out this pause action.

    * ``Pause.Manual``: hand control to ``prompt.pause()``.
    * ``Pause.Wait``: wait on the given workloads via
      ``SalusServer.wait_workloads``.
    * any other value: sleep for ``self`` seconds.

    Extra keyword arguments are accepted and ignored.
    """
    if self == Pause.Manual:
        prompt.pause()
        return

    if self == Pause.Wait:
        count = len(workloads)
        logger.info(f"Waiting current {count} workloads to finish")
        SalusServer.wait_workloads(workloads)
        return

    # Any other value is used directly as the sleep duration.
    logger.info(f"Sleep {self} seconds")
    time.sleep(self)
def run_tf(output_dir, *actions):
    # type: (Path, *TAction) -> List[Workload]
    """Run a sequence of actions against the TF executor.

    Each action is handled in order:

    * a ``Workload`` (which must use ``Executor.TF``) is started with an
      output file inside the temporary directory and recorded;
    * a ``Pause`` or ``RunFn`` is run with the workloads started so far;
    * anything else is rejected with ``ValueError``.

    Exceptions raised while running the actions or waiting for the workloads
    are logged and not re-raised. Cleanup then always runs: workloads that are
    still running are killed, and each workload's ``output_file`` is re-pointed
    at `output_dir`.

    :param output_dir: final directory for workload outputs.
    :param actions: workloads, pauses and functions to execute in order.
    :return: the workloads that were started.
    :raises RuntimeError: if ``FLAGS.ignore_error`` is false and a started
        workload has a non-zero (or missing) return code.
    """
    workloads = []  # type: List[Workload]

    try:
        # NOTE(review): presumably atomic_directory publishes temp_dir as
        # output_dir on successful exit -- confirm against its definition.
        with atomic_directory(output_dir) as temp_dir:  # type: Path
            # Do action specified in seq
            for act in actions:
                if isinstance(act, Workload):
                    if act.executor != Executor.TF:
                        raise ValueError('run_tf can only run TF workloads')
                    # The running index keeps output names unique per workload.
                    output_file = temp_dir / f'{act.output_name}.{act.batch_num}iter.{len(workloads)}.output'
                    act.run(output_file)
                    workloads.append(act)
                elif isinstance(act, (Pause, RunFn)):
                    act.run(workloads, temp_dir=temp_dir)
                else:
                    raise ValueError(f"Unexpected value `{act}' of {type(act)} passed to run_tf")

            logger.info('Waiting all workloads to finish')
            SalusServer.wait_workloads(workloads)
    except Exception:
        logger.exception("Got exception when running workloads")
    finally:
        # Anything still alive at this point is killed as part of cleanup.
        for w in workloads:
            if w.proc is not None and w.proc.poll() is None:
                logger.warning(f'Killing workload that is not stopped yet: {w.canonical_name}')
                kill_tree(w.proc, hard=True)

        # Check each workload and fix its output_file path.
        for w in workloads:
            # Same None guard as the kill loop above; a workload without a
            # process is reported as not having finished cleanly.
            returncode = w.proc.returncode if w.proc is not None else None
            if not FLAGS.ignore_error and returncode != 0:
                prompt.pause()
                raise RuntimeError(f'Workload {w.canonical_name} did not finish cleanly: {returncode}')
            w.output_file = output_dir / w.output_file.name
    return workloads