예제 #1
0
파일: exp15.py 프로젝트: vycezhong/Salus-1
                    def do_stuff(rel_time):
                        """Run one scheduling tick at relative time *rel_time*.

                        Moves the next scheduled workload to the pending queue once its
                        submit time has been reached, reaps finished workloads, and starts
                        pending workloads for as long as ``accept_workload`` admits the
                        head of the pending queue.

                        Returns ``False`` after a final wait on the alive workloads, once
                        nothing is left to queue or start; returns ``True`` otherwise.
                        """
                        # At most one workload moves from the schedule to `pending` per tick.
                        if workloads:
                            job, due_at, meta = workloads[0]
                            if rel_time >= due_at:
                                workloads.pop(0)
                                job.job_id = meta["job_id"]
                                label = f'{job.output_name}.{job.batch_num}iter.{job.job_id}'
                                logger.info(f'Queued workload {label}')
                                print(f'{datetime.now()}: Queued workload {label}', file=f)
                                pending.append(job)

                        # Refresh `alive` in place so other holders of the list see the update.
                        _reaped, alive[:] = SalusServer.wait_workloads(alive, timeout=0, callback=workload_done)

                        # Start pending workloads strictly in FIFO order while they are admitted.
                        while pending and accept_workload(pending[0], alive):
                            job = pending.pop(0)
                            label = f'{job.output_name}.{job.batch_num}iter.{job.job_id}'

                            logger.info(f'Started workload {label}')
                            print(f'{datetime.now()}: Started workload {label}', file=f)

                            job.run(tmp / f'{label}.output')
                            started.append(job)
                            alive.append(job)

                            # Re-poll so the next admission decision sees current liveness.
                            _reaped, alive[:] = SalusServer.wait_workloads(alive, timeout=0, callback=workload_done)

                        if workloads or pending:
                            return True

                        # Nothing left to schedule: wait without a timeout
                        # (presumably until all remaining workloads exit — confirm).
                        _reaped, alive[:] = SalusServer.wait_workloads(alive, callback=workload_done)
                        return False
예제 #2
0
파일: bigrun.py 프로젝트: vycezhong/Salus-1
 def limit_concurrent(wls):
     # type: (Iterable[Workload]) -> None
     """Block until fewer than ``FLAGS.concurrent_jobs`` workloads in *wls* are alive.

     Polls ``SalusServer.wait_workloads`` with ``timeout=0`` and sleeps 0.25
     seconds between polls.  Returns immediately if the first poll already
     shows a free slot.

     :param wls: the workloads to check for liveness
     """
     _, alive = SalusServer.wait_workloads(wls, timeout=0)
     while len(alive) >= FLAGS.concurrent_jobs:
         # Sleep first, then poll: the loop condition then always checks a
         # fresh result, and no extra sleep happens once a slot has freed up.
         time.sleep(.25)
         _, alive = SalusServer.wait_workloads(wls, timeout=0)
예제 #3
0
 def run(self, workloads, **kwargs):
     """Carry out this pause action.

     ``Pause.Manual`` pauses via ``prompt.pause()``; ``Pause.Wait`` waits on
     the given *workloads* through ``SalusServer.wait_workloads``; any other
     value is passed to ``time.sleep`` as a number of seconds.

     Extra keyword arguments are accepted and ignored.
     """
     if self == Pause.Manual:
         prompt.pause()
         return

     if self == Pause.Wait:
         logger.info(f"Waiting current {len(workloads)} workloads to finish")
         SalusServer.wait_workloads(workloads)
         return

     # Every remaining value is treated as a sleep duration.
     logger.info(f"Sleep {self} seconds")
     time.sleep(self)
예제 #4
0
def run_tf(output_dir, *actions):
    # type: (Path, *TAction) -> List[Workload]
    """Run a sequence of actions using TF workloads only.

    Each ``Workload`` action is started with an output file inside the
    temporary directory yielded by ``atomic_directory(output_dir)``; ``Pause``
    and ``RunFn`` actions are invoked with the workloads started so far.  Once
    all actions are processed, waits for every started workload.

    Any ``Exception`` raised while running (including the ``ValueError`` for a
    non-TF workload or an unexpected action) is logged and not re-raised.
    Workloads whose process is still running afterwards are killed.

    :param output_dir: directory that each workload's ``output_file`` is
        re-pointed to afterwards
    :param actions: ``Workload``, ``Pause`` or ``RunFn`` instances, in order
    :return: the workloads that were started
    :raises RuntimeError: if ``FLAGS.ignore_error`` is not set and a started
        workload has no process or a return code other than 0
    """
    workloads = []  # type: List[Workload]

    try:
        with atomic_directory(output_dir) as temp_dir:  # type: Path
            # Do action specified in seq
            for act in actions:
                if isinstance(act, Workload):
                    if act.executor != Executor.TF:
                        raise ValueError('run_tf can only run TF workloads')
                    output_file = temp_dir / f'{act.output_name}.{act.batch_num}iter.{len(workloads)}.output'

                    act.run(output_file)
                    workloads.append(act)
                elif isinstance(act, (Pause, RunFn)):
                    act.run(workloads, temp_dir=temp_dir)
                else:
                    raise ValueError(f"Unexpected value `{act}' of {type(act)} passed to run_tf")

            logger.info('Waiting all workloads to finish')
            SalusServer.wait_workloads(workloads)
    except Exception:
        logger.exception("Got exception when running workloads")
    finally:
        # if there's alive, we are doing cleanup
        for w in workloads:
            if w.proc is not None and w.proc.poll() is None:
                logger.warning(f'Killing workload that is not stopped yet: {w.canonical_name}')
                kill_tree(w.proc, hard=True)

        # check each workloads and fix workload output_file path
        for w in workloads:
            # Same None guard as the cleanup loop above: a workload without a
            # process is reported as a failure instead of raising AttributeError.
            returncode = w.proc.returncode if w.proc is not None else None
            if not FLAGS.ignore_error and returncode != 0:
                prompt.pause()
                raise RuntimeError(f'Workload {w.canonical_name} did not finish cleanly: {returncode}')
            # The file was written under temp_dir; point at the same name
            # under output_dir (presumably where atomic_directory leaves it
            # on exit — confirm).
            w.output_file = output_dir / w.output_file.name

    return workloads