def _add_snapshot(self, snapshot: Snapshot, iter_: int):
    # Parts of the metadata will be used in the underlying data model,
    # which is mutable, hence we thaw it here, once.
    metadata = pyrsistent.thaw(snapshot.data()[ids.METADATA])
    snapshot_tree = Node(
        iter_,
        {
            ids.STATUS: snapshot.data()[ids.STATUS],
            SORTED_REALIZATION_IDS: metadata[SORTED_REALIZATION_IDS],
            SORTED_JOB_IDS: metadata[SORTED_JOB_IDS],
        },
        NodeType.ITER,
    )
    for real_id in snapshot_tree.data[SORTED_REALIZATION_IDS]:
        real = snapshot.data()[ids.REALS][real_id]
        real_node = Node(
            real_id,
            {
                ids.STATUS: real[ids.STATUS],
                ids.ACTIVE: real[ids.ACTIVE],
                REAL_JOB_STATUS_AGGREGATED: metadata[REAL_JOB_STATUS_AGGREGATED][real_id],
                REAL_STATUS_COLOR: metadata[REAL_STATUS_COLOR][real_id],
            },
            NodeType.REAL,
        )
        snapshot_tree.add_child(real_node)
        for step_id, step in real[ids.STEPS].items():
            step_node = Node(step_id, {ids.STATUS: step[ids.STATUS]}, NodeType.STEP)
            real_node.add_child(step_node)
            for job_id in metadata[SORTED_JOB_IDS]:
                job = step[ids.JOBS][job_id]
                job_dict = dict(job)
                job_dict[ids.DATA] = job.data
                job_node = Node(job_id, job_dict, NodeType.JOB)
                step_node.add_child(job_node)

    if iter_ in self.root.children:
        self.modelAboutToBeReset.emit()
        self.root.children[iter_] = snapshot_tree
        snapshot_tree.parent = self.root
        self.modelReset.emit()
        return

    parent = QModelIndex()
    next_iter = len(self.root.children)
    self.beginInsertRows(parent, next_iter, next_iter)
    self.root.add_child(snapshot_tree)
    self.root.children[iter_] = snapshot_tree
    self.rowsInserted.emit(parent, snapshot_tree.row(), snapshot_tree.row())
def test_print_progress(self):
    out = StringIO()
    monitor = Monitor(out=out)
    sd = SnapshotDict(status="")
    for i in range(0, 100):
        status = REALIZATION_STATE_FINISHED if i < 50 else REALIZATION_STATE_WAITING
        sd.reals[i] = Realization(status=status, active=True)
    monitor._snapshots[0] = Snapshot(sd.dict())
    monitor._start_time = datetime.now()
    general_event = _UpdateEvent(
        phase_name="Test Phase",
        current_phase=0,
        total_phases=2,
        progress=0.5,
        indeterminate=False,
        iteration=0,
    )
    monitor._print_progress(general_event)
    self.assertEqual(
        """\r    --> Test Phase

    1/2 |███████████████               | 50% Running time: 0 seconds

    Waiting    50/100
    Pending     0/100
    Running     0/100
    Failed      0/100
    Finished   50/100
    Unknown     0/100
""",
        out.getvalue(),
    )
def _create_snapshot(self):
    reals = {}
    for real in self.get_active_reals():
        reals[str(real.get_iens())] = Realization(
            active=True,
            status=state.REALIZATION_STATE_WAITING,
        )
        for step in real.get_steps():
            reals[str(real.get_iens())].steps[str(step.get_id())] = Step(
                status=state.STEP_STATE_UNKNOWN
            )
            for job in step.get_jobs():
                reals[str(real.get_iens())].steps[str(step.get_id())].jobs[
                    str(job.get_id())
                ] = Job(
                    status=state.JOB_STATE_START,
                    data={},
                    name=job.get_name(),
                )
    top = SnapshotDict(
        reals=reals,
        status=state.ENSEMBLE_STATE_UNKNOWN,
        metadata=self.get_metadata(),
    )
    return Snapshot(top.dict())
def _full_snapshot_event(self, iter_) -> typing.Optional[FullSnapshotEvent]:
    """Return a FullSnapshotEvent if it was possible to create a snapshot.
    Return None if not, indicating that there should be no event."""
    run_context = self._model.get_run_context()
    detailed_progress = self._model.getDetailedProgress()
    if detailed_progress == _THE_EMPTY_DETAILED_PROGRESS:
        return None
    snapshot_dict = self._create_snapshot_dict(run_context, detailed_progress, iter_)
    if not snapshot_dict:
        return None
    snapshot = Snapshot(snapshot_dict.dict())
    self._set_iter_snapshot(iter_, snapshot)
    return FullSnapshotEvent(
        phase_name=self._model.getPhaseName(),
        current_phase=self._model.currentPhase(),
        total_phases=self._model.phaseCount(),
        indeterminate=self._model.isIndeterminate(),
        progress=self._progress(),
        iteration=iter_,
        snapshot=snapshot,
    )
def track(self):
    while True:
        event = self._work_queue.get()
        if isinstance(event, str):
            try:
                if event == EvaluatorTracker.DONE:
                    yield EndEvent(
                        failed=self._model.hasRunFailed(),
                        failed_msg=self._model.getFailMessage(),
                    )
                elif event == EvaluatorTracker.CONNECTION_ERROR:
                    yield EndEvent(
                        failed=True,
                        failed_msg="Connection error",
                    )
            except GeneratorExit:
                # consumers may exit at this point, make sure the last
                # task is marked as done
                pass
            self._work_queue.task_done()
            break
        elif event["type"] == ids.EVTYPE_EE_SNAPSHOT:
            iter_ = event.data["iter"]
            snapshot = Snapshot(event.data)
            self._iter_snapshot[iter_] = snapshot
            yield FullSnapshotEvent(
                phase_name=self._model.getPhaseName(),
                current_phase=self._model.currentPhase(),
                total_phases=self._model.phaseCount(),
                indeterminate=self._model.isIndeterminate(),
                progress=self._progress(),
                iteration=iter_,
                snapshot=snapshot,
            )
        elif event["type"] == ids.EVTYPE_EE_SNAPSHOT_UPDATE:
            iter_ = event.data["iter"]
            if iter_ not in self._iter_snapshot:
                raise OutOfOrderSnapshotUpdateException(
                    f"got {ids.EVTYPE_EE_SNAPSHOT_UPDATE} without having stored snapshot for iter {iter_}"
                )
            partial = PartialSnapshot(self._iter_snapshot[iter_]).from_cloudevent(event)
            self._iter_snapshot[iter_].merge_event(partial)
            yield SnapshotUpdateEvent(
                phase_name=self._model.getPhaseName(),
                current_phase=self._model.currentPhase(),
                total_phases=self._model.phaseCount(),
                indeterminate=self._model.isIndeterminate(),
                progress=self._progress(),
                iteration=iter_,
                partial_snapshot=partial,
            )
        self._work_queue.task_done()
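# Hypothetical usage sketch (not from the source): a consumer of the track()
# generator above, reacting to the three event types it is shown to yield
# (FullSnapshotEvent, SnapshotUpdateEvent, EndEvent). The `tracker` argument
# is assumed to be an EvaluatorTracker-like object as used above.
def consume_tracker_events(tracker):
    for event in tracker.track():
        if isinstance(event, FullSnapshotEvent):
            # a complete snapshot replaces whatever state the consumer holds
            print(f"iter {event.iteration}: full snapshot, status={event.snapshot.get_status()}")
        elif isinstance(event, SnapshotUpdateEvent):
            # a partial snapshot is merged on top of the last full one
            print(f"iter {event.iteration}: partial update, progress={event.progress}")
        elif isinstance(event, EndEvent):
            print(f"run ended, failed={event.failed}, msg={event.failed_msg}")
            break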
def full_snapshot() -> Snapshot:
    real = Realization(
        status=REALIZATION_STATE_UNKNOWN,
        active=True,
        steps={
            "0": Step(
                status="",
                jobs={
                    "0": Job(
                        start_time=dt.now(),
                        end_time=dt.now(),
                        name="poly_eval",
                        status=JOB_STATE_START,
                        error="error",
                        stdout="std_out_file",
                        stderr="std_err_file",
                        data={
                            CURRENT_MEMORY_USAGE: "123",
                            MAX_MEMORY_USAGE: "312",
                        },
                    ),
                    "1": Job(
                        start_time=dt.now(),
                        end_time=dt.now(),
                        name="poly_postval",
                        status=JOB_STATE_START,
                        error="error",
                        stdout="std_out_file",
                        stderr="std_err_file",
                        data={
                            CURRENT_MEMORY_USAGE: "123",
                            MAX_MEMORY_USAGE: "312",
                        },
                    ),
                },
            )
        },
    )
    snapshot = SnapshotDict(
        status=ENSEMBLE_STATE_STARTED,
        reals={},
    )
    for i in range(0, 100):
        snapshot.reals[str(i)] = copy.deepcopy(real)

    return Snapshot(snapshot.dict())
def snapshot_to_tree(snapshot: Snapshot, iter_: int) -> Node:
    iter_node = Node(iter_, {ids.STATUS: snapshot.get_status()}, NodeType.ITER)
    snapshot_d = SnapshotDict(**snapshot.to_dict())
    for real_id in sorted(snapshot_d.reals, key=int):
        real = snapshot_d.reals[real_id]
        real_node = Node(
            real_id,
            {ids.STATUS: real.status, ids.ACTIVE: real.active},
            NodeType.REAL,
        )
        iter_node.add_child(real_node)
        for step_id, step in real.steps.items():
            step_node = Node(step_id, {ids.STATUS: step.status}, NodeType.STEP)
            real_node.add_child(step_node)
            for job_id in sorted(step.jobs, key=int):
                job = step.jobs[job_id]
                job_dict = dict(job)
                job_dict[ids.DATA] = job.data
                job_node = Node(job_id, job_dict, NodeType.JOB)
                step_node.add_child(job_node)
    return iter_node
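# The Node/NodeType interface used by snapshot_to_tree and _add_snapshot is not
# shown in this section. The sketch below is a hypothetical minimal stand-in
# (not the project's actual implementation) that matches the usage above:
# Node(id_, data, type_), add_child(), a children dict keyed by id, a data
# dict, a parent reference, and row() giving the node's position under its
# parent.
from enum import Enum, auto


class NodeType(Enum):
    ROOT = auto()  # assumed for the model's self.root; not named in the source
    ITER = auto()
    REAL = auto()
    STEP = auto()
    JOB = auto()


class Node:
    def __init__(self, id_, data, type_):
        self.id = id_
        self.data = data
        self.type = type_
        self.parent = None
        self.children = {}  # insertion-ordered mapping of child id -> Node

    def add_child(self, node):
        node.parent = self
        self.children[node.id] = node

    def row(self):
        # Position of this node among its parent's children.
        if self.parent is None:
            return 0
        return list(self.parent.children).index(self.id)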
def test_legends(self):
    monitor = Monitor(out=StringIO())
    sd = SnapshotDict(status="")
    for i in range(0, 100):
        status = REALIZATION_STATE_FINISHED if i < 10 else REALIZATION_STATE_RUNNING
        sd.reals[i] = Realization(status=status, active=True)
    monitor._snapshots[0] = Snapshot(sd.dict())
    legends = monitor._get_legends()
    self.assertEqual(
        """    Waiting     0/100
    Pending     0/100
    Running    90/100
    Failed      0/100
    Finished   10/100
    Unknown     0/100
""",
        legends,
    )
def test_print_progress(self):
    out = StringIO()
    monitor = Monitor(out=out)
    sd = SnapshotDict(status="")
    for i in range(0, 100):
        status = REALIZATION_STATE_FINISHED if i < 50 else REALIZATION_STATE_WAITING
        sd.reals[i] = Realization(status=status, active=True)
    monitor._snapshots[0] = Snapshot(sd.dict())
    monitor._start_time = datetime.now()
    general_event = _UpdateEvent(
        phase_name="Test Phase",
        current_phase=0,
        total_phases=2,
        progress=0.5,
        indeterminate=False,
        iteration=0,
    )
    monitor._print_progress(general_event)
    # For some reason, `tqdm` adds an extra line containing a progress bar,
    # even though this test only calls it once.
    # I suspect this has something to do with the way `tqdm` does refresh,
    # but I do not know how to fix it.
    # It seems not to be an issue when used normally.
    expected = """
    --> Test Phase

    |                               | 0% it
    1/2 |##############################5              | 50% Running time: 0 seconds

    Waiting    50/100
    Pending     0/100
    Running     0/100
    Failed      0/100
    Finished   50/100
    Unknown     0/100
"""
    assert out.getvalue().replace("\r", "\n") == expected
def create_snapshot(ensemble):
    reals = {}
    for real in ensemble.get_active_reals():
        reals[str(real.get_iens())] = _Realization(
            active=True,
            start_time=None,
            end_time=None,
            status="Waiting",
        )
        for stage in real.get_stages():
            reals[str(real.get_iens())].stages[str(stage.get_id())] = _Stage(
                status="Unknown",
                start_time=None,
                end_time=None,
            )
            for step in stage.get_steps():
                reals[str(real.get_iens())].stages[str(stage.get_id())].steps[
                    str(step.get_id())
                ] = _Step(status="Unknown", start_time=None, end_time=None)
                for job in step.get_jobs():
                    reals[str(real.get_iens())].stages[str(stage.get_id())].steps[
                        str(step.get_id())
                    ].jobs[str(job.get_id())] = _Job(
                        status="Pending",
                        data={},
                        start_time=None,
                        end_time=None,
                        name=job.get_name(),
                    )
    top = _SnapshotDict(
        reals=reals,
        status="Unknown",
        forward_model=_ForwardModel(step_definitions={}),
        metadata=ensemble.get_metadata(),
    )
    return Snapshot(top.dict())
def _batch(self, events):
    batch: List[CloudEvent] = []
    for event in events:
        if event["type"] == ids.EVTYPE_EE_SNAPSHOT:
            # A new iteration, so ensure any updates for the previous one
            # are emitted.
            if batch:
                yield self._flush(batch)
                batch = []
            iter_ = event.data["iter"]
            snapshot = Snapshot(event.data)
            self._iter_snapshot[iter_] = snapshot
            yield FullSnapshotEvent(
                phase_name=self._model.getPhaseName(),
                current_phase=self._model.currentPhase(),
                total_phases=self._model.phaseCount(),
                indeterminate=self._model.isIndeterminate(),
                progress=self._progress(),
                iteration=iter_,
                snapshot=snapshot,
            )
            self._work_queue.task_done()
        elif event["type"] == ids.EVTYPE_EE_SNAPSHOT_UPDATE:
            iter_ = event.data["iter"]
            if iter_ not in self._iter_snapshot:
                raise OutOfOrderSnapshotUpdateException(
                    f"got {ids.EVTYPE_EE_SNAPSHOT_UPDATE} without having stored snapshot for iter {iter_}"
                )
            batch.append(event)
        else:
            raise ValueError("got unexpected event type", event["type"])
    if batch:
        yield self._flush(batch)
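# Standalone illustration of the batching pattern in _batch above, using plain
# dicts instead of CloudEvents. The names batch_updates, flush, and the event
# shapes here are invented for the example; only the control flow mirrors the
# function above: a full snapshot flushes any pending updates first, while
# update events accumulate until the next flush.
def batch_updates(events, flush):
    batch = []
    for event in events:
        if event["type"] == "snapshot":
            if batch:
                # emit accumulated updates before starting a new iteration
                yield flush(batch)
                batch = []
            yield event
        elif event["type"] == "update":
            batch.append(event)
    if batch:
        yield flush(batch)


# Example run: the two updates after the first snapshot come out as one batch.
stream = [
    {"type": "snapshot", "iter": 0},
    {"type": "update", "real": "0"},
    {"type": "update", "real": "1"},
    {"type": "snapshot", "iter": 1},
    {"type": "update", "real": "0"},
]
for out in batch_updates(stream, flush=lambda b: {"type": "batched", "count": len(b)}):
    print(out)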
def test_monitor_stop(evaluator):
    with evaluator.run() as monitor:
        for event in monitor.track():
            snapshot = Snapshot(event.data)
            break
    assert snapshot.get_status() == ENSEMBLE_STATE_STARTED
def test_dispatchers_can_connect_and_monitor_can_shut_down_evaluator(evaluator):
    with evaluator.run() as monitor:
        events = monitor.track()

        host = evaluator._config.host
        port = evaluator._config.port

        # first snapshot before any event occurs
        snapshot_event = next(events)
        snapshot = Snapshot(snapshot_event.data)
        assert snapshot.get_status() == ENSEMBLE_STATE_STARTED

        # two dispatchers connect
        with Client(host, port, "/dispatch") as dispatch1, Client(
            host, port, "/dispatch"
        ) as dispatch2:
            # first dispatcher informs that job 0 is running
            send_dispatch_event(
                dispatch1,
                identifiers.EVTYPE_FM_JOB_RUNNING,
                "/ert/ee/0/real/0/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("0", "0", "0").status == JOB_STATE_RUNNING

            # second dispatcher informs that job 0 is running
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_RUNNING,
                "/ert/ee/0/real/1/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("1", "0", "0").status == JOB_STATE_RUNNING

            # second dispatcher informs that job 0 is done
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_SUCCESS,
                "/ert/ee/0/real/1/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("1", "0", "0").status == JOB_STATE_FINISHED

            # second dispatcher informs that job 1 has failed
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_FAILURE,
                "/ert/ee/0/real/1/step/0/job/1",
                "event_job_1_fail",
                {identifiers.ERROR_MSG: "error"},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("1", "0", "1").status == JOB_STATE_FAILURE

        # a second monitor connects
        with ee_monitor.create(host, port) as monitor2:
            events2 = monitor2.track()
            snapshot = Snapshot(next(events2).data)
            assert snapshot.get_status() == ENSEMBLE_STATE_STARTED
            assert snapshot.get_job("0", "0", "0").status == JOB_STATE_RUNNING
            assert snapshot.get_job("1", "0", "0").status == JOB_STATE_FINISHED

            # one monitor requests that server exit
            monitor.signal_cancel()

            # both monitors should get a terminated event
            terminated = next(events)
            terminated2 = next(events2)
            assert terminated["type"] == identifiers.EVTYPE_EE_TERMINATED
            assert terminated2["type"] == identifiers.EVTYPE_EE_TERMINATED

            for e in [events, events2]:
                for _ in e:
                    assert False, "got unexpected event from monitor"
def test_dispatchers_can_connect_and_monitor_can_shut_down_evaluator(evaluator):
    monitor = evaluator.run()
    events = monitor.track()

    host = evaluator._config.host
    port = evaluator._config.port

    # first snapshot before any event occurs
    snapshot_event = next(events)
    snapshot = Snapshot(snapshot_event.data)
    assert snapshot.get_status() == "Unknown"

    # two dispatchers connect
    with Client(host, port, "/dispatch") as dispatch1, Client(
        host, port, "/dispatch"
    ) as dispatch2:
        # first dispatcher informs that job 0 is running
        send_dispatch_event(
            dispatch1,
            identifiers.EVTYPE_FM_JOB_RUNNING,
            "/ert/ee/0/real/0/stage/0/step/0/job/0",
            "event1",
            {"current_memory_usage": 1000},
        )
        snapshot = Snapshot(next(events).data)
        assert snapshot.get_job("0", "0", "0", "0")["status"] == "Running"

        # second dispatcher informs that job 0 is running
        send_dispatch_event(
            dispatch2,
            identifiers.EVTYPE_FM_JOB_RUNNING,
            "/ert/ee/0/real/1/stage/0/step/0/job/0",
            "event1",
            {"current_memory_usage": 1000},
        )
        snapshot = Snapshot(next(events).data)
        assert snapshot.get_job("1", "0", "0", "0")["status"] == "Running"

        # second dispatcher informs that job 0 is done
        send_dispatch_event(
            dispatch2,
            identifiers.EVTYPE_FM_JOB_SUCCESS,
            "/ert/ee/0/real/1/stage/0/step/0/job/0",
            "event1",
            {"current_memory_usage": 1000},
        )
        snapshot = Snapshot(next(events).data)
        assert snapshot.get_job("1", "0", "0", "0")["status"] == "Finished"

    # a second monitor connects
    monitor2 = ee_monitor.create(host, port)
    events2 = monitor2.track()
    snapshot = Snapshot(next(events2).data)
    assert snapshot.get_status() == "Unknown"
    assert snapshot.get_job("0", "0", "0", "0")["status"] == "Running"
    assert snapshot.get_job("1", "0", "0", "0")["status"] == "Finished"

    # one monitor requests that server exit
    monitor.signal_cancel()

    # both monitors should get a terminated event
    terminated = next(events)
    terminated2 = next(events2)
    assert terminated["type"] == identifiers.EVTYPE_EE_TERMINATED
    assert terminated2["type"] == identifiers.EVTYPE_EE_TERMINATED

    for e in [events, events2]:
        for _ in e:
            assert False, "got unexpected event from monitor"
def test_monitor_stop(evaluator):
    monitor = evaluator.run()
    events = monitor.track()
    snapshot = Snapshot(next(events).data)
    assert snapshot.get_status() == "Unknown"
def test_dispatchers_can_connect_and_monitor_can_shut_down_evaluator(evaluator):
    with evaluator.run() as monitor:
        events = monitor.track()

        host = evaluator._config.host
        port = evaluator._config.port
        token = evaluator._config.token
        cert = evaluator._config.cert
        url = evaluator._config.url

        # first snapshot before any event occurs
        snapshot_event = next(events)
        snapshot = Snapshot(snapshot_event.data)
        assert snapshot.get_status() == ENSEMBLE_STATE_UNKNOWN

        # two dispatchers connect
        with Client(
            url + "/dispatch",
            cert=cert,
            token=token,
            max_retries=1,
            timeout_multiplier=1,
        ) as dispatch1, Client(
            url + "/dispatch",
            cert=cert,
            token=token,
            max_retries=1,
            timeout_multiplier=1,
        ) as dispatch2:
            # first dispatcher informs that job 0 is running
            send_dispatch_event(
                dispatch1,
                identifiers.EVTYPE_FM_JOB_RUNNING,
                f"/ert/ee/{evaluator._ee_id}/real/0/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            # second dispatcher informs that job 0 is running
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_RUNNING,
                f"/ert/ee/{evaluator._ee_id}/real/1/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            # second dispatcher informs that job 0 is done
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_SUCCESS,
                f"/ert/ee/{evaluator._ee_id}/real/1/step/0/job/0",
                "event1",
                {"current_memory_usage": 1000},
            )
            # second dispatcher informs that job 1 has failed
            send_dispatch_event(
                dispatch2,
                identifiers.EVTYPE_FM_JOB_FAILURE,
                f"/ert/ee/{evaluator._ee_id}/real/1/step/0/job/1",
                "event_job_1_fail",
                {identifiers.ERROR_MSG: "error"},
            )
            snapshot = Snapshot(next(events).data)
            assert snapshot.get_job("1", "0", "0").status == JOB_STATE_FINISHED
            assert snapshot.get_job("0", "0", "0").status == JOB_STATE_RUNNING
            assert snapshot.get_job("1", "0", "1").status == JOB_STATE_FAILURE

        # a second monitor connects
        with ee_monitor.create(host, port, "wss", cert, token) as monitor2:
            events2 = monitor2.track()
            full_snapshot_event = next(events2)
            assert full_snapshot_event["type"] == identifiers.EVTYPE_EE_SNAPSHOT
            snapshot = Snapshot(full_snapshot_event.data)
            assert snapshot.get_status() == ENSEMBLE_STATE_UNKNOWN
            assert snapshot.get_job("0", "0", "0").status == JOB_STATE_RUNNING
            assert snapshot.get_job("1", "0", "0").status == JOB_STATE_FINISHED

            # one monitor requests that server exit
            monitor.signal_cancel()

            # both monitors should get a terminated event
            terminated = next(events)
            terminated2 = next(events2)
            assert terminated["type"] == identifiers.EVTYPE_EE_TERMINATED
            assert terminated2["type"] == identifiers.EVTYPE_EE_TERMINATED

            for e in [events, events2]:
                for unexpected_event in e:
                    assert False, f"got unexpected event {unexpected_event} from monitor"