def test_monitor_run_debouncing_non_terminal_state(
    self, db: Session, client: TestClient
):
    # Set the monitoring interval so debouncing will be active
    config.runs_monitoring_interval = 100

    # Mocking the SDK updating the run's state to a terminal state
    self.run["status"]["state"] = RunStates.completed
    self.run["status"]["last_update"] = now_date().isoformat()
    get_db().store_run(db, self.run, self.run_uid, self.project)

    # Mocking a pod that is still in a non-terminal state
    self._mock_list_namespaced_pods([[self.running_pod]])

    # Triggering monitor cycle
    self.runtime_handler.monitor_runs(get_db(), db)

    # Verifying monitoring was debounced
    self._assert_run_reached_state(db, self.project, self.run_uid, RunStates.completed)

    # Mocking that the last update occurred before the debounce period
    debounce_period = config.runs_monitoring_interval
    self.run["status"]["last_update"] = (
        now_date() - timedelta(seconds=float(2 * debounce_period))
    ).isoformat()
    get_db().store_run(db, self.run, self.run_uid, self.project)

    # Mocking a pod that is still in a non-terminal state
    self._mock_list_namespaced_pods([[self.running_pod]])

    # Triggering monitor cycle
    self.runtime_handler.monitor_runs(get_db(), db)

    # Verifying monitoring was not debounced
    self._assert_run_reached_state(db, self.project, self.run_uid, RunStates.running)

    # Mocking a pod that is in a terminal state (extra one for the log collection)
    self._mock_list_namespaced_pods([[self.completed_pod], [self.completed_pod]])

    # Mocking read log calls
    log = self._mock_read_namespaced_pod_log()

    # Triggering monitor cycle
    self.runtime_handler.monitor_runs(get_db(), db)

    # Verifying monitoring was not debounced
    self._assert_run_reached_state(db, self.project, self.run_uid, RunStates.completed)
    self._assert_run_logs(
        db,
        self.project,
        self.run_uid,
        log,
        self.completed_pod.metadata.name,
    )
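# Hedged sketch (not from the source) of the debounce rule the test above exercises:
# the monitor skips runs whose status.last_update falls inside the configured
# monitoring interval, which is why the test first keeps last_update at "now"
# (cycle is debounced) and then pushes it back by twice the interval (cycle is not
# debounced). The exact window used by the real runtime handler may differ.
from datetime import datetime, timedelta, timezone


def _should_debounce(last_update: datetime, monitoring_interval_seconds: float) -> bool:
    cutoff = datetime.now(timezone.utc) - timedelta(seconds=monitoring_interval_seconds)
    return last_update > cutoff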
def _store_run_fixture(self, db: Session):
    self.run = {
        "status": {
            "state": RunStates.created,
            "last_update": now_date().isoformat(),
        },
        "metadata": {"project": self.project, "uid": self.run_uid},
    }
    get_db().store_run(db, self.run, self.run_uid, self.project)
def update_resource_status(self, status="", producer=None):
    """update the data target status"""
    self._target = self._target or DataTarget(self.kind, self.name, self._target_path)
    target = self._target
    target.status = status or target.status or "created"
    target.updated = now_date().isoformat()
    target.producer = producer or target.producer
    self._resource.status.update_target(target)
def get_log(
    db_session: Session,
    project: str,
    uid: str,
    size: int = -1,
    offset: int = 0,
    source: LogSources = LogSources.AUTO,
):
    out = b""
    log_file = log_path(project, uid)
    status = None
    if log_file.exists() and source in [LogSources.AUTO, LogSources.PERSISTENCY]:
        with log_file.open("rb") as fp:
            fp.seek(offset)
            out = fp.read(size)
        status = ""
    elif source in [LogSources.AUTO, LogSources.K8S]:
        data = get_db().read_run(db_session, uid, project)
        if not data:
            log_and_raise(HTTPStatus.NOT_FOUND, project=project, uid=uid)
        status = get_in(data, "status.state", "")
        if get_k8s():
            pods = get_k8s().get_logger_pods(uid)
            if pods:
                pod, new_status = list(pods.items())[0]
                new_status = new_status.lower()

                # TODO: handle in cron/tracking
                if new_status != "pending":
                    resp = get_k8s().logs(pod)
                    if resp:
                        out = resp.encode()[offset:]
                    if status == "running":
                        now = now_date().isoformat()
                        update_in(data, "status.last_update", now)
                        if new_status == "failed":
                            update_in(data, "status.state", "error")
                            update_in(data, "status.error", "error, check logs")
                            get_db().store_run(db_session, data, uid, project)
                        if new_status == "succeeded":
                            update_in(data, "status.state", "completed")
                            get_db().store_run(db_session, data, uid, project)
                status = new_status
            elif status == "running":
                update_in(data, "status.state", "error")
                update_in(data, "status.error", "pod not found, maybe terminated")
                get_db().store_run(db_session, data, uid, project)
                status = "failed"
    return out, status
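# Hypothetical client-side usage (not from the source) showing why get_log exposes
# size/offset and returns (out, status): a caller can tail logs incrementally by
# advancing its own offset until the run reaches a terminal state. `fetch_log`
# stands in for whatever transport invokes get_log (HTTP client, direct call, etc.).
import time


def tail_logs(fetch_log, project: str, uid: str, poll_interval: float = 3.0) -> None:
    offset = 0
    while True:
        out, state = fetch_log(project, uid, offset=offset)
        if out:
            print(out.decode(), end="")
            offset += len(out)  # resume from the end of the chunk just read
        if state in ("completed", "error", "failed"):
            break
        time.sleep(poll_interval)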
def get_log(project, uid):
    size = int(request.args.get('size', '-1'))
    offset = int(request.args.get('offset', '0'))

    out = b''
    log_file = log_path(project, uid)
    if log_file.exists():
        with log_file.open('rb') as fp:
            fp.seek(offset)
            out = fp.read(size)
        status = ''
    else:
        data = _db.read_run(uid, project)
        if not data:
            return json_error(HTTPStatus.NOT_FOUND, project=project, uid=uid)

        status = get_in(data, 'status.state', '')
        if _k8s:
            pods = _k8s.get_logger_pods(uid)
            if pods:
                pod, new_status = list(pods.items())[0]
                new_status = new_status.lower()

                # TODO: handle in cron/tracking
                if new_status != 'pending':
                    resp = _k8s.logs(pod)
                    if resp:
                        out = resp.encode()[offset:]
                    if status == 'running':
                        now = now_date().isoformat()
                        update_in(data, 'status.last_update', now)
                        if new_status == 'failed':
                            update_in(data, 'status.state', 'error')
                            update_in(data, 'status.error', 'error, check logs')
                            _db.store_run(data, uid, project)
                        if new_status == 'succeeded':
                            update_in(data, 'status.state', 'completed')
                            _db.store_run(data, uid, project)
                status = new_status
            elif status == 'running':
                update_in(data, 'status.state', 'error')
                update_in(data, 'status.error', 'pod not found, maybe terminated')
                _db.store_run(data, uid, project)
                status = 'failed'

    return Response(out, mimetype='text/plain', headers={"pod_status": status})
def push(self, start, request, resp=None, op=None, error=None):
    if error:
        data = self.base_data()
        data["request"] = request
        data["op"] = op
        data["when"] = str(start)
        message = str(error)
        if self.verbose:
            message = f"{message}\n{traceback.format_exc()}"
        data["error"] = message
        self.output_stream.push([data])
        return

    self._sample_iter = (self._sample_iter + 1) % self.stream_sample
    if self.output_stream and self._sample_iter == 0:
        microsec = (now_date() - start).microseconds

        if self.stream_batch > 1:
            if self._batch_iter == 0:
                self._batch = []
            self._batch.append(
                [request, op, resp, str(start), microsec, self.model.metrics]
            )
            self._batch_iter = (self._batch_iter + 1) % self.stream_batch

            if self._batch_iter == 0:
                data = self.base_data()
                data["headers"] = [
                    "request",
                    "op",
                    "resp",
                    "when",
                    "microsec",
                    "metrics",
                ]
                data["values"] = self._batch
                self.output_stream.push([data])
        else:
            data = self.base_data()
            data["request"] = request
            data["op"] = op
            data["resp"] = resp
            data["when"] = str(start)
            data["microsec"] = microsec
            if getattr(self.model, "metrics", None):
                data["metrics"] = self.model.metrics
            self.output_stream.push([data])
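# Standalone illustration (not part of the source) of the sampling and batching
# counters used by push() above: with stream_sample=2 only every second event is
# forwarded, and with stream_batch=3 the forwarded events are flushed in groups
# of three. print() stands in for output_stream.push().
class _CounterDemo:
    def __init__(self, stream_sample=2, stream_batch=3):
        self.stream_sample = stream_sample
        self.stream_batch = stream_batch
        self._sample_iter = 0
        self._batch_iter = 0
        self._batch = []

    def push(self, record):
        self._sample_iter = (self._sample_iter + 1) % self.stream_sample
        if self._sample_iter != 0:
            return  # dropped by sampling
        if self._batch_iter == 0:
            self._batch = []
        self._batch.append(record)
        self._batch_iter = (self._batch_iter + 1) % self.stream_batch
        if self._batch_iter == 0:
            print("flush:", self._batch)


demo = _CounterDemo()
for i in range(12):
    demo.push(i)
# flushes [1, 3, 5] and then [7, 9, 11]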
def do_event(self, event, *args, **kwargs):
    """main model event handler method"""
    start = now_date()
    op = event.path.strip("/")

    if op == "predict" or op == "infer":
        # predict operation
        request = self._pre_event_processing_actions(event, op)
        try:
            outputs = self.predict(request)
        except Exception as exc:
            if self._model_logger:
                self._model_logger.push(start, request, op=op, error=exc)
            raise exc

        response = {
            "id": request["id"],
            "model_name": self.name,
            "outputs": outputs,
        }
        if self.version:
            response["model_version"] = self.version

    elif op == "ready" and event.method == "GET":
        # get model health operation
        setattr(event, "terminated", True)
        if self.ready:
            event.body = self.context.Response()
        else:
            event.body = self.context.Response(
                status_code=408, body=b"model not ready"
            )
        return event

    elif op == "" and event.method == "GET":
        # get model metadata operation
        setattr(event, "terminated", True)
        event.body = {
            "name": self.name,
            "version": self.version,
            "inputs": [],
            "outputs": [],
        }
        if self.model_spec:
            event.body["inputs"] = self.model_spec.inputs
            event.body["outputs"] = self.model_spec.outputs
        return event

    elif op == "explain":
        # explain operation
        request = self._pre_event_processing_actions(event, op)
        try:
            outputs = self.explain(request)
        except Exception as exc:
            if self._model_logger:
                self._model_logger.push(start, request, op=op, error=exc)
            raise exc

        response = {
            "id": request["id"],
            "model_name": self.name,
            "outputs": outputs,
        }
        if self.version:
            response["model_version"] = self.version

    elif hasattr(self, "op_" + op):
        # custom operation (child methods starting with "op_")
        response = getattr(self, "op_" + op)(event)
        event.body = response
        return event

    else:
        raise ValueError(f"illegal model operation {op}, method={event.method}")

    response = self.postprocess(response)
    if self._model_logger:
        self._model_logger.push(start, request, response, op)
    event.body = response
    return event
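# Minimal standalone illustration (names are assumptions, not from the source) of
# the "op_" dispatch used by the final elif branch above: an operation path such
# as /reset_metrics resolves to a child-class method named op_reset_metrics.
class CustomOpsDemo:
    def op_reset_metrics(self, event):
        return {"reset": True}

    def handle(self, op, event):
        if hasattr(self, "op_" + op):
            return getattr(self, "op_" + op)(event)
        raise ValueError(f"illegal model operation {op}")


print(CustomOpsDemo().handle("reset_metrics", {}))  # -> {'reset': True}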
def do_event(self, event, *args, **kwargs):
    """Handles incoming requests.

    Parameters
    ----------
    event : nuclio.Event
        Incoming request as a nuclio.Event.

    Returns
    -------
    Response
        Event response after running the requested logic
    """
    start = now_date()

    # Handle and verify the request
    event = self.preprocess(event)
    event = self._pre_handle_event(event)

    # Should we terminate the event?
    if hasattr(event, "terminated") and event.terminated:
        return event

    # Extract route information
    name, route, subpath = self._resolve_route(event.body, event.path)
    self.context.logger.debug(f"router run model {name}, op={subpath}")
    event.path = subpath

    # Return the correct response
    # If no model name was given and no operation
    if not name and route is None:
        # Return model list
        setattr(event, "terminated", True)
        event.body = {"models": list(self.routes.keys()) + [self.name]}
        return event
    else:
        # Verify we use the V2 protocol
        request = self.validate(event.body)

        # If this is a router operation
        if name == self.name:
            predictions = self._parallel_run(event)
            votes = self._apply_logic(predictions)

            # Format the prediction response like the regular model's responses
            if self.format_response_with_col_name_flag:
                votes = {self.prediction_col_name: votes}
            response = copy.copy(event)
            response_body = {
                "id": event.id,
                "model_name": self.name,
                "outputs": votes,
            }
            if self.version:
                response_body["model_version"] = self.version
            response.body = response_body

        # A specific model event
        else:
            response = route.run(event)
            event.body = response.body if response else None

        response = self.postprocess(response)
        if self._model_logger and self.log_router:
            if "id" not in request:
                request["id"] = response.body["id"]
            self._model_logger.push(start, request, response.body)
        return response