def test_status_transition(self):
    """Check that each change of status type stores one more condition.

    Applies SCHEDULED, then RUNNING and WARNING alternately (twice each), and
    asserts after every call that the stored condition count grew by one.
    """
    # (status type, extra keyword arguments passed to the condition factory)
    transitions = [
        (V1Statuses.SCHEDULED, {}),
        (V1Statuses.RUNNING, {"message": "New message"}),
        (V1Statuses.WARNING, {"message": "New message"}),
        (V1Statuses.RUNNING, {"message": "New message"}),
        (V1Statuses.WARNING, {"message": "New message"}),
    ]
    for expected_count, (status_type, extra) in enumerate(transitions, start=1):
        condition = V1StatusCondition.get_condition(
            type=status_type, status=True, reason="foo", **extra
        )
        new_run_status(self.run, condition=condition)
        self.run.refresh_from_db()
        assert len(self.run.status_conditions) == expected_count
def test_new_stopped_status_after_stopping(self):
    """Check the condition count across STOPPING, RUNNING and STOPPED updates."""
    # (status type, extra condition kwargs, expected number of stored conditions)
    steps = [
        # The initial stopping condition is stored.
        (V1Statuses.STOPPING, {}, 1),
        # A running condition sent after stopping must not add an entry.
        (V1Statuses.RUNNING, {}, 1),
        # A stopped condition (carrying a message) is stored as a second entry.
        (V1Statuses.STOPPED, {"message": "New message"}, 2),
    ]
    for status_type, extra, expected_count in steps:
        condition = V1StatusCondition.get_condition(
            type=status_type, status=True, reason="foo", **extra
        )
        new_run_status(self.run, condition=condition)
        self.run.refresh_from_db()
        assert len(self.run.status_conditions) == expected_count
def runs_prepare(run_id: int, run: Optional[BaseRun], eager: bool = False) -> bool:
    """Compile a run and record the resulting status condition.

    Args:
        run_id: identifier handed to `get_run` and used in the log message.
        run: optional run instance, forwarded to `get_run`.
        eager: forwarded to the resolver; when true and the run is not
            pending, `runs_start` is invoked directly after compilation.

    Returns:
        True only when compilation succeeded, the run is not pending and
        `eager` is false. False in every other case (run not found, status
        not compilable, compilation failure, pending run, or eager start).
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return False

    if not LifeCycle.is_compilable(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.COMPILED,
        )
        return False

    def _mark_failed(failure_message: str) -> bool:
        # Record a FAILED condition and report the failure to the caller.
        failure = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="SchedulerPrepare",
            message=failure_message,
        )
        new_run_status(run=run, condition=failure)
        return False

    try:
        compiled_at = now()
        _, _compiled_operation = resolver.resolve(
            run=run, compiled_at=compiled_at, eager=eager
        )
    except PolyaxonCompilerError as e:
        return _mark_failed(f"Failed to compile.\n{e}")
    except Exception as e:
        return _mark_failed(f"Compiler received an internal error.\n{e}")

    compiled = V1StatusCondition.get_condition(
        type=V1Statuses.COMPILED,
        status="True",
        reason="SchedulerPrepare",
        message="Run is compiled",
        last_update_time=compiled_at,
    )
    new_run_status(run=run, condition=compiled)

    if run.pending:
        return False

    if not eager:
        return True

    # Eager mode: start immediately; the caller must not schedule a start.
    runs_start(run_id=run.id, run=run)
    return False
def sort_conditions(status_conditions):
    """Return a new list of the conditions ordered by transition time.

    Each item must support `.get("last_transition_time")`; that value is
    passed through `V1StatusCondition.get_last_update_time` to build the
    sort key.
    """

    def _transition_key(condition):
        raw_time = condition.get("last_transition_time")
        return V1StatusCondition.get_last_update_time(raw_time)

    return sorted(status_conditions, key=_transition_key)
def log_status(self, status: str, reason: str = None, message: str = None):
    """Logs a new run status.

    <blockquote class="info">
    N.B. If you are executing a managed run, you don't need to call this method manually.
    This method is only useful for manual runs outside of Polyaxon.
    </blockquote>

    N.B. you will probably use one of the simpler methods:
        * log_succeeded
        * log_stopped
        * log_failed
        * start
        * end

    [Run API](/docs/api/#operation/CreateRunStatus)

    Args:
        status: str, a valid [Statuses](/docs/core/specification/lifecycle/) value.
        reason: str, optional, reason for this status change.
        message: str, optional, message to log with this status.
    """
    payload = {
        "condition": V1StatusCondition(
            type=status, status=True, reason=reason, message=message
        )
    }
    # Sent with async_req=True; the call's return value is not used here.
    self.client.runs_v1.create_run_status(
        owner=self.owner,
        project=self.project,
        uuid=self.run_uuid,
        body=payload,
        async_req=True,
    )
def resume_run(
    run: BaseRun,
    user_id: int = None,
    name: str = None,
    description: str = None,
    content: str = None,
    readme: str = None,
    tags: List[str] = None,
) -> BaseRun:
    """Re-initialize an existing run in place and flag it as resuming.

    The operation spec is read from `run.raw_content`. Any of the optional
    arguments that is falsy falls back to the value already on `run`;
    `content` is passed to `operations.init_run` as the override.

    Returns:
        The same `run`, saved with the refreshed fields and given a new
        RESUMING status condition.
    """
    _compiled_operation, refreshed = operations.init_run(
        project_id=run.project_id,
        user_id=user_id or run.user_id,
        name=name or run.name,
        description=description or run.description,
        readme=readme or run.readme,
        op_spec=V1Operation.read(run.raw_content),
        tags=tags or run.tags,
        override=content,
    )

    # Copy the freshly initialized fields onto the existing run.
    copied_fields = (
        "user_id",
        "name",
        "description",
        "readme",
        "content",
        "raw_content",
        "tags",
    )
    for field_name in copied_fields:
        setattr(run, field_name, getattr(refreshed, field_name))
    run.save()

    resuming = V1StatusCondition.get_condition(type=V1Statuses.RESUMING, status=True)
    new_run_status(run, condition=resuming)
    return run
def setUp(self):
    """Run the parent setup, then give the test object a STOPPED status."""
    super().setUp()
    stopped = V1StatusCondition.get_condition(type=V1Statuses.STOPPED, status=True)
    new_run_status(self.object, condition=stopped)
def test_new_run_status_created(self, auditor_record):
    """Check that applying a CREATED status records no auditor events."""
    created = V1StatusCondition.get_condition(type=V1Statuses.CREATED, status=True)
    new_run_status(self.run, condition=created)
    assert auditor_record.call_count == 0
def test_get(self):
    """Check the endpoint payload as status conditions accumulate.

    The response must match the serializer output for the object and list
    zero, one, then two status conditions after each status update.
    """

    def _assert_payload(expected_conditions):
        # Fetch the resource and compare it against the serialized object.
        response = self.client.get(self.url)
        assert response.status_code == status.HTTP_200_OK
        payload = response.data
        assert len(payload["status_conditions"]) == expected_conditions
        assert payload == self.serializer_class(self.object).data

    _assert_payload(0)

    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(self.object, condition=running)
    self.object.refresh_from_db()
    _assert_payload(1)

    new_run_stop_status(run=self.object, message="foo")
    self.object.refresh_from_db()
    _assert_payload(2)
def create_run(
    project_id: int,
    user_id: int,
    name: str = None,
    description: str = None,
    readme: str = None,
    tags: List[int] = None,
    raw_content: str = None,
) -> BaseRun:
    """Create and return an unmanaged JOB run with an initial CREATED condition.

    All arguments are stored as-is on the new run. The run is created with
    `kind=V1RunKind.JOB`, `is_managed=False` and a single serialized CREATED
    status condition.
    """
    runs = get_run_model().objects
    created = V1StatusCondition.get_condition(
        type=V1Statuses.CREATED,
        status="True",
        reason="PolyaxonRunCreated",
        message="Run is created",
    )
    return runs.create(
        project_id=project_id,
        user_id=user_id,
        name=name,
        description=description,
        readme=readme,
        tags=tags,
        kind=V1RunKind.JOB,
        is_managed=False,
        raw_content=raw_content,
        status_conditions=[created.to_dict()],
    )
def runs_start(run_id: int, run: Optional[BaseRun]):
    """Queue a compiled, managed run and start it through the manager.

    Does nothing when the run cannot be resolved or is not managed. A run
    whose status is not compiled is logged and skipped. Otherwise a QUEUED
    condition is recorded and, when running in-cluster and the run is a
    service or a job, `manager.start` is called. A `PolyaxonK8SError` raised
    while starting is recorded as a FAILED condition.
    """
    run = get_run(run_id=run_id, run=run)
    if not run or not run.is_managed:
        return

    if not LifeCycle.is_compiled(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.QUEUED,
        )
        return

    queued = V1StatusCondition.get_condition(
        type=V1Statuses.QUEUED,
        status="True",
        reason="PolyaxonRunQueued",
        message="Run is queued",
    )
    new_run_status(run=run, condition=queued)

    try:
        in_cluster = conf.get(K8S_IN_CLUSTER)
        startable = in_cluster and (run.is_service or run.is_job)
        if startable:
            manager.start(
                content=run.content,
                owner_name=run.project.owner.name,
                project_name=run.project.name,
                run_name=run.name,
                run_uuid=run.uuid.hex,
                run_kind=run.kind,
                namespace=conf.get(K8S_NAMESPACE),
                in_cluster=in_cluster,
                default_auth=False,
            )
    except PolyaxonK8SError as e:
        failed = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=f"Could not start the job {e}",
        )
        new_run_status(run=run, condition=failed)
def runs_start(run_id: int, run: Optional[BaseRun]):
    """Queue a compiled, managed run and start it through the manager.

    Does nothing when the run cannot be resolved or is not managed. A run
    whose status is not compiled is logged and skipped. Otherwise a QUEUED
    condition is recorded and, when running in-cluster and the run is a
    service or a job, `manager.start` is called. Any exception raised while
    starting is recorded as a FAILED condition whose message includes the
    exception repr and the traceback.
    """
    run = get_run(run_id=run_id, run=run)
    if not run or not run.is_managed:
        return

    if not LifeCycle.is_compiled(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.QUEUED,
        )
        return

    queued = V1StatusCondition.get_condition(
        type=V1Statuses.QUEUED,
        status="True",
        reason="PolyaxonRunQueued",
        message="Run is queued",
    )
    new_run_status(run=run, condition=queued)

    def _record_failure(error: Exception, prefix: str = None):
        # Must be called from inside an `except` block so that
        # traceback.format_exc() reports the exception being handled.
        details = f"error: {error!r}\n{traceback.format_exc()}"
        failed = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=(prefix or "Could not start the operation.\n") + details,
        )
        new_run_status(run=run, condition=failed)

    try:
        in_cluster = conf.get(K8S_IN_CLUSTER)
        startable = in_cluster and (run.is_service or run.is_job)
        if startable:
            manager.start(
                content=run.content,
                owner_name=run.project.owner.name,
                project_name=run.project.name,
                run_name=run.name,
                run_uuid=run.uuid.hex,
                run_kind=run.kind,
                namespace=conf.get(K8S_NAMESPACE),
                in_cluster=in_cluster,
                default_auth=False,
            )
    except (PolyaxonK8SError, ApiException) as e:
        _record_failure(e, "Kubernetes manager could not start the operation.\n")
    except PolypodException as e:
        _record_failure(e, "Failed converting the run manifest.\n")
    except Exception as e:
        _record_failure(e, "Failed with unknown exception.\n")
def set_entity_status(entity, condition: V1StatusCondition):
    """Apply a status condition to an entity and return the entity.

    Sets `entity.status` to `condition.type` and updates
    `entity.status_conditions`:

    * when the entity already has conditions and the last one compares equal
      to `condition`, that last entry is replaced by `condition.to_dict()`;
    * when the last one differs, `condition.to_dict()` is appended;
    * when the entity has no conditions, the list becomes
      `[condition.to_dict()]`.

    Args:
        entity: object exposing `status` and `status_conditions` attributes.
        condition: the condition to apply. When it is None the entity is
            returned untouched.

    Returns:
        The same `entity` instance.
    """
    # Guard before dereferencing: a missing condition leaves the entity
    # untouched instead of raising AttributeError on `condition.type`.
    if condition is None:
        return entity

    entity.status = condition.type

    # A present but falsy condition only updates the status field.
    if not condition:
        return entity

    if entity.status_conditions:
        status_conditions = to_list(entity.status_conditions, check_none=True)
        last_condition = V1StatusCondition.get_condition(**status_conditions[-1])
        if last_condition == condition:
            # Same condition as the latest entry: refresh it in place.
            status_conditions[-1] = condition.to_dict()
        else:
            status_conditions.append(condition.to_dict())
    else:
        status_conditions = [condition.to_dict()]

    if status_conditions:
        entity.status_conditions = status_conditions

    return entity
def test_status_update_results_in_new_updated_at_datetime(self):
    """Check that every status update moves `updated_at` forward.

    The same STARTING condition is applied twice; `updated_at` must be
    strictly later after each call.
    """
    for _ in range(2):
        before = self.run.updated_at
        starting = V1StatusCondition.get_condition(
            type=V1Statuses.STARTING, status=True
        )
        new_run_status(self.run, condition=starting)
        assert before < self.run.updated_at
def test_start_run(self, manager_start):
    """Check that starting a COMPILED run calls the manager exactly once."""
    experiment = RunFactory(project=self.project, user=self.user)
    compiled = V1StatusCondition.get_condition(type=V1Statuses.COMPILED, status=True)
    new_run_status(run=experiment, condition=compiled)

    runs_start(run_id=experiment.id)

    assert manager_start.call_count == 1
def new_run_stop_status(run, message):
    """Record a STOPPED condition on the run.

    Args:
        run: the run to update, forwarded to `new_run_status`.
        message: optional detail appended to the base "Run is stopped" text;
            a falsy value yields the base text alone.
    """
    if message:
        message = f"Run is stopped; {message}"
    else:
        message = "Run is stopped"

    stopped = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPED,
        status="True",
        reason="PolyaxonRunStopped",
        message=message,
    )
    new_run_status(run=run, condition=stopped)
def _log_error(exc: Exception, message: str = None):
    """Record a FAILED condition describing `exc` on the enclosing `run`.

    The final message is `message` (or a default prefix when it is falsy)
    followed by the exception repr and the current traceback.

    NOTE: `run` is not a parameter; it is resolved from the enclosing scope.
    """
    prefix = message or "Could not start the operation.\n"
    details = f"error: {exc!r}\n{traceback.format_exc()}"
    failed = V1StatusCondition.get_condition(
        type=V1Statuses.FAILED,
        status="True",
        reason="PolyaxonRunFailed",
        message=prefix + details,
    )
    new_run_status(run=run, condition=failed)
def test_new_status_equality(self):
    """Check when a new status adds a condition versus reuses the last one.

    Repeating SCHEDULED (with or without a new message) must keep a single
    stored condition; switching to RUNNING must add a second one.
    """
    # (status type, extra condition kwargs, expected number of stored conditions)
    steps = [
        # First condition.
        (V1Statuses.SCHEDULED, {}, 1),
        # Same condition again.
        (V1Statuses.SCHEDULED, {}, 1),
        # Same type, different message.
        (V1Statuses.SCHEDULED, {"message": "New message"}, 1),
        # New status type.
        (V1Statuses.RUNNING, {"message": "New message"}, 2),
    ]
    for status_type, extra, expected_count in steps:
        condition = V1StatusCondition.get_condition(
            type=status_type, status=True, reason="foo", **extra
        )
        new_run_status(self.run, condition=condition)
        self.run.refresh_from_db()
        assert len(self.run.status_conditions) == expected_count
def log_agent_status(self, status: str, reason: str = None, message: str = None):
    """Send a new status condition for this agent.

    Args:
        status: status type stored on the condition.
        reason: optional reason for the status change.
        message: optional message attached to the condition.
    """
    payload = {
        "condition": V1StatusCondition.get_condition(
            type=status, status=True, reason=reason, message=message
        )
    }
    # Sent with async_req=True; the call's return value is not used here.
    self.client.agents_v1.create_agent_status(
        owner=self.owner,
        uuid=self.agent_uuid,
        body=payload,
        async_req=True,
    )
def test_new_run_status_scheduled(self, auditor_record):
    """Check that a SCHEDULED status records one RUN_NEW_STATUS event."""
    scheduled = V1StatusCondition.get_condition(
        type=V1Statuses.SCHEDULED, status=True
    )
    new_run_status(self.run, condition=scheduled)

    assert auditor_record.call_count == 1
    positional, keywords = auditor_record.call_args
    # The auditor is expected to be called with keyword arguments only.
    assert positional == ()
    assert keywords["event_type"] == run_events.RUN_NEW_STATUS
def create_one(self):
    """Create a run via the parent, mark it RUNNING, then stop it.

    Returns:
        The run produced by the parent `create_one`.
    """
    run = super().create_one()
    running = V1StatusCondition.get_condition(
        type=V1Statuses.RUNNING,
        status="True",
        reason="Run is running",
        message="foo",
    )
    new_run_status(run, running)
    new_run_stop_status(run, "stopping")
    return run
def create_status(view, serializer):
    """Apply the condition carried by a validated serializer to `view.run`.

    Returns without doing anything when the serializer has no validated
    data, when the data holds no (truthy) "condition" entry, or when the
    condition built from it is falsy.
    """
    # The result of is_valid() is not checked; validated_data is read next.
    serializer.is_valid()
    payload = serializer.validated_data
    if not payload:
        return

    raw_condition = payload.get("condition")
    if not raw_condition:
        return

    condition = V1StatusCondition.get_condition(**raw_condition)
    if condition:
        new_run_status(run=view.run, condition=condition)
def log_status(self, status, reason=None, message=None):
    """Send a new status condition for this run.

    Args:
        status: status type stored on the condition.
        reason: optional reason for the status change.
        message: optional message attached to the condition.
    """
    payload = {
        "condition": V1StatusCondition(
            type=status, status=True, reason=reason, message=message
        )
    }
    # Sent with async_req=True; the call's return value is not used here.
    self.client.runs_v1.create_run_status(
        owner=self.owner,
        project=self.project,
        uuid=self.run_uuid,
        body=payload,
        async_req=True,
    )
def stop_run(view, request, *args, **kwargs):
    """Flag the view's run as STOPPING and audit the request.

    A run whose status is already done is left untouched. An empty 200
    response is returned in both cases.
    """
    if not LifeCycle.is_done(view.run.status):
        stopping = V1StatusCondition.get_condition(
            type=V1Statuses.STOPPING,
            status="True",
            reason="PolyaxonRunStopping",
            message="User requested to stop the run.",
        )
        new_run_status(run=view.run, condition=stopping)
        view.audit(request, *args, **kwargs)

    return Response(status=status.HTTP_200_OK, data={})
def stop_runs(view, request, actor, *args, **kwargs):
    """Stop the project runs whose uuids are listed in the request.

    Runs whose status is in `LifeCycle.SAFE_STOP_VALUES` are moved straight
    to STOPPED. Runs whose status is not in
    `LifeCycle.DONE_OR_IN_PROGRESS_VALUES` are moved to STOPPING, and an
    audit event is recorded for each of those.

    Returns:
        An empty 200 response.
    """

    def _requested_runs():
        # Runs of this project whose uuid appears in the request payload.
        return get_run_model().objects.filter(
            project=view.project, uuid__in=request.data.get("uuids", [])
        )

    def _user_stop_condition(status_type):
        return V1StatusCondition.get_condition(
            type=status_type,
            status="True",
            reason="EventHandler",
            message="User requested to stop the run.",
        )

    # Immediate stop
    safe_to_stop = _requested_runs().filter(status__in=LifeCycle.SAFE_STOP_VALUES)
    bulk_new_run_status(safe_to_stop, _user_stop_condition(V1Statuses.STOPPED))

    # Materialize the list before the bulk update so the same runs are audited.
    still_active = _requested_runs().exclude(
        status__in=LifeCycle.DONE_OR_IN_PROGRESS_VALUES
    )
    runs = list(still_active)
    bulk_new_run_status(runs, _user_stop_condition(V1Statuses.STOPPING))

    for run in runs:
        auditor.record(
            event_type=RUN_STOPPED_ACTOR,
            instance=run,
            actor_id=actor.id,
            actor_name=actor.username,
            owner_id=view.project.owner_id,
            owner_name=view.owner_name,
            project_name=view.project_name,
        )

    return Response(status=status.HTTP_200_OK, data={})
def runs_prepare(run_id: int, run: Optional[BaseRun], eager: bool = False):
    """Compile a run and record the resulting status condition.

    Args:
        run_id: identifier handed to `get_run` and used in the log message.
        run: optional run instance, forwarded to `get_run`.
        eager: when true, `runs_start` is called right after a successful
            compilation.

    Returns:
        Always None.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return None

    if not LifeCycle.is_compilable(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.COMPILED,
        )
        return None

    try:
        compiled_at = now()
        _, _compiled_operation = resolver.resolve(run=run, compiled_at=compiled_at)
    except PolyaxonCompilerError as e:
        failed = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=f"Run compilation error: {e}",
        )
        new_run_status(run=run, condition=failed)
        return None

    compiled = V1StatusCondition.get_condition(
        type=V1Statuses.COMPILED,
        status="True",
        reason="PolyaxonRunCompiler",
        message="Run is compiled",
        last_update_time=compiled_at,
    )
    new_run_status(run=run, condition=compiled)

    if eager:
        runs_start(run_id=run.id, run=run)
    return None
def test_resume_undone_run(self):
    """Check that resuming a RUNNING run is rejected.

    The endpoint must answer 400, send nothing to the workers and leave the
    number of runs unchanged.
    """
    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(self.object, condition=running)
    payload = {}
    assert self.queryset.count() == 1

    resume_url = self.url + "resume/"
    with patch("polycommon.workers.send") as workers_send:
        response = self.client.post(resume_url, payload)

    assert response.status_code == status.HTTP_400_BAD_REQUEST
    assert workers_send.call_count == 0
    assert self.queryset.count() == 1
def new_run_stopping_status(run, message) -> bool:
    """Record a STOPPING condition unless the run is already done.

    Args:
        run: the run to update.
        message: optional detail appended to the base "Run is stopping"
            text; a falsy value yields the base text alone.

    Returns:
        False when `LifeCycle.is_done(run.status, progressing=True)` holds
        and nothing was recorded, True after the condition was recorded.
    """
    already_done = LifeCycle.is_done(run.status, progressing=True)
    if already_done:
        return False

    if message:
        message = f"Run is stopping; {message}"
    else:
        message = "Run is stopping"

    stopping = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPING,
        status="True",
        reason="PolyaxonRunStopping",
        message=message,
    )
    new_run_status(run=run, condition=stopping)
    return True
def test_new_run_status_succeeded(self, auditor_record):
    """Check the three auditor events recorded for a SUCCEEDED status.

    The auditor must be called three times, with keyword arguments only, in
    the order RUN_NEW_STATUS, RUN_SUCCEEDED, RUN_DONE.
    """
    succeeded = V1StatusCondition.get_condition(
        type=V1Statuses.SUCCEEDED, status=True
    )
    new_run_status(self.run, condition=succeeded)

    expected_events = (
        run_events.RUN_NEW_STATUS,
        run_events.RUN_SUCCEEDED,
        run_events.RUN_DONE,
    )
    assert auditor_record.call_count == 3
    recorded_calls = auditor_record.call_args_list
    for index, expected_event in enumerate(expected_events):
        recorded = recorded_calls[index]
        assert recorded[0] == ()
        assert recorded[1]["event_type"] == expected_event
async def notify_run(
    namespace: str,
    owner: str,
    project: str,
    run_uuid: str,
    run_name: str,
    condition: V1StatusCondition,
    connections: List[str],
):
    """Spawn one notifier operation per connection for a run condition.

    For each connection name, the connection is looked up in
    `settings.AGENT_CONFIG.connections_by_names`. Unknown connections are
    logged and skipped. For known ones a notifier operation is built,
    compiled and converted into a resource that is created via the spawner.

    Args:
        namespace: namespace handed to the `AsyncSpawner`.
        owner: owner name used for the operation and the resource.
        project: project name; also passed as `project_uuid` to the compiler.
        run_uuid: run identifier; also passed as `run_path` to the compiler.
        run_name: run name used for the operation and the resource.
        condition: status condition, serialized with `to_dict()` for the
            notifier operation.
        connections: names of the connections to notify through.
    """
    spawner = AsyncSpawner(namespace=namespace)
    await spawner.k8s_manager.setup()

    for connection in connections:
        connection_type = settings.AGENT_CONFIG.connections_by_names.get(connection)
        if not connection_type:
            # Log the requested connection name: `connection_type` is always
            # falsy on this branch and would hide which connection failed.
            logger.warning(
                "Could not create notification using connection %s, "
                "the connection was not found or not set correctly.",
                connection,
            )
            continue

        operation = get_notifier_operation(
            connection=connection,
            backend=connection_type.kind,
            owner=owner,
            project=project,
            run_uuid=run_uuid,
            run_name=run_name,
            condition=condition.to_dict(),
        )
        compiled_operation = OperationSpecification.compile_operation(operation)
        resource = compiler.make(
            owner_name=owner,
            project_name=project,
            project_uuid=project,
            run_uuid=run_uuid,
            run_name=run_name,
            run_path=run_uuid,
            compiled_operation=compiled_operation,
            params=operation.params,
            converters=PLATFORM_CONVERTERS,
        )
        await spawner.create(
            run_uuid=run_uuid,
            run_kind=compiled_operation.get_run_kind(),
            resource=resource,
        )