Example #1
0
    def test_status_transition(self):
        """Apply a sequence of differing conditions and expect one new entry per step."""
        # (condition type, extra keyword arguments) in application order.
        # Only the first step omits ``message``; RUNNING and WARNING then
        # alternate so that each step differs in type from the previous one.
        steps = [
            (V1Statuses.SCHEDULED, {}),
            (V1Statuses.RUNNING, {"message": "New message"}),
            (V1Statuses.WARNING, {"message": "New message"}),
            (V1Statuses.RUNNING, {"message": "New message"}),
            (V1Statuses.WARNING, {"message": "New message"}),
        ]

        for expected_count, (status_type, extra) in enumerate(steps, start=1):
            condition = V1StatusCondition.get_condition(
                type=status_type, status=True, reason="foo", **extra
            )
            new_run_status(self.run, condition=condition)
            self.run.refresh_from_db()
            assert len(self.run.status_conditions) == expected_count
Example #2
0
    def test_new_stopped_status_after_stopping(self):
        """Check the condition count after STOPPING, then RUNNING, then STOPPED."""
        # (condition type, extra keyword arguments, expected number of stored
        # conditions after the step). The RUNNING step is expected to leave
        # the count at 1; the STOPPED step is expected to raise it to 2.
        steps = [
            (V1Statuses.STOPPING, {}, 1),
            (V1Statuses.RUNNING, {}, 1),
            (V1Statuses.STOPPED, {"message": "New message"}, 2),
        ]

        for status_type, extra, expected_count in steps:
            condition = V1StatusCondition.get_condition(
                type=status_type, status=True, reason="foo", **extra
            )
            new_run_status(self.run, condition=condition)
            self.run.refresh_from_db()
            assert len(self.run.status_conditions) == expected_count
Example #3
0
def runs_prepare(run_id: int,
                 run: Optional[BaseRun],
                 eager: bool = False) -> bool:
    """Resolve (compile) a run and record the resulting status condition.

    Args:
        run_id: id forwarded to ``get_run`` and used in the log message.
        run: optional run instance forwarded to ``get_run``.
        eager: forwarded to ``resolver.resolve``; when truthy and the run is
            not pending, ``runs_start`` is called directly from here.

    Returns:
        True only when the run was resolved, is not pending, and ``eager`` is
        falsy. Every other path (no run, status not compilable, resolve
        error, pending run, eager start) returns False.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return False

    if not LifeCycle.is_compilable(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.COMPILED,
        )
        return False

    def _mark_failed(message: str) -> bool:
        # Record a FAILED condition with the given message; always False so
        # the except branches below can return it directly.
        failure = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="SchedulerPrepare",
            message=message,
        )
        new_run_status(run=run, condition=failure)
        return False

    try:
        compiled_at = now()
        # The result is unpacked but not used; the unpacking is kept so that
        # an unexpected return shape is still handled by the except below.
        _, _compiled_operation = resolver.resolve(
            run=run, compiled_at=compiled_at, eager=eager
        )
    except PolyaxonCompilerError as e:
        return _mark_failed(f"Failed to compile.\n{e}")
    except Exception as e:
        return _mark_failed(f"Compiler received an internal error.\n{e}")

    compiled = V1StatusCondition.get_condition(
        type=V1Statuses.COMPILED,
        status="True",
        reason="SchedulerPrepare",
        message="Run is compiled",
        last_update_time=compiled_at,
    )
    new_run_status(run=run, condition=compiled)

    if run.pending:
        return False

    if not eager:
        return True

    runs_start(run_id=run.id, run=run)
    return False
Example #4
0
def sort_conditions(status_conditions):
    """Return a new list with the given conditions sorted in ascending key order.

    Each item is read with ``.get("last_transition_time")`` and that value is
    passed through ``V1StatusCondition.get_last_update_time`` to build the
    sort key.
    """

    def _transition_key(condition):
        raw_time = condition.get("last_transition_time")
        return V1StatusCondition.get_last_update_time(raw_time)

    return sorted(status_conditions, key=_transition_key)
Example #5
0
    def log_status(self, status: str, reason: str = None, message: str = None):
        """Logs a new run status.

        <blockquote class="info">
        N.B. If you are executing a managed run, you don't need to call this method manually.
        This method is only useful for manual runs outside of Polyaxon.
        </blockquote>

        N.B. you will probably use one of the simpler methods:
            * log_succeeded
            * log_stopped
            * log_failed
            * start
            * end

        [Run API](/docs/api/#operation/CreateRunStatus)

        Args:
            status: str, a valid [Statuses](/docs/core/specification/lifecycle/) value.
            reason: str, optional, reason for this status change.
            message: str, optional, message to log with this status.
        """
        # The condition is always created with status=True; it is sent as
        # the "condition" entry of the request body with async_req=True.
        condition = V1StatusCondition(
            type=status, status=True, reason=reason, message=message
        )
        payload = {"condition": condition}
        self.client.runs_v1.create_run_status(
            owner=self.owner,
            project=self.project,
            uuid=self.run_uuid,
            body=payload,
            async_req=True,
        )
Example #6
0
def resume_run(
    run: BaseRun,
    user_id: int = None,
    name: str = None,
    description: str = None,
    content: str = None,
    readme: str = None,
    tags: List[str] = None,
) -> BaseRun:
    """Re-initialise ``run`` from its raw content and record a RESUMING condition.

    Each optional argument falls back to the run's current value when falsy;
    ``content`` is passed to ``operations.init_run`` as ``override``.

    Returns:
        The same ``run`` object, after its fields were copied from the
        initialised instance and it was saved.
    """
    # Only the second item of the init_run result is used here.
    _, fresh = operations.init_run(
        project_id=run.project_id,
        user_id=user_id or run.user_id,
        name=name or run.name,
        description=description or run.description,
        readme=readme or run.readme,
        op_spec=V1Operation.read(run.raw_content),
        tags=tags or run.tags,
        override=content,
    )

    # Copy the re-initialised values onto the existing run before saving it.
    copied_fields = (
        "user_id",
        "name",
        "description",
        "readme",
        "content",
        "raw_content",
        "tags",
    )
    for field in copied_fields:
        setattr(run, field, getattr(fresh, field))
    run.save()

    resuming = V1StatusCondition.get_condition(
        type=V1Statuses.RESUMING, status=True
    )
    new_run_status(run, condition=resuming)
    return run
Example #7
0
 def setUp(self):
     """Run the parent setup, then record a STOPPED condition on ``self.object``."""
     super().setUp()
     stopped = V1StatusCondition.get_condition(
         type=V1Statuses.STOPPED, status=True
     )
     new_run_status(self.object, condition=stopped)
 def test_new_run_status_created(self, auditor_record):
     """Recording a CREATED condition is expected not to call the auditor."""
     created = V1StatusCondition.get_condition(
         type=V1Statuses.CREATED, status=True
     )
     new_run_status(self.run, condition=created)
     assert auditor_record.call_count == 0
Example #9
0
    def test_get(self):
        """GET returns 200 and a payload matching the serializer as conditions are added."""

        def fetch_and_check(expected_conditions):
            # One GET round-trip: status code, number of conditions, and
            # equality with the serializer output for ``self.object``.
            resp = self.client.get(self.url)
            assert resp.status_code == status.HTTP_200_OK
            payload = resp.data
            assert len(payload["status_conditions"]) == expected_conditions
            assert payload == self.serializer_class(self.object).data

        fetch_and_check(0)

        running = V1StatusCondition.get_condition(
            type=V1Statuses.RUNNING, status=True
        )
        new_run_status(self.object, condition=running)
        self.object.refresh_from_db()
        fetch_and_check(1)

        new_run_stop_status(run=self.object, message="foo")
        self.object.refresh_from_db()
        fetch_and_check(2)
Example #10
0
def create_run(
    project_id: int,
    user_id: int,
    name: str = None,
    description: str = None,
    readme: str = None,
    tags: List[str] = None,
    raw_content: str = None,
) -> BaseRun:
    """Create a non-managed JOB run that starts with a single CREATED condition.

    Args:
        project_id: id of the project the run belongs to.
        user_id: id of the user the run is attributed to.
        name: optional run name.
        description: optional run description.
        readme: optional readme text.
        tags: optional list of tag strings (annotated as ``List[str]``,
            matching ``resume_run``).
        raw_content: optional raw content stored on the run.

    Returns:
        The instance returned by ``objects.create`` on the run model.
    """
    # The initial history is stored as a list of plain dicts.
    created_condition = V1StatusCondition.get_condition(
        type=V1Statuses.CREATED,
        status="True",
        reason="PolyaxonRunCreated",
        message="Run is created",
    ).to_dict()

    return get_run_model().objects.create(
        project_id=project_id,
        user_id=user_id,
        name=name,
        description=description,
        readme=readme,
        tags=tags,
        kind=V1RunKind.JOB,
        is_managed=False,
        raw_content=raw_content,
        status_conditions=[created_condition],
    )
Example #11
0
def runs_start(run_id: int, run: Optional[BaseRun]):
    """Record a QUEUED condition for a compiled, managed run and start it.

    Returns None on every path. Nothing happens when the run cannot be
    loaded or is not managed; a run whose status is not "compiled" is only
    logged. A ``PolyaxonK8SError`` raised while starting is recorded as a
    FAILED condition instead of propagating.
    """
    run = get_run(run_id=run_id, run=run)
    if not run or not run.is_managed:
        return

    if not LifeCycle.is_compiled(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.QUEUED,
        )
        return

    queued = V1StatusCondition.get_condition(
        type=V1Statuses.QUEUED,
        status="True",
        reason="PolyaxonRunQueued",
        message="Run is queued",
    )
    new_run_status(run=run, condition=queued)

    try:
        in_cluster = conf.get(K8S_IN_CLUSTER)
        # Only services and jobs are handed to the manager, and only when
        # the in-cluster setting is truthy.
        startable = in_cluster and (run.is_service or run.is_job)
        if startable:
            manager.start(
                content=run.content,
                owner_name=run.project.owner.name,
                project_name=run.project.name,
                run_name=run.name,
                run_uuid=run.uuid.hex,
                run_kind=run.kind,
                namespace=conf.get(K8S_NAMESPACE),
                in_cluster=in_cluster,
                default_auth=False,
            )
    except PolyaxonK8SError as e:
        failure = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message="Could not start the job {}".format(e),
        )
        new_run_status(run=run, condition=failure)
Example #12
0
def runs_start(run_id: int, run: Optional[BaseRun]):
    """Record a QUEUED condition for a compiled, managed run and start it.

    Returns None on every path. Nothing happens when the run cannot be
    loaded or is not managed; a run whose status is not "compiled" is only
    logged. Any exception raised while starting is recorded as a FAILED
    condition (with the exception repr and the current traceback) instead
    of propagating.
    """
    run = get_run(run_id=run_id, run=run)
    if not run or not run.is_managed:
        return

    if not LifeCycle.is_compiled(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.QUEUED,
        )
        return

    def _log_error(exc: Exception, message: str = None):
        # Build "<prefix>error: <repr>\n<traceback>" and store it as FAILED.
        prefix = message or "Could not start the operation.\n"
        details = "error: {}\n{}".format(repr(exc), traceback.format_exc())
        failure = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=prefix + details,
        )
        new_run_status(run=run, condition=failure)

    queued = V1StatusCondition.get_condition(
        type=V1Statuses.QUEUED,
        status="True",
        reason="PolyaxonRunQueued",
        message="Run is queued",
    )
    new_run_status(run=run, condition=queued)

    try:
        in_cluster = conf.get(K8S_IN_CLUSTER)
        # Only services and jobs are handed to the manager, and only when
        # the in-cluster setting is truthy.
        if in_cluster and (run.is_service or run.is_job):
            manager.start(
                content=run.content,
                owner_name=run.project.owner.name,
                project_name=run.project.name,
                run_name=run.name,
                run_uuid=run.uuid.hex,
                run_kind=run.kind,
                namespace=conf.get(K8S_NAMESPACE),
                in_cluster=in_cluster,
                default_auth=False,
            )
    except (PolyaxonK8SError, ApiException) as e:
        _log_error(
            exc=e,
            message="Kubernetes manager could not start the operation.\n",
        )
    except PolypodException as e:
        _log_error(exc=e, message="Failed converting the run manifest.\n")
    except Exception as e:
        _log_error(exc=e, message="Failed with unknown exception.\n")
Example #13
0
def set_entity_status(entity, condition: V1StatusCondition):
    """Set ``entity.status`` from ``condition`` and update its condition history.

    When the entity already has conditions, the last stored one is rebuilt
    with ``V1StatusCondition.get_condition`` and compared to ``condition``:
    if they are equal the last entry is replaced, otherwise ``condition`` is
    appended. When the entity has no conditions yet, the history becomes a
    single-item list.

    Args:
        entity: object exposing ``status`` and ``status_conditions``.
        condition: the new condition; ``None`` leaves the entity untouched.

    Returns:
        The same ``entity`` object.
    """
    # Guard before dereferencing: previously ``condition.type`` was read
    # ahead of the ``if condition`` check, so None raised AttributeError.
    if condition is None:
        return entity

    entity.status = condition.type

    # Kept from the original truthiness guard: a present-but-falsy condition
    # still only updates the status.
    if not condition:
        return entity

    if entity.status_conditions:
        status_conditions = to_list(entity.status_conditions, check_none=True)
        last_condition = V1StatusCondition.get_condition(**status_conditions[-1])
        if last_condition == condition:
            status_conditions[-1] = condition.to_dict()
        else:
            status_conditions.append(condition.to_dict())
    else:
        status_conditions = [condition.to_dict()]

    if status_conditions:
        entity.status_conditions = status_conditions

    return entity
Example #14
0
    def test_status_update_results_in_new_updated_at_datetime(self):
        """Each STARTING status call is expected to move ``updated_at`` forward."""
        # The same condition type is submitted twice; ``updated_at`` must
        # increase after each call.
        for _ in range(2):
            previous = self.run.updated_at
            starting = V1StatusCondition.get_condition(
                type=V1Statuses.STARTING, status=True
            )
            new_run_status(self.run, condition=starting)
            assert previous < self.run.updated_at
Example #15
0
 def test_start_run(self, manager_start):
     """Starting a run that has a COMPILED condition calls the manager once."""
     compiled_run = RunFactory(project=self.project, user=self.user)
     compiled = V1StatusCondition.get_condition(
         type=V1Statuses.COMPILED, status=True
     )
     new_run_status(run=compiled_run, condition=compiled)
     runs_start(run_id=compiled_run.id)
     assert manager_start.call_count == 1
Example #16
0
def new_run_stop_status(run, message):
    """Record a STOPPED condition on ``run``.

    A truthy ``message`` is appended to the base text after "; "; otherwise
    only the base text is used.
    """
    if message:
        message = f"Run is stopped; {message}"
    else:
        message = "Run is stopped"

    stopped = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPED,
        status="True",
        reason="PolyaxonRunStopped",
        message=message,
    )
    new_run_status(run=run, condition=stopped)
Example #17
0
 def _log_error(exc: Exception, message: str = None):
     """Record a FAILED condition whose message embeds ``exc`` and the traceback.

     ``run`` is not a parameter; it is taken from the enclosing scope.
     """
     prefix = message if message else "Could not start the operation.\n"
     details = "error: {}\n{}".format(repr(exc), traceback.format_exc())
     failure = V1StatusCondition.get_condition(
         type=V1Statuses.FAILED,
         status="True",
         reason="PolyaxonRunFailed",
         message=prefix + details,
     )
     new_run_status(run=run, condition=failure)
Example #18
0
    def test_new_status_equality(self):
        """Repeated SCHEDULED conditions keep one entry; RUNNING adds a second."""
        # (condition type, extra keyword arguments, expected number of stored
        # conditions after the step).
        steps = [
            # First condition.
            (V1Statuses.SCHEDULED, {}, 1),
            # Same condition again.
            (V1Statuses.SCHEDULED, {}, 1),
            # Same type, different message.
            (V1Statuses.SCHEDULED, {"message": "New message"}, 1),
            # New condition type.
            (V1Statuses.RUNNING, {"message": "New message"}, 2),
        ]

        for status_type, extra, expected_count in steps:
            condition = V1StatusCondition.get_condition(
                type=status_type, status=True, reason="foo", **extra
            )
            new_run_status(self.run, condition=condition)
            self.run.refresh_from_db()
            assert len(self.run.status_conditions) == expected_count
Example #19
0
 def log_agent_status(self, status: str, reason: str = None, message: str = None):
     """Send a new agent status condition through ``agents_v1.create_agent_status``.

     The condition is built with ``status=True`` and sent as the
     ``"condition"`` entry of the request body with ``async_req=True``.
     """
     condition = V1StatusCondition.get_condition(
         type=status, status=True, reason=reason, message=message
     )
     payload = {"condition": condition}
     self.client.agents_v1.create_agent_status(
         owner=self.owner,
         uuid=self.agent_uuid,
         body=payload,
         async_req=True,
     )
 def test_new_run_status_scheduled(self, auditor_record):
     """A SCHEDULED condition is expected to produce one RUN_NEW_STATUS audit call."""
     scheduled = V1StatusCondition.get_condition(
         type=V1Statuses.SCHEDULED, status=True
     )
     new_run_status(self.run, condition=scheduled)

     assert auditor_record.call_count == 1
     # The auditor must have been called with keyword arguments only.
     positional, keywords = auditor_record.call_args
     assert positional == ()
     assert keywords["event_type"] == run_events.RUN_NEW_STATUS
 def create_one(self):
     """Create a run via the parent, then record RUNNING followed by a stop status."""
     created = super().create_one()
     running = V1StatusCondition.get_condition(
         type=V1Statuses.RUNNING,
         status="True",
         reason="Run is running",
         message="foo",
     )
     new_run_status(created, running)
     new_run_stop_status(created, "stopping")
     return created
Example #22
0
def create_status(view, serializer):
    """Record the condition carried by ``serializer`` on ``view.run``.

    Returns None in all cases. Nothing is recorded when the validated data
    is empty, has no truthy ``"condition"`` entry, or the built condition is
    falsy. The result of ``is_valid()`` is not inspected here.
    """
    serializer.is_valid()
    payload = serializer.validated_data
    if not payload:
        return

    raw_condition = payload.get("condition")
    if not raw_condition:
        return

    condition = V1StatusCondition.get_condition(**raw_condition)
    if condition:
        new_run_status(run=view.run, condition=condition)
Example #23
0
 def log_status(self, status, reason=None, message=None):
     """Send a new run status condition through ``runs_v1.create_run_status``.

     The condition is built with ``status=True`` and sent as the
     ``"condition"`` entry of the request body with ``async_req=True``.
     """
     condition = V1StatusCondition(
         type=status, status=True, reason=reason, message=message
     )
     payload = {"condition": condition}
     self.client.runs_v1.create_run_status(
         owner=self.owner,
         project=self.project,
         uuid=self.run_uuid,
         body=payload,
         async_req=True,
     )
Example #24
0
def stop_run(view, request, *args, **kwargs):
    """Record a STOPPING condition on ``view.run`` unless its status is already done.

    Always responds with HTTP 200 and an empty body; the audit call is made
    only when the STOPPING condition was recorded.
    """
    if not LifeCycle.is_done(view.run.status):
        stopping = V1StatusCondition.get_condition(
            type=V1Statuses.STOPPING,
            status="True",
            reason="PolyaxonRunStopping",
            message="User requested to stop the run.",
        )
        new_run_status(run=view.run, condition=stopping)
        view.audit(request, *args, **kwargs)

    return Response(status=status.HTTP_200_OK, data={})
Example #25
0
def stop_runs(view, request, actor, *args, **kwargs):
    """Stop the runs of ``view.project`` whose uuids are listed in the request.

    Two passes are made over the requested runs:

    1. runs whose status is in ``LifeCycle.SAFE_STOP_VALUES`` receive a
       STOPPED condition;
    2. runs whose status is not in ``LifeCycle.DONE_OR_IN_PROGRESS_VALUES``
       receive a STOPPING condition and one ``RUN_STOPPED_ACTOR`` audit
       record each.

    Always responds with HTTP 200 and an empty body.
    """

    def _requested_runs():
        # Fresh queryset for every pass, restricted to project and uuids.
        return get_run_model().objects.filter(
            project=view.project, uuid__in=request.data.get("uuids", [])
        )

    def _user_stop_condition(status_type):
        return V1StatusCondition.get_condition(
            type=status_type,
            status="True",
            reason="EventHandler",
            message="User requested to stop the run.",
        )

    # Immediate stop
    safe_to_stop = _requested_runs().filter(
        status__in=LifeCycle.SAFE_STOP_VALUES
    )
    bulk_new_run_status(safe_to_stop, _user_stop_condition(V1Statuses.STOPPED))

    # The second pass is materialised into a list because the same runs are
    # used for the status update and for the audit loop.
    stopping_runs = list(
        _requested_runs().exclude(
            status__in=LifeCycle.DONE_OR_IN_PROGRESS_VALUES
        )
    )
    bulk_new_run_status(
        stopping_runs, _user_stop_condition(V1Statuses.STOPPING)
    )
    for run in stopping_runs:
        auditor.record(
            event_type=RUN_STOPPED_ACTOR,
            instance=run,
            actor_id=actor.id,
            actor_name=actor.username,
            owner_id=view.project.owner_id,
            owner_name=view.owner_name,
            project_name=view.project_name,
        )

    return Response(status=status.HTTP_200_OK, data={})
Example #26
0
def runs_prepare(run_id: int, run: Optional[BaseRun], eager: bool = False):
    """Resolve (compile) a run and record the resulting status condition.

    Returns None on every path. A run that cannot be loaded is ignored and
    a run whose status is not compilable is only logged. A
    ``PolyaxonCompilerError`` is recorded as a FAILED condition. After a
    successful resolve a COMPILED condition is recorded and, when ``eager``
    is truthy, ``runs_start`` is called directly.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return None

    if not LifeCycle.is_compilable(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.COMPILED,
        )
        return None

    try:
        compiled_at = now()
        # The result is unpacked but not used further in this function.
        _, _compiled_operation = resolver.resolve(
            run=run, compiled_at=compiled_at
        )
    except PolyaxonCompilerError as e:
        failure = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=f"Run compilation error: {e}",
        )
        new_run_status(run=run, condition=failure)
        return None

    compiled = V1StatusCondition.get_condition(
        type=V1Statuses.COMPILED,
        status="True",
        reason="PolyaxonRunCompiler",
        message="Run is compiled",
        last_update_time=compiled_at,
    )
    new_run_status(run=run, condition=compiled)

    if eager:
        runs_start(run_id=run.id, run=run)
    return None
Example #27
0
 def test_resume_undone_run(self):
     """Resuming a run with a RUNNING condition is rejected with HTTP 400."""
     running = V1StatusCondition.get_condition(
         type=V1Statuses.RUNNING, status=True
     )
     new_run_status(self.object, condition=running)

     assert self.queryset.count() == 1
     with patch("polycommon.workers.send") as workers_send:
         resp = self.client.post(self.url + "resume/", {})

     # Rejected request: no worker message sent and no new run created.
     assert resp.status_code == status.HTTP_400_BAD_REQUEST
     assert workers_send.call_count == 0
     assert self.queryset.count() == 1
Example #28
0
def new_run_stopping_status(run, message) -> bool:
    """Record a STOPPING condition on ``run`` unless ``LifeCycle.is_done`` says otherwise.

    Returns:
        False when ``LifeCycle.is_done(run.status, progressing=True)`` is
        truthy (nothing is recorded); True after the condition is recorded.
    """
    if LifeCycle.is_done(run.status, progressing=True):
        return False

    # A truthy message is appended to the base text after "; ".
    if message:
        message = f"Run is stopping; {message}"
    else:
        message = "Run is stopping"

    stopping = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPING,
        status="True",
        reason="PolyaxonRunStopping",
        message=message,
    )
    new_run_status(run=run, condition=stopping)
    return True
 def test_new_run_status_succeeded(self, auditor_record):
     """A SUCCEEDED condition is expected to produce three ordered audit calls."""
     succeeded = V1StatusCondition.get_condition(
         type=V1Statuses.SUCCEEDED, status=True
     )
     new_run_status(self.run, condition=succeeded)

     assert auditor_record.call_count == 3
     recorded = auditor_record.call_args_list

     # Every call must have been made with keyword arguments only.
     for index in range(3):
         assert recorded[index][0] == ()

     # Event types, in the order the calls were made.
     expected_events = (
         run_events.RUN_NEW_STATUS,
         run_events.RUN_SUCCEEDED,
         run_events.RUN_DONE,
     )
     for index, event_type in enumerate(expected_events):
         assert recorded[index][1]["event_type"] == event_type
Example #30
0
async def notify_run(
    namespace: str,
    owner: str,
    project: str,
    run_uuid: str,
    run_name: str,
    condition: V1StatusCondition,
    connections: List[str],
):
    """Create one notifier operation per connection name for a run condition.

    For each name in ``connections`` the connection is looked up in
    ``settings.AGENT_CONFIG.connections_by_names``. Names without a match
    are logged and skipped. For the others a notifier operation is built,
    compiled, turned into a resource and created through the spawner.

    Args:
        namespace: namespace passed to ``AsyncSpawner``.
        owner: owner name passed to the notifier operation and the compiler.
        project: project name; also passed to the compiler as ``project_uuid``.
        run_uuid: run uuid; also passed to the compiler as ``run_path``.
        run_name: run name.
        condition: condition to notify about; sent as ``condition.to_dict()``.
        connections: names of the connections to notify through.
    """
    spawner = AsyncSpawner(namespace=namespace)
    await spawner.k8s_manager.setup()
    for connection in connections:
        connection_type = settings.AGENT_CONFIG.connections_by_names.get(
            connection)
        if not connection_type:
            # Log the requested name: ``connection_type`` is falsy in this
            # branch, so formatting it would hide which connection failed.
            logger.warning(
                "Could not create notification using connection %s, "
                "the connection was not found or not set correctly.",
                connection,
            )
            continue

        operation = get_notifier_operation(
            connection=connection,
            backend=connection_type.kind,
            owner=owner,
            project=project,
            run_uuid=run_uuid,
            run_name=run_name,
            condition=condition.to_dict(),
        )
        compiled_operation = OperationSpecification.compile_operation(
            operation)
        resource = compiler.make(
            owner_name=owner,
            project_name=project,
            project_uuid=project,
            run_uuid=run_uuid,
            run_name=run_name,
            run_path=run_uuid,
            compiled_operation=compiled_operation,
            params=operation.params,
            converters=PLATFORM_CONVERTERS,
        )
        await spawner.create(
            run_uuid=run_uuid,
            run_kind=compiled_operation.get_run_kind(),
            resource=resource,
        )