Example #1
async def log_job(request, ws, job, pod_id, namespace, container):
    job_uuid = job.uuid.hex
    if job_uuid in request.app.job_logs_ws_managers:
        ws_manager = request.app.job_logs_ws_managers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_logs_ws_managers[job_uuid] = ws_manager

    ws_manager.add_socket(ws)

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify_ws(ws=ws, message=get_status_message(status))
            if should_disconnect(ws=ws, ws_manager=ws_manager):
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        await notify_ws(ws=ws, message=get_status_message(status))
        return

    config.load_incluster_config()
    k8s_api = client.CoreV1Api()
    await log_job_pod(k8s_api=k8s_api,
                      ws=ws,
                      ws_manager=ws_manager,
                      pod_id=pod_id,
                      container=container,
                      namespace=namespace)
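The should_disconnect helper used above is not shown in this listing. A minimal sketch, assuming the SocketManager exposes its connected sockets as a ws set (as the consumer in Example #19 does), could look like this:

import logging

logger = logging.getLogger(__name__)


def should_disconnect(ws, ws_manager):
    # Assumed helper: stop streaming once the manager no longer tracks any socket,
    # mirroring the inline check in Example #19.
    if not ws_manager.ws:
        logger.info('Stopping logs stream, no connected sockets left')
        return True
    return False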
Example #2
    def test_master_success_influences_other_experiment_workers_status(self):
        with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
            # with patch.object(Experiment, 'set_status') as _:  # noqa
            experiment = ExperimentFactory()

        assert ExperimentLifeCycle.is_done(experiment.last_status) is False
        # Add jobs
        master = ExperimentJobFactory(experiment=experiment, role=TaskType.MASTER)
        assert JobLifeCycle.is_done(master.last_status) is False
        workers = [ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER)
                   for _ in range(2)]
        for worker in workers:
            worker.refresh_from_db()
            assert JobLifeCycle.is_done(worker.last_status) is False

        # Set master to succeeded
        ExperimentJobStatusFactory(job=master, status=JobLifeCycle.SUCCEEDED)

        # All workers should have a succeeded status
        for worker in workers:
            worker.refresh_from_db()
            assert worker.last_status == JobLifeCycle.SUCCEEDED

        # Experiment last status should be success
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED
Example #3
def tensorboard_job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status
    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(event_type=TENSORBOARD_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status,
                   target='project')
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=TENSORBOARD_STOPPED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    elif instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=TENSORBOARD_FAILED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    elif instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=TENSORBOARD_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    if JobLifeCycle.is_done(instance.status):
        RedisStatuses.delete_status(job.uuid.hex)
    new_operation_run_status(entity_type=content_types.TENSORBOARD_JOB,
                             entity=job,
                             status=instance.status)
Example #4
def update_job_containers(event: Mapping, status: str,
                          job_container_name: str) -> None:
    job_containers = RedisJobContainers()
    if JobLifeCycle.is_done(status):
        # Remove the job monitoring
        job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', job_uuid)
        job_containers.remove_job(job_uuid)

    if event['status']['container_statuses'] is None:
        return

    def get_container_id(container_id):
        if not container_id:
            return None
        if container_id.startswith('docker://'):
            return container_id[len('docker://'):]
        return container_id

    for container_status in event['status']['container_statuses']:
        if container_status['name'] != job_container_name:
            continue

        container_id = get_container_id(container_status['container_id'])
        if container_id:
            job_uuid = event['metadata']['labels']['job_uuid']
            if container_status['state']['running'] is not None:
                logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                            container_id, job_uuid)
                job_containers.monitor(container_id=container_id,
                                       job_uuid=job_uuid)
            else:
                job_containers.remove_container(container_id=container_id)
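For orientation, this is a made-up pod event showing the fields update_job_containers reads; the label value, container id and container name below are illustrative only, and JobLifeCycle.RUNNING is assumed to be importable as in the other examples:

event = {
    'metadata': {'labels': {'job_uuid': 'some-job-uuid'}},
    'status': {
        'container_statuses': [{
            'name': 'polyaxon-job',                       # assumed job container name
            'container_id': 'docker://0123456789abcdef',  # illustrative id
            'state': {'running': {}},                     # non-None means the container is running
        }],
    },
}
update_job_containers(event=event,
                      status=JobLifeCycle.RUNNING,
                      job_container_name='polyaxon-job')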
Example #5
    def _set_status(self,
                    status_model,
                    status: str,
                    created_at: AwareDT = None,
                    message: str = None,
                    traceback: Dict = None,
                    details: Dict = None) -> bool:
        current_status = self.last_status_before(status_model=status_model,
                                                 status_date=created_at)
        if self.is_done:
            # We should not update statuses anymore
            _logger.debug(
                'Received a new status `%s` for job `%s`. '
                'But the job is already done with status `%s`', status,
                self.unique_name, current_status)
            return False
        if status in JobLifeCycle.HEARTBEAT_STATUS:
            self._ping_heartbeat()
        if JobLifeCycle.can_transition(status_from=current_status,
                                       status_to=status):
            # Add new status to the job
            params = {'created_at': created_at} if created_at else {}
            status_model.objects.create(job=self,
                                        status=status,
                                        message=message,
                                        traceback=traceback,
                                        details=details,
                                        **params)
            return True
        return False
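Other examples call a public set_status method on jobs; a hypothetical wrapper delegating to _set_status (the STATUS_MODEL attribute is an assumption, not taken from the source) might read:

    def set_status(self,
                   status: str,
                   created_at: AwareDT = None,
                   message: str = None,
                   traceback: Dict = None,
                   details: Dict = None) -> bool:
        # Hypothetical delegation; STATUS_MODEL stands in for the concrete status model class.
        return self._set_status(status_model=self.STATUS_MODEL,
                                status=status,
                                created_at=created_at,
                                message=message,
                                traceback=traceback,
                                details=details)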
Example #6
def jobs_build(job_id):
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=job.user,
        project=job.project,
        config=job.specification.build,
        configmap_refs=job.specification.configmap_refs,
        secret_refs=job.specification.secret_refs,
        code_reference=job.code_reference)

    job.build_job = build_job
    job.save(update_fields=['build_job'])
    if image_exists:
        # The image already exists, so we can start the job right away
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_START,
            kwargs={'job_id': job_id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))
        return

    if not build_status:
        job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
        return

    # Update job status to show that it's building the docker image
    job.set_status(JobLifeCycle.BUILDING, message='Building container')
Example #7
def projects_notebook_build(notebook_job_id):
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        return None

    if not JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job, notebook_job.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=notebook_job.user,
        project=notebook_job.project,
        config=notebook_job.specification.build,
        configmap_refs=notebook_job.specification.configmap_refs,
        secret_refs=notebook_job.specification.secret_refs,
        code_reference=notebook_job.code_reference)

    notebook_job.build_job = build_job
    notebook_job.save(update_fields=['build_job'])
    if image_exists:
        # The image already exists, so we can start the notebook right away
        workers.send(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
            kwargs={'notebook_job_id': notebook_job_id})
        return

    if not build_status:
        notebook_job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
        return

    # Update job status to show that it's building the docker image
    notebook_job.set_status(JobLifeCycle.BUILDING, message='Building container')
Example #8
def job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status
    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(event_type=JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)

    if instance.status == JobLifeCycle.CREATED:
        auditor.record(event_type=JOB_CREATED, instance=job)
    elif instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status)
    elif instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=JOB_FAILED,
                       instance=job,
                       previous_status=previous_status)
    elif instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status)
    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=JOB_DONE,
                       instance=job,
                       previous_status=previous_status)
        RedisStatuses.delete_status(job.uuid.hex)
    new_operation_run_status(entity_type=content_types.JOB,
                             entity=job,
                             status=instance.status)
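The set_job_started_at and set_job_finished_at helpers used by the handlers above are not included in this listing. A minimal sketch, assuming they stamp timestamps when the job first reaches RUNNING and when it reaches a terminal status, could be:

from django.utils import timezone


def set_job_started_at(instance, status):
    # Assumed behaviour: record the first time the job reaches RUNNING.
    if instance.started_at is None and status == JobLifeCycle.RUNNING:
        instance.started_at = timezone.now()


def set_job_finished_at(instance, status):
    # Assumed behaviour: record when the job reaches a terminal status.
    if instance.finished_at is None and JobLifeCycle.is_done(status):
        instance.finished_at = timezone.now()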
Example #9
    def calculated_status(self) -> str:
        master_status = self.jobs.order_by('created_at').first().last_status
        calculated_status = master_status if JobLifeCycle.is_done(master_status) else None
        if calculated_status is None:
            calculated_status = ExperimentLifeCycle.jobs_status(self.last_job_statuses)
        if calculated_status is None:
            return self.last_status
        return calculated_status
Example #10
    def post(self, request, *args, **kwargs):
        job = self.get_object()

        if not JobLifeCycle.is_stoppable(job.last_status):
            return Response(status=status.HTTP_403_FORBIDDEN)

        token, _ = Token.objects.get_or_create(user=job.user)
        return Response({'token': token.key}, status=status.HTTP_200_OK)
Example #11
    def post(self, request, *args, **kwargs):
        project = self.project

        if not project.has_notebook or not JobLifeCycle.is_stoppable(
                project.notebook.last_status):
            return Response(status=status.HTTP_403_FORBIDDEN)

        token, _ = Token.objects.get_or_create(user=project.user)
        return Response({'token': token.key}, status=status.HTTP_200_OK)
Example #12
def projects_notebook_start(notebook_job_id):
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        return None

    if not JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                       status_to=JobLifeCycle.SCHEDULED):
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job.unique_name, notebook_job.last_status, JobLifeCycle.SCHEDULED)
        return None

    notebook_scheduler.start_notebook(notebook_job)
Example #13
def should_handle_job_status(pod_state: Any, status: str) -> bool:
    job_uuid = pod_state['details']['labels']['job_uuid']
    current_status = RedisStatuses.get_status(job=job_uuid)
    if not current_status:  # If the status does not exist or is evicted
        return True

    try:
        return JobLifeCycle.can_transition(
            status_from=RedisStatuses.get_status(job=job_uuid),
            status_to=status)
    except redis.connection.ConnectionError:
        return True
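RedisStatuses appears throughout these examples with get_status, set_status and delete_status calls. A minimal, assumed sketch of such a Redis-backed status cache, not the project's actual class, might look like this:

import redis


class RedisStatusesSketch:
    KEY_JOBS_STATUSES = 'JOBS_STATUSES'
    connection = redis.StrictRedis()

    @classmethod
    def get_status(cls, job):
        # Return the cached status for a job uuid, or None if it was never set or evicted.
        status = cls.connection.hget(cls.KEY_JOBS_STATUSES, job)
        return status.decode() if status else None

    @classmethod
    def set_status(cls, job, status):
        cls.connection.hset(cls.KEY_JOBS_STATUSES, job, status)

    @classmethod
    def delete_status(cls, job):
        cls.connection.hdel(cls.KEY_JOBS_STATUSES, job)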
Example #14
def tensorboards_start(tensorboard_job_id):
    tensorboard = get_valid_tensorboard(tensorboard_job_id=tensorboard_job_id)
    if not tensorboard:
        return None

    if not JobLifeCycle.can_transition(status_from=tensorboard.last_status,
                                       status_to=JobLifeCycle.SCHEDULED):
        _logger.info('Tensorboard `%s` cannot transition from `%s` to `%s`.',
                     tensorboard.unique_name, tensorboard.last_status, JobLifeCycle.SCHEDULED)
        return None

    try:
        tensorboard_scheduler.start_tensorboard(tensorboard)
    except StoreNotFoundError:
        tensorboard.set_status(status=JobLifeCycle.FAILED,
                               message='Tensorboard failed to start, '
                                       'the outputs volume/storage was not found.')
Example #15
def jobs_start(job_id):
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    if job.last_status == JobLifeCycle.RUNNING:
        _logger.warning('Job is already running.')
        return None

    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.SCHEDULED):
        _logger.info('Job `%s` cannot transition from `%s` to `%s`.',
                     job.unique_name, job.last_status, JobLifeCycle.SCHEDULED)
        return None

    job_scheduler.start_job(job)
Example #16
def k8s_events_handle_experiment_job_statuses(self: 'workers.app.task',
                                              payload: Dict) -> None:
    """Experiment jobs statuses"""
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    restart_count = payload.get('restart_count', 0)
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, payload['status'])

    try:
        job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid `%s` does not exist', job_uuid)
        return

    try:
        experiment = job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore',
                     job_uuid)
        return

    if job.last_status is None and self.request.retries < 2:
        self.retry(countdown=1)

    max_restarts = experiment.max_restarts or conf.get(
        MAX_RESTARTS_EXPERIMENTS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       created_at=payload.get('created_at'),
                       traceback=payload.get('traceback'),
                       details=details)
        logger.debug('status %s is set for job %s %s', payload['status'],
                     job_uuid, job.id)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        logger.info('Retry job status %s handling %s', payload['status'],
                    job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Example #17
def k8s_events_handle_build_job_statuses(self: 'workers.app.task',
                                         payload: Dict) -> None:
    """Project Plugin jobs statuses"""
    details = payload['details']
    app = details['labels']['app']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    restart_count = payload.get('restart_count', 0)
    project_name = details['labels'].get('project_name')
    logger.debug('handling events status for build job %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        build_job.project
    except Project.DoesNotExist:
        logger.debug('Project `%s` does not exist anymore', project_name)
        return

    max_restarts = build_job.max_restarts or conf.get(MAX_RESTARTS_BUILD_JOBS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Example #18
    def test_job_statuses_transition(self):
        # pylint:disable=too-many-branches
        # pylint:disable=too-many-statements
        # Cannot transition to `CREATED`
        for status in JobLifeCycle.VALUES:
            assert JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.CREATED) is False

        # -> BUILDING
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.BUILDING)
            if status in {
                    JobLifeCycle.CREATED,
                    JobLifeCycle.RESUMING,
                    JobLifeCycle.SCHEDULED,
                    JobLifeCycle.UNSCHEDULABLE,
                    JobLifeCycle.WARNING,
                    JobLifeCycle.UNKNOWN,
            }:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> SCHEDULED
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.SCHEDULED)
            if status in {
                    JobLifeCycle.CREATED,
                    JobLifeCycle.RESUMING,
                    JobLifeCycle.BUILDING,
                    JobLifeCycle.WARNING,
                    JobLifeCycle.UNSCHEDULABLE,
                    JobLifeCycle.UNKNOWN,
            }:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> RUNNING
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.RUNNING)
            if status in {
                    JobLifeCycle.CREATED,
                    JobLifeCycle.SCHEDULED,
                    JobLifeCycle.RESUMING,
                    JobLifeCycle.BUILDING,
                    JobLifeCycle.UNSCHEDULABLE,
                    JobLifeCycle.UNKNOWN,
                    JobLifeCycle.WARNING,
            }:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> SKIPPED
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.SKIPPED)
            if status not in JobLifeCycle.DONE_STATUS:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> SUCCEEDED
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.SUCCEEDED)
            if status not in JobLifeCycle.DONE_STATUS:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> FAILED
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.FAILED)
            if status not in JobLifeCycle.DONE_STATUS:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> UPSTREAM_FAILED
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.UPSTREAM_FAILED)
            if status not in JobLifeCycle.DONE_STATUS:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> STOPPED
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.STOPPED)
            if status not in JobLifeCycle.DONE_STATUS:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> WARNING
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.WARNING)
            cond = status in (JobLifeCycle.VALUES - JobLifeCycle.DONE_STATUS -
                              {
                                  JobLifeCycle.WARNING,
                              })
            if cond:
                assert can_transition is True
            else:
                assert can_transition is False

        # -> UNKNOWN
        for status in JobLifeCycle.VALUES:
            can_transition = JobLifeCycle.can_transition(
                status_from=status, status_to=JobLifeCycle.UNKNOWN)
            if status not in {
                    JobLifeCycle.UNKNOWN,
            }:
                assert can_transition is True
            else:
                assert can_transition is False
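The assertions above pin down the transition rules quite tightly. A minimal sketch of a lifecycle class consistent with them, written as an assumption rather than the project's actual implementation, could be:

class JobLifeCycleSketch:
    CREATED = 'created'
    RESUMING = 'resuming'
    BUILDING = 'building'
    SCHEDULED = 'scheduled'
    UNSCHEDULABLE = 'unschedulable'
    RUNNING = 'running'
    WARNING = 'warning'
    UNKNOWN = 'unknown'
    SKIPPED = 'skipped'
    SUCCEEDED = 'succeeded'
    FAILED = 'failed'
    UPSTREAM_FAILED = 'upstream_failed'
    STOPPED = 'stopped'

    VALUES = {CREATED, RESUMING, BUILDING, SCHEDULED, UNSCHEDULABLE, RUNNING,
              WARNING, UNKNOWN, SKIPPED, SUCCEEDED, FAILED, UPSTREAM_FAILED,
              STOPPED}
    # Assumed terminal statuses.
    DONE_STATUS = {SKIPPED, SUCCEEDED, FAILED, UPSTREAM_FAILED, STOPPED}
    # Maps a target status to the statuses it may be reached from.
    TRANSITION_MATRIX = {
        CREATED: set(),
        BUILDING: {CREATED, RESUMING, SCHEDULED, UNSCHEDULABLE, WARNING, UNKNOWN},
        SCHEDULED: {CREATED, RESUMING, BUILDING, UNSCHEDULABLE, WARNING, UNKNOWN},
        RUNNING: {CREATED, RESUMING, BUILDING, SCHEDULED, UNSCHEDULABLE, WARNING,
                  UNKNOWN},
        SKIPPED: VALUES - DONE_STATUS,
        SUCCEEDED: VALUES - DONE_STATUS,
        FAILED: VALUES - DONE_STATUS,
        UPSTREAM_FAILED: VALUES - DONE_STATUS,
        STOPPED: VALUES - DONE_STATUS,
        WARNING: VALUES - DONE_STATUS - {WARNING},
        UNKNOWN: VALUES - {UNKNOWN},
    }

    @classmethod
    def can_transition(cls, status_from, status_to):
        return status_from in cls.TRANSITION_MATRIX.get(status_to, set())

    @classmethod
    def is_done(cls, status):
        return status in cls.DONE_STATUS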
Example #19
async def job_logs(
        request,  # pylint:disable=too-many-branches
        ws,
        username,
        project_name,
        job_id):
    from streams.consumers.consumers import Consumer

    job, message = validate_job(request=request,
                                username=username,
                                project_name=project_name,
                                job_id=job_id)
    if job is None:
        await ws.send(get_error_message(message))
        return

    job_uuid = job.uuid.hex

    auditor.record(event_type=JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info('Job uuid `%s` logs are now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # start consumer
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.STREAM_LOGS_SIDECARS_JOBS,
                                       job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    def should_disconnect():
        if not consumer.ws:
            logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # add socket manager
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify(ws_manager=consumer,
                         message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        await notify(ws_manager=consumer, message=get_status_message(status))
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        return

    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(ws_manager=consumer, message=message)

        # After trying a couple of times, we must check the status of the job
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all sockets because the job `%s` is done',
                            job_uuid)
                consumer.ws = set([])
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({
                ws,
            })
            should_quit = True

        if should_disconnect():
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)