def jobs_build(job_id):
    """Create the build job for a job and update the job's status accordingly.

    The job is looked up with `get_valid_job`; nothing happens when no job is
    returned or when its last status cannot transition to `BUILDING`.
    Otherwise a build job is requested from the dockerizer scheduler and
    stored on the job, and then exactly one of the following happens:

    * the image already exists: the `JOBS_START` task is sent;
    * the build could be started: the job is marked `BUILDING`;
    * the build could not be started: the job is marked `FAILED`.

    Always returns None.
    """
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    may_build = JobLifeCycle.can_transition(status_from=job.last_status,
                                            status_to=JobLifeCycle.BUILDING)
    if not may_build:
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return None

    build_kwargs = {
        'user': job.user,
        'project': job.project,
        'config': job.specification.build,
        'configmap_refs': job.specification.configmap_refs,
        'secret_refs': job.specification.secret_refs,
        'code_reference': job.code_reference,
    }
    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        **build_kwargs)

    # Persist the link to the build job before deciding what to do next.
    job.build_job = build_job
    job.save(update_fields=['build_job'])

    if image_exists:
        # Nothing to build: hand over directly to the start task.
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_START,
            kwargs={'job_id': job_id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))
    elif build_status:
        # The build was started; reflect that the docker image is being built.
        job.set_status(JobLifeCycle.BUILDING, message='Building container')
    else:
        job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
    return None
def _set_status(self,
                status_model,
                status: str,
                created_at: AwareDT = None,
                message: str = None,
                traceback: Dict = None,
                details: Dict = None) -> bool:
    """Record a new status row for this job when the transition is allowed.

    Args:
        status_model: model class whose `objects.create` stores the status.
        status: the status to record.
        created_at: optional timestamp; also used to look up the status that
            precedes the new one, and only stored when truthy.
        message: optional message stored with the status.
        traceback: optional traceback stored with the status.
        details: optional details stored with the status.

    Returns:
        True when a status row was created; False when the job is already
        done or the transition from the preceding status is not allowed.
    """
    previous_status = self.last_status_before(status_model=status_model,
                                              status_date=created_at)

    if self.is_done:
        # A finished job no longer accepts status updates.
        _logger.debug(
            'Received a new status `%s` for job `%s`. '
            'But the job is already done with status `%s`',
            status, self.unique_name, previous_status)
        return False

    if status in JobLifeCycle.HEARTBEAT_STATUS:
        self._ping_heartbeat()

    if not JobLifeCycle.can_transition(status_from=previous_status, status_to=status):
        return False

    record_fields = {
        'job': self,
        'status': status,
        'message': message,
        'traceback': traceback,
        'details': details,
    }
    if created_at:
        # Only pass an explicit timestamp when one was provided.
        record_fields['created_at'] = created_at
    status_model.objects.create(**record_fields)
    return True
def projects_notebook_build(notebook_job_id):
    """Create the build job for a notebook and update the notebook's status.

    The notebook is looked up with `get_valid_notebook`; nothing happens when
    none is returned or when its last status cannot transition to `BUILDING`.
    Otherwise a build job is requested from the dockerizer scheduler and
    stored on the notebook, and then exactly one of the following happens:

    * the image already exists: the `PROJECTS_NOTEBOOK_START` task is sent;
    * the build could be started: the notebook is marked `BUILDING`;
    * the build could not be started: the notebook is marked `FAILED`.

    Always returns None.
    """
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        return None

    may_build = JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                            status_to=JobLifeCycle.BUILDING)
    if not may_build:
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job, notebook_job.last_status, JobLifeCycle.BUILDING)
        return None

    build_kwargs = {
        'user': notebook_job.user,
        'project': notebook_job.project,
        'config': notebook_job.specification.build,
        'configmap_refs': notebook_job.specification.configmap_refs,
        'secret_refs': notebook_job.specification.secret_refs,
        'code_reference': notebook_job.code_reference,
    }
    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        **build_kwargs)

    # Persist the link to the build job before deciding what to do next.
    notebook_job.build_job = build_job
    notebook_job.save(update_fields=['build_job'])

    if image_exists:
        # Nothing to build: hand over directly to the start task.
        workers.send(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
            kwargs={'notebook_job_id': notebook_job_id})
    elif build_status:
        # The build was started; reflect that the docker image is being built.
        notebook_job.set_status(JobLifeCycle.BUILDING, message='Building container')
    else:
        notebook_job.set_status(JobLifeCycle.FAILED,
                                message='Could not start build process.')
    return None
def projects_notebook_start(notebook_job_id):
    """Start a notebook if its status allows it to be scheduled.

    The notebook is looked up with `get_valid_notebook`. Nothing is started
    when no notebook is returned, or when its last status cannot transition
    to `SCHEDULED` (the refusal is logged at info level).

    Always returns None.
    """
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        return None

    if not JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                       status_to=JobLifeCycle.SCHEDULED):
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job.unique_name,
                     notebook_job.last_status,
                     JobLifeCycle.SCHEDULED)
        # Stop here: the transition was rejected, so the notebook must not be
        # handed to the scheduler (same guard as `jobs_start`).
        return None

    notebook_scheduler.start_notebook(notebook_job)
    return None
def should_handle_job_status(pod_state: Any, status: str) -> bool:
    """Decide whether a reported job status should be handled.

    Args:
        pod_state: mapping that carries the job uuid under
            `['details']['labels']['job_uuid']`.
        status: the newly reported status.

    Returns:
        True when no status is stored in Redis for the job (missing or
        evicted), or when Redis cannot be reached; otherwise the result of
        `JobLifeCycle.can_transition` from the stored status to `status`.
    """
    job_uuid = pod_state['details']['labels']['job_uuid']

    try:
        # Read the stored status exactly once, inside the guarded section, so
        # a connection failure on this lookup also falls back to handling.
        current_status = RedisStatuses.get_status(job=job_uuid)
    except redis.connection.ConnectionError:
        return True

    if not current_status:
        # If the status does not exist or is evicted
        return True

    return JobLifeCycle.can_transition(status_from=current_status, status_to=status)
def tensorboards_start(tensorboard_job_id):
    """Start a tensorboard if its status allows it to be scheduled.

    The tensorboard is looked up with `get_valid_tensorboard`. Nothing is
    started when none is returned, or when its last status cannot transition
    to `SCHEDULED` (the refusal is logged at info level). If the scheduler
    raises `StoreNotFoundError`, the tensorboard is marked `FAILED`.

    Always returns None.
    """
    tensorboard = get_valid_tensorboard(tensorboard_job_id=tensorboard_job_id)
    if not tensorboard:
        return None

    if not JobLifeCycle.can_transition(status_from=tensorboard.last_status,
                                       status_to=JobLifeCycle.SCHEDULED):
        _logger.info('Tensorboard `%s` cannot transition from `%s` to `%s`.',
                     tensorboard.unique_name,
                     tensorboard.last_status,
                     JobLifeCycle.SCHEDULED)
        # Stop here: the transition was rejected, so the tensorboard must not
        # be handed to the scheduler (same guard as `jobs_start`).
        return None

    try:
        tensorboard_scheduler.start_tensorboard(tensorboard)
    except StoreNotFoundError:
        tensorboard.set_status(status=JobLifeCycle.FAILED,
                               message='Tensorboard failed to start, '
                                       'the outputs volume/storage was not found.')
    return None
def jobs_start(job_id):
    """Hand a job to the job scheduler when its status allows scheduling.

    Nothing is started when `get_valid_job` returns no job, when the job's
    last status is already `RUNNING` (logged as a warning), or when the last
    status cannot transition to `SCHEDULED` (logged at info level).

    Always returns None.
    """
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    if job.last_status == JobLifeCycle.RUNNING:
        _logger.warning('Job is already running.')
        return None

    schedulable = JobLifeCycle.can_transition(status_from=job.last_status,
                                              status_to=JobLifeCycle.SCHEDULED)
    if schedulable:
        job_scheduler.start_job(job)
    else:
        _logger.info('Job `%s` cannot transition from `%s` to `%s`.',
                     job.unique_name, job.last_status, JobLifeCycle.SCHEDULED)
    return None
def test_job_statuses_transition(self):  # pylint:disable=too-many-branches
    # pylint:disable=too-many-statements
    # Checks, for every target status, which source statuses
    # `JobLifeCycle.can_transition` accepts and which it rejects.

    def check_target(status_to, is_allowed_from):
        # For each known status, the result must be exactly True when the
        # predicate accepts the source status and exactly False otherwise.
        for status_from in JobLifeCycle.VALUES:
            outcome = JobLifeCycle.can_transition(status_from=status_from,
                                                  status_to=status_to)
            if is_allowed_from(status_from):
                assert outcome is True
            else:
                assert outcome is False

    def from_not_done(status_from):
        return status_from not in JobLifeCycle.DONE_STATUS

    # Cannot transition to `CREATED`
    check_target(JobLifeCycle.CREATED, lambda status_from: False)

    # -> BUILDING
    check_target(
        JobLifeCycle.BUILDING,
        lambda status_from: status_from in {
            JobLifeCycle.CREATED,
            JobLifeCycle.RESUMING,
            JobLifeCycle.SCHEDULED,
            JobLifeCycle.UNSCHEDULABLE,
            JobLifeCycle.WARNING,
            JobLifeCycle.UNKNOWN,
        })

    # -> SCHEDULED
    check_target(
        JobLifeCycle.SCHEDULED,
        lambda status_from: status_from in {
            JobLifeCycle.CREATED,
            JobLifeCycle.RESUMING,
            JobLifeCycle.BUILDING,
            JobLifeCycle.WARNING,
            JobLifeCycle.UNSCHEDULABLE,
            JobLifeCycle.UNKNOWN,
        })

    # -> RUNNING
    check_target(
        JobLifeCycle.RUNNING,
        lambda status_from: status_from in {
            JobLifeCycle.CREATED,
            JobLifeCycle.SCHEDULED,
            JobLifeCycle.RESUMING,
            JobLifeCycle.BUILDING,
            JobLifeCycle.UNSCHEDULABLE,
            JobLifeCycle.UNKNOWN,
            JobLifeCycle.WARNING,
        })

    # -> SKIPPED, SUCCEEDED, FAILED, UPSTREAM_FAILED, STOPPED:
    # reachable from every status that is not a done status.
    for terminal_status in (JobLifeCycle.SKIPPED,
                            JobLifeCycle.SUCCEEDED,
                            JobLifeCycle.FAILED,
                            JobLifeCycle.UPSTREAM_FAILED,
                            JobLifeCycle.STOPPED):
        check_target(terminal_status, from_not_done)

    # -> WARNING
    check_target(
        JobLifeCycle.WARNING,
        lambda status_from: status_from in (
            JobLifeCycle.VALUES - JobLifeCycle.DONE_STATUS - {JobLifeCycle.WARNING, }))

    # -> UNKNOWN
    check_target(
        JobLifeCycle.UNKNOWN,
        lambda status_from: status_from not in {JobLifeCycle.UNKNOWN, })