def tensorboard_job_status_post_save(sender, **kwargs):
    """Propagate a saved TensorboardJob status to its job and record audit events.

    Post-save signal receiver for tensorboard job status instances: updates the
    job's cached last status and timestamps, emits the per-status audit events,
    clears the Redis status cache for finished jobs, and notifies the operation
    run pipeline of the new status.
    """
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status
    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(event_type=TENSORBOARD_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status,
                   target='project')
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=TENSORBOARD_STOPPED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    elif instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=TENSORBOARD_FAILED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    # Fixed: this branch previously re-tested JobLifeCycle.STOPPED (already
    # handled above), so TENSORBOARD_SUCCEEDED was never recorded. It must
    # test SUCCEEDED, mirroring job_status_post_save.
    elif instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=TENSORBOARD_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    if JobLifeCycle.is_done(instance.status):
        # Finished jobs no longer need a cached status in Redis.
        RedisStatuses.delete_status(job.uuid.hex)
    new_operation_run_status(entity_type=content_types.TENSORBOARD_JOB,
                             entity=job,
                             status=instance.status)
def k8s_events_handle_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Apply a k8s pod event to the matching project job's status.

    Looks up the job (and validates its project still exists), caches the
    incoming status in Redis, records node scheduling, and persists the new
    status; retries on concurrent-write IntegrityError.
    """
    event_details = payload['details']
    labels = event_details['labels']
    uuid = labels['job_uuid']
    name = labels['job_name']
    project = labels.get('project_name')
    logger.debug('handling events status for job %s', name)

    # Guard: the job may have been deleted since the event was queued.
    try:
        target_job = Job.objects.get(uuid=uuid)
    except Job.DoesNotExist:
        logger.debug('Job `%s` does not exist', name)
        return

    # Guard: the owning project may have been deleted as well.
    try:
        target_job.project
    except Project.DoesNotExist:
        logger.debug('Project for job `%s` does not exist', project)
        return

    # Persist the new status; a concurrent writer can trigger an
    # IntegrityError, in which case the task is retried.
    try:
        RedisStatuses.set_status(uuid, payload['status'])
        set_node_scheduling(target_job, event_details['node_name'])
        target_job.set_status(status=payload['status'],
                              message=payload['message'],
                              traceback=payload.get('traceback'),
                              details=event_details)
    except IntegrityError:
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def job_status_post_save(sender, **kwargs):
    """Propagate a saved job status to its job and record audit events.

    Updates the job's cached status and timestamps, emits lifecycle audit
    events, clears the Redis status cache for finished jobs, and notifies the
    operation run pipeline.
    """
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])

    auditor.record(event_type=JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)

    # Lifecycle-specific audit events; CREATED is recorded without a
    # previous status, the terminal ones carry it.
    terminal_events = {
        JobLifeCycle.STOPPED: JOB_STOPPED,
        JobLifeCycle.FAILED: JOB_FAILED,
        JobLifeCycle.SUCCEEDED: JOB_SUCCEEDED,
    }
    if instance.status == JobLifeCycle.CREATED:
        auditor.record(event_type=JOB_CREATED, instance=job)
    elif instance.status in terminal_events:
        auditor.record(event_type=terminal_events[instance.status],
                       instance=job,
                       previous_status=previous_status)

    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=JOB_DONE,
                       instance=job,
                       previous_status=previous_status)
        # Finished jobs no longer need a cached status in Redis.
        RedisStatuses.delete_status(job.uuid.hex)

    new_operation_run_status(entity_type=content_types.JOB,
                             entity=job,
                             status=instance.status)
def test_status_change(self):
    """Statuses can be set, read back, and deleted; unknown keys read as None."""
    key = 'job-uuid'
    # An unknown key reads back as None, and deleting it is a no-op.
    assert RedisStatuses.get_status(key) is None
    RedisStatuses.delete_status(key)
    # A stored status stays readable until it is deleted.
    RedisStatuses.set_status(key, 'running')
    assert RedisStatuses.get_status(key) == 'running'
    RedisStatuses.delete_status(key)
    assert RedisStatuses.get_status(key) is None
def should_handle_job_status(pod_state: Any, status: str) -> bool:
    """Return True if the pod event's status transition should be processed.

    The event is handled when no status is cached (missing or evicted), when
    the cached status can legally transition to the new one, or when Redis is
    unreachable (fail open rather than dropping the event).
    """
    job_uuid = pod_state['details']['labels']['job_uuid']
    # Guard the Redis read itself: previously only the second (redundant)
    # lookup was protected, so a connection error on the first read escaped.
    try:
        current_status = RedisStatuses.get_status(job=job_uuid)
    except redis.connection.ConnectionError:
        return True
    if not current_status:
        # If the status does not exist or is evicted
        return True
    # Reuse the value already fetched — the original re-queried Redis here,
    # which was a wasted round-trip and a small race window.
    return JobLifeCycle.can_transition(status_from=current_status,
                                       status_to=status)
def k8s_events_handle_experiment_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Apply a k8s pod event to the matching experiment job's status.

    Validates the job and its experiment still exist, waits (via retry) for an
    initial status to be persisted, skips failures that are still within the
    allowed restart budget, then caches and persists the new status.
    """
    event_details = payload['details']
    uuid = event_details['labels']['job_uuid']
    restarts = payload.get('restart_count', 0)
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 uuid, payload['status'])

    # Guard: the experiment job may have been deleted since the event queued.
    try:
        experiment_job = ExperimentJob.objects.get(uuid=uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', uuid)
        return

    # Guard: the owning experiment may be gone as well.
    try:
        experiment = experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', uuid)
        return

    # No status recorded yet — give the creation path a moment and retry.
    if experiment_job.last_status is None and self.request.retries < 2:
        self.retry(countdown=1)

    # Failures within the restart budget are ignored; k8s will restart the pod.
    allowed_restarts = experiment.max_restarts or conf.get(MAX_RESTARTS_EXPERIMENTS)
    if JobLifeCycle.failed(payload['status']) and restarts < allowed_restarts:
        return

    # Persist the new status; retry on concurrent-write IntegrityError.
    try:
        RedisStatuses.set_status(uuid, payload['status'])
        set_node_scheduling(experiment_job, event_details['node_name'])
        experiment_job.set_status(status=payload['status'],
                                  message=payload['message'],
                                  created_at=payload.get('created_at'),
                                  traceback=payload.get('traceback'),
                                  details=event_details)
        logger.debug('status %s is set for job %s %s',
                     payload['status'], uuid, experiment_job.id)
    except IntegrityError:
        logger.info('Retry job status %s handling %s', payload['status'], uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def experiment_job_status_post_save(sender, **kwargs):
    """Propagate a saved experiment-job status to its job and audit it.

    Updates the job's cached status and timestamps, drops monitoring state for
    finished jobs, and records the new-status audit event.
    """
    instance = kwargs['instance']
    job = instance.job

    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])

    # check if the new status is done to remove the containers from the monitors
    if job.is_done:
        # TODO: re-enable container monitor: RedisJobContainers().remove_job(job.uuid.hex)
        RedisStatuses.delete_status(job.uuid.hex)

    # Check if we need to change the experiment status
    auditor.record(event_type=EXPERIMENT_JOB_NEW_STATUS, instance=job)
def k8s_events_handle_plugin_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Apply a k8s pod event to the matching plugin (tensorboard/notebook) job.

    Resolves the job by its `app` label, validates its project still exists,
    then caches and persists the new status; retries on concurrent-write
    IntegrityError.
    """
    details = payload['details']
    app = details['labels']['app']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    project_name = details['labels'].get('project_name')
    logger.debug('handling events status for job %s %s', job_name, app)
    try:
        if app == conf.get(APP_LABELS_TENSORBOARD):
            job = TensorboardJob.objects.get(uuid=job_uuid)
        elif app == conf.get(APP_LABELS_NOTEBOOK):
            job = NotebookJob.objects.get(uuid=job_uuid)
        else:
            logger.info('Plugin job `%s` does not exist', app)
            return
    except (NotebookJob.DoesNotExist, TensorboardJob.DoesNotExist):
        logger.debug('`%s - %s` does not exist', app, job_name)
        return
    try:
        job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Fixed: previously fell through and tried to update a job whose
        # project was deleted; return here, consistent with the other
        # k8s event handlers.
        return
    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       traceback=payload.get('traceback'),
                       details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_events_handle_build_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Apply a k8s pod event to the matching build job's status.

    Validates the build job and its project still exist, skips failures that
    are still within the allowed restart budget, then caches and persists the
    new status; retries on concurrent-write IntegrityError.
    """
    details = payload['details']
    app = details['labels']['app']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    restart_count = payload.get('restart_count', 0)
    project_name = details['labels'].get('project_name')
    # Fixed log-message typo: "build jon" -> "build job".
    logger.debug('handling events status for build job %s %s', job_name, app)
    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return
    try:
        build_job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Fixed: previously fell through and tried to update a job whose
        # project was deleted; return here, consistent with the other
        # k8s event handlers.
        return
    # Failures within the restart budget are ignored; k8s restarts the pod.
    max_restarts = build_job.max_restarts or conf.get(MAX_RESTARTS_BUILD_JOBS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        return
    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def should_handle_job_status(pod_state: Any, status: str) -> bool:
    """Handle the event only when it reports a status different from the cached one."""
    labels = pod_state['details']['labels']
    cached_status = RedisStatuses.get_status(job=labels['job_uuid'])
    return cached_status != status
def test_keys(self):
    """The status key is built from the KEY_STATUSES template."""
    expected_key = RedisStatuses.KEY_STATUSES.format('foo')
    assert RedisStatuses.get_status_key('foo') == expected_key