def k8s_events_handle_build_job_statuses(self, payload):
    """Handle a Kubernetes status event for a build job.

    Looks up the `BuildJob` whose uuid matches the `job_uuid` label, records
    the node it was scheduled on and stores the new status. Events for build
    jobs that no longer exist, or whose project no longer exists, are logged
    and dropped.

    Args:
        self: task instance; only `self.retry` is used here (presumably a
            bound Celery task -- confirm against where this is registered).
        payload: event dict. Must provide `details` (containing `labels` and
            `node_name`), `status` and `message`; `traceback` is optional.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for build jon %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        # Attribute access only: it raises when the related project is gone.
        build_job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Drop the event instead of falling through and writing a status for
        # a job whose project has been removed.
        return

    # Set the new status
    try:
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_events_handle_experiment_job_statuses(self, payload):
    """Handle a Kubernetes status event for an experiment job.

    The `ExperimentJob` is looked up by the `job_uuid` label. Events for
    unknown jobs, or jobs whose experiment no longer exists, are logged and
    dropped. Otherwise the scheduling node and the new status are stored.

    Args:
        self: task instance; `self.request.retries` and `self.retry` are used
            (presumably a bound Celery task -- confirm at registration).
        payload: event dict. Must provide `details` (containing `labels` and
            `node_name`), `status` and `message`; `created_at` and
            `traceback` are optional.
    """
    event_details = payload['details']
    job_uuid = event_details['labels']['job_uuid']
    new_status = payload['status']
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, new_status)

    try:
        experiment_job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', job_uuid)
        return

    try:
        # Attribute access only: it raises when the experiment is gone.
        experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', job_uuid)
        return

    # A job without any recorded status gets up to two quick retries before
    # the event is applied.
    if experiment_job.last_status is None and self.request.retries < 2:
        self.retry(countdown=1)

    # Set the new status
    try:
        set_node_scheduling(experiment_job, event_details['node_name'])
        experiment_job.set_status(
            status=new_status,
            message=payload['message'],
            created_at=payload.get('created_at'),
            traceback=payload.get('traceback'),
            details=event_details,
        )
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        logger.info('Retry job status %s handling %s', new_status, job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
    else:
        logger.debug('status %s is set for job %s %s',
                     new_status, job_uuid, experiment_job.id)
def get_plugin_job(app, job_uuid=None, job_id=None):
    """Return the tensorboard or notebook job matching the given identifiers.

    Args:
        app: app label; compared against the configured tensorboard and
            notebook labels to choose which model to query.
        job_uuid: optional value matched against the job's `uuid` field.
        job_id: optional value matched against the job's `job_id` field.

    Returns:
        The matching `TensorboardJob` or `NotebookJob`, or `None` when no
        identifier was supplied, when `app` is not a known plugin label, or
        when no such job exists.
    """
    lookup = {}
    if job_uuid:
        lookup['uuid'] = job_uuid
    if job_id:
        lookup['job_id'] = job_id

    if not lookup:
        # Without any filter `.get()` would match an arbitrary row (or raise
        # because several rows match), so treat "no identifier" as "no job".
        return None

    try:
        if app == conf.get(APP_LABELS_TENSORBOARD):
            return TensorboardJob.objects.get(**lookup)
        if app == conf.get(APP_LABELS_NOTEBOOK):
            return NotebookJob.objects.get(**lookup)
    except (NotebookJob.DoesNotExist, TensorboardJob.DoesNotExist):
        return None

    # Reached only when `app` matched neither plugin label.
    logger.info('Plugin job `%s` does not exist', app)
    return None
def k8s_events_handle_experiment_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Handle a Kubernetes status event for an experiment job.

    The `ExperimentJob` is looked up by the `job_uuid` label. Events for
    unknown jobs, or jobs whose experiment no longer exists, are logged and
    dropped. A failed status is ignored while the reported restart count is
    still below the allowed maximum. Otherwise the status is pushed to
    `RedisStatuses`, the scheduling node is recorded and the status is stored
    on the job.

    Args:
        self: task instance; `self.request.retries` and `self.retry` are used.
        payload: event dict. Must provide `details` (containing `labels` and
            `node_name`), `status` and `message`; `restart_count` (default
            0), `created_at` and `traceback` are optional.
    """
    event_details = payload['details']
    job_uuid = event_details['labels']['job_uuid']
    restarts_so_far = payload.get('restart_count', 0)
    new_status = payload['status']
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, new_status)

    try:
        experiment_job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', job_uuid)
        return

    try:
        parent_experiment = experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', job_uuid)
        return

    # A job without any recorded status gets up to two quick retries before
    # the event is applied.
    if experiment_job.last_status is None and self.request.retries < 2:
        self.retry(countdown=1)

    # The experiment's own limit wins; the configured default is the fallback.
    allowed_restarts = (parent_experiment.max_restarts
                        or conf.get(MAX_RESTARTS_EXPERIMENTS))
    if JobLifeCycle.failed(new_status) and restarts_so_far < allowed_restarts:
        # Restart budget not exhausted yet: skip recording this failure.
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, new_status)
        set_node_scheduling(experiment_job, event_details['node_name'])
        experiment_job.set_status(
            status=new_status,
            message=payload['message'],
            created_at=payload.get('created_at'),
            traceback=payload.get('traceback'),
            details=event_details,
        )
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        logger.info('Retry job status %s handling %s', new_status, job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
    else:
        logger.debug('status %s is set for job %s %s',
                     new_status, job_uuid, experiment_job.id)
def k8s_events_handle_plugin_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Handle a Kubernetes status event for a project plugin job.

    The `app` label selects the model (`TensorboardJob` or `NotebookJob`) and
    the `job_uuid` label selects the row. Events for unknown app labels,
    missing jobs, or jobs whose project no longer exists are logged and
    dropped. Otherwise the status is pushed to `RedisStatuses`, the
    scheduling node is recorded and the status is stored on the job.

    Args:
        self: task instance; only `self.retry` is used here.
        payload: event dict. Must provide `details` (containing `labels` and
            `node_name`), `status` and `message`; `traceback` is optional.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for job %s %s', job_name, app)

    try:
        if app == conf.get(APP_LABELS_TENSORBOARD):
            job = TensorboardJob.objects.get(uuid=job_uuid)
        elif app == conf.get(APP_LABELS_NOTEBOOK):
            job = NotebookJob.objects.get(uuid=job_uuid)
        else:
            logger.info('Plugin job `%s` does not exist', app)
            return
    except (NotebookJob.DoesNotExist, TensorboardJob.DoesNotExist):
        logger.debug('`%s - %s` does not exist', app, job_name)
        return

    try:
        # Attribute access only: it raises when the related project is gone.
        job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Drop the event instead of falling through and writing a status for
        # a job whose project has been removed.
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       traceback=payload.get('traceback'),
                       details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_events_handle_build_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Handle a Kubernetes status event for a build job.

    Looks up the `BuildJob` whose uuid matches the `job_uuid` label. Events
    for build jobs that no longer exist, or whose project no longer exists,
    are logged and dropped. A failed status is ignored while the reported
    restart count is still below the allowed maximum. Otherwise the status is
    pushed to `RedisStatuses`, the scheduling node is recorded and the status
    is stored on the build job.

    Args:
        self: task instance; only `self.retry` is used here.
        payload: event dict. Must provide `details` (containing `labels` and
            `node_name`), `status` and `message`; `restart_count` (default 0)
            and `traceback` are optional.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    restart_count = payload.get('restart_count', 0)
    project_name = labels.get('project_name')
    logger.debug('handling events status for build jon %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        # Attribute access only: it raises when the related project is gone.
        build_job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Drop the event instead of falling through and writing a status for
        # a job whose project has been removed.
        return

    # The job's own limit wins; the configured default is the fallback.
    max_restarts = build_job.max_restarts or conf.get(MAX_RESTARTS_BUILD_JOBS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        # Restart budget not exhausted yet: skip recording this failure.
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_handle_events_resources(payload, persist):
    """Log an incoming resources event.

    Args:
        payload: the resources event; logged as-is at info level.
        persist: whether the resources should be persisted; currently only
            reported in the log line.
    """
    # TODO: persist the resources here when `persist` is requested.
    log_info = logger.info
    log_info('handling events resources with persist:%s', persist)
    log_info(payload)