def events_handle_build_job_statuses(self, payload):
    """Build jobs statuses.

    Looks up the build job referenced by the event labels and records the
    reported status on it.

    Args:
        self: object exposing `retry(countdown=...)`; presumably the bound
            task instance -- confirm against the task registration.
        payload: event dict read for `details`, `status`, `message` and the
            optional `traceback`. `details` must contain `labels` (with
            `app`, `job_uuid`, `job_name` and optionally `project_name`)
            and `node_name`.

    Returns:
        None. Returns early, without touching the status, when the build job
        or its project cannot be found.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for build jon %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        # Accessing the relation is only done to detect a missing project.
        build_job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Nothing to update for a job whose project is gone; this mirrors
        # the experiment job handler, which also stops here.
        return

    # Set the new status
    try:
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def events_handle_experiment_job_statuses(self, payload):
    """Experiment jobs statuses.

    Resolves the experiment job named by `payload['details']['labels']['job_uuid']`
    and stores the reported status on it.

    Args:
        self: object exposing `retry(countdown=...)` and `request.retries`;
            presumably the bound task instance -- confirm.
        payload: event dict read for `details`, `status`, `message` and the
            optional `traceback`; `details` must contain `labels.job_uuid`
            and `node_name`.

    Returns:
        None. Returns early when the job or its experiment no longer exists.
    """
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    status = payload['status']
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, status)

    try:
        experiment_job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', job_uuid)
        return

    try:
        # Touch the relation only to find out whether the experiment is gone.
        experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', job_uuid)
        return

    # NOTE(review): a job with no status yet is retried shortly (at most on
    # the first two attempts); presumably this waits for the initial status
    # to be written -- confirm.
    no_status_yet = experiment_job.last_status is None
    if no_status_yet and self.request.retries < 2:
        self.retry(countdown=1)

    # Set the new status
    try:
        set_node_scheduling(experiment_job, details['node_name'])
        experiment_job.set_status(
            status=payload['status'],
            message=payload['message'],
            traceback=payload.get('traceback'),
            details=details,
        )
        logger.debug('status %s is set for job %s %s',
                     payload['status'], job_uuid, experiment_job.id)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        logger.info('Retry job status %s handling %s', payload['status'], job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def events_handle_logs_experiment_job(experiment_name,
                                      experiment_uuid,
                                      job_uuid,
                                      log_lines,
                                      task_type=None,
                                      task_idx=None):
    """Forward log lines of an experiment job to the experiment log.

    Args:
        experiment_name: name passed on to `safe_log_experiment_job`.
        experiment_uuid: uuid used to check that the experiment still exists.
        job_uuid: uuid of the emitting job; only used in the debug message.
        log_lines: iterable of log lines to store.
        task_type: optional task type used to prefix each line.
        task_idx: optional zero-based task index (int or numeric string);
            it is rendered one-based in the prefix.

    Returns:
        None. Nothing is logged when the experiment does not exist.
    """
    if not Experiment.objects.filter(uuid=experiment_uuid).exists():
        return

    logger.debug('handling log event for %s %s', experiment_uuid, job_uuid)

    # A plain truthiness test would treat index 0 as "no index" and drop the
    # prefix for the first task, so check for absence explicitly instead.
    has_task_idx = task_idx is not None and task_idx != ''
    if task_type and has_task_idx:
        log_lines = [
            '{}.{} -- {}'.format(task_type, int(task_idx) + 1, log_line)
            for log_line in log_lines
        ]

    safe_log_experiment_job(experiment_name=experiment_name, log_lines=log_lines)
def events_handle_plugin_job_statuses(self, payload):
    """Project Plugin jobs statuses.

    Resolves the tensorboard or notebook job referenced by the event labels
    (selected by the `app` label) and records the reported status on it.

    Args:
        self: object exposing `retry(countdown=...)`; presumably the bound
            task instance -- confirm against the task registration.
        payload: event dict read for `details`, `status`, `message` and the
            optional `traceback`. `details` must contain `labels` (with
            `app`, `job_uuid`, `job_name` and optionally `project_name`)
            and `node_name`.

    Returns:
        None. Returns early, without touching the status, when the app label
        is not a known plugin, or the job or its project cannot be found.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for job %s %s', job_name, app)

    try:
        if app == settings.APP_LABELS_TENSORBOARD:
            job = TensorboardJob.objects.get(uuid=job_uuid)
        elif app == settings.APP_LABELS_NOTEBOOK:
            job = NotebookJob.objects.get(uuid=job_uuid)
        else:
            logger.info('Plugin job `%s` does not exist', app)
            return
    except (NotebookJob.DoesNotExist, TensorboardJob.DoesNotExist):
        logger.debug('`%s - %s` does not exist', app, job_name)
        return

    try:
        # Accessing the relation is only done to detect a missing project.
        job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Nothing to update for a job whose project is gone; this mirrors
        # the experiment job handler, which also stops here.
        return

    # Set the new status
    try:
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       traceback=payload.get('traceback'),
                       details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def handle_events_namespace(cluster_id, payload):
    """Store a namespace event for a cluster.

    Args:
        cluster_id: id of the cluster the event belongs to.
        payload: mapping expanded as keyword arguments into
            `ClusterEvent.objects.create`.

    Returns:
        None. Persisting is best-effort: an `OperationalError` is logged and
        swallowed rather than propagated.
    """
    logger.debug('handling events namespace for cluster: %s', cluster_id)
    try:
        ClusterEvent.objects.create(cluster_id=cluster_id, **payload)
    except OperationalError:
        # Keep the best-effort contract, but leave a trace so dropped events
        # are visible instead of vanishing silently.
        logger.warning('Could not store namespace event for cluster: %s',
                       cluster_id,
                       exc_info=True)
def events_handle_logs_build_job(job_uuid, job_name, log_lines):
    """Forward log lines of a build job to the job log.

    Args:
        job_uuid: uuid used to check that the build job still exists.
        job_name: name passed on to `safe_log_job` and used in the debug message.
        log_lines: log lines handed unchanged to `safe_log_job`.

    Returns:
        None. Nothing is logged when no build job has the given uuid.
    """
    build_job_exists = BuildJob.objects.filter(uuid=job_uuid).exists()
    if build_job_exists:
        logger.debug('handling log event for %s', job_name)
        safe_log_job(job_name=job_name, log_lines=log_lines)