Exemple #1
0
def k8s_events_handle_build_job_statuses(self, payload):
    """Handle a k8s status event for a build job.

    The build job is looked up by the `job_uuid` label found in
    `payload['details']`. The event is dropped when the build job, or the
    project it belongs to, cannot be found; otherwise the node is recorded
    and the new status is stored.

    Args:
        self: object exposing `retry(countdown=...)` (presumably the bound
            task instance -- confirm against the task decorator).
        payload: mapping with `details` (holding `labels` and `node_name`),
            `status` and `message`; `traceback` is optional.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for build jon %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        build_job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Stop here: there is nothing left to update once the project is gone.
        return

    # Set the new status
    try:
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Exemple #2
0
def k8s_events_reconcile_job_statuses(job_id, status, created_at) -> None:
    """Reconcile the stored status of a job.

    Does nothing when the job cannot be found or is already done
    (`job.is_done`); otherwise sets `status` on the job with the message
    'Status was reconciled.'.

    Args:
        job_id: primary key used to look up the `Job`.
        status: status to set on the job.
        created_at: value forwarded to `set_status` as `created_at`.
    """
    try:
        job = Job.objects.get(id=job_id)
    except Job.DoesNotExist:
        # The lookup is on `Job`, so this is the exception a missing row raises.
        logger.debug('Job `%s` does not exist', job_id)
        return

    if job.is_done:
        return

    job.set_status(status=status,
                   message='Status was reconciled.',
                   created_at=created_at)
Exemple #3
0
def k8s_events_reconcile_plugin_job_statuses(job_id, app, status,
                                             created_at) -> None:
    """Reconcile the stored status of a plugin job.

    The job is resolved through `get_plugin_job` from `app` and `job_id`.
    Nothing is changed when no job is found or when the job is already done.
    """
    plugin_job = get_plugin_job(app=app, job_uuid=job_id)

    if plugin_job:
        # Only jobs that are still running need their status reconciled.
        if not plugin_job.is_done:
            plugin_job.set_status(
                status=status,
                message='Status was reconciled.',
                created_at=created_at,
            )
        return

    logger.debug('Job `%s` does not exist', job_id)
Exemple #4
0
def k8s_events_handle_experiment_job_statuses(self, payload):
    """Apply a k8s status event to the matching experiment job.

    The event is ignored when the experiment job or its experiment cannot be
    found. While the job has no status yet, the task is retried (up to the
    second retry) before the new status is stored.
    """
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    new_status = payload['status']
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, new_status)

    try:
        experiment_job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', job_uuid)
        return

    # Accessing the relation raises when the experiment row is gone.
    try:
        experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', job_uuid)
        return

    if experiment_job.last_status is None:
        if self.request.retries < 2:
            self.retry(countdown=1)

    # Set the new status
    try:
        set_node_scheduling(experiment_job, details['node_name'])
        experiment_job.set_status(
            status=new_status,
            message=payload['message'],
            created_at=payload.get('created_at'),
            traceback=payload.get('traceback'),
            details=details,
        )
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        logger.info('Retry job status %s handling %s', new_status, job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
    else:
        logger.debug('status %s is set for job %s %s',
                     new_status, job_uuid, experiment_job.id)
Exemple #5
0
def k8s_events_handle_job_statuses(self: 'workers.app.task',
                                   payload: Dict) -> None:
    """Apply a k8s status event to the matching project job.

    The event is ignored when the job or its project cannot be found.
    Otherwise the status is pushed to `RedisStatuses`, the node is recorded
    and the status is stored on the job.
    """
    details = payload['details']
    labels = details['labels']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for job %s', job_name)

    try:
        project_job = Job.objects.get(uuid=job_uuid)
    except Job.DoesNotExist:
        logger.debug('Job `%s` does not exist', job_name)
        return

    # Accessing the relation raises when the project row is gone.
    try:
        project_job.project
    except Project.DoesNotExist:
        logger.debug('Project for job `%s` does not exist', project_name)
        return

    # Set the new status
    try:
        new_status = payload['status']
        RedisStatuses.set_status(job_uuid, new_status)
        set_node_scheduling(project_job, details['node_name'])
        project_job.set_status(
            status=new_status,
            message=payload['message'],
            traceback=payload.get('traceback'),
            details=details,
        )
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Exemple #6
0
def k8s_events_handle_build_job_statuses(self: 'workers.app.task',
                                         payload: Dict) -> None:
    """Handle a k8s status event for a build job.

    The build job is looked up by the `job_uuid` label found in
    `payload['details']`. The event is dropped when the build job, or the
    project it belongs to, cannot be found. A failed status is also skipped
    while the reported `restart_count` is still below the allowed maximum.
    Otherwise the status is pushed to `RedisStatuses`, the node is recorded
    and the status is stored on the build job.

    Args:
        self: the task instance; only `self.retry` is used here.
        payload: mapping with `details` (holding `labels` and `node_name`),
            `status` and `message`; `traceback` and `restart_count` are
            optional (`restart_count` defaults to 0).
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    restart_count = payload.get('restart_count', 0)
    project_name = labels.get('project_name')
    logger.debug('handling events status for build jon %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        build_job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Stop here: there is nothing left to update once the project is gone.
        return

    # The per-job limit wins; the configured default is the fallback.
    max_restarts = build_job.max_restarts or conf.get(MAX_RESTARTS_BUILD_JOBS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        # Presumably the job can still be restarted, so the failure is not
        # recorded yet -- confirm against the scheduler.
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Exemple #7
0
def k8s_events_handle_experiment_job_statuses(self: 'workers.app.task',
                                              payload: Dict) -> None:
    """Apply a k8s status event to the matching experiment job.

    The event is ignored when the experiment job or its experiment cannot be
    found, and a failed status is skipped while `restart_count` is below the
    allowed maximum. While the job has no status yet, the task is retried
    (up to the second retry) before the new status is stored.
    """
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    restart_count = payload.get('restart_count', 0)
    new_status = payload['status']
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, new_status)

    try:
        experiment_job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', job_uuid)
        return

    # Accessing the relation raises when the experiment row is gone.
    try:
        parent_experiment = experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore',
                     job_uuid)
        return

    if experiment_job.last_status is None:
        if self.request.retries < 2:
            self.retry(countdown=1)

    # The experiment's own limit wins; the configured default is the fallback.
    max_restarts = (parent_experiment.max_restarts
                    or conf.get(MAX_RESTARTS_EXPERIMENTS))
    if JobLifeCycle.failed(new_status):
        if restart_count < max_restarts:
            return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, new_status)
        set_node_scheduling(experiment_job, details['node_name'])
        experiment_job.set_status(
            status=new_status,
            message=payload['message'],
            created_at=payload.get('created_at'),
            traceback=payload.get('traceback'),
            details=details,
        )
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        logger.info('Retry job status %s handling %s', new_status,
                    job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
    else:
        logger.debug('status %s is set for job %s %s', new_status,
                     job_uuid, experiment_job.id)
Exemple #8
0
def k8s_events_handle_plugin_job_statuses(self: 'workers.app.task',
                                          payload: Dict) -> None:
    """Handle a k8s status event for a plugin (tensorboard/notebook) job.

    The `app` label selects the model to query: `TensorboardJob` or
    `NotebookJob`. The event is dropped when the app label is not one of
    those two, or when the job or its project cannot be found. Otherwise the
    status is pushed to `RedisStatuses`, the node is recorded and the status
    is stored on the job.

    Args:
        self: the task instance; only `self.retry` is used here.
        payload: mapping with `details` (holding `labels` and `node_name`),
            `status` and `message`; `traceback` is optional.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for job %s %s', job_name, app)

    try:
        if app == conf.get(APP_LABELS_TENSORBOARD):
            job = TensorboardJob.objects.get(uuid=job_uuid)
        elif app == conf.get(APP_LABELS_NOTEBOOK):
            job = NotebookJob.objects.get(uuid=job_uuid)
        else:
            logger.info('Plugin job `%s` does not exist', app)
            return
    except (NotebookJob.DoesNotExist, TensorboardJob.DoesNotExist):
        logger.debug('`%s - %s` does not exist', app, job_name)
        return

    try:
        job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        # Stop here: there is nothing left to update once the project is gone.
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       traceback=payload.get('traceback'),
                       details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Exemple #9
0
def k8s_handle_events_namespace(cluster_id: int, payload: Dict) -> None:
    """Persist a namespace event for a cluster (best effort).

    Args:
        cluster_id: id of the cluster the event is attached to.
        payload: keyword arguments forwarded to `ClusterEvent.objects.create`.

    An `OperationalError` raised while creating the event is logged and
    otherwise ignored, so the caller never sees it.
    """
    logger.debug('handling events namespace for cluster: %s', cluster_id)
    try:
        ClusterEvent.objects.create(cluster_id=cluster_id, **payload)
    except OperationalError:
        # Still best effort, but leave a trace instead of dropping silently.
        logger.warning('Could not store namespace event for cluster: %s',
                       cluster_id, exc_info=True)