Example #1
def tensorboard_job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status
    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(event_type=TENSORBOARD_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status,
                   target='project')
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=TENSORBOARD_STOPPED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    elif instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=TENSORBOARD_FAILED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    elif instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=TENSORBOARD_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    if JobLifeCycle.is_done(instance.status):
        RedisStatuses.delete_status(job.uuid.hex)
    new_operation_run_status(entity_type=content_types.TENSORBOARD_JOB,
                             entity=job,
                             status=instance.status)
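These `*_status_post_save` handlers are presumably connected through Django's `post_save` signal. A minimal sketch of that wiring, assuming a `TensorboardJobStatus` status model (the sender name here is hypothetical):

from django.db.models.signals import post_save

# Run the handler after every save of the (hypothetical) status model.
post_save.connect(tensorboard_job_status_post_save,
                  sender=TensorboardJobStatus,
                  dispatch_uid='tensorboard_job_status_post_save')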
Example #2
def k8s_events_handle_job_statuses(self: 'workers.app.task',
                                   payload: Dict) -> None:
    """Project jobs statuses"""
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    project_name = details['labels'].get('project_name')
    logger.debug('handling events status for job %s', job_name)

    try:
        job = Job.objects.get(uuid=job_uuid)
    except Job.DoesNotExist:
        logger.debug('Job `%s` does not exist', job_name)
        return

    try:
        job.project
    except Project.DoesNotExist:
        logger.debug('Project for job `%s` does not exist', project_name)
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       traceback=payload.get('traceback'),
                       details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
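The `self: 'workers.app.task'` annotation and the `self.retry(...)` calls indicate these handlers run as bound Celery tasks. A minimal sketch of how such a task might be declared; the decorator arguments are assumptions, not the project's actual settings:

from celery import Celery

app = Celery('workers')

# bind=True passes the task instance as `self`, enabling self.retry().
@app.task(bind=True, max_retries=3)
def k8s_events_handle_job_statuses(self, payload):
    ...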
Example #3
def job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status
    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(event_type=JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)

    if instance.status == JobLifeCycle.CREATED:
        auditor.record(event_type=JOB_CREATED, instance=job)
    elif instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status)
    elif instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=JOB_FAILED,
                       instance=job,
                       previous_status=previous_status)
    elif instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status)
    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=JOB_DONE,
                       instance=job,
                       previous_status=previous_status)
        RedisStatuses.delete_status(job.uuid.hex)
    new_operation_run_status(entity_type=content_types.JOB,
                             entity=job,
                             status=instance.status)
Example #4
def test_status_change(self):
    assert RedisStatuses.get_status('job-uuid') is None
    RedisStatuses.delete_status('job-uuid')
    RedisStatuses.set_status('job-uuid', 'running')
    assert RedisStatuses.get_status('job-uuid') == 'running'
    RedisStatuses.delete_status('job-uuid')
    assert RedisStatuses.get_status('job-uuid') is None
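The test above, together with `test_keys` at the end of this listing, exercises a small Redis-backed status cache. A minimal sketch of what such a helper might look like, built on redis-py; the key format and connection setup are assumptions:

import redis

class RedisStatuses:
    """Caches the last known status of a job, keyed by its uuid."""
    KEY_STATUSES = 'statuses:{}'
    connection = redis.Redis()  # assumed default connection

    @classmethod
    def get_status_key(cls, job: str) -> str:
        return cls.KEY_STATUSES.format(job)

    @classmethod
    def get_status(cls, job: str):
        status = cls.connection.get(cls.get_status_key(job))
        return status.decode() if status is not None else None

    @classmethod
    def set_status(cls, job: str, status: str) -> None:
        cls.connection.set(cls.get_status_key(job), status)

    @classmethod
    def delete_status(cls, job: str) -> None:
        cls.connection.delete(cls.get_status_key(job))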
Example #5
def should_handle_job_status(pod_state: Any, status: str) -> bool:
    job_uuid = pod_state['details']['labels']['job_uuid']
    try:
        current_status = RedisStatuses.get_status(job=job_uuid)
    except redis.connection.ConnectionError:
        # If Redis is unreachable, default to handling the status
        return True
    if not current_status:  # The status does not exist or was evicted
        return True

    return JobLifeCycle.can_transition(status_from=current_status,
                                       status_to=status)
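`JobLifeCycle.can_transition` is presumably a whitelist over a job state machine. A minimal sketch of that idea; the status names and transition table below are illustrative, not the project's actual definitions:

class JobLifeCycle:
    CREATED = 'created'
    RUNNING = 'running'
    SUCCEEDED = 'succeeded'
    FAILED = 'failed'
    STOPPED = 'stopped'

    # Hypothetical transition table; terminal states allow no transitions.
    TRANSITIONS = {
        CREATED: {RUNNING, SUCCEEDED, FAILED, STOPPED},
        RUNNING: {SUCCEEDED, FAILED, STOPPED},
        SUCCEEDED: set(),
        FAILED: set(),
        STOPPED: set(),
    }

    @classmethod
    def can_transition(cls, status_from: str, status_to: str) -> bool:
        return status_to in cls.TRANSITIONS.get(status_from, set())

    @classmethod
    def is_done(cls, status: str) -> bool:
        return status in {cls.SUCCEEDED, cls.FAILED, cls.STOPPED}

    @classmethod
    def failed(cls, status: str) -> bool:
        return status == cls.FAILED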
Example #6
def k8s_events_handle_experiment_job_statuses(self: 'workers.app.task',
                                              payload: Dict) -> None:
    """Experiment jobs statuses"""
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    restart_count = payload.get('restart_count', 0)
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, payload['status'])

    try:
        job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid `%s` does not exist', job_uuid)
        return

    try:
        experiment = job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore',
                     job_uuid)
        return

    if job.last_status is None and self.request.retries < 2:
        self.retry(countdown=1)

    max_restarts = experiment.max_restarts or conf.get(
        MAX_RESTARTS_EXPERIMENTS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       created_at=payload.get('created_at'),
                       traceback=payload.get('traceback'),
                       details=details)
        logger.debug('status %s is set for job %s %s', payload['status'],
                     job_uuid, job.id)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        logger.info('Retry job status %s handling %s', payload['status'],
                    job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Example #7
def experiment_job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job

    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(
        update_fields=['status', 'started_at', 'updated_at', 'finished_at'])

    # Check whether the new status is final, so the containers can be removed from the monitors
    if job.is_done:
        # TODO: re-enable container monitor: RedisJobContainers().remove_job(job.uuid.hex)
        RedisStatuses.delete_status(job.uuid.hex)

    # Check if we need to change the experiment status
    auditor.record(event_type=EXPERIMENT_JOB_NEW_STATUS, instance=job)
Example #8
def k8s_events_handle_plugin_job_statuses(self: 'workers.app.task',
                                          payload: Dict) -> None:
    """Project Plugin jobs statuses"""
    details = payload['details']
    app = details['labels']['app']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    project_name = details['labels'].get('project_name')
    logger.debug('handling events status for job %s %s', job_name, app)

    try:
        if app == conf.get(APP_LABELS_TENSORBOARD):
            job = TensorboardJob.objects.get(uuid=job_uuid)
        elif app == conf.get(APP_LABELS_NOTEBOOK):
            job = NotebookJob.objects.get(uuid=job_uuid)
        else:
            logger.info('Plugin job `%s` does not exist', app)
            return
    except (NotebookJob.DoesNotExist, TensorboardJob.DoesNotExist):
        logger.debug('`%s - %s` does not exist', app, job_name)
        return

    try:
        job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       traceback=payload.get('traceback'),
                       details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Example #9
def k8s_events_handle_build_job_statuses(self: 'workers.app.task',
                                         payload: Dict) -> None:
    """Project Plugin jobs statuses"""
    details = payload['details']
    app = details['labels']['app']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    restart_count = payload.get('restart_count', 0)
    project_name = details['labels'].get('project_name')
    logger.debug('handling events status for build job %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        build_job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)
        return

    max_restarts = build_job.max_restarts or conf.get(MAX_RESTARTS_BUILD_JOBS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Example #10
def should_handle_job_status(pod_state: Any, status: str) -> bool:
    job_uuid = pod_state['details']['labels']['job_uuid']
    return RedisStatuses.get_status(job=job_uuid) != status
def test_keys(self):
    assert RedisStatuses.get_status_key(
        'foo') == RedisStatuses.KEY_STATUSES.format('foo')