Beispiel #1
0
def events_handle_build_job_statuses(self, payload):
    """Apply an incoming status event to the build job it refers to.

    Args:
        self: The bound task instance; only `self.retry` is used here
            (presumably a Celery task bound with `bind=True` -- confirm).
        payload: Event mapping. Must contain `details` (itself holding
            `labels` and `node_name`), `status` and `message`; `traceback`
            is optional.

    The event is dropped (the function returns without setting a status)
    when the build job cannot be found, or when its project no longer
    exists. An `IntegrityError` raised while recording the status causes
    the task to be retried.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for build job %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        # Attribute access only: used to detect a missing project.
        build_job.project
    except Project.DoesNotExist:
        # The owning project is gone, so there is nothing meaningful left
        # to update; stop here, as the experiment job handler does.
        logger.debug('`%s` does not exist anymore', project_name)
        return

    # Set the new status
    try:
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Beispiel #2
0
def events_handle_experiment_job_statuses(self, payload):
    """Record an incoming status event on the matching experiment job.

    Args:
        self: The bound task instance; `self.request.retries` and
            `self.retry` are used (presumably a Celery bound task --
            confirm).
        payload: Event mapping. Must contain `details` (holding `labels`
            and `node_name`), `status` and `message`; `traceback` is
            optional.

    Returns early, without touching anything, when the job or its
    experiment cannot be found. While the job has no `last_status` yet and
    fewer than two retries have happened, a retry with a one second
    countdown is requested first. An `IntegrityError` raised while
    recording the status leads to another retry.
    """
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    status = payload['status']
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, status)

    try:
        experiment_job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', job_uuid)
        return

    try:
        # Attribute access only: used to detect a missing experiment.
        experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', job_uuid)
        return

    # NOTE(review): this relies on `self.retry` raising to abort the current
    # run; if it returned normally, execution would continue below.
    has_no_status_yet = experiment_job.last_status is None
    if has_no_status_yet and self.request.retries < 2:
        self.retry(countdown=1)

    # Record the status carried by the event.
    try:
        set_node_scheduling(experiment_job, details['node_name'])
        experiment_job.set_status(
            status=status,
            message=payload['message'],
            traceback=payload.get('traceback'),
            details=details,
        )
        logger.debug('status %s is set for job %s %s', status, job_uuid, experiment_job.id)
    except IntegrityError:
        # Concurrent writers can collide here; schedule another attempt.
        logger.info('Retry job status %s handling %s', status, job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Beispiel #3
0
def events_handle_logs_experiment_job(experiment_name,
                                      experiment_uuid,
                                      job_uuid,
                                      log_lines,
                                      task_type=None,
                                      task_idx=None):
    """Forward log lines of an experiment job to the experiment log.

    Args:
        experiment_name: Name passed on to `safe_log_experiment_job`.
        experiment_uuid: UUID used to check that the experiment exists.
        job_uuid: UUID of the job; only used in the debug message.
        log_lines: Iterable of log lines to store.
        task_type: Optional task type used to prefix every line.
        task_idx: Optional zero-based task index (int or numeric string);
            it is shown one-based in the prefix.

    Nothing is logged when no experiment with `experiment_uuid` exists.
    When both `task_type` and `task_idx` are given, every line is prefixed
    with `<task_type>.<task_idx + 1> -- `.
    """
    if not Experiment.objects.filter(uuid=experiment_uuid).exists():
        return

    logger.debug('handling log event for %s %s', experiment_uuid, job_uuid)
    # Test for "missing" explicitly rather than by truthiness: an integer
    # index of 0 is a valid task index and must still get its prefix.
    has_task_idx = task_idx is not None and task_idx != ''
    if task_type and has_task_idx:
        log_lines = [
            '{}.{} -- {}'.format(task_type,
                                 int(task_idx) + 1, log_line)
            for log_line in log_lines
        ]

    safe_log_experiment_job(experiment_name=experiment_name,
                            log_lines=log_lines)
Beispiel #4
0
def events_handle_plugin_job_statuses(self, payload):
    """Apply an incoming status event to a project plugin job.

    The `app` label selects the job model: `settings.APP_LABELS_TENSORBOARD`
    maps to `TensorboardJob` and `settings.APP_LABELS_NOTEBOOK` maps to
    `NotebookJob`.

    Args:
        self: The bound task instance; only `self.retry` is used here
            (presumably a Celery task bound with `bind=True` -- confirm).
        payload: Event mapping. Must contain `details` (itself holding
            `labels` and `node_name`), `status` and `message`; `traceback`
            is optional.

    The event is dropped (the function returns without setting a status)
    when the `app` label is not recognised, when the job cannot be found,
    or when its project no longer exists. An `IntegrityError` raised while
    recording the status causes the task to be retried.
    """
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for job %s %s', job_name, app)

    try:
        if app == settings.APP_LABELS_TENSORBOARD:
            job = TensorboardJob.objects.get(uuid=job_uuid)
        elif app == settings.APP_LABELS_NOTEBOOK:
            job = NotebookJob.objects.get(uuid=job_uuid)
        else:
            logger.info('Plugin job `%s` does not exist', app)
            return
    except (NotebookJob.DoesNotExist, TensorboardJob.DoesNotExist):
        logger.debug('`%s - %s` does not exist', app, job_name)
        return

    try:
        # Attribute access only: used to detect a missing project.
        job.project
    except Project.DoesNotExist:
        # The owning project is gone, so there is nothing meaningful left
        # to update; stop here, as the experiment job handler does.
        logger.debug('`%s` does not exist anymore', project_name)
        return

    # Set the new status
    try:
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       traceback=payload.get('traceback'),
                       details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
Beispiel #5
0
def handle_events_namespace(cluster_id, payload):
    """Store a namespace event for the given cluster.

    Args:
        cluster_id: Identifier stored as `cluster_id` on the new
            `ClusterEvent` row.
        payload: Mapping expanded into keyword arguments for
            `ClusterEvent.objects.create`.

    Saving is best-effort: an `OperationalError` is logged and swallowed,
    so the caller never sees it.
    """
    logger.debug('handling events namespace for cluster: %s', cluster_id)
    try:
        ClusterEvent.objects.create(cluster_id=cluster_id, **payload)
    except OperationalError as e:
        # Keep the original non-raising behaviour, but leave a trace so a
        # dropped event can be diagnosed instead of vanishing silently.
        logger.warning('Could not store namespace event for cluster %s: %s',
                       cluster_id, e)
Beispiel #6
0
def events_handle_logs_build_job(job_uuid, job_name, log_lines):
    """Forward log lines of a build job to the job log.

    Args:
        job_uuid: UUID used to check that the build job exists.
        job_name: Name passed on to `safe_log_job` and used in the debug
            message.
        log_lines: Log lines passed on to `safe_log_job`.

    Nothing happens when no build job with `job_uuid` exists.
    """
    build_job_exists = BuildJob.objects.filter(uuid=job_uuid).exists()
    if build_job_exists:
        logger.debug('handling log event for %s', job_name)
        safe_log_job(job_name=job_name, log_lines=log_lines)