Exemple #1
0
    def _handle_build_job_done(cls, event: 'Event') -> None:
        """Queue a notification task when a build job finishes."""
        build_job = event.instance
        if not build_job:
            # Event without an instance: nothing to notify about.
            return

        workers.send(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                     kwargs={'build_job_id': build_job.id})
Exemple #2
0
 def perform_destroy(self, instance):
     """Archive the project and queue its immediate deletion."""
     instance.archive()
     deletion_kwargs = {
         'project_id': instance.id,
         'immediate': True,
     }
     workers.send(SchedulerCeleryTasks.PROJECTS_SCHEDULE_DELETION,
                  kwargs=deletion_kwargs)
Exemple #3
0
def stop_experiment_group(group: 'ExperimentGroup', message: str = None):
    """Queue a stop task for the given experiment group, collecting logs."""
    stop_kwargs = {
        'experiment_group_id': group.id,
        'collect_logs': True,
        'message': message,
    }
    workers.send(SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP, kwargs=stop_kwargs)
Exemple #4
0
def new_operation_run_status(entity_type, entity, status):
    """Propagate an entity's new status to its operation run and pipeline.

    Looks up the operation run for the entity, records the status on it,
    asks the pipeline to re-check its own status, and — when the operation
    is done — queues the downstream operations that have not started yet.
    """
    # TODO: may be move this to the executor, and think about making it an async task
    # A CREATED status means the entity is not yet attached to the op run,
    # so there is nothing to propagate.
    if status == OperationStatuses.CREATED:
        return

    try:
        op_run = OperationRun.objects.get(entity_content_type__model=entity_type,
                                          entity_object_id=entity.id)
    except ObjectDoesNotExist:
        return

    pipeline_run = op_run.pipeline_run

    # Persist the new status on the operation run.
    op_run.status = status
    op_run.save(update_fields=['status'])

    # Let the pipeline re-evaluate whether its own status should change.
    workers.send(
        PipelinesCeleryTasks.PIPELINES_CHECK_STATUSES,
        kwargs={'pipeline_run_id': pipeline_run.id, 'status': status},
        countdown=None)

    if not op_run.is_done:
        return

    # The operation finished: unblock downstream runs that never started.
    for downstream in op_run.downstream_runs.filter(status__isnull=True):
        workers.send(
            PipelinesCeleryTasks.PIPELINES_START_OPERATION,
            kwargs={'operation_run_id': downstream.id},
            countdown=None)
Exemple #5
0
 def post(self, request, *args, **kwargs):
     """Audit and queue a stop task for each selected project experiment."""
     selected_ids = request.data.get('ids', [])
     selected = self.queryset.filter(project=self.project, id__in=selected_ids)
     for exp in selected:
         auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                        instance=exp,
                        actor_id=request.user.id,
                        actor_name=request.user.username)
         exp_group = exp.experiment_group
         stop_kwargs = {
             'project_name': self.project.unique_name,
             'project_uuid': self.project.uuid.hex,
             'experiment_name': exp.unique_name,
             'experiment_uuid': exp.uuid.hex,
             'experiment_group_name': exp_group.unique_name if exp_group else None,
             'experiment_group_uuid': exp_group.uuid.hex if exp_group else None,
             'specification': exp.content,
             'update_status': True,
             'collect_logs': True,
             'is_managed': exp.is_managed,
         }
         workers.send(SchedulerCeleryTasks.EXPERIMENTS_STOP, kwargs=stop_kwargs)
     return Response(status=status.HTTP_200_OK)
 def _handle_experiment_group_created(cls, event: 'Event') -> None:
     """Queue creation for a managed study group that has a specification."""
     data = event.data
     if not data['is_managed']:
         return
     if not (data['has_specification'] and data['is_study']):
         return
     workers.send(SchedulerCeleryTasks.EXPERIMENTS_GROUP_CREATE,
                  kwargs={'experiment_group_id': data['id']})
Exemple #7
0
def start_group_experiments(experiment_group):
    """Queue builds for pending experiments of a group.

    Returns a truthy value when the caller should retry later (more
    experiments remain or suggestions are still being scheduled), and
    None when the group was stopped for early stopping.
    """
    # Honor early stopping before scheduling anything new from this group.
    if experiment_group.should_stop_early():
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP_EXPERIMENTS,
                     kwargs={
                         'experiment_group_id': experiment_group.id,
                         'pending': True,
                         'message': 'Early stopping'
                     })
        return

    n_to_start = experiment_group.n_experiments_to_start
    if n_to_start <= 0:
        # Concurrency (or not-yet-created experiments) can leave nothing
        # startable right now; signal a retry if work remains.
        return (experiment_group.pending_experiments.exists()
                or not experiment_group.scheduled_all_suggestions())

    pending_ids = experiment_group.pending_experiments.values_list(
        'id', flat=True)[:n_to_start]
    n_pending = experiment_group.pending_experiments.count()

    for pending_id in pending_ids:
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                     kwargs={'experiment_id': pending_id})

    return (n_pending - n_to_start > 0
            or not experiment_group.scheduled_all_suggestions())
Exemple #8
0
 def perform_destroy(self, instance):
     """Archive the tensorboard job and queue its immediate deletion."""
     instance.archive()
     deletion_kwargs = {
         'tensorboard_job_id': instance.id,
         'immediate': True,
     }
     workers.send(SchedulerCeleryTasks.TENSORBOARDS_SCHEDULE_DELETION,
                  kwargs=deletion_kwargs)
Exemple #9
0
def delete_archived_projects() -> None:
    """Queue deletion for projects archived past the cleaning interval."""
    cutoff = get_date_check(days=conf.get(CLEANING_INTERVALS_ARCHIVES))
    project_ids = Project.archived.filter(
        updated_at__lte=cutoff).values_list('id', flat=True)
    for project_id in project_ids:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_PROJECT,
                     kwargs={'project_id': project_id})
Exemple #10
0
def build_jobs_schedule_deletion(build_job_id, immediate=False):
    """Archive a build job, stop it if running, and optionally queue deletion.

    When `immediate` is set, the archived job is deleted after a delayed
    countdown instead of waiting for the periodic cleanup.
    """
    build_job = get_valid_build_job(build_job_id=build_job_id,
                                    include_deleted=True)
    if not build_job:
        _logger.info(
            'Something went wrong, '
            'the BuildJob `%s` does not exist anymore.', build_job_id)
        return

    build_job.archive()

    if build_job.is_stoppable:
        project = build_job.project
        stop_kwargs = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'build_job_name': build_job.unique_name,
            'build_job_uuid': build_job.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'message': 'Build is scheduled for deletion.',
        }
        workers.send(SchedulerCeleryTasks.BUILD_JOBS_STOP, kwargs=stop_kwargs)

    if immediate:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_BUILD_JOB,
                     kwargs={'job_id': build_job_id},
                     countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
Exemple #11
0
def experiments_schedule_deletion(experiment_id, immediate=False):
    """Archive an experiment, stop it if running, and optionally queue deletion.

    When `immediate` is set, the archived experiment is deleted after a
    delayed countdown instead of waiting for the periodic cleanup.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id,
                                      include_deleted=True)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    experiment.archive()

    if experiment.is_stoppable:
        project = experiment.project
        stop_kwargs = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': None,
            'experiment_group_uuid': None,
            'specification': experiment.content,
            'update_status': True,
            'collect_logs': False,
            'message': 'Experiment is scheduled for deletion.',
            'is_managed': experiment.is_managed,
        }
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_STOP, kwargs=stop_kwargs)

    if immediate:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT,
                     kwargs={'experiment_id': experiment_id},
                     countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
Exemple #12
0
 def perform_destroy(self, instance):
     """Archive the build job and queue its immediate deletion."""
     instance.archive()
     deletion_kwargs = {
         'build_job_id': instance.id,
         'immediate': True,
     }
     workers.send(SchedulerCeleryTasks.BUILD_JOBS_SCHEDULE_DELETION,
                  kwargs=deletion_kwargs)
Exemple #13
0
def create(experiment_group):
    """Create BO suggestions for a group and queue experiment creation.

    Fetches suggestions, records a new iteration, sends the suggestions to
    the HP_BO_CREATE_EXPERIMENTS task in chunks, then queues HP_BO_START.
    Marks the group FAILED when no suggestion could be generated.
    """
    suggestions = base.get_suggestions(experiment_group=experiment_group)
    if not suggestions:
        logger.error('Experiment group `%s` could not create any suggestion.',
                     experiment_group.id)
        experiment_group.set_status(
            ExperimentGroupLifeCycle.FAILED,
            message='Experiment group could not create new suggestions.')
        return

    experiment_group.iteration_manager.create_iteration(
        num_suggestions=len(suggestions))

    def send_chunk(chunk):
        # Take the chunk as an explicit argument instead of closing over a
        # variable that the loop below rebinds (late-binding closures are
        # fragile and easy to break during maintenance).
        workers.send(HPCeleryTasks.HP_BO_CREATE_EXPERIMENTS,
                     kwargs={
                         'experiment_group_id': experiment_group.id,
                         'suggestions': chunk
                     })

    # Hoist the loop-invariant chunk size out of the loop; the original
    # re-read the config on every iteration.
    chunk_size = conf.get(GROUPS_CHUNKS)
    chunk_suggestions = []
    for suggestion in suggestions:
        chunk_suggestions.append(suggestion)
        if len(chunk_suggestions) == chunk_size:
            send_chunk(chunk_suggestions)
            chunk_suggestions = []

    # Flush the final, possibly partial, chunk.
    if chunk_suggestions:
        send_chunk(chunk_suggestions)

    workers.send(HPCeleryTasks.HP_BO_START,
                 kwargs={
                     'experiment_group_id': experiment_group.id,
                     'auto_retry': True
                 })
Exemple #14
0
def hp_bo_start(self, experiment_group_id, auto_retry=False):
    """Start BO group experiments, then queue the iteration step when done."""
    if not base.should_group_start(experiment_group_id=experiment_group_id,
                                   task=HPCeleryTasks.HP_BO_START,
                                   auto_retry=auto_retry):
        return

    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return

    if base.start_group_experiments(experiment_group=experiment_group):
        # More experiments remain to be started; reschedule if allowed.
        if auto_retry:
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    iterate_kwargs = {
        'experiment_group_id': experiment_group_id,
        'auto_retry': auto_retry,
    }
    workers.send(HPCeleryTasks.HP_BO_ITERATE,
                 kwargs=iterate_kwargs,
                 countdown=None)
Exemple #15
0
def tensorboards_schedule_deletion(tensorboard_job_id, immediate=False):
    """Archive a tensorboard job, stop it if running, optionally queue deletion.

    When `immediate` is set, the archived job is deleted after a delayed
    countdown instead of waiting for the periodic cleanup.
    """
    tensorboard = get_valid_tensorboard(tensorboard_job_id=tensorboard_job_id,
                                        include_deleted=True)
    if not tensorboard:
        return None

    tensorboard.archive()

    if tensorboard.is_stoppable:
        project = tensorboard.project
        stop_kwargs = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'tensorboard_job_name': tensorboard.unique_name,
            'tensorboard_job_uuid': tensorboard.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'is_managed': tensorboard.is_managed,
            'message': 'Tensorboard is scheduled for deletion.',
        }
        workers.send(SchedulerCeleryTasks.TENSORBOARDS_STOP, kwargs=stop_kwargs)

    if immediate:
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_TENSORBOARD_JOB,
            kwargs={'job_id': tensorboard_job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
Exemple #16
0
def hp_bo_iterate(self, experiment_group_id, auto_retry=False):
    """Advance a BO group's iteration once all its experiments are done."""
    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return

    if experiment_group.non_done_experiments.count() > 0:
        # Iterating requires every experiment to be done; try again later.
        if auto_retry:
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    # Capture the managers/config before updating the iteration, preserving
    # the original read-before-update ordering.
    iteration_config = experiment_group.iteration_config
    iteration_manager = experiment_group.iteration_manager
    search_manager = experiment_group.search_manager

    iteration_manager.update_iteration()

    if search_manager.should_reschedule(iteration=iteration_config.iteration):
        # The search wants another round of suggestions.
        workers.send(HPCeleryTasks.HP_BO_CREATE,
                     kwargs={'experiment_group_id': experiment_group_id},
                     countdown=None)
        return

    base.check_group_experiments_done(experiment_group_id,
                                      auto_retry=auto_retry)
Exemple #17
0
def projects_notebook_build(notebook_job_id):
    """Create/attach a docker build job for a notebook and update its status."""
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        return None

    if not JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job, notebook_job.last_status, JobLifeCycle.BUILDING)
        return

    # NOTE(review): this call uses `configmap_refs=` while the analogous
    # jobs build path uses `config_map_refs=` — confirm which keyword
    # dockerizer_scheduler.create_build_job actually expects.
    spec = notebook_job.specification
    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=notebook_job.user,
        project=notebook_job.project,
        config=spec.build,
        configmap_refs=spec.configmap_refs,
        secret_refs=spec.secret_refs,
        code_reference=notebook_job.code_reference)

    notebook_job.build_job = build_job
    notebook_job.save(update_fields=['build_job'])

    if image_exists:
        # Image already built: the notebook can be started immediately.
        workers.send(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
            kwargs={'notebook_job_id': notebook_job_id})
        return

    if not build_status:
        notebook_job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
        return

    # A docker build was scheduled; reflect that on the job status.
    notebook_job.set_status(JobLifeCycle.BUILDING, message='Building container')
Exemple #18
0
def projects_notebook_schedule_deletion(notebook_job_id, immediate=False):
    """Archive a notebook job, stop it if running, optionally queue deletion.

    When `immediate` is set, the archived job is deleted after a delayed
    countdown instead of waiting for the periodic cleanup.
    """
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id, include_deleted=True)
    if not notebook_job:
        return None

    notebook_job.archive()

    if notebook_job.is_stoppable:
        project = notebook_job.project
        stop_kwargs = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'notebook_job_name': notebook_job.unique_name,
            'notebook_job_uuid': notebook_job.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'is_managed': notebook_job.is_managed,
            'message': 'Notebook is scheduled for deletion.',
        }
        workers.send(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
                     kwargs=stop_kwargs)

    if immediate:
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_NOTEBOOK_JOB,
            kwargs={'job_id': notebook_job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
Exemple #19
0
 def post(self, request, *args, **kwargs):
     """Stop the project's notebook, or mark a stoppable notebook stopped."""
     if self.project.has_notebook:
         try:
             # Presumably syncs notebook code back to the repo before
             # stopping — TODO confirm what handle_code does.
             if conf.get(NOTEBOOKS_MOUNT_CODE) and self.project.has_repo:
                 self.handle_code(request)
         except FileNotFoundError:
             # Git probably was not found
             pass
         workers.send(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
                      kwargs={
                          'project_name': self.project.unique_name,
                          'project_uuid': self.project.uuid.hex,
                          'notebook_job_name':
                          self.project.notebook.unique_name,
                          'notebook_job_uuid':
                          self.project.notebook.uuid.hex,
                          'update_status': True,
                          'is_managed': self.project.notebook.is_managed,
                      })
         # NOTE(review): `countdown=1` looks like a workers.send scheduling
         # kwarg that ended up on auditor.record — confirm auditor.record
         # accepts/uses it, otherwise it may belong on the send() above.
         auditor.record(event_type=NOTEBOOK_STOPPED_TRIGGERED,
                        instance=self.project.notebook,
                        target='project',
                        actor_id=self.request.user.id,
                        actor_name=self.request.user.username,
                        countdown=1)
     elif self.notebook and self.notebook.is_stoppable:
         # No notebook attached to the project: just flip the status.
         self.notebook.set_status(status=ExperimentLifeCycle.STOPPED,
                                  message='Notebook was stopped')
     return Response(status=status.HTTP_200_OK)
def experiments_group_schedule_deletion(experiment_group_id, immediate=False):
    """Archive an experiment group, stop it if running, optionally queue deletion.

    When `immediate` is set, the archived group is deleted after a delayed
    countdown instead of waiting for the periodic cleanup.
    """
    experiment_group = get_valid_experiment_group(experiment_group_id=experiment_group_id,
                                                  include_deleted=True)
    if not experiment_group:
        # Unknown group: nothing to schedule.
        return

    experiment_group.archive()

    if experiment_group.is_stoppable:
        stop_kwargs = {
            'experiment_group_id': experiment_group_id,
            'collect_logs': False,
            'message': 'Experiment Group is scheduled for deletion.',
        }
        workers.send(
            SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP,
            kwargs=stop_kwargs,
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))

    if immediate:
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT_GROUP,
            kwargs={'group_id': experiment_group_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
Exemple #21
0
def jobs_schedule_deletion(job_id, immediate=False):
    """Archive a job, stop it if running, and optionally queue deletion.

    When `immediate` is set, the archived job is deleted after a delayed
    countdown instead of waiting for the periodic cleanup.
    """
    job = get_valid_job(job_id=job_id, include_deleted=True)
    if not job:
        return None

    job.archive()

    if job.is_stoppable:
        project = job.project
        stop_kwargs = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'job_name': job.unique_name,
            'job_uuid': job.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'is_managed': job.is_managed,
            'message': 'Job is scheduled for deletion.',
        }
        workers.send(SchedulerCeleryTasks.JOBS_STOP, kwargs=stop_kwargs)

    if immediate:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_JOB,
                     kwargs={'job_id': job_id},
                     countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
Exemple #22
0
def jobs_build(job_id):
    """Create/attach a docker build job for a job and update its status."""
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return

    build_kwargs = dict(
        user=job.user,
        project=job.project,
        config=job.specification.build,
        config_map_refs=job.config_map_refs,
        secret_refs=job.secret_refs,
        code_reference=job.code_reference)
    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        **build_kwargs)

    job.build_job = build_job
    job.save(update_fields=['build_job'])

    if image_exists:
        # Image already built: the job can be started immediately.
        workers.send(SchedulerCeleryTasks.JOBS_START,
                     kwargs={'job_id': job_id})
        return

    if not build_status:
        job.set_status(JobLifeCycle.FAILED,
                       message='Could not start build process.')
        return

    # A docker build was scheduled; reflect that on the job status.
    job.set_status(JobLifeCycle.BUILDING, message='Building container')
Exemple #23
0
 def perform_destroy(self, instance):
     """Archive the experiment group and queue its immediate deletion."""
     instance.archive()
     deletion_kwargs = {
         'experiment_group_id': instance.id,
         'immediate': True,
     }
     workers.send(SchedulerCeleryTasks.EXPERIMENTS_GROUP_SCHEDULE_DELETION,
                  kwargs=deletion_kwargs)
Exemple #24
0
    def post(self, request, *args, **kwargs):
        """Get or create a tensorboard for an experiment, group, or project."""
        project = self.project
        experiment_id = self.kwargs.get('experiment_id')
        group_id = self.kwargs.get('group_id')

        # Resolve the tensorboard target from the most specific id present.
        if experiment_id:
            experiment = get_object_or_404(Experiment,
                                           project=project,
                                           id=experiment_id)
            result = self._handle_experiment_tensorboard(project=project,
                                                         experiment=experiment)
        elif group_id:
            group = get_object_or_404(ExperimentGroup,
                                      project=project,
                                      id=group_id)
            result = self._handle_group_tensorboard(project=project,
                                                    group=group)
        else:
            result = self._handle_project_tensorboard(project=project)
        tensorboard, serializer, is_running = result

        if is_running:
            # An equivalent tensorboard is already running.
            return Response(serializer.data, status=status.HTTP_200_OK)

        if not tensorboard.is_running:
            workers.send(SchedulerCeleryTasks.TENSORBOARDS_START,
                         kwargs={'tensorboard_job_id': tensorboard.id})
        return Response(serializer.data, status=status.HTTP_201_CREATED)
def experiments_sync_jobs_statuses() -> None:
    """Queue a status check for every non-done experiment that has jobs."""
    candidates = (Experiment.objects
                  .exclude(status__status__in=ExperimentLifeCycle.DONE_STATUS)
                  .annotate(num_jobs=Count('jobs'))
                  .filter(num_jobs__gt=0))
    for experiment in candidates:
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                     kwargs={'experiment_id': experiment.id})
Exemple #26
0
 def _handle_experiment_created(cls, event: 'Event') -> None:
     """Queue a build for a managed independent or cloned experiment."""
     data = event.data
     if not data['is_managed']:
         return
     if data['has_specification'] and (data['is_independent'] or data['is_clone']):
         # Build first; the spawners pick the experiment up afterwards.
         workers.send(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                      kwargs={'experiment_id': data['id']})
Exemple #27
0
def delete_archived_experiment_groups() -> None:
    """Queue deletion of archived groups past the cleaning interval."""
    cutoff = get_date_check(days=conf.get(CLEANING_INTERVALS_ARCHIVES))
    # Groups under deleted projects are cleaned up via the projects path.
    group_ids = ExperimentGroup.archived.filter(
        project__deleted=False,
        updated_at__lte=cutoff).values_list('id', flat=True)
    for group_id in group_ids:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT_GROUP,
                     kwargs={'group_id': group_id})
Exemple #28
0
def handle_experiment_job_condition(event_object, pod_state, status, labels,
                                    container_name):
    """Record container state and forward the pod state to the status handler."""
    update_job_containers(event_object, status, container_name)
    logger.debug("Sending state to handler %s, %s", status, labels)
    handler_task = K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES
    workers.send(handler_task,
                 kwargs={'payload': pod_state},
                 countdown=None)
Exemple #29
0
    def _handle_experiment_job_new_status(cls, event: 'Event') -> None:
        """Queue an experiment status check when one of its jobs changes."""
        job = event.instance
        if not job:
            return
        # Skip experiments that are already done and jobs still in CREATED.
        if job.experiment.is_done or job.last_status == JobLifeCycle.CREATED:
            return

        workers.send(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                     kwargs={'experiment_id': job.experiment.id})
Exemple #30
0
def delete_archived_tensorboard_jobs() -> None:
    """Queue deletion of archived tensorboard jobs past the cleaning interval."""
    cutoff = get_date_check(days=conf.get(CLEANING_INTERVALS_ARCHIVES))
    # Jobs under deleted projects are cleaned up via the projects path.
    job_ids = TensorboardJob.archived.filter(
        project__deleted=False,
        updated_at__lte=cutoff).values_list('id', flat=True)
    for job_id in job_ids:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_TENSORBOARD_JOB,
                     kwargs={'job_id': job_id})