def _handle_build_job_done(cls, event: 'Event') -> None:
    """Notify the scheduler that a build job reached a done state."""
    build_job = event.instance
    if not build_job:
        return
    workers.send(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                 kwargs={'build_job_id': build_job.id})
def perform_destroy(self, instance):
    """Archive the project and schedule its immediate deletion."""
    instance.archive()
    workers.send(
        SchedulerCeleryTasks.PROJECTS_SCHEDULE_DELETION,
        kwargs={'project_id': instance.id, 'immediate': True})
def stop_experiment_group(group: 'ExperimentGroup', message: str = None):
    """Dispatch a stop task for the whole group, collecting logs."""
    task_kwargs = {
        'experiment_group_id': group.id,
        'collect_logs': True,
        'message': message,
    }
    workers.send(SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP,
                 kwargs=task_kwargs)
def new_operation_run_status(entity_type, entity, status):
    """Propagate a new entity status to its operation run and pipeline.

    TODO: may be move this to the executor, and think about making it
    an async task.
    """
    # On creation the entity is still not set on the op, so nothing to do.
    if status == OperationStatuses.CREATED:
        return
    try:
        op_run = OperationRun.objects.get(
            entity_content_type__model=entity_type,
            entity_object_id=entity.id)
    except ObjectDoesNotExist:
        return
    pipeline_run = op_run.pipeline_run
    # Persist the new status on the operation run.
    op_run.status = status
    op_run.save(update_fields=['status'])
    # Let the pipeline re-evaluate its own status.
    workers.send(
        PipelinesCeleryTasks.PIPELINES_CHECK_STATUSES,
        kwargs={'pipeline_run_id': pipeline_run.id, 'status': status},
        countdown=None)
    if op_run.is_done:
        # Notify downstream runs that their dependency finished so they
        # can be started.
        for downstream in op_run.downstream_runs.filter(status__isnull=True):
            workers.send(
                PipelinesCeleryTasks.PIPELINES_START_OPERATION,
                kwargs={'operation_run_id': downstream.id},
                countdown=None)
def post(self, request, *args, **kwargs):
    """Trigger a stop for every selected experiment of the project."""
    selected = self.queryset.filter(project=self.project,
                                    id__in=request.data.get('ids', []))
    for experiment in selected:
        auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                       instance=experiment,
                       actor_id=request.user.id,
                       actor_name=request.user.username)
        group = experiment.experiment_group
        task_kwargs = {
            'project_name': self.project.unique_name,
            'project_uuid': self.project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': group.unique_name if group else None,
            'experiment_group_uuid': group.uuid.hex if group else None,
            'specification': experiment.content,
            'update_status': True,
            'collect_logs': True,
            'is_managed': experiment.is_managed,
        }
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                     kwargs=task_kwargs)
    return Response(status=status.HTTP_200_OK)
def _handle_experiment_group_created(cls, event: 'Event') -> None:
    """Schedule experiment creation for a newly created managed study."""
    data = event.data
    # Only managed studies with a specification are created here.
    if not data['is_managed']:
        return
    if not data['has_specification'] or not data['is_study']:
        return
    workers.send(SchedulerCeleryTasks.EXPERIMENTS_GROUP_CREATE,
                 kwargs={'experiment_group_id': data['id']})
def start_group_experiments(experiment_group):
    """Start pending experiments for the group.

    Returns True when the caller should retry later (pending experiments
    remain or suggestions are still to be scheduled).
    """
    # Honor early stopping before launching anything new from this group.
    if experiment_group.should_stop_early():
        workers.send(
            SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP_EXPERIMENTS,
            kwargs={'experiment_group_id': experiment_group.id,
                    'pending': True,
                    'message': 'Early stopping'})
        return
    n_to_start = experiment_group.n_experiments_to_start
    if n_to_start <= 0:
        # Could happen due to concurrency or not-yet-created experiments.
        return (experiment_group.pending_experiments.exists() or
                not experiment_group.scheduled_all_suggestions())
    pending_ids = experiment_group.pending_experiments.values_list(
        'id', flat=True)[:n_to_start]
    n_pending = experiment_group.pending_experiments.count()
    for experiment_id in pending_ids:
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                     kwargs={'experiment_id': experiment_id})
    return (n_pending - n_to_start > 0 or
            not experiment_group.scheduled_all_suggestions())
def perform_destroy(self, instance):
    """Archive the tensorboard and schedule its immediate deletion."""
    instance.archive()
    workers.send(
        SchedulerCeleryTasks.TENSORBOARDS_SCHEDULE_DELETION,
        kwargs={'tensorboard_job_id': instance.id, 'immediate': True})
def delete_archived_projects() -> None:
    """Queue deletion of projects archived beyond the retention window."""
    cutoff = get_date_check(days=conf.get(CLEANING_INTERVALS_ARCHIVES))
    project_ids = Project.archived.filter(
        updated_at__lte=cutoff).values_list('id', flat=True)
    for project_id in project_ids:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_PROJECT,
                     kwargs={'project_id': project_id})
def build_jobs_schedule_deletion(build_job_id, immediate=False):
    """Archive a build job, stop it if running, optionally delete it later."""
    build_job = get_valid_build_job(build_job_id=build_job_id,
                                    include_deleted=True)
    if not build_job:
        _logger.info(
            'Something went wrong, '
            'the BuildJob `%s` does not exist anymore.', build_job_id)
        return
    build_job.archive()
    if build_job.is_stoppable:
        project = build_job.project
        workers.send(
            SchedulerCeleryTasks.BUILD_JOBS_STOP,
            kwargs={
                'project_name': project.unique_name,
                'project_uuid': project.uuid.hex,
                'build_job_name': build_job.unique_name,
                'build_job_uuid': build_job.uuid.hex,
                'update_status': True,
                'collect_logs': False,
                'message': 'Build is scheduled for deletion.'
            })
    if immediate:
        # Delay the hard delete so the stop task can settle first.
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_BUILD_JOB,
            kwargs={'job_id': build_job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
def experiments_schedule_deletion(experiment_id, immediate=False):
    """Archive an experiment, stop it if running, optionally delete it later."""
    experiment = get_valid_experiment(experiment_id=experiment_id,
                                      include_deleted=True)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return
    experiment.archive()
    if experiment.is_stoppable:
        project = experiment.project
        workers.send(
            SchedulerCeleryTasks.EXPERIMENTS_STOP,
            kwargs={
                'project_name': project.unique_name,
                'project_uuid': project.uuid.hex,
                'experiment_name': experiment.unique_name,
                'experiment_uuid': experiment.uuid.hex,
                'experiment_group_name': None,
                'experiment_group_uuid': None,
                'specification': experiment.content,
                'update_status': True,
                'collect_logs': False,
                'message': 'Experiment is scheduled for deletion.',
                'is_managed': experiment.is_managed,
            })
    if immediate:
        # Delay the hard delete so the stop task can settle first.
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT,
            kwargs={'experiment_id': experiment_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
def perform_destroy(self, instance):
    """Archive the build job and schedule its immediate deletion."""
    instance.archive()
    workers.send(
        SchedulerCeleryTasks.BUILD_JOBS_SCHEDULE_DELETION,
        kwargs={'build_job_id': instance.id, 'immediate': True})
def create(experiment_group):
    """Create a BO iteration from new suggestions and dispatch them in chunks."""
    suggestions = base.get_suggestions(experiment_group=experiment_group)
    if not suggestions:
        logger.error('Experiment group `%s` could not create any suggestion.',
                     experiment_group.id)
        experiment_group.set_status(
            ExperimentGroupLifeCycle.FAILED,
            message='Experiment group could not create new suggestions.')
        return
    experiment_group.iteration_manager.create_iteration(
        num_suggestions=len(suggestions))

    def _send_batch(batch):
        # Each batch becomes one experiment-creation task.
        workers.send(HPCeleryTasks.HP_BO_CREATE_EXPERIMENTS,
                     kwargs={'experiment_group_id': experiment_group.id,
                             'suggestions': batch})

    batch = []
    for suggestion in suggestions:
        batch.append(suggestion)
        if len(batch) == conf.get(GROUPS_CHUNKS):
            _send_batch(batch)
            batch = []
    if batch:
        # Flush the last partial batch.
        _send_batch(batch)
    workers.send(HPCeleryTasks.HP_BO_START,
                 kwargs={'experiment_group_id': experiment_group.id,
                         'auto_retry': True})
def hp_bo_start(self, experiment_group_id, auto_retry=False):
    """Start experiments for a BO group, then move on to the iterate step."""
    if not base.should_group_start(experiment_group_id=experiment_group_id,
                                   task=HPCeleryTasks.HP_BO_START,
                                   auto_retry=auto_retry):
        return
    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return
    if base.start_group_experiments(experiment_group=experiment_group):
        # More work remains; reschedule this task if allowed.
        if auto_retry:
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return
    workers.send(HPCeleryTasks.HP_BO_ITERATE,
                 kwargs={'experiment_group_id': experiment_group_id,
                         'auto_retry': auto_retry},
                 countdown=None)
def tensorboards_schedule_deletion(tensorboard_job_id, immediate=False):
    """Archive a tensorboard, stop it if running, optionally delete it later."""
    tensorboard = get_valid_tensorboard(tensorboard_job_id=tensorboard_job_id,
                                        include_deleted=True)
    if not tensorboard:
        return None
    tensorboard.archive()
    if tensorboard.is_stoppable:
        project = tensorboard.project
        workers.send(
            SchedulerCeleryTasks.TENSORBOARDS_STOP,
            kwargs={
                'project_name': project.unique_name,
                'project_uuid': project.uuid.hex,
                'tensorboard_job_name': tensorboard.unique_name,
                'tensorboard_job_uuid': tensorboard.uuid.hex,
                'update_status': True,
                'collect_logs': False,
                'is_managed': tensorboard.is_managed,
                'message': 'Tensorboard is scheduled for deletion.'
            })
    if immediate:
        # Delay the hard delete so the stop task can settle first.
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_TENSORBOARD_JOB,
            kwargs={'job_id': tensorboard_job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
def hp_bo_iterate(self, experiment_group_id, auto_retry=False):
    """Advance the BO iteration once all the group's experiments are done."""
    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return
    if experiment_group.non_done_experiments.count() > 0:
        # All experiments must be done before iterating.
        if auto_retry:
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return
    iteration_config = experiment_group.iteration_config
    iteration_manager = experiment_group.iteration_manager
    search_manager = experiment_group.search_manager
    iteration_manager.update_iteration()
    if search_manager.should_reschedule(iteration=iteration_config.iteration):
        # Another iteration is needed: create a fresh round of suggestions.
        workers.send(HPCeleryTasks.HP_BO_CREATE,
                     kwargs={'experiment_group_id': experiment_group_id},
                     countdown=None)
        return
    base.check_group_experiments_done(experiment_group_id,
                                      auto_retry=auto_retry)
def projects_notebook_build(notebook_job_id):
    """Create (or reuse) the docker build for a notebook job."""
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        return None
    if not JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job, notebook_job.last_status,
                     JobLifeCycle.BUILDING)
        return
    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=notebook_job.user,
        project=notebook_job.project,
        config=notebook_job.specification.build,
        configmap_refs=notebook_job.specification.configmap_refs,
        secret_refs=notebook_job.specification.secret_refs,
        code_reference=notebook_job.code_reference)
    notebook_job.build_job = build_job
    notebook_job.save(update_fields=['build_job'])
    if image_exists:
        # Image already available: skip the build and start right away.
        workers.send(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
            kwargs={'notebook_job_id': notebook_job_id})
        return
    if not build_status:
        notebook_job.set_status(JobLifeCycle.FAILED,
                                message='Could not start build process.')
        return
    # Reflect that the docker image is being built.
    notebook_job.set_status(JobLifeCycle.BUILDING,
                            message='Building container')
def projects_notebook_schedule_deletion(notebook_job_id, immediate=False):
    """Archive a notebook, stop it if running, optionally delete it later."""
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id,
                                      include_deleted=True)
    if not notebook_job:
        return None
    notebook_job.archive()
    if notebook_job.is_stoppable:
        project = notebook_job.project
        workers.send(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
            kwargs={
                'project_name': project.unique_name,
                'project_uuid': project.uuid.hex,
                'notebook_job_name': notebook_job.unique_name,
                'notebook_job_uuid': notebook_job.uuid.hex,
                'update_status': True,
                'collect_logs': False,
                'is_managed': notebook_job.is_managed,
                'message': 'Notebook is scheduled for deletion.'
            })
    if immediate:
        # Delay the hard delete so the stop task can settle first.
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_NOTEBOOK_JOB,
            kwargs={'job_id': notebook_job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
def post(self, request, *args, **kwargs):
    """Stop the project's notebook, committing mounted code first.

    Returns an empty 200 response in all cases.
    """
    if self.project.has_notebook:
        try:
            # Persist the notebook's code changes before stopping it.
            if conf.get(NOTEBOOKS_MOUNT_CODE) and self.project.has_repo:
                self.handle_code(request)
        except FileNotFoundError:
            # Git probably was not found
            pass
        # BUG FIX: `countdown=1` was previously passed to `auditor.record`,
        # where it is meaningless audit payload; it is a celery scheduling
        # option and belongs on `workers.send` so the stop task fires
        # shortly after this request returns.
        workers.send(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
                     kwargs={
                         'project_name': self.project.unique_name,
                         'project_uuid': self.project.uuid.hex,
                         'notebook_job_name': self.project.notebook.unique_name,
                         'notebook_job_uuid': self.project.notebook.uuid.hex,
                         'update_status': True,
                         'is_managed': self.project.notebook.is_managed,
                     },
                     countdown=1)
        auditor.record(event_type=NOTEBOOK_STOPPED_TRIGGERED,
                       instance=self.project.notebook,
                       target='project',
                       actor_id=self.request.user.id,
                       actor_name=self.request.user.username)
    elif self.notebook and self.notebook.is_stoppable:
        # No running notebook on the project: just mark this one stopped.
        self.notebook.set_status(status=ExperimentLifeCycle.STOPPED,
                                 message='Notebook was stopped')
    return Response(status=status.HTTP_200_OK)
def experiments_group_schedule_deletion(experiment_group_id, immediate=False):
    """Archive a group, stop it if running, optionally delete it later."""
    experiment_group = get_valid_experiment_group(
        experiment_group_id=experiment_group_id, include_deleted=True)
    if not experiment_group:
        # No need to check this group
        return
    experiment_group.archive()
    if experiment_group.is_stoppable:
        workers.send(
            SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP,
            kwargs={
                'experiment_group_id': experiment_group_id,
                'collect_logs': False,
                'message': 'Experiment Group is scheduled for deletion.'
            },
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))
    if immediate:
        # Delay the hard delete so the stop task can settle first.
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT_GROUP,
            kwargs={'group_id': experiment_group_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
def jobs_schedule_deletion(job_id, immediate=False):
    """Archive a job, stop it if running, optionally delete it later."""
    job = get_valid_job(job_id=job_id, include_deleted=True)
    if not job:
        return None
    job.archive()
    if job.is_stoppable:
        project = job.project
        workers.send(
            SchedulerCeleryTasks.JOBS_STOP,
            kwargs={
                'project_name': project.unique_name,
                'project_uuid': project.uuid.hex,
                'job_name': job.unique_name,
                'job_uuid': job.uuid.hex,
                'update_status': True,
                'collect_logs': False,
                'is_managed': job.is_managed,
                'message': 'Job is scheduled for deletion.'
            })
    if immediate:
        # Delay the hard delete so the stop task can settle first.
        workers.send(
            SchedulerCeleryTasks.DELETE_ARCHIVED_JOB,
            kwargs={'job_id': job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
def jobs_build(job_id):
    """Create (or reuse) the docker build for a job."""
    job = get_valid_job(job_id=job_id)
    if not job:
        return None
    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return
    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=job.user,
        project=job.project,
        config=job.specification.build,
        config_map_refs=job.config_map_refs,
        secret_refs=job.secret_refs,
        code_reference=job.code_reference)
    job.build_job = build_job
    job.save(update_fields=['build_job'])
    if image_exists:
        # Image already available: skip the build and start right away.
        workers.send(SchedulerCeleryTasks.JOBS_START,
                     kwargs={'job_id': job_id})
        return
    if not build_status:
        job.set_status(JobLifeCycle.FAILED,
                       message='Could not start build process.')
        return
    # Reflect that the docker image is being built.
    job.set_status(JobLifeCycle.BUILDING, message='Building container')
def perform_destroy(self, instance):
    """Archive the experiment group and schedule its immediate deletion."""
    instance.archive()
    workers.send(
        SchedulerCeleryTasks.EXPERIMENTS_GROUP_SCHEDULE_DELETION,
        kwargs={'experiment_group_id': instance.id, 'immediate': True})
def post(self, request, *args, **kwargs):
    """Create or reuse a tensorboard for an experiment, group, or project."""
    project = self.project
    experiment_id = self.kwargs.get('experiment_id')
    group_id = self.kwargs.get('group_id')
    if experiment_id:
        experiment = get_object_or_404(Experiment, project=project,
                                       id=experiment_id)
        tensorboard, serializer, is_running = self._handle_experiment_tensorboard(
            project=project, experiment=experiment)
    elif group_id:
        group = get_object_or_404(ExperimentGroup, project=project,
                                  id=group_id)
        tensorboard, serializer, is_running = self._handle_group_tensorboard(
            project=project, group=group)
    else:
        tensorboard, serializer, is_running = self._handle_project_tensorboard(
            project=project)
    # An already-running tensorboard is returned as-is.
    if is_running:
        return Response(serializer.data, status=status.HTTP_200_OK)
    if not tensorboard.is_running:
        workers.send(SchedulerCeleryTasks.TENSORBOARDS_START,
                     kwargs={'tensorboard_job_id': tensorboard.id})
    return Response(serializer.data, status=status.HTTP_201_CREATED)
def experiments_sync_jobs_statuses() -> None:
    """Trigger a status check for non-done experiments that have jobs."""
    candidates = Experiment.objects.exclude(
        status__status__in=ExperimentLifeCycle.DONE_STATUS)
    candidates = candidates.annotate(num_jobs=Count('jobs')).filter(
        num_jobs__gt=0)
    for experiment in candidates:
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                     kwargs={'experiment_id': experiment.id})
def _handle_experiment_created(cls, event: 'Event') -> None:
    """Schedule the build of a newly created managed experiment."""
    data = event.data
    if not data['is_managed']:
        return
    # Only independent experiments and clones with a spec are built here;
    # group members are scheduled by the group workflow.
    if data['has_specification'] and (data['is_independent'] or
                                      data['is_clone']):
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                     kwargs={'experiment_id': data['id']})
def delete_archived_experiment_groups() -> None:
    """Queue deletion of groups archived beyond the retention window."""
    cutoff = get_date_check(days=conf.get(CLEANING_INTERVALS_ARCHIVES))
    group_ids = ExperimentGroup.archived.filter(
        # Skip groups the archived-project cleanup will remove anyway.
        project__deleted=False,
        updated_at__lte=cutoff).values_list('id', flat=True)
    for group_id in group_ids:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT_GROUP,
                     kwargs={'group_id': group_id})
def handle_experiment_job_condition(event_object, pod_state, status, labels,
                                    container_name):
    """Record container state and forward the pod state to the status handler."""
    update_job_containers(event_object, status, container_name)
    logger.debug("Sending state to handler %s, %s", status, labels)
    # Handle experiment job statuses asynchronously.
    workers.send(
        K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
        kwargs={'payload': pod_state},
        countdown=None)
def _handle_experiment_job_new_status(cls, event: 'Event') -> None:
    """Re-check the experiment's status when one of its jobs changes."""
    job = event.instance
    # Skip missing instances, finished experiments, and freshly created jobs.
    if (not job or
            job.experiment.is_done or
            job.last_status == JobLifeCycle.CREATED):
        return
    workers.send(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                 kwargs={'experiment_id': job.experiment.id})
def delete_archived_tensorboard_jobs() -> None:
    """Queue deletion of tensorboards archived beyond the retention window."""
    cutoff = get_date_check(days=conf.get(CLEANING_INTERVALS_ARCHIVES))
    job_ids = TensorboardJob.archived.filter(
        # Skip jobs the archived-project cleanup will remove anyway.
        project__deleted=False,
        updated_at__lte=cutoff).values_list('id', flat=True)
    for job_id in job_ids:
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_TENSORBOARD_JOB,
                     kwargs={'job_id': job_id})