def new_pipeline_run_status(sender, **kwargs):
    """Sync a pipeline run with a newly received status record.

    The status record is read from ``kwargs['instance']``. It is stored on the
    related pipeline run, the run's start/finish timestamps are refreshed via
    ``set_started_at``/``set_finished_at`` and the three fields are saved.
    When the run then reports ``stopped`` and/or ``skipped``, the matching
    Celery task is sent so its operations get notified.

    ``sender`` is accepted but not used (presumably a signal receiver
    signature -- confirm against the registration site).
    """
    status_record = kwargs['instance']
    run = status_record.pipeline_run

    # Point the run at its latest status and refresh its timestamps.
    run.status = status_record
    set_started_at(
        instance=run,
        status=status_record.status,
        starting_statuses=[PipelineStatuses.RUNNING])
    set_finished_at(
        instance=run,
        status=status_record.status,
        is_done=PipelineStatuses.is_done)
    run.save(update_fields=['status', 'started_at', 'finished_at'])

    # Operations must hear about the change when the dag run is stopped or skipped.
    if run.stopped:
        stop_payload = {
            'pipeline_run_id': run.id,
            'message': 'Pipeline run was stopped',
        }
        celery_app.send_task(PipelinesCeleryTasks.PIPELINES_STOP_OPERATIONS,
                             kwargs=stop_payload)

    if run.skipped:
        skip_payload = {
            'pipeline_run_id': run.id,
            'message': 'Pipeline run was skipped',
        }
        celery_app.send_task(PipelinesCeleryTasks.PIPELINES_SKIP_OPERATIONS,
                             kwargs=skip_payload)
def handle_new_experiment_status(sender, **kwargs):
    """Schedule a stop for an experiment once it reports FAILED or SUCCEEDED.

    The status record comes from ``kwargs['instance']``. Nothing happens when
    the experiment has no specification, when the status is neither FAILED nor
    SUCCEEDED, or when the experiment has no jobs. Otherwise an
    ``EXPERIMENTS_STOP`` task is sent with ``update_status=False`` and
    ``collect_logs=True``, delayed by ``RedisTTL.get_for_experiment``.

    ``sender`` is accepted but not used.
    """
    status_record = kwargs['instance']
    experiment = status_record.experiment

    if not experiment.specification:
        return

    terminal_statuses = (ExperimentLifeCycle.FAILED, ExperimentLifeCycle.SUCCEEDED)
    if status_record.status not in terminal_statuses:
        return
    if experiment.jobs.count() <= 0:
        return

    _logger.debug(
        'One of the workers failed or Master for experiment `%s` is done, '
        'send signal to other workers to stop.', experiment.unique_name)

    # Other jobs of this experiment may still be running, so schedule a stop.
    group = experiment.experiment_group
    stop_payload = {
        'project_name': experiment.project.unique_name,
        'project_uuid': experiment.project.uuid.hex,
        'experiment_name': experiment.unique_name,
        'experiment_uuid': experiment.uuid.hex,
        'experiment_group_name': group.unique_name if group else None,
        'experiment_group_uuid': group.uuid.hex if group else None,
        'specification': experiment.config,
        'update_status': False,
        'collect_logs': True,
    }
    delay = RedisTTL.get_for_experiment(experiment_id=experiment.id)
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                         kwargs=stop_payload,
                         countdown=delay)
def jobs_build(job_id):
    """Create the build job for a job and move the job to its next step.

    Returns ``None`` in every case. Nothing is done when the job cannot be
    loaded or cannot transition to ``BUILDING``. After the build job is
    created and stored on the job, exactly one of three things happens:

    * the image already exists -> a ``JOBS_START`` task is sent;
    * the build could not be started -> the job is marked ``FAILED``;
    * otherwise -> the job is marked ``BUILDING``.
    """
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    can_build = JobLifeCycle.can_transition(status_from=job.last_status,
                                            status_to=JobLifeCycle.BUILDING)
    if not can_build:
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return None

    spec = job.specification
    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=job.user,
        project=job.project,
        config=spec.build,
        configmap_refs=spec.configmap_refs,
        secret_refs=spec.secret_refs,
        code_reference=job.code_reference)

    job.build_job = build_job
    job.save(update_fields=['build_job'])

    if image_exists:
        # Nothing to build: the job can be started right away.
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_START,
            kwargs={'job_id': job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))
    elif not build_status:
        job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
    else:
        # Reflect that the docker image is being built.
        job.set_status(JobLifeCycle.BUILDING, message='Building container')
    return None
def start_new_experiment(sender, **kwargs):
    """Send the build task for an independent or cloned experiment.

    The experiment is read from ``kwargs['instance']``; experiments that are
    neither independent nor clones are ignored. ``sender`` is not used.
    """
    experiment = kwargs['instance']
    if not (experiment.is_independent or experiment.is_clone):
        return

    # Build first; the build task is sent with a one second countdown.
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_BUILD,
        kwargs={'experiment_id': experiment.id},
        countdown=1)
def post(self, request, *args, **kwargs):
    """Stop the project's notebook and answer with HTTP 200.

    When the project has a notebook, code handling is attempted first (only
    if ``MOUNT_CODE_IN_NOTEBOOKS`` is set and the project has a repo), then a
    ``PROJECTS_NOTEBOOK_STOP`` task is sent and the trigger is audited.
    Otherwise, a notebook that still exists and is stoppable is marked
    ``STOPPED`` directly.
    """
    project = self.project

    if not project.has_notebook:
        notebook = project.notebook
        if notebook and notebook.is_stoppable:
            notebook.set_status(
                status=ExperimentLifeCycle.STOPPED,
                message='Notebook was stopped')
        return Response(status=status.HTTP_200_OK)

    try:
        mount_code = conf.get('MOUNT_CODE_IN_NOTEBOOKS')
        if mount_code and project.has_repo:
            self.handle_code(request)
    except FileNotFoundError:
        # Git probably was not found
        pass

    notebook = project.notebook
    stop_payload = {
        'project_name': project.unique_name,
        'project_uuid': project.uuid.hex,
        'notebook_job_name': notebook.unique_name,
        'notebook_job_uuid': notebook.uuid.hex,
        'update_status': True,
    }
    celery_app.send_task(
        SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
        kwargs=stop_payload,
        countdown=conf.get('GLOBAL_COUNTDOWN'))

    actor = self.request.user
    auditor.record(event_type=NOTEBOOK_STOPPED_TRIGGERED,
                   instance=notebook,
                   target='project',
                   actor_id=actor.id,
                   actor_name=actor.username,
                   countdown=1)
    return Response(status=status.HTTP_200_OK)
def projects_notebook_build(notebook_job_id):
    """Create the build job for a notebook and move it to its next step.

    Returns ``None`` in every case. Nothing is done when the notebook cannot
    be loaded or cannot transition to ``BUILDING``. After the build job is
    created and stored on the notebook, exactly one of three things happens:

    * the image already exists -> ``PROJECTS_NOTEBOOK_START`` is sent;
    * the build could not be started -> the notebook is marked ``FAILED``;
    * otherwise -> the notebook is marked ``BUILDING``.
    """
    notebook = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook:
        return None

    can_build = JobLifeCycle.can_transition(status_from=notebook.last_status,
                                            status_to=JobLifeCycle.BUILDING)
    if not can_build:
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook, notebook.last_status, JobLifeCycle.BUILDING)
        return None

    spec = notebook.specification
    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=notebook.user,
        project=notebook.project,
        config=spec.build,
        configmap_refs=spec.configmap_refs,
        secret_refs=spec.secret_refs,
        code_reference=notebook.code_reference)

    notebook.build_job = build_job
    notebook.save(update_fields=['build_job'])

    if image_exists:
        # Nothing to build: the notebook can be started right away.
        celery_app.send_task(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
            kwargs={'notebook_job_id': notebook_job_id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))
    elif not build_status:
        notebook.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
    else:
        # Reflect that the docker image is being built.
        notebook.set_status(JobLifeCycle.BUILDING, message='Building container')
    return None
def build(self, nocache: bool = False, memory_limit: Any = None) -> bool:
    """Render the Dockerfile, publish it, and run the docker build.

    Args:
        nocache: forwarded to ``self.docker.build`` as ``nocache``.
        memory_limit: when truthy, added to the container limits as
            ``memory``.

    Returns:
        Whatever ``self._handle_log_stream`` returns for the build stream.
    """
    _logger.debug('Starting build for `%s`', self.repo_path)

    # NOTE: checking out the commit matching ``self.image_tag`` before the
    # build is currently disabled.

    # memswap=-1 disables memory swap while building.
    container_limits = {'memswap': -1}
    if memory_limit:
        container_limits['memory'] = memory_limit

    # Write the Dockerfile and publish the rendered text through a
    # BUILD_JOBS_SET_DOCKERFILE task.
    with open(self.dockerfile_path, 'w') as dockerfile_handle:
        dockerfile_text = self.render()
        celery_app.send_task(
            SchedulerCeleryTasks.BUILD_JOBS_SET_DOCKERFILE,
            kwargs={
                'build_job_uuid': self.job_uuid,
                'dockerfile': dockerfile_text,
            })
        dockerfile_handle.write(dockerfile_text)

    log_stream = self.docker.build(
        path=self.build_path,
        tag=self.get_tagged_image(),
        forcerm=True,
        rm=True,
        pull=True,
        nocache=nocache,
        container_limits=container_limits)
    return self._handle_log_stream(stream=log_stream)
def hp_hyperband_iterate(self, experiment_group_id):
    """Advance a hyperband experiment group by one iteration step.

    Does nothing when the running group cannot be loaded. While the group
    still has non-done experiments, ``self.retry`` is called instead of
    iterating. Otherwise the iteration is updated and then, in order:

    * if the search manager asks to reschedule -> ``HP_HYPERBAND_CREATE``;
    * if it asks to reduce configs -> configs are reduced and
      ``HP_HYPERBAND_START`` is sent;
    * else -> ``base.check_group_experiments_finished`` is called.

    ``self`` is only used for ``self.retry`` (presumably a bound Celery
    task -- confirm against the task decorator).
    """
    group = get_running_experiment_group(experiment_group_id=experiment_group_id)
    if not group:
        return

    if group.non_done_experiments.count() > 0:
        # All experiments must be done first, so schedule another attempt.
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    iteration_config = group.iteration_config
    iteration_manager = group.iteration_manager
    search_manager = group.search_manager
    task_kwargs = {'experiment_group_id': experiment_group_id}

    iteration_manager.update_iteration()

    reschedule = search_manager.should_reschedule(
        iteration=iteration_config.iteration,
        bracket_iteration=iteration_config.bracket_iteration)
    if reschedule:
        celery_app.send_task(HPCeleryTasks.HP_HYPERBAND_CREATE, kwargs=task_kwargs)
        return

    reduce_configs = search_manager.should_reduce_configs(
        iteration=iteration_config.iteration,
        bracket_iteration=iteration_config.bracket_iteration)
    if reduce_configs:
        iteration_manager.reduce_configs()
        celery_app.send_task(HPCeleryTasks.HP_HYPERBAND_START, kwargs=task_kwargs)
        return

    base.check_group_experiments_finished(experiment_group_id)
def post(self, request, *args, **kwargs):
    """Audit and trigger the stop of a single experiment; answer HTTP 200.

    The experiment comes from ``self.get_object()``. The trigger is audited
    first, then an ``EXPERIMENTS_STOP`` task is sent with
    ``update_status=True``.
    """
    experiment = self.get_object()
    auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                   instance=experiment,
                   actor_id=request.user.id,
                   actor_name=request.user.username)

    group = experiment.experiment_group
    stop_payload = {
        'project_name': self.project.unique_name,
        'project_uuid': self.project.uuid.hex,
        'experiment_name': experiment.unique_name,
        'experiment_uuid': experiment.uuid.hex,
        # Group fields are None for experiments without a group.
        'experiment_group_name': group.unique_name if group else None,
        'experiment_group_uuid': group.uuid.hex if group else None,
        'specification': experiment.config,
        'update_status': True,
    }
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP, kwargs=stop_payload)
    return Response(status=status.HTTP_200_OK)
def heartbeat_experiments() -> None:
    """Send a heartbeat-check task for every experiment in a heartbeat status.

    Experiments are selected by ``ExperimentLifeCycle.HEARTBEAT_STATUS``; only
    their ids are fetched, and one ``EXPERIMENTS_CHECK_HEARTBEAT`` task is
    sent per id.
    """
    monitored = Experiment.objects.filter(
        status__status__in=ExperimentLifeCycle.HEARTBEAT_STATUS)
    experiment_ids = monitored.values_list('id', flat=True)

    for experiment_id in experiment_ids:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_CHECK_HEARTBEAT,
            kwargs={'experiment_id': experiment_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))
def heartbeat_builds() -> None:
    """Send a heartbeat-check task for every build job in a heartbeat status.

    Build jobs are selected by ``JobLifeCycle.HEARTBEAT_STATUS``; only their
    ids are fetched, and one ``BUILD_JOBS_CHECK_HEARTBEAT`` task is sent per
    id.
    """
    monitored = BuildJob.objects.filter(
        status__status__in=JobLifeCycle.HEARTBEAT_STATUS)
    build_job_ids = monitored.values_list('id', flat=True)

    for build_job_id in build_job_ids:
        celery_app.send_task(
            SchedulerCeleryTasks.BUILD_JOBS_CHECK_HEARTBEAT,
            kwargs={'build_job_id': build_job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))
def _handle_experiment_cleaned_triggered(cls, event: 'Event') -> None:
    """Send a stop task for a managed, stoppable experiment that has jobs.

    Nothing is sent when the event carries no instance, or when the instance
    is not managed, has no specification, is not stoppable, or has no jobs.
    Otherwise an ``EXPERIMENTS_STOP`` task is sent with
    ``update_status=False`` and ``collect_logs=False``.

    The missing-instance check runs before any attribute of the instance is
    read, so an event without an instance is ignored instead of raising
    ``AttributeError``.

    Args:
        event: object exposing the experiment as ``event.instance``.
    """
    instance = event.instance

    # Guard against a missing instance before touching its attributes.
    if not instance or not instance.is_managed:
        return
    if not instance.has_specification or not instance.is_stoppable:
        return
    if instance.jobs.count() == 0:
        return

    # Imported locally (as the original did); only needed for the except
    # clause below, so it is deferred until past the guards.
    from db.models.experiment_groups import ExperimentGroup

    try:
        group = instance.experiment_group
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_STOP,
            kwargs={
                'project_name': instance.project.unique_name,
                'project_uuid': instance.project.uuid.hex,
                'experiment_name': instance.unique_name,
                'experiment_uuid': instance.uuid.hex,
                'experiment_group_name': group.unique_name if group else None,
                'experiment_group_uuid': group.uuid.hex if group else None,
                'specification': instance.content,
                'update_status': False,
                'collect_logs': False,
                'is_managed': instance.is_managed,
            },
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))
    except ExperimentGroup.DoesNotExist:
        # The experiment was already stopped when the group was deleted
        pass
def jobs_schedule_deletion(job_id, immediate=False):
    """Archive a job, stop it if it is running, and optionally delete it.

    Args:
        job_id: id of the job; deleted jobs are included in the lookup.
        immediate: when true, a ``DELETE_ARCHIVED_JOB`` task is also sent.

    Returns:
        ``None`` in every case; nothing is done when the job is not found.
    """
    job = get_valid_job(job_id=job_id, include_deleted=True)
    if not job:
        return None

    job.archive()

    if job.is_running:
        project = job.project
        stop_payload = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'job_name': job.unique_name,
            'job_uuid': job.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'message': 'Job is scheduled for deletion.',
        }
        celery_app.send_task(SchedulerCeleryTasks.JOBS_STOP, kwargs=stop_payload)

    if immediate:
        celery_app.send_task(SchedulerCeleryTasks.DELETE_ARCHIVED_JOB,
                             kwargs={'job_id': job_id})
    return None
def new_operation_run_status(sender, **kwargs):
    """Sync an operation run with a new status record and notify dependants.

    The status record is read from ``kwargs['instance']``. It is stored on
    the operation run, whose start/finish timestamps are refreshed and saved.
    For any status other than ``CREATED`` a ``PIPELINES_CHECK_STATUSES`` task
    is sent for the owning pipeline run. When the operation run is done, a
    ``PIPELINES_START_OPERATION`` task is sent for each downstream run still
    in ``CREATED`` status.

    ``sender`` is accepted but not used.
    """
    status_record = kwargs['instance']
    op_run = status_record.operation_run
    pipeline_run = op_run.pipeline_run

    # Point the operation run at its latest status and refresh its timestamps.
    op_run.status = status_record
    set_started_at(
        instance=op_run,
        status=status_record.status,
        starting_statuses=[PipelineStatuses.RUNNING])
    set_finished_at(
        instance=op_run,
        status=status_record.status,
        is_done=PipelineStatuses.is_done)
    op_run.save(update_fields=['status', 'started_at', 'finished_at'])

    # A freshly created run has nothing to propagate.
    if status_record.status == OperationStatuses.CREATED:
        return

    # The pipeline run's own status may need updating.
    check_payload = {
        'pipeline_run_id': pipeline_run.id,
        'status': status_record.status,
        'message': status_record.message,
    }
    celery_app.send_task(PipelinesCeleryTasks.PIPELINES_CHECK_STATUSES,
                         kwargs=check_payload)

    if not op_run.is_done:
        return

    # Downstream runs that are still waiting can now be started.
    waiting_runs = op_run.downstream_runs.filter(
        status__status=OperationStatuses.CREATED)
    for downstream in waiting_runs:
        celery_app.send_task(
            PipelinesCeleryTasks.PIPELINES_START_OPERATION,
            kwargs={'operation_run_id': downstream.id})
def send_chunk():
    """Send one chunk of suggestions to the grid-search creation task.

    Reads ``experiment_group`` and ``chunk_suggestions`` from the enclosing
    scope, which is not visible here.
    """
    chunk_payload = {
        'experiment_group_id': experiment_group.id,
        'suggestions': chunk_suggestions,
    }
    celery_app.send_task(
        HPCeleryTasks.HP_GRID_SEARCH_CREATE_EXPERIMENTS,
        kwargs=chunk_payload,
        countdown=1)
def post(self, request, *args, **kwargs):
    """Audit and trigger the stop of several experiments; answer HTTP 200.

    The experiments are those of ``self.project`` whose id appears in the
    request's ``ids`` list (an absent list selects none). For each one the
    trigger is audited and an ``EXPERIMENTS_STOP`` task is sent with
    ``update_status=True``.
    """
    requested_ids = request.data.get('ids', [])
    selected = self.queryset.filter(project=self.project, id__in=requested_ids)

    for experiment in selected:
        auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                       instance=experiment,
                       actor_id=request.user.id,
                       actor_name=request.user.username)

        group = experiment.experiment_group
        stop_payload = {
            'project_name': self.project.unique_name,
            'project_uuid': self.project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': group.unique_name if group else None,
            'experiment_group_uuid': group.uuid.hex if group else None,
            'specification': experiment.config,
            'update_status': True,
        }
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                             kwargs=stop_payload)

    return Response(status=status.HTTP_200_OK)
def post(self, request, *args, **kwargs):
    """Trigger the stop of a tensorboard, if there is one; answer HTTP 200.

    When ``self.has_tensorboard`` is true a ``TENSORBOARDS_STOP`` task is
    sent and the trigger is audited, with a target derived from the
    ``experiment_id``/``group_id`` URL kwargs.
    """
    project = self.project
    tensorboard = self.tensorboard
    has_tensorboard = self.has_tensorboard

    if not has_tensorboard:
        return Response(status=status.HTTP_200_OK)

    stop_payload = {
        'project_name': project.unique_name,
        'project_uuid': project.uuid.hex,
        'tensorboard_job_name': tensorboard.unique_name,
        'tensorboard_job_uuid': tensorboard.uuid.hex,
        'update_status': True,
    }
    celery_app.send_task(
        SchedulerCeleryTasks.TENSORBOARDS_STOP,
        kwargs=stop_payload,
        countdown=conf.get('GLOBAL_COUNTDOWN'))

    audit_target = get_target(experiment=self.kwargs.get('experiment_id'),
                              group=self.kwargs.get('group_id'))
    auditor.record(event_type=TENSORBOARD_STOPPED_TRIGGERED,
                   instance=tensorboard,
                   target=audit_target,
                   actor_id=self.request.user.id,
                   actor_name=self.request.user.username)
    return Response(status=status.HTTP_200_OK)
def hp_bo_iterate(self, experiment_group_id, auto_retry=False):
    """Advance a BO experiment group by one iteration step.

    Does nothing when the running group cannot be loaded. While the group
    still has non-done experiments the function returns without iterating,
    calling ``self.retry`` first when ``auto_retry`` is set. Otherwise the
    iteration is updated; if the search manager asks to reschedule,
    ``HP_BO_CREATE`` is sent, else ``base.check_group_experiments_finished``
    is called.

    ``self`` is only used for ``self.retry`` (presumably a bound Celery
    task -- confirm against the task decorator).
    """
    group = get_running_experiment_group(experiment_group_id=experiment_group_id)
    if not group:
        return

    still_running = group.non_done_experiments.count() > 0
    if still_running:
        if auto_retry:
            # All experiments must be done first, so schedule another attempt.
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    iteration_config = group.iteration_config
    iteration_manager = group.iteration_manager
    search_manager = group.search_manager

    iteration_manager.update_iteration()

    if not search_manager.should_reschedule(iteration=iteration_config.iteration):
        base.check_group_experiments_finished(experiment_group_id,
                                              auto_retry=auto_retry)
        return

    celery_app.send_task(
        HPCeleryTasks.HP_BO_CREATE,
        kwargs={'experiment_group_id': experiment_group_id})
def projects_notebook_schedule_deletion(notebook_job_id, immediate=False):
    """Archive a notebook job, stop it if stoppable, and optionally delete it.

    Args:
        notebook_job_id: id of the notebook job; deleted ones are included in
            the lookup.
        immediate: when true, a ``DELETE_ARCHIVED_NOTEBOOK_JOB`` task is also
            sent, using the ``GLOBAL_COUNTDOWN_DELAYED`` countdown.

    Returns:
        ``None`` in every case; nothing is done when the notebook is not found.
    """
    notebook = get_valid_notebook(notebook_job_id=notebook_job_id,
                                  include_deleted=True)
    if not notebook:
        return None

    notebook.archive()

    if notebook.is_stoppable:
        project = notebook.project
        stop_payload = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'notebook_job_name': notebook.unique_name,
            'notebook_job_uuid': notebook.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'is_managed': notebook.is_managed,
            'message': 'Notebook is scheduled for deletion.',
        }
        celery_app.send_task(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
            kwargs=stop_payload,
            countdown=conf.get('GLOBAL_COUNTDOWN'))

    if immediate:
        celery_app.send_task(
            SchedulerCeleryTasks.DELETE_ARCHIVED_NOTEBOOK_JOB,
            kwargs={'job_id': notebook_job_id},
            countdown=conf.get('GLOBAL_COUNTDOWN_DELAYED'))
    return None
def start_group_experiments(experiment_group):
    """Send build tasks for as many pending experiments as the group allows.

    Returns:
        ``None`` when the group should stop early (a stop task for its
        pending experiments is sent instead). Otherwise a boolean that is
        true when pending experiments remain beyond this batch or when not
        all suggestions have been scheduled yet.
    """
    # Early stopping takes precedence over starting anything new.
    if experiment_group.should_stop_early():
        stop_payload = {
            'experiment_group_id': experiment_group.id,
            'pending': True,
            'message': 'Early stopping',
        }
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP_EXPERIMENTS,
            kwargs=stop_payload,
            countdown=conf.get('GLOBAL_COUNTDOWN'))
        return

    quota = experiment_group.n_experiments_to_start
    if quota <= 0:
        # This could happen due to concurrency or not created yet experiments
        if experiment_group.pending_experiments.exists():
            return True
        return not experiment_group.scheduled_all_suggestions()

    batch = experiment_group.pending_experiments[:quota]
    # Counted before the batch is iterated, as in the original ordering.
    pending_total = experiment_group.pending_experiments.count()

    for experiment in batch:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_BUILD,
            kwargs={'experiment_id': experiment.id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))

    if pending_total - quota > 0:
        return True
    return not experiment_group.scheduled_all_suggestions()
def experiments_group_schedule_deletion(experiment_group_id, immediate=False):
    """Archive an experiment group, stop it if stoppable, optionally delete it.

    Args:
        experiment_group_id: id of the group; deleted groups are included in
            the lookup.
        immediate: when true, a ``DELETE_ARCHIVED_EXPERIMENT_GROUP`` task is
            also sent.

    Returns:
        ``None`` in every case; nothing is done when the group is not found.
    """
    group = get_valid_experiment_group(experiment_group_id=experiment_group_id,
                                       include_deleted=True)
    if not group:
        # No need to check this group
        return

    group.archive()

    if group.is_stoppable:
        stop_payload = {
            'experiment_group_id': experiment_group_id,
            'collect_logs': False,
            'message': 'Experiment Group is scheduled for deletion.',
        }
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP,
            kwargs=stop_payload,
            countdown=conf.get('GLOBAL_COUNTDOWN'))

    if immediate:
        celery_app.send_task(
            SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT_GROUP,
            kwargs={'group_id': experiment_group_id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))
def build_jobs_schedule_deletion(build_job_id, immediate=False):
    """Archive a build job, stop it if stoppable, and optionally delete it.

    Args:
        build_job_id: id of the build job; deleted ones are included in the
            lookup.
        immediate: when true, a ``DELETE_ARCHIVED_BUILD_JOB`` task is also
            sent.

    Returns:
        ``None`` in every case; a missing build job is only logged.
    """
    build_job = get_valid_build_job(build_job_id=build_job_id, include_deleted=True)
    if not build_job:
        _logger.info(
            'Something went wrong, '
            'the BuildJob `%s` does not exist anymore.', build_job_id)
        return

    build_job.archive()

    if build_job.is_stoppable:
        project = build_job.project
        stop_payload = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'build_job_name': build_job.unique_name,
            'build_job_uuid': build_job.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'message': 'Build is scheduled for deletion.',
        }
        celery_app.send_task(
            SchedulerCeleryTasks.BUILD_JOBS_STOP,
            kwargs=stop_payload,
            countdown=conf.get('GLOBAL_COUNTDOWN'))

    if immediate:
        celery_app.send_task(
            SchedulerCeleryTasks.DELETE_ARCHIVED_BUILD_JOB,
            kwargs={'job_id': build_job_id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))
def stop_running_experiment(sender, **kwargs):
    """Send a stop task for a running experiment that has jobs.

    The experiment is read from ``kwargs['instance']``. Nothing is sent when
    it is not running or has no jobs. ``ExperimentGroup.DoesNotExist`` raised
    while building or sending the task is swallowed. ``sender`` is not used.
    """
    experiment = kwargs['instance']
    if not experiment.is_running:
        return
    if experiment.jobs.count() == 0:
        return

    try:
        group = experiment.experiment_group
        stop_payload = {
            'project_name': experiment.project.unique_name,
            'project_uuid': experiment.project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': group.unique_name if group else None,
            'experiment_group_uuid': group.uuid.hex if group else None,
            'specification': experiment.config,
            'update_status': False,
        }
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                             kwargs=stop_payload)
    except ExperimentGroup.DoesNotExist:
        # The experiment was already stopped when the group was deleted
        pass
def perform_destroy(self, instance):
    """Archive the build job and send its immediate-deletion task.

    The instance is archived here, then ``BUILD_JOBS_SCHEDULE_DELETION`` is
    sent with ``immediate=True``.
    """
    instance.archive()

    deletion_payload = {
        'build_job_id': instance.id,
        'immediate': True,
    }
    celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_SCHEDULE_DELETION,
                         kwargs=deletion_payload)
def post(self, request, *args, **kwargs):
    """Start a tensorboard for an experiment, a group, or the project.

    The scope comes from the URL kwargs: ``experiment_id`` wins over
    ``group_id``; with neither, the project-level handler is used. A missing
    experiment or group yields a 404 through ``get_object_or_404``.

    Returns:
        HTTP 200 with ``'Tensorboard is already running'`` when the handler
        returns nothing; otherwise HTTP 201, after sending
        ``TENSORBOARDS_START`` if the tensorboard is not running yet.
    """
    project = self.project
    experiment_id = self.kwargs.get('experiment_id')
    group_id = self.kwargs.get('group_id')

    if experiment_id:
        experiment = get_object_or_404(Experiment, project=project, id=experiment_id)
        tensorboard = self._handle_experiment_tensorboard(project=project,
                                                          experiment=experiment)
    elif group_id:
        group = get_object_or_404(ExperimentGroup, project=project, id=group_id)
        tensorboard = self._handle_group_tensorboard(project=project, group=group)
    else:
        tensorboard = self._handle_project_tensorboard(project=project)

    if not tensorboard:
        return Response(data='Tensorboard is already running',
                        status=status.HTTP_200_OK)

    already_running = tensorboard.is_running
    if not already_running:
        celery_app.send_task(
            SchedulerCeleryTasks.TENSORBOARDS_START,
            kwargs={'tensorboard_job_id': tensorboard.id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))
    return Response(status=status.HTTP_201_CREATED)
def handle_experiment_job_condition(event_object, pod_state, status, labels, container_name):
    """Update the job's containers and forward the pod state to its handler.

    ``update_job_containers`` is called first, then ``pod_state`` is sent as
    the ``payload`` of a ``K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES`` task.
    ``labels`` is only used in the debug log line.
    """
    update_job_containers(event_object, status, container_name)
    logger.debug("Sending state to handler %s, %s", status, labels)

    # Hand the experiment job status over to its Celery handler.
    handler_payload = {'payload': pod_state}
    celery_app.send_task(
        K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
        kwargs=handler_payload)
def experiments_schedule_deletion(experiment_id, immediate=False):
    """Archive an experiment, stop it if running, and optionally delete it.

    Args:
        experiment_id: id of the experiment; deleted ones are included in the
            lookup.
        immediate: when true, a ``DELETE_ARCHIVED_EXPERIMENT`` task is also
            sent.

    Returns:
        ``None`` in every case; a missing experiment is only logged.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id,
                                      include_deleted=True)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    experiment.archive()

    if experiment.is_running:
        project = experiment.project
        stop_payload = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            # Group information is always sent as None here.
            'experiment_group_name': None,
            'experiment_group_uuid': None,
            'specification': experiment.config,
            'update_status': True,
            'collect_logs': False,
            'message': 'Experiment is scheduled for deletion.',
        }
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                             kwargs=stop_payload)

    if immediate:
        celery_app.send_task(SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT,
                             kwargs={'experiment_id': experiment_id})
def _handle_experiment_created(cls, event: 'Event') -> None:
    """Send the build task for a newly created experiment when applicable.

    The task is sent only when ``event.data`` reports a specification and the
    experiment is either independent or a clone.
    """
    data = event.data
    if not data['has_specification']:
        return
    if not (data['is_independent'] or data['is_clone']):
        return

    # Build first; the build task is sent with a one second countdown.
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_BUILD,
        kwargs={'experiment_id': data['id']},
        countdown=1)
def jobs_schedule_deletion(job_id, immediate=False):
    """Archive a job, stop it if stoppable, and optionally delete it.

    Args:
        job_id: id of the job; deleted jobs are included in the lookup.
        immediate: when true, a ``DELETE_ARCHIVED_JOB`` task is also sent,
            using the ``SCHEDULER_GLOBAL_COUNTDOWN_DELAYED`` countdown.

    Returns:
        ``None`` in every case; nothing is done when the job is not found.
    """
    job = get_valid_job(job_id=job_id, include_deleted=True)
    if not job:
        return None

    job.archive()

    if job.is_stoppable:
        project = job.project
        stop_payload = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'job_name': job.unique_name,
            'job_uuid': job.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'is_managed': job.is_managed,
            'message': 'Job is scheduled for deletion.',
        }
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_STOP,
            kwargs=stop_payload,
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))

    if immediate:
        celery_app.send_task(
            SchedulerCeleryTasks.DELETE_ARCHIVED_JOB,
            kwargs={'job_id': job_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
    return None
def post(self, request, *args, **kwargs):
    """Commit or undo notebook changes, then stop the notebook; answer 200.

    When the project has a notebook, the request's ``commit`` flag (default
    true when absent) selects between ``git.commit`` and ``git.undo`` on the
    project repo; a ``FileNotFoundError`` from that step is ignored. A
    ``PROJECTS_NOTEBOOK_STOP`` task is then sent and the trigger audited.
    Otherwise, a notebook that still exists and is running is marked
    ``STOPPED`` directly.
    """
    project = self.project

    if not project.has_notebook:
        notebook = project.notebook
        if notebook and notebook.is_running:
            notebook.set_status(status=ExperimentLifeCycle.STOPPED,
                                message='Notebook was stopped')
        return Response(status=status.HTTP_200_OK)

    raw_commit = request.data.get('commit')
    should_commit = True if raw_commit is None else to_bool(raw_commit)

    try:
        if should_commit:
            # Commit changes
            git.commit(project.repo.path, request.user.email, request.user.username)
        else:
            # Reset changes
            git.undo(project.repo.path)
    except FileNotFoundError:
        # Git probably was not found
        pass

    notebook = project.notebook
    stop_payload = {
        'project_name': project.unique_name,
        'project_uuid': project.uuid.hex,
        'notebook_job_name': notebook.unique_name,
        'notebook_job_uuid': notebook.uuid.hex,
        'update_status': True,
    }
    celery_app.send_task(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
                         kwargs=stop_payload)

    actor = self.request.user
    auditor.record(event_type=NOTEBOOK_STOPPED_TRIGGERED,
                   instance=notebook,
                   target='project',
                   actor_id=actor.id,
                   actor_name=actor.username,
                   countdown=1)
    return Response(status=status.HTTP_200_OK)