def create(experiment_group):
    """Create the group's experiments and kick off the random-search loop."""
    base.create_group_experiments(experiment_group=experiment_group)
    # Small countdown so the creating transaction can commit first — TODO confirm.
    celery_app.send_task(
        HPCeleryTasks.HP_RANDOM_SEARCH_START,
        kwargs={'experiment_group_id': experiment_group.id},
        countdown=1)
def build(self, nocache=False, memory_limit=None):
    """Render the Dockerfile, record it, and run the docker build.

    Returns the result of `_handle_log_stream` over the build output.
    """
    _logger.debug('Starting build in `%s`', self.repo_path)
    # Check out the commit matching the image tag unless we build `latest`.
    if self.image_tag != self.LATEST_IMAGE_TAG:
        git.checkout_commit(repo_path=self.repo_path, commit=self.image_tag)

    # Always disable memory swap for building, since mostly
    # nothing good can come of that.
    limits = {'memswap': -1}
    if memory_limit:
        limits['memory'] = memory_limit

    # Create the Dockerfile and let the scheduler record its content.
    rendered_dockerfile = self.render()
    with open(self.dockerfile_path, 'w') as dockerfile:
        dockerfile.write(rendered_dockerfile)
    celery_app.send_task(
        SchedulerCeleryTasks.BUILD_JOBS_SET_DOCKERFILE,
        kwargs={'build_job_uuid': self.job_uuid,
                'dockerfile': rendered_dockerfile})

    stream = self.docker.build(
        path=self.build_path,
        tag=self.get_tagged_image(),
        forcerm=True,
        rm=True,
        pull=True,
        nocache=nocache,
        container_limits=limits)
    return self._handle_log_stream(stream=stream)
def experiments_group_stop_experiments(experiment_group_id, pending, message=None):
    """Stop the experiments of a running group, then mark the group stopped.

    With `pending=True` only pending experiments are marked stopped; otherwise
    every non-done experiment is either stopped via the scheduler (if running)
    or marked stopped directly.
    """
    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return

    if pending:
        # Pending experiments never started; just record the stop.
        for experiment in experiment_group.pending_experiments:
            experiment.set_status(status=ExperimentLifeCycle.STOPPED,
                                  message=message)
    else:
        non_done = experiment_group.experiments.exclude(
            status__status__in=ExperimentLifeCycle.DONE_STATUS).distinct()
        for experiment in non_done:
            if experiment.is_running:
                celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                                     kwargs={'experiment_id': experiment.id})
            else:
                # Not running: update the status directly.
                experiment.set_status(status=ExperimentLifeCycle.STOPPED,
                                      message=message)

    experiment_group.set_status(ExperimentGroupLifeCycle.STOPPED)
def build_handle_done_status(sender, **kwargs):
    """Notify the scheduler when a build job status reaches a done state."""
    instance = kwargs['instance']
    if not JobLifeCycle.is_done(instance.status):
        return
    celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                         kwargs={'build_job_id': instance.job_id})
def start_new_experiment(sender, **kwargs):
    """Schedule the build of a newly created independent experiment."""
    instance = kwargs['instance']
    if not instance.is_independent:
        # Grouped experiments are driven by their experiment group instead.
        return
    # Build first; the spawners pick the experiment up afterwards.
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                         kwargs={'experiment_id': instance.id},
                         countdown=1)
def stop_running_experiment(sender, **kwargs):
    """Ask the scheduler to stop a running experiment that still has jobs."""
    instance = kwargs['instance']
    if not instance.is_running or instance.jobs.count() == 0:
        return
    try:
        group = instance.experiment_group
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_STOP,
            kwargs={
                'project_name': instance.project.unique_name,
                'project_uuid': instance.project.uuid.hex,
                'experiment_name': instance.unique_name,
                'experiment_uuid': instance.uuid.hex,
                'experiment_group_name': group.unique_name if group else None,
                'experiment_group_uuid': group.uuid.hex if group else None,
                'specification': instance.config,
                'update_status': False
            })
    except ExperimentGroup.DoesNotExist:
        # The experiment was already stopped when the group was deleted
        pass
def post(self, request, *args, **kwargs):
    """Stop the tensorboard attached to an experiment, a group, or the project."""
    project = self.get_object()
    experiment_id = self.kwargs.get('experiment_id')
    group_id = self.kwargs.get('group_id')

    # Resolve which entity owns the tensorboard job.
    if experiment_id:
        owner = get_object_or_404(Experiment, project=project, id=experiment_id)
    elif group_id:
        owner = get_object_or_404(ExperimentGroup, project=project, id=group_id)
    else:
        owner = project
    has_tensorboard = owner.has_tensorboard
    tensorboard = owner.tensorboard

    if has_tensorboard:
        celery_app.send_task(
            SchedulerCeleryTasks.TENSORBOARDS_STOP,
            kwargs={
                'project_name': project.unique_name,
                'project_uuid': project.uuid.hex,
                'tensorboard_job_name': tensorboard.unique_name,
                'tensorboard_job_uuid': tensorboard.uuid.hex,
                'update_status': True
            })
        auditor.record(event_type=TENSORBOARD_STOPPED_TRIGGERED,
                       instance=tensorboard,
                       target='project',
                       actor_id=self.request.user.id,
                       actor_name=self.request.user.username)
    return Response(status=status.HTTP_200_OK)
def new_experiment_job_status(sender, **kwargs):
    """On a newly created job status, sync the job then re-check the experiment."""
    instance = kwargs['instance']
    if not kwargs.get('created', False):
        # Only newly created statuses are propagated.
        return
    job = instance.job
    # Keep the job's last status in sync.
    job.status = instance
    job.save()
    if job.is_done:
        # Done jobs no longer need container monitoring.
        from libs.redis_db import RedisJobContainers
        RedisJobContainers.remove_job(job.uuid.hex)
    experiment = instance.job.experiment
    if experiment.is_done:
        return
    # The experiment status may need updating after this job change.
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
        kwargs={'experiment_id': experiment.id},
        countdown=1)
def run(containers, node, persist):
    """Publish resource payloads for every monitored job container."""
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        # Index GPU records by device index for fast lookup.
        gpu_resources = {gpu['index']: gpu for gpu in gpu_resources}
    update_cluster_node(gpu_resources)
    for container_id in container_ids:
        if not get_container(containers, container_id):
            continue
        payload = get_container_resources(node,
                                          containers[container_id],
                                          gpu_resources)
        if not payload:
            continue
        payload = payload.to_dict()
        logger.debug("Publishing resources event")
        celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                             kwargs={'payload': payload,
                                     'persist': persist})
        job_uuid = payload['job_uuid']
        # Stream the latest resources when the job or its experiment is watched.
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        should_set_latest = (
            RedisToStream.is_monitored_job_resources(job_uuid) or
            RedisToStream.is_monitored_experiment_resources(experiment_uuid))
        if should_set_latest:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def new_operation_run_status(sender, **kwargs):
    """Sync an operation run with its newest status and nudge the pipeline."""
    instance = kwargs['instance']
    operation_run = instance.operation_run
    pipeline_run = operation_run.pipeline_run
    # Keep the operation run's last status in sync.
    operation_run.status = instance
    operation_run.save()
    if instance.status == OperationStatuses.CREATED:
        # Nothing to propagate for a freshly created run.
        return
    # The pipeline run may need a status update after this change.
    celery_app.send_task(
        PipelineCeleryTasks.PIPELINES_CHECK_STATUSES,
        kwargs={'pipeline_run_id': pipeline_run.id,
                'status': instance.status,
                'message': instance.message})
    if operation_run.is_done:
        # Wake up downstream runs whose dependency just finished.
        waiting_runs = operation_run.downstream_runs.filter(
            status__status=OperationStatuses.CREATED)
        for waiting_run in waiting_runs:
            celery_app.send_task(
                PipelineCeleryTasks.PIPELINES_START_OPERATION,
                kwargs={'operation_run_id': waiting_run.id})
def new_pipeline_run_status(sender, **kwargs):
    """Sync a pipeline run with its newest status and notify its operations."""
    instance = kwargs['instance']
    pipeline_run = instance.pipeline_run
    # Keep the pipeline run's last status and timestamps in sync.
    pipeline_run.status = instance
    set_started_at(instance=pipeline_run,
                   status=instance.status,
                   starting_statuses=[PipelineStatuses.RUNNING])
    set_finished_at(instance=pipeline_run,
                    status=instance.status,
                    is_done=PipelineStatuses.is_done)
    pipeline_run.save()
    # Operations must hear about stops/skips so the whole dag reacts.
    if pipeline_run.stopped:
        celery_app.send_task(PipelinesCeleryTasks.PIPELINES_STOP_OPERATIONS,
                             kwargs={'pipeline_run_id': pipeline_run.id,
                                     'message': 'Pipeline run was stopped'})
    if pipeline_run.skipped:
        celery_app.send_task(PipelinesCeleryTasks.PIPELINES_SKIP_OPERATIONS,
                             kwargs={'pipeline_run_id': pipeline_run.id,
                                     'message': 'Pipeline run was skipped'})
def new_operation_run_status(sender, **kwargs):
    """Sync an operation run (status + timestamps) and nudge its pipeline."""
    instance = kwargs['instance']
    operation_run = instance.operation_run
    pipeline_run = operation_run.pipeline_run
    # Keep the operation run's last status and timestamps in sync.
    operation_run.status = instance
    set_started_at(instance=operation_run,
                   status=instance.status,
                   starting_statuses=[PipelineStatuses.RUNNING])
    set_finished_at(instance=operation_run,
                    status=instance.status,
                    is_done=PipelineStatuses.is_done)
    operation_run.save()
    if instance.status == OperationStatuses.CREATED:
        # Nothing to propagate for a freshly created run.
        return
    # The pipeline run may need a status update after this change.
    celery_app.send_task(PipelinesCeleryTasks.PIPELINES_CHECK_STATUSES,
                         kwargs={'pipeline_run_id': pipeline_run.id,
                                 'status': instance.status,
                                 'message': instance.message})
    if operation_run.is_done:
        # Wake up downstream runs whose dependency just finished.
        waiting_runs = operation_run.downstream_runs.filter(
            status__status=OperationStatuses.CREATED)
        for waiting_run in waiting_runs:
            celery_app.send_task(
                PipelinesCeleryTasks.PIPELINES_START_OPERATION,
                kwargs={'operation_run_id': waiting_run.id})
def publish_build_job_log(self, log_lines, job_uuid, job_name):
    """Forward build-job log lines to the events handler."""
    # Normalize a single line into a list.
    log_lines = to_list(log_lines)
    self._logger.info("Publishing log event for task: %s", job_uuid)
    celery_app.send_task(
        EventsCeleryTasks.EVENTS_HANDLE_LOGS_BUILD_JOB,
        kwargs={'job_uuid': job_uuid,
                'job_name': job_name,
                'log_lines': log_lines})
def post(self, request, *args, **kwargs):
    """Audit and trigger the stop of an experiment."""
    experiment = self.get_object()
    auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                   instance=experiment,
                   actor_id=request.user.id,
                   actor_name=request.user.username)
    group = experiment.experiment_group
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_STOP,
        kwargs={
            'project_name': experiment.project.unique_name,
            'project_uuid': experiment.project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': group.unique_name if group else None,
            'experiment_group_uuid': group.uuid.hex if group else None,
            'specification': experiment.config,
            'update_status': True
        })
    return Response(status=status.HTTP_200_OK)
def create(experiment_group):
    """Create the group's experiments and kick off the grid-search loop."""
    base.create_group_experiments(experiment_group=experiment_group)
    # Small countdown so the creating transaction can commit first — TODO confirm.
    celery_app.send_task(
        HPCeleryTasks.HP_GRID_SEARCH_START,
        kwargs={'experiment_group_id': experiment_group.id},
        countdown=1)
def projects_notebook_build(project_id):
    """Build (or reuse) the docker image for a project's notebook job."""
    project = get_valid_project(project_id=project_id)
    if not project or not project.notebook:
        _logger.warning('Project does not have a notebook.')
        return None
    job = project.notebook

    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Notebook for project id `%s` cannot transition from `%s` to `%s`.',
                     project_id, job.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=job.user,
        project=job.project,
        config=job.specification.run_exec,
        code_reference=job.code_reference)
    job.build_job = build_job
    job.save()

    if image_exists:
        # The image already exists: start the notebook right away.
        celery_app.send_task(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
            kwargs={'project_id': project_id})
        return
    if not build_status:
        job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
        return
    # Surface that the docker image is being built.
    job.set_status(JobLifeCycle.BUILDING, message='Building container')
def experiment_job_status_post_save(sender, **kwargs):
    """Sync a job with its new status and re-check the owning experiment."""
    instance = kwargs['instance']
    job = instance.job
    # Keep the job's last status and timestamps in sync.
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save()
    if job.is_done:
        # Done jobs no longer need container monitoring.
        from libs.redis_db import RedisJobContainers
        RedisJobContainers.remove_job(job.uuid.hex)
    experiment = instance.job.experiment
    if experiment.is_done:
        return
    # The experiment status may need updating after this job change.
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
        kwargs={'experiment_id': experiment.id},
        countdown=1)
def hp_hyperband_iterate(self, experiment_group_id):
    """Advance a group's hyperband iteration once all its experiments are done."""
    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return
    if experiment_group.non_done_experiments.count() > 0:
        # All experiments must be done before iterating; try again later.
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    iteration_config = experiment_group.iteration_config
    iteration_manager = experiment_group.iteration_manager
    search_manager = experiment_group.search_manager

    iteration_manager.update_iteration()

    if search_manager.should_reschedule(
            iteration=iteration_config.iteration,
            bracket_iteration=iteration_config.bracket_iteration):
        # Start a fresh bracket.
        celery_app.send_task(
            HPCeleryTasks.HP_HYPERBAND_CREATE,
            kwargs={'experiment_group_id': experiment_group_id})
        return

    if search_manager.should_reduce_configs(
            iteration=iteration_config.iteration,
            bracket_iteration=iteration_config.bracket_iteration):
        # Keep only the best configs and continue the current bracket.
        iteration_manager.reduce_configs()
        celery_app.send_task(
            HPCeleryTasks.HP_HYPERBAND_START,
            kwargs={'experiment_group_id': experiment_group_id})
        return

    base.check_group_experiments_finished(experiment_group_id)
def jobs_build(job_id):
    """Build (or reuse) the docker image for a job, then schedule its start.

    Args:
        job_id: id of the job to build.

    Returns:
        None. Side effects only: schedules tasks and updates job statuses.
    """
    job = get_valid_job(job_id=job_id)
    if not job:
        # Fix: the previous message ('Job does not have a notebook.') was
        # copy-pasted from the notebook-build task and did not describe this
        # code path — the job itself no longer exists.
        _logger.warning('Job `%s` does not exist anymore.', job_id)
        return None

    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=job.user,
        project=job.project,
        config=job.specification.build,
        code_reference=job.code_reference)
    job.build_job = build_job
    job.save()

    if image_exists:
        # The image already exists: start the job right away.
        celery_app.send_task(SchedulerCeleryTasks.JOBS_START,
                             kwargs={'job_id': job_id})
        return
    if not build_status:
        job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
        return
    # Surface that the docker image is being built.
    job.set_status(JobLifeCycle.BUILDING, message='Building container')
def post(self, request, *args, **kwargs):
    """Create and start a tensorboard for an experiment, a group, or the project."""
    project = self.get_object()
    experiment_id = self.kwargs.get('experiment_id')
    group_id = self.kwargs.get('group_id')

    # Resolve the tensorboard from the most specific entity requested.
    if experiment_id:
        experiment = get_object_or_404(Experiment, project=project, id=experiment_id)
        tensorboard = self._handle_experiment_tensorboard(project=project,
                                                          experiment=experiment)
    elif group_id:
        group = get_object_or_404(ExperimentGroup, project=project, id=group_id)
        tensorboard = self._handle_group_tensorboard(project=project, group=group)
    else:
        tensorboard = self._handle_project_tensorboard(project=project)

    if not tensorboard:
        return Response(data='Tensorboard is already running',
                        status=status.HTTP_200_OK)
    if not tensorboard.is_running:
        celery_app.send_task(SchedulerCeleryTasks.TENSORBOARDS_START,
                             kwargs={'tensorboard_job_id': tensorboard.id},
                             countdown=1)
    return Response(status=status.HTTP_201_CREATED)
def hp_hyperband_iterate(self, experiment_group_id):
    """Run one hyperband iteration step for a group whose experiments finished."""
    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return
    if experiment_group.non_done_experiments.count() > 0:
        # Not all experiments are done yet; retry this task later.
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    iteration_config = experiment_group.iteration_config
    iteration_manager = experiment_group.iteration_manager
    search_manager = experiment_group.search_manager

    iteration_manager.update_iteration()

    reschedule = search_manager.should_reschedule(
        iteration=iteration_config.iteration,
        bracket_iteration=iteration_config.bracket_iteration)
    if reschedule:
        # Start a fresh bracket.
        celery_app.send_task(
            HPCeleryTasks.HP_HYPERBAND_CREATE,
            kwargs={'experiment_group_id': experiment_group_id})
        return

    reduce_configs = search_manager.should_reduce_configs(
        iteration=iteration_config.iteration,
        bracket_iteration=iteration_config.bracket_iteration)
    if reduce_configs:
        # Keep only the best configs and continue the current bracket.
        iteration_manager.reduce_configs()
        celery_app.send_task(
            HPCeleryTasks.HP_HYPERBAND_START,
            kwargs={'experiment_group_id': experiment_group_id})
        return

    base.check_group_experiments_finished(experiment_group_id)
def new_experiment_job_status(sender, **kwargs):
    """Handle a new experiment-job status: update the job, then the experiment."""
    instance = kwargs['instance']
    created = kwargs.get('created', False)
    job = instance.job
    if created:
        # Keep the job's last status in sync.
        job.status = instance
        job.save()
        if job.is_done:
            # Done jobs are removed from the container monitors.
            from libs.redis_db import RedisJobContainers
            RedisJobContainers.remove_job(job.uuid.hex)
    if not created:
        # Only newly created statuses are propagated further.
        return
    experiment = instance.job.experiment
    if experiment.is_done:
        return
    # The experiment status may need updating after this job change.
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                         kwargs={'experiment_id': experiment.id},
                         countdown=1)
def build(self, nocache=False, memory_limit=None):
    """Build the docker image for this job and stream its logs.

    Returns the value of `_handle_log_stream` over the docker build stream.
    """
    _logger.debug('Starting build in `%s`', self.repo_path)
    # Checkout to the correct commit, except for the `latest` tag.
    if self.image_tag != self.LATEST_IMAGE_TAG:
        git.checkout_commit(repo_path=self.repo_path, commit=self.image_tag)

    # Always disable memory swap for building, since mostly
    # nothing good can come of that.
    limits = {'memswap': -1}
    if memory_limit:
        limits['memory'] = memory_limit

    # Write the rendered Dockerfile and let the scheduler record it.
    content = self.render()
    with open(self.dockerfile_path, 'w') as dockerfile:
        dockerfile.write(content)
    celery_app.send_task(
        SchedulerCeleryTasks.BUILD_JOBS_SET_DOCKERFILE,
        kwargs={'build_job_uuid': self.job_uuid, 'dockerfile': content})

    stream = self.docker.build(path=self.build_path,
                               tag=self.get_tagged_image(),
                               forcerm=True,
                               rm=True,
                               pull=True,
                               nocache=nocache,
                               container_limits=limits)
    return self._handle_log_stream(stream=stream)
def projects_notebook_build(notebook_job_id):
    """Build (or reuse) the docker image for a notebook job, then start it."""
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        _logger.warning('Notebook %s does not exist anymore.', notebook_job_id)
        return None

    if not JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job, notebook_job.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=notebook_job.user,
        project=notebook_job.project,
        config=notebook_job.specification.build,
        code_reference=notebook_job.code_reference)
    notebook_job.build_job = build_job
    notebook_job.save()

    if image_exists:
        # The image already exists: start the notebook right away.
        celery_app.send_task(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
            kwargs={'notebook_job_id': notebook_job_id})
        return
    if not build_status:
        notebook_job.set_status(JobLifeCycle.FAILED,
                                message='Could not start build process.')
        return
    # Surface that the docker image is being built.
    notebook_job.set_status(JobLifeCycle.BUILDING, message='Building container')
def run(containers, node, persist):
    """Collect and publish resource usage for all monitored job containers."""
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        # Index GPU records by device index for fast lookup.
        gpu_resources = {gpu['index']: gpu for gpu in gpu_resources}
    # update cluster and current node
    update_cluster(gpu_resources)
    for container_id in container_ids:
        if not get_container(containers, container_id):
            continue
        payload = get_container_resources(node,
                                          containers[container_id],
                                          gpu_resources)
        if not payload:
            continue
        payload = payload.to_dict()
        logger.info("Publishing resources event")
        celery_app.send_task(
            EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
            kwargs={'payload': payload, 'persist': persist})
        job_uuid = payload['job_uuid']
        # Stream the latest resources when the job or its experiment is watched.
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        monitored = (RedisToStream.is_monitored_job_resources(job_uuid) or
                     RedisToStream.is_monitored_experiment_resources(experiment_uuid))
        if monitored:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def post(self, request, *args, **kwargs):
    """Commit or undo notebook changes, stop the notebook job, and audit it.

    When the project has a running notebook, the repo changes are committed
    (default) or reset, a stop task is sent to the scheduler and the stop is
    audited. Otherwise a stale non-running notebook is just marked stopped.
    """
    obj = self.get_object()
    if obj.has_notebook:
        commit = request.data.get('commit')
        commit = to_bool(commit) if commit is not None else True
        if commit:
            # Commit changes
            git.commit(obj.repo.path, request.user.email, request.user.username)
        else:
            # Reset changes
            git.undo(obj.repo.path)
        celery_app.send_task(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
                             kwargs={
                                 'project_name': obj.unique_name,
                                 'project_uuid': obj.uuid.hex,
                                 'notebook_job_name': obj.notebook.unique_name,
                                 'notebook_job_uuid': obj.notebook.uuid.hex,
                                 'update_status': True
                             })
        # Fix: `countdown=1` (a Celery send_task kwarg) was mistakenly passed
        # to auditor.record; pass the actor name instead, matching the other
        # audited stop views in this codebase.
        auditor.record(event_type=NOTEBOOK_STOPPED_TRIGGERED,
                       instance=obj.notebook,
                       target='project',
                       actor_id=self.request.user.id,
                       actor_name=self.request.user.username)
    elif obj.notebook and obj.notebook.is_running:
        obj.notebook.set_status(status=ExperimentLifeCycle.STOPPED,
                                message='Notebook was stopped')
    return Response(status=status.HTTP_200_OK)
def handle_new_experiment_status(sender, **kwargs):
    """When the master or a worker finishes, stop the experiment's other jobs."""
    instance = kwargs['instance']
    experiment = instance.experiment
    if not experiment.specification:
        return

    done = instance.status in (ExperimentLifeCycle.FAILED,
                               ExperimentLifeCycle.SUCCEEDED)
    if not (done and experiment.jobs.count() > 0):
        return

    _logger.info(
        'One of the workers failed or Master for experiment `%s` is done, '
        'send signal to other workers to stop.', experiment.unique_name)
    # Schedule a stop because other jobs of this experiment may still run.
    group = experiment.experiment_group
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_STOP,
        kwargs={
            'project_name': experiment.project.unique_name,
            'project_uuid': experiment.project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': group.unique_name if group else None,
            'experiment_group_uuid': group.uuid.hex if group else None,
            'specification': experiment.config,
            'update_status': False
        },
        countdown=RedisTTL.get_for_experiment(experiment_id=experiment.id))
def build_experiment(self, experiment_id):
    """Build the docker image for an experiment, then schedule its start."""
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    # No build/run sections in the spec: start the experiment directly.
    if not (experiment.specification.build and experiment.specification.run):
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                             kwargs={'experiment_id': experiment_id})
        return

    if not ExperimentLifeCycle.can_transition(
            status_from=experiment.last_status,
            status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status,
                     ExperimentLifeCycle.BUILDING)
        return None

    # Surface that the docker image is being built.
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='No code was found for to build this experiment.')
        return
    except Exception:  # Unexpected errors: fail the experiment, keep the trace.
        _logger.error(
            'Failed to build experiment, unexpected error occurred.\n%s',
            traceback.format_exc())
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    if not status:
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    # Image ready: hand over to the scheduler.
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                         kwargs={'experiment_id': experiment_id})
def sync_experiments_and_jobs_statuses():
    """Queue a status check for every non-done experiment that has jobs."""
    experiments = (
        Experiment.objects
        .exclude(status__status__in=ExperimentLifeCycle.DONE_STATUS)
        .annotate(num_jobs=Count('jobs'))
        .filter(num_jobs__gt=0))
    for experiment in experiments:
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                             kwargs={'experiment_id': experiment.id})
def build_handle_done_status(sender, **kwargs):
    """Tell the scheduler a build job reached a terminal status."""
    instance = kwargs['instance']
    build_job_id = instance.job_id
    if JobLifeCycle.is_done(instance.status):
        celery_app.send_task(
            SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
            kwargs={'build_job_id': build_job_id})
def start_new_experiment(sender, **kwargs):
    """Kick off the build of a new independent experiment."""
    instance = kwargs['instance']
    if instance.is_independent:
        # Build the experiment first, then it is scheduled for the spawners.
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_BUILD,
            kwargs={'experiment_id': instance.id},
            countdown=1)
def post(self, request, *args, **kwargs):
    """Audit and trigger the stop of an experiment."""
    obj = self.get_object()
    # Consistency fix: also record the actor's name, as the other audited
    # stop views in this codebase do alongside actor_id.
    auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                   instance=obj,
                   actor_id=request.user.id,
                   actor_name=request.user.username)
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                         kwargs={'experiment_id': obj.id})
    return Response(status=status.HTTP_200_OK)
def post(self, request, *args, **kwargs):
    """Audit and trigger the stop of an experiment."""
    obj = self.get_object()
    # Consistency fix: also record the actor's name, as the other audited
    # stop views in this codebase do alongside actor_id.
    auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                   instance=obj,
                   actor_id=request.user.id,
                   actor_name=request.user.username)
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_STOP,
        kwargs={'experiment_id': obj.id})
    return Response(status=status.HTTP_200_OK)
def experiments_group_create(self, experiment_group_id):
    """Mark the group running and delegate experiment creation to HP_CREATE."""
    experiment_group = _get_group_or_retry(experiment_group_id=experiment_group_id,
                                           task=self)
    if not experiment_group:
        return
    experiment_group.set_status(ExperimentGroupLifeCycle.RUNNING)
    celery_app.send_task(
        HPCeleryTasks.HP_CREATE,
        kwargs={'experiment_group_id': experiment_group_id})
def experiments_group_create(self, experiment_group_id):
    """Set the group to RUNNING and hand off to the HP creation task."""
    experiment_group = _get_group_or_retry(
        experiment_group_id=experiment_group_id, task=self)
    if not experiment_group:
        return
    experiment_group.set_status(ExperimentGroupLifeCycle.RUNNING)
    celery_app.send_task(HPCeleryTasks.HP_CREATE,
                         kwargs={'experiment_group_id': experiment_group_id})
def post(self, request, *args, **kwargs):
    """Create a notebook for the project and schedule its build if needed."""
    project = self.get_object()
    if project.has_notebook:
        return Response(data='Notebook is already running',
                        status=status.HTTP_200_OK)
    self._create_notebook(project)
    if not project.notebook.is_running:
        celery_app.send_task(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_BUILD,
            kwargs={'project_id': project.id})
    return Response(status=status.HTTP_201_CREATED)
def create(experiment_group):
    """Create a hyperband iteration with its experiments and start the loop."""
    iteration_manager = experiment_group.iteration_manager
    iteration_manager.create_iteration()
    experiments = base.create_group_experiments(experiment_group=experiment_group)
    # Attach the freshly created experiments to the new iteration.
    iteration_manager.add_iteration_experiments(
        experiment_ids=[xp.id for xp in experiments])
    celery_app.send_task(HPCeleryTasks.HP_HYPERBAND_START,
                         kwargs={'experiment_group_id': experiment_group.id},
                         countdown=1)
def _run(task_bind, *args, **kwargs):
    """Trigger the build of the experiment named in `kwargs['experiment_id']`.

    Raises OperationRunError when the experiment no longer exists.
    """
    experiment_id = kwargs['experiment_id']
    if not get_valid_experiment(experiment_id=experiment_id):
        raise OperationRunError(
            'The Experiment `{}` does not exist anymore.'.format(experiment_id))
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                         kwargs={'experiment_id': experiment_id})
def post(self, request, *args, **kwargs):
    """Create a tensorboard for the project and schedule its start if needed."""
    project = self.get_object()
    if project.has_tensorboard:
        return Response(data='Tensorboard is already running',
                        status=status.HTTP_200_OK)
    self._create_tensorboard(project)
    if not project.tensorboard.is_running:
        celery_app.send_task(
            SchedulerCeleryTasks.PROJECTS_TENSORBOARD_START,
            kwargs={'project_id': project.id})
    return Response(status=status.HTTP_201_CREATED)
def publish_build_job_log(self, log_line, job_uuid, job_name):
    """Decode a raw log line if needed and forward it to the events handler."""
    try:
        log_line = log_line.decode('utf-8')
    except AttributeError:
        # Already a str; nothing to decode.
        pass
    self._logger.info("Publishing log event for task: %s", job_uuid)
    celery_app.send_task(
        EventsCeleryTasks.EVENTS_HANDLE_LOGS_BUILD_JOB,
        kwargs={'job_uuid': job_uuid,
                'job_name': job_name,
                'log_line': log_line})
def create(experiment_group):
    """Open a new hyperband iteration, register its experiments, start it."""
    experiment_group.iteration_manager.create_iteration()
    experiments = base.create_group_experiments(
        experiment_group=experiment_group)
    new_ids = [xp.id for xp in experiments]
    experiment_group.iteration_manager.add_iteration_experiments(
        experiment_ids=new_ids)
    celery_app.send_task(
        HPCeleryTasks.HP_HYPERBAND_START,
        kwargs={'experiment_group_id': experiment_group.id},
        countdown=1)
def tensorboard_job_pre_delete(sender, **kwargs):
    """Stop the tensorboard job through the scheduler before it is deleted."""
    job = kwargs['instance']
    celery_app.send_task(
        SchedulerCeleryTasks.TENSORBOARDS_STOP,
        kwargs={
            'project_name': job.project.unique_name,
            'project_uuid': job.project.uuid.hex,
            'tensorboard_job_name': job.unique_name,
            'tensorboard_job_uuid': job.uuid.hex,
            # The row is being deleted, so no status update is needed.
            'update_status': False
        })
def create(experiment_group):
    """Create the first BO iteration's experiments and start the BO loop."""
    experiments = base.create_group_experiments(experiment_group=experiment_group)
    # The iteration needs both the ids and the (id, declarations) pairs.
    experiment_ids = [xp.id for xp in experiments]
    experiments_configs = [[xp.id, xp.declarations] for xp in experiments]
    experiment_group.iteration_manager.create_iteration(
        experiment_ids=experiment_ids,
        experiments_configs=experiments_configs)
    celery_app.send_task(
        HPCeleryTasks.HP_BO_START,
        kwargs={'experiment_group_id': experiment_group.id},
        countdown=1)
def build_experiment(self, experiment_id):
    """Build the docker image for an experiment, then schedule its start.

    Retries while the experiment row is not yet visible; sets the experiment
    to FAILED on any build error.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    # No need to build the image, start the experiment directly
    if not (experiment.specification.build and experiment.specification.run):
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status,
                     ExperimentLifeCycle.BUILDING)
        return None

    # Update experiment status to show that its building
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    # Building the docker image
    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='No code was found for to build this experiment.')
        return
    except Exception as e:  # Other exceptions
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    if not status:
        # Fix: previously this returned without updating the status, leaving
        # the experiment stuck in BUILDING; fail it like the other error
        # paths (and like the sibling version of this task) do.
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    # Now we can start the experiment
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_START,
        kwargs={'experiment_id': experiment_id})
def build_check_stop_job(sender, **kwargs):
    """Schedule a stop for a build job that failed or succeeded."""
    instance = kwargs['instance']
    build_job_id = instance.job_id
    if instance.status not in (JobLifeCycle.FAILED, JobLifeCycle.SUCCEEDED):
        return
    _logger.info('The build job with id `%s` failed or is done, '
                 'send signal to stop.', build_job_id)
    # Schedule stop for this job; the status is already terminal.
    celery_app.send_task(
        SchedulerCeleryTasks.BUILD_JOBS_STOP,
        kwargs={'build_job_id': build_job_id, 'update_status': False})
def notify_build_job_succeeded(build_job):
    """Start every job, tensorboard, notebook and experiment waiting on a build."""
    def _start_dependants(model, task_name, kwarg_name):
        # Fan out one start task per dependant of the finished build.
        ids = model.objects.filter(build_job=build_job).values_list('id', flat=True)
        for dependant_id in ids:
            celery_app.send_task(task_name, kwargs={kwarg_name: dependant_id})

    _start_dependants(Job, SchedulerCeleryTasks.JOBS_START, 'job_id')
    _start_dependants(TensorboardJob,
                      SchedulerCeleryTasks.TENSORBOARDS_START,
                      'tensorboard_job_id')
    _start_dependants(NotebookJob,
                      SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
                      'notebook_job_id')
    _start_dependants(Experiment,
                      SchedulerCeleryTasks.EXPERIMENTS_START,
                      'experiment_id')
def hp_hyperband_start(self, experiment_group_id):
    """Start pending experiments of a hyperband group; iterate when done."""
    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return

    if base.start_group_experiments(experiment_group=experiment_group):
        # More pending experiments remain: schedule another task.
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    celery_app.send_task(
        HPCeleryTasks.HP_HYPERBAND_ITERATE,
        kwargs={'experiment_group_id': experiment_group_id})
def send_status(build_job, status, message=None):
    """Publish a status event for a dockerizer build job."""
    labels = {
        'app': 'dockerizer',
        'job_uuid': build_job.uuid.hex,
        'job_name': build_job.unique_name
    }
    payload = {
        'details': {'labels': labels},
        'status': status,
        'message': message
    }
    celery_app.send_task(
        EventsCeleryTasks.EVENTS_HANDLE_BUILD_JOB_STATUSES,
        kwargs={'payload': payload})
def update_cluster(node_gpus):
    """Refresh cluster-wide info and sync this node's GPU records.

    Args:
        node_gpus: mapping of GPU index to a dict with at least the keys
            'serial', 'name' and 'memory_total'. May be falsy when the node
            reports no GPUs.
    """
    celery_app.send_task(CronsCeleryTasks.CLUSTERS_UPDATE_SYSTEM_INFO)
    celery_app.send_task(CronsCeleryTasks.CLUSTERS_UPDATE_SYSTEM_NODES)
    if not node_gpus:
        return

    # NOTE(review): .first() may return None if the node record is missing —
    # confirm NodeGPU tolerates cluster_node=None, or guard here.
    node = ClusterNode.objects.filter(name=settings.K8S_NODE_NAME).first()
    # Iterate items() directly instead of keys() + per-key indexing.
    for node_gpu_index, node_gpu_value in node_gpus.items():
        try:
            node_gpu = NodeGPU.objects.get(cluster_node=node, index=node_gpu_index)
        except NodeGPU.DoesNotExist:
            node_gpu = NodeGPU(cluster_node=node, index=node_gpu_index)

        node_gpu.serial = node_gpu_value['serial']
        node_gpu.name = node_gpu_value['name']
        node_gpu.memory = node_gpu_value['memory_total']
        node_gpu.save()
def publish_experiment_job_log(self,
                               log_lines,
                               status,
                               experiment_uuid,
                               experiment_name,
                               job_uuid,
                               task_type=None,
                               task_idx=None):
    """Send job log lines to the events handler and, when the job or its
    experiment is monitored, stream them on the internal exchange."""
    self._logger.debug("Publishing log event for task: %s.%s, %s",
                       task_type, task_idx, experiment_name)

    celery_app.send_task(
        EventsCeleryTasks.EVENTS_HANDLE_LOGS_EXPERIMENT_JOB,
        kwargs={
            'experiment_name': experiment_name,
            'experiment_uuid': experiment_uuid,
            'job_uuid': job_uuid,
            'log_lines': log_lines,
            'task_type': task_type,
            'task_idx': task_idx})

    # Streaming is best-effort: a Redis outage must not break publishing.
    try:
        should_stream = (RedisToStream.is_monitored_job_logs(job_uuid) or
                         RedisToStream.is_monitored_experiment_logs(experiment_uuid))
    except RedisError:
        should_stream = False

    if not should_stream:
        return

    self._logger.info("Streaming new log event for experiment: %s",
                      experiment_uuid)

    stream_message = {
        'experiment_uuid': experiment_uuid,
        'job_uuid': job_uuid,
        'log_lines': log_lines,
        'status': status,
        'task_type': task_type,
        'task_idx': task_idx
    }
    stream_routing_key = '{}.{}.{}'.format(RoutingKeys.LOGS_SIDECARS,
                                           experiment_uuid,
                                           job_uuid)
    with celery_app.producer_or_acquire(None) as producer:
        try:
            producer.publish(
                stream_message,
                routing_key=stream_routing_key,
                exchange=settings.INTERNAL_EXCHANGE,
            )
        except (TimeoutError, AMQPError):
            # Drop the stream update rather than fail the sidecar.
            pass
def new_pipeline_run_status(sender, **kwargs):
    """Signal handler: record a pipeline run's new status and propagate
    stop/skip decisions to its operations."""
    status_instance = kwargs['instance']
    pipeline_run = status_instance.pipeline_run

    # Update job last_status
    pipeline_run.status = status_instance
    pipeline_run.save()

    # Notify operations with status change. This is necessary if we skip or stop the dag run.
    notifications = (
        (pipeline_run.stopped,
         PipelineCeleryTasks.PIPELINES_STOP_OPERATIONS,
         'Pipeline run was stopped'),
        (pipeline_run.skipped,
         PipelineCeleryTasks.PIPELINES_SKIP_OPERATIONS,
         'Pipeline run was skipped'),
    )
    for triggered, task_name, task_message in notifications:
        if triggered:
            celery_app.send_task(
                task_name,
                kwargs={'pipeline_run_id': pipeline_run.id,
                        'message': task_message})
def handle_new_experiment_status(sender, **kwargs):
    """Signal handler: once an experiment ends, stop its remaining jobs."""
    status_instance = kwargs['instance']
    experiment = status_instance.experiment
    if not experiment.specification:
        return

    reached_final_state = status_instance.status in (ExperimentLifeCycle.FAILED,
                                                     ExperimentLifeCycle.SUCCEEDED)
    if not reached_final_state or experiment.jobs.count() <= 0:
        return

    _logger.info('One of the workers failed or Master for experiment `%s` is done, '
                 'send signal to other workers to stop.', experiment.unique_name)
    # Schedule stop for this experiment because other jobs may be still running
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_STOP,
        kwargs={'experiment_id': experiment.id, 'update_status': False})
def start_group_experiments(experiment_group):
    """Start up to the group's allowed number of pending experiments.

    Returns True when pending experiments remain after this round (the
    caller should schedule another pass); returns None on early stopping.
    """
    # Check for early stopping before starting new experiments from this group
    if experiment_group.should_stop_early():
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP_EXPERIMENTS,
            kwargs={'experiment_group_id': experiment_group.id,
                    'pending': True,
                    'message': 'Early stopping'})
        return

    n_to_start = experiment_group.n_experiments_to_start
    batch = experiment_group.pending_experiments[:n_to_start]
    n_pending = experiment_group.pending_experiments.count()
    for experiment in batch:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_BUILD,
            kwargs={'experiment_id': experiment.id})
    return n_pending - n_to_start > 0
def start(self):
    """Start the celery task of this operation."""
    task_kwargs = self.celery_task_context
    # Attach this run's id so the task can report back to it.
    task_kwargs['operation_run_id'] = self.id  # pylint:disable=unsupported-assignment-operation
    async_result = celery_app.send_task(
        self.operation.celery_task,
        kwargs=task_kwargs,
        **self.operation.get_run_params())
    self.celery_task_id = async_result.id
    self.save()
def experiments_group_stop_experiments(experiment_group_id, pending, message=None):
    """Stop a group's experiments: only pending ones, or every unfinished one."""
    experiment_group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not experiment_group:
        return

    if pending:
        for experiment in experiment_group.pending_experiments:
            # Update experiment status to show that its stopped
            experiment.set_status(status=ExperimentLifeCycle.STOPPED,
                                  message=message)
    else:
        unfinished = experiment_group.experiments.exclude(
            status__status__in=ExperimentLifeCycle.DONE_STATUS).distinct()
        for experiment in unfinished:
            if not experiment.is_running:
                # Update experiment status to show that its stopped
                experiment.set_status(status=ExperimentLifeCycle.STOPPED,
                                      message=message)
                continue
            celery_app.send_task(
                SchedulerCeleryTasks.EXPERIMENTS_STOP,
                kwargs={'experiment_id': experiment.id})

    experiment_group.set_status(ExperimentGroupLifeCycle.STOPPED)
def experiments_build(experiment_id):
    """Ensure a docker image exists for the experiment, building it if needed."""
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    spec = experiment.specification

    # No need to build the image, start the experiment directly
    if not (spec.build and spec.run):
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status, ExperimentLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=spec.build,
        code_reference=experiment.code_reference)

    experiment.build_job = build_job
    experiment.save()

    if image_exists:
        # The image already exists, so we can start the experiment right away
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})
        return

    if not build_status:
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Could not start build process.')
        return

    # Update experiment status to show that its building
    experiment.set_status(ExperimentLifeCycle.BUILDING)
def build_project_notebook(project_id):
    """Build the docker image for a project's notebook, then schedule its start.

    Args:
        project_id: id of the project whose notebook should be built.
    """
    project = get_valid_project(project_id)
    if not project or not project.notebook:
        return None

    notebook_job = project.notebook

    # Update job status to show that its building docker image
    notebook_job.set_status(JobLifeCycle.BUILDING, message='Building container')

    # Building the docker image
    try:
        status = notebooks_builder.build_notebook_job(project=project,
                                                      job=project.notebook)
    except DockerException as e:
        _logger.warning('Failed to build notebook %s', e)
        notebook_job.set_status(
            JobLifeCycle.FAILED,
            message='Failed to build image for notebook.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        # Surface the actual cause instead of the generic build-failure
        # message (previously hid the missing-repo condition); consistent
        # with how experiment builds report a missing repo.
        notebook_job.set_status(
            JobLifeCycle.FAILED,
            message='No code was found to build this notebook.')
        return
    except Exception as e:  # Other exceptions
        _logger.warning('Failed to build notebook %s', e)
        notebook_job.set_status(JobLifeCycle.FAILED,
                                message='Failed to build image for notebook.')
        return

    if not status:
        return

    # Now we can start the notebook
    celery_app.send_task(
        SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
        kwargs={'notebook_job_id': notebook_job.id})
def check_group_experiments_finished(experiment_group_id):
    """Schedule the task that checks whether a group's experiments are done."""
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_GROUP_CHECK_FINISHED,
        kwargs={'experiment_group_id': experiment_group_id})