Ejemplo n.º 1
0
def create(experiment_group):
    """Create the group's experiments and queue the random-search start task.

    Args:
        experiment_group: the group passed to
            ``base.create_group_experiments``; its ``id`` is forwarded to the
            ``HPCeleryTasks.HP_RANDOM_SEARCH_START`` task.
    """
    base.create_group_experiments(experiment_group=experiment_group)

    task_kwargs = {'experiment_group_id': experiment_group.id}
    # The task is sent with a countdown of 1 rather than immediately.
    celery_app.send_task(HPCeleryTasks.HP_RANDOM_SEARCH_START,
                         kwargs=task_kwargs,
                         countdown=1)
Ejemplo n.º 2
0
    def build(self, nocache=False, memory_limit=None):
        """Write the rendered Dockerfile and run the docker build.

        Args:
            nocache: forwarded to ``self.docker.build`` as ``nocache``.
            memory_limit: when truthy, added to the container limits under
                the ``'memory'`` key.

        Returns:
            Whatever ``self._handle_log_stream`` returns for the build stream.
        """
        _logger.debug('Starting build in `%s`', self.repo_path)
        # A specific commit is only checked out for non-latest image tags.
        if self.image_tag != self.LATEST_IMAGE_TAG:
            git.checkout_commit(repo_path=self.repo_path,
                                commit=self.image_tag)

        # 'memswap' is always set to -1 for builds (original note: nothing
        # good can come of swapping while building).
        container_limits = {'memswap': -1}
        if memory_limit:
            container_limits['memory'] = memory_limit

        # Render the Dockerfile, report it through the scheduler task, then
        # persist it; the order inside the `with` is kept as is.
        with open(self.dockerfile_path, 'w') as dockerfile_handle:
            dockerfile_content = self.render()
            task_kwargs = {
                'build_job_uuid': self.job_uuid,
                'dockerfile': dockerfile_content
            }
            celery_app.send_task(
                SchedulerCeleryTasks.BUILD_JOBS_SET_DOCKERFILE,
                kwargs=task_kwargs)
            dockerfile_handle.write(dockerfile_content)

        build_stream = self.docker.build(
            path=self.build_path,
            tag=self.get_tagged_image(),
            forcerm=True,
            rm=True,
            pull=True,
            nocache=nocache,
            container_limits=container_limits)
        return self._handle_log_stream(stream=build_stream)
Ejemplo n.º 3
0
def experiments_group_stop_experiments(experiment_group_id,
                                       pending,
                                       message=None):
    """Stop the experiments of a running group and mark the group stopped.

    Args:
        experiment_group_id: id used to look up the running group; nothing
            happens when the lookup returns a falsy value.
        pending: when truthy only the group's pending experiments are set to
            STOPPED; otherwise every experiment not in a done status is
            handled (running ones get a stop task, the rest are set to
            STOPPED directly).
        message: optional message stored with the STOPPED status.
    """
    group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not group:
        return

    def _mark_stopped(exp):
        # Update the experiment status to show that it is stopped.
        exp.set_status(status=ExperimentLifeCycle.STOPPED, message=message)

    if pending:
        for exp in group.pending_experiments:
            _mark_stopped(exp)
    else:
        not_done = group.experiments.exclude(
            status__status__in=ExperimentLifeCycle.DONE_STATUS).distinct()
        for exp in not_done:
            if not exp.is_running:
                _mark_stopped(exp)
                continue
            celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                                 kwargs={'experiment_id': exp.id})

    group.set_status(ExperimentGroupLifeCycle.STOPPED)
Ejemplo n.º 4
0
def build_handle_done_status(sender, **kwargs):
    """Queue the build-done notification when the new status is a done status.

    Reads ``kwargs['instance']`` (the status record) and, if
    ``JobLifeCycle.is_done`` accepts its ``status``, sends
    ``SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE`` with the record's
    ``job_id``.
    """
    status_record = kwargs['instance']
    job_id = status_record.job_id

    if not JobLifeCycle.is_done(status_record.status):
        return
    celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                         kwargs={'build_job_id': job_id})
Ejemplo n.º 5
0
def start_new_experiment(sender, **kwargs):
    """Queue the build task for an independent experiment.

    Reads ``kwargs['instance']``; when ``instance.is_independent`` is falsy
    nothing is sent.
    """
    experiment = kwargs['instance']
    if not experiment.is_independent:
        return
    # Start building the experiment; scheduling for the spawners follows.
    task_kwargs = {'experiment_id': experiment.id}
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                         kwargs=task_kwargs,
                         countdown=1)
Ejemplo n.º 6
0
def stop_running_experiment(sender, **kwargs):
    """Queue a stop task for a running experiment that has jobs.

    Reads ``kwargs['instance']``. Nothing happens when the experiment is not
    running or has no jobs. An ``ExperimentGroup.DoesNotExist`` raised while
    preparing or sending the task is ignored.
    """
    experiment = kwargs['instance']
    if not experiment.is_running or experiment.jobs.count() == 0:
        return

    try:
        group = experiment.experiment_group
        stop_kwargs = {
            'project_name': experiment.project.unique_name,
            'project_uuid': experiment.project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': group.unique_name if group else None,
            'experiment_group_uuid': group.uuid.hex if group else None,
            'specification': experiment.config,
            'update_status': False
        }
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                             kwargs=stop_kwargs)
    except ExperimentGroup.DoesNotExist:
        # The experiment was already stopped when the group was deleted.
        pass
Ejemplo n.º 7
0
    def post(self, request, *args, **kwargs):
        """Stop the tensorboard of an experiment, a group or the project.

        The target is chosen from the URL kwargs: ``experiment_id`` first,
        then ``group_id``, otherwise the project itself. When the target has
        a tensorboard, a stop task is sent and an audit event is recorded.
        Always responds with HTTP 200.
        """
        project = self.get_object()
        experiment_id = self.kwargs.get('experiment_id')
        group_id = self.kwargs.get('group_id')

        # Resolve the object that owns the tensorboard.
        if experiment_id:
            owner = get_object_or_404(Experiment, project=project, id=experiment_id)
        elif group_id:
            owner = get_object_or_404(ExperimentGroup, project=project, id=group_id)
        else:
            owner = project
        has_tensorboard = owner.has_tensorboard
        tensorboard = owner.tensorboard

        if not has_tensorboard:
            return Response(status=status.HTTP_200_OK)

        stop_kwargs = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'tensorboard_job_name': tensorboard.unique_name,
            'tensorboard_job_uuid': tensorboard.uuid.hex,
            'update_status': True
        }
        celery_app.send_task(SchedulerCeleryTasks.TENSORBOARDS_STOP,
                             kwargs=stop_kwargs)
        auditor.record(event_type=TENSORBOARD_STOPPED_TRIGGERED,
                       instance=tensorboard,
                       target='project',
                       actor_id=self.request.user.id,
                       actor_name=self.request.user.username)
        return Response(status=status.HTTP_200_OK)
Ejemplo n.º 8
0
def new_experiment_job_status(sender, **kwargs):
    """React to a saved experiment-job status record.

    Reads ``kwargs['instance']`` and ``kwargs.get('created', False)``.
    For a newly created record the job's ``status`` is updated and saved.
    When the job is done its containers are removed from the Redis monitor.
    For a created record whose experiment is not done, a status-check task
    is queued for the experiment.
    """
    status_record = kwargs['instance']
    is_new = kwargs.get('created', False)
    job = status_record.job

    if is_new:
        # Point the job at its latest status.
        job.status = status_record
        job.save()

    # A done job no longer needs its containers monitored.
    if job.is_done:
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(job.uuid.hex)

    # Only a newly created status can trigger an experiment status check.
    if not is_new:
        return

    experiment = status_record.job.experiment
    if experiment.is_done:
        return

    task_kwargs = {'experiment_id': experiment.id}
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                         kwargs=task_kwargs,
                         countdown=1)
Ejemplo n.º 9
0
def run(containers, node, persist):
    """Publish resource events for every container tracked in Redis.

    Args:
        containers: container lookup indexed by container id; it is passed
            to ``get_container`` and subscripted with each id.
        node: forwarded to ``get_container_resources``.
        persist: forwarded to the resources event task as ``'persist'``.
    """
    tracked_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        # Re-key the GPU resources by their 'index' entry.
        gpu_resources = {gpu['index']: gpu for gpu in gpu_resources}
    update_cluster_node(gpu_resources)

    for container_id in tracked_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id],
                                          gpu_resources)
        if not payload:
            continue

        payload = payload.to_dict()
        logger.debug("Publishing resources event")
        event_kwargs = {'payload': payload, 'persist': persist}
        celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                             kwargs=event_kwargs)

        job_uuid = payload['job_uuid']
        # Keep the latest resources only when the job, or the experiment it
        # belongs to, is currently monitored for streaming.
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        is_monitored = (
            RedisToStream.is_monitored_job_resources(job_uuid)
            or RedisToStream.is_monitored_experiment_resources(
                experiment_uuid))
        if is_monitored:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
Ejemplo n.º 10
0
def new_operation_run_status(sender, **kwargs):
    """Propagate a new operation-run status.

    Reads ``kwargs['instance']``, stores it as the operation run's
    ``status`` and saves the run. Unless the status is CREATED, a
    status-check task is queued for the pipeline run; when the operation run
    is done, a start task is queued for each downstream run still in
    CREATED.
    """
    status_record = kwargs['instance']
    op_run = status_record.operation_run
    pipeline_run = op_run.pipeline_run
    # Point the operation run at its latest status.
    op_run.status = status_record
    op_run.save()

    # A freshly created operation needs no further checks.
    if status_record.status == OperationStatuses.CREATED:
        return

    # Let the pipeline run re-evaluate its own status.
    check_kwargs = {
        'pipeline_run_id': pipeline_run.id,
        'status': status_record.status,
        'message': status_record.message
    }
    celery_app.send_task(PipelineCeleryTasks.PIPELINES_CHECK_STATUSES,
                         kwargs=check_kwargs)

    if not op_run.is_done:
        return
    # Notify downstream runs that this dependency is done so they can start.
    waiting_runs = op_run.downstream_runs.filter(
        status__status=OperationStatuses.CREATED)
    for downstream in waiting_runs:
        celery_app.send_task(PipelineCeleryTasks.PIPELINES_START_OPERATION,
                             kwargs={'operation_run_id': downstream.id})
Ejemplo n.º 11
0
def new_pipeline_run_status(sender, **kwargs):
    """Apply a new pipeline-run status and notify the run's operations.

    Reads ``kwargs['instance']``, stores it as the pipeline run's ``status``,
    calls ``set_started_at`` / ``set_finished_at`` and saves the run. A stop
    or skip task is queued for the operations when the run reports
    ``stopped`` or ``skipped``.
    """
    status_record = kwargs['instance']
    run_obj = status_record.pipeline_run
    # Point the pipeline run at its latest status.
    run_obj.status = status_record
    set_started_at(instance=run_obj,
                   status=status_record.status,
                   starting_statuses=[PipelineStatuses.RUNNING])
    set_finished_at(instance=run_obj,
                    status=status_record.status,
                    is_done=PipelineStatuses.is_done)
    run_obj.save()

    # Operations must be told about the change when the dag run is stopped
    # or skipped.
    if run_obj.stopped:
        stop_kwargs = {
            'pipeline_run_id': run_obj.id,
            'message': 'Pipeline run was stopped'
        }
        celery_app.send_task(PipelinesCeleryTasks.PIPELINES_STOP_OPERATIONS,
                             kwargs=stop_kwargs)
    if run_obj.skipped:
        skip_kwargs = {
            'pipeline_run_id': run_obj.id,
            'message': 'Pipeline run was skipped'
        }
        celery_app.send_task(PipelinesCeleryTasks.PIPELINES_SKIP_OPERATIONS,
                             kwargs=skip_kwargs)
Ejemplo n.º 12
0
def new_operation_run_status(sender, **kwargs):
    """Propagate a new operation-run status and record its timestamps.

    Reads ``kwargs['instance']``, stores it as the operation run's
    ``status``, calls ``set_started_at`` / ``set_finished_at`` and saves the
    run. Unless the status is CREATED, a status-check task is queued for the
    pipeline run; when the operation run is done, a start task is queued for
    each downstream run still in CREATED.
    """
    status_record = kwargs['instance']
    op_run = status_record.operation_run
    pipeline_run = op_run.pipeline_run
    # Point the operation run at its latest status.
    op_run.status = status_record
    # NOTE(review): the timestamps of an operation run are derived from
    # PipelineStatuses rather than OperationStatuses - confirm intended.
    set_started_at(instance=op_run,
                   status=status_record.status,
                   starting_statuses=[PipelineStatuses.RUNNING])
    set_finished_at(instance=op_run,
                    status=status_record.status,
                    is_done=PipelineStatuses.is_done)
    op_run.save()

    # A freshly created operation needs no further checks.
    if status_record.status == OperationStatuses.CREATED:
        return

    # Let the pipeline run re-evaluate its own status.
    check_kwargs = {
        'pipeline_run_id': pipeline_run.id,
        'status': status_record.status,
        'message': status_record.message
    }
    celery_app.send_task(PipelinesCeleryTasks.PIPELINES_CHECK_STATUSES,
                         kwargs=check_kwargs)

    if not op_run.is_done:
        return
    # Notify downstream runs that this dependency is done so they can start.
    waiting_runs = op_run.downstream_runs.filter(
        status__status=OperationStatuses.CREATED)
    for downstream in waiting_runs:
        celery_app.send_task(PipelinesCeleryTasks.PIPELINES_START_OPERATION,
                             kwargs={'operation_run_id': downstream.id})
Ejemplo n.º 13
0
    def publish_build_job_log(self, log_lines, job_uuid, job_name):
        """Send build-job log lines to the events log-handling task.

        Args:
            log_lines: passed through ``to_list`` before being sent.
            job_uuid: sent as ``'job_uuid'`` and included in the info log.
            job_name: sent as ``'job_name'``.
        """
        lines = to_list(log_lines)

        self._logger.info("Publishing log event for task: %s", job_uuid)
        event_kwargs = {
            'job_uuid': job_uuid,
            'job_name': job_name,
            'log_lines': lines
        }
        celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_LOGS_BUILD_JOB,
                             kwargs=event_kwargs)
Ejemplo n.º 14
0
 def post(self, request, *args, **kwargs):
     """Record a stop-triggered audit event and queue the experiment stop task.

     Always responds with HTTP 200.
     """
     experiment = self.get_object()
     auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                    instance=experiment,
                    actor_id=request.user.id,
                    actor_name=request.user.username)
     group = experiment.experiment_group
     # Group fields are None when the experiment has no group.
     stop_kwargs = {
         'project_name': experiment.project.unique_name,
         'project_uuid': experiment.project.uuid.hex,
         'experiment_name': experiment.unique_name,
         'experiment_uuid': experiment.uuid.hex,
         'experiment_group_name': group.unique_name if group else None,
         'experiment_group_uuid': group.uuid.hex if group else None,
         'specification': experiment.config,
         'update_status': True
     }
     celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                          kwargs=stop_kwargs)
     return Response(status=status.HTTP_200_OK)
Ejemplo n.º 15
0
def create(experiment_group):
    """Create the group's experiments and queue the grid-search start task.

    Args:
        experiment_group: the group passed to
            ``base.create_group_experiments``; its ``id`` is forwarded to the
            ``HPCeleryTasks.HP_GRID_SEARCH_START`` task.
    """
    base.create_group_experiments(experiment_group=experiment_group)

    task_kwargs = {'experiment_group_id': experiment_group.id}
    # The task is sent with a countdown of 1 rather than immediately.
    celery_app.send_task(HPCeleryTasks.HP_GRID_SEARCH_START,
                         kwargs=task_kwargs,
                         countdown=1)
Ejemplo n.º 16
0
def projects_notebook_build(project_id):
    """Create the build job for a project's notebook and update its status.

    Returns ``None`` in every case. The notebook start task is queued right
    away when the image already exists; otherwise the notebook is set to
    FAILED when the build could not start, or to BUILDING when it did.
    """
    project = get_valid_project(project_id=project_id)
    if not (project and project.notebook):
        _logger.warning('Project does not have a notebook.')
        return None

    notebook = project.notebook

    can_build = JobLifeCycle.can_transition(status_from=notebook.last_status,
                                            status_to=JobLifeCycle.BUILDING)
    if not can_build:
        _logger.info('Notebook for project id `%s` cannot transition from `%s` to `%s`.',
                     project_id, notebook.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=notebook.user,
        project=notebook.project,
        config=notebook.specification.run_exec,
        code_reference=notebook.code_reference)

    notebook.build_job = build_job
    notebook.save()

    if image_exists:
        # Nothing to build: the notebook can be started straight away.
        celery_app.send_task(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
                             kwargs={'project_id': project_id})
    elif not build_status:
        notebook.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
    else:
        # Show that the docker image is being built.
        notebook.set_status(JobLifeCycle.BUILDING, message='Building container')
Ejemplo n.º 17
0
def experiment_job_status_post_save(sender, **kwargs):
    """React to a saved experiment-job status record.

    Reads ``kwargs['instance']``, stores it as the job's ``status``, updates
    the job's started/finished timestamps and saves the job. When the job is
    done its containers are removed from the Redis monitor. Unless the
    experiment is done, a status-check task is queued for it.
    """
    status_record = kwargs['instance']
    job = status_record.job

    # Point the job at its latest status and refresh its timestamps.
    job.status = status_record
    set_job_started_at(instance=job, status=status_record.status)
    set_job_finished_at(instance=job, status=status_record.status)
    job.save()

    # A done job no longer needs its containers monitored.
    if job.is_done:
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(job.uuid.hex)

    experiment = status_record.job.experiment
    if experiment.is_done:
        return

    task_kwargs = {'experiment_id': experiment.id}
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                         kwargs=task_kwargs,
                         countdown=1)
Ejemplo n.º 18
0
def hp_hyperband_iterate(self, experiment_group_id):
    """Advance a running hyperband group by one iteration.

    Does nothing when the running group cannot be found. While the group
    still has non-done experiments, ``self.retry`` is called. Otherwise the
    iteration is updated and either a create task (reschedule), a start task
    (after reducing configs) or the finished-check is triggered.
    """
    group = get_running_experiment_group(experiment_group_id=experiment_group_id)
    if not group:
        return

    if group.non_done_experiments.count() > 0:
        # All experiments must be done first: schedule another attempt.
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    config = group.iteration_config
    manager = group.iteration_manager
    search = group.search_manager

    manager.update_iteration()

    task_kwargs = {'experiment_group_id': experiment_group_id}

    if search.should_reschedule(iteration=config.iteration,
                                bracket_iteration=config.bracket_iteration):
        celery_app.send_task(HPCeleryTasks.HP_HYPERBAND_CREATE,
                             kwargs=task_kwargs)
        return

    if search.should_reduce_configs(iteration=config.iteration,
                                    bracket_iteration=config.bracket_iteration):
        manager.reduce_configs()
        celery_app.send_task(HPCeleryTasks.HP_HYPERBAND_START,
                             kwargs=task_kwargs)
        return

    base.check_group_experiments_finished(experiment_group_id)
Ejemplo n.º 19
0
def jobs_build(job_id):
    """Create the build job for a job and update the job's status.

    Args:
        job_id: id used to look up the job via ``get_valid_job``.

    Returns:
        ``None`` in every case. The job start task is queued right away when
        the image already exists; otherwise the job is set to FAILED when the
        build could not start, or to BUILDING when it did.
    """
    job = get_valid_job(job_id=job_id)
    if not job:
        # The lookup failed for a job, not a notebook: say so and include
        # the id so the failure can be traced.
        _logger.warning('Job `%s` does not exist anymore.', job_id)
        return None

    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=job.user,
        project=job.project,
        config=job.specification.build,
        code_reference=job.code_reference)

    job.build_job = build_job
    job.save()
    if image_exists:
        # The image already exists, so the job can be started right away.
        celery_app.send_task(SchedulerCeleryTasks.JOBS_START,
                             kwargs={'job_id': job_id})
        return

    if not build_status:
        job.set_status(JobLifeCycle.FAILED,
                       message='Could not start build process.')
        return

    # Update job status to show that it is building the docker image.
    job.set_status(JobLifeCycle.BUILDING, message='Building container')
Ejemplo n.º 20
0
    def post(self, request, *args, **kwargs):
        """Start a tensorboard for an experiment, a group or the project.

        The target is chosen from the URL kwargs: ``experiment_id`` first,
        then ``group_id``, otherwise the project itself. Responds with HTTP
        200 and 'Tensorboard is already running' when the handler returns a
        falsy value; otherwise queues the start task (unless the tensorboard
        is already running) and responds with HTTP 201.
        """
        project = self.get_object()
        experiment_id = self.kwargs.get('experiment_id')
        group_id = self.kwargs.get('group_id')

        if experiment_id:
            experiment = get_object_or_404(Experiment,
                                           project=project,
                                           id=experiment_id)
            tensorboard = self._handle_experiment_tensorboard(
                project=project, experiment=experiment)
        elif group_id:
            group = get_object_or_404(ExperimentGroup,
                                      project=project,
                                      id=group_id)
            tensorboard = self._handle_group_tensorboard(project=project,
                                                         group=group)
        else:
            tensorboard = self._handle_project_tensorboard(project=project)

        if not tensorboard:
            return Response(data='Tensorboard is already running',
                            status=status.HTTP_200_OK)

        if tensorboard.is_running:
            return Response(status=status.HTTP_201_CREATED)

        start_kwargs = {'tensorboard_job_id': tensorboard.id}
        celery_app.send_task(SchedulerCeleryTasks.TENSORBOARDS_START,
                             kwargs=start_kwargs,
                             countdown=1)
        return Response(status=status.HTTP_201_CREATED)
Ejemplo n.º 21
0
def hp_hyperband_iterate(self, experiment_group_id):
    """Advance a running hyperband group by one iteration.

    Does nothing when the running group cannot be found. While the group
    still has non-done experiments, ``self.retry`` is called. Otherwise the
    iteration is updated and exactly one follow-up is triggered: a create
    task (reschedule), a start task (after reducing configs) or the
    finished-check.
    """
    group = get_running_experiment_group(
        experiment_group_id=experiment_group_id)
    if not group:
        return

    if group.non_done_experiments.count() > 0:
        # All experiments must be done first: schedule another attempt.
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    config = group.iteration_config
    manager = group.iteration_manager
    search = group.search_manager

    manager.update_iteration()

    if search.should_reschedule(
            iteration=config.iteration,
            bracket_iteration=config.bracket_iteration):
        celery_app.send_task(
            HPCeleryTasks.HP_HYPERBAND_CREATE,
            kwargs={'experiment_group_id': experiment_group_id})
    elif search.should_reduce_configs(
            iteration=config.iteration,
            bracket_iteration=config.bracket_iteration):
        manager.reduce_configs()
        celery_app.send_task(
            HPCeleryTasks.HP_HYPERBAND_START,
            kwargs={'experiment_group_id': experiment_group_id})
    else:
        base.check_group_experiments_finished(experiment_group_id)
Ejemplo n.º 22
0
def new_experiment_job_status(sender, **kwargs):
    """React to a saved experiment-job status record.

    Reads ``kwargs['instance']`` and ``kwargs.get('created', False)``.
    For a newly created record the job's ``status`` is updated and saved.
    When the job is done its containers are removed from the Redis monitor.
    For a created record whose experiment is not done, a status-check task
    is queued for the experiment.
    """
    status_record = kwargs['instance']
    was_created = kwargs.get('created', False)
    job = status_record.job

    if was_created:
        # Point the job at its latest status.
        job.status = status_record
        job.save()

    # A done job no longer needs its containers monitored.
    if job.is_done:
        from libs.redis_db import RedisJobContainers

        RedisJobContainers.remove_job(job.uuid.hex)

    # Updates of existing records never trigger an experiment check.
    if not was_created:
        return

    experiment = status_record.job.experiment
    if not experiment.is_done:
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
            kwargs={'experiment_id': experiment.id},
            countdown=1)
Ejemplo n.º 23
0
    def build(self, nocache=False, memory_limit=None):
        """Write the rendered Dockerfile and run the docker build.

        Args:
            nocache: forwarded to ``self.docker.build`` as ``nocache``.
            memory_limit: when truthy, added to the container limits under
                the ``'memory'`` key.

        Returns:
            Whatever ``self._handle_log_stream`` returns for the build stream.
        """
        _logger.debug('Starting build in `%s`', self.repo_path)
        # A specific commit is only checked out for non-latest image tags.
        if self.image_tag != self.LATEST_IMAGE_TAG:
            git.checkout_commit(repo_path=self.repo_path,
                                commit=self.image_tag)

        # 'memswap' is always set to -1 for builds (original note: nothing
        # good can come of swapping while building).
        build_limits = {'memswap': -1}
        if memory_limit:
            build_limits['memory'] = memory_limit

        # Render, report and persist the Dockerfile, in that order.
        with open(self.dockerfile_path, 'w') as out_file:
            rendered = self.render()
            celery_app.send_task(
                SchedulerCeleryTasks.BUILD_JOBS_SET_DOCKERFILE,
                kwargs={
                    'build_job_uuid': self.job_uuid,
                    'dockerfile': rendered
                })
            out_file.write(rendered)

        log_stream = self.docker.build(path=self.build_path,
                                       tag=self.get_tagged_image(),
                                       forcerm=True,
                                       rm=True,
                                       pull=True,
                                       nocache=nocache,
                                       container_limits=build_limits)
        return self._handle_log_stream(stream=log_stream)
Ejemplo n.º 24
0
def projects_notebook_build(notebook_job_id):
    """Create the build job for a notebook job and update its status.

    Returns ``None`` in every case. The notebook start task is queued right
    away when the image already exists; otherwise the notebook is set to
    FAILED when the build could not start, or to BUILDING when it did.
    """
    notebook = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook:
        _logger.warning('Notebook %s does not exist anymore.', notebook_job_id)
        return None

    can_build = JobLifeCycle.can_transition(status_from=notebook.last_status,
                                            status_to=JobLifeCycle.BUILDING)
    if not can_build:
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook, notebook.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=notebook.user,
        project=notebook.project,
        config=notebook.specification.build,
        code_reference=notebook.code_reference)

    notebook.build_job = build_job
    notebook.save()

    if image_exists:
        # Nothing to build: the notebook can be started straight away.
        celery_app.send_task(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
                             kwargs={'notebook_job_id': notebook_job_id})
    elif not build_status:
        notebook.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
    else:
        # Show that the docker image is being built.
        notebook.set_status(JobLifeCycle.BUILDING, message='Building container')
Ejemplo n.º 25
0
def run(containers, node, persist):
    """Publish resource events for every container tracked in Redis.

    Args:
        containers: container lookup indexed by container id; it is passed
            to ``get_container`` and subscripted with each id.
        node: forwarded to ``get_container_resources``.
        persist: forwarded to the resources event task as ``'persist'``.
    """
    tracked_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        # Re-key the GPU resources by their 'index' entry.
        gpu_resources = {gpu['index']: gpu for gpu in gpu_resources}
    # Update the cluster with the GPU resources.
    update_cluster(gpu_resources)

    for container_id in tracked_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id], gpu_resources)
        if not payload:
            continue

        payload = payload.to_dict()
        logger.info("Publishing resources event")
        celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                             kwargs={
                                 'payload': payload,
                                 'persist': persist
                             })

        job_uuid = payload['job_uuid']
        # Keep the latest resources only when the job, or the experiment it
        # belongs to, is currently monitored for streaming.
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        should_stream = (
            RedisToStream.is_monitored_job_resources(job_uuid)
            or RedisToStream.is_monitored_experiment_resources(experiment_uuid))
        if should_stream:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
Ejemplo n.º 26
0
 def post(self, request, *args, **kwargs):
     """Stop the object's notebook, committing or resetting repo changes first.

     When ``has_notebook`` is truthy, the request's ``commit`` flag (default
     True) decides between ``git.commit`` and ``git.undo``; a stop task is
     then sent and an audit event recorded. Otherwise a notebook that still
     reports running is set to STOPPED directly. Always responds HTTP 200.
     """
     project = self.get_object()
     if project.has_notebook:
         raw_commit = request.data.get('commit')
         should_commit = True if raw_commit is None else to_bool(raw_commit)
         if should_commit:
             # Commit changes
             git.commit(project.repo.path, request.user.email,
                        request.user.username)
         else:
             # Reset changes
             git.undo(project.repo.path)
         stop_kwargs = {
             'project_name': project.unique_name,
             'project_uuid': project.uuid.hex,
             'notebook_job_name': project.notebook.unique_name,
             'notebook_job_uuid': project.notebook.uuid.hex,
             'update_status': True
         }
         celery_app.send_task(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_STOP,
                              kwargs=stop_kwargs)
         # NOTE(review): `countdown=1` is passed to auditor.record here, not
         # to send_task - confirm that is intended.
         auditor.record(event_type=NOTEBOOK_STOPPED_TRIGGERED,
                        instance=project.notebook,
                        target='project',
                        actor_id=self.request.user.id,
                        countdown=1)
     elif project.notebook and project.notebook.is_running:
         project.notebook.set_status(status=ExperimentLifeCycle.STOPPED,
                                     message='Notebook was stopped')
     return Response(status=status.HTTP_200_OK)
Ejemplo n.º 27
0
def handle_new_experiment_status(sender, **kwargs):
    """Queue a stop task when an experiment with jobs fails or succeeds.

    Reads ``kwargs['instance']``. Nothing happens when the experiment has no
    specification, when the new status is neither FAILED nor SUCCEEDED, or
    when the experiment has no jobs. The stop task's countdown comes from
    ``RedisTTL.get_for_experiment``.
    """
    status_record = kwargs['instance']
    experiment = status_record.experiment
    if not experiment.specification:
        return

    terminal_statuses = (ExperimentLifeCycle.FAILED,
                         ExperimentLifeCycle.SUCCEEDED)
    if status_record.status not in terminal_statuses:
        return
    if experiment.jobs.count() <= 0:
        return

    _logger.info(
        'One of the workers failed or Master for experiment `%s` is done, '
        'send signal to other workers to stop.', experiment.unique_name)
    # Other jobs of this experiment may still be running: schedule a stop.
    group = experiment.experiment_group
    stop_kwargs = {
        'project_name': experiment.project.unique_name,
        'project_uuid': experiment.project.uuid.hex,
        'experiment_name': experiment.unique_name,
        'experiment_uuid': experiment.uuid.hex,
        'experiment_group_name': group.unique_name if group else None,
        'experiment_group_uuid': group.uuid.hex if group else None,
        'specification': experiment.config,
        'update_status': False
    }
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_STOP,
        kwargs=stop_kwargs,
        countdown=RedisTTL.get_for_experiment(experiment_id=experiment.id))
Ejemplo n.º 28
0
def build_experiment(self, experiment_id):
    """Build the image for an experiment, then queue the experiment start.

    ``self`` exposes ``request.retries`` and ``retry`` (presumably a bound
    Celery task - confirm against the task registration). Returns ``None``
    in every case. The experiment is set to FAILED whenever the build raises
    or reports a falsy result.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            # NOTE(review): Celery's Task.retry normally raises, so the log
            # below is presumably only reached after retries run out.
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    def _queue_start():
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                             kwargs={'experiment_id': experiment_id})

    def _mark_failed(reason):
        experiment.set_status(ExperimentLifeCycle.FAILED, message=reason)

    # Without both a build and a run section there is no image to build.
    if not (experiment.specification.build and experiment.specification.run):
        _queue_start()
        return

    can_build = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=ExperimentLifeCycle.BUILDING)
    if not can_build:
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status,
                     ExperimentLifeCycle.BUILDING)
        return None

    # Show that the experiment is building.
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    # Build the docker image; every failure path marks the experiment FAILED.
    try:
        build_ok = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        _mark_failed('Failed to build image for experiment.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        _mark_failed('No code was found for to build this experiment.')
        return
    except Exception:  # Other exceptions
        _logger.error(
            'Failed to build experiment, unexpected error occurred.\n%s',
            traceback.format_exc())
        _mark_failed('Failed to build image for experiment.')
        return

    if not build_ok:
        _mark_failed('Failed to build image for experiment.')
        return

    # The image is ready: the experiment can be started.
    _queue_start()
Ejemplo n.º 29
0
def sync_experiments_and_jobs_statuses():
    """Send a status-check task for every unfinished experiment that has jobs.

    Experiments whose status is in ``ExperimentLifeCycle.DONE_STATUS`` are
    excluded, and so are experiments with no related job.
    """
    unfinished = Experiment.objects.exclude(
        status__status__in=ExperimentLifeCycle.DONE_STATUS)
    with_jobs = unfinished.annotate(num_jobs=Count('jobs')).filter(num_jobs__gt=0)
    for xp in with_jobs:
        check_kwargs = {'experiment_id': xp.id}
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_CHECK_STATUS,
                             kwargs=check_kwargs)
Ejemplo n.º 30
0
def build_handle_done_status(sender, **kwargs):
    """Send the build-done notification task when the received status is done.

    The status object is read from ``kwargs['instance']``; ``sender`` is unused.
    """
    status_instance = kwargs['instance']
    job_id = status_instance.job_id

    if not JobLifeCycle.is_done(status_instance.status):
        return

    celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                         kwargs={'build_job_id': job_id})
Ejemplo n.º 31
0
def start_new_experiment(sender, **kwargs):
    """Schedule the build task for an independent experiment.

    The experiment is read from ``kwargs['instance']``; nothing is sent when
    it is not independent.
    """
    experiment = kwargs['instance']
    if not experiment.is_independent:
        return

    # Kick off the build; the experiment is then scheduled to be picked up
    # by the spawners.
    build_kwargs = {'experiment_id': experiment.id}
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                         kwargs=build_kwargs,
                         countdown=1)
Ejemplo n.º 32
0
 def post(self, request, *args, **kwargs):
     """Record the stop request, send the experiment stop task, reply 200."""
     experiment = self.get_object()
     auditor.record(event_type=EXPERIMENT_STOPPED_TRIGGERED,
                    instance=experiment,
                    actor_id=request.user.id)
     stop_kwargs = {'experiment_id': experiment.id}
     celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP, kwargs=stop_kwargs)
     return Response(status=status.HTTP_200_OK)
Ejemplo n.º 33
0
 def post(self, request, *args, **kwargs):
     """Audit the stop trigger and schedule the stop of the target experiment.

     Always answers with HTTP 200 once the stop task has been sent.
     """
     target = self.get_object()
     auditor.record(
         event_type=EXPERIMENT_STOPPED_TRIGGERED,
         instance=target,
         actor_id=request.user.id)
     celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                          kwargs={'experiment_id': target.id})
     return Response(status=status.HTTP_200_OK)
Ejemplo n.º 34
0
def experiments_group_create(self, experiment_group_id):
    """Mark the experiment group as running and send the HP create task.

    Nothing is done when ``_get_group_or_retry`` returns no group.
    """
    group = _get_group_or_retry(experiment_group_id=experiment_group_id, task=self)
    if group:
        group.set_status(ExperimentGroupLifeCycle.RUNNING)
        celery_app.send_task(HPCeleryTasks.HP_CREATE,
                             kwargs={'experiment_group_id': experiment_group_id})
Ejemplo n.º 35
0
def experiments_group_create(self, experiment_group_id):
    """Switch the group to RUNNING, then delegate creation to the HP task.

    Returns early, without side effects of its own, if the group lookup
    (``_get_group_or_retry``) comes back empty.
    """
    group = _get_group_or_retry(experiment_group_id=experiment_group_id,
                                task=self)
    if not group:
        return

    group.set_status(ExperimentGroupLifeCycle.RUNNING)
    create_kwargs = {'experiment_group_id': experiment_group_id}
    celery_app.send_task(HPCeleryTasks.HP_CREATE, kwargs=create_kwargs)
Ejemplo n.º 36
0
 def post(self, request, *args, **kwargs):
     """Create a notebook for the project and schedule its build if needed.

     Replies 200 when the project already has a notebook, 201 otherwise.
     """
     project = self.get_object()
     if project.has_notebook:
         return Response(data='Notebook is already running', status=status.HTTP_200_OK)

     self._create_notebook(project)
     needs_build = not project.notebook.is_running
     if needs_build:
         celery_app.send_task(SchedulerCeleryTasks.PROJECTS_NOTEBOOK_BUILD,
                              kwargs={'project_id': project.id})
     return Response(status=status.HTTP_201_CREATED)
Ejemplo n.º 37
0
def create(experiment_group):
    """Create an iteration with the group's experiments and start hyperband.

    The iteration is created first, then the group experiments, whose ids are
    attached to the iteration before the start task is sent.
    """
    experiment_group.iteration_manager.create_iteration()
    created = base.create_group_experiments(experiment_group=experiment_group)
    created_ids = [xp.id for xp in created]
    experiment_group.iteration_manager.add_iteration_experiments(
        experiment_ids=created_ids)

    start_kwargs = {'experiment_group_id': experiment_group.id}
    celery_app.send_task(HPCeleryTasks.HP_HYPERBAND_START,
                         kwargs=start_kwargs,
                         countdown=1)
Ejemplo n.º 38
0
    def _run(task_bind, *args, **kwargs):
        """Send the experiment build task for ``kwargs['experiment_id']``.

        Raises:
            OperationRunError: if ``get_valid_experiment`` finds no experiment.
        """
        experiment_id = kwargs['experiment_id']
        if not get_valid_experiment(experiment_id=experiment_id):
            error = 'The Experiment `{}` does not exist anymore.'.format(experiment_id)
            raise OperationRunError(error)

        build_kwargs = {'experiment_id': experiment_id}
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_BUILD, kwargs=build_kwargs)
Ejemplo n.º 39
0
 def post(self, request, *args, **kwargs):
     """Create a tensorboard for the project and schedule its start if needed.

     Replies 200 when the project already has a tensorboard, 201 otherwise.
     """
     project = self.get_object()
     if project.has_tensorboard:
         return Response(data='Tensorboard is already running', status=status.HTTP_200_OK)

     self._create_tensorboard(project)
     needs_start = not project.tensorboard.is_running
     if needs_start:
         celery_app.send_task(SchedulerCeleryTasks.PROJECTS_TENSORBOARD_START,
                              kwargs={'project_id': project.id})
     return Response(status=status.HTTP_201_CREATED)
Ejemplo n.º 40
0
    def publish_build_job_log(self, log_line, job_uuid, job_name):
        """Forward one build-job log line to the build-job log events task.

        ``log_line`` is decoded as UTF-8 when it has a ``decode`` method and
        is forwarded unchanged otherwise.
        """
        try:
            log_line = log_line.decode('utf-8')
        except AttributeError:
            # No ``decode`` attribute: pass the value through as received.
            pass

        self._logger.info("Publishing log event for task: %s", job_uuid)
        event = {
            'job_uuid': job_uuid,
            'job_name': job_name,
            'log_line': log_line,
        }
        celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_LOGS_BUILD_JOB, kwargs=event)
Ejemplo n.º 41
0
def create(experiment_group):
    """Open a new iteration, register the group's experiments, start hyperband."""
    experiment_group.iteration_manager.create_iteration()
    new_experiments = base.create_group_experiments(experiment_group=experiment_group)
    experiment_group.iteration_manager.add_iteration_experiments(
        experiment_ids=[experiment.id for experiment in new_experiments])

    celery_app.send_task(HPCeleryTasks.HP_HYPERBAND_START,
                         kwargs={'experiment_group_id': experiment_group.id},
                         countdown=1)
Ejemplo n.º 42
0
def tensorboard_job_pre_delete(sender, **kwargs):
    """Send the tensorboard stop task for the job in ``kwargs['instance']``.

    The task is sent with ``update_status`` set to False.
    """
    job = kwargs['instance']
    project = job.project

    stop_kwargs = {
        'project_name': project.unique_name,
        'project_uuid': project.uuid.hex,
        'tensorboard_job_name': job.unique_name,
        'tensorboard_job_uuid': job.uuid.hex,
        'update_status': False,
    }
    celery_app.send_task(SchedulerCeleryTasks.TENSORBOARDS_STOP, kwargs=stop_kwargs)
Ejemplo n.º 43
0
def create(experiment_group):
    """Create the group's experiments, record them in an iteration, start BO.

    The iteration receives both the experiment ids and ``[id, declarations]``
    pairs for the created experiments.
    """
    created = base.create_group_experiments(experiment_group=experiment_group)
    ids = [experiment.id for experiment in created]
    configs = [[experiment.id, experiment.declarations] for experiment in created]
    experiment_group.iteration_manager.create_iteration(experiment_ids=ids,
                                                        experiments_configs=configs)

    start_kwargs = {'experiment_group_id': experiment_group.id}
    celery_app.send_task(HPCeleryTasks.HP_BO_START, kwargs=start_kwargs, countdown=1)
Ejemplo n.º 44
0
def build_experiment(self, experiment_id):
    """Build the docker image of an experiment, then schedule its start.

    Flow:
      * If the experiment cannot be found, retry (while fewer than 2 retries
        have happened), otherwise log and give up.
      * If the specification does not have both ``build`` and ``run``, no
        image is needed and the start task is sent directly.
      * Otherwise the experiment is moved to BUILDING and the image is built.
        Any build failure (exception or falsy build result) marks the
        experiment as FAILED and the start task is not sent.

    Args:
        self: task object exposing ``request.retries`` and ``retry``.
        experiment_id: id of the experiment to build.
    """
    build_failed_message = 'Failed to build image for experiment.'

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            # NOTE(review): presumably ``retry`` raises to abort this run
            # (Celery semantics) -- confirm.
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    # No need to build the image, start the experiment directly
    if not (experiment.specification.build and experiment.specification.run):
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status, ExperimentLifeCycle.BUILDING)
        return

    # Update experiment status to show that its building
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    # Building the docker image
    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED, message=build_failed_message)
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='No code was found for to build this experiment.')
        return
    except Exception as e:  # Other exceptions
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED, message=build_failed_message)
        return

    if not status:
        # A falsy build result means there is no image to run. Record the
        # failure instead of leaving the experiment stuck in BUILDING.
        experiment.set_status(ExperimentLifeCycle.FAILED, message=build_failed_message)
        return

    # Now we can start the experiment
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_START,
        kwargs={'experiment_id': experiment_id})
Ejemplo n.º 45
0
def build_check_stop_job(sender, **kwargs):
    """Schedule a stop of the build job once its status is FAILED or SUCCEEDED.

    The stop task is sent with ``update_status`` set to False.
    """
    status_instance = kwargs['instance']
    job_id = status_instance.job_id

    finished_statuses = (JobLifeCycle.FAILED, JobLifeCycle.SUCCEEDED)
    if status_instance.status not in finished_statuses:
        return

    _logger.info('The build job  with id `%s` failed or is done, '
                 'send signal to stop.', job_id)
    # Schedule stop for this job
    stop_kwargs = {'build_job_id': job_id, 'update_status': False}
    celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_STOP, kwargs=stop_kwargs)
Ejemplo n.º 46
0
def notify_build_job_succeeded(build_job):
    """Send a start task for every object that references ``build_job``.

    Jobs, tensorboard jobs, notebook jobs and experiments are handled in that
    order; each one gets its own start task carrying its id.
    """
    # (model, start task, name of the id keyword expected by that task)
    start_routes = (
        (Job, SchedulerCeleryTasks.JOBS_START, 'job_id'),
        (TensorboardJob, SchedulerCeleryTasks.TENSORBOARDS_START, 'tensorboard_job_id'),
        (NotebookJob, SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START, 'notebook_job_id'),
        (Experiment, SchedulerCeleryTasks.EXPERIMENTS_START, 'experiment_id'),
    )
    for model, start_task, id_kwarg in start_routes:
        dependent_ids = model.objects.filter(
            build_job=build_job).values_list('id', flat=True)
        for dependent_id in dependent_ids:
            celery_app.send_task(start_task, kwargs={id_kwarg: dependent_id})
Ejemplo n.º 47
0
def hp_hyperband_start(self, experiment_group_id):
    """Start the group's experiments, then move on to the hyperband iterate task.

    When ``base.start_group_experiments`` reports that a retry is needed, this
    task is retried instead of sending the iterate task. Nothing happens if no
    running group is found.
    """
    group = get_running_experiment_group(experiment_group_id=experiment_group_id)
    if not group:
        return

    if base.start_group_experiments(experiment_group=group):
        # Schedule another task
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    iterate_kwargs = {'experiment_group_id': experiment_group_id}
    celery_app.send_task(HPCeleryTasks.HP_HYPERBAND_ITERATE, kwargs=iterate_kwargs)
Ejemplo n.º 48
0
def send_status(build_job, status, message=None):
    """Send a build-job status payload to the build-job statuses events task.

    The payload carries ``status``, ``message`` and a ``details.labels``
    mapping with the app name and the build job's uuid and unique name.
    """
    labels = {
        'app': 'dockerizer',
        'job_uuid': build_job.uuid.hex,
        'job_name': build_job.unique_name,
    }
    payload = {
        'details': {'labels': labels},
        'status': status,
        'message': message,
    }
    celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_BUILD_JOB_STATUSES,
                         kwargs={'payload': payload})
Ejemplo n.º 49
0
def update_cluster(node_gpus):
    """Trigger the cluster info/nodes update tasks and upsert GPU records.

    ``node_gpus`` maps a GPU index to a mapping with ``serial``, ``name`` and
    ``memory_total`` entries. When it is empty or None only the two update
    tasks are sent.
    """
    celery_app.send_task(CronsCeleryTasks.CLUSTERS_UPDATE_SYSTEM_INFO)
    celery_app.send_task(CronsCeleryTasks.CLUSTERS_UPDATE_SYSTEM_NODES)
    if not node_gpus:
        return

    cluster_node = ClusterNode.objects.filter(name=settings.K8S_NODE_NAME).first()
    for gpu_index, gpu_info in node_gpus.items():
        # Reuse the stored GPU row for this node/index, or prepare a new one.
        try:
            gpu = NodeGPU.objects.get(cluster_node=cluster_node, index=gpu_index)
        except NodeGPU.DoesNotExist:
            gpu = NodeGPU(cluster_node=cluster_node, index=gpu_index)
        gpu.serial = gpu_info['serial']
        gpu.name = gpu_info['name']
        gpu.memory = gpu_info['memory_total']
        gpu.save()
Ejemplo n.º 50
0
    def publish_experiment_job_log(self,
                                   log_lines,
                                   status,
                                   experiment_uuid,
                                   experiment_name,
                                   job_uuid,
                                   task_type=None,
                                   task_idx=None):
        """Send log lines to the events task and stream them when monitored.

        The log lines are always sent to the experiment-job log events task.
        They are additionally published on the internal exchange when either
        the job or the experiment is reported as monitored by
        ``RedisToStream``; a ``RedisError`` during that check disables
        streaming, and publish timeouts / AMQP errors are ignored.
        """
        self._logger.debug("Publishing log event for task: %s.%s, %s",
                           task_type, task_idx, experiment_name)
        event_kwargs = {
            'experiment_name': experiment_name,
            'experiment_uuid': experiment_uuid,
            'job_uuid': job_uuid,
            'log_lines': log_lines,
            'task_type': task_type,
            'task_idx': task_idx,
        }
        celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_LOGS_EXPERIMENT_JOB,
                             kwargs=event_kwargs)

        try:
            is_monitored = (RedisToStream.is_monitored_job_logs(job_uuid) or
                            RedisToStream.is_monitored_experiment_logs(experiment_uuid))
        except RedisError:
            is_monitored = False
        if not is_monitored:
            return

        self._logger.info("Streaming new log event for experiment: %s", experiment_uuid)
        stream_message = {
            'experiment_uuid': experiment_uuid,
            'job_uuid': job_uuid,
            'log_lines': log_lines,
            'status': status,
            'task_type': task_type,
            'task_idx': task_idx,
        }
        with celery_app.producer_or_acquire(None) as producer:
            try:
                producer.publish(
                    stream_message,
                    routing_key='{}.{}.{}'.format(RoutingKeys.LOGS_SIDECARS,
                                                  experiment_uuid,
                                                  job_uuid),
                    exchange=settings.INTERNAL_EXCHANGE,
                )
            except (TimeoutError, AMQPError):
                # Streaming is best effort: a failed publish is dropped.
                pass
Ejemplo n.º 51
0
def new_pipeline_run_status(sender, **kwargs):
    """Attach the new status to its pipeline run and propagate stop/skip.

    The status in ``kwargs['instance']`` is stored on the pipeline run, which
    is then saved. If the run is stopped and/or skipped, the matching
    operations task is sent.
    """
    new_status = kwargs['instance']
    run = new_status.pipeline_run
    # Point the pipeline run at its latest status.
    run.status = new_status
    run.save()

    # Notify operations with status change. This is necessary if we skip or
    # stop the dag run.
    if run.stopped:
        stop_kwargs = {'pipeline_run_id': run.id,
                       'message': 'Pipeline run was stopped'}
        celery_app.send_task(PipelineCeleryTasks.PIPELINES_STOP_OPERATIONS,
                             kwargs=stop_kwargs)
    if run.skipped:
        skip_kwargs = {'pipeline_run_id': run.id,
                       'message': 'Pipeline run was skipped'}
        celery_app.send_task(PipelineCeleryTasks.PIPELINES_SKIP_OPERATIONS,
                             kwargs=skip_kwargs)
Ejemplo n.º 52
0
def handle_new_experiment_status(sender, **kwargs):
    """Schedule a stop of the experiment's jobs once it failed or succeeded.

    The status is read from ``kwargs['instance']``. Nothing happens when the
    experiment has no specification, when the status is neither FAILED nor
    SUCCEEDED, or when the experiment has no jobs. The stop task is sent with
    ``update_status`` set to False.
    """
    instance = kwargs['instance']
    experiment = instance.experiment
    if not experiment.specification:
        return

    # Only existence matters here, so ask the database with ``exists()``
    # rather than counting every related job.
    stop_condition = (
        instance.status in (ExperimentLifeCycle.FAILED, ExperimentLifeCycle.SUCCEEDED) and
        experiment.jobs.exists()
    )
    if stop_condition:
        _logger.info('One of the workers failed or Master for experiment `%s` is done, '
                     'send signal to other workers to stop.', experiment.unique_name)
        # Schedule stop for this experiment because other jobs may be still running
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_STOP,
            kwargs={'experiment_id': experiment.id,
                    'update_status': False})
Ejemplo n.º 53
0
def start_group_experiments(experiment_group):
    """Send build tasks for the next pending experiments of the group.

    If the group should stop early, a task stopping its pending experiments
    is sent instead and None is returned. Otherwise returns True when more
    experiments are pending than were allowed to start, False if not.
    """
    # Check for early stopping before starting new experiments from this group
    if experiment_group.should_stop_early():
        stop_kwargs = {
            'experiment_group_id': experiment_group.id,
            'pending': True,
            'message': 'Early stopping',
        }
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_GROUP_STOP_EXPERIMENTS,
                             kwargs=stop_kwargs)
        return

    n_to_start = experiment_group.n_experiments_to_start
    batch = experiment_group.pending_experiments[:n_to_start]
    n_pending = experiment_group.pending_experiments.count()

    for xp in batch:
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                             kwargs={'experiment_id': xp.id})

    remaining = n_pending - n_to_start
    return remaining > 0
Ejemplo n.º 54
0
    def start(self):
        """Send the celery task of this operation and remember its task id."""
        task_kwargs = self.celery_task_context
        # Add this operation run's id to the task kwargs.
        task_kwargs['operation_run_id'] = self.id  # pylint:disable=unsupported-assignment-operation

        task_name = self.operation.celery_task
        run_params = self.operation.get_run_params()
        result = celery_app.send_task(task_name, kwargs=task_kwargs, **run_params)

        self.celery_task_id = result.id
        self.save()
Ejemplo n.º 55
0
def experiments_group_stop_experiments(experiment_group_id, pending, message=None):
    """Stop experiments of a running group, then mark the group as STOPPED.

    Args:
        experiment_group_id: id used to look up the running group.
        pending: when truthy, only the group's pending experiments are set to
            STOPPED. Otherwise every experiment not in a done status is
            handled: running ones get a stop task, the others are set to
            STOPPED directly.
        message: optional message stored with the STOPPED status.
    """
    group = get_running_experiment_group(experiment_group_id=experiment_group_id)
    if not group:
        return

    if pending:
        for xp in group.pending_experiments:
            # Update experiment status to show that its stopped
            xp.set_status(status=ExperimentLifeCycle.STOPPED, message=message)
    else:
        not_done = group.experiments.exclude(
            status__status__in=ExperimentLifeCycle.DONE_STATUS).distinct()
        for xp in not_done:
            if not xp.is_running:
                # Update experiment status to show that its stopped
                xp.set_status(status=ExperimentLifeCycle.STOPPED, message=message)
                continue
            celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                                 kwargs={'experiment_id': xp.id})

    group.set_status(ExperimentGroupLifeCycle.STOPPED)
Ejemplo n.º 56
0
def experiments_build(experiment_id):
    """Create the build job for an experiment, or start it if no build is needed.

    The start task is sent right away when the specification lacks ``build``
    or ``run``, or when the image already exists. Otherwise the experiment is
    set to BUILDING, or to FAILED when the build process could not be started.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    start_kwargs = {'experiment_id': experiment_id}
    spec = experiment.specification

    # No need to build the image, start the experiment directly
    if not (spec.build and spec.run):
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START, kwargs=start_kwargs)
        return

    can_build = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=ExperimentLifeCycle.BUILDING)
    if not can_build:
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status, ExperimentLifeCycle.BUILDING)
        return

    job, has_image, started = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=experiment.specification.build,
        code_reference=experiment.code_reference)

    experiment.build_job = job
    experiment.save()

    if has_image:
        # The image already exists, so we can start the experiment right away
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START, kwargs=start_kwargs)
        return

    if not started:
        experiment.set_status(ExperimentLifeCycle.FAILED, message='Could not start build process.')
        return

    # Update experiment status to show that its building
    experiment.set_status(ExperimentLifeCycle.BUILDING)
Ejemplo n.º 57
0
def build_project_notebook(project_id):
    """Build the docker image of a project's notebook, then schedule its start.

    The notebook job is set to BUILDING before the build. Any build failure
    (exception or falsy build result) marks the job as FAILED and the start
    task is not sent. Nothing happens when the project is not found or has
    no notebook.

    Args:
        project_id: id of the project whose notebook should be built.
    """
    build_failed_message = 'Failed to build image for notebook.'

    project = get_valid_project(project_id)
    if not project or not project.notebook:
        return None

    notebook_job = project.notebook

    # Update job status to show that its building docker image
    notebook_job.set_status(JobLifeCycle.BUILDING, message='Building container')

    # Building the docker image
    try:
        status = notebooks_builder.build_notebook_job(project=project, job=project.notebook)
    except DockerException as e:
        _logger.warning('Failed to build notebook %s', e)
        notebook_job.set_status(JobLifeCycle.FAILED, message=build_failed_message)
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        notebook_job.set_status(JobLifeCycle.FAILED, message=build_failed_message)
        return
    except Exception as e:  # Other exceptions
        _logger.warning('Failed to build notebook %s', e)
        notebook_job.set_status(JobLifeCycle.FAILED, message=build_failed_message)
        return

    if not status:
        # A falsy build result means there is no image to run. Record the
        # failure instead of leaving the notebook job stuck in BUILDING.
        notebook_job.set_status(JobLifeCycle.FAILED, message=build_failed_message)
        return

    # Now we can start the notebook
    celery_app.send_task(
        SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
        kwargs={'notebook_job_id': notebook_job.id})
Ejemplo n.º 58
0
def check_group_experiments_finished(experiment_group_id):
    """Send the task that checks whether the group's experiments are finished."""
    check_kwargs = {'experiment_group_id': experiment_group_id}
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_GROUP_CHECK_FINISHED,
        kwargs=check_kwargs)