Ejemplo n.º 1
0
def start_experiment(experiment):
    """Schedule an experiment and ask the framework's spawner to start it.

    The experiment is first marked as SCHEDULED.  When the specification
    declares a build, the image reported for the experiment's build job is
    passed to the spawner; otherwise no image is passed.  A failure while
    resolving the image or while calling the spawner marks the experiment as
    FAILED and returns early.  On success the spawner response is handed to
    `handle_experiment`.

    Args:
        experiment: the experiment to start.  Its project, experiment group,
            specification, build job, names and uuids are read, and its
            status is updated through `set_status`.
    """
    # Mark the experiment as scheduled before asking the spawner for anything
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    job_docker_image = None  # This will force the spawners to use the default docker image
    if experiment.specification.build:
        try:
            image_name, image_tag = get_image_info(build_job=experiment.build_job)
        except ValueError as e:
            _logger.warning('Could not start the experiment, %s', e)
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='External git repo was not found.')
            return
        job_docker_image = '{}:{}'.format(image_name, image_tag)
        _logger.info('Start experiment with built image `%s`', job_docker_image)
    else:
        _logger.info('Start experiment with default image.')

    # The spawner implementation depends on the framework declared in the spec
    spawner_class = get_spawner_class(experiment.specification.framework)

    # Use spawners to start the experiment; the group is optional, so its
    # name and uuid fall back to None
    spawner = spawner_class(project_name=project.unique_name,
                            experiment_name=experiment.unique_name,
                            experiment_group_name=group.unique_name if group else None,
                            project_uuid=project.uuid.hex,
                            experiment_group_uuid=group.uuid.hex if group else None,
                            experiment_uuid=experiment.uuid.hex,
                            original_name=experiment.original_unique_name,
                            cloning_strategy=experiment.cloning_strategy,
                            spec=experiment.specification,
                            k8s_config=settings.K8S_CONFIG,
                            namespace=settings.K8S_NAMESPACE,
                            in_cluster=True,
                            job_docker_image=job_docker_image,
                            use_sidecar=True,
                            sidecar_config=config.get_requested_params(to_str=True))
    try:
        response = spawner.start_experiment()
    except ApiException as e:
        _logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Any other failure is reported with the exception class name only
        _logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment encountered an {} exception.'.format(
                e.__class__.__name__
            ))
        return

    handle_experiment(experiment=experiment, spawner=spawner, response=response)
Ejemplo n.º 2
0
def start_experiment(experiment):
    """Schedule an experiment and ask the framework's spawner to start it.

    The experiment is first marked as SCHEDULED.  When the specification has
    a `run_exec` section, the image reported by `get_experiment_image_info`
    is passed to the spawner; otherwise no image is passed.  A failure while
    resolving the image or while calling the spawner marks the experiment as
    FAILED and returns early.  On success the spawner response is handed to
    `handle_experiment`.

    Args:
        experiment: the experiment to start.  Its project, experiment group,
            specification, user auth token, names and uuids are read, and
            its status is updated through `set_status`.
    """
    # Mark the experiment as scheduled before asking the spawner for anything
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    job_docker_image = None  # This will force the spawners to use the default docker image
    if experiment.specification.run_exec:
        try:
            image_name, image_tag = get_experiment_image_info(experiment=experiment)
        except ValueError as e:
            logger.warning('Could not start the experiment, %s', e)
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='External git repo was not found.')
            return
        job_docker_image = '{}:{}'.format(image_name, image_tag)
        logger.info('Start experiment with built image `%s`', job_docker_image)
    else:
        logger.info('Start experiment with default image.')

    # The spawner implementation depends on the framework declared in the spec
    spawner_class = get_spawner_class(experiment.specification.framework)

    # Use spawners to start the experiment; the group is optional, so its
    # name and uuid fall back to None
    spawner = spawner_class(project_name=project.unique_name,
                            experiment_name=experiment.unique_name,
                            experiment_group_name=group.unique_name if group else None,
                            project_uuid=project.uuid.hex,
                            experiment_group_uuid=group.uuid.hex if group else None,
                            experiment_uuid=experiment.uuid.hex,
                            original_name=experiment.original_unique_name,
                            cloning_strategy=experiment.cloning_strategy,
                            spec=experiment.specification,
                            k8s_config=settings.K8S_CONFIG,
                            namespace=settings.K8S_NAMESPACE,
                            in_cluster=True,
                            job_docker_image=job_docker_image,
                            use_sidecar=True,
                            sidecar_config=config.get_requested_params(to_str=True))
    try:
        # The token lookup sits inside the try block, so a failure to read it
        # is reported through the generic handler below
        response = spawner.start_experiment(user_token=experiment.user.auth_token.key)
    except ApiException as e:
        logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Any other failure is reported with the exception class name only
        logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment encountered an {} exception.'.format(
                e.__class__.__name__
            ))
        return

    handle_experiment(experiment=experiment, spawner=spawner, response=response)
Ejemplo n.º 3
0
def start_job(job):
    """Schedule a job and start it through a `JobSpawner`.

    The job is first marked as SCHEDULED.  The image reported for the job's
    build job is resolved and passed to the spawner.  A failure while
    resolving the image or while calling the spawner marks the job as FAILED
    and returns early.  On success the definition derived from the spawner
    results is stored on the job and the job is saved.

    Args:
        job: the job to start.  Its project, build job, specification,
            resources, node selectors, name and uuid are read; its status
            and `definition` are updated.
    """
    # Mark the job as scheduled before asking the spawner for anything
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except ValueError as e:
        _logger.warning('Could not start the job, %s', e)
        job.set_status(JobLifeCycle.FAILED,
                       message='External git repo was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        results = spawner.start_job(resources=job.resources,
                                    node_selectors=job.node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Any other failure is reported with the exception class name only
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job encountered an {} exception.'.format(
                e.__class__.__name__))
        return

    # Persist what the spawner created so it can be inspected later
    job.definition = get_job_definition(results)
    job.save()
Ejemplo n.º 4
0
def stop_experiment(experiment, update_status=False):
    """Ask a `K8SSpawner` to stop the given experiment.

    Args:
        experiment: experiment whose project, group, names, uuids and config
            are used to configure the spawner.
        update_status: when true, the experiment status is set to DELETED
            once the spawner call has returned.
    """
    owner = experiment.project
    parent_group = experiment.experiment_group

    # The group is optional, so its name and uuid fall back to None.
    spawner_kwargs = dict(
        project_name=owner.unique_name,
        experiment_name=experiment.unique_name,
        experiment_group_name=parent_group.unique_name if parent_group else None,
        project_uuid=owner.uuid.hex,
        experiment_group_uuid=parent_group.uuid.hex if parent_group else None,
        experiment_uuid=experiment.uuid.hex,
        spec_config=experiment.config,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
    )
    K8SSpawner(**spawner_kwargs).stop_experiment()

    if not update_status:
        return
    # Record on the experiment that it was deleted
    experiment.set_status(ExperimentLifeCycle.DELETED)
Ejemplo n.º 5
0
def stop_experiment(experiment, update_status=False):
    """Stop an experiment through the spawner matching its framework.

    Args:
        experiment: experiment whose project, group, names, uuids and
            specification are used to configure the spawner.
        update_status: when true, the experiment status is set to STOPPED
            once the spawner call has returned.
    """
    owner = experiment.project
    parent_group = experiment.experiment_group

    # Resolve the spawner implementation from the framework in the spec
    make_spawner = get_spawner_class(experiment.specification.framework)

    # The group is optional, so its name and uuid fall back to None.
    spawner_kwargs = dict(
        project_name=owner.unique_name,
        experiment_name=experiment.unique_name,
        experiment_group_name=parent_group.unique_name if parent_group else None,
        project_uuid=owner.uuid.hex,
        experiment_group_uuid=parent_group.uuid.hex if parent_group else None,
        experiment_uuid=experiment.uuid.hex,
        spec=experiment.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
    )
    make_spawner(**spawner_kwargs).stop_experiment()

    if not update_status:
        return
    # Record on the experiment that it was stopped
    experiment.set_status(ExperimentLifeCycle.STOPPED)
Ejemplo n.º 6
0
def stop_experiment(experiment, update_status=False):
    """Stop a running experiment using its framework-specific spawner.

    Args:
        experiment: experiment to stop; its project, optional group, names,
            uuids and specification configure the spawner.
        update_status: if true, mark the experiment as STOPPED after the
            spawner has been asked to stop it.
    """
    exp_project = experiment.project
    exp_group = experiment.experiment_group
    spawner_cls = get_spawner_class(experiment.specification.framework)

    # Group-related settings are None for experiments without a group
    settings_for_spawner = {
        'project_name': exp_project.unique_name,
        'experiment_name': experiment.unique_name,
        'experiment_group_name': exp_group.unique_name if exp_group else None,
        'project_uuid': exp_project.uuid.hex,
        'experiment_group_uuid': exp_group.uuid.hex if exp_group else None,
        'experiment_uuid': experiment.uuid.hex,
        'spec': experiment.specification,
        'k8s_config': settings.K8S_CONFIG,
        'namespace': settings.K8S_NAMESPACE,
        'in_cluster': True,
        'use_sidecar': True,
        'sidecar_config': config.get_requested_params(to_str=True),
    }
    stopper = spawner_cls(**settings_for_spawner)
    stopper.stop_experiment()

    if update_status:
        # Reflect the stop on the experiment itself
        experiment.set_status(ExperimentLifeCycle.STOPPED)
Ejemplo n.º 7
0
def stop_experiment(project_name,
                    project_uuid,
                    experiment_name,
                    experiment_uuid,
                    specification,
                    experiment_group_name=None,
                    experiment_group_uuid=None):
    """Stop an experiment identified by plain names and uuids.

    Args:
        project_name: name of the project, forwarded to the spawner.
        project_uuid: uuid of the project, forwarded to the spawner.
        experiment_name: name of the experiment, forwarded to the spawner.
        experiment_uuid: uuid of the experiment, forwarded to the spawner.
        specification: experiment specification; its `framework` selects the
            spawner class and the object itself is passed as `spec`.
        experiment_group_name: optional group name, forwarded to the spawner.
        experiment_group_uuid: optional group uuid, forwarded to the spawner.
    """
    # Resolve the spawner implementation from the framework in the spec
    make_spawner = get_spawner_class(specification.framework)

    spawner_kwargs = dict(
        project_name=project_name,
        project_uuid=project_uuid,
        experiment_name=experiment_name,
        experiment_group_name=experiment_group_name,
        experiment_group_uuid=experiment_group_uuid,
        experiment_uuid=experiment_uuid,
        spec=specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
    )
    make_spawner(**spawner_kwargs).stop_experiment()
Ejemplo n.º 8
0
    def get_env_vars(self):
        """Return the service env vars plus one env var per requested param.

        The base collection comes from `get_service_env_vars` for this
        object's namespace; each requested config parameter (as a string) is
        appended to it as an additional env var.
        """
        collected = get_service_env_vars(namespace=self.namespace)
        requested_params = config.get_requested_params(to_str=True)
        for param_name, param_value in requested_params.items():
            extra_var = get_env_var(name=param_name, value=param_value)
            collected.append(extra_var)
        return collected
Ejemplo n.º 9
0
    def get_env_vars(self):
        """Build the env vars: service defaults followed by requested params.

        Starts from `get_service_env_vars` for this object's namespace and
        appends one env var for every requested config parameter, with the
        parameter values taken as strings.
        """
        result = get_service_env_vars(namespace=self.namespace)
        params = config.get_requested_params(to_str=True)
        for key, value in params.items():
            result.append(get_env_var(name=key, value=value))
        return result
Ejemplo n.º 10
0
def start_experiment(experiment):
    """Schedule an experiment, start it with a `K8SSpawner`, and record its jobs.

    The experiment is first marked as SCHEDULED.  When the compiled spec has
    a `run_exec` section, the image reported by `get_experiment_image_info`
    is passed to the spawner; otherwise no image is passed.  A failure while
    resolving the image or while calling the spawner marks the experiment as
    FAILED and returns early.  On success one job is created per entry of
    the spawner response: the master first, then every `TaskType.WORKER`
    entry, then every `TaskType.PS` entry.

    Args:
        experiment: the experiment to start.  Its project, experiment group,
            compiled spec, config, user auth token, names and uuids are
            read, and its status is updated through `set_status`.
    """
    # Mark the experiment as scheduled before asking the spawner for anything
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    job_docker_image = None  # This will force the spawner to use the default docker image
    if experiment.compiled_spec.run_exec:
        try:
            image_name, image_tag = get_experiment_image_info(experiment=experiment)
        except ValueError as e:
            logger.warning('Could not start the experiment, %s', e)
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='External git repo was not found.')
            return
        job_docker_image = '{}:{}'.format(image_name, image_tag)
        # Lazy %-style arguments, matching the other logging calls here
        logger.info('Start experiment with built image `%s`', job_docker_image)
    else:
        logger.info('Start experiment with default image.')

    # Use spawner to start the experiment; the group is optional, so its
    # name and uuid fall back to None
    spawner = K8SSpawner(project_name=project.unique_name,
                         experiment_name=experiment.unique_name,
                         experiment_group_name=group.unique_name if group else None,
                         project_uuid=project.uuid.hex,
                         experiment_group_uuid=group.uuid.hex if group else None,
                         experiment_uuid=experiment.uuid.hex,
                         spec_config=experiment.config,
                         k8s_config=settings.K8S_CONFIG,
                         namespace=settings.K8S_NAMESPACE,
                         in_cluster=True,
                         job_docker_image=job_docker_image,
                         use_sidecar=True,
                         sidecar_config=config.get_requested_params(to_str=True))
    try:
        # The token lookup sits inside the try block, so a failure to read it
        # is reported through the generic handler below
        resp = spawner.start_experiment(user_token=experiment.user.auth_token.key)
    except ApiException as e:
        logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Any other failure is reported with the exception class name only
        logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment encountered an {} exception.'.format(
                e.__class__.__name__
            ))
        return

    def get_definition(definition):
        # Round-trip the definition through the serializer; datetimes are
        # rendered with the DateTimeField representation while dumping.
        serializer = ExperimentJobDetailSerializer(data={
            'definition': json.dumps(definition, default=fields.DateTimeField().to_representation)
        })
        # NOTE(review): the validation result is ignored, as in the original;
        # an invalid payload surfaces when `validated_data` is read below.
        serializer.is_valid()
        return json.loads(serializer.validated_data['definition'])

    def register_job(replica, **job_kwargs):
        # Create the job record for one entry of the spawner response, keyed
        # by the job uuid found in the pod labels.
        job_uuid = uuid.UUID(replica['pod']['metadata']['labels']['job_uuid'])
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_definition(replica),
                   **job_kwargs)

    # Record one job for each entry the spawner reported
    register_job(resp[TaskType.MASTER], resources=spawner.spec.master_resources)

    for i, worker in enumerate(resp[TaskType.WORKER]):
        register_job(worker,
                     role=TaskType.WORKER,
                     resources=spawner.spec.worker_resources.get(i))
    for i, ps in enumerate(resp[TaskType.PS]):
        register_job(ps,
                     role=TaskType.PS,
                     resources=spawner.spec.ps_resources.get(i))