def start_experiment(experiment):
    """Mark `experiment` as scheduled and start it through its framework spawner.

    When the experiment specification has a `build` section, the docker image
    is resolved from `experiment.build_job`; otherwise no image is passed and
    (per the original inline comment) the spawner uses its default image.

    On any failure the experiment status is set to FAILED and the function
    returns early. On success the spawner response is passed on to
    `handle_experiment`.
    """
    # Update experiment status to show that its started
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    job_docker_image = None  # This will force the spawners to use the default docker image
    if experiment.specification.build:
        try:
            image_name, image_tag = get_image_info(build_job=experiment.build_job)
        except ValueError as e:
            _logger.warning('Could not start the experiment, %s', e)
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='External git repo was not found.')
            return
        job_docker_image = '{}:{}'.format(image_name, image_tag)
        _logger.info('Start experiment with built image `%s`', job_docker_image)
    else:
        _logger.info('Start experiment with default image.')

    spawner_class = get_spawner_class(experiment.specification.framework)

    # Use spawners to start the experiment
    spawner = spawner_class(project_name=project.unique_name,
                            experiment_name=experiment.unique_name,
                            experiment_group_name=group.unique_name if group else None,
                            project_uuid=project.uuid.hex,
                            experiment_group_uuid=group.uuid.hex if group else None,
                            experiment_uuid=experiment.uuid.hex,
                            original_name=experiment.original_unique_name,
                            cloning_strategy=experiment.cloning_strategy,
                            spec=experiment.specification,
                            k8s_config=settings.K8S_CONFIG,
                            namespace=settings.K8S_NAMESPACE,
                            in_cluster=True,
                            job_docker_image=job_docker_image,
                            use_sidecar=True,
                            sidecar_config=config.get_requested_params(to_str=True))
    try:
        response = spawner.start_experiment()
    except ApiException as e:
        _logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Deliberately broad: any spawner failure must leave the experiment
        # in FAILED rather than stuck in SCHEDULED.
        _logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment encountered an {} exception.'.format(
                e.__class__.__name__
            ))
        return

    handle_experiment(experiment=experiment, spawner=spawner, response=response)
def start_experiment(experiment):
    """Mark `experiment` as scheduled and start it through its framework spawner.

    When the specification's `run_exec` is set, the docker image is resolved
    with `get_experiment_image_info`; otherwise no image is passed and (per
    the original inline comment) the spawner uses its default image. The
    owning user's auth token key is forwarded to the spawner.

    On any failure the experiment status is set to FAILED and the function
    returns early. On success the spawner response is passed on to
    `handle_experiment`.
    """
    # Update experiment status to show that its started
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    job_docker_image = None  # This will force the spawners to use the default docker image
    if experiment.specification.run_exec:
        try:
            image_name, image_tag = get_experiment_image_info(experiment=experiment)
        except ValueError as e:
            logger.warning('Could not start the experiment, %s', e)
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='External git repo was not found.')
            return
        job_docker_image = '{}:{}'.format(image_name, image_tag)
        logger.info('Start experiment with built image `%s`', job_docker_image)
    else:
        logger.info('Start experiment with default image.')

    spawner_class = get_spawner_class(experiment.specification.framework)

    # Use spawners to start the experiment
    spawner = spawner_class(project_name=project.unique_name,
                            experiment_name=experiment.unique_name,
                            experiment_group_name=group.unique_name if group else None,
                            project_uuid=project.uuid.hex,
                            experiment_group_uuid=group.uuid.hex if group else None,
                            experiment_uuid=experiment.uuid.hex,
                            original_name=experiment.original_unique_name,
                            cloning_strategy=experiment.cloning_strategy,
                            spec=experiment.specification,
                            k8s_config=settings.K8S_CONFIG,
                            namespace=settings.K8S_NAMESPACE,
                            in_cluster=True,
                            job_docker_image=job_docker_image,
                            use_sidecar=True,
                            sidecar_config=config.get_requested_params(to_str=True))
    try:
        # The token lookup stays inside the try so that a failure there is
        # also reported as a FAILED experiment.
        response = spawner.start_experiment(user_token=experiment.user.auth_token.key)
    except ApiException as e:
        logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Deliberately broad: any spawner failure must leave the experiment
        # in FAILED rather than stuck in SCHEDULED.
        logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment encountered an {} exception.'.format(
                e.__class__.__name__
            ))
        return

    handle_experiment(experiment=experiment, spawner=spawner, response=response)
def start_job(job):
    """Mark `job` as scheduled and start it with a `JobSpawner`.

    The docker image is resolved from `job.build_job`. On any failure the job
    status is set to FAILED and the function returns early. On success the
    definition derived from the spawner results is stored on the job and
    the job is saved.
    """
    # Update job status to show that its started
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except ValueError as e:
        # This is the job launcher, so the messages refer to the job rather
        # than to a notebook.
        _logger.warning('Could not start the job, %s', e)
        job.set_status(JobLifeCycle.FAILED,
                       message='External git repo was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        results = spawner.start_job(resources=job.resources,
                                    node_selectors=job.node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Deliberately broad: any spawner failure must leave the job in
        # FAILED rather than stuck in SCHEDULED.
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job encountered an {} exception.'.format(
                e.__class__.__name__))
        return

    job.definition = get_job_definition(results)
    job.save()
def stop_experiment(experiment, update_status=False):
    """Ask a `K8SSpawner` built for `experiment` to stop it.

    When `update_status` is true, the experiment status is afterwards set to
    `ExperimentLifeCycle.DELETED`.
    """
    project = experiment.project
    group = experiment.experiment_group

    spawner_kwargs = dict(
        project_name=project.unique_name,
        experiment_name=experiment.unique_name,
        experiment_group_name=group.unique_name if group else None,
        project_uuid=project.uuid.hex,
        experiment_group_uuid=group.uuid.hex if group else None,
        experiment_uuid=experiment.uuid.hex,
        spec_config=experiment.config,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
    )
    K8SSpawner(**spawner_kwargs).stop_experiment()

    if not update_status:
        return
    # Reflect the deletion in the experiment status.
    experiment.set_status(ExperimentLifeCycle.DELETED)
def stop_experiment(experiment, update_status=False):
    """Ask the framework-specific spawner built for `experiment` to stop it.

    When `update_status` is true, the experiment status is afterwards set to
    `ExperimentLifeCycle.STOPPED`.
    """
    project = experiment.project
    group = experiment.experiment_group

    # The spawner class is selected from the framework in the specification.
    spawner_class = get_spawner_class(experiment.specification.framework)
    spawner_kwargs = dict(
        project_name=project.unique_name,
        experiment_name=experiment.unique_name,
        experiment_group_name=group.unique_name if group else None,
        project_uuid=project.uuid.hex,
        experiment_group_uuid=group.uuid.hex if group else None,
        experiment_uuid=experiment.uuid.hex,
        spec=experiment.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
    )
    spawner_class(**spawner_kwargs).stop_experiment()

    if not update_status:
        return
    # Reflect the stop in the experiment status.
    experiment.set_status(ExperimentLifeCycle.STOPPED)
def stop_experiment(project_name,
                    project_uuid,
                    experiment_name,
                    experiment_uuid,
                    specification,
                    experiment_group_name=None,
                    experiment_group_uuid=None):
    """Stop an experiment identified by plain names/uuids and its specification.

    The spawner class is selected from `specification.framework`; the group
    name and uuid are optional and default to None.
    """
    spawner_class = get_spawner_class(specification.framework)
    spawner_kwargs = dict(
        project_name=project_name,
        project_uuid=project_uuid,
        experiment_name=experiment_name,
        experiment_group_name=experiment_group_name,
        experiment_group_uuid=experiment_group_uuid,
        experiment_uuid=experiment_uuid,
        spec=specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
    )
    spawner_class(**spawner_kwargs).stop_experiment()
def get_env_vars(self):
    """Return the service env vars for `self.namespace` plus one per requested config param."""
    collected = get_service_env_vars(namespace=self.namespace)
    requested_params = config.get_requested_params(to_str=True)
    # One env var is added for every (name, value) pair in the requested params.
    collected.extend(
        get_env_var(name=param_name, value=param_value)
        for param_name, param_value in requested_params.items()
    )
    return collected
def start_experiment(experiment):
    """Mark `experiment` as scheduled, start it on Kubernetes and record its jobs.

    When the compiled spec's `run_exec` is set, the docker image is resolved
    with `get_experiment_image_info`; otherwise no image is passed and (per
    the original inline comment) the spawner uses its default image.

    On any failure the experiment status is set to FAILED and the function
    returns early. On success one job record is created for the master and
    for each worker and parameter-server entry in the spawner response.
    """
    # Update experiment status to show that its started
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    job_docker_image = None  # This will force the spawner to use the default docker image
    if experiment.compiled_spec.run_exec:
        try:
            image_name, image_tag = get_experiment_image_info(experiment=experiment)
        except ValueError as e:
            logger.warning('Could not start the experiment, %s', e)
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='External git repo was not found.')
            return
        job_docker_image = '{}:{}'.format(image_name, image_tag)
        # Lazy %-style args: the message is only built if the record is emitted.
        logger.info('Start experiment with built image `%s`', job_docker_image)
    else:
        logger.info('Start experiment with default image.')

    # Use spawner to start the experiment
    spawner = K8SSpawner(project_name=project.unique_name,
                         experiment_name=experiment.unique_name,
                         experiment_group_name=group.unique_name if group else None,
                         project_uuid=project.uuid.hex,
                         experiment_group_uuid=group.uuid.hex if group else None,
                         experiment_uuid=experiment.uuid.hex,
                         spec_config=experiment.config,
                         k8s_config=settings.K8S_CONFIG,
                         namespace=settings.K8S_NAMESPACE,
                         in_cluster=True,
                         job_docker_image=job_docker_image,
                         use_sidecar=True,
                         sidecar_config=config.get_requested_params(to_str=True))
    try:
        resp = spawner.start_experiment(user_token=experiment.user.auth_token.key)
    except ApiException as e:
        logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Deliberately broad: any spawner failure must leave the experiment
        # in FAILED rather than stuck in SCHEDULED.
        logger.warning('Could not start the experiment, please check your polyaxon spec %s', e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment encountered an {} exception.'.format(
                e.__class__.__name__
            ))
        return

    def get_job_uuid(task):
        # The job uuid is read from the labels of the task's pod metadata.
        return uuid.UUID(task['pod']['metadata']['labels']['job_uuid'])

    def get_definition(definition):
        # Round-trip the definition through the serializer as a JSON string;
        # datetimes are rendered with the DRF DateTimeField representation.
        serializer = ExperimentJobDetailSerializer(data={
            'definition': json.dumps(definition, default=fields.DateTimeField().to_representation)
        })
        # NOTE(review): the result of is_valid() is ignored here; presumably an
        # invalid payload would surface on the validated_data lookup below.
        serializer.is_valid()
        return json.loads(serializer.validated_data['definition'])

    # Record one job per task the spawner reported: master first, then
    # workers, then parameter servers.
    master = resp[TaskType.MASTER]
    create_job(job_uuid=get_job_uuid(master),
               experiment=experiment,
               definition=get_definition(master),
               resources=spawner.spec.master_resources)

    for i, worker in enumerate(resp[TaskType.WORKER]):
        create_job(job_uuid=get_job_uuid(worker),
                   experiment=experiment,
                   definition=get_definition(worker),
                   role=TaskType.WORKER,
                   resources=spawner.spec.worker_resources.get(i))

    for i, ps in enumerate(resp[TaskType.PS]):
        create_job(job_uuid=get_job_uuid(ps),
                   experiment=experiment,
                   definition=get_definition(ps),
                   role=TaskType.PS,
                   resources=spawner.spec.ps_resources.get(i))