def start_notebook(notebook):
    """Schedule `notebook` and spawn it using the image of its build job.

    The notebook is moved to `JobLifeCycle.SCHEDULED`, the registry context
    and the image info of `notebook.build_job` are resolved, and a
    `NotebookSpawner` is asked to start the notebook. On success the
    definition extracted from the spawner results is stored on
    `notebook.definition` and saved.

    The failures below are handled here and are not re-raised; they move the
    notebook to `JobLifeCycle.FAILED` instead:

    * `ContainerRegistryError` while resolving the registry context,
    * `ValueError` / `AttributeError` while resolving the image info,
    * any `Exception` raised while spawning or while saving the definition
      (the formatted traceback is attached to the status in this case).

    Args:
        notebook: the notebook to start. It must expose `set_status`, `save`,
            `build_job`, `project`, `unique_name`, `uuid` and the scheduling
            attributes forwarded to the spawner below.

    Returns:
        None.
    """
    # Update the status to show that the notebook has been scheduled.
    notebook.set_status(JobLifeCycle.SCHEDULED)

    try:
        registry_spec = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the notebook, please check your registry configuration.')
        return

    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job,
                                               registry_host=registry_spec.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the notebook.', exc_info=True)
        notebook.set_status(JobLifeCycle.FAILED,
                            message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', job_docker_image)

    spawner = NotebookSpawner(
        project_name=notebook.project.unique_name,
        project_uuid=notebook.project.uuid.hex,
        job_name=notebook.unique_name,
        job_uuid=notebook.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=job_docker_image,
        in_cluster=True)

    # Set to (traceback, message) by the handler that caught a spawn failure.
    failure = None
    try:
        mount_code_in_notebooks = conf.get(NOTEBOOKS_MOUNT_CODE)
        results = spawner.start_notebook(
            persistence_outputs=notebook.persistence_outputs,
            persistence_data=notebook.persistence_data,
            outputs_refs_jobs=notebook.outputs_refs_jobs,
            outputs_refs_experiments=notebook.outputs_refs_experiments,
            resources=notebook.resources,
            labels=notebook.labels,
            annotations=notebook.annotations,
            secret_refs=notebook.secret_refs,
            config_map_refs=notebook.config_map_refs,
            node_selector=notebook.node_selector,
            affinity=notebook.affinity,
            tolerations=notebook.tolerations,
            backend=notebook.backend,
            max_restarts=get_max_restart(notebook.max_restarts,
                                         conf.get(MAX_RESTARTS_NOTEBOOKS)),
            reconcile_url=get_notebook_reconcile_url(notebook.unique_name),
            mount_code_in_notebooks=mount_code_in_notebooks)
        notebook.definition = get_job_definition(results)
        notebook.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the notebook, encountered a Kubernetes ApiException.',
        )
    except StoreNotFoundError as e:
        _logger.error('Could not start the notebook, please check your volume definitions',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the notebook, encountered a volume definition problem. %s' % e,
        )
    except Exception as e:
        # Last-resort boundary: any other spawn error is reported on the
        # notebook status rather than propagated to the caller.
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start notebook encountered an {} exception.'.format(
                e.__class__.__name__),
        )

    # Only reached when one of the handlers above ran to completion.
    failure_traceback, failure_message = failure
    notebook.set_status(JobLifeCycle.FAILED,
                        message=failure_message,
                        traceback=failure_traceback)
def start_job(job):
    """Mark `job` as scheduled and spawn it with the image of its build job.

    Steps, in order: set the status to `JobLifeCycle.SCHEDULED`, resolve the
    registry context, resolve the image name/tag of `job.build_job`, build a
    `JobSpawner` and call its `start_job`. When spawning succeeds the
    definition derived from the spawner results is saved on `job.definition`.

    A `ContainerRegistryError`, a `ValueError` / `AttributeError` from the
    image lookup, or any `Exception` raised while spawning is not re-raised:
    the job is moved to `JobLifeCycle.FAILED` with an explanatory message
    (plus the formatted traceback for spawn failures).

    Args:
        job: the job to start.

    Returns:
        None.
    """
    job.set_status(JobLifeCycle.SCHEDULED)

    # Resolve the registry that hosts the built image.
    try:
        registry = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the job, please check your registry configuration.')
        return

    # Resolve the name and tag of the image produced by the build job.
    try:
        name, tag = get_image_info(build_job=job.build_job,
                                   registry_host=registry.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED, message='Image info was not found.')
        return

    docker_image = '{}:{}'.format(name, tag)
    _logger.info('Start job with built image `%s`', docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=docker_image,
        in_cluster=True,
        use_sidecar=True,
    )

    # Stays None on success; a handler stores (traceback, message) otherwise.
    failure = None
    try:
        spawn_results = spawner.start_job(
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            resources=job.resources,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations,
        )
        job.definition = get_job_definition(spawn_results)
        job.save(update_fields=['definition'])
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        trace = traceback.format_exc()
        failure = (trace, 'Could not start the job, encountered a Kubernetes ApiException.')
    except VolumeNotFoundError as e:
        _logger.error('Could not start the job, please check your volume definitions.',
                      exc_info=True)
        trace = traceback.format_exc()
        failure = (
            trace,
            'Could not start the job, encountered a volume definition problem. %s' % e,
        )
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        trace = traceback.format_exc()
        failure = (
            trace,
            'Could not start job encountered an {} exception.'.format(e.__class__.__name__),
        )

    if failure is None:
        return

    trace, message = failure
    job.set_status(JobLifeCycle.FAILED, message=message, traceback=trace)
def test_get_image_image_info(self):
    """`get_image_info` returns the image name first and the build job uuid hex second."""
    host = 'some_host'
    build_job = self.build_job

    info = get_image_info(build_job=build_job, registry_host=host)
    expected_name = get_image_name(build_job=build_job, registry_host=host)

    assert info[0] == expected_name
    assert info[1] == build_job.uuid.hex
def start_experiment(experiment):
    """Schedule `experiment`, create its jobs and start them on the cluster.

    The experiment is moved to `ExperimentLifeCycle.SCHEDULED`. When
    `experiment.specification.build` is truthy, the registry context and the
    image info of `experiment.build_job` are resolved to get the docker
    image; otherwise no image is passed to the spawner. The spawner class is
    selected from the specification, the db jobs are created, the spawner's
    `start_experiment` is called and its response is handed to
    `handle_experiment` before the experiment is moved to
    `ExperimentLifeCycle.STARTING`.

    The failures below are handled here and are not re-raised:

    * `ContainerRegistryError` / `ValueError` / `AttributeError` while
      resolving the image: the experiment is set to FAILED.
    * `IntegrityError` while creating the db jobs: a warning is logged and
      the function returns, leaving the experiment status untouched.
    * any other `Exception` raised while building the spawner or starting
      the experiment: the experiment is set to FAILED with a message and
      the formatted traceback.

    Args:
        experiment: the experiment to start.

    Returns:
        None.
    """
    # Update the status to show that the experiment has been scheduled.
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    job_docker_image = None  # This will force the spawners to use the default docker image
    if experiment.specification.build:
        try:
            registry_spec = get_registry_context(build_backend=None)
        except ContainerRegistryError:
            experiment.set_status(
                ExperimentLifeCycle.FAILED,
                message='Could not start the experiment, please check your registry configuration.')
            return

        try:
            image_name, image_tag = get_image_info(build_job=experiment.build_job,
                                                   registry_host=registry_spec.host)
        except (ValueError, AttributeError):
            _logger.error('Could not start the experiment.', exc_info=True)
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='Image info was not found.')
            return

        job_docker_image = '{}:{}'.format(image_name, image_tag)
        _logger.info('Start experiment with built image `%s`', job_docker_image)
    else:
        _logger.info('Start experiment with default image.')

    spawner_class = get_spawner_class(specification=experiment.specification)
    # token_scope = RedisEphemeralTokens.get_scope(experiment.user.id,
    #                                              'experiment',
    #                                              experiment.id)

    # Set to (traceback, message) by the handler that caught a start failure.
    failure = None
    try:
        # Use spawners to start the experiment
        spawner = spawner_class(
            project_name=project.unique_name,
            experiment_name=experiment.unique_name,
            experiment_group_name=group.unique_name if group else None,
            project_uuid=project.uuid.hex,
            experiment_group_uuid=group.uuid.hex if group else None,
            experiment_uuid=experiment.uuid.hex,
            persistence_config=experiment.persistence_config,
            outputs_refs_experiments=experiment.outputs_refs_experiments,
            outputs_refs_jobs=experiment.outputs_refs_jobs,
            original_name=experiment.original_unique_name,
            cloning_strategy=experiment.cloning_strategy,
            spec=experiment.specification,
            k8s_config=conf.get(K8S_CONFIG),
            namespace=conf.get(K8S_NAMESPACE),
            in_cluster=True,
            job_docker_image=job_docker_image,
            use_sidecar=True)

        # Create db jobs
        try:
            create_experiment_jobs(experiment=experiment, spawner=spawner)
        except IntegrityError:
            # TODO: Add better handling for this.
            # Leave a trace instead of returning silently, so that an
            # experiment stuck in SCHEDULED can be diagnosed from the logs.
            _logger.warning(
                'Could not create the jobs for experiment `%s`, '
                'the experiment was not started.',
                experiment.unique_name,
                exc_info=True)
            return

        # Create k8s jobs
        response = spawner.start_experiment()
        # handle response
        handle_experiment(experiment=experiment, response=response)
        experiment.set_status(ExperimentLifeCycle.STARTING)
    except ApiException:
        _logger.error('Could not start the experiment, please check your polyaxon spec.',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the experiment, encountered a Kubernetes ApiException.',
        )
    except StoreNotFoundError as e:
        _logger.error('Could not start the experiment, please check your volume definitions.',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the experiment, '
            'encountered a volume definition problem, %s.' % e,
        )
    except Exception as e:
        # Last-resort boundary: any other error is reported on the
        # experiment status rather than propagated to the caller.
        _logger.error('Could not start the experiment, please check your polyaxon spec',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the experiment encountered an {} exception.'.format(
                e.__class__.__name__),
        )

    if failure is not None:
        failure_traceback, failure_message = failure
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message=failure_message,
                              traceback=failure_traceback)