def get_env_vars(self):
    """Assemble the environment variables to expose.

    The list starts from the service env vars of `self.namespace`, then gains
    one entry per requested config param (requested with `to_str=True`), and
    finally one entry built by `get_from_secret` for every config key that
    starts with `settings.PRIVATE_REGISTRIES_PREFIX`.

    Returns:
        The object returned by `get_service_env_vars`, extended in place.
    """
    collected = get_service_env_vars(namespace=self.namespace)

    requested_params = config.get_requested_params(to_str=True)
    collected.extend(
        get_env_var(name=param_name, value=param_value)
        for param_name, param_value in requested_params.items())

    # Private registries keys: the same key is passed for both arguments of
    # `get_from_secret`.
    registry_keys = config.params_startswith(settings.PRIVATE_REGISTRIES_PREFIX)
    collected.extend(
        get_from_secret(registry_key, registry_key)
        for registry_key in registry_keys)

    return collected
def start_job(job):
    """Mark `job` as scheduled and spawn it from its built docker image.

    Steps, in order:
      1. The job status is set to `JobLifeCycle.SCHEDULED`.
      2. The image name/tag are resolved from `job.build_job`; on `ValueError`
         the job is set to `JobLifeCycle.FAILED` and the function returns.
      3. A `JobSpawner` is created and asked to start the job; any exception
         raised by the spawner sets the job to `JobLifeCycle.FAILED` with an
         explanatory message and the function returns.
      4. On success, the definition derived from the spawner results is
         stored on `job.definition` and the job is saved.

    Args:
        job: the job to start. The code reads `build_job`, `project`,
            `unique_name`, `uuid`, `specification`, `resources` and
            `node_selectors` from it.

    Returns:
        None in every case; failures are reported through the job status.
    """
    # Update job status to show that it is scheduled
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except ValueError as e:
        _logger.warning('Could not start the job, %s', e)
        # Fixed typo in the status message ("note found" -> "not found").
        job.set_status(JobLifeCycle.FAILED,
                       message='External git repo was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        results = spawner.start_job(resources=job.resources,
                                    node_selectors=job.node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Catch-all boundary: any other spawner failure is reported on the
        # job status, using the exception class name in the message.
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job encountered an {} exception.'.format(
                e.__class__.__name__))
        return

    job.definition = get_job_definition(results)
    job.save()
def stop_experiment(project_name,
                    project_uuid,
                    experiment_name,
                    experiment_uuid,
                    specification,
                    experiment_group_name=None,
                    experiment_group_uuid=None):
    """Stop an experiment through the spawner matching its framework.

    The spawner class is looked up from `specification.framework`, built with
    the given project/experiment/group identifiers plus the cluster settings,
    and its `stop_experiment()` result is returned unchanged.

    Args:
        project_name: forwarded to the spawner as `project_name`.
        project_uuid: forwarded to the spawner as `project_uuid`.
        experiment_name: forwarded to the spawner as `experiment_name`.
        experiment_uuid: forwarded to the spawner as `experiment_uuid`.
        specification: forwarded as `spec`; its `framework` attribute selects
            the spawner class.
        experiment_group_name: optional, forwarded as-is (defaults to None).
        experiment_group_uuid: optional, forwarded as-is (defaults to None).

    Returns:
        Whatever the spawner's `stop_experiment()` returns.
    """
    spawner_cls = get_spawner_class(specification.framework)

    spawner_kwargs = dict(
        project_name=project_name,
        project_uuid=project_uuid,
        experiment_name=experiment_name,
        experiment_group_name=experiment_group_name,
        experiment_group_uuid=experiment_group_uuid,
        experiment_uuid=experiment_uuid,
        spec=specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
    )
    experiment_spawner = spawner_cls(**spawner_kwargs)
    return experiment_spawner.stop_experiment()
def start_job(job):
    """Mark `job` as scheduled and spawn it from its built docker image.

    The job status is first set to `JobLifeCycle.SCHEDULED`. If the image
    info cannot be resolved (`ValueError` or `AttributeError`), the job is set
    to `JobLifeCycle.FAILED` and the function returns. Otherwise a
    `JobSpawner` is created and asked to start the job; on success the
    definition derived from the spawner results is stored on `job.definition`
    and the job is saved. Any exception raised while starting or saving is
    logged with its traceback and reported on the job as a FAILED status
    carrying both a message and the formatted traceback.

    Args:
        job: the job to start.

    Returns:
        None in every case; failures are reported through the job status.
    """
    # Flag the job as scheduled before any spawning work happens
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED,
                       message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    job_spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        spawn_results = job_spawner.start_job(
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            resources=job.resources,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations)
        job.definition = get_job_definition(spawn_results)
        job.save()
        return
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the job, encountered a Kubernetes ApiException.',
        )
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the job, please check your volume definitions.',
            exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the job, encountered a volume definition problem. %s' % e,
        )
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start job encountered an {} exception.'.format(
                e.__class__.__name__),
        )

    # Only reached when one of the handlers above ran to completion: the
    # success path returns from inside the `try`.
    failure_traceback, failure_message = failure
    job.set_status(JobLifeCycle.FAILED,
                   message=failure_message,
                   traceback=failure_traceback)
def get_env_vars(self):
    """Assemble the environment variables to expose.

    The list starts from the service env vars of `self.namespace` and gains
    one entry per requested config param (requested with `to_str=True`).

    Returns:
        The object returned by `get_service_env_vars`, extended in place.
    """
    collected = get_service_env_vars(namespace=self.namespace)
    requested_params = config.get_requested_params(to_str=True)
    collected.extend(
        get_env_var(name=param_name, value=param_value)
        for param_name, param_value in requested_params.items())
    return collected
def start_experiment(experiment):
    """Mark `experiment` as scheduled and start it through its spawner.

    Steps, in order:
      1. The experiment status is set to `ExperimentLifeCycle.SCHEDULED`.
      2. If `experiment.specification.build` is truthy, the image name/tag
         are resolved from `experiment.build_job`; on `ValueError` the
         experiment is set to `ExperimentLifeCycle.FAILED` and the function
         returns. Otherwise `job_docker_image` stays None.
      3. The spawner class is looked up from the specification framework,
         instantiated, and asked to start the experiment. Any exception
         raised by the spawner sets the experiment to FAILED with an
         explanatory message and the function returns.
      4. On success, `handle_experiment` is called with the experiment, the
         spawner and the spawner response.

    Args:
        experiment: the experiment to start.

    Returns:
        None, except on `VolumeNotFoundError` where False is returned.
    """
    # Update experiment status to show that it is scheduled
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    job_docker_image = None  # This will force the spawners to use the default docker image
    if experiment.specification.build:
        try:
            image_name, image_tag = get_image_info(
                build_job=experiment.build_job)
        except ValueError as e:
            _logger.warning('Could not start the experiment, %s', e)
            # Fixed typo in the status message ("note found" -> "not found").
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='External git repo was not found.')
            return
        job_docker_image = '{}:{}'.format(image_name, image_tag)
        _logger.info('Start experiment with built image `%s`',
                     job_docker_image)
    else:
        _logger.info('Start experiment with default image.')

    spawner_class = get_spawner_class(experiment.specification.framework)

    # Use spawners to start the experiment
    spawner = spawner_class(
        project_name=project.unique_name,
        experiment_name=experiment.unique_name,
        experiment_group_name=group.unique_name if group else None,
        project_uuid=project.uuid.hex,
        experiment_group_uuid=group.uuid.hex if group else None,
        experiment_uuid=experiment.uuid.hex,
        persistence_config=experiment.persistence_config,
        outputs_refs_experiments=experiment.outputs_refs_experiments,
        outputs_refs_jobs=experiment.outputs_refs_jobs,
        original_name=experiment.original_unique_name,
        cloning_strategy=experiment.cloning_strategy,
        spec=experiment.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        job_docker_image=job_docker_image,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        response = spawner.start_experiment()
    except ApiException as e:
        _logger.warning(
            'Could not start the experiment, please check your polyaxon spec %s',
            e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment, encountered a Kubernetes ApiException.')
        return
    except VolumeNotFoundError as e:
        _logger.warning(
            'Could not start the experiment, '
            'please check your volume definitions %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Could not start the experiment, '
                                      'encountered a volume definition problem. %s' % e)
        # NOTE(review): this branch returns False while the other failure
        # paths return None; kept as-is so existing callers are unaffected.
        return False
    except Exception as e:
        # Catch-all boundary: any other spawner failure is reported on the
        # experiment status, using the exception class name in the message.
        _logger.warning(
            'Could not start the experiment, please check your polyaxon spec %s',
            e)
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='Could not start the experiment encountered an {} exception.'
            .format(e.__class__.__name__))
        return

    handle_experiment(experiment=experiment, spawner=spawner, response=response)
def start_experiment(experiment):
    """Mark `experiment` as scheduled and start it through its spawner.

    The experiment status is first set to `ExperimentLifeCycle.SCHEDULED`.
    When `experiment.specification.build` is truthy, the docker image is
    resolved from `experiment.build_job`; a `ValueError` or `AttributeError`
    there sets the experiment to FAILED and ends the call. Without a build
    section the image stays None.

    A spawner of the class matching the specification framework is then
    created (with a token scope obtained from `RedisEphemeralTokens`), asked
    to start the experiment, and the response is passed to
    `handle_experiment`. Any exception raised by either of those two calls is
    logged with its traceback and reported on the experiment as a FAILED
    status carrying both a message and the formatted traceback.

    Args:
        experiment: the experiment to start.

    Returns:
        None in every case; failures are reported through the status.
    """
    # Flag the experiment as scheduled before any spawning work happens
    experiment.set_status(ExperimentLifeCycle.SCHEDULED)

    project = experiment.project
    group = experiment.experiment_group

    # None makes the spawners fall back to the default docker image
    job_docker_image = None
    if not experiment.specification.build:
        _logger.info('Start experiment with default image.')
    else:
        try:
            image_name, image_tag = get_image_info(
                build_job=experiment.build_job)
        except (ValueError, AttributeError):
            _logger.error('Could not start the experiment.', exc_info=True)
            experiment.set_status(ExperimentLifeCycle.FAILED,
                                  message='Image info was not found.')
            return
        job_docker_image = '{}:{}'.format(image_name, image_tag)
        _logger.info('Start experiment with built image `%s`',
                     job_docker_image)

    spawner_cls = get_spawner_class(experiment.specification.framework)
    token_scope = RedisEphemeralTokens.get_scope(experiment.user.id,
                                                 'experiment',
                                                 experiment.id)

    # Build the spawner that will start the experiment
    experiment_spawner = spawner_cls(
        project_name=project.unique_name,
        experiment_name=experiment.unique_name,
        experiment_group_name=group.unique_name if group else None,
        project_uuid=project.uuid.hex,
        experiment_group_uuid=group.uuid.hex if group else None,
        experiment_uuid=experiment.uuid.hex,
        persistence_config=experiment.persistence_config,
        outputs_refs_experiments=experiment.outputs_refs_experiments,
        outputs_refs_jobs=experiment.outputs_refs_jobs,
        original_name=experiment.original_unique_name,
        cloning_strategy=experiment.cloning_strategy,
        spec=experiment.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
        job_docker_image=job_docker_image,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
        token_scope=token_scope)

    # Stays None on success; a handler replaces it with (traceback, message).
    failure = None
    try:
        spawn_response = experiment_spawner.start_experiment()
        handle_experiment(experiment=experiment,
                          spawner=experiment_spawner,
                          response=spawn_response)
    except ApiException:
        _logger.error(
            'Could not start the experiment, please check your polyaxon spec.',
            exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the experiment, encountered a Kubernetes ApiException.',
        )
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the experiment, please check your volume definitions.',
            exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the experiment, '
            'encountered a volume definition problem, %s.' % e,
        )
    except Exception as e:
        _logger.error(
            'Could not start the experiment, please check your polyaxon spec',
            exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the experiment encountered an {} exception.'.format(
                e.__class__.__name__),
        )

    if failure is None:
        return

    failure_traceback, failure_message = failure
    experiment.set_status(ExperimentLifeCycle.FAILED,
                          message=failure_message,
                          traceback=failure_traceback)