def handle_pytorch_experiment(response):
    """Persist the job definitions for every task of a PyTorch experiment.

    Records the master task's definition first, then one definition per
    worker task found in the scheduler response.

    Args:
        response: mapping of task type (``TaskType.MASTER`` /
            ``TaskType.WORKER``) to task payloads; each payload carries a
            ``job_uuid`` label on its pod metadata.
    """
    def _record_task(task):
        # The pod's `job_uuid` label identifies which job row to update.
        job_uuid = uuid.UUID(task['pod']['metadata']['labels']['job_uuid'])
        set_job_definition(job_uuid=job_uuid,
                           definition=get_job_definition(task))

    # Master and workers share identical persistence logic; the original
    # duplicated the extraction code for each branch.
    _record_task(response[TaskType.MASTER])
    for worker in response[TaskType.WORKER]:
        _record_task(worker)
def start_notebook(notebook):
    """Schedule a notebook job on Kubernetes.

    Marks the notebook SCHEDULED, resolves the docker image built for it,
    spawns the notebook via :class:`NotebookSpawner`, and persists the
    resulting job definition. On any failure the notebook is transitioned
    to FAILED with an explanatory message (and traceback when available),
    and the function returns ``None``.
    """
    # Update job status to show that its started
    notebook.set_status(JobLifeCycle.SCHEDULED)

    # Resolve the container registry first: without it there is no image
    # to run, so fail fast with a status message instead of raising.
    try:
        registry_spec = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the notebook, please check your registry configuration.')
        return

    # Locate the image produced by the notebook's build job.
    # ValueError/AttributeError signal missing/incomplete build-job info.
    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job,
                                               registry_host=registry_spec.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the notebook.', exc_info=True)
        notebook.set_status(JobLifeCycle.FAILED, message='Image info was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', job_docker_image)

    spawner = NotebookSpawner(
        project_name=notebook.project.unique_name,
        project_uuid=notebook.project.uuid.hex,
        job_name=notebook.unique_name,
        job_uuid=notebook.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        version=conf.get(CHART_VERSION),
        job_docker_image=job_docker_image,
        in_cluster=True)

    # `error` is inspected in the `finally` clause; it stays empty on the
    # success path so no FAILED status is set after a clean return.
    error = {}
    try:
        mount_code_in_notebooks = conf.get(NOTEBOOKS_MOUNT_CODE)
        results = spawner.start_notebook(
            persistence_outputs=notebook.persistence_outputs,
            persistence_data=notebook.persistence_data,
            outputs_refs_jobs=notebook.outputs_refs_jobs,
            outputs_refs_experiments=notebook.outputs_refs_experiments,
            resources=notebook.resources,
            labels=notebook.labels,
            annotations=notebook.annotations,
            secret_refs=notebook.secret_refs,
            config_map_refs=notebook.config_map_refs,
            node_selector=notebook.node_selector,
            affinity=notebook.affinity,
            tolerations=notebook.tolerations,
            backend=notebook.backend,
            max_restarts=get_max_restart(notebook.max_restarts,
                                         conf.get(MAX_RESTARTS_NOTEBOOKS)),
            reconcile_url=get_notebook_reconcile_url(notebook.unique_name),
            mount_code_in_notebooks=mount_code_in_notebooks)
        # Persist only the definition field produced by the spawner.
        notebook.definition = get_job_definition(results)
        notebook.save(update_fields=['definition'])
        return
    except ApiException:
        # Kubernetes rejected the spec.
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except StoreNotFoundError as e:
        # A referenced volume/store does not exist.
        _logger.error('Could not start the notebook, please check your volume definitions',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        # Catch-all boundary: surface the exception class in the status message.
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start notebook encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            notebook.set_status(
                JobLifeCycle.FAILED,
                message=error.get('message'),
                traceback=error.get('traceback'))
def start_tensorboard(tensorboard):
    """Schedule a tensorboard job on Kubernetes.

    Marks the tensorboard SCHEDULED, spawns it via
    :class:`TensorboardSpawner` with the outputs paths it should read, and
    persists the resulting job definition. On failure the tensorboard is
    transitioned to FAILED with an explanatory message, and the function
    returns ``None``.
    """
    # Update job status to show that its started
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(
        project_name=tensorboard.project.unique_name,
        project_uuid=tensorboard.project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        version=conf.get(CHART_VERSION),
        job_docker_image=tensorboard.build_image,
        in_cluster=True)

    # `error` is inspected in the `finally` clause; it stays empty on the
    # success path so no FAILED status is set after a clean return.
    error = {}
    # outputs_path yields both the store specs and the concrete paths the
    # tensorboard should mount/read.
    outputs_specs, tensorboard_paths = tensorboard.outputs_path
    try:
        results = spawner.start_tensorboard(
            outputs_path=tensorboard_paths,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_specs=outputs_specs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            labels=tensorboard.labels,
            annotations=tensorboard.annotations,
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations,
            max_restarts=get_max_restart(tensorboard.max_restarts,
                                         conf.get(MAX_RESTARTS_TENSORBOARDS)),
            reconcile_url=get_tensorboard_reconcile_url(tensorboard.unique_name))
        # Persist only the definition field produced by the spawner.
        tensorboard.definition = get_job_definition(results)
        tensorboard.save(update_fields=['definition'])
        return
    except ApiException:
        # Kubernetes rejected the spec.
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except StoreNotFoundError as e:
        # A referenced volume/store does not exist.
        _logger.error(
            'Could not start the tensorboard, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a volume definition problem. %s' % e,
        }
    except TensorboardValidation as e:
        # Domain validation error (mixed store access); no traceback recorded
        # because this is an expected configuration problem, not a crash.
        _logger.error(
            'Could not start the tensorboard, '
            'some experiments require authenticating to stores with different access.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': None,
            'message': 'Could not start the tensorboard, '
                       'some experiments require authenticating '
                       'to stores with different access. %s' % e,
        }
    except Exception as e:
        # Catch-all boundary: surface the exception class in the status message.
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start tensorboard encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            tensorboard.set_status(
                JobLifeCycle.FAILED,
                message=error.get('message'),
                traceback=error.get('traceback'))
def handle_base_experiment(response):
    """Persist the job definition extracted from the master task's pod."""
    master_task = response[TaskType.MASTER]
    # The pod's `job_uuid` label identifies which job row to update.
    pod_labels = master_task['pod']['metadata']['labels']
    master_uuid = uuid.UUID(pod_labels['job_uuid'])
    set_job_definition(job_uuid=master_uuid,
                       definition=get_job_definition(master_task))
def start_dockerizer(build_job):
    """Schedule a dockerizer (image build) job on Kubernetes.

    Marks the build job SCHEDULED, resolves the registry context for the
    build backend, spawns the dockerizer via the backend-specific spawner
    class, records a BUILD_JOB_STARTED audit event, and persists the
    resulting job definition.

    Returns:
        ``True`` when the dockerizer was started successfully; ``None``
        otherwise (the build job is then transitioned to FAILED).
    """
    # Update job status to show that its started
    build_job.set_status(JobLifeCycle.SCHEDULED)

    # The spawner implementation depends on the configured build backend.
    spawner_class = get_spawner_class(build_job.backend)

    # Resolve the container registry first: the built image must be pushed
    # somewhere, so fail fast with a status message instead of raising.
    try:
        registry_spec = get_registry_context(build_backend=build_job.backend)
    except ContainerRegistryError:
        build_job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the dockerizer job, please check your registry configuration.')
        return

    spawner = spawner_class(
        project_name=build_job.project.unique_name,
        project_uuid=build_job.project.uuid.hex,
        job_name=build_job.unique_name,
        job_uuid=build_job.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        version=conf.get(CHART_VERSION),
        in_cluster=True,
        use_sidecar=True,
        log_level=build_job.specification.log_level)

    # `error` is inspected in the `finally` clause; it stays empty on the
    # success path so no FAILED status is set after a clean return.
    error = {}
    try:
        results = spawner.start_dockerizer(
            commit=build_job.commit,
            from_image=build_job.build_image,
            dockerfile_path=build_job.build_dockerfile,
            context_path=build_job.build_context,
            # The job uuid doubles as the image tag, making tags unique per build.
            image_tag=build_job.uuid.hex,
            image_name=get_image_name(build_job=build_job,
                                      registry_host=registry_spec.host),
            build_steps=build_job.build_steps,
            env_vars=build_job.build_env_vars,
            lang_env=build_job.build_lang_env,
            nocache=build_job.build_nocache,
            insecure=registry_spec.insecure,
            creds_secret_ref=registry_spec.secret,
            creds_secret_items=registry_spec.secret_items,
            secret_refs=build_job.secret_refs,
            config_map_refs=build_job.config_map_refs,
            resources=build_job.resources,
            labels=build_job.labels,
            annotations=build_job.annotations,
            node_selector=build_job.node_selector,
            affinity=build_job.affinity,
            tolerations=build_job.tolerations,
            max_restarts=get_max_restart(build_job.max_restarts,
                                         conf.get(MAX_RESTARTS_BUILD_JOBS)),
            reconcile_url=get_build_reconcile_url(build_job.unique_name))
        # Audit only after the spawner call succeeded.
        auditor.record(event_type=BUILD_JOB_STARTED, instance=build_job)
        # Persist only the definition field produced by the spawner.
        build_job.definition = get_job_definition(results)
        build_job.save(update_fields=['definition'])
        return True
    except ApiException:
        # Kubernetes rejected the spec.
        _logger.error(
            'Could not start build job, please check your polyaxon spec',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a Kubernetes ApiException.'
        }
    except StoreNotFoundError as e:
        # A referenced volume/store does not exist.
        _logger.error(
            'Could not start build job, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a volume definition problem. %s' % e
        }
    except Exception as e:
        # Catch-all boundary: surface the exception class in the status message.
        _logger.error(
            'Could not start build job, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            build_job.set_status(
                JobLifeCycle.FAILED,
                message=error.get('message'),
                traceback=error.get('traceback'))