def _create_job(self,
                task_type,
                task_idx,
                add_service,
                command=None,
                args=None,
                env_vars=None,
                resources=None,
                node_selector=None,
                restart_policy='Never'):
        """Create or update the pod of one task and, on request, its service.

        Args:
            task_type: Task type, forwarded to the pod manager.
            task_idx: Task index, forwarded to the pod manager.
            add_service: When truthy, a service named after the job is also
                created or updated.
            command: Forwarded to ``pod_manager.get_pod``.
            args: Forwarded to ``pod_manager.get_pod``.
            env_vars: Forwarded to ``pod_manager.get_pod``.
            resources: Forwarded to ``pod_manager.get_pod``.
            node_selector: Forwarded to ``pod_manager.get_pod``.
            restart_policy: Forwarded to ``pod_manager.get_pod``.

        Returns:
            A dict holding the pod response under ``'pod'`` and, when
            ``add_service`` is truthy, the service response under
            ``'service'`` (both converted with ``to_dict()``).
        """
        pod_name = self.pod_manager.get_job_name(task_type=task_type, task_idx=task_idx)
        sidecar = get_sidecar_args(pod_id=pod_name)
        pod_labels = self.pod_manager.get_labels(task_type=task_type, task_idx=task_idx)

        volumes, volume_mounts = get_pod_volumes(
            persistence_outputs=self.persistence_config.outputs,
            persistence_data=self.persistence_config.data)
        # Outputs of referenced jobs are added first, then those of referenced experiments.
        for outputs_refs in (self.outputs_refs_jobs, self.outputs_refs_experiments):
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=self.persistence_config.outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts

        pod_spec = self.pod_manager.get_pod(
            task_type=task_type,
            task_idx=task_idx,
            volume_mounts=volume_mounts,
            volumes=volumes,
            labels=pod_labels,
            env_vars=env_vars,
            command=command,
            args=args,
            sidecar_args=sidecar,
            persistence_outputs=self.persistence_config.outputs,
            persistence_data=self.persistence_config.data,
            outputs_refs_jobs=self.outputs_refs_jobs,
            outputs_refs_experiments=self.outputs_refs_experiments,
            resources=resources,
            node_selector=node_selector,
            restart_policy=restart_policy)
        pod_resp, _ = self.create_or_update_pod(name=pod_name, data=pod_spec)

        results = {'pod': pod_resp.to_dict()}
        if not add_service:
            return results

        service_spec = services.get_service(
            namespace=self.namespace,
            name=pod_name,
            labels=pod_labels,
            ports=self.pod_manager.ports,
            target_ports=self.pod_manager.ports)
        service_resp, _ = self.create_or_update_service(name=pod_name, data=service_spec)
        results['service'] = service_resp.to_dict()
        return results
    def _create_job(self,
                    task_type,
                    task_idx,
                    add_service,
                    command=None,
                    args=None,
                    sidecar_args_fn=None,
                    env_vars=None,
                    resources=None,
                    node_selector=None,
                    restart_policy='Never'):
        """Create or update the pod of one task and, on request, its service.

        Args:
            task_type: Task type, forwarded to the pod manager.
            task_idx: Task index, forwarded to the pod manager.
            add_service: When truthy, a service named after the job is also
                created or updated.
            command: Forwarded to ``pod_manager.get_pod``.
            args: Forwarded to ``pod_manager.get_pod``.
            sidecar_args_fn: Callable invoked as ``sidecar_args_fn(pod_id=...)``
                to produce the sidecar arguments. Required in practice even
                though the signature defaults it to ``None``.
            env_vars: Forwarded to ``pod_manager.get_pod``.
            resources: Forwarded to ``pod_manager.get_pod``.
            node_selector: Forwarded to ``pod_manager.get_pod``.
            restart_policy: Forwarded to ``pod_manager.get_pod``.

        Returns:
            A dict holding the pod response under ``'pod'`` and, when
            ``add_service`` is truthy, the service response under
            ``'service'`` (both converted with ``to_dict()``).

        Raises:
            TypeError: If ``sidecar_args_fn`` is not callable.
        """
        # The None default can never work; fail with an explicit message
        # before anything is asked of the pod manager or the cluster.
        if not callable(sidecar_args_fn):
            raise TypeError(
                'sidecar_args_fn must be a callable accepting `pod_id`, '
                'got {!r}'.format(sidecar_args_fn))

        job_name = self.pod_manager.get_job_name(task_type=task_type,
                                                 task_idx=task_idx)
        sidecar_args = sidecar_args_fn(pod_id=job_name)
        labels = self.pod_manager.get_labels(task_type=task_type,
                                             task_idx=task_idx)

        volumes, volume_mounts = get_pod_volumes()
        pod = self.pod_manager.get_pod(task_type=task_type,
                                       task_idx=task_idx,
                                       volume_mounts=volume_mounts,
                                       volumes=volumes,
                                       env_vars=env_vars,
                                       command=command,
                                       args=args,
                                       sidecar_args=sidecar_args,
                                       resources=resources,
                                       node_selector=node_selector,
                                       restart_policy=restart_policy)
        pod_resp, _ = self.create_or_update_pod(name=job_name, data=pod)

        results = {'pod': pod_resp.to_dict()}
        if add_service:
            # Build the service definition only when it is actually submitted.
            service = services.get_service(namespace=self.namespace,
                                           name=job_name,
                                           labels=labels,
                                           ports=self.pod_manager.ports,
                                           target_ports=self.pod_manager.ports)
            service_resp, _ = self.create_or_update_service(name=job_name,
                                                            data=service)
            results['service'] = service_resp.to_dict()
        return results
    def _create_job(self,
                    task_type,
                    task_idx,
                    add_service,
                    command=None,
                    args=None,
                    env_vars=None,
                    resources=None,
                    node_selector=None,
                    restart_policy='Never'):
        """Create or update the pod of one task and, on request, its service.

        Args:
            task_type: Task type, forwarded to the pod manager.
            task_idx: Task index, forwarded to the pod manager.
            add_service: When truthy, a service named after the job is also
                created or updated.
            command: Forwarded to ``pod_manager.get_pod``.
            args: Forwarded to ``pod_manager.get_pod``.
            env_vars: Forwarded to ``pod_manager.get_pod``.
            resources: Forwarded to ``pod_manager.get_pod``.
            node_selector: Forwarded to ``pod_manager.get_pod``.
            restart_policy: Forwarded to ``pod_manager.get_pod``.

        Returns:
            A dict holding the pod response under ``'pod'`` and, when
            ``add_service`` is truthy, the service response under
            ``'service'`` (both converted with ``to_dict()``).
        """
        job_name = self.pod_manager.get_job_name(task_type=task_type, task_idx=task_idx)
        sidecar_args = get_sidecar_args(pod_id=job_name)
        labels = self.pod_manager.get_labels(task_type=task_type, task_idx=task_idx)

        volumes, volume_mounts = get_pod_volumes()
        pod = self.pod_manager.get_pod(task_type=task_type,
                                       task_idx=task_idx,
                                       volume_mounts=volume_mounts,
                                       volumes=volumes,
                                       env_vars=env_vars,
                                       command=command,
                                       args=args,
                                       sidecar_args=sidecar_args,
                                       resources=resources,
                                       node_selector=node_selector,
                                       restart_policy=restart_policy)
        pod_resp, _ = self.create_or_update_pod(name=job_name, data=pod)

        results = {'pod': pod_resp.to_dict()}
        if add_service:
            # Build the service definition only when it is actually submitted,
            # instead of constructing and discarding it for service-less tasks.
            service = services.get_service(namespace=self.namespace,
                                           name=job_name,
                                           labels=labels,
                                           ports=self.pod_manager.ports,
                                           target_ports=self.pod_manager.ports)
            service_resp, _ = self.create_or_update_service(name=job_name, data=service)
            results['service'] = service_resp.to_dict()
        return results
Beispiel #4
0
    def start_notebook(self,
                       persistence_outputs=None,
                       persistence_data=None,
                       outputs_refs_jobs=None,
                       outputs_refs_experiments=None,
                       resources=None,
                       secret_refs=None,
                       configmap_refs=None,
                       node_selector=None,
                       affinity=None,
                       tolerations=None,
                       backend=None,
                       mount_code_in_notebooks=False):
        """Create or update the notebook deployment, its service and, if enabled, its ingress.

        Args:
            persistence_outputs: Used to build the pod volumes and forwarded
                to the deployment.
            persistence_data: Used to build the pod volumes and forwarded to
                the deployment.
            outputs_refs_jobs: Referenced job outputs to mount.
            outputs_refs_experiments: Referenced experiment outputs to mount.
            resources: Forwarded to the deployment.
            secret_refs: Passed through ``validate_secret_refs`` before use.
            configmap_refs: Passed through ``validate_configmap_refs`` before use.
            node_selector: Forwarded to the deployment.
            affinity: Forwarded to the deployment.
            tolerations: Forwarded to the deployment.
            backend: Forwarded to ``get_notebook_args``.
            mount_code_in_notebooks: When truthy, the notebook code volume is
                mounted as well; also forwarded to ``get_notebook_args``.

        Returns:
            A dict with the deployment response under ``'deployment'`` and
            the service response under ``'service'`` (both via ``to_dict()``).
        """
        ports = [self.request_notebook_port()]
        target_ports = [self.PORT]

        volumes, volume_mounts = get_pod_volumes(
            persistence_outputs=persistence_outputs,
            persistence_data=persistence_data)
        # Referenced job outputs are mounted first, then referenced experiment outputs.
        for outputs_refs in (outputs_refs_jobs, outputs_refs_experiments):
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=persistence_outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts

        extra_volumes, extra_mounts = get_shm_volumes()
        volumes += extra_volumes
        volume_mounts += extra_mounts

        # The context mounts are kept: they are also handed to the deployment.
        context_volumes, context_mounts = get_auth_context_volumes()
        volumes += context_volumes
        volume_mounts += context_mounts

        if mount_code_in_notebooks:
            code_volume, code_mount = self.get_notebook_code_volume()
            volumes.append(code_volume)
            volume_mounts.append(code_mount)

        secret_refs = validate_secret_refs(secret_refs)
        configmap_refs = validate_configmap_refs(configmap_refs)

        resource_name = self.resource_manager.get_resource_name()
        notebook_args = self.get_notebook_args(
            deployment_name=resource_name,
            ports=ports,
            mount_code_in_notebooks=mount_code_in_notebooks,
            backend=backend)
        deployment_spec = self.resource_manager.get_deployment(
            resource_name=resource_name,
            volume_mounts=volume_mounts,
            volumes=volumes,
            labels=self.resource_manager.labels,
            env_vars=None,
            command=["/bin/sh", "-c"],
            args=notebook_args,
            init_env_vars=self.get_init_env_vars(),
            persistence_outputs=persistence_outputs,
            persistence_data=persistence_data,
            outputs_refs_jobs=outputs_refs_jobs,
            outputs_refs_experiments=outputs_refs_experiments,
            secret_refs=secret_refs,
            configmap_refs=configmap_refs,
            resources=resources,
            ephemeral_token=None,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            ports=target_ports,
            init_context_mounts=context_mounts,
            restart_policy='Never')
        dep_resp, _ = self.create_or_update_deployment(name=resource_name,
                                                       data=deployment_spec)

        service_spec = services.get_service(namespace=self.namespace,
                                            name=resource_name,
                                            labels=self.resource_manager.get_labels(),
                                            ports=ports,
                                            target_ports=target_ports,
                                            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(name=resource_name,
                                                        data=service_spec)
        results = {
            'deployment': dep_resp.to_dict(),
            'service': service_resp.to_dict(),
        }

        if not self._use_ingress():
            return results

        annotations = json.loads(conf.get('K8S_INGRESS_ANNOTATIONS'))
        project_path = self.project_name.replace('.', '/')
        backend_ref = {'serviceName': resource_name, 'servicePort': ports[0]}
        paths = [{'path': '/notebooks/{}'.format(project_path), 'backend': backend_ref}]
        ingress_spec = ingresses.get_ingress(namespace=self.namespace,
                                             name=resource_name,
                                             labels=self.resource_manager.get_labels(),
                                             annotations=annotations,
                                             paths=paths)
        self.create_or_update_ingress(name=resource_name, data=ingress_spec)
        return results
Beispiel #5
0
    def start_notebook(self,
                       image,
                       persistence_outputs=None,
                       persistence_data=None,
                       outputs_refs_jobs=None,
                       outputs_refs_experiments=None,
                       resources=None,
                       secret_refs=None,
                       configmap_refs=None,
                       node_selector=None,
                       affinity=None,
                       tolerations=None,
                       allow_commits=False):
        """Create or update the notebook deployment, its service and, if enabled, its ingress.

        Args:
            image: Container image forwarded to the deployment.
            persistence_outputs: Used for volumes, env vars and the outputs path.
            persistence_data: Used for volumes and env vars.
            outputs_refs_jobs: Referenced job outputs to mount and expose.
            outputs_refs_experiments: Referenced experiment outputs to mount
                and expose.
            resources: Forwarded to the deployment.
            secret_refs: Passed through ``validate_secret_refs`` before use.
            configmap_refs: Passed through ``validate_configmap_refs`` before use.
            node_selector: Resolved with ``get_node_selector`` against
                ``settings.NODE_SELECTOR_EXPERIMENTS``.
            affinity: Resolved with ``get_affinity`` against
                ``settings.AFFINITY_EXPERIMENTS``.
            tolerations: Resolved with ``get_tolerations`` against
                ``settings.TOLERATIONS_EXPERIMENTS``.
            allow_commits: Forwarded to ``get_notebook_args``.

        Returns:
            A dict with the deployment response under ``'deployment'`` and
            the service response under ``'service'`` (both via ``to_dict()``).
        """
        ports = [self.request_notebook_port()]
        target_ports = [self.PORT]

        volumes, volume_mounts = get_pod_volumes(persistence_outputs=persistence_outputs,
                                                 persistence_data=persistence_data)
        # Referenced job outputs are mounted first, then referenced experiment outputs.
        for outputs_refs in (outputs_refs_jobs, outputs_refs_experiments):
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=persistence_outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts
        extra_volumes, extra_mounts = get_shm_volumes()
        volumes += extra_volumes
        volume_mounts += extra_mounts

        outputs_path = get_notebook_job_outputs_path(persistence_outputs=persistence_outputs,
                                                     notebook_job=self.job_name)
        env_vars = get_job_env_vars(persistence_outputs=persistence_outputs,
                                    outputs_path=outputs_path,
                                    persistence_data=persistence_data,
                                    outputs_refs_jobs=outputs_refs_jobs,
                                    outputs_refs_experiments=outputs_refs_experiments)

        secret_refs = validate_secret_refs(secret_refs)
        configmap_refs = validate_configmap_refs(configmap_refs)
        env_from = get_pod_env_from(secret_refs=secret_refs, configmap_refs=configmap_refs)

        code_volume, code_mount = self.get_notebook_code_volume()
        volumes.append(code_volume)
        volume_mounts.append(code_mount)

        deployment_name = JOB_NAME_FORMAT.format(name=NOTEBOOK_JOB_NAME,
                                                 job_uuid=self.job_uuid)

        # Caller-provided scheduling constraints are resolved against the
        # experiment defaults from settings.
        node_selector = get_node_selector(
            node_selector=node_selector,
            default_node_selector=settings.NODE_SELECTOR_EXPERIMENTS)
        affinity = get_affinity(
            affinity=affinity,
            default_affinity=settings.AFFINITY_EXPERIMENTS)
        tolerations = get_tolerations(
            tolerations=tolerations,
            default_tolerations=settings.TOLERATIONS_EXPERIMENTS)

        # The same identifying arguments feed both the deployment and its labels.
        identity = dict(app=settings.APP_LABELS_NOTEBOOK,
                        project_name=self.project_name,
                        project_uuid=self.project_uuid,
                        job_name=self.job_name,
                        job_uuid=self.job_uuid,
                        role=settings.ROLE_LABELS_DASHBOARD,
                        type=settings.TYPE_LABELS_RUNNER)
        notebook_args = self.get_notebook_args(deployment_name=deployment_name,
                                               ports=ports,
                                               allow_commits=allow_commits)
        deployment_spec = deployments.get_deployment(
            namespace=self.namespace,
            name=NOTEBOOK_JOB_NAME,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=image,
            command=["/bin/sh", "-c"],
            args=notebook_args,
            ports=target_ports,
            container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
            env_vars=env_vars,
            env_from=env_from,
            resources=resources,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            service_account_name=settings.K8S_SERVICE_ACCOUNT_EXPERIMENTS,
            **identity)
        deployment_labels = deployments.get_labels(**identity)
        dep_resp, _ = self.create_or_update_deployment(name=deployment_name,
                                                       data=deployment_spec)

        service_spec = services.get_service(
            namespace=self.namespace,
            name=deployment_name,
            labels=deployment_labels,
            ports=ports,
            target_ports=target_ports,
            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(name=deployment_name,
                                                        data=service_spec)
        results = {'deployment': dep_resp.to_dict(), 'service': service_resp.to_dict()}

        if not self._use_ingress():
            return results

        annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
        project_path = self.project_name.replace('.', '/')
        backend_ref = {'serviceName': deployment_name, 'servicePort': ports[0]}
        paths = [{'path': '/notebook/{}'.format(project_path), 'backend': backend_ref}]
        ingress_spec = ingresses.get_ingress(namespace=self.namespace,
                                             name=deployment_name,
                                             labels=deployment_labels,
                                             annotations=annotations,
                                             paths=paths)
        self.create_or_update_ingress(name=deployment_name, data=ingress_spec)
        return results
Beispiel #6
0
    def start_tensorboard(self,
                          image,
                          outputs_path,
                          resources=None,
                          node_selectors=None):
        """Create or update the tensorboard deployment, its service and, if enabled, its ingress.

        Args:
            image: Container image forwarded to the deployment.
            outputs_path: Path given to tensorboard as ``--logdir``.
            resources: Forwarded to the deployment.
            node_selectors: Forwarded to the deployment as ``node_selector``.

        Returns:
            A dict with the deployment response under ``'deployment'`` and
            the service response under ``'service'`` (both via ``to_dict()``).
        """
        ports = [self.request_tensorboard_port()]
        target_ports = [self.PORT]
        volumes, volume_mounts = get_pod_volumes()

        # The same identifying arguments feed both the deployment and its labels.
        identity = dict(app=settings.APP_LABELS_TENSORBOARD,
                        project_name=self.project_name,
                        project_uuid=self.project_uuid,
                        job_name=self.job_name,
                        job_uuid=self.job_uuid,
                        role=settings.ROLE_LABELS_DASHBOARD,
                        type=settings.TYPE_LABELS_EXPERIMENT)
        tensorboard_cmd = "tensorboard --logdir={} --port={}".format(outputs_path, self.PORT)
        deployment_spec = deployments.get_deployment(
            namespace=self.namespace,
            name=self.TENSORBOARD_JOB_NAME,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=image,
            command=["/bin/sh", "-c"],
            args=[tensorboard_cmd],
            ports=target_ports,
            container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
            resources=resources,
            node_selector=node_selectors,
            **identity)
        deployment_name = constants.JOB_NAME.format(name=self.TENSORBOARD_JOB_NAME,
                                                    job_uuid=self.job_uuid)
        deployment_labels = deployments.get_labels(**identity)

        dep_resp, _ = self.create_or_update_deployment(name=deployment_name,
                                                       data=deployment_spec)
        service_spec = services.get_service(
            namespace=self.namespace,
            name=deployment_name,
            labels=deployment_labels,
            ports=ports,
            target_ports=target_ports,
            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(name=deployment_name,
                                                        data=service_spec)
        results = {'deployment': dep_resp.to_dict(), 'service': service_resp.to_dict()}

        if not self._use_ingress():
            return results

        annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
        project_path = self.project_name.replace('.', '/')
        backend_ref = {'serviceName': deployment_name, 'servicePort': ports[0]}
        paths = [{'path': '/tensorboard/{}'.format(project_path), 'backend': backend_ref}]
        ingress_spec = ingresses.get_ingress(namespace=self.namespace,
                                             name=deployment_name,
                                             labels=deployment_labels,
                                             annotations=annotations,
                                             paths=paths)
        self.create_or_update_ingress(name=deployment_name, data=ingress_spec)
        return results
    def _create_job(self,
                    task_type,
                    task_idx,
                    add_service,
                    command=None,
                    args=None,
                    env_vars=None,
                    resources=None,
                    annotations=None,
                    node_selector=None,
                    affinity=None,
                    tolerations=None,
                    max_restarts=None):
        """Create or update the pod of one task and, on request, its service.

        Args:
            task_type: Task type used for naming, labels and the pod spec.
            task_idx: Task index used for naming, labels and the pod spec.
            add_service: When truthy, a service named after the resource is
                also created or updated.
            command: Forwarded to ``resource_manager.get_task_pod``.
            args: Forwarded to ``resource_manager.get_task_pod``.
            env_vars: Forwarded to ``resource_manager.get_task_pod``.
            resources: Forwarded to ``resource_manager.get_task_pod``.
            annotations: Accepted but not used by this method.
            node_selector: Forwarded to ``resource_manager.get_task_pod``.
            affinity: Forwarded to ``resource_manager.get_task_pod``.
            tolerations: Forwarded to ``resource_manager.get_task_pod``.
            max_restarts: Forwarded to the pod spec and used to derive the
                restart policy via ``get_pod_restart_policy``.

        Returns:
            A dict holding the pod response under ``'pod'`` and, when
            ``add_service`` is truthy, the service response under
            ``'service'`` (both converted with ``to_dict()``).
        """
        # A header token is only generated when a token scope is configured.
        if self.token_scope:
            ephemeral_token = RedisEphemeralTokens.generate_header_token(
                scope=self.token_scope)
        else:
            ephemeral_token = None

        resource_name = self.resource_manager.get_resource_name(task_type=task_type,
                                                                task_idx=task_idx)
        job_uuid = self.get_job_uuids(task_type=task_type, task_idx=task_idx)
        reconcile_url = get_experiment_reconcile_url(self.experiment_name, job_uuid)
        pod_labels = self.get_labels(task_type=task_type,
                                     task_idx=task_idx,
                                     job_uuid=job_uuid)

        # Collect every volume and its mount for the pod.
        volumes, volume_mounts = get_pod_volumes(
            persistence_outputs=self.persistence_config.outputs,
            persistence_data=self.persistence_config.data)
        # Referenced job outputs are mounted first, then referenced experiment outputs.
        for outputs_refs in (self.outputs_refs_jobs, self.outputs_refs_experiments):
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=self.persistence_config.outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts

        extra_volumes, extra_mounts = get_shm_volumes()
        volumes += extra_volumes
        volume_mounts += extra_mounts

        # The context mounts are kept: they are also handed to the pod spec.
        context_volumes, context_mounts = get_auth_context_volumes()
        volumes += context_volumes
        volume_mounts += context_mounts

        pod_spec = self.resource_manager.get_task_pod(
            task_type=task_type,
            task_idx=task_idx,
            volume_mounts=volume_mounts,
            volumes=volumes,
            labels=pod_labels,
            env_vars=env_vars,
            command=command,
            args=args,
            ports=self.ports,
            init_env_vars=self.get_init_env_vars(),
            persistence_outputs=self.persistence_config.outputs,
            persistence_data=self.persistence_config.data,
            outputs_refs_jobs=self.outputs_refs_jobs,
            outputs_refs_experiments=self.outputs_refs_experiments,
            secret_refs=self.spec.secret_refs,
            config_map_refs=self.spec.config_map_refs,
            resources=resources,
            ephemeral_token=ephemeral_token,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            init_context_mounts=context_mounts,
            reconcile_url=reconcile_url,
            max_restarts=max_restarts,
            restart_policy=get_pod_restart_policy(max_restarts))
        pod_resp, _ = self.create_or_update_pod(name=resource_name,
                                                body=pod_spec,
                                                reraise=True)

        results = {'pod': pod_resp.to_dict()}
        if not add_service:
            return results

        service_spec = services.get_service(namespace=self.namespace,
                                            name=resource_name,
                                            labels=pod_labels,
                                            ports=self.ports,
                                            target_ports=self.ports)
        service_resp, _ = self.create_or_update_service(name=resource_name,
                                                        body=service_spec,
                                                        reraise=True)
        results['service'] = service_resp.to_dict()
        return results
    def start_notebook(self, image, resources=None, node_selectors=None):
        """Create or update the notebook deployment, its service and, if enabled, its ingress.

        Args:
            image: Container image forwarded to the deployment.
            resources: Forwarded to the deployment.
            node_selectors: Forwarded to the deployment as ``node_selector``.

        Returns:
            A dict with the deployment response under ``'deployment'`` and
            the service response under ``'service'`` (both via ``to_dict()``).
        """
        ports = [self.request_notebook_port()]
        target_ports = [self.PORT]

        volumes, volume_mounts = get_pod_volumes()
        code_volume, code_mount = self.get_notebook_code_volume()
        volumes.append(code_volume)
        volume_mounts.append(code_mount)

        deployment_name = constants.JOB_NAME.format(name=self.NOTEBOOK_JOB_NAME,
                                                    job_uuid=self.job_uuid)
        notebook_token = self.get_notebook_token()
        notebook_url = self._get_proxy_url(namespace=self.namespace,
                                           job_name=self.NOTEBOOK_JOB_NAME,
                                           deployment_name=deployment_name,
                                           port=ports[0])
        # The notebook directory is the repos path followed by its own last segment.
        repos_path = get_project_repos_path(self.project_name)
        notebook_dir = '{}/{}'.format(repos_path, repos_path.split('/')[-1])

        notebook_cmd = (
            "jupyter notebook "
            "--no-browser "
            "--port={port} "
            "--ip=0.0.0.0 "
            "--allow-root "
            "--NotebookApp.token={token} "
            "--NotebookApp.trust_xheaders=True "
            "--NotebookApp.base_url={base_url} "
            "--NotebookApp.notebook_dir={notebook_dir} "
        ).format(port=self.PORT,
                 token=notebook_token,
                 base_url=notebook_url,
                 notebook_dir=notebook_dir)

        # The same identifying arguments feed both the deployment and its labels.
        identity = dict(app=settings.APP_LABELS_NOTEBOOK,
                        project_name=self.project_name,
                        project_uuid=self.project_uuid,
                        job_name=self.job_name,
                        job_uuid=self.job_uuid,
                        role=settings.ROLE_LABELS_DASHBOARD,
                        type=settings.TYPE_LABELS_EXPERIMENT)
        deployment_spec = deployments.get_deployment(
            namespace=self.namespace,
            name=self.NOTEBOOK_JOB_NAME,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=image,
            command=["/bin/sh", "-c"],
            args=[notebook_cmd],
            ports=target_ports,
            container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
            resources=resources,
            node_selector=node_selectors,
            **identity)
        deployment_labels = deployments.get_labels(**identity)
        dep_resp, _ = self.create_or_update_deployment(name=deployment_name,
                                                       data=deployment_spec)

        service_spec = services.get_service(namespace=self.namespace,
                                            name=deployment_name,
                                            labels=deployment_labels,
                                            ports=ports,
                                            target_ports=target_ports,
                                            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(name=deployment_name,
                                                        data=service_spec)
        results = {
            'deployment': dep_resp.to_dict(),
            'service': service_resp.to_dict(),
        }

        if not self._use_ingress():
            return results

        annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
        project_path = self.project_name.replace('.', '/')
        backend_ref = {'serviceName': deployment_name, 'servicePort': ports[0]}
        paths = [{'path': '/notebook/{}'.format(project_path), 'backend': backend_ref}]
        ingress_spec = ingresses.get_ingress(namespace=self.namespace,
                                             name=deployment_name,
                                             labels=deployment_labels,
                                             annotations=annotations,
                                             paths=paths)
        self.create_or_update_ingress(name=deployment_name, data=ingress_spec)
        return results
    def _create_job(self,
                    task_type,
                    task_idx,
                    add_service,
                    command=None,
                    args=None,
                    env_vars=None,
                    resources=None,
                    node_selector=None,
                    affinity=None,
                    tolerations=None,
                    restart_policy='Never'):
        """Create or update the pod of one task and, on request, its service.

        Args:
            task_type: Task type, forwarded to the pod manager.
            task_idx: Task index, forwarded to the pod manager.
            add_service: When truthy, a service named after the job is also
                created or updated.
            command: Forwarded to ``pod_manager.get_pod``.
            args: Forwarded to ``pod_manager.get_pod``.
            env_vars: Forwarded to ``pod_manager.get_pod``.
            resources: Forwarded to ``pod_manager.get_pod``.
            node_selector: Forwarded to ``pod_manager.get_pod``.
            affinity: Forwarded to ``pod_manager.get_pod``.
            tolerations: Forwarded to ``pod_manager.get_pod``.
            restart_policy: Forwarded to ``pod_manager.get_pod``.

        Returns:
            A dict holding the pod response under ``'pod'`` and, when
            ``add_service`` is truthy, the service response under
            ``'service'`` (both converted with ``to_dict()``).
        """
        header_token = RedisEphemeralTokens.generate_header_token(scope=self.token_scope)
        pod_name = self.pod_manager.get_job_name(task_type=task_type, task_idx=task_idx)
        sidecar = get_sidecar_args(pod_id=pod_name)
        pod_labels = self.pod_manager.get_labels(task_type=task_type, task_idx=task_idx)

        # Collect every volume and its mount for the pod.
        volumes, volume_mounts = get_pod_volumes(
            persistence_outputs=self.persistence_config.outputs,
            persistence_data=self.persistence_config.data)
        # Referenced job outputs are mounted first, then referenced experiment outputs.
        for outputs_refs in (self.outputs_refs_jobs, self.outputs_refs_experiments):
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=self.persistence_config.outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts
        extra_volumes, extra_mounts = get_shm_volumes()
        volumes += extra_volumes
        volume_mounts += extra_mounts

        # Secret and configmap refs from the spec go through validation first.
        checked_secret_refs = validate_secret_refs(self.spec.secret_refs)
        checked_configmap_refs = validate_configmap_refs(self.spec.configmap_refs)

        pod_spec = self.pod_manager.get_pod(
            task_type=task_type,
            task_idx=task_idx,
            volume_mounts=volume_mounts,
            volumes=volumes,
            labels=pod_labels,
            env_vars=env_vars,
            command=command,
            args=args,
            sidecar_args=sidecar,
            persistence_outputs=self.persistence_config.outputs,
            persistence_data=self.persistence_config.data,
            outputs_refs_jobs=self.outputs_refs_jobs,
            outputs_refs_experiments=self.outputs_refs_experiments,
            secret_refs=checked_secret_refs,
            configmap_refs=checked_configmap_refs,
            resources=resources,
            ephemeral_token=header_token,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            restart_policy=restart_policy)
        pod_resp, _ = self.create_or_update_pod(name=pod_name, data=pod_spec)

        results = {'pod': pod_resp.to_dict()}
        if not add_service:
            return results

        service_spec = services.get_service(
            namespace=self.namespace,
            name=pod_name,
            labels=pod_labels,
            ports=self.pod_manager.ports,
            target_ports=self.pod_manager.ports)
        service_resp, _ = self.create_or_update_service(name=pod_name, data=service_spec)
        results['service'] = service_resp.to_dict()
        return results
    def start_tensorboard(self,
                          outputs_path,
                          persistence_outputs,
                          outputs_specs=None,
                          outputs_refs_jobs=None,
                          outputs_refs_experiments=None,
                          resources=None,
                          node_selector=None,
                          affinity=None,
                          tolerations=None):
        """Create or update the tensorboard deployment and its service.

        Args:
            outputs_path: value handed to tensorboard's ``--logdir`` flag.
            persistence_outputs: passed to the outputs/refs volume helpers
                and to the deployment builder.
            outputs_specs: used both as an outputs refs group and as the
                source of the stores secrets.
            outputs_refs_jobs: jobs outputs refs to build volumes for.
            outputs_refs_experiments: experiments outputs refs to build
                volumes for.
            resources: forwarded unchanged to the deployment builder.
            node_selector: forwarded unchanged to the deployment builder.
            affinity: forwarded unchanged to the deployment builder.
            tolerations: forwarded unchanged to the deployment builder.

        Returns:
            A dict holding the ``to_dict()`` form of the deployment response
            under ``'deployment'`` and of the service response under
            ``'service'``.
        """
        exposed_ports = [self.request_tensorboard_port()]
        container_ports = [self.PORT]

        volumes, volume_mounts = get_pod_outputs_volume(persistence_outputs)
        # Refs groups are processed in a fixed order: jobs, specs, experiments.
        refs_groups = (outputs_refs_jobs, outputs_specs,
                       outputs_refs_experiments)
        for outputs_refs in refs_groups:
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=persistence_outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts

        # Volumes for the secrets of the persistence outputs stores.
        stores_secrets = get_stores_secrets(specs=outputs_specs)
        self.validate_stores_secrets_keys(stores_secrets=stores_secrets)
        extra_volumes, extra_mounts = self.get_stores_secrets_volumes(
            stores_secrets=stores_secrets)
        volumes += extra_volumes
        volume_mounts += extra_mounts

        resource_name = self.resource_manager.get_resource_name()
        # The stores auth commands run first, tensorboard runs last; the
        # steps are chained with `&&` into a single shell argument.
        shell_steps = self.get_stores_secrets_command_args(
            stores_secrets=stores_secrets)
        shell_steps.append("tensorboard --logdir={} --port={}".format(
            outputs_path, self.PORT))
        shell_args = [' && '.join(shell_steps)]
        shell_command = ["/bin/sh", "-c"]

        deployment_spec = self.resource_manager.get_deployment(
            resource_name=resource_name,
            volume_mounts=volume_mounts,
            volumes=volumes,
            labels=self.resource_manager.labels,
            env_vars=None,
            command=shell_command,
            args=shell_args,
            persistence_outputs=persistence_outputs,
            outputs_refs_jobs=outputs_refs_jobs,
            outputs_refs_experiments=outputs_refs_experiments,
            resources=resources,
            ephemeral_token=None,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            ports=container_ports,
            restart_policy='Never')
        deployment_resp, _ = self.create_or_update_deployment(
            name=resource_name, data=deployment_spec)

        service_spec = services.get_service(
            namespace=self.namespace,
            name=resource_name,
            labels=self.resource_manager.get_labels(),
            ports=exposed_ports,
            target_ports=container_ports,
            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(
            name=resource_name, data=service_spec)

        results = {
            'deployment': deployment_resp.to_dict(),
            'service': service_resp.to_dict()
        }

        if not self._use_ingress():
            return results

        # Ingress routing: the dots of the project name become path segments.
        ingress_annotations = json.loads(conf.get('K8S_INGRESS_ANNOTATIONS'))
        ingress_path = '/tensorboards/{}'.format(
            self.project_name.replace('.', '/'))
        backend = {
            'serviceName': resource_name,
            'servicePort': exposed_ports[0]
        }
        ingress_spec = ingresses.get_ingress(
            namespace=self.namespace,
            name=resource_name,
            labels=self.resource_manager.get_labels(),
            annotations=ingress_annotations,
            paths=[{'path': ingress_path, 'backend': backend}])
        self.create_or_update_ingress(name=resource_name, data=ingress_spec)

        return results
    def start_tensorboard(self,
                          image,
                          outputs_path,
                          persistence_outputs,
                          outputs_specs=None,
                          outputs_refs_jobs=None,
                          outputs_refs_experiments=None,
                          resources=None,
                          node_selector=None,
                          affinity=None,
                          tolerations=None):
        """Create or update the tensorboard deployment and its service.

        Args:
            image: container image forwarded to the deployment builder.
            outputs_path: value handed to tensorboard's ``--logdir`` flag.
            persistence_outputs: passed to the outputs/refs volume helpers.
            outputs_specs: used both as an outputs refs group and as the
                source of the stores secrets.
            outputs_refs_jobs: jobs outputs refs to build volumes for.
            outputs_refs_experiments: experiments outputs refs to build
                volumes for.
            resources: forwarded unchanged to the deployment builder.
            node_selector: resolved through ``get_node_selector`` together
                with ``settings.NODE_SELECTOR_TENSORBOARDS``.
            affinity: resolved through ``get_affinity`` together with
                ``settings.AFFINITY_TENSORBOARDS``.
            tolerations: resolved through ``get_tolerations`` together with
                ``settings.TOLERATIONS_TENSORBOARDS``.

        Returns:
            A dict holding the ``to_dict()`` form of the deployment response
            under ``'deployment'`` and of the service response under
            ``'service'``.
        """
        exposed_ports = [self.request_tensorboard_port()]
        container_ports = [self.PORT]

        volumes, volume_mounts = get_pod_outputs_volume(persistence_outputs)
        # Refs groups are processed in a fixed order: jobs, specs, experiments.
        refs_groups = (outputs_refs_jobs, outputs_specs,
                       outputs_refs_experiments)
        for outputs_refs in refs_groups:
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=persistence_outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts

        # Volumes for the secrets of the persistence outputs stores.
        stores_secrets = get_stores_secrets(specs=outputs_specs)
        self.validate_stores_secrets_keys(stores_secrets=stores_secrets)
        extra_volumes, extra_mounts = self.get_stores_secrets_volumes(
            stores_secrets=stores_secrets)
        volumes += extra_volumes
        volume_mounts += extra_mounts

        # The stores auth commands run first, tensorboard runs last.
        shell_steps = self.get_stores_secrets_command_args(
            stores_secrets=stores_secrets)
        shell_steps.append("tensorboard --logdir={} --port={}".format(
            outputs_path, self.PORT))

        # Scheduling constraints, each resolved against its settings default.
        node_selector = get_node_selector(
            node_selector=node_selector,
            default_node_selector=settings.NODE_SELECTOR_TENSORBOARDS)
        affinity = get_affinity(
            affinity=affinity,
            default_affinity=settings.AFFINITY_TENSORBOARDS)
        tolerations = get_tolerations(
            tolerations=tolerations,
            default_tolerations=settings.TOLERATIONS_TENSORBOARDS)

        deployment_spec = deployments.get_deployment(
            namespace=self.namespace,
            app=settings.APP_LABELS_TENSORBOARD,
            name=TENSORBOARD_JOB_NAME,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=image,
            command=["/bin/sh", "-c"],
            # All steps are chained with `&&` into a single shell argument.
            args=[' && '.join(shell_steps)],
            ports=container_ports,
            container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
            resources=resources,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_RUNNER)
        deployment_name = JOB_NAME_FORMAT.format(
            name=TENSORBOARD_JOB_NAME, job_uuid=self.job_uuid)
        deployment_labels = deployments.get_labels(
            app=settings.APP_LABELS_TENSORBOARD,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_RUNNER)

        deployment_resp, _ = self.create_or_update_deployment(
            name=deployment_name, data=deployment_spec)
        service_spec = services.get_service(
            namespace=self.namespace,
            name=deployment_name,
            labels=deployment_labels,
            ports=exposed_ports,
            target_ports=container_ports,
            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(
            name=deployment_name, data=service_spec)

        results = {
            'deployment': deployment_resp.to_dict(),
            'service': service_resp.to_dict()
        }

        if not self._use_ingress():
            return results

        # Ingress routing: the dots of the project name become path segments.
        ingress_annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
        ingress_path = '/tensorboard/{}'.format(
            self.project_name.replace('.', '/'))
        backend = {
            'serviceName': deployment_name,
            'servicePort': exposed_ports[0]
        }
        ingress_spec = ingresses.get_ingress(
            namespace=self.namespace,
            name=deployment_name,
            labels=deployment_labels,
            annotations=ingress_annotations,
            paths=[{'path': ingress_path, 'backend': backend}])
        self.create_or_update_ingress(name=deployment_name, data=ingress_spec)

        return results
    def start_tensorboard(self,
                          outputs_path,
                          persistence_outputs,
                          outputs_specs=None,
                          outputs_refs_jobs=None,
                          outputs_refs_experiments=None,
                          resources=None,
                          labels=None,
                          annotations=None,
                          node_selector=None,
                          affinity=None,
                          tolerations=None,
                          max_restarts=None,
                          reconcile_url=None):
        """Create or update the tensorboard deployment and its service.

        Tensorboard is started with a ``--path_prefix`` equal to the proxy
        url returned by ``self._get_proxy_url``. Every create/update call is
        made with ``reraise=True``.

        Args:
            outputs_path: value handed to tensorboard's ``--logdir`` flag.
            persistence_outputs: passed to the outputs/refs volume helpers
                and to the deployment builder.
            outputs_specs: used both as an outputs refs group and as the
                source of the stores secrets.
            outputs_refs_jobs: jobs outputs refs to build volumes for.
            outputs_refs_experiments: experiments outputs refs to build
                volumes for.
            resources: forwarded unchanged to the deployment builder.
            labels: combined with ``self.resource_manager.labels`` through
                ``get_labels`` before being given to the deployment builder.
            annotations: forwarded unchanged to the deployment builder.
            node_selector: forwarded unchanged to the deployment builder.
            affinity: forwarded unchanged to the deployment builder.
            tolerations: forwarded unchanged to the deployment builder.
            max_restarts: forwarded to the deployment builder and used to
                derive the restart policy via
                ``get_deployment_restart_policy``.
            reconcile_url: forwarded unchanged to the deployment builder.

        Returns:
            A dict holding the ``to_dict()`` form of the deployment response
            under ``'deployment'`` and of the service response under
            ``'service'``.
        """
        exposed_ports = [self.request_tensorboard_port()]
        container_ports = [self.port]

        volumes, volume_mounts = get_pod_outputs_volume(persistence_outputs)
        # Refs groups are processed in a fixed order: jobs, specs, experiments.
        refs_groups = (outputs_refs_jobs, outputs_specs,
                       outputs_refs_experiments)
        for outputs_refs in refs_groups:
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=persistence_outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts

        # Volumes for the secrets of the persistence outputs stores.
        stores_secrets = get_stores_secrets(specs=outputs_specs)
        self.validate_stores_secrets_keys(stores_secrets=stores_secrets)
        extra_volumes, extra_mounts = self.get_stores_secrets_volumes(
            stores_secrets=stores_secrets)
        volumes += extra_volumes
        volume_mounts += extra_mounts

        resource_name = self.resource_manager.get_resource_name()
        tensorboard_url = self._get_proxy_url(
            namespace=self.namespace,
            job_name=TENSORBOARD_JOB_NAME,
            deployment_name=resource_name)

        # The stores auth commands run first, tensorboard runs last; the
        # steps are chained with `&&` into a single shell argument.
        shell_steps = self.get_stores_secrets_command_args(
            stores_secrets=stores_secrets)
        tensorboard_template = ("tensorboard --logdir={log_dir} --port={port} "
                                "--path_prefix={path_prefix}")
        shell_steps.append(tensorboard_template.format(
            log_dir=outputs_path,
            port=self.port,
            path_prefix=tensorboard_url))
        shell_args = [' && '.join(shell_steps)]
        shell_command = ["/bin/sh", "-c"]

        merged_labels = get_labels(
            default_labels=self.resource_manager.labels, labels=labels)
        deployment_spec = self.resource_manager.get_deployment(
            resource_name=resource_name,
            volume_mounts=volume_mounts,
            volumes=volumes,
            labels=merged_labels,
            env_vars=None,
            command=shell_command,
            args=shell_args,
            persistence_outputs=persistence_outputs,
            outputs_refs_jobs=outputs_refs_jobs,
            outputs_refs_experiments=outputs_refs_experiments,
            resources=resources,
            annotations=annotations,
            ephemeral_token=None,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            ports=container_ports,
            reconcile_url=reconcile_url,
            max_restarts=max_restarts,
            restart_policy=get_deployment_restart_policy(max_restarts))
        deployment_resp, _ = self.create_or_update_deployment(
            name=resource_name, body=deployment_spec, reraise=True)

        service_spec = services.get_service(
            namespace=self.namespace,
            name=resource_name,
            labels=self.resource_manager.get_labels(),
            ports=exposed_ports,
            target_ports=container_ports,
            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(
            name=resource_name, body=service_spec, reraise=True)

        results = {
            'deployment': deployment_resp.to_dict(),
            'service': service_resp.to_dict()
        }

        if not self._use_ingress():
            return results

        # The ingress annotations come from the configuration, not from the
        # `annotations` argument (which only applies to the deployment).
        ingress_annotations = json.loads(conf.get(K8S_INGRESS_ANNOTATIONS))
        ingress_path = '/tensorboards/{}'.format(
            self.project_name.replace('.', '/'))
        backend = {
            'serviceName': resource_name,
            'servicePort': exposed_ports[0]
        }
        ingress_spec = ingresses.get_ingress(
            namespace=self.namespace,
            name=resource_name,
            labels=self.resource_manager.get_labels(),
            annotations=ingress_annotations,
            paths=[{'path': ingress_path, 'backend': backend}])
        self.create_or_update_ingress(
            name=resource_name, body=ingress_spec, reraise=True)

        return results
    def start_tensorboard(self, image, resources=None, node_selectors=None):
        """Create or update the tensorboard deployment and its service.

        The log directory is the project outputs path returned by
        ``get_project_outputs_path`` for ``self.project_name``.

        Args:
            image: container image forwarded to the deployment builder.
            resources: forwarded unchanged to the deployment builder.
            node_selectors: forwarded to the deployment builder as its
                ``node_selector`` argument.

        Returns:
            A dict holding the ``to_dict()`` form of the deployment response
            under ``'deployment'`` and of the service response under
            ``'service'``.
        """
        exposed_ports = [self.request_tensorboard_port()]
        container_ports = [self.PORT]
        volumes, volume_mounts = get_pod_volumes()
        outputs_path = get_project_outputs_path(
            project_name=self.project_name)

        deployment_spec = deployments.get_deployment(
            namespace=self.namespace,
            app=settings.APP_LABELS_TENSORBOARD,
            name=self.TENSORBOARD_JOB_NAME,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=image,
            command=["/bin/sh", "-c"],
            args=[
                "tensorboard --logdir={} --port={}".format(
                    outputs_path, self.PORT)
            ],
            ports=container_ports,
            container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
            resources=resources,
            node_selector=node_selectors,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_EXPERIMENT)
        deployment_name = constants.JOB_NAME.format(
            name=self.TENSORBOARD_JOB_NAME, job_uuid=self.job_uuid)
        deployment_labels = deployments.get_labels(
            app=settings.APP_LABELS_TENSORBOARD,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_EXPERIMENT)

        deployment_resp, _ = self.create_or_update_deployment(
            name=deployment_name, data=deployment_spec)
        service_spec = services.get_service(
            namespace=self.namespace,
            name=deployment_name,
            labels=deployment_labels,
            ports=exposed_ports,
            target_ports=container_ports,
            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(
            name=deployment_name, data=service_spec)

        results = {
            'deployment': deployment_resp.to_dict(),
            'service': service_resp.to_dict()
        }

        if not self._use_ingress():
            return results

        # Ingress routing: the dots of the project name become path segments.
        ingress_annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
        ingress_path = '/tensorboard/{}'.format(
            self.project_name.replace('.', '/'))
        backend = {
            'serviceName': deployment_name,
            'servicePort': exposed_ports[0]
        }
        ingress_spec = ingresses.get_ingress(
            namespace=self.namespace,
            name=deployment_name,
            labels=deployment_labels,
            annotations=ingress_annotations,
            paths=[{'path': ingress_path, 'backend': backend}])
        self.create_or_update_ingress(name=deployment_name, data=ingress_spec)

        return results
Beispiel #14
0
    def start_notebook(self,
                       image,
                       persistence_outputs=None,
                       persistence_data=None,
                       outputs_refs_jobs=None,
                       outputs_refs_experiments=None,
                       resources=None,
                       node_selectors=None,
                       allow_commits=False):
        """Create or update the notebook deployment and its service.

        Args:
            image: container image forwarded to the deployment builder.
            persistence_outputs: passed to the volume helpers and used to
                compute the notebook outputs path.
            persistence_data: passed to ``get_pod_volumes`` and used to
                compute the data paths.
            outputs_refs_jobs: jobs outputs refs to build volumes for; also
                given to ``get_job_env_vars``.
            outputs_refs_experiments: experiments outputs refs to build
                volumes for; also given to ``get_job_env_vars``.
            resources: forwarded unchanged to the deployment builder.
            node_selectors: forwarded to the deployment builder as its
                ``node_selector`` argument.
            allow_commits: forwarded to ``self.get_notebook_args``.

        Returns:
            A dict holding the ``to_dict()`` form of the deployment response
            under ``'deployment'`` and of the service response under
            ``'service'``.
        """
        exposed_ports = [self.request_notebook_port()]
        container_ports = [self.PORT]

        volumes, volume_mounts = get_pod_volumes(
            persistence_outputs=persistence_outputs,
            persistence_data=persistence_data)
        # Refs groups are processed in a fixed order: jobs, then experiments.
        for outputs_refs in (outputs_refs_jobs, outputs_refs_experiments):
            extra_volumes, extra_mounts = get_pod_refs_outputs_volumes(
                outputs_refs=outputs_refs,
                persistence_outputs=persistence_outputs)
            volumes += extra_volumes
            volume_mounts += extra_mounts

        # Environment handed to the notebook container.
        notebook_outputs_path = get_notebook_job_outputs_path(
            persistence_outputs=persistence_outputs,
            notebook_job=self.job_name)
        data_paths = get_data_paths(persistence_data)
        env_vars = get_job_env_vars(
            outputs_path=notebook_outputs_path,
            data_paths=data_paths,
            outputs_refs_jobs=outputs_refs_jobs,
            outputs_refs_experiments=outputs_refs_experiments)

        # The code volume is added last, after every outputs/data volume.
        code_volume, code_volume_mount = self.get_notebook_code_volume()
        volumes.append(code_volume)
        volume_mounts.append(code_volume_mount)

        deployment_name = constants.JOB_NAME.format(
            name=self.NOTEBOOK_JOB_NAME, job_uuid=self.job_uuid)
        deployment_spec = deployments.get_deployment(
            namespace=self.namespace,
            app=settings.APP_LABELS_NOTEBOOK,
            name=self.NOTEBOOK_JOB_NAME,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=image,
            command=["/bin/sh", "-c"],
            args=self.get_notebook_args(
                deployment_name=deployment_name,
                ports=exposed_ports,
                allow_commits=allow_commits),
            ports=container_ports,
            container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
            env_vars=env_vars,
            resources=resources,
            node_selector=node_selectors,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_EXPERIMENT)
        deployment_labels = deployments.get_labels(
            app=settings.APP_LABELS_NOTEBOOK,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_EXPERIMENT)

        deployment_resp, _ = self.create_or_update_deployment(
            name=deployment_name, data=deployment_spec)
        service_spec = services.get_service(
            namespace=self.namespace,
            name=deployment_name,
            labels=deployment_labels,
            ports=exposed_ports,
            target_ports=container_ports,
            service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(
            name=deployment_name, data=service_spec)

        results = {
            'deployment': deployment_resp.to_dict(),
            'service': service_resp.to_dict()
        }

        if not self._use_ingress():
            return results

        # Ingress routing: the dots of the project name become path segments.
        ingress_annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
        ingress_path = '/notebook/{}'.format(
            self.project_name.replace('.', '/'))
        backend = {
            'serviceName': deployment_name,
            'servicePort': exposed_ports[0]
        }
        ingress_spec = ingresses.get_ingress(
            namespace=self.namespace,
            name=deployment_name,
            labels=deployment_labels,
            annotations=ingress_annotations,
            paths=[{'path': ingress_path, 'backend': backend}])
        self.create_or_update_ingress(name=deployment_name, data=ingress_spec)
        return results