Ejemplo n.º 1
0
 def get_pod_node_selector(self):
     assert get_node_selector(None, None) is None
     assert get_node_selector({'foo': 'bar'}, None) == {'foo': 'bar'}
     assert get_node_selector(None, '{"foo": "bar"}') == {'foo': 'bar'}
     assert get_node_selector({'foo': 'bar'}, '{"foo": "moo"}') == {
         'foo': 'bar'
     }
Ejemplo n.º 2
0
    def get_task_pod_spec(self,
                          volume_mounts,
                          volumes,
                          persistence_outputs=None,
                          persistence_data=None,
                          outputs_refs_jobs=None,
                          outputs_refs_experiments=None,
                          env_vars=None,
                          command=None,
                          args=None,
                          resources=None,
                          node_selector=None,
                          affinity=None,
                          tolerations=None,
                          restart_policy='OnFailure'):
        """Pod spec to be used to create pods for tasks: master, worker, ps."""
        volume_mounts = get_list(volume_mounts)
        volumes = get_list(volumes)

        gpu_volume_mounts, gpu_volumes = get_gpu_volumes_def(resources)
        volume_mounts += gpu_volume_mounts
        volumes += gpu_volumes

        pod_container = self.get_pod_container(volume_mounts=volume_mounts,
                                               persistence_outputs=persistence_outputs,
                                               persistence_data=persistence_data,
                                               outputs_refs_jobs=outputs_refs_jobs,
                                               outputs_refs_experiments=outputs_refs_experiments,
                                               env_vars=env_vars,
                                               command=command,
                                               args=args,
                                               resources=resources)

        containers = [pod_container]
        if self.use_sidecar:
            sidecar_container = self.get_sidecar_container()
            containers.append(sidecar_container)

        node_selector = get_node_selector(
            node_selector=node_selector,
            default_node_selector=settings.NODE_SELECTOR_JOBS)
        affinity = get_affinity(
            affinity=affinity,
            default_affinity=settings.AFFINITY_JOBS)
        tolerations = get_tolerations(
            tolerations=tolerations,
            default_tolerations=settings.TOLERATIONS_JOBS)

        service_account_name = None
        if settings.K8S_RBAC_ENABLED:
            service_account_name = settings.K8S_SERVICE_ACCOUNT_NAME
        return client.V1PodSpec(
            restart_policy=restart_policy,
            service_account_name=service_account_name,
            init_containers=to_list(self.get_init_container(persistence_outputs)),
            containers=containers,
            volumes=volumes,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations)
Ejemplo n.º 3
0
    def start_dockerizer(self,
                         resources=None,
                         node_selector=None,
                         affinity=None,
                         tolerations=None):
        volumes, volume_mounts = get_docker_volumes()

        node_selector = get_node_selector(
            node_selector=node_selector,
            default_node_selector=conf.get('NODE_SELECTOR_BUILDS'))
        affinity = get_affinity(
            affinity=affinity,
            default_affinity=conf.get('AFFINITY_BUILDS'))
        tolerations = get_tolerations(
            tolerations=tolerations,
            default_tolerations=conf.get('TOLERATIONS_BUILDS'))
        pod = pods.get_pod(
            namespace=self.namespace,
            app=conf.get('APP_LABELS_DOCKERIZER'),
            name=DOCKERIZER_JOB_NAME,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=conf.get('JOB_DOCKERIZER_IMAGE'),
            image_pull_policy=conf.get('JOB_DOCKERIZER_IMAGE_PULL_POLICY'),
            command=None,
            args=[self.job_uuid],
            ports=[],
            env_vars=self.get_env_vars(),
            container_name=conf.get('CONTAINER_NAME_DOCKERIZER_JOB'),
            resources=resources,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            role=conf.get('ROLE_LABELS_WORKER'),
            type=conf.get('TYPE_LABELS_RUNNER'),
            service_account_name=conf.get('K8S_SERVICE_ACCOUNT_BUILDS'),
            restart_policy='Never')
        pod_name = JOB_NAME_FORMAT.format(job_uuid=self.job_uuid, name=DOCKERIZER_JOB_NAME)

        pod_resp, _ = self.create_or_update_pod(name=pod_name, data=pod)
        return pod_resp.to_dict()
Ejemplo n.º 4
0
    def start_dockerizer(self,
                         resources=None,
                         node_selector=None,
                         affinity=None,
                         tolerations=None):
        volumes, volume_mounts = get_docker_volumes()

        node_selector = get_node_selector(
            node_selector=node_selector,
            default_node_selector=settings.NODE_SELECTOR_BUILDS)
        affinity = get_affinity(
            affinity=affinity,
            default_affinity=settings.AFFINITY_BUILDS)
        tolerations = get_tolerations(
            tolerations=tolerations,
            default_tolerations=settings.TOLERATIONS_BUILDS)
        deployment = pods.get_pod(
            namespace=self.namespace,
            app=settings.APP_LABELS_DOCKERIZER,
            name=self.DOCKERIZER_JOB_NAME,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=settings.JOB_DOCKERIZER_IMAGE,
            command=None,
            args=[self.job_uuid],
            ports=[],
            env_vars=self.get_env_vars(),
            container_name=settings.CONTAINER_NAME_DOCKERIZER_JOB,
            resources=resources,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            role=settings.ROLE_LABELS_WORKER,
            type=settings.TYPE_LABELS_RUNNER,
            restart_policy='Never')
        pod_name = constants.JOB_NAME.format(
            job_uuid=self.job_uuid, name=self.DOCKERIZER_JOB_NAME)

        pod_resp, _ = self.create_or_update_pod(name=pod_name, data=deployment)
        return pod_resp.to_dict()
Ejemplo n.º 5
0
 def _get_node_selector(self, node_selector):
     return get_node_selector(
         node_selector=node_selector,
         default_node_selector=conf.get(NODE_SELECTORS_EXPERIMENTS))
Ejemplo n.º 6
0
    def start_notebook(self,
                       image,
                       persistence_outputs=None,
                       persistence_data=None,
                       outputs_refs_jobs=None,
                       outputs_refs_experiments=None,
                       resources=None,
                       secret_refs=None,
                       configmap_refs=None,
                       node_selector=None,
                       affinity=None,
                       tolerations=None,
                       allow_commits=False):
        ports = [self.request_notebook_port()]
        target_ports = [self.PORT]
        volumes, volume_mounts = get_pod_volumes(
            persistence_outputs=persistence_outputs,
            persistence_data=persistence_data)
        refs_volumes, refs_volume_mounts = get_pod_refs_outputs_volumes(
            outputs_refs=outputs_refs_jobs,
            persistence_outputs=persistence_outputs)
        volumes += refs_volumes
        volume_mounts += refs_volume_mounts
        refs_volumes, refs_volume_mounts = get_pod_refs_outputs_volumes(
            outputs_refs=outputs_refs_experiments,
            persistence_outputs=persistence_outputs)
        volumes += refs_volumes
        volume_mounts += refs_volume_mounts
        shm_volumes, shm_volume_mounts = get_shm_volumes()
        volumes += shm_volumes
        volume_mounts += shm_volume_mounts
        env_vars = get_job_env_vars(
            persistence_outputs=persistence_outputs,
            outputs_path=get_notebook_job_outputs_path(
                persistence_outputs=persistence_outputs,
                notebook_job=self.job_name),
            persistence_data=persistence_data,
            outputs_refs_jobs=outputs_refs_jobs,
            outputs_refs_experiments=outputs_refs_experiments)
        secret_refs = validate_secret_refs(secret_refs)
        configmap_refs = validate_configmap_refs(configmap_refs)
        env_from = get_pod_env_from(secret_refs=secret_refs,
                                    configmap_refs=configmap_refs)
        code_volume, code_volume_mount = self.get_notebook_code_volume()
        volumes.append(code_volume)
        volume_mounts.append(code_volume_mount)
        deployment_name = JOB_NAME_FORMAT.format(name=NOTEBOOK_JOB_NAME,
                                                 job_uuid=self.job_uuid)

        node_selector = get_node_selector(
            node_selector=node_selector,
            default_node_selector=settings.NODE_SELECTOR_EXPERIMENTS)
        affinity = get_affinity(affinity=affinity,
                                default_affinity=settings.AFFINITY_EXPERIMENTS)
        tolerations = get_tolerations(
            tolerations=tolerations,
            default_tolerations=settings.TOLERATIONS_EXPERIMENTS)
        deployment = deployments.get_deployment(
            namespace=self.namespace,
            app=settings.APP_LABELS_NOTEBOOK,
            name=NOTEBOOK_JOB_NAME,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=image,
            command=["/bin/sh", "-c"],
            args=self.get_notebook_args(deployment_name=deployment_name,
                                        ports=ports,
                                        allow_commits=allow_commits),
            ports=target_ports,
            container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
            env_vars=env_vars,
            env_from=env_from,
            resources=resources,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_RUNNER,
            service_account_name=settings.K8S_SERVICE_ACCOUNT_EXPERIMENTS)
        deployment_labels = deployments.get_labels(
            app=settings.APP_LABELS_NOTEBOOK,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_RUNNER)
        dep_resp, _ = self.create_or_update_deployment(name=deployment_name,
                                                       data=deployment)
        service = services.get_service(namespace=self.namespace,
                                       name=deployment_name,
                                       labels=deployment_labels,
                                       ports=ports,
                                       target_ports=target_ports,
                                       service_type=self._get_service_type())

        service_resp, _ = self.create_or_update_service(name=deployment_name,
                                                        data=service)
        results = {
            'deployment': dep_resp.to_dict(),
            'service': service_resp.to_dict()
        }

        if self._use_ingress():
            annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
            paths = [{
                'path':
                '/notebook/{}'.format(self.project_name.replace('.', '/')),
                'backend': {
                    'serviceName': deployment_name,
                    'servicePort': ports[0]
                }
            }]
            ingress = ingresses.get_ingress(namespace=self.namespace,
                                            name=deployment_name,
                                            labels=deployment_labels,
                                            annotations=annotations,
                                            paths=paths)
            self.create_or_update_ingress(name=deployment_name, data=ingress)
        return results
Ejemplo n.º 7
0
 def _get_node_selector(self, node_selector):
     return get_node_selector(
         node_selector=node_selector,
         default_node_selector=conf.get('NODE_SELECTOR_JOBS'))
Ejemplo n.º 8
0
 def _get_node_selector(self, node_selector):
     return get_node_selector(
         node_selector=node_selector,
         default_node_selector=conf.get(NODE_SELECTORS_TENSORBOARDS))
Ejemplo n.º 9
0
    def get_task_pod_spec(self,
                          task_type,
                          task_idx,
                          volume_mounts,
                          volumes,
                          env_vars=None,
                          command=None,
                          args=None,
                          sidecar_args=None,
                          persistence_outputs=None,
                          persistence_data=None,
                          outputs_refs_jobs=None,
                          outputs_refs_experiments=None,
                          resources=None,
                          node_selector=None,
                          affinity=None,
                          tolerations=None,
                          restart_policy='OnFailure'):
        """Pod spec to be used to create pods for tasks: master, worker, ps."""
        volume_mounts = get_list(volume_mounts)
        volumes = get_list(volumes)

        gpu_volume_mounts, gpu_volumes = get_gpu_volumes_def(resources)
        volume_mounts += gpu_volume_mounts
        volumes += gpu_volumes

        # Add job information
        env_vars = get_list(env_vars)
        env_vars.append(
            client.V1EnvVar(
                name=constants.CONFIG_MAP_TASK_INFO_KEY_NAME,
                value=json.dumps({'type': task_type, 'index': task_idx})
            )
        )

        pod_container = self.get_pod_container(volume_mounts=volume_mounts,
                                               env_vars=env_vars,
                                               command=command,
                                               args=args,
                                               persistence_outputs=persistence_outputs,
                                               persistence_data=persistence_data,
                                               outputs_refs_jobs=outputs_refs_jobs,
                                               outputs_refs_experiments=outputs_refs_experiments,
                                               resources=resources)

        containers = [pod_container]
        if self.use_sidecar:
            sidecar_container = self.get_sidecar_container(task_type=task_type,
                                                           task_idx=task_idx,
                                                           args=sidecar_args)
            containers.append(sidecar_container)

        node_selector = get_node_selector(
            node_selector=node_selector,
            default_node_selector=settings.NODE_SELECTOR_EXPERIMENTS)
        affinity = get_affinity(
            affinity=affinity,
            default_affinity=settings.AFFINITY_EXPERIMENTS)
        tolerations = get_tolerations(
            tolerations=tolerations,
            default_tolerations=settings.TOLERATIONS_EXPERIMENTS)
        service_account_name = None
        if settings.K8S_RBAC_ENABLED:
            service_account_name = settings.K8S_SERVICE_ACCOUNT_NAME
        return client.V1PodSpec(
            restart_policy=restart_policy,
            service_account_name=service_account_name,
            init_containers=to_list(self.get_init_container(persistence_outputs)),
            containers=containers,
            volumes=volumes,
            node_selector=node_selector,
            tolerations=tolerations,
            affinity=affinity)
Ejemplo n.º 10
0
 def _get_node_selector(self, node_selector):
     return get_node_selector(
         node_selector=node_selector,
         default_node_selector=conf.get(NODE_SELECTORS_BUILD_JOBS))
Ejemplo n.º 11
0
 def _get_node_selector(self, node_selector):
     return get_node_selector(
         node_selector=node_selector,
         default_node_selector=conf.get(NODE_SELECTORS_NOTEBOOKS))
Ejemplo n.º 12
0
    def start_tensorboard(self,
                          image,
                          outputs_path,
                          persistence_outputs,
                          outputs_specs=None,
                          outputs_refs_jobs=None,
                          outputs_refs_experiments=None,
                          resources=None,
                          node_selector=None,
                          affinity=None,
                          tolerations=None):
        ports = [self.request_tensorboard_port()]
        target_ports = [self.PORT]
        volumes, volume_mounts = get_pod_outputs_volume(persistence_outputs)
        refs_volumes, refs_volume_mounts = get_pod_refs_outputs_volumes(
            outputs_refs=outputs_refs_jobs,
            persistence_outputs=persistence_outputs)
        volumes += refs_volumes
        volume_mounts += refs_volume_mounts
        refs_volumes, refs_volume_mounts = get_pod_refs_outputs_volumes(
            outputs_refs=outputs_specs,
            persistence_outputs=persistence_outputs)
        volumes += refs_volumes
        volume_mounts += refs_volume_mounts
        refs_volumes, refs_volume_mounts = get_pod_refs_outputs_volumes(
            outputs_refs=outputs_refs_experiments,
            persistence_outputs=persistence_outputs)
        volumes += refs_volumes
        volume_mounts += refs_volume_mounts

        # Add volumes for persistence outputs secrets
        stores_secrets = get_stores_secrets(specs=outputs_specs)
        self.validate_stores_secrets_keys(stores_secrets=stores_secrets)
        secrets_volumes, secrets_volume_mounts = self.get_stores_secrets_volumes(
            stores_secrets=stores_secrets)
        volumes += secrets_volumes
        volume_mounts += secrets_volume_mounts

        # Get persistence outputs secrets auth commands
        command_args = self.get_stores_secrets_command_args(
            stores_secrets=stores_secrets)
        command_args.append("tensorboard --logdir={} --port={}".format(
            outputs_path, self.PORT))

        node_selector = get_node_selector(
            node_selector=node_selector,
            default_node_selector=settings.NODE_SELECTOR_TENSORBOARDS)
        affinity = get_affinity(
            affinity=affinity, default_affinity=settings.AFFINITY_TENSORBOARDS)
        tolerations = get_tolerations(
            tolerations=tolerations,
            default_tolerations=settings.TOLERATIONS_TENSORBOARDS)
        deployment = deployments.get_deployment(
            namespace=self.namespace,
            app=settings.APP_LABELS_TENSORBOARD,
            name=TENSORBOARD_JOB_NAME,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            volume_mounts=volume_mounts,
            volumes=volumes,
            image=image,
            command=["/bin/sh", "-c"],
            args=[' && '.join(command_args)],
            ports=target_ports,
            container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
            resources=resources,
            node_selector=node_selector,
            affinity=affinity,
            tolerations=tolerations,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_RUNNER)
        deployment_name = JOB_NAME_FORMAT.format(name=TENSORBOARD_JOB_NAME,
                                                 job_uuid=self.job_uuid)
        deployment_labels = deployments.get_labels(
            app=settings.APP_LABELS_TENSORBOARD,
            project_name=self.project_name,
            project_uuid=self.project_uuid,
            job_name=self.job_name,
            job_uuid=self.job_uuid,
            role=settings.ROLE_LABELS_DASHBOARD,
            type=settings.TYPE_LABELS_RUNNER)

        dep_resp, _ = self.create_or_update_deployment(name=deployment_name,
                                                       data=deployment)
        service = services.get_service(namespace=self.namespace,
                                       name=deployment_name,
                                       labels=deployment_labels,
                                       ports=ports,
                                       target_ports=target_ports,
                                       service_type=self._get_service_type())
        service_resp, _ = self.create_or_update_service(name=deployment_name,
                                                        data=service)
        results = {
            'deployment': dep_resp.to_dict(),
            'service': service_resp.to_dict()
        }

        if self._use_ingress():
            annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
            paths = [{
                'path':
                '/tensorboard/{}'.format(self.project_name.replace('.', '/')),
                'backend': {
                    'serviceName': deployment_name,
                    'servicePort': ports[0]
                }
            }]
            ingress = ingresses.get_ingress(namespace=self.namespace,
                                            name=deployment_name,
                                            labels=deployment_labels,
                                            annotations=annotations,
                                            paths=paths)
            self.create_or_update_ingress(name=deployment_name, data=ingress)

        return results