async def _create_lab_environment_configmap(
    self, spawner: KubeSpawner, template_values: Dict[str, Any]
) -> None:
    """Create the ConfigMap that holds environment settings for the lab.

    Each configured environment entry is a template rendered against
    ``template_values``; the rendered mapping becomes the ConfigMap data.
    """
    # Render every configured lab environment variable template.
    environment = {
        variable: Template(template).render(template_values)
        for variable, template in self.nublado_config.lab_environment.items()
    }
    self.log.debug(f"Creating environment ConfigMap with {environment}")
    body = client.V1ConfigMap(
        api_version="v1",
        kind="ConfigMap",
        metadata=client.V1ObjectMeta(
            name="lab-environment",
            namespace=spawner.namespace,
            annotations=spawner.extra_annotations,
            labels=spawner.extra_labels,
        ),
        data=environment,
    )
    # Retry the create with exponential backoff up to the spawner's
    # configured k8s request timeout.
    await exponential_backoff(
        partial(spawner._make_create_resource_request, "config_map", body),
        f"Could not create ConfigMap {spawner.namespace}/lab-environment",
        timeout=spawner.k8s_api_request_retry_timeout,
    )
def clean_service_template(service_template):
    """Return a normalized deep copy of a service template.

    Ensures ``metadata`` and ``metadata.labels`` exist on the copy so that
    later code can mutate them without repeated ``is None`` checks.  The
    caller's object is never modified.
    """
    normalized = copy.deepcopy(service_template)
    if normalized.metadata is None:
        normalized.metadata = client.V1ObjectMeta()
    if normalized.metadata.labels is None:
        normalized.metadata.labels = {}
    return normalized
def default_cluster_agent_deployment():
    """Build the default Epsagon cluster-agent Deployment object."""
    labels = {'app': 'epsagon-cluster-agent'}
    agent_container = client.V1Container(
        name='cluster-agent',
        image='epsagon/cluster-agent:test',
        # required for pulling from the docker local loaded images
        # and not from Epsagon remote hub
        image_pull_policy='Never',
        env=[
            client.V1EnvVar(name='EPSAGON_TOKEN', value='123'),
            client.V1EnvVar(name='EPSAGON_CLUSTER_NAME', value='test'),
            client.V1EnvVar(name='EPSAGON_DEBUG', value='false'),
            client.V1EnvVar(
                name='EPSAGON_COLLECTOR_URL',
                value='http://localhost:5000'
            ),
        ],
    )
    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=labels.copy()),
        spec=client.V1PodSpec(
            service_account_name='cluster-agent',
            containers=[agent_container],
        ),
    )
    return client.V1Deployment(
        api_version='apps/v1',
        kind='Deployment',
        metadata=client.V1ObjectMeta(
            name='cluster-agent',
            namespace='epsagon-monitoring',
        ),
        spec=client.V1DeploymentSpec(
            selector=client.V1LabelSelector(match_labels=labels.copy()),
            replicas=1,
            template=pod_template,
        ),
    )
def create_cluster(**kwargs):
    """Create a dask ``KubeCluster`` sized from the enclosing task's inputs."""
    inputs = task['inputs']
    workers = inputs.get('workers', 0)
    cpu = inputs.get('worker_cores', 2)
    memory = inputs.get('worker_memory', 2)
    image = inputs.get('worker_image', 'daskdev/dask:latest')

    # Same request/limit so the worker pods are never over-committed.
    resources = {
        'cpu': str(cpu),
        'memory': str(memory),
    }
    worker_container = client.V1Container(
        name='dask',
        image=image,
        args=[
            'dask-worker',
            '--nthreads', str(cpu_to_threads(cpu)),
            '--no-bokeh',
            # NOTE(review): with the numeric default memory=2 this renders
            # as '2B' (two bytes); presumably callers pass a quantity string
            # like '2G' — confirm against the task input schema.
            '--memory-limit', f'{memory}B',
            '--death-timeout', '60',
        ],
        resources=client.V1ResourceRequirements(
            limits=resources,
            requests=resources,
        ),
    )
    worker_pod = client.V1Pod(
        metadata=client.V1ObjectMeta(
            labels={
                'cowait/task': 'worker-' + task.get('id'),
                'cowait/parent': task.get('id'),
            },
        ),
        spec=client.V1PodSpec(
            restart_policy='Never',
            image_pull_secrets=[
                client.V1LocalObjectReference(name=secret)
                for secret in inputs.get('pull_secrets', ['docker'])
            ],
            containers=[worker_container],
        ),
    )
    return KubeCluster(
        pod_template=worker_pod,
        n_workers=workers,
    )
async def deploy_resource(
    self, resource_attributes: AttributeDict
) -> AttributeDict:
    """Deploy a drone as a Kubernetes Deployment, optionally with an HPA.

    Builds a single-replica Deployment named after the drone uuid, creates
    it in the configured namespace, and — if HPA is enabled for this
    machine type — attaches a HorizontalPodAutoscaler targeting it.
    Returns the adapter's processed response for the created Deployment.
    """
    # Per-drone environment derived from the drone uuid and the machine
    # meta data translation mapping; exposed to the container as
    # TardisDrone<Key> variables below.
    drone_environment = self.drone_environment(
        resource_attributes.drone_uuid,
        resource_attributes.obs_machine_meta_data_translation_mapping,
    )
    spec = k8s_client.V1DeploymentSpec(
        replicas=1,
        selector=k8s_client.V1LabelSelector(
            match_labels={"app": resource_attributes.drone_uuid}
        ),
        template=k8s_client.V1PodTemplateSpec(),
    )
    # The pod template starts empty; fill in its metadata and spec after
    # construction.
    spec.template.metadata = k8s_client.V1ObjectMeta(
        name=resource_attributes.drone_uuid,
        labels={"app": resource_attributes.drone_uuid},
    )
    container = k8s_client.V1Container(
        image=self.machine_type_configuration.image,
        args=self.machine_type_configuration.args,
        name=resource_attributes.drone_uuid,
        resources=k8s_client.V1ResourceRequirements(
            requests={
                "cpu": self.machine_meta_data.Cores,
                # Memory is configured in GB; convert to an integer byte
                # count for Kubernetes.
                "memory": convert_to(self.machine_meta_data.Memory * 1e09, int),
            }
        ),
        env=[
            k8s_client.V1EnvVar(name=f"TardisDrone{key}", value=str(value))
            for key, value in drone_environment.items()
        ],
    )
    spec.template.spec = k8s_client.V1PodSpec(containers=[container])
    body = k8s_client.V1Deployment(
        metadata=k8s_client.V1ObjectMeta(name=resource_attributes.drone_uuid),
        spec=spec,
    )
    response_temp = await self.client.create_namespaced_deployment(
        namespace=self.machine_type_configuration.namespace, body=body
    )
    # Minimal response consumed by handle_response(); "Booting" marks the
    # drone as starting up.
    response = {
        "uid": response_temp.metadata.uid,
        "name": response_temp.metadata.name,
        "type": "Booting",
    }
    if self.machine_type_configuration.hpa:
        # Optionally scale the Deployment on CPU utilization between the
        # configured min/max replica counts.
        spec = k8s_client.V1HorizontalPodAutoscalerSpec(
            max_replicas=self.machine_type_configuration.max_replicas,
            min_replicas=self.machine_type_configuration.min_replicas,
            target_cpu_utilization_percentage=self.machine_type_configuration.cpu_utilization,  # noqa: B950
            scale_target_ref=k8s_client.V1CrossVersionObjectReference(
                api_version="apps/v1",
                kind="Deployment",
                name=resource_attributes.drone_uuid,
            ),
        )
        dep = k8s_client.V1HorizontalPodAutoscaler(
            metadata=k8s_client.V1ObjectMeta(
                name=resource_attributes.drone_uuid
            ),
            spec=spec,
        )
        await self.hpa_client.create_namespaced_horizontal_pod_autoscaler(
            namespace=self.machine_type_configuration.namespace, body=dep
        )
    return self.handle_response(response)
def deploy_function(function: DaskCluster, secrets=None):
    """Deploy a remote dask cluster for the given DaskCluster function.

    Builds a worker pod spec from ``function.spec``, configures the dask
    scheduler service template, starts a ``KubeCluster``, and records the
    scheduler address / cluster name (and NodePort ports when applicable)
    on ``function.status``.

    Returns the started ``KubeCluster``.
    Raises ``ImportError`` when dask/dask_kubernetes are not installed.
    """
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec
        from dask.distributed import Client, default_client
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # BUG FIX: print() does no %-interpolation, so the original call
        # emitted a literal '%s'; build the message explicitly instead.
        print('missing dask or dask_kubernetes, please run '
              f'"pip install dask distributed dask_kubernetes", {e}')
        raise e

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or 'daskdev/dask:latest'
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        # NOTE(review): appends the extra-pip entry into the env var list;
        # presumably it is already an env-var object — confirm upstream.
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function)
    args = ['dask-worker', "--nthreads", str(spec.nthreads)]
    if spec.args:
        args += spec.args

    container = client.V1Container(
        name='base',
        image=image,
        env=env,
        args=args,
        image_pull_policy=spec.image_pull_policy,
        volume_mounts=spec.volume_mounts,
        resources=spec.resources,
    )
    pod_spec = client.V1PodSpec(
        containers=[container],
        restart_policy='Never',
        volumes=spec.volumes,
        service_account=spec.service_account,
    )
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)
        ]

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation),
        spec=pod_spec,
    )

    # Mutate the global scheduler-service template so NodePort/type
    # settings apply to the cluster we are about to start.
    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = 'NodePort'
            svc_temp['spec']['ports'][1]['nodePort'] = spec.node_port
        update_in(svc_temp, 'spec.type', spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set({
        "kubernetes.scheduler-service-template": svc_temp,
        'kubernetes.name': 'mlrun-' + norm_name + '-{uuid}',
    })

    cluster = KubeCluster(
        pod,
        deploy_mode='remote',
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout,
    )
    logger.info('cluster {} started at {}'.format(
        cluster.name, cluster.scheduler_address
    ))
    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == 'NodePort':
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            'scheduler': ports[0].node_port,
            'dashboard': ports[1].node_port,
        }
    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)
    return cluster
def deploy_function(function: DaskCluster, secrets=None):
    """Deploy a remote dask cluster for the given DaskCluster function.

    Builds the worker pod from ``function.spec`` (including the dask
    ``--memory-limit`` mirrored from the container limits), configures the
    dask scheduler service template, starts a ``KubeCluster``, and records
    the scheduler address, cluster name and NodePort ports on
    ``function.status``.

    Returns the started ``KubeCluster``.
    Raises ``ImportError`` when dask/dask_kubernetes are not installed.
    """
    # TODO: why is this here :|
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec  # noqa: F401
        from dask.distributed import Client, default_client  # noqa: F401
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # BUG FIX: print() does no %-interpolation, so the original call
        # emitted a literal '%s'; build the message explicitly instead.
        print(
            "missing dask or dask_kubernetes, please run "
            f'"pip install dask distributed dask_kubernetes", {e}'
        )
        raise e

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or "daskdev/dask:latest"
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        # NOTE(review): appends the extra-pip entry into the env var list;
        # presumably it is already an env-var object — confirm upstream.
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function, scrape_metrics=False)
    args = ["dask-worker", "--nthreads", str(spec.nthreads)]
    # Mirror the container memory limit into dask's own memory manager.
    memory_limit = spec.resources.get("limits", {}).get("memory")
    if memory_limit:
        args.extend(["--memory-limit", str(memory_limit)])
    if spec.args:
        args.extend(spec.args)

    container = client.V1Container(
        name="base",
        image=image,
        env=env,
        args=args,
        image_pull_policy=spec.image_pull_policy,
        volume_mounts=spec.volume_mounts,
        resources=spec.resources,
    )
    pod_spec = client.V1PodSpec(
        containers=[container],
        restart_policy="Never",
        volumes=spec.volumes,
        service_account=spec.service_account,
    )
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)
        ]

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation),
        spec=pod_spec,
    )

    # Mutate the global scheduler-service template so NodePort/type
    # settings apply to the cluster we are about to start.
    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = "NodePort"
            svc_temp["spec"]["ports"][1]["nodePort"] = spec.node_port
        update_in(svc_temp, "spec.type", spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set(
        {
            "kubernetes.scheduler-service-template": svc_temp,
            "kubernetes.name": "mlrun-" + norm_name + "-{uuid}",
        }
    )

    cluster = KubeCluster(
        pod,
        deploy_mode="remote",
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout,
    )
    logger.info(
        "cluster {} started at {}".format(cluster.name, cluster.scheduler_address)
    )
    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == "NodePort":
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            "scheduler": ports[0].node_port,
            "dashboard": ports[1].node_port,
        }
    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)
    return cluster
async def run_inner(self, run_info):
    """Schedule a Kubernetes runner pod (plus a proxy Service) for a run.

    Does not execute the experiment itself; the created pod runs it and
    updates the database directly.
    """
    run_id = run_info['id']
    # Only the id is needed from here on; drop the rest early.
    del run_info

    # This does not run the experiment, it schedules a runner pod by
    # talking to the Kubernetes API. That pod will run the experiment and
    # update the database directly
    k8s_config.load_incluster_config()

    name = self._pod_name(run_id)

    # Load configuration from configmap volume
    with open(os.path.join(self.config_dir, 'runner.pod_spec')) as fp:
        pod_spec = yaml.safe_load(fp)
    with open(os.path.join(self.config_dir, 'runner.namespace')) as fp:
        namespace = fp.read().strip()

    # Make required changes
    for container in pod_spec['containers']:
        if container['name'] == 'runner':
            # Tell the runner container which run to execute.
            container['args'] += [str(run_id)]
            # This is mostly used by Tilt
            if os.environ.get('OVERRIDE_RUNNER_IMAGE'):
                container['image'] = os.environ['OVERRIDE_RUNNER_IMAGE']

    async with k8s_client.ApiClient() as api:
        # Create a Kubernetes pod to run
        v1 = k8s_client.CoreV1Api(api)
        pod = k8s_client.V1Pod(
            api_version='v1',
            kind='Pod',
            metadata=k8s_client.V1ObjectMeta(
                name=name,
                labels={
                    'app': 'run',
                    'run': str(run_id),
                },
            ),
            spec=pod_spec,
        )
        await v1.create_namespaced_pod(
            namespace=namespace,
            body=pod,
        )
        logger.info("Pod created: %s", name)
        # Prometheus counter tracking how many runs have been started.
        PROM_RUNS.inc()

        # Create a service for proxy connections
        svc = k8s_client.V1Service(
            api_version='v1',
            kind='Service',
            metadata=k8s_client.V1ObjectMeta(
                name=name,
                labels={
                    'app': 'run',
                    'run': str(run_id),
                },
            ),
            spec=k8s_client.V1ServiceSpec(
                selector={
                    'app': 'run',
                    'run': str(run_id),
                },
                ports=[
                    k8s_client.V1ServicePort(
                        protocol='TCP',
                        # NOTE(review): 5597 appears to be the port the
                        # runner pod listens on for proxy traffic — confirm
                        # against the runner image.
                        port=5597,
                    ),
                ],
            ),
        )
        await v1.create_namespaced_service(
            namespace=namespace,
            body=svc,
        )
        logger.info("Service created: %s", name)
def make_pod_spec(
    image,
    labels=None,
    threads_per_worker=1,
    env=None,
    extra_container_config=None,
    extra_pod_config=None,
    memory_limit=None,
    memory_request=None,
    cpu_limit=None,
    cpu_request=None,
):
    """
    Create generic pod template from input parameters

    Parameters
    ----------
    image : str
        Docker image for the dask worker container.
    labels : dict, optional
        Pod metadata labels (defaults to none).
    threads_per_worker : int
        Value passed to ``dask-worker --nthreads``.
    env : dict, optional
        Environment variables for the worker container.
    extra_container_config, extra_pod_config : dict, optional
        Extra attributes applied onto the container / pod spec via
        ``_set_k8s_attribute``.
    memory_limit, memory_request, cpu_limit, cpu_request : optional
        Resource settings; ``memory_limit`` is also passed to dask as
        ``--memory-limit``.

    Examples
    --------
    >>> make_pod_spec(image='daskdev/dask:latest', memory_limit='4G', memory_request='4G')
    """
    # BUG FIX: the mutable-default arguments ({}) were shared across calls,
    # so one caller's mutations leaked into the next call.  Use None
    # sentinels and create fresh dicts per call instead.
    labels = {} if labels is None else labels
    env = {} if env is None else env
    extra_container_config = (
        {} if extra_container_config is None else extra_container_config
    )
    extra_pod_config = {} if extra_pod_config is None else extra_pod_config

    args = [
        "dask-worker",
        "$(DASK_SCHEDULER_ADDRESS)",
        "--nthreads",
        str(threads_per_worker),
        "--death-timeout",
        "60",
    ]
    if memory_limit:
        # Mirror the container memory limit into dask's memory manager.
        args.extend(["--memory-limit", str(memory_limit)])
    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(labels=labels),
        spec=client.V1PodSpec(
            restart_policy="Never",
            containers=[
                client.V1Container(
                    name="dask-worker",
                    image=image,
                    args=args,
                    env=[
                        client.V1EnvVar(name=k, value=v)
                        for k, v in env.items()
                    ],
                )
            ],
        ),
    )
    # Only populate the resource fields the caller asked for.
    resources = client.V1ResourceRequirements(limits={}, requests={})
    if cpu_request:
        resources.requests["cpu"] = cpu_request
    if memory_request:
        resources.requests["memory"] = memory_request
    if cpu_limit:
        resources.limits["cpu"] = cpu_limit
    if memory_limit:
        resources.limits["memory"] = memory_limit
    pod.spec.containers[0].resources = resources

    # Apply arbitrary extra attributes last so they can override anything
    # constructed above.
    for key, value in extra_container_config.items():
        _set_k8s_attribute(pod.spec.containers[0], key, value)
    for key, value in extra_pod_config.items():
        _set_k8s_attribute(pod.spec, key, value)
    return pod
def clean_pod_template(pod_template, match_node_purpose="prefer", pod_type="worker"):
    """ Normalize pod template and check for type errors

    Returns a deep copy of *pod_template* with metadata/labels/env
    guaranteed non-None, default dask tolerations appended, and a node
    affinity for ``k8s.dask.org/node-purpose`` added according to
    *match_node_purpose* ("prefer", "require", or "ignore").
    Raises TypeError for str/dict inputs and ValueError for an unknown
    *match_node_purpose*.
    """
    if isinstance(pod_template, str):
        # NOTE(review): the message lacks a separator between '%s' and
        # 'If', so the rendered error runs the sentences together.
        msg = ("Expected a kubernetes.client.V1Pod object, got %s"
               "If trying to pass a yaml filename then use "
               "KubeCluster.from_yaml")
        raise TypeError(msg % pod_template)
    if isinstance(pod_template, dict):
        msg = ("Expected a kubernetes.client.V1Pod object, got %s"
               "If trying to pass a dictionary specification then use "
               "KubeCluster.from_dict")
        raise TypeError(msg % str(pod_template))

    # Never mutate the caller's template.
    pod_template = copy.deepcopy(pod_template)

    # Make sure metadata / labels / env objects exist, so they can be modified
    # later without a lot of `is None` checks
    if pod_template.metadata is None:
        pod_template.metadata = client.V1ObjectMeta()
    if pod_template.metadata.labels is None:
        pod_template.metadata.labels = {}
    if pod_template.spec.containers[0].env is None:
        pod_template.spec.containers[0].env = []

    # add default tolerations
    tolerations = [
        client.V1Toleration(
            key="k8s.dask.org/dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
        # GKE currently does not permit creating taints on a node pool
        # with a `/` in the key field
        client.V1Toleration(
            key="k8s.dask.org_dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
    ]
    if pod_template.spec.tolerations is None:
        pod_template.spec.tolerations = tolerations
    else:
        pod_template.spec.tolerations.extend(tolerations)

    # add default node affinity to k8s.dask.org/node-purpose=worker
    if match_node_purpose != "ignore":
        # for readability
        affinity = pod_template.spec.affinity
        if affinity is None:
            affinity = client.V1Affinity()
        if affinity.node_affinity is None:
            affinity.node_affinity = client.V1NodeAffinity()
        # a common object for both a preferred and a required node affinity
        node_selector_term = client.V1NodeSelectorTerm(match_expressions=[
            client.V1NodeSelectorRequirement(key="k8s.dask.org/node-purpose",
                                             operator="In",
                                             values=[pod_type])
        ])
        if match_node_purpose == "require":
            # Hard constraint: scheduler will only place the pod on
            # matching nodes.
            if (affinity.node_affinity.
                    required_during_scheduling_ignored_during_execution is None):
                affinity.node_affinity.required_during_scheduling_ignored_during_execution = client.V1NodeSelector(
                    node_selector_terms=[])
            affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms.append(
                node_selector_term)
        elif match_node_purpose == "prefer":
            # Soft constraint: weight-100 preference for matching nodes.
            if (affinity.node_affinity.
                    preferred_during_scheduling_ignored_during_execution is None):
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = (
                    [])
            preferred_scheduling_terms = [
                client.V1PreferredSchedulingTerm(preference=node_selector_term,
                                                 weight=100)
            ]
            affinity.node_affinity.preferred_during_scheduling_ignored_during_execution.extend(
                preferred_scheduling_terms)
        else:
            raise ValueError(
                'Attribute must be one of "ignore", "prefer", or "require".')
        pod_template.spec.affinity = affinity

    return pod_template
async def create_deployment(
    self,
    container: str,
    num_replicas: int,
    cpus: float = 1.0,
    memory: float = 1.0,
) -> Tuple[str, str]:
    """Create a Deployment plus LoadBalancer Service and wait for its IP.

    Parameters: *container* is the image to run, *num_replicas* the
    replica count, *cpus*/*memory* the per-pod resource requests (memory
    presumably in GiB; it is converted via ``memory * 1024`` to a "<n>M"
    quantity — confirm the intended unit).

    Returns ``(deployment_id, service_url)``.
    """
    assert self.auth_client
    assert self.cluster_endpoint
    cfg = client.Configuration(
        host=f"https://{self.cluster_endpoint}:443",
        api_key={
            "authorization": f"Bearer {await self.auth_client.get()}"
        },
    )
    # NOTE(review): TLS verification disabled — acceptable only if the
    # cluster endpoint is trusted out-of-band.
    cfg.verify_ssl = False
    async with ApiClient(configuration=cfg) as kube_api:
        apps_api = client.AppsV1Api(kube_api)
        core_api = client.CoreV1Api(kube_api)

        # Create deployment
        deployment_id = f"dep-{uuid.uuid4()}"
        deployment = client.V1Deployment(
            api_version="apps/v1",
            kind="Deployment",
            metadata=client.V1ObjectMeta(name=deployment_id),
            spec=client.V1DeploymentSpec(
                replicas=num_replicas,
                selector={"matchLabels": {
                    "dep": deployment_id
                }},
                template=client.V1PodTemplateSpec(
                    metadata=client.V1ObjectMeta(
                        labels={"dep": deployment_id}),
                    spec=client.V1PodSpec(containers=[
                        client.V1Container(
                            name=deployment_id,
                            # The container learns its listen port from
                            # the PORT env var.
                            env=[
                                client.V1EnvVar(name="PORT",
                                                value=str(INTERNAL_PORT))
                            ],
                            image=container,
                            resources=client.V1ResourceRequirements(
                                requests={
                                    "cpu": str(cpus),
                                    "memory": f"{int(memory * 1024)}M",
                                }),
                            ports=[
                                client.V1ContainerPort(
                                    container_port=INTERNAL_PORT)
                            ],
                        )
                    ]),
                ),
            ),
        )
        await apps_api.create_namespaced_deployment(
            namespace=KUBE_NAMESPACE, body=deployment)

        # Create service
        service_id = f"{deployment_id}-svc"
        service_port = self.get_unassigned_port()
        service = client.V1Service(
            api_version="v1",
            kind="Service",
            metadata=client.V1ObjectMeta(
                name=service_id,
                # annotations={"cloud.google.com/load-balancer-type": "Internal"},
            ),
            spec=client.V1ServiceSpec(
                selector={"dep": deployment_id},
                ports=[
                    client.V1ServicePort(
                        protocol="TCP",
                        port=service_port,
                        target_port=INTERNAL_PORT,
                    )
                ],
                type="LoadBalancer",
            ),
        )
        await core_api.create_namespaced_service(namespace=KUBE_NAMESPACE,
                                                 body=service)

        # Poll for external URL
        # NOTE(review): loops indefinitely until the load balancer reports
        # an ingress — there is no timeout; confirm this is intended.
        service_ip = None
        while not service_ip:
            await asyncio.sleep(POLL_INTERVAL)
            ingress = (await core_api.read_namespaced_service(
                name=service_id,
                namespace=KUBE_NAMESPACE)).status.load_balancer.ingress
            if ingress:
                service_ip = ingress[0].ip
        service_url = f"http://{service_ip}:{service_port}"
        print(f"Started deployment {deployment_id} at {service_url}")
        return deployment_id, service_url
def setUp(self):
    """Build mocked site config, k8s API mocks, and the adapter under test.

    Wires create/read/replace/delete responses for the deployment API and
    create/delete for the HPA API, then instantiates the
    ``KubernetesAdapter`` against the mocked "TestSite".
    """
    config = self.mock_config.return_value
    test_site_config = config.TestSite
    # Endpoint of Kube cluster
    test_site_config.host = "https://127.0.0.1:443"
    # Bearer token we are going to use to authenticate
    test_site_config.token = "31ada4fd-adec-460c-809a-9e56ceb75269"
    test_site_config.MachineTypeConfiguration = AttributeDict(
        test2large=AttributeDict(
            namespace="default",
            image="busybox:1.26.1",
            args=["sleep", "3600"],
            hpa="True",
            min_replicas="1",
            max_replicas="2",
            cpu_utilization="50",
        )
    )
    # Cores / Memory feed the expected resource requests below.
    test_site_config.MachineMetaData = AttributeDict(
        test2large=AttributeDict(Cores=2, Memory=4)
    )

    kubernetes_api = self.mock_kubernetes_api.return_value
    kubernetes_hpa = self.mock_kubernetes_hpa.return_value

    # Expected Deployment body the adapter should construct.
    spec = client.V1DeploymentSpec(
        replicas=1,
        selector=client.V1LabelSelector(match_labels={"app": "testsite-089123"}),
        template=client.V1PodTemplateSpec(),
    )
    container = client.V1Container(
        image="busybox:1.26.1",
        args=["sleep", "3600"],
        name="testsite-089123",
        resources=client.V1ResourceRequirements(
            requests={
                "cpu": test_site_config.MachineMetaData.test2large.Cores,
                # Memory configured in GB, expected in bytes.
                "memory": test_site_config.MachineMetaData.test2large.Memory * 1e9,
            }
        ),
        env=[
            client.V1EnvVar(name="TardisDroneCores", value="2"),
            client.V1EnvVar(name="TardisDroneMemory", value="4096"),
            client.V1EnvVar(name="TardisDroneUuid", value="testsite-089123"),
        ],
    )
    spec.template.metadata = client.V1ObjectMeta(
        name="testsite-089123",
        labels={"app": "testsite-089123"},
    )
    spec.template.spec = client.V1PodSpec(containers=[container])
    self.body = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="testsite-089123"),
        spec=spec,
    )
    # What the mocked create call hands back (uid assigned by the API).
    self.create_return_value = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="testsite-089123", uid="123456"),
        spec=spec,
    )
    kubernetes_api.create_namespaced_deployment.return_value = async_return(
        return_value=self.create_return_value
    )
    # Read returns a "Progressing" deployment status.
    condition_list = [
        client.V1DeploymentCondition(
            status="True",
            type="Progressing",
        )
    ]
    self.read_return_value = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="testsite-089123", uid="123456"),
        spec=spec,
        status=client.V1DeploymentStatus(conditions=condition_list),
    )
    kubernetes_api.read_namespaced_deployment.return_value = async_return(
        return_value=self.read_return_value
    )
    kubernetes_api.replace_namespaced_deployment.return_value = async_return(
        return_value=None
    )
    kubernetes_api.delete_namespaced_deployment.return_value = async_return(
        return_value=None
    )
    kubernetes_hpa.create_namespaced_horizontal_pod_autoscaler.return_value = (
        async_return(return_value=None)
    )
    kubernetes_hpa.delete_namespaced_horizontal_pod_autoscaler.return_value = (
        async_return(return_value=None)
    )
    self.kubernetes_adapter = KubernetesAdapter(
        machine_type="test2large", site_name="TestSite"
    )